1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> // for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 int getVMBlock();
36 #endif
37
38 #include "new_dynarec_config.h"
39 #include "backends/psx/emu_if.h" //emulator interface
40
41 //#define DISASM
42 //#define assem_debug printf
43 //#define inv_debug printf
44 #define assem_debug(...)
45 #define inv_debug(...)
46
47 #ifdef __i386__
48 #include "x86/assem_x86.h"
49 #endif
50 #ifdef __x86_64__
51 #include "x64/assem_x64.h"
52 #endif
53 #ifdef __arm__
54 #include "arm/assem_arm.h"
55 #endif
56
57 #ifdef VITA
58 int _newlib_vm_size_user = 1 << TARGET_SIZE_2;
59 #endif
60
61 #define MAXBLOCK 4096
62 #define MAX_OUTPUT_BLOCK_SIZE 262144
63
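// Per-instruction register allocation state. Summary (inferred from usage
// below): regmap[] maps each host register to a MIPS/special register number
// or -1 if unmapped; is32/was32 are bitmasks of guest regs known to hold
// sign-extended 32-bit values; dirty/wasdirty flag host regs needing
// writeback; u/uu mark guest regs whose lower/upper 32 bits are unneeded.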
64 struct regstat
65 {
66   signed char regmap_entry[HOST_REGS];
67   signed char regmap[HOST_REGS];
68   uint64_t was32;
69   uint64_t is32;
70   uint64_t wasdirty;
71   uint64_t dirty;
72   uint64_t u;
73   uint64_t uu;
74   u_int wasconst;
75   u_int isconst;
76   u_int loadedconst;             // host regs that have constants loaded
77   u_int waswritten;              // MIPS regs that were used as store base before
78 };
79
80 // note: asm depends on this layout
81 struct ll_entry
82 {
83   u_int vaddr;
84   u_int reg_sv_flags;
85   void *addr;
86   struct ll_entry *next;
87 };
88
89   // used by asm:
90   u_char *out;
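  // Note (summary of the lookup code below): each hash_table bin caches two
  // {vaddr, native addr} pairs as [vaddr0, addr0, vaddr1, addr1]; it is
  // indexed with ((vaddr>>16)^vaddr)&0xFFFF, and new entries are installed in
  // slot 0 while the previous slot-0 pair slides down to slot 1.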
91   u_int hash_table[65536][4]  __attribute__((aligned(16)));
92   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
93   struct ll_entry *jump_dirty[4096];
94
95   static struct ll_entry *jump_out[4096];
96   static u_int start;
97   static u_int *source;
98   static char insn[MAXBLOCK][10];
99   static u_char itype[MAXBLOCK];
100   static u_char opcode[MAXBLOCK];
101   static u_char opcode2[MAXBLOCK];
102   static u_char bt[MAXBLOCK];
103   static u_char rs1[MAXBLOCK];
104   static u_char rs2[MAXBLOCK];
105   static u_char rt1[MAXBLOCK];
106   static u_char rt2[MAXBLOCK];
107   static u_char us1[MAXBLOCK];
108   static u_char us2[MAXBLOCK];
109   static u_char dep1[MAXBLOCK];
110   static u_char dep2[MAXBLOCK];
111   static u_char lt1[MAXBLOCK];
112   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
113   static uint64_t gte_rt[MAXBLOCK];
114   static uint64_t gte_unneeded[MAXBLOCK];
115   static u_int smrv[32]; // speculated MIPS register values
116   static u_int smrv_strong; // mask of regs that are likely to have correct values
117   static u_int smrv_weak; // same, but somewhat less likely
118   static u_int smrv_strong_next; // same, but after current insn executes
119   static u_int smrv_weak_next;
120   static int imm[MAXBLOCK];
121   static u_int ba[MAXBLOCK];
122   static char likely[MAXBLOCK];
123   static char is_ds[MAXBLOCK];
124   static char ooo[MAXBLOCK];
125   static uint64_t unneeded_reg[MAXBLOCK];
126   static uint64_t unneeded_reg_upper[MAXBLOCK];
127   static uint64_t branch_unneeded_reg[MAXBLOCK];
128   static uint64_t branch_unneeded_reg_upper[MAXBLOCK];
129   static signed char regmap_pre[MAXBLOCK][HOST_REGS];
130   static uint64_t current_constmap[HOST_REGS];
131   static uint64_t constmap[MAXBLOCK][HOST_REGS];
132   static struct regstat regs[MAXBLOCK];
133   static struct regstat branch_regs[MAXBLOCK];
134   static signed char minimum_free_regs[MAXBLOCK];
135   static u_int needed_reg[MAXBLOCK];
136   static u_int wont_dirty[MAXBLOCK];
137   static u_int will_dirty[MAXBLOCK];
138   static int ccadj[MAXBLOCK];
139   static int slen;
140   static u_int instr_addr[MAXBLOCK];
141   static u_int link_addr[MAXBLOCK][3];
142   static int linkcount;
143   static u_int stubs[MAXBLOCK*3][8];
144   static int stubcount;
145   static u_int literals[1024][2];
146   static int literalcount;
147   static int is_delayslot;
148   static int cop1_usable;
149   static char shadow[1048576]  __attribute__((aligned(16)));
150   static void *copy;
151   static int expirep;
152   static u_int stop_after_jal;
153 #ifndef RAM_FIXED
154   static u_int ram_offset;
155 #else
156   static const u_int ram_offset=0;
157 #endif
158
159   int new_dynarec_hacks;
160   int new_dynarec_did_compile;
161   extern u_char restore_candidate[512];
162   extern int cycle_count;
163
164   /* registers that may be allocated */
165   /* 1-31 gpr */
166 #define HIREG 32 // hi
167 #define LOREG 33 // lo
168 #define FSREG 34 // FPU status (FCSR)
169 #define CSREG 35 // Coprocessor status
170 #define CCREG 36 // Cycle count
171 #define INVCP 37 // Pointer to invalid_code
172 //#define MMREG 38 // Pointer to memory_map
173 #define ROREG 39 // ram offset (if rdram!=0x80000000)
174 #define TEMPREG 40
175 #define FTEMP 40 // FPU temporary register
176 #define PTEMP 41 // Prefetch temporary register
177 //#define TLREG 42 // TLB mapping offset
178 #define RHASH 43 // Return address hash
179 #define RHTBL 44 // Return address hash table address
180 #define RTEMP 45 // JR/JALR address register
181 #define MAXREG 45
182 #define AGEN1 46 // Address generation temporary register
183 //#define AGEN2 47 // Address generation temporary register
184 //#define MGEN1 48 // Maptable address generation temporary register
185 //#define MGEN2 49 // Maptable address generation temporary register
186 #define BTREG 50 // Branch target temporary register
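// Note on the encoding used throughout: regmap[] entries hold one of the
// register numbers above (0-31 GPRs, 32+ specials) or -1 for unmapped;
// a register number ORed with 64 refers to the upper 32 bits of a 64-bit
// value, hence the (regmap[hr]&63) and (regmap[hr]^64) tests below.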
187
188   /* instruction types */
189 #define NOP 0     // No operation
190 #define LOAD 1    // Load
191 #define STORE 2   // Store
192 #define LOADLR 3  // Unaligned load
193 #define STORELR 4 // Unaligned store
194 #define MOV 5     // Move
195 #define ALU 6     // Arithmetic/logic
196 #define MULTDIV 7 // Multiply/divide
197 #define SHIFT 8   // Shift by register
198 #define SHIFTIMM 9// Shift by immediate
199 #define IMM16 10  // 16-bit immediate
200 #define RJUMP 11  // Unconditional jump to register
201 #define UJUMP 12  // Unconditional jump
202 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
203 #define SJUMP 14  // Conditional branch (regimm format)
204 #define COP0 15   // Coprocessor 0
205 #define COP1 16   // Coprocessor 1
206 #define C1LS 17   // Coprocessor 1 load/store
207 #define FJUMP 18  // Conditional branch (floating point)
208 #define FLOAT 19  // Floating point unit
209 #define FCONV 20  // Convert integer to float
210 #define FCOMP 21  // Floating point compare (sets FSREG)
211 #define SYSCALL 22// SYSCALL
212 #define OTHER 23  // Other
213 #define SPAN 24   // Branch/delay slot spans 2 pages
214 #define NI 25     // Not implemented
215 #define HLECALL 26// PCSX fake opcodes for HLE
216 #define COP2 27   // Coprocessor 2 move
217 #define C2LS 28   // Coprocessor 2 load/store
218 #define C2OP 29   // Coprocessor 2 operation
219 #define INTCALL 30// Call interpreter to handle rare corner cases
220
221   /* stubs */
222 #define CC_STUB 1
223 #define FP_STUB 2
224 #define LOADB_STUB 3
225 #define LOADH_STUB 4
226 #define LOADW_STUB 5
227 #define LOADD_STUB 6
228 #define LOADBU_STUB 7
229 #define LOADHU_STUB 8
230 #define STOREB_STUB 9
231 #define STOREH_STUB 10
232 #define STOREW_STUB 11
233 #define STORED_STUB 12
234 #define STORELR_STUB 13
235 #define INVCODE_STUB 14
236
237   /* branch codes */
238 #define TAKEN 1
239 #define NOTTAKEN 2
240 #define NULLDS 3
241
242 // asm linkage
243 int new_recompile_block(int addr);
244 void *get_addr_ht(u_int vaddr);
245 void invalidate_block(u_int block);
246 void invalidate_addr(u_int addr);
247 void remove_hash(int vaddr);
248 void dyna_linker();
249 void dyna_linker_ds();
250 void verify_code();
251 void verify_code_vm();
252 void verify_code_ds();
253 void cc_interrupt();
254 void fp_exception();
255 void fp_exception_ds();
256 void jump_syscall_hle();
257 void jump_hlecall();
258 void jump_intcall();
259 void new_dyna_leave();
260
261 // Needed by assembler
262 static void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
263 static void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
264 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
265 static void load_all_regs(signed char i_regmap[]);
266 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
267 static void load_regs_entry(int t);
268 static void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
269
270 static int verify_dirty(u_int *ptr);
271 static int get_final_value(int hr, int i, int *value);
272 static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e);
273 static void add_to_linker(int addr,int target,int ext);
274
275 static int tracedebug=0;
276
277 static void mprotect_w_x(void *start, void *end, int is_x)
278 {
279 #ifdef NO_WRITE_EXEC
280   #if defined(VITA)
281   // *Open* enables write on all memory that was
282   // allocated by sceKernelAllocMemBlockForVM()?
283   if (is_x)
284     sceKernelCloseVMDomain();
285   else
286     sceKernelOpenVMDomain();
287   #else
288   u_long mstart = (u_long)start & ~4095ul;
289   u_long mend = (u_long)end;
290   if (mprotect((void *)mstart, mend - mstart,
291                PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE)) != 0)
292     SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
293   #endif
294 #endif
295 }
296
297 static void start_tcache_write(void *start, void *end)
298 {
299   mprotect_w_x(start, end, 0);
300 }
301
302 static void end_tcache_write(void *start, void *end)
303 {
304 #ifdef __arm__
305   size_t len = (char *)end - (char *)start;
306   #if   defined(__BLACKBERRY_QNX__)
307   msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
308   #elif defined(__MACH__)
309   sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
310   #elif defined(VITA)
311   sceKernelSyncVMDomain(sceBlock, start, len);
312   #elif defined(_3DS)
313   ctr_flush_invalidate_cache();
314   #else
315   __clear_cache(start, end);
316   #endif
317   (void)len;
318 #endif
319
320   mprotect_w_x(start, end, 1);
321 }
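// Typical usage (see start_block/end_block below): call start_tcache_write()
// before emitting into the translation cache, then end_tcache_write() to
// flush the instruction cache and restore execute permission on platforms
// that enforce write-xor-execute.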
322
323 static void *start_block(void)
324 {
325   u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
326   if (end > (u_char *)BASE_ADDR + (1<<TARGET_SIZE_2))
327     end = (u_char *)BASE_ADDR + (1<<TARGET_SIZE_2);
328   start_tcache_write(out, end);
329   return out;
330 }
331
332 static void end_block(void *start)
333 {
334   end_tcache_write(start, out);
335 }
336
337 //#define DEBUG_CYCLE_COUNT 1
338
339 #define NO_CYCLE_PENALTY_THR 12
340
341 int cycle_multiplier; // 100 for 1.0
342
343 static int CLOCK_ADJUST(int x)
344 {
345   int s=(x>>31)|1;
346   return (x * cycle_multiplier + s * 50) / 100;
347 }
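// For example, with cycle_multiplier=150 (an illustrative value):
// CLOCK_ADJUST(7) = (7*150 + 50)/100 = 11 and CLOCK_ADJUST(-7) = -11;
// the s term makes the division round to nearest, with ties away from zero.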
348
349 static u_int get_page(u_int vaddr)
350 {
351   u_int page=vaddr&~0xe0000000;
352   if (page < 0x1000000)
353     page &= ~0x0e00000; // RAM mirrors
354   page>>=12;
355   if(page>2048) page=2048+(page&2047);
356   return page;
357 }
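// For example: 0x00100000, 0x80100000 and 0xa0100000 all yield page 0x100,
// since the segment bits are stripped and the RAM mirrors are folded together;
// addresses outside RAM, e.g. the BIOS at 0xbfc00000, fold into pages
// 2048-4095 (here 2048 + (0x1fc00 & 2047) = 3072).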
358
359 // no virtual mem in PCSX
360 static u_int get_vpage(u_int vaddr)
361 {
362   return get_page(vaddr);
363 }
364
365 // Get address from virtual address
366 // This is called from the recompiled JR/JALR instructions
367 void *get_addr(u_int vaddr)
368 {
369   struct ll_entry *head = NULL;
370   u_int page            = get_page(vaddr);
371   u_int vpage           = get_vpage(vaddr);
372   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
373   head=jump_in[page];
374   while(head!=NULL)
375   {
376     if(head->vaddr==vaddr)
377     {
378       //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
379       u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
380       ht_bin[3]=ht_bin[1];
381       ht_bin[2]=ht_bin[0];
382       ht_bin[1]=(u_int)head->addr;
383       ht_bin[0]=vaddr;
384       return head->addr;
385     }
386     head=head->next;
387   }
388   head=jump_dirty[vpage];
389   while(head!=NULL)
390   {
391     if(head->vaddr==vaddr)
392     {
393       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
394       // Don't restore blocks which are about to expire from the cache
395       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
396         if(verify_dirty(head->addr))
397         {
398           //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
399           invalid_code[vaddr>>12]=0;
400           inv_code_start=inv_code_end=~0;
401           if(vpage<2048)
402           {
403             restore_candidate[vpage>>3]|=1<<(vpage&7);
404           }
405           else
406           {
407             restore_candidate[page>>3]|=1<<(page&7);
408           }
409           u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
410
411           if(ht_bin[0]==vaddr)
412             ht_bin[1]=(u_int)head->addr; // Replace existing entry
413           else
414           {
415             ht_bin[3]=ht_bin[1];
416             ht_bin[2]=ht_bin[0];
417             ht_bin[1]=(int)head->addr;
418             ht_bin[0]=vaddr;
419           }
420           return head->addr;
421         }
422     }
423     head=head->next;
424   }
425   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
426   int r=new_recompile_block(vaddr);
427   if(r==0)
428     return get_addr(vaddr);
429   // Execute in unmapped page, generate pagefault exception
430   Status|=2;
431   Cause=(vaddr<<31)|0x8;
432   EPC=(vaddr&1)?vaddr-5:vaddr;
433   BadVAddr=(vaddr&~1);
434   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
435   EntryHi=BadVAddr&0xFFFFE000;
436   return get_addr_ht(0x80000000);
437 }
438
439 // Look up address in hash table first
440 void *get_addr_ht(u_int vaddr)
441 {
442   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
443   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
444   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
445   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
446   return get_addr(vaddr);
447 }
448
449 void clear_all_regs(signed char regmap[])
450 {
451   int hr;
452   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
453 }
454
455 signed char get_reg(signed char regmap[],int r)
456 {
457   int hr;
458   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
459   return -1;
460 }
461
462 // Find a register that is available for two consecutive cycles
463 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
464 {
465   int hr;
466   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
467   return -1;
468 }
469
470 int count_free_regs(signed char regmap[])
471 {
472   int count=0;
473   int hr;
474   for(hr=0;hr<HOST_REGS;hr++)
475   {
476     if(hr!=EXCLUDE_REG) {
477       if(regmap[hr]<0) count++;
478     }
479   }
480   return count;
481 }
482
483 void dirty_reg(struct regstat *cur,signed char reg)
484 {
485   int hr;
486   if(!reg) return;
487   for (hr=0;hr<HOST_REGS;hr++) {
488     if((cur->regmap[hr]&63)==reg) {
489       cur->dirty|=1<<hr;
490     }
491   }
492 }
493
494 // If we dirty the lower half of a 64-bit register which is now being
495 // sign-extended, we need to dump the upper half.
496 // Note: Do this only after completion of the instruction, because
497 // some instructions may need to read the full 64-bit value even if
498 // overwriting it (eg SLTI, DSRA32).
499 static void flush_dirty_uppers(struct regstat *cur)
500 {
501   int hr,reg;
502   for (hr=0;hr<HOST_REGS;hr++) {
503     if((cur->dirty>>hr)&1) {
504       reg=cur->regmap[hr];
505       if(reg>=64)
506         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
507     }
508   }
509 }
510
511 void set_const(struct regstat *cur,signed char reg,uint64_t value)
512 {
513   int hr;
514   if(!reg) return;
515   for (hr=0;hr<HOST_REGS;hr++) {
516     if(cur->regmap[hr]==reg) {
517       cur->isconst|=1<<hr;
518       current_constmap[hr]=value;
519     }
520     else if((cur->regmap[hr]^64)==reg) {
521       cur->isconst|=1<<hr;
522       current_constmap[hr]=value>>32;
523     }
524   }
525 }
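// Note: constants are tracked per host register; isconst marks a host reg as
// holding a known value and current_constmap[] stores it.  For a 64-bit guest
// value split across a register pair, the host reg mapped to (reg|64) gets the
// upper half (value>>32), which is what the ^64 case above handles.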
526
527 void clear_const(struct regstat *cur,signed char reg)
528 {
529   int hr;
530   if(!reg) return;
531   for (hr=0;hr<HOST_REGS;hr++) {
532     if((cur->regmap[hr]&63)==reg) {
533       cur->isconst&=~(1<<hr);
534     }
535   }
536 }
537
538 int is_const(struct regstat *cur,signed char reg)
539 {
540   int hr;
541   if(reg<0) return 0;
542   if(!reg) return 1;
543   for (hr=0;hr<HOST_REGS;hr++) {
544     if((cur->regmap[hr]&63)==reg) {
545       return (cur->isconst>>hr)&1;
546     }
547   }
548   return 0;
549 }
550 uint64_t get_const(struct regstat *cur,signed char reg)
551 {
552   int hr;
553   if(!reg) return 0;
554   for (hr=0;hr<HOST_REGS;hr++) {
555     if(cur->regmap[hr]==reg) {
556       return current_constmap[hr];
557     }
558   }
559   SysPrintf("Unknown constant in r%d\n",reg);
560   exit(1);
561 }
562
563 // Least soon needed registers
564 // Look at the next ten instructions and see which registers
565 // will be used.  Try not to reallocate these.
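// On return, hsn[r] holds roughly the distance (in instructions) to the next
// use of register r; smaller values mean the register is needed sooner.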
566 void lsn(u_char hsn[], int i, int *preferred_reg)
567 {
568   int j;
569   int b=-1;
570   for(j=0;j<9;j++)
571   {
572     if(i+j>=slen) {
573       j=slen-i-1;
574       break;
575     }
576     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
577     {
578       // Don't go past an unconditional jump
579       j++;
580       break;
581     }
582   }
583   for(;j>=0;j--)
584   {
585     if(rs1[i+j]) hsn[rs1[i+j]]=j;
586     if(rs2[i+j]) hsn[rs2[i+j]]=j;
587     if(rt1[i+j]) hsn[rt1[i+j]]=j;
588     if(rt2[i+j]) hsn[rt2[i+j]]=j;
589     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
590       // Stores can allocate zero
591       hsn[rs1[i+j]]=j;
592       hsn[rs2[i+j]]=j;
593     }
594     // On some architectures stores need invc_ptr
595     #if defined(HOST_IMM8)
596     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
597       hsn[INVCP]=j;
598     }
599     #endif
600     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
601     {
602       hsn[CCREG]=j;
603       b=j;
604     }
605   }
606   if(b>=0)
607   {
608     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
609     {
610       // Follow first branch
611       int t=(ba[i+b]-start)>>2;
612       j=7-b;if(t+j>=slen) j=slen-t-1;
613       for(;j>=0;j--)
614       {
615         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
616         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
617         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
618         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
619       }
620     }
621     // TODO: preferred register based on backward branch
622   }
623   // Delay slot should preferably not overwrite branch conditions or cycle count
624   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
625     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
626     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
627     hsn[CCREG]=1;
628     // ...or hash tables
629     hsn[RHASH]=1;
630     hsn[RHTBL]=1;
631   }
632   // Coprocessor load/store needs FTEMP, even if not declared
633   if(itype[i]==C1LS||itype[i]==C2LS) {
634     hsn[FTEMP]=0;
635   }
636   // Load L/R also uses FTEMP as a temporary register
637   if(itype[i]==LOADLR) {
638     hsn[FTEMP]=0;
639   }
640   // Also SWL/SWR/SDL/SDR
641   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
642     hsn[FTEMP]=0;
643   }
644   // Don't remove the miniht registers
645   if(itype[i]==UJUMP||itype[i]==RJUMP)
646   {
647     hsn[RHASH]=0;
648     hsn[RHTBL]=0;
649   }
650 }
651
652 // We only want to allocate registers if we're going to use them again soon
653 int needed_again(int r, int i)
654 {
655   int j;
656   int b=-1;
657   int rn=10;
658
659   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
660   {
661     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
662       return 0; // Don't need any registers if exiting the block
663   }
664   for(j=0;j<9;j++)
665   {
666     if(i+j>=slen) {
667       j=slen-i-1;
668       break;
669     }
670     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
671     {
672       // Don't go past an unconditional jump
673       j++;
674       break;
675     }
676     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
677     {
678       break;
679     }
680   }
681   for(;j>=1;j--)
682   {
683     if(rs1[i+j]==r) rn=j;
684     if(rs2[i+j]==r) rn=j;
685     if((unneeded_reg[i+j]>>r)&1) rn=10;
686     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
687     {
688       b=j;
689     }
690   }
691   /*
692   if(b>=0)
693   {
694     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
695     {
696       // Follow first branch
697       int o=rn;
698       int t=(ba[i+b]-start)>>2;
699       j=7-b;if(t+j>=slen) j=slen-t-1;
700       for(;j>=0;j--)
701       {
702         if(!((unneeded_reg[t+j]>>r)&1)) {
703           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
704           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
705         }
706         else rn=o;
707       }
708     }
709   }*/
710   if(rn<10) return 1;
711   (void)b;
712   return 0;
713 }
714
715 // Try to match register allocations at the end of a loop with those
716 // at the beginning
717 int loop_reg(int i, int r, int hr)
718 {
719   int j,k;
720   for(j=0;j<9;j++)
721   {
722     if(i+j>=slen) {
723       j=slen-i-1;
724       break;
725     }
726     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
727     {
728       // Don't go past an unconditional jump
729       j++;
730       break;
731     }
732   }
733   k=0;
734   if(i>0){
735     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
736       k--;
737   }
738   for(;k<j;k++)
739   {
740     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
741     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
742     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
743     {
744       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
745       {
746         int t=(ba[i+k]-start)>>2;
747         int reg=get_reg(regs[t].regmap_entry,r);
748         if(reg>=0) return reg;
749         //reg=get_reg(regs[t+1].regmap_entry,r);
750         //if(reg>=0) return reg;
751       }
752     }
753   }
754   return hr;
755 }
756
757
758 // Allocate every register, preserving source/target regs
759 void alloc_all(struct regstat *cur,int i)
760 {
761   int hr;
762
763   for(hr=0;hr<HOST_REGS;hr++) {
764     if(hr!=EXCLUDE_REG) {
765       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
766          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
767       {
768         cur->regmap[hr]=-1;
769         cur->dirty&=~(1<<hr);
770       }
771       // Don't need zeros
772       if((cur->regmap[hr]&63)==0)
773       {
774         cur->regmap[hr]=-1;
775         cur->dirty&=~(1<<hr);
776       }
777     }
778   }
779 }
780
781 #ifdef __i386__
782 #include "x86/assem_x86.c"
783 #endif
784 #ifdef __x86_64__
785 #include "x64/assem_x64.c"
786 #endif
787 #ifdef __arm__
788 #include "arm/assem_arm.c"
789 #endif
790
791 // Add virtual address mapping to linked list
792 void ll_add(struct ll_entry **head,int vaddr,void *addr)
793 {
794   struct ll_entry *new_entry;
795   new_entry=malloc(sizeof(struct ll_entry));
796   assert(new_entry!=NULL);
797   new_entry->vaddr=vaddr;
798   new_entry->reg_sv_flags=0;
799   new_entry->addr=addr;
800   new_entry->next=*head;
801   *head=new_entry;
802 }
803
804 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
805 {
806   ll_add(head,vaddr,addr);
807   (*head)->reg_sv_flags=reg_sv_flags;
808 }
809
810 // Check if an address is already compiled
811 // but don't return addresses which are about to expire from the cache
812 void *check_addr(u_int vaddr)
813 {
814   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
815   if(ht_bin[0]==vaddr) {
816     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
817       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
818   }
819   if(ht_bin[2]==vaddr) {
820     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
821       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
822   }
823   u_int page=get_page(vaddr);
824   struct ll_entry *head;
825   head=jump_in[page];
826   while(head!=NULL) {
827     if(head->vaddr==vaddr) {
828       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
829         // Update existing entry with current address
830         if(ht_bin[0]==vaddr) {
831           ht_bin[1]=(int)head->addr;
832           return head->addr;
833         }
834         if(ht_bin[2]==vaddr) {
835           ht_bin[3]=(int)head->addr;
836           return head->addr;
837         }
838         // Insert into hash table with low priority.
839         // Don't evict existing entries, as they are probably
840         // addresses that are being accessed frequently.
841         if(ht_bin[0]==-1) {
842           ht_bin[1]=(int)head->addr;
843           ht_bin[0]=vaddr;
844         }else if(ht_bin[2]==-1) {
845           ht_bin[3]=(int)head->addr;
846           ht_bin[2]=vaddr;
847         }
848         return head->addr;
849       }
850     }
851     head=head->next;
852   }
853   return 0;
854 }
855
856 void remove_hash(int vaddr)
857 {
858   //printf("remove hash: %x\n",vaddr);
859   u_int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
860   if(ht_bin[2]==vaddr) {
861     ht_bin[2]=ht_bin[3]=-1;
862   }
863   if(ht_bin[0]==vaddr) {
864     ht_bin[0]=ht_bin[2];
865     ht_bin[1]=ht_bin[3];
866     ht_bin[2]=ht_bin[3]=-1;
867   }
868 }
869
870 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
871 {
872   struct ll_entry *next;
873   while(*head) {
874     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) ||
875        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
876     {
877       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
878       remove_hash((*head)->vaddr);
879       next=(*head)->next;
880       free(*head);
881       *head=next;
882     }
883     else
884     {
885       head=&((*head)->next);
886     }
887   }
888 }
889
890 // Remove all entries from linked list
891 void ll_clear(struct ll_entry **head)
892 {
893   struct ll_entry *cur;
894   struct ll_entry *next;
895   if((cur=*head)) {
896     *head=0;
897     while(cur) {
898       next=cur->next;
899       free(cur);
900       cur=next;
901     }
902   }
903 }
904
905 // Dereference the pointers and remove them if they match
906 static void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
907 {
908   while(head) {
909     int ptr=get_pointer(head->addr);
910     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
911     if(((ptr>>shift)==(addr>>shift)) ||
912        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
913     {
914       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
915       void *host_addr=find_extjump_insn(head->addr);
916       #ifdef __arm__
917         mark_clear_cache(host_addr);
918       #endif
919       set_jump_target((int)host_addr,(int)head->addr);
920     }
921     head=head->next;
922   }
923 }
924
925 // This is called when we write to a compiled block (see do_invstub)
926 void invalidate_page(u_int page)
927 {
928   struct ll_entry *head;
929   struct ll_entry *next;
930   head=jump_in[page];
931   jump_in[page]=0;
932   while(head!=NULL) {
933     inv_debug("INVALIDATE: %x\n",head->vaddr);
934     remove_hash(head->vaddr);
935     next=head->next;
936     free(head);
937     head=next;
938   }
939   head=jump_out[page];
940   jump_out[page]=0;
941   while(head!=NULL) {
942     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
943     void *host_addr=find_extjump_insn(head->addr);
944     #ifdef __arm__
945       mark_clear_cache(host_addr);
946     #endif
947     set_jump_target((int)host_addr,(int)head->addr);
948     next=head->next;
949     free(head);
950     head=next;
951   }
952 }
953
954 static void invalidate_block_range(u_int block, u_int first, u_int last)
955 {
956   u_int page=get_page(block<<12);
957   //printf("first=%d last=%d\n",first,last);
958   invalidate_page(page);
959   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
960   assert(last<page+5);
961   // Invalidate the adjacent pages if a block crosses a 4K boundary
962   while(first<page)
963   {
964     invalidate_page(first);
965     first++;
966   }
967   for(first=page+1;first<last;first++)
968   {
969     invalidate_page(first);
970   }
971
972 #ifdef __arm__
973   do_clear_cache();
974 #endif
975
976   // Don't trap writes
977   invalid_code[block]=1;
978
979 #ifdef USE_MINI_HT
980   memset(mini_ht,-1,sizeof(mini_ht));
981 #endif
982 }
983
984 void invalidate_block(u_int block)
985 {
986   u_int page=get_page(block<<12);
987   u_int vpage=get_vpage(block<<12);
988   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
989   u_int first,last;
990   first=last=page;
991   struct ll_entry *head;
992   head=jump_dirty[vpage];
993   //printf("page=%d vpage=%d\n",page,vpage);
994   while(head!=NULL)
995   {
996     u_int start,end;
997     if(vpage>2047||(head->vaddr>>12)==block)
998     { // Ignore vaddr hash collision
999       get_bounds((int)head->addr,&start,&end);
1000       //printf("start: %x end: %x\n",start,end);
1001       if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE)
1002       {
1003         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page)
1004         {
1005           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1006           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1007         }
1008       }
1009     }
1010     head=head->next;
1011   }
1012   invalidate_block_range(block,first,last);
1013 }
1014
1015 void invalidate_addr(u_int addr)
1016 {
1017   //static int rhits;
1018   // this check is done by the caller
1019   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
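  // inv_code_start..inv_code_end cache a range currently known to contain no
  // compiled code, so the caller can skip invalidation for writes inside it.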
1020   u_int page=get_vpage(addr);
1021   if(page<2048) { // RAM
1022     struct ll_entry *head;
1023     u_int addr_min=~0, addr_max=0;
1024     u_int mask=RAM_SIZE-1;
1025     u_int addr_main=0x80000000|(addr&mask);
1026     int pg1;
1027     inv_code_start=addr_main&~0xfff;
1028     inv_code_end=addr_main|0xfff;
1029     pg1=page;
1030     if (pg1>0) {
1031       // must check the previous page too because blocks can span pages
1032       pg1--;
1033       inv_code_start-=0x1000;
1034     }
1035     for(;pg1<=page;pg1++) {
1036       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
1037         u_int start,end;
1038         get_bounds((int)head->addr,&start,&end);
1039         if(ram_offset) {
1040           start-=ram_offset;
1041           end-=ram_offset;
1042         }
1043         if(start<=addr_main&&addr_main<end) {
1044           if(start<addr_min) addr_min=start;
1045           if(end>addr_max) addr_max=end;
1046         }
1047         else if(addr_main<start) {
1048           if(start<inv_code_end)
1049             inv_code_end=start-1;
1050         }
1051         else {
1052           if(end>inv_code_start)
1053             inv_code_start=end;
1054         }
1055       }
1056     }
1057     if (addr_min!=~0) {
1058       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
1059       inv_code_start=inv_code_end=~0;
1060       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
1061       return;
1062     }
1063     else {
1064       inv_code_start=(addr&~mask)|(inv_code_start&mask);
1065       inv_code_end=(addr&~mask)|(inv_code_end&mask);
1066       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
1067       return;
1068     }
1069   }
1070   invalidate_block(addr>>12);
1071 }
1072
1073 // This is called when loading a save state.
1074 // Anything could have changed, so invalidate everything.
1075 void invalidate_all_pages(void)
1076 {
1077   u_int page;
1078   for(page=0;page<4096;page++)
1079     invalidate_page(page);
1080   for(page=0;page<1048576;page++)
1081   {
1082     if(!invalid_code[page])
1083     {
1084       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1085       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1086     }
1087   }
1088
1089 #ifdef USE_MINI_HT
1090   memset(mini_ht,-1,sizeof(mini_ht));
1091 #endif
1092 }
1093
1094 // Add an entry to jump_out after making a link
1095 void add_link(u_int vaddr,void *src)
1096 {
1097   u_int page=get_page(vaddr);
1098   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1099   int *ptr=(int *)(src+4);
1100   assert((*ptr&0x0fff0000)==0x059f0000);
1101   (void)ptr;
1102   ll_add(jump_out+page,vaddr,src);
1103   //int ptr=get_pointer(src);
1104   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1105 }
1106
1107 // If a code block was found to be unmodified (bit was set in
1108 // restore_candidate) and it remains unmodified (bit is clear
1109 // in invalid_code) then move the entries for that 4K page from
1110 // the dirty list to the clean list.
1111 void clean_blocks(u_int page)
1112 {
1113   struct ll_entry *head;
1114   inv_debug("INV: clean_blocks page=%d\n",page);
1115   head=jump_dirty[page];
1116   while(head!=NULL)
1117   {
1118     if(!invalid_code[head->vaddr>>12])
1119     {
1120       // Don't restore blocks which are about to expire from the cache
1121       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1122       {
1123         u_int start,end;
1124         if(verify_dirty(head->addr))
1125         {
1126           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1127           u_int i;
1128           u_int inv=0;
1129           get_bounds((int)head->addr,&start,&end);
1130           if(start-(u_int)rdram<RAM_SIZE)
1131           {
1132             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++)
1133             {
1134               inv|=invalid_code[i];
1135             }
1136           }
1137           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE)
1138           {
1139             inv=1;
1140           }
1141           if(!inv)
1142           {
1143             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1144             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1145             {
1146               u_int ppage=page;
1147               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1148               //printf("page=%x, addr=%x\n",page,head->vaddr);
1149               //assert(head->vaddr>>12==(page|0x80000));
1150               ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
1151               u_int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1152               if(ht_bin[0]==head->vaddr)
1153               {
1154                 ht_bin[1]=(u_int)clean_addr; // Replace existing entry
1155               }
1156               if(ht_bin[2]==head->vaddr)
1157               {
1158                 ht_bin[3]=(u_int)clean_addr; // Replace existing entry
1159               }
1160             }
1161           }
1162         }
1163       }
1164     }
1165     head=head->next;
1166   }
1167 }
1168
1169 static void mov_alloc(struct regstat *current,int i)
1170 {
1171   // Note: Don't need to actually alloc the source registers
1172   if((~current->is32>>rs1[i])&1)
1173   {
1174     //alloc_reg64(current,i,rs1[i]);
1175     alloc_reg64(current,i,rt1[i]);
1176     current->is32&=~(1LL<<rt1[i]);
1177   }
1178   else
1179   {
1180     //alloc_reg(current,i,rs1[i]);
1181     alloc_reg(current,i,rt1[i]);
1182     current->is32|=(1LL<<rt1[i]);
1183   }
1184   clear_const(current,rs1[i]);
1185   clear_const(current,rt1[i]);
1186   dirty_reg(current,rt1[i]);
1187 }
1188
1189 void shiftimm_alloc(struct regstat *current,int i)
1190 {
1191   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1192   {
1193     if(rt1[i]) {
1194       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1195       else lt1[i]=rs1[i];
1196       alloc_reg(current,i,rt1[i]);
1197       current->is32|=1LL<<rt1[i];
1198       dirty_reg(current,rt1[i]);
1199       if(is_const(current,rs1[i])) {
1200         int v=get_const(current,rs1[i]);
1201         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1202         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1203         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1204       }
1205       else clear_const(current,rt1[i]);
1206     }
1207   }
1208   else
1209   {
1210     clear_const(current,rs1[i]);
1211     clear_const(current,rt1[i]);
1212   }
1213
1214   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1215   {
1216     if(rt1[i]) {
1217       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1218       alloc_reg64(current,i,rt1[i]);
1219       current->is32&=~(1LL<<rt1[i]);
1220       dirty_reg(current,rt1[i]);
1221     }
1222   }
1223   if(opcode2[i]==0x3c) // DSLL32
1224   {
1225     if(rt1[i]) {
1226       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1227       alloc_reg64(current,i,rt1[i]);
1228       current->is32&=~(1LL<<rt1[i]);
1229       dirty_reg(current,rt1[i]);
1230     }
1231   }
1232   if(opcode2[i]==0x3e) // DSRL32
1233   {
1234     if(rt1[i]) {
1235       alloc_reg64(current,i,rs1[i]);
1236       if(imm[i]==32) {
1237         alloc_reg64(current,i,rt1[i]);
1238         current->is32&=~(1LL<<rt1[i]);
1239       } else {
1240         alloc_reg(current,i,rt1[i]);
1241         current->is32|=1LL<<rt1[i];
1242       }
1243       dirty_reg(current,rt1[i]);
1244     }
1245   }
1246   if(opcode2[i]==0x3f) // DSRA32
1247   {
1248     if(rt1[i]) {
1249       alloc_reg64(current,i,rs1[i]);
1250       alloc_reg(current,i,rt1[i]);
1251       current->is32|=1LL<<rt1[i];
1252       dirty_reg(current,rt1[i]);
1253     }
1254   }
1255 }
1256
1257 void shift_alloc(struct regstat *current,int i)
1258 {
1259   if(rt1[i]) {
1260     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1261     {
1262       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1263       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1264       alloc_reg(current,i,rt1[i]);
1265       if(rt1[i]==rs2[i]) {
1266         alloc_reg_temp(current,i,-1);
1267         minimum_free_regs[i]=1;
1268       }
1269       current->is32|=1LL<<rt1[i];
1270     } else { // DSLLV/DSRLV/DSRAV
1271       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1272       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1273       alloc_reg64(current,i,rt1[i]);
1274       current->is32&=~(1LL<<rt1[i]);
1275       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1276       {
1277         alloc_reg_temp(current,i,-1);
1278         minimum_free_regs[i]=1;
1279       }
1280     }
1281     clear_const(current,rs1[i]);
1282     clear_const(current,rs2[i]);
1283     clear_const(current,rt1[i]);
1284     dirty_reg(current,rt1[i]);
1285   }
1286 }
1287
1288 void alu_alloc(struct regstat *current,int i)
1289 {
1290   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1291     if(rt1[i]) {
1292       if(rs1[i]&&rs2[i]) {
1293         alloc_reg(current,i,rs1[i]);
1294         alloc_reg(current,i,rs2[i]);
1295       }
1296       else {
1297         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1298         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1299       }
1300       alloc_reg(current,i,rt1[i]);
1301     }
1302     current->is32|=1LL<<rt1[i];
1303   }
1304   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1305     if(rt1[i]) {
1306       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1307       {
1308         alloc_reg64(current,i,rs1[i]);
1309         alloc_reg64(current,i,rs2[i]);
1310         alloc_reg(current,i,rt1[i]);
1311       } else {
1312         alloc_reg(current,i,rs1[i]);
1313         alloc_reg(current,i,rs2[i]);
1314         alloc_reg(current,i,rt1[i]);
1315       }
1316     }
1317     current->is32|=1LL<<rt1[i];
1318   }
1319   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1320     if(rt1[i]) {
1321       if(rs1[i]&&rs2[i]) {
1322         alloc_reg(current,i,rs1[i]);
1323         alloc_reg(current,i,rs2[i]);
1324       }
1325       else
1326       {
1327         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1328         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1329       }
1330       alloc_reg(current,i,rt1[i]);
1331       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1332       {
1333         if(!((current->uu>>rt1[i])&1)) {
1334           alloc_reg64(current,i,rt1[i]);
1335         }
1336         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1337           if(rs1[i]&&rs2[i]) {
1338             alloc_reg64(current,i,rs1[i]);
1339             alloc_reg64(current,i,rs2[i]);
1340           }
1341           else
1342           {
1343             // Is it really worth it to keep 64-bit values in registers?
1344             #ifdef NATIVE_64BIT
1345             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1346             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1347             #endif
1348           }
1349         }
1350         current->is32&=~(1LL<<rt1[i]);
1351       } else {
1352         current->is32|=1LL<<rt1[i];
1353       }
1354     }
1355   }
1356   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1357     if(rt1[i]) {
1358       if(rs1[i]&&rs2[i]) {
1359         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1360           alloc_reg64(current,i,rs1[i]);
1361           alloc_reg64(current,i,rs2[i]);
1362           alloc_reg64(current,i,rt1[i]);
1363         } else {
1364           alloc_reg(current,i,rs1[i]);
1365           alloc_reg(current,i,rs2[i]);
1366           alloc_reg(current,i,rt1[i]);
1367         }
1368       }
1369       else {
1370         alloc_reg(current,i,rt1[i]);
1371         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1372           // DADD used as move, or zeroing
1373           // If we have a 64-bit source, then make the target 64 bits too
1374           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1375             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1376             alloc_reg64(current,i,rt1[i]);
1377           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1378             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1379             alloc_reg64(current,i,rt1[i]);
1380           }
1381           if(opcode2[i]>=0x2e&&rs2[i]) {
1382             // DSUB used as negation - 64-bit result
1383             // If we have a 32-bit register, extend it to 64 bits
1384             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1385             alloc_reg64(current,i,rt1[i]);
1386           }
1387         }
1388       }
1389       if(rs1[i]&&rs2[i]) {
1390         current->is32&=~(1LL<<rt1[i]);
1391       } else if(rs1[i]) {
1392         current->is32&=~(1LL<<rt1[i]);
1393         if((current->is32>>rs1[i])&1)
1394           current->is32|=1LL<<rt1[i];
1395       } else if(rs2[i]) {
1396         current->is32&=~(1LL<<rt1[i]);
1397         if((current->is32>>rs2[i])&1)
1398           current->is32|=1LL<<rt1[i];
1399       } else {
1400         current->is32|=1LL<<rt1[i];
1401       }
1402     }
1403   }
1404   clear_const(current,rs1[i]);
1405   clear_const(current,rs2[i]);
1406   clear_const(current,rt1[i]);
1407   dirty_reg(current,rt1[i]);
1408 }
1409
1410 void imm16_alloc(struct regstat *current,int i)
1411 {
1412   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1413   else lt1[i]=rs1[i];
1414   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1415   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1416     current->is32&=~(1LL<<rt1[i]);
1417     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1418       // TODO: Could preserve the 32-bit flag if the immediate is zero
1419       alloc_reg64(current,i,rt1[i]);
1420       alloc_reg64(current,i,rs1[i]);
1421     }
1422     clear_const(current,rs1[i]);
1423     clear_const(current,rt1[i]);
1424   }
1425   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1426     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1427     current->is32|=1LL<<rt1[i];
1428     clear_const(current,rs1[i]);
1429     clear_const(current,rt1[i]);
1430   }
1431   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1432     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1433       if(rs1[i]!=rt1[i]) {
1434         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1435         alloc_reg64(current,i,rt1[i]);
1436         current->is32&=~(1LL<<rt1[i]);
1437       }
1438     }
1439     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1440     if(is_const(current,rs1[i])) {
1441       int v=get_const(current,rs1[i]);
1442       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1443       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1444       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1445     }
1446     else clear_const(current,rt1[i]);
1447   }
1448   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1449     if(is_const(current,rs1[i])) {
1450       int v=get_const(current,rs1[i]);
1451       set_const(current,rt1[i],v+imm[i]);
1452     }
1453     else clear_const(current,rt1[i]);
1454     current->is32|=1LL<<rt1[i];
1455   }
1456   else {
1457     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1458     current->is32|=1LL<<rt1[i];
1459   }
1460   dirty_reg(current,rt1[i]);
1461 }
1462
1463 void load_alloc(struct regstat *current,int i)
1464 {
1465   clear_const(current,rt1[i]);
1466   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1467   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1468   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1469   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1470     alloc_reg(current,i,rt1[i]);
1471     assert(get_reg(current->regmap,rt1[i])>=0);
1472     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1473     {
1474       current->is32&=~(1LL<<rt1[i]);
1475       alloc_reg64(current,i,rt1[i]);
1476     }
1477     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1478     {
1479       current->is32&=~(1LL<<rt1[i]);
1480       alloc_reg64(current,i,rt1[i]);
1481       alloc_all(current,i);
1482       alloc_reg64(current,i,FTEMP);
1483       minimum_free_regs[i]=HOST_REGS;
1484     }
1485     else current->is32|=1LL<<rt1[i];
1486     dirty_reg(current,rt1[i]);
1487     // LWL/LWR need a temporary register for the old value
1488     if(opcode[i]==0x22||opcode[i]==0x26)
1489     {
1490       alloc_reg(current,i,FTEMP);
1491       alloc_reg_temp(current,i,-1);
1492       minimum_free_regs[i]=1;
1493     }
1494   }
1495   else
1496   {
1497     // Load to r0 or unneeded register (dummy load)
1498     // but we still need a register to calculate the address
1499     if(opcode[i]==0x22||opcode[i]==0x26)
1500     {
1501       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1502     }
1503     alloc_reg_temp(current,i,-1);
1504     minimum_free_regs[i]=1;
1505     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1506     {
1507       alloc_all(current,i);
1508       alloc_reg64(current,i,FTEMP);
1509       minimum_free_regs[i]=HOST_REGS;
1510     }
1511   }
1512 }
1513
1514 void store_alloc(struct regstat *current,int i)
1515 {
1516   clear_const(current,rs2[i]);
1517   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1518   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1519   alloc_reg(current,i,rs2[i]);
1520   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1521     alloc_reg64(current,i,rs2[i]);
1522     if(rs2[i]) alloc_reg(current,i,FTEMP);
1523   }
1524   #if defined(HOST_IMM8)
1525   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1526   else alloc_reg(current,i,INVCP);
1527   #endif
1528   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1529     alloc_reg(current,i,FTEMP);
1530   }
1531   // We need a temporary register for address generation
1532   alloc_reg_temp(current,i,-1);
1533   minimum_free_regs[i]=1;
1534 }
1535
1536 void c1ls_alloc(struct regstat *current,int i)
1537 {
1538   //clear_const(current,rs1[i]); // FIXME
1539   clear_const(current,rt1[i]);
1540   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1541   alloc_reg(current,i,CSREG); // Status
1542   alloc_reg(current,i,FTEMP);
1543   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1544     alloc_reg64(current,i,FTEMP);
1545   }
1546   #if defined(HOST_IMM8)
1547   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1548   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1549     alloc_reg(current,i,INVCP);
1550   #endif
1551   // We need a temporary register for address generation
1552   alloc_reg_temp(current,i,-1);
1553 }
1554
1555 void c2ls_alloc(struct regstat *current,int i)
1556 {
1557   clear_const(current,rt1[i]);
1558   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1559   alloc_reg(current,i,FTEMP);
1560   #if defined(HOST_IMM8)
1561   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1562   if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1563     alloc_reg(current,i,INVCP);
1564   #endif
1565   // We need a temporary register for address generation
1566   alloc_reg_temp(current,i,-1);
1567   minimum_free_regs[i]=1;
1568 }
1569
1570 #ifndef multdiv_alloc
1571 void multdiv_alloc(struct regstat *current,int i)
1572 {
1573   //  case 0x18: MULT
1574   //  case 0x19: MULTU
1575   //  case 0x1A: DIV
1576   //  case 0x1B: DIVU
1577   //  case 0x1C: DMULT
1578   //  case 0x1D: DMULTU
1579   //  case 0x1E: DDIV
1580   //  case 0x1F: DDIVU
1581   clear_const(current,rs1[i]);
1582   clear_const(current,rs2[i]);
1583   if(rs1[i]&&rs2[i])
1584   {
1585     if((opcode2[i]&4)==0) // 32-bit
1586     {
1587       current->u&=~(1LL<<HIREG);
1588       current->u&=~(1LL<<LOREG);
1589       alloc_reg(current,i,HIREG);
1590       alloc_reg(current,i,LOREG);
1591       alloc_reg(current,i,rs1[i]);
1592       alloc_reg(current,i,rs2[i]);
1593       current->is32|=1LL<<HIREG;
1594       current->is32|=1LL<<LOREG;
1595       dirty_reg(current,HIREG);
1596       dirty_reg(current,LOREG);
1597     }
1598     else // 64-bit
1599     {
1600       current->u&=~(1LL<<HIREG);
1601       current->u&=~(1LL<<LOREG);
1602       current->uu&=~(1LL<<HIREG);
1603       current->uu&=~(1LL<<LOREG);
1604       alloc_reg64(current,i,HIREG);
1605       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1606       alloc_reg64(current,i,rs1[i]);
1607       alloc_reg64(current,i,rs2[i]);
1608       alloc_all(current,i);
1609       current->is32&=~(1LL<<HIREG);
1610       current->is32&=~(1LL<<LOREG);
1611       dirty_reg(current,HIREG);
1612       dirty_reg(current,LOREG);
1613       minimum_free_regs[i]=HOST_REGS;
1614     }
1615   }
1616   else
1617   {
1618     // Multiply by zero is zero.
1619     // MIPS does not have a divide by zero exception.
1620     // The result is undefined, so we return zero.
1621     alloc_reg(current,i,HIREG);
1622     alloc_reg(current,i,LOREG);
1623     current->is32|=1LL<<HIREG;
1624     current->is32|=1LL<<LOREG;
1625     dirty_reg(current,HIREG);
1626     dirty_reg(current,LOREG);
1627   }
1628 }
1629 #endif
1630
1631 void cop0_alloc(struct regstat *current,int i)
1632 {
1633   if(opcode2[i]==0) // MFC0
1634   {
1635     if(rt1[i]) {
1636       clear_const(current,rt1[i]);
1637       alloc_all(current,i);
1638       alloc_reg(current,i,rt1[i]);
1639       current->is32|=1LL<<rt1[i];
1640       dirty_reg(current,rt1[i]);
1641     }
1642   }
1643   else if(opcode2[i]==4) // MTC0
1644   {
1645     if(rs1[i]){
1646       clear_const(current,rs1[i]);
1647       alloc_reg(current,i,rs1[i]);
1648       alloc_all(current,i);
1649     }
1650     else {
1651       alloc_all(current,i); // FIXME: Keep r0
1652       current->u&=~1LL;
1653       alloc_reg(current,i,0);
1654     }
1655   }
1656   else
1657   {
1658     // TLBR/TLBWI/TLBWR/TLBP/ERET
1659     assert(opcode2[i]==0x10);
1660     alloc_all(current,i);
1661   }
1662   minimum_free_regs[i]=HOST_REGS;
1663 }
1664
1665 void cop1_alloc(struct regstat *current,int i)
1666 {
1667   alloc_reg(current,i,CSREG); // Load status
1668   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1669   {
1670     if(rt1[i]){
1671       clear_const(current,rt1[i]);
1672       if(opcode2[i]==1) {
1673         alloc_reg64(current,i,rt1[i]); // DMFC1
1674         current->is32&=~(1LL<<rt1[i]);
1675       }else{
1676         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1677         current->is32|=1LL<<rt1[i];
1678       }
1679       dirty_reg(current,rt1[i]);
1680     }
1681     alloc_reg_temp(current,i,-1);
1682   }
1683   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1684   {
1685     if(rs1[i]){
1686       clear_const(current,rs1[i]);
1687       if(opcode2[i]==5)
1688         alloc_reg64(current,i,rs1[i]); // DMTC1
1689       else
1690         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1691       alloc_reg_temp(current,i,-1);
1692     }
1693     else {
1694       current->u&=~1LL;
1695       alloc_reg(current,i,0);
1696       alloc_reg_temp(current,i,-1);
1697     }
1698   }
1699   minimum_free_regs[i]=1;
1700 }
1701 void fconv_alloc(struct regstat *current,int i)
1702 {
1703   alloc_reg(current,i,CSREG); // Load status
1704   alloc_reg_temp(current,i,-1);
1705   minimum_free_regs[i]=1;
1706 }
1707 void float_alloc(struct regstat *current,int i)
1708 {
1709   alloc_reg(current,i,CSREG); // Load status
1710   alloc_reg_temp(current,i,-1);
1711   minimum_free_regs[i]=1;
1712 }
1713 void c2op_alloc(struct regstat *current,int i)
1714 {
1715   alloc_reg_temp(current,i,-1);
1716 }
1717 void fcomp_alloc(struct regstat *current,int i)
1718 {
1719   alloc_reg(current,i,CSREG); // Load status
1720   alloc_reg(current,i,FSREG); // Load flags
1721   dirty_reg(current,FSREG); // Flag will be modified
1722   alloc_reg_temp(current,i,-1);
1723   minimum_free_regs[i]=1;
1724 }
1725
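// SYSCALL exits the translated code, so the cycle counter (CCREG) is allocated and marked
// dirty and all host registers are made spillable via alloc_all().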
1726 void syscall_alloc(struct regstat *current,int i)
1727 {
1728   alloc_cc(current,i);
1729   dirty_reg(current,CCREG);
1730   alloc_all(current,i);
1731   minimum_free_regs[i]=HOST_REGS;
1732   current->isconst=0;
1733 }
1734
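// Register allocation for the instruction in a branch delay slot: dispatch to the normal
// per-type allocator.  A branch in the delay slot is not supported; seeing one sets
// stop_after_jal and disables speculative precompilation.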
1735 void delayslot_alloc(struct regstat *current,int i)
1736 {
1737   switch(itype[i])
1738   {
1739     case UJUMP:
1740     case CJUMP:
1741     case SJUMP:
1742     case RJUMP:
1743     case FJUMP:
1744     case SYSCALL:
1745     case HLECALL:
1746     case SPAN:
1747       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1748       SysPrintf("Disabled speculative precompilation\n");
1749       stop_after_jal=1;
1750       break;
1751     case IMM16:
1752       imm16_alloc(current,i);
1753       break;
1754     case LOAD:
1755     case LOADLR:
1756       load_alloc(current,i);
1757       break;
1758     case STORE:
1759     case STORELR:
1760       store_alloc(current,i);
1761       break;
1762     case ALU:
1763       alu_alloc(current,i);
1764       break;
1765     case SHIFT:
1766       shift_alloc(current,i);
1767       break;
1768     case MULTDIV:
1769       multdiv_alloc(current,i);
1770       break;
1771     case SHIFTIMM:
1772       shiftimm_alloc(current,i);
1773       break;
1774     case MOV:
1775       mov_alloc(current,i);
1776       break;
1777     case COP0:
1778       cop0_alloc(current,i);
1779       break;
1780     case COP1:
1781     case COP2:
1782       cop1_alloc(current,i);
1783       break;
1784     case C1LS:
1785       c1ls_alloc(current,i);
1786       break;
1787     case C2LS:
1788       c2ls_alloc(current,i);
1789       break;
1790     case FCONV:
1791       fconv_alloc(current,i);
1792       break;
1793     case FLOAT:
1794       float_alloc(current,i);
1795       break;
1796     case FCOMP:
1797       fcomp_alloc(current,i);
1798       break;
1799     case C2OP:
1800       c2op_alloc(current,i);
1801       break;
1802   }
1803 }
1804
1805 // Special case where a branch and delay slot span two pages in virtual memory
1806 static void pagespan_alloc(struct regstat *current,int i)
1807 {
1808   current->isconst=0;
1809   current->wasconst=0;
1810   regs[i].wasconst=0;
1811   minimum_free_regs[i]=HOST_REGS;
1812   alloc_all(current,i);
1813   alloc_cc(current,i);
1814   dirty_reg(current,CCREG);
1815   if(opcode[i]==3) // JAL
1816   {
1817     alloc_reg(current,i,31);
1818     dirty_reg(current,31);
1819   }
1820   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1821   {
1822     alloc_reg(current,i,rs1[i]);
1823     if (rt1[i]!=0) {
1824       alloc_reg(current,i,rt1[i]);
1825       dirty_reg(current,rt1[i]);
1826     }
1827   }
1828   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1829   {
1830     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1831     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1832     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1833     {
1834       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1835       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1836     }
1837   }
1838   else
1839   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1840   {
1841     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1842     if(!((current->is32>>rs1[i])&1))
1843     {
1844       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1845     }
1846   }
1847   else
1848   if(opcode[i]==0x11) // BC1
1849   {
1850     alloc_reg(current,i,FSREG);
1851     alloc_reg(current,i,CSREG);
1852   }
1853   //else ...
1854 }
1855
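// Record an out-of-line stub to be emitted later: 'type' selects the stub handler, 'addr'
// is the branch instruction to patch, 'retaddr' is where the stub returns to, and a..e are
// handler-specific arguments.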
1856 static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1857 {
1858   stubs[stubcount][0]=type;
1859   stubs[stubcount][1]=addr;
1860   stubs[stubcount][2]=retaddr;
1861   stubs[stubcount][3]=a;
1862   stubs[stubcount][4]=b;
1863   stubs[stubcount][5]=c;
1864   stubs[stubcount][6]=d;
1865   stubs[stubcount][7]=e;
1866   stubcount++;
1867 }
1868
1869 // Write out a single register
1870 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1871 {
1872   int hr;
1873   for(hr=0;hr<HOST_REGS;hr++) {
1874     if(hr!=EXCLUDE_REG) {
1875       if((regmap[hr]&63)==r) {
1876         if((dirty>>hr)&1) {
1877           if(regmap[hr]<64) {
1878             emit_storereg(r,hr);
1879           }else{
1880             emit_storereg(r|64,hr);
1881           }
1882         }
1883       }
1884     }
1885   }
1886 }
1887
1888 #if 0
1889 static int mchecksum(void)
1890 {
1891   //if(!tracedebug) return 0;
1892   int i;
1893   int sum=0;
1894   for(i=0;i<2097152;i++) {
1895     unsigned int temp=sum;
1896     sum<<=1;
1897     sum|=(~temp)>>31;
1898     sum^=((u_int *)rdram)[i];
1899   }
1900   return sum;
1901 }
1902
1903 static int rchecksum(void)
1904 {
1905   int i;
1906   int sum=0;
1907   for(i=0;i<64;i++)
1908     sum^=((u_int *)reg)[i];
1909   return sum;
1910 }
1911
1912 static void rlist(void)
1913 {
1914   int i;
1915   printf("TRACE: ");
1916   for(i=0;i<32;i++)
1917     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1918   printf("\n");
1919 }
1920
1921 static void enabletrace(void)
1922 {
1923   tracedebug=1;
1924 }
1925
1926 static void memdebug(int i)
1927 {
1928   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
1929   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
1930   //rlist();
1931   //if(tracedebug) {
1932   //if(Count>=-2084597794) {
1933   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
1934   //if(0) {
1935     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
1936     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
1937     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
1938     rlist();
1939     #ifdef __i386__
1940     printf("TRACE: %x\n",(&i)[-1]);
1941     #endif
1942     #ifdef __arm__
1943     int j;
1944     printf("TRACE: %x \n",(&j)[10]);
1945     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
1946     #endif
1947     //fflush(stdout);
1948   }
1949   //printf("TRACE: %x\n",(&i)[-1]);
1950 }
1951 #endif
1952
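// Emit code for the SPECIAL-opcode ALU group.  opcode2[i] (the function field) selects
// ADD/ADDU/SUB/SUBU, the 64-bit DADD/DSUB forms, SLT/SLTU or AND/OR/XOR/NOR; a missing
// source register is treated as zero (r0).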
1953 void alu_assemble(int i,struct regstat *i_regs)
1954 {
1955   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1956     if(rt1[i]) {
1957       signed char s1,s2,t;
1958       t=get_reg(i_regs->regmap,rt1[i]);
1959       if(t>=0) {
1960         s1=get_reg(i_regs->regmap,rs1[i]);
1961         s2=get_reg(i_regs->regmap,rs2[i]);
1962         if(rs1[i]&&rs2[i]) {
1963           assert(s1>=0);
1964           assert(s2>=0);
1965           if(opcode2[i]&2) emit_sub(s1,s2,t);
1966           else emit_add(s1,s2,t);
1967         }
1968         else if(rs1[i]) {
1969           if(s1>=0) emit_mov(s1,t);
1970           else emit_loadreg(rs1[i],t);
1971         }
1972         else if(rs2[i]) {
1973           if(s2>=0) {
1974             if(opcode2[i]&2) emit_neg(s2,t);
1975             else emit_mov(s2,t);
1976           }
1977           else {
1978             emit_loadreg(rs2[i],t);
1979             if(opcode2[i]&2) emit_neg(t,t);
1980           }
1981         }
1982         else emit_zeroreg(t);
1983       }
1984     }
1985   }
1986   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1987     if(rt1[i]) {
1988       signed char s1l,s2l,s1h,s2h,tl,th;
1989       tl=get_reg(i_regs->regmap,rt1[i]);
1990       th=get_reg(i_regs->regmap,rt1[i]|64);
1991       if(tl>=0) {
1992         s1l=get_reg(i_regs->regmap,rs1[i]);
1993         s2l=get_reg(i_regs->regmap,rs2[i]);
1994         s1h=get_reg(i_regs->regmap,rs1[i]|64);
1995         s2h=get_reg(i_regs->regmap,rs2[i]|64);
1996         if(rs1[i]&&rs2[i]) {
1997           assert(s1l>=0);
1998           assert(s2l>=0);
1999           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2000           else emit_adds(s1l,s2l,tl);
2001           if(th>=0) {
2002             #ifdef INVERTED_CARRY
2003             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2004             #else
2005             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2006             #endif
2007             else emit_add(s1h,s2h,th);
2008           }
2009         }
2010         else if(rs1[i]) {
2011           if(s1l>=0) emit_mov(s1l,tl);
2012           else emit_loadreg(rs1[i],tl);
2013           if(th>=0) {
2014             if(s1h>=0) emit_mov(s1h,th);
2015             else emit_loadreg(rs1[i]|64,th);
2016           }
2017         }
2018         else if(rs2[i]) {
2019           if(s2l>=0) {
2020             if(opcode2[i]&2) emit_negs(s2l,tl);
2021             else emit_mov(s2l,tl);
2022           }
2023           else {
2024             emit_loadreg(rs2[i],tl);
2025             if(opcode2[i]&2) emit_negs(tl,tl);
2026           }
2027           if(th>=0) {
2028             #ifdef INVERTED_CARRY
2029             if(s2h>=0) emit_mov(s2h,th);
2030             else emit_loadreg(rs2[i]|64,th);
2031             if(opcode2[i]&2) {
2032               emit_adcimm(-1,th); // x86 has inverted carry flag
2033               emit_not(th,th);
2034             }
2035             #else
2036             if(opcode2[i]&2) {
2037               if(s2h>=0) emit_rscimm(s2h,0,th);
2038               else {
2039                 emit_loadreg(rs2[i]|64,th);
2040                 emit_rscimm(th,0,th);
2041               }
2042             }else{
2043               if(s2h>=0) emit_mov(s2h,th);
2044               else emit_loadreg(rs2[i]|64,th);
2045             }
2046             #endif
2047           }
2048         }
2049         else {
2050           emit_zeroreg(tl);
2051           if(th>=0) emit_zeroreg(th);
2052         }
2053       }
2054     }
2055   }
2056   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2057     if(rt1[i]) {
2058       signed char s1l,s1h,s2l,s2h,t;
2059       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2060       {
2061         t=get_reg(i_regs->regmap,rt1[i]);
2062         //assert(t>=0);
2063         if(t>=0) {
2064           s1l=get_reg(i_regs->regmap,rs1[i]);
2065           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2066           s2l=get_reg(i_regs->regmap,rs2[i]);
2067           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2068           if(rs2[i]==0) // rx<r0
2069           {
2070             assert(s1h>=0);
2071             if(opcode2[i]==0x2a) // SLT
2072               emit_shrimm(s1h,31,t);
2073             else // SLTU (unsigned cannot be less than zero)
2074               emit_zeroreg(t);
2075           }
2076           else if(rs1[i]==0) // r0<rx
2077           {
2078             assert(s2h>=0);
2079             if(opcode2[i]==0x2a) // SLT
2080               emit_set_gz64_32(s2h,s2l,t);
2081             else // SLTU (set if not zero)
2082               emit_set_nz64_32(s2h,s2l,t);
2083           }
2084           else {
2085             assert(s1l>=0);assert(s1h>=0);
2086             assert(s2l>=0);assert(s2h>=0);
2087             if(opcode2[i]==0x2a) // SLT
2088               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2089             else // SLTU
2090               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2091           }
2092         }
2093       } else {
2094         t=get_reg(i_regs->regmap,rt1[i]);
2095         //assert(t>=0);
2096         if(t>=0) {
2097           s1l=get_reg(i_regs->regmap,rs1[i]);
2098           s2l=get_reg(i_regs->regmap,rs2[i]);
2099           if(rs2[i]==0) // rx<r0
2100           {
2101             assert(s1l>=0);
2102             if(opcode2[i]==0x2a) // SLT
2103               emit_shrimm(s1l,31,t);
2104             else // SLTU (unsigned cannot be less than zero)
2105               emit_zeroreg(t);
2106           }
2107           else if(rs1[i]==0) // r0<rx
2108           {
2109             assert(s2l>=0);
2110             if(opcode2[i]==0x2a) // SLT
2111               emit_set_gz32(s2l,t);
2112             else // SLTU (set if not zero)
2113               emit_set_nz32(s2l,t);
2114           }
2115           else{
2116             assert(s1l>=0);assert(s2l>=0);
2117             if(opcode2[i]==0x2a) // SLT
2118               emit_set_if_less32(s1l,s2l,t);
2119             else // SLTU
2120               emit_set_if_carry32(s1l,s2l,t);
2121           }
2122         }
2123       }
2124     }
2125   }
2126   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2127     if(rt1[i]) {
2128       signed char s1l,s1h,s2l,s2h,th,tl;
2129       tl=get_reg(i_regs->regmap,rt1[i]);
2130       th=get_reg(i_regs->regmap,rt1[i]|64);
2131       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2132       {
2133         assert(tl>=0);
2134         if(tl>=0) {
2135           s1l=get_reg(i_regs->regmap,rs1[i]);
2136           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2137           s2l=get_reg(i_regs->regmap,rs2[i]);
2138           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2139           if(rs1[i]&&rs2[i]) {
2140             assert(s1l>=0);assert(s1h>=0);
2141             assert(s2l>=0);assert(s2h>=0);
2142             if(opcode2[i]==0x24) { // AND
2143               emit_and(s1l,s2l,tl);
2144               emit_and(s1h,s2h,th);
2145             } else
2146             if(opcode2[i]==0x25) { // OR
2147               emit_or(s1l,s2l,tl);
2148               emit_or(s1h,s2h,th);
2149             } else
2150             if(opcode2[i]==0x26) { // XOR
2151               emit_xor(s1l,s2l,tl);
2152               emit_xor(s1h,s2h,th);
2153             } else
2154             if(opcode2[i]==0x27) { // NOR
2155               emit_or(s1l,s2l,tl);
2156               emit_or(s1h,s2h,th);
2157               emit_not(tl,tl);
2158               emit_not(th,th);
2159             }
2160           }
2161           else
2162           {
2163             if(opcode2[i]==0x24) { // AND
2164               emit_zeroreg(tl);
2165               emit_zeroreg(th);
2166             } else
2167             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2168               if(rs1[i]){
2169                 if(s1l>=0) emit_mov(s1l,tl);
2170                 else emit_loadreg(rs1[i],tl);
2171                 if(s1h>=0) emit_mov(s1h,th);
2172                 else emit_loadreg(rs1[i]|64,th);
2173               }
2174               else
2175               if(rs2[i]){
2176                 if(s2l>=0) emit_mov(s2l,tl);
2177                 else emit_loadreg(rs2[i],tl);
2178                 if(s2h>=0) emit_mov(s2h,th);
2179                 else emit_loadreg(rs2[i]|64,th);
2180               }
2181               else{
2182                 emit_zeroreg(tl);
2183                 emit_zeroreg(th);
2184               }
2185             } else
2186             if(opcode2[i]==0x27) { // NOR
2187               if(rs1[i]){
2188                 if(s1l>=0) emit_not(s1l,tl);
2189                 else{
2190                   emit_loadreg(rs1[i],tl);
2191                   emit_not(tl,tl);
2192                 }
2193                 if(s1h>=0) emit_not(s1h,th);
2194                 else{
2195                   emit_loadreg(rs1[i]|64,th);
2196                   emit_not(th,th);
2197                 }
2198               }
2199               else
2200               if(rs2[i]){
2201                 if(s2l>=0) emit_not(s2l,tl);
2202                 else{
2203                   emit_loadreg(rs2[i],tl);
2204                   emit_not(tl,tl);
2205                 }
2206                 if(s2h>=0) emit_not(s2h,th);
2207                 else{
2208                   emit_loadreg(rs2[i]|64,th);
2209                   emit_not(th,th);
2210                 }
2211               }
2212               else {
2213                 emit_movimm(-1,tl);
2214                 emit_movimm(-1,th);
2215               }
2216             }
2217           }
2218         }
2219       }
2220       else
2221       {
2222         // 32 bit
2223         if(tl>=0) {
2224           s1l=get_reg(i_regs->regmap,rs1[i]);
2225           s2l=get_reg(i_regs->regmap,rs2[i]);
2226           if(rs1[i]&&rs2[i]) {
2227             assert(s1l>=0);
2228             assert(s2l>=0);
2229             if(opcode2[i]==0x24) { // AND
2230               emit_and(s1l,s2l,tl);
2231             } else
2232             if(opcode2[i]==0x25) { // OR
2233               emit_or(s1l,s2l,tl);
2234             } else
2235             if(opcode2[i]==0x26) { // XOR
2236               emit_xor(s1l,s2l,tl);
2237             } else
2238             if(opcode2[i]==0x27) { // NOR
2239               emit_or(s1l,s2l,tl);
2240               emit_not(tl,tl);
2241             }
2242           }
2243           else
2244           {
2245             if(opcode2[i]==0x24) { // AND
2246               emit_zeroreg(tl);
2247             } else
2248             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2249               if(rs1[i]){
2250                 if(s1l>=0) emit_mov(s1l,tl);
2251                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2252               }
2253               else
2254               if(rs2[i]){
2255                 if(s2l>=0) emit_mov(s2l,tl);
2256                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2257               }
2258               else emit_zeroreg(tl);
2259             } else
2260             if(opcode2[i]==0x27) { // NOR
2261               if(rs1[i]){
2262                 if(s1l>=0) emit_not(s1l,tl);
2263                 else {
2264                   emit_loadreg(rs1[i],tl);
2265                   emit_not(tl,tl);
2266                 }
2267               }
2268               else
2269               if(rs2[i]){
2270                 if(s2l>=0) emit_not(s2l,tl);
2271                 else {
2272                   emit_loadreg(rs2[i],tl);
2273                   emit_not(tl,tl);
2274                 }
2275               }
2276               else emit_movimm(-1,tl);
2277             }
2278           }
2279         }
2280       }
2281     }
2282   }
2283 }
2284
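// Emit code for immediate ALU instructions (LUI, ADDI(U), DADDI(U), SLTI(U), ANDI/ORI/XORI).
// When the source register held a known constant, the result is computed at translation
// time from constmap[] and loaded with a single move instead of an ALU op.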
2285 void imm16_assemble(int i,struct regstat *i_regs)
2286 {
2287   if (opcode[i]==0x0f) { // LUI
2288     if(rt1[i]) {
2289       signed char t;
2290       t=get_reg(i_regs->regmap,rt1[i]);
2291       //assert(t>=0);
2292       if(t>=0) {
2293         if(!((i_regs->isconst>>t)&1))
2294           emit_movimm(imm[i]<<16,t);
2295       }
2296     }
2297   }
2298   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2299     if(rt1[i]) {
2300       signed char s,t;
2301       t=get_reg(i_regs->regmap,rt1[i]);
2302       s=get_reg(i_regs->regmap,rs1[i]);
2303       if(rs1[i]) {
2304         //assert(t>=0);
2305         //assert(s>=0);
2306         if(t>=0) {
2307           if(!((i_regs->isconst>>t)&1)) {
2308             if(s<0) {
2309               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2310               emit_addimm(t,imm[i],t);
2311             }else{
2312               if(!((i_regs->wasconst>>s)&1))
2313                 emit_addimm(s,imm[i],t);
2314               else
2315                 emit_movimm(constmap[i][s]+imm[i],t);
2316             }
2317           }
2318         }
2319       } else {
2320         if(t>=0) {
2321           if(!((i_regs->isconst>>t)&1))
2322             emit_movimm(imm[i],t);
2323         }
2324       }
2325     }
2326   }
2327   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2328     if(rt1[i]) {
2329       signed char sh,sl,th,tl;
2330       th=get_reg(i_regs->regmap,rt1[i]|64);
2331       tl=get_reg(i_regs->regmap,rt1[i]);
2332       sh=get_reg(i_regs->regmap,rs1[i]|64);
2333       sl=get_reg(i_regs->regmap,rs1[i]);
2334       if(tl>=0) {
2335         if(rs1[i]) {
2336           assert(sh>=0);
2337           assert(sl>=0);
2338           if(th>=0) {
2339             emit_addimm64_32(sh,sl,imm[i],th,tl);
2340           }
2341           else {
2342             emit_addimm(sl,imm[i],tl);
2343           }
2344         } else {
2345           emit_movimm(imm[i],tl);
2346           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2347         }
2348       }
2349     }
2350   }
2351   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2352     if(rt1[i]) {
2353       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2354       signed char sh,sl,t;
2355       t=get_reg(i_regs->regmap,rt1[i]);
2356       sh=get_reg(i_regs->regmap,rs1[i]|64);
2357       sl=get_reg(i_regs->regmap,rs1[i]);
2358       //assert(t>=0);
2359       if(t>=0) {
2360         if(rs1[i]>0) {
2361           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2362           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2363             if(opcode[i]==0x0a) { // SLTI
2364               if(sl<0) {
2365                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2366                 emit_slti32(t,imm[i],t);
2367               }else{
2368                 emit_slti32(sl,imm[i],t);
2369               }
2370             }
2371             else { // SLTIU
2372               if(sl<0) {
2373                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2374                 emit_sltiu32(t,imm[i],t);
2375               }else{
2376                 emit_sltiu32(sl,imm[i],t);
2377               }
2378             }
2379           }else{ // 64-bit
2380             assert(sl>=0);
2381             if(opcode[i]==0x0a) // SLTI
2382               emit_slti64_32(sh,sl,imm[i],t);
2383             else // SLTIU
2384               emit_sltiu64_32(sh,sl,imm[i],t);
2385           }
2386         }else{
2387           // SLTI(U) with r0 is just stupid,
2388           // nonetheless examples can be found
2389           if(opcode[i]==0x0a) // SLTI
2390             if(0<imm[i]) emit_movimm(1,t);
2391             else emit_zeroreg(t);
2392           else // SLTIU
2393           {
2394             if(imm[i]) emit_movimm(1,t);
2395             else emit_zeroreg(t);
2396           }
2397         }
2398       }
2399     }
2400   }
2401   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2402     if(rt1[i]) {
2403       signed char sh,sl,th,tl;
2404       th=get_reg(i_regs->regmap,rt1[i]|64);
2405       tl=get_reg(i_regs->regmap,rt1[i]);
2406       sh=get_reg(i_regs->regmap,rs1[i]|64);
2407       sl=get_reg(i_regs->regmap,rs1[i]);
2408       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2409         if(opcode[i]==0x0c) //ANDI
2410         {
2411           if(rs1[i]) {
2412             if(sl<0) {
2413               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2414               emit_andimm(tl,imm[i],tl);
2415             }else{
2416               if(!((i_regs->wasconst>>sl)&1))
2417                 emit_andimm(sl,imm[i],tl);
2418               else
2419                 emit_movimm(constmap[i][sl]&imm[i],tl);
2420             }
2421           }
2422           else
2423             emit_zeroreg(tl);
2424           if(th>=0) emit_zeroreg(th);
2425         }
2426         else
2427         {
2428           if(rs1[i]) {
2429             if(sl<0) {
2430               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2431             }
2432             if(th>=0) {
2433               if(sh<0) {
2434                 emit_loadreg(rs1[i]|64,th);
2435               }else{
2436                 emit_mov(sh,th);
2437               }
2438             }
2439             if(opcode[i]==0x0d) { // ORI
2440               if(sl<0) {
2441                 emit_orimm(tl,imm[i],tl);
2442               }else{
2443                 if(!((i_regs->wasconst>>sl)&1))
2444                   emit_orimm(sl,imm[i],tl);
2445                 else
2446                   emit_movimm(constmap[i][sl]|imm[i],tl);
2447               }
2448             }
2449             if(opcode[i]==0x0e) { // XORI
2450               if(sl<0) {
2451                 emit_xorimm(tl,imm[i],tl);
2452               }else{
2453                 if(!((i_regs->wasconst>>sl)&1))
2454                   emit_xorimm(sl,imm[i],tl);
2455                 else
2456                   emit_movimm(constmap[i][sl]^imm[i],tl);
2457               }
2458             }
2459           }
2460           else {
2461             emit_movimm(imm[i],tl);
2462             if(th>=0) emit_zeroreg(th);
2463           }
2464         }
2465       }
2466     }
2467   }
2468 }
2469
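// Emit code for shift-by-immediate instructions.  SLL/SRL/SRA are plain 32-bit shifts;
// the doubleword forms (DSLL/DSRL/DSRA and DSLL32/DSRL32/DSRA32) operate on a
// high/low host register pair.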
2470 void shiftimm_assemble(int i,struct regstat *i_regs)
2471 {
2472   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2473   {
2474     if(rt1[i]) {
2475       signed char s,t;
2476       t=get_reg(i_regs->regmap,rt1[i]);
2477       s=get_reg(i_regs->regmap,rs1[i]);
2478       //assert(t>=0);
2479       if(t>=0&&!((i_regs->isconst>>t)&1)){
2480         if(rs1[i]==0)
2481         {
2482           emit_zeroreg(t);
2483         }
2484         else
2485         {
2486           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2487           if(imm[i]) {
2488             if(opcode2[i]==0) // SLL
2489             {
2490               emit_shlimm(s<0?t:s,imm[i],t);
2491             }
2492             if(opcode2[i]==2) // SRL
2493             {
2494               emit_shrimm(s<0?t:s,imm[i],t);
2495             }
2496             if(opcode2[i]==3) // SRA
2497             {
2498               emit_sarimm(s<0?t:s,imm[i],t);
2499             }
2500           }else{
2501             // Shift by zero
2502             if(s>=0 && s!=t) emit_mov(s,t);
2503           }
2504         }
2505       }
2506       //emit_storereg(rt1[i],t); //DEBUG
2507     }
2508   }
2509   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2510   {
2511     if(rt1[i]) {
2512       signed char sh,sl,th,tl;
2513       th=get_reg(i_regs->regmap,rt1[i]|64);
2514       tl=get_reg(i_regs->regmap,rt1[i]);
2515       sh=get_reg(i_regs->regmap,rs1[i]|64);
2516       sl=get_reg(i_regs->regmap,rs1[i]);
2517       if(tl>=0) {
2518         if(rs1[i]==0)
2519         {
2520           emit_zeroreg(tl);
2521           if(th>=0) emit_zeroreg(th);
2522         }
2523         else
2524         {
2525           assert(sl>=0);
2526           assert(sh>=0);
2527           if(imm[i]) {
2528             if(opcode2[i]==0x38) // DSLL
2529             {
2530               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2531               emit_shlimm(sl,imm[i],tl);
2532             }
2533             if(opcode2[i]==0x3a) // DSRL
2534             {
2535               emit_shrdimm(sl,sh,imm[i],tl);
2536               if(th>=0) emit_shrimm(sh,imm[i],th);
2537             }
2538             if(opcode2[i]==0x3b) // DSRA
2539             {
2540               emit_shrdimm(sl,sh,imm[i],tl);
2541               if(th>=0) emit_sarimm(sh,imm[i],th);
2542             }
2543           }else{
2544             // Shift by zero
2545             if(sl!=tl) emit_mov(sl,tl);
2546             if(th>=0&&sh!=th) emit_mov(sh,th);
2547           }
2548         }
2549       }
2550     }
2551   }
2552   if(opcode2[i]==0x3c) // DSLL32
2553   {
2554     if(rt1[i]) {
2555       signed char sl,tl,th;
2556       tl=get_reg(i_regs->regmap,rt1[i]);
2557       th=get_reg(i_regs->regmap,rt1[i]|64);
2558       sl=get_reg(i_regs->regmap,rs1[i]);
2559       if(th>=0||tl>=0){
2560         assert(tl>=0);
2561         assert(th>=0);
2562         assert(sl>=0);
2563         emit_mov(sl,th);
2564         emit_zeroreg(tl);
2565         if(imm[i]>32)
2566         {
2567           emit_shlimm(th,imm[i]&31,th);
2568         }
2569       }
2570     }
2571   }
2572   if(opcode2[i]==0x3e) // DSRL32
2573   {
2574     if(rt1[i]) {
2575       signed char sh,tl,th;
2576       tl=get_reg(i_regs->regmap,rt1[i]);
2577       th=get_reg(i_regs->regmap,rt1[i]|64);
2578       sh=get_reg(i_regs->regmap,rs1[i]|64);
2579       if(tl>=0){
2580         assert(sh>=0);
2581         emit_mov(sh,tl);
2582         if(th>=0) emit_zeroreg(th);
2583         if(imm[i]>32)
2584         {
2585           emit_shrimm(tl,imm[i]&31,tl);
2586         }
2587       }
2588     }
2589   }
2590   if(opcode2[i]==0x3f) // DSRA32
2591   {
2592     if(rt1[i]) {
2593       signed char sh,tl;
2594       tl=get_reg(i_regs->regmap,rt1[i]);
2595       sh=get_reg(i_regs->regmap,rs1[i]|64);
2596       if(tl>=0){
2597         assert(sh>=0);
2598         emit_mov(sh,tl);
2599         if(imm[i]>32)
2600         {
2601           emit_sarimm(tl,imm[i]&31,tl);
2602         }
2603       }
2604     }
2605   }
2606 }
2607
2608 #ifndef shift_assemble
2609 void shift_assemble(int i,struct regstat *i_regs)
2610 {
2611   printf("Need shift_assemble for this architecture.\n");
2612   exit(1);
2613 }
2614 #endif
2615
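// Emit code for aligned loads (LB/LH/LW/LBU/LHU/LWU/LD).  If the address is a known
// constant inside RAM the access is emitted inline; otherwise emit_fastpath_cmp_jump()
// adds a range check whose slow path branches to a LOAD*_STUB, and known non-RAM
// addresses go straight to inline_readstub().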
2616 void load_assemble(int i,struct regstat *i_regs)
2617 {
2618   int s,th,tl,addr,map=-1;
2619   int offset;
2620   int jaddr=0;
2621   int memtarget=0,c=0;
2622   int fastload_reg_override=0;
2623   u_int hr,reglist=0;
2624   th=get_reg(i_regs->regmap,rt1[i]|64);
2625   tl=get_reg(i_regs->regmap,rt1[i]);
2626   s=get_reg(i_regs->regmap,rs1[i]);
2627   offset=imm[i];
2628   for(hr=0;hr<HOST_REGS;hr++) {
2629     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2630   }
2631   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2632   if(s>=0) {
2633     c=(i_regs->wasconst>>s)&1;
2634     if (c) {
2635       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2636     }
2637   }
2638   //printf("load_assemble: c=%d\n",c);
2639   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2640   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2641   if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
2642     ||rt1[i]==0) {
2643       // could be a hardware FIFO/I/O register, so the read must still be performed
2644       // (also covers the dummy-read case where the target is r0)
2645       assem_debug("(forced read)\n");
2646       tl=get_reg(i_regs->regmap,-1);
2647       assert(tl>=0);
2648   }
2649   if(offset||s<0||c) addr=tl;
2650   else addr=s;
2651   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2652  if(tl>=0) {
2655   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2656   reglist&=~(1<<tl);
2657   if(th>=0) reglist&=~(1<<th);
2658   if(!c) {
2659     #ifdef RAM_OFFSET
2660     map=get_reg(i_regs->regmap,ROREG);
2661     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2662     #endif
2663     #ifdef R29_HACK
2664     // Strmnnrmn's speed hack
2665     if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2666     #endif
2667     {
2668       jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
2669     }
2670   }
2671   else if(ram_offset&&memtarget) {
2672     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2673     fastload_reg_override=HOST_TEMPREG;
2674   }
2675   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2676   if (opcode[i]==0x20) { // LB
2677     if(!c||memtarget) {
2678       if(!dummy) {
2679         #ifdef HOST_IMM_ADDR32
2680         if(c)
2681           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2682         else
2683         #endif
2684         {
2685           //emit_xorimm(addr,3,tl);
2686           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2687           int x=0,a=tl;
2688 #ifdef BIG_ENDIAN_MIPS
2689           if(!c) emit_xorimm(addr,3,tl);
2690           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2691 #else
2692           if(!c) a=addr;
2693 #endif
2694           if(fastload_reg_override) a=fastload_reg_override;
2695
2696           emit_movsbl_indexed_tlb(x,a,map,tl);
2697         }
2698       }
2699       if(jaddr)
2700         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2701     }
2702     else
2703       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2704   }
2705   if (opcode[i]==0x21) { // LH
2706     if(!c||memtarget) {
2707       if(!dummy) {
2708         #ifdef HOST_IMM_ADDR32
2709         if(c)
2710           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2711         else
2712         #endif
2713         {
2714           int x=0,a=tl;
2715 #ifdef BIG_ENDIAN_MIPS
2716           if(!c) emit_xorimm(addr,2,tl);
2717           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2718 #else
2719           if(!c) a=addr;
2720 #endif
2721           if(fastload_reg_override) a=fastload_reg_override;
2722           //#ifdef
2723           //emit_movswl_indexed_tlb(x,tl,map,tl);
2724           //else
2725           if(map>=0) {
2726             emit_movswl_indexed(x,a,tl);
2727           }else{
2728             #if 1 //def RAM_OFFSET
2729             emit_movswl_indexed(x,a,tl);
2730             #else
2731             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2732             #endif
2733           }
2734         }
2735       }
2736       if(jaddr)
2737         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2738     }
2739     else
2740       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2741   }
2742   if (opcode[i]==0x23) { // LW
2743     if(!c||memtarget) {
2744       if(!dummy) {
2745         int a=addr;
2746         if(fastload_reg_override) a=fastload_reg_override;
2747         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2748         #ifdef HOST_IMM_ADDR32
2749         if(c)
2750           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2751         else
2752         #endif
2753         emit_readword_indexed_tlb(0,a,map,tl);
2754       }
2755       if(jaddr)
2756         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2757     }
2758     else
2759       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2760   }
2761   if (opcode[i]==0x24) { // LBU
2762     if(!c||memtarget) {
2763       if(!dummy) {
2764         #ifdef HOST_IMM_ADDR32
2765         if(c)
2766           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2767         else
2768         #endif
2769         {
2770           //emit_xorimm(addr,3,tl);
2771           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2772           int x=0,a=tl;
2773 #ifdef BIG_ENDIAN_MIPS
2774           if(!c) emit_xorimm(addr,3,tl);
2775           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2776 #else
2777           if(!c) a=addr;
2778 #endif
2779           if(fastload_reg_override) a=fastload_reg_override;
2780
2781           emit_movzbl_indexed_tlb(x,a,map,tl);
2782         }
2783       }
2784       if(jaddr)
2785         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2786     }
2787     else
2788       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2789   }
2790   if (opcode[i]==0x25) { // LHU
2791     if(!c||memtarget) {
2792       if(!dummy) {
2793         #ifdef HOST_IMM_ADDR32
2794         if(c)
2795           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2796         else
2797         #endif
2798         {
2799           int x=0,a=tl;
2800 #ifdef BIG_ENDIAN_MIPS
2801           if(!c) emit_xorimm(addr,2,tl);
2802           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2803 #else
2804           if(!c) a=addr;
2805 #endif
2806           if(fastload_reg_override) a=fastload_reg_override;
2807           //#ifdef
2808           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2809           //#else
2810           if(map>=0) {
2811             emit_movzwl_indexed(x,a,tl);
2812           }else{
2813             #if 1 //def RAM_OFFSET
2814             emit_movzwl_indexed(x,a,tl);
2815             #else
2816             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
2817             #endif
2818           }
2819         }
2820       }
2821       if(jaddr)
2822         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2823     }
2824     else
2825       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2826   }
2827   if (opcode[i]==0x27) { // LWU
2828     assert(th>=0);
2829     if(!c||memtarget) {
2830       if(!dummy) {
2831         int a=addr;
2832         if(fastload_reg_override) a=fastload_reg_override;
2833         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2834         #ifdef HOST_IMM_ADDR32
2835         if(c)
2836           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2837         else
2838         #endif
2839         emit_readword_indexed_tlb(0,a,map,tl);
2840       }
2841       if(jaddr)
2842         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2843     }
2844     else {
2845       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2846     }
2847     emit_zeroreg(th);
2848   }
2849   if (opcode[i]==0x37) { // LD
2850     if(!c||memtarget) {
2851       if(!dummy) {
2852         int a=addr;
2853         if(fastload_reg_override) a=fastload_reg_override;
2854         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
2855         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
2856         #ifdef HOST_IMM_ADDR32
2857         if(c)
2858           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
2859         else
2860         #endif
2861         emit_readdword_indexed_tlb(0,a,map,th,tl);
2862       }
2863       if(jaddr)
2864         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2865     }
2866     else
2867       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2868   }
2869  }
2870   //emit_storereg(rt1[i],tl); // DEBUG
2871   //if(opcode[i]==0x23)
2872   //if(opcode[i]==0x24)
2873   //if(opcode[i]==0x23||opcode[i]==0x24)
2874   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
2875   {
2876     //emit_pusha();
2877     save_regs(0x100f);
2878         emit_readword((int)&last_count,ECX);
2879         #ifdef __i386__
2880         if(get_reg(i_regs->regmap,CCREG)<0)
2881           emit_loadreg(CCREG,HOST_CCREG);
2882         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2883         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2884         emit_writeword(HOST_CCREG,(int)&Count);
2885         #endif
2886         #ifdef __arm__
2887         if(get_reg(i_regs->regmap,CCREG)<0)
2888           emit_loadreg(CCREG,0);
2889         else
2890           emit_mov(HOST_CCREG,0);
2891         emit_add(0,ECX,0);
2892         emit_addimm(0,2*ccadj[i],0);
2893         emit_writeword(0,(int)&Count);
2894         #endif
2895     emit_call((int)memdebug);
2896     //emit_popa();
2897     restore_regs(0x100f);
2898   }*/
2899 }
2900
2901 #ifndef loadlr_assemble
2902 void loadlr_assemble(int i,struct regstat *i_regs)
2903 {
2904   printf("Need loadlr_assemble for this architecture.\n");
2905   exit(1);
2906 }
2907 #endif
2908
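// Emit code for aligned stores (SB/SH/SW/SD).  Uses the same fast-path/stub split as
// load_assemble(), and additionally checks invalid_code[] so that a store which hits an
// already-translated page invalidates the affected code (self-modifying code detection).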
2909 void store_assemble(int i,struct regstat *i_regs)
2910 {
2911   int s,th,tl,map=-1;
2912   int addr,temp;
2913   int offset;
2914   int jaddr=0,type;
2915   int memtarget=0,c=0;
2916   int agr=AGEN1+(i&1);
2917   int faststore_reg_override=0;
2918   u_int hr,reglist=0;
2919   th=get_reg(i_regs->regmap,rs2[i]|64);
2920   tl=get_reg(i_regs->regmap,rs2[i]);
2921   s=get_reg(i_regs->regmap,rs1[i]);
2922   temp=get_reg(i_regs->regmap,agr);
2923   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2924   offset=imm[i];
2925   if(s>=0) {
2926     c=(i_regs->wasconst>>s)&1;
2927     if(c) {
2928       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2929     }
2930   }
2931   assert(tl>=0);
2932   assert(temp>=0);
2933   for(hr=0;hr<HOST_REGS;hr++) {
2934     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2935   }
2936   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2937   if(offset||s<0||c) addr=temp;
2938   else addr=s;
2939   if(!c) {
2940     jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
2941   }
2942   else if(ram_offset&&memtarget) {
2943     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2944     faststore_reg_override=HOST_TEMPREG;
2945   }
2946
2947   if (opcode[i]==0x28) { // SB
2948     if(!c||memtarget) {
2949       int x=0,a=temp;
2950 #ifdef BIG_ENDIAN_MIPS
2951       if(!c) emit_xorimm(addr,3,temp);
2952       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2953 #else
2954       if(!c) a=addr;
2955 #endif
2956       if(faststore_reg_override) a=faststore_reg_override;
2957       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
2958       emit_writebyte_indexed_tlb(tl,x,a,map,a);
2959     }
2960     type=STOREB_STUB;
2961   }
2962   if (opcode[i]==0x29) { // SH
2963     if(!c||memtarget) {
2964       int x=0,a=temp;
2965 #ifdef BIG_ENDIAN_MIPS
2966       if(!c) emit_xorimm(addr,2,temp);
2967       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2968 #else
2969       if(!c) a=addr;
2970 #endif
2971       if(faststore_reg_override) a=faststore_reg_override;
2972       //#ifdef
2973       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
2974       //#else
2975       if(map>=0) {
2976         emit_writehword_indexed(tl,x,a);
2977       }else
2978         //emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
2979         emit_writehword_indexed(tl,x,a);
2980     }
2981     type=STOREH_STUB;
2982   }
2983   if (opcode[i]==0x2B) { // SW
2984     if(!c||memtarget) {
2985       int a=addr;
2986       if(faststore_reg_override) a=faststore_reg_override;
2987       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
2988       emit_writeword_indexed_tlb(tl,0,a,map,temp);
2989     }
2990     type=STOREW_STUB;
2991   }
2992   if (opcode[i]==0x3F) { // SD
2993     if(!c||memtarget) {
2994       int a=addr;
2995       if(faststore_reg_override) a=faststore_reg_override;
2996       if(rs2[i]) {
2997         assert(th>=0);
2998         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
2999         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3000         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
3001       }else{
3002         // Store zero
3003         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3004         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3005         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
3006       }
3007     }
3008     type=STORED_STUB;
3009   }
3010   if(jaddr) {
3011     // PCSX store handlers don't check invcode again
3012     reglist|=1<<addr;
3013     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3014     jaddr=0;
3015   }
3016   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3017     if(!c||memtarget) {
3018       #ifdef DESTRUCTIVE_SHIFT
3019       // The x86 shift operation is 'destructive'; it overwrites the
3020       // source register, so we need to make a copy first and use that.
3021       addr=temp;
3022       #endif
3023       #if defined(HOST_IMM8)
3024       int ir=get_reg(i_regs->regmap,INVCP);
3025       assert(ir>=0);
3026       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3027       #else
3028       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3029       #endif
3030       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3031       emit_callne(invalidate_addr_reg[addr]);
3032       #else
3033       int jaddr2=(int)out;
3034       emit_jne(0);
3035       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3036       #endif
3037     }
3038   }
3039   u_int addr_val=c?constmap[i][s]+offset:0; // only meaningful when the address is constant (s>=0)
3040   if(jaddr) {
3041     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3042   } else if(c&&!memtarget) {
3043     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
3044   }
3045   // Basic detection of constant-address stores that modify the current block;
3046   // we don't look back, as earlier code should already be in the MIPS i-cache.
3047   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
3048     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
3049     assert(i_regs->regmap==regs[i].regmap); // not delay slot
3050     if(i_regs->regmap==regs[i].regmap) {
3051       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
3052       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
3053       emit_movimm(start+i*4+4,0);
3054       emit_writeword(0,(int)&pcaddr);
3055       emit_jmp((int)do_interrupt);
3056     }
3057   }
3058   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3059   //if(opcode[i]==0x2B || opcode[i]==0x28)
3060   //if(opcode[i]==0x2B || opcode[i]==0x29)
3061   //if(opcode[i]==0x2B)
3062   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3063   {
3064     #ifdef __i386__
3065     emit_pusha();
3066     #endif
3067     #ifdef __arm__
3068     save_regs(0x100f);
3069     #endif
3070         emit_readword((int)&last_count,ECX);
3071         #ifdef __i386__
3072         if(get_reg(i_regs->regmap,CCREG)<0)
3073           emit_loadreg(CCREG,HOST_CCREG);
3074         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3075         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3076         emit_writeword(HOST_CCREG,(int)&Count);
3077         #endif
3078         #ifdef __arm__
3079         if(get_reg(i_regs->regmap,CCREG)<0)
3080           emit_loadreg(CCREG,0);
3081         else
3082           emit_mov(HOST_CCREG,0);
3083         emit_add(0,ECX,0);
3084         emit_addimm(0,2*ccadj[i],0);
3085         emit_writeword(0,(int)&Count);
3086         #endif
3087     emit_call((int)memdebug);
3088     #ifdef __i386__
3089     emit_popa();
3090     #endif
3091     #ifdef __arm__
3092     restore_regs(0x100f);
3093     #endif
3094   }*/
3095 }
3096
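// Emit code for the unaligned stores SWL/SWR/SDL/SDR.  The low address bits are tested at
// run time and execution branches to one of four cases (byte offset 0..3), each writing
// the appropriate part of the source register(s).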
3097 void storelr_assemble(int i,struct regstat *i_regs)
3098 {
3099   int s,th,tl;
3100   int temp;
3101   int temp2=-1;
3102   int offset;
3103   int jaddr=0;
3104   int case1,case2,case3;
3105   int done0,done1,done2;
3106   int memtarget=0,c=0;
3107   int agr=AGEN1+(i&1);
3108   u_int hr,reglist=0;
3109   th=get_reg(i_regs->regmap,rs2[i]|64);
3110   tl=get_reg(i_regs->regmap,rs2[i]);
3111   s=get_reg(i_regs->regmap,rs1[i]);
3112   temp=get_reg(i_regs->regmap,agr);
3113   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3114   offset=imm[i];
3115   if(s>=0) {
3116     c=(i_regs->isconst>>s)&1;
3117     if(c) {
3118       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3119     }
3120   }
3121   assert(tl>=0);
3122   for(hr=0;hr<HOST_REGS;hr++) {
3123     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3124   }
3125   assert(temp>=0);
3126   if(!c) {
3127     emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3128     if(!offset&&s!=temp) emit_mov(s,temp);
3129     jaddr=(int)out;
3130     emit_jno(0);
3131   }
3132   else
3133   {
3134     if(!memtarget||!rs1[i]) {
3135       jaddr=(int)out;
3136       emit_jmp(0);
3137     }
3138   }
3139   #ifdef RAM_OFFSET
3140   int map=get_reg(i_regs->regmap,ROREG);
3141   if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3142   #else
3143   if((u_int)rdram!=0x80000000)
3144     emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3145   #endif
3146
3147   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3148     temp2=get_reg(i_regs->regmap,FTEMP);
3149     if(!rs2[i]) temp2=th=tl;
3150   }
3151
3152 #ifndef BIG_ENDIAN_MIPS
3153     emit_xorimm(temp,3,temp);
3154 #endif
3155   emit_testimm(temp,2);
3156   case2=(int)out;
3157   emit_jne(0);
3158   emit_testimm(temp,1);
3159   case1=(int)out;
3160   emit_jne(0);
3161   // 0
3162   if (opcode[i]==0x2A) { // SWL
3163     emit_writeword_indexed(tl,0,temp);
3164   }
3165   if (opcode[i]==0x2E) { // SWR
3166     emit_writebyte_indexed(tl,3,temp);
3167   }
3168   if (opcode[i]==0x2C) { // SDL
3169     emit_writeword_indexed(th,0,temp);
3170     if(rs2[i]) emit_mov(tl,temp2);
3171   }
3172   if (opcode[i]==0x2D) { // SDR
3173     emit_writebyte_indexed(tl,3,temp);
3174     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3175   }
3176   done0=(int)out;
3177   emit_jmp(0);
3178   // 1
3179   set_jump_target(case1,(int)out);
3180   if (opcode[i]==0x2A) { // SWL
3181     // Write 3 msb into three least significant bytes
3182     if(rs2[i]) emit_rorimm(tl,8,tl);
3183     emit_writehword_indexed(tl,-1,temp);
3184     if(rs2[i]) emit_rorimm(tl,16,tl);
3185     emit_writebyte_indexed(tl,1,temp);
3186     if(rs2[i]) emit_rorimm(tl,8,tl);
3187   }
3188   if (opcode[i]==0x2E) { // SWR
3189     // Write two lsb into two most significant bytes
3190     emit_writehword_indexed(tl,1,temp);
3191   }
3192   if (opcode[i]==0x2C) { // SDL
3193     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3194     // Write 3 msb into three least significant bytes
3195     if(rs2[i]) emit_rorimm(th,8,th);
3196     emit_writehword_indexed(th,-1,temp);
3197     if(rs2[i]) emit_rorimm(th,16,th);
3198     emit_writebyte_indexed(th,1,temp);
3199     if(rs2[i]) emit_rorimm(th,8,th);
3200   }
3201   if (opcode[i]==0x2D) { // SDR
3202     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3203     // Write two lsb into two most significant bytes
3204     emit_writehword_indexed(tl,1,temp);
3205   }
3206   done1=(int)out;
3207   emit_jmp(0);
3208   // 2
3209   set_jump_target(case2,(int)out);
3210   emit_testimm(temp,1);
3211   case3=(int)out;
3212   emit_jne(0);
3213   if (opcode[i]==0x2A) { // SWL
3214     // Write two msb into two least significant bytes
3215     if(rs2[i]) emit_rorimm(tl,16,tl);
3216     emit_writehword_indexed(tl,-2,temp);
3217     if(rs2[i]) emit_rorimm(tl,16,tl);
3218   }
3219   if (opcode[i]==0x2E) { // SWR
3220     // Write 3 lsb into three most significant bytes
3221     emit_writebyte_indexed(tl,-1,temp);
3222     if(rs2[i]) emit_rorimm(tl,8,tl);
3223     emit_writehword_indexed(tl,0,temp);
3224     if(rs2[i]) emit_rorimm(tl,24,tl);
3225   }
3226   if (opcode[i]==0x2C) { // SDL
3227     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3228     // Write two msb into two least significant bytes
3229     if(rs2[i]) emit_rorimm(th,16,th);
3230     emit_writehword_indexed(th,-2,temp);
3231     if(rs2[i]) emit_rorimm(th,16,th);
3232   }
3233   if (opcode[i]==0x2D) { // SDR
3234     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3235     // Write 3 lsb into three most significant bytes
3236     emit_writebyte_indexed(tl,-1,temp);
3237     if(rs2[i]) emit_rorimm(tl,8,tl);
3238     emit_writehword_indexed(tl,0,temp);
3239     if(rs2[i]) emit_rorimm(tl,24,tl);
3240   }
3241   done2=(int)out;
3242   emit_jmp(0);
3243   // 3
3244   set_jump_target(case3,(int)out);
3245   if (opcode[i]==0x2A) { // SWL
3246     // Write msb into least significant byte
3247     if(rs2[i]) emit_rorimm(tl,24,tl);
3248     emit_writebyte_indexed(tl,-3,temp);
3249     if(rs2[i]) emit_rorimm(tl,8,tl);
3250   }
3251   if (opcode[i]==0x2E) { // SWR
3252     // Write entire word
3253     emit_writeword_indexed(tl,-3,temp);
3254   }
3255   if (opcode[i]==0x2C) { // SDL
3256     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3257     // Write msb into least significant byte
3258     if(rs2[i]) emit_rorimm(th,24,th);
3259     emit_writebyte_indexed(th,-3,temp);
3260     if(rs2[i]) emit_rorimm(th,8,th);
3261   }
3262   if (opcode[i]==0x2D) { // SDR
3263     if(rs2[i]) emit_mov(th,temp2);
3264     // Write entire word
3265     emit_writeword_indexed(tl,-3,temp);
3266   }
3267   set_jump_target(done0,(int)out);
3268   set_jump_target(done1,(int)out);
3269   set_jump_target(done2,(int)out);
3270   if (opcode[i]==0x2C) { // SDL
3271     emit_testimm(temp,4);
3272     done0=(int)out;
3273     emit_jne(0);
3274     emit_andimm(temp,~3,temp);
3275     emit_writeword_indexed(temp2,4,temp);
3276     set_jump_target(done0,(int)out);
3277   }
3278   if (opcode[i]==0x2D) { // SDR
3279     emit_testimm(temp,4);
3280     done0=(int)out;
3281     emit_jeq(0);
3282     emit_andimm(temp,~3,temp);
3283     emit_writeword_indexed(temp2,-4,temp);
3284     set_jump_target(done0,(int)out);
3285   }
3286   if(!c||!memtarget)
3287     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3288   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3289     #ifdef RAM_OFFSET
3290     int map=get_reg(i_regs->regmap,ROREG);
3291     if(map<0) map=HOST_TEMPREG;
3292     gen_orig_addr_w(temp,map);
3293     #else
3294     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3295     #endif
3296     #if defined(HOST_IMM8)
3297     int ir=get_reg(i_regs->regmap,INVCP);
3298     assert(ir>=0);
3299     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3300     #else
3301     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3302     #endif
3303     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3304     emit_callne(invalidate_addr_reg[temp]);
3305     #else
3306     int jaddr2=(int)out;
3307     emit_jne(0);
3308     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3309     #endif
3310   }
3311   /*
3312     emit_pusha();
3313     //save_regs(0x100f);
3314         emit_readword((int)&last_count,ECX);
3315         if(get_reg(i_regs->regmap,CCREG)<0)
3316           emit_loadreg(CCREG,HOST_CCREG);
3317         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3318         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3319         emit_writeword(HOST_CCREG,(int)&Count);
3320     emit_call((int)memdebug);
3321     emit_popa();
3322     //restore_regs(0x100f);
3323   */
3324 }
3325
3326 void c1ls_assemble(int i,struct regstat *i_regs)
3327 {
3328   cop1_unusable(i, i_regs);
3329 }
3330
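// LWC2/SWC2: transfer a word between memory and a GTE data register.  The value passes
// through the FTEMP host register and cop2_get_dreg()/cop2_put_dreg().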
3331 void c2ls_assemble(int i,struct regstat *i_regs)
3332 {
3333   int s,tl;
3334   int ar;
3335   int offset;
3336   int memtarget=0,c=0;
3337   int jaddr2=0,type;
3338   int agr=AGEN1+(i&1);
3339   int fastio_reg_override=0;
3340   u_int hr,reglist=0;
3341   u_int copr=(source[i]>>16)&0x1f;
3342   s=get_reg(i_regs->regmap,rs1[i]);
3343   tl=get_reg(i_regs->regmap,FTEMP);
3344   offset=imm[i];
3345   assert(rs1[i]>0);
3346   assert(tl>=0);
3347
3348   for(hr=0;hr<HOST_REGS;hr++) {
3349     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3350   }
3351   if(i_regs->regmap[HOST_CCREG]==CCREG)
3352     reglist&=~(1<<HOST_CCREG);
3353
3354   // get the address
3355   if (opcode[i]==0x3a) { // SWC2
3356     ar=get_reg(i_regs->regmap,agr);
3357     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3358     reglist|=1<<ar;
3359   } else { // LWC2
3360     ar=tl;
3361   }
3362   if(s>=0) c=(i_regs->wasconst>>s)&1;
3363   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3364   if (!offset&&!c&&s>=0) ar=s;
3365   assert(ar>=0);
3366
3367   if (opcode[i]==0x3a) { // SWC2
3368     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3369     type=STOREW_STUB;
3370   }
3371   else
3372     type=LOADW_STUB;
3373
3374   if(c&&!memtarget) {
3375     jaddr2=(int)out;
3376     emit_jmp(0); // inline_readstub/inline_writestub?
3377   }
3378   else {
3379     if(!c) {
3380       jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
3381     }
3382     else if(ram_offset&&memtarget) {
3383       emit_addimm(ar,ram_offset,HOST_TEMPREG);
3384       fastio_reg_override=HOST_TEMPREG;
3385     }
3386     if (opcode[i]==0x32) { // LWC2
3387       int a=ar; // declared before the #ifdef so the 'else' below binds to a statement, not a declaration
3388       if(fastio_reg_override) a=fastio_reg_override;
3389       #ifdef HOST_IMM_ADDR32
3390       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3391       else
3392       #endif
3393       emit_readword_indexed(0,a,tl);
3394     }
3395     if (opcode[i]==0x3a) { // SWC2
3396       #ifdef DESTRUCTIVE_SHIFT
3397       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3398       #endif
3399       int a=ar;
3400       if(fastio_reg_override) a=fastio_reg_override;
3401       emit_writeword_indexed(tl,0,a);
3402     }
3403   }
3404   if(jaddr2)
3405     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3406   if(opcode[i]==0x3a) // SWC2
3407   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3408 #if defined(HOST_IMM8)
3409     int ir=get_reg(i_regs->regmap,INVCP);
3410     assert(ir>=0);
3411     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3412 #else
3413     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3414 #endif
3415     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3416     emit_callne(invalidate_addr_reg[ar]);
3417     #else
3418     int jaddr3=(int)out;
3419     emit_jne(0);
3420     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3421     #endif
3422   }
3423   if (opcode[i]==0x32) { // LWC2
3424     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3425   }
3426 }
3427
3428 #ifndef multdiv_assemble
3429 void multdiv_assemble(int i,struct regstat *i_regs)
3430 {
3431   printf("Need multdiv_assemble for this architecture.\n");
3432   exit(1);
3433 }
3434 #endif
3435
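// MFHI/MFLO/MTHI/MTLO: a simple register-to-register move, copying the high word as well
// when a 64-bit value is live.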
3436 void mov_assemble(int i,struct regstat *i_regs)
3437 {
3438   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3439   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3440   if(rt1[i]) {
3441     signed char sh,sl,th,tl;
3442     th=get_reg(i_regs->regmap,rt1[i]|64);
3443     tl=get_reg(i_regs->regmap,rt1[i]);
3444     //assert(tl>=0);
3445     if(tl>=0) {
3446       sh=get_reg(i_regs->regmap,rs1[i]|64);
3447       sl=get_reg(i_regs->regmap,rs1[i]);
3448       if(sl>=0) emit_mov(sl,tl);
3449       else emit_loadreg(rs1[i],tl);
3450       if(th>=0) {
3451         if(sh>=0) emit_mov(sh,th);
3452         else emit_loadreg(rs1[i]|64,th);
3453       }
3454     }
3455   }
3456 }
3457
3458 #ifndef fconv_assemble
3459 void fconv_assemble(int i,struct regstat *i_regs)
3460 {
3461   printf("Need fconv_assemble for this architecture.\n");
3462   exit(1);
3463 }
3464 #endif
3465
3466 #if 0
3467 void float_assemble(int i,struct regstat *i_regs)
3468 {
3469   printf("Need float_assemble for this architecture.\n");
3470   exit(1);
3471 }
3472 #endif
3473
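// SYSCALL: pass the PC of this instruction in EAX, apply the accumulated
// cycle count, and tail-jump to the HLE syscall handler.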
3474 void syscall_assemble(int i,struct regstat *i_regs)
3475 {
3476   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3477   assert(ccreg==HOST_CCREG);
3478   assert(!is_delayslot);
3479   (void)ccreg;
3480   emit_movimm(start+i*4,EAX); // Get PC
3481   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3482   emit_jmp((int)jump_syscall_hle); // XXX
3483 }
3484
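// HLE BIOS call: the handler index comes from the low 26 bits of the opcode.
// The PC of the next instruction goes into host reg 0 and the handler address
// (psxNULL if the index is out of range) into host reg 1 before jumping to
// jump_hlecall.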
3485 void hlecall_assemble(int i,struct regstat *i_regs)
3486 {
3487   extern void psxNULL();
3488   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3489   assert(ccreg==HOST_CCREG);
3490   assert(!is_delayslot);
3491   (void)ccreg;
3492   emit_movimm(start+i*4+4,0); // Get PC
3493   uint32_t hleCode = source[i] & 0x03ffffff;
3494   if (hleCode >= (sizeof(psxHLEt) / sizeof(psxHLEt[0])))
3495     emit_movimm((int)psxNULL,1);
3496   else
3497     emit_movimm((int)psxHLEt[hleCode],1);
3498   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
3499   emit_jmp((int)jump_hlecall);
3500 }
3501
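// Fall back to the interpreter for this instruction: pass its PC, apply the
// cycle count, and jump to jump_intcall.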
3502 void intcall_assemble(int i,struct regstat *i_regs)
3503 {
3504   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3505   assert(ccreg==HOST_CCREG);
3506   assert(!is_delayslot);
3507   (void)ccreg;
3508   emit_movimm(start+i*4,0); // Get PC
3509   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
3510   emit_jmp((int)jump_intcall);
3511 }
3512
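// Assemble the instruction in a branch delay slot.  is_delayslot is set
// around the dispatch so the individual assemblers know the context; a jump
// must never appear here.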
3513 void ds_assemble(int i,struct regstat *i_regs)
3514 {
3515   speculate_register_values(i);
3516   is_delayslot=1;
3517   switch(itype[i]) {
3518     case ALU:
3519       alu_assemble(i,i_regs);break;
3520     case IMM16:
3521       imm16_assemble(i,i_regs);break;
3522     case SHIFT:
3523       shift_assemble(i,i_regs);break;
3524     case SHIFTIMM:
3525       shiftimm_assemble(i,i_regs);break;
3526     case LOAD:
3527       load_assemble(i,i_regs);break;
3528     case LOADLR:
3529       loadlr_assemble(i,i_regs);break;
3530     case STORE:
3531       store_assemble(i,i_regs);break;
3532     case STORELR:
3533       storelr_assemble(i,i_regs);break;
3534     case COP0:
3535       cop0_assemble(i,i_regs);break;
3536     case COP1:
3537       cop1_assemble(i,i_regs);break;
3538     case C1LS:
3539       c1ls_assemble(i,i_regs);break;
3540     case COP2:
3541       cop2_assemble(i,i_regs);break;
3542     case C2LS:
3543       c2ls_assemble(i,i_regs);break;
3544     case C2OP:
3545       c2op_assemble(i,i_regs);break;
3546     case FCONV:
3547       fconv_assemble(i,i_regs);break;
3548     case FLOAT:
3549       float_assemble(i,i_regs);break;
3550     case FCOMP:
3551       fcomp_assemble(i,i_regs);break;
3552     case MULTDIV:
3553       multdiv_assemble(i,i_regs);break;
3554     case MOV:
3555       mov_assemble(i,i_regs);break;
3556     case SYSCALL:
3557     case HLECALL:
3558     case INTCALL:
3559     case SPAN:
3560     case UJUMP:
3561     case RJUMP:
3562     case CJUMP:
3563     case SJUMP:
3564     case FJUMP:
3565       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
3566   }
3567   is_delayslot=0;
3568 }
3569
3570 // Is the branch target a valid internal jump?
3571 int internal_branch(uint64_t i_is32,int addr)
3572 {
3573   if(addr&1) return 0; // Indirect (register) jump
3574   if(addr>=start && addr<start+slen*4-4)
3575   {
3576     //int t=(addr-start)>>2;
3577     // Delay slots are not valid branch targets
3578     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3579     // 64 -> 32 bit transition requires a recompile
3580     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3581     {
3582       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3583       else printf("optimizable: yes\n");
3584     }*/
3585     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3586     return 1;
3587   }
3588   return 0;
3589 }
3590
3591 #ifndef wb_invalidate
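// Write back registers that are being dropped from the host register map
// between instructions (unless their values are unneeded), then move any
// value whose MIPS register is merely being reassigned to a different host
// register.  Architectures may provide their own version.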
3592 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3593   uint64_t u,uint64_t uu)
3594 {
3595   int hr;
3596   for(hr=0;hr<HOST_REGS;hr++) {
3597     if(hr!=EXCLUDE_REG) {
3598       if(pre[hr]!=entry[hr]) {
3599         if(pre[hr]>=0) {
3600           if((dirty>>hr)&1) {
3601             if(get_reg(entry,pre[hr])<0) {
3602               if(pre[hr]<64) {
3603                 if(!((u>>pre[hr])&1)) {
3604                   emit_storereg(pre[hr],hr);
3605                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3606                     emit_sarimm(hr,31,hr);
3607                     emit_storereg(pre[hr]|64,hr);
3608                   }
3609                 }
3610               }else{
3611                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3612                   emit_storereg(pre[hr],hr);
3613                 }
3614               }
3615             }
3616           }
3617         }
3618       }
3619     }
3620   }
3621   // Move from one register to another (no writeback)
3622   for(hr=0;hr<HOST_REGS;hr++) {
3623     if(hr!=EXCLUDE_REG) {
3624       if(pre[hr]!=entry[hr]) {
3625         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3626           int nr;
3627           if((nr=get_reg(entry,pre[hr]))>=0) {
3628             emit_mov(hr,nr);
3629           }
3630         }
3631       }
3632     }
3633   }
3634 }
3635 #endif
3636
3637 // Load the specified registers
3638 // This only loads the registers given as arguments because
3639 // we don't want to load things that will be overwritten
3640 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3641 {
3642   int hr;
3643   // Load 32-bit regs
3644   for(hr=0;hr<HOST_REGS;hr++) {
3645     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3646       if(entry[hr]!=regmap[hr]) {
3647         if(regmap[hr]==rs1||regmap[hr]==rs2)
3648         {
3649           if(regmap[hr]==0) {
3650             emit_zeroreg(hr);
3651           }
3652           else
3653           {
3654             emit_loadreg(regmap[hr],hr);
3655           }
3656         }
3657       }
3658     }
3659   }
3660   // Load 64-bit regs
3661   for(hr=0;hr<HOST_REGS;hr++) {
3662     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3663       if(entry[hr]!=regmap[hr]) {
3664         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3665         {
3666           assert(regmap[hr]!=64);
3667           if((is32>>(regmap[hr]&63))&1) {
3668             int lr=get_reg(regmap,regmap[hr]-64);
3669             if(lr>=0)
3670               emit_sarimm(lr,31,hr);
3671             else
3672               emit_loadreg(regmap[hr],hr);
3673           }
3674           else
3675           {
3676             emit_loadreg(regmap[hr],hr);
3677           }
3678         }
3679       }
3680     }
3681   }
3682 }
3683
3684 // Load registers prior to the start of a loop
3685 // so that they are not loaded within the loop
3686 static void loop_preload(signed char pre[],signed char entry[])
3687 {
3688   int hr;
3689   for(hr=0;hr<HOST_REGS;hr++) {
3690     if(hr!=EXCLUDE_REG) {
3691       if(pre[hr]!=entry[hr]) {
3692         if(entry[hr]>=0) {
3693           if(get_reg(pre,entry[hr])<0) {
3694             assem_debug("loop preload:\n");
3695             //printf("loop preload: %d\n",hr);
3696             if(entry[hr]==0) {
3697               emit_zeroreg(hr);
3698             }
3699             else if(entry[hr]<TEMPREG)
3700             {
3701               emit_loadreg(entry[hr],hr);
3702             }
3703             else if(entry[hr]-64<TEMPREG)
3704             {
3705               emit_loadreg(entry[hr],hr);
3706             }
3707           }
3708         }
3709       }
3710     }
3711   }
3712 }
3713
3714 // Generate the address for a load/store instruction
3715 // The address goes to AGEN for stores, FTEMP for LOADLR and cop1/cop2 loads
3716 void address_generation(int i,struct regstat *i_regs,signed char entry[])
3717 {
3718   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
3719     int ra=-1;
3720     int agr=AGEN1+(i&1);
3721     if(itype[i]==LOAD) {
3722       ra=get_reg(i_regs->regmap,rt1[i]);
3723       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3724       assert(ra>=0);
3725     }
3726     if(itype[i]==LOADLR) {
3727       ra=get_reg(i_regs->regmap,FTEMP);
3728     }
3729     if(itype[i]==STORE||itype[i]==STORELR) {
3730       ra=get_reg(i_regs->regmap,agr);
3731       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3732     }
3733     if(itype[i]==C1LS||itype[i]==C2LS) {
3734       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
3735         ra=get_reg(i_regs->regmap,FTEMP);
3736       else { // SWC1/SDC1/SWC2/SDC2
3737         ra=get_reg(i_regs->regmap,agr);
3738         if(ra<0) ra=get_reg(i_regs->regmap,-1);
3739       }
3740     }
3741     int rs=get_reg(i_regs->regmap,rs1[i]);
3742     if(ra>=0) {
3743       int offset=imm[i];
3744       int c=(i_regs->wasconst>>rs)&1;
3745       if(rs1[i]==0) {
3746         // Using r0 as a base address
3747         if(!entry||entry[ra]!=agr) {
3748           if (opcode[i]==0x22||opcode[i]==0x26) {
3749             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3750           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3751             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3752           }else{
3753             emit_movimm(offset,ra);
3754           }
3755         } // else did it in the previous cycle
3756       }
3757       else if(rs<0) {
3758         if(!entry||entry[ra]!=rs1[i])
3759           emit_loadreg(rs1[i],ra);
3760         //if(!entry||entry[ra]!=rs1[i])
3761         //  printf("poor load scheduling!\n");
3762       }
3763       else if(c) {
3764         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
3765           if(!entry||entry[ra]!=agr) {
3766             if (opcode[i]==0x22||opcode[i]==0x26) {
3767               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3768             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3769               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3770             }else{
3771               #ifdef HOST_IMM_ADDR32
3772               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3773               #endif
3774               emit_movimm(constmap[i][rs]+offset,ra);
3775               regs[i].loadedconst|=1<<ra;
3776             }
3777           } // else did it in the previous cycle
3778         } // else load_consts already did it
3779       }
3780       if(offset&&!c&&rs1[i]) {
3781         if(rs>=0) {
3782           emit_addimm(rs,offset,ra);
3783         }else{
3784           emit_addimm(ra,offset,ra);
3785         }
3786       }
3787     }
3788   }
3789   // Preload constants for next instruction
3790   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
3791     int agr,ra;
3792     // Actual address
3793     agr=AGEN1+((i+1)&1);
3794     ra=get_reg(i_regs->regmap,agr);
3795     if(ra>=0) {
3796       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3797       int offset=imm[i+1];
3798       int c=(regs[i+1].wasconst>>rs)&1;
3799       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
3800         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3801           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3802         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3803           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3804         }else{
3805           #ifdef HOST_IMM_ADDR32
3806           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3807           #endif
3808           emit_movimm(constmap[i+1][rs]+offset,ra);
3809           regs[i+1].loadedconst|=1<<ra;
3810         }
3811       }
3812       else if(rs1[i+1]==0) {
3813         // Using r0 as a base address
3814         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3815           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3816         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3817           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3818         }else{
3819           emit_movimm(offset,ra);
3820         }
3821       }
3822     }
3823   }
3824 }
3825
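// Follow a constant forward: while host register hr keeps the same constant
// mapping (and no branch target intervenes), find the value it will finally
// need so that load_consts can emit a single immediate load.  When the
// constant only serves as the base of a following load that overwrites it
// (e.g. lui $t0,0x8001 / lw $t0,0x123($t0)), the precomputed load address
// is returned instead.  Returns 0 if the value turns out to be unneeded.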
3826 static int get_final_value(int hr, int i, int *value)
3827 {
3828   int reg=regs[i].regmap[hr];
3829   while(i<slen-1) {
3830     if(regs[i+1].regmap[hr]!=reg) break;
3831     if(!((regs[i+1].isconst>>hr)&1)) break;
3832     if(bt[i+1]) break;
3833     i++;
3834   }
3835   if(i<slen-1) {
3836     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
3837       *value=constmap[i][hr];
3838       return 1;
3839     }
3840     if(!bt[i+1]) {
3841       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
3842         // Load in delay slot, out-of-order execution
3843         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
3844         {
3845           // Precompute load address
3846           *value=constmap[i][hr]+imm[i+2];
3847           return 1;
3848         }
3849       }
3850       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
3851       {
3852         // Precompute load address
3853         *value=constmap[i][hr]+imm[i+1];
3854         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
3855         return 1;
3856       }
3857     }
3858   }
3859   *value=constmap[i][hr];
3860   //printf("c=%x\n",(int)constmap[i][hr]);
3861   if(i==slen-1) return 1;
3862   if(reg<64) {
3863     return !((unneeded_reg[i+1]>>reg)&1);
3864   }else{
3865     return !((unneeded_reg_upper[i+1]>>reg)&1);
3866   }
3867 }
3868
3869 // Load registers with known constants
3870 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
3871 {
3872   int hr,hr2;
3873   // Propagate loaded-constant flags from the previous instruction
3874   if(i==0||bt[i])
3875     regs[i].loadedconst=0;
3876   else {
3877     for(hr=0;hr<HOST_REGS;hr++) {
3878       if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
3879          &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
3880       {
3881         regs[i].loadedconst|=1<<hr;
3882       }
3883     }
3884   }
3885   // Load 32-bit regs
3886   for(hr=0;hr<HOST_REGS;hr++) {
3887     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3888       //if(entry[hr]!=regmap[hr]) {
3889       if(!((regs[i].loadedconst>>hr)&1)) {
3890         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3891           int value,similar=0;
3892           if(get_final_value(hr,i,&value)) {
3893             // see if some other register already holds a similar value
3894             for(hr2=0;hr2<HOST_REGS;hr2++) {
3895               if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
3896                 if(is_similar_value(value,constmap[i][hr2])) {
3897                   similar=1;
3898                   break;
3899                 }
3900               }
3901             }
3902             if(similar) {
3903               int value2;
3904               if(get_final_value(hr2,i,&value2)) // is this needed?
3905                 emit_movimm_from(value2,hr2,value,hr);
3906               else
3907                 emit_movimm(value,hr);
3908             }
3909             else if(value==0) {
3910               emit_zeroreg(hr);
3911             }
3912             else {
3913               emit_movimm(value,hr);
3914             }
3915           }
3916           regs[i].loadedconst|=1<<hr;
3917         }
3918       }
3919     }
3920   }
3921   // Load 64-bit regs
3922   for(hr=0;hr<HOST_REGS;hr++) {
3923     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3924       //if(entry[hr]!=regmap[hr]) {
3925       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3926         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3927           if((is32>>(regmap[hr]&63))&1) {
3928             int lr=get_reg(regmap,regmap[hr]-64);
3929             assert(lr>=0);
3930             emit_sarimm(lr,31,hr);
3931           }
3932           else
3933           {
3934             int value;
3935             if(get_final_value(hr,i,&value)) {
3936               if(value==0) {
3937                 emit_zeroreg(hr);
3938               }
3939               else {
3940                 emit_movimm(value,hr);
3941               }
3942             }
3943           }
3944         }
3945       }
3946     }
3947   }
3948 }
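
// Like load_consts, but materialize the constant for every dirty,
// constant-mapped host register with no look-ahead; upper halves of 32-bit
// values are produced by sign extension.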
3949 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
3950 {
3951   int hr;
3952   // Load 32-bit regs
3953   for(hr=0;hr<HOST_REGS;hr++) {
3954     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3955       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3956         int value=constmap[i][hr];
3957         if(value==0) {
3958           emit_zeroreg(hr);
3959         }
3960         else {
3961           emit_movimm(value,hr);
3962         }
3963       }
3964     }
3965   }
3966   // Load 64-bit regs
3967   for(hr=0;hr<HOST_REGS;hr++) {
3968     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3969       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3970         if((is32>>(regmap[hr]&63))&1) {
3971           int lr=get_reg(regmap,regmap[hr]-64);
3972           assert(lr>=0);
3973           emit_sarimm(lr,31,hr);
3974         }
3975         else
3976         {
3977           int value=constmap[i][hr];
3978           if(value==0) {
3979             emit_zeroreg(hr);
3980           }
3981           else {
3982             emit_movimm(value,hr);
3983           }
3984         }
3985       }
3986     }
3987   }
3988 }
3989
3990 // Write out all dirty registers (except cycle count)
3991 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
3992 {
3993   int hr;
3994   for(hr=0;hr<HOST_REGS;hr++) {
3995     if(hr!=EXCLUDE_REG) {
3996       if(i_regmap[hr]>0) {
3997         if(i_regmap[hr]!=CCREG) {
3998           if((i_dirty>>hr)&1) {
3999             if(i_regmap[hr]<64) {
4000               emit_storereg(i_regmap[hr],hr);
4001             }else{
4002               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4003                 emit_storereg(i_regmap[hr],hr);
4004               }
4005             }
4006           }
4007         }
4008       }
4009     }
4010   }
4011 }
4012 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4013 // This writes the registers not written by store_regs_bt
4014 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4015 {
4016   int hr;
4017   int t=(addr-start)>>2;
4018   for(hr=0;hr<HOST_REGS;hr++) {
4019     if(hr!=EXCLUDE_REG) {
4020       if(i_regmap[hr]>0) {
4021         if(i_regmap[hr]!=CCREG) {
4022           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4023             if((i_dirty>>hr)&1) {
4024               if(i_regmap[hr]<64) {
4025                 emit_storereg(i_regmap[hr],hr);
4026               }else{
4027                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4028                   emit_storereg(i_regmap[hr],hr);
4029                 }
4030               }
4031             }
4032           }
4033         }
4034       }
4035     }
4036   }
4037 }
4038
4039 // Load all registers (except cycle count)
4040 void load_all_regs(signed char i_regmap[])
4041 {
4042   int hr;
4043   for(hr=0;hr<HOST_REGS;hr++) {
4044     if(hr!=EXCLUDE_REG) {
4045       if(i_regmap[hr]==0) {
4046         emit_zeroreg(hr);
4047       }
4048       else
4049       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4050       {
4051         emit_loadreg(i_regmap[hr],hr);
4052       }
4053     }
4054   }
4055 }
4056
4057 // Load all current registers also needed by next instruction
4058 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4059 {
4060   int hr;
4061   for(hr=0;hr<HOST_REGS;hr++) {
4062     if(hr!=EXCLUDE_REG) {
4063       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4064         if(i_regmap[hr]==0) {
4065           emit_zeroreg(hr);
4066         }
4067         else
4068         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4069         {
4070           emit_loadreg(i_regmap[hr],hr);
4071         }
4072       }
4073     }
4074   }
4075 }
4076
4077 // Load all regs, storing cycle count if necessary
4078 void load_regs_entry(int t)
4079 {
4080   int hr;
4081   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
4082   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
4083   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4084     emit_storereg(CCREG,HOST_CCREG);
4085   }
4086   // Load 32-bit regs
4087   for(hr=0;hr<HOST_REGS;hr++) {
4088     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4089       if(regs[t].regmap_entry[hr]==0) {
4090         emit_zeroreg(hr);
4091       }
4092       else if(regs[t].regmap_entry[hr]!=CCREG)
4093       {
4094         emit_loadreg(regs[t].regmap_entry[hr],hr);
4095       }
4096     }
4097   }
4098   // Load 64-bit regs
4099   for(hr=0;hr<HOST_REGS;hr++) {
4100     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4101       assert(regs[t].regmap_entry[hr]!=64);
4102       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4103         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4104         if(lr<0) {
4105           emit_loadreg(regs[t].regmap_entry[hr],hr);
4106         }
4107         else
4108         {
4109           emit_sarimm(lr,31,hr);
4110         }
4111       }
4112       else
4113       {
4114         emit_loadreg(regs[t].regmap_entry[hr],hr);
4115       }
4116     }
4117   }
4118 }
4119
4120 // Store dirty registers prior to branch
4121 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4122 {
4123   if(internal_branch(i_is32,addr))
4124   {
4125     int t=(addr-start)>>2;
4126     int hr;
4127     for(hr=0;hr<HOST_REGS;hr++) {
4128       if(hr!=EXCLUDE_REG) {
4129         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4130           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4131             if((i_dirty>>hr)&1) {
4132               if(i_regmap[hr]<64) {
4133                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4134                   emit_storereg(i_regmap[hr],hr);
4135                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4136                     #ifdef DESTRUCTIVE_WRITEBACK
4137                     emit_sarimm(hr,31,hr);
4138                     emit_storereg(i_regmap[hr]|64,hr);
4139                     #else
4140                     emit_sarimm(hr,31,HOST_TEMPREG);
4141                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4142                     #endif
4143                   }
4144                 }
4145               }else{
4146                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4147                   emit_storereg(i_regmap[hr],hr);
4148                 }
4149               }
4150             }
4151           }
4152         }
4153       }
4154     }
4155   }
4156   else
4157   {
4158     // Branch out of this block, write out all dirty regs
4159     wb_dirtys(i_regmap,i_is32,i_dirty);
4160   }
4161 }
4162
4163 // Load all needed registers for branch target
4164 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4165 {
4166   //if(addr>=start && addr<(start+slen*4))
4167   if(internal_branch(i_is32,addr))
4168   {
4169     int t=(addr-start)>>2;
4170     int hr;
4171     // Store the cycle count before loading something else
4172     if(i_regmap[HOST_CCREG]!=CCREG) {
4173       assert(i_regmap[HOST_CCREG]==-1);
4174     }
4175     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4176       emit_storereg(CCREG,HOST_CCREG);
4177     }
4178     // Load 32-bit regs
4179     for(hr=0;hr<HOST_REGS;hr++) {
4180       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4181         #ifdef DESTRUCTIVE_WRITEBACK
4182         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4183         #else
4184         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4185         #endif
4186           if(regs[t].regmap_entry[hr]==0) {
4187             emit_zeroreg(hr);
4188           }
4189           else if(regs[t].regmap_entry[hr]!=CCREG)
4190           {
4191             emit_loadreg(regs[t].regmap_entry[hr],hr);
4192           }
4193         }
4194       }
4195     }
4196     // Load 64-bit regs
4197     for(hr=0;hr<HOST_REGS;hr++) {
4198       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4199         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4200           assert(regs[t].regmap_entry[hr]!=64);
4201           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4202             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4203             if(lr<0) {
4204               emit_loadreg(regs[t].regmap_entry[hr],hr);
4205             }
4206             else
4207             {
4208               emit_sarimm(lr,31,hr);
4209             }
4210           }
4211           else
4212           {
4213             emit_loadreg(regs[t].regmap_entry[hr],hr);
4214           }
4215         }
4216         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4217           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4218           assert(lr>=0);
4219           emit_sarimm(lr,31,hr);
4220         }
4221       }
4222     }
4223   }
4224 }
4225
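// Check whether the register state at this branch (mapping, dirty bits,
// 32-bit flags) is compatible with the recorded entry state of the branch
// target, so the branch can be linked straight to the target's code with no
// fix-up.  Targets outside the block only match if nothing except the cycle
// count is dirty; delay slots never match.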
4226 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4227 {
4228   if(addr>=start && addr<start+slen*4-4)
4229   {
4230     int t=(addr-start)>>2;
4231     int hr;
4232     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4233     for(hr=0;hr<HOST_REGS;hr++)
4234     {
4235       if(hr!=EXCLUDE_REG)
4236       {
4237         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4238         {
4239           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4240           {
4241             return 0;
4242           }
4243           else
4244           if((i_dirty>>hr)&1)
4245           {
4246             if(i_regmap[hr]<TEMPREG)
4247             {
4248               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4249                 return 0;
4250             }
4251             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4252             {
4253               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4254                 return 0;
4255             }
4256           }
4257         }
4258         else // Same register but is it 32-bit or dirty?
4259         if(i_regmap[hr]>=0)
4260         {
4261           if(!((regs[t].dirty>>hr)&1))
4262           {
4263             if((i_dirty>>hr)&1)
4264             {
4265               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4266               {
4267                 //printf("%x: dirty no match\n",addr);
4268                 return 0;
4269               }
4270             }
4271           }
4272           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4273           {
4274             //printf("%x: is32 no match\n",addr);
4275             return 0;
4276           }
4277         }
4278       }
4279     }
4280     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4281     // Delay slots are not valid branch targets
4282     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4283     // Delay slots require additional processing, so do not match
4284     if(is_ds[t]) return 0;
4285   }
4286   else
4287   {
4288     int hr;
4289     for(hr=0;hr<HOST_REGS;hr++)
4290     {
4291       if(hr!=EXCLUDE_REG)
4292       {
4293         if(i_regmap[hr]>=0)
4294         {
4295           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4296           {
4297             if((i_dirty>>hr)&1)
4298             {
4299               return 0;
4300             }
4301           }
4302         }
4303       }
4304     }
4305   }
4306   return 1;
4307 }
4308
4309 // Used when a branch jumps into the delay slot of another branch
4310 void ds_assemble_entry(int i)
4311 {
4312   int t=(ba[i]-start)>>2;
4313   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4314   assem_debug("Assemble delay slot at %x\n",ba[i]);
4315   assem_debug("<->\n");
4316   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4317     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4318   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4319   address_generation(t,&regs[t],regs[t].regmap_entry);
4320   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4321     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4322   cop1_usable=0;
4323   is_delayslot=0;
4324   switch(itype[t]) {
4325     case ALU:
4326       alu_assemble(t,&regs[t]);break;
4327     case IMM16:
4328       imm16_assemble(t,&regs[t]);break;
4329     case SHIFT:
4330       shift_assemble(t,&regs[t]);break;
4331     case SHIFTIMM:
4332       shiftimm_assemble(t,&regs[t]);break;
4333     case LOAD:
4334       load_assemble(t,&regs[t]);break;
4335     case LOADLR:
4336       loadlr_assemble(t,&regs[t]);break;
4337     case STORE:
4338       store_assemble(t,&regs[t]);break;
4339     case STORELR:
4340       storelr_assemble(t,&regs[t]);break;
4341     case COP0:
4342       cop0_assemble(t,&regs[t]);break;
4343     case COP1:
4344       cop1_assemble(t,&regs[t]);break;
4345     case C1LS:
4346       c1ls_assemble(t,&regs[t]);break;
4347     case COP2:
4348       cop2_assemble(t,&regs[t]);break;
4349     case C2LS:
4350       c2ls_assemble(t,&regs[t]);break;
4351     case C2OP:
4352       c2op_assemble(t,&regs[t]);break;
4353     case FCONV:
4354       fconv_assemble(t,&regs[t]);break;
4355     case FLOAT:
4356       float_assemble(t,&regs[t]);break;
4357     case FCOMP:
4358       fcomp_assemble(t,&regs[t]);break;
4359     case MULTDIV:
4360       multdiv_assemble(t,&regs[t]);break;
4361     case MOV:
4362       mov_assemble(t,&regs[t]);break;
4363     case SYSCALL:
4364     case HLECALL:
4365     case INTCALL:
4366     case SPAN:
4367     case UJUMP:
4368     case RJUMP:
4369     case CJUMP:
4370     case SJUMP:
4371     case FJUMP:
4372       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
4373   }
4374   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4375   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4376   if(internal_branch(regs[t].is32,ba[i]+4))
4377     assem_debug("branch: internal\n");
4378   else
4379     assem_debug("branch: external\n");
4380   assert(internal_branch(regs[t].is32,ba[i]+4));
4381   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4382   emit_jmp(0);
4383 }
4384
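// Emit the cycle-counter update for a branch and the conditional jump into a
// CC_STUB that is taken when the count runs out.  *adj is set to the cycle
// adjustment the target block applies itself (internal branches only).  A
// branch-to-self with a NOP delay slot is detected as an idle loop and
// handled specially.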
4385 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4386 {
4387   int count;
4388   int jaddr;
4389   int idle=0;
4390   int t=0;
4391   if(itype[i]==RJUMP)
4392   {
4393     *adj=0;
4394   }
4395   //if(ba[i]>=start && ba[i]<(start+slen*4))
4396   if(internal_branch(branch_regs[i].is32,ba[i]))
4397   {
4398     t=(ba[i]-start)>>2;
4399     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4400     else *adj=ccadj[t];
4401   }
4402   else
4403   {
4404     *adj=0;
4405   }
4406   count=ccadj[i];
4407   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4408     // Idle loop
4409     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4410     idle=(int)out;
4411     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4412     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4413     jaddr=(int)out;
4414     emit_jmp(0);
4415   }
4416   else if(*adj==0||invert) {
4417     int cycles=CLOCK_ADJUST(count+2);
4418     // HACK: charge fewer cycles for short backward loops so they run faster
4419     if (t&&*adj) {
4420       int rel=t-i;
4421       if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
4422         cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
4423     }
4424     emit_addimm_and_set_flags(cycles,HOST_CCREG);
4425     jaddr=(int)out;
4426     emit_jns(0);
4427   }
4428   else
4429   {
4430     emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
4431     jaddr=(int)out;
4432     emit_jns(0);
4433   }
4434   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4435 }
4436
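// Generate the out-of-line CC_STUB for a branch: write back dirty registers,
// store the return PC in pcaddr (computing it from the branch condition when
// the direction is not known statically), call cc_interrupt, then reload the
// registers needed on the resumed path and jump back into the compiled code.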
4437 void do_ccstub(int n)
4438 {
4439   literal_pool(256);
4440   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4441   set_jump_target(stubs[n][1],(int)out);
4442   int i=stubs[n][4];
4443   if(stubs[n][6]==NULLDS) {
4444     // Delay slot instruction is nullified ("likely" branch)
4445     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4446   }
4447   else if(stubs[n][6]!=TAKEN) {
4448     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4449   }
4450   else {
4451     if(internal_branch(branch_regs[i].is32,ba[i]))
4452       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4453   }
4454   if(stubs[n][5]!=-1)
4455   {
4456     // Save PC as return address
4457     emit_movimm(stubs[n][5],EAX);
4458     emit_writeword(EAX,(int)&pcaddr);
4459   }
4460   else
4461   {
4462     // Return address depends on which way the branch goes
4463     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4464     {
4465       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4466       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4467       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4468       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4469       if(rs1[i]==0)
4470       {
4471         s1l=s2l;s1h=s2h;
4472         s2l=s2h=-1;
4473       }
4474       else if(rs2[i]==0)
4475       {
4476         s2l=s2h=-1;
4477       }
4478       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4479         s1h=s2h=-1;
4480       }
4481       assert(s1l>=0);
4482       #ifdef DESTRUCTIVE_WRITEBACK
4483       if(rs1[i]) {
4484         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4485           emit_loadreg(rs1[i],s1l);
4486       }
4487       else {
4488         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4489           emit_loadreg(rs2[i],s1l);
4490       }
4491       if(s2l>=0)
4492         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4493           emit_loadreg(rs2[i],s2l);
4494       #endif
4495       int hr=0;
4496       int addr=-1,alt=-1,ntaddr=-1;
4497       while(hr<HOST_REGS)
4498       {
4499         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4500            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4501            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4502         {
4503           addr=hr++;break;
4504         }
4505         hr++;
4506       }
4507       while(hr<HOST_REGS)
4508       {
4509         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4510            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4511            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4512         {
4513           alt=hr++;break;
4514         }
4515         hr++;
4516       }
4517       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4518       {
4519         while(hr<HOST_REGS)
4520         {
4521           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4522              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4523              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4524           {
4525             ntaddr=hr;break;
4526           }
4527           hr++;
4528         }
4529         assert(hr<HOST_REGS);
4530       }
4531       if((opcode[i]&0x2f)==4) // BEQ
4532       {
4533         #ifdef HAVE_CMOV_IMM
4534         if(s1h<0) {
4535           if(s2l>=0) emit_cmp(s1l,s2l);
4536           else emit_test(s1l,s1l);
4537           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4538         }
4539         else
4540         #endif
4541         {
4542           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4543           if(s1h>=0) {
4544             if(s2h>=0) emit_cmp(s1h,s2h);
4545             else emit_test(s1h,s1h);
4546             emit_cmovne_reg(alt,addr);
4547           }
4548           if(s2l>=0) emit_cmp(s1l,s2l);
4549           else emit_test(s1l,s1l);
4550           emit_cmovne_reg(alt,addr);
4551         }
4552       }
4553       if((opcode[i]&0x2f)==5) // BNE
4554       {
4555         #ifdef HAVE_CMOV_IMM
4556         if(s1h<0) {
4557           if(s2l>=0) emit_cmp(s1l,s2l);
4558           else emit_test(s1l,s1l);
4559           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4560         }
4561         else
4562         #endif
4563         {
4564           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4565           if(s1h>=0) {
4566             if(s2h>=0) emit_cmp(s1h,s2h);
4567             else emit_test(s1h,s1h);
4568             emit_cmovne_reg(alt,addr);
4569           }
4570           if(s2l>=0) emit_cmp(s1l,s2l);
4571           else emit_test(s1l,s1l);
4572           emit_cmovne_reg(alt,addr);
4573         }
4574       }
4575       if((opcode[i]&0x2f)==6) // BLEZ
4576       {
4577         //emit_movimm(ba[i],alt);
4578         //emit_movimm(start+i*4+8,addr);
4579         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4580         emit_cmpimm(s1l,1);
4581         if(s1h>=0) emit_mov(addr,ntaddr);
4582         emit_cmovl_reg(alt,addr);
4583         if(s1h>=0) {
4584           emit_test(s1h,s1h);
4585           emit_cmovne_reg(ntaddr,addr);
4586           emit_cmovs_reg(alt,addr);
4587         }
4588       }
4589       if((opcode[i]&0x2f)==7) // BGTZ
4590       {
4591         //emit_movimm(ba[i],addr);
4592         //emit_movimm(start+i*4+8,ntaddr);
4593         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4594         emit_cmpimm(s1l,1);
4595         if(s1h>=0) emit_mov(addr,alt);
4596         emit_cmovl_reg(ntaddr,addr);
4597         if(s1h>=0) {
4598           emit_test(s1h,s1h);
4599           emit_cmovne_reg(alt,addr);
4600           emit_cmovs_reg(ntaddr,addr);
4601         }
4602       }
4603       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4604       {
4605         //emit_movimm(ba[i],alt);
4606         //emit_movimm(start+i*4+8,addr);
4607         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4608         if(s1h>=0) emit_test(s1h,s1h);
4609         else emit_test(s1l,s1l);
4610         emit_cmovs_reg(alt,addr);
4611       }
4612       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4613       {
4614         //emit_movimm(ba[i],addr);
4615         //emit_movimm(start+i*4+8,alt);
4616         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4617         if(s1h>=0) emit_test(s1h,s1h);
4618         else emit_test(s1l,s1l);
4619         emit_cmovs_reg(alt,addr);
4620       }
4621       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4622         if(source[i]&0x10000) // BC1T
4623         {
4624           //emit_movimm(ba[i],alt);
4625           //emit_movimm(start+i*4+8,addr);
4626           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4627           emit_testimm(s1l,0x800000);
4628           emit_cmovne_reg(alt,addr);
4629         }
4630         else // BC1F
4631         {
4632           //emit_movimm(ba[i],addr);
4633           //emit_movimm(start+i*4+8,alt);
4634           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4635           emit_testimm(s1l,0x800000);
4636           emit_cmovne_reg(alt,addr);
4637         }
4638       }
4639       emit_writeword(addr,(int)&pcaddr);
4640     }
4641     else
4642     if(itype[i]==RJUMP)
4643     {
4644       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4645       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4646         r=get_reg(branch_regs[i].regmap,RTEMP);
4647       }
4648       emit_writeword(r,(int)&pcaddr);
4649     }
4650     else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);}
4651   }
4652   // Update cycle count
4653   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4654   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
4655   emit_call((int)cc_interrupt);
4656   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
4657   if(stubs[n][6]==TAKEN) {
4658     if(internal_branch(branch_regs[i].is32,ba[i]))
4659       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4660     else if(itype[i]==RJUMP) {
4661       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4662         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4663       else
4664         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4665     }
4666   }else if(stubs[n][6]==NOTTAKEN) {
4667     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4668     else load_all_regs(branch_regs[i].regmap);
4669   }else if(stubs[n][6]==NULLDS) {
4670     // Delay slot instruction is nullified ("likely" branch)
4671     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4672     else load_all_regs(regs[i].regmap);
4673   }else{
4674     load_all_regs(branch_regs[i].regmap);
4675   }
4676   emit_jmp(stubs[n][2]); // return address
4677
4678   /* This works but uses a lot of memory...
4679   emit_readword((int)&last_count,ECX);
4680   emit_add(HOST_CCREG,ECX,EAX);
4681   emit_writeword(EAX,(int)&Count);
4682   emit_call((int)gen_interupt);
4683   emit_readword((int)&Count,HOST_CCREG);
4684   emit_readword((int)&next_interupt,EAX);
4685   emit_readword((int)&pending_exception,EBX);
4686   emit_writeword(EAX,(int)&last_count);
4687   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
4688   emit_test(EBX,EBX);
4689   int jne_instr=(int)out;
4690   emit_jne(0);
4691   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
4692   load_all_regs(branch_regs[i].regmap);
4693   emit_jmp(stubs[n][2]); // return address
4694   set_jump_target(jne_instr,(int)out);
4695   emit_readword((int)&pcaddr,EAX);
4696   // Call get_addr_ht instead of doing the hash table here.
4697   // This code is executed infrequently and takes up a lot of space
4698   // so smaller is better.
4699   emit_storereg(CCREG,HOST_CCREG);
4700   emit_pushreg(EAX);
4701   emit_call((int)get_addr_ht);
4702   emit_loadreg(CCREG,HOST_CCREG);
4703   emit_addimm(ESP,4,ESP);
4704   emit_jmpreg(EAX);*/
4705 }
4706
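// Record a jump that still needs to be patched: the location of the emitted
// branch, its target virtual address, and a flag describing how the target
// is to be resolved (internal vs. external to this block).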
4707 static void add_to_linker(int addr,int target,int ext)
4708 {
4709   link_addr[linkcount][0]=addr;
4710   link_addr[linkcount][1]=target;
4711   link_addr[linkcount][2]=ext;
4712   linkcount++;
4713 }
4714
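// Write the return address (PC+8) into $ra for JAL.  With USE_MINI_HT the
// value may be inserted into the mini hash table; otherwise a plain immediate
// load is emitted, optionally with a prefetch of the hash table entry.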
4715 static void ujump_assemble_write_ra(int i)
4716 {
4717   int rt;
4718   unsigned int return_address;
4719   rt=get_reg(branch_regs[i].regmap,31);
4720   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4721   //assert(rt>=0);
4722   return_address=start+i*4+8;
4723   if(rt>=0) {
4724     #ifdef USE_MINI_HT
4725     if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
4726       int temp=-1; // note: must be ds-safe
4727       #ifdef HOST_TEMPREG
4728       temp=HOST_TEMPREG;
4729       #endif
4730       if(temp>=0) do_miniht_insert(return_address,rt,temp);
4731       else emit_movimm(return_address,rt);
4732     }
4733     else
4734     #endif
4735     {
4736       #ifdef REG_PREFETCH
4737       if(temp>=0)
4738       {
4739         if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4740       }
4741       #endif
4742       emit_movimm(return_address,rt); // PC into link register
4743       #ifdef IMM_PREFETCH
4744       emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4745       #endif
4746     }
4747   }
4748 }
4749
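// J/JAL: assemble the delay slot first (writing $ra beforehand if the slot
// reads it), write back and reload registers for the target, update the
// cycle count, then either fall into the target's delay-slot entry or emit
// a jump to be patched by the linker.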
4750 void ujump_assemble(int i,struct regstat *i_regs)
4751 {
4752   int ra_done=0;
4753   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4754   address_generation(i+1,i_regs,regs[i].regmap_entry);
4755   #ifdef REG_PREFETCH
4756   int temp=get_reg(branch_regs[i].regmap,PTEMP);
4757   if(rt1[i]==31&&temp>=0)
4758   {
4759     signed char *i_regmap=i_regs->regmap;
4760     int return_address=start+i*4+8;
4761     if(get_reg(branch_regs[i].regmap,31)>0)
4762     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4763   }
4764   #endif
4765   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4766     ujump_assemble_write_ra(i); // write back $ra now, the delay slot reads it
4767     ra_done=1;
4768   }
4769   ds_assemble(i+1,i_regs);
4770   uint64_t bc_unneeded=branch_regs[i].u;
4771   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4772   bc_unneeded|=1|(1LL<<rt1[i]);
4773   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4774   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4775                 bc_unneeded,bc_unneeded_upper);
4776   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4777   if(!ra_done&&rt1[i]==31)
4778     ujump_assemble_write_ra(i);
4779   int cc,adj;
4780   cc=get_reg(branch_regs[i].regmap,CCREG);
4781   assert(cc==HOST_CCREG);
4782   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4783   #ifdef REG_PREFETCH
4784   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4785   #endif
4786   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4787   if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4788   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4789   if(internal_branch(branch_regs[i].is32,ba[i]))
4790     assem_debug("branch: internal\n");
4791   else
4792     assem_debug("branch: external\n");
4793   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
4794     ds_assemble_entry(i);
4795   }
4796   else {
4797     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
4798     emit_jmp(0);
4799   }
4800 }
4801
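// Write the return address (PC+8) into the destination register of JALR.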
4802 static void rjump_assemble_write_ra(int i)
4803 {
4804   int rt,return_address;
4805   assert(rt1[i+1]!=rt1[i]);
4806   assert(rt2[i+1]!=rt1[i]);
4807   rt=get_reg(branch_regs[i].regmap,rt1[i]);
4808   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4809   assert(rt>=0);
4810   return_address=start+i*4+8;
4811   #ifdef REG_PREFETCH
4812   if(temp>=0)
4813   {
4814     if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4815   }
4816   #endif
4817   emit_movimm(return_address,rt); // PC into link register
4818   #ifdef IMM_PREFETCH
4819   emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4820   #endif
4821 }
4822
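// JR/JALR: the target address lives in a register, which is copied to RTEMP
// if the delay slot clobbers it.  After the delay slot the cycle count is
// checked and control transfers through jump_vaddr_reg[] (or the mini hash
// table path for JR $ra when USE_MINI_HT is defined) to reach the target.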
4823 void rjump_assemble(int i,struct regstat *i_regs)
4824 {
4825   int temp;
4826   int rs,cc;
4827   int ra_done=0;
4828   rs=get_reg(branch_regs[i].regmap,rs1[i]);
4829   assert(rs>=0);
4830   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4831     // Delay slot abuse, make a copy of the branch address register
4832     temp=get_reg(branch_regs[i].regmap,RTEMP);
4833     assert(temp>=0);
4834     assert(regs[i].regmap[temp]==RTEMP);
4835     emit_mov(rs,temp);
4836     rs=temp;
4837   }
4838   address_generation(i+1,i_regs,regs[i].regmap_entry);
4839   #ifdef REG_PREFETCH
4840   if(rt1[i]==31)
4841   {
4842     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
4843       signed char *i_regmap=i_regs->regmap;
4844       int return_address=start+i*4+8;
4845       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4846     }
4847   }
4848   #endif
4849   #ifdef USE_MINI_HT
4850   if(rs1[i]==31) {
4851     int rh=get_reg(regs[i].regmap,RHASH);
4852     if(rh>=0) do_preload_rhash(rh);
4853   }
4854   #endif
4855   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4856     rjump_assemble_write_ra(i);
4857     ra_done=1;
4858   }
4859   ds_assemble(i+1,i_regs);
4860   uint64_t bc_unneeded=branch_regs[i].u;
4861   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4862   bc_unneeded|=1|(1LL<<rt1[i]);
4863   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4864   bc_unneeded&=~(1LL<<rs1[i]);
4865   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4866                 bc_unneeded,bc_unneeded_upper);
4867   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
4868   if(!ra_done&&rt1[i]!=0)
4869     rjump_assemble_write_ra(i);
4870   cc=get_reg(branch_regs[i].regmap,CCREG);
4871   assert(cc==HOST_CCREG);
4872   (void)cc;
4873   #ifdef USE_MINI_HT
4874   int rh=get_reg(branch_regs[i].regmap,RHASH);
4875   int ht=get_reg(branch_regs[i].regmap,RHTBL);
4876   if(rs1[i]==31) {
4877     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
4878     do_preload_rhtbl(ht);
4879     do_rhash(rs,rh);
4880   }
4881   #endif
4882   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4883   #ifdef DESTRUCTIVE_WRITEBACK
4884   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
4885     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
4886       emit_loadreg(rs1[i],rs);
4887     }
4888   }
4889   #endif
4890   #ifdef REG_PREFETCH
4891   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4892   #endif
4893   #ifdef USE_MINI_HT
4894   if(rs1[i]==31) {
4895     do_miniht_load(ht,rh);
4896   }
4897   #endif
4898   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
4899   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
4900   //assert(adj==0);
4901   emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
4902   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
4903   if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
4904     // special case for RFE
4905     emit_jmp(0);
4906   else
4907     emit_jns(0);
4908   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4909   #ifdef USE_MINI_HT
4910   if(rs1[i]==31) {
4911     do_miniht_jump(rs,rh,ht);
4912   }
4913   else
4914   #endif
4915   {
4916     //if(rs!=EAX) emit_mov(rs,EAX);
4917     //emit_jmp((int)jump_vaddr_eax);
4918     emit_jmp(jump_vaddr_reg[rs]);
4919   }
4920   /* Check hash table
4921   temp=!rs;
4922   emit_mov(rs,temp);
4923   emit_shrimm(rs,16,rs);
4924   emit_xor(temp,rs,rs);
4925   emit_movzwl_reg(rs,rs);
4926   emit_shlimm(rs,4,rs);
4927   emit_cmpmem_indexed((int)hash_table,rs,temp);
4928   emit_jne((int)out+14);
4929   emit_readword_indexed((int)hash_table+4,rs,rs);
4930   emit_jmpreg(rs);
4931   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
4932   emit_addimm_no_flags(8,rs);
4933   emit_jeq((int)out-17);
4934   // No hit on hash table, call compiler
4935   emit_pushreg(temp);
4936 //DEBUG >
4937 #ifdef DEBUG_CYCLE_COUNT
4938   emit_readword((int)&last_count,ECX);
4939   emit_add(HOST_CCREG,ECX,HOST_CCREG);
4940   emit_readword((int)&next_interupt,ECX);
4941   emit_writeword(HOST_CCREG,(int)&Count);
4942   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
4943   emit_writeword(ECX,(int)&last_count);
4944 #endif
4945 //DEBUG <
4946   emit_storereg(CCREG,HOST_CCREG);
4947   emit_call((int)get_addr);
4948   emit_loadreg(CCREG,HOST_CCREG);
4949   emit_addimm(ESP,4,ESP);
4950   emit_jmpreg(EAX);*/
4951   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4952   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
4953   #endif
4954 }
4955
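// BEQ/BNE/BLEZ/BGTZ: conditional branches on register comparison.  Handles
// both out-of-order (delay slot first) and in-order cases, does 64-bit
// comparisons when the operands are not known to be 32-bit, and inverts the
// emitted test when the target's entry state does not match so the taken
// path can be assembled inline with the required register fix-ups.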
4956 void cjump_assemble(int i,struct regstat *i_regs)
4957 {
4958   signed char *i_regmap=i_regs->regmap;
4959   int cc;
4960   int match;
4961   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4962   assem_debug("match=%d\n",match);
4963   int s1h,s1l,s2h,s2l;
4964   int prev_cop1_usable=cop1_usable;
4965   int unconditional=0,nop=0;
4966   int only32=0;
4967   int invert=0;
4968   int internal=internal_branch(branch_regs[i].is32,ba[i]);
4969   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4970   if(!match) invert=1;
4971   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4972   if(i>(ba[i]-start)>>2) invert=1;
4973   #endif
4974
4975   if(ooo[i]) {
4976     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4977     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4978     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4979     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4980   }
4981   else {
4982     s1l=get_reg(i_regmap,rs1[i]);
4983     s1h=get_reg(i_regmap,rs1[i]|64);
4984     s2l=get_reg(i_regmap,rs2[i]);
4985     s2h=get_reg(i_regmap,rs2[i]|64);
4986   }
4987   if(rs1[i]==0&&rs2[i]==0)
4988   {
4989     if(opcode[i]&1) nop=1;
4990     else unconditional=1;
4991     //assert(opcode[i]!=5);
4992     //assert(opcode[i]!=7);
4993     //assert(opcode[i]!=0x15);
4994     //assert(opcode[i]!=0x17);
4995   }
4996   else if(rs1[i]==0)
4997   {
4998     s1l=s2l;s1h=s2h;
4999     s2l=s2h=-1;
5000     only32=(regs[i].was32>>rs2[i])&1;
5001   }
5002   else if(rs2[i]==0)
5003   {
5004     s2l=s2h=-1;
5005     only32=(regs[i].was32>>rs1[i])&1;
5006   }
5007   else {
5008     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5009   }
5010
5011   if(ooo[i]) {
5012     // Out of order execution (delay slot first)
5013     //printf("OOOE\n");
5014     address_generation(i+1,i_regs,regs[i].regmap_entry);
5015     ds_assemble(i+1,i_regs);
5016     int adj;
5017     uint64_t bc_unneeded=branch_regs[i].u;
5018     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5019     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5020     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5021     bc_unneeded|=1;
5022     bc_unneeded_upper|=1;
5023     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5024                   bc_unneeded,bc_unneeded_upper);
5025     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5026     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5027     cc=get_reg(branch_regs[i].regmap,CCREG);
5028     assert(cc==HOST_CCREG);
5029     if(unconditional)
5030       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5031     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5032     //assem_debug("cycle count (adj)\n");
5033     if(unconditional) {
5034       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5035       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5036         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5037         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5038         if(internal)
5039           assem_debug("branch: internal\n");
5040         else
5041           assem_debug("branch: external\n");
5042         if(internal&&is_ds[(ba[i]-start)>>2]) {
5043           ds_assemble_entry(i);
5044         }
5045         else {
5046           add_to_linker((int)out,ba[i],internal);
5047           emit_jmp(0);
5048         }
5049         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5050         if(((u_int)out)&7) emit_addnop(0);
5051         #endif
5052       }
5053     }
5054     else if(nop) {
5055       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5056       int jaddr=(int)out;
5057       emit_jns(0);
5058       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5059     }
5060     else {
5061       int taken=0,nottaken=0,nottaken1=0;
5062       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5063       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5064       if(!only32)
5065       {
5066         assert(s1h>=0);
5067         if(opcode[i]==4) // BEQ
5068         {
5069           if(s2h>=0) emit_cmp(s1h,s2h);
5070           else emit_test(s1h,s1h);
5071           nottaken1=(int)out;
5072           emit_jne(1);
5073         }
5074         if(opcode[i]==5) // BNE
5075         {
5076           if(s2h>=0) emit_cmp(s1h,s2h);
5077           else emit_test(s1h,s1h);
5078           if(invert) taken=(int)out;
5079           else add_to_linker((int)out,ba[i],internal);
5080           emit_jne(0);
5081         }
5082         if(opcode[i]==6) // BLEZ
5083         {
5084           emit_test(s1h,s1h);
5085           if(invert) taken=(int)out;
5086           else add_to_linker((int)out,ba[i],internal);
5087           emit_js(0);
5088           nottaken1=(int)out;
5089           emit_jne(1);
5090         }
5091         if(opcode[i]==7) // BGTZ
5092         {
5093           emit_test(s1h,s1h);
5094           nottaken1=(int)out;
5095           emit_js(1);
5096           if(invert) taken=(int)out;
5097           else add_to_linker((int)out,ba[i],internal);
5098           emit_jne(0);
5099         }
5100       } // if(!only32)
5101
5102       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5103       assert(s1l>=0);
5104       if(opcode[i]==4) // BEQ
5105       {
5106         if(s2l>=0) emit_cmp(s1l,s2l);
5107         else emit_test(s1l,s1l);
5108         if(invert){
5109           nottaken=(int)out;
5110           emit_jne(1);
5111         }else{
5112           add_to_linker((int)out,ba[i],internal);
5113           emit_jeq(0);
5114         }
5115       }
5116       if(opcode[i]==5) // BNE
5117       {
5118         if(s2l>=0) emit_cmp(s1l,s2l);
5119         else emit_test(s1l,s1l);
5120         if(invert){
5121           nottaken=(int)out;
5122           emit_jeq(1);
5123         }else{
5124           add_to_linker((int)out,ba[i],internal);
5125           emit_jne(0);
5126         }
5127       }
5128       if(opcode[i]==6) // BLEZ
5129       {
5130         emit_cmpimm(s1l,1);
5131         if(invert){
5132           nottaken=(int)out;
5133           emit_jge(1);
5134         }else{
5135           add_to_linker((int)out,ba[i],internal);
5136           emit_jl(0);
5137         }
5138       }
5139       if(opcode[i]==7) // BGTZ
5140       {
5141         emit_cmpimm(s1l,1);
5142         if(invert){
5143           nottaken=(int)out;
5144           emit_jl(1);
5145         }else{
5146           add_to_linker((int)out,ba[i],internal);
5147           emit_jge(0);
5148         }
5149       }
5150       if(invert) {
5151         if(taken) set_jump_target(taken,(int)out);
5152         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5153         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5154           if(adj) {
5155             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5156             add_to_linker((int)out,ba[i],internal);
5157           }else{
5158             emit_addnop(13);
5159             add_to_linker((int)out,ba[i],internal*2);
5160           }
5161           emit_jmp(0);
5162         }else
5163         #endif
5164         {
5165           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5166           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5167           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5168           if(internal)
5169             assem_debug("branch: internal\n");
5170           else
5171             assem_debug("branch: external\n");
5172           if(internal&&is_ds[(ba[i]-start)>>2]) {
5173             ds_assemble_entry(i);
5174           }
5175           else {
5176             add_to_linker((int)out,ba[i],internal);
5177             emit_jmp(0);
5178           }
5179         }
5180         set_jump_target(nottaken,(int)out);
5181       }
5182
5183       if(nottaken1) set_jump_target(nottaken1,(int)out);
5184       if(adj) {
5185         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5186       }
5187     } // (!unconditional)
5188   } // if(ooo)
5189   else
5190   {
5191     // In-order execution (branch first)
5192     //if(likely[i]) printf("IOL\n");
5193     //else
5194     //printf("IOE\n");
5195     int taken=0,nottaken=0,nottaken1=0;
5196     if(!unconditional&&!nop) {
5197       if(!only32)
5198       {
5199         assert(s1h>=0);
5200         if((opcode[i]&0x2f)==4) // BEQ
5201         {
5202           if(s2h>=0) emit_cmp(s1h,s2h);
5203           else emit_test(s1h,s1h);
5204           nottaken1=(int)out;
5205           emit_jne(2);
5206         }
5207         if((opcode[i]&0x2f)==5) // BNE
5208         {
5209           if(s2h>=0) emit_cmp(s1h,s2h);
5210           else emit_test(s1h,s1h);
5211           taken=(int)out;
5212           emit_jne(1);
5213         }
5214         if((opcode[i]&0x2f)==6) // BLEZ
5215         {
5216           emit_test(s1h,s1h);
5217           taken=(int)out;
5218           emit_js(1);
5219           nottaken1=(int)out;
5220           emit_jne(2);
5221         }
5222         if((opcode[i]&0x2f)==7) // BGTZ
5223         {
5224           emit_test(s1h,s1h);
5225           nottaken1=(int)out;
5226           emit_js(2);
5227           taken=(int)out;
5228           emit_jne(1);
5229         }
5230       } // if(!only32)
5231
5232       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5233       assert(s1l>=0);
5234       if((opcode[i]&0x2f)==4) // BEQ
5235       {
5236         if(s2l>=0) emit_cmp(s1l,s2l);
5237         else emit_test(s1l,s1l);
5238         nottaken=(int)out;
5239         emit_jne(2);
5240       }
5241       if((opcode[i]&0x2f)==5) // BNE
5242       {
5243         if(s2l>=0) emit_cmp(s1l,s2l);
5244         else emit_test(s1l,s1l);
5245         nottaken=(int)out;
5246         emit_jeq(2);
5247       }
5248       if((opcode[i]&0x2f)==6) // BLEZ
5249       {
5250         emit_cmpimm(s1l,1);
5251         nottaken=(int)out;
5252         emit_jge(2);
5253       }
5254       if((opcode[i]&0x2f)==7) // BGTZ
5255       {
5256         emit_cmpimm(s1l,1);
5257         nottaken=(int)out;
5258         emit_jl(2);
5259       }
5260     } // if(!unconditional)
5261     int adj;
5262     uint64_t ds_unneeded=branch_regs[i].u;
5263     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5264     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5265     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5266     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5267     ds_unneeded|=1;
5268     ds_unneeded_upper|=1;
5269     // branch taken
5270     if(!nop) {
5271       if(taken) set_jump_target(taken,(int)out);
5272       assem_debug("1:\n");
5273       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5274                     ds_unneeded,ds_unneeded_upper);
5275       // load regs
5276       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5277       address_generation(i+1,&branch_regs[i],0);
5278       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5279       ds_assemble(i+1,&branch_regs[i]);
5280       cc=get_reg(branch_regs[i].regmap,CCREG);
5281       if(cc==-1) {
5282         emit_loadreg(CCREG,cc=HOST_CCREG);
5283         // CHECK: Is the following instruction (fall thru) allocated ok?
5284       }
5285       assert(cc==HOST_CCREG);
5286       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5287       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5288       assem_debug("cycle count (adj)\n");
5289       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5290       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5291       if(internal)
5292         assem_debug("branch: internal\n");
5293       else
5294         assem_debug("branch: external\n");
5295       if(internal&&is_ds[(ba[i]-start)>>2]) {
5296         ds_assemble_entry(i);
5297       }
5298       else {
5299         add_to_linker((int)out,ba[i],internal);
5300         emit_jmp(0);
5301       }
5302     }
5303     // branch not taken
5304     cop1_usable=prev_cop1_usable;
5305     if(!unconditional) {
5306       if(nottaken1) set_jump_target(nottaken1,(int)out);
5307       set_jump_target(nottaken,(int)out);
5308       assem_debug("2:\n");
5309       if(!likely[i]) {
5310         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5311                       ds_unneeded,ds_unneeded_upper);
5312         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5313         address_generation(i+1,&branch_regs[i],0);
5314         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5315         ds_assemble(i+1,&branch_regs[i]);
5316       }
5317       cc=get_reg(branch_regs[i].regmap,CCREG);
5318       if(cc==-1&&!likely[i]) {
5319         // Cycle count isn't in a register, temporarily load it then write it out
5320         emit_loadreg(CCREG,HOST_CCREG);
5321         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5322         int jaddr=(int)out;
5323         emit_jns(0);
5324         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5325         emit_storereg(CCREG,HOST_CCREG);
5326       }
5327       else{
5328         cc=get_reg(i_regmap,CCREG);
5329         assert(cc==HOST_CCREG);
5330         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5331         int jaddr=(int)out;
5332         emit_jns(0);
5333         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5334       }
5335     }
5336   }
5337 }
5338
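// Assemble the 'regimm' (opcode 1) branches BLTZ/BGEZ and their AL/likely variants:
// they test the sign of rs1, and the -AL forms also write the return address to $31.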
5339 void sjump_assemble(int i,struct regstat *i_regs)
5340 {
5341   signed char *i_regmap=i_regs->regmap;
5342   int cc;
5343   int match;
5344   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5345   assem_debug("smatch=%d\n",match);
5346   int s1h,s1l;
5347   int prev_cop1_usable=cop1_usable;
5348   int unconditional=0,nevertaken=0;
5349   int only32=0;
5350   int invert=0;
5351   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5352   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5353   if(!match) invert=1;
5354   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5355   if(i>(ba[i]-start)>>2) invert=1;
5356   #endif
5357
5358   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5359   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5360
5361   if(ooo[i]) {
5362     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5363     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5364   }
5365   else {
5366     s1l=get_reg(i_regmap,rs1[i]);
5367     s1h=get_reg(i_regmap,rs1[i]|64);
5368   }
5369   if(rs1[i]==0)
5370   {
5371     if(opcode2[i]&1) unconditional=1;
5372     else nevertaken=1;
5373     // These are never taken (r0 is never less than zero)
5374     //assert(opcode2[i]!=0);
5375     //assert(opcode2[i]!=2);
5376     //assert(opcode2[i]!=0x10);
5377     //assert(opcode2[i]!=0x12);
5378   }
5379   else {
5380     only32=(regs[i].was32>>rs1[i])&1;
5381   }
5382
5383   if(ooo[i]) {
5384     // Out of order execution (delay slot first)
5385     //printf("OOOE\n");
5386     address_generation(i+1,i_regs,regs[i].regmap_entry);
5387     ds_assemble(i+1,i_regs);
5388     int adj;
5389     uint64_t bc_unneeded=branch_regs[i].u;
5390     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5391     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5392     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5393     bc_unneeded|=1;
5394     bc_unneeded_upper|=1;
5395     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5396                   bc_unneeded,bc_unneeded_upper);
5397     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5398     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5399     if(rt1[i]==31) {
5400       int rt,return_address;
5401       rt=get_reg(branch_regs[i].regmap,31);
5402       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5403       if(rt>=0) {
5404         // Save the PC even if the branch is not taken
5405         return_address=start+i*4+8;
5406         emit_movimm(return_address,rt); // PC into link register
5407         #ifdef IMM_PREFETCH
5408         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5409         #endif
5410       }
5411     }
5412     cc=get_reg(branch_regs[i].regmap,CCREG);
5413     assert(cc==HOST_CCREG);
5414     if(unconditional)
5415       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5416     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5417     assem_debug("cycle count (adj)\n");
5418     if(unconditional) {
5419       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5420       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5421         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5422         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5423         if(internal)
5424           assem_debug("branch: internal\n");
5425         else
5426           assem_debug("branch: external\n");
5427         if(internal&&is_ds[(ba[i]-start)>>2]) {
5428           ds_assemble_entry(i);
5429         }
5430         else {
5431           add_to_linker((int)out,ba[i],internal);
5432           emit_jmp(0);
5433         }
5434         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5435         if(((u_int)out)&7) emit_addnop(0);
5436         #endif
5437       }
5438     }
5439     else if(nevertaken) {
5440       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5441       int jaddr=(int)out;
5442       emit_jns(0);
5443       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5444     }
5445     else {
5446       int nottaken=0;
5447       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5448       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5449       if(!only32)
5450       {
5451         assert(s1h>=0);
5452         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5453         {
5454           emit_test(s1h,s1h);
5455           if(invert){
5456             nottaken=(int)out;
5457             emit_jns(1);
5458           }else{
5459             add_to_linker((int)out,ba[i],internal);
5460             emit_js(0);
5461           }
5462         }
5463         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5464         {
5465           emit_test(s1h,s1h);
5466           if(invert){
5467             nottaken=(int)out;
5468             emit_js(1);
5469           }else{
5470             add_to_linker((int)out,ba[i],internal);
5471             emit_jns(0);
5472           }
5473         }
5474       } // if(!only32)
5475       else
5476       {
5477         assert(s1l>=0);
5478         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5479         {
5480           emit_test(s1l,s1l);
5481           if(invert){
5482             nottaken=(int)out;
5483             emit_jns(1);
5484           }else{
5485             add_to_linker((int)out,ba[i],internal);
5486             emit_js(0);
5487           }
5488         }
5489         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5490         {
5491           emit_test(s1l,s1l);
5492           if(invert){
5493             nottaken=(int)out;
5494             emit_js(1);
5495           }else{
5496             add_to_linker((int)out,ba[i],internal);
5497             emit_jns(0);
5498           }
5499         }
5500       } // if(!only32)
5501
5502       if(invert) {
5503         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5504         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5505           if(adj) {
5506             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5507             add_to_linker((int)out,ba[i],internal);
5508           }else{
5509             emit_addnop(13);
5510             add_to_linker((int)out,ba[i],internal*2);
5511           }
5512           emit_jmp(0);
5513         }else
5514         #endif
5515         {
5516           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5517           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5518           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5519           if(internal)
5520             assem_debug("branch: internal\n");
5521           else
5522             assem_debug("branch: external\n");
5523           if(internal&&is_ds[(ba[i]-start)>>2]) {
5524             ds_assemble_entry(i);
5525           }
5526           else {
5527             add_to_linker((int)out,ba[i],internal);
5528             emit_jmp(0);
5529           }
5530         }
5531         set_jump_target(nottaken,(int)out);
5532       }
5533
5534       if(adj) {
5535         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5536       }
5537     } // (!unconditional)
5538   } // if(ooo)
5539   else
5540   {
5541     // In-order execution (branch first)
5542     //printf("IOE\n");
5543     int nottaken=0;
5544     if(rt1[i]==31) {
5545       int rt,return_address;
5546       rt=get_reg(branch_regs[i].regmap,31);
5547       if(rt>=0) {
5548         // Save the PC even if the branch is not taken
5549         return_address=start+i*4+8;
5550         emit_movimm(return_address,rt); // PC into link register
5551         #ifdef IMM_PREFETCH
5552         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5553         #endif
5554       }
5555     }
5556     if(!unconditional) {
5557       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5558       if(!only32)
5559       {
5560         assert(s1h>=0);
5561         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5562         {
5563           emit_test(s1h,s1h);
5564           nottaken=(int)out;
5565           emit_jns(1);
5566         }
5567         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5568         {
5569           emit_test(s1h,s1h);
5570           nottaken=(int)out;
5571           emit_js(1);
5572         }
5573       } // if(!only32)
5574       else
5575       {
5576         assert(s1l>=0);
5577         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5578         {
5579           emit_test(s1l,s1l);
5580           nottaken=(int)out;
5581           emit_jns(1);
5582         }
5583         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5584         {
5585           emit_test(s1l,s1l);
5586           nottaken=(int)out;
5587           emit_js(1);
5588         }
5589       }
5590     } // if(!unconditional)
5591     int adj;
5592     uint64_t ds_unneeded=branch_regs[i].u;
5593     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5594     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5595     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5596     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5597     ds_unneeded|=1;
5598     ds_unneeded_upper|=1;
5599     // branch taken
5600     if(!nevertaken) {
5601       //assem_debug("1:\n");
5602       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5603                     ds_unneeded,ds_unneeded_upper);
5604       // load regs
5605       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5606       address_generation(i+1,&branch_regs[i],0);
5607       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5608       ds_assemble(i+1,&branch_regs[i]);
5609       cc=get_reg(branch_regs[i].regmap,CCREG);
5610       if(cc==-1) {
5611         emit_loadreg(CCREG,cc=HOST_CCREG);
5612         // CHECK: Is the following instruction (fall thru) allocated ok?
5613       }
5614       assert(cc==HOST_CCREG);
5615       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5616       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5617       assem_debug("cycle count (adj)\n");
5618       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5619       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5620       if(internal)
5621         assem_debug("branch: internal\n");
5622       else
5623         assem_debug("branch: external\n");
5624       if(internal&&is_ds[(ba[i]-start)>>2]) {
5625         ds_assemble_entry(i);
5626       }
5627       else {
5628         add_to_linker((int)out,ba[i],internal);
5629         emit_jmp(0);
5630       }
5631     }
5632     // branch not taken
5633     cop1_usable=prev_cop1_usable;
5634     if(!unconditional) {
5635       set_jump_target(nottaken,(int)out);
5636       assem_debug("1:\n");
5637       if(!likely[i]) {
5638         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5639                       ds_unneeded,ds_unneeded_upper);
5640         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5641         address_generation(i+1,&branch_regs[i],0);
5642         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5643         ds_assemble(i+1,&branch_regs[i]);
5644       }
5645       cc=get_reg(branch_regs[i].regmap,CCREG);
5646       if(cc==-1&&!likely[i]) {
5647         // Cycle count isn't in a register, temporarily load it then write it out
5648         emit_loadreg(CCREG,HOST_CCREG);
5649         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5650         int jaddr=(int)out;
5651         emit_jns(0);
5652         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5653         emit_storereg(CCREG,HOST_CCREG);
5654       }
5655       else{
5656         cc=get_reg(i_regmap,CCREG);
5657         assert(cc==HOST_CCREG);
5658         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5659         int jaddr=(int)out;
5660         emit_jns(0);
5661         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5662       }
5663     }
5664   }
5665 }
5666
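// Assemble the cop1 condition branches BC1F/BC1T: emit the coprocessor-usable check
// if needed, then test the condition bit (0x800000) held in FSREG.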
5667 void fjump_assemble(int i,struct regstat *i_regs)
5668 {
5669   signed char *i_regmap=i_regs->regmap;
5670   int cc;
5671   int match;
5672   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5673   assem_debug("fmatch=%d\n",match);
5674   int fs,cs;
5675   int eaddr;
5676   int invert=0;
5677   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5678   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5679   if(!match) invert=1;
5680   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5681   if(i>(ba[i]-start)>>2) invert=1;
5682   #endif
5683
5684   if(ooo[i]) {
5685     fs=get_reg(branch_regs[i].regmap,FSREG);
5686     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5687   }
5688   else {
5689     fs=get_reg(i_regmap,FSREG);
5690   }
5691
5692   // Check cop1 unusable
5693   if(!cop1_usable) {
5694     cs=get_reg(i_regmap,CSREG);
5695     assert(cs>=0);
5696     emit_testimm(cs,0x20000000);
5697     eaddr=(int)out;
5698     emit_jeq(0);
5699     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
5700     cop1_usable=1;
5701   }
5702
5703   if(ooo[i]) {
5704     // Out of order execution (delay slot first)
5705     //printf("OOOE\n");
5706     ds_assemble(i+1,i_regs);
5707     int adj;
5708     uint64_t bc_unneeded=branch_regs[i].u;
5709     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5710     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5711     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5712     bc_unneeded|=1;
5713     bc_unneeded_upper|=1;
5714     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5715                   bc_unneeded,bc_unneeded_upper);
5716     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5717     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5718     cc=get_reg(branch_regs[i].regmap,CCREG);
5719     assert(cc==HOST_CCREG);
5720     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5721     assem_debug("cycle count (adj)\n");
5722     if(1) {
5723       int nottaken=0;
5724       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5725       if(1) {
5726         assert(fs>=0);
5727         emit_testimm(fs,0x800000);
5728         if(source[i]&0x10000) // BC1T
5729         {
5730           if(invert){
5731             nottaken=(int)out;
5732             emit_jeq(1);
5733           }else{
5734             add_to_linker((int)out,ba[i],internal);
5735             emit_jne(0);
5736           }
5737         }
5738         else // BC1F
5739           if(invert){
5740             nottaken=(int)out;
5741             emit_jne(1);
5742           }else{
5743             add_to_linker((int)out,ba[i],internal);
5744             emit_jeq(0);
5745           }
5748       } // if(1)
5749
5750       if(invert) {
5751         if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5752         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5753         else if(match) emit_addnop(13);
5754         #endif
5755         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5756         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5757         if(internal)
5758           assem_debug("branch: internal\n");
5759         else
5760           assem_debug("branch: external\n");
5761         if(internal&&is_ds[(ba[i]-start)>>2]) {
5762           ds_assemble_entry(i);
5763         }
5764         else {
5765           add_to_linker((int)out,ba[i],internal);
5766           emit_jmp(0);
5767         }
5768         set_jump_target(nottaken,(int)out);
5769       }
5770
5771       if(adj) {
5772         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5773       }
5774     } // if(1)
5775   } // if(ooo)
5776   else
5777   {
5778     // In-order execution (branch first)
5779     //printf("IOE\n");
5780     int nottaken=0;
5781     if(1) {
5782       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5783       if(1) {
5784         assert(fs>=0);
5785         emit_testimm(fs,0x800000);
5786         if(source[i]&0x10000) // BC1T
5787         {
5788           nottaken=(int)out;
5789           emit_jeq(1);
5790         }
5791         else // BC1F
5792         {
5793           nottaken=(int)out;
5794           emit_jne(1);
5795         }
5796       }
5797     } // if(1)
5798     int adj;
5799     uint64_t ds_unneeded=branch_regs[i].u;
5800     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5801     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5802     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5803     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5804     ds_unneeded|=1;
5805     ds_unneeded_upper|=1;
5806     // branch taken
5807     //assem_debug("1:\n");
5808     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5809                   ds_unneeded,ds_unneeded_upper);
5810     // load regs
5811     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5812     address_generation(i+1,&branch_regs[i],0);
5813     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5814     ds_assemble(i+1,&branch_regs[i]);
5815     cc=get_reg(branch_regs[i].regmap,CCREG);
5816     if(cc==-1) {
5817       emit_loadreg(CCREG,cc=HOST_CCREG);
5818       // CHECK: Is the following instruction (fall thru) allocated ok?
5819     }
5820     assert(cc==HOST_CCREG);
5821     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5822     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5823     assem_debug("cycle count (adj)\n");
5824     if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5825     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5826     if(internal)
5827       assem_debug("branch: internal\n");
5828     else
5829       assem_debug("branch: external\n");
5830     if(internal&&is_ds[(ba[i]-start)>>2]) {
5831       ds_assemble_entry(i);
5832     }
5833     else {
5834       add_to_linker((int)out,ba[i],internal);
5835       emit_jmp(0);
5836     }
5837
5838     // branch not taken
5839     if(1) { // <- FIXME (don't need this)
5840       set_jump_target(nottaken,(int)out);
5841       assem_debug("1:\n");
5842       if(!likely[i]) {
5843         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5844                       ds_unneeded,ds_unneeded_upper);
5845         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5846         address_generation(i+1,&branch_regs[i],0);
5847         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5848         ds_assemble(i+1,&branch_regs[i]);
5849       }
5850       cc=get_reg(branch_regs[i].regmap,CCREG);
5851       if(cc==-1&&!likely[i]) {
5852         // Cycle count isn't in a register, temporarily load it then write it out
5853         emit_loadreg(CCREG,HOST_CCREG);
5854         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5855         int jaddr=(int)out;
5856         emit_jns(0);
5857         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5858         emit_storereg(CCREG,HOST_CCREG);
5859       }
5860       else{
5861         cc=get_reg(i_regmap,CCREG);
5862         assert(cc==HOST_CCREG);
5863         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5864         int jaddr=(int)out;
5865         emit_jns(0);
5866         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5867       }
5868     }
5869   }
5870 }
5871
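// Assemble a branch whose delay slot falls on the next page.  Conditional forms
// select the target with conditional moves instead of branches, the result is left
// in HOST_BTREG, and control exits through an external jump so the cross-page delay
// slot can be entered separately (see pagespan_ds below).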
5872 static void pagespan_assemble(int i,struct regstat *i_regs)
5873 {
5874   int s1l=get_reg(i_regs->regmap,rs1[i]);
5875   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
5876   int s2l=get_reg(i_regs->regmap,rs2[i]);
5877   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
5878   int taken=0;
5879   int nottaken=0;
5880   int unconditional=0;
5881   if(rs1[i]==0)
5882   {
5883     s1l=s2l;s1h=s2h;
5884     s2l=s2h=-1;
5885   }
5886   else if(rs2[i]==0)
5887   {
5888     s2l=s2h=-1;
5889   }
5890   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
5891     s1h=s2h=-1;
5892   }
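  // Scavenge up to three free host registers (addr, alt, ntaddr) to hold the
  // possible branch targets; they must not alias the comparison operands.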
5893   int hr=0;
5894   int addr=-1,alt=-1,ntaddr=-1;
5895   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
5896   else {
5897     while(hr<HOST_REGS)
5898     {
5899       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5900          (i_regs->regmap[hr]&63)!=rs1[i] &&
5901          (i_regs->regmap[hr]&63)!=rs2[i] )
5902       {
5903         addr=hr++;break;
5904       }
5905       hr++;
5906     }
5907   }
5908   while(hr<HOST_REGS)
5909   {
5910     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5911        (i_regs->regmap[hr]&63)!=rs1[i] &&
5912        (i_regs->regmap[hr]&63)!=rs2[i] )
5913     {
5914       alt=hr++;break;
5915     }
5916     hr++;
5917   }
5918   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5919   {
5920     while(hr<HOST_REGS)
5921     {
5922       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5923          (i_regs->regmap[hr]&63)!=rs1[i] &&
5924          (i_regs->regmap[hr]&63)!=rs2[i] )
5925       {
5926         ntaddr=hr;break;
5927       }
5928       hr++;
5929     }
5930   }
5931   assert(hr<HOST_REGS);
5932   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
5933     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
5934   }
5935   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5936   if(opcode[i]==2) // J
5937   {
5938     unconditional=1;
5939   }
5940   if(opcode[i]==3) // JAL
5941   {
5942     // TODO: mini_ht
5943     int rt=get_reg(i_regs->regmap,31);
5944     emit_movimm(start+i*4+8,rt);
5945     unconditional=1;
5946   }
5947   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
5948   {
5949     emit_mov(s1l,addr);
5950     if(opcode2[i]==9) // JALR
5951     {
5952       int rt=get_reg(i_regs->regmap,rt1[i]);
5953       emit_movimm(start+i*4+8,rt);
5954     }
5955   }
5956   if((opcode[i]&0x3f)==4) // BEQ
5957   {
5958     if(rs1[i]==rs2[i])
5959     {
5960       unconditional=1;
5961     }
5962     else
5963     #ifdef HAVE_CMOV_IMM
5964     if(s1h<0) {
5965       if(s2l>=0) emit_cmp(s1l,s2l);
5966       else emit_test(s1l,s1l);
5967       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
5968     }
5969     else
5970     #endif
5971     {
5972       assert(s1l>=0);
5973       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5974       if(s1h>=0) {
5975         if(s2h>=0) emit_cmp(s1h,s2h);
5976         else emit_test(s1h,s1h);
5977         emit_cmovne_reg(alt,addr);
5978       }
5979       if(s2l>=0) emit_cmp(s1l,s2l);
5980       else emit_test(s1l,s1l);
5981       emit_cmovne_reg(alt,addr);
5982     }
5983   }
5984   if((opcode[i]&0x3f)==5) // BNE
5985   {
5986     #ifdef HAVE_CMOV_IMM
5987     if(s1h<0) {
5988       if(s2l>=0) emit_cmp(s1l,s2l);
5989       else emit_test(s1l,s1l);
5990       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
5991     }
5992     else
5993     #endif
5994     {
5995       assert(s1l>=0);
5996       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5997       if(s1h>=0) {
5998         if(s2h>=0) emit_cmp(s1h,s2h);
5999         else emit_test(s1h,s1h);
6000         emit_cmovne_reg(alt,addr);
6001       }
6002       if(s2l>=0) emit_cmp(s1l,s2l);
6003       else emit_test(s1l,s1l);
6004       emit_cmovne_reg(alt,addr);
6005     }
6006   }
6007   if((opcode[i]&0x3f)==0x14) // BEQL
6008   {
6009     if(s1h>=0) {
6010       if(s2h>=0) emit_cmp(s1h,s2h);
6011       else emit_test(s1h,s1h);
6012       nottaken=(int)out;
6013       emit_jne(0);
6014     }
6015     if(s2l>=0) emit_cmp(s1l,s2l);
6016     else emit_test(s1l,s1l);
6017     if(nottaken) set_jump_target(nottaken,(int)out);
6018     nottaken=(int)out;
6019     emit_jne(0);
6020   }
6021   if((opcode[i]&0x3f)==0x15) // BNEL
6022   {
6023     if(s1h>=0) {
6024       if(s2h>=0) emit_cmp(s1h,s2h);
6025       else emit_test(s1h,s1h);
6026       taken=(int)out;
6027       emit_jne(0);
6028     }
6029     if(s2l>=0) emit_cmp(s1l,s2l);
6030     else emit_test(s1l,s1l);
6031     nottaken=(int)out;
6032     emit_jeq(0);
6033     if(taken) set_jump_target(taken,(int)out);
6034   }
6035   if((opcode[i]&0x3f)==6) // BLEZ
6036   {
6037     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6038     emit_cmpimm(s1l,1);
6039     if(s1h>=0) emit_mov(addr,ntaddr);
6040     emit_cmovl_reg(alt,addr);
6041     if(s1h>=0) {
6042       emit_test(s1h,s1h);
6043       emit_cmovne_reg(ntaddr,addr);
6044       emit_cmovs_reg(alt,addr);
6045     }
6046   }
6047   if((opcode[i]&0x3f)==7) // BGTZ
6048   {
6049     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6050     emit_cmpimm(s1l,1);
6051     if(s1h>=0) emit_mov(addr,alt);
6052     emit_cmovl_reg(ntaddr,addr);
6053     if(s1h>=0) {
6054       emit_test(s1h,s1h);
6055       emit_cmovne_reg(alt,addr);
6056       emit_cmovs_reg(ntaddr,addr);
6057     }
6058   }
6059   if((opcode[i]&0x3f)==0x16) // BLEZL
6060   {
6061     assert((opcode[i]&0x3f)!=0x16);
6062   }
6063   if((opcode[i]&0x3f)==0x17) // BGTZL
6064   {
6065     assert((opcode[i]&0x3f)!=0x17);
6066   }
6067   assert(opcode[i]!=1); // BLTZ/BGEZ
6068
6069   //FIXME: Check CSREG
6070   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6071     if((source[i]&0x30000)==0) // BC1F
6072     {
6073       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6074       emit_testimm(s1l,0x800000);
6075       emit_cmovne_reg(alt,addr);
6076     }
6077     if((source[i]&0x30000)==0x10000) // BC1T
6078     {
6079       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6080       emit_testimm(s1l,0x800000);
6081       emit_cmovne_reg(alt,addr);
6082     }
6083     if((source[i]&0x30000)==0x20000) // BC1FL
6084     {
6085       emit_testimm(s1l,0x800000);
6086       nottaken=(int)out;
6087       emit_jne(0);
6088     }
6089     if((source[i]&0x30000)==0x30000) // BC1TL
6090     {
6091       emit_testimm(s1l,0x800000);
6092       nottaken=(int)out;
6093       emit_jeq(0);
6094     }
6095   }
6096
6097   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6098   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6099   if(likely[i]||unconditional)
6100   {
6101     emit_movimm(ba[i],HOST_BTREG);
6102   }
6103   else if(addr!=HOST_BTREG)
6104   {
6105     emit_mov(addr,HOST_BTREG);
6106   }
6107   void *branch_addr=out;
6108   emit_jmp(0);
6109   int target_addr=start+i*4+5;
6110   void *stub=out;
6111   void *compiled_target_addr=check_addr(target_addr);
6112   emit_extjump_ds((int)branch_addr,target_addr);
6113   if(compiled_target_addr) {
6114     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6115     add_link(target_addr,stub);
6116   }
6117   else set_jump_target((int)branch_addr,(int)stub);
6118   if(likely[i]) {
6119     // Not-taken path
6120     set_jump_target((int)nottaken,(int)out);
6121     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6122     void *branch_addr=out;
6123     emit_jmp(0);
6124     int target_addr=start+i*4+8;
6125     void *stub=out;
6126     void *compiled_target_addr=check_addr(target_addr);
6127     emit_extjump_ds((int)branch_addr,target_addr);
6128     if(compiled_target_addr) {
6129       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6130       add_link(target_addr,stub);
6131     }
6132     else set_jump_target((int)branch_addr,(int)stub);
6133   }
6134 }
6135
6136 // Assemble the delay slot for the above
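// The entry is registered under the odd address start+1, matching the start+i*4+5
// target that pagespan_assemble emits.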
6137 static void pagespan_ds()
6138 {
6139   assem_debug("initial delay slot:\n");
6140   u_int vaddr=start+1;
6141   u_int page=get_page(vaddr);
6142   u_int vpage=get_vpage(vaddr);
6143   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6144   do_dirty_stub_ds();
6145   ll_add(jump_in+page,vaddr,(void *)out);
6146   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6147   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6148     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6149   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6150     emit_writeword(HOST_BTREG,(int)&branch_target);
6151   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6152   address_generation(0,&regs[0],regs[0].regmap_entry);
6153   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6154     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6155   cop1_usable=0;
6156   is_delayslot=0;
6157   switch(itype[0]) {
6158     case ALU:
6159       alu_assemble(0,&regs[0]);break;
6160     case IMM16:
6161       imm16_assemble(0,&regs[0]);break;
6162     case SHIFT:
6163       shift_assemble(0,&regs[0]);break;
6164     case SHIFTIMM:
6165       shiftimm_assemble(0,&regs[0]);break;
6166     case LOAD:
6167       load_assemble(0,&regs[0]);break;
6168     case LOADLR:
6169       loadlr_assemble(0,&regs[0]);break;
6170     case STORE:
6171       store_assemble(0,&regs[0]);break;
6172     case STORELR:
6173       storelr_assemble(0,&regs[0]);break;
6174     case COP0:
6175       cop0_assemble(0,&regs[0]);break;
6176     case COP1:
6177       cop1_assemble(0,&regs[0]);break;
6178     case C1LS:
6179       c1ls_assemble(0,&regs[0]);break;
6180     case COP2:
6181       cop2_assemble(0,&regs[0]);break;
6182     case C2LS:
6183       c2ls_assemble(0,&regs[0]);break;
6184     case C2OP:
6185       c2op_assemble(0,&regs[0]);break;
6186     case FCONV:
6187       fconv_assemble(0,&regs[0]);break;
6188     case FLOAT:
6189       float_assemble(0,&regs[0]);break;
6190     case FCOMP:
6191       fcomp_assemble(0,&regs[0]);break;
6192     case MULTDIV:
6193       multdiv_assemble(0,&regs[0]);break;
6194     case MOV:
6195       mov_assemble(0,&regs[0]);break;
6196     case SYSCALL:
6197     case HLECALL:
6198     case INTCALL:
6199     case SPAN:
6200     case UJUMP:
6201     case RJUMP:
6202     case CJUMP:
6203     case SJUMP:
6204     case FJUMP:
6205       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
6206   }
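  // Fetch the branch target saved above (or read it back from branch_target) and,
  // unless it is just the next instruction (start+4), jump to it indirectly.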
6207   int btaddr=get_reg(regs[0].regmap,BTREG);
6208   if(btaddr<0) {
6209     btaddr=get_reg(regs[0].regmap,-1);
6210     emit_readword((int)&branch_target,btaddr);
6211   }
6212   assert(btaddr!=HOST_CCREG);
6213   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6214 #ifdef HOST_IMM8
6215   emit_movimm(start+4,HOST_TEMPREG);
6216   emit_cmp(btaddr,HOST_TEMPREG);
6217 #else
6218   emit_cmpimm(btaddr,start+4);
6219 #endif
6220   int branch=(int)out;
6221   emit_jeq(0);
6222   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6223   emit_jmp(jump_vaddr_reg[btaddr]);
6224   set_jump_target(branch,(int)out);
6225   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6226   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6227 }
6228
6229 // Basic liveness analysis for MIPS registers
6230 void unneeded_registers(int istart,int iend,int r)
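// u/uu are 64-bit masks over the MIPS registers (lower/upper halves): bit r set
// means the value of register r is dead at this point; bit 0 ($zero) stays set.
// gte_u tracks the same information for the GTE (cop2) registers.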
6231 {
6232   int i;
6233   uint64_t u,uu,gte_u,b,bu,gte_bu;
6234   uint64_t temp_u,temp_uu,temp_gte_u=0;
6235   uint64_t tdep;
6236   uint64_t gte_u_unknown=0;
6237   if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
6238     gte_u_unknown=~0ll;
6239   if(iend==slen-1) {
6240     u=1;uu=1;
6241     gte_u=gte_u_unknown;
6242   }else{
6243     u=unneeded_reg[iend+1];
6244     uu=unneeded_reg_upper[iend+1];
6245     u=1;uu=1;
6246     gte_u=gte_unneeded[iend+1];
6247   }
6248
6249   for (i=iend;i>=istart;i--)
6250   {
6251     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6252     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6253     {
6254       // If subroutine call, flag return address as a possible branch target
6255       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6256
6257       if(ba[i]<start || ba[i]>=(start+slen*4))
6258       {
6259         // Branch out of this block, flush all regs
6260         u=1;
6261         uu=1;
6262         gte_u=gte_u_unknown;
6263         /* Hexagon hack
6264         if(itype[i]==UJUMP&&rt1[i]==31)
6265         {
6266           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6267         }
6268         if(itype[i]==RJUMP&&rs1[i]==31)
6269         {
6270           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6271         }
6272         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6273           if(itype[i]==UJUMP&&rt1[i]==31)
6274           {
6275             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6276             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6277           }
6278           if(itype[i]==RJUMP&&rs1[i]==31)
6279           {
6280             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6281             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6282           }
6283         }*/
6284         branch_unneeded_reg[i]=u;
6285         branch_unneeded_reg_upper[i]=uu;
6286         // Merge in delay slot
6287         tdep=(~uu>>rt1[i+1])&1;
6288         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6289         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6290         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6291         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6292         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6293         u|=1;uu|=1;
6294         gte_u|=gte_rt[i+1];
6295         gte_u&=~gte_rs[i+1];
6296         // If branch is "likely" (and conditional)
6297         // then we skip the delay slot on the fall-thru path
6298         if(likely[i]) {
6299           if(i<slen-1) {
6300             u&=unneeded_reg[i+2];
6301             uu&=unneeded_reg_upper[i+2];
6302             gte_u&=gte_unneeded[i+2];
6303           }
6304           else
6305           {
6306             u=1;
6307             uu=1;
6308             gte_u=gte_u_unknown;
6309           }
6310         }
6311       }
6312       else
6313       {
6314         // Internal branch, flag target
6315         bt[(ba[i]-start)>>2]=1;
6316         if(ba[i]<=start+i*4) {
6317           // Backward branch
6318           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6319           {
6320             // Unconditional branch
6321             temp_u=1;temp_uu=1;
6322             temp_gte_u=0;
6323           } else {
6324             // Conditional branch (not taken case)
6325             temp_u=unneeded_reg[i+2];
6326             temp_uu=unneeded_reg_upper[i+2];
6327             temp_gte_u&=gte_unneeded[i+2];
6328           }
6329           // Merge in delay slot
6330           tdep=(~temp_uu>>rt1[i+1])&1;
6331           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6332           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6333           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6334           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6335           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6336           temp_u|=1;temp_uu|=1;
6337           temp_gte_u|=gte_rt[i+1];
6338           temp_gte_u&=~gte_rs[i+1];
6339           // If branch is "likely" (and conditional)
6340           // then we skip the delay slot on the fall-thru path
6341           if(likely[i]) {
6342             if(i<slen-1) {
6343               temp_u&=unneeded_reg[i+2];
6344               temp_uu&=unneeded_reg_upper[i+2];
6345               temp_gte_u&=gte_unneeded[i+2];
6346             }
6347             else
6348             {
6349               temp_u=1;
6350               temp_uu=1;
6351               temp_gte_u=gte_u_unknown;
6352             }
6353           }
6354           tdep=(~temp_uu>>rt1[i])&1;
6355           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6356           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6357           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6358           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6359           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6360           temp_u|=1;temp_uu|=1;
6361           temp_gte_u|=gte_rt[i];
6362           temp_gte_u&=~gte_rs[i];
6363           unneeded_reg[i]=temp_u;
6364           unneeded_reg_upper[i]=temp_uu;
6365           gte_unneeded[i]=temp_gte_u;
6366           // Only go three levels deep.  This recursion can take an
6367           // excessive amount of time if there are a lot of nested loops.
6368           if(r<2) {
6369             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6370           }else{
6371             unneeded_reg[(ba[i]-start)>>2]=1;
6372             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6373             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
6374           }
6375         } /*else*/ if(1) {
6376           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6377           {
6378             // Unconditional branch
6379             u=unneeded_reg[(ba[i]-start)>>2];
6380             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6381             gte_u=gte_unneeded[(ba[i]-start)>>2];
6382             branch_unneeded_reg[i]=u;
6383             branch_unneeded_reg_upper[i]=uu;
6384         //u=1;
6385         //uu=1;
6386         //branch_unneeded_reg[i]=u;
6387         //branch_unneeded_reg_upper[i]=uu;
6388             // Merge in delay slot
6389             tdep=(~uu>>rt1[i+1])&1;
6390             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6391             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6392             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6393             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6394             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6395             u|=1;uu|=1;
6396             gte_u|=gte_rt[i+1];
6397             gte_u&=~gte_rs[i+1];
6398           } else {
6399             // Conditional branch
6400             b=unneeded_reg[(ba[i]-start)>>2];
6401             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6402             gte_bu=gte_unneeded[(ba[i]-start)>>2];
6403             branch_unneeded_reg[i]=b;
6404             branch_unneeded_reg_upper[i]=bu;
6405         //b=1;
6406         //bu=1;
6407         //branch_unneeded_reg[i]=b;
6408         //branch_unneeded_reg_upper[i]=bu;
6409             // Branch delay slot
6410             tdep=(~uu>>rt1[i+1])&1;
6411             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6412             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6413             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6414             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6415             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6416             b|=1;bu|=1;
6417             gte_bu|=gte_rt[i+1];
6418             gte_bu&=~gte_rs[i+1];
6419             // If branch is "likely" then we skip the
6420             // delay slot on the fall-thru path
6421             if(likely[i]) {
6422               u=b;
6423               uu=bu;
6424               gte_u=gte_bu;
6425               if(i<slen-1) {
6426                 u&=unneeded_reg[i+2];
6427                 uu&=unneeded_reg_upper[i+2];
6428                 gte_u&=gte_unneeded[i+2];
6429         //u=1;
6430         //uu=1;
6431               }
6432             } else {
6433               u&=b;
6434               uu&=bu;
6435               gte_u&=gte_bu;
6436         //u=1;
6437         //uu=1;
6438             }
6439             if(i<slen-1) {
6440               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6441               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6442         //branch_unneeded_reg[i]=1;
6443         //branch_unneeded_reg_upper[i]=1;
6444             } else {
6445               branch_unneeded_reg[i]=1;
6446               branch_unneeded_reg_upper[i]=1;
6447             }
6448           }
6449         }
6450       }
6451     }
6452     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6453     {
6454       // SYSCALL instruction (software interrupt)
6455       u=1;
6456       uu=1;
6457     }
6458     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6459     {
6460       // ERET instruction (return from interrupt)
6461       u=1;
6462       uu=1;
6463     }
6464     //u=uu=1; // DEBUG
6465     tdep=(~uu>>rt1[i])&1;
6466     // Written registers are unneeded
6467     u|=1LL<<rt1[i];
6468     u|=1LL<<rt2[i];
6469     uu|=1LL<<rt1[i];
6470     uu|=1LL<<rt2[i];
6471     gte_u|=gte_rt[i];
6472     // Accessed registers are needed
6473     u&=~(1LL<<rs1[i]);
6474     u&=~(1LL<<rs2[i]);
6475     uu&=~(1LL<<us1[i]);
6476     uu&=~(1LL<<us2[i]);
6477     gte_u&=~gte_rs[i];
6478     if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
6479       gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
6480     // Source-target dependencies
6481     uu&=~(tdep<<dep1[i]);
6482     uu&=~(tdep<<dep2[i]);
6483     // R0 is always unneeded
6484     u|=1;uu|=1;
6485     // Save it
6486     unneeded_reg[i]=u;
6487     unneeded_reg_upper[i]=uu;
6488     gte_unneeded[i]=gte_u;
6489     /*
6490     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6491     printf("U:");
6492     int r;
6493     for(r=1;r<=CCREG;r++) {
6494       if((unneeded_reg[i]>>r)&1) {
6495         if(r==HIREG) printf(" HI");
6496         else if(r==LOREG) printf(" LO");
6497         else printf(" r%d",r);
6498       }
6499     }
6500     printf(" UU:");
6501     for(r=1;r<=CCREG;r++) {
6502       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6503         if(r==HIREG) printf(" HI");
6504         else if(r==LOREG) printf(" LO");
6505         else printf(" r%d",r);
6506       }
6507     }
6508     printf("\n");*/
6509   }
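  // The R3000 has no 64-bit registers, so the upper halves are never needed.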
6510   for (i=iend;i>=istart;i--)
6511   {
6512     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
6513   }
6514 }
6515
6516 // Write back dirty registers as soon as we will no longer modify them,
6517 // so that we don't end up with lots of writes at the branches.
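// will_dirty/wont_dirty are per-host-register bitmasks, propagated backwards from
// the end of the block (or from branch targets) and merged with delay-slot writes.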
6518 void clean_registers(int istart,int iend,int wr)
6519 {
6520   int i;
6521   int r;
6522   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6523   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6524   if(iend==slen-1) {
6525     will_dirty_i=will_dirty_next=0;
6526     wont_dirty_i=wont_dirty_next=0;
6527   }else{
6528     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6529     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6530   }
6531   for (i=iend;i>=istart;i--)
6532   {
6533     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6534     {
6535       if(ba[i]<start || ba[i]>=(start+slen*4))
6536       {
6537         // Branch out of this block, flush all regs
6538         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6539         {
6540           // Unconditional branch
6541           will_dirty_i=0;
6542           wont_dirty_i=0;
6543           // Merge in delay slot (will dirty)
6544           for(r=0;r<HOST_REGS;r++) {
6545             if(r!=EXCLUDE_REG) {
6546               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6547               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6548               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6549               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6550               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6551               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6552               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6553               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6554               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6555               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6556               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6557               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6558               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6559               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6560             }
6561           }
6562         }
6563         else
6564         {
6565           // Conditional branch
6566           will_dirty_i=0;
6567           wont_dirty_i=wont_dirty_next;
6568           // Merge in delay slot (will dirty)
6569           for(r=0;r<HOST_REGS;r++) {
6570             if(r!=EXCLUDE_REG) {
6571               if(!likely[i]) {
6572                 // Might not dirty if likely branch is not taken
6573                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6574                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6575                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6576                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6577                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6578                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6579                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6580                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6581                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6582                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6583                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6584                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6585                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6586                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6587               }
6588             }
6589           }
6590         }
6591         // Merge in delay slot (wont dirty)
6592         for(r=0;r<HOST_REGS;r++) {
6593           if(r!=EXCLUDE_REG) {
6594             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6595             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6596             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6597             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6598             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6599             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6600             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6601             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6602             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6603             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6604           }
6605         }
6606         if(wr) {
6607           #ifndef DESTRUCTIVE_WRITEBACK
6608           branch_regs[i].dirty&=wont_dirty_i;
6609           #endif
6610           branch_regs[i].dirty|=will_dirty_i;
6611         }
6612       }
6613       else
6614       {
6615         // Internal branch
6616         if(ba[i]<=start+i*4) {
6617           // Backward branch
6618           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6619           {
6620             // Unconditional branch
6621             temp_will_dirty=0;
6622             temp_wont_dirty=0;
6623             // Merge in delay slot (will dirty)
6624             for(r=0;r<HOST_REGS;r++) {
6625               if(r!=EXCLUDE_REG) {
6626                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6627                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6628                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6629                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6630                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6631                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6632                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6633                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6634                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6635                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6636                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6637                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6638                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6639                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6640               }
6641             }
6642           } else {
6643             // Conditional branch (not taken case)
6644             temp_will_dirty=will_dirty_next;
6645             temp_wont_dirty=wont_dirty_next;
6646             // Merge in delay slot (will dirty)
6647             for(r=0;r<HOST_REGS;r++) {
6648               if(r!=EXCLUDE_REG) {
6649                 if(!likely[i]) {
6650                   // Will not dirty if likely branch is not taken
6651                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6652                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6653                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6654                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6655                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6656                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
6657                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6658                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6659                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6660                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6661                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6662                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6663                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6664                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6665                 }
6666               }
6667             }
6668           }
6669           // Merge in delay slot (wont dirty)
6670           for(r=0;r<HOST_REGS;r++) {
6671             if(r!=EXCLUDE_REG) {
6672               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6673               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6674               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6675               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6676               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6677               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6678               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6679               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6680               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6681               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6682             }
6683           }
6684           // Deal with changed mappings
6685           if(i<iend) {
6686             for(r=0;r<HOST_REGS;r++) {
6687               if(r!=EXCLUDE_REG) {
6688                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
6689                   temp_will_dirty&=~(1<<r);
6690                   temp_wont_dirty&=~(1<<r);
6691                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6692                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6693                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6694                   } else {
6695                     temp_will_dirty|=1<<r;
6696                     temp_wont_dirty|=1<<r;
6697                   }
6698                 }
6699               }
6700             }
6701           }
6702           if(wr) {
6703             will_dirty[i]=temp_will_dirty;
6704             wont_dirty[i]=temp_wont_dirty;
6705             clean_registers((ba[i]-start)>>2,i-1,0);
6706           }else{
6707             // Limit recursion.  It can take an excessive amount
6708             // of time if there are a lot of nested loops.
6709             will_dirty[(ba[i]-start)>>2]=0;
6710             wont_dirty[(ba[i]-start)>>2]=-1;
6711           }
6712         }
6713         /*else*/ if(1)
6714         {
6715           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6716           {
6717             // Unconditional branch
6718             will_dirty_i=0;
6719             wont_dirty_i=0;
6720           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6721             for(r=0;r<HOST_REGS;r++) {
6722               if(r!=EXCLUDE_REG) {
6723                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6724                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
6725                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6726                 }
6727                 if(branch_regs[i].regmap[r]>=0) {
6728                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6729                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6730                 }
6731               }
6732             }
6733           //}
6734             // Merge in delay slot
6735             for(r=0;r<HOST_REGS;r++) {
6736               if(r!=EXCLUDE_REG) {
6737                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6738                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6739                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6740                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6741                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6742                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6743                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6744                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6745                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6746                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6747                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6748                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6749                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6750                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6751               }
6752             }
6753           } else {
6754             // Conditional branch
6755             will_dirty_i=will_dirty_next;
6756             wont_dirty_i=wont_dirty_next;
6757           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6758             for(r=0;r<HOST_REGS;r++) {
6759               if(r!=EXCLUDE_REG) {
6760                 signed char target_reg=branch_regs[i].regmap[r];
6761                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6762                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6763                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6764                 }
6765                 else if(target_reg>=0) {
6766                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6767                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6768                 }
6769                 // Treat delay slot as part of branch too
6770                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6771                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6772                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6773                 }
6774                 else
6775                 {
6776                   will_dirty[i+1]&=~(1<<r);
6777                 }*/
6778               }
6779             }
6780           //}
6781             // Merge in delay slot
6782             for(r=0;r<HOST_REGS;r++) {
6783               if(r!=EXCLUDE_REG) {
6784                 if(!likely[i]) {
6785                   // Might not dirty if likely branch is not taken
6786                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6787                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6788                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6789                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6790                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6791                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6792                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6793                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6794                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6795                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6796                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6797                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6798                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6799                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6800                 }
6801               }
6802             }
6803           }
6804           // Merge in delay slot (won't dirty)
6805           for(r=0;r<HOST_REGS;r++) {
6806             if(r!=EXCLUDE_REG) {
6807               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6808               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6809               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6810               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6811               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6812               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6813               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6814               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6815               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6816               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6817             }
6818           }
6819           if(wr) {
6820             #ifndef DESTRUCTIVE_WRITEBACK
6821             branch_regs[i].dirty&=wont_dirty_i;
6822             #endif
6823             branch_regs[i].dirty|=will_dirty_i;
6824           }
6825         }
6826       }
6827     }
6828     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6829     {
6830       // SYSCALL instruction (software interrupt)
6831       will_dirty_i=0;
6832       wont_dirty_i=0;
6833     }
6834     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6835     {
6836       // ERET instruction (return from interrupt)
6837       will_dirty_i=0;
6838       wont_dirty_i=0;
6839     }
6840     will_dirty_next=will_dirty_i;
6841     wont_dirty_next=wont_dirty_i;
6842     for(r=0;r<HOST_REGS;r++) {
6843       if(r!=EXCLUDE_REG) {
6844         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6845         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6846         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6847         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6848         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6849         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6850         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6851         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6852         if(i>istart) {
6853           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP)
6854           {
6855             // Don't store a register immediately after writing it,
6856             // as doing so may prevent dual-issue.
6857             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
6858             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
6859           }
6860         }
6861       }
6862     }
6863     // Save it
6864     will_dirty[i]=will_dirty_i;
6865     wont_dirty[i]=wont_dirty_i;
6866     // Mark registers that won't be dirtied as not dirty
6867     if(wr) {
6868       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
6869       for(r=0;r<HOST_REGS;r++) {
6870         if((will_dirty_i>>r)&1) {
6871           printf(" r%d",r);
6872         }
6873       }
6874       printf("\n");*/
6875
6876       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
6877         regs[i].dirty|=will_dirty_i;
6878         #ifndef DESTRUCTIVE_WRITEBACK
6879         regs[i].dirty&=wont_dirty_i;
6880         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6881         {
6882           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
6883             for(r=0;r<HOST_REGS;r++) {
6884               if(r!=EXCLUDE_REG) {
6885                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
6886                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
6887                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6888               }
6889             }
6890           }
6891         }
6892         else
6893         {
6894           if(i<iend) {
6895             for(r=0;r<HOST_REGS;r++) {
6896               if(r!=EXCLUDE_REG) {
6897                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
6898                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
6899                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6900               }
6901             }
6902           }
6903         }
6904         #endif
6905       //}
6906     }
6907     // Deal with changed mappings
6908     temp_will_dirty=will_dirty_i;
6909     temp_wont_dirty=wont_dirty_i;
6910     for(r=0;r<HOST_REGS;r++) {
6911       if(r!=EXCLUDE_REG) {
6912         int nr;
6913         if(regs[i].regmap[r]==regmap_pre[i][r]) {
6914           if(wr) {
6915             #ifndef DESTRUCTIVE_WRITEBACK
6916             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6917             #endif
6918             regs[i].wasdirty|=will_dirty_i&(1<<r);
6919           }
6920         }
6921         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
6922           // Register moved to a different register
6923           will_dirty_i&=~(1<<r);
6924           wont_dirty_i&=~(1<<r);
6925           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
6926           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
6927           if(wr) {
6928             #ifndef DESTRUCTIVE_WRITEBACK
6929             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6930             #endif
6931             regs[i].wasdirty|=will_dirty_i&(1<<r);
6932           }
6933         }
6934         else {
6935           will_dirty_i&=~(1<<r);
6936           wont_dirty_i&=~(1<<r);
6937           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6938             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6939             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6940           } else {
6941             wont_dirty_i|=1<<r;
6942             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
6943           }
6944         }
6945       }
6946     }
6947   }
6948 }
6949
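     /* Debug-only text disassembler: prints one decoded instruction per call.
        Compiled out (empty stub below) unless DISASM is defined. */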
6950 #ifdef DISASM
6951   /* disassembly */
6952 void disassemble_inst(int i)
6953 {
6954     if (bt[i]) printf("*"); else printf(" ");
6955     switch(itype[i]) {
6956       case UJUMP:
6957         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6958       case CJUMP:
6959         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
6960       case SJUMP:
6961         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
6962       case FJUMP:
6963         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6964       case RJUMP:
6965         if (opcode[i]==0x9&&rt1[i]!=31)
6966           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
6967         else
6968           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
6969         break;
6970       case SPAN:
6971         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
6972       case IMM16:
6973         if(opcode[i]==0xf) //LUI
6974           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
6975         else
6976           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6977         break;
6978       case LOAD:
6979       case LOADLR:
6980         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6981         break;
6982       case STORE:
6983       case STORELR:
6984         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
6985         break;
6986       case ALU:
6987       case SHIFT:
6988         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
6989         break;
6990       case MULTDIV:
6991         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
6992         break;
6993       case SHIFTIMM:
6994         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6995         break;
6996       case MOV:
6997         if((opcode2[i]&0x1d)==0x10)
6998           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
6999         else if((opcode2[i]&0x1d)==0x11)
7000           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7001         else
7002           printf (" %x: %s\n",start+i*4,insn[i]);
7003         break;
7004       case COP0:
7005         if(opcode2[i]==0)
7006           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7007         else if(opcode2[i]==4)
7008           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7009         else printf (" %x: %s\n",start+i*4,insn[i]);
7010         break;
7011       case COP1:
7012         if(opcode2[i]<3)
7013           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7014         else if(opcode2[i]>3)
7015           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7016         else printf (" %x: %s\n",start+i*4,insn[i]);
7017         break;
7018       case COP2:
7019         if(opcode2[i]<3)
7020           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
7021         else if(opcode2[i]>3)
7022           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
7023         else printf (" %x: %s\n",start+i*4,insn[i]);
7024         break;
7025       case C1LS:
7026         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7027         break;
7028       case C2LS:
7029         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7030         break;
7031       case INTCALL:
7032         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
7033         break;
7034       default:
7035         //printf (" %s %8x\n",insn[i],source[i]);
7036         printf (" %x: %s\n",start+i*4,insn[i]);
7037     }
7038 }
7039 #else
7040 static void disassemble_inst(int i) {}
7041 #endif // DISASM
7042
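     // DRC_TEST_VAL spells "test" in ASCII; new_dynarec_test() emits a stub
     // that returns this value to verify code in the translation cache runs.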
7043 #define DRC_TEST_VAL 0x74657374
7044
7045 static int new_dynarec_test(void)
7046 {
7047   int (*testfunc)(void) = (void *)out;
7048   void *beginning;
7049   int ret;
7050
7051   beginning = start_block();
7052   emit_movimm(DRC_TEST_VAL,0); // test
7053   emit_jmpreg(14); // jump to the return address (r14/lr on ARM)
7054   literal_pool(0);
7055   end_block(beginning);
7056   SysPrintf("testing if we can run recompiled code..\n");
7057   ret = testfunc();
7058   if (ret == DRC_TEST_VAL)
7059     SysPrintf("test passed.\n");
7060   else
7061     SysPrintf("test failed: %08x\n", ret);
7062   out=(u_char *)BASE_ADDR;
7063   return ret == DRC_TEST_VAL;
7064 }
7065
7066 // clear the state completely, instead of just marking
7067 // things invalid like invalidate_all_pages() does
7068 void new_dynarec_clear_full(void)
7069 {
7070   int n;
7071   out=(u_char *)BASE_ADDR;
7072   memset(invalid_code,1,sizeof(invalid_code));
7073   memset(hash_table,0xff,sizeof(hash_table));
7074   memset(mini_ht,-1,sizeof(mini_ht));
7075   memset(restore_candidate,0,sizeof(restore_candidate));
7076   memset(shadow,0,sizeof(shadow));
7077   copy=shadow;
7078   expirep=16384; // Expiry pointer, +2 blocks
7079   pending_exception=0;
7080   literalcount=0;
7081   stop_after_jal=0;
7082   inv_code_start=inv_code_end=~0;
7083   // Clear the block lookup lists
7084   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7085   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7086   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7087 }
7088
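     // One-time dynarec setup: map (or mprotect) the translation cache with
     // execute permission, reset all state, then run the executability self-test.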
7089 void new_dynarec_init(void)
7090 {
7091   SysPrintf("Init new dynarec\n");
7092
7093 #ifdef _3DS
7094   check_rosalina();
7095 #endif
7096
7097   // allocate/prepare a buffer for translation cache
7098   // see assem_arm.h for some explanation
7099 #if   defined(BASE_ADDR_FIXED)
7100   if (mmap (translation_cache, 1 << TARGET_SIZE_2,
7101         PROT_READ | PROT_WRITE | PROT_EXEC,
7102         MAP_PRIVATE | MAP_ANONYMOUS,
7103         -1, 0) != translation_cache)
7104   {
7105     SysPrintf("mmap() failed: %s\n", strerror(errno));
7106     SysPrintf("disable BASE_ADDR_FIXED and recompile\n");
7107     abort();
7108   }
7109 #elif defined(BASE_ADDR_DYNAMIC)
7110 #ifdef VITA
7111   sceBlock = getVMBlock();//sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
7112   if (sceBlock < 0)
7113     SysPrintf("sceKernelAllocMemBlockForVM failed\n");
7114   int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&translation_cache);
7115   if (ret < 0)
7116     SysPrintf("sceKernelGetMemBlockBase failed\n");
7117
7118   sceKernelOpenVMDomain();
7119   sceClibPrintf("translation_cache = 0x%08X \n ", translation_cache);
7120 #elif defined(_MSC_VER)
7121   base_addr = VirtualAlloc(NULL, 1<<TARGET_SIZE_2, MEM_COMMIT | MEM_RESERVE,
7122       PAGE_EXECUTE_READWRITE);
7123 #else
7124   translation_cache = mmap (NULL, 1 << TARGET_SIZE_2,
7125       PROT_READ | PROT_WRITE | PROT_EXEC,
7126       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
7127   if (translation_cache == MAP_FAILED) {
7128     SysPrintf("mmap() failed: %s\n", strerror(errno));
7129     abort();
7130   }
7131 #endif
7132 #else
7133 #ifndef NO_WRITE_EXEC
7134   // not all systems allow execute in data segment by default
7135   if (mprotect((void *)BASE_ADDR, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
7136     SysPrintf("mprotect() failed: %s\n", strerror(errno));
7137 #endif
7138 #endif
7139
7140   out=(u_char *)BASE_ADDR;
7141   cycle_multiplier=200;
7142   new_dynarec_clear_full();
7143 #ifdef HOST_IMM8
7144   // Copy this into local area so we don't have to put it in every literal pool
7145   invc_ptr=invalid_code;
7146 #endif
7147   arch_init();
7148   new_dynarec_test();
7149 #ifndef RAM_FIXED
7150   ram_offset=(u_int)rdram-0x80000000;
7151 #endif
7152   if (ram_offset!=0)
7153     SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
7154 }
7155
7156 void new_dynarec_cleanup(void)
7157 {
7158   int n;
7159 #if defined(BASE_ADDR_FIXED) || defined(BASE_ADDR_DYNAMIC)
7160 #ifndef VITA
7161 #if defined(_MSC_VER)
7162   VirtualFree(base_addr, 0, MEM_RELEASE);
7163 #else
7164   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0)
7165     SysPrintf("munmap() failed\n");
7166 #endif
7167 #endif
7168 #endif
7169   for(n=0;n<4096;n++)
7170     ll_clear(jump_in+n);
7171   for(n=0;n<4096;n++)
7172     ll_clear(jump_out+n);
7173   for(n=0;n<4096;n++)
7174     ll_clear(jump_dirty+n);
7175 #ifdef ROM_COPY
7176   if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
7177 #endif
7178 }
7179
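     // Map a PSX code address to a host pointer into RAM or BIOS, reporting
     // the end of that region in *limit; returns NULL for unmapped addresses.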
7180 static u_int *get_source_start(u_int addr, u_int *limit)
7181 {
7182   if (addr < 0x00200000 ||
7183     (0xa0000000 <= addr && addr < 0xa0200000)) {
7184     // used for BIOS calls mostly?
7185     *limit = (addr&0xa0000000)|0x00200000;
7186     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7187   }
7188   else if (!Config.HLE && (
7189     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
7190     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
7191     // BIOS
7192     *limit = (addr & 0xfff00000) | 0x80000;
7193     return (u_int *)((u_int)psxR + (addr&0x7ffff));
7194   }
7195   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
7196     *limit = (addr & 0x80600000) + 0x00200000;
7197     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7198   }
7199   return NULL;
7200 }
7201
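     // Scan forward (at most 0x1000 bytes) for a "jr $ra" and return the
     // address just past its delay slot; used to estimate where a function ends.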
7202 static u_int scan_for_ret(u_int addr)
7203 {
7204   u_int limit = 0;
7205   u_int *mem;
7206
7207   mem = get_source_start(addr, &limit);
7208   if (mem == NULL)
7209     return addr;
7210
7211   if (limit > addr + 0x1000)
7212     limit = addr + 0x1000;
7213   for (; addr < limit; addr += 4, mem++) {
7214     if (*mem == 0x03e00008) // jr $ra
7215       return addr + 8;
7216   }
7217   return addr;
7218 }
7219
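     // Compiled-block records stored in savestates so that blocks can be
     // precompiled (with register speculation hints) when the state is loaded.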
7220 struct savestate_block {
7221   uint32_t addr;
7222   uint32_t regflags;
7223 };
7224
7225 static int addr_cmp(const void *p1_, const void *p2_)
7226 {
7227   const struct savestate_block *p1 = p1_, *p2 = p2_;
7228   return p1->addr - p2->addr;
7229 }
7230
7231 int new_dynarec_save_blocks(void *save, int size)
7232 {
7233   struct savestate_block *blocks = save;
7234   int maxcount = size / sizeof(blocks[0]);
7235   struct savestate_block tmp_blocks[1024];
7236   struct ll_entry *head;
7237   int p, s, d, o, bcnt;
7238   u_int addr;
7239
7240   o = 0;
7241   for (p = 0; p < sizeof(jump_in) / sizeof(jump_in[0]); p++) {
7242     bcnt = 0;
7243     for (head = jump_in[p]; head != NULL; head = head->next) {
7244       tmp_blocks[bcnt].addr = head->vaddr;
7245       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
7246       bcnt++;
7247     }
7248     if (bcnt < 1)
7249       continue;
7250     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
7251
7252     addr = tmp_blocks[0].addr;
7253     for (s = d = 0; s < bcnt; s++) {
7254       if (tmp_blocks[s].addr < addr)
7255         continue;
7256       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
7257         tmp_blocks[d++] = tmp_blocks[s];
7258       addr = scan_for_ret(tmp_blocks[s].addr);
7259     }
7260
7261     if (o + d > maxcount)
7262       d = maxcount - o;
7263     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
7264     o += d;
7265   }
7266
7267   return o * sizeof(blocks[0]);
7268 }
7269
7270 void new_dynarec_load_blocks(const void *save, int size)
7271 {
7272   const struct savestate_block *blocks = save;
7273   int count = size / sizeof(blocks[0]);
7274   u_int regs_save[32];
7275   uint32_t f;
7276   int i, b;
7277
7278   get_addr(psxRegs.pc);
7279
7280   // change GPRs so that speculation at least partially works
7281   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
7282   for (i = 1; i < 32; i++)
7283     psxRegs.GPR.r[i] = 0x80000000;
7284
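       // For each saved block, temporarily point the flagged GPRs at the
       // scratchpad (0x1f800000) and the rest at RAM (0x80000000) so that
       // recompilation speculates the same address ranges as originally.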
7285   for (b = 0; b < count; b++) {
7286     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7287       if (f & 1)
7288         psxRegs.GPR.r[i] = 0x1f800000;
7289     }
7290
7291     get_addr(blocks[b].addr);
7292
7293     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7294       if (f & 1)
7295         psxRegs.GPR.r[i] = 0x80000000;
7296     }
7297   }
7298
7299   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
7300 }
7301
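     // Compile one block of MIPS code starting at 'addr' into the translation
     // cache, running the passes listed below.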
7302 int new_recompile_block(int addr)
7303 {
7304   u_int pagelimit = 0;
7305   u_int state_rflags = 0;
7306   int i;
7307
7308   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7309   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7310   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7311   //if(debug)
7312   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7313   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7314   /*if(Count>=312978186) {
7315     rlist();
7316   }*/
7317   //rlist();
7318
7319   // this is just for speculation: flag GPRs that currently point into the 0x1f80xxxx scratchpad/IO area
7320   for (i = 1; i < 32; i++) {
7321     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
7322       state_rflags |= 1 << i;
7323   }
7324
7325   start = (u_int)addr&~3;
7326   //assert(((u_int)addr&1)==0);
7327   new_dynarec_did_compile=1;
7328   if (Config.HLE && start == 0x80001000) // hlecall
7329   {
7330     // XXX: is this enough? Maybe check hleSoftCall?
7331     void *beginning=start_block();
7332     u_int page=get_page(start);
7333
7334     invalid_code[start>>12]=0;
7335     emit_movimm(start,0);
7336     emit_writeword(0,(int)&pcaddr);
7337     emit_jmp((int)new_dyna_leave);
7338     literal_pool(0);
7339     end_block(beginning);
7340     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
7341     return 0;
7342   }
7343
7344   source = get_source_start(start, &pagelimit);
7345   if (source == NULL) {
7346     SysPrintf("Compile at bogus memory address: %08x\n", addr);
7347     exit(1);
7348   }
7349
7350   /* Pass 1: disassemble */
7351   /* Pass 2: register dependencies, branch targets */
7352   /* Pass 3: register allocation */
7353   /* Pass 4: branch dependencies */
7354   /* Pass 5: pre-alloc */
7355   /* Pass 6: optimize clean/dirty state */
7356   /* Pass 7: flag 32-bit registers */
7357   /* Pass 8: assembly */
7358   /* Pass 9: linker */
7359   /* Pass 10: garbage collection / free memory */
7360
7361   int j;
7362   int done=0;
7363   unsigned int type,op,op2;
7364
7365   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7366
7367   /* Pass 1 disassembly */
7368
7369   for(i=0;!done;i++) {
7370     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
7371     minimum_free_regs[i]=0;
7372     opcode[i]=op=source[i]>>26;
7373     switch(op)
7374     {
7375       case 0x00: strcpy(insn[i],"special"); type=NI;
7376         op2=source[i]&0x3f;
7377         switch(op2)
7378         {
7379           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7380           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7381           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7382           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7383           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7384           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7385           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7386           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7387           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7388           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7389           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7390           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7391           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7392           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7393           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7394           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7395           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7396           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7397           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7398           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7399           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7400           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7401           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7402           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7403           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7404           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7405           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7406           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7407           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7408           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7409           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7410           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7411           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7412           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7413           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7414 #if 0
7415           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7416           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7417           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7418           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7419           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7420           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7421           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7422           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7423           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7424           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7425           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7426           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7427           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7428           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7429           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7430           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7431           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7432 #endif
7433         }
7434         break;
7435       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7436         op2=(source[i]>>16)&0x1f;
7437         switch(op2)
7438         {
7439           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7440           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7441           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7442           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7443           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7444           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7445           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7446           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7447           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7448           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7449           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7450           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7451           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7452           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7453         }
7454         break;
7455       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7456       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7457       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7458       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7459       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7460       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7461       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7462       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7463       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7464       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7465       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7466       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7467       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7468       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7469       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7470         op2=(source[i]>>21)&0x1f;
7471         switch(op2)
7472         {
7473           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7474           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7475           case 0x10: strcpy(insn[i],"tlb"); type=NI;
7476           switch(source[i]&0x3f)
7477           {
7478             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
7479             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
7480             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
7481             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
7482             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
7483             //case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
7484           }
7485         }
7486         break;
7487       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7488         op2=(source[i]>>21)&0x1f;
7489         switch(op2)
7490         {
7491           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7492           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7493           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7494           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7495           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7496           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7497           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7498           switch((source[i]>>16)&0x3)
7499           {
7500             case 0x00: strcpy(insn[i],"BC1F"); break;
7501             case 0x01: strcpy(insn[i],"BC1T"); break;
7502             case 0x02: strcpy(insn[i],"BC1FL"); break;
7503             case 0x03: strcpy(insn[i],"BC1TL"); break;
7504           }
7505           break;
7506           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7507           switch(source[i]&0x3f)
7508           {
7509             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7510             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7511             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7512             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7513             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7514             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7515             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7516             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7517             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7518             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7519             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7520             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7521             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7522             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7523             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7524             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7525             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7526             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7527             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7528             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7529             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7530             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7531             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7532             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7533             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7534             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7535             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7536             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7537             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7538             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7539             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7540             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7541             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7542             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7543             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7544           }
7545           break;
7546           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7547           switch(source[i]&0x3f)
7548           {
7549             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7550             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7551             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7552             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7553             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7554             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7555             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7556             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7557             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7558             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7559             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7560             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7561             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7562             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7563             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7564             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7565             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7566             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7567             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7568             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7569             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7570             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7571             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7572             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7573             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7574             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7575             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7576             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7577             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7578             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7579             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7580             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7581             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7582             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7583             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7584           }
7585           break;
7586           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7587           switch(source[i]&0x3f)
7588           {
7589             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7590             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7591           }
7592           break;
7593           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7594           switch(source[i]&0x3f)
7595           {
7596             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7597             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7598           }
7599           break;
7600         }
7601         break;
7602 #if 0
7603       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7604       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7605       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7606       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7607       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7608       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7609       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7610       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7611 #endif
7612       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7613       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7614       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7615       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7616       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7617       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7618       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7619 #if 0
7620       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7621 #endif
7622       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7623       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7624       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7625       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7626 #if 0
7627       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7628       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7629 #endif
7630       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7631       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7632       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7633       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7634 #if 0
7635       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7636       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7637       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7638 #endif
7639       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7640       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7641 #if 0
7642       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7643       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7644       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7645 #endif
7646       case 0x12: strcpy(insn[i],"COP2"); type=NI;
7647         op2=(source[i]>>21)&0x1f;
7648         //if (op2 & 0x10) {
7649         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
7650           if (gte_handlers[source[i]&0x3f]!=NULL) {
7651             if (gte_regnames[source[i]&0x3f]!=NULL)
7652               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
7653             else
7654               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
7655             type=C2OP;
7656           }
7657         }
7658         else switch(op2)
7659         {
7660           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
7661           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
7662           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
7663           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
7664         }
7665         break;
7666       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
7667       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
7668       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7669       default: strcpy(insn[i],"???"); type=NI;
7670         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
7671         break;
7672     }
7673     itype[i]=type;
7674     opcode2[i]=op2;
7675     /* Get registers/immediates */
7676     lt1[i]=0;
7677     us1[i]=0;
7678     us2[i]=0;
7679     dep1[i]=0;
7680     dep2[i]=0;
7681     gte_rs[i]=gte_rt[i]=0;
7682     switch(type) {
7683       case LOAD:
7684         rs1[i]=(source[i]>>21)&0x1f;
7685         rs2[i]=0;
7686         rt1[i]=(source[i]>>16)&0x1f;
7687         rt2[i]=0;
7688         imm[i]=(short)source[i];
7689         break;
7690       case STORE:
7691       case STORELR:
7692         rs1[i]=(source[i]>>21)&0x1f;
7693         rs2[i]=(source[i]>>16)&0x1f;
7694         rt1[i]=0;
7695         rt2[i]=0;
7696         imm[i]=(short)source[i];
7697         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7698         break;
7699       case LOADLR:
7700         // LWL/LWR only load part of the register,
7701         // therefore the target register must be treated as a source too
7702         rs1[i]=(source[i]>>21)&0x1f;
7703         rs2[i]=(source[i]>>16)&0x1f;
7704         rt1[i]=(source[i]>>16)&0x1f;
7705         rt2[i]=0;
7706         imm[i]=(short)source[i];
7707         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7708         if(op==0x26) dep1[i]=rt1[i]; // LWR
7709         break;
7710       case IMM16:
7711         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7712         else rs1[i]=(source[i]>>21)&0x1f;
7713         rs2[i]=0;
7714         rt1[i]=(source[i]>>16)&0x1f;
7715         rt2[i]=0;
7716         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7717           imm[i]=(unsigned short)source[i];
7718         }else{
7719           imm[i]=(short)source[i];
7720         }
7721         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7722         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7723         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7724         break;
7725       case UJUMP:
7726         rs1[i]=0;
7727         rs2[i]=0;
7728         rt1[i]=0;
7729         rt2[i]=0;
7730         // The JAL instruction writes to r31.
7731         if (op&1) {
7732           rt1[i]=31;
7733         }
7734         rs2[i]=CCREG;
7735         break;
7736       case RJUMP:
7737         rs1[i]=(source[i]>>21)&0x1f;
7738         rs2[i]=0;
7739         rt1[i]=0;
7740         rt2[i]=0;
7741         // The JALR instruction writes to rd.
7742         if (op2&1) {
7743           rt1[i]=(source[i]>>11)&0x1f;
7744         }
7745         rs2[i]=CCREG;
7746         break;
7747       case CJUMP:
7748         rs1[i]=(source[i]>>21)&0x1f;
7749         rs2[i]=(source[i]>>16)&0x1f;
7750         rt1[i]=0;
7751         rt2[i]=0;
7752         if(op&2) { // BGTZ/BLEZ
7753           rs2[i]=0;
7754         }
7755         us1[i]=rs1[i];
7756         us2[i]=rs2[i];
7757         likely[i]=op>>4;
7758         break;
7759       case SJUMP:
7760         rs1[i]=(source[i]>>21)&0x1f;
7761         rs2[i]=CCREG;
7762         rt1[i]=0;
7763         rt2[i]=0;
7764         us1[i]=rs1[i];
7765         if(op2&0x10) { // BxxAL
7766           rt1[i]=31;
7767           // NOTE: If the branch is not taken, r31 is still overwritten
7768         }
7769         likely[i]=(op2&2)>>1;
7770         break;
7771       case FJUMP:
7772         rs1[i]=FSREG;
7773         rs2[i]=CSREG;
7774         rt1[i]=0;
7775         rt2[i]=0;
7776         likely[i]=((source[i])>>17)&1;
7777         break;
7778       case ALU:
7779         rs1[i]=(source[i]>>21)&0x1f; // source
7780         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
7781         rt1[i]=(source[i]>>11)&0x1f; // destination
7782         rt2[i]=0;
7783         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7784           us1[i]=rs1[i];us2[i]=rs2[i];
7785         }
7786         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7787           dep1[i]=rs1[i];dep2[i]=rs2[i];
7788         }
7789         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7790           dep1[i]=rs1[i];dep2[i]=rs2[i];
7791         }
7792         break;
7793       case MULTDIV:
7794         rs1[i]=(source[i]>>21)&0x1f; // source
7795         rs2[i]=(source[i]>>16)&0x1f; // divisor
7796         rt1[i]=HIREG;
7797         rt2[i]=LOREG;
7798         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7799           us1[i]=rs1[i];us2[i]=rs2[i];
7800         }
7801         break;
7802       case MOV:
7803         rs1[i]=0;
7804         rs2[i]=0;
7805         rt1[i]=0;
7806         rt2[i]=0;
7807         if(op2==0x10) rs1[i]=HIREG; // MFHI
7808         if(op2==0x11) rt1[i]=HIREG; // MTHI
7809         if(op2==0x12) rs1[i]=LOREG; // MFLO
7810         if(op2==0x13) rt1[i]=LOREG; // MTLO
7811         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7812         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7813         dep1[i]=rs1[i];
7814         break;
7815       case SHIFT:
7816         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7817         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7818         rt1[i]=(source[i]>>11)&0x1f; // destination
7819         rt2[i]=0;
7820         // DSLLV/DSRLV/DSRAV are 64-bit
7821         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
7822         break;
7823       case SHIFTIMM:
7824         rs1[i]=(source[i]>>16)&0x1f;
7825         rs2[i]=0;
7826         rt1[i]=(source[i]>>11)&0x1f;
7827         rt2[i]=0;
7828         imm[i]=(source[i]>>6)&0x1f;
7829         // DSxx32 instructions
7830         if(op2>=0x3c) imm[i]|=0x20;
7831         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
7832         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
7833         break;
7834       case COP0:
7835         rs1[i]=0;
7836         rs2[i]=0;
7837         rt1[i]=0;
7838         rt2[i]=0;
7839         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
7840         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
7841         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7842         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7843         break;
7844       case COP1:
7845         rs1[i]=0;
7846         rs2[i]=0;
7847         rt1[i]=0;
7848         rt2[i]=0;
7849         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7850         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7851         if(op2==5) us1[i]=rs1[i]; // DMTC1
7852         rs2[i]=CSREG;
7853         break;
7854       case COP2:
7855         rs1[i]=0;
7856         rs2[i]=0;
7857         rt1[i]=0;
7858         rt2[i]=0;
7859         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
7860         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
7861         rs2[i]=CSREG;
7862         int gr=(source[i]>>11)&0x1F;
7863         switch(op2)
7864         {
7865           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
7866           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
7867           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
7868           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
7869         }
7870         break;
7871       case C1LS:
7872         rs1[i]=(source[i]>>21)&0x1F;
7873         rs2[i]=CSREG;
7874         rt1[i]=0;
7875         rt2[i]=0;
7876         imm[i]=(short)source[i];
7877         break;
7878       case C2LS:
7879         rs1[i]=(source[i]>>21)&0x1F;
7880         rs2[i]=0;
7881         rt1[i]=0;
7882         rt2[i]=0;
7883         imm[i]=(short)source[i];
7884         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7885         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7886         break;
7887       case C2OP:
7888         rs1[i]=0;
7889         rs2[i]=0;
7890         rt1[i]=0;
7891         rt2[i]=0;
7892         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
7893         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
7894         gte_rt[i]|=1ll<<63; // every op changes flags
7895         if((source[i]&0x3f)==GTE_MVMVA) {
7896           int v = (source[i] >> 15) & 3;
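               // Note: bits 15-16 select the multiply vector. 0xe3f covers the
               // candidate sources (data regs 0-5 = VXY0/VZ0..VXY2/VZ2 and
               // 9-11 = IR1-IR3); all are cleared, then only the selected
               // vector is re-added: IR1-IR3 for v==3, otherwise the two
               // packed regs of V0/V1/V2 at bits v*2 and v*2+1.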
7897           gte_rs[i]&=~0xe3fll;
7898           if(v==3) gte_rs[i]|=0xe00ll;
7899           else gte_rs[i]|=3ll<<(v*2);
7900         }
7901         break;
7902       case FLOAT:
7903       case FCONV:
7904         rs1[i]=0;
7905         rs2[i]=CSREG;
7906         rt1[i]=0;
7907         rt2[i]=0;
7908         break;
7909       case FCOMP:
7910         rs1[i]=FSREG;
7911         rs2[i]=CSREG;
7912         rt1[i]=FSREG;
7913         rt2[i]=0;
7914         break;
7915       case SYSCALL:
7916       case HLECALL:
7917       case INTCALL:
7918         rs1[i]=CCREG;
7919         rs2[i]=0;
7920         rt1[i]=0;
7921         rt2[i]=0;
7922         break;
7923       default:
7924         rs1[i]=0;
7925         rs2[i]=0;
7926         rt1[i]=0;
7927         rt2[i]=0;
7928     }
7929     /* Calculate branch target addresses */
7930     if(type==UJUMP)
7931       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
7932     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
7933       ba[i]=start+i*4+8; // Ignore never taken branch
7934     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
7935       ba[i]=start+i*4+8; // Ignore never taken branch
7936     else if(type==CJUMP||type==SJUMP||type==FJUMP)
7937       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
7938     else ba[i]=-1;
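     // Note (example values are hypothetical): a J/JAL target is the 26-bit
     // index shifted left by 2, merged with the top 4 bits of the delay-slot
     // PC. E.g. start=0x80010000, i=4, source=0x0c004321 (JAL):
     //   ba = (0x80010014&0xF0000000) | ((0x0c004321u<<6)>>4) = 0x80010c84
     // Conditional branches add the sign-extended 16-bit offset times 4 to the
     // delay-slot address; (s32)(source<<16)>>14 recovers offset<<2.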
7939     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
7940       int do_in_intrp=0;
7941       // branch in delay slot?
7942       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
7943         // don't compile past the first branch; let the interpreter handle it if it's hit
7944         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
7945         do_in_intrp=1;
7946       }
7947       // basic load delay detection
7948       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
7949         int t=(ba[i-1]-start)/4;
7950         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
7951           // jump target wants DS result - potential load delay effect
7952           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
7953           do_in_intrp=1;
7954           bt[t+1]=1; // expected return from interpreter
7955         }
7956         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
7957               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
7958           // v0 overwrite like this is a sign of trouble, bail out
7959           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
7960           do_in_intrp=1;
7961         }
7962       }
7963       if(do_in_intrp) {
7964         rs1[i-1]=CCREG;
7965         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
7966         ba[i-1]=-1;
7967         itype[i-1]=INTCALL;
7968         done=2;
7969         i--; // don't compile the DS
7970       }
7971     }
7972     /* Is this the end of the block? */
7973     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
7974       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
7975         done=2;
7976       }
7977       else {
7978         if(stop_after_jal) done=1;
7979         // Stop on BREAK
7980         if((source[i+1]&0xfc00003f)==0x0d) done=1;
7981       }
7982       // Don't recompile stuff that's already compiled
7983       if(check_addr(start+i*4+4)) done=1;
7984       // Don't get too close to the limit
7985       if(i>MAXBLOCK/2) done=1;
7986     }
7987     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
7988     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
7989     if(done==2) {
7990       // Does the block continue due to a branch?
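           // Note: done=j=0 below both clears the stop flag and ends this
           // backward scan; an earlier branch targets one of the next few
           // instructions, so the block has to keep going.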
7991       for(j=i-1;j>=0;j--)
7992       {
7993         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
7994         if(ba[j]==start+i*4+4) done=j=0;
7995         if(ba[j]==start+i*4+8) done=j=0;
7996       }
7997     }
7998     //assert(i<MAXBLOCK-1);
7999     if(start+i*4==pagelimit-4) done=1;
8000     assert(start+i*4<pagelimit);
8001     if (i==MAXBLOCK-1) done=1;
8002     // Stop if we're compiling junk
8003     if(itype[i]==NI&&opcode[i]==0x11) {
8004       done=stop_after_jal=1;
8005       SysPrintf("Disabled speculative precompilation\n");
8006     }
8007   }
8008   slen=i;
8009   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8010     if(start+i*4==pagelimit) {
8011       itype[i-1]=SPAN;
8012     }
8013   }
8014   assert(slen>0);
8015
8016   /* Pass 2 - Register dependencies and branch targets */
8017
8018   unneeded_registers(0,slen-1,0);
8019
8020   /* Pass 3 - Register allocation */
8021
8022   struct regstat current; // Current register allocations/status
8023   current.is32=1;
8024   current.dirty=0;
8025   current.u=unneeded_reg[0];
8026   current.uu=unneeded_reg_upper[0];
8027   clear_all_regs(current.regmap);
8028   alloc_reg(&current,0,CCREG);
8029   dirty_reg(&current,CCREG);
8030   current.isconst=0;
8031   current.wasconst=0;
8032   current.waswritten=0;
8033   int ds=0;
8034   int cc=0;
8035   int hr=-1;
8036
8037   if((u_int)addr&1) {
8038     // First instruction is delay slot
8039     cc=-1;
8040     bt[1]=1;
8041     ds=1;
8042     unneeded_reg[0]=1;
8043     unneeded_reg_upper[0]=1;
8044     current.regmap[HOST_BTREG]=BTREG;
8045   }
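   // Note: the low bit of the requested address appears to flag a block that
   // begins in a branch delay slot; BTREG then carries the pending branch
   // target into the generated code.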
8046
8047   for(i=0;i<slen;i++)
8048   {
8049     if(bt[i])
8050     {
8051       int hr;
8052       for(hr=0;hr<HOST_REGS;hr++)
8053       {
8054         // Is this really necessary?
8055         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8056       }
8057       current.isconst=0;
8058       current.waswritten=0;
8059     }
8060     if(i>1)
8061     {
8062       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8063       {
8064         if(rs1[i-2]==0||rs2[i-2]==0)
8065         {
8066           if(rs1[i-2]) {
8067             current.is32|=1LL<<rs1[i-2];
8068             int hr=get_reg(current.regmap,rs1[i-2]|64);
8069             if(hr>=0) current.regmap[hr]=-1;
8070           }
8071           if(rs2[i-2]) {
8072             current.is32|=1LL<<rs2[i-2];
8073             int hr=get_reg(current.regmap,rs2[i-2]|64);
8074             if(hr>=0) current.regmap[hr]=-1;
8075           }
8076         }
8077       }
8078     }
8079     current.is32=-1LL;
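     // Note: the PSX only has 32-bit GPRs, so is32 is forced to all-ones here;
     // the 64-bit/upper-half allocation paths below should never be taken on
     // this target.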
8080
8081     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8082     regs[i].wasconst=current.isconst;
8083     regs[i].was32=current.is32;
8084     regs[i].wasdirty=current.dirty;
8085     regs[i].loadedconst=0;
8086     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8087       if(i+1<slen) {
8088         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8089         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8090         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8091         current.u|=1;
8092         current.uu|=1;
8093       } else {
8094         current.u=1;
8095         current.uu=1;
8096       }
8097     } else {
8098       if(i+1<slen) {
8099         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8100         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8101         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8102         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8103         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8104         current.u|=1;
8105         current.uu|=1;
8106       } else { SysPrintf("oops, branch at end of block with no delay slot\n");exit(1); }
8107     }
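     // Note: current.u/current.uu are masks over MIPS registers (uu covers the
     // upper halves); a set bit means the value is dead past this point. They
     // come from the next instruction's unneeded_reg set minus this
     // instruction's sources, with bit 0 ($zero) always forced on.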
8108     is_ds[i]=ds;
8109     if(ds) {
8110       ds=0; // Skip delay slot, already allocated as part of branch
8111       // ...but we need to alloc it in case something jumps here
8112       if(i+1<slen) {
8113         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8114         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8115       }else{
8116         current.u=branch_unneeded_reg[i-1];
8117         current.uu=branch_unneeded_reg_upper[i-1];
8118       }
8119       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8120       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8121       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8122       current.u|=1;
8123       current.uu|=1;
8124       struct regstat temp;
8125       memcpy(&temp,&current,sizeof(current));
8126       temp.wasdirty=temp.dirty;
8127       temp.was32=temp.is32;
8128       // TODO: Take into account unconditional branches, as below
8129       delayslot_alloc(&temp,i);
8130       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8131       regs[i].wasdirty=temp.wasdirty;
8132       regs[i].was32=temp.was32;
8133       regs[i].dirty=temp.dirty;
8134       regs[i].is32=temp.is32;
8135       regs[i].isconst=0;
8136       regs[i].wasconst=0;
8137       current.isconst=0;
8138       // Create entry (branch target) regmap
8139       for(hr=0;hr<HOST_REGS;hr++)
8140       {
8141         int r=temp.regmap[hr];
8142         if(r>=0) {
8143           if(r!=regmap_pre[i][hr]) {
8144             regs[i].regmap_entry[hr]=-1;
8145           }
8146           else
8147           {
8148             if(r<64){
8149               if((current.u>>r)&1) {
8150                 regs[i].regmap_entry[hr]=-1;
8151                 regs[i].regmap[hr]=-1;
8152                 //Don't clear regs in the delay slot as the branch might need them
8153                 //current.regmap[hr]=-1;
8154               }else
8155                 regs[i].regmap_entry[hr]=r;
8156             }
8157             else {
8158               if((current.uu>>(r&63))&1) {
8159                 regs[i].regmap_entry[hr]=-1;
8160                 regs[i].regmap[hr]=-1;
8161                 //Don't clear regs in the delay slot as the branch might need them
8162                 //current.regmap[hr]=-1;
8163               }else
8164                 regs[i].regmap_entry[hr]=r;
8165             }
8166           }
8167         } else {
8168           // First instruction expects CCREG to be allocated
8169           if(i==0&&hr==HOST_CCREG)
8170             regs[i].regmap_entry[hr]=CCREG;
8171           else
8172             regs[i].regmap_entry[hr]=-1;
8173         }
8174       }
8175     }
8176     else { // Not delay slot
8177       switch(itype[i]) {
8178         case UJUMP:
8179           //current.isconst=0; // DEBUG
8180           //current.wasconst=0; // DEBUG
8181           //regs[i].wasconst=0; // DEBUG
8182           clear_const(&current,rt1[i]);
8183           alloc_cc(&current,i);
8184           dirty_reg(&current,CCREG);
8185           if (rt1[i]==31) {
8186             alloc_reg(&current,i,31);
8187             dirty_reg(&current,31);
8188             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8189             //assert(rt1[i+1]!=rt1[i]);
8190             #ifdef REG_PREFETCH
8191             alloc_reg(&current,i,PTEMP);
8192             #endif
8193             //current.is32|=1LL<<rt1[i];
8194           }
8195           ooo[i]=1;
8196           delayslot_alloc(&current,i+1);
8197           //current.isconst=0; // DEBUG
8198           ds=1;
8199           //printf("i=%d, isconst=%x\n",i,current.isconst);
8200           break;
8201         case RJUMP:
8202           //current.isconst=0;
8203           //current.wasconst=0;
8204           //regs[i].wasconst=0;
8205           clear_const(&current,rs1[i]);
8206           clear_const(&current,rt1[i]);
8207           alloc_cc(&current,i);
8208           dirty_reg(&current,CCREG);
8209           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8210             alloc_reg(&current,i,rs1[i]);
8211             if (rt1[i]!=0) {
8212               alloc_reg(&current,i,rt1[i]);
8213               dirty_reg(&current,rt1[i]);
8214               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
8215               assert(rt1[i+1]!=rt1[i]);
8216               #ifdef REG_PREFETCH
8217               alloc_reg(&current,i,PTEMP);
8218               #endif
8219             }
8220             #ifdef USE_MINI_HT
8221             if(rs1[i]==31) { // JALR
8222               alloc_reg(&current,i,RHASH);
8223               #ifndef HOST_IMM_ADDR32
8224               alloc_reg(&current,i,RHTBL);
8225               #endif
8226             }
8227             #endif
8228             delayslot_alloc(&current,i+1);
8229           } else {
8230             // The delay slot overwrites our source register,
8231             // allocate a temporary register to hold the old value.
8232             current.isconst=0;
8233             current.wasconst=0;
8234             regs[i].wasconst=0;
8235             delayslot_alloc(&current,i+1);
8236             current.isconst=0;
8237             alloc_reg(&current,i,RTEMP);
8238           }
8239           //current.isconst=0; // DEBUG
8240           ooo[i]=1;
8241           ds=1;
8242           break;
8243         case CJUMP:
8244           //current.isconst=0;
8245           //current.wasconst=0;
8246           //regs[i].wasconst=0;
8247           clear_const(&current,rs1[i]);
8248           clear_const(&current,rs2[i]);
8249           if((opcode[i]&0x3E)==4) // BEQ/BNE
8250           {
8251             alloc_cc(&current,i);
8252             dirty_reg(&current,CCREG);
8253             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8254             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8255             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8256             {
8257               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8258               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8259             }
8260             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8261                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8262               // The delay slot overwrites one of our conditions.
8263               // Allocate the branch condition registers instead.
8264               current.isconst=0;
8265               current.wasconst=0;
8266               regs[i].wasconst=0;
8267               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8268               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8269               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8270               {
8271                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8272                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8273               }
8274             }
8275             else
8276             {
8277               ooo[i]=1;
8278               delayslot_alloc(&current,i+1);
8279             }
8280           }
8281           else
8282           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8283           {
8284             alloc_cc(&current,i);
8285             dirty_reg(&current,CCREG);
8286             alloc_reg(&current,i,rs1[i]);
8287             if(!(current.is32>>rs1[i]&1))
8288             {
8289               alloc_reg64(&current,i,rs1[i]);
8290             }
8291             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8292               // The delay slot overwrites one of our conditions.
8293               // Allocate the branch condition registers instead.
8294               current.isconst=0;
8295               current.wasconst=0;
8296               regs[i].wasconst=0;
8297               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8298               if(!((current.is32>>rs1[i])&1))
8299               {
8300                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8301               }
8302             }
8303             else
8304             {
8305               ooo[i]=1;
8306               delayslot_alloc(&current,i+1);
8307             }
8308           }
8309           else
8310           // Don't alloc the delay slot yet because we might not execute it
8311           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8312           {
8313             current.isconst=0;
8314             current.wasconst=0;
8315             regs[i].wasconst=0;
8316             alloc_cc(&current,i);
8317             dirty_reg(&current,CCREG);
8318             alloc_reg(&current,i,rs1[i]);
8319             alloc_reg(&current,i,rs2[i]);
8320             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8321             {
8322               alloc_reg64(&current,i,rs1[i]);
8323               alloc_reg64(&current,i,rs2[i]);
8324             }
8325           }
8326           else
8327           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8328           {
8329             current.isconst=0;
8330             current.wasconst=0;
8331             regs[i].wasconst=0;
8332             alloc_cc(&current,i);
8333             dirty_reg(&current,CCREG);
8334             alloc_reg(&current,i,rs1[i]);
8335             if(!(current.is32>>rs1[i]&1))
8336             {
8337               alloc_reg64(&current,i,rs1[i]);
8338             }
8339           }
8340           ds=1;
8341           //current.isconst=0;
8342           break;
8343         case SJUMP:
8344           //current.isconst=0;
8345           //current.wasconst=0;
8346           //regs[i].wasconst=0;
8347           clear_const(&current,rs1[i]);
8348           clear_const(&current,rt1[i]);
8349           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8350           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8351           {
8352             alloc_cc(&current,i);
8353             dirty_reg(&current,CCREG);
8354             alloc_reg(&current,i,rs1[i]);
8355             if(!(current.is32>>rs1[i]&1))
8356             {
8357               alloc_reg64(&current,i,rs1[i]);
8358             }
8359             if (rt1[i]==31) { // BLTZAL/BGEZAL
8360               alloc_reg(&current,i,31);
8361               dirty_reg(&current,31);
8362               //#ifdef REG_PREFETCH
8363               //alloc_reg(&current,i,PTEMP);
8364               //#endif
8365               //current.is32|=1LL<<rt1[i];
8366             }
8367             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
8368                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
8369               // Allocate the branch condition registers instead.
8370               current.isconst=0;
8371               current.wasconst=0;
8372               regs[i].wasconst=0;
8373               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8374               if(!((current.is32>>rs1[i])&1))
8375               {
8376                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8377               }
8378             }
8379             else
8380             {
8381               ooo[i]=1;
8382               delayslot_alloc(&current,i+1);
8383             }
8384           }
8385           else
8386           // Don't alloc the delay slot yet because we might not execute it
8387           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
8388           {
8389             current.isconst=0;
8390             current.wasconst=0;
8391             regs[i].wasconst=0;
8392             alloc_cc(&current,i);
8393             dirty_reg(&current,CCREG);
8394             alloc_reg(&current,i,rs1[i]);
8395             if(!(current.is32>>rs1[i]&1))
8396             {
8397               alloc_reg64(&current,i,rs1[i]);
8398             }
8399           }
8400           ds=1;
8401           //current.isconst=0;
8402           break;
8403         case FJUMP:
8404           current.isconst=0;
8405           current.wasconst=0;
8406           regs[i].wasconst=0;
8407           if(likely[i]==0) // BC1F/BC1T
8408           {
8409             // TODO: Theoretically we can run out of registers here on x86.
8410             // The delay slot can allocate up to six, and we need to check
8411             // CSREG before executing the delay slot.  Possibly we can drop
8412             // the cycle count and then reload it after checking that the
8413             // FPU is in a usable state, or don't do out-of-order execution.
8414             alloc_cc(&current,i);
8415             dirty_reg(&current,CCREG);
8416             alloc_reg(&current,i,FSREG);
8417             alloc_reg(&current,i,CSREG);
8418             if(itype[i+1]==FCOMP) {
8419               // The delay slot overwrites the branch condition.
8420               // Allocate the branch condition registers instead.
8421               alloc_cc(&current,i);
8422               dirty_reg(&current,CCREG);
8423               alloc_reg(&current,i,CSREG);
8424               alloc_reg(&current,i,FSREG);
8425             }
8426             else {
8427               ooo[i]=1;
8428               delayslot_alloc(&current,i+1);
8429               alloc_reg(&current,i+1,CSREG);
8430             }
8431           }
8432           else
8433           // Don't alloc the delay slot yet because we might not execute it
8434           if(likely[i]) // BC1FL/BC1TL
8435           {
8436             alloc_cc(&current,i);
8437             dirty_reg(&current,CCREG);
8438             alloc_reg(&current,i,CSREG);
8439             alloc_reg(&current,i,FSREG);
8440           }
8441           ds=1;
8442           current.isconst=0;
8443           break;
8444         case IMM16:
8445           imm16_alloc(&current,i);
8446           break;
8447         case LOAD:
8448         case LOADLR:
8449           load_alloc(&current,i);
8450           break;
8451         case STORE:
8452         case STORELR:
8453           store_alloc(&current,i);
8454           break;
8455         case ALU:
8456           alu_alloc(&current,i);
8457           break;
8458         case SHIFT:
8459           shift_alloc(&current,i);
8460           break;
8461         case MULTDIV:
8462           multdiv_alloc(&current,i);
8463           break;
8464         case SHIFTIMM:
8465           shiftimm_alloc(&current,i);
8466           break;
8467         case MOV:
8468           mov_alloc(&current,i);
8469           break;
8470         case COP0:
8471           cop0_alloc(&current,i);
8472           break;
8473         case COP1:
8474         case COP2:
8475           cop1_alloc(&current,i);
8476           break;
8477         case C1LS:
8478           c1ls_alloc(&current,i);
8479           break;
8480         case C2LS:
8481           c2ls_alloc(&current,i);
8482           break;
8483         case C2OP:
8484           c2op_alloc(&current,i);
8485           break;
8486         case FCONV:
8487           fconv_alloc(&current,i);
8488           break;
8489         case FLOAT:
8490           float_alloc(&current,i);
8491           break;
8492         case FCOMP:
8493           fcomp_alloc(&current,i);
8494           break;
8495         case SYSCALL:
8496         case HLECALL:
8497         case INTCALL:
8498           syscall_alloc(&current,i);
8499           break;
8500         case SPAN:
8501           pagespan_alloc(&current,i);
8502           break;
8503       }
8504
8505       // Drop the upper half of registers that have become 32-bit
8506       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8507       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8508         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8509         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8510         current.uu|=1;
8511       } else {
8512         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8513         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8514         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8515         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8516         current.uu|=1;
8517       }
8518
8519       // Create entry (branch target) regmap
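       // Note: regmap_entry[] records the mapping a jump into this instruction
       // must provide; -1 means the value need not already be present in that
       // host register on entry.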
8520       for(hr=0;hr<HOST_REGS;hr++)
8521       {
8522         int r,or;
8523         r=current.regmap[hr];
8524         if(r>=0) {
8525           if(r!=regmap_pre[i][hr]) {
8526             // TODO: delay slot (?)
8527             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8528             if(or<0||(r&63)>=TEMPREG){
8529               regs[i].regmap_entry[hr]=-1;
8530             }
8531             else
8532             {
8533               // Just move it to a different register
8534               regs[i].regmap_entry[hr]=r;
8535               // If it was dirty before, it's still dirty
8536               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8537             }
8538           }
8539           else
8540           {
8541             // Unneeded
8542             if(r==0){
8543               regs[i].regmap_entry[hr]=0;
8544             }
8545             else
8546             if(r<64){
8547               if((current.u>>r)&1) {
8548                 regs[i].regmap_entry[hr]=-1;
8549                 //regs[i].regmap[hr]=-1;
8550                 current.regmap[hr]=-1;
8551               }else
8552                 regs[i].regmap_entry[hr]=r;
8553             }
8554             else {
8555               if((current.uu>>(r&63))&1) {
8556                 regs[i].regmap_entry[hr]=-1;
8557                 //regs[i].regmap[hr]=-1;
8558                 current.regmap[hr]=-1;
8559               }else
8560                 regs[i].regmap_entry[hr]=r;
8561             }
8562           }
8563         } else {
8564           // Branches expect CCREG to be allocated at the target
8565           if(regmap_pre[i][hr]==CCREG)
8566             regs[i].regmap_entry[hr]=CCREG;
8567           else
8568             regs[i].regmap_entry[hr]=-1;
8569         }
8570       }
8571       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8572     }
8573
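     // Note (inferred): waswritten tracks registers recently used as a store
     // base with a small offset, cleared once the register is overwritten or
     // used with a large offset; this seems to let later stores through the
     // same base skip part of the invalidation check.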
8574     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
8575       current.waswritten|=1<<rs1[i-1];
8576     current.waswritten&=~(1<<rt1[i]);
8577     current.waswritten&=~(1<<rt2[i]);
8578     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
8579       current.waswritten&=~(1<<rs1[i]);
8580
8581     /* Branch post-alloc */
8582     if(i>0)
8583     {
8584       current.was32=current.is32;
8585       current.wasdirty=current.dirty;
8586       switch(itype[i-1]) {
8587         case UJUMP:
8588           memcpy(&branch_regs[i-1],&current,sizeof(current));
8589           branch_regs[i-1].isconst=0;
8590           branch_regs[i-1].wasconst=0;
8591           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8592           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8593           alloc_cc(&branch_regs[i-1],i-1);
8594           dirty_reg(&branch_regs[i-1],CCREG);
8595           if(rt1[i-1]==31) { // JAL
8596             alloc_reg(&branch_regs[i-1],i-1,31);
8597             dirty_reg(&branch_regs[i-1],31);
8598             branch_regs[i-1].is32|=1LL<<31;
8599           }
8600           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8601           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8602           break;
8603         case RJUMP:
8604           memcpy(&branch_regs[i-1],&current,sizeof(current));
8605           branch_regs[i-1].isconst=0;
8606           branch_regs[i-1].wasconst=0;
8607           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8608           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8609           alloc_cc(&branch_regs[i-1],i-1);
8610           dirty_reg(&branch_regs[i-1],CCREG);
8611           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8612           if(rt1[i-1]!=0) { // JALR
8613             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
8614             dirty_reg(&branch_regs[i-1],rt1[i-1]);
8615             branch_regs[i-1].is32|=1LL<<rt1[i-1];
8616           }
8617           #ifdef USE_MINI_HT
8618           if(rs1[i-1]==31) { // JALR
8619             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8620             #ifndef HOST_IMM_ADDR32
8621             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8622             #endif
8623           }
8624           #endif
8625           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8626           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8627           break;
8628         case CJUMP:
8629           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8630           {
8631             alloc_cc(&current,i-1);
8632             dirty_reg(&current,CCREG);
8633             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8634                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8635               // The delay slot overwrote one of our conditions
8636               // Delay slot goes after the test (in order)
8637               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8638               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8639               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8640               current.u|=1;
8641               current.uu|=1;
8642               delayslot_alloc(&current,i);
8643               current.isconst=0;
8644             }
8645             else
8646             {
8647               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8648               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8649               // Alloc the branch condition registers
8650               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8651               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8652               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8653               {
8654                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8655                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8656               }
8657             }
8658             memcpy(&branch_regs[i-1],&current,sizeof(current));
8659             branch_regs[i-1].isconst=0;
8660             branch_regs[i-1].wasconst=0;
8661             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8662             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8663           }
8664           else
8665           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8666           {
8667             alloc_cc(&current,i-1);
8668             dirty_reg(&current,CCREG);
8669             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8670               // The delay slot overwrote the branch condition
8671               // Delay slot goes after the test (in order)
8672               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8673               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8674               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8675               current.u|=1;
8676               current.uu|=1;
8677               delayslot_alloc(&current,i);
8678               current.isconst=0;
8679             }
8680             else
8681             {
8682               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8683               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8684               // Alloc the branch condition register
8685               alloc_reg(&current,i-1,rs1[i-1]);
8686               if(!(current.is32>>rs1[i-1]&1))
8687               {
8688                 alloc_reg64(&current,i-1,rs1[i-1]);
8689               }
8690             }
8691             memcpy(&branch_regs[i-1],&current,sizeof(current));
8692             branch_regs[i-1].isconst=0;
8693             branch_regs[i-1].wasconst=0;
8694             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8695             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8696           }
8697           else
8698           // Alloc the delay slot in case the branch is taken
8699           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8700           {
8701             memcpy(&branch_regs[i-1],&current,sizeof(current));
8702             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8703             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8704             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8705             alloc_cc(&branch_regs[i-1],i);
8706             dirty_reg(&branch_regs[i-1],CCREG);
8707             delayslot_alloc(&branch_regs[i-1],i);
8708             branch_regs[i-1].isconst=0;
8709             alloc_reg(&current,i,CCREG); // Not taken path
8710             dirty_reg(&current,CCREG);
8711             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8712           }
8713           else
8714           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8715           {
8716             memcpy(&branch_regs[i-1],&current,sizeof(current));
8717             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8718             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8719             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8720             alloc_cc(&branch_regs[i-1],i);
8721             dirty_reg(&branch_regs[i-1],CCREG);
8722             delayslot_alloc(&branch_regs[i-1],i);
8723             branch_regs[i-1].isconst=0;
8724             alloc_reg(&current,i,CCREG); // Not taken path
8725             dirty_reg(&current,CCREG);
8726             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8727           }
8728           break;
8729         case SJUMP:
8730           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8731           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8732           {
8733             alloc_cc(&current,i-1);
8734             dirty_reg(&current,CCREG);
8735             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8736               // The delay slot overwrote the branch condition
8737               // Delay slot goes after the test (in order)
8738               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8739               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8740               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8741               current.u|=1;
8742               current.uu|=1;
8743               delayslot_alloc(&current,i);
8744               current.isconst=0;
8745             }
8746             else
8747             {
8748               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8749               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8750               // Alloc the branch condition register
8751               alloc_reg(&current,i-1,rs1[i-1]);
8752               if(!(current.is32>>rs1[i-1]&1))
8753               {
8754                 alloc_reg64(&current,i-1,rs1[i-1]);
8755               }
8756             }
8757             memcpy(&branch_regs[i-1],&current,sizeof(current));
8758             branch_regs[i-1].isconst=0;
8759             branch_regs[i-1].wasconst=0;
8760             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8761             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8762           }
8763           else
8764           // Alloc the delay slot in case the branch is taken
8765           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8766           {
8767             memcpy(&branch_regs[i-1],&current,sizeof(current));
8768             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8769             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8770             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8771             alloc_cc(&branch_regs[i-1],i);
8772             dirty_reg(&branch_regs[i-1],CCREG);
8773             delayslot_alloc(&branch_regs[i-1],i);
8774             branch_regs[i-1].isconst=0;
8775             alloc_reg(&current,i,CCREG); // Not taken path
8776             dirty_reg(&current,CCREG);
8777             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8778           }
8779           // FIXME: BLTZAL/BGEZAL
8780           if(opcode2[i-1]&0x10) { // BxxZAL
8781             alloc_reg(&branch_regs[i-1],i-1,31);
8782             dirty_reg(&branch_regs[i-1],31);
8783             branch_regs[i-1].is32|=1LL<<31;
8784           }
8785           break;
8786         case FJUMP:
8787           if(likely[i-1]==0) // BC1F/BC1T
8788           {
8789             alloc_cc(&current,i-1);
8790             dirty_reg(&current,CCREG);
8791             if(itype[i]==FCOMP) {
8792               // The delay slot overwrote the branch condition
8793               // Delay slot goes after the test (in order)
8794               delayslot_alloc(&current,i);
8795               current.isconst=0;
8796             }
8797             else
8798             {
8799               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8800               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8801               // Alloc the branch condition register
8802               alloc_reg(&current,i-1,FSREG);
8803             }
8804             memcpy(&branch_regs[i-1],&current,sizeof(current));
8805             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8806           }
8807           else // BC1FL/BC1TL
8808           {
8809             // Alloc the delay slot in case the branch is taken
8810             memcpy(&branch_regs[i-1],&current,sizeof(current));
8811             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8812             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8813             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8814             alloc_cc(&branch_regs[i-1],i);
8815             dirty_reg(&branch_regs[i-1],CCREG);
8816             delayslot_alloc(&branch_regs[i-1],i);
8817             branch_regs[i-1].isconst=0;
8818             alloc_reg(&current,i,CCREG); // Not taken path
8819             dirty_reg(&current,CCREG);
8820             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8821           }
8822           break;
8823       }
8824
8825       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
8826       {
8827         if(rt1[i-1]==31) // JAL/JALR
8828         {
8829           // Subroutine call will return here, don't alloc any registers
8830           current.is32=1;
8831           current.dirty=0;
8832           clear_all_regs(current.regmap);
8833           alloc_reg(&current,i,CCREG);
8834           dirty_reg(&current,CCREG);
8835         }
8836         else if(i+1<slen)
8837         {
8838           // Internal branch will jump here, match registers to caller
8839           current.is32=0x3FFFFFFFFLL;
8840           current.dirty=0;
8841           clear_all_regs(current.regmap);
8842           alloc_reg(&current,i,CCREG);
8843           dirty_reg(&current,CCREG);
8844           for(j=i-1;j>=0;j--)
8845           {
8846             if(ba[j]==start+i*4+4) {
8847               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
8848               current.is32=branch_regs[j].is32;
8849               current.dirty=branch_regs[j].dirty;
8850               break;
8851             }
8852           }
8853           while(j>=0) {
8854             if(ba[j]==start+i*4+4) {
8855               for(hr=0;hr<HOST_REGS;hr++) {
8856                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
8857                   current.regmap[hr]=-1;
8858                 }
8859                 current.is32&=branch_regs[j].is32;
8860                 current.dirty&=branch_regs[j].dirty;
8861               }
8862             }
8863             j--;
8864           }
8865         }
8866       }
8867     }
8868
8869     // Count cycles in between branches
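     // Note: ccadj[i] is the cycle count accumulated since the last branch or
     // exception boundary; cc is reset below so each segment's adjustment
     // stays relative to that boundary.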
8870     ccadj[i]=cc;
8871     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
8872     {
8873       cc=0;
8874     }
8875 #if !defined(DRC_DBG)
8876     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
8877     {
8878       // GTE runs in parallel until accessed, divide by 2 for a rough guess
8879       cc+=gte_cycletab[source[i]&0x3f]/2;
8880     }
8881     else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // loads/stores cause weird timing issues
8882     {
8883       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
8884     }
8885     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
8886     {
8887       cc+=4;
8888     }
8889     else if(itype[i]==C2LS)
8890     {
8891       cc+=4;
8892     }
8893 #endif
8894     else
8895     {
8896       cc++;
8897     }
8898
8899     flush_dirty_uppers(&current);
8900     if(!is_ds[i]) {
8901       regs[i].is32=current.is32;
8902       regs[i].dirty=current.dirty;
8903       regs[i].isconst=current.isconst;
8904       memcpy(constmap[i],current_constmap,sizeof(current_constmap));
8905     }
8906     for(hr=0;hr<HOST_REGS;hr++) {
8907       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
8908         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
8909           regs[i].wasconst&=~(1<<hr);
8910         }
8911       }
8912     }
8913     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
8914     regs[i].waswritten=current.waswritten;
8915   }
8916
8917   /* Pass 4 - Cull unused host registers */
8918
8919   uint64_t nr=0;
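   // Note: this is a backward liveness scan over host registers. A set bit in
   // nr means the value cached in that host register is still needed by a
   // later instruction, a branch target or the delay slot; registers that do
   // not make it into needed_reg are deallocated below.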
8920
8921   for (i=slen-1;i>=0;i--)
8922   {
8923     int hr;
8924     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8925     {
8926       if(ba[i]<start || ba[i]>=(start+slen*4))
8927       {
8928         // Branch out of this block, don't need anything
8929         nr=0;
8930       }
8931       else
8932       {
8933         // Internal branch
8934         // Need whatever matches the target
8935         nr=0;
8936         int t=(ba[i]-start)>>2;
8937         for(hr=0;hr<HOST_REGS;hr++)
8938         {
8939           if(regs[i].regmap_entry[hr]>=0) {
8940             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
8941           }
8942         }
8943       }
8944       // Conditional branch may need registers for following instructions
8945       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8946       {
8947         if(i<slen-2) {
8948           nr|=needed_reg[i+2];
8949           for(hr=0;hr<HOST_REGS;hr++)
8950           {
8951             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
8952             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
8953           }
8954         }
8955       }
8956       // Don't need stuff which is overwritten
8957       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8958       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8959       // Merge in delay slot
8960       for(hr=0;hr<HOST_REGS;hr++)
8961       {
8962         if(!likely[i]) {
8963           // These are overwritten unless the branch is "likely"
8964           // and the delay slot is nullified if not taken
8965           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8966           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8967         }
8968         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8969         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8970         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8971         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8972         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8973         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8974         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8975         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8976         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
8977           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8978           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8979         }
8980         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
8981           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8982           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8983         }
8984         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
8985           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8986           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8987         }
8988       }
8989     }
8990     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
8991     {
8992       // SYSCALL instruction (software interrupt)
8993       nr=0;
8994     }
8995     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
8996     {
8997       // ERET instruction (return from interrupt)
8998       nr=0;
8999     }
9000     else // Non-branch
9001     {
9002       if(i<slen-1) {
9003         for(hr=0;hr<HOST_REGS;hr++) {
9004           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9005           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9006           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9007           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9008         }
9009       }
9010     }
9011     for(hr=0;hr<HOST_REGS;hr++)
9012     {
9013       // Overwritten registers are not needed
9014       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9015       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9016       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9017       // Source registers are needed
9018       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9019       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9020       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9021       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9022       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9023       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9024       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9025       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9026       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9027         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9028         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9029       }
9030       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9031         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9032         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9033       }
9034       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
9035         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9036         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9037       }
9038       // Don't store a register immediately after writing it,
9039       // may prevent dual-issue.
9040       // But do so if this is a branch target, otherwise we
9041       // might have to load the register before the branch.
9042       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9043         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9044            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9045           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9046           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9047         }
9048         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9049            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9050           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9051           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9052         }
9053       }
9054     }
9055     // Cycle count is needed at branches.  Assume it is needed at the target too.
9056     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9057       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9058       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9059     }
9060     // Save it
9061     needed_reg[i]=nr;
9062
9063     // Deallocate unneeded registers
9064     for(hr=0;hr<HOST_REGS;hr++)
9065     {
9066       if(!((nr>>hr)&1)) {
9067         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9068         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9069            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9070            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9071         {
9072           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9073           {
9074             if(likely[i]) {
9075               regs[i].regmap[hr]=-1;
9076               regs[i].isconst&=~(1<<hr);
9077               if(i<slen-2) {
9078                 regmap_pre[i+2][hr]=-1;
9079                 regs[i+2].wasconst&=~(1<<hr);
9080               }
9081             }
9082           }
9083         }
9084         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9085         {
9086           int d1=0,d2=0,map=0,temp=0;
9087           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9088           {
9089             d1=dep1[i+1];
9090             d2=dep2[i+1];
9091           }
9092           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
9093              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9094             map=INVCP;
9095           }
9096           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9097              itype[i+1]==C1LS || itype[i+1]==C2LS)
9098             temp=FTEMP;
9099           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9100              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9101              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9102              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9103              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9104              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9105              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9106              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9107              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9108              regs[i].regmap[hr]!=map )
9109           {
9110             regs[i].regmap[hr]=-1;
9111             regs[i].isconst&=~(1<<hr);
9112             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9113                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9114                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9115                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9116                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9117                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9118                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9119                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9120                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9121                branch_regs[i].regmap[hr]!=map)
9122             {
9123               branch_regs[i].regmap[hr]=-1;
9124               branch_regs[i].regmap_entry[hr]=-1;
9125               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9126               {
9127                 if(!likely[i]&&i<slen-2) {
9128                   regmap_pre[i+2][hr]=-1;
9129                   regs[i+2].wasconst&=~(1<<hr);
9130                 }
9131               }
9132             }
9133           }
9134         }
9135         else
9136         {
9137           // Non-branch
9138           if(i>0)
9139           {
9140             int d1=0,d2=0,map=-1,temp=-1;
9141             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9142             {
9143               d1=dep1[i];
9144               d2=dep2[i];
9145             }
9146             if(itype[i]==STORE || itype[i]==STORELR ||
9147                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9148               map=INVCP;
9149             }
9150             if(itype[i]==LOADLR || itype[i]==STORELR ||
9151                itype[i]==C1LS || itype[i]==C2LS)
9152               temp=FTEMP;
9153             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9154                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9155                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9156                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9157                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9158                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9159             {
9160               if(i<slen-1&&!is_ds[i]) {
9161                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9162                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9163                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9164                 {
9165                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9166                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9167                 }
9168                 regmap_pre[i+1][hr]=-1;
9169                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9170                 regs[i+1].wasconst&=~(1<<hr);
9171               }
9172               regs[i].regmap[hr]=-1;
9173               regs[i].isconst&=~(1<<hr);
9174             }
9175           }
9176         }
9177       }
9178     }
9179   }
9180
9181   /* Pass 5 - Pre-allocate registers */
9182
9183   // If a register is allocated during a loop, try to allocate it for the
9184   // entire loop, if possible.  This avoids loading/storing registers
9185   // inside of the loop.
9186
9187   signed char f_regmap[HOST_REGS];
9188   clear_all_regs(f_regmap);
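   // Note: a branch whose target lies earlier in the block marks a loop.
   // f_regmap collects a candidate mapping which the code below tries to
   // extend from the loop head back down to the branch, keeping values
   // register-resident across iterations instead of reloading them each pass.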
9189   for(i=0;i<slen-1;i++)
9190   {
9191     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9192     {
9193       if(ba[i]>=start && ba[i]<(start+i*4))
9194       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9195       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9196       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9197       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9198       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
9199       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
9200       {
9201         int t=(ba[i]-start)>>2;
9202         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9203         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
9204         for(hr=0;hr<HOST_REGS;hr++)
9205         {
9206           if(regs[i].regmap[hr]>64) {
9207             if(!((regs[i].dirty>>hr)&1))
9208               f_regmap[hr]=regs[i].regmap[hr];
9209             else f_regmap[hr]=-1;
9210           }
9211           else if(regs[i].regmap[hr]>=0) {
9212             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9213               // dealloc old register
9214               int n;
9215               for(n=0;n<HOST_REGS;n++)
9216               {
9217                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9218               }
9219               // and alloc new one
9220               f_regmap[hr]=regs[i].regmap[hr];
9221             }
9222           }
9223           if(branch_regs[i].regmap[hr]>64) {
9224             if(!((branch_regs[i].dirty>>hr)&1))
9225               f_regmap[hr]=branch_regs[i].regmap[hr];
9226             else f_regmap[hr]=-1;
9227           }
9228           else if(branch_regs[i].regmap[hr]>=0) {
9229             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
9230               // dealloc old register
9231               int n;
9232               for(n=0;n<HOST_REGS;n++)
9233               {
9234                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
9235               }
9236               // and alloc new one
9237               f_regmap[hr]=branch_regs[i].regmap[hr];
9238             }
9239           }
9240           if(ooo[i]) {
9241             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
9242               f_regmap[hr]=branch_regs[i].regmap[hr];
9243           }else{
9244             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
9245               f_regmap[hr]=branch_regs[i].regmap[hr];
9246           }
9247           // Avoid dirty->clean transition
9248           #ifdef DESTRUCTIVE_WRITEBACK
9249           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9250           #endif
9251           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
9252           // case above, but it's always a good idea.  We can't hoist the
9253           // load if the register was already allocated, so there's no point
9254           // wasting time analyzing most of these cases.  It only "succeeds"
9255           // when the mapping was different and the load can be replaced with
9256           // a mov, which is of negligible benefit.  So such cases are
9257           // skipped below.
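               // Walk the loop body from the branch target t up to the branch
               // at i: if the candidate guest register r stays needed, is not
               // allocated to a different host register anywhere in between,
               // and keeps a consistent 32/64-bit width, the hr<->r mapping is
               // extended over that range; otherwise we give up on this host
               // register.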
9258           if(f_regmap[hr]>0) {
9259             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
9260               int r=f_regmap[hr];
9261               for(j=t;j<=i;j++)
9262               {
9263                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9264                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9265                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9266                 if(r>63) {
9267                   // NB This can exclude the case where the upper-half
9268                   // register is lower numbered than the lower-half
9269                   // register.  Not sure if it's worth fixing...
9270                   if(get_reg(regs[j].regmap,r&63)<0) break;
9271                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
9272                   if(regs[j].is32&(1LL<<(r&63))) break;
9273                 }
9274                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9275                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9276                   int k;
9277                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9278                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9279                     if(r>63) {
9280                       if(get_reg(regs[i].regmap,r&63)<0) break;
9281                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9282                     }
9283                     k=i;
9284                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9285                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9286                         //printf("no free regs for store %x\n",start+(k-1)*4);
9287                         break;
9288                       }
9289                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9290                         //printf("no-match due to different register\n");
9291                         break;
9292                       }
9293                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9294                         //printf("no-match due to branch\n");
9295                         break;
9296                       }
9297                       // call/ret fast path assumes no registers allocated
9298                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
9299                         break;
9300                       }
9301                       if(r>63) {
9302                         // NB This can exclude the case where the upper-half
9303                         // register is lower numbered than the lower-half
9304                         // register.  Not sure if it's worth fixing...
9305                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9306                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9307                       }
9308                       k--;
9309                     }
9310                     if(i<slen-1) {
9311                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9312                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9313                         //printf("bad match after branch\n");
9314                         break;
9315                       }
9316                     }
9317                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9318                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
9319                       while(k<i) {
9320                         regs[k].regmap_entry[hr]=f_regmap[hr];
9321                         regs[k].regmap[hr]=f_regmap[hr];
9322                         regmap_pre[k+1][hr]=f_regmap[hr];
9323                         regs[k].wasdirty&=~(1<<hr);
9324                         regs[k].dirty&=~(1<<hr);
9325                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9326                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9327                         regs[k].wasconst&=~(1<<hr);
9328                         regs[k].isconst&=~(1<<hr);
9329                         k++;
9330                       }
9331                     }
9332                     else {
9333                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9334                       break;
9335                     }
9336                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9337                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9338                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9339                       regs[i].regmap_entry[hr]=f_regmap[hr];
9340                       regs[i].regmap[hr]=f_regmap[hr];
9341                       regs[i].wasdirty&=~(1<<hr);
9342                       regs[i].dirty&=~(1<<hr);
9343                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9344                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9345                       regs[i].wasconst&=~(1<<hr);
9346                       regs[i].isconst&=~(1<<hr);
9347                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9348                       branch_regs[i].wasdirty&=~(1<<hr);
9349                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9350                       branch_regs[i].regmap[hr]=f_regmap[hr];
9351                       branch_regs[i].dirty&=~(1<<hr);
9352                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9353                       branch_regs[i].wasconst&=~(1<<hr);
9354                       branch_regs[i].isconst&=~(1<<hr);
9355                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9356                         regmap_pre[i+2][hr]=f_regmap[hr];
9357                         regs[i+2].wasdirty&=~(1<<hr);
9358                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9359                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9360                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9361                       }
9362                     }
9363                   }
9364                   for(k=t;k<j;k++) {
9365                     // Alloc register clean at beginning of loop,
9366                     // but may dirty it in pass 6
9367                     regs[k].regmap_entry[hr]=f_regmap[hr];
9368                     regs[k].regmap[hr]=f_regmap[hr];
9369                     regs[k].dirty&=~(1<<hr);
9370                     regs[k].wasconst&=~(1<<hr);
9371                     regs[k].isconst&=~(1<<hr);
9372                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
9373                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
9374                       branch_regs[k].regmap[hr]=f_regmap[hr];
9375                       branch_regs[k].dirty&=~(1<<hr);
9376                       branch_regs[k].wasconst&=~(1<<hr);
9377                       branch_regs[k].isconst&=~(1<<hr);
9378                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
9379                         regmap_pre[k+2][hr]=f_regmap[hr];
9380                         regs[k+2].wasdirty&=~(1<<hr);
9381                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
9382                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
9383                       }
9384                     }
9385                     else
9386                     {
9387                       regmap_pre[k+1][hr]=f_regmap[hr];
9388                       regs[k+1].wasdirty&=~(1<<hr);
9389                     }
9390                   }
9391                   if(regs[j].regmap[hr]==f_regmap[hr])
9392                     regs[j].regmap_entry[hr]=f_regmap[hr];
9393                   break;
9394                 }
9395                 if(j==i) break;
9396                 if(regs[j].regmap[hr]>=0)
9397                   break;
9398                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
9399                   //printf("no-match due to different register\n");
9400                   break;
9401                 }
9402                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
9403                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
9404                   break;
9405                 }
9406                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9407                 {
9408                   // Stop on unconditional branch
9409                   break;
9410                 }
9411                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
9412                 {
9413                   if(ooo[j]) {
9414                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
9415                       break;
9416                   }else{
9417                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
9418                       break;
9419                   }
9420                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
9421                     //printf("no-match due to different register (branch)\n");
9422                     break;
9423                   }
9424                 }
9425                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9426                   //printf("No free regs for store %x\n",start+j*4);
9427                   break;
9428                 }
9429                 if(f_regmap[hr]>=64) {
9430                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
9431                     break;
9432                   }
9433                   else
9434                   {
9435                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
9436                       break;
9437                     }
9438                   }
9439                 }
9440               }
9441             }
9442           }
9443         }
9444       }
9445     }else{
9446       // Non-branch or undetermined branch target
9447       for(hr=0;hr<HOST_REGS;hr++)
9448       {
9449         if(hr!=EXCLUDE_REG) {
9450           if(regs[i].regmap[hr]>64) {
9451             if(!((regs[i].dirty>>hr)&1))
9452               f_regmap[hr]=regs[i].regmap[hr];
9453           }
9454           else if(regs[i].regmap[hr]>=0) {
9455             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9456               // dealloc old register
9457               int n;
9458               for(n=0;n<HOST_REGS;n++)
9459               {
9460                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9461               }
9462               // and alloc new one
9463               f_regmap[hr]=regs[i].regmap[hr];
9464             }
9465           }
9466         }
9467       }
9468       // Try to restore cycle count at branch targets
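           // If this instruction is a branch target, look ahead for the next
           // point where the cycle count (CCREG) is already held in HOST_CCREG;
           // if it is reachable without running out of free registers, keep
           // CCREG allocated across the gap (and extend it backwards too) so
           // the count does not have to be reloaded here.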
9469       if(bt[i]) {
9470         for(j=i;j<slen-1;j++) {
9471           if(regs[j].regmap[HOST_CCREG]!=-1) break;
9472           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9473             //printf("no free regs for store %x\n",start+j*4);
9474             break;
9475           }
9476         }
9477         if(regs[j].regmap[HOST_CCREG]==CCREG) {
9478           int k=i;
9479           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
9480           while(k<j) {
9481             regs[k].regmap_entry[HOST_CCREG]=CCREG;
9482             regs[k].regmap[HOST_CCREG]=CCREG;
9483             regmap_pre[k+1][HOST_CCREG]=CCREG;
9484             regs[k+1].wasdirty|=1<<HOST_CCREG;
9485             regs[k].dirty|=1<<HOST_CCREG;
9486             regs[k].wasconst&=~(1<<HOST_CCREG);
9487             regs[k].isconst&=~(1<<HOST_CCREG);
9488             k++;
9489           }
9490           regs[j].regmap_entry[HOST_CCREG]=CCREG;
9491         }
9492         // Work backwards from the branch target
9493         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9494         {
9495           //printf("Extend backwards\n");
9496           int k;
9497           k=i;
9498           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9499             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9500               //printf("no free regs for store %x\n",start+(k-1)*4);
9501               break;
9502             }
9503             k--;
9504           }
9505           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9506             //printf("Extend CC, %x ->\n",start+k*4);
9507             while(k<=i) {
9508               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9509               regs[k].regmap[HOST_CCREG]=CCREG;
9510               regmap_pre[k+1][HOST_CCREG]=CCREG;
9511               regs[k+1].wasdirty|=1<<HOST_CCREG;
9512               regs[k].dirty|=1<<HOST_CCREG;
9513               regs[k].wasconst&=~(1<<HOST_CCREG);
9514               regs[k].isconst&=~(1<<HOST_CCREG);
9515               k++;
9516             }
9517           }
9518           else {
9519             //printf("Fail Extend CC, %x ->\n",start+k*4);
9520           }
9521         }
9522       }
9523       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9524          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9525          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9526          itype[i]!=FCONV&&itype[i]!=FCOMP)
9527       {
9528         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9529       }
9530     }
9531   }
9532
9533   // Cache the memory offset or TLB map pointer if a register is available
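       // Score-based heuristic: for each host register, count how many nearby
       // loads/stores (and matching branch targets) could make use of a cached
       // RAM offset (ROREG); the highest-scoring register is reserved for
       // ROREG over that range when it scores more than one point.  When
       // RAM_OFFSET is not defined the block is disabled by the if(0) below.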
9534   #ifndef HOST_IMM_ADDR32
9535   #ifndef RAM_OFFSET
9536   if(0)
9537   #endif
9538   {
9539     int earliest_available[HOST_REGS];
9540     int loop_start[HOST_REGS];
9541     int score[HOST_REGS];
9542     int end[HOST_REGS];
9543     int reg=ROREG;
9544
9545     // Init
9546     for(hr=0;hr<HOST_REGS;hr++) {
9547       score[hr]=0;earliest_available[hr]=0;
9548       loop_start[hr]=MAXBLOCK;
9549     }
9550     for(i=0;i<slen-1;i++)
9551     {
9552       // Can't do anything if no registers are available
9553       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
9554         for(hr=0;hr<HOST_REGS;hr++) {
9555           score[hr]=0;earliest_available[hr]=i+1;
9556           loop_start[hr]=MAXBLOCK;
9557         }
9558       }
9559       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9560         if(!ooo[i]) {
9561           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
9562             for(hr=0;hr<HOST_REGS;hr++) {
9563               score[hr]=0;earliest_available[hr]=i+1;
9564               loop_start[hr]=MAXBLOCK;
9565             }
9566           }
9567         }else{
9568           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
9569             for(hr=0;hr<HOST_REGS;hr++) {
9570               score[hr]=0;earliest_available[hr]=i+1;
9571               loop_start[hr]=MAXBLOCK;
9572             }
9573           }
9574         }
9575       }
9576       // Mark unavailable registers
9577       for(hr=0;hr<HOST_REGS;hr++) {
9578         if(regs[i].regmap[hr]>=0) {
9579           score[hr]=0;earliest_available[hr]=i+1;
9580           loop_start[hr]=MAXBLOCK;
9581         }
9582         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9583           if(branch_regs[i].regmap[hr]>=0) {
9584             score[hr]=0;earliest_available[hr]=i+2;
9585             loop_start[hr]=MAXBLOCK;
9586           }
9587         }
9588       }
9589       // No register allocations after unconditional jumps
9590       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
9591       {
9592         for(hr=0;hr<HOST_REGS;hr++) {
9593           score[hr]=0;earliest_available[hr]=i+2;
9594           loop_start[hr]=MAXBLOCK;
9595         }
9596         i++; // Skip delay slot too
9597         //printf("skip delay slot: %x\n",start+i*4);
9598       }
9599       else
9600       // Possible match
9601       if(itype[i]==LOAD||itype[i]==LOADLR||
9602          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
9603         for(hr=0;hr<HOST_REGS;hr++) {
9604           if(hr!=EXCLUDE_REG) {
9605             end[hr]=i-1;
9606             for(j=i;j<slen-1;j++) {
9607               if(regs[j].regmap[hr]>=0) break;
9608               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9609                 if(branch_regs[j].regmap[hr]>=0) break;
9610                 if(ooo[j]) {
9611                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
9612                 }else{
9613                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
9614                 }
9615               }
9616               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
9617               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9618                 int t=(ba[j]-start)>>2;
9619                 if(t<j&&t>=earliest_available[hr]) {
9620                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
9621                     // Score a point for hoisting loop invariant
9622                     if(t<loop_start[hr]) loop_start[hr]=t;
9623                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
9624                     score[hr]++;
9625                     end[hr]=j;
9626                   }
9627                 }
9628                 else if(t<j) {
9629                   if(regs[t].regmap[hr]==reg) {
9630                     // Score a point if the branch target matches this register
9631                     score[hr]++;
9632                     end[hr]=j;
9633                   }
9634                 }
9635                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
9636                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
9637                   score[hr]++;
9638                   end[hr]=j;
9639                 }
9640               }
9641               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9642               {
9643                 // Stop on unconditional branch
9644                 break;
9645               }
9646               else
9647               if(itype[j]==LOAD||itype[j]==LOADLR||
9648                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
9649                 score[hr]++;
9650                 end[hr]=j;
9651               }
9652             }
9653           }
9654         }
9655         // Find highest score and allocate that register
9656         int maxscore=0;
9657         for(hr=0;hr<HOST_REGS;hr++) {
9658           if(hr!=EXCLUDE_REG) {
9659             if(score[hr]>score[maxscore]) {
9660               maxscore=hr;
9661               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
9662             }
9663           }
9664         }
9665         if(score[maxscore]>1)
9666         {
9667           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
9668           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
9669             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
9670             assert(regs[j].regmap[maxscore]<0);
9671             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
9672             regs[j].regmap[maxscore]=reg;
9673             regs[j].dirty&=~(1<<maxscore);
9674             regs[j].wasconst&=~(1<<maxscore);
9675             regs[j].isconst&=~(1<<maxscore);
9676             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9677               branch_regs[j].regmap[maxscore]=reg;
9678               branch_regs[j].wasdirty&=~(1<<maxscore);
9679               branch_regs[j].dirty&=~(1<<maxscore);
9680               branch_regs[j].wasconst&=~(1<<maxscore);
9681               branch_regs[j].isconst&=~(1<<maxscore);
9682               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
9683                 regmap_pre[j+2][maxscore]=reg;
9684                 regs[j+2].wasdirty&=~(1<<maxscore);
9685               }
9686               // loop optimization (loop_preload)
9687               int t=(ba[j]-start)>>2;
9688               if(t==loop_start[maxscore]) {
9689                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
9690                   regs[t].regmap_entry[maxscore]=reg;
9691               }
9692             }
9693             else
9694             {
9695               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
9696                 regmap_pre[j+1][maxscore]=reg;
9697                 regs[j+1].wasdirty&=~(1<<maxscore);
9698               }
9699             }
9700           }
9701           i=j-1;
9702           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
9703           for(hr=0;hr<HOST_REGS;hr++) {
9704             score[hr]=0;earliest_available[hr]=i+1;
9705             loop_start[hr]=MAXBLOCK;
9706           }
9707         }
9708       }
9709     }
9710   }
9711   #endif
9712
9713   // This allocates registers (if possible) one instruction prior
9714   // to use, which can avoid a load-use penalty on certain CPUs.
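       // Hypothetical example: if instruction i+1 is a load whose base register
       // is not yet cached at instruction i, a suitable host register is
       // claimed one slot early so the base (or a generated address) is ready
       // when the load is assembled, which is what hides the load-use penalty
       // mentioned above.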
9715   for(i=0;i<slen-1;i++)
9716   {
9717     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9718     {
9719       if(!bt[i+1])
9720       {
9721         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
9722            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
9723         {
9724           if(rs1[i+1]) {
9725             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9726             {
9727               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9728               {
9729                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9730                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9731                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9732                 regs[i].isconst&=~(1<<hr);
9733                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9734                 constmap[i][hr]=constmap[i+1][hr];
9735                 regs[i+1].wasdirty&=~(1<<hr);
9736                 regs[i].dirty&=~(1<<hr);
9737               }
9738             }
9739           }
9740           if(rs2[i+1]) {
9741             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
9742             {
9743               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9744               {
9745                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9746                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9747                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9748                 regs[i].isconst&=~(1<<hr);
9749                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9750                 constmap[i][hr]=constmap[i+1][hr];
9751                 regs[i+1].wasdirty&=~(1<<hr);
9752                 regs[i].dirty&=~(1<<hr);
9753               }
9754             }
9755           }
9756           // Preload target address for load instruction (non-constant)
9757           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9758             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9759             {
9760               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9761               {
9762                 regs[i].regmap[hr]=rs1[i+1];
9763                 regmap_pre[i+1][hr]=rs1[i+1];
9764                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9765                 regs[i].isconst&=~(1<<hr);
9766                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9767                 constmap[i][hr]=constmap[i+1][hr];
9768                 regs[i+1].wasdirty&=~(1<<hr);
9769                 regs[i].dirty&=~(1<<hr);
9770               }
9771             }
9772           }
9773           // Load source into target register
9774           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9775             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9776             {
9777               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9778               {
9779                 regs[i].regmap[hr]=rs1[i+1];
9780                 regmap_pre[i+1][hr]=rs1[i+1];
9781                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9782                 regs[i].isconst&=~(1<<hr);
9783                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9784                 constmap[i][hr]=constmap[i+1][hr];
9785                 regs[i+1].wasdirty&=~(1<<hr);
9786                 regs[i].dirty&=~(1<<hr);
9787               }
9788             }
9789           }
9790           // Address for store instruction (non-constant)
9791           if(itype[i+1]==STORE||itype[i+1]==STORELR
9792              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
9793             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9794               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
9795               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9796               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
9797               assert(hr>=0);
9798               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9799               {
9800                 regs[i].regmap[hr]=rs1[i+1];
9801                 regmap_pre[i+1][hr]=rs1[i+1];
9802                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9803                 regs[i].isconst&=~(1<<hr);
9804                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9805                 constmap[i][hr]=constmap[i+1][hr];
9806                 regs[i+1].wasdirty&=~(1<<hr);
9807                 regs[i].dirty&=~(1<<hr);
9808               }
9809             }
9810           }
9811           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
9812             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9813               int nr;
9814               hr=get_reg(regs[i+1].regmap,FTEMP);
9815               assert(hr>=0);
9816               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9817               {
9818                 regs[i].regmap[hr]=rs1[i+1];
9819                 regmap_pre[i+1][hr]=rs1[i+1];
9820                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9821                 regs[i].isconst&=~(1<<hr);
9822                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9823                 constmap[i][hr]=constmap[i+1][hr];
9824                 regs[i+1].wasdirty&=~(1<<hr);
9825                 regs[i].dirty&=~(1<<hr);
9826               }
9827               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9828               {
9829                 // move it to another register
9830                 regs[i+1].regmap[hr]=-1;
9831                 regmap_pre[i+2][hr]=-1;
9832                 regs[i+1].regmap[nr]=FTEMP;
9833                 regmap_pre[i+2][nr]=FTEMP;
9834                 regs[i].regmap[nr]=rs1[i+1];
9835                 regmap_pre[i+1][nr]=rs1[i+1];
9836                 regs[i+1].regmap_entry[nr]=rs1[i+1];
9837                 regs[i].isconst&=~(1<<nr);
9838                 regs[i+1].isconst&=~(1<<nr);
9839                 regs[i].dirty&=~(1<<nr);
9840                 regs[i+1].wasdirty&=~(1<<nr);
9841                 regs[i+1].dirty&=~(1<<nr);
9842                 regs[i+2].wasdirty&=~(1<<nr);
9843               }
9844             }
9845           }
9846           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
9847             if(itype[i+1]==LOAD)
9848               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
9849             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
9850               hr=get_reg(regs[i+1].regmap,FTEMP);
9851             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
9852               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
9853               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9854             }
9855             if(hr>=0&&regs[i].regmap[hr]<0) {
9856               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
9857               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
9858                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
9859                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
9860                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
9861                 regs[i].isconst&=~(1<<hr);
9862                 regs[i+1].wasdirty&=~(1<<hr);
9863                 regs[i].dirty&=~(1<<hr);
9864               }
9865             }
9866           }
9867         }
9868       }
9869     }
9870   }
9871
9872   /* Pass 6 - Optimize clean/dirty state */
9873   clean_registers(0,slen-1,1);
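       // clean_registers() propagates the clean/dirty state of cached guest
       // registers across the block so that write-backs which are not needed
       // can be omitted when the code is assembled.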
9874
9875   /* Pass 7 - Identify 32-bit registers */
9876   for (i=slen-1;i>=0;i--)
9877   {
9878     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9879     {
9880       // Conditional branch
9881       if((source[i]>>16)!=0x1000&&i<slen-2) {
9882         // Mark this address as a branch target since it may be called
9883         // upon return from interrupt
9884         bt[i+2]=1;
9885       }
9886     }
9887   }
9888
9889   if(itype[slen-1]==SPAN) {
9890     bt[slen-1]=1; // Mark as a branch target so the instruction can restart after an exception
9891   }
9892
9893 #ifdef DISASM
9894   /* Debug/disassembly */
9895   for(i=0;i<slen;i++)
9896   {
9897     printf("U:");
9898     int r;
9899     for(r=1;r<=CCREG;r++) {
9900       if((unneeded_reg[i]>>r)&1) {
9901         if(r==HIREG) printf(" HI");
9902         else if(r==LOREG) printf(" LO");
9903         else printf(" r%d",r);
9904       }
9905     }
9906     printf("\n");
9907     #if defined(__i386__) || defined(__x86_64__)
9908     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
9909     #endif
9910     #ifdef __arm__
9911     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
9912     #endif
9913     printf("needs: ");
9914     if(needed_reg[i]&1) printf("eax ");
9915     if((needed_reg[i]>>1)&1) printf("ecx ");
9916     if((needed_reg[i]>>2)&1) printf("edx ");
9917     if((needed_reg[i]>>3)&1) printf("ebx ");
9918     if((needed_reg[i]>>5)&1) printf("ebp ");
9919     if((needed_reg[i]>>6)&1) printf("esi ");
9920     if((needed_reg[i]>>7)&1) printf("edi ");
9921     printf("\n");
9922     #if defined(__i386__) || defined(__x86_64__)
9923     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
9924     printf("dirty: ");
9925     if(regs[i].wasdirty&1) printf("eax ");
9926     if((regs[i].wasdirty>>1)&1) printf("ecx ");
9927     if((regs[i].wasdirty>>2)&1) printf("edx ");
9928     if((regs[i].wasdirty>>3)&1) printf("ebx ");
9929     if((regs[i].wasdirty>>5)&1) printf("ebp ");
9930     if((regs[i].wasdirty>>6)&1) printf("esi ");
9931     if((regs[i].wasdirty>>7)&1) printf("edi ");
9932     #endif
9933     #ifdef __arm__
9934     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
9935     printf("dirty: ");
9936     if(regs[i].wasdirty&1) printf("r0 ");
9937     if((regs[i].wasdirty>>1)&1) printf("r1 ");
9938     if((regs[i].wasdirty>>2)&1) printf("r2 ");
9939     if((regs[i].wasdirty>>3)&1) printf("r3 ");
9940     if((regs[i].wasdirty>>4)&1) printf("r4 ");
9941     if((regs[i].wasdirty>>5)&1) printf("r5 ");
9942     if((regs[i].wasdirty>>6)&1) printf("r6 ");
9943     if((regs[i].wasdirty>>7)&1) printf("r7 ");
9944     if((regs[i].wasdirty>>8)&1) printf("r8 ");
9945     if((regs[i].wasdirty>>9)&1) printf("r9 ");
9946     if((regs[i].wasdirty>>10)&1) printf("r10 ");
9947     if((regs[i].wasdirty>>12)&1) printf("r12 ");
9948     #endif
9949     printf("\n");
9950     disassemble_inst(i);
9951     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
9952     #if defined(__i386__) || defined(__x86_64__)
9953     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
9954     if(regs[i].dirty&1) printf("eax ");
9955     if((regs[i].dirty>>1)&1) printf("ecx ");
9956     if((regs[i].dirty>>2)&1) printf("edx ");
9957     if((regs[i].dirty>>3)&1) printf("ebx ");
9958     if((regs[i].dirty>>5)&1) printf("ebp ");
9959     if((regs[i].dirty>>6)&1) printf("esi ");
9960     if((regs[i].dirty>>7)&1) printf("edi ");
9961     #endif
9962     #ifdef __arm__
9963     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
9964     if(regs[i].dirty&1) printf("r0 ");
9965     if((regs[i].dirty>>1)&1) printf("r1 ");
9966     if((regs[i].dirty>>2)&1) printf("r2 ");
9967     if((regs[i].dirty>>3)&1) printf("r3 ");
9968     if((regs[i].dirty>>4)&1) printf("r4 ");
9969     if((regs[i].dirty>>5)&1) printf("r5 ");
9970     if((regs[i].dirty>>6)&1) printf("r6 ");
9971     if((regs[i].dirty>>7)&1) printf("r7 ");
9972     if((regs[i].dirty>>8)&1) printf("r8 ");
9973     if((regs[i].dirty>>9)&1) printf("r9 ");
9974     if((regs[i].dirty>>10)&1) printf("r10 ");
9975     if((regs[i].dirty>>12)&1) printf("r12 ");
9976     #endif
9977     printf("\n");
9978     if(regs[i].isconst) {
9979       printf("constants: ");
9980       #if defined(__i386__) || defined(__x86_64__)
9981       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
9982       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
9983       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
9984       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
9985       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
9986       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
9987       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
9988       #endif
9989       #ifdef __arm__
9990       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
9991       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
9992       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
9993       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
9994       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
9995       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
9996       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
9997       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
9998       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
9999       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
10000       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
10001       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
10002       #endif
10003       printf("\n");
10004     }
10005     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10006       #if defined(__i386__) || defined(__x86_64__)
10007       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
10008       if(branch_regs[i].dirty&1) printf("eax ");
10009       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
10010       if((branch_regs[i].dirty>>2)&1) printf("edx ");
10011       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
10012       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
10013       if((branch_regs[i].dirty>>6)&1) printf("esi ");
10014       if((branch_regs[i].dirty>>7)&1) printf("edi ");
10015       #endif
10016       #ifdef __arm__
10017       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
10018       if(branch_regs[i].dirty&1) printf("r0 ");
10019       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
10020       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
10021       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
10022       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
10023       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
10024       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
10025       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
10026       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
10027       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
10028       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
10029       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
10030       #endif
10031     }
10032   }
10033 #endif // DISASM
10034
10035   /* Pass 8 - Assembly */
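        // For each source instruction this loop writes back or invalidates
        // stale cached registers, records the entry point in instr_addr[],
        // loads the registers the instruction (and any delay slot) needs, and
        // then dispatches on itype[] to the matching *_assemble() routine.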
10036   linkcount=0;stubcount=0;
10037   ds=0;is_delayslot=0;
10038   cop1_usable=0;
10039   uint64_t is32_pre=0;
10040   u_int dirty_pre=0;
10041   void *beginning=start_block();
10042   if((u_int)addr&1) {
10043     ds=1;
10044     pagespan_ds();
10045   }
10046   u_int instr_addr0_override=0;
10047
10048   if (start == 0x80030000) {
10049     // nasty hack for the fastbios thing:
10050     // override the block entry point with this code
10051     instr_addr0_override=(u_int)out;
10052     emit_movimm(start,0);
10053     // abuse io address var as a flag that we
10054     // have already returned here once
10055     emit_readword((int)&address,1);
10056     emit_writeword(0,(int)&pcaddr);
10057     emit_writeword(0,(int)&address);
10058     emit_cmp(0,1);
10059     emit_jne((int)new_dyna_leave);
10060   }
10061   for(i=0;i<slen;i++)
10062   {
10063     //if(ds) printf("ds: ");
10064     disassemble_inst(i);
10065     if(ds) {
10066       ds=0; // Skip delay slot
10067       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
10068       instr_addr[i]=0;
10069     } else {
10070       speculate_register_values(i);
10071       #ifndef DESTRUCTIVE_WRITEBACK
10072       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10073       {
10074         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
10075               unneeded_reg[i],unneeded_reg_upper[i]);
10076       }
10077       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
10078         is32_pre=branch_regs[i].is32;
10079         dirty_pre=branch_regs[i].dirty;
10080       }else{
10081         is32_pre=regs[i].is32;
10082         dirty_pre=regs[i].dirty;
10083       }
10084       #endif
10085       // write back
10086       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10087       {
10088         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
10089                       unneeded_reg[i],unneeded_reg_upper[i]);
10090         loop_preload(regmap_pre[i],regs[i].regmap_entry);
10091       }
10092       // branch target entry point
10093       instr_addr[i]=(u_int)out;
10094       assem_debug("<->\n");
10095       // load regs
10096       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
10097         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
10098       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
10099       address_generation(i,&regs[i],regs[i].regmap_entry);
10100       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
10101       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10102       {
10103         // Load the delay slot registers if necessary
10104         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
10105           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10106         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
10107           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10108         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
10109           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10110       }
10111       else if(i+1<slen)
10112       {
10113         // Preload registers for following instruction
10114         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10115           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
10116             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10117         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10118           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
10119             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10120       }
10121       // TODO: if(is_ooo(i)) address_generation(i+1);
10122       if(itype[i]==CJUMP||itype[i]==FJUMP)
10123         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
10124       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
10125         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10126       if(bt[i]) cop1_usable=0;
10127       // assemble
10128       switch(itype[i]) {
10129         case ALU:
10130           alu_assemble(i,&regs[i]);break;
10131         case IMM16:
10132           imm16_assemble(i,&regs[i]);break;
10133         case SHIFT:
10134           shift_assemble(i,&regs[i]);break;
10135         case SHIFTIMM:
10136           shiftimm_assemble(i,&regs[i]);break;
10137         case LOAD:
10138           load_assemble(i,&regs[i]);break;
10139         case LOADLR:
10140           loadlr_assemble(i,&regs[i]);break;
10141         case STORE:
10142           store_assemble(i,&regs[i]);break;
10143         case STORELR:
10144           storelr_assemble(i,&regs[i]);break;
10145         case COP0:
10146           cop0_assemble(i,&regs[i]);break;
10147         case COP1:
10148           cop1_assemble(i,&regs[i]);break;
10149         case C1LS:
10150           c1ls_assemble(i,&regs[i]);break;
10151         case COP2:
10152           cop2_assemble(i,&regs[i]);break;
10153         case C2LS:
10154           c2ls_assemble(i,&regs[i]);break;
10155         case C2OP:
10156           c2op_assemble(i,&regs[i]);break;
10157         case FCONV:
10158           fconv_assemble(i,&regs[i]);break;
10159         case FLOAT:
10160           float_assemble(i,&regs[i]);break;
10161         case FCOMP:
10162           fcomp_assemble(i,&regs[i]);break;
10163         case MULTDIV:
10164           multdiv_assemble(i,&regs[i]);break;
10165         case MOV:
10166           mov_assemble(i,&regs[i]);break;
10167         case SYSCALL:
10168           syscall_assemble(i,&regs[i]);break;
10169         case HLECALL:
10170           hlecall_assemble(i,&regs[i]);break;
10171         case INTCALL:
10172           intcall_assemble(i,&regs[i]);break;
10173         case UJUMP:
10174           ujump_assemble(i,&regs[i]);ds=1;break;
10175         case RJUMP:
10176           rjump_assemble(i,&regs[i]);ds=1;break;
10177         case CJUMP:
10178           cjump_assemble(i,&regs[i]);ds=1;break;
10179         case SJUMP:
10180           sjump_assemble(i,&regs[i]);ds=1;break;
10181         case FJUMP:
10182           fjump_assemble(i,&regs[i]);ds=1;break;
10183         case SPAN:
10184           pagespan_assemble(i,&regs[i]);break;
10185       }
10186       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10187         literal_pool(1024);
10188       else
10189         literal_pool_jumpover(256);
10190     }
10191   }
10192   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10193   // If the block did not end with an unconditional branch,
10194   // add a jump to the next instruction.
10195   if(i>1) {
10196     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10197       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10198       assert(i==slen);
10199       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10200         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10201         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10202           emit_loadreg(CCREG,HOST_CCREG);
10203         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10204       }
10205       else if(!likely[i-2])
10206       {
10207         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10208         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10209       }
10210       else
10211       {
10212         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10213         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10214       }
10215       add_to_linker((int)out,start+i*4,0);
10216       emit_jmp(0);
10217     }
10218   }
10219   else
10220   {
10221     assert(i>0);
10222     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10223     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10224     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10225       emit_loadreg(CCREG,HOST_CCREG);
10226     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10227     add_to_linker((int)out,start+i*4,0);
10228     emit_jmp(0);
10229   }
10230
10231   // TODO: delay slot stubs?
10232   // Stubs
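        // Out-of-line slow paths recorded during assembly (memory access
        // handlers, cycle-count checks, invalidation checks, FPU-unusable
        // exceptions, unaligned stores) are emitted here, after the main body
        // of the block.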
10233   for(i=0;i<stubcount;i++)
10234   {
10235     switch(stubs[i][0])
10236     {
10237       case LOADB_STUB:
10238       case LOADH_STUB:
10239       case LOADW_STUB:
10240       case LOADD_STUB:
10241       case LOADBU_STUB:
10242       case LOADHU_STUB:
10243         do_readstub(i);break;
10244       case STOREB_STUB:
10245       case STOREH_STUB:
10246       case STOREW_STUB:
10247       case STORED_STUB:
10248         do_writestub(i);break;
10249       case CC_STUB:
10250         do_ccstub(i);break;
10251       case INVCODE_STUB:
10252         do_invstub(i);break;
10253       case FP_STUB:
10254         do_cop1stub(i);break;
10255       case STORELR_STUB:
10256         do_unalignedwritestub(i);break;
10257     }
10258   }
10259
10260   if (instr_addr0_override)
10261     instr_addr[0] = instr_addr0_override;
10262
10263   /* Pass 9 - Linker */
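        // Branches recorded by add_to_linker are resolved here.  Internal
        // targets are patched straight to instr_addr[target]; external targets
        // always get an emit_extjump stub, and if the destination block already
        // exists (check_addr) the branch is patched to jump to it directly and
        // the stub is registered with add_link for later unlinking.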
10264   for(i=0;i<linkcount;i++)
10265   {
10266     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10267     literal_pool(64);
10268     if(!link_addr[i][2])
10269     {
10270       void *stub=out;
10271       void *addr=check_addr(link_addr[i][1]);
10272       emit_extjump(link_addr[i][0],link_addr[i][1]);
10273       if(addr) {
10274         set_jump_target(link_addr[i][0],(int)addr);
10275         add_link(link_addr[i][1],stub);
10276       }
10277       else set_jump_target(link_addr[i][0],(int)stub);
10278     }
10279     else
10280     {
10281       // Internal branch
10282       int target=(link_addr[i][1]-start)>>2;
10283       assert(target>=0&&target<slen);
10284       assert(instr_addr[target]);
10285       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10286       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
10287       //#else
10288       set_jump_target(link_addr[i][0],instr_addr[target]);
10289       //#endif
10290     }
10291   }
10292   // External Branch Targets (jump_in)
10293   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
10294   for(i=0;i<slen;i++)
10295   {
10296     if(bt[i]||i==0)
10297     {
10298       if(instr_addr[i]) // TODO - delay slots (=null)
10299       {
10300         u_int vaddr=start+i*4;
10301         u_int page=get_page(vaddr);
10302         u_int vpage=get_vpage(vaddr);
10303         literal_pool(256);
10304         {
10305           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10306           assem_debug("jump_in: %x\n",start+i*4);
10307           ll_add(jump_dirty+vpage,vaddr,(void *)out);
10308           int entry_point=do_dirty_stub(i);
10309           ll_add_flags(jump_in+page,vaddr,state_rflags,(void *)entry_point);
10310           // If there was an existing entry in the hash table,
10311           // replace it with the new address.
10312           // Don't add new entries.  We'll insert the
10313           // ones that actually get used in check_addr().
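                // Each hash bin holds two (vaddr, entry point) pairs:
                // ht_bin[0]/ht_bin[1] and ht_bin[2]/ht_bin[3].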
10314           u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
10315           if(ht_bin[0]==vaddr) {
10316             ht_bin[1]=entry_point;
10317           }
10318           if(ht_bin[2]==vaddr) {
10319             ht_bin[3]=entry_point;
10320           }
10321         }
10322       }
10323     }
10324   }
10325   // Write out the literal pool if necessary
10326   literal_pool(0);
10327   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10328   // Align code
10329   if(((u_int)out)&7) emit_addnop(13);
10330   #endif
10331   assert((u_int)out-(u_int)beginning<MAX_OUTPUT_BLOCK_SIZE);
10332   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
10333   memcpy(copy,source,slen*4);
10334   copy+=slen*4;
10335
10336   end_block(beginning);
10337
10338   // If we're within 256K of the end of the buffer,
10339   // start over from the beginning. (Is 256K enough?)
10340   if((u_int)out>(u_int)BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
10341
10342   // Trap writes to any of the pages we compiled
10343   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
10344     invalid_code[i]=0;
10345   }
10346   inv_code_start=inv_code_end=~0;
10347
10348   // for PCSX we need to mark all mirrors too
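        // (the same physical RAM is visible at 0x00000000, 0x80000000 and
        // 0xa0000000, i.e. the KUSEG/KSEG0/KSEG1 mirrors, so all three copies
        // of invalid_code are cleared)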
10349   if(get_page(start)<(RAM_SIZE>>12))
10350     for(i=start>>12;i<=(start+slen*4)>>12;i++)
10351       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
10352       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
10353       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
10354
10355   /* Pass 10 - Free memory by expiring oldest blocks */
10356
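        // The translation cache is reclaimed incrementally: expirep is a 16-bit
        // cursor that sweeps region by region ahead of the output pointer.  Its
        // low bits select a page of the jump_in/jump_dirty/jump_out lists and
        // the hash table, and bits 11-12 select which of the four cleanup
        // phases below runs, so old blocks are unlinked before their memory is
        // reused.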
10357   int end=((((int)out-(int)BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
10358   while(expirep!=end)
10359   {
10360     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
10361     int base=(int)BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
10362     inv_debug("EXP: Phase %d\n",expirep);
10363     switch((expirep>>11)&3)
10364     {
10365       case 0:
10366         // Clear jump_in and jump_dirty
10367         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
10368         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
10369         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
10370         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
10371         break;
10372       case 1:
10373         // Clear pointers
10374         ll_kill_pointers(jump_out[expirep&2047],base,shift);
10375         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
10376         break;
10377       case 2:
10378         // Clear hash table
10379         for(i=0;i<32;i++) {
10380           u_int *ht_bin=hash_table[((expirep&2047)<<5)+i];
10381           if((ht_bin[3]>>shift)==(base>>shift) ||
10382              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10383             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
10384             ht_bin[2]=ht_bin[3]=-1;
10385           }
10386           if((ht_bin[1]>>shift)==(base>>shift) ||
10387              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10388             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
10389             ht_bin[0]=ht_bin[2];
10390             ht_bin[1]=ht_bin[3];
10391             ht_bin[2]=ht_bin[3]=-1;
10392           }
10393         }
10394         break;
10395       case 3:
10396         // Clear jump_out
10397         #ifdef __arm__
10398         if((expirep&2047)==0)
10399           do_clear_cache();
10400         #endif
10401         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
10402         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
10403         break;
10404     }
10405     expirep=(expirep+1)&65535;
10406   }
10407   return 0;
10408 }
10409
10410 // vim:shiftwidth=2:expandtab