1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
#include <string.h> // strerror() is used in mprotect_w_x() below
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 int getVMBlock();
36 #endif
37
38 #include "new_dynarec_config.h"
39 #include "backends/psx/emu_if.h" //emulator interface
40
41 //#define DISASM
42 //#define assem_debug printf
43 //#define inv_debug printf
44 #define assem_debug(...)
45 #define inv_debug(...)
46
47 #ifdef __i386__
48 #include "x86/assem_x86.h"
49 #endif
50 #ifdef __x86_64__
51 #include "x64/assem_x64.h"
52 #endif
53 #ifdef __arm__
54 #include "arm/assem_arm.h"
55 #endif
56
57 #ifdef VITA
58 int _newlib_vm_size_user = 1 << TARGET_SIZE_2;
59 #endif
60
61 #define MAXBLOCK 4096
62 #define MAX_OUTPUT_BLOCK_SIZE 262144
63
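// Per-instruction register-allocation state. Fields prefixed with "was"
// describe the state before the instruction, the others the state after it:
//   regmap_entry/regmap - which guest register (1-31 or a pseudo-reg such as
//                         CCREG) each host register holds at entry / after allocation
//   was32/is32          - bit per guest reg whose value is known to fit in 32 bits
//   wasdirty/dirty      - bit per host reg holding a value not yet written back
//   u/uu                - bit per guest reg whose lower/upper 32 bits are dead
//                         (not needed by any later instruction)
//   wasconst/isconst    - bit per host reg holding a known constant (see constmap)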
64 struct regstat
65 {
66   signed char regmap_entry[HOST_REGS];
67   signed char regmap[HOST_REGS];
68   uint64_t was32;
69   uint64_t is32;
70   uint64_t wasdirty;
71   uint64_t dirty;
72   uint64_t u;
73   uint64_t uu;
74   u_int wasconst;
75   u_int isconst;
76   u_int loadedconst;             // host regs that have constants loaded
77   u_int waswritten;              // MIPS regs that were used as store base before
78 };
79
80 // note: asm depends on this layout
81 struct ll_entry
82 {
83   u_int vaddr;
84   u_int reg_sv_flags;
85   void *addr;
86   struct ll_entry *next;
87 };
88
89   // used by asm:
90   u_char *out;
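  // hash_table: 65536 bins indexed by ((vaddr>>16)^vaddr)&0xFFFF; each bin
  // caches two translations as {vaddr0, host_addr0, vaddr1, host_addr1},
  // most recently used pair first.
  // jump_in:    per-page lists of clean blocks that can be entered directly
  // jump_dirty: blocks whose source may have been written; must be re-verified
  // jump_out:   jumps in other blocks that link into this page, kept so they
  //             can be unlinked when the page is invalidated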
91   u_int hash_table[65536][4]  __attribute__((aligned(16)));
92   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
93   struct ll_entry *jump_dirty[4096];
94
95   static struct ll_entry *jump_out[4096];
96   static u_int start;
97   static u_int *source;
98   static char insn[MAXBLOCK][10];
99   static u_char itype[MAXBLOCK];
100   static u_char opcode[MAXBLOCK];
101   static u_char opcode2[MAXBLOCK];
102   static u_char bt[MAXBLOCK];
103   static u_char rs1[MAXBLOCK];
104   static u_char rs2[MAXBLOCK];
105   static u_char rt1[MAXBLOCK];
106   static u_char rt2[MAXBLOCK];
107   static u_char us1[MAXBLOCK];
108   static u_char us2[MAXBLOCK];
109   static u_char dep1[MAXBLOCK];
110   static u_char dep2[MAXBLOCK];
111   static u_char lt1[MAXBLOCK];
112   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
113   static uint64_t gte_rt[MAXBLOCK];
114   static uint64_t gte_unneeded[MAXBLOCK];
115   static u_int smrv[32]; // speculated MIPS register values
116   static u_int smrv_strong; // mask of regs that are likely to have correct values
117   static u_int smrv_weak; // same, but somewhat less likely
118   static u_int smrv_strong_next; // same, but after current insn executes
119   static u_int smrv_weak_next;
120   static int imm[MAXBLOCK];
121   static u_int ba[MAXBLOCK];
122   static char likely[MAXBLOCK];
123   static char is_ds[MAXBLOCK];
124   static char ooo[MAXBLOCK];
125   static uint64_t unneeded_reg[MAXBLOCK];
126   static uint64_t unneeded_reg_upper[MAXBLOCK];
127   static uint64_t branch_unneeded_reg[MAXBLOCK];
128   static uint64_t branch_unneeded_reg_upper[MAXBLOCK];
129   static signed char regmap_pre[MAXBLOCK][HOST_REGS];
130   static uint64_t current_constmap[HOST_REGS];
131   static uint64_t constmap[MAXBLOCK][HOST_REGS];
132   static struct regstat regs[MAXBLOCK];
133   static struct regstat branch_regs[MAXBLOCK];
134   static signed char minimum_free_regs[MAXBLOCK];
135   static u_int needed_reg[MAXBLOCK];
136   static u_int wont_dirty[MAXBLOCK];
137   static u_int will_dirty[MAXBLOCK];
138   static int ccadj[MAXBLOCK];
139   static int slen;
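  // Per-block assembler bookkeeping: instr_addr[i] is the host address emitted
  // for guest instruction i; link_addr records outgoing branches still to be
  // linked (branch location, target vaddr, "external" flag); stubs holds the
  // parameters queued by add_stub() for out-of-line slow paths emitted after
  // the block; literals is the constant pool used by the ARM backend.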
140   static u_int instr_addr[MAXBLOCK];
141   static u_int link_addr[MAXBLOCK][3];
142   static int linkcount;
143   static u_int stubs[MAXBLOCK*3][8];
144   static int stubcount;
145   static u_int literals[1024][2];
146   static int literalcount;
147   static int is_delayslot;
148   static int cop1_usable;
149   static char shadow[1048576]  __attribute__((aligned(16)));
150   static void *copy;
151   static int expirep;
152   static u_int stop_after_jal;
153 #ifndef RAM_FIXED
154   static u_int ram_offset;
155 #else
156   static const u_int ram_offset=0;
157 #endif
158
159   int new_dynarec_hacks;
160   int new_dynarec_did_compile;
161   extern u_char restore_candidate[512];
162   extern int cycle_count;
163
164   /* registers that may be allocated */
165   /* 1-31 gpr */
166 #define HIREG 32 // hi
167 #define LOREG 33 // lo
168 #define FSREG 34 // FPU status (FCSR)
169 #define CSREG 35 // Coprocessor status
170 #define CCREG 36 // Cycle count
171 #define INVCP 37 // Pointer to invalid_code
172 //#define MMREG 38 // Pointer to memory_map
173 #define ROREG 39 // ram offset (if rdram!=0x80000000)
174 #define TEMPREG 40
175 #define FTEMP 40 // FPU temporary register
176 #define PTEMP 41 // Prefetch temporary register
177 //#define TLREG 42 // TLB mapping offset
178 #define RHASH 43 // Return address hash
179 #define RHTBL 44 // Return address hash table address
180 #define RTEMP 45 // JR/JALR address register
181 #define MAXREG 45
182 #define AGEN1 46 // Address generation temporary register
183 //#define AGEN2 47 // Address generation temporary register
184 //#define MGEN1 48 // Maptable address generation temporary register
185 //#define MGEN2 49 // Maptable address generation temporary register
186 #define BTREG 50 // Branch target temporary register
187
188   /* instruction types */
189 #define NOP 0     // No operation
190 #define LOAD 1    // Load
191 #define STORE 2   // Store
192 #define LOADLR 3  // Unaligned load
193 #define STORELR 4 // Unaligned store
194 #define MOV 5     // Move
195 #define ALU 6     // Arithmetic/logic
196 #define MULTDIV 7 // Multiply/divide
197 #define SHIFT 8   // Shift by register
198 #define SHIFTIMM 9// Shift by immediate
199 #define IMM16 10  // 16-bit immediate
200 #define RJUMP 11  // Unconditional jump to register
201 #define UJUMP 12  // Unconditional jump
202 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
203 #define SJUMP 14  // Conditional branch (regimm format)
204 #define COP0 15   // Coprocessor 0
205 #define COP1 16   // Coprocessor 1
206 #define C1LS 17   // Coprocessor 1 load/store
207 #define FJUMP 18  // Conditional branch (floating point)
208 #define FLOAT 19  // Floating point unit
209 #define FCONV 20  // Convert integer to float
210 #define FCOMP 21  // Floating point compare (sets FSREG)
211 #define SYSCALL 22// SYSCALL
212 #define OTHER 23  // Other
213 #define SPAN 24   // Branch/delay slot spans 2 pages
214 #define NI 25     // Not implemented
215 #define HLECALL 26// PCSX fake opcodes for HLE
216 #define COP2 27   // Coprocessor 2 move
217 #define C2LS 28   // Coprocessor 2 load/store
218 #define C2OP 29   // Coprocessor 2 operation
219 #define INTCALL 30// Call interpreter to handle rare corner cases
220
221   /* stubs */
222 #define CC_STUB 1
223 #define FP_STUB 2
224 #define LOADB_STUB 3
225 #define LOADH_STUB 4
226 #define LOADW_STUB 5
227 #define LOADD_STUB 6
228 #define LOADBU_STUB 7
229 #define LOADHU_STUB 8
230 #define STOREB_STUB 9
231 #define STOREH_STUB 10
232 #define STOREW_STUB 11
233 #define STORED_STUB 12
234 #define STORELR_STUB 13
235 #define INVCODE_STUB 14
236
237   /* branch codes */
238 #define TAKEN 1
239 #define NOTTAKEN 2
240 #define NULLDS 3
241
242 // asm linkage
243 int new_recompile_block(int addr);
244 void *get_addr_ht(u_int vaddr);
245 void invalidate_block(u_int block);
246 void invalidate_addr(u_int addr);
247 void remove_hash(int vaddr);
248 void dyna_linker();
249 void dyna_linker_ds();
250 void verify_code();
251 void verify_code_vm();
252 void verify_code_ds();
253 void cc_interrupt();
254 void fp_exception();
255 void fp_exception_ds();
256 void jump_syscall_hle();
257 void jump_hlecall();
258 void jump_intcall();
259 void new_dyna_leave();
260
261 // Needed by assembler
262 static void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
263 static void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
264 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
265 static void load_all_regs(signed char i_regmap[]);
266 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
267 static void load_regs_entry(int t);
268 static void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
269
270 static int verify_dirty(u_int *ptr);
271 static int get_final_value(int hr, int i, int *value);
272 static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e);
273 static void add_to_linker(int addr,int target,int ext);
274
275 static int tracedebug=0;
276
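// On platforms that enforce W^X (NO_WRITE_EXEC), toggle the translation cache
// between writable and executable around code emission.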
277 static void mprotect_w_x(void *start, void *end, int is_x)
278 {
279 #ifdef NO_WRITE_EXEC
280   #if defined(VITA)
281   // *Open* enables write on all memory that was
282   // allocated by sceKernelAllocMemBlockForVM()?
283   if (is_x)
284     sceKernelCloseVMDomain();
285   else
286     sceKernelOpenVMDomain();
287   #else
288   u_long mstart = (u_long)start & ~4095ul;
289   u_long mend = (u_long)end;
290   if (mprotect((void *)mstart, mend - mstart,
291                PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE)) != 0)
292     SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
293   #endif
294 #endif
295 }
296
297 static void start_tcache_write(void *start, void *end)
298 {
299   mprotect_w_x(start, end, 0);
300 }
301
302 static void end_tcache_write(void *start, void *end)
303 {
304 #ifdef __arm__
305   size_t len = (char *)end - (char *)start;
306   #if   defined(__BLACKBERRY_QNX__)
307   msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
308   #elif defined(__MACH__)
309   sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
310   #elif defined(VITA)
311   sceKernelSyncVMDomain(sceBlock, start, len);
312   #elif defined(_3DS)
313   ctr_flush_invalidate_cache();
314   #else
315   __clear_cache(start, end);
316   #endif
317   (void)len;
318 #endif
319
320   mprotect_w_x(start, end, 1);
321 }
322
323 static void *start_block(void)
324 {
325   u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
326   if (end > (u_char *)BASE_ADDR + (1<<TARGET_SIZE_2))
327     end = (u_char *)BASE_ADDR + (1<<TARGET_SIZE_2);
328   start_tcache_write(out, end);
329   return out;
330 }
331
332 static void end_block(void *start)
333 {
334   end_tcache_write(start, out);
335 }
336
337 //#define DEBUG_CYCLE_COUNT 1
338
339 #define NO_CYCLE_PENALTY_THR 12
340
341 int cycle_multiplier; // 100 for 1.0
342
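// Scale a cycle count by cycle_multiplier/100, rounding to nearest:
// s is +1 for non-negative x and -1 for negative x, so adding s*50
// before the division rounds halfway cases away from zero.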
343 static int CLOCK_ADJUST(int x)
344 {
345   int s=(x>>31)|1;
346   return (x * cycle_multiplier + s * 50) / 100;
347 }
348
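// Map a PSX virtual address to an index into the jump_in/jump_out/jump_dirty
// tables: strip the KSEG bits, fold the RAM mirrors in the first 16MB, then
// take the 4K page number; anything above the first 2048 pages (non-RAM,
// e.g. BIOS) is hashed into entries 2048-4095.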
349 static u_int get_page(u_int vaddr)
350 {
351   u_int page=vaddr&~0xe0000000;
352   if (page < 0x1000000)
353     page &= ~0x0e00000; // RAM mirrors
354   page>>=12;
355   if(page>2048) page=2048+(page&2047);
356   return page;
357 }
358
359 // no virtual mem in PCSX
360 static u_int get_vpage(u_int vaddr)
361 {
362   return get_page(vaddr);
363 }
364
365 // Get address from virtual address
366 // This is called from the recompiled JR/JALR instructions
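// Lookup order: the jump_in list for the page (clean, directly runnable),
// then jump_dirty (blocks whose source may have been overwritten - these are
// re-verified with verify_dirty() and restored if still intact), and finally
// new_recompile_block() if nothing matches.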
367 void *get_addr(u_int vaddr)
368 {
369   u_int page=get_page(vaddr);
370   u_int vpage=get_vpage(vaddr);
371   struct ll_entry *head;
372   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
373   head=jump_in[page];
374   while(head!=NULL) {
375     if(head->vaddr==vaddr) {
376   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
377       u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
378       ht_bin[3]=ht_bin[1];
379       ht_bin[2]=ht_bin[0];
380       ht_bin[1]=(u_int)head->addr;
381       ht_bin[0]=vaddr;
382       return head->addr;
383     }
384     head=head->next;
385   }
386   head=jump_dirty[vpage];
387   while(head!=NULL) {
388     if(head->vaddr==vaddr) {
389       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
390       // Don't restore blocks which are about to expire from the cache
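      // (head->addr - out), scaled so the translation-cache size spans the full
      // 32-bit range, roughly measures how far ahead of the output pointer the
      // block lives; requiring about 3/8 of the cache (0x60000000) plus one max
      // block keeps us from resurrecting code the expiry sweep will soon reclaim.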
391       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
392       if(verify_dirty(head->addr)) {
393         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
394         invalid_code[vaddr>>12]=0;
395         inv_code_start=inv_code_end=~0;
396         if(vpage<2048) {
397           restore_candidate[vpage>>3]|=1<<(vpage&7);
398         }
399         else restore_candidate[page>>3]|=1<<(page&7);
400         u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
401         if(ht_bin[0]==vaddr) {
402           ht_bin[1]=(u_int)head->addr; // Replace existing entry
403         }
404         else
405         {
406           ht_bin[3]=ht_bin[1];
407           ht_bin[2]=ht_bin[0];
408           ht_bin[1]=(int)head->addr;
409           ht_bin[0]=vaddr;
410         }
411         return head->addr;
412       }
413     }
414     head=head->next;
415   }
416   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
417   int r=new_recompile_block(vaddr);
418   if(r==0) return get_addr(vaddr);
419   // Execute in unmapped page, generate pagefault exception
420   Status|=2;
421   Cause=(vaddr<<31)|0x8;
422   EPC=(vaddr&1)?vaddr-5:vaddr;
423   BadVAddr=(vaddr&~1);
424   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
425   EntryHi=BadVAddr&0xFFFFE000;
426   return get_addr_ht(0x80000000);
427 }
428 // Look up address in hash table first
429 void *get_addr_ht(u_int vaddr)
430 {
431   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
432   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
433   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
434   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
435   return get_addr(vaddr);
436 }
437
438 void clear_all_regs(signed char regmap[])
439 {
440   int hr;
441   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
442 }
443
444 signed char get_reg(signed char regmap[],int r)
445 {
446   int hr;
447   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
448   return -1;
449 }
450
451 // Find a register that is available for two consecutive cycles
452 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
453 {
454   int hr;
455   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
456   return -1;
457 }
458
459 int count_free_regs(signed char regmap[])
460 {
461   int count=0;
462   int hr;
463   for(hr=0;hr<HOST_REGS;hr++)
464   {
465     if(hr!=EXCLUDE_REG) {
466       if(regmap[hr]<0) count++;
467     }
468   }
469   return count;
470 }
471
472 void dirty_reg(struct regstat *cur,signed char reg)
473 {
474   int hr;
475   if(!reg) return;
476   for (hr=0;hr<HOST_REGS;hr++) {
477     if((cur->regmap[hr]&63)==reg) {
478       cur->dirty|=1<<hr;
479     }
480   }
481 }
482
483 // If we dirty the lower half of a 64 bit register which is now being
484 // sign-extended, we need to dump the upper half.
485 // Note: Do this only after completion of the instruction, because
486 // some instructions may need to read the full 64-bit value even if
487 // overwriting it (eg SLTI, DSRA32).
488 static void flush_dirty_uppers(struct regstat *cur)
489 {
490   int hr,reg;
491   for (hr=0;hr<HOST_REGS;hr++) {
492     if((cur->dirty>>hr)&1) {
493       reg=cur->regmap[hr];
494       if(reg>=64)
495         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
496     }
497   }
498 }
499
500 void set_const(struct regstat *cur,signed char reg,uint64_t value)
501 {
502   int hr;
503   if(!reg) return;
504   for (hr=0;hr<HOST_REGS;hr++) {
505     if(cur->regmap[hr]==reg) {
506       cur->isconst|=1<<hr;
507       current_constmap[hr]=value;
508     }
509     else if((cur->regmap[hr]^64)==reg) {
510       cur->isconst|=1<<hr;
511       current_constmap[hr]=value>>32;
512     }
513   }
514 }
515
516 void clear_const(struct regstat *cur,signed char reg)
517 {
518   int hr;
519   if(!reg) return;
520   for (hr=0;hr<HOST_REGS;hr++) {
521     if((cur->regmap[hr]&63)==reg) {
522       cur->isconst&=~(1<<hr);
523     }
524   }
525 }
526
527 int is_const(struct regstat *cur,signed char reg)
528 {
529   int hr;
530   if(reg<0) return 0;
531   if(!reg) return 1;
532   for (hr=0;hr<HOST_REGS;hr++) {
533     if((cur->regmap[hr]&63)==reg) {
534       return (cur->isconst>>hr)&1;
535     }
536   }
537   return 0;
538 }
539 uint64_t get_const(struct regstat *cur,signed char reg)
540 {
541   int hr;
542   if(!reg) return 0;
543   for (hr=0;hr<HOST_REGS;hr++) {
544     if(cur->regmap[hr]==reg) {
545       return current_constmap[hr];
546     }
547   }
548   SysPrintf("Unknown constant in r%d\n",reg);
549   exit(1);
550 }
551
552 // Least soon needed registers
553 // Look at the next ten instructions and see which registers
554 // will be used.  Try not to reallocate these.
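// lsn() records in hsn[r] the distance (in instructions) to the next use of
// register r within the lookahead window; smaller means "needed sooner", so
// the allocator avoids evicting those registers.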
555 void lsn(u_char hsn[], int i, int *preferred_reg)
556 {
557   int j;
558   int b=-1;
559   for(j=0;j<9;j++)
560   {
561     if(i+j>=slen) {
562       j=slen-i-1;
563       break;
564     }
565     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
566     {
567       // Don't go past an unconditional jump
568       j++;
569       break;
570     }
571   }
572   for(;j>=0;j--)
573   {
574     if(rs1[i+j]) hsn[rs1[i+j]]=j;
575     if(rs2[i+j]) hsn[rs2[i+j]]=j;
576     if(rt1[i+j]) hsn[rt1[i+j]]=j;
577     if(rt2[i+j]) hsn[rt2[i+j]]=j;
578     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
579       // Stores can allocate zero
580       hsn[rs1[i+j]]=j;
581       hsn[rs2[i+j]]=j;
582     }
583     // On some architectures stores need invc_ptr
584     #if defined(HOST_IMM8)
585     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
586       hsn[INVCP]=j;
587     }
588     #endif
589     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
590     {
591       hsn[CCREG]=j;
592       b=j;
593     }
594   }
595   if(b>=0)
596   {
597     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
598     {
599       // Follow first branch
600       int t=(ba[i+b]-start)>>2;
601       j=7-b;if(t+j>=slen) j=slen-t-1;
602       for(;j>=0;j--)
603       {
604         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
605         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
606         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
607         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
608       }
609     }
610     // TODO: preferred register based on backward branch
611   }
612   // Delay slot should preferably not overwrite branch conditions or cycle count
613   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
614     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
615     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
616     hsn[CCREG]=1;
617     // ...or hash tables
618     hsn[RHASH]=1;
619     hsn[RHTBL]=1;
620   }
621   // Coprocessor load/store needs FTEMP, even if not declared
622   if(itype[i]==C1LS||itype[i]==C2LS) {
623     hsn[FTEMP]=0;
624   }
625   // Load L/R also uses FTEMP as a temporary register
626   if(itype[i]==LOADLR) {
627     hsn[FTEMP]=0;
628   }
629   // Also SWL/SWR/SDL/SDR
630   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
631     hsn[FTEMP]=0;
632   }
633   // Don't remove the miniht registers
634   if(itype[i]==UJUMP||itype[i]==RJUMP)
635   {
636     hsn[RHASH]=0;
637     hsn[RHTBL]=0;
638   }
639 }
640
641 // We only want to allocate registers if we're going to use them again soon
642 int needed_again(int r, int i)
643 {
644   int j;
645   int b=-1;
646   int rn=10;
647
648   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
649   {
650     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
651       return 0; // Don't need any registers if exiting the block
652   }
653   for(j=0;j<9;j++)
654   {
655     if(i+j>=slen) {
656       j=slen-i-1;
657       break;
658     }
659     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
660     {
661       // Don't go past an unconditional jump
662       j++;
663       break;
664     }
665     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
666     {
667       break;
668     }
669   }
670   for(;j>=1;j--)
671   {
672     if(rs1[i+j]==r) rn=j;
673     if(rs2[i+j]==r) rn=j;
674     if((unneeded_reg[i+j]>>r)&1) rn=10;
675     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
676     {
677       b=j;
678     }
679   }
680   /*
681   if(b>=0)
682   {
683     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
684     {
685       // Follow first branch
686       int o=rn;
687       int t=(ba[i+b]-start)>>2;
688       j=7-b;if(t+j>=slen) j=slen-t-1;
689       for(;j>=0;j--)
690       {
691         if(!((unneeded_reg[t+j]>>r)&1)) {
692           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
693           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
694         }
695         else rn=o;
696       }
697     }
698   }*/
699   if(rn<10) return 1;
700   (void)b;
701   return 0;
702 }
703
704 // Try to match register allocations at the end of a loop with those
705 // at the beginning
706 int loop_reg(int i, int r, int hr)
707 {
708   int j,k;
709   for(j=0;j<9;j++)
710   {
711     if(i+j>=slen) {
712       j=slen-i-1;
713       break;
714     }
715     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
716     {
717       // Don't go past an unconditional jump
718       j++;
719       break;
720     }
721   }
722   k=0;
723   if(i>0){
724     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
725       k--;
726   }
727   for(;k<j;k++)
728   {
729     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
730     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
731     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
732     {
733       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
734       {
735         int t=(ba[i+k]-start)>>2;
736         int reg=get_reg(regs[t].regmap_entry,r);
737         if(reg>=0) return reg;
738         //reg=get_reg(regs[t+1].regmap_entry,r);
739         //if(reg>=0) return reg;
740       }
741     }
742   }
743   return hr;
744 }
745
746
747 // Allocate every register, preserving source/target regs
748 void alloc_all(struct regstat *cur,int i)
749 {
750   int hr;
751
752   for(hr=0;hr<HOST_REGS;hr++) {
753     if(hr!=EXCLUDE_REG) {
754       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
755          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
756       {
757         cur->regmap[hr]=-1;
758         cur->dirty&=~(1<<hr);
759       }
760       // Don't need zeros
761       if((cur->regmap[hr]&63)==0)
762       {
763         cur->regmap[hr]=-1;
764         cur->dirty&=~(1<<hr);
765       }
766     }
767   }
768 }
769
770 #ifdef __i386__
771 #include "x86/assem_x86.c"
772 #endif
773 #ifdef __x86_64__
774 #include "x64/assem_x64.c"
775 #endif
776 #ifdef __arm__
777 #include "arm/assem_arm.c"
778 #endif
779
780 // Add virtual address mapping to linked list
781 void ll_add(struct ll_entry **head,int vaddr,void *addr)
782 {
783   struct ll_entry *new_entry;
784   new_entry=malloc(sizeof(struct ll_entry));
785   assert(new_entry!=NULL);
786   new_entry->vaddr=vaddr;
787   new_entry->reg_sv_flags=0;
788   new_entry->addr=addr;
789   new_entry->next=*head;
790   *head=new_entry;
791 }
792
793 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
794 {
795   ll_add(head,vaddr,addr);
796   (*head)->reg_sv_flags=reg_sv_flags;
797 }
798
799 // Check if an address is already compiled
800 // but don't return addresses which are about to expire from the cache
801 void *check_addr(u_int vaddr)
802 {
803   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
804   if(ht_bin[0]==vaddr) {
805     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
806       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
807   }
808   if(ht_bin[2]==vaddr) {
809     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
810       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
811   }
812   u_int page=get_page(vaddr);
813   struct ll_entry *head;
814   head=jump_in[page];
815   while(head!=NULL) {
816     if(head->vaddr==vaddr) {
817       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
818         // Update existing entry with current address
819         if(ht_bin[0]==vaddr) {
820           ht_bin[1]=(int)head->addr;
821           return head->addr;
822         }
823         if(ht_bin[2]==vaddr) {
824           ht_bin[3]=(int)head->addr;
825           return head->addr;
826         }
827         // Insert into hash table with low priority.
828         // Don't evict existing entries, as they are probably
829         // addresses that are being accessed frequently.
830         if(ht_bin[0]==-1) {
831           ht_bin[1]=(int)head->addr;
832           ht_bin[0]=vaddr;
833         }else if(ht_bin[2]==-1) {
834           ht_bin[3]=(int)head->addr;
835           ht_bin[2]=vaddr;
836         }
837         return head->addr;
838       }
839     }
840     head=head->next;
841   }
842   return 0;
843 }
844
845 void remove_hash(int vaddr)
846 {
847   //printf("remove hash: %x\n",vaddr);
848   u_int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
849   if(ht_bin[2]==vaddr) {
850     ht_bin[2]=ht_bin[3]=-1;
851   }
852   if(ht_bin[0]==vaddr) {
853     ht_bin[0]=ht_bin[2];
854     ht_bin[1]=ht_bin[3];
855     ht_bin[2]=ht_bin[3]=-1;
856   }
857 }
858
859 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
860 {
861   struct ll_entry *next;
862   while(*head) {
863     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) ||
864        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
865     {
866       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
867       remove_hash((*head)->vaddr);
868       next=(*head)->next;
869       free(*head);
870       *head=next;
871     }
872     else
873     {
874       head=&((*head)->next);
875     }
876   }
877 }
878
879 // Remove all entries from linked list
880 void ll_clear(struct ll_entry **head)
881 {
882   struct ll_entry *cur;
883   struct ll_entry *next;
884   if((cur=*head)) {
885     *head=0;
886     while(cur) {
887       next=cur->next;
888       free(cur);
889       cur=next;
890     }
891   }
892 }
893
894 // Dereference the stored jump pointers and unlink any that point into the given address range
895 static void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
896 {
897   while(head) {
898     int ptr=get_pointer(head->addr);
899     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
900     if(((ptr>>shift)==(addr>>shift)) ||
901        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
902     {
903       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
904       void *host_addr=find_extjump_insn(head->addr);
905       #ifdef __arm__
906         mark_clear_cache(host_addr);
907       #endif
908       set_jump_target((int)host_addr,(int)head->addr);
909     }
910     head=head->next;
911   }
912 }
913
914 // This is called when we write to a compiled block (see do_invstub)
915 void invalidate_page(u_int page)
916 {
917   struct ll_entry *head;
918   struct ll_entry *next;
919   head=jump_in[page];
920   jump_in[page]=0;
921   while(head!=NULL) {
922     inv_debug("INVALIDATE: %x\n",head->vaddr);
923     remove_hash(head->vaddr);
924     next=head->next;
925     free(head);
926     head=next;
927   }
928   head=jump_out[page];
929   jump_out[page]=0;
930   while(head!=NULL) {
931     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
932     void *host_addr=find_extjump_insn(head->addr);
933     #ifdef __arm__
934       mark_clear_cache(host_addr);
935     #endif
936     set_jump_target((int)host_addr,(int)head->addr);
937     next=head->next;
938     free(head);
939     head=next;
940   }
941 }
942
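// Invalidate every RAM page a block touches: 'first'..'last' are page indices
// derived from the block's source bounds, which can extend beyond the page
// containing 'block' when the block crosses a 4K boundary.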
943 static void invalidate_block_range(u_int block, u_int first, u_int last)
944 {
945   u_int page=get_page(block<<12);
946   //printf("first=%d last=%d\n",first,last);
947   invalidate_page(page);
948   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
949   assert(last<page+5);
950   // Invalidate the adjacent pages if a block crosses a 4K boundary
951   while(first<page) {
952     invalidate_page(first);
953     first++;
954   }
955   for(first=page+1;first<last;first++) {
956     invalidate_page(first);
957   }
958   #ifdef __arm__
959     do_clear_cache();
960   #endif
961
962   // Don't trap writes
963   invalid_code[block]=1;
964
965   #ifdef USE_MINI_HT
966   memset(mini_ht,-1,sizeof(mini_ht));
967   #endif
968 }
969
970 void invalidate_block(u_int block)
971 {
972   u_int page=get_page(block<<12);
973   u_int vpage=get_vpage(block<<12);
974   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
975   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
976   u_int first,last;
977   first=last=page;
978   struct ll_entry *head;
979   head=jump_dirty[vpage];
980   //printf("page=%d vpage=%d\n",page,vpage);
981   while(head!=NULL) {
982     u_int start,end;
983     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
984       get_bounds((int)head->addr,&start,&end);
985       //printf("start: %x end: %x\n",start,end);
986       if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE) {
987         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
988           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
989           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
990         }
991       }
992     }
993     head=head->next;
994   }
995   invalidate_block_range(block,first,last);
996 }
997
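// Called on a write to RAM. If no dirty block covers the written address,
// record the surrounding code-free window in inv_code_start/inv_code_end so
// the caller can skip this path for nearby writes; otherwise invalidate the
// range of blocks that do cover it.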
998 void invalidate_addr(u_int addr)
999 {
1000   //static int rhits;
1001   // this check is done by the caller
1002   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
1003   u_int page=get_vpage(addr);
1004   if(page<2048) { // RAM
1005     struct ll_entry *head;
1006     u_int addr_min=~0, addr_max=0;
1007     u_int mask=RAM_SIZE-1;
1008     u_int addr_main=0x80000000|(addr&mask);
1009     int pg1;
1010     inv_code_start=addr_main&~0xfff;
1011     inv_code_end=addr_main|0xfff;
1012     pg1=page;
1013     if (pg1>0) {
1014       // must check previous page too because of spans..
1015       pg1--;
1016       inv_code_start-=0x1000;
1017     }
1018     for(;pg1<=page;pg1++) {
1019       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
1020         u_int start,end;
1021         get_bounds((int)head->addr,&start,&end);
1022         if(ram_offset) {
1023           start-=ram_offset;
1024           end-=ram_offset;
1025         }
1026         if(start<=addr_main&&addr_main<end) {
1027           if(start<addr_min) addr_min=start;
1028           if(end>addr_max) addr_max=end;
1029         }
1030         else if(addr_main<start) {
1031           if(start<inv_code_end)
1032             inv_code_end=start-1;
1033         }
1034         else {
1035           if(end>inv_code_start)
1036             inv_code_start=end;
1037         }
1038       }
1039     }
1040     if (addr_min!=~0) {
1041       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
1042       inv_code_start=inv_code_end=~0;
1043       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
1044       return;
1045     }
1046     else {
1047       inv_code_start=(addr&~mask)|(inv_code_start&mask);
1048       inv_code_end=(addr&~mask)|(inv_code_end&mask);
1049       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
1050       return;
1051     }
1052   }
1053   invalidate_block(addr>>12);
1054 }
1055
1056 // This is called when loading a save state.
1057 // Anything could have changed, so invalidate everything.
1058 void invalidate_all_pages(void)
1059 {
1060   u_int page;
1061   for(page=0;page<4096;page++)
1062     invalidate_page(page);
1063   for(page=0;page<1048576;page++)
1064   {
1065     if(!invalid_code[page])
1066     {
1067       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1068       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1069     }
1070   }
1071
1072 #ifdef USE_MINI_HT
1073   memset(mini_ht,-1,sizeof(mini_ht));
1074 #endif
1075 }
1076
1077 // Add an entry to jump_out after making a link
1078 void add_link(u_int vaddr,void *src)
1079 {
1080   u_int page=get_page(vaddr);
1081   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1082   int *ptr=(int *)(src+4);
1083   assert((*ptr&0x0fff0000)==0x059f0000);
1084   (void)ptr;
1085   ll_add(jump_out+page,vaddr,src);
1086   //int ptr=get_pointer(src);
1087   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1088 }
1089
1090 // If a code block was found to be unmodified (bit was set in
1091 // restore_candidate) and it remains unmodified (bit is clear
1092 // in invalid_code) then move the entries for that 4K page from
1093 // the dirty list to the clean list.
1094 void clean_blocks(u_int page)
1095 {
1096   struct ll_entry *head;
1097   inv_debug("INV: clean_blocks page=%d\n",page);
1098   head=jump_dirty[page];
1099   while(head!=NULL) {
1100     if(!invalid_code[head->vaddr>>12]) {
1101       // Don't restore blocks which are about to expire from the cache
1102       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1103         u_int start,end;
1104         if(verify_dirty(head->addr)) {
1105           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1106           u_int i;
1107           u_int inv=0;
1108           get_bounds((int)head->addr,&start,&end);
1109           if(start-(u_int)rdram<RAM_SIZE) {
1110             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1111               inv|=invalid_code[i];
1112             }
1113           }
1114           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1115             inv=1;
1116           }
1117           if(!inv) {
1118             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1119             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1120               u_int ppage=page;
1121               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1122               //printf("page=%x, addr=%x\n",page,head->vaddr);
1123               //assert(head->vaddr>>12==(page|0x80000));
1124               ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
1125               u_int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1126               if(ht_bin[0]==head->vaddr) {
1127                 ht_bin[1]=(u_int)clean_addr; // Replace existing entry
1128               }
1129               if(ht_bin[2]==head->vaddr) {
1130                 ht_bin[3]=(u_int)clean_addr; // Replace existing entry
1131               }
1132             }
1133           }
1134         }
1135       }
1136     }
1137     head=head->next;
1138   }
1139 }
1140
1141
1142 void mov_alloc(struct regstat *current,int i)
1143 {
1144   // Note: Don't need to actually alloc the source registers
1145   if((~current->is32>>rs1[i])&1) {
1146     //alloc_reg64(current,i,rs1[i]);
1147     alloc_reg64(current,i,rt1[i]);
1148     current->is32&=~(1LL<<rt1[i]);
1149   } else {
1150     //alloc_reg(current,i,rs1[i]);
1151     alloc_reg(current,i,rt1[i]);
1152     current->is32|=(1LL<<rt1[i]);
1153   }
1154   clear_const(current,rs1[i]);
1155   clear_const(current,rt1[i]);
1156   dirty_reg(current,rt1[i]);
1157 }
1158
1159 void shiftimm_alloc(struct regstat *current,int i)
1160 {
1161   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1162   {
1163     if(rt1[i]) {
1164       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1165       else lt1[i]=rs1[i];
1166       alloc_reg(current,i,rt1[i]);
1167       current->is32|=1LL<<rt1[i];
1168       dirty_reg(current,rt1[i]);
1169       if(is_const(current,rs1[i])) {
1170         int v=get_const(current,rs1[i]);
1171         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1172         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1173         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1174       }
1175       else clear_const(current,rt1[i]);
1176     }
1177   }
1178   else
1179   {
1180     clear_const(current,rs1[i]);
1181     clear_const(current,rt1[i]);
1182   }
1183
1184   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1185   {
1186     if(rt1[i]) {
1187       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1188       alloc_reg64(current,i,rt1[i]);
1189       current->is32&=~(1LL<<rt1[i]);
1190       dirty_reg(current,rt1[i]);
1191     }
1192   }
1193   if(opcode2[i]==0x3c) // DSLL32
1194   {
1195     if(rt1[i]) {
1196       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1197       alloc_reg64(current,i,rt1[i]);
1198       current->is32&=~(1LL<<rt1[i]);
1199       dirty_reg(current,rt1[i]);
1200     }
1201   }
1202   if(opcode2[i]==0x3e) // DSRL32
1203   {
1204     if(rt1[i]) {
1205       alloc_reg64(current,i,rs1[i]);
1206       if(imm[i]==32) {
1207         alloc_reg64(current,i,rt1[i]);
1208         current->is32&=~(1LL<<rt1[i]);
1209       } else {
1210         alloc_reg(current,i,rt1[i]);
1211         current->is32|=1LL<<rt1[i];
1212       }
1213       dirty_reg(current,rt1[i]);
1214     }
1215   }
1216   if(opcode2[i]==0x3f) // DSRA32
1217   {
1218     if(rt1[i]) {
1219       alloc_reg64(current,i,rs1[i]);
1220       alloc_reg(current,i,rt1[i]);
1221       current->is32|=1LL<<rt1[i];
1222       dirty_reg(current,rt1[i]);
1223     }
1224   }
1225 }
1226
1227 void shift_alloc(struct regstat *current,int i)
1228 {
1229   if(rt1[i]) {
1230     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1231     {
1232       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1233       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1234       alloc_reg(current,i,rt1[i]);
1235       if(rt1[i]==rs2[i]) {
1236         alloc_reg_temp(current,i,-1);
1237         minimum_free_regs[i]=1;
1238       }
1239       current->is32|=1LL<<rt1[i];
1240     } else { // DSLLV/DSRLV/DSRAV
1241       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1242       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1243       alloc_reg64(current,i,rt1[i]);
1244       current->is32&=~(1LL<<rt1[i]);
1245       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1246       {
1247         alloc_reg_temp(current,i,-1);
1248         minimum_free_regs[i]=1;
1249       }
1250     }
1251     clear_const(current,rs1[i]);
1252     clear_const(current,rs2[i]);
1253     clear_const(current,rt1[i]);
1254     dirty_reg(current,rt1[i]);
1255   }
1256 }
1257
1258 void alu_alloc(struct regstat *current,int i)
1259 {
1260   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1261     if(rt1[i]) {
1262       if(rs1[i]&&rs2[i]) {
1263         alloc_reg(current,i,rs1[i]);
1264         alloc_reg(current,i,rs2[i]);
1265       }
1266       else {
1267         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1268         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1269       }
1270       alloc_reg(current,i,rt1[i]);
1271     }
1272     current->is32|=1LL<<rt1[i];
1273   }
1274   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1275     if(rt1[i]) {
1276       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1277       {
1278         alloc_reg64(current,i,rs1[i]);
1279         alloc_reg64(current,i,rs2[i]);
1280         alloc_reg(current,i,rt1[i]);
1281       } else {
1282         alloc_reg(current,i,rs1[i]);
1283         alloc_reg(current,i,rs2[i]);
1284         alloc_reg(current,i,rt1[i]);
1285       }
1286     }
1287     current->is32|=1LL<<rt1[i];
1288   }
1289   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1290     if(rt1[i]) {
1291       if(rs1[i]&&rs2[i]) {
1292         alloc_reg(current,i,rs1[i]);
1293         alloc_reg(current,i,rs2[i]);
1294       }
1295       else
1296       {
1297         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1298         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1299       }
1300       alloc_reg(current,i,rt1[i]);
1301       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1302       {
1303         if(!((current->uu>>rt1[i])&1)) {
1304           alloc_reg64(current,i,rt1[i]);
1305         }
1306         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1307           if(rs1[i]&&rs2[i]) {
1308             alloc_reg64(current,i,rs1[i]);
1309             alloc_reg64(current,i,rs2[i]);
1310           }
1311           else
1312           {
1313             // Is it really worth it to keep 64-bit values in registers?
1314             #ifdef NATIVE_64BIT
1315             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1316             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1317             #endif
1318           }
1319         }
1320         current->is32&=~(1LL<<rt1[i]);
1321       } else {
1322         current->is32|=1LL<<rt1[i];
1323       }
1324     }
1325   }
1326   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1327     if(rt1[i]) {
1328       if(rs1[i]&&rs2[i]) {
1329         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1330           alloc_reg64(current,i,rs1[i]);
1331           alloc_reg64(current,i,rs2[i]);
1332           alloc_reg64(current,i,rt1[i]);
1333         } else {
1334           alloc_reg(current,i,rs1[i]);
1335           alloc_reg(current,i,rs2[i]);
1336           alloc_reg(current,i,rt1[i]);
1337         }
1338       }
1339       else {
1340         alloc_reg(current,i,rt1[i]);
1341         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1342           // DADD used as move, or zeroing
1343           // If we have a 64-bit source, then make the target 64 bits too
1344           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1345             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1346             alloc_reg64(current,i,rt1[i]);
1347           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1348             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1349             alloc_reg64(current,i,rt1[i]);
1350           }
1351           if(opcode2[i]>=0x2e&&rs2[i]) {
1352             // DSUB used as negation - 64-bit result
1353             // If we have a 32-bit register, extend it to 64 bits
1354             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1355             alloc_reg64(current,i,rt1[i]);
1356           }
1357         }
1358       }
1359       if(rs1[i]&&rs2[i]) {
1360         current->is32&=~(1LL<<rt1[i]);
1361       } else if(rs1[i]) {
1362         current->is32&=~(1LL<<rt1[i]);
1363         if((current->is32>>rs1[i])&1)
1364           current->is32|=1LL<<rt1[i];
1365       } else if(rs2[i]) {
1366         current->is32&=~(1LL<<rt1[i]);
1367         if((current->is32>>rs2[i])&1)
1368           current->is32|=1LL<<rt1[i];
1369       } else {
1370         current->is32|=1LL<<rt1[i];
1371       }
1372     }
1373   }
1374   clear_const(current,rs1[i]);
1375   clear_const(current,rs2[i]);
1376   clear_const(current,rt1[i]);
1377   dirty_reg(current,rt1[i]);
1378 }
1379
1380 void imm16_alloc(struct regstat *current,int i)
1381 {
1382   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1383   else lt1[i]=rs1[i];
1384   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1385   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1386     current->is32&=~(1LL<<rt1[i]);
1387     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1388       // TODO: Could preserve the 32-bit flag if the immediate is zero
1389       alloc_reg64(current,i,rt1[i]);
1390       alloc_reg64(current,i,rs1[i]);
1391     }
1392     clear_const(current,rs1[i]);
1393     clear_const(current,rt1[i]);
1394   }
1395   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1396     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1397     current->is32|=1LL<<rt1[i];
1398     clear_const(current,rs1[i]);
1399     clear_const(current,rt1[i]);
1400   }
1401   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1402     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1403       if(rs1[i]!=rt1[i]) {
1404         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1405         alloc_reg64(current,i,rt1[i]);
1406         current->is32&=~(1LL<<rt1[i]);
1407       }
1408     }
1409     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1410     if(is_const(current,rs1[i])) {
1411       int v=get_const(current,rs1[i]);
1412       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1413       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1414       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1415     }
1416     else clear_const(current,rt1[i]);
1417   }
1418   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1419     if(is_const(current,rs1[i])) {
1420       int v=get_const(current,rs1[i]);
1421       set_const(current,rt1[i],v+imm[i]);
1422     }
1423     else clear_const(current,rt1[i]);
1424     current->is32|=1LL<<rt1[i];
1425   }
1426   else {
1427     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1428     current->is32|=1LL<<rt1[i];
1429   }
1430   dirty_reg(current,rt1[i]);
1431 }
1432
1433 void load_alloc(struct regstat *current,int i)
1434 {
1435   clear_const(current,rt1[i]);
1436   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1437   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1438   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1439   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1440     alloc_reg(current,i,rt1[i]);
1441     assert(get_reg(current->regmap,rt1[i])>=0);
1442     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1443     {
1444       current->is32&=~(1LL<<rt1[i]);
1445       alloc_reg64(current,i,rt1[i]);
1446     }
1447     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1448     {
1449       current->is32&=~(1LL<<rt1[i]);
1450       alloc_reg64(current,i,rt1[i]);
1451       alloc_all(current,i);
1452       alloc_reg64(current,i,FTEMP);
1453       minimum_free_regs[i]=HOST_REGS;
1454     }
1455     else current->is32|=1LL<<rt1[i];
1456     dirty_reg(current,rt1[i]);
1457     // LWL/LWR need a temporary register for the old value
1458     if(opcode[i]==0x22||opcode[i]==0x26)
1459     {
1460       alloc_reg(current,i,FTEMP);
1461       alloc_reg_temp(current,i,-1);
1462       minimum_free_regs[i]=1;
1463     }
1464   }
1465   else
1466   {
1467     // Load to r0 or unneeded register (dummy load)
1468     // but we still need a register to calculate the address
1469     if(opcode[i]==0x22||opcode[i]==0x26)
1470     {
1471       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1472     }
1473     alloc_reg_temp(current,i,-1);
1474     minimum_free_regs[i]=1;
1475     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1476     {
1477       alloc_all(current,i);
1478       alloc_reg64(current,i,FTEMP);
1479       minimum_free_regs[i]=HOST_REGS;
1480     }
1481   }
1482 }
1483
1484 void store_alloc(struct regstat *current,int i)
1485 {
1486   clear_const(current,rs2[i]);
1487   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1488   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1489   alloc_reg(current,i,rs2[i]);
1490   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1491     alloc_reg64(current,i,rs2[i]);
1492     if(rs2[i]) alloc_reg(current,i,FTEMP);
1493   }
1494   #if defined(HOST_IMM8)
1495   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1496   else alloc_reg(current,i,INVCP);
1497   #endif
1498   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1499     alloc_reg(current,i,FTEMP);
1500   }
1501   // We need a temporary register for address generation
1502   alloc_reg_temp(current,i,-1);
1503   minimum_free_regs[i]=1;
1504 }
1505
1506 void c1ls_alloc(struct regstat *current,int i)
1507 {
1508   //clear_const(current,rs1[i]); // FIXME
1509   clear_const(current,rt1[i]);
1510   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1511   alloc_reg(current,i,CSREG); // Status
1512   alloc_reg(current,i,FTEMP);
1513   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1514     alloc_reg64(current,i,FTEMP);
1515   }
1516   #if defined(HOST_IMM8)
1517   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1518   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1519     alloc_reg(current,i,INVCP);
1520   #endif
1521   // We need a temporary register for address generation
1522   alloc_reg_temp(current,i,-1);
1523 }
1524
1525 void c2ls_alloc(struct regstat *current,int i)
1526 {
1527   clear_const(current,rt1[i]);
1528   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1529   alloc_reg(current,i,FTEMP);
1530   #if defined(HOST_IMM8)
1531   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1532   if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1533     alloc_reg(current,i,INVCP);
1534   #endif
1535   // We need a temporary register for address generation
1536   alloc_reg_temp(current,i,-1);
1537   minimum_free_regs[i]=1;
1538 }
1539
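// An architecture backend may define multdiv_alloc as a macro to provide its
// own version; this generic allocator is used otherwise.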
1540 #ifndef multdiv_alloc
1541 void multdiv_alloc(struct regstat *current,int i)
1542 {
1543   //  case 0x18: MULT
1544   //  case 0x19: MULTU
1545   //  case 0x1A: DIV
1546   //  case 0x1B: DIVU
1547   //  case 0x1C: DMULT
1548   //  case 0x1D: DMULTU
1549   //  case 0x1E: DDIV
1550   //  case 0x1F: DDIVU
1551   clear_const(current,rs1[i]);
1552   clear_const(current,rs2[i]);
1553   if(rs1[i]&&rs2[i])
1554   {
1555     if((opcode2[i]&4)==0) // 32-bit
1556     {
1557       current->u&=~(1LL<<HIREG);
1558       current->u&=~(1LL<<LOREG);
1559       alloc_reg(current,i,HIREG);
1560       alloc_reg(current,i,LOREG);
1561       alloc_reg(current,i,rs1[i]);
1562       alloc_reg(current,i,rs2[i]);
1563       current->is32|=1LL<<HIREG;
1564       current->is32|=1LL<<LOREG;
1565       dirty_reg(current,HIREG);
1566       dirty_reg(current,LOREG);
1567     }
1568     else // 64-bit
1569     {
1570       current->u&=~(1LL<<HIREG);
1571       current->u&=~(1LL<<LOREG);
1572       current->uu&=~(1LL<<HIREG);
1573       current->uu&=~(1LL<<LOREG);
1574       alloc_reg64(current,i,HIREG);
1575       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1576       alloc_reg64(current,i,rs1[i]);
1577       alloc_reg64(current,i,rs2[i]);
1578       alloc_all(current,i);
1579       current->is32&=~(1LL<<HIREG);
1580       current->is32&=~(1LL<<LOREG);
1581       dirty_reg(current,HIREG);
1582       dirty_reg(current,LOREG);
1583       minimum_free_regs[i]=HOST_REGS;
1584     }
1585   }
1586   else
1587   {
1588     // Multiply by zero is zero.
1589     // MIPS does not have a divide by zero exception.
1590     // The result is undefined, we return zero.
1591     alloc_reg(current,i,HIREG);
1592     alloc_reg(current,i,LOREG);
1593     current->is32|=1LL<<HIREG;
1594     current->is32|=1LL<<LOREG;
1595     dirty_reg(current,HIREG);
1596     dirty_reg(current,LOREG);
1597   }
1598 }
1599 #endif
1600
1601 void cop0_alloc(struct regstat *current,int i)
1602 {
1603   if(opcode2[i]==0) // MFC0
1604   {
1605     if(rt1[i]) {
1606       clear_const(current,rt1[i]);
1607       alloc_all(current,i);
1608       alloc_reg(current,i,rt1[i]);
1609       current->is32|=1LL<<rt1[i];
1610       dirty_reg(current,rt1[i]);
1611     }
1612   }
1613   else if(opcode2[i]==4) // MTC0
1614   {
1615     if(rs1[i]){
1616       clear_const(current,rs1[i]);
1617       alloc_reg(current,i,rs1[i]);
1618       alloc_all(current,i);
1619     }
1620     else {
1621       alloc_all(current,i); // FIXME: Keep r0
1622       current->u&=~1LL;
1623       alloc_reg(current,i,0);
1624     }
1625   }
1626   else
1627   {
1628     // TLBR/TLBWI/TLBWR/TLBP/ERET
1629     assert(opcode2[i]==0x10);
1630     alloc_all(current,i);
1631   }
1632   minimum_free_regs[i]=HOST_REGS;
1633 }
1634
1635 void cop1_alloc(struct regstat *current,int i)
1636 {
1637   alloc_reg(current,i,CSREG); // Load status
1638   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1639   {
1640     if(rt1[i]){
1641       clear_const(current,rt1[i]);
1642       if(opcode2[i]==1) {
1643         alloc_reg64(current,i,rt1[i]); // DMFC1
1644         current->is32&=~(1LL<<rt1[i]);
1645       }else{
1646         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1647         current->is32|=1LL<<rt1[i];
1648       }
1649       dirty_reg(current,rt1[i]);
1650     }
1651     alloc_reg_temp(current,i,-1);
1652   }
1653   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1654   {
1655     if(rs1[i]){
1656       clear_const(current,rs1[i]);
1657       if(opcode2[i]==5)
1658         alloc_reg64(current,i,rs1[i]); // DMTC1
1659       else
1660         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1661       alloc_reg_temp(current,i,-1);
1662     }
1663     else {
1664       current->u&=~1LL;
1665       alloc_reg(current,i,0);
1666       alloc_reg_temp(current,i,-1);
1667     }
1668   }
1669   minimum_free_regs[i]=1;
1670 }
1671 void fconv_alloc(struct regstat *current,int i)
1672 {
1673   alloc_reg(current,i,CSREG); // Load status
1674   alloc_reg_temp(current,i,-1);
1675   minimum_free_regs[i]=1;
1676 }
1677 void float_alloc(struct regstat *current,int i)
1678 {
1679   alloc_reg(current,i,CSREG); // Load status
1680   alloc_reg_temp(current,i,-1);
1681   minimum_free_regs[i]=1;
1682 }
1683 void c2op_alloc(struct regstat *current,int i)
1684 {
1685   alloc_reg_temp(current,i,-1);
1686 }
1687 void fcomp_alloc(struct regstat *current,int i)
1688 {
1689   alloc_reg(current,i,CSREG); // Load status
1690   alloc_reg(current,i,FSREG); // Load flags
1691   dirty_reg(current,FSREG); // Flag will be modified
1692   alloc_reg_temp(current,i,-1);
1693   minimum_free_regs[i]=1;
1694 }
1695
1696 void syscall_alloc(struct regstat *current,int i)
1697 {
1698   alloc_cc(current,i);
1699   dirty_reg(current,CCREG);
1700   alloc_all(current,i);
1701   minimum_free_regs[i]=HOST_REGS;
1702   current->isconst=0;
1703 }
1704
1705 void delayslot_alloc(struct regstat *current,int i)
1706 {
1707   switch(itype[i])
1708   {
1709     case UJUMP:
1710     case CJUMP:
1711     case SJUMP:
1712     case RJUMP:
1713     case FJUMP:
1714     case SYSCALL:
1715     case HLECALL:
1716     case SPAN:
1717       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1718       SysPrintf("Disabled speculative precompilation\n");
1719       stop_after_jal=1;
1720       break;
1721     case IMM16:
1722       imm16_alloc(current,i);
1723       break;
1724     case LOAD:
1725     case LOADLR:
1726       load_alloc(current,i);
1727       break;
1728     case STORE:
1729     case STORELR:
1730       store_alloc(current,i);
1731       break;
1732     case ALU:
1733       alu_alloc(current,i);
1734       break;
1735     case SHIFT:
1736       shift_alloc(current,i);
1737       break;
1738     case MULTDIV:
1739       multdiv_alloc(current,i);
1740       break;
1741     case SHIFTIMM:
1742       shiftimm_alloc(current,i);
1743       break;
1744     case MOV:
1745       mov_alloc(current,i);
1746       break;
1747     case COP0:
1748       cop0_alloc(current,i);
1749       break;
1750     case COP1:
1751     case COP2:
1752       cop1_alloc(current,i);
1753       break;
1754     case C1LS:
1755       c1ls_alloc(current,i);
1756       break;
1757     case C2LS:
1758       c2ls_alloc(current,i);
1759       break;
1760     case FCONV:
1761       fconv_alloc(current,i);
1762       break;
1763     case FLOAT:
1764       float_alloc(current,i);
1765       break;
1766     case FCOMP:
1767       fcomp_alloc(current,i);
1768       break;
1769     case C2OP:
1770       c2op_alloc(current,i);
1771       break;
1772   }
1773 }
1774
1775 // Special case where a branch and delay slot span two pages in virtual memory
1776 static void pagespan_alloc(struct regstat *current,int i)
1777 {
1778   current->isconst=0;
1779   current->wasconst=0;
1780   regs[i].wasconst=0;
1781   minimum_free_regs[i]=HOST_REGS;
1782   alloc_all(current,i);
1783   alloc_cc(current,i);
1784   dirty_reg(current,CCREG);
1785   if(opcode[i]==3) // JAL
1786   {
1787     alloc_reg(current,i,31);
1788     dirty_reg(current,31);
1789   }
1790   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1791   {
1792     alloc_reg(current,i,rs1[i]);
1793     if (rt1[i]!=0) {
1794       alloc_reg(current,i,rt1[i]);
1795       dirty_reg(current,rt1[i]);
1796     }
1797   }
1798   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1799   {
1800     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1801     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1802     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1803     {
1804       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1805       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1806     }
1807   }
1808   else
1809   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1810   {
1811     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1812     if(!((current->is32>>rs1[i])&1))
1813     {
1814       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1815     }
1816   }
1817   else
1818   if(opcode[i]==0x11) // BC1
1819   {
1820     alloc_reg(current,i,FSREG);
1821     alloc_reg(current,i,CSREG);
1822   }
1823   //else ...
1824 }
1825
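// Queue an out-of-line stub (slow path) to be generated after the main code.
// type selects the stub handler, addr is the branch to patch, retaddr is the
// return point, and a..e are handler-specific arguments.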
1826 static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1827 {
1828   stubs[stubcount][0]=type;
1829   stubs[stubcount][1]=addr;
1830   stubs[stubcount][2]=retaddr;
1831   stubs[stubcount][3]=a;
1832   stubs[stubcount][4]=b;
1833   stubs[stubcount][5]=c;
1834   stubs[stubcount][6]=d;
1835   stubs[stubcount][7]=e;
1836   stubcount++;
1837 }
1838
1839 // Write out a single register
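// A MIPS register may occupy two host regs: regmap entry r holds the low
// 32 bits and r|64 the upper half, hence the &63 match and the r|64 store.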
1840 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1841 {
1842   int hr;
1843   for(hr=0;hr<HOST_REGS;hr++) {
1844     if(hr!=EXCLUDE_REG) {
1845       if((regmap[hr]&63)==r) {
1846         if((dirty>>hr)&1) {
1847           if(regmap[hr]<64) {
1848             emit_storereg(r,hr);
1849           }else{
1850             emit_storereg(r|64,hr);
1851           }
1852         }
1853       }
1854     }
1855   }
1856 }
1857
1858 #if 0
1859 static int mchecksum(void)
1860 {
1861   //if(!tracedebug) return 0;
1862   int i;
1863   int sum=0;
1864   for(i=0;i<2097152;i++) {
1865     unsigned int temp=sum;
1866     sum<<=1;
1867     sum|=(~temp)>>31;
1868     sum^=((u_int *)rdram)[i];
1869   }
1870   return sum;
1871 }
1872
1873 static int rchecksum(void)
1874 {
1875   int i;
1876   int sum=0;
1877   for(i=0;i<64;i++)
1878     sum^=((u_int *)reg)[i];
1879   return sum;
1880 }
1881
1882 static void rlist(void)
1883 {
1884   int i;
1885   printf("TRACE: ");
1886   for(i=0;i<32;i++)
1887     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1888   printf("\n");
1889 }
1890
1891 static void enabletrace(void)
1892 {
1893   tracedebug=1;
1894 }
1895
1896 static void memdebug(int i)
1897 {
1898   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
1899   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
1900   //rlist();
1901   //if(tracedebug) {
1902   //if(Count>=-2084597794) {
1903   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
1904   //if(0) {
1905     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
1906     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
1907     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
1908     rlist();
1909     #ifdef __i386__
1910     printf("TRACE: %x\n",(&i)[-1]);
1911     #endif
1912     #ifdef __arm__
1913     int j;
1914     printf("TRACE: %x \n",(&j)[10]);
1915     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
1916     #endif
1917     //fflush(stdout);
1918   }
1919   //printf("TRACE: %x\n",(&i)[-1]);
1920 }
1921 #endif
1922
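// Assemble R-type ALU operations.  opcode2 selects the group:
// 0x20-0x23 ADD/ADDU/SUB/SUBU, 0x2c-0x2f DADD/DADDU/DSUB/DSUBU,
// 0x2a-0x2b SLT/SLTU, 0x24-0x27 AND/OR/XOR/NOR.
// Writes to r0 are skipped and a zero source register is treated as constant 0.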
1923 void alu_assemble(int i,struct regstat *i_regs)
1924 {
1925   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1926     if(rt1[i]) {
1927       signed char s1,s2,t;
1928       t=get_reg(i_regs->regmap,rt1[i]);
1929       if(t>=0) {
1930         s1=get_reg(i_regs->regmap,rs1[i]);
1931         s2=get_reg(i_regs->regmap,rs2[i]);
1932         if(rs1[i]&&rs2[i]) {
1933           assert(s1>=0);
1934           assert(s2>=0);
1935           if(opcode2[i]&2) emit_sub(s1,s2,t);
1936           else emit_add(s1,s2,t);
1937         }
1938         else if(rs1[i]) {
1939           if(s1>=0) emit_mov(s1,t);
1940           else emit_loadreg(rs1[i],t);
1941         }
1942         else if(rs2[i]) {
1943           if(s2>=0) {
1944             if(opcode2[i]&2) emit_neg(s2,t);
1945             else emit_mov(s2,t);
1946           }
1947           else {
1948             emit_loadreg(rs2[i],t);
1949             if(opcode2[i]&2) emit_neg(t,t);
1950           }
1951         }
1952         else emit_zeroreg(t);
1953       }
1954     }
1955   }
1956   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1957     if(rt1[i]) {
1958       signed char s1l,s2l,s1h,s2h,tl,th;
1959       tl=get_reg(i_regs->regmap,rt1[i]);
1960       th=get_reg(i_regs->regmap,rt1[i]|64);
1961       if(tl>=0) {
1962         s1l=get_reg(i_regs->regmap,rs1[i]);
1963         s2l=get_reg(i_regs->regmap,rs2[i]);
1964         s1h=get_reg(i_regs->regmap,rs1[i]|64);
1965         s2h=get_reg(i_regs->regmap,rs2[i]|64);
1966         if(rs1[i]&&rs2[i]) {
1967           assert(s1l>=0);
1968           assert(s2l>=0);
1969           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
1970           else emit_adds(s1l,s2l,tl);
1971           if(th>=0) {
1972             #ifdef INVERTED_CARRY
1973             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
1974             #else
1975             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
1976             #endif
1977             else emit_add(s1h,s2h,th);
1978           }
1979         }
1980         else if(rs1[i]) {
1981           if(s1l>=0) emit_mov(s1l,tl);
1982           else emit_loadreg(rs1[i],tl);
1983           if(th>=0) {
1984             if(s1h>=0) emit_mov(s1h,th);
1985             else emit_loadreg(rs1[i]|64,th);
1986           }
1987         }
1988         else if(rs2[i]) {
1989           if(s2l>=0) {
1990             if(opcode2[i]&2) emit_negs(s2l,tl);
1991             else emit_mov(s2l,tl);
1992           }
1993           else {
1994             emit_loadreg(rs2[i],tl);
1995             if(opcode2[i]&2) emit_negs(tl,tl);
1996           }
1997           if(th>=0) {
1998             #ifdef INVERTED_CARRY
1999             if(s2h>=0) emit_mov(s2h,th);
2000             else emit_loadreg(rs2[i]|64,th);
2001             if(opcode2[i]&2) {
2002               emit_adcimm(-1,th); // x86 has inverted carry flag
2003               emit_not(th,th);
2004             }
2005             #else
2006             if(opcode2[i]&2) {
2007               if(s2h>=0) emit_rscimm(s2h,0,th);
2008               else {
2009                 emit_loadreg(rs2[i]|64,th);
2010                 emit_rscimm(th,0,th);
2011               }
2012             }else{
2013               if(s2h>=0) emit_mov(s2h,th);
2014               else emit_loadreg(rs2[i]|64,th);
2015             }
2016             #endif
2017           }
2018         }
2019         else {
2020           emit_zeroreg(tl);
2021           if(th>=0) emit_zeroreg(th);
2022         }
2023       }
2024     }
2025   }
2026   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2027     if(rt1[i]) {
2028       signed char s1l,s1h,s2l,s2h,t;
2029       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2030       {
2031         t=get_reg(i_regs->regmap,rt1[i]);
2032         //assert(t>=0);
2033         if(t>=0) {
2034           s1l=get_reg(i_regs->regmap,rs1[i]);
2035           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2036           s2l=get_reg(i_regs->regmap,rs2[i]);
2037           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2038           if(rs2[i]==0) // rx<r0
2039           {
2040             assert(s1h>=0);
2041             if(opcode2[i]==0x2a) // SLT
2042               emit_shrimm(s1h,31,t);
2043             else // SLTU (unsigned cannot be less than zero)
2044               emit_zeroreg(t);
2045           }
2046           else if(rs1[i]==0) // r0<rx
2047           {
2048             assert(s2h>=0);
2049             if(opcode2[i]==0x2a) // SLT
2050               emit_set_gz64_32(s2h,s2l,t);
2051             else // SLTU (set if not zero)
2052               emit_set_nz64_32(s2h,s2l,t);
2053           }
2054           else {
2055             assert(s1l>=0);assert(s1h>=0);
2056             assert(s2l>=0);assert(s2h>=0);
2057             if(opcode2[i]==0x2a) // SLT
2058               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2059             else // SLTU
2060               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2061           }
2062         }
2063       } else {
2064         t=get_reg(i_regs->regmap,rt1[i]);
2065         //assert(t>=0);
2066         if(t>=0) {
2067           s1l=get_reg(i_regs->regmap,rs1[i]);
2068           s2l=get_reg(i_regs->regmap,rs2[i]);
2069           if(rs2[i]==0) // rx<r0
2070           {
2071             assert(s1l>=0);
2072             if(opcode2[i]==0x2a) // SLT
2073               emit_shrimm(s1l,31,t);
2074             else // SLTU (unsigned cannot be less than zero)
2075               emit_zeroreg(t);
2076           }
2077           else if(rs1[i]==0) // r0<rx
2078           {
2079             assert(s2l>=0);
2080             if(opcode2[i]==0x2a) // SLT
2081               emit_set_gz32(s2l,t);
2082             else // SLTU (set if not zero)
2083               emit_set_nz32(s2l,t);
2084           }
2085           else{
2086             assert(s1l>=0);assert(s2l>=0);
2087             if(opcode2[i]==0x2a) // SLT
2088               emit_set_if_less32(s1l,s2l,t);
2089             else // SLTU
2090               emit_set_if_carry32(s1l,s2l,t);
2091           }
2092         }
2093       }
2094     }
2095   }
2096   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2097     if(rt1[i]) {
2098       signed char s1l,s1h,s2l,s2h,th,tl;
2099       tl=get_reg(i_regs->regmap,rt1[i]);
2100       th=get_reg(i_regs->regmap,rt1[i]|64);
2101       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2102       {
2103         assert(tl>=0);
2104         if(tl>=0) {
2105           s1l=get_reg(i_regs->regmap,rs1[i]);
2106           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2107           s2l=get_reg(i_regs->regmap,rs2[i]);
2108           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2109           if(rs1[i]&&rs2[i]) {
2110             assert(s1l>=0);assert(s1h>=0);
2111             assert(s2l>=0);assert(s2h>=0);
2112             if(opcode2[i]==0x24) { // AND
2113               emit_and(s1l,s2l,tl);
2114               emit_and(s1h,s2h,th);
2115             } else
2116             if(opcode2[i]==0x25) { // OR
2117               emit_or(s1l,s2l,tl);
2118               emit_or(s1h,s2h,th);
2119             } else
2120             if(opcode2[i]==0x26) { // XOR
2121               emit_xor(s1l,s2l,tl);
2122               emit_xor(s1h,s2h,th);
2123             } else
2124             if(opcode2[i]==0x27) { // NOR
2125               emit_or(s1l,s2l,tl);
2126               emit_or(s1h,s2h,th);
2127               emit_not(tl,tl);
2128               emit_not(th,th);
2129             }
2130           }
2131           else
2132           {
2133             if(opcode2[i]==0x24) { // AND
2134               emit_zeroreg(tl);
2135               emit_zeroreg(th);
2136             } else
2137             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2138               if(rs1[i]){
2139                 if(s1l>=0) emit_mov(s1l,tl);
2140                 else emit_loadreg(rs1[i],tl);
2141                 if(s1h>=0) emit_mov(s1h,th);
2142                 else emit_loadreg(rs1[i]|64,th);
2143               }
2144               else
2145               if(rs2[i]){
2146                 if(s2l>=0) emit_mov(s2l,tl);
2147                 else emit_loadreg(rs2[i],tl);
2148                 if(s2h>=0) emit_mov(s2h,th);
2149                 else emit_loadreg(rs2[i]|64,th);
2150               }
2151               else{
2152                 emit_zeroreg(tl);
2153                 emit_zeroreg(th);
2154               }
2155             } else
2156             if(opcode2[i]==0x27) { // NOR
2157               if(rs1[i]){
2158                 if(s1l>=0) emit_not(s1l,tl);
2159                 else{
2160                   emit_loadreg(rs1[i],tl);
2161                   emit_not(tl,tl);
2162                 }
2163                 if(s1h>=0) emit_not(s1h,th);
2164                 else{
2165                   emit_loadreg(rs1[i]|64,th);
2166                   emit_not(th,th);
2167                 }
2168               }
2169               else
2170               if(rs2[i]){
2171                 if(s2l>=0) emit_not(s2l,tl);
2172                 else{
2173                   emit_loadreg(rs2[i],tl);
2174                   emit_not(tl,tl);
2175                 }
2176                 if(s2h>=0) emit_not(s2h,th);
2177                 else{
2178                   emit_loadreg(rs2[i]|64,th);
2179                   emit_not(th,th);
2180                 }
2181               }
2182               else {
2183                 emit_movimm(-1,tl);
2184                 emit_movimm(-1,th);
2185               }
2186             }
2187           }
2188         }
2189       }
2190       else
2191       {
2192         // 32 bit
2193         if(tl>=0) {
2194           s1l=get_reg(i_regs->regmap,rs1[i]);
2195           s2l=get_reg(i_regs->regmap,rs2[i]);
2196           if(rs1[i]&&rs2[i]) {
2197             assert(s1l>=0);
2198             assert(s2l>=0);
2199             if(opcode2[i]==0x24) { // AND
2200               emit_and(s1l,s2l,tl);
2201             } else
2202             if(opcode2[i]==0x25) { // OR
2203               emit_or(s1l,s2l,tl);
2204             } else
2205             if(opcode2[i]==0x26) { // XOR
2206               emit_xor(s1l,s2l,tl);
2207             } else
2208             if(opcode2[i]==0x27) { // NOR
2209               emit_or(s1l,s2l,tl);
2210               emit_not(tl,tl);
2211             }
2212           }
2213           else
2214           {
2215             if(opcode2[i]==0x24) { // AND
2216               emit_zeroreg(tl);
2217             } else
2218             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2219               if(rs1[i]){
2220                 if(s1l>=0) emit_mov(s1l,tl);
2221                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2222               }
2223               else
2224               if(rs2[i]){
2225                 if(s2l>=0) emit_mov(s2l,tl);
2226                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2227               }
2228               else emit_zeroreg(tl);
2229             } else
2230             if(opcode2[i]==0x27) { // NOR
2231               if(rs1[i]){
2232                 if(s1l>=0) emit_not(s1l,tl);
2233                 else {
2234                   emit_loadreg(rs1[i],tl);
2235                   emit_not(tl,tl);
2236                 }
2237               }
2238               else
2239               if(rs2[i]){
2240                 if(s2l>=0) emit_not(s2l,tl);
2241                 else {
2242                   emit_loadreg(rs2[i],tl);
2243                   emit_not(tl,tl);
2244                 }
2245               }
2246               else emit_movimm(-1,tl);
2247             }
2248           }
2249         }
2250       }
2251     }
2252   }
2253 }
2254
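// Assemble immediate (I-type) operations: LUI, ADDI/ADDIU, DADDI/DADDIU,
// SLTI/SLTIU and ANDI/ORI/XORI.  For most of these nothing is emitted when
// the destination is already marked constant (isconst), and known-constant
// sources are folded through constmap instead of being recomputed at run time.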
2255 void imm16_assemble(int i,struct regstat *i_regs)
2256 {
2257   if (opcode[i]==0x0f) { // LUI
2258     if(rt1[i]) {
2259       signed char t;
2260       t=get_reg(i_regs->regmap,rt1[i]);
2261       //assert(t>=0);
2262       if(t>=0) {
2263         if(!((i_regs->isconst>>t)&1))
2264           emit_movimm(imm[i]<<16,t);
2265       }
2266     }
2267   }
2268   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2269     if(rt1[i]) {
2270       signed char s,t;
2271       t=get_reg(i_regs->regmap,rt1[i]);
2272       s=get_reg(i_regs->regmap,rs1[i]);
2273       if(rs1[i]) {
2274         //assert(t>=0);
2275         //assert(s>=0);
2276         if(t>=0) {
2277           if(!((i_regs->isconst>>t)&1)) {
2278             if(s<0) {
2279               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2280               emit_addimm(t,imm[i],t);
2281             }else{
2282               if(!((i_regs->wasconst>>s)&1))
2283                 emit_addimm(s,imm[i],t);
2284               else
2285                 emit_movimm(constmap[i][s]+imm[i],t);
2286             }
2287           }
2288         }
2289       } else {
2290         if(t>=0) {
2291           if(!((i_regs->isconst>>t)&1))
2292             emit_movimm(imm[i],t);
2293         }
2294       }
2295     }
2296   }
2297   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2298     if(rt1[i]) {
2299       signed char sh,sl,th,tl;
2300       th=get_reg(i_regs->regmap,rt1[i]|64);
2301       tl=get_reg(i_regs->regmap,rt1[i]);
2302       sh=get_reg(i_regs->regmap,rs1[i]|64);
2303       sl=get_reg(i_regs->regmap,rs1[i]);
2304       if(tl>=0) {
2305         if(rs1[i]) {
2306           assert(sh>=0);
2307           assert(sl>=0);
2308           if(th>=0) {
2309             emit_addimm64_32(sh,sl,imm[i],th,tl);
2310           }
2311           else {
2312             emit_addimm(sl,imm[i],tl);
2313           }
2314         } else {
2315           emit_movimm(imm[i],tl);
2316           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2317         }
2318       }
2319     }
2320   }
2321   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2322     if(rt1[i]) {
2323       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2324       signed char sh,sl,t;
2325       t=get_reg(i_regs->regmap,rt1[i]);
2326       sh=get_reg(i_regs->regmap,rs1[i]|64);
2327       sl=get_reg(i_regs->regmap,rs1[i]);
2328       //assert(t>=0);
2329       if(t>=0) {
2330         if(rs1[i]>0) {
2331           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2332           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2333             if(opcode[i]==0x0a) { // SLTI
2334               if(sl<0) {
2335                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2336                 emit_slti32(t,imm[i],t);
2337               }else{
2338                 emit_slti32(sl,imm[i],t);
2339               }
2340             }
2341             else { // SLTIU
2342               if(sl<0) {
2343                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2344                 emit_sltiu32(t,imm[i],t);
2345               }else{
2346                 emit_sltiu32(sl,imm[i],t);
2347               }
2348             }
2349           }else{ // 64-bit
2350             assert(sl>=0);
2351             if(opcode[i]==0x0a) // SLTI
2352               emit_slti64_32(sh,sl,imm[i],t);
2353             else // SLTIU
2354               emit_sltiu64_32(sh,sl,imm[i],t);
2355           }
2356         }else{
2357           // SLTI(U) with r0 makes little sense,
2358           // but examples can nonetheless be found
2359           if(opcode[i]==0x0a) // SLTI
2360             if(0<imm[i]) emit_movimm(1,t);
2361             else emit_zeroreg(t);
2362           else // SLTIU
2363           {
2364             if(imm[i]) emit_movimm(1,t);
2365             else emit_zeroreg(t);
2366           }
2367         }
2368       }
2369     }
2370   }
2371   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2372     if(rt1[i]) {
2373       signed char sh,sl,th,tl;
2374       th=get_reg(i_regs->regmap,rt1[i]|64);
2375       tl=get_reg(i_regs->regmap,rt1[i]);
2376       sh=get_reg(i_regs->regmap,rs1[i]|64);
2377       sl=get_reg(i_regs->regmap,rs1[i]);
2378       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2379         if(opcode[i]==0x0c) //ANDI
2380         {
2381           if(rs1[i]) {
2382             if(sl<0) {
2383               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2384               emit_andimm(tl,imm[i],tl);
2385             }else{
2386               if(!((i_regs->wasconst>>sl)&1))
2387                 emit_andimm(sl,imm[i],tl);
2388               else
2389                 emit_movimm(constmap[i][sl]&imm[i],tl);
2390             }
2391           }
2392           else
2393             emit_zeroreg(tl);
2394           if(th>=0) emit_zeroreg(th);
2395         }
2396         else
2397         {
2398           if(rs1[i]) {
2399             if(sl<0) {
2400               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2401             }
2402             if(th>=0) {
2403               if(sh<0) {
2404                 emit_loadreg(rs1[i]|64,th);
2405               }else{
2406                 emit_mov(sh,th);
2407               }
2408             }
2409             if(opcode[i]==0x0d) { // ORI
2410               if(sl<0) {
2411                 emit_orimm(tl,imm[i],tl);
2412               }else{
2413                 if(!((i_regs->wasconst>>sl)&1))
2414                   emit_orimm(sl,imm[i],tl);
2415                 else
2416                   emit_movimm(constmap[i][sl]|imm[i],tl);
2417               }
2418             }
2419             if(opcode[i]==0x0e) { // XORI
2420               if(sl<0) {
2421                 emit_xorimm(tl,imm[i],tl);
2422               }else{
2423                 if(!((i_regs->wasconst>>sl)&1))
2424                   emit_xorimm(sl,imm[i],tl);
2425                 else
2426                   emit_movimm(constmap[i][sl]^imm[i],tl);
2427               }
2428             }
2429           }
2430           else {
2431             emit_movimm(imm[i],tl);
2432             if(th>=0) emit_zeroreg(th);
2433           }
2434         }
2435       }
2436     }
2437   }
2438 }
2439
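// Assemble shift-by-immediate operations: SLL/SRL/SRA, the doubleword
// DSLL/DSRL/DSRA, and the DSLL32/DSRL32/DSRA32 forms that shift by 32 or more.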
2440 void shiftimm_assemble(int i,struct regstat *i_regs)
2441 {
2442   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2443   {
2444     if(rt1[i]) {
2445       signed char s,t;
2446       t=get_reg(i_regs->regmap,rt1[i]);
2447       s=get_reg(i_regs->regmap,rs1[i]);
2448       //assert(t>=0);
2449       if(t>=0&&!((i_regs->isconst>>t)&1)){
2450         if(rs1[i]==0)
2451         {
2452           emit_zeroreg(t);
2453         }
2454         else
2455         {
2456           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2457           if(imm[i]) {
2458             if(opcode2[i]==0) // SLL
2459             {
2460               emit_shlimm(s<0?t:s,imm[i],t);
2461             }
2462             if(opcode2[i]==2) // SRL
2463             {
2464               emit_shrimm(s<0?t:s,imm[i],t);
2465             }
2466             if(opcode2[i]==3) // SRA
2467             {
2468               emit_sarimm(s<0?t:s,imm[i],t);
2469             }
2470           }else{
2471             // Shift by zero
2472             if(s>=0 && s!=t) emit_mov(s,t);
2473           }
2474         }
2475       }
2476       //emit_storereg(rt1[i],t); //DEBUG
2477     }
2478   }
2479   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2480   {
2481     if(rt1[i]) {
2482       signed char sh,sl,th,tl;
2483       th=get_reg(i_regs->regmap,rt1[i]|64);
2484       tl=get_reg(i_regs->regmap,rt1[i]);
2485       sh=get_reg(i_regs->regmap,rs1[i]|64);
2486       sl=get_reg(i_regs->regmap,rs1[i]);
2487       if(tl>=0) {
2488         if(rs1[i]==0)
2489         {
2490           emit_zeroreg(tl);
2491           if(th>=0) emit_zeroreg(th);
2492         }
2493         else
2494         {
2495           assert(sl>=0);
2496           assert(sh>=0);
2497           if(imm[i]) {
2498             if(opcode2[i]==0x38) // DSLL
2499             {
2500               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2501               emit_shlimm(sl,imm[i],tl);
2502             }
2503             if(opcode2[i]==0x3a) // DSRL
2504             {
2505               emit_shrdimm(sl,sh,imm[i],tl);
2506               if(th>=0) emit_shrimm(sh,imm[i],th);
2507             }
2508             if(opcode2[i]==0x3b) // DSRA
2509             {
2510               emit_shrdimm(sl,sh,imm[i],tl);
2511               if(th>=0) emit_sarimm(sh,imm[i],th);
2512             }
2513           }else{
2514             // Shift by zero
2515             if(sl!=tl) emit_mov(sl,tl);
2516             if(th>=0&&sh!=th) emit_mov(sh,th);
2517           }
2518         }
2519       }
2520     }
2521   }
2522   if(opcode2[i]==0x3c) // DSLL32
2523   {
2524     if(rt1[i]) {
2525       signed char sl,tl,th;
2526       tl=get_reg(i_regs->regmap,rt1[i]);
2527       th=get_reg(i_regs->regmap,rt1[i]|64);
2528       sl=get_reg(i_regs->regmap,rs1[i]);
2529       if(th>=0||tl>=0){
2530         assert(tl>=0);
2531         assert(th>=0);
2532         assert(sl>=0);
2533         emit_mov(sl,th);
2534         emit_zeroreg(tl);
2535         if(imm[i]>32)
2536         {
2537           emit_shlimm(th,imm[i]&31,th);
2538         }
2539       }
2540     }
2541   }
2542   if(opcode2[i]==0x3e) // DSRL32
2543   {
2544     if(rt1[i]) {
2545       signed char sh,tl,th;
2546       tl=get_reg(i_regs->regmap,rt1[i]);
2547       th=get_reg(i_regs->regmap,rt1[i]|64);
2548       sh=get_reg(i_regs->regmap,rs1[i]|64);
2549       if(tl>=0){
2550         assert(sh>=0);
2551         emit_mov(sh,tl);
2552         if(th>=0) emit_zeroreg(th);
2553         if(imm[i]>32)
2554         {
2555           emit_shrimm(tl,imm[i]&31,tl);
2556         }
2557       }
2558     }
2559   }
2560   if(opcode2[i]==0x3f) // DSRA32
2561   {
2562     if(rt1[i]) {
2563       signed char sh,tl;
2564       tl=get_reg(i_regs->regmap,rt1[i]);
2565       sh=get_reg(i_regs->regmap,rs1[i]|64);
2566       if(tl>=0){
2567         assert(sh>=0);
2568         emit_mov(sh,tl);
2569         if(imm[i]>32)
2570         {
2571           emit_sarimm(tl,imm[i]&31,tl);
2572         }
2573       }
2574     }
2575   }
2576 }
2577
2578 #ifndef shift_assemble
2579 void shift_assemble(int i,struct regstat *i_regs)
2580 {
2581   printf("Need shift_assemble for this architecture.\n");
2582   exit(1);
2583 }
2584 #endif
2585
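// Assemble load instructions (LB/LH/LW/LBU/LHU/LWU/LD).
// A known-constant address that targets RAM is accessed inline; otherwise a
// fast-path range check is emitted and a stub handles I/O and other slow
// cases.  Loads whose result is unused are still performed because the
// target may be a hardware FIFO with read side effects.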
2586 void load_assemble(int i,struct regstat *i_regs)
2587 {
2588   int s,th,tl,addr,map=-1;
2589   int offset;
2590   int jaddr=0;
2591   int memtarget=0,c=0;
2592   int fastload_reg_override=0;
2593   u_int hr,reglist=0;
2594   th=get_reg(i_regs->regmap,rt1[i]|64);
2595   tl=get_reg(i_regs->regmap,rt1[i]);
2596   s=get_reg(i_regs->regmap,rs1[i]);
2597   offset=imm[i];
2598   for(hr=0;hr<HOST_REGS;hr++) {
2599     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2600   }
2601   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2602   if(s>=0) {
2603     c=(i_regs->wasconst>>s)&1;
2604     if (c) {
2605       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2606     }
2607   }
2608   //printf("load_assemble: c=%d\n",c);
2609   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2610   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2611   if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
2612     ||rt1[i]==0) {
2613       // could be a hardware FIFO, so the read must still be performed
2614       // (or a dummy read when rt1[i]==0)
2615       assem_debug("(forced read)\n");
2616       tl=get_reg(i_regs->regmap,-1);
2617       assert(tl>=0);
2618   }
2619   if(offset||s<0||c) addr=tl;
2620   else addr=s;
2621   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2622  if(tl>=0) {
2623   //printf("load_assemble: c=%d\n",c);
2624   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2625   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2626   reglist&=~(1<<tl);
2627   if(th>=0) reglist&=~(1<<th);
2628   if(!c) {
2629     #ifdef RAM_OFFSET
2630     map=get_reg(i_regs->regmap,ROREG);
2631     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2632     #endif
2633     #ifdef R29_HACK
2634     // Strmnnrmn's speed hack
2635     if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2636     #endif
2637     {
2638       jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
2639     }
2640   }
2641   else if(ram_offset&&memtarget) {
2642     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2643     fastload_reg_override=HOST_TEMPREG;
2644   }
2645   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2646   if (opcode[i]==0x20) { // LB
2647     if(!c||memtarget) {
2648       if(!dummy) {
2649         #ifdef HOST_IMM_ADDR32
2650         if(c)
2651           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2652         else
2653         #endif
2654         {
2655           //emit_xorimm(addr,3,tl);
2656           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2657           int x=0,a=tl;
2658 #ifdef BIG_ENDIAN_MIPS
2659           if(!c) emit_xorimm(addr,3,tl);
2660           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2661 #else
2662           if(!c) a=addr;
2663 #endif
2664           if(fastload_reg_override) a=fastload_reg_override;
2665
2666           emit_movsbl_indexed_tlb(x,a,map,tl);
2667         }
2668       }
2669       if(jaddr)
2670         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2671     }
2672     else
2673       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2674   }
2675   if (opcode[i]==0x21) { // LH
2676     if(!c||memtarget) {
2677       if(!dummy) {
2678         #ifdef HOST_IMM_ADDR32
2679         if(c)
2680           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2681         else
2682         #endif
2683         {
2684           int x=0,a=tl;
2685 #ifdef BIG_ENDIAN_MIPS
2686           if(!c) emit_xorimm(addr,2,tl);
2687           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2688 #else
2689           if(!c) a=addr;
2690 #endif
2691           if(fastload_reg_override) a=fastload_reg_override;
2692           //#ifdef
2693           //emit_movswl_indexed_tlb(x,tl,map,tl);
2694           //else
2695           if(map>=0) {
2696             emit_movswl_indexed(x,a,tl);
2697           }else{
2698             #if 1 //def RAM_OFFSET
2699             emit_movswl_indexed(x,a,tl);
2700             #else
2701             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2702             #endif
2703           }
2704         }
2705       }
2706       if(jaddr)
2707         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2708     }
2709     else
2710       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2711   }
2712   if (opcode[i]==0x23) { // LW
2713     if(!c||memtarget) {
2714       if(!dummy) {
2715         int a=addr;
2716         if(fastload_reg_override) a=fastload_reg_override;
2717         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2718         #ifdef HOST_IMM_ADDR32
2719         if(c)
2720           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2721         else
2722         #endif
2723         emit_readword_indexed_tlb(0,a,map,tl);
2724       }
2725       if(jaddr)
2726         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2727     }
2728     else
2729       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2730   }
2731   if (opcode[i]==0x24) { // LBU
2732     if(!c||memtarget) {
2733       if(!dummy) {
2734         #ifdef HOST_IMM_ADDR32
2735         if(c)
2736           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2737         else
2738         #endif
2739         {
2740           //emit_xorimm(addr,3,tl);
2741           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2742           int x=0,a=tl;
2743 #ifdef BIG_ENDIAN_MIPS
2744           if(!c) emit_xorimm(addr,3,tl);
2745           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2746 #else
2747           if(!c) a=addr;
2748 #endif
2749           if(fastload_reg_override) a=fastload_reg_override;
2750
2751           emit_movzbl_indexed_tlb(x,a,map,tl);
2752         }
2753       }
2754       if(jaddr)
2755         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2756     }
2757     else
2758       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2759   }
2760   if (opcode[i]==0x25) { // LHU
2761     if(!c||memtarget) {
2762       if(!dummy) {
2763         #ifdef HOST_IMM_ADDR32
2764         if(c)
2765           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2766         else
2767         #endif
2768         {
2769           int x=0,a=tl;
2770 #ifdef BIG_ENDIAN_MIPS
2771           if(!c) emit_xorimm(addr,2,tl);
2772           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2773 #else
2774           if(!c) a=addr;
2775 #endif
2776           if(fastload_reg_override) a=fastload_reg_override;
2777           //#ifdef
2778           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2779           //#else
2780           if(map>=0) {
2781             emit_movzwl_indexed(x,a,tl);
2782           }else{
2783             #if 1 //def RAM_OFFSET
2784             emit_movzwl_indexed(x,a,tl);
2785             #else
2786             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
2787             #endif
2788           }
2789         }
2790       }
2791       if(jaddr)
2792         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2793     }
2794     else
2795       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2796   }
2797   if (opcode[i]==0x27) { // LWU
2798     assert(th>=0);
2799     if(!c||memtarget) {
2800       if(!dummy) {
2801         int a=addr;
2802         if(fastload_reg_override) a=fastload_reg_override;
2803         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2804         #ifdef HOST_IMM_ADDR32
2805         if(c)
2806           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2807         else
2808         #endif
2809         emit_readword_indexed_tlb(0,a,map,tl);
2810       }
2811       if(jaddr)
2812         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2813     }
2814     else {
2815       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2816     }
2817     emit_zeroreg(th);
2818   }
2819   if (opcode[i]==0x37) { // LD
2820     if(!c||memtarget) {
2821       if(!dummy) {
2822         int a=addr;
2823         if(fastload_reg_override) a=fastload_reg_override;
2824         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
2825         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
2826         #ifdef HOST_IMM_ADDR32
2827         if(c)
2828           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
2829         else
2830         #endif
2831         emit_readdword_indexed_tlb(0,a,map,th,tl);
2832       }
2833       if(jaddr)
2834         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2835     }
2836     else
2837       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2838   }
2839  }
2840   //emit_storereg(rt1[i],tl); // DEBUG
2841   //if(opcode[i]==0x23)
2842   //if(opcode[i]==0x24)
2843   //if(opcode[i]==0x23||opcode[i]==0x24)
2844   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
2845   {
2846     //emit_pusha();
2847     save_regs(0x100f);
2848         emit_readword((int)&last_count,ECX);
2849         #ifdef __i386__
2850         if(get_reg(i_regs->regmap,CCREG)<0)
2851           emit_loadreg(CCREG,HOST_CCREG);
2852         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2853         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2854         emit_writeword(HOST_CCREG,(int)&Count);
2855         #endif
2856         #ifdef __arm__
2857         if(get_reg(i_regs->regmap,CCREG)<0)
2858           emit_loadreg(CCREG,0);
2859         else
2860           emit_mov(HOST_CCREG,0);
2861         emit_add(0,ECX,0);
2862         emit_addimm(0,2*ccadj[i],0);
2863         emit_writeword(0,(int)&Count);
2864         #endif
2865     emit_call((int)memdebug);
2866     //emit_popa();
2867     restore_regs(0x100f);
2868   }*/
2869 }
2870
2871 #ifndef loadlr_assemble
2872 void loadlr_assemble(int i,struct regstat *i_regs)
2873 {
2874   printf("Need loadlr_assemble for this architecture.\n");
2875   exit(1);
2876 }
2877 #endif
2878
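// Assemble store instructions (SB/SH/SW/SD).  Constant RAM addresses are
// written inline, everything else goes through the range check and a stub.
// After the store, invalid_code is checked so that writes into already
// compiled code invalidate the affected block (unless NDHACK_NO_SMC_CHECK
// is set), and a constant store that hits the current block writes the
// state back and exits through do_interrupt.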
2879 void store_assemble(int i,struct regstat *i_regs)
2880 {
2881   int s,th,tl,map=-1;
2882   int addr,temp;
2883   int offset;
2884   int jaddr=0,type;
2885   int memtarget=0,c=0;
2886   int agr=AGEN1+(i&1);
2887   int faststore_reg_override=0;
2888   u_int hr,reglist=0;
2889   th=get_reg(i_regs->regmap,rs2[i]|64);
2890   tl=get_reg(i_regs->regmap,rs2[i]);
2891   s=get_reg(i_regs->regmap,rs1[i]);
2892   temp=get_reg(i_regs->regmap,agr);
2893   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2894   offset=imm[i];
2895   if(s>=0) {
2896     c=(i_regs->wasconst>>s)&1;
2897     if(c) {
2898       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2899     }
2900   }
2901   assert(tl>=0);
2902   assert(temp>=0);
2903   for(hr=0;hr<HOST_REGS;hr++) {
2904     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2905   }
2906   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2907   if(offset||s<0||c) addr=temp;
2908   else addr=s;
2909   if(!c) {
2910     jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
2911   }
2912   else if(ram_offset&&memtarget) {
2913     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2914     faststore_reg_override=HOST_TEMPREG;
2915   }
2916
2917   if (opcode[i]==0x28) { // SB
2918     if(!c||memtarget) {
2919       int x=0,a=temp;
2920 #ifdef BIG_ENDIAN_MIPS
2921       if(!c) emit_xorimm(addr,3,temp);
2922       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2923 #else
2924       if(!c) a=addr;
2925 #endif
2926       if(faststore_reg_override) a=faststore_reg_override;
2927       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
2928       emit_writebyte_indexed_tlb(tl,x,a,map,a);
2929     }
2930     type=STOREB_STUB;
2931   }
2932   if (opcode[i]==0x29) { // SH
2933     if(!c||memtarget) {
2934       int x=0,a=temp;
2935 #ifdef BIG_ENDIAN_MIPS
2936       if(!c) emit_xorimm(addr,2,temp);
2937       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2938 #else
2939       if(!c) a=addr;
2940 #endif
2941       if(faststore_reg_override) a=faststore_reg_override;
2942       //#ifdef
2943       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
2944       //#else
2945       if(map>=0) {
2946         emit_writehword_indexed(tl,x,a);
2947       }else
2948         //emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
2949         emit_writehword_indexed(tl,x,a);
2950     }
2951     type=STOREH_STUB;
2952   }
2953   if (opcode[i]==0x2B) { // SW
2954     if(!c||memtarget) {
2955       int a=addr;
2956       if(faststore_reg_override) a=faststore_reg_override;
2957       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
2958       emit_writeword_indexed_tlb(tl,0,a,map,temp);
2959     }
2960     type=STOREW_STUB;
2961   }
2962   if (opcode[i]==0x3F) { // SD
2963     if(!c||memtarget) {
2964       int a=addr;
2965       if(faststore_reg_override) a=faststore_reg_override;
2966       if(rs2[i]) {
2967         assert(th>=0);
2968         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
2969         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
2970         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
2971       }else{
2972         // Store zero
2973         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
2974         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
2975         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
2976       }
2977     }
2978     type=STORED_STUB;
2979   }
2980   if(jaddr) {
2981     // PCSX store handlers don't check invcode again
2982     reglist|=1<<addr;
2983     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2984     jaddr=0;
2985   }
2986   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
2987     if(!c||memtarget) {
2988       #ifdef DESTRUCTIVE_SHIFT
2989       // The x86 shift operation is 'destructive'; it overwrites the
2990       // source register, so we need to make a copy first and use that.
2991       addr=temp;
2992       #endif
2993       #if defined(HOST_IMM8)
2994       int ir=get_reg(i_regs->regmap,INVCP);
2995       assert(ir>=0);
2996       emit_cmpmem_indexedsr12_reg(ir,addr,1);
2997       #else
2998       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
2999       #endif
3000       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3001       emit_callne(invalidate_addr_reg[addr]);
3002       #else
3003       int jaddr2=(int)out;
3004       emit_jne(0);
3005       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3006       #endif
3007     }
3008   }
3009   u_int addr_val=constmap[i][s]+offset;
3010   if(jaddr) {
3011     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3012   } else if(c&&!memtarget) {
3013     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
3014   }
3015   // basic detection of writes that modify the current block
3016   // not looking backwards, as that code should already be in the MIPS i-cache
3017   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
3018     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
3019     assert(i_regs->regmap==regs[i].regmap); // not delay slot
3020     if(i_regs->regmap==regs[i].regmap) {
3021       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
3022       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
3023       emit_movimm(start+i*4+4,0);
3024       emit_writeword(0,(int)&pcaddr);
3025       emit_jmp((int)do_interrupt);
3026     }
3027   }
3028   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3029   //if(opcode[i]==0x2B || opcode[i]==0x28)
3030   //if(opcode[i]==0x2B || opcode[i]==0x29)
3031   //if(opcode[i]==0x2B)
3032   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3033   {
3034     #ifdef __i386__
3035     emit_pusha();
3036     #endif
3037     #ifdef __arm__
3038     save_regs(0x100f);
3039     #endif
3040         emit_readword((int)&last_count,ECX);
3041         #ifdef __i386__
3042         if(get_reg(i_regs->regmap,CCREG)<0)
3043           emit_loadreg(CCREG,HOST_CCREG);
3044         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3045         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3046         emit_writeword(HOST_CCREG,(int)&Count);
3047         #endif
3048         #ifdef __arm__
3049         if(get_reg(i_regs->regmap,CCREG)<0)
3050           emit_loadreg(CCREG,0);
3051         else
3052           emit_mov(HOST_CCREG,0);
3053         emit_add(0,ECX,0);
3054         emit_addimm(0,2*ccadj[i],0);
3055         emit_writeword(0,(int)&Count);
3056         #endif
3057     emit_call((int)memdebug);
3058     #ifdef __i386__
3059     emit_popa();
3060     #endif
3061     #ifdef __arm__
3062     restore_regs(0x100f);
3063     #endif
3064   }*/
3065 }
3066
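// Assemble the unaligned stores SWL/SWR/SDL/SDR.  The low address bits are
// tested at run time and one of four byte-offset cases is taken, each writing
// only the bytes the MIPS instruction would touch; SDL/SDR then write the
// remaining word of the doubleword separately.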
3067 void storelr_assemble(int i,struct regstat *i_regs)
3068 {
3069   int s,th,tl;
3070   int temp;
3071   int temp2=-1;
3072   int offset;
3073   int jaddr=0;
3074   int case1,case2,case3;
3075   int done0,done1,done2;
3076   int memtarget=0,c=0;
3077   int agr=AGEN1+(i&1);
3078   u_int hr,reglist=0;
3079   th=get_reg(i_regs->regmap,rs2[i]|64);
3080   tl=get_reg(i_regs->regmap,rs2[i]);
3081   s=get_reg(i_regs->regmap,rs1[i]);
3082   temp=get_reg(i_regs->regmap,agr);
3083   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3084   offset=imm[i];
3085   if(s>=0) {
3086     c=(i_regs->isconst>>s)&1;
3087     if(c) {
3088       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3089     }
3090   }
3091   assert(tl>=0);
3092   for(hr=0;hr<HOST_REGS;hr++) {
3093     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3094   }
3095   assert(temp>=0);
3096   if(!c) {
3097     emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3098     if(!offset&&s!=temp) emit_mov(s,temp);
3099     jaddr=(int)out;
3100     emit_jno(0);
3101   }
3102   else
3103   {
3104     if(!memtarget||!rs1[i]) {
3105       jaddr=(int)out;
3106       emit_jmp(0);
3107     }
3108   }
3109   #ifdef RAM_OFFSET
3110   int map=get_reg(i_regs->regmap,ROREG);
3111   if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3112   #else
3113   if((u_int)rdram!=0x80000000)
3114     emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3115   #endif
3116
3117   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3118     temp2=get_reg(i_regs->regmap,FTEMP);
3119     if(!rs2[i]) temp2=th=tl;
3120   }
3121
3122 #ifndef BIG_ENDIAN_MIPS
3123     emit_xorimm(temp,3,temp);
3124 #endif
3125   emit_testimm(temp,2);
3126   case2=(int)out;
3127   emit_jne(0);
3128   emit_testimm(temp,1);
3129   case1=(int)out;
3130   emit_jne(0);
3131   // 0
3132   if (opcode[i]==0x2A) { // SWL
3133     emit_writeword_indexed(tl,0,temp);
3134   }
3135   if (opcode[i]==0x2E) { // SWR
3136     emit_writebyte_indexed(tl,3,temp);
3137   }
3138   if (opcode[i]==0x2C) { // SDL
3139     emit_writeword_indexed(th,0,temp);
3140     if(rs2[i]) emit_mov(tl,temp2);
3141   }
3142   if (opcode[i]==0x2D) { // SDR
3143     emit_writebyte_indexed(tl,3,temp);
3144     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3145   }
3146   done0=(int)out;
3147   emit_jmp(0);
3148   // 1
3149   set_jump_target(case1,(int)out);
3150   if (opcode[i]==0x2A) { // SWL
3151     // Write 3 msb into three least significant bytes
3152     if(rs2[i]) emit_rorimm(tl,8,tl);
3153     emit_writehword_indexed(tl,-1,temp);
3154     if(rs2[i]) emit_rorimm(tl,16,tl);
3155     emit_writebyte_indexed(tl,1,temp);
3156     if(rs2[i]) emit_rorimm(tl,8,tl);
3157   }
3158   if (opcode[i]==0x2E) { // SWR
3159     // Write two lsb into two most significant bytes
3160     emit_writehword_indexed(tl,1,temp);
3161   }
3162   if (opcode[i]==0x2C) { // SDL
3163     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3164     // Write 3 msb into three least significant bytes
3165     if(rs2[i]) emit_rorimm(th,8,th);
3166     emit_writehword_indexed(th,-1,temp);
3167     if(rs2[i]) emit_rorimm(th,16,th);
3168     emit_writebyte_indexed(th,1,temp);
3169     if(rs2[i]) emit_rorimm(th,8,th);
3170   }
3171   if (opcode[i]==0x2D) { // SDR
3172     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3173     // Write two lsb into two most significant bytes
3174     emit_writehword_indexed(tl,1,temp);
3175   }
3176   done1=(int)out;
3177   emit_jmp(0);
3178   // 2
3179   set_jump_target(case2,(int)out);
3180   emit_testimm(temp,1);
3181   case3=(int)out;
3182   emit_jne(0);
3183   if (opcode[i]==0x2A) { // SWL
3184     // Write two msb into two least significant bytes
3185     if(rs2[i]) emit_rorimm(tl,16,tl);
3186     emit_writehword_indexed(tl,-2,temp);
3187     if(rs2[i]) emit_rorimm(tl,16,tl);
3188   }
3189   if (opcode[i]==0x2E) { // SWR
3190     // Write 3 lsb into three most significant bytes
3191     emit_writebyte_indexed(tl,-1,temp);
3192     if(rs2[i]) emit_rorimm(tl,8,tl);
3193     emit_writehword_indexed(tl,0,temp);
3194     if(rs2[i]) emit_rorimm(tl,24,tl);
3195   }
3196   if (opcode[i]==0x2C) { // SDL
3197     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3198     // Write two msb into two least significant bytes
3199     if(rs2[i]) emit_rorimm(th,16,th);
3200     emit_writehword_indexed(th,-2,temp);
3201     if(rs2[i]) emit_rorimm(th,16,th);
3202   }
3203   if (opcode[i]==0x2D) { // SDR
3204     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3205     // Write 3 lsb into three most significant bytes
3206     emit_writebyte_indexed(tl,-1,temp);
3207     if(rs2[i]) emit_rorimm(tl,8,tl);
3208     emit_writehword_indexed(tl,0,temp);
3209     if(rs2[i]) emit_rorimm(tl,24,tl);
3210   }
3211   done2=(int)out;
3212   emit_jmp(0);
3213   // 3
3214   set_jump_target(case3,(int)out);
3215   if (opcode[i]==0x2A) { // SWL
3216     // Write msb into least significant byte
3217     if(rs2[i]) emit_rorimm(tl,24,tl);
3218     emit_writebyte_indexed(tl,-3,temp);
3219     if(rs2[i]) emit_rorimm(tl,8,tl);
3220   }
3221   if (opcode[i]==0x2E) { // SWR
3222     // Write entire word
3223     emit_writeword_indexed(tl,-3,temp);
3224   }
3225   if (opcode[i]==0x2C) { // SDL
3226     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3227     // Write msb into least significant byte
3228     if(rs2[i]) emit_rorimm(th,24,th);
3229     emit_writebyte_indexed(th,-3,temp);
3230     if(rs2[i]) emit_rorimm(th,8,th);
3231   }
3232   if (opcode[i]==0x2D) { // SDR
3233     if(rs2[i]) emit_mov(th,temp2);
3234     // Write entire word
3235     emit_writeword_indexed(tl,-3,temp);
3236   }
3237   set_jump_target(done0,(int)out);
3238   set_jump_target(done1,(int)out);
3239   set_jump_target(done2,(int)out);
3240   if (opcode[i]==0x2C) { // SDL
3241     emit_testimm(temp,4);
3242     done0=(int)out;
3243     emit_jne(0);
3244     emit_andimm(temp,~3,temp);
3245     emit_writeword_indexed(temp2,4,temp);
3246     set_jump_target(done0,(int)out);
3247   }
3248   if (opcode[i]==0x2D) { // SDR
3249     emit_testimm(temp,4);
3250     done0=(int)out;
3251     emit_jeq(0);
3252     emit_andimm(temp,~3,temp);
3253     emit_writeword_indexed(temp2,-4,temp);
3254     set_jump_target(done0,(int)out);
3255   }
3256   if(!c||!memtarget)
3257     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3258   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3259     #ifdef RAM_OFFSET
3260     int map=get_reg(i_regs->regmap,ROREG);
3261     if(map<0) map=HOST_TEMPREG;
3262     gen_orig_addr_w(temp,map);
3263     #else
3264     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3265     #endif
3266     #if defined(HOST_IMM8)
3267     int ir=get_reg(i_regs->regmap,INVCP);
3268     assert(ir>=0);
3269     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3270     #else
3271     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3272     #endif
3273     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3274     emit_callne(invalidate_addr_reg[temp]);
3275     #else
3276     int jaddr2=(int)out;
3277     emit_jne(0);
3278     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3279     #endif
3280   }
3281   /*
3282     emit_pusha();
3283     //save_regs(0x100f);
3284         emit_readword((int)&last_count,ECX);
3285         if(get_reg(i_regs->regmap,CCREG)<0)
3286           emit_loadreg(CCREG,HOST_CCREG);
3287         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3288         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3289         emit_writeword(HOST_CCREG,(int)&Count);
3290     emit_call((int)memdebug);
3291     emit_popa();
3292     //restore_regs(0x100f);
3293   */
3294 }
3295
3296 void c1ls_assemble(int i,struct regstat *i_regs)
3297 {
3298   cop1_unusable(i, i_regs);
3299 }
3300
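// LWC2/SWC2: load or store a GTE (COP2) data register.  The value passes
// through FTEMP; cop2_get_dreg/cop2_put_dreg move it between the host
// register and the GTE register file, and the memory access itself uses the
// same fast path / stub scheme as ordinary loads and stores.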
3301 void c2ls_assemble(int i,struct regstat *i_regs)
3302 {
3303   int s,tl;
3304   int ar;
3305   int offset;
3306   int memtarget=0,c=0;
3307   int jaddr2=0,type;
3308   int agr=AGEN1+(i&1);
3309   int fastio_reg_override=0;
3310   u_int hr,reglist=0;
3311   u_int copr=(source[i]>>16)&0x1f;
3312   s=get_reg(i_regs->regmap,rs1[i]);
3313   tl=get_reg(i_regs->regmap,FTEMP);
3314   offset=imm[i];
3315   assert(rs1[i]>0);
3316   assert(tl>=0);
3317
3318   for(hr=0;hr<HOST_REGS;hr++) {
3319     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3320   }
3321   if(i_regs->regmap[HOST_CCREG]==CCREG)
3322     reglist&=~(1<<HOST_CCREG);
3323
3324   // get the address
3325   if (opcode[i]==0x3a) { // SWC2
3326     ar=get_reg(i_regs->regmap,agr);
3327     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3328     reglist|=1<<ar;
3329   } else { // LWC2
3330     ar=tl;
3331   }
3332   if(s>=0) c=(i_regs->wasconst>>s)&1;
3333   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3334   if (!offset&&!c&&s>=0) ar=s;
3335   assert(ar>=0);
3336
3337   if (opcode[i]==0x3a) { // SWC2
3338     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3339     type=STOREW_STUB;
3340   }
3341   else
3342     type=LOADW_STUB;
3343
3344   if(c&&!memtarget) {
3345     jaddr2=(int)out;
3346     emit_jmp(0); // inline_readstub/inline_writestub?
3347   }
3348   else {
3349     if(!c) {
3350       jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
3351     }
3352     else if(ram_offset&&memtarget) {
3353       emit_addimm(ar,ram_offset,HOST_TEMPREG);
3354       fastio_reg_override=HOST_TEMPREG;
3355     }
3356     if (opcode[i]==0x32) { // LWC2
3357       #ifdef HOST_IMM_ADDR32
3358       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3359       else
3360       #endif
3361       int a=ar;
3362       if(fastio_reg_override) a=fastio_reg_override;
3363       emit_readword_indexed(0,a,tl);
3364     }
3365     if (opcode[i]==0x3a) { // SWC2
3366       #ifdef DESTRUCTIVE_SHIFT
3367       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3368       #endif
3369       int a=ar;
3370       if(fastio_reg_override) a=fastio_reg_override;
3371       emit_writeword_indexed(tl,0,a);
3372     }
3373   }
3374   if(jaddr2)
3375     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3376   if(opcode[i]==0x3a) // SWC2
3377   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3378 #if defined(HOST_IMM8)
3379     int ir=get_reg(i_regs->regmap,INVCP);
3380     assert(ir>=0);
3381     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3382 #else
3383     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3384 #endif
3385     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3386     emit_callne(invalidate_addr_reg[ar]);
3387     #else
3388     int jaddr3=(int)out;
3389     emit_jne(0);
3390     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3391     #endif
3392   }
3393   if (opcode[i]==0x32) { // LWC2
3394     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3395   }
3396 }
3397
3398 #ifndef multdiv_assemble
3399 void multdiv_assemble(int i,struct regstat *i_regs)
3400 {
3401   printf("Need multdiv_assemble for this architecture.\n");
3402   exit(1);
3403 }
3404 #endif
3405
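// MFHI/MFLO/MTHI/MTLO are assembled as plain register moves: copy the
// (possibly 64-bit) source into the destination, reloading from memory when
// it is not held in a host register.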
3406 void mov_assemble(int i,struct regstat *i_regs)
3407 {
3408   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3409   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3410   if(rt1[i]) {
3411     signed char sh,sl,th,tl;
3412     th=get_reg(i_regs->regmap,rt1[i]|64);
3413     tl=get_reg(i_regs->regmap,rt1[i]);
3414     //assert(tl>=0);
3415     if(tl>=0) {
3416       sh=get_reg(i_regs->regmap,rs1[i]|64);
3417       sl=get_reg(i_regs->regmap,rs1[i]);
3418       if(sl>=0) emit_mov(sl,tl);
3419       else emit_loadreg(rs1[i],tl);
3420       if(th>=0) {
3421         if(sh>=0) emit_mov(sh,th);
3422         else emit_loadreg(rs1[i]|64,th);
3423       }
3424     }
3425   }
3426 }
3427
3428 #ifndef fconv_assemble
3429 void fconv_assemble(int i,struct regstat *i_regs)
3430 {
3431   printf("Need fconv_assemble for this architecture.\n");
3432   exit(1);
3433 }
3434 #endif
3435
3436 #if 0
3437 void float_assemble(int i,struct regstat *i_regs)
3438 {
3439   printf("Need float_assemble for this architecture.\n");
3440   exit(1);
3441 }
3442 #endif
3443
3444 void syscall_assemble(int i,struct regstat *i_regs)
3445 {
3446   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3447   assert(ccreg==HOST_CCREG);
3448   assert(!is_delayslot);
3449   (void)ccreg;
3450   emit_movimm(start+i*4,EAX); // Get PC
3451   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3452   emit_jmp((int)jump_syscall_hle); // XXX
3453 }
3454
3455 void hlecall_assemble(int i,struct regstat *i_regs)
3456 {
3457   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3458   assert(ccreg==HOST_CCREG);
3459   assert(!is_delayslot);
3460   (void)ccreg;
3461   emit_movimm(start+i*4+4,0); // Get PC
3462   emit_movimm((int)psxHLEt[source[i]&7],1);
3463   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
3464   emit_jmp((int)jump_hlecall);
3465 }
3466
3467 void intcall_assemble(int i,struct regstat *i_regs)
3468 {
3469   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3470   assert(ccreg==HOST_CCREG);
3471   assert(!is_delayslot);
3472   (void)ccreg;
3473   emit_movimm(start+i*4,0); // Get PC
3474   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
3475   emit_jmp((int)jump_intcall);
3476 }
3477
3478 void ds_assemble(int i,struct regstat *i_regs)
3479 {
3480   speculate_register_values(i);
3481   is_delayslot=1;
3482   switch(itype[i]) {
3483     case ALU:
3484       alu_assemble(i,i_regs);break;
3485     case IMM16:
3486       imm16_assemble(i,i_regs);break;
3487     case SHIFT:
3488       shift_assemble(i,i_regs);break;
3489     case SHIFTIMM:
3490       shiftimm_assemble(i,i_regs);break;
3491     case LOAD:
3492       load_assemble(i,i_regs);break;
3493     case LOADLR:
3494       loadlr_assemble(i,i_regs);break;
3495     case STORE:
3496       store_assemble(i,i_regs);break;
3497     case STORELR:
3498       storelr_assemble(i,i_regs);break;
3499     case COP0:
3500       cop0_assemble(i,i_regs);break;
3501     case COP1:
3502       cop1_assemble(i,i_regs);break;
3503     case C1LS:
3504       c1ls_assemble(i,i_regs);break;
3505     case COP2:
3506       cop2_assemble(i,i_regs);break;
3507     case C2LS:
3508       c2ls_assemble(i,i_regs);break;
3509     case C2OP:
3510       c2op_assemble(i,i_regs);break;
3511     case FCONV:
3512       fconv_assemble(i,i_regs);break;
3513     case FLOAT:
3514       float_assemble(i,i_regs);break;
3515     case FCOMP:
3516       fcomp_assemble(i,i_regs);break;
3517     case MULTDIV:
3518       multdiv_assemble(i,i_regs);break;
3519     case MOV:
3520       mov_assemble(i,i_regs);break;
3521     case SYSCALL:
3522     case HLECALL:
3523     case INTCALL:
3524     case SPAN:
3525     case UJUMP:
3526     case RJUMP:
3527     case CJUMP:
3528     case SJUMP:
3529     case FJUMP:
3530       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
3531   }
3532   is_delayslot=0;
3533 }
3534
3535 // Is the branch target a valid internal jump?
3536 int internal_branch(uint64_t i_is32,int addr)
3537 {
3538   if(addr&1) return 0; // Indirect (register) jump
3539   if(addr>=start && addr<start+slen*4-4)
3540   {
3541     //int t=(addr-start)>>2;
3542     // Delay slots are not valid branch targets
3543     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3544     // 64 -> 32 bit transition requires a recompile
3545     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3546     {
3547       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3548       else printf("optimizable: yes\n");
3549     }*/
3550     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3551     return 1;
3552   }
3553   return 0;
3554 }
3555
3556 #ifndef wb_invalidate
3557 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3558   uint64_t u,uint64_t uu)
3559 {
3560   int hr;
3561   for(hr=0;hr<HOST_REGS;hr++) {
3562     if(hr!=EXCLUDE_REG) {
3563       if(pre[hr]!=entry[hr]) {
3564         if(pre[hr]>=0) {
3565           if((dirty>>hr)&1) {
3566             if(get_reg(entry,pre[hr])<0) {
3567               if(pre[hr]<64) {
3568                 if(!((u>>pre[hr])&1)) {
3569                   emit_storereg(pre[hr],hr);
3570                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3571                     emit_sarimm(hr,31,hr);
3572                     emit_storereg(pre[hr]|64,hr);
3573                   }
3574                 }
3575               }else{
3576                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3577                   emit_storereg(pre[hr],hr);
3578                 }
3579               }
3580             }
3581           }
3582         }
3583       }
3584     }
3585   }
3586   // Move from one register to another (no writeback)
3587   for(hr=0;hr<HOST_REGS;hr++) {
3588     if(hr!=EXCLUDE_REG) {
3589       if(pre[hr]!=entry[hr]) {
3590         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3591           int nr;
3592           if((nr=get_reg(entry,pre[hr]))>=0) {
3593             emit_mov(hr,nr);
3594           }
3595         }
3596       }
3597     }
3598   }
3599 }
3600 #endif
3601
3602 // Load the specified registers
3603 // This only loads the registers given as arguments because
3604 // we don't want to load things that will be overwritten
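// Only host regs whose mapped MIPS reg is rs1 or rs2, and which differ from the
// entry map, are touched; r0 is materialized with emit_zeroreg and upper halves
// (reg|64) are rebuilt by sign extension when the value is known to be 32-bit.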
3605 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3606 {
3607   int hr;
3608   // Load 32-bit regs
3609   for(hr=0;hr<HOST_REGS;hr++) {
3610     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3611       if(entry[hr]!=regmap[hr]) {
3612         if(regmap[hr]==rs1||regmap[hr]==rs2)
3613         {
3614           if(regmap[hr]==0) {
3615             emit_zeroreg(hr);
3616           }
3617           else
3618           {
3619             emit_loadreg(regmap[hr],hr);
3620           }
3621         }
3622       }
3623     }
3624   }
3625   // Load 64-bit regs
3626   for(hr=0;hr<HOST_REGS;hr++) {
3627     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3628       if(entry[hr]!=regmap[hr]) {
3629         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3630         {
3631           assert(regmap[hr]!=64);
3632           if((is32>>(regmap[hr]&63))&1) {
3633             int lr=get_reg(regmap,regmap[hr]-64);
3634             if(lr>=0)
3635               emit_sarimm(lr,31,hr);
3636             else
3637               emit_loadreg(regmap[hr],hr);
3638           }
3639           else
3640           {
3641             emit_loadreg(regmap[hr],hr);
3642           }
3643         }
3644       }
3645     }
3646   }
3647 }
3648
3649 // Load registers prior to the start of a loop
3650 // so that they are not loaded within the loop
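// Loads any register the loop entry map (entry[]) expects but which is not
// already live somewhere under the current map (pre[]).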
3651 static void loop_preload(signed char pre[],signed char entry[])
3652 {
3653   int hr;
3654   for(hr=0;hr<HOST_REGS;hr++) {
3655     if(hr!=EXCLUDE_REG) {
3656       if(pre[hr]!=entry[hr]) {
3657         if(entry[hr]>=0) {
3658           if(get_reg(pre,entry[hr])<0) {
3659             assem_debug("loop preload:\n");
3660             //printf("loop preload: %d\n",hr);
3661             if(entry[hr]==0) {
3662               emit_zeroreg(hr);
3663             }
3664             else if(entry[hr]<TEMPREG)
3665             {
3666               emit_loadreg(entry[hr],hr);
3667             }
3668             else if(entry[hr]-64<TEMPREG)
3669             {
3670               emit_loadreg(entry[hr],hr);
3671             }
3672           }
3673         }
3674       }
3675     }
3676   }
3677 }
3678
3679 // Generate address for load/store instruction
3680 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
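// When the base register holds a known constant, the effective address is
// folded at assembly time from constmap[i][rs]+offset (masked for LWL/LWR and
// LDL/LDR); otherwise the immediate offset is added at runtime. The tail of
// this function also precomputes the address for the following instruction.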
3681 void address_generation(int i,struct regstat *i_regs,signed char entry[])
3682 {
3683   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
3684     int ra=-1;
3685     int agr=AGEN1+(i&1);
3686     if(itype[i]==LOAD) {
3687       ra=get_reg(i_regs->regmap,rt1[i]);
3688       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3689       assert(ra>=0);
3690     }
3691     if(itype[i]==LOADLR) {
3692       ra=get_reg(i_regs->regmap,FTEMP);
3693     }
3694     if(itype[i]==STORE||itype[i]==STORELR) {
3695       ra=get_reg(i_regs->regmap,agr);
3696       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3697     }
3698     if(itype[i]==C1LS||itype[i]==C2LS) {
3699       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
3700         ra=get_reg(i_regs->regmap,FTEMP);
3701       else { // SWC1/SDC1/SWC2/SDC2
3702         ra=get_reg(i_regs->regmap,agr);
3703         if(ra<0) ra=get_reg(i_regs->regmap,-1);
3704       }
3705     }
3706     int rs=get_reg(i_regs->regmap,rs1[i]);
3707     if(ra>=0) {
3708       int offset=imm[i];
3709       int c=(i_regs->wasconst>>rs)&1;
3710       if(rs1[i]==0) {
3711         // Using r0 as a base address
3712         if(!entry||entry[ra]!=agr) {
3713           if (opcode[i]==0x22||opcode[i]==0x26) {
3714             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3715           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3716             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3717           }else{
3718             emit_movimm(offset,ra);
3719           }
3720         } // else did it in the previous cycle
3721       }
3722       else if(rs<0) {
3723         if(!entry||entry[ra]!=rs1[i])
3724           emit_loadreg(rs1[i],ra);
3725         //if(!entry||entry[ra]!=rs1[i])
3726         //  printf("poor load scheduling!\n");
3727       }
3728       else if(c) {
3729         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
3730           if(!entry||entry[ra]!=agr) {
3731             if (opcode[i]==0x22||opcode[i]==0x26) {
3732               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3733             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3734               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3735             }else{
3736               #ifdef HOST_IMM_ADDR32
3737               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3738               #endif
3739               emit_movimm(constmap[i][rs]+offset,ra);
3740               regs[i].loadedconst|=1<<ra;
3741             }
3742           } // else did it in the previous cycle
3743         } // else load_consts already did it
3744       }
3745       if(offset&&!c&&rs1[i]) {
3746         if(rs>=0) {
3747           emit_addimm(rs,offset,ra);
3748         }else{
3749           emit_addimm(ra,offset,ra);
3750         }
3751       }
3752     }
3753   }
3754   // Preload constants for next instruction
3755   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
3756     int agr,ra;
3757     // Actual address
3758     agr=AGEN1+((i+1)&1);
3759     ra=get_reg(i_regs->regmap,agr);
3760     if(ra>=0) {
3761       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3762       int offset=imm[i+1];
3763       int c=(regs[i+1].wasconst>>rs)&1;
3764       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
3765         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3766           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3767         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3768           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3769         }else{
3770           #ifdef HOST_IMM_ADDR32
3771           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3772           #endif
3773           emit_movimm(constmap[i+1][rs]+offset,ra);
3774           regs[i+1].loadedconst|=1<<ra;
3775         }
3776       }
3777       else if(rs1[i+1]==0) {
3778         // Using r0 as a base address
3779         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3780           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3781         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3782           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3783         }else{
3784           emit_movimm(offset,ra);
3785         }
3786       }
3787     }
3788   }
3789 }
3790
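// Determine the constant that host reg hr should finally hold: scan forward
// while the same constant mapping persists, folding in the immediate of an
// upcoming load that reuses the register (including one in a branch delay
// slot). Returns 0 if the value turns out to be unneeded past this point.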
3791 static int get_final_value(int hr, int i, int *value)
3792 {
3793   int reg=regs[i].regmap[hr];
3794   while(i<slen-1) {
3795     if(regs[i+1].regmap[hr]!=reg) break;
3796     if(!((regs[i+1].isconst>>hr)&1)) break;
3797     if(bt[i+1]) break;
3798     i++;
3799   }
3800   if(i<slen-1) {
3801     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
3802       *value=constmap[i][hr];
3803       return 1;
3804     }
3805     if(!bt[i+1]) {
3806       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
3807         // Load in delay slot, out-of-order execution
3808         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
3809         {
3810           // Precompute load address
3811           *value=constmap[i][hr]+imm[i+2];
3812           return 1;
3813         }
3814       }
3815       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
3816       {
3817         // Precompute load address
3818         *value=constmap[i][hr]+imm[i+1];
3819         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
3820         return 1;
3821       }
3822     }
3823   }
3824   *value=constmap[i][hr];
3825   //printf("c=%x\n",(int)constmap[i][hr]);
3826   if(i==slen-1) return 1;
3827   if(reg<64) {
3828     return !((unneeded_reg[i+1]>>reg)&1);
3829   }else{
3830     return !((unneeded_reg_upper[i+1]>>reg)&1);
3831   }
3832 }
3833
3834 // Load registers with known constants
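// Constants already materialized earlier (loadedconst) are skipped; if another
// host reg is known to hold a similar value, emit_movimm_from derives the new
// constant from it instead of loading a full immediate.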
3835 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
3836 {
3837   int hr,hr2;
3838   // propagate loaded constant flags
3839   if(i==0||bt[i])
3840     regs[i].loadedconst=0;
3841   else {
3842     for(hr=0;hr<HOST_REGS;hr++) {
3843       if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
3844          &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
3845       {
3846         regs[i].loadedconst|=1<<hr;
3847       }
3848     }
3849   }
3850   // Load 32-bit regs
3851   for(hr=0;hr<HOST_REGS;hr++) {
3852     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3853       //if(entry[hr]!=regmap[hr]) {
3854       if(!((regs[i].loadedconst>>hr)&1)) {
3855         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3856           int value,similar=0;
3857           if(get_final_value(hr,i,&value)) {
3858             // see if some other register has similar value
3859             for(hr2=0;hr2<HOST_REGS;hr2++) {
3860               if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
3861                 if(is_similar_value(value,constmap[i][hr2])) {
3862                   similar=1;
3863                   break;
3864                 }
3865               }
3866             }
3867             if(similar) {
3868               int value2;
3869               if(get_final_value(hr2,i,&value2)) // is this needed?
3870                 emit_movimm_from(value2,hr2,value,hr);
3871               else
3872                 emit_movimm(value,hr);
3873             }
3874             else if(value==0) {
3875               emit_zeroreg(hr);
3876             }
3877             else {
3878               emit_movimm(value,hr);
3879             }
3880           }
3881           regs[i].loadedconst|=1<<hr;
3882         }
3883       }
3884     }
3885   }
3886   // Load 64-bit regs
3887   for(hr=0;hr<HOST_REGS;hr++) {
3888     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3889       //if(entry[hr]!=regmap[hr]) {
3890       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3891         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3892           if((is32>>(regmap[hr]&63))&1) {
3893             int lr=get_reg(regmap,regmap[hr]-64);
3894             assert(lr>=0);
3895             emit_sarimm(lr,31,hr);
3896           }
3897           else
3898           {
3899             int value;
3900             if(get_final_value(hr,i,&value)) {
3901               if(value==0) {
3902                 emit_zeroreg(hr);
3903               }
3904               else {
3905                 emit_movimm(value,hr);
3906               }
3907             }
3908           }
3909         }
3910       }
3911     }
3912   }
3913 }
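// Like load_consts, but reloads every dirty host reg that is mapped to a known
// constant, without tracking what was already loaded.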
3914 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
3915 {
3916   int hr;
3917   // Load 32-bit regs
3918   for(hr=0;hr<HOST_REGS;hr++) {
3919     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3920       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3921         int value=constmap[i][hr];
3922         if(value==0) {
3923           emit_zeroreg(hr);
3924         }
3925         else {
3926           emit_movimm(value,hr);
3927         }
3928       }
3929     }
3930   }
3931   // Load 64-bit regs
3932   for(hr=0;hr<HOST_REGS;hr++) {
3933     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3934       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3935         if((is32>>(regmap[hr]&63))&1) {
3936           int lr=get_reg(regmap,regmap[hr]-64);
3937           assert(lr>=0);
3938           emit_sarimm(lr,31,hr);
3939         }
3940         else
3941         {
3942           int value=constmap[i][hr];
3943           if(value==0) {
3944             emit_zeroreg(hr);
3945           }
3946           else {
3947             emit_movimm(value,hr);
3948           }
3949         }
3950       }
3951     }
3952   }
3953 }
3954
3955 // Write out all dirty registers (except cycle count)
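// Upper halves (reg|64) are only stored when the value is not known to fit in
// 32 bits; the reload paths rebuild 32-bit upper halves by sign extension.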
3956 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
3957 {
3958   int hr;
3959   for(hr=0;hr<HOST_REGS;hr++) {
3960     if(hr!=EXCLUDE_REG) {
3961       if(i_regmap[hr]>0) {
3962         if(i_regmap[hr]!=CCREG) {
3963           if((i_dirty>>hr)&1) {
3964             if(i_regmap[hr]<64) {
3965               emit_storereg(i_regmap[hr],hr);
3966             }else{
3967               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
3968                 emit_storereg(i_regmap[hr],hr);
3969               }
3970             }
3971           }
3972         }
3973       }
3974     }
3975   }
3976 }
3977 // Write out dirty registers that we need to reload (pair with load_needed_regs)
3978 // This writes the registers not written by store_regs_bt
3979 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
3980 {
3981   int hr;
3982   int t=(addr-start)>>2;
3983   for(hr=0;hr<HOST_REGS;hr++) {
3984     if(hr!=EXCLUDE_REG) {
3985       if(i_regmap[hr]>0) {
3986         if(i_regmap[hr]!=CCREG) {
3987           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
3988             if((i_dirty>>hr)&1) {
3989               if(i_regmap[hr]<64) {
3990                 emit_storereg(i_regmap[hr],hr);
3991               }else{
3992                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
3993                   emit_storereg(i_regmap[hr],hr);
3994                 }
3995               }
3996             }
3997           }
3998         }
3999       }
4000     }
4001   }
4002 }
4003
4004 // Load all registers (except cycle count)
4005 void load_all_regs(signed char i_regmap[])
4006 {
4007   int hr;
4008   for(hr=0;hr<HOST_REGS;hr++) {
4009     if(hr!=EXCLUDE_REG) {
4010       if(i_regmap[hr]==0) {
4011         emit_zeroreg(hr);
4012       }
4013       else
4014       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4015       {
4016         emit_loadreg(i_regmap[hr],hr);
4017       }
4018     }
4019   }
4020 }
4021
4022 // Load all current registers also needed by next instruction
4023 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4024 {
4025   int hr;
4026   for(hr=0;hr<HOST_REGS;hr++) {
4027     if(hr!=EXCLUDE_REG) {
4028       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4029         if(i_regmap[hr]==0) {
4030           emit_zeroreg(hr);
4031         }
4032         else
4033         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4034         {
4035           emit_loadreg(i_regmap[hr],hr);
4036         }
4037       }
4038     }
4039   }
4040 }
4041
4042 // Load all regs, storing cycle count if necessary
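// Used at block entry points: adjust the cycle count for the entry, spill CCREG
// if the entry map does not keep it in HOST_CCREG, then load the 32-bit regs
// and rebuild 64-bit upper halves (sign-extended when the value was 32-bit).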
4043 void load_regs_entry(int t)
4044 {
4045   int hr;
4046   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
4047   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
4048   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4049     emit_storereg(CCREG,HOST_CCREG);
4050   }
4051   // Load 32-bit regs
4052   for(hr=0;hr<HOST_REGS;hr++) {
4053     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4054       if(regs[t].regmap_entry[hr]==0) {
4055         emit_zeroreg(hr);
4056       }
4057       else if(regs[t].regmap_entry[hr]!=CCREG)
4058       {
4059         emit_loadreg(regs[t].regmap_entry[hr],hr);
4060       }
4061     }
4062   }
4063   // Load 64-bit regs
4064   for(hr=0;hr<HOST_REGS;hr++) {
4065     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4066       assert(regs[t].regmap_entry[hr]!=64);
4067       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4068         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4069         if(lr<0) {
4070           emit_loadreg(regs[t].regmap_entry[hr],hr);
4071         }
4072         else
4073         {
4074           emit_sarimm(lr,31,hr);
4075         }
4076       }
4077       else
4078       {
4079         emit_loadreg(regs[t].regmap_entry[hr],hr);
4080       }
4081     }
4082   }
4083 }
4084
4085 // Store dirty registers prior to branch
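// For branches that stay inside the block, only registers the target entry does
// not already expect in the same host reg (or that it treats as unneeded) are
// written back; external branches spill everything via wb_dirtys.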
4086 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4087 {
4088   if(internal_branch(i_is32,addr))
4089   {
4090     int t=(addr-start)>>2;
4091     int hr;
4092     for(hr=0;hr<HOST_REGS;hr++) {
4093       if(hr!=EXCLUDE_REG) {
4094         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4095           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4096             if((i_dirty>>hr)&1) {
4097               if(i_regmap[hr]<64) {
4098                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4099                   emit_storereg(i_regmap[hr],hr);
4100                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4101                     #ifdef DESTRUCTIVE_WRITEBACK
4102                     emit_sarimm(hr,31,hr);
4103                     emit_storereg(i_regmap[hr]|64,hr);
4104                     #else
4105                     emit_sarimm(hr,31,HOST_TEMPREG);
4106                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4107                     #endif
4108                   }
4109                 }
4110               }else{
4111                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4112                   emit_storereg(i_regmap[hr],hr);
4113                 }
4114               }
4115             }
4116           }
4117         }
4118       }
4119     }
4120   }
4121   else
4122   {
4123     // Branch out of this block, write out all dirty regs
4124     wb_dirtys(i_regmap,i_is32,i_dirty);
4125   }
4126 }
4127
4128 // Load all needed registers for branch target
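// Counterpart of store_regs_bt: for internal targets, bring the host regs in
// line with regs[t].regmap_entry, zeroing r0 and sign-extending upper halves of
// values known to be 32-bit rather than reloading them from memory.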
4129 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4130 {
4131   //if(addr>=start && addr<(start+slen*4))
4132   if(internal_branch(i_is32,addr))
4133   {
4134     int t=(addr-start)>>2;
4135     int hr;
4136     // Store the cycle count before loading something else
4137     if(i_regmap[HOST_CCREG]!=CCREG) {
4138       assert(i_regmap[HOST_CCREG]==-1);
4139     }
4140     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4141       emit_storereg(CCREG,HOST_CCREG);
4142     }
4143     // Load 32-bit regs
4144     for(hr=0;hr<HOST_REGS;hr++) {
4145       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4146         #ifdef DESTRUCTIVE_WRITEBACK
4147         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4148         #else
4149         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4150         #endif
4151           if(regs[t].regmap_entry[hr]==0) {
4152             emit_zeroreg(hr);
4153           }
4154           else if(regs[t].regmap_entry[hr]!=CCREG)
4155           {
4156             emit_loadreg(regs[t].regmap_entry[hr],hr);
4157           }
4158         }
4159       }
4160     }
4161     // Load 64-bit regs
4162     for(hr=0;hr<HOST_REGS;hr++) {
4163       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4164         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4165           assert(regs[t].regmap_entry[hr]!=64);
4166           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4167             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4168             if(lr<0) {
4169               emit_loadreg(regs[t].regmap_entry[hr],hr);
4170             }
4171             else
4172             {
4173               emit_sarimm(lr,31,hr);
4174             }
4175           }
4176           else
4177           {
4178             emit_loadreg(regs[t].regmap_entry[hr],hr);
4179           }
4180         }
4181         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4182           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4183           assert(lr>=0);
4184           emit_sarimm(lr,31,hr);
4185         }
4186       }
4187     }
4188   }
4189 }
4190
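// Check whether the register state at a branch matches what the target block
// expects, so the branch can be linked directly without writeback or reload.
// Any relevant mismatch in mapping, dirtiness or 32/64-bit status returns 0;
// delay-slot targets never match. For targets outside the block, no dirty
// register other than the cycle count may be pending.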
4191 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4192 {
4193   if(addr>=start && addr<start+slen*4-4)
4194   {
4195     int t=(addr-start)>>2;
4196     int hr;
4197     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4198     for(hr=0;hr<HOST_REGS;hr++)
4199     {
4200       if(hr!=EXCLUDE_REG)
4201       {
4202         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4203         {
4204           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4205           {
4206             return 0;
4207           }
4208           else
4209           if((i_dirty>>hr)&1)
4210           {
4211             if(i_regmap[hr]<TEMPREG)
4212             {
4213               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4214                 return 0;
4215             }
4216             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4217             {
4218               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4219                 return 0;
4220             }
4221           }
4222         }
4223         else // Same register but is it 32-bit or dirty?
4224         if(i_regmap[hr]>=0)
4225         {
4226           if(!((regs[t].dirty>>hr)&1))
4227           {
4228             if((i_dirty>>hr)&1)
4229             {
4230               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4231               {
4232                 //printf("%x: dirty no match\n",addr);
4233                 return 0;
4234               }
4235             }
4236           }
4237           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4238           {
4239             //printf("%x: is32 no match\n",addr);
4240             return 0;
4241           }
4242         }
4243       }
4244     }
4245     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4246     // Delay slots are not valid branch targets
4247     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4248     // Delay slots require additional processing, so do not match
4249     if(is_ds[t]) return 0;
4250   }
4251   else
4252   {
4253     int hr;
4254     for(hr=0;hr<HOST_REGS;hr++)
4255     {
4256       if(hr!=EXCLUDE_REG)
4257       {
4258         if(i_regmap[hr]>=0)
4259         {
4260           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4261           {
4262             if((i_dirty>>hr)&1)
4263             {
4264               return 0;
4265             }
4266           }
4267         }
4268       }
4269     }
4270   }
4271   return 1;
4272 }
4273
4274 // Used when a branch jumps into the delay slot of another branch
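// The delay-slot instruction at the target is assembled as a separate entry
// point, then control continues at the instruction after it (ba[i]+4), which
// must itself be an internal branch target.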
4275 void ds_assemble_entry(int i)
4276 {
4277   int t=(ba[i]-start)>>2;
4278   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4279   assem_debug("Assemble delay slot at %x\n",ba[i]);
4280   assem_debug("<->\n");
4281   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4282     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4283   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4284   address_generation(t,&regs[t],regs[t].regmap_entry);
4285   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4286     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4287   cop1_usable=0;
4288   is_delayslot=0;
4289   switch(itype[t]) {
4290     case ALU:
4291       alu_assemble(t,&regs[t]);break;
4292     case IMM16:
4293       imm16_assemble(t,&regs[t]);break;
4294     case SHIFT:
4295       shift_assemble(t,&regs[t]);break;
4296     case SHIFTIMM:
4297       shiftimm_assemble(t,&regs[t]);break;
4298     case LOAD:
4299       load_assemble(t,&regs[t]);break;
4300     case LOADLR:
4301       loadlr_assemble(t,&regs[t]);break;
4302     case STORE:
4303       store_assemble(t,&regs[t]);break;
4304     case STORELR:
4305       storelr_assemble(t,&regs[t]);break;
4306     case COP0:
4307       cop0_assemble(t,&regs[t]);break;
4308     case COP1:
4309       cop1_assemble(t,&regs[t]);break;
4310     case C1LS:
4311       c1ls_assemble(t,&regs[t]);break;
4312     case COP2:
4313       cop2_assemble(t,&regs[t]);break;
4314     case C2LS:
4315       c2ls_assemble(t,&regs[t]);break;
4316     case C2OP:
4317       c2op_assemble(t,&regs[t]);break;
4318     case FCONV:
4319       fconv_assemble(t,&regs[t]);break;
4320     case FLOAT:
4321       float_assemble(t,&regs[t]);break;
4322     case FCOMP:
4323       fcomp_assemble(t,&regs[t]);break;
4324     case MULTDIV:
4325       multdiv_assemble(t,&regs[t]);break;
4326     case MOV:
4327       mov_assemble(t,&regs[t]);break;
4328     case SYSCALL:
4329     case HLECALL:
4330     case INTCALL:
4331     case SPAN:
4332     case UJUMP:
4333     case RJUMP:
4334     case CJUMP:
4335     case SJUMP:
4336     case FJUMP:
4337       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
4338   }
4339   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4340   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4341   if(internal_branch(regs[t].is32,ba[i]+4))
4342     assem_debug("branch: internal\n");
4343   else
4344     assem_debug("branch: external\n");
4345   assert(internal_branch(regs[t].is32,ba[i]+4));
4346   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4347   emit_jmp(0);
4348 }
4349
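// Emit the cycle counter update and overflow check in front of a branch.
// A branch to itself with a NOP in the delay slot is treated as an idle loop
// and handed to a CC_STUB so its iterations need not be executed one by one;
// otherwise the adjusted count is added and a sign check jumps to the stub
// when an event is due.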
4350 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4351 {
4352   int count;
4353   int jaddr;
4354   int idle=0;
4355   int t=0;
4356   if(itype[i]==RJUMP)
4357   {
4358     *adj=0;
4359   }
4360   //if(ba[i]>=start && ba[i]<(start+slen*4))
4361   if(internal_branch(branch_regs[i].is32,ba[i]))
4362   {
4363     t=(ba[i]-start)>>2;
4364     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4365     else *adj=ccadj[t];
4366   }
4367   else
4368   {
4369     *adj=0;
4370   }
4371   count=ccadj[i];
4372   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4373     // Idle loop
4374     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4375     idle=(int)out;
4376     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4377     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4378     jaddr=(int)out;
4379     emit_jmp(0);
4380   }
4381   else if(*adj==0||invert) {
4382     int cycles=CLOCK_ADJUST(count+2);
4383     // faster loop HACK
4384     if (t&&*adj) {
4385       int rel=t-i;
4386       if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
4387         cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
4388     }
4389     emit_addimm_and_set_flags(cycles,HOST_CCREG);
4390     jaddr=(int)out;
4391     emit_jns(0);
4392   }
4393   else
4394   {
4395     emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
4396     jaddr=(int)out;
4397     emit_jns(0);
4398   }
4399   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4400 }
4401
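// Out-of-line stub taken when the cycle count expires at a branch: write back
// the relevant dirty registers, store the return PC (reconstructed with
// conditional moves when the branch direction was not yet decided), call
// cc_interrupt, then reload registers and jump back into the compiled code.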
4402 void do_ccstub(int n)
4403 {
4404   literal_pool(256);
4405   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4406   set_jump_target(stubs[n][1],(int)out);
4407   int i=stubs[n][4];
4408   if(stubs[n][6]==NULLDS) {
4409     // Delay slot instruction is nullified ("likely" branch)
4410     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4411   }
4412   else if(stubs[n][6]!=TAKEN) {
4413     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4414   }
4415   else {
4416     if(internal_branch(branch_regs[i].is32,ba[i]))
4417       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4418   }
4419   if(stubs[n][5]!=-1)
4420   {
4421     // Save PC as return address
4422     emit_movimm(stubs[n][5],EAX);
4423     emit_writeword(EAX,(int)&pcaddr);
4424   }
4425   else
4426   {
4427     // Return address depends on which way the branch goes
4428     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4429     {
4430       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4431       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4432       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4433       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4434       if(rs1[i]==0)
4435       {
4436         s1l=s2l;s1h=s2h;
4437         s2l=s2h=-1;
4438       }
4439       else if(rs2[i]==0)
4440       {
4441         s2l=s2h=-1;
4442       }
4443       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4444         s1h=s2h=-1;
4445       }
4446       assert(s1l>=0);
4447       #ifdef DESTRUCTIVE_WRITEBACK
4448       if(rs1[i]) {
4449         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4450           emit_loadreg(rs1[i],s1l);
4451       }
4452       else {
4453         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4454           emit_loadreg(rs2[i],s1l);
4455       }
4456       if(s2l>=0)
4457         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4458           emit_loadreg(rs2[i],s2l);
4459       #endif
4460       int hr=0;
4461       int addr=-1,alt=-1,ntaddr=-1;
4462       while(hr<HOST_REGS)
4463       {
4464         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4465            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4466            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4467         {
4468           addr=hr++;break;
4469         }
4470         hr++;
4471       }
4472       while(hr<HOST_REGS)
4473       {
4474         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4475            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4476            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4477         {
4478           alt=hr++;break;
4479         }
4480         hr++;
4481       }
4482       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4483       {
4484         while(hr<HOST_REGS)
4485         {
4486           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4487              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4488              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4489           {
4490             ntaddr=hr;break;
4491           }
4492           hr++;
4493         }
4494         assert(hr<HOST_REGS);
4495       }
4496       if((opcode[i]&0x2f)==4) // BEQ
4497       {
4498         #ifdef HAVE_CMOV_IMM
4499         if(s1h<0) {
4500           if(s2l>=0) emit_cmp(s1l,s2l);
4501           else emit_test(s1l,s1l);
4502           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4503         }
4504         else
4505         #endif
4506         {
4507           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4508           if(s1h>=0) {
4509             if(s2h>=0) emit_cmp(s1h,s2h);
4510             else emit_test(s1h,s1h);
4511             emit_cmovne_reg(alt,addr);
4512           }
4513           if(s2l>=0) emit_cmp(s1l,s2l);
4514           else emit_test(s1l,s1l);
4515           emit_cmovne_reg(alt,addr);
4516         }
4517       }
4518       if((opcode[i]&0x2f)==5) // BNE
4519       {
4520         #ifdef HAVE_CMOV_IMM
4521         if(s1h<0) {
4522           if(s2l>=0) emit_cmp(s1l,s2l);
4523           else emit_test(s1l,s1l);
4524           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4525         }
4526         else
4527         #endif
4528         {
4529           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4530           if(s1h>=0) {
4531             if(s2h>=0) emit_cmp(s1h,s2h);
4532             else emit_test(s1h,s1h);
4533             emit_cmovne_reg(alt,addr);
4534           }
4535           if(s2l>=0) emit_cmp(s1l,s2l);
4536           else emit_test(s1l,s1l);
4537           emit_cmovne_reg(alt,addr);
4538         }
4539       }
4540       if((opcode[i]&0x2f)==6) // BLEZ
4541       {
4542         //emit_movimm(ba[i],alt);
4543         //emit_movimm(start+i*4+8,addr);
4544         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4545         emit_cmpimm(s1l,1);
4546         if(s1h>=0) emit_mov(addr,ntaddr);
4547         emit_cmovl_reg(alt,addr);
4548         if(s1h>=0) {
4549           emit_test(s1h,s1h);
4550           emit_cmovne_reg(ntaddr,addr);
4551           emit_cmovs_reg(alt,addr);
4552         }
4553       }
4554       if((opcode[i]&0x2f)==7) // BGTZ
4555       {
4556         //emit_movimm(ba[i],addr);
4557         //emit_movimm(start+i*4+8,ntaddr);
4558         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4559         emit_cmpimm(s1l,1);
4560         if(s1h>=0) emit_mov(addr,alt);
4561         emit_cmovl_reg(ntaddr,addr);
4562         if(s1h>=0) {
4563           emit_test(s1h,s1h);
4564           emit_cmovne_reg(alt,addr);
4565           emit_cmovs_reg(ntaddr,addr);
4566         }
4567       }
4568       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4569       {
4570         //emit_movimm(ba[i],alt);
4571         //emit_movimm(start+i*4+8,addr);
4572         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4573         if(s1h>=0) emit_test(s1h,s1h);
4574         else emit_test(s1l,s1l);
4575         emit_cmovs_reg(alt,addr);
4576       }
4577       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4578       {
4579         //emit_movimm(ba[i],addr);
4580         //emit_movimm(start+i*4+8,alt);
4581         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4582         if(s1h>=0) emit_test(s1h,s1h);
4583         else emit_test(s1l,s1l);
4584         emit_cmovs_reg(alt,addr);
4585       }
4586       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4587         if(source[i]&0x10000) // BC1T
4588         {
4589           //emit_movimm(ba[i],alt);
4590           //emit_movimm(start+i*4+8,addr);
4591           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4592           emit_testimm(s1l,0x800000);
4593           emit_cmovne_reg(alt,addr);
4594         }
4595         else // BC1F
4596         {
4597           //emit_movimm(ba[i],addr);
4598           //emit_movimm(start+i*4+8,alt);
4599           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4600           emit_testimm(s1l,0x800000);
4601           emit_cmovne_reg(alt,addr);
4602         }
4603       }
4604       emit_writeword(addr,(int)&pcaddr);
4605     }
4606     else
4607     if(itype[i]==RJUMP)
4608     {
4609       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4610       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4611         r=get_reg(branch_regs[i].regmap,RTEMP);
4612       }
4613       emit_writeword(r,(int)&pcaddr);
4614     }
4615     else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);}
4616   }
4617   // Update cycle count
4618   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4619   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
4620   emit_call((int)cc_interrupt);
4621   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
4622   if(stubs[n][6]==TAKEN) {
4623     if(internal_branch(branch_regs[i].is32,ba[i]))
4624       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4625     else if(itype[i]==RJUMP) {
4626       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4627         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4628       else
4629         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4630     }
4631   }else if(stubs[n][6]==NOTTAKEN) {
4632     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4633     else load_all_regs(branch_regs[i].regmap);
4634   }else if(stubs[n][6]==NULLDS) {
4635     // Delay slot instruction is nullified ("likely" branch)
4636     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4637     else load_all_regs(regs[i].regmap);
4638   }else{
4639     load_all_regs(branch_regs[i].regmap);
4640   }
4641   emit_jmp(stubs[n][2]); // return address
4642
4643   /* This works but uses a lot of memory...
4644   emit_readword((int)&last_count,ECX);
4645   emit_add(HOST_CCREG,ECX,EAX);
4646   emit_writeword(EAX,(int)&Count);
4647   emit_call((int)gen_interupt);
4648   emit_readword((int)&Count,HOST_CCREG);
4649   emit_readword((int)&next_interupt,EAX);
4650   emit_readword((int)&pending_exception,EBX);
4651   emit_writeword(EAX,(int)&last_count);
4652   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
4653   emit_test(EBX,EBX);
4654   int jne_instr=(int)out;
4655   emit_jne(0);
4656   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
4657   load_all_regs(branch_regs[i].regmap);
4658   emit_jmp(stubs[n][2]); // return address
4659   set_jump_target(jne_instr,(int)out);
4660   emit_readword((int)&pcaddr,EAX);
4661   // Call get_addr_ht instead of doing the hash table here.
4662   // This code is executed infrequently and takes up a lot of space
4663   // so smaller is better.
4664   emit_storereg(CCREG,HOST_CCREG);
4665   emit_pushreg(EAX);
4666   emit_call((int)get_addr_ht);
4667   emit_loadreg(CCREG,HOST_CCREG);
4668   emit_addimm(ESP,4,ESP);
4669   emit_jmpreg(EAX);*/
4670 }
4671
4672 static void add_to_linker(int addr,int target,int ext)
4673 {
4674   link_addr[linkcount][0]=addr;
4675   link_addr[linkcount][1]=target;
4676   link_addr[linkcount][2]=ext;
4677   linkcount++;
4678 }
4679
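// Write the return address (PC of the jump + 8) into the link register for JAL;
// with USE_MINI_HT an internal return target goes through do_miniht_insert instead.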
4680 static void ujump_assemble_write_ra(int i)
4681 {
4682   int rt;
4683   unsigned int return_address;
4684   rt=get_reg(branch_regs[i].regmap,31);
4685   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4686   //assert(rt>=0);
4687   return_address=start+i*4+8;
4688   if(rt>=0) {
4689     #ifdef USE_MINI_HT
4690     if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
4691       int temp=-1; // note: must be ds-safe
4692       #ifdef HOST_TEMPREG
4693       temp=HOST_TEMPREG;
4694       #endif
4695       if(temp>=0) do_miniht_insert(return_address,rt,temp);
4696       else emit_movimm(return_address,rt);
4697     }
4698     else
4699     #endif
4700     {
4701       #ifdef REG_PREFETCH
4702       if(temp>=0)
4703       {
4704         if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4705       }
4706       #endif
4707       emit_movimm(return_address,rt); // PC into link register
4708       #ifdef IMM_PREFETCH
4709       emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4710       #endif
4711     }
4712   }
4713 }
4714
4715 void ujump_assemble(int i,struct regstat *i_regs)
4716 {
4717   int ra_done=0;
4718   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4719   address_generation(i+1,i_regs,regs[i].regmap_entry);
4720   #ifdef REG_PREFETCH
4721   int temp=get_reg(branch_regs[i].regmap,PTEMP);
4722   if(rt1[i]==31&&temp>=0)
4723   {
4724     signed char *i_regmap=i_regs->regmap;
4725     int return_address=start+i*4+8;
4726     if(get_reg(branch_regs[i].regmap,31)>0)
4727     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4728   }
4729   #endif
4730   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4731     ujump_assemble_write_ra(i); // writeback ra for DS
4732     ra_done=1;
4733   }
4734   ds_assemble(i+1,i_regs);
4735   uint64_t bc_unneeded=branch_regs[i].u;
4736   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4737   bc_unneeded|=1|(1LL<<rt1[i]);
4738   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4739   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4740                 bc_unneeded,bc_unneeded_upper);
4741   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4742   if(!ra_done&&rt1[i]==31)
4743     ujump_assemble_write_ra(i);
4744   int cc,adj;
4745   cc=get_reg(branch_regs[i].regmap,CCREG);
4746   assert(cc==HOST_CCREG);
4747   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4748   #ifdef REG_PREFETCH
4749   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4750   #endif
4751   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4752   if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4753   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4754   if(internal_branch(branch_regs[i].is32,ba[i]))
4755     assem_debug("branch: internal\n");
4756   else
4757     assem_debug("branch: external\n");
4758   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
4759     ds_assemble_entry(i);
4760   }
4761   else {
4762     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
4763     emit_jmp(0);
4764   }
4765 }
4766
4767 static void rjump_assemble_write_ra(int i)
4768 {
4769   int rt,return_address;
4770   assert(rt1[i+1]!=rt1[i]);
4771   assert(rt2[i+1]!=rt1[i]);
4772   rt=get_reg(branch_regs[i].regmap,rt1[i]);
4773   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4774   assert(rt>=0);
4775   return_address=start+i*4+8;
4776   #ifdef REG_PREFETCH
4777   if(temp>=0)
4778   {
4779     if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4780   }
4781   #endif
4782   emit_movimm(return_address,rt); // PC into link register
4783   #ifdef IMM_PREFETCH
4784   emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4785   #endif
4786 }
4787
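// Assemble JR/JALR. If the delay slot writes the branch address register, the
// address is first copied to RTEMP; the indirect jump then goes through
// jump_vaddr_reg[] (or the mini hash table path for "jr $ra" with USE_MINI_HT)
// since the target is not known at compile time.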
4788 void rjump_assemble(int i,struct regstat *i_regs)
4789 {
4790   int temp;
4791   int rs,cc;
4792   int ra_done=0;
4793   rs=get_reg(branch_regs[i].regmap,rs1[i]);
4794   assert(rs>=0);
4795   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4796     // Delay slot abuse, make a copy of the branch address register
4797     temp=get_reg(branch_regs[i].regmap,RTEMP);
4798     assert(temp>=0);
4799     assert(regs[i].regmap[temp]==RTEMP);
4800     emit_mov(rs,temp);
4801     rs=temp;
4802   }
4803   address_generation(i+1,i_regs,regs[i].regmap_entry);
4804   #ifdef REG_PREFETCH
4805   if(rt1[i]==31)
4806   {
4807     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
4808       signed char *i_regmap=i_regs->regmap;
4809       int return_address=start+i*4+8;
4810       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4811     }
4812   }
4813   #endif
4814   #ifdef USE_MINI_HT
4815   if(rs1[i]==31) {
4816     int rh=get_reg(regs[i].regmap,RHASH);
4817     if(rh>=0) do_preload_rhash(rh);
4818   }
4819   #endif
4820   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4821     rjump_assemble_write_ra(i);
4822     ra_done=1;
4823   }
4824   ds_assemble(i+1,i_regs);
4825   uint64_t bc_unneeded=branch_regs[i].u;
4826   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4827   bc_unneeded|=1|(1LL<<rt1[i]);
4828   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4829   bc_unneeded&=~(1LL<<rs1[i]);
4830   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4831                 bc_unneeded,bc_unneeded_upper);
4832   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
4833   if(!ra_done&&rt1[i]!=0)
4834     rjump_assemble_write_ra(i);
4835   cc=get_reg(branch_regs[i].regmap,CCREG);
4836   assert(cc==HOST_CCREG);
4837   (void)cc;
4838   #ifdef USE_MINI_HT
4839   int rh=get_reg(branch_regs[i].regmap,RHASH);
4840   int ht=get_reg(branch_regs[i].regmap,RHTBL);
4841   if(rs1[i]==31) {
4842     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
4843     do_preload_rhtbl(ht);
4844     do_rhash(rs,rh);
4845   }
4846   #endif
4847   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4848   #ifdef DESTRUCTIVE_WRITEBACK
4849   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
4850     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
4851       emit_loadreg(rs1[i],rs);
4852     }
4853   }
4854   #endif
4855   #ifdef REG_PREFETCH
4856   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4857   #endif
4858   #ifdef USE_MINI_HT
4859   if(rs1[i]==31) {
4860     do_miniht_load(ht,rh);
4861   }
4862   #endif
4863   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
4864   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
4865   //assert(adj==0);
4866   emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
4867   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
4868   if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
4869     // special case for RFE
4870     emit_jmp(0);
4871   else
4872     emit_jns(0);
4873   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4874   #ifdef USE_MINI_HT
4875   if(rs1[i]==31) {
4876     do_miniht_jump(rs,rh,ht);
4877   }
4878   else
4879   #endif
4880   {
4881     //if(rs!=EAX) emit_mov(rs,EAX);
4882     //emit_jmp((int)jump_vaddr_eax);
4883     emit_jmp(jump_vaddr_reg[rs]);
4884   }
4885   /* Check hash table
4886   temp=!rs;
4887   emit_mov(rs,temp);
4888   emit_shrimm(rs,16,rs);
4889   emit_xor(temp,rs,rs);
4890   emit_movzwl_reg(rs,rs);
4891   emit_shlimm(rs,4,rs);
4892   emit_cmpmem_indexed((int)hash_table,rs,temp);
4893   emit_jne((int)out+14);
4894   emit_readword_indexed((int)hash_table+4,rs,rs);
4895   emit_jmpreg(rs);
4896   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
4897   emit_addimm_no_flags(8,rs);
4898   emit_jeq((int)out-17);
4899   // No hit on hash table, call compiler
4900   emit_pushreg(temp);
4901 //DEBUG >
4902 #ifdef DEBUG_CYCLE_COUNT
4903   emit_readword((int)&last_count,ECX);
4904   emit_add(HOST_CCREG,ECX,HOST_CCREG);
4905   emit_readword((int)&next_interupt,ECX);
4906   emit_writeword(HOST_CCREG,(int)&Count);
4907   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
4908   emit_writeword(ECX,(int)&last_count);
4909 #endif
4910 //DEBUG <
4911   emit_storereg(CCREG,HOST_CCREG);
4912   emit_call((int)get_addr);
4913   emit_loadreg(CCREG,HOST_CCREG);
4914   emit_addimm(ESP,4,ESP);
4915   emit_jmpreg(EAX);*/
4916   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4917   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
4918   #endif
4919 }
4920
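// Assemble BEQ/BNE/BLEZ/BGTZ. When ooo[i] is set the delay slot is assembled
// before the compare; the branch sense is inverted when the target's expected
// register state does not match the current one, so that writeback/reload code
// can be placed on the taken path before the final jump.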
4921 void cjump_assemble(int i,struct regstat *i_regs)
4922 {
4923   signed char *i_regmap=i_regs->regmap;
4924   int cc;
4925   int match;
4926   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4927   assem_debug("match=%d\n",match);
4928   int s1h,s1l,s2h,s2l;
4929   int prev_cop1_usable=cop1_usable;
4930   int unconditional=0,nop=0;
4931   int only32=0;
4932   int invert=0;
4933   int internal=internal_branch(branch_regs[i].is32,ba[i]);
4934   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4935   if(!match) invert=1;
4936   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4937   if(i>(ba[i]-start)>>2) invert=1;
4938   #endif
4939
4940   if(ooo[i]) {
4941     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4942     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4943     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4944     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4945   }
4946   else {
4947     s1l=get_reg(i_regmap,rs1[i]);
4948     s1h=get_reg(i_regmap,rs1[i]|64);
4949     s2l=get_reg(i_regmap,rs2[i]);
4950     s2h=get_reg(i_regmap,rs2[i]|64);
4951   }
4952   if(rs1[i]==0&&rs2[i]==0)
4953   {
4954     if(opcode[i]&1) nop=1;
4955     else unconditional=1;
4956     //assert(opcode[i]!=5);
4957     //assert(opcode[i]!=7);
4958     //assert(opcode[i]!=0x15);
4959     //assert(opcode[i]!=0x17);
4960   }
4961   else if(rs1[i]==0)
4962   {
4963     s1l=s2l;s1h=s2h;
4964     s2l=s2h=-1;
4965     only32=(regs[i].was32>>rs2[i])&1;
4966   }
4967   else if(rs2[i]==0)
4968   {
4969     s2l=s2h=-1;
4970     only32=(regs[i].was32>>rs1[i])&1;
4971   }
4972   else {
4973     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
4974   }
4975
4976   if(ooo[i]) {
4977     // Out of order execution (delay slot first)
4978     //printf("OOOE\n");
4979     address_generation(i+1,i_regs,regs[i].regmap_entry);
4980     ds_assemble(i+1,i_regs);
4981     int adj;
4982     uint64_t bc_unneeded=branch_regs[i].u;
4983     uint64_t bc_unneeded_upper=branch_regs[i].uu;
4984     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
4985     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
4986     bc_unneeded|=1;
4987     bc_unneeded_upper|=1;
4988     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4989                   bc_unneeded,bc_unneeded_upper);
4990     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
4991     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4992     cc=get_reg(branch_regs[i].regmap,CCREG);
4993     assert(cc==HOST_CCREG);
4994     if(unconditional)
4995       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4996     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
4997     //assem_debug("cycle count (adj)\n");
4998     if(unconditional) {
4999       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5000       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5001         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5002         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5003         if(internal)
5004           assem_debug("branch: internal\n");
5005         else
5006           assem_debug("branch: external\n");
5007         if(internal&&is_ds[(ba[i]-start)>>2]) {
5008           ds_assemble_entry(i);
5009         }
5010         else {
5011           add_to_linker((int)out,ba[i],internal);
5012           emit_jmp(0);
5013         }
5014         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5015         if(((u_int)out)&7) emit_addnop(0);
5016         #endif
5017       }
5018     }
5019     else if(nop) {
5020       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5021       int jaddr=(int)out;
5022       emit_jns(0);
5023       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5024     }
5025     else {
5026       int taken=0,nottaken=0,nottaken1=0;
5027       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5028       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5029       if(!only32)
5030       {
5031         assert(s1h>=0);
5032         if(opcode[i]==4) // BEQ
5033         {
5034           if(s2h>=0) emit_cmp(s1h,s2h);
5035           else emit_test(s1h,s1h);
5036           nottaken1=(int)out;
5037           emit_jne(1);
5038         }
5039         if(opcode[i]==5) // BNE
5040         {
5041           if(s2h>=0) emit_cmp(s1h,s2h);
5042           else emit_test(s1h,s1h);
5043           if(invert) taken=(int)out;
5044           else add_to_linker((int)out,ba[i],internal);
5045           emit_jne(0);
5046         }
5047         if(opcode[i]==6) // BLEZ
5048         {
5049           emit_test(s1h,s1h);
5050           if(invert) taken=(int)out;
5051           else add_to_linker((int)out,ba[i],internal);
5052           emit_js(0);
5053           nottaken1=(int)out;
5054           emit_jne(1);
5055         }
5056         if(opcode[i]==7) // BGTZ
5057         {
5058           emit_test(s1h,s1h);
5059           nottaken1=(int)out;
5060           emit_js(1);
5061           if(invert) taken=(int)out;
5062           else add_to_linker((int)out,ba[i],internal);
5063           emit_jne(0);
5064         }
5065       } // if(!only32)
5066
5067       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5068       assert(s1l>=0);
5069       if(opcode[i]==4) // BEQ
5070       {
5071         if(s2l>=0) emit_cmp(s1l,s2l);
5072         else emit_test(s1l,s1l);
5073         if(invert){
5074           nottaken=(int)out;
5075           emit_jne(1);
5076         }else{
5077           add_to_linker((int)out,ba[i],internal);
5078           emit_jeq(0);
5079         }
5080       }
5081       if(opcode[i]==5) // BNE
5082       {
5083         if(s2l>=0) emit_cmp(s1l,s2l);
5084         else emit_test(s1l,s1l);
5085         if(invert){
5086           nottaken=(int)out;
5087           emit_jeq(1);
5088         }else{
5089           add_to_linker((int)out,ba[i],internal);
5090           emit_jne(0);
5091         }
5092       }
5093       if(opcode[i]==6) // BLEZ
5094       {
5095         emit_cmpimm(s1l,1);
5096         if(invert){
5097           nottaken=(int)out;
5098           emit_jge(1);
5099         }else{
5100           add_to_linker((int)out,ba[i],internal);
5101           emit_jl(0);
5102         }
5103       }
5104       if(opcode[i]==7) // BGTZ
5105       {
5106         emit_cmpimm(s1l,1);
5107         if(invert){
5108           nottaken=(int)out;
5109           emit_jl(1);
5110         }else{
5111           add_to_linker((int)out,ba[i],internal);
5112           emit_jge(0);
5113         }
5114       }
5115       if(invert) {
5116         if(taken) set_jump_target(taken,(int)out);
5117         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5118         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5119           if(adj) {
5120             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5121             add_to_linker((int)out,ba[i],internal);
5122           }else{
5123             emit_addnop(13);
5124             add_to_linker((int)out,ba[i],internal*2);
5125           }
5126           emit_jmp(0);
5127         }else
5128         #endif
5129         {
5130           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5131           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5132           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5133           if(internal)
5134             assem_debug("branch: internal\n");
5135           else
5136             assem_debug("branch: external\n");
5137           if(internal&&is_ds[(ba[i]-start)>>2]) {
5138             ds_assemble_entry(i);
5139           }
5140           else {
5141             add_to_linker((int)out,ba[i],internal);
5142             emit_jmp(0);
5143           }
5144         }
5145         set_jump_target(nottaken,(int)out);
5146       }
5147
5148       if(nottaken1) set_jump_target(nottaken1,(int)out);
5149       if(adj) {
5150         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5151       }
5152     } // (!unconditional)
5153   } // if(ooo)
5154   else
5155   {
5156     // In-order execution (branch first)
5157     //if(likely[i]) printf("IOL\n");
5158     //else
5159     //printf("IOE\n");
5160     int taken=0,nottaken=0,nottaken1=0;
5161     if(!unconditional&&!nop) {
5162       if(!only32)
5163       {
5164         assert(s1h>=0);
5165         if((opcode[i]&0x2f)==4) // BEQ
5166         {
5167           if(s2h>=0) emit_cmp(s1h,s2h);
5168           else emit_test(s1h,s1h);
5169           nottaken1=(int)out;
5170           emit_jne(2);
5171         }
5172         if((opcode[i]&0x2f)==5) // BNE
5173         {
5174           if(s2h>=0) emit_cmp(s1h,s2h);
5175           else emit_test(s1h,s1h);
5176           taken=(int)out;
5177           emit_jne(1);
5178         }
5179         if((opcode[i]&0x2f)==6) // BLEZ
5180         {
5181           emit_test(s1h,s1h);
5182           taken=(int)out;
5183           emit_js(1);
5184           nottaken1=(int)out;
5185           emit_jne(2);
5186         }
5187         if((opcode[i]&0x2f)==7) // BGTZ
5188         {
5189           emit_test(s1h,s1h);
5190           nottaken1=(int)out;
5191           emit_js(2);
5192           taken=(int)out;
5193           emit_jne(1);
5194         }
5195       } // if(!only32)
5196
5197       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5198       assert(s1l>=0);
5199       if((opcode[i]&0x2f)==4) // BEQ
5200       {
5201         if(s2l>=0) emit_cmp(s1l,s2l);
5202         else emit_test(s1l,s1l);
5203         nottaken=(int)out;
5204         emit_jne(2);
5205       }
5206       if((opcode[i]&0x2f)==5) // BNE
5207       {
5208         if(s2l>=0) emit_cmp(s1l,s2l);
5209         else emit_test(s1l,s1l);
5210         nottaken=(int)out;
5211         emit_jeq(2);
5212       }
5213       if((opcode[i]&0x2f)==6) // BLEZ
5214       {
5215         emit_cmpimm(s1l,1);
5216         nottaken=(int)out;
5217         emit_jge(2);
5218       }
5219       if((opcode[i]&0x2f)==7) // BGTZ
5220       {
5221         emit_cmpimm(s1l,1);
5222         nottaken=(int)out;
5223         emit_jl(2);
5224       }
5225     } // if(!unconditional)
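    // ds_unneeded: liveness mask for the paths below; the delay slot's source
    // registers are cleared from branch_regs[i].u so the writeback/reload around
    // the delay slot keeps treating them as live.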
5226     int adj;
5227     uint64_t ds_unneeded=branch_regs[i].u;
5228     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5229     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5230     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5231     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5232     ds_unneeded|=1;
5233     ds_unneeded_upper|=1;
5234     // branch taken
5235     if(!nop) {
5236       if(taken) set_jump_target(taken,(int)out);
5237       assem_debug("1:\n");
5238       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5239                     ds_unneeded,ds_unneeded_upper);
5240       // load regs
5241       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5242       address_generation(i+1,&branch_regs[i],0);
5243       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5244       ds_assemble(i+1,&branch_regs[i]);
5245       cc=get_reg(branch_regs[i].regmap,CCREG);
5246       if(cc==-1) {
5247         emit_loadreg(CCREG,cc=HOST_CCREG);
5248         // CHECK: Is the following instruction (fall thru) allocated ok?
5249       }
5250       assert(cc==HOST_CCREG);
5251       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5252       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5253       assem_debug("cycle count (adj)\n");
5254       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5255       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5256       if(internal)
5257         assem_debug("branch: internal\n");
5258       else
5259         assem_debug("branch: external\n");
5260       if(internal&&is_ds[(ba[i]-start)>>2]) {
5261         ds_assemble_entry(i);
5262       }
5263       else {
5264         add_to_linker((int)out,ba[i],internal);
5265         emit_jmp(0);
5266       }
5267     }
5268     // branch not taken
5269     cop1_usable=prev_cop1_usable;
5270     if(!unconditional) {
5271       if(nottaken1) set_jump_target(nottaken1,(int)out);
5272       set_jump_target(nottaken,(int)out);
5273       assem_debug("2:\n");
5274       if(!likely[i]) {
5275         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5276                       ds_unneeded,ds_unneeded_upper);
5277         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5278         address_generation(i+1,&branch_regs[i],0);
5279         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5280         ds_assemble(i+1,&branch_regs[i]);
5281       }
5282       cc=get_reg(branch_regs[i].regmap,CCREG);
5283       if(cc==-1&&!likely[i]) {
5284         // Cycle count isn't in a register, temporarily load it then write it out
5285         emit_loadreg(CCREG,HOST_CCREG);
5286         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5287         int jaddr=(int)out;
5288         emit_jns(0);
5289         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5290         emit_storereg(CCREG,HOST_CCREG);
5291       }
5292       else{
5293         cc=get_reg(i_regmap,CCREG);
5294         assert(cc==HOST_CCREG);
5295         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5296         int jaddr=(int)out;
5297         emit_jns(0);
5298         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5299       }
5300     }
5301   }
5302 }
5303
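// sjump_assemble: REGIMM branches (BLTZ/BGEZ and the linking BLTZAL/BGEZAL
// forms).  With rs1==0 the sign test is constant, so the branch collapses to
// "unconditional" or "nevertaken"; the AL variants still write the return
// address into $31 either way.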
5304 void sjump_assemble(int i,struct regstat *i_regs)
5305 {
5306   signed char *i_regmap=i_regs->regmap;
5307   int cc;
5308   int match;
5309   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5310   assem_debug("smatch=%d\n",match);
5311   int s1h,s1l;
5312   int prev_cop1_usable=cop1_usable;
5313   int unconditional=0,nevertaken=0;
5314   int only32=0;
5315   int invert=0;
5316   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5317   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5318   if(!match) invert=1;
5319   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5320   if(i>(ba[i]-start)>>2) invert=1;
5321   #endif
5322
5323   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5324   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5325
5326   if(ooo[i]) {
5327     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5328     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5329   }
5330   else {
5331     s1l=get_reg(i_regmap,rs1[i]);
5332     s1h=get_reg(i_regmap,rs1[i]|64);
5333   }
5334   if(rs1[i]==0)
5335   {
5336     if(opcode2[i]&1) unconditional=1;
5337     else nevertaken=1;
5338     // BLTZ/BLTZAL on r0 are never taken (r0 is never negative); BGEZ/BGEZAL are always taken
5339     //assert(opcode2[i]!=0);
5340     //assert(opcode2[i]!=2);
5341     //assert(opcode2[i]!=0x10);
5342     //assert(opcode2[i]!=0x12);
5343   }
5344   else {
5345     only32=(regs[i].was32>>rs1[i])&1;
5346   }
5347
5348   if(ooo[i]) {
5349     // Out of order execution (delay slot first)
5350     //printf("OOOE\n");
5351     address_generation(i+1,i_regs,regs[i].regmap_entry);
5352     ds_assemble(i+1,i_regs);
5353     int adj;
5354     uint64_t bc_unneeded=branch_regs[i].u;
5355     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5356     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5357     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5358     bc_unneeded|=1;
5359     bc_unneeded_upper|=1;
5360     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5361                   bc_unneeded,bc_unneeded_upper);
5362     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5363     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5364     if(rt1[i]==31) {
5365       int rt,return_address;
5366       rt=get_reg(branch_regs[i].regmap,31);
5367       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5368       if(rt>=0) {
5369         // Save the PC even if the branch is not taken
5370         return_address=start+i*4+8;
5371         emit_movimm(return_address,rt); // PC into link register
5372         #ifdef IMM_PREFETCH
5373         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5374         #endif
5375       }
5376     }
5377     cc=get_reg(branch_regs[i].regmap,CCREG);
5378     assert(cc==HOST_CCREG);
5379     if(unconditional)
5380       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5381     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5382     assem_debug("cycle count (adj)\n");
5383     if(unconditional) {
5384       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5385       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5386         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5387         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5388         if(internal)
5389           assem_debug("branch: internal\n");
5390         else
5391           assem_debug("branch: external\n");
5392         if(internal&&is_ds[(ba[i]-start)>>2]) {
5393           ds_assemble_entry(i);
5394         }
5395         else {
5396           add_to_linker((int)out,ba[i],internal);
5397           emit_jmp(0);
5398         }
5399         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5400         if(((u_int)out)&7) emit_addnop(0);
5401         #endif
5402       }
5403     }
5404     else if(nevertaken) {
5405       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5406       int jaddr=(int)out;
5407       emit_jns(0);
5408       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5409     }
5410     else {
5411       int nottaken=0;
5412       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5413       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5414       if(!only32)
5415       {
5416         assert(s1h>=0);
5417         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5418         {
5419           emit_test(s1h,s1h);
5420           if(invert){
5421             nottaken=(int)out;
5422             emit_jns(1);
5423           }else{
5424             add_to_linker((int)out,ba[i],internal);
5425             emit_js(0);
5426           }
5427         }
5428         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5429         {
5430           emit_test(s1h,s1h);
5431           if(invert){
5432             nottaken=(int)out;
5433             emit_js(1);
5434           }else{
5435             add_to_linker((int)out,ba[i],internal);
5436             emit_jns(0);
5437           }
5438         }
5439       } // if(!only32)
5440       else
5441       {
5442         assert(s1l>=0);
5443         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5444         {
5445           emit_test(s1l,s1l);
5446           if(invert){
5447             nottaken=(int)out;
5448             emit_jns(1);
5449           }else{
5450             add_to_linker((int)out,ba[i],internal);
5451             emit_js(0);
5452           }
5453         }
5454         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5455         {
5456           emit_test(s1l,s1l);
5457           if(invert){
5458             nottaken=(int)out;
5459             emit_js(1);
5460           }else{
5461             add_to_linker((int)out,ba[i],internal);
5462             emit_jns(0);
5463           }
5464         }
5465       } // if(!only32)
5466
5467       if(invert) {
5468         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5469         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5470           if(adj) {
5471             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5472             add_to_linker((int)out,ba[i],internal);
5473           }else{
5474             emit_addnop(13);
5475             add_to_linker((int)out,ba[i],internal*2);
5476           }
5477           emit_jmp(0);
5478         }else
5479         #endif
5480         {
5481           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5482           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5483           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5484           if(internal)
5485             assem_debug("branch: internal\n");
5486           else
5487             assem_debug("branch: external\n");
5488           if(internal&&is_ds[(ba[i]-start)>>2]) {
5489             ds_assemble_entry(i);
5490           }
5491           else {
5492             add_to_linker((int)out,ba[i],internal);
5493             emit_jmp(0);
5494           }
5495         }
5496         set_jump_target(nottaken,(int)out);
5497       }
5498
5499       if(adj) {
5500         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5501       }
5502     } // (!unconditional)
5503   } // if(ooo)
5504   else
5505   {
5506     // In-order execution (branch first)
5507     //printf("IOE\n");
5508     int nottaken=0;
5509     if(rt1[i]==31) {
5510       int rt,return_address;
5511       rt=get_reg(branch_regs[i].regmap,31);
5512       if(rt>=0) {
5513         // Save the PC even if the branch is not taken
5514         return_address=start+i*4+8;
5515         emit_movimm(return_address,rt); // PC into link register
5516         #ifdef IMM_PREFETCH
5517         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5518         #endif
5519       }
5520     }
5521     if(!unconditional) {
5522       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5523       if(!only32)
5524       {
5525         assert(s1h>=0);
5526         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5527         {
5528           emit_test(s1h,s1h);
5529           nottaken=(int)out;
5530           emit_jns(1);
5531         }
5532         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5533         {
5534           emit_test(s1h,s1h);
5535           nottaken=(int)out;
5536           emit_js(1);
5537         }
5538       } // if(!only32)
5539       else
5540       {
5541         assert(s1l>=0);
5542         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5543         {
5544           emit_test(s1l,s1l);
5545           nottaken=(int)out;
5546           emit_jns(1);
5547         }
5548         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5549         {
5550           emit_test(s1l,s1l);
5551           nottaken=(int)out;
5552           emit_js(1);
5553         }
5554       }
5555     } // if(!unconditional)
5556     int adj;
5557     uint64_t ds_unneeded=branch_regs[i].u;
5558     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5559     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5560     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5561     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5562     ds_unneeded|=1;
5563     ds_unneeded_upper|=1;
5564     // branch taken
5565     if(!nevertaken) {
5566       //assem_debug("1:\n");
5567       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5568                     ds_unneeded,ds_unneeded_upper);
5569       // load regs
5570       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5571       address_generation(i+1,&branch_regs[i],0);
5572       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5573       ds_assemble(i+1,&branch_regs[i]);
5574       cc=get_reg(branch_regs[i].regmap,CCREG);
5575       if(cc==-1) {
5576         emit_loadreg(CCREG,cc=HOST_CCREG);
5577         // CHECK: Is the following instruction (fall thru) allocated ok?
5578       }
5579       assert(cc==HOST_CCREG);
5580       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5581       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5582       assem_debug("cycle count (adj)\n");
5583       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5584       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5585       if(internal)
5586         assem_debug("branch: internal\n");
5587       else
5588         assem_debug("branch: external\n");
5589       if(internal&&is_ds[(ba[i]-start)>>2]) {
5590         ds_assemble_entry(i);
5591       }
5592       else {
5593         add_to_linker((int)out,ba[i],internal);
5594         emit_jmp(0);
5595       }
5596     }
5597     // branch not taken
5598     cop1_usable=prev_cop1_usable;
5599     if(!unconditional) {
5600       set_jump_target(nottaken,(int)out);
5601       assem_debug("1:\n");
5602       if(!likely[i]) {
5603         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5604                       ds_unneeded,ds_unneeded_upper);
5605         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5606         address_generation(i+1,&branch_regs[i],0);
5607         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5608         ds_assemble(i+1,&branch_regs[i]);
5609       }
5610       cc=get_reg(branch_regs[i].regmap,CCREG);
5611       if(cc==-1&&!likely[i]) {
5612         // Cycle count isn't in a register, temporarily load it then write it out
5613         emit_loadreg(CCREG,HOST_CCREG);
5614         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5615         int jaddr=(int)out;
5616         emit_jns(0);
5617         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5618         emit_storereg(CCREG,HOST_CCREG);
5619       }
5620       else{
5621         cc=get_reg(i_regmap,CCREG);
5622         assert(cc==HOST_CCREG);
5623         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5624         int jaddr=(int)out;
5625         emit_jns(0);
5626         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5627       }
5628     }
5629   }
5630 }
5631
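// fjump_assemble: COP1 condition branches (BC1F/BC1T).  FSREG is assumed to
// cache the FP control/status word; testing bit 23 (0x800000) reads the
// condition flag, and bit 16 of the instruction word selects BC1T vs BC1F.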
5632 void fjump_assemble(int i,struct regstat *i_regs)
5633 {
5634   signed char *i_regmap=i_regs->regmap;
5635   int cc;
5636   int match;
5637   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5638   assem_debug("fmatch=%d\n",match);
5639   int fs,cs;
5640   int eaddr;
5641   int invert=0;
5642   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5643   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5644   if(!match) invert=1;
5645   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5646   if(i>(ba[i]-start)>>2) invert=1;
5647   #endif
5648
5649   if(ooo[i]) {
5650     fs=get_reg(branch_regs[i].regmap,FSREG);
5651     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5652   }
5653   else {
5654     fs=get_reg(i_regmap,FSREG);
5655   }
5656
5657   // Check cop1 unusable
5658   if(!cop1_usable) {
5659     cs=get_reg(i_regmap,CSREG);
5660     assert(cs>=0);
5661     emit_testimm(cs,0x20000000);
5662     eaddr=(int)out;
5663     emit_jeq(0);
5664     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
5665     cop1_usable=1;
5666   }
5667
5668   if(ooo[i]) {
5669     // Out of order execution (delay slot first)
5670     //printf("OOOE\n");
5671     ds_assemble(i+1,i_regs);
5672     int adj;
5673     uint64_t bc_unneeded=branch_regs[i].u;
5674     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5675     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5676     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5677     bc_unneeded|=1;
5678     bc_unneeded_upper|=1;
5679     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5680                   bc_unneeded,bc_unneeded_upper);
5681     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5682     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5683     cc=get_reg(branch_regs[i].regmap,CCREG);
5684     assert(cc==HOST_CCREG);
5685     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5686     assem_debug("cycle count (adj)\n");
5687     if(1) {
5688       int nottaken=0;
5689       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5690       if(1) {
5691         assert(fs>=0);
5692         emit_testimm(fs,0x800000);
5693         if(source[i]&0x10000) // BC1T
5694         {
5695           if(invert){
5696             nottaken=(int)out;
5697             emit_jeq(1);
5698           }else{
5699             add_to_linker((int)out,ba[i],internal);
5700             emit_jne(0);
5701           }
5702         }
5703         else // BC1F
5704         {
5705           if(invert){
5706             nottaken=(int)out;
5707             emit_jne(1);
5708           }else{
5709             add_to_linker((int)out,ba[i],internal);
5710             emit_jeq(0);
5711           }
5712         }
5713       } // if(!only32)
5714
5715       if(invert) {
5716         if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5717         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5718         else if(match) emit_addnop(13);
5719         #endif
5720         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5721         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5722         if(internal)
5723           assem_debug("branch: internal\n");
5724         else
5725           assem_debug("branch: external\n");
5726         if(internal&&is_ds[(ba[i]-start)>>2]) {
5727           ds_assemble_entry(i);
5728         }
5729         else {
5730           add_to_linker((int)out,ba[i],internal);
5731           emit_jmp(0);
5732         }
5733         set_jump_target(nottaken,(int)out);
5734       }
5735
5736       if(adj) {
5737         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5738       }
5739     } // (!unconditional)
5740   } // if(ooo)
5741   else
5742   {
5743     // In-order execution (branch first)
5744     //printf("IOE\n");
5745     int nottaken=0;
5746     if(1) {
5747       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5748       if(1) {
5749         assert(fs>=0);
5750         emit_testimm(fs,0x800000);
5751         if(source[i]&0x10000) // BC1T
5752         {
5753           nottaken=(int)out;
5754           emit_jeq(1);
5755         }
5756         else // BC1F
5757         {
5758           nottaken=(int)out;
5759           emit_jne(1);
5760         }
5761       }
5762     } // if(!unconditional)
5763     int adj;
5764     uint64_t ds_unneeded=branch_regs[i].u;
5765     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5766     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5767     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5768     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5769     ds_unneeded|=1;
5770     ds_unneeded_upper|=1;
5771     // branch taken
5772     //assem_debug("1:\n");
5773     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5774                   ds_unneeded,ds_unneeded_upper);
5775     // load regs
5776     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5777     address_generation(i+1,&branch_regs[i],0);
5778     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5779     ds_assemble(i+1,&branch_regs[i]);
5780     cc=get_reg(branch_regs[i].regmap,CCREG);
5781     if(cc==-1) {
5782       emit_loadreg(CCREG,cc=HOST_CCREG);
5783       // CHECK: Is the following instruction (fall thru) allocated ok?
5784     }
5785     assert(cc==HOST_CCREG);
5786     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5787     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5788     assem_debug("cycle count (adj)\n");
5789     if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5790     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5791     if(internal)
5792       assem_debug("branch: internal\n");
5793     else
5794       assem_debug("branch: external\n");
5795     if(internal&&is_ds[(ba[i]-start)>>2]) {
5796       ds_assemble_entry(i);
5797     }
5798     else {
5799       add_to_linker((int)out,ba[i],internal);
5800       emit_jmp(0);
5801     }
5802
5803     // branch not taken
5804     if(1) { // <- FIXME (don't need this)
5805       set_jump_target(nottaken,(int)out);
5806       assem_debug("1:\n");
5807       if(!likely[i]) {
5808         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5809                       ds_unneeded,ds_unneeded_upper);
5810         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5811         address_generation(i+1,&branch_regs[i],0);
5812         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5813         ds_assemble(i+1,&branch_regs[i]);
5814       }
5815       cc=get_reg(branch_regs[i].regmap,CCREG);
5816       if(cc==-1&&!likely[i]) {
5817         // Cycle count isn't in a register, temporarily load it then write it out
5818         emit_loadreg(CCREG,HOST_CCREG);
5819         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5820         int jaddr=(int)out;
5821         emit_jns(0);
5822         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5823         emit_storereg(CCREG,HOST_CCREG);
5824       }
5825       else{
5826         cc=get_reg(i_regmap,CCREG);
5827         assert(cc==HOST_CCREG);
5828         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5829         int jaddr=(int)out;
5830         emit_jns(0);
5831         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5832       }
5833     }
5834   }
5835 }
5836
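// pagespan_assemble: branch whose delay slot falls on the next virtual page.
// The taken / not-taken target is computed into a scratch register and moved to
// HOST_BTREG, then control leaves through an external jump stub; the stub
// target is the delay-slot address with bit 0 set, apparently matching the
// start+1 entry that pagespan_ds registers for the following page.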
5837 static void pagespan_assemble(int i,struct regstat *i_regs)
5838 {
5839   int s1l=get_reg(i_regs->regmap,rs1[i]);
5840   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
5841   int s2l=get_reg(i_regs->regmap,rs2[i]);
5842   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
5843   int taken=0;
5844   int nottaken=0;
5845   int unconditional=0;
5846   if(rs1[i]==0)
5847   {
5848     s1l=s2l;s1h=s2h;
5849     s2l=s2h=-1;
5850   }
5851   else if(rs2[i]==0)
5852   {
5853     s2l=s2h=-1;
5854   }
5855   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
5856     s1h=s2h=-1;
5857   }
5858   int hr=0;
5859   int addr=-1,alt=-1,ntaddr=-1;
5860   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
5861   else {
5862     while(hr<HOST_REGS)
5863     {
5864       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5865          (i_regs->regmap[hr]&63)!=rs1[i] &&
5866          (i_regs->regmap[hr]&63)!=rs2[i] )
5867       {
5868         addr=hr++;break;
5869       }
5870       hr++;
5871     }
5872   }
5873   while(hr<HOST_REGS)
5874   {
5875     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5876        (i_regs->regmap[hr]&63)!=rs1[i] &&
5877        (i_regs->regmap[hr]&63)!=rs2[i] )
5878     {
5879       alt=hr++;break;
5880     }
5881     hr++;
5882   }
5883   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5884   {
5885     while(hr<HOST_REGS)
5886     {
5887       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5888          (i_regs->regmap[hr]&63)!=rs1[i] &&
5889          (i_regs->regmap[hr]&63)!=rs2[i] )
5890       {
5891         ntaddr=hr;break;
5892       }
5893       hr++;
5894     }
5895   }
5896   assert(hr<HOST_REGS);
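  // addr/alt/ntaddr are scratch host registers chosen above (avoiding CCREG,
  // BTREG and the branch operands); they hold the candidate taken and
  // not-taken target addresses built below.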
5897   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
5898     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
5899   }
5900   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5901   if(opcode[i]==2) // J
5902   {
5903     unconditional=1;
5904   }
5905   if(opcode[i]==3) // JAL
5906   {
5907     // TODO: mini_ht
5908     int rt=get_reg(i_regs->regmap,31);
5909     emit_movimm(start+i*4+8,rt);
5910     unconditional=1;
5911   }
5912   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
5913   {
5914     emit_mov(s1l,addr);
5915     if(opcode2[i]==9) // JALR
5916     {
5917       int rt=get_reg(i_regs->regmap,rt1[i]);
5918       emit_movimm(start+i*4+8,rt);
5919     }
5920   }
5921   if((opcode[i]&0x3f)==4) // BEQ
5922   {
5923     if(rs1[i]==rs2[i])
5924     {
5925       unconditional=1;
5926     }
5927     else
5928     #ifdef HAVE_CMOV_IMM
5929     if(s1h<0) {
5930       if(s2l>=0) emit_cmp(s1l,s2l);
5931       else emit_test(s1l,s1l);
5932       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
5933     }
5934     else
5935     #endif
5936     {
5937       assert(s1l>=0);
5938       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5939       if(s1h>=0) {
5940         if(s2h>=0) emit_cmp(s1h,s2h);
5941         else emit_test(s1h,s1h);
5942         emit_cmovne_reg(alt,addr);
5943       }
5944       if(s2l>=0) emit_cmp(s1l,s2l);
5945       else emit_test(s1l,s1l);
5946       emit_cmovne_reg(alt,addr);
5947     }
5948   }
5949   if((opcode[i]&0x3f)==5) // BNE
5950   {
5951     #ifdef HAVE_CMOV_IMM
5952     if(s1h<0) {
5953       if(s2l>=0) emit_cmp(s1l,s2l);
5954       else emit_test(s1l,s1l);
5955       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
5956     }
5957     else
5958     #endif
5959     {
5960       assert(s1l>=0);
5961       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5962       if(s1h>=0) {
5963         if(s2h>=0) emit_cmp(s1h,s2h);
5964         else emit_test(s1h,s1h);
5965         emit_cmovne_reg(alt,addr);
5966       }
5967       if(s2l>=0) emit_cmp(s1l,s2l);
5968       else emit_test(s1l,s1l);
5969       emit_cmovne_reg(alt,addr);
5970     }
5971   }
5972   if((opcode[i]&0x3f)==0x14) // BEQL
5973   {
5974     if(s1h>=0) {
5975       if(s2h>=0) emit_cmp(s1h,s2h);
5976       else emit_test(s1h,s1h);
5977       nottaken=(int)out;
5978       emit_jne(0);
5979     }
5980     if(s2l>=0) emit_cmp(s1l,s2l);
5981     else emit_test(s1l,s1l);
5982     if(nottaken) set_jump_target(nottaken,(int)out);
5983     nottaken=(int)out;
5984     emit_jne(0);
5985   }
5986   if((opcode[i]&0x3f)==0x15) // BNEL
5987   {
5988     if(s1h>=0) {
5989       if(s2h>=0) emit_cmp(s1h,s2h);
5990       else emit_test(s1h,s1h);
5991       taken=(int)out;
5992       emit_jne(0);
5993     }
5994     if(s2l>=0) emit_cmp(s1l,s2l);
5995     else emit_test(s1l,s1l);
5996     nottaken=(int)out;
5997     emit_jeq(0);
5998     if(taken) set_jump_target(taken,(int)out);
5999   }
6000   if((opcode[i]&0x3f)==6) // BLEZ
6001   {
6002     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6003     emit_cmpimm(s1l,1);
6004     if(s1h>=0) emit_mov(addr,ntaddr);
6005     emit_cmovl_reg(alt,addr);
6006     if(s1h>=0) {
6007       emit_test(s1h,s1h);
6008       emit_cmovne_reg(ntaddr,addr);
6009       emit_cmovs_reg(alt,addr);
6010     }
6011   }
6012   if((opcode[i]&0x3f)==7) // BGTZ
6013   {
6014     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6015     emit_cmpimm(s1l,1);
6016     if(s1h>=0) emit_mov(addr,alt);
6017     emit_cmovl_reg(ntaddr,addr);
6018     if(s1h>=0) {
6019       emit_test(s1h,s1h);
6020       emit_cmovne_reg(alt,addr);
6021       emit_cmovs_reg(ntaddr,addr);
6022     }
6023   }
6024   if((opcode[i]&0x3f)==0x16) // BLEZL
6025   {
6026     assert((opcode[i]&0x3f)!=0x16);
6027   }
6028   if((opcode[i]&0x3f)==0x17) // BGTZL
6029   {
6030     assert((opcode[i]&0x3f)!=0x17);
6031   }
6032   assert(opcode[i]!=1); // BLTZ/BGEZ
6033
6034   //FIXME: Check CSREG
6035   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6036     if((source[i]&0x30000)==0) // BC1F
6037     {
6038       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6039       emit_testimm(s1l,0x800000);
6040       emit_cmovne_reg(alt,addr);
6041     }
6042     if((source[i]&0x30000)==0x10000) // BC1T
6043     {
6044       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6045       emit_testimm(s1l,0x800000);
6046       emit_cmovne_reg(alt,addr);
6047     }
6048     if((source[i]&0x30000)==0x20000) // BC1FL
6049     {
6050       emit_testimm(s1l,0x800000);
6051       nottaken=(int)out;
6052       emit_jne(0);
6053     }
6054     if((source[i]&0x30000)==0x30000) // BC1TL
6055     {
6056       emit_testimm(s1l,0x800000);
6057       nottaken=(int)out;
6058       emit_jeq(0);
6059     }
6060   }
6061
6062   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6063   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6064   if(likely[i]||unconditional)
6065   {
6066     emit_movimm(ba[i],HOST_BTREG);
6067   }
6068   else if(addr!=HOST_BTREG)
6069   {
6070     emit_mov(addr,HOST_BTREG);
6071   }
6072   void *branch_addr=out;
6073   emit_jmp(0);
6074   int target_addr=start+i*4+5;
6075   void *stub=out;
6076   void *compiled_target_addr=check_addr(target_addr);
6077   emit_extjump_ds((int)branch_addr,target_addr);
6078   if(compiled_target_addr) {
6079     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6080     add_link(target_addr,stub);
6081   }
6082   else set_jump_target((int)branch_addr,(int)stub);
6083   if(likely[i]) {
6084     // Not-taken path
6085     set_jump_target((int)nottaken,(int)out);
6086     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6087     void *branch_addr=out;
6088     emit_jmp(0);
6089     int target_addr=start+i*4+8;
6090     void *stub=out;
6091     void *compiled_target_addr=check_addr(target_addr);
6092     emit_extjump_ds((int)branch_addr,target_addr);
6093     if(compiled_target_addr) {
6094       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6095       add_link(target_addr,stub);
6096     }
6097     else set_jump_target((int)branch_addr,(int)stub);
6098   }
6099 }
6100
6101 // Assemble the delay slot for the above
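// It is registered under vaddr start+1 (bit 0 set) so a page-spanning branch
// can enter here, execute the delay slot, then dispatch on branch_target:
// fall through to start+4 or jump to the stored target via jump_vaddr_reg.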
6102 static void pagespan_ds()
6103 {
6104   assem_debug("initial delay slot:\n");
6105   u_int vaddr=start+1;
6106   u_int page=get_page(vaddr);
6107   u_int vpage=get_vpage(vaddr);
6108   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6109   do_dirty_stub_ds();
6110   ll_add(jump_in+page,vaddr,(void *)out);
6111   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6112   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6113     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6114   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6115     emit_writeword(HOST_BTREG,(int)&branch_target);
6116   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6117   address_generation(0,&regs[0],regs[0].regmap_entry);
6118   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6119     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6120   cop1_usable=0;
6121   is_delayslot=0;
6122   switch(itype[0]) {
6123     case ALU:
6124       alu_assemble(0,&regs[0]);break;
6125     case IMM16:
6126       imm16_assemble(0,&regs[0]);break;
6127     case SHIFT:
6128       shift_assemble(0,&regs[0]);break;
6129     case SHIFTIMM:
6130       shiftimm_assemble(0,&regs[0]);break;
6131     case LOAD:
6132       load_assemble(0,&regs[0]);break;
6133     case LOADLR:
6134       loadlr_assemble(0,&regs[0]);break;
6135     case STORE:
6136       store_assemble(0,&regs[0]);break;
6137     case STORELR:
6138       storelr_assemble(0,&regs[0]);break;
6139     case COP0:
6140       cop0_assemble(0,&regs[0]);break;
6141     case COP1:
6142       cop1_assemble(0,&regs[0]);break;
6143     case C1LS:
6144       c1ls_assemble(0,&regs[0]);break;
6145     case COP2:
6146       cop2_assemble(0,&regs[0]);break;
6147     case C2LS:
6148       c2ls_assemble(0,&regs[0]);break;
6149     case C2OP:
6150       c2op_assemble(0,&regs[0]);break;
6151     case FCONV:
6152       fconv_assemble(0,&regs[0]);break;
6153     case FLOAT:
6154       float_assemble(0,&regs[0]);break;
6155     case FCOMP:
6156       fcomp_assemble(0,&regs[0]);break;
6157     case MULTDIV:
6158       multdiv_assemble(0,&regs[0]);break;
6159     case MOV:
6160       mov_assemble(0,&regs[0]);break;
6161     case SYSCALL:
6162     case HLECALL:
6163     case INTCALL:
6164     case SPAN:
6165     case UJUMP:
6166     case RJUMP:
6167     case CJUMP:
6168     case SJUMP:
6169     case FJUMP:
6170       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
6171   }
6172   int btaddr=get_reg(regs[0].regmap,BTREG);
6173   if(btaddr<0) {
6174     btaddr=get_reg(regs[0].regmap,-1);
6175     emit_readword((int)&branch_target,btaddr);
6176   }
6177   assert(btaddr!=HOST_CCREG);
6178   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6179 #ifdef HOST_IMM8
6180   emit_movimm(start+4,HOST_TEMPREG);
6181   emit_cmp(btaddr,HOST_TEMPREG);
6182 #else
6183   emit_cmpimm(btaddr,start+4);
6184 #endif
6185   int branch=(int)out;
6186   emit_jeq(0);
6187   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6188   emit_jmp(jump_vaddr_reg[btaddr]);
6189   set_jump_target(branch,(int)out);
6190   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6191   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6192 }
6193
6194 // Basic liveness analysis for MIPS registers
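// Each bit of u/uu stands for one MIPS register (uu covers the upper halves of
// 64-bit values); a set bit means that register's current value is dead here.
// Walking backwards, a write marks the old value unneeded (set the bit) and a
// read marks it needed (clear the bit).  Rough example: for "addu $t0,$a0,$a1"
// the $t0 bit is set first, then the $a0 and $a1 bits are cleared.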
6195 void unneeded_registers(int istart,int iend,int r)
6196 {
6197   int i;
6198   uint64_t u,uu,gte_u,b,bu,gte_bu;
6199   uint64_t temp_u,temp_uu,temp_gte_u=0;
6200   uint64_t tdep;
6201   uint64_t gte_u_unknown=0;
6202   if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
6203     gte_u_unknown=~0ll;
6204   if(iend==slen-1) {
6205     u=1;uu=1;
6206     gte_u=gte_u_unknown;
6207   }else{
6208     u=unneeded_reg[iend+1];
6209     uu=unneeded_reg_upper[iend+1];
6210     u=1;uu=1;
6211     gte_u=gte_unneeded[iend+1];
6212   }
6213
6214   for (i=iend;i>=istart;i--)
6215   {
6216     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6217     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6218     {
6219       // If subroutine call, flag return address as a possible branch target
6220       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6221
6222       if(ba[i]<start || ba[i]>=(start+slen*4))
6223       {
6224         // Branch out of this block, flush all regs
6225         u=1;
6226         uu=1;
6227         gte_u=gte_u_unknown;
6228         /* Hexagon hack
6229         if(itype[i]==UJUMP&&rt1[i]==31)
6230         {
6231           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6232         }
6233         if(itype[i]==RJUMP&&rs1[i]==31)
6234         {
6235           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6236         }
6237         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6238           if(itype[i]==UJUMP&&rt1[i]==31)
6239           {
6240             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6241             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6242           }
6243           if(itype[i]==RJUMP&&rs1[i]==31)
6244           {
6245             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6246             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6247           }
6248         }*/
6249         branch_unneeded_reg[i]=u;
6250         branch_unneeded_reg_upper[i]=uu;
6251         // Merge in delay slot
6252         tdep=(~uu>>rt1[i+1])&1;
6253         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6254         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6255         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6256         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6257         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6258         u|=1;uu|=1;
6259         gte_u|=gte_rt[i+1];
6260         gte_u&=~gte_rs[i+1];
6261         // If branch is "likely" (and conditional)
6262         // then we skip the delay slot on the fall-thru path
6263         if(likely[i]) {
6264           if(i<slen-1) {
6265             u&=unneeded_reg[i+2];
6266             uu&=unneeded_reg_upper[i+2];
6267             gte_u&=gte_unneeded[i+2];
6268           }
6269           else
6270           {
6271             u=1;
6272             uu=1;
6273             gte_u=gte_u_unknown;
6274           }
6275         }
6276       }
6277       else
6278       {
6279         // Internal branch, flag target
6280         bt[(ba[i]-start)>>2]=1;
6281         if(ba[i]<=start+i*4) {
6282           // Backward branch
6283           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6284           {
6285             // Unconditional branch
6286             temp_u=1;temp_uu=1;
6287             temp_gte_u=0;
6288           } else {
6289             // Conditional branch (not taken case)
6290             temp_u=unneeded_reg[i+2];
6291             temp_uu=unneeded_reg_upper[i+2];
6292             temp_gte_u&=gte_unneeded[i+2];
6293           }
6294           // Merge in delay slot
6295           tdep=(~temp_uu>>rt1[i+1])&1;
6296           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6297           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6298           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6299           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6300           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6301           temp_u|=1;temp_uu|=1;
6302           temp_gte_u|=gte_rt[i+1];
6303           temp_gte_u&=~gte_rs[i+1];
6304           // If branch is "likely" (and conditional)
6305           // then we skip the delay slot on the fall-thru path
6306           if(likely[i]) {
6307             if(i<slen-1) {
6308               temp_u&=unneeded_reg[i+2];
6309               temp_uu&=unneeded_reg_upper[i+2];
6310               temp_gte_u&=gte_unneeded[i+2];
6311             }
6312             else
6313             {
6314               temp_u=1;
6315               temp_uu=1;
6316               temp_gte_u=gte_u_unknown;
6317             }
6318           }
6319           tdep=(~temp_uu>>rt1[i])&1;
6320           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6321           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6322           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6323           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6324           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6325           temp_u|=1;temp_uu|=1;
6326           temp_gte_u|=gte_rt[i];
6327           temp_gte_u&=~gte_rs[i];
6328           unneeded_reg[i]=temp_u;
6329           unneeded_reg_upper[i]=temp_uu;
6330           gte_unneeded[i]=temp_gte_u;
6331           // Only go three levels deep.  This recursion can take an
6332           // excessive amount of time if there are a lot of nested loops.
6333           if(r<2) {
6334             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6335           }else{
6336             unneeded_reg[(ba[i]-start)>>2]=1;
6337             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6338             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
6339           }
6340         } /*else*/ if(1) {
6341           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6342           {
6343             // Unconditional branch
6344             u=unneeded_reg[(ba[i]-start)>>2];
6345             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6346             gte_u=gte_unneeded[(ba[i]-start)>>2];
6347             branch_unneeded_reg[i]=u;
6348             branch_unneeded_reg_upper[i]=uu;
6349         //u=1;
6350         //uu=1;
6351         //branch_unneeded_reg[i]=u;
6352         //branch_unneeded_reg_upper[i]=uu;
6353             // Merge in delay slot
6354             tdep=(~uu>>rt1[i+1])&1;
6355             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6356             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6357             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6358             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6359             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6360             u|=1;uu|=1;
6361             gte_u|=gte_rt[i+1];
6362             gte_u&=~gte_rs[i+1];
6363           } else {
6364             // Conditional branch
6365             b=unneeded_reg[(ba[i]-start)>>2];
6366             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6367             gte_bu=gte_unneeded[(ba[i]-start)>>2];
6368             branch_unneeded_reg[i]=b;
6369             branch_unneeded_reg_upper[i]=bu;
6370         //b=1;
6371         //bu=1;
6372         //branch_unneeded_reg[i]=b;
6373         //branch_unneeded_reg_upper[i]=bu;
6374             // Branch delay slot
6375             tdep=(~uu>>rt1[i+1])&1;
6376             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6377             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6378             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6379             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6380             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6381             b|=1;bu|=1;
6382             gte_bu|=gte_rt[i+1];
6383             gte_bu&=~gte_rs[i+1];
6384             // If branch is "likely" then we skip the
6385             // delay slot on the fall-thru path
6386             if(likely[i]) {
6387               u=b;
6388               uu=bu;
6389               gte_u=gte_bu;
6390               if(i<slen-1) {
6391                 u&=unneeded_reg[i+2];
6392                 uu&=unneeded_reg_upper[i+2];
6393                 gte_u&=gte_unneeded[i+2];
6394         //u=1;
6395         //uu=1;
6396               }
6397             } else {
6398               u&=b;
6399               uu&=bu;
6400               gte_u&=gte_bu;
6401         //u=1;
6402         //uu=1;
6403             }
6404             if(i<slen-1) {
6405               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6406               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6407         //branch_unneeded_reg[i]=1;
6408         //branch_unneeded_reg_upper[i]=1;
6409             } else {
6410               branch_unneeded_reg[i]=1;
6411               branch_unneeded_reg_upper[i]=1;
6412             }
6413           }
6414         }
6415       }
6416     }
6417     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6418     {
6419       // SYSCALL instruction (software interrupt)
6420       u=1;
6421       uu=1;
6422     }
6423     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6424     {
6425       // ERET instruction (return from interrupt)
6426       u=1;
6427       uu=1;
6428     }
6429     //u=uu=1; // DEBUG
6430     tdep=(~uu>>rt1[i])&1;
6431     // Written registers are unneeded
6432     u|=1LL<<rt1[i];
6433     u|=1LL<<rt2[i];
6434     uu|=1LL<<rt1[i];
6435     uu|=1LL<<rt2[i];
6436     gte_u|=gte_rt[i];
6437     // Accessed registers are needed
6438     u&=~(1LL<<rs1[i]);
6439     u&=~(1LL<<rs2[i]);
6440     uu&=~(1LL<<us1[i]);
6441     uu&=~(1LL<<us2[i]);
6442     gte_u&=~gte_rs[i];
6443     if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
6444       gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
6445     // Source-target dependencies
6446     uu&=~(tdep<<dep1[i]);
6447     uu&=~(tdep<<dep2[i]);
6448     // R0 is always unneeded
6449     u|=1;uu|=1;
6450     // Save it
6451     unneeded_reg[i]=u;
6452     unneeded_reg_upper[i]=uu;
6453     gte_unneeded[i]=gte_u;
6454     /*
6455     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6456     printf("U:");
6457     int r;
6458     for(r=1;r<=CCREG;r++) {
6459       if((unneeded_reg[i]>>r)&1) {
6460         if(r==HIREG) printf(" HI");
6461         else if(r==LOREG) printf(" LO");
6462         else printf(" r%d",r);
6463       }
6464     }
6465     printf(" UU:");
6466     for(r=1;r<=CCREG;r++) {
6467       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6468         if(r==HIREG) printf(" HI");
6469         else if(r==LOREG) printf(" LO");
6470         else printf(" r%d",r);
6471       }
6472     }
6473     printf("\n");*/
6474   }
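  // The PSX R3000A has no 64-bit registers, which is presumably why every
  // upper half is forced to "unneeded" below regardless of the analysis above.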
6475   for (i=iend;i>=istart;i--)
6476   {
6477     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
6478   }
6479 }
6480
6481 // Write back dirty registers as soon as we will no longer modify them,
6482 // so that we don't end up with lots of writes at the branches.
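// Roughly: will_dirty/wont_dirty are per-host-register masks propagated
// backwards through the block, telling each point whether a dirty value still
// has a write coming up (keep postponing the store) or not (flush it now).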
6483 void clean_registers(int istart,int iend,int wr)
6484 {
6485   int i;
6486   int r;
6487   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6488   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6489   if(iend==slen-1) {
6490     will_dirty_i=will_dirty_next=0;
6491     wont_dirty_i=wont_dirty_next=0;
6492   }else{
6493     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6494     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6495   }
6496   for (i=iend;i>=istart;i--)
6497   {
6498     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6499     {
6500       if(ba[i]<start || ba[i]>=(start+slen*4))
6501       {
6502         // Branch out of this block, flush all regs
6503         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6504         {
6505           // Unconditional branch
6506           will_dirty_i=0;
6507           wont_dirty_i=0;
6508           // Merge in delay slot (will dirty)
6509           for(r=0;r<HOST_REGS;r++) {
6510             if(r!=EXCLUDE_REG) {
6511               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6512               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6513               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6514               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6515               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6516               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6517               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6518               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6519               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6520               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6521               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6522               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6523               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6524               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6525             }
6526           }
6527         }
6528         else
6529         {
6530           // Conditional branch
6531           will_dirty_i=0;
6532           wont_dirty_i=wont_dirty_next;
6533           // Merge in delay slot (will dirty)
6534           for(r=0;r<HOST_REGS;r++) {
6535             if(r!=EXCLUDE_REG) {
6536               if(!likely[i]) {
6537                 // Might not dirty if likely branch is not taken
6538                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6539                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6540                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6541                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6542                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6543                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6544                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6545                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6546                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6547                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6548                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6549                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6550                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6551                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6552               }
6553             }
6554           }
6555         }
6556         // Merge in delay slot (wont dirty)
6557         for(r=0;r<HOST_REGS;r++) {
6558           if(r!=EXCLUDE_REG) {
6559             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6560             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6561             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6562             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6563             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6564             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6565             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6566             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6567             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6568             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6569           }
6570         }
6571         if(wr) {
6572           #ifndef DESTRUCTIVE_WRITEBACK
6573           branch_regs[i].dirty&=wont_dirty_i;
6574           #endif
6575           branch_regs[i].dirty|=will_dirty_i;
6576         }
6577       }
6578       else
6579       {
6580         // Internal branch
6581         if(ba[i]<=start+i*4) {
6582           // Backward branch
6583           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6584           {
6585             // Unconditional branch
6586             temp_will_dirty=0;
6587             temp_wont_dirty=0;
6588             // Merge in delay slot (will dirty)
6589             for(r=0;r<HOST_REGS;r++) {
6590               if(r!=EXCLUDE_REG) {
6591                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6592                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6593                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6594                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6595                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6596                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6597                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6598                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6599                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6600                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6601                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6602                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6603                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6604                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6605               }
6606             }
6607           } else {
6608             // Conditional branch (not taken case)
6609             temp_will_dirty=will_dirty_next;
6610             temp_wont_dirty=wont_dirty_next;
6611             // Merge in delay slot (will dirty)
6612             for(r=0;r<HOST_REGS;r++) {
6613               if(r!=EXCLUDE_REG) {
6614                 if(!likely[i]) {
6615                   // Will not dirty if likely branch is not taken
6616                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6617                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6618                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6619                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6620                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6621                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
6622                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6623                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6624                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6625                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6626                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6627                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6628                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6629                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6630                 }
6631               }
6632             }
6633           }
6634           // Merge in delay slot (won't dirty)
6635           for(r=0;r<HOST_REGS;r++) {
6636             if(r!=EXCLUDE_REG) {
6637               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6638               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6639               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6640               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6641               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6642               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6643               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6644               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6645               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6646               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6647             }
6648           }
6649           // Deal with changed mappings
6650           if(i<iend) {
6651             for(r=0;r<HOST_REGS;r++) {
6652               if(r!=EXCLUDE_REG) {
6653                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
6654                   temp_will_dirty&=~(1<<r);
6655                   temp_wont_dirty&=~(1<<r);
6656                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6657                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6658                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6659                   } else {
6660                     temp_will_dirty|=1<<r;
6661                     temp_wont_dirty|=1<<r;
6662                   }
6663                 }
6664               }
6665             }
6666           }
6667           if(wr) {
6668             will_dirty[i]=temp_will_dirty;
6669             wont_dirty[i]=temp_wont_dirty;
6670             clean_registers((ba[i]-start)>>2,i-1,0);
6671           }else{
6672             // Limit recursion.  It can take an excessive amount
6673             // of time if there are a lot of nested loops.
6674             will_dirty[(ba[i]-start)>>2]=0;
6675             wont_dirty[(ba[i]-start)>>2]=-1;
6676           }
6677         }
6678         /*else*/ if(1)
6679         {
6680           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6681           {
6682             // Unconditional branch
6683             will_dirty_i=0;
6684             wont_dirty_i=0;
6685           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6686             for(r=0;r<HOST_REGS;r++) {
6687               if(r!=EXCLUDE_REG) {
6688                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6689                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
6690                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6691                 }
6692                 if(branch_regs[i].regmap[r]>=0) {
6693                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6694                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6695                 }
6696               }
6697             }
6698           //}
6699             // Merge in delay slot
6700             for(r=0;r<HOST_REGS;r++) {
6701               if(r!=EXCLUDE_REG) {
6702                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6703                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6704                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6705                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6706                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6707                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6708                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6709                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6710                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6711                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6712                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6713                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6714                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6715                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6716               }
6717             }
6718           } else {
6719             // Conditional branch
6720             will_dirty_i=will_dirty_next;
6721             wont_dirty_i=wont_dirty_next;
6722           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6723             for(r=0;r<HOST_REGS;r++) {
6724               if(r!=EXCLUDE_REG) {
6725                 signed char target_reg=branch_regs[i].regmap[r];
6726                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6727                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6728                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6729                 }
6730                 else if(target_reg>=0) {
6731                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6732                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6733                 }
6734                 // Treat delay slot as part of branch too
6735                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6736                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6737                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6738                 }
6739                 else
6740                 {
6741                   will_dirty[i+1]&=~(1<<r);
6742                 }*/
6743               }
6744             }
6745           //}
6746             // Merge in delay slot
6747             for(r=0;r<HOST_REGS;r++) {
6748               if(r!=EXCLUDE_REG) {
6749                 if(!likely[i]) {
6750                   // Might not dirty if likely branch is not taken
6751                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6752                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6753                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6754                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6755                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6756                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6757                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6758                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6759                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6760                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6761                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6762                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6763                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6764                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6765                 }
6766               }
6767             }
6768           }
6769           // Merge in delay slot (won't dirty)
6770           for(r=0;r<HOST_REGS;r++) {
6771             if(r!=EXCLUDE_REG) {
6772               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6773               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6774               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6775               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6776               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6777               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6778               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6779               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6780               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6781               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6782             }
6783           }
6784           if(wr) {
6785             #ifndef DESTRUCTIVE_WRITEBACK
6786             branch_regs[i].dirty&=wont_dirty_i;
6787             #endif
6788             branch_regs[i].dirty|=will_dirty_i;
6789           }
6790         }
6791       }
6792     }
6793     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6794     {
6795       // SYSCALL instruction (software interrupt)
6796       will_dirty_i=0;
6797       wont_dirty_i=0;
6798     }
6799     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6800     {
6801       // ERET instruction (return from interrupt)
6802       will_dirty_i=0;
6803       wont_dirty_i=0;
6804     }
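    // Propagate the will/wont-dirty sets to the preceding instruction; the
    // scan walks the block backwards, so will_dirty_next/wont_dirty_next
    // describe the instruction that follows in program order.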
6805     will_dirty_next=will_dirty_i;
6806     wont_dirty_next=wont_dirty_i;
6807     for(r=0;r<HOST_REGS;r++) {
6808       if(r!=EXCLUDE_REG) {
6809         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6810         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6811         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6812         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6813         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6814         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6815         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6816         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6817         if(i>istart) {
6818           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP)
6819           {
6820             // Don't store a register immediately after writing it,
6821             // as doing so may prevent dual-issue.
6822             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
6823             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
6824           }
6825         }
6826       }
6827     }
6828     // Save it
6829     will_dirty[i]=will_dirty_i;
6830     wont_dirty[i]=wont_dirty_i;
6831     // Mark registers that won't be dirtied as not dirty
6832     if(wr) {
6833       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
6834       for(r=0;r<HOST_REGS;r++) {
6835         if((will_dirty_i>>r)&1) {
6836           printf(" r%d",r);
6837         }
6838       }
6839       printf("\n");*/
6840
6841       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
6842         regs[i].dirty|=will_dirty_i;
6843         #ifndef DESTRUCTIVE_WRITEBACK
6844         regs[i].dirty&=wont_dirty_i;
6845         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6846         {
6847           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
6848             for(r=0;r<HOST_REGS;r++) {
6849               if(r!=EXCLUDE_REG) {
6850                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
6851                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
6852                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6853               }
6854             }
6855           }
6856         }
6857         else
6858         {
6859           if(i<iend) {
6860             for(r=0;r<HOST_REGS;r++) {
6861               if(r!=EXCLUDE_REG) {
6862                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
6863                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
6864                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6865               }
6866             }
6867           }
6868         }
6869         #endif
6870       //}
6871     }
6872     // Deal with changed mappings
6873     temp_will_dirty=will_dirty_i;
6874     temp_wont_dirty=wont_dirty_i;
6875     for(r=0;r<HOST_REGS;r++) {
6876       if(r!=EXCLUDE_REG) {
6877         int nr;
6878         if(regs[i].regmap[r]==regmap_pre[i][r]) {
6879           if(wr) {
6880             #ifndef DESTRUCTIVE_WRITEBACK
6881             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6882             #endif
6883             regs[i].wasdirty|=will_dirty_i&(1<<r);
6884           }
6885         }
6886         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
6887           // Register moved to a different register
6888           will_dirty_i&=~(1<<r);
6889           wont_dirty_i&=~(1<<r);
6890           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
6891           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
6892           if(wr) {
6893             #ifndef DESTRUCTIVE_WRITEBACK
6894             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6895             #endif
6896             regs[i].wasdirty|=will_dirty_i&(1<<r);
6897           }
6898         }
6899         else {
6900           will_dirty_i&=~(1<<r);
6901           wont_dirty_i&=~(1<<r);
6902           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6903             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6904             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6905           } else {
6906             wont_dirty_i|=1<<r;
6907             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
6908           }
6909         }
6910       }
6911     }
6912   }
6913 }
6914
6915 #ifdef DISASM
6916   /* disassembly */
6917 void disassemble_inst(int i)
6918 {
6919     if (bt[i]) printf("*"); else printf(" ");
6920     switch(itype[i]) {
6921       case UJUMP:
6922         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6923       case CJUMP:
6924         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
6925       case SJUMP:
6926         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
6927       case FJUMP:
6928         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6929       case RJUMP:
6930         if (opcode[i]==0x9&&rt1[i]!=31)
6931           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
6932         else
6933           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
6934         break;
6935       case SPAN:
6936         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
6937       case IMM16:
6938         if(opcode[i]==0xf) //LUI
6939           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
6940         else
6941           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6942         break;
6943       case LOAD:
6944       case LOADLR:
6945         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6946         break;
6947       case STORE:
6948       case STORELR:
6949         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
6950         break;
6951       case ALU:
6952       case SHIFT:
6953         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
6954         break;
6955       case MULTDIV:
6956         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
6957         break;
6958       case SHIFTIMM:
6959         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6960         break;
6961       case MOV:
6962         if((opcode2[i]&0x1d)==0x10)
6963           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
6964         else if((opcode2[i]&0x1d)==0x11)
6965           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
6966         else
6967           printf (" %x: %s\n",start+i*4,insn[i]);
6968         break;
6969       case COP0:
6970         if(opcode2[i]==0)
6971           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
6972         else if(opcode2[i]==4)
6973           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
6974         else printf (" %x: %s\n",start+i*4,insn[i]);
6975         break;
6976       case COP1:
6977         if(opcode2[i]<3)
6978           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
6979         else if(opcode2[i]>3)
6980           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
6981         else printf (" %x: %s\n",start+i*4,insn[i]);
6982         break;
6983       case COP2:
6984         if(opcode2[i]<3)
6985           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
6986         else if(opcode2[i]>3)
6987           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
6988         else printf (" %x: %s\n",start+i*4,insn[i]);
6989         break;
6990       case C1LS:
6991         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
6992         break;
6993       case C2LS:
6994         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
6995         break;
6996       case INTCALL:
6997         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
6998         break;
6999       default:
7000         //printf (" %s %8x\n",insn[i],source[i]);
7001         printf (" %x: %s\n",start+i*4,insn[i]);
7002     }
7003 }
7004 #else
7005 static void disassemble_inst(int i) {}
7006 #endif // DISASM
7007
7008 #define DRC_TEST_VAL 0x74657374
7009
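// Self-test for the translation cache: emit a tiny stub that returns
// DRC_TEST_VAL (ASCII "test"), run it, and check the result to make sure
// recompiled code can actually be executed before any real block is built.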
7010 static int new_dynarec_test(void)
7011 {
7012   int (*testfunc)(void) = (void *)out;
7013   void *beginning;
7014   int ret;
7015
7016   beginning = start_block();
7017   emit_movimm(DRC_TEST_VAL,0); // test
7018   emit_jmpreg(14);
7019   literal_pool(0);
7020   end_block(beginning);
7021   SysPrintf("testing if we can run recompiled code..\n");
7022   ret = testfunc();
7023   if (ret == DRC_TEST_VAL)
7024     SysPrintf("test passed.\n");
7025   else
7026     SysPrintf("test failed: %08x\n", ret);
7027   out=(u_char *)BASE_ADDR;
7028   return ret == DRC_TEST_VAL;
7029 }
7030
7031 // clear the state completely, instead of just marking
7032 // things invalid like invalidate_all_pages() does
7033 void new_dynarec_clear_full(void)
7034 {
7035   int n;
7036   out=(u_char *)BASE_ADDR;
7037   memset(invalid_code,1,sizeof(invalid_code));
7038   memset(hash_table,0xff,sizeof(hash_table));
7039   memset(mini_ht,-1,sizeof(mini_ht));
7040   memset(restore_candidate,0,sizeof(restore_candidate));
7041   memset(shadow,0,sizeof(shadow));
7042   copy=shadow;
7043   expirep=16384; // Expiry pointer, +2 blocks
7044   pending_exception=0;
7045   literalcount=0;
7046   stop_after_jal=0;
7047   inv_code_start=inv_code_end=~0;
7048   // TLB
7049   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7050   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7051   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7052 }
7053
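// Set up the translation cache.  Depending on the build configuration it is
// either mapped at a fixed address (BASE_ADDR_FIXED), allocated dynamically
// (BASE_ADDR_DYNAMIC, with a dedicated path on VITA), or the statically
// allocated buffer is simply made executable with mprotect().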
7054 void new_dynarec_init(void)
7055 {
7056   SysPrintf("Init new dynarec\n");
7057
7058   // allocate/prepare a buffer for translation cache
7059   // see assem_arm.h for some explanation
7060 #if   defined(BASE_ADDR_FIXED)
7061   if (mmap (translation_cache, 1 << TARGET_SIZE_2,
7062             PROT_READ | PROT_WRITE | PROT_EXEC,
7063             MAP_PRIVATE | MAP_ANONYMOUS,
7064             -1, 0) != translation_cache) {
7065     SysPrintf("mmap() failed: %s\n", strerror(errno));
7066     SysPrintf("disable BASE_ADDR_FIXED and recompile\n");
7067     abort();
7068   }
7069 #elif defined(BASE_ADDR_DYNAMIC)
7070   #ifdef VITA
7071   sceBlock = getVMBlock();//sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
7072   if (sceBlock < 0)
7073     SysPrintf("sceKernelAllocMemBlockForVM failed\n");
7074   int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&translation_cache);
7075   if (ret < 0)
7076     SysPrintf("sceKernelGetMemBlockBase failed\n");
7077   sceClibPrintf("translation_cache = 0x%08X\n", translation_cache);
7078   #else
7079   translation_cache = mmap (NULL, 1 << TARGET_SIZE_2,
7080             PROT_READ | PROT_WRITE | PROT_EXEC,
7081             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
7082   if (translation_cache == MAP_FAILED) {
7083     SysPrintf("mmap() failed: %s\n", strerror(errno));
7084     abort();
7085   }
7086   #endif
7087 #else
7088   #ifndef NO_WRITE_EXEC
7089   // not all systems allow execute in data segment by default
7090   if (mprotect(out, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
7091     SysPrintf("mprotect() failed: %s\n", strerror(errno));
7092   #endif
7093 #endif
7094   out=(u_char *)BASE_ADDR;
7095   cycle_multiplier=200;
7096   new_dynarec_clear_full();
7097 #ifdef HOST_IMM8
7098   // Copy this into local area so we don't have to put it in every literal pool
7099   invc_ptr=invalid_code;
7100 #endif
7101   arch_init();
7102   new_dynarec_test();
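  // When RAM could not be mapped at its PSX virtual address, generated
  // memory accesses have to add ram_offset at runtime, hence the warning.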
7103 #ifndef RAM_FIXED
7104   ram_offset=(u_int)rdram-0x80000000;
7105 #endif
7106   if (ram_offset!=0)
7107     SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
7108 }
7109
7110 void new_dynarec_cleanup(void)
7111 {
7112   int n;
7113 #if defined(BASE_ADDR_FIXED) || defined(BASE_ADDR_DYNAMIC)
7114   #ifdef VITA
7115   //sceKernelFreeMemBlock(sceBlock);
7116   //sceBlock = -1;
7117   #else
7118   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0)
7119     SysPrintf("munmap() failed\n");
7120   #endif
7121 #endif
7122   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7123   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7124   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7125   #ifdef ROM_COPY
7126   if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
7127   #endif
7128 }
7129
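// Translate a PSX code address into a host pointer to the backing memory
// (the RAM mirrors in KUSEG/KSEG0/KSEG1, or the BIOS ROM when HLE is off)
// and report in *limit how far code may be read contiguously from there.
// Returns NULL for addresses that cannot hold code.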
7130 static u_int *get_source_start(u_int addr, u_int *limit)
7131 {
7132   if (addr < 0x00200000 ||
7133     (0xa0000000 <= addr && addr < 0xa0200000)) {
7134     // used for BIOS calls mostly?
7135     *limit = (addr&0xa0000000)|0x00200000;
7136     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7137   }
7138   else if (!Config.HLE && (
7139     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
7140     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
7141     // BIOS
7142     *limit = (addr & 0xfff00000) | 0x80000;
7143     return (u_int *)((u_int)psxR + (addr&0x7ffff));
7144   }
7145   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
7146     *limit = (addr & 0x80600000) + 0x00200000;
7147     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7148   }
7149   return NULL;
7150 }
7151
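// Scan forward (at most 0x1000 bytes) for a "jr $ra" (0x03e00008) and return
// the address just past its delay slot, as an estimate of where the function
// starting at addr ends; returns addr unchanged if none is found.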
7152 static u_int scan_for_ret(u_int addr)
7153 {
7154   u_int limit = 0;
7155   u_int *mem;
7156
7157   mem = get_source_start(addr, &limit);
7158   if (mem == NULL)
7159     return addr;
7160
7161   if (limit > addr + 0x1000)
7162     limit = addr + 0x1000;
7163   for (; addr < limit; addr += 4, mem++) {
7164     if (*mem == 0x03e00008) // jr $ra
7165       return addr + 8;
7166   }
7167   return addr;
7168 }
7169
7170 struct savestate_block {
7171   uint32_t addr;
7172   uint32_t regflags;
7173 };
7174
7175 static int addr_cmp(const void *p1_, const void *p2_)
7176 {
7177   const struct savestate_block *p1 = p1_, *p2 = p2_;
7178   return p1->addr - p2->addr;
7179 }
7180
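// Record the entry points of currently compiled blocks (address plus the
// register speculation flags they were compiled with) into the savestate
// buffer.  Entries are sorted per jump_in page, and entries that fall inside
// a function already covered by a previous entry (per scan_for_ret) are
// dropped.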
7181 int new_dynarec_save_blocks(void *save, int size)
7182 {
7183   struct savestate_block *blocks = save;
7184   int maxcount = size / sizeof(blocks[0]);
7185   struct savestate_block tmp_blocks[1024];
7186   struct ll_entry *head;
7187   int p, s, d, o, bcnt;
7188   u_int addr;
7189
7190   o = 0;
7191   for (p = 0; p < sizeof(jump_in) / sizeof(jump_in[0]); p++) {
7192     bcnt = 0;
7193     for (head = jump_in[p]; head != NULL; head = head->next) {
7194       tmp_blocks[bcnt].addr = head->vaddr;
7195       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
7196       bcnt++;
7197     }
7198     if (bcnt < 1)
7199       continue;
7200     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
7201
7202     addr = tmp_blocks[0].addr;
7203     for (s = d = 0; s < bcnt; s++) {
7204       if (tmp_blocks[s].addr < addr)
7205         continue;
7206       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
7207         tmp_blocks[d++] = tmp_blocks[s];
7208       addr = scan_for_ret(tmp_blocks[s].addr);
7209     }
7210
7211     if (o + d > maxcount)
7212       d = maxcount - o;
7213     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
7214     o += d;
7215   }
7216
7217   return o * sizeof(blocks[0]);
7218 }
7219
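// After a savestate load, pre-build the previously recorded blocks by looking
// them up with get_addr.  GPRs are temporarily set to RAM (0x80000000) or
// scratchpad (0x1f800000) base values so that register-content speculation
// matches the regflags saved with each block, then restored afterwards.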
7220 void new_dynarec_load_blocks(const void *save, int size)
7221 {
7222   const struct savestate_block *blocks = save;
7223   int count = size / sizeof(blocks[0]);
7224   u_int regs_save[32];
7225   uint32_t f;
7226   int i, b;
7227
7228   get_addr(psxRegs.pc);
7229
7230   // change GPRs for speculation to at least partially work..
7231   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
7232   for (i = 1; i < 32; i++)
7233     psxRegs.GPR.r[i] = 0x80000000;
7234
7235   for (b = 0; b < count; b++) {
7236     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7237       if (f & 1)
7238         psxRegs.GPR.r[i] = 0x1f800000;
7239     }
7240
7241     get_addr(blocks[b].addr);
7242
7243     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7244       if (f & 1)
7245         psxRegs.GPR.r[i] = 0x80000000;
7246     }
7247   }
7248
7249   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
7250 }
7251
7252 int new_recompile_block(int addr)
7253 {
7254   u_int pagelimit = 0;
7255   u_int state_rflags = 0;
7256   int i;
7257
7258   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7259   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7260   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7261   //if(debug)
7262   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7263   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7264   /*if(Count>=312978186) {
7265     rlist();
7266   }*/
7267   //rlist();
7268
7269   // this is just for speculation
7270   for (i = 1; i < 32; i++) {
7271     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
7272       state_rflags |= 1 << i;
7273   }
7274
7275   start = (u_int)addr&~3;
7276   //assert(((u_int)addr&1)==0);
7277   new_dynarec_did_compile=1;
7278   if (Config.HLE && start == 0x80001000) // hlecall
7279   {
7280     // XXX: is this enough? Maybe check hleSoftCall?
7281     void *beginning=start_block();
7282     u_int page=get_page(start);
7283
7284     invalid_code[start>>12]=0;
7285     emit_movimm(start,0);
7286     emit_writeword(0,(int)&pcaddr);
7287     emit_jmp((int)new_dyna_leave);
7288     literal_pool(0);
7289     end_block(beginning);
7290     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
7291     return 0;
7292   }
7293
7294   source = get_source_start(start, &pagelimit);
7295   if (source == NULL) {
7296     SysPrintf("Compile at bogus memory address: %08x\n", addr);
7297     exit(1);
7298   }
7299
7300   /* Pass 1: disassemble */
7301   /* Pass 2: register dependencies, branch targets */
7302   /* Pass 3: register allocation */
7303   /* Pass 4: branch dependencies */
7304   /* Pass 5: pre-alloc */
7305   /* Pass 6: optimize clean/dirty state */
7306   /* Pass 7: flag 32-bit registers */
7307   /* Pass 8: assembly */
7308   /* Pass 9: linker */
7309   /* Pass 10: garbage collection / free memory */
7310
7311   int j;
7312   int done=0;
7313   unsigned int type,op,op2;
7314
7315   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7316
7317   /* Pass 1 disassembly */
7318
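  // Decode one MIPS word per iteration: the primary opcode is in bits 31..26;
  // SPECIAL (0x00) is further decoded from the funct field (bits 5..0),
  // REGIMM (0x01) from the rt field (bits 20..16) and the coprocessor opcodes
  // from the rs field (bits 25..21).  The mnemonic, itype[] class and
  // sub-opcode op2 are recorded for the later passes.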
7319   for(i=0;!done;i++) {
7320     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
7321     minimum_free_regs[i]=0;
7322     opcode[i]=op=source[i]>>26;
7323     switch(op)
7324     {
7325       case 0x00: strcpy(insn[i],"special"); type=NI;
7326         op2=source[i]&0x3f;
7327         switch(op2)
7328         {
7329           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7330           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7331           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7332           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7333           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7334           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7335           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7336           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7337           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7338           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7339           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7340           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7341           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7342           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7343           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7344           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7345           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7346           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7347           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7348           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7349           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7350           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7351           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7352           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7353           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7354           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7355           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7356           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7357           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7358           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7359           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7360           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7361           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7362           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7363           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7364 #if 0
7365           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7366           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7367           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7368           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7369           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7370           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7371           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7372           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7373           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7374           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7375           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7376           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7377           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7378           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7379           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7380           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7381           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7382 #endif
7383         }
7384         break;
7385       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7386         op2=(source[i]>>16)&0x1f;
7387         switch(op2)
7388         {
7389           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7390           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7391           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7392           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7393           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7394           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7395           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7396           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7397           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7398           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7399           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7400           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7401           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7402           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7403         }
7404         break;
7405       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7406       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7407       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7408       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7409       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7410       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7411       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7412       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7413       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7414       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7415       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7416       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7417       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7418       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7419       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7420         op2=(source[i]>>21)&0x1f;
7421         switch(op2)
7422         {
7423           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7424           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7425           case 0x10: strcpy(insn[i],"tlb"); type=NI;
7426           switch(source[i]&0x3f)
7427           {
7428             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
7429             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
7430             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
7431             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
7432             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
7433             //case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
7434           }
7435         }
7436         break;
7437       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7438         op2=(source[i]>>21)&0x1f;
7439         switch(op2)
7440         {
7441           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7442           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7443           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7444           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7445           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7446           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7447           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7448           switch((source[i]>>16)&0x3)
7449           {
7450             case 0x00: strcpy(insn[i],"BC1F"); break;
7451             case 0x01: strcpy(insn[i],"BC1T"); break;
7452             case 0x02: strcpy(insn[i],"BC1FL"); break;
7453             case 0x03: strcpy(insn[i],"BC1TL"); break;
7454           }
7455           break;
7456           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7457           switch(source[i]&0x3f)
7458           {
7459             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7460             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7461             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7462             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7463             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7464             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7465             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7466             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7467             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7468             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7469             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7470             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7471             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7472             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7473             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7474             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7475             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7476             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7477             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7478             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7479             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7480             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7481             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7482             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7483             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7484             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7485             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7486             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7487             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7488             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7489             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7490             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7491             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7492             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7493             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7494           }
7495           break;
7496           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7497           switch(source[i]&0x3f)
7498           {
7499             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7500             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7501             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7502             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7503             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7504             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7505             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7506             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7507             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7508             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7509             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7510             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7511             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7512             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7513             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7514             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7515             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7516             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7517             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7518             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7519             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7520             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7521             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7522             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7523             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7524             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7525             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7526             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7527             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7528             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7529             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7530             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7531             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7532             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7533             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7534           }
7535           break;
7536           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7537           switch(source[i]&0x3f)
7538           {
7539             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7540             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7541           }
7542           break;
7543           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7544           switch(source[i]&0x3f)
7545           {
7546             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7547             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7548           }
7549           break;
7550         }
7551         break;
7552 #if 0
7553       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7554       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7555       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7556       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7557       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7558       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7559       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7560       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7561 #endif
7562       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7563       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7564       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7565       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7566       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7567       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7568       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7569 #if 0
7570       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7571 #endif
7572       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7573       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7574       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7575       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7576 #if 0
7577       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7578       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7579 #endif
7580       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7581       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7582       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7583       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7584 #if 0
7585       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7586       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7587       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7588 #endif
7589       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7590       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7591 #if 0
7592       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7593       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7594       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7595 #endif
7596       case 0x12: strcpy(insn[i],"COP2"); type=NI;
7597         op2=(source[i]>>21)&0x1f;
7598         //if (op2 & 0x10) {
7599         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
7600           if (gte_handlers[source[i]&0x3f]!=NULL) {
7601             if (gte_regnames[source[i]&0x3f]!=NULL)
7602               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
7603             else
7604               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
7605             type=C2OP;
7606           }
7607         }
7608         else switch(op2)
7609         {
7610           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
7611           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
7612           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
7613           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
7614         }
7615         break;
7616       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
7617       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
7618       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7619       default: strcpy(insn[i],"???"); type=NI;
7620         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
7621         break;
7622     }
7623     itype[i]=type;
7624     opcode2[i]=op2;
7625     /* Get registers/immediates */
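    // Standard MIPS field layout used below: rs = bits 25..21, rt = bits
    // 20..16, rd = bits 15..11, shamt = bits 10..6, imm = bits 15..0
    // (sign- or zero-extended depending on the instruction).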
7626     lt1[i]=0;
7627     us1[i]=0;
7628     us2[i]=0;
7629     dep1[i]=0;
7630     dep2[i]=0;
7631     gte_rs[i]=gte_rt[i]=0;
7632     switch(type) {
7633       case LOAD:
7634         rs1[i]=(source[i]>>21)&0x1f;
7635         rs2[i]=0;
7636         rt1[i]=(source[i]>>16)&0x1f;
7637         rt2[i]=0;
7638         imm[i]=(short)source[i];
7639         break;
7640       case STORE:
7641       case STORELR:
7642         rs1[i]=(source[i]>>21)&0x1f;
7643         rs2[i]=(source[i]>>16)&0x1f;
7644         rt1[i]=0;
7645         rt2[i]=0;
7646         imm[i]=(short)source[i];
7647         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7648         break;
7649       case LOADLR:
7650         // LWL/LWR only load part of the register,
7651         // therefore the target register must be treated as a source too
7652         rs1[i]=(source[i]>>21)&0x1f;
7653         rs2[i]=(source[i]>>16)&0x1f;
7654         rt1[i]=(source[i]>>16)&0x1f;
7655         rt2[i]=0;
7656         imm[i]=(short)source[i];
7657         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7658         if(op==0x26) dep1[i]=rt1[i]; // LWR
7659         break;
7660       case IMM16:
7661         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7662         else rs1[i]=(source[i]>>21)&0x1f;
7663         rs2[i]=0;
7664         rt1[i]=(source[i]>>16)&0x1f;
7665         rt2[i]=0;
7666         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7667           imm[i]=(unsigned short)source[i];
7668         }else{
7669           imm[i]=(short)source[i];
7670         }
7671         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7672         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7673         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7674         break;
7675       case UJUMP:
7676         rs1[i]=0;
7677         rs2[i]=0;
7678         rt1[i]=0;
7679         rt2[i]=0;
7680         // The JAL instruction writes to r31.
7681         if (op&1) {
7682           rt1[i]=31;
7683         }
7684         rs2[i]=CCREG;
7685         break;
7686       case RJUMP:
7687         rs1[i]=(source[i]>>21)&0x1f;
7688         rs2[i]=0;
7689         rt1[i]=0;
7690         rt2[i]=0;
7691         // The JALR instruction writes to rd.
7692         if (op2&1) {
7693           rt1[i]=(source[i]>>11)&0x1f;
7694         }
7695         rs2[i]=CCREG;
7696         break;
7697       case CJUMP:
7698         rs1[i]=(source[i]>>21)&0x1f;
7699         rs2[i]=(source[i]>>16)&0x1f;
7700         rt1[i]=0;
7701         rt2[i]=0;
7702         if(op&2) { // BGTZ/BLEZ
7703           rs2[i]=0;
7704         }
7705         us1[i]=rs1[i];
7706         us2[i]=rs2[i];
7707         likely[i]=op>>4;
7708         break;
7709       case SJUMP:
7710         rs1[i]=(source[i]>>21)&0x1f;
7711         rs2[i]=CCREG;
7712         rt1[i]=0;
7713         rt2[i]=0;
7714         us1[i]=rs1[i];
7715         if(op2&0x10) { // BxxAL
7716           rt1[i]=31;
7717           // NOTE: If the branch is not taken, r31 is still overwritten
7718         }
7719         likely[i]=(op2&2)>>1;
7720         break;
7721       case FJUMP:
7722         rs1[i]=FSREG;
7723         rs2[i]=CSREG;
7724         rt1[i]=0;
7725         rt2[i]=0;
7726         likely[i]=((source[i])>>17)&1;
7727         break;
7728       case ALU:
7729         rs1[i]=(source[i]>>21)&0x1f; // source
7730         rs2[i]=(source[i]>>16)&0x1f; // second source operand
7731         rt1[i]=(source[i]>>11)&0x1f; // destination
7732         rt2[i]=0;
7733         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7734           us1[i]=rs1[i];us2[i]=rs2[i];
7735         }
7736         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7737           dep1[i]=rs1[i];dep2[i]=rs2[i];
7738         }
7739         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7740           dep1[i]=rs1[i];dep2[i]=rs2[i];
7741         }
7742         break;
7743       case MULTDIV:
7744         rs1[i]=(source[i]>>21)&0x1f; // source
7745         rs2[i]=(source[i]>>16)&0x1f; // second operand (multiplier/divisor)
7746         rt1[i]=HIREG;
7747         rt2[i]=LOREG;
7748         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7749           us1[i]=rs1[i];us2[i]=rs2[i];
7750         }
7751         break;
7752       case MOV:
7753         rs1[i]=0;
7754         rs2[i]=0;
7755         rt1[i]=0;
7756         rt2[i]=0;
7757         if(op2==0x10) rs1[i]=HIREG; // MFHI
7758         if(op2==0x11) rt1[i]=HIREG; // MTHI
7759         if(op2==0x12) rs1[i]=LOREG; // MFLO
7760         if(op2==0x13) rt1[i]=LOREG; // MTLO
7761         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7762         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7763         dep1[i]=rs1[i];
7764         break;
7765       case SHIFT:
7766         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7767         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7768         rt1[i]=(source[i]>>11)&0x1f; // destination
7769         rt2[i]=0;
7770         // DSLLV/DSRLV/DSRAV are 64-bit
7771         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
7772         break;
7773       case SHIFTIMM:
7774         rs1[i]=(source[i]>>16)&0x1f;
7775         rs2[i]=0;
7776         rt1[i]=(source[i]>>11)&0x1f;
7777         rt2[i]=0;
7778         imm[i]=(source[i]>>6)&0x1f;
7779         // DSxx32 instructions
7780         if(op2>=0x3c) imm[i]|=0x20;
7781         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
7782         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
7783         break;
7784       case COP0:
7785         rs1[i]=0;
7786         rs2[i]=0;
7787         rt1[i]=0;
7788         rt2[i]=0;
7789         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
7790         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
7791         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7792         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7793         break;
7794       case COP1:
7795         rs1[i]=0;
7796         rs2[i]=0;
7797         rt1[i]=0;
7798         rt2[i]=0;
7799         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7800         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7801         if(op2==5) us1[i]=rs1[i]; // DMTC1
7802         rs2[i]=CSREG;
7803         break;
7804       case COP2:
7805         rs1[i]=0;
7806         rs2[i]=0;
7807         rt1[i]=0;
7808         rt2[i]=0;
7809         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
7810         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
7811         rs2[i]=CSREG;
7812         int gr=(source[i]>>11)&0x1F;
7813         switch(op2)
7814         {
7815           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
7816           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
7817           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
7818           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
7819         }
7820         break;
7821       case C1LS:
7822         rs1[i]=(source[i]>>21)&0x1F;
7823         rs2[i]=CSREG;
7824         rt1[i]=0;
7825         rt2[i]=0;
7826         imm[i]=(short)source[i];
7827         break;
7828       case C2LS:
7829         rs1[i]=(source[i]>>21)&0x1F;
7830         rs2[i]=0;
7831         rt1[i]=0;
7832         rt2[i]=0;
7833         imm[i]=(short)source[i];
7834         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7835         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7836         break;
7837       case C2OP:
7838         rs1[i]=0;
7839         rs2[i]=0;
7840         rt1[i]=0;
7841         rt2[i]=0;
7842         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
7843         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
7844         gte_rt[i]|=1ll<<63; // every op changes flags
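        // MVMVA only reads the source vector selected by bits 16..15 of the
        // opcode, so narrow the generic read mask to the chosen data vector,
        // or to IR1-IR3 when that field is 3.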
7845         if((source[i]&0x3f)==GTE_MVMVA) {
7846           int v = (source[i] >> 15) & 3;
7847           gte_rs[i]&=~0xe3fll;
7848           if(v==3) gte_rs[i]|=0xe00ll;
7849           else gte_rs[i]|=3ll<<(v*2);
7850         }
7851         break;
7852       case FLOAT:
7853       case FCONV:
7854         rs1[i]=0;
7855         rs2[i]=CSREG;
7856         rt1[i]=0;
7857         rt2[i]=0;
7858         break;
7859       case FCOMP:
7860         rs1[i]=FSREG;
7861         rs2[i]=CSREG;
7862         rt1[i]=FSREG;
7863         rt2[i]=0;
7864         break;
7865       case SYSCALL:
7866       case HLECALL:
7867       case INTCALL:
7868         rs1[i]=CCREG;
7869         rs2[i]=0;
7870         rt1[i]=0;
7871         rt2[i]=0;
7872         break;
7873       default:
7874         rs1[i]=0;
7875         rs2[i]=0;
7876         rt1[i]=0;
7877         rt2[i]=0;
7878     }
7879     /* Calculate branch target addresses */
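    // J/JAL:               target = (PC+4 & 0xF0000000) | (instr_index << 2)
    // conditional branches: target = PC+4 + (sign-extended imm16 << 2)
    // ba[i] stays -1 for anything that isn't a branch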
7880     if(type==UJUMP)
7881       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
7882     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
7883       ba[i]=start+i*4+8; // Ignore never taken branch
7884     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
7885       ba[i]=start+i*4+8; // Ignore never taken branch
7886     else if(type==CJUMP||type==SJUMP||type==FJUMP)
7887       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
7888     else ba[i]=-1;
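    // Cases the recompiler can't model - a branch sitting in a delay slot, or a
    // branch target that appears to depend on R3000A load-delay behaviour - are
    // punted to the interpreter by turning the first branch into an INTCALL.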
7889     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
7890       int do_in_intrp=0;
7891       // branch in delay slot?
7892       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
7893         // can't compile a branch inside a delay slot; hand the first branch to the interpreter if it's hit
7894         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
7895         do_in_intrp=1;
7896       }
7897       // basic load delay detection
7898       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
7899         int t=(ba[i-1]-start)/4;
7900         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
7901           // jump target wants DS result - potential load delay effect
7902           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
7903           do_in_intrp=1;
7904           bt[t+1]=1; // expected return from interpreter
7905         }
7906         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
7907               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
7908           // v0 overwrite like this is a sign of trouble, bail out
7909           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
7910           do_in_intrp=1;
7911         }
7912       }
7913       if(do_in_intrp) {
7914         rs1[i-1]=CCREG;
7915         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
7916         ba[i-1]=-1;
7917         itype[i-1]=INTCALL;
7918         done=2;
7919         i--; // don't compile the DS
7920       }
7921     }
7922     /* Is this the end of the block? */
7923     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
7924       if(rt1[i-1]==0) { // J/JR: end the block here (unless a branch jumps past it, see below)
7925         done=2;
7926       }
7927       else { // JAL/JALR: continue past the subroutine call, it will return here
7928         if(stop_after_jal) done=1;
7929         // Stop on BREAK
7930         if((source[i+1]&0xfc00003f)==0x0d) done=1;
7931       }
7932       // Don't recompile stuff that's already compiled
7933       if(check_addr(start+i*4+4)) done=1;
7934       // Don't get too close to the limit
7935       if(i>MAXBLOCK/2) done=1;
7936     }
7937     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
7938     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
7939     if(done==2) {
7940       // Does the block continue due to a branch?
7941       for(j=i-1;j>=0;j--)
7942       {
7943         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
7944         if(ba[j]==start+i*4+4) done=j=0;
7945         if(ba[j]==start+i*4+8) done=j=0;
7946       }
7947     }
7948     //assert(i<MAXBLOCK-1);
7949     if(start+i*4==pagelimit-4) done=1;
7950     assert(start+i*4<pagelimit);
7951     if (i==MAXBLOCK-1) done=1;
7952     // Stop if we're compiling junk
7953     if(itype[i]==NI&&opcode[i]==0x11) {
7954       done=stop_after_jal=1;
7955       SysPrintf("Disabled speculative precompilation\n");
7956     }
7957   }
7958   slen=i;
7959   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
7960     if(start+i*4==pagelimit) {
7961       itype[i-1]=SPAN;
7962     }
7963   }
7964   assert(slen>0);
7965
7966   /* Pass 2 - Register dependencies and branch targets */
7967
7968   unneeded_registers(0,slen-1,0);
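  // unneeded_registers() fills unneeded_reg[]/unneeded_reg_upper[] with, per
  // instruction, the MIPS registers whose values are dead from that point on;
  // pass 3 uses these masks to avoid wasting host registers on them.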
7969
7970   /* Pass 3 - Register allocation */
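  // Forward scan: 'current' tracks the live host<->MIPS register mapping and is
  // snapshotted into regs[i] for every instruction; branches additionally get a
  // branch_regs[] snapshot for the taken path (see "Branch post-alloc" below).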
7971
7972   struct regstat current; // Current register allocations/status
7973   current.is32=1;
7974   current.dirty=0;
7975   current.u=unneeded_reg[0];
7976   current.uu=unneeded_reg_upper[0];
7977   clear_all_regs(current.regmap);
7978   alloc_reg(&current,0,CCREG);
7979   dirty_reg(&current,CCREG);
7980   current.isconst=0;
7981   current.wasconst=0;
7982   current.waswritten=0;
7983   int ds=0;
7984   int cc=0;
7985   int hr=-1;
7986
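  // A block address with bit 0 set means we're compiling a branch delay slot on
  // its own; the eventual branch target is carried in BTREG (see below).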
7987   if((u_int)addr&1) {
7988     // First instruction is delay slot
7989     cc=-1;
7990     bt[1]=1;
7991     ds=1;
7992     unneeded_reg[0]=1;
7993     unneeded_reg_upper[0]=1;
7994     current.regmap[HOST_BTREG]=BTREG;
7995   }
7996
7997   for(i=0;i<slen;i++)
7998   {
7999     if(bt[i])
8000     {
8001       int hr;
8002       for(hr=0;hr<HOST_REGS;hr++)
8003       {
8004         // Is this really necessary?
8005         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8006       }
8007       current.isconst=0;
8008       current.waswritten=0;
8009     }
8010     if(i>1)
8011     {
8012       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8013       {
8014         if(rs1[i-2]==0||rs2[i-2]==0)
8015         {
8016           if(rs1[i-2]) {
8017             current.is32|=1LL<<rs1[i-2];
8018             int hr=get_reg(current.regmap,rs1[i-2]|64);
8019             if(hr>=0) current.regmap[hr]=-1;
8020           }
8021           if(rs2[i-2]) {
8022             current.is32|=1LL<<rs2[i-2];
8023             int hr=get_reg(current.regmap,rs2[i-2]|64);
8024             if(hr>=0) current.regmap[hr]=-1;
8025           }
8026         }
8027       }
8028     }
8029     current.is32=-1LL;
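    // The R3000A has only 32-bit GPRs, so force everything to be treated as
    // 32-bit here; the is32/uu tracking above is inherited from the 64-bit
    // (N64) dynarec this code came from.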
8030
8031     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8032     regs[i].wasconst=current.isconst;
8033     regs[i].was32=current.is32;
8034     regs[i].wasdirty=current.dirty;
8035     regs[i].loadedconst=0;
8036     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8037       if(i+1<slen) {
8038         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8039         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8040         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8041         current.u|=1;
8042         current.uu|=1;
8043       } else {
8044         current.u=1;
8045         current.uu=1;
8046       }
8047     } else {
8048       if(i+1<slen) {
8049         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8050         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8051         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8052         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8053         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8054         current.u|=1;
8055         current.uu|=1;
8056       } else { SysPrintf("oops, branch at end of block with no delay slot\n");exit(1); }
8057     }
8058     is_ds[i]=ds;
8059     if(ds) {
8060       ds=0; // Skip delay slot, already allocated as part of branch
8061       // ...but we need to alloc it in case something jumps here
8062       if(i+1<slen) {
8063         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8064         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8065       }else{
8066         current.u=branch_unneeded_reg[i-1];
8067         current.uu=branch_unneeded_reg_upper[i-1];
8068       }
8069       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8070       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8071       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8072       current.u|=1;
8073       current.uu|=1;
8074       struct regstat temp;
8075       memcpy(&temp,&current,sizeof(current));
8076       temp.wasdirty=temp.dirty;
8077       temp.was32=temp.is32;
8078       // TODO: Take into account unconditional branches, as below
8079       delayslot_alloc(&temp,i);
8080       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8081       regs[i].wasdirty=temp.wasdirty;
8082       regs[i].was32=temp.was32;
8083       regs[i].dirty=temp.dirty;
8084       regs[i].is32=temp.is32;
8085       regs[i].isconst=0;
8086       regs[i].wasconst=0;
8087       current.isconst=0;
8088       // Create entry (branch target) regmap
8089       for(hr=0;hr<HOST_REGS;hr++)
8090       {
8091         int r=temp.regmap[hr];
8092         if(r>=0) {
8093           if(r!=regmap_pre[i][hr]) {
8094             regs[i].regmap_entry[hr]=-1;
8095           }
8096           else
8097           {
8098             if(r<64){
8099               if((current.u>>r)&1) {
8100                 regs[i].regmap_entry[hr]=-1;
8101                 regs[i].regmap[hr]=-1;
8102                 //Don't clear regs in the delay slot as the branch might need them
8103                 //current.regmap[hr]=-1;
8104               }else
8105                 regs[i].regmap_entry[hr]=r;
8106             }
8107             else {
8108               if((current.uu>>(r&63))&1) {
8109                 regs[i].regmap_entry[hr]=-1;
8110                 regs[i].regmap[hr]=-1;
8111                 //Don't clear regs in the delay slot as the branch might need them
8112                 //current.regmap[hr]=-1;
8113               }else
8114                 regs[i].regmap_entry[hr]=r;
8115             }
8116           }
8117         } else {
8118           // First instruction expects CCREG to be allocated
8119           if(i==0&&hr==HOST_CCREG)
8120             regs[i].regmap_entry[hr]=CCREG;
8121           else
8122             regs[i].regmap_entry[hr]=-1;
8123         }
8124       }
8125     }
8126     else { // Not delay slot
8127       switch(itype[i]) {
8128         case UJUMP:
8129           //current.isconst=0; // DEBUG
8130           //current.wasconst=0; // DEBUG
8131           //regs[i].wasconst=0; // DEBUG
8132           clear_const(&current,rt1[i]);
8133           alloc_cc(&current,i);
8134           dirty_reg(&current,CCREG);
8135           if (rt1[i]==31) {
8136             alloc_reg(&current,i,31);
8137             dirty_reg(&current,31);
8138             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8139             //assert(rt1[i+1]!=rt1[i]);
8140             #ifdef REG_PREFETCH
8141             alloc_reg(&current,i,PTEMP);
8142             #endif
8143             //current.is32|=1LL<<rt1[i];
8144           }
8145           ooo[i]=1;
8146           delayslot_alloc(&current,i+1);
8147           //current.isconst=0; // DEBUG
8148           ds=1;
8149           //printf("i=%d, isconst=%x\n",i,current.isconst);
8150           break;
8151         case RJUMP:
8152           //current.isconst=0;
8153           //current.wasconst=0;
8154           //regs[i].wasconst=0;
8155           clear_const(&current,rs1[i]);
8156           clear_const(&current,rt1[i]);
8157           alloc_cc(&current,i);
8158           dirty_reg(&current,CCREG);
8159           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8160             alloc_reg(&current,i,rs1[i]);
8161             if (rt1[i]!=0) {
8162               alloc_reg(&current,i,rt1[i]);
8163               dirty_reg(&current,rt1[i]);
8164               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
8165               assert(rt1[i+1]!=rt1[i]);
8166               #ifdef REG_PREFETCH
8167               alloc_reg(&current,i,PTEMP);
8168               #endif
8169             }
8170             #ifdef USE_MINI_HT
8171             if(rs1[i]==31) { // JALR
8172               alloc_reg(&current,i,RHASH);
8173               #ifndef HOST_IMM_ADDR32
8174               alloc_reg(&current,i,RHTBL);
8175               #endif
8176             }
8177             #endif
8178             delayslot_alloc(&current,i+1);
8179           } else {
8180             // The delay slot overwrites our source register,
8181             // allocate a temporary register to hold the old value.
8182             current.isconst=0;
8183             current.wasconst=0;
8184             regs[i].wasconst=0;
8185             delayslot_alloc(&current,i+1);
8186             current.isconst=0;
8187             alloc_reg(&current,i,RTEMP);
8188           }
8189           //current.isconst=0; // DEBUG
8190           ooo[i]=1;
8191           ds=1;
8192           break;
8193         case CJUMP:
8194           //current.isconst=0;
8195           //current.wasconst=0;
8196           //regs[i].wasconst=0;
8197           clear_const(&current,rs1[i]);
8198           clear_const(&current,rs2[i]);
8199           if((opcode[i]&0x3E)==4) // BEQ/BNE
8200           {
8201             alloc_cc(&current,i);
8202             dirty_reg(&current,CCREG);
8203             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8204             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8205             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8206             {
8207               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8208               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8209             }
8210             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8211                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8212               // The delay slot overwrites one of our conditions.
8213               // Allocate the branch condition registers instead.
8214               current.isconst=0;
8215               current.wasconst=0;
8216               regs[i].wasconst=0;
8217               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8218               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8219               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8220               {
8221                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8222                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8223               }
8224             }
8225             else
8226             {
8227               ooo[i]=1;
8228               delayslot_alloc(&current,i+1);
8229             }
8230           }
8231           else
8232           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8233           {
8234             alloc_cc(&current,i);
8235             dirty_reg(&current,CCREG);
8236             alloc_reg(&current,i,rs1[i]);
8237             if(!(current.is32>>rs1[i]&1))
8238             {
8239               alloc_reg64(&current,i,rs1[i]);
8240             }
8241             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8242               // The delay slot overwrites one of our conditions.
8243               // Allocate the branch condition registers instead.
8244               current.isconst=0;
8245               current.wasconst=0;
8246               regs[i].wasconst=0;
8247               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8248               if(!((current.is32>>rs1[i])&1))
8249               {
8250                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8251               }
8252             }
8253             else
8254             {
8255               ooo[i]=1;
8256               delayslot_alloc(&current,i+1);
8257             }
8258           }
8259           else
8260           // Don't alloc the delay slot yet because we might not execute it
8261           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8262           {
8263             current.isconst=0;
8264             current.wasconst=0;
8265             regs[i].wasconst=0;
8266             alloc_cc(&current,i);
8267             dirty_reg(&current,CCREG);
8268             alloc_reg(&current,i,rs1[i]);
8269             alloc_reg(&current,i,rs2[i]);
8270             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8271             {
8272               alloc_reg64(&current,i,rs1[i]);
8273               alloc_reg64(&current,i,rs2[i]);
8274             }
8275           }
8276           else
8277           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8278           {
8279             current.isconst=0;
8280             current.wasconst=0;
8281             regs[i].wasconst=0;
8282             alloc_cc(&current,i);
8283             dirty_reg(&current,CCREG);
8284             alloc_reg(&current,i,rs1[i]);
8285             if(!(current.is32>>rs1[i]&1))
8286             {
8287               alloc_reg64(&current,i,rs1[i]);
8288             }
8289           }
8290           ds=1;
8291           //current.isconst=0;
8292           break;
8293         case SJUMP:
8294           //current.isconst=0;
8295           //current.wasconst=0;
8296           //regs[i].wasconst=0;
8297           clear_const(&current,rs1[i]);
8298           clear_const(&current,rt1[i]);
8299           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8300           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8301           {
8302             alloc_cc(&current,i);
8303             dirty_reg(&current,CCREG);
8304             alloc_reg(&current,i,rs1[i]);
8305             if(!(current.is32>>rs1[i]&1))
8306             {
8307               alloc_reg64(&current,i,rs1[i]);
8308             }
8309             if (rt1[i]==31) { // BLTZAL/BGEZAL
8310               alloc_reg(&current,i,31);
8311               dirty_reg(&current,31);
8312               //#ifdef REG_PREFETCH
8313               //alloc_reg(&current,i,PTEMP);
8314               //#endif
8315               //current.is32|=1LL<<rt1[i];
8316             }
8317             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
8318                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
8319               // Allocate the branch condition registers instead.
8320               current.isconst=0;
8321               current.wasconst=0;
8322               regs[i].wasconst=0;
8323               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8324               if(!((current.is32>>rs1[i])&1))
8325               {
8326                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8327               }
8328             }
8329             else
8330             {
8331               ooo[i]=1;
8332               delayslot_alloc(&current,i+1);
8333             }
8334           }
8335           else
8336           // Don't alloc the delay slot yet because we might not execute it
8337           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
8338           {
8339             current.isconst=0;
8340             current.wasconst=0;
8341             regs[i].wasconst=0;
8342             alloc_cc(&current,i);
8343             dirty_reg(&current,CCREG);
8344             alloc_reg(&current,i,rs1[i]);
8345             if(!(current.is32>>rs1[i]&1))
8346             {
8347               alloc_reg64(&current,i,rs1[i]);
8348             }
8349           }
8350           ds=1;
8351           //current.isconst=0;
8352           break;
8353         case FJUMP:
8354           current.isconst=0;
8355           current.wasconst=0;
8356           regs[i].wasconst=0;
8357           if(likely[i]==0) // BC1F/BC1T
8358           {
8359             // TODO: Theoretically we can run out of registers here on x86.
8360             // The delay slot can allocate up to six, and we need to check
8361             // CSREG before executing the delay slot.  Possibly we can drop
8362             // the cycle count and then reload it after checking that the
8363             // FPU is in a usable state, or don't do out-of-order execution.
8364             alloc_cc(&current,i);
8365             dirty_reg(&current,CCREG);
8366             alloc_reg(&current,i,FSREG);
8367             alloc_reg(&current,i,CSREG);
8368             if(itype[i+1]==FCOMP) {
8369               // The delay slot overwrites the branch condition.
8370               // Allocate the branch condition registers instead.
8371               alloc_cc(&current,i);
8372               dirty_reg(&current,CCREG);
8373               alloc_reg(&current,i,CSREG);
8374               alloc_reg(&current,i,FSREG);
8375             }
8376             else {
8377               ooo[i]=1;
8378               delayslot_alloc(&current,i+1);
8379               alloc_reg(&current,i+1,CSREG);
8380             }
8381           }
8382           else
8383           // Don't alloc the delay slot yet because we might not execute it
8384           if(likely[i]) // BC1FL/BC1TL
8385           {
8386             alloc_cc(&current,i);
8387             dirty_reg(&current,CCREG);
8388             alloc_reg(&current,i,CSREG);
8389             alloc_reg(&current,i,FSREG);
8390           }
8391           ds=1;
8392           current.isconst=0;
8393           break;
8394         case IMM16:
8395           imm16_alloc(&current,i);
8396           break;
8397         case LOAD:
8398         case LOADLR:
8399           load_alloc(&current,i);
8400           break;
8401         case STORE:
8402         case STORELR:
8403           store_alloc(&current,i);
8404           break;
8405         case ALU:
8406           alu_alloc(&current,i);
8407           break;
8408         case SHIFT:
8409           shift_alloc(&current,i);
8410           break;
8411         case MULTDIV:
8412           multdiv_alloc(&current,i);
8413           break;
8414         case SHIFTIMM:
8415           shiftimm_alloc(&current,i);
8416           break;
8417         case MOV:
8418           mov_alloc(&current,i);
8419           break;
8420         case COP0:
8421           cop0_alloc(&current,i);
8422           break;
8423         case COP1:
8424         case COP2:
8425           cop1_alloc(&current,i);
8426           break;
8427         case C1LS:
8428           c1ls_alloc(&current,i);
8429           break;
8430         case C2LS:
8431           c2ls_alloc(&current,i);
8432           break;
8433         case C2OP:
8434           c2op_alloc(&current,i);
8435           break;
8436         case FCONV:
8437           fconv_alloc(&current,i);
8438           break;
8439         case FLOAT:
8440           float_alloc(&current,i);
8441           break;
8442         case FCOMP:
8443           fcomp_alloc(&current,i);
8444           break;
8445         case SYSCALL:
8446         case HLECALL:
8447         case INTCALL:
8448           syscall_alloc(&current,i);
8449           break;
8450         case SPAN:
8451           pagespan_alloc(&current,i);
8452           break;
8453       }
8454
8455       // Drop the upper half of registers that have become 32-bit
8456       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8457       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8458         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8459         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8460         current.uu|=1;
8461       } else {
8462         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8463         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8464         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8465         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8466         current.uu|=1;
8467       }
8468
8469       // Create entry (branch target) regmap
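      // regs[i].regmap_entry describes what a jump landing on this instruction
      // must already have in the host registers; anything (re)allocated or
      // unneeded here is left out (-1).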
8470       for(hr=0;hr<HOST_REGS;hr++)
8471       {
8472         int r,or;
8473         r=current.regmap[hr];
8474         if(r>=0) {
8475           if(r!=regmap_pre[i][hr]) {
8476             // TODO: delay slot (?)
8477             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8478             if(or<0||(r&63)>=TEMPREG){
8479               regs[i].regmap_entry[hr]=-1;
8480             }
8481             else
8482             {
8483               // Just move it to a different register
8484               regs[i].regmap_entry[hr]=r;
8485               // If it was dirty before, it's still dirty
8486               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8487             }
8488           }
8489           else
8490           {
8491             // Unneeded
8492             if(r==0){
8493               regs[i].regmap_entry[hr]=0;
8494             }
8495             else
8496             if(r<64){
8497               if((current.u>>r)&1) {
8498                 regs[i].regmap_entry[hr]=-1;
8499                 //regs[i].regmap[hr]=-1;
8500                 current.regmap[hr]=-1;
8501               }else
8502                 regs[i].regmap_entry[hr]=r;
8503             }
8504             else {
8505               if((current.uu>>(r&63))&1) {
8506                 regs[i].regmap_entry[hr]=-1;
8507                 //regs[i].regmap[hr]=-1;
8508                 current.regmap[hr]=-1;
8509               }else
8510                 regs[i].regmap_entry[hr]=r;
8511             }
8512           }
8513         } else {
8514           // Branches expect CCREG to be allocated at the target
8515           if(regmap_pre[i][hr]==CCREG)
8516             regs[i].regmap_entry[hr]=CCREG;
8517           else
8518             regs[i].regmap_entry[hr]=-1;
8519         }
8520       }
8521       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8522     }
8523
8524     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
8525       current.waswritten|=1<<rs1[i-1];
8526     current.waswritten&=~(1<<rt1[i]);
8527     current.waswritten&=~(1<<rt2[i]);
8528     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
8529       current.waswritten&=~(1<<rs1[i]);
8530
8531     /* Branch post-alloc */
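    // Roughly: for the branch at i-1 (whose delay slot is instruction i), build
    // branch_regs[i-1], the allocation state seen on the taken path, while
    // 'current' carries on as the fall-through/not-taken state.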
8532     if(i>0)
8533     {
8534       current.was32=current.is32;
8535       current.wasdirty=current.dirty;
8536       switch(itype[i-1]) {
8537         case UJUMP:
8538           memcpy(&branch_regs[i-1],&current,sizeof(current));
8539           branch_regs[i-1].isconst=0;
8540           branch_regs[i-1].wasconst=0;
8541           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8542           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8543           alloc_cc(&branch_regs[i-1],i-1);
8544           dirty_reg(&branch_regs[i-1],CCREG);
8545           if(rt1[i-1]==31) { // JAL
8546             alloc_reg(&branch_regs[i-1],i-1,31);
8547             dirty_reg(&branch_regs[i-1],31);
8548             branch_regs[i-1].is32|=1LL<<31;
8549           }
8550           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8551           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8552           break;
8553         case RJUMP:
8554           memcpy(&branch_regs[i-1],&current,sizeof(current));
8555           branch_regs[i-1].isconst=0;
8556           branch_regs[i-1].wasconst=0;
8557           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8558           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8559           alloc_cc(&branch_regs[i-1],i-1);
8560           dirty_reg(&branch_regs[i-1],CCREG);
8561           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8562           if(rt1[i-1]!=0) { // JALR
8563             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
8564             dirty_reg(&branch_regs[i-1],rt1[i-1]);
8565             branch_regs[i-1].is32|=1LL<<rt1[i-1];
8566           }
8567           #ifdef USE_MINI_HT
8568           if(rs1[i-1]==31) { // JALR
8569             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8570             #ifndef HOST_IMM_ADDR32
8571             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8572             #endif
8573           }
8574           #endif
8575           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8576           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8577           break;
8578         case CJUMP:
8579           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8580           {
8581             alloc_cc(&current,i-1);
8582             dirty_reg(&current,CCREG);
8583             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8584                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8585               // The delay slot overwrote one of our conditions
8586               // Delay slot goes after the test (in order)
8587               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8588               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8589               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8590               current.u|=1;
8591               current.uu|=1;
8592               delayslot_alloc(&current,i);
8593               current.isconst=0;
8594             }
8595             else
8596             {
8597               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8598               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8599               // Alloc the branch condition registers
8600               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8601               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8602               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8603               {
8604                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8605                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8606               }
8607             }
8608             memcpy(&branch_regs[i-1],&current,sizeof(current));
8609             branch_regs[i-1].isconst=0;
8610             branch_regs[i-1].wasconst=0;
8611             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8612             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8613           }
8614           else
8615           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8616           {
8617             alloc_cc(&current,i-1);
8618             dirty_reg(&current,CCREG);
8619             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8620               // The delay slot overwrote the branch condition
8621               // Delay slot goes after the test (in order)
8622               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8623               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8624               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8625               current.u|=1;
8626               current.uu|=1;
8627               delayslot_alloc(&current,i);
8628               current.isconst=0;
8629             }
8630             else
8631             {
8632               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8633               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8634               // Alloc the branch condition register
8635               alloc_reg(&current,i-1,rs1[i-1]);
8636               if(!(current.is32>>rs1[i-1]&1))
8637               {
8638                 alloc_reg64(&current,i-1,rs1[i-1]);
8639               }
8640             }
8641             memcpy(&branch_regs[i-1],&current,sizeof(current));
8642             branch_regs[i-1].isconst=0;
8643             branch_regs[i-1].wasconst=0;
8644             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8645             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8646           }
8647           else
8648           // Alloc the delay slot in case the branch is taken
8649           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8650           {
8651             memcpy(&branch_regs[i-1],&current,sizeof(current));
8652             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8653             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8654             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8655             alloc_cc(&branch_regs[i-1],i);
8656             dirty_reg(&branch_regs[i-1],CCREG);
8657             delayslot_alloc(&branch_regs[i-1],i);
8658             branch_regs[i-1].isconst=0;
8659             alloc_reg(&current,i,CCREG); // Not taken path
8660             dirty_reg(&current,CCREG);
8661             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8662           }
8663           else
8664           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8665           {
8666             memcpy(&branch_regs[i-1],&current,sizeof(current));
8667             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8668             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8669             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8670             alloc_cc(&branch_regs[i-1],i);
8671             dirty_reg(&branch_regs[i-1],CCREG);
8672             delayslot_alloc(&branch_regs[i-1],i);
8673             branch_regs[i-1].isconst=0;
8674             alloc_reg(&current,i,CCREG); // Not taken path
8675             dirty_reg(&current,CCREG);
8676             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8677           }
8678           break;
8679         case SJUMP:
8680           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8681           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8682           {
8683             alloc_cc(&current,i-1);
8684             dirty_reg(&current,CCREG);
8685             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8686               // The delay slot overwrote the branch condition
8687               // Delay slot goes after the test (in order)
8688               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8689               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8690               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8691               current.u|=1;
8692               current.uu|=1;
8693               delayslot_alloc(&current,i);
8694               current.isconst=0;
8695             }
8696             else
8697             {
8698               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8699               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8700               // Alloc the branch condition register
8701               alloc_reg(&current,i-1,rs1[i-1]);
8702               if(!(current.is32>>rs1[i-1]&1))
8703               {
8704                 alloc_reg64(&current,i-1,rs1[i-1]);
8705               }
8706             }
8707             memcpy(&branch_regs[i-1],&current,sizeof(current));
8708             branch_regs[i-1].isconst=0;
8709             branch_regs[i-1].wasconst=0;
8710             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8711             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8712           }
8713           else
8714           // Alloc the delay slot in case the branch is taken
8715           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8716           {
8717             memcpy(&branch_regs[i-1],&current,sizeof(current));
8718             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8719             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8720             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8721             alloc_cc(&branch_regs[i-1],i);
8722             dirty_reg(&branch_regs[i-1],CCREG);
8723             delayslot_alloc(&branch_regs[i-1],i);
8724             branch_regs[i-1].isconst=0;
8725             alloc_reg(&current,i,CCREG); // Not taken path
8726             dirty_reg(&current,CCREG);
8727             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8728           }
8729           // FIXME: BLTZAL/BGEZAL
8730           if(opcode2[i-1]&0x10) { // BxxZAL
8731             alloc_reg(&branch_regs[i-1],i-1,31);
8732             dirty_reg(&branch_regs[i-1],31);
8733             branch_regs[i-1].is32|=1LL<<31;
8734           }
8735           break;
8736         case FJUMP:
8737           if(likely[i-1]==0) // BC1F/BC1T
8738           {
8739             alloc_cc(&current,i-1);
8740             dirty_reg(&current,CCREG);
8741             if(itype[i]==FCOMP) {
8742               // The delay slot overwrote the branch condition
8743               // Delay slot goes after the test (in order)
8744               delayslot_alloc(&current,i);
8745               current.isconst=0;
8746             }
8747             else
8748             {
8749               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8750               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8751               // Alloc the branch condition register
8752               alloc_reg(&current,i-1,FSREG);
8753             }
8754             memcpy(&branch_regs[i-1],&current,sizeof(current));
8755             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8756           }
8757           else // BC1FL/BC1TL
8758           {
8759             // Alloc the delay slot in case the branch is taken
8760             memcpy(&branch_regs[i-1],&current,sizeof(current));
8761             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8762             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8763             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8764             alloc_cc(&branch_regs[i-1],i);
8765             dirty_reg(&branch_regs[i-1],CCREG);
8766             delayslot_alloc(&branch_regs[i-1],i);
8767             branch_regs[i-1].isconst=0;
8768             alloc_reg(&current,i,CCREG); // Not taken path
8769             dirty_reg(&current,CCREG);
8770             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8771           }
8772           break;
8773       }
8774
8775       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
8776       {
8777         if(rt1[i-1]==31) // JAL/JALR
8778         {
8779           // Subroutine call will return here, don't alloc any registers
8780           current.is32=1;
8781           current.dirty=0;
8782           clear_all_regs(current.regmap);
8783           alloc_reg(&current,i,CCREG);
8784           dirty_reg(&current,CCREG);
8785         }
8786         else if(i+1<slen)
8787         {
8788           // Internal branch will jump here, match registers to caller
8789           current.is32=0x3FFFFFFFFLL;
8790           current.dirty=0;
8791           clear_all_regs(current.regmap);
8792           alloc_reg(&current,i,CCREG);
8793           dirty_reg(&current,CCREG);
8794           for(j=i-1;j>=0;j--)
8795           {
8796             if(ba[j]==start+i*4+4) {
8797               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
8798               current.is32=branch_regs[j].is32;
8799               current.dirty=branch_regs[j].dirty;
8800               break;
8801             }
8802           }
8803           while(j>=0) {
8804             if(ba[j]==start+i*4+4) {
8805               for(hr=0;hr<HOST_REGS;hr++) {
8806                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
8807                   current.regmap[hr]=-1;
8808                 }
8809                 current.is32&=branch_regs[j].is32;
8810                 current.dirty&=branch_regs[j].dirty;
8811               }
8812             }
8813             j--;
8814           }
8815         }
8816       }
8817     }
8818
8819     // Count cycles in between branches
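    // ccadj[i] is the cycle count accumulated since the last branch; the
    // emitted code adds it to CCREG in one go at branch/exception points
    // instead of after every instruction.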
8820     ccadj[i]=cc;
8821     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
8822     {
8823       cc=0;
8824     }
8825 #if !defined(DRC_DBG)
8826     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
8827     {
8828       // GTE runs in parallel until accessed, divide by 2 for a rough guess
8829       cc+=gte_cycletab[source[i]&0x3f]/2;
8830     }
8831     else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // loads/stores cause weird timing issues
8832     {
8833       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
8834     }
8835     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
8836     {
8837       cc+=4;
8838     }
8839     else if(itype[i]==C2LS)
8840     {
8841       cc+=4;
8842     }
8843 #endif
8844     else
8845     {
8846       cc++;
8847     }
8848
8849     flush_dirty_uppers(&current);
8850     if(!is_ds[i]) {
8851       regs[i].is32=current.is32;
8852       regs[i].dirty=current.dirty;
8853       regs[i].isconst=current.isconst;
8854       memcpy(constmap[i],current_constmap,sizeof(current_constmap));
8855     }
8856     for(hr=0;hr<HOST_REGS;hr++) {
8857       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
8858         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
8859           regs[i].wasconst&=~(1<<hr);
8860         }
8861       }
8862     }
8863     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
8864     regs[i].waswritten=current.waswritten;
8865   }
8866
8867   /* Pass 4 - Cull unused host registers */
8868
8869   uint64_t nr=0;
8870
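  // Walk the block backwards keeping 'nr', a bitmask of host registers whose
  // contents are still needed; anything not in the mask can be deallocated,
  // which the "Deallocate unneeded registers" loop below does.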
8871   for (i=slen-1;i>=0;i--)
8872   {
8873     int hr;
8874     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8875     {
8876       if(ba[i]<start || ba[i]>=(start+slen*4))
8877       {
8878         // Branch out of this block, don't need anything
8879         nr=0;
8880       }
8881       else
8882       {
8883         // Internal branch
8884         // Need whatever matches the target
8885         nr=0;
8886         int t=(ba[i]-start)>>2;
8887         for(hr=0;hr<HOST_REGS;hr++)
8888         {
8889           if(regs[i].regmap_entry[hr]>=0) {
8890             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
8891           }
8892         }
8893       }
8894       // Conditional branch may need registers for following instructions
8895       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8896       {
8897         if(i<slen-2) {
8898           nr|=needed_reg[i+2];
8899           for(hr=0;hr<HOST_REGS;hr++)
8900           {
8901             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
8902             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
8903           }
8904         }
8905       }
8906       // Don't need stuff which is overwritten
8907       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8908       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8909       // Merge in delay slot
8910       for(hr=0;hr<HOST_REGS;hr++)
8911       {
8912         if(!likely[i]) {
8913           // These are overwritten unless the branch is "likely"
8914           // and the delay slot is nullified if not taken
8915           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8916           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8917         }
8918         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8919         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8920         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8921         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8922         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8923         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8924         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8925         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8926         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
8927           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8928           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8929         }
8930         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
8931           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8932           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8933         }
8934         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
8935           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8936           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8937         }
8938       }
8939     }
8940     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
8941     {
8942       // SYSCALL instruction (software interrupt)
8943       nr=0;
8944     }
8945     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
8946     {
8947       // ERET instruction (return from interrupt)
8948       nr=0;
8949     }
8950     else // Non-branch
8951     {
8952       if(i<slen-1) {
8953         for(hr=0;hr<HOST_REGS;hr++) {
8954           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
8955           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
8956           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8957           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8958         }
8959       }
8960     }
8961     for(hr=0;hr<HOST_REGS;hr++)
8962     {
8963       // Overwritten registers are not needed
8964       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8965       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8966       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8967       // Source registers are needed
8968       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8969       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8970       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
8971       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
8972       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8973       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8974       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8975       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8976       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
8977         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8978         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8979       }
8980       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
8981         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8982         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8983       }
8984       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
8985         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8986         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8987       }
8988       // Don't store a register immediately after writing it,
8989       // as that may prevent dual-issue.
8990       // But do so if this is a branch target; otherwise we
8991       // might have to load the register before the branch.
8992       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
8993         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
8994            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
8995           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8996           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8997         }
8998         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
8999            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9000           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9001           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9002         }
9003       }
9004     }
9005     // Cycle count is needed at branches.  Assume it is needed at the target too.
9006     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9007       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9008       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9009     }
9010     // Save it
9011     needed_reg[i]=nr;
9012
9013     // Deallocate unneeded registers
9014     for(hr=0;hr<HOST_REGS;hr++)
9015     {
9016       if(!((nr>>hr)&1)) {
9017         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9018         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9019            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9020            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9021         {
9022           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9023           {
9024             if(likely[i]) {
9025               regs[i].regmap[hr]=-1;
9026               regs[i].isconst&=~(1<<hr);
9027               if(i<slen-2) {
9028                 regmap_pre[i+2][hr]=-1;
9029                 regs[i+2].wasconst&=~(1<<hr);
9030               }
9031             }
9032           }
9033         }
9034         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9035         {
9036           int d1=0,d2=0,map=0,temp=0;
9037           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9038           {
9039             d1=dep1[i+1];
9040             d2=dep2[i+1];
9041           }
9042           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
9043              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9044             map=INVCP;
9045           }
9046           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9047              itype[i+1]==C1LS || itype[i+1]==C2LS)
9048             temp=FTEMP;
9049           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9050              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9051              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9052              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9053              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9054              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9055              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9056              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9057              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9058              regs[i].regmap[hr]!=map )
9059           {
9060             regs[i].regmap[hr]=-1;
9061             regs[i].isconst&=~(1<<hr);
9062             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9063                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9064                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9065                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9066                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9067                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9068                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9069                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9070                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9071                branch_regs[i].regmap[hr]!=map)
9072             {
9073               branch_regs[i].regmap[hr]=-1;
9074               branch_regs[i].regmap_entry[hr]=-1;
9075               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9076               {
9077                 if(!likely[i]&&i<slen-2) {
9078                   regmap_pre[i+2][hr]=-1;
9079                   regs[i+2].wasconst&=~(1<<hr);
9080                 }
9081               }
9082             }
9083           }
9084         }
9085         else
9086         {
9087           // Non-branch
9088           if(i>0)
9089           {
9090             int d1=0,d2=0,map=-1,temp=-1;
9091             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9092             {
9093               d1=dep1[i];
9094               d2=dep2[i];
9095             }
9096             if(itype[i]==STORE || itype[i]==STORELR ||
9097                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9098               map=INVCP;
9099             }
9100             if(itype[i]==LOADLR || itype[i]==STORELR ||
9101                itype[i]==C1LS || itype[i]==C2LS)
9102               temp=FTEMP;
9103             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9104                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9105                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9106                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9107                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9108                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9109             {
9110               if(i<slen-1&&!is_ds[i]) {
9111                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9112                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9113                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9114                 {
9115                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9116                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9117                 }
9118                 regmap_pre[i+1][hr]=-1;
9119                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9120                 regs[i+1].wasconst&=~(1<<hr);
9121               }
9122               regs[i].regmap[hr]=-1;
9123               regs[i].isconst&=~(1<<hr);
9124             }
9125           }
9126         }
9127       }
9128     }
9129   }
9130
9131   /* Pass 5 - Pre-allocate registers */
9132
9133   // If a register is allocated during a loop, try to allocate it for the
9134   // entire loop, if possible.  This avoids loading/storing registers
9135   // inside of the loop.
9136
9137   signed char f_regmap[HOST_REGS];
9138   clear_all_regs(f_regmap);
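  // f_regmap[hr] is the MIPS register we would like to keep resident in host
  // register hr across a backwards (loop) branch, so the loop body doesn't
  // have to reload it on every iteration.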
9139   for(i=0;i<slen-1;i++)
9140   {
9141     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9142     {
9143       if(ba[i]>=start && ba[i]<(start+i*4))
9144       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9145       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9146       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9147       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9148       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
9149       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
9150       {
9151         int t=(ba[i]-start)>>2;
9152         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9153         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
9154         for(hr=0;hr<HOST_REGS;hr++)
9155         {
9156           if(regs[i].regmap[hr]>64) {
9157             if(!((regs[i].dirty>>hr)&1))
9158               f_regmap[hr]=regs[i].regmap[hr];
9159             else f_regmap[hr]=-1;
9160           }
9161           else if(regs[i].regmap[hr]>=0) {
9162             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9163               // dealloc old register
9164               int n;
9165               for(n=0;n<HOST_REGS;n++)
9166               {
9167                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9168               }
9169               // and alloc new one
9170               f_regmap[hr]=regs[i].regmap[hr];
9171             }
9172           }
9173           if(branch_regs[i].regmap[hr]>64) {
9174             if(!((branch_regs[i].dirty>>hr)&1))
9175               f_regmap[hr]=branch_regs[i].regmap[hr];
9176             else f_regmap[hr]=-1;
9177           }
9178           else if(branch_regs[i].regmap[hr]>=0) {
9179             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
9180               // dealloc old register
9181               int n;
9182               for(n=0;n<HOST_REGS;n++)
9183               {
9184                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
9185               }
9186               // and alloc new one
9187               f_regmap[hr]=branch_regs[i].regmap[hr];
9188             }
9189           }
9190           if(ooo[i]) {
9191             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
9192               f_regmap[hr]=branch_regs[i].regmap[hr];
9193           }else{
9194             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
9195               f_regmap[hr]=branch_regs[i].regmap[hr];
9196           }
9197           // Avoid dirty->clean transition
9198           #ifdef DESTRUCTIVE_WRITEBACK
9199           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9200           #endif
9201           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
9202           // case above; however, it's always a good idea.  We can't hoist the
9203           // load if the register was already allocated, so there's no point
9204           // wasting time analyzing most of these cases.  It only "succeeds"
9205           // when the mapping was different and the load can be replaced with
9206           // a mov, which is of negligible benefit.  So such cases are
9207           // skipped below.
9208           if(f_regmap[hr]>0) {
9209             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
9210               int r=f_regmap[hr];
9211               for(j=t;j<=i;j++)
9212               {
9213                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9214                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9215                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9216                 if(r>63) {
9217                   // NB This can exclude the case where the upper-half
9218                   // register is lower numbered than the lower-half
9219                   // register.  Not sure if it's worth fixing...
9220                   if(get_reg(regs[j].regmap,r&63)<0) break;
9221                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
9222                   if(regs[j].is32&(1LL<<(r&63))) break;
9223                 }
9224                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9225                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9226                   int k;
9227                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9228                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9229                     if(r>63) {
9230                       if(get_reg(regs[i].regmap,r&63)<0) break;
9231                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9232                     }
9233                     k=i;
9234                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9235                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9236                         //printf("no free regs for store %x\n",start+(k-1)*4);
9237                         break;
9238                       }
9239                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9240                         //printf("no-match due to different register\n");
9241                         break;
9242                       }
9243                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9244                         //printf("no-match due to branch\n");
9245                         break;
9246                       }
9247                       // call/ret fast path assumes no registers allocated
9248                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
9249                         break;
9250                       }
9251                       if(r>63) {
9252                         // NB This can exclude the case where the upper-half
9253                         // register is lower numbered than the lower-half
9254                         // register.  Not sure if it's worth fixing...
9255                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9256                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9257                       }
9258                       k--;
9259                     }
9260                     if(i<slen-1) {
9261                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9262                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9263                         //printf("bad match after branch\n");
9264                         break;
9265                       }
9266                     }
9267                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9268                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
9269                       while(k<i) {
9270                         regs[k].regmap_entry[hr]=f_regmap[hr];
9271                         regs[k].regmap[hr]=f_regmap[hr];
9272                         regmap_pre[k+1][hr]=f_regmap[hr];
9273                         regs[k].wasdirty&=~(1<<hr);
9274                         regs[k].dirty&=~(1<<hr);
9275                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9276                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9277                         regs[k].wasconst&=~(1<<hr);
9278                         regs[k].isconst&=~(1<<hr);
9279                         k++;
9280                       }
9281                     }
9282                     else {
9283                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9284                       break;
9285                     }
9286                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9287                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9288                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9289                       regs[i].regmap_entry[hr]=f_regmap[hr];
9290                       regs[i].regmap[hr]=f_regmap[hr];
9291                       regs[i].wasdirty&=~(1<<hr);
9292                       regs[i].dirty&=~(1<<hr);
9293                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9294                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9295                       regs[i].wasconst&=~(1<<hr);
9296                       regs[i].isconst&=~(1<<hr);
9297                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9298                       branch_regs[i].wasdirty&=~(1<<hr);
9299                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9300                       branch_regs[i].regmap[hr]=f_regmap[hr];
9301                       branch_regs[i].dirty&=~(1<<hr);
9302                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9303                       branch_regs[i].wasconst&=~(1<<hr);
9304                       branch_regs[i].isconst&=~(1<<hr);
9305                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9306                         regmap_pre[i+2][hr]=f_regmap[hr];
9307                         regs[i+2].wasdirty&=~(1<<hr);
9308                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9309                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9310                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9311                       }
9312                     }
9313                   }
9314                   for(k=t;k<j;k++) {
9315                     // Alloc register clean at beginning of loop,
9316                     // but may dirty it in pass 6
9317                     regs[k].regmap_entry[hr]=f_regmap[hr];
9318                     regs[k].regmap[hr]=f_regmap[hr];
9319                     regs[k].dirty&=~(1<<hr);
9320                     regs[k].wasconst&=~(1<<hr);
9321                     regs[k].isconst&=~(1<<hr);
9322                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
9323                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
9324                       branch_regs[k].regmap[hr]=f_regmap[hr];
9325                       branch_regs[k].dirty&=~(1<<hr);
9326                       branch_regs[k].wasconst&=~(1<<hr);
9327                       branch_regs[k].isconst&=~(1<<hr);
9328                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
9329                         regmap_pre[k+2][hr]=f_regmap[hr];
9330                         regs[k+2].wasdirty&=~(1<<hr);
9331                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
9332                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
9333                       }
9334                     }
9335                     else
9336                     {
9337                       regmap_pre[k+1][hr]=f_regmap[hr];
9338                       regs[k+1].wasdirty&=~(1<<hr);
9339                     }
9340                   }
9341                   if(regs[j].regmap[hr]==f_regmap[hr])
9342                     regs[j].regmap_entry[hr]=f_regmap[hr];
9343                   break;
9344                 }
9345                 if(j==i) break;
9346                 if(regs[j].regmap[hr]>=0)
9347                   break;
9348                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
9349                   //printf("no-match due to different register\n");
9350                   break;
9351                 }
9352                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
9353                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
9354                   break;
9355                 }
9356                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9357                 {
9358                   // Stop on unconditional branch
9359                   break;
9360                 }
9361                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
9362                 {
9363                   if(ooo[j]) {
9364                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
9365                       break;
9366                   }else{
9367                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
9368                       break;
9369                   }
9370                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
9371                     //printf("no-match due to different register (branch)\n");
9372                     break;
9373                   }
9374                 }
9375                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9376                   //printf("No free regs for store %x\n",start+j*4);
9377                   break;
9378                 }
9379                 if(f_regmap[hr]>=64) {
9380                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
9381                     break;
9382                   }
9383                   else
9384                   {
9385                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
9386                       break;
9387                     }
9388                   }
9389                 }
9390               }
9391             }
9392           }
9393         }
9394       }
9395     }else{
9396       // Non-branch or undetermined branch target
9397       for(hr=0;hr<HOST_REGS;hr++)
9398       {
9399         if(hr!=EXCLUDE_REG) {
9400           if(regs[i].regmap[hr]>64) {
9401             if(!((regs[i].dirty>>hr)&1))
9402               f_regmap[hr]=regs[i].regmap[hr];
9403           }
9404           else if(regs[i].regmap[hr]>=0) {
9405             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9406               // dealloc old register
9407               int n;
9408               for(n=0;n<HOST_REGS;n++)
9409               {
9410                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9411               }
9412               // and alloc new one
9413               f_regmap[hr]=regs[i].regmap[hr];
9414             }
9415           }
9416         }
9417       }
9418       // Try to restore cycle count at branch targets
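      // Roughly: scan ahead for the next point where HOST_CCREG is mapped;
      // if it already holds CCREG there, fill the gap from this branch
      // target up to that point (and, below, extend backwards from here as
      // well) so the cycle count stays in HOST_CCREG instead of being
      // spilled and reloaded around the target.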
9419       if(bt[i]) {
9420         for(j=i;j<slen-1;j++) {
9421           if(regs[j].regmap[HOST_CCREG]!=-1) break;
9422           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9423             //printf("no free regs for store %x\n",start+j*4);
9424             break;
9425           }
9426         }
9427         if(regs[j].regmap[HOST_CCREG]==CCREG) {
9428           int k=i;
9429           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
9430           while(k<j) {
9431             regs[k].regmap_entry[HOST_CCREG]=CCREG;
9432             regs[k].regmap[HOST_CCREG]=CCREG;
9433             regmap_pre[k+1][HOST_CCREG]=CCREG;
9434             regs[k+1].wasdirty|=1<<HOST_CCREG;
9435             regs[k].dirty|=1<<HOST_CCREG;
9436             regs[k].wasconst&=~(1<<HOST_CCREG);
9437             regs[k].isconst&=~(1<<HOST_CCREG);
9438             k++;
9439           }
9440           regs[j].regmap_entry[HOST_CCREG]=CCREG;
9441         }
9442         // Work backwards from the branch target
9443         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9444         {
9445           //printf("Extend backwards\n");
9446           int k;
9447           k=i;
9448           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9449             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9450               //printf("no free regs for store %x\n",start+(k-1)*4);
9451               break;
9452             }
9453             k--;
9454           }
9455           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9456             //printf("Extend CC, %x ->\n",start+k*4);
9457             while(k<=i) {
9458               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9459               regs[k].regmap[HOST_CCREG]=CCREG;
9460               regmap_pre[k+1][HOST_CCREG]=CCREG;
9461               regs[k+1].wasdirty|=1<<HOST_CCREG;
9462               regs[k].dirty|=1<<HOST_CCREG;
9463               regs[k].wasconst&=~(1<<HOST_CCREG);
9464               regs[k].isconst&=~(1<<HOST_CCREG);
9465               k++;
9466             }
9467           }
9468           else {
9469             //printf("Fail Extend CC, %x ->\n",start+k*4);
9470           }
9471         }
9472       }
9473       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9474          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9475          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9476          itype[i]!=FCONV&&itype[i]!=FCOMP)
9477       {
9478         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9479       }
9480     }
9481   }
9482
9483   // Cache memory offset or tlb map pointer if a register is available
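  // Heuristic sketch: starting at each memory-access instruction, give every
  // free host register a score counting the upcoming loads/stores (and loop
  // back-edges) that could use a cached copy of ROREG; if the best score
  // exceeds 1, pre-allocate that register to ROREG from loop_start[] to
  // end[] so the offset/map pointer does not have to be reloaded per access.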
9484   #ifndef HOST_IMM_ADDR32
9485   #ifndef RAM_OFFSET
9486   if(0)
9487   #endif
9488   {
9489     int earliest_available[HOST_REGS];
9490     int loop_start[HOST_REGS];
9491     int score[HOST_REGS];
9492     int end[HOST_REGS];
9493     int reg=ROREG;
9494
9495     // Init
9496     for(hr=0;hr<HOST_REGS;hr++) {
9497       score[hr]=0;earliest_available[hr]=0;
9498       loop_start[hr]=MAXBLOCK;
9499     }
9500     for(i=0;i<slen-1;i++)
9501     {
9502       // Can't do anything if no registers are available
9503       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
9504         for(hr=0;hr<HOST_REGS;hr++) {
9505           score[hr]=0;earliest_available[hr]=i+1;
9506           loop_start[hr]=MAXBLOCK;
9507         }
9508       }
9509       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9510         if(!ooo[i]) {
9511           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
9512             for(hr=0;hr<HOST_REGS;hr++) {
9513               score[hr]=0;earliest_available[hr]=i+1;
9514               loop_start[hr]=MAXBLOCK;
9515             }
9516           }
9517         }else{
9518           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
9519             for(hr=0;hr<HOST_REGS;hr++) {
9520               score[hr]=0;earliest_available[hr]=i+1;
9521               loop_start[hr]=MAXBLOCK;
9522             }
9523           }
9524         }
9525       }
9526       // Mark unavailable registers
9527       for(hr=0;hr<HOST_REGS;hr++) {
9528         if(regs[i].regmap[hr]>=0) {
9529           score[hr]=0;earliest_available[hr]=i+1;
9530           loop_start[hr]=MAXBLOCK;
9531         }
9532         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9533           if(branch_regs[i].regmap[hr]>=0) {
9534             score[hr]=0;earliest_available[hr]=i+2;
9535             loop_start[hr]=MAXBLOCK;
9536           }
9537         }
9538       }
9539       // No register allocations after unconditional jumps
9540       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
9541       {
9542         for(hr=0;hr<HOST_REGS;hr++) {
9543           score[hr]=0;earliest_available[hr]=i+2;
9544           loop_start[hr]=MAXBLOCK;
9545         }
9546         i++; // Skip delay slot too
9547         //printf("skip delay slot: %x\n",start+i*4);
9548       }
9549       else
9550       // Possible match
9551       if(itype[i]==LOAD||itype[i]==LOADLR||
9552          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
9553         for(hr=0;hr<HOST_REGS;hr++) {
9554           if(hr!=EXCLUDE_REG) {
9555             end[hr]=i-1;
9556             for(j=i;j<slen-1;j++) {
9557               if(regs[j].regmap[hr]>=0) break;
9558               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9559                 if(branch_regs[j].regmap[hr]>=0) break;
9560                 if(ooo[j]) {
9561                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
9562                 }else{
9563                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
9564                 }
9565               }
9566               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
9567               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9568                 int t=(ba[j]-start)>>2;
9569                 if(t<j&&t>=earliest_available[hr]) {
9570                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
9571                     // Score a point for hoisting loop invariant
9572                     if(t<loop_start[hr]) loop_start[hr]=t;
9573                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
9574                     score[hr]++;
9575                     end[hr]=j;
9576                   }
9577                 }
9578                 else if(t<j) {
9579                   if(regs[t].regmap[hr]==reg) {
9580                     // Score a point if the branch target matches this register
9581                     score[hr]++;
9582                     end[hr]=j;
9583                   }
9584                 }
9585                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
9586                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
9587                   score[hr]++;
9588                   end[hr]=j;
9589                 }
9590               }
9591               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9592               {
9593                 // Stop on unconditional branch
9594                 break;
9595               }
9596               else
9597               if(itype[j]==LOAD||itype[j]==LOADLR||
9598                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
9599                 score[hr]++;
9600                 end[hr]=j;
9601               }
9602             }
9603           }
9604         }
9605         // Find highest score and allocate that register
9606         int maxscore=0;
9607         for(hr=0;hr<HOST_REGS;hr++) {
9608           if(hr!=EXCLUDE_REG) {
9609             if(score[hr]>score[maxscore]) {
9610               maxscore=hr;
9611               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
9612             }
9613           }
9614         }
9615         if(score[maxscore]>1)
9616         {
9617           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
9618           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
9619             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
9620             assert(regs[j].regmap[maxscore]<0);
9621             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
9622             regs[j].regmap[maxscore]=reg;
9623             regs[j].dirty&=~(1<<maxscore);
9624             regs[j].wasconst&=~(1<<maxscore);
9625             regs[j].isconst&=~(1<<maxscore);
9626             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9627               branch_regs[j].regmap[maxscore]=reg;
9628               branch_regs[j].wasdirty&=~(1<<maxscore);
9629               branch_regs[j].dirty&=~(1<<maxscore);
9630               branch_regs[j].wasconst&=~(1<<maxscore);
9631               branch_regs[j].isconst&=~(1<<maxscore);
9632               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
9633                 regmap_pre[j+2][maxscore]=reg;
9634                 regs[j+2].wasdirty&=~(1<<maxscore);
9635               }
9636               // loop optimization (loop_preload)
9637               int t=(ba[j]-start)>>2;
9638               if(t==loop_start[maxscore]) {
9639                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
9640                   regs[t].regmap_entry[maxscore]=reg;
9641               }
9642             }
9643             else
9644             {
9645               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
9646                 regmap_pre[j+1][maxscore]=reg;
9647                 regs[j+1].wasdirty&=~(1<<maxscore);
9648               }
9649             }
9650           }
9651           i=j-1;
9652           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
9653           for(hr=0;hr<HOST_REGS;hr++) {
9654             score[hr]=0;earliest_available[hr]=i+1;
9655             loop_start[hr]=MAXBLOCK;
9656           }
9657         }
9658       }
9659     }
9660   }
9661   #endif
9662
9663   // This allocates registers (if possible) one instruction prior
9664   // to use, which can avoid a load-use penalty on certain CPUs.
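  // For example: if instruction i+1 reads a MIPS register that will live in
  // host register hr, and hr is still unused at instruction i, the mapping
  // is pulled up one slot (regs[i].regmap[hr] set here) so the emitted load
  // lands a cycle before its use.  (Illustrative summary of the cases
  // handled below.)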
9665   for(i=0;i<slen-1;i++)
9666   {
9667     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9668     {
9669       if(!bt[i+1])
9670       {
9671         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
9672            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
9673         {
9674           if(rs1[i+1]) {
9675             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9676             {
9677               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9678               {
9679                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9680                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9681                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9682                 regs[i].isconst&=~(1<<hr);
9683                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9684                 constmap[i][hr]=constmap[i+1][hr];
9685                 regs[i+1].wasdirty&=~(1<<hr);
9686                 regs[i].dirty&=~(1<<hr);
9687               }
9688             }
9689           }
9690           if(rs2[i+1]) {
9691             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
9692             {
9693               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9694               {
9695                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9696                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9697                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9698                 regs[i].isconst&=~(1<<hr);
9699                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9700                 constmap[i][hr]=constmap[i+1][hr];
9701                 regs[i+1].wasdirty&=~(1<<hr);
9702                 regs[i].dirty&=~(1<<hr);
9703               }
9704             }
9705           }
9706           // Preload target address for load instruction (non-constant)
9707           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9708             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9709             {
9710               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9711               {
9712                 regs[i].regmap[hr]=rs1[i+1];
9713                 regmap_pre[i+1][hr]=rs1[i+1];
9714                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9715                 regs[i].isconst&=~(1<<hr);
9716                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9717                 constmap[i][hr]=constmap[i+1][hr];
9718                 regs[i+1].wasdirty&=~(1<<hr);
9719                 regs[i].dirty&=~(1<<hr);
9720               }
9721             }
9722           }
9723           // Load source into target register
9724           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9725             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9726             {
9727               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9728               {
9729                 regs[i].regmap[hr]=rs1[i+1];
9730                 regmap_pre[i+1][hr]=rs1[i+1];
9731                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9732                 regs[i].isconst&=~(1<<hr);
9733                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9734                 constmap[i][hr]=constmap[i+1][hr];
9735                 regs[i+1].wasdirty&=~(1<<hr);
9736                 regs[i].dirty&=~(1<<hr);
9737               }
9738             }
9739           }
9740           // Address for store instruction (non-constant)
9741           if(itype[i+1]==STORE||itype[i+1]==STORELR
9742              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
9743             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9744               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
9745               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9746               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
9747               assert(hr>=0);
9748               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9749               {
9750                 regs[i].regmap[hr]=rs1[i+1];
9751                 regmap_pre[i+1][hr]=rs1[i+1];
9752                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9753                 regs[i].isconst&=~(1<<hr);
9754                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9755                 constmap[i][hr]=constmap[i+1][hr];
9756                 regs[i+1].wasdirty&=~(1<<hr);
9757                 regs[i].dirty&=~(1<<hr);
9758               }
9759             }
9760           }
9761           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
9762             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9763               int nr;
9764               hr=get_reg(regs[i+1].regmap,FTEMP);
9765               assert(hr>=0);
9766               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9767               {
9768                 regs[i].regmap[hr]=rs1[i+1];
9769                 regmap_pre[i+1][hr]=rs1[i+1];
9770                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9771                 regs[i].isconst&=~(1<<hr);
9772                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9773                 constmap[i][hr]=constmap[i+1][hr];
9774                 regs[i+1].wasdirty&=~(1<<hr);
9775                 regs[i].dirty&=~(1<<hr);
9776               }
9777               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9778               {
9779                 // move it to another register
9780                 regs[i+1].regmap[hr]=-1;
9781                 regmap_pre[i+2][hr]=-1;
9782                 regs[i+1].regmap[nr]=FTEMP;
9783                 regmap_pre[i+2][nr]=FTEMP;
9784                 regs[i].regmap[nr]=rs1[i+1];
9785                 regmap_pre[i+1][nr]=rs1[i+1];
9786                 regs[i+1].regmap_entry[nr]=rs1[i+1];
9787                 regs[i].isconst&=~(1<<nr);
9788                 regs[i+1].isconst&=~(1<<nr);
9789                 regs[i].dirty&=~(1<<nr);
9790                 regs[i+1].wasdirty&=~(1<<nr);
9791                 regs[i+1].dirty&=~(1<<nr);
9792                 regs[i+2].wasdirty&=~(1<<nr);
9793               }
9794             }
9795           }
9796           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
9797             if(itype[i+1]==LOAD)
9798               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
9799             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
9800               hr=get_reg(regs[i+1].regmap,FTEMP);
9801             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
9802               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
9803               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9804             }
9805             if(hr>=0&&regs[i].regmap[hr]<0) {
9806               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
9807               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
9808                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
9809                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
9810                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
9811                 regs[i].isconst&=~(1<<hr);
9812                 regs[i+1].wasdirty&=~(1<<hr);
9813                 regs[i].dirty&=~(1<<hr);
9814               }
9815             }
9816           }
9817         }
9818       }
9819     }
9820   }
9821
9822   /* Pass 6 - Optimize clean/dirty state */
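  // Roughly: clean_registers() walks the block and settles where cached
  // register values must be written back and where they may stay dirty,
  // so redundant writebacks are not emitted.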
9823   clean_registers(0,slen-1,1);
9824
9825   /* Pass 7 - Identify 32-bit registers */
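  // Note: the PSX's R3000A has only 32-bit GPRs, so there is little to
  // identify here; what remains of this pass is marking the instruction
  // after each conditional branch's delay slot (bt[i+2]) as a branch
  // target, since execution may resume there after an interrupt.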
9826   for (i=slen-1;i>=0;i--)
9827   {
9828     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9829     {
9830       // Conditional branch
9831       if((source[i]>>16)!=0x1000&&i<slen-2) {
9832         // Mark this address as a branch target since execution may resume
9833         // here upon return from an interrupt
9834         bt[i+2]=1;
9835       }
9836     }
9837   }
9838
9839   if(itype[slen-1]==SPAN) {
9840     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
9841   }
9842
9843 #ifdef DISASM
9844   /* Debug/disassembly */
9845   for(i=0;i<slen;i++)
9846   {
9847     printf("U:");
9848     int r;
9849     for(r=1;r<=CCREG;r++) {
9850       if((unneeded_reg[i]>>r)&1) {
9851         if(r==HIREG) printf(" HI");
9852         else if(r==LOREG) printf(" LO");
9853         else printf(" r%d",r);
9854       }
9855     }
9856     printf("\n");
9857     #if defined(__i386__) || defined(__x86_64__)
9858     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
9859     #endif
9860     #ifdef __arm__
9861     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
9862     #endif
9863     printf("needs: ");
9864     if(needed_reg[i]&1) printf("eax ");
9865     if((needed_reg[i]>>1)&1) printf("ecx ");
9866     if((needed_reg[i]>>2)&1) printf("edx ");
9867     if((needed_reg[i]>>3)&1) printf("ebx ");
9868     if((needed_reg[i]>>5)&1) printf("ebp ");
9869     if((needed_reg[i]>>6)&1) printf("esi ");
9870     if((needed_reg[i]>>7)&1) printf("edi ");
9871     printf("\n");
9872     #if defined(__i386__) || defined(__x86_64__)
9873     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
9874     printf("dirty: ");
9875     if(regs[i].wasdirty&1) printf("eax ");
9876     if((regs[i].wasdirty>>1)&1) printf("ecx ");
9877     if((regs[i].wasdirty>>2)&1) printf("edx ");
9878     if((regs[i].wasdirty>>3)&1) printf("ebx ");
9879     if((regs[i].wasdirty>>5)&1) printf("ebp ");
9880     if((regs[i].wasdirty>>6)&1) printf("esi ");
9881     if((regs[i].wasdirty>>7)&1) printf("edi ");
9882     #endif
9883     #ifdef __arm__
9884     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
9885     printf("dirty: ");
9886     if(regs[i].wasdirty&1) printf("r0 ");
9887     if((regs[i].wasdirty>>1)&1) printf("r1 ");
9888     if((regs[i].wasdirty>>2)&1) printf("r2 ");
9889     if((regs[i].wasdirty>>3)&1) printf("r3 ");
9890     if((regs[i].wasdirty>>4)&1) printf("r4 ");
9891     if((regs[i].wasdirty>>5)&1) printf("r5 ");
9892     if((regs[i].wasdirty>>6)&1) printf("r6 ");
9893     if((regs[i].wasdirty>>7)&1) printf("r7 ");
9894     if((regs[i].wasdirty>>8)&1) printf("r8 ");
9895     if((regs[i].wasdirty>>9)&1) printf("r9 ");
9896     if((regs[i].wasdirty>>10)&1) printf("r10 ");
9897     if((regs[i].wasdirty>>12)&1) printf("r12 ");
9898     #endif
9899     printf("\n");
9900     disassemble_inst(i);
9901     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
9902     #if defined(__i386__) || defined(__x86_64__)
9903     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
9904     if(regs[i].dirty&1) printf("eax ");
9905     if((regs[i].dirty>>1)&1) printf("ecx ");
9906     if((regs[i].dirty>>2)&1) printf("edx ");
9907     if((regs[i].dirty>>3)&1) printf("ebx ");
9908     if((regs[i].dirty>>5)&1) printf("ebp ");
9909     if((regs[i].dirty>>6)&1) printf("esi ");
9910     if((regs[i].dirty>>7)&1) printf("edi ");
9911     #endif
9912     #ifdef __arm__
9913     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
9914     if(regs[i].dirty&1) printf("r0 ");
9915     if((regs[i].dirty>>1)&1) printf("r1 ");
9916     if((regs[i].dirty>>2)&1) printf("r2 ");
9917     if((regs[i].dirty>>3)&1) printf("r3 ");
9918     if((regs[i].dirty>>4)&1) printf("r4 ");
9919     if((regs[i].dirty>>5)&1) printf("r5 ");
9920     if((regs[i].dirty>>6)&1) printf("r6 ");
9921     if((regs[i].dirty>>7)&1) printf("r7 ");
9922     if((regs[i].dirty>>8)&1) printf("r8 ");
9923     if((regs[i].dirty>>9)&1) printf("r9 ");
9924     if((regs[i].dirty>>10)&1) printf("r10 ");
9925     if((regs[i].dirty>>12)&1) printf("r12 ");
9926     #endif
9927     printf("\n");
9928     if(regs[i].isconst) {
9929       printf("constants: ");
9930       #if defined(__i386__) || defined(__x86_64__)
9931       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
9932       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
9933       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
9934       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
9935       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
9936       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
9937       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
9938       #endif
9939       #ifdef __arm__
9940       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
9941       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
9942       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
9943       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
9944       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
9945       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
9946       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
9947       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
9948       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
9949       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
9950       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
9951       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
9952       #endif
9953       printf("\n");
9954     }
9955     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9956       #if defined(__i386__) || defined(__x86_64__)
9957       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
9958       if(branch_regs[i].dirty&1) printf("eax ");
9959       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
9960       if((branch_regs[i].dirty>>2)&1) printf("edx ");
9961       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
9962       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
9963       if((branch_regs[i].dirty>>6)&1) printf("esi ");
9964       if((branch_regs[i].dirty>>7)&1) printf("edi ");
9965       #endif
9966       #ifdef __arm__
9967       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
9968       if(branch_regs[i].dirty&1) printf("r0 ");
9969       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
9970       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
9971       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
9972       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
9973       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
9974       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
9975       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
9976       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
9977       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
9978       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
9979       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
9980       #endif
9981     }
9982   }
9983 #endif // DISASM
9984
9985   /* Pass 8 - Assembly */
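  // Per-instruction flow: write back / invalidate host registers whose
  // mapping changes at this point, record the entry address (instr_addr[i])
  // so branches can link to it, load the registers this instruction (and,
  // for branches, the delay slot) needs, generate addresses and constants,
  // then dispatch to the per-type *_assemble() routine.  Delay slots are
  // emitted by the branch assembler itself, so ds skips them in this loop.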
9986   linkcount=0;stubcount=0;
9987   ds=0;is_delayslot=0;
9988   cop1_usable=0;
9989   uint64_t is32_pre=0;
9990   u_int dirty_pre=0;
9991   void *beginning=start_block();
9992   if((u_int)addr&1) {
9993     ds=1;
9994     pagespan_ds();
9995   }
9996   u_int instr_addr0_override=0;
9997
9998   if (start == 0x80030000) {
9999     // nasty hack for fastbios thing
10000     // override block entry to this code
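    // What this emits, in effect: load start and the saved address, store
    // start to pcaddr and address, then compare the two.  On the first
    // entry they differ, so the generated code bails out through
    // new_dyna_leave; when the block is re-entered, address already equals
    // start and execution falls through into the block proper.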
10001     instr_addr0_override=(u_int)out;
10002     emit_movimm(start,0);
10003     // abuse io address var as a flag that we
10004     // have already returned here once
10005     emit_readword((int)&address,1);
10006     emit_writeword(0,(int)&pcaddr);
10007     emit_writeword(0,(int)&address);
10008     emit_cmp(0,1);
10009     emit_jne((int)new_dyna_leave);
10010   }
10011   for(i=0;i<slen;i++)
10012   {
10013     //if(ds) printf("ds: ");
10014     disassemble_inst(i);
10015     if(ds) {
10016       ds=0; // Skip delay slot
10017       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
10018       instr_addr[i]=0;
10019     } else {
10020       speculate_register_values(i);
10021       #ifndef DESTRUCTIVE_WRITEBACK
10022       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10023       {
10024         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
10025               unneeded_reg[i],unneeded_reg_upper[i]);
10026       }
10027       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
10028         is32_pre=branch_regs[i].is32;
10029         dirty_pre=branch_regs[i].dirty;
10030       }else{
10031         is32_pre=regs[i].is32;
10032         dirty_pre=regs[i].dirty;
10033       }
10034       #endif
10035       // write back
10036       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10037       {
10038         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
10039                       unneeded_reg[i],unneeded_reg_upper[i]);
10040         loop_preload(regmap_pre[i],regs[i].regmap_entry);
10041       }
10042       // branch target entry point
10043       instr_addr[i]=(u_int)out;
10044       assem_debug("<->\n");
10045       // load regs
10046       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
10047         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
10048       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
10049       address_generation(i,&regs[i],regs[i].regmap_entry);
10050       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
10051       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10052       {
10053         // Load the delay slot registers if necessary
10054         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
10055           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10056         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
10057           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10058         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
10059           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10060       }
10061       else if(i+1<slen)
10062       {
10063         // Preload registers for following instruction
10064         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10065           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
10066             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10067         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10068           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
10069             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10070       }
10071       // TODO: if(is_ooo(i)) address_generation(i+1);
10072       if(itype[i]==CJUMP||itype[i]==FJUMP)
10073         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
10074       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
10075         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10076       if(bt[i]) cop1_usable=0;
10077       // assemble
10078       switch(itype[i]) {
10079         case ALU:
10080           alu_assemble(i,&regs[i]);break;
10081         case IMM16:
10082           imm16_assemble(i,&regs[i]);break;
10083         case SHIFT:
10084           shift_assemble(i,&regs[i]);break;
10085         case SHIFTIMM:
10086           shiftimm_assemble(i,&regs[i]);break;
10087         case LOAD:
10088           load_assemble(i,&regs[i]);break;
10089         case LOADLR:
10090           loadlr_assemble(i,&regs[i]);break;
10091         case STORE:
10092           store_assemble(i,&regs[i]);break;
10093         case STORELR:
10094           storelr_assemble(i,&regs[i]);break;
10095         case COP0:
10096           cop0_assemble(i,&regs[i]);break;
10097         case COP1:
10098           cop1_assemble(i,&regs[i]);break;
10099         case C1LS:
10100           c1ls_assemble(i,&regs[i]);break;
10101         case COP2:
10102           cop2_assemble(i,&regs[i]);break;
10103         case C2LS:
10104           c2ls_assemble(i,&regs[i]);break;
10105         case C2OP:
10106           c2op_assemble(i,&regs[i]);break;
10107         case FCONV:
10108           fconv_assemble(i,&regs[i]);break;
10109         case FLOAT:
10110           float_assemble(i,&regs[i]);break;
10111         case FCOMP:
10112           fcomp_assemble(i,&regs[i]);break;
10113         case MULTDIV:
10114           multdiv_assemble(i,&regs[i]);break;
10115         case MOV:
10116           mov_assemble(i,&regs[i]);break;
10117         case SYSCALL:
10118           syscall_assemble(i,&regs[i]);break;
10119         case HLECALL:
10120           hlecall_assemble(i,&regs[i]);break;
10121         case INTCALL:
10122           intcall_assemble(i,&regs[i]);break;
10123         case UJUMP:
10124           ujump_assemble(i,&regs[i]);ds=1;break;
10125         case RJUMP:
10126           rjump_assemble(i,&regs[i]);ds=1;break;
10127         case CJUMP:
10128           cjump_assemble(i,&regs[i]);ds=1;break;
10129         case SJUMP:
10130           sjump_assemble(i,&regs[i]);ds=1;break;
10131         case FJUMP:
10132           fjump_assemble(i,&regs[i]);ds=1;break;
10133         case SPAN:
10134           pagespan_assemble(i,&regs[i]);break;
10135       }
10136       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10137         literal_pool(1024);
10138       else
10139         literal_pool_jumpover(256);
10140     }
10141   }
10142   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10143   // If the block did not end with an unconditional branch,
10144   // add a jump to the next instruction.
10145   if(i>1) {
10146     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10147       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10148       assert(i==slen);
10149       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10150         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10151         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10152           emit_loadreg(CCREG,HOST_CCREG);
10153         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10154       }
10155       else if(!likely[i-2])
10156       {
10157         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10158         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10159       }
10160       else
10161       {
10162         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10163         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10164       }
10165       add_to_linker((int)out,start+i*4,0);
10166       emit_jmp(0);
10167     }
10168   }
10169   else
10170   {
10171     assert(i>0);
10172     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10173     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10174     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10175       emit_loadreg(CCREG,HOST_CCREG);
10176     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10177     add_to_linker((int)out,start+i*4,0);
10178     emit_jmp(0);
10179   }
10180
10181   // TODO: delay slot stubs?
10182   // Stubs
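  // These records were queued while assembling the block body; stubs[i][0]
  // selects the handler.  Stub code is emitted out of line, after the main
  // body, and covers the slow paths: loads/stores that need a C handler,
  // cycle-count checks, code-invalidation checks on stores, coprocessor-
  // unusable (COP1) exceptions, and unaligned stores.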
10183   for(i=0;i<stubcount;i++)
10184   {
10185     switch(stubs[i][0])
10186     {
10187       case LOADB_STUB:
10188       case LOADH_STUB:
10189       case LOADW_STUB:
10190       case LOADD_STUB:
10191       case LOADBU_STUB:
10192       case LOADHU_STUB:
10193         do_readstub(i);break;
10194       case STOREB_STUB:
10195       case STOREH_STUB:
10196       case STOREW_STUB:
10197       case STORED_STUB:
10198         do_writestub(i);break;
10199       case CC_STUB:
10200         do_ccstub(i);break;
10201       case INVCODE_STUB:
10202         do_invstub(i);break;
10203       case FP_STUB:
10204         do_cop1stub(i);break;
10205       case STORELR_STUB:
10206         do_unalignedwritestub(i);break;
10207     }
10208   }
10209
10210   if (instr_addr0_override)
10211     instr_addr[0] = instr_addr0_override;
10212
10213   /* Pass 9 - Linker */
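  // link_addr[i][0] is the location of the emitted branch, [1] the MIPS
  // target address, and [2] is nonzero when the target lies inside this
  // block.  Internal branches are patched straight to instr_addr[target];
  // external ones get an extjump stub, plus a direct patch if check_addr()
  // already knows a compiled copy of the target.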
10214   for(i=0;i<linkcount;i++)
10215   {
10216     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10217     literal_pool(64);
10218     if(!link_addr[i][2])
10219     {
10220       void *stub=out;
10221       void *addr=check_addr(link_addr[i][1]);
10222       emit_extjump(link_addr[i][0],link_addr[i][1]);
10223       if(addr) {
10224         set_jump_target(link_addr[i][0],(int)addr);
10225         add_link(link_addr[i][1],stub);
10226       }
10227       else set_jump_target(link_addr[i][0],(int)stub);
10228     }
10229     else
10230     {
10231       // Internal branch
10232       int target=(link_addr[i][1]-start)>>2;
10233       assert(target>=0&&target<slen);
10234       assert(instr_addr[target]);
10235       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10236       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
10237       //#else
10238       set_jump_target(link_addr[i][0],instr_addr[target]);
10239       //#endif
10240     }
10241   }
10242   // External Branch Targets (jump_in)
10243   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
10244   for(i=0;i<slen;i++)
10245   {
10246     if(bt[i]||i==0)
10247     {
10248       if(instr_addr[i]) // TODO - delay slots (=null)
10249       {
10250         u_int vaddr=start+i*4;
10251         u_int page=get_page(vaddr);
10252         u_int vpage=get_vpage(vaddr);
10253         literal_pool(256);
10254         {
10255           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10256           assem_debug("jump_in: %x\n",start+i*4);
10257           ll_add(jump_dirty+vpage,vaddr,(void *)out);
10258           int entry_point=do_dirty_stub(i);
10259           ll_add_flags(jump_in+page,vaddr,state_rflags,(void *)entry_point);
10260           // If there was an existing entry in the hash table,
10261           // replace it with the new address.
10262           // Don't add new entries.  We'll insert the
10263           // ones that actually get used in check_addr().
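          // Bin layout (as used below): ht_bin[0]/[1] hold the first
          // vaddr/entry pair and ht_bin[2]/[3] the second, indexed by
          // ((vaddr>>16)^vaddr)&0xFFFF.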
10264           u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
10265           if(ht_bin[0]==vaddr) {
10266             ht_bin[1]=entry_point;
10267           }
10268           if(ht_bin[2]==vaddr) {
10269             ht_bin[3]=entry_point;
10270           }
10271         }
10272       }
10273     }
10274   }
10275   // Write out the literal pool if necessary
10276   literal_pool(0);
10277   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10278   // Align code
10279   if(((u_int)out)&7) emit_addnop(13);
10280   #endif
10281   assert((u_int)out-(u_int)beginning<MAX_OUTPUT_BLOCK_SIZE);
10282   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
10283   memcpy(copy,source,slen*4);
10284   copy+=slen*4;
10285
10286   end_block(beginning);
10287
10288   // If we're within 256K of the end of the buffer,
10289   // start over from the beginning. (Is 256K enough?)
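  // Each block is asserted above to be under MAX_OUTPUT_BLOCK_SIZE, so
  // wrapping whenever less than that much space remains guarantees the
  // next block cannot run off the end of the translation cache.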
10290   if((u_int)out>(u_int)BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
10291
10292   // Trap writes to any of the pages we compiled
10293   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
10294     invalid_code[i]=0;
10295   }
10296   inv_code_start=inv_code_end=~0;
10297
10298   // for PCSX we need to mark all mirrors too
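  // PSX RAM is mirrored at 0x00000000, 0x80000000 and 0xa0000000, so the
  // same physical page (i&0x1ff within the 2MB of RAM) has to be marked
  // valid in each mirror.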
10299   if(get_page(start)<(RAM_SIZE>>12))
10300     for(i=start>>12;i<=(start+slen*4)>>12;i++)
10301       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
10302       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
10303       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
10304
10305   /* Pass 10 - Free memory by expiring oldest blocks */
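  // expirep is a 16-bit sweep counter: bits 13-15 select which eighth of
  // the translation cache (shift = TARGET_SIZE_2-3) is being expired, bits
  // 11-12 select what to drop this step (jump_in/jump_dirty entries,
  // jump_out pointers, hash-table entries, or the jump_out lists), and the
  // low 11 bits index the page within that step.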
10306
10307   int end=((((int)out-(int)BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
10308   while(expirep!=end)
10309   {
10310     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
10311     int base=(int)BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
10312     inv_debug("EXP: Phase %d\n",expirep);
10313     switch((expirep>>11)&3)
10314     {
10315       case 0:
10316         // Clear jump_in and jump_dirty
10317         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
10318         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
10319         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
10320         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
10321         break;
10322       case 1:
10323         // Clear pointers
10324         ll_kill_pointers(jump_out[expirep&2047],base,shift);
10325         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
10326         break;
10327       case 2:
10328         // Clear hash table
10329         for(i=0;i<32;i++) {
10330           u_int *ht_bin=hash_table[((expirep&2047)<<5)+i];
10331           if((ht_bin[3]>>shift)==(base>>shift) ||
10332              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10333             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
10334             ht_bin[2]=ht_bin[3]=-1;
10335           }
10336           if((ht_bin[1]>>shift)==(base>>shift) ||
10337              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10338             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
10339             ht_bin[0]=ht_bin[2];
10340             ht_bin[1]=ht_bin[3];
10341             ht_bin[2]=ht_bin[3]=-1;
10342           }
10343         }
10344         break;
10345       case 3:
10346         // Clear jump_out
10347         #ifdef __arm__
10348         if((expirep&2047)==0)
10349           do_clear_cache();
10350         #endif
10351         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
10352         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
10353         break;
10354     }
10355     expirep=(expirep+1)&65535;
10356   }
10357   return 0;
10358 }
10359
10360 // vim:shiftwidth=2:expandtab