Merge https://github.com/notaz/pcsx_rearmed
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 int getVMBlock();
36 #endif
37
38 #include "new_dynarec_config.h"
39 #include "backends/psx/emu_if.h" //emulator interface
40
41 //#define DISASM
42 //#define assem_debug printf
43 //#define inv_debug printf
44 #define assem_debug(...)
45 #define inv_debug(...)
46
47 #ifdef __i386__
48 #include "x86/assem_x86.h"
49 #endif
50 #ifdef __x86_64__
51 #include "x64/assem_x64.h"
52 #endif
53 #ifdef __arm__
54 #include "arm/assem_arm.h"
55 #endif
56
57 #ifdef VITA
58 int _newlib_vm_size_user = 1 << TARGET_SIZE_2;
59 #endif
60
61 #define MAXBLOCK 4096
62 #define MAX_OUTPUT_BLOCK_SIZE 262144
63
// Register allocation state tracked by the recompiler.
// Host-register-indexed members use the host register number
// (0..HOST_REGS-1) as array index or bit position.
struct regstat
{
  // Guest register held by each host register; presumably the mapping
  // expected on entry to the instruction (loop_reg() matches against it).
  signed char regmap_entry[HOST_REGS];
  // Guest register held by each host register.  Negative means unallocated
  // (see clear_all_regs/count_free_regs); values >= 64 refer to the upper
  // half of guest register (value & 63) (see flush_dirty_uppers/set_const).
  signed char regmap[HOST_REGS];
  uint64_t was32;                // presumably is32 before the instruction - TODO confirm
  uint64_t is32;                 // one bit per guest register (indexed with reg&63)
  uint64_t wasdirty;             // presumably dirty before the instruction - TODO confirm
  uint64_t dirty;                // one bit per host register, set by dirty_reg()
  uint64_t u;                    // NOTE(review): meaning not visible in this part of the file
  uint64_t uu;                   // NOTE(review): meaning not visible in this part of the file
  u_int wasconst;                // presumably isconst before the instruction - TODO confirm
  u_int isconst;                 // one bit per host register, set by set_const()
  u_int loadedconst;             // host regs that have constants loaded
  u_int waswritten;              // MIPS regs that were used as store base before
};
79
// Node of the singly linked lists kept in jump_in/jump_dirty/jump_out.
// note: asm depends on this layout
struct ll_entry
{
  u_int vaddr;           // guest (emulated) address this entry belongs to
  u_int reg_sv_flags;    // zeroed by ll_add(), set by ll_add_flags()
  void *addr;            // host address associated with vaddr
  struct ll_entry *next; // next node, NULL at the end of the list
};
88
  // used by asm:
  // out: position in the translation cache where the next block is written
  // (start_block() opens the cache for writing starting here).
  u_char *out;
  // hash_table: each bin holds two (vaddr, host address) pairs; [0]/[1] is
  // the most recently inserted pair, [2]/[3] the older one (see get_addr).
  u_int hash_table[65536][4]  __attribute__((aligned(16)));
  // Per-page lists, indexed by get_page()/get_vpage().
  struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
  struct ll_entry *jump_dirty[4096];

  // Per-page list of outgoing jumps that invalidate_page() resets.
  static struct ll_entry *jump_out[4096];
  // Per-block working state; the [MAXBLOCK] arrays are indexed by the
  // instruction's position inside the block being compiled.
  static u_int start;
  static u_int *source;
  static char insn[MAXBLOCK][10];
  static u_char itype[MAXBLOCK];
  static u_char opcode[MAXBLOCK];
  static u_char opcode2[MAXBLOCK];
  static u_char bt[MAXBLOCK];
  static u_char rs1[MAXBLOCK];
  static u_char rs2[MAXBLOCK];
  static u_char rt1[MAXBLOCK];
  static u_char rt2[MAXBLOCK];
  static u_char us1[MAXBLOCK];
  static u_char us2[MAXBLOCK];
  static u_char dep1[MAXBLOCK];
  static u_char dep2[MAXBLOCK];
  static u_char lt1[MAXBLOCK];
  static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
  static uint64_t gte_rt[MAXBLOCK];
  static uint64_t gte_unneeded[MAXBLOCK];
  static u_int smrv[32]; // speculated MIPS register values
  static u_int smrv_strong; // mask of regs that are likely to have correct values
  static u_int smrv_weak; // same, but somewhat less likely
  static u_int smrv_strong_next; // same, but after current insn executes
  static u_int smrv_weak_next;
  static int imm[MAXBLOCK];
  static u_int ba[MAXBLOCK]; // branch target address (compared against start in lsn)
  static char likely[MAXBLOCK];
  static char is_ds[MAXBLOCK];
  static char ooo[MAXBLOCK];
  static uint64_t unneeded_reg[MAXBLOCK];
  static uint64_t unneeded_reg_upper[MAXBLOCK];
  static uint64_t branch_unneeded_reg[MAXBLOCK];
  static uint64_t branch_unneeded_reg_upper[MAXBLOCK];
  static signed char regmap_pre[MAXBLOCK][HOST_REGS];
  static uint64_t current_constmap[HOST_REGS]; // constant per host reg, written by set_const()
  static uint64_t constmap[MAXBLOCK][HOST_REGS];
  static struct regstat regs[MAXBLOCK];
  static struct regstat branch_regs[MAXBLOCK];
  static signed char minimum_free_regs[MAXBLOCK];
  static u_int needed_reg[MAXBLOCK];
  static u_int wont_dirty[MAXBLOCK];
  static u_int will_dirty[MAXBLOCK];
  static int ccadj[MAXBLOCK];
  static int slen; // upper bound for instruction indices in the current block
  static u_int instr_addr[MAXBLOCK];
  static u_int link_addr[MAXBLOCK][3];
  static int linkcount;
  static u_int stubs[MAXBLOCK*3][8];
  static int stubcount;
  static u_int literals[1024][2];
  static int literalcount;
  static int is_delayslot;
  static int cop1_usable;
  static char shadow[1048576]  __attribute__((aligned(16)));
  static void *copy;
  static int expirep;
  static u_int stop_after_jal;
#ifndef RAM_FIXED
  static u_int ram_offset;
#else
  static const u_int ram_offset=0;
#endif

  int new_dynarec_hacks;
  int new_dynarec_did_compile;
  extern u_char restore_candidate[512]; // one bit per page, set in get_addr()
  extern int cycle_count;
163
  /* registers that may be allocated */
  /* 1-31 gpr */
  /* Numbers >= 64 stored in a regmap refer to the upper half of
     register (number & 63); see flush_dirty_uppers(). */
#define HIREG 32 // hi
#define LOREG 33 // lo
#define FSREG 34 // FPU status (FCSR)
#define CSREG 35 // Coprocessor status
#define CCREG 36 // Cycle count
#define INVCP 37 // Pointer to invalid_code
//#define MMREG 38 // Pointer to memory_map
#define ROREG 39 // ram offset (if rdram!=0x80000000)
#define TEMPREG 40
#define FTEMP 40 // FPU temporary register (same number as TEMPREG)
#define PTEMP 41 // Prefetch temporary register
//#define TLREG 42 // TLB mapping offset
#define RHASH 43 // Return address hash
#define RHTBL 44 // Return address hash table address
#define RTEMP 45 // JR/JALR address register
#define MAXREG 45
#define AGEN1 46 // Address generation temporary register
//#define AGEN2 47 // Address generation temporary register
//#define MGEN1 48 // Maptable address generation temporary register
//#define MGEN2 49 // Maptable address generation temporary register
#define BTREG 50 // Branch target temporary register

  /* instruction types (values stored in itype[]) */
#define NOP 0     // No operation
#define LOAD 1    // Load
#define STORE 2   // Store
#define LOADLR 3  // Unaligned load
#define STORELR 4 // Unaligned store
#define MOV 5     // Move
#define ALU 6     // Arithmetic/logic
#define MULTDIV 7 // Multiply/divide
#define SHIFT 8   // Shift by register
#define SHIFTIMM 9// Shift by immediate
#define IMM16 10  // 16-bit immediate
#define RJUMP 11  // Unconditional jump to register
#define UJUMP 12  // Unconditional jump
#define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
#define SJUMP 14  // Conditional branch (regimm format)
#define COP0 15   // Coprocessor 0
#define COP1 16   // Coprocessor 1
#define C1LS 17   // Coprocessor 1 load/store
#define FJUMP 18  // Conditional branch (floating point)
#define FLOAT 19  // Floating point unit
#define FCONV 20  // Convert integer to float
#define FCOMP 21  // Floating point compare (sets FSREG)
#define SYSCALL 22// SYSCALL
#define OTHER 23  // Other
#define SPAN 24   // Branch/delay slot spans 2 pages
#define NI 25     // Not implemented
#define HLECALL 26// PCSX fake opcodes for HLE
#define COP2 27   // Coprocessor 2 move
#define C2LS 28   // Coprocessor 2 load/store
#define C2OP 29   // Coprocessor 2 operation
#define INTCALL 30// Call interpreter to handle rare corner cases

  /* stubs (type argument of add_stub) */
#define CC_STUB 1
#define FP_STUB 2
#define LOADB_STUB 3
#define LOADH_STUB 4
#define LOADW_STUB 5
#define LOADD_STUB 6
#define LOADBU_STUB 7
#define LOADHU_STUB 8
#define STOREB_STUB 9
#define STOREH_STUB 10
#define STOREW_STUB 11
#define STORED_STUB 12
#define STORELR_STUB 13
#define INVCODE_STUB 14

  /* branch codes */
#define TAKEN 1
#define NOTTAKEN 2
#define NULLDS 3
// asm linkage
// (entry points shared with the hand-written assembly; the ones without a
// definition in this file are presumably implemented there - TODO confirm)
int new_recompile_block(int addr);
void *get_addr_ht(u_int vaddr);
void invalidate_block(u_int block);
void invalidate_addr(u_int addr);
void remove_hash(int vaddr);
void dyna_linker();
void dyna_linker_ds();
void verify_code();
void verify_code_vm();
void verify_code_ds();
void cc_interrupt();
void fp_exception();
void fp_exception_ds();
void jump_syscall_hle();
void jump_hlecall();
void jump_intcall();
void new_dyna_leave();

// Needed by assembler
// (forward declarations so the assem_*.c file included further down can
// call functions that are defined after it)
static void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
static void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
static void load_all_regs(signed char i_regmap[]);
static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
static void load_regs_entry(int t);
static void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);

static int verify_dirty(u_int *ptr);
static int get_final_value(int hr, int i, int *value);
static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e);
static void add_to_linker(int addr,int target,int ext);

// Debug switch; not referenced in the visible part of this file.
static int tracedebug=0;
276
// Toggle the translation cache range [start, end) between writable
// (is_x == 0) and executable (is_x != 0).
// Compiles to an empty function unless NO_WRITE_EXEC is defined.
static void mprotect_w_x(void *start, void *end, int is_x)
{
#ifdef NO_WRITE_EXEC
  #if defined(VITA)
  // *Open* enables write on all memory that was
  // allocated by sceKernelAllocMemBlockForVM()?
  if (!is_x)
    sceKernelOpenVMDomain();
  else
    sceKernelCloseVMDomain();
  #else
  // mprotect() needs a page-aligned start, so round down to a 4 KiB boundary.
  const u_long page_base = (u_long)start & ~4095ul;
  const u_long length = (u_long)end - page_base;
  const int prot = PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE);
  const char mode = is_x ? 'x' : 'w';

  if (mprotect((void *)page_base, length, prot) != 0)
    SysPrintf("mprotect(%c) failed: %s\n", mode, strerror(errno));
  #endif
#endif
}
296
// Make [start, end) of the translation cache writable before emitting code.
static void start_tcache_write(void *start, void *end)
{
  const int make_executable = 0; // 0 selects the writable mapping

  mprotect_w_x(start, end, make_executable);
}
301
// Finish writing code to [start, end): on ARM builds run the platform's
// cache maintenance call for the range, then switch the range back to
// executable via mprotect_w_x().
static void end_tcache_write(void *start, void *end)
{
#ifdef __arm__
  size_t len = (char *)end - (char *)start;
  #if   defined(__BLACKBERRY_QNX__)
  msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
  #elif defined(__MACH__)
  sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
  #elif defined(VITA)
  sceKernelSyncVMDomain(sceBlock, start, len);
  #elif defined(_3DS)
  ctr_flush_invalidate_cache();
  #else
  __clear_cache(start, end);
  #endif
  // len is unused in the _3DS and default branches; silence the warning
  (void)len;
#endif

  mprotect_w_x(start, end, 1);
}
322
323 static void *start_block(void)
324 {
325   u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
326   if (end > (u_char *)BASE_ADDR + (1<<TARGET_SIZE_2))
327     end = (u_char *)BASE_ADDR + (1<<TARGET_SIZE_2);
328   start_tcache_write(out, end);
329   return out;
330 }
331
332 static void end_block(void *start)
333 {
334   end_tcache_write(start, out);
335 }
336
337 //#define DEBUG_CYCLE_COUNT 1
338
339 #define NO_CYCLE_PENALTY_THR 12
340
int cycle_multiplier; // 100 for 1.0

// Scale a cycle count by cycle_multiplier/100.
// 50 with the sign of x is added before the truncating division, so for a
// positive multiplier the result is rounded to nearest, halves away from zero.
static int CLOCK_ADJUST(int x)
{
  // Sign of x as +1/-1 (0 counts as positive).  Written as a comparison
  // because right-shifting a negative int is implementation-defined.
  int s = (x < 0) ? -1 : 1;
  return (x * cycle_multiplier + s * 50) / 100;
}
348
// Map a guest address to its page index in the jump tables.
// The top three address bits are ignored.  Addresses below 0x1000000 also
// lose bits 21-23, so the RAM mirrors share the pages of the first 2 MiB.
// Page numbers above 2048 are folded into the range 2048..4095.
static u_int get_page(u_int vaddr)
{
  u_int addr = vaddr & 0x1fffffff;

  if (addr < 0x1000000)
    addr &= 0x1fffff; // RAM mirrors

  const u_int page = addr >> 12;
  return (page > 2048) ? 2048 + (page & 2047) : page;
}
358
359 // no virtual mem in PCSX
360 static u_int get_vpage(u_int vaddr)
361 {
362   return get_page(vaddr);
363 }
364
365 // Get address from virtual address
366 // This is called from the recompiled JR/JALR instructions
367 void *get_addr(u_int vaddr)
368 {
369   struct ll_entry *head = NULL;
370   u_int page            = get_page(vaddr);
371   u_int vpage           = get_vpage(vaddr);
372   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
373   head=jump_in[page];
374   while(head!=NULL)
375   {
376     if(head->vaddr==vaddr)
377     {
378       //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
379       u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
380       ht_bin[3]=ht_bin[1];
381       ht_bin[2]=ht_bin[0];
382       ht_bin[1]=(u_int)head->addr;
383       ht_bin[0]=vaddr;
384       return head->addr;
385     }
386     head=head->next;
387   }
388   head=jump_dirty[vpage];
389   while(head!=NULL)
390   {
391     if(head->vaddr==vaddr)
392     {
393       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
394       // Don't restore blocks which are about to expire from the cache
395       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
396         if(verify_dirty(head->addr))
397         {
398           //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
399           invalid_code[vaddr>>12]=0;
400           inv_code_start=inv_code_end=~0;
401           if(vpage<2048)
402           {
403             restore_candidate[vpage>>3]|=1<<(vpage&7);
404           }
405           else
406           {
407             restore_candidate[page>>3]|=1<<(page&7);
408           }
409           u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
410
411           if(ht_bin[0]==vaddr)
412             ht_bin[1]=(u_int)head->addr; // Replace existing entry
413           else
414           {
415             ht_bin[3]=ht_bin[1];
416             ht_bin[2]=ht_bin[0];
417             ht_bin[1]=(int)head->addr;
418             ht_bin[0]=vaddr;
419           }
420           return head->addr;
421         }
422     }
423     head=head->next;
424   }
425   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
426   int r=new_recompile_block(vaddr);
427   if(r==0)
428     return get_addr(vaddr);
429   // Execute in unmapped page, generate pagefault exception
430   Status|=2;
431   Cause=(vaddr<<31)|0x8;
432   EPC=(vaddr&1)?vaddr-5:vaddr;
433   BadVAddr=(vaddr&~1);
434   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
435   EntryHi=BadVAddr&0xFFFFE000;
436   return get_addr_ht(0x80000000);
437 }
438
439 // Look up address in hash table first
440 void *get_addr_ht(u_int vaddr)
441 {
442   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
443   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
444   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
445   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
446   return get_addr(vaddr);
447 }
448
449 void clear_all_regs(signed char regmap[])
450 {
451   int hr;
452   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
453 }
454
455 signed char get_reg(signed char regmap[],int r)
456 {
457   int hr;
458   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
459   return -1;
460 }
461
462 // Find a register that is available for two consecutive cycles
463 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
464 {
465   int hr;
466   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
467   return -1;
468 }
469
470 int count_free_regs(signed char regmap[])
471 {
472   int count=0;
473   int hr;
474   for(hr=0;hr<HOST_REGS;hr++)
475   {
476     if(hr!=EXCLUDE_REG) {
477       if(regmap[hr]<0) count++;
478     }
479   }
480   return count;
481 }
482
483 void dirty_reg(struct regstat *cur,signed char reg)
484 {
485   int hr;
486   if(!reg) return;
487   for (hr=0;hr<HOST_REGS;hr++) {
488     if((cur->regmap[hr]&63)==reg) {
489       cur->dirty|=1<<hr;
490     }
491   }
492 }
493
494 // If we dirty the lower half of a 64 bit register which is now being
495 // sign-extended, we need to dump the upper half.
496 // Note: Do this only after completion of the instruction, because
497 // some instructions may need to read the full 64-bit value even if
498 // overwriting it (eg SLTI, DSRA32).
499 static void flush_dirty_uppers(struct regstat *cur)
500 {
501   int hr,reg;
502   for (hr=0;hr<HOST_REGS;hr++) {
503     if((cur->dirty>>hr)&1) {
504       reg=cur->regmap[hr];
505       if(reg>=64)
506         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
507     }
508   }
509 }
510
511 void set_const(struct regstat *cur,signed char reg,uint64_t value)
512 {
513   int hr;
514   if(!reg) return;
515   for (hr=0;hr<HOST_REGS;hr++) {
516     if(cur->regmap[hr]==reg) {
517       cur->isconst|=1<<hr;
518       current_constmap[hr]=value;
519     }
520     else if((cur->regmap[hr]^64)==reg) {
521       cur->isconst|=1<<hr;
522       current_constmap[hr]=value>>32;
523     }
524   }
525 }
526
527 void clear_const(struct regstat *cur,signed char reg)
528 {
529   int hr;
530   if(!reg) return;
531   for (hr=0;hr<HOST_REGS;hr++) {
532     if((cur->regmap[hr]&63)==reg) {
533       cur->isconst&=~(1<<hr);
534     }
535   }
536 }
537
538 int is_const(struct regstat *cur,signed char reg)
539 {
540   int hr;
541   if(reg<0) return 0;
542   if(!reg) return 1;
543   for (hr=0;hr<HOST_REGS;hr++) {
544     if((cur->regmap[hr]&63)==reg) {
545       return (cur->isconst>>hr)&1;
546     }
547   }
548   return 0;
549 }
550 uint64_t get_const(struct regstat *cur,signed char reg)
551 {
552   int hr;
553   if(!reg) return 0;
554   for (hr=0;hr<HOST_REGS;hr++) {
555     if(cur->regmap[hr]==reg) {
556       return current_constmap[hr];
557     }
558   }
559   SysPrintf("Unknown constant in r%d\n",reg);
560   exit(1);
561 }
562
// Least soon needed registers
// Look at the next ten instructions and see which registers
// will be used.  Try not to reallocate these.
//
// hsn[]  in/out, indexed by register number; an entry is overwritten with
//        the distance (in instructions from i) at which the register is
//        used, so a smaller value means "needed sooner".
// i      index of the current instruction in the block.
// preferred_reg  not used by this function.
void lsn(u_char hsn[], int i, int *preferred_reg)
{
  int j;
  int b=-1; // offset of the nearest branch found in the window, -1 if none
  // Find how far to look ahead: stop at the end of the block or right
  // after an unconditional jump.
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    // 0x1000 in the upper half-word is BEQ r0,r0, i.e. an always-taken branch
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditional jump
      j++;
      break;
    }
  }
  // Walk backwards so that the nearest use of each register is written last
  for(;j>=0;j--)
  {
    if(rs1[i+j]) hsn[rs1[i+j]]=j;
    if(rs2[i+j]) hsn[rs2[i+j]]=j;
    if(rt1[i+j]) hsn[rt1[i+j]]=j;
    if(rt2[i+j]) hsn[rt2[i+j]]=j;
    if(itype[i+j]==STORE || itype[i+j]==STORELR) {
      // Stores can allocate zero
      hsn[rs1[i+j]]=j;
      hsn[rs2[i+j]]=j;
    }
    // On some architectures stores need invc_ptr
    #if defined(HOST_IMM8)
    // (opcode&0x3b) matches 0x39/0x3d and 0x3a/0x3e: coprocessor 1/2 stores
    if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
      hsn[INVCP]=j;
    }
    #endif
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
    {
      // Branches use the cycle count
      hsn[CCREG]=j;
      b=j;
    }
  }
  if(b>=0)
  {
    // Only follow branches whose target lies inside the current block
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int t=(ba[i+b]-start)>>2; // instruction index of the branch target
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        // Only lower an entry: uses before the branch keep their distance
        if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
        if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
        //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
        //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
      }
    }
    // TODO: preferred register based on backward branch
  }
  // Delay slot should preferably not overwrite branch conditions or cycle count
  if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
    if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
    if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
    hsn[CCREG]=1;
    // ...or hash tables
    hsn[RHASH]=1;
    hsn[RHTBL]=1;
  }
  // Coprocessor load/store needs FTEMP, even if not declared
  if(itype[i]==C1LS||itype[i]==C2LS) {
    hsn[FTEMP]=0;
  }
  // Load L/R also uses FTEMP as a temporary register
  if(itype[i]==LOADLR) {
    hsn[FTEMP]=0;
  }
  // Also SWL/SWR/SDL/SDR
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
    hsn[FTEMP]=0;
  }
  // Don't remove the miniht registers
  if(itype[i]==UJUMP||itype[i]==RJUMP)
  {
    hsn[RHASH]=0;
    hsn[RHTBL]=0;
  }
}
651
// We only want to allocate registers if we're going to use them again soon
//
// Returns 1 if register r is read as a source (rs1/rs2) by one of the
// instructions following i inside the look-ahead window and is not marked
// unneeded before that use; returns 0 otherwise.
int needed_again(int r, int i)
{
  int j;
  int b=-1;
  int rn=10; // distance to the next use of r; 10 means "not found"

  if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
  {
    if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
      return 0; // Don't need any registers if exiting the block
  }
  // Determine the look-ahead window, as in lsn()
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditional jump
      j++;
      break;
    }
    // Also stop at SYSCALL/HLECALL/INTCALL and at SPECIAL funct 0x0d (BREAK)
    if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
    {
      break;
    }
  }
  // Walk backwards from the end of the window down to i+1; the current
  // instruction itself (j==0) is not examined.
  for(;j>=1;j--)
  {
    if(rs1[i+j]==r) rn=j;
    if(rs2[i+j]==r) rn=j;
    // r is unneeded at this point: forget uses found further ahead
    if((unneeded_reg[i+j]>>r)&1) rn=10;
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
    {
      b=j;
    }
  }
  // Disabled: following the first branch into its target
  /*
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int o=rn;
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        if(!((unneeded_reg[t+j]>>r)&1)) {
          if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
          if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
        }
        else rn=o;
      }
    }
  }*/
  if(rn<10) return 1;
  (void)b; // b is only consumed by the disabled code above
  return 0;
}
714
715 // Try to match register allocations at the end of a loop with those
716 // at the beginning
717 int loop_reg(int i, int r, int hr)
718 {
719   int j,k;
720   for(j=0;j<9;j++)
721   {
722     if(i+j>=slen) {
723       j=slen-i-1;
724       break;
725     }
726     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
727     {
728       // Don't go past an unconditonal jump
729       j++;
730       break;
731     }
732   }
733   k=0;
734   if(i>0){
735     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
736       k--;
737   }
738   for(;k<j;k++)
739   {
740     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
741     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
742     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
743     {
744       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
745       {
746         int t=(ba[i+k]-start)>>2;
747         int reg=get_reg(regs[t].regmap_entry,r);
748         if(reg>=0) return reg;
749         //reg=get_reg(regs[t+1].regmap_entry,r);
750         //if(reg>=0) return reg;
751       }
752     }
753   }
754   return hr;
755 }
756
757
758 // Allocate every register, preserving source/target regs
759 void alloc_all(struct regstat *cur,int i)
760 {
761   int hr;
762
763   for(hr=0;hr<HOST_REGS;hr++) {
764     if(hr!=EXCLUDE_REG) {
765       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
766          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
767       {
768         cur->regmap[hr]=-1;
769         cur->dirty&=~(1<<hr);
770       }
771       // Don't need zeros
772       if((cur->regmap[hr]&63)==0)
773       {
774         cur->regmap[hr]=-1;
775         cur->dirty&=~(1<<hr);
776       }
777     }
778   }
779 }
780
781 #ifdef __i386__
782 #include "x86/assem_x86.c"
783 #endif
784 #ifdef __x86_64__
785 #include "x64/assem_x64.c"
786 #endif
787 #ifdef __arm__
788 #include "arm/assem_arm.c"
789 #endif
790
791 // Add virtual address mapping to linked list
792 void ll_add(struct ll_entry **head,int vaddr,void *addr)
793 {
794   struct ll_entry *new_entry;
795   new_entry=malloc(sizeof(struct ll_entry));
796   assert(new_entry!=NULL);
797   new_entry->vaddr=vaddr;
798   new_entry->reg_sv_flags=0;
799   new_entry->addr=addr;
800   new_entry->next=*head;
801   *head=new_entry;
802 }
803
804 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
805 {
806   ll_add(head,vaddr,addr);
807   (*head)->reg_sv_flags=reg_sv_flags;
808 }
809
// Check if an address is already compiled
// but don't return addresses which are about to expire from the cache
//
// Returns the host address for vaddr, or 0 if no usable block is found.
// The hash table is consulted first, then the jump_in list of the page.
void *check_addr(u_int vaddr)
{
  u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
  // Hash table hits must pass the expiry test and isclean()
  if(ht_bin[0]==vaddr) {
    if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
      if(isclean(ht_bin[1])) return (void *)ht_bin[1];
  }
  if(ht_bin[2]==vaddr) {
    if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
      if(isclean(ht_bin[3])) return (void *)ht_bin[3];
  }
  u_int page=get_page(vaddr);
  struct ll_entry *head;
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
      // Note: unlike the hash table checks above, this expiry test does not
      // subtract MAX_OUTPUT_BLOCK_SIZE from the address.
      if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
        // Update existing entry with current address
        if(ht_bin[0]==vaddr) {
          ht_bin[1]=(int)head->addr;
          return head->addr;
        }
        if(ht_bin[2]==vaddr) {
          ht_bin[3]=(int)head->addr;
          return head->addr;
        }
        // Insert into hash table with low priority.
        // Don't evict existing entries, as they are probably
        // addresses that are being accessed frequently.
        // (-1 marks an empty slot, see remove_hash)
        if(ht_bin[0]==-1) {
          ht_bin[1]=(int)head->addr;
          ht_bin[0]=vaddr;
        }else if(ht_bin[2]==-1) {
          ht_bin[3]=(int)head->addr;
          ht_bin[2]=vaddr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  return 0;
}
855
856 void remove_hash(int vaddr)
857 {
858   //printf("remove hash: %x\n",vaddr);
859   u_int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
860   if(ht_bin[2]==vaddr) {
861     ht_bin[2]=ht_bin[3]=-1;
862   }
863   if(ht_bin[0]==vaddr) {
864     ht_bin[0]=ht_bin[2];
865     ht_bin[1]=ht_bin[3];
866     ht_bin[2]=ht_bin[3]=-1;
867   }
868 }
869
870 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
871 {
872   struct ll_entry *next;
873   while(*head) {
874     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) ||
875        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
876     {
877       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
878       remove_hash((*head)->vaddr);
879       next=(*head)->next;
880       free(*head);
881       *head=next;
882     }
883     else
884     {
885       head=&((*head)->next);
886     }
887   }
888 }
889
890 // Remove all entries from linked list
891 void ll_clear(struct ll_entry **head)
892 {
893   struct ll_entry *cur;
894   struct ll_entry *next;
895   if((cur=*head)) {
896     *head=0;
897     while(cur) {
898       next=cur->next;
899       free(cur);
900       cur=next;
901     }
902   }
903 }
904
905 // Dereference the pointers and remove if it matches
906 static void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
907 {
908   while(head) {
909     int ptr=get_pointer(head->addr);
910     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
911     if(((ptr>>shift)==(addr>>shift)) ||
912        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
913     {
914       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
915       void *host_addr=find_extjump_insn(head->addr);
916       #ifdef __arm__
917         mark_clear_cache(host_addr);
918       #endif
919       set_jump_target((int)host_addr,(int)head->addr);
920     }
921     head=head->next;
922   }
923 }
924
925 // This is called when we write to a compiled block (see do_invstub)
926 void invalidate_page(u_int page)
927 {
928   struct ll_entry *head;
929   struct ll_entry *next;
930   head=jump_in[page];
931   jump_in[page]=0;
932   while(head!=NULL) {
933     inv_debug("INVALIDATE: %x\n",head->vaddr);
934     remove_hash(head->vaddr);
935     next=head->next;
936     free(head);
937     head=next;
938   }
939   head=jump_out[page];
940   jump_out[page]=0;
941   while(head!=NULL) {
942     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
943     void *host_addr=find_extjump_insn(head->addr);
944     #ifdef __arm__
945       mark_clear_cache(host_addr);
946     #endif
947     set_jump_target((int)host_addr,(int)head->addr);
948     next=head->next;
949     free(head);
950     head=next;
951   }
952 }
953
// Invalidate the page of 4K block number 'block' plus neighbouring pages
// in the range first..last, then flag the block in invalid_code[].
//   block  block number (address >> 12)
//   first  lowest page to invalidate, may be below the block's own page
//   last   upper page bound, see the note on the second loop
static void invalidate_block_range(u_int block, u_int first, u_int last)
{
  u_int page=get_page(block<<12);
  //printf("first=%d last=%d\n",first,last);
  invalidate_page(page);
  assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
  assert(last<page+5);
  // Invalidate the adjacent pages if a block crosses a 4K boundary
  while(first<page)
  {
    invalidate_page(first);
    first++;
  }
  // NOTE(review): this loop stops before 'last', so page 'last' itself is
  // not invalidated here - confirm that this is intended.
  for(first=page+1;first<last;first++)
  {
    invalidate_page(first);
  }

#ifdef __arm__
  do_clear_cache();
#endif

  // Don't trap writes
  invalid_code[block]=1;

#ifdef USE_MINI_HT
  // Reset every mini hash table entry to -1
  memset(mini_ht,-1,sizeof(mini_ht));
#endif
}
983
// Invalidate all compiled code of 4K block number 'block' (address >> 12).
// The jump_dirty list of the page is scanned for blocks whose source range
// (from get_bounds) lies in RAM and overlaps the page; the page range
// first..last is widened to cover them and then passed to
// invalidate_block_range().
void invalidate_block(u_int block)
{
  u_int page=get_page(block<<12);
  u_int vpage=get_vpage(block<<12);
  inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
  u_int first,last;
  first=last=page;
  struct ll_entry *head;
  head=jump_dirty[vpage];
  //printf("page=%d vpage=%d\n",page,vpage);
  while(head!=NULL)
  {
    u_int start,end; // note: this 'start' shadows the file-scope variable
    if(vpage>2047||(head->vaddr>>12)==block)
    { // Ignore vaddr hash collision
      get_bounds((int)head->addr,&start,&end);
      //printf("start: %x end: %x\n",start,end);
      // start/end are compared against rdram, i.e. they are host addresses
      if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE)
      {
        // Does the block's source range touch this page?
        if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page)
        {
          if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
          if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
        }
      }
    }
    head=head->next;
  }
  invalidate_block_range(block,first,last);
}
1014
1015 void invalidate_addr(u_int addr)
1016 {
1017   //static int rhits;
1018   // this check is done by the caller
1019   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
1020   u_int page=get_vpage(addr);
1021   if(page<2048) { // RAM
1022     struct ll_entry *head;
1023     u_int addr_min=~0, addr_max=0;
1024     u_int mask=RAM_SIZE-1;
1025     u_int addr_main=0x80000000|(addr&mask);
1026     int pg1;
1027     inv_code_start=addr_main&~0xfff;
1028     inv_code_end=addr_main|0xfff;
1029     pg1=page;
1030     if (pg1>0) {
1031       // must check previous page too because of spans..
1032       pg1--;
1033       inv_code_start-=0x1000;
1034     }
1035     for(;pg1<=page;pg1++) {
1036       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
1037         u_int start,end;
1038         get_bounds((int)head->addr,&start,&end);
1039         if(ram_offset) {
1040           start-=ram_offset;
1041           end-=ram_offset;
1042         }
1043         if(start<=addr_main&&addr_main<end) {
1044           if(start<addr_min) addr_min=start;
1045           if(end>addr_max) addr_max=end;
1046         }
1047         else if(addr_main<start) {
1048           if(start<inv_code_end)
1049             inv_code_end=start-1;
1050         }
1051         else {
1052           if(end>inv_code_start)
1053             inv_code_start=end;
1054         }
1055       }
1056     }
1057     if (addr_min!=~0) {
1058       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
1059       inv_code_start=inv_code_end=~0;
1060       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
1061       return;
1062     }
1063     else {
1064       inv_code_start=(addr&~mask)|(inv_code_start&mask);
1065       inv_code_end=(addr&~mask)|(inv_code_end&mask);
1066       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
1067       return;
1068     }
1069   }
1070   invalidate_block(addr>>12);
1071 }
1072
1073 // This is called when loading a save state.
1074 // Anything could have changed, so invalidate everything.
1075 void invalidate_all_pages(void)
1076 {
1077   u_int page;
1078   for(page=0;page<4096;page++)
1079     invalidate_page(page);
1080   for(page=0;page<1048576;page++)
1081   {
1082     if(!invalid_code[page])
1083     {
1084       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1085       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1086     }
1087   }
1088
1089 #ifdef USE_MINI_HT
1090   memset(mini_ht,-1,sizeof(mini_ht));
1091 #endif
1092 }
1093
1094 // Add an entry to jump_out after making a link
1095 void add_link(u_int vaddr,void *src)
1096 {
1097   u_int page=get_page(vaddr);
1098   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1099   int *ptr=(int *)(src+4);
1100   assert((*ptr&0x0fff0000)==0x059f0000);
1101   (void)ptr;
1102   ll_add(jump_out+page,vaddr,src);
1103   //int ptr=get_pointer(src);
1104   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1105 }
1106
1107 // If a code block was found to be unmodified (bit was set in
1108 // restore_candidate) and it remains unmodified (bit is clear
1109 // in invalid_code) then move the entries for that 4K page from
1110 // the dirty list to the clean list.
1111 void clean_blocks(u_int page)
1112 {
1113   struct ll_entry *head;
1114   inv_debug("INV: clean_blocks page=%d\n",page);
1115   head=jump_dirty[page];
1116   while(head!=NULL)
1117   {
1118     if(!invalid_code[head->vaddr>>12])
1119     {
1120       // Don't restore blocks which are about to expire from the cache
1121       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1122       {
1123         u_int start,end;
1124         if(verify_dirty(head->addr))
1125         {
1126           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1127           u_int i;
1128           u_int inv=0;
1129           get_bounds((int)head->addr,&start,&end);
1130           if(start-(u_int)rdram<RAM_SIZE)
1131           {
1132             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++)
1133             {
1134               inv|=invalid_code[i];
1135             }
1136           }
1137           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE)
1138           {
1139             inv=1;
1140           }
1141           if(!inv)
1142           {
1143             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1144             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1145             {
1146               u_int ppage=page;
1147               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1148               //printf("page=%x, addr=%x\n",page,head->vaddr);
1149               //assert(head->vaddr>>12==(page|0x80000));
1150               ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
1151               u_int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1152               if(ht_bin[0]==head->vaddr)
1153               {
1154                 ht_bin[1]=(u_int)clean_addr; // Replace existing entry
1155               }
1156               if(ht_bin[2]==head->vaddr)
1157               {
1158                 ht_bin[3]=(u_int)clean_addr; // Replace existing entry
1159               }
1160             }
1161           }
1162         }
1163       }
1164     }
1165     head=head->next;
1166   }
1167 }
1168
1169 static void mov_alloc(struct regstat *current,int i)
1170 {
1171   // Note: Don't need to actually alloc the source registers
1172   if((~current->is32>>rs1[i])&1)
1173   {
1174     //alloc_reg64(current,i,rs1[i]);
1175     alloc_reg64(current,i,rt1[i]);
1176     current->is32&=~(1LL<<rt1[i]);
1177   }
1178   else
1179   {
1180     //alloc_reg(current,i,rs1[i]);
1181     alloc_reg(current,i,rt1[i]);
1182     current->is32|=(1LL<<rt1[i]);
1183   }
1184   clear_const(current,rs1[i]);
1185   clear_const(current,rt1[i]);
1186   dirty_reg(current,rt1[i]);
1187 }
1188
1189 void shiftimm_alloc(struct regstat *current,int i)
1190 {
1191   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1192   {
1193     if(rt1[i]) {
1194       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1195       else lt1[i]=rs1[i];
1196       alloc_reg(current,i,rt1[i]);
1197       current->is32|=1LL<<rt1[i];
1198       dirty_reg(current,rt1[i]);
1199       if(is_const(current,rs1[i])) {
1200         int v=get_const(current,rs1[i]);
1201         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1202         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1203         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1204       }
1205       else clear_const(current,rt1[i]);
1206     }
1207   }
1208   else
1209   {
1210     clear_const(current,rs1[i]);
1211     clear_const(current,rt1[i]);
1212   }
1213
1214   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1215   {
1216     if(rt1[i]) {
1217       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1218       alloc_reg64(current,i,rt1[i]);
1219       current->is32&=~(1LL<<rt1[i]);
1220       dirty_reg(current,rt1[i]);
1221     }
1222   }
1223   if(opcode2[i]==0x3c) // DSLL32
1224   {
1225     if(rt1[i]) {
1226       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1227       alloc_reg64(current,i,rt1[i]);
1228       current->is32&=~(1LL<<rt1[i]);
1229       dirty_reg(current,rt1[i]);
1230     }
1231   }
1232   if(opcode2[i]==0x3e) // DSRL32
1233   {
1234     if(rt1[i]) {
1235       alloc_reg64(current,i,rs1[i]);
1236       if(imm[i]==32) {
1237         alloc_reg64(current,i,rt1[i]);
1238         current->is32&=~(1LL<<rt1[i]);
1239       } else {
1240         alloc_reg(current,i,rt1[i]);
1241         current->is32|=1LL<<rt1[i];
1242       }
1243       dirty_reg(current,rt1[i]);
1244     }
1245   }
1246   if(opcode2[i]==0x3f) // DSRA32
1247   {
1248     if(rt1[i]) {
1249       alloc_reg64(current,i,rs1[i]);
1250       alloc_reg(current,i,rt1[i]);
1251       current->is32|=1LL<<rt1[i];
1252       dirty_reg(current,rt1[i]);
1253     }
1254   }
1255 }
1256
1257 void shift_alloc(struct regstat *current,int i)
1258 {
1259   if(rt1[i]) {
1260     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1261     {
1262       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1263       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1264       alloc_reg(current,i,rt1[i]);
1265       if(rt1[i]==rs2[i]) {
1266         alloc_reg_temp(current,i,-1);
1267         minimum_free_regs[i]=1;
1268       }
1269       current->is32|=1LL<<rt1[i];
1270     } else { // DSLLV/DSRLV/DSRAV
1271       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1272       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1273       alloc_reg64(current,i,rt1[i]);
1274       current->is32&=~(1LL<<rt1[i]);
1275       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1276       {
1277         alloc_reg_temp(current,i,-1);
1278         minimum_free_regs[i]=1;
1279       }
1280     }
1281     clear_const(current,rs1[i]);
1282     clear_const(current,rs2[i]);
1283     clear_const(current,rt1[i]);
1284     dirty_reg(current,rt1[i]);
1285   }
1286 }
1287
1288 void alu_alloc(struct regstat *current,int i)
1289 {
1290   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1291     if(rt1[i]) {
1292       if(rs1[i]&&rs2[i]) {
1293         alloc_reg(current,i,rs1[i]);
1294         alloc_reg(current,i,rs2[i]);
1295       }
1296       else {
1297         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1298         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1299       }
1300       alloc_reg(current,i,rt1[i]);
1301     }
1302     current->is32|=1LL<<rt1[i];
1303   }
1304   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1305     if(rt1[i]) {
1306       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1307       {
1308         alloc_reg64(current,i,rs1[i]);
1309         alloc_reg64(current,i,rs2[i]);
1310         alloc_reg(current,i,rt1[i]);
1311       } else {
1312         alloc_reg(current,i,rs1[i]);
1313         alloc_reg(current,i,rs2[i]);
1314         alloc_reg(current,i,rt1[i]);
1315       }
1316     }
1317     current->is32|=1LL<<rt1[i];
1318   }
1319   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1320     if(rt1[i]) {
1321       if(rs1[i]&&rs2[i]) {
1322         alloc_reg(current,i,rs1[i]);
1323         alloc_reg(current,i,rs2[i]);
1324       }
1325       else
1326       {
1327         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1328         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1329       }
1330       alloc_reg(current,i,rt1[i]);
1331       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1332       {
1333         if(!((current->uu>>rt1[i])&1)) {
1334           alloc_reg64(current,i,rt1[i]);
1335         }
1336         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1337           if(rs1[i]&&rs2[i]) {
1338             alloc_reg64(current,i,rs1[i]);
1339             alloc_reg64(current,i,rs2[i]);
1340           }
1341           else
1342           {
1343             // Is is really worth it to keep 64-bit values in registers?
1344             #ifdef NATIVE_64BIT
1345             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1346             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1347             #endif
1348           }
1349         }
1350         current->is32&=~(1LL<<rt1[i]);
1351       } else {
1352         current->is32|=1LL<<rt1[i];
1353       }
1354     }
1355   }
1356   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1357     if(rt1[i]) {
1358       if(rs1[i]&&rs2[i]) {
1359         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1360           alloc_reg64(current,i,rs1[i]);
1361           alloc_reg64(current,i,rs2[i]);
1362           alloc_reg64(current,i,rt1[i]);
1363         } else {
1364           alloc_reg(current,i,rs1[i]);
1365           alloc_reg(current,i,rs2[i]);
1366           alloc_reg(current,i,rt1[i]);
1367         }
1368       }
1369       else {
1370         alloc_reg(current,i,rt1[i]);
1371         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1372           // DADD used as move, or zeroing
1373           // If we have a 64-bit source, then make the target 64 bits too
1374           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1375             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1376             alloc_reg64(current,i,rt1[i]);
1377           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1378             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1379             alloc_reg64(current,i,rt1[i]);
1380           }
1381           if(opcode2[i]>=0x2e&&rs2[i]) {
1382             // DSUB used as negation - 64-bit result
1383             // If we have a 32-bit register, extend it to 64 bits
1384             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1385             alloc_reg64(current,i,rt1[i]);
1386           }
1387         }
1388       }
1389       if(rs1[i]&&rs2[i]) {
1390         current->is32&=~(1LL<<rt1[i]);
1391       } else if(rs1[i]) {
1392         current->is32&=~(1LL<<rt1[i]);
1393         if((current->is32>>rs1[i])&1)
1394           current->is32|=1LL<<rt1[i];
1395       } else if(rs2[i]) {
1396         current->is32&=~(1LL<<rt1[i]);
1397         if((current->is32>>rs2[i])&1)
1398           current->is32|=1LL<<rt1[i];
1399       } else {
1400         current->is32|=1LL<<rt1[i];
1401       }
1402     }
1403   }
1404   clear_const(current,rs1[i]);
1405   clear_const(current,rs2[i]);
1406   clear_const(current,rt1[i]);
1407   dirty_reg(current,rt1[i]);
1408 }
1409
1410 void imm16_alloc(struct regstat *current,int i)
1411 {
1412   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1413   else lt1[i]=rs1[i];
1414   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1415   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1416     current->is32&=~(1LL<<rt1[i]);
1417     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1418       // TODO: Could preserve the 32-bit flag if the immediate is zero
1419       alloc_reg64(current,i,rt1[i]);
1420       alloc_reg64(current,i,rs1[i]);
1421     }
1422     clear_const(current,rs1[i]);
1423     clear_const(current,rt1[i]);
1424   }
1425   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1426     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1427     current->is32|=1LL<<rt1[i];
1428     clear_const(current,rs1[i]);
1429     clear_const(current,rt1[i]);
1430   }
1431   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1432     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1433       if(rs1[i]!=rt1[i]) {
1434         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1435         alloc_reg64(current,i,rt1[i]);
1436         current->is32&=~(1LL<<rt1[i]);
1437       }
1438     }
1439     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1440     if(is_const(current,rs1[i])) {
1441       int v=get_const(current,rs1[i]);
1442       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1443       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1444       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1445     }
1446     else clear_const(current,rt1[i]);
1447   }
1448   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1449     if(is_const(current,rs1[i])) {
1450       int v=get_const(current,rs1[i]);
1451       set_const(current,rt1[i],v+imm[i]);
1452     }
1453     else clear_const(current,rt1[i]);
1454     current->is32|=1LL<<rt1[i];
1455   }
1456   else {
1457     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1458     current->is32|=1LL<<rt1[i];
1459   }
1460   dirty_reg(current,rt1[i]);
1461 }
1462
1463 void load_alloc(struct regstat *current,int i)
1464 {
1465   clear_const(current,rt1[i]);
1466   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1467   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1468   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1469   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1470     alloc_reg(current,i,rt1[i]);
1471     assert(get_reg(current->regmap,rt1[i])>=0);
1472     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1473     {
1474       current->is32&=~(1LL<<rt1[i]);
1475       alloc_reg64(current,i,rt1[i]);
1476     }
1477     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1478     {
1479       current->is32&=~(1LL<<rt1[i]);
1480       alloc_reg64(current,i,rt1[i]);
1481       alloc_all(current,i);
1482       alloc_reg64(current,i,FTEMP);
1483       minimum_free_regs[i]=HOST_REGS;
1484     }
1485     else current->is32|=1LL<<rt1[i];
1486     dirty_reg(current,rt1[i]);
1487     // LWL/LWR need a temporary register for the old value
1488     if(opcode[i]==0x22||opcode[i]==0x26)
1489     {
1490       alloc_reg(current,i,FTEMP);
1491       alloc_reg_temp(current,i,-1);
1492       minimum_free_regs[i]=1;
1493     }
1494   }
1495   else
1496   {
1497     // Load to r0 or unneeded register (dummy load)
1498     // but we still need a register to calculate the address
1499     if(opcode[i]==0x22||opcode[i]==0x26)
1500     {
1501       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1502     }
1503     alloc_reg_temp(current,i,-1);
1504     minimum_free_regs[i]=1;
1505     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1506     {
1507       alloc_all(current,i);
1508       alloc_reg64(current,i,FTEMP);
1509       minimum_free_regs[i]=HOST_REGS;
1510     }
1511   }
1512 }
1513
1514 void store_alloc(struct regstat *current,int i)
1515 {
1516   clear_const(current,rs2[i]);
1517   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1518   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1519   alloc_reg(current,i,rs2[i]);
1520   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1521     alloc_reg64(current,i,rs2[i]);
1522     if(rs2[i]) alloc_reg(current,i,FTEMP);
1523   }
1524   #if defined(HOST_IMM8)
1525   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1526   else alloc_reg(current,i,INVCP);
1527   #endif
1528   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWL/SDL/SDR
1529     alloc_reg(current,i,FTEMP);
1530   }
1531   // We need a temporary register for address generation
1532   alloc_reg_temp(current,i,-1);
1533   minimum_free_regs[i]=1;
1534 }
1535
1536 void c1ls_alloc(struct regstat *current,int i)
1537 {
1538   //clear_const(current,rs1[i]); // FIXME
1539   clear_const(current,rt1[i]);
1540   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1541   alloc_reg(current,i,CSREG); // Status
1542   alloc_reg(current,i,FTEMP);
1543   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1544     alloc_reg64(current,i,FTEMP);
1545   }
1546   #if defined(HOST_IMM8)
1547   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1548   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1549     alloc_reg(current,i,INVCP);
1550   #endif
1551   // We need a temporary register for address generation
1552   alloc_reg_temp(current,i,-1);
1553 }
1554
1555 void c2ls_alloc(struct regstat *current,int i)
1556 {
1557   clear_const(current,rt1[i]);
1558   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1559   alloc_reg(current,i,FTEMP);
1560   #if defined(HOST_IMM8)
1561   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1562   if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1563     alloc_reg(current,i,INVCP);
1564   #endif
1565   // We need a temporary register for address generation
1566   alloc_reg_temp(current,i,-1);
1567   minimum_free_regs[i]=1;
1568 }
1569
1570 #ifndef multdiv_alloc
1571 void multdiv_alloc(struct regstat *current,int i)
1572 {
1573   //  case 0x18: MULT
1574   //  case 0x19: MULTU
1575   //  case 0x1A: DIV
1576   //  case 0x1B: DIVU
1577   //  case 0x1C: DMULT
1578   //  case 0x1D: DMULTU
1579   //  case 0x1E: DDIV
1580   //  case 0x1F: DDIVU
1581   clear_const(current,rs1[i]);
1582   clear_const(current,rs2[i]);
1583   if(rs1[i]&&rs2[i])
1584   {
1585     if((opcode2[i]&4)==0) // 32-bit
1586     {
1587       current->u&=~(1LL<<HIREG);
1588       current->u&=~(1LL<<LOREG);
1589       alloc_reg(current,i,HIREG);
1590       alloc_reg(current,i,LOREG);
1591       alloc_reg(current,i,rs1[i]);
1592       alloc_reg(current,i,rs2[i]);
1593       current->is32|=1LL<<HIREG;
1594       current->is32|=1LL<<LOREG;
1595       dirty_reg(current,HIREG);
1596       dirty_reg(current,LOREG);
1597     }
1598     else // 64-bit
1599     {
1600       current->u&=~(1LL<<HIREG);
1601       current->u&=~(1LL<<LOREG);
1602       current->uu&=~(1LL<<HIREG);
1603       current->uu&=~(1LL<<LOREG);
1604       alloc_reg64(current,i,HIREG);
1605       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1606       alloc_reg64(current,i,rs1[i]);
1607       alloc_reg64(current,i,rs2[i]);
1608       alloc_all(current,i);
1609       current->is32&=~(1LL<<HIREG);
1610       current->is32&=~(1LL<<LOREG);
1611       dirty_reg(current,HIREG);
1612       dirty_reg(current,LOREG);
1613       minimum_free_regs[i]=HOST_REGS;
1614     }
1615   }
1616   else
1617   {
1618     // Multiply by zero is zero.
1619     // MIPS does not have a divide by zero exception.
1620     // The result is undefined, we return zero.
1621     alloc_reg(current,i,HIREG);
1622     alloc_reg(current,i,LOREG);
1623     current->is32|=1LL<<HIREG;
1624     current->is32|=1LL<<LOREG;
1625     dirty_reg(current,HIREG);
1626     dirty_reg(current,LOREG);
1627   }
1628 }
1629 #endif
1630
1631 void cop0_alloc(struct regstat *current,int i)
1632 {
1633   if(opcode2[i]==0) // MFC0
1634   {
1635     if(rt1[i]) {
1636       clear_const(current,rt1[i]);
1637       alloc_all(current,i);
1638       alloc_reg(current,i,rt1[i]);
1639       current->is32|=1LL<<rt1[i];
1640       dirty_reg(current,rt1[i]);
1641     }
1642   }
1643   else if(opcode2[i]==4) // MTC0
1644   {
1645     if(rs1[i]){
1646       clear_const(current,rs1[i]);
1647       alloc_reg(current,i,rs1[i]);
1648       alloc_all(current,i);
1649     }
1650     else {
1651       alloc_all(current,i); // FIXME: Keep r0
1652       current->u&=~1LL;
1653       alloc_reg(current,i,0);
1654     }
1655   }
1656   else
1657   {
1658     // TLBR/TLBWI/TLBWR/TLBP/ERET
1659     assert(opcode2[i]==0x10);
1660     alloc_all(current,i);
1661   }
1662   minimum_free_regs[i]=HOST_REGS;
1663 }
1664
1665 void cop1_alloc(struct regstat *current,int i)
1666 {
1667   alloc_reg(current,i,CSREG); // Load status
1668   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1669   {
1670     if(rt1[i]){
1671       clear_const(current,rt1[i]);
1672       if(opcode2[i]==1) {
1673         alloc_reg64(current,i,rt1[i]); // DMFC1
1674         current->is32&=~(1LL<<rt1[i]);
1675       }else{
1676         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1677         current->is32|=1LL<<rt1[i];
1678       }
1679       dirty_reg(current,rt1[i]);
1680     }
1681     alloc_reg_temp(current,i,-1);
1682   }
1683   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1684   {
1685     if(rs1[i]){
1686       clear_const(current,rs1[i]);
1687       if(opcode2[i]==5)
1688         alloc_reg64(current,i,rs1[i]); // DMTC1
1689       else
1690         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1691       alloc_reg_temp(current,i,-1);
1692     }
1693     else {
1694       current->u&=~1LL;
1695       alloc_reg(current,i,0);
1696       alloc_reg_temp(current,i,-1);
1697     }
1698   }
1699   minimum_free_regs[i]=1;
1700 }
1701 void fconv_alloc(struct regstat *current,int i)
1702 {
1703   alloc_reg(current,i,CSREG); // Load status
1704   alloc_reg_temp(current,i,-1);
1705   minimum_free_regs[i]=1;
1706 }
1707 void float_alloc(struct regstat *current,int i)
1708 {
1709   alloc_reg(current,i,CSREG); // Load status
1710   alloc_reg_temp(current,i,-1);
1711   minimum_free_regs[i]=1;
1712 }
// Register allocation for C2OP-type instructions: a single temporary.
void c2op_alloc(struct regstat *current,int i)
{
  const int any_reg = -1; // passed through to alloc_reg_temp() unchanged

  alloc_reg_temp(current, i, any_reg);
}
1717 void fcomp_alloc(struct regstat *current,int i)
1718 {
1719   alloc_reg(current,i,CSREG); // Load status
1720   alloc_reg(current,i,FSREG); // Load flags
1721   dirty_reg(current,FSREG); // Flag will be modified
1722   alloc_reg_temp(current,i,-1);
1723   minimum_free_regs[i]=1;
1724 }
1725
1726 void syscall_alloc(struct regstat *current,int i)
1727 {
1728   alloc_cc(current,i);
1729   dirty_reg(current,CCREG);
1730   alloc_all(current,i);
1731   minimum_free_regs[i]=HOST_REGS;
1732   current->isconst=0;
1733 }
1734
1735 void delayslot_alloc(struct regstat *current,int i)
1736 {
1737   switch(itype[i])
1738   {
1739     case UJUMP:
1740     case CJUMP:
1741     case SJUMP:
1742     case RJUMP:
1743     case FJUMP:
1744     case SYSCALL:
1745     case HLECALL:
1746     case SPAN:
1747       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1748       SysPrintf("Disabled speculative precompilation\n");
1749       stop_after_jal=1;
1750       break;
1751     case IMM16:
1752       imm16_alloc(current,i);
1753       break;
1754     case LOAD:
1755     case LOADLR:
1756       load_alloc(current,i);
1757       break;
1758     case STORE:
1759     case STORELR:
1760       store_alloc(current,i);
1761       break;
1762     case ALU:
1763       alu_alloc(current,i);
1764       break;
1765     case SHIFT:
1766       shift_alloc(current,i);
1767       break;
1768     case MULTDIV:
1769       multdiv_alloc(current,i);
1770       break;
1771     case SHIFTIMM:
1772       shiftimm_alloc(current,i);
1773       break;
1774     case MOV:
1775       mov_alloc(current,i);
1776       break;
1777     case COP0:
1778       cop0_alloc(current,i);
1779       break;
1780     case COP1:
1781     case COP2:
1782       cop1_alloc(current,i);
1783       break;
1784     case C1LS:
1785       c1ls_alloc(current,i);
1786       break;
1787     case C2LS:
1788       c2ls_alloc(current,i);
1789       break;
1790     case FCONV:
1791       fconv_alloc(current,i);
1792       break;
1793     case FLOAT:
1794       float_alloc(current,i);
1795       break;
1796     case FCOMP:
1797       fcomp_alloc(current,i);
1798       break;
1799     case C2OP:
1800       c2op_alloc(current,i);
1801       break;
1802   }
1803 }
1804
1805 // Special case where a branch and delay slot span two pages in virtual memory
1806 static void pagespan_alloc(struct regstat *current,int i)
1807 {
1808   current->isconst=0;
1809   current->wasconst=0;
1810   regs[i].wasconst=0;
1811   minimum_free_regs[i]=HOST_REGS;
1812   alloc_all(current,i);
1813   alloc_cc(current,i);
1814   dirty_reg(current,CCREG);
1815   if(opcode[i]==3) // JAL
1816   {
1817     alloc_reg(current,i,31);
1818     dirty_reg(current,31);
1819   }
1820   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1821   {
1822     alloc_reg(current,i,rs1[i]);
1823     if (rt1[i]!=0) {
1824       alloc_reg(current,i,rt1[i]);
1825       dirty_reg(current,rt1[i]);
1826     }
1827   }
1828   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1829   {
1830     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1831     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1832     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1833     {
1834       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1835       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1836     }
1837   }
1838   else
1839   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1840   {
1841     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1842     if(!((current->is32>>rs1[i])&1))
1843     {
1844       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1845     }
1846   }
1847   else
1848   if(opcode[i]==0x11) // BC1
1849   {
1850     alloc_reg(current,i,FSREG);
1851     alloc_reg(current,i,CSREG);
1852   }
1853   //else ...
1854 }
1855
1856 static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1857 {
1858   stubs[stubcount][0]=type;
1859   stubs[stubcount][1]=addr;
1860   stubs[stubcount][2]=retaddr;
1861   stubs[stubcount][3]=a;
1862   stubs[stubcount][4]=b;
1863   stubs[stubcount][5]=c;
1864   stubs[stubcount][6]=d;
1865   stubs[stubcount][7]=e;
1866   stubcount++;
1867 }
1868
1869 // Write out a single register
1870 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1871 {
1872   int hr;
1873   for(hr=0;hr<HOST_REGS;hr++) {
1874     if(hr!=EXCLUDE_REG) {
1875       if((regmap[hr]&63)==r) {
1876         if((dirty>>hr)&1) {
1877           if(regmap[hr]<64) {
1878             emit_storereg(r,hr);
1879           }else{
1880             emit_storereg(r|64,hr);
1881           }
1882         }
1883       }
1884     }
1885   }
1886 }
1887
#if 0
// Debug-only tracing helpers.  This whole section is compiled out.

// Rotate-and-XOR checksum over the first 2097152 words of rdram.
static int mchecksum(void)
{
  //if(!tracedebug) return 0;
  int sum = 0;
  int word;

  for (word = 0; word < 2097152; word++) {
    unsigned int prev = sum;

    sum <<= 1;
    sum |= (~prev) >> 31;
    sum ^= ((u_int *)rdram)[word];
  }
  return sum;
}

// XOR of the first 64 words of reg.
static int rchecksum(void)
{
  int sum = 0;
  int word = 0;

  while (word < 64) {
    sum ^= ((u_int *)reg)[word];
    word++;
  }
  return sum;
}

// Print r0..r31, each as two 32-bit halves (word [1] first, then word [0]).
static void rlist(void)
{
  int r;

  printf("TRACE: ");
  for (r = 0; r < 32; r++) {
    int *halves = (int *)(reg + r);

    printf("r%d:%8x%8x ", r, halves[1], halves[0]);
  }
  printf("\n");
}

// Turn tracing on.
static void enabletrace(void)
{
  tracedebug = 1;
}

// Print a trace record (cycle count, next interrupt, memory checksum, the
// register list and a few words read relative to a stack variable) while
// Count, taken as a signed value, lies in [-2084597794, 0).
static void memdebug(int i)
{
  if ((signed int)Count < -2084597794 || (signed int)Count >= 0)
    return;

  printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
  rlist();
  #ifdef __i386__
  printf("TRACE: %x\n",(&i)[-1]);
  #endif
  #ifdef __arm__
  int j;
  printf("TRACE: %x \n",(&j)[10]);
  printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",
         (&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],
         (&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],
         (&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],
         (&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
  #endif
  //fflush(stdout);
}
#endif
1952
1953 void alu_assemble(int i,struct regstat *i_regs)
1954 {
1955   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1956     if(rt1[i]) {
1957       signed char s1,s2,t;
1958       t=get_reg(i_regs->regmap,rt1[i]);
1959       if(t>=0) {
1960         s1=get_reg(i_regs->regmap,rs1[i]);
1961         s2=get_reg(i_regs->regmap,rs2[i]);
1962         if(rs1[i]&&rs2[i]) {
1963           assert(s1>=0);
1964           assert(s2>=0);
1965           if(opcode2[i]&2) emit_sub(s1,s2,t);
1966           else emit_add(s1,s2,t);
1967         }
1968         else if(rs1[i]) {
1969           if(s1>=0) emit_mov(s1,t);
1970           else emit_loadreg(rs1[i],t);
1971         }
1972         else if(rs2[i]) {
1973           if(s2>=0) {
1974             if(opcode2[i]&2) emit_neg(s2,t);
1975             else emit_mov(s2,t);
1976           }
1977           else {
1978             emit_loadreg(rs2[i],t);
1979             if(opcode2[i]&2) emit_neg(t,t);
1980           }
1981         }
1982         else emit_zeroreg(t);
1983       }
1984     }
1985   }
1986   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1987     if(rt1[i]) {
1988       signed char s1l,s2l,s1h,s2h,tl,th;
1989       tl=get_reg(i_regs->regmap,rt1[i]);
1990       th=get_reg(i_regs->regmap,rt1[i]|64);
1991       if(tl>=0) {
1992         s1l=get_reg(i_regs->regmap,rs1[i]);
1993         s2l=get_reg(i_regs->regmap,rs2[i]);
1994         s1h=get_reg(i_regs->regmap,rs1[i]|64);
1995         s2h=get_reg(i_regs->regmap,rs2[i]|64);
1996         if(rs1[i]&&rs2[i]) {
1997           assert(s1l>=0);
1998           assert(s2l>=0);
1999           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2000           else emit_adds(s1l,s2l,tl);
2001           if(th>=0) {
2002             #ifdef INVERTED_CARRY
2003             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2004             #else
2005             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2006             #endif
2007             else emit_add(s1h,s2h,th);
2008           }
2009         }
2010         else if(rs1[i]) {
2011           if(s1l>=0) emit_mov(s1l,tl);
2012           else emit_loadreg(rs1[i],tl);
2013           if(th>=0) {
2014             if(s1h>=0) emit_mov(s1h,th);
2015             else emit_loadreg(rs1[i]|64,th);
2016           }
2017         }
2018         else if(rs2[i]) {
2019           if(s2l>=0) {
2020             if(opcode2[i]&2) emit_negs(s2l,tl);
2021             else emit_mov(s2l,tl);
2022           }
2023           else {
2024             emit_loadreg(rs2[i],tl);
2025             if(opcode2[i]&2) emit_negs(tl,tl);
2026           }
2027           if(th>=0) {
2028             #ifdef INVERTED_CARRY
2029             if(s2h>=0) emit_mov(s2h,th);
2030             else emit_loadreg(rs2[i]|64,th);
2031             if(opcode2[i]&2) {
2032               emit_adcimm(-1,th); // x86 has inverted carry flag
2033               emit_not(th,th);
2034             }
2035             #else
2036             if(opcode2[i]&2) {
2037               if(s2h>=0) emit_rscimm(s2h,0,th);
2038               else {
2039                 emit_loadreg(rs2[i]|64,th);
2040                 emit_rscimm(th,0,th);
2041               }
2042             }else{
2043               if(s2h>=0) emit_mov(s2h,th);
2044               else emit_loadreg(rs2[i]|64,th);
2045             }
2046             #endif
2047           }
2048         }
2049         else {
2050           emit_zeroreg(tl);
2051           if(th>=0) emit_zeroreg(th);
2052         }
2053       }
2054     }
2055   }
2056   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2057     if(rt1[i]) {
2058       signed char s1l,s1h,s2l,s2h,t;
2059       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2060       {
2061         t=get_reg(i_regs->regmap,rt1[i]);
2062         //assert(t>=0);
2063         if(t>=0) {
2064           s1l=get_reg(i_regs->regmap,rs1[i]);
2065           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2066           s2l=get_reg(i_regs->regmap,rs2[i]);
2067           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2068           if(rs2[i]==0) // rx<r0
2069           {
2070             assert(s1h>=0);
2071             if(opcode2[i]==0x2a) // SLT
2072               emit_shrimm(s1h,31,t);
2073             else // SLTU (unsigned can not be less than zero)
2074               emit_zeroreg(t);
2075           }
2076           else if(rs1[i]==0) // r0<rx
2077           {
2078             assert(s2h>=0);
2079             if(opcode2[i]==0x2a) // SLT
2080               emit_set_gz64_32(s2h,s2l,t);
2081             else // SLTU (set if not zero)
2082               emit_set_nz64_32(s2h,s2l,t);
2083           }
2084           else {
2085             assert(s1l>=0);assert(s1h>=0);
2086             assert(s2l>=0);assert(s2h>=0);
2087             if(opcode2[i]==0x2a) // SLT
2088               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2089             else // SLTU
2090               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2091           }
2092         }
2093       } else {
2094         t=get_reg(i_regs->regmap,rt1[i]);
2095         //assert(t>=0);
2096         if(t>=0) {
2097           s1l=get_reg(i_regs->regmap,rs1[i]);
2098           s2l=get_reg(i_regs->regmap,rs2[i]);
2099           if(rs2[i]==0) // rx<r0
2100           {
2101             assert(s1l>=0);
2102             if(opcode2[i]==0x2a) // SLT
2103               emit_shrimm(s1l,31,t);
2104             else // SLTU (unsigned can not be less than zero)
2105               emit_zeroreg(t);
2106           }
2107           else if(rs1[i]==0) // r0<rx
2108           {
2109             assert(s2l>=0);
2110             if(opcode2[i]==0x2a) // SLT
2111               emit_set_gz32(s2l,t);
2112             else // SLTU (set if not zero)
2113               emit_set_nz32(s2l,t);
2114           }
2115           else{
2116             assert(s1l>=0);assert(s2l>=0);
2117             if(opcode2[i]==0x2a) // SLT
2118               emit_set_if_less32(s1l,s2l,t);
2119             else // SLTU
2120               emit_set_if_carry32(s1l,s2l,t);
2121           }
2122         }
2123       }
2124     }
2125   }
2126   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2127     if(rt1[i]) {
2128       signed char s1l,s1h,s2l,s2h,th,tl;
2129       tl=get_reg(i_regs->regmap,rt1[i]);
2130       th=get_reg(i_regs->regmap,rt1[i]|64);
2131       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2132       {
2133         assert(tl>=0);
2134         if(tl>=0) {
2135           s1l=get_reg(i_regs->regmap,rs1[i]);
2136           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2137           s2l=get_reg(i_regs->regmap,rs2[i]);
2138           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2139           if(rs1[i]&&rs2[i]) {
2140             assert(s1l>=0);assert(s1h>=0);
2141             assert(s2l>=0);assert(s2h>=0);
2142             if(opcode2[i]==0x24) { // AND
2143               emit_and(s1l,s2l,tl);
2144               emit_and(s1h,s2h,th);
2145             } else
2146             if(opcode2[i]==0x25) { // OR
2147               emit_or(s1l,s2l,tl);
2148               emit_or(s1h,s2h,th);
2149             } else
2150             if(opcode2[i]==0x26) { // XOR
2151               emit_xor(s1l,s2l,tl);
2152               emit_xor(s1h,s2h,th);
2153             } else
2154             if(opcode2[i]==0x27) { // NOR
2155               emit_or(s1l,s2l,tl);
2156               emit_or(s1h,s2h,th);
2157               emit_not(tl,tl);
2158               emit_not(th,th);
2159             }
2160           }
2161           else
2162           {
2163             if(opcode2[i]==0x24) { // AND
2164               emit_zeroreg(tl);
2165               emit_zeroreg(th);
2166             } else
2167             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2168               if(rs1[i]){
2169                 if(s1l>=0) emit_mov(s1l,tl);
2170                 else emit_loadreg(rs1[i],tl);
2171                 if(s1h>=0) emit_mov(s1h,th);
2172                 else emit_loadreg(rs1[i]|64,th);
2173               }
2174               else
2175               if(rs2[i]){
2176                 if(s2l>=0) emit_mov(s2l,tl);
2177                 else emit_loadreg(rs2[i],tl);
2178                 if(s2h>=0) emit_mov(s2h,th);
2179                 else emit_loadreg(rs2[i]|64,th);
2180               }
2181               else{
2182                 emit_zeroreg(tl);
2183                 emit_zeroreg(th);
2184               }
2185             } else
2186             if(opcode2[i]==0x27) { // NOR
2187               if(rs1[i]){
2188                 if(s1l>=0) emit_not(s1l,tl);
2189                 else{
2190                   emit_loadreg(rs1[i],tl);
2191                   emit_not(tl,tl);
2192                 }
2193                 if(s1h>=0) emit_not(s1h,th);
2194                 else{
2195                   emit_loadreg(rs1[i]|64,th);
2196                   emit_not(th,th);
2197                 }
2198               }
2199               else
2200               if(rs2[i]){
2201                 if(s2l>=0) emit_not(s2l,tl);
2202                 else{
2203                   emit_loadreg(rs2[i],tl);
2204                   emit_not(tl,tl);
2205                 }
2206                 if(s2h>=0) emit_not(s2h,th);
2207                 else{
2208                   emit_loadreg(rs2[i]|64,th);
2209                   emit_not(th,th);
2210                 }
2211               }
2212               else {
2213                 emit_movimm(-1,tl);
2214                 emit_movimm(-1,th);
2215               }
2216             }
2217           }
2218         }
2219       }
2220       else
2221       {
2222         // 32 bit
2223         if(tl>=0) {
2224           s1l=get_reg(i_regs->regmap,rs1[i]);
2225           s2l=get_reg(i_regs->regmap,rs2[i]);
2226           if(rs1[i]&&rs2[i]) {
2227             assert(s1l>=0);
2228             assert(s2l>=0);
2229             if(opcode2[i]==0x24) { // AND
2230               emit_and(s1l,s2l,tl);
2231             } else
2232             if(opcode2[i]==0x25) { // OR
2233               emit_or(s1l,s2l,tl);
2234             } else
2235             if(opcode2[i]==0x26) { // XOR
2236               emit_xor(s1l,s2l,tl);
2237             } else
2238             if(opcode2[i]==0x27) { // NOR
2239               emit_or(s1l,s2l,tl);
2240               emit_not(tl,tl);
2241             }
2242           }
2243           else
2244           {
2245             if(opcode2[i]==0x24) { // AND
2246               emit_zeroreg(tl);
2247             } else
2248             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2249               if(rs1[i]){
2250                 if(s1l>=0) emit_mov(s1l,tl);
2251                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2252               }
2253               else
2254               if(rs2[i]){
2255                 if(s2l>=0) emit_mov(s2l,tl);
2256                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2257               }
2258               else emit_zeroreg(tl);
2259             } else
2260             if(opcode2[i]==0x27) { // NOR
2261               if(rs1[i]){
2262                 if(s1l>=0) emit_not(s1l,tl);
2263                 else {
2264                   emit_loadreg(rs1[i],tl);
2265                   emit_not(tl,tl);
2266                 }
2267               }
2268               else
2269               if(rs2[i]){
2270                 if(s2l>=0) emit_not(s2l,tl);
2271                 else {
2272                   emit_loadreg(rs2[i],tl);
2273                   emit_not(tl,tl);
2274                 }
2275               }
2276               else emit_movimm(-1,tl);
2277             }
2278           }
2279         }
2280       }
2281     }
2282   }
2283 }
2284
2285 void imm16_assemble(int i,struct regstat *i_regs)
2286 {
2287   if (opcode[i]==0x0f) { // LUI
2288     if(rt1[i]) {
2289       signed char t;
2290       t=get_reg(i_regs->regmap,rt1[i]);
2291       //assert(t>=0);
2292       if(t>=0) {
2293         if(!((i_regs->isconst>>t)&1))
2294           emit_movimm(imm[i]<<16,t);
2295       }
2296     }
2297   }
2298   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2299     if(rt1[i]) {
2300       signed char s,t;
2301       t=get_reg(i_regs->regmap,rt1[i]);
2302       s=get_reg(i_regs->regmap,rs1[i]);
2303       if(rs1[i]) {
2304         //assert(t>=0);
2305         //assert(s>=0);
2306         if(t>=0) {
2307           if(!((i_regs->isconst>>t)&1)) {
2308             if(s<0) {
2309               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2310               emit_addimm(t,imm[i],t);
2311             }else{
2312               if(!((i_regs->wasconst>>s)&1))
2313                 emit_addimm(s,imm[i],t);
2314               else
2315                 emit_movimm(constmap[i][s]+imm[i],t);
2316             }
2317           }
2318         }
2319       } else {
2320         if(t>=0) {
2321           if(!((i_regs->isconst>>t)&1))
2322             emit_movimm(imm[i],t);
2323         }
2324       }
2325     }
2326   }
2327   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2328     if(rt1[i]) {
2329       signed char sh,sl,th,tl;
2330       th=get_reg(i_regs->regmap,rt1[i]|64);
2331       tl=get_reg(i_regs->regmap,rt1[i]);
2332       sh=get_reg(i_regs->regmap,rs1[i]|64);
2333       sl=get_reg(i_regs->regmap,rs1[i]);
2334       if(tl>=0) {
2335         if(rs1[i]) {
2336           assert(sh>=0);
2337           assert(sl>=0);
2338           if(th>=0) {
2339             emit_addimm64_32(sh,sl,imm[i],th,tl);
2340           }
2341           else {
2342             emit_addimm(sl,imm[i],tl);
2343           }
2344         } else {
2345           emit_movimm(imm[i],tl);
2346           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2347         }
2348       }
2349     }
2350   }
2351   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2352     if(rt1[i]) {
2353       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2354       signed char sh,sl,t;
2355       t=get_reg(i_regs->regmap,rt1[i]);
2356       sh=get_reg(i_regs->regmap,rs1[i]|64);
2357       sl=get_reg(i_regs->regmap,rs1[i]);
2358       //assert(t>=0);
2359       if(t>=0) {
2360         if(rs1[i]>0) {
2361           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2362           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2363             if(opcode[i]==0x0a) { // SLTI
2364               if(sl<0) {
2365                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2366                 emit_slti32(t,imm[i],t);
2367               }else{
2368                 emit_slti32(sl,imm[i],t);
2369               }
2370             }
2371             else { // SLTIU
2372               if(sl<0) {
2373                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2374                 emit_sltiu32(t,imm[i],t);
2375               }else{
2376                 emit_sltiu32(sl,imm[i],t);
2377               }
2378             }
2379           }else{ // 64-bit
2380             assert(sl>=0);
2381             if(opcode[i]==0x0a) // SLTI
2382               emit_slti64_32(sh,sl,imm[i],t);
2383             else // SLTIU
2384               emit_sltiu64_32(sh,sl,imm[i],t);
2385           }
2386         }else{
2387           // SLTI(U) with r0 is just stupid,
2388           // nonetheless examples can be found
2389           if(opcode[i]==0x0a) // SLTI
2390             if(0<imm[i]) emit_movimm(1,t);
2391             else emit_zeroreg(t);
2392           else // SLTIU
2393           {
2394             if(imm[i]) emit_movimm(1,t);
2395             else emit_zeroreg(t);
2396           }
2397         }
2398       }
2399     }
2400   }
2401   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2402     if(rt1[i]) {
2403       signed char sh,sl,th,tl;
2404       th=get_reg(i_regs->regmap,rt1[i]|64);
2405       tl=get_reg(i_regs->regmap,rt1[i]);
2406       sh=get_reg(i_regs->regmap,rs1[i]|64);
2407       sl=get_reg(i_regs->regmap,rs1[i]);
2408       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2409         if(opcode[i]==0x0c) //ANDI
2410         {
2411           if(rs1[i]) {
2412             if(sl<0) {
2413               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2414               emit_andimm(tl,imm[i],tl);
2415             }else{
2416               if(!((i_regs->wasconst>>sl)&1))
2417                 emit_andimm(sl,imm[i],tl);
2418               else
2419                 emit_movimm(constmap[i][sl]&imm[i],tl);
2420             }
2421           }
2422           else
2423             emit_zeroreg(tl);
2424           if(th>=0) emit_zeroreg(th);
2425         }
2426         else
2427         {
2428           if(rs1[i]) {
2429             if(sl<0) {
2430               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2431             }
2432             if(th>=0) {
2433               if(sh<0) {
2434                 emit_loadreg(rs1[i]|64,th);
2435               }else{
2436                 emit_mov(sh,th);
2437               }
2438             }
2439             if(opcode[i]==0x0d) { // ORI
2440               if(sl<0) {
2441                 emit_orimm(tl,imm[i],tl);
2442               }else{
2443                 if(!((i_regs->wasconst>>sl)&1))
2444                   emit_orimm(sl,imm[i],tl);
2445                 else
2446                   emit_movimm(constmap[i][sl]|imm[i],tl);
2447               }
2448             }
2449             if(opcode[i]==0x0e) { // XORI
2450               if(sl<0) {
2451                 emit_xorimm(tl,imm[i],tl);
2452               }else{
2453                 if(!((i_regs->wasconst>>sl)&1))
2454                   emit_xorimm(sl,imm[i],tl);
2455                 else
2456                   emit_movimm(constmap[i][sl]^imm[i],tl);
2457               }
2458             }
2459           }
2460           else {
2461             emit_movimm(imm[i],tl);
2462             if(th>=0) emit_zeroreg(th);
2463           }
2464         }
2465       }
2466     }
2467   }
2468 }
2469
2470 void shiftimm_assemble(int i,struct regstat *i_regs)
2471 {
2472   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2473   {
2474     if(rt1[i]) {
2475       signed char s,t;
2476       t=get_reg(i_regs->regmap,rt1[i]);
2477       s=get_reg(i_regs->regmap,rs1[i]);
2478       //assert(t>=0);
2479       if(t>=0&&!((i_regs->isconst>>t)&1)){
2480         if(rs1[i]==0)
2481         {
2482           emit_zeroreg(t);
2483         }
2484         else
2485         {
2486           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2487           if(imm[i]) {
2488             if(opcode2[i]==0) // SLL
2489             {
2490               emit_shlimm(s<0?t:s,imm[i],t);
2491             }
2492             if(opcode2[i]==2) // SRL
2493             {
2494               emit_shrimm(s<0?t:s,imm[i],t);
2495             }
2496             if(opcode2[i]==3) // SRA
2497             {
2498               emit_sarimm(s<0?t:s,imm[i],t);
2499             }
2500           }else{
2501             // Shift by zero
2502             if(s>=0 && s!=t) emit_mov(s,t);
2503           }
2504         }
2505       }
2506       //emit_storereg(rt1[i],t); //DEBUG
2507     }
2508   }
2509   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2510   {
2511     if(rt1[i]) {
2512       signed char sh,sl,th,tl;
2513       th=get_reg(i_regs->regmap,rt1[i]|64);
2514       tl=get_reg(i_regs->regmap,rt1[i]);
2515       sh=get_reg(i_regs->regmap,rs1[i]|64);
2516       sl=get_reg(i_regs->regmap,rs1[i]);
2517       if(tl>=0) {
2518         if(rs1[i]==0)
2519         {
2520           emit_zeroreg(tl);
2521           if(th>=0) emit_zeroreg(th);
2522         }
2523         else
2524         {
2525           assert(sl>=0);
2526           assert(sh>=0);
2527           if(imm[i]) {
2528             if(opcode2[i]==0x38) // DSLL
2529             {
2530               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2531               emit_shlimm(sl,imm[i],tl);
2532             }
2533             if(opcode2[i]==0x3a) // DSRL
2534             {
2535               emit_shrdimm(sl,sh,imm[i],tl);
2536               if(th>=0) emit_shrimm(sh,imm[i],th);
2537             }
2538             if(opcode2[i]==0x3b) // DSRA
2539             {
2540               emit_shrdimm(sl,sh,imm[i],tl);
2541               if(th>=0) emit_sarimm(sh,imm[i],th);
2542             }
2543           }else{
2544             // Shift by zero
2545             if(sl!=tl) emit_mov(sl,tl);
2546             if(th>=0&&sh!=th) emit_mov(sh,th);
2547           }
2548         }
2549       }
2550     }
2551   }
2552   if(opcode2[i]==0x3c) // DSLL32
2553   {
2554     if(rt1[i]) {
2555       signed char sl,tl,th;
2556       tl=get_reg(i_regs->regmap,rt1[i]);
2557       th=get_reg(i_regs->regmap,rt1[i]|64);
2558       sl=get_reg(i_regs->regmap,rs1[i]);
2559       if(th>=0||tl>=0){
2560         assert(tl>=0);
2561         assert(th>=0);
2562         assert(sl>=0);
2563         emit_mov(sl,th);
2564         emit_zeroreg(tl);
2565         if(imm[i]>32)
2566         {
2567           emit_shlimm(th,imm[i]&31,th);
2568         }
2569       }
2570     }
2571   }
2572   if(opcode2[i]==0x3e) // DSRL32
2573   {
2574     if(rt1[i]) {
2575       signed char sh,tl,th;
2576       tl=get_reg(i_regs->regmap,rt1[i]);
2577       th=get_reg(i_regs->regmap,rt1[i]|64);
2578       sh=get_reg(i_regs->regmap,rs1[i]|64);
2579       if(tl>=0){
2580         assert(sh>=0);
2581         emit_mov(sh,tl);
2582         if(th>=0) emit_zeroreg(th);
2583         if(imm[i]>32)
2584         {
2585           emit_shrimm(tl,imm[i]&31,tl);
2586         }
2587       }
2588     }
2589   }
2590   if(opcode2[i]==0x3f) // DSRA32
2591   {
2592     if(rt1[i]) {
2593       signed char sh,tl;
2594       tl=get_reg(i_regs->regmap,rt1[i]);
2595       sh=get_reg(i_regs->regmap,rs1[i]|64);
2596       if(tl>=0){
2597         assert(sh>=0);
2598         emit_mov(sh,tl);
2599         if(imm[i]>32)
2600         {
2601           emit_sarimm(tl,imm[i]&31,tl);
2602         }
2603       }
2604     }
2605   }
2606 }
2607
#ifndef shift_assemble
/*
 * Fallback used when shift_assemble has not been defined as a macro
 * before this point: report the missing implementation on stdout and
 * terminate the process with exit status 1.  Both arguments are ignored.
 */
void shift_assemble(int i,struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;
  fputs("Need shift_assemble for this architecture.\n",stdout);
  exit(1);
}
#endif
2615
2616 void load_assemble(int i,struct regstat *i_regs)
2617 {
2618   int s,th,tl,addr,map=-1;
2619   int offset;
2620   int jaddr=0;
2621   int memtarget=0,c=0;
2622   int fastload_reg_override=0;
2623   u_int hr,reglist=0;
2624   th=get_reg(i_regs->regmap,rt1[i]|64);
2625   tl=get_reg(i_regs->regmap,rt1[i]);
2626   s=get_reg(i_regs->regmap,rs1[i]);
2627   offset=imm[i];
2628   for(hr=0;hr<HOST_REGS;hr++) {
2629     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2630   }
2631   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2632   if(s>=0) {
2633     c=(i_regs->wasconst>>s)&1;
2634     if (c) {
2635       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2636     }
2637   }
2638   //printf("load_assemble: c=%d\n",c);
2639   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2640   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2641   if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
2642     ||rt1[i]==0) {
2643       // could be FIFO, must perform the read
2644       // ||dummy read
2645       assem_debug("(forced read)\n");
2646       tl=get_reg(i_regs->regmap,-1);
2647       assert(tl>=0);
2648   }
2649   if(offset||s<0||c) addr=tl;
2650   else addr=s;
2651   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2652  if(tl>=0) {
2653   //printf("load_assemble: c=%d\n",c);
2654   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2655   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2656   reglist&=~(1<<tl);
2657   if(th>=0) reglist&=~(1<<th);
2658   if(!c) {
2659     #ifdef RAM_OFFSET
2660     map=get_reg(i_regs->regmap,ROREG);
2661     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2662     #endif
2663     #ifdef R29_HACK
2664     // Strmnnrmn's speed hack
2665     if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2666     #endif
2667     {
2668       jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
2669     }
2670   }
2671   else if(ram_offset&&memtarget) {
2672     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2673     fastload_reg_override=HOST_TEMPREG;
2674   }
2675   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2676   if (opcode[i]==0x20) { // LB
2677     if(!c||memtarget) {
2678       if(!dummy) {
2679         #ifdef HOST_IMM_ADDR32
2680         if(c)
2681           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2682         else
2683         #endif
2684         {
2685           //emit_xorimm(addr,3,tl);
2686           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2687           int x=0,a=tl;
2688 #ifdef BIG_ENDIAN_MIPS
2689           if(!c) emit_xorimm(addr,3,tl);
2690           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2691 #else
2692           if(!c) a=addr;
2693 #endif
2694           if(fastload_reg_override) a=fastload_reg_override;
2695
2696           emit_movsbl_indexed_tlb(x,a,map,tl);
2697         }
2698       }
2699       if(jaddr)
2700         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2701     }
2702     else
2703       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2704   }
2705   if (opcode[i]==0x21) { // LH
2706     if(!c||memtarget) {
2707       if(!dummy) {
2708         #ifdef HOST_IMM_ADDR32
2709         if(c)
2710           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2711         else
2712         #endif
2713         {
2714           int x=0,a=tl;
2715 #ifdef BIG_ENDIAN_MIPS
2716           if(!c) emit_xorimm(addr,2,tl);
2717           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2718 #else
2719           if(!c) a=addr;
2720 #endif
2721           if(fastload_reg_override) a=fastload_reg_override;
2722           //#ifdef
2723           //emit_movswl_indexed_tlb(x,tl,map,tl);
2724           //else
2725           if(map>=0) {
2726             emit_movswl_indexed(x,a,tl);
2727           }else{
2728             #if 1 //def RAM_OFFSET
2729             emit_movswl_indexed(x,a,tl);
2730             #else
2731             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2732             #endif
2733           }
2734         }
2735       }
2736       if(jaddr)
2737         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2738     }
2739     else
2740       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2741   }
2742   if (opcode[i]==0x23) { // LW
2743     if(!c||memtarget) {
2744       if(!dummy) {
2745         int a=addr;
2746         if(fastload_reg_override) a=fastload_reg_override;
2747         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2748         #ifdef HOST_IMM_ADDR32
2749         if(c)
2750           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2751         else
2752         #endif
2753         emit_readword_indexed_tlb(0,a,map,tl);
2754       }
2755       if(jaddr)
2756         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2757     }
2758     else
2759       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2760   }
2761   if (opcode[i]==0x24) { // LBU
2762     if(!c||memtarget) {
2763       if(!dummy) {
2764         #ifdef HOST_IMM_ADDR32
2765         if(c)
2766           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2767         else
2768         #endif
2769         {
2770           //emit_xorimm(addr,3,tl);
2771           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2772           int x=0,a=tl;
2773 #ifdef BIG_ENDIAN_MIPS
2774           if(!c) emit_xorimm(addr,3,tl);
2775           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2776 #else
2777           if(!c) a=addr;
2778 #endif
2779           if(fastload_reg_override) a=fastload_reg_override;
2780
2781           emit_movzbl_indexed_tlb(x,a,map,tl);
2782         }
2783       }
2784       if(jaddr)
2785         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2786     }
2787     else
2788       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2789   }
2790   if (opcode[i]==0x25) { // LHU
2791     if(!c||memtarget) {
2792       if(!dummy) {
2793         #ifdef HOST_IMM_ADDR32
2794         if(c)
2795           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2796         else
2797         #endif
2798         {
2799           int x=0,a=tl;
2800 #ifdef BIG_ENDIAN_MIPS
2801           if(!c) emit_xorimm(addr,2,tl);
2802           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2803 #else
2804           if(!c) a=addr;
2805 #endif
2806           if(fastload_reg_override) a=fastload_reg_override;
2807           //#ifdef
2808           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2809           //#else
2810           if(map>=0) {
2811             emit_movzwl_indexed(x,a,tl);
2812           }else{
2813             #if 1 //def RAM_OFFSET
2814             emit_movzwl_indexed(x,a,tl);
2815             #else
2816             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
2817             #endif
2818           }
2819         }
2820       }
2821       if(jaddr)
2822         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2823     }
2824     else
2825       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2826   }
2827   if (opcode[i]==0x27) { // LWU
2828     assert(th>=0);
2829     if(!c||memtarget) {
2830       if(!dummy) {
2831         int a=addr;
2832         if(fastload_reg_override) a=fastload_reg_override;
2833         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2834         #ifdef HOST_IMM_ADDR32
2835         if(c)
2836           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2837         else
2838         #endif
2839         emit_readword_indexed_tlb(0,a,map,tl);
2840       }
2841       if(jaddr)
2842         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2843     }
2844     else {
2845       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2846     }
2847     emit_zeroreg(th);
2848   }
2849   if (opcode[i]==0x37) { // LD
2850     if(!c||memtarget) {
2851       if(!dummy) {
2852         int a=addr;
2853         if(fastload_reg_override) a=fastload_reg_override;
2854         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
2855         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
2856         #ifdef HOST_IMM_ADDR32
2857         if(c)
2858           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
2859         else
2860         #endif
2861         emit_readdword_indexed_tlb(0,a,map,th,tl);
2862       }
2863       if(jaddr)
2864         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2865     }
2866     else
2867       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2868   }
2869  }
2870   //emit_storereg(rt1[i],tl); // DEBUG
2871   //if(opcode[i]==0x23)
2872   //if(opcode[i]==0x24)
2873   //if(opcode[i]==0x23||opcode[i]==0x24)
2874   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
2875   {
2876     //emit_pusha();
2877     save_regs(0x100f);
2878         emit_readword((int)&last_count,ECX);
2879         #ifdef __i386__
2880         if(get_reg(i_regs->regmap,CCREG)<0)
2881           emit_loadreg(CCREG,HOST_CCREG);
2882         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2883         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2884         emit_writeword(HOST_CCREG,(int)&Count);
2885         #endif
2886         #ifdef __arm__
2887         if(get_reg(i_regs->regmap,CCREG)<0)
2888           emit_loadreg(CCREG,0);
2889         else
2890           emit_mov(HOST_CCREG,0);
2891         emit_add(0,ECX,0);
2892         emit_addimm(0,2*ccadj[i],0);
2893         emit_writeword(0,(int)&Count);
2894         #endif
2895     emit_call((int)memdebug);
2896     //emit_popa();
2897     restore_regs(0x100f);
2898   }*/
2899 }
2900
#ifndef loadlr_assemble
/*
 * Fallback used when loadlr_assemble has not been defined as a macro
 * before this point: report the missing implementation on stdout and
 * terminate the process with exit status 1.  Both arguments are ignored.
 */
void loadlr_assemble(int i,struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;
  fputs("Need loadlr_assemble for this architecture.\n",stdout);
  exit(1);
}
#endif
2908
2909 void store_assemble(int i,struct regstat *i_regs)
2910 {
2911   int s,th,tl,map=-1;
2912   int addr,temp;
2913   int offset;
2914   int jaddr=0,type;
2915   int memtarget=0,c=0;
2916   int agr=AGEN1+(i&1);
2917   int faststore_reg_override=0;
2918   u_int hr,reglist=0;
2919   th=get_reg(i_regs->regmap,rs2[i]|64);
2920   tl=get_reg(i_regs->regmap,rs2[i]);
2921   s=get_reg(i_regs->regmap,rs1[i]);
2922   temp=get_reg(i_regs->regmap,agr);
2923   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2924   offset=imm[i];
2925   if(s>=0) {
2926     c=(i_regs->wasconst>>s)&1;
2927     if(c) {
2928       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2929     }
2930   }
2931   assert(tl>=0);
2932   assert(temp>=0);
2933   for(hr=0;hr<HOST_REGS;hr++) {
2934     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2935   }
2936   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2937   if(offset||s<0||c) addr=temp;
2938   else addr=s;
2939   if(!c) {
2940     jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
2941   }
2942   else if(ram_offset&&memtarget) {
2943     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2944     faststore_reg_override=HOST_TEMPREG;
2945   }
2946
2947   if (opcode[i]==0x28) { // SB
2948     if(!c||memtarget) {
2949       int x=0,a=temp;
2950 #ifdef BIG_ENDIAN_MIPS
2951       if(!c) emit_xorimm(addr,3,temp);
2952       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2953 #else
2954       if(!c) a=addr;
2955 #endif
2956       if(faststore_reg_override) a=faststore_reg_override;
2957       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
2958       emit_writebyte_indexed_tlb(tl,x,a,map,a);
2959     }
2960     type=STOREB_STUB;
2961   }
2962   if (opcode[i]==0x29) { // SH
2963     if(!c||memtarget) {
2964       int x=0,a=temp;
2965 #ifdef BIG_ENDIAN_MIPS
2966       if(!c) emit_xorimm(addr,2,temp);
2967       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2968 #else
2969       if(!c) a=addr;
2970 #endif
2971       if(faststore_reg_override) a=faststore_reg_override;
2972       //#ifdef
2973       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
2974       //#else
2975       if(map>=0) {
2976         emit_writehword_indexed(tl,x,a);
2977       }else
2978         //emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
2979         emit_writehword_indexed(tl,x,a);
2980     }
2981     type=STOREH_STUB;
2982   }
2983   if (opcode[i]==0x2B) { // SW
2984     if(!c||memtarget) {
2985       int a=addr;
2986       if(faststore_reg_override) a=faststore_reg_override;
2987       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
2988       emit_writeword_indexed_tlb(tl,0,a,map,temp);
2989     }
2990     type=STOREW_STUB;
2991   }
2992   if (opcode[i]==0x3F) { // SD
2993     if(!c||memtarget) {
2994       int a=addr;
2995       if(faststore_reg_override) a=faststore_reg_override;
2996       if(rs2[i]) {
2997         assert(th>=0);
2998         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
2999         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3000         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
3001       }else{
3002         // Store zero
3003         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3004         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3005         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
3006       }
3007     }
3008     type=STORED_STUB;
3009   }
3010   if(jaddr) {
3011     // PCSX store handlers don't check invcode again
3012     reglist|=1<<addr;
3013     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3014     jaddr=0;
3015   }
3016   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3017     if(!c||memtarget) {
3018       #ifdef DESTRUCTIVE_SHIFT
3019       // The x86 shift operation is 'destructive'; it overwrites the
3020       // source register, so we need to make a copy first and use that.
3021       addr=temp;
3022       #endif
3023       #if defined(HOST_IMM8)
3024       int ir=get_reg(i_regs->regmap,INVCP);
3025       assert(ir>=0);
3026       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3027       #else
3028       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3029       #endif
3030       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3031       emit_callne(invalidate_addr_reg[addr]);
3032       #else
3033       int jaddr2=(int)out;
3034       emit_jne(0);
3035       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3036       #endif
3037     }
3038   }
3039   u_int addr_val=constmap[i][s]+offset;
3040   if(jaddr) {
3041     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3042   } else if(c&&!memtarget) {
3043     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
3044   }
3045   // basic current block modification detection..
3046   // not looking back as that should be in mips cache already
3047   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
3048     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
3049     assert(i_regs->regmap==regs[i].regmap); // not delay slot
3050     if(i_regs->regmap==regs[i].regmap) {
3051       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
3052       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
3053       emit_movimm(start+i*4+4,0);
3054       emit_writeword(0,(int)&pcaddr);
3055       emit_jmp((int)do_interrupt);
3056     }
3057   }
3058   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3059   //if(opcode[i]==0x2B || opcode[i]==0x28)
3060   //if(opcode[i]==0x2B || opcode[i]==0x29)
3061   //if(opcode[i]==0x2B)
3062   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3063   {
3064     #ifdef __i386__
3065     emit_pusha();
3066     #endif
3067     #ifdef __arm__
3068     save_regs(0x100f);
3069     #endif
3070         emit_readword((int)&last_count,ECX);
3071         #ifdef __i386__
3072         if(get_reg(i_regs->regmap,CCREG)<0)
3073           emit_loadreg(CCREG,HOST_CCREG);
3074         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3075         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3076         emit_writeword(HOST_CCREG,(int)&Count);
3077         #endif
3078         #ifdef __arm__
3079         if(get_reg(i_regs->regmap,CCREG)<0)
3080           emit_loadreg(CCREG,0);
3081         else
3082           emit_mov(HOST_CCREG,0);
3083         emit_add(0,ECX,0);
3084         emit_addimm(0,2*ccadj[i],0);
3085         emit_writeword(0,(int)&Count);
3086         #endif
3087     emit_call((int)memdebug);
3088     #ifdef __i386__
3089     emit_popa();
3090     #endif
3091     #ifdef __arm__
3092     restore_regs(0x100f);
3093     #endif
3094   }*/
3095 }
3096
// Emit host code for an unaligned store at instruction index i, using the
// register state in i_regs.  Opcodes handled: 0x2A (SWL), 0x2E (SWR),
// 0x2C (SDL) and 0x2D (SDR).
//
// The generated code tests bits 1 and 0 of the address held in `temp` and
// branches to one of four cases (labelled 0..3 below), each of which emits
// the byte/halfword/word writes for that alignment.  For SDL/SDR a second
// word taken from temp2 is then stored conditionally on address bit 2.
// A STORELR_STUB is registered for the slow path, followed by the
// invalid_code check shared with the other store emitters.
3097 void storelr_assemble(int i,struct regstat *i_regs)
3098 {
3099   int s,th,tl;
3100   int temp;
3101   int temp2=-1;
3102   int offset;
3103   int jaddr=0;
3104   int case1,case2,case3;
3105   int done0,done1,done2;
3106   int memtarget=0,c=0;
3107   int agr=AGEN1+(i&1);
3108   u_int hr,reglist=0;
3109   th=get_reg(i_regs->regmap,rs2[i]|64);
3110   tl=get_reg(i_regs->regmap,rs2[i]);
3111   s=get_reg(i_regs->regmap,rs1[i]);
3112   temp=get_reg(i_regs->regmap,agr);
3113   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3114   offset=imm[i];
  // c: the base register's host register is flagged in isconst;
  // memtarget: the resulting constant address compares below
  // 0x80000000+RAM_SIZE as a signed value.
3115   if(s>=0) {
3116     c=(i_regs->isconst>>s)&1;
3117     if(c) {
3118       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3119     }
3120   }
3121   assert(tl>=0);
  // Collect every host register that currently holds a mapping.
3122   for(hr=0;hr<HOST_REGS;hr++) {
3123     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3124   }
3125   assert(temp>=0);
  // Non-constant address: emit a compare against RAM_SIZE and a conditional
  // jump; jaddr remembers where that jump was emitted so the STORELR_STUB
  // registered further down can refer to it.
3126   if(!c) {
3127     emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3128     if(!offset&&s!=temp) emit_mov(s,temp);
3129     jaddr=(int)out;
3130     emit_jno(0);
3131   }
3132   else
3133   {
    // Constant address that is not a memory target (or base is r0):
    // emit an unconditional jump and record its location in jaddr.
3134     if(!memtarget||!rs1[i]) {
3135       jaddr=(int)out;
3136       emit_jmp(0);
3137     }
3138   }
3139   #ifdef RAM_OFFSET
3140   int map=get_reg(i_regs->regmap,ROREG);
3141   if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3142   #else
  // Rebase the guest address in temp onto the host rdram buffer when
  // rdram is not located at 0x80000000.
3143   if((u_int)rdram!=0x80000000)
3144     emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3145   #endif
3146
3147   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3148     temp2=get_reg(i_regs->regmap,FTEMP);
3149     if(!rs2[i]) temp2=th=tl;
3150   }
3151
3152 #ifndef BIG_ENDIAN_MIPS
3153     emit_xorimm(temp,3,temp);
3154 #endif
  // Dispatch on address bits 1 and 0; case1/case2/case3 hold the locations
  // of the conditional jumps that are patched once each case is emitted.
3155   emit_testimm(temp,2);
3156   case2=(int)out;
3157   emit_jne(0);
3158   emit_testimm(temp,1);
3159   case1=(int)out;
3160   emit_jne(0);
3161   // 0
3162   if (opcode[i]==0x2A) { // SWL
3163     emit_writeword_indexed(tl,0,temp);
3164   }
3165   if (opcode[i]==0x2E) { // SWR
3166     emit_writebyte_indexed(tl,3,temp);
3167   }
3168   if (opcode[i]==0x2C) { // SDL
3169     emit_writeword_indexed(th,0,temp);
3170     if(rs2[i]) emit_mov(tl,temp2);
3171   }
3172   if (opcode[i]==0x2D) { // SDR
3173     emit_writebyte_indexed(tl,3,temp);
3174     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3175   }
3176   done0=(int)out;
3177   emit_jmp(0);
3178   // 1
3179   set_jump_target(case1,(int)out);
3180   if (opcode[i]==0x2A) { // SWL
3181     // Write 3 msb into three least significant bytes
    // The three rotates add up to 32 bits, so tl ends up unchanged.
3182     if(rs2[i]) emit_rorimm(tl,8,tl);
3183     emit_writehword_indexed(tl,-1,temp);
3184     if(rs2[i]) emit_rorimm(tl,16,tl);
3185     emit_writebyte_indexed(tl,1,temp);
3186     if(rs2[i]) emit_rorimm(tl,8,tl);
3187   }
3188   if (opcode[i]==0x2E) { // SWR
3189     // Write two lsb into two most significant bytes
3190     emit_writehword_indexed(tl,1,temp);
3191   }
3192   if (opcode[i]==0x2C) { // SDL
3193     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3194     // Write 3 msb into three least significant bytes
3195     if(rs2[i]) emit_rorimm(th,8,th);
3196     emit_writehword_indexed(th,-1,temp);
3197     if(rs2[i]) emit_rorimm(th,16,th);
3198     emit_writebyte_indexed(th,1,temp);
3199     if(rs2[i]) emit_rorimm(th,8,th);
3200   }
3201   if (opcode[i]==0x2D) { // SDR
3202     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3203     // Write two lsb into two most significant bytes
3204     emit_writehword_indexed(tl,1,temp);
3205   }
3206   done1=(int)out;
3207   emit_jmp(0);
3208   // 2
3209   set_jump_target(case2,(int)out);
3210   emit_testimm(temp,1);
3211   case3=(int)out;
3212   emit_jne(0);
3213   if (opcode[i]==0x2A) { // SWL
3214     // Write two msb into two least significant bytes
3215     if(rs2[i]) emit_rorimm(tl,16,tl);
3216     emit_writehword_indexed(tl,-2,temp);
3217     if(rs2[i]) emit_rorimm(tl,16,tl);
3218   }
3219   if (opcode[i]==0x2E) { // SWR
3220     // Write 3 lsb into three most significant bytes
3221     emit_writebyte_indexed(tl,-1,temp);
3222     if(rs2[i]) emit_rorimm(tl,8,tl);
3223     emit_writehword_indexed(tl,0,temp);
3224     if(rs2[i]) emit_rorimm(tl,24,tl);
3225   }
3226   if (opcode[i]==0x2C) { // SDL
3227     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3228     // Write two msb into two least significant bytes
3229     if(rs2[i]) emit_rorimm(th,16,th);
3230     emit_writehword_indexed(th,-2,temp);
3231     if(rs2[i]) emit_rorimm(th,16,th);
3232   }
3233   if (opcode[i]==0x2D) { // SDR
3234     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3235     // Write 3 lsb into three most significant bytes
3236     emit_writebyte_indexed(tl,-1,temp);
3237     if(rs2[i]) emit_rorimm(tl,8,tl);
3238     emit_writehword_indexed(tl,0,temp);
3239     if(rs2[i]) emit_rorimm(tl,24,tl);
3240   }
3241   done2=(int)out;
3242   emit_jmp(0);
3243   // 3
3244   set_jump_target(case3,(int)out);
3245   if (opcode[i]==0x2A) { // SWL
3246     // Write msb into least significant byte
3247     if(rs2[i]) emit_rorimm(tl,24,tl);
3248     emit_writebyte_indexed(tl,-3,temp);
3249     if(rs2[i]) emit_rorimm(tl,8,tl);
3250   }
3251   if (opcode[i]==0x2E) { // SWR
3252     // Write entire word
3253     emit_writeword_indexed(tl,-3,temp);
3254   }
3255   if (opcode[i]==0x2C) { // SDL
3256     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3257     // Write msb into least significant byte
3258     if(rs2[i]) emit_rorimm(th,24,th);
3259     emit_writebyte_indexed(th,-3,temp);
3260     if(rs2[i]) emit_rorimm(th,8,th);
3261   }
3262   if (opcode[i]==0x2D) { // SDR
3263     if(rs2[i]) emit_mov(th,temp2);
3264     // Write entire word
3265     emit_writeword_indexed(tl,-3,temp);
3266   }
  // Cases 0, 1 and 2 jump here; case 3 falls through.
3267   set_jump_target(done0,(int)out);
3268   set_jump_target(done1,(int)out);
3269   set_jump_target(done2,(int)out);
  // SDL/SDR: store the second word from temp2 at the word-aligned address
  // +4 / -4, skipped depending on address bit 2.  done0 is reused here as
  // the location of the skip jump.
3270   if (opcode[i]==0x2C) { // SDL
3271     emit_testimm(temp,4);
3272     done0=(int)out;
3273     emit_jne(0);
3274     emit_andimm(temp,~3,temp);
3275     emit_writeword_indexed(temp2,4,temp);
3276     set_jump_target(done0,(int)out);
3277   }
3278   if (opcode[i]==0x2D) { // SDR
3279     emit_testimm(temp,4);
3280     done0=(int)out;
3281     emit_jeq(0);
3282     emit_andimm(temp,~3,temp);
3283     emit_writeword_indexed(temp2,-4,temp);
3284     set_jump_target(done0,(int)out);
3285   }
3286   if(!c||!memtarget)
3287     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
  // Code-invalidation check, skipped when rs1 is flagged in waswritten or
  // the NDHACK_NO_SMC_CHECK hack is enabled.  temp is first converted back
  // from a host address (undoing the rebasing done above).
3288   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3289     #ifdef RAM_OFFSET
3290     int map=get_reg(i_regs->regmap,ROREG);
3291     if(map<0) map=HOST_TEMPREG;
3292     gen_orig_addr_w(temp,map);
3293     #else
3294     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3295     #endif
3296     #if defined(HOST_IMM8)
3297     int ir=get_reg(i_regs->regmap,INVCP);
3298     assert(ir>=0);
3299     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3300     #else
3301     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3302     #endif
3303     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3304     emit_callne(invalidate_addr_reg[temp]);
3305     #else
3306     int jaddr2=(int)out;
3307     emit_jne(0);
3308     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3309     #endif
3310   }
3311   /*
3312     emit_pusha();
3313     //save_regs(0x100f);
3314         emit_readword((int)&last_count,ECX);
3315         if(get_reg(i_regs->regmap,CCREG)<0)
3316           emit_loadreg(CCREG,HOST_CCREG);
3317         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3318         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3319         emit_writeword(HOST_CCREG,(int)&Count);
3320     emit_call((int)memdebug);
3321     emit_popa();
3322     //restore_regs(0x100f);
3323   */
3324 }
3325
/*
 * Coprocessor 1 load/store at instruction index i.  No load or store is
 * emitted here; the instruction is handed straight to cop1_unusable().
 */
void c1ls_assemble(int i, struct regstat *i_regs)
{
  cop1_unusable(i, i_regs);
}
3330
3331 void c2ls_assemble(int i,struct regstat *i_regs)
3332 {
3333   int s,tl;
3334   int ar;
3335   int offset;
3336   int memtarget=0,c=0;
3337   int jaddr2=0,type;
3338   int agr=AGEN1+(i&1);
3339   int fastio_reg_override=0;
3340   u_int hr,reglist=0;
3341   u_int copr=(source[i]>>16)&0x1f;
3342   s=get_reg(i_regs->regmap,rs1[i]);
3343   tl=get_reg(i_regs->regmap,FTEMP);
3344   offset=imm[i];
3345   assert(rs1[i]>0);
3346   assert(tl>=0);
3347
3348   for(hr=0;hr<HOST_REGS;hr++) {
3349     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3350   }
3351   if(i_regs->regmap[HOST_CCREG]==CCREG)
3352     reglist&=~(1<<HOST_CCREG);
3353
3354   // get the address
3355   if (opcode[i]==0x3a) { // SWC2
3356     ar=get_reg(i_regs->regmap,agr);
3357     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3358     reglist|=1<<ar;
3359   } else { // LWC2
3360     ar=tl;
3361   }
3362   if(s>=0) c=(i_regs->wasconst>>s)&1;
3363   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3364   if (!offset&&!c&&s>=0) ar=s;
3365   assert(ar>=0);
3366
3367   if (opcode[i]==0x3a) { // SWC2
3368     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3369     type=STOREW_STUB;
3370   }
3371   else
3372     type=LOADW_STUB;
3373
3374   if(c&&!memtarget) {
3375     jaddr2=(int)out;
3376     emit_jmp(0); // inline_readstub/inline_writestub?
3377   }
3378   else {
3379     if(!c) {
3380       jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
3381     }
3382     else if(ram_offset&&memtarget) {
3383       emit_addimm(ar,ram_offset,HOST_TEMPREG);
3384       fastio_reg_override=HOST_TEMPREG;
3385     }
3386     if (opcode[i]==0x32) { // LWC2
3387       #ifdef HOST_IMM_ADDR32
3388       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3389       else
3390       #endif
3391       int a=ar;
3392       if(fastio_reg_override) a=fastio_reg_override;
3393       emit_readword_indexed(0,a,tl);
3394     }
3395     if (opcode[i]==0x3a) { // SWC2
3396       #ifdef DESTRUCTIVE_SHIFT
3397       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3398       #endif
3399       int a=ar;
3400       if(fastio_reg_override) a=fastio_reg_override;
3401       emit_writeword_indexed(tl,0,a);
3402     }
3403   }
3404   if(jaddr2)
3405     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3406   if(opcode[i]==0x3a) // SWC2
3407   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3408 #if defined(HOST_IMM8)
3409     int ir=get_reg(i_regs->regmap,INVCP);
3410     assert(ir>=0);
3411     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3412 #else
3413     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3414 #endif
3415     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3416     emit_callne(invalidate_addr_reg[ar]);
3417     #else
3418     int jaddr3=(int)out;
3419     emit_jne(0);
3420     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3421     #endif
3422   }
3423   if (opcode[i]==0x32) { // LWC2
3424     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3425   }
3426 }
3427
#ifndef multdiv_assemble
/*
 * Generic fallback for multdiv_assemble, compiled only when no macro named
 * multdiv_assemble is defined.  It reports on stdout that the routine is
 * missing for this architecture and terminates the process with status 1.
 * Both arguments are ignored.
 */
void multdiv_assemble(int i,struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;
  printf("Need multdiv_assemble for this architecture.\n");
  exit(1);
}
#endif
3435
3436 void mov_assemble(int i,struct regstat *i_regs)
3437 {
3438   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3439   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3440   if(rt1[i]) {
3441     signed char sh,sl,th,tl;
3442     th=get_reg(i_regs->regmap,rt1[i]|64);
3443     tl=get_reg(i_regs->regmap,rt1[i]);
3444     //assert(tl>=0);
3445     if(tl>=0) {
3446       sh=get_reg(i_regs->regmap,rs1[i]|64);
3447       sl=get_reg(i_regs->regmap,rs1[i]);
3448       if(sl>=0) emit_mov(sl,tl);
3449       else emit_loadreg(rs1[i],tl);
3450       if(th>=0) {
3451         if(sh>=0) emit_mov(sh,th);
3452         else emit_loadreg(rs1[i]|64,th);
3453       }
3454     }
3455   }
3456 }
3457
#ifndef fconv_assemble
/*
 * Generic fallback for fconv_assemble, compiled only when no macro named
 * fconv_assemble is defined.  It reports on stdout that the routine is
 * missing for this architecture and terminates the process with status 1.
 * Both arguments are ignored.
 */
void fconv_assemble(int i,struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;
  printf("Need fconv_assemble for this architecture.\n");
  exit(1);
}
#endif
3465
#if 0
/*
 * Disabled fallback for float_assemble (this whole definition is compiled
 * out).  ds_assemble still calls float_assemble, so a definition is
 * presumably provided elsewhere -- TODO confirm.
 */
void float_assemble(int i,struct regstat *i_regs)
{
  (void)i;
  (void)i_regs;
  printf("Need float_assemble for this architecture.\n");
  exit(1);
}
#endif
3473
3474 void syscall_assemble(int i,struct regstat *i_regs)
3475 {
3476   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3477   assert(ccreg==HOST_CCREG);
3478   assert(!is_delayslot);
3479   (void)ccreg;
3480   emit_movimm(start+i*4,EAX); // Get PC
3481   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3482   emit_jmp((int)jump_syscall_hle); // XXX
3483 }
3484
3485 void hlecall_assemble(int i,struct regstat *i_regs)
3486 {
3487   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3488   assert(ccreg==HOST_CCREG);
3489   assert(!is_delayslot);
3490   (void)ccreg;
3491   emit_movimm(start+i*4+4,0); // Get PC
3492   emit_movimm((int)psxHLEt[source[i]&7],1);
3493   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
3494   emit_jmp((int)jump_hlecall);
3495 }
3496
3497 void intcall_assemble(int i,struct regstat *i_regs)
3498 {
3499   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3500   assert(ccreg==HOST_CCREG);
3501   assert(!is_delayslot);
3502   (void)ccreg;
3503   emit_movimm(start+i*4,0); // Get PC
3504   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
3505   emit_jmp((int)jump_intcall);
3506 }
3507
3508 void ds_assemble(int i,struct regstat *i_regs)
3509 {
3510   speculate_register_values(i);
3511   is_delayslot=1;
3512   switch(itype[i]) {
3513     case ALU:
3514       alu_assemble(i,i_regs);break;
3515     case IMM16:
3516       imm16_assemble(i,i_regs);break;
3517     case SHIFT:
3518       shift_assemble(i,i_regs);break;
3519     case SHIFTIMM:
3520       shiftimm_assemble(i,i_regs);break;
3521     case LOAD:
3522       load_assemble(i,i_regs);break;
3523     case LOADLR:
3524       loadlr_assemble(i,i_regs);break;
3525     case STORE:
3526       store_assemble(i,i_regs);break;
3527     case STORELR:
3528       storelr_assemble(i,i_regs);break;
3529     case COP0:
3530       cop0_assemble(i,i_regs);break;
3531     case COP1:
3532       cop1_assemble(i,i_regs);break;
3533     case C1LS:
3534       c1ls_assemble(i,i_regs);break;
3535     case COP2:
3536       cop2_assemble(i,i_regs);break;
3537     case C2LS:
3538       c2ls_assemble(i,i_regs);break;
3539     case C2OP:
3540       c2op_assemble(i,i_regs);break;
3541     case FCONV:
3542       fconv_assemble(i,i_regs);break;
3543     case FLOAT:
3544       float_assemble(i,i_regs);break;
3545     case FCOMP:
3546       fcomp_assemble(i,i_regs);break;
3547     case MULTDIV:
3548       multdiv_assemble(i,i_regs);break;
3549     case MOV:
3550       mov_assemble(i,i_regs);break;
3551     case SYSCALL:
3552     case HLECALL:
3553     case INTCALL:
3554     case SPAN:
3555     case UJUMP:
3556     case RJUMP:
3557     case CJUMP:
3558     case SJUMP:
3559     case FJUMP:
3560       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
3561   }
3562   is_delayslot=0;
3563 }
3564
3565 // Is the branch target a valid internal jump?
3566 int internal_branch(uint64_t i_is32,int addr)
3567 {
3568   if(addr&1) return 0; // Indirect (register) jump
3569   if(addr>=start && addr<start+slen*4-4)
3570   {
3571     //int t=(addr-start)>>2;
3572     // Delay slots are not valid branch targets
3573     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3574     // 64 -> 32 bit transition requires a recompile
3575     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3576     {
3577       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3578       else printf("optimizable: yes\n");
3579     }*/
3580     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3581     return 1;
3582   }
3583   return 0;
3584 }
3585
3586 #ifndef wb_invalidate
3587 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3588   uint64_t u,uint64_t uu)
3589 {
3590   int hr;
3591   for(hr=0;hr<HOST_REGS;hr++) {
3592     if(hr!=EXCLUDE_REG) {
3593       if(pre[hr]!=entry[hr]) {
3594         if(pre[hr]>=0) {
3595           if((dirty>>hr)&1) {
3596             if(get_reg(entry,pre[hr])<0) {
3597               if(pre[hr]<64) {
3598                 if(!((u>>pre[hr])&1)) {
3599                   emit_storereg(pre[hr],hr);
3600                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3601                     emit_sarimm(hr,31,hr);
3602                     emit_storereg(pre[hr]|64,hr);
3603                   }
3604                 }
3605               }else{
3606                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3607                   emit_storereg(pre[hr],hr);
3608                 }
3609               }
3610             }
3611           }
3612         }
3613       }
3614     }
3615   }
3616   // Move from one register to another (no writeback)
3617   for(hr=0;hr<HOST_REGS;hr++) {
3618     if(hr!=EXCLUDE_REG) {
3619       if(pre[hr]!=entry[hr]) {
3620         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3621           int nr;
3622           if((nr=get_reg(entry,pre[hr]))>=0) {
3623             emit_mov(hr,nr);
3624           }
3625         }
3626       }
3627     }
3628   }
3629 }
3630 #endif
3631
3632 // Load the specified registers
3633 // This only loads the registers given as arguments because
3634 // we don't want to load things that will be overwritten
3635 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3636 {
3637   int hr;
3638   // Load 32-bit regs
3639   for(hr=0;hr<HOST_REGS;hr++) {
3640     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3641       if(entry[hr]!=regmap[hr]) {
3642         if(regmap[hr]==rs1||regmap[hr]==rs2)
3643         {
3644           if(regmap[hr]==0) {
3645             emit_zeroreg(hr);
3646           }
3647           else
3648           {
3649             emit_loadreg(regmap[hr],hr);
3650           }
3651         }
3652       }
3653     }
3654   }
3655   //Load 64-bit regs
3656   for(hr=0;hr<HOST_REGS;hr++) {
3657     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3658       if(entry[hr]!=regmap[hr]) {
3659         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3660         {
3661           assert(regmap[hr]!=64);
3662           if((is32>>(regmap[hr]&63))&1) {
3663             int lr=get_reg(regmap,regmap[hr]-64);
3664             if(lr>=0)
3665               emit_sarimm(lr,31,hr);
3666             else
3667               emit_loadreg(regmap[hr],hr);
3668           }
3669           else
3670           {
3671             emit_loadreg(regmap[hr],hr);
3672           }
3673         }
3674       }
3675     }
3676   }
3677 }
3678
3679 // Load registers prior to the start of a loop
3680 // so that they are not loaded within the loop
3681 static void loop_preload(signed char pre[],signed char entry[])
3682 {
3683   int hr;
3684   for(hr=0;hr<HOST_REGS;hr++) {
3685     if(hr!=EXCLUDE_REG) {
3686       if(pre[hr]!=entry[hr]) {
3687         if(entry[hr]>=0) {
3688           if(get_reg(pre,entry[hr])<0) {
3689             assem_debug("loop preload:\n");
3690             //printf("loop preload: %d\n",hr);
3691             if(entry[hr]==0) {
3692               emit_zeroreg(hr);
3693             }
3694             else if(entry[hr]<TEMPREG)
3695             {
3696               emit_loadreg(entry[hr],hr);
3697             }
3698             else if(entry[hr]-64<TEMPREG)
3699             {
3700               emit_loadreg(entry[hr],hr);
3701             }
3702           }
3703         }
3704       }
3705     }
3706   }
3707 }
3708
3709 // Generate address for load/store instruction
3710 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
3711 void address_generation(int i,struct regstat *i_regs,signed char entry[])
3712 {
3713   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
3714     int ra=-1;
3715     int agr=AGEN1+(i&1);
3716     if(itype[i]==LOAD) {
3717       ra=get_reg(i_regs->regmap,rt1[i]);
3718       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3719       assert(ra>=0);
3720     }
3721     if(itype[i]==LOADLR) {
3722       ra=get_reg(i_regs->regmap,FTEMP);
3723     }
3724     if(itype[i]==STORE||itype[i]==STORELR) {
3725       ra=get_reg(i_regs->regmap,agr);
3726       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3727     }
3728     if(itype[i]==C1LS||itype[i]==C2LS) {
3729       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
3730         ra=get_reg(i_regs->regmap,FTEMP);
3731       else { // SWC1/SDC1/SWC2/SDC2
3732         ra=get_reg(i_regs->regmap,agr);
3733         if(ra<0) ra=get_reg(i_regs->regmap,-1);
3734       }
3735     }
3736     int rs=get_reg(i_regs->regmap,rs1[i]);
3737     if(ra>=0) {
3738       int offset=imm[i];
3739       int c=(i_regs->wasconst>>rs)&1;
3740       if(rs1[i]==0) {
3741         // Using r0 as a base address
3742         if(!entry||entry[ra]!=agr) {
3743           if (opcode[i]==0x22||opcode[i]==0x26) {
3744             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3745           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3746             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3747           }else{
3748             emit_movimm(offset,ra);
3749           }
3750         } // else did it in the previous cycle
3751       }
3752       else if(rs<0) {
3753         if(!entry||entry[ra]!=rs1[i])
3754           emit_loadreg(rs1[i],ra);
3755         //if(!entry||entry[ra]!=rs1[i])
3756         //  printf("poor load scheduling!\n");
3757       }
3758       else if(c) {
3759         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
3760           if(!entry||entry[ra]!=agr) {
3761             if (opcode[i]==0x22||opcode[i]==0x26) {
3762               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3763             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3764               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3765             }else{
3766               #ifdef HOST_IMM_ADDR32
3767               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3768               #endif
3769               emit_movimm(constmap[i][rs]+offset,ra);
3770               regs[i].loadedconst|=1<<ra;
3771             }
3772           } // else did it in the previous cycle
3773         } // else load_consts already did it
3774       }
3775       if(offset&&!c&&rs1[i]) {
3776         if(rs>=0) {
3777           emit_addimm(rs,offset,ra);
3778         }else{
3779           emit_addimm(ra,offset,ra);
3780         }
3781       }
3782     }
3783   }
3784   // Preload constants for next instruction
3785   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
3786     int agr,ra;
3787     // Actual address
3788     agr=AGEN1+((i+1)&1);
3789     ra=get_reg(i_regs->regmap,agr);
3790     if(ra>=0) {
3791       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3792       int offset=imm[i+1];
3793       int c=(regs[i+1].wasconst>>rs)&1;
3794       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
3795         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3796           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3797         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3798           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3799         }else{
3800           #ifdef HOST_IMM_ADDR32
3801           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3802           #endif
3803           emit_movimm(constmap[i+1][rs]+offset,ra);
3804           regs[i+1].loadedconst|=1<<ra;
3805         }
3806       }
3807       else if(rs1[i+1]==0) {
3808         // Using r0 as a base address
3809         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3810           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3811         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3812           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3813         }else{
3814           emit_movimm(offset,ra);
3815         }
3816       }
3817     }
3818   }
3819 }
3820
3821 static int get_final_value(int hr, int i, int *value)
3822 {
3823   int reg=regs[i].regmap[hr];
3824   while(i<slen-1) {
3825     if(regs[i+1].regmap[hr]!=reg) break;
3826     if(!((regs[i+1].isconst>>hr)&1)) break;
3827     if(bt[i+1]) break;
3828     i++;
3829   }
3830   if(i<slen-1) {
3831     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
3832       *value=constmap[i][hr];
3833       return 1;
3834     }
3835     if(!bt[i+1]) {
3836       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
3837         // Load in delay slot, out-of-order execution
3838         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
3839         {
3840           // Precompute load address
3841           *value=constmap[i][hr]+imm[i+2];
3842           return 1;
3843         }
3844       }
3845       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
3846       {
3847         // Precompute load address
3848         *value=constmap[i][hr]+imm[i+1];
3849         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
3850         return 1;
3851       }
3852     }
3853   }
3854   *value=constmap[i][hr];
3855   //printf("c=%x\n",(int)constmap[i][hr]);
3856   if(i==slen-1) return 1;
3857   if(reg<64) {
3858     return !((unneeded_reg[i+1]>>reg)&1);
3859   }else{
3860     return !((unneeded_reg_upper[i+1]>>reg)&1);
3861   }
3862 }
3863
3864 // Load registers with known constants
3865 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
3866 {
3867   int hr,hr2;
3868   // propagate loaded constant flags
3869   if(i==0||bt[i])
3870     regs[i].loadedconst=0;
3871   else {
3872     for(hr=0;hr<HOST_REGS;hr++) {
3873       if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
3874          &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
3875       {
3876         regs[i].loadedconst|=1<<hr;
3877       }
3878     }
3879   }
3880   // Load 32-bit regs
3881   for(hr=0;hr<HOST_REGS;hr++) {
3882     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3883       //if(entry[hr]!=regmap[hr]) {
3884       if(!((regs[i].loadedconst>>hr)&1)) {
3885         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3886           int value,similar=0;
3887           if(get_final_value(hr,i,&value)) {
3888             // see if some other register has similar value
3889             for(hr2=0;hr2<HOST_REGS;hr2++) {
3890               if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
3891                 if(is_similar_value(value,constmap[i][hr2])) {
3892                   similar=1;
3893                   break;
3894                 }
3895               }
3896             }
3897             if(similar) {
3898               int value2;
3899               if(get_final_value(hr2,i,&value2)) // is this needed?
3900                 emit_movimm_from(value2,hr2,value,hr);
3901               else
3902                 emit_movimm(value,hr);
3903             }
3904             else if(value==0) {
3905               emit_zeroreg(hr);
3906             }
3907             else {
3908               emit_movimm(value,hr);
3909             }
3910           }
3911           regs[i].loadedconst|=1<<hr;
3912         }
3913       }
3914     }
3915   }
3916   // Load 64-bit regs
3917   for(hr=0;hr<HOST_REGS;hr++) {
3918     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3919       //if(entry[hr]!=regmap[hr]) {
3920       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3921         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3922           if((is32>>(regmap[hr]&63))&1) {
3923             int lr=get_reg(regmap,regmap[hr]-64);
3924             assert(lr>=0);
3925             emit_sarimm(lr,31,hr);
3926           }
3927           else
3928           {
3929             int value;
3930             if(get_final_value(hr,i,&value)) {
3931               if(value==0) {
3932                 emit_zeroreg(hr);
3933               }
3934               else {
3935                 emit_movimm(value,hr);
3936               }
3937             }
3938           }
3939         }
3940       }
3941     }
3942   }
3943 }
3944 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
3945 {
3946   int hr;
3947   // Load 32-bit regs
3948   for(hr=0;hr<HOST_REGS;hr++) {
3949     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3950       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3951         int value=constmap[i][hr];
3952         if(value==0) {
3953           emit_zeroreg(hr);
3954         }
3955         else {
3956           emit_movimm(value,hr);
3957         }
3958       }
3959     }
3960   }
3961   // Load 64-bit regs
3962   for(hr=0;hr<HOST_REGS;hr++) {
3963     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3964       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3965         if((is32>>(regmap[hr]&63))&1) {
3966           int lr=get_reg(regmap,regmap[hr]-64);
3967           assert(lr>=0);
3968           emit_sarimm(lr,31,hr);
3969         }
3970         else
3971         {
3972           int value=constmap[i][hr];
3973           if(value==0) {
3974             emit_zeroreg(hr);
3975           }
3976           else {
3977             emit_movimm(value,hr);
3978           }
3979         }
3980       }
3981     }
3982   }
3983 }
3984
3985 // Write out all dirty registers (except cycle count)
3986 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
3987 {
3988   int hr;
3989   for(hr=0;hr<HOST_REGS;hr++) {
3990     if(hr!=EXCLUDE_REG) {
3991       if(i_regmap[hr]>0) {
3992         if(i_regmap[hr]!=CCREG) {
3993           if((i_dirty>>hr)&1) {
3994             if(i_regmap[hr]<64) {
3995               emit_storereg(i_regmap[hr],hr);
3996             }else{
3997               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
3998                 emit_storereg(i_regmap[hr],hr);
3999               }
4000             }
4001           }
4002         }
4003       }
4004     }
4005   }
4006 }
4007 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4008 // This writes the registers not written by store_regs_bt
4009 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4010 {
4011   int hr;
4012   int t=(addr-start)>>2;
4013   for(hr=0;hr<HOST_REGS;hr++) {
4014     if(hr!=EXCLUDE_REG) {
4015       if(i_regmap[hr]>0) {
4016         if(i_regmap[hr]!=CCREG) {
4017           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4018             if((i_dirty>>hr)&1) {
4019               if(i_regmap[hr]<64) {
4020                 emit_storereg(i_regmap[hr],hr);
4021               }else{
4022                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4023                   emit_storereg(i_regmap[hr],hr);
4024                 }
4025               }
4026             }
4027           }
4028         }
4029       }
4030     }
4031   }
4032 }
4033
4034 // Load all registers (except cycle count)
4035 void load_all_regs(signed char i_regmap[])
4036 {
4037   int hr;
4038   for(hr=0;hr<HOST_REGS;hr++) {
4039     if(hr!=EXCLUDE_REG) {
4040       if(i_regmap[hr]==0) {
4041         emit_zeroreg(hr);
4042       }
4043       else
4044       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4045       {
4046         emit_loadreg(i_regmap[hr],hr);
4047       }
4048     }
4049   }
4050 }
4051
4052 // Load all current registers also needed by next instruction
4053 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4054 {
4055   int hr;
4056   for(hr=0;hr<HOST_REGS;hr++) {
4057     if(hr!=EXCLUDE_REG) {
4058       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4059         if(i_regmap[hr]==0) {
4060           emit_zeroreg(hr);
4061         }
4062         else
4063         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4064         {
4065           emit_loadreg(i_regmap[hr],hr);
4066         }
4067       }
4068     }
4069   }
4070 }
4071
4072 // Load all regs, storing cycle count if necessary
4073 void load_regs_entry(int t)
4074 {
4075   int hr;
4076   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
4077   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
4078   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4079     emit_storereg(CCREG,HOST_CCREG);
4080   }
4081   // Load 32-bit regs
4082   for(hr=0;hr<HOST_REGS;hr++) {
4083     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4084       if(regs[t].regmap_entry[hr]==0) {
4085         emit_zeroreg(hr);
4086       }
4087       else if(regs[t].regmap_entry[hr]!=CCREG)
4088       {
4089         emit_loadreg(regs[t].regmap_entry[hr],hr);
4090       }
4091     }
4092   }
4093   // Load 64-bit regs
4094   for(hr=0;hr<HOST_REGS;hr++) {
4095     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4096       assert(regs[t].regmap_entry[hr]!=64);
4097       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4098         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4099         if(lr<0) {
4100           emit_loadreg(regs[t].regmap_entry[hr],hr);
4101         }
4102         else
4103         {
4104           emit_sarimm(lr,31,hr);
4105         }
4106       }
4107       else
4108       {
4109         emit_loadreg(regs[t].regmap_entry[hr],hr);
4110       }
4111     }
4112   }
4113 }
4114
4115 // Store dirty registers prior to branch
4116 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4117 {
4118   if(internal_branch(i_is32,addr))
4119   {
4120     int t=(addr-start)>>2;
4121     int hr;
4122     for(hr=0;hr<HOST_REGS;hr++) {
4123       if(hr!=EXCLUDE_REG) {
4124         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4125           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4126             if((i_dirty>>hr)&1) {
4127               if(i_regmap[hr]<64) {
4128                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4129                   emit_storereg(i_regmap[hr],hr);
4130                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4131                     #ifdef DESTRUCTIVE_WRITEBACK
4132                     emit_sarimm(hr,31,hr);
4133                     emit_storereg(i_regmap[hr]|64,hr);
4134                     #else
4135                     emit_sarimm(hr,31,HOST_TEMPREG);
4136                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4137                     #endif
4138                   }
4139                 }
4140               }else{
4141                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4142                   emit_storereg(i_regmap[hr],hr);
4143                 }
4144               }
4145             }
4146           }
4147         }
4148       }
4149     }
4150   }
4151   else
4152   {
4153     // Branch out of this block, write out all dirty regs
4154     wb_dirtys(i_regmap,i_is32,i_dirty);
4155   }
4156 }
4157
4158 // Load all needed registers for branch target
4159 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4160 {
4161   //if(addr>=start && addr<(start+slen*4))
4162   if(internal_branch(i_is32,addr))
4163   {
4164     int t=(addr-start)>>2;
4165     int hr;
4166     // Store the cycle count before loading something else
4167     if(i_regmap[HOST_CCREG]!=CCREG) {
4168       assert(i_regmap[HOST_CCREG]==-1);
4169     }
4170     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4171       emit_storereg(CCREG,HOST_CCREG);
4172     }
4173     // Load 32-bit regs
4174     for(hr=0;hr<HOST_REGS;hr++) {
4175       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4176         #ifdef DESTRUCTIVE_WRITEBACK
4177         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4178         #else
4179         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4180         #endif
4181           if(regs[t].regmap_entry[hr]==0) {
4182             emit_zeroreg(hr);
4183           }
4184           else if(regs[t].regmap_entry[hr]!=CCREG)
4185           {
4186             emit_loadreg(regs[t].regmap_entry[hr],hr);
4187           }
4188         }
4189       }
4190     }
4191     //Load 64-bit regs
4192     for(hr=0;hr<HOST_REGS;hr++) {
4193       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4194         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4195           assert(regs[t].regmap_entry[hr]!=64);
4196           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4197             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4198             if(lr<0) {
4199               emit_loadreg(regs[t].regmap_entry[hr],hr);
4200             }
4201             else
4202             {
4203               emit_sarimm(lr,31,hr);
4204             }
4205           }
4206           else
4207           {
4208             emit_loadreg(regs[t].regmap_entry[hr],hr);
4209           }
4210         }
4211         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4212           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4213           assert(lr>=0);
4214           emit_sarimm(lr,31,hr);
4215         }
4216       }
4217     }
4218   }
4219 }
4220
4221 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4222 {
4223   if(addr>=start && addr<start+slen*4-4)
4224   {
4225     int t=(addr-start)>>2;
4226     int hr;
4227     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4228     for(hr=0;hr<HOST_REGS;hr++)
4229     {
4230       if(hr!=EXCLUDE_REG)
4231       {
4232         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4233         {
4234           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4235           {
4236             return 0;
4237           }
4238           else
4239           if((i_dirty>>hr)&1)
4240           {
4241             if(i_regmap[hr]<TEMPREG)
4242             {
4243               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4244                 return 0;
4245             }
4246             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4247             {
4248               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4249                 return 0;
4250             }
4251           }
4252         }
4253         else // Same register but is it 32-bit or dirty?
4254         if(i_regmap[hr]>=0)
4255         {
4256           if(!((regs[t].dirty>>hr)&1))
4257           {
4258             if((i_dirty>>hr)&1)
4259             {
4260               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4261               {
4262                 //printf("%x: dirty no match\n",addr);
4263                 return 0;
4264               }
4265             }
4266           }
4267           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4268           {
4269             //printf("%x: is32 no match\n",addr);
4270             return 0;
4271           }
4272         }
4273       }
4274     }
4275     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4276     // Delay slots are not valid branch targets
4277     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4278     // Delay slots require additional processing, so do not match
4279     if(is_ds[t]) return 0;
4280   }
4281   else
4282   {
4283     int hr;
4284     for(hr=0;hr<HOST_REGS;hr++)
4285     {
4286       if(hr!=EXCLUDE_REG)
4287       {
4288         if(i_regmap[hr]>=0)
4289         {
4290           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4291           {
4292             if((i_dirty>>hr)&1)
4293             {
4294               return 0;
4295             }
4296           }
4297         }
4298       }
4299     }
4300   }
4301   return 1;
4302 }
4303
4304 // Used when a branch jumps into the delay slot of another branch
4305 void ds_assemble_entry(int i)
4306 {
4307   int t=(ba[i]-start)>>2;
4308   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4309   assem_debug("Assemble delay slot at %x\n",ba[i]);
4310   assem_debug("<->\n");
4311   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4312     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4313   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4314   address_generation(t,&regs[t],regs[t].regmap_entry);
4315   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4316     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4317   cop1_usable=0;
4318   is_delayslot=0;
4319   switch(itype[t]) {
4320     case ALU:
4321       alu_assemble(t,&regs[t]);break;
4322     case IMM16:
4323       imm16_assemble(t,&regs[t]);break;
4324     case SHIFT:
4325       shift_assemble(t,&regs[t]);break;
4326     case SHIFTIMM:
4327       shiftimm_assemble(t,&regs[t]);break;
4328     case LOAD:
4329       load_assemble(t,&regs[t]);break;
4330     case LOADLR:
4331       loadlr_assemble(t,&regs[t]);break;
4332     case STORE:
4333       store_assemble(t,&regs[t]);break;
4334     case STORELR:
4335       storelr_assemble(t,&regs[t]);break;
4336     case COP0:
4337       cop0_assemble(t,&regs[t]);break;
4338     case COP1:
4339       cop1_assemble(t,&regs[t]);break;
4340     case C1LS:
4341       c1ls_assemble(t,&regs[t]);break;
4342     case COP2:
4343       cop2_assemble(t,&regs[t]);break;
4344     case C2LS:
4345       c2ls_assemble(t,&regs[t]);break;
4346     case C2OP:
4347       c2op_assemble(t,&regs[t]);break;
4348     case FCONV:
4349       fconv_assemble(t,&regs[t]);break;
4350     case FLOAT:
4351       float_assemble(t,&regs[t]);break;
4352     case FCOMP:
4353       fcomp_assemble(t,&regs[t]);break;
4354     case MULTDIV:
4355       multdiv_assemble(t,&regs[t]);break;
4356     case MOV:
4357       mov_assemble(t,&regs[t]);break;
4358     case SYSCALL:
4359     case HLECALL:
4360     case INTCALL:
4361     case SPAN:
4362     case UJUMP:
4363     case RJUMP:
4364     case CJUMP:
4365     case SJUMP:
4366     case FJUMP:
4367       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
4368   }
4369   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4370   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4371   if(internal_branch(regs[t].is32,ba[i]+4))
4372     assem_debug("branch: internal\n");
4373   else
4374     assem_debug("branch: external\n");
4375   assert(internal_branch(regs[t].is32,ba[i]+4));
4376   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4377   emit_jmp(0);
4378 }
4379
4380 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4381 {
4382   int count;
4383   int jaddr;
4384   int idle=0;
4385   int t=0;
4386   if(itype[i]==RJUMP)
4387   {
4388     *adj=0;
4389   }
4390   //if(ba[i]>=start && ba[i]<(start+slen*4))
4391   if(internal_branch(branch_regs[i].is32,ba[i]))
4392   {
4393     t=(ba[i]-start)>>2;
4394     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4395     else *adj=ccadj[t];
4396   }
4397   else
4398   {
4399     *adj=0;
4400   }
4401   count=ccadj[i];
4402   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4403     // Idle loop
4404     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4405     idle=(int)out;
4406     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4407     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4408     jaddr=(int)out;
4409     emit_jmp(0);
4410   }
4411   else if(*adj==0||invert) {
4412     int cycles=CLOCK_ADJUST(count+2);
4413     // faster loop HACK
4414     if (t&&*adj) {
4415       int rel=t-i;
4416       if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
4417         cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
4418     }
4419     emit_addimm_and_set_flags(cycles,HOST_CCREG);
4420     jaddr=(int)out;
4421     emit_jns(0);
4422   }
4423   else
4424   {
4425     emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
4426     jaddr=(int)out;
4427     emit_jns(0);
4428   }
4429   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4430 }
4431
4432 void do_ccstub(int n)
4433 {
4434   literal_pool(256);
4435   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4436   set_jump_target(stubs[n][1],(int)out);
4437   int i=stubs[n][4];
4438   if(stubs[n][6]==NULLDS) {
4439     // Delay slot instruction is nullified ("likely" branch)
4440     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4441   }
4442   else if(stubs[n][6]!=TAKEN) {
4443     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4444   }
4445   else {
4446     if(internal_branch(branch_regs[i].is32,ba[i]))
4447       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4448   }
4449   if(stubs[n][5]!=-1)
4450   {
4451     // Save PC as return address
4452     emit_movimm(stubs[n][5],EAX);
4453     emit_writeword(EAX,(int)&pcaddr);
4454   }
4455   else
4456   {
4457     // Return address depends on which way the branch goes
4458     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4459     {
4460       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4461       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4462       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4463       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4464       if(rs1[i]==0)
4465       {
4466         s1l=s2l;s1h=s2h;
4467         s2l=s2h=-1;
4468       }
4469       else if(rs2[i]==0)
4470       {
4471         s2l=s2h=-1;
4472       }
4473       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4474         s1h=s2h=-1;
4475       }
4476       assert(s1l>=0);
4477       #ifdef DESTRUCTIVE_WRITEBACK
4478       if(rs1[i]) {
4479         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4480           emit_loadreg(rs1[i],s1l);
4481       }
4482       else {
4483         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4484           emit_loadreg(rs2[i],s1l);
4485       }
4486       if(s2l>=0)
4487         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4488           emit_loadreg(rs2[i],s2l);
4489       #endif
4490       int hr=0;
4491       int addr=-1,alt=-1,ntaddr=-1;
4492       while(hr<HOST_REGS)
4493       {
4494         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4495            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4496            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4497         {
4498           addr=hr++;break;
4499         }
4500         hr++;
4501       }
4502       while(hr<HOST_REGS)
4503       {
4504         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4505            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4506            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4507         {
4508           alt=hr++;break;
4509         }
4510         hr++;
4511       }
4512       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4513       {
4514         while(hr<HOST_REGS)
4515         {
4516           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4517              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4518              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4519           {
4520             ntaddr=hr;break;
4521           }
4522           hr++;
4523         }
4524         assert(hr<HOST_REGS);
4525       }
4526       if((opcode[i]&0x2f)==4) // BEQ
4527       {
4528         #ifdef HAVE_CMOV_IMM
4529         if(s1h<0) {
4530           if(s2l>=0) emit_cmp(s1l,s2l);
4531           else emit_test(s1l,s1l);
4532           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4533         }
4534         else
4535         #endif
4536         {
4537           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4538           if(s1h>=0) {
4539             if(s2h>=0) emit_cmp(s1h,s2h);
4540             else emit_test(s1h,s1h);
4541             emit_cmovne_reg(alt,addr);
4542           }
4543           if(s2l>=0) emit_cmp(s1l,s2l);
4544           else emit_test(s1l,s1l);
4545           emit_cmovne_reg(alt,addr);
4546         }
4547       }
4548       if((opcode[i]&0x2f)==5) // BNE
4549       {
4550         #ifdef HAVE_CMOV_IMM
4551         if(s1h<0) {
4552           if(s2l>=0) emit_cmp(s1l,s2l);
4553           else emit_test(s1l,s1l);
4554           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4555         }
4556         else
4557         #endif
4558         {
4559           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4560           if(s1h>=0) {
4561             if(s2h>=0) emit_cmp(s1h,s2h);
4562             else emit_test(s1h,s1h);
4563             emit_cmovne_reg(alt,addr);
4564           }
4565           if(s2l>=0) emit_cmp(s1l,s2l);
4566           else emit_test(s1l,s1l);
4567           emit_cmovne_reg(alt,addr);
4568         }
4569       }
4570       if((opcode[i]&0x2f)==6) // BLEZ
4571       {
4572         //emit_movimm(ba[i],alt);
4573         //emit_movimm(start+i*4+8,addr);
4574         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4575         emit_cmpimm(s1l,1);
4576         if(s1h>=0) emit_mov(addr,ntaddr);
4577         emit_cmovl_reg(alt,addr);
4578         if(s1h>=0) {
4579           emit_test(s1h,s1h);
4580           emit_cmovne_reg(ntaddr,addr);
4581           emit_cmovs_reg(alt,addr);
4582         }
4583       }
4584       if((opcode[i]&0x2f)==7) // BGTZ
4585       {
4586         //emit_movimm(ba[i],addr);
4587         //emit_movimm(start+i*4+8,ntaddr);
4588         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4589         emit_cmpimm(s1l,1);
4590         if(s1h>=0) emit_mov(addr,alt);
4591         emit_cmovl_reg(ntaddr,addr);
4592         if(s1h>=0) {
4593           emit_test(s1h,s1h);
4594           emit_cmovne_reg(alt,addr);
4595           emit_cmovs_reg(ntaddr,addr);
4596         }
4597       }
4598       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4599       {
4600         //emit_movimm(ba[i],alt);
4601         //emit_movimm(start+i*4+8,addr);
4602         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4603         if(s1h>=0) emit_test(s1h,s1h);
4604         else emit_test(s1l,s1l);
4605         emit_cmovs_reg(alt,addr);
4606       }
4607       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4608       {
4609         //emit_movimm(ba[i],addr);
4610         //emit_movimm(start+i*4+8,alt);
4611         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4612         if(s1h>=0) emit_test(s1h,s1h);
4613         else emit_test(s1l,s1l);
4614         emit_cmovs_reg(alt,addr);
4615       }
4616       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4617         if(source[i]&0x10000) // BC1T
4618         {
4619           //emit_movimm(ba[i],alt);
4620           //emit_movimm(start+i*4+8,addr);
4621           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4622           emit_testimm(s1l,0x800000);
4623           emit_cmovne_reg(alt,addr);
4624         }
4625         else // BC1F
4626         {
4627           //emit_movimm(ba[i],addr);
4628           //emit_movimm(start+i*4+8,alt);
4629           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4630           emit_testimm(s1l,0x800000);
4631           emit_cmovne_reg(alt,addr);
4632         }
4633       }
4634       emit_writeword(addr,(int)&pcaddr);
4635     }
4636     else
4637     if(itype[i]==RJUMP)
4638     {
4639       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4640       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4641         r=get_reg(branch_regs[i].regmap,RTEMP);
4642       }
4643       emit_writeword(r,(int)&pcaddr);
4644     }
4645     else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);}
4646   }
4647   // Update cycle count
4648   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4649   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
4650   emit_call((int)cc_interrupt);
4651   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
4652   if(stubs[n][6]==TAKEN) {
4653     if(internal_branch(branch_regs[i].is32,ba[i]))
4654       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4655     else if(itype[i]==RJUMP) {
4656       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4657         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4658       else
4659         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4660     }
4661   }else if(stubs[n][6]==NOTTAKEN) {
4662     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4663     else load_all_regs(branch_regs[i].regmap);
4664   }else if(stubs[n][6]==NULLDS) {
4665     // Delay slot instruction is nullified ("likely" branch)
4666     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4667     else load_all_regs(regs[i].regmap);
4668   }else{
4669     load_all_regs(branch_regs[i].regmap);
4670   }
4671   emit_jmp(stubs[n][2]); // return address
4672
4673   /* This works but uses a lot of memory...
4674   emit_readword((int)&last_count,ECX);
4675   emit_add(HOST_CCREG,ECX,EAX);
4676   emit_writeword(EAX,(int)&Count);
4677   emit_call((int)gen_interupt);
4678   emit_readword((int)&Count,HOST_CCREG);
4679   emit_readword((int)&next_interupt,EAX);
4680   emit_readword((int)&pending_exception,EBX);
4681   emit_writeword(EAX,(int)&last_count);
4682   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
4683   emit_test(EBX,EBX);
4684   int jne_instr=(int)out;
4685   emit_jne(0);
4686   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
4687   load_all_regs(branch_regs[i].regmap);
4688   emit_jmp(stubs[n][2]); // return address
4689   set_jump_target(jne_instr,(int)out);
4690   emit_readword((int)&pcaddr,EAX);
4691   // Call get_addr_ht instead of doing the hash table here.
4692   // This code is executed infrequently and takes up a lot of space
4693   // so smaller is better.
4694   emit_storereg(CCREG,HOST_CCREG);
4695   emit_pushreg(EAX);
4696   emit_call((int)get_addr_ht);
4697   emit_loadreg(CCREG,HOST_CCREG);
4698   emit_addimm(ESP,4,ESP);
4699   emit_jmpreg(EAX);*/
4700 }
4701
4702 static void add_to_linker(int addr,int target,int ext)
4703 {
4704   link_addr[linkcount][0]=addr;
4705   link_addr[linkcount][1]=target;
4706   link_addr[linkcount][2]=ext;
4707   linkcount++;
4708 }
4709
4710 static void ujump_assemble_write_ra(int i)
4711 {
4712   int rt;
4713   unsigned int return_address;
4714   rt=get_reg(branch_regs[i].regmap,31);
4715   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4716   //assert(rt>=0);
4717   return_address=start+i*4+8;
4718   if(rt>=0) {
4719     #ifdef USE_MINI_HT
4720     if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
4721       int temp=-1; // note: must be ds-safe
4722       #ifdef HOST_TEMPREG
4723       temp=HOST_TEMPREG;
4724       #endif
4725       if(temp>=0) do_miniht_insert(return_address,rt,temp);
4726       else emit_movimm(return_address,rt);
4727     }
4728     else
4729     #endif
4730     {
4731       #ifdef REG_PREFETCH
4732       if(temp>=0)
4733       {
4734         if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4735       }
4736       #endif
4737       emit_movimm(return_address,rt); // PC into link register
4738       #ifdef IMM_PREFETCH
4739       emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4740       #endif
4741     }
4742   }
4743 }
4744
4745 void ujump_assemble(int i,struct regstat *i_regs)
4746 {
       // Emits host code for the unconditional jump at instruction index i, whose
       // delay slot is instruction i+1 and whose target address is ba[i].
       // Emission order: delay-slot address generation, optional early write of
       // the return address, the delay slot itself, writeback/reload of registers
       // into the branch_regs[i] mapping, cycle-count adjustment, then the jump.
       //   i      - index of the jump within the block being compiled
       //   i_regs - register state passed on to address generation and the delay slot
4747   int ra_done=0;
       // ba[i] maps back to this very instruction index: the jump targets itself.
4748   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4749   address_generation(i+1,i_regs,regs[i].regmap_entry);
4750   #ifdef REG_PREFETCH
4751   int temp=get_reg(branch_regs[i].regmap,PTEMP);
4752   if(rt1[i]==31&&temp>=0)
4753   {
4754     signed char *i_regmap=i_regs->regmap;
4755     int return_address=start+i*4+8;
         // NOTE(review): this tests ">0" while the other get_reg() results in this
         // file section are tested with ">=0"; confirm that host register 0 is
         // really meant to be excluded here.
4756     if(get_reg(branch_regs[i].regmap,31)>0)
4757     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4758   }
4759   #endif
       // The delay slot reads register 31 as a source, so the return address is
       // written before the delay slot is assembled.
4760   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4761     ujump_assemble_write_ra(i); // writeback ra for DS
4762     ra_done=1;
4763   }
4764   ds_assemble(i+1,i_regs);
       // Start from the branch's unneeded masks and additionally mark bit 0 and
       // the bit of rt1[i] in both the lower and upper mask.
4765   uint64_t bc_unneeded=branch_regs[i].u;
4766   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4767   bc_unneeded|=1|(1LL<<rt1[i]);
4768   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4769   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4770                 bc_unneeded,bc_unneeded_upper);
4771   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
       // Normal case: the return address is written after the delay slot.
4772   if(!ra_done&&rt1[i]==31)
4773     ujump_assemble_write_ra(i);
4774   int cc,adj;
4775   cc=get_reg(branch_regs[i].regmap,CCREG);
       // The cycle counter must live in its fixed host register at this point.
4776   assert(cc==HOST_CCREG);
4777   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4778   #ifdef REG_PREFETCH
4779   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4780   #endif
       // do_cc reports through adj how much it already accounted for; only the
       // remainder of ccadj[i]+2 is added here.
4781   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4782   if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4783   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4784   if(internal_branch(branch_regs[i].is32,ba[i]))
4785     assem_debug("branch: internal\n");
4786   else
4787     assem_debug("branch: external\n");
       // An internal target that is itself a delay slot goes through
       // ds_assemble_entry; every other target gets a jump that is registered
       // with the linker (emit_jmp(0) is presumably a placeholder patched later).
4788   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
4789     ds_assemble_entry(i);
4790   }
4791   else {
4792     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
4793     emit_jmp(0);
4794   }
4795 }
4796
4797 static void rjump_assemble_write_ra(int i)
4798 {
       // Loads the return address of the register jump at index i
       // (start+i*4+8, i.e. the address following the delay slot) into the host
       // register that branch_regs[i].regmap assigns to rt1[i].
       // The asserts require that the delay slot (i+1) does not write rt1[i] and
       // that rt1[i] actually has a host register.
4799   int rt,return_address;
4800   assert(rt1[i+1]!=rt1[i]);
4801   assert(rt2[i+1]!=rt1[i]);
4802   rt=get_reg(branch_regs[i].regmap,rt1[i]);
4803   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4804   assert(rt>=0);
4805   return_address=start+i*4+8;
       // NOTE(review): the REG_PREFETCH section below uses `temp` and `i_regmap`,
       // neither of which is declared in this function, so it cannot compile if
       // REG_PREFETCH is ever defined. Confirm whether that configuration is dead
       // or whether the values need to be obtained here.
4806   #ifdef REG_PREFETCH
4807   if(temp>=0)
4808   {
4809     if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4810   }
4811   #endif
4812   emit_movimm(return_address,rt); // PC into link register
       // Optionally prefetch the hash_table entry indexed by the folded
       // (high 16 bits XOR low 16 bits) return address.
4813   #ifdef IMM_PREFETCH
4814   emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4815   #endif
4816 }
4817
4818 void rjump_assemble(int i,struct regstat *i_regs)
4819 {
       // Emits host code for the register-indirect jump at instruction index i:
       // the target is taken from the host register holding rs1[i], and rt1[i]
       // (when non-zero) receives the return address. The delay slot is i+1.
       //   i      - index of the jump within the block being compiled
       //   i_regs - register state passed on to address generation and the delay slot
4820   int temp;
4821   int rs,cc;
4822   int ra_done=0;
4823   rs=get_reg(branch_regs[i].regmap,rs1[i]);
4824   assert(rs>=0);
4825   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4826     // Delay slot abuse, make a copy of the branch address register
4827     temp=get_reg(branch_regs[i].regmap,RTEMP);
4828     assert(temp>=0);
4829     assert(regs[i].regmap[temp]==RTEMP);
4830     emit_mov(rs,temp);
4831     rs=temp;
4832   }
4833   address_generation(i+1,i_regs,regs[i].regmap_entry);
4834   #ifdef REG_PREFETCH
4835   if(rt1[i]==31)
4836   {
4837     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
4838       signed char *i_regmap=i_regs->regmap;
4839       int return_address=start+i*4+8;
4840       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4841     }
4842   }
4843   #endif
4844   #ifdef USE_MINI_HT
4845   if(rs1[i]==31) {
4846     int rh=get_reg(regs[i].regmap,RHASH);
4847     if(rh>=0) do_preload_rhash(rh);
4848   }
4849   #endif
       // The delay slot reads the link destination as a source, so the return
       // address is written before the delay slot is assembled.
4850   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4851     rjump_assemble_write_ra(i);
4852     ra_done=1;
4853   }
4854   ds_assemble(i+1,i_regs);
       // Mark bit 0 and rt1[i] as unneeded, but clear the unneeded bit of rs1[i]
       // in the lower mask: the jump target register is still required.
4855   uint64_t bc_unneeded=branch_regs[i].u;
4856   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4857   bc_unneeded|=1|(1LL<<rt1[i]);
4858   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4859   bc_unneeded&=~(1LL<<rs1[i]);
4860   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4861                 bc_unneeded,bc_unneeded_upper);
4862   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
4863   if(!ra_done&&rt1[i]!=0)
4864     rjump_assemble_write_ra(i);
4865   cc=get_reg(branch_regs[i].regmap,CCREG);
4866   assert(cc==HOST_CCREG);
       // cc is only read by the assert above; the cast presumably silences an
       // unused-variable warning when asserts are compiled out.
4867   (void)cc;
4868   #ifdef USE_MINI_HT
4869   int rh=get_reg(branch_regs[i].regmap,RHASH);
4870   int ht=get_reg(branch_regs[i].regmap,RHTBL);
4871   if(rs1[i]==31) {
4872     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
4873     do_preload_rhtbl(ht);
4874     do_rhash(rs,rh);
4875   }
4876   #endif
4877   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4878   #ifdef DESTRUCTIVE_WRITEBACK
4879   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
4880     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
4881       emit_loadreg(rs1[i],rs);
4882     }
4883   }
4884   #endif
4885   #ifdef REG_PREFETCH
       // temp is only read when rt1[i]==31, the same condition under which the
       // REG_PREFETCH section above assigned it.
4886   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4887   #endif
4888   #ifdef USE_MINI_HT
4889   if(rs1[i]==31) {
4890     do_miniht_load(ht,rh);
4891   }
4892   #endif
4893   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
4894   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
4895   //assert(adj==0);
       // Add this branch's cycles to the cycle counter with flags set, and
       // register a CC_STUB at the address of the branch emitted right after.
4896   emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
4897   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
4898   if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
4899     // special case for RFE
4900     emit_jmp(0);
4901   else
4902     emit_jns(0);
4903   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4904   #ifdef USE_MINI_HT
4905   if(rs1[i]==31) {
4906     do_miniht_jump(rs,rh,ht);
4907   }
4908   else
4909   #endif
4910   {
4911     //if(rs!=EAX) emit_mov(rs,EAX);
4912     //emit_jmp((int)jump_vaddr_eax);
         // Jump to the per-host-register dispatch entry for rs.
4913     emit_jmp(jump_vaddr_reg[rs]);
4914   }
4915   /* Check hash table
4916   temp=!rs;
4917   emit_mov(rs,temp);
4918   emit_shrimm(rs,16,rs);
4919   emit_xor(temp,rs,rs);
4920   emit_movzwl_reg(rs,rs);
4921   emit_shlimm(rs,4,rs);
4922   emit_cmpmem_indexed((int)hash_table,rs,temp);
4923   emit_jne((int)out+14);
4924   emit_readword_indexed((int)hash_table+4,rs,rs);
4925   emit_jmpreg(rs);
4926   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
4927   emit_addimm_no_flags(8,rs);
4928   emit_jeq((int)out-17);
4929   // No hit on hash table, call compiler
4930   emit_pushreg(temp);
4931 //DEBUG >
4932 #ifdef DEBUG_CYCLE_COUNT
4933   emit_readword((int)&last_count,ECX);
4934   emit_add(HOST_CCREG,ECX,HOST_CCREG);
4935   emit_readword((int)&next_interupt,ECX);
4936   emit_writeword(HOST_CCREG,(int)&Count);
4937   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
4938   emit_writeword(ECX,(int)&last_count);
4939 #endif
4940 //DEBUG <
4941   emit_storereg(CCREG,HOST_CCREG);
4942   emit_call((int)get_addr);
4943   emit_loadreg(CCREG,HOST_CCREG);
4944   emit_addimm(ESP,4,ESP);
4945   emit_jmpreg(EAX);*/
4946   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
       // When the output pointer is not 8-byte aligned, emit a register-to-itself
       // move (presumably a padding no-op) unless this is a link-to-r31 jump or
       // one of the last two instructions of the block.
4947   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
4948   #endif
4949 }
4950
4951 void cjump_assemble(int i,struct regstat *i_regs)
4952 {
       // Emits host code for the two-operand conditional branch at instruction
       // index i (BEQ/BNE/BLEZ/BGTZ per the opcode comments below); the delay
       // slot is i+1 and the target address is ba[i].
       // Two layouts exist: ooo[i] set -> the delay slot is assembled first and
       // the compare follows; otherwise the compare is emitted first and the
       // delay slot is assembled separately on the taken and not-taken paths.
       //   i      - index of the branch within the block being compiled
       //   i_regs - register state on entry to the branch
4953   signed char *i_regmap=i_regs->regmap;
4954   int cc;
4955   int match;
4956   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4957   assem_debug("match=%d\n",match);
4958   int s1h,s1l,s2h,s2l;
4959   int prev_cop1_usable=cop1_usable;
4960   int unconditional=0,nop=0;
4961   int only32=0;
4962   int invert=0;
4963   int internal=internal_branch(branch_regs[i].is32,ba[i]);
4964   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
       // invert: emit the opposite condition to skip over an inline taken path
       // instead of branching straight to the target. Used when the register
       // state does not match the target, and (with the Cortex-A8 hack) when the
       // target index lies before i.
4965   if(!match) invert=1;
4966   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4967   if(i>(ba[i]-start)>>2) invert=1;
4968   #endif
4969
       // Locate the low (l) and high (h, register number |64) host registers of
       // both operands in the mapping that is live when the compare is emitted.
4970   if(ooo[i]) {
4971     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4972     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4973     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4974     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4975   }
4976   else {
4977     s1l=get_reg(i_regmap,rs1[i]);
4978     s1h=get_reg(i_regmap,rs1[i]|64);
4979     s2l=get_reg(i_regmap,rs2[i]);
4980     s2h=get_reg(i_regmap,rs2[i]|64);
4981   }
4982   if(rs1[i]==0&&rs2[i]==0)
4983   {
         // Both operands are register 0, so the outcome is fixed: odd opcodes
         // become a no-op branch, even opcodes an unconditional one.
4984     if(opcode[i]&1) nop=1;
4985     else unconditional=1;
4986     //assert(opcode[i]!=5);
4987     //assert(opcode[i]!=7);
4988     //assert(opcode[i]!=0x15);
4989     //assert(opcode[i]!=0x17);
4990   }
4991   else if(rs1[i]==0)
4992   {
         // Only rs2 is a real register: move it into the first-operand slot so
         // the code below compares a single register (s2l/s2h == -1).
4993     s1l=s2l;s1h=s2h;
4994     s2l=s2h=-1;
4995     only32=(regs[i].was32>>rs2[i])&1;
4996   }
4997   else if(rs2[i]==0)
4998   {
4999     s2l=s2h=-1;
5000     only32=(regs[i].was32>>rs1[i])&1;
5001   }
5002   else {
         // Upper halves can be skipped only if both operands are flagged in was32.
5003     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5004   }
5005
5006   if(ooo[i]) {
5007     // Out of order execution (delay slot first)
5008     //printf("OOOE\n");
5009     address_generation(i+1,i_regs,regs[i].regmap_entry);
5010     ds_assemble(i+1,i_regs);
5011     int adj;
         // The branch operands (and their upper-half sources us1/us2) must stay
         // live across the writeback, so their unneeded bits are cleared.
5012     uint64_t bc_unneeded=branch_regs[i].u;
5013     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5014     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5015     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5016     bc_unneeded|=1;
5017     bc_unneeded_upper|=1;
5018     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5019                   bc_unneeded,bc_unneeded_upper);
5020     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5021     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5022     cc=get_reg(branch_regs[i].regmap,CCREG);
5023     assert(cc==HOST_CCREG);
5024     if(unconditional)
5025       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5026     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5027     //assem_debug("cycle count (adj)\n");
5028     if(unconditional) {
5029       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
           // Nothing more is emitted when the branch targets itself and the
           // delay-slot word is 0 (presumably do_cc already handles that idle loop).
5030       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5031         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5032         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5033         if(internal)
5034           assem_debug("branch: internal\n");
5035         else
5036           assem_debug("branch: external\n");
5037         if(internal&&is_ds[(ba[i]-start)>>2]) {
5038           ds_assemble_entry(i);
5039         }
5040         else {
5041           add_to_linker((int)out,ba[i],internal);
5042           emit_jmp(0);
5043         }
5044         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5045         if(((u_int)out)&7) emit_addnop(0);
5046         #endif
5047       }
5048     }
5049     else if(nop) {
           // Never-taken branch: only the cycle count update and its stub remain.
5050       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5051       int jaddr=(int)out;
5052       emit_jns(0);
5053       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5054     }
5055     else {
           // taken/nottaken/nottaken1 hold the addresses of forward jumps that
           // are patched with set_jump_target once their destination is known.
5056       int taken=0,nottaken=0,nottaken1=0;
5057       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5058       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5059       if(!only32)
5060       {
             // 64-bit operands: compare the upper halves first.
5061         assert(s1h>=0);
5062         if(opcode[i]==4) // BEQ
5063         {
5064           if(s2h>=0) emit_cmp(s1h,s2h);
5065           else emit_test(s1h,s1h);
5066           nottaken1=(int)out;
5067           emit_jne(1);
5068         }
5069         if(opcode[i]==5) // BNE
5070         {
5071           if(s2h>=0) emit_cmp(s1h,s2h);
5072           else emit_test(s1h,s1h);
5073           if(invert) taken=(int)out;
5074           else add_to_linker((int)out,ba[i],internal);
5075           emit_jne(0);
5076         }
5077         if(opcode[i]==6) // BLEZ
5078         {
5079           emit_test(s1h,s1h);
5080           if(invert) taken=(int)out;
5081           else add_to_linker((int)out,ba[i],internal);
5082           emit_js(0);
5083           nottaken1=(int)out;
5084           emit_jne(1);
5085         }
5086         if(opcode[i]==7) // BGTZ
5087         {
5088           emit_test(s1h,s1h);
5089           nottaken1=(int)out;
5090           emit_js(1);
5091           if(invert) taken=(int)out;
5092           else add_to_linker((int)out,ba[i],internal);
5093           emit_jne(0);
5094         }
5095       } // if(!only32)
5096
5097       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
           // Low-half compare. With invert the opposite condition jumps to the
           // not-taken label; otherwise the condition jumps via the linker.
5098       assert(s1l>=0);
5099       if(opcode[i]==4) // BEQ
5100       {
5101         if(s2l>=0) emit_cmp(s1l,s2l);
5102         else emit_test(s1l,s1l);
5103         if(invert){
5104           nottaken=(int)out;
5105           emit_jne(1);
5106         }else{
5107           add_to_linker((int)out,ba[i],internal);
5108           emit_jeq(0);
5109         }
5110       }
5111       if(opcode[i]==5) // BNE
5112       {
5113         if(s2l>=0) emit_cmp(s1l,s2l);
5114         else emit_test(s1l,s1l);
5115         if(invert){
5116           nottaken=(int)out;
5117           emit_jeq(1);
5118         }else{
5119           add_to_linker((int)out,ba[i],internal);
5120           emit_jne(0);
5121         }
5122       }
5123       if(opcode[i]==6) // BLEZ
5124       {
             // Comparing against 1 turns "<= 0" into "< 1".
5125         emit_cmpimm(s1l,1);
5126         if(invert){
5127           nottaken=(int)out;
5128           emit_jge(1);
5129         }else{
5130           add_to_linker((int)out,ba[i],internal);
5131           emit_jl(0);
5132         }
5133       }
5134       if(opcode[i]==7) // BGTZ
5135       {
             // Comparing against 1 turns "> 0" into ">= 1".
5136         emit_cmpimm(s1l,1);
5137         if(invert){
5138           nottaken=(int)out;
5139           emit_jl(1);
5140         }else{
5141           add_to_linker((int)out,ba[i],internal);
5142           emit_jge(0);
5143         }
5144       }
5145       if(invert) {
             // Inline taken path: the early "taken" jump from the upper-half
             // compare lands here.
5146         if(taken) set_jump_target(taken,(int)out);
5147         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5148         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5149           if(adj) {
5150             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5151             add_to_linker((int)out,ba[i],internal);
5152           }else{
5153             emit_addnop(13);
5154             add_to_linker((int)out,ba[i],internal*2);
5155           }
5156           emit_jmp(0);
5157         }else
5158         #endif
5159         {
5160           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5161           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5162           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5163           if(internal)
5164             assem_debug("branch: internal\n");
5165           else
5166             assem_debug("branch: external\n");
5167           if(internal&&is_ds[(ba[i]-start)>>2]) {
5168             ds_assemble_entry(i);
5169           }
5170           else {
5171             add_to_linker((int)out,ba[i],internal);
5172             emit_jmp(0);
5173           }
5174         }
             // The not-taken path continues after the inline taken code.
5175         set_jump_target(nottaken,(int)out);
5176       }
5177
5178       if(nottaken1) set_jump_target(nottaken1,(int)out);
5179       if(adj) {
5180         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5181       }
5182     } // (!unconditional)
5183   } // if(ooo)
5184   else
5185   {
5186     // In-order execution (branch first)
5187     //if(likely[i]) printf("IOL\n");
5188     //else
5189     //printf("IOE\n");
5190     int taken=0,nottaken=0,nottaken1=0;
5191     if(!unconditional&&!nop) {
           // The 0x2f mask ignores opcode bit 0x10, so encodings 0x14-0x17
           // (presumably the "likely" forms, cf. likely[i]) match as well.
5192       if(!only32)
5193       {
5194         assert(s1h>=0);
5195         if((opcode[i]&0x2f)==4) // BEQ
5196         {
5197           if(s2h>=0) emit_cmp(s1h,s2h);
5198           else emit_test(s1h,s1h);
5199           nottaken1=(int)out;
5200           emit_jne(2);
5201         }
5202         if((opcode[i]&0x2f)==5) // BNE
5203         {
5204           if(s2h>=0) emit_cmp(s1h,s2h);
5205           else emit_test(s1h,s1h);
5206           taken=(int)out;
5207           emit_jne(1);
5208         }
5209         if((opcode[i]&0x2f)==6) // BLEZ
5210         {
5211           emit_test(s1h,s1h);
5212           taken=(int)out;
5213           emit_js(1);
5214           nottaken1=(int)out;
5215           emit_jne(2);
5216         }
5217         if((opcode[i]&0x2f)==7) // BGTZ
5218         {
5219           emit_test(s1h,s1h);
5220           nottaken1=(int)out;
5221           emit_js(2);
5222           taken=(int)out;
5223           emit_jne(1);
5224         }
5225       } // if(!only32)
5226
5227       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
           // Low-half compare: always emits the inverse condition, jumping to
           // the not-taken path; the taken path follows inline.
5228       assert(s1l>=0);
5229       if((opcode[i]&0x2f)==4) // BEQ
5230       {
5231         if(s2l>=0) emit_cmp(s1l,s2l);
5232         else emit_test(s1l,s1l);
5233         nottaken=(int)out;
5234         emit_jne(2);
5235       }
5236       if((opcode[i]&0x2f)==5) // BNE
5237       {
5238         if(s2l>=0) emit_cmp(s1l,s2l);
5239         else emit_test(s1l,s1l);
5240         nottaken=(int)out;
5241         emit_jeq(2);
5242       }
5243       if((opcode[i]&0x2f)==6) // BLEZ
5244       {
5245         emit_cmpimm(s1l,1);
5246         nottaken=(int)out;
5247         emit_jge(2);
5248       }
5249       if((opcode[i]&0x2f)==7) // BGTZ
5250       {
5251         emit_cmpimm(s1l,1);
5252         nottaken=(int)out;
5253         emit_jl(2);
5254       }
5255     } // if(!unconditional)
5256     int adj;
         // Registers the delay slot reads (and the upper-half dependencies of
         // its destination, when that upper half is needed) must stay live.
5257     uint64_t ds_unneeded=branch_regs[i].u;
5258     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5259     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5260     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5261     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5262     ds_unneeded|=1;
5263     ds_unneeded_upper|=1;
5264     // branch taken
5265     if(!nop) {
5266       if(taken) set_jump_target(taken,(int)out);
5267       assem_debug("1:\n");
5268       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5269                     ds_unneeded,ds_unneeded_upper);
5270       // load regs
5271       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5272       address_generation(i+1,&branch_regs[i],0);
5273       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5274       ds_assemble(i+1,&branch_regs[i]);
5275       cc=get_reg(branch_regs[i].regmap,CCREG);
5276       if(cc==-1) {
5277         emit_loadreg(CCREG,cc=HOST_CCREG);
5278         // CHECK: Is the following instruction (fall thru) allocated ok?
5279       }
5280       assert(cc==HOST_CCREG);
5281       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5282       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5283       assem_debug("cycle count (adj)\n");
5284       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5285       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5286       if(internal)
5287         assem_debug("branch: internal\n");
5288       else
5289         assem_debug("branch: external\n");
5290       if(internal&&is_ds[(ba[i]-start)>>2]) {
5291         ds_assemble_entry(i);
5292       }
5293       else {
5294         add_to_linker((int)out,ba[i],internal);
5295         emit_jmp(0);
5296       }
5297     }
5298     // branch not taken
         // Restore cop1_usable to its value from before the taken path was assembled.
5299     cop1_usable=prev_cop1_usable;
5300     if(!unconditional) {
5301       if(nottaken1) set_jump_target(nottaken1,(int)out);
5302       set_jump_target(nottaken,(int)out);
5303       assem_debug("2:\n");
           // For likely[i] branches the delay slot is not assembled on this path.
5304       if(!likely[i]) {
5305         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5306                       ds_unneeded,ds_unneeded_upper);
5307         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5308         address_generation(i+1,&branch_regs[i],0);
5309         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5310         ds_assemble(i+1,&branch_regs[i]);
5311       }
5312       cc=get_reg(branch_regs[i].regmap,CCREG);
5313       if(cc==-1&&!likely[i]) {
5314         // Cycle count isn't in a register, temporarily load it then write it out
5315         emit_loadreg(CCREG,HOST_CCREG);
5316         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5317         int jaddr=(int)out;
5318         emit_jns(0);
5319         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5320         emit_storereg(CCREG,HOST_CCREG);
5321       }
5322       else{
5323         cc=get_reg(i_regmap,CCREG);
5324         assert(cc==HOST_CCREG);
5325         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5326         int jaddr=(int)out;
5327         emit_jns(0);
5328         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5329       }
5330     }
5331   }
5332 }
5333
5334 void sjump_assemble(int i,struct regstat *i_regs)
5335 {
       // Emits host code for the sign-test branch at instruction index i
       // (the BLTZ/BGEZ family selected by opcode2[i], including the link and
       // likely variants named in the comments below); the delay slot is i+1 and
       // the target address is ba[i].
       // As in cjump_assemble there are two layouts: ooo[i] set -> delay slot
       // first, otherwise the test is emitted first and the delay slot is
       // assembled separately on the taken and not-taken paths.
       //   i      - index of the branch within the block being compiled
       //   i_regs - register state on entry to the branch
5336   signed char *i_regmap=i_regs->regmap;
5337   int cc;
5338   int match;
5339   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5340   assem_debug("smatch=%d\n",match);
5341   int s1h,s1l;
5342   int prev_cop1_usable=cop1_usable;
5343   int unconditional=0,nevertaken=0;
5344   int only32=0;
5345   int invert=0;
5346   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5347   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
       // invert: emit the opposite condition to skip over an inline taken path
       // instead of branching straight to the target.
5348   if(!match) invert=1;
5349   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5350   if(i>(ba[i]-start)>>2) invert=1;
5351   #endif
5352
5353   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5354   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5355
       // Low (l) and high (h, register number |64) host registers of the tested
       // operand, taken from the mapping live when the test is emitted.
5356   if(ooo[i]) {
5357     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5358     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5359   }
5360   else {
5361     s1l=get_reg(i_regmap,rs1[i]);
5362     s1h=get_reg(i_regmap,rs1[i]|64);
5363   }
5364   if(rs1[i]==0)
5365   {
         // Testing register 0 has a fixed outcome: odd opcode2 values are
         // always taken, even ones never.
5366     if(opcode2[i]&1) unconditional=1;
5367     else nevertaken=1;
5368     // These are never taken (r0 is never less than zero)
5369     //assert(opcode2[i]!=0);
5370     //assert(opcode2[i]!=2);
5371     //assert(opcode2[i]!=0x10);
5372     //assert(opcode2[i]!=0x12);
5373   }
5374   else {
5375     only32=(regs[i].was32>>rs1[i])&1;
5376   }
5377
5378   if(ooo[i]) {
5379     // Out of order execution (delay slot first)
5380     //printf("OOOE\n");
5381     address_generation(i+1,i_regs,regs[i].regmap_entry);
5382     ds_assemble(i+1,i_regs);
5383     int adj;
5384     uint64_t bc_unneeded=branch_regs[i].u;
5385     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5386     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5387     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5388     bc_unneeded|=1;
5389     bc_unneeded_upper|=1;
5390     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5391                   bc_unneeded,bc_unneeded_upper);
5392     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5393     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5394     if(rt1[i]==31) {
5395       int rt,return_address;
5396       rt=get_reg(branch_regs[i].regmap,31);
5397       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5398       if(rt>=0) {
5399         // Save the PC even if the branch is not taken
5400         return_address=start+i*4+8;
5401         emit_movimm(return_address,rt); // PC into link register
5402         #ifdef IMM_PREFETCH
5403         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5404         #endif
5405       }
5406     }
5407     cc=get_reg(branch_regs[i].regmap,CCREG);
5408     assert(cc==HOST_CCREG);
5409     if(unconditional)
5410       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5411     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5412     assem_debug("cycle count (adj)\n");
5413     if(unconditional) {
5414       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
           // Nothing more is emitted when the branch targets itself and the
           // delay-slot word is 0 (presumably do_cc already handles that idle loop).
5415       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5416         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5417         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5418         if(internal)
5419           assem_debug("branch: internal\n");
5420         else
5421           assem_debug("branch: external\n");
5422         if(internal&&is_ds[(ba[i]-start)>>2]) {
5423           ds_assemble_entry(i);
5424         }
5425         else {
5426           add_to_linker((int)out,ba[i],internal);
5427           emit_jmp(0);
5428         }
5429         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5430         if(((u_int)out)&7) emit_addnop(0);
5431         #endif
5432       }
5433     }
5434     else if(nevertaken) {
           // Never-taken branch: only the cycle count update and its stub remain.
5435       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5436       int jaddr=(int)out;
5437       emit_jns(0);
5438       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5439     }
5440     else {
5441       int nottaken=0;
5442       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5443       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
           // The sign is tested on the upper half for 64-bit values and on the
           // lower half when the operand is flagged 32-bit (only32).
5444       if(!only32)
5445       {
5446         assert(s1h>=0);
5447         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5448         {
5449           emit_test(s1h,s1h);
5450           if(invert){
5451             nottaken=(int)out;
5452             emit_jns(1);
5453           }else{
5454             add_to_linker((int)out,ba[i],internal);
5455             emit_js(0);
5456           }
5457         }
5458         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5459         {
5460           emit_test(s1h,s1h);
5461           if(invert){
5462             nottaken=(int)out;
5463             emit_js(1);
5464           }else{
5465             add_to_linker((int)out,ba[i],internal);
5466             emit_jns(0);
5467           }
5468         }
5469       } // if(!only32)
5470       else
5471       {
5472         assert(s1l>=0);
5473         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5474         {
5475           emit_test(s1l,s1l);
5476           if(invert){
5477             nottaken=(int)out;
5478             emit_jns(1);
5479           }else{
5480             add_to_linker((int)out,ba[i],internal);
5481             emit_js(0);
5482           }
5483         }
5484         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5485         {
5486           emit_test(s1l,s1l);
5487           if(invert){
5488             nottaken=(int)out;
5489             emit_js(1);
5490           }else{
5491             add_to_linker((int)out,ba[i],internal);
5492             emit_jns(0);
5493           }
5494         }
5495       } // if(!only32)
5496
5497       if(invert) {
             // Inline taken path; the inverted test above jumps past it.
5498         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5499         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5500           if(adj) {
5501             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5502             add_to_linker((int)out,ba[i],internal);
5503           }else{
5504             emit_addnop(13);
5505             add_to_linker((int)out,ba[i],internal*2);
5506           }
5507           emit_jmp(0);
5508         }else
5509         #endif
5510         {
5511           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5512           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5513           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5514           if(internal)
5515             assem_debug("branch: internal\n");
5516           else
5517             assem_debug("branch: external\n");
5518           if(internal&&is_ds[(ba[i]-start)>>2]) {
5519             ds_assemble_entry(i);
5520           }
5521           else {
5522             add_to_linker((int)out,ba[i],internal);
5523             emit_jmp(0);
5524           }
5525         }
5526         set_jump_target(nottaken,(int)out);
5527       }
5528
5529       if(adj) {
5530         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5531       }
5532     } // (!unconditional)
5533   } // if(ooo)
5534   else
5535   {
5536     // In-order execution (branch first)
5537     //printf("IOE\n");
5538     int nottaken=0;
5539     if(rt1[i]==31) {
5540       int rt,return_address;
5541       rt=get_reg(branch_regs[i].regmap,31);
5542       if(rt>=0) {
5543         // Save the PC even if the branch is not taken
5544         return_address=start+i*4+8;
5545         emit_movimm(return_address,rt); // PC into link register
5546         #ifdef IMM_PREFETCH
5547         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5548         #endif
5549       }
5550     }
5551     if(!unconditional) {
5552       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
           // The 0x0d mask ignores opcode2 bits 0x02 and 0x10, so the likely and
           // link variants share the same test. The inverse condition is always
           // emitted here, jumping to the not-taken path.
5553       if(!only32)
5554       {
5555         assert(s1h>=0);
5556         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5557         {
5558           emit_test(s1h,s1h);
5559           nottaken=(int)out;
5560           emit_jns(1);
5561         }
5562         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5563         {
5564           emit_test(s1h,s1h);
5565           nottaken=(int)out;
5566           emit_js(1);
5567         }
5568       } // if(!only32)
5569       else
5570       {
5571         assert(s1l>=0);
5572         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5573         {
5574           emit_test(s1l,s1l);
5575           nottaken=(int)out;
5576           emit_jns(1);
5577         }
5578         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5579         {
5580           emit_test(s1l,s1l);
5581           nottaken=(int)out;
5582           emit_js(1);
5583         }
5584       }
5585     } // if(!unconditional)
5586     int adj;
         // Registers the delay slot reads (and the upper-half dependencies of
         // its destination, when that upper half is needed) must stay live.
5587     uint64_t ds_unneeded=branch_regs[i].u;
5588     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5589     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5590     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5591     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5592     ds_unneeded|=1;
5593     ds_unneeded_upper|=1;
5594     // branch taken
5595     if(!nevertaken) {
5596       //assem_debug("1:\n");
5597       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5598                     ds_unneeded,ds_unneeded_upper);
5599       // load regs
5600       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5601       address_generation(i+1,&branch_regs[i],0);
5602       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5603       ds_assemble(i+1,&branch_regs[i]);
5604       cc=get_reg(branch_regs[i].regmap,CCREG);
5605       if(cc==-1) {
5606         emit_loadreg(CCREG,cc=HOST_CCREG);
5607         // CHECK: Is the following instruction (fall thru) allocated ok?
5608       }
5609       assert(cc==HOST_CCREG);
5610       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5611       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5612       assem_debug("cycle count (adj)\n");
5613       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5614       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5615       if(internal)
5616         assem_debug("branch: internal\n");
5617       else
5618         assem_debug("branch: external\n");
5619       if(internal&&is_ds[(ba[i]-start)>>2]) {
5620         ds_assemble_entry(i);
5621       }
5622       else {
5623         add_to_linker((int)out,ba[i],internal);
5624         emit_jmp(0);
5625       }
5626     }
5627     // branch not taken
         // Restore cop1_usable to its value from before the taken path was assembled.
5628     cop1_usable=prev_cop1_usable;
5629     if(!unconditional) {
5630       set_jump_target(nottaken,(int)out);
5631       assem_debug("1:\n");
           // For likely[i] branches the delay slot is not assembled on this path.
5632       if(!likely[i]) {
5633         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5634                       ds_unneeded,ds_unneeded_upper);
5635         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5636         address_generation(i+1,&branch_regs[i],0);
5637         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5638         ds_assemble(i+1,&branch_regs[i]);
5639       }
5640       cc=get_reg(branch_regs[i].regmap,CCREG);
5641       if(cc==-1&&!likely[i]) {
5642         // Cycle count isn't in a register, temporarily load it then write it out
5643         emit_loadreg(CCREG,HOST_CCREG);
5644         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5645         int jaddr=(int)out;
5646         emit_jns(0);
5647         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5648         emit_storereg(CCREG,HOST_CCREG);
5649       }
5650       else{
5651         cc=get_reg(i_regmap,CCREG);
5652         assert(cc==HOST_CCREG);
5653         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5654         int jaddr=(int)out;
5655         emit_jns(0);
5656         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5657       }
5658     }
5659   }
5660 }
5661
// Emits host code for the coprocessor-1 condition branch at instruction
// index i (the inline comments below label the two variants BC1T and BC1F,
// selected by bit 0x10000 of source[i]) together with its delay slot i+1.
//   i      - index of the branch instruction within the block being compiled
//   i_regs - register state at the branch; branch_regs[i] supplies the
//            register mapping used on the branch paths
// Two layouts are produced: when ooo[i] is set the delay slot is assembled
// before the condition test; otherwise the test is emitted first and the
// delay slot is assembled separately on the taken and not-taken paths.
5662 void fjump_assemble(int i,struct regstat *i_regs)
5663 {
5664   signed char *i_regmap=i_regs->regmap;
5665   int cc;
5666   int match;
       // match: result of match_bt() for the branch-time register state and the
       // target address ba[i]
5667   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5668   assem_debug("fmatch=%d\n",match);
5669   int fs,cs;
5670   int eaddr;
5671   int invert=0;
5672   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5673   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
       // When the state does not match, invert the emitted condition: the
       // conditional jump then skips over a taken path that can run
       // store_regs_bt/load_regs_bt before jumping to the target.
5674   if(!match) invert=1;
5675   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5676   if(i>(ba[i]-start)>>2) invert=1;
5677   #endif
5678
       // fs = host register holding FSREG (looked up in the branch mapping when
       // the delay slot is executed first, otherwise in the current mapping)
5679   if(ooo[i]) {
5680     fs=get_reg(branch_regs[i].regmap,FSREG);
5681     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5682   }
5683   else {
5684     fs=get_reg(i_regmap,FSREG);
5685   }
5686
5687   // Check cop1 unusable
       // Tests bit 0x20000000 of the register holding CSREG and jumps to an
       // FP_STUB when it is clear.
       // NOTE(review): presumably the coprocessor-1 enable bit of the status
       // register - confirm.
5688   if(!cop1_usable) {
5689     cs=get_reg(i_regmap,CSREG);
5690     assert(cs>=0);
5691     emit_testimm(cs,0x20000000);
5692     eaddr=(int)out;
5693     emit_jeq(0);
5694     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
5695     cop1_usable=1;
5696   }
5697
5698   if(ooo[i]) {
5699     // Out of order execution (delay slot first)
5700     //printf("OOOE\n");
5701     ds_assemble(i+1,i_regs);
5702     int adj;
         // Registers read by the branch itself must stay live across the
         // write-back/invalidate below; register 0 is always marked unneeded.
5703     uint64_t bc_unneeded=branch_regs[i].u;
5704     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5705     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5706     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5707     bc_unneeded|=1;
5708     bc_unneeded_upper|=1;
5709     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5710                   bc_unneeded,bc_unneeded_upper);
5711     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5712     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5713     cc=get_reg(branch_regs[i].regmap,CCREG);
5714     assert(cc==HOST_CCREG);
5715     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5716     assem_debug("cycle count (adj)\n");
         // The if(1) wrappers below are always true; the comments on their
         // closing braces suggest they replaced conditions that were removed.
5717     if(1) {
5718       int nottaken=0;
5719       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5720       if(1) {
5721         assert(fs>=0);
             // Test bit 0x800000 of the FSREG host register.
             // NOTE(review): presumably the FP condition flag - confirm.
5722         emit_testimm(fs,0x800000);
5723         if(source[i]&0x10000) // BC1T
5724         {
5725           if(invert){
                 // Inverted: jump over the taken path when the bit is clear;
                 // the jump is patched by set_jump_target(nottaken,...) below.
5726             nottaken=(int)out;
5727             emit_jeq(1);
5728           }else{
                 // Direct: conditional jump to the target, resolved by the linker.
5729             add_to_linker((int)out,ba[i],internal);
5730             emit_jne(0);
5731           }
5732         }
5733         else // BC1F
5734           if(invert){
5735             nottaken=(int)out;
5736             emit_jne(1);
5737           }else{
5738             add_to_linker((int)out,ba[i],internal);
5739             emit_jeq(0);
5740           }
5741         {
5742         }
5743       } // if(!only32)
5744
5745       if(invert) {
             // Taken path for the inverted form: adjust the cycle count, bring
             // registers to the state expected at the target, then jump (or
             // assemble the target's delay-slot entry inline).
5746         if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5747         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5748         else if(match) emit_addnop(13);
5749         #endif
5750         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5751         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5752         if(internal)
5753           assem_debug("branch: internal\n");
5754         else
5755           assem_debug("branch: external\n");
5756         if(internal&&is_ds[(ba[i]-start)>>2]) {
5757           ds_assemble_entry(i);
5758         }
5759         else {
5760           add_to_linker((int)out,ba[i],internal);
5761           emit_jmp(0);
5762         }
5763         set_jump_target(nottaken,(int)out);
5764       }
5765
           // Not-taken fall-through for the direct form: add adj back to the
           // cycle counter.
5766       if(adj) {
5767         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5768       }
5769     } // (!unconditional)
5770   } // if(ooo)
5771   else
5772   {
5773     // In-order execution (branch first)
5774     //printf("IOE\n");
5775     int nottaken=0;
5776     if(1) {
5777       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5778       if(1) {
5779         assert(fs>=0);
             // Always emit the inverted test here: jump to the not-taken path,
             // fall through into the taken path.
5780         emit_testimm(fs,0x800000);
5781         if(source[i]&0x10000) // BC1T
5782         {
5783           nottaken=(int)out;
5784           emit_jeq(1);
5785         }
5786         else // BC1F
5787         {
5788           nottaken=(int)out;
5789           emit_jne(1);
5790         }
5791       }
5792     } // if(!unconditional)
5793     int adj;
         // Keep the delay slot's source registers (and, when the upper half of
         // its destination is needed, its dep1/dep2 registers) live across the
         // write-back/invalidate.
5794     uint64_t ds_unneeded=branch_regs[i].u;
5795     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5796     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5797     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5798     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5799     ds_unneeded|=1;
5800     ds_unneeded_upper|=1;
5801     // branch taken
5802     //assem_debug("1:\n");
5803     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5804                   ds_unneeded,ds_unneeded_upper);
5805     // load regs
5806     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5807     address_generation(i+1,&branch_regs[i],0);
5808     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5809     ds_assemble(i+1,&branch_regs[i]);
5810     cc=get_reg(branch_regs[i].regmap,CCREG);
5811     if(cc==-1) {
5812       emit_loadreg(CCREG,cc=HOST_CCREG);
5813       // CHECK: Is the following instruction (fall thru) allocated ok?
5814     }
5815     assert(cc==HOST_CCREG);
5816     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5817     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5818     assem_debug("cycle count (adj)\n");
5819     if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5820     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5821     if(internal)
5822       assem_debug("branch: internal\n");
5823     else
5824       assem_debug("branch: external\n");
5825     if(internal&&is_ds[(ba[i]-start)>>2]) {
5826       ds_assemble_entry(i);
5827     }
5828     else {
5829       add_to_linker((int)out,ba[i],internal);
5830       emit_jmp(0);
5831     }
5832
5833     // branch not taken
5834     if(1) { // <- FIXME (don't need this)
5835       set_jump_target(nottaken,(int)out);
5836       assem_debug("1:\n");
           // For a "likely" branch the delay slot is not assembled on the
           // not-taken path.
5837       if(!likely[i]) {
5838         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5839                       ds_unneeded,ds_unneeded_upper);
5840         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5841         address_generation(i+1,&branch_regs[i],0);
5842         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5843         ds_assemble(i+1,&branch_regs[i]);
5844       }
           // Add the cycles for the branch and emit a CC_STUB reached through
           // the emit_jns(0) jump; its return address is start+i*4+8.
5845       cc=get_reg(branch_regs[i].regmap,CCREG);
5846       if(cc==-1&&!likely[i]) {
5847         // Cycle count isn't in a register, temporarily load it then write it out
5848         emit_loadreg(CCREG,HOST_CCREG);
5849         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5850         int jaddr=(int)out;
5851         emit_jns(0);
5852         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5853         emit_storereg(CCREG,HOST_CCREG);
5854       }
5855       else{
5856         cc=get_reg(i_regmap,CCREG);
5857         assert(cc==HOST_CCREG);
5858         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5859         int jaddr=(int)out;
5860         emit_jns(0);
5861         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5862       }
5863     }
5864   }
5865 }
5866
// Emits host code for the branch at instruction index i by resolving its
// destination at run time: the destination (ba[i] or the fall-through address
// start+i*4+8) is selected with compare + conditional-move sequences, copied
// into HOST_BTREG, and control then leaves through an external jump to
// address start+i*4+5.
//   i      - index of the branch instruction within the block being compiled
//   i_regs - register state at the branch
// For "likely" branches a conditional jump to a separate not-taken exit
// (external jump to start+i*4+8) is emitted instead of a conditional move.
// NOTE(review): presumably used when the delay slot lies outside this block;
// pagespan_ds registers its entry point at start+1, matching the odd address
// used here - confirm against the callers.
5867 static void pagespan_assemble(int i,struct regstat *i_regs)
5868 {
       // Host registers holding the low/high halves of the two source operands
5869   int s1l=get_reg(i_regs->regmap,rs1[i]);
5870   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
5871   int s2l=get_reg(i_regs->regmap,rs2[i]);
5872   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
5873   int taken=0;
5874   int nottaken=0;
5875   int unconditional=0;
       // If one operand is register 0, compare the other one against zero
       // (s2l/s2h = -1 selects emit_test instead of emit_cmp below).
5876   if(rs1[i]==0)
5877   {
5878     s1l=s2l;s1h=s2h;
5879     s2l=s2h=-1;
5880   }
5881   else if(rs2[i]==0)
5882   {
5883     s2l=s2h=-1;
5884   }
       // Both operands flagged in is32: the upper-half comparison is skipped
5885   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
5886     s1h=s2h=-1;
5887   }
       // Pick scratch host registers that are not excluded, not the cycle
       // counter, and not holding either source operand:
       //   addr   - receives the computed target (HOST_BTREG if it is free)
       //   alt    - alternate address for the conditional move
       //   ntaddr - third register, only searched for BLEZ/BGTZ
5888   int hr=0;
5889   int addr=-1,alt=-1,ntaddr=-1;
5890   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
5891   else {
5892     while(hr<HOST_REGS)
5893     {
5894       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5895          (i_regs->regmap[hr]&63)!=rs1[i] &&
5896          (i_regs->regmap[hr]&63)!=rs2[i] )
5897       {
5898         addr=hr++;break;
5899       }
5900       hr++;
5901     }
5902   }
5903   while(hr<HOST_REGS)
5904   {
5905     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5906        (i_regs->regmap[hr]&63)!=rs1[i] &&
5907        (i_regs->regmap[hr]&63)!=rs2[i] )
5908     {
5909       alt=hr++;break;
5910     }
5911     hr++;
5912   }
5913   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5914   {
5915     while(hr<HOST_REGS)
5916     {
5917       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5918          (i_regs->regmap[hr]&63)!=rs1[i] &&
5919          (i_regs->regmap[hr]&63)!=rs2[i] )
5920       {
5921         ntaddr=hr;break;
5922       }
5923       hr++;
5924     }
5925   }
5926   assert(hr<HOST_REGS);
5927   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
5928     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
5929   }
       // Add the cycles for this branch to the cycle counter register
5930   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5931   if(opcode[i]==2) // J
5932   {
5933     unconditional=1;
5934   }
5935   if(opcode[i]==3) // JAL
5936   {
5937     // TODO: mini_ht
         // Write the return address (start+i*4+8) to the register mapped to r31
5938     int rt=get_reg(i_regs->regmap,31);
5939     emit_movimm(start+i*4+8,rt);
5940     unconditional=1;
5941   }
5942   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
5943   {
         // Register-indirect jump: the target is the value of rs1
5944     emit_mov(s1l,addr);
5945     if(opcode2[i]==9) // JALR
5946     {
5947       int rt=get_reg(i_regs->regmap,rt1[i]);
5948       emit_movimm(start+i*4+8,rt);
5949     }
5950   }
5951   if((opcode[i]&0x3f)==4) // BEQ
5952   {
5953     if(rs1[i]==rs2[i])
5954     {
           // Comparing a register with itself: always taken
5955       unconditional=1;
5956     }
5957     else
5958     #ifdef HAVE_CMOV_IMM
5959     if(s1h<0) {
5960       if(s2l>=0) emit_cmp(s1l,s2l);
5961       else emit_test(s1l,s1l);
5962       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
5963     }
5964     else
5965     #endif
5966     {
           // addr=ba[i], alt=start+i*4+8; replace addr with alt if either half
           // compares not-equal
5967       assert(s1l>=0);
5968       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5969       if(s1h>=0) {
5970         if(s2h>=0) emit_cmp(s1h,s2h);
5971         else emit_test(s1h,s1h);
5972         emit_cmovne_reg(alt,addr);
5973       }
5974       if(s2l>=0) emit_cmp(s1l,s2l);
5975       else emit_test(s1l,s1l);
5976       emit_cmovne_reg(alt,addr);
5977     }
5978   }
5979   if((opcode[i]&0x3f)==5) // BNE
5980   {
5981     #ifdef HAVE_CMOV_IMM
5982     if(s1h<0) {
5983       if(s2l>=0) emit_cmp(s1l,s2l);
5984       else emit_test(s1l,s1l);
5985       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
5986     }
5987     else
5988     #endif
5989     {
           // Same sequence as BEQ with the two addresses swapped:
           // addr=start+i*4+8, alt=ba[i]
5990       assert(s1l>=0);
5991       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5992       if(s1h>=0) {
5993         if(s2h>=0) emit_cmp(s1h,s2h);
5994         else emit_test(s1h,s1h);
5995         emit_cmovne_reg(alt,addr);
5996       }
5997       if(s2l>=0) emit_cmp(s1l,s2l);
5998       else emit_test(s1l,s1l);
5999       emit_cmovne_reg(alt,addr);
6000     }
6001   }
6002   if((opcode[i]&0x3f)==0x14) // BEQL
6003   {
         // Likely branches: emit a conditional jump whose address is kept in
         // nottaken and patched to the not-taken exit at the end of this function.
6004     if(s1h>=0) {
6005       if(s2h>=0) emit_cmp(s1h,s2h);
6006       else emit_test(s1h,s1h);
6007       nottaken=(int)out;
6008       emit_jne(0);
6009     }
6010     if(s2l>=0) emit_cmp(s1l,s2l);
6011     else emit_test(s1l,s1l);
6012     if(nottaken) set_jump_target(nottaken,(int)out);
6013     nottaken=(int)out;
6014     emit_jne(0);
6015   }
6016   if((opcode[i]&0x3f)==0x15) // BNEL
6017   {
6018     if(s1h>=0) {
6019       if(s2h>=0) emit_cmp(s1h,s2h);
6020       else emit_test(s1h,s1h);
6021       taken=(int)out;
6022       emit_jne(0);
6023     }
6024     if(s2l>=0) emit_cmp(s1l,s2l);
6025     else emit_test(s1l,s1l);
6026     nottaken=(int)out;
6027     emit_jeq(0);
6028     if(taken) set_jump_target(taken,(int)out);
6029   }
6030   if((opcode[i]&0x3f)==6) // BLEZ
6031   {
         // alt=ba[i], addr=start+i*4+8; take alt when the low word is < 1, with
         // a correction from the sign/non-zero state of the high word when present
6032     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6033     emit_cmpimm(s1l,1);
6034     if(s1h>=0) emit_mov(addr,ntaddr);
6035     emit_cmovl_reg(alt,addr);
6036     if(s1h>=0) {
6037       emit_test(s1h,s1h);
6038       emit_cmovne_reg(ntaddr,addr);
6039       emit_cmovs_reg(alt,addr);
6040     }
6041   }
6042   if((opcode[i]&0x3f)==7) // BGTZ
6043   {
         // addr=ba[i], ntaddr=start+i*4+8; fall back to ntaddr when the low
         // word is < 1, with the same high-word correction as above
6044     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6045     emit_cmpimm(s1l,1);
6046     if(s1h>=0) emit_mov(addr,alt);
6047     emit_cmovl_reg(ntaddr,addr);
6048     if(s1h>=0) {
6049       emit_test(s1h,s1h);
6050       emit_cmovne_reg(alt,addr);
6051       emit_cmovs_reg(ntaddr,addr);
6052     }
6053   }
       // BLEZL, BGTZL and opcode 1 (BLTZ/BGEZ) are not handled here: the
       // asserts below fail if one of them is encountered.
6054   if((opcode[i]&0x3f)==0x16) // BLEZL
6055   {
6056     assert((opcode[i]&0x3f)!=0x16);
6057   }
6058   if((opcode[i]&0x3f)==0x17) // BGTZL
6059   {
6060     assert((opcode[i]&0x3f)!=0x17);
6061   }
6062   assert(opcode[i]!=1); // BLTZ/BGEZ
6063
6064   //FIXME: Check CSREG
       // Coprocessor-1 condition branches: the variant is selected by bits
       // 0x30000 of source[i]; each tests bit 0x800000 of s1l.
6065   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6066     if((source[i]&0x30000)==0) // BC1F
6067     {
6068       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6069       emit_testimm(s1l,0x800000);
6070       emit_cmovne_reg(alt,addr);
6071     }
6072     if((source[i]&0x30000)==0x10000) // BC1T
6073     {
6074       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6075       emit_testimm(s1l,0x800000);
6076       emit_cmovne_reg(alt,addr);
6077     }
6078     if((source[i]&0x30000)==0x20000) // BC1FL
6079     {
6080       emit_testimm(s1l,0x800000);
6081       nottaken=(int)out;
6082       emit_jne(0);
6083     }
6084     if((source[i]&0x30000)==0x30000) // BC1TL
6085     {
6086       emit_testimm(s1l,0x800000);
6087       nottaken=(int)out;
6088       emit_jeq(0);
6089     }
6090   }
6091
       // Write back dirty registers, put the branch target in HOST_BTREG, then
       // leave through an external jump to start+i*4+5.
6092   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6093   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6094   if(likely[i]||unconditional)
6095   {
6096     emit_movimm(ba[i],HOST_BTREG);
6097   }
6098   else if(addr!=HOST_BTREG)
6099   {
6100     emit_mov(addr,HOST_BTREG);
6101   }
6102   void *branch_addr=out;
6103   emit_jmp(0);
6104   int target_addr=start+i*4+5;
6105   void *stub=out;
6106   void *compiled_target_addr=check_addr(target_addr);
6107   emit_extjump_ds((int)branch_addr,target_addr);
       // If check_addr() returned a compiled address, jump straight to it and
       // record the link; otherwise jump to the stub just emitted.
6108   if(compiled_target_addr) {
6109     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6110     add_link(target_addr,stub);
6111   }
6112   else set_jump_target((int)branch_addr,(int)stub);
6113   if(likely[i]) {
6114     // Not-taken path
         // Same exit sequence as above, but to start+i*4+8.
6115     set_jump_target((int)nottaken,(int)out);
6116     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6117     void *branch_addr=out;
6118     emit_jmp(0);
6119     int target_addr=start+i*4+8;
6120     void *stub=out;
6121     void *compiled_target_addr=check_addr(target_addr);
6122     emit_extjump_ds((int)branch_addr,target_addr);
6123     if(compiled_target_addr) {
6124       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6125       add_link(target_addr,stub);
6126     }
6127     else set_jump_target((int)branch_addr,(int)stub);
6128   }
6129 }
6130
6131 // Assemble the delay slot for the above
     // Emits an extra entry point for this block, registered under the address
     // start+1, that assembles instruction 0 as a delay slot and then dispatches
     // on the saved branch target: if it equals start+4 execution continues in
     // this block, otherwise it jumps through jump_vaddr_reg[] for the register
     // holding the target.
6132 static void pagespan_ds()
6133 {
6134   assem_debug("initial delay slot:\n");
       // Register the entry in the jump_dirty and jump_in lists; the jump_dirty
       // entry points at the code emitted by do_dirty_stub_ds(), the jump_in entry
       // at the code that follows it.
6135   u_int vaddr=start+1;
6136   u_int page=get_page(vaddr);
6137   u_int vpage=get_vpage(vaddr);
6138   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6139   do_dirty_stub_ds();
6140   ll_add(jump_in+page,vaddr,(void *)out);
6141   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
       // If instruction 0 does not keep CCREG / BTREG in their host registers,
       // write the cycle count back and save the branch target to branch_target.
6142   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6143     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6144   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6145     emit_writeword(HOST_BTREG,(int)&branch_target);
6146   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6147   address_generation(0,&regs[0],regs[0].regmap_entry);
6148   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6149     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6150   cop1_usable=0;
6151   is_delayslot=0;
       // Dispatch to the assembler for instruction 0's type. Call and jump types
       // only print a message; there is no default case.
6152   switch(itype[0]) {
6153     case ALU:
6154       alu_assemble(0,&regs[0]);break;
6155     case IMM16:
6156       imm16_assemble(0,&regs[0]);break;
6157     case SHIFT:
6158       shift_assemble(0,&regs[0]);break;
6159     case SHIFTIMM:
6160       shiftimm_assemble(0,&regs[0]);break;
6161     case LOAD:
6162       load_assemble(0,&regs[0]);break;
6163     case LOADLR:
6164       loadlr_assemble(0,&regs[0]);break;
6165     case STORE:
6166       store_assemble(0,&regs[0]);break;
6167     case STORELR:
6168       storelr_assemble(0,&regs[0]);break;
6169     case COP0:
6170       cop0_assemble(0,&regs[0]);break;
6171     case COP1:
6172       cop1_assemble(0,&regs[0]);break;
6173     case C1LS:
6174       c1ls_assemble(0,&regs[0]);break;
6175     case COP2:
6176       cop2_assemble(0,&regs[0]);break;
6177     case C2LS:
6178       c2ls_assemble(0,&regs[0]);break;
6179     case C2OP:
6180       c2op_assemble(0,&regs[0]);break;
6181     case FCONV:
6182       fconv_assemble(0,&regs[0]);break;
6183     case FLOAT:
6184       float_assemble(0,&regs[0]);break;
6185     case FCOMP:
6186       fcomp_assemble(0,&regs[0]);break;
6187     case MULTDIV:
6188       multdiv_assemble(0,&regs[0]);break;
6189     case MOV:
6190       mov_assemble(0,&regs[0]);break;
6191     case SYSCALL:
6192     case HLECALL:
6193     case INTCALL:
6194     case SPAN:
6195     case UJUMP:
6196     case RJUMP:
6197     case CJUMP:
6198     case SJUMP:
6199     case FJUMP:
6200       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
6201   }
       // Get the branch target into a host register, reloading it from
       // branch_target if BTREG is not mapped.
6202   int btaddr=get_reg(regs[0].regmap,BTREG);
6203   if(btaddr<0) {
6204     btaddr=get_reg(regs[0].regmap,-1);
6205     emit_readword((int)&branch_target,btaddr);
6206   }
6207   assert(btaddr!=HOST_CCREG);
6208   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
       // Compare the branch target with start+4
6209 #ifdef HOST_IMM8
6210   emit_movimm(start+4,HOST_TEMPREG);
6211   emit_cmp(btaddr,HOST_TEMPREG);
6212 #else
6213   emit_cmpimm(btaddr,start+4);
6214 #endif
6215   int branch=(int)out;
6216   emit_jeq(0);
       // Target is not start+4: store registers and jump via jump_vaddr_reg[]
6217   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6218   emit_jmp(jump_vaddr_reg[btaddr]);
       // Target is start+4: bring registers to the state expected there and
       // fall through to the code emitted after this function returns
6219   set_jump_target(branch,(int)out);
6220   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6221   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6222 }
6223
6224 // Basic liveness analysis for MIPS registers
     // Walks instructions iend down to istart (inclusive) and records, for each
     // instruction i, bitmasks of registers whose values are not needed:
     // unneeded_reg[i], unneeded_reg_upper[i] and gte_unneeded[i], plus
     // branch_unneeded_reg[i] / branch_unneeded_reg_upper[i] for branches.
     // Bit n set means register n is unneeded; bit 0 is always set.
     // Also flags internal branch targets (and return addresses after
     // instructions writing r31) in bt[].
     //   istart, iend - instruction index range to analyse
     //   r            - recursion depth; backward branches re-analyse the loop
     //                  body only while r<2
6225 void unneeded_registers(int istart,int iend,int r)
6226 {
6227   int i;
       // u/uu/gte_u are the running masks written to unneeded_reg[],
       // unneeded_reg_upper[] and gte_unneeded[]; b/bu/gte_bu hold the masks at
       // a conditional branch's target; temp_* are provisional masks used for
       // backward branches.
6228   uint64_t u,uu,gte_u,b,bu,gte_bu;
6229   uint64_t temp_u,temp_uu,temp_gte_u=0;
6230   uint64_t tdep;
       // Value used for the GTE mask wherever the code after a point is unknown
       // (end of block, branch out of block): all-ones when the
       // NDHACK_GTE_UNNEEDED hack is enabled, otherwise 0.
6231   uint64_t gte_u_unknown=0;
6232   if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
6233     gte_u_unknown=~0ll;
6234   if(iend==slen-1) {
6235     u=1;uu=1;
6236     gte_u=gte_u_unknown;
6237   }else{
         // NOTE(review): the two values loaded here are overwritten with 1 on
         // the line after them, so only gte_u is actually seeded from iend+1.
6238     u=unneeded_reg[iend+1];
6239     uu=unneeded_reg_upper[iend+1];
6240     u=1;uu=1;
6241     gte_u=gte_unneeded[iend+1];
6242   }
6243
6244   for (i=iend;i>=istart;i--)
6245   {
6246     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6247     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6248     {
6249       // If subroutine call, flag return address as a possible branch target
6250       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6251
6252       if(ba[i]<start || ba[i]>=(start+slen*4))
6253       {
6254         // Branch out of this block, flush all regs
6255         u=1;
6256         uu=1;
6257         gte_u=gte_u_unknown;
6258         /* Hexagon hack
6259         if(itype[i]==UJUMP&&rt1[i]==31)
6260         {
6261           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6262         }
6263         if(itype[i]==RJUMP&&rs1[i]==31)
6264         {
6265           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6266         }
6267         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6268           if(itype[i]==UJUMP&&rt1[i]==31)
6269           {
6270             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6271             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6272           }
6273           if(itype[i]==RJUMP&&rs1[i]==31)
6274           {
6275             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6276             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6277           }
6278         }*/
6279         branch_unneeded_reg[i]=u;
6280         branch_unneeded_reg_upper[i]=uu;
6281         // Merge in delay slot
             // (registers written by i+1 become unneeded, registers it reads
             // become needed; tdep keeps dep1/dep2 upper halves needed when the
             // upper half of the destination is needed)
6282         tdep=(~uu>>rt1[i+1])&1;
6283         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6284         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6285         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6286         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6287         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6288         u|=1;uu|=1;
6289         gte_u|=gte_rt[i+1];
6290         gte_u&=~gte_rs[i+1];
6291         // If branch is "likely" (and conditional)
6292         // then we skip the delay slot on the fall-thru path
6293         if(likely[i]) {
6294           if(i<slen-1) {
6295             u&=unneeded_reg[i+2];
6296             uu&=unneeded_reg_upper[i+2];
6297             gte_u&=gte_unneeded[i+2];
6298           }
6299           else
6300           {
6301             u=1;
6302             uu=1;
6303             gte_u=gte_u_unknown;
6304           }
6305         }
6306       }
6307       else
6308       {
6309         // Internal branch, flag target
6310         bt[(ba[i]-start)>>2]=1;
6311         if(ba[i]<=start+i*4) {
6312           // Backward branch
               // Compute a provisional result for the branch (temp_*), store it
               // for instruction i, then re-analyse the loop body so the
               // target's masks are available to the code further below.
6313           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6314           {
6315             // Unconditional branch
6316             temp_u=1;temp_uu=1;
6317             temp_gte_u=0;
6318           } else {
6319             // Conditional branch (not taken case)
6320             temp_u=unneeded_reg[i+2];
6321             temp_uu=unneeded_reg_upper[i+2];
                 // NOTE(review): this is &= while the two lines above assign;
                 // temp_gte_u therefore keeps bits from an earlier iteration
                 // (it is only initialised to 0 at function entry) - confirm
                 // this is intended.
6322             temp_gte_u&=gte_unneeded[i+2];
6323           }
6324           // Merge in delay slot
6325           tdep=(~temp_uu>>rt1[i+1])&1;
6326           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6327           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6328           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6329           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6330           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6331           temp_u|=1;temp_uu|=1;
6332           temp_gte_u|=gte_rt[i+1];
6333           temp_gte_u&=~gte_rs[i+1];
6334           // If branch is "likely" (and conditional)
6335           // then we skip the delay slot on the fall-thru path
6336           if(likely[i]) {
6337             if(i<slen-1) {
6338               temp_u&=unneeded_reg[i+2];
6339               temp_uu&=unneeded_reg_upper[i+2];
6340               temp_gte_u&=gte_unneeded[i+2];
6341             }
6342             else
6343             {
6344               temp_u=1;
6345               temp_uu=1;
6346               temp_gte_u=gte_u_unknown;
6347             }
6348           }
               // Merge in the branch instruction itself
6349           tdep=(~temp_uu>>rt1[i])&1;
6350           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6351           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6352           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6353           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6354           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6355           temp_u|=1;temp_uu|=1;
6356           temp_gte_u|=gte_rt[i];
6357           temp_gte_u&=~gte_rs[i];
6358           unneeded_reg[i]=temp_u;
6359           unneeded_reg_upper[i]=temp_uu;
6360           gte_unneeded[i]=temp_gte_u;
6361           // Only go three levels deep.  This recursion can take an
6362           // excessive amount of time if there are a lot of nested loops.
6363           if(r<2) {
6364             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6365           }else{
                 // Depth limit reached: treat everything at the target as needed
                 // (GTE mask falls back to gte_u_unknown)
6366             unneeded_reg[(ba[i]-start)>>2]=1;
6367             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6368             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
6369           }
             // The else is commented out, so the code below runs for both
             // forward and backward internal branches.
6370         } /*else*/ if(1) {
6371           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6372           {
6373             // Unconditional branch
                 // Liveness after the branch is exactly the liveness at the target
6374             u=unneeded_reg[(ba[i]-start)>>2];
6375             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6376             gte_u=gte_unneeded[(ba[i]-start)>>2];
6377             branch_unneeded_reg[i]=u;
6378             branch_unneeded_reg_upper[i]=uu;
6379         //u=1;
6380         //uu=1;
6381         //branch_unneeded_reg[i]=u;
6382         //branch_unneeded_reg_upper[i]=uu;
6383             // Merge in delay slot
6384             tdep=(~uu>>rt1[i+1])&1;
6385             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6386             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6387             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6388             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6389             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6390             u|=1;uu|=1;
6391             gte_u|=gte_rt[i+1];
6392             gte_u&=~gte_rs[i+1];
6393           } else {
6394             // Conditional branch
                 // b/bu/gte_bu: masks at the branch target; they are intersected
                 // with the fall-through masks below, so a register is unneeded
                 // only if it is unneeded on both paths.
6395             b=unneeded_reg[(ba[i]-start)>>2];
6396             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6397             gte_bu=gte_unneeded[(ba[i]-start)>>2];
6398             branch_unneeded_reg[i]=b;
6399             branch_unneeded_reg_upper[i]=bu;
6400         //b=1;
6401         //bu=1;
6402         //branch_unneeded_reg[i]=b;
6403         //branch_unneeded_reg_upper[i]=bu;
6404             // Branch delay slot
                 // NOTE(review): tdep is derived from uu here although the lines
                 // below update bu; the other delay-slot merges in this function
                 // derive tdep from the mask they update - confirm.
6405             tdep=(~uu>>rt1[i+1])&1;
6406             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6407             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6408             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6409             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6410             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6411             b|=1;bu|=1;
6412             gte_bu|=gte_rt[i+1];
6413             gte_bu&=~gte_rs[i+1];
6414             // If branch is "likely" then we skip the
6415             // delay slot on the fall-thru path
6416             if(likely[i]) {
6417               u=b;
6418               uu=bu;
6419               gte_u=gte_bu;
6420               if(i<slen-1) {
6421                 u&=unneeded_reg[i+2];
6422                 uu&=unneeded_reg_upper[i+2];
6423                 gte_u&=gte_unneeded[i+2];
6424         //u=1;
6425         //uu=1;
6426               }
6427             } else {
6428               u&=b;
6429               uu&=bu;
6430               gte_u&=gte_bu;
6431         //u=1;
6432         //uu=1;
6433             }
6434             if(i<slen-1) {
6435               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6436               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6437         //branch_unneeded_reg[i]=1;
6438         //branch_unneeded_reg_upper[i]=1;
6439             } else {
6440               branch_unneeded_reg[i]=1;
6441               branch_unneeded_reg_upper[i]=1;
6442             }
6443           }
6444         }
6445       }
6446     }
6447     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6448     {
6449       // SYSCALL instruction (software interrupt)
           // All registers are treated as needed (gte_u is left unchanged)
6450       u=1;
6451       uu=1;
6452     }
6453     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6454     {
6455       // ERET instruction (return from interrupt)
6456       u=1;
6457       uu=1;
6458     }
6459     //u=uu=1; // DEBUG
         // Apply instruction i itself to the running masks
6460     tdep=(~uu>>rt1[i])&1;
6461     // Written registers are unneeded
6462     u|=1LL<<rt1[i];
6463     u|=1LL<<rt2[i];
6464     uu|=1LL<<rt1[i];
6465     uu|=1LL<<rt2[i];
6466     gte_u|=gte_rt[i];
6467     // Accessed registers are needed
6468     u&=~(1LL<<rs1[i]);
6469     u&=~(1LL<<rs2[i]);
6470     uu&=~(1LL<<us1[i]);
6471     uu&=~(1LL<<us2[i]);
6472     gte_u&=~gte_rs[i];
6473     if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
6474       gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
6475     // Source-target dependencies
6476     uu&=~(tdep<<dep1[i]);
6477     uu&=~(tdep<<dep2[i]);
6478     // R0 is always unneeded
6479     u|=1;uu|=1;
6480     // Save it
6481     unneeded_reg[i]=u;
6482     unneeded_reg_upper[i]=uu;
6483     gte_unneeded[i]=gte_u;
6484     /*
6485     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6486     printf("U:");
6487     int r;
6488     for(r=1;r<=CCREG;r++) {
6489       if((unneeded_reg[i]>>r)&1) {
6490         if(r==HIREG) printf(" HI");
6491         else if(r==LOREG) printf(" LO");
6492         else printf(" r%d",r);
6493       }
6494     }
6495     printf(" UU:");
6496     for(r=1;r<=CCREG;r++) {
6497       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6498         if(r==HIREG) printf(" HI");
6499         else if(r==LOREG) printf(" LO");
6500         else printf(" r%d",r);
6501       }
6502     }
6503     printf("\n");*/
6504   }
       // Overwrite the upper-half masks for the whole range with all-ones,
       // discarding the uu values computed above.
       // NOTE(review): presumably because upper register halves are never used
       // on this (32-bit) target - confirm.
6505   for (i=iend;i>=istart;i--)
6506   {
6507     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
6508   }
6509 }
6510
6511 // Write back dirty registers as soon as we will no longer modify them,
6512 // so that we don't end up with lots of writes at the branches.
     //
     // Walks the instructions iend..istart backwards while maintaining two
     // bitmasks indexed by host register (bit r = host register r; EXCLUDE_REG
     // is never touched):
     //   will_dirty - OR-ed into the dirty flags when wr is nonzero
     //   wont_dirty - AND-ed into the dirty flags when wr is nonzero (skipped
     //                for .dirty/.wasdirty when DESTRUCTIVE_WRITEBACK is
     //                defined), so a cleared bit clears that dirty flag
     // The masks computed for instruction i are stored in will_dirty[i] and
     // wont_dirty[i].  With wr==0 only those arrays are updated; regs[] and
     // branch_regs[] are left alone.  The function calls itself with wr=0 to
     // rescan the body of a backward branch.
     //
     // istart, iend: inclusive instruction index range to process
     // wr:           nonzero to apply the masks to regs[]/branch_regs[]
6513 void clean_registers(int istart,int iend,int wr)
6514 {
6515   int i;
6516   int r;
6517   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6518   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
       // Seed the masks from the instruction after the range; when iend is
       // slen-1 there is nothing to inherit, so start from empty masks.
6519   if(iend==slen-1) {
6520     will_dirty_i=will_dirty_next=0;
6521     wont_dirty_i=wont_dirty_next=0;
6522   }else{
6523     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6524     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6525   }
6526   for (i=iend;i>=istart;i--)
6527   {
6528     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6529     {
6530       if(ba[i]<start || ba[i]>=(start+slen*4))
6531       {
6532         // Branch out of this block, flush all regs
6533         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6534         {
6535           // Unconditional branch
6536           will_dirty_i=0;
6537           wont_dirty_i=0;
6538           // Merge in delay slot (will dirty)
6539           for(r=0;r<HOST_REGS;r++) {
6540             if(r!=EXCLUDE_REG) {
6541               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6542               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6543               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6544               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6545               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6546               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6547               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6548               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6549               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6550               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6551               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6552               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6553               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6554               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6555             }
6556           }
6557         }
6558         else
6559         {
6560           // Conditional branch
6561           will_dirty_i=0;
6562           wont_dirty_i=wont_dirty_next;
6563           // Merge in delay slot (will dirty)
6564           for(r=0;r<HOST_REGS;r++) {
6565             if(r!=EXCLUDE_REG) {
6566               if(!likely[i]) {
6567                 // Might not dirty if likely branch is not taken
6568                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6569                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6570                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6571                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6572                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6573                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6574                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6575                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6576                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6577                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6578                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6579                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6580                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6581                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6582               }
6583             }
6584           }
6585         }
6586         // Merge in delay slot (wont dirty)
6587         for(r=0;r<HOST_REGS;r++) {
6588           if(r!=EXCLUDE_REG) {
6589             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6590             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6591             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6592             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6593             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6594             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6595             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6596             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6597             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6598             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6599           }
6600         }
6601         if(wr) {
6602           #ifndef DESTRUCTIVE_WRITEBACK
6603           branch_regs[i].dirty&=wont_dirty_i;
6604           #endif
6605           branch_regs[i].dirty|=will_dirty_i;
6606         }
6607       }
6608       else
6609       {
6610         // Internal branch
6611         if(ba[i]<=start+i*4) {
6612           // Backward branch
6613           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6614           {
6615             // Unconditional branch
6616             temp_will_dirty=0;
6617             temp_wont_dirty=0;
6618             // Merge in delay slot (will dirty)
6619             for(r=0;r<HOST_REGS;r++) {
6620               if(r!=EXCLUDE_REG) {
6621                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6622                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6623                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6624                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6625                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6626                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6627                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6628                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6629                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6630                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6631                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6632                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6633                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6634                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6635               }
6636             }
6637           } else {
6638             // Conditional branch (not taken case)
6639             temp_will_dirty=will_dirty_next;
6640             temp_wont_dirty=wont_dirty_next;
6641             // Merge in delay slot (will dirty)
6642             for(r=0;r<HOST_REGS;r++) {
6643               if(r!=EXCLUDE_REG) {
6644                 if(!likely[i]) {
6645                   // Will not dirty if likely branch is not taken
6646                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6647                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6648                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6649                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6650                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6651                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
6652                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6653                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6654                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6655                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6656                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6657                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6658                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6659                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6660                 }
6661               }
6662             }
6663           }
6664           // Merge in delay slot (wont dirty)
6665           for(r=0;r<HOST_REGS;r++) {
6666             if(r!=EXCLUDE_REG) {
6667               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6668               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6669               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6670               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6671               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6672               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6673               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6674               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6675               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6676               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6677             }
6678           }
6679           // Deal with changed mappings
6680           if(i<iend) {
6681             for(r=0;r<HOST_REGS;r++) {
6682               if(r!=EXCLUDE_REG) {
6683                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
6684                   temp_will_dirty&=~(1<<r);
6685                   temp_wont_dirty&=~(1<<r);
6686                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6687                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6688                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6689                   } else {
6690                     temp_will_dirty|=1<<r;
6691                     temp_wont_dirty|=1<<r;
6692                   }
6693                 }
6694               }
6695             }
6696           }
               // With wr set: publish provisional masks for this branch, then
               // rescan the loop body (branch target .. i-1) with wr=0 so only
               // will_dirty[]/wont_dirty[] are refreshed.  With wr clear (we are
               // already inside such a rescan): do not recurse again, just reset
               // the masks stored at the branch target to 0 / all ones.
6697           if(wr) {
6698             will_dirty[i]=temp_will_dirty;
6699             wont_dirty[i]=temp_wont_dirty;
6700             clean_registers((ba[i]-start)>>2,i-1,0);
6701           }else{
6702             // Limit recursion.  It can take an excessive amount
6703             // of time if there are a lot of nested loops.
6704             will_dirty[(ba[i]-start)>>2]=0;
6705             wont_dirty[(ba[i]-start)>>2]=-1;
6706           }
6707         }
             // The "else" is commented out, so this merge with the masks stored
             // at the branch target runs for backward branches as well as
             // forward ones.
6708         /*else*/ if(1)
6709         {
6710           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6711           {
6712             // Unconditional branch
6713             will_dirty_i=0;
6714             wont_dirty_i=0;
6715           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6716             for(r=0;r<HOST_REGS;r++) {
6717               if(r!=EXCLUDE_REG) {
6718                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6719                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
6720                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6721                 }
6722                 if(branch_regs[i].regmap[r]>=0) {
6723                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6724                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6725                 }
6726               }
6727             }
6728           //}
6729             // Merge in delay slot
6730             for(r=0;r<HOST_REGS;r++) {
6731               if(r!=EXCLUDE_REG) {
6732                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6733                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6734                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6735                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6736                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6737                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6738                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6739                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6740                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6741                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6742                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6743                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6744                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6745                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6746               }
6747             }
6748           } else {
6749             // Conditional branch
6750             will_dirty_i=will_dirty_next;
6751             wont_dirty_i=wont_dirty_next;
6752           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6753             for(r=0;r<HOST_REGS;r++) {
6754               if(r!=EXCLUDE_REG) {
6755                 signed char target_reg=branch_regs[i].regmap[r];
6756                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6757                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6758                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6759                 }
6760                 else if(target_reg>=0) {
6761                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6762                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6763                 }
6764                 // Treat delay slot as part of branch too
6765                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6766                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6767                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6768                 }
6769                 else
6770                 {
6771                   will_dirty[i+1]&=~(1<<r);
6772                 }*/
6773               }
6774             }
6775           //}
6776             // Merge in delay slot
6777             for(r=0;r<HOST_REGS;r++) {
6778               if(r!=EXCLUDE_REG) {
6779                 if(!likely[i]) {
6780                   // Might not dirty if likely branch is not taken
6781                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6782                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6783                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6784                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6785                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6786                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6787                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6788                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6789                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6790                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6791                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6792                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6793                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6794                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6795                 }
6796               }
6797             }
6798           }
6799           // Merge in delay slot (won't dirty)
6800           for(r=0;r<HOST_REGS;r++) {
6801             if(r!=EXCLUDE_REG) {
6802               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6803               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6804               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6805               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6806               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6807               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6808               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6809               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6810               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6811               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6812             }
6813           }
6814           if(wr) {
6815             #ifndef DESTRUCTIVE_WRITEBACK
6816             branch_regs[i].dirty&=wont_dirty_i;
6817             #endif
6818             branch_regs[i].dirty|=will_dirty_i;
6819           }
6820         }
6821       }
6822     }
6823     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6824     {
6825       // SYSCALL instruction (software interrupt)
6826       will_dirty_i=0;
6827       wont_dirty_i=0;
6828     }
6829     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6830     {
6831       // ERET instruction (return from interrupt)
6832       will_dirty_i=0;
6833       wont_dirty_i=0;
6834     }
         // Snapshot the masks before this instruction's own rt1/rt2 are merged
         // in; the next iteration (instruction i-1) reads them as *_next.
6835     will_dirty_next=will_dirty_i;
6836     wont_dirty_next=wont_dirty_i;
6837     for(r=0;r<HOST_REGS;r++) {
6838       if(r!=EXCLUDE_REG) {
6839         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6840         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6841         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6842         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6843         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6844         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6845         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6846         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6847         if(i>istart) {
6848           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP)
6849           {
6850             // Don't store a register immediately after writing it,
6851             // may prevent dual-issue.
6852             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
6853             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
6854           }
6855         }
6856       }
6857     }
6858     // Save it
6859     will_dirty[i]=will_dirty_i;
6860     wont_dirty[i]=wont_dirty_i;
6861     // Mark registers that won't be dirtied as not dirty
6862     if(wr) {
6863       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
6864       for(r=0;r<HOST_REGS;r++) {
6865         if((will_dirty_i>>r)&1) {
6866           printf(" r%d",r);
6867         }
6868       }
6869       printf("\n");*/
6870
6871       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
6872         regs[i].dirty|=will_dirty_i;
6873         #ifndef DESTRUCTIVE_WRITEBACK
6874         regs[i].dirty&=wont_dirty_i;
             // Also clear wasdirty on the following instruction for host
             // registers whose mapping is unchanged: i+2 after a conditional
             // branch (skipping its delay slot), i+1 after a non-branch.
6875         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6876         {
6877           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
6878             for(r=0;r<HOST_REGS;r++) {
6879               if(r!=EXCLUDE_REG) {
6880                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
6881                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
6882                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6883               }
6884             }
6885           }
6886         }
6887         else
6888         {
6889           if(i<iend) {
6890             for(r=0;r<HOST_REGS;r++) {
6891               if(r!=EXCLUDE_REG) {
6892                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
6893                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
6894                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6895               }
6896             }
6897           }
6898         }
6899         #endif
6900       //}
6901     }
6902     // Deal with changed mappings
         // Translate the masks from the mapping used by instruction i
         // (regs[i].regmap) back to the mapping in effect before it
         // (regmap_pre[i]), so they are valid for instruction i-1.
6903     temp_will_dirty=will_dirty_i;
6904     temp_wont_dirty=wont_dirty_i;
6905     for(r=0;r<HOST_REGS;r++) {
6906       if(r!=EXCLUDE_REG) {
6907         int nr;
6908         if(regs[i].regmap[r]==regmap_pre[i][r]) {
6909           if(wr) {
6910             #ifndef DESTRUCTIVE_WRITEBACK
6911             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6912             #endif
6913             regs[i].wasdirty|=will_dirty_i&(1<<r);
6914           }
6915         }
6916         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
6917           // Register moved to a different register
6918           will_dirty_i&=~(1<<r);
6919           wont_dirty_i&=~(1<<r);
6920           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
6921           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
6922           if(wr) {
6923             #ifndef DESTRUCTIVE_WRITEBACK
6924             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6925             #endif
6926             regs[i].wasdirty|=will_dirty_i&(1<<r);
6927           }
6928         }
6929         else {
               // The previous mapping is no longer held in any host register:
               // take the bit from unneeded_reg[i] for guest registers 1..33,
               // otherwise set only the wont_dirty bit.
6930           will_dirty_i&=~(1<<r);
6931           wont_dirty_i&=~(1<<r);
6932           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6933             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6934             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6935           } else {
6936             wont_dirty_i|=1<<r;
6937             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
6938           }
6939         }
6940       }
6941     }
6942   }
6943 }
6944
#ifndef DISASM
/* Disassembly output is compiled out: listing an instruction is a no-op. */
static void disassemble_inst(int i) {}
#else
/*
 * Print a one-line listing of decoded instruction i to stdout.
 *
 * The line starts with '*' when bt[i] is set and ' ' otherwise, followed by
 * the instruction address (start+i*4), its mnemonic (insn[i]) and operands
 * chosen according to itype[i].
 */
void disassemble_inst(int i)
{
  const u_int pc = start + i * 4;
  const char *name = insn[i];

  if (bt[i]) {
    printf("*");
  } else {
    printf(" ");
  }

  switch (itype[i]) {
  case UJUMP:
  case FJUMP:
    printf (" %x: %s %8x\n", pc, name, ba[i]);
    break;

  case CJUMP:
    // For i==0 the stored target ba[0] is printed instead of the value
    // decoded from the 16-bit offset field.
    printf (" %x: %s r%d,r%d,%8x\n", pc, name, rs1[i], rs2[i],
            i ? pc+4+((signed int)((unsigned int)source[i]<<16)>>14) : *ba);
    break;

  case SJUMP:
    printf (" %x: %s r%d,%8x\n", pc, name, rs1[i],
            pc+4+((signed int)((unsigned int)source[i]<<16)>>14));
    break;

  case RJUMP:
    if (opcode[i] == 0x9 && rt1[i] != 31) {
      printf (" %x: %s r%d,r%d\n", pc, name, rt1[i], rs1[i]);
    } else {
      printf (" %x: %s r%d\n", pc, name, rs1[i]);
    }
    break;

  case SPAN:
    printf (" %x: %s (pagespan) r%d,r%d,%8x\n", pc, name, rs1[i], rs2[i], ba[i]);
    break;

  case IMM16:
    if (opcode[i] == 0xf) { // LUI
      printf (" %x: %s r%d,%4x0000\n", pc, name, rt1[i], imm[i]&0xffff);
    } else {
      printf (" %x: %s r%d,r%d,%d\n", pc, name, rt1[i], rs1[i], imm[i]);
    }
    break;

  case LOAD:
  case LOADLR:
    printf (" %x: %s r%d,r%d+%x\n", pc, name, rt1[i], rs1[i], imm[i]);
    break;

  case STORE:
  case STORELR:
    printf (" %x: %s r%d,r%d+%x\n", pc, name, rs2[i], rs1[i], imm[i]);
    break;

  case ALU:
  case SHIFT:
    printf (" %x: %s r%d,r%d,r%d\n", pc, name, rt1[i], rs1[i], rs2[i]);
    break;

  case MULTDIV:
    printf (" %x: %s r%d,r%d\n", pc, name, rs1[i], rs2[i]);
    break;

  case SHIFTIMM:
    printf (" %x: %s r%d,r%d,%d\n", pc, name, rt1[i], rs1[i], imm[i]);
    break;

  case MOV:
    switch (opcode2[i] & 0x1d) {
    case 0x10:
      printf (" %x: %s r%d\n", pc, name, rt1[i]);
      break;
    case 0x11:
      printf (" %x: %s r%d\n", pc, name, rs1[i]);
      break;
    default:
      printf (" %x: %s\n", pc, name);
      break;
    }
    break;

  case COP0:
    if (opcode2[i] == 0) { // MFC0
      printf (" %x: %s r%d,cpr0[%d]\n", pc, name, rt1[i], (source[i]>>11)&0x1f);
    } else if (opcode2[i] == 4) { // MTC0
      printf (" %x: %s r%d,cpr0[%d]\n", pc, name, rs1[i], (source[i]>>11)&0x1f);
    } else {
      printf (" %x: %s\n", pc, name);
    }
    break;

  case COP1:
    if (opcode2[i] < 3) { // MFC1
      printf (" %x: %s r%d,cpr1[%d]\n", pc, name, rt1[i], (source[i]>>11)&0x1f);
    } else if (opcode2[i] > 3) { // MTC1
      printf (" %x: %s r%d,cpr1[%d]\n", pc, name, rs1[i], (source[i]>>11)&0x1f);
    } else {
      printf (" %x: %s\n", pc, name);
    }
    break;

  case COP2:
    if (opcode2[i] < 3) { // MFC2
      printf (" %x: %s r%d,cpr2[%d]\n", pc, name, rt1[i], (source[i]>>11)&0x1f);
    } else if (opcode2[i] > 3) { // MTC2
      printf (" %x: %s r%d,cpr2[%d]\n", pc, name, rs1[i], (source[i]>>11)&0x1f);
    } else {
      printf (" %x: %s\n", pc, name);
    }
    break;

  case C1LS:
    printf (" %x: %s cpr1[%d],r%d+%x\n", pc, name, (source[i]>>16)&0x1f, rs1[i], imm[i]);
    break;

  case C2LS:
    printf (" %x: %s cpr2[%d],r%d+%x\n", pc, name, (source[i]>>16)&0x1f, rs1[i], imm[i]);
    break;

  case INTCALL:
    printf (" %x: %s (INTCALL)\n", pc, name);
    break;

  default:
    // Any other type: address and mnemonic only.
    printf (" %x: %s\n", pc, name);
    break;
  }
}
#endif // DISASM
7037
7038 #define DRC_TEST_VAL 0x74657374
7039
7040 static int new_dynarec_test(void)
7041 {
7042   int (*testfunc)(void) = (void *)out;
7043   void *beginning;
7044   int ret;
7045
7046   beginning = start_block();
7047   emit_movimm(DRC_TEST_VAL,0); // test
7048   emit_jmpreg(14);
7049   literal_pool(0);
7050   end_block(beginning);
7051   SysPrintf("testing if we can run recompiled code..\n");
7052   ret = testfunc();
7053   if (ret == DRC_TEST_VAL)
7054     SysPrintf("test passed.\n");
7055   else
7056     SysPrintf("test failed: %08x\n", ret);
7057   out=(u_char *)BASE_ADDR;
7058   return ret == DRC_TEST_VAL;
7059 }
7060
7061 // clear the state completely, instead of just marking
7062 // things invalid like invalidate_all_pages() does
7063 void new_dynarec_clear_full(void)
7064 {
7065   int n;
7066   out=(u_char *)BASE_ADDR;
7067   memset(invalid_code,1,sizeof(invalid_code));
7068   memset(hash_table,0xff,sizeof(hash_table));
7069   memset(mini_ht,-1,sizeof(mini_ht));
7070   memset(restore_candidate,0,sizeof(restore_candidate));
7071   memset(shadow,0,sizeof(shadow));
7072   copy=shadow;
7073   expirep=16384; // Expiry pointer, +2 blocks
7074   pending_exception=0;
7075   literalcount=0;
7076   stop_after_jal=0;
7077   inv_code_start=inv_code_end=~0;
7078   // TLB
7079   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7080   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7081   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7082 }
7083
7084 void new_dynarec_init(void)
7085 {
7086   SysPrintf("Init new dynarec\n");
7087
7088   // allocate/prepare a buffer for translation cache
7089   // see assem_arm.h for some explanation
7090 #if   defined(BASE_ADDR_FIXED)
7091   if (mmap (translation_cache, 1 << TARGET_SIZE_2,
7092         PROT_READ | PROT_WRITE | PROT_EXEC,
7093         MAP_PRIVATE | MAP_ANONYMOUS,
7094         -1, 0) != translation_cache)
7095   {
7096     SysPrintf("mmap() failed: %s\n", strerror(errno));
7097     SysPrintf("disable BASE_ADDR_FIXED and recompile\n");
7098     abort();
7099   }
7100 #elif defined(BASE_ADDR_DYNAMIC)
7101 #ifdef VITA
7102   sceBlock = getVMBlock();//sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
7103   if (sceBlock < 0)
7104     SysPrintf("sceKernelAllocMemBlockForVM failed\n");
7105   int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&translation_cache);
7106   if (ret < 0)
7107     SysPrintf("sceKernelGetMemBlockBase failed\n");
7108     
7109   sceKernelOpenVMDomain();
7110   sceClibPrintf("translation_cache = 0x%08X \n ", translation_cache);
7111 #elif defined(_MSC_VER)
7112   base_addr = VirtualAlloc(NULL, 1<<TARGET_SIZE_2, MEM_COMMIT | MEM_RESERVE,
7113       PAGE_EXECUTE_READWRITE);
7114 #else
7115   translation_cache = mmap (NULL, 1 << TARGET_SIZE_2,
7116       PROT_READ | PROT_WRITE | PROT_EXEC,
7117       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
7118   if (translation_cache == MAP_FAILED) {
7119     SysPrintf("mmap() failed: %s\n", strerror(errno));
7120     abort();
7121   }
7122 #endif
7123 #else
7124 #ifndef NO_WRITE_EXEC
7125   // not all systems allow execute in data segment by default
7126   if (mprotect((void *)BASE_ADDR, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
7127     SysPrintf("mprotect() failed: %s\n", strerror(errno));
7128 #endif
7129 #endif
7130
7131   out=(u_char *)BASE_ADDR;
7132   cycle_multiplier=200;
7133   new_dynarec_clear_full();
7134 #ifdef HOST_IMM8
7135   // Copy this into local area so we don't have to put it in every literal pool
7136   invc_ptr=invalid_code;
7137 #endif
7138   arch_init();
7139   new_dynarec_test();
7140 #ifndef RAM_FIXED
7141   ram_offset=(u_int)rdram-0x80000000;
7142 #endif
7143   if (ram_offset!=0)
7144     SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
7145 }
7146
7147 void new_dynarec_cleanup(void)
7148 {
7149   int n;
7150 #if defined(BASE_ADDR_FIXED) || defined(BASE_ADDR_DYNAMIC)
7151 #ifndef VITA
7152 #if defined(_MSC_VER)
7153   VirtualFree(base_addr, 0, MEM_RELEASE);
7154 #else
7155   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0)
7156     SysPrintf("munmap() failed\n");
7157 #endif
7158 #endif
7159 #endif
7160   for(n=0;n<4096;n++)
7161     ll_clear(jump_in+n);
7162   for(n=0;n<4096;n++)
7163     ll_clear(jump_out+n);
7164   for(n=0;n<4096;n++)
7165     ll_clear(jump_dirty+n);
7166 #ifdef ROM_COPY
7167   if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
7168 #endif
7169 }
7170
7171 static u_int *get_source_start(u_int addr, u_int *limit)
7172 {
7173   if (addr < 0x00200000 ||
7174     (0xa0000000 <= addr && addr < 0xa0200000)) {
7175     // used for BIOS calls mostly?
7176     *limit = (addr&0xa0000000)|0x00200000;
7177     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7178   }
7179   else if (!Config.HLE && (
7180     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
7181     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
7182     // BIOS
7183     *limit = (addr & 0xfff00000) | 0x80000;
7184     return (u_int *)((u_int)psxR + (addr&0x7ffff));
7185   }
7186   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
7187     *limit = (addr & 0x80600000) + 0x00200000;
7188     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7189   }
7190   return NULL;
7191 }
7192
7193 static u_int scan_for_ret(u_int addr)
7194 {
7195   u_int limit = 0;
7196   u_int *mem;
7197
7198   mem = get_source_start(addr, &limit);
7199   if (mem == NULL)
7200     return addr;
7201
7202   if (limit > addr + 0x1000)
7203     limit = addr + 0x1000;
7204   for (; addr < limit; addr += 4, mem++) {
7205     if (*mem == 0x03e00008) // jr $ra
7206       return addr + 8;
7207   }
7208   return addr;
7209 }
7210
/*
 * One compiled-block record as stored in a savestate.
 *   addr:     guest address of the block (taken from ll_entry.vaddr)
 *   regflags: ll_entry.reg_sv_flags; on load, bit i set means GPR i is
 *             given the value 0x1f800000 while that block is looked up
 */
struct savestate_block {
  uint32_t addr;
  uint32_t regflags;
};

/*
 * qsort() comparator ordering savestate_block entries by ascending addr.
 *
 * Compares instead of subtracting: the difference of two uint32_t values
 * converted to int has the wrong sign whenever the addresses are 2^31 or
 * more apart (e.g. 0x00001000 vs 0x80001000), which would give qsort an
 * inconsistent ordering.
 *
 * Returns a negative, zero or positive value for p1 <, ==, > p2.
 */
static int addr_cmp(const void *p1_, const void *p2_)
{
  const struct savestate_block *p1 = p1_, *p2 = p2_;
  return (p1->addr > p2->addr) - (p1->addr < p2->addr);
}
7221
7222 int new_dynarec_save_blocks(void *save, int size)
7223 {
7224   struct savestate_block *blocks = save;
7225   int maxcount = size / sizeof(blocks[0]);
7226   struct savestate_block tmp_blocks[1024];
7227   struct ll_entry *head;
7228   int p, s, d, o, bcnt;
7229   u_int addr;
7230
7231   o = 0;
7232   for (p = 0; p < sizeof(jump_in) / sizeof(jump_in[0]); p++) {
7233     bcnt = 0;
7234     for (head = jump_in[p]; head != NULL; head = head->next) {
7235       tmp_blocks[bcnt].addr = head->vaddr;
7236       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
7237       bcnt++;
7238     }
7239     if (bcnt < 1)
7240       continue;
7241     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
7242
7243     addr = tmp_blocks[0].addr;
7244     for (s = d = 0; s < bcnt; s++) {
7245       if (tmp_blocks[s].addr < addr)
7246         continue;
7247       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
7248         tmp_blocks[d++] = tmp_blocks[s];
7249       addr = scan_for_ret(tmp_blocks[s].addr);
7250     }
7251
7252     if (o + d > maxcount)
7253       d = maxcount - o;
7254     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
7255     o += d;
7256   }
7257
7258   return o * sizeof(blocks[0]);
7259 }
7260
7261 void new_dynarec_load_blocks(const void *save, int size)
7262 {
7263   const struct savestate_block *blocks = save;
7264   int count = size / sizeof(blocks[0]);
7265   u_int regs_save[32];
7266   uint32_t f;
7267   int i, b;
7268
7269   get_addr(psxRegs.pc);
7270
7271   // change GPRs for speculation to at least partially work..
7272   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
7273   for (i = 1; i < 32; i++)
7274     psxRegs.GPR.r[i] = 0x80000000;
7275
7276   for (b = 0; b < count; b++) {
7277     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7278       if (f & 1)
7279         psxRegs.GPR.r[i] = 0x1f800000;
7280     }
7281
7282     get_addr(blocks[b].addr);
7283
7284     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7285       if (f & 1)
7286         psxRegs.GPR.r[i] = 0x80000000;
7287     }
7288   }
7289
7290   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
7291 }
7292
7293 int new_recompile_block(int addr)
7294 {
7295   u_int pagelimit = 0;
7296   u_int state_rflags = 0;
7297   int i;
7298
7299   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7300   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7301   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7302   //if(debug)
7303   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7304   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7305   /*if(Count>=312978186) {
7306     rlist();
7307   }*/
7308   //rlist();
7309
7310   // this is just for speculation
7311   for (i = 1; i < 32; i++) {
7312     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
7313       state_rflags |= 1 << i;
7314   }
7315
7316   start = (u_int)addr&~3;
7317   //assert(((u_int)addr&1)==0);
7318   new_dynarec_did_compile=1;
7319   if (Config.HLE && start == 0x80001000) // hlecall
7320   {
7321     // XXX: is this enough? Maybe check hleSoftCall?
7322     void *beginning=start_block();
7323     u_int page=get_page(start);
7324
7325     invalid_code[start>>12]=0;
7326     emit_movimm(start,0);
7327     emit_writeword(0,(int)&pcaddr);
7328     emit_jmp((int)new_dyna_leave);
7329     literal_pool(0);
7330     end_block(beginning);
7331     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
7332     return 0;
7333   }
7334
7335   source = get_source_start(start, &pagelimit);
7336   if (source == NULL) {
7337     SysPrintf("Compile at bogus memory address: %08x\n", addr);
7338     exit(1);
7339   }
7340
7341   /* Pass 1: disassemble */
7342   /* Pass 2: register dependencies, branch targets */
7343   /* Pass 3: register allocation */
7344   /* Pass 4: branch dependencies */
7345   /* Pass 5: pre-alloc */
7346   /* Pass 6: optimize clean/dirty state */
7347   /* Pass 7: flag 32-bit registers */
7348   /* Pass 8: assembly */
7349   /* Pass 9: linker */
7350   /* Pass 10: garbage collection / free memory */
7351
7352   int j;
7353   int done=0;
7354   unsigned int type,op,op2;
7355
7356   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7357
7358   /* Pass 1 disassembly */
7359
7360   for(i=0;!done;i++) {
7361     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
7362     minimum_free_regs[i]=0;
7363     opcode[i]=op=source[i]>>26;
7364     switch(op)
7365     {
7366       case 0x00: strcpy(insn[i],"special"); type=NI;
7367         op2=source[i]&0x3f;
7368         switch(op2)
7369         {
7370           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7371           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7372           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7373           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7374           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7375           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7376           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7377           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7378           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7379           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7380           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7381           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7382           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7383           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7384           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7385           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7386           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7387           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7388           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7389           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7390           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7391           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7392           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7393           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7394           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7395           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7396           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7397           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7398           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7399           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7400           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7401           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7402           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7403           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7404           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7405 #if 0
7406           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7407           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7408           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7409           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7410           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7411           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7412           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7413           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7414           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7415           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7416           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7417           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7418           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7419           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7420           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7421           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7422           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7423 #endif
7424         }
7425         break;
7426       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7427         op2=(source[i]>>16)&0x1f;
7428         switch(op2)
7429         {
7430           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7431           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7432           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7433           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7434           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7435           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7436           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7437           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7438           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7439           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7440           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7441           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7442           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7443           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7444         }
7445         break;
7446       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7447       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7448       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7449       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7450       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7451       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7452       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7453       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7454       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7455       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7456       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7457       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7458       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7459       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7460       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7461         op2=(source[i]>>21)&0x1f;
7462         switch(op2)
7463         {
7464           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7465           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7466           case 0x10: strcpy(insn[i],"tlb"); type=NI;
7467           switch(source[i]&0x3f)
7468           {
7469             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
7470             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
7471             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
7472             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
7473             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
7474             //case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
7475           }
7476         }
7477         break;
7478       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7479         op2=(source[i]>>21)&0x1f;
7480         switch(op2)
7481         {
7482           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7483           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7484           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7485           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7486           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7487           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7488           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7489           switch((source[i]>>16)&0x3)
7490           {
7491             case 0x00: strcpy(insn[i],"BC1F"); break;
7492             case 0x01: strcpy(insn[i],"BC1T"); break;
7493             case 0x02: strcpy(insn[i],"BC1FL"); break;
7494             case 0x03: strcpy(insn[i],"BC1TL"); break;
7495           }
7496           break;
7497           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7498           switch(source[i]&0x3f)
7499           {
7500             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7501             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7502             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7503             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7504             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7505             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7506             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7507             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7508             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7509             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7510             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7511             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7512             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7513             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7514             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7515             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7516             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7517             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7518             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7519             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7520             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7521             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7522             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7523             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7524             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7525             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7526             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7527             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7528             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7529             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7530             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7531             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7532             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7533             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7534             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7535           }
7536           break;
7537           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7538           switch(source[i]&0x3f)
7539           {
7540             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7541             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7542             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7543             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7544             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7545             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7546             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7547             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7548             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7549             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7550             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7551             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7552             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7553             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7554             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7555             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7556             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7557             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7558             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7559             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7560             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7561             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7562             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7563             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7564             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7565             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7566             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7567             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7568             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7569             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7570             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7571             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7572             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7573             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7574             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7575           }
7576           break;
7577           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7578           switch(source[i]&0x3f)
7579           {
7580             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7581             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7582           }
7583           break;
7584           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7585           switch(source[i]&0x3f)
7586           {
7587             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7588             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7589           }
7590           break;
7591         }
7592         break;
7593 #if 0
7594       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7595       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7596       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7597       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7598       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7599       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7600       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7601       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7602 #endif
7603       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7604       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7605       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7606       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7607       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7608       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7609       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7610 #if 0
7611       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7612 #endif
7613       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7614       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7615       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7616       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7617 #if 0
7618       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7619       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7620 #endif
7621       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7622       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7623       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7624       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7625 #if 0
7626       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7627       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7628       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7629 #endif
7630       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7631       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7632 #if 0
7633       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7634       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7635       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7636 #endif
7637       case 0x12: strcpy(insn[i],"COP2"); type=NI;
7638         op2=(source[i]>>21)&0x1f;
7639         //if (op2 & 0x10) {
7640         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
7641           if (gte_handlers[source[i]&0x3f]!=NULL) {
7642             if (gte_regnames[source[i]&0x3f]!=NULL)
7643               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
7644             else
7645               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
7646             type=C2OP;
7647           }
7648         }
7649         else switch(op2)
7650         {
7651           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
7652           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
7653           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
7654           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
7655         }
7656         break;
7657       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
7658       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
7659       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7660       default: strcpy(insn[i],"???"); type=NI;
7661         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
7662         break;
7663     }
7664     itype[i]=type;
7665     opcode2[i]=op2;
7666     /* Get registers/immediates */
7667     lt1[i]=0;
7668     us1[i]=0;
7669     us2[i]=0;
7670     dep1[i]=0;
7671     dep2[i]=0;
7672     gte_rs[i]=gte_rt[i]=0;
7673     switch(type) {
7674       case LOAD:
7675         rs1[i]=(source[i]>>21)&0x1f;
7676         rs2[i]=0;
7677         rt1[i]=(source[i]>>16)&0x1f;
7678         rt2[i]=0;
7679         imm[i]=(short)source[i];
7680         break;
7681       case STORE:
7682       case STORELR:
7683         rs1[i]=(source[i]>>21)&0x1f;
7684         rs2[i]=(source[i]>>16)&0x1f;
7685         rt1[i]=0;
7686         rt2[i]=0;
7687         imm[i]=(short)source[i];
7688         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7689         break;
7690       case LOADLR:
7691         // LWL/LWR only load part of the register,
7692         // therefore the target register must be treated as a source too
7693         rs1[i]=(source[i]>>21)&0x1f;
7694         rs2[i]=(source[i]>>16)&0x1f;
7695         rt1[i]=(source[i]>>16)&0x1f;
7696         rt2[i]=0;
7697         imm[i]=(short)source[i];
7698         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7699         if(op==0x26) dep1[i]=rt1[i]; // LWR
7700         break;
7701       case IMM16:
7702         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7703         else rs1[i]=(source[i]>>21)&0x1f;
7704         rs2[i]=0;
7705         rt1[i]=(source[i]>>16)&0x1f;
7706         rt2[i]=0;
7707         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7708           imm[i]=(unsigned short)source[i];
7709         }else{
7710           imm[i]=(short)source[i];
7711         }
7712         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7713         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7714         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7715         break;
7716       case UJUMP:
7717         rs1[i]=0;
7718         rs2[i]=0;
7719         rt1[i]=0;
7720         rt2[i]=0;
7721         // The JAL instruction writes to r31.
7722         if (op&1) {
7723           rt1[i]=31;
7724         }
7725         rs2[i]=CCREG;
7726         break;
7727       case RJUMP:
7728         rs1[i]=(source[i]>>21)&0x1f;
7729         rs2[i]=0;
7730         rt1[i]=0;
7731         rt2[i]=0;
7732         // The JALR instruction writes to rd.
7733         if (op2&1) {
7734           rt1[i]=(source[i]>>11)&0x1f;
7735         }
7736         rs2[i]=CCREG;
7737         break;
7738       case CJUMP:
7739         rs1[i]=(source[i]>>21)&0x1f;
7740         rs2[i]=(source[i]>>16)&0x1f;
7741         rt1[i]=0;
7742         rt2[i]=0;
7743         if(op&2) { // BGTZ/BLEZ
7744           rs2[i]=0;
7745         }
7746         us1[i]=rs1[i];
7747         us2[i]=rs2[i];
7748         likely[i]=op>>4;
7749         break;
7750       case SJUMP:
7751         rs1[i]=(source[i]>>21)&0x1f;
7752         rs2[i]=CCREG;
7753         rt1[i]=0;
7754         rt2[i]=0;
7755         us1[i]=rs1[i];
7756         if(op2&0x10) { // BxxAL
7757           rt1[i]=31;
7758           // NOTE: If the branch is not taken, r31 is still overwritten
7759         }
7760         likely[i]=(op2&2)>>1;
7761         break;
7762       case FJUMP:
7763         rs1[i]=FSREG;
7764         rs2[i]=CSREG;
7765         rt1[i]=0;
7766         rt2[i]=0;
7767         likely[i]=((source[i])>>17)&1;
7768         break;
7769       case ALU:
7770         rs1[i]=(source[i]>>21)&0x1f; // source
7771         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
7772         rt1[i]=(source[i]>>11)&0x1f; // destination
7773         rt2[i]=0;
7774         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7775           us1[i]=rs1[i];us2[i]=rs2[i];
7776         }
7777         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7778           dep1[i]=rs1[i];dep2[i]=rs2[i];
7779         }
7780         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7781           dep1[i]=rs1[i];dep2[i]=rs2[i];
7782         }
7783         break;
7784       case MULTDIV:
7785         rs1[i]=(source[i]>>21)&0x1f; // source
7786         rs2[i]=(source[i]>>16)&0x1f; // divisor
7787         rt1[i]=HIREG;
7788         rt2[i]=LOREG;
7789         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7790           us1[i]=rs1[i];us2[i]=rs2[i];
7791         }
7792         break;
7793       case MOV:
7794         rs1[i]=0;
7795         rs2[i]=0;
7796         rt1[i]=0;
7797         rt2[i]=0;
7798         if(op2==0x10) rs1[i]=HIREG; // MFHI
7799         if(op2==0x11) rt1[i]=HIREG; // MTHI
7800         if(op2==0x12) rs1[i]=LOREG; // MFLO
7801         if(op2==0x13) rt1[i]=LOREG; // MTLO
7802         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7803         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7804         dep1[i]=rs1[i];
7805         break;
7806       case SHIFT:
7807         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7808         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7809         rt1[i]=(source[i]>>11)&0x1f; // destination
7810         rt2[i]=0;
7811         // DSLLV/DSRLV/DSRAV are 64-bit
7812         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
7813         break;
7814       case SHIFTIMM:
7815         rs1[i]=(source[i]>>16)&0x1f;
7816         rs2[i]=0;
7817         rt1[i]=(source[i]>>11)&0x1f;
7818         rt2[i]=0;
7819         imm[i]=(source[i]>>6)&0x1f;
7820         // DSxx32 instructions
7821         if(op2>=0x3c) imm[i]|=0x20;
7822         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
7823         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
7824         break;
7825       case COP0:
7826         rs1[i]=0;
7827         rs2[i]=0;
7828         rt1[i]=0;
7829         rt2[i]=0;
7830         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
7831         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
7832         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7833         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7834         break;
7835       case COP1:
7836         rs1[i]=0;
7837         rs2[i]=0;
7838         rt1[i]=0;
7839         rt2[i]=0;
7840         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7841         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7842         if(op2==5) us1[i]=rs1[i]; // DMTC1
7843         rs2[i]=CSREG;
7844         break;
7845       case COP2:
7846         rs1[i]=0;
7847         rs2[i]=0;
7848         rt1[i]=0;
7849         rt2[i]=0;
7850         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
7851         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
7852         rs2[i]=CSREG;
7853         int gr=(source[i]>>11)&0x1F;
7854         switch(op2)
7855         {
7856           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
7857           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
7858           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
7859           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
7860         }
7861         break;
7862       case C1LS:
7863         rs1[i]=(source[i]>>21)&0x1F;
7864         rs2[i]=CSREG;
7865         rt1[i]=0;
7866         rt2[i]=0;
7867         imm[i]=(short)source[i];
7868         break;
7869       case C2LS:
7870         rs1[i]=(source[i]>>21)&0x1F;
7871         rs2[i]=0;
7872         rt1[i]=0;
7873         rt2[i]=0;
7874         imm[i]=(short)source[i];
7875         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7876         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7877         break;
7878       case C2OP:
7879         rs1[i]=0;
7880         rs2[i]=0;
7881         rt1[i]=0;
7882         rt2[i]=0;
7883         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
7884         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
7885         gte_rt[i]|=1ll<<63; // every op changes flags
7886         if((source[i]&0x3f)==GTE_MVMVA) {
7887           int v = (source[i] >> 15) & 3;
7888           gte_rs[i]&=~0xe3fll;
7889           if(v==3) gte_rs[i]|=0xe00ll;
7890           else gte_rs[i]|=3ll<<(v*2);
7891         }
7892         break;
7893       case FLOAT:
7894       case FCONV:
7895         rs1[i]=0;
7896         rs2[i]=CSREG;
7897         rt1[i]=0;
7898         rt2[i]=0;
7899         break;
7900       case FCOMP:
7901         rs1[i]=FSREG;
7902         rs2[i]=CSREG;
7903         rt1[i]=FSREG;
7904         rt2[i]=0;
7905         break;
7906       case SYSCALL:
7907       case HLECALL:
7908       case INTCALL:
7909         rs1[i]=CCREG;
7910         rs2[i]=0;
7911         rt1[i]=0;
7912         rt2[i]=0;
7913         break;
7914       default:
7915         rs1[i]=0;
7916         rs2[i]=0;
7917         rt1[i]=0;
7918         rt2[i]=0;
7919     }
7920     /* Calculate branch target addresses */
7921     if(type==UJUMP)
7922       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
7923     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
7924       ba[i]=start+i*4+8; // Ignore never taken branch
7925     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
7926       ba[i]=start+i*4+8; // Ignore never taken branch
7927     else if(type==CJUMP||type==SJUMP||type==FJUMP)
7928       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
7929     else ba[i]=-1;
7930     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
7931       int do_in_intrp=0;
7932       // branch in delay slot?
7933       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
7934         // don't handle first branch and call interpreter if it's hit
7935         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
7936         do_in_intrp=1;
7937       }
7938       // basic load delay detection
7939       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
7940         int t=(ba[i-1]-start)/4;
7941         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
7942           // jump target wants DS result - potential load delay effect
7943           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
7944           do_in_intrp=1;
7945           bt[t+1]=1; // expected return from interpreter
7946         }
7947         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
7948               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
7949           // v0 overwrite like this is a sign of trouble, bail out
7950           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
7951           do_in_intrp=1;
7952         }
7953       }
7954       if(do_in_intrp) {
7955         rs1[i-1]=CCREG;
7956         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
7957         ba[i-1]=-1;
7958         itype[i-1]=INTCALL;
7959         done=2;
7960         i--; // don't compile the DS
7961       }
7962     }
7963     /* Is this the end of the block? */
7964     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
7965       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
7966         done=2;
7967       }
7968       else {
7969         if(stop_after_jal) done=1;
7970         // Stop on BREAK
7971         if((source[i+1]&0xfc00003f)==0x0d) done=1;
7972       }
7973       // Don't recompile stuff that's already compiled
7974       if(check_addr(start+i*4+4)) done=1;
7975       // Don't get too close to the limit
7976       if(i>MAXBLOCK/2) done=1;
7977     }
7978     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
7979     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
7980     if(done==2) {
7981       // Does the block continue due to a branch?
7982       for(j=i-1;j>=0;j--)
7983       {
7984         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
7985         if(ba[j]==start+i*4+4) done=j=0;
7986         if(ba[j]==start+i*4+8) done=j=0;
7987       }
7988     }
7989     //assert(i<MAXBLOCK-1);
7990     if(start+i*4==pagelimit-4) done=1;
7991     assert(start+i*4<pagelimit);
7992     if (i==MAXBLOCK-1) done=1;
7993     // Stop if we're compiling junk
7994     if(itype[i]==NI&&opcode[i]==0x11) {
7995       done=stop_after_jal=1;
7996       SysPrintf("Disabled speculative precompilation\n");
7997     }
7998   }
7999   slen=i;
8000   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8001     if(start+i*4==pagelimit) {
8002       itype[i-1]=SPAN;
8003     }
8004   }
8005   assert(slen>0);
8006
8007   /* Pass 2 - Register dependencies and branch targets */
8008
8009   unneeded_registers(0,slen-1,0);
8010
8011   /* Pass 3 - Register allocation */
8012
8013   struct regstat current; // Current register allocations/status
8014   current.is32=1;
8015   current.dirty=0;
8016   current.u=unneeded_reg[0];
8017   current.uu=unneeded_reg_upper[0];
8018   clear_all_regs(current.regmap);
8019   alloc_reg(&current,0,CCREG);
8020   dirty_reg(&current,CCREG);
8021   current.isconst=0;
8022   current.wasconst=0;
8023   current.waswritten=0;
8024   int ds=0;
8025   int cc=0;
8026   int hr=-1;
8027
8028   if((u_int)addr&1) {
8029     // First instruction is delay slot
8030     cc=-1;
8031     bt[1]=1;
8032     ds=1;
8033     unneeded_reg[0]=1;
8034     unneeded_reg_upper[0]=1;
8035     current.regmap[HOST_BTREG]=BTREG;
8036   }
8037
8038   for(i=0;i<slen;i++)
8039   {
8040     if(bt[i])
8041     {
8042       int hr;
8043       for(hr=0;hr<HOST_REGS;hr++)
8044       {
8045         // Is this really necessary?
8046         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8047       }
8048       current.isconst=0;
8049       current.waswritten=0;
8050     }
8051     if(i>1)
8052     {
8053       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8054       {
8055         if(rs1[i-2]==0||rs2[i-2]==0)
8056         {
8057           if(rs1[i-2]) {
8058             current.is32|=1LL<<rs1[i-2];
8059             int hr=get_reg(current.regmap,rs1[i-2]|64);
8060             if(hr>=0) current.regmap[hr]=-1;
8061           }
8062           if(rs2[i-2]) {
8063             current.is32|=1LL<<rs2[i-2];
8064             int hr=get_reg(current.regmap,rs2[i-2]|64);
8065             if(hr>=0) current.regmap[hr]=-1;
8066           }
8067         }
8068       }
8069     }
8070     current.is32=-1LL;
8071
8072     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8073     regs[i].wasconst=current.isconst;
8074     regs[i].was32=current.is32;
8075     regs[i].wasdirty=current.dirty;
8076     regs[i].loadedconst=0;
8077     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8078       if(i+1<slen) {
8079         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8080         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8081         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8082         current.u|=1;
8083         current.uu|=1;
8084       } else {
8085         current.u=1;
8086         current.uu=1;
8087       }
8088     } else {
8089       if(i+1<slen) {
8090         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8091         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8092         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8093         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8094         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8095         current.u|=1;
8096         current.uu|=1;
8097       } else { SysPrintf("oops, branch at end of block with no delay slot\n");exit(1); }
8098     }
8099     is_ds[i]=ds;
8100     if(ds) {
8101       ds=0; // Skip delay slot, already allocated as part of branch
8102       // ...but we need to alloc it in case something jumps here
8103       if(i+1<slen) {
8104         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8105         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8106       }else{
8107         current.u=branch_unneeded_reg[i-1];
8108         current.uu=branch_unneeded_reg_upper[i-1];
8109       }
8110       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8111       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8112       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8113       current.u|=1;
8114       current.uu|=1;
8115       struct regstat temp;
8116       memcpy(&temp,&current,sizeof(current));
8117       temp.wasdirty=temp.dirty;
8118       temp.was32=temp.is32;
8119       // TODO: Take into account unconditional branches, as below
8120       delayslot_alloc(&temp,i);
8121       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8122       regs[i].wasdirty=temp.wasdirty;
8123       regs[i].was32=temp.was32;
8124       regs[i].dirty=temp.dirty;
8125       regs[i].is32=temp.is32;
8126       regs[i].isconst=0;
8127       regs[i].wasconst=0;
8128       current.isconst=0;
8129       // Create entry (branch target) regmap
8130       for(hr=0;hr<HOST_REGS;hr++)
8131       {
8132         int r=temp.regmap[hr];
8133         if(r>=0) {
8134           if(r!=regmap_pre[i][hr]) {
8135             regs[i].regmap_entry[hr]=-1;
8136           }
8137           else
8138           {
8139             if(r<64){
8140               if((current.u>>r)&1) {
8141                 regs[i].regmap_entry[hr]=-1;
8142                 regs[i].regmap[hr]=-1;
8143                 //Don't clear regs in the delay slot as the branch might need them
8144                 //current.regmap[hr]=-1;
8145               }else
8146                 regs[i].regmap_entry[hr]=r;
8147             }
8148             else {
8149               if((current.uu>>(r&63))&1) {
8150                 regs[i].regmap_entry[hr]=-1;
8151                 regs[i].regmap[hr]=-1;
8152                 //Don't clear regs in the delay slot as the branch might need them
8153                 //current.regmap[hr]=-1;
8154               }else
8155                 regs[i].regmap_entry[hr]=r;
8156             }
8157           }
8158         } else {
8159           // First instruction expects CCREG to be allocated
8160           if(i==0&&hr==HOST_CCREG)
8161             regs[i].regmap_entry[hr]=CCREG;
8162           else
8163             regs[i].regmap_entry[hr]=-1;
8164         }
8165       }
8166     }
8167     else { // Not delay slot
8168       switch(itype[i]) {
8169         case UJUMP:
8170           //current.isconst=0; // DEBUG
8171           //current.wasconst=0; // DEBUG
8172           //regs[i].wasconst=0; // DEBUG
8173           clear_const(&current,rt1[i]);
8174           alloc_cc(&current,i);
8175           dirty_reg(&current,CCREG);
8176           if (rt1[i]==31) {
8177             alloc_reg(&current,i,31);
8178             dirty_reg(&current,31);
8179             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8180             //assert(rt1[i+1]!=rt1[i]);
8181             #ifdef REG_PREFETCH
8182             alloc_reg(&current,i,PTEMP);
8183             #endif
8184             //current.is32|=1LL<<rt1[i];
8185           }
8186           ooo[i]=1;
8187           delayslot_alloc(&current,i+1);
8188           //current.isconst=0; // DEBUG
8189           ds=1;
8190           //printf("i=%d, isconst=%x\n",i,current.isconst);
8191           break;
8192         case RJUMP:
8193           //current.isconst=0;
8194           //current.wasconst=0;
8195           //regs[i].wasconst=0;
8196           clear_const(&current,rs1[i]);
8197           clear_const(&current,rt1[i]);
8198           alloc_cc(&current,i);
8199           dirty_reg(&current,CCREG);
8200           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8201             alloc_reg(&current,i,rs1[i]);
8202             if (rt1[i]!=0) {
8203               alloc_reg(&current,i,rt1[i]);
8204               dirty_reg(&current,rt1[i]);
8205               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
8206               assert(rt1[i+1]!=rt1[i]);
8207               #ifdef REG_PREFETCH
8208               alloc_reg(&current,i,PTEMP);
8209               #endif
8210             }
8211             #ifdef USE_MINI_HT
8212             if(rs1[i]==31) { // JALR
8213               alloc_reg(&current,i,RHASH);
8214               #ifndef HOST_IMM_ADDR32
8215               alloc_reg(&current,i,RHTBL);
8216               #endif
8217             }
8218             #endif
8219             delayslot_alloc(&current,i+1);
8220           } else {
8221             // The delay slot overwrites our source register,
8222             // allocate a temporary register to hold the old value.
8223             current.isconst=0;
8224             current.wasconst=0;
8225             regs[i].wasconst=0;
8226             delayslot_alloc(&current,i+1);
8227             current.isconst=0;
8228             alloc_reg(&current,i,RTEMP);
8229           }
8230           //current.isconst=0; // DEBUG
8231           ooo[i]=1;
8232           ds=1;
8233           break;
8234         case CJUMP:
8235           //current.isconst=0;
8236           //current.wasconst=0;
8237           //regs[i].wasconst=0;
8238           clear_const(&current,rs1[i]);
8239           clear_const(&current,rs2[i]);
8240           if((opcode[i]&0x3E)==4) // BEQ/BNE
8241           {
8242             alloc_cc(&current,i);
8243             dirty_reg(&current,CCREG);
8244             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8245             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8246             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8247             {
8248               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8249               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8250             }
8251             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8252                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8253               // The delay slot overwrites one of our conditions.
8254               // Allocate the branch condition registers instead.
8255               current.isconst=0;
8256               current.wasconst=0;
8257               regs[i].wasconst=0;
8258               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8259               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8260               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8261               {
8262                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8263                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8264               }
8265             }
8266             else
8267             {
8268               ooo[i]=1;
8269               delayslot_alloc(&current,i+1);
8270             }
8271           }
8272           else
8273           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8274           {
8275             alloc_cc(&current,i);
8276             dirty_reg(&current,CCREG);
8277             alloc_reg(&current,i,rs1[i]);
8278             if(!(current.is32>>rs1[i]&1))
8279             {
8280               alloc_reg64(&current,i,rs1[i]);
8281             }
8282             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8283               // The delay slot overwrites one of our conditions.
8284               // Allocate the branch condition registers instead.
8285               current.isconst=0;
8286               current.wasconst=0;
8287               regs[i].wasconst=0;
8288               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8289               if(!((current.is32>>rs1[i])&1))
8290               {
8291                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8292               }
8293             }
8294             else
8295             {
8296               ooo[i]=1;
8297               delayslot_alloc(&current,i+1);
8298             }
8299           }
8300           else
8301           // Don't alloc the delay slot yet because we might not execute it
8302           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8303           {
8304             current.isconst=0;
8305             current.wasconst=0;
8306             regs[i].wasconst=0;
8307             alloc_cc(&current,i);
8308             dirty_reg(&current,CCREG);
8309             alloc_reg(&current,i,rs1[i]);
8310             alloc_reg(&current,i,rs2[i]);
8311             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8312             {
8313               alloc_reg64(&current,i,rs1[i]);
8314               alloc_reg64(&current,i,rs2[i]);
8315             }
8316           }
8317           else
8318           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8319           {
8320             current.isconst=0;
8321             current.wasconst=0;
8322             regs[i].wasconst=0;
8323             alloc_cc(&current,i);
8324             dirty_reg(&current,CCREG);
8325             alloc_reg(&current,i,rs1[i]);
8326             if(!(current.is32>>rs1[i]&1))
8327             {
8328               alloc_reg64(&current,i,rs1[i]);
8329             }
8330           }
8331           ds=1;
8332           //current.isconst=0;
8333           break;
8334         case SJUMP:
8335           //current.isconst=0;
8336           //current.wasconst=0;
8337           //regs[i].wasconst=0;
8338           clear_const(&current,rs1[i]);
8339           clear_const(&current,rt1[i]);
8340           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8341           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8342           {
8343             alloc_cc(&current,i);
8344             dirty_reg(&current,CCREG);
8345             alloc_reg(&current,i,rs1[i]);
8346             if(!(current.is32>>rs1[i]&1))
8347             {
8348               alloc_reg64(&current,i,rs1[i]);
8349             }
8350             if (rt1[i]==31) { // BLTZAL/BGEZAL
8351               alloc_reg(&current,i,31);
8352               dirty_reg(&current,31);
8353               //#ifdef REG_PREFETCH
8354               //alloc_reg(&current,i,PTEMP);
8355               //#endif
8356               //current.is32|=1LL<<rt1[i];
8357             }
8358             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
8359                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
8360               // Allocate the branch condition registers instead.
8361               current.isconst=0;
8362               current.wasconst=0;
8363               regs[i].wasconst=0;
8364               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8365               if(!((current.is32>>rs1[i])&1))
8366               {
8367                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8368               }
8369             }
8370             else
8371             {
8372               ooo[i]=1;
8373               delayslot_alloc(&current,i+1);
8374             }
8375           }
8376           else
8377           // Don't alloc the delay slot yet because we might not execute it
8378           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
8379           {
8380             current.isconst=0;
8381             current.wasconst=0;
8382             regs[i].wasconst=0;
8383             alloc_cc(&current,i);
8384             dirty_reg(&current,CCREG);
8385             alloc_reg(&current,i,rs1[i]);
8386             if(!(current.is32>>rs1[i]&1))
8387             {
8388               alloc_reg64(&current,i,rs1[i]);
8389             }
8390           }
8391           ds=1;
8392           //current.isconst=0;
8393           break;
8394         case FJUMP:
8395           current.isconst=0;
8396           current.wasconst=0;
8397           regs[i].wasconst=0;
8398           if(likely[i]==0) // BC1F/BC1T
8399           {
8400             // TODO: Theoretically we can run out of registers here on x86.
8401             // The delay slot can allocate up to six, and we need to check
8402             // CSREG before executing the delay slot.  Possibly we can drop
8403             // the cycle count and then reload it after checking that the
8404             // FPU is in a usable state, or don't do out-of-order execution.
8405             alloc_cc(&current,i);
8406             dirty_reg(&current,CCREG);
8407             alloc_reg(&current,i,FSREG);
8408             alloc_reg(&current,i,CSREG);
8409             if(itype[i+1]==FCOMP) {
8410               // The delay slot overwrites the branch condition.
8411               // Allocate the branch condition registers instead.
8412               alloc_cc(&current,i);
8413               dirty_reg(&current,CCREG);
8414               alloc_reg(&current,i,CSREG);
8415               alloc_reg(&current,i,FSREG);
8416             }
8417             else {
8418               ooo[i]=1;
8419               delayslot_alloc(&current,i+1);
8420               alloc_reg(&current,i+1,CSREG);
8421             }
8422           }
8423           else
8424           // Don't alloc the delay slot yet because we might not execute it
8425           if(likely[i]) // BC1FL/BC1TL
8426           {
8427             alloc_cc(&current,i);
8428             dirty_reg(&current,CCREG);
8429             alloc_reg(&current,i,CSREG);
8430             alloc_reg(&current,i,FSREG);
8431           }
8432           ds=1;
8433           current.isconst=0;
8434           break;
8435         case IMM16:
8436           imm16_alloc(&current,i);
8437           break;
8438         case LOAD:
8439         case LOADLR:
8440           load_alloc(&current,i);
8441           break;
8442         case STORE:
8443         case STORELR:
8444           store_alloc(&current,i);
8445           break;
8446         case ALU:
8447           alu_alloc(&current,i);
8448           break;
8449         case SHIFT:
8450           shift_alloc(&current,i);
8451           break;
8452         case MULTDIV:
8453           multdiv_alloc(&current,i);
8454           break;
8455         case SHIFTIMM:
8456           shiftimm_alloc(&current,i);
8457           break;
8458         case MOV:
8459           mov_alloc(&current,i);
8460           break;
8461         case COP0:
8462           cop0_alloc(&current,i);
8463           break;
8464         case COP1:
8465         case COP2:
8466           cop1_alloc(&current,i);
8467           break;
8468         case C1LS:
8469           c1ls_alloc(&current,i);
8470           break;
8471         case C2LS:
8472           c2ls_alloc(&current,i);
8473           break;
8474         case C2OP:
8475           c2op_alloc(&current,i);
8476           break;
8477         case FCONV:
8478           fconv_alloc(&current,i);
8479           break;
8480         case FLOAT:
8481           float_alloc(&current,i);
8482           break;
8483         case FCOMP:
8484           fcomp_alloc(&current,i);
8485           break;
8486         case SYSCALL:
8487         case HLECALL:
8488         case INTCALL:
8489           syscall_alloc(&current,i);
8490           break;
8491         case SPAN:
8492           pagespan_alloc(&current,i);
8493           break;
8494       }
8495
8496       // Drop the upper half of registers that have become 32-bit
8497       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8498       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8499         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8500         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8501         current.uu|=1;
8502       } else {
8503         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8504         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8505         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8506         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8507         current.uu|=1;
8508       }
8509
8510       // Create entry (branch target) regmap
8511       for(hr=0;hr<HOST_REGS;hr++)
8512       {
8513         int r,or;
8514         r=current.regmap[hr];
8515         if(r>=0) {
8516           if(r!=regmap_pre[i][hr]) {
8517             // TODO: delay slot (?)
8518             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8519             if(or<0||(r&63)>=TEMPREG){
8520               regs[i].regmap_entry[hr]=-1;
8521             }
8522             else
8523             {
8524               // Just move it to a different register
8525               regs[i].regmap_entry[hr]=r;
8526               // If it was dirty before, it's still dirty
8527               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8528             }
8529           }
8530           else
8531           {
8532             // Unneeded
8533             if(r==0){
8534               regs[i].regmap_entry[hr]=0;
8535             }
8536             else
8537             if(r<64){
8538               if((current.u>>r)&1) {
8539                 regs[i].regmap_entry[hr]=-1;
8540                 //regs[i].regmap[hr]=-1;
8541                 current.regmap[hr]=-1;
8542               }else
8543                 regs[i].regmap_entry[hr]=r;
8544             }
8545             else {
8546               if((current.uu>>(r&63))&1) {
8547                 regs[i].regmap_entry[hr]=-1;
8548                 //regs[i].regmap[hr]=-1;
8549                 current.regmap[hr]=-1;
8550               }else
8551                 regs[i].regmap_entry[hr]=r;
8552             }
8553           }
8554         } else {
8555           // Branches expect CCREG to be allocated at the target
8556           if(regmap_pre[i][hr]==CCREG)
8557             regs[i].regmap_entry[hr]=CCREG;
8558           else
8559             regs[i].regmap_entry[hr]=-1;
8560         }
8561       }
8562       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8563     }
8564
8565     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
8566       current.waswritten|=1<<rs1[i-1];
8567     current.waswritten&=~(1<<rt1[i]);
8568     current.waswritten&=~(1<<rt2[i]);
8569     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
8570       current.waswritten&=~(1<<rs1[i]);
8571
8572     /* Branch post-alloc */
8573     if(i>0)
8574     {
8575       current.was32=current.is32;
8576       current.wasdirty=current.dirty;
8577       switch(itype[i-1]) {
8578         case UJUMP:
8579           memcpy(&branch_regs[i-1],&current,sizeof(current));
8580           branch_regs[i-1].isconst=0;
8581           branch_regs[i-1].wasconst=0;
8582           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8583           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8584           alloc_cc(&branch_regs[i-1],i-1);
8585           dirty_reg(&branch_regs[i-1],CCREG);
8586           if(rt1[i-1]==31) { // JAL
8587             alloc_reg(&branch_regs[i-1],i-1,31);
8588             dirty_reg(&branch_regs[i-1],31);
8589             branch_regs[i-1].is32|=1LL<<31;
8590           }
8591           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8592           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8593           break;
8594         case RJUMP:
8595           memcpy(&branch_regs[i-1],&current,sizeof(current));
8596           branch_regs[i-1].isconst=0;
8597           branch_regs[i-1].wasconst=0;
8598           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8599           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8600           alloc_cc(&branch_regs[i-1],i-1);
8601           dirty_reg(&branch_regs[i-1],CCREG);
8602           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8603           if(rt1[i-1]!=0) { // JALR
8604             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
8605             dirty_reg(&branch_regs[i-1],rt1[i-1]);
8606             branch_regs[i-1].is32|=1LL<<rt1[i-1];
8607           }
8608           #ifdef USE_MINI_HT
8609           if(rs1[i-1]==31) { // JALR
8610             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8611             #ifndef HOST_IMM_ADDR32
8612             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8613             #endif
8614           }
8615           #endif
8616           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8617           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8618           break;
8619         case CJUMP:
8620           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8621           {
8622             alloc_cc(&current,i-1);
8623             dirty_reg(&current,CCREG);
8624             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8625                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8626               // The delay slot overwrote one of our conditions
8627               // Delay slot goes after the test (in order)
8628               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8629               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8630               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8631               current.u|=1;
8632               current.uu|=1;
8633               delayslot_alloc(&current,i);
8634               current.isconst=0;
8635             }
8636             else
8637             {
8638               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8639               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8640               // Alloc the branch condition registers
8641               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8642               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8643               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8644               {
8645                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8646                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8647               }
8648             }
8649             memcpy(&branch_regs[i-1],&current,sizeof(current));
8650             branch_regs[i-1].isconst=0;
8651             branch_regs[i-1].wasconst=0;
8652             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8653             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8654           }
8655           else
8656           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8657           {
8658             alloc_cc(&current,i-1);
8659             dirty_reg(&current,CCREG);
8660             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8661               // The delay slot overwrote the branch condition
8662               // Delay slot goes after the test (in order)
8663               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8664               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8665               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8666               current.u|=1;
8667               current.uu|=1;
8668               delayslot_alloc(&current,i);
8669               current.isconst=0;
8670             }
8671             else
8672             {
8673               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8674               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8675               // Alloc the branch condition register
8676               alloc_reg(&current,i-1,rs1[i-1]);
8677               if(!(current.is32>>rs1[i-1]&1))
8678               {
8679                 alloc_reg64(&current,i-1,rs1[i-1]);
8680               }
8681             }
8682             memcpy(&branch_regs[i-1],&current,sizeof(current));
8683             branch_regs[i-1].isconst=0;
8684             branch_regs[i-1].wasconst=0;
8685             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8686             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8687           }
8688           else
8689           // Alloc the delay slot in case the branch is taken
8690           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8691           {
8692             memcpy(&branch_regs[i-1],&current,sizeof(current));
8693             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8694             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8695             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8696             alloc_cc(&branch_regs[i-1],i);
8697             dirty_reg(&branch_regs[i-1],CCREG);
8698             delayslot_alloc(&branch_regs[i-1],i);
8699             branch_regs[i-1].isconst=0;
8700             alloc_reg(&current,i,CCREG); // Not taken path
8701             dirty_reg(&current,CCREG);
8702             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8703           }
8704           else
8705           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8706           {
8707             memcpy(&branch_regs[i-1],&current,sizeof(current));
8708             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8709             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8710             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8711             alloc_cc(&branch_regs[i-1],i);
8712             dirty_reg(&branch_regs[i-1],CCREG);
8713             delayslot_alloc(&branch_regs[i-1],i);
8714             branch_regs[i-1].isconst=0;
8715             alloc_reg(&current,i,CCREG); // Not taken path
8716             dirty_reg(&current,CCREG);
8717             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8718           }
8719           break;
8720         case SJUMP:
8721           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8722           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8723           {
8724             alloc_cc(&current,i-1);
8725             dirty_reg(&current,CCREG);
8726             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8727               // The delay slot overwrote the branch condition
8728               // Delay slot goes after the test (in order)
8729               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8730               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8731               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8732               current.u|=1;
8733               current.uu|=1;
8734               delayslot_alloc(&current,i);
8735               current.isconst=0;
8736             }
8737             else
8738             {
8739               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8740               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8741               // Alloc the branch condition register
8742               alloc_reg(&current,i-1,rs1[i-1]);
8743               if(!(current.is32>>rs1[i-1]&1))
8744               {
8745                 alloc_reg64(&current,i-1,rs1[i-1]);
8746               }
8747             }
8748             memcpy(&branch_regs[i-1],&current,sizeof(current));
8749             branch_regs[i-1].isconst=0;
8750             branch_regs[i-1].wasconst=0;
8751             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8752             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8753           }
8754           else
8755           // Alloc the delay slot in case the branch is taken
8756           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8757           {
8758             memcpy(&branch_regs[i-1],&current,sizeof(current));
8759             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8760             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8761             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8762             alloc_cc(&branch_regs[i-1],i);
8763             dirty_reg(&branch_regs[i-1],CCREG);
8764             delayslot_alloc(&branch_regs[i-1],i);
8765             branch_regs[i-1].isconst=0;
8766             alloc_reg(&current,i,CCREG); // Not taken path
8767             dirty_reg(&current,CCREG);
8768             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8769           }
8770           // FIXME: BLTZAL/BGEZAL
8771           if(opcode2[i-1]&0x10) { // BxxZAL
8772             alloc_reg(&branch_regs[i-1],i-1,31);
8773             dirty_reg(&branch_regs[i-1],31);
8774             branch_regs[i-1].is32|=1LL<<31;
8775           }
8776           break;
8777         case FJUMP:
8778           if(likely[i-1]==0) // BC1F/BC1T
8779           {
8780             alloc_cc(&current,i-1);
8781             dirty_reg(&current,CCREG);
8782             if(itype[i]==FCOMP) {
8783               // The delay slot overwrote the branch condition
8784               // Delay slot goes after the test (in order)
8785               delayslot_alloc(&current,i);
8786               current.isconst=0;
8787             }
8788             else
8789             {
8790               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8791               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8792               // Alloc the branch condition register
8793               alloc_reg(&current,i-1,FSREG);
8794             }
8795             memcpy(&branch_regs[i-1],&current,sizeof(current));
8796             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8797           }
8798           else // BC1FL/BC1TL
8799           {
8800             // Alloc the delay slot in case the branch is taken
8801             memcpy(&branch_regs[i-1],&current,sizeof(current));
8802             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8803             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8804             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8805             alloc_cc(&branch_regs[i-1],i);
8806             dirty_reg(&branch_regs[i-1],CCREG);
8807             delayslot_alloc(&branch_regs[i-1],i);
8808             branch_regs[i-1].isconst=0;
8809             alloc_reg(&current,i,CCREG); // Not taken path
8810             dirty_reg(&current,CCREG);
8811             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8812           }
8813           break;
8814       }
8815
8816       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
8817       {
8818         if(rt1[i-1]==31) // JAL/JALR
8819         {
8820           // Subroutine call will return here, don't alloc any registers
8821           current.is32=1;
8822           current.dirty=0;
8823           clear_all_regs(current.regmap);
8824           alloc_reg(&current,i,CCREG);
8825           dirty_reg(&current,CCREG);
8826         }
8827         else if(i+1<slen)
8828         {
8829           // Internal branch will jump here, match registers to caller
8830           current.is32=0x3FFFFFFFFLL;
8831           current.dirty=0;
8832           clear_all_regs(current.regmap);
8833           alloc_reg(&current,i,CCREG);
8834           dirty_reg(&current,CCREG);
8835           for(j=i-1;j>=0;j--)
8836           {
8837             if(ba[j]==start+i*4+4) {
8838               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
8839               current.is32=branch_regs[j].is32;
8840               current.dirty=branch_regs[j].dirty;
8841               break;
8842             }
8843           }
8844           while(j>=0) {
8845             if(ba[j]==start+i*4+4) {
8846               for(hr=0;hr<HOST_REGS;hr++) {
8847                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
8848                   current.regmap[hr]=-1;
8849                 }
8850                 current.is32&=branch_regs[j].is32;
8851                 current.dirty&=branch_regs[j].dirty;
8852               }
8853             }
8854             j--;
8855           }
8856         }
8857       }
8858     }
8859
8860     // Count cycles in between branches
8861     ccadj[i]=cc;
8862     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
8863     {
8864       cc=0;
8865     }
8866 #if !defined(DRC_DBG)
8867     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
8868     {
8869       // GTE runs in parallel until accessed, divide by 2 for a rough guess
8870       cc+=gte_cycletab[source[i]&0x3f]/2;
8871     }
8872     else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // load,store causes weird timing issues
8873     {
8874       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
8875     }
8876     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
8877     {
8878       cc+=4;
8879     }
8880     else if(itype[i]==C2LS)
8881     {
8882       cc+=4;
8883     }
8884 #endif
8885     else
8886     {
8887       cc++;
8888     }
8889
8890     flush_dirty_uppers(&current);
8891     if(!is_ds[i]) {
8892       regs[i].is32=current.is32;
8893       regs[i].dirty=current.dirty;
8894       regs[i].isconst=current.isconst;
8895       memcpy(constmap[i],current_constmap,sizeof(current_constmap));
8896     }
8897     for(hr=0;hr<HOST_REGS;hr++) {
8898       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
8899         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
8900           regs[i].wasconst&=~(1<<hr);
8901         }
8902       }
8903     }
8904     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
8905     regs[i].waswritten=current.waswritten;
8906   }
8907
8908   /* Pass 4 - Cull unused host registers */
8909
8910   uint64_t nr=0;
8911
8912   for (i=slen-1;i>=0;i--)
8913   {
8914     int hr;
8915     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8916     {
8917       if(ba[i]<start || ba[i]>=(start+slen*4))
8918       {
8919         // Branch out of this block, don't need anything
8920         nr=0;
8921       }
8922       else
8923       {
8924         // Internal branch
8925         // Need whatever matches the target
8926         nr=0;
8927         int t=(ba[i]-start)>>2;
8928         for(hr=0;hr<HOST_REGS;hr++)
8929         {
8930           if(regs[i].regmap_entry[hr]>=0) {
8931             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
8932           }
8933         }
8934       }
8935       // Conditional branch may need registers for following instructions
8936       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8937       {
8938         if(i<slen-2) {
8939           nr|=needed_reg[i+2];
8940           for(hr=0;hr<HOST_REGS;hr++)
8941           {
8942             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
8943             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
8944           }
8945         }
8946       }
8947       // Don't need stuff which is overwritten
8948       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8949       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8950       // Merge in delay slot
8951       for(hr=0;hr<HOST_REGS;hr++)
8952       {
8953         if(!likely[i]) {
8954           // These are overwritten unless the branch is "likely"
8955           // and the delay slot is nullified if not taken
8956           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8957           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8958         }
8959         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8960         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8961         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8962         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8963         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8964         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8965         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8966         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8967         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
8968           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8969           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8970         }
8971         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
8972           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8973           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8974         }
8975         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
8976           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8977           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8978         }
8979       }
8980     }
8981     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
8982     {
8983       // SYSCALL instruction (software interrupt)
8984       nr=0;
8985     }
8986     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
8987     {
8988       // ERET instruction (return from interrupt)
8989       nr=0;
8990     }
8991     else // Non-branch
8992     {
8993       if(i<slen-1) {
8994         for(hr=0;hr<HOST_REGS;hr++) {
8995           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
8996           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
8997           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8998           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8999         }
9000       }
9001     }
9002     for(hr=0;hr<HOST_REGS;hr++)
9003     {
9004       // Overwritten registers are not needed
9005       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9006       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9007       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9008       // Source registers are needed
9009       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9010       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9011       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9012       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9013       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9014       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9015       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9016       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9017       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9018         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9019         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9020       }
9021       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9022         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9023         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9024       }
9025       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
9026         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9027         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9028       }
9029       // Don't store a register immediately after writing it,
9030       // may prevent dual-issue.
9031       // But do so if this is a branch target, otherwise we
9032       // might have to load the register before the branch.
9033       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9034         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9035            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9036           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9037           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9038         }
9039         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9040            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9041           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9042           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9043         }
9044       }
9045     }
9046     // Cycle count is needed at branches.  Assume it is needed at the target too.
9047     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9048       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9049       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9050     }
9051     // Save it
9052     needed_reg[i]=nr;
9053
9054     // Deallocate unneeded registers
9055     for(hr=0;hr<HOST_REGS;hr++)
9056     {
9057       if(!((nr>>hr)&1)) {
9058         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9059         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9060            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9061            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9062         {
9063           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9064           {
9065             if(likely[i]) {
9066               regs[i].regmap[hr]=-1;
9067               regs[i].isconst&=~(1<<hr);
9068               if(i<slen-2) {
9069                 regmap_pre[i+2][hr]=-1;
9070                 regs[i+2].wasconst&=~(1<<hr);
9071               }
9072             }
9073           }
9074         }
9075         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9076         {
9077           int d1=0,d2=0,map=0,temp=0;
9078           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9079           {
9080             d1=dep1[i+1];
9081             d2=dep2[i+1];
9082           }
9083           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
9084              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9085             map=INVCP;
9086           }
9087           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9088              itype[i+1]==C1LS || itype[i+1]==C2LS)
9089             temp=FTEMP;
9090           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9091              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9092              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9093              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9094              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9095              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9096              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9097              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9098              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9099              regs[i].regmap[hr]!=map )
9100           {
9101             regs[i].regmap[hr]=-1;
9102             regs[i].isconst&=~(1<<hr);
9103             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9104                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9105                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9106                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9107                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9108                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9109                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9110                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9111                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9112                branch_regs[i].regmap[hr]!=map)
9113             {
9114               branch_regs[i].regmap[hr]=-1;
9115               branch_regs[i].regmap_entry[hr]=-1;
9116               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9117               {
9118                 if(!likely[i]&&i<slen-2) {
9119                   regmap_pre[i+2][hr]=-1;
9120                   regs[i+2].wasconst&=~(1<<hr);
9121                 }
9122               }
9123             }
9124           }
9125         }
9126         else
9127         {
9128           // Non-branch
9129           if(i>0)
9130           {
9131             int d1=0,d2=0,map=-1,temp=-1;
9132             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9133             {
9134               d1=dep1[i];
9135               d2=dep2[i];
9136             }
9137             if(itype[i]==STORE || itype[i]==STORELR ||
9138                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9139               map=INVCP;
9140             }
9141             if(itype[i]==LOADLR || itype[i]==STORELR ||
9142                itype[i]==C1LS || itype[i]==C2LS)
9143               temp=FTEMP;
9144             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9145                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9146                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9147                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9148                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9149                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9150             {
9151               if(i<slen-1&&!is_ds[i]) {
9152                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9153                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9154                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9155                 {
9156                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9157                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9158                 }
9159                 regmap_pre[i+1][hr]=-1;
9160                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9161                 regs[i+1].wasconst&=~(1<<hr);
9162               }
9163               regs[i].regmap[hr]=-1;
9164               regs[i].isconst&=~(1<<hr);
9165             }
9166           }
9167         }
9168       }
9169     }
9170   }
9171
9172   /* Pass 5 - Pre-allocate registers */
9173
9174   // If a register is allocated during a loop, try to allocate it for the
9175   // entire loop, if possible.  This avoids loading/storing registers
9176   // inside of the loop.
9177
9178   signed char f_regmap[HOST_REGS];
9179   clear_all_regs(f_regmap);
9180   for(i=0;i<slen-1;i++)
9181   {
9182     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9183     {
9184       if(ba[i]>=start && ba[i]<(start+i*4))
9185       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9186       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9187       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9188       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9189       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
9190       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
9191       {
9192         int t=(ba[i]-start)>>2;
9193         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9194         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
9195         for(hr=0;hr<HOST_REGS;hr++)
9196         {
9197           if(regs[i].regmap[hr]>64) {
9198             if(!((regs[i].dirty>>hr)&1))
9199               f_regmap[hr]=regs[i].regmap[hr];
9200             else f_regmap[hr]=-1;
9201           }
9202           else if(regs[i].regmap[hr]>=0) {
9203             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9204               // dealloc old register
9205               int n;
9206               for(n=0;n<HOST_REGS;n++)
9207               {
9208                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9209               }
9210               // and alloc new one
9211               f_regmap[hr]=regs[i].regmap[hr];
9212             }
9213           }
9214           if(branch_regs[i].regmap[hr]>64) {
9215             if(!((branch_regs[i].dirty>>hr)&1))
9216               f_regmap[hr]=branch_regs[i].regmap[hr];
9217             else f_regmap[hr]=-1;
9218           }
9219           else if(branch_regs[i].regmap[hr]>=0) {
9220             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
9221               // dealloc old register
9222               int n;
9223               for(n=0;n<HOST_REGS;n++)
9224               {
9225                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
9226               }
9227               // and alloc new one
9228               f_regmap[hr]=branch_regs[i].regmap[hr];
9229             }
9230           }
9231           if(ooo[i]) {
9232             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
9233               f_regmap[hr]=branch_regs[i].regmap[hr];
9234           }else{
9235             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
9236               f_regmap[hr]=branch_regs[i].regmap[hr];
9237           }
9238           // Avoid dirty->clean transition
9239           #ifdef DESTRUCTIVE_WRITEBACK
9240           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9241           #endif
9242           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
9243           // case above, however it's always a good idea.  We can't hoist the
9244           // load if the register was already allocated, so there's no point
9245           // wasting time analyzing most of these cases.  It only "succeeds"
9246           // when the mapping was different and the load can be replaced with
9247           // a mov, which is of negligible benefit.  So such cases are
9248           // skipped below.
9249           if(f_regmap[hr]>0) {
9250             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
9251               int r=f_regmap[hr];
9252               for(j=t;j<=i;j++)
9253               {
9254                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9255                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9256                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9257                 if(r>63) {
9258                   // NB This can exclude the case where the upper-half
9259                   // register is lower numbered than the lower-half
9260                   // register.  Not sure if it's worth fixing...
9261                   if(get_reg(regs[j].regmap,r&63)<0) break;
9262                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
9263                   if(regs[j].is32&(1LL<<(r&63))) break;
9264                 }
9265                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9266                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9267                   int k;
9268                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9269                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9270                     if(r>63) {
9271                       if(get_reg(regs[i].regmap,r&63)<0) break;
9272                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9273                     }
9274                     k=i;
9275                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9276                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9277                         //printf("no free regs for store %x\n",start+(k-1)*4);
9278                         break;
9279                       }
9280                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9281                         //printf("no-match due to different register\n");
9282                         break;
9283                       }
9284                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9285                         //printf("no-match due to branch\n");
9286                         break;
9287                       }
9288                       // call/ret fast path assumes no registers allocated
9289                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
9290                         break;
9291                       }
9292                       if(r>63) {
9293                         // NB This can exclude the case where the upper-half
9294                         // register is lower numbered than the lower-half
9295                         // register.  Not sure if it's worth fixing...
9296                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9297                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9298                       }
9299                       k--;
9300                     }
9301                     if(i<slen-1) {
9302                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9303                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9304                         //printf("bad match after branch\n");
9305                         break;
9306                       }
9307                     }
9308                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9309                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
9310                       while(k<i) {
9311                         regs[k].regmap_entry[hr]=f_regmap[hr];
9312                         regs[k].regmap[hr]=f_regmap[hr];
9313                         regmap_pre[k+1][hr]=f_regmap[hr];
9314                         regs[k].wasdirty&=~(1<<hr);
9315                         regs[k].dirty&=~(1<<hr);
9316                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9317                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9318                         regs[k].wasconst&=~(1<<hr);
9319                         regs[k].isconst&=~(1<<hr);
9320                         k++;
9321                       }
9322                     }
9323                     else {
9324                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9325                       break;
9326                     }
9327                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9328                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9329                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9330                       regs[i].regmap_entry[hr]=f_regmap[hr];
9331                       regs[i].regmap[hr]=f_regmap[hr];
9332                       regs[i].wasdirty&=~(1<<hr);
9333                       regs[i].dirty&=~(1<<hr);
9334                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9335                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9336                       regs[i].wasconst&=~(1<<hr);
9337                       regs[i].isconst&=~(1<<hr);
9338                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9339                       branch_regs[i].wasdirty&=~(1<<hr);
9340                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9341                       branch_regs[i].regmap[hr]=f_regmap[hr];
9342                       branch_regs[i].dirty&=~(1<<hr);
9343                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9344                       branch_regs[i].wasconst&=~(1<<hr);
9345                       branch_regs[i].isconst&=~(1<<hr);
9346                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9347                         regmap_pre[i+2][hr]=f_regmap[hr];
9348                         regs[i+2].wasdirty&=~(1<<hr);
9349                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9350                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9351                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9352                       }
9353                     }
9354                   }
9355                   for(k=t;k<j;k++) {
9356                     // Alloc register clean at beginning of loop,
9357                     // but may dirty it in pass 6
9358                     regs[k].regmap_entry[hr]=f_regmap[hr];
9359                     regs[k].regmap[hr]=f_regmap[hr];
9360                     regs[k].dirty&=~(1<<hr);
9361                     regs[k].wasconst&=~(1<<hr);
9362                     regs[k].isconst&=~(1<<hr);
9363                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
9364                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
9365                       branch_regs[k].regmap[hr]=f_regmap[hr];
9366                       branch_regs[k].dirty&=~(1<<hr);
9367                       branch_regs[k].wasconst&=~(1<<hr);
9368                       branch_regs[k].isconst&=~(1<<hr);
9369                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
9370                         regmap_pre[k+2][hr]=f_regmap[hr];
9371                         regs[k+2].wasdirty&=~(1<<hr);
9372                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
9373                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
9374                       }
9375                     }
9376                     else
9377                     {
9378                       regmap_pre[k+1][hr]=f_regmap[hr];
9379                       regs[k+1].wasdirty&=~(1<<hr);
9380                     }
9381                   }
9382                   if(regs[j].regmap[hr]==f_regmap[hr])
9383                     regs[j].regmap_entry[hr]=f_regmap[hr];
9384                   break;
9385                 }
9386                 if(j==i) break;
9387                 if(regs[j].regmap[hr]>=0)
9388                   break;
9389                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
9390                   //printf("no-match due to different register\n");
9391                   break;
9392                 }
9393                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
9394                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
9395                   break;
9396                 }
9397                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9398                 {
9399                   // Stop on unconditional branch
9400                   break;
9401                 }
9402                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
9403                 {
9404                   if(ooo[j]) {
9405                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
9406                       break;
9407                   }else{
9408                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
9409                       break;
9410                   }
9411                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
9412                     //printf("no-match due to different register (branch)\n");
9413                     break;
9414                   }
9415                 }
9416                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9417                   //printf("No free regs for store %x\n",start+j*4);
9418                   break;
9419                 }
9420                 if(f_regmap[hr]>=64) {
9421                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
9422                     break;
9423                   }
9424                   else
9425                   {
9426                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
9427                       break;
9428                     }
9429                   }
9430                 }
9431               }
9432             }
9433           }
9434         }
9435       }
9436     }else{
9437       // Non branch or undetermined branch target
9438       for(hr=0;hr<HOST_REGS;hr++)
9439       {
9440         if(hr!=EXCLUDE_REG) {
9441           if(regs[i].regmap[hr]>64) {
9442             if(!((regs[i].dirty>>hr)&1))
9443               f_regmap[hr]=regs[i].regmap[hr];
9444           }
9445           else if(regs[i].regmap[hr]>=0) {
9446             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9447               // dealloc old register
9448               int n;
9449               for(n=0;n<HOST_REGS;n++)
9450               {
9451                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9452               }
9453               // and alloc new one
9454               f_regmap[hr]=regs[i].regmap[hr];
9455             }
9456           }
9457         }
9458       }
9459       // Try to restore cycle count at branch targets
9460       if(bt[i]) {
9461         for(j=i;j<slen-1;j++) {
9462           if(regs[j].regmap[HOST_CCREG]!=-1) break;
9463           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9464             //printf("no free regs for store %x\n",start+j*4);
9465             break;
9466           }
9467         }
9468         if(regs[j].regmap[HOST_CCREG]==CCREG) {
9469           int k=i;
9470           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
9471           while(k<j) {
9472             regs[k].regmap_entry[HOST_CCREG]=CCREG;
9473             regs[k].regmap[HOST_CCREG]=CCREG;
9474             regmap_pre[k+1][HOST_CCREG]=CCREG;
9475             regs[k+1].wasdirty|=1<<HOST_CCREG;
9476             regs[k].dirty|=1<<HOST_CCREG;
9477             regs[k].wasconst&=~(1<<HOST_CCREG);
9478             regs[k].isconst&=~(1<<HOST_CCREG);
9479             k++;
9480           }
9481           regs[j].regmap_entry[HOST_CCREG]=CCREG;
9482         }
9483         // Work backwards from the branch target
9484         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9485         {
9486           //printf("Extend backwards\n");
9487           int k;
9488           k=i;
9489           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9490             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9491               //printf("no free regs for store %x\n",start+(k-1)*4);
9492               break;
9493             }
9494             k--;
9495           }
9496           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9497             //printf("Extend CC, %x ->\n",start+k*4);
9498             while(k<=i) {
9499               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9500               regs[k].regmap[HOST_CCREG]=CCREG;
9501               regmap_pre[k+1][HOST_CCREG]=CCREG;
9502               regs[k+1].wasdirty|=1<<HOST_CCREG;
9503               regs[k].dirty|=1<<HOST_CCREG;
9504               regs[k].wasconst&=~(1<<HOST_CCREG);
9505               regs[k].isconst&=~(1<<HOST_CCREG);
9506               k++;
9507             }
9508           }
9509           else {
9510             //printf("Fail Extend CC, %x ->\n",start+k*4);
9511           }
9512         }
9513       }
9514       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9515          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9516          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9517          itype[i]!=FCONV&&itype[i]!=FCOMP)
9518       {
9519         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9520       }
9521     }
9522   }
9523
9524   // Cache memory offset or tlb map pointer if a register is available
9525   #ifndef HOST_IMM_ADDR32
9526   #ifndef RAM_OFFSET
9527   if(0)
9528   #endif
9529   {
9530     int earliest_available[HOST_REGS];
9531     int loop_start[HOST_REGS];
9532     int score[HOST_REGS];
9533     int end[HOST_REGS];
9534     int reg=ROREG;
9535
9536     // Init
9537     for(hr=0;hr<HOST_REGS;hr++) {
9538       score[hr]=0;earliest_available[hr]=0;
9539       loop_start[hr]=MAXBLOCK;
9540     }
9541     for(i=0;i<slen-1;i++)
9542     {
9543       // Can't do anything if no registers are available
9544       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
9545         for(hr=0;hr<HOST_REGS;hr++) {
9546           score[hr]=0;earliest_available[hr]=i+1;
9547           loop_start[hr]=MAXBLOCK;
9548         }
9549       }
9550       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9551         if(!ooo[i]) {
9552           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
9553             for(hr=0;hr<HOST_REGS;hr++) {
9554               score[hr]=0;earliest_available[hr]=i+1;
9555               loop_start[hr]=MAXBLOCK;
9556             }
9557           }
9558         }else{
9559           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
9560             for(hr=0;hr<HOST_REGS;hr++) {
9561               score[hr]=0;earliest_available[hr]=i+1;
9562               loop_start[hr]=MAXBLOCK;
9563             }
9564           }
9565         }
9566       }
9567       // Mark unavailable registers
9568       for(hr=0;hr<HOST_REGS;hr++) {
9569         if(regs[i].regmap[hr]>=0) {
9570           score[hr]=0;earliest_available[hr]=i+1;
9571           loop_start[hr]=MAXBLOCK;
9572         }
9573         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9574           if(branch_regs[i].regmap[hr]>=0) {
9575             score[hr]=0;earliest_available[hr]=i+2;
9576             loop_start[hr]=MAXBLOCK;
9577           }
9578         }
9579       }
9580       // No register allocations after unconditional jumps
9581       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
9582       {
9583         for(hr=0;hr<HOST_REGS;hr++) {
9584           score[hr]=0;earliest_available[hr]=i+2;
9585           loop_start[hr]=MAXBLOCK;
9586         }
9587         i++; // Skip delay slot too
9588         //printf("skip delay slot: %x\n",start+i*4);
9589       }
9590       else
9591       // Possible match
9592       if(itype[i]==LOAD||itype[i]==LOADLR||
9593          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
9594         for(hr=0;hr<HOST_REGS;hr++) {
9595           if(hr!=EXCLUDE_REG) {
9596             end[hr]=i-1;
9597             for(j=i;j<slen-1;j++) {
9598               if(regs[j].regmap[hr]>=0) break;
9599               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9600                 if(branch_regs[j].regmap[hr]>=0) break;
9601                 if(ooo[j]) {
9602                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
9603                 }else{
9604                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
9605                 }
9606               }
9607               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
9608               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9609                 int t=(ba[j]-start)>>2;
9610                 if(t<j&&t>=earliest_available[hr]) {
9611                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
9612                     // Score a point for hoisting loop invariant
9613                     if(t<loop_start[hr]) loop_start[hr]=t;
9614                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
9615                     score[hr]++;
9616                     end[hr]=j;
9617                   }
9618                 }
9619                 else if(t<j) {
9620                   if(regs[t].regmap[hr]==reg) {
9621                     // Score a point if the branch target matches this register
9622                     score[hr]++;
9623                     end[hr]=j;
9624                   }
9625                 }
9626                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
9627                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
9628                   score[hr]++;
9629                   end[hr]=j;
9630                 }
9631               }
9632               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9633               {
9634                 // Stop on unconditional branch
9635                 break;
9636               }
9637               else
9638               if(itype[j]==LOAD||itype[j]==LOADLR||
9639                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
9640                 score[hr]++;
9641                 end[hr]=j;
9642               }
9643             }
9644           }
9645         }
9646         // Find highest score and allocate that register
9647         int maxscore=0;
9648         for(hr=0;hr<HOST_REGS;hr++) {
9649           if(hr!=EXCLUDE_REG) {
9650             if(score[hr]>score[maxscore]) {
9651               maxscore=hr;
9652               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
9653             }
9654           }
9655         }
9656         if(score[maxscore]>1)
9657         {
9658           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
9659           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
9660             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
9661             assert(regs[j].regmap[maxscore]<0);
9662             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
9663             regs[j].regmap[maxscore]=reg;
9664             regs[j].dirty&=~(1<<maxscore);
9665             regs[j].wasconst&=~(1<<maxscore);
9666             regs[j].isconst&=~(1<<maxscore);
9667             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9668               branch_regs[j].regmap[maxscore]=reg;
9669               branch_regs[j].wasdirty&=~(1<<maxscore);
9670               branch_regs[j].dirty&=~(1<<maxscore);
9671               branch_regs[j].wasconst&=~(1<<maxscore);
9672               branch_regs[j].isconst&=~(1<<maxscore);
9673               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
9674                 regmap_pre[j+2][maxscore]=reg;
9675                 regs[j+2].wasdirty&=~(1<<maxscore);
9676               }
9677               // loop optimization (loop_preload)
9678               int t=(ba[j]-start)>>2;
9679               if(t==loop_start[maxscore]) {
9680                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
9681                   regs[t].regmap_entry[maxscore]=reg;
9682               }
9683             }
9684             else
9685             {
9686               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
9687                 regmap_pre[j+1][maxscore]=reg;
9688                 regs[j+1].wasdirty&=~(1<<maxscore);
9689               }
9690             }
9691           }
9692           i=j-1;
9693           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
9694           for(hr=0;hr<HOST_REGS;hr++) {
9695             score[hr]=0;earliest_available[hr]=i+i;
9696             loop_start[hr]=MAXBLOCK;
9697           }
9698         }
9699       }
9700     }
9701   }
9702   #endif
9703
9704   // This allocates registers (if possible) one instruction prior
9705   // to use, which can avoid a load-use penalty on certain CPUs.
9706   for(i=0;i<slen-1;i++)
9707   {
9708     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9709     {
9710       if(!bt[i+1])
9711       {
9712         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
9713            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
9714         {
9715           if(rs1[i+1]) {
9716             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9717             {
9718               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9719               {
9720                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9721                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9722                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9723                 regs[i].isconst&=~(1<<hr);
9724                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9725                 constmap[i][hr]=constmap[i+1][hr];
9726                 regs[i+1].wasdirty&=~(1<<hr);
9727                 regs[i].dirty&=~(1<<hr);
9728               }
9729             }
9730           }
9731           if(rs2[i+1]) {
9732             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
9733             {
9734               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9735               {
9736                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9737                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9738                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9739                 regs[i].isconst&=~(1<<hr);
9740                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9741                 constmap[i][hr]=constmap[i+1][hr];
9742                 regs[i+1].wasdirty&=~(1<<hr);
9743                 regs[i].dirty&=~(1<<hr);
9744               }
9745             }
9746           }
9747           // Preload target address for load instruction (non-constant)
9748           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9749             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9750             {
9751               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9752               {
9753                 regs[i].regmap[hr]=rs1[i+1];
9754                 regmap_pre[i+1][hr]=rs1[i+1];
9755                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9756                 regs[i].isconst&=~(1<<hr);
9757                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9758                 constmap[i][hr]=constmap[i+1][hr];
9759                 regs[i+1].wasdirty&=~(1<<hr);
9760                 regs[i].dirty&=~(1<<hr);
9761               }
9762             }
9763           }
9764           // Load source into target register
9765           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9766             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9767             {
9768               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9769               {
9770                 regs[i].regmap[hr]=rs1[i+1];
9771                 regmap_pre[i+1][hr]=rs1[i+1];
9772                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9773                 regs[i].isconst&=~(1<<hr);
9774                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9775                 constmap[i][hr]=constmap[i+1][hr];
9776                 regs[i+1].wasdirty&=~(1<<hr);
9777                 regs[i].dirty&=~(1<<hr);
9778               }
9779             }
9780           }
9781           // Address for store instruction (non-constant)
9782           if(itype[i+1]==STORE||itype[i+1]==STORELR
9783              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
9784             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9785               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
9786               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9787               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
9788               assert(hr>=0);
9789               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9790               {
9791                 regs[i].regmap[hr]=rs1[i+1];
9792                 regmap_pre[i+1][hr]=rs1[i+1];
9793                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9794                 regs[i].isconst&=~(1<<hr);
9795                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9796                 constmap[i][hr]=constmap[i+1][hr];
9797                 regs[i+1].wasdirty&=~(1<<hr);
9798                 regs[i].dirty&=~(1<<hr);
9799               }
9800             }
9801           }
9802           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
9803             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9804               int nr;
9805               hr=get_reg(regs[i+1].regmap,FTEMP);
9806               assert(hr>=0);
9807               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9808               {
9809                 regs[i].regmap[hr]=rs1[i+1];
9810                 regmap_pre[i+1][hr]=rs1[i+1];
9811                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9812                 regs[i].isconst&=~(1<<hr);
9813                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9814                 constmap[i][hr]=constmap[i+1][hr];
9815                 regs[i+1].wasdirty&=~(1<<hr);
9816                 regs[i].dirty&=~(1<<hr);
9817               }
9818               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9819               {
9820                 // move it to another register
9821                 regs[i+1].regmap[hr]=-1;
9822                 regmap_pre[i+2][hr]=-1;
9823                 regs[i+1].regmap[nr]=FTEMP;
9824                 regmap_pre[i+2][nr]=FTEMP;
9825                 regs[i].regmap[nr]=rs1[i+1];
9826                 regmap_pre[i+1][nr]=rs1[i+1];
9827                 regs[i+1].regmap_entry[nr]=rs1[i+1];
9828                 regs[i].isconst&=~(1<<nr);
9829                 regs[i+1].isconst&=~(1<<nr);
9830                 regs[i].dirty&=~(1<<nr);
9831                 regs[i+1].wasdirty&=~(1<<nr);
9832                 regs[i+1].dirty&=~(1<<nr);
9833                 regs[i+2].wasdirty&=~(1<<nr);
9834               }
9835             }
9836           }
9837           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||||itype[i+1]==C2LS*/) {
9838             if(itype[i+1]==LOAD)
9839               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
9840             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
9841               hr=get_reg(regs[i+1].regmap,FTEMP);
9842             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
9843               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
9844               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9845             }
9846             if(hr>=0&&regs[i].regmap[hr]<0) {
9847               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
9848               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
9849                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
9850                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
9851                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
9852                 regs[i].isconst&=~(1<<hr);
9853                 regs[i+1].wasdirty&=~(1<<hr);
9854                 regs[i].dirty&=~(1<<hr);
9855               }
9856             }
9857           }
9858         }
9859       }
9860     }
9861   }
9862
9863   /* Pass 6 - Optimize clean/dirty state */
9864   clean_registers(0,slen-1,1);
9865
9866   /* Pass 7 - Identify 32-bit registers */
9867   for (i=slen-1;i>=0;i--)
9868   {
9869     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9870     {
9871       // Conditional branch
9872       if((source[i]>>16)!=0x1000&&i<slen-2) {
9873         // Mark this address as a branch target since it may be called
9874         // upon return from interrupt
9875         bt[i+2]=1;
9876       }
9877     }
9878   }
9879
9880   if(itype[slen-1]==SPAN) {
9881     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
9882   }
9883
9884 #ifdef DISASM
9885   /* Debug/disassembly */
9886   for(i=0;i<slen;i++)
9887   {
9888     printf("U:");
9889     int r;
9890     for(r=1;r<=CCREG;r++) {
9891       if((unneeded_reg[i]>>r)&1) {
9892         if(r==HIREG) printf(" HI");
9893         else if(r==LOREG) printf(" LO");
9894         else printf(" r%d",r);
9895       }
9896     }
9897     printf("\n");
9898     #if defined(__i386__) || defined(__x86_64__)
9899     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
9900     #endif
9901     #ifdef __arm__
9902     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
9903     #endif
9904     printf("needs: ");
9905     if(needed_reg[i]&1) printf("eax ");
9906     if((needed_reg[i]>>1)&1) printf("ecx ");
9907     if((needed_reg[i]>>2)&1) printf("edx ");
9908     if((needed_reg[i]>>3)&1) printf("ebx ");
9909     if((needed_reg[i]>>5)&1) printf("ebp ");
9910     if((needed_reg[i]>>6)&1) printf("esi ");
9911     if((needed_reg[i]>>7)&1) printf("edi ");
9912     printf("\n");
9913     #if defined(__i386__) || defined(__x86_64__)
9914     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
9915     printf("dirty: ");
9916     if(regs[i].wasdirty&1) printf("eax ");
9917     if((regs[i].wasdirty>>1)&1) printf("ecx ");
9918     if((regs[i].wasdirty>>2)&1) printf("edx ");
9919     if((regs[i].wasdirty>>3)&1) printf("ebx ");
9920     if((regs[i].wasdirty>>5)&1) printf("ebp ");
9921     if((regs[i].wasdirty>>6)&1) printf("esi ");
9922     if((regs[i].wasdirty>>7)&1) printf("edi ");
9923     #endif
9924     #ifdef __arm__
9925     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
9926     printf("dirty: ");
9927     if(regs[i].wasdirty&1) printf("r0 ");
9928     if((regs[i].wasdirty>>1)&1) printf("r1 ");
9929     if((regs[i].wasdirty>>2)&1) printf("r2 ");
9930     if((regs[i].wasdirty>>3)&1) printf("r3 ");
9931     if((regs[i].wasdirty>>4)&1) printf("r4 ");
9932     if((regs[i].wasdirty>>5)&1) printf("r5 ");
9933     if((regs[i].wasdirty>>6)&1) printf("r6 ");
9934     if((regs[i].wasdirty>>7)&1) printf("r7 ");
9935     if((regs[i].wasdirty>>8)&1) printf("r8 ");
9936     if((regs[i].wasdirty>>9)&1) printf("r9 ");
9937     if((regs[i].wasdirty>>10)&1) printf("r10 ");
9938     if((regs[i].wasdirty>>12)&1) printf("r12 ");
9939     #endif
9940     printf("\n");
9941     disassemble_inst(i);
9942     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
9943     #if defined(__i386__) || defined(__x86_64__)
9944     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
9945     if(regs[i].dirty&1) printf("eax ");
9946     if((regs[i].dirty>>1)&1) printf("ecx ");
9947     if((regs[i].dirty>>2)&1) printf("edx ");
9948     if((regs[i].dirty>>3)&1) printf("ebx ");
9949     if((regs[i].dirty>>5)&1) printf("ebp ");
9950     if((regs[i].dirty>>6)&1) printf("esi ");
9951     if((regs[i].dirty>>7)&1) printf("edi ");
9952     #endif
9953     #ifdef __arm__
9954     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
9955     if(regs[i].dirty&1) printf("r0 ");
9956     if((regs[i].dirty>>1)&1) printf("r1 ");
9957     if((regs[i].dirty>>2)&1) printf("r2 ");
9958     if((regs[i].dirty>>3)&1) printf("r3 ");
9959     if((regs[i].dirty>>4)&1) printf("r4 ");
9960     if((regs[i].dirty>>5)&1) printf("r5 ");
9961     if((regs[i].dirty>>6)&1) printf("r6 ");
9962     if((regs[i].dirty>>7)&1) printf("r7 ");
9963     if((regs[i].dirty>>8)&1) printf("r8 ");
9964     if((regs[i].dirty>>9)&1) printf("r9 ");
9965     if((regs[i].dirty>>10)&1) printf("r10 ");
9966     if((regs[i].dirty>>12)&1) printf("r12 ");
9967     #endif
9968     printf("\n");
9969     if(regs[i].isconst) {
9970       printf("constants: ");
9971       #if defined(__i386__) || defined(__x86_64__)
9972       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
9973       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
9974       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
9975       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
9976       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
9977       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
9978       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
9979       #endif
9980       #ifdef __arm__
9981       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
9982       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
9983       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
9984       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
9985       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
9986       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
9987       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
9988       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
9989       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
9990       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
9991       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
9992       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
9993       #endif
9994       printf("\n");
9995     }
9996     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9997       #if defined(__i386__) || defined(__x86_64__)
9998       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
9999       if(branch_regs[i].dirty&1) printf("eax ");
10000       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
10001       if((branch_regs[i].dirty>>2)&1) printf("edx ");
10002       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
10003       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
10004       if((branch_regs[i].dirty>>6)&1) printf("esi ");
10005       if((branch_regs[i].dirty>>7)&1) printf("edi ");
10006       #endif
10007       #ifdef __arm__
10008       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
10009       if(branch_regs[i].dirty&1) printf("r0 ");
10010       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
10011       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
10012       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
10013       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
10014       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
10015       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
10016       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
10017       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
10018       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
10019       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
10020       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
10021       #endif
10022     }
10023   }
10024 #endif // DISASM
10025
10026   /* Pass 8 - Assembly */
10027   linkcount=0;stubcount=0;
10028   ds=0;is_delayslot=0;
10029   cop1_usable=0;
10030   uint64_t is32_pre=0;
10031   u_int dirty_pre=0;
10032   void *beginning=start_block();
10033   if((u_int)addr&1) {
10034     ds=1;
10035     pagespan_ds();
10036   }
10037   u_int instr_addr0_override=0;
10038
10039   if (start == 0x80030000) {
10040     // nasty hack for fastbios thing
10041     // override block entry to this code
10042     instr_addr0_override=(u_int)out;
10043     emit_movimm(start,0);
10044     // abuse io address var as a flag that we
10045     // have already returned here once
10046     emit_readword((int)&address,1);
10047     emit_writeword(0,(int)&pcaddr);
10048     emit_writeword(0,(int)&address);
10049     emit_cmp(0,1);
10050     emit_jne((int)new_dyna_leave);
10051   }
10052   for(i=0;i<slen;i++)
10053   {
10054     //if(ds) printf("ds: ");
10055     disassemble_inst(i);
10056     if(ds) {
10057       ds=0; // Skip delay slot
10058       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
10059       instr_addr[i]=0;
10060     } else {
10061       speculate_register_values(i);
10062       #ifndef DESTRUCTIVE_WRITEBACK
10063       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10064       {
10065         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
10066               unneeded_reg[i],unneeded_reg_upper[i]);
10067       }
10068       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
10069         is32_pre=branch_regs[i].is32;
10070         dirty_pre=branch_regs[i].dirty;
10071       }else{
10072         is32_pre=regs[i].is32;
10073         dirty_pre=regs[i].dirty;
10074       }
10075       #endif
10076       // write back
10077       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10078       {
10079         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
10080                       unneeded_reg[i],unneeded_reg_upper[i]);
10081         loop_preload(regmap_pre[i],regs[i].regmap_entry);
10082       }
10083       // branch target entry point
10084       instr_addr[i]=(u_int)out;
10085       assem_debug("<->\n");
10086       // load regs
10087       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
10088         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
10089       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
10090       address_generation(i,&regs[i],regs[i].regmap_entry);
10091       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
10092       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10093       {
10094         // Load the delay slot registers if necessary
10095         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
10096           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10097         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
10098           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10099         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
10100           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10101       }
10102       else if(i+1<slen)
10103       {
10104         // Preload registers for following instruction
10105         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10106           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
10107             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10108         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10109           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
10110             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10111       }
10112       // TODO: if(is_ooo(i)) address_generation(i+1);
10113       if(itype[i]==CJUMP||itype[i]==FJUMP)
10114         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
10115       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
10116         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10117       if(bt[i]) cop1_usable=0;
10118       // assemble
10119       switch(itype[i]) {
10120         case ALU:
10121           alu_assemble(i,&regs[i]);break;
10122         case IMM16:
10123           imm16_assemble(i,&regs[i]);break;
10124         case SHIFT:
10125           shift_assemble(i,&regs[i]);break;
10126         case SHIFTIMM:
10127           shiftimm_assemble(i,&regs[i]);break;
10128         case LOAD:
10129           load_assemble(i,&regs[i]);break;
10130         case LOADLR:
10131           loadlr_assemble(i,&regs[i]);break;
10132         case STORE:
10133           store_assemble(i,&regs[i]);break;
10134         case STORELR:
10135           storelr_assemble(i,&regs[i]);break;
10136         case COP0:
10137           cop0_assemble(i,&regs[i]);break;
10138         case COP1:
10139           cop1_assemble(i,&regs[i]);break;
10140         case C1LS:
10141           c1ls_assemble(i,&regs[i]);break;
10142         case COP2:
10143           cop2_assemble(i,&regs[i]);break;
10144         case C2LS:
10145           c2ls_assemble(i,&regs[i]);break;
10146         case C2OP:
10147           c2op_assemble(i,&regs[i]);break;
10148         case FCONV:
10149           fconv_assemble(i,&regs[i]);break;
10150         case FLOAT:
10151           float_assemble(i,&regs[i]);break;
10152         case FCOMP:
10153           fcomp_assemble(i,&regs[i]);break;
10154         case MULTDIV:
10155           multdiv_assemble(i,&regs[i]);break;
10156         case MOV:
10157           mov_assemble(i,&regs[i]);break;
10158         case SYSCALL:
10159           syscall_assemble(i,&regs[i]);break;
10160         case HLECALL:
10161           hlecall_assemble(i,&regs[i]);break;
10162         case INTCALL:
10163           intcall_assemble(i,&regs[i]);break;
10164         case UJUMP:
10165           ujump_assemble(i,&regs[i]);ds=1;break;
10166         case RJUMP:
10167           rjump_assemble(i,&regs[i]);ds=1;break;
10168         case CJUMP:
10169           cjump_assemble(i,&regs[i]);ds=1;break;
10170         case SJUMP:
10171           sjump_assemble(i,&regs[i]);ds=1;break;
10172         case FJUMP:
10173           fjump_assemble(i,&regs[i]);ds=1;break;
10174         case SPAN:
10175           pagespan_assemble(i,&regs[i]);break;
10176       }
10177       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10178         literal_pool(1024);
10179       else
10180         literal_pool_jumpover(256);
10181     }
10182   }
10183   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10184   // If the block did not end with an unconditional branch,
10185   // add a jump to the next instruction.
10186   if(i>1) {
10187     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10188       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10189       assert(i==slen);
10190       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10191         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10192         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10193           emit_loadreg(CCREG,HOST_CCREG);
10194         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10195       }
10196       else if(!likely[i-2])
10197       {
10198         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10199         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10200       }
10201       else
10202       {
10203         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10204         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10205       }
10206       add_to_linker((int)out,start+i*4,0);
10207       emit_jmp(0);
10208     }
10209   }
10210   else
10211   {
10212     assert(i>0);
10213     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10214     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10215     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10216       emit_loadreg(CCREG,HOST_CCREG);
10217     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10218     add_to_linker((int)out,start+i*4,0);
10219     emit_jmp(0);
10220   }
10221
10222   // TODO: delay slot stubs?
10223   // Stubs
10224   for(i=0;i<stubcount;i++)
10225   {
10226     switch(stubs[i][0])
10227     {
10228       case LOADB_STUB:
10229       case LOADH_STUB:
10230       case LOADW_STUB:
10231       case LOADD_STUB:
10232       case LOADBU_STUB:
10233       case LOADHU_STUB:
10234         do_readstub(i);break;
10235       case STOREB_STUB:
10236       case STOREH_STUB:
10237       case STOREW_STUB:
10238       case STORED_STUB:
10239         do_writestub(i);break;
10240       case CC_STUB:
10241         do_ccstub(i);break;
10242       case INVCODE_STUB:
10243         do_invstub(i);break;
10244       case FP_STUB:
10245         do_cop1stub(i);break;
10246       case STORELR_STUB:
10247         do_unalignedwritestub(i);break;
10248     }
10249   }
10250
10251   if (instr_addr0_override)
10252     instr_addr[0] = instr_addr0_override;
10253
10254   /* Pass 9 - Linker */
10255   for(i=0;i<linkcount;i++)
10256   {
10257     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10258     literal_pool(64);
10259     if(!link_addr[i][2])
10260     {
10261       void *stub=out;
10262       void *addr=check_addr(link_addr[i][1]);
10263       emit_extjump(link_addr[i][0],link_addr[i][1]);
10264       if(addr) {
10265         set_jump_target(link_addr[i][0],(int)addr);
10266         add_link(link_addr[i][1],stub);
10267       }
10268       else set_jump_target(link_addr[i][0],(int)stub);
10269     }
10270     else
10271     {
10272       // Internal branch
10273       int target=(link_addr[i][1]-start)>>2;
10274       assert(target>=0&&target<slen);
10275       assert(instr_addr[target]);
10276       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10277       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
10278       //#else
10279       set_jump_target(link_addr[i][0],instr_addr[target]);
10280       //#endif
10281     }
10282   }
10283   // External Branch Targets (jump_in)
10284   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
10285   for(i=0;i<slen;i++)
10286   {
10287     if(bt[i]||i==0)
10288     {
10289       if(instr_addr[i]) // TODO - delay slots (=null)
10290       {
10291         u_int vaddr=start+i*4;
10292         u_int page=get_page(vaddr);
10293         u_int vpage=get_vpage(vaddr);
10294         literal_pool(256);
10295         {
10296           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10297           assem_debug("jump_in: %x\n",start+i*4);
10298           ll_add(jump_dirty+vpage,vaddr,(void *)out);
10299           int entry_point=do_dirty_stub(i);
10300           ll_add_flags(jump_in+page,vaddr,state_rflags,(void *)entry_point);
10301           // If there was an existing entry in the hash table,
10302           // replace it with the new address.
10303           // Don't add new entries.  We'll insert the
10304           // ones that actually get used in check_addr().
10305           u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
10306           if(ht_bin[0]==vaddr) {
10307             ht_bin[1]=entry_point;
10308           }
10309           if(ht_bin[2]==vaddr) {
10310             ht_bin[3]=entry_point;
10311           }
10312         }
10313       }
10314     }
10315   }
10316   // Write out the literal pool if necessary
10317   literal_pool(0);
10318   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10319   // Align code
10320   if(((u_int)out)&7) emit_addnop(13);
10321   #endif
10322   assert((u_int)out-(u_int)beginning<MAX_OUTPUT_BLOCK_SIZE);
10323   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
10324   memcpy(copy,source,slen*4);
10325   copy+=slen*4;
10326
10327   end_block(beginning);
10328
10329   // If we're within 256K of the end of the buffer,
10330   // start over from the beginning. (Is 256K enough?)
10331   if((u_int)out>(u_int)BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
10332
10333   // Trap writes to any of the pages we compiled
10334   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
10335     invalid_code[i]=0;
10336   }
10337   inv_code_start=inv_code_end=~0;
10338
10339   // for PCSX we need to mark all mirrors too
10340   if(get_page(start)<(RAM_SIZE>>12))
10341     for(i=start>>12;i<=(start+slen*4)>>12;i++)
10342       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
10343       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
10344       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
10345
10346   /* Pass 10 - Free memory by expiring oldest blocks */
10347
10348   int end=((((int)out-(int)BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
10349   while(expirep!=end)
10350   {
10351     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
10352     int base=(int)BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
10353     inv_debug("EXP: Phase %d\n",expirep);
10354     switch((expirep>>11)&3)
10355     {
10356       case 0:
10357         // Clear jump_in and jump_dirty
10358         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
10359         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
10360         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
10361         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
10362         break;
10363       case 1:
10364         // Clear pointers
10365         ll_kill_pointers(jump_out[expirep&2047],base,shift);
10366         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
10367         break;
10368       case 2:
10369         // Clear hash table
10370         for(i=0;i<32;i++) {
10371           u_int *ht_bin=hash_table[((expirep&2047)<<5)+i];
10372           if((ht_bin[3]>>shift)==(base>>shift) ||
10373              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10374             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
10375             ht_bin[2]=ht_bin[3]=-1;
10376           }
10377           if((ht_bin[1]>>shift)==(base>>shift) ||
10378              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10379             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
10380             ht_bin[0]=ht_bin[2];
10381             ht_bin[1]=ht_bin[3];
10382             ht_bin[2]=ht_bin[3]=-1;
10383           }
10384         }
10385         break;
10386       case 3:
10387         // Clear jump_out
10388         #ifdef __arm__
10389         if((expirep&2047)==0)
10390           do_clear_cache();
10391         #endif
10392         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
10393         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
10394         break;
10395     }
10396     expirep=(expirep+1)&65535;
10397   }
10398   return 0;
10399 }
10400
10401 // vim:shiftwidth=2:expandtab