1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
#include <string.h> // memset()/strerror() are used below; may also be pulled in transitively
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 int getVMBlock();
36 #endif
37
38 #include "new_dynarec_config.h"
39 #include "backends/psx/emu_if.h" //emulator interface
40
41 //#define DISASM
42 //#define assem_debug printf
43 //#define inv_debug printf
44 #define assem_debug(...)
45 #define inv_debug(...)
46
47 #ifdef __i386__
48 #include "x86/assem_x86.h"
49 #endif
50 #ifdef __x86_64__
51 #include "x64/assem_x64.h"
52 #endif
53 #ifdef __arm__
54 #include "arm/assem_arm.h"
55 #endif
56
57 #ifdef VITA
58 int _newlib_vm_size_user = 1 << TARGET_SIZE_2;
59 #endif
60
61 #define MAXBLOCK 4096
62 #define MAX_OUTPUT_BLOCK_SIZE 262144
63
64 struct regstat
65 {
66   signed char regmap_entry[HOST_REGS];
67   signed char regmap[HOST_REGS];
68   uint64_t was32;
69   uint64_t is32;
70   uint64_t wasdirty;
71   uint64_t dirty;
72   uint64_t u;
73   uint64_t uu;
74   u_int wasconst;
75   u_int isconst;
76   u_int loadedconst;             // host regs that have constants loaded
77   u_int waswritten;              // MIPS regs that were used as store base before
78 };
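/* Field semantics (inferred from the uses below): was32/is32, u/uu and
 * waswritten are bitmasks indexed by MIPS register number, while
 * wasdirty/dirty, wasconst/isconst and loadedconst are indexed by host
 * register - e.g. dirty_reg() sets dirty|=1<<hr for every host register
 * currently mapped to the given MIPS register. */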
79
80 // note: asm depends on this layout
81 struct ll_entry
82 {
83   u_int vaddr;
84   u_int reg_sv_flags;
85   void *addr;
86   struct ll_entry *next;
87 };
88
89   // used by asm:
90   u_char *out;
91   u_int hash_table[65536][4]  __attribute__((aligned(16)));
92   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
93   struct ll_entry *jump_dirty[4096];
94
95   static struct ll_entry *jump_out[4096];
96   static u_int start;
97   static u_int *source;
98   static char insn[MAXBLOCK][10];
99   static u_char itype[MAXBLOCK];
100   static u_char opcode[MAXBLOCK];
101   static u_char opcode2[MAXBLOCK];
102   static u_char bt[MAXBLOCK];
103   static u_char rs1[MAXBLOCK];
104   static u_char rs2[MAXBLOCK];
105   static u_char rt1[MAXBLOCK];
106   static u_char rt2[MAXBLOCK];
107   static u_char us1[MAXBLOCK];
108   static u_char us2[MAXBLOCK];
109   static u_char dep1[MAXBLOCK];
110   static u_char dep2[MAXBLOCK];
111   static u_char lt1[MAXBLOCK];
112   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
113   static uint64_t gte_rt[MAXBLOCK];
114   static uint64_t gte_unneeded[MAXBLOCK];
115   static u_int smrv[32]; // speculated MIPS register values
116   static u_int smrv_strong; // mask of regs that are likely to have correct values
117   static u_int smrv_weak; // same, but somewhat less likely
118   static u_int smrv_strong_next; // same, but after current insn executes
119   static u_int smrv_weak_next;
120   static int imm[MAXBLOCK];
121   static u_int ba[MAXBLOCK];
122   static char likely[MAXBLOCK];
123   static char is_ds[MAXBLOCK];
124   static char ooo[MAXBLOCK];
125   static uint64_t unneeded_reg[MAXBLOCK];
126   static uint64_t unneeded_reg_upper[MAXBLOCK];
127   static uint64_t branch_unneeded_reg[MAXBLOCK];
128   static uint64_t branch_unneeded_reg_upper[MAXBLOCK];
129   static signed char regmap_pre[MAXBLOCK][HOST_REGS];
130   static uint64_t current_constmap[HOST_REGS];
131   static uint64_t constmap[MAXBLOCK][HOST_REGS];
132   static struct regstat regs[MAXBLOCK];
133   static struct regstat branch_regs[MAXBLOCK];
134   static signed char minimum_free_regs[MAXBLOCK];
135   static u_int needed_reg[MAXBLOCK];
136   static u_int wont_dirty[MAXBLOCK];
137   static u_int will_dirty[MAXBLOCK];
138   static int ccadj[MAXBLOCK];
139   static int slen;
140   static u_int instr_addr[MAXBLOCK];
141   static u_int link_addr[MAXBLOCK][3];
142   static int linkcount;
143   static u_int stubs[MAXBLOCK*3][8];
144   static int stubcount;
145   static u_int literals[1024][2];
146   static int literalcount;
147   static int is_delayslot;
148   static int cop1_usable;
149   static char shadow[1048576]  __attribute__((aligned(16)));
150   static void *copy;
151   static int expirep;
152   static u_int stop_after_jal;
153 #ifndef RAM_FIXED
154   static u_int ram_offset;
155 #else
156   static const u_int ram_offset=0;
157 #endif
158
159   int new_dynarec_hacks;
160   int new_dynarec_did_compile;
161   extern u_char restore_candidate[512];
162   extern int cycle_count;
163
164   /* registers that may be allocated */
165   /* 1-31 gpr */
166 #define HIREG 32 // hi
167 #define LOREG 33 // lo
168 #define FSREG 34 // FPU status (FCSR)
169 #define CSREG 35 // Coprocessor status
170 #define CCREG 36 // Cycle count
171 #define INVCP 37 // Pointer to invalid_code
172 //#define MMREG 38 // Pointer to memory_map
173 #define ROREG 39 // ram offset (if rdram!=0x80000000)
174 #define TEMPREG 40
175 #define FTEMP 40 // FPU temporary register
176 #define PTEMP 41 // Prefetch temporary register
177 //#define TLREG 42 // TLB mapping offset
178 #define RHASH 43 // Return address hash
179 #define RHTBL 44 // Return address hash table address
180 #define RTEMP 45 // JR/JALR address register
181 #define MAXREG 45
182 #define AGEN1 46 // Address generation temporary register
183 //#define AGEN2 47 // Address generation temporary register
184 //#define MGEN1 48 // Maptable address generation temporary register
185 //#define MGEN2 49 // Maptable address generation temporary register
186 #define BTREG 50 // Branch target temporary register
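/* How these numbers are consumed (illustrative sketch, not part of the
 * original source): regmap[] arrays map each host register to one of the ids
 * above or to 0-31 for MIPS GPRs, with -1 meaning the host register is free
 * and |64 marking the upper half of a 64-bit value.  The hypothetical helper
 * below shows a typical lookup, assuming HOST_REGS/EXCLUDE_REG from the
 * target's assem_*.h: */
#if 0
static int example_find_cycle_count_reg(signed char regmap[])
{
  int hr;
  for (hr = 0; hr < HOST_REGS; hr++)
    if (hr != EXCLUDE_REG && regmap[hr] == CCREG)
      return hr;  // host register currently caching the cycle count
  return -1;      // CCREG is not resident in any host register
}
#endif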
187
188   /* instruction types */
189 #define NOP 0     // No operation
190 #define LOAD 1    // Load
191 #define STORE 2   // Store
192 #define LOADLR 3  // Unaligned load
193 #define STORELR 4 // Unaligned store
194 #define MOV 5     // Move
195 #define ALU 6     // Arithmetic/logic
196 #define MULTDIV 7 // Multiply/divide
197 #define SHIFT 8   // Shift by register
198 #define SHIFTIMM 9 // Shift by immediate
199 #define IMM16 10  // 16-bit immediate
200 #define RJUMP 11  // Unconditional jump to register
201 #define UJUMP 12  // Unconditional jump
202 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
203 #define SJUMP 14  // Conditional branch (regimm format)
204 #define COP0 15   // Coprocessor 0
205 #define COP1 16   // Coprocessor 1
206 #define C1LS 17   // Coprocessor 1 load/store
207 #define FJUMP 18  // Conditional branch (floating point)
208 #define FLOAT 19  // Floating point unit
209 #define FCONV 20  // Convert integer to float
210 #define FCOMP 21  // Floating point compare (sets FSREG)
211 #define SYSCALL 22 // SYSCALL
212 #define OTHER 23  // Other
213 #define SPAN 24   // Branch/delay slot spans 2 pages
214 #define NI 25     // Not implemented
215 #define HLECALL 26 // PCSX fake opcodes for HLE
216 #define COP2 27   // Coprocessor 2 move
217 #define C2LS 28   // Coprocessor 2 load/store
218 #define C2OP 29   // Coprocessor 2 operation
219 #define INTCALL 30 // Call interpreter to handle rare corner cases
220
221   /* stubs */
222 #define CC_STUB 1
223 #define FP_STUB 2
224 #define LOADB_STUB 3
225 #define LOADH_STUB 4
226 #define LOADW_STUB 5
227 #define LOADD_STUB 6
228 #define LOADBU_STUB 7
229 #define LOADHU_STUB 8
230 #define STOREB_STUB 9
231 #define STOREH_STUB 10
232 #define STOREW_STUB 11
233 #define STORED_STUB 12
234 #define STORELR_STUB 13
235 #define INVCODE_STUB 14
236
237   /* branch codes */
238 #define TAKEN 1
239 #define NOTTAKEN 2
240 #define NULLDS 3
241
242 // asm linkage
243 int new_recompile_block(int addr);
244 void *get_addr_ht(u_int vaddr);
245 void invalidate_block(u_int block);
246 void invalidate_addr(u_int addr);
247 void remove_hash(int vaddr);
248 void dyna_linker();
249 void dyna_linker_ds();
250 void verify_code();
251 void verify_code_vm();
252 void verify_code_ds();
253 void cc_interrupt();
254 void fp_exception();
255 void fp_exception_ds();
256 void jump_syscall_hle();
257 void jump_hlecall();
258 void jump_intcall();
259 void new_dyna_leave();
260
261 // Needed by assembler
262 static void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
263 static void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
264 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
265 static void load_all_regs(signed char i_regmap[]);
266 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
267 static void load_regs_entry(int t);
268 static void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
269
270 static int verify_dirty(u_int *ptr);
271 static int get_final_value(int hr, int i, int *value);
272 static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e);
273 static void add_to_linker(int addr,int target,int ext);
274
275 static int tracedebug=0;
276
277 static void mprotect_w_x(void *start, void *end, int is_x)
278 {
279 #ifdef NO_WRITE_EXEC
280   #if defined(VITA)
281   // *Open* enables write on all memory that was
282   // allocated by sceKernelAllocMemBlockForVM()?
283   if (is_x)
284     sceKernelCloseVMDomain();
285   else
286     sceKernelOpenVMDomain();
287   #else
288   u_long mstart = (u_long)start & ~4095ul;
289   u_long mend = (u_long)end;
290   if (mprotect((void *)mstart, mend - mstart,
291                PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE)) != 0)
292     SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
293   #endif
294 #endif
295 }
296
297 static void start_tcache_write(void *start, void *end)
298 {
299   mprotect_w_x(start, end, 0);
300 }
301
302 static void end_tcache_write(void *start, void *end)
303 {
304 #ifdef __arm__
305   size_t len = (char *)end - (char *)start;
306   #if   defined(__BLACKBERRY_QNX__)
307   msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
308   #elif defined(__MACH__)
309   sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
310   #elif defined(VITA)
311   sceKernelSyncVMDomain(sceBlock, start, len);
312   #elif defined(_3DS)
313   ctr_flush_invalidate_cache();
314   #else
315   __clear_cache(start, end);
316   #endif
317   (void)len;
318 #endif
319
320   mprotect_w_x(start, end, 1);
321 }
322
323 static void *start_block(void)
324 {
325   u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
326   if (end > (u_char *)BASE_ADDR + (1<<TARGET_SIZE_2))
327     end = (u_char *)BASE_ADDR + (1<<TARGET_SIZE_2);
328   start_tcache_write(out, end);
329   return out;
330 }
331
332 static void end_block(void *start)
333 {
334   end_tcache_write(start, out);
335 }
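/* Typical write cycle for the translation cache (a minimal sketch of how the
 * recompiler is expected to drive the helpers above; example_emit_block is
 * hypothetical): */
#if 0
static void example_emit_block(void)
{
  void *beginning = start_block();  // make the output region writable (W^X off)
  /* ... emit host code here, advancing the global 'out' pointer ... */
  end_block(beginning);             // sync/flush icache, restore execute permission
}
#endif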
336
337 //#define DEBUG_CYCLE_COUNT 1
338
339 #define NO_CYCLE_PENALTY_THR 12
340
341 int cycle_multiplier; // 100 for 1.0
342
343 static int CLOCK_ADJUST(int x)
344 {
345   int s=(x>>31)|1;
346   return (x * cycle_multiplier + s * 50) / 100;
347 }
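/* Worked example of the rounding above: s is -1 for negative x and +1
 * otherwise, so the +/-50 rounds the scaled value to the nearest cycle
 * symmetrically around zero.  With cycle_multiplier==150 (a 1.5x clock):
 *   CLOCK_ADJUST(7)  == ( 7*150 + 50)/100 ==  11
 *   CLOCK_ADJUST(-7) == (-7*150 - 50)/100 == -11
 * and with cycle_multiplier==100 it is the identity. */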
348
349 static u_int get_page(u_int vaddr)
350 {
351   u_int page=vaddr&~0xe0000000;
352   if (page < 0x1000000)
353     page &= ~0x0e00000; // RAM mirrors
354   page>>=12;
355   if(page>2048) page=2048+(page&2047);
356   return page;
357 }
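/* Example mappings produced by the masking above (for reference):
 *   0x80030000 -> 0x00030000 -> page 0x030   (cached RAM)
 *   0xa0030000 -> 0x00030000 -> page 0x030   (uncached mirror, same page)
 *   0x1f801000 -> 0x1f801    -> 2048 + (0x1f801 & 2047) = 2049
 * i.e. RAM and its mirrors land on the low pages, while everything else is
 * folded into pages 2048..4095, matching the 4096-entry jump_in/jump_out
 * tables. */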
358
359 // no virtual mem in PCSX
360 static u_int get_vpage(u_int vaddr)
361 {
362   return get_page(vaddr);
363 }
364
365 // Get address from virtual address
366 // This is called from the recompiled JR/JALR instructions
367 void *get_addr(u_int vaddr)
368 {
369   struct ll_entry *head = NULL;
370   u_int page            = get_page(vaddr);
371   u_int vpage           = get_vpage(vaddr);
372   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
373   head=jump_in[page];
374   while(head!=NULL)
375   {
376     if(head->vaddr==vaddr)
377     {
378       //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
379       u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
380       ht_bin[3]=ht_bin[1];
381       ht_bin[2]=ht_bin[0];
382       ht_bin[1]=(u_int)head->addr;
383       ht_bin[0]=vaddr;
384       return head->addr;
385     }
386     head=head->next;
387   }
388   head=jump_dirty[vpage];
389   while(head!=NULL)
390   {
391     if(head->vaddr==vaddr)
392     {
393       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
394       // Don't restore blocks which are about to expire from the cache
395       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
396         if(verify_dirty(head->addr))
397         {
398           //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
399           invalid_code[vaddr>>12]=0;
400           inv_code_start=inv_code_end=~0;
401           if(vpage<2048)
402           {
403             restore_candidate[vpage>>3]|=1<<(vpage&7);
404           }
405           else
406           {
407             restore_candidate[page>>3]|=1<<(page&7);
408           }
409           u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
410
411           if(ht_bin[0]==vaddr)
412             ht_bin[1]=(u_int)head->addr; // Replace existing entry
413           else
414           {
415             ht_bin[3]=ht_bin[1];
416             ht_bin[2]=ht_bin[0];
417             ht_bin[1]=(int)head->addr;
418             ht_bin[0]=vaddr;
419           }
420           return head->addr;
421         }
422     }
423     head=head->next;
424   }
425   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
426   int r=new_recompile_block(vaddr);
427   if(r==0)
428     return get_addr(vaddr);
429   // Execute in unmapped page, generate pagefault exception
430   Status|=2;
431   Cause=(vaddr<<31)|0x8;
432   EPC=(vaddr&1)?vaddr-5:vaddr;
433   BadVAddr=(vaddr&~1);
434   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
435   EntryHi=BadVAddr&0xFFFFE000;
436   return get_addr_ht(0x80000000);
437 }
438
439 // Look up address in hash table first
440 void *get_addr_ht(u_int vaddr)
441 {
442   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
443   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
444   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
445   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
446   return get_addr(vaddr);
447 }
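/* Layout of a hash bin, as used by get_addr()/get_addr_ht() above: each of the
 * 65536 bins holds two (vaddr, host address) pairs and acts as a tiny 2-way
 * cache - [0]/[1] are the most recently used pair, [2]/[3] the runner-up.
 * Installing a new translation shifts the old MRU pair down (hypothetical
 * helper shown only to make the slot roles explicit): */
#if 0
static void example_ht_insert(u_int vaddr, void *addr)
{
  u_int *ht_bin = hash_table[((vaddr >> 16) ^ vaddr) & 0xFFFF];
  ht_bin[3] = ht_bin[1];    // demote the previous most-recent pair
  ht_bin[2] = ht_bin[0];
  ht_bin[1] = (u_int)addr;  // new entry becomes the most recently used
  ht_bin[0] = vaddr;
}
#endif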
448
449 void clear_all_regs(signed char regmap[])
450 {
451   int hr;
452   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
453 }
454
455 signed char get_reg(signed char regmap[],int r)
456 {
457   int hr;
458   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
459   return -1;
460 }
461
462 // Find a register that is available for two consecutive cycles
463 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
464 {
465   int hr;
466   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
467   return -1;
468 }
469
470 int count_free_regs(signed char regmap[])
471 {
472   int count=0;
473   int hr;
474   for(hr=0;hr<HOST_REGS;hr++)
475   {
476     if(hr!=EXCLUDE_REG) {
477       if(regmap[hr]<0) count++;
478     }
479   }
480   return count;
481 }
482
483 void dirty_reg(struct regstat *cur,signed char reg)
484 {
485   int hr;
486   if(!reg) return;
487   for (hr=0;hr<HOST_REGS;hr++) {
488     if((cur->regmap[hr]&63)==reg) {
489       cur->dirty|=1<<hr;
490     }
491   }
492 }
493
494 // If we dirty the lower half of a 64 bit register which is now being
495 // sign-extended, we need to dump the upper half.
496 // Note: Do this only after completion of the instruction, because
497 // some instructions may need to read the full 64-bit value even if
498 // overwriting it (eg SLTI, DSRA32).
499 static void flush_dirty_uppers(struct regstat *cur)
500 {
501   int hr,reg;
502   for (hr=0;hr<HOST_REGS;hr++) {
503     if((cur->dirty>>hr)&1) {
504       reg=cur->regmap[hr];
505       if(reg>=64)
506         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
507     }
508   }
509 }
510
511 void set_const(struct regstat *cur,signed char reg,uint64_t value)
512 {
513   int hr;
514   if(!reg) return;
515   for (hr=0;hr<HOST_REGS;hr++) {
516     if(cur->regmap[hr]==reg) {
517       cur->isconst|=1<<hr;
518       current_constmap[hr]=value;
519     }
520     else if((cur->regmap[hr]^64)==reg) {
521       cur->isconst|=1<<hr;
522       current_constmap[hr]=value>>32;
523     }
524   }
525 }
526
527 void clear_const(struct regstat *cur,signed char reg)
528 {
529   int hr;
530   if(!reg) return;
531   for (hr=0;hr<HOST_REGS;hr++) {
532     if((cur->regmap[hr]&63)==reg) {
533       cur->isconst&=~(1<<hr);
534     }
535   }
536 }
537
538 int is_const(struct regstat *cur,signed char reg)
539 {
540   int hr;
541   if(reg<0) return 0;
542   if(!reg) return 1;
543   for (hr=0;hr<HOST_REGS;hr++) {
544     if((cur->regmap[hr]&63)==reg) {
545       return (cur->isconst>>hr)&1;
546     }
547   }
548   return 0;
549 }
550 uint64_t get_const(struct regstat *cur,signed char reg)
551 {
552   int hr;
553   if(!reg) return 0;
554   for (hr=0;hr<HOST_REGS;hr++) {
555     if(cur->regmap[hr]==reg) {
556       return current_constmap[hr];
557     }
558   }
559   SysPrintf("Unknown constant in r%d\n",reg);
560   exit(1);
561 }
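/* Usage sketch for the constant-propagation helpers above (illustrative only;
 * example_const_usage is not part of the dynarec).  Constants are tracked per
 * host register, and the upper 32 bits of a 64-bit constant live in the host
 * register mapped to reg^64: */
#if 0
static void example_const_usage(struct regstat *cur)
{
  set_const(cur, 2, 0x1234);         // $v0 (r2) now holds a known value
  if (is_const(cur, 2)) {
    uint64_t v = get_const(cur, 2);  // v == 0x1234
    set_const(cur, 3, v + 8);        // e.g. fold an ADDIU into $v1 at compile time
  }
  clear_const(cur, 2);               // a load or ALU result makes it unknown again
}
#endif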
562
563 // Least soon needed registers
564 // Look at the next ten instructions and see which registers
565 // will be used.  Try not to reallocate these.
566 void lsn(u_char hsn[], int i, int *preferred_reg)
567 {
568   int j;
569   int b=-1;
570   for(j=0;j<9;j++)
571   {
572     if(i+j>=slen) {
573       j=slen-i-1;
574       break;
575     }
576     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
577     {
578       // Don't go past an unconditional jump
579       j++;
580       break;
581     }
582   }
583   for(;j>=0;j--)
584   {
585     if(rs1[i+j]) hsn[rs1[i+j]]=j;
586     if(rs2[i+j]) hsn[rs2[i+j]]=j;
587     if(rt1[i+j]) hsn[rt1[i+j]]=j;
588     if(rt2[i+j]) hsn[rt2[i+j]]=j;
589     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
590       // Stores can allocate zero
591       hsn[rs1[i+j]]=j;
592       hsn[rs2[i+j]]=j;
593     }
594     // On some architectures stores need invc_ptr
595     #if defined(HOST_IMM8)
596     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
597       hsn[INVCP]=j;
598     }
599     #endif
600     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
601     {
602       hsn[CCREG]=j;
603       b=j;
604     }
605   }
606   if(b>=0)
607   {
608     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
609     {
610       // Follow first branch
611       int t=(ba[i+b]-start)>>2;
612       j=7-b;if(t+j>=slen) j=slen-t-1;
613       for(;j>=0;j--)
614       {
615         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
616         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
617         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
618         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
619       }
620     }
621     // TODO: preferred register based on backward branch
622   }
623   // Delay slot should preferably not overwrite branch conditions or cycle count
624   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
625     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
626     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
627     hsn[CCREG]=1;
628     // ...or hash tables
629     hsn[RHASH]=1;
630     hsn[RHTBL]=1;
631   }
632   // Coprocessor load/store needs FTEMP, even if not declared
633   if(itype[i]==C1LS||itype[i]==C2LS) {
634     hsn[FTEMP]=0;
635   }
636   // Load L/R also uses FTEMP as a temporary register
637   if(itype[i]==LOADLR) {
638     hsn[FTEMP]=0;
639   }
640   // Also SWL/SWR/SDL/SDR
641   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
642     hsn[FTEMP]=0;
643   }
644   // Don't remove the miniht registers
645   if(itype[i]==UJUMP||itype[i]==RJUMP)
646   {
647     hsn[RHASH]=0;
648     hsn[RHTBL]=0;
649   }
650 }
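/* What the hsn[] scores mean, judging from the assignments above: hsn[r] is
 * roughly the distance in instructions to the next use of MIPS register r, so
 * a score of 0 must not be evicted and larger values are cheaper to spill.
 * A victim-selection pass built on top of it might look like the hypothetical
 * sketch below: */
#if 0
static int example_pick_victim(const u_char hsn[], const signed char regmap[])
{
  int hr, r, best = -1, best_score = -1;
  for (hr = 0; hr < HOST_REGS; hr++) {
    if (hr == EXCLUDE_REG || regmap[hr] < 0) continue;  // already free
    r = regmap[hr] & 63;
    if (r > MAXREG) continue;                           // temporaries handled elsewhere
    if (hsn[r] > best_score) {
      best_score = hsn[r];
      best = hr;                                        // least soon needed so far
    }
  }
  return best;
}
#endif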
651
652 // We only want to allocate registers if we're going to use them again soon
653 int needed_again(int r, int i)
654 {
655   int j;
656   int b=-1;
657   int rn=10;
658
659   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
660   {
661     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
662       return 0; // Don't need any registers if exiting the block
663   }
664   for(j=0;j<9;j++)
665   {
666     if(i+j>=slen) {
667       j=slen-i-1;
668       break;
669     }
670     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
671     {
672       // Don't go past an unconditional jump
673       j++;
674       break;
675     }
676     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
677     {
678       break;
679     }
680   }
681   for(;j>=1;j--)
682   {
683     if(rs1[i+j]==r) rn=j;
684     if(rs2[i+j]==r) rn=j;
685     if((unneeded_reg[i+j]>>r)&1) rn=10;
686     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
687     {
688       b=j;
689     }
690   }
691   /*
692   if(b>=0)
693   {
694     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
695     {
696       // Follow first branch
697       int o=rn;
698       int t=(ba[i+b]-start)>>2;
699       j=7-b;if(t+j>=slen) j=slen-t-1;
700       for(;j>=0;j--)
701       {
702         if(!((unneeded_reg[t+j]>>r)&1)) {
703           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
704           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
705         }
706         else rn=o;
707       }
708     }
709   }*/
710   if(rn<10) return 1;
711   (void)b;
712   return 0;
713 }
714
715 // Try to match register allocations at the end of a loop with those
716 // at the beginning
717 int loop_reg(int i, int r, int hr)
718 {
719   int j,k;
720   for(j=0;j<9;j++)
721   {
722     if(i+j>=slen) {
723       j=slen-i-1;
724       break;
725     }
726     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
727     {
728       // Don't go past an unconditional jump
729       j++;
730       break;
731     }
732   }
733   k=0;
734   if(i>0){
735     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
736       k--;
737   }
738   for(;k<j;k++)
739   {
740     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
741     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
742     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
743     {
744       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
745       {
746         int t=(ba[i+k]-start)>>2;
747         int reg=get_reg(regs[t].regmap_entry,r);
748         if(reg>=0) return reg;
749         //reg=get_reg(regs[t+1].regmap_entry,r);
750         //if(reg>=0) return reg;
751       }
752     }
753   }
754   return hr;
755 }
756
757
758 // Allocate every register, preserving source/target regs
759 void alloc_all(struct regstat *cur,int i)
760 {
761   int hr;
762
763   for(hr=0;hr<HOST_REGS;hr++) {
764     if(hr!=EXCLUDE_REG) {
765       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
766          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
767       {
768         cur->regmap[hr]=-1;
769         cur->dirty&=~(1<<hr);
770       }
771       // Don't need zeros
772       if((cur->regmap[hr]&63)==0)
773       {
774         cur->regmap[hr]=-1;
775         cur->dirty&=~(1<<hr);
776       }
777     }
778   }
779 }
780
781 #ifdef __i386__
782 #include "x86/assem_x86.c"
783 #endif
784 #ifdef __x86_64__
785 #include "x64/assem_x64.c"
786 #endif
787 #ifdef __arm__
788 #include "arm/assem_arm.c"
789 #endif
790
791 // Add virtual address mapping to linked list
792 void ll_add(struct ll_entry **head,int vaddr,void *addr)
793 {
794   struct ll_entry *new_entry;
795   new_entry=malloc(sizeof(struct ll_entry));
796   assert(new_entry!=NULL);
797   new_entry->vaddr=vaddr;
798   new_entry->reg_sv_flags=0;
799   new_entry->addr=addr;
800   new_entry->next=*head;
801   *head=new_entry;
802 }
803
804 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
805 {
806   ll_add(head,vaddr,addr);
807   (*head)->reg_sv_flags=reg_sv_flags;
808 }
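/* Usage sketch (hypothetical helper, mirroring how the recompiler publishes a
 * freshly compiled block so that get_addr() can find it): */
#if 0
static void example_publish_block(u_int vaddr, void *compiled_addr)
{
  u_int page = get_page(vaddr);
  ll_add(jump_in + page, vaddr, compiled_addr);  // now reachable via the jump_in list
}
#endif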
809
810 // Check if an address is already compiled
811 // but don't return addresses which are about to expire from the cache
812 void *check_addr(u_int vaddr)
813 {
814   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
815   if(ht_bin[0]==vaddr) {
816     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
817       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
818   }
819   if(ht_bin[2]==vaddr) {
820     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
821       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
822   }
823   u_int page=get_page(vaddr);
824   struct ll_entry *head;
825   head=jump_in[page];
826   while(head!=NULL) {
827     if(head->vaddr==vaddr) {
828       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
829         // Update existing entry with current address
830         if(ht_bin[0]==vaddr) {
831           ht_bin[1]=(int)head->addr;
832           return head->addr;
833         }
834         if(ht_bin[2]==vaddr) {
835           ht_bin[3]=(int)head->addr;
836           return head->addr;
837         }
838         // Insert into hash table with low priority.
839         // Don't evict existing entries, as they are probably
840         // addresses that are being accessed frequently.
841         if(ht_bin[0]==-1) {
842           ht_bin[1]=(int)head->addr;
843           ht_bin[0]=vaddr;
844         }else if(ht_bin[2]==-1) {
845           ht_bin[3]=(int)head->addr;
846           ht_bin[2]=vaddr;
847         }
848         return head->addr;
849       }
850     }
851     head=head->next;
852   }
853   return 0;
854 }
855
856 void remove_hash(int vaddr)
857 {
858   //printf("remove hash: %x\n",vaddr);
859   u_int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
860   if(ht_bin[2]==vaddr) {
861     ht_bin[2]=ht_bin[3]=-1;
862   }
863   if(ht_bin[0]==vaddr) {
864     ht_bin[0]=ht_bin[2];
865     ht_bin[1]=ht_bin[3];
866     ht_bin[2]=ht_bin[3]=-1;
867   }
868 }
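/* Worked example of the removal above: starting from a bin holding
 *   [0]=A [1]=codeA [2]=B [3]=codeB
 * remove_hash(B) yields [0]=A [1]=codeA [2]=-1 [3]=-1, while remove_hash(A)
 * promotes the second pair: [0]=B [1]=codeB [2]=-1 [3]=-1. */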
869
870 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
871 {
872   struct ll_entry *next;
873   while(*head) {
874     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) ||
875        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
876     {
877       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
878       remove_hash((*head)->vaddr);
879       next=(*head)->next;
880       free(*head);
881       *head=next;
882     }
883     else
884     {
885       head=&((*head)->next);
886     }
887   }
888 }
889
890 // Remove all entries from linked list
891 void ll_clear(struct ll_entry **head)
892 {
893   struct ll_entry *cur;
894   struct ll_entry *next;
895   if((cur=*head)) {
896     *head=0;
897     while(cur) {
898       next=cur->next;
899       free(cur);
900       cur=next;
901     }
902   }
903 }
904
905 // Dereference the stored pointers and unlink any jump that points into the given range
906 static void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
907 {
908   while(head) {
909     int ptr=get_pointer(head->addr);
910     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
911     if(((ptr>>shift)==(addr>>shift)) ||
912        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
913     {
914       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
915       void *host_addr=find_extjump_insn(head->addr);
916       #ifdef __arm__
917         mark_clear_cache(host_addr);
918       #endif
919       set_jump_target((int)host_addr,(int)head->addr);
920     }
921     head=head->next;
922   }
923 }
924
925 // This is called when we write to a compiled block (see do_invstub)
926 void invalidate_page(u_int page)
927 {
928   struct ll_entry *head;
929   struct ll_entry *next;
930   head=jump_in[page];
931   jump_in[page]=0;
932   while(head!=NULL) {
933     inv_debug("INVALIDATE: %x\n",head->vaddr);
934     remove_hash(head->vaddr);
935     next=head->next;
936     free(head);
937     head=next;
938   }
939   head=jump_out[page];
940   jump_out[page]=0;
941   while(head!=NULL) {
942     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
943     void *host_addr=find_extjump_insn(head->addr);
944     #ifdef __arm__
945       mark_clear_cache(host_addr);
946     #endif
947     set_jump_target((int)host_addr,(int)head->addr);
948     next=head->next;
949     free(head);
950     head=next;
951   }
952 }
953
954 static void invalidate_block_range(u_int block, u_int first, u_int last)
955 {
956   u_int page=get_page(block<<12);
957   //printf("first=%d last=%d\n",first,last);
958   invalidate_page(page);
959   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
960   assert(last<page+5);
961   // Invalidate the adjacent pages if a block crosses a 4K boundary
962   while(first<page)
963   {
964     invalidate_page(first);
965     first++;
966   }
967   for(first=page+1;first<last;first++)
968   {
969     invalidate_page(first);
970   }
971
972 #ifdef __arm__
973   do_clear_cache();
974 #endif
975
976   // Don't trap writes
977   invalid_code[block]=1;
978
979 #ifdef USE_MINI_HT
980   memset(mini_ht,-1,sizeof(mini_ht));
981 #endif
982 }
983
984 void invalidate_block(u_int block)
985 {
986   u_int page=get_page(block<<12);
987   u_int vpage=get_vpage(block<<12);
988   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
989   u_int first,last;
990   first=last=page;
991   struct ll_entry *head;
992   head=jump_dirty[vpage];
993   //printf("page=%d vpage=%d\n",page,vpage);
994   while(head!=NULL)
995   {
996     u_int start,end;
997     if(vpage>2047||(head->vaddr>>12)==block)
998     { // Ignore vaddr hash collision
999       get_bounds((int)head->addr,&start,&end);
1000       //printf("start: %x end: %x\n",start,end);
1001       if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE)
1002       {
1003         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page)
1004         {
1005           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1006           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1007         }
1008       }
1009     }
1010     head=head->next;
1011   }
1012   invalidate_block_range(block,first,last);
1013 }
1014
1015 void invalidate_addr(u_int addr)
1016 {
1017   //static int rhits;
1018   // this check is done by the caller
1019   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
1020   u_int page=get_vpage(addr);
1021   if(page<2048) { // RAM
1022     struct ll_entry *head;
1023     u_int addr_min=~0, addr_max=0;
1024     u_int mask=RAM_SIZE-1;
1025     u_int addr_main=0x80000000|(addr&mask);
1026     int pg1;
1027     inv_code_start=addr_main&~0xfff;
1028     inv_code_end=addr_main|0xfff;
1029     pg1=page;
1030     if (pg1>0) {
1031       // must check previous page too because of spans..
1032       pg1--;
1033       inv_code_start-=0x1000;
1034     }
1035     for(;pg1<=page;pg1++) {
1036       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
1037         u_int start,end;
1038         get_bounds((int)head->addr,&start,&end);
1039         if(ram_offset) {
1040           start-=ram_offset;
1041           end-=ram_offset;
1042         }
1043         if(start<=addr_main&&addr_main<end) {
1044           if(start<addr_min) addr_min=start;
1045           if(end>addr_max) addr_max=end;
1046         }
1047         else if(addr_main<start) {
1048           if(start<inv_code_end)
1049             inv_code_end=start-1;
1050         }
1051         else {
1052           if(end>inv_code_start)
1053             inv_code_start=end;
1054         }
1055       }
1056     }
1057     if (addr_min!=~0) {
1058       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
1059       inv_code_start=inv_code_end=~0;
1060       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
1061       return;
1062     }
1063     else {
1064       inv_code_start=(addr&~mask)|(inv_code_start&mask);
1065       inv_code_end=(addr&~mask)|(inv_code_end&mask);
1066       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
1067       return;
1068     }
1069   }
1070   invalidate_block(addr>>12);
1071 }
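/* inv_code_start/inv_code_end cache a range of addresses known to contain no
 * compiled code, letting the memory handlers skip invalidate_addr() for
 * repeated writes into the same region (this is the caller-side check that the
 * comment at the top of invalidate_addr() refers to).  A sketch of that check,
 * with example_write_notify being hypothetical: */
#if 0
static void example_write_notify(u_int addr)
{
  if (inv_code_start <= addr && addr <= inv_code_end)
    return;               // nothing compiled here, nothing to invalidate
  invalidate_addr(addr);  // otherwise do the full per-page scan above
}
#endif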
1072
1073 // This is called when loading a save state.
1074 // Anything could have changed, so invalidate everything.
1075 void invalidate_all_pages(void)
1076 {
1077   u_int page;
1078   for(page=0;page<4096;page++)
1079     invalidate_page(page);
1080   for(page=0;page<1048576;page++)
1081   {
1082     if(!invalid_code[page])
1083     {
1084       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1085       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1086     }
1087   }
1088
1089 #ifdef USE_MINI_HT
1090   memset(mini_ht,-1,sizeof(mini_ht));
1091 #endif
1092 }
1093
1094 // Add an entry to jump_out after making a link
1095 void add_link(u_int vaddr,void *src)
1096 {
1097   u_int page=get_page(vaddr);
1098   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1099   int *ptr=(int *)(src+4);
1100   assert((*ptr&0x0fff0000)==0x059f0000);
1101   (void)ptr;
1102   ll_add(jump_out+page,vaddr,src);
1103   //int ptr=get_pointer(src);
1104   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1105 }
1106
1107 // If a code block was found to be unmodified (bit was set in
1108 // restore_candidate) and it remains unmodified (bit is clear
1109 // in invalid_code) then move the entries for that 4K page from
1110 // the dirty list to the clean list.
1111 void clean_blocks(u_int page)
1112 {
1113   struct ll_entry *head;
1114   inv_debug("INV: clean_blocks page=%d\n",page);
1115   head=jump_dirty[page];
1116   while(head!=NULL)
1117   {
1118     if(!invalid_code[head->vaddr>>12])
1119     {
1120       // Don't restore blocks which are about to expire from the cache
1121       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1122       {
1123         u_int start,end;
1124         if(verify_dirty(head->addr))
1125         {
1126           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1127           u_int i;
1128           u_int inv=0;
1129           get_bounds((int)head->addr,&start,&end);
1130           if(start-(u_int)rdram<RAM_SIZE)
1131           {
1132             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++)
1133             {
1134               inv|=invalid_code[i];
1135             }
1136           }
1137           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE)
1138           {
1139             inv=1;
1140           }
1141           if(!inv)
1142           {
1143             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1144             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1145             {
1146               u_int ppage=page;
1147               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1148               //printf("page=%x, addr=%x\n",page,head->vaddr);
1149               //assert(head->vaddr>>12==(page|0x80000));
1150               ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
1151               u_int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1152               if(ht_bin[0]==head->vaddr)
1153               {
1154                 ht_bin[1]=(u_int)clean_addr; // Replace existing entry
1155               }
1156               if(ht_bin[2]==head->vaddr)
1157               {
1158                 ht_bin[3]=(u_int)clean_addr; // Replace existing entry
1159               }
1160             }
1161           }
1162         }
1163       }
1164     }
1165     head=head->next;
1166   }
1167 }
1168
1169 static void mov_alloc(struct regstat *current,int i)
1170 {
1171   // Note: Don't need to actually alloc the source registers
1172   if((~current->is32>>rs1[i])&1)
1173   {
1174     //alloc_reg64(current,i,rs1[i]);
1175     alloc_reg64(current,i,rt1[i]);
1176     current->is32&=~(1LL<<rt1[i]);
1177   }
1178   else
1179   {
1180     //alloc_reg(current,i,rs1[i]);
1181     alloc_reg(current,i,rt1[i]);
1182     current->is32|=(1LL<<rt1[i]);
1183   }
1184   clear_const(current,rs1[i]);
1185   clear_const(current,rt1[i]);
1186   dirty_reg(current,rt1[i]);
1187 }
1188
1189 void shiftimm_alloc(struct regstat *current,int i)
1190 {
1191   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1192   {
1193     if(rt1[i]) {
1194       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1195       else lt1[i]=rs1[i];
1196       alloc_reg(current,i,rt1[i]);
1197       current->is32|=1LL<<rt1[i];
1198       dirty_reg(current,rt1[i]);
1199       if(is_const(current,rs1[i])) {
1200         int v=get_const(current,rs1[i]);
1201         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1202         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1203         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1204       }
1205       else clear_const(current,rt1[i]);
1206     }
1207   }
1208   else
1209   {
1210     clear_const(current,rs1[i]);
1211     clear_const(current,rt1[i]);
1212   }
1213
1214   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1215   {
1216     if(rt1[i]) {
1217       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1218       alloc_reg64(current,i,rt1[i]);
1219       current->is32&=~(1LL<<rt1[i]);
1220       dirty_reg(current,rt1[i]);
1221     }
1222   }
1223   if(opcode2[i]==0x3c) // DSLL32
1224   {
1225     if(rt1[i]) {
1226       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1227       alloc_reg64(current,i,rt1[i]);
1228       current->is32&=~(1LL<<rt1[i]);
1229       dirty_reg(current,rt1[i]);
1230     }
1231   }
1232   if(opcode2[i]==0x3e) // DSRL32
1233   {
1234     if(rt1[i]) {
1235       alloc_reg64(current,i,rs1[i]);
1236       if(imm[i]==32) {
1237         alloc_reg64(current,i,rt1[i]);
1238         current->is32&=~(1LL<<rt1[i]);
1239       } else {
1240         alloc_reg(current,i,rt1[i]);
1241         current->is32|=1LL<<rt1[i];
1242       }
1243       dirty_reg(current,rt1[i]);
1244     }
1245   }
1246   if(opcode2[i]==0x3f) // DSRA32
1247   {
1248     if(rt1[i]) {
1249       alloc_reg64(current,i,rs1[i]);
1250       alloc_reg(current,i,rt1[i]);
1251       current->is32|=1LL<<rt1[i];
1252       dirty_reg(current,rt1[i]);
1253     }
1254   }
1255 }
1256
1257 void shift_alloc(struct regstat *current,int i)
1258 {
1259   if(rt1[i]) {
1260     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1261     {
1262       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1263       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1264       alloc_reg(current,i,rt1[i]);
1265       if(rt1[i]==rs2[i]) {
1266         alloc_reg_temp(current,i,-1);
1267         minimum_free_regs[i]=1;
1268       }
1269       current->is32|=1LL<<rt1[i];
1270     } else { // DSLLV/DSRLV/DSRAV
1271       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1272       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1273       alloc_reg64(current,i,rt1[i]);
1274       current->is32&=~(1LL<<rt1[i]);
1275       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1276       {
1277         alloc_reg_temp(current,i,-1);
1278         minimum_free_regs[i]=1;
1279       }
1280     }
1281     clear_const(current,rs1[i]);
1282     clear_const(current,rs2[i]);
1283     clear_const(current,rt1[i]);
1284     dirty_reg(current,rt1[i]);
1285   }
1286 }
1287
1288 void alu_alloc(struct regstat *current,int i)
1289 {
1290   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1291     if(rt1[i]) {
1292       if(rs1[i]&&rs2[i]) {
1293         alloc_reg(current,i,rs1[i]);
1294         alloc_reg(current,i,rs2[i]);
1295       }
1296       else {
1297         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1298         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1299       }
1300       alloc_reg(current,i,rt1[i]);
1301     }
1302     current->is32|=1LL<<rt1[i];
1303   }
1304   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1305     if(rt1[i]) {
1306       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1307       {
1308         alloc_reg64(current,i,rs1[i]);
1309         alloc_reg64(current,i,rs2[i]);
1310         alloc_reg(current,i,rt1[i]);
1311       } else {
1312         alloc_reg(current,i,rs1[i]);
1313         alloc_reg(current,i,rs2[i]);
1314         alloc_reg(current,i,rt1[i]);
1315       }
1316     }
1317     current->is32|=1LL<<rt1[i];
1318   }
1319   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1320     if(rt1[i]) {
1321       if(rs1[i]&&rs2[i]) {
1322         alloc_reg(current,i,rs1[i]);
1323         alloc_reg(current,i,rs2[i]);
1324       }
1325       else
1326       {
1327         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1328         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1329       }
1330       alloc_reg(current,i,rt1[i]);
1331       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1332       {
1333         if(!((current->uu>>rt1[i])&1)) {
1334           alloc_reg64(current,i,rt1[i]);
1335         }
1336         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1337           if(rs1[i]&&rs2[i]) {
1338             alloc_reg64(current,i,rs1[i]);
1339             alloc_reg64(current,i,rs2[i]);
1340           }
1341           else
1342           {
1343             // Is it really worth it to keep 64-bit values in registers?
1344             #ifdef NATIVE_64BIT
1345             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1346             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1347             #endif
1348           }
1349         }
1350         current->is32&=~(1LL<<rt1[i]);
1351       } else {
1352         current->is32|=1LL<<rt1[i];
1353       }
1354     }
1355   }
1356   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1357     if(rt1[i]) {
1358       if(rs1[i]&&rs2[i]) {
1359         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1360           alloc_reg64(current,i,rs1[i]);
1361           alloc_reg64(current,i,rs2[i]);
1362           alloc_reg64(current,i,rt1[i]);
1363         } else {
1364           alloc_reg(current,i,rs1[i]);
1365           alloc_reg(current,i,rs2[i]);
1366           alloc_reg(current,i,rt1[i]);
1367         }
1368       }
1369       else {
1370         alloc_reg(current,i,rt1[i]);
1371         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1372           // DADD used as move, or zeroing
1373           // If we have a 64-bit source, then make the target 64 bits too
1374           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1375             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1376             alloc_reg64(current,i,rt1[i]);
1377           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1378             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1379             alloc_reg64(current,i,rt1[i]);
1380           }
1381           if(opcode2[i]>=0x2e&&rs2[i]) {
1382             // DSUB used as negation - 64-bit result
1383             // If we have a 32-bit register, extend it to 64 bits
1384             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1385             alloc_reg64(current,i,rt1[i]);
1386           }
1387         }
1388       }
1389       if(rs1[i]&&rs2[i]) {
1390         current->is32&=~(1LL<<rt1[i]);
1391       } else if(rs1[i]) {
1392         current->is32&=~(1LL<<rt1[i]);
1393         if((current->is32>>rs1[i])&1)
1394           current->is32|=1LL<<rt1[i];
1395       } else if(rs2[i]) {
1396         current->is32&=~(1LL<<rt1[i]);
1397         if((current->is32>>rs2[i])&1)
1398           current->is32|=1LL<<rt1[i];
1399       } else {
1400         current->is32|=1LL<<rt1[i];
1401       }
1402     }
1403   }
1404   clear_const(current,rs1[i]);
1405   clear_const(current,rs2[i]);
1406   clear_const(current,rt1[i]);
1407   dirty_reg(current,rt1[i]);
1408 }
1409
1410 void imm16_alloc(struct regstat *current,int i)
1411 {
1412   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1413   else lt1[i]=rs1[i];
1414   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1415   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1416     current->is32&=~(1LL<<rt1[i]);
1417     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1418       // TODO: Could preserve the 32-bit flag if the immediate is zero
1419       alloc_reg64(current,i,rt1[i]);
1420       alloc_reg64(current,i,rs1[i]);
1421     }
1422     clear_const(current,rs1[i]);
1423     clear_const(current,rt1[i]);
1424   }
1425   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1426     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1427     current->is32|=1LL<<rt1[i];
1428     clear_const(current,rs1[i]);
1429     clear_const(current,rt1[i]);
1430   }
1431   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1432     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1433       if(rs1[i]!=rt1[i]) {
1434         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1435         alloc_reg64(current,i,rt1[i]);
1436         current->is32&=~(1LL<<rt1[i]);
1437       }
1438     }
1439     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1440     if(is_const(current,rs1[i])) {
1441       int v=get_const(current,rs1[i]);
1442       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1443       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1444       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1445     }
1446     else clear_const(current,rt1[i]);
1447   }
1448   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1449     if(is_const(current,rs1[i])) {
1450       int v=get_const(current,rs1[i]);
1451       set_const(current,rt1[i],v+imm[i]);
1452     }
1453     else clear_const(current,rt1[i]);
1454     current->is32|=1LL<<rt1[i];
1455   }
1456   else {
1457     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1458     current->is32|=1LL<<rt1[i];
1459   }
1460   dirty_reg(current,rt1[i]);
1461 }
1462
1463 void load_alloc(struct regstat *current,int i)
1464 {
1465   clear_const(current,rt1[i]);
1466   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1467   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1468   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1469   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1470     alloc_reg(current,i,rt1[i]);
1471     assert(get_reg(current->regmap,rt1[i])>=0);
1472     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1473     {
1474       current->is32&=~(1LL<<rt1[i]);
1475       alloc_reg64(current,i,rt1[i]);
1476     }
1477     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1478     {
1479       current->is32&=~(1LL<<rt1[i]);
1480       alloc_reg64(current,i,rt1[i]);
1481       alloc_all(current,i);
1482       alloc_reg64(current,i,FTEMP);
1483       minimum_free_regs[i]=HOST_REGS;
1484     }
1485     else current->is32|=1LL<<rt1[i];
1486     dirty_reg(current,rt1[i]);
1487     // LWL/LWR need a temporary register for the old value
1488     if(opcode[i]==0x22||opcode[i]==0x26)
1489     {
1490       alloc_reg(current,i,FTEMP);
1491       alloc_reg_temp(current,i,-1);
1492       minimum_free_regs[i]=1;
1493     }
1494   }
1495   else
1496   {
1497     // Load to r0 or unneeded register (dummy load)
1498     // but we still need a register to calculate the address
1499     if(opcode[i]==0x22||opcode[i]==0x26)
1500     {
1501       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1502     }
1503     alloc_reg_temp(current,i,-1);
1504     minimum_free_regs[i]=1;
1505     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1506     {
1507       alloc_all(current,i);
1508       alloc_reg64(current,i,FTEMP);
1509       minimum_free_regs[i]=HOST_REGS;
1510     }
1511   }
1512 }
1513
1514 void store_alloc(struct regstat *current,int i)
1515 {
1516   clear_const(current,rs2[i]);
1517   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1518   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1519   alloc_reg(current,i,rs2[i]);
1520   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1521     alloc_reg64(current,i,rs2[i]);
1522     if(rs2[i]) alloc_reg(current,i,FTEMP);
1523   }
1524   #if defined(HOST_IMM8)
1525   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1526   else alloc_reg(current,i,INVCP);
1527   #endif
1528   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1529     alloc_reg(current,i,FTEMP);
1530   }
1531   // We need a temporary register for address generation
1532   alloc_reg_temp(current,i,-1);
1533   minimum_free_regs[i]=1;
1534 }
1535
1536 void c1ls_alloc(struct regstat *current,int i)
1537 {
1538   //clear_const(current,rs1[i]); // FIXME
1539   clear_const(current,rt1[i]);
1540   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1541   alloc_reg(current,i,CSREG); // Status
1542   alloc_reg(current,i,FTEMP);
1543   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1544     alloc_reg64(current,i,FTEMP);
1545   }
1546   #if defined(HOST_IMM8)
1547   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1548   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1549     alloc_reg(current,i,INVCP);
1550   #endif
1551   // We need a temporary register for address generation
1552   alloc_reg_temp(current,i,-1);
1553 }
1554
1555 void c2ls_alloc(struct regstat *current,int i)
1556 {
1557   clear_const(current,rt1[i]);
1558   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1559   alloc_reg(current,i,FTEMP);
1560   #if defined(HOST_IMM8)
1561   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1562   if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1563     alloc_reg(current,i,INVCP);
1564   #endif
1565   // We need a temporary register for address generation
1566   alloc_reg_temp(current,i,-1);
1567   minimum_free_regs[i]=1;
1568 }
1569
1570 #ifndef multdiv_alloc
1571 void multdiv_alloc(struct regstat *current,int i)
1572 {
1573   //  case 0x18: MULT
1574   //  case 0x19: MULTU
1575   //  case 0x1A: DIV
1576   //  case 0x1B: DIVU
1577   //  case 0x1C: DMULT
1578   //  case 0x1D: DMULTU
1579   //  case 0x1E: DDIV
1580   //  case 0x1F: DDIVU
1581   clear_const(current,rs1[i]);
1582   clear_const(current,rs2[i]);
1583   if(rs1[i]&&rs2[i])
1584   {
1585     if((opcode2[i]&4)==0) // 32-bit
1586     {
1587       current->u&=~(1LL<<HIREG);
1588       current->u&=~(1LL<<LOREG);
1589       alloc_reg(current,i,HIREG);
1590       alloc_reg(current,i,LOREG);
1591       alloc_reg(current,i,rs1[i]);
1592       alloc_reg(current,i,rs2[i]);
1593       current->is32|=1LL<<HIREG;
1594       current->is32|=1LL<<LOREG;
1595       dirty_reg(current,HIREG);
1596       dirty_reg(current,LOREG);
1597     }
1598     else // 64-bit
1599     {
1600       current->u&=~(1LL<<HIREG);
1601       current->u&=~(1LL<<LOREG);
1602       current->uu&=~(1LL<<HIREG);
1603       current->uu&=~(1LL<<LOREG);
1604       alloc_reg64(current,i,HIREG);
1605       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1606       alloc_reg64(current,i,rs1[i]);
1607       alloc_reg64(current,i,rs2[i]);
1608       alloc_all(current,i);
1609       current->is32&=~(1LL<<HIREG);
1610       current->is32&=~(1LL<<LOREG);
1611       dirty_reg(current,HIREG);
1612       dirty_reg(current,LOREG);
1613       minimum_free_regs[i]=HOST_REGS;
1614     }
1615   }
1616   else
1617   {
1618     // Multiply by zero is zero.
1619     // MIPS does not have a divide by zero exception.
1620     // The result is undefined, so we return zero.
1621     alloc_reg(current,i,HIREG);
1622     alloc_reg(current,i,LOREG);
1623     current->is32|=1LL<<HIREG;
1624     current->is32|=1LL<<LOREG;
1625     dirty_reg(current,HIREG);
1626     dirty_reg(current,LOREG);
1627   }
1628 }
1629 #endif
1630
1631 void cop0_alloc(struct regstat *current,int i)
1632 {
1633   if(opcode2[i]==0) // MFC0
1634   {
1635     if(rt1[i]) {
1636       clear_const(current,rt1[i]);
1637       alloc_all(current,i);
1638       alloc_reg(current,i,rt1[i]);
1639       current->is32|=1LL<<rt1[i];
1640       dirty_reg(current,rt1[i]);
1641     }
1642   }
1643   else if(opcode2[i]==4) // MTC0
1644   {
1645     if(rs1[i]){
1646       clear_const(current,rs1[i]);
1647       alloc_reg(current,i,rs1[i]);
1648       alloc_all(current,i);
1649     }
1650     else {
1651       alloc_all(current,i); // FIXME: Keep r0
1652       current->u&=~1LL;
1653       alloc_reg(current,i,0);
1654     }
1655   }
1656   else
1657   {
1658     // TLBR/TLBWI/TLBWR/TLBP/ERET
1659     assert(opcode2[i]==0x10);
1660     alloc_all(current,i);
1661   }
1662   minimum_free_regs[i]=HOST_REGS;
1663 }
1664
1665 void cop1_alloc(struct regstat *current,int i)
1666 {
1667   alloc_reg(current,i,CSREG); // Load status
1668   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1669   {
1670     if(rt1[i]){
1671       clear_const(current,rt1[i]);
1672       if(opcode2[i]==1) {
1673         alloc_reg64(current,i,rt1[i]); // DMFC1
1674         current->is32&=~(1LL<<rt1[i]);
1675       }else{
1676         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1677         current->is32|=1LL<<rt1[i];
1678       }
1679       dirty_reg(current,rt1[i]);
1680     }
1681     alloc_reg_temp(current,i,-1);
1682   }
1683   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1684   {
1685     if(rs1[i]){
1686       clear_const(current,rs1[i]);
1687       if(opcode2[i]==5)
1688         alloc_reg64(current,i,rs1[i]); // DMTC1
1689       else
1690         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1691       alloc_reg_temp(current,i,-1);
1692     }
1693     else {
1694       current->u&=~1LL;
1695       alloc_reg(current,i,0);
1696       alloc_reg_temp(current,i,-1);
1697     }
1698   }
1699   minimum_free_regs[i]=1;
1700 }
1701 void fconv_alloc(struct regstat *current,int i)
1702 {
1703   alloc_reg(current,i,CSREG); // Load status
1704   alloc_reg_temp(current,i,-1);
1705   minimum_free_regs[i]=1;
1706 }
1707 void float_alloc(struct regstat *current,int i)
1708 {
1709   alloc_reg(current,i,CSREG); // Load status
1710   alloc_reg_temp(current,i,-1);
1711   minimum_free_regs[i]=1;
1712 }
1713 void c2op_alloc(struct regstat *current,int i)
1714 {
1715   alloc_reg_temp(current,i,-1);
1716 }
1717 void fcomp_alloc(struct regstat *current,int i)
1718 {
1719   alloc_reg(current,i,CSREG); // Load status
1720   alloc_reg(current,i,FSREG); // Load flags
1721   dirty_reg(current,FSREG); // Flag will be modified
1722   alloc_reg_temp(current,i,-1);
1723   minimum_free_regs[i]=1;
1724 }
1725
1726 void syscall_alloc(struct regstat *current,int i)
1727 {
1728   alloc_cc(current,i);
1729   dirty_reg(current,CCREG);
1730   alloc_all(current,i);
1731   minimum_free_regs[i]=HOST_REGS;
1732   current->isconst=0;
1733 }
1734
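// Allocate registers for the instruction occupying a branch delay
// slot: dispatch on itype to the matching per-type allocator.  A jump
// in the delay slot is not supported; it only disables speculative
// precompilation and carries on.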
1735 void delayslot_alloc(struct regstat *current,int i)
1736 {
1737   switch(itype[i])
1738   {
1739     case UJUMP:
1740     case CJUMP:
1741     case SJUMP:
1742     case RJUMP:
1743     case FJUMP:
1744     case SYSCALL:
1745     case HLECALL:
1746     case SPAN:
1747       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1748       SysPrintf("Disabled speculative precompilation\n");
1749       stop_after_jal=1;
1750       break;
1751     case IMM16:
1752       imm16_alloc(current,i);
1753       break;
1754     case LOAD:
1755     case LOADLR:
1756       load_alloc(current,i);
1757       break;
1758     case STORE:
1759     case STORELR:
1760       store_alloc(current,i);
1761       break;
1762     case ALU:
1763       alu_alloc(current,i);
1764       break;
1765     case SHIFT:
1766       shift_alloc(current,i);
1767       break;
1768     case MULTDIV:
1769       multdiv_alloc(current,i);
1770       break;
1771     case SHIFTIMM:
1772       shiftimm_alloc(current,i);
1773       break;
1774     case MOV:
1775       mov_alloc(current,i);
1776       break;
1777     case COP0:
1778       cop0_alloc(current,i);
1779       break;
1780     case COP1:
1781     case COP2:
1782       cop1_alloc(current,i);
1783       break;
1784     case C1LS:
1785       c1ls_alloc(current,i);
1786       break;
1787     case C2LS:
1788       c2ls_alloc(current,i);
1789       break;
1790     case FCONV:
1791       fconv_alloc(current,i);
1792       break;
1793     case FLOAT:
1794       float_alloc(current,i);
1795       break;
1796     case FCOMP:
1797       fcomp_alloc(current,i);
1798       break;
1799     case C2OP:
1800       c2op_alloc(current,i);
1801       break;
1802   }
1803 }
1804
1805 // Special case where a branch and delay slot span two pages in virtual memory
1806 static void pagespan_alloc(struct regstat *current,int i)
1807 {
1808   current->isconst=0;
1809   current->wasconst=0;
1810   regs[i].wasconst=0;
1811   minimum_free_regs[i]=HOST_REGS;
1812   alloc_all(current,i);
1813   alloc_cc(current,i);
1814   dirty_reg(current,CCREG);
1815   if(opcode[i]==3) // JAL
1816   {
1817     alloc_reg(current,i,31);
1818     dirty_reg(current,31);
1819   }
1820   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1821   {
1822     alloc_reg(current,i,rs1[i]);
1823     if (rt1[i]!=0) {
1824       alloc_reg(current,i,rt1[i]);
1825       dirty_reg(current,rt1[i]);
1826     }
1827   }
1828   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1829   {
1830     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1831     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1832     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1833     {
1834       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1835       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1836     }
1837   }
1838   else
1839   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1840   {
1841     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1842     if(!((current->is32>>rs1[i])&1))
1843     {
1844       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1845     }
1846   }
1847   else
1848   if(opcode[i]==0x11) // BC1
1849   {
1850     alloc_reg(current,i,FSREG);
1851     alloc_reg(current,i,CSREG);
1852   }
1853   //else ...
1854 }
1855
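// Queue a stub to be generated after the main block: 'type' selects
// the handler, 'addr' is the branch to patch, 'retaddr' the location
// to resume at, and a..e are handler-specific arguments.  Typical use,
// taken from the load assembler below:
//   add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);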
1856 static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1857 {
1858   stubs[stubcount][0]=type;
1859   stubs[stubcount][1]=addr;
1860   stubs[stubcount][2]=retaddr;
1861   stubs[stubcount][3]=a;
1862   stubs[stubcount][4]=b;
1863   stubs[stubcount][5]=c;
1864   stubs[stubcount][6]=d;
1865   stubs[stubcount][7]=e;
1866   stubcount++;
1867 }
1868
1869 // Write out a single register
1870 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1871 {
1872   int hr;
1873   for(hr=0;hr<HOST_REGS;hr++) {
1874     if(hr!=EXCLUDE_REG) {
1875       if((regmap[hr]&63)==r) {
1876         if((dirty>>hr)&1) {
1877           if(regmap[hr]<64) {
1878             emit_storereg(r,hr);
1879           }else{
1880             emit_storereg(r|64,hr);
1881           }
1882         }
1883       }
1884     }
1885   }
1886 }
1887
1888 #if 0
1889 static int mchecksum(void)
1890 {
1891   //if(!tracedebug) return 0;
1892   int i;
1893   int sum=0;
1894   for(i=0;i<2097152;i++) {
1895     unsigned int temp=sum;
1896     sum<<=1;
1897     sum|=(~temp)>>31;
1898     sum^=((u_int *)rdram)[i];
1899   }
1900   return sum;
1901 }
1902
1903 static int rchecksum(void)
1904 {
1905   int i;
1906   int sum=0;
1907   for(i=0;i<64;i++)
1908     sum^=((u_int *)reg)[i];
1909   return sum;
1910 }
1911
1912 static void rlist(void)
1913 {
1914   int i;
1915   printf("TRACE: ");
1916   for(i=0;i<32;i++)
1917     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1918   printf("\n");
1919 }
1920
1921 static void enabletrace(void)
1922 {
1923   tracedebug=1;
1924 }
1925
1926 static void memdebug(int i)
1927 {
1928   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
1929   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
1930   //rlist();
1931   //if(tracedebug) {
1932   //if(Count>=-2084597794) {
1933   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
1934   //if(0) {
1935     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
1936     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
1937     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
1938     rlist();
1939     #ifdef __i386__
1940     printf("TRACE: %x\n",(&i)[-1]);
1941     #endif
1942     #ifdef __arm__
1943     int j;
1944     printf("TRACE: %x \n",(&j)[10]);
1945     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
1946     #endif
1947     //fflush(stdout);
1948   }
1949   //printf("TRACE: %x\n",(&i)[-1]);
1950 }
1951 #endif
1952
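// Emit native code for the R-type ALU group: ADD/ADDU/SUB/SUBU, the
// doubleword DADD/DADDU/DSUB/DSUBU, SLT/SLTU and AND/OR/XOR/NOR.
// Operands that are r0 are special-cased (zeroing, negation or a plain
// move) so nothing is computed from a value known to be zero.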
1953 void alu_assemble(int i,struct regstat *i_regs)
1954 {
1955   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1956     if(rt1[i]) {
1957       signed char s1,s2,t;
1958       t=get_reg(i_regs->regmap,rt1[i]);
1959       if(t>=0) {
1960         s1=get_reg(i_regs->regmap,rs1[i]);
1961         s2=get_reg(i_regs->regmap,rs2[i]);
1962         if(rs1[i]&&rs2[i]) {
1963           assert(s1>=0);
1964           assert(s2>=0);
1965           if(opcode2[i]&2) emit_sub(s1,s2,t);
1966           else emit_add(s1,s2,t);
1967         }
1968         else if(rs1[i]) {
1969           if(s1>=0) emit_mov(s1,t);
1970           else emit_loadreg(rs1[i],t);
1971         }
1972         else if(rs2[i]) {
1973           if(s2>=0) {
1974             if(opcode2[i]&2) emit_neg(s2,t);
1975             else emit_mov(s2,t);
1976           }
1977           else {
1978             emit_loadreg(rs2[i],t);
1979             if(opcode2[i]&2) emit_neg(t,t);
1980           }
1981         }
1982         else emit_zeroreg(t);
1983       }
1984     }
1985   }
1986   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1987     if(rt1[i]) {
1988       signed char s1l,s2l,s1h,s2h,tl,th;
1989       tl=get_reg(i_regs->regmap,rt1[i]);
1990       th=get_reg(i_regs->regmap,rt1[i]|64);
1991       if(tl>=0) {
1992         s1l=get_reg(i_regs->regmap,rs1[i]);
1993         s2l=get_reg(i_regs->regmap,rs2[i]);
1994         s1h=get_reg(i_regs->regmap,rs1[i]|64);
1995         s2h=get_reg(i_regs->regmap,rs2[i]|64);
1996         if(rs1[i]&&rs2[i]) {
1997           assert(s1l>=0);
1998           assert(s2l>=0);
1999           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2000           else emit_adds(s1l,s2l,tl);
2001           if(th>=0) {
2002             #ifdef INVERTED_CARRY
2003             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2004             #else
2005             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2006             #endif
2007             else emit_add(s1h,s2h,th);
2008           }
2009         }
2010         else if(rs1[i]) {
2011           if(s1l>=0) emit_mov(s1l,tl);
2012           else emit_loadreg(rs1[i],tl);
2013           if(th>=0) {
2014             if(s1h>=0) emit_mov(s1h,th);
2015             else emit_loadreg(rs1[i]|64,th);
2016           }
2017         }
2018         else if(rs2[i]) {
2019           if(s2l>=0) {
2020             if(opcode2[i]&2) emit_negs(s2l,tl);
2021             else emit_mov(s2l,tl);
2022           }
2023           else {
2024             emit_loadreg(rs2[i],tl);
2025             if(opcode2[i]&2) emit_negs(tl,tl);
2026           }
2027           if(th>=0) {
2028             #ifdef INVERTED_CARRY
2029             if(s2h>=0) emit_mov(s2h,th);
2030             else emit_loadreg(rs2[i]|64,th);
2031             if(opcode2[i]&2) {
2032               emit_adcimm(-1,th); // x86 has inverted carry flag
2033               emit_not(th,th);
2034             }
2035             #else
2036             if(opcode2[i]&2) {
2037               if(s2h>=0) emit_rscimm(s2h,0,th);
2038               else {
2039                 emit_loadreg(rs2[i]|64,th);
2040                 emit_rscimm(th,0,th);
2041               }
2042             }else{
2043               if(s2h>=0) emit_mov(s2h,th);
2044               else emit_loadreg(rs2[i]|64,th);
2045             }
2046             #endif
2047           }
2048         }
2049         else {
2050           emit_zeroreg(tl);
2051           if(th>=0) emit_zeroreg(th);
2052         }
2053       }
2054     }
2055   }
2056   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2057     if(rt1[i]) {
2058       signed char s1l,s1h,s2l,s2h,t;
2059       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2060       {
2061         t=get_reg(i_regs->regmap,rt1[i]);
2062         //assert(t>=0);
2063         if(t>=0) {
2064           s1l=get_reg(i_regs->regmap,rs1[i]);
2065           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2066           s2l=get_reg(i_regs->regmap,rs2[i]);
2067           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2068           if(rs2[i]==0) // rx<r0
2069           {
2070             assert(s1h>=0);
2071             if(opcode2[i]==0x2a) // SLT
2072               emit_shrimm(s1h,31,t);
2073             else // SLTU (unsigned cannot be less than zero)
2074               emit_zeroreg(t);
2075           }
2076           else if(rs1[i]==0) // r0<rx
2077           {
2078             assert(s2h>=0);
2079             if(opcode2[i]==0x2a) // SLT
2080               emit_set_gz64_32(s2h,s2l,t);
2081             else // SLTU (set if not zero)
2082               emit_set_nz64_32(s2h,s2l,t);
2083           }
2084           else {
2085             assert(s1l>=0);assert(s1h>=0);
2086             assert(s2l>=0);assert(s2h>=0);
2087             if(opcode2[i]==0x2a) // SLT
2088               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2089             else // SLTU
2090               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2091           }
2092         }
2093       } else {
2094         t=get_reg(i_regs->regmap,rt1[i]);
2095         //assert(t>=0);
2096         if(t>=0) {
2097           s1l=get_reg(i_regs->regmap,rs1[i]);
2098           s2l=get_reg(i_regs->regmap,rs2[i]);
2099           if(rs2[i]==0) // rx<r0
2100           {
2101             assert(s1l>=0);
2102             if(opcode2[i]==0x2a) // SLT
2103               emit_shrimm(s1l,31,t);
2104             else // SLTU (unsigned cannot be less than zero)
2105               emit_zeroreg(t);
2106           }
2107           else if(rs1[i]==0) // r0<rx
2108           {
2109             assert(s2l>=0);
2110             if(opcode2[i]==0x2a) // SLT
2111               emit_set_gz32(s2l,t);
2112             else // SLTU (set if not zero)
2113               emit_set_nz32(s2l,t);
2114           }
2115           else{
2116             assert(s1l>=0);assert(s2l>=0);
2117             if(opcode2[i]==0x2a) // SLT
2118               emit_set_if_less32(s1l,s2l,t);
2119             else // SLTU
2120               emit_set_if_carry32(s1l,s2l,t);
2121           }
2122         }
2123       }
2124     }
2125   }
2126   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2127     if(rt1[i]) {
2128       signed char s1l,s1h,s2l,s2h,th,tl;
2129       tl=get_reg(i_regs->regmap,rt1[i]);
2130       th=get_reg(i_regs->regmap,rt1[i]|64);
2131       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2132       {
2133         assert(tl>=0);
2134         if(tl>=0) {
2135           s1l=get_reg(i_regs->regmap,rs1[i]);
2136           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2137           s2l=get_reg(i_regs->regmap,rs2[i]);
2138           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2139           if(rs1[i]&&rs2[i]) {
2140             assert(s1l>=0);assert(s1h>=0);
2141             assert(s2l>=0);assert(s2h>=0);
2142             if(opcode2[i]==0x24) { // AND
2143               emit_and(s1l,s2l,tl);
2144               emit_and(s1h,s2h,th);
2145             } else
2146             if(opcode2[i]==0x25) { // OR
2147               emit_or(s1l,s2l,tl);
2148               emit_or(s1h,s2h,th);
2149             } else
2150             if(opcode2[i]==0x26) { // XOR
2151               emit_xor(s1l,s2l,tl);
2152               emit_xor(s1h,s2h,th);
2153             } else
2154             if(opcode2[i]==0x27) { // NOR
2155               emit_or(s1l,s2l,tl);
2156               emit_or(s1h,s2h,th);
2157               emit_not(tl,tl);
2158               emit_not(th,th);
2159             }
2160           }
2161           else
2162           {
2163             if(opcode2[i]==0x24) { // AND
2164               emit_zeroreg(tl);
2165               emit_zeroreg(th);
2166             } else
2167             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2168               if(rs1[i]){
2169                 if(s1l>=0) emit_mov(s1l,tl);
2170                 else emit_loadreg(rs1[i],tl);
2171                 if(s1h>=0) emit_mov(s1h,th);
2172                 else emit_loadreg(rs1[i]|64,th);
2173               }
2174               else
2175               if(rs2[i]){
2176                 if(s2l>=0) emit_mov(s2l,tl);
2177                 else emit_loadreg(rs2[i],tl);
2178                 if(s2h>=0) emit_mov(s2h,th);
2179                 else emit_loadreg(rs2[i]|64,th);
2180               }
2181               else{
2182                 emit_zeroreg(tl);
2183                 emit_zeroreg(th);
2184               }
2185             } else
2186             if(opcode2[i]==0x27) { // NOR
2187               if(rs1[i]){
2188                 if(s1l>=0) emit_not(s1l,tl);
2189                 else{
2190                   emit_loadreg(rs1[i],tl);
2191                   emit_not(tl,tl);
2192                 }
2193                 if(s1h>=0) emit_not(s1h,th);
2194                 else{
2195                   emit_loadreg(rs1[i]|64,th);
2196                   emit_not(th,th);
2197                 }
2198               }
2199               else
2200               if(rs2[i]){
2201                 if(s2l>=0) emit_not(s2l,tl);
2202                 else{
2203                   emit_loadreg(rs2[i],tl);
2204                   emit_not(tl,tl);
2205                 }
2206                 if(s2h>=0) emit_not(s2h,th);
2207                 else{
2208                   emit_loadreg(rs2[i]|64,th);
2209                   emit_not(th,th);
2210                 }
2211               }
2212               else {
2213                 emit_movimm(-1,tl);
2214                 emit_movimm(-1,th);
2215               }
2216             }
2217           }
2218         }
2219       }
2220       else
2221       {
2222         // 32 bit
2223         if(tl>=0) {
2224           s1l=get_reg(i_regs->regmap,rs1[i]);
2225           s2l=get_reg(i_regs->regmap,rs2[i]);
2226           if(rs1[i]&&rs2[i]) {
2227             assert(s1l>=0);
2228             assert(s2l>=0);
2229             if(opcode2[i]==0x24) { // AND
2230               emit_and(s1l,s2l,tl);
2231             } else
2232             if(opcode2[i]==0x25) { // OR
2233               emit_or(s1l,s2l,tl);
2234             } else
2235             if(opcode2[i]==0x26) { // XOR
2236               emit_xor(s1l,s2l,tl);
2237             } else
2238             if(opcode2[i]==0x27) { // NOR
2239               emit_or(s1l,s2l,tl);
2240               emit_not(tl,tl);
2241             }
2242           }
2243           else
2244           {
2245             if(opcode2[i]==0x24) { // AND
2246               emit_zeroreg(tl);
2247             } else
2248             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2249               if(rs1[i]){
2250                 if(s1l>=0) emit_mov(s1l,tl);
2251                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2252               }
2253               else
2254               if(rs2[i]){
2255                 if(s2l>=0) emit_mov(s2l,tl);
2256                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2257               }
2258               else emit_zeroreg(tl);
2259             } else
2260             if(opcode2[i]==0x27) { // NOR
2261               if(rs1[i]){
2262                 if(s1l>=0) emit_not(s1l,tl);
2263                 else {
2264                   emit_loadreg(rs1[i],tl);
2265                   emit_not(tl,tl);
2266                 }
2267               }
2268               else
2269               if(rs2[i]){
2270                 if(s2l>=0) emit_not(s2l,tl);
2271                 else {
2272                   emit_loadreg(rs2[i],tl);
2273                   emit_not(tl,tl);
2274                 }
2275               }
2276               else emit_movimm(-1,tl);
2277             }
2278           }
2279         }
2280       }
2281     }
2282   }
2283 }
2284
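// Emit code for the immediate group: LUI, ADDI/ADDIU, DADDI/DADDIU,
// SLTI/SLTIU and ANDI/ORI/XORI.  When the source register held a known
// constant (wasconst), the result is folded at compile time and loaded
// with emit_movimm() instead of being computed at run time.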
2285 void imm16_assemble(int i,struct regstat *i_regs)
2286 {
2287   if (opcode[i]==0x0f) { // LUI
2288     if(rt1[i]) {
2289       signed char t;
2290       t=get_reg(i_regs->regmap,rt1[i]);
2291       //assert(t>=0);
2292       if(t>=0) {
2293         if(!((i_regs->isconst>>t)&1))
2294           emit_movimm(imm[i]<<16,t);
2295       }
2296     }
2297   }
2298   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2299     if(rt1[i]) {
2300       signed char s,t;
2301       t=get_reg(i_regs->regmap,rt1[i]);
2302       s=get_reg(i_regs->regmap,rs1[i]);
2303       if(rs1[i]) {
2304         //assert(t>=0);
2305         //assert(s>=0);
2306         if(t>=0) {
2307           if(!((i_regs->isconst>>t)&1)) {
2308             if(s<0) {
2309               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2310               emit_addimm(t,imm[i],t);
2311             }else{
2312               if(!((i_regs->wasconst>>s)&1))
2313                 emit_addimm(s,imm[i],t);
2314               else
2315                 emit_movimm(constmap[i][s]+imm[i],t);
2316             }
2317           }
2318         }
2319       } else {
2320         if(t>=0) {
2321           if(!((i_regs->isconst>>t)&1))
2322             emit_movimm(imm[i],t);
2323         }
2324       }
2325     }
2326   }
2327   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2328     if(rt1[i]) {
2329       signed char sh,sl,th,tl;
2330       th=get_reg(i_regs->regmap,rt1[i]|64);
2331       tl=get_reg(i_regs->regmap,rt1[i]);
2332       sh=get_reg(i_regs->regmap,rs1[i]|64);
2333       sl=get_reg(i_regs->regmap,rs1[i]);
2334       if(tl>=0) {
2335         if(rs1[i]) {
2336           assert(sh>=0);
2337           assert(sl>=0);
2338           if(th>=0) {
2339             emit_addimm64_32(sh,sl,imm[i],th,tl);
2340           }
2341           else {
2342             emit_addimm(sl,imm[i],tl);
2343           }
2344         } else {
2345           emit_movimm(imm[i],tl);
2346           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2347         }
2348       }
2349     }
2350   }
2351   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2352     if(rt1[i]) {
2353       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2354       signed char sh,sl,t;
2355       t=get_reg(i_regs->regmap,rt1[i]);
2356       sh=get_reg(i_regs->regmap,rs1[i]|64);
2357       sl=get_reg(i_regs->regmap,rs1[i]);
2358       //assert(t>=0);
2359       if(t>=0) {
2360         if(rs1[i]>0) {
2361           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2362           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2363             if(opcode[i]==0x0a) { // SLTI
2364               if(sl<0) {
2365                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2366                 emit_slti32(t,imm[i],t);
2367               }else{
2368                 emit_slti32(sl,imm[i],t);
2369               }
2370             }
2371             else { // SLTIU
2372               if(sl<0) {
2373                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2374                 emit_sltiu32(t,imm[i],t);
2375               }else{
2376                 emit_sltiu32(sl,imm[i],t);
2377               }
2378             }
2379           }else{ // 64-bit
2380             assert(sl>=0);
2381             if(opcode[i]==0x0a) // SLTI
2382               emit_slti64_32(sh,sl,imm[i],t);
2383             else // SLTIU
2384               emit_sltiu64_32(sh,sl,imm[i],t);
2385           }
2386         }else{
2387           // SLTI(U) with r0 is pointless,
2388           // nonetheless examples of it can be found
2389           if(opcode[i]==0x0a) // SLTI
2390             if(0<imm[i]) emit_movimm(1,t);
2391             else emit_zeroreg(t);
2392           else // SLTIU
2393           {
2394             if(imm[i]) emit_movimm(1,t);
2395             else emit_zeroreg(t);
2396           }
2397         }
2398       }
2399     }
2400   }
2401   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2402     if(rt1[i]) {
2403       signed char sh,sl,th,tl;
2404       th=get_reg(i_regs->regmap,rt1[i]|64);
2405       tl=get_reg(i_regs->regmap,rt1[i]);
2406       sh=get_reg(i_regs->regmap,rs1[i]|64);
2407       sl=get_reg(i_regs->regmap,rs1[i]);
2408       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2409         if(opcode[i]==0x0c) //ANDI
2410         {
2411           if(rs1[i]) {
2412             if(sl<0) {
2413               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2414               emit_andimm(tl,imm[i],tl);
2415             }else{
2416               if(!((i_regs->wasconst>>sl)&1))
2417                 emit_andimm(sl,imm[i],tl);
2418               else
2419                 emit_movimm(constmap[i][sl]&imm[i],tl);
2420             }
2421           }
2422           else
2423             emit_zeroreg(tl);
2424           if(th>=0) emit_zeroreg(th);
2425         }
2426         else
2427         {
2428           if(rs1[i]) {
2429             if(sl<0) {
2430               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2431             }
2432             if(th>=0) {
2433               if(sh<0) {
2434                 emit_loadreg(rs1[i]|64,th);
2435               }else{
2436                 emit_mov(sh,th);
2437               }
2438             }
2439             if(opcode[i]==0x0d) { // ORI
2440               if(sl<0) {
2441                 emit_orimm(tl,imm[i],tl);
2442               }else{
2443                 if(!((i_regs->wasconst>>sl)&1))
2444                   emit_orimm(sl,imm[i],tl);
2445                 else
2446                   emit_movimm(constmap[i][sl]|imm[i],tl);
2447               }
2448             }
2449             if(opcode[i]==0x0e) { // XORI
2450               if(sl<0) {
2451                 emit_xorimm(tl,imm[i],tl);
2452               }else{
2453                 if(!((i_regs->wasconst>>sl)&1))
2454                   emit_xorimm(sl,imm[i],tl);
2455                 else
2456                   emit_movimm(constmap[i][sl]^imm[i],tl);
2457               }
2458             }
2459           }
2460           else {
2461             emit_movimm(imm[i],tl);
2462             if(th>=0) emit_zeroreg(th);
2463           }
2464         }
2465       }
2466     }
2467   }
2468 }
2469
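// Emit code for shift-by-immediate instructions: SLL/SRL/SRA plus the
// doubleword DSLL/DSRL/DSRA forms and their *32 variants.  A shift
// amount of zero degenerates to a simple register move.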
2470 void shiftimm_assemble(int i,struct regstat *i_regs)
2471 {
2472   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2473   {
2474     if(rt1[i]) {
2475       signed char s,t;
2476       t=get_reg(i_regs->regmap,rt1[i]);
2477       s=get_reg(i_regs->regmap,rs1[i]);
2478       //assert(t>=0);
2479       if(t>=0&&!((i_regs->isconst>>t)&1)){
2480         if(rs1[i]==0)
2481         {
2482           emit_zeroreg(t);
2483         }
2484         else
2485         {
2486           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2487           if(imm[i]) {
2488             if(opcode2[i]==0) // SLL
2489             {
2490               emit_shlimm(s<0?t:s,imm[i],t);
2491             }
2492             if(opcode2[i]==2) // SRL
2493             {
2494               emit_shrimm(s<0?t:s,imm[i],t);
2495             }
2496             if(opcode2[i]==3) // SRA
2497             {
2498               emit_sarimm(s<0?t:s,imm[i],t);
2499             }
2500           }else{
2501             // Shift by zero
2502             if(s>=0 && s!=t) emit_mov(s,t);
2503           }
2504         }
2505       }
2506       //emit_storereg(rt1[i],t); //DEBUG
2507     }
2508   }
2509   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2510   {
2511     if(rt1[i]) {
2512       signed char sh,sl,th,tl;
2513       th=get_reg(i_regs->regmap,rt1[i]|64);
2514       tl=get_reg(i_regs->regmap,rt1[i]);
2515       sh=get_reg(i_regs->regmap,rs1[i]|64);
2516       sl=get_reg(i_regs->regmap,rs1[i]);
2517       if(tl>=0) {
2518         if(rs1[i]==0)
2519         {
2520           emit_zeroreg(tl);
2521           if(th>=0) emit_zeroreg(th);
2522         }
2523         else
2524         {
2525           assert(sl>=0);
2526           assert(sh>=0);
2527           if(imm[i]) {
2528             if(opcode2[i]==0x38) // DSLL
2529             {
2530               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2531               emit_shlimm(sl,imm[i],tl);
2532             }
2533             if(opcode2[i]==0x3a) // DSRL
2534             {
2535               emit_shrdimm(sl,sh,imm[i],tl);
2536               if(th>=0) emit_shrimm(sh,imm[i],th);
2537             }
2538             if(opcode2[i]==0x3b) // DSRA
2539             {
2540               emit_shrdimm(sl,sh,imm[i],tl);
2541               if(th>=0) emit_sarimm(sh,imm[i],th);
2542             }
2543           }else{
2544             // Shift by zero
2545             if(sl!=tl) emit_mov(sl,tl);
2546             if(th>=0&&sh!=th) emit_mov(sh,th);
2547           }
2548         }
2549       }
2550     }
2551   }
2552   if(opcode2[i]==0x3c) // DSLL32
2553   {
2554     if(rt1[i]) {
2555       signed char sl,tl,th;
2556       tl=get_reg(i_regs->regmap,rt1[i]);
2557       th=get_reg(i_regs->regmap,rt1[i]|64);
2558       sl=get_reg(i_regs->regmap,rs1[i]);
2559       if(th>=0||tl>=0){
2560         assert(tl>=0);
2561         assert(th>=0);
2562         assert(sl>=0);
2563         emit_mov(sl,th);
2564         emit_zeroreg(tl);
2565         if(imm[i]>32)
2566         {
2567           emit_shlimm(th,imm[i]&31,th);
2568         }
2569       }
2570     }
2571   }
2572   if(opcode2[i]==0x3e) // DSRL32
2573   {
2574     if(rt1[i]) {
2575       signed char sh,tl,th;
2576       tl=get_reg(i_regs->regmap,rt1[i]);
2577       th=get_reg(i_regs->regmap,rt1[i]|64);
2578       sh=get_reg(i_regs->regmap,rs1[i]|64);
2579       if(tl>=0){
2580         assert(sh>=0);
2581         emit_mov(sh,tl);
2582         if(th>=0) emit_zeroreg(th);
2583         if(imm[i]>32)
2584         {
2585           emit_shrimm(tl,imm[i]&31,tl);
2586         }
2587       }
2588     }
2589   }
2590   if(opcode2[i]==0x3f) // DSRA32
2591   {
2592     if(rt1[i]) {
2593       signed char sh,tl;
2594       tl=get_reg(i_regs->regmap,rt1[i]);
2595       sh=get_reg(i_regs->regmap,rs1[i]|64);
2596       if(tl>=0){
2597         assert(sh>=0);
2598         emit_mov(sh,tl);
2599         if(imm[i]>32)
2600         {
2601           emit_sarimm(tl,imm[i]&31,tl);
2602         }
2603       }
2604     }
2605   }
2606 }
2607
2608 #ifndef shift_assemble
2609 void shift_assemble(int i,struct regstat *i_regs)
2610 {
2611   printf("Need shift_assemble for this architecture.\n");
2612   exit(1);
2613 }
2614 #endif
2615
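// Emit code for aligned loads (LB/LH/LW/LBU/LHU/LWU/LD).  The common
// case is a fast RAM access guarded by emit_fastpath_cmp_jump(); the
// slow case branches to a stub registered with add_stub(), and a load
// from a constant address known to be outside RAM is emitted inline
// through inline_readstub().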
2616 void load_assemble(int i,struct regstat *i_regs)
2617 {
2618   int s,th,tl,addr,map=-1;
2619   int offset;
2620   int jaddr=0;
2621   int memtarget=0,c=0;
2622   int fastload_reg_override=0;
2623   u_int hr,reglist=0;
2624   th=get_reg(i_regs->regmap,rt1[i]|64);
2625   tl=get_reg(i_regs->regmap,rt1[i]);
2626   s=get_reg(i_regs->regmap,rs1[i]);
2627   offset=imm[i];
2628   for(hr=0;hr<HOST_REGS;hr++) {
2629     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2630   }
2631   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2632   if(s>=0) {
2633     c=(i_regs->wasconst>>s)&1;
2634     if (c) {
2635       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2636     }
2637   }
2638   //printf("load_assemble: c=%d\n",c);
2639   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2640   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2641   if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
2642     ||rt1[i]==0) {
2643       // could be a FIFO/I-O port, so the read must still be performed
2644       // (or a dummy read, when rt is r0)
2645       assem_debug("(forced read)\n");
2646       tl=get_reg(i_regs->regmap,-1);
2647       assert(tl>=0);
2648   }
2649   if(offset||s<0||c) addr=tl;
2650   else addr=s;
2651   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2652  if(tl>=0) {
2653   //printf("load_assemble: c=%d\n",c);
2654   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2655   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2656   reglist&=~(1<<tl);
2657   if(th>=0) reglist&=~(1<<th);
2658   if(!c) {
2659     #ifdef RAM_OFFSET
2660     map=get_reg(i_regs->regmap,ROREG);
2661     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2662     #endif
2663     #ifdef R29_HACK
2664     // Strmnnrmn's speed hack
2665     if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2666     #endif
2667     {
2668       jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
2669     }
2670   }
2671   else if(ram_offset&&memtarget) {
2672     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2673     fastload_reg_override=HOST_TEMPREG;
2674   }
2675   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2676   if (opcode[i]==0x20) { // LB
2677     if(!c||memtarget) {
2678       if(!dummy) {
2679         #ifdef HOST_IMM_ADDR32
2680         if(c)
2681           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2682         else
2683         #endif
2684         {
2685           //emit_xorimm(addr,3,tl);
2686           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2687           int x=0,a=tl;
2688 #ifdef BIG_ENDIAN_MIPS
2689           if(!c) emit_xorimm(addr,3,tl);
2690           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2691 #else
2692           if(!c) a=addr;
2693 #endif
2694           if(fastload_reg_override) a=fastload_reg_override;
2695
2696           emit_movsbl_indexed_tlb(x,a,map,tl);
2697         }
2698       }
2699       if(jaddr)
2700         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2701     }
2702     else
2703       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2704   }
2705   if (opcode[i]==0x21) { // LH
2706     if(!c||memtarget) {
2707       if(!dummy) {
2708         #ifdef HOST_IMM_ADDR32
2709         if(c)
2710           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2711         else
2712         #endif
2713         {
2714           int x=0,a=tl;
2715 #ifdef BIG_ENDIAN_MIPS
2716           if(!c) emit_xorimm(addr,2,tl);
2717           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2718 #else
2719           if(!c) a=addr;
2720 #endif
2721           if(fastload_reg_override) a=fastload_reg_override;
2722           //#ifdef
2723           //emit_movswl_indexed_tlb(x,tl,map,tl);
2724           //else
2725           if(map>=0) {
2726             emit_movswl_indexed(x,a,tl);
2727           }else{
2728             #if 1 //def RAM_OFFSET
2729             emit_movswl_indexed(x,a,tl);
2730             #else
2731             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2732             #endif
2733           }
2734         }
2735       }
2736       if(jaddr)
2737         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2738     }
2739     else
2740       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2741   }
2742   if (opcode[i]==0x23) { // LW
2743     if(!c||memtarget) {
2744       if(!dummy) {
2745         int a=addr;
2746         if(fastload_reg_override) a=fastload_reg_override;
2747         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2748         #ifdef HOST_IMM_ADDR32
2749         if(c)
2750           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2751         else
2752         #endif
2753         emit_readword_indexed_tlb(0,a,map,tl);
2754       }
2755       if(jaddr)
2756         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2757     }
2758     else
2759       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2760   }
2761   if (opcode[i]==0x24) { // LBU
2762     if(!c||memtarget) {
2763       if(!dummy) {
2764         #ifdef HOST_IMM_ADDR32
2765         if(c)
2766           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2767         else
2768         #endif
2769         {
2770           //emit_xorimm(addr,3,tl);
2771           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2772           int x=0,a=tl;
2773 #ifdef BIG_ENDIAN_MIPS
2774           if(!c) emit_xorimm(addr,3,tl);
2775           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2776 #else
2777           if(!c) a=addr;
2778 #endif
2779           if(fastload_reg_override) a=fastload_reg_override;
2780
2781           emit_movzbl_indexed_tlb(x,a,map,tl);
2782         }
2783       }
2784       if(jaddr)
2785         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2786     }
2787     else
2788       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2789   }
2790   if (opcode[i]==0x25) { // LHU
2791     if(!c||memtarget) {
2792       if(!dummy) {
2793         #ifdef HOST_IMM_ADDR32
2794         if(c)
2795           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2796         else
2797         #endif
2798         {
2799           int x=0,a=tl;
2800 #ifdef BIG_ENDIAN_MIPS
2801           if(!c) emit_xorimm(addr,2,tl);
2802           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2803 #else
2804           if(!c) a=addr;
2805 #endif
2806           if(fastload_reg_override) a=fastload_reg_override;
2807           //#ifdef
2808           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2809           //#else
2810           if(map>=0) {
2811             emit_movzwl_indexed(x,a,tl);
2812           }else{
2813             #if 1 //def RAM_OFFSET
2814             emit_movzwl_indexed(x,a,tl);
2815             #else
2816             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
2817             #endif
2818           }
2819         }
2820       }
2821       if(jaddr)
2822         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2823     }
2824     else
2825       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2826   }
2827   if (opcode[i]==0x27) { // LWU
2828     assert(th>=0);
2829     if(!c||memtarget) {
2830       if(!dummy) {
2831         int a=addr;
2832         if(fastload_reg_override) a=fastload_reg_override;
2833         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2834         #ifdef HOST_IMM_ADDR32
2835         if(c)
2836           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2837         else
2838         #endif
2839         emit_readword_indexed_tlb(0,a,map,tl);
2840       }
2841       if(jaddr)
2842         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2843     }
2844     else {
2845       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2846     }
2847     emit_zeroreg(th);
2848   }
2849   if (opcode[i]==0x37) { // LD
2850     if(!c||memtarget) {
2851       if(!dummy) {
2852         int a=addr;
2853         if(fastload_reg_override) a=fastload_reg_override;
2854         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
2855         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
2856         #ifdef HOST_IMM_ADDR32
2857         if(c)
2858           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
2859         else
2860         #endif
2861         emit_readdword_indexed_tlb(0,a,map,th,tl);
2862       }
2863       if(jaddr)
2864         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2865     }
2866     else
2867       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2868   }
2869  }
2870   //emit_storereg(rt1[i],tl); // DEBUG
2871   //if(opcode[i]==0x23)
2872   //if(opcode[i]==0x24)
2873   //if(opcode[i]==0x23||opcode[i]==0x24)
2874   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
2875   {
2876     //emit_pusha();
2877     save_regs(0x100f);
2878         emit_readword((int)&last_count,ECX);
2879         #ifdef __i386__
2880         if(get_reg(i_regs->regmap,CCREG)<0)
2881           emit_loadreg(CCREG,HOST_CCREG);
2882         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2883         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2884         emit_writeword(HOST_CCREG,(int)&Count);
2885         #endif
2886         #ifdef __arm__
2887         if(get_reg(i_regs->regmap,CCREG)<0)
2888           emit_loadreg(CCREG,0);
2889         else
2890           emit_mov(HOST_CCREG,0);
2891         emit_add(0,ECX,0);
2892         emit_addimm(0,2*ccadj[i],0);
2893         emit_writeword(0,(int)&Count);
2894         #endif
2895     emit_call((int)memdebug);
2896     //emit_popa();
2897     restore_regs(0x100f);
2898   }*/
2899 }
2900
2901 #ifndef loadlr_assemble
2902 void loadlr_assemble(int i,struct regstat *i_regs)
2903 {
2904   printf("Need loadlr_assemble for this architecture.\n");
2905   exit(1);
2906 }
2907 #endif
2908
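// Emit code for aligned stores (SB/SH/SW/SD).  The structure mirrors
// load_assemble: fast path for RAM, stub otherwise.  Afterwards the
// invalid_code check is emitted (unless NDHACK_NO_SMC_CHECK is set) so
// that a store into an already-compiled block invalidates the
// corresponding translation.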
2909 void store_assemble(int i,struct regstat *i_regs)
2910 {
2911   int s,th,tl,map=-1;
2912   int addr,temp;
2913   int offset;
2914   int jaddr=0,type;
2915   int memtarget=0,c=0;
2916   int agr=AGEN1+(i&1);
2917   int faststore_reg_override=0;
2918   u_int hr,reglist=0;
2919   th=get_reg(i_regs->regmap,rs2[i]|64);
2920   tl=get_reg(i_regs->regmap,rs2[i]);
2921   s=get_reg(i_regs->regmap,rs1[i]);
2922   temp=get_reg(i_regs->regmap,agr);
2923   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2924   offset=imm[i];
2925   if(s>=0) {
2926     c=(i_regs->wasconst>>s)&1;
2927     if(c) {
2928       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2929     }
2930   }
2931   assert(tl>=0);
2932   assert(temp>=0);
2933   for(hr=0;hr<HOST_REGS;hr++) {
2934     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2935   }
2936   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2937   if(offset||s<0||c) addr=temp;
2938   else addr=s;
2939   if(!c) {
2940     jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
2941   }
2942   else if(ram_offset&&memtarget) {
2943     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2944     faststore_reg_override=HOST_TEMPREG;
2945   }
2946
2947   if (opcode[i]==0x28) { // SB
2948     if(!c||memtarget) {
2949       int x=0,a=temp;
2950 #ifdef BIG_ENDIAN_MIPS
2951       if(!c) emit_xorimm(addr,3,temp);
2952       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2953 #else
2954       if(!c) a=addr;
2955 #endif
2956       if(faststore_reg_override) a=faststore_reg_override;
2957       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
2958       emit_writebyte_indexed_tlb(tl,x,a,map,a);
2959     }
2960     type=STOREB_STUB;
2961   }
2962   if (opcode[i]==0x29) { // SH
2963     if(!c||memtarget) {
2964       int x=0,a=temp;
2965 #ifdef BIG_ENDIAN_MIPS
2966       if(!c) emit_xorimm(addr,2,temp);
2967       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2968 #else
2969       if(!c) a=addr;
2970 #endif
2971       if(faststore_reg_override) a=faststore_reg_override;
2972       //#ifdef
2973       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
2974       //#else
2975       if(map>=0) {
2976         emit_writehword_indexed(tl,x,a);
2977       }else
2978         //emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
2979         emit_writehword_indexed(tl,x,a);
2980     }
2981     type=STOREH_STUB;
2982   }
2983   if (opcode[i]==0x2B) { // SW
2984     if(!c||memtarget) {
2985       int a=addr;
2986       if(faststore_reg_override) a=faststore_reg_override;
2987       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
2988       emit_writeword_indexed_tlb(tl,0,a,map,temp);
2989     }
2990     type=STOREW_STUB;
2991   }
2992   if (opcode[i]==0x3F) { // SD
2993     if(!c||memtarget) {
2994       int a=addr;
2995       if(faststore_reg_override) a=faststore_reg_override;
2996       if(rs2[i]) {
2997         assert(th>=0);
2998         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
2999         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3000         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
3001       }else{
3002         // Store zero
3003         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3004         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3005         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
3006       }
3007     }
3008     type=STORED_STUB;
3009   }
3010   if(jaddr) {
3011     // PCSX store handlers don't check invcode again
3012     reglist|=1<<addr;
3013     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3014     jaddr=0;
3015   }
3016   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3017     if(!c||memtarget) {
3018       #ifdef DESTRUCTIVE_SHIFT
3019       // The x86 shift operation is 'destructive'; it overwrites the
3020       // source register, so we need to make a copy first and use that.
3021       addr=temp;
3022       #endif
3023       #if defined(HOST_IMM8)
3024       int ir=get_reg(i_regs->regmap,INVCP);
3025       assert(ir>=0);
3026       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3027       #else
3028       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3029       #endif
3030       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3031       emit_callne(invalidate_addr_reg[addr]);
3032       #else
3033       int jaddr2=(int)out;
3034       emit_jne(0);
3035       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3036       #endif
3037     }
3038   }
3039   u_int addr_val=c?constmap[i][s]+offset:0; // constmap[i][s] is only valid for constant addresses
3040   if(jaddr) {
3041     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3042   } else if(c&&!memtarget) {
3043     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
3044   }
3045   // Basic detection of writes that modify the current block.
3046   // We don't look backwards, as earlier code should already be in the MIPS i-cache.
3047   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
3048     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
3049     assert(i_regs->regmap==regs[i].regmap); // not delay slot
3050     if(i_regs->regmap==regs[i].regmap) {
3051       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
3052       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
3053       emit_movimm(start+i*4+4,0);
3054       emit_writeword(0,(int)&pcaddr);
3055       emit_jmp((int)do_interrupt);
3056     }
3057   }
3058   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3059   //if(opcode[i]==0x2B || opcode[i]==0x28)
3060   //if(opcode[i]==0x2B || opcode[i]==0x29)
3061   //if(opcode[i]==0x2B)
3062   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3063   {
3064     #ifdef __i386__
3065     emit_pusha();
3066     #endif
3067     #ifdef __arm__
3068     save_regs(0x100f);
3069     #endif
3070         emit_readword((int)&last_count,ECX);
3071         #ifdef __i386__
3072         if(get_reg(i_regs->regmap,CCREG)<0)
3073           emit_loadreg(CCREG,HOST_CCREG);
3074         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3075         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3076         emit_writeword(HOST_CCREG,(int)&Count);
3077         #endif
3078         #ifdef __arm__
3079         if(get_reg(i_regs->regmap,CCREG)<0)
3080           emit_loadreg(CCREG,0);
3081         else
3082           emit_mov(HOST_CCREG,0);
3083         emit_add(0,ECX,0);
3084         emit_addimm(0,2*ccadj[i],0);
3085         emit_writeword(0,(int)&Count);
3086         #endif
3087     emit_call((int)memdebug);
3088     #ifdef __i386__
3089     emit_popa();
3090     #endif
3091     #ifdef __arm__
3092     restore_regs(0x100f);
3093     #endif
3094   }*/
3095 }
3096
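// Emit code for the unaligned stores SWL/SWR/SDL/SDR.  The low two
// address bits (byte-swapped first on little-endian hosts) are tested
// at run time and one of four alignment cases is executed; SDL/SDR may
// additionally write the neighbouring word afterwards.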
3097 void storelr_assemble(int i,struct regstat *i_regs)
3098 {
3099   int s,th,tl;
3100   int temp;
3101   int temp2=-1;
3102   int offset;
3103   int jaddr=0;
3104   int case1,case2,case3;
3105   int done0,done1,done2;
3106   int memtarget=0,c=0;
3107   int agr=AGEN1+(i&1);
3108   u_int hr,reglist=0;
3109   th=get_reg(i_regs->regmap,rs2[i]|64);
3110   tl=get_reg(i_regs->regmap,rs2[i]);
3111   s=get_reg(i_regs->regmap,rs1[i]);
3112   temp=get_reg(i_regs->regmap,agr);
3113   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3114   offset=imm[i];
3115   if(s>=0) {
3116     c=(i_regs->isconst>>s)&1;
3117     if(c) {
3118       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3119     }
3120   }
3121   assert(tl>=0);
3122   for(hr=0;hr<HOST_REGS;hr++) {
3123     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3124   }
3125   assert(temp>=0);
3126   if(!c) {
3127     emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3128     if(!offset&&s!=temp) emit_mov(s,temp);
3129     jaddr=(int)out;
3130     emit_jno(0);
3131   }
3132   else
3133   {
3134     if(!memtarget||!rs1[i]) {
3135       jaddr=(int)out;
3136       emit_jmp(0);
3137     }
3138   }
3139   #ifdef RAM_OFFSET
3140   int map=get_reg(i_regs->regmap,ROREG);
3141   if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3142   #else
3143   if((u_int)rdram!=0x80000000)
3144     emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3145   #endif
3146
3147   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3148     temp2=get_reg(i_regs->regmap,FTEMP);
3149     if(!rs2[i]) temp2=th=tl;
3150   }
3151
3152 #ifndef BIG_ENDIAN_MIPS
3153     emit_xorimm(temp,3,temp);
3154 #endif
3155   emit_testimm(temp,2);
3156   case2=(int)out;
3157   emit_jne(0);
3158   emit_testimm(temp,1);
3159   case1=(int)out;
3160   emit_jne(0);
3161   // 0
3162   if (opcode[i]==0x2A) { // SWL
3163     emit_writeword_indexed(tl,0,temp);
3164   }
3165   if (opcode[i]==0x2E) { // SWR
3166     emit_writebyte_indexed(tl,3,temp);
3167   }
3168   if (opcode[i]==0x2C) { // SDL
3169     emit_writeword_indexed(th,0,temp);
3170     if(rs2[i]) emit_mov(tl,temp2);
3171   }
3172   if (opcode[i]==0x2D) { // SDR
3173     emit_writebyte_indexed(tl,3,temp);
3174     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3175   }
3176   done0=(int)out;
3177   emit_jmp(0);
3178   // 1
3179   set_jump_target(case1,(int)out);
3180   if (opcode[i]==0x2A) { // SWL
3181     // Write 3 msb into three least significant bytes
3182     if(rs2[i]) emit_rorimm(tl,8,tl);
3183     emit_writehword_indexed(tl,-1,temp);
3184     if(rs2[i]) emit_rorimm(tl,16,tl);
3185     emit_writebyte_indexed(tl,1,temp);
3186     if(rs2[i]) emit_rorimm(tl,8,tl);
3187   }
3188   if (opcode[i]==0x2E) { // SWR
3189     // Write two lsb into two most significant bytes
3190     emit_writehword_indexed(tl,1,temp);
3191   }
3192   if (opcode[i]==0x2C) { // SDL
3193     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3194     // Write 3 msb into three least significant bytes
3195     if(rs2[i]) emit_rorimm(th,8,th);
3196     emit_writehword_indexed(th,-1,temp);
3197     if(rs2[i]) emit_rorimm(th,16,th);
3198     emit_writebyte_indexed(th,1,temp);
3199     if(rs2[i]) emit_rorimm(th,8,th);
3200   }
3201   if (opcode[i]==0x2D) { // SDR
3202     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3203     // Write two lsb into two most significant bytes
3204     emit_writehword_indexed(tl,1,temp);
3205   }
3206   done1=(int)out;
3207   emit_jmp(0);
3208   // 2
3209   set_jump_target(case2,(int)out);
3210   emit_testimm(temp,1);
3211   case3=(int)out;
3212   emit_jne(0);
3213   if (opcode[i]==0x2A) { // SWL
3214     // Write two msb into two least significant bytes
3215     if(rs2[i]) emit_rorimm(tl,16,tl);
3216     emit_writehword_indexed(tl,-2,temp);
3217     if(rs2[i]) emit_rorimm(tl,16,tl);
3218   }
3219   if (opcode[i]==0x2E) { // SWR
3220     // Write 3 lsb into three most significant bytes
3221     emit_writebyte_indexed(tl,-1,temp);
3222     if(rs2[i]) emit_rorimm(tl,8,tl);
3223     emit_writehword_indexed(tl,0,temp);
3224     if(rs2[i]) emit_rorimm(tl,24,tl);
3225   }
3226   if (opcode[i]==0x2C) { // SDL
3227     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3228     // Write two msb into two least significant bytes
3229     if(rs2[i]) emit_rorimm(th,16,th);
3230     emit_writehword_indexed(th,-2,temp);
3231     if(rs2[i]) emit_rorimm(th,16,th);
3232   }
3233   if (opcode[i]==0x2D) { // SDR
3234     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3235     // Write 3 lsb into three most significant bytes
3236     emit_writebyte_indexed(tl,-1,temp);
3237     if(rs2[i]) emit_rorimm(tl,8,tl);
3238     emit_writehword_indexed(tl,0,temp);
3239     if(rs2[i]) emit_rorimm(tl,24,tl);
3240   }
3241   done2=(int)out;
3242   emit_jmp(0);
3243   // 3
3244   set_jump_target(case3,(int)out);
3245   if (opcode[i]==0x2A) { // SWL
3246     // Write msb into least significant byte
3247     if(rs2[i]) emit_rorimm(tl,24,tl);
3248     emit_writebyte_indexed(tl,-3,temp);
3249     if(rs2[i]) emit_rorimm(tl,8,tl);
3250   }
3251   if (opcode[i]==0x2E) { // SWR
3252     // Write entire word
3253     emit_writeword_indexed(tl,-3,temp);
3254   }
3255   if (opcode[i]==0x2C) { // SDL
3256     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3257     // Write msb into least significant byte
3258     if(rs2[i]) emit_rorimm(th,24,th);
3259     emit_writebyte_indexed(th,-3,temp);
3260     if(rs2[i]) emit_rorimm(th,8,th);
3261   }
3262   if (opcode[i]==0x2D) { // SDR
3263     if(rs2[i]) emit_mov(th,temp2);
3264     // Write entire word
3265     emit_writeword_indexed(tl,-3,temp);
3266   }
3267   set_jump_target(done0,(int)out);
3268   set_jump_target(done1,(int)out);
3269   set_jump_target(done2,(int)out);
3270   if (opcode[i]==0x2C) { // SDL
3271     emit_testimm(temp,4);
3272     done0=(int)out;
3273     emit_jne(0);
3274     emit_andimm(temp,~3,temp);
3275     emit_writeword_indexed(temp2,4,temp);
3276     set_jump_target(done0,(int)out);
3277   }
3278   if (opcode[i]==0x2D) { // SDR
3279     emit_testimm(temp,4);
3280     done0=(int)out;
3281     emit_jeq(0);
3282     emit_andimm(temp,~3,temp);
3283     emit_writeword_indexed(temp2,-4,temp);
3284     set_jump_target(done0,(int)out);
3285   }
3286   if(!c||!memtarget)
3287     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3288   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3289     #ifdef RAM_OFFSET
3290     int map=get_reg(i_regs->regmap,ROREG);
3291     if(map<0) map=HOST_TEMPREG;
3292     gen_orig_addr_w(temp,map);
3293     #else
3294     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3295     #endif
3296     #if defined(HOST_IMM8)
3297     int ir=get_reg(i_regs->regmap,INVCP);
3298     assert(ir>=0);
3299     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3300     #else
3301     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3302     #endif
3303     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3304     emit_callne(invalidate_addr_reg[temp]);
3305     #else
3306     int jaddr2=(int)out;
3307     emit_jne(0);
3308     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3309     #endif
3310   }
3311   /*
3312     emit_pusha();
3313     //save_regs(0x100f);
3314         emit_readword((int)&last_count,ECX);
3315         if(get_reg(i_regs->regmap,CCREG)<0)
3316           emit_loadreg(CCREG,HOST_CCREG);
3317         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3318         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3319         emit_writeword(HOST_CCREG,(int)&Count);
3320     emit_call((int)memdebug);
3321     emit_popa();
3322     //restore_regs(0x100f);
3323   */
3324 }
3325
3326 void c1ls_assemble(int i,struct regstat *i_regs)
3327 {
3328   cop1_unusable(i, i_regs);
3329 }
3330
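// Emit code for GTE memory access (LWC2/SWC2).  The COP2 data register
// is transferred to/from FTEMP with cop2_get_dreg()/cop2_put_dreg(),
// and the memory access itself uses the same fast-path/stub scheme as
// an ordinary word load or store.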
3331 void c2ls_assemble(int i,struct regstat *i_regs)
3332 {
3333   int s,tl;
3334   int ar;
3335   int offset;
3336   int memtarget=0,c=0;
3337   int jaddr2=0,type;
3338   int agr=AGEN1+(i&1);
3339   int fastio_reg_override=0;
3340   u_int hr,reglist=0;
3341   u_int copr=(source[i]>>16)&0x1f;
3342   s=get_reg(i_regs->regmap,rs1[i]);
3343   tl=get_reg(i_regs->regmap,FTEMP);
3344   offset=imm[i];
3345   assert(rs1[i]>0);
3346   assert(tl>=0);
3347
3348   for(hr=0;hr<HOST_REGS;hr++) {
3349     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3350   }
3351   if(i_regs->regmap[HOST_CCREG]==CCREG)
3352     reglist&=~(1<<HOST_CCREG);
3353
3354   // get the address
3355   if (opcode[i]==0x3a) { // SWC2
3356     ar=get_reg(i_regs->regmap,agr);
3357     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3358     reglist|=1<<ar;
3359   } else { // LWC2
3360     ar=tl;
3361   }
3362   if(s>=0) c=(i_regs->wasconst>>s)&1;
3363   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3364   if (!offset&&!c&&s>=0) ar=s;
3365   assert(ar>=0);
3366
3367   if (opcode[i]==0x3a) { // SWC2
3368     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3369     type=STOREW_STUB;
3370   }
3371   else
3372     type=LOADW_STUB;
3373
3374   if(c&&!memtarget) {
3375     jaddr2=(int)out;
3376     emit_jmp(0); // inline_readstub/inline_writestub?
3377   }
3378   else {
3379     if(!c) {
3380       jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
3381     }
3382     else if(ram_offset&&memtarget) {
3383       emit_addimm(ar,ram_offset,HOST_TEMPREG);
3384       fastio_reg_override=HOST_TEMPREG;
3385     }
3386     if (opcode[i]==0x32) { // LWC2
3387       int a=ar; // declared before the #ifdef so the 'else' branch below remains valid C
3388       if(fastio_reg_override) a=fastio_reg_override;
3389       #ifdef HOST_IMM_ADDR32
3390       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3391       else
3392       #endif
3393       emit_readword_indexed(0,a,tl);
3394     }
3395     if (opcode[i]==0x3a) { // SWC2
3396       #ifdef DESTRUCTIVE_SHIFT
3397       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3398       #endif
3399       int a=ar;
3400       if(fastio_reg_override) a=fastio_reg_override;
3401       emit_writeword_indexed(tl,0,a);
3402     }
3403   }
3404   if(jaddr2)
3405     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3406   if(opcode[i]==0x3a) // SWC2
3407   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3408 #if defined(HOST_IMM8)
3409     int ir=get_reg(i_regs->regmap,INVCP);
3410     assert(ir>=0);
3411     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3412 #else
3413     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3414 #endif
3415     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3416     emit_callne(invalidate_addr_reg[ar]);
3417     #else
3418     int jaddr3=(int)out;
3419     emit_jne(0);
3420     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3421     #endif
3422   }
3423   if (opcode[i]==0x32) { // LWC2
3424     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3425   }
3426 }
3427
3428 #ifndef multdiv_assemble
3429 void multdiv_assemble(int i,struct regstat *i_regs)
3430 {
3431   printf("Need multdiv_assemble for this architecture.\n");
3432   exit(1);
3433 }
3434 #endif
3435
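// MFHI/MFLO/MTHI/MTLO are assembled as plain register-to-register
// moves between the mapped MIPS registers, copying the upper half too
// when the value is tracked as 64-bit.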
3436 void mov_assemble(int i,struct regstat *i_regs)
3437 {
3438   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3439   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3440   if(rt1[i]) {
3441     signed char sh,sl,th,tl;
3442     th=get_reg(i_regs->regmap,rt1[i]|64);
3443     tl=get_reg(i_regs->regmap,rt1[i]);
3444     //assert(tl>=0);
3445     if(tl>=0) {
3446       sh=get_reg(i_regs->regmap,rs1[i]|64);
3447       sl=get_reg(i_regs->regmap,rs1[i]);
3448       if(sl>=0) emit_mov(sl,tl);
3449       else emit_loadreg(rs1[i],tl);
3450       if(th>=0) {
3451         if(sh>=0) emit_mov(sh,th);
3452         else emit_loadreg(rs1[i]|64,th);
3453       }
3454     }
3455   }
3456 }
3457
3458 #ifndef fconv_assemble
3459 void fconv_assemble(int i,struct regstat *i_regs)
3460 {
3461   printf("Need fconv_assemble for this architecture.\n");
3462   exit(1);
3463 }
3464 #endif
3465
3466 #if 0
3467 void float_assemble(int i,struct regstat *i_regs)
3468 {
3469   printf("Need float_assemble for this architecture.\n");
3470   exit(1);
3471 }
3472 #endif
3473
3474 void syscall_assemble(int i,struct regstat *i_regs)
3475 {
3476   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3477   assert(ccreg==HOST_CCREG);
3478   assert(!is_delayslot);
3479   (void)ccreg;
3480   emit_movimm(start+i*4,EAX); // Get PC
3481   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3482   emit_jmp((int)jump_syscall_hle); // XXX
3483 }
3484
3485 void hlecall_assemble(int i,struct regstat *i_regs)
3486 {
3487   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3488   assert(ccreg==HOST_CCREG);
3489   assert(!is_delayslot);
3490   (void)ccreg;
3491   emit_movimm(start+i*4+4,0); // Get PC
3492   uint32_t hleCode = source[i] & 0x03ffffff;
3493   if (hleCode >= (sizeof(psxHLEt) / sizeof(psxHLEt[0])))
3494     emit_movimm((int)psxNULL,1);
3495   else
3496     emit_movimm((int)psxHLEt[hleCode],1);
3497   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
3498   emit_jmp((int)jump_hlecall);
3499 }
3500
3501 void intcall_assemble(int i,struct regstat *i_regs)
3502 {
3503   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3504   assert(ccreg==HOST_CCREG);
3505   assert(!is_delayslot);
3506   (void)ccreg;
3507   emit_movimm(start+i*4,0); // Get PC
3508   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
3509   emit_jmp((int)jump_intcall);
3510 }
3511
3512 void ds_assemble(int i,struct regstat *i_regs)
3513 {
3514   speculate_register_values(i);
3515   is_delayslot=1;
3516   switch(itype[i]) {
3517     case ALU:
3518       alu_assemble(i,i_regs);break;
3519     case IMM16:
3520       imm16_assemble(i,i_regs);break;
3521     case SHIFT:
3522       shift_assemble(i,i_regs);break;
3523     case SHIFTIMM:
3524       shiftimm_assemble(i,i_regs);break;
3525     case LOAD:
3526       load_assemble(i,i_regs);break;
3527     case LOADLR:
3528       loadlr_assemble(i,i_regs);break;
3529     case STORE:
3530       store_assemble(i,i_regs);break;
3531     case STORELR:
3532       storelr_assemble(i,i_regs);break;
3533     case COP0:
3534       cop0_assemble(i,i_regs);break;
3535     case COP1:
3536       cop1_assemble(i,i_regs);break;
3537     case C1LS:
3538       c1ls_assemble(i,i_regs);break;
3539     case COP2:
3540       cop2_assemble(i,i_regs);break;
3541     case C2LS:
3542       c2ls_assemble(i,i_regs);break;
3543     case C2OP:
3544       c2op_assemble(i,i_regs);break;
3545     case FCONV:
3546       fconv_assemble(i,i_regs);break;
3547     case FLOAT:
3548       float_assemble(i,i_regs);break;
3549     case FCOMP:
3550       fcomp_assemble(i,i_regs);break;
3551     case MULTDIV:
3552       multdiv_assemble(i,i_regs);break;
3553     case MOV:
3554       mov_assemble(i,i_regs);break;
3555     case SYSCALL:
3556     case HLECALL:
3557     case INTCALL:
3558     case SPAN:
3559     case UJUMP:
3560     case RJUMP:
3561     case CJUMP:
3562     case SJUMP:
3563     case FJUMP:
3564       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
3565   }
3566   is_delayslot=0;
3567 }
3568
3569 // Is the branch target a valid internal jump?
3570 int internal_branch(uint64_t i_is32,int addr)
3571 {
3572   if(addr&1) return 0; // Indirect (register) jump
3573   if(addr>=start && addr<start+slen*4-4)
3574   {
3575     //int t=(addr-start)>>2;
3576     // Delay slots are not valid branch targets
3577     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3578     // 64 -> 32 bit transition requires a recompile
3579     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3580     {
3581       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3582       else printf("optimizable: yes\n");
3583     }*/
3584     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3585     return 1;
3586   }
3587   return 0;
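/* Illustration for internal_branch() (example values are assumed): with
 * start=0x80010000 and slen=100, a branch to 0x80010040 is "internal" and can
 * be assembled as a direct jump within the current block, while a branch to
 * 0x80020000, or a register jump (marked by bit 0 of addr), is external and
 * has to go through the block linker / jump_vaddr path instead. */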
3588 }
3589
3590 #ifndef wb_invalidate
3591 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3592   uint64_t u,uint64_t uu)
3593 {
3594   int hr;
3595   for(hr=0;hr<HOST_REGS;hr++) {
3596     if(hr!=EXCLUDE_REG) {
3597       if(pre[hr]!=entry[hr]) {
3598         if(pre[hr]>=0) {
3599           if((dirty>>hr)&1) {
3600             if(get_reg(entry,pre[hr])<0) {
3601               if(pre[hr]<64) {
3602                 if(!((u>>pre[hr])&1)) {
3603                   emit_storereg(pre[hr],hr);
3604                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3605                     emit_sarimm(hr,31,hr);
3606                     emit_storereg(pre[hr]|64,hr);
3607                   }
3608                 }
3609               }else{
3610                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3611                   emit_storereg(pre[hr],hr);
3612                 }
3613               }
3614             }
3615           }
3616         }
3617       }
3618     }
3619   }
3620   // Move from one register to another (no writeback)
3621   for(hr=0;hr<HOST_REGS;hr++) {
3622     if(hr!=EXCLUDE_REG) {
3623       if(pre[hr]!=entry[hr]) {
3624         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3625           int nr;
3626           if((nr=get_reg(entry,pre[hr]))>=0) {
3627             emit_mov(hr,nr);
3628           }
3629         }
3630       }
3631     }
3632   }
3633 }
3634 #endif
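/* wb_invalidate() reconciles two register maps: 'pre' (what each host reg
 * held before this point) and 'entry' (what the following code expects).
 * Roughly, under assumed mappings: if a host reg held a MIPS reg dirty in
 * 'pre' but 'entry' no longer maps that MIPS reg anywhere, the value is
 * written back with emit_storereg() unless the liveness masks (u/uu) say it
 * is dead; if 'entry' wants the same MIPS reg in a different host reg, the
 * second loop moves it with emit_mov() instead of reloading it from memory. */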
3635
3636 // Load the specified registers
3637 // This only loads the registers given as arguments because
3638 // we don't want to load things that will be overwritten
3639 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3640 {
3641   int hr;
3642   // Load 32-bit regs
3643   for(hr=0;hr<HOST_REGS;hr++) {
3644     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3645       if(entry[hr]!=regmap[hr]) {
3646         if(regmap[hr]==rs1||regmap[hr]==rs2)
3647         {
3648           if(regmap[hr]==0) {
3649             emit_zeroreg(hr);
3650           }
3651           else
3652           {
3653             emit_loadreg(regmap[hr],hr);
3654           }
3655         }
3656       }
3657     }
3658   }
3659   // Load 64-bit regs
3660   for(hr=0;hr<HOST_REGS;hr++) {
3661     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3662       if(entry[hr]!=regmap[hr]) {
3663         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3664         {
3665           assert(regmap[hr]!=64);
3666           if((is32>>(regmap[hr]&63))&1) {
3667             int lr=get_reg(regmap,regmap[hr]-64);
3668             if(lr>=0)
3669               emit_sarimm(lr,31,hr);
3670             else
3671               emit_loadreg(regmap[hr],hr);
3672           }
3673           else
3674           {
3675             emit_loadreg(regmap[hr],hr);
3676           }
3677         }
3678       }
3679     }
3680   }
3681 }
3682
3683 // Load registers prior to the start of a loop
3684 // so that they are not loaded within the loop
3685 static void loop_preload(signed char pre[],signed char entry[])
3686 {
3687   int hr;
3688   for(hr=0;hr<HOST_REGS;hr++) {
3689     if(hr!=EXCLUDE_REG) {
3690       if(pre[hr]!=entry[hr]) {
3691         if(entry[hr]>=0) {
3692           if(get_reg(pre,entry[hr])<0) {
3693             assem_debug("loop preload:\n");
3694             //printf("loop preload: %d\n",hr);
3695             if(entry[hr]==0) {
3696               emit_zeroreg(hr);
3697             }
3698             else if(entry[hr]<TEMPREG)
3699             {
3700               emit_loadreg(entry[hr],hr);
3701             }
3702             else if(entry[hr]-64<TEMPREG)
3703             {
3704               emit_loadreg(entry[hr],hr);
3705             }
3706           }
3707         }
3708       }
3709     }
3710   }
3711 }
3712
3713 // Generate the effective address for a load/store instruction
3714 // The address goes to AGEN for writes, and to FTEMP for LOADLR and cop1/cop2 loads
3715 void address_generation(int i,struct regstat *i_regs,signed char entry[])
3716 {
3717   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
3718     int ra=-1;
3719     int agr=AGEN1+(i&1);
3720     if(itype[i]==LOAD) {
3721       ra=get_reg(i_regs->regmap,rt1[i]);
3722       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3723       assert(ra>=0);
3724     }
3725     if(itype[i]==LOADLR) {
3726       ra=get_reg(i_regs->regmap,FTEMP);
3727     }
3728     if(itype[i]==STORE||itype[i]==STORELR) {
3729       ra=get_reg(i_regs->regmap,agr);
3730       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3731     }
3732     if(itype[i]==C1LS||itype[i]==C2LS) {
3733       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
3734         ra=get_reg(i_regs->regmap,FTEMP);
3735       else { // SWC1/SDC1/SWC2/SDC2
3736         ra=get_reg(i_regs->regmap,agr);
3737         if(ra<0) ra=get_reg(i_regs->regmap,-1);
3738       }
3739     }
3740     int rs=get_reg(i_regs->regmap,rs1[i]);
3741     if(ra>=0) {
3742       int offset=imm[i];
3743       int c=(i_regs->wasconst>>rs)&1;
3744       if(rs1[i]==0) {
3745         // Using r0 as a base address
3746         if(!entry||entry[ra]!=agr) {
3747           if (opcode[i]==0x22||opcode[i]==0x26) {
3748             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3749           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3750             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3751           }else{
3752             emit_movimm(offset,ra);
3753           }
3754         } // else did it in the previous cycle
3755       }
3756       else if(rs<0) {
3757         if(!entry||entry[ra]!=rs1[i])
3758           emit_loadreg(rs1[i],ra);
3759         //if(!entry||entry[ra]!=rs1[i])
3760         //  printf("poor load scheduling!\n");
3761       }
3762       else if(c) {
3763         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
3764           if(!entry||entry[ra]!=agr) {
3765             if (opcode[i]==0x22||opcode[i]==0x26) {
3766               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3767             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3768               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3769             }else{
3770               #ifdef HOST_IMM_ADDR32
3771               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3772               #endif
3773               emit_movimm(constmap[i][rs]+offset,ra);
3774               regs[i].loadedconst|=1<<ra;
3775             }
3776           } // else did it in the previous cycle
3777         } // else load_consts already did it
3778       }
3779       if(offset&&!c&&rs1[i]) {
3780         if(rs>=0) {
3781           emit_addimm(rs,offset,ra);
3782         }else{
3783           emit_addimm(ra,offset,ra);
3784         }
3785       }
3786     }
3787   }
3788   // Preload constants for next instruction
3789   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
3790     int agr,ra;
3791     // Actual address
3792     agr=AGEN1+((i+1)&1);
3793     ra=get_reg(i_regs->regmap,agr);
3794     if(ra>=0) {
3795       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3796       int offset=imm[i+1];
3797       int c=(regs[i+1].wasconst>>rs)&1;
3798       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
3799         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3800           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3801         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3802           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3803         }else{
3804           #ifdef HOST_IMM_ADDR32
3805           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3806           #endif
3807           emit_movimm(constmap[i+1][rs]+offset,ra);
3808           regs[i+1].loadedconst|=1<<ra;
3809         }
3810       }
3811       else if(rs1[i+1]==0) {
3812         // Using r0 as a base address
3813         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3814           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3815         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3816           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3817         }else{
3818           emit_movimm(offset,ra);
3819         }
3820       }
3821     }
3822   }
3823 }
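/* Example of what address_generation() emits (register assignments assumed):
 * for "lw $t0, 16($s0)" with $s0 not a known constant, the base+offset sum is
 * formed ahead of the actual access, roughly: */
#if 0
  // rs = host reg holding $s0; ra = the address reg chosen above
  // (the destination reg for plain loads, AGEN for stores, FTEMP otherwise)
  emit_addimm(rs,16,ra);  // ra = $s0 + 16, consumed later by load_assemble()
#endif
/* When the base is a known constant the address is materialized with
 * emit_movimm() instead (masked for LWL/LWR and LDL/LDR), and the second half
 * of the function runs the same logic one instruction early as a preload. */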
3824
3825 static int get_final_value(int hr, int i, int *value)
3826 {
3827   int reg=regs[i].regmap[hr];
3828   while(i<slen-1) {
3829     if(regs[i+1].regmap[hr]!=reg) break;
3830     if(!((regs[i+1].isconst>>hr)&1)) break;
3831     if(bt[i+1]) break;
3832     i++;
3833   }
3834   if(i<slen-1) {
3835     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
3836       *value=constmap[i][hr];
3837       return 1;
3838     }
3839     if(!bt[i+1]) {
3840       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
3841         // Load in delay slot, out-of-order execution
3842         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
3843         {
3844           // Precompute load address
3845           *value=constmap[i][hr]+imm[i+2];
3846           return 1;
3847         }
3848       }
3849       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
3850       {
3851         // Precompute load address
3852         *value=constmap[i][hr]+imm[i+1];
3853         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
3854         return 1;
3855       }
3856     }
3857   }
3858   *value=constmap[i][hr];
3859   //printf("c=%x\n",(int)constmap[i][hr]);
3860   if(i==slen-1) return 1;
3861   if(reg<64) {
3862     return !((unneeded_reg[i+1]>>reg)&1);
3863   }else{
3864     return !((unneeded_reg_upper[i+1]>>reg)&1);
3865   }
3866 }
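/* get_final_value() backs the constant propagation: if a host register will
 * hold a chain of known constants over the following instructions (same
 * mapping, still constant, no branch target in between), only the last value
 * needs to be loaded.  A small assumed example:
 *
 *   lui   $a0,0x1f80        ; constmap value 0x1f800000
 *   addiu $a0,$a0,0x1000    ; constmap value 0x1f801000  <- final value
 *
 * load_consts() below then emits one emit_movimm(0x1f801000,hr) instead of
 * two immediate loads; the special cases above additionally fold a following
 * load's immediate into the value so its address comes precomputed. */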
3867
3868 // Load registers with known constants
3869 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
3870 {
3871   int hr,hr2;
3872   // propagate loaded constant flags
3873   if(i==0||bt[i])
3874     regs[i].loadedconst=0;
3875   else {
3876     for(hr=0;hr<HOST_REGS;hr++) {
3877       if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
3878          &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
3879       {
3880         regs[i].loadedconst|=1<<hr;
3881       }
3882     }
3883   }
3884   // Load 32-bit regs
3885   for(hr=0;hr<HOST_REGS;hr++) {
3886     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3887       //if(entry[hr]!=regmap[hr]) {
3888       if(!((regs[i].loadedconst>>hr)&1)) {
3889         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3890           int value,similar=0;
3891           if(get_final_value(hr,i,&value)) {
3892             // see if some other register has similar value
3893             for(hr2=0;hr2<HOST_REGS;hr2++) {
3894               if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
3895                 if(is_similar_value(value,constmap[i][hr2])) {
3896                   similar=1;
3897                   break;
3898                 }
3899               }
3900             }
3901             if(similar) {
3902               int value2;
3903               if(get_final_value(hr2,i,&value2)) // is this needed?
3904                 emit_movimm_from(value2,hr2,value,hr);
3905               else
3906                 emit_movimm(value,hr);
3907             }
3908             else if(value==0) {
3909               emit_zeroreg(hr);
3910             }
3911             else {
3912               emit_movimm(value,hr);
3913             }
3914           }
3915           regs[i].loadedconst|=1<<hr;
3916         }
3917       }
3918     }
3919   }
3920   // Load 64-bit regs
3921   for(hr=0;hr<HOST_REGS;hr++) {
3922     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3923       //if(entry[hr]!=regmap[hr]) {
3924       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3925         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3926           if((is32>>(regmap[hr]&63))&1) {
3927             int lr=get_reg(regmap,regmap[hr]-64);
3928             assert(lr>=0);
3929             emit_sarimm(lr,31,hr);
3930           }
3931           else
3932           {
3933             int value;
3934             if(get_final_value(hr,i,&value)) {
3935               if(value==0) {
3936                 emit_zeroreg(hr);
3937               }
3938               else {
3939                 emit_movimm(value,hr);
3940               }
3941             }
3942           }
3943         }
3944       }
3945     }
3946   }
3947 }
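/* A note on the "similar value" path in load_consts() above: on hosts where
 * an arbitrary 32-bit immediate is expensive to load (ARM in particular), a
 * constant close to one already present in another host register can be
 * derived from that register; emit_movimm_from() is expected to pick the
 * cheaper encoding.  Conceptually (values assumed): */
#if 0
  // hr2 already holds 0x80001000 and hr needs 0x80001004:
  emit_movimm_from(0x80001000,hr2,0x80001004,hr); // may become add hr,hr2,#4
#endif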
3948 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
3949 {
3950   int hr;
3951   // Load 32-bit regs
3952   for(hr=0;hr<HOST_REGS;hr++) {
3953     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3954       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3955         int value=constmap[i][hr];
3956         if(value==0) {
3957           emit_zeroreg(hr);
3958         }
3959         else {
3960           emit_movimm(value,hr);
3961         }
3962       }
3963     }
3964   }
3965   // Load 64-bit regs
3966   for(hr=0;hr<HOST_REGS;hr++) {
3967     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3968       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3969         if((is32>>(regmap[hr]&63))&1) {
3970           int lr=get_reg(regmap,regmap[hr]-64);
3971           assert(lr>=0);
3972           emit_sarimm(lr,31,hr);
3973         }
3974         else
3975         {
3976           int value=constmap[i][hr];
3977           if(value==0) {
3978             emit_zeroreg(hr);
3979           }
3980           else {
3981             emit_movimm(value,hr);
3982           }
3983         }
3984       }
3985     }
3986   }
3987 }
3988
3989 // Write out all dirty registers (except cycle count)
3990 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
3991 {
3992   int hr;
3993   for(hr=0;hr<HOST_REGS;hr++) {
3994     if(hr!=EXCLUDE_REG) {
3995       if(i_regmap[hr]>0) {
3996         if(i_regmap[hr]!=CCREG) {
3997           if((i_dirty>>hr)&1) {
3998             if(i_regmap[hr]<64) {
3999               emit_storereg(i_regmap[hr],hr);
4000             }else{
4001               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4002                 emit_storereg(i_regmap[hr],hr);
4003               }
4004             }
4005           }
4006         }
4007       }
4008     }
4009   }
4010 }
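/* "Dirty" means a host register holds a newer value than the in-memory MIPS
 * register file, so it must be flushed before control leaves compiled code or
 * reaches code assembled with different register assignments.  A minimal
 * sketch of one writeback, with the mapping assumed: */
#if 0
  if(((i_dirty>>3)&1) && i_regmap[3]==2)  // host reg 3 caches MIPS $v0, dirty
    emit_storereg(2,3);                   // store host reg 3 into the $v0 slot
#endif
/* wb_dirtys() above is the blunt form that flushes everything; the _needed
 * and _bt variants below only write what the branch target cannot keep. */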
4011 // Write out the dirty registers that the branch target will reload (pair with load_needed_regs)
4012 // This covers the registers that store_regs_bt does not write
4013 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4014 {
4015   int hr;
4016   int t=(addr-start)>>2;
4017   for(hr=0;hr<HOST_REGS;hr++) {
4018     if(hr!=EXCLUDE_REG) {
4019       if(i_regmap[hr]>0) {
4020         if(i_regmap[hr]!=CCREG) {
4021           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4022             if((i_dirty>>hr)&1) {
4023               if(i_regmap[hr]<64) {
4024                 emit_storereg(i_regmap[hr],hr);
4025               }else{
4026                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4027                   emit_storereg(i_regmap[hr],hr);
4028                 }
4029               }
4030             }
4031           }
4032         }
4033       }
4034     }
4035   }
4036 }
4037
4038 // Load all registers (except cycle count)
4039 void load_all_regs(signed char i_regmap[])
4040 {
4041   int hr;
4042   for(hr=0;hr<HOST_REGS;hr++) {
4043     if(hr!=EXCLUDE_REG) {
4044       if(i_regmap[hr]==0) {
4045         emit_zeroreg(hr);
4046       }
4047       else
4048       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4049       {
4050         emit_loadreg(i_regmap[hr],hr);
4051       }
4052     }
4053   }
4054 }
4055
4056 // Load all current registers also needed by next instruction
4057 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4058 {
4059   int hr;
4060   for(hr=0;hr<HOST_REGS;hr++) {
4061     if(hr!=EXCLUDE_REG) {
4062       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4063         if(i_regmap[hr]==0) {
4064           emit_zeroreg(hr);
4065         }
4066         else
4067         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4068         {
4069           emit_loadreg(i_regmap[hr],hr);
4070         }
4071       }
4072     }
4073   }
4074 }
4075
4076 // Load all regs, storing cycle count if necessary
4077 void load_regs_entry(int t)
4078 {
4079   int hr;
4080   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
4081   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
4082   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4083     emit_storereg(CCREG,HOST_CCREG);
4084   }
4085   // Load 32-bit regs
4086   for(hr=0;hr<HOST_REGS;hr++) {
4087     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4088       if(regs[t].regmap_entry[hr]==0) {
4089         emit_zeroreg(hr);
4090       }
4091       else if(regs[t].regmap_entry[hr]!=CCREG)
4092       {
4093         emit_loadreg(regs[t].regmap_entry[hr],hr);
4094       }
4095     }
4096   }
4097   // Load 64-bit regs
4098   for(hr=0;hr<HOST_REGS;hr++) {
4099     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4100       assert(regs[t].regmap_entry[hr]!=64);
4101       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4102         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4103         if(lr<0) {
4104           emit_loadreg(regs[t].regmap_entry[hr],hr);
4105         }
4106         else
4107         {
4108           emit_sarimm(lr,31,hr);
4109         }
4110       }
4111       else
4112       {
4113         emit_loadreg(regs[t].regmap_entry[hr],hr);
4114       }
4115     }
4116   }
4117 }
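/* load_regs_entry() generates the register-loading prologue used when this
 * block is entered at instruction t with no MIPS registers cached in host
 * registers yet (e.g. via the hash table / jump_in entry points): it adjusts
 * the cycle counter (-ccadj[t], or +1 cycle for a delay-slot entry), spills
 * CCREG if the entry map does not keep it in HOST_CCREG, and loads everything
 * listed in regs[t].regmap_entry. */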
4118
4119 // Store dirty registers prior to branch
4120 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4121 {
4122   if(internal_branch(i_is32,addr))
4123   {
4124     int t=(addr-start)>>2;
4125     int hr;
4126     for(hr=0;hr<HOST_REGS;hr++) {
4127       if(hr!=EXCLUDE_REG) {
4128         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4129           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4130             if((i_dirty>>hr)&1) {
4131               if(i_regmap[hr]<64) {
4132                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4133                   emit_storereg(i_regmap[hr],hr);
4134                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4135                     #ifdef DESTRUCTIVE_WRITEBACK
4136                     emit_sarimm(hr,31,hr);
4137                     emit_storereg(i_regmap[hr]|64,hr);
4138                     #else
4139                     emit_sarimm(hr,31,HOST_TEMPREG);
4140                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4141                     #endif
4142                   }
4143                 }
4144               }else{
4145                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4146                   emit_storereg(i_regmap[hr],hr);
4147                 }
4148               }
4149             }
4150           }
4151         }
4152       }
4153     }
4154   }
4155   else
4156   {
4157     // Branch out of this block, write out all dirty regs
4158     wb_dirtys(i_regmap,i_is32,i_dirty);
4159   }
4160 }
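/* Illustration of the writeback split at branches (state assumed): suppose
 * $v0 is dirty in host reg r3 at the branch and the internal target's entry
 * map also keeps $v0 dirty in r3, then nothing is stored here because the
 * target keeps using the cached copy.  If the target instead expects $v0
 * clean, in another register, or not mapped at all, the value is written back
 * (upper 64-bit halves are handled separately).  For external branches
 * everything is simply flushed via wb_dirtys(). */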
4161
4162 // Load all needed registers for branch target
4163 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4164 {
4165   //if(addr>=start && addr<(start+slen*4))
4166   if(internal_branch(i_is32,addr))
4167   {
4168     int t=(addr-start)>>2;
4169     int hr;
4170     // Store the cycle count before loading something else
4171     if(i_regmap[HOST_CCREG]!=CCREG) {
4172       assert(i_regmap[HOST_CCREG]==-1);
4173     }
4174     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4175       emit_storereg(CCREG,HOST_CCREG);
4176     }
4177     // Load 32-bit regs
4178     for(hr=0;hr<HOST_REGS;hr++) {
4179       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4180         #ifdef DESTRUCTIVE_WRITEBACK
4181         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4182         #else
4183         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4184         #endif
4185           if(regs[t].regmap_entry[hr]==0) {
4186             emit_zeroreg(hr);
4187           }
4188           else if(regs[t].regmap_entry[hr]!=CCREG)
4189           {
4190             emit_loadreg(regs[t].regmap_entry[hr],hr);
4191           }
4192         }
4193       }
4194     }
4195     // Load 64-bit regs
4196     for(hr=0;hr<HOST_REGS;hr++) {
4197       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4198         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4199           assert(regs[t].regmap_entry[hr]!=64);
4200           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4201             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4202             if(lr<0) {
4203               emit_loadreg(regs[t].regmap_entry[hr],hr);
4204             }
4205             else
4206             {
4207               emit_sarimm(lr,31,hr);
4208             }
4209           }
4210           else
4211           {
4212             emit_loadreg(regs[t].regmap_entry[hr],hr);
4213           }
4214         }
4215         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4216           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4217           assert(lr>=0);
4218           emit_sarimm(lr,31,hr);
4219         }
4220       }
4221     }
4222   }
4223 }
4224
4225 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4226 {
4227   if(addr>=start && addr<start+slen*4-4)
4228   {
4229     int t=(addr-start)>>2;
4230     int hr;
4231     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4232     for(hr=0;hr<HOST_REGS;hr++)
4233     {
4234       if(hr!=EXCLUDE_REG)
4235       {
4236         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4237         {
4238           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4239           {
4240             return 0;
4241           }
4242           else
4243           if((i_dirty>>hr)&1)
4244           {
4245             if(i_regmap[hr]<TEMPREG)
4246             {
4247               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4248                 return 0;
4249             }
4250             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4251             {
4252               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4253                 return 0;
4254             }
4255           }
4256         }
4257         else // Same register but is it 32-bit or dirty?
4258         if(i_regmap[hr]>=0)
4259         {
4260           if(!((regs[t].dirty>>hr)&1))
4261           {
4262             if((i_dirty>>hr)&1)
4263             {
4264               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4265               {
4266                 //printf("%x: dirty no match\n",addr);
4267                 return 0;
4268               }
4269             }
4270           }
4271           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4272           {
4273             //printf("%x: is32 no match\n",addr);
4274             return 0;
4275           }
4276         }
4277       }
4278     }
4279     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4280     // Delay slots are not valid branch targets
4281     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4282     // Delay slots require additional processing, so do not match
4283     if(is_ds[t]) return 0;
4284   }
4285   else
4286   {
4287     int hr;
4288     for(hr=0;hr<HOST_REGS;hr++)
4289     {
4290       if(hr!=EXCLUDE_REG)
4291       {
4292         if(i_regmap[hr]>=0)
4293         {
4294           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4295           {
4296             if((i_dirty>>hr)&1)
4297             {
4298               return 0;
4299             }
4300           }
4301         }
4302       }
4303     }
4304   }
4305   return 1;
4306 }
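/* match_bt() decides whether a branch can jump straight to already-assembled
 * code at 'addr' with no fix-up: the target's expected register map, dirty
 * bits and 32/64-bit widths must be compatible with the state at the branch
 * (registers the target does not need are allowed to differ).  When it
 * returns 0 the caller emits store_regs_bt()/load_regs_bt() before the jump;
 * cjump_assemble below, for example, sets invert=1 on a mismatch so the
 * fix-up code can sit on the fall-through path of an inverted conditional. */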
4307
4308 // Used when a branch jumps into the delay slot of another branch
4309 void ds_assemble_entry(int i)
4310 {
4311   int t=(ba[i]-start)>>2;
4312   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4313   assem_debug("Assemble delay slot at %x\n",ba[i]);
4314   assem_debug("<->\n");
4315   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4316     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4317   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4318   address_generation(t,&regs[t],regs[t].regmap_entry);
4319   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4320     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4321   cop1_usable=0;
4322   is_delayslot=0;
4323   switch(itype[t]) {
4324     case ALU:
4325       alu_assemble(t,&regs[t]);break;
4326     case IMM16:
4327       imm16_assemble(t,&regs[t]);break;
4328     case SHIFT:
4329       shift_assemble(t,&regs[t]);break;
4330     case SHIFTIMM:
4331       shiftimm_assemble(t,&regs[t]);break;
4332     case LOAD:
4333       load_assemble(t,&regs[t]);break;
4334     case LOADLR:
4335       loadlr_assemble(t,&regs[t]);break;
4336     case STORE:
4337       store_assemble(t,&regs[t]);break;
4338     case STORELR:
4339       storelr_assemble(t,&regs[t]);break;
4340     case COP0:
4341       cop0_assemble(t,&regs[t]);break;
4342     case COP1:
4343       cop1_assemble(t,&regs[t]);break;
4344     case C1LS:
4345       c1ls_assemble(t,&regs[t]);break;
4346     case COP2:
4347       cop2_assemble(t,&regs[t]);break;
4348     case C2LS:
4349       c2ls_assemble(t,&regs[t]);break;
4350     case C2OP:
4351       c2op_assemble(t,&regs[t]);break;
4352     case FCONV:
4353       fconv_assemble(t,&regs[t]);break;
4354     case FLOAT:
4355       float_assemble(t,&regs[t]);break;
4356     case FCOMP:
4357       fcomp_assemble(t,&regs[t]);break;
4358     case MULTDIV:
4359       multdiv_assemble(t,&regs[t]);break;
4360     case MOV:
4361       mov_assemble(t,&regs[t]);break;
4362     case SYSCALL:
4363     case HLECALL:
4364     case INTCALL:
4365     case SPAN:
4366     case UJUMP:
4367     case RJUMP:
4368     case CJUMP:
4369     case SJUMP:
4370     case FJUMP:
4371       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
4372   }
4373   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4374   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4375   if(internal_branch(regs[t].is32,ba[i]+4))
4376     assem_debug("branch: internal\n");
4377   else
4378     assem_debug("branch: external\n");
4379   assert(internal_branch(regs[t].is32,ba[i]+4));
4380   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4381   emit_jmp(0);
4382 }
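/* Why ds_assemble_entry() exists: MIPS allows a branch to target the delay
 * slot of another branch.  That slot instruction was already assembled as
 * part of its own branch, but against that branch's register state; entering
 * it from elsewhere needs a separate copy assembled against
 * regs[t].regmap_entry, followed by an unconditional jump to the instruction
 * after the slot (ba[i]+4), which must itself be internal to this block. */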
4383
4384 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4385 {
4386   int count;
4387   int jaddr;
4388   int idle=0;
4389   int t=0;
4390   if(itype[i]==RJUMP)
4391   {
4392     *adj=0;
4393   }
4394   //if(ba[i]>=start && ba[i]<(start+slen*4))
4395   if(internal_branch(branch_regs[i].is32,ba[i]))
4396   {
4397     t=(ba[i]-start)>>2;
4398     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4399     else *adj=ccadj[t];
4400   }
4401   else
4402   {
4403     *adj=0;
4404   }
4405   count=ccadj[i];
4406   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4407     // Idle loop
4408     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4409     idle=(int)out;
4410     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4411     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4412     jaddr=(int)out;
4413     emit_jmp(0);
4414   }
4415   else if(*adj==0||invert) {
4416     int cycles=CLOCK_ADJUST(count+2);
4417     // faster loop HACK
4418     if (t&&*adj) {
4419       int rel=t-i;
4420       if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
4421         cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
4422     }
4423     emit_addimm_and_set_flags(cycles,HOST_CCREG);
4424     jaddr=(int)out;
4425     emit_jns(0);
4426   }
4427   else
4428   {
4429     emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
4430     jaddr=(int)out;
4431     emit_jns(0);
4432   }
4433   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4434 }
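/* Cycle counting sketch (how the checks above appear to fit together):
 * HOST_CCREG holds a negative value, roughly Count - next_interupt, so it
 * counts up toward zero as cycles are charged.  At a branch, do_cc() either
 * adds this block's cycles and tests the sign (emit_jns() diverts to a
 * CC_STUB once the counter is no longer negative), or only compares against
 * the pending amount when the addition is deferred (*adj!=0).  The CC_STUB
 * code produced by do_ccstub() below stores the MIPS PC to pcaddr and calls
 * cc_interrupt() to run scheduled events before resuming. */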
4435
4436 void do_ccstub(int n)
4437 {
4438   literal_pool(256);
4439   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4440   set_jump_target(stubs[n][1],(int)out);
4441   int i=stubs[n][4];
4442   if(stubs[n][6]==NULLDS) {
4443     // Delay slot instruction is nullified ("likely" branch)
4444     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4445   }
4446   else if(stubs[n][6]!=TAKEN) {
4447     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4448   }
4449   else {
4450     if(internal_branch(branch_regs[i].is32,ba[i]))
4451       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4452   }
4453   if(stubs[n][5]!=-1)
4454   {
4455     // Save PC as return address
4456     emit_movimm(stubs[n][5],EAX);
4457     emit_writeword(EAX,(int)&pcaddr);
4458   }
4459   else
4460   {
4461     // Return address depends on which way the branch goes
4462     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4463     {
4464       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4465       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4466       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4467       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4468       if(rs1[i]==0)
4469       {
4470         s1l=s2l;s1h=s2h;
4471         s2l=s2h=-1;
4472       }
4473       else if(rs2[i]==0)
4474       {
4475         s2l=s2h=-1;
4476       }
4477       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4478         s1h=s2h=-1;
4479       }
4480       assert(s1l>=0);
4481       #ifdef DESTRUCTIVE_WRITEBACK
4482       if(rs1[i]) {
4483         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4484           emit_loadreg(rs1[i],s1l);
4485       }
4486       else {
4487         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4488           emit_loadreg(rs2[i],s1l);
4489       }
4490       if(s2l>=0)
4491         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4492           emit_loadreg(rs2[i],s2l);
4493       #endif
4494       int hr=0;
4495       int addr=-1,alt=-1,ntaddr=-1;
4496       while(hr<HOST_REGS)
4497       {
4498         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4499            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4500            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4501         {
4502           addr=hr++;break;
4503         }
4504         hr++;
4505       }
4506       while(hr<HOST_REGS)
4507       {
4508         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4509            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4510            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4511         {
4512           alt=hr++;break;
4513         }
4514         hr++;
4515       }
4516       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4517       {
4518         while(hr<HOST_REGS)
4519         {
4520           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4521              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4522              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4523           {
4524             ntaddr=hr;break;
4525           }
4526           hr++;
4527         }
4528         assert(hr<HOST_REGS);
4529       }
4530       if((opcode[i]&0x2f)==4) // BEQ
4531       {
4532         #ifdef HAVE_CMOV_IMM
4533         if(s1h<0) {
4534           if(s2l>=0) emit_cmp(s1l,s2l);
4535           else emit_test(s1l,s1l);
4536           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4537         }
4538         else
4539         #endif
4540         {
4541           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4542           if(s1h>=0) {
4543             if(s2h>=0) emit_cmp(s1h,s2h);
4544             else emit_test(s1h,s1h);
4545             emit_cmovne_reg(alt,addr);
4546           }
4547           if(s2l>=0) emit_cmp(s1l,s2l);
4548           else emit_test(s1l,s1l);
4549           emit_cmovne_reg(alt,addr);
4550         }
4551       }
4552       if((opcode[i]&0x2f)==5) // BNE
4553       {
4554         #ifdef HAVE_CMOV_IMM
4555         if(s1h<0) {
4556           if(s2l>=0) emit_cmp(s1l,s2l);
4557           else emit_test(s1l,s1l);
4558           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4559         }
4560         else
4561         #endif
4562         {
4563           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4564           if(s1h>=0) {
4565             if(s2h>=0) emit_cmp(s1h,s2h);
4566             else emit_test(s1h,s1h);
4567             emit_cmovne_reg(alt,addr);
4568           }
4569           if(s2l>=0) emit_cmp(s1l,s2l);
4570           else emit_test(s1l,s1l);
4571           emit_cmovne_reg(alt,addr);
4572         }
4573       }
4574       if((opcode[i]&0x2f)==6) // BLEZ
4575       {
4576         //emit_movimm(ba[i],alt);
4577         //emit_movimm(start+i*4+8,addr);
4578         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4579         emit_cmpimm(s1l,1);
4580         if(s1h>=0) emit_mov(addr,ntaddr);
4581         emit_cmovl_reg(alt,addr);
4582         if(s1h>=0) {
4583           emit_test(s1h,s1h);
4584           emit_cmovne_reg(ntaddr,addr);
4585           emit_cmovs_reg(alt,addr);
4586         }
4587       }
4588       if((opcode[i]&0x2f)==7) // BGTZ
4589       {
4590         //emit_movimm(ba[i],addr);
4591         //emit_movimm(start+i*4+8,ntaddr);
4592         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4593         emit_cmpimm(s1l,1);
4594         if(s1h>=0) emit_mov(addr,alt);
4595         emit_cmovl_reg(ntaddr,addr);
4596         if(s1h>=0) {
4597           emit_test(s1h,s1h);
4598           emit_cmovne_reg(alt,addr);
4599           emit_cmovs_reg(ntaddr,addr);
4600         }
4601       }
4602       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4603       {
4604         //emit_movimm(ba[i],alt);
4605         //emit_movimm(start+i*4+8,addr);
4606         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4607         if(s1h>=0) emit_test(s1h,s1h);
4608         else emit_test(s1l,s1l);
4609         emit_cmovs_reg(alt,addr);
4610       }
4611       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4612       {
4613         //emit_movimm(ba[i],addr);
4614         //emit_movimm(start+i*4+8,alt);
4615         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4616         if(s1h>=0) emit_test(s1h,s1h);
4617         else emit_test(s1l,s1l);
4618         emit_cmovs_reg(alt,addr);
4619       }
4620       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4621         if(source[i]&0x10000) // BC1T
4622         {
4623           //emit_movimm(ba[i],alt);
4624           //emit_movimm(start+i*4+8,addr);
4625           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4626           emit_testimm(s1l,0x800000);
4627           emit_cmovne_reg(alt,addr);
4628         }
4629         else // BC1F
4630         {
4631           //emit_movimm(ba[i],addr);
4632           //emit_movimm(start+i*4+8,alt);
4633           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4634           emit_testimm(s1l,0x800000);
4635           emit_cmovne_reg(alt,addr);
4636         }
4637       }
4638       emit_writeword(addr,(int)&pcaddr);
4639     }
4640     else
4641     if(itype[i]==RJUMP)
4642     {
4643       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4644       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4645         r=get_reg(branch_regs[i].regmap,RTEMP);
4646       }
4647       emit_writeword(r,(int)&pcaddr);
4648     }
4649     else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);}
4650   }
4651   // Update cycle count
4652   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4653   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
4654   emit_call((int)cc_interrupt);
4655   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
4656   if(stubs[n][6]==TAKEN) {
4657     if(internal_branch(branch_regs[i].is32,ba[i]))
4658       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4659     else if(itype[i]==RJUMP) {
4660       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4661         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4662       else
4663         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4664     }
4665   }else if(stubs[n][6]==NOTTAKEN) {
4666     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4667     else load_all_regs(branch_regs[i].regmap);
4668   }else if(stubs[n][6]==NULLDS) {
4669     // Delay slot instruction is nullified ("likely" branch)
4670     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4671     else load_all_regs(regs[i].regmap);
4672   }else{
4673     load_all_regs(branch_regs[i].regmap);
4674   }
4675   emit_jmp(stubs[n][2]); // return address
4676
4677   /* This works but uses a lot of memory...
4678   emit_readword((int)&last_count,ECX);
4679   emit_add(HOST_CCREG,ECX,EAX);
4680   emit_writeword(EAX,(int)&Count);
4681   emit_call((int)gen_interupt);
4682   emit_readword((int)&Count,HOST_CCREG);
4683   emit_readword((int)&next_interupt,EAX);
4684   emit_readword((int)&pending_exception,EBX);
4685   emit_writeword(EAX,(int)&last_count);
4686   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
4687   emit_test(EBX,EBX);
4688   int jne_instr=(int)out;
4689   emit_jne(0);
4690   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
4691   load_all_regs(branch_regs[i].regmap);
4692   emit_jmp(stubs[n][2]); // return address
4693   set_jump_target(jne_instr,(int)out);
4694   emit_readword((int)&pcaddr,EAX);
4695   // Call get_addr_ht instead of doing the hash table here.
4696   // This code is executed infrequently and takes up a lot of space
4697   // so smaller is better.
4698   emit_storereg(CCREG,HOST_CCREG);
4699   emit_pushreg(EAX);
4700   emit_call((int)get_addr_ht);
4701   emit_loadreg(CCREG,HOST_CCREG);
4702   emit_addimm(ESP,4,ESP);
4703   emit_jmpreg(EAX);*/
4704 }
4705
4706 static void add_to_linker(int addr,int target,int ext)
4707 {
4708   link_addr[linkcount][0]=addr;
4709   link_addr[linkcount][1]=target;
4710   link_addr[linkcount][2]=ext;
4711   linkcount++;
4712 }
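/* add_to_linker() only records a patch site; the actual linking happens after
 * the whole block has been assembled, when each recorded (addr,target,ext)
 * triple is resolved either to a direct jump into already-compiled code or to
 * a call into the lookup/compile path.  The usual pattern, as used by the
 * branch assemblers in this file: */
#if 0
  add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
  emit_jmp(0);  // placeholder jump, patched by the linker pass later
#endif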
4713
4714 static void ujump_assemble_write_ra(int i)
4715 {
4716   int rt;
4717   unsigned int return_address;
4718   rt=get_reg(branch_regs[i].regmap,31);
4719   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4720   //assert(rt>=0);
4721   return_address=start+i*4+8;
4722   if(rt>=0) {
4723     #ifdef USE_MINI_HT
4724     if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
4725       int temp=-1; // note: must be ds-safe
4726       #ifdef HOST_TEMPREG
4727       temp=HOST_TEMPREG;
4728       #endif
4729       if(temp>=0) do_miniht_insert(return_address,rt,temp);
4730       else emit_movimm(return_address,rt);
4731     }
4732     else
4733     #endif
4734     {
4735       #ifdef REG_PREFETCH
4736       if(temp>=0)
4737       {
4738         if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4739       }
4740       #endif
4741       emit_movimm(return_address,rt); // PC into link register
4742       #ifdef IMM_PREFETCH
4743       emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4744       #endif
4745     }
4746   }
4747 }
4748
4749 void ujump_assemble(int i,struct regstat *i_regs)
4750 {
4751   int ra_done=0;
4752   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4753   address_generation(i+1,i_regs,regs[i].regmap_entry);
4754   #ifdef REG_PREFETCH
4755   int temp=get_reg(branch_regs[i].regmap,PTEMP);
4756   if(rt1[i]==31&&temp>=0)
4757   {
4758     signed char *i_regmap=i_regs->regmap;
4759     int return_address=start+i*4+8;
4760     if(get_reg(branch_regs[i].regmap,31)>0)
4761     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4762   }
4763   #endif
4764   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4765     ujump_assemble_write_ra(i); // writeback ra for DS
4766     ra_done=1;
4767   }
4768   ds_assemble(i+1,i_regs);
4769   uint64_t bc_unneeded=branch_regs[i].u;
4770   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4771   bc_unneeded|=1|(1LL<<rt1[i]);
4772   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4773   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4774                 bc_unneeded,bc_unneeded_upper);
4775   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4776   if(!ra_done&&rt1[i]==31)
4777     ujump_assemble_write_ra(i);
4778   int cc,adj;
4779   cc=get_reg(branch_regs[i].regmap,CCREG);
4780   assert(cc==HOST_CCREG);
4781   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4782   #ifdef REG_PREFETCH
4783   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4784   #endif
4785   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4786   if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4787   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4788   if(internal_branch(branch_regs[i].is32,ba[i]))
4789     assem_debug("branch: internal\n");
4790   else
4791     assem_debug("branch: external\n");
4792   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
4793     ds_assemble_entry(i);
4794   }
4795   else {
4796     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
4797     emit_jmp(0);
4798   }
4799 }
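/* ujump_assemble() (J/JAL) order of operations, summarized: the delay slot is
 * assembled first (out of order), the return address start+i*4+8 is written
 * to $ra for JAL (before the slot if the slot reads $ra, otherwise after),
 * registers the target does not need are written back, the cycle check comes
 * from do_cc(), and the jump itself is either expanded inline through
 * ds_assemble_entry() (branch into another branch's delay slot) or emitted as
 * a 0-target jump to be patched by the linker. */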
4800
4801 static void rjump_assemble_write_ra(int i)
4802 {
4803   int rt,return_address;
4804   assert(rt1[i+1]!=rt1[i]);
4805   assert(rt2[i+1]!=rt1[i]);
4806   rt=get_reg(branch_regs[i].regmap,rt1[i]);
4807   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4808   assert(rt>=0);
4809   return_address=start+i*4+8;
4810   #ifdef REG_PREFETCH
4811   if(temp>=0)
4812   {
4813     if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4814   }
4815   #endif
4816   emit_movimm(return_address,rt); // PC into link register
4817   #ifdef IMM_PREFETCH
4818   emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4819   #endif
4820 }
4821
4822 void rjump_assemble(int i,struct regstat *i_regs)
4823 {
4824   int temp;
4825   int rs,cc;
4826   int ra_done=0;
4827   rs=get_reg(branch_regs[i].regmap,rs1[i]);
4828   assert(rs>=0);
4829   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4830     // Delay slot abuse: the delay slot overwrites the branch address register, so make a copy
4831     temp=get_reg(branch_regs[i].regmap,RTEMP);
4832     assert(temp>=0);
4833     assert(regs[i].regmap[temp]==RTEMP);
4834     emit_mov(rs,temp);
4835     rs=temp;
4836   }
4837   address_generation(i+1,i_regs,regs[i].regmap_entry);
4838   #ifdef REG_PREFETCH
4839   if(rt1[i]==31)
4840   {
4841     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
4842       signed char *i_regmap=i_regs->regmap;
4843       int return_address=start+i*4+8;
4844       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4845     }
4846   }
4847   #endif
4848   #ifdef USE_MINI_HT
4849   if(rs1[i]==31) {
4850     int rh=get_reg(regs[i].regmap,RHASH);
4851     if(rh>=0) do_preload_rhash(rh);
4852   }
4853   #endif
4854   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4855     rjump_assemble_write_ra(i);
4856     ra_done=1;
4857   }
4858   ds_assemble(i+1,i_regs);
4859   uint64_t bc_unneeded=branch_regs[i].u;
4860   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4861   bc_unneeded|=1|(1LL<<rt1[i]);
4862   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4863   bc_unneeded&=~(1LL<<rs1[i]);
4864   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4865                 bc_unneeded,bc_unneeded_upper);
4866   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
4867   if(!ra_done&&rt1[i]!=0)
4868     rjump_assemble_write_ra(i);
4869   cc=get_reg(branch_regs[i].regmap,CCREG);
4870   assert(cc==HOST_CCREG);
4871   (void)cc;
4872   #ifdef USE_MINI_HT
4873   int rh=get_reg(branch_regs[i].regmap,RHASH);
4874   int ht=get_reg(branch_regs[i].regmap,RHTBL);
4875   if(rs1[i]==31) {
4876     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
4877     do_preload_rhtbl(ht);
4878     do_rhash(rs,rh);
4879   }
4880   #endif
4881   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4882   #ifdef DESTRUCTIVE_WRITEBACK
4883   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
4884     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
4885       emit_loadreg(rs1[i],rs);
4886     }
4887   }
4888   #endif
4889   #ifdef REG_PREFETCH
4890   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4891   #endif
4892   #ifdef USE_MINI_HT
4893   if(rs1[i]==31) {
4894     do_miniht_load(ht,rh);
4895   }
4896   #endif
4897   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
4898   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
4899   //assert(adj==0);
4900   emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
4901   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
4902   if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
4903     // special case for RFE
4904     emit_jmp(0);
4905   else
4906     emit_jns(0);
4907   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4908   #ifdef USE_MINI_HT
4909   if(rs1[i]==31) {
4910     do_miniht_jump(rs,rh,ht);
4911   }
4912   else
4913   #endif
4914   {
4915     //if(rs!=EAX) emit_mov(rs,EAX);
4916     //emit_jmp((int)jump_vaddr_eax);
4917     emit_jmp(jump_vaddr_reg[rs]);
4918   }
4919   /* Check hash table
4920   temp=!rs;
4921   emit_mov(rs,temp);
4922   emit_shrimm(rs,16,rs);
4923   emit_xor(temp,rs,rs);
4924   emit_movzwl_reg(rs,rs);
4925   emit_shlimm(rs,4,rs);
4926   emit_cmpmem_indexed((int)hash_table,rs,temp);
4927   emit_jne((int)out+14);
4928   emit_readword_indexed((int)hash_table+4,rs,rs);
4929   emit_jmpreg(rs);
4930   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
4931   emit_addimm_no_flags(8,rs);
4932   emit_jeq((int)out-17);
4933   // No hit on hash table, call compiler
4934   emit_pushreg(temp);
4935 //DEBUG >
4936 #ifdef DEBUG_CYCLE_COUNT
4937   emit_readword((int)&last_count,ECX);
4938   emit_add(HOST_CCREG,ECX,HOST_CCREG);
4939   emit_readword((int)&next_interupt,ECX);
4940   emit_writeword(HOST_CCREG,(int)&Count);
4941   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
4942   emit_writeword(ECX,(int)&last_count);
4943 #endif
4944 //DEBUG <
4945   emit_storereg(CCREG,HOST_CCREG);
4946   emit_call((int)get_addr);
4947   emit_loadreg(CCREG,HOST_CCREG);
4948   emit_addimm(ESP,4,ESP);
4949   emit_jmpreg(EAX);*/
4950   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4951   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
4952   #endif
4953 }
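/* rjump_assemble() (JR/JALR) cannot be linked statically, so after the delay
 * slot and the cycle check it dispatches through jump_vaddr_reg[rs], which
 * resolves the target at run time (hash table lookup first, then the
 * compiler).  If the delay slot clobbers the source register the target is
 * copied to RTEMP first; with USE_MINI_HT, "jr $ra" goes through a small
 * return-address cache instead of the full lookup. */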
4954
4955 void cjump_assemble(int i,struct regstat *i_regs)
4956 {
4957   signed char *i_regmap=i_regs->regmap;
4958   int cc;
4959   int match;
4960   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4961   assem_debug("match=%d\n",match);
4962   int s1h,s1l,s2h,s2l;
4963   int prev_cop1_usable=cop1_usable;
4964   int unconditional=0,nop=0;
4965   int only32=0;
4966   int invert=0;
4967   int internal=internal_branch(branch_regs[i].is32,ba[i]);
4968   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4969   if(!match) invert=1;
4970   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4971   if(i>(ba[i]-start)>>2) invert=1;
4972   #endif
4973
4974   if(ooo[i]) {
4975     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4976     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4977     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4978     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4979   }
4980   else {
4981     s1l=get_reg(i_regmap,rs1[i]);
4982     s1h=get_reg(i_regmap,rs1[i]|64);
4983     s2l=get_reg(i_regmap,rs2[i]);
4984     s2h=get_reg(i_regmap,rs2[i]|64);
4985   }
4986   if(rs1[i]==0&&rs2[i]==0)
4987   {
4988     if(opcode[i]&1) nop=1;
4989     else unconditional=1;
4990     //assert(opcode[i]!=5);
4991     //assert(opcode[i]!=7);
4992     //assert(opcode[i]!=0x15);
4993     //assert(opcode[i]!=0x17);
4994   }
4995   else if(rs1[i]==0)
4996   {
4997     s1l=s2l;s1h=s2h;
4998     s2l=s2h=-1;
4999     only32=(regs[i].was32>>rs2[i])&1;
5000   }
5001   else if(rs2[i]==0)
5002   {
5003     s2l=s2h=-1;
5004     only32=(regs[i].was32>>rs1[i])&1;
5005   }
5006   else {
5007     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5008   }
5009
5010   if(ooo[i]) {
5011     // Out of order execution (delay slot first)
5012     //printf("OOOE\n");
5013     address_generation(i+1,i_regs,regs[i].regmap_entry);
5014     ds_assemble(i+1,i_regs);
5015     int adj;
5016     uint64_t bc_unneeded=branch_regs[i].u;
5017     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5018     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5019     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5020     bc_unneeded|=1;
5021     bc_unneeded_upper|=1;
5022     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5023                   bc_unneeded,bc_unneeded_upper);
5024     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5025     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5026     cc=get_reg(branch_regs[i].regmap,CCREG);
5027     assert(cc==HOST_CCREG);
5028     if(unconditional)
5029       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5030     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5031     //assem_debug("cycle count (adj)\n");
5032     if(unconditional) {
5033       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5034       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5035         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5036         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5037         if(internal)
5038           assem_debug("branch: internal\n");
5039         else
5040           assem_debug("branch: external\n");
5041         if(internal&&is_ds[(ba[i]-start)>>2]) {
5042           ds_assemble_entry(i);
5043         }
5044         else {
5045           add_to_linker((int)out,ba[i],internal);
5046           emit_jmp(0);
5047         }
5048         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5049         if(((u_int)out)&7) emit_addnop(0);
5050         #endif
5051       }
5052     }
5053     else if(nop) {
5054       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5055       int jaddr=(int)out;
5056       emit_jns(0);
5057       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5058     }
5059     else {
5060       int taken=0,nottaken=0,nottaken1=0;
5061       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5062       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5063       if(!only32)
5064       {
5065         assert(s1h>=0);
5066         if(opcode[i]==4) // BEQ
5067         {
5068           if(s2h>=0) emit_cmp(s1h,s2h);
5069           else emit_test(s1h,s1h);
5070           nottaken1=(int)out;
5071           emit_jne(1);
5072         }
5073         if(opcode[i]==5) // BNE
5074         {
5075           if(s2h>=0) emit_cmp(s1h,s2h);
5076           else emit_test(s1h,s1h);
5077           if(invert) taken=(int)out;
5078           else add_to_linker((int)out,ba[i],internal);
5079           emit_jne(0);
5080         }
5081         if(opcode[i]==6) // BLEZ
5082         {
5083           emit_test(s1h,s1h);
5084           if(invert) taken=(int)out;
5085           else add_to_linker((int)out,ba[i],internal);
5086           emit_js(0);
5087           nottaken1=(int)out;
5088           emit_jne(1);
5089         }
5090         if(opcode[i]==7) // BGTZ
5091         {
5092           emit_test(s1h,s1h);
5093           nottaken1=(int)out;
5094           emit_js(1);
5095           if(invert) taken=(int)out;
5096           else add_to_linker((int)out,ba[i],internal);
5097           emit_jne(0);
5098         }
5099       } // if(!only32)
5100
5101       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5102       assert(s1l>=0);
5103       if(opcode[i]==4) // BEQ
5104       {
5105         if(s2l>=0) emit_cmp(s1l,s2l);
5106         else emit_test(s1l,s1l);
5107         if(invert){
5108           nottaken=(int)out;
5109           emit_jne(1);
5110         }else{
5111           add_to_linker((int)out,ba[i],internal);
5112           emit_jeq(0);
5113         }
5114       }
5115       if(opcode[i]==5) // BNE
5116       {
5117         if(s2l>=0) emit_cmp(s1l,s2l);
5118         else emit_test(s1l,s1l);
5119         if(invert){
5120           nottaken=(int)out;
5121           emit_jeq(1);
5122         }else{
5123           add_to_linker((int)out,ba[i],internal);
5124           emit_jne(0);
5125         }
5126       }
5127       if(opcode[i]==6) // BLEZ
5128       {
5129         emit_cmpimm(s1l,1);
5130         if(invert){
5131           nottaken=(int)out;
5132           emit_jge(1);
5133         }else{
5134           add_to_linker((int)out,ba[i],internal);
5135           emit_jl(0);
5136         }
5137       }
5138       if(opcode[i]==7) // BGTZ
5139       {
5140         emit_cmpimm(s1l,1);
5141         if(invert){
5142           nottaken=(int)out;
5143           emit_jl(1);
5144         }else{
5145           add_to_linker((int)out,ba[i],internal);
5146           emit_jge(0);
5147         }
5148       }
5149       if(invert) {
5150         if(taken) set_jump_target(taken,(int)out);
5151         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5152         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5153           if(adj) {
5154             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5155             add_to_linker((int)out,ba[i],internal);
5156           }else{
5157             emit_addnop(13);
5158             add_to_linker((int)out,ba[i],internal*2);
5159           }
5160           emit_jmp(0);
5161         }else
5162         #endif
5163         {
5164           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5165           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5166           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5167           if(internal)
5168             assem_debug("branch: internal\n");
5169           else
5170             assem_debug("branch: external\n");
5171           if(internal&&is_ds[(ba[i]-start)>>2]) {
5172             ds_assemble_entry(i);
5173           }
5174           else {
5175             add_to_linker((int)out,ba[i],internal);
5176             emit_jmp(0);
5177           }
5178         }
5179         set_jump_target(nottaken,(int)out);
5180       }
5181
5182       if(nottaken1) set_jump_target(nottaken1,(int)out);
5183       if(adj) {
5184         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5185       }
5186     } // (!unconditional)
5187   } // if(ooo)
5188   else
5189   {
5190     // In-order execution (branch first)
5191     //if(likely[i]) printf("IOL\n");
5192     //else
5193     //printf("IOE\n");
5194     int taken=0,nottaken=0,nottaken1=0;
5195     if(!unconditional&&!nop) {
5196       if(!only32)
5197       {
5198         assert(s1h>=0);
5199         if((opcode[i]&0x2f)==4) // BEQ
5200         {
5201           if(s2h>=0) emit_cmp(s1h,s2h);
5202           else emit_test(s1h,s1h);
5203           nottaken1=(int)out;
5204           emit_jne(2);
5205         }
5206         if((opcode[i]&0x2f)==5) // BNE
5207         {
5208           if(s2h>=0) emit_cmp(s1h,s2h);
5209           else emit_test(s1h,s1h);
5210           taken=(int)out;
5211           emit_jne(1);
5212         }
5213         if((opcode[i]&0x2f)==6) // BLEZ
5214         {
5215           emit_test(s1h,s1h);
5216           taken=(int)out;
5217           emit_js(1);
5218           nottaken1=(int)out;
5219           emit_jne(2);
5220         }
5221         if((opcode[i]&0x2f)==7) // BGTZ
5222         {
5223           emit_test(s1h,s1h);
5224           nottaken1=(int)out;
5225           emit_js(2);
5226           taken=(int)out;
5227           emit_jne(1);
5228         }
5229       } // if(!only32)
5230
5231       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5232       assert(s1l>=0);
5233       if((opcode[i]&0x2f)==4) // BEQ
5234       {
5235         if(s2l>=0) emit_cmp(s1l,s2l);
5236         else emit_test(s1l,s1l);
5237         nottaken=(int)out;
5238         emit_jne(2);
5239       }
5240       if((opcode[i]&0x2f)==5) // BNE
5241       {
5242         if(s2l>=0) emit_cmp(s1l,s2l);
5243         else emit_test(s1l,s1l);
5244         nottaken=(int)out;
5245         emit_jeq(2);
5246       }
5247       if((opcode[i]&0x2f)==6) // BLEZ
5248       {
5249         emit_cmpimm(s1l,1);
5250         nottaken=(int)out;
5251         emit_jge(2);
5252       }
5253       if((opcode[i]&0x2f)==7) // BGTZ
5254       {
5255         emit_cmpimm(s1l,1);
5256         nottaken=(int)out;
5257         emit_jl(2);
5258       }
5259     } // if(!unconditional)
5260     int adj;
5261     uint64_t ds_unneeded=branch_regs[i].u;
5262     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5263     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5264     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5265     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5266     ds_unneeded|=1;
5267     ds_unneeded_upper|=1;
5268     // branch taken
5269     if(!nop) {
5270       if(taken) set_jump_target(taken,(int)out);
5271       assem_debug("1:\n");
5272       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5273                     ds_unneeded,ds_unneeded_upper);
5274       // load regs
5275       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5276       address_generation(i+1,&branch_regs[i],0);
5277       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5278       ds_assemble(i+1,&branch_regs[i]);
5279       cc=get_reg(branch_regs[i].regmap,CCREG);
5280       if(cc==-1) {
5281         emit_loadreg(CCREG,cc=HOST_CCREG);
5282         // CHECK: Is the following instruction (fall thru) allocated ok?
5283       }
5284       assert(cc==HOST_CCREG);
5285       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5286       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5287       assem_debug("cycle count (adj)\n");
5288       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5289       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5290       if(internal)
5291         assem_debug("branch: internal\n");
5292       else
5293         assem_debug("branch: external\n");
5294       if(internal&&is_ds[(ba[i]-start)>>2]) {
5295         ds_assemble_entry(i);
5296       }
5297       else {
5298         add_to_linker((int)out,ba[i],internal);
5299         emit_jmp(0);
5300       }
5301     }
5302     // branch not taken
5303     cop1_usable=prev_cop1_usable;
5304     if(!unconditional) {
5305       if(nottaken1) set_jump_target(nottaken1,(int)out);
5306       set_jump_target(nottaken,(int)out);
5307       assem_debug("2:\n");
5308       if(!likely[i]) {
5309         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5310                       ds_unneeded,ds_unneeded_upper);
5311         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5312         address_generation(i+1,&branch_regs[i],0);
5313         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5314         ds_assemble(i+1,&branch_regs[i]);
5315       }
5316       cc=get_reg(branch_regs[i].regmap,CCREG);
5317       if(cc==-1&&!likely[i]) {
5318         // Cycle count isn't in a register, temporarily load it then write it out
5319         emit_loadreg(CCREG,HOST_CCREG);
5320         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5321         int jaddr=(int)out;
5322         emit_jns(0);
5323         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5324         emit_storereg(CCREG,HOST_CCREG);
5325       }
5326       else{
5327         cc=get_reg(i_regmap,CCREG);
5328         assert(cc==HOST_CCREG);
5329         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5330         int jaddr=(int)out;
5331         emit_jns(0);
5332         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5333       }
5334     }
5335   }
5336 }
5337
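// Assemble a REGIMM branch (BLTZ/BGEZ/BLTZAL/BGEZAL and their "likely"
// forms): the source register is compared against zero via its sign bit,
// and for the -AL variants the return address is written to r31 whether or
// not the branch is taken.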
5338 void sjump_assemble(int i,struct regstat *i_regs)
5339 {
5340   signed char *i_regmap=i_regs->regmap;
5341   int cc;
5342   int match;
5343   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5344   assem_debug("smatch=%d\n",match);
5345   int s1h,s1l;
5346   int prev_cop1_usable=cop1_usable;
5347   int unconditional=0,nevertaken=0;
5348   int only32=0;
5349   int invert=0;
5350   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5351   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5352   if(!match) invert=1;
5353   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5354   if(i>(ba[i]-start)>>2) invert=1;
5355   #endif
5356
5357   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5358   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5359
5360   if(ooo[i]) {
5361     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5362     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5363   }
5364   else {
5365     s1l=get_reg(i_regmap,rs1[i]);
5366     s1h=get_reg(i_regmap,rs1[i]|64);
5367   }
5368   if(rs1[i]==0)
5369   {
5370     if(opcode2[i]&1) unconditional=1;
5371     else nevertaken=1;
5372     // These are never taken (r0 is never less than zero)
5373     //assert(opcode2[i]!=0);
5374     //assert(opcode2[i]!=2);
5375     //assert(opcode2[i]!=0x10);
5376     //assert(opcode2[i]!=0x12);
5377   }
5378   else {
5379     only32=(regs[i].was32>>rs1[i])&1;
5380   }
5381
5382   if(ooo[i]) {
5383     // Out of order execution (delay slot first)
5384     //printf("OOOE\n");
5385     address_generation(i+1,i_regs,regs[i].regmap_entry);
5386     ds_assemble(i+1,i_regs);
5387     int adj;
5388     uint64_t bc_unneeded=branch_regs[i].u;
5389     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5390     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5391     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5392     bc_unneeded|=1;
5393     bc_unneeded_upper|=1;
5394     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5395                   bc_unneeded,bc_unneeded_upper);
5396     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5397     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5398     if(rt1[i]==31) {
5399       int rt,return_address;
5400       rt=get_reg(branch_regs[i].regmap,31);
5401       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5402       if(rt>=0) {
5403         // Save the PC even if the branch is not taken
5404         return_address=start+i*4+8;
5405         emit_movimm(return_address,rt); // PC into link register
5406         #ifdef IMM_PREFETCH
5407         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5408         #endif
5409       }
5410     }
5411     cc=get_reg(branch_regs[i].regmap,CCREG);
5412     assert(cc==HOST_CCREG);
5413     if(unconditional)
5414       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5415     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5416     assem_debug("cycle count (adj)\n");
5417     if(unconditional) {
5418       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5419       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5420         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5421         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5422         if(internal)
5423           assem_debug("branch: internal\n");
5424         else
5425           assem_debug("branch: external\n");
5426         if(internal&&is_ds[(ba[i]-start)>>2]) {
5427           ds_assemble_entry(i);
5428         }
5429         else {
5430           add_to_linker((int)out,ba[i],internal);
5431           emit_jmp(0);
5432         }
5433         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5434         if(((u_int)out)&7) emit_addnop(0);
5435         #endif
5436       }
5437     }
5438     else if(nevertaken) {
5439       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5440       int jaddr=(int)out;
5441       emit_jns(0);
5442       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5443     }
5444     else {
5445       int nottaken=0;
5446       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5447       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5448       if(!only32)
5449       {
5450         assert(s1h>=0);
5451         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5452         {
5453           emit_test(s1h,s1h);
5454           if(invert){
5455             nottaken=(int)out;
5456             emit_jns(1);
5457           }else{
5458             add_to_linker((int)out,ba[i],internal);
5459             emit_js(0);
5460           }
5461         }
5462         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5463         {
5464           emit_test(s1h,s1h);
5465           if(invert){
5466             nottaken=(int)out;
5467             emit_js(1);
5468           }else{
5469             add_to_linker((int)out,ba[i],internal);
5470             emit_jns(0);
5471           }
5472         }
5473       } // if(!only32)
5474       else
5475       {
5476         assert(s1l>=0);
5477         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5478         {
5479           emit_test(s1l,s1l);
5480           if(invert){
5481             nottaken=(int)out;
5482             emit_jns(1);
5483           }else{
5484             add_to_linker((int)out,ba[i],internal);
5485             emit_js(0);
5486           }
5487         }
5488         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5489         {
5490           emit_test(s1l,s1l);
5491           if(invert){
5492             nottaken=(int)out;
5493             emit_js(1);
5494           }else{
5495             add_to_linker((int)out,ba[i],internal);
5496             emit_jns(0);
5497           }
5498         }
5499       } // if(!only32)
5500
5501       if(invert) {
5502         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5503         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5504           if(adj) {
5505             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5506             add_to_linker((int)out,ba[i],internal);
5507           }else{
5508             emit_addnop(13);
5509             add_to_linker((int)out,ba[i],internal*2);
5510           }
5511           emit_jmp(0);
5512         }else
5513         #endif
5514         {
5515           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5516           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5517           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5518           if(internal)
5519             assem_debug("branch: internal\n");
5520           else
5521             assem_debug("branch: external\n");
5522           if(internal&&is_ds[(ba[i]-start)>>2]) {
5523             ds_assemble_entry(i);
5524           }
5525           else {
5526             add_to_linker((int)out,ba[i],internal);
5527             emit_jmp(0);
5528           }
5529         }
5530         set_jump_target(nottaken,(int)out);
5531       }
5532
5533       if(adj) {
5534         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5535       }
5536     } // (!unconditional)
5537   } // if(ooo)
5538   else
5539   {
5540     // In-order execution (branch first)
5541     //printf("IOE\n");
5542     int nottaken=0;
5543     if(rt1[i]==31) {
5544       int rt,return_address;
5545       rt=get_reg(branch_regs[i].regmap,31);
5546       if(rt>=0) {
5547         // Save the PC even if the branch is not taken
5548         return_address=start+i*4+8;
5549         emit_movimm(return_address,rt); // PC into link register
5550         #ifdef IMM_PREFETCH
5551         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5552         #endif
5553       }
5554     }
5555     if(!unconditional) {
5556       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5557       if(!only32)
5558       {
5559         assert(s1h>=0);
5560         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5561         {
5562           emit_test(s1h,s1h);
5563           nottaken=(int)out;
5564           emit_jns(1);
5565         }
5566         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5567         {
5568           emit_test(s1h,s1h);
5569           nottaken=(int)out;
5570           emit_js(1);
5571         }
5572       } // if(!only32)
5573       else
5574       {
5575         assert(s1l>=0);
5576         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5577         {
5578           emit_test(s1l,s1l);
5579           nottaken=(int)out;
5580           emit_jns(1);
5581         }
5582         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5583         {
5584           emit_test(s1l,s1l);
5585           nottaken=(int)out;
5586           emit_js(1);
5587         }
5588       }
5589     } // if(!unconditional)
5590     int adj;
5591     uint64_t ds_unneeded=branch_regs[i].u;
5592     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5593     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5594     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5595     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5596     ds_unneeded|=1;
5597     ds_unneeded_upper|=1;
5598     // branch taken
5599     if(!nevertaken) {
5600       //assem_debug("1:\n");
5601       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5602                     ds_unneeded,ds_unneeded_upper);
5603       // load regs
5604       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5605       address_generation(i+1,&branch_regs[i],0);
5606       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5607       ds_assemble(i+1,&branch_regs[i]);
5608       cc=get_reg(branch_regs[i].regmap,CCREG);
5609       if(cc==-1) {
5610         emit_loadreg(CCREG,cc=HOST_CCREG);
5611         // CHECK: Is the following instruction (fall thru) allocated ok?
5612       }
5613       assert(cc==HOST_CCREG);
5614       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5615       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5616       assem_debug("cycle count (adj)\n");
5617       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5618       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5619       if(internal)
5620         assem_debug("branch: internal\n");
5621       else
5622         assem_debug("branch: external\n");
5623       if(internal&&is_ds[(ba[i]-start)>>2]) {
5624         ds_assemble_entry(i);
5625       }
5626       else {
5627         add_to_linker((int)out,ba[i],internal);
5628         emit_jmp(0);
5629       }
5630     }
5631     // branch not taken
5632     cop1_usable=prev_cop1_usable;
5633     if(!unconditional) {
5634       set_jump_target(nottaken,(int)out);
5635       assem_debug("1:\n");
5636       if(!likely[i]) {
5637         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5638                       ds_unneeded,ds_unneeded_upper);
5639         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5640         address_generation(i+1,&branch_regs[i],0);
5641         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5642         ds_assemble(i+1,&branch_regs[i]);
5643       }
5644       cc=get_reg(branch_regs[i].regmap,CCREG);
5645       if(cc==-1&&!likely[i]) {
5646         // Cycle count isn't in a register, temporarily load it then write it out
5647         emit_loadreg(CCREG,HOST_CCREG);
5648         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5649         int jaddr=(int)out;
5650         emit_jns(0);
5651         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5652         emit_storereg(CCREG,HOST_CCREG);
5653       }
5654       else{
5655         cc=get_reg(i_regmap,CCREG);
5656         assert(cc==HOST_CCREG);
5657         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5658         int jaddr=(int)out;
5659         emit_jns(0);
5660         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5661       }
5662     }
5663   }
5664 }
5665
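// Assemble a COP1 condition branch (BC1F/BC1T and the "likely" forms):
// tests the condition bit (0x800000) of the FP status word cached in FSREG,
// after first checking that COP1 is usable.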
5666 void fjump_assemble(int i,struct regstat *i_regs)
5667 {
5668   signed char *i_regmap=i_regs->regmap;
5669   int cc;
5670   int match;
5671   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5672   assem_debug("fmatch=%d\n",match);
5673   int fs,cs;
5674   int eaddr;
5675   int invert=0;
5676   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5677   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5678   if(!match) invert=1;
5679   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5680   if(i>(ba[i]-start)>>2) invert=1;
5681   #endif
5682
5683   if(ooo[i]) {
5684     fs=get_reg(branch_regs[i].regmap,FSREG);
5685     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5686   }
5687   else {
5688     fs=get_reg(i_regmap,FSREG);
5689   }
5690
5691   // Check cop1 unusable
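  // CSREG caches the COP0 status register; bit 29 (0x20000000) is the CU1
  // (coprocessor 1 usable) flag.  If it is clear, branch to an FP_STUB,
  // which handles the coprocessor-unusable case.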
5692   if(!cop1_usable) {
5693     cs=get_reg(i_regmap,CSREG);
5694     assert(cs>=0);
5695     emit_testimm(cs,0x20000000);
5696     eaddr=(int)out;
5697     emit_jeq(0);
5698     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
5699     cop1_usable=1;
5700   }
5701
5702   if(ooo[i]) {
5703     // Out of order execution (delay slot first)
5704     //printf("OOOE\n");
5705     ds_assemble(i+1,i_regs);
5706     int adj;
5707     uint64_t bc_unneeded=branch_regs[i].u;
5708     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5709     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5710     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5711     bc_unneeded|=1;
5712     bc_unneeded_upper|=1;
5713     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5714                   bc_unneeded,bc_unneeded_upper);
5715     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5716     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5717     cc=get_reg(branch_regs[i].regmap,CCREG);
5718     assert(cc==HOST_CCREG);
5719     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5720     assem_debug("cycle count (adj)\n");
5721     if(1) {
5722       int nottaken=0;
5723       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5724       if(1) {
5725         assert(fs>=0);
5726         emit_testimm(fs,0x800000);
5727         if(source[i]&0x10000) // BC1T
5728         {
5729           if(invert){
5730             nottaken=(int)out;
5731             emit_jeq(1);
5732           }else{
5733             add_to_linker((int)out,ba[i],internal);
5734             emit_jne(0);
5735           }
5736         }
5737         else // BC1F
5738         {
5739           if(invert){
5740             nottaken=(int)out;
5741             emit_jne(1);
5742           }else{
5743             add_to_linker((int)out,ba[i],internal);
5744             emit_jeq(0);
5745           }
5746         }
5747       } // if(!only32)
5748
5749       if(invert) {
5750         if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5751         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5752         else if(match) emit_addnop(13);
5753         #endif
5754         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5755         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5756         if(internal)
5757           assem_debug("branch: internal\n");
5758         else
5759           assem_debug("branch: external\n");
5760         if(internal&&is_ds[(ba[i]-start)>>2]) {
5761           ds_assemble_entry(i);
5762         }
5763         else {
5764           add_to_linker((int)out,ba[i],internal);
5765           emit_jmp(0);
5766         }
5767         set_jump_target(nottaken,(int)out);
5768       }
5769
5770       if(adj) {
5771         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5772       }
5773     } // (!unconditional)
5774   } // if(ooo)
5775   else
5776   {
5777     // In-order execution (branch first)
5778     //printf("IOE\n");
5779     int nottaken=0;
5780     if(1) {
5781       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5782       if(1) {
5783         assert(fs>=0);
5784         emit_testimm(fs,0x800000);
5785         if(source[i]&0x10000) // BC1T
5786         {
5787           nottaken=(int)out;
5788           emit_jeq(1);
5789         }
5790         else // BC1F
5791         {
5792           nottaken=(int)out;
5793           emit_jne(1);
5794         }
5795       }
5796     } // if(!unconditional)
5797     int adj;
5798     uint64_t ds_unneeded=branch_regs[i].u;
5799     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5800     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5801     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5802     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5803     ds_unneeded|=1;
5804     ds_unneeded_upper|=1;
5805     // branch taken
5806     //assem_debug("1:\n");
5807     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5808                   ds_unneeded,ds_unneeded_upper);
5809     // load regs
5810     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5811     address_generation(i+1,&branch_regs[i],0);
5812     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5813     ds_assemble(i+1,&branch_regs[i]);
5814     cc=get_reg(branch_regs[i].regmap,CCREG);
5815     if(cc==-1) {
5816       emit_loadreg(CCREG,cc=HOST_CCREG);
5817       // CHECK: Is the following instruction (fall thru) allocated ok?
5818     }
5819     assert(cc==HOST_CCREG);
5820     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5821     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5822     assem_debug("cycle count (adj)\n");
5823     if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5824     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5825     if(internal)
5826       assem_debug("branch: internal\n");
5827     else
5828       assem_debug("branch: external\n");
5829     if(internal&&is_ds[(ba[i]-start)>>2]) {
5830       ds_assemble_entry(i);
5831     }
5832     else {
5833       add_to_linker((int)out,ba[i],internal);
5834       emit_jmp(0);
5835     }
5836
5837     // branch not taken
5838     if(1) { // <- FIXME (don't need this)
5839       set_jump_target(nottaken,(int)out);
5840       assem_debug("1:\n");
5841       if(!likely[i]) {
5842         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5843                       ds_unneeded,ds_unneeded_upper);
5844         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5845         address_generation(i+1,&branch_regs[i],0);
5846         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5847         ds_assemble(i+1,&branch_regs[i]);
5848       }
5849       cc=get_reg(branch_regs[i].regmap,CCREG);
5850       if(cc==-1&&!likely[i]) {
5851         // Cycle count isn't in a register, temporarily load it then write it out
5852         emit_loadreg(CCREG,HOST_CCREG);
5853         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5854         int jaddr=(int)out;
5855         emit_jns(0);
5856         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5857         emit_storereg(CCREG,HOST_CCREG);
5858       }
5859       else{
5860         cc=get_reg(i_regmap,CCREG);
5861         assert(cc==HOST_CCREG);
5862         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5863         int jaddr=(int)out;
5864         emit_jns(0);
5865         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5866       }
5867     }
5868   }
5869 }
5870
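// Assemble a branch whose delay slot falls in the next page (and therefore
// in a different block).  Instead of emitting a conditional jump, the target
// address is selected with conditional moves and left in HOST_BTREG
// (branch_target); control then leaves through an external jump stub and the
// delay slot is assembled by pagespan_ds() at the start of the next block.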
5871 static void pagespan_assemble(int i,struct regstat *i_regs)
5872 {
5873   int s1l=get_reg(i_regs->regmap,rs1[i]);
5874   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
5875   int s2l=get_reg(i_regs->regmap,rs2[i]);
5876   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
5877   int taken=0;
5878   int nottaken=0;
5879   int unconditional=0;
5880   if(rs1[i]==0)
5881   {
5882     s1l=s2l;s1h=s2h;
5883     s2l=s2h=-1;
5884   }
5885   else if(rs2[i]==0)
5886   {
5887     s2l=s2h=-1;
5888   }
5889   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
5890     s1h=s2h=-1;
5891   }
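  // Pick scratch host registers that don't hold the branch sources or the
  // cycle counter: 'addr' receives the selected branch target (ideally
  // HOST_BTREG itself), while 'alt' and 'ntaddr' hold the alternative
  // (taken or fall-through) addresses for the conditional-move selection.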
5892   int hr=0;
5893   int addr=-1,alt=-1,ntaddr=-1;
5894   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
5895   else {
5896     while(hr<HOST_REGS)
5897     {
5898       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5899          (i_regs->regmap[hr]&63)!=rs1[i] &&
5900          (i_regs->regmap[hr]&63)!=rs2[i] )
5901       {
5902         addr=hr++;break;
5903       }
5904       hr++;
5905     }
5906   }
5907   while(hr<HOST_REGS)
5908   {
5909     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5910        (i_regs->regmap[hr]&63)!=rs1[i] &&
5911        (i_regs->regmap[hr]&63)!=rs2[i] )
5912     {
5913       alt=hr++;break;
5914     }
5915     hr++;
5916   }
5917   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5918   {
5919     while(hr<HOST_REGS)
5920     {
5921       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5922          (i_regs->regmap[hr]&63)!=rs1[i] &&
5923          (i_regs->regmap[hr]&63)!=rs2[i] )
5924       {
5925         ntaddr=hr;break;
5926       }
5927       hr++;
5928     }
5929   }
5930   assert(hr<HOST_REGS);
5931   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
5932     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
5933   }
5934   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5935   if(opcode[i]==2) // J
5936   {
5937     unconditional=1;
5938   }
5939   if(opcode[i]==3) // JAL
5940   {
5941     // TODO: mini_ht
5942     int rt=get_reg(i_regs->regmap,31);
5943     emit_movimm(start+i*4+8,rt);
5944     unconditional=1;
5945   }
5946   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
5947   {
5948     emit_mov(s1l,addr);
5949     if(opcode2[i]==9) // JALR
5950     {
5951       int rt=get_reg(i_regs->regmap,rt1[i]);
5952       emit_movimm(start+i*4+8,rt);
5953     }
5954   }
5955   if((opcode[i]&0x3f)==4) // BEQ
5956   {
5957     if(rs1[i]==rs2[i])
5958     {
5959       unconditional=1;
5960     }
5961     else
5962     #ifdef HAVE_CMOV_IMM
5963     if(s1h<0) {
5964       if(s2l>=0) emit_cmp(s1l,s2l);
5965       else emit_test(s1l,s1l);
5966       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
5967     }
5968     else
5969     #endif
5970     {
5971       assert(s1l>=0);
5972       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5973       if(s1h>=0) {
5974         if(s2h>=0) emit_cmp(s1h,s2h);
5975         else emit_test(s1h,s1h);
5976         emit_cmovne_reg(alt,addr);
5977       }
5978       if(s2l>=0) emit_cmp(s1l,s2l);
5979       else emit_test(s1l,s1l);
5980       emit_cmovne_reg(alt,addr);
5981     }
5982   }
5983   if((opcode[i]&0x3f)==5) // BNE
5984   {
5985     #ifdef HAVE_CMOV_IMM
5986     if(s1h<0) {
5987       if(s2l>=0) emit_cmp(s1l,s2l);
5988       else emit_test(s1l,s1l);
5989       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
5990     }
5991     else
5992     #endif
5993     {
5994       assert(s1l>=0);
5995       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5996       if(s1h>=0) {
5997         if(s2h>=0) emit_cmp(s1h,s2h);
5998         else emit_test(s1h,s1h);
5999         emit_cmovne_reg(alt,addr);
6000       }
6001       if(s2l>=0) emit_cmp(s1l,s2l);
6002       else emit_test(s1l,s1l);
6003       emit_cmovne_reg(alt,addr);
6004     }
6005   }
6006   if((opcode[i]&0x3f)==0x14) // BEQL
6007   {
6008     if(s1h>=0) {
6009       if(s2h>=0) emit_cmp(s1h,s2h);
6010       else emit_test(s1h,s1h);
6011       nottaken=(int)out;
6012       emit_jne(0);
6013     }
6014     if(s2l>=0) emit_cmp(s1l,s2l);
6015     else emit_test(s1l,s1l);
6016     if(nottaken) set_jump_target(nottaken,(int)out);
6017     nottaken=(int)out;
6018     emit_jne(0);
6019   }
6020   if((opcode[i]&0x3f)==0x15) // BNEL
6021   {
6022     if(s1h>=0) {
6023       if(s2h>=0) emit_cmp(s1h,s2h);
6024       else emit_test(s1h,s1h);
6025       taken=(int)out;
6026       emit_jne(0);
6027     }
6028     if(s2l>=0) emit_cmp(s1l,s2l);
6029     else emit_test(s1l,s1l);
6030     nottaken=(int)out;
6031     emit_jeq(0);
6032     if(taken) set_jump_target(taken,(int)out);
6033   }
6034   if((opcode[i]&0x3f)==6) // BLEZ
6035   {
6036     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6037     emit_cmpimm(s1l,1);
6038     if(s1h>=0) emit_mov(addr,ntaddr);
6039     emit_cmovl_reg(alt,addr);
6040     if(s1h>=0) {
6041       emit_test(s1h,s1h);
6042       emit_cmovne_reg(ntaddr,addr);
6043       emit_cmovs_reg(alt,addr);
6044     }
6045   }
6046   if((opcode[i]&0x3f)==7) // BGTZ
6047   {
6048     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6049     emit_cmpimm(s1l,1);
6050     if(s1h>=0) emit_mov(addr,alt);
6051     emit_cmovl_reg(ntaddr,addr);
6052     if(s1h>=0) {
6053       emit_test(s1h,s1h);
6054       emit_cmovne_reg(alt,addr);
6055       emit_cmovs_reg(ntaddr,addr);
6056     }
6057   }
6058   if((opcode[i]&0x3f)==0x16) // BLEZL
6059   {
6060     assert((opcode[i]&0x3f)!=0x16);
6061   }
6062   if((opcode[i]&0x3f)==0x17) // BGTZL
6063   {
6064     assert((opcode[i]&0x3f)!=0x17);
6065   }
6066   assert(opcode[i]!=1); // BLTZ/BGEZ
6067
6068   //FIXME: Check CSREG
6069   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6070     if((source[i]&0x30000)==0) // BC1F
6071     {
6072       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6073       emit_testimm(s1l,0x800000);
6074       emit_cmovne_reg(alt,addr);
6075     }
6076     if((source[i]&0x30000)==0x10000) // BC1T
6077     {
6078       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6079       emit_testimm(s1l,0x800000);
6080       emit_cmovne_reg(alt,addr);
6081     }
6082     if((source[i]&0x30000)==0x20000) // BC1FL
6083     {
6084       emit_testimm(s1l,0x800000);
6085       nottaken=(int)out;
6086       emit_jne(0);
6087     }
6088     if((source[i]&0x30000)==0x30000) // BC1TL
6089     {
6090       emit_testimm(s1l,0x800000);
6091       nottaken=(int)out;
6092       emit_jeq(0);
6093     }
6094   }
6095
6096   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6097   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6098   if(likely[i]||unconditional)
6099   {
6100     emit_movimm(ba[i],HOST_BTREG);
6101   }
6102   else if(addr!=HOST_BTREG)
6103   {
6104     emit_mov(addr,HOST_BTREG);
6105   }
6106   void *branch_addr=out;
6107   emit_jmp(0);
6108   int target_addr=start+i*4+5;
6109   void *stub=out;
6110   void *compiled_target_addr=check_addr(target_addr);
6111   emit_extjump_ds((int)branch_addr,target_addr);
6112   if(compiled_target_addr) {
6113     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6114     add_link(target_addr,stub);
6115   }
6116   else set_jump_target((int)branch_addr,(int)stub);
6117   if(likely[i]) {
6118     // Not-taken path
6119     set_jump_target((int)nottaken,(int)out);
6120     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6121     void *branch_addr=out;
6122     emit_jmp(0);
6123     int target_addr=start+i*4+8;
6124     void *stub=out;
6125     void *compiled_target_addr=check_addr(target_addr);
6126     emit_extjump_ds((int)branch_addr,target_addr);
6127     if(compiled_target_addr) {
6128       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6129       add_link(target_addr,stub);
6130     }
6131     else set_jump_target((int)branch_addr,(int)stub);
6132   }
6133 }
6134
6135 // Assemble the delay slot for the above
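// The block entry here is the delay slot of a branch in the previous page:
// branch_target (saved in HOST_BTREG) holds where to go next.  If it equals
// start+4 we fall through into the normal block entry, otherwise we exit
// through jump_vaddr with the target address.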
6136 static void pagespan_ds()
6137 {
6138   assem_debug("initial delay slot:\n");
6139   u_int vaddr=start+1;
6140   u_int page=get_page(vaddr);
6141   u_int vpage=get_vpage(vaddr);
6142   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6143   do_dirty_stub_ds();
6144   ll_add(jump_in+page,vaddr,(void *)out);
6145   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6146   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6147     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6148   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6149     emit_writeword(HOST_BTREG,(int)&branch_target);
6150   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6151   address_generation(0,&regs[0],regs[0].regmap_entry);
6152   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6153     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6154   cop1_usable=0;
6155   is_delayslot=0;
6156   switch(itype[0]) {
6157     case ALU:
6158       alu_assemble(0,&regs[0]);break;
6159     case IMM16:
6160       imm16_assemble(0,&regs[0]);break;
6161     case SHIFT:
6162       shift_assemble(0,&regs[0]);break;
6163     case SHIFTIMM:
6164       shiftimm_assemble(0,&regs[0]);break;
6165     case LOAD:
6166       load_assemble(0,&regs[0]);break;
6167     case LOADLR:
6168       loadlr_assemble(0,&regs[0]);break;
6169     case STORE:
6170       store_assemble(0,&regs[0]);break;
6171     case STORELR:
6172       storelr_assemble(0,&regs[0]);break;
6173     case COP0:
6174       cop0_assemble(0,&regs[0]);break;
6175     case COP1:
6176       cop1_assemble(0,&regs[0]);break;
6177     case C1LS:
6178       c1ls_assemble(0,&regs[0]);break;
6179     case COP2:
6180       cop2_assemble(0,&regs[0]);break;
6181     case C2LS:
6182       c2ls_assemble(0,&regs[0]);break;
6183     case C2OP:
6184       c2op_assemble(0,&regs[0]);break;
6185     case FCONV:
6186       fconv_assemble(0,&regs[0]);break;
6187     case FLOAT:
6188       float_assemble(0,&regs[0]);break;
6189     case FCOMP:
6190       fcomp_assemble(0,&regs[0]);break;
6191     case MULTDIV:
6192       multdiv_assemble(0,&regs[0]);break;
6193     case MOV:
6194       mov_assemble(0,&regs[0]);break;
6195     case SYSCALL:
6196     case HLECALL:
6197     case INTCALL:
6198     case SPAN:
6199     case UJUMP:
6200     case RJUMP:
6201     case CJUMP:
6202     case SJUMP:
6203     case FJUMP:
6204       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
6205   }
6206   int btaddr=get_reg(regs[0].regmap,BTREG);
6207   if(btaddr<0) {
6208     btaddr=get_reg(regs[0].regmap,-1);
6209     emit_readword((int)&branch_target,btaddr);
6210   }
6211   assert(btaddr!=HOST_CCREG);
6212   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6213 #ifdef HOST_IMM8
6214   emit_movimm(start+4,HOST_TEMPREG);
6215   emit_cmp(btaddr,HOST_TEMPREG);
6216 #else
6217   emit_cmpimm(btaddr,start+4);
6218 #endif
6219   int branch=(int)out;
6220   emit_jeq(0);
6221   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6222   emit_jmp(jump_vaddr_reg[btaddr]);
6223   set_jump_target(branch,(int)out);
6224   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6225   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6226 }
6227
6228 // Basic liveness analysis for MIPS registers
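// Walks the block backwards computing, per instruction, bitmasks of MIPS
// registers whose values will never be read again: 'u' for the low words,
// 'uu' for the upper halves, 'gte_u' for GTE (COP2) registers.  A write makes
// a register unneeded above it, a read makes it needed; branches merge the
// liveness of the target and fall-through paths, recursing at most three
// levels deep for backward branches.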
6229 void unneeded_registers(int istart,int iend,int r)
6230 {
6231   int i;
6232   uint64_t u,uu,gte_u,b,bu,gte_bu;
6233   uint64_t temp_u,temp_uu,temp_gte_u=0;
6234   uint64_t tdep;
6235   uint64_t gte_u_unknown=0;
6236   if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
6237     gte_u_unknown=~0ll;
6238   if(iend==slen-1) {
6239     u=1;uu=1;
6240     gte_u=gte_u_unknown;
6241   }else{
6242     u=unneeded_reg[iend+1];
6243     uu=unneeded_reg_upper[iend+1];
6244     u=1;uu=1;
6245     gte_u=gte_unneeded[iend+1];
6246   }
6247
6248   for (i=iend;i>=istart;i--)
6249   {
6250     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6251     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6252     {
6253       // If subroutine call, flag return address as a possible branch target
6254       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6255
6256       if(ba[i]<start || ba[i]>=(start+slen*4))
6257       {
6258         // Branch out of this block, flush all regs
6259         u=1;
6260         uu=1;
6261         gte_u=gte_u_unknown;
6262         /* Hexagon hack
6263         if(itype[i]==UJUMP&&rt1[i]==31)
6264         {
6265           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6266         }
6267         if(itype[i]==RJUMP&&rs1[i]==31)
6268         {
6269           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6270         }
6271         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6272           if(itype[i]==UJUMP&&rt1[i]==31)
6273           {
6274             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6275             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6276           }
6277           if(itype[i]==RJUMP&&rs1[i]==31)
6278           {
6279             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6280             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6281           }
6282         }*/
6283         branch_unneeded_reg[i]=u;
6284         branch_unneeded_reg_upper[i]=uu;
6285         // Merge in delay slot
6286         tdep=(~uu>>rt1[i+1])&1;
6287         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6288         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6289         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6290         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6291         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6292         u|=1;uu|=1;
6293         gte_u|=gte_rt[i+1];
6294         gte_u&=~gte_rs[i+1];
6295         // If branch is "likely" (and conditional)
6296         // then we skip the delay slot on the fall-thru path
6297         if(likely[i]) {
6298           if(i<slen-1) {
6299             u&=unneeded_reg[i+2];
6300             uu&=unneeded_reg_upper[i+2];
6301             gte_u&=gte_unneeded[i+2];
6302           }
6303           else
6304           {
6305             u=1;
6306             uu=1;
6307             gte_u=gte_u_unknown;
6308           }
6309         }
6310       }
6311       else
6312       {
6313         // Internal branch, flag target
6314         bt[(ba[i]-start)>>2]=1;
6315         if(ba[i]<=start+i*4) {
6316           // Backward branch
6317           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6318           {
6319             // Unconditional branch
6320             temp_u=1;temp_uu=1;
6321             temp_gte_u=0;
6322           } else {
6323             // Conditional branch (not taken case)
6324             temp_u=unneeded_reg[i+2];
6325             temp_uu=unneeded_reg_upper[i+2];
6326             temp_gte_u&=gte_unneeded[i+2];
6327           }
6328           // Merge in delay slot
6329           tdep=(~temp_uu>>rt1[i+1])&1;
6330           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6331           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6332           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6333           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6334           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6335           temp_u|=1;temp_uu|=1;
6336           temp_gte_u|=gte_rt[i+1];
6337           temp_gte_u&=~gte_rs[i+1];
6338           // If branch is "likely" (and conditional)
6339           // then we skip the delay slot on the fall-thru path
6340           if(likely[i]) {
6341             if(i<slen-1) {
6342               temp_u&=unneeded_reg[i+2];
6343               temp_uu&=unneeded_reg_upper[i+2];
6344               temp_gte_u&=gte_unneeded[i+2];
6345             }
6346             else
6347             {
6348               temp_u=1;
6349               temp_uu=1;
6350               temp_gte_u=gte_u_unknown;
6351             }
6352           }
6353           tdep=(~temp_uu>>rt1[i])&1;
6354           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6355           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6356           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6357           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6358           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6359           temp_u|=1;temp_uu|=1;
6360           temp_gte_u|=gte_rt[i];
6361           temp_gte_u&=~gte_rs[i];
6362           unneeded_reg[i]=temp_u;
6363           unneeded_reg_upper[i]=temp_uu;
6364           gte_unneeded[i]=temp_gte_u;
6365           // Only go three levels deep.  This recursion can take an
6366           // excessive amount of time if there are a lot of nested loops.
6367           if(r<2) {
6368             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6369           }else{
6370             unneeded_reg[(ba[i]-start)>>2]=1;
6371             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6372             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
6373           }
6374         } /*else*/ if(1) {
6375           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6376           {
6377             // Unconditional branch
6378             u=unneeded_reg[(ba[i]-start)>>2];
6379             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6380             gte_u=gte_unneeded[(ba[i]-start)>>2];
6381             branch_unneeded_reg[i]=u;
6382             branch_unneeded_reg_upper[i]=uu;
6383         //u=1;
6384         //uu=1;
6385         //branch_unneeded_reg[i]=u;
6386         //branch_unneeded_reg_upper[i]=uu;
6387             // Merge in delay slot
6388             tdep=(~uu>>rt1[i+1])&1;
6389             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6390             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6391             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6392             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6393             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6394             u|=1;uu|=1;
6395             gte_u|=gte_rt[i+1];
6396             gte_u&=~gte_rs[i+1];
6397           } else {
6398             // Conditional branch
6399             b=unneeded_reg[(ba[i]-start)>>2];
6400             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6401             gte_bu=gte_unneeded[(ba[i]-start)>>2];
6402             branch_unneeded_reg[i]=b;
6403             branch_unneeded_reg_upper[i]=bu;
6404         //b=1;
6405         //bu=1;
6406         //branch_unneeded_reg[i]=b;
6407         //branch_unneeded_reg_upper[i]=bu;
6408             // Branch delay slot
6409             tdep=(~uu>>rt1[i+1])&1;
6410             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6411             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6412             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6413             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6414             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6415             b|=1;bu|=1;
6416             gte_bu|=gte_rt[i+1];
6417             gte_bu&=~gte_rs[i+1];
6418             // If branch is "likely" then we skip the
6419             // delay slot on the fall-thru path
6420             if(likely[i]) {
6421               u=b;
6422               uu=bu;
6423               gte_u=gte_bu;
6424               if(i<slen-1) {
6425                 u&=unneeded_reg[i+2];
6426                 uu&=unneeded_reg_upper[i+2];
6427                 gte_u&=gte_unneeded[i+2];
6428         //u=1;
6429         //uu=1;
6430               }
6431             } else {
6432               u&=b;
6433               uu&=bu;
6434               gte_u&=gte_bu;
6435         //u=1;
6436         //uu=1;
6437             }
6438             if(i<slen-1) {
6439               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6440               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6441         //branch_unneeded_reg[i]=1;
6442         //branch_unneeded_reg_upper[i]=1;
6443             } else {
6444               branch_unneeded_reg[i]=1;
6445               branch_unneeded_reg_upper[i]=1;
6446             }
6447           }
6448         }
6449       }
6450     }
6451     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6452     {
6453       // SYSCALL instruction (software interrupt)
6454       u=1;
6455       uu=1;
6456     }
6457     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6458     {
6459       // ERET instruction (return from interrupt)
6460       u=1;
6461       uu=1;
6462     }
6463     //u=uu=1; // DEBUG
6464     tdep=(~uu>>rt1[i])&1;
6465     // Written registers are unneeded
6466     u|=1LL<<rt1[i];
6467     u|=1LL<<rt2[i];
6468     uu|=1LL<<rt1[i];
6469     uu|=1LL<<rt2[i];
6470     gte_u|=gte_rt[i];
6471     // Accessed registers are needed
6472     u&=~(1LL<<rs1[i]);
6473     u&=~(1LL<<rs2[i]);
6474     uu&=~(1LL<<us1[i]);
6475     uu&=~(1LL<<us2[i]);
6476     gte_u&=~gte_rs[i];
6477     if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
6478       gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
6479     // Source-target dependencies
6480     uu&=~(tdep<<dep1[i]);
6481     uu&=~(tdep<<dep2[i]);
6482     // R0 is always unneeded
6483     u|=1;uu|=1;
6484     // Save it
6485     unneeded_reg[i]=u;
6486     unneeded_reg_upper[i]=uu;
6487     gte_unneeded[i]=gte_u;
6488     /*
6489     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6490     printf("U:");
6491     int r;
6492     for(r=1;r<=CCREG;r++) {
6493       if((unneeded_reg[i]>>r)&1) {
6494         if(r==HIREG) printf(" HI");
6495         else if(r==LOREG) printf(" LO");
6496         else printf(" r%d",r);
6497       }
6498     }
6499     printf(" UU:");
6500     for(r=1;r<=CCREG;r++) {
6501       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6502         if(r==HIREG) printf(" HI");
6503         else if(r==LOREG) printf(" LO");
6504         else printf(" r%d",r);
6505       }
6506     }
6507     printf("\n");*/
6508   }
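  // PSX: the R3000A produces no 64-bit results, so the upper halves are
  // never needed here; force every upper-half register to "unneeded".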
6509   for (i=iend;i>=istart;i--)
6510   {
6511     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
6512   }
6513 }
6514
6515 // Write back dirty registers as soon as we will no longer modify them,
6516 // so that we don't end up with lots of writes at the branches.
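// Backward pass over the block: for each instruction it tracks which host
// registers are certain to be dirtied again before the block exits
// (will_dirty) and which are not (wont_dirty).  When 'wr' is set, the
// per-instruction dirty bits are adjusted so registers in the second group
// can be written back early rather than in bulk at the branch.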
6517 void clean_registers(int istart,int iend,int wr)
6518 {
6519   int i;
6520   int r;
6521   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6522   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6523   if(iend==slen-1) {
6524     will_dirty_i=will_dirty_next=0;
6525     wont_dirty_i=wont_dirty_next=0;
6526   }else{
6527     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6528     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6529   }
6530   for (i=iend;i>=istart;i--)
6531   {
6532     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6533     {
6534       if(ba[i]<start || ba[i]>=(start+slen*4))
6535       {
6536         // Branch out of this block, flush all regs
6537         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6538         {
6539           // Unconditional branch
6540           will_dirty_i=0;
6541           wont_dirty_i=0;
6542           // Merge in delay slot (will dirty)
6543           for(r=0;r<HOST_REGS;r++) {
6544             if(r!=EXCLUDE_REG) {
6545               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6546               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6547               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6548               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6549               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6550               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6551               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6552               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6553               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6554               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6555               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6556               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6557               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6558               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6559             }
6560           }
6561         }
6562         else
6563         {
6564           // Conditional branch
6565           will_dirty_i=0;
6566           wont_dirty_i=wont_dirty_next;
6567           // Merge in delay slot (will dirty)
6568           for(r=0;r<HOST_REGS;r++) {
6569             if(r!=EXCLUDE_REG) {
6570               if(!likely[i]) {
6571                 // Might not dirty if likely branch is not taken
6572                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6573                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6574                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6575                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6576                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6577                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6578                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6579                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6580                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6581                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6582                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6583                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6584                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6585                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6586               }
6587             }
6588           }
6589         }
6590         // Merge in delay slot (wont dirty)
6591         for(r=0;r<HOST_REGS;r++) {
6592           if(r!=EXCLUDE_REG) {
6593             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6594             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6595             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6596             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6597             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6598             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6599             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6600             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6601             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6602             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6603           }
6604         }
6605         if(wr) {
6606           #ifndef DESTRUCTIVE_WRITEBACK
6607           branch_regs[i].dirty&=wont_dirty_i;
6608           #endif
6609           branch_regs[i].dirty|=will_dirty_i;
6610         }
6611       }
6612       else
6613       {
6614         // Internal branch
6615         if(ba[i]<=start+i*4) {
6616           // Backward branch
6617           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6618           {
6619             // Unconditional branch
6620             temp_will_dirty=0;
6621             temp_wont_dirty=0;
6622             // Merge in delay slot (will dirty)
6623             for(r=0;r<HOST_REGS;r++) {
6624               if(r!=EXCLUDE_REG) {
6625                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6626                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6627                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6628                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6629                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6630                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6631                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6632                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6633                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6634                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6635                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6636                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6637                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6638                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6639               }
6640             }
6641           } else {
6642             // Conditional branch (not taken case)
6643             temp_will_dirty=will_dirty_next;
6644             temp_wont_dirty=wont_dirty_next;
6645             // Merge in delay slot (will dirty)
6646             for(r=0;r<HOST_REGS;r++) {
6647               if(r!=EXCLUDE_REG) {
6648                 if(!likely[i]) {
6649                   // Will not dirty if likely branch is not taken
6650                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6651                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6652                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6653                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6654                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6655                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
6656                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6657                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6658                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6659                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6660                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6661                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6662                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6663                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6664                 }
6665               }
6666             }
6667           }
6668           // Merge in delay slot (won't dirty)
6669           for(r=0;r<HOST_REGS;r++) {
6670             if(r!=EXCLUDE_REG) {
6671               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6672               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6673               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6674               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6675               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6676               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6677               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6678               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6679               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6680               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6681             }
6682           }
6683           // Deal with changed mappings
6684           if(i<iend) {
6685             for(r=0;r<HOST_REGS;r++) {
6686               if(r!=EXCLUDE_REG) {
6687                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
6688                   temp_will_dirty&=~(1<<r);
6689                   temp_wont_dirty&=~(1<<r);
6690                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6691                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6692                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6693                   } else {
6694                     temp_will_dirty|=1<<r;
6695                     temp_wont_dirty|=1<<r;
6696                   }
6697                 }
6698               }
6699             }
6700           }
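          // If writing back, save the state computed above and re-analyze the
          // loop body (wr=0) so the backward target is covered; otherwise use
          // conservative defaults to limit recursion (see below).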
6701           if(wr) {
6702             will_dirty[i]=temp_will_dirty;
6703             wont_dirty[i]=temp_wont_dirty;
6704             clean_registers((ba[i]-start)>>2,i-1,0);
6705           }else{
6706             // Limit recursion.  It can take an excessive amount
6707             // of time if there are a lot of nested loops.
6708             will_dirty[(ba[i]-start)>>2]=0;
6709             wont_dirty[(ba[i]-start)>>2]=-1;
6710           }
6711         }
6712         /*else*/ if(1)
6713         {
6714           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6715           {
6716             // Unconditional branch
6717             will_dirty_i=0;
6718             wont_dirty_i=0;
6719           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6720             for(r=0;r<HOST_REGS;r++) {
6721               if(r!=EXCLUDE_REG) {
6722                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6723                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
6724                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6725                 }
6726                 if(branch_regs[i].regmap[r]>=0) {
6727                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6728                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6729                 }
6730               }
6731             }
6732           //}
6733             // Merge in delay slot
6734             for(r=0;r<HOST_REGS;r++) {
6735               if(r!=EXCLUDE_REG) {
6736                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6737                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6738                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6739                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6740                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6741                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6742                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6743                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6744                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6745                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6746                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6747                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6748                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6749                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6750               }
6751             }
6752           } else {
6753             // Conditional branch
6754             will_dirty_i=will_dirty_next;
6755             wont_dirty_i=wont_dirty_next;
6756           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6757             for(r=0;r<HOST_REGS;r++) {
6758               if(r!=EXCLUDE_REG) {
6759                 signed char target_reg=branch_regs[i].regmap[r];
6760                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6761                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6762                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6763                 }
6764                 else if(target_reg>=0) {
6765                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6766                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6767                 }
6768                 // Treat delay slot as part of branch too
6769                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6770                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6771                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6772                 }
6773                 else
6774                 {
6775                   will_dirty[i+1]&=~(1<<r);
6776                 }*/
6777               }
6778             }
6779           //}
6780             // Merge in delay slot
6781             for(r=0;r<HOST_REGS;r++) {
6782               if(r!=EXCLUDE_REG) {
6783                 if(!likely[i]) {
6784                   // Might not dirty if likely branch is not taken
6785                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6786                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6787                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6788                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6789                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6790                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6791                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6792                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6793                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6794                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6795                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6796                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6797                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6798                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6799                 }
6800               }
6801             }
6802           }
6803           // Merge in delay slot (won't dirty)
6804           for(r=0;r<HOST_REGS;r++) {
6805             if(r!=EXCLUDE_REG) {
6806               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6807               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6808               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6809               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6810               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6811               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6812               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6813               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6814               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6815               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6816             }
6817           }
6818           if(wr) {
6819             #ifndef DESTRUCTIVE_WRITEBACK
6820             branch_regs[i].dirty&=wont_dirty_i;
6821             #endif
6822             branch_regs[i].dirty|=will_dirty_i;
6823           }
6824         }
6825       }
6826     }
6827     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6828     {
6829       // SYSCALL instruction (software interrupt)
6830       will_dirty_i=0;
6831       wont_dirty_i=0;
6832     }
6833     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6834     {
6835       // ERET instruction (return from interrupt)
6836       will_dirty_i=0;
6837       wont_dirty_i=0;
6838     }
6839     will_dirty_next=will_dirty_i;
6840     wont_dirty_next=wont_dirty_i;
6841     for(r=0;r<HOST_REGS;r++) {
6842       if(r!=EXCLUDE_REG) {
6843         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6844         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6845         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6846         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6847         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6848         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6849         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6850         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6851         if(i>istart) {
6852           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP)
6853           {
6854             // Don't store a register immediately after writing it,
6855             // as that may prevent dual-issue.
6856             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
6857             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
6858           }
6859         }
6860       }
6861     }
6862     // Save it
6863     will_dirty[i]=will_dirty_i;
6864     wont_dirty[i]=wont_dirty_i;
6865     // Mark registers that won't be dirtied as not dirty
6866     if(wr) {
6867       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
6868       for(r=0;r<HOST_REGS;r++) {
6869         if((will_dirty_i>>r)&1) {
6870           printf(" r%d",r);
6871         }
6872       }
6873       printf("\n");*/
6874
6875       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
6876         regs[i].dirty|=will_dirty_i;
6877         #ifndef DESTRUCTIVE_WRITEBACK
6878         regs[i].dirty&=wont_dirty_i;
6879         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6880         {
6881           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
6882             for(r=0;r<HOST_REGS;r++) {
6883               if(r!=EXCLUDE_REG) {
6884                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
6885                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
6886                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6887               }
6888             }
6889           }
6890         }
6891         else
6892         {
6893           if(i<iend) {
6894             for(r=0;r<HOST_REGS;r++) {
6895               if(r!=EXCLUDE_REG) {
6896                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
6897                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
6898                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6899               }
6900             }
6901           }
6902         }
6903         #endif
6904       //}
6905     }
6906     // Deal with changed mappings
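    // Translate the dirty sets from this instruction's register mapping back
    // to the entry mapping (regmap_pre) so they can still be propagated
    // correctly when the host register assignment has changed.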
6907     temp_will_dirty=will_dirty_i;
6908     temp_wont_dirty=wont_dirty_i;
6909     for(r=0;r<HOST_REGS;r++) {
6910       if(r!=EXCLUDE_REG) {
6911         int nr;
6912         if(regs[i].regmap[r]==regmap_pre[i][r]) {
6913           if(wr) {
6914             #ifndef DESTRUCTIVE_WRITEBACK
6915             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6916             #endif
6917             regs[i].wasdirty|=will_dirty_i&(1<<r);
6918           }
6919         }
6920         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
6921           // Register moved to a different register
6922           will_dirty_i&=~(1<<r);
6923           wont_dirty_i&=~(1<<r);
6924           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
6925           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
6926           if(wr) {
6927             #ifndef DESTRUCTIVE_WRITEBACK
6928             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6929             #endif
6930             regs[i].wasdirty|=will_dirty_i&(1<<r);
6931           }
6932         }
6933         else {
6934           will_dirty_i&=~(1<<r);
6935           wont_dirty_i&=~(1<<r);
6936           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6937             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6938             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6939           } else {
6940             wont_dirty_i|=1<<r;
6941             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
6942           }
6943         }
6944       }
6945     }
6946   }
6947 }
6948
6949 #ifdef DISASM
6950   /* disassembly */
6951 void disassemble_inst(int i)
6952 {
6953     if (bt[i]) printf("*"); else printf(" ");
6954     switch(itype[i]) {
6955       case UJUMP:
6956         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6957       case CJUMP:
6958         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
6959       case SJUMP:
6960         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
6961       case FJUMP:
6962         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6963       case RJUMP:
6964         if (opcode[i]==0x9&&rt1[i]!=31)
6965           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
6966         else
6967           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
6968         break;
6969       case SPAN:
6970         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
6971       case IMM16:
6972         if(opcode[i]==0xf) //LUI
6973           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
6974         else
6975           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6976         break;
6977       case LOAD:
6978       case LOADLR:
6979         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6980         break;
6981       case STORE:
6982       case STORELR:
6983         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
6984         break;
6985       case ALU:
6986       case SHIFT:
6987         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
6988         break;
6989       case MULTDIV:
6990         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
6991         break;
6992       case SHIFTIMM:
6993         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6994         break;
6995       case MOV:
6996         if((opcode2[i]&0x1d)==0x10)
6997           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
6998         else if((opcode2[i]&0x1d)==0x11)
6999           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7000         else
7001           printf (" %x: %s\n",start+i*4,insn[i]);
7002         break;
7003       case COP0:
7004         if(opcode2[i]==0)
7005           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7006         else if(opcode2[i]==4)
7007           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7008         else printf (" %x: %s\n",start+i*4,insn[i]);
7009         break;
7010       case COP1:
7011         if(opcode2[i]<3)
7012           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7013         else if(opcode2[i]>3)
7014           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7015         else printf (" %x: %s\n",start+i*4,insn[i]);
7016         break;
7017       case COP2:
7018         if(opcode2[i]<3)
7019           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
7020         else if(opcode2[i]>3)
7021           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
7022         else printf (" %x: %s\n",start+i*4,insn[i]);
7023         break;
7024       case C1LS:
7025         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7026         break;
7027       case C2LS:
7028         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7029         break;
7030       case INTCALL:
7031         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
7032         break;
7033       default:
7034         //printf (" %s %8x\n",insn[i],source[i]);
7035         printf (" %x: %s\n",start+i*4,insn[i]);
7036     }
7037 }
7038 #else
7039 static void disassemble_inst(int i) {}
7040 #endif // DISASM
7041
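// 0x74657374 is the ASCII string "test"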
7042 #define DRC_TEST_VAL 0x74657374
7043
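// Emit a tiny stub that returns DRC_TEST_VAL and call it, to check that code
// placed in the translation cache can actually be executed.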
7044 static int new_dynarec_test(void)
7045 {
7046   int (*testfunc)(void) = (void *)out;
7047   void *beginning;
7048   int ret;
7049
7050   beginning = start_block();
7051   emit_movimm(DRC_TEST_VAL,0); // test
7052   emit_jmpreg(14);
7053   literal_pool(0);
7054   end_block(beginning);
7055   SysPrintf("testing if we can run recompiled code..\n");
7056   ret = testfunc();
7057   if (ret == DRC_TEST_VAL)
7058     SysPrintf("test passed.\n");
7059   else
7060     SysPrintf("test failed: %08x\n", ret);
7061   out=(u_char *)BASE_ADDR;
7062   return ret == DRC_TEST_VAL;
7063 }
7064
7065 // clear the state completely, instead of just marking
7066 // things invalid like invalidate_all_pages() does
7067 void new_dynarec_clear_full(void)
7068 {
7069   int n;
7070   out=(u_char *)BASE_ADDR;
7071   memset(invalid_code,1,sizeof(invalid_code));
7072   memset(hash_table,0xff,sizeof(hash_table));
7073   memset(mini_ht,-1,sizeof(mini_ht));
7074   memset(restore_candidate,0,sizeof(restore_candidate));
7075   memset(shadow,0,sizeof(shadow));
7076   copy=shadow;
7077   expirep=16384; // Expiry pointer, +2 blocks
7078   pending_exception=0;
7079   literalcount=0;
7080   stop_after_jal=0;
7081   inv_code_start=inv_code_end=~0;
7082   // clear the block lookup tables
7083   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7084   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7085   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7086 }
7087
7088 void new_dynarec_init(void)
7089 {
7090   SysPrintf("Init new dynarec\n");
7091
7092 #ifdef _3DS
7093   check_rosalina();
7094 #endif
7095
7096   // allocate/prepare a buffer for the translation cache
7097   // see assem_arm.h for some explanation
7098 #if   defined(BASE_ADDR_FIXED)
7099   if (mmap (translation_cache, 1 << TARGET_SIZE_2,
7100         PROT_READ | PROT_WRITE | PROT_EXEC,
7101         MAP_PRIVATE | MAP_ANONYMOUS,
7102         -1, 0) != translation_cache)
7103   {
7104     SysPrintf("mmap() failed: %s\n", strerror(errno));
7105     SysPrintf("disable BASE_ADDR_FIXED and recompile\n");
7106     abort();
7107   }
7108 #elif defined(BASE_ADDR_DYNAMIC)
7109 #ifdef VITA
7110   sceBlock = getVMBlock();//sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
7111   if (sceBlock < 0)
7112     SysPrintf("sceKernelAllocMemBlockForVM failed\n");
7113   int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&translation_cache);
7114   if (ret < 0)
7115     SysPrintf("sceKernelGetMemBlockBase failed\n");
7116     
7117   sceKernelOpenVMDomain();
7118   sceClibPrintf("translation_cache = 0x%08X \n ", translation_cache);
7119 #elif defined(_MSC_VER)
7120   base_addr = VirtualAlloc(NULL, 1<<TARGET_SIZE_2, MEM_COMMIT | MEM_RESERVE,
7121       PAGE_EXECUTE_READWRITE);
7122 #else
7123   translation_cache = mmap (NULL, 1 << TARGET_SIZE_2,
7124       PROT_READ | PROT_WRITE | PROT_EXEC,
7125       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
7126   if (translation_cache == MAP_FAILED) {
7127     SysPrintf("mmap() failed: %s\n", strerror(errno));
7128     abort();
7129   }
7130 #endif
7131 #else
7132 #ifndef NO_WRITE_EXEC
7133   // not all systems allow executing code in the data segment by default
7134   if (mprotect((void *)BASE_ADDR, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
7135     SysPrintf("mprotect() failed: %s\n", strerror(errno));
7136 #endif
7137 #endif
7138
7139   out=(u_char *)BASE_ADDR;
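  // default cycle multiplier (scaled by 100, so 200 = 2.0 cycles per emulated instruction)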
7140   cycle_multiplier=200;
7141   new_dynarec_clear_full();
7142 #ifdef HOST_IMM8
7143   // Copy this into local area so we don't have to put it in every literal pool
7144   invc_ptr=invalid_code;
7145 #endif
7146   arch_init();
7147   new_dynarec_test();
7148 #ifndef RAM_FIXED
7149   ram_offset=(u_int)rdram-0x80000000;
7150 #endif
7151   if (ram_offset!=0)
7152     SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
7153 }
7154
7155 void new_dynarec_cleanup(void)
7156 {
7157   int n;
7158 #if defined(BASE_ADDR_FIXED) || defined(BASE_ADDR_DYNAMIC)
7159 #ifndef VITA
7160 #if defined(_MSC_VER)
7161   VirtualFree(base_addr, 0, MEM_RELEASE);
7162 #else
7163   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0)
7164     SysPrintf("munmap() failed\n");
7165 #endif
7166 #endif
7167 #endif
7168   for(n=0;n<4096;n++)
7169     ll_clear(jump_in+n);
7170   for(n=0;n<4096;n++)
7171     ll_clear(jump_out+n);
7172   for(n=0;n<4096;n++)
7173     ll_clear(jump_dirty+n);
7174 #ifdef ROM_COPY
7175   if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
7176 #endif
7177 }
7178
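// Map a PSX code address to a host pointer into RAM or BIOS and set *limit to
// the end of that region; returns NULL if the address is not compilable.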
7179 static u_int *get_source_start(u_int addr, u_int *limit)
7180 {
7181   if (addr < 0x00200000 ||
7182     (0xa0000000 <= addr && addr < 0xa0200000)) {
7183     // used for BIOS calls mostly?
7184     *limit = (addr&0xa0000000)|0x00200000;
7185     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7186   }
7187   else if (!Config.HLE && (
7188     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
7189     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
7190     // BIOS
7191     *limit = (addr & 0xfff00000) | 0x80000;
7192     return (u_int *)((u_int)psxR + (addr&0x7ffff));
7193   }
7194   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
7195     *limit = (addr & 0x80600000) + 0x00200000;
7196     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7197   }
7198   return NULL;
7199 }
7200
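// Scan forward (up to 0x1000 bytes) for "jr $ra" and return the address just
// past its delay slot, as a rough guess of where the function ends.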
7201 static u_int scan_for_ret(u_int addr)
7202 {
7203   u_int limit = 0;
7204   u_int *mem;
7205
7206   mem = get_source_start(addr, &limit);
7207   if (mem == NULL)
7208     return addr;
7209
7210   if (limit > addr + 0x1000)
7211     limit = addr + 0x1000;
7212   for (; addr < limit; addr += 4, mem++) {
7213     if (*mem == 0x03e00008) // jr $ra
7214       return addr + 8;
7215   }
7216   return addr;
7217 }
7218
7219 struct savestate_block {
7220   uint32_t addr;
7221   uint32_t regflags;
7222 };
7223
7224 static int addr_cmp(const void *p1_, const void *p2_)
7225 {
7226   const struct savestate_block *p1 = p1_, *p2 = p2_;
7227   return p1->addr - p2->addr;
7228 }
7229
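// Save the entry addresses and speculation register flags of the currently
// compiled blocks (taken from the jump_in tables) into a savestate buffer;
// returns the number of bytes written.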
7230 int new_dynarec_save_blocks(void *save, int size)
7231 {
7232   struct savestate_block *blocks = save;
7233   int maxcount = size / sizeof(blocks[0]);
7234   struct savestate_block tmp_blocks[1024];
7235   struct ll_entry *head;
7236   int p, s, d, o, bcnt;
7237   u_int addr;
7238
7239   o = 0;
7240   for (p = 0; p < sizeof(jump_in) / sizeof(jump_in[0]); p++) {
7241     bcnt = 0;
7242     for (head = jump_in[p]; head != NULL; head = head->next) {
7243       tmp_blocks[bcnt].addr = head->vaddr;
7244       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
7245       bcnt++;
7246     }
7247     if (bcnt < 1)
7248       continue;
7249     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
7250
7251     addr = tmp_blocks[0].addr;
7252     for (s = d = 0; s < bcnt; s++) {
7253       if (tmp_blocks[s].addr < addr)
7254         continue;
7255       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
7256         tmp_blocks[d++] = tmp_blocks[s];
7257       addr = scan_for_ret(tmp_blocks[s].addr);
7258     }
7259
7260     if (o + d > maxcount)
7261       d = maxcount - o;
7262     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
7263     o += d;
7264   }
7265
7266   return o * sizeof(blocks[0]);
7267 }
7268
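// After a savestate load, precompile the blocks saved by
// new_dynarec_save_blocks, temporarily faking GPR values so that register
// speculation matches the recorded flags.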
7269 void new_dynarec_load_blocks(const void *save, int size)
7270 {
7271   const struct savestate_block *blocks = save;
7272   int count = size / sizeof(blocks[0]);
7273   u_int regs_save[32];
7274   uint32_t f;
7275   int i, b;
7276
7277   get_addr(psxRegs.pc);
7278
7279   // change GPRs for speculation to at least partially work..
7280   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
7281   for (i = 1; i < 32; i++)
7282     psxRegs.GPR.r[i] = 0x80000000;
7283
7284   for (b = 0; b < count; b++) {
7285     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7286       if (f & 1)
7287         psxRegs.GPR.r[i] = 0x1f800000;
7288     }
7289
7290     get_addr(blocks[b].addr);
7291
7292     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7293       if (f & 1)
7294         psxRegs.GPR.r[i] = 0x80000000;
7295     }
7296   }
7297
7298   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
7299 }
7300
7301 int new_recompile_block(int addr)
7302 {
7303   u_int pagelimit = 0;
7304   u_int state_rflags = 0;
7305   int i;
7306
7307   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7308   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7309   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7310   //if(debug)
7311   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7312   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7313   /*if(Count>=312978186) {
7314     rlist();
7315   }*/
7316   //rlist();
7317
7318   // this is just for speculation
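  // flag GPRs that currently hold 0x1f80xxxx (scratchpad/IO) pointers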
7319   for (i = 1; i < 32; i++) {
7320     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
7321       state_rflags |= 1 << i;
7322   }
7323
7324   start = (u_int)addr&~3;
7325   //assert(((u_int)addr&1)==0);
7326   new_dynarec_did_compile=1;
7327   if (Config.HLE && start == 0x80001000) // hlecall
7328   {
7329     // XXX: is this enough? Maybe check hleSoftCall?
7330     void *beginning=start_block();
7331     u_int page=get_page(start);
7332
7333     invalid_code[start>>12]=0;
7334     emit_movimm(start,0);
7335     emit_writeword(0,(int)&pcaddr);
7336     emit_jmp((int)new_dyna_leave);
7337     literal_pool(0);
7338     end_block(beginning);
7339     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
7340     return 0;
7341   }
7342
7343   source = get_source_start(start, &pagelimit);
7344   if (source == NULL) {
7345     SysPrintf("Compile at bogus memory address: %08x\n", addr);
7346     exit(1);
7347   }
7348
7349   /* Pass 1: disassemble */
7350   /* Pass 2: register dependencies, branch targets */
7351   /* Pass 3: register allocation */
7352   /* Pass 4: branch dependencies */
7353   /* Pass 5: pre-alloc */
7354   /* Pass 6: optimize clean/dirty state */
7355   /* Pass 7: flag 32-bit registers */
7356   /* Pass 8: assembly */
7357   /* Pass 9: linker */
7358   /* Pass 10: garbage collection / free memory */
7359
7360   int j;
7361   int done=0;
7362   unsigned int type,op,op2;
7363
7364   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7365
7366   /* Pass 1 disassembly */
7367
7368   for(i=0;!done;i++) {
7369     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
7370     minimum_free_regs[i]=0;
7371     opcode[i]=op=source[i]>>26;
7372     switch(op)
7373     {
7374       case 0x00: strcpy(insn[i],"special"); type=NI;
7375         op2=source[i]&0x3f;
7376         switch(op2)
7377         {
7378           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7379           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7380           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7381           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7382           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7383           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7384           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7385           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7386           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7387           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7388           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7389           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7390           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7391           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7392           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7393           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7394           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7395           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7396           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7397           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7398           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7399           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7400           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7401           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7402           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7403           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7404           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7405           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7406           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7407           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7408           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7409           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7410           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7411           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7412           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7413 #if 0
7414           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7415           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7416           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7417           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7418           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7419           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7420           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7421           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7422           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7423           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7424           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7425           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7426           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7427           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7428           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7429           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7430           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7431 #endif
7432         }
7433         break;
7434       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7435         op2=(source[i]>>16)&0x1f;
7436         switch(op2)
7437         {
7438           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7439           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7440           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7441           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7442           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7443           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7444           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7445           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7446           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7447           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7448           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7449           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7450           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7451           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7452         }
7453         break;
7454       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7455       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7456       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7457       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7458       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7459       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7460       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7461       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7462       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7463       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7464       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7465       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7466       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7467       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7468       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7469         op2=(source[i]>>21)&0x1f;
7470         switch(op2)
7471         {
7472           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7473           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7474           case 0x10: strcpy(insn[i],"tlb"); type=NI;
7475           switch(source[i]&0x3f)
7476           {
7477             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
7478             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
7479             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
7480             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
7481             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
7482             //case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
7483           }
7484         }
7485         break;
7486       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7487         op2=(source[i]>>21)&0x1f;
7488         switch(op2)
7489         {
7490           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7491           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7492           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7493           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7494           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7495           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7496           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7497           switch((source[i]>>16)&0x3)
7498           {
7499             case 0x00: strcpy(insn[i],"BC1F"); break;
7500             case 0x01: strcpy(insn[i],"BC1T"); break;
7501             case 0x02: strcpy(insn[i],"BC1FL"); break;
7502             case 0x03: strcpy(insn[i],"BC1TL"); break;
7503           }
7504           break;
7505           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7506           switch(source[i]&0x3f)
7507           {
7508             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7509             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7510             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7511             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7512             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7513             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7514             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7515             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7516             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7517             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7518             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7519             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7520             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7521             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7522             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7523             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7524             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7525             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7526             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7527             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7528             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7529             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7530             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7531             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7532             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7533             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7534             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7535             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7536             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7537             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7538             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7539             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7540             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7541             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7542             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7543           }
7544           break;
7545           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7546           switch(source[i]&0x3f)
7547           {
7548             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7549             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7550             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7551             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7552             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7553             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7554             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7555             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7556             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7557             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7558             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7559             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7560             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7561             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7562             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7563             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7564             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7565             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7566             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7567             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7568             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7569             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7570             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7571             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7572             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7573             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7574             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7575             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7576             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7577             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7578             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7579             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7580             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7581             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7582             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7583           }
7584           break;
7585           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7586           switch(source[i]&0x3f)
7587           {
7588             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7589             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7590           }
7591           break;
7592           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7593           switch(source[i]&0x3f)
7594           {
7595             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7596             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7597           }
7598           break;
7599         }
7600         break;
7601 #if 0
7602       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7603       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7604       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7605       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7606       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7607       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7608       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7609       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7610 #endif
7611       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7612       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7613       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7614       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7615       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7616       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7617       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7618 #if 0
7619       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7620 #endif
7621       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7622       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7623       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7624       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7625 #if 0
7626       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7627       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7628 #endif
7629       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7630       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7631       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7632       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7633 #if 0
7634       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7635       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7636       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7637 #endif
7638       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7639       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7640 #if 0
7641       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7642       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7643       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7644 #endif
7645       case 0x12: strcpy(insn[i],"COP2"); type=NI;
7646         op2=(source[i]>>21)&0x1f;
7647         //if (op2 & 0x10) {
7648         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
7649           if (gte_handlers[source[i]&0x3f]!=NULL) {
7650             if (gte_regnames[source[i]&0x3f]!=NULL)
7651               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
7652             else
7653               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
7654             type=C2OP;
7655           }
7656         }
7657         else switch(op2)
7658         {
7659           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
7660           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
7661           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
7662           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
7663         }
7664         break;
7665       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
7666       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
7667       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7668       default: strcpy(insn[i],"???"); type=NI;
7669         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
7670         break;
7671     }
7672     itype[i]=type;
7673     opcode2[i]=op2;
7674     /* Get registers/immediates */
7675     lt1[i]=0;
7676     us1[i]=0;
7677     us2[i]=0;
7678     dep1[i]=0;
7679     dep2[i]=0;
7680     gte_rs[i]=gte_rt[i]=0;
7681     switch(type) {
7682       case LOAD:
7683         rs1[i]=(source[i]>>21)&0x1f;
7684         rs2[i]=0;
7685         rt1[i]=(source[i]>>16)&0x1f;
7686         rt2[i]=0;
7687         imm[i]=(short)source[i];
7688         break;
7689       case STORE:
7690       case STORELR:
7691         rs1[i]=(source[i]>>21)&0x1f;
7692         rs2[i]=(source[i]>>16)&0x1f;
7693         rt1[i]=0;
7694         rt2[i]=0;
7695         imm[i]=(short)source[i];
7696         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7697         break;
7698       case LOADLR:
7699         // LWL/LWR only load part of the register,
7700         // therefore the target register must be treated as a source too
7701         rs1[i]=(source[i]>>21)&0x1f;
7702         rs2[i]=(source[i]>>16)&0x1f;
7703         rt1[i]=(source[i]>>16)&0x1f;
7704         rt2[i]=0;
7705         imm[i]=(short)source[i];
7706         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7707         if(op==0x26) dep1[i]=rt1[i]; // LWR
7708         break;
7709       case IMM16:
7710         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7711         else rs1[i]=(source[i]>>21)&0x1f;
7712         rs2[i]=0;
7713         rt1[i]=(source[i]>>16)&0x1f;
7714         rt2[i]=0;
7715         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7716           imm[i]=(unsigned short)source[i];
7717         }else{
7718           imm[i]=(short)source[i];
7719         }
7720         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7721         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7722         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7723         break;
7724       case UJUMP:
7725         rs1[i]=0;
7726         rs2[i]=0;
7727         rt1[i]=0;
7728         rt2[i]=0;
7729         // The JAL instruction writes to r31.
7730         if (op&1) {
7731           rt1[i]=31;
7732         }
7733         rs2[i]=CCREG;
7734         break;
7735       case RJUMP:
7736         rs1[i]=(source[i]>>21)&0x1f;
7737         rs2[i]=0;
7738         rt1[i]=0;
7739         rt2[i]=0;
7740         // The JALR instruction writes to rd.
7741         if (op2&1) {
7742           rt1[i]=(source[i]>>11)&0x1f;
7743         }
7744         rs2[i]=CCREG;
7745         break;
7746       case CJUMP:
7747         rs1[i]=(source[i]>>21)&0x1f;
7748         rs2[i]=(source[i]>>16)&0x1f;
7749         rt1[i]=0;
7750         rt2[i]=0;
7751         if(op&2) { // BGTZ/BLEZ
7752           rs2[i]=0;
7753         }
7754         us1[i]=rs1[i];
7755         us2[i]=rs2[i];
7756         likely[i]=op>>4;
7757         break;
7758       case SJUMP:
7759         rs1[i]=(source[i]>>21)&0x1f;
7760         rs2[i]=CCREG;
7761         rt1[i]=0;
7762         rt2[i]=0;
7763         us1[i]=rs1[i];
7764         if(op2&0x10) { // BxxAL
7765           rt1[i]=31;
7766           // NOTE: If the branch is not taken, r31 is still overwritten
7767         }
7768         likely[i]=(op2&2)>>1;
7769         break;
7770       case FJUMP:
7771         rs1[i]=FSREG;
7772         rs2[i]=CSREG;
7773         rt1[i]=0;
7774         rt2[i]=0;
7775         likely[i]=((source[i])>>17)&1;
7776         break;
7777       case ALU:
7778         rs1[i]=(source[i]>>21)&0x1f; // source
7779         rs2[i]=(source[i]>>16)&0x1f; // second source operand
7780         rt1[i]=(source[i]>>11)&0x1f; // destination
7781         rt2[i]=0;
7782         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7783           us1[i]=rs1[i];us2[i]=rs2[i];
7784         }
7785         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7786           dep1[i]=rs1[i];dep2[i]=rs2[i];
7787         }
7788         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7789           dep1[i]=rs1[i];dep2[i]=rs2[i];
7790         }
7791         break;
7792       case MULTDIV:
7793         rs1[i]=(source[i]>>21)&0x1f; // source
7794         rs2[i]=(source[i]>>16)&0x1f; // multiplier/divisor
7795         rt1[i]=HIREG;
7796         rt2[i]=LOREG;
7797         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7798           us1[i]=rs1[i];us2[i]=rs2[i];
7799         }
7800         break;
7801       case MOV:
7802         rs1[i]=0;
7803         rs2[i]=0;
7804         rt1[i]=0;
7805         rt2[i]=0;
7806         if(op2==0x10) rs1[i]=HIREG; // MFHI
7807         if(op2==0x11) rt1[i]=HIREG; // MTHI
7808         if(op2==0x12) rs1[i]=LOREG; // MFLO
7809         if(op2==0x13) rt1[i]=LOREG; // MTLO
7810         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7811         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7812         dep1[i]=rs1[i];
7813         break;
7814       case SHIFT:
7815         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7816         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7817         rt1[i]=(source[i]>>11)&0x1f; // destination
7818         rt2[i]=0;
7819         // DSLLV/DSRLV/DSRAV are 64-bit
7820         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
7821         break;
7822       case SHIFTIMM:
7823         rs1[i]=(source[i]>>16)&0x1f;
7824         rs2[i]=0;
7825         rt1[i]=(source[i]>>11)&0x1f;
7826         rt2[i]=0;
7827         imm[i]=(source[i]>>6)&0x1f;
7828         // DSxx32 instructions
7829         if(op2>=0x3c) imm[i]|=0x20;
7830         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
7831         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
7832         break;
7833       case COP0:
7834         rs1[i]=0;
7835         rs2[i]=0;
7836         rt1[i]=0;
7837         rt2[i]=0;
7838         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
7839         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
7840         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7841         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7842         break;
7843       case COP1:
7844         rs1[i]=0;
7845         rs2[i]=0;
7846         rt1[i]=0;
7847         rt2[i]=0;
7848         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7849         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7850         if(op2==5) us1[i]=rs1[i]; // DMTC1
7851         rs2[i]=CSREG;
7852         break;
7853       case COP2:
7854         rs1[i]=0;
7855         rs2[i]=0;
7856         rt1[i]=0;
7857         rt2[i]=0;
7858         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
7859         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
7860         rs2[i]=CSREG;
7861         int gr=(source[i]>>11)&0x1F;
7862         switch(op2)
7863         {
7864           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
7865           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
7866           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
7867           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
7868         }
7869         break;
7870       case C1LS:
7871         rs1[i]=(source[i]>>21)&0x1F;
7872         rs2[i]=CSREG;
7873         rt1[i]=0;
7874         rt2[i]=0;
7875         imm[i]=(short)source[i];
7876         break;
7877       case C2LS:
7878         rs1[i]=(source[i]>>21)&0x1F;
7879         rs2[i]=0;
7880         rt1[i]=0;
7881         rt2[i]=0;
7882         imm[i]=(short)source[i];
7883         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7884         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7885         break;
7886       case C2OP:
7887         rs1[i]=0;
7888         rs2[i]=0;
7889         rt1[i]=0;
7890         rt2[i]=0;
7891         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
7892         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
7893         gte_rt[i]|=1ll<<63; // every op changes flags
7894         if((source[i]&0x3f)==GTE_MVMVA) {
7895           int v = (source[i] >> 15) & 3;
7896           gte_rs[i]&=~0xe3fll;
7897           if(v==3) gte_rs[i]|=0xe00ll;
7898           else gte_rs[i]|=3ll<<(v*2);
7899         }
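             // MVMVA only reads the source vector selected by bits 15-16 of
             // the op.  The 0xe3f mask above covers the candidates (data regs
             // 0-5 for V0/V1/V2 and 9-11 for IR1-IR3); v==3 keeps IR1-IR3
             // (0xe00), otherwise only data regs 2*v and 2*v+1 are kept.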
7900         break;
7901       case FLOAT:
7902       case FCONV:
7903         rs1[i]=0;
7904         rs2[i]=CSREG;
7905         rt1[i]=0;
7906         rt2[i]=0;
7907         break;
7908       case FCOMP:
7909         rs1[i]=FSREG;
7910         rs2[i]=CSREG;
7911         rt1[i]=FSREG;
7912         rt2[i]=0;
7913         break;
7914       case SYSCALL:
7915       case HLECALL:
7916       case INTCALL:
7917         rs1[i]=CCREG;
7918         rs2[i]=0;
7919         rt1[i]=0;
7920         rt2[i]=0;
7921         break;
7922       default:
7923         rs1[i]=0;
7924         rs2[i]=0;
7925         rt1[i]=0;
7926         rt2[i]=0;
7927     }
7928     /* Calculate branch target addresses */
7929     if(type==UJUMP)
7930       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
7931     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
7932       ba[i]=start+i*4+8; // Ignore never taken branch
7933     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
7934       ba[i]=start+i*4+8; // Ignore never taken branch
7935     else if(type==CJUMP||type==SJUMP||type==FJUMP)
7936       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
7937     else ba[i]=-1;
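         // Illustrative example: a J with instr_index 0x012345 fetched at
         // 0x80010008 targets ((0x8001000c)&0xF0000000)|(0x012345<<2), i.e.
         // 0x80048d14; conditional branches add a sign-extended 16-bit
         // offset*4 to the delay slot address start+i*4+4.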
7938     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
7939       int do_in_intrp=0;
7940       // branch in delay slot?
7941       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
7942         // branches in delay slots aren't supported: don't compile the first branch, call the interpreter if it's hit
7943         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
7944         do_in_intrp=1;
7945       }
7946       // basic load delay detection
7947       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
7948         int t=(ba[i-1]-start)/4;
7949         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
7950           // jump target wants DS result - potential load delay effect
7951           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
7952           do_in_intrp=1;
7953           bt[t+1]=1; // expected return from interpreter
7954         }
7955         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
7956               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
7957           // v0 overwrite like this is a sign of trouble, bail out
7958           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
7959           do_in_intrp=1;
7960         }
7961       }
7962       if(do_in_intrp) {
7963         rs1[i-1]=CCREG;
7964         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
7965         ba[i-1]=-1;
7966         itype[i-1]=INTCALL;
7967         done=2;
7968         i--; // don't compile the DS
7969       }
7970     }
7971     /* Is this the end of the block? */
7972     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
7973       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
7974         done=2;
7975       }
7976       else {
7977         if(stop_after_jal) done=1;
7978         // Stop on BREAK
7979         if((source[i+1]&0xfc00003f)==0x0d) done=1;
7980       }
7981       // Don't recompile stuff that's already compiled
7982       if(check_addr(start+i*4+4)) done=1;
7983       // Don't get too close to the limit
7984       if(i>MAXBLOCK/2) done=1;
7985     }
7986     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
7987     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
7988     if(done==2) {
7989       // Does the block continue due to a branch?
7990       for(j=i-1;j>=0;j--)
7991       {
7992         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
7993         if(ba[j]==start+i*4+4) done=j=0;
7994         if(ba[j]==start+i*4+8) done=j=0;
7995       }
7996     }
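         // done==1 stops the block unconditionally; done==2 stops it only if
         // no earlier branch jumps to this point or just past it, which the
         // loop above has just checked.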
7997     //assert(i<MAXBLOCK-1);
7998     if(start+i*4==pagelimit-4) done=1;
7999     assert(start+i*4<pagelimit);
8000     if (i==MAXBLOCK-1) done=1;
8001     // Stop if we're compiling junk
8002     if(itype[i]==NI&&opcode[i]==0x11) {
8003       done=stop_after_jal=1;
8004       SysPrintf("Disabled speculative precompilation\n");
8005     }
8006   }
8007   slen=i;
8008   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8009     if(start+i*4==pagelimit) {
8010       itype[i-1]=SPAN;
8011     }
8012   }
8013   assert(slen>0);
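       // End of pass 1: slen instructions were decoded; itype[], rs*/rt* and
       // imm[] describe each one, and ba[] holds branch target addresses
       // (-1 for non-branches).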
8014
8015   /* Pass 2 - Register dependencies and branch targets */
8016
8017   unneeded_registers(0,slen-1,0);
8018
8019   /* Pass 3 - Register allocation */
8020
8021   struct regstat current; // Current register allocations/status
8022   current.is32=1;
8023   current.dirty=0;
8024   current.u=unneeded_reg[0];
8025   current.uu=unneeded_reg_upper[0];
8026   clear_all_regs(current.regmap);
8027   alloc_reg(&current,0,CCREG);
8028   dirty_reg(&current,CCREG);
8029   current.isconst=0;
8030   current.wasconst=0;
8031   current.waswritten=0;
8032   int ds=0;
8033   int cc=0;
8034   int hr=-1;
8035
8036   if((u_int)addr&1) {
8037     // First instruction is delay slot
8038     cc=-1;
8039     bt[1]=1;
8040     ds=1;
8041     unneeded_reg[0]=1;
8042     unneeded_reg_upper[0]=1;
8043     current.regmap[HOST_BTREG]=BTREG;
8044   }
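       // (An entry address with its low bit set marks a block that begins in
       //  a branch delay slot; BTREG is expected to carry the pending branch
       //  target across such an entry.)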
8045
8046   for(i=0;i<slen;i++)
8047   {
8048     if(bt[i])
8049     {
8050       int hr;
8051       for(hr=0;hr<HOST_REGS;hr++)
8052       {
8053         // Is this really necessary?
8054         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8055       }
8056       current.isconst=0;
8057       current.waswritten=0;
8058     }
8059     if(i>1)
8060     {
8061       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8062       {
8063         if(rs1[i-2]==0||rs2[i-2]==0)
8064         {
8065           if(rs1[i-2]) {
8066             current.is32|=1LL<<rs1[i-2];
8067             int hr=get_reg(current.regmap,rs1[i-2]|64);
8068             if(hr>=0) current.regmap[hr]=-1;
8069           }
8070           if(rs2[i-2]) {
8071             current.is32|=1LL<<rs2[i-2];
8072             int hr=get_reg(current.regmap,rs2[i-2]|64);
8073             if(hr>=0) current.regmap[hr]=-1;
8074           }
8075         }
8076       }
8077     }
8078     current.is32=-1LL;
8079
8080     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8081     regs[i].wasconst=current.isconst;
8082     regs[i].was32=current.is32;
8083     regs[i].wasdirty=current.dirty;
8084     regs[i].loadedconst=0;
8085     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8086       if(i+1<slen) {
8087         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8088         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8089         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8090         current.u|=1;
8091         current.uu|=1;
8092       } else {
8093         current.u=1;
8094         current.uu=1;
8095       }
8096     } else {
8097       if(i+1<slen) {
8098         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8099         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8100         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8101         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8102         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8103         current.u|=1;
8104         current.uu|=1;
8105       } else { SysPrintf("oops, branch at end of block with no delay slot\n");exit(1); }
8106     }
8107     is_ds[i]=ds;
8108     if(ds) {
8109       ds=0; // Skip delay slot, already allocated as part of branch
8110       // ...but we need to alloc it in case something jumps here
8111       if(i+1<slen) {
8112         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8113         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8114       }else{
8115         current.u=branch_unneeded_reg[i-1];
8116         current.uu=branch_unneeded_reg_upper[i-1];
8117       }
8118       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8119       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8120       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8121       current.u|=1;
8122       current.uu|=1;
8123       struct regstat temp;
8124       memcpy(&temp,&current,sizeof(current));
8125       temp.wasdirty=temp.dirty;
8126       temp.was32=temp.is32;
8127       // TODO: Take into account unconditional branches, as below
8128       delayslot_alloc(&temp,i);
8129       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8130       regs[i].wasdirty=temp.wasdirty;
8131       regs[i].was32=temp.was32;
8132       regs[i].dirty=temp.dirty;
8133       regs[i].is32=temp.is32;
8134       regs[i].isconst=0;
8135       regs[i].wasconst=0;
8136       current.isconst=0;
8137       // Create entry (branch target) regmap
8138       for(hr=0;hr<HOST_REGS;hr++)
8139       {
8140         int r=temp.regmap[hr];
8141         if(r>=0) {
8142           if(r!=regmap_pre[i][hr]) {
8143             regs[i].regmap_entry[hr]=-1;
8144           }
8145           else
8146           {
8147             if(r<64){
8148               if((current.u>>r)&1) {
8149                 regs[i].regmap_entry[hr]=-1;
8150                 regs[i].regmap[hr]=-1;
8151                 //Don't clear regs in the delay slot as the branch might need them
8152                 //current.regmap[hr]=-1;
8153               }else
8154                 regs[i].regmap_entry[hr]=r;
8155             }
8156             else {
8157               if((current.uu>>(r&63))&1) {
8158                 regs[i].regmap_entry[hr]=-1;
8159                 regs[i].regmap[hr]=-1;
8160                 //Don't clear regs in the delay slot as the branch might need them
8161                 //current.regmap[hr]=-1;
8162               }else
8163                 regs[i].regmap_entry[hr]=r;
8164             }
8165           }
8166         } else {
8167           // First instruction expects CCREG to be allocated
8168           if(i==0&&hr==HOST_CCREG)
8169             regs[i].regmap_entry[hr]=CCREG;
8170           else
8171             regs[i].regmap_entry[hr]=-1;
8172         }
8173       }
8174     }
8175     else { // Not delay slot
8176       switch(itype[i]) {
8177         case UJUMP:
8178           //current.isconst=0; // DEBUG
8179           //current.wasconst=0; // DEBUG
8180           //regs[i].wasconst=0; // DEBUG
8181           clear_const(&current,rt1[i]);
8182           alloc_cc(&current,i);
8183           dirty_reg(&current,CCREG);
8184           if (rt1[i]==31) {
8185             alloc_reg(&current,i,31);
8186             dirty_reg(&current,31);
8187             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8188             //assert(rt1[i+1]!=rt1[i]);
8189             #ifdef REG_PREFETCH
8190             alloc_reg(&current,i,PTEMP);
8191             #endif
8192             //current.is32|=1LL<<rt1[i];
8193           }
8194           ooo[i]=1;
8195           delayslot_alloc(&current,i+1);
8196           //current.isconst=0; // DEBUG
8197           ds=1;
8198           //printf("i=%d, isconst=%x\n",i,current.isconst);
8199           break;
8200         case RJUMP:
8201           //current.isconst=0;
8202           //current.wasconst=0;
8203           //regs[i].wasconst=0;
8204           clear_const(&current,rs1[i]);
8205           clear_const(&current,rt1[i]);
8206           alloc_cc(&current,i);
8207           dirty_reg(&current,CCREG);
8208           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8209             alloc_reg(&current,i,rs1[i]);
8210             if (rt1[i]!=0) {
8211               alloc_reg(&current,i,rt1[i]);
8212               dirty_reg(&current,rt1[i]);
8213               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
8214               assert(rt1[i+1]!=rt1[i]);
8215               #ifdef REG_PREFETCH
8216               alloc_reg(&current,i,PTEMP);
8217               #endif
8218             }
8219             #ifdef USE_MINI_HT
8220             if(rs1[i]==31) { // JALR
8221               alloc_reg(&current,i,RHASH);
8222               #ifndef HOST_IMM_ADDR32
8223               alloc_reg(&current,i,RHTBL);
8224               #endif
8225             }
8226             #endif
8227             delayslot_alloc(&current,i+1);
8228           } else {
8229             // The delay slot overwrites our source register,
8230             // allocate a temporary register to hold the old value.
8231             current.isconst=0;
8232             current.wasconst=0;
8233             regs[i].wasconst=0;
8234             delayslot_alloc(&current,i+1);
8235             current.isconst=0;
8236             alloc_reg(&current,i,RTEMP);
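                 // (RTEMP preserves the jump target read from rs1 before the
                 // delay slot clobbers it.)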
8237           }
8238           //current.isconst=0; // DEBUG
8239           ooo[i]=1;
8240           ds=1;
8241           break;
8242         case CJUMP:
8243           //current.isconst=0;
8244           //current.wasconst=0;
8245           //regs[i].wasconst=0;
8246           clear_const(&current,rs1[i]);
8247           clear_const(&current,rs2[i]);
8248           if((opcode[i]&0x3E)==4) // BEQ/BNE
8249           {
8250             alloc_cc(&current,i);
8251             dirty_reg(&current,CCREG);
8252             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8253             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8254             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8255             {
8256               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8257               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8258             }
8259             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8260                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8261               // The delay slot overwrites one of our conditions.
8262               // Allocate the branch condition registers instead.
8263               current.isconst=0;
8264               current.wasconst=0;
8265               regs[i].wasconst=0;
8266               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8267               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8268               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8269               {
8270                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8271                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8272               }
8273             }
8274             else
8275             {
8276               ooo[i]=1;
8277               delayslot_alloc(&current,i+1);
8278             }
8279           }
8280           else
8281           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8282           {
8283             alloc_cc(&current,i);
8284             dirty_reg(&current,CCREG);
8285             alloc_reg(&current,i,rs1[i]);
8286             if(!(current.is32>>rs1[i]&1))
8287             {
8288               alloc_reg64(&current,i,rs1[i]);
8289             }
8290             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8291               // The delay slot overwrites one of our conditions.
8292               // Allocate the branch condition registers instead.
8293               current.isconst=0;
8294               current.wasconst=0;
8295               regs[i].wasconst=0;
8296               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8297               if(!((current.is32>>rs1[i])&1))
8298               {
8299                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8300               }
8301             }
8302             else
8303             {
8304               ooo[i]=1;
8305               delayslot_alloc(&current,i+1);
8306             }
8307           }
8308           else
8309           // Don't alloc the delay slot yet because we might not execute it
8310           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8311           {
8312             current.isconst=0;
8313             current.wasconst=0;
8314             regs[i].wasconst=0;
8315             alloc_cc(&current,i);
8316             dirty_reg(&current,CCREG);
8317             alloc_reg(&current,i,rs1[i]);
8318             alloc_reg(&current,i,rs2[i]);
8319             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8320             {
8321               alloc_reg64(&current,i,rs1[i]);
8322               alloc_reg64(&current,i,rs2[i]);
8323             }
8324           }
8325           else
8326           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8327           {
8328             current.isconst=0;
8329             current.wasconst=0;
8330             regs[i].wasconst=0;
8331             alloc_cc(&current,i);
8332             dirty_reg(&current,CCREG);
8333             alloc_reg(&current,i,rs1[i]);
8334             if(!(current.is32>>rs1[i]&1))
8335             {
8336               alloc_reg64(&current,i,rs1[i]);
8337             }
8338           }
8339           ds=1;
8340           //current.isconst=0;
8341           break;
8342         case SJUMP:
8343           //current.isconst=0;
8344           //current.wasconst=0;
8345           //regs[i].wasconst=0;
8346           clear_const(&current,rs1[i]);
8347           clear_const(&current,rt1[i]);
8348           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8349           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8350           {
8351             alloc_cc(&current,i);
8352             dirty_reg(&current,CCREG);
8353             alloc_reg(&current,i,rs1[i]);
8354             if(!(current.is32>>rs1[i]&1))
8355             {
8356               alloc_reg64(&current,i,rs1[i]);
8357             }
8358             if (rt1[i]==31) { // BLTZAL/BGEZAL
8359               alloc_reg(&current,i,31);
8360               dirty_reg(&current,31);
8361               //#ifdef REG_PREFETCH
8362               //alloc_reg(&current,i,PTEMP);
8363               //#endif
8364               //current.is32|=1LL<<rt1[i];
8365             }
8366             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
8367                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
8368               // Allocate the branch condition registers instead.
8369               current.isconst=0;
8370               current.wasconst=0;
8371               regs[i].wasconst=0;
8372               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8373               if(!((current.is32>>rs1[i])&1))
8374               {
8375                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8376               }
8377             }
8378             else
8379             {
8380               ooo[i]=1;
8381               delayslot_alloc(&current,i+1);
8382             }
8383           }
8384           else
8385           // Don't alloc the delay slot yet because we might not execute it
8386           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
8387           {
8388             current.isconst=0;
8389             current.wasconst=0;
8390             regs[i].wasconst=0;
8391             alloc_cc(&current,i);
8392             dirty_reg(&current,CCREG);
8393             alloc_reg(&current,i,rs1[i]);
8394             if(!(current.is32>>rs1[i]&1))
8395             {
8396               alloc_reg64(&current,i,rs1[i]);
8397             }
8398           }
8399           ds=1;
8400           //current.isconst=0;
8401           break;
8402         case FJUMP:
8403           current.isconst=0;
8404           current.wasconst=0;
8405           regs[i].wasconst=0;
8406           if(likely[i]==0) // BC1F/BC1T
8407           {
8408             // TODO: Theoretically we can run out of registers here on x86.
8409             // The delay slot can allocate up to six, and we need to check
8410             // CSREG before executing the delay slot.  Possibly we can drop
8411             // the cycle count and then reload it after checking that the
8412             // FPU is in a usable state, or don't do out-of-order execution.
8413             alloc_cc(&current,i);
8414             dirty_reg(&current,CCREG);
8415             alloc_reg(&current,i,FSREG);
8416             alloc_reg(&current,i,CSREG);
8417             if(itype[i+1]==FCOMP) {
8418               // The delay slot overwrites the branch condition.
8419               // Allocate the branch condition registers instead.
8420               alloc_cc(&current,i);
8421               dirty_reg(&current,CCREG);
8422               alloc_reg(&current,i,CSREG);
8423               alloc_reg(&current,i,FSREG);
8424             }
8425             else {
8426               ooo[i]=1;
8427               delayslot_alloc(&current,i+1);
8428               alloc_reg(&current,i+1,CSREG);
8429             }
8430           }
8431           else
8432           // Don't alloc the delay slot yet because we might not execute it
8433           if(likely[i]) // BC1FL/BC1TL
8434           {
8435             alloc_cc(&current,i);
8436             dirty_reg(&current,CCREG);
8437             alloc_reg(&current,i,CSREG);
8438             alloc_reg(&current,i,FSREG);
8439           }
8440           ds=1;
8441           current.isconst=0;
8442           break;
8443         case IMM16:
8444           imm16_alloc(&current,i);
8445           break;
8446         case LOAD:
8447         case LOADLR:
8448           load_alloc(&current,i);
8449           break;
8450         case STORE:
8451         case STORELR:
8452           store_alloc(&current,i);
8453           break;
8454         case ALU:
8455           alu_alloc(&current,i);
8456           break;
8457         case SHIFT:
8458           shift_alloc(&current,i);
8459           break;
8460         case MULTDIV:
8461           multdiv_alloc(&current,i);
8462           break;
8463         case SHIFTIMM:
8464           shiftimm_alloc(&current,i);
8465           break;
8466         case MOV:
8467           mov_alloc(&current,i);
8468           break;
8469         case COP0:
8470           cop0_alloc(&current,i);
8471           break;
8472         case COP1:
8473         case COP2:
8474           cop1_alloc(&current,i);
8475           break;
8476         case C1LS:
8477           c1ls_alloc(&current,i);
8478           break;
8479         case C2LS:
8480           c2ls_alloc(&current,i);
8481           break;
8482         case C2OP:
8483           c2op_alloc(&current,i);
8484           break;
8485         case FCONV:
8486           fconv_alloc(&current,i);
8487           break;
8488         case FLOAT:
8489           float_alloc(&current,i);
8490           break;
8491         case FCOMP:
8492           fcomp_alloc(&current,i);
8493           break;
8494         case SYSCALL:
8495         case HLECALL:
8496         case INTCALL:
8497           syscall_alloc(&current,i);
8498           break;
8499         case SPAN:
8500           pagespan_alloc(&current,i);
8501           break;
8502       }
8503
8504       // Drop the upper half of registers that have become 32-bit
8505       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8506       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8507         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8508         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8509         current.uu|=1;
8510       } else {
8511         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8512         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8513         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8514         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8515         current.uu|=1;
8516       }
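           // u/uu are per-MIPS-register "unneeded" masks: bit r of u set means
           // the value of register r is dead past this point, uu likewise for
           // the upper 32 bits, so a 32-bit result (is32 bit set) makes its
           // upper half unneeded, which is what the |= above records.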
8517
8518       // Create entry (branch target) regmap
8519       for(hr=0;hr<HOST_REGS;hr++)
8520       {
8521         int r,or;
8522         r=current.regmap[hr];
8523         if(r>=0) {
8524           if(r!=regmap_pre[i][hr]) {
8525             // TODO: delay slot (?)
8526             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8527             if(or<0||(r&63)>=TEMPREG){
8528               regs[i].regmap_entry[hr]=-1;
8529             }
8530             else
8531             {
8532               // Just move it to a different register
8533               regs[i].regmap_entry[hr]=r;
8534               // If it was dirty before, it's still dirty
8535               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8536             }
8537           }
8538           else
8539           {
8540             // Unneeded
8541             if(r==0){
8542               regs[i].regmap_entry[hr]=0;
8543             }
8544             else
8545             if(r<64){
8546               if((current.u>>r)&1) {
8547                 regs[i].regmap_entry[hr]=-1;
8548                 //regs[i].regmap[hr]=-1;
8549                 current.regmap[hr]=-1;
8550               }else
8551                 regs[i].regmap_entry[hr]=r;
8552             }
8553             else {
8554               if((current.uu>>(r&63))&1) {
8555                 regs[i].regmap_entry[hr]=-1;
8556                 //regs[i].regmap[hr]=-1;
8557                 current.regmap[hr]=-1;
8558               }else
8559                 regs[i].regmap_entry[hr]=r;
8560             }
8561           }
8562         } else {
8563           // Branches expect CCREG to be allocated at the target
8564           if(regmap_pre[i][hr]==CCREG)
8565             regs[i].regmap_entry[hr]=CCREG;
8566           else
8567             regs[i].regmap_entry[hr]=-1;
8568         }
8569       }
8570       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
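           // regs[i].regmap_entry is the host register mapping a jump into
           // this instruction must provide; regs[i].regmap is the mapping in
           // effect once this instruction's own allocation is done.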
8571     }
8572
8573     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
8574       current.waswritten|=1<<rs1[i-1];
8575     current.waswritten&=~(1<<rt1[i]);
8576     current.waswritten&=~(1<<rt2[i]);
8577     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
8578       current.waswritten&=~(1<<rs1[i]);
8579
8580     /* Branch post-alloc */
8581     if(i>0)
8582     {
8583       current.was32=current.is32;
8584       current.wasdirty=current.dirty;
8585       switch(itype[i-1]) {
8586         case UJUMP:
8587           memcpy(&branch_regs[i-1],&current,sizeof(current));
8588           branch_regs[i-1].isconst=0;
8589           branch_regs[i-1].wasconst=0;
8590           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8591           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8592           alloc_cc(&branch_regs[i-1],i-1);
8593           dirty_reg(&branch_regs[i-1],CCREG);
8594           if(rt1[i-1]==31) { // JAL
8595             alloc_reg(&branch_regs[i-1],i-1,31);
8596             dirty_reg(&branch_regs[i-1],31);
8597             branch_regs[i-1].is32|=1LL<<31;
8598           }
8599           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8600           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8601           break;
8602         case RJUMP:
8603           memcpy(&branch_regs[i-1],&current,sizeof(current));
8604           branch_regs[i-1].isconst=0;
8605           branch_regs[i-1].wasconst=0;
8606           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8607           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8608           alloc_cc(&branch_regs[i-1],i-1);
8609           dirty_reg(&branch_regs[i-1],CCREG);
8610           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8611           if(rt1[i-1]!=0) { // JALR
8612             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
8613             dirty_reg(&branch_regs[i-1],rt1[i-1]);
8614             branch_regs[i-1].is32|=1LL<<rt1[i-1];
8615           }
8616           #ifdef USE_MINI_HT
8617           if(rs1[i-1]==31) { // JALR
8618             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8619             #ifndef HOST_IMM_ADDR32
8620             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8621             #endif
8622           }
8623           #endif
8624           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8625           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8626           break;
8627         case CJUMP:
8628           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8629           {
8630             alloc_cc(&current,i-1);
8631             dirty_reg(&current,CCREG);
8632             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8633                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8634               // The delay slot overwrote one of our conditions
8635               // Delay slot goes after the test (in order)
8636               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8637               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8638               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8639               current.u|=1;
8640               current.uu|=1;
8641               delayslot_alloc(&current,i);
8642               current.isconst=0;
8643             }
8644             else
8645             {
8646               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8647               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8648               // Alloc the branch condition registers
8649               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8650               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8651               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8652               {
8653                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8654                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8655               }
8656             }
8657             memcpy(&branch_regs[i-1],&current,sizeof(current));
8658             branch_regs[i-1].isconst=0;
8659             branch_regs[i-1].wasconst=0;
8660             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8661             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8662           }
8663           else
8664           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8665           {
8666             alloc_cc(&current,i-1);
8667             dirty_reg(&current,CCREG);
8668             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8669               // The delay slot overwrote the branch condition
8670               // Delay slot goes after the test (in order)
8671               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8672               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8673               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8674               current.u|=1;
8675               current.uu|=1;
8676               delayslot_alloc(&current,i);
8677               current.isconst=0;
8678             }
8679             else
8680             {
8681               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8682               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8683               // Alloc the branch condition register
8684               alloc_reg(&current,i-1,rs1[i-1]);
8685               if(!(current.is32>>rs1[i-1]&1))
8686               {
8687                 alloc_reg64(&current,i-1,rs1[i-1]);
8688               }
8689             }
8690             memcpy(&branch_regs[i-1],&current,sizeof(current));
8691             branch_regs[i-1].isconst=0;
8692             branch_regs[i-1].wasconst=0;
8693             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8694             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8695           }
8696           else
8697           // Alloc the delay slot in case the branch is taken
8698           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8699           {
8700             memcpy(&branch_regs[i-1],&current,sizeof(current));
8701             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8702             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8703             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8704             alloc_cc(&branch_regs[i-1],i);
8705             dirty_reg(&branch_regs[i-1],CCREG);
8706             delayslot_alloc(&branch_regs[i-1],i);
8707             branch_regs[i-1].isconst=0;
8708             alloc_reg(&current,i,CCREG); // Not taken path
8709             dirty_reg(&current,CCREG);
8710             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8711           }
8712           else
8713           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8714           {
8715             memcpy(&branch_regs[i-1],&current,sizeof(current));
8716             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8717             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8718             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8719             alloc_cc(&branch_regs[i-1],i);
8720             dirty_reg(&branch_regs[i-1],CCREG);
8721             delayslot_alloc(&branch_regs[i-1],i);
8722             branch_regs[i-1].isconst=0;
8723             alloc_reg(&current,i,CCREG); // Not taken path
8724             dirty_reg(&current,CCREG);
8725             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8726           }
8727           break;
8728         case SJUMP:
8729           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8730           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8731           {
8732             alloc_cc(&current,i-1);
8733             dirty_reg(&current,CCREG);
8734             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8735               // The delay slot overwrote the branch condition
8736               // Delay slot goes after the test (in order)
8737               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8738               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8739               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8740               current.u|=1;
8741               current.uu|=1;
8742               delayslot_alloc(&current,i);
8743               current.isconst=0;
8744             }
8745             else
8746             {
8747               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8748               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8749               // Alloc the branch condition register
8750               alloc_reg(&current,i-1,rs1[i-1]);
8751               if(!(current.is32>>rs1[i-1]&1))
8752               {
8753                 alloc_reg64(&current,i-1,rs1[i-1]);
8754               }
8755             }
8756             memcpy(&branch_regs[i-1],&current,sizeof(current));
8757             branch_regs[i-1].isconst=0;
8758             branch_regs[i-1].wasconst=0;
8759             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8760             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8761           }
8762           else
8763           // Alloc the delay slot in case the branch is taken
8764           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8765           {
8766             memcpy(&branch_regs[i-1],&current,sizeof(current));
8767             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8768             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8769             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8770             alloc_cc(&branch_regs[i-1],i);
8771             dirty_reg(&branch_regs[i-1],CCREG);
8772             delayslot_alloc(&branch_regs[i-1],i);
8773             branch_regs[i-1].isconst=0;
8774             alloc_reg(&current,i,CCREG); // Not taken path
8775             dirty_reg(&current,CCREG);
8776             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8777           }
8778           // FIXME: BLTZAL/BGEZAL
8779           if(opcode2[i-1]&0x10) { // BxxZAL
8780             alloc_reg(&branch_regs[i-1],i-1,31);
8781             dirty_reg(&branch_regs[i-1],31);
8782             branch_regs[i-1].is32|=1LL<<31;
8783           }
8784           break;
8785         case FJUMP:
8786           if(likely[i-1]==0) // BC1F/BC1T
8787           {
8788             alloc_cc(&current,i-1);
8789             dirty_reg(&current,CCREG);
8790             if(itype[i]==FCOMP) {
8791               // The delay slot overwrote the branch condition
8792               // Delay slot goes after the test (in order)
8793               delayslot_alloc(&current,i);
8794               current.isconst=0;
8795             }
8796             else
8797             {
8798               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8799               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8800               // Alloc the branch condition register
8801               alloc_reg(&current,i-1,FSREG);
8802             }
8803             memcpy(&branch_regs[i-1],&current,sizeof(current));
8804             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8805           }
8806           else // BC1FL/BC1TL
8807           {
8808             // Alloc the delay slot in case the branch is taken
8809             memcpy(&branch_regs[i-1],&current,sizeof(current));
8810             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8811             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8812             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8813             alloc_cc(&branch_regs[i-1],i);
8814             dirty_reg(&branch_regs[i-1],CCREG);
8815             delayslot_alloc(&branch_regs[i-1],i);
8816             branch_regs[i-1].isconst=0;
8817             alloc_reg(&current,i,CCREG); // Not taken path
8818             dirty_reg(&current,CCREG);
8819             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8820           }
8821           break;
8822       }
8823
8824       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
8825       {
8826         if(rt1[i-1]==31) // JAL/JALR
8827         {
8828           // Subroutine call will return here, don't alloc any registers
8829           current.is32=1;
8830           current.dirty=0;
8831           clear_all_regs(current.regmap);
8832           alloc_reg(&current,i,CCREG);
8833           dirty_reg(&current,CCREG);
8834         }
8835         else if(i+1<slen)
8836         {
8837           // Internal branch will jump here, match registers to caller
8838           current.is32=0x3FFFFFFFFLL;
8839           current.dirty=0;
8840           clear_all_regs(current.regmap);
8841           alloc_reg(&current,i,CCREG);
8842           dirty_reg(&current,CCREG);
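               // Take the register map of one branch that targets this point,
               // then intersect it with every other branch to the same address
               // so only mappings common to all predecessors survive.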
8843           for(j=i-1;j>=0;j--)
8844           {
8845             if(ba[j]==start+i*4+4) {
8846               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
8847               current.is32=branch_regs[j].is32;
8848               current.dirty=branch_regs[j].dirty;
8849               break;
8850             }
8851           }
8852           while(j>=0) {
8853             if(ba[j]==start+i*4+4) {
8854               for(hr=0;hr<HOST_REGS;hr++) {
8855                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
8856                   current.regmap[hr]=-1;
8857                 }
8858                 current.is32&=branch_regs[j].is32;
8859                 current.dirty&=branch_regs[j].dirty;
8860               }
8861             }
8862             j--;
8863           }
8864         }
8865       }
8866     }
8867
8868     // Count cycles in between branches
8869     ccadj[i]=cc;
8870     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
8871     {
8872       cc=0;
8873     }
8874 #if !defined(DRC_DBG)
8875     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
8876     {
8877       // GTE runs in parallel until accessed, divide by 2 for a rough guess
8878       cc+=gte_cycletab[source[i]&0x3f]/2;
8879     }
8880     else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // counting load/store here causes weird timing issues, hence they stay commented out
8881     {
8882       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
8883     }
8884     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
8885     {
8886       cc+=4;
8887     }
8888     else if(itype[i]==C2LS)
8889     {
8890       cc+=4;
8891     }
8892 #endif
8893     else
8894     {
8895       cc++;
8896     }
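         // ccadj[i] snapshots the cycle count accumulated since the last
         // branch; cc itself is reset after branches and syscalls (above).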
8897
8898     flush_dirty_uppers(&current);
8899     if(!is_ds[i]) {
8900       regs[i].is32=current.is32;
8901       regs[i].dirty=current.dirty;
8902       regs[i].isconst=current.isconst;
8903       memcpy(constmap[i],current_constmap,sizeof(current_constmap));
8904     }
8905     for(hr=0;hr<HOST_REGS;hr++) {
8906       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
8907         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
8908           regs[i].wasconst&=~(1<<hr);
8909         }
8910       }
8911     }
8912     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
8913     regs[i].waswritten=current.waswritten;
8914   }
8915
8916   /* Pass 4 - Cull unused host registers */
8917
8918   uint64_t nr=0;
8919
8920   for (i=slen-1;i>=0;i--)
8921   {
8922     int hr;
8923     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8924     {
8925       if(ba[i]<start || ba[i]>=(start+slen*4))
8926       {
8927         // Branch out of this block, don't need anything
8928         nr=0;
8929       }
8930       else
8931       {
8932         // Internal branch
8933         // Need whatever matches the target
8934         nr=0;
8935         int t=(ba[i]-start)>>2;
8936         for(hr=0;hr<HOST_REGS;hr++)
8937         {
8938           if(regs[i].regmap_entry[hr]>=0) {
8939             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
8940           }
8941         }
8942       }
8943       // Conditional branch may need registers for following instructions
8944       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8945       {
8946         if(i<slen-2) {
8947           nr|=needed_reg[i+2];
8948           for(hr=0;hr<HOST_REGS;hr++)
8949           {
8950             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
8951             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
8952           }
8953         }
8954       }
8955       // Don't need stuff which is overwritten
8956       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8957       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8958       // Merge in delay slot
8959       for(hr=0;hr<HOST_REGS;hr++)
8960       {
8961         if(!likely[i]) {
8962           // These are overwritten unless the branch is "likely"
8963           // and the delay slot is nullified if not taken
8964           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8965           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8966         }
8967         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8968         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8969         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8970         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8971         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8972         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8973         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8974         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8975         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
8976           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8977           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8978         }
8979         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
8980           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8981           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8982         }
8983         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
8984           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8985           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8986         }
8987       }
8988     }
8989     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
8990     {
8991       // SYSCALL instruction (software interrupt)
8992       nr=0;
8993     }
8994     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
8995     {
8996       // ERET instruction (return from interrupt)
8997       nr=0;
8998     }
8999     else // Non-branch
9000     {
9001       if(i<slen-1) {
9002         for(hr=0;hr<HOST_REGS;hr++) {
9003           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9004           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9005           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9006           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9007         }
9008       }
9009     }
9010     for(hr=0;hr<HOST_REGS;hr++)
9011     {
9012       // Overwritten registers are not needed
9013       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9014       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9015       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9016       // Source registers are needed
9017       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9018       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9019       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9020       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9021       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9022       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9023       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9024       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9025       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9026         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9027         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9028       }
9029       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9030         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9031         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9032       }
9033       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
9034         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9035         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9036       }
9037       // Don't store a register immediately after writing it,
9038       // may prevent dual-issue.
9039       // But do so if this is a branch target, otherwise we
9040       // might have to load the register before the branch.
9041       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9042         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9043            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9044           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9045           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9046         }
9047         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9048            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9049           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9050           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9051         }
9052       }
9053     }
9054     // Cycle count is needed at branches.  Assume it is needed at the target too.
9055     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9056       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9057       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9058     }
9059     // Save it
9060     needed_reg[i]=nr;
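         // nr is a bitmask over host registers: bit hr set means the value
         // currently mapped to host register hr is still needed afterwards
         // and must not be deallocated below.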
9061
9062     // Deallocate unneeded registers
9063     for(hr=0;hr<HOST_REGS;hr++)
9064     {
9065       if(!((nr>>hr)&1)) {
9066         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9067         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9068            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9069            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9070         {
9071           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9072           {
9073             if(likely[i]) {
9074               regs[i].regmap[hr]=-1;
9075               regs[i].isconst&=~(1<<hr);
9076               if(i<slen-2) {
9077                 regmap_pre[i+2][hr]=-1;
9078                 regs[i+2].wasconst&=~(1<<hr);
9079               }
9080             }
9081           }
9082         }
9083         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9084         {
9085           int d1=0,d2=0,map=0,temp=0;
9086           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9087           {
9088             d1=dep1[i+1];
9089             d2=dep2[i+1];
9090           }
9091           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
9092              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9093             map=INVCP;
9094           }
9095           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9096              itype[i+1]==C1LS || itype[i+1]==C2LS)
9097             temp=FTEMP;
9098           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9099              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9100              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9101              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9102              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9103              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9104              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9105              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9106              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9107              regs[i].regmap[hr]!=map )
9108           {
9109             regs[i].regmap[hr]=-1;
9110             regs[i].isconst&=~(1<<hr);
9111             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9112                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9113                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9114                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9115                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9116                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9117                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9118                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9119                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9120                branch_regs[i].regmap[hr]!=map)
9121             {
9122               branch_regs[i].regmap[hr]=-1;
9123               branch_regs[i].regmap_entry[hr]=-1;
9124               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9125               {
9126                 if(!likely[i]&&i<slen-2) {
9127                   regmap_pre[i+2][hr]=-1;
9128                   regs[i+2].wasconst&=~(1<<hr);
9129                 }
9130               }
9131             }
9132           }
9133         }
9134         else
9135         {
9136           // Non-branch
9137           if(i>0)
9138           {
9139             int d1=0,d2=0,map=-1,temp=-1;
9140             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9141             {
9142               d1=dep1[i];
9143               d2=dep2[i];
9144             }
9145             if(itype[i]==STORE || itype[i]==STORELR ||
9146                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9147               map=INVCP;
9148             }
9149             if(itype[i]==LOADLR || itype[i]==STORELR ||
9150                itype[i]==C1LS || itype[i]==C2LS)
9151               temp=FTEMP;
9152             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9153                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9154                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9155                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9156                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9157                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9158             {
9159               if(i<slen-1&&!is_ds[i]) {
9160                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9161                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9162                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9163                 {
9164                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9165                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9166                 }
9167                 regmap_pre[i+1][hr]=-1;
9168                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9169                 regs[i+1].wasconst&=~(1<<hr);
9170               }
9171               regs[i].regmap[hr]=-1;
9172               regs[i].isconst&=~(1<<hr);
9173             }
9174           }
9175         }
9176       }
9177     }
9178   }
9179
9180   /* Pass 5 - Pre-allocate registers */
9181
9182   // If a register is allocated during a loop, try to allocate it for the
9183   // entire loop, if possible.  This avoids loading/storing registers
9184   // inside of the loop.
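       //
       // Rough outline of the mechanism below: for a backward branch at i
       // with target t, f_regmap[] collects the register assignments seen at
       // the branch; the scan over j in [t,i] then checks whether each such
       // assignment can stay live across the whole loop body, and if so the
       // intervening instructions' maps are patched to keep it allocated.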
9185
9186   signed char f_regmap[HOST_REGS];
9187   clear_all_regs(f_regmap);
9188   for(i=0;i<slen-1;i++)
9189   {
9190     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9191     {
9192       if(ba[i]>=start && ba[i]<(start+i*4))
9193       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9194       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9195       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9196       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9197       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
9198       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
9199       {
9200         int t=(ba[i]-start)>>2;
9201         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9202         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
9203         for(hr=0;hr<HOST_REGS;hr++)
9204         {
9205           if(regs[i].regmap[hr]>64) {
9206             if(!((regs[i].dirty>>hr)&1))
9207               f_regmap[hr]=regs[i].regmap[hr];
9208             else f_regmap[hr]=-1;
9209           }
9210           else if(regs[i].regmap[hr]>=0) {
9211             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9212               // dealloc old register
9213               int n;
9214               for(n=0;n<HOST_REGS;n++)
9215               {
9216                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9217               }
9218               // and alloc new one
9219               f_regmap[hr]=regs[i].regmap[hr];
9220             }
9221           }
9222           if(branch_regs[i].regmap[hr]>64) {
9223             if(!((branch_regs[i].dirty>>hr)&1))
9224               f_regmap[hr]=branch_regs[i].regmap[hr];
9225             else f_regmap[hr]=-1;
9226           }
9227           else if(branch_regs[i].regmap[hr]>=0) {
9228             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
9229               // dealloc old register
9230               int n;
9231               for(n=0;n<HOST_REGS;n++)
9232               {
9233                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
9234               }
9235               // and alloc new one
9236               f_regmap[hr]=branch_regs[i].regmap[hr];
9237             }
9238           }
9239           if(ooo[i]) {
9240             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
9241               f_regmap[hr]=branch_regs[i].regmap[hr];
9242           }else{
9243             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
9244               f_regmap[hr]=branch_regs[i].regmap[hr];
9245           }
9246           // Avoid dirty->clean transition
9247           #ifdef DESTRUCTIVE_WRITEBACK
9248           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9249           #endif
9250           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
9251           // case above; however, it's always a good idea.  We can't hoist the
9252           // load if the register was already allocated, so there's no point
9253           // wasting time analyzing most of these cases.  It only "succeeds"
9254           // when the mapping was different and the load can be replaced with
9255           // a mov, which is of negligible benefit.  So such cases are
9256           // skipped below.
9257           if(f_regmap[hr]>0) {
9258             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
9259               int r=f_regmap[hr];
9260               for(j=t;j<=i;j++)
9261               {
9262                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9263                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9264                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9265                 if(r>63) {
9266                   // NB This can exclude the case where the upper-half
9267                   // register is lower numbered than the lower-half
9268                   // register.  Not sure if it's worth fixing...
9269                   if(get_reg(regs[j].regmap,r&63)<0) break;
9270                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
9271                   if(regs[j].is32&(1LL<<(r&63))) break;
9272                 }
9273                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9274                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9275                   int k;
9276                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9277                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9278                     if(r>63) {
9279                       if(get_reg(regs[i].regmap,r&63)<0) break;
9280                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9281                     }
9282                     k=i;
9283                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9284                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9285                         //printf("no free regs for store %x\n",start+(k-1)*4);
9286                         break;
9287                       }
9288                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9289                         //printf("no-match due to different register\n");
9290                         break;
9291                       }
9292                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9293                         //printf("no-match due to branch\n");
9294                         break;
9295                       }
9296                       // call/ret fast path assumes no registers allocated
9297                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
9298                         break;
9299                       }
9300                       if(r>63) {
9301                         // NB This can exclude the case where the upper-half
9302                         // register is lower numbered than the lower-half
9303                         // register.  Not sure if it's worth fixing...
9304                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9305                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9306                       }
9307                       k--;
9308                     }
9309                     if(i<slen-1) {
9310                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9311                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9312                         //printf("bad match after branch\n");
9313                         break;
9314                       }
9315                     }
9316                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9317                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
9318                       while(k<i) {
9319                         regs[k].regmap_entry[hr]=f_regmap[hr];
9320                         regs[k].regmap[hr]=f_regmap[hr];
9321                         regmap_pre[k+1][hr]=f_regmap[hr];
9322                         regs[k].wasdirty&=~(1<<hr);
9323                         regs[k].dirty&=~(1<<hr);
9324                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9325                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9326                         regs[k].wasconst&=~(1<<hr);
9327                         regs[k].isconst&=~(1<<hr);
9328                         k++;
9329                       }
9330                     }
9331                     else {
9332                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9333                       break;
9334                     }
9335                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9336                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9337                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9338                       regs[i].regmap_entry[hr]=f_regmap[hr];
9339                       regs[i].regmap[hr]=f_regmap[hr];
9340                       regs[i].wasdirty&=~(1<<hr);
9341                       regs[i].dirty&=~(1<<hr);
9342                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9343                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9344                       regs[i].wasconst&=~(1<<hr);
9345                       regs[i].isconst&=~(1<<hr);
9346                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9347                       branch_regs[i].wasdirty&=~(1<<hr);
9348                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9349                       branch_regs[i].regmap[hr]=f_regmap[hr];
9350                       branch_regs[i].dirty&=~(1<<hr);
9351                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9352                       branch_regs[i].wasconst&=~(1<<hr);
9353                       branch_regs[i].isconst&=~(1<<hr);
9354                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9355                         regmap_pre[i+2][hr]=f_regmap[hr];
9356                         regs[i+2].wasdirty&=~(1<<hr);
9357                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9358                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9359                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9360                       }
9361                     }
9362                   }
9363                   for(k=t;k<j;k++) {
9364                     // Alloc register clean at beginning of loop,
9365                     // but may dirty it in pass 6
9366                     regs[k].regmap_entry[hr]=f_regmap[hr];
9367                     regs[k].regmap[hr]=f_regmap[hr];
9368                     regs[k].dirty&=~(1<<hr);
9369                     regs[k].wasconst&=~(1<<hr);
9370                     regs[k].isconst&=~(1<<hr);
9371                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
9372                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
9373                       branch_regs[k].regmap[hr]=f_regmap[hr];
9374                       branch_regs[k].dirty&=~(1<<hr);
9375                       branch_regs[k].wasconst&=~(1<<hr);
9376                       branch_regs[k].isconst&=~(1<<hr);
9377                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
9378                         regmap_pre[k+2][hr]=f_regmap[hr];
9379                         regs[k+2].wasdirty&=~(1<<hr);
9380                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
9381                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
9382                       }
9383                     }
9384                     else
9385                     {
9386                       regmap_pre[k+1][hr]=f_regmap[hr];
9387                       regs[k+1].wasdirty&=~(1<<hr);
9388                     }
9389                   }
9390                   if(regs[j].regmap[hr]==f_regmap[hr])
9391                     regs[j].regmap_entry[hr]=f_regmap[hr];
9392                   break;
9393                 }
9394                 if(j==i) break;
9395                 if(regs[j].regmap[hr]>=0)
9396                   break;
9397                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
9398                   //printf("no-match due to different register\n");
9399                   break;
9400                 }
9401                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
9402                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
9403                   break;
9404                 }
9405                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9406                 {
9407                   // Stop on unconditional branch
9408                   break;
9409                 }
9410                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
9411                 {
9412                   if(ooo[j]) {
9413                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
9414                       break;
9415                   }else{
9416                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
9417                       break;
9418                   }
9419                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
9420                     //printf("no-match due to different register (branch)\n");
9421                     break;
9422                   }
9423                 }
9424                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9425                   //printf("No free regs for store %x\n",start+j*4);
9426                   break;
9427                 }
9428                 if(f_regmap[hr]>=64) {
9429                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
9430                     break;
9431                   }
9432                   else
9433                   {
9434                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
9435                       break;
9436                     }
9437                   }
9438                 }
9439               }
9440             }
9441           }
9442         }
9443       }
9444     }else{
9445       // Non-branch or undetermined branch target
9446       for(hr=0;hr<HOST_REGS;hr++)
9447       {
9448         if(hr!=EXCLUDE_REG) {
9449           if(regs[i].regmap[hr]>64) {
9450             if(!((regs[i].dirty>>hr)&1))
9451               f_regmap[hr]=regs[i].regmap[hr];
9452           }
9453           else if(regs[i].regmap[hr]>=0) {
9454             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9455               // dealloc old register
9456               int n;
9457               for(n=0;n<HOST_REGS;n++)
9458               {
9459                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9460               }
9461               // and alloc new one
9462               f_regmap[hr]=regs[i].regmap[hr];
9463             }
9464           }
9465         }
9466       }
9467       // Try to restore cycle count at branch targets
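      // Scan forward to the next point where HOST_CCREG gets mapped; if it
      // holds CCREG there, keep the cycle count in HOST_CCREG over the whole
      // range instead of reloading it, and do the same working backwards
      // over instructions where HOST_CCREG was unused.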
9468       if(bt[i]) {
9469         for(j=i;j<slen-1;j++) {
9470           if(regs[j].regmap[HOST_CCREG]!=-1) break;
9471           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9472             //printf("no free regs for store %x\n",start+j*4);
9473             break;
9474           }
9475         }
9476         if(regs[j].regmap[HOST_CCREG]==CCREG) {
9477           int k=i;
9478           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
9479           while(k<j) {
9480             regs[k].regmap_entry[HOST_CCREG]=CCREG;
9481             regs[k].regmap[HOST_CCREG]=CCREG;
9482             regmap_pre[k+1][HOST_CCREG]=CCREG;
9483             regs[k+1].wasdirty|=1<<HOST_CCREG;
9484             regs[k].dirty|=1<<HOST_CCREG;
9485             regs[k].wasconst&=~(1<<HOST_CCREG);
9486             regs[k].isconst&=~(1<<HOST_CCREG);
9487             k++;
9488           }
9489           regs[j].regmap_entry[HOST_CCREG]=CCREG;
9490         }
9491         // Work backwards from the branch target
9492         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9493         {
9494           //printf("Extend backwards\n");
9495           int k;
9496           k=i;
9497           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9498             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9499               //printf("no free regs for store %x\n",start+(k-1)*4);
9500               break;
9501             }
9502             k--;
9503           }
9504           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9505             //printf("Extend CC, %x ->\n",start+k*4);
9506             while(k<=i) {
9507               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9508               regs[k].regmap[HOST_CCREG]=CCREG;
9509               regmap_pre[k+1][HOST_CCREG]=CCREG;
9510               regs[k+1].wasdirty|=1<<HOST_CCREG;
9511               regs[k].dirty|=1<<HOST_CCREG;
9512               regs[k].wasconst&=~(1<<HOST_CCREG);
9513               regs[k].isconst&=~(1<<HOST_CCREG);
9514               k++;
9515             }
9516           }
9517           else {
9518             //printf("Fail Extend CC, %x ->\n",start+k*4);
9519           }
9520         }
9521       }
9522       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9523          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9524          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9525          itype[i]!=FCONV&&itype[i]!=FCOMP)
9526       {
9527         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9528       }
9529     }
9530   }
9531
9532   // Cache memory offset or tlb map pointer if a register is available
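  // Each host register is given a score below: points are awarded for
  // nearby loads/stores it could serve and for loop-invariant use found
  // through backward branches.  If the best score is above 1, that host
  // register is assigned ROREG over the range so the offset/map pointer
  // doesn't have to be reloaded for every access.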
9533   #ifndef HOST_IMM_ADDR32
9534   #ifndef RAM_OFFSET
9535   if(0)
9536   #endif
9537   {
9538     int earliest_available[HOST_REGS];
9539     int loop_start[HOST_REGS];
9540     int score[HOST_REGS];
9541     int end[HOST_REGS];
9542     int reg=ROREG;
9543
9544     // Init
9545     for(hr=0;hr<HOST_REGS;hr++) {
9546       score[hr]=0;earliest_available[hr]=0;
9547       loop_start[hr]=MAXBLOCK;
9548     }
9549     for(i=0;i<slen-1;i++)
9550     {
9551       // Can't do anything if no registers are available
9552       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
9553         for(hr=0;hr<HOST_REGS;hr++) {
9554           score[hr]=0;earliest_available[hr]=i+1;
9555           loop_start[hr]=MAXBLOCK;
9556         }
9557       }
9558       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9559         if(!ooo[i]) {
9560           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
9561             for(hr=0;hr<HOST_REGS;hr++) {
9562               score[hr]=0;earliest_available[hr]=i+1;
9563               loop_start[hr]=MAXBLOCK;
9564             }
9565           }
9566         }else{
9567           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
9568             for(hr=0;hr<HOST_REGS;hr++) {
9569               score[hr]=0;earliest_available[hr]=i+1;
9570               loop_start[hr]=MAXBLOCK;
9571             }
9572           }
9573         }
9574       }
9575       // Mark unavailable registers
9576       for(hr=0;hr<HOST_REGS;hr++) {
9577         if(regs[i].regmap[hr]>=0) {
9578           score[hr]=0;earliest_available[hr]=i+1;
9579           loop_start[hr]=MAXBLOCK;
9580         }
9581         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9582           if(branch_regs[i].regmap[hr]>=0) {
9583             score[hr]=0;earliest_available[hr]=i+2;
9584             loop_start[hr]=MAXBLOCK;
9585           }
9586         }
9587       }
9588       // No register allocations after unconditional jumps
9589       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
9590       {
9591         for(hr=0;hr<HOST_REGS;hr++) {
9592           score[hr]=0;earliest_available[hr]=i+2;
9593           loop_start[hr]=MAXBLOCK;
9594         }
9595         i++; // Skip delay slot too
9596         //printf("skip delay slot: %x\n",start+i*4);
9597       }
9598       else
9599       // Possible match
9600       if(itype[i]==LOAD||itype[i]==LOADLR||
9601          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
9602         for(hr=0;hr<HOST_REGS;hr++) {
9603           if(hr!=EXCLUDE_REG) {
9604             end[hr]=i-1;
9605             for(j=i;j<slen-1;j++) {
9606               if(regs[j].regmap[hr]>=0) break;
9607               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9608                 if(branch_regs[j].regmap[hr]>=0) break;
9609                 if(ooo[j]) {
9610                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
9611                 }else{
9612                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
9613                 }
9614               }
9615               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
9616               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9617                 int t=(ba[j]-start)>>2;
9618                 if(t<j&&t>=earliest_available[hr]) {
9619                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
9620                     // Score a point for hoisting loop invariant
9621                     if(t<loop_start[hr]) loop_start[hr]=t;
9622                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
9623                     score[hr]++;
9624                     end[hr]=j;
9625                   }
9626                 }
9627                 else if(t<j) {
9628                   if(regs[t].regmap[hr]==reg) {
9629                     // Score a point if the branch target matches this register
9630                     score[hr]++;
9631                     end[hr]=j;
9632                   }
9633                 }
9634                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
9635                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
9636                   score[hr]++;
9637                   end[hr]=j;
9638                 }
9639               }
9640               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9641               {
9642                 // Stop on unconditional branch
9643                 break;
9644               }
9645               else
9646               if(itype[j]==LOAD||itype[j]==LOADLR||
9647                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
9648                 score[hr]++;
9649                 end[hr]=j;
9650               }
9651             }
9652           }
9653         }
9654         // Find highest score and allocate that register
9655         int maxscore=0;
9656         for(hr=0;hr<HOST_REGS;hr++) {
9657           if(hr!=EXCLUDE_REG) {
9658             if(score[hr]>score[maxscore]) {
9659               maxscore=hr;
9660               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
9661             }
9662           }
9663         }
9664         if(score[maxscore]>1)
9665         {
9666           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
9667           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
9668             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
9669             assert(regs[j].regmap[maxscore]<0);
9670             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
9671             regs[j].regmap[maxscore]=reg;
9672             regs[j].dirty&=~(1<<maxscore);
9673             regs[j].wasconst&=~(1<<maxscore);
9674             regs[j].isconst&=~(1<<maxscore);
9675             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9676               branch_regs[j].regmap[maxscore]=reg;
9677               branch_regs[j].wasdirty&=~(1<<maxscore);
9678               branch_regs[j].dirty&=~(1<<maxscore);
9679               branch_regs[j].wasconst&=~(1<<maxscore);
9680               branch_regs[j].isconst&=~(1<<maxscore);
9681               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
9682                 regmap_pre[j+2][maxscore]=reg;
9683                 regs[j+2].wasdirty&=~(1<<maxscore);
9684               }
9685               // loop optimization (loop_preload)
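              // if this branch targets the loop head, mark the register
              // live at the entry so it can be preloaded (see loop_preload)
              // before the loop is entered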
9686               int t=(ba[j]-start)>>2;
9687               if(t==loop_start[maxscore]) {
9688                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
9689                   regs[t].regmap_entry[maxscore]=reg;
9690               }
9691             }
9692             else
9693             {
9694               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
9695                 regmap_pre[j+1][maxscore]=reg;
9696                 regs[j+1].wasdirty&=~(1<<maxscore);
9697               }
9698             }
9699           }
9700           i=j-1;
9701           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
9702           for(hr=0;hr<HOST_REGS;hr++) {
9703             score[hr]=0;earliest_available[hr]=i+1;
9704             loop_start[hr]=MAXBLOCK;
9705           }
9706         }
9707       }
9708     }
9709   }
9710   #endif
9711
9712   // This allocates registers (if possible) one instruction prior
9713   // to use, which can avoid a load-use penalty on certain CPUs.
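  // This works by copying the next instruction's source-register mapping
  // into the current instruction's regmap whenever the host register is
  // free here and isn't already a block-entry register there.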
9714   for(i=0;i<slen-1;i++)
9715   {
9716     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9717     {
9718       if(!bt[i+1])
9719       {
9720         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
9721            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
9722         {
9723           if(rs1[i+1]) {
9724             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9725             {
9726               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9727               {
9728                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9729                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9730                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9731                 regs[i].isconst&=~(1<<hr);
9732                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9733                 constmap[i][hr]=constmap[i+1][hr];
9734                 regs[i+1].wasdirty&=~(1<<hr);
9735                 regs[i].dirty&=~(1<<hr);
9736               }
9737             }
9738           }
9739           if(rs2[i+1]) {
9740             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
9741             {
9742               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9743               {
9744                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9745                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9746                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9747                 regs[i].isconst&=~(1<<hr);
9748                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9749                 constmap[i][hr]=constmap[i+1][hr];
9750                 regs[i+1].wasdirty&=~(1<<hr);
9751                 regs[i].dirty&=~(1<<hr);
9752               }
9753             }
9754           }
9755           // Preload target address for load instruction (non-constant)
9756           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9757             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9758             {
9759               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9760               {
9761                 regs[i].regmap[hr]=rs1[i+1];
9762                 regmap_pre[i+1][hr]=rs1[i+1];
9763                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9764                 regs[i].isconst&=~(1<<hr);
9765                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9766                 constmap[i][hr]=constmap[i+1][hr];
9767                 regs[i+1].wasdirty&=~(1<<hr);
9768                 regs[i].dirty&=~(1<<hr);
9769               }
9770             }
9771           }
9772           // Load source into target register
9773           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9774             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9775             {
9776               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9777               {
9778                 regs[i].regmap[hr]=rs1[i+1];
9779                 regmap_pre[i+1][hr]=rs1[i+1];
9780                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9781                 regs[i].isconst&=~(1<<hr);
9782                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9783                 constmap[i][hr]=constmap[i+1][hr];
9784                 regs[i+1].wasdirty&=~(1<<hr);
9785                 regs[i].dirty&=~(1<<hr);
9786               }
9787             }
9788           }
9789           // Address for store instruction (non-constant)
9790           if(itype[i+1]==STORE||itype[i+1]==STORELR
9791              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
9792             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9793               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
9794               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9795               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
9796               assert(hr>=0);
9797               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9798               {
9799                 regs[i].regmap[hr]=rs1[i+1];
9800                 regmap_pre[i+1][hr]=rs1[i+1];
9801                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9802                 regs[i].isconst&=~(1<<hr);
9803                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9804                 constmap[i][hr]=constmap[i+1][hr];
9805                 regs[i+1].wasdirty&=~(1<<hr);
9806                 regs[i].dirty&=~(1<<hr);
9807               }
9808             }
9809           }
9810           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
9811             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9812               int nr;
9813               hr=get_reg(regs[i+1].regmap,FTEMP);
9814               assert(hr>=0);
9815               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9816               {
9817                 regs[i].regmap[hr]=rs1[i+1];
9818                 regmap_pre[i+1][hr]=rs1[i+1];
9819                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9820                 regs[i].isconst&=~(1<<hr);
9821                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9822                 constmap[i][hr]=constmap[i+1][hr];
9823                 regs[i+1].wasdirty&=~(1<<hr);
9824                 regs[i].dirty&=~(1<<hr);
9825               }
9826               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9827               {
9828                 // move it to another register
9829                 regs[i+1].regmap[hr]=-1;
9830                 regmap_pre[i+2][hr]=-1;
9831                 regs[i+1].regmap[nr]=FTEMP;
9832                 regmap_pre[i+2][nr]=FTEMP;
9833                 regs[i].regmap[nr]=rs1[i+1];
9834                 regmap_pre[i+1][nr]=rs1[i+1];
9835                 regs[i+1].regmap_entry[nr]=rs1[i+1];
9836                 regs[i].isconst&=~(1<<nr);
9837                 regs[i+1].isconst&=~(1<<nr);
9838                 regs[i].dirty&=~(1<<nr);
9839                 regs[i+1].wasdirty&=~(1<<nr);
9840                 regs[i+1].dirty&=~(1<<nr);
9841                 regs[i+2].wasdirty&=~(1<<nr);
9842               }
9843             }
9844           }
9845           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
9846             if(itype[i+1]==LOAD)
9847               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
9848             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
9849               hr=get_reg(regs[i+1].regmap,FTEMP);
9850             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
9851               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
9852               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9853             }
9854             if(hr>=0&&regs[i].regmap[hr]<0) {
9855               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
9856               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
9857                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
9858                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
9859                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
9860                 regs[i].isconst&=~(1<<hr);
9861                 regs[i+1].wasdirty&=~(1<<hr);
9862                 regs[i].dirty&=~(1<<hr);
9863               }
9864             }
9865           }
9866         }
9867       }
9868     }
9869   }
9870
9871   /* Pass 6 - Optimize clean/dirty state */
9872   clean_registers(0,slen-1,1);
9873
9874   /* Pass 7 - Identify 32-bit registers */
9875   for (i=slen-1;i>=0;i--)
9876   {
9877     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9878     {
9879       // Conditional branch
9880       if((source[i]>>16)!=0x1000&&i<slen-2) {
9881         // Mark this address as a branch target since it may be called
9882         // upon return from interrupt
9883         bt[i+2]=1;
9884       }
9885     }
9886   }
9887
9888   if(itype[slen-1]==SPAN) {
9889     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
9890   }
9891
9892 #ifdef DISASM
9893   /* Debug/disassembly */
9894   for(i=0;i<slen;i++)
9895   {
9896     printf("U:");
9897     int r;
9898     for(r=1;r<=CCREG;r++) {
9899       if((unneeded_reg[i]>>r)&1) {
9900         if(r==HIREG) printf(" HI");
9901         else if(r==LOREG) printf(" LO");
9902         else printf(" r%d",r);
9903       }
9904     }
9905     printf("\n");
9906     #if defined(__i386__) || defined(__x86_64__)
9907     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
9908     #endif
9909     #ifdef __arm__
9910     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
9911     #endif
9912     printf("needs: ");
9913     if(needed_reg[i]&1) printf("eax ");
9914     if((needed_reg[i]>>1)&1) printf("ecx ");
9915     if((needed_reg[i]>>2)&1) printf("edx ");
9916     if((needed_reg[i]>>3)&1) printf("ebx ");
9917     if((needed_reg[i]>>5)&1) printf("ebp ");
9918     if((needed_reg[i]>>6)&1) printf("esi ");
9919     if((needed_reg[i]>>7)&1) printf("edi ");
9920     printf("\n");
9921     #if defined(__i386__) || defined(__x86_64__)
9922     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
9923     printf("dirty: ");
9924     if(regs[i].wasdirty&1) printf("eax ");
9925     if((regs[i].wasdirty>>1)&1) printf("ecx ");
9926     if((regs[i].wasdirty>>2)&1) printf("edx ");
9927     if((regs[i].wasdirty>>3)&1) printf("ebx ");
9928     if((regs[i].wasdirty>>5)&1) printf("ebp ");
9929     if((regs[i].wasdirty>>6)&1) printf("esi ");
9930     if((regs[i].wasdirty>>7)&1) printf("edi ");
9931     #endif
9932     #ifdef __arm__
9933     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
9934     printf("dirty: ");
9935     if(regs[i].wasdirty&1) printf("r0 ");
9936     if((regs[i].wasdirty>>1)&1) printf("r1 ");
9937     if((regs[i].wasdirty>>2)&1) printf("r2 ");
9938     if((regs[i].wasdirty>>3)&1) printf("r3 ");
9939     if((regs[i].wasdirty>>4)&1) printf("r4 ");
9940     if((regs[i].wasdirty>>5)&1) printf("r5 ");
9941     if((regs[i].wasdirty>>6)&1) printf("r6 ");
9942     if((regs[i].wasdirty>>7)&1) printf("r7 ");
9943     if((regs[i].wasdirty>>8)&1) printf("r8 ");
9944     if((regs[i].wasdirty>>9)&1) printf("r9 ");
9945     if((regs[i].wasdirty>>10)&1) printf("r10 ");
9946     if((regs[i].wasdirty>>12)&1) printf("r12 ");
9947     #endif
9948     printf("\n");
9949     disassemble_inst(i);
9950     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
9951     #if defined(__i386__) || defined(__x86_64__)
9952     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
9953     if(regs[i].dirty&1) printf("eax ");
9954     if((regs[i].dirty>>1)&1) printf("ecx ");
9955     if((regs[i].dirty>>2)&1) printf("edx ");
9956     if((regs[i].dirty>>3)&1) printf("ebx ");
9957     if((regs[i].dirty>>5)&1) printf("ebp ");
9958     if((regs[i].dirty>>6)&1) printf("esi ");
9959     if((regs[i].dirty>>7)&1) printf("edi ");
9960     #endif
9961     #ifdef __arm__
9962     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
9963     if(regs[i].dirty&1) printf("r0 ");
9964     if((regs[i].dirty>>1)&1) printf("r1 ");
9965     if((regs[i].dirty>>2)&1) printf("r2 ");
9966     if((regs[i].dirty>>3)&1) printf("r3 ");
9967     if((regs[i].dirty>>4)&1) printf("r4 ");
9968     if((regs[i].dirty>>5)&1) printf("r5 ");
9969     if((regs[i].dirty>>6)&1) printf("r6 ");
9970     if((regs[i].dirty>>7)&1) printf("r7 ");
9971     if((regs[i].dirty>>8)&1) printf("r8 ");
9972     if((regs[i].dirty>>9)&1) printf("r9 ");
9973     if((regs[i].dirty>>10)&1) printf("r10 ");
9974     if((regs[i].dirty>>12)&1) printf("r12 ");
9975     #endif
9976     printf("\n");
9977     if(regs[i].isconst) {
9978       printf("constants: ");
9979       #if defined(__i386__) || defined(__x86_64__)
9980       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
9981       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
9982       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
9983       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
9984       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
9985       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
9986       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
9987       #endif
9988       #ifdef __arm__
9989       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
9990       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
9991       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
9992       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
9993       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
9994       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
9995       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
9996       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
9997       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
9998       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
9999       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
10000       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
10001       #endif
10002       printf("\n");
10003     }
10004     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10005       #if defined(__i386__) || defined(__x86_64__)
10006       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
10007       if(branch_regs[i].dirty&1) printf("eax ");
10008       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
10009       if((branch_regs[i].dirty>>2)&1) printf("edx ");
10010       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
10011       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
10012       if((branch_regs[i].dirty>>6)&1) printf("esi ");
10013       if((branch_regs[i].dirty>>7)&1) printf("edi ");
10014       #endif
10015       #ifdef __arm__
10016       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
10017       if(branch_regs[i].dirty&1) printf("r0 ");
10018       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
10019       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
10020       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
10021       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
10022       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
10023       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
10024       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
10025       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
10026       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
10027       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
10028       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
10029       #endif
10030     }
10031   }
10032 #endif // DISASM
10033
10034   /* Pass 8 - Assembly */
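  // For each instruction: write back / reload host registers to match
  // regs[i].regmap_entry, record the entry point in instr_addr[i], then
  // dispatch on itype[i] to the per-type assembler.  Branch types set ds
  // so the delay slot (assembled as part of the branch) is skipped here.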
10035   linkcount=0;stubcount=0;
10036   ds=0;is_delayslot=0;
10037   cop1_usable=0;
10038   uint64_t is32_pre=0;
10039   u_int dirty_pre=0;
10040   void *beginning=start_block();
10041   if((u_int)addr&1) {
10042     ds=1;
10043     pagespan_ds();
10044   }
10045   u_int instr_addr0_override=0;
10046
10047   if (start == 0x80030000) {
10048     // nasty hack for the fastbios (BIOS skip) feature:
10049     // override the block entry point with this code
10050     instr_addr0_override=(u_int)out;
10051     emit_movimm(start,0);
10052     // abuse io address var as a flag that we
10053     // have already returned here once
10054     emit_readword((int)&address,1);
10055     emit_writeword(0,(int)&pcaddr);
10056     emit_writeword(0,(int)&address);
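    // compare the old 'address' value with 'start' below: on the first
    // entry they normally differ and we exit through new_dyna_leave; since
    // 'address' was just set to 'start', re-entering falls through into
    // the block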
10057     emit_cmp(0,1);
10058     emit_jne((int)new_dyna_leave);
10059   }
10060   for(i=0;i<slen;i++)
10061   {
10062     //if(ds) printf("ds: ");
10063     disassemble_inst(i);
10064     if(ds) {
10065       ds=0; // Skip delay slot
10066       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
10067       instr_addr[i]=0;
10068     } else {
10069       speculate_register_values(i);
10070       #ifndef DESTRUCTIVE_WRITEBACK
10071       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10072       {
10073         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
10074               unneeded_reg[i],unneeded_reg_upper[i]);
10075       }
10076       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
10077         is32_pre=branch_regs[i].is32;
10078         dirty_pre=branch_regs[i].dirty;
10079       }else{
10080         is32_pre=regs[i].is32;
10081         dirty_pre=regs[i].dirty;
10082       }
10083       #endif
10084       // write back
10085       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10086       {
10087         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
10088                       unneeded_reg[i],unneeded_reg_upper[i]);
10089         loop_preload(regmap_pre[i],regs[i].regmap_entry);
10090       }
10091       // branch target entry point
10092       instr_addr[i]=(u_int)out;
10093       assem_debug("<->\n");
10094       // load regs
10095       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
10096         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
10097       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
10098       address_generation(i,&regs[i],regs[i].regmap_entry);
10099       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
10100       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10101       {
10102         // Load the delay slot registers if necessary
10103         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
10104           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10105         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
10106           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10107         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
10108           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10109       }
10110       else if(i+1<slen)
10111       {
10112         // Preload registers for following instruction
10113         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10114           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
10115             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10116         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10117           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
10118             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10119       }
10120       // TODO: if(is_ooo(i)) address_generation(i+1);
10121       if(itype[i]==CJUMP||itype[i]==FJUMP)
10122         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
10123       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
10124         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10125       if(bt[i]) cop1_usable=0;
10126       // assemble
10127       switch(itype[i]) {
10128         case ALU:
10129           alu_assemble(i,&regs[i]);break;
10130         case IMM16:
10131           imm16_assemble(i,&regs[i]);break;
10132         case SHIFT:
10133           shift_assemble(i,&regs[i]);break;
10134         case SHIFTIMM:
10135           shiftimm_assemble(i,&regs[i]);break;
10136         case LOAD:
10137           load_assemble(i,&regs[i]);break;
10138         case LOADLR:
10139           loadlr_assemble(i,&regs[i]);break;
10140         case STORE:
10141           store_assemble(i,&regs[i]);break;
10142         case STORELR:
10143           storelr_assemble(i,&regs[i]);break;
10144         case COP0:
10145           cop0_assemble(i,&regs[i]);break;
10146         case COP1:
10147           cop1_assemble(i,&regs[i]);break;
10148         case C1LS:
10149           c1ls_assemble(i,&regs[i]);break;
10150         case COP2:
10151           cop2_assemble(i,&regs[i]);break;
10152         case C2LS:
10153           c2ls_assemble(i,&regs[i]);break;
10154         case C2OP:
10155           c2op_assemble(i,&regs[i]);break;
10156         case FCONV:
10157           fconv_assemble(i,&regs[i]);break;
10158         case FLOAT:
10159           float_assemble(i,&regs[i]);break;
10160         case FCOMP:
10161           fcomp_assemble(i,&regs[i]);break;
10162         case MULTDIV:
10163           multdiv_assemble(i,&regs[i]);break;
10164         case MOV:
10165           mov_assemble(i,&regs[i]);break;
10166         case SYSCALL:
10167           syscall_assemble(i,&regs[i]);break;
10168         case HLECALL:
10169           hlecall_assemble(i,&regs[i]);break;
10170         case INTCALL:
10171           intcall_assemble(i,&regs[i]);break;
10172         case UJUMP:
10173           ujump_assemble(i,&regs[i]);ds=1;break;
10174         case RJUMP:
10175           rjump_assemble(i,&regs[i]);ds=1;break;
10176         case CJUMP:
10177           cjump_assemble(i,&regs[i]);ds=1;break;
10178         case SJUMP:
10179           sjump_assemble(i,&regs[i]);ds=1;break;
10180         case FJUMP:
10181           fjump_assemble(i,&regs[i]);ds=1;break;
10182         case SPAN:
10183           pagespan_assemble(i,&regs[i]);break;
10184       }
10185       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10186         literal_pool(1024);
10187       else
10188         literal_pool_jumpover(256);
10189     }
10190   }
10191   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10192   // If the block did not end with an unconditional branch,
10193   // add a jump to the next instruction.
10194   if(i>1) {
10195     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10196       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10197       assert(i==slen);
10198       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10199         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10200         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10201           emit_loadreg(CCREG,HOST_CCREG);
10202         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10203       }
10204       else if(!likely[i-2])
10205       {
10206         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10207         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10208       }
10209       else
10210       {
10211         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10212         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10213       }
10214       add_to_linker((int)out,start+i*4,0);
10215       emit_jmp(0);
10216     }
10217   }
10218   else
10219   {
10220     assert(i>0);
10221     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10222     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10223     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10224       emit_loadreg(CCREG,HOST_CCREG);
10225     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10226     add_to_linker((int)out,start+i*4,0);
10227     emit_jmp(0);
10228   }
10229
10230   // TODO: delay slot stubs?
10231   // Stubs
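  // Emit the out-of-line code for stubs recorded during assembly: slow-path
  // reads/writes, cycle count checks (CC_STUB), self-modifying-code checks
  // (INVCODE_STUB), cop1-unusable traps (FP_STUB) and unaligned stores.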
10232   for(i=0;i<stubcount;i++)
10233   {
10234     switch(stubs[i][0])
10235     {
10236       case LOADB_STUB:
10237       case LOADH_STUB:
10238       case LOADW_STUB:
10239       case LOADD_STUB:
10240       case LOADBU_STUB:
10241       case LOADHU_STUB:
10242         do_readstub(i);break;
10243       case STOREB_STUB:
10244       case STOREH_STUB:
10245       case STOREW_STUB:
10246       case STORED_STUB:
10247         do_writestub(i);break;
10248       case CC_STUB:
10249         do_ccstub(i);break;
10250       case INVCODE_STUB:
10251         do_invstub(i);break;
10252       case FP_STUB:
10253         do_cop1stub(i);break;
10254       case STORELR_STUB:
10255         do_unalignedwritestub(i);break;
10256     }
10257   }
10258
10259   if (instr_addr0_override)
10260     instr_addr[0] = instr_addr0_override;
10261
10262   /* Pass 9 - Linker */
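  // Resolve the branches recorded by add_to_linker: internal branches are
  // patched directly to instr_addr[target]; external ones get an extjump
  // stub, and if check_addr() already knows the target the branch is
  // patched straight to it while add_link() records the stub.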
10263   for(i=0;i<linkcount;i++)
10264   {
10265     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10266     literal_pool(64);
10267     if(!link_addr[i][2])
10268     {
10269       void *stub=out;
10270       void *addr=check_addr(link_addr[i][1]);
10271       emit_extjump(link_addr[i][0],link_addr[i][1]);
10272       if(addr) {
10273         set_jump_target(link_addr[i][0],(int)addr);
10274         add_link(link_addr[i][1],stub);
10275       }
10276       else set_jump_target(link_addr[i][0],(int)stub);
10277     }
10278     else
10279     {
10280       // Internal branch
10281       int target=(link_addr[i][1]-start)>>2;
10282       assert(target>=0&&target<slen);
10283       assert(instr_addr[target]);
10284       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10285       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
10286       //#else
10287       set_jump_target(link_addr[i][0],instr_addr[target]);
10288       //#endif
10289     }
10290   }
10291   // External Branch Targets (jump_in)
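  // Each branch target (and the block start) gets a dirty-check stub as its
  // externally visible entry point; it's registered in jump_dirty/jump_in
  // for this page, and a stale hash table entry for the address is updated
  // to point at the new entry.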
10292   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
10293   for(i=0;i<slen;i++)
10294   {
10295     if(bt[i]||i==0)
10296     {
10297       if(instr_addr[i]) // TODO - delay slots (=null)
10298       {
10299         u_int vaddr=start+i*4;
10300         u_int page=get_page(vaddr);
10301         u_int vpage=get_vpage(vaddr);
10302         literal_pool(256);
10303         {
10304           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10305           assem_debug("jump_in: %x\n",start+i*4);
10306           ll_add(jump_dirty+vpage,vaddr,(void *)out);
10307           int entry_point=do_dirty_stub(i);
10308           ll_add_flags(jump_in+page,vaddr,state_rflags,(void *)entry_point);
10309           // If there was an existing entry in the hash table,
10310           // replace it with the new address.
10311           // Don't add new entries.  We'll insert the
10312           // ones that actually get used in check_addr().
10313           u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
10314           if(ht_bin[0]==vaddr) {
10315             ht_bin[1]=entry_point;
10316           }
10317           if(ht_bin[2]==vaddr) {
10318             ht_bin[3]=entry_point;
10319           }
10320         }
10321       }
10322     }
10323   }
10324   // Write out the literal pool if necessary
10325   literal_pool(0);
10326   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10327   // Align code
10328   if(((u_int)out)&7) emit_addnop(13);
10329   #endif
10330   assert((u_int)out-(u_int)beginning<MAX_OUTPUT_BLOCK_SIZE);
10331   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
10332   memcpy(copy,source,slen*4);
10333   copy+=slen*4;
10334
10335   end_block(beginning);
10336
10337   // If we're within 256K of the end of the buffer,
10338   // start over from the beginning. (Is 256K enough?)
10339   if((u_int)out>(u_int)BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
10340
10341   // Trap writes to any of the pages we compiled
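  // invalid_code[page]==0 means the page holds compiled code, so stores
  // into it will be checked for (and trigger) invalidation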
10342   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
10343     invalid_code[i]=0;
10344   }
10345   inv_code_start=inv_code_end=~0;
10346
10347   // for PCSX we need to mark all mirrors too
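  // the 2MB of RAM is mirrored at 0x00000000 (KUSEG), 0x80000000 (KSEG0)
  // and 0xa0000000 (KSEG1); (i&0x1ff) picks the same physical page in each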
10348   if(get_page(start)<(RAM_SIZE>>12))
10349     for(i=start>>12;i<=(start+slen*4)>>12;i++)
10350       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
10351       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
10352       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
10353
10354   /* Pass 10 - Free memory by expiring oldest blocks */
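  // The translation cache is treated as a ring: expirep sweeps up to a
  // point ahead of the current output pointer and, phase by phase, drops
  // jump_in/jump_dirty entries, jump_out pointers and hash table entries
  // that refer to the region about to be reused.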
10355
10356   int end=((((int)out-(int)BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
10357   while(expirep!=end)
10358   {
10359     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
10360     int base=(int)BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
10361     inv_debug("EXP: Phase %d\n",expirep);
10362     switch((expirep>>11)&3)
10363     {
10364       case 0:
10365         // Clear jump_in and jump_dirty
10366         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
10367         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
10368         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
10369         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
10370         break;
10371       case 1:
10372         // Clear pointers
10373         ll_kill_pointers(jump_out[expirep&2047],base,shift);
10374         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
10375         break;
10376       case 2:
10377         // Clear hash table
10378         for(i=0;i<32;i++) {
10379           u_int *ht_bin=hash_table[((expirep&2047)<<5)+i];
10380           if((ht_bin[3]>>shift)==(base>>shift) ||
10381              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10382             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
10383             ht_bin[2]=ht_bin[3]=-1;
10384           }
10385           if((ht_bin[1]>>shift)==(base>>shift) ||
10386              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10387             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
10388             ht_bin[0]=ht_bin[2];
10389             ht_bin[1]=ht_bin[3];
10390             ht_bin[2]=ht_bin[3]=-1;
10391           }
10392         }
10393         break;
10394       case 3:
10395         // Clear jump_out
10396         #ifdef __arm__
10397         if((expirep&2047)==0)
10398           do_clear_cache();
10399         #endif
10400         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
10401         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
10402         break;
10403     }
10404     expirep=(expirep+1)&65535;
10405   }
10406   return 0;
10407 }
10408
10409 // vim:shiftwidth=2:expandtab