1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26
27 #include "emu_if.h" //emulator interface
28
29 //#define DISASM
30 //#define assem_debug printf
31 //#define inv_debug printf
32 #define assem_debug(...)
33 #define inv_debug(...)
34
35 #ifdef __i386__
36 #include "assem_x86.h"
37 #endif
38 #ifdef __x86_64__
39 #include "assem_x64.h"
40 #endif
41 #ifdef __arm__
42 #include "assem_arm.h"
43 #endif
44
45 #ifdef __BLACKBERRY_QNX__
46 #undef __clear_cache
47 #define __clear_cache(start,end) msync(start, (size_t)((void*)end - (void*)start), MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
48 #elif defined(__MACH__)
49 #include <libkern/OSCacheControl.h>
50 #define __clear_cache mach_clear_cache
51 static void __clear_cache(void *start, void *end) {
52   size_t len = (char *)end - (char *)start;
53   sys_dcache_flush(start, len);
54   sys_icache_invalidate(start, len);
55 }
56 #endif
57
58 #define MAXBLOCK 4096
59 #define MAX_OUTPUT_BLOCK_SIZE 262144
60
61 struct regstat
62 {
63   signed char regmap_entry[HOST_REGS];
64   signed char regmap[HOST_REGS];
65   uint64_t was32;
66   uint64_t is32;
67   uint64_t wasdirty;
68   uint64_t dirty;
69   uint64_t u;
70   uint64_t uu;
71   u_int wasconst;
72   u_int isconst;
73   u_int loadedconst;             // host regs that have constants loaded
74   u_int waswritten;              // MIPS regs that were used as store base before
75 };
76
77 // note: asm depends on this layout
78 struct ll_entry
79 {
80   u_int vaddr;
81   u_int reg_sv_flags;
82   void *addr;
83   struct ll_entry *next;
84 };
85
86   // used by asm:
87   u_char *out;
88   u_int hash_table[65536][4]  __attribute__((aligned(16)));
89   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
90   struct ll_entry *jump_dirty[4096];
91
92   static struct ll_entry *jump_out[4096];
93   static u_int start;
94   static u_int *source;
95   static char insn[MAXBLOCK][10];
96   static u_char itype[MAXBLOCK];
97   static u_char opcode[MAXBLOCK];
98   static u_char opcode2[MAXBLOCK];
99   static u_char bt[MAXBLOCK];
100   static u_char rs1[MAXBLOCK];
101   static u_char rs2[MAXBLOCK];
102   static u_char rt1[MAXBLOCK];
103   static u_char rt2[MAXBLOCK];
104   static u_char us1[MAXBLOCK];
105   static u_char us2[MAXBLOCK];
106   static u_char dep1[MAXBLOCK];
107   static u_char dep2[MAXBLOCK];
108   static u_char lt1[MAXBLOCK];
109   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
110   static uint64_t gte_rt[MAXBLOCK];
111   static uint64_t gte_unneeded[MAXBLOCK];
112   static u_int smrv[32]; // speculated MIPS register values
113   static u_int smrv_strong; // mask of regs that are likely to have correct values
114   static u_int smrv_weak; // same, but somewhat less likely
115   static u_int smrv_strong_next; // same, but after current insn executes
116   static u_int smrv_weak_next;
117   static int imm[MAXBLOCK];
118   static u_int ba[MAXBLOCK];
119   static char likely[MAXBLOCK];
120   static char is_ds[MAXBLOCK];
121   static char ooo[MAXBLOCK];
122   static uint64_t unneeded_reg[MAXBLOCK];
123   static uint64_t unneeded_reg_upper[MAXBLOCK];
124   static uint64_t branch_unneeded_reg[MAXBLOCK];
125   static uint64_t branch_unneeded_reg_upper[MAXBLOCK];
126   static signed char regmap_pre[MAXBLOCK][HOST_REGS];
127   static uint64_t current_constmap[HOST_REGS];
128   static uint64_t constmap[MAXBLOCK][HOST_REGS];
129   static struct regstat regs[MAXBLOCK];
130   static struct regstat branch_regs[MAXBLOCK];
131   static signed char minimum_free_regs[MAXBLOCK];
132   static u_int needed_reg[MAXBLOCK];
133   static u_int wont_dirty[MAXBLOCK];
134   static u_int will_dirty[MAXBLOCK];
135   static int ccadj[MAXBLOCK];
136   static int slen;
137   static u_int instr_addr[MAXBLOCK];
138   static u_int link_addr[MAXBLOCK][3];
139   static int linkcount;
140   static u_int stubs[MAXBLOCK*3][8];
141   static int stubcount;
142   static u_int literals[1024][2];
143   static int literalcount;
144   static int is_delayslot;
145   static int cop1_usable;
146   static char shadow[1048576]  __attribute__((aligned(16)));
147   static void *copy;
148   static int expirep;
149   static u_int stop_after_jal;
150 #ifndef RAM_FIXED
151   static u_int ram_offset;
152 #else
153   static const u_int ram_offset=0;
154 #endif
155
156   int new_dynarec_hacks;
157   int new_dynarec_did_compile;
158   extern u_char restore_candidate[512];
159   extern int cycle_count;
160
161   /* registers that may be allocated */
162   /* 1-31 gpr */
163 #define HIREG 32 // hi
164 #define LOREG 33 // lo
165 #define FSREG 34 // FPU status (FCSR)
166 #define CSREG 35 // Coprocessor status
167 #define CCREG 36 // Cycle count
168 #define INVCP 37 // Pointer to invalid_code
169 //#define MMREG 38 // Pointer to memory_map
170 #define ROREG 39 // ram offset (if rdram!=0x80000000)
171 #define TEMPREG 40
172 #define FTEMP 40 // FPU temporary register
173 #define PTEMP 41 // Prefetch temporary register
174 //#define TLREG 42 // TLB mapping offset
175 #define RHASH 43 // Return address hash
176 #define RHTBL 44 // Return address hash table address
177 #define RTEMP 45 // JR/JALR address register
178 #define MAXREG 45
179 #define AGEN1 46 // Address generation temporary register
180 //#define AGEN2 47 // Address generation temporary register
181 //#define MGEN1 48 // Maptable address generation temporary register
182 //#define MGEN2 49 // Maptable address generation temporary register
183 #define BTREG 50 // Branch target temporary register
184
185   /* instruction types */
186 #define NOP 0     // No operation
187 #define LOAD 1    // Load
188 #define STORE 2   // Store
189 #define LOADLR 3  // Unaligned load
190 #define STORELR 4 // Unaligned store
191 #define MOV 5     // Move
192 #define ALU 6     // Arithmetic/logic
193 #define MULTDIV 7 // Multiply/divide
194 #define SHIFT 8   // Shift by register
195 #define SHIFTIMM 9// Shift by immediate
196 #define IMM16 10  // 16-bit immediate
197 #define RJUMP 11  // Unconditional jump to register
198 #define UJUMP 12  // Unconditional jump
199 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
200 #define SJUMP 14  // Conditional branch (regimm format)
201 #define COP0 15   // Coprocessor 0
202 #define COP1 16   // Coprocessor 1
203 #define C1LS 17   // Coprocessor 1 load/store
204 #define FJUMP 18  // Conditional branch (floating point)
205 #define FLOAT 19  // Floating point unit
206 #define FCONV 20  // Convert integer to float
207 #define FCOMP 21  // Floating point compare (sets FSREG)
208 #define SYSCALL 22// SYSCALL
209 #define OTHER 23  // Other
210 #define SPAN 24   // Branch/delay slot spans 2 pages
211 #define NI 25     // Not implemented
212 #define HLECALL 26// PCSX fake opcodes for HLE
213 #define COP2 27   // Coprocessor 2 move
214 #define C2LS 28   // Coprocessor 2 load/store
215 #define C2OP 29   // Coprocessor 2 operation
216 #define INTCALL 30// Call interpreter to handle rare corner cases
217
218   /* stubs */
219 #define CC_STUB 1
220 #define FP_STUB 2
221 #define LOADB_STUB 3
222 #define LOADH_STUB 4
223 #define LOADW_STUB 5
224 #define LOADD_STUB 6
225 #define LOADBU_STUB 7
226 #define LOADHU_STUB 8
227 #define STOREB_STUB 9
228 #define STOREH_STUB 10
229 #define STOREW_STUB 11
230 #define STORED_STUB 12
231 #define STORELR_STUB 13
232 #define INVCODE_STUB 14
233
234   /* branch codes */
235 #define TAKEN 1
236 #define NOTTAKEN 2
237 #define NULLDS 3
238
239 // asm linkage
240 int new_recompile_block(int addr);
241 void *get_addr_ht(u_int vaddr);
242 void invalidate_block(u_int block);
243 void invalidate_addr(u_int addr);
244 void remove_hash(int vaddr);
245 void dyna_linker();
246 void dyna_linker_ds();
247 void verify_code();
248 void verify_code_vm();
249 void verify_code_ds();
250 void cc_interrupt();
251 void fp_exception();
252 void fp_exception_ds();
253 void jump_syscall_hle();
254 void jump_hlecall();
255 void jump_intcall();
256 void new_dyna_leave();
257
258 // Needed by assembler
259 static void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
260 static void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
261 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
262 static void load_all_regs(signed char i_regmap[]);
263 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
264 static void load_regs_entry(int t);
265 static void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
266
267 static int verify_dirty(u_int *ptr);
268 static int get_final_value(int hr, int i, int *value);
269 static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e);
270 static void add_to_linker(int addr,int target,int ext);
271
272 static int tracedebug=0;
273
274 //#define DEBUG_CYCLE_COUNT 1
275
276 #define NO_CYCLE_PENALTY_THR 12
277
278 int cycle_multiplier; // 100 for 1.0
279
280 static int CLOCK_ADJUST(int x)
281 {
282   int s=(x>>31)|1;
283   return (x * cycle_multiplier + s * 50) / 100;
284 }
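// Worked example (illustrative), assuming cycle_multiplier==150, i.e. a 1.5x clock:
//   CLOCK_ADJUST(10)  -> (10*150 + 1*50)/100     =  1550/100 =  15
//   CLOCK_ADJUST(-10) -> (-10*150 + (-1)*50)/100 = -1550/100 = -15
// The s*50 term rounds the scaled cycle count to the nearest cycle, away from zero.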
285
286 static u_int get_page(u_int vaddr)
287 {
288   u_int page=vaddr&~0xe0000000;
289   if (page < 0x1000000)
290     page &= ~0x0e00000; // RAM mirrors
291   page>>=12;
292   if(page>2048) page=2048+(page&2047);
293   return page;
294 }
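// Example mappings (illustrative):
//   0x80031230 (RAM, KSEG0)  -> 0x00031230 -> mirror fold  -> 0x00031230 -> page 0x031
//   0xa0831230 (RAM mirror)  -> 0x00831230 -> &~0x0e00000  -> 0x00031230 -> page 0x031
//   0x9fc00180 (BIOS, KSEG0) -> 0x1fc00180 -> >>12=0x1fc00 -> page 2048+(0x1fc00&2047) = 3072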
295
296 // no virtual mem in PCSX
297 static u_int get_vpage(u_int vaddr)
298 {
299   return get_page(vaddr);
300 }
301
302 // Get address from virtual address
303 // This is called from the recompiled JR/JALR instructions
304 void *get_addr(u_int vaddr)
305 {
306   u_int page=get_page(vaddr);
307   u_int vpage=get_vpage(vaddr);
308   struct ll_entry *head;
309   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
310   head=jump_in[page];
311   while(head!=NULL) {
312     if(head->vaddr==vaddr) {
313   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
314       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
315       ht_bin[3]=ht_bin[1];
316       ht_bin[2]=ht_bin[0];
317       ht_bin[1]=(int)head->addr;
318       ht_bin[0]=vaddr;
319       return head->addr;
320     }
321     head=head->next;
322   }
323   head=jump_dirty[vpage];
324   while(head!=NULL) {
325     if(head->vaddr==vaddr) {
326       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
327       // Don't restore blocks which are about to expire from the cache
328       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
329       if(verify_dirty(head->addr)) {
330         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
331         invalid_code[vaddr>>12]=0;
332         inv_code_start=inv_code_end=~0;
333         if(vpage<2048) {
334           restore_candidate[vpage>>3]|=1<<(vpage&7);
335         }
336         else restore_candidate[page>>3]|=1<<(page&7);
337         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
338         if(ht_bin[0]==vaddr) {
339           ht_bin[1]=(int)head->addr; // Replace existing entry
340         }
341         else
342         {
343           ht_bin[3]=ht_bin[1];
344           ht_bin[2]=ht_bin[0];
345           ht_bin[1]=(int)head->addr;
346           ht_bin[0]=vaddr;
347         }
348         return head->addr;
349       }
350     }
351     head=head->next;
352   }
353   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
354   int r=new_recompile_block(vaddr);
355   if(r==0) return get_addr(vaddr);
356   // Executing in an unmapped page; generate a page fault exception
357   Status|=2;
358   Cause=(vaddr<<31)|0x8;
359   EPC=(vaddr&1)?vaddr-5:vaddr;
360   BadVAddr=(vaddr&~1);
361   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
362   EntryHi=BadVAddr&0xFFFFE000;
363   return get_addr_ht(0x80000000);
364 }
365 // Look up address in hash table first
366 void *get_addr_ht(u_int vaddr)
367 {
368   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
369   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
370   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
371   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
372   return get_addr(vaddr);
373 }
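// Hash table bin layout (illustrative sketch): each of the 65536 bins caches two
// {vaddr, compiled address} pairs, most recently used first:
//   ht_bin[0]=vaddr0, ht_bin[1]=addr0   (primary slot)
//   ht_bin[2]=vaddr1, ht_bin[3]=addr1   (victim slot)
// get_addr() installs a jump_in hit as the new primary pair and shifts the old
// primary pair into the victim slot.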
374
375 void clear_all_regs(signed char regmap[])
376 {
377   int hr;
378   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
379 }
380
381 signed char get_reg(signed char regmap[],int r)
382 {
383   int hr;
384   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
385   return -1;
386 }
387
388 // Find a register that is available for two consecutive cycles
389 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
390 {
391   int hr;
392   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
393   return -1;
394 }
395
396 int count_free_regs(signed char regmap[])
397 {
398   int count=0;
399   int hr;
400   for(hr=0;hr<HOST_REGS;hr++)
401   {
402     if(hr!=EXCLUDE_REG) {
403       if(regmap[hr]<0) count++;
404     }
405   }
406   return count;
407 }
408
409 void dirty_reg(struct regstat *cur,signed char reg)
410 {
411   int hr;
412   if(!reg) return;
413   for (hr=0;hr<HOST_REGS;hr++) {
414     if((cur->regmap[hr]&63)==reg) {
415       cur->dirty|=1<<hr;
416     }
417   }
418 }
419
420 // If we dirty the lower half of a 64-bit register which is now being
421 // sign-extended, we need to dump the upper half.
422 // Note: Do this only after completion of the instruction, because
423 // some instructions may need to read the full 64-bit value even if
424 // overwriting it (e.g. SLTI, DSRA32).
425 static void flush_dirty_uppers(struct regstat *cur)
426 {
427   int hr,reg;
428   for (hr=0;hr<HOST_REGS;hr++) {
429     if((cur->dirty>>hr)&1) {
430       reg=cur->regmap[hr];
431       if(reg>=64)
432         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
433     }
434   }
435 }
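// Example (illustrative): after a 32-bit op such as ADDU writes $v0, bit 2 of
// current->is32 is set; a host register still caching the stale upper half
// (regmap entry 2|64 == 66) is simply unmapped here rather than written back,
// since the upper word can be regenerated by sign-extending the low word.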
436
437 void set_const(struct regstat *cur,signed char reg,uint64_t value)
438 {
439   int hr;
440   if(!reg) return;
441   for (hr=0;hr<HOST_REGS;hr++) {
442     if(cur->regmap[hr]==reg) {
443       cur->isconst|=1<<hr;
444       current_constmap[hr]=value;
445     }
446     else if((cur->regmap[hr]^64)==reg) {
447       cur->isconst|=1<<hr;
448       current_constmap[hr]=value>>32;
449     }
450   }
451 }
452
453 void clear_const(struct regstat *cur,signed char reg)
454 {
455   int hr;
456   if(!reg) return;
457   for (hr=0;hr<HOST_REGS;hr++) {
458     if((cur->regmap[hr]&63)==reg) {
459       cur->isconst&=~(1<<hr);
460     }
461   }
462 }
463
464 int is_const(struct regstat *cur,signed char reg)
465 {
466   int hr;
467   if(reg<0) return 0;
468   if(!reg) return 1;
469   for (hr=0;hr<HOST_REGS;hr++) {
470     if((cur->regmap[hr]&63)==reg) {
471       return (cur->isconst>>hr)&1;
472     }
473   }
474   return 0;
475 }
476 uint64_t get_const(struct regstat *cur,signed char reg)
477 {
478   int hr;
479   if(!reg) return 0;
480   for (hr=0;hr<HOST_REGS;hr++) {
481     if(cur->regmap[hr]==reg) {
482       return current_constmap[hr];
483     }
484   }
485   SysPrintf("Unknown constant in r%d\n",reg);
486   exit(1);
487 }
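// Constant propagation sketch (illustrative): after "lui $t0,0x1f80" followed by
// "ori $t0,$t0,0x1010" the allocator can track $t0 purely as a constant:
//   set_const(cur, 8, 0x1f800000);                     // LUI
//   if (is_const(cur, 8))
//     set_const(cur, 8, get_const(cur, 8) | 0x1010);   // ORI folds to 0x1f801010
// so no immediate needs to be materialized until the value is actually used.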
488
489 // Least soon needed registers
490 // Look at the next ten instructions and see which registers
491 // will be used.  Try not to reallocate these.
492 void lsn(u_char hsn[], int i, int *preferred_reg)
493 {
494   int j;
495   int b=-1;
496   for(j=0;j<9;j++)
497   {
498     if(i+j>=slen) {
499       j=slen-i-1;
500       break;
501     }
502     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
503     {
504       // Don't go past an unconditional jump
505       j++;
506       break;
507     }
508   }
509   for(;j>=0;j--)
510   {
511     if(rs1[i+j]) hsn[rs1[i+j]]=j;
512     if(rs2[i+j]) hsn[rs2[i+j]]=j;
513     if(rt1[i+j]) hsn[rt1[i+j]]=j;
514     if(rt2[i+j]) hsn[rt2[i+j]]=j;
515     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
516       // Stores can allocate zero
517       hsn[rs1[i+j]]=j;
518       hsn[rs2[i+j]]=j;
519     }
520     // On some architectures stores need invc_ptr
521     #if defined(HOST_IMM8)
522     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
523       hsn[INVCP]=j;
524     }
525     #endif
526     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
527     {
528       hsn[CCREG]=j;
529       b=j;
530     }
531   }
532   if(b>=0)
533   {
534     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
535     {
536       // Follow first branch
537       int t=(ba[i+b]-start)>>2;
538       j=7-b;if(t+j>=slen) j=slen-t-1;
539       for(;j>=0;j--)
540       {
541         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
542         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
543         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
544         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
545       }
546     }
547     // TODO: preferred register based on backward branch
548   }
549   // Delay slot should preferably not overwrite branch conditions or cycle count
550   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
551     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
552     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
553     hsn[CCREG]=1;
554     // ...or hash tables
555     hsn[RHASH]=1;
556     hsn[RHTBL]=1;
557   }
558   // Coprocessor load/store needs FTEMP, even if not declared
559   if(itype[i]==C1LS||itype[i]==C2LS) {
560     hsn[FTEMP]=0;
561   }
562   // Load L/R also uses FTEMP as a temporary register
563   if(itype[i]==LOADLR) {
564     hsn[FTEMP]=0;
565   }
566   // Also SWL/SWR/SDL/SDR
567   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
568     hsn[FTEMP]=0;
569   }
570   // Don't remove the miniht registers
571   if(itype[i]==UJUMP||itype[i]==RJUMP)
572   {
573     hsn[RHASH]=0;
574     hsn[RHTBL]=0;
575   }
576 }
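// Usage sketch (illustrative): hsn[r] ends up holding the distance, in
// instructions, to the next use of MIPS register r (capped around 10); the
// per-arch register allocator typically fills it with something like
//   u_char hsn[MAXREG+1];
//   memset(hsn, 10, sizeof(hsn));
//   lsn(hsn, i, &preferred_reg);
// and then prefers to evict the host register whose hsn value is largest.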
577
578 // We only want to allocate registers if we're going to use them again soon
579 int needed_again(int r, int i)
580 {
581   int j;
582   int b=-1;
583   int rn=10;
584
585   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
586   {
587     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
588       return 0; // Don't need any registers if exiting the block
589   }
590   for(j=0;j<9;j++)
591   {
592     if(i+j>=slen) {
593       j=slen-i-1;
594       break;
595     }
596     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
597     {
598       // Don't go past an unconditional jump
599       j++;
600       break;
601     }
602     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
603     {
604       break;
605     }
606   }
607   for(;j>=1;j--)
608   {
609     if(rs1[i+j]==r) rn=j;
610     if(rs2[i+j]==r) rn=j;
611     if((unneeded_reg[i+j]>>r)&1) rn=10;
612     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
613     {
614       b=j;
615     }
616   }
617   /*
618   if(b>=0)
619   {
620     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
621     {
622       // Follow first branch
623       int o=rn;
624       int t=(ba[i+b]-start)>>2;
625       j=7-b;if(t+j>=slen) j=slen-t-1;
626       for(;j>=0;j--)
627       {
628         if(!((unneeded_reg[t+j]>>r)&1)) {
629           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
630           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
631         }
632         else rn=o;
633       }
634     }
635   }*/
636   if(rn<10) return 1;
637   return 0;
638 }
639
640 // Try to match register allocations at the end of a loop with those
641 // at the beginning
642 int loop_reg(int i, int r, int hr)
643 {
644   int j,k;
645   for(j=0;j<9;j++)
646   {
647     if(i+j>=slen) {
648       j=slen-i-1;
649       break;
650     }
651     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
652     {
653       // Don't go past an unconditional jump
654       j++;
655       break;
656     }
657   }
658   k=0;
659   if(i>0){
660     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
661       k--;
662   }
663   for(;k<j;k++)
664   {
665     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
666     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
667     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
668     {
669       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
670       {
671         int t=(ba[i+k]-start)>>2;
672         int reg=get_reg(regs[t].regmap_entry,r);
673         if(reg>=0) return reg;
674         //reg=get_reg(regs[t+1].regmap_entry,r);
675         //if(reg>=0) return reg;
676       }
677     }
678   }
679   return hr;
680 }
681
682
683 // Allocate every register, preserving source/target regs
684 void alloc_all(struct regstat *cur,int i)
685 {
686   int hr;
687
688   for(hr=0;hr<HOST_REGS;hr++) {
689     if(hr!=EXCLUDE_REG) {
690       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
691          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
692       {
693         cur->regmap[hr]=-1;
694         cur->dirty&=~(1<<hr);
695       }
696       // Don't need zeros
697       if((cur->regmap[hr]&63)==0)
698       {
699         cur->regmap[hr]=-1;
700         cur->dirty&=~(1<<hr);
701       }
702     }
703   }
704 }
705
706 #ifdef __i386__
707 #include "assem_x86.c"
708 #endif
709 #ifdef __x86_64__
710 #include "assem_x64.c"
711 #endif
712 #ifdef __arm__
713 #include "assem_arm.c"
714 #endif
715
716 // Add virtual address mapping to linked list
717 void ll_add(struct ll_entry **head,int vaddr,void *addr)
718 {
719   struct ll_entry *new_entry;
720   new_entry=malloc(sizeof(struct ll_entry));
721   assert(new_entry!=NULL);
722   new_entry->vaddr=vaddr;
723   new_entry->reg_sv_flags=0;
724   new_entry->addr=addr;
725   new_entry->next=*head;
726   *head=new_entry;
727 }
728
729 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
730 {
731   ll_add(head,vaddr,addr);
732   (*head)->reg_sv_flags=reg_sv_flags;
733 }
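// Lookup structure sketch (illustrative):
//   jump_in[page]    - compiled entry points whose vaddr falls in that 4K page (newest first)
//   jump_out[page]   - locations in other blocks that were linked to addresses in that page
//   jump_dirty[page] - every block compiled from that page, kept so it can be
//                      re-verified and restored after an invalidation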
734
735 // Check if an address is already compiled
736 // but don't return addresses which are about to expire from the cache
737 void *check_addr(u_int vaddr)
738 {
739   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
740   if(ht_bin[0]==vaddr) {
741     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
742       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
743   }
744   if(ht_bin[2]==vaddr) {
745     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
746       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
747   }
748   u_int page=get_page(vaddr);
749   struct ll_entry *head;
750   head=jump_in[page];
751   while(head!=NULL) {
752     if(head->vaddr==vaddr) {
753       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
754         // Update existing entry with current address
755         if(ht_bin[0]==vaddr) {
756           ht_bin[1]=(int)head->addr;
757           return head->addr;
758         }
759         if(ht_bin[2]==vaddr) {
760           ht_bin[3]=(int)head->addr;
761           return head->addr;
762         }
763         // Insert into hash table with low priority.
764         // Don't evict existing entries, as they are probably
765         // addresses that are being accessed frequently.
766         if(ht_bin[0]==-1) {
767           ht_bin[1]=(int)head->addr;
768           ht_bin[0]=vaddr;
769         }else if(ht_bin[2]==-1) {
770           ht_bin[3]=(int)head->addr;
771           ht_bin[2]=vaddr;
772         }
773         return head->addr;
774       }
775     }
776     head=head->next;
777   }
778   return 0;
779 }
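// Expiry-window check (illustrative): the translation cache is treated as a
// circular buffer of 1<<TARGET_SIZE_2 bytes with 'out' as the write pointer.
// Shifting the (addr - out) distance left by 32-TARGET_SIZE_2 reduces it modulo
// the cache size, so the comparison rejects blocks sitting in roughly the first
// 3/8 of the cache just past 'out' (plus one maximum block), i.e. blocks the
// expiry sweep is about to reclaim.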
780
781 void remove_hash(int vaddr)
782 {
783   //printf("remove hash: %x\n",vaddr);
784   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
785   if(ht_bin[2]==vaddr) {
786     ht_bin[2]=ht_bin[3]=-1;
787   }
788   if(ht_bin[0]==vaddr) {
789     ht_bin[0]=ht_bin[2];
790     ht_bin[1]=ht_bin[3];
791     ht_bin[2]=ht_bin[3]=-1;
792   }
793 }
794
795 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
796 {
797   struct ll_entry *next;
798   while(*head) {
799     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) ||
800        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
801     {
802       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
803       remove_hash((*head)->vaddr);
804       next=(*head)->next;
805       free(*head);
806       *head=next;
807     }
808     else
809     {
810       head=&((*head)->next);
811     }
812   }
813 }
814
815 // Remove all entries from linked list
816 void ll_clear(struct ll_entry **head)
817 {
818   struct ll_entry *cur;
819   struct ll_entry *next;
818   if((cur=*head)) {
821     *head=0;
822     while(cur) {
823       next=cur->next;
824       free(cur);
825       cur=next;
826     }
827   }
828 }
829
830 // Dereference the pointers and kill them if they match the address range
831 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
832 {
833   while(head) {
834     int ptr=get_pointer(head->addr);
835     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
836     if(((ptr>>shift)==(addr>>shift)) ||
837        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
838     {
839       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
840       u_int host_addr=(u_int)kill_pointer(head->addr);
841       #ifdef __arm__
842         needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
843       #endif
844     }
845     head=head->next;
846   }
847 }
848
849 // This is called when we write to a compiled block (see do_invstub)
850 void invalidate_page(u_int page)
851 {
852   struct ll_entry *head;
853   struct ll_entry *next;
854   head=jump_in[page];
855   jump_in[page]=0;
856   while(head!=NULL) {
857     inv_debug("INVALIDATE: %x\n",head->vaddr);
858     remove_hash(head->vaddr);
859     next=head->next;
860     free(head);
861     head=next;
862   }
863   head=jump_out[page];
864   jump_out[page]=0;
865   while(head!=NULL) {
866     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
867     u_int host_addr=(u_int)kill_pointer(head->addr);
868     #ifdef __arm__
869       needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
870     #endif
871     next=head->next;
872     free(head);
873     head=next;
874   }
875 }
876
877 static void invalidate_block_range(u_int block, u_int first, u_int last)
878 {
879   u_int page=get_page(block<<12);
880   //printf("first=%d last=%d\n",first,last);
881   invalidate_page(page);
882   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
883   assert(last<page+5);
884   // Invalidate the adjacent pages if a block crosses a 4K boundary
885   while(first<page) {
886     invalidate_page(first);
887     first++;
888   }
889   for(first=page+1;first<last;first++) {
890     invalidate_page(first);
891   }
892   #ifdef __arm__
893     do_clear_cache();
894   #endif
895
896   // Don't trap writes
897   invalid_code[block]=1;
898
899   #ifdef USE_MINI_HT
900   memset(mini_ht,-1,sizeof(mini_ht));
901   #endif
902 }
903
904 void invalidate_block(u_int block)
905 {
906   u_int page=get_page(block<<12);
907   u_int vpage=get_vpage(block<<12);
908   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
909   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
910   u_int first,last;
911   first=last=page;
912   struct ll_entry *head;
913   head=jump_dirty[vpage];
914   //printf("page=%d vpage=%d\n",page,vpage);
915   while(head!=NULL) {
916     u_int start,end;
917     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
918       get_bounds((int)head->addr,&start,&end);
919       //printf("start: %x end: %x\n",start,end);
920       if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE) {
921         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
922           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
923           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
924         }
925       }
926     }
927     head=head->next;
928   }
929   invalidate_block_range(block,first,last);
930 }
931
932 void invalidate_addr(u_int addr)
933 {
934   //static int rhits;
935   // this check is done by the caller
936   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
937   u_int page=get_vpage(addr);
938   if(page<2048) { // RAM
939     struct ll_entry *head;
940     u_int addr_min=~0, addr_max=0;
941     u_int mask=RAM_SIZE-1;
942     u_int addr_main=0x80000000|(addr&mask);
943     int pg1;
944     inv_code_start=addr_main&~0xfff;
945     inv_code_end=addr_main|0xfff;
946     pg1=page;
947     if (pg1>0) {
948       // must check the previous page too because blocks can span pages
949       pg1--;
950       inv_code_start-=0x1000;
951     }
952     for(;pg1<=page;pg1++) {
953       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
954         u_int start,end;
955         get_bounds((int)head->addr,&start,&end);
956         if(ram_offset) {
957           start-=ram_offset;
958           end-=ram_offset;
959         }
960         if(start<=addr_main&&addr_main<end) {
961           if(start<addr_min) addr_min=start;
962           if(end>addr_max) addr_max=end;
963         }
964         else if(addr_main<start) {
965           if(start<inv_code_end)
966             inv_code_end=start-1;
967         }
968         else {
969           if(end>inv_code_start)
970             inv_code_start=end;
971         }
972       }
973     }
974     if (addr_min!=~0) {
975       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
976       inv_code_start=inv_code_end=~0;
977       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
978       return;
979     }
980     else {
981       inv_code_start=(addr&~mask)|(inv_code_start&mask);
982       inv_code_end=(addr&~mask)|(inv_code_end&mask);
983       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
984       return;
985     }
986   }
987   invalidate_block(addr>>12);
988 }
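// Worked example (illustrative), assuming no compiled blocks near the address:
// a write to 0x80041230 takes the miss path and caches the range
//   inv_code_start=0x80040000, inv_code_end=0x80041fff
// (the previous page is included because a block may span pages), letting the
// caller skip further lookups for writes that land inside this range.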
989
990 // This is called when loading a save state.
991 // Anything could have changed, so invalidate everything.
992 void invalidate_all_pages()
993 {
994   u_int page,n;
995   for(page=0;page<4096;page++)
996     invalidate_page(page);
997   for(page=0;page<1048576;page++)
998     if(!invalid_code[page]) {
999       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1000       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1001     }
1002   #ifdef __arm__
1003   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1004   #endif
1005   #ifdef USE_MINI_HT
1006   memset(mini_ht,-1,sizeof(mini_ht));
1007   #endif
1008 }
1009
1010 // Add an entry to jump_out after making a link
1011 void add_link(u_int vaddr,void *src)
1012 {
1013   u_int page=get_page(vaddr);
1014   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1015   int *ptr=(int *)(src+4);
1016   assert((*ptr&0x0fff0000)==0x059f0000);
1017   ll_add(jump_out+page,vaddr,src);
1018   //int ptr=get_pointer(src);
1019   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1020 }
1021
1022 // If a code block was found to be unmodified (bit was set in
1023 // restore_candidate) and it remains unmodified (bit is clear
1024 // in invalid_code) then move the entries for that 4K page from
1025 // the dirty list to the clean list.
1026 void clean_blocks(u_int page)
1027 {
1028   struct ll_entry *head;
1029   inv_debug("INV: clean_blocks page=%d\n",page);
1030   head=jump_dirty[page];
1031   while(head!=NULL) {
1032     if(!invalid_code[head->vaddr>>12]) {
1033       // Don't restore blocks which are about to expire from the cache
1034       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1035         u_int start,end;
1036         if(verify_dirty((int)head->addr)) {
1037           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1038           u_int i;
1039           u_int inv=0;
1040           get_bounds((int)head->addr,&start,&end);
1041           if(start-(u_int)rdram<RAM_SIZE) {
1042             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1043               inv|=invalid_code[i];
1044             }
1045           }
1046           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1047             inv=1;
1048           }
1049           if(!inv) {
1050             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1051             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1052               u_int ppage=page;
1053               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1054               //printf("page=%x, addr=%x\n",page,head->vaddr);
1055               //assert(head->vaddr>>12==(page|0x80000));
1056               ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
1057               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1058               if(ht_bin[0]==head->vaddr) {
1059                 ht_bin[1]=(int)clean_addr; // Replace existing entry
1060               }
1061               if(ht_bin[2]==head->vaddr) {
1062                 ht_bin[3]=(int)clean_addr; // Replace existing entry
1063               }
1064             }
1065           }
1066         }
1067       }
1068     }
1069     head=head->next;
1070   }
1071 }
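// Block lifecycle sketch (illustrative):
//   invalidate_*()  - RAM written: jump_in entries freed, invalid_code[page]=1
//   get_addr()      - a jump_dirty block that passes verify_dirty() clears
//                     invalid_code and sets its restore_candidate bit
//   clean_blocks()  - if the page then stays unmodified, the block is relinked
//                     on jump_in instead of being recompiled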
1072
1073
1074 void mov_alloc(struct regstat *current,int i)
1075 {
1076   // Note: Don't need to actually alloc the source registers
1077   if((~current->is32>>rs1[i])&1) {
1078     //alloc_reg64(current,i,rs1[i]);
1079     alloc_reg64(current,i,rt1[i]);
1080     current->is32&=~(1LL<<rt1[i]);
1081   } else {
1082     //alloc_reg(current,i,rs1[i]);
1083     alloc_reg(current,i,rt1[i]);
1084     current->is32|=(1LL<<rt1[i]);
1085   }
1086   clear_const(current,rs1[i]);
1087   clear_const(current,rt1[i]);
1088   dirty_reg(current,rt1[i]);
1089 }
1090
1091 void shiftimm_alloc(struct regstat *current,int i)
1092 {
1093   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1094   {
1095     if(rt1[i]) {
1096       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1097       else lt1[i]=rs1[i];
1098       alloc_reg(current,i,rt1[i]);
1099       current->is32|=1LL<<rt1[i];
1100       dirty_reg(current,rt1[i]);
1101       if(is_const(current,rs1[i])) {
1102         int v=get_const(current,rs1[i]);
1103         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1104         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1105         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1106       }
1107       else clear_const(current,rt1[i]);
1108     }
1109   }
1110   else
1111   {
1112     clear_const(current,rs1[i]);
1113     clear_const(current,rt1[i]);
1114   }
1115
1116   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1117   {
1118     if(rt1[i]) {
1119       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1120       alloc_reg64(current,i,rt1[i]);
1121       current->is32&=~(1LL<<rt1[i]);
1122       dirty_reg(current,rt1[i]);
1123     }
1124   }
1125   if(opcode2[i]==0x3c) // DSLL32
1126   {
1127     if(rt1[i]) {
1128       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1129       alloc_reg64(current,i,rt1[i]);
1130       current->is32&=~(1LL<<rt1[i]);
1131       dirty_reg(current,rt1[i]);
1132     }
1133   }
1134   if(opcode2[i]==0x3e) // DSRL32
1135   {
1136     if(rt1[i]) {
1137       alloc_reg64(current,i,rs1[i]);
1138       if(imm[i]==32) {
1139         alloc_reg64(current,i,rt1[i]);
1140         current->is32&=~(1LL<<rt1[i]);
1141       } else {
1142         alloc_reg(current,i,rt1[i]);
1143         current->is32|=1LL<<rt1[i];
1144       }
1145       dirty_reg(current,rt1[i]);
1146     }
1147   }
1148   if(opcode2[i]==0x3f) // DSRA32
1149   {
1150     if(rt1[i]) {
1151       alloc_reg64(current,i,rs1[i]);
1152       alloc_reg(current,i,rt1[i]);
1153       current->is32|=1LL<<rt1[i];
1154       dirty_reg(current,rt1[i]);
1155     }
1156   }
1157 }
1158
1159 void shift_alloc(struct regstat *current,int i)
1160 {
1161   if(rt1[i]) {
1162     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1163     {
1164       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1165       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1166       alloc_reg(current,i,rt1[i]);
1167       if(rt1[i]==rs2[i]) {
1168         alloc_reg_temp(current,i,-1);
1169         minimum_free_regs[i]=1;
1170       }
1171       current->is32|=1LL<<rt1[i];
1172     } else { // DSLLV/DSRLV/DSRAV
1173       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1174       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1175       alloc_reg64(current,i,rt1[i]);
1176       current->is32&=~(1LL<<rt1[i]);
1177       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1178       {
1179         alloc_reg_temp(current,i,-1);
1180         minimum_free_regs[i]=1;
1181       }
1182     }
1183     clear_const(current,rs1[i]);
1184     clear_const(current,rs2[i]);
1185     clear_const(current,rt1[i]);
1186     dirty_reg(current,rt1[i]);
1187   }
1188 }
1189
1190 void alu_alloc(struct regstat *current,int i)
1191 {
1192   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1193     if(rt1[i]) {
1194       if(rs1[i]&&rs2[i]) {
1195         alloc_reg(current,i,rs1[i]);
1196         alloc_reg(current,i,rs2[i]);
1197       }
1198       else {
1199         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1200         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1201       }
1202       alloc_reg(current,i,rt1[i]);
1203     }
1204     current->is32|=1LL<<rt1[i];
1205   }
1206   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1207     if(rt1[i]) {
1208       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1209       {
1210         alloc_reg64(current,i,rs1[i]);
1211         alloc_reg64(current,i,rs2[i]);
1212         alloc_reg(current,i,rt1[i]);
1213       } else {
1214         alloc_reg(current,i,rs1[i]);
1215         alloc_reg(current,i,rs2[i]);
1216         alloc_reg(current,i,rt1[i]);
1217       }
1218     }
1219     current->is32|=1LL<<rt1[i];
1220   }
1221   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1222     if(rt1[i]) {
1223       if(rs1[i]&&rs2[i]) {
1224         alloc_reg(current,i,rs1[i]);
1225         alloc_reg(current,i,rs2[i]);
1226       }
1227       else
1228       {
1229         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1230         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1231       }
1232       alloc_reg(current,i,rt1[i]);
1233       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1234       {
1235         if(!((current->uu>>rt1[i])&1)) {
1236           alloc_reg64(current,i,rt1[i]);
1237         }
1238         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1239           if(rs1[i]&&rs2[i]) {
1240             alloc_reg64(current,i,rs1[i]);
1241             alloc_reg64(current,i,rs2[i]);
1242           }
1243           else
1244           {
1245             // Is it really worth it to keep 64-bit values in registers?
1246             #ifdef NATIVE_64BIT
1247             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1248             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1249             #endif
1250           }
1251         }
1252         current->is32&=~(1LL<<rt1[i]);
1253       } else {
1254         current->is32|=1LL<<rt1[i];
1255       }
1256     }
1257   }
1258   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1259     if(rt1[i]) {
1260       if(rs1[i]&&rs2[i]) {
1261         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1262           alloc_reg64(current,i,rs1[i]);
1263           alloc_reg64(current,i,rs2[i]);
1264           alloc_reg64(current,i,rt1[i]);
1265         } else {
1266           alloc_reg(current,i,rs1[i]);
1267           alloc_reg(current,i,rs2[i]);
1268           alloc_reg(current,i,rt1[i]);
1269         }
1270       }
1271       else {
1272         alloc_reg(current,i,rt1[i]);
1273         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1274           // DADD used as move, or zeroing
1275           // If we have a 64-bit source, then make the target 64 bits too
1276           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1277             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1278             alloc_reg64(current,i,rt1[i]);
1279           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1280             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1281             alloc_reg64(current,i,rt1[i]);
1282           }
1283           if(opcode2[i]>=0x2e&&rs2[i]) {
1284             // DSUB used as negation - 64-bit result
1285             // If we have a 32-bit register, extend it to 64 bits
1286             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1287             alloc_reg64(current,i,rt1[i]);
1288           }
1289         }
1290       }
1291       if(rs1[i]&&rs2[i]) {
1292         current->is32&=~(1LL<<rt1[i]);
1293       } else if(rs1[i]) {
1294         current->is32&=~(1LL<<rt1[i]);
1295         if((current->is32>>rs1[i])&1)
1296           current->is32|=1LL<<rt1[i];
1297       } else if(rs2[i]) {
1298         current->is32&=~(1LL<<rt1[i]);
1299         if((current->is32>>rs2[i])&1)
1300           current->is32|=1LL<<rt1[i];
1301       } else {
1302         current->is32|=1LL<<rt1[i];
1303       }
1304     }
1305   }
1306   clear_const(current,rs1[i]);
1307   clear_const(current,rs2[i]);
1308   clear_const(current,rt1[i]);
1309   dirty_reg(current,rt1[i]);
1310 }
1311
1312 void imm16_alloc(struct regstat *current,int i)
1313 {
1314   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1315   else lt1[i]=rs1[i];
1316   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1317   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1318     current->is32&=~(1LL<<rt1[i]);
1319     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1320       // TODO: Could preserve the 32-bit flag if the immediate is zero
1321       alloc_reg64(current,i,rt1[i]);
1322       alloc_reg64(current,i,rs1[i]);
1323     }
1324     clear_const(current,rs1[i]);
1325     clear_const(current,rt1[i]);
1326   }
1327   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1328     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1329     current->is32|=1LL<<rt1[i];
1330     clear_const(current,rs1[i]);
1331     clear_const(current,rt1[i]);
1332   }
1333   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1334     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1335       if(rs1[i]!=rt1[i]) {
1336         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1337         alloc_reg64(current,i,rt1[i]);
1338         current->is32&=~(1LL<<rt1[i]);
1339       }
1340     }
1341     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1342     if(is_const(current,rs1[i])) {
1343       int v=get_const(current,rs1[i]);
1344       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1345       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1346       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1347     }
1348     else clear_const(current,rt1[i]);
1349   }
1350   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1351     if(is_const(current,rs1[i])) {
1352       int v=get_const(current,rs1[i]);
1353       set_const(current,rt1[i],v+imm[i]);
1354     }
1355     else clear_const(current,rt1[i]);
1356     current->is32|=1LL<<rt1[i];
1357   }
1358   else {
1359     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1360     current->is32|=1LL<<rt1[i];
1361   }
1362   dirty_reg(current,rt1[i]);
1363 }
1364
1365 void load_alloc(struct regstat *current,int i)
1366 {
1367   clear_const(current,rt1[i]);
1368   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1369   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1370   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1371   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1372     alloc_reg(current,i,rt1[i]);
1373     assert(get_reg(current->regmap,rt1[i])>=0);
1374     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1375     {
1376       current->is32&=~(1LL<<rt1[i]);
1377       alloc_reg64(current,i,rt1[i]);
1378     }
1379     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1380     {
1381       current->is32&=~(1LL<<rt1[i]);
1382       alloc_reg64(current,i,rt1[i]);
1383       alloc_all(current,i);
1384       alloc_reg64(current,i,FTEMP);
1385       minimum_free_regs[i]=HOST_REGS;
1386     }
1387     else current->is32|=1LL<<rt1[i];
1388     dirty_reg(current,rt1[i]);
1389     // LWL/LWR need a temporary register for the old value
1390     if(opcode[i]==0x22||opcode[i]==0x26)
1391     {
1392       alloc_reg(current,i,FTEMP);
1393       alloc_reg_temp(current,i,-1);
1394       minimum_free_regs[i]=1;
1395     }
1396   }
1397   else
1398   {
1399     // Load to r0 or unneeded register (dummy load)
1400     // but we still need a register to calculate the address
1401     if(opcode[i]==0x22||opcode[i]==0x26)
1402     {
1403       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1404     }
1405     alloc_reg_temp(current,i,-1);
1406     minimum_free_regs[i]=1;
1407     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1408     {
1409       alloc_all(current,i);
1410       alloc_reg64(current,i,FTEMP);
1411       minimum_free_regs[i]=HOST_REGS;
1412     }
1413   }
1414 }
1415
1416 void store_alloc(struct regstat *current,int i)
1417 {
1418   clear_const(current,rs2[i]);
1419   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1420   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1421   alloc_reg(current,i,rs2[i]);
1422   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1423     alloc_reg64(current,i,rs2[i]);
1424     if(rs2[i]) alloc_reg(current,i,FTEMP);
1425   }
1426   #if defined(HOST_IMM8)
1427   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1428   else alloc_reg(current,i,INVCP);
1429   #endif
1430   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1431     alloc_reg(current,i,FTEMP);
1432   }
1433   // We need a temporary register for address generation
1434   alloc_reg_temp(current,i,-1);
1435   minimum_free_regs[i]=1;
1436 }
1437
1438 void c1ls_alloc(struct regstat *current,int i)
1439 {
1440   //clear_const(current,rs1[i]); // FIXME
1441   clear_const(current,rt1[i]);
1442   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1443   alloc_reg(current,i,CSREG); // Status
1444   alloc_reg(current,i,FTEMP);
1445   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1446     alloc_reg64(current,i,FTEMP);
1447   }
1448   #if defined(HOST_IMM8)
1449   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1450   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1451     alloc_reg(current,i,INVCP);
1452   #endif
1453   // We need a temporary register for address generation
1454   alloc_reg_temp(current,i,-1);
1455 }
1456
1457 void c2ls_alloc(struct regstat *current,int i)
1458 {
1459   clear_const(current,rt1[i]);
1460   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1461   alloc_reg(current,i,FTEMP);
1462   #if defined(HOST_IMM8)
1463   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1464   if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1465     alloc_reg(current,i,INVCP);
1466   #endif
1467   // We need a temporary register for address generation
1468   alloc_reg_temp(current,i,-1);
1469   minimum_free_regs[i]=1;
1470 }
1471
1472 #ifndef multdiv_alloc
1473 void multdiv_alloc(struct regstat *current,int i)
1474 {
1475   //  case 0x18: MULT
1476   //  case 0x19: MULTU
1477   //  case 0x1A: DIV
1478   //  case 0x1B: DIVU
1479   //  case 0x1C: DMULT
1480   //  case 0x1D: DMULTU
1481   //  case 0x1E: DDIV
1482   //  case 0x1F: DDIVU
1483   clear_const(current,rs1[i]);
1484   clear_const(current,rs2[i]);
1485   if(rs1[i]&&rs2[i])
1486   {
1487     if((opcode2[i]&4)==0) // 32-bit
1488     {
1489       current->u&=~(1LL<<HIREG);
1490       current->u&=~(1LL<<LOREG);
1491       alloc_reg(current,i,HIREG);
1492       alloc_reg(current,i,LOREG);
1493       alloc_reg(current,i,rs1[i]);
1494       alloc_reg(current,i,rs2[i]);
1495       current->is32|=1LL<<HIREG;
1496       current->is32|=1LL<<LOREG;
1497       dirty_reg(current,HIREG);
1498       dirty_reg(current,LOREG);
1499     }
1500     else // 64-bit
1501     {
1502       current->u&=~(1LL<<HIREG);
1503       current->u&=~(1LL<<LOREG);
1504       current->uu&=~(1LL<<HIREG);
1505       current->uu&=~(1LL<<LOREG);
1506       alloc_reg64(current,i,HIREG);
1507       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1508       alloc_reg64(current,i,rs1[i]);
1509       alloc_reg64(current,i,rs2[i]);
1510       alloc_all(current,i);
1511       current->is32&=~(1LL<<HIREG);
1512       current->is32&=~(1LL<<LOREG);
1513       dirty_reg(current,HIREG);
1514       dirty_reg(current,LOREG);
1515       minimum_free_regs[i]=HOST_REGS;
1516     }
1517   }
1518   else
1519   {
1520     // Multiply by zero is zero.
1521     // MIPS does not have a divide by zero exception.
1522     // The result is undefined, so we return zero.
1523     alloc_reg(current,i,HIREG);
1524     alloc_reg(current,i,LOREG);
1525     current->is32|=1LL<<HIREG;
1526     current->is32|=1LL<<LOREG;
1527     dirty_reg(current,HIREG);
1528     dirty_reg(current,LOREG);
1529   }
1530 }
1531 #endif
1532
1533 void cop0_alloc(struct regstat *current,int i)
1534 {
1535   if(opcode2[i]==0) // MFC0
1536   {
1537     if(rt1[i]) {
1538       clear_const(current,rt1[i]);
1539       alloc_all(current,i);
1540       alloc_reg(current,i,rt1[i]);
1541       current->is32|=1LL<<rt1[i];
1542       dirty_reg(current,rt1[i]);
1543     }
1544   }
1545   else if(opcode2[i]==4) // MTC0
1546   {
1547     if(rs1[i]){
1548       clear_const(current,rs1[i]);
1549       alloc_reg(current,i,rs1[i]);
1550       alloc_all(current,i);
1551     }
1552     else {
1553       alloc_all(current,i); // FIXME: Keep r0
1554       current->u&=~1LL;
1555       alloc_reg(current,i,0);
1556     }
1557   }
1558   else
1559   {
1560     // TLBR/TLBWI/TLBWR/TLBP/ERET
1561     assert(opcode2[i]==0x10);
1562     alloc_all(current,i);
1563   }
1564   minimum_free_regs[i]=HOST_REGS;
1565 }
1566
1567 void cop1_alloc(struct regstat *current,int i)
1568 {
1569   alloc_reg(current,i,CSREG); // Load status
1570   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1571   {
1572     if(rt1[i]){
1573       clear_const(current,rt1[i]);
1574       if(opcode2[i]==1) {
1575         alloc_reg64(current,i,rt1[i]); // DMFC1
1576         current->is32&=~(1LL<<rt1[i]);
1577       }else{
1578         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1579         current->is32|=1LL<<rt1[i];
1580       }
1581       dirty_reg(current,rt1[i]);
1582     }
1583     alloc_reg_temp(current,i,-1);
1584   }
1585   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1586   {
1587     if(rs1[i]){
1588       clear_const(current,rs1[i]);
1589       if(opcode2[i]==5)
1590         alloc_reg64(current,i,rs1[i]); // DMTC1
1591       else
1592         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1593       alloc_reg_temp(current,i,-1);
1594     }
1595     else {
1596       current->u&=~1LL;
1597       alloc_reg(current,i,0);
1598       alloc_reg_temp(current,i,-1);
1599     }
1600   }
1601   minimum_free_regs[i]=1;
1602 }
1603 void fconv_alloc(struct regstat *current,int i)
1604 {
1605   alloc_reg(current,i,CSREG); // Load status
1606   alloc_reg_temp(current,i,-1);
1607   minimum_free_regs[i]=1;
1608 }
1609 void float_alloc(struct regstat *current,int i)
1610 {
1611   alloc_reg(current,i,CSREG); // Load status
1612   alloc_reg_temp(current,i,-1);
1613   minimum_free_regs[i]=1;
1614 }
1615 void c2op_alloc(struct regstat *current,int i)
1616 {
1617   alloc_reg_temp(current,i,-1);
1618 }
1619 void fcomp_alloc(struct regstat *current,int i)
1620 {
1621   alloc_reg(current,i,CSREG); // Load status
1622   alloc_reg(current,i,FSREG); // Load flags
1623   dirty_reg(current,FSREG); // Flag will be modified
1624   alloc_reg_temp(current,i,-1);
1625   minimum_free_regs[i]=1;
1626 }
1627
1628 void syscall_alloc(struct regstat *current,int i)
1629 {
1630   alloc_cc(current,i);
1631   dirty_reg(current,CCREG);
1632   alloc_all(current,i);
1633   minimum_free_regs[i]=HOST_REGS;
1634   current->isconst=0;
1635 }
1636
1637 void delayslot_alloc(struct regstat *current,int i)
1638 {
1639   switch(itype[i]) {
1640     case UJUMP:
1641     case CJUMP:
1642     case SJUMP:
1643     case RJUMP:
1644     case FJUMP:
1645     case SYSCALL:
1646     case HLECALL:
1647     case SPAN:
1648       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1649       SysPrintf("Disabled speculative precompilation\n");
1650       stop_after_jal=1;
1651       break;
1652     case IMM16:
1653       imm16_alloc(current,i);
1654       break;
1655     case LOAD:
1656     case LOADLR:
1657       load_alloc(current,i);
1658       break;
1659     case STORE:
1660     case STORELR:
1661       store_alloc(current,i);
1662       break;
1663     case ALU:
1664       alu_alloc(current,i);
1665       break;
1666     case SHIFT:
1667       shift_alloc(current,i);
1668       break;
1669     case MULTDIV:
1670       multdiv_alloc(current,i);
1671       break;
1672     case SHIFTIMM:
1673       shiftimm_alloc(current,i);
1674       break;
1675     case MOV:
1676       mov_alloc(current,i);
1677       break;
1678     case COP0:
1679       cop0_alloc(current,i);
1680       break;
1681     case COP1:
1682     case COP2:
1683       cop1_alloc(current,i);
1684       break;
1685     case C1LS:
1686       c1ls_alloc(current,i);
1687       break;
1688     case C2LS:
1689       c2ls_alloc(current,i);
1690       break;
1691     case FCONV:
1692       fconv_alloc(current,i);
1693       break;
1694     case FLOAT:
1695       float_alloc(current,i);
1696       break;
1697     case FCOMP:
1698       fcomp_alloc(current,i);
1699       break;
1700     case C2OP:
1701       c2op_alloc(current,i);
1702       break;
1703   }
1704 }
1705
1706 // Special case where a branch and delay slot span two pages in virtual memory
1707 static void pagespan_alloc(struct regstat *current,int i)
1708 {
1709   current->isconst=0;
1710   current->wasconst=0;
1711   regs[i].wasconst=0;
1712   minimum_free_regs[i]=HOST_REGS;
1713   alloc_all(current,i);
1714   alloc_cc(current,i);
1715   dirty_reg(current,CCREG);
1716   if(opcode[i]==3) // JAL
1717   {
1718     alloc_reg(current,i,31);
1719     dirty_reg(current,31);
1720   }
1721   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1722   {
1723     alloc_reg(current,i,rs1[i]);
1724     if (rt1[i]!=0) {
1725       alloc_reg(current,i,rt1[i]);
1726       dirty_reg(current,rt1[i]);
1727     }
1728   }
1729   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1730   {
1731     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1732     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1733     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1734     {
1735       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1736       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1737     }
1738   }
1739   else
1740   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1741   {
1742     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1743     if(!((current->is32>>rs1[i])&1))
1744     {
1745       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1746     }
1747   }
1748   else
1749   if(opcode[i]==0x11) // BC1
1750   {
1751     alloc_reg(current,i,FSREG);
1752     alloc_reg(current,i,CSREG);
1753   }
1754   //else ...
1755 }
1756
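// Record an out-of-line stub (slow-path code emitted after the main block);
// the arguments are saved in the stubs table and consumed later when the
// stub code is generated.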
1757 static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1758 {
1759   stubs[stubcount][0]=type;
1760   stubs[stubcount][1]=addr;
1761   stubs[stubcount][2]=retaddr;
1762   stubs[stubcount][3]=a;
1763   stubs[stubcount][4]=b;
1764   stubs[stubcount][5]=c;
1765   stubs[stubcount][6]=d;
1766   stubs[stubcount][7]=e;
1767   stubcount++;
1768 }
1769
1770 // Write out a single register
1771 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1772 {
1773   int hr;
1774   for(hr=0;hr<HOST_REGS;hr++) {
1775     if(hr!=EXCLUDE_REG) {
1776       if((regmap[hr]&63)==r) {
1777         if((dirty>>hr)&1) {
1778           if(regmap[hr]<64) {
1779             emit_storereg(r,hr);
1780           }else{
1781             emit_storereg(r|64,hr);
1782           }
1783         }
1784       }
1785     }
1786   }
1787 }
1788
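// Debug helpers: checksum emulated RAM and the register file, dump the
// register list, and trace execution; used only by the commented-out
// trace code further below.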
1789 int mchecksum()
1790 {
1791   //if(!tracedebug) return 0;
1792   int i;
1793   int sum=0;
1794   for(i=0;i<2097152;i++) {
1795     unsigned int temp=sum;
1796     sum<<=1;
1797     sum|=(~temp)>>31;
1798     sum^=((u_int *)rdram)[i];
1799   }
1800   return sum;
1801 }
1802 int rchecksum()
1803 {
1804   int i;
1805   int sum=0;
1806   for(i=0;i<64;i++)
1807     sum^=((u_int *)reg)[i];
1808   return sum;
1809 }
1810 void rlist()
1811 {
1812   int i;
1813   printf("TRACE: ");
1814   for(i=0;i<32;i++)
1815     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1816   printf("\n");
1817 }
1818
1819 void enabletrace()
1820 {
1821   tracedebug=1;
1822 }
1823
1824 void memdebug(int i)
1825 {
1826   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
1827   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
1828   //rlist();
1829   //if(tracedebug) {
1830   //if(Count>=-2084597794) {
1831   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
1832   //if(0) {
1833     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
1834     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
1835     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
1836     rlist();
1837     #ifdef __i386__
1838     printf("TRACE: %x\n",(&i)[-1]);
1839     #endif
1840     #ifdef __arm__
1841     int j;
1842     printf("TRACE: %x \n",(&j)[10]);
1843     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
1844     #endif
1845     //fflush(stdout);
1846   }
1847   //printf("TRACE: %x\n",(&i)[-1]);
1848 }
1849
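// Emit host code for R-type ALU instructions (ADD/ADDU/SUB/SUBU, SLT/SLTU,
// AND/OR/XOR/NOR and their 64-bit doubleword forms), using the host
// registers assigned in i_regs->regmap.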
1850 void alu_assemble(int i,struct regstat *i_regs)
1851 {
1852   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1853     if(rt1[i]) {
1854       signed char s1,s2,t;
1855       t=get_reg(i_regs->regmap,rt1[i]);
1856       if(t>=0) {
1857         s1=get_reg(i_regs->regmap,rs1[i]);
1858         s2=get_reg(i_regs->regmap,rs2[i]);
1859         if(rs1[i]&&rs2[i]) {
1860           assert(s1>=0);
1861           assert(s2>=0);
1862           if(opcode2[i]&2) emit_sub(s1,s2,t);
1863           else emit_add(s1,s2,t);
1864         }
1865         else if(rs1[i]) {
1866           if(s1>=0) emit_mov(s1,t);
1867           else emit_loadreg(rs1[i],t);
1868         }
1869         else if(rs2[i]) {
1870           if(s2>=0) {
1871             if(opcode2[i]&2) emit_neg(s2,t);
1872             else emit_mov(s2,t);
1873           }
1874           else {
1875             emit_loadreg(rs2[i],t);
1876             if(opcode2[i]&2) emit_neg(t,t);
1877           }
1878         }
1879         else emit_zeroreg(t);
1880       }
1881     }
1882   }
1883   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1884     if(rt1[i]) {
1885       signed char s1l,s2l,s1h,s2h,tl,th;
1886       tl=get_reg(i_regs->regmap,rt1[i]);
1887       th=get_reg(i_regs->regmap,rt1[i]|64);
1888       if(tl>=0) {
1889         s1l=get_reg(i_regs->regmap,rs1[i]);
1890         s2l=get_reg(i_regs->regmap,rs2[i]);
1891         s1h=get_reg(i_regs->regmap,rs1[i]|64);
1892         s2h=get_reg(i_regs->regmap,rs2[i]|64);
1893         if(rs1[i]&&rs2[i]) {
1894           assert(s1l>=0);
1895           assert(s2l>=0);
1896           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
1897           else emit_adds(s1l,s2l,tl);
1898           if(th>=0) {
1899             #ifdef INVERTED_CARRY
1900             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
1901             #else
1902             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
1903             #endif
1904             else emit_add(s1h,s2h,th);
1905           }
1906         }
1907         else if(rs1[i]) {
1908           if(s1l>=0) emit_mov(s1l,tl);
1909           else emit_loadreg(rs1[i],tl);
1910           if(th>=0) {
1911             if(s1h>=0) emit_mov(s1h,th);
1912             else emit_loadreg(rs1[i]|64,th);
1913           }
1914         }
1915         else if(rs2[i]) {
1916           if(s2l>=0) {
1917             if(opcode2[i]&2) emit_negs(s2l,tl);
1918             else emit_mov(s2l,tl);
1919           }
1920           else {
1921             emit_loadreg(rs2[i],tl);
1922             if(opcode2[i]&2) emit_negs(tl,tl);
1923           }
1924           if(th>=0) {
1925             #ifdef INVERTED_CARRY
1926             if(s2h>=0) emit_mov(s2h,th);
1927             else emit_loadreg(rs2[i]|64,th);
1928             if(opcode2[i]&2) {
1929               emit_adcimm(-1,th); // x86 has inverted carry flag
1930               emit_not(th,th);
1931             }
1932             #else
1933             if(opcode2[i]&2) {
1934               if(s2h>=0) emit_rscimm(s2h,0,th);
1935               else {
1936                 emit_loadreg(rs2[i]|64,th);
1937                 emit_rscimm(th,0,th);
1938               }
1939             }else{
1940               if(s2h>=0) emit_mov(s2h,th);
1941               else emit_loadreg(rs2[i]|64,th);
1942             }
1943             #endif
1944           }
1945         }
1946         else {
1947           emit_zeroreg(tl);
1948           if(th>=0) emit_zeroreg(th);
1949         }
1950       }
1951     }
1952   }
1953   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1954     if(rt1[i]) {
1955       signed char s1l,s1h,s2l,s2h,t;
1956       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
1957       {
1958         t=get_reg(i_regs->regmap,rt1[i]);
1959         //assert(t>=0);
1960         if(t>=0) {
1961           s1l=get_reg(i_regs->regmap,rs1[i]);
1962           s1h=get_reg(i_regs->regmap,rs1[i]|64);
1963           s2l=get_reg(i_regs->regmap,rs2[i]);
1964           s2h=get_reg(i_regs->regmap,rs2[i]|64);
1965           if(rs2[i]==0) // rx<r0
1966           {
1967             assert(s1h>=0);
1968             if(opcode2[i]==0x2a) // SLT
1969               emit_shrimm(s1h,31,t);
1970             else // SLTU (unsigned cannot be less than zero)
1971               emit_zeroreg(t);
1972           }
1973           else if(rs1[i]==0) // r0<rx
1974           {
1975             assert(s2h>=0);
1976             if(opcode2[i]==0x2a) // SLT
1977               emit_set_gz64_32(s2h,s2l,t);
1978             else // SLTU (set if not zero)
1979               emit_set_nz64_32(s2h,s2l,t);
1980           }
1981           else {
1982             assert(s1l>=0);assert(s1h>=0);
1983             assert(s2l>=0);assert(s2h>=0);
1984             if(opcode2[i]==0x2a) // SLT
1985               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
1986             else // SLTU
1987               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
1988           }
1989         }
1990       } else {
1991         t=get_reg(i_regs->regmap,rt1[i]);
1992         //assert(t>=0);
1993         if(t>=0) {
1994           s1l=get_reg(i_regs->regmap,rs1[i]);
1995           s2l=get_reg(i_regs->regmap,rs2[i]);
1996           if(rs2[i]==0) // rx<r0
1997           {
1998             assert(s1l>=0);
1999             if(opcode2[i]==0x2a) // SLT
2000               emit_shrimm(s1l,31,t);
2001             else // SLTU (unsigned cannot be less than zero)
2002               emit_zeroreg(t);
2003           }
2004           else if(rs1[i]==0) // r0<rx
2005           {
2006             assert(s2l>=0);
2007             if(opcode2[i]==0x2a) // SLT
2008               emit_set_gz32(s2l,t);
2009             else // SLTU (set if not zero)
2010               emit_set_nz32(s2l,t);
2011           }
2012           else{
2013             assert(s1l>=0);assert(s2l>=0);
2014             if(opcode2[i]==0x2a) // SLT
2015               emit_set_if_less32(s1l,s2l,t);
2016             else // SLTU
2017               emit_set_if_carry32(s1l,s2l,t);
2018           }
2019         }
2020       }
2021     }
2022   }
2023   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2024     if(rt1[i]) {
2025       signed char s1l,s1h,s2l,s2h,th,tl;
2026       tl=get_reg(i_regs->regmap,rt1[i]);
2027       th=get_reg(i_regs->regmap,rt1[i]|64);
2028       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2029       {
2030         assert(tl>=0);
2031         if(tl>=0) {
2032           s1l=get_reg(i_regs->regmap,rs1[i]);
2033           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2034           s2l=get_reg(i_regs->regmap,rs2[i]);
2035           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2036           if(rs1[i]&&rs2[i]) {
2037             assert(s1l>=0);assert(s1h>=0);
2038             assert(s2l>=0);assert(s2h>=0);
2039             if(opcode2[i]==0x24) { // AND
2040               emit_and(s1l,s2l,tl);
2041               emit_and(s1h,s2h,th);
2042             } else
2043             if(opcode2[i]==0x25) { // OR
2044               emit_or(s1l,s2l,tl);
2045               emit_or(s1h,s2h,th);
2046             } else
2047             if(opcode2[i]==0x26) { // XOR
2048               emit_xor(s1l,s2l,tl);
2049               emit_xor(s1h,s2h,th);
2050             } else
2051             if(opcode2[i]==0x27) { // NOR
2052               emit_or(s1l,s2l,tl);
2053               emit_or(s1h,s2h,th);
2054               emit_not(tl,tl);
2055               emit_not(th,th);
2056             }
2057           }
2058           else
2059           {
2060             if(opcode2[i]==0x24) { // AND
2061               emit_zeroreg(tl);
2062               emit_zeroreg(th);
2063             } else
2064             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2065               if(rs1[i]){
2066                 if(s1l>=0) emit_mov(s1l,tl);
2067                 else emit_loadreg(rs1[i],tl);
2068                 if(s1h>=0) emit_mov(s1h,th);
2069                 else emit_loadreg(rs1[i]|64,th);
2070               }
2071               else
2072               if(rs2[i]){
2073                 if(s2l>=0) emit_mov(s2l,tl);
2074                 else emit_loadreg(rs2[i],tl);
2075                 if(s2h>=0) emit_mov(s2h,th);
2076                 else emit_loadreg(rs2[i]|64,th);
2077               }
2078               else{
2079                 emit_zeroreg(tl);
2080                 emit_zeroreg(th);
2081               }
2082             } else
2083             if(opcode2[i]==0x27) { // NOR
2084               if(rs1[i]){
2085                 if(s1l>=0) emit_not(s1l,tl);
2086                 else{
2087                   emit_loadreg(rs1[i],tl);
2088                   emit_not(tl,tl);
2089                 }
2090                 if(s1h>=0) emit_not(s1h,th);
2091                 else{
2092                   emit_loadreg(rs1[i]|64,th);
2093                   emit_not(th,th);
2094                 }
2095               }
2096               else
2097               if(rs2[i]){
2098                 if(s2l>=0) emit_not(s2l,tl);
2099                 else{
2100                   emit_loadreg(rs2[i],tl);
2101                   emit_not(tl,tl);
2102                 }
2103                 if(s2h>=0) emit_not(s2h,th);
2104                 else{
2105                   emit_loadreg(rs2[i]|64,th);
2106                   emit_not(th,th);
2107                 }
2108               }
2109               else {
2110                 emit_movimm(-1,tl);
2111                 emit_movimm(-1,th);
2112               }
2113             }
2114           }
2115         }
2116       }
2117       else
2118       {
2119         // 32 bit
2120         if(tl>=0) {
2121           s1l=get_reg(i_regs->regmap,rs1[i]);
2122           s2l=get_reg(i_regs->regmap,rs2[i]);
2123           if(rs1[i]&&rs2[i]) {
2124             assert(s1l>=0);
2125             assert(s2l>=0);
2126             if(opcode2[i]==0x24) { // AND
2127               emit_and(s1l,s2l,tl);
2128             } else
2129             if(opcode2[i]==0x25) { // OR
2130               emit_or(s1l,s2l,tl);
2131             } else
2132             if(opcode2[i]==0x26) { // XOR
2133               emit_xor(s1l,s2l,tl);
2134             } else
2135             if(opcode2[i]==0x27) { // NOR
2136               emit_or(s1l,s2l,tl);
2137               emit_not(tl,tl);
2138             }
2139           }
2140           else
2141           {
2142             if(opcode2[i]==0x24) { // AND
2143               emit_zeroreg(tl);
2144             } else
2145             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2146               if(rs1[i]){
2147                 if(s1l>=0) emit_mov(s1l,tl);
2148                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2149               }
2150               else
2151               if(rs2[i]){
2152                 if(s2l>=0) emit_mov(s2l,tl);
2153                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2154               }
2155               else emit_zeroreg(tl);
2156             } else
2157             if(opcode2[i]==0x27) { // NOR
2158               if(rs1[i]){
2159                 if(s1l>=0) emit_not(s1l,tl);
2160                 else {
2161                   emit_loadreg(rs1[i],tl);
2162                   emit_not(tl,tl);
2163                 }
2164               }
2165               else
2166               if(rs2[i]){
2167                 if(s2l>=0) emit_not(s2l,tl);
2168                 else {
2169                   emit_loadreg(rs2[i],tl);
2170                   emit_not(tl,tl);
2171                 }
2172               }
2173               else emit_movimm(-1,tl);
2174             }
2175           }
2176         }
2177       }
2178     }
2179   }
2180 }
2181
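// Emit host code for I-type immediate instructions (LUI, ADDI/ADDIU,
// DADDI/DADDIU, SLTI/SLTIU, ANDI/ORI/XORI), folding in known constants
// from constmap where possible.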
2182 void imm16_assemble(int i,struct regstat *i_regs)
2183 {
2184   if (opcode[i]==0x0f) { // LUI
2185     if(rt1[i]) {
2186       signed char t;
2187       t=get_reg(i_regs->regmap,rt1[i]);
2188       //assert(t>=0);
2189       if(t>=0) {
2190         if(!((i_regs->isconst>>t)&1))
2191           emit_movimm(imm[i]<<16,t);
2192       }
2193     }
2194   }
2195   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2196     if(rt1[i]) {
2197       signed char s,t;
2198       t=get_reg(i_regs->regmap,rt1[i]);
2199       s=get_reg(i_regs->regmap,rs1[i]);
2200       if(rs1[i]) {
2201         //assert(t>=0);
2202         //assert(s>=0);
2203         if(t>=0) {
2204           if(!((i_regs->isconst>>t)&1)) {
2205             if(s<0) {
2206               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2207               emit_addimm(t,imm[i],t);
2208             }else{
2209               if(!((i_regs->wasconst>>s)&1))
2210                 emit_addimm(s,imm[i],t);
2211               else
2212                 emit_movimm(constmap[i][s]+imm[i],t);
2213             }
2214           }
2215         }
2216       } else {
2217         if(t>=0) {
2218           if(!((i_regs->isconst>>t)&1))
2219             emit_movimm(imm[i],t);
2220         }
2221       }
2222     }
2223   }
2224   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2225     if(rt1[i]) {
2226       signed char sh,sl,th,tl;
2227       th=get_reg(i_regs->regmap,rt1[i]|64);
2228       tl=get_reg(i_regs->regmap,rt1[i]);
2229       sh=get_reg(i_regs->regmap,rs1[i]|64);
2230       sl=get_reg(i_regs->regmap,rs1[i]);
2231       if(tl>=0) {
2232         if(rs1[i]) {
2233           assert(sh>=0);
2234           assert(sl>=0);
2235           if(th>=0) {
2236             emit_addimm64_32(sh,sl,imm[i],th,tl);
2237           }
2238           else {
2239             emit_addimm(sl,imm[i],tl);
2240           }
2241         } else {
2242           emit_movimm(imm[i],tl);
2243           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2244         }
2245       }
2246     }
2247   }
2248   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2249     if(rt1[i]) {
2250       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2251       signed char sh,sl,t;
2252       t=get_reg(i_regs->regmap,rt1[i]);
2253       sh=get_reg(i_regs->regmap,rs1[i]|64);
2254       sl=get_reg(i_regs->regmap,rs1[i]);
2255       //assert(t>=0);
2256       if(t>=0) {
2257         if(rs1[i]>0) {
2258           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2259           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2260             if(opcode[i]==0x0a) { // SLTI
2261               if(sl<0) {
2262                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2263                 emit_slti32(t,imm[i],t);
2264               }else{
2265                 emit_slti32(sl,imm[i],t);
2266               }
2267             }
2268             else { // SLTIU
2269               if(sl<0) {
2270                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2271                 emit_sltiu32(t,imm[i],t);
2272               }else{
2273                 emit_sltiu32(sl,imm[i],t);
2274               }
2275             }
2276           }else{ // 64-bit
2277             assert(sl>=0);
2278             if(opcode[i]==0x0a) // SLTI
2279               emit_slti64_32(sh,sl,imm[i],t);
2280             else // SLTIU
2281               emit_sltiu64_32(sh,sl,imm[i],t);
2282           }
2283         }else{
2284           // SLTI(U) with a zero source register is pointless,
2285           // but examples of it can be found in real code
2286           if(opcode[i]==0x0a) // SLTI
2287             if(0<imm[i]) emit_movimm(1,t);
2288             else emit_zeroreg(t);
2289           else // SLTIU
2290           {
2291             if(imm[i]) emit_movimm(1,t);
2292             else emit_zeroreg(t);
2293           }
2294         }
2295       }
2296     }
2297   }
2298   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2299     if(rt1[i]) {
2300       signed char sh,sl,th,tl;
2301       th=get_reg(i_regs->regmap,rt1[i]|64);
2302       tl=get_reg(i_regs->regmap,rt1[i]);
2303       sh=get_reg(i_regs->regmap,rs1[i]|64);
2304       sl=get_reg(i_regs->regmap,rs1[i]);
2305       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2306         if(opcode[i]==0x0c) //ANDI
2307         {
2308           if(rs1[i]) {
2309             if(sl<0) {
2310               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2311               emit_andimm(tl,imm[i],tl);
2312             }else{
2313               if(!((i_regs->wasconst>>sl)&1))
2314                 emit_andimm(sl,imm[i],tl);
2315               else
2316                 emit_movimm(constmap[i][sl]&imm[i],tl);
2317             }
2318           }
2319           else
2320             emit_zeroreg(tl);
2321           if(th>=0) emit_zeroreg(th);
2322         }
2323         else
2324         {
2325           if(rs1[i]) {
2326             if(sl<0) {
2327               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2328             }
2329             if(th>=0) {
2330               if(sh<0) {
2331                 emit_loadreg(rs1[i]|64,th);
2332               }else{
2333                 emit_mov(sh,th);
2334               }
2335             }
2336             if(opcode[i]==0x0d) //ORI
2337             if(sl<0) {
2338               emit_orimm(tl,imm[i],tl);
2339             }else{
2340               if(!((i_regs->wasconst>>sl)&1))
2341                 emit_orimm(sl,imm[i],tl);
2342               else
2343                 emit_movimm(constmap[i][sl]|imm[i],tl);
2344             }
2345             if(opcode[i]==0x0e) //XORI
2346             if(sl<0) {
2347               emit_xorimm(tl,imm[i],tl);
2348             }else{
2349               if(!((i_regs->wasconst>>sl)&1))
2350                 emit_xorimm(sl,imm[i],tl);
2351               else
2352                 emit_movimm(constmap[i][sl]^imm[i],tl);
2353             }
2354           }
2355           else {
2356             emit_movimm(imm[i],tl);
2357             if(th>=0) emit_zeroreg(th);
2358           }
2359         }
2360       }
2361     }
2362   }
2363 }
2364
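// Emit host code for shift-by-immediate instructions (SLL/SRL/SRA and the
// doubleword DSLL/DSRL/DSRA variants, including the *32 forms).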
2365 void shiftimm_assemble(int i,struct regstat *i_regs)
2366 {
2367   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2368   {
2369     if(rt1[i]) {
2370       signed char s,t;
2371       t=get_reg(i_regs->regmap,rt1[i]);
2372       s=get_reg(i_regs->regmap,rs1[i]);
2373       //assert(t>=0);
2374       if(t>=0&&!((i_regs->isconst>>t)&1)){
2375         if(rs1[i]==0)
2376         {
2377           emit_zeroreg(t);
2378         }
2379         else
2380         {
2381           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2382           if(imm[i]) {
2383             if(opcode2[i]==0) // SLL
2384             {
2385               emit_shlimm(s<0?t:s,imm[i],t);
2386             }
2387             if(opcode2[i]==2) // SRL
2388             {
2389               emit_shrimm(s<0?t:s,imm[i],t);
2390             }
2391             if(opcode2[i]==3) // SRA
2392             {
2393               emit_sarimm(s<0?t:s,imm[i],t);
2394             }
2395           }else{
2396             // Shift by zero
2397             if(s>=0 && s!=t) emit_mov(s,t);
2398           }
2399         }
2400       }
2401       //emit_storereg(rt1[i],t); //DEBUG
2402     }
2403   }
2404   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2405   {
2406     if(rt1[i]) {
2407       signed char sh,sl,th,tl;
2408       th=get_reg(i_regs->regmap,rt1[i]|64);
2409       tl=get_reg(i_regs->regmap,rt1[i]);
2410       sh=get_reg(i_regs->regmap,rs1[i]|64);
2411       sl=get_reg(i_regs->regmap,rs1[i]);
2412       if(tl>=0) {
2413         if(rs1[i]==0)
2414         {
2415           emit_zeroreg(tl);
2416           if(th>=0) emit_zeroreg(th);
2417         }
2418         else
2419         {
2420           assert(sl>=0);
2421           assert(sh>=0);
2422           if(imm[i]) {
2423             if(opcode2[i]==0x38) // DSLL
2424             {
2425               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2426               emit_shlimm(sl,imm[i],tl);
2427             }
2428             if(opcode2[i]==0x3a) // DSRL
2429             {
2430               emit_shrdimm(sl,sh,imm[i],tl);
2431               if(th>=0) emit_shrimm(sh,imm[i],th);
2432             }
2433             if(opcode2[i]==0x3b) // DSRA
2434             {
2435               emit_shrdimm(sl,sh,imm[i],tl);
2436               if(th>=0) emit_sarimm(sh,imm[i],th);
2437             }
2438           }else{
2439             // Shift by zero
2440             if(sl!=tl) emit_mov(sl,tl);
2441             if(th>=0&&sh!=th) emit_mov(sh,th);
2442           }
2443         }
2444       }
2445     }
2446   }
2447   if(opcode2[i]==0x3c) // DSLL32
2448   {
2449     if(rt1[i]) {
2450       signed char sl,tl,th;
2451       tl=get_reg(i_regs->regmap,rt1[i]);
2452       th=get_reg(i_regs->regmap,rt1[i]|64);
2453       sl=get_reg(i_regs->regmap,rs1[i]);
2454       if(th>=0||tl>=0){
2455         assert(tl>=0);
2456         assert(th>=0);
2457         assert(sl>=0);
2458         emit_mov(sl,th);
2459         emit_zeroreg(tl);
2460         if(imm[i]>32)
2461         {
2462           emit_shlimm(th,imm[i]&31,th);
2463         }
2464       }
2465     }
2466   }
2467   if(opcode2[i]==0x3e) // DSRL32
2468   {
2469     if(rt1[i]) {
2470       signed char sh,tl,th;
2471       tl=get_reg(i_regs->regmap,rt1[i]);
2472       th=get_reg(i_regs->regmap,rt1[i]|64);
2473       sh=get_reg(i_regs->regmap,rs1[i]|64);
2474       if(tl>=0){
2475         assert(sh>=0);
2476         emit_mov(sh,tl);
2477         if(th>=0) emit_zeroreg(th);
2478         if(imm[i]>32)
2479         {
2480           emit_shrimm(tl,imm[i]&31,tl);
2481         }
2482       }
2483     }
2484   }
2485   if(opcode2[i]==0x3f) // DSRA32
2486   {
2487     if(rt1[i]) {
2488       signed char sh,tl;
2489       tl=get_reg(i_regs->regmap,rt1[i]);
2490       sh=get_reg(i_regs->regmap,rs1[i]|64);
2491       if(tl>=0){
2492         assert(sh>=0);
2493         emit_mov(sh,tl);
2494         if(imm[i]>32)
2495         {
2496           emit_sarimm(tl,imm[i]&31,tl);
2497         }
2498       }
2499     }
2500   }
2501 }
2502
2503 #ifndef shift_assemble
2504 void shift_assemble(int i,struct regstat *i_regs)
2505 {
2506   printf("Need shift_assemble for this architecture.\n");
2507   exit(1);
2508 }
2509 #endif
2510
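// Emit host code for load instructions (LB/LBU, LH/LHU, LW/LWU, LD):
// a fast path for RAM accesses plus a stub call for I/O and other
// slow cases.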
2511 void load_assemble(int i,struct regstat *i_regs)
2512 {
2513   int s,th,tl,addr,map=-1;
2514   int offset;
2515   int jaddr=0;
2516   int memtarget=0,c=0;
2517   int fastload_reg_override=0;
2518   u_int hr,reglist=0;
2519   th=get_reg(i_regs->regmap,rt1[i]|64);
2520   tl=get_reg(i_regs->regmap,rt1[i]);
2521   s=get_reg(i_regs->regmap,rs1[i]);
2522   offset=imm[i];
2523   for(hr=0;hr<HOST_REGS;hr++) {
2524     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2525   }
2526   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2527   if(s>=0) {
2528     c=(i_regs->wasconst>>s)&1;
2529     if (c) {
2530       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2531     }
2532   }
2533   //printf("load_assemble: c=%d\n",c);
2534   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2535   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2536   if(tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80)
2537     ||rt1[i]==0) {
2538       // could be a FIFO (I/O) access, or a dummy read (rt==r0);
2539       // the read must still be performed
2540       assem_debug("(forced read)\n");
2541       tl=get_reg(i_regs->regmap,-1);
2542       assert(tl>=0);
2543   }
2544   if(offset||s<0||c) addr=tl;
2545   else addr=s;
2546   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2547  if(tl>=0) {
2548   //printf("load_assemble: c=%d\n",c);
2549   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2550   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2551   reglist&=~(1<<tl);
2552   if(th>=0) reglist&=~(1<<th);
2553   if(!c) {
2554     #ifdef RAM_OFFSET
2555     map=get_reg(i_regs->regmap,ROREG);
2556     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2557     #endif
2558     #ifdef R29_HACK
2559     // Strmnnrmn's speed hack
2560     if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2561     #endif
2562     {
2563       jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
2564     }
2565   }
2566   else if(ram_offset&&memtarget) {
2567     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2568     fastload_reg_override=HOST_TEMPREG;
2569   }
2570   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2571   if (opcode[i]==0x20) { // LB
2572     if(!c||memtarget) {
2573       if(!dummy) {
2574         #ifdef HOST_IMM_ADDR32
2575         if(c)
2576           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2577         else
2578         #endif
2579         {
2580           //emit_xorimm(addr,3,tl);
2581           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2582           int x=0,a=tl;
2583 #ifdef BIG_ENDIAN_MIPS
2584           if(!c) emit_xorimm(addr,3,tl);
2585           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2586 #else
2587           if(!c) a=addr;
2588 #endif
2589           if(fastload_reg_override) a=fastload_reg_override;
2590
2591           emit_movsbl_indexed_tlb(x,a,map,tl);
2592         }
2593       }
2594       if(jaddr)
2595         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2596     }
2597     else
2598       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2599   }
2600   if (opcode[i]==0x21) { // LH
2601     if(!c||memtarget) {
2602       if(!dummy) {
2603         #ifdef HOST_IMM_ADDR32
2604         if(c)
2605           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2606         else
2607         #endif
2608         {
2609           int x=0,a=tl;
2610 #ifdef BIG_ENDIAN_MIPS
2611           if(!c) emit_xorimm(addr,2,tl);
2612           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2613 #else
2614           if(!c) a=addr;
2615 #endif
2616           if(fastload_reg_override) a=fastload_reg_override;
2617           //#ifdef
2618           //emit_movswl_indexed_tlb(x,tl,map,tl);
2619           //else
2620           if(map>=0) {
2621             emit_movswl_indexed(x,a,tl);
2622           }else{
2623             #if 1 //def RAM_OFFSET
2624             emit_movswl_indexed(x,a,tl);
2625             #else
2626             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2627             #endif
2628           }
2629         }
2630       }
2631       if(jaddr)
2632         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2633     }
2634     else
2635       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2636   }
2637   if (opcode[i]==0x23) { // LW
2638     if(!c||memtarget) {
2639       if(!dummy) {
2640         int a=addr;
2641         if(fastload_reg_override) a=fastload_reg_override;
2642         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2643         #ifdef HOST_IMM_ADDR32
2644         if(c)
2645           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2646         else
2647         #endif
2648         emit_readword_indexed_tlb(0,a,map,tl);
2649       }
2650       if(jaddr)
2651         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2652     }
2653     else
2654       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2655   }
2656   if (opcode[i]==0x24) { // LBU
2657     if(!c||memtarget) {
2658       if(!dummy) {
2659         #ifdef HOST_IMM_ADDR32
2660         if(c)
2661           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2662         else
2663         #endif
2664         {
2665           //emit_xorimm(addr,3,tl);
2666           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2667           int x=0,a=tl;
2668 #ifdef BIG_ENDIAN_MIPS
2669           if(!c) emit_xorimm(addr,3,tl);
2670           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2671 #else
2672           if(!c) a=addr;
2673 #endif
2674           if(fastload_reg_override) a=fastload_reg_override;
2675
2676           emit_movzbl_indexed_tlb(x,a,map,tl);
2677         }
2678       }
2679       if(jaddr)
2680         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2681     }
2682     else
2683       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2684   }
2685   if (opcode[i]==0x25) { // LHU
2686     if(!c||memtarget) {
2687       if(!dummy) {
2688         #ifdef HOST_IMM_ADDR32
2689         if(c)
2690           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2691         else
2692         #endif
2693         {
2694           int x=0,a=tl;
2695 #ifdef BIG_ENDIAN_MIPS
2696           if(!c) emit_xorimm(addr,2,tl);
2697           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2698 #else
2699           if(!c) a=addr;
2700 #endif
2701           if(fastload_reg_override) a=fastload_reg_override;
2702           //#ifdef
2703           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2704           //#else
2705           if(map>=0) {
2706             emit_movzwl_indexed(x,a,tl);
2707           }else{
2708             #if 1 //def RAM_OFFSET
2709             emit_movzwl_indexed(x,a,tl);
2710             #else
2711             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
2712             #endif
2713           }
2714         }
2715       }
2716       if(jaddr)
2717         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2718     }
2719     else
2720       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2721   }
2722   if (opcode[i]==0x27) { // LWU
2723     assert(th>=0);
2724     if(!c||memtarget) {
2725       if(!dummy) {
2726         int a=addr;
2727         if(fastload_reg_override) a=fastload_reg_override;
2728         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2729         #ifdef HOST_IMM_ADDR32
2730         if(c)
2731           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2732         else
2733         #endif
2734         emit_readword_indexed_tlb(0,a,map,tl);
2735       }
2736       if(jaddr)
2737         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2738     }
2739     else {
2740       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2741     }
2742     emit_zeroreg(th);
2743   }
2744   if (opcode[i]==0x37) { // LD
2745     if(!c||memtarget) {
2746       if(!dummy) {
2747         int a=addr;
2748         if(fastload_reg_override) a=fastload_reg_override;
2749         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
2750         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
2751         #ifdef HOST_IMM_ADDR32
2752         if(c)
2753           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
2754         else
2755         #endif
2756         emit_readdword_indexed_tlb(0,a,map,th,tl);
2757       }
2758       if(jaddr)
2759         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2760     }
2761     else
2762       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2763   }
2764  }
2765   //emit_storereg(rt1[i],tl); // DEBUG
2766   //if(opcode[i]==0x23)
2767   //if(opcode[i]==0x24)
2768   //if(opcode[i]==0x23||opcode[i]==0x24)
2769   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
2770   {
2771     //emit_pusha();
2772     save_regs(0x100f);
2773         emit_readword((int)&last_count,ECX);
2774         #ifdef __i386__
2775         if(get_reg(i_regs->regmap,CCREG)<0)
2776           emit_loadreg(CCREG,HOST_CCREG);
2777         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2778         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2779         emit_writeword(HOST_CCREG,(int)&Count);
2780         #endif
2781         #ifdef __arm__
2782         if(get_reg(i_regs->regmap,CCREG)<0)
2783           emit_loadreg(CCREG,0);
2784         else
2785           emit_mov(HOST_CCREG,0);
2786         emit_add(0,ECX,0);
2787         emit_addimm(0,2*ccadj[i],0);
2788         emit_writeword(0,(int)&Count);
2789         #endif
2790     emit_call((int)memdebug);
2791     //emit_popa();
2792     restore_regs(0x100f);
2793   }/**/
2794 }
2795
2796 #ifndef loadlr_assemble
2797 void loadlr_assemble(int i,struct regstat *i_regs)
2798 {
2799   printf("Need loadlr_assemble for this architecture.\n");
2800   exit(1);
2801 }
2802 #endif
2803
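// Emit host code for aligned stores (SB/SH/SW/SD): a fast path for RAM,
// a stub for slow cases, and an invalid_code check to catch writes into
// already-compiled code.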
2804 void store_assemble(int i,struct regstat *i_regs)
2805 {
2806   int s,th,tl,map=-1;
2807   int addr,temp;
2808   int offset;
2809   int jaddr=0,jaddr2,type;
2810   int memtarget=0,c=0;
2811   int agr=AGEN1+(i&1);
2812   int faststore_reg_override=0;
2813   u_int hr,reglist=0;
2814   th=get_reg(i_regs->regmap,rs2[i]|64);
2815   tl=get_reg(i_regs->regmap,rs2[i]);
2816   s=get_reg(i_regs->regmap,rs1[i]);
2817   temp=get_reg(i_regs->regmap,agr);
2818   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2819   offset=imm[i];
2820   if(s>=0) {
2821     c=(i_regs->wasconst>>s)&1;
2822     if(c) {
2823       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2824     }
2825   }
2826   assert(tl>=0);
2827   assert(temp>=0);
2828   for(hr=0;hr<HOST_REGS;hr++) {
2829     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2830   }
2831   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2832   if(offset||s<0||c) addr=temp;
2833   else addr=s;
2834   if(!c) {
2835     jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
2836   }
2837   else if(ram_offset&&memtarget) {
2838     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2839     faststore_reg_override=HOST_TEMPREG;
2840   }
2841
2842   if (opcode[i]==0x28) { // SB
2843     if(!c||memtarget) {
2844       int x=0,a=temp;
2845 #ifdef BIG_ENDIAN_MIPS
2846       if(!c) emit_xorimm(addr,3,temp);
2847       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2848 #else
2849       if(!c) a=addr;
2850 #endif
2851       if(faststore_reg_override) a=faststore_reg_override;
2852       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
2853       emit_writebyte_indexed_tlb(tl,x,a,map,a);
2854     }
2855     type=STOREB_STUB;
2856   }
2857   if (opcode[i]==0x29) { // SH
2858     if(!c||memtarget) {
2859       int x=0,a=temp;
2860 #ifdef BIG_ENDIAN_MIPS
2861       if(!c) emit_xorimm(addr,2,temp);
2862       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2863 #else
2864       if(!c) a=addr;
2865 #endif
2866       if(faststore_reg_override) a=faststore_reg_override;
2867       //#ifdef
2868       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
2869       //#else
2870       if(map>=0) {
2871         emit_writehword_indexed(tl,x,a);
2872       }else
2873         //emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
2874         emit_writehword_indexed(tl,x,a);
2875     }
2876     type=STOREH_STUB;
2877   }
2878   if (opcode[i]==0x2B) { // SW
2879     if(!c||memtarget) {
2880       int a=addr;
2881       if(faststore_reg_override) a=faststore_reg_override;
2882       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
2883       emit_writeword_indexed_tlb(tl,0,a,map,temp);
2884     }
2885     type=STOREW_STUB;
2886   }
2887   if (opcode[i]==0x3F) { // SD
2888     if(!c||memtarget) {
2889       int a=addr;
2890       if(faststore_reg_override) a=faststore_reg_override;
2891       if(rs2[i]) {
2892         assert(th>=0);
2893         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
2894         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
2895         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
2896       }else{
2897         // Store zero
2898         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
2899         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
2900         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
2901       }
2902     }
2903     type=STORED_STUB;
2904   }
2905   if(jaddr) {
2906     // PCSX store handlers don't check invcode again
2907     reglist|=1<<addr;
2908     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2909     jaddr=0;
2910   }
2911   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
2912     if(!c||memtarget) {
2913       #ifdef DESTRUCTIVE_SHIFT
2914       // The x86 shift operation is 'destructive'; it overwrites the
2915       // source register, so we need to make a copy first and use that.
2916       addr=temp;
2917       #endif
2918       #if defined(HOST_IMM8)
2919       int ir=get_reg(i_regs->regmap,INVCP);
2920       assert(ir>=0);
2921       emit_cmpmem_indexedsr12_reg(ir,addr,1);
2922       #else
2923       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
2924       #endif
2925       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
2926       emit_callne(invalidate_addr_reg[addr]);
2927       #else
2928       jaddr2=(int)out;
2929       emit_jne(0);
2930       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
2931       #endif
2932     }
2933   }
2934   u_int addr_val=constmap[i][s]+offset;
2935   if(jaddr) {
2936     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2937   } else if(c&&!memtarget) {
2938     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
2939   }
2940   // Basic detection of stores that modify the current block;
2941   // not looking back, as earlier code should already be in the MIPS i-cache
2942   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
2943     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
2944     assert(i_regs->regmap==regs[i].regmap); // not delay slot
2945     if(i_regs->regmap==regs[i].regmap) {
2946       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
2947       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
2948       emit_movimm(start+i*4+4,0);
2949       emit_writeword(0,(int)&pcaddr);
2950       emit_jmp((int)do_interrupt);
2951     }
2952   }
2953   //if(opcode[i]==0x2B || opcode[i]==0x3F)
2954   //if(opcode[i]==0x2B || opcode[i]==0x28)
2955   //if(opcode[i]==0x2B || opcode[i]==0x29)
2956   //if(opcode[i]==0x2B)
2957   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
2958   {
2959     #ifdef __i386__
2960     emit_pusha();
2961     #endif
2962     #ifdef __arm__
2963     save_regs(0x100f);
2964     #endif
2965         emit_readword((int)&last_count,ECX);
2966         #ifdef __i386__
2967         if(get_reg(i_regs->regmap,CCREG)<0)
2968           emit_loadreg(CCREG,HOST_CCREG);
2969         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2970         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2971         emit_writeword(HOST_CCREG,(int)&Count);
2972         #endif
2973         #ifdef __arm__
2974         if(get_reg(i_regs->regmap,CCREG)<0)
2975           emit_loadreg(CCREG,0);
2976         else
2977           emit_mov(HOST_CCREG,0);
2978         emit_add(0,ECX,0);
2979         emit_addimm(0,2*ccadj[i],0);
2980         emit_writeword(0,(int)&Count);
2981         #endif
2982     emit_call((int)memdebug);
2983     #ifdef __i386__
2984     emit_popa();
2985     #endif
2986     #ifdef __arm__
2987     restore_regs(0x100f);
2988     #endif
2989   }/**/
2990 }
2991
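// Emit host code for unaligned stores (SWL/SWR and the doubleword SDL/SDR).
// These write only the bytes of the register that overlap the addressed
// word, so the code branches on the low address bits, one case per
// alignment.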
2992 void storelr_assemble(int i,struct regstat *i_regs)
2993 {
2994   int s,th,tl;
2995   int temp;
2996   int temp2;
2997   int offset;
2998   int jaddr=0,jaddr2;
2999   int case1,case2,case3;
3000   int done0,done1,done2;
3001   int memtarget=0,c=0;
3002   int agr=AGEN1+(i&1);
3003   u_int hr,reglist=0;
3004   th=get_reg(i_regs->regmap,rs2[i]|64);
3005   tl=get_reg(i_regs->regmap,rs2[i]);
3006   s=get_reg(i_regs->regmap,rs1[i]);
3007   temp=get_reg(i_regs->regmap,agr);
3008   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3009   offset=imm[i];
3010   if(s>=0) {
3011     c=(i_regs->isconst>>s)&1;
3012     if(c) {
3013       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3014     }
3015   }
3016   assert(tl>=0);
3017   for(hr=0;hr<HOST_REGS;hr++) {
3018     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3019   }
3020   assert(temp>=0);
3021   if(!c) {
3022     emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3023     if(!offset&&s!=temp) emit_mov(s,temp);
3024     jaddr=(int)out;
3025     emit_jno(0);
3026   }
3027   else
3028   {
3029     if(!memtarget||!rs1[i]) {
3030       jaddr=(int)out;
3031       emit_jmp(0);
3032     }
3033   }
3034   #ifdef RAM_OFFSET
3035   int map=get_reg(i_regs->regmap,ROREG);
3036   if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3037   #else
3038   if((u_int)rdram!=0x80000000)
3039     emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3040   #endif
3041
3042   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3043     temp2=get_reg(i_regs->regmap,FTEMP);
3044     if(!rs2[i]) temp2=th=tl;
3045   }
3046
3047 #ifndef BIG_ENDIAN_MIPS
3048     emit_xorimm(temp,3,temp);
3049 #endif
3050   emit_testimm(temp,2);
3051   case2=(int)out;
3052   emit_jne(0);
3053   emit_testimm(temp,1);
3054   case1=(int)out;
3055   emit_jne(0);
3056   // 0
3057   if (opcode[i]==0x2A) { // SWL
3058     emit_writeword_indexed(tl,0,temp);
3059   }
3060   if (opcode[i]==0x2E) { // SWR
3061     emit_writebyte_indexed(tl,3,temp);
3062   }
3063   if (opcode[i]==0x2C) { // SDL
3064     emit_writeword_indexed(th,0,temp);
3065     if(rs2[i]) emit_mov(tl,temp2);
3066   }
3067   if (opcode[i]==0x2D) { // SDR
3068     emit_writebyte_indexed(tl,3,temp);
3069     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3070   }
3071   done0=(int)out;
3072   emit_jmp(0);
3073   // 1
3074   set_jump_target(case1,(int)out);
3075   if (opcode[i]==0x2A) { // SWL
3076     // Write 3 msb into three least significant bytes
3077     if(rs2[i]) emit_rorimm(tl,8,tl);
3078     emit_writehword_indexed(tl,-1,temp);
3079     if(rs2[i]) emit_rorimm(tl,16,tl);
3080     emit_writebyte_indexed(tl,1,temp);
3081     if(rs2[i]) emit_rorimm(tl,8,tl);
3082   }
3083   if (opcode[i]==0x2E) { // SWR
3084     // Write two lsb into two most significant bytes
3085     emit_writehword_indexed(tl,1,temp);
3086   }
3087   if (opcode[i]==0x2C) { // SDL
3088     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3089     // Write 3 msb into three least significant bytes
3090     if(rs2[i]) emit_rorimm(th,8,th);
3091     emit_writehword_indexed(th,-1,temp);
3092     if(rs2[i]) emit_rorimm(th,16,th);
3093     emit_writebyte_indexed(th,1,temp);
3094     if(rs2[i]) emit_rorimm(th,8,th);
3095   }
3096   if (opcode[i]==0x2D) { // SDR
3097     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3098     // Write two lsb into two most significant bytes
3099     emit_writehword_indexed(tl,1,temp);
3100   }
3101   done1=(int)out;
3102   emit_jmp(0);
3103   // 2
3104   set_jump_target(case2,(int)out);
3105   emit_testimm(temp,1);
3106   case3=(int)out;
3107   emit_jne(0);
3108   if (opcode[i]==0x2A) { // SWL
3109     // Write two msb into two least significant bytes
3110     if(rs2[i]) emit_rorimm(tl,16,tl);
3111     emit_writehword_indexed(tl,-2,temp);
3112     if(rs2[i]) emit_rorimm(tl,16,tl);
3113   }
3114   if (opcode[i]==0x2E) { // SWR
3115     // Write 3 lsb into three most significant bytes
3116     emit_writebyte_indexed(tl,-1,temp);
3117     if(rs2[i]) emit_rorimm(tl,8,tl);
3118     emit_writehword_indexed(tl,0,temp);
3119     if(rs2[i]) emit_rorimm(tl,24,tl);
3120   }
3121   if (opcode[i]==0x2C) { // SDL
3122     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3123     // Write two msb into two least significant bytes
3124     if(rs2[i]) emit_rorimm(th,16,th);
3125     emit_writehword_indexed(th,-2,temp);
3126     if(rs2[i]) emit_rorimm(th,16,th);
3127   }
3128   if (opcode[i]==0x2D) { // SDR
3129     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3130     // Write 3 lsb into three most significant bytes
3131     emit_writebyte_indexed(tl,-1,temp);
3132     if(rs2[i]) emit_rorimm(tl,8,tl);
3133     emit_writehword_indexed(tl,0,temp);
3134     if(rs2[i]) emit_rorimm(tl,24,tl);
3135   }
3136   done2=(int)out;
3137   emit_jmp(0);
3138   // 3
3139   set_jump_target(case3,(int)out);
3140   if (opcode[i]==0x2A) { // SWL
3141     // Write msb into least significant byte
3142     if(rs2[i]) emit_rorimm(tl,24,tl);
3143     emit_writebyte_indexed(tl,-3,temp);
3144     if(rs2[i]) emit_rorimm(tl,8,tl);
3145   }
3146   if (opcode[i]==0x2E) { // SWR
3147     // Write entire word
3148     emit_writeword_indexed(tl,-3,temp);
3149   }
3150   if (opcode[i]==0x2C) { // SDL
3151     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3152     // Write msb into least significant byte
3153     if(rs2[i]) emit_rorimm(th,24,th);
3154     emit_writebyte_indexed(th,-3,temp);
3155     if(rs2[i]) emit_rorimm(th,8,th);
3156   }
3157   if (opcode[i]==0x2D) { // SDR
3158     if(rs2[i]) emit_mov(th,temp2);
3159     // Write entire word
3160     emit_writeword_indexed(tl,-3,temp);
3161   }
3162   set_jump_target(done0,(int)out);
3163   set_jump_target(done1,(int)out);
3164   set_jump_target(done2,(int)out);
3165   if (opcode[i]==0x2C) { // SDL
3166     emit_testimm(temp,4);
3167     done0=(int)out;
3168     emit_jne(0);
3169     emit_andimm(temp,~3,temp);
3170     emit_writeword_indexed(temp2,4,temp);
3171     set_jump_target(done0,(int)out);
3172   }
3173   if (opcode[i]==0x2D) { // SDR
3174     emit_testimm(temp,4);
3175     done0=(int)out;
3176     emit_jeq(0);
3177     emit_andimm(temp,~3,temp);
3178     emit_writeword_indexed(temp2,-4,temp);
3179     set_jump_target(done0,(int)out);
3180   }
3181   if(!c||!memtarget)
3182     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3183   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3184     #ifdef RAM_OFFSET
3185     int map=get_reg(i_regs->regmap,ROREG);
3186     if(map<0) map=HOST_TEMPREG;
3187     gen_orig_addr_w(temp,map);
3188     #else
3189     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3190     #endif
3191     #if defined(HOST_IMM8)
3192     int ir=get_reg(i_regs->regmap,INVCP);
3193     assert(ir>=0);
3194     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3195     #else
3196     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3197     #endif
3198     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3199     emit_callne(invalidate_addr_reg[temp]);
3200     #else
3201     jaddr2=(int)out;
3202     emit_jne(0);
3203     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3204     #endif
3205   }
3206   /*
3207     emit_pusha();
3208     //save_regs(0x100f);
3209         emit_readword((int)&last_count,ECX);
3210         if(get_reg(i_regs->regmap,CCREG)<0)
3211           emit_loadreg(CCREG,HOST_CCREG);
3212         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3213         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3214         emit_writeword(HOST_CCREG,(int)&Count);
3215     emit_call((int)memdebug);
3216     emit_popa();
3217     //restore_regs(0x100f);
3218   /**/
3219 }
3220
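// The PSX has no COP1/FPU, so LWC1/SWC1 are treated as coprocessor
// unusable.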
3221 void c1ls_assemble(int i,struct regstat *i_regs)
3222 {
3223   cop1_unusable(i, i_regs);
3224 }
3225
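// Emit host code for GTE loads/stores (LWC2/SWC2), moving data between
// memory and the coprocessor 2 data registers through a temporary.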
3226 void c2ls_assemble(int i,struct regstat *i_regs)
3227 {
3228   int s,tl;
3229   int ar;
3230   int offset;
3231   int memtarget=0,c=0;
3232   int jaddr2=0,jaddr3,type;
3233   int agr=AGEN1+(i&1);
3234   int fastio_reg_override=0;
3235   u_int hr,reglist=0;
3236   u_int copr=(source[i]>>16)&0x1f;
3237   s=get_reg(i_regs->regmap,rs1[i]);
3238   tl=get_reg(i_regs->regmap,FTEMP);
3239   offset=imm[i];
3240   assert(rs1[i]>0);
3241   assert(tl>=0);
3242
3243   for(hr=0;hr<HOST_REGS;hr++) {
3244     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3245   }
3246   if(i_regs->regmap[HOST_CCREG]==CCREG)
3247     reglist&=~(1<<HOST_CCREG);
3248
3249   // get the address
3250   if (opcode[i]==0x3a) { // SWC2
3251     ar=get_reg(i_regs->regmap,agr);
3252     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3253     reglist|=1<<ar;
3254   } else { // LWC2
3255     ar=tl;
3256   }
3257   if(s>=0) c=(i_regs->wasconst>>s)&1;
3258   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3259   if (!offset&&!c&&s>=0) ar=s;
3260   assert(ar>=0);
3261
3262   if (opcode[i]==0x3a) { // SWC2
3263     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3264     type=STOREW_STUB;
3265   }
3266   else
3267     type=LOADW_STUB;
3268
3269   if(c&&!memtarget) {
3270     jaddr2=(int)out;
3271     emit_jmp(0); // inline_readstub/inline_writestub?
3272   }
3273   else {
3274     if(!c) {
3275       jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
3276     }
3277     else if(ram_offset&&memtarget) {
3278       emit_addimm(ar,ram_offset,HOST_TEMPREG);
3279       fastio_reg_override=HOST_TEMPREG;
3280     }
3281     if (opcode[i]==0x32) { // LWC2
3282       #ifdef HOST_IMM_ADDR32
3283       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3284       else
3285       #endif
3286       int a=ar;
3287       if(fastio_reg_override) a=fastio_reg_override;
3288       emit_readword_indexed(0,a,tl);
3289     }
3290     if (opcode[i]==0x3a) { // SWC2
3291       #ifdef DESTRUCTIVE_SHIFT
3292       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3293       #endif
3294       int a=ar;
3295       if(fastio_reg_override) a=fastio_reg_override;
3296       emit_writeword_indexed(tl,0,a);
3297     }
3298   }
3299   if(jaddr2)
3300     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3301   if(opcode[i]==0x3a) // SWC2
3302   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3303 #if defined(HOST_IMM8)
3304     int ir=get_reg(i_regs->regmap,INVCP);
3305     assert(ir>=0);
3306     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3307 #else
3308     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3309 #endif
3310     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3311     emit_callne(invalidate_addr_reg[ar]);
3312     #else
3313     jaddr3=(int)out;
3314     emit_jne(0);
3315     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3316     #endif
3317   }
3318   if (opcode[i]==0x32) { // LWC2
3319     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3320   }
3321 }
3322
3323 #ifndef multdiv_assemble
3324 void multdiv_assemble(int i,struct regstat *i_regs)
3325 {
3326   printf("Need multdiv_assemble for this architecture.\n");
3327   exit(1);
3328 }
3329 #endif
3330
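// Emit host code for HI/LO moves (MFHI/MFLO/MTHI/MTLO), which reduce to
// plain register-to-register copies.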
3331 void mov_assemble(int i,struct regstat *i_regs)
3332 {
3333   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3334   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3335   if(rt1[i]) {
3336     signed char sh,sl,th,tl;
3337     th=get_reg(i_regs->regmap,rt1[i]|64);
3338     tl=get_reg(i_regs->regmap,rt1[i]);
3339     //assert(tl>=0);
3340     if(tl>=0) {
3341       sh=get_reg(i_regs->regmap,rs1[i]|64);
3342       sl=get_reg(i_regs->regmap,rs1[i]);
3343       if(sl>=0) emit_mov(sl,tl);
3344       else emit_loadreg(rs1[i],tl);
3345       if(th>=0) {
3346         if(sh>=0) emit_mov(sh,th);
3347         else emit_loadreg(rs1[i]|64,th);
3348       }
3349     }
3350   }
3351 }
3352
3353 #ifndef fconv_assemble
3354 void fconv_assemble(int i,struct regstat *i_regs)
3355 {
3356   printf("Need fconv_assemble for this architecture.\n");
3357   exit(1);
3358 }
3359 #endif
3360
3361 #if 0
3362 void float_assemble(int i,struct regstat *i_regs)
3363 {
3364   printf("Need float_assemble for this architecture.\n");
3365   exit(1);
3366 }
3367 #endif
3368
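// SYSCALL/HLE/interpreter-call handlers: load the PC of the affected
// instruction, add the cycle count, and jump out to the corresponding
// handler.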
3369 void syscall_assemble(int i,struct regstat *i_regs)
3370 {
3371   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3372   assert(ccreg==HOST_CCREG);
3373   assert(!is_delayslot);
3374   emit_movimm(start+i*4,EAX); // Get PC
3375   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3376   emit_jmp((int)jump_syscall_hle); // XXX
3377 }
3378
3379 void hlecall_assemble(int i,struct regstat *i_regs)
3380 {
3381   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3382   assert(ccreg==HOST_CCREG);
3383   assert(!is_delayslot);
3384   emit_movimm(start+i*4+4,0); // Get PC
3385   emit_movimm((int)psxHLEt[source[i]&7],1);
3386   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
3387   emit_jmp((int)jump_hlecall);
3388 }
3389
3390 void intcall_assemble(int i,struct regstat *i_regs)
3391 {
3392   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3393   assert(ccreg==HOST_CCREG);
3394   assert(!is_delayslot);
3395   emit_movimm(start+i*4,0); // Get PC
3396   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
3397   emit_jmp((int)jump_intcall);
3398 }
3399
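// Assemble the instruction occupying a branch delay slot, dispatching on
// its decoded type.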
3400 void ds_assemble(int i,struct regstat *i_regs)
3401 {
3402   speculate_register_values(i);
3403   is_delayslot=1;
3404   switch(itype[i]) {
3405     case ALU:
3406       alu_assemble(i,i_regs);break;
3407     case IMM16:
3408       imm16_assemble(i,i_regs);break;
3409     case SHIFT:
3410       shift_assemble(i,i_regs);break;
3411     case SHIFTIMM:
3412       shiftimm_assemble(i,i_regs);break;
3413     case LOAD:
3414       load_assemble(i,i_regs);break;
3415     case LOADLR:
3416       loadlr_assemble(i,i_regs);break;
3417     case STORE:
3418       store_assemble(i,i_regs);break;
3419     case STORELR:
3420       storelr_assemble(i,i_regs);break;
3421     case COP0:
3422       cop0_assemble(i,i_regs);break;
3423     case COP1:
3424       cop1_assemble(i,i_regs);break;
3425     case C1LS:
3426       c1ls_assemble(i,i_regs);break;
3427     case COP2:
3428       cop2_assemble(i,i_regs);break;
3429     case C2LS:
3430       c2ls_assemble(i,i_regs);break;
3431     case C2OP:
3432       c2op_assemble(i,i_regs);break;
3433     case FCONV:
3434       fconv_assemble(i,i_regs);break;
3435     case FLOAT:
3436       float_assemble(i,i_regs);break;
3437     case FCOMP:
3438       fcomp_assemble(i,i_regs);break;
3439     case MULTDIV:
3440       multdiv_assemble(i,i_regs);break;
3441     case MOV:
3442       mov_assemble(i,i_regs);break;
3443     case SYSCALL:
3444     case HLECALL:
3445     case INTCALL:
3446     case SPAN:
3447     case UJUMP:
3448     case RJUMP:
3449     case CJUMP:
3450     case SJUMP:
3451     case FJUMP:
3452       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
3453   }
3454   is_delayslot=0;
3455 }
3456
3457 // Is the branch target a valid internal jump?
3458 int internal_branch(uint64_t i_is32,int addr)
3459 {
3460   if(addr&1) return 0; // Indirect (register) jump
3461   if(addr>=start && addr<start+slen*4-4)
3462   {
3463     //int t=(addr-start)>>2;
3464     // Delay slots are not valid branch targets
3465     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3466     // 64 -> 32 bit transition requires a recompile
3467     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3468     {
3469       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3470       else printf("optimizable: yes\n");
3471     }*/
3472     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3473     return 1;
3474   }
3475   return 0;
3476 }
3477
3478 #ifndef wb_invalidate
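// Write back dirty values held in host registers that the next register map
// reassigns, and move any surviving values to their new host registers.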
3479 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3480   uint64_t u,uint64_t uu)
3481 {
3482   int hr;
3483   for(hr=0;hr<HOST_REGS;hr++) {
3484     if(hr!=EXCLUDE_REG) {
3485       if(pre[hr]!=entry[hr]) {
3486         if(pre[hr]>=0) {
3487           if((dirty>>hr)&1) {
3488             if(get_reg(entry,pre[hr])<0) {
3489               if(pre[hr]<64) {
3490                 if(!((u>>pre[hr])&1)) {
3491                   emit_storereg(pre[hr],hr);
3492                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3493                     emit_sarimm(hr,31,hr);
3494                     emit_storereg(pre[hr]|64,hr);
3495                   }
3496                 }
3497               }else{
3498                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3499                   emit_storereg(pre[hr],hr);
3500                 }
3501               }
3502             }
3503           }
3504         }
3505       }
3506     }
3507   }
3508   // Move from one register to another (no writeback)
3509   for(hr=0;hr<HOST_REGS;hr++) {
3510     if(hr!=EXCLUDE_REG) {
3511       if(pre[hr]!=entry[hr]) {
3512         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3513           int nr;
3514           if((nr=get_reg(entry,pre[hr]))>=0) {
3515             emit_mov(hr,nr);
3516           }
3517         }
3518       }
3519     }
3520   }
3521 }
3522 #endif
3523
3524 // Load the specified registers
3525 // This only loads the registers given as arguments because
3526 // we don't want to load things that will be overwritten
3527 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3528 {
3529   int hr;
3530   // Load 32-bit regs
3531   for(hr=0;hr<HOST_REGS;hr++) {
3532     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3533       if(entry[hr]!=regmap[hr]) {
3534         if(regmap[hr]==rs1||regmap[hr]==rs2)
3535         {
3536           if(regmap[hr]==0) {
3537             emit_zeroreg(hr);
3538           }
3539           else
3540           {
3541             emit_loadreg(regmap[hr],hr);
3542           }
3543         }
3544       }
3545     }
3546   }
3547   // Load 64-bit regs
3548   for(hr=0;hr<HOST_REGS;hr++) {
3549     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3550       if(entry[hr]!=regmap[hr]) {
3551         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3552         {
3553           assert(regmap[hr]!=64);
3554           if((is32>>(regmap[hr]&63))&1) {
3555             int lr=get_reg(regmap,regmap[hr]-64);
3556             if(lr>=0)
3557               emit_sarimm(lr,31,hr);
3558             else
3559               emit_loadreg(regmap[hr],hr);
3560           }
3561           else
3562           {
3563             emit_loadreg(regmap[hr],hr);
3564           }
3565         }
3566       }
3567     }
3568   }
3569 }
3570
3571 // Load registers prior to the start of a loop
3572 // so that they are not loaded within the loop
3573 static void loop_preload(signed char pre[],signed char entry[])
3574 {
3575   int hr;
3576   for(hr=0;hr<HOST_REGS;hr++) {
3577     if(hr!=EXCLUDE_REG) {
3578       if(pre[hr]!=entry[hr]) {
3579         if(entry[hr]>=0) {
3580           if(get_reg(pre,entry[hr])<0) {
3581             assem_debug("loop preload:\n");
3582             //printf("loop preload: %d\n",hr);
3583             if(entry[hr]==0) {
3584               emit_zeroreg(hr);
3585             }
3586             else if(entry[hr]<TEMPREG)
3587             {
3588               emit_loadreg(entry[hr],hr);
3589             }
3590             else if(entry[hr]-64<TEMPREG)
3591             {
3592               emit_loadreg(entry[hr],hr);
3593             }
3594           }
3595         }
3596       }
3597     }
3598   }
3599 }
3600
3601 // Generate the address for a load/store instruction.
3602 // The address goes to AGEN for writes, and to FTEMP for LOADLR and cop1/2 loads.
3603 void address_generation(int i,struct regstat *i_regs,signed char entry[])
3604 {
3605   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
3606     int ra=-1;
3607     int agr=AGEN1+(i&1);
3608     if(itype[i]==LOAD) {
3609       ra=get_reg(i_regs->regmap,rt1[i]);
3610       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3611       assert(ra>=0);
3612     }
3613     if(itype[i]==LOADLR) {
3614       ra=get_reg(i_regs->regmap,FTEMP);
3615     }
3616     if(itype[i]==STORE||itype[i]==STORELR) {
3617       ra=get_reg(i_regs->regmap,agr);
3618       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3619     }
3620     if(itype[i]==C1LS||itype[i]==C2LS) {
3621       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
3622         ra=get_reg(i_regs->regmap,FTEMP);
3623       else { // SWC1/SDC1/SWC2/SDC2
3624         ra=get_reg(i_regs->regmap,agr);
3625         if(ra<0) ra=get_reg(i_regs->regmap,-1);
3626       }
3627     }
3628     int rs=get_reg(i_regs->regmap,rs1[i]);
3629     if(ra>=0) {
3630       int offset=imm[i];
3631       int c=(i_regs->wasconst>>rs)&1;
3632       if(rs1[i]==0) {
3633         // Using r0 as a base address
3634         if(!entry||entry[ra]!=agr) {
3635           if (opcode[i]==0x22||opcode[i]==0x26) {
3636             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3637           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3638             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3639           }else{
3640             emit_movimm(offset,ra);
3641           }
3642         } // else did it in the previous cycle
3643       }
3644       else if(rs<0) {
3645         if(!entry||entry[ra]!=rs1[i])
3646           emit_loadreg(rs1[i],ra);
3647         //if(!entry||entry[ra]!=rs1[i])
3648         //  printf("poor load scheduling!\n");
3649       }
3650       else if(c) {
3651         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
3652           if(!entry||entry[ra]!=agr) {
3653             if (opcode[i]==0x22||opcode[i]==0x26) {
3654               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3655             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3656               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3657             }else{
3658               #ifdef HOST_IMM_ADDR32
3659               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3660               #endif
3661               emit_movimm(constmap[i][rs]+offset,ra);
3662               regs[i].loadedconst|=1<<ra;
3663             }
3664           } // else did it in the previous cycle
3665         } // else load_consts already did it
3666       }
3667       if(offset&&!c&&rs1[i]) {
3668         if(rs>=0) {
3669           emit_addimm(rs,offset,ra);
3670         }else{
3671           emit_addimm(ra,offset,ra);
3672         }
3673       }
3674     }
3675   }
3676   // Preload constants for next instruction
3677   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
3678     int agr,ra;
3679     // Actual address
3680     agr=AGEN1+((i+1)&1);
3681     ra=get_reg(i_regs->regmap,agr);
3682     if(ra>=0) {
3683       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3684       int offset=imm[i+1];
3685       int c=(regs[i+1].wasconst>>rs)&1;
3686       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
3687         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3688           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3689         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3690           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3691         }else{
3692           #ifdef HOST_IMM_ADDR32
3693           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3694           #endif
3695           emit_movimm(constmap[i+1][rs]+offset,ra);
3696           regs[i+1].loadedconst|=1<<ra;
3697         }
3698       }
3699       else if(rs1[i+1]==0) {
3700         // Using r0 as a base address
3701         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3702           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3703         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3704           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3705         }else{
3706           emit_movimm(offset,ra);
3707         }
3708       }
3709     }
3710   }
3711 }
3712
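// Look ahead from instruction i to find the last value that host register hr
// will hold while it remains the same constant, so one immediate load can
// cover the whole run.  Returns 0 if the value turns out to be unneeded.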
3713 static int get_final_value(int hr, int i, int *value)
3714 {
3715   int reg=regs[i].regmap[hr];
3716   while(i<slen-1) {
3717     if(regs[i+1].regmap[hr]!=reg) break;
3718     if(!((regs[i+1].isconst>>hr)&1)) break;
3719     if(bt[i+1]) break;
3720     i++;
3721   }
3722   if(i<slen-1) {
3723     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
3724       *value=constmap[i][hr];
3725       return 1;
3726     }
3727     if(!bt[i+1]) {
3728       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
3729         // Load in delay slot, out-of-order execution
3730         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
3731         {
3732           // Precompute load address
3733           *value=constmap[i][hr]+imm[i+2];
3734           return 1;
3735         }
3736       }
3737       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
3738       {
3739         // Precompute load address
3740         *value=constmap[i][hr]+imm[i+1];
3741         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
3742         return 1;
3743       }
3744     }
3745   }
3746   *value=constmap[i][hr];
3747   //printf("c=%x\n",(int)constmap[i][hr]);
3748   if(i==slen-1) return 1;
3749   if(reg<64) {
3750     return !((unneeded_reg[i+1]>>reg)&1);
3751   }else{
3752     return !((unneeded_reg_upper[i+1]>>reg)&1);
3753   }
3754 }
3755
3756 // Load registers with known constants
3757 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
3758 {
3759   int hr,hr2;
3760   // propagate loaded constant flags
3761   if(i==0||bt[i])
3762     regs[i].loadedconst=0;
3763   else {
3764     for(hr=0;hr<HOST_REGS;hr++) {
3765       if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
3766          &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
3767       {
3768         regs[i].loadedconst|=1<<hr;
3769       }
3770     }
3771   }
3772   // Load 32-bit regs
3773   for(hr=0;hr<HOST_REGS;hr++) {
3774     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3775       //if(entry[hr]!=regmap[hr]) {
3776       if(!((regs[i].loadedconst>>hr)&1)) {
3777         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3778           int value,similar=0;
3779           if(get_final_value(hr,i,&value)) {
3780             // see if some other register already holds a similar value
3781             for(hr2=0;hr2<HOST_REGS;hr2++) {
3782               if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
3783                 if(is_similar_value(value,constmap[i][hr2])) {
3784                   similar=1;
3785                   break;
3786                 }
3787               }
3788             }
3789             if(similar) {
3790               int value2;
3791               if(get_final_value(hr2,i,&value2)) // is this needed?
3792                 emit_movimm_from(value2,hr2,value,hr);
3793               else
3794                 emit_movimm(value,hr);
3795             }
3796             else if(value==0) {
3797               emit_zeroreg(hr);
3798             }
3799             else {
3800               emit_movimm(value,hr);
3801             }
3802           }
3803           regs[i].loadedconst|=1<<hr;
3804         }
3805       }
3806     }
3807   }
3808   // Load 64-bit regs
3809   for(hr=0;hr<HOST_REGS;hr++) {
3810     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3811       //if(entry[hr]!=regmap[hr]) {
3812       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3813         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3814           if((is32>>(regmap[hr]&63))&1) {
3815             int lr=get_reg(regmap,regmap[hr]-64);
3816             assert(lr>=0);
3817             emit_sarimm(lr,31,hr);
3818           }
3819           else
3820           {
3821             int value;
3822             if(get_final_value(hr,i,&value)) {
3823               if(value==0) {
3824                 emit_zeroreg(hr);
3825               }
3826               else {
3827                 emit_movimm(value,hr);
3828               }
3829             }
3830           }
3831         }
3832       }
3833     }
3834   }
3835 }
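// Like load_consts, but materializes every constant register marked dirty,
// without the final-value look-ahead.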
3836 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
3837 {
3838   int hr;
3839   // Load 32-bit regs
3840   for(hr=0;hr<HOST_REGS;hr++) {
3841     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3842       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3843         int value=constmap[i][hr];
3844         if(value==0) {
3845           emit_zeroreg(hr);
3846         }
3847         else {
3848           emit_movimm(value,hr);
3849         }
3850       }
3851     }
3852   }
3853   // Load 64-bit regs
3854   for(hr=0;hr<HOST_REGS;hr++) {
3855     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3856       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3857         if((is32>>(regmap[hr]&63))&1) {
3858           int lr=get_reg(regmap,regmap[hr]-64);
3859           assert(lr>=0);
3860           emit_sarimm(lr,31,hr);
3861         }
3862         else
3863         {
3864           int value=constmap[i][hr];
3865           if(value==0) {
3866             emit_zeroreg(hr);
3867           }
3868           else {
3869             emit_movimm(value,hr);
3870           }
3871         }
3872       }
3873     }
3874   }
3875 }
3876
3877 // Write out all dirty registers (except cycle count)
3878 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
3879 {
3880   int hr;
3881   for(hr=0;hr<HOST_REGS;hr++) {
3882     if(hr!=EXCLUDE_REG) {
3883       if(i_regmap[hr]>0) {
3884         if(i_regmap[hr]!=CCREG) {
3885           if((i_dirty>>hr)&1) {
3886             if(i_regmap[hr]<64) {
3887               emit_storereg(i_regmap[hr],hr);
3888             }else{
3889               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
3890                 emit_storereg(i_regmap[hr],hr);
3891               }
3892             }
3893           }
3894         }
3895       }
3896     }
3897   }
3898 }
3899 // Write out dirty registers that we need to reload (pair with load_needed_regs)
3900 // This writes the registers not written by store_regs_bt
3901 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
3902 {
3903   int hr;
3904   int t=(addr-start)>>2;
3905   for(hr=0;hr<HOST_REGS;hr++) {
3906     if(hr!=EXCLUDE_REG) {
3907       if(i_regmap[hr]>0) {
3908         if(i_regmap[hr]!=CCREG) {
3909           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
3910             if((i_dirty>>hr)&1) {
3911               if(i_regmap[hr]<64) {
3912                 emit_storereg(i_regmap[hr],hr);
3913               }else{
3914                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
3915                   emit_storereg(i_regmap[hr],hr);
3916                 }
3917               }
3918             }
3919           }
3920         }
3921       }
3922     }
3923   }
3924 }
3925
3926 // Load all registers (except cycle count)
3927 void load_all_regs(signed char i_regmap[])
3928 {
3929   int hr;
3930   for(hr=0;hr<HOST_REGS;hr++) {
3931     if(hr!=EXCLUDE_REG) {
3932       if(i_regmap[hr]==0) {
3933         emit_zeroreg(hr);
3934       }
3935       else
3936       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
3937       {
3938         emit_loadreg(i_regmap[hr],hr);
3939       }
3940     }
3941   }
3942 }
3943
3944 // Load all current registers also needed by next instruction
3945 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
3946 {
3947   int hr;
3948   for(hr=0;hr<HOST_REGS;hr++) {
3949     if(hr!=EXCLUDE_REG) {
3950       if(get_reg(next_regmap,i_regmap[hr])>=0) {
3951         if(i_regmap[hr]==0) {
3952           emit_zeroreg(hr);
3953         }
3954         else
3955         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
3956         {
3957           emit_loadreg(i_regmap[hr],hr);
3958         }
3959       }
3960     }
3961   }
3962 }
3963
3964 // Load all regs, storing cycle count if necessary
3965 void load_regs_entry(int t)
3966 {
3967   int hr;
3968   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
3969   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
3970   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
3971     emit_storereg(CCREG,HOST_CCREG);
3972   }
3973   // Load 32-bit regs
3974   for(hr=0;hr<HOST_REGS;hr++) {
3975     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
3976       if(regs[t].regmap_entry[hr]==0) {
3977         emit_zeroreg(hr);
3978       }
3979       else if(regs[t].regmap_entry[hr]!=CCREG)
3980       {
3981         emit_loadreg(regs[t].regmap_entry[hr],hr);
3982       }
3983     }
3984   }
3985   // Load 64-bit regs
3986   for(hr=0;hr<HOST_REGS;hr++) {
3987     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
3988       assert(regs[t].regmap_entry[hr]!=64);
3989       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
3990         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
3991         if(lr<0) {
3992           emit_loadreg(regs[t].regmap_entry[hr],hr);
3993         }
3994         else
3995         {
3996           emit_sarimm(lr,31,hr);
3997         }
3998       }
3999       else
4000       {
4001         emit_loadreg(regs[t].regmap_entry[hr],hr);
4002       }
4003     }
4004   }
4005 }
4006
4007 // Store dirty registers prior to branch
4008 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4009 {
4010   if(internal_branch(i_is32,addr))
4011   {
4012     int t=(addr-start)>>2;
4013     int hr;
4014     for(hr=0;hr<HOST_REGS;hr++) {
4015       if(hr!=EXCLUDE_REG) {
4016         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4017           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4018             if((i_dirty>>hr)&1) {
4019               if(i_regmap[hr]<64) {
4020                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4021                   emit_storereg(i_regmap[hr],hr);
4022                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4023                     #ifdef DESTRUCTIVE_WRITEBACK
4024                     emit_sarimm(hr,31,hr);
4025                     emit_storereg(i_regmap[hr]|64,hr);
4026                     #else
4027                     emit_sarimm(hr,31,HOST_TEMPREG);
4028                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4029                     #endif
4030                   }
4031                 }
4032               }else{
4033                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4034                   emit_storereg(i_regmap[hr],hr);
4035                 }
4036               }
4037             }
4038           }
4039         }
4040       }
4041     }
4042   }
4043   else
4044   {
4045     // Branch out of this block, write out all dirty regs
4046     wb_dirtys(i_regmap,i_is32,i_dirty);
4047   }
4048 }
4049
4050 // Load all needed registers for branch target
4051 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4052 {
4053   //if(addr>=start && addr<(start+slen*4))
4054   if(internal_branch(i_is32,addr))
4055   {
4056     int t=(addr-start)>>2;
4057     int hr;
4058     // Store the cycle count before loading something else
4059     if(i_regmap[HOST_CCREG]!=CCREG) {
4060       assert(i_regmap[HOST_CCREG]==-1);
4061     }
4062     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4063       emit_storereg(CCREG,HOST_CCREG);
4064     }
4065     // Load 32-bit regs
4066     for(hr=0;hr<HOST_REGS;hr++) {
4067       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4068         #ifdef DESTRUCTIVE_WRITEBACK
4069         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4070         #else
4071         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4072         #endif
4073           if(regs[t].regmap_entry[hr]==0) {
4074             emit_zeroreg(hr);
4075           }
4076           else if(regs[t].regmap_entry[hr]!=CCREG)
4077           {
4078             emit_loadreg(regs[t].regmap_entry[hr],hr);
4079           }
4080         }
4081       }
4082     }
4083     // Load 64-bit regs
4084     for(hr=0;hr<HOST_REGS;hr++) {
4085       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4086         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4087           assert(regs[t].regmap_entry[hr]!=64);
4088           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4089             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4090             if(lr<0) {
4091               emit_loadreg(regs[t].regmap_entry[hr],hr);
4092             }
4093             else
4094             {
4095               emit_sarimm(lr,31,hr);
4096             }
4097           }
4098           else
4099           {
4100             emit_loadreg(regs[t].regmap_entry[hr],hr);
4101           }
4102         }
4103         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4104           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4105           assert(lr>=0);
4106           emit_sarimm(lr,31,hr);
4107         }
4108       }
4109     }
4110   }
4111 }
4112
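// Check whether the current register mapping, dirty and 32-bit state already
// match the entry state of the branch target, so the branch can be linked
// directly without intervening writeback or reload code.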
4113 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4114 {
4115   if(addr>=start && addr<start+slen*4-4)
4116   {
4117     int t=(addr-start)>>2;
4118     int hr;
4119     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4120     for(hr=0;hr<HOST_REGS;hr++)
4121     {
4122       if(hr!=EXCLUDE_REG)
4123       {
4124         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4125         {
4126           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4127           {
4128             return 0;
4129           }
4130           else
4131           if((i_dirty>>hr)&1)
4132           {
4133             if(i_regmap[hr]<TEMPREG)
4134             {
4135               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4136                 return 0;
4137             }
4138             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4139             {
4140               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4141                 return 0;
4142             }
4143           }
4144         }
4145         else // Same register but is it 32-bit or dirty?
4146         if(i_regmap[hr]>=0)
4147         {
4148           if(!((regs[t].dirty>>hr)&1))
4149           {
4150             if((i_dirty>>hr)&1)
4151             {
4152               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4153               {
4154                 //printf("%x: dirty no match\n",addr);
4155                 return 0;
4156               }
4157             }
4158           }
4159           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4160           {
4161             //printf("%x: is32 no match\n",addr);
4162             return 0;
4163           }
4164         }
4165       }
4166     }
4167     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4168     // Delay slots are not valid branch targets
4169     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4170     // Delay slots require additional processing, so do not match
4171     if(is_ds[t]) return 0;
4172   }
4173   else
4174   {
4175     int hr;
4176     for(hr=0;hr<HOST_REGS;hr++)
4177     {
4178       if(hr!=EXCLUDE_REG)
4179       {
4180         if(i_regmap[hr]>=0)
4181         {
4182           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4183           {
4184             if((i_dirty>>hr)&1)
4185             {
4186               return 0;
4187             }
4188           }
4189         }
4190       }
4191     }
4192   }
4193   return 1;
4194 }
4195
4196 // Used when a branch jumps into the delay slot of another branch
4197 void ds_assemble_entry(int i)
4198 {
4199   int t=(ba[i]-start)>>2;
4200   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4201   assem_debug("Assemble delay slot at %x\n",ba[i]);
4202   assem_debug("<->\n");
4203   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4204     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4205   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4206   address_generation(t,&regs[t],regs[t].regmap_entry);
4207   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4208     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4209   cop1_usable=0;
4210   is_delayslot=0;
4211   switch(itype[t]) {
4212     case ALU:
4213       alu_assemble(t,&regs[t]);break;
4214     case IMM16:
4215       imm16_assemble(t,&regs[t]);break;
4216     case SHIFT:
4217       shift_assemble(t,&regs[t]);break;
4218     case SHIFTIMM:
4219       shiftimm_assemble(t,&regs[t]);break;
4220     case LOAD:
4221       load_assemble(t,&regs[t]);break;
4222     case LOADLR:
4223       loadlr_assemble(t,&regs[t]);break;
4224     case STORE:
4225       store_assemble(t,&regs[t]);break;
4226     case STORELR:
4227       storelr_assemble(t,&regs[t]);break;
4228     case COP0:
4229       cop0_assemble(t,&regs[t]);break;
4230     case COP1:
4231       cop1_assemble(t,&regs[t]);break;
4232     case C1LS:
4233       c1ls_assemble(t,&regs[t]);break;
4234     case COP2:
4235       cop2_assemble(t,&regs[t]);break;
4236     case C2LS:
4237       c2ls_assemble(t,&regs[t]);break;
4238     case C2OP:
4239       c2op_assemble(t,&regs[t]);break;
4240     case FCONV:
4241       fconv_assemble(t,&regs[t]);break;
4242     case FLOAT:
4243       float_assemble(t,&regs[t]);break;
4244     case FCOMP:
4245       fcomp_assemble(t,&regs[t]);break;
4246     case MULTDIV:
4247       multdiv_assemble(t,&regs[t]);break;
4248     case MOV:
4249       mov_assemble(t,&regs[t]);break;
4250     case SYSCALL:
4251     case HLECALL:
4252     case INTCALL:
4253     case SPAN:
4254     case UJUMP:
4255     case RJUMP:
4256     case CJUMP:
4257     case SJUMP:
4258     case FJUMP:
4259       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
4260   }
4261   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4262   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4263   if(internal_branch(regs[t].is32,ba[i]+4))
4264     assem_debug("branch: internal\n");
4265   else
4266     assem_debug("branch: external\n");
4267   assert(internal_branch(regs[t].is32,ba[i]+4));
4268   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4269   emit_jmp(0);
4270 }
4271
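// Emit the cycle-count check for a branch: update the count and branch to a
// CC_STUB when it expires so cc_interrupt can run; detected idle loops are
// special-cased.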
4272 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4273 {
4274   int count;
4275   int jaddr;
4276   int idle=0;
4277   int t=0;
4278   if(itype[i]==RJUMP)
4279   {
4280     *adj=0;
4281   }
4282   //if(ba[i]>=start && ba[i]<(start+slen*4))
4283   if(internal_branch(branch_regs[i].is32,ba[i]))
4284   {
4285     t=(ba[i]-start)>>2;
4286     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4287     else *adj=ccadj[t];
4288   }
4289   else
4290   {
4291     *adj=0;
4292   }
4293   count=ccadj[i];
4294   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4295     // Idle loop
4296     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4297     idle=(int)out;
4298     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4299     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4300     jaddr=(int)out;
4301     emit_jmp(0);
4302   }
4303   else if(*adj==0||invert) {
4304     int cycles=CLOCK_ADJUST(count+2);
4305     // faster loop HACK
4306     if (t&&*adj) {
4307       int rel=t-i;
4308       if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
4309         cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
4310     }
4311     emit_addimm_and_set_flags(cycles,HOST_CCREG);
4312     jaddr=(int)out;
4313     emit_jns(0);
4314   }
4315   else
4316   {
4317     emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
4318     jaddr=(int)out;
4319     emit_jns(0);
4320   }
4321   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4322 }
4323
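// Out-of-line code for a CC_STUB: write back registers, store the return PC
// (recomputed per branch direction if needed), call cc_interrupt, then reload
// registers and jump back into the compiled block.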
4324 void do_ccstub(int n)
4325 {
4326   literal_pool(256);
4327   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4328   set_jump_target(stubs[n][1],(int)out);
4329   int i=stubs[n][4];
4330   if(stubs[n][6]==NULLDS) {
4331     // Delay slot instruction is nullified ("likely" branch)
4332     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4333   }
4334   else if(stubs[n][6]!=TAKEN) {
4335     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4336   }
4337   else {
4338     if(internal_branch(branch_regs[i].is32,ba[i]))
4339       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4340   }
4341   if(stubs[n][5]!=-1)
4342   {
4343     // Save PC as return address
4344     emit_movimm(stubs[n][5],EAX);
4345     emit_writeword(EAX,(int)&pcaddr);
4346   }
4347   else
4348   {
4349     // Return address depends on which way the branch goes
4350     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4351     {
4352       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4353       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4354       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4355       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4356       if(rs1[i]==0)
4357       {
4358         s1l=s2l;s1h=s2h;
4359         s2l=s2h=-1;
4360       }
4361       else if(rs2[i]==0)
4362       {
4363         s2l=s2h=-1;
4364       }
4365       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4366         s1h=s2h=-1;
4367       }
4368       assert(s1l>=0);
4369       #ifdef DESTRUCTIVE_WRITEBACK
4370       if(rs1[i]) {
4371         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4372           emit_loadreg(rs1[i],s1l);
4373       }
4374       else {
4375         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4376           emit_loadreg(rs2[i],s1l);
4377       }
4378       if(s2l>=0)
4379         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4380           emit_loadreg(rs2[i],s2l);
4381       #endif
4382       int hr=0;
4383       int addr=-1,alt=-1,ntaddr=-1;
4384       while(hr<HOST_REGS)
4385       {
4386         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4387            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4388            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4389         {
4390           addr=hr++;break;
4391         }
4392         hr++;
4393       }
4394       while(hr<HOST_REGS)
4395       {
4396         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4397            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4398            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4399         {
4400           alt=hr++;break;
4401         }
4402         hr++;
4403       }
4404       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4405       {
4406         while(hr<HOST_REGS)
4407         {
4408           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4409              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4410              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4411           {
4412             ntaddr=hr;break;
4413           }
4414           hr++;
4415         }
4416         assert(hr<HOST_REGS);
4417       }
4418       if((opcode[i]&0x2f)==4) // BEQ
4419       {
4420         #ifdef HAVE_CMOV_IMM
4421         if(s1h<0) {
4422           if(s2l>=0) emit_cmp(s1l,s2l);
4423           else emit_test(s1l,s1l);
4424           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4425         }
4426         else
4427         #endif
4428         {
4429           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4430           if(s1h>=0) {
4431             if(s2h>=0) emit_cmp(s1h,s2h);
4432             else emit_test(s1h,s1h);
4433             emit_cmovne_reg(alt,addr);
4434           }
4435           if(s2l>=0) emit_cmp(s1l,s2l);
4436           else emit_test(s1l,s1l);
4437           emit_cmovne_reg(alt,addr);
4438         }
4439       }
4440       if((opcode[i]&0x2f)==5) // BNE
4441       {
4442         #ifdef HAVE_CMOV_IMM
4443         if(s1h<0) {
4444           if(s2l>=0) emit_cmp(s1l,s2l);
4445           else emit_test(s1l,s1l);
4446           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4447         }
4448         else
4449         #endif
4450         {
4451           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4452           if(s1h>=0) {
4453             if(s2h>=0) emit_cmp(s1h,s2h);
4454             else emit_test(s1h,s1h);
4455             emit_cmovne_reg(alt,addr);
4456           }
4457           if(s2l>=0) emit_cmp(s1l,s2l);
4458           else emit_test(s1l,s1l);
4459           emit_cmovne_reg(alt,addr);
4460         }
4461       }
4462       if((opcode[i]&0x2f)==6) // BLEZ
4463       {
4464         //emit_movimm(ba[i],alt);
4465         //emit_movimm(start+i*4+8,addr);
4466         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4467         emit_cmpimm(s1l,1);
4468         if(s1h>=0) emit_mov(addr,ntaddr);
4469         emit_cmovl_reg(alt,addr);
4470         if(s1h>=0) {
4471           emit_test(s1h,s1h);
4472           emit_cmovne_reg(ntaddr,addr);
4473           emit_cmovs_reg(alt,addr);
4474         }
4475       }
4476       if((opcode[i]&0x2f)==7) // BGTZ
4477       {
4478         //emit_movimm(ba[i],addr);
4479         //emit_movimm(start+i*4+8,ntaddr);
4480         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4481         emit_cmpimm(s1l,1);
4482         if(s1h>=0) emit_mov(addr,alt);
4483         emit_cmovl_reg(ntaddr,addr);
4484         if(s1h>=0) {
4485           emit_test(s1h,s1h);
4486           emit_cmovne_reg(alt,addr);
4487           emit_cmovs_reg(ntaddr,addr);
4488         }
4489       }
4490       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4491       {
4492         //emit_movimm(ba[i],alt);
4493         //emit_movimm(start+i*4+8,addr);
4494         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4495         if(s1h>=0) emit_test(s1h,s1h);
4496         else emit_test(s1l,s1l);
4497         emit_cmovs_reg(alt,addr);
4498       }
4499       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4500       {
4501         //emit_movimm(ba[i],addr);
4502         //emit_movimm(start+i*4+8,alt);
4503         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4504         if(s1h>=0) emit_test(s1h,s1h);
4505         else emit_test(s1l,s1l);
4506         emit_cmovs_reg(alt,addr);
4507       }
4508       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4509         if(source[i]&0x10000) // BC1T
4510         {
4511           //emit_movimm(ba[i],alt);
4512           //emit_movimm(start+i*4+8,addr);
4513           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4514           emit_testimm(s1l,0x800000);
4515           emit_cmovne_reg(alt,addr);
4516         }
4517         else // BC1F
4518         {
4519           //emit_movimm(ba[i],addr);
4520           //emit_movimm(start+i*4+8,alt);
4521           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4522           emit_testimm(s1l,0x800000);
4523           emit_cmovne_reg(alt,addr);
4524         }
4525       }
4526       emit_writeword(addr,(int)&pcaddr);
4527     }
4528     else
4529     if(itype[i]==RJUMP)
4530     {
4531       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4532       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4533         r=get_reg(branch_regs[i].regmap,RTEMP);
4534       }
4535       emit_writeword(r,(int)&pcaddr);
4536     }
4537     else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);}
4538   }
4539   // Update cycle count
4540   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4541   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
4542   emit_call((int)cc_interrupt);
4543   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
4544   if(stubs[n][6]==TAKEN) {
4545     if(internal_branch(branch_regs[i].is32,ba[i]))
4546       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4547     else if(itype[i]==RJUMP) {
4548       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4549         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4550       else
4551         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4552     }
4553   }else if(stubs[n][6]==NOTTAKEN) {
4554     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4555     else load_all_regs(branch_regs[i].regmap);
4556   }else if(stubs[n][6]==NULLDS) {
4557     // Delay slot instruction is nullified ("likely" branch)
4558     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4559     else load_all_regs(regs[i].regmap);
4560   }else{
4561     load_all_regs(branch_regs[i].regmap);
4562   }
4563   emit_jmp(stubs[n][2]); // return address
4564
4565   /* This works but uses a lot of memory...
4566   emit_readword((int)&last_count,ECX);
4567   emit_add(HOST_CCREG,ECX,EAX);
4568   emit_writeword(EAX,(int)&Count);
4569   emit_call((int)gen_interupt);
4570   emit_readword((int)&Count,HOST_CCREG);
4571   emit_readword((int)&next_interupt,EAX);
4572   emit_readword((int)&pending_exception,EBX);
4573   emit_writeword(EAX,(int)&last_count);
4574   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
4575   emit_test(EBX,EBX);
4576   int jne_instr=(int)out;
4577   emit_jne(0);
4578   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
4579   load_all_regs(branch_regs[i].regmap);
4580   emit_jmp(stubs[n][2]); // return address
4581   set_jump_target(jne_instr,(int)out);
4582   emit_readword((int)&pcaddr,EAX);
4583   // Call get_addr_ht instead of doing the hash table here.
4584   // This code is executed infrequently and takes up a lot of space
4585   // so smaller is better.
4586   emit_storereg(CCREG,HOST_CCREG);
4587   emit_pushreg(EAX);
4588   emit_call((int)get_addr_ht);
4589   emit_loadreg(CCREG,HOST_CCREG);
4590   emit_addimm(ESP,4,ESP);
4591   emit_jmpreg(EAX);*/
4592 }
4593
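// Record a branch location and its target so the linker pass can patch the
// jump once the target block's address is known.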
4594 static void add_to_linker(int addr,int target,int ext)
4595 {
4596   link_addr[linkcount][0]=addr;
4597   link_addr[linkcount][1]=target;
4598   link_addr[linkcount][2]=ext;
4599   linkcount++;
4600 }
4601
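// Write the return address (PC+8) into the link register ($31) for JAL.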
4602 static void ujump_assemble_write_ra(int i)
4603 {
4604   int rt;
4605   unsigned int return_address;
4606   rt=get_reg(branch_regs[i].regmap,31);
4607   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4608   //assert(rt>=0);
4609   return_address=start+i*4+8;
4610   if(rt>=0) {
4611     #ifdef USE_MINI_HT
4612     if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
4613       int temp=-1; // note: must be ds-safe
4614       #ifdef HOST_TEMPREG
4615       temp=HOST_TEMPREG;
4616       #endif
4617       if(temp>=0) do_miniht_insert(return_address,rt,temp);
4618       else emit_movimm(return_address,rt);
4619     }
4620     else
4621     #endif
4622     {
4623       #ifdef REG_PREFETCH
4624       if(temp>=0)
4625       {
4626         if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4627       }
4628       #endif
4629       emit_movimm(return_address,rt); // PC into link register
4630       #ifdef IMM_PREFETCH
4631       emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4632       #endif
4633     }
4634   }
4635 }
4636
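// J/JAL: unconditional jump to a fixed target.  The delay slot is assembled
// first; the return address is written before it if the slot reads $31.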
4637 void ujump_assemble(int i,struct regstat *i_regs)
4638 {
4639   signed char *i_regmap=i_regs->regmap;
4640   int ra_done=0;
4641   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4642   address_generation(i+1,i_regs,regs[i].regmap_entry);
4643   #ifdef REG_PREFETCH
4644   int temp=get_reg(branch_regs[i].regmap,PTEMP);
4645   if(rt1[i]==31&&temp>=0)
4646   {
4647     int return_address=start+i*4+8;
4648     if(get_reg(branch_regs[i].regmap,31)>0)
4649     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4650   }
4651   #endif
4652   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4653     ujump_assemble_write_ra(i); // writeback ra for DS
4654     ra_done=1;
4655   }
4656   ds_assemble(i+1,i_regs);
4657   uint64_t bc_unneeded=branch_regs[i].u;
4658   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4659   bc_unneeded|=1|(1LL<<rt1[i]);
4660   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4661   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4662                 bc_unneeded,bc_unneeded_upper);
4663   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4664   if(!ra_done&&rt1[i]==31)
4665     ujump_assemble_write_ra(i);
4666   int cc,adj;
4667   cc=get_reg(branch_regs[i].regmap,CCREG);
4668   assert(cc==HOST_CCREG);
4669   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4670   #ifdef REG_PREFETCH
4671   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4672   #endif
4673   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4674   if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4675   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4676   if(internal_branch(branch_regs[i].is32,ba[i]))
4677     assem_debug("branch: internal\n");
4678   else
4679     assem_debug("branch: external\n");
4680   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
4681     ds_assemble_entry(i);
4682   }
4683   else {
4684     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
4685     emit_jmp(0);
4686   }
4687 }
4688
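// Write the return address (PC+8) into the link register for JALR.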
4689 static void rjump_assemble_write_ra(int i)
4690 {
4691   int rt,return_address;
4692   assert(rt1[i+1]!=rt1[i]);
4693   assert(rt2[i+1]!=rt1[i]);
4694   rt=get_reg(branch_regs[i].regmap,rt1[i]);
4695   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4696   assert(rt>=0);
4697   return_address=start+i*4+8;
4698   #ifdef REG_PREFETCH
4699   if(temp>=0)
4700   {
4701     if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4702   }
4703   #endif
4704   emit_movimm(return_address,rt); // PC into link register
4705   #ifdef IMM_PREFETCH
4706   emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4707   #endif
4708 }
4709
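// JR/JALR: register-indirect jump.  If the delay slot overwrites the target
// register, its value is first copied to RTEMP.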
4710 void rjump_assemble(int i,struct regstat *i_regs)
4711 {
4712   signed char *i_regmap=i_regs->regmap;
4713   int temp;
4714   int rs,cc,adj;
4715   int ra_done=0;
4716   rs=get_reg(branch_regs[i].regmap,rs1[i]);
4717   assert(rs>=0);
4718   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4719     // Delay slot abuse, make a copy of the branch address register
4720     temp=get_reg(branch_regs[i].regmap,RTEMP);
4721     assert(temp>=0);
4722     assert(regs[i].regmap[temp]==RTEMP);
4723     emit_mov(rs,temp);
4724     rs=temp;
4725   }
4726   address_generation(i+1,i_regs,regs[i].regmap_entry);
4727   #ifdef REG_PREFETCH
4728   if(rt1[i]==31)
4729   {
4730     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
4731       int return_address=start+i*4+8;
4732       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4733     }
4734   }
4735   #endif
4736   #ifdef USE_MINI_HT
4737   if(rs1[i]==31) {
4738     int rh=get_reg(regs[i].regmap,RHASH);
4739     if(rh>=0) do_preload_rhash(rh);
4740   }
4741   #endif
4742   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4743     rjump_assemble_write_ra(i);
4744     ra_done=1;
4745   }
4746   ds_assemble(i+1,i_regs);
4747   uint64_t bc_unneeded=branch_regs[i].u;
4748   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4749   bc_unneeded|=1|(1LL<<rt1[i]);
4750   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4751   bc_unneeded&=~(1LL<<rs1[i]);
4752   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4753                 bc_unneeded,bc_unneeded_upper);
4754   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
4755   if(!ra_done&&rt1[i]!=0)
4756     rjump_assemble_write_ra(i);
4757   cc=get_reg(branch_regs[i].regmap,CCREG);
4758   assert(cc==HOST_CCREG);
4759   #ifdef USE_MINI_HT
4760   int rh=get_reg(branch_regs[i].regmap,RHASH);
4761   int ht=get_reg(branch_regs[i].regmap,RHTBL);
4762   if(rs1[i]==31) {
4763     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
4764     do_preload_rhtbl(ht);
4765     do_rhash(rs,rh);
4766   }
4767   #endif
4768   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4769   #ifdef DESTRUCTIVE_WRITEBACK
4770   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
4771     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
4772       emit_loadreg(rs1[i],rs);
4773     }
4774   }
4775   #endif
4776   #ifdef REG_PREFETCH
4777   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4778   #endif
4779   #ifdef USE_MINI_HT
4780   if(rs1[i]==31) {
4781     do_miniht_load(ht,rh);
4782   }
4783   #endif
4784   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
4785   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
4786   //assert(adj==0);
4787   emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
4788   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
4789   if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
4790     // special case for RFE
4791     emit_jmp(0);
4792   else
4793     emit_jns(0);
4794   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4795   #ifdef USE_MINI_HT
4796   if(rs1[i]==31) {
4797     do_miniht_jump(rs,rh,ht);
4798   }
4799   else
4800   #endif
4801   {
4802     //if(rs!=EAX) emit_mov(rs,EAX);
4803     //emit_jmp((int)jump_vaddr_eax);
4804     emit_jmp(jump_vaddr_reg[rs]);
4805   }
4806   /* Check hash table
4807   temp=!rs;
4808   emit_mov(rs,temp);
4809   emit_shrimm(rs,16,rs);
4810   emit_xor(temp,rs,rs);
4811   emit_movzwl_reg(rs,rs);
4812   emit_shlimm(rs,4,rs);
4813   emit_cmpmem_indexed((int)hash_table,rs,temp);
4814   emit_jne((int)out+14);
4815   emit_readword_indexed((int)hash_table+4,rs,rs);
4816   emit_jmpreg(rs);
4817   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
4818   emit_addimm_no_flags(8,rs);
4819   emit_jeq((int)out-17);
4820   // No hit on hash table, call compiler
4821   emit_pushreg(temp);
4822 //DEBUG >
4823 #ifdef DEBUG_CYCLE_COUNT
4824   emit_readword((int)&last_count,ECX);
4825   emit_add(HOST_CCREG,ECX,HOST_CCREG);
4826   emit_readword((int)&next_interupt,ECX);
4827   emit_writeword(HOST_CCREG,(int)&Count);
4828   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
4829   emit_writeword(ECX,(int)&last_count);
4830 #endif
4831 //DEBUG <
4832   emit_storereg(CCREG,HOST_CCREG);
4833   emit_call((int)get_addr);
4834   emit_loadreg(CCREG,HOST_CCREG);
4835   emit_addimm(ESP,4,ESP);
4836   emit_jmpreg(EAX);*/
4837   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4838   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
4839   #endif
4840 }
4841
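// BEQ/BNE/BLEZ/BGTZ: conditional branches, assembled either out of order
// (delay slot first) or in order, depending on ooo[i].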
4842 void cjump_assemble(int i,struct regstat *i_regs)
4843 {
4844   signed char *i_regmap=i_regs->regmap;
4845   int cc;
4846   int match;
4847   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4848   assem_debug("match=%d\n",match);
4849   int s1h,s1l,s2h,s2l;
4850   int prev_cop1_usable=cop1_usable;
4851   int unconditional=0,nop=0;
4852   int only32=0;
4853   int invert=0;
4854   int internal=internal_branch(branch_regs[i].is32,ba[i]);
4855   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4856   if(!match) invert=1;
4857   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4858   if(i>(ba[i]-start)>>2) invert=1;
4859   #endif
4860
4861   if(ooo[i]) {
4862     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4863     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4864     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4865     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4866   }
4867   else {
4868     s1l=get_reg(i_regmap,rs1[i]);
4869     s1h=get_reg(i_regmap,rs1[i]|64);
4870     s2l=get_reg(i_regmap,rs2[i]);
4871     s2h=get_reg(i_regmap,rs2[i]|64);
4872   }
4873   if(rs1[i]==0&&rs2[i]==0)
4874   {
4875     if(opcode[i]&1) nop=1;
4876     else unconditional=1;
4877     //assert(opcode[i]!=5);
4878     //assert(opcode[i]!=7);
4879     //assert(opcode[i]!=0x15);
4880     //assert(opcode[i]!=0x17);
4881   }
4882   else if(rs1[i]==0)
4883   {
4884     s1l=s2l;s1h=s2h;
4885     s2l=s2h=-1;
4886     only32=(regs[i].was32>>rs2[i])&1;
4887   }
4888   else if(rs2[i]==0)
4889   {
4890     s2l=s2h=-1;
4891     only32=(regs[i].was32>>rs1[i])&1;
4892   }
4893   else {
4894     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
4895   }
4896
4897   if(ooo[i]) {
4898     // Out of order execution (delay slot first)
4899     //printf("OOOE\n");
4900     address_generation(i+1,i_regs,regs[i].regmap_entry);
4901     ds_assemble(i+1,i_regs);
4902     int adj;
4903     uint64_t bc_unneeded=branch_regs[i].u;
4904     uint64_t bc_unneeded_upper=branch_regs[i].uu;
4905     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
4906     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
4907     bc_unneeded|=1;
4908     bc_unneeded_upper|=1;
4909     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4910                   bc_unneeded,bc_unneeded_upper);
4911     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
4912     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4913     cc=get_reg(branch_regs[i].regmap,CCREG);
4914     assert(cc==HOST_CCREG);
4915     if(unconditional)
4916       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4917     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
4918     //assem_debug("cycle count (adj)\n");
4919     if(unconditional) {
4920       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4921       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
4922         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4923         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4924         if(internal)
4925           assem_debug("branch: internal\n");
4926         else
4927           assem_debug("branch: external\n");
4928         if(internal&&is_ds[(ba[i]-start)>>2]) {
4929           ds_assemble_entry(i);
4930         }
4931         else {
4932           add_to_linker((int)out,ba[i],internal);
4933           emit_jmp(0);
4934         }
4935         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4936         if(((u_int)out)&7) emit_addnop(0);
4937         #endif
4938       }
4939     }
4940     else if(nop) {
4941       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
4942       int jaddr=(int)out;
4943       emit_jns(0);
4944       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
4945     }
4946     else {
4947       int taken=0,nottaken=0,nottaken1=0;
4948       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
4949       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4950       if(!only32)
4951       {
4952         assert(s1h>=0);
4953         if(opcode[i]==4) // BEQ
4954         {
4955           if(s2h>=0) emit_cmp(s1h,s2h);
4956           else emit_test(s1h,s1h);
4957           nottaken1=(int)out;
4958           emit_jne(1);
4959         }
4960         if(opcode[i]==5) // BNE
4961         {
4962           if(s2h>=0) emit_cmp(s1h,s2h);
4963           else emit_test(s1h,s1h);
4964           if(invert) taken=(int)out;
4965           else add_to_linker((int)out,ba[i],internal);
4966           emit_jne(0);
4967         }
4968         if(opcode[i]==6) // BLEZ
4969         {
4970           emit_test(s1h,s1h);
4971           if(invert) taken=(int)out;
4972           else add_to_linker((int)out,ba[i],internal);
4973           emit_js(0);
4974           nottaken1=(int)out;
4975           emit_jne(1);
4976         }
4977         if(opcode[i]==7) // BGTZ
4978         {
4979           emit_test(s1h,s1h);
4980           nottaken1=(int)out;
4981           emit_js(1);
4982           if(invert) taken=(int)out;
4983           else add_to_linker((int)out,ba[i],internal);
4984           emit_jne(0);
4985         }
4986       } // if(!only32)
4987
4988       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4989       assert(s1l>=0);
4990       if(opcode[i]==4) // BEQ
4991       {
4992         if(s2l>=0) emit_cmp(s1l,s2l);
4993         else emit_test(s1l,s1l);
4994         if(invert){
4995           nottaken=(int)out;
4996           emit_jne(1);
4997         }else{
4998           add_to_linker((int)out,ba[i],internal);
4999           emit_jeq(0);
5000         }
5001       }
5002       if(opcode[i]==5) // BNE
5003       {
5004         if(s2l>=0) emit_cmp(s1l,s2l);
5005         else emit_test(s1l,s1l);
5006         if(invert){
5007           nottaken=(int)out;
5008           emit_jeq(1);
5009         }else{
5010           add_to_linker((int)out,ba[i],internal);
5011           emit_jne(0);
5012         }
5013       }
5014       if(opcode[i]==6) // BLEZ
5015       {
5016         emit_cmpimm(s1l,1);
5017         if(invert){
5018           nottaken=(int)out;
5019           emit_jge(1);
5020         }else{
5021           add_to_linker((int)out,ba[i],internal);
5022           emit_jl(0);
5023         }
5024       }
5025       if(opcode[i]==7) // BGTZ
5026       {
5027         emit_cmpimm(s1l,1);
5028         if(invert){
5029           nottaken=(int)out;
5030           emit_jl(1);
5031         }else{
5032           add_to_linker((int)out,ba[i],internal);
5033           emit_jge(0);
5034         }
5035       }
5036       if(invert) {
5037         if(taken) set_jump_target(taken,(int)out);
5038         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5039         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5040           if(adj) {
5041             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5042             add_to_linker((int)out,ba[i],internal);
5043           }else{
5044             emit_addnop(13);
5045             add_to_linker((int)out,ba[i],internal*2);
5046           }
5047           emit_jmp(0);
5048         }else
5049         #endif
5050         {
5051           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5052           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5053           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5054           if(internal)
5055             assem_debug("branch: internal\n");
5056           else
5057             assem_debug("branch: external\n");
5058           if(internal&&is_ds[(ba[i]-start)>>2]) {
5059             ds_assemble_entry(i);
5060           }
5061           else {
5062             add_to_linker((int)out,ba[i],internal);
5063             emit_jmp(0);
5064           }
5065         }
5066         set_jump_target(nottaken,(int)out);
5067       }
5068
5069       if(nottaken1) set_jump_target(nottaken1,(int)out);
5070       if(adj) {
5071         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5072       }
5073     } // (!unconditional)
5074   } // if(ooo)
5075   else
5076   {
5077     // In-order execution (branch first)
5078     //if(likely[i]) printf("IOL\n");
5079     //else
5080     //printf("IOE\n");
5081     int taken=0,nottaken=0,nottaken1=0;
5082     if(!unconditional&&!nop) {
5083       if(!only32)
5084       {
5085         assert(s1h>=0);
5086         if((opcode[i]&0x2f)==4) // BEQ
5087         {
5088           if(s2h>=0) emit_cmp(s1h,s2h);
5089           else emit_test(s1h,s1h);
5090           nottaken1=(int)out;
5091           emit_jne(2);
5092         }
5093         if((opcode[i]&0x2f)==5) // BNE
5094         {
5095           if(s2h>=0) emit_cmp(s1h,s2h);
5096           else emit_test(s1h,s1h);
5097           taken=(int)out;
5098           emit_jne(1);
5099         }
5100         if((opcode[i]&0x2f)==6) // BLEZ
5101         {
5102           emit_test(s1h,s1h);
5103           taken=(int)out;
5104           emit_js(1);
5105           nottaken1=(int)out;
5106           emit_jne(2);
5107         }
5108         if((opcode[i]&0x2f)==7) // BGTZ
5109         {
5110           emit_test(s1h,s1h);
5111           nottaken1=(int)out;
5112           emit_js(2);
5113           taken=(int)out;
5114           emit_jne(1);
5115         }
5116       } // if(!only32)
5117
5118       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5119       assert(s1l>=0);
5120       if((opcode[i]&0x2f)==4) // BEQ
5121       {
5122         if(s2l>=0) emit_cmp(s1l,s2l);
5123         else emit_test(s1l,s1l);
5124         nottaken=(int)out;
5125         emit_jne(2);
5126       }
5127       if((opcode[i]&0x2f)==5) // BNE
5128       {
5129         if(s2l>=0) emit_cmp(s1l,s2l);
5130         else emit_test(s1l,s1l);
5131         nottaken=(int)out;
5132         emit_jeq(2);
5133       }
5134       if((opcode[i]&0x2f)==6) // BLEZ
5135       {
5136         emit_cmpimm(s1l,1);
5137         nottaken=(int)out;
5138         emit_jge(2);
5139       }
5140       if((opcode[i]&0x2f)==7) // BGTZ
5141       {
5142         emit_cmpimm(s1l,1);
5143         nottaken=(int)out;
5144         emit_jl(2);
5145       }
5146     } // if(!unconditional)
5147     int adj;
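         // Work out which registers the delay-slot instruction does not need: start
         // from the liveness info after the branch and clear the bits for the slot's
         // own sources so they stay live (bit 0 / r0 is always unneeded).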
5148     uint64_t ds_unneeded=branch_regs[i].u;
5149     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5150     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5151     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5152     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5153     ds_unneeded|=1;
5154     ds_unneeded_upper|=1;
5155     // branch taken
5156     if(!nop) {
5157       if(taken) set_jump_target(taken,(int)out);
5158       assem_debug("1:\n");
5159       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5160                     ds_unneeded,ds_unneeded_upper);
5161       // load regs
5162       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5163       address_generation(i+1,&branch_regs[i],0);
5164       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5165       ds_assemble(i+1,&branch_regs[i]);
5166       cc=get_reg(branch_regs[i].regmap,CCREG);
5167       if(cc==-1) {
5168         emit_loadreg(CCREG,cc=HOST_CCREG);
5169         // CHECK: Is the following instruction (fall thru) allocated ok?
5170       }
5171       assert(cc==HOST_CCREG);
5172       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5173       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5174       assem_debug("cycle count (adj)\n");
5175       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5176       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5177       if(internal)
5178         assem_debug("branch: internal\n");
5179       else
5180         assem_debug("branch: external\n");
5181       if(internal&&is_ds[(ba[i]-start)>>2]) {
5182         ds_assemble_entry(i);
5183       }
5184       else {
5185         add_to_linker((int)out,ba[i],internal);
5186         emit_jmp(0);
5187       }
5188     }
5189     // branch not taken
5190     cop1_usable=prev_cop1_usable;
5191     if(!unconditional) {
5192       if(nottaken1) set_jump_target(nottaken1,(int)out);
5193       set_jump_target(nottaken,(int)out);
5194       assem_debug("2:\n");
5195       if(!likely[i]) {
5196         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5197                       ds_unneeded,ds_unneeded_upper);
5198         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5199         address_generation(i+1,&branch_regs[i],0);
5200         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5201         ds_assemble(i+1,&branch_regs[i]);
5202       }
5203       cc=get_reg(branch_regs[i].regmap,CCREG);
5204       if(cc==-1&&!likely[i]) {
5205         // Cycle count isn't in a register, temporarily load it then write it out
5206         emit_loadreg(CCREG,HOST_CCREG);
5207         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5208         int jaddr=(int)out;
5209         emit_jns(0);
5210         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5211         emit_storereg(CCREG,HOST_CCREG);
5212       }
5213       else{
5214         cc=get_reg(i_regmap,CCREG);
5215         assert(cc==HOST_CCREG);
5216         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5217         int jaddr=(int)out;
5218         emit_jns(0);
5219         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5220       }
5221     }
5222   }
5223 }
5224
5225 void sjump_assemble(int i,struct regstat *i_regs)
5226 {
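       // Assembles REGIMM branches (BLTZ/BGEZ and their AL/likely variants).
       // The linking forms (rt1[i]==31) write the return address to $ra even when
       // the branch is not taken.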
5227   signed char *i_regmap=i_regs->regmap;
5228   int cc;
5229   int match;
5230   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5231   assem_debug("smatch=%d\n",match);
5232   int s1h,s1l;
5233   int prev_cop1_usable=cop1_usable;
5234   int unconditional=0,nevertaken=0;
5235   int only32=0;
5236   int invert=0;
5237   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5238   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5239   if(!match) invert=1;
5240   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5241   if(i>(ba[i]-start)>>2) invert=1;
5242   #endif
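       // 'invert' flips the sense of the emitted conditional jump: the not-taken
       // case skips over the inline taken path, which writes back/reloads registers
       // for the (mismatched) target before jumping to it.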
5243
5244   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5245   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5246
5247   if(ooo[i]) {
5248     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5249     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5250   }
5251   else {
5252     s1l=get_reg(i_regmap,rs1[i]);
5253     s1h=get_reg(i_regmap,rs1[i]|64);
5254   }
5255   if(rs1[i]==0)
5256   {
5257     if(opcode2[i]&1) unconditional=1;
5258     else nevertaken=1;
5259     // These are never taken (r0 is never less than zero)
5260     //assert(opcode2[i]!=0);
5261     //assert(opcode2[i]!=2);
5262     //assert(opcode2[i]!=0x10);
5263     //assert(opcode2[i]!=0x12);
5264   }
5265   else {
5266     only32=(regs[i].was32>>rs1[i])&1;
5267   }
5268
5269   if(ooo[i]) {
5270     // Out of order execution (delay slot first)
5271     //printf("OOOE\n");
5272     address_generation(i+1,i_regs,regs[i].regmap_entry);
5273     ds_assemble(i+1,i_regs);
5274     int adj;
5275     uint64_t bc_unneeded=branch_regs[i].u;
5276     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5277     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5278     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5279     bc_unneeded|=1;
5280     bc_unneeded_upper|=1;
5281     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5282                   bc_unneeded,bc_unneeded_upper);
5283     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5284     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5285     if(rt1[i]==31) {
5286       int rt,return_address;
5287       rt=get_reg(branch_regs[i].regmap,31);
5288       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5289       if(rt>=0) {
5290         // Save the PC even if the branch is not taken
5291         return_address=start+i*4+8;
5292         emit_movimm(return_address,rt); // PC into link register
5293         #ifdef IMM_PREFETCH
5294         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5295         #endif
5296       }
5297     }
5298     cc=get_reg(branch_regs[i].regmap,CCREG);
5299     assert(cc==HOST_CCREG);
5300     if(unconditional)
5301       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5302     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5303     assem_debug("cycle count (adj)\n");
5304     if(unconditional) {
5305       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5306       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5307         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5308         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5309         if(internal)
5310           assem_debug("branch: internal\n");
5311         else
5312           assem_debug("branch: external\n");
5313         if(internal&&is_ds[(ba[i]-start)>>2]) {
5314           ds_assemble_entry(i);
5315         }
5316         else {
5317           add_to_linker((int)out,ba[i],internal);
5318           emit_jmp(0);
5319         }
5320         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5321         if(((u_int)out)&7) emit_addnop(0);
5322         #endif
5323       }
5324     }
5325     else if(nevertaken) {
5326       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5327       int jaddr=(int)out;
5328       emit_jns(0);
5329       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5330     }
5331     else {
5332       int nottaken=0;
5333       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5334       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5335       if(!only32)
5336       {
5337         assert(s1h>=0);
5338         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5339         {
5340           emit_test(s1h,s1h);
5341           if(invert){
5342             nottaken=(int)out;
5343             emit_jns(1);
5344           }else{
5345             add_to_linker((int)out,ba[i],internal);
5346             emit_js(0);
5347           }
5348         }
5349         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5350         {
5351           emit_test(s1h,s1h);
5352           if(invert){
5353             nottaken=(int)out;
5354             emit_js(1);
5355           }else{
5356             add_to_linker((int)out,ba[i],internal);
5357             emit_jns(0);
5358           }
5359         }
5360       } // if(!only32)
5361       else
5362       {
5363         assert(s1l>=0);
5364         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5365         {
5366           emit_test(s1l,s1l);
5367           if(invert){
5368             nottaken=(int)out;
5369             emit_jns(1);
5370           }else{
5371             add_to_linker((int)out,ba[i],internal);
5372             emit_js(0);
5373           }
5374         }
5375         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5376         {
5377           emit_test(s1l,s1l);
5378           if(invert){
5379             nottaken=(int)out;
5380             emit_js(1);
5381           }else{
5382             add_to_linker((int)out,ba[i],internal);
5383             emit_jns(0);
5384           }
5385         }
5386       } // else (only32)
5387
5388       if(invert) {
5389         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5390         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5391           if(adj) {
5392             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5393             add_to_linker((int)out,ba[i],internal);
5394           }else{
5395             emit_addnop(13);
5396             add_to_linker((int)out,ba[i],internal*2);
5397           }
5398           emit_jmp(0);
5399         }else
5400         #endif
5401         {
5402           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5403           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5404           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5405           if(internal)
5406             assem_debug("branch: internal\n");
5407           else
5408             assem_debug("branch: external\n");
5409           if(internal&&is_ds[(ba[i]-start)>>2]) {
5410             ds_assemble_entry(i);
5411           }
5412           else {
5413             add_to_linker((int)out,ba[i],internal);
5414             emit_jmp(0);
5415           }
5416         }
5417         set_jump_target(nottaken,(int)out);
5418       }
5419
5420       if(adj) {
5421         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5422       }
5423     } // (!unconditional)
5424   } // if(ooo)
5425   else
5426   {
5427     // In-order execution (branch first)
5428     //printf("IOE\n");
5429     int nottaken=0;
5430     if(rt1[i]==31) {
5431       int rt,return_address;
5432       rt=get_reg(branch_regs[i].regmap,31);
5433       if(rt>=0) {
5434         // Save the PC even if the branch is not taken
5435         return_address=start+i*4+8;
5436         emit_movimm(return_address,rt); // PC into link register
5437         #ifdef IMM_PREFETCH
5438         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5439         #endif
5440       }
5441     }
5442     if(!unconditional) {
5443       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5444       if(!only32)
5445       {
5446         assert(s1h>=0);
5447         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5448         {
5449           emit_test(s1h,s1h);
5450           nottaken=(int)out;
5451           emit_jns(1);
5452         }
5453         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5454         {
5455           emit_test(s1h,s1h);
5456           nottaken=(int)out;
5457           emit_js(1);
5458         }
5459       } // if(!only32)
5460       else
5461       {
5462         assert(s1l>=0);
5463         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5464         {
5465           emit_test(s1l,s1l);
5466           nottaken=(int)out;
5467           emit_jns(1);
5468         }
5469         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5470         {
5471           emit_test(s1l,s1l);
5472           nottaken=(int)out;
5473           emit_js(1);
5474         }
5475       }
5476     } // if(!unconditional)
5477     int adj;
5478     uint64_t ds_unneeded=branch_regs[i].u;
5479     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5480     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5481     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5482     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5483     ds_unneeded|=1;
5484     ds_unneeded_upper|=1;
5485     // branch taken
5486     if(!nevertaken) {
5487       //assem_debug("1:\n");
5488       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5489                     ds_unneeded,ds_unneeded_upper);
5490       // load regs
5491       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5492       address_generation(i+1,&branch_regs[i],0);
5493       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5494       ds_assemble(i+1,&branch_regs[i]);
5495       cc=get_reg(branch_regs[i].regmap,CCREG);
5496       if(cc==-1) {
5497         emit_loadreg(CCREG,cc=HOST_CCREG);
5498         // CHECK: Is the following instruction (fall thru) allocated ok?
5499       }
5500       assert(cc==HOST_CCREG);
5501       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5502       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5503       assem_debug("cycle count (adj)\n");
5504       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5505       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5506       if(internal)
5507         assem_debug("branch: internal\n");
5508       else
5509         assem_debug("branch: external\n");
5510       if(internal&&is_ds[(ba[i]-start)>>2]) {
5511         ds_assemble_entry(i);
5512       }
5513       else {
5514         add_to_linker((int)out,ba[i],internal);
5515         emit_jmp(0);
5516       }
5517     }
5518     // branch not taken
5519     cop1_usable=prev_cop1_usable;
5520     if(!unconditional) {
5521       set_jump_target(nottaken,(int)out);
5522       assem_debug("1:\n");
5523       if(!likely[i]) {
5524         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5525                       ds_unneeded,ds_unneeded_upper);
5526         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5527         address_generation(i+1,&branch_regs[i],0);
5528         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5529         ds_assemble(i+1,&branch_regs[i]);
5530       }
5531       cc=get_reg(branch_regs[i].regmap,CCREG);
5532       if(cc==-1&&!likely[i]) {
5533         // Cycle count isn't in a register, temporarily load it then write it out
5534         emit_loadreg(CCREG,HOST_CCREG);
5535         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5536         int jaddr=(int)out;
5537         emit_jns(0);
5538         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5539         emit_storereg(CCREG,HOST_CCREG);
5540       }
5541       else{
5542         cc=get_reg(i_regmap,CCREG);
5543         assert(cc==HOST_CCREG);
5544         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5545         int jaddr=(int)out;
5546         emit_jns(0);
5547         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5548       }
5549     }
5550   }
5551 }
5552
5553 void fjump_assemble(int i,struct regstat *i_regs)
5554 {
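       // Assembles COP1 condition branches (BC1F/BC1T and the likely forms);
       // the condition is bit 23 (0x800000) of the FP status word held in FSREG.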
5555   signed char *i_regmap=i_regs->regmap;
5556   int cc;
5557   int match;
5558   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5559   assem_debug("fmatch=%d\n",match);
5560   int fs,cs;
5561   int eaddr;
5562   int invert=0;
5563   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5564   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5565   if(!match) invert=1;
5566   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5567   if(i>(ba[i]-start)>>2) invert=1;
5568   #endif
5569
5570   if(ooo[i]) {
5571     fs=get_reg(branch_regs[i].regmap,FSREG);
5572     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5573   }
5574   else {
5575     fs=get_reg(i_regmap,FSREG);
5576   }
5577
5578   // Check cop1 unusable
5579   if(!cop1_usable) {
5580     cs=get_reg(i_regmap,CSREG);
5581     assert(cs>=0);
5582     emit_testimm(cs,0x20000000);
5583     eaddr=(int)out;
5584     emit_jeq(0);
5585     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
5586     cop1_usable=1;
5587   }
5588
5589   if(ooo[i]) {
5590     // Out of order execution (delay slot first)
5591     //printf("OOOE\n");
5592     ds_assemble(i+1,i_regs);
5593     int adj;
5594     uint64_t bc_unneeded=branch_regs[i].u;
5595     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5596     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5597     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5598     bc_unneeded|=1;
5599     bc_unneeded_upper|=1;
5600     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5601                   bc_unneeded,bc_unneeded_upper);
5602     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5603     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5604     cc=get_reg(branch_regs[i].regmap,CCREG);
5605     assert(cc==HOST_CCREG);
5606     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5607     assem_debug("cycle count (adj)\n");
5608     if(1) {
5609       int nottaken=0;
5610       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5611       if(1) {
5612         assert(fs>=0);
5613         emit_testimm(fs,0x800000);
5614         if(source[i]&0x10000) // BC1T
5615         {
5616           if(invert){
5617             nottaken=(int)out;
5618             emit_jeq(1);
5619           }else{
5620             add_to_linker((int)out,ba[i],internal);
5621             emit_jne(0);
5622           }
5623         }
5624         else { // BC1F
5625           if(invert){
5626             nottaken=(int)out;
5627             emit_jne(1);
5628           }else{
5629             add_to_linker((int)out,ba[i],internal);
5630             emit_jeq(0);
5631           }
5632         }
5634       } // if(!only32)
5635
5636       if(invert) {
5637         if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5638         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5639         else if(match) emit_addnop(13);
5640         #endif
5641         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5642         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5643         if(internal)
5644           assem_debug("branch: internal\n");
5645         else
5646           assem_debug("branch: external\n");
5647         if(internal&&is_ds[(ba[i]-start)>>2]) {
5648           ds_assemble_entry(i);
5649         }
5650         else {
5651           add_to_linker((int)out,ba[i],internal);
5652           emit_jmp(0);
5653         }
5654         set_jump_target(nottaken,(int)out);
5655       }
5656
5657       if(adj) {
5658         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5659       }
5660     } // (!unconditional)
5661   } // if(ooo)
5662   else
5663   {
5664     // In-order execution (branch first)
5665     //printf("IOE\n");
5666     int nottaken=0;
5667     if(1) {
5668       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5669       if(1) {
5670         assert(fs>=0);
5671         emit_testimm(fs,0x800000);
5672         if(source[i]&0x10000) // BC1T
5673         {
5674           nottaken=(int)out;
5675           emit_jeq(1);
5676         }
5677         else // BC1F
5678         {
5679           nottaken=(int)out;
5680           emit_jne(1);
5681         }
5682       }
5683     } // if(!unconditional)
5684     int adj;
5685     uint64_t ds_unneeded=branch_regs[i].u;
5686     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5687     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5688     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5689     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5690     ds_unneeded|=1;
5691     ds_unneeded_upper|=1;
5692     // branch taken
5693     //assem_debug("1:\n");
5694     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5695                   ds_unneeded,ds_unneeded_upper);
5696     // load regs
5697     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5698     address_generation(i+1,&branch_regs[i],0);
5699     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5700     ds_assemble(i+1,&branch_regs[i]);
5701     cc=get_reg(branch_regs[i].regmap,CCREG);
5702     if(cc==-1) {
5703       emit_loadreg(CCREG,cc=HOST_CCREG);
5704       // CHECK: Is the following instruction (fall thru) allocated ok?
5705     }
5706     assert(cc==HOST_CCREG);
5707     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5708     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5709     assem_debug("cycle count (adj)\n");
5710     if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5711     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5712     if(internal)
5713       assem_debug("branch: internal\n");
5714     else
5715       assem_debug("branch: external\n");
5716     if(internal&&is_ds[(ba[i]-start)>>2]) {
5717       ds_assemble_entry(i);
5718     }
5719     else {
5720       add_to_linker((int)out,ba[i],internal);
5721       emit_jmp(0);
5722     }
5723
5724     // branch not taken
5725     if(1) { // <- FIXME (don't need this)
5726       set_jump_target(nottaken,(int)out);
5727       assem_debug("1:\n");
5728       if(!likely[i]) {
5729         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5730                       ds_unneeded,ds_unneeded_upper);
5731         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5732         address_generation(i+1,&branch_regs[i],0);
5733         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5734         ds_assemble(i+1,&branch_regs[i]);
5735       }
5736       cc=get_reg(branch_regs[i].regmap,CCREG);
5737       if(cc==-1&&!likely[i]) {
5738         // Cycle count isn't in a register, temporarily load it then write it out
5739         emit_loadreg(CCREG,HOST_CCREG);
5740         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5741         int jaddr=(int)out;
5742         emit_jns(0);
5743         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5744         emit_storereg(CCREG,HOST_CCREG);
5745       }
5746       else{
5747         cc=get_reg(i_regmap,CCREG);
5748         assert(cc==HOST_CCREG);
5749         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5750         int jaddr=(int)out;
5751         emit_jns(0);
5752         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5753       }
5754     }
5755   }
5756 }
5757
5758 static void pagespan_assemble(int i,struct regstat *i_regs)
5759 {
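       // Branch whose delay slot lies beyond the end of this page/block: evaluate
       // the condition here, leave the chosen target (taken or fall-through) in
       // HOST_BTREG, and let pagespan_ds() assemble the delay slot at the start of
       // the next block and dispatch on the saved target.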
5760   int s1l=get_reg(i_regs->regmap,rs1[i]);
5761   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
5762   int s2l=get_reg(i_regs->regmap,rs2[i]);
5763   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
5764   void *nt_branch=NULL;
5765   int taken=0;
5766   int nottaken=0;
5767   int unconditional=0;
5768   if(rs1[i]==0)
5769   {
5770     s1l=s2l;s1h=s2h;
5771     s2l=s2h=-1;
5772   }
5773   else if(rs2[i]==0)
5774   {
5775     s2l=s2h=-1;
5776   }
5777   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
5778     s1h=s2h=-1;
5779   }
5780   int hr=0;
5781   int addr,alt,ntaddr;
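       // Pick scratch host registers (addr/alt/ntaddr) that don't hold the branch
       // sources, the cycle counter or the branch-target register; they are used
       // below to build the target address with conditional moves.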
5782   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
5783   else {
5784     while(hr<HOST_REGS)
5785     {
5786       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5787          (i_regs->regmap[hr]&63)!=rs1[i] &&
5788          (i_regs->regmap[hr]&63)!=rs2[i] )
5789       {
5790         addr=hr++;break;
5791       }
5792       hr++;
5793     }
5794   }
5795   while(hr<HOST_REGS)
5796   {
5797     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5798        (i_regs->regmap[hr]&63)!=rs1[i] &&
5799        (i_regs->regmap[hr]&63)!=rs2[i] )
5800     {
5801       alt=hr++;break;
5802     }
5803     hr++;
5804   }
5805   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5806   {
5807     while(hr<HOST_REGS)
5808     {
5809       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5810          (i_regs->regmap[hr]&63)!=rs1[i] &&
5811          (i_regs->regmap[hr]&63)!=rs2[i] )
5812       {
5813         ntaddr=hr;break;
5814       }
5815       hr++;
5816     }
5817   }
5818   assert(hr<HOST_REGS);
5819   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
5820     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
5821   }
5822   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5823   if(opcode[i]==2) // J
5824   {
5825     unconditional=1;
5826   }
5827   if(opcode[i]==3) // JAL
5828   {
5829     // TODO: mini_ht
5830     int rt=get_reg(i_regs->regmap,31);
5831     emit_movimm(start+i*4+8,rt);
5832     unconditional=1;
5833   }
5834   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
5835   {
5836     emit_mov(s1l,addr);
5837     if(opcode2[i]==9) // JALR
5838     {
5839       int rt=get_reg(i_regs->regmap,rt1[i]);
5840       emit_movimm(start+i*4+8,rt);
5841     }
5842   }
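       // Non-likely conditional branches select between ba[i] and the fall-through
       // address start+i*4+8 with compares and conditional moves; the likely forms
       // further down use real jumps so the delay slot can be skipped.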
5843   if((opcode[i]&0x3f)==4) // BEQ
5844   {
5845     if(rs1[i]==rs2[i])
5846     {
5847       unconditional=1;
5848     }
5849     else
5850     #ifdef HAVE_CMOV_IMM
5851     if(s1h<0) {
5852       if(s2l>=0) emit_cmp(s1l,s2l);
5853       else emit_test(s1l,s1l);
5854       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
5855     }
5856     else
5857     #endif
5858     {
5859       assert(s1l>=0);
5860       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5861       if(s1h>=0) {
5862         if(s2h>=0) emit_cmp(s1h,s2h);
5863         else emit_test(s1h,s1h);
5864         emit_cmovne_reg(alt,addr);
5865       }
5866       if(s2l>=0) emit_cmp(s1l,s2l);
5867       else emit_test(s1l,s1l);
5868       emit_cmovne_reg(alt,addr);
5869     }
5870   }
5871   if((opcode[i]&0x3f)==5) // BNE
5872   {
5873     #ifdef HAVE_CMOV_IMM
5874     if(s1h<0) {
5875       if(s2l>=0) emit_cmp(s1l,s2l);
5876       else emit_test(s1l,s1l);
5877       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
5878     }
5879     else
5880     #endif
5881     {
5882       assert(s1l>=0);
5883       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5884       if(s1h>=0) {
5885         if(s2h>=0) emit_cmp(s1h,s2h);
5886         else emit_test(s1h,s1h);
5887         emit_cmovne_reg(alt,addr);
5888       }
5889       if(s2l>=0) emit_cmp(s1l,s2l);
5890       else emit_test(s1l,s1l);
5891       emit_cmovne_reg(alt,addr);
5892     }
5893   }
5894   if((opcode[i]&0x3f)==0x14) // BEQL
5895   {
5896     if(s1h>=0) {
5897       if(s2h>=0) emit_cmp(s1h,s2h);
5898       else emit_test(s1h,s1h);
5899       nottaken=(int)out;
5900       emit_jne(0);
5901     }
5902     if(s2l>=0) emit_cmp(s1l,s2l);
5903     else emit_test(s1l,s1l);
5904     if(nottaken) set_jump_target(nottaken,(int)out);
5905     nottaken=(int)out;
5906     emit_jne(0);
5907   }
5908   if((opcode[i]&0x3f)==0x15) // BNEL
5909   {
5910     if(s1h>=0) {
5911       if(s2h>=0) emit_cmp(s1h,s2h);
5912       else emit_test(s1h,s1h);
5913       taken=(int)out;
5914       emit_jne(0);
5915     }
5916     if(s2l>=0) emit_cmp(s1l,s2l);
5917     else emit_test(s1l,s1l);
5918     nottaken=(int)out;
5919     emit_jeq(0);
5920     if(taken) set_jump_target(taken,(int)out);
5921   }
5922   if((opcode[i]&0x3f)==6) // BLEZ
5923   {
5924     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5925     emit_cmpimm(s1l,1);
5926     if(s1h>=0) emit_mov(addr,ntaddr);
5927     emit_cmovl_reg(alt,addr);
5928     if(s1h>=0) {
5929       emit_test(s1h,s1h);
5930       emit_cmovne_reg(ntaddr,addr);
5931       emit_cmovs_reg(alt,addr);
5932     }
5933   }
5934   if((opcode[i]&0x3f)==7) // BGTZ
5935   {
5936     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
5937     emit_cmpimm(s1l,1);
5938     if(s1h>=0) emit_mov(addr,alt);
5939     emit_cmovl_reg(ntaddr,addr);
5940     if(s1h>=0) {
5941       emit_test(s1h,s1h);
5942       emit_cmovne_reg(alt,addr);
5943       emit_cmovs_reg(ntaddr,addr);
5944     }
5945   }
5946   if((opcode[i]&0x3f)==0x16) // BLEZL
5947   {
5948     assert((opcode[i]&0x3f)!=0x16);
5949   }
5950   if((opcode[i]&0x3f)==0x17) // BGTZL
5951   {
5952     assert((opcode[i]&0x3f)!=0x17);
5953   }
5954   assert(opcode[i]!=1); // BLTZ/BGEZ
5955
5956   //FIXME: Check CSREG
5957   if(opcode[i]==0x11 && opcode2[i]==0x08) {
5958     if((source[i]&0x30000)==0) // BC1F
5959     {
5960       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5961       emit_testimm(s1l,0x800000);
5962       emit_cmovne_reg(alt,addr);
5963     }
5964     if((source[i]&0x30000)==0x10000) // BC1T
5965     {
5966       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5967       emit_testimm(s1l,0x800000);
5968       emit_cmovne_reg(alt,addr);
5969     }
5970     if((source[i]&0x30000)==0x20000) // BC1FL
5971     {
5972       emit_testimm(s1l,0x800000);
5973       nottaken=(int)out;
5974       emit_jne(0);
5975     }
5976     if((source[i]&0x30000)==0x30000) // BC1TL
5977     {
5978       emit_testimm(s1l,0x800000);
5979       nottaken=(int)out;
5980       emit_jeq(0);
5981     }
5982   }
5983
5984   assert(i_regs->regmap[HOST_CCREG]==CCREG);
5985   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
5986   if(likely[i]||unconditional)
5987   {
5988     emit_movimm(ba[i],HOST_BTREG);
5989   }
5990   else if(addr!=HOST_BTREG)
5991   {
5992     emit_mov(addr,HOST_BTREG);
5993   }
5994   void *branch_addr=out;
5995   emit_jmp(0);
5996   int target_addr=start+i*4+5;
5997   void *stub=out;
5998   void *compiled_target_addr=check_addr(target_addr);
5999   emit_extjump_ds((int)branch_addr,target_addr);
6000   if(compiled_target_addr) {
6001     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6002     add_link(target_addr,stub);
6003   }
6004   else set_jump_target((int)branch_addr,(int)stub);
6005   if(likely[i]) {
6006     // Not-taken path
6007     set_jump_target((int)nottaken,(int)out);
6008     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6009     void *branch_addr=out;
6010     emit_jmp(0);
6011     int target_addr=start+i*4+8;
6012     void *stub=out;
6013     void *compiled_target_addr=check_addr(target_addr);
6014     emit_extjump_ds((int)branch_addr,target_addr);
6015     if(compiled_target_addr) {
6016       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6017       add_link(target_addr,stub);
6018     }
6019     else set_jump_target((int)branch_addr,(int)stub);
6020   }
6021 }
6022
6023 // Assemble the delay slot for the above
6024 static void pagespan_ds()
6025 {
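       // Runs the delay slot of the page-spanning branch at the start of this block,
       // then compares the target saved in BTREG/branch_target with start+4 and
       // either falls through into this block or jumps out via jump_vaddr_reg.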
6026   assem_debug("initial delay slot:\n");
6027   u_int vaddr=start+1;
6028   u_int page=get_page(vaddr);
6029   u_int vpage=get_vpage(vaddr);
6030   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6031   do_dirty_stub_ds();
6032   ll_add(jump_in+page,vaddr,(void *)out);
6033   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6034   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6035     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6036   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6037     emit_writeword(HOST_BTREG,(int)&branch_target);
6038   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6039   address_generation(0,&regs[0],regs[0].regmap_entry);
6040   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6041     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6042   cop1_usable=0;
6043   is_delayslot=0;
6044   switch(itype[0]) {
6045     case ALU:
6046       alu_assemble(0,&regs[0]);break;
6047     case IMM16:
6048       imm16_assemble(0,&regs[0]);break;
6049     case SHIFT:
6050       shift_assemble(0,&regs[0]);break;
6051     case SHIFTIMM:
6052       shiftimm_assemble(0,&regs[0]);break;
6053     case LOAD:
6054       load_assemble(0,&regs[0]);break;
6055     case LOADLR:
6056       loadlr_assemble(0,&regs[0]);break;
6057     case STORE:
6058       store_assemble(0,&regs[0]);break;
6059     case STORELR:
6060       storelr_assemble(0,&regs[0]);break;
6061     case COP0:
6062       cop0_assemble(0,&regs[0]);break;
6063     case COP1:
6064       cop1_assemble(0,&regs[0]);break;
6065     case C1LS:
6066       c1ls_assemble(0,&regs[0]);break;
6067     case COP2:
6068       cop2_assemble(0,&regs[0]);break;
6069     case C2LS:
6070       c2ls_assemble(0,&regs[0]);break;
6071     case C2OP:
6072       c2op_assemble(0,&regs[0]);break;
6073     case FCONV:
6074       fconv_assemble(0,&regs[0]);break;
6075     case FLOAT:
6076       float_assemble(0,&regs[0]);break;
6077     case FCOMP:
6078       fcomp_assemble(0,&regs[0]);break;
6079     case MULTDIV:
6080       multdiv_assemble(0,&regs[0]);break;
6081     case MOV:
6082       mov_assemble(0,&regs[0]);break;
6083     case SYSCALL:
6084     case HLECALL:
6085     case INTCALL:
6086     case SPAN:
6087     case UJUMP:
6088     case RJUMP:
6089     case CJUMP:
6090     case SJUMP:
6091     case FJUMP:
6092       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
6093   }
6094   int btaddr=get_reg(regs[0].regmap,BTREG);
6095   if(btaddr<0) {
6096     btaddr=get_reg(regs[0].regmap,-1);
6097     emit_readword((int)&branch_target,btaddr);
6098   }
6099   assert(btaddr!=HOST_CCREG);
6100   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6101 #ifdef HOST_IMM8
6102   emit_movimm(start+4,HOST_TEMPREG);
6103   emit_cmp(btaddr,HOST_TEMPREG);
6104 #else
6105   emit_cmpimm(btaddr,start+4);
6106 #endif
6107   int branch=(int)out;
6108   emit_jeq(0);
6109   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6110   emit_jmp(jump_vaddr_reg[btaddr]);
6111   set_jump_target(branch,(int)out);
6112   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6113   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6114 }
6115
6116 // Basic liveness analysis for MIPS registers
6117 void unneeded_registers(int istart,int iend,int r)
6118 {
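       // Walks the instructions backwards.  u/uu are bitmasks indexed by MIPS
       // register number: a set bit means that register's value (lower/upper half)
       // is dead, i.e. it is overwritten before being read again on every path.
       // gte_u is the equivalent mask for GTE (COP2) registers; bit 0 (r0) is
       // always set.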
6119   int i;
6120   uint64_t u,uu,gte_u,b,bu,gte_bu;
6121   uint64_t temp_u,temp_uu,temp_gte_u=0;
6122   uint64_t tdep;
6123   uint64_t gte_u_unknown=0;
6124   if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
6125     gte_u_unknown=~0ll;
6126   if(iend==slen-1) {
6127     u=1;uu=1;
6128     gte_u=gte_u_unknown;
6129   }else{
6130     u=unneeded_reg[iend+1];
6131     uu=unneeded_reg_upper[iend+1];
6132     u=1;uu=1;
6133     gte_u=gte_unneeded[iend+1];
6134   }
6135
6136   for (i=iend;i>=istart;i--)
6137   {
6138     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6139     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6140     {
6141       // If subroutine call, flag return address as a possible branch target
6142       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6143
6144       if(ba[i]<start || ba[i]>=(start+slen*4))
6145       {
6146         // Branch out of this block, flush all regs
6147         u=1;
6148         uu=1;
6149         gte_u=gte_u_unknown;
6150         /* Hexagon hack
6151         if(itype[i]==UJUMP&&rt1[i]==31)
6152         {
6153           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6154         }
6155         if(itype[i]==RJUMP&&rs1[i]==31)
6156         {
6157           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6158         }
6159         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6160           if(itype[i]==UJUMP&&rt1[i]==31)
6161           {
6162             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6163             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6164           }
6165           if(itype[i]==RJUMP&&rs1[i]==31)
6166           {
6167             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6168             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6169           }
6170         }*/
6171         branch_unneeded_reg[i]=u;
6172         branch_unneeded_reg_upper[i]=uu;
6173         // Merge in delay slot
6174         tdep=(~uu>>rt1[i+1])&1;
6175         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6176         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6177         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6178         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6179         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6180         u|=1;uu|=1;
6181         gte_u|=gte_rt[i+1];
6182         gte_u&=~gte_rs[i+1];
6183         // If branch is "likely" (and conditional)
6184         // then we skip the delay slot on the fall-thru path
6185         if(likely[i]) {
6186           if(i<slen-1) {
6187             u&=unneeded_reg[i+2];
6188             uu&=unneeded_reg_upper[i+2];
6189             gte_u&=gte_unneeded[i+2];
6190           }
6191           else
6192           {
6193             u=1;
6194             uu=1;
6195             gte_u=gte_u_unknown;
6196           }
6197         }
6198       }
6199       else
6200       {
6201         // Internal branch, flag target
6202         bt[(ba[i]-start)>>2]=1;
6203         if(ba[i]<=start+i*4) {
6204           // Backward branch
6205           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6206           {
6207             // Unconditional branch
6208             temp_u=1;temp_uu=1;
6209             temp_gte_u=0;
6210           } else {
6211             // Conditional branch (not taken case)
6212             temp_u=unneeded_reg[i+2];
6213             temp_uu=unneeded_reg_upper[i+2];
6214             temp_gte_u&=gte_unneeded[i+2];
6215           }
6216           // Merge in delay slot
6217           tdep=(~temp_uu>>rt1[i+1])&1;
6218           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6219           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6220           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6221           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6222           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6223           temp_u|=1;temp_uu|=1;
6224           temp_gte_u|=gte_rt[i+1];
6225           temp_gte_u&=~gte_rs[i+1];
6226           // If branch is "likely" (and conditional)
6227           // then we skip the delay slot on the fall-thru path
6228           if(likely[i]) {
6229             if(i<slen-1) {
6230               temp_u&=unneeded_reg[i+2];
6231               temp_uu&=unneeded_reg_upper[i+2];
6232               temp_gte_u&=gte_unneeded[i+2];
6233             }
6234             else
6235             {
6236               temp_u=1;
6237               temp_uu=1;
6238               temp_gte_u=gte_u_unknown;
6239             }
6240           }
6241           tdep=(~temp_uu>>rt1[i])&1;
6242           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6243           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6244           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6245           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6246           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6247           temp_u|=1;temp_uu|=1;
6248           temp_gte_u|=gte_rt[i];
6249           temp_gte_u&=~gte_rs[i];
6250           unneeded_reg[i]=temp_u;
6251           unneeded_reg_upper[i]=temp_uu;
6252           gte_unneeded[i]=temp_gte_u;
6253           // Only go three levels deep.  This recursion can take an
6254           // excessive amount of time if there are a lot of nested loops.
6255           if(r<2) {
6256             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6257           }else{
6258             unneeded_reg[(ba[i]-start)>>2]=1;
6259             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6260             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
6261           }
6262         } /*else*/ if(1) {
6263           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6264           {
6265             // Unconditional branch
6266             u=unneeded_reg[(ba[i]-start)>>2];
6267             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6268             gte_u=gte_unneeded[(ba[i]-start)>>2];
6269             branch_unneeded_reg[i]=u;
6270             branch_unneeded_reg_upper[i]=uu;
6271         //u=1;
6272         //uu=1;
6273         //branch_unneeded_reg[i]=u;
6274         //branch_unneeded_reg_upper[i]=uu;
6275             // Merge in delay slot
6276             tdep=(~uu>>rt1[i+1])&1;
6277             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6278             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6279             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6280             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6281             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6282             u|=1;uu|=1;
6283             gte_u|=gte_rt[i+1];
6284             gte_u&=~gte_rs[i+1];
6285           } else {
6286             // Conditional branch
6287             b=unneeded_reg[(ba[i]-start)>>2];
6288             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6289             gte_bu=gte_unneeded[(ba[i]-start)>>2];
6290             branch_unneeded_reg[i]=b;
6291             branch_unneeded_reg_upper[i]=bu;
6292         //b=1;
6293         //bu=1;
6294         //branch_unneeded_reg[i]=b;
6295         //branch_unneeded_reg_upper[i]=bu;
6296             // Branch delay slot
6297             tdep=(~uu>>rt1[i+1])&1;
6298             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6299             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6300             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6301             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6302             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6303             b|=1;bu|=1;
6304             gte_bu|=gte_rt[i+1];
6305             gte_bu&=~gte_rs[i+1];
6306             // If branch is "likely" then we skip the
6307             // delay slot on the fall-thru path
6308             if(likely[i]) {
6309               u=b;
6310               uu=bu;
6311               gte_u=gte_bu;
6312               if(i<slen-1) {
6313                 u&=unneeded_reg[i+2];
6314                 uu&=unneeded_reg_upper[i+2];
6315                 gte_u&=gte_unneeded[i+2];
6316         //u=1;
6317         //uu=1;
6318               }
6319             } else {
6320               u&=b;
6321               uu&=bu;
6322               gte_u&=gte_bu;
6323         //u=1;
6324         //uu=1;
6325             }
6326             if(i<slen-1) {
6327               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6328               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6329         //branch_unneeded_reg[i]=1;
6330         //branch_unneeded_reg_upper[i]=1;
6331             } else {
6332               branch_unneeded_reg[i]=1;
6333               branch_unneeded_reg_upper[i]=1;
6334             }
6335           }
6336         }
6337       }
6338     }
6339     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6340     {
6341       // SYSCALL instruction (software interrupt)
6342       u=1;
6343       uu=1;
6344     }
6345     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6346     {
6347       // ERET instruction (return from interrupt)
6348       u=1;
6349       uu=1;
6350     }
6351     //u=uu=1; // DEBUG
6352     tdep=(~uu>>rt1[i])&1;
6353     // Written registers are unneeded
6354     u|=1LL<<rt1[i];
6355     u|=1LL<<rt2[i];
6356     uu|=1LL<<rt1[i];
6357     uu|=1LL<<rt2[i];
6358     gte_u|=gte_rt[i];
6359     // Accessed registers are needed
6360     u&=~(1LL<<rs1[i]);
6361     u&=~(1LL<<rs2[i]);
6362     uu&=~(1LL<<us1[i]);
6363     uu&=~(1LL<<us2[i]);
6364     gte_u&=~gte_rs[i];
6365     if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
6366       gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
6367     // Source-target dependencies
6368     uu&=~(tdep<<dep1[i]);
6369     uu&=~(tdep<<dep2[i]);
6370     // R0 is always unneeded
6371     u|=1;uu|=1;
6372     // Save it
6373     unneeded_reg[i]=u;
6374     unneeded_reg_upper[i]=uu;
6375     gte_unneeded[i]=gte_u;
6376     /*
6377     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6378     printf("U:");
6379     int r;
6380     for(r=1;r<=CCREG;r++) {
6381       if((unneeded_reg[i]>>r)&1) {
6382         if(r==HIREG) printf(" HI");
6383         else if(r==LOREG) printf(" LO");
6384         else printf(" r%d",r);
6385       }
6386     }
6387     printf(" UU:");
6388     for(r=1;r<=CCREG;r++) {
6389       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6390         if(r==HIREG) printf(" HI");
6391         else if(r==LOREG) printf(" LO");
6392         else printf(" r%d",r);
6393       }
6394     }
6395     printf("\n");*/
6396   }
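       // PSX code never produces 64-bit values, so the upper-half masks are simply
       // forced to 'everything unneeded' for the whole range.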
6397   for (i=iend;i>=istart;i--)
6398   {
6399     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
6400   }
6401 }
6402
6403 // Write back dirty registers as soon as we will no longer modify them,
6404 // so that we don't end up with lots of writes at the branches.
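     // will_dirty/wont_dirty are per-host-register bitmaps propagated backwards
     // through the block; 'wr' selects whether the results are applied to
     // regs[]/branch_regs[] or only computed (wr=0 is used for the bounded
     // recursion into backward branch targets).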
6405 void clean_registers(int istart,int iend,int wr)
6406 {
6407   int i;
6408   int r;
6409   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6410   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6411   if(iend==slen-1) {
6412     will_dirty_i=will_dirty_next=0;
6413     wont_dirty_i=wont_dirty_next=0;
6414   }else{
6415     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6416     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6417   }
6418   for (i=iend;i>=istart;i--)
6419   {
6420     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6421     {
6422       if(ba[i]<start || ba[i]>=(start+slen*4))
6423       {
6424         // Branch out of this block, flush all regs
6425         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6426         {
6427           // Unconditional branch
6428           will_dirty_i=0;
6429           wont_dirty_i=0;
6430           // Merge in delay slot (will dirty)
6431           for(r=0;r<HOST_REGS;r++) {
6432             if(r!=EXCLUDE_REG) {
6433               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6434               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6435               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6436               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6437               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6438               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6439               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6440               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6441               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6442               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6443               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6444               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6445               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6446               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6447             }
6448           }
6449         }
6450         else
6451         {
6452           // Conditional branch
6453           will_dirty_i=0;
6454           wont_dirty_i=wont_dirty_next;
6455           // Merge in delay slot (will dirty)
6456           for(r=0;r<HOST_REGS;r++) {
6457             if(r!=EXCLUDE_REG) {
6458               if(!likely[i]) {
6459                 // Might not dirty if likely branch is not taken
6460                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6461                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6462                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6463                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6464                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6465                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6466                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6467                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6468                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6469                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6470                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6471                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6472                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6473                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6474               }
6475             }
6476           }
6477         }
6478         // Merge in delay slot (wont dirty)
6479         for(r=0;r<HOST_REGS;r++) {
6480           if(r!=EXCLUDE_REG) {
6481             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6482             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6483             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6484             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6485             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6486             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6487             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6488             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6489             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6490             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6491           }
6492         }
6493         if(wr) {
6494           #ifndef DESTRUCTIVE_WRITEBACK
6495           branch_regs[i].dirty&=wont_dirty_i;
6496           #endif
6497           branch_regs[i].dirty|=will_dirty_i;
6498         }
6499       }
6500       else
6501       {
6502         // Internal branch
6503         if(ba[i]<=start+i*4) {
6504           // Backward branch
6505           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6506           {
6507             // Unconditional branch
6508             temp_will_dirty=0;
6509             temp_wont_dirty=0;
6510             // Merge in delay slot (will dirty)
6511             for(r=0;r<HOST_REGS;r++) {
6512               if(r!=EXCLUDE_REG) {
6513                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6514                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6515                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6516                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6517                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6518                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6519                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6520                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6521                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6522                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6523                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6524                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6525                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6526                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6527               }
6528             }
6529           } else {
6530             // Conditional branch (not taken case)
6531             temp_will_dirty=will_dirty_next;
6532             temp_wont_dirty=wont_dirty_next;
6533             // Merge in delay slot (will dirty)
6534             for(r=0;r<HOST_REGS;r++) {
6535               if(r!=EXCLUDE_REG) {
6536                 if(!likely[i]) {
6537                   // Will not dirty if likely branch is not taken
6538                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6539                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6540                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6541                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6542                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6543                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
6544                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6545                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6546                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6547                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6548                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6549                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6550                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6551                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6552                 }
6553               }
6554             }
6555           }
6556           // Merge in delay slot (won't dirty)
6557           for(r=0;r<HOST_REGS;r++) {
6558             if(r!=EXCLUDE_REG) {
6559               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6560               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6561               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6562               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6563               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6564               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6565               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6566               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6567               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6568               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6569             }
6570           }
6571           // Deal with changed mappings
6572           if(i<iend) {
6573             for(r=0;r<HOST_REGS;r++) {
6574               if(r!=EXCLUDE_REG) {
6575                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
6576                   temp_will_dirty&=~(1<<r);
6577                   temp_wont_dirty&=~(1<<r);
6578                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6579                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6580                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6581                   } else {
6582                     temp_will_dirty|=1<<r;
6583                     temp_wont_dirty|=1<<r;
6584                   }
6585                 }
6586               }
6587             }
6588           }
6589           if(wr) {
6590             will_dirty[i]=temp_will_dirty;
6591             wont_dirty[i]=temp_wont_dirty;
6592             clean_registers((ba[i]-start)>>2,i-1,0);
6593           }else{
6594             // Limit recursion.  It can take an excessive amount
6595             // of time if there are a lot of nested loops.
6596             will_dirty[(ba[i]-start)>>2]=0;
6597             wont_dirty[(ba[i]-start)>>2]=-1;
6598           }
6599         }
6600         /*else*/ if(1)
6601         {
6602           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6603           {
6604             // Unconditional branch
6605             will_dirty_i=0;
6606             wont_dirty_i=0;
6607           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6608             for(r=0;r<HOST_REGS;r++) {
6609               if(r!=EXCLUDE_REG) {
6610                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6611                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
6612                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6613                 }
6614                 if(branch_regs[i].regmap[r]>=0) {
6615                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6616                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6617                 }
6618               }
6619             }
6620           //}
6621             // Merge in delay slot
6622             for(r=0;r<HOST_REGS;r++) {
6623               if(r!=EXCLUDE_REG) {
6624                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6625                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6626                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6627                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6628                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6629                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6630                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6631                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6632                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6633                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6634                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6635                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6636                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6637                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6638               }
6639             }
6640           } else {
6641             // Conditional branch
6642             will_dirty_i=will_dirty_next;
6643             wont_dirty_i=wont_dirty_next;
6644           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6645             for(r=0;r<HOST_REGS;r++) {
6646               if(r!=EXCLUDE_REG) {
6647                 signed char target_reg=branch_regs[i].regmap[r];
6648                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6649                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6650                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6651                 }
6652                 else if(target_reg>=0) {
6653                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6654                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6655                 }
6656                 // Treat delay slot as part of branch too
6657                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6658                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6659                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6660                 }
6661                 else
6662                 {
6663                   will_dirty[i+1]&=~(1<<r);
6664                 }*/
6665               }
6666             }
6667           //}
6668             // Merge in delay slot
6669             for(r=0;r<HOST_REGS;r++) {
6670               if(r!=EXCLUDE_REG) {
6671                 if(!likely[i]) {
6672                   // Might not dirty if likely branch is not taken
6673                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6674                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6675                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6676                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6677                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6678                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6679                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6680                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6681                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6682                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6683                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6684                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6685                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6686                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6687                 }
6688               }
6689             }
6690           }
6691           // Merge in delay slot (won't dirty)
6692           for(r=0;r<HOST_REGS;r++) {
6693             if(r!=EXCLUDE_REG) {
6694               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6695               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6696               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6697               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6698               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6699               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6700               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6701               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6702               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6703               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6704             }
6705           }
6706           if(wr) {
6707             #ifndef DESTRUCTIVE_WRITEBACK
6708             branch_regs[i].dirty&=wont_dirty_i;
6709             #endif
6710             branch_regs[i].dirty|=will_dirty_i;
6711           }
6712         }
6713       }
6714     }
6715     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6716     {
6717       // SYSCALL instruction (software interrupt)
6718       will_dirty_i=0;
6719       wont_dirty_i=0;
6720     }
6721     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6722     {
6723       // ERET instruction (return from interrupt)
6724       will_dirty_i=0;
6725       wont_dirty_i=0;
6726     }
6727     will_dirty_next=will_dirty_i;
6728     wont_dirty_next=wont_dirty_i;
6729     for(r=0;r<HOST_REGS;r++) {
6730       if(r!=EXCLUDE_REG) {
6731         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6732         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6733         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6734         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6735         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6736         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6737         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6738         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6739         if(i>istart) {
6740           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP)
6741           {
6742             // Don't store a register immediately after writing it,
6743             // may prevent dual-issue.
6744             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
6745             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
6746           }
6747         }
6748       }
6749     }
6750     // Save it
6751     will_dirty[i]=will_dirty_i;
6752     wont_dirty[i]=wont_dirty_i;
6753     // Mark registers that won't be dirtied as not dirty
6754     if(wr) {
6755       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
6756       for(r=0;r<HOST_REGS;r++) {
6757         if((will_dirty_i>>r)&1) {
6758           printf(" r%d",r);
6759         }
6760       }
6761       printf("\n");*/
6762
6763       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
6764         regs[i].dirty|=will_dirty_i;
6765         #ifndef DESTRUCTIVE_WRITEBACK
6766         regs[i].dirty&=wont_dirty_i;
6767         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6768         {
6769           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
6770             for(r=0;r<HOST_REGS;r++) {
6771               if(r!=EXCLUDE_REG) {
6772                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
6773                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
6774                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
6775               }
6776             }
6777           }
6778         }
6779         else
6780         {
6781           if(i<iend) {
6782             for(r=0;r<HOST_REGS;r++) {
6783               if(r!=EXCLUDE_REG) {
6784                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
6785                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
6786                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
6787               }
6788             }
6789           }
6790         }
6791         #endif
6792       //}
6793     }
6794     // Deal with changed mappings
6795     temp_will_dirty=will_dirty_i;
6796     temp_wont_dirty=wont_dirty_i;
6797     for(r=0;r<HOST_REGS;r++) {
6798       if(r!=EXCLUDE_REG) {
6799         int nr;
6800         if(regs[i].regmap[r]==regmap_pre[i][r]) {
6801           if(wr) {
6802             #ifndef DESTRUCTIVE_WRITEBACK
6803             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6804             #endif
6805             regs[i].wasdirty|=will_dirty_i&(1<<r);
6806           }
6807         }
6808         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
6809           // Register moved to a different register
6810           will_dirty_i&=~(1<<r);
6811           wont_dirty_i&=~(1<<r);
6812           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
6813           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
6814           if(wr) {
6815             #ifndef DESTRUCTIVE_WRITEBACK
6816             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6817             #endif
6818             regs[i].wasdirty|=will_dirty_i&(1<<r);
6819           }
6820         }
6821         else {
6822           will_dirty_i&=~(1<<r);
6823           wont_dirty_i&=~(1<<r);
6824           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6825             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6826             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6827           } else {
6828             wont_dirty_i|=1<<r;
6829             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);*/ /*assert(!((will_dirty>>r)&1));*/
6830           }
6831         }
6832       }
6833     }
6834   }
6835 }
6836
6837 #ifdef DISASM
6838   /* disassembly */
6839 void disassemble_inst(int i)
6840 {
6841     if (bt[i]) printf("*"); else printf(" ");
6842     switch(itype[i]) {
6843       case UJUMP:
6844         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6845       case CJUMP:
6846         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
6847       case SJUMP:
6848         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
6849       case FJUMP:
6850         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6851       case RJUMP:
6852         if (opcode[i]==0x9&&rt1[i]!=31)
6853           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
6854         else
6855           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
6856         break;
6857       case SPAN:
6858         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
6859       case IMM16:
6860         if(opcode[i]==0xf) //LUI
6861           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
6862         else
6863           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6864         break;
6865       case LOAD:
6866       case LOADLR:
6867         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6868         break;
6869       case STORE:
6870       case STORELR:
6871         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
6872         break;
6873       case ALU:
6874       case SHIFT:
6875         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
6876         break;
6877       case MULTDIV:
6878         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
6879         break;
6880       case SHIFTIMM:
6881         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6882         break;
6883       case MOV:
6884         if((opcode2[i]&0x1d)==0x10)
6885           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
6886         else if((opcode2[i]&0x1d)==0x11)
6887           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
6888         else
6889           printf (" %x: %s\n",start+i*4,insn[i]);
6890         break;
6891       case COP0:
6892         if(opcode2[i]==0)
6893           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
6894         else if(opcode2[i]==4)
6895           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
6896         else printf (" %x: %s\n",start+i*4,insn[i]);
6897         break;
6898       case COP1:
6899         if(opcode2[i]<3)
6900           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
6901         else if(opcode2[i]>3)
6902           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
6903         else printf (" %x: %s\n",start+i*4,insn[i]);
6904         break;
6905       case COP2:
6906         if(opcode2[i]<3)
6907           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
6908         else if(opcode2[i]>3)
6909           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
6910         else printf (" %x: %s\n",start+i*4,insn[i]);
6911         break;
6912       case C1LS:
6913         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
6914         break;
6915       case C2LS:
6916         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
6917         break;
6918       case INTCALL:
6919         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
6920         break;
6921       default:
6922         //printf (" %s %8x\n",insn[i],source[i]);
6923         printf (" %x: %s\n",start+i*4,insn[i]);
6924     }
6925 }
6926 #else
6927 static void disassemble_inst(int i) {}
6928 #endif // DISASM
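/* Illustrative only: with DISASM defined, the printf formats above yield
 * listings roughly like the following (addresses and registers made up,
 * the leading '*' marks a branch target, i.e. bt[i] is set):
 *
 *   * 80030000: LUI r8,1f800000
 *     80030004: SW r9,r8+0
 *     80030008: BNE r9,r0,80030000
 *     8003000c: ADDIU r9,r9,-1
 */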
6929
6930 #define DRC_TEST_VAL 0x74657374
6931
6932 static int new_dynarec_test(void)
6933 {
6934   int (*testfunc)(void) = (void *)out;
6935   int ret;
6936   emit_movimm(DRC_TEST_VAL,0); // test
6937   emit_jmpreg(14);
6938   literal_pool(0);
6939 #ifdef __arm__
6940   __clear_cache((void *)testfunc, out);
6941 #endif
6942   SysPrintf("testing if we can run recompiled code..\n");
6943   ret = testfunc();
6944   if (ret == DRC_TEST_VAL)
6945     SysPrintf("test passed.\n");
6946   else
6947     SysPrintf("test failed: %08x\n", ret);
6948   out=(u_char *)BASE_ADDR;
6949   return ret == DRC_TEST_VAL;
6950 }
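/* Sketch only: on ARM the test stub emitted above is essentially
 *
 *   mov r0, #0x74657374   @ DRC_TEST_VAL ("test" in ASCII)
 *   bx  r14               @ straight back to the C caller
 *
 * (emit_movimm may actually need several instructions or a literal pool
 * entry to build the 32-bit constant).  A wrong return value usually
 * means the translation cache is not executable or the instruction
 * cache was not invalidated. */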
6951
6952 // clear the state completely, instead of just marking
6953 // things invalid like invalidate_all_pages() does
6954 void new_dynarec_clear_full()
6955 {
6956   int n;
6957   out=(u_char *)BASE_ADDR;
6958   memset(invalid_code,1,sizeof(invalid_code));
6959   memset(hash_table,0xff,sizeof(hash_table));
6960   memset(mini_ht,-1,sizeof(mini_ht));
6961   memset(restore_candidate,0,sizeof(restore_candidate));
6962   memset(shadow,0,sizeof(shadow));
6963   copy=shadow;
6964   expirep=16384; // Expiry pointer, +2 blocks
6965   pending_exception=0;
6966   literalcount=0;
6967   stop_after_jal=0;
6968   inv_code_start=inv_code_end=~0;
6969   // TLB
6970   for(n=0;n<4096;n++) ll_clear(jump_in+n);
6971   for(n=0;n<4096;n++) ll_clear(jump_out+n);
6972   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
6973 }
6974
6975 void new_dynarec_init()
6976 {
6977   SysPrintf("Init new dynarec\n");
6978   out=(u_char *)BASE_ADDR;
6979 #if BASE_ADDR_FIXED
6980   if (mmap (out, 1<<TARGET_SIZE_2,
6981             PROT_READ | PROT_WRITE | PROT_EXEC,
6982             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
6983             -1, 0) == MAP_FAILED) {
6984     SysPrintf("mmap() failed: %s\n", strerror(errno));
6985   }
6986 #else
6987   // not all systems allow execute in data segment by default
6988   if (mprotect(out, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
6989     SysPrintf("mprotect() failed: %s\n", strerror(errno));
6990 #endif
6991   int n;
6992   cycle_multiplier=200;
6993   new_dynarec_clear_full();
6994 #ifdef HOST_IMM8
6995   // Copy this into local area so we don't have to put it in every literal pool
6996   invc_ptr=invalid_code;
6997 #endif
6998   arch_init();
6999   new_dynarec_test();
7000 #ifndef RAM_FIXED
7001   ram_offset=(u_int)rdram-0x80000000;
7002 #endif
7003   if (ram_offset!=0)
7004     SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
7005 }
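/* The two paths above only differ in how the translation cache becomes
 * executable: with BASE_ADDR_FIXED it is mmap()ed at a fixed address,
 * otherwise 'out' is assumed to point at already-reserved memory that
 * just needs PROT_EXEC.  A sketch of what the latter assumes, with a
 * made-up buffer name:
 *
 *   static u_char translation_cache[1 << TARGET_SIZE_2]
 *       __attribute__((aligned(4096)));
 *   // elsewhere: #define BASE_ADDR ((u_int)translation_cache)
 *
 * mprotect() needs a page-aligned start address, hence the alignment in
 * this sketch. */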
7006
7007 void new_dynarec_cleanup()
7008 {
7009   int n;
7010   #if BASE_ADDR_FIXED
7011   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {SysPrintf("munmap() failed\n");}
7012   #endif
7013   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7014   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7015   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7016   #ifdef ROM_COPY
7017   if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
7018   #endif
7019 }
7020
7021 static u_int *get_source_start(u_int addr, u_int *limit)
7022 {
7023   if (addr < 0x00200000 ||
7024     (0xa0000000 <= addr && addr < 0xa0200000)) {
7025     // used for BIOS calls mostly?
7026     *limit = (addr&0xa0000000)|0x00200000;
7027     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7028   }
7029   else if (!Config.HLE && (
7030     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
7031     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
7032     // BIOS
7033     *limit = (addr & 0xfff00000) | 0x80000;
7034     return (u_int *)((u_int)psxR + (addr&0x7ffff));
7035   }
7036   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
7037     *limit = (addr & 0x80600000) + 0x00200000;
7038     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7039   }
  return NULL; // address is not compilable here; callers check for NULL
7040 }
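/* A sketch of the address translation above (example addresses):
 *   0x00001000 or 0xa0001000 -> rdram + 0x001000, *limit = 0x00200000 / 0xa0200000
 *   0xbfc00180 (!Config.HLE) -> psxR  + 0x000180, *limit = 0xbfc80000
 *   0x80010000               -> rdram + 0x010000, *limit = 0x80200000
 * Anything else returns NULL and is rejected by the callers before any
 * compilation is attempted. */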
7041
7042 static u_int scan_for_ret(u_int addr)
7043 {
7044   u_int limit = 0;
7045   u_int *mem;
7046
7047   mem = get_source_start(addr, &limit);
7048   if (mem == NULL)
7049     return addr;
7050
7051   if (limit > addr + 0x1000)
7052     limit = addr + 0x1000;
7053   for (; addr < limit; addr += 4, mem++) {
7054     if (*mem == 0x03e00008) // jr $ra
7055       return addr + 8;
7056   }
  return addr; // no "jr $ra" found within the scan window
7057 }
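/* Example: scanning from 0x80010000, if the first 0x03e00008 ("jr $ra")
 * word is found at 0x80010040 the function returns 0x80010048, i.e. the
 * address just past the jr and its delay slot; the scan never looks more
 * than 0x1000 bytes beyond the starting address. */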
7058
7059 struct savestate_block {
7060   uint32_t addr;
7061   uint32_t regflags;
7062 };
7063
7064 static int addr_cmp(const void *p1_, const void *p2_)
7065 {
7066   const struct savestate_block *p1 = p1_, *p2 = p2_;
7067   return p1->addr - p2->addr;
7068 }
7069
7070 int new_dynarec_save_blocks(void *save, int size)
7071 {
7072   struct savestate_block *blocks = save;
7073   int maxcount = size / sizeof(blocks[0]);
7074   struct savestate_block tmp_blocks[1024];
7075   struct ll_entry *head;
7076   int p, s, d, o, bcnt;
7077   u_int addr;
7078
7079   o = 0;
7080   for (p = 0; p < sizeof(jump_in) / sizeof(jump_in[0]); p++) {
7081     bcnt = 0;
7082     for (head = jump_in[p]; head != NULL; head = head->next) {
7083       tmp_blocks[bcnt].addr = head->vaddr;
7084       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
7085       bcnt++;
7086     }
7087     if (bcnt < 1)
7088       continue;
7089     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
7090
7091     addr = tmp_blocks[0].addr;
7092     for (s = d = 0; s < bcnt; s++) {
7093       if (tmp_blocks[s].addr < addr)
7094         continue;
7095       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
7096         tmp_blocks[d++] = tmp_blocks[s];
7097       addr = scan_for_ret(tmp_blocks[s].addr);
7098     }
7099
7100     if (o + d > maxcount)
7101       d = maxcount - o;
7102     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
7103     o += d;
7104   }
7105
7106   return o * sizeof(blocks[0]);
7107 }
7108
7109 void new_dynarec_load_blocks(const void *save, int size)
7110 {
7111   const struct savestate_block *blocks = save;
7112   int count = size / sizeof(blocks[0]);
7113   u_int regs_save[32];
7114   uint32_t f;
7115   int i, b;
7116
7117   get_addr(psxRegs.pc);
7118
7119   // change GPRs for speculation to at least partially work..
7120   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
7121   for (i = 1; i < 32; i++)
7122     psxRegs.GPR.r[i] = 0x80000000;
7123
7124   for (b = 0; b < count; b++) {
7125     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7126       if (f & 1)
7127         psxRegs.GPR.r[i] = 0x1f800000;
7128     }
7129
7130     get_addr(blocks[b].addr);
7131
7132     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7133       if (f & 1)
7134         psxRegs.GPR.r[i] = 0x80000000;
7135     }
7136   }
7137
7138   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
7139 }
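/* A hedged usage sketch of the two functions above as a frontend's
 * savestate path might call them -- the buffer name and size are made up
 * for illustration, only the function signatures come from this file:
 *
 *   static char drc_block_buf[16 * 1024];
 *   // while writing a savestate:
 *   int used = new_dynarec_save_blocks(drc_block_buf, sizeof(drc_block_buf));
 *   // ...store 'used' bytes of drc_block_buf in the state...
 *   // while loading, after psxRegs has been restored:
 *   new_dynarec_load_blocks(drc_block_buf, used);
 *
 * load_blocks() temporarily sets the GPRs to RAM/scratchpad-looking
 * values so the speculative recompilation of the saved entry points
 * follows the same paths it did when the state was written. */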
7140
7141 int new_recompile_block(int addr)
7142 {
7143   u_int pagelimit = 0;
7144   u_int state_rflags = 0;
7145   int i;
7146
7147   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7148   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7149   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7150   //if(debug)
7151   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7152   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7153   /*if(Count>=312978186) {
7154     rlist();
7155   }*/
7156   //rlist();
7157
7158   // this is just for speculation
7159   for (i = 1; i < 32; i++) {
7160     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
7161       state_rflags |= 1 << i;
7162   }
7163
7164   start = (u_int)addr&~3;
7165   //assert(((u_int)addr&1)==0);
7166   new_dynarec_did_compile=1;
7167   if (Config.HLE && start == 0x80001000) // hlecall
7168   {
7169     // XXX: is this enough? Maybe check hleSoftCall?
7170     u_int beginning=(u_int)out;
7171     u_int page=get_page(start);
7172     invalid_code[start>>12]=0;
7173     emit_movimm(start,0);
7174     emit_writeword(0,(int)&pcaddr);
7175     emit_jmp((int)new_dyna_leave);
7176     literal_pool(0);
7177 #ifdef __arm__
7178     __clear_cache((void *)beginning,out);
7179 #endif
7180     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
7181     return 0;
7182   }
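  /* Note: the stub emitted above performs no emulation itself -- it just
   * stores the block address into pcaddr and exits through
   * new_dyna_leave, so the HLE BIOS call gets serviced by C code before
   * execution re-enters the recompiler. */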
7183
7184   source = get_source_start(start, &pagelimit);
7185   if (source == NULL) {
7186     SysPrintf("Compile at bogus memory address: %08x\n", addr);
7187     exit(1);
7188   }
7189
7190   /* Pass 1: disassemble */
7191   /* Pass 2: register dependencies, branch targets */
7192   /* Pass 3: register allocation */
7193   /* Pass 4: branch dependencies */
7194   /* Pass 5: pre-alloc */
7195   /* Pass 6: optimize clean/dirty state */
7196   /* Pass 7: flag 32-bit registers */
7197   /* Pass 8: assembly */
7198   /* Pass 9: linker */
7199   /* Pass 10: garbage collection / free memory */
7200
7201   int j;
7202   int done=0;
7203   unsigned int type,op,op2;
7204
7205   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7206
7207   /* Pass 1 disassembly */
7208
7209   for(i=0;!done;i++) {
7210     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
7211     minimum_free_regs[i]=0;
7212     opcode[i]=op=source[i]>>26;
7213     switch(op)
7214     {
7215       case 0x00: strcpy(insn[i],"special"); type=NI;
7216         op2=source[i]&0x3f;
7217         switch(op2)
7218         {
7219           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7220           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7221           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7222           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7223           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7224           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7225           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7226           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7227           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7228           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7229           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7230           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7231           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7232           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7233           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7234           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7235           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7236           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7237           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7238           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7239           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7240           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7241           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7242           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7243           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7244           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7245           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7246           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7247           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7248           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7249           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7250           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7251           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7252           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7253           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7254 #if 0
7255           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7256           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7257           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7258           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7259           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7260           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7261           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7262           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7263           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7264           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7265           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7266           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7267           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7268           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7269           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7270           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7271           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7272 #endif
7273         }
7274         break;
7275       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7276         op2=(source[i]>>16)&0x1f;
7277         switch(op2)
7278         {
7279           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7280           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7281           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7282           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7283           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7284           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7285           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7286           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7287           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7288           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7289           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7290           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7291           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7292           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7293         }
7294         break;
7295       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7296       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7297       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7298       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7299       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7300       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7301       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7302       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7303       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7304       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7305       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7306       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7307       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7308       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7309       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7310         op2=(source[i]>>21)&0x1f;
7311         switch(op2)
7312         {
7313           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7314           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7315           case 0x10: strcpy(insn[i],"tlb"); type=NI;
7316           switch(source[i]&0x3f)
7317           {
7318             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
7319             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
7320             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
7321             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
7322             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
7323             //case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
7324           }
7325         }
7326         break;
7327       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7328         op2=(source[i]>>21)&0x1f;
7329         switch(op2)
7330         {
7331           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7332           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7333           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7334           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7335           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7336           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7337           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7338           switch((source[i]>>16)&0x3)
7339           {
7340             case 0x00: strcpy(insn[i],"BC1F"); break;
7341             case 0x01: strcpy(insn[i],"BC1T"); break;
7342             case 0x02: strcpy(insn[i],"BC1FL"); break;
7343             case 0x03: strcpy(insn[i],"BC1TL"); break;
7344           }
7345           break;
7346           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7347           switch(source[i]&0x3f)
7348           {
7349             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7350             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7351             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7352             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7353             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7354             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7355             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7356             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7357             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7358             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7359             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7360             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7361             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7362             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7363             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7364             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7365             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7366             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7367             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7368             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7369             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7370             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7371             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7372             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7373             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7374             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7375             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7376             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7377             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7378             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7379             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7380             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7381             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7382             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7383             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7384           }
7385           break;
7386           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7387           switch(source[i]&0x3f)
7388           {
7389             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7390             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7391             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7392             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7393             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7394             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7395             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7396             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7397             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7398             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7399             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7400             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7401             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7402             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7403             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7404             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7405             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7406             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7407             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7408             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7409             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7410             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7411             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7412             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7413             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7414             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7415             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7416             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7417             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7418             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7419             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7420             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7421             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7422             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7423             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7424           }
7425           break;
7426           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7427           switch(source[i]&0x3f)
7428           {
7429             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7430             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7431           }
7432           break;
7433           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7434           switch(source[i]&0x3f)
7435           {
7436             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7437             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7438           }
7439           break;
7440         }
7441         break;
7442 #if 0
7443       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7444       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7445       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7446       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7447       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7448       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7449       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7450       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7451 #endif
7452       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7453       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7454       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7455       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7456       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7457       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7458       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7459 #if 0
7460       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7461 #endif
7462       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7463       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7464       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7465       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7466 #if 0
7467       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7468       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7469 #endif
7470       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7471       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7472       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7473       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7474 #if 0
7475       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7476       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7477       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7478 #endif
7479       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7480       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7481 #if 0
7482       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7483       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7484       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7485 #endif
7486       case 0x12: strcpy(insn[i],"COP2"); type=NI;
7487         op2=(source[i]>>21)&0x1f;
7488         //if (op2 & 0x10) {
7489         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
7490           if (gte_handlers[source[i]&0x3f]!=NULL) {
7491             if (gte_regnames[source[i]&0x3f]!=NULL)
7492               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
7493             else
7494               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
7495             type=C2OP;
7496           }
7497         }
7498         else switch(op2)
7499         {
7500           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
7501           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
7502           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
7503           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
7504         }
7505         break;
7506       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
7507       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
7508       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7509       default: strcpy(insn[i],"???"); type=NI;
7510         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
7511         break;
7512     }
7513     itype[i]=type;
7514     opcode2[i]=op2;
7515     /* Get registers/immediates */
7516     lt1[i]=0;
7517     us1[i]=0;
7518     us2[i]=0;
7519     dep1[i]=0;
7520     dep2[i]=0;
7521     gte_rs[i]=gte_rt[i]=0;
7522     switch(type) {
7523       case LOAD:
7524         rs1[i]=(source[i]>>21)&0x1f;
7525         rs2[i]=0;
7526         rt1[i]=(source[i]>>16)&0x1f;
7527         rt2[i]=0;
7528         imm[i]=(short)source[i];
7529         break;
7530       case STORE:
7531       case STORELR:
7532         rs1[i]=(source[i]>>21)&0x1f;
7533         rs2[i]=(source[i]>>16)&0x1f;
7534         rt1[i]=0;
7535         rt2[i]=0;
7536         imm[i]=(short)source[i];
7537         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7538         break;
7539       case LOADLR:
7540         // LWL/LWR only load part of the register,
7541         // therefore the target register must be treated as a source too
7542         rs1[i]=(source[i]>>21)&0x1f;
7543         rs2[i]=(source[i]>>16)&0x1f;
7544         rt1[i]=(source[i]>>16)&0x1f;
7545         rt2[i]=0;
7546         imm[i]=(short)source[i];
7547         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7548         if(op==0x26) dep1[i]=rt1[i]; // LWR
7549         break;
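        /* e.g. the usual little-endian unaligned word load
         *     lwr $t0, 0($a0)   ; fills the low bytes of $t0
         *     lwl $t0, 3($a0)   ; fills the remaining high bytes
         * replaces only part of $t0 at each step, which is why the rt
         * field is also recorded as a source (rs2[i]) above: the old
         * value has to stay live so it can be merged with the new bytes. */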
7550       case IMM16:
7551         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7552         else rs1[i]=(source[i]>>21)&0x1f;
7553         rs2[i]=0;
7554         rt1[i]=(source[i]>>16)&0x1f;
7555         rt2[i]=0;
7556         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7557           imm[i]=(unsigned short)source[i];
7558         }else{
7559           imm[i]=(short)source[i];
7560         }
7561         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7562         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7563         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7564         break;
7565       case UJUMP:
7566         rs1[i]=0;
7567         rs2[i]=0;
7568         rt1[i]=0;
7569         rt2[i]=0;
7570         // The JAL instruction writes to r31.
7571         if (op&1) {
7572           rt1[i]=31;
7573         }
7574         rs2[i]=CCREG;
7575         break;
7576       case RJUMP:
7577         rs1[i]=(source[i]>>21)&0x1f;
7578         rs2[i]=0;
7579         rt1[i]=0;
7580         rt2[i]=0;
7581         // The JALR instruction writes to rd.
7582         if (op2&1) {
7583           rt1[i]=(source[i]>>11)&0x1f;
7584         }
7585         rs2[i]=CCREG;
7586         break;
7587       case CJUMP:
7588         rs1[i]=(source[i]>>21)&0x1f;
7589         rs2[i]=(source[i]>>16)&0x1f;
7590         rt1[i]=0;
7591         rt2[i]=0;
7592         if(op&2) { // BGTZ/BLEZ
7593           rs2[i]=0;
7594         }
7595         us1[i]=rs1[i];
7596         us2[i]=rs2[i];
7597         likely[i]=op>>4;
7598         break;
7599       case SJUMP:
7600         rs1[i]=(source[i]>>21)&0x1f;
7601         rs2[i]=CCREG;
7602         rt1[i]=0;
7603         rt2[i]=0;
7604         us1[i]=rs1[i];
7605         if(op2&0x10) { // BxxAL
7606           rt1[i]=31;
7607           // NOTE: If the branch is not taken, r31 is still overwritten
7608         }
7609         likely[i]=(op2&2)>>1;
7610         break;
7611       case FJUMP:
7612         rs1[i]=FSREG;
7613         rs2[i]=CSREG;
7614         rt1[i]=0;
7615         rt2[i]=0;
7616         likely[i]=((source[i])>>17)&1;
7617         break;
7618       case ALU:
7619         rs1[i]=(source[i]>>21)&0x1f; // source
7620         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
7621         rt1[i]=(source[i]>>11)&0x1f; // destination
7622         rt2[i]=0;
7623         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7624           us1[i]=rs1[i];us2[i]=rs2[i];
7625         }
7626         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7627           dep1[i]=rs1[i];dep2[i]=rs2[i];
7628         }
7629         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7630           dep1[i]=rs1[i];dep2[i]=rs2[i];
7631         }
7632         break;
7633       case MULTDIV:
7634         rs1[i]=(source[i]>>21)&0x1f; // source
7635         rs2[i]=(source[i]>>16)&0x1f; // divisor
7636         rt1[i]=HIREG;
7637         rt2[i]=LOREG;
7638         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7639           us1[i]=rs1[i];us2[i]=rs2[i];
7640         }
7641         break;
7642       case MOV:
7643         rs1[i]=0;
7644         rs2[i]=0;
7645         rt1[i]=0;
7646         rt2[i]=0;
7647         if(op2==0x10) rs1[i]=HIREG; // MFHI
7648         if(op2==0x11) rt1[i]=HIREG; // MTHI
7649         if(op2==0x12) rs1[i]=LOREG; // MFLO
7650         if(op2==0x13) rt1[i]=LOREG; // MTLO
7651         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7652         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7653         dep1[i]=rs1[i];
7654         break;
7655       case SHIFT:
7656         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7657         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7658         rt1[i]=(source[i]>>11)&0x1f; // destination
7659         rt2[i]=0;
7660         // DSLLV/DSRLV/DSRAV are 64-bit
7661         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
7662         break;
7663       case SHIFTIMM:
7664         rs1[i]=(source[i]>>16)&0x1f;
7665         rs2[i]=0;
7666         rt1[i]=(source[i]>>11)&0x1f;
7667         rt2[i]=0;
7668         imm[i]=(source[i]>>6)&0x1f;
7669         // DSxx32 instructions
7670         if(op2>=0x3c) imm[i]|=0x20;
7671         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
7672         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
7673         break;
7674       case COP0:
7675         rs1[i]=0;
7676         rs2[i]=0;
7677         rt1[i]=0;
7678         rt2[i]=0;
7679         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
7680         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
7681         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7682         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7683         break;
7684       case COP1:
7685         rs1[i]=0;
7686         rs2[i]=0;
7687         rt1[i]=0;
7688         rt2[i]=0;
7689         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7690         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7691         if(op2==5) us1[i]=rs1[i]; // DMTC1
7692         rs2[i]=CSREG;
7693         break;
7694       case COP2:
7695         rs1[i]=0;
7696         rs2[i]=0;
7697         rt1[i]=0;
7698         rt2[i]=0;
7699         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
7700         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
7701         rs2[i]=CSREG;
7702         int gr=(source[i]>>11)&0x1F;
7703         switch(op2)
7704         {
7705           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
7706           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
7707           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
7708           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
7709         }
7710         break;
7711       case C1LS:
7712         rs1[i]=(source[i]>>21)&0x1F;
7713         rs2[i]=CSREG;
7714         rt1[i]=0;
7715         rt2[i]=0;
7716         imm[i]=(short)source[i];
7717         break;
7718       case C2LS:
7719         rs1[i]=(source[i]>>21)&0x1F;
7720         rs2[i]=0;
7721         rt1[i]=0;
7722         rt2[i]=0;
7723         imm[i]=(short)source[i];
7724         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7725         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7726         break;
7727       case C2OP:
7728         rs1[i]=0;
7729         rs2[i]=0;
7730         rt1[i]=0;
7731         rt2[i]=0;
7732         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
7733         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
7734         gte_rt[i]|=1ll<<63; // every op changes flags
7735         if((source[i]&0x3f)==GTE_MVMVA) {
7736           int v = (source[i] >> 15) & 3;
7737           gte_rs[i]&=~0xe3fll;
7738           if(v==3) gte_rs[i]|=0xe00ll;
7739           else gte_rs[i]|=3ll<<(v*2);
7740         }
7741         break;
7742       case FLOAT:
7743       case FCONV:
7744         rs1[i]=0;
7745         rs2[i]=CSREG;
7746         rt1[i]=0;
7747         rt2[i]=0;
7748         break;
7749       case FCOMP:
7750         rs1[i]=FSREG;
7751         rs2[i]=CSREG;
7752         rt1[i]=FSREG;
7753         rt2[i]=0;
7754         break;
7755       case SYSCALL:
7756       case HLECALL:
7757       case INTCALL:
7758         rs1[i]=CCREG;
7759         rs2[i]=0;
7760         rt1[i]=0;
7761         rt2[i]=0;
7762         break;
7763       default:
7764         rs1[i]=0;
7765         rs2[i]=0;
7766         rt1[i]=0;
7767         rt2[i]=0;
7768     }
7769     /* Calculate branch target addresses */
7770     if(type==UJUMP)
7771       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
7772     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
7773       ba[i]=start+i*4+8; // Ignore never taken branch
7774     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
7775       ba[i]=start+i*4+8; // Ignore never taken branch
7776     else if(type==CJUMP||type==SJUMP||type==FJUMP)
7777       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
7778     else ba[i]=-1;
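    /* Worked examples of the target arithmetic above (made-up addresses):
     *  - J/JAL at pc=0x80010008 with instr_index 0x004000:
     *      ba = (0x8001000c & 0xf0000000) | (0x004000 << 2) = 0x80010000
     *  - BEQ/BNE/Bxx at pc=0x80010010 with 16-bit offset 0xfffc (-4 words):
     *      ba = 0x80010010 + 4 + (-4 << 2) = 0x80010004
     * Never-taken encodings (BNE r,r or BLTZ $zero) simply get pc+8. */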
7779     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
7780       int do_in_intrp=0;
7781       // branch in delay slot?
7782       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
7783         // don't handle first branch and call interpreter if it's hit
7784         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
7785         do_in_intrp=1;
7786       }
7787       // basic load delay detection
7788       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
7789         int t=(ba[i-1]-start)/4;
7790         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
7791           // jump target wants DS result - potential load delay effect
7792           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
7793           do_in_intrp=1;
7794           bt[t+1]=1; // expected return from interpreter
7795         }
7796         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
7797               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
7798           // v0 overwrite like this is a sign of trouble, bail out
7799           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
7800           do_in_intrp=1;
7801         }
7802       }
7803       if(do_in_intrp) {
7804         rs1[i-1]=CCREG;
7805         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
7806         ba[i-1]=-1;
7807         itype[i-1]=INTCALL;
7808         done=2;
7809         i--; // don't compile the DS
7810       }
7811     }
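    /* A hypothetical sequence that trips the load-delay check above:
     *   target:
     *     addu $t0, $v0, $t1      ; reads $v0 right at the branch target
     *     ...
     *     bne  $v1, $zero, target
     *     lw   $v0, 0($a0)        ; load sits in the branch delay slot
     * On the R3000 the instruction after the load may still see the old
     * value of $v0; rather than model that, the branch is turned into an
     * INTCALL and the rest of the block is left to the interpreter. */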
7812     /* Is this the end of the block? */
7813     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
7814       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
7815         done=2;
7816       }
7817       else {
7818         if(stop_after_jal) done=1;
7819         // Stop on BREAK
7820         if((source[i+1]&0xfc00003f)==0x0d) done=1;
7821       }
7822       // Don't recompile stuff that's already compiled
7823       if(check_addr(start+i*4+4)) done=1;
7824       // Don't get too close to the limit
7825       if(i>MAXBLOCK/2) done=1;
7826     }
7827     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
7828     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
7829     if(done==2) {
7830       // Does the block continue due to a branch?
7831       for(j=i-1;j>=0;j--)
7832       {
7833         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
7834         if(ba[j]==start+i*4+4) done=j=0;
7835         if(ba[j]==start+i*4+8) done=j=0;
7836       }
7837     }
7838     //assert(i<MAXBLOCK-1);
7839     if(start+i*4==pagelimit-4) done=1;
7840     assert(start+i*4<pagelimit);
7841     if (i==MAXBLOCK-1) done=1;
7842     // Stop if we're compiling junk
7843     if(itype[i]==NI&&opcode[i]==0x11) {
7844       done=stop_after_jal=1;
7845       SysPrintf("Disabled speculative precompilation\n");
7846     }
7847   }
7848   slen=i;
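       // If the last decoded instruction is a branch and the block ends exactly
       // at the page limit, mark it SPAN so it gets the special page-spanning
       // branch handling (see pagespan_alloc below).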
7849   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
7850     if(start+i*4==pagelimit) {
7851       itype[i-1]=SPAN;
7852     }
7853   }
7854   assert(slen>0);
7855
7856   /* Pass 2 - Register dependencies and branch targets */
7857
7858   unneeded_registers(0,slen-1,0);
7859
7860   /* Pass 3 - Register allocation */
7861
7862   struct regstat current; // Current register allocations/status
7863   current.is32=1;
7864   current.dirty=0;
7865   current.u=unneeded_reg[0];
7866   current.uu=unneeded_reg_upper[0];
7867   clear_all_regs(current.regmap);
7868   alloc_reg(&current,0,CCREG);
7869   dirty_reg(&current,CCREG);
7870   current.isconst=0;
7871   current.wasconst=0;
7872   current.waswritten=0;
7873   int ds=0;
7874   int cc=0;
7875   int hr=-1;
7876
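       // An entry address with the low bit set is an internal convention for
       // "start compiling at a branch delay slot"; BTREG is pre-allocated so
       // the generated code can pick up the branch target to continue to
       // after the slot.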
7877   if((u_int)addr&1) {
7878     // First instruction is delay slot
7879     cc=-1;
7880     bt[1]=1;
7881     ds=1;
7882     unneeded_reg[0]=1;
7883     unneeded_reg_upper[0]=1;
7884     current.regmap[HOST_BTREG]=BTREG;
7885   }
7886
7887   for(i=0;i<slen;i++)
7888   {
7889     if(bt[i])
7890     {
7891       int hr;
7892       for(hr=0;hr<HOST_REGS;hr++)
7893       {
7894         // Is this really necessary?
7895         if(current.regmap[hr]==0) current.regmap[hr]=-1;
7896       }
7897       current.isconst=0;
7898       current.waswritten=0;
7899     }
7900     if(i>1)
7901     {
7902       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
7903       {
7904         if(rs1[i-2]==0||rs2[i-2]==0)
7905         {
7906           if(rs1[i-2]) {
7907             current.is32|=1LL<<rs1[i-2];
7908             int hr=get_reg(current.regmap,rs1[i-2]|64);
7909             if(hr>=0) current.regmap[hr]=-1;
7910           }
7911           if(rs2[i-2]) {
7912             current.is32|=1LL<<rs2[i-2];
7913             int hr=get_reg(current.regmap,rs2[i-2]|64);
7914             if(hr>=0) current.regmap[hr]=-1;
7915           }
7916         }
7917       }
7918     }
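         // PSX GPRs are only 32 bits wide, so treat every register as 32-bit;
         // the 64-bit width tracking above is effectively disabled here
         // (presumably left over from the 64-bit MIPS core this recompiler
         // derives from).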
7919     current.is32=-1LL;
7920
7921     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
7922     regs[i].wasconst=current.isconst;
7923     regs[i].was32=current.is32;
7924     regs[i].wasdirty=current.dirty;
7925     regs[i].loadedconst=0;
7926     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
7927       if(i+1<slen) {
7928         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
7929         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
7930         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
7931         current.u|=1;
7932         current.uu|=1;
7933       } else {
7934         current.u=1;
7935         current.uu=1;
7936       }
7937     } else {
7938       if(i+1<slen) {
7939         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
7940         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
7941         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
7942         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7943         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
7944         current.u|=1;
7945         current.uu|=1;
7946       } else { SysPrintf("oops, branch at end of block with no delay slot\n");exit(1); }
7947     }
7948     is_ds[i]=ds;
7949     if(ds) {
7950       ds=0; // Skip delay slot, already allocated as part of branch
7951       // ...but we need to alloc it in case something jumps here
7952       if(i+1<slen) {
7953         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
7954         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
7955       }else{
7956         current.u=branch_unneeded_reg[i-1];
7957         current.uu=branch_unneeded_reg_upper[i-1];
7958       }
7959       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7960       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
7961       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
7962       current.u|=1;
7963       current.uu|=1;
7964       struct regstat temp;
7965       memcpy(&temp,&current,sizeof(current));
7966       temp.wasdirty=temp.dirty;
7967       temp.was32=temp.is32;
7968       // TODO: Take into account unconditional branches, as below
7969       delayslot_alloc(&temp,i);
7970       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
7971       regs[i].wasdirty=temp.wasdirty;
7972       regs[i].was32=temp.was32;
7973       regs[i].dirty=temp.dirty;
7974       regs[i].is32=temp.is32;
7975       regs[i].isconst=0;
7976       regs[i].wasconst=0;
7977       current.isconst=0;
7978       // Create entry (branch target) regmap
7979       for(hr=0;hr<HOST_REGS;hr++)
7980       {
7981         int r=temp.regmap[hr];
7982         if(r>=0) {
7983           if(r!=regmap_pre[i][hr]) {
7984             regs[i].regmap_entry[hr]=-1;
7985           }
7986           else
7987           {
7988             if(r<64){
7989               if((current.u>>r)&1) {
7990                 regs[i].regmap_entry[hr]=-1;
7991                 regs[i].regmap[hr]=-1;
7992                 //Don't clear regs in the delay slot as the branch might need them
7993                 //current.regmap[hr]=-1;
7994               }else
7995                 regs[i].regmap_entry[hr]=r;
7996             }
7997             else {
7998               if((current.uu>>(r&63))&1) {
7999                 regs[i].regmap_entry[hr]=-1;
8000                 regs[i].regmap[hr]=-1;
8001                 //Don't clear regs in the delay slot as the branch might need them
8002                 //current.regmap[hr]=-1;
8003               }else
8004                 regs[i].regmap_entry[hr]=r;
8005             }
8006           }
8007         } else {
8008           // First instruction expects CCREG to be allocated
8009           if(i==0&&hr==HOST_CCREG)
8010             regs[i].regmap_entry[hr]=CCREG;
8011           else
8012             regs[i].regmap_entry[hr]=-1;
8013         }
8014       }
8015     }
8016     else { // Not delay slot
8017       switch(itype[i]) {
8018         case UJUMP:
8019           //current.isconst=0; // DEBUG
8020           //current.wasconst=0; // DEBUG
8021           //regs[i].wasconst=0; // DEBUG
8022           clear_const(&current,rt1[i]);
8023           alloc_cc(&current,i);
8024           dirty_reg(&current,CCREG);
8025           if (rt1[i]==31) {
8026             alloc_reg(&current,i,31);
8027             dirty_reg(&current,31);
8028             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8029             //assert(rt1[i+1]!=rt1[i]);
8030             #ifdef REG_PREFETCH
8031             alloc_reg(&current,i,PTEMP);
8032             #endif
8033             //current.is32|=1LL<<rt1[i];
8034           }
8035           ooo[i]=1;
8036           delayslot_alloc(&current,i+1);
8037           //current.isconst=0; // DEBUG
8038           ds=1;
8039           //printf("i=%d, isconst=%x\n",i,current.isconst);
8040           break;
8041         case RJUMP:
8042           //current.isconst=0;
8043           //current.wasconst=0;
8044           //regs[i].wasconst=0;
8045           clear_const(&current,rs1[i]);
8046           clear_const(&current,rt1[i]);
8047           alloc_cc(&current,i);
8048           dirty_reg(&current,CCREG);
8049           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8050             alloc_reg(&current,i,rs1[i]);
8051             if (rt1[i]!=0) {
8052               alloc_reg(&current,i,rt1[i]);
8053               dirty_reg(&current,rt1[i]);
8054               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
8055               assert(rt1[i+1]!=rt1[i]);
8056               #ifdef REG_PREFETCH
8057               alloc_reg(&current,i,PTEMP);
8058               #endif
8059             }
8060             #ifdef USE_MINI_HT
8061             if(rs1[i]==31) { // JALR
8062               alloc_reg(&current,i,RHASH);
8063               #ifndef HOST_IMM_ADDR32
8064               alloc_reg(&current,i,RHTBL);
8065               #endif
8066             }
8067             #endif
8068             delayslot_alloc(&current,i+1);
8069           } else {
8070             // The delay slot overwrites our source register,
8071             // allocate a temporary register to hold the old value.
8072             current.isconst=0;
8073             current.wasconst=0;
8074             regs[i].wasconst=0;
8075             delayslot_alloc(&current,i+1);
8076             current.isconst=0;
8077             alloc_reg(&current,i,RTEMP);
8078           }
8079           //current.isconst=0; // DEBUG
8080           ooo[i]=1;
8081           ds=1;
8082           break;
8083         case CJUMP:
8084           //current.isconst=0;
8085           //current.wasconst=0;
8086           //regs[i].wasconst=0;
8087           clear_const(&current,rs1[i]);
8088           clear_const(&current,rs2[i]);
8089           if((opcode[i]&0x3E)==4) // BEQ/BNE
8090           {
8091             alloc_cc(&current,i);
8092             dirty_reg(&current,CCREG);
8093             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8094             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8095             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8096             {
8097               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8098               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8099             }
8100             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8101                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8102               // The delay slot overwrites one of our conditions.
8103               // Allocate the branch condition registers instead.
8104               current.isconst=0;
8105               current.wasconst=0;
8106               regs[i].wasconst=0;
8107               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8108               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8109               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8110               {
8111                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8112                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8113               }
8114             }
8115             else
8116             {
8117               ooo[i]=1;
8118               delayslot_alloc(&current,i+1);
8119             }
8120           }
8121           else
8122           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8123           {
8124             alloc_cc(&current,i);
8125             dirty_reg(&current,CCREG);
8126             alloc_reg(&current,i,rs1[i]);
8127             if(!(current.is32>>rs1[i]&1))
8128             {
8129               alloc_reg64(&current,i,rs1[i]);
8130             }
8131             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8132               // The delay slot overwrites one of our conditions.
8133               // Allocate the branch condition registers instead.
8134               current.isconst=0;
8135               current.wasconst=0;
8136               regs[i].wasconst=0;
8137               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8138               if(!((current.is32>>rs1[i])&1))
8139               {
8140                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8141               }
8142             }
8143             else
8144             {
8145               ooo[i]=1;
8146               delayslot_alloc(&current,i+1);
8147             }
8148           }
8149           else
8150           // Don't alloc the delay slot yet because we might not execute it
8151           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8152           {
8153             current.isconst=0;
8154             current.wasconst=0;
8155             regs[i].wasconst=0;
8156             alloc_cc(&current,i);
8157             dirty_reg(&current,CCREG);
8158             alloc_reg(&current,i,rs1[i]);
8159             alloc_reg(&current,i,rs2[i]);
8160             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8161             {
8162               alloc_reg64(&current,i,rs1[i]);
8163               alloc_reg64(&current,i,rs2[i]);
8164             }
8165           }
8166           else
8167           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8168           {
8169             current.isconst=0;
8170             current.wasconst=0;
8171             regs[i].wasconst=0;
8172             alloc_cc(&current,i);
8173             dirty_reg(&current,CCREG);
8174             alloc_reg(&current,i,rs1[i]);
8175             if(!(current.is32>>rs1[i]&1))
8176             {
8177               alloc_reg64(&current,i,rs1[i]);
8178             }
8179           }
8180           ds=1;
8181           //current.isconst=0;
8182           break;
8183         case SJUMP:
8184           //current.isconst=0;
8185           //current.wasconst=0;
8186           //regs[i].wasconst=0;
8187           clear_const(&current,rs1[i]);
8188           clear_const(&current,rt1[i]);
8189           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8190           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8191           {
8192             alloc_cc(&current,i);
8193             dirty_reg(&current,CCREG);
8194             alloc_reg(&current,i,rs1[i]);
8195             if(!(current.is32>>rs1[i]&1))
8196             {
8197               alloc_reg64(&current,i,rs1[i]);
8198             }
8199             if (rt1[i]==31) { // BLTZAL/BGEZAL
8200               alloc_reg(&current,i,31);
8201               dirty_reg(&current,31);
8202               //#ifdef REG_PREFETCH
8203               //alloc_reg(&current,i,PTEMP);
8204               //#endif
8205               //current.is32|=1LL<<rt1[i];
8206             }
8207             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
8208                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
8209               // Allocate the branch condition registers instead.
8210               current.isconst=0;
8211               current.wasconst=0;
8212               regs[i].wasconst=0;
8213               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8214               if(!((current.is32>>rs1[i])&1))
8215               {
8216                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8217               }
8218             }
8219             else
8220             {
8221               ooo[i]=1;
8222               delayslot_alloc(&current,i+1);
8223             }
8224           }
8225           else
8226           // Don't alloc the delay slot yet because we might not execute it
8227           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
8228           {
8229             current.isconst=0;
8230             current.wasconst=0;
8231             regs[i].wasconst=0;
8232             alloc_cc(&current,i);
8233             dirty_reg(&current,CCREG);
8234             alloc_reg(&current,i,rs1[i]);
8235             if(!(current.is32>>rs1[i]&1))
8236             {
8237               alloc_reg64(&current,i,rs1[i]);
8238             }
8239           }
8240           ds=1;
8241           //current.isconst=0;
8242           break;
8243         case FJUMP:
8244           current.isconst=0;
8245           current.wasconst=0;
8246           regs[i].wasconst=0;
8247           if(likely[i]==0) // BC1F/BC1T
8248           {
8249             // TODO: Theoretically we can run out of registers here on x86.
8250             // The delay slot can allocate up to six, and we need to check
8251             // CSREG before executing the delay slot.  Possibly we can drop
8252             // the cycle count and then reload it after checking that the
8253             // FPU is in a usable state, or don't do out-of-order execution.
8254             alloc_cc(&current,i);
8255             dirty_reg(&current,CCREG);
8256             alloc_reg(&current,i,FSREG);
8257             alloc_reg(&current,i,CSREG);
8258             if(itype[i+1]==FCOMP) {
8259               // The delay slot overwrites the branch condition.
8260               // Allocate the branch condition registers instead.
8261               alloc_cc(&current,i);
8262               dirty_reg(&current,CCREG);
8263               alloc_reg(&current,i,CSREG);
8264               alloc_reg(&current,i,FSREG);
8265             }
8266             else {
8267               ooo[i]=1;
8268               delayslot_alloc(&current,i+1);
8269               alloc_reg(&current,i+1,CSREG);
8270             }
8271           }
8272           else
8273           // Don't alloc the delay slot yet because we might not execute it
8274           if(likely[i]) // BC1FL/BC1TL
8275           {
8276             alloc_cc(&current,i);
8277             dirty_reg(&current,CCREG);
8278             alloc_reg(&current,i,CSREG);
8279             alloc_reg(&current,i,FSREG);
8280           }
8281           ds=1;
8282           current.isconst=0;
8283           break;
8284         case IMM16:
8285           imm16_alloc(&current,i);
8286           break;
8287         case LOAD:
8288         case LOADLR:
8289           load_alloc(&current,i);
8290           break;
8291         case STORE:
8292         case STORELR:
8293           store_alloc(&current,i);
8294           break;
8295         case ALU:
8296           alu_alloc(&current,i);
8297           break;
8298         case SHIFT:
8299           shift_alloc(&current,i);
8300           break;
8301         case MULTDIV:
8302           multdiv_alloc(&current,i);
8303           break;
8304         case SHIFTIMM:
8305           shiftimm_alloc(&current,i);
8306           break;
8307         case MOV:
8308           mov_alloc(&current,i);
8309           break;
8310         case COP0:
8311           cop0_alloc(&current,i);
8312           break;
8313         case COP1:
8314         case COP2:
8315           cop1_alloc(&current,i);
8316           break;
8317         case C1LS:
8318           c1ls_alloc(&current,i);
8319           break;
8320         case C2LS:
8321           c2ls_alloc(&current,i);
8322           break;
8323         case C2OP:
8324           c2op_alloc(&current,i);
8325           break;
8326         case FCONV:
8327           fconv_alloc(&current,i);
8328           break;
8329         case FLOAT:
8330           float_alloc(&current,i);
8331           break;
8332         case FCOMP:
8333           fcomp_alloc(&current,i);
8334           break;
8335         case SYSCALL:
8336         case HLECALL:
8337         case INTCALL:
8338           syscall_alloc(&current,i);
8339           break;
8340         case SPAN:
8341           pagespan_alloc(&current,i);
8342           break;
8343       }
8344
8345       // Drop the upper half of registers that have become 32-bit
8346       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8347       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8348         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8349         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8350         current.uu|=1;
8351       } else {
8352         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8353         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8354         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8355         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8356         current.uu|=1;
8357       }
8358
8359       // Create entry (branch target) regmap
8360       for(hr=0;hr<HOST_REGS;hr++)
8361       {
8362         int r,or,er;
8363         r=current.regmap[hr];
8364         if(r>=0) {
8365           if(r!=regmap_pre[i][hr]) {
8366             // TODO: delay slot (?)
8367             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8368             if(or<0||(r&63)>=TEMPREG){
8369               regs[i].regmap_entry[hr]=-1;
8370             }
8371             else
8372             {
8373               // Just move it to a different register
8374               regs[i].regmap_entry[hr]=r;
8375               // If it was dirty before, it's still dirty
8376               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8377             }
8378           }
8379           else
8380           {
8381             // Unneeded
8382             if(r==0){
8383               regs[i].regmap_entry[hr]=0;
8384             }
8385             else
8386             if(r<64){
8387               if((current.u>>r)&1) {
8388                 regs[i].regmap_entry[hr]=-1;
8389                 //regs[i].regmap[hr]=-1;
8390                 current.regmap[hr]=-1;
8391               }else
8392                 regs[i].regmap_entry[hr]=r;
8393             }
8394             else {
8395               if((current.uu>>(r&63))&1) {
8396                 regs[i].regmap_entry[hr]=-1;
8397                 //regs[i].regmap[hr]=-1;
8398                 current.regmap[hr]=-1;
8399               }else
8400                 regs[i].regmap_entry[hr]=r;
8401             }
8402           }
8403         } else {
8404           // Branches expect CCREG to be allocated at the target
8405           if(regmap_pre[i][hr]==CCREG)
8406             regs[i].regmap_entry[hr]=CCREG;
8407           else
8408             regs[i].regmap_entry[hr]=-1;
8409         }
8410       }
8411       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8412     }
8413
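         // Track MIPS registers recently used as a store base with a small
         // (<0x800) offset; clear the flag when the register is overwritten
         // or used as a base with a large offset.  This looks to be consumed
         // later to skip redundant self-modifying-code checks on such stores.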
8414     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
8415       current.waswritten|=1<<rs1[i-1];
8416     current.waswritten&=~(1<<rt1[i]);
8417     current.waswritten&=~(1<<rt2[i]);
8418     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
8419       current.waswritten&=~(1<<rs1[i]);
8420
8421     /* Branch post-alloc */
8422     if(i>0)
8423     {
8424       current.was32=current.is32;
8425       current.wasdirty=current.dirty;
8426       switch(itype[i-1]) {
8427         case UJUMP:
8428           memcpy(&branch_regs[i-1],&current,sizeof(current));
8429           branch_regs[i-1].isconst=0;
8430           branch_regs[i-1].wasconst=0;
8431           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8432           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8433           alloc_cc(&branch_regs[i-1],i-1);
8434           dirty_reg(&branch_regs[i-1],CCREG);
8435           if(rt1[i-1]==31) { // JAL
8436             alloc_reg(&branch_regs[i-1],i-1,31);
8437             dirty_reg(&branch_regs[i-1],31);
8438             branch_regs[i-1].is32|=1LL<<31;
8439           }
8440           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8441           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8442           break;
8443         case RJUMP:
8444           memcpy(&branch_regs[i-1],&current,sizeof(current));
8445           branch_regs[i-1].isconst=0;
8446           branch_regs[i-1].wasconst=0;
8447           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8448           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8449           alloc_cc(&branch_regs[i-1],i-1);
8450           dirty_reg(&branch_regs[i-1],CCREG);
8451           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8452           if(rt1[i-1]!=0) { // JALR
8453             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
8454             dirty_reg(&branch_regs[i-1],rt1[i-1]);
8455             branch_regs[i-1].is32|=1LL<<rt1[i-1];
8456           }
8457           #ifdef USE_MINI_HT
8458           if(rs1[i-1]==31) { // JALR
8459             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8460             #ifndef HOST_IMM_ADDR32
8461             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8462             #endif
8463           }
8464           #endif
8465           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8466           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8467           break;
8468         case CJUMP:
8469           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8470           {
8471             alloc_cc(&current,i-1);
8472             dirty_reg(&current,CCREG);
8473             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8474                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8475               // The delay slot overwrote one of our conditions
8476               // Delay slot goes after the test (in order)
8477               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8478               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8479               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8480               current.u|=1;
8481               current.uu|=1;
8482               delayslot_alloc(&current,i);
8483               current.isconst=0;
8484             }
8485             else
8486             {
8487               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8488               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8489               // Alloc the branch condition registers
8490               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8491               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8492               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8493               {
8494                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8495                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8496               }
8497             }
8498             memcpy(&branch_regs[i-1],&current,sizeof(current));
8499             branch_regs[i-1].isconst=0;
8500             branch_regs[i-1].wasconst=0;
8501             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8502             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8503           }
8504           else
8505           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8506           {
8507             alloc_cc(&current,i-1);
8508             dirty_reg(&current,CCREG);
8509             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8510               // The delay slot overwrote the branch condition
8511               // Delay slot goes after the test (in order)
8512               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8513               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8514               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8515               current.u|=1;
8516               current.uu|=1;
8517               delayslot_alloc(&current,i);
8518               current.isconst=0;
8519             }
8520             else
8521             {
8522               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8523               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8524               // Alloc the branch condition register
8525               alloc_reg(&current,i-1,rs1[i-1]);
8526               if(!(current.is32>>rs1[i-1]&1))
8527               {
8528                 alloc_reg64(&current,i-1,rs1[i-1]);
8529               }
8530             }
8531             memcpy(&branch_regs[i-1],&current,sizeof(current));
8532             branch_regs[i-1].isconst=0;
8533             branch_regs[i-1].wasconst=0;
8534             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8535             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8536           }
8537           else
8538           // Alloc the delay slot in case the branch is taken
8539           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8540           {
8541             memcpy(&branch_regs[i-1],&current,sizeof(current));
8542             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8543             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8544             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8545             alloc_cc(&branch_regs[i-1],i);
8546             dirty_reg(&branch_regs[i-1],CCREG);
8547             delayslot_alloc(&branch_regs[i-1],i);
8548             branch_regs[i-1].isconst=0;
8549             alloc_reg(&current,i,CCREG); // Not taken path
8550             dirty_reg(&current,CCREG);
8551             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8552           }
8553           else
8554           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8555           {
8556             memcpy(&branch_regs[i-1],&current,sizeof(current));
8557             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8558             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8559             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8560             alloc_cc(&branch_regs[i-1],i);
8561             dirty_reg(&branch_regs[i-1],CCREG);
8562             delayslot_alloc(&branch_regs[i-1],i);
8563             branch_regs[i-1].isconst=0;
8564             alloc_reg(&current,i,CCREG); // Not taken path
8565             dirty_reg(&current,CCREG);
8566             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8567           }
8568           break;
8569         case SJUMP:
8570           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8571           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8572           {
8573             alloc_cc(&current,i-1);
8574             dirty_reg(&current,CCREG);
8575             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8576               // The delay slot overwrote the branch condition
8577               // Delay slot goes after the test (in order)
8578               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8579               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8580               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8581               current.u|=1;
8582               current.uu|=1;
8583               delayslot_alloc(&current,i);
8584               current.isconst=0;
8585             }
8586             else
8587             {
8588               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8589               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8590               // Alloc the branch condition register
8591               alloc_reg(&current,i-1,rs1[i-1]);
8592               if(!(current.is32>>rs1[i-1]&1))
8593               {
8594                 alloc_reg64(&current,i-1,rs1[i-1]);
8595               }
8596             }
8597             memcpy(&branch_regs[i-1],&current,sizeof(current));
8598             branch_regs[i-1].isconst=0;
8599             branch_regs[i-1].wasconst=0;
8600             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8601             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8602           }
8603           else
8604           // Alloc the delay slot in case the branch is taken
8605           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8606           {
8607             memcpy(&branch_regs[i-1],&current,sizeof(current));
8608             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8609             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8610             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8611             alloc_cc(&branch_regs[i-1],i);
8612             dirty_reg(&branch_regs[i-1],CCREG);
8613             delayslot_alloc(&branch_regs[i-1],i);
8614             branch_regs[i-1].isconst=0;
8615             alloc_reg(&current,i,CCREG); // Not taken path
8616             dirty_reg(&current,CCREG);
8617             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8618           }
8619           // FIXME: BLTZAL/BGEZAL
8620           if(opcode2[i-1]&0x10) { // BxxZAL
8621             alloc_reg(&branch_regs[i-1],i-1,31);
8622             dirty_reg(&branch_regs[i-1],31);
8623             branch_regs[i-1].is32|=1LL<<31;
8624           }
8625           break;
8626         case FJUMP:
8627           if(likely[i-1]==0) // BC1F/BC1T
8628           {
8629             alloc_cc(&current,i-1);
8630             dirty_reg(&current,CCREG);
8631             if(itype[i]==FCOMP) {
8632               // The delay slot overwrote the branch condition
8633               // Delay slot goes after the test (in order)
8634               delayslot_alloc(&current,i);
8635               current.isconst=0;
8636             }
8637             else
8638             {
8639               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8640               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8641               // Alloc the branch condition register
8642               alloc_reg(&current,i-1,FSREG);
8643             }
8644             memcpy(&branch_regs[i-1],&current,sizeof(current));
8645             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8646           }
8647           else // BC1FL/BC1TL
8648           {
8649             // Alloc the delay slot in case the branch is taken
8650             memcpy(&branch_regs[i-1],&current,sizeof(current));
8651             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8652             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8653             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8654             alloc_cc(&branch_regs[i-1],i);
8655             dirty_reg(&branch_regs[i-1],CCREG);
8656             delayslot_alloc(&branch_regs[i-1],i);
8657             branch_regs[i-1].isconst=0;
8658             alloc_reg(&current,i,CCREG); // Not taken path
8659             dirty_reg(&current,CCREG);
8660             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8661           }
8662           break;
8663       }
8664
8665       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
8666       {
8667         if(rt1[i-1]==31) // JAL/JALR
8668         {
8669           // Subroutine call will return here, don't alloc any registers
8670           current.is32=1;
8671           current.dirty=0;
8672           clear_all_regs(current.regmap);
8673           alloc_reg(&current,i,CCREG);
8674           dirty_reg(&current,CCREG);
8675         }
8676         else if(i+1<slen)
8677         {
8678           // Internal branch will jump here, match registers to caller
8679           current.is32=0x3FFFFFFFFLL;
8680           current.dirty=0;
8681           clear_all_regs(current.regmap);
8682           alloc_reg(&current,i,CCREG);
8683           dirty_reg(&current,CCREG);
8684           for(j=i-1;j>=0;j--)
8685           {
8686             if(ba[j]==start+i*4+4) {
8687               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
8688               current.is32=branch_regs[j].is32;
8689               current.dirty=branch_regs[j].dirty;
8690               break;
8691             }
8692           }
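               // Intersect with every other internal branch targeting this
               // address: host registers mapped differently are dropped, and
               // is32/dirty are conservatively ANDed so the entry state is
               // valid for all predecessors.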
8693           while(j>=0) {
8694             if(ba[j]==start+i*4+4) {
8695               for(hr=0;hr<HOST_REGS;hr++) {
8696                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
8697                   current.regmap[hr]=-1;
8698                 }
8699                 current.is32&=branch_regs[j].is32;
8700                 current.dirty&=branch_regs[j].dirty;
8701               }
8702             }
8703             j--;
8704           }
8705         }
8706       }
8707     }
8708
8709     // Count cycles in between branches
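         // ccadj[i] is the cycle count accumulated since the last branch (or
         // syscall), so the emitted code can adjust the cycle counter in one
         // go at those points instead of after every instruction.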
8710     ccadj[i]=cc;
8711     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
8712     {
8713       cc=0;
8714     }
8715 #if !defined(DRC_DBG)
8716     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
8717     {
8718       // GTE runs in parallel until accessed; divide by 2 for a rough guess
8719       cc+=gte_cycletab[source[i]&0x3f]/2;
8720     }
8721     else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // load,store causes weird timing issues
8722     {
8723       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
8724     }
8725     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
8726     {
8727       cc+=4;
8728     }
8729     else if(itype[i]==C2LS)
8730     {
8731       cc+=4;
8732     }
8733 #endif
8734     else
8735     {
8736       cc++;
8737     }
8738
8739     flush_dirty_uppers(&current);
8740     if(!is_ds[i]) {
8741       regs[i].is32=current.is32;
8742       regs[i].dirty=current.dirty;
8743       regs[i].isconst=current.isconst;
8744       memcpy(constmap[i],current_constmap,sizeof(current_constmap));
8745     }
8746     for(hr=0;hr<HOST_REGS;hr++) {
8747       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
8748         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
8749           regs[i].wasconst&=~(1<<hr);
8750         }
8751       }
8752     }
8753     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
8754     regs[i].waswritten=current.waswritten;
8755   }
8756
8757   /* Pass 4 - Cull unused host registers */
8758
8759   uint64_t nr=0;
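       // nr is a bitmask over host registers: bit hr set means the value held
       // in host register hr is still needed at this point.  The block is
       // walked backwards so needs propagate from later instructions to
       // earlier ones.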
8760
8761   for (i=slen-1;i>=0;i--)
8762   {
8763     int hr;
8764     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8765     {
8766       if(ba[i]<start || ba[i]>=(start+slen*4))
8767       {
8768         // Branch out of this block, don't need anything
8769         nr=0;
8770       }
8771       else
8772       {
8773         // Internal branch
8774         // Need whatever matches the target
8775         nr=0;
8776         int t=(ba[i]-start)>>2;
8777         for(hr=0;hr<HOST_REGS;hr++)
8778         {
8779           if(regs[i].regmap_entry[hr]>=0) {
8780             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
8781           }
8782         }
8783       }
8784       // Conditional branch may need registers for following instructions
8785       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8786       {
8787         if(i<slen-2) {
8788           nr|=needed_reg[i+2];
8789           for(hr=0;hr<HOST_REGS;hr++)
8790           {
8791             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
8792             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
8793           }
8794         }
8795       }
8796       // Don't need stuff which is overwritten
8797       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8798       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8799       // Merge in delay slot
8800       for(hr=0;hr<HOST_REGS;hr++)
8801       {
8802         if(!likely[i]) {
8803           // These are overwritten unless the branch is "likely"
8804           // and the delay slot is nullified if not taken
8805           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8806           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8807         }
8808         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8809         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8810         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8811         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8812         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8813         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8814         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8815         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8816         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
8817           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8818           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8819         }
8820         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
8821           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8822           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8823         }
8824         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
8825           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8826           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8827         }
8828       }
8829     }
8830     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
8831     {
8832       // SYSCALL instruction (software interrupt)
8833       nr=0;
8834     }
8835     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
8836     {
8837       // ERET instruction (return from interrupt)
8838       nr=0;
8839     }
8840     else // Non-branch
8841     {
8842       if(i<slen-1) {
8843         for(hr=0;hr<HOST_REGS;hr++) {
8844           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
8845           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
8846           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8847           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8848         }
8849       }
8850     }
8851     for(hr=0;hr<HOST_REGS;hr++)
8852     {
8853       // Overwritten registers are not needed
8854       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8855       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8856       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8857       // Source registers are needed
8858       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8859       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8860       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
8861       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
8862       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8863       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8864       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8865       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8866       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
8867         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8868         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8869       }
8870       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
8871         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8872         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8873       }
8874       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
8875         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8876         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8877       }
8878       // Don't store a register immediately after writing it,
8879       // as that may prevent dual-issue.
8880       // But do store it if this is a branch target; otherwise we
8881       // might have to load the register before the branch.
8882       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
8883         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
8884            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
8885           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8886           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8887         }
8888         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
8889            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
8890           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8891           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8892         }
8893       }
8894     }
8895     // Cycle count is needed at branches.  Assume it is needed at the target too.
8896     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
8897       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
8898       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
8899     }
8900     // Save it
8901     needed_reg[i]=nr;
8902
8903     // Deallocate unneeded registers
8904     for(hr=0;hr<HOST_REGS;hr++)
8905     {
8906       if(!((nr>>hr)&1)) {
8907         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
8908         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
8909            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8910            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
8911         {
8912           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8913           {
8914             if(likely[i]) {
8915               regs[i].regmap[hr]=-1;
8916               regs[i].isconst&=~(1<<hr);
8917               if(i<slen-2) {
8918                 regmap_pre[i+2][hr]=-1;
8919                 regs[i+2].wasconst&=~(1<<hr);
8920               }
8921             }
8922           }
8923         }
8924         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8925         {
8926           int d1=0,d2=0,map=0,temp=0;
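               // d1/d2: upper-half dependencies of the delay-slot result;
               // map/temp: special registers the delay slot may rely on
               // (INVCP for the store invalidation check, FTEMP as scratch
               // for unaligned/coprocessor loads and stores) - these are
               // excluded from deallocation below.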
8927           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
8928           {
8929             d1=dep1[i+1];
8930             d2=dep2[i+1];
8931           }
8932           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
8933              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
8934             map=INVCP;
8935           }
8936           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
8937              itype[i+1]==C1LS || itype[i+1]==C2LS)
8938             temp=FTEMP;
8939           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
8940              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8941              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
8942              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
8943              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
8944              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
8945              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
8946              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
8947              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
8948              regs[i].regmap[hr]!=map )
8949           {
8950             regs[i].regmap[hr]=-1;
8951             regs[i].isconst&=~(1<<hr);
8952             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
8953                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
8954                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
8955                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
8956                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
8957                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
8958                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
8959                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
8960                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
8961                branch_regs[i].regmap[hr]!=map)
8962             {
8963               branch_regs[i].regmap[hr]=-1;
8964               branch_regs[i].regmap_entry[hr]=-1;
8965               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8966               {
8967                 if(!likely[i]&&i<slen-2) {
8968                   regmap_pre[i+2][hr]=-1;
8969                   regs[i+2].wasconst&=~(1<<hr);
8970                 }
8971               }
8972             }
8973           }
8974         }
8975         else
8976         {
8977           // Non-branch
8978           if(i>0)
8979           {
8980             int d1=0,d2=0,map=-1,temp=-1;
8981             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
8982             {
8983               d1=dep1[i];
8984               d2=dep2[i];
8985             }
8986             if(itype[i]==STORE || itype[i]==STORELR ||
8987                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
8988               map=INVCP;
8989             }
8990             if(itype[i]==LOADLR || itype[i]==STORELR ||
8991                itype[i]==C1LS || itype[i]==C2LS)
8992               temp=FTEMP;
8993             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8994                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
8995                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
8996                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
8997                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
8998                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
8999             {
9000               if(i<slen-1&&!is_ds[i]) {
9001                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9002                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9003                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9004                 {
9005                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9006                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9007                 }
9008                 regmap_pre[i+1][hr]=-1;
9009                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9010                 regs[i+1].wasconst&=~(1<<hr);
9011               }
9012               regs[i].regmap[hr]=-1;
9013               regs[i].isconst&=~(1<<hr);
9014             }
9015           }
9016         }
9017       }
9018     }
9019   }
9020
9021   /* Pass 5 - Pre-allocate registers */
9022
9023   // If a register is allocated during a loop, try to allocate it for the
9024   // entire loop, if possible.  This avoids loading/storing registers
9025   // inside of the loop.
9026
9027   signed char f_regmap[HOST_REGS];
9028   clear_all_regs(f_regmap);
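       // f_regmap[hr] is the MIPS register we would like host register hr to
       // keep across the whole loop body; the entry is invalidated whenever
       // hr gets reused for something else along the way.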
9029   for(i=0;i<slen-1;i++)
9030   {
9031     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9032     {
9033       if(ba[i]>=start && ba[i]<(start+i*4))
9034       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9035       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9036       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9037       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9038       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
9039       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
9040       {
9041         int t=(ba[i]-start)>>2;
9042         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9043         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
9044         for(hr=0;hr<HOST_REGS;hr++)
9045         {
9046           if(regs[i].regmap[hr]>64) {
9047             if(!((regs[i].dirty>>hr)&1))
9048               f_regmap[hr]=regs[i].regmap[hr];
9049             else f_regmap[hr]=-1;
9050           }
9051           else if(regs[i].regmap[hr]>=0) {
9052             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9053               // dealloc old register
9054               int n;
9055               for(n=0;n<HOST_REGS;n++)
9056               {
9057                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9058               }
9059               // and alloc new one
9060               f_regmap[hr]=regs[i].regmap[hr];
9061             }
9062           }
9063           if(branch_regs[i].regmap[hr]>64) {
9064             if(!((branch_regs[i].dirty>>hr)&1))
9065               f_regmap[hr]=branch_regs[i].regmap[hr];
9066             else f_regmap[hr]=-1;
9067           }
9068           else if(branch_regs[i].regmap[hr]>=0) {
9069             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
9070               // dealloc old register
9071               int n;
9072               for(n=0;n<HOST_REGS;n++)
9073               {
9074                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
9075               }
9076               // and alloc new one
9077               f_regmap[hr]=branch_regs[i].regmap[hr];
9078             }
9079           }
9080           if(ooo[i]) {
9081             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
9082               f_regmap[hr]=branch_regs[i].regmap[hr];
9083           }else{
9084             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
9085               f_regmap[hr]=branch_regs[i].regmap[hr];
9086           }
9087           // Avoid dirty->clean transition
9088           #ifdef DESTRUCTIVE_WRITEBACK
9089           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9090           #endif
9091           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
9092           // case above, but it's always a good idea.  We can't hoist the
9093           // load if the register was already allocated, so there's no point
9094           // wasting time analyzing most of these cases.  It only "succeeds"
9095           // when the mapping was different and the load can be replaced with
9096           // a mov, which is of negligible benefit.  So such cases are
9097           // skipped below.
9098           if(f_regmap[hr]>0) {
9099             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
9100               int r=f_regmap[hr];
9101               for(j=t;j<=i;j++)
9102               {
9103                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9104                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9105                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9106                 if(r>63) {
9107                   // NB This can exclude the case where the upper-half
9108                   // register is lower numbered than the lower-half
9109                   // register.  Not sure if it's worth fixing...
9110                   if(get_reg(regs[j].regmap,r&63)<0) break;
9111                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
9112                   if(regs[j].is32&(1LL<<(r&63))) break;
9113                 }
9114                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9115                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9116                   int k;
9117                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9118                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9119                     if(r>63) {
9120                       if(get_reg(regs[i].regmap,r&63)<0) break;
9121                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9122                     }
9123                     k=i;
9124                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9125                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9126                         //printf("no free regs for store %x\n",start+(k-1)*4);
9127                         break;
9128                       }
9129                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9130                         //printf("no-match due to different register\n");
9131                         break;
9132                       }
9133                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9134                         //printf("no-match due to branch\n");
9135                         break;
9136                       }
9137                       // call/ret fast path assumes no registers allocated
9138                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
9139                         break;
9140                       }
9141                       if(r>63) {
9142                         // NB This can exclude the case where the upper-half
9143                         // register is lower numbered than the lower-half
9144                         // register.  Not sure if it's worth fixing...
9145                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9146                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9147                       }
9148                       k--;
9149                     }
9150                     if(i<slen-1) {
9151                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9152                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9153                         //printf("bad match after branch\n");
9154                         break;
9155                       }
9156                     }
9157                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9158                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
9159                       while(k<i) {
9160                         regs[k].regmap_entry[hr]=f_regmap[hr];
9161                         regs[k].regmap[hr]=f_regmap[hr];
9162                         regmap_pre[k+1][hr]=f_regmap[hr];
9163                         regs[k].wasdirty&=~(1<<hr);
9164                         regs[k].dirty&=~(1<<hr);
9165                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9166                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9167                         regs[k].wasconst&=~(1<<hr);
9168                         regs[k].isconst&=~(1<<hr);
9169                         k++;
9170                       }
9171                     }
9172                     else {
9173                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9174                       break;
9175                     }
9176                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9177                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9178                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9179                       regs[i].regmap_entry[hr]=f_regmap[hr];
9180                       regs[i].regmap[hr]=f_regmap[hr];
9181                       regs[i].wasdirty&=~(1<<hr);
9182                       regs[i].dirty&=~(1<<hr);
9183                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9184                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9185                       regs[i].wasconst&=~(1<<hr);
9186                       regs[i].isconst&=~(1<<hr);
9187                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9188                       branch_regs[i].wasdirty&=~(1<<hr);
9189                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9190                       branch_regs[i].regmap[hr]=f_regmap[hr];
9191                       branch_regs[i].dirty&=~(1<<hr);
9192                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9193                       branch_regs[i].wasconst&=~(1<<hr);
9194                       branch_regs[i].isconst&=~(1<<hr);
9195                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9196                         regmap_pre[i+2][hr]=f_regmap[hr];
9197                         regs[i+2].wasdirty&=~(1<<hr);
9198                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9199                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9200                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9201                       }
9202                     }
9203                   }
9204                   for(k=t;k<j;k++) {
9205                     // Alloc register clean at beginning of loop,
9206                     // but may dirty it in pass 6
9207                     regs[k].regmap_entry[hr]=f_regmap[hr];
9208                     regs[k].regmap[hr]=f_regmap[hr];
9209                     regs[k].dirty&=~(1<<hr);
9210                     regs[k].wasconst&=~(1<<hr);
9211                     regs[k].isconst&=~(1<<hr);
9212                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
9213                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
9214                       branch_regs[k].regmap[hr]=f_regmap[hr];
9215                       branch_regs[k].dirty&=~(1<<hr);
9216                       branch_regs[k].wasconst&=~(1<<hr);
9217                       branch_regs[k].isconst&=~(1<<hr);
9218                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
9219                         regmap_pre[k+2][hr]=f_regmap[hr];
9220                         regs[k+2].wasdirty&=~(1<<hr);
9221                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
9222                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
9223                       }
9224                     }
9225                     else
9226                     {
9227                       regmap_pre[k+1][hr]=f_regmap[hr];
9228                       regs[k+1].wasdirty&=~(1<<hr);
9229                     }
9230                   }
9231                   if(regs[j].regmap[hr]==f_regmap[hr])
9232                     regs[j].regmap_entry[hr]=f_regmap[hr];
9233                   break;
9234                 }
9235                 if(j==i) break;
9236                 if(regs[j].regmap[hr]>=0)
9237                   break;
9238                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
9239                   //printf("no-match due to different register\n");
9240                   break;
9241                 }
9242                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
9243                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
9244                   break;
9245                 }
9246                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9247                 {
9248                   // Stop on unconditional branch
9249                   break;
9250                 }
9251                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
9252                 {
9253                   if(ooo[j]) {
9254                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
9255                       break;
9256                   }else{
9257                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
9258                       break;
9259                   }
9260                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
9261                     //printf("no-match due to different register (branch)\n");
9262                     break;
9263                   }
9264                 }
9265                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9266                   //printf("No free regs for store %x\n",start+j*4);
9267                   break;
9268                 }
9269                 if(f_regmap[hr]>=64) {
9270                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
9271                     break;
9272                   }
9273                   else
9274                   {
9275                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
9276                       break;
9277                     }
9278                   }
9279                 }
9280               }
9281             }
9282           }
9283         }
9284       }
9285     }else{
9286       // Non branch or undetermined branch target
9287       for(hr=0;hr<HOST_REGS;hr++)
9288       {
9289         if(hr!=EXCLUDE_REG) {
9290           if(regs[i].regmap[hr]>64) {
9291             if(!((regs[i].dirty>>hr)&1))
9292               f_regmap[hr]=regs[i].regmap[hr];
9293           }
9294           else if(regs[i].regmap[hr]>=0) {
9295             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9296               // dealloc old register
9297               int n;
9298               for(n=0;n<HOST_REGS;n++)
9299               {
9300                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9301               }
9302               // and alloc new one
9303               f_regmap[hr]=regs[i].regmap[hr];
9304             }
9305           }
9306         }
9307       }
9308       // Try to restore cycle count at branch targets
9309       if(bt[i]) {
9310         for(j=i;j<slen-1;j++) {
9311           if(regs[j].regmap[HOST_CCREG]!=-1) break;
9312           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9313             //printf("no free regs for store %x\n",start+j*4);
9314             break;
9315           }
9316         }
9317         if(regs[j].regmap[HOST_CCREG]==CCREG) {
9318           int k=i;
9319           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
9320           while(k<j) {
9321             regs[k].regmap_entry[HOST_CCREG]=CCREG;
9322             regs[k].regmap[HOST_CCREG]=CCREG;
9323             regmap_pre[k+1][HOST_CCREG]=CCREG;
9324             regs[k+1].wasdirty|=1<<HOST_CCREG;
9325             regs[k].dirty|=1<<HOST_CCREG;
9326             regs[k].wasconst&=~(1<<HOST_CCREG);
9327             regs[k].isconst&=~(1<<HOST_CCREG);
9328             k++;
9329           }
9330           regs[j].regmap_entry[HOST_CCREG]=CCREG;
9331         }
9332         // Work backwards from the branch target
9333         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9334         {
9335           //printf("Extend backwards\n");
9336           int k;
9337           k=i;
9338           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9339             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9340               //printf("no free regs for store %x\n",start+(k-1)*4);
9341               break;
9342             }
9343             k--;
9344           }
9345           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9346             //printf("Extend CC, %x ->\n",start+k*4);
9347             while(k<=i) {
9348               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9349               regs[k].regmap[HOST_CCREG]=CCREG;
9350               regmap_pre[k+1][HOST_CCREG]=CCREG;
9351               regs[k+1].wasdirty|=1<<HOST_CCREG;
9352               regs[k].dirty|=1<<HOST_CCREG;
9353               regs[k].wasconst&=~(1<<HOST_CCREG);
9354               regs[k].isconst&=~(1<<HOST_CCREG);
9355               k++;
9356             }
9357           }
9358           else {
9359             //printf("Fail Extend CC, %x ->\n",start+k*4);
9360           }
9361         }
9362       }
9363       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9364          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9365          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9366          itype[i]!=FCONV&&itype[i]!=FCOMP)
9367       {
9368         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9369       }
9370     }
9371   }
9372
9373   // Cache memory offset or tlb map pointer if a register is available
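  // This is a simple profitability scan: for each host register hr,
  // score[hr] counts the loads/stores between earliest_available[hr] and
  // end[hr] that could use the cached offset, and loop_start[hr] pulls the
  // range back to a loop head when a backward branch is seen.  If the best
  // candidate is used more than once, ROREG is allocated over the whole
  // range so the offset is loaded only once.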
9374   #ifndef HOST_IMM_ADDR32
9375   #ifndef RAM_OFFSET
9376   if(0)
9377   #endif
9378   {
9379     int earliest_available[HOST_REGS];
9380     int loop_start[HOST_REGS];
9381     int score[HOST_REGS];
9382     int end[HOST_REGS];
9383     int reg=ROREG;
9384
9385     // Init
9386     for(hr=0;hr<HOST_REGS;hr++) {
9387       score[hr]=0;earliest_available[hr]=0;
9388       loop_start[hr]=MAXBLOCK;
9389     }
9390     for(i=0;i<slen-1;i++)
9391     {
9392       // Can't do anything if no registers are available
9393       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
9394         for(hr=0;hr<HOST_REGS;hr++) {
9395           score[hr]=0;earliest_available[hr]=i+1;
9396           loop_start[hr]=MAXBLOCK;
9397         }
9398       }
9399       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9400         if(!ooo[i]) {
9401           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
9402             for(hr=0;hr<HOST_REGS;hr++) {
9403               score[hr]=0;earliest_available[hr]=i+1;
9404               loop_start[hr]=MAXBLOCK;
9405             }
9406           }
9407         }else{
9408           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
9409             for(hr=0;hr<HOST_REGS;hr++) {
9410               score[hr]=0;earliest_available[hr]=i+1;
9411               loop_start[hr]=MAXBLOCK;
9412             }
9413           }
9414         }
9415       }
9416       // Mark unavailable registers
9417       for(hr=0;hr<HOST_REGS;hr++) {
9418         if(regs[i].regmap[hr]>=0) {
9419           score[hr]=0;earliest_available[hr]=i+1;
9420           loop_start[hr]=MAXBLOCK;
9421         }
9422         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9423           if(branch_regs[i].regmap[hr]>=0) {
9424             score[hr]=0;earliest_available[hr]=i+2;
9425             loop_start[hr]=MAXBLOCK;
9426           }
9427         }
9428       }
9429       // No register allocations after unconditional jumps
9430       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
9431       {
9432         for(hr=0;hr<HOST_REGS;hr++) {
9433           score[hr]=0;earliest_available[hr]=i+2;
9434           loop_start[hr]=MAXBLOCK;
9435         }
9436         i++; // Skip delay slot too
9437         //printf("skip delay slot: %x\n",start+i*4);
9438       }
9439       else
9440       // Possible match
9441       if(itype[i]==LOAD||itype[i]==LOADLR||
9442          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
9443         for(hr=0;hr<HOST_REGS;hr++) {
9444           if(hr!=EXCLUDE_REG) {
9445             end[hr]=i-1;
9446             for(j=i;j<slen-1;j++) {
9447               if(regs[j].regmap[hr]>=0) break;
9448               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9449                 if(branch_regs[j].regmap[hr]>=0) break;
9450                 if(ooo[j]) {
9451                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
9452                 }else{
9453                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
9454                 }
9455               }
9456               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
9457               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9458                 int t=(ba[j]-start)>>2;
9459                 if(t<j&&t>=earliest_available[hr]) {
9460                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
9461                     // Score a point for hoisting loop invariant
9462                     if(t<loop_start[hr]) loop_start[hr]=t;
9463                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
9464                     score[hr]++;
9465                     end[hr]=j;
9466                   }
9467                 }
9468                 else if(t<j) {
9469                   if(regs[t].regmap[hr]==reg) {
9470                     // Score a point if the branch target matches this register
9471                     score[hr]++;
9472                     end[hr]=j;
9473                   }
9474                 }
9475                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
9476                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
9477                   score[hr]++;
9478                   end[hr]=j;
9479                 }
9480               }
9481               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9482               {
9483                 // Stop on unconditional branch
9484                 break;
9485               }
9486               else
9487               if(itype[j]==LOAD||itype[j]==LOADLR||
9488                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
9489                 score[hr]++;
9490                 end[hr]=j;
9491               }
9492             }
9493           }
9494         }
9495         // Find highest score and allocate that register
9496         int maxscore=0;
9497         for(hr=0;hr<HOST_REGS;hr++) {
9498           if(hr!=EXCLUDE_REG) {
9499             if(score[hr]>score[maxscore]) {
9500               maxscore=hr;
9501               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
9502             }
9503           }
9504         }
9505         if(score[maxscore]>1)
9506         {
9507           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
9508           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
9509             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
9510             assert(regs[j].regmap[maxscore]<0);
9511             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
9512             regs[j].regmap[maxscore]=reg;
9513             regs[j].dirty&=~(1<<maxscore);
9514             regs[j].wasconst&=~(1<<maxscore);
9515             regs[j].isconst&=~(1<<maxscore);
9516             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9517               branch_regs[j].regmap[maxscore]=reg;
9518               branch_regs[j].wasdirty&=~(1<<maxscore);
9519               branch_regs[j].dirty&=~(1<<maxscore);
9520               branch_regs[j].wasconst&=~(1<<maxscore);
9521               branch_regs[j].isconst&=~(1<<maxscore);
9522               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
9523                 regmap_pre[j+2][maxscore]=reg;
9524                 regs[j+2].wasdirty&=~(1<<maxscore);
9525               }
9526               // loop optimization (loop_preload)
9527               int t=(ba[j]-start)>>2;
9528               if(t==loop_start[maxscore]) {
9529                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
9530                   regs[t].regmap_entry[maxscore]=reg;
9531               }
9532             }
9533             else
9534             {
9535               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
9536                 regmap_pre[j+1][maxscore]=reg;
9537                 regs[j+1].wasdirty&=~(1<<maxscore);
9538               }
9539             }
9540           }
9541           i=j-1;
9542           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
9543           for(hr=0;hr<HOST_REGS;hr++) {
9544             score[hr]=0;earliest_available[hr]=i+1;
9545             loop_start[hr]=MAXBLOCK;
9546           }
9547         }
9548       }
9549     }
9550   }
9551   #endif
9552
9553   // This allocates registers (if possible) one instruction prior
9554   // to use, which can avoid a load-use penalty on certain CPUs.
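  // For example, on a typical in-order ARM core the result of an LDR cannot
  // feed the very next instruction without a stall, so mapping (and loading)
  // the source register one instruction early hides that latency.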
9555   for(i=0;i<slen-1;i++)
9556   {
9557     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9558     {
9559       if(!bt[i+1])
9560       {
9561         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
9562            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
9563         {
9564           if(rs1[i+1]) {
9565             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9566             {
9567               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9568               {
9569                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9570                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9571                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9572                 regs[i].isconst&=~(1<<hr);
9573                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9574                 constmap[i][hr]=constmap[i+1][hr];
9575                 regs[i+1].wasdirty&=~(1<<hr);
9576                 regs[i].dirty&=~(1<<hr);
9577               }
9578             }
9579           }
9580           if(rs2[i+1]) {
9581             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
9582             {
9583               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9584               {
9585                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9586                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9587                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9588                 regs[i].isconst&=~(1<<hr);
9589                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9590                 constmap[i][hr]=constmap[i+1][hr];
9591                 regs[i+1].wasdirty&=~(1<<hr);
9592                 regs[i].dirty&=~(1<<hr);
9593               }
9594             }
9595           }
9596           // Preload target address for load instruction (non-constant)
9597           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9598             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9599             {
9600               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9601               {
9602                 regs[i].regmap[hr]=rs1[i+1];
9603                 regmap_pre[i+1][hr]=rs1[i+1];
9604                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9605                 regs[i].isconst&=~(1<<hr);
9606                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9607                 constmap[i][hr]=constmap[i+1][hr];
9608                 regs[i+1].wasdirty&=~(1<<hr);
9609                 regs[i].dirty&=~(1<<hr);
9610               }
9611             }
9612           }
9613           // Load source into target register
9614           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9615             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9616             {
9617               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9618               {
9619                 regs[i].regmap[hr]=rs1[i+1];
9620                 regmap_pre[i+1][hr]=rs1[i+1];
9621                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9622                 regs[i].isconst&=~(1<<hr);
9623                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9624                 constmap[i][hr]=constmap[i+1][hr];
9625                 regs[i+1].wasdirty&=~(1<<hr);
9626                 regs[i].dirty&=~(1<<hr);
9627               }
9628             }
9629           }
9630           // Address for store instruction (non-constant)
9631           if(itype[i+1]==STORE||itype[i+1]==STORELR
9632              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
9633             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9634               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
9635               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9636               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
9637               assert(hr>=0);
9638               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9639               {
9640                 regs[i].regmap[hr]=rs1[i+1];
9641                 regmap_pre[i+1][hr]=rs1[i+1];
9642                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9643                 regs[i].isconst&=~(1<<hr);
9644                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9645                 constmap[i][hr]=constmap[i+1][hr];
9646                 regs[i+1].wasdirty&=~(1<<hr);
9647                 regs[i].dirty&=~(1<<hr);
9648               }
9649             }
9650           }
9651           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
9652             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9653               int nr;
9654               hr=get_reg(regs[i+1].regmap,FTEMP);
9655               assert(hr>=0);
9656               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9657               {
9658                 regs[i].regmap[hr]=rs1[i+1];
9659                 regmap_pre[i+1][hr]=rs1[i+1];
9660                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9661                 regs[i].isconst&=~(1<<hr);
9662                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9663                 constmap[i][hr]=constmap[i+1][hr];
9664                 regs[i+1].wasdirty&=~(1<<hr);
9665                 regs[i].dirty&=~(1<<hr);
9666               }
9667               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9668               {
9669                 // move it to another register
9670                 regs[i+1].regmap[hr]=-1;
9671                 regmap_pre[i+2][hr]=-1;
9672                 regs[i+1].regmap[nr]=FTEMP;
9673                 regmap_pre[i+2][nr]=FTEMP;
9674                 regs[i].regmap[nr]=rs1[i+1];
9675                 regmap_pre[i+1][nr]=rs1[i+1];
9676                 regs[i+1].regmap_entry[nr]=rs1[i+1];
9677                 regs[i].isconst&=~(1<<nr);
9678                 regs[i+1].isconst&=~(1<<nr);
9679                 regs[i].dirty&=~(1<<nr);
9680                 regs[i+1].wasdirty&=~(1<<nr);
9681                 regs[i+1].dirty&=~(1<<nr);
9682                 regs[i+2].wasdirty&=~(1<<nr);
9683               }
9684             }
9685           }
9686           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
9687             if(itype[i+1]==LOAD)
9688               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
9689             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
9690               hr=get_reg(regs[i+1].regmap,FTEMP);
9691             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
9692               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
9693               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9694             }
9695             if(hr>=0&&regs[i].regmap[hr]<0) {
9696               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
9697               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
9698                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
9699                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
9700                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
9701                 regs[i].isconst&=~(1<<hr);
9702                 regs[i+1].wasdirty&=~(1<<hr);
9703                 regs[i].dirty&=~(1<<hr);
9704               }
9705             }
9706           }
9707         }
9708       }
9709     }
9710   }
9711
9712   /* Pass 6 - Optimize clean/dirty state */
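  // "Dirty" means a host register holds a value that has not yet been
  // written back to the MIPS register file in memory; clean_registers()
  // propagates this state so the write-back code emitted below only stores
  // registers that actually changed.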
9713   clean_registers(0,slen-1,1);
9714
9715   /* Pass 7 - Identify 32-bit registers */
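  // For this 32-bit-only core the remaining work here is just marking the
  // instruction after each conditional branch as a branch target (bt[]);
  // the 64-bit width analysis the pass is named after is presumably not
  // needed.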
9716   for (i=slen-1;i>=0;i--)
9717   {
9718     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9719     {
9720       // Conditional branch
9721       if((source[i]>>16)!=0x1000&&i<slen-2) {
9722         // Mark this address as a branch target since it may be called
9723         // upon return from interrupt
9724         bt[i+2]=1;
9725       }
9726     }
9727   }
9728
9729   if(itype[slen-1]==SPAN) {
9730     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
9731   }
9732
9733 #ifdef DISASM
9734   /* Debug/disassembly */
9735   for(i=0;i<slen;i++)
9736   {
9737     printf("U:");
9738     int r;
9739     for(r=1;r<=CCREG;r++) {
9740       if((unneeded_reg[i]>>r)&1) {
9741         if(r==HIREG) printf(" HI");
9742         else if(r==LOREG) printf(" LO");
9743         else printf(" r%d",r);
9744       }
9745     }
9746     printf("\n");
9747     #if defined(__i386__) || defined(__x86_64__)
9748     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
9749     #endif
9750     #ifdef __arm__
9751     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
9752     #endif
9753     printf("needs: ");
9754     if(needed_reg[i]&1) printf("eax ");
9755     if((needed_reg[i]>>1)&1) printf("ecx ");
9756     if((needed_reg[i]>>2)&1) printf("edx ");
9757     if((needed_reg[i]>>3)&1) printf("ebx ");
9758     if((needed_reg[i]>>5)&1) printf("ebp ");
9759     if((needed_reg[i]>>6)&1) printf("esi ");
9760     if((needed_reg[i]>>7)&1) printf("edi ");
9761     printf("\n");
9762     #if defined(__i386__) || defined(__x86_64__)
9763     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
9764     printf("dirty: ");
9765     if(regs[i].wasdirty&1) printf("eax ");
9766     if((regs[i].wasdirty>>1)&1) printf("ecx ");
9767     if((regs[i].wasdirty>>2)&1) printf("edx ");
9768     if((regs[i].wasdirty>>3)&1) printf("ebx ");
9769     if((regs[i].wasdirty>>5)&1) printf("ebp ");
9770     if((regs[i].wasdirty>>6)&1) printf("esi ");
9771     if((regs[i].wasdirty>>7)&1) printf("edi ");
9772     #endif
9773     #ifdef __arm__
9774     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
9775     printf("dirty: ");
9776     if(regs[i].wasdirty&1) printf("r0 ");
9777     if((regs[i].wasdirty>>1)&1) printf("r1 ");
9778     if((regs[i].wasdirty>>2)&1) printf("r2 ");
9779     if((regs[i].wasdirty>>3)&1) printf("r3 ");
9780     if((regs[i].wasdirty>>4)&1) printf("r4 ");
9781     if((regs[i].wasdirty>>5)&1) printf("r5 ");
9782     if((regs[i].wasdirty>>6)&1) printf("r6 ");
9783     if((regs[i].wasdirty>>7)&1) printf("r7 ");
9784     if((regs[i].wasdirty>>8)&1) printf("r8 ");
9785     if((regs[i].wasdirty>>9)&1) printf("r9 ");
9786     if((regs[i].wasdirty>>10)&1) printf("r10 ");
9787     if((regs[i].wasdirty>>12)&1) printf("r12 ");
9788     #endif
9789     printf("\n");
9790     disassemble_inst(i);
9791     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
9792     #if defined(__i386__) || defined(__x86_64__)
9793     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
9794     if(regs[i].dirty&1) printf("eax ");
9795     if((regs[i].dirty>>1)&1) printf("ecx ");
9796     if((regs[i].dirty>>2)&1) printf("edx ");
9797     if((regs[i].dirty>>3)&1) printf("ebx ");
9798     if((regs[i].dirty>>5)&1) printf("ebp ");
9799     if((regs[i].dirty>>6)&1) printf("esi ");
9800     if((regs[i].dirty>>7)&1) printf("edi ");
9801     #endif
9802     #ifdef __arm__
9803     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
9804     if(regs[i].dirty&1) printf("r0 ");
9805     if((regs[i].dirty>>1)&1) printf("r1 ");
9806     if((regs[i].dirty>>2)&1) printf("r2 ");
9807     if((regs[i].dirty>>3)&1) printf("r3 ");
9808     if((regs[i].dirty>>4)&1) printf("r4 ");
9809     if((regs[i].dirty>>5)&1) printf("r5 ");
9810     if((regs[i].dirty>>6)&1) printf("r6 ");
9811     if((regs[i].dirty>>7)&1) printf("r7 ");
9812     if((regs[i].dirty>>8)&1) printf("r8 ");
9813     if((regs[i].dirty>>9)&1) printf("r9 ");
9814     if((regs[i].dirty>>10)&1) printf("r10 ");
9815     if((regs[i].dirty>>12)&1) printf("r12 ");
9816     #endif
9817     printf("\n");
9818     if(regs[i].isconst) {
9819       printf("constants: ");
9820       #if defined(__i386__) || defined(__x86_64__)
9821       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
9822       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
9823       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
9824       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
9825       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
9826       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
9827       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
9828       #endif
9829       #ifdef __arm__
9830       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
9831       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
9832       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
9833       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
9834       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
9835       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
9836       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
9837       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
9838       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
9839       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
9840       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
9841       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
9842       #endif
9843       printf("\n");
9844     }
9845     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9846       #if defined(__i386__) || defined(__x86_64__)
9847       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
9848       if(branch_regs[i].dirty&1) printf("eax ");
9849       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
9850       if((branch_regs[i].dirty>>2)&1) printf("edx ");
9851       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
9852       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
9853       if((branch_regs[i].dirty>>6)&1) printf("esi ");
9854       if((branch_regs[i].dirty>>7)&1) printf("edi ");
9855       #endif
9856       #ifdef __arm__
9857       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
9858       if(branch_regs[i].dirty&1) printf("r0 ");
9859       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
9860       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
9861       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
9862       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
9863       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
9864       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
9865       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
9866       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
9867       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
9868       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
9869       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
9870       #endif
9871     }
9872   }
9873 #endif // DISASM
9874
9875   /* Pass 8 - Assembly */
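  // Emit native code for each instruction: write back / reload registers as
  // decided by the earlier passes, record each entry point in instr_addr[]
  // for the linker pass, then dispatch to the per-itype assembler.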
9876   linkcount=0;stubcount=0;
9877   ds=0;is_delayslot=0;
9878   cop1_usable=0;
9879   uint64_t is32_pre=0;
9880   u_int dirty_pre=0;
9881   u_int beginning=(u_int)out;
9882   if((u_int)addr&1) {
9883     ds=1;
9884     pagespan_ds();
9885   }
9886   u_int instr_addr0_override=0;
9887
9888   if (start == 0x80030000) {
9889     // nasty hack for fastbios thing
9890     // override block entry to this code
9891     instr_addr0_override=(u_int)out;
9892     emit_movimm(start,0);
9893     // abuse io address var as a flag that we
9894     // have already returned here once
9895     emit_readword((int)&address,1);
9896     emit_writeword(0,(int)&pcaddr);
9897     emit_writeword(0,(int)&address);
9898     emit_cmp(0,1);
9899     emit_jne((int)new_dyna_leave);
9900   }
9901   for(i=0;i<slen;i++)
9902   {
9903     //if(ds) printf("ds: ");
9904     disassemble_inst(i);
9905     if(ds) {
9906       ds=0; // Skip delay slot
9907       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
9908       instr_addr[i]=0;
9909     } else {
9910       speculate_register_values(i);
9911       #ifndef DESTRUCTIVE_WRITEBACK
9912       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
9913       {
9914         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
9915               unneeded_reg[i],unneeded_reg_upper[i]);
9916       }
9917       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
9918         is32_pre=branch_regs[i].is32;
9919         dirty_pre=branch_regs[i].dirty;
9920       }else{
9921         is32_pre=regs[i].is32;
9922         dirty_pre=regs[i].dirty;
9923       }
9924       #endif
9925       // write back
9926       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
9927       {
9928         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
9929                       unneeded_reg[i],unneeded_reg_upper[i]);
9930         loop_preload(regmap_pre[i],regs[i].regmap_entry);
9931       }
9932       // branch target entry point
9933       instr_addr[i]=(u_int)out;
9934       assem_debug("<->\n");
9935       // load regs
9936       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
9937         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
9938       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
9939       address_generation(i,&regs[i],regs[i].regmap_entry);
9940       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
9941       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9942       {
9943         // Load the delay slot registers if necessary
9944         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
9945           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
9946         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
9947           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
9948         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
9949           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
9950       }
9951       else if(i+1<slen)
9952       {
9953         // Preload registers for following instruction
9954         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
9955           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
9956             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
9957         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
9958           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
9959             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
9960       }
9961       // TODO: if(is_ooo(i)) address_generation(i+1);
9962       if(itype[i]==CJUMP||itype[i]==FJUMP)
9963         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
9964       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
9965         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
9966       if(bt[i]) cop1_usable=0;
9967       // assemble
9968       switch(itype[i]) {
9969         case ALU:
9970           alu_assemble(i,&regs[i]);break;
9971         case IMM16:
9972           imm16_assemble(i,&regs[i]);break;
9973         case SHIFT:
9974           shift_assemble(i,&regs[i]);break;
9975         case SHIFTIMM:
9976           shiftimm_assemble(i,&regs[i]);break;
9977         case LOAD:
9978           load_assemble(i,&regs[i]);break;
9979         case LOADLR:
9980           loadlr_assemble(i,&regs[i]);break;
9981         case STORE:
9982           store_assemble(i,&regs[i]);break;
9983         case STORELR:
9984           storelr_assemble(i,&regs[i]);break;
9985         case COP0:
9986           cop0_assemble(i,&regs[i]);break;
9987         case COP1:
9988           cop1_assemble(i,&regs[i]);break;
9989         case C1LS:
9990           c1ls_assemble(i,&regs[i]);break;
9991         case COP2:
9992           cop2_assemble(i,&regs[i]);break;
9993         case C2LS:
9994           c2ls_assemble(i,&regs[i]);break;
9995         case C2OP:
9996           c2op_assemble(i,&regs[i]);break;
9997         case FCONV:
9998           fconv_assemble(i,&regs[i]);break;
9999         case FLOAT:
10000           float_assemble(i,&regs[i]);break;
10001         case FCOMP:
10002           fcomp_assemble(i,&regs[i]);break;
10003         case MULTDIV:
10004           multdiv_assemble(i,&regs[i]);break;
10005         case MOV:
10006           mov_assemble(i,&regs[i]);break;
10007         case SYSCALL:
10008           syscall_assemble(i,&regs[i]);break;
10009         case HLECALL:
10010           hlecall_assemble(i,&regs[i]);break;
10011         case INTCALL:
10012           intcall_assemble(i,&regs[i]);break;
10013         case UJUMP:
10014           ujump_assemble(i,&regs[i]);ds=1;break;
10015         case RJUMP:
10016           rjump_assemble(i,&regs[i]);ds=1;break;
10017         case CJUMP:
10018           cjump_assemble(i,&regs[i]);ds=1;break;
10019         case SJUMP:
10020           sjump_assemble(i,&regs[i]);ds=1;break;
10021         case FJUMP:
10022           fjump_assemble(i,&regs[i]);ds=1;break;
10023         case SPAN:
10024           pagespan_assemble(i,&regs[i]);break;
10025       }
10026       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10027         literal_pool(1024);
10028       else
10029         literal_pool_jumpover(256);
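      // (literal_pool() flushes pending PC-relative constants on ARM; after
      // an unconditional branch the pool can be dumped in place since
      // nothing falls through, otherwise a jump over it is emitted)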
10030     }
10031   }
10032   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10033   // If the block did not end with an unconditional branch,
10034   // add a jump to the next instruction.
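  // (the block was cut short rather than ended by a branch, so the dirty
  //  registers are flushed, the accumulated cycle count is charged, and the
  //  jump is left for the linker to patch to the code at start+i*4)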
10035   if(i>1) {
10036     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10037       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10038       assert(i==slen);
10039       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10040         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10041         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10042           emit_loadreg(CCREG,HOST_CCREG);
10043         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10044       }
10045       else if(!likely[i-2])
10046       {
10047         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10048         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10049       }
10050       else
10051       {
10052         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10053         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10054       }
10055       add_to_linker((int)out,start+i*4,0);
10056       emit_jmp(0);
10057     }
10058   }
10059   else
10060   {
10061     assert(i>0);
10062     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10063     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10064     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10065       emit_loadreg(CCREG,HOST_CCREG);
10066     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10067     add_to_linker((int)out,start+i*4,0);
10068     emit_jmp(0);
10069   }
10070
10071   // TODO: delay slot stubs?
10072   // Stubs
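  // Out-of-line slow paths requested by the assemblers above: full memory
  // access handlers, cycle-count checks, code-invalidation checks, FP stubs
  // and unaligned stores.  Each stubs[i] entry says which generator to run
  // now that the block's main code has been emitted.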
10073   for(i=0;i<stubcount;i++)
10074   {
10075     switch(stubs[i][0])
10076     {
10077       case LOADB_STUB:
10078       case LOADH_STUB:
10079       case LOADW_STUB:
10080       case LOADD_STUB:
10081       case LOADBU_STUB:
10082       case LOADHU_STUB:
10083         do_readstub(i);break;
10084       case STOREB_STUB:
10085       case STOREH_STUB:
10086       case STOREW_STUB:
10087       case STORED_STUB:
10088         do_writestub(i);break;
10089       case CC_STUB:
10090         do_ccstub(i);break;
10091       case INVCODE_STUB:
10092         do_invstub(i);break;
10093       case FP_STUB:
10094         do_cop1stub(i);break;
10095       case STORELR_STUB:
10096         do_unalignedwritestub(i);break;
10097     }
10098   }
10099
10100   if (instr_addr0_override)
10101     instr_addr[0] = instr_addr0_override;
10102
10103   /* Pass 9 - Linker */
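  // Patch the branches recorded in link_addr[]: external targets either
  // jump straight to already-compiled code (check_addr()) or to an
  // emit_extjump() trampoline so the target can be resolved later, while
  // internal branches are pointed at the instr_addr[] recorded in pass 8.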
10104   for(i=0;i<linkcount;i++)
10105   {
10106     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10107     literal_pool(64);
10108     if(!link_addr[i][2])
10109     {
10110       void *stub=out;
10111       void *addr=check_addr(link_addr[i][1]);
10112       emit_extjump(link_addr[i][0],link_addr[i][1]);
10113       if(addr) {
10114         set_jump_target(link_addr[i][0],(int)addr);
10115         add_link(link_addr[i][1],stub);
10116       }
10117       else set_jump_target(link_addr[i][0],(int)stub);
10118     }
10119     else
10120     {
10121       // Internal branch
10122       int target=(link_addr[i][1]-start)>>2;
10123       assert(target>=0&&target<slen);
10124       assert(instr_addr[target]);
10125       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10126       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
10127       //#else
10128       set_jump_target(link_addr[i][0],instr_addr[target]);
10129       //#endif
10130     }
10131   }
10132   // External Branch Targets (jump_in)
10133   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
10134   for(i=0;i<slen;i++)
10135   {
10136     if(bt[i]||i==0)
10137     {
10138       if(instr_addr[i]) // TODO - delay slots (=null)
10139       {
10140         u_int vaddr=start+i*4;
10141         u_int page=get_page(vaddr);
10142         u_int vpage=get_vpage(vaddr);
10143         literal_pool(256);
10144         {
10145           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10146           assem_debug("jump_in: %x\n",start+i*4);
10147           ll_add(jump_dirty+vpage,vaddr,(void *)out);
10148           int entry_point=do_dirty_stub(i);
10149           ll_add_flags(jump_in+page,vaddr,state_rflags,(void *)entry_point);
10150           // If there was an existing entry in the hash table,
10151           // replace it with the new address.
10152           // Don't add new entries.  We'll insert the
10153           // ones that actually get used in check_addr().
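          // (each bin holds two {vaddr, entry point} pairs:
          //  ht_bin[0]/[1] and ht_bin[2]/[3])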
10154           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
10155           if(ht_bin[0]==vaddr) {
10156             ht_bin[1]=entry_point;
10157           }
10158           if(ht_bin[2]==vaddr) {
10159             ht_bin[3]=entry_point;
10160           }
10161         }
10162       }
10163     }
10164   }
10165   // Write out the literal pool if necessary
10166   literal_pool(0);
10167   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10168   // Align code
10169   if(((u_int)out)&7) emit_addnop(13);
10170   #endif
10171   assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
10172   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
10173   memcpy(copy,source,slen*4);
10174   copy+=slen*4;
10175
10176   #ifdef __arm__
10177   __clear_cache((void *)beginning,out);
10178   #endif
10179
10180   // If we're within 256K of the end of the buffer,
10181   // start over from the beginning. (Is 256K enough?)
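  // (256K is MAX_OUTPUT_BLOCK_SIZE, the per-block emission limit asserted
  //  above, so wrapping here leaves room for the next block)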
10182   if((u_int)out>(u_int)BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
10183
10184   // Trap writes to any of the pages we compiled
10185   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
10186     invalid_code[i]=0;
10187   }
10188   inv_code_start=inv_code_end=~0;
10189
10190   // for PCSX we need to mark all mirrors too
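  // (the same 2MB of RAM is visible at 0x00000000, 0x80000000 and 0xa0000000
  //  - KUSEG/KSEG0/KSEG1 - so all three views must be marked valid again;
  //  &0x1ff keeps the page index within the RAM mirror)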
10191   if(get_page(start)<(RAM_SIZE>>12))
10192     for(i=start>>12;i<=(start+slen*4)>>12;i++)
10193       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
10194       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
10195       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
10196
10197   /* Pass 10 - Free memory by expiring oldest blocks */
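  // The translation buffer is recycled like a ring: expirep sweeps a 16-bit
  // position kept ahead of the current output pointer, and the four
  // sub-phases of the switch below drop jump_in/jump_dirty entries, patched
  // jump_out pointers, hash table entries and jump_out lists that still
  // reference the region about to be overwritten.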
10198
10199   int end=((((int)out-(int)BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
10200   while(expirep!=end)
10201   {
10202     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
10203     int base=(int)BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
10204     inv_debug("EXP: Phase %d\n",expirep);
10205     switch((expirep>>11)&3)
10206     {
10207       case 0:
10208         // Clear jump_in and jump_dirty
10209         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
10210         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
10211         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
10212         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
10213         break;
10214       case 1:
10215         // Clear pointers
10216         ll_kill_pointers(jump_out[expirep&2047],base,shift);
10217         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
10218         break;
10219       case 2:
10220         // Clear hash table
10221         for(i=0;i<32;i++) {
10222           int *ht_bin=hash_table[((expirep&2047)<<5)+i];
10223           if((ht_bin[3]>>shift)==(base>>shift) ||
10224              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10225             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
10226             ht_bin[2]=ht_bin[3]=-1;
10227           }
10228           if((ht_bin[1]>>shift)==(base>>shift) ||
10229              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10230             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
10231             ht_bin[0]=ht_bin[2];
10232             ht_bin[1]=ht_bin[3];
10233             ht_bin[2]=ht_bin[3]=-1;
10234           }
10235         }
10236         break;
10237       case 3:
10238         // Clear jump_out
10239         #ifdef __arm__
10240         if((expirep&2047)==0)
10241           do_clear_cache();
10242         #endif
10243         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
10244         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
10245         break;
10246     }
10247     expirep=(expirep+1)&65535;
10248   }
10249   return 0;
10250 }
10251
10252 // vim:shiftwidth=2:expandtab