1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29
30 #include "new_dynarec_config.h"
31 #include "emu_if.h" //emulator interface
32
33 //#define DISASM
34 //#define assem_debug printf
35 //#define inv_debug printf
36 #define assem_debug(...)
37 #define inv_debug(...)
38
39 #ifdef __i386__
40 #include "assem_x86.h"
41 #endif
42 #ifdef __x86_64__
43 #include "assem_x64.h"
44 #endif
45 #ifdef __arm__
46 #include "assem_arm.h"
47 #endif
48
49 #define MAXBLOCK 4096
50 #define MAX_OUTPUT_BLOCK_SIZE 262144
51
52 struct regstat
53 {
54   signed char regmap_entry[HOST_REGS];
55   signed char regmap[HOST_REGS];
56   uint64_t was32;
57   uint64_t is32;
58   uint64_t wasdirty;
59   uint64_t dirty;
60   uint64_t u;
61   uint64_t uu;
62   u_int wasconst;
63   u_int isconst;
64   u_int loadedconst;             // host regs that have constants loaded
65   u_int waswritten;              // MIPS regs that were used as store base before
66 };
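/*
 * Illustrative summary of the fields above, inferred from how the code
 * below uses them (dirty_reg(), set_const(), wb_register(), ...):
 *  - regmap[hr] holds the MIPS register cached in host register hr (-1 = none);
 *    the upper half of a 64-bit value is tracked as the register number |64.
 *  - dirty/wasdirty are bitmasks over *host* registers (1<<hr) whose cached
 *    value differs from the in-memory register file and must be written back.
 *  - is32/was32 and u/uu are bitmasks over *MIPS* registers (1LL<<reg):
 *    is32 marks values known to be sign-extended 32-bit, u/uu mark registers
 *    (lower/upper halves) whose current value will not be needed again.
 *  - isconst/loadedconst are host-register masks; the constant value itself
 *    lives in current_constmap[hr].
 * For example, after dirty_reg(cur,5) every host register hr with
 * (cur->regmap[hr]&63)==5 has bit hr set in cur->dirty.
 */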
67
68 // note: asm depends on this layout
69 struct ll_entry
70 {
71   u_int vaddr;
72   u_int reg_sv_flags;
73   void *addr;
74   struct ll_entry *next;
75 };
76
77   // used by asm:
78   u_char *out;
79   u_int hash_table[65536][4]  __attribute__((aligned(16)));
80   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
81   struct ll_entry *jump_dirty[4096];
82
83   static struct ll_entry *jump_out[4096];
84   static u_int start;
85   static u_int *source;
86   static char insn[MAXBLOCK][10];
87   static u_char itype[MAXBLOCK];
88   static u_char opcode[MAXBLOCK];
89   static u_char opcode2[MAXBLOCK];
90   static u_char bt[MAXBLOCK];
91   static u_char rs1[MAXBLOCK];
92   static u_char rs2[MAXBLOCK];
93   static u_char rt1[MAXBLOCK];
94   static u_char rt2[MAXBLOCK];
95   static u_char us1[MAXBLOCK];
96   static u_char us2[MAXBLOCK];
97   static u_char dep1[MAXBLOCK];
98   static u_char dep2[MAXBLOCK];
99   static u_char lt1[MAXBLOCK];
100   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
101   static uint64_t gte_rt[MAXBLOCK];
102   static uint64_t gte_unneeded[MAXBLOCK];
103   static u_int smrv[32]; // speculated MIPS register values
104   static u_int smrv_strong; // mask of regs that are likely to have correct values
105   static u_int smrv_weak; // same, but somewhat less likely
106   static u_int smrv_strong_next; // same, but after current insn executes
107   static u_int smrv_weak_next;
108   static int imm[MAXBLOCK];
109   static u_int ba[MAXBLOCK];
110   static char likely[MAXBLOCK];
111   static char is_ds[MAXBLOCK];
112   static char ooo[MAXBLOCK];
113   static uint64_t unneeded_reg[MAXBLOCK];
114   static uint64_t unneeded_reg_upper[MAXBLOCK];
115   static uint64_t branch_unneeded_reg[MAXBLOCK];
116   static uint64_t branch_unneeded_reg_upper[MAXBLOCK];
117   static signed char regmap_pre[MAXBLOCK][HOST_REGS];
118   static uint64_t current_constmap[HOST_REGS];
119   static uint64_t constmap[MAXBLOCK][HOST_REGS];
120   static struct regstat regs[MAXBLOCK];
121   static struct regstat branch_regs[MAXBLOCK];
122   static signed char minimum_free_regs[MAXBLOCK];
123   static u_int needed_reg[MAXBLOCK];
124   static u_int wont_dirty[MAXBLOCK];
125   static u_int will_dirty[MAXBLOCK];
126   static int ccadj[MAXBLOCK];
127   static int slen;
128   static u_int instr_addr[MAXBLOCK];
129   static u_int link_addr[MAXBLOCK][3];
130   static int linkcount;
131   static u_int stubs[MAXBLOCK*3][8];
132   static int stubcount;
133   static u_int literals[1024][2];
134   static int literalcount;
135   static int is_delayslot;
136   static int cop1_usable;
137   static char shadow[1048576]  __attribute__((aligned(16)));
138   static void *copy;
139   static int expirep;
140   static u_int stop_after_jal;
141 #ifndef RAM_FIXED
142   static u_int ram_offset;
143 #else
144   static const u_int ram_offset=0;
145 #endif
146
147   int new_dynarec_hacks;
148   int new_dynarec_did_compile;
149   extern u_char restore_candidate[512];
150   extern int cycle_count;
151
152   /* registers that may be allocated */
153   /* 1-31 gpr */
154 #define HIREG 32 // hi
155 #define LOREG 33 // lo
156 #define FSREG 34 // FPU status (FCSR)
157 #define CSREG 35 // Coprocessor status
158 #define CCREG 36 // Cycle count
159 #define INVCP 37 // Pointer to invalid_code
160 //#define MMREG 38 // Pointer to memory_map
161 #define ROREG 39 // ram offset (if rdram!=0x80000000)
162 #define TEMPREG 40
163 #define FTEMP 40 // FPU temporary register
164 #define PTEMP 41 // Prefetch temporary register
165 //#define TLREG 42 // TLB mapping offset
166 #define RHASH 43 // Return address hash
167 #define RHTBL 44 // Return address hash table address
168 #define RTEMP 45 // JR/JALR address register
169 #define MAXREG 45
170 #define AGEN1 46 // Address generation temporary register
171 //#define AGEN2 47 // Address generation temporary register
172 //#define MGEN1 48 // Maptable address generation temporary register
173 //#define MGEN2 49 // Maptable address generation temporary register
174 #define BTREG 50 // Branch target temporary register
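/*
 * Illustrative note: registers 32 and up are internal pseudo-registers that
 * go through the same allocator as the MIPS GPRs, so for instance
 * get_reg(regs[i].regmap,CCREG) returns the host register currently holding
 * the cycle count, or -1 if it is not mapped.
 */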
175
176   /* instruction types */
177 #define NOP 0     // No operation
178 #define LOAD 1    // Load
179 #define STORE 2   // Store
180 #define LOADLR 3  // Unaligned load
181 #define STORELR 4 // Unaligned store
182 #define MOV 5     // Move
183 #define ALU 6     // Arithmetic/logic
184 #define MULTDIV 7 // Multiply/divide
185 #define SHIFT 8   // Shift by register
186 #define SHIFTIMM 9 // Shift by immediate
187 #define IMM16 10  // 16-bit immediate
188 #define RJUMP 11  // Unconditional jump to register
189 #define UJUMP 12  // Unconditional jump
190 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
191 #define SJUMP 14  // Conditional branch (regimm format)
192 #define COP0 15   // Coprocessor 0
193 #define COP1 16   // Coprocessor 1
194 #define C1LS 17   // Coprocessor 1 load/store
195 #define FJUMP 18  // Conditional branch (floating point)
196 #define FLOAT 19  // Floating point unit
197 #define FCONV 20  // Convert integer to float
198 #define FCOMP 21  // Floating point compare (sets FSREG)
199 #define SYSCALL 22 // SYSCALL
200 #define OTHER 23  // Other
201 #define SPAN 24   // Branch/delay slot spans 2 pages
202 #define NI 25     // Not implemented
203 #define HLECALL 26 // PCSX fake opcodes for HLE
204 #define COP2 27   // Coprocessor 2 move
205 #define C2LS 28   // Coprocessor 2 load/store
206 #define C2OP 29   // Coprocessor 2 operation
207 #define INTCALL 30 // Call interpreter to handle rare corner cases
208
209   /* stubs */
210 #define CC_STUB 1
211 #define FP_STUB 2
212 #define LOADB_STUB 3
213 #define LOADH_STUB 4
214 #define LOADW_STUB 5
215 #define LOADD_STUB 6
216 #define LOADBU_STUB 7
217 #define LOADHU_STUB 8
218 #define STOREB_STUB 9
219 #define STOREH_STUB 10
220 #define STOREW_STUB 11
221 #define STORED_STUB 12
222 #define STORELR_STUB 13
223 #define INVCODE_STUB 14
224
225   /* branch codes */
226 #define TAKEN 1
227 #define NOTTAKEN 2
228 #define NULLDS 3
229
230 // asm linkage
231 int new_recompile_block(int addr);
232 void *get_addr_ht(u_int vaddr);
233 void invalidate_block(u_int block);
234 void invalidate_addr(u_int addr);
235 void remove_hash(int vaddr);
236 void dyna_linker();
237 void dyna_linker_ds();
238 void verify_code();
239 void verify_code_vm();
240 void verify_code_ds();
241 void cc_interrupt();
242 void fp_exception();
243 void fp_exception_ds();
244 void jump_syscall_hle();
245 void jump_hlecall();
246 void jump_intcall();
247 void new_dyna_leave();
248
249 // Needed by assembler
250 static void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
251 static void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
252 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
253 static void load_all_regs(signed char i_regmap[]);
254 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
255 static void load_regs_entry(int t);
256 static void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
257
258 static int verify_dirty(u_int *ptr);
259 static int get_final_value(int hr, int i, int *value);
260 static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e);
261 static void add_to_linker(int addr,int target,int ext);
262
263 static int tracedebug=0;
264
265 static void mprotect_w_x(void *start, void *end, int is_x)
266 {
267 #ifdef NO_WRITE_EXEC
268   u_long mstart = (u_long)start & ~4095ul;
269   u_long mend = (u_long)end;
270   if (mprotect((void *)mstart, mend - mstart,
271                PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE)) != 0)
272     SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
273 #endif
274 }
275
276 static void start_tcache_write(void *start, void *end)
277 {
278   mprotect_w_x(start, end, 0);
279 }
280
281 static void end_tcache_write(void *start, void *end)
282 {
283 #ifdef __arm__
284   size_t len = (char *)end - (char *)start;
285   #if   defined(__BLACKBERRY_QNX__)
286   msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
287   #elif defined(__MACH__)
288   sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
289   #elif defined(VITA)
290   int block = sceKernelFindMemBlockByAddr(start, len);
291   sceKernelSyncVMDomain(block, start, len);
292   #else
293   __clear_cache(start, end);
294   #endif
295   (void)len;
296 #endif
297
298   mprotect_w_x(start, end, 1);
299 }
300
301 static void *start_block(void)
302 {
303   u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
304   if (end > (u_char *)BASE_ADDR + (1<<TARGET_SIZE_2))
305     end = (u_char *)BASE_ADDR + (1<<TARGET_SIZE_2);
306   start_tcache_write(out, end);
307   return out;
308 }
309
310 static void end_block(void *start)
311 {
312   end_tcache_write(start, out);
313 }
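/*
 * Sketch of the intended usage (as done by the recompiler below): bracket
 * code emission so the translation cache is writable while emitting and
 * executable again, with instruction caches flushed, afterwards.
 *
 *   void *beg = start_block();  // make [out, out+MAX_OUTPUT_BLOCK_SIZE) writable
 *   ...emit native code, advancing 'out'...
 *   end_block(beg);             // flush icache for [beg, out), restore exec
 *                               // permission when NO_WRITE_EXEC is defined
 */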
314
315 //#define DEBUG_CYCLE_COUNT 1
316
317 #define NO_CYCLE_PENALTY_THR 12
318
319 int cycle_multiplier; // 100 for 1.0
320
321 static int CLOCK_ADJUST(int x)
322 {
323   int s=(x>>31)|1;
324   return (x * cycle_multiplier + s * 50) / 100;
325 }
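/*
 * CLOCK_ADJUST scales a cycle count by cycle_multiplier/100, rounding the
 * magnitude (s is +1 for x>=0, -1 for x<0). Worked example with
 * cycle_multiplier==200: CLOCK_ADJUST(3) == (600+50)/100 == 6 and
 * CLOCK_ADJUST(-3) == (-600-50)/100 == -6.
 */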
326
327 static u_int get_page(u_int vaddr)
328 {
329   u_int page=vaddr&~0xe0000000;
330   if (page < 0x1000000)
331     page &= ~0x0e00000; // RAM mirrors
332   page>>=12;
333   if(page>2048) page=2048+(page&2047);
334   return page;
335 }
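/*
 * Worked example: get_page(0x80201234) strips the KSEG bits (0x00201234),
 * folds the RAM mirrors (0x00001234) and shifts by 12, giving page 1 -- the
 * same page as get_page(0x00001234) or get_page(0xa0201234). Addresses whose
 * page index exceeds 2048 (BIOS, I/O) are folded into the 2048..4095 range.
 */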
336
337 // no virtual mem in PCSX
338 static u_int get_vpage(u_int vaddr)
339 {
340   return get_page(vaddr);
341 }
342
343 // Get address from virtual address
344 // This is called from the recompiled JR/JALR instructions
345 void *get_addr(u_int vaddr)
346 {
347   u_int page=get_page(vaddr);
348   u_int vpage=get_vpage(vaddr);
349   struct ll_entry *head;
350   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
351   head=jump_in[page];
352   while(head!=NULL) {
353     if(head->vaddr==vaddr) {
354   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
355       u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
356       ht_bin[3]=ht_bin[1];
357       ht_bin[2]=ht_bin[0];
358       ht_bin[1]=(u_int)head->addr;
359       ht_bin[0]=vaddr;
360       return head->addr;
361     }
362     head=head->next;
363   }
364   head=jump_dirty[vpage];
365   while(head!=NULL) {
366     if(head->vaddr==vaddr) {
367       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
368       // Don't restore blocks which are about to expire from the cache
369       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
370       if(verify_dirty(head->addr)) {
371         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
372         invalid_code[vaddr>>12]=0;
373         inv_code_start=inv_code_end=~0;
374         if(vpage<2048) {
375           restore_candidate[vpage>>3]|=1<<(vpage&7);
376         }
377         else restore_candidate[page>>3]|=1<<(page&7);
378         u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
379         if(ht_bin[0]==vaddr) {
380           ht_bin[1]=(u_int)head->addr; // Replace existing entry
381         }
382         else
383         {
384           ht_bin[3]=ht_bin[1];
385           ht_bin[2]=ht_bin[0];
386           ht_bin[1]=(int)head->addr;
387           ht_bin[0]=vaddr;
388         }
389         return head->addr;
390       }
391     }
392     head=head->next;
393   }
394   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
395   int r=new_recompile_block(vaddr);
396   if(r==0) return get_addr(vaddr);
397   // Execute in unmapped page, generate page fault exception
398   Status|=2;
399   Cause=(vaddr<<31)|0x8;
400   EPC=(vaddr&1)?vaddr-5:vaddr;
401   BadVAddr=(vaddr&~1);
402   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
403   EntryHi=BadVAddr&0xFFFFE000;
404   return get_addr_ht(0x80000000);
405 }
406 // Look up address in hash table first
407 void *get_addr_ht(u_int vaddr)
408 {
409   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
410   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
411   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
412   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
413   return get_addr(vaddr);
414 }
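/*
 * Hash table layout (illustrative): each of the 65536 bins holds two
 * {vaddr, native code pointer} pairs:
 *   ht_bin[0]/ht_bin[1] -- most recently used entry
 *   ht_bin[2]/ht_bin[3] -- previous entry
 * The bin index is ((vaddr>>16)^vaddr)&0xFFFF, and get_addr() above promotes
 * a hit found in jump_in into slot 0, pushing the old slot 0 down to slot 1.
 */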
415
416 void clear_all_regs(signed char regmap[])
417 {
418   int hr;
419   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
420 }
421
422 signed char get_reg(signed char regmap[],int r)
423 {
424   int hr;
425   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
426   return -1;
427 }
428
429 // Find a register that is available for two consecutive cycles
430 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
431 {
432   int hr;
433   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
434   return -1;
435 }
436
437 int count_free_regs(signed char regmap[])
438 {
439   int count=0;
440   int hr;
441   for(hr=0;hr<HOST_REGS;hr++)
442   {
443     if(hr!=EXCLUDE_REG) {
444       if(regmap[hr]<0) count++;
445     }
446   }
447   return count;
448 }
449
450 void dirty_reg(struct regstat *cur,signed char reg)
451 {
452   int hr;
453   if(!reg) return;
454   for (hr=0;hr<HOST_REGS;hr++) {
455     if((cur->regmap[hr]&63)==reg) {
456       cur->dirty|=1<<hr;
457     }
458   }
459 }
460
461 // If we dirty the lower half of a 64 bit register which is now being
462 // sign-extended, we need to dump the upper half.
463 // Note: Do this only after completion of the instruction, because
464 // some instructions may need to read the full 64-bit value even if
465 // overwriting it (eg SLTI, DSRA32).
466 static void flush_dirty_uppers(struct regstat *cur)
467 {
468   int hr,reg;
469   for (hr=0;hr<HOST_REGS;hr++) {
470     if((cur->dirty>>hr)&1) {
471       reg=cur->regmap[hr];
472       if(reg>=64)
473         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
474     }
475   }
476 }
477
478 void set_const(struct regstat *cur,signed char reg,uint64_t value)
479 {
480   int hr;
481   if(!reg) return;
482   for (hr=0;hr<HOST_REGS;hr++) {
483     if(cur->regmap[hr]==reg) {
484       cur->isconst|=1<<hr;
485       current_constmap[hr]=value;
486     }
487     else if((cur->regmap[hr]^64)==reg) {
488       cur->isconst|=1<<hr;
489       current_constmap[hr]=value>>32;
490     }
491   }
492 }
493
494 void clear_const(struct regstat *cur,signed char reg)
495 {
496   int hr;
497   if(!reg) return;
498   for (hr=0;hr<HOST_REGS;hr++) {
499     if((cur->regmap[hr]&63)==reg) {
500       cur->isconst&=~(1<<hr);
501     }
502   }
503 }
504
505 int is_const(struct regstat *cur,signed char reg)
506 {
507   int hr;
508   if(reg<0) return 0;
509   if(!reg) return 1;
510   for (hr=0;hr<HOST_REGS;hr++) {
511     if((cur->regmap[hr]&63)==reg) {
512       return (cur->isconst>>hr)&1;
513     }
514   }
515   return 0;
516 }
517 uint64_t get_const(struct regstat *cur,signed char reg)
518 {
519   int hr;
520   if(!reg) return 0;
521   for (hr=0;hr<HOST_REGS;hr++) {
522     if(cur->regmap[hr]==reg) {
523       return current_constmap[hr];
524     }
525   }
526   SysPrintf("Unknown constant in r%d\n",reg);
527   exit(1);
528 }
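/*
 * These routines implement simple constant propagation over host registers.
 * For example, imm16_alloc() below records the result of `lui $t0,0x1f80`
 * with set_const(current,8,0x1f800000); a later is_const()/get_const() pair
 * lets the assembler use the known value directly (e.g. for address
 * generation) instead of reloading it. (Illustrative; $t0 is MIPS register 8.)
 */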
529
530 // Least soon needed registers
531 // Look at the next ten instructions and see which registers
532 // will be used.  Try not to reallocate these.
533 void lsn(u_char hsn[], int i, int *preferred_reg)
534 {
535   int j;
536   int b=-1;
537   for(j=0;j<9;j++)
538   {
539     if(i+j>=slen) {
540       j=slen-i-1;
541       break;
542     }
543     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
544     {
545       // Don't go past an unconditional jump
546       j++;
547       break;
548     }
549   }
550   for(;j>=0;j--)
551   {
552     if(rs1[i+j]) hsn[rs1[i+j]]=j;
553     if(rs2[i+j]) hsn[rs2[i+j]]=j;
554     if(rt1[i+j]) hsn[rt1[i+j]]=j;
555     if(rt2[i+j]) hsn[rt2[i+j]]=j;
556     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
557       // Stores can allocate zero
558       hsn[rs1[i+j]]=j;
559       hsn[rs2[i+j]]=j;
560     }
561     // On some architectures stores need invc_ptr
562     #if defined(HOST_IMM8)
563     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
564       hsn[INVCP]=j;
565     }
566     #endif
567     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
568     {
569       hsn[CCREG]=j;
570       b=j;
571     }
572   }
573   if(b>=0)
574   {
575     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
576     {
577       // Follow first branch
578       int t=(ba[i+b]-start)>>2;
579       j=7-b;if(t+j>=slen) j=slen-t-1;
580       for(;j>=0;j--)
581       {
582         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
583         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
584         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
585         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
586       }
587     }
588     // TODO: preferred register based on backward branch
589   }
590   // Delay slot should preferably not overwrite branch conditions or cycle count
591   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
592     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
593     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
594     hsn[CCREG]=1;
595     // ...or hash tables
596     hsn[RHASH]=1;
597     hsn[RHTBL]=1;
598   }
599   // Coprocessor load/store needs FTEMP, even if not declared
600   if(itype[i]==C1LS||itype[i]==C2LS) {
601     hsn[FTEMP]=0;
602   }
603   // Load L/R also uses FTEMP as a temporary register
604   if(itype[i]==LOADLR) {
605     hsn[FTEMP]=0;
606   }
607   // Also SWL/SWR/SDL/SDR
608   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
609     hsn[FTEMP]=0;
610   }
611   // Don't remove the miniht registers
612   if(itype[i]==UJUMP||itype[i]==RJUMP)
613   {
614     hsn[RHASH]=0;
615     hsn[RHTBL]=0;
616   }
617 }
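/*
 * Illustrative note: after lsn(hsn,i,...), hsn[r] holds how many instructions
 * ahead of i the MIPS register r is next used (0 = used by instruction i
 * itself, larger = later or not at all within the lookahead window), so the
 * allocator prefers to evict registers with large hsn values.
 */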
618
619 // We only want to allocate registers if we're going to use them again soon
620 int needed_again(int r, int i)
621 {
622   int j;
623   int b=-1;
624   int rn=10;
625
626   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
627   {
628     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
629       return 0; // Don't need any registers if exiting the block
630   }
631   for(j=0;j<9;j++)
632   {
633     if(i+j>=slen) {
634       j=slen-i-1;
635       break;
636     }
637     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
638     {
639       // Don't go past an unconditional jump
640       j++;
641       break;
642     }
643     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
644     {
645       break;
646     }
647   }
648   for(;j>=1;j--)
649   {
650     if(rs1[i+j]==r) rn=j;
651     if(rs2[i+j]==r) rn=j;
652     if((unneeded_reg[i+j]>>r)&1) rn=10;
653     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
654     {
655       b=j;
656     }
657   }
658   /*
659   if(b>=0)
660   {
661     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
662     {
663       // Follow first branch
664       int o=rn;
665       int t=(ba[i+b]-start)>>2;
666       j=7-b;if(t+j>=slen) j=slen-t-1;
667       for(;j>=0;j--)
668       {
669         if(!((unneeded_reg[t+j]>>r)&1)) {
670           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
671           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
672         }
673         else rn=o;
674       }
675     }
676   }*/
677   if(rn<10) return 1;
678   (void)b;
679   return 0;
680 }
681
682 // Try to match register allocations at the end of a loop with those
683 // at the beginning
684 int loop_reg(int i, int r, int hr)
685 {
686   int j,k;
687   for(j=0;j<9;j++)
688   {
689     if(i+j>=slen) {
690       j=slen-i-1;
691       break;
692     }
693     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
694     {
695       // Don't go past an unconditional jump
696       j++;
697       break;
698     }
699   }
700   k=0;
701   if(i>0){
702     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
703       k--;
704   }
705   for(;k<j;k++)
706   {
707     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
708     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
709     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
710     {
711       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
712       {
713         int t=(ba[i+k]-start)>>2;
714         int reg=get_reg(regs[t].regmap_entry,r);
715         if(reg>=0) return reg;
716         //reg=get_reg(regs[t+1].regmap_entry,r);
717         //if(reg>=0) return reg;
718       }
719     }
720   }
721   return hr;
722 }
723
724
725 // Allocate every register, preserving source/target regs
726 void alloc_all(struct regstat *cur,int i)
727 {
728   int hr;
729
730   for(hr=0;hr<HOST_REGS;hr++) {
731     if(hr!=EXCLUDE_REG) {
732       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
733          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
734       {
735         cur->regmap[hr]=-1;
736         cur->dirty&=~(1<<hr);
737       }
738       // Don't need zeros
739       if((cur->regmap[hr]&63)==0)
740       {
741         cur->regmap[hr]=-1;
742         cur->dirty&=~(1<<hr);
743       }
744     }
745   }
746 }
747
748 #ifdef __i386__
749 #include "assem_x86.c"
750 #endif
751 #ifdef __x86_64__
752 #include "assem_x64.c"
753 #endif
754 #ifdef __arm__
755 #include "assem_arm.c"
756 #endif
757
758 // Add virtual address mapping to linked list
759 void ll_add(struct ll_entry **head,int vaddr,void *addr)
760 {
761   struct ll_entry *new_entry;
762   new_entry=malloc(sizeof(struct ll_entry));
763   assert(new_entry!=NULL);
764   new_entry->vaddr=vaddr;
765   new_entry->reg_sv_flags=0;
766   new_entry->addr=addr;
767   new_entry->next=*head;
768   *head=new_entry;
769 }
770
771 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
772 {
773   ll_add(head,vaddr,addr);
774   (*head)->reg_sv_flags=reg_sv_flags;
775 }
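/*
 * Usage sketch: entries are registered per 4K page by passing the matching
 * list head, e.g. add_link() below does ll_add(jump_out+page,vaddr,src), and
 * clean_blocks() uses ll_add_flags(jump_in+page,...) so the saved-register
 * flags travel with the clean entry point.
 */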
776
777 // Check if an address is already compiled
778 // but don't return addresses which are about to expire from the cache
779 void *check_addr(u_int vaddr)
780 {
781   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
782   if(ht_bin[0]==vaddr) {
783     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
784       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
785   }
786   if(ht_bin[2]==vaddr) {
787     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
788       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
789   }
790   u_int page=get_page(vaddr);
791   struct ll_entry *head;
792   head=jump_in[page];
793   while(head!=NULL) {
794     if(head->vaddr==vaddr) {
795       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
796         // Update existing entry with current address
797         if(ht_bin[0]==vaddr) {
798           ht_bin[1]=(int)head->addr;
799           return head->addr;
800         }
801         if(ht_bin[2]==vaddr) {
802           ht_bin[3]=(int)head->addr;
803           return head->addr;
804         }
805         // Insert into hash table with low priority.
806         // Don't evict existing entries, as they are probably
807         // addresses that are being accessed frequently.
808         if(ht_bin[0]==-1) {
809           ht_bin[1]=(int)head->addr;
810           ht_bin[0]=vaddr;
811         }else if(ht_bin[2]==-1) {
812           ht_bin[3]=(int)head->addr;
813           ht_bin[2]=vaddr;
814         }
815         return head->addr;
816       }
817     }
818     head=head->next;
819   }
820   return 0;
821 }
822
823 void remove_hash(int vaddr)
824 {
825   //printf("remove hash: %x\n",vaddr);
826   u_int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
827   if(ht_bin[2]==vaddr) {
828     ht_bin[2]=ht_bin[3]=-1;
829   }
830   if(ht_bin[0]==vaddr) {
831     ht_bin[0]=ht_bin[2];
832     ht_bin[1]=ht_bin[3];
833     ht_bin[2]=ht_bin[3]=-1;
834   }
835 }
836
837 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
838 {
839   struct ll_entry *next;
840   while(*head) {
841     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) ||
842        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
843     {
844       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
845       remove_hash((*head)->vaddr);
846       next=(*head)->next;
847       free(*head);
848       *head=next;
849     }
850     else
851     {
852       head=&((*head)->next);
853     }
854   }
855 }
856
857 // Remove all entries from linked list
858 void ll_clear(struct ll_entry **head)
859 {
860   struct ll_entry *cur;
861   struct ll_entry *next;
862   if((cur=*head)) {
863     *head=0;
864     while(cur) {
865       next=cur->next;
866       free(cur);
867       cur=next;
868     }
869   }
870 }
871
872 // Dereference the pointers and remove them if they match
873 static void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
874 {
875   while(head) {
876     int ptr=get_pointer(head->addr);
877     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
878     if(((ptr>>shift)==(addr>>shift)) ||
879        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
880     {
881       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
882       void *host_addr=find_extjump_insn(head->addr);
883       #ifdef __arm__
884         mark_clear_cache(host_addr);
885       #endif
886       set_jump_target((int)host_addr,(int)head->addr);
887     }
888     head=head->next;
889   }
890 }
891
892 // This is called when we write to a compiled block (see do_invstub)
893 void invalidate_page(u_int page)
894 {
895   struct ll_entry *head;
896   struct ll_entry *next;
897   head=jump_in[page];
898   jump_in[page]=0;
899   while(head!=NULL) {
900     inv_debug("INVALIDATE: %x\n",head->vaddr);
901     remove_hash(head->vaddr);
902     next=head->next;
903     free(head);
904     head=next;
905   }
906   head=jump_out[page];
907   jump_out[page]=0;
908   while(head!=NULL) {
909     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
910     void *host_addr=find_extjump_insn(head->addr);
911     #ifdef __arm__
912       mark_clear_cache(host_addr);
913     #endif
914     set_jump_target((int)host_addr,(int)head->addr);
915     next=head->next;
916     free(head);
917     head=next;
918   }
919 }
920
921 static void invalidate_block_range(u_int block, u_int first, u_int last)
922 {
923   u_int page=get_page(block<<12);
924   //printf("first=%d last=%d\n",first,last);
925   invalidate_page(page);
926   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
927   assert(last<page+5);
928   // Invalidate the adjacent pages if a block crosses a 4K boundary
929   while(first<page) {
930     invalidate_page(first);
931     first++;
932   }
933   for(first=page+1;first<last;first++) {
934     invalidate_page(first);
935   }
936   #ifdef __arm__
937     do_clear_cache();
938   #endif
939
940   // Don't trap writes
941   invalid_code[block]=1;
942
943   #ifdef USE_MINI_HT
944   memset(mini_ht,-1,sizeof(mini_ht));
945   #endif
946 }
947
948 void invalidate_block(u_int block)
949 {
950   u_int page=get_page(block<<12);
951   u_int vpage=get_vpage(block<<12);
952   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
953   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
954   u_int first,last;
955   first=last=page;
956   struct ll_entry *head;
957   head=jump_dirty[vpage];
958   //printf("page=%d vpage=%d\n",page,vpage);
959   while(head!=NULL) {
960     u_int start,end;
961     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
962       get_bounds((int)head->addr,&start,&end);
963       //printf("start: %x end: %x\n",start,end);
964       if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE) {
965         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
966           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
967           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
968         }
969       }
970     }
971     head=head->next;
972   }
973   invalidate_block_range(block,first,last);
974 }
975
976 void invalidate_addr(u_int addr)
977 {
978   //static int rhits;
979   // this check is done by the caller
980   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
981   u_int page=get_vpage(addr);
982   if(page<2048) { // RAM
983     struct ll_entry *head;
984     u_int addr_min=~0, addr_max=0;
985     u_int mask=RAM_SIZE-1;
986     u_int addr_main=0x80000000|(addr&mask);
987     int pg1;
988     inv_code_start=addr_main&~0xfff;
989     inv_code_end=addr_main|0xfff;
990     pg1=page;
991     if (pg1>0) {
992       // must check the previous page too because blocks can span pages
993       pg1--;
994       inv_code_start-=0x1000;
995     }
996     for(;pg1<=page;pg1++) {
997       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
998         u_int start,end;
999         get_bounds((int)head->addr,&start,&end);
1000         if(ram_offset) {
1001           start-=ram_offset;
1002           end-=ram_offset;
1003         }
1004         if(start<=addr_main&&addr_main<end) {
1005           if(start<addr_min) addr_min=start;
1006           if(end>addr_max) addr_max=end;
1007         }
1008         else if(addr_main<start) {
1009           if(start<inv_code_end)
1010             inv_code_end=start-1;
1011         }
1012         else {
1013           if(end>inv_code_start)
1014             inv_code_start=end;
1015         }
1016       }
1017     }
1018     if (addr_min!=~0) {
1019       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
1020       inv_code_start=inv_code_end=~0;
1021       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
1022       return;
1023     }
1024     else {
1025       inv_code_start=(addr&~mask)|(inv_code_start&mask);
1026       inv_code_end=(addr&~mask)|(inv_code_end&mask);
1027       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
1028       return;
1029     }
1030   }
1031   invalidate_block(addr>>12);
1032 }
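/*
 * Summary of the fast path above: when a store hits RAM, invalidate_addr()
 * either finds a compiled block covering that address and invalidates the
 * affected page range, or it narrows the inv_code_start..inv_code_end window
 * so that subsequent stores into the same code-free region can be filtered
 * out by the caller without searching jump_dirty again.
 */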
1033
1034 // This is called when loading a save state.
1035 // Anything could have changed, so invalidate everything.
1036 void invalidate_all_pages()
1037 {
1038   u_int page;
1039   for(page=0;page<4096;page++)
1040     invalidate_page(page);
1041   for(page=0;page<1048576;page++)
1042     if(!invalid_code[page]) {
1043       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1044       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1045     }
1046   #ifdef USE_MINI_HT
1047   memset(mini_ht,-1,sizeof(mini_ht));
1048   #endif
1049 }
1050
1051 // Add an entry to jump_out after making a link
1052 void add_link(u_int vaddr,void *src)
1053 {
1054   u_int page=get_page(vaddr);
1055   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1056   int *ptr=(int *)(src+4);
1057   assert((*ptr&0x0fff0000)==0x059f0000);
1058   (void)ptr;
1059   ll_add(jump_out+page,vaddr,src);
1060   //int ptr=get_pointer(src);
1061   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1062 }
1063
1064 // If a code block was found to be unmodified (bit was set in
1065 // restore_candidate) and it remains unmodified (bit is clear
1066 // in invalid_code) then move the entries for that 4K page from
1067 // the dirty list to the clean list.
1068 void clean_blocks(u_int page)
1069 {
1070   struct ll_entry *head;
1071   inv_debug("INV: clean_blocks page=%d\n",page);
1072   head=jump_dirty[page];
1073   while(head!=NULL) {
1074     if(!invalid_code[head->vaddr>>12]) {
1075       // Don't restore blocks which are about to expire from the cache
1076       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1077         u_int start,end;
1078         if(verify_dirty(head->addr)) {
1079           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1080           u_int i;
1081           u_int inv=0;
1082           get_bounds((int)head->addr,&start,&end);
1083           if(start-(u_int)rdram<RAM_SIZE) {
1084             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1085               inv|=invalid_code[i];
1086             }
1087           }
1088           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1089             inv=1;
1090           }
1091           if(!inv) {
1092             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1093             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1094               u_int ppage=page;
1095               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1096               //printf("page=%x, addr=%x\n",page,head->vaddr);
1097               //assert(head->vaddr>>12==(page|0x80000));
1098               ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
1099               u_int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1100               if(ht_bin[0]==head->vaddr) {
1101                 ht_bin[1]=(u_int)clean_addr; // Replace existing entry
1102               }
1103               if(ht_bin[2]==head->vaddr) {
1104                 ht_bin[3]=(u_int)clean_addr; // Replace existing entry
1105               }
1106             }
1107           }
1108         }
1109       }
1110     }
1111     head=head->next;
1112   }
1113 }
1114
1115
1116 void mov_alloc(struct regstat *current,int i)
1117 {
1118   // Note: Don't need to actually alloc the source registers
1119   if((~current->is32>>rs1[i])&1) {
1120     //alloc_reg64(current,i,rs1[i]);
1121     alloc_reg64(current,i,rt1[i]);
1122     current->is32&=~(1LL<<rt1[i]);
1123   } else {
1124     //alloc_reg(current,i,rs1[i]);
1125     alloc_reg(current,i,rt1[i]);
1126     current->is32|=(1LL<<rt1[i]);
1127   }
1128   clear_const(current,rs1[i]);
1129   clear_const(current,rt1[i]);
1130   dirty_reg(current,rt1[i]);
1131 }
1132
1133 void shiftimm_alloc(struct regstat *current,int i)
1134 {
1135   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1136   {
1137     if(rt1[i]) {
1138       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1139       else lt1[i]=rs1[i];
1140       alloc_reg(current,i,rt1[i]);
1141       current->is32|=1LL<<rt1[i];
1142       dirty_reg(current,rt1[i]);
1143       if(is_const(current,rs1[i])) {
1144         int v=get_const(current,rs1[i]);
1145         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1146         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1147         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1148       }
1149       else clear_const(current,rt1[i]);
1150     }
1151   }
1152   else
1153   {
1154     clear_const(current,rs1[i]);
1155     clear_const(current,rt1[i]);
1156   }
1157
1158   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1159   {
1160     if(rt1[i]) {
1161       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1162       alloc_reg64(current,i,rt1[i]);
1163       current->is32&=~(1LL<<rt1[i]);
1164       dirty_reg(current,rt1[i]);
1165     }
1166   }
1167   if(opcode2[i]==0x3c) // DSLL32
1168   {
1169     if(rt1[i]) {
1170       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1171       alloc_reg64(current,i,rt1[i]);
1172       current->is32&=~(1LL<<rt1[i]);
1173       dirty_reg(current,rt1[i]);
1174     }
1175   }
1176   if(opcode2[i]==0x3e) // DSRL32
1177   {
1178     if(rt1[i]) {
1179       alloc_reg64(current,i,rs1[i]);
1180       if(imm[i]==32) {
1181         alloc_reg64(current,i,rt1[i]);
1182         current->is32&=~(1LL<<rt1[i]);
1183       } else {
1184         alloc_reg(current,i,rt1[i]);
1185         current->is32|=1LL<<rt1[i];
1186       }
1187       dirty_reg(current,rt1[i]);
1188     }
1189   }
1190   if(opcode2[i]==0x3f) // DSRA32
1191   {
1192     if(rt1[i]) {
1193       alloc_reg64(current,i,rs1[i]);
1194       alloc_reg(current,i,rt1[i]);
1195       current->is32|=1LL<<rt1[i];
1196       dirty_reg(current,rt1[i]);
1197     }
1198   }
1199 }
1200
1201 void shift_alloc(struct regstat *current,int i)
1202 {
1203   if(rt1[i]) {
1204     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1205     {
1206       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1207       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1208       alloc_reg(current,i,rt1[i]);
1209       if(rt1[i]==rs2[i]) {
1210         alloc_reg_temp(current,i,-1);
1211         minimum_free_regs[i]=1;
1212       }
1213       current->is32|=1LL<<rt1[i];
1214     } else { // DSLLV/DSRLV/DSRAV
1215       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1216       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1217       alloc_reg64(current,i,rt1[i]);
1218       current->is32&=~(1LL<<rt1[i]);
1219       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1220       {
1221         alloc_reg_temp(current,i,-1);
1222         minimum_free_regs[i]=1;
1223       }
1224     }
1225     clear_const(current,rs1[i]);
1226     clear_const(current,rs2[i]);
1227     clear_const(current,rt1[i]);
1228     dirty_reg(current,rt1[i]);
1229   }
1230 }
1231
1232 void alu_alloc(struct regstat *current,int i)
1233 {
1234   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1235     if(rt1[i]) {
1236       if(rs1[i]&&rs2[i]) {
1237         alloc_reg(current,i,rs1[i]);
1238         alloc_reg(current,i,rs2[i]);
1239       }
1240       else {
1241         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1242         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1243       }
1244       alloc_reg(current,i,rt1[i]);
1245     }
1246     current->is32|=1LL<<rt1[i];
1247   }
1248   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1249     if(rt1[i]) {
1250       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1251       {
1252         alloc_reg64(current,i,rs1[i]);
1253         alloc_reg64(current,i,rs2[i]);
1254         alloc_reg(current,i,rt1[i]);
1255       } else {
1256         alloc_reg(current,i,rs1[i]);
1257         alloc_reg(current,i,rs2[i]);
1258         alloc_reg(current,i,rt1[i]);
1259       }
1260     }
1261     current->is32|=1LL<<rt1[i];
1262   }
1263   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1264     if(rt1[i]) {
1265       if(rs1[i]&&rs2[i]) {
1266         alloc_reg(current,i,rs1[i]);
1267         alloc_reg(current,i,rs2[i]);
1268       }
1269       else
1270       {
1271         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1272         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1273       }
1274       alloc_reg(current,i,rt1[i]);
1275       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1276       {
1277         if(!((current->uu>>rt1[i])&1)) {
1278           alloc_reg64(current,i,rt1[i]);
1279         }
1280         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1281           if(rs1[i]&&rs2[i]) {
1282             alloc_reg64(current,i,rs1[i]);
1283             alloc_reg64(current,i,rs2[i]);
1284           }
1285           else
1286           {
1287             // Is it really worth it to keep 64-bit values in registers?
1288             #ifdef NATIVE_64BIT
1289             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1290             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1291             #endif
1292           }
1293         }
1294         current->is32&=~(1LL<<rt1[i]);
1295       } else {
1296         current->is32|=1LL<<rt1[i];
1297       }
1298     }
1299   }
1300   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1301     if(rt1[i]) {
1302       if(rs1[i]&&rs2[i]) {
1303         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1304           alloc_reg64(current,i,rs1[i]);
1305           alloc_reg64(current,i,rs2[i]);
1306           alloc_reg64(current,i,rt1[i]);
1307         } else {
1308           alloc_reg(current,i,rs1[i]);
1309           alloc_reg(current,i,rs2[i]);
1310           alloc_reg(current,i,rt1[i]);
1311         }
1312       }
1313       else {
1314         alloc_reg(current,i,rt1[i]);
1315         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1316           // DADD used as move, or zeroing
1317           // If we have a 64-bit source, then make the target 64 bits too
1318           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1319             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1320             alloc_reg64(current,i,rt1[i]);
1321           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1322             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1323             alloc_reg64(current,i,rt1[i]);
1324           }
1325           if(opcode2[i]>=0x2e&&rs2[i]) {
1326             // DSUB used as negation - 64-bit result
1327             // If we have a 32-bit register, extend it to 64 bits
1328             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1329             alloc_reg64(current,i,rt1[i]);
1330           }
1331         }
1332       }
1333       if(rs1[i]&&rs2[i]) {
1334         current->is32&=~(1LL<<rt1[i]);
1335       } else if(rs1[i]) {
1336         current->is32&=~(1LL<<rt1[i]);
1337         if((current->is32>>rs1[i])&1)
1338           current->is32|=1LL<<rt1[i];
1339       } else if(rs2[i]) {
1340         current->is32&=~(1LL<<rt1[i]);
1341         if((current->is32>>rs2[i])&1)
1342           current->is32|=1LL<<rt1[i];
1343       } else {
1344         current->is32|=1LL<<rt1[i];
1345       }
1346     }
1347   }
1348   clear_const(current,rs1[i]);
1349   clear_const(current,rs2[i]);
1350   clear_const(current,rt1[i]);
1351   dirty_reg(current,rt1[i]);
1352 }
1353
1354 void imm16_alloc(struct regstat *current,int i)
1355 {
1356   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1357   else lt1[i]=rs1[i];
1358   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1359   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1360     current->is32&=~(1LL<<rt1[i]);
1361     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1362       // TODO: Could preserve the 32-bit flag if the immediate is zero
1363       alloc_reg64(current,i,rt1[i]);
1364       alloc_reg64(current,i,rs1[i]);
1365     }
1366     clear_const(current,rs1[i]);
1367     clear_const(current,rt1[i]);
1368   }
1369   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1370     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1371     current->is32|=1LL<<rt1[i];
1372     clear_const(current,rs1[i]);
1373     clear_const(current,rt1[i]);
1374   }
1375   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1376     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1377       if(rs1[i]!=rt1[i]) {
1378         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1379         alloc_reg64(current,i,rt1[i]);
1380         current->is32&=~(1LL<<rt1[i]);
1381       }
1382     }
1383     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1384     if(is_const(current,rs1[i])) {
1385       int v=get_const(current,rs1[i]);
1386       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1387       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1388       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1389     }
1390     else clear_const(current,rt1[i]);
1391   }
1392   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1393     if(is_const(current,rs1[i])) {
1394       int v=get_const(current,rs1[i]);
1395       set_const(current,rt1[i],v+imm[i]);
1396     }
1397     else clear_const(current,rt1[i]);
1398     current->is32|=1LL<<rt1[i];
1399   }
1400   else {
1401     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1402     current->is32|=1LL<<rt1[i];
1403   }
1404   dirty_reg(current,rt1[i]);
1405 }
1406
1407 void load_alloc(struct regstat *current,int i)
1408 {
1409   clear_const(current,rt1[i]);
1410   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1411   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1412   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1413   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1414     alloc_reg(current,i,rt1[i]);
1415     assert(get_reg(current->regmap,rt1[i])>=0);
1416     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1417     {
1418       current->is32&=~(1LL<<rt1[i]);
1419       alloc_reg64(current,i,rt1[i]);
1420     }
1421     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1422     {
1423       current->is32&=~(1LL<<rt1[i]);
1424       alloc_reg64(current,i,rt1[i]);
1425       alloc_all(current,i);
1426       alloc_reg64(current,i,FTEMP);
1427       minimum_free_regs[i]=HOST_REGS;
1428     }
1429     else current->is32|=1LL<<rt1[i];
1430     dirty_reg(current,rt1[i]);
1431     // LWL/LWR need a temporary register for the old value
1432     if(opcode[i]==0x22||opcode[i]==0x26)
1433     {
1434       alloc_reg(current,i,FTEMP);
1435       alloc_reg_temp(current,i,-1);
1436       minimum_free_regs[i]=1;
1437     }
1438   }
1439   else
1440   {
1441     // Load to r0 or unneeded register (dummy load)
1442     // but we still need a register to calculate the address
1443     if(opcode[i]==0x22||opcode[i]==0x26)
1444     {
1445       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1446     }
1447     alloc_reg_temp(current,i,-1);
1448     minimum_free_regs[i]=1;
1449     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1450     {
1451       alloc_all(current,i);
1452       alloc_reg64(current,i,FTEMP);
1453       minimum_free_regs[i]=HOST_REGS;
1454     }
1455   }
1456 }
1457
1458 void store_alloc(struct regstat *current,int i)
1459 {
1460   clear_const(current,rs2[i]);
1461   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1462   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1463   alloc_reg(current,i,rs2[i]);
1464   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1465     alloc_reg64(current,i,rs2[i]);
1466     if(rs2[i]) alloc_reg(current,i,FTEMP);
1467   }
1468   #if defined(HOST_IMM8)
1469   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1470   else alloc_reg(current,i,INVCP);
1471   #endif
1472   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1473     alloc_reg(current,i,FTEMP);
1474   }
1475   // We need a temporary register for address generation
1476   alloc_reg_temp(current,i,-1);
1477   minimum_free_regs[i]=1;
1478 }
1479
1480 void c1ls_alloc(struct regstat *current,int i)
1481 {
1482   //clear_const(current,rs1[i]); // FIXME
1483   clear_const(current,rt1[i]);
1484   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1485   alloc_reg(current,i,CSREG); // Status
1486   alloc_reg(current,i,FTEMP);
1487   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1488     alloc_reg64(current,i,FTEMP);
1489   }
1490   #if defined(HOST_IMM8)
1491   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1492   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1493     alloc_reg(current,i,INVCP);
1494   #endif
1495   // We need a temporary register for address generation
1496   alloc_reg_temp(current,i,-1);
1497 }
1498
1499 void c2ls_alloc(struct regstat *current,int i)
1500 {
1501   clear_const(current,rt1[i]);
1502   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1503   alloc_reg(current,i,FTEMP);
1504   #if defined(HOST_IMM8)
1505   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1506   if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1507     alloc_reg(current,i,INVCP);
1508   #endif
1509   // We need a temporary register for address generation
1510   alloc_reg_temp(current,i,-1);
1511   minimum_free_regs[i]=1;
1512 }
1513
1514 #ifndef multdiv_alloc
1515 void multdiv_alloc(struct regstat *current,int i)
1516 {
1517   //  case 0x18: MULT
1518   //  case 0x19: MULTU
1519   //  case 0x1A: DIV
1520   //  case 0x1B: DIVU
1521   //  case 0x1C: DMULT
1522   //  case 0x1D: DMULTU
1523   //  case 0x1E: DDIV
1524   //  case 0x1F: DDIVU
1525   clear_const(current,rs1[i]);
1526   clear_const(current,rs2[i]);
1527   if(rs1[i]&&rs2[i])
1528   {
1529     if((opcode2[i]&4)==0) // 32-bit
1530     {
1531       current->u&=~(1LL<<HIREG);
1532       current->u&=~(1LL<<LOREG);
1533       alloc_reg(current,i,HIREG);
1534       alloc_reg(current,i,LOREG);
1535       alloc_reg(current,i,rs1[i]);
1536       alloc_reg(current,i,rs2[i]);
1537       current->is32|=1LL<<HIREG;
1538       current->is32|=1LL<<LOREG;
1539       dirty_reg(current,HIREG);
1540       dirty_reg(current,LOREG);
1541     }
1542     else // 64-bit
1543     {
1544       current->u&=~(1LL<<HIREG);
1545       current->u&=~(1LL<<LOREG);
1546       current->uu&=~(1LL<<HIREG);
1547       current->uu&=~(1LL<<LOREG);
1548       alloc_reg64(current,i,HIREG);
1549       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1550       alloc_reg64(current,i,rs1[i]);
1551       alloc_reg64(current,i,rs2[i]);
1552       alloc_all(current,i);
1553       current->is32&=~(1LL<<HIREG);
1554       current->is32&=~(1LL<<LOREG);
1555       dirty_reg(current,HIREG);
1556       dirty_reg(current,LOREG);
1557       minimum_free_regs[i]=HOST_REGS;
1558     }
1559   }
1560   else
1561   {
1562     // Multiply by zero is zero.
1563     // MIPS does not have a divide by zero exception.
1564   // The result is undefined, so we return zero.
1565     alloc_reg(current,i,HIREG);
1566     alloc_reg(current,i,LOREG);
1567     current->is32|=1LL<<HIREG;
1568     current->is32|=1LL<<LOREG;
1569     dirty_reg(current,HIREG);
1570     dirty_reg(current,LOREG);
1571   }
1572 }
1573 #endif
1574
1575 void cop0_alloc(struct regstat *current,int i)
1576 {
1577   if(opcode2[i]==0) // MFC0
1578   {
1579     if(rt1[i]) {
1580       clear_const(current,rt1[i]);
1581       alloc_all(current,i);
1582       alloc_reg(current,i,rt1[i]);
1583       current->is32|=1LL<<rt1[i];
1584       dirty_reg(current,rt1[i]);
1585     }
1586   }
1587   else if(opcode2[i]==4) // MTC0
1588   {
1589     if(rs1[i]){
1590       clear_const(current,rs1[i]);
1591       alloc_reg(current,i,rs1[i]);
1592       alloc_all(current,i);
1593     }
1594     else {
1595       alloc_all(current,i); // FIXME: Keep r0
1596       current->u&=~1LL;
1597       alloc_reg(current,i,0);
1598     }
1599   }
1600   else
1601   {
1602     // TLBR/TLBWI/TLBWR/TLBP/ERET
1603     assert(opcode2[i]==0x10);
1604     alloc_all(current,i);
1605   }
1606   minimum_free_regs[i]=HOST_REGS;
1607 }
1608
1609 void cop1_alloc(struct regstat *current,int i)
1610 {
1611   alloc_reg(current,i,CSREG); // Load status
1612   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1613   {
1614     if(rt1[i]){
1615       clear_const(current,rt1[i]);
1616       if(opcode2[i]==1) {
1617         alloc_reg64(current,i,rt1[i]); // DMFC1
1618         current->is32&=~(1LL<<rt1[i]);
1619       }else{
1620         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1621         current->is32|=1LL<<rt1[i];
1622       }
1623       dirty_reg(current,rt1[i]);
1624     }
1625     alloc_reg_temp(current,i,-1);
1626   }
1627   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1628   {
1629     if(rs1[i]){
1630       clear_const(current,rs1[i]);
1631       if(opcode2[i]==5)
1632         alloc_reg64(current,i,rs1[i]); // DMTC1
1633       else
1634         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1635       alloc_reg_temp(current,i,-1);
1636     }
1637     else {
1638       current->u&=~1LL;
1639       alloc_reg(current,i,0);
1640       alloc_reg_temp(current,i,-1);
1641     }
1642   }
1643   minimum_free_regs[i]=1;
1644 }
1645 void fconv_alloc(struct regstat *current,int i)
1646 {
1647   alloc_reg(current,i,CSREG); // Load status
1648   alloc_reg_temp(current,i,-1);
1649   minimum_free_regs[i]=1;
1650 }
1651 void float_alloc(struct regstat *current,int i)
1652 {
1653   alloc_reg(current,i,CSREG); // Load status
1654   alloc_reg_temp(current,i,-1);
1655   minimum_free_regs[i]=1;
1656 }
1657 void c2op_alloc(struct regstat *current,int i)
1658 {
1659   alloc_reg_temp(current,i,-1);
1660 }
1661 void fcomp_alloc(struct regstat *current,int i)
1662 {
1663   alloc_reg(current,i,CSREG); // Load status
1664   alloc_reg(current,i,FSREG); // Load flags
1665   dirty_reg(current,FSREG); // Flag will be modified
1666   alloc_reg_temp(current,i,-1);
1667   minimum_free_regs[i]=1;
1668 }
1669
1670 void syscall_alloc(struct regstat *current,int i)
1671 {
1672   alloc_cc(current,i);
1673   dirty_reg(current,CCREG);
1674   alloc_all(current,i);
1675   minimum_free_regs[i]=HOST_REGS;
1676   current->isconst=0;
1677 }
1678
1679 void delayslot_alloc(struct regstat *current,int i)
1680 {
1681   switch(itype[i]) {
1682     case UJUMP:
1683     case CJUMP:
1684     case SJUMP:
1685     case RJUMP:
1686     case FJUMP:
1687     case SYSCALL:
1688     case HLECALL:
1689     case SPAN:
1690       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1691       SysPrintf("Disabled speculative precompilation\n");
1692       stop_after_jal=1;
1693       break;
1694     case IMM16:
1695       imm16_alloc(current,i);
1696       break;
1697     case LOAD:
1698     case LOADLR:
1699       load_alloc(current,i);
1700       break;
1701     case STORE:
1702     case STORELR:
1703       store_alloc(current,i);
1704       break;
1705     case ALU:
1706       alu_alloc(current,i);
1707       break;
1708     case SHIFT:
1709       shift_alloc(current,i);
1710       break;
1711     case MULTDIV:
1712       multdiv_alloc(current,i);
1713       break;
1714     case SHIFTIMM:
1715       shiftimm_alloc(current,i);
1716       break;
1717     case MOV:
1718       mov_alloc(current,i);
1719       break;
1720     case COP0:
1721       cop0_alloc(current,i);
1722       break;
1723     case COP1:
1724     case COP2:
1725       cop1_alloc(current,i);
1726       break;
1727     case C1LS:
1728       c1ls_alloc(current,i);
1729       break;
1730     case C2LS:
1731       c2ls_alloc(current,i);
1732       break;
1733     case FCONV:
1734       fconv_alloc(current,i);
1735       break;
1736     case FLOAT:
1737       float_alloc(current,i);
1738       break;
1739     case FCOMP:
1740       fcomp_alloc(current,i);
1741       break;
1742     case C2OP:
1743       c2op_alloc(current,i);
1744       break;
1745   }
1746 }
1747
1748 // Special case where a branch and delay slot span two pages in virtual memory
1749 static void pagespan_alloc(struct regstat *current,int i)
1750 {
1751   current->isconst=0;
1752   current->wasconst=0;
1753   regs[i].wasconst=0;
1754   minimum_free_regs[i]=HOST_REGS;
1755   alloc_all(current,i);
1756   alloc_cc(current,i);
1757   dirty_reg(current,CCREG);
1758   if(opcode[i]==3) // JAL
1759   {
1760     alloc_reg(current,i,31);
1761     dirty_reg(current,31);
1762   }
1763   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1764   {
1765     alloc_reg(current,i,rs1[i]);
1766     if (rt1[i]!=0) {
1767       alloc_reg(current,i,rt1[i]);
1768       dirty_reg(current,rt1[i]);
1769     }
1770   }
1771   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1772   {
1773     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1774     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1775     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1776     {
1777       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1778       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1779     }
1780   }
1781   else
1782   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1783   {
1784     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1785     if(!((current->is32>>rs1[i])&1))
1786     {
1787       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1788     }
1789   }
1790   else
1791   if(opcode[i]==0x11) // BC1
1792   {
1793     alloc_reg(current,i,FSREG);
1794     alloc_reg(current,i,CSREG);
1795   }
1796   //else ...
1797 }
1798
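// Queue a stub to be emitted after the main block: each entry records the
// stub type, the patch address, the return address and up to five
// type-specific arguments (see the add_stub() calls in the *_assemble
// functions below).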
1799 static void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1800 {
1801   stubs[stubcount][0]=type;
1802   stubs[stubcount][1]=addr;
1803   stubs[stubcount][2]=retaddr;
1804   stubs[stubcount][3]=a;
1805   stubs[stubcount][4]=b;
1806   stubs[stubcount][5]=c;
1807   stubs[stubcount][6]=d;
1808   stubs[stubcount][7]=e;
1809   stubcount++;
1810 }
1811
1812 // Write out a single register
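// (i.e. store the host register mapped to MIPS register r back to memory if
// it is marked dirty; register-map entries >=64 denote the upper halves of
// 64-bit registers)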
1813 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1814 {
1815   int hr;
1816   for(hr=0;hr<HOST_REGS;hr++) {
1817     if(hr!=EXCLUDE_REG) {
1818       if((regmap[hr]&63)==r) {
1819         if((dirty>>hr)&1) {
1820           if(regmap[hr]<64) {
1821             emit_storereg(r,hr);
1822           }else{
1823             emit_storereg(r|64,hr);
1824           }
1825         }
1826       }
1827     }
1828   }
1829 }
1830
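// Debug-only helpers: mchecksum() accumulates a simple rolling checksum over
// rdram, rchecksum() XORs the register file and rlist() dumps the 32 GPRs;
// memdebug() below uses them for tracing.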
1831 int mchecksum()
1832 {
1833   //if(!tracedebug) return 0;
1834   int i;
1835   int sum=0;
1836   for(i=0;i<2097152;i++) {
1837     unsigned int temp=sum;
1838     sum<<=1;
1839     sum|=(~temp)>>31;
1840     sum^=((u_int *)rdram)[i];
1841   }
1842   return sum;
1843 }
1844 int rchecksum()
1845 {
1846   int i;
1847   int sum=0;
1848   for(i=0;i<64;i++)
1849     sum^=((u_int *)reg)[i];
1850   return sum;
1851 }
1852 void rlist()
1853 {
1854   int i;
1855   printf("TRACE: ");
1856   for(i=0;i<32;i++)
1857     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1858   printf("\n");
1859 }
1860
1861 void enabletrace()
1862 {
1863   tracedebug=1;
1864 }
1865
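// Trace hook that generated code can call (see the commented-out blocks in
// load_assemble/store_assemble): prints the cycle count, a RAM checksum and
// the register list when Count falls inside a hard-coded window.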
1866 void memdebug(int i)
1867 {
1868   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
1869   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
1870   //rlist();
1871   //if(tracedebug) {
1872   //if(Count>=-2084597794) {
1873   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
1874   //if(0) {
1875     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
1876     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
1877     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
1878     rlist();
1879     #ifdef __i386__
1880     printf("TRACE: %x\n",(&i)[-1]);
1881     #endif
1882     #ifdef __arm__
1883     int j;
1884     printf("TRACE: %x \n",(&j)[10]);
1885     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
1886     #endif
1887     //fflush(stdout);
1888   }
1889   //printf("TRACE: %x\n",(&i)[-1]);
1890 }
1891
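// Emit code for register-register ALU ops (SPECIAL opcodes): 32- and 64-bit
// add/sub, set-on-less-than and the bitwise ops, handling the cases where one
// or both sources are r0 or not currently mapped to a host register.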
1892 void alu_assemble(int i,struct regstat *i_regs)
1893 {
1894   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1895     if(rt1[i]) {
1896       signed char s1,s2,t;
1897       t=get_reg(i_regs->regmap,rt1[i]);
1898       if(t>=0) {
1899         s1=get_reg(i_regs->regmap,rs1[i]);
1900         s2=get_reg(i_regs->regmap,rs2[i]);
1901         if(rs1[i]&&rs2[i]) {
1902           assert(s1>=0);
1903           assert(s2>=0);
1904           if(opcode2[i]&2) emit_sub(s1,s2,t);
1905           else emit_add(s1,s2,t);
1906         }
1907         else if(rs1[i]) {
1908           if(s1>=0) emit_mov(s1,t);
1909           else emit_loadreg(rs1[i],t);
1910         }
1911         else if(rs2[i]) {
1912           if(s2>=0) {
1913             if(opcode2[i]&2) emit_neg(s2,t);
1914             else emit_mov(s2,t);
1915           }
1916           else {
1917             emit_loadreg(rs2[i],t);
1918             if(opcode2[i]&2) emit_neg(t,t);
1919           }
1920         }
1921         else emit_zeroreg(t);
1922       }
1923     }
1924   }
1925   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1926     if(rt1[i]) {
1927       signed char s1l,s2l,s1h,s2h,tl,th;
1928       tl=get_reg(i_regs->regmap,rt1[i]);
1929       th=get_reg(i_regs->regmap,rt1[i]|64);
1930       if(tl>=0) {
1931         s1l=get_reg(i_regs->regmap,rs1[i]);
1932         s2l=get_reg(i_regs->regmap,rs2[i]);
1933         s1h=get_reg(i_regs->regmap,rs1[i]|64);
1934         s2h=get_reg(i_regs->regmap,rs2[i]|64);
1935         if(rs1[i]&&rs2[i]) {
1936           assert(s1l>=0);
1937           assert(s2l>=0);
1938           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
1939           else emit_adds(s1l,s2l,tl);
1940           if(th>=0) {
1941             #ifdef INVERTED_CARRY
1942             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
1943             #else
1944             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
1945             #endif
1946             else emit_add(s1h,s2h,th);
1947           }
1948         }
1949         else if(rs1[i]) {
1950           if(s1l>=0) emit_mov(s1l,tl);
1951           else emit_loadreg(rs1[i],tl);
1952           if(th>=0) {
1953             if(s1h>=0) emit_mov(s1h,th);
1954             else emit_loadreg(rs1[i]|64,th);
1955           }
1956         }
1957         else if(rs2[i]) {
1958           if(s2l>=0) {
1959             if(opcode2[i]&2) emit_negs(s2l,tl);
1960             else emit_mov(s2l,tl);
1961           }
1962           else {
1963             emit_loadreg(rs2[i],tl);
1964             if(opcode2[i]&2) emit_negs(tl,tl);
1965           }
1966           if(th>=0) {
1967             #ifdef INVERTED_CARRY
1968             if(s2h>=0) emit_mov(s2h,th);
1969             else emit_loadreg(rs2[i]|64,th);
1970             if(opcode2[i]&2) {
1971               emit_adcimm(-1,th); // x86 has inverted carry flag
1972               emit_not(th,th);
1973             }
1974             #else
1975             if(opcode2[i]&2) {
1976               if(s2h>=0) emit_rscimm(s2h,0,th);
1977               else {
1978                 emit_loadreg(rs2[i]|64,th);
1979                 emit_rscimm(th,0,th);
1980               }
1981             }else{
1982               if(s2h>=0) emit_mov(s2h,th);
1983               else emit_loadreg(rs2[i]|64,th);
1984             }
1985             #endif
1986           }
1987         }
1988         else {
1989           emit_zeroreg(tl);
1990           if(th>=0) emit_zeroreg(th);
1991         }
1992       }
1993     }
1994   }
1995   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1996     if(rt1[i]) {
1997       signed char s1l,s1h,s2l,s2h,t;
1998       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
1999       {
2000         t=get_reg(i_regs->regmap,rt1[i]);
2001         //assert(t>=0);
2002         if(t>=0) {
2003           s1l=get_reg(i_regs->regmap,rs1[i]);
2004           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2005           s2l=get_reg(i_regs->regmap,rs2[i]);
2006           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2007           if(rs2[i]==0) // rx<r0
2008           {
2009             assert(s1h>=0);
2010             if(opcode2[i]==0x2a) // SLT
2011               emit_shrimm(s1h,31,t);
2012             else // SLTU (unsigned cannot be less than zero)
2013               emit_zeroreg(t);
2014           }
2015           else if(rs1[i]==0) // r0<rx
2016           {
2017             assert(s2h>=0);
2018             if(opcode2[i]==0x2a) // SLT
2019               emit_set_gz64_32(s2h,s2l,t);
2020             else // SLTU (set if not zero)
2021               emit_set_nz64_32(s2h,s2l,t);
2022           }
2023           else {
2024             assert(s1l>=0);assert(s1h>=0);
2025             assert(s2l>=0);assert(s2h>=0);
2026             if(opcode2[i]==0x2a) // SLT
2027               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2028             else // SLTU
2029               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2030           }
2031         }
2032       } else {
2033         t=get_reg(i_regs->regmap,rt1[i]);
2034         //assert(t>=0);
2035         if(t>=0) {
2036           s1l=get_reg(i_regs->regmap,rs1[i]);
2037           s2l=get_reg(i_regs->regmap,rs2[i]);
2038           if(rs2[i]==0) // rx<r0
2039           {
2040             assert(s1l>=0);
2041             if(opcode2[i]==0x2a) // SLT
2042               emit_shrimm(s1l,31,t);
2043             else // SLTU (unsigned cannot be less than zero)
2044               emit_zeroreg(t);
2045           }
2046           else if(rs1[i]==0) // r0<rx
2047           {
2048             assert(s2l>=0);
2049             if(opcode2[i]==0x2a) // SLT
2050               emit_set_gz32(s2l,t);
2051             else // SLTU (set if not zero)
2052               emit_set_nz32(s2l,t);
2053           }
2054           else{
2055             assert(s1l>=0);assert(s2l>=0);
2056             if(opcode2[i]==0x2a) // SLT
2057               emit_set_if_less32(s1l,s2l,t);
2058             else // SLTU
2059               emit_set_if_carry32(s1l,s2l,t);
2060           }
2061         }
2062       }
2063     }
2064   }
2065   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2066     if(rt1[i]) {
2067       signed char s1l,s1h,s2l,s2h,th,tl;
2068       tl=get_reg(i_regs->regmap,rt1[i]);
2069       th=get_reg(i_regs->regmap,rt1[i]|64);
2070       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2071       {
2072         assert(tl>=0);
2073         if(tl>=0) {
2074           s1l=get_reg(i_regs->regmap,rs1[i]);
2075           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2076           s2l=get_reg(i_regs->regmap,rs2[i]);
2077           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2078           if(rs1[i]&&rs2[i]) {
2079             assert(s1l>=0);assert(s1h>=0);
2080             assert(s2l>=0);assert(s2h>=0);
2081             if(opcode2[i]==0x24) { // AND
2082               emit_and(s1l,s2l,tl);
2083               emit_and(s1h,s2h,th);
2084             } else
2085             if(opcode2[i]==0x25) { // OR
2086               emit_or(s1l,s2l,tl);
2087               emit_or(s1h,s2h,th);
2088             } else
2089             if(opcode2[i]==0x26) { // XOR
2090               emit_xor(s1l,s2l,tl);
2091               emit_xor(s1h,s2h,th);
2092             } else
2093             if(opcode2[i]==0x27) { // NOR
2094               emit_or(s1l,s2l,tl);
2095               emit_or(s1h,s2h,th);
2096               emit_not(tl,tl);
2097               emit_not(th,th);
2098             }
2099           }
2100           else
2101           {
2102             if(opcode2[i]==0x24) { // AND
2103               emit_zeroreg(tl);
2104               emit_zeroreg(th);
2105             } else
2106             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2107               if(rs1[i]){
2108                 if(s1l>=0) emit_mov(s1l,tl);
2109                 else emit_loadreg(rs1[i],tl);
2110                 if(s1h>=0) emit_mov(s1h,th);
2111                 else emit_loadreg(rs1[i]|64,th);
2112               }
2113               else
2114               if(rs2[i]){
2115                 if(s2l>=0) emit_mov(s2l,tl);
2116                 else emit_loadreg(rs2[i],tl);
2117                 if(s2h>=0) emit_mov(s2h,th);
2118                 else emit_loadreg(rs2[i]|64,th);
2119               }
2120               else{
2121                 emit_zeroreg(tl);
2122                 emit_zeroreg(th);
2123               }
2124             } else
2125             if(opcode2[i]==0x27) { // NOR
2126               if(rs1[i]){
2127                 if(s1l>=0) emit_not(s1l,tl);
2128                 else{
2129                   emit_loadreg(rs1[i],tl);
2130                   emit_not(tl,tl);
2131                 }
2132                 if(s1h>=0) emit_not(s1h,th);
2133                 else{
2134                   emit_loadreg(rs1[i]|64,th);
2135                   emit_not(th,th);
2136                 }
2137               }
2138               else
2139               if(rs2[i]){
2140                 if(s2l>=0) emit_not(s2l,tl);
2141                 else{
2142                   emit_loadreg(rs2[i],tl);
2143                   emit_not(tl,tl);
2144                 }
2145                 if(s2h>=0) emit_not(s2h,th);
2146                 else{
2147                   emit_loadreg(rs2[i]|64,th);
2148                   emit_not(th,th);
2149                 }
2150               }
2151               else {
2152                 emit_movimm(-1,tl);
2153                 emit_movimm(-1,th);
2154               }
2155             }
2156           }
2157         }
2158       }
2159       else
2160       {
2161         // 32 bit
2162         if(tl>=0) {
2163           s1l=get_reg(i_regs->regmap,rs1[i]);
2164           s2l=get_reg(i_regs->regmap,rs2[i]);
2165           if(rs1[i]&&rs2[i]) {
2166             assert(s1l>=0);
2167             assert(s2l>=0);
2168             if(opcode2[i]==0x24) { // AND
2169               emit_and(s1l,s2l,tl);
2170             } else
2171             if(opcode2[i]==0x25) { // OR
2172               emit_or(s1l,s2l,tl);
2173             } else
2174             if(opcode2[i]==0x26) { // XOR
2175               emit_xor(s1l,s2l,tl);
2176             } else
2177             if(opcode2[i]==0x27) { // NOR
2178               emit_or(s1l,s2l,tl);
2179               emit_not(tl,tl);
2180             }
2181           }
2182           else
2183           {
2184             if(opcode2[i]==0x24) { // AND
2185               emit_zeroreg(tl);
2186             } else
2187             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2188               if(rs1[i]){
2189                 if(s1l>=0) emit_mov(s1l,tl);
2190                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2191               }
2192               else
2193               if(rs2[i]){
2194                 if(s2l>=0) emit_mov(s2l,tl);
2195                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2196               }
2197               else emit_zeroreg(tl);
2198             } else
2199             if(opcode2[i]==0x27) { // NOR
2200               if(rs1[i]){
2201                 if(s1l>=0) emit_not(s1l,tl);
2202                 else {
2203                   emit_loadreg(rs1[i],tl);
2204                   emit_not(tl,tl);
2205                 }
2206               }
2207               else
2208               if(rs2[i]){
2209                 if(s2l>=0) emit_not(s2l,tl);
2210                 else {
2211                   emit_loadreg(rs2[i],tl);
2212                   emit_not(tl,tl);
2213                 }
2214               }
2215               else emit_movimm(-1,tl);
2216             }
2217           }
2218         }
2219       }
2220     }
2221   }
2222 }
2223
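// Emit code for immediate-format ops: LUI, ADDI(U)/DADDI(U), SLTI(U) and
// ANDI/ORI/XORI, using the constant map to fold known source values.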
2224 void imm16_assemble(int i,struct regstat *i_regs)
2225 {
2226   if (opcode[i]==0x0f) { // LUI
2227     if(rt1[i]) {
2228       signed char t;
2229       t=get_reg(i_regs->regmap,rt1[i]);
2230       //assert(t>=0);
2231       if(t>=0) {
2232         if(!((i_regs->isconst>>t)&1))
2233           emit_movimm(imm[i]<<16,t);
2234       }
2235     }
2236   }
2237   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2238     if(rt1[i]) {
2239       signed char s,t;
2240       t=get_reg(i_regs->regmap,rt1[i]);
2241       s=get_reg(i_regs->regmap,rs1[i]);
2242       if(rs1[i]) {
2243         //assert(t>=0);
2244         //assert(s>=0);
2245         if(t>=0) {
2246           if(!((i_regs->isconst>>t)&1)) {
2247             if(s<0) {
2248               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2249               emit_addimm(t,imm[i],t);
2250             }else{
2251               if(!((i_regs->wasconst>>s)&1))
2252                 emit_addimm(s,imm[i],t);
2253               else
2254                 emit_movimm(constmap[i][s]+imm[i],t);
2255             }
2256           }
2257         }
2258       } else {
2259         if(t>=0) {
2260           if(!((i_regs->isconst>>t)&1))
2261             emit_movimm(imm[i],t);
2262         }
2263       }
2264     }
2265   }
2266   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2267     if(rt1[i]) {
2268       signed char sh,sl,th,tl;
2269       th=get_reg(i_regs->regmap,rt1[i]|64);
2270       tl=get_reg(i_regs->regmap,rt1[i]);
2271       sh=get_reg(i_regs->regmap,rs1[i]|64);
2272       sl=get_reg(i_regs->regmap,rs1[i]);
2273       if(tl>=0) {
2274         if(rs1[i]) {
2275           assert(sh>=0);
2276           assert(sl>=0);
2277           if(th>=0) {
2278             emit_addimm64_32(sh,sl,imm[i],th,tl);
2279           }
2280           else {
2281             emit_addimm(sl,imm[i],tl);
2282           }
2283         } else {
2284           emit_movimm(imm[i],tl);
2285           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2286         }
2287       }
2288     }
2289   }
2290   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2291     if(rt1[i]) {
2292       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2293       signed char sh,sl,t;
2294       t=get_reg(i_regs->regmap,rt1[i]);
2295       sh=get_reg(i_regs->regmap,rs1[i]|64);
2296       sl=get_reg(i_regs->regmap,rs1[i]);
2297       //assert(t>=0);
2298       if(t>=0) {
2299         if(rs1[i]>0) {
2300           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2301           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2302             if(opcode[i]==0x0a) { // SLTI
2303               if(sl<0) {
2304                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2305                 emit_slti32(t,imm[i],t);
2306               }else{
2307                 emit_slti32(sl,imm[i],t);
2308               }
2309             }
2310             else { // SLTIU
2311               if(sl<0) {
2312                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2313                 emit_sltiu32(t,imm[i],t);
2314               }else{
2315                 emit_sltiu32(sl,imm[i],t);
2316               }
2317             }
2318           }else{ // 64-bit
2319             assert(sl>=0);
2320             if(opcode[i]==0x0a) // SLTI
2321               emit_slti64_32(sh,sl,imm[i],t);
2322             else // SLTIU
2323               emit_sltiu64_32(sh,sl,imm[i],t);
2324           }
2325         }else{
2326           // SLTI(U) with r0 is just stupid,
2327           // nonetheless examples can be found
2328           if(opcode[i]==0x0a) // SLTI
2329             if(0<imm[i]) emit_movimm(1,t);
2330             else emit_zeroreg(t);
2331           else // SLTIU
2332           {
2333             if(imm[i]) emit_movimm(1,t);
2334             else emit_zeroreg(t);
2335           }
2336         }
2337       }
2338     }
2339   }
2340   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2341     if(rt1[i]) {
2342       signed char sh,sl,th,tl;
2343       th=get_reg(i_regs->regmap,rt1[i]|64);
2344       tl=get_reg(i_regs->regmap,rt1[i]);
2345       sh=get_reg(i_regs->regmap,rs1[i]|64);
2346       sl=get_reg(i_regs->regmap,rs1[i]);
2347       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2348         if(opcode[i]==0x0c) //ANDI
2349         {
2350           if(rs1[i]) {
2351             if(sl<0) {
2352               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2353               emit_andimm(tl,imm[i],tl);
2354             }else{
2355               if(!((i_regs->wasconst>>sl)&1))
2356                 emit_andimm(sl,imm[i],tl);
2357               else
2358                 emit_movimm(constmap[i][sl]&imm[i],tl);
2359             }
2360           }
2361           else
2362             emit_zeroreg(tl);
2363           if(th>=0) emit_zeroreg(th);
2364         }
2365         else
2366         {
2367           if(rs1[i]) {
2368             if(sl<0) {
2369               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2370             }
2371             if(th>=0) {
2372               if(sh<0) {
2373                 emit_loadreg(rs1[i]|64,th);
2374               }else{
2375                 emit_mov(sh,th);
2376               }
2377             }
2378             if(opcode[i]==0x0d) { // ORI
2379               if(sl<0) {
2380                 emit_orimm(tl,imm[i],tl);
2381               }else{
2382                 if(!((i_regs->wasconst>>sl)&1))
2383                   emit_orimm(sl,imm[i],tl);
2384                 else
2385                   emit_movimm(constmap[i][sl]|imm[i],tl);
2386               }
2387             }
2388             if(opcode[i]==0x0e) { // XORI
2389               if(sl<0) {
2390                 emit_xorimm(tl,imm[i],tl);
2391               }else{
2392                 if(!((i_regs->wasconst>>sl)&1))
2393                   emit_xorimm(sl,imm[i],tl);
2394                 else
2395                   emit_movimm(constmap[i][sl]^imm[i],tl);
2396               }
2397             }
2398           }
2399           else {
2400             emit_movimm(imm[i],tl);
2401             if(th>=0) emit_zeroreg(th);
2402           }
2403         }
2404       }
2405     }
2406   }
2407 }
2408
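// Emit code for shift-by-immediate ops: SLL/SRL/SRA plus the 64-bit
// DSLL/DSRL/DSRA variants and their *32 forms.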
2409 void shiftimm_assemble(int i,struct regstat *i_regs)
2410 {
2411   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2412   {
2413     if(rt1[i]) {
2414       signed char s,t;
2415       t=get_reg(i_regs->regmap,rt1[i]);
2416       s=get_reg(i_regs->regmap,rs1[i]);
2417       //assert(t>=0);
2418       if(t>=0&&!((i_regs->isconst>>t)&1)){
2419         if(rs1[i]==0)
2420         {
2421           emit_zeroreg(t);
2422         }
2423         else
2424         {
2425           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2426           if(imm[i]) {
2427             if(opcode2[i]==0) // SLL
2428             {
2429               emit_shlimm(s<0?t:s,imm[i],t);
2430             }
2431             if(opcode2[i]==2) // SRL
2432             {
2433               emit_shrimm(s<0?t:s,imm[i],t);
2434             }
2435             if(opcode2[i]==3) // SRA
2436             {
2437               emit_sarimm(s<0?t:s,imm[i],t);
2438             }
2439           }else{
2440             // Shift by zero
2441             if(s>=0 && s!=t) emit_mov(s,t);
2442           }
2443         }
2444       }
2445       //emit_storereg(rt1[i],t); //DEBUG
2446     }
2447   }
2448   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2449   {
2450     if(rt1[i]) {
2451       signed char sh,sl,th,tl;
2452       th=get_reg(i_regs->regmap,rt1[i]|64);
2453       tl=get_reg(i_regs->regmap,rt1[i]);
2454       sh=get_reg(i_regs->regmap,rs1[i]|64);
2455       sl=get_reg(i_regs->regmap,rs1[i]);
2456       if(tl>=0) {
2457         if(rs1[i]==0)
2458         {
2459           emit_zeroreg(tl);
2460           if(th>=0) emit_zeroreg(th);
2461         }
2462         else
2463         {
2464           assert(sl>=0);
2465           assert(sh>=0);
2466           if(imm[i]) {
2467             if(opcode2[i]==0x38) // DSLL
2468             {
2469               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2470               emit_shlimm(sl,imm[i],tl);
2471             }
2472             if(opcode2[i]==0x3a) // DSRL
2473             {
2474               emit_shrdimm(sl,sh,imm[i],tl);
2475               if(th>=0) emit_shrimm(sh,imm[i],th);
2476             }
2477             if(opcode2[i]==0x3b) // DSRA
2478             {
2479               emit_shrdimm(sl,sh,imm[i],tl);
2480               if(th>=0) emit_sarimm(sh,imm[i],th);
2481             }
2482           }else{
2483             // Shift by zero
2484             if(sl!=tl) emit_mov(sl,tl);
2485             if(th>=0&&sh!=th) emit_mov(sh,th);
2486           }
2487         }
2488       }
2489     }
2490   }
2491   if(opcode2[i]==0x3c) // DSLL32
2492   {
2493     if(rt1[i]) {
2494       signed char sl,tl,th;
2495       tl=get_reg(i_regs->regmap,rt1[i]);
2496       th=get_reg(i_regs->regmap,rt1[i]|64);
2497       sl=get_reg(i_regs->regmap,rs1[i]);
2498       if(th>=0||tl>=0){
2499         assert(tl>=0);
2500         assert(th>=0);
2501         assert(sl>=0);
2502         emit_mov(sl,th);
2503         emit_zeroreg(tl);
2504         if(imm[i]>32)
2505         {
2506           emit_shlimm(th,imm[i]&31,th);
2507         }
2508       }
2509     }
2510   }
2511   if(opcode2[i]==0x3e) // DSRL32
2512   {
2513     if(rt1[i]) {
2514       signed char sh,tl,th;
2515       tl=get_reg(i_regs->regmap,rt1[i]);
2516       th=get_reg(i_regs->regmap,rt1[i]|64);
2517       sh=get_reg(i_regs->regmap,rs1[i]|64);
2518       if(tl>=0){
2519         assert(sh>=0);
2520         emit_mov(sh,tl);
2521         if(th>=0) emit_zeroreg(th);
2522         if(imm[i]>32)
2523         {
2524           emit_shrimm(tl,imm[i]&31,tl);
2525         }
2526       }
2527     }
2528   }
2529   if(opcode2[i]==0x3f) // DSRA32
2530   {
2531     if(rt1[i]) {
2532       signed char sh,tl;
2533       tl=get_reg(i_regs->regmap,rt1[i]);
2534       sh=get_reg(i_regs->regmap,rs1[i]|64);
2535       if(tl>=0){
2536         assert(sh>=0);
2537         emit_mov(sh,tl);
2538         if(imm[i]>32)
2539         {
2540           emit_sarimm(tl,imm[i]&31,tl);
2541         }
2542       }
2543     }
2544   }
2545 }
2546
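// shift_assemble, loadlr_assemble, multdiv_assemble and fconv_assemble are
// normally provided by the architecture-specific assem_*.h; the #ifndef
// fallbacks below only report the missing implementation and exit.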
2547 #ifndef shift_assemble
2548 void shift_assemble(int i,struct regstat *i_regs)
2549 {
2550   printf("Need shift_assemble for this architecture.\n");
2551   exit(1);
2552 }
2553 #endif
2554
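// Emit code for loads (LB/LH/LW/LBU/LHU/LWU/LD).  The fast path reads RAM
// directly; accesses that may hit I/O or fall outside RAM branch to a
// LOADx_STUB queued with add_stub(), and loads from known constant addresses
// outside RAM are inlined via inline_readstub().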
2555 void load_assemble(int i,struct regstat *i_regs)
2556 {
2557   int s,th,tl,addr,map=-1;
2558   int offset;
2559   int jaddr=0;
2560   int memtarget=0,c=0;
2561   int fastload_reg_override=0;
2562   u_int hr,reglist=0;
2563   th=get_reg(i_regs->regmap,rt1[i]|64);
2564   tl=get_reg(i_regs->regmap,rt1[i]);
2565   s=get_reg(i_regs->regmap,rs1[i]);
2566   offset=imm[i];
2567   for(hr=0;hr<HOST_REGS;hr++) {
2568     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2569   }
2570   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2571   if(s>=0) {
2572     c=(i_regs->wasconst>>s)&1;
2573     if (c) {
2574       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2575     }
2576   }
2577   //printf("load_assemble: c=%d\n",c);
2578   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2579   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2580   if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
2581     ||rt1[i]==0) {
2582       // could be a hardware FIFO, so the read must still be performed
2583       // (the ||rt1[i]==0 case is a dummy read whose result is discarded)
2584       assem_debug("(forced read)\n");
2585       tl=get_reg(i_regs->regmap,-1);
2586       assert(tl>=0);
2587   }
2588   if(offset||s<0||c) addr=tl;
2589   else addr=s;
2590   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2591  if(tl>=0) {
2592   //printf("load_assemble: c=%d\n",c);
2593   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2594   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2595   reglist&=~(1<<tl);
2596   if(th>=0) reglist&=~(1<<th);
2597   if(!c) {
2598     #ifdef RAM_OFFSET
2599     map=get_reg(i_regs->regmap,ROREG);
2600     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2601     #endif
2602     #ifdef R29_HACK
2603     // Strmnnrmn's speed hack
2604     if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2605     #endif
2606     {
2607       jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
2608     }
2609   }
2610   else if(ram_offset&&memtarget) {
2611     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2612     fastload_reg_override=HOST_TEMPREG;
2613   }
2614   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2615   if (opcode[i]==0x20) { // LB
2616     if(!c||memtarget) {
2617       if(!dummy) {
2618         #ifdef HOST_IMM_ADDR32
2619         if(c)
2620           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2621         else
2622         #endif
2623         {
2624           //emit_xorimm(addr,3,tl);
2625           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2626           int x=0,a=tl;
2627 #ifdef BIG_ENDIAN_MIPS
2628           if(!c) emit_xorimm(addr,3,tl);
2629           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2630 #else
2631           if(!c) a=addr;
2632 #endif
2633           if(fastload_reg_override) a=fastload_reg_override;
2634
2635           emit_movsbl_indexed_tlb(x,a,map,tl);
2636         }
2637       }
2638       if(jaddr)
2639         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2640     }
2641     else
2642       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2643   }
2644   if (opcode[i]==0x21) { // LH
2645     if(!c||memtarget) {
2646       if(!dummy) {
2647         #ifdef HOST_IMM_ADDR32
2648         if(c)
2649           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2650         else
2651         #endif
2652         {
2653           int x=0,a=tl;
2654 #ifdef BIG_ENDIAN_MIPS
2655           if(!c) emit_xorimm(addr,2,tl);
2656           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2657 #else
2658           if(!c) a=addr;
2659 #endif
2660           if(fastload_reg_override) a=fastload_reg_override;
2661           //#ifdef
2662           //emit_movswl_indexed_tlb(x,tl,map,tl);
2663           //else
2664           if(map>=0) {
2665             emit_movswl_indexed(x,a,tl);
2666           }else{
2667             #if 1 //def RAM_OFFSET
2668             emit_movswl_indexed(x,a,tl);
2669             #else
2670             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2671             #endif
2672           }
2673         }
2674       }
2675       if(jaddr)
2676         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2677     }
2678     else
2679       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2680   }
2681   if (opcode[i]==0x23) { // LW
2682     if(!c||memtarget) {
2683       if(!dummy) {
2684         int a=addr;
2685         if(fastload_reg_override) a=fastload_reg_override;
2686         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2687         #ifdef HOST_IMM_ADDR32
2688         if(c)
2689           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2690         else
2691         #endif
2692         emit_readword_indexed_tlb(0,a,map,tl);
2693       }
2694       if(jaddr)
2695         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2696     }
2697     else
2698       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2699   }
2700   if (opcode[i]==0x24) { // LBU
2701     if(!c||memtarget) {
2702       if(!dummy) {
2703         #ifdef HOST_IMM_ADDR32
2704         if(c)
2705           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2706         else
2707         #endif
2708         {
2709           //emit_xorimm(addr,3,tl);
2710           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2711           int x=0,a=tl;
2712 #ifdef BIG_ENDIAN_MIPS
2713           if(!c) emit_xorimm(addr,3,tl);
2714           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2715 #else
2716           if(!c) a=addr;
2717 #endif
2718           if(fastload_reg_override) a=fastload_reg_override;
2719
2720           emit_movzbl_indexed_tlb(x,a,map,tl);
2721         }
2722       }
2723       if(jaddr)
2724         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2725     }
2726     else
2727       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2728   }
2729   if (opcode[i]==0x25) { // LHU
2730     if(!c||memtarget) {
2731       if(!dummy) {
2732         #ifdef HOST_IMM_ADDR32
2733         if(c)
2734           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2735         else
2736         #endif
2737         {
2738           int x=0,a=tl;
2739 #ifdef BIG_ENDIAN_MIPS
2740           if(!c) emit_xorimm(addr,2,tl);
2741           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2742 #else
2743           if(!c) a=addr;
2744 #endif
2745           if(fastload_reg_override) a=fastload_reg_override;
2746           //#ifdef
2747           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2748           //#else
2749           if(map>=0) {
2750             emit_movzwl_indexed(x,a,tl);
2751           }else{
2752             #if 1 //def RAM_OFFSET
2753             emit_movzwl_indexed(x,a,tl);
2754             #else
2755             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
2756             #endif
2757           }
2758         }
2759       }
2760       if(jaddr)
2761         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2762     }
2763     else
2764       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2765   }
2766   if (opcode[i]==0x27) { // LWU
2767     assert(th>=0);
2768     if(!c||memtarget) {
2769       if(!dummy) {
2770         int a=addr;
2771         if(fastload_reg_override) a=fastload_reg_override;
2772         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2773         #ifdef HOST_IMM_ADDR32
2774         if(c)
2775           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2776         else
2777         #endif
2778         emit_readword_indexed_tlb(0,a,map,tl);
2779       }
2780       if(jaddr)
2781         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2782     }
2783     else {
2784       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2785     }
2786     emit_zeroreg(th);
2787   }
2788   if (opcode[i]==0x37) { // LD
2789     if(!c||memtarget) {
2790       if(!dummy) {
2791         int a=addr;
2792         if(fastload_reg_override) a=fastload_reg_override;
2793         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
2794         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
2795         #ifdef HOST_IMM_ADDR32
2796         if(c)
2797           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
2798         else
2799         #endif
2800         emit_readdword_indexed_tlb(0,a,map,th,tl);
2801       }
2802       if(jaddr)
2803         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2804     }
2805     else
2806       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2807   }
2808  }
2809   //emit_storereg(rt1[i],tl); // DEBUG
2810   //if(opcode[i]==0x23)
2811   //if(opcode[i]==0x24)
2812   //if(opcode[i]==0x23||opcode[i]==0x24)
2813   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
2814   {
2815     //emit_pusha();
2816     save_regs(0x100f);
2817         emit_readword((int)&last_count,ECX);
2818         #ifdef __i386__
2819         if(get_reg(i_regs->regmap,CCREG)<0)
2820           emit_loadreg(CCREG,HOST_CCREG);
2821         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2822         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2823         emit_writeword(HOST_CCREG,(int)&Count);
2824         #endif
2825         #ifdef __arm__
2826         if(get_reg(i_regs->regmap,CCREG)<0)
2827           emit_loadreg(CCREG,0);
2828         else
2829           emit_mov(HOST_CCREG,0);
2830         emit_add(0,ECX,0);
2831         emit_addimm(0,2*ccadj[i],0);
2832         emit_writeword(0,(int)&Count);
2833         #endif
2834     emit_call((int)memdebug);
2835     //emit_popa();
2836     restore_regs(0x100f);
2837   }*/
2838 }
2839
2840 #ifndef loadlr_assemble
2841 void loadlr_assemble(int i,struct regstat *i_regs)
2842 {
2843   printf("Need loadlr_assemble for this architecture.\n");
2844   exit(1);
2845 }
2846 #endif
2847
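// Emit code for aligned stores (SB/SH/SW/SD).  Out-of-range addresses go
// through a STOREx_STUB or inline_writestub(); after the write, invalid_code
// is checked so that stores into compiled code invalidate the affected block,
// and a constant store that hits the block currently being compiled jumps out
// via do_interrupt.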
2848 void store_assemble(int i,struct regstat *i_regs)
2849 {
2850   int s,th,tl,map=-1;
2851   int addr,temp;
2852   int offset;
2853   int jaddr=0,type;
2854   int memtarget=0,c=0;
2855   int agr=AGEN1+(i&1);
2856   int faststore_reg_override=0;
2857   u_int hr,reglist=0;
2858   th=get_reg(i_regs->regmap,rs2[i]|64);
2859   tl=get_reg(i_regs->regmap,rs2[i]);
2860   s=get_reg(i_regs->regmap,rs1[i]);
2861   temp=get_reg(i_regs->regmap,agr);
2862   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2863   offset=imm[i];
2864   if(s>=0) {
2865     c=(i_regs->wasconst>>s)&1;
2866     if(c) {
2867       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2868     }
2869   }
2870   assert(tl>=0);
2871   assert(temp>=0);
2872   for(hr=0;hr<HOST_REGS;hr++) {
2873     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2874   }
2875   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2876   if(offset||s<0||c) addr=temp;
2877   else addr=s;
2878   if(!c) {
2879     jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
2880   }
2881   else if(ram_offset&&memtarget) {
2882     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2883     faststore_reg_override=HOST_TEMPREG;
2884   }
2885
2886   if (opcode[i]==0x28) { // SB
2887     if(!c||memtarget) {
2888       int x=0,a=temp;
2889 #ifdef BIG_ENDIAN_MIPS
2890       if(!c) emit_xorimm(addr,3,temp);
2891       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2892 #else
2893       if(!c) a=addr;
2894 #endif
2895       if(faststore_reg_override) a=faststore_reg_override;
2896       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
2897       emit_writebyte_indexed_tlb(tl,x,a,map,a);
2898     }
2899     type=STOREB_STUB;
2900   }
2901   if (opcode[i]==0x29) { // SH
2902     if(!c||memtarget) {
2903       int x=0,a=temp;
2904 #ifdef BIG_ENDIAN_MIPS
2905       if(!c) emit_xorimm(addr,2,temp);
2906       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2907 #else
2908       if(!c) a=addr;
2909 #endif
2910       if(faststore_reg_override) a=faststore_reg_override;
2911       //#ifdef
2912       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
2913       //#else
2914       if(map>=0) {
2915         emit_writehword_indexed(tl,x,a);
2916       }else
2917         //emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
2918         emit_writehword_indexed(tl,x,a);
2919     }
2920     type=STOREH_STUB;
2921   }
2922   if (opcode[i]==0x2B) { // SW
2923     if(!c||memtarget) {
2924       int a=addr;
2925       if(faststore_reg_override) a=faststore_reg_override;
2926       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
2927       emit_writeword_indexed_tlb(tl,0,a,map,temp);
2928     }
2929     type=STOREW_STUB;
2930   }
2931   if (opcode[i]==0x3F) { // SD
2932     if(!c||memtarget) {
2933       int a=addr;
2934       if(faststore_reg_override) a=faststore_reg_override;
2935       if(rs2[i]) {
2936         assert(th>=0);
2937         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
2938         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
2939         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
2940       }else{
2941         // Store zero
2942         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
2943         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
2944         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
2945       }
2946     }
2947     type=STORED_STUB;
2948   }
2949   if(jaddr) {
2950     // PCSX store handlers don't check invcode again
2951     reglist|=1<<addr;
2952     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2953     jaddr=0;
2954   }
2955   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
2956     if(!c||memtarget) {
2957       #ifdef DESTRUCTIVE_SHIFT
2958       // The x86 shift operation is 'destructive'; it overwrites the
2959       // source register, so we need to make a copy first and use that.
2960       addr=temp;
2961       #endif
2962       #if defined(HOST_IMM8)
2963       int ir=get_reg(i_regs->regmap,INVCP);
2964       assert(ir>=0);
2965       emit_cmpmem_indexedsr12_reg(ir,addr,1);
2966       #else
2967       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
2968       #endif
2969       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
2970       emit_callne(invalidate_addr_reg[addr]);
2971       #else
2972       int jaddr2=(int)out;
2973       emit_jne(0);
2974       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
2975       #endif
2976     }
2977   }
2978   u_int addr_val=constmap[i][s]+offset;
2979   if(jaddr) {
2980     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2981   } else if(c&&!memtarget) {
2982     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
2983   }
2984   // Basic detection of stores that modify the current block;
2985   // we don't look back, since earlier code should already be in the MIPS i-cache.
2986   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
2987     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
2988     assert(i_regs->regmap==regs[i].regmap); // not delay slot
2989     if(i_regs->regmap==regs[i].regmap) {
2990       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
2991       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
2992       emit_movimm(start+i*4+4,0);
2993       emit_writeword(0,(int)&pcaddr);
2994       emit_jmp((int)do_interrupt);
2995     }
2996   }
2997   //if(opcode[i]==0x2B || opcode[i]==0x3F)
2998   //if(opcode[i]==0x2B || opcode[i]==0x28)
2999   //if(opcode[i]==0x2B || opcode[i]==0x29)
3000   //if(opcode[i]==0x2B)
3001   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3002   {
3003     #ifdef __i386__
3004     emit_pusha();
3005     #endif
3006     #ifdef __arm__
3007     save_regs(0x100f);
3008     #endif
3009         emit_readword((int)&last_count,ECX);
3010         #ifdef __i386__
3011         if(get_reg(i_regs->regmap,CCREG)<0)
3012           emit_loadreg(CCREG,HOST_CCREG);
3013         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3014         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3015         emit_writeword(HOST_CCREG,(int)&Count);
3016         #endif
3017         #ifdef __arm__
3018         if(get_reg(i_regs->regmap,CCREG)<0)
3019           emit_loadreg(CCREG,0);
3020         else
3021           emit_mov(HOST_CCREG,0);
3022         emit_add(0,ECX,0);
3023         emit_addimm(0,2*ccadj[i],0);
3024         emit_writeword(0,(int)&Count);
3025         #endif
3026     emit_call((int)memdebug);
3027     #ifdef __i386__
3028     emit_popa();
3029     #endif
3030     #ifdef __arm__
3031     restore_regs(0x100f);
3032     #endif
3033   }*/
3034 }
3035
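// Emit code for the unaligned stores SWL/SWR/SDL/SDR.  The low address bits
// are tested and the code branches to one of four cases (byte offset 0-3
// within the word), each writing the appropriate partial bytes; for SDL/SDR a
// second aligned word may be written afterwards.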
3036 void storelr_assemble(int i,struct regstat *i_regs)
3037 {
3038   int s,th,tl;
3039   int temp;
3040   int temp2=-1;
3041   int offset;
3042   int jaddr=0;
3043   int case1,case2,case3;
3044   int done0,done1,done2;
3045   int memtarget=0,c=0;
3046   int agr=AGEN1+(i&1);
3047   u_int hr,reglist=0;
3048   th=get_reg(i_regs->regmap,rs2[i]|64);
3049   tl=get_reg(i_regs->regmap,rs2[i]);
3050   s=get_reg(i_regs->regmap,rs1[i]);
3051   temp=get_reg(i_regs->regmap,agr);
3052   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3053   offset=imm[i];
3054   if(s>=0) {
3055     c=(i_regs->isconst>>s)&1;
3056     if(c) {
3057       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3058     }
3059   }
3060   assert(tl>=0);
3061   for(hr=0;hr<HOST_REGS;hr++) {
3062     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3063   }
3064   assert(temp>=0);
3065   if(!c) {
3066     emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3067     if(!offset&&s!=temp) emit_mov(s,temp);
3068     jaddr=(int)out;
3069     emit_jno(0);
3070   }
3071   else
3072   {
3073     if(!memtarget||!rs1[i]) {
3074       jaddr=(int)out;
3075       emit_jmp(0);
3076     }
3077   }
3078   #ifdef RAM_OFFSET
3079   int map=get_reg(i_regs->regmap,ROREG);
3080   if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3081   #else
3082   if((u_int)rdram!=0x80000000)
3083     emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3084   #endif
3085
3086   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3087     temp2=get_reg(i_regs->regmap,FTEMP);
3088     if(!rs2[i]) temp2=th=tl;
3089   }
3090
3091 #ifndef BIG_ENDIAN_MIPS
3092     emit_xorimm(temp,3,temp);
3093 #endif
3094   emit_testimm(temp,2);
3095   case2=(int)out;
3096   emit_jne(0);
3097   emit_testimm(temp,1);
3098   case1=(int)out;
3099   emit_jne(0);
3100   // 0
3101   if (opcode[i]==0x2A) { // SWL
3102     emit_writeword_indexed(tl,0,temp);
3103   }
3104   if (opcode[i]==0x2E) { // SWR
3105     emit_writebyte_indexed(tl,3,temp);
3106   }
3107   if (opcode[i]==0x2C) { // SDL
3108     emit_writeword_indexed(th,0,temp);
3109     if(rs2[i]) emit_mov(tl,temp2);
3110   }
3111   if (opcode[i]==0x2D) { // SDR
3112     emit_writebyte_indexed(tl,3,temp);
3113     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3114   }
3115   done0=(int)out;
3116   emit_jmp(0);
3117   // 1
3118   set_jump_target(case1,(int)out);
3119   if (opcode[i]==0x2A) { // SWL
3120     // Write 3 msb into three least significant bytes
3121     if(rs2[i]) emit_rorimm(tl,8,tl);
3122     emit_writehword_indexed(tl,-1,temp);
3123     if(rs2[i]) emit_rorimm(tl,16,tl);
3124     emit_writebyte_indexed(tl,1,temp);
3125     if(rs2[i]) emit_rorimm(tl,8,tl);
3126   }
3127   if (opcode[i]==0x2E) { // SWR
3128     // Write two lsb into two most significant bytes
3129     emit_writehword_indexed(tl,1,temp);
3130   }
3131   if (opcode[i]==0x2C) { // SDL
3132     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3133     // Write 3 msb into three least significant bytes
3134     if(rs2[i]) emit_rorimm(th,8,th);
3135     emit_writehword_indexed(th,-1,temp);
3136     if(rs2[i]) emit_rorimm(th,16,th);
3137     emit_writebyte_indexed(th,1,temp);
3138     if(rs2[i]) emit_rorimm(th,8,th);
3139   }
3140   if (opcode[i]==0x2D) { // SDR
3141     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3142     // Write two lsb into two most significant bytes
3143     emit_writehword_indexed(tl,1,temp);
3144   }
3145   done1=(int)out;
3146   emit_jmp(0);
3147   // 2
3148   set_jump_target(case2,(int)out);
3149   emit_testimm(temp,1);
3150   case3=(int)out;
3151   emit_jne(0);
3152   if (opcode[i]==0x2A) { // SWL
3153     // Write two msb into two least significant bytes
3154     if(rs2[i]) emit_rorimm(tl,16,tl);
3155     emit_writehword_indexed(tl,-2,temp);
3156     if(rs2[i]) emit_rorimm(tl,16,tl);
3157   }
3158   if (opcode[i]==0x2E) { // SWR
3159     // Write 3 lsb into three most significant bytes
3160     emit_writebyte_indexed(tl,-1,temp);
3161     if(rs2[i]) emit_rorimm(tl,8,tl);
3162     emit_writehword_indexed(tl,0,temp);
3163     if(rs2[i]) emit_rorimm(tl,24,tl);
3164   }
3165   if (opcode[i]==0x2C) { // SDL
3166     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3167     // Write two msb into two least significant bytes
3168     if(rs2[i]) emit_rorimm(th,16,th);
3169     emit_writehword_indexed(th,-2,temp);
3170     if(rs2[i]) emit_rorimm(th,16,th);
3171   }
3172   if (opcode[i]==0x2D) { // SDR
3173     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3174     // Write 3 lsb into three most significant bytes
3175     emit_writebyte_indexed(tl,-1,temp);
3176     if(rs2[i]) emit_rorimm(tl,8,tl);
3177     emit_writehword_indexed(tl,0,temp);
3178     if(rs2[i]) emit_rorimm(tl,24,tl);
3179   }
3180   done2=(int)out;
3181   emit_jmp(0);
3182   // 3
3183   set_jump_target(case3,(int)out);
3184   if (opcode[i]==0x2A) { // SWL
3185     // Write msb into least significant byte
3186     if(rs2[i]) emit_rorimm(tl,24,tl);
3187     emit_writebyte_indexed(tl,-3,temp);
3188     if(rs2[i]) emit_rorimm(tl,8,tl);
3189   }
3190   if (opcode[i]==0x2E) { // SWR
3191     // Write entire word
3192     emit_writeword_indexed(tl,-3,temp);
3193   }
3194   if (opcode[i]==0x2C) { // SDL
3195     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3196     // Write msb into least significant byte
3197     if(rs2[i]) emit_rorimm(th,24,th);
3198     emit_writebyte_indexed(th,-3,temp);
3199     if(rs2[i]) emit_rorimm(th,8,th);
3200   }
3201   if (opcode[i]==0x2D) { // SDR
3202     if(rs2[i]) emit_mov(th,temp2);
3203     // Write entire word
3204     emit_writeword_indexed(tl,-3,temp);
3205   }
3206   set_jump_target(done0,(int)out);
3207   set_jump_target(done1,(int)out);
3208   set_jump_target(done2,(int)out);
3209   if (opcode[i]==0x2C) { // SDL
3210     emit_testimm(temp,4);
3211     done0=(int)out;
3212     emit_jne(0);
3213     emit_andimm(temp,~3,temp);
3214     emit_writeword_indexed(temp2,4,temp);
3215     set_jump_target(done0,(int)out);
3216   }
3217   if (opcode[i]==0x2D) { // SDR
3218     emit_testimm(temp,4);
3219     done0=(int)out;
3220     emit_jeq(0);
3221     emit_andimm(temp,~3,temp);
3222     emit_writeword_indexed(temp2,-4,temp);
3223     set_jump_target(done0,(int)out);
3224   }
3225   if(!c||!memtarget)
3226     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3227   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3228     #ifdef RAM_OFFSET
3229     int map=get_reg(i_regs->regmap,ROREG);
3230     if(map<0) map=HOST_TEMPREG;
3231     gen_orig_addr_w(temp,map);
3232     #else
3233     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3234     #endif
3235     #if defined(HOST_IMM8)
3236     int ir=get_reg(i_regs->regmap,INVCP);
3237     assert(ir>=0);
3238     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3239     #else
3240     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3241     #endif
3242     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3243     emit_callne(invalidate_addr_reg[temp]);
3244     #else
3245     int jaddr2=(int)out;
3246     emit_jne(0);
3247     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3248     #endif
3249   }
3250   /*
3251     emit_pusha();
3252     //save_regs(0x100f);
3253         emit_readword((int)&last_count,ECX);
3254         if(get_reg(i_regs->regmap,CCREG)<0)
3255           emit_loadreg(CCREG,HOST_CCREG);
3256         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3257         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3258         emit_writeword(HOST_CCREG,(int)&Count);
3259     emit_call((int)memdebug);
3260     emit_popa();
3261     //restore_regs(0x100f);
3262   */
3263 }
3264
3265 void c1ls_assemble(int i,struct regstat *i_regs)
3266 {
3267   cop1_unusable(i, i_regs);
3268 }
3269
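// Emit code for LWC2/SWC2, the GTE (COP2) data register load/store.  The
// value is staged through the FTEMP host register and moved to or from the
// coprocessor register with cop2_put_dreg()/cop2_get_dreg().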
3270 void c2ls_assemble(int i,struct regstat *i_regs)
3271 {
3272   int s,tl;
3273   int ar;
3274   int offset;
3275   int memtarget=0,c=0;
3276   int jaddr2=0,type;
3277   int agr=AGEN1+(i&1);
3278   int fastio_reg_override=0;
3279   u_int hr,reglist=0;
3280   u_int copr=(source[i]>>16)&0x1f;
3281   s=get_reg(i_regs->regmap,rs1[i]);
3282   tl=get_reg(i_regs->regmap,FTEMP);
3283   offset=imm[i];
3284   assert(rs1[i]>0);
3285   assert(tl>=0);
3286
3287   for(hr=0;hr<HOST_REGS;hr++) {
3288     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3289   }
3290   if(i_regs->regmap[HOST_CCREG]==CCREG)
3291     reglist&=~(1<<HOST_CCREG);
3292
3293   // get the address
3294   if (opcode[i]==0x3a) { // SWC2
3295     ar=get_reg(i_regs->regmap,agr);
3296     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3297     reglist|=1<<ar;
3298   } else { // LWC2
3299     ar=tl;
3300   }
3301   if(s>=0) c=(i_regs->wasconst>>s)&1;
3302   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3303   if (!offset&&!c&&s>=0) ar=s;
3304   assert(ar>=0);
3305
3306   if (opcode[i]==0x3a) { // SWC2
3307     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3308     type=STOREW_STUB;
3309   }
3310   else
3311     type=LOADW_STUB;
3312
3313   if(c&&!memtarget) {
3314     jaddr2=(int)out;
3315     emit_jmp(0); // inline_readstub/inline_writestub?
3316   }
3317   else {
3318     if(!c) {
3319       jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
3320     }
3321     else if(ram_offset&&memtarget) {
3322       emit_addimm(ar,ram_offset,HOST_TEMPREG);
3323       fastio_reg_override=HOST_TEMPREG;
3324     }
3325     if (opcode[i]==0x32) { // LWC2
3326       #ifdef HOST_IMM_ADDR32
3327       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3328       else
3329       #endif
3330       int a=ar;
3331       if(fastio_reg_override) a=fastio_reg_override;
3332       emit_readword_indexed(0,a,tl);
3333     }
3334     if (opcode[i]==0x3a) { // SWC2
3335       #ifdef DESTRUCTIVE_SHIFT
3336       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3337       #endif
3338       int a=ar;
3339       if(fastio_reg_override) a=fastio_reg_override;
3340       emit_writeword_indexed(tl,0,a);
3341     }
3342   }
3343   if(jaddr2)
3344     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3345   if(opcode[i]==0x3a) // SWC2
3346   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3347 #if defined(HOST_IMM8)
3348     int ir=get_reg(i_regs->regmap,INVCP);
3349     assert(ir>=0);
3350     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3351 #else
3352     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3353 #endif
3354     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3355     emit_callne(invalidate_addr_reg[ar]);
3356     #else
3357     int jaddr3=(int)out;
3358     emit_jne(0);
3359     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3360     #endif
3361   }
3362   if (opcode[i]==0x32) { // LWC2
3363     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3364   }
3365 }
3366
3367 #ifndef multdiv_assemble
3368 void multdiv_assemble(int i,struct regstat *i_regs)
3369 {
3370   printf("Need multdiv_assemble for this architecture.\n");
3371   exit(1);
3372 }
3373 #endif
3374
3375 void mov_assemble(int i,struct regstat *i_regs)
3376 {
3377   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3378   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3379   if(rt1[i]) {
3380     signed char sh,sl,th,tl;
3381     th=get_reg(i_regs->regmap,rt1[i]|64);
3382     tl=get_reg(i_regs->regmap,rt1[i]);
3383     //assert(tl>=0);
3384     if(tl>=0) {
3385       sh=get_reg(i_regs->regmap,rs1[i]|64);
3386       sl=get_reg(i_regs->regmap,rs1[i]);
3387       if(sl>=0) emit_mov(sl,tl);
3388       else emit_loadreg(rs1[i],tl);
3389       if(th>=0) {
3390         if(sh>=0) emit_mov(sh,th);
3391         else emit_loadreg(rs1[i]|64,th);
3392       }
3393     }
3394   }
3395 }
3396
3397 #ifndef fconv_assemble
3398 void fconv_assemble(int i,struct regstat *i_regs)
3399 {
3400   printf("Need fconv_assemble for this architecture.\n");
3401   exit(1);
3402 }
3403 #endif
3404
3405 #if 0
3406 void float_assemble(int i,struct regstat *i_regs)
3407 {
3408   printf("Need float_assemble for this architecture.\n");
3409   exit(1);
3410 }
3411 #endif
3412
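// SYSCALL/HLECALL/INTCALL end the block: they load the PC, add the
// accumulated cycle count to CCREG and jump out to the corresponding handler
// (jump_syscall_hle, jump_hlecall, jump_intcall).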
3413 void syscall_assemble(int i,struct regstat *i_regs)
3414 {
3415   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3416   assert(ccreg==HOST_CCREG);
3417   assert(!is_delayslot);
3418   (void)ccreg;
3419   emit_movimm(start+i*4,EAX); // Get PC
3420   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3421   emit_jmp((int)jump_syscall_hle); // XXX
3422 }
3423
3424 void hlecall_assemble(int i,struct regstat *i_regs)
3425 {
3426   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3427   assert(ccreg==HOST_CCREG);
3428   assert(!is_delayslot);
3429   (void)ccreg;
3430   emit_movimm(start+i*4+4,0); // Get PC
3431   emit_movimm((int)psxHLEt[source[i]&7],1);
3432   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
3433   emit_jmp((int)jump_hlecall);
3434 }
3435
3436 void intcall_assemble(int i,struct regstat *i_regs)
3437 {
3438   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3439   assert(ccreg==HOST_CCREG);
3440   assert(!is_delayslot);
3441   (void)ccreg;
3442   emit_movimm(start+i*4,0); // Get PC
3443   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
3444   emit_jmp((int)jump_intcall);
3445 }
3446
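// Assemble the instruction in a branch delay slot: set is_delayslot so the
// per-type assemblers can account for it, then dispatch on itype as usual.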
3447 void ds_assemble(int i,struct regstat *i_regs)
3448 {
3449   speculate_register_values(i);
3450   is_delayslot=1;
3451   switch(itype[i]) {
3452     case ALU:
3453       alu_assemble(i,i_regs);break;
3454     case IMM16:
3455       imm16_assemble(i,i_regs);break;
3456     case SHIFT:
3457       shift_assemble(i,i_regs);break;
3458     case SHIFTIMM:
3459       shiftimm_assemble(i,i_regs);break;
3460     case LOAD:
3461       load_assemble(i,i_regs);break;
3462     case LOADLR:
3463       loadlr_assemble(i,i_regs);break;
3464     case STORE:
3465       store_assemble(i,i_regs);break;
3466     case STORELR:
3467       storelr_assemble(i,i_regs);break;
3468     case COP0:
3469       cop0_assemble(i,i_regs);break;
3470     case COP1:
3471       cop1_assemble(i,i_regs);break;
3472     case C1LS:
3473       c1ls_assemble(i,i_regs);break;
3474     case COP2:
3475       cop2_assemble(i,i_regs);break;
3476     case C2LS:
3477       c2ls_assemble(i,i_regs);break;
3478     case C2OP:
3479       c2op_assemble(i,i_regs);break;
3480     case FCONV:
3481       fconv_assemble(i,i_regs);break;
3482     case FLOAT:
3483       float_assemble(i,i_regs);break;
3484     case FCOMP:
3485       fcomp_assemble(i,i_regs);break;
3486     case MULTDIV:
3487       multdiv_assemble(i,i_regs);break;
3488     case MOV:
3489       mov_assemble(i,i_regs);break;
3490     case SYSCALL:
3491     case HLECALL:
3492     case INTCALL:
3493     case SPAN:
3494     case UJUMP:
3495     case RJUMP:
3496     case CJUMP:
3497     case SJUMP:
3498     case FJUMP:
3499       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
3500   }
3501   is_delayslot=0;
3502 }
3503
3504 // Is the branch target a valid internal jump?
3505 int internal_branch(uint64_t i_is32,int addr)
3506 {
3507   if(addr&1) return 0; // Indirect (register) jump
3508   if(addr>=start && addr<start+slen*4-4)
3509   {
3510     //int t=(addr-start)>>2;
3511     // Delay slots are not valid branch targets
3512     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3513     // 64 -> 32 bit transition requires a recompile
3514     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3515     {
3516       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3517       else printf("optimizable: yes\n");
3518     }*/
3519     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3520     return 1;
3521   }
3522   return 0;
3523 }
3524
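// Write back dirty host registers whose guest mapping changes between 'pre'
// and 'entry' and whose value is not marked unneeded (u/uu), then move any
// values that simply end up in a different host register.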
3525 #ifndef wb_invalidate
3526 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3527   uint64_t u,uint64_t uu)
3528 {
3529   int hr;
3530   for(hr=0;hr<HOST_REGS;hr++) {
3531     if(hr!=EXCLUDE_REG) {
3532       if(pre[hr]!=entry[hr]) {
3533         if(pre[hr]>=0) {
3534           if((dirty>>hr)&1) {
3535             if(get_reg(entry,pre[hr])<0) {
3536               if(pre[hr]<64) {
3537                 if(!((u>>pre[hr])&1)) {
3538                   emit_storereg(pre[hr],hr);
3539                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3540                     emit_sarimm(hr,31,hr);
3541                     emit_storereg(pre[hr]|64,hr);
3542                   }
3543                 }
3544               }else{
3545                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3546                   emit_storereg(pre[hr],hr);
3547                 }
3548               }
3549             }
3550           }
3551         }
3552       }
3553     }
3554   }
3555   // Move from one register to another (no writeback)
3556   for(hr=0;hr<HOST_REGS;hr++) {
3557     if(hr!=EXCLUDE_REG) {
3558       if(pre[hr]!=entry[hr]) {
3559         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3560           int nr;
3561           if((nr=get_reg(entry,pre[hr]))>=0) {
3562             emit_mov(hr,nr);
3563           }
3564         }
3565       }
3566     }
3567   }
3568 }
3569 #endif
3570
3571 // Load the specified registers
3572 // This only loads the registers given as arguments because
3573 // we don't want to load things that will be overwritten
3574 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3575 {
3576   int hr;
3577   // Load 32-bit regs
3578   for(hr=0;hr<HOST_REGS;hr++) {
3579     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3580       if(entry[hr]!=regmap[hr]) {
3581         if(regmap[hr]==rs1||regmap[hr]==rs2)
3582         {
3583           if(regmap[hr]==0) {
3584             emit_zeroreg(hr);
3585           }
3586           else
3587           {
3588             emit_loadreg(regmap[hr],hr);
3589           }
3590         }
3591       }
3592     }
3593   }
3594   // Load 64-bit regs
3595   for(hr=0;hr<HOST_REGS;hr++) {
3596     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3597       if(entry[hr]!=regmap[hr]) {
3598         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3599         {
3600           assert(regmap[hr]!=64);
3601           if((is32>>(regmap[hr]&63))&1) {
3602             int lr=get_reg(regmap,regmap[hr]-64);
3603             if(lr>=0)
3604               emit_sarimm(lr,31,hr);
3605             else
3606               emit_loadreg(regmap[hr],hr);
3607           }
3608           else
3609           {
3610             emit_loadreg(regmap[hr],hr);
3611           }
3612         }
3613       }
3614     }
3615   }
3616 }
3617
3618 // Load registers prior to the start of a loop
3619 // so that they are not loaded within the loop
3620 static void loop_preload(signed char pre[],signed char entry[])
3621 {
3622   int hr;
3623   for(hr=0;hr<HOST_REGS;hr++) {
3624     if(hr!=EXCLUDE_REG) {
3625       if(pre[hr]!=entry[hr]) {
3626         if(entry[hr]>=0) {
3627           if(get_reg(pre,entry[hr])<0) {
3628             assem_debug("loop preload:\n");
3629             //printf("loop preload: %d\n",hr);
3630             if(entry[hr]==0) {
3631               emit_zeroreg(hr);
3632             }
3633             else if(entry[hr]<TEMPREG)
3634             {
3635               emit_loadreg(entry[hr],hr);
3636             }
3637             else if(entry[hr]-64<TEMPREG)
3638             {
3639               emit_loadreg(entry[hr],hr);
3640             }
3641           }
3642         }
3643       }
3644     }
3645   }
3646 }
3647
3648 // Generate address for load/store instruction
3649 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
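// The AGEN temporary alternates with the instruction index (AGEN1+(i&1)) so
// the address for the following instruction can be preloaded, as done in the
// second half of this function.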
3650 void address_generation(int i,struct regstat *i_regs,signed char entry[])
3651 {
3652   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
3653     int ra=-1;
3654     int agr=AGEN1+(i&1);
3655     if(itype[i]==LOAD) {
3656       ra=get_reg(i_regs->regmap,rt1[i]);
3657       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3658       assert(ra>=0);
3659     }
3660     if(itype[i]==LOADLR) {
3661       ra=get_reg(i_regs->regmap,FTEMP);
3662     }
3663     if(itype[i]==STORE||itype[i]==STORELR) {
3664       ra=get_reg(i_regs->regmap,agr);
3665       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3666     }
3667     if(itype[i]==C1LS||itype[i]==C2LS) {
3668       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
3669         ra=get_reg(i_regs->regmap,FTEMP);
3670       else { // SWC1/SDC1/SWC2/SDC2
3671         ra=get_reg(i_regs->regmap,agr);
3672         if(ra<0) ra=get_reg(i_regs->regmap,-1);
3673       }
3674     }
3675     int rs=get_reg(i_regs->regmap,rs1[i]);
3676     if(ra>=0) {
3677       int offset=imm[i];
3678       int c=(i_regs->wasconst>>rs)&1;
3679       if(rs1[i]==0) {
3680         // Using r0 as a base address
3681         if(!entry||entry[ra]!=agr) {
3682           if (opcode[i]==0x22||opcode[i]==0x26) {
3683             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3684           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3685             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3686           }else{
3687             emit_movimm(offset,ra);
3688           }
3689         } // else did it in the previous cycle
3690       }
3691       else if(rs<0) {
3692         if(!entry||entry[ra]!=rs1[i])
3693           emit_loadreg(rs1[i],ra);
3694         //if(!entry||entry[ra]!=rs1[i])
3695         //  printf("poor load scheduling!\n");
3696       }
3697       else if(c) {
3698         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
3699           if(!entry||entry[ra]!=agr) {
3700             if (opcode[i]==0x22||opcode[i]==0x26) {
3701               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3702             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3703               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3704             }else{
3705               #ifdef HOST_IMM_ADDR32
3706               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3707               #endif
3708               emit_movimm(constmap[i][rs]+offset,ra);
3709               regs[i].loadedconst|=1<<ra;
3710             }
3711           } // else did it in the previous cycle
3712         } // else load_consts already did it
3713       }
3714       if(offset&&!c&&rs1[i]) {
3715         if(rs>=0) {
3716           emit_addimm(rs,offset,ra);
3717         }else{
3718           emit_addimm(ra,offset,ra);
3719         }
3720       }
3721     }
3722   }
3723   // Preload constants for next instruction
3724   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
3725     int agr,ra;
3726     // Actual address
3727     agr=AGEN1+((i+1)&1);
3728     ra=get_reg(i_regs->regmap,agr);
3729     if(ra>=0) {
3730       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3731       int offset=imm[i+1];
3732       int c=(regs[i+1].wasconst>>rs)&1;
3733       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
3734         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3735           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3736         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3737           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3738         }else{
3739           #ifdef HOST_IMM_ADDR32
3740           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3741           #endif
3742           emit_movimm(constmap[i+1][rs]+offset,ra);
3743           regs[i+1].loadedconst|=1<<ra;
3744         }
3745       }
3746       else if(rs1[i+1]==0) {
3747         // Using r0 as a base address
3748         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3749           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3750         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3751           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3752         }else{
3753           emit_movimm(offset,ra);
3754         }
3755       }
3756     }
3757   }
3758 }
3759
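// Follow a constant forward through the block: while the same register keeps
// the same constant mapping and no branch target intervenes, return the last
// value it needs to hold so one immediate load can cover the whole run.  Also
// folds the immediate of a directly following load so its address can be
// precomputed.  Returns 0 if the value turns out to be unneeded afterwards.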
3760 static int get_final_value(int hr, int i, int *value)
3761 {
3762   int reg=regs[i].regmap[hr];
3763   while(i<slen-1) {
3764     if(regs[i+1].regmap[hr]!=reg) break;
3765     if(!((regs[i+1].isconst>>hr)&1)) break;
3766     if(bt[i+1]) break;
3767     i++;
3768   }
3769   if(i<slen-1) {
3770     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
3771       *value=constmap[i][hr];
3772       return 1;
3773     }
3774     if(!bt[i+1]) {
3775       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
3776         // Load in delay slot, out-of-order execution
3777         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
3778         {
3779           // Precompute load address
3780           *value=constmap[i][hr]+imm[i+2];
3781           return 1;
3782         }
3783       }
3784       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
3785       {
3786         // Precompute load address
3787         *value=constmap[i][hr]+imm[i+1];
3788         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
3789         return 1;
3790       }
3791     }
3792   }
3793   *value=constmap[i][hr];
3794   //printf("c=%x\n",(int)constmap[i][hr]);
3795   if(i==slen-1) return 1;
3796   if(reg<64) {
3797     return !((unneeded_reg[i+1]>>reg)&1);
3798   }else{
3799     return !((unneeded_reg_upper[i+1]>>reg)&1);
3800   }
3801 }
3802
3803 // Load registers with known constants
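// Constants still present from the previous instruction are tracked in
// loadedconst and skipped.  If another host register already holds a similar
// value, emit_movimm_from is used, presumably deriving the new constant from
// the existing one rather than materialising a full immediate.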
3804 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
3805 {
3806   int hr,hr2;
3807   // propagate loaded constant flags
3808   if(i==0||bt[i])
3809     regs[i].loadedconst=0;
3810   else {
3811     for(hr=0;hr<HOST_REGS;hr++) {
3812       if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
3813          &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
3814       {
3815         regs[i].loadedconst|=1<<hr;
3816       }
3817     }
3818   }
3819   // Load 32-bit regs
3820   for(hr=0;hr<HOST_REGS;hr++) {
3821     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3822       //if(entry[hr]!=regmap[hr]) {
3823       if(!((regs[i].loadedconst>>hr)&1)) {
3824         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3825           int value,similar=0;
3826           if(get_final_value(hr,i,&value)) {
3827             // see if some other register has similar value
3828             for(hr2=0;hr2<HOST_REGS;hr2++) {
3829               if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
3830                 if(is_similar_value(value,constmap[i][hr2])) {
3831                   similar=1;
3832                   break;
3833                 }
3834               }
3835             }
3836             if(similar) {
3837               int value2;
3838               if(get_final_value(hr2,i,&value2)) // is this needed?
3839                 emit_movimm_from(value2,hr2,value,hr);
3840               else
3841                 emit_movimm(value,hr);
3842             }
3843             else if(value==0) {
3844               emit_zeroreg(hr);
3845             }
3846             else {
3847               emit_movimm(value,hr);
3848             }
3849           }
3850           regs[i].loadedconst|=1<<hr;
3851         }
3852       }
3853     }
3854   }
3855   // Load 64-bit regs
3856   for(hr=0;hr<HOST_REGS;hr++) {
3857     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3858       //if(entry[hr]!=regmap[hr]) {
3859       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3860         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3861           if((is32>>(regmap[hr]&63))&1) {
3862             int lr=get_reg(regmap,regmap[hr]-64);
3863             assert(lr>=0);
3864             emit_sarimm(lr,31,hr);
3865           }
3866           else
3867           {
3868             int value;
3869             if(get_final_value(hr,i,&value)) {
3870               if(value==0) {
3871                 emit_zeroreg(hr);
3872               }
3873               else {
3874                 emit_movimm(value,hr);
3875               }
3876             }
3877           }
3878         }
3879       }
3880     }
3881   }
3882 }
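
// Load every constant-valued register that is marked dirty in the given map,
// without the loadedconst bookkeeping used by load_consts above.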
3883 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
3884 {
3885   int hr;
3886   // Load 32-bit regs
3887   for(hr=0;hr<HOST_REGS;hr++) {
3888     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3889       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3890         int value=constmap[i][hr];
3891         if(value==0) {
3892           emit_zeroreg(hr);
3893         }
3894         else {
3895           emit_movimm(value,hr);
3896         }
3897       }
3898     }
3899   }
3900   // Load 64-bit regs
3901   for(hr=0;hr<HOST_REGS;hr++) {
3902     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3903       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3904         if((is32>>(regmap[hr]&63))&1) {
3905           int lr=get_reg(regmap,regmap[hr]-64);
3906           assert(lr>=0);
3907           emit_sarimm(lr,31,hr);
3908         }
3909         else
3910         {
3911           int value=constmap[i][hr];
3912           if(value==0) {
3913             emit_zeroreg(hr);
3914           }
3915           else {
3916             emit_movimm(value,hr);
3917           }
3918         }
3919       }
3920     }
3921   }
3922 }
3923
3924 // Write out all dirty registers (except cycle count)
3925 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
3926 {
3927   int hr;
3928   for(hr=0;hr<HOST_REGS;hr++) {
3929     if(hr!=EXCLUDE_REG) {
3930       if(i_regmap[hr]>0) {
3931         if(i_regmap[hr]!=CCREG) {
3932           if((i_dirty>>hr)&1) {
3933             if(i_regmap[hr]<64) {
3934               emit_storereg(i_regmap[hr],hr);
3935             }else{
3936               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
3937                 emit_storereg(i_regmap[hr],hr);
3938               }
3939             }
3940           }
3941         }
3942       }
3943     }
3944   }
3945 }
3946 // Write out dirty registers that we need to reload (pair with load_needed_regs)
3947 // This writes the registers not written by store_regs_bt
3948 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
3949 {
3950   int hr;
3951   int t=(addr-start)>>2;
3952   for(hr=0;hr<HOST_REGS;hr++) {
3953     if(hr!=EXCLUDE_REG) {
3954       if(i_regmap[hr]>0) {
3955         if(i_regmap[hr]!=CCREG) {
3956           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
3957             if((i_dirty>>hr)&1) {
3958               if(i_regmap[hr]<64) {
3959                 emit_storereg(i_regmap[hr],hr);
3960               }else{
3961                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
3962                   emit_storereg(i_regmap[hr],hr);
3963                 }
3964               }
3965             }
3966           }
3967         }
3968       }
3969     }
3970   }
3971 }
3972
3973 // Load all registers (except cycle count)
3974 void load_all_regs(signed char i_regmap[])
3975 {
3976   int hr;
3977   for(hr=0;hr<HOST_REGS;hr++) {
3978     if(hr!=EXCLUDE_REG) {
3979       if(i_regmap[hr]==0) {
3980         emit_zeroreg(hr);
3981       }
3982       else
3983       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
3984       {
3985         emit_loadreg(i_regmap[hr],hr);
3986       }
3987     }
3988   }
3989 }
3990
3991 // Load all current registers also needed by next instruction
3992 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
3993 {
3994   int hr;
3995   for(hr=0;hr<HOST_REGS;hr++) {
3996     if(hr!=EXCLUDE_REG) {
3997       if(get_reg(next_regmap,i_regmap[hr])>=0) {
3998         if(i_regmap[hr]==0) {
3999           emit_zeroreg(hr);
4000         }
4001         else
4002         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4003         {
4004           emit_loadreg(i_regmap[hr],hr);
4005         }
4006       }
4007     }
4008   }
4009 }
4010
4011 // Load all regs, storing cycle count if necessary
4012 void load_regs_entry(int t)
4013 {
4014   int hr;
4015   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
4016   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
4017   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4018     emit_storereg(CCREG,HOST_CCREG);
4019   }
4020   // Load 32-bit regs
4021   for(hr=0;hr<HOST_REGS;hr++) {
4022     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4023       if(regs[t].regmap_entry[hr]==0) {
4024         emit_zeroreg(hr);
4025       }
4026       else if(regs[t].regmap_entry[hr]!=CCREG)
4027       {
4028         emit_loadreg(regs[t].regmap_entry[hr],hr);
4029       }
4030     }
4031   }
4032   // Load 64-bit regs
4033   for(hr=0;hr<HOST_REGS;hr++) {
4034     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4035       assert(regs[t].regmap_entry[hr]!=64);
4036       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4037         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4038         if(lr<0) {
4039           emit_loadreg(regs[t].regmap_entry[hr],hr);
4040         }
4041         else
4042         {
4043           emit_sarimm(lr,31,hr);
4044         }
4045       }
4046       else
4047       {
4048         emit_loadreg(regs[t].regmap_entry[hr],hr);
4049       }
4050     }
4051   }
4052 }
4053
4054 // Store dirty registers prior to branch
4055 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4056 {
4057   if(internal_branch(i_is32,addr))
4058   {
4059     int t=(addr-start)>>2;
4060     int hr;
4061     for(hr=0;hr<HOST_REGS;hr++) {
4062       if(hr!=EXCLUDE_REG) {
4063         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4064           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4065             if((i_dirty>>hr)&1) {
4066               if(i_regmap[hr]<64) {
4067                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4068                   emit_storereg(i_regmap[hr],hr);
4069                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4070                     #ifdef DESTRUCTIVE_WRITEBACK
4071                     emit_sarimm(hr,31,hr);
4072                     emit_storereg(i_regmap[hr]|64,hr);
4073                     #else
4074                     emit_sarimm(hr,31,HOST_TEMPREG);
4075                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4076                     #endif
4077                   }
4078                 }
4079               }else{
4080                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4081                   emit_storereg(i_regmap[hr],hr);
4082                 }
4083               }
4084             }
4085           }
4086         }
4087       }
4088     }
4089   }
4090   else
4091   {
4092     // Branch out of this block, write out all dirty regs
4093     wb_dirtys(i_regmap,i_is32,i_dirty);
4094   }
4095 }
4096
4097 // Load all needed registers for branch target
4098 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4099 {
4100   //if(addr>=start && addr<(start+slen*4))
4101   if(internal_branch(i_is32,addr))
4102   {
4103     int t=(addr-start)>>2;
4104     int hr;
4105     // Store the cycle count before loading something else
4106     if(i_regmap[HOST_CCREG]!=CCREG) {
4107       assert(i_regmap[HOST_CCREG]==-1);
4108     }
4109     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4110       emit_storereg(CCREG,HOST_CCREG);
4111     }
4112     // Load 32-bit regs
4113     for(hr=0;hr<HOST_REGS;hr++) {
4114       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4115         #ifdef DESTRUCTIVE_WRITEBACK
4116         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4117         #else
4118         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4119         #endif
4120           if(regs[t].regmap_entry[hr]==0) {
4121             emit_zeroreg(hr);
4122           }
4123           else if(regs[t].regmap_entry[hr]!=CCREG)
4124           {
4125             emit_loadreg(regs[t].regmap_entry[hr],hr);
4126           }
4127         }
4128       }
4129     }
4130     // Load 64-bit regs
4131     for(hr=0;hr<HOST_REGS;hr++) {
4132       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4133         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4134           assert(regs[t].regmap_entry[hr]!=64);
4135           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4136             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4137             if(lr<0) {
4138               emit_loadreg(regs[t].regmap_entry[hr],hr);
4139             }
4140             else
4141             {
4142               emit_sarimm(lr,31,hr);
4143             }
4144           }
4145           else
4146           {
4147             emit_loadreg(regs[t].regmap_entry[hr],hr);
4148           }
4149         }
4150         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4151           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4152           assert(lr>=0);
4153           emit_sarimm(lr,31,hr);
4154         }
4155       }
4156     }
4157   }
4158 }
4159
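// Check whether the current register state (mapping, dirty bits, 32/64-bit
// status) is compatible with the entry state recorded for the branch target,
// i.e. whether we can jump there without further writeback or reloads.
// Targets outside this block only match if nothing besides the cycle count is
// dirty, and delay-slot targets never match.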
4160 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4161 {
4162   if(addr>=start && addr<start+slen*4-4)
4163   {
4164     int t=(addr-start)>>2;
4165     int hr;
4166     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4167     for(hr=0;hr<HOST_REGS;hr++)
4168     {
4169       if(hr!=EXCLUDE_REG)
4170       {
4171         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4172         {
4173           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4174           {
4175             return 0;
4176           }
4177           else
4178           if((i_dirty>>hr)&1)
4179           {
4180             if(i_regmap[hr]<TEMPREG)
4181             {
4182               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4183                 return 0;
4184             }
4185             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4186             {
4187               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4188                 return 0;
4189             }
4190           }
4191         }
4192         else // Same register but is it 32-bit or dirty?
4193         if(i_regmap[hr]>=0)
4194         {
4195           if(!((regs[t].dirty>>hr)&1))
4196           {
4197             if((i_dirty>>hr)&1)
4198             {
4199               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4200               {
4201                 //printf("%x: dirty no match\n",addr);
4202                 return 0;
4203               }
4204             }
4205           }
4206           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4207           {
4208             //printf("%x: is32 no match\n",addr);
4209             return 0;
4210           }
4211         }
4212       }
4213     }
4214     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4215     // Delay slots are not valid branch targets
4216     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4217     // Delay slots require additional processing, so do not match
4218     if(is_ds[t]) return 0;
4219   }
4220   else
4221   {
4222     int hr;
4223     for(hr=0;hr<HOST_REGS;hr++)
4224     {
4225       if(hr!=EXCLUDE_REG)
4226       {
4227         if(i_regmap[hr]>=0)
4228         {
4229           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4230           {
4231             if((i_dirty>>hr)&1)
4232             {
4233               return 0;
4234             }
4235           }
4236         }
4237       }
4238     }
4239   }
4240   return 1;
4241 }
4242
4243 // Used when a branch jumps into the delay slot of another branch
4244 void ds_assemble_entry(int i)
4245 {
4246   int t=(ba[i]-start)>>2;
4247   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4248   assem_debug("Assemble delay slot at %x\n",ba[i]);
4249   assem_debug("<->\n");
4250   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4251     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4252   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4253   address_generation(t,&regs[t],regs[t].regmap_entry);
4254   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4255     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4256   cop1_usable=0;
4257   is_delayslot=0;
4258   switch(itype[t]) {
4259     case ALU:
4260       alu_assemble(t,&regs[t]);break;
4261     case IMM16:
4262       imm16_assemble(t,&regs[t]);break;
4263     case SHIFT:
4264       shift_assemble(t,&regs[t]);break;
4265     case SHIFTIMM:
4266       shiftimm_assemble(t,&regs[t]);break;
4267     case LOAD:
4268       load_assemble(t,&regs[t]);break;
4269     case LOADLR:
4270       loadlr_assemble(t,&regs[t]);break;
4271     case STORE:
4272       store_assemble(t,&regs[t]);break;
4273     case STORELR:
4274       storelr_assemble(t,&regs[t]);break;
4275     case COP0:
4276       cop0_assemble(t,&regs[t]);break;
4277     case COP1:
4278       cop1_assemble(t,&regs[t]);break;
4279     case C1LS:
4280       c1ls_assemble(t,&regs[t]);break;
4281     case COP2:
4282       cop2_assemble(t,&regs[t]);break;
4283     case C2LS:
4284       c2ls_assemble(t,&regs[t]);break;
4285     case C2OP:
4286       c2op_assemble(t,&regs[t]);break;
4287     case FCONV:
4288       fconv_assemble(t,&regs[t]);break;
4289     case FLOAT:
4290       float_assemble(t,&regs[t]);break;
4291     case FCOMP:
4292       fcomp_assemble(t,&regs[t]);break;
4293     case MULTDIV:
4294       multdiv_assemble(t,&regs[t]);break;
4295     case MOV:
4296       mov_assemble(t,&regs[t]);break;
4297     case SYSCALL:
4298     case HLECALL:
4299     case INTCALL:
4300     case SPAN:
4301     case UJUMP:
4302     case RJUMP:
4303     case CJUMP:
4304     case SJUMP:
4305     case FJUMP:
4306       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
4307   }
4308   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4309   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4310   if(internal_branch(regs[t].is32,ba[i]+4))
4311     assem_debug("branch: internal\n");
4312   else
4313     assem_debug("branch: external\n");
4314   assert(internal_branch(regs[t].is32,ba[i]+4));
4315   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4316   emit_jmp(0);
4317 }
4318
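// Emit the cycle-count check for a branch.  Simple idle loops (a branch to
// itself with a NOP in the delay slot) are special-cased; otherwise the
// adjusted count is added or compared and a conditional jump to a CC_STUB is
// emitted, which ends up in cc_interrupt when the count runs out.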
4319 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4320 {
4321   int count;
4322   int jaddr;
4323   int idle=0;
4324   int t=0;
4325   if(itype[i]==RJUMP)
4326   {
4327     *adj=0;
4328   }
4329   //if(ba[i]>=start && ba[i]<(start+slen*4))
4330   if(internal_branch(branch_regs[i].is32,ba[i]))
4331   {
4332     t=(ba[i]-start)>>2;
4333     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4334     else *adj=ccadj[t];
4335   }
4336   else
4337   {
4338     *adj=0;
4339   }
4340   count=ccadj[i];
4341   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4342     // Idle loop
4343     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4344     idle=(int)out;
4345     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4346     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4347     jaddr=(int)out;
4348     emit_jmp(0);
4349   }
4350   else if(*adj==0||invert) {
4351     int cycles=CLOCK_ADJUST(count+2);
4352     // faster loop HACK
4353     if (t&&*adj) {
4354       int rel=t-i;
4355       if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
4356         cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
4357     }
4358     emit_addimm_and_set_flags(cycles,HOST_CCREG);
4359     jaddr=(int)out;
4360     emit_jns(0);
4361   }
4362   else
4363   {
4364     emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
4365     jaddr=(int)out;
4366     emit_jns(0);
4367   }
4368   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4369 }
4370
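// Out-of-line path taken when the cycle count expires at a branch: write back
// dirty registers, store the return PC (re-evaluating the branch condition
// when the target was not known at compile time), call cc_interrupt, then
// reload what the resume path needs and jump back into compiled code.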
4371 void do_ccstub(int n)
4372 {
4373   literal_pool(256);
4374   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4375   set_jump_target(stubs[n][1],(int)out);
4376   int i=stubs[n][4];
4377   if(stubs[n][6]==NULLDS) {
4378     // Delay slot instruction is nullified ("likely" branch)
4379     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4380   }
4381   else if(stubs[n][6]!=TAKEN) {
4382     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4383   }
4384   else {
4385     if(internal_branch(branch_regs[i].is32,ba[i]))
4386       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4387   }
4388   if(stubs[n][5]!=-1)
4389   {
4390     // Save PC as return address
4391     emit_movimm(stubs[n][5],EAX);
4392     emit_writeword(EAX,(int)&pcaddr);
4393   }
4394   else
4395   {
4396     // Return address depends on which way the branch goes
4397     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4398     {
4399       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4400       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4401       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4402       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4403       if(rs1[i]==0)
4404       {
4405         s1l=s2l;s1h=s2h;
4406         s2l=s2h=-1;
4407       }
4408       else if(rs2[i]==0)
4409       {
4410         s2l=s2h=-1;
4411       }
4412       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4413         s1h=s2h=-1;
4414       }
4415       assert(s1l>=0);
4416       #ifdef DESTRUCTIVE_WRITEBACK
4417       if(rs1[i]) {
4418         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4419           emit_loadreg(rs1[i],s1l);
4420       }
4421       else {
4422         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4423           emit_loadreg(rs2[i],s1l);
4424       }
4425       if(s2l>=0)
4426         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4427           emit_loadreg(rs2[i],s2l);
4428       #endif
4429       int hr=0;
4430       int addr=-1,alt=-1,ntaddr=-1;
4431       while(hr<HOST_REGS)
4432       {
4433         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4434            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4435            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4436         {
4437           addr=hr++;break;
4438         }
4439         hr++;
4440       }
4441       while(hr<HOST_REGS)
4442       {
4443         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4444            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4445            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4446         {
4447           alt=hr++;break;
4448         }
4449         hr++;
4450       }
4451       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4452       {
4453         while(hr<HOST_REGS)
4454         {
4455           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4456              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4457              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4458           {
4459             ntaddr=hr;break;
4460           }
4461           hr++;
4462         }
4463         assert(hr<HOST_REGS);
4464       }
4465       if((opcode[i]&0x2f)==4) // BEQ
4466       {
4467         #ifdef HAVE_CMOV_IMM
4468         if(s1h<0) {
4469           if(s2l>=0) emit_cmp(s1l,s2l);
4470           else emit_test(s1l,s1l);
4471           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4472         }
4473         else
4474         #endif
4475         {
4476           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4477           if(s1h>=0) {
4478             if(s2h>=0) emit_cmp(s1h,s2h);
4479             else emit_test(s1h,s1h);
4480             emit_cmovne_reg(alt,addr);
4481           }
4482           if(s2l>=0) emit_cmp(s1l,s2l);
4483           else emit_test(s1l,s1l);
4484           emit_cmovne_reg(alt,addr);
4485         }
4486       }
4487       if((opcode[i]&0x2f)==5) // BNE
4488       {
4489         #ifdef HAVE_CMOV_IMM
4490         if(s1h<0) {
4491           if(s2l>=0) emit_cmp(s1l,s2l);
4492           else emit_test(s1l,s1l);
4493           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4494         }
4495         else
4496         #endif
4497         {
4498           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4499           if(s1h>=0) {
4500             if(s2h>=0) emit_cmp(s1h,s2h);
4501             else emit_test(s1h,s1h);
4502             emit_cmovne_reg(alt,addr);
4503           }
4504           if(s2l>=0) emit_cmp(s1l,s2l);
4505           else emit_test(s1l,s1l);
4506           emit_cmovne_reg(alt,addr);
4507         }
4508       }
4509       if((opcode[i]&0x2f)==6) // BLEZ
4510       {
4511         //emit_movimm(ba[i],alt);
4512         //emit_movimm(start+i*4+8,addr);
4513         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4514         emit_cmpimm(s1l,1);
4515         if(s1h>=0) emit_mov(addr,ntaddr);
4516         emit_cmovl_reg(alt,addr);
4517         if(s1h>=0) {
4518           emit_test(s1h,s1h);
4519           emit_cmovne_reg(ntaddr,addr);
4520           emit_cmovs_reg(alt,addr);
4521         }
4522       }
4523       if((opcode[i]&0x2f)==7) // BGTZ
4524       {
4525         //emit_movimm(ba[i],addr);
4526         //emit_movimm(start+i*4+8,ntaddr);
4527         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4528         emit_cmpimm(s1l,1);
4529         if(s1h>=0) emit_mov(addr,alt);
4530         emit_cmovl_reg(ntaddr,addr);
4531         if(s1h>=0) {
4532           emit_test(s1h,s1h);
4533           emit_cmovne_reg(alt,addr);
4534           emit_cmovs_reg(ntaddr,addr);
4535         }
4536       }
4537       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4538       {
4539         //emit_movimm(ba[i],alt);
4540         //emit_movimm(start+i*4+8,addr);
4541         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4542         if(s1h>=0) emit_test(s1h,s1h);
4543         else emit_test(s1l,s1l);
4544         emit_cmovs_reg(alt,addr);
4545       }
4546       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4547       {
4548         //emit_movimm(ba[i],addr);
4549         //emit_movimm(start+i*4+8,alt);
4550         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4551         if(s1h>=0) emit_test(s1h,s1h);
4552         else emit_test(s1l,s1l);
4553         emit_cmovs_reg(alt,addr);
4554       }
4555       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4556         if(source[i]&0x10000) // BC1T
4557         {
4558           //emit_movimm(ba[i],alt);
4559           //emit_movimm(start+i*4+8,addr);
4560           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4561           emit_testimm(s1l,0x800000);
4562           emit_cmovne_reg(alt,addr);
4563         }
4564         else // BC1F
4565         {
4566           //emit_movimm(ba[i],addr);
4567           //emit_movimm(start+i*4+8,alt);
4568           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4569           emit_testimm(s1l,0x800000);
4570           emit_cmovne_reg(alt,addr);
4571         }
4572       }
4573       emit_writeword(addr,(int)&pcaddr);
4574     }
4575     else
4576     if(itype[i]==RJUMP)
4577     {
4578       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4579       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4580         r=get_reg(branch_regs[i].regmap,RTEMP);
4581       }
4582       emit_writeword(r,(int)&pcaddr);
4583     }
4584     else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);}
4585   }
4586   // Update cycle count
4587   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4588   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
4589   emit_call((int)cc_interrupt);
4590   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
4591   if(stubs[n][6]==TAKEN) {
4592     if(internal_branch(branch_regs[i].is32,ba[i]))
4593       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4594     else if(itype[i]==RJUMP) {
4595       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4596         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4597       else
4598         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4599     }
4600   }else if(stubs[n][6]==NOTTAKEN) {
4601     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4602     else load_all_regs(branch_regs[i].regmap);
4603   }else if(stubs[n][6]==NULLDS) {
4604     // Delay slot instruction is nullified ("likely" branch)
4605     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4606     else load_all_regs(regs[i].regmap);
4607   }else{
4608     load_all_regs(branch_regs[i].regmap);
4609   }
4610   emit_jmp(stubs[n][2]); // return address
4611
4612   /* This works but uses a lot of memory...
4613   emit_readword((int)&last_count,ECX);
4614   emit_add(HOST_CCREG,ECX,EAX);
4615   emit_writeword(EAX,(int)&Count);
4616   emit_call((int)gen_interupt);
4617   emit_readword((int)&Count,HOST_CCREG);
4618   emit_readword((int)&next_interupt,EAX);
4619   emit_readword((int)&pending_exception,EBX);
4620   emit_writeword(EAX,(int)&last_count);
4621   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
4622   emit_test(EBX,EBX);
4623   int jne_instr=(int)out;
4624   emit_jne(0);
4625   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
4626   load_all_regs(branch_regs[i].regmap);
4627   emit_jmp(stubs[n][2]); // return address
4628   set_jump_target(jne_instr,(int)out);
4629   emit_readword((int)&pcaddr,EAX);
4630   // Call get_addr_ht instead of doing the hash table here.
4631   // This code is executed infrequently and takes up a lot of space
4632   // so smaller is better.
4633   emit_storereg(CCREG,HOST_CCREG);
4634   emit_pushreg(EAX);
4635   emit_call((int)get_addr_ht);
4636   emit_loadreg(CCREG,HOST_CCREG);
4637   emit_addimm(ESP,4,ESP);
4638   emit_jmpreg(EAX);*/
4639 }
4640
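// Record a branch that still needs patching: each link_addr entry holds the
// code address of the jump, the target virtual address, and whether the
// target is external to this block.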
4641 static void add_to_linker(int addr,int target,int ext)
4642 {
4643   link_addr[linkcount][0]=addr;
4644   link_addr[linkcount][1]=target;
4645   link_addr[linkcount][2]=ext;
4646   linkcount++;
4647 }
4648
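// Write the JAL return address (PC+8) into the link register, going through
// the mini hash table insert or prefetch paths when those are compiled in.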
4649 static void ujump_assemble_write_ra(int i)
4650 {
4651   int rt;
4652   unsigned int return_address;
4653   rt=get_reg(branch_regs[i].regmap,31);
4654   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4655   //assert(rt>=0);
4656   return_address=start+i*4+8;
4657   if(rt>=0) {
4658     #ifdef USE_MINI_HT
4659     if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
4660       int temp=-1; // note: must be ds-safe
4661       #ifdef HOST_TEMPREG
4662       temp=HOST_TEMPREG;
4663       #endif
4664       if(temp>=0) do_miniht_insert(return_address,rt,temp);
4665       else emit_movimm(return_address,rt);
4666     }
4667     else
4668     #endif
4669     {
4670       #ifdef REG_PREFETCH
4671       if(temp>=0)
4672       {
4673         if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4674       }
4675       #endif
4676       emit_movimm(return_address,rt); // PC into link register
4677       #ifdef IMM_PREFETCH
4678       emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4679       #endif
4680     }
4681   }
4682 }
4683
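// Assemble J/JAL: write the return address early if the delay slot reads it,
// assemble the delay slot, write back registers for the target, do the cycle
// check, then either fall into the in-block target (via ds_assemble_entry if
// that target is itself a delay slot) or emit a jump for the linker to patch.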
4684 void ujump_assemble(int i,struct regstat *i_regs)
4685 {
4686   int ra_done=0;
4687   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4688   address_generation(i+1,i_regs,regs[i].regmap_entry);
4689   #ifdef REG_PREFETCH
4690   int temp=get_reg(branch_regs[i].regmap,PTEMP);
4691   if(rt1[i]==31&&temp>=0)
4692   {
4693     signed char *i_regmap=i_regs->regmap;
4694     int return_address=start+i*4+8;
4695     if(get_reg(branch_regs[i].regmap,31)>0)
4696     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4697   }
4698   #endif
4699   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4700     ujump_assemble_write_ra(i); // writeback ra for DS
4701     ra_done=1;
4702   }
4703   ds_assemble(i+1,i_regs);
4704   uint64_t bc_unneeded=branch_regs[i].u;
4705   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4706   bc_unneeded|=1|(1LL<<rt1[i]);
4707   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4708   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4709                 bc_unneeded,bc_unneeded_upper);
4710   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4711   if(!ra_done&&rt1[i]==31)
4712     ujump_assemble_write_ra(i);
4713   int cc,adj;
4714   cc=get_reg(branch_regs[i].regmap,CCREG);
4715   assert(cc==HOST_CCREG);
4716   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4717   #ifdef REG_PREFETCH
4718   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4719   #endif
4720   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4721   if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4722   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4723   if(internal_branch(branch_regs[i].is32,ba[i]))
4724     assem_debug("branch: internal\n");
4725   else
4726     assem_debug("branch: external\n");
4727   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
4728     ds_assemble_entry(i);
4729   }
4730   else {
4731     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
4732     emit_jmp(0);
4733   }
4734 }
4735
4736 static void rjump_assemble_write_ra(int i)
4737 {
4738   int rt,return_address;
4739   assert(rt1[i+1]!=rt1[i]);
4740   assert(rt2[i+1]!=rt1[i]);
4741   rt=get_reg(branch_regs[i].regmap,rt1[i]);
4742   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4743   assert(rt>=0);
4744   return_address=start+i*4+8;
4745   #ifdef REG_PREFETCH
4746   if(temp>=0)
4747   {
4748     if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4749   }
4750   #endif
4751   emit_movimm(return_address,rt); // PC into link register
4752   #ifdef IMM_PREFETCH
4753   emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4754   #endif
4755 }
4756
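// Assemble JR/JALR.  If the delay slot writes the branch address register,
// the target is first copied to RTEMP; after the delay slot and the optional
// return-address write, the jump is dispatched through jump_vaddr_reg (or the
// mini hash table when USE_MINI_HT is enabled).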
4757 void rjump_assemble(int i,struct regstat *i_regs)
4758 {
4759   int temp;
4760   int rs,cc;
4761   int ra_done=0;
4762   rs=get_reg(branch_regs[i].regmap,rs1[i]);
4763   assert(rs>=0);
4764   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4765     // Delay slot abuse, make a copy of the branch address register
4766     temp=get_reg(branch_regs[i].regmap,RTEMP);
4767     assert(temp>=0);
4768     assert(regs[i].regmap[temp]==RTEMP);
4769     emit_mov(rs,temp);
4770     rs=temp;
4771   }
4772   address_generation(i+1,i_regs,regs[i].regmap_entry);
4773   #ifdef REG_PREFETCH
4774   if(rt1[i]==31)
4775   {
4776     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
4777       signed char *i_regmap=i_regs->regmap;
4778       int return_address=start+i*4+8;
4779       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4780     }
4781   }
4782   #endif
4783   #ifdef USE_MINI_HT
4784   if(rs1[i]==31) {
4785     int rh=get_reg(regs[i].regmap,RHASH);
4786     if(rh>=0) do_preload_rhash(rh);
4787   }
4788   #endif
4789   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4790     rjump_assemble_write_ra(i);
4791     ra_done=1;
4792   }
4793   ds_assemble(i+1,i_regs);
4794   uint64_t bc_unneeded=branch_regs[i].u;
4795   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4796   bc_unneeded|=1|(1LL<<rt1[i]);
4797   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4798   bc_unneeded&=~(1LL<<rs1[i]);
4799   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4800                 bc_unneeded,bc_unneeded_upper);
4801   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
4802   if(!ra_done&&rt1[i]!=0)
4803     rjump_assemble_write_ra(i);
4804   cc=get_reg(branch_regs[i].regmap,CCREG);
4805   assert(cc==HOST_CCREG);
4806   (void)cc;
4807   #ifdef USE_MINI_HT
4808   int rh=get_reg(branch_regs[i].regmap,RHASH);
4809   int ht=get_reg(branch_regs[i].regmap,RHTBL);
4810   if(rs1[i]==31) {
4811     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
4812     do_preload_rhtbl(ht);
4813     do_rhash(rs,rh);
4814   }
4815   #endif
4816   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4817   #ifdef DESTRUCTIVE_WRITEBACK
4818   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
4819     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
4820       emit_loadreg(rs1[i],rs);
4821     }
4822   }
4823   #endif
4824   #ifdef REG_PREFETCH
4825   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4826   #endif
4827   #ifdef USE_MINI_HT
4828   if(rs1[i]==31) {
4829     do_miniht_load(ht,rh);
4830   }
4831   #endif
4832   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
4833   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
4834   //assert(adj==0);
4835   emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
4836   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
4837   if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
4838     // special case for RFE
4839     emit_jmp(0);
4840   else
4841     emit_jns(0);
4842   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4843   #ifdef USE_MINI_HT
4844   if(rs1[i]==31) {
4845     do_miniht_jump(rs,rh,ht);
4846   }
4847   else
4848   #endif
4849   {
4850     //if(rs!=EAX) emit_mov(rs,EAX);
4851     //emit_jmp((int)jump_vaddr_eax);
4852     emit_jmp(jump_vaddr_reg[rs]);
4853   }
4854   /* Check hash table
4855   temp=!rs;
4856   emit_mov(rs,temp);
4857   emit_shrimm(rs,16,rs);
4858   emit_xor(temp,rs,rs);
4859   emit_movzwl_reg(rs,rs);
4860   emit_shlimm(rs,4,rs);
4861   emit_cmpmem_indexed((int)hash_table,rs,temp);
4862   emit_jne((int)out+14);
4863   emit_readword_indexed((int)hash_table+4,rs,rs);
4864   emit_jmpreg(rs);
4865   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
4866   emit_addimm_no_flags(8,rs);
4867   emit_jeq((int)out-17);
4868   // No hit on hash table, call compiler
4869   emit_pushreg(temp);
4870 //DEBUG >
4871 #ifdef DEBUG_CYCLE_COUNT
4872   emit_readword((int)&last_count,ECX);
4873   emit_add(HOST_CCREG,ECX,HOST_CCREG);
4874   emit_readword((int)&next_interupt,ECX);
4875   emit_writeword(HOST_CCREG,(int)&Count);
4876   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
4877   emit_writeword(ECX,(int)&last_count);
4878 #endif
4879 //DEBUG <
4880   emit_storereg(CCREG,HOST_CCREG);
4881   emit_call((int)get_addr);
4882   emit_loadreg(CCREG,HOST_CCREG);
4883   emit_addimm(ESP,4,ESP);
4884   emit_jmpreg(EAX);*/
4885   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4886   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
4887   #endif
4888 }
4889
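// Assemble BEQ/BNE/BLEZ/BGTZ.  Handles both out-of-order (delay slot first)
// and in-order cases, compares the upper halves when the operands are not
// known to be 32-bit, and inverts the branch when the register state at the
// target does not match.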
4890 void cjump_assemble(int i,struct regstat *i_regs)
4891 {
4892   signed char *i_regmap=i_regs->regmap;
4893   int cc;
4894   int match;
4895   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4896   assem_debug("match=%d\n",match);
4897   int s1h,s1l,s2h,s2l;
4898   int prev_cop1_usable=cop1_usable;
4899   int unconditional=0,nop=0;
4900   int only32=0;
4901   int invert=0;
4902   int internal=internal_branch(branch_regs[i].is32,ba[i]);
4903   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4904   if(!match) invert=1;
4905   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4906   if(i>(ba[i]-start)>>2) invert=1;
4907   #endif
4908
4909   if(ooo[i]) {
4910     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4911     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4912     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4913     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4914   }
4915   else {
4916     s1l=get_reg(i_regmap,rs1[i]);
4917     s1h=get_reg(i_regmap,rs1[i]|64);
4918     s2l=get_reg(i_regmap,rs2[i]);
4919     s2h=get_reg(i_regmap,rs2[i]|64);
4920   }
4921   if(rs1[i]==0&&rs2[i]==0)
4922   {
4923     if(opcode[i]&1) nop=1;
4924     else unconditional=1;
4925     //assert(opcode[i]!=5);
4926     //assert(opcode[i]!=7);
4927     //assert(opcode[i]!=0x15);
4928     //assert(opcode[i]!=0x17);
4929   }
4930   else if(rs1[i]==0)
4931   {
4932     s1l=s2l;s1h=s2h;
4933     s2l=s2h=-1;
4934     only32=(regs[i].was32>>rs2[i])&1;
4935   }
4936   else if(rs2[i]==0)
4937   {
4938     s2l=s2h=-1;
4939     only32=(regs[i].was32>>rs1[i])&1;
4940   }
4941   else {
4942     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
4943   }
4944
4945   if(ooo[i]) {
4946     // Out of order execution (delay slot first)
4947     //printf("OOOE\n");
4948     address_generation(i+1,i_regs,regs[i].regmap_entry);
4949     ds_assemble(i+1,i_regs);
4950     int adj;
4951     uint64_t bc_unneeded=branch_regs[i].u;
4952     uint64_t bc_unneeded_upper=branch_regs[i].uu;
4953     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
4954     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
4955     bc_unneeded|=1;
4956     bc_unneeded_upper|=1;
4957     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4958                   bc_unneeded,bc_unneeded_upper);
4959     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
4960     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4961     cc=get_reg(branch_regs[i].regmap,CCREG);
4962     assert(cc==HOST_CCREG);
4963     if(unconditional)
4964       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4965     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
4966     //assem_debug("cycle count (adj)\n");
4967     if(unconditional) {
4968       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4969       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
4970         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4971         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4972         if(internal)
4973           assem_debug("branch: internal\n");
4974         else
4975           assem_debug("branch: external\n");
4976         if(internal&&is_ds[(ba[i]-start)>>2]) {
4977           ds_assemble_entry(i);
4978         }
4979         else {
4980           add_to_linker((int)out,ba[i],internal);
4981           emit_jmp(0);
4982         }
4983         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4984         if(((u_int)out)&7) emit_addnop(0);
4985         #endif
4986       }
4987     }
4988     else if(nop) {
4989       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
4990       int jaddr=(int)out;
4991       emit_jns(0);
4992       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
4993     }
4994     else {
4995       int taken=0,nottaken=0,nottaken1=0;
4996       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
4997       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4998       if(!only32)
4999       {
5000         assert(s1h>=0);
5001         if(opcode[i]==4) // BEQ
5002         {
5003           if(s2h>=0) emit_cmp(s1h,s2h);
5004           else emit_test(s1h,s1h);
5005           nottaken1=(int)out;
5006           emit_jne(1);
5007         }
5008         if(opcode[i]==5) // BNE
5009         {
5010           if(s2h>=0) emit_cmp(s1h,s2h);
5011           else emit_test(s1h,s1h);
5012           if(invert) taken=(int)out;
5013           else add_to_linker((int)out,ba[i],internal);
5014           emit_jne(0);
5015         }
5016         if(opcode[i]==6) // BLEZ
5017         {
5018           emit_test(s1h,s1h);
5019           if(invert) taken=(int)out;
5020           else add_to_linker((int)out,ba[i],internal);
5021           emit_js(0);
5022           nottaken1=(int)out;
5023           emit_jne(1);
5024         }
5025         if(opcode[i]==7) // BGTZ
5026         {
5027           emit_test(s1h,s1h);
5028           nottaken1=(int)out;
5029           emit_js(1);
5030           if(invert) taken=(int)out;
5031           else add_to_linker((int)out,ba[i],internal);
5032           emit_jne(0);
5033         }
5034       } // if(!only32)
5035
5036       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5037       assert(s1l>=0);
5038       if(opcode[i]==4) // BEQ
5039       {
5040         if(s2l>=0) emit_cmp(s1l,s2l);
5041         else emit_test(s1l,s1l);
5042         if(invert){
5043           nottaken=(int)out;
5044           emit_jne(1);
5045         }else{
5046           add_to_linker((int)out,ba[i],internal);
5047           emit_jeq(0);
5048         }
5049       }
5050       if(opcode[i]==5) // BNE
5051       {
5052         if(s2l>=0) emit_cmp(s1l,s2l);
5053         else emit_test(s1l,s1l);
5054         if(invert){
5055           nottaken=(int)out;
5056           emit_jeq(1);
5057         }else{
5058           add_to_linker((int)out,ba[i],internal);
5059           emit_jne(0);
5060         }
5061       }
5062       if(opcode[i]==6) // BLEZ
5063       {
5064         emit_cmpimm(s1l,1);
5065         if(invert){
5066           nottaken=(int)out;
5067           emit_jge(1);
5068         }else{
5069           add_to_linker((int)out,ba[i],internal);
5070           emit_jl(0);
5071         }
5072       }
5073       if(opcode[i]==7) // BGTZ
5074       {
5075         emit_cmpimm(s1l,1);
5076         if(invert){
5077           nottaken=(int)out;
5078           emit_jl(1);
5079         }else{
5080           add_to_linker((int)out,ba[i],internal);
5081           emit_jge(0);
5082         }
5083       }
5084       if(invert) {
5085         if(taken) set_jump_target(taken,(int)out);
5086         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5087         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5088           if(adj) {
5089             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5090             add_to_linker((int)out,ba[i],internal);
5091           }else{
5092             emit_addnop(13);
5093             add_to_linker((int)out,ba[i],internal*2);
5094           }
5095           emit_jmp(0);
5096         }else
5097         #endif
5098         {
5099           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5100           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5101           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5102           if(internal)
5103             assem_debug("branch: internal\n");
5104           else
5105             assem_debug("branch: external\n");
5106           if(internal&&is_ds[(ba[i]-start)>>2]) {
5107             ds_assemble_entry(i);
5108           }
5109           else {
5110             add_to_linker((int)out,ba[i],internal);
5111             emit_jmp(0);
5112           }
5113         }
5114         set_jump_target(nottaken,(int)out);
5115       }
5116
5117       if(nottaken1) set_jump_target(nottaken1,(int)out);
5118       if(adj) {
5119         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5120       }
5121     } // (!unconditional)
5122   } // if(ooo)
5123   else
5124   {
5125     // In-order execution (branch first)
5126     //if(likely[i]) printf("IOL\n");
5127     //else
5128     //printf("IOE\n");
5129     int taken=0,nottaken=0,nottaken1=0;
5130     if(!unconditional&&!nop) {
5131       if(!only32)
5132       {
5133         assert(s1h>=0);
5134         if((opcode[i]&0x2f)==4) // BEQ
5135         {
5136           if(s2h>=0) emit_cmp(s1h,s2h);
5137           else emit_test(s1h,s1h);
5138           nottaken1=(int)out;
5139           emit_jne(2);
5140         }
5141         if((opcode[i]&0x2f)==5) // BNE
5142         {
5143           if(s2h>=0) emit_cmp(s1h,s2h);
5144           else emit_test(s1h,s1h);
5145           taken=(int)out;
5146           emit_jne(1);
5147         }
5148         if((opcode[i]&0x2f)==6) // BLEZ
5149         {
5150           emit_test(s1h,s1h);
5151           taken=(int)out;
5152           emit_js(1);
5153           nottaken1=(int)out;
5154           emit_jne(2);
5155         }
5156         if((opcode[i]&0x2f)==7) // BGTZ
5157         {
5158           emit_test(s1h,s1h);
5159           nottaken1=(int)out;
5160           emit_js(2);
5161           taken=(int)out;
5162           emit_jne(1);
5163         }
5164       } // if(!only32)
5165
5166       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5167       assert(s1l>=0);
5168       if((opcode[i]&0x2f)==4) // BEQ
5169       {
5170         if(s2l>=0) emit_cmp(s1l,s2l);
5171         else emit_test(s1l,s1l);
5172         nottaken=(int)out;
5173         emit_jne(2);
5174       }
5175       if((opcode[i]&0x2f)==5) // BNE
5176       {
5177         if(s2l>=0) emit_cmp(s1l,s2l);
5178         else emit_test(s1l,s1l);
5179         nottaken=(int)out;
5180         emit_jeq(2);
5181       }
5182       if((opcode[i]&0x2f)==6) // BLEZ
5183       {
5184         emit_cmpimm(s1l,1);
5185         nottaken=(int)out;
5186         emit_jge(2);
5187       }
5188       if((opcode[i]&0x2f)==7) // BGTZ
5189       {
5190         emit_cmpimm(s1l,1);
5191         nottaken=(int)out;
5192         emit_jl(2);
5193       }
5194     } // if(!unconditional)
5195     int adj;
5196     uint64_t ds_unneeded=branch_regs[i].u;
5197     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5198     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5199     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5200     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5201     ds_unneeded|=1;
5202     ds_unneeded_upper|=1;
5203     // branch taken
5204     if(!nop) {
5205       if(taken) set_jump_target(taken,(int)out);
5206       assem_debug("1:\n");
5207       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5208                     ds_unneeded,ds_unneeded_upper);
5209       // load regs
5210       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5211       address_generation(i+1,&branch_regs[i],0);
5212       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5213       ds_assemble(i+1,&branch_regs[i]);
5214       cc=get_reg(branch_regs[i].regmap,CCREG);
5215       if(cc==-1) {
5216         emit_loadreg(CCREG,cc=HOST_CCREG);
5217         // CHECK: Is the following instruction (fall thru) allocated ok?
5218       }
5219       assert(cc==HOST_CCREG);
5220       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5221       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5222       assem_debug("cycle count (adj)\n");
5223       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5224       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5225       if(internal)
5226         assem_debug("branch: internal\n");
5227       else
5228         assem_debug("branch: external\n");
5229       if(internal&&is_ds[(ba[i]-start)>>2]) {
5230         ds_assemble_entry(i);
5231       }
5232       else {
5233         add_to_linker((int)out,ba[i],internal);
5234         emit_jmp(0);
5235       }
5236     }
5237     // branch not taken
5238     cop1_usable=prev_cop1_usable;
5239     if(!unconditional) {
5240       if(nottaken1) set_jump_target(nottaken1,(int)out);
5241       set_jump_target(nottaken,(int)out);
5242       assem_debug("2:\n");
5243       if(!likely[i]) {
5244         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5245                       ds_unneeded,ds_unneeded_upper);
5246         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5247         address_generation(i+1,&branch_regs[i],0);
5248         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5249         ds_assemble(i+1,&branch_regs[i]);
5250       }
5251       cc=get_reg(branch_regs[i].regmap,CCREG);
5252       if(cc==-1&&!likely[i]) {
5253         // Cycle count isn't in a register, temporarily load it then write it out
5254         emit_loadreg(CCREG,HOST_CCREG);
5255         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5256         int jaddr=(int)out;
5257         emit_jns(0);
5258         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5259         emit_storereg(CCREG,HOST_CCREG);
5260       }
5261       else{
5262         cc=get_reg(i_regmap,CCREG);
5263         assert(cc==HOST_CCREG);
5264         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5265         int jaddr=(int)out;
5266         emit_jns(0);
5267         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5268       }
5269     }
5270   }
5271 }
5272
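     // Assemble a REGIMM branch (BLTZ/BGEZ plus the AL and likely forms).
     // These test a single register against zero; opcode2[] holds the rt
     // field selecting the variant, and rt1[]==31 marks the and-link forms.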
5273 void sjump_assemble(int i,struct regstat *i_regs)
5274 {
5275   signed char *i_regmap=i_regs->regmap;
5276   int cc;
5277   int match;
5278   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5279   assem_debug("smatch=%d\n",match);
5280   int s1h,s1l;
5281   int prev_cop1_usable=cop1_usable;
5282   int unconditional=0,nevertaken=0;
5283   int only32=0;
5284   int invert=0;
5285   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5286   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5287   if(!match) invert=1;
5288   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5289   if(i>(ba[i]-start)>>2) invert=1;
5290   #endif
5291
5292   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5293   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5294
5295   if(ooo[i]) {
5296     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5297     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5298   }
5299   else {
5300     s1l=get_reg(i_regmap,rs1[i]);
5301     s1h=get_reg(i_regmap,rs1[i]|64);
5302   }
5303   if(rs1[i]==0)
5304   {
5305     if(opcode2[i]&1) unconditional=1;
5306     else nevertaken=1;
5307     // These are never taken (r0 is never less than zero)
5308     //assert(opcode2[i]!=0);
5309     //assert(opcode2[i]!=2);
5310     //assert(opcode2[i]!=0x10);
5311     //assert(opcode2[i]!=0x12);
5312   }
5313   else {
5314     only32=(regs[i].was32>>rs1[i])&1;
5315   }
5316
5317   if(ooo[i]) {
5318     // Out of order execution (delay slot first)
5319     //printf("OOOE\n");
5320     address_generation(i+1,i_regs,regs[i].regmap_entry);
5321     ds_assemble(i+1,i_regs);
5322     int adj;
5323     uint64_t bc_unneeded=branch_regs[i].u;
5324     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5325     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5326     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5327     bc_unneeded|=1;
5328     bc_unneeded_upper|=1;
5329     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5330                   bc_unneeded,bc_unneeded_upper);
5331     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5332     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5333     if(rt1[i]==31) {
5334       int rt,return_address;
5335       rt=get_reg(branch_regs[i].regmap,31);
5336       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5337       if(rt>=0) {
5338         // Save the PC even if the branch is not taken
5339         return_address=start+i*4+8;
5340         emit_movimm(return_address,rt); // PC into link register
5341         #ifdef IMM_PREFETCH
5342         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5343         #endif
5344       }
5345     }
5346     cc=get_reg(branch_regs[i].regmap,CCREG);
5347     assert(cc==HOST_CCREG);
5348     if(unconditional)
5349       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5350     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5351     assem_debug("cycle count (adj)\n");
5352     if(unconditional) {
5353       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5354       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5355         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5356         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5357         if(internal)
5358           assem_debug("branch: internal\n");
5359         else
5360           assem_debug("branch: external\n");
5361         if(internal&&is_ds[(ba[i]-start)>>2]) {
5362           ds_assemble_entry(i);
5363         }
5364         else {
5365           add_to_linker((int)out,ba[i],internal);
5366           emit_jmp(0);
5367         }
5368         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5369         if(((u_int)out)&7) emit_addnop(0);
5370         #endif
5371       }
5372     }
5373     else if(nevertaken) {
5374       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5375       int jaddr=(int)out;
5376       emit_jns(0);
5377       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5378     }
5379     else {
5380       int nottaken=0;
5381       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5382       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5383       if(!only32)
5384       {
5385         assert(s1h>=0);
5386         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5387         {
5388           emit_test(s1h,s1h);
5389           if(invert){
5390             nottaken=(int)out;
5391             emit_jns(1);
5392           }else{
5393             add_to_linker((int)out,ba[i],internal);
5394             emit_js(0);
5395           }
5396         }
5397         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5398         {
5399           emit_test(s1h,s1h);
5400           if(invert){
5401             nottaken=(int)out;
5402             emit_js(1);
5403           }else{
5404             add_to_linker((int)out,ba[i],internal);
5405             emit_jns(0);
5406           }
5407         }
5408       } // if(!only32)
5409       else
5410       {
5411         assert(s1l>=0);
5412         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5413         {
5414           emit_test(s1l,s1l);
5415           if(invert){
5416             nottaken=(int)out;
5417             emit_jns(1);
5418           }else{
5419             add_to_linker((int)out,ba[i],internal);
5420             emit_js(0);
5421           }
5422         }
5423         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5424         {
5425           emit_test(s1l,s1l);
5426           if(invert){
5427             nottaken=(int)out;
5428             emit_js(1);
5429           }else{
5430             add_to_linker((int)out,ba[i],internal);
5431             emit_jns(0);
5432           }
5433         }
5434       } // else (only32)
5435
5436       if(invert) {
5437         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5438         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5439           if(adj) {
5440             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5441             add_to_linker((int)out,ba[i],internal);
5442           }else{
5443             emit_addnop(13);
5444             add_to_linker((int)out,ba[i],internal*2);
5445           }
5446           emit_jmp(0);
5447         }else
5448         #endif
5449         {
5450           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5451           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5452           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5453           if(internal)
5454             assem_debug("branch: internal\n");
5455           else
5456             assem_debug("branch: external\n");
5457           if(internal&&is_ds[(ba[i]-start)>>2]) {
5458             ds_assemble_entry(i);
5459           }
5460           else {
5461             add_to_linker((int)out,ba[i],internal);
5462             emit_jmp(0);
5463           }
5464         }
5465         set_jump_target(nottaken,(int)out);
5466       }
5467
5468       if(adj) {
5469         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5470       }
5471     } // (!unconditional)
5472   } // if(ooo)
5473   else
5474   {
5475     // In-order execution (branch first)
5476     //printf("IOE\n");
5477     int nottaken=0;
5478     if(rt1[i]==31) {
5479       int rt,return_address;
5480       rt=get_reg(branch_regs[i].regmap,31);
5481       if(rt>=0) {
5482         // Save the PC even if the branch is not taken
5483         return_address=start+i*4+8;
5484         emit_movimm(return_address,rt); // PC into link register
5485         #ifdef IMM_PREFETCH
5486         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5487         #endif
5488       }
5489     }
5490     if(!unconditional) {
5491       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5492       if(!only32)
5493       {
5494         assert(s1h>=0);
5495         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5496         {
5497           emit_test(s1h,s1h);
5498           nottaken=(int)out;
5499           emit_jns(1);
5500         }
5501         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5502         {
5503           emit_test(s1h,s1h);
5504           nottaken=(int)out;
5505           emit_js(1);
5506         }
5507       } // if(!only32)
5508       else
5509       {
5510         assert(s1l>=0);
5511         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5512         {
5513           emit_test(s1l,s1l);
5514           nottaken=(int)out;
5515           emit_jns(1);
5516         }
5517         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5518         {
5519           emit_test(s1l,s1l);
5520           nottaken=(int)out;
5521           emit_js(1);
5522         }
5523       }
5524     } // if(!unconditional)
5525     int adj;
5526     uint64_t ds_unneeded=branch_regs[i].u;
5527     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5528     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5529     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5530     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5531     ds_unneeded|=1;
5532     ds_unneeded_upper|=1;
5533     // branch taken
5534     if(!nevertaken) {
5535       //assem_debug("1:\n");
5536       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5537                     ds_unneeded,ds_unneeded_upper);
5538       // load regs
5539       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5540       address_generation(i+1,&branch_regs[i],0);
5541       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5542       ds_assemble(i+1,&branch_regs[i]);
5543       cc=get_reg(branch_regs[i].regmap,CCREG);
5544       if(cc==-1) {
5545         emit_loadreg(CCREG,cc=HOST_CCREG);
5546         // CHECK: Is the following instruction (fall thru) allocated ok?
5547       }
5548       assert(cc==HOST_CCREG);
5549       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5550       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5551       assem_debug("cycle count (adj)\n");
5552       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5553       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5554       if(internal)
5555         assem_debug("branch: internal\n");
5556       else
5557         assem_debug("branch: external\n");
5558       if(internal&&is_ds[(ba[i]-start)>>2]) {
5559         ds_assemble_entry(i);
5560       }
5561       else {
5562         add_to_linker((int)out,ba[i],internal);
5563         emit_jmp(0);
5564       }
5565     }
5566     // branch not taken
5567     cop1_usable=prev_cop1_usable;
5568     if(!unconditional) {
5569       set_jump_target(nottaken,(int)out);
5570       assem_debug("1:\n");
5571       if(!likely[i]) {
5572         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5573                       ds_unneeded,ds_unneeded_upper);
5574         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5575         address_generation(i+1,&branch_regs[i],0);
5576         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5577         ds_assemble(i+1,&branch_regs[i]);
5578       }
5579       cc=get_reg(branch_regs[i].regmap,CCREG);
5580       if(cc==-1&&!likely[i]) {
5581         // Cycle count isn't in a register, temporarily load it then write it out
5582         emit_loadreg(CCREG,HOST_CCREG);
5583         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5584         int jaddr=(int)out;
5585         emit_jns(0);
5586         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5587         emit_storereg(CCREG,HOST_CCREG);
5588       }
5589       else{
5590         cc=get_reg(i_regmap,CCREG);
5591         assert(cc==HOST_CCREG);
5592         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5593         int jaddr=(int)out;
5594         emit_jns(0);
5595         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5596       }
5597     }
5598   }
5599 }
5600
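     // Assemble a COP1 condition branch (BC1F/BC1T).  The FP condition flag
     // is bit 23 of the status value cached in FSREG; bit 16 of the opcode
     // word selects BC1T vs BC1F.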
5601 void fjump_assemble(int i,struct regstat *i_regs)
5602 {
5603   signed char *i_regmap=i_regs->regmap;
5604   int cc;
5605   int match;
5606   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5607   assem_debug("fmatch=%d\n",match);
5608   int fs,cs;
5609   int eaddr;
5610   int invert=0;
5611   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5612   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5613   if(!match) invert=1;
5614   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5615   if(i>(ba[i]-start)>>2) invert=1;
5616   #endif
5617
5618   if(ooo[i]) {
5619     fs=get_reg(branch_regs[i].regmap,FSREG);
5620     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5621   }
5622   else {
5623     fs=get_reg(i_regmap,FSREG);
5624   }
5625
5626   // Check cop1 unusable
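       // CSREG caches the COP0 Status word; if CU1 (bit 29) is clear, jump
       // to the FP_STUB, which handles the coprocessor-unusable case.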
5627   if(!cop1_usable) {
5628     cs=get_reg(i_regmap,CSREG);
5629     assert(cs>=0);
5630     emit_testimm(cs,0x20000000);
5631     eaddr=(int)out;
5632     emit_jeq(0);
5633     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
5634     cop1_usable=1;
5635   }
5636
5637   if(ooo[i]) {
5638     // Out of order execution (delay slot first)
5639     //printf("OOOE\n");
5640     ds_assemble(i+1,i_regs);
5641     int adj;
5642     uint64_t bc_unneeded=branch_regs[i].u;
5643     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5644     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5645     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5646     bc_unneeded|=1;
5647     bc_unneeded_upper|=1;
5648     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5649                   bc_unneeded,bc_unneeded_upper);
5650     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5651     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5652     cc=get_reg(branch_regs[i].regmap,CCREG);
5653     assert(cc==HOST_CCREG);
5654     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5655     assem_debug("cycle count (adj)\n");
5656     if(1) {
5657       int nottaken=0;
5658       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5659       if(1) {
5660         assert(fs>=0);
5661         emit_testimm(fs,0x800000);
5662         if(source[i]&0x10000) // BC1T
5663         {
5664           if(invert){
5665             nottaken=(int)out;
5666             emit_jeq(1);
5667           }else{
5668             add_to_linker((int)out,ba[i],internal);
5669             emit_jne(0);
5670           }
5671         }
5672         else // BC1F
5673         {
5674           if(invert){
5675             nottaken=(int)out;
5676             emit_jne(1);
5677           }else{
5678             add_to_linker((int)out,ba[i],internal);
5679             emit_jeq(0);
5680           }
5681         }
5682       } // if(1)
5683
5684       if(invert) {
5685         if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5686         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5687         else if(match) emit_addnop(13);
5688         #endif
5689         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5690         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5691         if(internal)
5692           assem_debug("branch: internal\n");
5693         else
5694           assem_debug("branch: external\n");
5695         if(internal&&is_ds[(ba[i]-start)>>2]) {
5696           ds_assemble_entry(i);
5697         }
5698         else {
5699           add_to_linker((int)out,ba[i],internal);
5700           emit_jmp(0);
5701         }
5702         set_jump_target(nottaken,(int)out);
5703       }
5704
5705       if(adj) {
5706         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5707       }
5708     } // if(1)
5709   } // if(ooo)
5710   else
5711   {
5712     // In-order execution (branch first)
5713     //printf("IOE\n");
5714     int nottaken=0;
5715     if(1) {
5716       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5717       if(1) {
5718         assert(fs>=0);
5719         emit_testimm(fs,0x800000);
5720         if(source[i]&0x10000) // BC1T
5721         {
5722           nottaken=(int)out;
5723           emit_jeq(1);
5724         }
5725         else // BC1F
5726         {
5727           nottaken=(int)out;
5728           emit_jne(1);
5729         }
5730       }
5731     } // if(1)
5732     int adj;
5733     uint64_t ds_unneeded=branch_regs[i].u;
5734     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5735     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5736     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5737     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5738     ds_unneeded|=1;
5739     ds_unneeded_upper|=1;
5740     // branch taken
5741     //assem_debug("1:\n");
5742     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5743                   ds_unneeded,ds_unneeded_upper);
5744     // load regs
5745     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5746     address_generation(i+1,&branch_regs[i],0);
5747     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5748     ds_assemble(i+1,&branch_regs[i]);
5749     cc=get_reg(branch_regs[i].regmap,CCREG);
5750     if(cc==-1) {
5751       emit_loadreg(CCREG,cc=HOST_CCREG);
5752       // CHECK: Is the following instruction (fall thru) allocated ok?
5753     }
5754     assert(cc==HOST_CCREG);
5755     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5756     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5757     assem_debug("cycle count (adj)\n");
5758     if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5759     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5760     if(internal)
5761       assem_debug("branch: internal\n");
5762     else
5763       assem_debug("branch: external\n");
5764     if(internal&&is_ds[(ba[i]-start)>>2]) {
5765       ds_assemble_entry(i);
5766     }
5767     else {
5768       add_to_linker((int)out,ba[i],internal);
5769       emit_jmp(0);
5770     }
5771
5772     // branch not taken
5773     if(1) { // <- FIXME (don't need this)
5774       set_jump_target(nottaken,(int)out);
5775       assem_debug("1:\n");
5776       if(!likely[i]) {
5777         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5778                       ds_unneeded,ds_unneeded_upper);
5779         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5780         address_generation(i+1,&branch_regs[i],0);
5781         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5782         ds_assemble(i+1,&branch_regs[i]);
5783       }
5784       cc=get_reg(branch_regs[i].regmap,CCREG);
5785       if(cc==-1&&!likely[i]) {
5786         // Cycle count isn't in a register, temporarily load it then write it out
5787         emit_loadreg(CCREG,HOST_CCREG);
5788         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5789         int jaddr=(int)out;
5790         emit_jns(0);
5791         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5792         emit_storereg(CCREG,HOST_CCREG);
5793       }
5794       else{
5795         cc=get_reg(i_regmap,CCREG);
5796         assert(cc==HOST_CCREG);
5797         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5798         int jaddr=(int)out;
5799         emit_jns(0);
5800         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5801       }
5802     }
5803   }
5804 }
5805
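     // Assemble a branch that spans a page boundary (itype SPAN): its delay
     // slot lies in the next page, so the chosen target (taken or
     // fall-through) is left in HOST_BTREG and control jumps to the
     // delay-slot entry that pagespan_ds() below provides for the next block.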
5806 static void pagespan_assemble(int i,struct regstat *i_regs)
5807 {
5808   int s1l=get_reg(i_regs->regmap,rs1[i]);
5809   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
5810   int s2l=get_reg(i_regs->regmap,rs2[i]);
5811   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
5812   int taken=0;
5813   int nottaken=0;
5814   int unconditional=0;
5815   if(rs1[i]==0)
5816   {
5817     s1l=s2l;s1h=s2h;
5818     s2l=s2h=-1;
5819   }
5820   else if(rs2[i]==0)
5821   {
5822     s2l=s2h=-1;
5823   }
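       // Both sources known 32-bit: no upper halves to compare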
5824   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
5825     s1h=s2h=-1;
5826   }
5827   int hr=0;
5828   int addr=-1,alt=-1,ntaddr=-1;
5829   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
5830   else {
5831     while(hr<HOST_REGS)
5832     {
5833       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5834          (i_regs->regmap[hr]&63)!=rs1[i] &&
5835          (i_regs->regmap[hr]&63)!=rs2[i] )
5836       {
5837         addr=hr++;break;
5838       }
5839       hr++;
5840     }
5841   }
5842   while(hr<HOST_REGS)
5843   {
5844     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5845        (i_regs->regmap[hr]&63)!=rs1[i] &&
5846        (i_regs->regmap[hr]&63)!=rs2[i] )
5847     {
5848       alt=hr++;break;
5849     }
5850     hr++;
5851   }
5852   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5853   {
5854     while(hr<HOST_REGS)
5855     {
5856       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5857          (i_regs->regmap[hr]&63)!=rs1[i] &&
5858          (i_regs->regmap[hr]&63)!=rs2[i] )
5859       {
5860         ntaddr=hr;break;
5861       }
5862       hr++;
5863     }
5864   }
5865   assert(hr<HOST_REGS);
5866   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
5867     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
5868   }
5869   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5870   if(opcode[i]==2) // J
5871   {
5872     unconditional=1;
5873   }
5874   if(opcode[i]==3) // JAL
5875   {
5876     // TODO: mini_ht
5877     int rt=get_reg(i_regs->regmap,31);
5878     emit_movimm(start+i*4+8,rt);
5879     unconditional=1;
5880   }
5881   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
5882   {
5883     emit_mov(s1l,addr);
5884     if(opcode2[i]==9) // JALR
5885     {
5886       int rt=get_reg(i_regs->regmap,rt1[i]);
5887       emit_movimm(start+i*4+8,rt);
5888     }
5889   }
5890   if((opcode[i]&0x3f)==4) // BEQ
5891   {
5892     if(rs1[i]==rs2[i])
5893     {
5894       unconditional=1;
5895     }
5896     else
5897     #ifdef HAVE_CMOV_IMM
5898     if(s1h<0) {
5899       if(s2l>=0) emit_cmp(s1l,s2l);
5900       else emit_test(s1l,s1l);
5901       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
5902     }
5903     else
5904     #endif
5905     {
5906       assert(s1l>=0);
5907       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5908       if(s1h>=0) {
5909         if(s2h>=0) emit_cmp(s1h,s2h);
5910         else emit_test(s1h,s1h);
5911         emit_cmovne_reg(alt,addr);
5912       }
5913       if(s2l>=0) emit_cmp(s1l,s2l);
5914       else emit_test(s1l,s1l);
5915       emit_cmovne_reg(alt,addr);
5916     }
5917   }
5918   if((opcode[i]&0x3f)==5) // BNE
5919   {
5920     #ifdef HAVE_CMOV_IMM
5921     if(s1h<0) {
5922       if(s2l>=0) emit_cmp(s1l,s2l);
5923       else emit_test(s1l,s1l);
5924       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
5925     }
5926     else
5927     #endif
5928     {
5929       assert(s1l>=0);
5930       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5931       if(s1h>=0) {
5932         if(s2h>=0) emit_cmp(s1h,s2h);
5933         else emit_test(s1h,s1h);
5934         emit_cmovne_reg(alt,addr);
5935       }
5936       if(s2l>=0) emit_cmp(s1l,s2l);
5937       else emit_test(s1l,s1l);
5938       emit_cmovne_reg(alt,addr);
5939     }
5940   }
5941   if((opcode[i]&0x3f)==0x14) // BEQL
5942   {
5943     if(s1h>=0) {
5944       if(s2h>=0) emit_cmp(s1h,s2h);
5945       else emit_test(s1h,s1h);
5946       nottaken=(int)out;
5947       emit_jne(0);
5948     }
5949     if(s2l>=0) emit_cmp(s1l,s2l);
5950     else emit_test(s1l,s1l);
5951     if(nottaken) set_jump_target(nottaken,(int)out);
5952     nottaken=(int)out;
5953     emit_jne(0);
5954   }
5955   if((opcode[i]&0x3f)==0x15) // BNEL
5956   {
5957     if(s1h>=0) {
5958       if(s2h>=0) emit_cmp(s1h,s2h);
5959       else emit_test(s1h,s1h);
5960       taken=(int)out;
5961       emit_jne(0);
5962     }
5963     if(s2l>=0) emit_cmp(s1l,s2l);
5964     else emit_test(s1l,s1l);
5965     nottaken=(int)out;
5966     emit_jeq(0);
5967     if(taken) set_jump_target(taken,(int)out);
5968   }
5969   if((opcode[i]&0x3f)==6) // BLEZ
5970   {
5971     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5972     emit_cmpimm(s1l,1);
5973     if(s1h>=0) emit_mov(addr,ntaddr);
5974     emit_cmovl_reg(alt,addr);
5975     if(s1h>=0) {
5976       emit_test(s1h,s1h);
5977       emit_cmovne_reg(ntaddr,addr);
5978       emit_cmovs_reg(alt,addr);
5979     }
5980   }
5981   if((opcode[i]&0x3f)==7) // BGTZ
5982   {
5983     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
5984     emit_cmpimm(s1l,1);
5985     if(s1h>=0) emit_mov(addr,alt);
5986     emit_cmovl_reg(ntaddr,addr);
5987     if(s1h>=0) {
5988       emit_test(s1h,s1h);
5989       emit_cmovne_reg(alt,addr);
5990       emit_cmovs_reg(ntaddr,addr);
5991     }
5992   }
5993   if((opcode[i]&0x3f)==0x16) // BLEZL
5994   {
5995     assert((opcode[i]&0x3f)!=0x16);
5996   }
5997   if((opcode[i]&0x3f)==0x17) // BGTZL
5998   {
5999     assert((opcode[i]&0x3f)!=0x17);
6000   }
6001   assert(opcode[i]!=1); // BLTZ/BGEZ
6002
6003   //FIXME: Check CSREG
6004   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6005     if((source[i]&0x30000)==0) // BC1F
6006     {
6007       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6008       emit_testimm(s1l,0x800000);
6009       emit_cmovne_reg(alt,addr);
6010     }
6011     if((source[i]&0x30000)==0x10000) // BC1T
6012     {
6013       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6014       emit_testimm(s1l,0x800000);
6015       emit_cmovne_reg(alt,addr);
6016     }
6017     if((source[i]&0x30000)==0x20000) // BC1FL
6018     {
6019       emit_testimm(s1l,0x800000);
6020       nottaken=(int)out;
6021       emit_jne(0);
6022     }
6023     if((source[i]&0x30000)==0x30000) // BC1TL
6024     {
6025       emit_testimm(s1l,0x800000);
6026       nottaken=(int)out;
6027       emit_jeq(0);
6028     }
6029   }
6030
6031   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6032   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6033   if(likely[i]||unconditional)
6034   {
6035     emit_movimm(ba[i],HOST_BTREG);
6036   }
6037   else if(addr!=HOST_BTREG)
6038   {
6039     emit_mov(addr,HOST_BTREG);
6040   }
6041   void *branch_addr=out;
6042   emit_jmp(0);
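       // start+i*4+5 is the delay-slot address plus one; the +1 marks a
       // delay-slot entry (pagespan_ds() registers its entry at start+1).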
6043   int target_addr=start+i*4+5;
6044   void *stub=out;
6045   void *compiled_target_addr=check_addr(target_addr);
6046   emit_extjump_ds((int)branch_addr,target_addr);
6047   if(compiled_target_addr) {
6048     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6049     add_link(target_addr,stub);
6050   }
6051   else set_jump_target((int)branch_addr,(int)stub);
6052   if(likely[i]) {
6053     // Not-taken path
6054     set_jump_target((int)nottaken,(int)out);
6055     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6056     void *branch_addr=out;
6057     emit_jmp(0);
6058     int target_addr=start+i*4+8;
6059     void *stub=out;
6060     void *compiled_target_addr=check_addr(target_addr);
6061     emit_extjump_ds((int)branch_addr,target_addr);
6062     if(compiled_target_addr) {
6063       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6064       add_link(target_addr,stub);
6065     }
6066     else set_jump_target((int)branch_addr,(int)stub);
6067   }
6068 }
6069
6070 // Assemble the delay slot for the above
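     // Compiles instruction 0 of the new block as a stand-alone delay slot,
     // then dispatches to the branch target kept in HOST_BTREG/branch_target,
     // falling straight through when that target is simply start+4.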
6071 static void pagespan_ds()
6072 {
6073   assem_debug("initial delay slot:\n");
6074   u_int vaddr=start+1;
6075   u_int page=get_page(vaddr);
6076   u_int vpage=get_vpage(vaddr);
6077   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6078   do_dirty_stub_ds();
6079   ll_add(jump_in+page,vaddr,(void *)out);
6080   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6081   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6082     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6083   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6084     emit_writeword(HOST_BTREG,(int)&branch_target);
6085   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6086   address_generation(0,&regs[0],regs[0].regmap_entry);
6087   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6088     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6089   cop1_usable=0;
6090   is_delayslot=0;
6091   switch(itype[0]) {
6092     case ALU:
6093       alu_assemble(0,&regs[0]);break;
6094     case IMM16:
6095       imm16_assemble(0,&regs[0]);break;
6096     case SHIFT:
6097       shift_assemble(0,&regs[0]);break;
6098     case SHIFTIMM:
6099       shiftimm_assemble(0,&regs[0]);break;
6100     case LOAD:
6101       load_assemble(0,&regs[0]);break;
6102     case LOADLR:
6103       loadlr_assemble(0,&regs[0]);break;
6104     case STORE:
6105       store_assemble(0,&regs[0]);break;
6106     case STORELR:
6107       storelr_assemble(0,&regs[0]);break;
6108     case COP0:
6109       cop0_assemble(0,&regs[0]);break;
6110     case COP1:
6111       cop1_assemble(0,&regs[0]);break;
6112     case C1LS:
6113       c1ls_assemble(0,&regs[0]);break;
6114     case COP2:
6115       cop2_assemble(0,&regs[0]);break;
6116     case C2LS:
6117       c2ls_assemble(0,&regs[0]);break;
6118     case C2OP:
6119       c2op_assemble(0,&regs[0]);break;
6120     case FCONV:
6121       fconv_assemble(0,&regs[0]);break;
6122     case FLOAT:
6123       float_assemble(0,&regs[0]);break;
6124     case FCOMP:
6125       fcomp_assemble(0,&regs[0]);break;
6126     case MULTDIV:
6127       multdiv_assemble(0,&regs[0]);break;
6128     case MOV:
6129       mov_assemble(0,&regs[0]);break;
6130     case SYSCALL:
6131     case HLECALL:
6132     case INTCALL:
6133     case SPAN:
6134     case UJUMP:
6135     case RJUMP:
6136     case CJUMP:
6137     case SJUMP:
6138     case FJUMP:
6139       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
6140   }
6141   int btaddr=get_reg(regs[0].regmap,BTREG);
6142   if(btaddr<0) {
6143     btaddr=get_reg(regs[0].regmap,-1);
6144     emit_readword((int)&branch_target,btaddr);
6145   }
6146   assert(btaddr!=HOST_CCREG);
6147   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6148 #ifdef HOST_IMM8
6149   emit_movimm(start+4,HOST_TEMPREG);
6150   emit_cmp(btaddr,HOST_TEMPREG);
6151 #else
6152   emit_cmpimm(btaddr,start+4);
6153 #endif
6154   int branch=(int)out;
6155   emit_jeq(0);
6156   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6157   emit_jmp(jump_vaddr_reg[btaddr]);
6158   set_jump_target(branch,(int)out);
6159   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6160   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6161 }
6162
6163 // Basic liveness analysis for MIPS registers
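     // Walks instructions backwards computing, per instruction, bitmasks of
     // MIPS registers whose values are dead from that point on: u covers the
     // lower 32 bits, uu the upper halves, gte_u the GTE/COP2 registers.
     // Bit 0 stays set because r0 is hardwired to zero.  r is the recursion
     // depth used when re-scanning the body of a backward branch.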
6164 void unneeded_registers(int istart,int iend,int r)
6165 {
6166   int i;
6167   uint64_t u,uu,gte_u,b,bu,gte_bu;
6168   uint64_t temp_u,temp_uu,temp_gte_u=0;
6169   uint64_t tdep;
6170   uint64_t gte_u_unknown=0;
6171   if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
6172     gte_u_unknown=~0ll;
6173   if(iend==slen-1) {
6174     u=1;uu=1;
6175     gte_u=gte_u_unknown;
6176   }else{
6177     u=unneeded_reg[iend+1];
6178     uu=unneeded_reg_upper[iend+1];
6179     u=1;uu=1;
6180     gte_u=gte_unneeded[iend+1];
6181   }
6182
6183   for (i=iend;i>=istart;i--)
6184   {
6185     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6186     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6187     {
6188       // If subroutine call, flag return address as a possible branch target
6189       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6190
6191       if(ba[i]<start || ba[i]>=(start+slen*4))
6192       {
6193         // Branch out of this block, flush all regs
6194         u=1;
6195         uu=1;
6196         gte_u=gte_u_unknown;
6197         /* Hexagon hack
6198         if(itype[i]==UJUMP&&rt1[i]==31)
6199         {
6200           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6201         }
6202         if(itype[i]==RJUMP&&rs1[i]==31)
6203         {
6204           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6205         }
6206         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6207           if(itype[i]==UJUMP&&rt1[i]==31)
6208           {
6209             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6210             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6211           }
6212           if(itype[i]==RJUMP&&rs1[i]==31)
6213           {
6214             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6215             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6216           }
6217         }*/
6218         branch_unneeded_reg[i]=u;
6219         branch_unneeded_reg_upper[i]=uu;
6220         // Merge in delay slot
6221         tdep=(~uu>>rt1[i+1])&1;
6222         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6223         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6224         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6225         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6226         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6227         u|=1;uu|=1;
6228         gte_u|=gte_rt[i+1];
6229         gte_u&=~gte_rs[i+1];
6230         // If branch is "likely" (and conditional)
6231         // then we skip the delay slot on the fall-thru path
6232         if(likely[i]) {
6233           if(i<slen-1) {
6234             u&=unneeded_reg[i+2];
6235             uu&=unneeded_reg_upper[i+2];
6236             gte_u&=gte_unneeded[i+2];
6237           }
6238           else
6239           {
6240             u=1;
6241             uu=1;
6242             gte_u=gte_u_unknown;
6243           }
6244         }
6245       }
6246       else
6247       {
6248         // Internal branch, flag target
6249         bt[(ba[i]-start)>>2]=1;
6250         if(ba[i]<=start+i*4) {
6251           // Backward branch
6252           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6253           {
6254             // Unconditional branch
6255             temp_u=1;temp_uu=1;
6256             temp_gte_u=0;
6257           } else {
6258             // Conditional branch (not taken case)
6259             temp_u=unneeded_reg[i+2];
6260             temp_uu=unneeded_reg_upper[i+2];
6261             temp_gte_u&=gte_unneeded[i+2];
6262           }
6263           // Merge in delay slot
6264           tdep=(~temp_uu>>rt1[i+1])&1;
6265           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6266           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6267           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6268           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6269           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6270           temp_u|=1;temp_uu|=1;
6271           temp_gte_u|=gte_rt[i+1];
6272           temp_gte_u&=~gte_rs[i+1];
6273           // If branch is "likely" (and conditional)
6274           // then we skip the delay slot on the fall-thru path
6275           if(likely[i]) {
6276             if(i<slen-1) {
6277               temp_u&=unneeded_reg[i+2];
6278               temp_uu&=unneeded_reg_upper[i+2];
6279               temp_gte_u&=gte_unneeded[i+2];
6280             }
6281             else
6282             {
6283               temp_u=1;
6284               temp_uu=1;
6285               temp_gte_u=gte_u_unknown;
6286             }
6287           }
6288           tdep=(~temp_uu>>rt1[i])&1;
6289           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6290           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6291           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6292           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6293           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6294           temp_u|=1;temp_uu|=1;
6295           temp_gte_u|=gte_rt[i];
6296           temp_gte_u&=~gte_rs[i];
6297           unneeded_reg[i]=temp_u;
6298           unneeded_reg_upper[i]=temp_uu;
6299           gte_unneeded[i]=temp_gte_u;
6300           // Only go three levels deep.  This recursion can take an
6301           // excessive amount of time if there are a lot of nested loops.
6302           if(r<2) {
6303             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6304           }else{
6305             unneeded_reg[(ba[i]-start)>>2]=1;
6306             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6307             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
6308           }
6309         } /*else*/ if(1) {
6310           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6311           {
6312             // Unconditional branch
6313             u=unneeded_reg[(ba[i]-start)>>2];
6314             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6315             gte_u=gte_unneeded[(ba[i]-start)>>2];
6316             branch_unneeded_reg[i]=u;
6317             branch_unneeded_reg_upper[i]=uu;
6318         //u=1;
6319         //uu=1;
6320         //branch_unneeded_reg[i]=u;
6321         //branch_unneeded_reg_upper[i]=uu;
6322             // Merge in delay slot
6323             tdep=(~uu>>rt1[i+1])&1;
6324             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6325             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6326             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6327             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6328             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6329             u|=1;uu|=1;
6330             gte_u|=gte_rt[i+1];
6331             gte_u&=~gte_rs[i+1];
6332           } else {
6333             // Conditional branch
6334             b=unneeded_reg[(ba[i]-start)>>2];
6335             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6336             gte_bu=gte_unneeded[(ba[i]-start)>>2];
6337             branch_unneeded_reg[i]=b;
6338             branch_unneeded_reg_upper[i]=bu;
6339         //b=1;
6340         //bu=1;
6341         //branch_unneeded_reg[i]=b;
6342         //branch_unneeded_reg_upper[i]=bu;
6343             // Branch delay slot
6344             tdep=(~uu>>rt1[i+1])&1;
6345             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6346             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6347             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6348             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6349             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6350             b|=1;bu|=1;
6351             gte_bu|=gte_rt[i+1];
6352             gte_bu&=~gte_rs[i+1];
6353             // If branch is "likely" then we skip the
6354             // delay slot on the fall-thru path
6355             if(likely[i]) {
6356               u=b;
6357               uu=bu;
6358               gte_u=gte_bu;
6359               if(i<slen-1) {
6360                 u&=unneeded_reg[i+2];
6361                 uu&=unneeded_reg_upper[i+2];
6362                 gte_u&=gte_unneeded[i+2];
6363         //u=1;
6364         //uu=1;
6365               }
6366             } else {
6367               u&=b;
6368               uu&=bu;
6369               gte_u&=gte_bu;
6370         //u=1;
6371         //uu=1;
6372             }
6373             if(i<slen-1) {
6374               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6375               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6376         //branch_unneeded_reg[i]=1;
6377         //branch_unneeded_reg_upper[i]=1;
6378             } else {
6379               branch_unneeded_reg[i]=1;
6380               branch_unneeded_reg_upper[i]=1;
6381             }
6382           }
6383         }
6384       }
6385     }
6386     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6387     {
6388       // SYSCALL instruction (software interrupt)
6389       u=1;
6390       uu=1;
6391     }
6392     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6393     {
6394       // ERET instruction (return from interrupt)
6395       u=1;
6396       uu=1;
6397     }
6398     //u=uu=1; // DEBUG
6399     tdep=(~uu>>rt1[i])&1;
6400     // Written registers are unneeded
6401     u|=1LL<<rt1[i];
6402     u|=1LL<<rt2[i];
6403     uu|=1LL<<rt1[i];
6404     uu|=1LL<<rt2[i];
6405     gte_u|=gte_rt[i];
6406     // Accessed registers are needed
6407     u&=~(1LL<<rs1[i]);
6408     u&=~(1LL<<rs2[i]);
6409     uu&=~(1LL<<us1[i]);
6410     uu&=~(1LL<<us2[i]);
6411     gte_u&=~gte_rs[i];
6412     if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
6413       gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
6414     // Source-target dependencies
6415     uu&=~(tdep<<dep1[i]);
6416     uu&=~(tdep<<dep2[i]);
6417     // R0 is always unneeded
6418     u|=1;uu|=1;
6419     // Save it
6420     unneeded_reg[i]=u;
6421     unneeded_reg_upper[i]=uu;
6422     gte_unneeded[i]=gte_u;
6423     /*
6424     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6425     printf("U:");
6426     int r;
6427     for(r=1;r<=CCREG;r++) {
6428       if((unneeded_reg[i]>>r)&1) {
6429         if(r==HIREG) printf(" HI");
6430         else if(r==LOREG) printf(" LO");
6431         else printf(" r%d",r);
6432       }
6433     }
6434     printf(" UU:");
6435     for(r=1;r<=CCREG;r++) {
6436       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6437         if(r==HIREG) printf(" HI");
6438         else if(r==LOREG) printf(" LO");
6439         else printf(" r%d",r);
6440       }
6441     }
6442     printf("\n");*/
6443   }
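       // The emulated R3000A has no 64-bit registers, so every upper half
       // can be treated as dead: override the computed upper masks.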
6444   for (i=iend;i>=istart;i--)
6445   {
6446     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
6447   }
6448 }
6449
6450 // Write back dirty registers as soon as we will no longer modify them,
6451 // so that we don't end up with lots of writes at the branches.
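     // will_dirty/wont_dirty are per-host-register bitmasks built in a
     // backward pass (like unneeded_registers); when wr is nonzero the
     // results are folded back into the per-instruction dirty masks.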
6452 void clean_registers(int istart,int iend,int wr)
6453 {
6454   int i;
6455   int r;
6456   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6457   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6458   if(iend==slen-1) {
6459     will_dirty_i=will_dirty_next=0;
6460     wont_dirty_i=wont_dirty_next=0;
6461   }else{
6462     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6463     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6464   }
6465   for (i=iend;i>=istart;i--)
6466   {
6467     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6468     {
6469       if(ba[i]<start || ba[i]>=(start+slen*4))
6470       {
6471         // Branch out of this block, flush all regs
6472         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6473         {
6474           // Unconditional branch
6475           will_dirty_i=0;
6476           wont_dirty_i=0;
6477           // Merge in delay slot (will dirty)
6478           for(r=0;r<HOST_REGS;r++) {
6479             if(r!=EXCLUDE_REG) {
6480               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6481               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6482               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6483               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6484               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6485               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6486               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6487               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6488               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6489               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6490               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6491               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6492               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6493               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6494             }
6495           }
6496         }
6497         else
6498         {
6499           // Conditional branch
6500           will_dirty_i=0;
6501           wont_dirty_i=wont_dirty_next;
6502           // Merge in delay slot (will dirty)
6503           for(r=0;r<HOST_REGS;r++) {
6504             if(r!=EXCLUDE_REG) {
6505               if(!likely[i]) {
6506                 // Might not dirty if likely branch is not taken
6507                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6508                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6509                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6510                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6511                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6512                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6513                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6514                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6515                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6516                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6517                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6518                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6519                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6520                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6521               }
6522             }
6523           }
6524         }
6525         // Merge in delay slot (wont dirty)
6526         for(r=0;r<HOST_REGS;r++) {
6527           if(r!=EXCLUDE_REG) {
6528             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6529             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6530             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6531             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6532             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6533             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6534             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6535             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6536             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6537             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6538           }
6539         }
6540         if(wr) {
6541           #ifndef DESTRUCTIVE_WRITEBACK
6542           branch_regs[i].dirty&=wont_dirty_i;
6543           #endif
6544           branch_regs[i].dirty|=will_dirty_i;
6545         }
6546       }
6547       else
6548       {
6549         // Internal branch
6550         if(ba[i]<=start+i*4) {
6551           // Backward branch
6552           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6553           {
6554             // Unconditional branch
6555             temp_will_dirty=0;
6556             temp_wont_dirty=0;
6557             // Merge in delay slot (will dirty)
6558             for(r=0;r<HOST_REGS;r++) {
6559               if(r!=EXCLUDE_REG) {
6560                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6561                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6562                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6563                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6564                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6565                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6566                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6567                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6568                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6569                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6570                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6571                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6572                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6573                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6574               }
6575             }
6576           } else {
6577             // Conditional branch (not taken case)
6578             temp_will_dirty=will_dirty_next;
6579             temp_wont_dirty=wont_dirty_next;
6580             // Merge in delay slot (will dirty)
6581             for(r=0;r<HOST_REGS;r++) {
6582               if(r!=EXCLUDE_REG) {
6583                 if(!likely[i]) {
6584                   // Will not dirty if likely branch is not taken
6585                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6586                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6587                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6588                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6589                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6590                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
6591                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6592                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6593                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6594                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6595                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6596                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6597                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6598                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6599                 }
6600               }
6601             }
6602           }
6603           // Merge in delay slot (won't dirty)
6604           for(r=0;r<HOST_REGS;r++) {
6605             if(r!=EXCLUDE_REG) {
6606               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6607               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6608               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6609               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6610               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6611               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6612               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6613               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6614               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6615               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6616             }
6617           }
6618           // Deal with changed mappings
6619           if(i<iend) {
6620             for(r=0;r<HOST_REGS;r++) {
6621               if(r!=EXCLUDE_REG) {
6622                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
6623                   temp_will_dirty&=~(1<<r);
6624                   temp_wont_dirty&=~(1<<r);
6625                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6626                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6627                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6628                   } else {
6629                     temp_will_dirty|=1<<r;
6630                     temp_wont_dirty|=1<<r;
6631                   }
6632                 }
6633               }
6634             }
6635           }
6636           if(wr) {
6637             will_dirty[i]=temp_will_dirty;
6638             wont_dirty[i]=temp_wont_dirty;
6639             clean_registers((ba[i]-start)>>2,i-1,0);
6640           }else{
6641             // Limit recursion.  It can take an excessive amount
6642             // of time if there are a lot of nested loops.
6643             will_dirty[(ba[i]-start)>>2]=0;
6644             wont_dirty[(ba[i]-start)>>2]=-1;
6645           }
6646         }
6647         /*else*/ if(1)
6648         {
6649           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6650           {
6651             // Unconditional branch
6652             will_dirty_i=0;
6653             wont_dirty_i=0;
6654           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6655             for(r=0;r<HOST_REGS;r++) {
6656               if(r!=EXCLUDE_REG) {
6657                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6658                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
6659                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6660                 }
6661                 if(branch_regs[i].regmap[r]>=0) {
6662                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6663                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6664                 }
6665               }
6666             }
6667           //}
6668             // Merge in delay slot
6669             for(r=0;r<HOST_REGS;r++) {
6670               if(r!=EXCLUDE_REG) {
6671                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6672                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6673                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6674                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6675                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6676                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6677                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6678                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6679                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6680                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6681                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6682                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6683                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6684                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6685               }
6686             }
6687           } else {
6688             // Conditional branch
6689             will_dirty_i=will_dirty_next;
6690             wont_dirty_i=wont_dirty_next;
6691           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6692             for(r=0;r<HOST_REGS;r++) {
6693               if(r!=EXCLUDE_REG) {
6694                 signed char target_reg=branch_regs[i].regmap[r];
6695                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6696                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6697                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6698                 }
6699                 else if(target_reg>=0) {
6700                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6701                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6702                 }
6703                 // Treat delay slot as part of branch too
6704                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6705                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6706                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6707                 }
6708                 else
6709                 {
6710                   will_dirty[i+1]&=~(1<<r);
6711                 }*/
6712               }
6713             }
6714           //}
6715             // Merge in delay slot
6716             for(r=0;r<HOST_REGS;r++) {
6717               if(r!=EXCLUDE_REG) {
6718                 if(!likely[i]) {
6719                   // Might not dirty if likely branch is not taken
6720                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6721                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6722                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6723                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6724                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6725                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6726                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6727                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6728                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6729                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6730                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6731                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6732                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6733                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6734                 }
6735               }
6736             }
6737           }
6738           // Merge in delay slot (won't dirty)
6739           for(r=0;r<HOST_REGS;r++) {
6740             if(r!=EXCLUDE_REG) {
6741               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6742               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6743               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6744               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6745               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6746               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6747               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6748               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6749               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6750               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6751             }
6752           }
6753           if(wr) {
6754             #ifndef DESTRUCTIVE_WRITEBACK
6755             branch_regs[i].dirty&=wont_dirty_i;
6756             #endif
6757             branch_regs[i].dirty|=will_dirty_i;
6758           }
6759         }
6760       }
6761     }
6762     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6763     {
6764       // SYSCALL instruction (software interrupt)
6765       will_dirty_i=0;
6766       wont_dirty_i=0;
6767     }
6768     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6769     {
6770       // ERET instruction (return from interrupt)
6771       will_dirty_i=0;
6772       wont_dirty_i=0;
6773     }
6774     will_dirty_next=will_dirty_i;
6775     wont_dirty_next=wont_dirty_i;
6776     for(r=0;r<HOST_REGS;r++) {
6777       if(r!=EXCLUDE_REG) {
6778         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6779         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6780         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6781         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6782         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6783         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6784         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6785         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6786         if(i>istart) {
6787           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP)
6788           {
6789             // Don't store a register immediately after writing it;
6790             // doing so may prevent dual-issue.
6791             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
6792             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
6793           }
6794         }
6795       }
6796     }
6797     // Save it
6798     will_dirty[i]=will_dirty_i;
6799     wont_dirty[i]=wont_dirty_i;
6800     // Mark registers that won't be dirtied as not dirty
6801     if(wr) {
6802       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
6803       for(r=0;r<HOST_REGS;r++) {
6804         if((will_dirty_i>>r)&1) {
6805           printf(" r%d",r);
6806         }
6807       }
6808       printf("\n");*/
6809
6810       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
6811         regs[i].dirty|=will_dirty_i;
6812         #ifndef DESTRUCTIVE_WRITEBACK
6813         regs[i].dirty&=wont_dirty_i;
6814         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6815         {
6816           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
6817             for(r=0;r<HOST_REGS;r++) {
6818               if(r!=EXCLUDE_REG) {
6819                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
6820                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
6821                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6822               }
6823             }
6824           }
6825         }
6826         else
6827         {
6828           if(i<iend) {
6829             for(r=0;r<HOST_REGS;r++) {
6830               if(r!=EXCLUDE_REG) {
6831                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
6832                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
6833                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6834               }
6835             }
6836           }
6837         }
6838         #endif
6839       //}
6840     }
6841     // Deal with changed mappings
6842     temp_will_dirty=will_dirty_i;
6843     temp_wont_dirty=wont_dirty_i;
6844     for(r=0;r<HOST_REGS;r++) {
6845       if(r!=EXCLUDE_REG) {
6846         int nr;
6847         if(regs[i].regmap[r]==regmap_pre[i][r]) {
6848           if(wr) {
6849             #ifndef DESTRUCTIVE_WRITEBACK
6850             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6851             #endif
6852             regs[i].wasdirty|=will_dirty_i&(1<<r);
6853           }
6854         }
6855         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
6856           // Register moved to a different register
6857           will_dirty_i&=~(1<<r);
6858           wont_dirty_i&=~(1<<r);
6859           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
6860           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
6861           if(wr) {
6862             #ifndef DESTRUCTIVE_WRITEBACK
6863             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6864             #endif
6865             regs[i].wasdirty|=will_dirty_i&(1<<r);
6866           }
6867         }
6868         else {
6869           will_dirty_i&=~(1<<r);
6870           wont_dirty_i&=~(1<<r);
6871           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6872             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6873             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6874           } else {
6875             wont_dirty_i|=1<<r;
6876             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
6877           }
6878         }
6879       }
6880     }
6881   }
6882 }
6883
6884 #ifdef DISASM
6885   /* disassembly */
6886 void disassemble_inst(int i)
6887 {
6888     if (bt[i]) printf("*"); else printf(" ");
6889     switch(itype[i]) {
6890       case UJUMP:
6891         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6892       case CJUMP:
6893         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
6894       case SJUMP:
6895         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
6896       case FJUMP:
6897         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6898       case RJUMP:
6899         if (opcode[i]==0x9&&rt1[i]!=31)
6900           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
6901         else
6902           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
6903         break;
6904       case SPAN:
6905         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
6906       case IMM16:
6907         if(opcode[i]==0xf) //LUI
6908           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
6909         else
6910           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6911         break;
6912       case LOAD:
6913       case LOADLR:
6914         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6915         break;
6916       case STORE:
6917       case STORELR:
6918         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
6919         break;
6920       case ALU:
6921       case SHIFT:
6922         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
6923         break;
6924       case MULTDIV:
6925         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
6926         break;
6927       case SHIFTIMM:
6928         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6929         break;
6930       case MOV:
6931         if((opcode2[i]&0x1d)==0x10)
6932           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
6933         else if((opcode2[i]&0x1d)==0x11)
6934           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
6935         else
6936           printf (" %x: %s\n",start+i*4,insn[i]);
6937         break;
6938       case COP0:
6939         if(opcode2[i]==0)
6940           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
6941         else if(opcode2[i]==4)
6942           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
6943         else printf (" %x: %s\n",start+i*4,insn[i]);
6944         break;
6945       case COP1:
6946         if(opcode2[i]<3)
6947           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
6948         else if(opcode2[i]>3)
6949           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
6950         else printf (" %x: %s\n",start+i*4,insn[i]);
6951         break;
6952       case COP2:
6953         if(opcode2[i]<3)
6954           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
6955         else if(opcode2[i]>3)
6956           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
6957         else printf (" %x: %s\n",start+i*4,insn[i]);
6958         break;
6959       case C1LS:
6960         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
6961         break;
6962       case C2LS:
6963         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
6964         break;
6965       case INTCALL:
6966         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
6967         break;
6968       default:
6969         //printf (" %s %8x\n",insn[i],source[i]);
6970         printf (" %x: %s\n",start+i*4,insn[i]);
6971     }
6972 }
6973 #else
6974 static void disassemble_inst(int i) {}
6975 #endif // DISASM
6976
6977 #define DRC_TEST_VAL 0x74657374
6978
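/* Sanity check for the translation cache: emit a tiny stub that loads
 * DRC_TEST_VAL (ASCII "test") and jumps back through host register 14
 * (the link register on the ARM backend), then call it.  If the returned
 * value doesn't match, the output buffer is most likely not executable
 * (missing PROT_EXEC / W^X restrictions). */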
6979 static int new_dynarec_test(void)
6980 {
6981   int (*testfunc)(void) = (void *)out;
6982   void *beginning;
6983   int ret;
6984
6985   beginning = start_block();
6986   emit_movimm(DRC_TEST_VAL,0); // test
6987   emit_jmpreg(14);
6988   literal_pool(0);
6989   end_block(beginning);
6990   SysPrintf("testing if we can run recompiled code..\n");
6991   ret = testfunc();
6992   if (ret == DRC_TEST_VAL)
6993     SysPrintf("test passed.\n");
6994   else
6995     SysPrintf("test failed: %08x\n", ret);
6996   out=(u_char *)BASE_ADDR;
6997   return ret == DRC_TEST_VAL;
6998 }
6999
7000 // clear the state completely, instead of just marking
7001 // things invalid like invalidate_all_pages() does
7002 void new_dynarec_clear_full()
7003 {
7004   int n;
7005   out=(u_char *)BASE_ADDR;
7006   memset(invalid_code,1,sizeof(invalid_code));
7007   memset(hash_table,0xff,sizeof(hash_table));
7008   memset(mini_ht,-1,sizeof(mini_ht));
7009   memset(restore_candidate,0,sizeof(restore_candidate));
7010   memset(shadow,0,sizeof(shadow));
7011   copy=shadow;
7012   expirep=16384; // Expiry pointer, +2 blocks
7013   pending_exception=0;
7014   literalcount=0;
7015   stop_after_jal=0;
7016   inv_code_start=inv_code_end=~0;
7017   // clear all compiled-block lists
7018   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7019   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7020   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7021 }
7022
7023 void new_dynarec_init()
7024 {
7025   SysPrintf("Init new dynarec\n");
7026   out=(u_char *)BASE_ADDR;
7027 #if BASE_ADDR_FIXED
7028   if (mmap (out, 1<<TARGET_SIZE_2,
7029             PROT_READ | PROT_WRITE | PROT_EXEC,
7030             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
7031             -1, 0) <= 0) {
7032     SysPrintf("mmap() failed: %s\n", strerror(errno));
7033   }
7034 #elif !defined(NO_WRITE_EXEC)
7035   // not all systems allow executing code in the data segment by default
7036   if (mprotect(out, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
7037     SysPrintf("mprotect() failed: %s\n", strerror(errno));
7038 #endif
7039   cycle_multiplier=200;
7040   new_dynarec_clear_full();
7041 #ifdef HOST_IMM8
7042   // Copy this into local area so we don't have to put it in every literal pool
7043   invc_ptr=invalid_code;
7044 #endif
7045   arch_init();
7046   new_dynarec_test();
7047 #ifndef RAM_FIXED
7048   ram_offset=(u_int)rdram-0x80000000;
7049 #endif
7050   if (ram_offset!=0)
7051     SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
7052 }
7053
7054 void new_dynarec_cleanup()
7055 {
7056   int n;
7057   #if BASE_ADDR_FIXED
7058   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {SysPrintf("munmap() failed\n");}
7059   #endif
7060   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7061   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7062   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7063   #ifdef ROM_COPY
7064   if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
7065   #endif
7066 }
7067
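/* Map a PSX code address to a host pointer into the backing memory block
 * (rdram for RAM and its mirrors, psxR for the BIOS) and report in *limit
 * the first address past that region.  Returns NULL for addresses we don't
 * know how to fetch code from. */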
7068 static u_int *get_source_start(u_int addr, u_int *limit)
7069 {
7070   if (addr < 0x00200000 ||
7071     (0xa0000000 <= addr && addr < 0xa0200000)) {
7072     // used for BIOS calls mostly?
7073     *limit = (addr&0xa0000000)|0x00200000;
7074     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7075   }
7076   else if (!Config.HLE && (
7077     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
7078     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
7079     // BIOS
7080     *limit = (addr & 0xfff00000) | 0x80000;
7081     return (u_int *)((u_int)psxR + (addr&0x7ffff));
7082   }
7083   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
7084     *limit = (addr & 0x80600000) + 0x00200000;
7085     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7086   }
7087   return NULL;
7088 }
7089
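/* Scan forward from addr (at most 0x1000 bytes, staying inside the region
 * reported by get_source_start) for a "jr $ra" and return the address just
 * past its delay slot, or the scan limit if none is found.  Used by
 * new_dynarec_save_blocks to skip entries that fall inside an
 * already-recorded function. */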
7090 static u_int scan_for_ret(u_int addr)
7091 {
7092   u_int limit = 0;
7093   u_int *mem;
7094
7095   mem = get_source_start(addr, &limit);
7096   if (mem == NULL)
7097     return addr;
7098
7099   if (limit > addr + 0x1000)
7100     limit = addr + 0x1000;
7101   for (; addr < limit; addr += 4, mem++) {
7102     if (*mem == 0x03e00008) // jr $ra
7103       return addr + 8;
7104   }
7105   return addr;
7106 }
7107
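/* Per-block record kept in savestates: the block's entry address plus the
 * reg_sv_flags bitmask of GPRs that looked like scratchpad/IO pointers when
 * the block was compiled (see state_rflags in new_recompile_block). */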
7108 struct savestate_block {
7109   uint32_t addr;
7110   uint32_t regflags;
7111 };
7112
7113 static int addr_cmp(const void *p1_, const void *p2_)
7114 {
7115   const struct savestate_block *p1 = p1_, *p2 = p2_;
7116   return p1->addr - p2->addr;
7117 }
7118
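/* Dump the currently known block entry points into 'save' as an array of
 * savestate_block records (sorted per jump_in page list, with entries that
 * fall inside an already-recorded function dropped via scan_for_ret).
 * Returns the number of bytes written, at most 'size'. */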
7119 int new_dynarec_save_blocks(void *save, int size)
7120 {
7121   struct savestate_block *blocks = save;
7122   int maxcount = size / sizeof(blocks[0]);
7123   struct savestate_block tmp_blocks[1024];
7124   struct ll_entry *head;
7125   int p, s, d, o, bcnt;
7126   u_int addr;
7127
7128   o = 0;
7129   for (p = 0; p < sizeof(jump_in) / sizeof(jump_in[0]); p++) {
7130     bcnt = 0;
7131     for (head = jump_in[p]; head != NULL; head = head->next) {
7132       tmp_blocks[bcnt].addr = head->vaddr;
7133       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
7134       bcnt++;
7135     }
7136     if (bcnt < 1)
7137       continue;
7138     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
7139
7140     addr = tmp_blocks[0].addr;
7141     for (s = d = 0; s < bcnt; s++) {
7142       if (tmp_blocks[s].addr < addr)
7143         continue;
7144       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
7145         tmp_blocks[d++] = tmp_blocks[s];
7146       addr = scan_for_ret(tmp_blocks[s].addr);
7147     }
7148
7149     if (o + d > maxcount)
7150       d = maxcount - o;
7151     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
7152     o += d;
7153   }
7154
7155   return o * sizeof(blocks[0]);
7156 }
7157
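/* Precompile the blocks recorded by new_dynarec_save_blocks after a
 * savestate load.  GPRs are temporarily set to RAM (0x80000000) or
 * scratchpad (0x1f800000) addresses so that the recompiler's address
 * speculation sees roughly the same register contents as when the blocks
 * were first compiled; the real GPR values are restored afterwards. */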
7158 void new_dynarec_load_blocks(const void *save, int size)
7159 {
7160   const struct savestate_block *blocks = save;
7161   int count = size / sizeof(blocks[0]);
7162   u_int regs_save[32];
7163   uint32_t f;
7164   int i, b;
7165
7166   get_addr(psxRegs.pc);
7167
7168   // change GPRs so that speculation at least partially works
7169   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
7170   for (i = 1; i < 32; i++)
7171     psxRegs.GPR.r[i] = 0x80000000;
7172
7173   for (b = 0; b < count; b++) {
7174     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7175       if (f & 1)
7176         psxRegs.GPR.r[i] = 0x1f800000;
7177     }
7178
7179     get_addr(blocks[b].addr);
7180
7181     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7182       if (f & 1)
7183         psxRegs.GPR.r[i] = 0x80000000;
7184     }
7185   }
7186
7187   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
7188 }
7189
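/* Main compile entry point: translate one block of MIPS code starting at
 * 'addr' into native code at 'out', running the passes listed below. */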
7190 int new_recompile_block(int addr)
7191 {
7192   u_int pagelimit = 0;
7193   u_int state_rflags = 0;
7194   int i;
7195
7196   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7197   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7198   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7199   //if(debug)
7200   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7201   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7202   /*if(Count>=312978186) {
7203     rlist();
7204   }*/
7205   //rlist();
7206
7207   // this is just for speculation
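  // (flag GPRs that currently look like scratchpad/IO pointers; this mask is
  //  stored with the block as reg_sv_flags and re-applied by
  //  new_dynarec_load_blocks)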
7208   for (i = 1; i < 32; i++) {
7209     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
7210       state_rflags |= 1 << i;
7211   }
7212
7213   start = (u_int)addr&~3;
7214   //assert(((u_int)addr&1)==0);
7215   new_dynarec_did_compile=1;
7216   if (Config.HLE && start == 0x80001000) // hlecall
7217   {
7218     // XXX: is this enough? Maybe check hleSoftCall?
7219     void *beginning=start_block();
7220     u_int page=get_page(start);
7221
7222     invalid_code[start>>12]=0;
7223     emit_movimm(start,0);
7224     emit_writeword(0,(int)&pcaddr);
7225     emit_jmp((int)new_dyna_leave);
7226     literal_pool(0);
7227     end_block(beginning);
7228     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
7229     return 0;
7230   }
7231
7232   source = get_source_start(start, &pagelimit);
7233   if (source == NULL) {
7234     SysPrintf("Compile at bogus memory address: %08x\n", addr);
7235     exit(1);
7236   }
7237
7238   /* Pass 1: disassemble */
7239   /* Pass 2: register dependencies, branch targets */
7240   /* Pass 3: register allocation */
7241   /* Pass 4: branch dependencies */
7242   /* Pass 5: pre-alloc */
7243   /* Pass 6: optimize clean/dirty state */
7244   /* Pass 7: flag 32-bit registers */
7245   /* Pass 8: assembly */
7246   /* Pass 9: linker */
7247   /* Pass 10: garbage collection / free memory */
7248
7249   int j;
7250   int done=0;
7251   unsigned int type,op,op2;
7252
7253   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7254
7255   /* Pass 1 disassembly */
7256
7257   for(i=0;!done;i++) {
7258     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
7259     minimum_free_regs[i]=0;
7260     opcode[i]=op=source[i]>>26;
7261     switch(op)
7262     {
7263       case 0x00: strcpy(insn[i],"special"); type=NI;
7264         op2=source[i]&0x3f;
7265         switch(op2)
7266         {
7267           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7268           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7269           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7270           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7271           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7272           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7273           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7274           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7275           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7276           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7277           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7278           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7279           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7280           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7281           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7282           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7283           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7284           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7285           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7286           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7287           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7288           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7289           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7290           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7291           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7292           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7293           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7294           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7295           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7296           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7297           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7298           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7299           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7300           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7301           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7302 #if 0
7303           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7304           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7305           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7306           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7307           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7308           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7309           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7310           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7311           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7312           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7313           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7314           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7315           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7316           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7317           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7318           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7319           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7320 #endif
7321         }
7322         break;
7323       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7324         op2=(source[i]>>16)&0x1f;
7325         switch(op2)
7326         {
7327           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7328           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7329           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7330           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7331           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7332           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7333           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7334           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7335           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7336           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7337           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7338           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7339           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7340           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7341         }
7342         break;
7343       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7344       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7345       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7346       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7347       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7348       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7349       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7350       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7351       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7352       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7353       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7354       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7355       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7356       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7357       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7358         op2=(source[i]>>21)&0x1f;
7359         switch(op2)
7360         {
7361           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7362           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7363           case 0x10: strcpy(insn[i],"tlb"); type=NI;
7364           switch(source[i]&0x3f)
7365           {
7366             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
7367             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
7368             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
7369             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
7370             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
7371             //case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
7372           }
7373         }
7374         break;
7375       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7376         op2=(source[i]>>21)&0x1f;
7377         switch(op2)
7378         {
7379           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7380           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7381           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7382           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7383           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7384           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7385           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7386           switch((source[i]>>16)&0x3)
7387           {
7388             case 0x00: strcpy(insn[i],"BC1F"); break;
7389             case 0x01: strcpy(insn[i],"BC1T"); break;
7390             case 0x02: strcpy(insn[i],"BC1FL"); break;
7391             case 0x03: strcpy(insn[i],"BC1TL"); break;
7392           }
7393           break;
7394           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7395           switch(source[i]&0x3f)
7396           {
7397             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7398             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7399             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7400             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7401             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7402             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7403             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7404             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7405             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7406             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7407             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7408             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7409             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7410             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7411             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7412             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7413             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7414             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7415             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7416             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7417             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7418             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7419             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7420             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7421             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7422             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7423             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7424             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7425             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7426             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7427             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7428             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7429             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7430             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7431             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7432           }
7433           break;
7434           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7435           switch(source[i]&0x3f)
7436           {
7437             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7438             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7439             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7440             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7441             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7442             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7443             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7444             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7445             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7446             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7447             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7448             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7449             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7450             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7451             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7452             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7453             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7454             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7455             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7456             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7457             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7458             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7459             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7460             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7461             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7462             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7463             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7464             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7465             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7466             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7467             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7468             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7469             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7470             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7471             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7472           }
7473           break;
7474           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7475           switch(source[i]&0x3f)
7476           {
7477             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7478             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7479           }
7480           break;
7481           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7482           switch(source[i]&0x3f)
7483           {
7484             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7485             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7486           }
7487           break;
7488         }
7489         break;
7490 #if 0
7491       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7492       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7493       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7494       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7495       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7496       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7497       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7498       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7499 #endif
7500       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7501       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7502       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7503       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7504       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7505       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7506       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7507 #if 0
7508       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7509 #endif
7510       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7511       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7512       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7513       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7514 #if 0
7515       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7516       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7517 #endif
7518       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7519       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7520       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7521       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7522 #if 0
7523       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7524       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7525       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7526 #endif
7527       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7528       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7529 #if 0
7530       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7531       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7532       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7533 #endif
7534       case 0x12: strcpy(insn[i],"COP2"); type=NI;
7535         op2=(source[i]>>21)&0x1f;
7536         //if (op2 & 0x10) {
7537         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
7538           if (gte_handlers[source[i]&0x3f]!=NULL) {
7539             if (gte_regnames[source[i]&0x3f]!=NULL)
7540               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
7541             else
7542               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
7543             type=C2OP;
7544           }
7545         }
7546         else switch(op2)
7547         {
7548           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
7549           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
7550           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
7551           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
7552         }
7553         break;
7554       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
7555       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
7556       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7557       default: strcpy(insn[i],"???"); type=NI;
7558         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
7559         break;
7560     }
7561     itype[i]=type;
7562     opcode2[i]=op2;
7563     /* Get registers/immediates */
7564     lt1[i]=0;
7565     us1[i]=0;
7566     us2[i]=0;
7567     dep1[i]=0;
7568     dep2[i]=0;
7569     gte_rs[i]=gte_rt[i]=0;
7570     switch(type) {
7571       case LOAD:
7572         rs1[i]=(source[i]>>21)&0x1f;
7573         rs2[i]=0;
7574         rt1[i]=(source[i]>>16)&0x1f;
7575         rt2[i]=0;
7576         imm[i]=(short)source[i];
7577         break;
7578       case STORE:
7579       case STORELR:
7580         rs1[i]=(source[i]>>21)&0x1f;
7581         rs2[i]=(source[i]>>16)&0x1f;
7582         rt1[i]=0;
7583         rt2[i]=0;
7584         imm[i]=(short)source[i];
7585         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7586         break;
7587       case LOADLR:
7588         // LWL/LWR only load part of the register,
7589         // therefore the target register must be treated as a source too
7590         rs1[i]=(source[i]>>21)&0x1f;
7591         rs2[i]=(source[i]>>16)&0x1f;
7592         rt1[i]=(source[i]>>16)&0x1f;
7593         rt2[i]=0;
7594         imm[i]=(short)source[i];
7595         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7596         if(op==0x26) dep1[i]=rt1[i]; // LWR
7597         break;
7598       case IMM16:
7599         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7600         else rs1[i]=(source[i]>>21)&0x1f;
7601         rs2[i]=0;
7602         rt1[i]=(source[i]>>16)&0x1f;
7603         rt2[i]=0;
7604         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7605           imm[i]=(unsigned short)source[i];
7606         }else{
7607           imm[i]=(short)source[i];
7608         }
7609         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7610         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7611         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7612         break;
7613       case UJUMP:
7614         rs1[i]=0;
7615         rs2[i]=0;
7616         rt1[i]=0;
7617         rt2[i]=0;
7618         // The JAL instruction writes to r31.
7619         if (op&1) {
7620           rt1[i]=31;
7621         }
7622         rs2[i]=CCREG;
7623         break;
7624       case RJUMP:
7625         rs1[i]=(source[i]>>21)&0x1f;
7626         rs2[i]=0;
7627         rt1[i]=0;
7628         rt2[i]=0;
7629         // The JALR instruction writes to rd.
7630         if (op2&1) {
7631           rt1[i]=(source[i]>>11)&0x1f;
7632         }
7633         rs2[i]=CCREG;
7634         break;
7635       case CJUMP:
7636         rs1[i]=(source[i]>>21)&0x1f;
7637         rs2[i]=(source[i]>>16)&0x1f;
7638         rt1[i]=0;
7639         rt2[i]=0;
7640         if(op&2) { // BGTZ/BLEZ
7641           rs2[i]=0;
7642         }
7643         us1[i]=rs1[i];
7644         us2[i]=rs2[i];
7645         likely[i]=op>>4;
7646         break;
7647       case SJUMP:
7648         rs1[i]=(source[i]>>21)&0x1f;
7649         rs2[i]=CCREG;
7650         rt1[i]=0;
7651         rt2[i]=0;
7652         us1[i]=rs1[i];
7653         if(op2&0x10) { // BxxAL
7654           rt1[i]=31;
7655           // NOTE: If the branch is not taken, r31 is still overwritten
7656         }
7657         likely[i]=(op2&2)>>1;
7658         break;
7659       case FJUMP:
7660         rs1[i]=FSREG;
7661         rs2[i]=CSREG;
7662         rt1[i]=0;
7663         rt2[i]=0;
7664         likely[i]=((source[i])>>17)&1;
7665         break;
7666       case ALU:
7667         rs1[i]=(source[i]>>21)&0x1f; // source
7668         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
7669         rt1[i]=(source[i]>>11)&0x1f; // destination
7670         rt2[i]=0;
7671         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7672           us1[i]=rs1[i];us2[i]=rs2[i];
7673         }
7674         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7675           dep1[i]=rs1[i];dep2[i]=rs2[i];
7676         }
7677         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7678           dep1[i]=rs1[i];dep2[i]=rs2[i];
7679         }
7680         break;
7681       case MULTDIV:
7682         rs1[i]=(source[i]>>21)&0x1f; // source
7683         rs2[i]=(source[i]>>16)&0x1f; // divisor
7684         rt1[i]=HIREG;
7685         rt2[i]=LOREG;
7686         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7687           us1[i]=rs1[i];us2[i]=rs2[i];
7688         }
7689         break;
7690       case MOV:
7691         rs1[i]=0;
7692         rs2[i]=0;
7693         rt1[i]=0;
7694         rt2[i]=0;
7695         if(op2==0x10) rs1[i]=HIREG; // MFHI
7696         if(op2==0x11) rt1[i]=HIREG; // MTHI
7697         if(op2==0x12) rs1[i]=LOREG; // MFLO
7698         if(op2==0x13) rt1[i]=LOREG; // MTLO
7699         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7700         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7701         dep1[i]=rs1[i];
7702         break;
7703       case SHIFT:
7704         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7705         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7706         rt1[i]=(source[i]>>11)&0x1f; // destination
7707         rt2[i]=0;
7708         // DSLLV/DSRLV/DSRAV are 64-bit
7709         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
7710         break;
7711       case SHIFTIMM:
7712         rs1[i]=(source[i]>>16)&0x1f;
7713         rs2[i]=0;
7714         rt1[i]=(source[i]>>11)&0x1f;
7715         rt2[i]=0;
7716         imm[i]=(source[i]>>6)&0x1f;
7717         // DSxx32 instructions
7718         if(op2>=0x3c) imm[i]|=0x20;
7719         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
7720         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
7721         break;
7722       case COP0:
7723         rs1[i]=0;
7724         rs2[i]=0;
7725         rt1[i]=0;
7726         rt2[i]=0;
7727         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
7728         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
7729         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7730         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7731         break;
7732       case COP1:
7733         rs1[i]=0;
7734         rs2[i]=0;
7735         rt1[i]=0;
7736         rt2[i]=0;
7737         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7738         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7739         if(op2==5) us1[i]=rs1[i]; // DMTC1
7740         rs2[i]=CSREG;
7741         break;
7742       case COP2:
7743         rs1[i]=0;
7744         rs2[i]=0;
7745         rt1[i]=0;
7746         rt2[i]=0;
7747         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
7748         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
7749         rs2[i]=CSREG;
7750         int gr=(source[i]>>11)&0x1F;
7751         switch(op2)
7752         {
7753           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
7754           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
7755           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
7756           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
7757         }
7758         break;
7759       case C1LS:
7760         rs1[i]=(source[i]>>21)&0x1F;
7761         rs2[i]=CSREG;
7762         rt1[i]=0;
7763         rt2[i]=0;
7764         imm[i]=(short)source[i];
7765         break;
7766       case C2LS:
7767         rs1[i]=(source[i]>>21)&0x1F;
7768         rs2[i]=0;
7769         rt1[i]=0;
7770         rt2[i]=0;
7771         imm[i]=(short)source[i];
7772         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7773         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7774         break;
7775       case C2OP:
7776         rs1[i]=0;
7777         rs2[i]=0;
7778         rt1[i]=0;
7779         rt2[i]=0;
7780         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
7781         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
7782         gte_rt[i]|=1ll<<63; // every op changes flags
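        // MVMVA picks its source vector with bits 15-16 of the opcode, so
        // narrow the default operand set: clear the V0-V2 and IR1-IR3 reads,
        // then add back only the selected vector (v==3 means IR1-IR3).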
7783         if((source[i]&0x3f)==GTE_MVMVA) {
7784           int v = (source[i] >> 15) & 3;
7785           gte_rs[i]&=~0xe3fll;
7786           if(v==3) gte_rs[i]|=0xe00ll;
7787           else gte_rs[i]|=3ll<<(v*2);
7788         }
7789         break;
7790       case FLOAT:
7791       case FCONV:
7792         rs1[i]=0;
7793         rs2[i]=CSREG;
7794         rt1[i]=0;
7795         rt2[i]=0;
7796         break;
7797       case FCOMP:
7798         rs1[i]=FSREG;
7799         rs2[i]=CSREG;
7800         rt1[i]=FSREG;
7801         rt2[i]=0;
7802         break;
7803       case SYSCALL:
7804       case HLECALL:
7805       case INTCALL:
7806         rs1[i]=CCREG;
7807         rs2[i]=0;
7808         rt1[i]=0;
7809         rt2[i]=0;
7810         break;
7811       default:
7812         rs1[i]=0;
7813         rs2[i]=0;
7814         rt1[i]=0;
7815         rt2[i]=0;
7816     }
7817     /* Calculate branch target addresses */
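    // J/JAL: the 26-bit target is shifted left by 2 and merged with the top
    // nibble of the delay-slot PC; (source<<6)>>4 strips the opcode bits and
    // gives the same net <<2.  Conditional branches use the sign-extended
    // 16-bit offset times 4, relative to the delay-slot PC:
    // ((signed)(source<<16))>>14 == (short)source * 4.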
7818     if(type==UJUMP)
7819       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
7820     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
7821       ba[i]=start+i*4+8; // Ignore never taken branch
7822     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
7823       ba[i]=start+i*4+8; // Ignore never taken branch
7824     else if(type==CJUMP||type==SJUMP||type==FJUMP)
7825       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
7826     else ba[i]=-1;
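    // If the previous instruction was a branch, check its delay slot for
    // things we can't recompile safely (a branch in the delay slot, a load
    // whose delay-slot result the branch target depends on, or a suspicious
    // v0 overwrite) and fall back to the interpreter for that branch instead.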
7827     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
7828       int do_in_intrp=0;
7829       // branch in delay slot?
7830       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
7831         // don't handle the first branch; call the interpreter if it's hit
7832         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
7833         do_in_intrp=1;
7834       }
7835       // basic load delay detection
7836       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
7837         int t=(ba[i-1]-start)/4;
7838         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
7839           // jump target wants DS result - potential load delay effect
7840           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
7841           do_in_intrp=1;
7842           bt[t+1]=1; // expected return from interpreter
7843         }
7844         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
7845               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
7846           // an overwrite of v0 like this is a sign of trouble; bail out
7847           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
7848           do_in_intrp=1;
7849         }
7850       }
7851       if(do_in_intrp) {
7852         rs1[i-1]=CCREG;
7853         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
7854         ba[i-1]=-1;
7855         itype[i-1]=INTCALL;
7856         done=2;
7857         i--; // don't compile the DS
7858       }
7859     }
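         // When do_in_intrp is set, the offending branch has been rewritten
         // as an INTCALL so the interpreter takes over at that address at
         // run time; done=2 still lets the block continue below if an
         // earlier branch targets the instructions that follow.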
7860     /* Is this the end of the block? */
7861     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
7862       if(rt1[i-1]==0) { // no link: the block ends here
7863         done=2;
7864       }
7865       else { // JAL/JALR: continue past the subroutine call
7866         if(stop_after_jal) done=1;
7867         // Stop on BREAK
7868         if((source[i+1]&0xfc00003f)==0x0d) done=1;
7869       }
7870       // Don't recompile stuff that's already compiled
7871       if(check_addr(start+i*4+4)) done=1;
7872       // Don't get too close to the limit
7873       if(i>MAXBLOCK/2) done=1;
7874     }
7875     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
7876     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
7877     if(done==2) {
7878       // Does the block continue due to a branch?
7879       for(j=i-1;j>=0;j--)
7880       {
7881         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
7882         if(ba[j]==start+i*4+4) done=j=0;
7883         if(ba[j]==start+i*4+8) done=j=0;
7884       }
7885     }
7886     //assert(i<MAXBLOCK-1);
7887     if(start+i*4==pagelimit-4) done=1;
7888     assert(start+i*4<pagelimit);
7889     if (i==MAXBLOCK-1) done=1;
7890     // Stop if we're compiling junk
7891     if(itype[i]==NI&&opcode[i]==0x11) {
7892       done=stop_after_jal=1;
7893       SysPrintf("Disabled speculative precompilation\n");
7894     }
7895   }
7896   slen=i;
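       // slen = number of instructions accepted into this block by pass 1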
7897   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
7898     if(start+i*4==pagelimit) {
7899       itype[i-1]=SPAN;
7900     }
7901   }
7902   assert(slen>0);
7903
7904   /* Pass 2 - Register dependencies and branch targets */
7905
7906   unneeded_registers(0,slen-1,0);
7907
7908   /* Pass 3 - Register allocation */
7909
7910   struct regstat current; // Current register allocations/status
7911   current.is32=1;
7912   current.dirty=0;
7913   current.u=unneeded_reg[0];
7914   current.uu=unneeded_reg_upper[0];
7915   clear_all_regs(current.regmap);
7916   alloc_reg(&current,0,CCREG);
7917   dirty_reg(&current,CCREG);
7918   current.isconst=0;
7919   current.wasconst=0;
7920   current.waswritten=0;
7921   int ds=0;
7922   int cc=0;
7923   int hr=-1;
7924
7925   if((u_int)addr&1) {
7926     // First instruction is delay slot
7927     cc=-1;
7928     bt[1]=1;
7929     ds=1;
7930     unneeded_reg[0]=1;
7931     unneeded_reg_upper[0]=1;
7932     current.regmap[HOST_BTREG]=BTREG;
7933   }
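       // (An odd start address is the marker for "block begins in a delay
       // slot": instruction 1 is treated as a branch target and BTREG, which
       // holds the branch target address in that case, stays mapped on entry.)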
7934
7935   for(i=0;i<slen;i++)
7936   {
7937     if(bt[i])
7938     {
7939       int hr;
7940       for(hr=0;hr<HOST_REGS;hr++)
7941       {
7942         // Is this really necessary?
7943         if(current.regmap[hr]==0) current.regmap[hr]=-1;
7944       }
7945       current.isconst=0;
7946       current.waswritten=0;
7947     }
7948     if(i>1)
7949     {
7950       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
7951       {
7952         if(rs1[i-2]==0||rs2[i-2]==0)
7953         {
7954           if(rs1[i-2]) {
7955             current.is32|=1LL<<rs1[i-2];
7956             int hr=get_reg(current.regmap,rs1[i-2]|64);
7957             if(hr>=0) current.regmap[hr]=-1;
7958           }
7959           if(rs2[i-2]) {
7960             current.is32|=1LL<<rs2[i-2];
7961             int hr=get_reg(current.regmap,rs2[i-2]|64);
7962             if(hr>=0) current.regmap[hr]=-1;
7963           }
7964         }
7965       }
7966     }
7967     current.is32=-1LL;
7968
7969     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
7970     regs[i].wasconst=current.isconst;
7971     regs[i].was32=current.is32;
7972     regs[i].wasdirty=current.dirty;
7973     regs[i].loadedconst=0;
7974     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
7975       if(i+1<slen) {
7976         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
7977         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
7978         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
7979         current.u|=1;
7980         current.uu|=1;
7981       } else {
7982         current.u=1;
7983         current.uu=1;
7984       }
7985     } else {
7986       if(i+1<slen) {
7987         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
7988         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
7989         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
7990         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7991         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
7992         current.u|=1;
7993         current.uu|=1;
7994       } else { SysPrintf("oops, branch at end of block with no delay slot\n");exit(1); }
7995     }
7996     is_ds[i]=ds;
7997     if(ds) {
7998       ds=0; // Skip delay slot, already allocated as part of branch
7999       // ...but we need to alloc it in case something jumps here
8000       if(i+1<slen) {
8001         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8002         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8003       }else{
8004         current.u=branch_unneeded_reg[i-1];
8005         current.uu=branch_unneeded_reg_upper[i-1];
8006       }
8007       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8008       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8009       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8010       current.u|=1;
8011       current.uu|=1;
8012       struct regstat temp;
8013       memcpy(&temp,&current,sizeof(current));
8014       temp.wasdirty=temp.dirty;
8015       temp.was32=temp.is32;
8016       // TODO: Take into account unconditional branches, as below
8017       delayslot_alloc(&temp,i);
8018       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8019       regs[i].wasdirty=temp.wasdirty;
8020       regs[i].was32=temp.was32;
8021       regs[i].dirty=temp.dirty;
8022       regs[i].is32=temp.is32;
8023       regs[i].isconst=0;
8024       regs[i].wasconst=0;
8025       current.isconst=0;
8026       // Create entry (branch target) regmap
8027       for(hr=0;hr<HOST_REGS;hr++)
8028       {
8029         int r=temp.regmap[hr];
8030         if(r>=0) {
8031           if(r!=regmap_pre[i][hr]) {
8032             regs[i].regmap_entry[hr]=-1;
8033           }
8034           else
8035           {
8036             if(r<64){
8037               if((current.u>>r)&1) {
8038                 regs[i].regmap_entry[hr]=-1;
8039                 regs[i].regmap[hr]=-1;
8040                 //Don't clear regs in the delay slot as the branch might need them
8041                 //current.regmap[hr]=-1;
8042               }else
8043                 regs[i].regmap_entry[hr]=r;
8044             }
8045             else {
8046               if((current.uu>>(r&63))&1) {
8047                 regs[i].regmap_entry[hr]=-1;
8048                 regs[i].regmap[hr]=-1;
8049                 //Don't clear regs in the delay slot as the branch might need them
8050                 //current.regmap[hr]=-1;
8051               }else
8052                 regs[i].regmap_entry[hr]=r;
8053             }
8054           }
8055         } else {
8056           // First instruction expects CCREG to be allocated
8057           if(i==0&&hr==HOST_CCREG)
8058             regs[i].regmap_entry[hr]=CCREG;
8059           else
8060             regs[i].regmap_entry[hr]=-1;
8061         }
8062       }
8063     }
8064     else { // Not delay slot
8065       switch(itype[i]) {
8066         case UJUMP:
8067           //current.isconst=0; // DEBUG
8068           //current.wasconst=0; // DEBUG
8069           //regs[i].wasconst=0; // DEBUG
8070           clear_const(&current,rt1[i]);
8071           alloc_cc(&current,i);
8072           dirty_reg(&current,CCREG);
8073           if (rt1[i]==31) {
8074             alloc_reg(&current,i,31);
8075             dirty_reg(&current,31);
8076             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8077             //assert(rt1[i+1]!=rt1[i]);
8078             #ifdef REG_PREFETCH
8079             alloc_reg(&current,i,PTEMP);
8080             #endif
8081             //current.is32|=1LL<<rt1[i];
8082           }
8083           ooo[i]=1;
8084           delayslot_alloc(&current,i+1);
8085           //current.isconst=0; // DEBUG
8086           ds=1;
8087           //printf("i=%d, isconst=%x\n",i,current.isconst);
8088           break;
8089         case RJUMP:
8090           //current.isconst=0;
8091           //current.wasconst=0;
8092           //regs[i].wasconst=0;
8093           clear_const(&current,rs1[i]);
8094           clear_const(&current,rt1[i]);
8095           alloc_cc(&current,i);
8096           dirty_reg(&current,CCREG);
8097           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8098             alloc_reg(&current,i,rs1[i]);
8099             if (rt1[i]!=0) {
8100               alloc_reg(&current,i,rt1[i]);
8101               dirty_reg(&current,rt1[i]);
8102               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
8103               assert(rt1[i+1]!=rt1[i]);
8104               #ifdef REG_PREFETCH
8105               alloc_reg(&current,i,PTEMP);
8106               #endif
8107             }
8108             #ifdef USE_MINI_HT
8109             if(rs1[i]==31) { // JALR
8110               alloc_reg(&current,i,RHASH);
8111               #ifndef HOST_IMM_ADDR32
8112               alloc_reg(&current,i,RHTBL);
8113               #endif
8114             }
8115             #endif
8116             delayslot_alloc(&current,i+1);
8117           } else {
8118             // The delay slot overwrites our source register,
8119             // so allocate a temporary register to hold the old value.
8120             current.isconst=0;
8121             current.wasconst=0;
8122             regs[i].wasconst=0;
8123             delayslot_alloc(&current,i+1);
8124             current.isconst=0;
8125             alloc_reg(&current,i,RTEMP);
8126           }
8127           //current.isconst=0; // DEBUG
8128           ooo[i]=1;
8129           ds=1;
8130           break;
8131         case CJUMP:
8132           //current.isconst=0;
8133           //current.wasconst=0;
8134           //regs[i].wasconst=0;
8135           clear_const(&current,rs1[i]);
8136           clear_const(&current,rs2[i]);
8137           if((opcode[i]&0x3E)==4) // BEQ/BNE
8138           {
8139             alloc_cc(&current,i);
8140             dirty_reg(&current,CCREG);
8141             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8142             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8143             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8144             {
8145               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8146               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8147             }
8148             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8149                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8150               // The delay slot overwrites one of our conditions.
8151               // Allocate the branch condition registers instead.
8152               current.isconst=0;
8153               current.wasconst=0;
8154               regs[i].wasconst=0;
8155               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8156               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8157               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8158               {
8159                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8160                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8161               }
8162             }
8163             else
8164             {
8165               ooo[i]=1;
8166               delayslot_alloc(&current,i+1);
8167             }
8168           }
8169           else
8170           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8171           {
8172             alloc_cc(&current,i);
8173             dirty_reg(&current,CCREG);
8174             alloc_reg(&current,i,rs1[i]);
8175             if(!(current.is32>>rs1[i]&1))
8176             {
8177               alloc_reg64(&current,i,rs1[i]);
8178             }
8179             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8180               // The delay slot overwrites one of our conditions.
8181               // Allocate the branch condition registers instead.
8182               current.isconst=0;
8183               current.wasconst=0;
8184               regs[i].wasconst=0;
8185               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8186               if(!((current.is32>>rs1[i])&1))
8187               {
8188                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8189               }
8190             }
8191             else
8192             {
8193               ooo[i]=1;
8194               delayslot_alloc(&current,i+1);
8195             }
8196           }
8197           else
8198           // Don't alloc the delay slot yet because we might not execute it
8199           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8200           {
8201             current.isconst=0;
8202             current.wasconst=0;
8203             regs[i].wasconst=0;
8204             alloc_cc(&current,i);
8205             dirty_reg(&current,CCREG);
8206             alloc_reg(&current,i,rs1[i]);
8207             alloc_reg(&current,i,rs2[i]);
8208             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8209             {
8210               alloc_reg64(&current,i,rs1[i]);
8211               alloc_reg64(&current,i,rs2[i]);
8212             }
8213           }
8214           else
8215           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8216           {
8217             current.isconst=0;
8218             current.wasconst=0;
8219             regs[i].wasconst=0;
8220             alloc_cc(&current,i);
8221             dirty_reg(&current,CCREG);
8222             alloc_reg(&current,i,rs1[i]);
8223             if(!(current.is32>>rs1[i]&1))
8224             {
8225               alloc_reg64(&current,i,rs1[i]);
8226             }
8227           }
8228           ds=1;
8229           //current.isconst=0;
8230           break;
8231         case SJUMP:
8232           //current.isconst=0;
8233           //current.wasconst=0;
8234           //regs[i].wasconst=0;
8235           clear_const(&current,rs1[i]);
8236           clear_const(&current,rt1[i]);
8237           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8238           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8239           {
8240             alloc_cc(&current,i);
8241             dirty_reg(&current,CCREG);
8242             alloc_reg(&current,i,rs1[i]);
8243             if(!(current.is32>>rs1[i]&1))
8244             {
8245               alloc_reg64(&current,i,rs1[i]);
8246             }
8247             if (rt1[i]==31) { // BLTZAL/BGEZAL
8248               alloc_reg(&current,i,31);
8249               dirty_reg(&current,31);
8250               //#ifdef REG_PREFETCH
8251               //alloc_reg(&current,i,PTEMP);
8252               //#endif
8253               //current.is32|=1LL<<rt1[i];
8254             }
8255             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
8256                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
8257               // Allocate the branch condition registers instead.
8258               current.isconst=0;
8259               current.wasconst=0;
8260               regs[i].wasconst=0;
8261               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8262               if(!((current.is32>>rs1[i])&1))
8263               {
8264                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8265               }
8266             }
8267             else
8268             {
8269               ooo[i]=1;
8270               delayslot_alloc(&current,i+1);
8271             }
8272           }
8273           else
8274           // Don't alloc the delay slot yet because we might not execute it
8275           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
8276           {
8277             current.isconst=0;
8278             current.wasconst=0;
8279             regs[i].wasconst=0;
8280             alloc_cc(&current,i);
8281             dirty_reg(&current,CCREG);
8282             alloc_reg(&current,i,rs1[i]);
8283             if(!(current.is32>>rs1[i]&1))
8284             {
8285               alloc_reg64(&current,i,rs1[i]);
8286             }
8287           }
8288           ds=1;
8289           //current.isconst=0;
8290           break;
8291         case FJUMP:
8292           current.isconst=0;
8293           current.wasconst=0;
8294           regs[i].wasconst=0;
8295           if(likely[i]==0) // BC1F/BC1T
8296           {
8297             // TODO: Theoretically we can run out of registers here on x86.
8298             // The delay slot can allocate up to six, and we need to check
8299             // CSREG before executing the delay slot.  Possibly we can drop
8300             // the cycle count and then reload it after checking that the
8301             // FPU is in a usable state, or don't do out-of-order execution.
8302             alloc_cc(&current,i);
8303             dirty_reg(&current,CCREG);
8304             alloc_reg(&current,i,FSREG);
8305             alloc_reg(&current,i,CSREG);
8306             if(itype[i+1]==FCOMP) {
8307               // The delay slot overwrites the branch condition.
8308               // Allocate the branch condition registers instead.
8309               alloc_cc(&current,i);
8310               dirty_reg(&current,CCREG);
8311               alloc_reg(&current,i,CSREG);
8312               alloc_reg(&current,i,FSREG);
8313             }
8314             else {
8315               ooo[i]=1;
8316               delayslot_alloc(&current,i+1);
8317               alloc_reg(&current,i+1,CSREG);
8318             }
8319           }
8320           else
8321           // Don't alloc the delay slot yet because we might not execute it
8322           if(likely[i]) // BC1FL/BC1TL
8323           {
8324             alloc_cc(&current,i);
8325             dirty_reg(&current,CCREG);
8326             alloc_reg(&current,i,CSREG);
8327             alloc_reg(&current,i,FSREG);
8328           }
8329           ds=1;
8330           current.isconst=0;
8331           break;
8332         case IMM16:
8333           imm16_alloc(&current,i);
8334           break;
8335         case LOAD:
8336         case LOADLR:
8337           load_alloc(&current,i);
8338           break;
8339         case STORE:
8340         case STORELR:
8341           store_alloc(&current,i);
8342           break;
8343         case ALU:
8344           alu_alloc(&current,i);
8345           break;
8346         case SHIFT:
8347           shift_alloc(&current,i);
8348           break;
8349         case MULTDIV:
8350           multdiv_alloc(&current,i);
8351           break;
8352         case SHIFTIMM:
8353           shiftimm_alloc(&current,i);
8354           break;
8355         case MOV:
8356           mov_alloc(&current,i);
8357           break;
8358         case COP0:
8359           cop0_alloc(&current,i);
8360           break;
8361         case COP1:
8362         case COP2:
8363           cop1_alloc(&current,i);
8364           break;
8365         case C1LS:
8366           c1ls_alloc(&current,i);
8367           break;
8368         case C2LS:
8369           c2ls_alloc(&current,i);
8370           break;
8371         case C2OP:
8372           c2op_alloc(&current,i);
8373           break;
8374         case FCONV:
8375           fconv_alloc(&current,i);
8376           break;
8377         case FLOAT:
8378           float_alloc(&current,i);
8379           break;
8380         case FCOMP:
8381           fcomp_alloc(&current,i);
8382           break;
8383         case SYSCALL:
8384         case HLECALL:
8385         case INTCALL:
8386           syscall_alloc(&current,i);
8387           break;
8388         case SPAN:
8389           pagespan_alloc(&current,i);
8390           break;
8391       }
8392
8393       // Drop the upper half of registers that have become 32-bit
8394       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8395       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8396         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8397         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8398         current.uu|=1;
8399       } else {
8400         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8401         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8402         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8403         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8404         current.uu|=1;
8405       }
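           // Reminder: bit r of .uu / unneeded_reg_upper means the upper 32
           // bits of MIPS register r are not needed, so a set is32 bit lets
           // the upper half be dropped right after the write.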
8406
8407       // Create entry (branch target) regmap
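           // (regmap_entry[] is what a jump into this instruction must
           // provide; regmap[] is the mapping after this instruction's own
           // allocations.)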
8408       for(hr=0;hr<HOST_REGS;hr++)
8409       {
8410         int r,or;
8411         r=current.regmap[hr];
8412         if(r>=0) {
8413           if(r!=regmap_pre[i][hr]) {
8414             // TODO: delay slot (?)
8415             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8416             if(or<0||(r&63)>=TEMPREG){
8417               regs[i].regmap_entry[hr]=-1;
8418             }
8419             else
8420             {
8421               // Just move it to a different register
8422               regs[i].regmap_entry[hr]=r;
8423               // If it was dirty before, it's still dirty
8424               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8425             }
8426           }
8427           else
8428           {
8429             // Unneeded
8430             if(r==0){
8431               regs[i].regmap_entry[hr]=0;
8432             }
8433             else
8434             if(r<64){
8435               if((current.u>>r)&1) {
8436                 regs[i].regmap_entry[hr]=-1;
8437                 //regs[i].regmap[hr]=-1;
8438                 current.regmap[hr]=-1;
8439               }else
8440                 regs[i].regmap_entry[hr]=r;
8441             }
8442             else {
8443               if((current.uu>>(r&63))&1) {
8444                 regs[i].regmap_entry[hr]=-1;
8445                 //regs[i].regmap[hr]=-1;
8446                 current.regmap[hr]=-1;
8447               }else
8448                 regs[i].regmap_entry[hr]=r;
8449             }
8450           }
8451         } else {
8452           // Branches expect CCREG to be allocated at the target
8453           if(regmap_pre[i][hr]==CCREG)
8454             regs[i].regmap_entry[hr]=CCREG;
8455           else
8456             regs[i].regmap_entry[hr]=-1;
8457         }
8458       }
8459       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8460     }
8461
8462     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
8463       current.waswritten|=1<<rs1[i-1];
8464     current.waswritten&=~(1<<rt1[i]);
8465     current.waswritten&=~(1<<rt2[i]);
8466     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
8467       current.waswritten&=~(1<<rs1[i]);
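         // waswritten tracks MIPS registers recently used as the base of a
         // store with a small (<0x800) offset; overwriting the register or
         // using it with a large offset clears the bit.  Presumably this is
         // consumed later to skip redundant invalidation work for repeated
         // nearby stores (e.g. to the stack).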
8468
8469     /* Branch post-alloc */
8470     if(i>0)
8471     {
8472       current.was32=current.is32;
8473       current.wasdirty=current.dirty;
8474       switch(itype[i-1]) {
8475         case UJUMP:
8476           memcpy(&branch_regs[i-1],&current,sizeof(current));
8477           branch_regs[i-1].isconst=0;
8478           branch_regs[i-1].wasconst=0;
8479           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8480           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8481           alloc_cc(&branch_regs[i-1],i-1);
8482           dirty_reg(&branch_regs[i-1],CCREG);
8483           if(rt1[i-1]==31) { // JAL
8484             alloc_reg(&branch_regs[i-1],i-1,31);
8485             dirty_reg(&branch_regs[i-1],31);
8486             branch_regs[i-1].is32|=1LL<<31;
8487           }
8488           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8489           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8490           break;
8491         case RJUMP:
8492           memcpy(&branch_regs[i-1],&current,sizeof(current));
8493           branch_regs[i-1].isconst=0;
8494           branch_regs[i-1].wasconst=0;
8495           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8496           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8497           alloc_cc(&branch_regs[i-1],i-1);
8498           dirty_reg(&branch_regs[i-1],CCREG);
8499           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8500           if(rt1[i-1]!=0) { // JALR
8501             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
8502             dirty_reg(&branch_regs[i-1],rt1[i-1]);
8503             branch_regs[i-1].is32|=1LL<<rt1[i-1];
8504           }
8505           #ifdef USE_MINI_HT
8506           if(rs1[i-1]==31) { // JALR
8507             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8508             #ifndef HOST_IMM_ADDR32
8509             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8510             #endif
8511           }
8512           #endif
8513           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8514           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8515           break;
8516         case CJUMP:
8517           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8518           {
8519             alloc_cc(&current,i-1);
8520             dirty_reg(&current,CCREG);
8521             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8522                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8523               // The delay slot overwrote one of our conditions
8524               // Delay slot goes after the test (in order)
8525               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8526               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8527               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8528               current.u|=1;
8529               current.uu|=1;
8530               delayslot_alloc(&current,i);
8531               current.isconst=0;
8532             }
8533             else
8534             {
8535               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8536               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8537               // Alloc the branch condition registers
8538               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8539               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8540               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8541               {
8542                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8543                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8544               }
8545             }
8546             memcpy(&branch_regs[i-1],&current,sizeof(current));
8547             branch_regs[i-1].isconst=0;
8548             branch_regs[i-1].wasconst=0;
8549             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8550             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8551           }
8552           else
8553           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8554           {
8555             alloc_cc(&current,i-1);
8556             dirty_reg(&current,CCREG);
8557             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8558               // The delay slot overwrote the branch condition
8559               // Delay slot goes after the test (in order)
8560               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8561               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8562               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8563               current.u|=1;
8564               current.uu|=1;
8565               delayslot_alloc(&current,i);
8566               current.isconst=0;
8567             }
8568             else
8569             {
8570               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8571               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8572               // Alloc the branch condition register
8573               alloc_reg(&current,i-1,rs1[i-1]);
8574               if(!(current.is32>>rs1[i-1]&1))
8575               {
8576                 alloc_reg64(&current,i-1,rs1[i-1]);
8577               }
8578             }
8579             memcpy(&branch_regs[i-1],&current,sizeof(current));
8580             branch_regs[i-1].isconst=0;
8581             branch_regs[i-1].wasconst=0;
8582             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8583             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8584           }
8585           else
8586           // Alloc the delay slot in case the branch is taken
8587           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8588           {
8589             memcpy(&branch_regs[i-1],&current,sizeof(current));
8590             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8591             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8592             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8593             alloc_cc(&branch_regs[i-1],i);
8594             dirty_reg(&branch_regs[i-1],CCREG);
8595             delayslot_alloc(&branch_regs[i-1],i);
8596             branch_regs[i-1].isconst=0;
8597             alloc_reg(&current,i,CCREG); // Not taken path
8598             dirty_reg(&current,CCREG);
8599             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8600           }
8601           else
8602           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8603           {
8604             memcpy(&branch_regs[i-1],&current,sizeof(current));
8605             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8606             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8607             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8608             alloc_cc(&branch_regs[i-1],i);
8609             dirty_reg(&branch_regs[i-1],CCREG);
8610             delayslot_alloc(&branch_regs[i-1],i);
8611             branch_regs[i-1].isconst=0;
8612             alloc_reg(&current,i,CCREG); // Not taken path
8613             dirty_reg(&current,CCREG);
8614             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8615           }
8616           break;
8617         case SJUMP:
8618           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8619           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8620           {
8621             alloc_cc(&current,i-1);
8622             dirty_reg(&current,CCREG);
8623             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8624               // The delay slot overwrote the branch condition
8625               // Delay slot goes after the test (in order)
8626               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8627               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8628               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8629               current.u|=1;
8630               current.uu|=1;
8631               delayslot_alloc(&current,i);
8632               current.isconst=0;
8633             }
8634             else
8635             {
8636               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8637               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8638               // Alloc the branch condition register
8639               alloc_reg(&current,i-1,rs1[i-1]);
8640               if(!(current.is32>>rs1[i-1]&1))
8641               {
8642                 alloc_reg64(&current,i-1,rs1[i-1]);
8643               }
8644             }
8645             memcpy(&branch_regs[i-1],&current,sizeof(current));
8646             branch_regs[i-1].isconst=0;
8647             branch_regs[i-1].wasconst=0;
8648             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8649             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8650           }
8651           else
8652           // Alloc the delay slot in case the branch is taken
8653           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8654           {
8655             memcpy(&branch_regs[i-1],&current,sizeof(current));
8656             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8657             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8658             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8659             alloc_cc(&branch_regs[i-1],i);
8660             dirty_reg(&branch_regs[i-1],CCREG);
8661             delayslot_alloc(&branch_regs[i-1],i);
8662             branch_regs[i-1].isconst=0;
8663             alloc_reg(&current,i,CCREG); // Not taken path
8664             dirty_reg(&current,CCREG);
8665             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8666           }
8667           // FIXME: BLTZAL/BGEZAL
8668           if(opcode2[i-1]&0x10) { // BxxZAL
8669             alloc_reg(&branch_regs[i-1],i-1,31);
8670             dirty_reg(&branch_regs[i-1],31);
8671             branch_regs[i-1].is32|=1LL<<31;
8672           }
8673           break;
8674         case FJUMP:
8675           if(likely[i-1]==0) // BC1F/BC1T
8676           {
8677             alloc_cc(&current,i-1);
8678             dirty_reg(&current,CCREG);
8679             if(itype[i]==FCOMP) {
8680               // The delay slot overwrote the branch condition
8681               // Delay slot goes after the test (in order)
8682               delayslot_alloc(&current,i);
8683               current.isconst=0;
8684             }
8685             else
8686             {
8687               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8688               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8689               // Alloc the branch condition register
8690               alloc_reg(&current,i-1,FSREG);
8691             }
8692             memcpy(&branch_regs[i-1],&current,sizeof(current));
8693             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8694           }
8695           else // BC1FL/BC1TL
8696           {
8697             // Alloc the delay slot in case the branch is taken
8698             memcpy(&branch_regs[i-1],&current,sizeof(current));
8699             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8700             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8701             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8702             alloc_cc(&branch_regs[i-1],i);
8703             dirty_reg(&branch_regs[i-1],CCREG);
8704             delayslot_alloc(&branch_regs[i-1],i);
8705             branch_regs[i-1].isconst=0;
8706             alloc_reg(&current,i,CCREG); // Not taken path
8707             dirty_reg(&current,CCREG);
8708             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8709           }
8710           break;
8711       }
8712
8713       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
8714       {
8715         if(rt1[i-1]==31) // JAL/JALR
8716         {
8717           // Subroutine call will return here, so don't alloc any registers
8718           current.is32=1;
8719           current.dirty=0;
8720           clear_all_regs(current.regmap);
8721           alloc_reg(&current,i,CCREG);
8722           dirty_reg(&current,CCREG);
8723         }
8724         else if(i+1<slen)
8725         {
8726           // An internal branch will jump here; match registers to the branching code
8727           current.is32=0x3FFFFFFFFLL;
8728           current.dirty=0;
8729           clear_all_regs(current.regmap);
8730           alloc_reg(&current,i,CCREG);
8731           dirty_reg(&current,CCREG);
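               // Seed the mapping from one branch that targets this point,
               // then intersect it with every other such branch below so only
               // registers mapped identically by all predecessors stay
               // allocated.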
8732           for(j=i-1;j>=0;j--)
8733           {
8734             if(ba[j]==start+i*4+4) {
8735               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
8736               current.is32=branch_regs[j].is32;
8737               current.dirty=branch_regs[j].dirty;
8738               break;
8739             }
8740           }
8741           while(j>=0) {
8742             if(ba[j]==start+i*4+4) {
8743               for(hr=0;hr<HOST_REGS;hr++) {
8744                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
8745                   current.regmap[hr]=-1;
8746                 }
8747                 current.is32&=branch_regs[j].is32;
8748                 current.dirty&=branch_regs[j].dirty;
8749               }
8750             }
8751             j--;
8752           }
8753         }
8754       }
8755     }
8756
8757     // Count cycles in between branches
8758     ccadj[i]=cc;
8759     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
8760     {
8761       cc=0;
8762     }
8763 #if !defined(DRC_DBG)
8764     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
8765     {
8766       // GTE runs in parallel until accessed, divide by 2 for a rough guess
8767       cc+=gte_cycletab[source[i]&0x3f]/2;
8768     }
8769     else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // loads/stores cause weird timing issues
8770     {
8771       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
8772     }
8773     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
8774     {
8775       cc+=4;
8776     }
8777     else if(itype[i]==C2LS)
8778     {
8779       cc+=4;
8780     }
8781 #endif
8782     else
8783     {
8784       cc++;
8785     }
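         // cc accumulates an approximate cycle count since the last branch
         // or syscall; ccadj[i] snapshots it per instruction so the emitted
         // code can charge cycles in batches at branch boundaries.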
8786
8787     flush_dirty_uppers(&current);
8788     if(!is_ds[i]) {
8789       regs[i].is32=current.is32;
8790       regs[i].dirty=current.dirty;
8791       regs[i].isconst=current.isconst;
8792       memcpy(constmap[i],current_constmap,sizeof(current_constmap));
8793     }
8794     for(hr=0;hr<HOST_REGS;hr++) {
8795       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
8796         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
8797           regs[i].wasconst&=~(1<<hr);
8798         }
8799       }
8800     }
8801     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
8802     regs[i].waswritten=current.waswritten;
8803   }
8804
8805   /* Pass 4 - Cull unused host registers */
8806
8807   uint64_t nr=0;
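       // nr is a bitmask over host registers: bit hr set means the value in
       // host register hr is still needed at instruction i.  The scan runs
       // backwards so each needed_reg[i] can be seeded from the following
       // instructions, or from the target's entry mapping at a branch.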
8808
8809   for (i=slen-1;i>=0;i--)
8810   {
8811     int hr;
8812     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8813     {
8814       if(ba[i]<start || ba[i]>=(start+slen*4))
8815       {
8816         // Branch out of this block; nothing is needed afterwards
8817         nr=0;
8818       }
8819       else
8820       {
8821         // Internal branch
8822         // Need whatever matches the target
8823         nr=0;
8824         int t=(ba[i]-start)>>2;
8825         for(hr=0;hr<HOST_REGS;hr++)
8826         {
8827           if(regs[i].regmap_entry[hr]>=0) {
8828             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
8829           }
8830         }
8831       }
8832       // Conditional branch may need registers for following instructions
8833       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8834       {
8835         if(i<slen-2) {
8836           nr|=needed_reg[i+2];
8837           for(hr=0;hr<HOST_REGS;hr++)
8838           {
8839             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
8840             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
8841           }
8842         }
8843       }
8844       // Don't need stuff which is overwritten
8845       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8846       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8847       // Merge in delay slot
8848       for(hr=0;hr<HOST_REGS;hr++)
8849       {
8850         if(!likely[i]) {
8851           // These are overwritten unless the branch is "likely"
8852           // and the delay slot is nullified if not taken
8853           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8854           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8855         }
8856         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8857         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8858         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8859         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8860         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8861         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8862         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8863         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8864         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
8865           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8866           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8867         }
8868         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
8869           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8870           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8871         }
8872         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
8873           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8874           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8875         }
8876       }
8877     }
8878     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
8879     {
8880       // SYSCALL instruction (software interrupt)
8881       nr=0;
8882     }
8883     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
8884     {
8885       // ERET instruction (return from interrupt)
8886       nr=0;
8887     }
8888     else // Non-branch
8889     {
8890       if(i<slen-1) {
8891         for(hr=0;hr<HOST_REGS;hr++) {
8892           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
8893           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
8894           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8895           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8896         }
8897       }
8898     }
8899     for(hr=0;hr<HOST_REGS;hr++)
8900     {
8901       // Overwritten registers are not needed
8902       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8903       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8904       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8905       // Source registers are needed
8906       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8907       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8908       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
8909       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
8910       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8911       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8912       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8913       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8914       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
8915         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8916         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8917       }
8918       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
8919         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8920         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8921       }
8922       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
8923         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8924         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8925       }
8926       // Don't store a register immediately after writing it,
8927       // as that may prevent dual-issue.
8928       // But do so if this is a branch target, otherwise we
8929       // might have to load the register before the branch.
8930       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
8931         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
8932            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
8933           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8934           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8935         }
8936         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
8937            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
8938           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8939           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8940         }
8941       }
8942     }
8943     // Cycle count is needed at branches.  Assume it is needed at the target too.
8944     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
8945       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
8946       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
8947     }
8948     // Save it
8949     needed_reg[i]=nr;
8950
8951     // Deallocate unneeded registers
8952     for(hr=0;hr<HOST_REGS;hr++)
8953     {
8954       if(!((nr>>hr)&1)) {
8955         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
8956         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
8957            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8958            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
8959         {
8960           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8961           {
8962             if(likely[i]) {
8963               regs[i].regmap[hr]=-1;
8964               regs[i].isconst&=~(1<<hr);
8965               if(i<slen-2) {
8966                 regmap_pre[i+2][hr]=-1;
8967                 regs[i+2].wasconst&=~(1<<hr);
8968               }
8969             }
8970           }
8971         }
8972         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8973         {
8974           int d1=0,d2=0,map=0,temp=0;
8975           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
8976           {
8977             d1=dep1[i+1];
8978             d2=dep2[i+1];
8979           }
8980           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
8981              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
8982             map=INVCP;
8983           }
8984           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
8985              itype[i+1]==C1LS || itype[i+1]==C2LS)
8986             temp=FTEMP;
8987           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
8988              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8989              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
8990              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
8991              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
8992              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
8993              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
8994              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
8995              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
8996              regs[i].regmap[hr]!=map )
8997           {
8998             regs[i].regmap[hr]=-1;
8999             regs[i].isconst&=~(1<<hr);
9000             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9001                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9002                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9003                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9004                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9005                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9006                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9007                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9008                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9009                branch_regs[i].regmap[hr]!=map)
9010             {
9011               branch_regs[i].regmap[hr]=-1;
9012               branch_regs[i].regmap_entry[hr]=-1;
9013               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9014               {
9015                 if(!likely[i]&&i<slen-2) {
9016                   regmap_pre[i+2][hr]=-1;
9017                   regs[i+2].wasconst&=~(1<<hr);
9018                 }
9019               }
9020             }
9021           }
9022         }
9023         else
9024         {
9025           // Non-branch
9026           if(i>0)
9027           {
9028             int d1=0,d2=0,map=-1,temp=-1;
9029             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9030             {
9031               d1=dep1[i];
9032               d2=dep2[i];
9033             }
9034             if(itype[i]==STORE || itype[i]==STORELR ||
9035                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9036               map=INVCP;
9037             }
9038             if(itype[i]==LOADLR || itype[i]==STORELR ||
9039                itype[i]==C1LS || itype[i]==C2LS)
9040               temp=FTEMP;
9041             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9042                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9043                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9044                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9045                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9046                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9047             {
9048               if(i<slen-1&&!is_ds[i]) {
9049                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9050                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9051                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9052                 {
9053                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9054                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9055                 }
9056                 regmap_pre[i+1][hr]=-1;
9057                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9058                 regs[i+1].wasconst&=~(1<<hr);
9059               }
9060               regs[i].regmap[hr]=-1;
9061               regs[i].isconst&=~(1<<hr);
9062             }
9063           }
9064         }
9065       }
9066     }
9067   }
9068
9069   /* Pass 5 - Pre-allocate registers */
9070
9071   // If a register is allocated during a loop, try to allocate it for the
9072   // entire loop, if possible.  This avoids loading/storing registers
9073   // inside of the loop.
9074
9075   signed char f_regmap[HOST_REGS];
9076   clear_all_regs(f_regmap);
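       // f_regmap is the candidate loop-wide mapping: at a backward branch,
       // the registers live at the branch are recorded here and, when the
       // same host register can be kept from the branch target t up to the
       // branch itself, the allocation is extended across that whole range
       // so the value isn't reloaded on every iteration.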
9077   for(i=0;i<slen-1;i++)
9078   {
9079     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9080     {
9081       if(ba[i]>=start && ba[i]<(start+i*4))
9082       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9083       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9084       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9085       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9086       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
9087       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
9088       {
9089         int t=(ba[i]-start)>>2;
9090         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9091         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
9092         for(hr=0;hr<HOST_REGS;hr++)
9093         {
9094           if(regs[i].regmap[hr]>64) {
9095             if(!((regs[i].dirty>>hr)&1))
9096               f_regmap[hr]=regs[i].regmap[hr];
9097             else f_regmap[hr]=-1;
9098           }
9099           else if(regs[i].regmap[hr]>=0) {
9100             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9101               // dealloc old register
9102               int n;
9103               for(n=0;n<HOST_REGS;n++)
9104               {
9105                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9106               }
9107               // and alloc new one
9108               f_regmap[hr]=regs[i].regmap[hr];
9109             }
9110           }
9111           if(branch_regs[i].regmap[hr]>64) {
9112             if(!((branch_regs[i].dirty>>hr)&1))
9113               f_regmap[hr]=branch_regs[i].regmap[hr];
9114             else f_regmap[hr]=-1;
9115           }
9116           else if(branch_regs[i].regmap[hr]>=0) {
9117             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
9118               // dealloc old register
9119               int n;
9120               for(n=0;n<HOST_REGS;n++)
9121               {
9122                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
9123               }
9124               // and alloc new one
9125               f_regmap[hr]=branch_regs[i].regmap[hr];
9126             }
9127           }
9128           if(ooo[i]) {
9129             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
9130               f_regmap[hr]=branch_regs[i].regmap[hr];
9131           }else{
9132             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
9133               f_regmap[hr]=branch_regs[i].regmap[hr];
9134           }
9135           // Avoid dirty->clean transition
9136           #ifdef DESTRUCTIVE_WRITEBACK
9137           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9138           #endif
9139           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
9140           // case above, but it's always a good idea.  We can't hoist the
9141           // load if the register was already allocated, so there's no point
9142           // wasting time analyzing most of these cases.  It only "succeeds"
9143           // when the mapping was different and the load can be replaced with
9144           // a mov, which is of negligible benefit.  So such cases are
9145           // skipped below.
9146           if(f_regmap[hr]>0) {
9147             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
9148               int r=f_regmap[hr];
9149               for(j=t;j<=i;j++)
9150               {
9151                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9152                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9153                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9154                 if(r>63) {
9155                   // NB This can exclude the case where the upper-half
9156                   // register is lower numbered than the lower-half
9157                   // register.  Not sure if it's worth fixing...
9158                   if(get_reg(regs[j].regmap,r&63)<0) break;
9159                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
9160                   if(regs[j].is32&(1LL<<(r&63))) break;
9161                 }
9162                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9163                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9164                   int k;
9165                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9166                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9167                     if(r>63) {
9168                       if(get_reg(regs[i].regmap,r&63)<0) break;
9169                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9170                     }
9171                     k=i;
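                         // Search backwards from the branch for the earliest
                         // point where this host register is still free, so
                         // the preload can be hoisted as far up as possible.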
9172                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9173                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9174                         //printf("no free regs for store %x\n",start+(k-1)*4);
9175                         break;
9176                       }
9177                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9178                         //printf("no-match due to different register\n");
9179                         break;
9180                       }
9181                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9182                         //printf("no-match due to branch\n");
9183                         break;
9184                       }
9185                       // call/ret fast path assumes no registers allocated
9186                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
9187                         break;
9188                       }
9189                       if(r>63) {
9190                         // NB This can exclude the case where the upper-half
9191                         // register is lower numbered than the lower-half
9192                         // register.  Not sure if it's worth fixing...
9193                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9194                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9195                       }
9196                       k--;
9197                     }
9198                     if(i<slen-1) {
9199                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9200                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9201                         //printf("bad match after branch\n");
9202                         break;
9203                       }
9204                     }
9205                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9206                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
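                           // Propagate the mapping through the gap [k,i),
                           // clearing the const flags and carrying the dirty
                           // state forward from the preceding instruction.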
9207                       while(k<i) {
9208                         regs[k].regmap_entry[hr]=f_regmap[hr];
9209                         regs[k].regmap[hr]=f_regmap[hr];
9210                         regmap_pre[k+1][hr]=f_regmap[hr];
9211                         regs[k].wasdirty&=~(1<<hr);
9212                         regs[k].dirty&=~(1<<hr);
9213                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9214                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9215                         regs[k].wasconst&=~(1<<hr);
9216                         regs[k].isconst&=~(1<<hr);
9217                         k++;
9218                       }
9219                     }
9220                     else {
9221                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9222                       break;
9223                     }
9224                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9225                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9226                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9227                       regs[i].regmap_entry[hr]=f_regmap[hr];
9228                       regs[i].regmap[hr]=f_regmap[hr];
9229                       regs[i].wasdirty&=~(1<<hr);
9230                       regs[i].dirty&=~(1<<hr);
9231                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9232                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9233                       regs[i].wasconst&=~(1<<hr);
9234                       regs[i].isconst&=~(1<<hr);
9235                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9236                       branch_regs[i].wasdirty&=~(1<<hr);
9237                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9238                       branch_regs[i].regmap[hr]=f_regmap[hr];
9239                       branch_regs[i].dirty&=~(1<<hr);
9240                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9241                       branch_regs[i].wasconst&=~(1<<hr);
9242                       branch_regs[i].isconst&=~(1<<hr);
9243                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9244                         regmap_pre[i+2][hr]=f_regmap[hr];
9245                         regs[i+2].wasdirty&=~(1<<hr);
9246                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9247                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9248                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9249                       }
9250                     }
9251                   }
9252                   for(k=t;k<j;k++) {
9253                     // Alloc register clean at beginning of loop,
9254                     // but may dirty it in pass 6
9255                     regs[k].regmap_entry[hr]=f_regmap[hr];
9256                     regs[k].regmap[hr]=f_regmap[hr];
9257                     regs[k].dirty&=~(1<<hr);
9258                     regs[k].wasconst&=~(1<<hr);
9259                     regs[k].isconst&=~(1<<hr);
9260                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
9261                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
9262                       branch_regs[k].regmap[hr]=f_regmap[hr];
9263                       branch_regs[k].dirty&=~(1<<hr);
9264                       branch_regs[k].wasconst&=~(1<<hr);
9265                       branch_regs[k].isconst&=~(1<<hr);
9266                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
9267                         regmap_pre[k+2][hr]=f_regmap[hr];
9268                         regs[k+2].wasdirty&=~(1<<hr);
9269                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
9270                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
9271                       }
9272                     }
9273                     else
9274                     {
9275                       regmap_pre[k+1][hr]=f_regmap[hr];
9276                       regs[k+1].wasdirty&=~(1<<hr);
9277                     }
9278                   }
9279                   if(regs[j].regmap[hr]==f_regmap[hr])
9280                     regs[j].regmap_entry[hr]=f_regmap[hr];
9281                   break;
9282                 }
9283                 if(j==i) break;
9284                 if(regs[j].regmap[hr]>=0)
9285                   break;
9286                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
9287                   //printf("no-match due to different register\n");
9288                   break;
9289                 }
9290                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
9291                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
9292                   break;
9293                 }
9294                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9295                 {
9296                   // Stop on unconditional branch
9297                   break;
9298                 }
9299                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
9300                 {
9301                   if(ooo[j]) {
9302                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
9303                       break;
9304                   }else{
9305                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
9306                       break;
9307                   }
9308                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
9309                     //printf("no-match due to different register (branch)\n");
9310                     break;
9311                   }
9312                 }
9313                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9314                   //printf("No free regs for store %x\n",start+j*4);
9315                   break;
9316                 }
9317                 if(f_regmap[hr]>=64) {
9318                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
9319                     break;
9320                   }
9321                   else
9322                   {
9323                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
9324                       break;
9325                     }
9326                   }
9327                 }
9328               }
9329             }
9330           }
9331         }
9332       }
9333     }else{
9334       // Non-branch or undetermined branch target
9335       for(hr=0;hr<HOST_REGS;hr++)
9336       {
9337         if(hr!=EXCLUDE_REG) {
9338           if(regs[i].regmap[hr]>64) {
9339             if(!((regs[i].dirty>>hr)&1))
9340               f_regmap[hr]=regs[i].regmap[hr];
9341           }
9342           else if(regs[i].regmap[hr]>=0) {
9343             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9344               // dealloc old register
9345               int n;
9346               for(n=0;n<HOST_REGS;n++)
9347               {
9348                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9349               }
9350               // and alloc new one
9351               f_regmap[hr]=regs[i].regmap[hr];
9352             }
9353           }
9354         }
9355       }
9356       // Try to restore cycle count at branch targets
9357       if(bt[i]) {
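             // Look ahead for the next instruction that has something in
             // HOST_CCREG; if it is the cycle count, keep CCREG in that
             // register across the gap instead of reloading it.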
9358         for(j=i;j<slen-1;j++) {
9359           if(regs[j].regmap[HOST_CCREG]!=-1) break;
9360           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9361             //printf("no free regs for store %x\n",start+j*4);
9362             break;
9363           }
9364         }
9365         if(regs[j].regmap[HOST_CCREG]==CCREG) {
9366           int k=i;
9367           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
9368           while(k<j) {
9369             regs[k].regmap_entry[HOST_CCREG]=CCREG;
9370             regs[k].regmap[HOST_CCREG]=CCREG;
9371             regmap_pre[k+1][HOST_CCREG]=CCREG;
9372             regs[k+1].wasdirty|=1<<HOST_CCREG;
9373             regs[k].dirty|=1<<HOST_CCREG;
9374             regs[k].wasconst&=~(1<<HOST_CCREG);
9375             regs[k].isconst&=~(1<<HOST_CCREG);
9376             k++;
9377           }
9378           regs[j].regmap_entry[HOST_CCREG]=CCREG;
9379         }
9380         // Work backwards from the branch target
9381         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9382         {
9383           //printf("Extend backwards\n");
9384           int k;
9385           k=i;
9386           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9387             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9388               //printf("no free regs for store %x\n",start+(k-1)*4);
9389               break;
9390             }
9391             k--;
9392           }
9393           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9394             //printf("Extend CC, %x ->\n",start+k*4);
9395             while(k<=i) {
9396               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9397               regs[k].regmap[HOST_CCREG]=CCREG;
9398               regmap_pre[k+1][HOST_CCREG]=CCREG;
9399               regs[k+1].wasdirty|=1<<HOST_CCREG;
9400               regs[k].dirty|=1<<HOST_CCREG;
9401               regs[k].wasconst&=~(1<<HOST_CCREG);
9402               regs[k].isconst&=~(1<<HOST_CCREG);
9403               k++;
9404             }
9405           }
9406           else {
9407             //printf("Fail Extend CC, %x ->\n",start+k*4);
9408           }
9409         }
9410       }
9411       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9412          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9413          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9414          itype[i]!=FCONV&&itype[i]!=FCOMP)
9415       {
9416         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9417       }
9418     }
9419   }
9420
9421   // Cache memory offset or tlb map pointer if a register is available
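       // (when RAM_OFFSET is not defined, the if(0) below disables this pass)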
9422   #ifndef HOST_IMM_ADDR32
9423   #ifndef RAM_OFFSET
9424   if(0)
9425   #endif
9426   {
9427     int earliest_available[HOST_REGS];
9428     int loop_start[HOST_REGS];
9429     int score[HOST_REGS];
9430     int end[HOST_REGS];
9431     int reg=ROREG;
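         // Score each host register by how many nearby loads/stores (and
         // loops back into the range where it stays free) could use it;
         // the best-scoring register is then dedicated to the memory
         // offset / map pointer over that range, if it scores more than once.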
9432
9433     // Init
9434     for(hr=0;hr<HOST_REGS;hr++) {
9435       score[hr]=0;earliest_available[hr]=0;
9436       loop_start[hr]=MAXBLOCK;
9437     }
9438     for(i=0;i<slen-1;i++)
9439     {
9440       // Can't do anything if no registers are available
9441       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
9442         for(hr=0;hr<HOST_REGS;hr++) {
9443           score[hr]=0;earliest_available[hr]=i+1;
9444           loop_start[hr]=MAXBLOCK;
9445         }
9446       }
9447       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9448         if(!ooo[i]) {
9449           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
9450             for(hr=0;hr<HOST_REGS;hr++) {
9451               score[hr]=0;earliest_available[hr]=i+1;
9452               loop_start[hr]=MAXBLOCK;
9453             }
9454           }
9455         }else{
9456           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
9457             for(hr=0;hr<HOST_REGS;hr++) {
9458               score[hr]=0;earliest_available[hr]=i+1;
9459               loop_start[hr]=MAXBLOCK;
9460             }
9461           }
9462         }
9463       }
9464       // Mark unavailable registers
9465       for(hr=0;hr<HOST_REGS;hr++) {
9466         if(regs[i].regmap[hr]>=0) {
9467           score[hr]=0;earliest_available[hr]=i+1;
9468           loop_start[hr]=MAXBLOCK;
9469         }
9470         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9471           if(branch_regs[i].regmap[hr]>=0) {
9472             score[hr]=0;earliest_available[hr]=i+2;
9473             loop_start[hr]=MAXBLOCK;
9474           }
9475         }
9476       }
9477       // No register allocations after unconditional jumps
9478       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
9479       {
9480         for(hr=0;hr<HOST_REGS;hr++) {
9481           score[hr]=0;earliest_available[hr]=i+2;
9482           loop_start[hr]=MAXBLOCK;
9483         }
9484         i++; // Skip delay slot too
9485         //printf("skip delay slot: %x\n",start+i*4);
9486       }
9487       else
9488       // Possible match
9489       if(itype[i]==LOAD||itype[i]==LOADLR||
9490          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
9491         for(hr=0;hr<HOST_REGS;hr++) {
9492           if(hr!=EXCLUDE_REG) {
9493             end[hr]=i-1;
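                 // Scan forward from i, scoring memory accesses and loop
                 // branches that could share this register, until something
                 // else claims it.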
9494             for(j=i;j<slen-1;j++) {
9495               if(regs[j].regmap[hr]>=0) break;
9496               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9497                 if(branch_regs[j].regmap[hr]>=0) break;
9498                 if(ooo[j]) {
9499                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
9500                 }else{
9501                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
9502                 }
9503               }
9504               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
9505               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9506                 int t=(ba[j]-start)>>2;
9507                 if(t<j&&t>=earliest_available[hr]) {
9508                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
9509                     // Score a point for hoisting loop invariant
9510                     if(t<loop_start[hr]) loop_start[hr]=t;
9511                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
9512                     score[hr]++;
9513                     end[hr]=j;
9514                   }
9515                 }
9516                 else if(t<j) {
9517                   if(regs[t].regmap[hr]==reg) {
9518                     // Score a point if the branch target matches this register
9519                     score[hr]++;
9520                     end[hr]=j;
9521                   }
9522                 }
9523                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
9524                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
9525                   score[hr]++;
9526                   end[hr]=j;
9527                 }
9528               }
9529               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9530               {
9531                 // Stop on unconditional branch
9532                 break;
9533               }
9534               else
9535               if(itype[j]==LOAD||itype[j]==LOADLR||
9536                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
9537                 score[hr]++;
9538                 end[hr]=j;
9539               }
9540             }
9541           }
9542         }
9543         // Find highest score and allocate that register
9544         int maxscore=0;
9545         for(hr=0;hr<HOST_REGS;hr++) {
9546           if(hr!=EXCLUDE_REG) {
9547             if(score[hr]>score[maxscore]) {
9548               maxscore=hr;
9549               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
9550             }
9551           }
9552         }
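             // Only dedicate a register if it would be used more than once.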
9553         if(score[maxscore]>1)
9554         {
9555           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
9556           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
9557             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
9558             assert(regs[j].regmap[maxscore]<0);
9559             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
9560             regs[j].regmap[maxscore]=reg;
9561             regs[j].dirty&=~(1<<maxscore);
9562             regs[j].wasconst&=~(1<<maxscore);
9563             regs[j].isconst&=~(1<<maxscore);
9564             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9565               branch_regs[j].regmap[maxscore]=reg;
9566               branch_regs[j].wasdirty&=~(1<<maxscore);
9567               branch_regs[j].dirty&=~(1<<maxscore);
9568               branch_regs[j].wasconst&=~(1<<maxscore);
9569               branch_regs[j].isconst&=~(1<<maxscore);
9570               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
9571                 regmap_pre[j+2][maxscore]=reg;
9572                 regs[j+2].wasdirty&=~(1<<maxscore);
9573               }
9574               // loop optimization (loop_preload)
9575               int t=(ba[j]-start)>>2;
9576               if(t==loop_start[maxscore]) {
9577                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
9578                   regs[t].regmap_entry[maxscore]=reg;
9579               }
9580             }
9581             else
9582             {
9583               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
9584                 regmap_pre[j+1][maxscore]=reg;
9585                 regs[j+1].wasdirty&=~(1<<maxscore);
9586               }
9587             }
9588           }
9589           i=j-1;
9590           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
9591           for(hr=0;hr<HOST_REGS;hr++) {
9592             score[hr]=0;earliest_available[hr]=i+1;
9593             loop_start[hr]=MAXBLOCK;
9594           }
9595         }
9596       }
9597     }
9598   }
9599   #endif
9600
9601   // This allocates registers (if possible) one instruction prior
9602   // to use, which can avoid a load-use penalty on certain CPUs.
9603   for(i=0;i<slen-1;i++)
9604   {
9605     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9606     {
9607       if(!bt[i+1])
9608       {
9609         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
9610            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
9611         {
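               // If the next instruction's sources already have host
               // registers assigned there, claim the same registers one
               // instruction early so the values are loaded before use.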
9612           if(rs1[i+1]) {
9613             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9614             {
9615               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9616               {
9617                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9618                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9619                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9620                 regs[i].isconst&=~(1<<hr);
9621                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9622                 constmap[i][hr]=constmap[i+1][hr];
9623                 regs[i+1].wasdirty&=~(1<<hr);
9624                 regs[i].dirty&=~(1<<hr);
9625               }
9626             }
9627           }
9628           if(rs2[i+1]) {
9629             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
9630             {
9631               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9632               {
9633                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9634                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9635                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9636                 regs[i].isconst&=~(1<<hr);
9637                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9638                 constmap[i][hr]=constmap[i+1][hr];
9639                 regs[i+1].wasdirty&=~(1<<hr);
9640                 regs[i].dirty&=~(1<<hr);
9641               }
9642             }
9643           }
9644           // Preload target address for load instruction (non-constant)
9645           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9646             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9647             {
9648               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9649               {
9650                 regs[i].regmap[hr]=rs1[i+1];
9651                 regmap_pre[i+1][hr]=rs1[i+1];
9652                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9653                 regs[i].isconst&=~(1<<hr);
9654                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9655                 constmap[i][hr]=constmap[i+1][hr];
9656                 regs[i+1].wasdirty&=~(1<<hr);
9657                 regs[i].dirty&=~(1<<hr);
9658               }
9659             }
9660           }
9661           // Load source into target register
9662           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9663             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9664             {
9665               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9666               {
9667                 regs[i].regmap[hr]=rs1[i+1];
9668                 regmap_pre[i+1][hr]=rs1[i+1];
9669                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9670                 regs[i].isconst&=~(1<<hr);
9671                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9672                 constmap[i][hr]=constmap[i+1][hr];
9673                 regs[i+1].wasdirty&=~(1<<hr);
9674                 regs[i].dirty&=~(1<<hr);
9675               }
9676             }
9677           }
9678           // Address for store instruction (non-constant)
9679           if(itype[i+1]==STORE||itype[i+1]==STORELR
9680              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
9681             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9682               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
9683               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9684               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
9685               assert(hr>=0);
9686               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9687               {
9688                 regs[i].regmap[hr]=rs1[i+1];
9689                 regmap_pre[i+1][hr]=rs1[i+1];
9690                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9691                 regs[i].isconst&=~(1<<hr);
9692                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9693                 constmap[i][hr]=constmap[i+1][hr];
9694                 regs[i+1].wasdirty&=~(1<<hr);
9695                 regs[i].dirty&=~(1<<hr);
9696               }
9697             }
9698           }
9699           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
9700             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9701               int nr;
9702               hr=get_reg(regs[i+1].regmap,FTEMP);
9703               assert(hr>=0);
9704               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9705               {
9706                 regs[i].regmap[hr]=rs1[i+1];
9707                 regmap_pre[i+1][hr]=rs1[i+1];
9708                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9709                 regs[i].isconst&=~(1<<hr);
9710                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9711                 constmap[i][hr]=constmap[i+1][hr];
9712                 regs[i+1].wasdirty&=~(1<<hr);
9713                 regs[i].dirty&=~(1<<hr);
9714               }
9715               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9716               {
9717                 // move it to another register
9718                 regs[i+1].regmap[hr]=-1;
9719                 regmap_pre[i+2][hr]=-1;
9720                 regs[i+1].regmap[nr]=FTEMP;
9721                 regmap_pre[i+2][nr]=FTEMP;
9722                 regs[i].regmap[nr]=rs1[i+1];
9723                 regmap_pre[i+1][nr]=rs1[i+1];
9724                 regs[i+1].regmap_entry[nr]=rs1[i+1];
9725                 regs[i].isconst&=~(1<<nr);
9726                 regs[i+1].isconst&=~(1<<nr);
9727                 regs[i].dirty&=~(1<<nr);
9728                 regs[i+1].wasdirty&=~(1<<nr);
9729                 regs[i+1].dirty&=~(1<<nr);
9730                 regs[i+2].wasdirty&=~(1<<nr);
9731               }
9732             }
9733           }
9734           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
9735             if(itype[i+1]==LOAD)
9736               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
9737             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
9738               hr=get_reg(regs[i+1].regmap,FTEMP);
9739             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
9740               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
9741               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9742             }
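                 // If the base register held a constant, the address can be
                 // generated a slot early into an address-generation temp.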
9743             if(hr>=0&&regs[i].regmap[hr]<0) {
9744               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
9745               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
9746                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
9747                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
9748                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
9749                 regs[i].isconst&=~(1<<hr);
9750                 regs[i+1].wasdirty&=~(1<<hr);
9751                 regs[i].dirty&=~(1<<hr);
9752               }
9753             }
9754           }
9755         }
9756       }
9757     }
9758   }
9759
9760   /* Pass 6 - Optimize clean/dirty state */
9761   clean_registers(0,slen-1,1);
9762
9763   /* Pass 7 - Identify 32-bit registers */
9764   for (i=slen-1;i>=0;i--)
9765   {
9766     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9767     {
9768       // Conditional branch
9769       if((source[i]>>16)!=0x1000&&i<slen-2) {
9770         // Mark this address as a branch target since it may be called
9771         // upon return from interrupt
9772         bt[i+2]=1;
9773       }
9774     }
9775   }
9776
9777   if(itype[slen-1]==SPAN) {
9778     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
9779   }
9780
9781 #ifdef DISASM
9782   /* Debug/disassembly */
9783   for(i=0;i<slen;i++)
9784   {
9785     printf("U:");
9786     int r;
9787     for(r=1;r<=CCREG;r++) {
9788       if((unneeded_reg[i]>>r)&1) {
9789         if(r==HIREG) printf(" HI");
9790         else if(r==LOREG) printf(" LO");
9791         else printf(" r%d",r);
9792       }
9793     }
9794     printf("\n");
9795     #if defined(__i386__) || defined(__x86_64__)
9796     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
9797     #endif
9798     #ifdef __arm__
9799     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
9800     #endif
9801     printf("needs: ");
9802     if(needed_reg[i]&1) printf("eax ");
9803     if((needed_reg[i]>>1)&1) printf("ecx ");
9804     if((needed_reg[i]>>2)&1) printf("edx ");
9805     if((needed_reg[i]>>3)&1) printf("ebx ");
9806     if((needed_reg[i]>>5)&1) printf("ebp ");
9807     if((needed_reg[i]>>6)&1) printf("esi ");
9808     if((needed_reg[i]>>7)&1) printf("edi ");
9809     printf("\n");
9810     #if defined(__i386__) || defined(__x86_64__)
9811     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
9812     printf("dirty: ");
9813     if(regs[i].wasdirty&1) printf("eax ");
9814     if((regs[i].wasdirty>>1)&1) printf("ecx ");
9815     if((regs[i].wasdirty>>2)&1) printf("edx ");
9816     if((regs[i].wasdirty>>3)&1) printf("ebx ");
9817     if((regs[i].wasdirty>>5)&1) printf("ebp ");
9818     if((regs[i].wasdirty>>6)&1) printf("esi ");
9819     if((regs[i].wasdirty>>7)&1) printf("edi ");
9820     #endif
9821     #ifdef __arm__
9822     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
9823     printf("dirty: ");
9824     if(regs[i].wasdirty&1) printf("r0 ");
9825     if((regs[i].wasdirty>>1)&1) printf("r1 ");
9826     if((regs[i].wasdirty>>2)&1) printf("r2 ");
9827     if((regs[i].wasdirty>>3)&1) printf("r3 ");
9828     if((regs[i].wasdirty>>4)&1) printf("r4 ");
9829     if((regs[i].wasdirty>>5)&1) printf("r5 ");
9830     if((regs[i].wasdirty>>6)&1) printf("r6 ");
9831     if((regs[i].wasdirty>>7)&1) printf("r7 ");
9832     if((regs[i].wasdirty>>8)&1) printf("r8 ");
9833     if((regs[i].wasdirty>>9)&1) printf("r9 ");
9834     if((regs[i].wasdirty>>10)&1) printf("r10 ");
9835     if((regs[i].wasdirty>>12)&1) printf("r12 ");
9836     #endif
9837     printf("\n");
9838     disassemble_inst(i);
9839     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
9840     #if defined(__i386__) || defined(__x86_64__)
9841     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
9842     if(regs[i].dirty&1) printf("eax ");
9843     if((regs[i].dirty>>1)&1) printf("ecx ");
9844     if((regs[i].dirty>>2)&1) printf("edx ");
9845     if((regs[i].dirty>>3)&1) printf("ebx ");
9846     if((regs[i].dirty>>5)&1) printf("ebp ");
9847     if((regs[i].dirty>>6)&1) printf("esi ");
9848     if((regs[i].dirty>>7)&1) printf("edi ");
9849     #endif
9850     #ifdef __arm__
9851     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
9852     if(regs[i].dirty&1) printf("r0 ");
9853     if((regs[i].dirty>>1)&1) printf("r1 ");
9854     if((regs[i].dirty>>2)&1) printf("r2 ");
9855     if((regs[i].dirty>>3)&1) printf("r3 ");
9856     if((regs[i].dirty>>4)&1) printf("r4 ");
9857     if((regs[i].dirty>>5)&1) printf("r5 ");
9858     if((regs[i].dirty>>6)&1) printf("r6 ");
9859     if((regs[i].dirty>>7)&1) printf("r7 ");
9860     if((regs[i].dirty>>8)&1) printf("r8 ");
9861     if((regs[i].dirty>>9)&1) printf("r9 ");
9862     if((regs[i].dirty>>10)&1) printf("r10 ");
9863     if((regs[i].dirty>>12)&1) printf("r12 ");
9864     #endif
9865     printf("\n");
9866     if(regs[i].isconst) {
9867       printf("constants: ");
9868       #if defined(__i386__) || defined(__x86_64__)
9869       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
9870       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
9871       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
9872       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
9873       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
9874       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
9875       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
9876       #endif
9877       #ifdef __arm__
9878       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
9879       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
9880       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
9881       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
9882       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
9883       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
9884       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
9885       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
9886       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
9887       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
9888       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
9889       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
9890       #endif
9891       printf("\n");
9892     }
9893     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9894       #if defined(__i386__) || defined(__x86_64__)
9895       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
9896       if(branch_regs[i].dirty&1) printf("eax ");
9897       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
9898       if((branch_regs[i].dirty>>2)&1) printf("edx ");
9899       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
9900       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
9901       if((branch_regs[i].dirty>>6)&1) printf("esi ");
9902       if((branch_regs[i].dirty>>7)&1) printf("edi ");
9903       #endif
9904       #ifdef __arm__
9905       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
9906       if(branch_regs[i].dirty&1) printf("r0 ");
9907       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
9908       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
9909       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
9910       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
9911       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
9912       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
9913       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
9914       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
9915       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
9916       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
9917       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
9918       #endif
9919     }
9920   }
9921 #endif // DISASM
9922
9923   /* Pass 8 - Assembly */
9924   linkcount=0;stubcount=0;
9925   ds=0;is_delayslot=0;
9926   cop1_usable=0;
9927   uint64_t is32_pre=0;
9928   u_int dirty_pre=0;
9929   void *beginning=start_block();
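     // An odd start address marks a block entered through a branch delay
     // slot (page-span case); emit the special delay-slot prologue for it.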
9930   if((u_int)addr&1) {
9931     ds=1;
9932     pagespan_ds();
9933   }
9934   u_int instr_addr0_override=0;
9935
9936   if (start == 0x80030000) {
9937     // nasty hack for fastbios thing
9938     // override block entry to this code
9939     instr_addr0_override=(u_int)out;
9940     emit_movimm(start,0);
9941     // abuse io address var as a flag that we
9942     // have already returned here once
9943     emit_readword((int)&address,1);
9944     emit_writeword(0,(int)&pcaddr);
9945     emit_writeword(0,(int)&address);
9946     emit_cmp(0,1);
9947     emit_jne((int)new_dyna_leave);
9948   }
9949   for(i=0;i<slen;i++)
9950   {
9951     //if(ds) printf("ds: ");
9952     disassemble_inst(i);
9953     if(ds) {
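           // Delay slots are emitted together with their branch, so no code
           // and no separate entry point is generated for them here.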
9954       ds=0; // Skip delay slot
9955       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
9956       instr_addr[i]=0;
9957     } else {
9958       speculate_register_values(i);
9959       #ifndef DESTRUCTIVE_WRITEBACK
9960       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
9961       {
9962         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
9963               unneeded_reg[i],unneeded_reg_upper[i]);
9964       }
9965       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
9966         is32_pre=branch_regs[i].is32;
9967         dirty_pre=branch_regs[i].dirty;
9968       }else{
9969         is32_pre=regs[i].is32;
9970         dirty_pre=regs[i].dirty;
9971       }
9972       #endif
9973       // write back
9974       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
9975       {
9976         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
9977                       unneeded_reg[i],unneeded_reg_upper[i]);
9978         loop_preload(regmap_pre[i],regs[i].regmap_entry);
9979       }
9980       // branch target entry point
9981       instr_addr[i]=(u_int)out;
9982       assem_debug("<->\n");
9983       // load regs
9984       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
9985         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
9986       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
9987       address_generation(i,&regs[i],regs[i].regmap_entry);
9988       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
9989       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9990       {
9991         // Load the delay slot registers if necessary
9992         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
9993           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
9994         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
9995           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
9996         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
9997           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
9998       }
9999       else if(i+1<slen)
10000       {
10001         // Preload registers for following instruction
10002         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10003           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
10004             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10005         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10006           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
10007             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10008       }
10009       // TODO: if(is_ooo(i)) address_generation(i+1);
10010       if(itype[i]==CJUMP||itype[i]==FJUMP)
10011         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
10012       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
10013         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10014       if(bt[i]) cop1_usable=0;
10015       // assemble
10016       switch(itype[i]) {
10017         case ALU:
10018           alu_assemble(i,&regs[i]);break;
10019         case IMM16:
10020           imm16_assemble(i,&regs[i]);break;
10021         case SHIFT:
10022           shift_assemble(i,&regs[i]);break;
10023         case SHIFTIMM:
10024           shiftimm_assemble(i,&regs[i]);break;
10025         case LOAD:
10026           load_assemble(i,&regs[i]);break;
10027         case LOADLR:
10028           loadlr_assemble(i,&regs[i]);break;
10029         case STORE:
10030           store_assemble(i,&regs[i]);break;
10031         case STORELR:
10032           storelr_assemble(i,&regs[i]);break;
10033         case COP0:
10034           cop0_assemble(i,&regs[i]);break;
10035         case COP1:
10036           cop1_assemble(i,&regs[i]);break;
10037         case C1LS:
10038           c1ls_assemble(i,&regs[i]);break;
10039         case COP2:
10040           cop2_assemble(i,&regs[i]);break;
10041         case C2LS:
10042           c2ls_assemble(i,&regs[i]);break;
10043         case C2OP:
10044           c2op_assemble(i,&regs[i]);break;
10045         case FCONV:
10046           fconv_assemble(i,&regs[i]);break;
10047         case FLOAT:
10048           float_assemble(i,&regs[i]);break;
10049         case FCOMP:
10050           fcomp_assemble(i,&regs[i]);break;
10051         case MULTDIV:
10052           multdiv_assemble(i,&regs[i]);break;
10053         case MOV:
10054           mov_assemble(i,&regs[i]);break;
10055         case SYSCALL:
10056           syscall_assemble(i,&regs[i]);break;
10057         case HLECALL:
10058           hlecall_assemble(i,&regs[i]);break;
10059         case INTCALL:
10060           intcall_assemble(i,&regs[i]);break;
10061         case UJUMP:
10062           ujump_assemble(i,&regs[i]);ds=1;break;
10063         case RJUMP:
10064           rjump_assemble(i,&regs[i]);ds=1;break;
10065         case CJUMP:
10066           cjump_assemble(i,&regs[i]);ds=1;break;
10067         case SJUMP:
10068           sjump_assemble(i,&regs[i]);ds=1;break;
10069         case FJUMP:
10070           fjump_assemble(i,&regs[i]);ds=1;break;
10071         case SPAN:
10072           pagespan_assemble(i,&regs[i]);break;
10073       }
10074       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10075         literal_pool(1024);
10076       else
10077         literal_pool_jumpover(256);
10078     }
10079   }
10080   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10081   // If the block did not end with an unconditional branch,
10082   // add a jump to the next instruction.
10083   if(i>1) {
10084     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10085       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10086       assert(i==slen);
10087       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10088         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10089         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10090           emit_loadreg(CCREG,HOST_CCREG);
10091         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10092       }
10093       else if(!likely[i-2])
10094       {
10095         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10096         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10097       }
10098       else
10099       {
10100         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10101         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10102       }
10103       add_to_linker((int)out,start+i*4,0);
10104       emit_jmp(0);
10105     }
10106   }
10107   else
10108   {
10109     assert(i>0);
10110     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10111     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10112     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10113       emit_loadreg(CCREG,HOST_CCREG);
10114     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10115     add_to_linker((int)out,start+i*4,0);
10116     emit_jmp(0);
10117   }
10118
10119   // TODO: delay slot stubs?
10120   // Stubs
10121   for(i=0;i<stubcount;i++)
10122   {
10123     switch(stubs[i][0])
10124     {
10125       case LOADB_STUB:
10126       case LOADH_STUB:
10127       case LOADW_STUB:
10128       case LOADD_STUB:
10129       case LOADBU_STUB:
10130       case LOADHU_STUB:
10131         do_readstub(i);break;
10132       case STOREB_STUB:
10133       case STOREH_STUB:
10134       case STOREW_STUB:
10135       case STORED_STUB:
10136         do_writestub(i);break;
10137       case CC_STUB:
10138         do_ccstub(i);break;
10139       case INVCODE_STUB:
10140         do_invstub(i);break;
10141       case FP_STUB:
10142         do_cop1stub(i);break;
10143       case STORELR_STUB:
10144         do_unalignedwritestub(i);break;
10145     }
10146   }
10147
10148   if (instr_addr0_override)
10149     instr_addr[0] = instr_addr0_override;
10150
10151   /* Pass 9 - Linker */
10152   for(i=0;i<linkcount;i++)
10153   {
10154     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10155     literal_pool(64);
10156     if(!link_addr[i][2])
10157     {
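            // External branch: emit an exit stub, then either patch the
            // branch straight to the target block if it already exists
            // (recording the link), or point the branch at the stub for now.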
10158       void *stub=out;
10159       void *addr=check_addr(link_addr[i][1]);
10160       emit_extjump(link_addr[i][0],link_addr[i][1]);
10161       if(addr) {
10162         set_jump_target(link_addr[i][0],(int)addr);
10163         add_link(link_addr[i][1],stub);
10164       }
10165       else set_jump_target(link_addr[i][0],(int)stub);
10166     }
10167     else
10168     {
10169       // Internal branch
10170       int target=(link_addr[i][1]-start)>>2;
10171       assert(target>=0&&target<slen);
10172       assert(instr_addr[target]);
10173       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10174       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
10175       //#else
10176       set_jump_target(link_addr[i][0],instr_addr[target]);
10177       //#endif
10178     }
10179   }
10180   // External Branch Targets (jump_in)
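        // Wrap the shadow-copy pointer back to the start of its buffer if
        // this block's source would not fit at the current position.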
10181   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
10182   for(i=0;i<slen;i++)
10183   {
10184     if(bt[i]||i==0)
10185     {
10186       if(instr_addr[i]) // TODO - delay slots (=null)
10187       {
10188         u_int vaddr=start+i*4;
10189         u_int page=get_page(vaddr);
10190         u_int vpage=get_vpage(vaddr);
10191         literal_pool(256);
10192         {
10193           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10194           assem_debug("jump_in: %x\n",start+i*4);
10195           ll_add(jump_dirty+vpage,vaddr,(void *)out);
10196           int entry_point=do_dirty_stub(i);
10197           ll_add_flags(jump_in+page,vaddr,state_rflags,(void *)entry_point);
10198           // If there was an existing entry in the hash table,
10199           // replace it with the new address.
10200           // Don't add new entries.  We'll insert the
10201           // ones that actually get used in check_addr().
10202           u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
10203           if(ht_bin[0]==vaddr) {
10204             ht_bin[1]=entry_point;
10205           }
10206           if(ht_bin[2]==vaddr) {
10207             ht_bin[3]=entry_point;
10208           }
10209         }
10210       }
10211     }
10212   }
10213   // Write out the literal pool if necessary
10214   literal_pool(0);
10215   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10216   // Align code
10217   if(((u_int)out)&7) emit_addnop(13);
10218   #endif
10219   assert((u_int)out-(u_int)beginning<MAX_OUTPUT_BLOCK_SIZE);
10220   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
10221   memcpy(copy,source,slen*4);
10222   copy+=slen*4;
10223
10224   end_block(beginning);
10225
10226   // If we're within 256K of the end of the buffer,
10227   // start over from the beginning. (Is 256K enough?)
10228   if((u_int)out>(u_int)BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
10229
10230   // Trap writes to any of the pages we compiled
10231   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
10232     invalid_code[i]=0;
10233   }
10234   inv_code_start=inv_code_end=~0;
10235
10236   // for PCSX we need to mark all mirrors too
10237   if(get_page(start)<(RAM_SIZE>>12))
10238     for(i=start>>12;i<=(start+slen*4)>>12;i++)
10239       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
10240       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
10241       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
10242
10243   /* Pass 10 - Free memory by expiring oldest blocks */
10244
10245   int end=((((int)out-(int)BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
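        // expirep is a 16-bit phase counter that sweeps the translation
        // cache a slice at a time; each value selects one slice and one of
        // four sub-phases: jump_in/jump_dirty lists, stale jump_out
        // pointers, hash table entries, and the jump_out lists themselves.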
10246   while(expirep!=end)
10247   {
10248     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
10249     int base=(int)BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
10250     inv_debug("EXP: Phase %d\n",expirep);
10251     switch((expirep>>11)&3)
10252     {
10253       case 0:
10254         // Clear jump_in and jump_dirty
10255         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
10256         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
10257         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
10258         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
10259         break;
10260       case 1:
10261         // Clear pointers
10262         ll_kill_pointers(jump_out[expirep&2047],base,shift);
10263         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
10264         break;
10265       case 2:
10266         // Clear hash table
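              // Each step covers 32 hash buckets; drop entries whose
              // compiled code lies in the range being reclaimed.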
10267         for(i=0;i<32;i++) {
10268           u_int *ht_bin=hash_table[((expirep&2047)<<5)+i];
10269           if((ht_bin[3]>>shift)==(base>>shift) ||
10270              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10271             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
10272             ht_bin[2]=ht_bin[3]=-1;
10273           }
10274           if((ht_bin[1]>>shift)==(base>>shift) ||
10275              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10276             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
10277             ht_bin[0]=ht_bin[2];
10278             ht_bin[1]=ht_bin[3];
10279             ht_bin[2]=ht_bin[3]=-1;
10280           }
10281         }
10282         break;
10283       case 3:
10284         // Clear jump_out
10285         #ifdef __arm__
10286         if((expirep&2047)==0)
10287           do_clear_cache();
10288         #endif
10289         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
10290         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
10291         break;
10292     }
10293     expirep=(expirep+1)&65535;
10294   }
10295   return 0;
10296 }
10297
10298 // vim:shiftwidth=2:expandtab