1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26
27 #include "emu_if.h" //emulator interface
28
29 //#define DISASM
30 //#define assem_debug printf
31 //#define inv_debug printf
32 #define assem_debug(...)
33 #define inv_debug(...)
34
35 #ifdef __i386__
36 #include "assem_x86.h"
37 #endif
38 #ifdef __x86_64__
39 #include "assem_x64.h"
40 #endif
41 #ifdef __arm__
42 #include "assem_arm.h"
43 #endif
44
45 #ifdef __BLACKBERRY_QNX__
46 #undef __clear_cache
47 #define __clear_cache(start,end) msync(start, (size_t)((void*)end - (void*)start), MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
48 #elif defined(__MACH__)
49 #include <libkern/OSCacheControl.h>
50 #define __clear_cache mach_clear_cache
51 static void __clear_cache(void *start, void *end) {
52   size_t len = (char *)end - (char *)start;
53   sys_dcache_flush(start, len);
54   sys_icache_invalidate(start, len);
55 }
56 #endif
57
58 #define MAXBLOCK 4096
59 #define MAX_OUTPUT_BLOCK_SIZE 262144
60
61 struct regstat
62 {
63   signed char regmap_entry[HOST_REGS];
64   signed char regmap[HOST_REGS];
65   uint64_t was32;
66   uint64_t is32;
67   uint64_t wasdirty;
68   uint64_t dirty;
69   uint64_t u;
70   uint64_t uu;
71   u_int wasconst;
72   u_int isconst;
73   u_int loadedconst;             // host regs that have constants loaded
74   u_int waswritten;              // MIPS regs that were used as store base before
75 };
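/* Annotation (not from the original author): the regstat bitmasks split into
 * two groups.  regmap[], dirty, isconst, wasconst and loadedconst are indexed
 * by *host* register number, e.g.
 *
 *   signed char hr = get_reg(cur->regmap, 5);          // host reg holding $a1, or -1
 *   int a1_dirty   = (hr >= 0) && ((cur->dirty >> hr) & 1);
 *
 * while u, uu, is32, was32 and waswritten are indexed by *MIPS* register
 * number, e.g. ((cur->is32 >> 5) & 1) tells whether $a1 currently holds a
 * sign-extended 32-bit value. */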
76
77 // note: asm depends on this layout
78 struct ll_entry
79 {
80   u_int vaddr;
81   u_int reg_sv_flags;
82   void *addr;
83   struct ll_entry *next;
84 };
85
86   u_int start;
87   u_int *source;
88   char insn[MAXBLOCK][10];
89   u_char itype[MAXBLOCK];
90   u_char opcode[MAXBLOCK];
91   u_char opcode2[MAXBLOCK];
92   u_char bt[MAXBLOCK];
93   u_char rs1[MAXBLOCK];
94   u_char rs2[MAXBLOCK];
95   u_char rt1[MAXBLOCK];
96   u_char rt2[MAXBLOCK];
97   u_char us1[MAXBLOCK];
98   u_char us2[MAXBLOCK];
99   u_char dep1[MAXBLOCK];
100   u_char dep2[MAXBLOCK];
101   u_char lt1[MAXBLOCK];
102   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
103   static uint64_t gte_rt[MAXBLOCK];
104   static uint64_t gte_unneeded[MAXBLOCK];
105   static u_int smrv[32]; // speculated MIPS register values
106   static u_int smrv_strong; // mask of regs that are likely to have correct values
107   static u_int smrv_weak; // same, but somewhat less likely
108   static u_int smrv_strong_next; // same, but after current insn executes
109   static u_int smrv_weak_next;
110   int imm[MAXBLOCK];
111   u_int ba[MAXBLOCK];
112   char likely[MAXBLOCK];
113   char is_ds[MAXBLOCK];
114   char ooo[MAXBLOCK];
115   uint64_t unneeded_reg[MAXBLOCK];
116   uint64_t unneeded_reg_upper[MAXBLOCK];
117   uint64_t branch_unneeded_reg[MAXBLOCK];
118   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
119   uint64_t pr32[MAXBLOCK];
120   signed char regmap_pre[MAXBLOCK][HOST_REGS];
121   static uint64_t current_constmap[HOST_REGS];
122   static uint64_t constmap[MAXBLOCK][HOST_REGS];
123   static struct regstat regs[MAXBLOCK];
124   static struct regstat branch_regs[MAXBLOCK];
125   signed char minimum_free_regs[MAXBLOCK];
126   u_int needed_reg[MAXBLOCK];
127   u_int wont_dirty[MAXBLOCK];
128   u_int will_dirty[MAXBLOCK];
129   int ccadj[MAXBLOCK];
130   int slen;
131   u_int instr_addr[MAXBLOCK];
132   u_int link_addr[MAXBLOCK][3];
133   int linkcount;
134   u_int stubs[MAXBLOCK*3][8];
135   int stubcount;
136   u_int literals[1024][2];
137   int literalcount;
138   int is_delayslot;
139   int cop1_usable;
140   u_char *out;
141   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
142   struct ll_entry *jump_out[4096];
143   struct ll_entry *jump_dirty[4096];
144   u_int hash_table[65536][4]  __attribute__((aligned(16)));
145   char shadow[1048576]  __attribute__((aligned(16)));
146   void *copy;
147   int expirep;
148   int new_dynarec_did_compile;
149   int new_dynarec_hacks;
150   u_int stop_after_jal;
151 #ifndef RAM_FIXED
152   static u_int ram_offset;
153 #else
154   static const u_int ram_offset=0;
155 #endif
156   extern u_char restore_candidate[512];
157   extern int cycle_count;
158
159   /* registers that may be allocated */
160   /* 1-31 gpr */
161 #define HIREG 32 // hi
162 #define LOREG 33 // lo
163 #define FSREG 34 // FPU status (FCSR)
164 #define CSREG 35 // Coprocessor status
165 #define CCREG 36 // Cycle count
166 #define INVCP 37 // Pointer to invalid_code
167 //#define MMREG 38 // Pointer to memory_map
168 #define ROREG 39 // ram offset (if rdram!=0x80000000)
169 #define TEMPREG 40
170 #define FTEMP 40 // FPU temporary register
171 #define PTEMP 41 // Prefetch temporary register
172 //#define TLREG 42 // TLB mapping offset
173 #define RHASH 43 // Return address hash
174 #define RHTBL 44 // Return address hash table address
175 #define RTEMP 45 // JR/JALR address register
176 #define MAXREG 45
177 #define AGEN1 46 // Address generation temporary register
178 //#define AGEN2 47 // Address generation temporary register
179 //#define MGEN1 48 // Maptable address generation temporary register
180 //#define MGEN2 49 // Maptable address generation temporary register
181 #define BTREG 50 // Branch target temporary register
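/* Annotation (inferred from the code below, not an original comment): regmap[]
 * entries hold one of the register numbers above, or -1 for a free host
 * register.  An entry with bit 6 set (reg|64) maps the upper 32 bits of a
 * 64-bit MIPS register, which is why many tests mask with &63 to match either
 * half:
 *
 *   signed char lo = get_reg(cur->regmap, 2);       // host reg for $v0, low word
 *   signed char hi = get_reg(cur->regmap, 2|64);    // host reg for $v0, high word
 */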
182
183   /* instruction types */
184 #define NOP 0     // No operation
185 #define LOAD 1    // Load
186 #define STORE 2   // Store
187 #define LOADLR 3  // Unaligned load
188 #define STORELR 4 // Unaligned store
189 #define MOV 5     // Move
190 #define ALU 6     // Arithmetic/logic
191 #define MULTDIV 7 // Multiply/divide
192 #define SHIFT 8   // Shift by register
193 #define SHIFTIMM 9// Shift by immediate
194 #define IMM16 10  // 16-bit immediate
195 #define RJUMP 11  // Unconditional jump to register
196 #define UJUMP 12  // Unconditional jump
197 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
198 #define SJUMP 14  // Conditional branch (regimm format)
199 #define COP0 15   // Coprocessor 0
200 #define COP1 16   // Coprocessor 1
201 #define C1LS 17   // Coprocessor 1 load/store
202 #define FJUMP 18  // Conditional branch (floating point)
203 #define FLOAT 19  // Floating point unit
204 #define FCONV 20  // Convert integer to float
205 #define FCOMP 21  // Floating point compare (sets FSREG)
206 #define SYSCALL 22// SYSCALL
207 #define OTHER 23  // Other
208 #define SPAN 24   // Branch/delay slot spans 2 pages
209 #define NI 25     // Not implemented
210 #define HLECALL 26// PCSX fake opcodes for HLE
211 #define COP2 27   // Coprocessor 2 move
212 #define C2LS 28   // Coprocessor 2 load/store
213 #define C2OP 29   // Coprocessor 2 operation
214 #define INTCALL 30// Call interpreter to handle rare corner cases
215
216   /* stubs */
217 #define CC_STUB 1
218 #define FP_STUB 2
219 #define LOADB_STUB 3
220 #define LOADH_STUB 4
221 #define LOADW_STUB 5
222 #define LOADD_STUB 6
223 #define LOADBU_STUB 7
224 #define LOADHU_STUB 8
225 #define STOREB_STUB 9
226 #define STOREH_STUB 10
227 #define STOREW_STUB 11
228 #define STORED_STUB 12
229 #define STORELR_STUB 13
230 #define INVCODE_STUB 14
231
232   /* branch codes */
233 #define TAKEN 1
234 #define NOTTAKEN 2
235 #define NULLDS 3
236
237 // asm linkage
238 int new_recompile_block(int addr);
239 void *get_addr_ht(u_int vaddr);
240 void invalidate_block(u_int block);
241 void invalidate_addr(u_int addr);
242 void remove_hash(int vaddr);
243 void dyna_linker();
244 void dyna_linker_ds();
245 void verify_code();
246 void verify_code_vm();
247 void verify_code_ds();
248 void cc_interrupt();
249 void fp_exception();
250 void fp_exception_ds();
251 void jump_syscall_hle();
252 void jump_hlecall();
253 void jump_intcall();
254 void new_dyna_leave();
255
256 // Needed by assembler
257 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
258 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
259 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
260 void load_all_regs(signed char i_regmap[]);
261 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
262 void load_regs_entry(int t);
263 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
264
265 int tracedebug=0;
266
267 //#define DEBUG_CYCLE_COUNT 1
268
269 #define NO_CYCLE_PENALTY_THR 12
270
271 int cycle_multiplier; // 100 for 1.0
272
273 static int CLOCK_ADJUST(int x)
274 {
275   int s=(x>>31)|1;
276   return (x * cycle_multiplier + s * 50) / 100;
277 }
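/* Worked example (assuming the usual arithmetic right shift for x>>31):
 * with cycle_multiplier=150, i.e. a 1.5x scale, CLOCK_ADJUST(7) is
 * (7*150+50)/100 = 11 and CLOCK_ADJUST(-7) is (-7*150-50)/100 = -11,
 * so the s*50 term rounds half away from zero symmetrically for positive
 * and negative cycle counts. */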
278
279 static u_int get_page(u_int vaddr)
280 {
281   u_int page=vaddr&~0xe0000000;
282   if (page < 0x1000000)
283     page &= ~0x0e00000; // RAM mirrors
284   page>>=12;
285   if(page>2048) page=2048+(page&2047);
286   return page;
287 }
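/* Illustration (assuming the usual PSX mirroring): the KUSEG/KSEG0/KSEG1
 * aliases of the same RAM word map to the same page, e.g.
 *
 *   get_page(0x00012345) == get_page(0x80012345) == get_page(0xa0012345) == 0x12
 *
 * and any address that would land above page 2048 (non-RAM) is folded into
 * the 2048..4095 range so the jump_in/jump_out/jump_dirty tables stay at
 * 4096 entries. */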
288
289 // no virtual mem in PCSX
290 static u_int get_vpage(u_int vaddr)
291 {
292   return get_page(vaddr);
293 }
294
295 // Get address from virtual address
296 // This is called from the recompiled JR/JALR instructions
297 void *get_addr(u_int vaddr)
298 {
299   u_int page=get_page(vaddr);
300   u_int vpage=get_vpage(vaddr);
301   struct ll_entry *head;
302   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
303   head=jump_in[page];
304   while(head!=NULL) {
305     if(head->vaddr==vaddr) {
306   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
307       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
308       ht_bin[3]=ht_bin[1];
309       ht_bin[2]=ht_bin[0];
310       ht_bin[1]=(int)head->addr;
311       ht_bin[0]=vaddr;
312       return head->addr;
313     }
314     head=head->next;
315   }
316   head=jump_dirty[vpage];
317   while(head!=NULL) {
318     if(head->vaddr==vaddr) {
319       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
320       // Don't restore blocks which are about to expire from the cache
321       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
322       if(verify_dirty(head->addr)) {
323         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
324         invalid_code[vaddr>>12]=0;
325         inv_code_start=inv_code_end=~0;
326         if(vpage<2048) {
327           restore_candidate[vpage>>3]|=1<<(vpage&7);
328         }
329         else restore_candidate[page>>3]|=1<<(page&7);
330         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
331         if(ht_bin[0]==vaddr) {
332           ht_bin[1]=(int)head->addr; // Replace existing entry
333         }
334         else
335         {
336           ht_bin[3]=ht_bin[1];
337           ht_bin[2]=ht_bin[0];
338           ht_bin[1]=(int)head->addr;
339           ht_bin[0]=vaddr;
340         }
341         return head->addr;
342       }
343     }
344     head=head->next;
345   }
346   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
347   int r=new_recompile_block(vaddr);
348   if(r==0) return get_addr(vaddr);
349   // Execute in unmapped page, generate pagefault exception
350   Status|=2;
351   Cause=(vaddr<<31)|0x8;
352   EPC=(vaddr&1)?vaddr-5:vaddr;
353   BadVAddr=(vaddr&~1);
354   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
355   EntryHi=BadVAddr&0xFFFFE000;
356   return get_addr_ht(0x80000000);
357 }
358 // Look up address in hash table first
359 void *get_addr_ht(u_int vaddr)
360 {
361   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
362   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
363   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
364   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
365   return get_addr(vaddr);
366 }
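/* Layout sketch for hash_table, as used by get_addr()/get_addr_ht() above
 * (derived from the code, not an original comment): each bin holds two
 * {vaddr, compiled address} pairs, most recently used first:
 *
 *   u_int *bin = hash_table[((vaddr>>16)^vaddr)&0xFFFF];
 *   // bin[0]=vaddr A, bin[1]=native code for A   (MRU)
 *   // bin[2]=vaddr B, bin[3]=native code for B   (older)
 *
 * On a hit in jump_in, get_addr() shifts the old pair down to slots 2/3 and
 * installs the new pair in slots 0/1. */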
367
368 void clear_all_regs(signed char regmap[])
369 {
370   int hr;
371   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
372 }
373
374 signed char get_reg(signed char regmap[],int r)
375 {
376   int hr;
377   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
378   return -1;
379 }
380
381 // Find a register that is available for two consecutive cycles
382 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
383 {
384   int hr;
385   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
386   return -1;
387 }
388
389 int count_free_regs(signed char regmap[])
390 {
391   int count=0;
392   int hr;
393   for(hr=0;hr<HOST_REGS;hr++)
394   {
395     if(hr!=EXCLUDE_REG) {
396       if(regmap[hr]<0) count++;
397     }
398   }
399   return count;
400 }
401
402 void dirty_reg(struct regstat *cur,signed char reg)
403 {
404   int hr;
405   if(!reg) return;
406   for (hr=0;hr<HOST_REGS;hr++) {
407     if((cur->regmap[hr]&63)==reg) {
408       cur->dirty|=1<<hr;
409     }
410   }
411 }
412
413 // If we dirty the lower half of a 64 bit register which is now being
414 // sign-extended, we need to dump the upper half.
415 // Note: Do this only after completion of the instruction, because
416 // some instructions may need to read the full 64-bit value even if
417 // overwriting it (eg SLTI, DSRA32).
418 static void flush_dirty_uppers(struct regstat *cur)
419 {
420   int hr,reg;
421   for (hr=0;hr<HOST_REGS;hr++) {
422     if((cur->dirty>>hr)&1) {
423       reg=cur->regmap[hr];
424       if(reg>=64)
425         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
426     }
427   }
428 }
429
430 void set_const(struct regstat *cur,signed char reg,uint64_t value)
431 {
432   int hr;
433   if(!reg) return;
434   for (hr=0;hr<HOST_REGS;hr++) {
435     if(cur->regmap[hr]==reg) {
436       cur->isconst|=1<<hr;
437       current_constmap[hr]=value;
438     }
439     else if((cur->regmap[hr]^64)==reg) {
440       cur->isconst|=1<<hr;
441       current_constmap[hr]=value>>32;
442     }
443   }
444 }
445
446 void clear_const(struct regstat *cur,signed char reg)
447 {
448   int hr;
449   if(!reg) return;
450   for (hr=0;hr<HOST_REGS;hr++) {
451     if((cur->regmap[hr]&63)==reg) {
452       cur->isconst&=~(1<<hr);
453     }
454   }
455 }
456
457 int is_const(struct regstat *cur,signed char reg)
458 {
459   int hr;
460   if(reg<0) return 0;
461   if(!reg) return 1;
462   for (hr=0;hr<HOST_REGS;hr++) {
463     if((cur->regmap[hr]&63)==reg) {
464       return (cur->isconst>>hr)&1;
465     }
466   }
467   return 0;
468 }
469 uint64_t get_const(struct regstat *cur,signed char reg)
470 {
471   int hr;
472   if(!reg) return 0;
473   for (hr=0;hr<HOST_REGS;hr++) {
474     if(cur->regmap[hr]==reg) {
475       return current_constmap[hr];
476     }
477   }
478   SysPrintf("Unknown constant in r%d\n",reg);
479   exit(1);
480 }
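/* Minimal usage sketch of the constant tracking (hypothetical values, for
 * illustration only): isconst is a per-host-register bitmask and
 * current_constmap[] holds one value per host register, so a constant is
 * only remembered while that register mapping survives.
 *
 *   set_const(cur, 8, 0x1f800000);    // $t0 is now a known constant
 *   if (is_const(cur, 8))
 *     use_value(get_const(cur, 8));   // use_value() is a placeholder
 *   clear_const(cur, 8);              // e.g. after a load overwrites $t0
 */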
481
482 // Least soon needed registers
483 // Look at the next ten instructions and see which registers
484 // will be used.  Try not to reallocate these.
485 void lsn(u_char hsn[], int i, int *preferred_reg)
486 {
487   int j;
488   int b=-1;
489   for(j=0;j<9;j++)
490   {
491     if(i+j>=slen) {
492       j=slen-i-1;
493       break;
494     }
495     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
496     {
497       // Don't go past an unconditional jump
498       j++;
499       break;
500     }
501   }
502   for(;j>=0;j--)
503   {
504     if(rs1[i+j]) hsn[rs1[i+j]]=j;
505     if(rs2[i+j]) hsn[rs2[i+j]]=j;
506     if(rt1[i+j]) hsn[rt1[i+j]]=j;
507     if(rt2[i+j]) hsn[rt2[i+j]]=j;
508     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
509       // Stores can allocate zero
510       hsn[rs1[i+j]]=j;
511       hsn[rs2[i+j]]=j;
512     }
513     // On some architectures stores need invc_ptr
514     #if defined(HOST_IMM8)
515     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
516       hsn[INVCP]=j;
517     }
518     #endif
519     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
520     {
521       hsn[CCREG]=j;
522       b=j;
523     }
524   }
525   if(b>=0)
526   {
527     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
528     {
529       // Follow first branch
530       int t=(ba[i+b]-start)>>2;
531       j=7-b;if(t+j>=slen) j=slen-t-1;
532       for(;j>=0;j--)
533       {
534         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
535         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
536         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
537         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
538       }
539     }
540     // TODO: preferred register based on backward branch
541   }
542   // Delay slot should preferably not overwrite branch conditions or cycle count
543   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
544     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
545     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
546     hsn[CCREG]=1;
547     // ...or hash tables
548     hsn[RHASH]=1;
549     hsn[RHTBL]=1;
550   }
551   // Coprocessor load/store needs FTEMP, even if not declared
552   if(itype[i]==C1LS||itype[i]==C2LS) {
553     hsn[FTEMP]=0;
554   }
555   // Load L/R also uses FTEMP as a temporary register
556   if(itype[i]==LOADLR) {
557     hsn[FTEMP]=0;
558   }
559   // Also SWL/SWR/SDL/SDR
560   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
561     hsn[FTEMP]=0;
562   }
563   // Don't remove the miniht registers
564   if(itype[i]==UJUMP||itype[i]==RJUMP)
565   {
566     hsn[RHASH]=0;
567     hsn[RHTBL]=0;
568   }
569 }
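/* Annotation (my reading of the code above, not an original comment): hsn[]
 * is indexed by MIPS register and ends up holding the distance, in
 * instructions, to the nearest upcoming use inside the lookahead window;
 * smaller means "needed sooner", so the allocator should prefer to evict
 * registers that kept their initial large value.  A hypothetical caller:
 *
 *   u_char hsn[MAXREG+1];
 *   memset(hsn, 10, sizeof(hsn));   // 10 == not used within the window
 *   lsn(hsn, i, &preferred_reg);
 *   // hsn[r]==0: used by instruction i itself; hsn[r]==3: used 3 insns later
 */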
570
571 // We only want to allocate registers if we're going to use them again soon
572 int needed_again(int r, int i)
573 {
574   int j;
575   int b=-1;
576   int rn=10;
577
578   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
579   {
580     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
581       return 0; // Don't need any registers if exiting the block
582   }
583   for(j=0;j<9;j++)
584   {
585     if(i+j>=slen) {
586       j=slen-i-1;
587       break;
588     }
589     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
590     {
591       // Don't go past an unconditional jump
592       j++;
593       break;
594     }
595     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
596     {
597       break;
598     }
599   }
600   for(;j>=1;j--)
601   {
602     if(rs1[i+j]==r) rn=j;
603     if(rs2[i+j]==r) rn=j;
604     if((unneeded_reg[i+j]>>r)&1) rn=10;
605     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
606     {
607       b=j;
608     }
609   }
610   /*
611   if(b>=0)
612   {
613     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
614     {
615       // Follow first branch
616       int o=rn;
617       int t=(ba[i+b]-start)>>2;
618       j=7-b;if(t+j>=slen) j=slen-t-1;
619       for(;j>=0;j--)
620       {
621         if(!((unneeded_reg[t+j]>>r)&1)) {
622           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
623           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
624         }
625         else rn=o;
626       }
627     }
628   }*/
629   if(rn<10) return 1;
630   return 0;
631 }
632
633 // Try to match register allocations at the end of a loop with those
634 // at the beginning
635 int loop_reg(int i, int r, int hr)
636 {
637   int j,k;
638   for(j=0;j<9;j++)
639   {
640     if(i+j>=slen) {
641       j=slen-i-1;
642       break;
643     }
644     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
645     {
646       // Don't go past an unconditional jump
647       j++;
648       break;
649     }
650   }
651   k=0;
652   if(i>0){
653     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
654       k--;
655   }
656   for(;k<j;k++)
657   {
658     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
659     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
660     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
661     {
662       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
663       {
664         int t=(ba[i+k]-start)>>2;
665         int reg=get_reg(regs[t].regmap_entry,r);
666         if(reg>=0) return reg;
667         //reg=get_reg(regs[t+1].regmap_entry,r);
668         //if(reg>=0) return reg;
669       }
670     }
671   }
672   return hr;
673 }
674
675
676 // Allocate every register, preserving source/target regs
677 void alloc_all(struct regstat *cur,int i)
678 {
679   int hr;
680
681   for(hr=0;hr<HOST_REGS;hr++) {
682     if(hr!=EXCLUDE_REG) {
683       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
684          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
685       {
686         cur->regmap[hr]=-1;
687         cur->dirty&=~(1<<hr);
688       }
689       // Don't need zeros
690       if((cur->regmap[hr]&63)==0)
691       {
692         cur->regmap[hr]=-1;
693         cur->dirty&=~(1<<hr);
694       }
695     }
696   }
697 }
698
699 #ifdef __i386__
700 #include "assem_x86.c"
701 #endif
702 #ifdef __x86_64__
703 #include "assem_x64.c"
704 #endif
705 #ifdef __arm__
706 #include "assem_arm.c"
707 #endif
708
709 // Add virtual address mapping to linked list
710 void ll_add(struct ll_entry **head,int vaddr,void *addr)
711 {
712   struct ll_entry *new_entry;
713   new_entry=malloc(sizeof(struct ll_entry));
714   assert(new_entry!=NULL);
715   new_entry->vaddr=vaddr;
716   new_entry->reg_sv_flags=0;
717   new_entry->addr=addr;
718   new_entry->next=*head;
719   *head=new_entry;
720 }
721
722 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
723 {
724   ll_add(head,vaddr,addr);
725   (*head)->reg_sv_flags=reg_sv_flags;
726 }
727
728 // Check if an address is already compiled
729 // but don't return addresses which are about to expire from the cache
730 void *check_addr(u_int vaddr)
731 {
732   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
733   if(ht_bin[0]==vaddr) {
734     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
735       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
736   }
737   if(ht_bin[2]==vaddr) {
738     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
739       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
740   }
741   u_int page=get_page(vaddr);
742   struct ll_entry *head;
743   head=jump_in[page];
744   while(head!=NULL) {
745     if(head->vaddr==vaddr) {
746       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
747         // Update existing entry with current address
748         if(ht_bin[0]==vaddr) {
749           ht_bin[1]=(int)head->addr;
750           return head->addr;
751         }
752         if(ht_bin[2]==vaddr) {
753           ht_bin[3]=(int)head->addr;
754           return head->addr;
755         }
756         // Insert into hash table with low priority.
757         // Don't evict existing entries, as they are probably
758         // addresses that are being accessed frequently.
759         if(ht_bin[0]==-1) {
760           ht_bin[1]=(int)head->addr;
761           ht_bin[0]=vaddr;
762         }else if(ht_bin[2]==-1) {
763           ht_bin[3]=(int)head->addr;
764           ht_bin[2]=vaddr;
765         }
766         return head->addr;
767       }
768     }
769     head=head->next;
770   }
771   return 0;
772 }
773
774 void remove_hash(int vaddr)
775 {
776   //printf("remove hash: %x\n",vaddr);
777   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
778   if(ht_bin[2]==vaddr) {
779     ht_bin[2]=ht_bin[3]=-1;
780   }
781   if(ht_bin[0]==vaddr) {
782     ht_bin[0]=ht_bin[2];
783     ht_bin[1]=ht_bin[3];
784     ht_bin[2]=ht_bin[3]=-1;
785   }
786 }
787
788 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
789 {
790   struct ll_entry *next;
791   while(*head) {
792     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) ||
793        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
794     {
795       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
796       remove_hash((*head)->vaddr);
797       next=(*head)->next;
798       free(*head);
799       *head=next;
800     }
801     else
802     {
803       head=&((*head)->next);
804     }
805   }
806 }
807
808 // Remove all entries from linked list
809 void ll_clear(struct ll_entry **head)
810 {
811   struct ll_entry *cur;
812   struct ll_entry *next;
813   if((cur=*head)!=NULL) {
814     *head=0;
815     while(cur) {
816       next=cur->next;
817       free(cur);
818       cur=next;
819     }
820   }
821 }
822
823 // Dereference the pointers and remove if it matches
824 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
825 {
826   while(head) {
827     int ptr=get_pointer(head->addr);
828     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
829     if(((ptr>>shift)==(addr>>shift)) ||
830        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
831     {
832       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
833       u_int host_addr=(u_int)kill_pointer(head->addr);
834       #ifdef __arm__
835         needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
836       #endif
837     }
838     head=head->next;
839   }
840 }
841
842 // This is called when we write to a compiled block (see do_invstub)
843 void invalidate_page(u_int page)
844 {
845   struct ll_entry *head;
846   struct ll_entry *next;
847   head=jump_in[page];
848   jump_in[page]=0;
849   while(head!=NULL) {
850     inv_debug("INVALIDATE: %x\n",head->vaddr);
851     remove_hash(head->vaddr);
852     next=head->next;
853     free(head);
854     head=next;
855   }
856   head=jump_out[page];
857   jump_out[page]=0;
858   while(head!=NULL) {
859     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
860     u_int host_addr=(u_int)kill_pointer(head->addr);
861     #ifdef __arm__
862       needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
863     #endif
864     next=head->next;
865     free(head);
866     head=next;
867   }
868 }
869
870 static void invalidate_block_range(u_int block, u_int first, u_int last)
871 {
872   u_int page=get_page(block<<12);
873   //printf("first=%d last=%d\n",first,last);
874   invalidate_page(page);
875   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
876   assert(last<page+5);
877   // Invalidate the adjacent pages if a block crosses a 4K boundary
878   while(first<page) {
879     invalidate_page(first);
880     first++;
881   }
882   for(first=page+1;first<last;first++) {
883     invalidate_page(first);
884   }
885   #ifdef __arm__
886     do_clear_cache();
887   #endif
888
889   // Don't trap writes
890   invalid_code[block]=1;
891
892   #ifdef USE_MINI_HT
893   memset(mini_ht,-1,sizeof(mini_ht));
894   #endif
895 }
896
897 void invalidate_block(u_int block)
898 {
899   u_int page=get_page(block<<12);
900   u_int vpage=get_vpage(block<<12);
901   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
902   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
903   u_int first,last;
904   first=last=page;
905   struct ll_entry *head;
906   head=jump_dirty[vpage];
907   //printf("page=%d vpage=%d\n",page,vpage);
908   while(head!=NULL) {
909     u_int start,end;
910     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
911       get_bounds((int)head->addr,&start,&end);
912       //printf("start: %x end: %x\n",start,end);
913       if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE) {
914         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
915           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
916           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
917         }
918       }
919     }
920     head=head->next;
921   }
922   invalidate_block_range(block,first,last);
923 }
924
925 void invalidate_addr(u_int addr)
926 {
927   //static int rhits;
928   // this check is done by the caller
929   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
930   u_int page=get_vpage(addr);
931   if(page<2048) { // RAM
932     struct ll_entry *head;
933     u_int addr_min=~0, addr_max=0;
934     u_int mask=RAM_SIZE-1;
935     u_int addr_main=0x80000000|(addr&mask);
936     int pg1;
937     inv_code_start=addr_main&~0xfff;
938     inv_code_end=addr_main|0xfff;
939     pg1=page;
940     if (pg1>0) {
941       // must check the previous page too because a block can span a page boundary
942       pg1--;
943       inv_code_start-=0x1000;
944     }
945     for(;pg1<=page;pg1++) {
946       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
947         u_int start,end;
948         get_bounds((int)head->addr,&start,&end);
949         if(ram_offset) {
950           start-=ram_offset;
951           end-=ram_offset;
952         }
953         if(start<=addr_main&&addr_main<end) {
954           if(start<addr_min) addr_min=start;
955           if(end>addr_max) addr_max=end;
956         }
957         else if(addr_main<start) {
958           if(start<inv_code_end)
959             inv_code_end=start-1;
960         }
961         else {
962           if(end>inv_code_start)
963             inv_code_start=end;
964         }
965       }
966     }
967     if (addr_min!=~0) {
968       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
969       inv_code_start=inv_code_end=~0;
970       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
971       return;
972     }
973     else {
974       inv_code_start=(addr&~mask)|(inv_code_start&mask);
975       inv_code_end=(addr&~mask)|(inv_code_end&mask);
976       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
977       return;
978     }
979   }
980   invalidate_block(addr>>12);
981 }
982
983 // This is called when loading a save state.
984 // Anything could have changed, so invalidate everything.
985 void invalidate_all_pages()
986 {
987   u_int page,n;
988   for(page=0;page<4096;page++)
989     invalidate_page(page);
990   for(page=0;page<1048576;page++)
991     if(!invalid_code[page]) {
992       restore_candidate[(page&2047)>>3]|=1<<(page&7);
993       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
994     }
995   #ifdef __arm__
996   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
997   #endif
998   #ifdef USE_MINI_HT
999   memset(mini_ht,-1,sizeof(mini_ht));
1000   #endif
1001 }
1002
1003 // Add an entry to jump_out after making a link
1004 void add_link(u_int vaddr,void *src)
1005 {
1006   u_int page=get_page(vaddr);
1007   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1008   int *ptr=(int *)(src+4);
1009   assert((*ptr&0x0fff0000)==0x059f0000);
1010   ll_add(jump_out+page,vaddr,src);
1011   //int ptr=get_pointer(src);
1012   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1013 }
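/* Note on the assert above (my annotation): 0x059f0000 under the 0x0fff0000
 * mask is the ARM encoding of "ldr rX, [pc, #imm]", so the word at src+4 is
 * expected to be a pc-relative literal load that can later be re-pointed when
 * the linked-to block is invalidated or moved. */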
1014
1015 // If a code block was found to be unmodified (bit was set in
1016 // restore_candidate) and it remains unmodified (bit is clear
1017 // in invalid_code) then move the entries for that 4K page from
1018 // the dirty list to the clean list.
1019 void clean_blocks(u_int page)
1020 {
1021   struct ll_entry *head;
1022   inv_debug("INV: clean_blocks page=%d\n",page);
1023   head=jump_dirty[page];
1024   while(head!=NULL) {
1025     if(!invalid_code[head->vaddr>>12]) {
1026       // Don't restore blocks which are about to expire from the cache
1027       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1028         u_int start,end;
1029         if(verify_dirty((int)head->addr)) {
1030           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1031           u_int i;
1032           u_int inv=0;
1033           get_bounds((int)head->addr,&start,&end);
1034           if(start-(u_int)rdram<RAM_SIZE) {
1035             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1036               inv|=invalid_code[i];
1037             }
1038           }
1039           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1040             inv=1;
1041           }
1042           if(!inv) {
1043             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1044             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1045               u_int ppage=page;
1046               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1047               //printf("page=%x, addr=%x\n",page,head->vaddr);
1048               //assert(head->vaddr>>12==(page|0x80000));
1049               ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
1050               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1051               if(ht_bin[0]==head->vaddr) {
1052                 ht_bin[1]=(int)clean_addr; // Replace existing entry
1053               }
1054               if(ht_bin[2]==head->vaddr) {
1055                 ht_bin[3]=(int)clean_addr; // Replace existing entry
1056               }
1057             }
1058           }
1059         }
1060       }
1061     }
1062     head=head->next;
1063   }
1064 }
1065
1066
1067 void mov_alloc(struct regstat *current,int i)
1068 {
1069   // Note: Don't need to actually alloc the source registers
1070   if((~current->is32>>rs1[i])&1) {
1071     //alloc_reg64(current,i,rs1[i]);
1072     alloc_reg64(current,i,rt1[i]);
1073     current->is32&=~(1LL<<rt1[i]);
1074   } else {
1075     //alloc_reg(current,i,rs1[i]);
1076     alloc_reg(current,i,rt1[i]);
1077     current->is32|=(1LL<<rt1[i]);
1078   }
1079   clear_const(current,rs1[i]);
1080   clear_const(current,rt1[i]);
1081   dirty_reg(current,rt1[i]);
1082 }
1083
1084 void shiftimm_alloc(struct regstat *current,int i)
1085 {
1086   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1087   {
1088     if(rt1[i]) {
1089       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1090       else lt1[i]=rs1[i];
1091       alloc_reg(current,i,rt1[i]);
1092       current->is32|=1LL<<rt1[i];
1093       dirty_reg(current,rt1[i]);
1094       if(is_const(current,rs1[i])) {
1095         int v=get_const(current,rs1[i]);
1096         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1097         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1098         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1099       }
1100       else clear_const(current,rt1[i]);
1101     }
1102   }
1103   else
1104   {
1105     clear_const(current,rs1[i]);
1106     clear_const(current,rt1[i]);
1107   }
1108
1109   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1110   {
1111     if(rt1[i]) {
1112       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1113       alloc_reg64(current,i,rt1[i]);
1114       current->is32&=~(1LL<<rt1[i]);
1115       dirty_reg(current,rt1[i]);
1116     }
1117   }
1118   if(opcode2[i]==0x3c) // DSLL32
1119   {
1120     if(rt1[i]) {
1121       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1122       alloc_reg64(current,i,rt1[i]);
1123       current->is32&=~(1LL<<rt1[i]);
1124       dirty_reg(current,rt1[i]);
1125     }
1126   }
1127   if(opcode2[i]==0x3e) // DSRL32
1128   {
1129     if(rt1[i]) {
1130       alloc_reg64(current,i,rs1[i]);
1131       if(imm[i]==32) {
1132         alloc_reg64(current,i,rt1[i]);
1133         current->is32&=~(1LL<<rt1[i]);
1134       } else {
1135         alloc_reg(current,i,rt1[i]);
1136         current->is32|=1LL<<rt1[i];
1137       }
1138       dirty_reg(current,rt1[i]);
1139     }
1140   }
1141   if(opcode2[i]==0x3f) // DSRA32
1142   {
1143     if(rt1[i]) {
1144       alloc_reg64(current,i,rs1[i]);
1145       alloc_reg(current,i,rt1[i]);
1146       current->is32|=1LL<<rt1[i];
1147       dirty_reg(current,rt1[i]);
1148     }
1149   }
1150 }
1151
1152 void shift_alloc(struct regstat *current,int i)
1153 {
1154   if(rt1[i]) {
1155     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1156     {
1157       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1158       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1159       alloc_reg(current,i,rt1[i]);
1160       if(rt1[i]==rs2[i]) {
1161         alloc_reg_temp(current,i,-1);
1162         minimum_free_regs[i]=1;
1163       }
1164       current->is32|=1LL<<rt1[i];
1165     } else { // DSLLV/DSRLV/DSRAV
1166       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1167       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1168       alloc_reg64(current,i,rt1[i]);
1169       current->is32&=~(1LL<<rt1[i]);
1170       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1171       {
1172         alloc_reg_temp(current,i,-1);
1173         minimum_free_regs[i]=1;
1174       }
1175     }
1176     clear_const(current,rs1[i]);
1177     clear_const(current,rs2[i]);
1178     clear_const(current,rt1[i]);
1179     dirty_reg(current,rt1[i]);
1180   }
1181 }
1182
1183 void alu_alloc(struct regstat *current,int i)
1184 {
1185   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1186     if(rt1[i]) {
1187       if(rs1[i]&&rs2[i]) {
1188         alloc_reg(current,i,rs1[i]);
1189         alloc_reg(current,i,rs2[i]);
1190       }
1191       else {
1192         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1193         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1194       }
1195       alloc_reg(current,i,rt1[i]);
1196     }
1197     current->is32|=1LL<<rt1[i];
1198   }
1199   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1200     if(rt1[i]) {
1201       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1202       {
1203         alloc_reg64(current,i,rs1[i]);
1204         alloc_reg64(current,i,rs2[i]);
1205         alloc_reg(current,i,rt1[i]);
1206       } else {
1207         alloc_reg(current,i,rs1[i]);
1208         alloc_reg(current,i,rs2[i]);
1209         alloc_reg(current,i,rt1[i]);
1210       }
1211     }
1212     current->is32|=1LL<<rt1[i];
1213   }
1214   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1215     if(rt1[i]) {
1216       if(rs1[i]&&rs2[i]) {
1217         alloc_reg(current,i,rs1[i]);
1218         alloc_reg(current,i,rs2[i]);
1219       }
1220       else
1221       {
1222         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1223         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1224       }
1225       alloc_reg(current,i,rt1[i]);
1226       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1227       {
1228         if(!((current->uu>>rt1[i])&1)) {
1229           alloc_reg64(current,i,rt1[i]);
1230         }
1231         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1232           if(rs1[i]&&rs2[i]) {
1233             alloc_reg64(current,i,rs1[i]);
1234             alloc_reg64(current,i,rs2[i]);
1235           }
1236           else
1237           {
1238             // Is it really worth it to keep 64-bit values in registers?
1239             #ifdef NATIVE_64BIT
1240             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1241             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1242             #endif
1243           }
1244         }
1245         current->is32&=~(1LL<<rt1[i]);
1246       } else {
1247         current->is32|=1LL<<rt1[i];
1248       }
1249     }
1250   }
1251   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1252     if(rt1[i]) {
1253       if(rs1[i]&&rs2[i]) {
1254         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1255           alloc_reg64(current,i,rs1[i]);
1256           alloc_reg64(current,i,rs2[i]);
1257           alloc_reg64(current,i,rt1[i]);
1258         } else {
1259           alloc_reg(current,i,rs1[i]);
1260           alloc_reg(current,i,rs2[i]);
1261           alloc_reg(current,i,rt1[i]);
1262         }
1263       }
1264       else {
1265         alloc_reg(current,i,rt1[i]);
1266         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1267           // DADD used as move, or zeroing
1268           // If we have a 64-bit source, then make the target 64 bits too
1269           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1270             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1271             alloc_reg64(current,i,rt1[i]);
1272           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1273             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1274             alloc_reg64(current,i,rt1[i]);
1275           }
1276           if(opcode2[i]>=0x2e&&rs2[i]) {
1277             // DSUB used as negation - 64-bit result
1278             // If we have a 32-bit register, extend it to 64 bits
1279             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1280             alloc_reg64(current,i,rt1[i]);
1281           }
1282         }
1283       }
1284       if(rs1[i]&&rs2[i]) {
1285         current->is32&=~(1LL<<rt1[i]);
1286       } else if(rs1[i]) {
1287         current->is32&=~(1LL<<rt1[i]);
1288         if((current->is32>>rs1[i])&1)
1289           current->is32|=1LL<<rt1[i];
1290       } else if(rs2[i]) {
1291         current->is32&=~(1LL<<rt1[i]);
1292         if((current->is32>>rs2[i])&1)
1293           current->is32|=1LL<<rt1[i];
1294       } else {
1295         current->is32|=1LL<<rt1[i];
1296       }
1297     }
1298   }
1299   clear_const(current,rs1[i]);
1300   clear_const(current,rs2[i]);
1301   clear_const(current,rt1[i]);
1302   dirty_reg(current,rt1[i]);
1303 }
1304
1305 void imm16_alloc(struct regstat *current,int i)
1306 {
1307   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1308   else lt1[i]=rs1[i];
1309   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1310   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1311     current->is32&=~(1LL<<rt1[i]);
1312     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1313       // TODO: Could preserve the 32-bit flag if the immediate is zero
1314       alloc_reg64(current,i,rt1[i]);
1315       alloc_reg64(current,i,rs1[i]);
1316     }
1317     clear_const(current,rs1[i]);
1318     clear_const(current,rt1[i]);
1319   }
1320   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1321     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1322     current->is32|=1LL<<rt1[i];
1323     clear_const(current,rs1[i]);
1324     clear_const(current,rt1[i]);
1325   }
1326   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1327     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1328       if(rs1[i]!=rt1[i]) {
1329         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1330         alloc_reg64(current,i,rt1[i]);
1331         current->is32&=~(1LL<<rt1[i]);
1332       }
1333     }
1334     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1335     if(is_const(current,rs1[i])) {
1336       int v=get_const(current,rs1[i]);
1337       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1338       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1339       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1340     }
1341     else clear_const(current,rt1[i]);
1342   }
1343   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1344     if(is_const(current,rs1[i])) {
1345       int v=get_const(current,rs1[i]);
1346       set_const(current,rt1[i],v+imm[i]);
1347     }
1348     else clear_const(current,rt1[i]);
1349     current->is32|=1LL<<rt1[i];
1350   }
1351   else {
1352     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1353     current->is32|=1LL<<rt1[i];
1354   }
1355   dirty_reg(current,rt1[i]);
1356 }
1357
1358 void load_alloc(struct regstat *current,int i)
1359 {
1360   clear_const(current,rt1[i]);
1361   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1362   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1363   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1364   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1365     alloc_reg(current,i,rt1[i]);
1366     assert(get_reg(current->regmap,rt1[i])>=0);
1367     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1368     {
1369       current->is32&=~(1LL<<rt1[i]);
1370       alloc_reg64(current,i,rt1[i]);
1371     }
1372     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1373     {
1374       current->is32&=~(1LL<<rt1[i]);
1375       alloc_reg64(current,i,rt1[i]);
1376       alloc_all(current,i);
1377       alloc_reg64(current,i,FTEMP);
1378       minimum_free_regs[i]=HOST_REGS;
1379     }
1380     else current->is32|=1LL<<rt1[i];
1381     dirty_reg(current,rt1[i]);
1382     // LWL/LWR need a temporary register for the old value
1383     if(opcode[i]==0x22||opcode[i]==0x26)
1384     {
1385       alloc_reg(current,i,FTEMP);
1386       alloc_reg_temp(current,i,-1);
1387       minimum_free_regs[i]=1;
1388     }
1389   }
1390   else
1391   {
1392     // Load to r0 or unneeded register (dummy load)
1393     // but we still need a register to calculate the address
1394     if(opcode[i]==0x22||opcode[i]==0x26)
1395     {
1396       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1397     }
1398     alloc_reg_temp(current,i,-1);
1399     minimum_free_regs[i]=1;
1400     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1401     {
1402       alloc_all(current,i);
1403       alloc_reg64(current,i,FTEMP);
1404       minimum_free_regs[i]=HOST_REGS;
1405     }
1406   }
1407 }
1408
1409 void store_alloc(struct regstat *current,int i)
1410 {
1411   clear_const(current,rs2[i]);
1412   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1413   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1414   alloc_reg(current,i,rs2[i]);
1415   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1416     alloc_reg64(current,i,rs2[i]);
1417     if(rs2[i]) alloc_reg(current,i,FTEMP);
1418   }
1419   #if defined(HOST_IMM8)
1420   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1421   else alloc_reg(current,i,INVCP);
1422   #endif
1423   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1424     alloc_reg(current,i,FTEMP);
1425   }
1426   // We need a temporary register for address generation
1427   alloc_reg_temp(current,i,-1);
1428   minimum_free_regs[i]=1;
1429 }
1430
1431 void c1ls_alloc(struct regstat *current,int i)
1432 {
1433   //clear_const(current,rs1[i]); // FIXME
1434   clear_const(current,rt1[i]);
1435   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1436   alloc_reg(current,i,CSREG); // Status
1437   alloc_reg(current,i,FTEMP);
1438   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1439     alloc_reg64(current,i,FTEMP);
1440   }
1441   #if defined(HOST_IMM8)
1442   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1443   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1444     alloc_reg(current,i,INVCP);
1445   #endif
1446   // We need a temporary register for address generation
1447   alloc_reg_temp(current,i,-1);
1448 }
1449
1450 void c2ls_alloc(struct regstat *current,int i)
1451 {
1452   clear_const(current,rt1[i]);
1453   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1454   alloc_reg(current,i,FTEMP);
1455   #if defined(HOST_IMM8)
1456   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1457   if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1458     alloc_reg(current,i,INVCP);
1459   #endif
1460   // We need a temporary register for address generation
1461   alloc_reg_temp(current,i,-1);
1462   minimum_free_regs[i]=1;
1463 }
1464
1465 #ifndef multdiv_alloc
1466 void multdiv_alloc(struct regstat *current,int i)
1467 {
1468   //  case 0x18: MULT
1469   //  case 0x19: MULTU
1470   //  case 0x1A: DIV
1471   //  case 0x1B: DIVU
1472   //  case 0x1C: DMULT
1473   //  case 0x1D: DMULTU
1474   //  case 0x1E: DDIV
1475   //  case 0x1F: DDIVU
1476   clear_const(current,rs1[i]);
1477   clear_const(current,rs2[i]);
1478   if(rs1[i]&&rs2[i])
1479   {
1480     if((opcode2[i]&4)==0) // 32-bit
1481     {
1482       current->u&=~(1LL<<HIREG);
1483       current->u&=~(1LL<<LOREG);
1484       alloc_reg(current,i,HIREG);
1485       alloc_reg(current,i,LOREG);
1486       alloc_reg(current,i,rs1[i]);
1487       alloc_reg(current,i,rs2[i]);
1488       current->is32|=1LL<<HIREG;
1489       current->is32|=1LL<<LOREG;
1490       dirty_reg(current,HIREG);
1491       dirty_reg(current,LOREG);
1492     }
1493     else // 64-bit
1494     {
1495       current->u&=~(1LL<<HIREG);
1496       current->u&=~(1LL<<LOREG);
1497       current->uu&=~(1LL<<HIREG);
1498       current->uu&=~(1LL<<LOREG);
1499       alloc_reg64(current,i,HIREG);
1500       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1501       alloc_reg64(current,i,rs1[i]);
1502       alloc_reg64(current,i,rs2[i]);
1503       alloc_all(current,i);
1504       current->is32&=~(1LL<<HIREG);
1505       current->is32&=~(1LL<<LOREG);
1506       dirty_reg(current,HIREG);
1507       dirty_reg(current,LOREG);
1508       minimum_free_regs[i]=HOST_REGS;
1509     }
1510   }
1511   else
1512   {
1513     // Multiply by zero is zero.
1514     // MIPS does not have a divide by zero exception.
1515     // The result is undefined, so we return zero.
1516     alloc_reg(current,i,HIREG);
1517     alloc_reg(current,i,LOREG);
1518     current->is32|=1LL<<HIREG;
1519     current->is32|=1LL<<LOREG;
1520     dirty_reg(current,HIREG);
1521     dirty_reg(current,LOREG);
1522   }
1523 }
1524 #endif
1525
1526 void cop0_alloc(struct regstat *current,int i)
1527 {
1528   if(opcode2[i]==0) // MFC0
1529   {
1530     if(rt1[i]) {
1531       clear_const(current,rt1[i]);
1532       alloc_all(current,i);
1533       alloc_reg(current,i,rt1[i]);
1534       current->is32|=1LL<<rt1[i];
1535       dirty_reg(current,rt1[i]);
1536     }
1537   }
1538   else if(opcode2[i]==4) // MTC0
1539   {
1540     if(rs1[i]){
1541       clear_const(current,rs1[i]);
1542       alloc_reg(current,i,rs1[i]);
1543       alloc_all(current,i);
1544     }
1545     else {
1546       alloc_all(current,i); // FIXME: Keep r0
1547       current->u&=~1LL;
1548       alloc_reg(current,i,0);
1549     }
1550   }
1551   else
1552   {
1553     // TLBR/TLBWI/TLBWR/TLBP/ERET
1554     assert(opcode2[i]==0x10);
1555     alloc_all(current,i);
1556   }
1557   minimum_free_regs[i]=HOST_REGS;
1558 }
1559
1560 void cop1_alloc(struct regstat *current,int i)
1561 {
1562   alloc_reg(current,i,CSREG); // Load status
1563   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1564   {
1565     if(rt1[i]){
1566       clear_const(current,rt1[i]);
1567       if(opcode2[i]==1) {
1568         alloc_reg64(current,i,rt1[i]); // DMFC1
1569         current->is32&=~(1LL<<rt1[i]);
1570       }else{
1571         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1572         current->is32|=1LL<<rt1[i];
1573       }
1574       dirty_reg(current,rt1[i]);
1575     }
1576     alloc_reg_temp(current,i,-1);
1577   }
1578   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1579   {
1580     if(rs1[i]){
1581       clear_const(current,rs1[i]);
1582       if(opcode2[i]==5)
1583         alloc_reg64(current,i,rs1[i]); // DMTC1
1584       else
1585         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1586       alloc_reg_temp(current,i,-1);
1587     }
1588     else {
1589       current->u&=~1LL;
1590       alloc_reg(current,i,0);
1591       alloc_reg_temp(current,i,-1);
1592     }
1593   }
1594   minimum_free_regs[i]=1;
1595 }
1596 void fconv_alloc(struct regstat *current,int i)
1597 {
1598   alloc_reg(current,i,CSREG); // Load status
1599   alloc_reg_temp(current,i,-1);
1600   minimum_free_regs[i]=1;
1601 }
1602 void float_alloc(struct regstat *current,int i)
1603 {
1604   alloc_reg(current,i,CSREG); // Load status
1605   alloc_reg_temp(current,i,-1);
1606   minimum_free_regs[i]=1;
1607 }
1608 void c2op_alloc(struct regstat *current,int i)
1609 {
1610   alloc_reg_temp(current,i,-1);
1611 }
1612 void fcomp_alloc(struct regstat *current,int i)
1613 {
1614   alloc_reg(current,i,CSREG); // Load status
1615   alloc_reg(current,i,FSREG); // Load flags
1616   dirty_reg(current,FSREG); // Flag will be modified
1617   alloc_reg_temp(current,i,-1);
1618   minimum_free_regs[i]=1;
1619 }
1620
1621 void syscall_alloc(struct regstat *current,int i)
1622 {
1623   alloc_cc(current,i);
1624   dirty_reg(current,CCREG);
1625   alloc_all(current,i);
1626   minimum_free_regs[i]=HOST_REGS;
1627   current->isconst=0;
1628 }
1629
1630 void delayslot_alloc(struct regstat *current,int i)
1631 {
1632   switch(itype[i]) {
1633     case UJUMP:
1634     case CJUMP:
1635     case SJUMP:
1636     case RJUMP:
1637     case FJUMP:
1638     case SYSCALL:
1639     case HLECALL:
1640     case SPAN:
1641       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1642       SysPrintf("Disabled speculative precompilation\n");
1643       stop_after_jal=1;
1644       break;
1645     case IMM16:
1646       imm16_alloc(current,i);
1647       break;
1648     case LOAD:
1649     case LOADLR:
1650       load_alloc(current,i);
1651       break;
1652     case STORE:
1653     case STORELR:
1654       store_alloc(current,i);
1655       break;
1656     case ALU:
1657       alu_alloc(current,i);
1658       break;
1659     case SHIFT:
1660       shift_alloc(current,i);
1661       break;
1662     case MULTDIV:
1663       multdiv_alloc(current,i);
1664       break;
1665     case SHIFTIMM:
1666       shiftimm_alloc(current,i);
1667       break;
1668     case MOV:
1669       mov_alloc(current,i);
1670       break;
1671     case COP0:
1672       cop0_alloc(current,i);
1673       break;
1674     case COP1:
1675     case COP2:
1676       cop1_alloc(current,i);
1677       break;
1678     case C1LS:
1679       c1ls_alloc(current,i);
1680       break;
1681     case C2LS:
1682       c2ls_alloc(current,i);
1683       break;
1684     case FCONV:
1685       fconv_alloc(current,i);
1686       break;
1687     case FLOAT:
1688       float_alloc(current,i);
1689       break;
1690     case FCOMP:
1691       fcomp_alloc(current,i);
1692       break;
1693     case C2OP:
1694       c2op_alloc(current,i);
1695       break;
1696   }
1697 }
1698
1699 // Special case where a branch and delay slot span two pages in virtual memory
1700 static void pagespan_alloc(struct regstat *current,int i)
1701 {
1702   current->isconst=0;
1703   current->wasconst=0;
1704   regs[i].wasconst=0;
1705   minimum_free_regs[i]=HOST_REGS;
1706   alloc_all(current,i);
1707   alloc_cc(current,i);
1708   dirty_reg(current,CCREG);
1709   if(opcode[i]==3) // JAL
1710   {
1711     alloc_reg(current,i,31);
1712     dirty_reg(current,31);
1713   }
1714   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1715   {
1716     alloc_reg(current,i,rs1[i]);
1717     if (rt1[i]!=0) {
1718       alloc_reg(current,i,rt1[i]);
1719       dirty_reg(current,rt1[i]);
1720     }
1721   }
1722   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1723   {
1724     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1725     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1726     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1727     {
1728       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1729       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1730     }
1731   }
1732   else
1733   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1734   {
1735     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1736     if(!((current->is32>>rs1[i])&1))
1737     {
1738       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1739     }
1740   }
1741   else
1742   if(opcode[i]==0x11) // BC1
1743   {
1744     alloc_reg(current,i,FSREG);
1745     alloc_reg(current,i,CSREG);
1746   }
1747   //else ...
1748 }
1749
1750 void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1751 {
1752   stubs[stubcount][0]=type;
1753   stubs[stubcount][1]=addr;
1754   stubs[stubcount][2]=retaddr;
1755   stubs[stubcount][3]=a;
1756   stubs[stubcount][4]=b;
1757   stubs[stubcount][5]=c;
1758   stubs[stubcount][6]=d;
1759   stubs[stubcount][7]=e;
1760   stubcount++;
1761 }
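// Each stub entry records the patch site ('addr', the branch emitted on the slow
// path), the point to return to ('retaddr', the current 'out' pointer), and five
// free-form arguments that the handler selected by 'type' interprets when the
// recorded stubs are emitted after the main block.  The memory access paths below
// use it like this (a representative call, not a new API):
//   add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);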
1762
1763 // Write out a single register
1764 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1765 {
1766   int hr;
1767   for(hr=0;hr<HOST_REGS;hr++) {
1768     if(hr!=EXCLUDE_REG) {
1769       if((regmap[hr]&63)==r) {
1770         if((dirty>>hr)&1) {
1771           if(regmap[hr]<64) {
1772             emit_storereg(r,hr);
1773           }else{
1774             emit_storereg(r|64,hr);
1775           }
1776         }
1777       }
1778     }
1779   }
1780 }
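// Note: guest register numbers with bit 6 set (r|64) denote the upper 32 bits of a
// 64-bit MIPS register; the plain number is the lower half.  That is why a dirty
// mapping >=64 is written back above with emit_storereg(r|64,hr).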
1781
1782 int mchecksum()
1783 {
1784   //if(!tracedebug) return 0;
1785   int i;
1786   int sum=0;
1787   for(i=0;i<2097152;i++) {
1788     unsigned int temp=sum;
1789     sum<<=1;
1790     sum|=(~temp)>>31;
1791     sum^=((u_int *)rdram)[i];
1792   }
1793   return sum;
1794 }
1795 int rchecksum()
1796 {
1797   int i;
1798   int sum=0;
1799   for(i=0;i<64;i++)
1800     sum^=((u_int *)reg)[i];
1801   return sum;
1802 }
1803 void rlist()
1804 {
1805   int i;
1806   printf("TRACE: ");
1807   for(i=0;i<32;i++)
1808     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1809   printf("\n");
1810 }
1811
1812 void enabletrace()
1813 {
1814   tracedebug=1;
1815 }
1816
1817 void memdebug(int i)
1818 {
1819   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
1820   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
1821   //rlist();
1822   //if(tracedebug) {
1823   //if(Count>=-2084597794) {
1824   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
1825   //if(0) {
1826     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
1827     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
1828     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
1829     rlist();
1830     #ifdef __i386__
1831     printf("TRACE: %x\n",(&i)[-1]);
1832     #endif
1833     #ifdef __arm__
1834     int j;
1835     printf("TRACE: %x \n",(&j)[10]);
1836     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
1837     #endif
1838     //fflush(stdout);
1839   }
1840   //printf("TRACE: %x\n",(&i)[-1]);
1841 }
1842
1843 void alu_assemble(int i,struct regstat *i_regs)
1844 {
1845   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1846     if(rt1[i]) {
1847       signed char s1,s2,t;
1848       t=get_reg(i_regs->regmap,rt1[i]);
1849       if(t>=0) {
1850         s1=get_reg(i_regs->regmap,rs1[i]);
1851         s2=get_reg(i_regs->regmap,rs2[i]);
1852         if(rs1[i]&&rs2[i]) {
1853           assert(s1>=0);
1854           assert(s2>=0);
1855           if(opcode2[i]&2) emit_sub(s1,s2,t);
1856           else emit_add(s1,s2,t);
1857         }
1858         else if(rs1[i]) {
1859           if(s1>=0) emit_mov(s1,t);
1860           else emit_loadreg(rs1[i],t);
1861         }
1862         else if(rs2[i]) {
1863           if(s2>=0) {
1864             if(opcode2[i]&2) emit_neg(s2,t);
1865             else emit_mov(s2,t);
1866           }
1867           else {
1868             emit_loadreg(rs2[i],t);
1869             if(opcode2[i]&2) emit_neg(t,t);
1870           }
1871         }
1872         else emit_zeroreg(t);
1873       }
1874     }
1875   }
1876   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1877     if(rt1[i]) {
1878       signed char s1l,s2l,s1h,s2h,tl,th;
1879       tl=get_reg(i_regs->regmap,rt1[i]);
1880       th=get_reg(i_regs->regmap,rt1[i]|64);
1881       if(tl>=0) {
1882         s1l=get_reg(i_regs->regmap,rs1[i]);
1883         s2l=get_reg(i_regs->regmap,rs2[i]);
1884         s1h=get_reg(i_regs->regmap,rs1[i]|64);
1885         s2h=get_reg(i_regs->regmap,rs2[i]|64);
1886         if(rs1[i]&&rs2[i]) {
1887           assert(s1l>=0);
1888           assert(s2l>=0);
1889           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
1890           else emit_adds(s1l,s2l,tl);
1891           if(th>=0) {
1892             #ifdef INVERTED_CARRY
1893             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
1894             #else
1895             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
1896             #endif
1897             else emit_add(s1h,s2h,th);
1898           }
1899         }
1900         else if(rs1[i]) {
1901           if(s1l>=0) emit_mov(s1l,tl);
1902           else emit_loadreg(rs1[i],tl);
1903           if(th>=0) {
1904             if(s1h>=0) emit_mov(s1h,th);
1905             else emit_loadreg(rs1[i]|64,th);
1906           }
1907         }
1908         else if(rs2[i]) {
1909           if(s2l>=0) {
1910             if(opcode2[i]&2) emit_negs(s2l,tl);
1911             else emit_mov(s2l,tl);
1912           }
1913           else {
1914             emit_loadreg(rs2[i],tl);
1915             if(opcode2[i]&2) emit_negs(tl,tl);
1916           }
1917           if(th>=0) {
1918             #ifdef INVERTED_CARRY
1919             if(s2h>=0) emit_mov(s2h,th);
1920             else emit_loadreg(rs2[i]|64,th);
1921             if(opcode2[i]&2) {
1922               emit_adcimm(-1,th); // x86 has inverted carry flag
1923               emit_not(th,th);
1924             }
1925             #else
1926             if(opcode2[i]&2) {
1927               if(s2h>=0) emit_rscimm(s2h,0,th);
1928               else {
1929                 emit_loadreg(rs2[i]|64,th);
1930                 emit_rscimm(th,0,th);
1931               }
1932             }else{
1933               if(s2h>=0) emit_mov(s2h,th);
1934               else emit_loadreg(rs2[i]|64,th);
1935             }
1936             #endif
1937           }
1938         }
1939         else {
1940           emit_zeroreg(tl);
1941           if(th>=0) emit_zeroreg(th);
1942         }
1943       }
1944     }
1945   }
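  // SLT/SLTU: rd = (rs < rt) ? 1 : 0 -- signed compare for SLT, unsigned for SLTU.
  // The rs2==r0 and rs1==r0 special cases below avoid emitting a full compare.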
1946   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1947     if(rt1[i]) {
1948       signed char s1l,s1h,s2l,s2h,t;
1949       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
1950       {
1951         t=get_reg(i_regs->regmap,rt1[i]);
1952         //assert(t>=0);
1953         if(t>=0) {
1954           s1l=get_reg(i_regs->regmap,rs1[i]);
1955           s1h=get_reg(i_regs->regmap,rs1[i]|64);
1956           s2l=get_reg(i_regs->regmap,rs2[i]);
1957           s2h=get_reg(i_regs->regmap,rs2[i]|64);
1958           if(rs2[i]==0) // rx<r0
1959           {
1960             assert(s1h>=0);
1961             if(opcode2[i]==0x2a) // SLT
1962               emit_shrimm(s1h,31,t);
1963             else // SLTU (unsigned can not be less than zero)
1964               emit_zeroreg(t);
1965           }
1966           else if(rs1[i]==0) // r0<rx
1967           {
1968             assert(s2h>=0);
1969             if(opcode2[i]==0x2a) // SLT
1970               emit_set_gz64_32(s2h,s2l,t);
1971             else // SLTU (set if not zero)
1972               emit_set_nz64_32(s2h,s2l,t);
1973           }
1974           else {
1975             assert(s1l>=0);assert(s1h>=0);
1976             assert(s2l>=0);assert(s2h>=0);
1977             if(opcode2[i]==0x2a) // SLT
1978               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
1979             else // SLTU
1980               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
1981           }
1982         }
1983       } else {
1984         t=get_reg(i_regs->regmap,rt1[i]);
1985         //assert(t>=0);
1986         if(t>=0) {
1987           s1l=get_reg(i_regs->regmap,rs1[i]);
1988           s2l=get_reg(i_regs->regmap,rs2[i]);
1989           if(rs2[i]==0) // rx<r0
1990           {
1991             assert(s1l>=0);
1992             if(opcode2[i]==0x2a) // SLT
1993               emit_shrimm(s1l,31,t);
1994             else // SLTU (unsigned can not be less than zero)
1995               emit_zeroreg(t);
1996           }
1997           else if(rs1[i]==0) // r0<rx
1998           {
1999             assert(s2l>=0);
2000             if(opcode2[i]==0x2a) // SLT
2001               emit_set_gz32(s2l,t);
2002             else // SLTU (set if not zero)
2003               emit_set_nz32(s2l,t);
2004           }
2005           else{
2006             assert(s1l>=0);assert(s2l>=0);
2007             if(opcode2[i]==0x2a) // SLT
2008               emit_set_if_less32(s1l,s2l,t);
2009             else // SLTU
2010               emit_set_if_carry32(s1l,s2l,t);
2011           }
2012         }
2013       }
2014     }
2015   }
2016   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2017     if(rt1[i]) {
2018       signed char s1l,s1h,s2l,s2h,th,tl;
2019       tl=get_reg(i_regs->regmap,rt1[i]);
2020       th=get_reg(i_regs->regmap,rt1[i]|64);
2021       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2022       {
2023         assert(tl>=0);
2024         if(tl>=0) {
2025           s1l=get_reg(i_regs->regmap,rs1[i]);
2026           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2027           s2l=get_reg(i_regs->regmap,rs2[i]);
2028           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2029           if(rs1[i]&&rs2[i]) {
2030             assert(s1l>=0);assert(s1h>=0);
2031             assert(s2l>=0);assert(s2h>=0);
2032             if(opcode2[i]==0x24) { // AND
2033               emit_and(s1l,s2l,tl);
2034               emit_and(s1h,s2h,th);
2035             } else
2036             if(opcode2[i]==0x25) { // OR
2037               emit_or(s1l,s2l,tl);
2038               emit_or(s1h,s2h,th);
2039             } else
2040             if(opcode2[i]==0x26) { // XOR
2041               emit_xor(s1l,s2l,tl);
2042               emit_xor(s1h,s2h,th);
2043             } else
2044             if(opcode2[i]==0x27) { // NOR
2045               emit_or(s1l,s2l,tl);
2046               emit_or(s1h,s2h,th);
2047               emit_not(tl,tl);
2048               emit_not(th,th);
2049             }
2050           }
2051           else
2052           {
2053             if(opcode2[i]==0x24) { // AND
2054               emit_zeroreg(tl);
2055               emit_zeroreg(th);
2056             } else
2057             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2058               if(rs1[i]){
2059                 if(s1l>=0) emit_mov(s1l,tl);
2060                 else emit_loadreg(rs1[i],tl);
2061                 if(s1h>=0) emit_mov(s1h,th);
2062                 else emit_loadreg(rs1[i]|64,th);
2063               }
2064               else
2065               if(rs2[i]){
2066                 if(s2l>=0) emit_mov(s2l,tl);
2067                 else emit_loadreg(rs2[i],tl);
2068                 if(s2h>=0) emit_mov(s2h,th);
2069                 else emit_loadreg(rs2[i]|64,th);
2070               }
2071               else{
2072                 emit_zeroreg(tl);
2073                 emit_zeroreg(th);
2074               }
2075             } else
2076             if(opcode2[i]==0x27) { // NOR
2077               if(rs1[i]){
2078                 if(s1l>=0) emit_not(s1l,tl);
2079                 else{
2080                   emit_loadreg(rs1[i],tl);
2081                   emit_not(tl,tl);
2082                 }
2083                 if(s1h>=0) emit_not(s1h,th);
2084                 else{
2085                   emit_loadreg(rs1[i]|64,th);
2086                   emit_not(th,th);
2087                 }
2088               }
2089               else
2090               if(rs2[i]){
2091                 if(s2l>=0) emit_not(s2l,tl);
2092                 else{
2093                   emit_loadreg(rs2[i],tl);
2094                   emit_not(tl,tl);
2095                 }
2096                 if(s2h>=0) emit_not(s2h,th);
2097                 else{
2098                   emit_loadreg(rs2[i]|64,th);
2099                   emit_not(th,th);
2100                 }
2101               }
2102               else {
2103                 emit_movimm(-1,tl);
2104                 emit_movimm(-1,th);
2105               }
2106             }
2107           }
2108         }
2109       }
2110       else
2111       {
2112         // 32 bit
2113         if(tl>=0) {
2114           s1l=get_reg(i_regs->regmap,rs1[i]);
2115           s2l=get_reg(i_regs->regmap,rs2[i]);
2116           if(rs1[i]&&rs2[i]) {
2117             assert(s1l>=0);
2118             assert(s2l>=0);
2119             if(opcode2[i]==0x24) { // AND
2120               emit_and(s1l,s2l,tl);
2121             } else
2122             if(opcode2[i]==0x25) { // OR
2123               emit_or(s1l,s2l,tl);
2124             } else
2125             if(opcode2[i]==0x26) { // XOR
2126               emit_xor(s1l,s2l,tl);
2127             } else
2128             if(opcode2[i]==0x27) { // NOR
2129               emit_or(s1l,s2l,tl);
2130               emit_not(tl,tl);
2131             }
2132           }
2133           else
2134           {
2135             if(opcode2[i]==0x24) { // AND
2136               emit_zeroreg(tl);
2137             } else
2138             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2139               if(rs1[i]){
2140                 if(s1l>=0) emit_mov(s1l,tl);
2141                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2142               }
2143               else
2144               if(rs2[i]){
2145                 if(s2l>=0) emit_mov(s2l,tl);
2146                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2147               }
2148               else emit_zeroreg(tl);
2149             } else
2150             if(opcode2[i]==0x27) { // NOR
2151               if(rs1[i]){
2152                 if(s1l>=0) emit_not(s1l,tl);
2153                 else {
2154                   emit_loadreg(rs1[i],tl);
2155                   emit_not(tl,tl);
2156                 }
2157               }
2158               else
2159               if(rs2[i]){
2160                 if(s2l>=0) emit_not(s2l,tl);
2161                 else {
2162                   emit_loadreg(rs2[i],tl);
2163                   emit_not(tl,tl);
2164                 }
2165               }
2166               else emit_movimm(-1,tl);
2167             }
2168           }
2169         }
2170       }
2171     }
2172   }
2173 }
2174
2175 void imm16_assemble(int i,struct regstat *i_regs)
2176 {
2177   if (opcode[i]==0x0f) { // LUI
2178     if(rt1[i]) {
2179       signed char t;
2180       t=get_reg(i_regs->regmap,rt1[i]);
2181       //assert(t>=0);
2182       if(t>=0) {
2183         if(!((i_regs->isconst>>t)&1))
2184           emit_movimm(imm[i]<<16,t);
2185       }
2186     }
2187   }
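  // LUI places the 16-bit immediate in the upper half of rt (e.g. lui r8,0x8001
  // yields 0x80010000), hence the imm[i]<<16 above.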
2188   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2189     if(rt1[i]) {
2190       signed char s,t;
2191       t=get_reg(i_regs->regmap,rt1[i]);
2192       s=get_reg(i_regs->regmap,rs1[i]);
2193       if(rs1[i]) {
2194         //assert(t>=0);
2195         //assert(s>=0);
2196         if(t>=0) {
2197           if(!((i_regs->isconst>>t)&1)) {
2198             if(s<0) {
2199               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2200               emit_addimm(t,imm[i],t);
2201             }else{
2202               if(!((i_regs->wasconst>>s)&1))
2203                 emit_addimm(s,imm[i],t);
2204               else
2205                 emit_movimm(constmap[i][s]+imm[i],t);
2206             }
2207           }
2208         }
2209       } else {
2210         if(t>=0) {
2211           if(!((i_regs->isconst>>t)&1))
2212             emit_movimm(imm[i],t);
2213         }
2214       }
2215     }
2216   }
2217   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2218     if(rt1[i]) {
2219       signed char sh,sl,th,tl;
2220       th=get_reg(i_regs->regmap,rt1[i]|64);
2221       tl=get_reg(i_regs->regmap,rt1[i]);
2222       sh=get_reg(i_regs->regmap,rs1[i]|64);
2223       sl=get_reg(i_regs->regmap,rs1[i]);
2224       if(tl>=0) {
2225         if(rs1[i]) {
2226           assert(sh>=0);
2227           assert(sl>=0);
2228           if(th>=0) {
2229             emit_addimm64_32(sh,sl,imm[i],th,tl);
2230           }
2231           else {
2232             emit_addimm(sl,imm[i],tl);
2233           }
2234         } else {
2235           emit_movimm(imm[i],tl);
2236           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2237         }
2238       }
2239     }
2240   }
2241   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2242     if(rt1[i]) {
2243       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2244       signed char sh,sl,t;
2245       t=get_reg(i_regs->regmap,rt1[i]);
2246       sh=get_reg(i_regs->regmap,rs1[i]|64);
2247       sl=get_reg(i_regs->regmap,rs1[i]);
2248       //assert(t>=0);
2249       if(t>=0) {
2250         if(rs1[i]>0) {
2251           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2252           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2253             if(opcode[i]==0x0a) { // SLTI
2254               if(sl<0) {
2255                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2256                 emit_slti32(t,imm[i],t);
2257               }else{
2258                 emit_slti32(sl,imm[i],t);
2259               }
2260             }
2261             else { // SLTIU
2262               if(sl<0) {
2263                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2264                 emit_sltiu32(t,imm[i],t);
2265               }else{
2266                 emit_sltiu32(sl,imm[i],t);
2267               }
2268             }
2269           }else{ // 64-bit
2270             assert(sl>=0);
2271             if(opcode[i]==0x0a) // SLTI
2272               emit_slti64_32(sh,sl,imm[i],t);
2273             else // SLTIU
2274               emit_sltiu64_32(sh,sl,imm[i],t);
2275           }
2276         }else{
2277           // SLTI(U) with r0 is just stupid,
2278           // nonetheless examples can be found
2279           if(opcode[i]==0x0a) { // SLTI
2280             if(imm[i]>0) emit_movimm(1,t);
2281             else emit_zeroreg(t);
2282           }
2283           else { // SLTIU
2284             if(imm[i]) emit_movimm(1,t);
2285             else emit_zeroreg(t);
2286           }
2287         }
2288       }
2289     }
2290   }
2291   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2292     if(rt1[i]) {
2293       signed char sh,sl,th,tl;
2294       th=get_reg(i_regs->regmap,rt1[i]|64);
2295       tl=get_reg(i_regs->regmap,rt1[i]);
2296       sh=get_reg(i_regs->regmap,rs1[i]|64);
2297       sl=get_reg(i_regs->regmap,rs1[i]);
2298       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2299         if(opcode[i]==0x0c) //ANDI
2300         {
2301           if(rs1[i]) {
2302             if(sl<0) {
2303               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2304               emit_andimm(tl,imm[i],tl);
2305             }else{
2306               if(!((i_regs->wasconst>>sl)&1))
2307                 emit_andimm(sl,imm[i],tl);
2308               else
2309                 emit_movimm(constmap[i][sl]&imm[i],tl);
2310             }
2311           }
2312           else
2313             emit_zeroreg(tl);
2314           if(th>=0) emit_zeroreg(th);
2315         }
2316         else
2317         {
2318           if(rs1[i]) {
2319             if(sl<0) {
2320               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2321             }
2322             if(th>=0) {
2323               if(sh<0) {
2324                 emit_loadreg(rs1[i]|64,th);
2325               }else{
2326                 emit_mov(sh,th);
2327               }
2328             }
2329             if(opcode[i]==0x0d) //ORI
2330             if(sl<0) {
2331               emit_orimm(tl,imm[i],tl);
2332             }else{
2333               if(!((i_regs->wasconst>>sl)&1))
2334                 emit_orimm(sl,imm[i],tl);
2335               else
2336                 emit_movimm(constmap[i][sl]|imm[i],tl);
2337             }
2338             if(opcode[i]==0x0e) //XORI
2339             if(sl<0) {
2340               emit_xorimm(tl,imm[i],tl);
2341             }else{
2342               if(!((i_regs->wasconst>>sl)&1))
2343                 emit_xorimm(sl,imm[i],tl);
2344               else
2345                 emit_movimm(constmap[i][sl]^imm[i],tl);
2346             }
2347           }
2348           else {
2349             emit_movimm(imm[i],tl);
2350             if(th>=0) emit_zeroreg(th);
2351           }
2352         }
2353       }
2354     }
2355   }
2356 }
2357
2358 void shiftimm_assemble(int i,struct regstat *i_regs)
2359 {
2360   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2361   {
2362     if(rt1[i]) {
2363       signed char s,t;
2364       t=get_reg(i_regs->regmap,rt1[i]);
2365       s=get_reg(i_regs->regmap,rs1[i]);
2366       //assert(t>=0);
2367       if(t>=0&&!((i_regs->isconst>>t)&1)){
2368         if(rs1[i]==0)
2369         {
2370           emit_zeroreg(t);
2371         }
2372         else
2373         {
2374           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2375           if(imm[i]) {
2376             if(opcode2[i]==0) // SLL
2377             {
2378               emit_shlimm(s<0?t:s,imm[i],t);
2379             }
2380             if(opcode2[i]==2) // SRL
2381             {
2382               emit_shrimm(s<0?t:s,imm[i],t);
2383             }
2384             if(opcode2[i]==3) // SRA
2385             {
2386               emit_sarimm(s<0?t:s,imm[i],t);
2387             }
2388           }else{
2389             // Shift by zero
2390             if(s>=0 && s!=t) emit_mov(s,t);
2391           }
2392         }
2393       }
2394       //emit_storereg(rt1[i],t); //DEBUG
2395     }
2396   }
2397   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2398   {
2399     if(rt1[i]) {
2400       signed char sh,sl,th,tl;
2401       th=get_reg(i_regs->regmap,rt1[i]|64);
2402       tl=get_reg(i_regs->regmap,rt1[i]);
2403       sh=get_reg(i_regs->regmap,rs1[i]|64);
2404       sl=get_reg(i_regs->regmap,rs1[i]);
2405       if(tl>=0) {
2406         if(rs1[i]==0)
2407         {
2408           emit_zeroreg(tl);
2409           if(th>=0) emit_zeroreg(th);
2410         }
2411         else
2412         {
2413           assert(sl>=0);
2414           assert(sh>=0);
2415           if(imm[i]) {
2416             if(opcode2[i]==0x38) // DSLL
2417             {
2418               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2419               emit_shlimm(sl,imm[i],tl);
2420             }
2421             if(opcode2[i]==0x3a) // DSRL
2422             {
2423               emit_shrdimm(sl,sh,imm[i],tl);
2424               if(th>=0) emit_shrimm(sh,imm[i],th);
2425             }
2426             if(opcode2[i]==0x3b) // DSRA
2427             {
2428               emit_shrdimm(sl,sh,imm[i],tl);
2429               if(th>=0) emit_sarimm(sh,imm[i],th);
2430             }
2431           }else{
2432             // Shift by zero
2433             if(sl!=tl) emit_mov(sl,tl);
2434             if(th>=0&&sh!=th) emit_mov(sh,th);
2435           }
2436         }
2437       }
2438     }
2439   }
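  // DSLL32/DSRL32/DSRA32 shift by the encoded amount plus 32, so the handlers
  // below move one 32-bit half into the other and only shift the remainder
  // (imm[i]&31) when the total exceeds 32.  This assumes imm[] already holds the
  // full 32..63 shift amount for these opcodes.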
2440   if(opcode2[i]==0x3c) // DSLL32
2441   {
2442     if(rt1[i]) {
2443       signed char sl,tl,th;
2444       tl=get_reg(i_regs->regmap,rt1[i]);
2445       th=get_reg(i_regs->regmap,rt1[i]|64);
2446       sl=get_reg(i_regs->regmap,rs1[i]);
2447       if(th>=0||tl>=0){
2448         assert(tl>=0);
2449         assert(th>=0);
2450         assert(sl>=0);
2451         emit_mov(sl,th);
2452         emit_zeroreg(tl);
2453         if(imm[i]>32)
2454         {
2455           emit_shlimm(th,imm[i]&31,th);
2456         }
2457       }
2458     }
2459   }
2460   if(opcode2[i]==0x3e) // DSRL32
2461   {
2462     if(rt1[i]) {
2463       signed char sh,tl,th;
2464       tl=get_reg(i_regs->regmap,rt1[i]);
2465       th=get_reg(i_regs->regmap,rt1[i]|64);
2466       sh=get_reg(i_regs->regmap,rs1[i]|64);
2467       if(tl>=0){
2468         assert(sh>=0);
2469         emit_mov(sh,tl);
2470         if(th>=0) emit_zeroreg(th);
2471         if(imm[i]>32)
2472         {
2473           emit_shrimm(tl,imm[i]&31,tl);
2474         }
2475       }
2476     }
2477   }
2478   if(opcode2[i]==0x3f) // DSRA32
2479   {
2480     if(rt1[i]) {
2481       signed char sh,tl;
2482       tl=get_reg(i_regs->regmap,rt1[i]);
2483       sh=get_reg(i_regs->regmap,rs1[i]|64);
2484       if(tl>=0){
2485         assert(sh>=0);
2486         emit_mov(sh,tl);
2487         if(imm[i]>32)
2488         {
2489           emit_sarimm(tl,imm[i]&31,tl);
2490         }
2491       }
2492     }
2493   }
2494 }
2495
2496 #ifndef shift_assemble
2497 void shift_assemble(int i,struct regstat *i_regs)
2498 {
2499   printf("Need shift_assemble for this architecture.\n");
2500   exit(1);
2501 }
2502 #endif
2503
2504 void load_assemble(int i,struct regstat *i_regs)
2505 {
2506   int s,th,tl,addr,map=-1;
2507   int offset;
2508   int jaddr=0;
2509   int memtarget=0,c=0;
2510   int fastload_reg_override=0;
2511   u_int hr,reglist=0;
2512   th=get_reg(i_regs->regmap,rt1[i]|64);
2513   tl=get_reg(i_regs->regmap,rt1[i]);
2514   s=get_reg(i_regs->regmap,rs1[i]);
2515   offset=imm[i];
2516   for(hr=0;hr<HOST_REGS;hr++) {
2517     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2518   }
2519   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2520   if(s>=0) {
2521     c=(i_regs->wasconst>>s)&1;
2522     if (c) {
2523       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2524     }
2525   }
2526   //printf("load_assemble: c=%d\n",c);
2527   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2528   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2529   if(tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80)
2530     ||rt1[i]==0) {
2531       // could be FIFO, must perform the read
2532       // ||dummy read
2533       assem_debug("(forced read)\n");
2534       tl=get_reg(i_regs->regmap,-1);
2535       assert(tl>=0);
2536   }
2537   if(offset||s<0||c) addr=tl;
2538   else addr=s;
2539   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2540  if(tl>=0) {
2541   //printf("load_assemble: c=%d\n",c);
2542   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2543   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2544   reglist&=~(1<<tl);
2545   if(th>=0) reglist&=~(1<<th);
2546   if(!c) {
2547     #ifdef RAM_OFFSET
2548     map=get_reg(i_regs->regmap,ROREG);
2549     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2550     #endif
2551     #ifdef R29_HACK
2552     // Strmnnrmn's speed hack
2553     if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2554     #endif
2555     {
2556       jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
2557     }
2558   }
2559   else if(ram_offset&&memtarget) {
2560     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2561     fastload_reg_override=HOST_TEMPREG;
2562   }
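  // At this point 'jaddr', if set, is the conditional branch out of the fast path
  // emitted by emit_fastpath_cmp_jump(); each load case below wires it to the
  // matching LOADx_STUB via add_stub() so that non-RAM addresses take the slow path.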
2563   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2564   if (opcode[i]==0x20) { // LB
2565     if(!c||memtarget) {
2566       if(!dummy) {
2567         #ifdef HOST_IMM_ADDR32
2568         if(c)
2569           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2570         else
2571         #endif
2572         {
2573           //emit_xorimm(addr,3,tl);
2574           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2575           int x=0,a=tl;
2576 #ifdef BIG_ENDIAN_MIPS
2577           if(!c) emit_xorimm(addr,3,tl);
2578           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2579 #else
2580           if(!c) a=addr;
2581 #endif
2582           if(fastload_reg_override) a=fastload_reg_override;
2583
2584           emit_movsbl_indexed_tlb(x,a,map,tl);
2585         }
2586       }
2587       if(jaddr)
2588         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2589     }
2590     else
2591       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2592   }
2593   if (opcode[i]==0x21) { // LH
2594     if(!c||memtarget) {
2595       if(!dummy) {
2596         #ifdef HOST_IMM_ADDR32
2597         if(c)
2598           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2599         else
2600         #endif
2601         {
2602           int x=0,a=tl;
2603 #ifdef BIG_ENDIAN_MIPS
2604           if(!c) emit_xorimm(addr,2,tl);
2605           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2606 #else
2607           if(!c) a=addr;
2608 #endif
2609           if(fastload_reg_override) a=fastload_reg_override;
2610           //#ifdef
2611           //emit_movswl_indexed_tlb(x,tl,map,tl);
2612           //else
2613           if(map>=0) {
2614             emit_movswl_indexed(x,a,tl);
2615           }else{
2616             #if 1 //def RAM_OFFSET
2617             emit_movswl_indexed(x,a,tl);
2618             #else
2619             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2620             #endif
2621           }
2622         }
2623       }
2624       if(jaddr)
2625         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2626     }
2627     else
2628       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2629   }
2630   if (opcode[i]==0x23) { // LW
2631     if(!c||memtarget) {
2632       if(!dummy) {
2633         int a=addr;
2634         if(fastload_reg_override) a=fastload_reg_override;
2635         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2636         #ifdef HOST_IMM_ADDR32
2637         if(c)
2638           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2639         else
2640         #endif
2641         emit_readword_indexed_tlb(0,a,map,tl);
2642       }
2643       if(jaddr)
2644         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2645     }
2646     else
2647       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2648   }
2649   if (opcode[i]==0x24) { // LBU
2650     if(!c||memtarget) {
2651       if(!dummy) {
2652         #ifdef HOST_IMM_ADDR32
2653         if(c)
2654           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2655         else
2656         #endif
2657         {
2658           //emit_xorimm(addr,3,tl);
2659           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2660           int x=0,a=tl;
2661 #ifdef BIG_ENDIAN_MIPS
2662           if(!c) emit_xorimm(addr,3,tl);
2663           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2664 #else
2665           if(!c) a=addr;
2666 #endif
2667           if(fastload_reg_override) a=fastload_reg_override;
2668
2669           emit_movzbl_indexed_tlb(x,a,map,tl);
2670         }
2671       }
2672       if(jaddr)
2673         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2674     }
2675     else
2676       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2677   }
2678   if (opcode[i]==0x25) { // LHU
2679     if(!c||memtarget) {
2680       if(!dummy) {
2681         #ifdef HOST_IMM_ADDR32
2682         if(c)
2683           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2684         else
2685         #endif
2686         {
2687           int x=0,a=tl;
2688 #ifdef BIG_ENDIAN_MIPS
2689           if(!c) emit_xorimm(addr,2,tl);
2690           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2691 #else
2692           if(!c) a=addr;
2693 #endif
2694           if(fastload_reg_override) a=fastload_reg_override;
2695           //#ifdef
2696           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2697           //#else
2698           if(map>=0) {
2699             emit_movzwl_indexed(x,a,tl);
2700           }else{
2701             #if 1 //def RAM_OFFSET
2702             emit_movzwl_indexed(x,a,tl);
2703             #else
2704             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
2705             #endif
2706           }
2707         }
2708       }
2709       if(jaddr)
2710         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2711     }
2712     else
2713       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2714   }
2715   if (opcode[i]==0x27) { // LWU
2716     assert(th>=0);
2717     if(!c||memtarget) {
2718       if(!dummy) {
2719         int a=addr;
2720         if(fastload_reg_override) a=fastload_reg_override;
2721         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2722         #ifdef HOST_IMM_ADDR32
2723         if(c)
2724           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2725         else
2726         #endif
2727         emit_readword_indexed_tlb(0,a,map,tl);
2728       }
2729       if(jaddr)
2730         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2731     }
2732     else {
2733       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2734     }
2735     emit_zeroreg(th);
2736   }
2737   if (opcode[i]==0x37) { // LD
2738     if(!c||memtarget) {
2739       if(!dummy) {
2740         int a=addr;
2741         if(fastload_reg_override) a=fastload_reg_override;
2742         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
2743         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
2744         #ifdef HOST_IMM_ADDR32
2745         if(c)
2746           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
2747         else
2748         #endif
2749         emit_readdword_indexed_tlb(0,a,map,th,tl);
2750       }
2751       if(jaddr)
2752         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2753     }
2754     else
2755       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2756   }
2757  }
2758   //emit_storereg(rt1[i],tl); // DEBUG
2759   //if(opcode[i]==0x23)
2760   //if(opcode[i]==0x24)
2761   //if(opcode[i]==0x23||opcode[i]==0x24)
2762   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
2763   {
2764     //emit_pusha();
2765     save_regs(0x100f);
2766         emit_readword((int)&last_count,ECX);
2767         #ifdef __i386__
2768         if(get_reg(i_regs->regmap,CCREG)<0)
2769           emit_loadreg(CCREG,HOST_CCREG);
2770         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2771         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2772         emit_writeword(HOST_CCREG,(int)&Count);
2773         #endif
2774         #ifdef __arm__
2775         if(get_reg(i_regs->regmap,CCREG)<0)
2776           emit_loadreg(CCREG,0);
2777         else
2778           emit_mov(HOST_CCREG,0);
2779         emit_add(0,ECX,0);
2780         emit_addimm(0,2*ccadj[i],0);
2781         emit_writeword(0,(int)&Count);
2782         #endif
2783     emit_call((int)memdebug);
2784     //emit_popa();
2785     restore_regs(0x100f);
2786   }/**/
2787 }
2788
2789 #ifndef loadlr_assemble
2790 void loadlr_assemble(int i,struct regstat *i_regs)
2791 {
2792   printf("Need loadlr_assemble for this architecture.\n");
2793   exit(1);
2794 }
2795 #endif
2796
2797 void store_assemble(int i,struct regstat *i_regs)
2798 {
2799   int s,th,tl,map=-1;
2800   int addr,temp;
2801   int offset;
2802   int jaddr=0,jaddr2,type;
2803   int memtarget=0,c=0;
2804   int agr=AGEN1+(i&1);
2805   int faststore_reg_override=0;
2806   u_int hr,reglist=0;
2807   th=get_reg(i_regs->regmap,rs2[i]|64);
2808   tl=get_reg(i_regs->regmap,rs2[i]);
2809   s=get_reg(i_regs->regmap,rs1[i]);
2810   temp=get_reg(i_regs->regmap,agr);
2811   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2812   offset=imm[i];
2813   if(s>=0) {
2814     c=(i_regs->wasconst>>s)&1;
2815     if(c) {
2816       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2817     }
2818   }
2819   assert(tl>=0);
2820   assert(temp>=0);
2821   for(hr=0;hr<HOST_REGS;hr++) {
2822     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2823   }
2824   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2825   if(offset||s<0||c) addr=temp;
2826   else addr=s;
2827   if(!c) {
2828     jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
2829   }
2830   else if(ram_offset&&memtarget) {
2831     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2832     faststore_reg_override=HOST_TEMPREG;
2833   }
2834
2835   if (opcode[i]==0x28) { // SB
2836     if(!c||memtarget) {
2837       int x=0,a=temp;
2838 #ifdef BIG_ENDIAN_MIPS
2839       if(!c) emit_xorimm(addr,3,temp);
2840       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2841 #else
2842       if(!c) a=addr;
2843 #endif
2844       if(faststore_reg_override) a=faststore_reg_override;
2845       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
2846       emit_writebyte_indexed_tlb(tl,x,a,map,a);
2847     }
2848     type=STOREB_STUB;
2849   }
2850   if (opcode[i]==0x29) { // SH
2851     if(!c||memtarget) {
2852       int x=0,a=temp;
2853 #ifdef BIG_ENDIAN_MIPS
2854       if(!c) emit_xorimm(addr,2,temp);
2855       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2856 #else
2857       if(!c) a=addr;
2858 #endif
2859       if(faststore_reg_override) a=faststore_reg_override;
2860       //#ifdef
2861       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
2862       //#else
2863       if(map>=0) {
2864         emit_writehword_indexed(tl,x,a);
2865       }else
2866         //emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
2867         emit_writehword_indexed(tl,x,a);
2868     }
2869     type=STOREH_STUB;
2870   }
2871   if (opcode[i]==0x2B) { // SW
2872     if(!c||memtarget) {
2873       int a=addr;
2874       if(faststore_reg_override) a=faststore_reg_override;
2875       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
2876       emit_writeword_indexed_tlb(tl,0,a,map,temp);
2877     }
2878     type=STOREW_STUB;
2879   }
2880   if (opcode[i]==0x3F) { // SD
2881     if(!c||memtarget) {
2882       int a=addr;
2883       if(faststore_reg_override) a=faststore_reg_override;
2884       if(rs2[i]) {
2885         assert(th>=0);
2886         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
2887         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
2888         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
2889       }else{
2890         // Store zero
2891         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
2892         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
2893         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
2894       }
2895     }
2896     type=STORED_STUB;
2897   }
2898   if(jaddr) {
2899     // PCSX store handlers don't check invcode again
2900     reglist|=1<<addr;
2901     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2902     jaddr=0;
2903   }
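  // Self-modifying code check: compare this page's invalid_code[] entry (the
  // address is shifted right by 12, i.e. 4KB pages) and branch or call out to the
  // invalidation path if compiled blocks still cover it.  Skipped when the
  // NDHACK_NO_SMC_CHECK hack is enabled or the base register was flagged above.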
2904   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
2905     if(!c||memtarget) {
2906       #ifdef DESTRUCTIVE_SHIFT
2907       // The x86 shift operation is 'destructive'; it overwrites the
2908       // source register, so we need to make a copy first and use that.
2909       addr=temp;
2910       #endif
2911       #if defined(HOST_IMM8)
2912       int ir=get_reg(i_regs->regmap,INVCP);
2913       assert(ir>=0);
2914       emit_cmpmem_indexedsr12_reg(ir,addr,1);
2915       #else
2916       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
2917       #endif
2918       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
2919       emit_callne(invalidate_addr_reg[addr]);
2920       #else
2921       jaddr2=(int)out;
2922       emit_jne(0);
2923       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
2924       #endif
2925     }
2926   }
2927   u_int addr_val=c?constmap[i][s]+offset:0; // only meaningful when the address is constant (s>=0)
2928   if(jaddr) {
2929     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2930   } else if(c&&!memtarget) {
2931     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
2932   }
2933   // Basic detection of stores that modify the current block.
2934   // We don't look backwards, since already-executed code should be in the MIPS i-cache anyway.
2935   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
2936     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
2937     assert(i_regs->regmap==regs[i].regmap); // not delay slot
2938     if(i_regs->regmap==regs[i].regmap) {
2939       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
2940       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
2941       emit_movimm(start+i*4+4,0);
2942       emit_writeword(0,(int)&pcaddr);
2943       emit_jmp((int)do_interrupt);
2944     }
2945   }
2946   //if(opcode[i]==0x2B || opcode[i]==0x3F)
2947   //if(opcode[i]==0x2B || opcode[i]==0x28)
2948   //if(opcode[i]==0x2B || opcode[i]==0x29)
2949   //if(opcode[i]==0x2B)
2950   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
2951   {
2952     #ifdef __i386__
2953     emit_pusha();
2954     #endif
2955     #ifdef __arm__
2956     save_regs(0x100f);
2957     #endif
2958         emit_readword((int)&last_count,ECX);
2959         #ifdef __i386__
2960         if(get_reg(i_regs->regmap,CCREG)<0)
2961           emit_loadreg(CCREG,HOST_CCREG);
2962         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2963         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2964         emit_writeword(HOST_CCREG,(int)&Count);
2965         #endif
2966         #ifdef __arm__
2967         if(get_reg(i_regs->regmap,CCREG)<0)
2968           emit_loadreg(CCREG,0);
2969         else
2970           emit_mov(HOST_CCREG,0);
2971         emit_add(0,ECX,0);
2972         emit_addimm(0,2*ccadj[i],0);
2973         emit_writeword(0,(int)&Count);
2974         #endif
2975     emit_call((int)memdebug);
2976     #ifdef __i386__
2977     emit_popa();
2978     #endif
2979     #ifdef __arm__
2980     restore_regs(0x100f);
2981     #endif
2982   }/**/
2983 }
2984
2985 void storelr_assemble(int i,struct regstat *i_regs)
2986 {
2987   int s,th,tl;
2988   int temp;
2989   int temp2;
2990   int offset;
2991   int jaddr=0,jaddr2;
2992   int case1,case2,case3;
2993   int done0,done1,done2;
2994   int memtarget=0,c=0;
2995   int agr=AGEN1+(i&1);
2996   u_int hr,reglist=0;
2997   th=get_reg(i_regs->regmap,rs2[i]|64);
2998   tl=get_reg(i_regs->regmap,rs2[i]);
2999   s=get_reg(i_regs->regmap,rs1[i]);
3000   temp=get_reg(i_regs->regmap,agr);
3001   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3002   offset=imm[i];
3003   if(s>=0) {
3004     c=(i_regs->isconst>>s)&1;
3005     if(c) {
3006       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3007     }
3008   }
3009   assert(tl>=0);
3010   for(hr=0;hr<HOST_REGS;hr++) {
3011     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3012   }
3013   assert(temp>=0);
3014   if(!c) {
3015     emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3016     if(!offset&&s!=temp) emit_mov(s,temp);
3017     jaddr=(int)out;
3018     emit_jno(0);
3019   }
3020   else
3021   {
3022     if(!memtarget||!rs1[i]) {
3023       jaddr=(int)out;
3024       emit_jmp(0);
3025     }
3026   }
3027   #ifdef RAM_OFFSET
3028   int map=get_reg(i_regs->regmap,ROREG);
3029   if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3030   #else
3031   if((u_int)rdram!=0x80000000)
3032     emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3033   #endif
3034
3035   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3036     temp2=get_reg(i_regs->regmap,FTEMP);
3037     if(!rs2[i]) temp2=th=tl;
3038   }
3039
3040 #ifndef BIG_ENDIAN_MIPS
3041     emit_xorimm(temp,3,temp);
3042 #endif
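  // The low two bits of the (byte-swapped on little-endian hosts) address select
  // one of four alignment cases below; each case stores only the bytes that
  // SWL/SWR (or SDL/SDR for the doubleword forms) contribute to the aligned word,
  // using rotates to move them into place.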
3043   emit_testimm(temp,2);
3044   case2=(int)out;
3045   emit_jne(0);
3046   emit_testimm(temp,1);
3047   case1=(int)out;
3048   emit_jne(0);
3049   // 0
3050   if (opcode[i]==0x2A) { // SWL
3051     emit_writeword_indexed(tl,0,temp);
3052   }
3053   if (opcode[i]==0x2E) { // SWR
3054     emit_writebyte_indexed(tl,3,temp);
3055   }
3056   if (opcode[i]==0x2C) { // SDL
3057     emit_writeword_indexed(th,0,temp);
3058     if(rs2[i]) emit_mov(tl,temp2);
3059   }
3060   if (opcode[i]==0x2D) { // SDR
3061     emit_writebyte_indexed(tl,3,temp);
3062     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3063   }
3064   done0=(int)out;
3065   emit_jmp(0);
3066   // 1
3067   set_jump_target(case1,(int)out);
3068   if (opcode[i]==0x2A) { // SWL
3069     // Write 3 msb into three least significant bytes
3070     if(rs2[i]) emit_rorimm(tl,8,tl);
3071     emit_writehword_indexed(tl,-1,temp);
3072     if(rs2[i]) emit_rorimm(tl,16,tl);
3073     emit_writebyte_indexed(tl,1,temp);
3074     if(rs2[i]) emit_rorimm(tl,8,tl);
3075   }
3076   if (opcode[i]==0x2E) { // SWR
3077     // Write two lsb into two most significant bytes
3078     emit_writehword_indexed(tl,1,temp);
3079   }
3080   if (opcode[i]==0x2C) { // SDL
3081     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3082     // Write 3 msb into three least significant bytes
3083     if(rs2[i]) emit_rorimm(th,8,th);
3084     emit_writehword_indexed(th,-1,temp);
3085     if(rs2[i]) emit_rorimm(th,16,th);
3086     emit_writebyte_indexed(th,1,temp);
3087     if(rs2[i]) emit_rorimm(th,8,th);
3088   }
3089   if (opcode[i]==0x2D) { // SDR
3090     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3091     // Write two lsb into two most significant bytes
3092     emit_writehword_indexed(tl,1,temp);
3093   }
3094   done1=(int)out;
3095   emit_jmp(0);
3096   // 2
3097   set_jump_target(case2,(int)out);
3098   emit_testimm(temp,1);
3099   case3=(int)out;
3100   emit_jne(0);
3101   if (opcode[i]==0x2A) { // SWL
3102     // Write two msb into two least significant bytes
3103     if(rs2[i]) emit_rorimm(tl,16,tl);
3104     emit_writehword_indexed(tl,-2,temp);
3105     if(rs2[i]) emit_rorimm(tl,16,tl);
3106   }
3107   if (opcode[i]==0x2E) { // SWR
3108     // Write 3 lsb into three most significant bytes
3109     emit_writebyte_indexed(tl,-1,temp);
3110     if(rs2[i]) emit_rorimm(tl,8,tl);
3111     emit_writehword_indexed(tl,0,temp);
3112     if(rs2[i]) emit_rorimm(tl,24,tl);
3113   }
3114   if (opcode[i]==0x2C) { // SDL
3115     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3116     // Write two msb into two least significant bytes
3117     if(rs2[i]) emit_rorimm(th,16,th);
3118     emit_writehword_indexed(th,-2,temp);
3119     if(rs2[i]) emit_rorimm(th,16,th);
3120   }
3121   if (opcode[i]==0x2D) { // SDR
3122     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3123     // Write 3 lsb into three most significant bytes
3124     emit_writebyte_indexed(tl,-1,temp);
3125     if(rs2[i]) emit_rorimm(tl,8,tl);
3126     emit_writehword_indexed(tl,0,temp);
3127     if(rs2[i]) emit_rorimm(tl,24,tl);
3128   }
3129   done2=(int)out;
3130   emit_jmp(0);
3131   // 3
3132   set_jump_target(case3,(int)out);
3133   if (opcode[i]==0x2A) { // SWL
3134     // Write msb into least significant byte
3135     if(rs2[i]) emit_rorimm(tl,24,tl);
3136     emit_writebyte_indexed(tl,-3,temp);
3137     if(rs2[i]) emit_rorimm(tl,8,tl);
3138   }
3139   if (opcode[i]==0x2E) { // SWR
3140     // Write entire word
3141     emit_writeword_indexed(tl,-3,temp);
3142   }
3143   if (opcode[i]==0x2C) { // SDL
3144     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3145     // Write msb into least significant byte
3146     if(rs2[i]) emit_rorimm(th,24,th);
3147     emit_writebyte_indexed(th,-3,temp);
3148     if(rs2[i]) emit_rorimm(th,8,th);
3149   }
3150   if (opcode[i]==0x2D) { // SDR
3151     if(rs2[i]) emit_mov(th,temp2);
3152     // Write entire word
3153     emit_writeword_indexed(tl,-3,temp);
3154   }
3155   set_jump_target(done0,(int)out);
3156   set_jump_target(done1,(int)out);
3157   set_jump_target(done2,(int)out);
3158   if (opcode[i]==0x2C) { // SDL
3159     emit_testimm(temp,4);
3160     done0=(int)out;
3161     emit_jne(0);
3162     emit_andimm(temp,~3,temp);
3163     emit_writeword_indexed(temp2,4,temp);
3164     set_jump_target(done0,(int)out);
3165   }
3166   if (opcode[i]==0x2D) { // SDR
3167     emit_testimm(temp,4);
3168     done0=(int)out;
3169     emit_jeq(0);
3170     emit_andimm(temp,~3,temp);
3171     emit_writeword_indexed(temp2,-4,temp);
3172     set_jump_target(done0,(int)out);
3173   }
3174   if(!c||!memtarget)
3175     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3176   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3177     #ifdef RAM_OFFSET
3178     int map=get_reg(i_regs->regmap,ROREG);
3179     if(map<0) map=HOST_TEMPREG;
3180     gen_orig_addr_w(temp,map);
3181     #else
3182     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3183     #endif
3184     #if defined(HOST_IMM8)
3185     int ir=get_reg(i_regs->regmap,INVCP);
3186     assert(ir>=0);
3187     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3188     #else
3189     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3190     #endif
3191     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3192     emit_callne(invalidate_addr_reg[temp]);
3193     #else
3194     jaddr2=(int)out;
3195     emit_jne(0);
3196     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3197     #endif
3198   }
3199   /*
3200     emit_pusha();
3201     //save_regs(0x100f);
3202         emit_readword((int)&last_count,ECX);
3203         if(get_reg(i_regs->regmap,CCREG)<0)
3204           emit_loadreg(CCREG,HOST_CCREG);
3205         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3206         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3207         emit_writeword(HOST_CCREG,(int)&Count);
3208     emit_call((int)memdebug);
3209     emit_popa();
3210     //restore_regs(0x100f);
3211   /**/
3212 }
3213
3214 void c1ls_assemble(int i,struct regstat *i_regs)
3215 {
3216   cop1_unusable(i, i_regs);
3217 }
3218
3219 void c2ls_assemble(int i,struct regstat *i_regs)
3220 {
3221   int s,tl;
3222   int ar;
3223   int offset;
3224   int memtarget=0,c=0;
3225   int jaddr2=0,jaddr3,type;
3226   int agr=AGEN1+(i&1);
3227   int fastio_reg_override=0;
3228   u_int hr,reglist=0;
3229   u_int copr=(source[i]>>16)&0x1f;
3230   s=get_reg(i_regs->regmap,rs1[i]);
3231   tl=get_reg(i_regs->regmap,FTEMP);
3232   offset=imm[i];
3233   assert(rs1[i]>0);
3234   assert(tl>=0);
3235
3236   for(hr=0;hr<HOST_REGS;hr++) {
3237     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3238   }
3239   if(i_regs->regmap[HOST_CCREG]==CCREG)
3240     reglist&=~(1<<HOST_CCREG);
3241
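  // LWC2/SWC2 move one word between memory and GTE (COP2) data register 'copr',
  // taken from the rt field above; cop2_get_dreg/cop2_put_dreg below handle the
  // transfer between the GTE register and the FTEMP host register.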
3242   // get the address
3243   if (opcode[i]==0x3a) { // SWC2
3244     ar=get_reg(i_regs->regmap,agr);
3245     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3246     reglist|=1<<ar;
3247   } else { // LWC2
3248     ar=tl;
3249   }
3250   if(s>=0) c=(i_regs->wasconst>>s)&1;
3251   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3252   if (!offset&&!c&&s>=0) ar=s;
3253   assert(ar>=0);
3254
3255   if (opcode[i]==0x3a) { // SWC2
3256     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3257     type=STOREW_STUB;
3258   }
3259   else
3260     type=LOADW_STUB;
3261
3262   if(c&&!memtarget) {
3263     jaddr2=(int)out;
3264     emit_jmp(0); // inline_readstub/inline_writestub?
3265   }
3266   else {
3267     if(!c) {
3268       jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
3269     }
3270     else if(ram_offset&&memtarget) {
3271       emit_addimm(ar,ram_offset,HOST_TEMPREG);
3272       fastio_reg_override=HOST_TEMPREG;
3273     }
3274     if (opcode[i]==0x32) { // LWC2
3275       int a=ar;
3276       if(fastio_reg_override) a=fastio_reg_override;
3277       #ifdef HOST_IMM_ADDR32
3278       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3279       else
3280       #endif
3281       emit_readword_indexed(0,a,tl);
3282     }
3283     if (opcode[i]==0x3a) { // SWC2
3284       #ifdef DESTRUCTIVE_SHIFT
3285       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3286       #endif
3287       int a=ar;
3288       if(fastio_reg_override) a=fastio_reg_override;
3289       emit_writeword_indexed(tl,0,a);
3290     }
3291   }
3292   if(jaddr2)
3293     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3294   if(opcode[i]==0x3a) // SWC2
3295   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3296 #if defined(HOST_IMM8)
3297     int ir=get_reg(i_regs->regmap,INVCP);
3298     assert(ir>=0);
3299     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3300 #else
3301     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3302 #endif
3303     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3304     emit_callne(invalidate_addr_reg[ar]);
3305     #else
3306     jaddr3=(int)out;
3307     emit_jne(0);
3308     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3309     #endif
3310   }
3311   if (opcode[i]==0x32) { // LWC2
3312     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3313   }
3314 }
3315
3316 #ifndef multdiv_assemble
3317 void multdiv_assemble(int i,struct regstat *i_regs)
3318 {
3319   printf("Need multdiv_assemble for this architecture.\n");
3320   exit(1);
3321 }
3322 #endif
3323
3324 void mov_assemble(int i,struct regstat *i_regs)
3325 {
3326   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3327   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3328   if(rt1[i]) {
3329     signed char sh,sl,th,tl;
3330     th=get_reg(i_regs->regmap,rt1[i]|64);
3331     tl=get_reg(i_regs->regmap,rt1[i]);
3332     //assert(tl>=0);
3333     if(tl>=0) {
3334       sh=get_reg(i_regs->regmap,rs1[i]|64);
3335       sl=get_reg(i_regs->regmap,rs1[i]);
3336       if(sl>=0) emit_mov(sl,tl);
3337       else emit_loadreg(rs1[i],tl);
3338       if(th>=0) {
3339         if(sh>=0) emit_mov(sh,th);
3340         else emit_loadreg(rs1[i]|64,th);
3341       }
3342     }
3343   }
3344 }
3345
3346 #ifndef fconv_assemble
3347 void fconv_assemble(int i,struct regstat *i_regs)
3348 {
3349   printf("Need fconv_assemble for this architecture.\n");
3350   exit(1);
3351 }
3352 #endif
3353
3354 #if 0
3355 void float_assemble(int i,struct regstat *i_regs)
3356 {
3357   printf("Need float_assemble for this architecture.\n");
3358   exit(1);
3359 }
3360 #endif
3361
3362 void syscall_assemble(int i,struct regstat *i_regs)
3363 {
3364   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3365   assert(ccreg==HOST_CCREG);
3366   assert(!is_delayslot);
3367   emit_movimm(start+i*4,EAX); // Get PC
3368   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3369   emit_jmp((int)jump_syscall_hle); // XXX
3370 }
3371
3372 void hlecall_assemble(int i,struct regstat *i_regs)
3373 {
3374   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3375   assert(ccreg==HOST_CCREG);
3376   assert(!is_delayslot);
3377   emit_movimm(start+i*4+4,0); // Get PC
3378   emit_movimm((int)psxHLEt[source[i]&7],1);
3379   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
3380   emit_jmp((int)jump_hlecall);
3381 }
3382
3383 void intcall_assemble(int i,struct regstat *i_regs)
3384 {
3385   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3386   assert(ccreg==HOST_CCREG);
3387   assert(!is_delayslot);
3388   emit_movimm(start+i*4,0); // Get PC
3389   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
3390   emit_jmp((int)jump_intcall);
3391 }
3392
3393 void ds_assemble(int i,struct regstat *i_regs)
3394 {
3395   speculate_register_values(i);
3396   is_delayslot=1;
3397   switch(itype[i]) {
3398     case ALU:
3399       alu_assemble(i,i_regs);break;
3400     case IMM16:
3401       imm16_assemble(i,i_regs);break;
3402     case SHIFT:
3403       shift_assemble(i,i_regs);break;
3404     case SHIFTIMM:
3405       shiftimm_assemble(i,i_regs);break;
3406     case LOAD:
3407       load_assemble(i,i_regs);break;
3408     case LOADLR:
3409       loadlr_assemble(i,i_regs);break;
3410     case STORE:
3411       store_assemble(i,i_regs);break;
3412     case STORELR:
3413       storelr_assemble(i,i_regs);break;
3414     case COP0:
3415       cop0_assemble(i,i_regs);break;
3416     case COP1:
3417       cop1_assemble(i,i_regs);break;
3418     case C1LS:
3419       c1ls_assemble(i,i_regs);break;
3420     case COP2:
3421       cop2_assemble(i,i_regs);break;
3422     case C2LS:
3423       c2ls_assemble(i,i_regs);break;
3424     case C2OP:
3425       c2op_assemble(i,i_regs);break;
3426     case FCONV:
3427       fconv_assemble(i,i_regs);break;
3428     case FLOAT:
3429       float_assemble(i,i_regs);break;
3430     case FCOMP:
3431       fcomp_assemble(i,i_regs);break;
3432     case MULTDIV:
3433       multdiv_assemble(i,i_regs);break;
3434     case MOV:
3435       mov_assemble(i,i_regs);break;
3436     case SYSCALL:
3437     case HLECALL:
3438     case INTCALL:
3439     case SPAN:
3440     case UJUMP:
3441     case RJUMP:
3442     case CJUMP:
3443     case SJUMP:
3444     case FJUMP:
3445       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
3446   }
3447   is_delayslot=0;
3448 }
3449
3450 // Is the branch target a valid internal jump?
3451 int internal_branch(uint64_t i_is32,int addr)
3452 {
3453   if(addr&1) return 0; // Indirect (register) jump
3454   if(addr>=start && addr<start+slen*4-4)
3455   {
3456     //int t=(addr-start)>>2;
3457     // Delay slots are not valid branch targets
3458     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3459     // 64 -> 32 bit transition requires a recompile
3460     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3461     {
3462       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3463       else printf("optimizable: yes\n");
3464     }*/
3465     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3466     return 1;
3467   }
3468   return 0;
3469 }
3470
3471 #ifndef wb_invalidate
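// Write back dirty registers that drop out of the register map between 'pre' and 'entry',
// and move values whose host register changes (those need no writeback)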
3472 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3473   uint64_t u,uint64_t uu)
3474 {
3475   int hr;
3476   for(hr=0;hr<HOST_REGS;hr++) {
3477     if(hr!=EXCLUDE_REG) {
3478       if(pre[hr]!=entry[hr]) {
3479         if(pre[hr]>=0) {
3480           if((dirty>>hr)&1) {
3481             if(get_reg(entry,pre[hr])<0) {
3482               if(pre[hr]<64) {
3483                 if(!((u>>pre[hr])&1)) {
3484                   emit_storereg(pre[hr],hr);
3485                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3486                     emit_sarimm(hr,31,hr);
3487                     emit_storereg(pre[hr]|64,hr);
3488                   }
3489                 }
3490               }else{
3491                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3492                   emit_storereg(pre[hr],hr);
3493                 }
3494               }
3495             }
3496           }
3497         }
3498       }
3499     }
3500   }
3501   // Move from one register to another (no writeback)
3502   for(hr=0;hr<HOST_REGS;hr++) {
3503     if(hr!=EXCLUDE_REG) {
3504       if(pre[hr]!=entry[hr]) {
3505         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3506           int nr;
3507           if((nr=get_reg(entry,pre[hr]))>=0) {
3508             emit_mov(hr,nr);
3509           }
3510         }
3511       }
3512     }
3513   }
3514 }
3515 #endif
3516
3517 // Load the specified registers
3518 // This only loads the registers given as arguments because
3519 // we don't want to load things that will be overwritten
3520 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3521 {
3522   int hr;
3523   // Load 32-bit regs
3524   for(hr=0;hr<HOST_REGS;hr++) {
3525     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3526       if(entry[hr]!=regmap[hr]) {
3527         if(regmap[hr]==rs1||regmap[hr]==rs2)
3528         {
3529           if(regmap[hr]==0) {
3530             emit_zeroreg(hr);
3531           }
3532           else
3533           {
3534             emit_loadreg(regmap[hr],hr);
3535           }
3536         }
3537       }
3538     }
3539   }
3540   // Load 64-bit regs
3541   for(hr=0;hr<HOST_REGS;hr++) {
3542     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3543       if(entry[hr]!=regmap[hr]) {
3544         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3545         {
3546           assert(regmap[hr]!=64);
3547           if((is32>>(regmap[hr]&63))&1) {
3548             int lr=get_reg(regmap,regmap[hr]-64);
3549             if(lr>=0)
3550               emit_sarimm(lr,31,hr);
3551             else
3552               emit_loadreg(regmap[hr],hr);
3553           }
3554           else
3555           {
3556             emit_loadreg(regmap[hr],hr);
3557           }
3558         }
3559       }
3560     }
3561   }
3562 }
3563
3564 // Load registers prior to the start of a loop
3565 // so that they are not loaded within the loop
3566 static void loop_preload(signed char pre[],signed char entry[])
3567 {
3568   int hr;
3569   for(hr=0;hr<HOST_REGS;hr++) {
3570     if(hr!=EXCLUDE_REG) {
3571       if(pre[hr]!=entry[hr]) {
3572         if(entry[hr]>=0) {
3573           if(get_reg(pre,entry[hr])<0) {
3574             assem_debug("loop preload:\n");
3575             //printf("loop preload: %d\n",hr);
3576             if(entry[hr]==0) {
3577               emit_zeroreg(hr);
3578             }
3579             else if(entry[hr]<TEMPREG)
3580             {
3581               emit_loadreg(entry[hr],hr);
3582             }
3583             else if(entry[hr]-64<TEMPREG)
3584             {
3585               emit_loadreg(entry[hr],hr);
3586             }
3587           }
3588         }
3589       }
3590     }
3591   }
3592 }
3593
3594 // Generate address for load/store instruction
3595 // The generated address goes to AGEN for writes, and to FTEMP for LOADLR and cop1/2 loads
3596 void address_generation(int i,struct regstat *i_regs,signed char entry[])
3597 {
3598   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
3599     int ra=-1;
3600     int agr=AGEN1+(i&1);
3601     if(itype[i]==LOAD) {
3602       ra=get_reg(i_regs->regmap,rt1[i]);
3603       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3604       assert(ra>=0);
3605     }
3606     if(itype[i]==LOADLR) {
3607       ra=get_reg(i_regs->regmap,FTEMP);
3608     }
3609     if(itype[i]==STORE||itype[i]==STORELR) {
3610       ra=get_reg(i_regs->regmap,agr);
3611       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3612     }
3613     if(itype[i]==C1LS||itype[i]==C2LS) {
3614       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
3615         ra=get_reg(i_regs->regmap,FTEMP);
3616       else { // SWC1/SDC1/SWC2/SDC2
3617         ra=get_reg(i_regs->regmap,agr);
3618         if(ra<0) ra=get_reg(i_regs->regmap,-1);
3619       }
3620     }
3621     int rs=get_reg(i_regs->regmap,rs1[i]);
3622     if(ra>=0) {
3623       int offset=imm[i];
3624       int c=(i_regs->wasconst>>rs)&1;
3625       if(rs1[i]==0) {
3626         // Using r0 as a base address
3627         if(!entry||entry[ra]!=agr) {
3628           if (opcode[i]==0x22||opcode[i]==0x26) {
3629             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3630           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3631             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3632           }else{
3633             emit_movimm(offset,ra);
3634           }
3635         } // else did it in the previous cycle
3636       }
3637       else if(rs<0) {
3638         if(!entry||entry[ra]!=rs1[i])
3639           emit_loadreg(rs1[i],ra);
3640         //if(!entry||entry[ra]!=rs1[i])
3641         //  printf("poor load scheduling!\n");
3642       }
3643       else if(c) {
3644         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
3645           if(!entry||entry[ra]!=agr) {
3646             if (opcode[i]==0x22||opcode[i]==0x26) {
3647               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3648             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3649               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3650             }else{
3651               #ifdef HOST_IMM_ADDR32
3652               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3653               #endif
3654               emit_movimm(constmap[i][rs]+offset,ra);
3655               regs[i].loadedconst|=1<<ra;
3656             }
3657           } // else did it in the previous cycle
3658         } // else load_consts already did it
3659       }
3660       if(offset&&!c&&rs1[i]) {
3661         if(rs>=0) {
3662           emit_addimm(rs,offset,ra);
3663         }else{
3664           emit_addimm(ra,offset,ra);
3665         }
3666       }
3667     }
3668   }
3669   // Preload constants for next instruction
3670   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
3671     int agr,ra;
3672     // Actual address
3673     agr=AGEN1+((i+1)&1);
3674     ra=get_reg(i_regs->regmap,agr);
3675     if(ra>=0) {
3676       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3677       int offset=imm[i+1];
3678       int c=(regs[i+1].wasconst>>rs)&1;
3679       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
3680         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3681           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3682         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3683           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3684         }else{
3685           #ifdef HOST_IMM_ADDR32
3686           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32)) // LWC1/LDC1/LWC2/LDC2
3687           #endif
3688           emit_movimm(constmap[i+1][rs]+offset,ra);
3689           regs[i+1].loadedconst|=1<<ra;
3690         }
3691       }
3692       else if(rs1[i+1]==0) {
3693         // Using r0 as a base address
3694         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3695           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3696         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3697           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3698         }else{
3699           emit_movimm(offset,ra);
3700         }
3701       }
3702     }
3703   }
3704 }
3705
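// Determine the constant value host register hr should finally hold, scanning forward
// while it keeps the same constant; returns 0 if the value turns out to be unneeded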
3706 int get_final_value(int hr, int i, int *value)
3707 {
3708   int reg=regs[i].regmap[hr];
3709   while(i<slen-1) {
3710     if(regs[i+1].regmap[hr]!=reg) break;
3711     if(!((regs[i+1].isconst>>hr)&1)) break;
3712     if(bt[i+1]) break;
3713     i++;
3714   }
3715   if(i<slen-1) {
3716     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
3717       *value=constmap[i][hr];
3718       return 1;
3719     }
3720     if(!bt[i+1]) {
3721       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
3722         // Load in delay slot, out-of-order execution
3723         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
3724         {
3725           // Precompute load address
3726           *value=constmap[i][hr]+imm[i+2];
3727           return 1;
3728         }
3729       }
3730       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
3731       {
3732         // Precompute load address
3733         *value=constmap[i][hr]+imm[i+1];
3734         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
3735         return 1;
3736       }
3737     }
3738   }
3739   *value=constmap[i][hr];
3740   //printf("c=%x\n",(int)constmap[i][hr]);
3741   if(i==slen-1) return 1;
3742   if(reg<64) {
3743     return !((unneeded_reg[i+1]>>reg)&1);
3744   }else{
3745     return !((unneeded_reg_upper[i+1]>>reg)&1);
3746   }
3747 }
3748
3749 // Load registers with known constants
3750 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
3751 {
3752   int hr,hr2;
3753   // propagate loaded constant flags
3754   if(i==0||bt[i])
3755     regs[i].loadedconst=0;
3756   else {
3757     for(hr=0;hr<HOST_REGS;hr++) {
3758       if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
3759          &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
3760       {
3761         regs[i].loadedconst|=1<<hr;
3762       }
3763     }
3764   }
3765   // Load 32-bit regs
3766   for(hr=0;hr<HOST_REGS;hr++) {
3767     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3768       //if(entry[hr]!=regmap[hr]) {
3769       if(!((regs[i].loadedconst>>hr)&1)) {
3770         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3771           int value,similar=0;
3772           if(get_final_value(hr,i,&value)) {
3773             // see if some other register holds a similar value
3774             for(hr2=0;hr2<HOST_REGS;hr2++) {
3775               if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
3776                 if(is_similar_value(value,constmap[i][hr2])) {
3777                   similar=1;
3778                   break;
3779                 }
3780               }
3781             }
3782             if(similar) {
3783               int value2;
3784               if(get_final_value(hr2,i,&value2)) // is this needed?
3785                 emit_movimm_from(value2,hr2,value,hr);
3786               else
3787                 emit_movimm(value,hr);
3788             }
3789             else if(value==0) {
3790               emit_zeroreg(hr);
3791             }
3792             else {
3793               emit_movimm(value,hr);
3794             }
3795           }
3796           regs[i].loadedconst|=1<<hr;
3797         }
3798       }
3799     }
3800   }
3801   // Load 64-bit regs
3802   for(hr=0;hr<HOST_REGS;hr++) {
3803     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3804       //if(entry[hr]!=regmap[hr]) {
3805       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3806         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3807           if((is32>>(regmap[hr]&63))&1) {
3808             int lr=get_reg(regmap,regmap[hr]-64);
3809             assert(lr>=0);
3810             emit_sarimm(lr,31,hr);
3811           }
3812           else
3813           {
3814             int value;
3815             if(get_final_value(hr,i,&value)) {
3816               if(value==0) {
3817                 emit_zeroreg(hr);
3818               }
3819               else {
3820                 emit_movimm(value,hr);
3821               }
3822             }
3823           }
3824         }
3825       }
3826     }
3827   }
3828 }
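// Load the known constant values into all dirty constant-holding host registers (32-bit and 64-bit halves)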
3829 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
3830 {
3831   int hr;
3832   // Load 32-bit regs
3833   for(hr=0;hr<HOST_REGS;hr++) {
3834     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3835       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3836         int value=constmap[i][hr];
3837         if(value==0) {
3838           emit_zeroreg(hr);
3839         }
3840         else {
3841           emit_movimm(value,hr);
3842         }
3843       }
3844     }
3845   }
3846   // Load 64-bit regs
3847   for(hr=0;hr<HOST_REGS;hr++) {
3848     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3849       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3850         if((is32>>(regmap[hr]&63))&1) {
3851           int lr=get_reg(regmap,regmap[hr]-64);
3852           assert(lr>=0);
3853           emit_sarimm(lr,31,hr);
3854         }
3855         else
3856         {
3857           int value=constmap[i][hr];
3858           if(value==0) {
3859             emit_zeroreg(hr);
3860           }
3861           else {
3862             emit_movimm(value,hr);
3863           }
3864         }
3865       }
3866     }
3867   }
3868 }
3869
3870 // Write out all dirty registers (except cycle count)
3871 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
3872 {
3873   int hr;
3874   for(hr=0;hr<HOST_REGS;hr++) {
3875     if(hr!=EXCLUDE_REG) {
3876       if(i_regmap[hr]>0) {
3877         if(i_regmap[hr]!=CCREG) {
3878           if((i_dirty>>hr)&1) {
3879             if(i_regmap[hr]<64) {
3880               emit_storereg(i_regmap[hr],hr);
3881             }else{
3882               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
3883                 emit_storereg(i_regmap[hr],hr);
3884               }
3885             }
3886           }
3887         }
3888       }
3889     }
3890   }
3891 }
3892 // Write out dirty registers that we need to reload (pair with load_needed_regs)
3893 // This writes the registers not written by store_regs_bt
3894 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
3895 {
3896   int hr;
3897   int t=(addr-start)>>2;
3898   for(hr=0;hr<HOST_REGS;hr++) {
3899     if(hr!=EXCLUDE_REG) {
3900       if(i_regmap[hr]>0) {
3901         if(i_regmap[hr]!=CCREG) {
3902           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
3903             if((i_dirty>>hr)&1) {
3904               if(i_regmap[hr]<64) {
3905                 emit_storereg(i_regmap[hr],hr);
3906               }else{
3907                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
3908                   emit_storereg(i_regmap[hr],hr);
3909                 }
3910               }
3911             }
3912           }
3913         }
3914       }
3915     }
3916   }
3917 }
3918
3919 // Load all registers (except cycle count)
3920 void load_all_regs(signed char i_regmap[])
3921 {
3922   int hr;
3923   for(hr=0;hr<HOST_REGS;hr++) {
3924     if(hr!=EXCLUDE_REG) {
3925       if(i_regmap[hr]==0) {
3926         emit_zeroreg(hr);
3927       }
3928       else
3929       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
3930       {
3931         emit_loadreg(i_regmap[hr],hr);
3932       }
3933     }
3934   }
3935 }
3936
3937 // Load all registers from the current map that are also needed by the next instruction
3938 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
3939 {
3940   int hr;
3941   for(hr=0;hr<HOST_REGS;hr++) {
3942     if(hr!=EXCLUDE_REG) {
3943       if(get_reg(next_regmap,i_regmap[hr])>=0) {
3944         if(i_regmap[hr]==0) {
3945           emit_zeroreg(hr);
3946         }
3947         else
3948         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
3949         {
3950           emit_loadreg(i_regmap[hr],hr);
3951         }
3952       }
3953     }
3954   }
3955 }
3956
3957 // Load all regs, storing cycle count if necessary
3958 void load_regs_entry(int t)
3959 {
3960   int hr;
3961   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
3962   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
3963   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
3964     emit_storereg(CCREG,HOST_CCREG);
3965   }
3966   // Load 32-bit regs
3967   for(hr=0;hr<HOST_REGS;hr++) {
3968     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
3969       if(regs[t].regmap_entry[hr]==0) {
3970         emit_zeroreg(hr);
3971       }
3972       else if(regs[t].regmap_entry[hr]!=CCREG)
3973       {
3974         emit_loadreg(regs[t].regmap_entry[hr],hr);
3975       }
3976     }
3977   }
3978   // Load 64-bit regs
3979   for(hr=0;hr<HOST_REGS;hr++) {
3980     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
3981       assert(regs[t].regmap_entry[hr]!=64);
3982       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
3983         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
3984         if(lr<0) {
3985           emit_loadreg(regs[t].regmap_entry[hr],hr);
3986         }
3987         else
3988         {
3989           emit_sarimm(lr,31,hr);
3990         }
3991       }
3992       else
3993       {
3994         emit_loadreg(regs[t].regmap_entry[hr],hr);
3995       }
3996     }
3997   }
3998 }
3999
4000 // Store dirty registers prior to branch
4001 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4002 {
4003   if(internal_branch(i_is32,addr))
4004   {
4005     int t=(addr-start)>>2;
4006     int hr;
4007     for(hr=0;hr<HOST_REGS;hr++) {
4008       if(hr!=EXCLUDE_REG) {
4009         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4010           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4011             if((i_dirty>>hr)&1) {
4012               if(i_regmap[hr]<64) {
4013                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4014                   emit_storereg(i_regmap[hr],hr);
4015                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4016                     #ifdef DESTRUCTIVE_WRITEBACK
4017                     emit_sarimm(hr,31,hr);
4018                     emit_storereg(i_regmap[hr]|64,hr);
4019                     #else
4020                     emit_sarimm(hr,31,HOST_TEMPREG);
4021                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4022                     #endif
4023                   }
4024                 }
4025               }else{
4026                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4027                   emit_storereg(i_regmap[hr],hr);
4028                 }
4029               }
4030             }
4031           }
4032         }
4033       }
4034     }
4035   }
4036   else
4037   {
4038     // Branch out of this block, write out all dirty regs
4039     wb_dirtys(i_regmap,i_is32,i_dirty);
4040   }
4041 }
4042
4043 // Load all needed registers for branch target
4044 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4045 {
4046   //if(addr>=start && addr<(start+slen*4))
4047   if(internal_branch(i_is32,addr))
4048   {
4049     int t=(addr-start)>>2;
4050     int hr;
4051     // Store the cycle count before loading something else
4052     if(i_regmap[HOST_CCREG]!=CCREG) {
4053       assert(i_regmap[HOST_CCREG]==-1);
4054     }
4055     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4056       emit_storereg(CCREG,HOST_CCREG);
4057     }
4058     // Load 32-bit regs
4059     for(hr=0;hr<HOST_REGS;hr++) {
4060       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4061         #ifdef DESTRUCTIVE_WRITEBACK
4062         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4063         #else
4064         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4065         #endif
4066           if(regs[t].regmap_entry[hr]==0) {
4067             emit_zeroreg(hr);
4068           }
4069           else if(regs[t].regmap_entry[hr]!=CCREG)
4070           {
4071             emit_loadreg(regs[t].regmap_entry[hr],hr);
4072           }
4073         }
4074       }
4075     }
4076     // Load 64-bit regs
4077     for(hr=0;hr<HOST_REGS;hr++) {
4078       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4079         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4080           assert(regs[t].regmap_entry[hr]!=64);
4081           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4082             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4083             if(lr<0) {
4084               emit_loadreg(regs[t].regmap_entry[hr],hr);
4085             }
4086             else
4087             {
4088               emit_sarimm(lr,31,hr);
4089             }
4090           }
4091           else
4092           {
4093             emit_loadreg(regs[t].regmap_entry[hr],hr);
4094           }
4095         }
4096         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4097           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4098           assert(lr>=0);
4099           emit_sarimm(lr,31,hr);
4100         }
4101       }
4102     }
4103   }
4104 }
4105
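// Check whether the current register state matches the entry state at the branch target,
// so the branch can be linked without extra writeback or reloads (returns 1 on match)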
4106 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4107 {
4108   if(addr>=start && addr<start+slen*4-4)
4109   {
4110     int t=(addr-start)>>2;
4111     int hr;
4112     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4113     for(hr=0;hr<HOST_REGS;hr++)
4114     {
4115       if(hr!=EXCLUDE_REG)
4116       {
4117         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4118         {
4119           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4120           {
4121             return 0;
4122           }
4123           else
4124           if((i_dirty>>hr)&1)
4125           {
4126             if(i_regmap[hr]<TEMPREG)
4127             {
4128               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4129                 return 0;
4130             }
4131             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4132             {
4133               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4134                 return 0;
4135             }
4136           }
4137         }
4138         else // Same register but is it 32-bit or dirty?
4139         if(i_regmap[hr]>=0)
4140         {
4141           if(!((regs[t].dirty>>hr)&1))
4142           {
4143             if((i_dirty>>hr)&1)
4144             {
4145               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4146               {
4147                 //printf("%x: dirty no match\n",addr);
4148                 return 0;
4149               }
4150             }
4151           }
4152           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4153           {
4154             //printf("%x: is32 no match\n",addr);
4155             return 0;
4156           }
4157         }
4158       }
4159     }
4160     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4161     // Delay slots are not valid branch targets
4162     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4163     // Delay slots require additional processing, so do not match
4164     if(is_ds[t]) return 0;
4165   }
4166   else
4167   {
4168     int hr;
4169     for(hr=0;hr<HOST_REGS;hr++)
4170     {
4171       if(hr!=EXCLUDE_REG)
4172       {
4173         if(i_regmap[hr]>=0)
4174         {
4175           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4176           {
4177             if((i_dirty>>hr)&1)
4178             {
4179               return 0;
4180             }
4181           }
4182         }
4183       }
4184     }
4185   }
4186   return 1;
4187 }
4188
4189 // Used when a branch jumps into the delay slot of another branch
4190 void ds_assemble_entry(int i)
4191 {
4192   int t=(ba[i]-start)>>2;
4193   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4194   assem_debug("Assemble delay slot at %x\n",ba[i]);
4195   assem_debug("<->\n");
4196   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4197     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4198   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4199   address_generation(t,&regs[t],regs[t].regmap_entry);
4200   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4201     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4202   cop1_usable=0;
4203   is_delayslot=0;
4204   switch(itype[t]) {
4205     case ALU:
4206       alu_assemble(t,&regs[t]);break;
4207     case IMM16:
4208       imm16_assemble(t,&regs[t]);break;
4209     case SHIFT:
4210       shift_assemble(t,&regs[t]);break;
4211     case SHIFTIMM:
4212       shiftimm_assemble(t,&regs[t]);break;
4213     case LOAD:
4214       load_assemble(t,&regs[t]);break;
4215     case LOADLR:
4216       loadlr_assemble(t,&regs[t]);break;
4217     case STORE:
4218       store_assemble(t,&regs[t]);break;
4219     case STORELR:
4220       storelr_assemble(t,&regs[t]);break;
4221     case COP0:
4222       cop0_assemble(t,&regs[t]);break;
4223     case COP1:
4224       cop1_assemble(t,&regs[t]);break;
4225     case C1LS:
4226       c1ls_assemble(t,&regs[t]);break;
4227     case COP2:
4228       cop2_assemble(t,&regs[t]);break;
4229     case C2LS:
4230       c2ls_assemble(t,&regs[t]);break;
4231     case C2OP:
4232       c2op_assemble(t,&regs[t]);break;
4233     case FCONV:
4234       fconv_assemble(t,&regs[t]);break;
4235     case FLOAT:
4236       float_assemble(t,&regs[t]);break;
4237     case FCOMP:
4238       fcomp_assemble(t,&regs[t]);break;
4239     case MULTDIV:
4240       multdiv_assemble(t,&regs[t]);break;
4241     case MOV:
4242       mov_assemble(t,&regs[t]);break;
4243     case SYSCALL:
4244     case HLECALL:
4245     case INTCALL:
4246     case SPAN:
4247     case UJUMP:
4248     case RJUMP:
4249     case CJUMP:
4250     case SJUMP:
4251     case FJUMP:
4252       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
4253   }
4254   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4255   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4256   if(internal_branch(regs[t].is32,ba[i]+4))
4257     assem_debug("branch: internal\n");
4258   else
4259     assem_debug("branch: external\n");
4260   assert(internal_branch(regs[t].is32,ba[i]+4));
4261   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4262   emit_jmp(0);
4263 }
4264
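// Emit the cycle count update and test before a branch; detects simple idle loops
// and registers a CC_STUB that is taken when the count expires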
4265 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4266 {
4267   int count;
4268   int jaddr;
4269   int idle=0;
4270   int t=0;
4271   if(itype[i]==RJUMP)
4272   {
4273     *adj=0;
4274   }
4275   //if(ba[i]>=start && ba[i]<(start+slen*4))
4276   if(internal_branch(branch_regs[i].is32,ba[i]))
4277   {
4278     t=(ba[i]-start)>>2;
4279     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4280     else *adj=ccadj[t];
4281   }
4282   else
4283   {
4284     *adj=0;
4285   }
4286   count=ccadj[i];
4287   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4288     // Idle loop
4289     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4290     idle=(int)out;
4291     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4292     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4293     jaddr=(int)out;
4294     emit_jmp(0);
4295   }
4296   else if(*adj==0||invert) {
4297     int cycles=CLOCK_ADJUST(count+2);
4298     // faster loop HACK
4299     if (t&&*adj) {
4300       int rel=t-i;
4301       if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
4302         cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
4303     }
4304     emit_addimm_and_set_flags(cycles,HOST_CCREG);
4305     jaddr=(int)out;
4306     emit_jns(0);
4307   }
4308   else
4309   {
4310     emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
4311     jaddr=(int)out;
4312     emit_jns(0);
4313   }
4314   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4315 }
4316
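// Out-of-line stub reached when the cycle count runs out at a branch: write back dirty
// registers, store the return PC in pcaddr, call cc_interrupt, then reload registers and return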
4317 void do_ccstub(int n)
4318 {
4319   literal_pool(256);
4320   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4321   set_jump_target(stubs[n][1],(int)out);
4322   int i=stubs[n][4];
4323   if(stubs[n][6]==NULLDS) {
4324     // Delay slot instruction is nullified ("likely" branch)
4325     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4326   }
4327   else if(stubs[n][6]!=TAKEN) {
4328     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4329   }
4330   else {
4331     if(internal_branch(branch_regs[i].is32,ba[i]))
4332       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4333   }
4334   if(stubs[n][5]!=-1)
4335   {
4336     // Save PC as return address
4337     emit_movimm(stubs[n][5],EAX);
4338     emit_writeword(EAX,(int)&pcaddr);
4339   }
4340   else
4341   {
4342     // Return address depends on which way the branch goes
4343     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4344     {
4345       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4346       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4347       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4348       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4349       if(rs1[i]==0)
4350       {
4351         s1l=s2l;s1h=s2h;
4352         s2l=s2h=-1;
4353       }
4354       else if(rs2[i]==0)
4355       {
4356         s2l=s2h=-1;
4357       }
4358       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4359         s1h=s2h=-1;
4360       }
4361       assert(s1l>=0);
4362       #ifdef DESTRUCTIVE_WRITEBACK
4363       if(rs1[i]) {
4364         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4365           emit_loadreg(rs1[i],s1l);
4366       }
4367       else {
4368         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4369           emit_loadreg(rs2[i],s1l);
4370       }
4371       if(s2l>=0)
4372         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4373           emit_loadreg(rs2[i],s2l);
4374       #endif
4375       int hr=0;
4376       int addr=-1,alt=-1,ntaddr=-1;
4377       while(hr<HOST_REGS)
4378       {
4379         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4380            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4381            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4382         {
4383           addr=hr++;break;
4384         }
4385         hr++;
4386       }
4387       while(hr<HOST_REGS)
4388       {
4389         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4390            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4391            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4392         {
4393           alt=hr++;break;
4394         }
4395         hr++;
4396       }
4397       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4398       {
4399         while(hr<HOST_REGS)
4400         {
4401           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4402              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4403              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4404           {
4405             ntaddr=hr;break;
4406           }
4407           hr++;
4408         }
4409         assert(hr<HOST_REGS);
4410       }
4411       if((opcode[i]&0x2f)==4) // BEQ
4412       {
4413         #ifdef HAVE_CMOV_IMM
4414         if(s1h<0) {
4415           if(s2l>=0) emit_cmp(s1l,s2l);
4416           else emit_test(s1l,s1l);
4417           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4418         }
4419         else
4420         #endif
4421         {
4422           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4423           if(s1h>=0) {
4424             if(s2h>=0) emit_cmp(s1h,s2h);
4425             else emit_test(s1h,s1h);
4426             emit_cmovne_reg(alt,addr);
4427           }
4428           if(s2l>=0) emit_cmp(s1l,s2l);
4429           else emit_test(s1l,s1l);
4430           emit_cmovne_reg(alt,addr);
4431         }
4432       }
4433       if((opcode[i]&0x2f)==5) // BNE
4434       {
4435         #ifdef HAVE_CMOV_IMM
4436         if(s1h<0) {
4437           if(s2l>=0) emit_cmp(s1l,s2l);
4438           else emit_test(s1l,s1l);
4439           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4440         }
4441         else
4442         #endif
4443         {
4444           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4445           if(s1h>=0) {
4446             if(s2h>=0) emit_cmp(s1h,s2h);
4447             else emit_test(s1h,s1h);
4448             emit_cmovne_reg(alt,addr);
4449           }
4450           if(s2l>=0) emit_cmp(s1l,s2l);
4451           else emit_test(s1l,s1l);
4452           emit_cmovne_reg(alt,addr);
4453         }
4454       }
4455       if((opcode[i]&0x2f)==6) // BLEZ
4456       {
4457         //emit_movimm(ba[i],alt);
4458         //emit_movimm(start+i*4+8,addr);
4459         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4460         emit_cmpimm(s1l,1);
4461         if(s1h>=0) emit_mov(addr,ntaddr);
4462         emit_cmovl_reg(alt,addr);
4463         if(s1h>=0) {
4464           emit_test(s1h,s1h);
4465           emit_cmovne_reg(ntaddr,addr);
4466           emit_cmovs_reg(alt,addr);
4467         }
4468       }
4469       if((opcode[i]&0x2f)==7) // BGTZ
4470       {
4471         //emit_movimm(ba[i],addr);
4472         //emit_movimm(start+i*4+8,ntaddr);
4473         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4474         emit_cmpimm(s1l,1);
4475         if(s1h>=0) emit_mov(addr,alt);
4476         emit_cmovl_reg(ntaddr,addr);
4477         if(s1h>=0) {
4478           emit_test(s1h,s1h);
4479           emit_cmovne_reg(alt,addr);
4480           emit_cmovs_reg(ntaddr,addr);
4481         }
4482       }
4483       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4484       {
4485         //emit_movimm(ba[i],alt);
4486         //emit_movimm(start+i*4+8,addr);
4487         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4488         if(s1h>=0) emit_test(s1h,s1h);
4489         else emit_test(s1l,s1l);
4490         emit_cmovs_reg(alt,addr);
4491       }
4492       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4493       {
4494         //emit_movimm(ba[i],addr);
4495         //emit_movimm(start+i*4+8,alt);
4496         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4497         if(s1h>=0) emit_test(s1h,s1h);
4498         else emit_test(s1l,s1l);
4499         emit_cmovs_reg(alt,addr);
4500       }
4501       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4502         if(source[i]&0x10000) // BC1T
4503         {
4504           //emit_movimm(ba[i],alt);
4505           //emit_movimm(start+i*4+8,addr);
4506           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4507           emit_testimm(s1l,0x800000);
4508           emit_cmovne_reg(alt,addr);
4509         }
4510         else // BC1F
4511         {
4512           //emit_movimm(ba[i],addr);
4513           //emit_movimm(start+i*4+8,alt);
4514           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4515           emit_testimm(s1l,0x800000);
4516           emit_cmovne_reg(alt,addr);
4517         }
4518       }
4519       emit_writeword(addr,(int)&pcaddr);
4520     }
4521     else
4522     if(itype[i]==RJUMP)
4523     {
4524       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4525       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4526         r=get_reg(branch_regs[i].regmap,RTEMP);
4527       }
4528       emit_writeword(r,(int)&pcaddr);
4529     }
4530     else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);}
4531   }
4532   // Update cycle count
4533   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4534   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
4535   emit_call((int)cc_interrupt);
4536   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
4537   if(stubs[n][6]==TAKEN) {
4538     if(internal_branch(branch_regs[i].is32,ba[i]))
4539       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4540     else if(itype[i]==RJUMP) {
4541       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4542         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4543       else
4544         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4545     }
4546   }else if(stubs[n][6]==NOTTAKEN) {
4547     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4548     else load_all_regs(branch_regs[i].regmap);
4549   }else if(stubs[n][6]==NULLDS) {
4550     // Delay slot instruction is nullified ("likely" branch)
4551     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4552     else load_all_regs(regs[i].regmap);
4553   }else{
4554     load_all_regs(branch_regs[i].regmap);
4555   }
4556   emit_jmp(stubs[n][2]); // return address
4557
4558   /* This works but uses a lot of memory...
4559   emit_readword((int)&last_count,ECX);
4560   emit_add(HOST_CCREG,ECX,EAX);
4561   emit_writeword(EAX,(int)&Count);
4562   emit_call((int)gen_interupt);
4563   emit_readword((int)&Count,HOST_CCREG);
4564   emit_readword((int)&next_interupt,EAX);
4565   emit_readword((int)&pending_exception,EBX);
4566   emit_writeword(EAX,(int)&last_count);
4567   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
4568   emit_test(EBX,EBX);
4569   int jne_instr=(int)out;
4570   emit_jne(0);
4571   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
4572   load_all_regs(branch_regs[i].regmap);
4573   emit_jmp(stubs[n][2]); // return address
4574   set_jump_target(jne_instr,(int)out);
4575   emit_readword((int)&pcaddr,EAX);
4576   // Call get_addr_ht instead of doing the hash table here.
4577   // This code is executed infrequently and takes up a lot of space
4578   // so smaller is better.
4579   emit_storereg(CCREG,HOST_CCREG);
4580   emit_pushreg(EAX);
4581   emit_call((int)get_addr_ht);
4582   emit_loadreg(CCREG,HOST_CCREG);
4583   emit_addimm(ESP,4,ESP);
4584   emit_jmpreg(EAX);*/
4585 }
4586
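// Record a branch location and its target so it can be patched when the block is linked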
4587 void add_to_linker(int addr,int target,int ext)
4588 {
4589   link_addr[linkcount][0]=addr;
4590   link_addr[linkcount][1]=target;
4591   link_addr[linkcount][2]=ext;
4592   linkcount++;
4593 }
4594
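// Write the JAL return address (PC+8) into the link register (r31)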
4595 static void ujump_assemble_write_ra(int i)
4596 {
4597   int rt;
4598   unsigned int return_address;
4599   rt=get_reg(branch_regs[i].regmap,31);
4600   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4601   //assert(rt>=0);
4602   return_address=start+i*4+8;
4603   if(rt>=0) {
4604     #ifdef USE_MINI_HT
4605     if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
4606       int temp=-1; // note: must be ds-safe
4607       #ifdef HOST_TEMPREG
4608       temp=HOST_TEMPREG;
4609       #endif
4610       if(temp>=0) do_miniht_insert(return_address,rt,temp);
4611       else emit_movimm(return_address,rt);
4612     }
4613     else
4614     #endif
4615     {
4616       #ifdef REG_PREFETCH
4617       if(temp>=0)
4618       {
4619         if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4620       }
4621       #endif
4622       emit_movimm(return_address,rt); // PC into link register
4623       #ifdef IMM_PREFETCH
4624       emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4625       #endif
4626     }
4627   }
4628 }
4629
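// Assemble an unconditional jump (J/JAL) together with its delay slot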
4630 void ujump_assemble(int i,struct regstat *i_regs)
4631 {
4632   signed char *i_regmap=i_regs->regmap;
4633   int ra_done=0;
4634   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4635   address_generation(i+1,i_regs,regs[i].regmap_entry);
4636   #ifdef REG_PREFETCH
4637   int temp=get_reg(branch_regs[i].regmap,PTEMP);
4638   if(rt1[i]==31&&temp>=0)
4639   {
4640     int return_address=start+i*4+8;
4641     if(get_reg(branch_regs[i].regmap,31)>0)
4642     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4643   }
4644   #endif
4645   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4646     ujump_assemble_write_ra(i); // writeback ra for DS
4647     ra_done=1;
4648   }
4649   ds_assemble(i+1,i_regs);
4650   uint64_t bc_unneeded=branch_regs[i].u;
4651   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4652   bc_unneeded|=1|(1LL<<rt1[i]);
4653   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4654   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4655                 bc_unneeded,bc_unneeded_upper);
4656   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4657   if(!ra_done&&rt1[i]==31)
4658     ujump_assemble_write_ra(i);
4659   int cc,adj;
4660   cc=get_reg(branch_regs[i].regmap,CCREG);
4661   assert(cc==HOST_CCREG);
4662   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4663   #ifdef REG_PREFETCH
4664   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4665   #endif
4666   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4667   if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4668   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4669   if(internal_branch(branch_regs[i].is32,ba[i]))
4670     assem_debug("branch: internal\n");
4671   else
4672     assem_debug("branch: external\n");
4673   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
4674     ds_assemble_entry(i);
4675   }
4676   else {
4677     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
4678     emit_jmp(0);
4679   }
4680 }
4681
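// Write the JALR return address (PC+8) into the destination register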
4682 static void rjump_assemble_write_ra(int i)
4683 {
4684   int rt,return_address;
4685   assert(rt1[i+1]!=rt1[i]);
4686   assert(rt2[i+1]!=rt1[i]);
4687   rt=get_reg(branch_regs[i].regmap,rt1[i]);
4688   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4689   assert(rt>=0);
4690   return_address=start+i*4+8;
4691   #ifdef REG_PREFETCH
4692   if(temp>=0)
4693   {
4694     if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4695   }
4696   #endif
4697   emit_movimm(return_address,rt); // PC into link register
4698   #ifdef IMM_PREFETCH
4699   emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
4700   #endif
4701 }
4702
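// Assemble a register-indirect jump (JR/JALR) together with its delay slot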
4703 void rjump_assemble(int i,struct regstat *i_regs)
4704 {
4705   signed char *i_regmap=i_regs->regmap;
4706   int temp;
4707   int rs,cc,adj;
4708   int ra_done=0;
4709   rs=get_reg(branch_regs[i].regmap,rs1[i]);
4710   assert(rs>=0);
4711   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4712     // Delay slot abuse, make a copy of the branch address register
4713     temp=get_reg(branch_regs[i].regmap,RTEMP);
4714     assert(temp>=0);
4715     assert(regs[i].regmap[temp]==RTEMP);
4716     emit_mov(rs,temp);
4717     rs=temp;
4718   }
4719   address_generation(i+1,i_regs,regs[i].regmap_entry);
4720   #ifdef REG_PREFETCH
4721   if(rt1[i]==31)
4722   {
4723     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
4724       int return_address=start+i*4+8;
4725       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
4726     }
4727   }
4728   #endif
4729   #ifdef USE_MINI_HT
4730   if(rs1[i]==31) {
4731     int rh=get_reg(regs[i].regmap,RHASH);
4732     if(rh>=0) do_preload_rhash(rh);
4733   }
4734   #endif
4735   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4736     rjump_assemble_write_ra(i);
4737     ra_done=1;
4738   }
4739   ds_assemble(i+1,i_regs);
4740   uint64_t bc_unneeded=branch_regs[i].u;
4741   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4742   bc_unneeded|=1|(1LL<<rt1[i]);
4743   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4744   bc_unneeded&=~(1LL<<rs1[i]);
4745   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4746                 bc_unneeded,bc_unneeded_upper);
4747   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
4748   if(!ra_done&&rt1[i]!=0)
4749     rjump_assemble_write_ra(i);
4750   cc=get_reg(branch_regs[i].regmap,CCREG);
4751   assert(cc==HOST_CCREG);
4752   #ifdef USE_MINI_HT
4753   int rh=get_reg(branch_regs[i].regmap,RHASH);
4754   int ht=get_reg(branch_regs[i].regmap,RHTBL);
4755   if(rs1[i]==31) {
4756     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
4757     do_preload_rhtbl(ht);
4758     do_rhash(rs,rh);
4759   }
4760   #endif
4761   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4762   #ifdef DESTRUCTIVE_WRITEBACK
4763   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
4764     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
4765       emit_loadreg(rs1[i],rs);
4766     }
4767   }
4768   #endif
4769   #ifdef REG_PREFETCH
4770   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4771   #endif
4772   #ifdef USE_MINI_HT
4773   if(rs1[i]==31) {
4774     do_miniht_load(ht,rh);
4775   }
4776   #endif
4777   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
4778   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
4779   //assert(adj==0);
4780   emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
4781   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
4782   if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
4783     // special case for RFE
4784     emit_jmp(0);
4785   else
4786     emit_jns(0);
4787   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4788   #ifdef USE_MINI_HT
4789   if(rs1[i]==31) {
4790     do_miniht_jump(rs,rh,ht);
4791   }
4792   else
4793   #endif
4794   {
4795     //if(rs!=EAX) emit_mov(rs,EAX);
4796     //emit_jmp((int)jump_vaddr_eax);
4797     emit_jmp(jump_vaddr_reg[rs]);
4798   }
4799   /* Check hash table
4800   temp=!rs;
4801   emit_mov(rs,temp);
4802   emit_shrimm(rs,16,rs);
4803   emit_xor(temp,rs,rs);
4804   emit_movzwl_reg(rs,rs);
4805   emit_shlimm(rs,4,rs);
4806   emit_cmpmem_indexed((int)hash_table,rs,temp);
4807   emit_jne((int)out+14);
4808   emit_readword_indexed((int)hash_table+4,rs,rs);
4809   emit_jmpreg(rs);
4810   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
4811   emit_addimm_no_flags(8,rs);
4812   emit_jeq((int)out-17);
4813   // No hit on hash table, call compiler
4814   emit_pushreg(temp);
4815 //DEBUG >
4816 #ifdef DEBUG_CYCLE_COUNT
4817   emit_readword((int)&last_count,ECX);
4818   emit_add(HOST_CCREG,ECX,HOST_CCREG);
4819   emit_readword((int)&next_interupt,ECX);
4820   emit_writeword(HOST_CCREG,(int)&Count);
4821   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
4822   emit_writeword(ECX,(int)&last_count);
4823 #endif
4824 //DEBUG <
4825   emit_storereg(CCREG,HOST_CCREG);
4826   emit_call((int)get_addr);
4827   emit_loadreg(CCREG,HOST_CCREG);
4828   emit_addimm(ESP,4,ESP);
4829   emit_jmpreg(EAX);*/
4830   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4831   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
4832   #endif
4833 }
4834
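// Assemble a conditional branch (BEQ/BNE/BLEZ/BGTZ) together with its delay slot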
4835 void cjump_assemble(int i,struct regstat *i_regs)
4836 {
4837   signed char *i_regmap=i_regs->regmap;
4838   int cc;
4839   int match;
4840   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4841   assem_debug("match=%d\n",match);
4842   int s1h,s1l,s2h,s2l;
4843   int prev_cop1_usable=cop1_usable;
4844   int unconditional=0,nop=0;
4845   int only32=0;
4846   int invert=0;
4847   int internal=internal_branch(branch_regs[i].is32,ba[i]);
4848   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4849   if(!match) invert=1;
4850   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4851   if(i>(ba[i]-start)>>2) invert=1;
4852   #endif
4853
4854   if(ooo[i]) {
4855     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4856     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4857     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4858     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4859   }
4860   else {
4861     s1l=get_reg(i_regmap,rs1[i]);
4862     s1h=get_reg(i_regmap,rs1[i]|64);
4863     s2l=get_reg(i_regmap,rs2[i]);
4864     s2h=get_reg(i_regmap,rs2[i]|64);
4865   }
4866   if(rs1[i]==0&&rs2[i]==0)
4867   {
4868     if(opcode[i]&1) nop=1;
4869     else unconditional=1;
4870     //assert(opcode[i]!=5);
4871     //assert(opcode[i]!=7);
4872     //assert(opcode[i]!=0x15);
4873     //assert(opcode[i]!=0x17);
4874   }
4875   else if(rs1[i]==0)
4876   {
4877     s1l=s2l;s1h=s2h;
4878     s2l=s2h=-1;
4879     only32=(regs[i].was32>>rs2[i])&1;
4880   }
4881   else if(rs2[i]==0)
4882   {
4883     s2l=s2h=-1;
4884     only32=(regs[i].was32>>rs1[i])&1;
4885   }
4886   else {
4887     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
4888   }
4889
4890   if(ooo[i]) {
4891     // Out of order execution (delay slot first)
4892     //printf("OOOE\n");
4893     address_generation(i+1,i_regs,regs[i].regmap_entry);
4894     ds_assemble(i+1,i_regs);
4895     int adj;
4896     uint64_t bc_unneeded=branch_regs[i].u;
4897     uint64_t bc_unneeded_upper=branch_regs[i].uu;
4898     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
4899     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
4900     bc_unneeded|=1;
4901     bc_unneeded_upper|=1;
4902     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4903                   bc_unneeded,bc_unneeded_upper);
4904     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
4905     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4906     cc=get_reg(branch_regs[i].regmap,CCREG);
4907     assert(cc==HOST_CCREG);
4908     if(unconditional)
4909       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4910     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
4911     //assem_debug("cycle count (adj)\n");
4912     if(unconditional) {
4913       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4914       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
4915         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4916         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4917         if(internal)
4918           assem_debug("branch: internal\n");
4919         else
4920           assem_debug("branch: external\n");
4921         if(internal&&is_ds[(ba[i]-start)>>2]) {
4922           ds_assemble_entry(i);
4923         }
4924         else {
4925           add_to_linker((int)out,ba[i],internal);
4926           emit_jmp(0);
4927         }
4928         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4929         if(((u_int)out)&7) emit_addnop(0);
4930         #endif
4931       }
4932     }
4933     else if(nop) {
4934       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
4935       int jaddr=(int)out;
4936       emit_jns(0);
4937       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
4938     }
4939     else {
4940       int taken=0,nottaken=0,nottaken1=0;
4941       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
4942       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4943       if(!only32)
4944       {
4945         assert(s1h>=0);
4946         if(opcode[i]==4) // BEQ
4947         {
4948           if(s2h>=0) emit_cmp(s1h,s2h);
4949           else emit_test(s1h,s1h);
4950           nottaken1=(int)out;
4951           emit_jne(1);
4952         }
4953         if(opcode[i]==5) // BNE
4954         {
4955           if(s2h>=0) emit_cmp(s1h,s2h);
4956           else emit_test(s1h,s1h);
4957           if(invert) taken=(int)out;
4958           else add_to_linker((int)out,ba[i],internal);
4959           emit_jne(0);
4960         }
4961         if(opcode[i]==6) // BLEZ
4962         {
4963           emit_test(s1h,s1h);
4964           if(invert) taken=(int)out;
4965           else add_to_linker((int)out,ba[i],internal);
4966           emit_js(0);
4967           nottaken1=(int)out;
4968           emit_jne(1);
4969         }
4970         if(opcode[i]==7) // BGTZ
4971         {
4972           emit_test(s1h,s1h);
4973           nottaken1=(int)out;
4974           emit_js(1);
4975           if(invert) taken=(int)out;
4976           else add_to_linker((int)out,ba[i],internal);
4977           emit_jne(0);
4978         }
4979       } // if(!only32)
4980
4981       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4982       assert(s1l>=0);
4983       if(opcode[i]==4) // BEQ
4984       {
4985         if(s2l>=0) emit_cmp(s1l,s2l);
4986         else emit_test(s1l,s1l);
4987         if(invert){
4988           nottaken=(int)out;
4989           emit_jne(1);
4990         }else{
4991           add_to_linker((int)out,ba[i],internal);
4992           emit_jeq(0);
4993         }
4994       }
4995       if(opcode[i]==5) // BNE
4996       {
4997         if(s2l>=0) emit_cmp(s1l,s2l);
4998         else emit_test(s1l,s1l);
4999         if(invert){
5000           nottaken=(int)out;
5001           emit_jeq(1);
5002         }else{
5003           add_to_linker((int)out,ba[i],internal);
5004           emit_jne(0);
5005         }
5006       }
5007       if(opcode[i]==6) // BLEZ
5008       {
5009         emit_cmpimm(s1l,1);
5010         if(invert){
5011           nottaken=(int)out;
5012           emit_jge(1);
5013         }else{
5014           add_to_linker((int)out,ba[i],internal);
5015           emit_jl(0);
5016         }
5017       }
5018       if(opcode[i]==7) // BGTZ
5019       {
5020         emit_cmpimm(s1l,1);
5021         if(invert){
5022           nottaken=(int)out;
5023           emit_jl(1);
5024         }else{
5025           add_to_linker((int)out,ba[i],internal);
5026           emit_jge(0);
5027         }
5028       }
5029       if(invert) {
5030         if(taken) set_jump_target(taken,(int)out);
5031         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5032         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5033           if(adj) {
5034             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5035             add_to_linker((int)out,ba[i],internal);
5036           }else{
5037             emit_addnop(13);
5038             add_to_linker((int)out,ba[i],internal*2);
5039           }
5040           emit_jmp(0);
5041         }else
5042         #endif
5043         {
5044           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5045           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5046           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5047           if(internal)
5048             assem_debug("branch: internal\n");
5049           else
5050             assem_debug("branch: external\n");
5051           if(internal&&is_ds[(ba[i]-start)>>2]) {
5052             ds_assemble_entry(i);
5053           }
5054           else {
5055             add_to_linker((int)out,ba[i],internal);
5056             emit_jmp(0);
5057           }
5058         }
5059         set_jump_target(nottaken,(int)out);
5060       }
5061
5062       if(nottaken1) set_jump_target(nottaken1,(int)out);
5063       if(adj) {
5064         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5065       }
5066     } // (!unconditional)
5067   } // if(ooo)
5068   else
5069   {
5070     // In-order execution (branch first)
5071     //if(likely[i]) printf("IOL\n");
5072     //else
5073     //printf("IOE\n");
5074     int taken=0,nottaken=0,nottaken1=0;
5075     if(!unconditional&&!nop) {
5076       if(!only32)
5077       {
5078         assert(s1h>=0);
5079         if((opcode[i]&0x2f)==4) // BEQ
5080         {
5081           if(s2h>=0) emit_cmp(s1h,s2h);
5082           else emit_test(s1h,s1h);
5083           nottaken1=(int)out;
5084           emit_jne(2);
5085         }
5086         if((opcode[i]&0x2f)==5) // BNE
5087         {
5088           if(s2h>=0) emit_cmp(s1h,s2h);
5089           else emit_test(s1h,s1h);
5090           taken=(int)out;
5091           emit_jne(1);
5092         }
5093         if((opcode[i]&0x2f)==6) // BLEZ
5094         {
5095           emit_test(s1h,s1h);
5096           taken=(int)out;
5097           emit_js(1);
5098           nottaken1=(int)out;
5099           emit_jne(2);
5100         }
5101         if((opcode[i]&0x2f)==7) // BGTZ
5102         {
5103           emit_test(s1h,s1h);
5104           nottaken1=(int)out;
5105           emit_js(2);
5106           taken=(int)out;
5107           emit_jne(1);
5108         }
5109       } // if(!only32)
5110
5111       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5112       assert(s1l>=0);
5113       if((opcode[i]&0x2f)==4) // BEQ
5114       {
5115         if(s2l>=0) emit_cmp(s1l,s2l);
5116         else emit_test(s1l,s1l);
5117         nottaken=(int)out;
5118         emit_jne(2);
5119       }
5120       if((opcode[i]&0x2f)==5) // BNE
5121       {
5122         if(s2l>=0) emit_cmp(s1l,s2l);
5123         else emit_test(s1l,s1l);
5124         nottaken=(int)out;
5125         emit_jeq(2);
5126       }
5127       if((opcode[i]&0x2f)==6) // BLEZ
5128       {
5129         emit_cmpimm(s1l,1);
5130         nottaken=(int)out;
5131         emit_jge(2);
5132       }
5133       if((opcode[i]&0x2f)==7) // BGTZ
5134       {
5135         emit_cmpimm(s1l,1);
5136         nottaken=(int)out;
5137         emit_jl(2);
5138       }
5139     } // if(!unconditional)
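    // note: ds_unneeded starts from the liveness after the branch, then re-marks the
    // delay slot's own source registers as needed; bit 0 (r0) is always flagged unneeded.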
5140     int adj;
5141     uint64_t ds_unneeded=branch_regs[i].u;
5142     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5143     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5144     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5145     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5146     ds_unneeded|=1;
5147     ds_unneeded_upper|=1;
5148     // branch taken
5149     if(!nop) {
5150       if(taken) set_jump_target(taken,(int)out);
5151       assem_debug("1:\n");
5152       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5153                     ds_unneeded,ds_unneeded_upper);
5154       // load regs
5155       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5156       address_generation(i+1,&branch_regs[i],0);
5157       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5158       ds_assemble(i+1,&branch_regs[i]);
5159       cc=get_reg(branch_regs[i].regmap,CCREG);
5160       if(cc==-1) {
5161         emit_loadreg(CCREG,cc=HOST_CCREG);
5162         // CHECK: Is the following instruction (fall thru) allocated ok?
5163       }
5164       assert(cc==HOST_CCREG);
5165       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5166       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5167       assem_debug("cycle count (adj)\n");
5168       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5169       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5170       if(internal)
5171         assem_debug("branch: internal\n");
5172       else
5173         assem_debug("branch: external\n");
5174       if(internal&&is_ds[(ba[i]-start)>>2]) {
5175         ds_assemble_entry(i);
5176       }
5177       else {
5178         add_to_linker((int)out,ba[i],internal);
5179         emit_jmp(0);
5180       }
5181     }
5182     // branch not taken
5183     cop1_usable=prev_cop1_usable;
5184     if(!unconditional) {
5185       if(nottaken1) set_jump_target(nottaken1,(int)out);
5186       set_jump_target(nottaken,(int)out);
5187       assem_debug("2:\n");
5188       if(!likely[i]) {
5189         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5190                       ds_unneeded,ds_unneeded_upper);
5191         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5192         address_generation(i+1,&branch_regs[i],0);
5193         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5194         ds_assemble(i+1,&branch_regs[i]);
5195       }
5196       cc=get_reg(branch_regs[i].regmap,CCREG);
5197       if(cc==-1&&!likely[i]) {
5198         // Cycle count isn't in a register, temporarily load it then write it out
5199         emit_loadreg(CCREG,HOST_CCREG);
5200         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5201         int jaddr=(int)out;
5202         emit_jns(0);
5203         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5204         emit_storereg(CCREG,HOST_CCREG);
5205       }
5206       else{
5207         cc=get_reg(i_regmap,CCREG);
5208         assert(cc==HOST_CCREG);
5209         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5210         int jaddr=(int)out;
5211         emit_jns(0);
5212         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5213       }
5214     }
5215   }
5216 }
5217
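// Assemble a REGIMM branch (BLTZ/BGEZ and their AL/likely forms): rs1 is compared
// against zero, and for the "AL" variants r31 receives the return address whether
// or not the branch is taken.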
5218 void sjump_assemble(int i,struct regstat *i_regs)
5219 {
5220   signed char *i_regmap=i_regs->regmap;
5221   int cc;
5222   int match;
5223   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5224   assem_debug("smatch=%d\n",match);
5225   int s1h,s1l;
5226   int prev_cop1_usable=cop1_usable;
5227   int unconditional=0,nevertaken=0;
5228   int only32=0;
5229   int invert=0;
5230   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5231   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5232   if(!match) invert=1;
5233   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5234   if(i>(ba[i]-start)>>2) invert=1;
5235   #endif
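  // note: with the Cortex-A8 hack, backward branches (target earlier than the
  // current instruction) are also assembled inverted, keeping the taken path inline.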
5236
5237   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5238   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5239
5240   if(ooo[i]) {
5241     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5242     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5243   }
5244   else {
5245     s1l=get_reg(i_regmap,rs1[i]);
5246     s1h=get_reg(i_regmap,rs1[i]|64);
5247   }
5248   if(rs1[i]==0)
5249   {
5250     if(opcode2[i]&1) unconditional=1;
5251     else nevertaken=1;
5252     // These are never taken (r0 is never less than zero)
5253     //assert(opcode2[i]!=0);
5254     //assert(opcode2[i]!=2);
5255     //assert(opcode2[i]!=0x10);
5256     //assert(opcode2[i]!=0x12);
5257   }
5258   else {
5259     only32=(regs[i].was32>>rs1[i])&1;
5260   }
5261
5262   if(ooo[i]) {
5263     // Out of order execution (delay slot first)
5264     //printf("OOOE\n");
5265     address_generation(i+1,i_regs,regs[i].regmap_entry);
5266     ds_assemble(i+1,i_regs);
5267     int adj;
5268     uint64_t bc_unneeded=branch_regs[i].u;
5269     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5270     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5271     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5272     bc_unneeded|=1;
5273     bc_unneeded_upper|=1;
5274     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5275                   bc_unneeded,bc_unneeded_upper);
5276     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5277     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5278     if(rt1[i]==31) {
5279       int rt,return_address;
5280       rt=get_reg(branch_regs[i].regmap,31);
5281       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5282       if(rt>=0) {
5283         // Save the PC even if the branch is not taken
5284         return_address=start+i*4+8;
5285         emit_movimm(return_address,rt); // PC into link register
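        // note: this appears to prefetch the hash_table bucket for the return address
        // (same hash as the block lookup) so the lookup on return is warm.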
5286         #ifdef IMM_PREFETCH
5287         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5288         #endif
5289       }
5290     }
5291     cc=get_reg(branch_regs[i].regmap,CCREG);
5292     assert(cc==HOST_CCREG);
5293     if(unconditional)
5294       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5295     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5296     assem_debug("cycle count (adj)\n");
5297     if(unconditional) {
5298       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5299       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5300         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5301         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5302         if(internal)
5303           assem_debug("branch: internal\n");
5304         else
5305           assem_debug("branch: external\n");
5306         if(internal&&is_ds[(ba[i]-start)>>2]) {
5307           ds_assemble_entry(i);
5308         }
5309         else {
5310           add_to_linker((int)out,ba[i],internal);
5311           emit_jmp(0);
5312         }
5313         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5314         if(((u_int)out)&7) emit_addnop(0);
5315         #endif
5316       }
5317     }
5318     else if(nevertaken) {
5319       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5320       int jaddr=(int)out;
5321       emit_jns(0);
5322       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5323     }
5324     else {
5325       int nottaken=0;
5326       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5327       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5328       if(!only32)
5329       {
5330         assert(s1h>=0);
5331         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5332         {
5333           emit_test(s1h,s1h);
5334           if(invert){
5335             nottaken=(int)out;
5336             emit_jns(1);
5337           }else{
5338             add_to_linker((int)out,ba[i],internal);
5339             emit_js(0);
5340           }
5341         }
5342         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5343         {
5344           emit_test(s1h,s1h);
5345           if(invert){
5346             nottaken=(int)out;
5347             emit_js(1);
5348           }else{
5349             add_to_linker((int)out,ba[i],internal);
5350             emit_jns(0);
5351           }
5352         }
5353       } // if(!only32)
5354       else
5355       {
5356         assert(s1l>=0);
5357         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5358         {
5359           emit_test(s1l,s1l);
5360           if(invert){
5361             nottaken=(int)out;
5362             emit_jns(1);
5363           }else{
5364             add_to_linker((int)out,ba[i],internal);
5365             emit_js(0);
5366           }
5367         }
5368         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5369         {
5370           emit_test(s1l,s1l);
5371           if(invert){
5372             nottaken=(int)out;
5373             emit_js(1);
5374           }else{
5375             add_to_linker((int)out,ba[i],internal);
5376             emit_jns(0);
5377           }
5378         }
5379       } // else (only32)
5380
5381       if(invert) {
5382         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5383         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5384           if(adj) {
5385             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5386             add_to_linker((int)out,ba[i],internal);
5387           }else{
5388             emit_addnop(13);
5389             add_to_linker((int)out,ba[i],internal*2);
5390           }
5391           emit_jmp(0);
5392         }else
5393         #endif
5394         {
5395           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5396           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5397           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5398           if(internal)
5399             assem_debug("branch: internal\n");
5400           else
5401             assem_debug("branch: external\n");
5402           if(internal&&is_ds[(ba[i]-start)>>2]) {
5403             ds_assemble_entry(i);
5404           }
5405           else {
5406             add_to_linker((int)out,ba[i],internal);
5407             emit_jmp(0);
5408           }
5409         }
5410         set_jump_target(nottaken,(int)out);
5411       }
5412
5413       if(adj) {
5414         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5415       }
5416     } // (!unconditional)
5417   } // if(ooo)
5418   else
5419   {
5420     // In-order execution (branch first)
5421     //printf("IOE\n");
5422     int nottaken=0;
5423     if(rt1[i]==31) {
5424       int rt,return_address;
5425       rt=get_reg(branch_regs[i].regmap,31);
5426       if(rt>=0) {
5427         // Save the PC even if the branch is not taken
5428         return_address=start+i*4+8;
5429         emit_movimm(return_address,rt); // PC into link register
5430         #ifdef IMM_PREFETCH
5431         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5432         #endif
5433       }
5434     }
5435     if(!unconditional) {
5436       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5437       if(!only32)
5438       {
5439         assert(s1h>=0);
5440         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5441         {
5442           emit_test(s1h,s1h);
5443           nottaken=(int)out;
5444           emit_jns(1);
5445         }
5446         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5447         {
5448           emit_test(s1h,s1h);
5449           nottaken=(int)out;
5450           emit_js(1);
5451         }
5452       } // if(!only32)
5453       else
5454       {
5455         assert(s1l>=0);
5456         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5457         {
5458           emit_test(s1l,s1l);
5459           nottaken=(int)out;
5460           emit_jns(1);
5461         }
5462         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5463         {
5464           emit_test(s1l,s1l);
5465           nottaken=(int)out;
5466           emit_js(1);
5467         }
5468       }
5469     } // if(!unconditional)
5470     int adj;
5471     uint64_t ds_unneeded=branch_regs[i].u;
5472     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5473     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5474     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5475     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5476     ds_unneeded|=1;
5477     ds_unneeded_upper|=1;
5478     // branch taken
5479     if(!nevertaken) {
5480       //assem_debug("1:\n");
5481       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5482                     ds_unneeded,ds_unneeded_upper);
5483       // load regs
5484       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5485       address_generation(i+1,&branch_regs[i],0);
5486       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5487       ds_assemble(i+1,&branch_regs[i]);
5488       cc=get_reg(branch_regs[i].regmap,CCREG);
5489       if(cc==-1) {
5490         emit_loadreg(CCREG,cc=HOST_CCREG);
5491         // CHECK: Is the following instruction (fall thru) allocated ok?
5492       }
5493       assert(cc==HOST_CCREG);
5494       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5495       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5496       assem_debug("cycle count (adj)\n");
5497       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5498       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5499       if(internal)
5500         assem_debug("branch: internal\n");
5501       else
5502         assem_debug("branch: external\n");
5503       if(internal&&is_ds[(ba[i]-start)>>2]) {
5504         ds_assemble_entry(i);
5505       }
5506       else {
5507         add_to_linker((int)out,ba[i],internal);
5508         emit_jmp(0);
5509       }
5510     }
5511     // branch not taken
5512     cop1_usable=prev_cop1_usable;
5513     if(!unconditional) {
5514       set_jump_target(nottaken,(int)out);
5515       assem_debug("1:\n");
5516       if(!likely[i]) {
5517         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5518                       ds_unneeded,ds_unneeded_upper);
5519         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5520         address_generation(i+1,&branch_regs[i],0);
5521         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5522         ds_assemble(i+1,&branch_regs[i]);
5523       }
5524       cc=get_reg(branch_regs[i].regmap,CCREG);
5525       if(cc==-1&&!likely[i]) {
5526         // Cycle count isn't in a register, temporarily load it then write it out
5527         emit_loadreg(CCREG,HOST_CCREG);
5528         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5529         int jaddr=(int)out;
5530         emit_jns(0);
5531         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5532         emit_storereg(CCREG,HOST_CCREG);
5533       }
5534       else{
5535         cc=get_reg(i_regmap,CCREG);
5536         assert(cc==HOST_CCREG);
5537         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5538         int jaddr=(int)out;
5539         emit_jns(0);
5540         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5541       }
5542     }
5543   }
5544 }
5545
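// Assemble a COP1 condition branch (BC1F/BC1T and the likely forms): the FP condition
// bit (0x800000 of the FCR31 copy held in FSREG) selects the direction, and a
// COP1-usable check stub is emitted first if needed.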
5546 void fjump_assemble(int i,struct regstat *i_regs)
5547 {
5548   signed char *i_regmap=i_regs->regmap;
5549   int cc;
5550   int match;
5551   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5552   assem_debug("fmatch=%d\n",match);
5553   int fs,cs;
5554   int eaddr;
5555   int invert=0;
5556   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5557   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5558   if(!match) invert=1;
5559   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5560   if(i>(ba[i]-start)>>2) invert=1;
5561   #endif
5562
5563   if(ooo[i]) {
5564     fs=get_reg(branch_regs[i].regmap,FSREG);
5565     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5566   }
5567   else {
5568     fs=get_reg(i_regmap,FSREG);
5569   }
5570
5571   // Check cop1 unusable
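  // note: this tests what appears to be the CU1 bit (0x20000000) of the status value
  // held in CSREG and branches to an FP exception stub if COP1 is unusable; once
  // emitted, cop1_usable suppresses the check for following instructions.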
5572   if(!cop1_usable) {
5573     cs=get_reg(i_regmap,CSREG);
5574     assert(cs>=0);
5575     emit_testimm(cs,0x20000000);
5576     eaddr=(int)out;
5577     emit_jeq(0);
5578     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
5579     cop1_usable=1;
5580   }
5581
5582   if(ooo[i]) {
5583     // Out of order execution (delay slot first)
5584     //printf("OOOE\n");
5585     ds_assemble(i+1,i_regs);
5586     int adj;
5587     uint64_t bc_unneeded=branch_regs[i].u;
5588     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5589     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5590     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5591     bc_unneeded|=1;
5592     bc_unneeded_upper|=1;
5593     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5594                   bc_unneeded,bc_unneeded_upper);
5595     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5596     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5597     cc=get_reg(branch_regs[i].regmap,CCREG);
5598     assert(cc==HOST_CCREG);
5599     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5600     assem_debug("cycle count (adj)\n");
5601     if(1) {
5602       int nottaken=0;
5603       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5604       if(1) {
5605         assert(fs>=0);
5606         emit_testimm(fs,0x800000);
5607         if(source[i]&0x10000) // BC1T
5608         {
5609           if(invert){
5610             nottaken=(int)out;
5611             emit_jeq(1);
5612           }else{
5613             add_to_linker((int)out,ba[i],internal);
5614             emit_jne(0);
5615           }
5616         }
5617         else // BC1F
5618         {
5619           if(invert){
5620             nottaken=(int)out;
5621             emit_jne(1);
5622           }else{
5623             add_to_linker((int)out,ba[i],internal);
5624             emit_jeq(0);
5625           }
5626         }
5627       } // if(1)
5628
5629       if(invert) {
5630         if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5631         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5632         else if(match) emit_addnop(13);
5633         #endif
5634         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5635         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5636         if(internal)
5637           assem_debug("branch: internal\n");
5638         else
5639           assem_debug("branch: external\n");
5640         if(internal&&is_ds[(ba[i]-start)>>2]) {
5641           ds_assemble_entry(i);
5642         }
5643         else {
5644           add_to_linker((int)out,ba[i],internal);
5645           emit_jmp(0);
5646         }
5647         set_jump_target(nottaken,(int)out);
5648       }
5649
5650       if(adj) {
5651         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5652       }
5653     } // (!unconditional)
5654   } // if(ooo)
5655   else
5656   {
5657     // In-order execution (branch first)
5658     //printf("IOE\n");
5659     int nottaken=0;
5660     if(1) {
5661       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5662       if(1) {
5663         assert(fs>=0);
5664         emit_testimm(fs,0x800000);
5665         if(source[i]&0x10000) // BC1T
5666         {
5667           nottaken=(int)out;
5668           emit_jeq(1);
5669         }
5670         else // BC1F
5671         {
5672           nottaken=(int)out;
5673           emit_jne(1);
5674         }
5675       }
5676     } // if(!unconditional)
5677     int adj;
5678     uint64_t ds_unneeded=branch_regs[i].u;
5679     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5680     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5681     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5682     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5683     ds_unneeded|=1;
5684     ds_unneeded_upper|=1;
5685     // branch taken
5686     //assem_debug("1:\n");
5687     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5688                   ds_unneeded,ds_unneeded_upper);
5689     // load regs
5690     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5691     address_generation(i+1,&branch_regs[i],0);
5692     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5693     ds_assemble(i+1,&branch_regs[i]);
5694     cc=get_reg(branch_regs[i].regmap,CCREG);
5695     if(cc==-1) {
5696       emit_loadreg(CCREG,cc=HOST_CCREG);
5697       // CHECK: Is the following instruction (fall thru) allocated ok?
5698     }
5699     assert(cc==HOST_CCREG);
5700     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5701     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5702     assem_debug("cycle count (adj)\n");
5703     if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5704     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5705     if(internal)
5706       assem_debug("branch: internal\n");
5707     else
5708       assem_debug("branch: external\n");
5709     if(internal&&is_ds[(ba[i]-start)>>2]) {
5710       ds_assemble_entry(i);
5711     }
5712     else {
5713       add_to_linker((int)out,ba[i],internal);
5714       emit_jmp(0);
5715     }
5716
5717     // branch not taken
5718     if(1) { // <- FIXME (don't need this)
5719       set_jump_target(nottaken,(int)out);
5720       assem_debug("1:\n");
5721       if(!likely[i]) {
5722         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5723                       ds_unneeded,ds_unneeded_upper);
5724         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5725         address_generation(i+1,&branch_regs[i],0);
5726         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5727         ds_assemble(i+1,&branch_regs[i]);
5728       }
5729       cc=get_reg(branch_regs[i].regmap,CCREG);
5730       if(cc==-1&&!likely[i]) {
5731         // Cycle count isn't in a register, temporarily load it then write it out
5732         emit_loadreg(CCREG,HOST_CCREG);
5733         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5734         int jaddr=(int)out;
5735         emit_jns(0);
5736         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5737         emit_storereg(CCREG,HOST_CCREG);
5738       }
5739       else{
5740         cc=get_reg(i_regmap,CCREG);
5741         assert(cc==HOST_CCREG);
5742         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5743         int jaddr=(int)out;
5744         emit_jns(0);
5745         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5746       }
5747     }
5748   }
5749 }
5750
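// Assemble a branch whose delay slot falls on the next virtual page and therefore
// cannot be compiled into this block: the branch target is computed into HOST_BTREG
// and control transfers to the next page, whose first instruction is handled by
// pagespan_ds() below.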
5751 static void pagespan_assemble(int i,struct regstat *i_regs)
5752 {
5753   int s1l=get_reg(i_regs->regmap,rs1[i]);
5754   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
5755   int s2l=get_reg(i_regs->regmap,rs2[i]);
5756   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
5757   void *nt_branch=NULL;
5758   int taken=0;
5759   int nottaken=0;
5760   int unconditional=0;
5761   if(rs1[i]==0)
5762   {
5763     s1l=s2l;s1h=s2h;
5764     s2l=s2h=-1;
5765   }
5766   else if(rs2[i]==0)
5767   {
5768     s2l=s2h=-1;
5769   }
5770   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
5771     s1h=s2h=-1;
5772   }
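  // note: pick scratch host registers (addr, alt, and ntaddr for BLEZ/BGTZ-style
  // branches) that hold neither rs1/rs2 nor the cycle count, so the taken and
  // fall-through addresses can be selected with conditional moves.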
5773   int hr=0;
5774   int addr,alt,ntaddr;
5775   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
5776   else {
5777     while(hr<HOST_REGS)
5778     {
5779       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5780          (i_regs->regmap[hr]&63)!=rs1[i] &&
5781          (i_regs->regmap[hr]&63)!=rs2[i] )
5782       {
5783         addr=hr++;break;
5784       }
5785       hr++;
5786     }
5787   }
5788   while(hr<HOST_REGS)
5789   {
5790     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5791        (i_regs->regmap[hr]&63)!=rs1[i] &&
5792        (i_regs->regmap[hr]&63)!=rs2[i] )
5793     {
5794       alt=hr++;break;
5795     }
5796     hr++;
5797   }
5798   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5799   {
5800     while(hr<HOST_REGS)
5801     {
5802       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5803          (i_regs->regmap[hr]&63)!=rs1[i] &&
5804          (i_regs->regmap[hr]&63)!=rs2[i] )
5805       {
5806         ntaddr=hr;break;
5807       }
5808       hr++;
5809     }
5810   }
5811   assert(hr<HOST_REGS);
5812   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
5813     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
5814   }
5815   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5816   if(opcode[i]==2) // J
5817   {
5818     unconditional=1;
5819   }
5820   if(opcode[i]==3) // JAL
5821   {
5822     // TODO: mini_ht
5823     int rt=get_reg(i_regs->regmap,31);
5824     emit_movimm(start+i*4+8,rt);
5825     unconditional=1;
5826   }
5827   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
5828   {
5829     emit_mov(s1l,addr);
5830     if(opcode2[i]==9) // JALR
5831     {
5832       int rt=get_reg(i_regs->regmap,rt1[i]);
5833       emit_movimm(start+i*4+8,rt);
5834     }
5835   }
5836   if((opcode[i]&0x3f)==4) // BEQ
5837   {
5838     if(rs1[i]==rs2[i])
5839     {
5840       unconditional=1;
5841     }
5842     else
5843     #ifdef HAVE_CMOV_IMM
5844     if(s1h<0) {
5845       if(s2l>=0) emit_cmp(s1l,s2l);
5846       else emit_test(s1l,s1l);
5847       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
5848     }
5849     else
5850     #endif
5851     {
5852       assert(s1l>=0);
5853       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5854       if(s1h>=0) {
5855         if(s2h>=0) emit_cmp(s1h,s2h);
5856         else emit_test(s1h,s1h);
5857         emit_cmovne_reg(alt,addr);
5858       }
5859       if(s2l>=0) emit_cmp(s1l,s2l);
5860       else emit_test(s1l,s1l);
5861       emit_cmovne_reg(alt,addr);
5862     }
5863   }
5864   if((opcode[i]&0x3f)==5) // BNE
5865   {
5866     #ifdef HAVE_CMOV_IMM
5867     if(s1h<0) {
5868       if(s2l>=0) emit_cmp(s1l,s2l);
5869       else emit_test(s1l,s1l);
5870       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
5871     }
5872     else
5873     #endif
5874     {
5875       assert(s1l>=0);
5876       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5877       if(s1h>=0) {
5878         if(s2h>=0) emit_cmp(s1h,s2h);
5879         else emit_test(s1h,s1h);
5880         emit_cmovne_reg(alt,addr);
5881       }
5882       if(s2l>=0) emit_cmp(s1l,s2l);
5883       else emit_test(s1l,s1l);
5884       emit_cmovne_reg(alt,addr);
5885     }
5886   }
5887   if((opcode[i]&0x3f)==0x14) // BEQL
5888   {
5889     if(s1h>=0) {
5890       if(s2h>=0) emit_cmp(s1h,s2h);
5891       else emit_test(s1h,s1h);
5892       nottaken=(int)out;
5893       emit_jne(0);
5894     }
5895     if(s2l>=0) emit_cmp(s1l,s2l);
5896     else emit_test(s1l,s1l);
5897     if(nottaken) set_jump_target(nottaken,(int)out);
5898     nottaken=(int)out;
5899     emit_jne(0);
5900   }
5901   if((opcode[i]&0x3f)==0x15) // BNEL
5902   {
5903     if(s1h>=0) {
5904       if(s2h>=0) emit_cmp(s1h,s2h);
5905       else emit_test(s1h,s1h);
5906       taken=(int)out;
5907       emit_jne(0);
5908     }
5909     if(s2l>=0) emit_cmp(s1l,s2l);
5910     else emit_test(s1l,s1l);
5911     nottaken=(int)out;
5912     emit_jeq(0);
5913     if(taken) set_jump_target(taken,(int)out);
5914   }
5915   if((opcode[i]&0x3f)==6) // BLEZ
5916   {
5917     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5918     emit_cmpimm(s1l,1);
5919     if(s1h>=0) emit_mov(addr,ntaddr);
5920     emit_cmovl_reg(alt,addr);
5921     if(s1h>=0) {
5922       emit_test(s1h,s1h);
5923       emit_cmovne_reg(ntaddr,addr);
5924       emit_cmovs_reg(alt,addr);
5925     }
5926   }
5927   if((opcode[i]&0x3f)==7) // BGTZ
5928   {
5929     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
5930     emit_cmpimm(s1l,1);
5931     if(s1h>=0) emit_mov(addr,alt);
5932     emit_cmovl_reg(ntaddr,addr);
5933     if(s1h>=0) {
5934       emit_test(s1h,s1h);
5935       emit_cmovne_reg(alt,addr);
5936       emit_cmovs_reg(ntaddr,addr);
5937     }
5938   }
5939   if((opcode[i]&0x3f)==0x16) // BLEZL
5940   {
5941     assert((opcode[i]&0x3f)!=0x16);
5942   }
5943   if((opcode[i]&0x3f)==0x17) // BGTZL
5944   {
5945     assert((opcode[i]&0x3f)!=0x17);
5946   }
5947   assert(opcode[i]!=1); // BLTZ/BGEZ
5948
5949   //FIXME: Check CSREG
5950   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
5951     if((source[i]&0x30000)==0) // BC1F
5952     {
5953       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5954       emit_testimm(s1l,0x800000);
5955       emit_cmovne_reg(alt,addr);
5956     }
5957     if((source[i]&0x30000)==0x10000) // BC1T
5958     {
5959       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5960       emit_testimm(s1l,0x800000);
5961       emit_cmovne_reg(alt,addr);
5962     }
5963     if((source[i]&0x30000)==0x20000) // BC1FL
5964     {
5965       emit_testimm(s1l,0x800000);
5966       nottaken=(int)out;
5967       emit_jne(0);
5968     }
5969     if((source[i]&0x30000)==0x30000) // BC1TL
5970     {
5971       emit_testimm(s1l,0x800000);
5972       nottaken=(int)out;
5973       emit_jeq(0);
5974     }
5975   }
5976
5977   assert(i_regs->regmap[HOST_CCREG]==CCREG);
5978   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
5979   if(likely[i]||unconditional)
5980   {
5981     emit_movimm(ba[i],HOST_BTREG);
5982   }
5983   else if(addr!=HOST_BTREG)
5984   {
5985     emit_mov(addr,HOST_BTREG);
5986   }
5987   void *branch_addr=out;
5988   emit_jmp(0);
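  // note: start+i*4+4 is the delay slot on the next page; the extra +1 (low bit set)
  // appears to mark it as a delay-slot entry, matching pagespan_ds() below which
  // registers its entry at start+1.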
5989   int target_addr=start+i*4+5;
5990   void *stub=out;
5991   void *compiled_target_addr=check_addr(target_addr);
5992   emit_extjump_ds((int)branch_addr,target_addr);
5993   if(compiled_target_addr) {
5994     set_jump_target((int)branch_addr,(int)compiled_target_addr);
5995     add_link(target_addr,stub);
5996   }
5997   else set_jump_target((int)branch_addr,(int)stub);
5998   if(likely[i]) {
5999     // Not-taken path
6000     set_jump_target((int)nottaken,(int)out);
6001     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6002     void *branch_addr=out;
6003     emit_jmp(0);
6004     int target_addr=start+i*4+8;
6005     void *stub=out;
6006     void *compiled_target_addr=check_addr(target_addr);
6007     emit_extjump_ds((int)branch_addr,target_addr);
6008     if(compiled_target_addr) {
6009       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6010       add_link(target_addr,stub);
6011     }
6012     else set_jump_target((int)branch_addr,(int)stub);
6013   }
6014 }
6015
6016 // Assemble the delay slot for the above
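// (the first instruction of this block is the delay slot of a branch at the end of
// the previous page; it is compiled here, then control either falls into this block
// at start+4 or is dispatched indirectly through the branch target held in BTREG)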
6017 static void pagespan_ds()
6018 {
6019   assem_debug("initial delay slot:\n");
6020   u_int vaddr=start+1;
6021   u_int page=get_page(vaddr);
6022   u_int vpage=get_vpage(vaddr);
6023   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6024   do_dirty_stub_ds();
6025   ll_add(jump_in+page,vaddr,(void *)out);
6026   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6027   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6028     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6029   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6030     emit_writeword(HOST_BTREG,(int)&branch_target);
6031   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6032   address_generation(0,&regs[0],regs[0].regmap_entry);
6033   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6034     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6035   cop1_usable=0;
6036   is_delayslot=0;
6037   switch(itype[0]) {
6038     case ALU:
6039       alu_assemble(0,&regs[0]);break;
6040     case IMM16:
6041       imm16_assemble(0,&regs[0]);break;
6042     case SHIFT:
6043       shift_assemble(0,&regs[0]);break;
6044     case SHIFTIMM:
6045       shiftimm_assemble(0,&regs[0]);break;
6046     case LOAD:
6047       load_assemble(0,&regs[0]);break;
6048     case LOADLR:
6049       loadlr_assemble(0,&regs[0]);break;
6050     case STORE:
6051       store_assemble(0,&regs[0]);break;
6052     case STORELR:
6053       storelr_assemble(0,&regs[0]);break;
6054     case COP0:
6055       cop0_assemble(0,&regs[0]);break;
6056     case COP1:
6057       cop1_assemble(0,&regs[0]);break;
6058     case C1LS:
6059       c1ls_assemble(0,&regs[0]);break;
6060     case COP2:
6061       cop2_assemble(0,&regs[0]);break;
6062     case C2LS:
6063       c2ls_assemble(0,&regs[0]);break;
6064     case C2OP:
6065       c2op_assemble(0,&regs[0]);break;
6066     case FCONV:
6067       fconv_assemble(0,&regs[0]);break;
6068     case FLOAT:
6069       float_assemble(0,&regs[0]);break;
6070     case FCOMP:
6071       fcomp_assemble(0,&regs[0]);break;
6072     case MULTDIV:
6073       multdiv_assemble(0,&regs[0]);break;
6074     case MOV:
6075       mov_assemble(0,&regs[0]);break;
6076     case SYSCALL:
6077     case HLECALL:
6078     case INTCALL:
6079     case SPAN:
6080     case UJUMP:
6081     case RJUMP:
6082     case CJUMP:
6083     case SJUMP:
6084     case FJUMP:
6085       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
6086   }
6087   int btaddr=get_reg(regs[0].regmap,BTREG);
6088   if(btaddr<0) {
6089     btaddr=get_reg(regs[0].regmap,-1);
6090     emit_readword((int)&branch_target,btaddr);
6091   }
6092   assert(btaddr!=HOST_CCREG);
6093   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6094 #ifdef HOST_IMM8
6095   emit_movimm(start+4,HOST_TEMPREG);
6096   emit_cmp(btaddr,HOST_TEMPREG);
6097 #else
6098   emit_cmpimm(btaddr,start+4);
6099 #endif
6100   int branch=(int)out;
6101   emit_jeq(0);
6102   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6103   emit_jmp(jump_vaddr_reg[btaddr]);
6104   set_jump_target(branch,(int)out);
6105   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6106   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6107 }
6108
6109 // Basic liveness analysis for MIPS registers
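// A register is "unneeded" at a point if its value will be overwritten before it is
// next read; bit 0 (r0) is always set.  unneeded_reg_upper tracks the upper 32 bits
// of each register separately.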
6110 void unneeded_registers(int istart,int iend,int r)
6111 {
6112   int i;
6113   uint64_t u,uu,gte_u,b,bu,gte_bu;
6114   uint64_t temp_u,temp_uu,temp_gte_u=0;
6115   uint64_t tdep;
6116   uint64_t gte_u_unknown=0;
6117   if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
6118     gte_u_unknown=~0ll;
6119   if(iend==slen-1) {
6120     u=1;uu=1;
6121     gte_u=gte_u_unknown;
6122   }else{
6123     u=unneeded_reg[iend+1];
6124     uu=unneeded_reg_upper[iend+1];
6125     u=1;uu=1;
6126     gte_u=gte_unneeded[iend+1];
6127   }
6128
6129   for (i=iend;i>=istart;i--)
6130   {
6131     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6132     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6133     {
6134       // If subroutine call, flag return address as a possible branch target
6135       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6136
6137       if(ba[i]<start || ba[i]>=(start+slen*4))
6138       {
6139         // Branch out of this block, flush all regs
6140         u=1;
6141         uu=1;
6142         gte_u=gte_u_unknown;
6143         /* Hexagon hack
6144         if(itype[i]==UJUMP&&rt1[i]==31)
6145         {
6146           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6147         }
6148         if(itype[i]==RJUMP&&rs1[i]==31)
6149         {
6150           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6151         }
6152         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6153           if(itype[i]==UJUMP&&rt1[i]==31)
6154           {
6155             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6156             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6157           }
6158           if(itype[i]==RJUMP&&rs1[i]==31)
6159           {
6160             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6161             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6162           }
6163         }*/
6164         branch_unneeded_reg[i]=u;
6165         branch_unneeded_reg_upper[i]=uu;
6166         // Merge in delay slot
6167         tdep=(~uu>>rt1[i+1])&1;
6168         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6169         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6170         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6171         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6172         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6173         u|=1;uu|=1;
6174         gte_u|=gte_rt[i+1];
6175         gte_u&=~gte_rs[i+1];
6176         // If branch is "likely" (and conditional)
6177         // then we skip the delay slot on the fall-thru path
6178         if(likely[i]) {
6179           if(i<slen-1) {
6180             u&=unneeded_reg[i+2];
6181             uu&=unneeded_reg_upper[i+2];
6182             gte_u&=gte_unneeded[i+2];
6183           }
6184           else
6185           {
6186             u=1;
6187             uu=1;
6188             gte_u=gte_u_unknown;
6189           }
6190         }
6191       }
6192       else
6193       {
6194         // Internal branch, flag target
6195         bt[(ba[i]-start)>>2]=1;
6196         if(ba[i]<=start+i*4) {
6197           // Backward branch
6198           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6199           {
6200             // Unconditional branch
6201             temp_u=1;temp_uu=1;
6202             temp_gte_u=0;
6203           } else {
6204             // Conditional branch (not taken case)
6205             temp_u=unneeded_reg[i+2];
6206             temp_uu=unneeded_reg_upper[i+2];
6207             temp_gte_u&=gte_unneeded[i+2];
6208           }
6209           // Merge in delay slot
6210           tdep=(~temp_uu>>rt1[i+1])&1;
6211           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6212           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6213           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6214           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6215           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6216           temp_u|=1;temp_uu|=1;
6217           temp_gte_u|=gte_rt[i+1];
6218           temp_gte_u&=~gte_rs[i+1];
6219           // If branch is "likely" (and conditional)
6220           // then we skip the delay slot on the fall-thru path
6221           if(likely[i]) {
6222             if(i<slen-1) {
6223               temp_u&=unneeded_reg[i+2];
6224               temp_uu&=unneeded_reg_upper[i+2];
6225               temp_gte_u&=gte_unneeded[i+2];
6226             }
6227             else
6228             {
6229               temp_u=1;
6230               temp_uu=1;
6231               temp_gte_u=gte_u_unknown;
6232             }
6233           }
6234           tdep=(~temp_uu>>rt1[i])&1;
6235           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6236           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6237           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6238           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6239           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6240           temp_u|=1;temp_uu|=1;
6241           temp_gte_u|=gte_rt[i];
6242           temp_gte_u&=~gte_rs[i];
6243           unneeded_reg[i]=temp_u;
6244           unneeded_reg_upper[i]=temp_uu;
6245           gte_unneeded[i]=temp_gte_u;
6246           // Only go three levels deep.  This recursion can take an
6247           // excessive amount of time if there are a lot of nested loops.
6248           if(r<2) {
6249             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6250           }else{
6251             unneeded_reg[(ba[i]-start)>>2]=1;
6252             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6253             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
6254           }
6255         } /*else*/ if(1) {
6256           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6257           {
6258             // Unconditional branch
6259             u=unneeded_reg[(ba[i]-start)>>2];
6260             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6261             gte_u=gte_unneeded[(ba[i]-start)>>2];
6262             branch_unneeded_reg[i]=u;
6263             branch_unneeded_reg_upper[i]=uu;
6264         //u=1;
6265         //uu=1;
6266         //branch_unneeded_reg[i]=u;
6267         //branch_unneeded_reg_upper[i]=uu;
6268             // Merge in delay slot
6269             tdep=(~uu>>rt1[i+1])&1;
6270             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6271             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6272             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6273             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6274             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6275             u|=1;uu|=1;
6276             gte_u|=gte_rt[i+1];
6277             gte_u&=~gte_rs[i+1];
6278           } else {
6279             // Conditional branch
6280             b=unneeded_reg[(ba[i]-start)>>2];
6281             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6282             gte_bu=gte_unneeded[(ba[i]-start)>>2];
6283             branch_unneeded_reg[i]=b;
6284             branch_unneeded_reg_upper[i]=bu;
6285         //b=1;
6286         //bu=1;
6287         //branch_unneeded_reg[i]=b;
6288         //branch_unneeded_reg_upper[i]=bu;
6289             // Branch delay slot
6290             tdep=(~uu>>rt1[i+1])&1;
6291             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6292             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6293             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6294             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6295             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6296             b|=1;bu|=1;
6297             gte_bu|=gte_rt[i+1];
6298             gte_bu&=~gte_rs[i+1];
6299             // If branch is "likely" then we skip the
6300             // delay slot on the fall-thru path
6301             if(likely[i]) {
6302               u=b;
6303               uu=bu;
6304               gte_u=gte_bu;
6305               if(i<slen-1) {
6306                 u&=unneeded_reg[i+2];
6307                 uu&=unneeded_reg_upper[i+2];
6308                 gte_u&=gte_unneeded[i+2];
6309         //u=1;
6310         //uu=1;
6311               }
6312             } else {
6313               u&=b;
6314               uu&=bu;
6315               gte_u&=gte_bu;
6316         //u=1;
6317         //uu=1;
6318             }
6319             if(i<slen-1) {
6320               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6321               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6322         //branch_unneeded_reg[i]=1;
6323         //branch_unneeded_reg_upper[i]=1;
6324             } else {
6325               branch_unneeded_reg[i]=1;
6326               branch_unneeded_reg_upper[i]=1;
6327             }
6328           }
6329         }
6330       }
6331     }
6332     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6333     {
6334       // SYSCALL instruction (software interrupt)
6335       u=1;
6336       uu=1;
6337     }
6338     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6339     {
6340       // ERET instruction (return from interrupt)
6341       u=1;
6342       uu=1;
6343     }
6344     //u=uu=1; // DEBUG
6345     tdep=(~uu>>rt1[i])&1;
6346     // Written registers are unneeded
6347     u|=1LL<<rt1[i];
6348     u|=1LL<<rt2[i];
6349     uu|=1LL<<rt1[i];
6350     uu|=1LL<<rt2[i];
6351     gte_u|=gte_rt[i];
6352     // Accessed registers are needed
6353     u&=~(1LL<<rs1[i]);
6354     u&=~(1LL<<rs2[i]);
6355     uu&=~(1LL<<us1[i]);
6356     uu&=~(1LL<<us2[i]);
6357     gte_u&=~gte_rs[i];
6358     if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
6359       gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
6360     // Source-target dependencies
6361     uu&=~(tdep<<dep1[i]);
6362     uu&=~(tdep<<dep2[i]);
6363     // R0 is always unneeded
6364     u|=1;uu|=1;
6365     // Save it
6366     unneeded_reg[i]=u;
6367     unneeded_reg_upper[i]=uu;
6368     gte_unneeded[i]=gte_u;
6369     /*
6370     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6371     printf("U:");
6372     int r;
6373     for(r=1;r<=CCREG;r++) {
6374       if((unneeded_reg[i]>>r)&1) {
6375         if(r==HIREG) printf(" HI");
6376         else if(r==LOREG) printf(" LO");
6377         else printf(" r%d",r);
6378       }
6379     }
6380     printf(" UU:");
6381     for(r=1;r<=CCREG;r++) {
6382       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6383         if(r==HIREG) printf(" HI");
6384         else if(r==LOREG) printf(" LO");
6385         else printf(" r%d",r);
6386       }
6387     }
6388     printf("\n");*/
6389   }
6390   for (i=iend;i>=istart;i--)
6391   {
6392     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
6393   }
6394 }
6395
6396 // Write back dirty registers as soon as we will no longer modify them,
6397 // so that we don't end up with lots of writes at the branches.
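// Like unneeded_registers() above, this scans backwards, propagating will_dirty/wont_dirty
// host-register bitmaps across branches; the dirty bits in regs[]/branch_regs[] are only
// updated when wr is nonzero, and backward-branch targets are re-scanned with wr=0 to
// bound the amount of work.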
6398 void clean_registers(int istart,int iend,int wr)
6399 {
6400   int i;
6401   int r;
6402   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6403   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6404   if(iend==slen-1) {
6405     will_dirty_i=will_dirty_next=0;
6406     wont_dirty_i=wont_dirty_next=0;
6407   }else{
6408     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6409     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6410   }
6411   for (i=iend;i>=istart;i--)
6412   {
6413     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6414     {
6415       if(ba[i]<start || ba[i]>=(start+slen*4))
6416       {
6417         // Branch out of this block, flush all regs
6418         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6419         {
6420           // Unconditional branch
6421           will_dirty_i=0;
6422           wont_dirty_i=0;
6423           // Merge in delay slot (will dirty)
6424           for(r=0;r<HOST_REGS;r++) {
6425             if(r!=EXCLUDE_REG) {
6426               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6427               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6428               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6429               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6430               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6431               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6432               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6433               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6434               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6435               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6436               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6437               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6438               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6439               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6440             }
6441           }
6442         }
6443         else
6444         {
6445           // Conditional branch
6446           will_dirty_i=0;
6447           wont_dirty_i=wont_dirty_next;
6448           // Merge in delay slot (will dirty)
6449           for(r=0;r<HOST_REGS;r++) {
6450             if(r!=EXCLUDE_REG) {
6451               if(!likely[i]) {
6452                 // Might not dirty if likely branch is not taken
6453                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6454                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6455                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6456                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6457                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6458                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6459                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6460                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6461                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6462                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6463                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6464                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6465                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6466                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6467               }
6468             }
6469           }
6470         }
6471         // Merge in delay slot (wont dirty)
6472         for(r=0;r<HOST_REGS;r++) {
6473           if(r!=EXCLUDE_REG) {
6474             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6475             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6476             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6477             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6478             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6479             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6480             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6481             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6482             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6483             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6484           }
6485         }
6486         if(wr) {
6487           #ifndef DESTRUCTIVE_WRITEBACK
6488           branch_regs[i].dirty&=wont_dirty_i;
6489           #endif
6490           branch_regs[i].dirty|=will_dirty_i;
6491         }
6492       }
6493       else
6494       {
6495         // Internal branch
6496         if(ba[i]<=start+i*4) {
6497           // Backward branch
6498           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6499           {
6500             // Unconditional branch
6501             temp_will_dirty=0;
6502             temp_wont_dirty=0;
6503             // Merge in delay slot (will dirty)
6504             for(r=0;r<HOST_REGS;r++) {
6505               if(r!=EXCLUDE_REG) {
6506                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6507                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6508                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6509                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6510                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6511                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6512                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6513                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6514                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6515                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6516                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6517                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6518                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6519                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6520               }
6521             }
6522           } else {
6523             // Conditional branch (not taken case)
6524             temp_will_dirty=will_dirty_next;
6525             temp_wont_dirty=wont_dirty_next;
6526             // Merge in delay slot (will dirty)
6527             for(r=0;r<HOST_REGS;r++) {
6528               if(r!=EXCLUDE_REG) {
6529                 if(!likely[i]) {
6530                   // Will not dirty if likely branch is not taken
6531                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6532                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6533                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6534                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6535                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6536                   if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6537                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6538                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6539                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6540                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6541                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6542                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6543                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6544                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6545                 }
6546               }
6547             }
6548           }
6549           // Merge in delay slot (wont dirty)
6550           for(r=0;r<HOST_REGS;r++) {
6551             if(r!=EXCLUDE_REG) {
6552               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6553               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6554               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6555               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6556               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6557               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6558               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6559               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6560               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6561               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6562             }
6563           }
6564           // Deal with changed mappings
6565           if(i<iend) {
6566             for(r=0;r<HOST_REGS;r++) {
6567               if(r!=EXCLUDE_REG) {
6568                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
6569                   temp_will_dirty&=~(1<<r);
6570                   temp_wont_dirty&=~(1<<r);
6571                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6572                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6573                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6574                   } else {
6575                     temp_will_dirty|=1<<r;
6576                     temp_wont_dirty|=1<<r;
6577                   }
6578                 }
6579               }
6580             }
6581           }
6582           if(wr) {
6583             will_dirty[i]=temp_will_dirty;
6584             wont_dirty[i]=temp_wont_dirty;
6585             clean_registers((ba[i]-start)>>2,i-1,0);
6586           }else{
6587             // Limit recursion.  It can take an excessive amount
6588             // of time if there are a lot of nested loops.
6589             will_dirty[(ba[i]-start)>>2]=0;
6590             wont_dirty[(ba[i]-start)>>2]=-1;
6591           }
6592         }
6593         /*else*/ if(1)
6594         {
6595           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6596           {
6597             // Unconditional branch
6598             will_dirty_i=0;
6599             wont_dirty_i=0;
6600           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6601             for(r=0;r<HOST_REGS;r++) {
6602               if(r!=EXCLUDE_REG) {
6603                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6604                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
6605                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6606                 }
6607                 if(branch_regs[i].regmap[r]>=0) {
6608                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6609                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6610                 }
6611               }
6612             }
6613           //}
6614             // Merge in delay slot
6615             for(r=0;r<HOST_REGS;r++) {
6616               if(r!=EXCLUDE_REG) {
6617                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6618                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6619                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6620                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6621                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6622                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6623                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6624                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6625                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6626                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6627                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6628                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6629                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6630                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6631               }
6632             }
6633           } else {
6634             // Conditional branch
6635             will_dirty_i=will_dirty_next;
6636             wont_dirty_i=wont_dirty_next;
6637           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6638             for(r=0;r<HOST_REGS;r++) {
6639               if(r!=EXCLUDE_REG) {
6640                 signed char target_reg=branch_regs[i].regmap[r];
6641                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6642                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6643                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6644                 }
6645                 else if(target_reg>=0) {
6646                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6647                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6648                 }
6649                 // Treat delay slot as part of branch too
6650                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6651                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6652                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6653                 }
6654                 else
6655                 {
6656                   will_dirty[i+1]&=~(1<<r);
6657                 }*/
6658               }
6659             }
6660           //}
6661             // Merge in delay slot
6662             for(r=0;r<HOST_REGS;r++) {
6663               if(r!=EXCLUDE_REG) {
6664                 if(!likely[i]) {
6665                   // Might not dirty if likely branch is not taken
6666                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6667                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6668                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6669                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6670                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6671                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6672                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6673                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6674                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6675                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6676                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6677                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6678                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6679                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6680                 }
6681               }
6682             }
6683           }
6684           // Merge in delay slot (won't dirty)
6685           for(r=0;r<HOST_REGS;r++) {
6686             if(r!=EXCLUDE_REG) {
6687               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6688               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6689               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6690               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6691               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6692               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6693               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6694               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6695               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6696               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6697             }
6698           }
6699           if(wr) {
6700             #ifndef DESTRUCTIVE_WRITEBACK
6701             branch_regs[i].dirty&=wont_dirty_i;
6702             #endif
6703             branch_regs[i].dirty|=will_dirty_i;
6704           }
6705         }
6706       }
6707     }
6708     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6709     {
6710       // SYSCALL instruction (software interrupt)
6711       will_dirty_i=0;
6712       wont_dirty_i=0;
6713     }
6714     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6715     {
6716       // ERET instruction (return from interrupt)
6717       will_dirty_i=0;
6718       wont_dirty_i=0;
6719     }
6720     will_dirty_next=will_dirty_i;
6721     wont_dirty_next=wont_dirty_i;
6722     for(r=0;r<HOST_REGS;r++) {
6723       if(r!=EXCLUDE_REG) {
6724         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6725         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6726         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6727         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6728         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6729         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6730         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6731         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6732         if(i>istart) {
6733           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP)
6734           {
6735             // Don't store a register immediately after writing it,
6736             // may prevent dual-issue.
6737             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
6738             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
6739           }
6740         }
6741       }
6742     }
6743     // Save it
6744     will_dirty[i]=will_dirty_i;
6745     wont_dirty[i]=wont_dirty_i;
6746     // Mark registers that won't be dirtied as not dirty
6747     if(wr) {
6748       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
6749       for(r=0;r<HOST_REGS;r++) {
6750         if((will_dirty_i>>r)&1) {
6751           printf(" r%d",r);
6752         }
6753       }
6754       printf("\n");*/
6755
6756       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
6757         regs[i].dirty|=will_dirty_i;
6758         #ifndef DESTRUCTIVE_WRITEBACK
6759         regs[i].dirty&=wont_dirty_i;
6760         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6761         {
6762           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
6763             for(r=0;r<HOST_REGS;r++) {
6764               if(r!=EXCLUDE_REG) {
6765                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
6766                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
6767                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r); assert(!((wont_dirty_i>>r)&1));*/}
6768               }
6769             }
6770           }
6771         }
6772         else
6773         {
6774           if(i<iend) {
6775             for(r=0;r<HOST_REGS;r++) {
6776               if(r!=EXCLUDE_REG) {
6777                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
6778                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
6779                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r); assert(!((wont_dirty_i>>r)&1));*/}
6780               }
6781             }
6782           }
6783         }
6784         #endif
6785       //}
6786     }
6787     // Deal with changed mappings
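    // If the host-register assignment at this instruction differs from the
    // incoming mapping (regmap_pre), the dirty bits computed above are indexed
    // by the new placement, so each bit is copied back from wherever the guest
    // register that used to occupy this slot now lives, or dropped if that
    // register is no longer mapped at all.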
6788     temp_will_dirty=will_dirty_i;
6789     temp_wont_dirty=wont_dirty_i;
6790     for(r=0;r<HOST_REGS;r++) {
6791       if(r!=EXCLUDE_REG) {
6792         int nr;
6793         if(regs[i].regmap[r]==regmap_pre[i][r]) {
6794           if(wr) {
6795             #ifndef DESTRUCTIVE_WRITEBACK
6796             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6797             #endif
6798             regs[i].wasdirty|=will_dirty_i&(1<<r);
6799           }
6800         }
6801         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
6802           // Register moved to a different register
6803           will_dirty_i&=~(1<<r);
6804           wont_dirty_i&=~(1<<r);
6805           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
6806           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
6807           if(wr) {
6808             #ifndef DESTRUCTIVE_WRITEBACK
6809             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6810             #endif
6811             regs[i].wasdirty|=will_dirty_i&(1<<r);
6812           }
6813         }
6814         else {
6815           will_dirty_i&=~(1<<r);
6816           wont_dirty_i&=~(1<<r);
6817           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6818             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6819             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6820           } else {
6821             wont_dirty_i|=1<<r;
6822                 /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r); assert(!((will_dirty_i>>r)&1));*/
6823           }
6824         }
6825       }
6826     }
6827   }
6828 }
6829
6830 #ifdef DISASM
6831   /* disassembly */
6832 void disassemble_inst(int i)
6833 {
6834     if (bt[i]) printf("*"); else printf(" ");
6835     switch(itype[i]) {
6836       case UJUMP:
6837         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6838       case CJUMP:
6839         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
6840       case SJUMP:
6841         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
6842       case FJUMP:
6843         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6844       case RJUMP:
6845         if (opcode[i]==0x9&&rt1[i]!=31)
6846           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
6847         else
6848           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
6849         break;
6850       case SPAN:
6851         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
6852       case IMM16:
6853         if(opcode[i]==0xf) //LUI
6854           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
6855         else
6856           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6857         break;
6858       case LOAD:
6859       case LOADLR:
6860         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6861         break;
6862       case STORE:
6863       case STORELR:
6864         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
6865         break;
6866       case ALU:
6867       case SHIFT:
6868         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
6869         break;
6870       case MULTDIV:
6871         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
6872         break;
6873       case SHIFTIMM:
6874         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6875         break;
6876       case MOV:
6877         if((opcode2[i]&0x1d)==0x10)
6878           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
6879         else if((opcode2[i]&0x1d)==0x11)
6880           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
6881         else
6882           printf (" %x: %s\n",start+i*4,insn[i]);
6883         break;
6884       case COP0:
6885         if(opcode2[i]==0)
6886           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
6887         else if(opcode2[i]==4)
6888           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
6889         else printf (" %x: %s\n",start+i*4,insn[i]);
6890         break;
6891       case COP1:
6892         if(opcode2[i]<3)
6893           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
6894         else if(opcode2[i]>3)
6895           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
6896         else printf (" %x: %s\n",start+i*4,insn[i]);
6897         break;
6898       case COP2:
6899         if(opcode2[i]<3)
6900           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
6901         else if(opcode2[i]>3)
6902           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
6903         else printf (" %x: %s\n",start+i*4,insn[i]);
6904         break;
6905       case C1LS:
6906         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
6907         break;
6908       case C2LS:
6909         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
6910         break;
6911       case INTCALL:
6912         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
6913         break;
6914       default:
6915         //printf (" %s %8x\n",insn[i],source[i]);
6916         printf (" %x: %s\n",start+i*4,insn[i]);
6917     }
6918 }
6919 #else
6920 static void disassemble_inst(int i) {}
6921 #endif // DISASM
6922
6923 #define DRC_TEST_VAL 0x74657374
6924
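// Self-test: emit a tiny stub at 'out' that loads DRC_TEST_VAL (ASCII "test")
// into host register 0 and jumps back through register 14 (the link register
// on ARM), then call it to check that the translation cache is executable.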
6925 static int new_dynarec_test(void)
6926 {
6927   int (*testfunc)(void) = (void *)out;
6928   int ret;
6929   emit_movimm(DRC_TEST_VAL,0); // test
6930   emit_jmpreg(14);
6931   literal_pool(0);
6932 #ifdef __arm__
6933   __clear_cache((void *)testfunc, out);
6934 #endif
6935   SysPrintf("testing if we can run recompiled code..\n");
6936   ret = testfunc();
6937   if (ret == DRC_TEST_VAL)
6938     SysPrintf("test passed.\n");
6939   else
6940     SysPrintf("test failed: %08x\n", ret);
6941   out=(u_char *)BASE_ADDR;
6942   return ret == DRC_TEST_VAL;
6943 }
6944
6945 // clear the state completely, instead of just marking
6946 // things invalid like invalidate_all_pages() does
6947 void new_dynarec_clear_full()
6948 {
6949   int n;
6950   out=(u_char *)BASE_ADDR;
6951   memset(invalid_code,1,sizeof(invalid_code));
6952   memset(hash_table,0xff,sizeof(hash_table));
6953   memset(mini_ht,-1,sizeof(mini_ht));
6954   memset(restore_candidate,0,sizeof(restore_candidate));
6955   memset(shadow,0,sizeof(shadow));
6956   copy=shadow;
6957   expirep=16384; // Expiry pointer, +2 blocks
6958   pending_exception=0;
6959   literalcount=0;
6960   stop_after_jal=0;
6961   inv_code_start=inv_code_end=~0;
6962   // flush the block lookup tables (jump_in/jump_out/jump_dirty)
6963   for(n=0;n<4096;n++) ll_clear(jump_in+n);
6964   for(n=0;n<4096;n++) ll_clear(jump_out+n);
6965   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
6966 }
6967
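// One-time setup: make the translation cache at BASE_ADDR writable and
// executable, reset all recompiler state and run the executability self-test.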
6968 void new_dynarec_init()
6969 {
6970   SysPrintf("Init new dynarec\n");
6971   out=(u_char *)BASE_ADDR;
6972 #if BASE_ADDR_FIXED
6973   if (mmap (out, 1<<TARGET_SIZE_2,
6974             PROT_READ | PROT_WRITE | PROT_EXEC,
6975             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
6976             -1, 0) == MAP_FAILED) {
6977     SysPrintf("mmap() failed: %s\n", strerror(errno));
6978   }
6979 #else
6980   // not all systems allow execute in data segment by default
6981   if (mprotect(out, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
6982     SysPrintf("mprotect() failed: %s\n", strerror(errno));
6983 #endif
6984   int n;
6985   cycle_multiplier=200;
6986   new_dynarec_clear_full();
6987 #ifdef HOST_IMM8
6988   // Copy this into local area so we don't have to put it in every literal pool
6989   invc_ptr=invalid_code;
6990 #endif
6991   arch_init();
6992   new_dynarec_test();
6993 #ifndef RAM_FIXED
6994   ram_offset=(u_int)rdram-0x80000000;
6995 #endif
6996   if (ram_offset!=0)
6997     SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
6998 }
6999
7000 void new_dynarec_cleanup()
7001 {
7002   int n;
7003   #if BASE_ADDR_FIXED
7004   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {SysPrintf("munmap() failed\n");}
7005   #endif
7006   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7007   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7008   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7009   #ifdef ROM_COPY
7010   if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
7011   #endif
7012 }
7013
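// Translate a PSX code address into a host pointer into the backing memory
// (RAM or BIOS) and report in *limit how far that region extends, so the
// recompiler knows where to stop reading.  Returns NULL for unmapped addresses.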
7014 static u_int *get_source_start(u_int addr, u_int *limit)
7015 {
7016   if (addr < 0x00200000 ||
7017     (0xa0000000 <= addr && addr < 0xa0200000)) {
7018     // used for BIOS calls mostly?
7019     *limit = (addr&0xa0000000)|0x00200000;
7020     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7021   }
7022   else if (!Config.HLE && (
7023     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
7024     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
7025     // BIOS
7026     *limit = (addr & 0xfff00000) | 0x80000;
7027     return (u_int *)((u_int)psxR + (addr&0x7ffff));
7028   }
7029   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
7030     *limit = (addr & 0x80600000) + 0x00200000;
7031     return (u_int *)((u_int)rdram + (addr&0x1fffff));
7032   }
  return NULL; // address is not in a directly readable region
7033 }
7034
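// Scan up to 0x1000 bytes of code starting at 'addr' for a "jr $ra"
// (0x03e00008) and return the address just past its delay slot -- a rough
// end-of-function estimate used when deduplicating savestate blocks below.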
7035 static u_int scan_for_ret(u_int addr)
7036 {
7037   u_int limit = 0;
7038   u_int *mem;
7039
7040   mem = get_source_start(addr, &limit);
7041   if (mem == NULL)
7042     return addr;
7043
7044   if (limit > addr + 0x1000)
7045     limit = addr + 0x1000;
7046   for (; addr < limit; addr += 4, mem++) {
7047     if (*mem == 0x03e00008) // jr $ra
7048       return addr + 8;
7049   }
  return addr; // no jr $ra found within the window; return the scan end
7050 }
7051
7052 struct savestate_block {
7053   uint32_t addr;
7054   uint32_t regflags;
7055 };
7056
7057 static int addr_cmp(const void *p1_, const void *p2_)
7058 {
7059   const struct savestate_block *p1 = p1_, *p2 = p2_;
7060   return p1->addr - p2->addr;
7061 }
7062
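// Record the entry address and register-speculation flags of each compiled
// block into 'save' (at most 'size' bytes), roughly skipping entries that fall
// inside an already-recorded function; returns the number of bytes written.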
7063 int new_dynarec_save_blocks(void *save, int size)
7064 {
7065   struct savestate_block *blocks = save;
7066   int maxcount = size / sizeof(blocks[0]);
7067   struct savestate_block tmp_blocks[1024];
7068   struct ll_entry *head;
7069   int p, s, d, o, bcnt;
7070   u_int addr;
7071
7072   o = 0;
7073   for (p = 0; p < sizeof(jump_in) / sizeof(jump_in[0]); p++) {
7074     bcnt = 0;
7075     for (head = jump_in[p]; head != NULL; head = head->next) {
7076       tmp_blocks[bcnt].addr = head->vaddr;
7077       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
7078       bcnt++;
7079     }
7080     if (bcnt < 1)
7081       continue;
7082     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
7083
7084     addr = tmp_blocks[0].addr;
7085     for (s = d = 0; s < bcnt; s++) {
7086       if (tmp_blocks[s].addr < addr)
7087         continue;
7088       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
7089         tmp_blocks[d++] = tmp_blocks[s];
7090       addr = scan_for_ret(tmp_blocks[s].addr);
7091     }
7092
7093     if (o + d > maxcount)
7094       d = maxcount - o;
7095     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
7096     o += d;
7097   }
7098
7099   return o * sizeof(blocks[0]);
7100 }
7101
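// Counterpart of new_dynarec_save_blocks(): after a savestate load, call
// get_addr() on every saved entry so those blocks are recompiled up front,
// temporarily faking GPR values (0x1f800000 for registers flagged as
// scratchpad pointers) so that address speculation behaves as it did when
// the blocks were first compiled.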
7102 void new_dynarec_load_blocks(const void *save, int size)
7103 {
7104   const struct savestate_block *blocks = save;
7105   int count = size / sizeof(blocks[0]);
7106   u_int regs_save[32];
7107   uint32_t f;
7108   int i, b;
7109
7110   get_addr(psxRegs.pc);
7111
7112   // change GPRs for speculation to at least partially work..
7113   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
7114   for (i = 1; i < 32; i++)
7115     psxRegs.GPR.r[i] = 0x80000000;
7116
7117   for (b = 0; b < count; b++) {
7118     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7119       if (f & 1)
7120         psxRegs.GPR.r[i] = 0x1f800000;
7121     }
7122
7123     get_addr(blocks[b].addr);
7124
7125     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
7126       if (f & 1)
7127         psxRegs.GPR.r[i] = 0x80000000;
7128     }
7129   }
7130
7131   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
7132 }
7133
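// Main entry point of the recompiler: translate the MIPS block starting at
// 'addr' into host code at 'out', running the passes listed below.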
7134 int new_recompile_block(int addr)
7135 {
7136   u_int pagelimit = 0;
7137   u_int state_rflags = 0;
7138   int i;
7139
7140   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7141   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7142   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7143   //if(debug)
7144   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7145   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7146   /*if(Count>=312978186) {
7147     rlist();
7148   }*/
7149   //rlist();
7150
7151   // this is just for speculation
7152   for (i = 1; i < 32; i++) {
7153     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
7154       state_rflags |= 1 << i;
7155   }
7156
7157   start = (u_int)addr&~3;
7158   //assert(((u_int)addr&1)==0);
7159   new_dynarec_did_compile=1;
7160   if (Config.HLE && start == 0x80001000) // hlecall
7161   {
7162     // XXX: is this enough? Maybe check hleSoftCall?
7163     u_int beginning=(u_int)out;
7164     u_int page=get_page(start);
7165     invalid_code[start>>12]=0;
7166     emit_movimm(start,0);
7167     emit_writeword(0,(int)&pcaddr);
7168     emit_jmp((int)new_dyna_leave);
7169     literal_pool(0);
7170 #ifdef __arm__
7171     __clear_cache((void *)beginning,out);
7172 #endif
7173     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
7174     return 0;
7175   }
7176
7177   source = get_source_start(start, &pagelimit);
7178   if (source == NULL) {
7179     SysPrintf("Compile at bogus memory address: %08x\n", addr);
7180     exit(1);
7181   }
7182
7183   /* Pass 1: disassemble */
7184   /* Pass 2: register dependencies, branch targets */
7185   /* Pass 3: register allocation */
7186   /* Pass 4: branch dependencies */
7187   /* Pass 5: pre-alloc */
7188   /* Pass 6: optimize clean/dirty state */
7189   /* Pass 7: flag 32-bit registers */
7190   /* Pass 8: assembly */
7191   /* Pass 9: linker */
7192   /* Pass 10: garbage collection / free memory */
7193
7194   int j;
7195   int done=0;
7196   unsigned int type,op,op2;
7197
7198   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7199
7200   /* Pass 1 disassembly */
7201
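  // Decode one instruction per iteration: fill insn[]/itype[]/opcode[] and the
  // per-instruction register tables, and decide where the block should end.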
7202   for(i=0;!done;i++) {
7203     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
7204     minimum_free_regs[i]=0;
7205     opcode[i]=op=source[i]>>26;
7206     switch(op)
7207     {
7208       case 0x00: strcpy(insn[i],"special"); type=NI;
7209         op2=source[i]&0x3f;
7210         switch(op2)
7211         {
7212           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7213           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7214           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7215           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7216           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7217           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7218           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7219           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7220           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7221           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7222           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7223           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7224           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7225           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7226           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7227           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7228           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7229           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7230           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7231           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7232           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7233           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7234           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7235           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7236           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7237           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7238           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7239           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7240           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7241           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7242           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7243           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7244           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7245           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7246           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7247 #if 0
7248           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7249           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7250           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7251           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7252           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7253           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7254           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7255           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7256           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7257           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7258           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7259           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7260           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7261           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7262           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7263           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7264           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7265 #endif
7266         }
7267         break;
7268       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7269         op2=(source[i]>>16)&0x1f;
7270         switch(op2)
7271         {
7272           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7273           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7274           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7275           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7276           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7277           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7278           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7279           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7280           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7281           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7282           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7283           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7284           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7285           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7286         }
7287         break;
7288       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7289       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7290       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7291       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7292       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7293       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7294       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7295       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7296       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7297       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7298       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7299       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7300       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7301       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7302       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7303         op2=(source[i]>>21)&0x1f;
7304         switch(op2)
7305         {
7306           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7307           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7308           case 0x10: strcpy(insn[i],"tlb"); type=NI;
7309           switch(source[i]&0x3f)
7310           {
7311             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
7312             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
7313             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
7314             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
7315             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
7316             //case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
7317           }
7318         }
7319         break;
7320       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7321         op2=(source[i]>>21)&0x1f;
7322         switch(op2)
7323         {
7324           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7325           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7326           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7327           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7328           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7329           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7330           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7331           switch((source[i]>>16)&0x3)
7332           {
7333             case 0x00: strcpy(insn[i],"BC1F"); break;
7334             case 0x01: strcpy(insn[i],"BC1T"); break;
7335             case 0x02: strcpy(insn[i],"BC1FL"); break;
7336             case 0x03: strcpy(insn[i],"BC1TL"); break;
7337           }
7338           break;
7339           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7340           switch(source[i]&0x3f)
7341           {
7342             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7343             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7344             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7345             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7346             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7347             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7348             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7349             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7350             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7351             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7352             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7353             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7354             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7355             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7356             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7357             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7358             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7359             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7360             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7361             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7362             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7363             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7364             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7365             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7366             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7367             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7368             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7369             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7370             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7371             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7372             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7373             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7374             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7375             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7376             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7377           }
7378           break;
7379           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7380           switch(source[i]&0x3f)
7381           {
7382             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7383             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7384             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7385             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7386             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7387             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7388             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7389             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7390             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7391             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7392             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7393             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7394             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7395             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7396             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7397             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7398             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7399             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7400             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7401             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7402             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7403             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7404             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7405             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7406             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7407             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7408             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7409             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7410             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7411             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7412             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7413             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7414             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7415             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7416             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7417           }
7418           break;
7419           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7420           switch(source[i]&0x3f)
7421           {
7422             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7423             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7424           }
7425           break;
7426           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7427           switch(source[i]&0x3f)
7428           {
7429             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7430             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7431           }
7432           break;
7433         }
7434         break;
7435 #if 0
7436       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7437       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7438       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7439       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7440       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7441       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7442       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7443       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7444 #endif
7445       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7446       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7447       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7448       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7449       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7450       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7451       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7452 #if 0
7453       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7454 #endif
7455       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7456       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7457       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7458       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7459 #if 0
7460       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7461       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7462 #endif
7463       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7464       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7465       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7466       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7467 #if 0
7468       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7469       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7470       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7471 #endif
7472       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7473       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7474 #if 0
7475       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7476       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7477       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7478 #endif
7479       case 0x12: strcpy(insn[i],"COP2"); type=NI;
7480         op2=(source[i]>>21)&0x1f;
7481         //if (op2 & 0x10) {
7482         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
7483           if (gte_handlers[source[i]&0x3f]!=NULL) {
7484             if (gte_regnames[source[i]&0x3f]!=NULL)
7485               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
7486             else
7487               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
7488             type=C2OP;
7489           }
7490         }
7491         else switch(op2)
7492         {
7493           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
7494           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
7495           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
7496           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
7497         }
7498         break;
7499       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
7500       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
7501       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7502       default: strcpy(insn[i],"???"); type=NI;
7503         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
7504         break;
7505     }
7506     itype[i]=type;
7507     opcode2[i]=op2;
7508     /* Get registers/immediates */
7509     lt1[i]=0;
7510     us1[i]=0;
7511     us2[i]=0;
7512     dep1[i]=0;
7513     dep2[i]=0;
7514     gte_rs[i]=gte_rt[i]=0;
7515     switch(type) {
7516       case LOAD:
7517         rs1[i]=(source[i]>>21)&0x1f;
7518         rs2[i]=0;
7519         rt1[i]=(source[i]>>16)&0x1f;
7520         rt2[i]=0;
7521         imm[i]=(short)source[i];
7522         break;
7523       case STORE:
7524       case STORELR:
7525         rs1[i]=(source[i]>>21)&0x1f;
7526         rs2[i]=(source[i]>>16)&0x1f;
7527         rt1[i]=0;
7528         rt2[i]=0;
7529         imm[i]=(short)source[i];
7530         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7531         break;
7532       case LOADLR:
7533         // LWL/LWR only load part of the register,
7534         // therefore the target register must be treated as a source too
7535         rs1[i]=(source[i]>>21)&0x1f;
7536         rs2[i]=(source[i]>>16)&0x1f;
7537         rt1[i]=(source[i]>>16)&0x1f;
7538         rt2[i]=0;
7539         imm[i]=(short)source[i];
7540         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7541         if(op==0x26) dep1[i]=rt1[i]; // LWR
7542         break;
7543       case IMM16:
7544         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7545         else rs1[i]=(source[i]>>21)&0x1f;
7546         rs2[i]=0;
7547         rt1[i]=(source[i]>>16)&0x1f;
7548         rt2[i]=0;
7549         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7550           imm[i]=(unsigned short)source[i];
7551         }else{
7552           imm[i]=(short)source[i];
7553         }
7554         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7555         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7556         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7557         break;
7558       case UJUMP:
7559         rs1[i]=0;
7560         rs2[i]=0;
7561         rt1[i]=0;
7562         rt2[i]=0;
7563         // The JAL instruction writes to r31.
7564         if (op&1) {
7565           rt1[i]=31;
7566         }
7567         rs2[i]=CCREG;
7568         break;
7569       case RJUMP:
7570         rs1[i]=(source[i]>>21)&0x1f;
7571         rs2[i]=0;
7572         rt1[i]=0;
7573         rt2[i]=0;
7574         // The JALR instruction writes to rd.
7575         if (op2&1) {
7576           rt1[i]=(source[i]>>11)&0x1f;
7577         }
7578         rs2[i]=CCREG;
7579         break;
7580       case CJUMP:
7581         rs1[i]=(source[i]>>21)&0x1f;
7582         rs2[i]=(source[i]>>16)&0x1f;
7583         rt1[i]=0;
7584         rt2[i]=0;
7585         if(op&2) { // BGTZ/BLEZ
7586           rs2[i]=0;
7587         }
7588         us1[i]=rs1[i];
7589         us2[i]=rs2[i];
7590         likely[i]=op>>4;
7591         break;
7592       case SJUMP:
7593         rs1[i]=(source[i]>>21)&0x1f;
7594         rs2[i]=CCREG;
7595         rt1[i]=0;
7596         rt2[i]=0;
7597         us1[i]=rs1[i];
7598         if(op2&0x10) { // BxxAL
7599           rt1[i]=31;
7600           // NOTE: If the branch is not taken, r31 is still overwritten
7601         }
7602         likely[i]=(op2&2)>>1;
7603         break;
7604       case FJUMP:
7605         rs1[i]=FSREG;
7606         rs2[i]=CSREG;
7607         rt1[i]=0;
7608         rt2[i]=0;
7609         likely[i]=((source[i])>>17)&1;
7610         break;
7611       case ALU:
7612         rs1[i]=(source[i]>>21)&0x1f; // source
7613         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
7614         rt1[i]=(source[i]>>11)&0x1f; // destination
7615         rt2[i]=0;
7616         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7617           us1[i]=rs1[i];us2[i]=rs2[i];
7618         }
7619         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7620           dep1[i]=rs1[i];dep2[i]=rs2[i];
7621         }
7622         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7623           dep1[i]=rs1[i];dep2[i]=rs2[i];
7624         }
7625         break;
7626       case MULTDIV:
7627         rs1[i]=(source[i]>>21)&0x1f; // source
7628         rs2[i]=(source[i]>>16)&0x1f; // divisor
7629         rt1[i]=HIREG;
7630         rt2[i]=LOREG;
7631         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7632           us1[i]=rs1[i];us2[i]=rs2[i];
7633         }
7634         break;
7635       case MOV:
7636         rs1[i]=0;
7637         rs2[i]=0;
7638         rt1[i]=0;
7639         rt2[i]=0;
7640         if(op2==0x10) rs1[i]=HIREG; // MFHI
7641         if(op2==0x11) rt1[i]=HIREG; // MTHI
7642         if(op2==0x12) rs1[i]=LOREG; // MFLO
7643         if(op2==0x13) rt1[i]=LOREG; // MTLO
7644         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7645         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7646         dep1[i]=rs1[i];
7647         break;
7648       case SHIFT:
7649         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7650         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7651         rt1[i]=(source[i]>>11)&0x1f; // destination
7652         rt2[i]=0;
7653         // DSLLV/DSRLV/DSRAV are 64-bit
7654         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
7655         break;
7656       case SHIFTIMM:
7657         rs1[i]=(source[i]>>16)&0x1f;
7658         rs2[i]=0;
7659         rt1[i]=(source[i]>>11)&0x1f;
7660         rt2[i]=0;
7661         imm[i]=(source[i]>>6)&0x1f;
7662         // DSxx32 instructions
7663         if(op2>=0x3c) imm[i]|=0x20;
7664         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
7665         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
7666         break;
7667       case COP0:
7668         rs1[i]=0;
7669         rs2[i]=0;
7670         rt1[i]=0;
7671         rt2[i]=0;
7672         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
7673         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
7674         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7675         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7676         break;
7677       case COP1:
7678         rs1[i]=0;
7679         rs2[i]=0;
7680         rt1[i]=0;
7681         rt2[i]=0;
7682         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7683         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7684         if(op2==5) us1[i]=rs1[i]; // DMTC1
7685         rs2[i]=CSREG;
7686         break;
7687       case COP2:
7688         rs1[i]=0;
7689         rs2[i]=0;
7690         rt1[i]=0;
7691         rt2[i]=0;
7692         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
7693         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
7694         rs2[i]=CSREG;
7695         int gr=(source[i]>>11)&0x1F;
7696         switch(op2)
7697         {
7698           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
7699           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
7700           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
7701           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
7702         }
7703         break;
7704       case C1LS:
7705         rs1[i]=(source[i]>>21)&0x1F;
7706         rs2[i]=CSREG;
7707         rt1[i]=0;
7708         rt2[i]=0;
7709         imm[i]=(short)source[i];
7710         break;
7711       case C2LS:
7712         rs1[i]=(source[i]>>21)&0x1F;
7713         rs2[i]=0;
7714         rt1[i]=0;
7715         rt2[i]=0;
7716         imm[i]=(short)source[i];
7717         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7718         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7719         break;
7720       case C2OP:
7721         rs1[i]=0;
7722         rs2[i]=0;
7723         rt1[i]=0;
7724         rt2[i]=0;
7725         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
7726         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
7727         gte_rt[i]|=1ll<<63; // every op changes flags
7728         if((source[i]&0x3f)==GTE_MVMVA) {
7729           int v = (source[i] >> 15) & 3;
7730           gte_rs[i]&=~0xe3fll;
7731           if(v==3) gte_rs[i]|=0xe00ll;
7732           else gte_rs[i]|=3ll<<(v*2);
7733         }
7734         break;
7735       case FLOAT:
7736       case FCONV:
7737         rs1[i]=0;
7738         rs2[i]=CSREG;
7739         rt1[i]=0;
7740         rt2[i]=0;
7741         break;
7742       case FCOMP:
7743         rs1[i]=FSREG;
7744         rs2[i]=CSREG;
7745         rt1[i]=FSREG;
7746         rt2[i]=0;
7747         break;
7748       case SYSCALL:
7749       case HLECALL:
7750       case INTCALL:
7751         rs1[i]=CCREG;
7752         rs2[i]=0;
7753         rt1[i]=0;
7754         rt2[i]=0;
7755         break;
7756       default:
7757         rs1[i]=0;
7758         rs2[i]=0;
7759         rt1[i]=0;
7760         rt2[i]=0;
7761     }
7762     /* Calculate branch target addresses */
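    // (J/JAL targets combine the top 4 bits of the delay-slot PC with the
    //  26-bit instruction index shifted left by 2; conditional branches are
    //  PC-relative: sign-extended 16-bit offset times 4, added to PC+4.)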
7763     if(type==UJUMP)
7764       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
7765     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
7766       ba[i]=start+i*4+8; // Ignore never taken branch
7767     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
7768       ba[i]=start+i*4+8; // Ignore never taken branch
7769     else if(type==CJUMP||type==SJUMP||type==FJUMP)
7770       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
7771     else ba[i]=-1;
7772     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
7773       int do_in_intrp=0;
7774       // branch in delay slot?
7775       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
7776         // don't compile the first branch; fall back to the interpreter if it's hit
7777         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
7778         do_in_intrp=1;
7779       }
7780       // basic load delay detection
7781       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
7782         int t=(ba[i-1]-start)/4;
7783         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
7784           // jump target wants DS result - potential load delay effect
7785           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
7786           do_in_intrp=1;
7787           bt[t+1]=1; // expected return from interpreter
7788         }
7789         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
7790               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
7791           // v0 overwrite like this is a sign of trouble, bail out
7792           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
7793           do_in_intrp=1;
7794         }
7795       }
7796       if(do_in_intrp) {
7797         rs1[i-1]=CCREG;
7798         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
7799         ba[i-1]=-1;
7800         itype[i-1]=INTCALL;
7801         done=2;
7802         i--; // don't compile the DS
7803       }
7804     }
7805     /* Is this the end of the block? */
7806     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
7807       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
7808         done=2;
7809       }
7810       else {
7811         if(stop_after_jal) done=1;
7812         // Stop on BREAK
7813         if((source[i+1]&0xfc00003f)==0x0d) done=1;
7814       }
7815       // Don't recompile stuff that's already compiled
7816       if(check_addr(start+i*4+4)) done=1;
7817       // Don't get too close to the limit
7818       if(i>MAXBLOCK/2) done=1;
7819     }
7820     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
7821     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
7822     if(done==2) {
7823       // Does the block continue due to a branch?
7824       for(j=i-1;j>=0;j--)
7825       {
7826         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
7827         if(ba[j]==start+i*4+4) done=j=0;
7828         if(ba[j]==start+i*4+8) done=j=0;
7829       }
7830     }
7831     //assert(i<MAXBLOCK-1);
7832     if(start+i*4==pagelimit-4) done=1;
7833     assert(start+i*4<pagelimit);
7834     if (i==MAXBLOCK-1) done=1;
7835     // Stop if we're compiling junk
7836     if(itype[i]==NI&&opcode[i]==0x11) {
7837       done=stop_after_jal=1;
7838       SysPrintf("Disabled speculative precompilation\n");
7839     }
7840   }
7841   slen=i;
7842   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
7843     if(start+i*4==pagelimit) {
7844       itype[i-1]=SPAN;
7845     }
7846   }
7847   assert(slen>0);
7848
7849   /* Pass 2 - Register dependencies and branch targets */
7850
7851   unneeded_registers(0,slen-1,0);
7852
7853   /* Pass 3 - Register allocation */
7854
7855   struct regstat current; // Current register allocations/status
7856   current.is32=1;
7857   current.dirty=0;
7858   current.u=unneeded_reg[0];
7859   current.uu=unneeded_reg_upper[0];
7860   clear_all_regs(current.regmap);
7861   alloc_reg(&current,0,CCREG);
7862   dirty_reg(&current,CCREG);
7863   current.isconst=0;
7864   current.wasconst=0;
7865   current.waswritten=0;
7866   int ds=0;
7867   int cc=0;
7868   int hr=-1;
7869
7870   if((u_int)addr&1) {
7871     // First instruction is delay slot
7872     cc=-1;
7873     bt[1]=1;
7874     ds=1;
7875     unneeded_reg[0]=1;
7876     unneeded_reg_upper[0]=1;
7877     current.regmap[HOST_BTREG]=BTREG;
7878   }
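  // The low bit of addr doubles as a flag meaning "this block starts at a
  // branch delay slot": slot 0 is then compiled as the delay-slot instruction
  // and BTREG is expected to already hold the branch target, so execution can
  // continue there once the slot has run.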
7879
7880   for(i=0;i<slen;i++)
7881   {
7882     if(bt[i])
7883     {
7884       int hr;
7885       for(hr=0;hr<HOST_REGS;hr++)
7886       {
7887         // Is this really necessary?
7888         if(current.regmap[hr]==0) current.regmap[hr]=-1;
7889       }
7890       current.isconst=0;
7891       current.waswritten=0;
7892     }
7893     if(i>1)
7894     {
7895       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
7896       {
7897         if(rs1[i-2]==0||rs2[i-2]==0)
7898         {
7899           if(rs1[i-2]) {
7900             current.is32|=1LL<<rs1[i-2];
7901             int hr=get_reg(current.regmap,rs1[i-2]|64);
7902             if(hr>=0) current.regmap[hr]=-1;
7903           }
7904           if(rs2[i-2]) {
7905             current.is32|=1LL<<rs2[i-2];
7906             int hr=get_reg(current.regmap,rs2[i-2]|64);
7907             if(hr>=0) current.regmap[hr]=-1;
7908           }
7909         }
7910       }
7911     }
7912     current.is32=-1LL;
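    // All R3000A registers are 32 bits wide, so every register is simply
    // forced to 32-bit status here; the 64-bit width tracking above and below
    // is presumably inherited from the N64 (Mupen64plus) version of this
    // recompiler.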
7913
7914     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
7915     regs[i].wasconst=current.isconst;
7916     regs[i].was32=current.is32;
7917     regs[i].wasdirty=current.dirty;
7918     regs[i].loadedconst=0;
7919     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
7920       if(i+1<slen) {
7921         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
7922         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
7923         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
7924         current.u|=1;
7925         current.uu|=1;
7926       } else {
7927         current.u=1;
7928         current.uu=1;
7929       }
7930     } else {
7931       if(i+1<slen) {
7932         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
7933         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
7934         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
7935         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7936         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
7937         current.u|=1;
7938         current.uu|=1;
7939       } else { SysPrintf("oops, branch at end of block with no delay slot\n");exit(1); }
7940     }
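    // current.u / current.uu are 64-bit masks: bit r set means the value of
    // MIPS reg r (or its upper half, for uu) arriving at this instruction is
    // not needed, neither here nor by anything later, so it need not be kept
    // or written back.  The mask is taken from the next instruction and this
    // instruction's own sources are cleared out of it; bit 0 ($zero) is
    // always set.  Example: if this instruction is "addu $t0,$t1,$t2" then
    // the bits for $t1 (r9) and $t2 (r10) are forced clear even if nothing
    // after this point reads them again.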
7941     is_ds[i]=ds;
7942     if(ds) {
7943       ds=0; // Skip delay slot, already allocated as part of branch
7944       // ...but we need to alloc it in case something jumps here
7945       if(i+1<slen) {
7946         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
7947         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
7948       }else{
7949         current.u=branch_unneeded_reg[i-1];
7950         current.uu=branch_unneeded_reg_upper[i-1];
7951       }
7952       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7953       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
7954       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
7955       current.u|=1;
7956       current.uu|=1;
7957       struct regstat temp;
7958       memcpy(&temp,&current,sizeof(current));
7959       temp.wasdirty=temp.dirty;
7960       temp.was32=temp.is32;
7961       // TODO: Take into account unconditional branches, as below
7962       delayslot_alloc(&temp,i);
7963       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
7964       regs[i].wasdirty=temp.wasdirty;
7965       regs[i].was32=temp.was32;
7966       regs[i].dirty=temp.dirty;
7967       regs[i].is32=temp.is32;
7968       regs[i].isconst=0;
7969       regs[i].wasconst=0;
7970       current.isconst=0;
7971       // Create entry (branch target) regmap
7972       for(hr=0;hr<HOST_REGS;hr++)
7973       {
7974         int r=temp.regmap[hr];
7975         if(r>=0) {
7976           if(r!=regmap_pre[i][hr]) {
7977             regs[i].regmap_entry[hr]=-1;
7978           }
7979           else
7980           {
7981             if(r<64){
7982               if((current.u>>r)&1) {
7983                 regs[i].regmap_entry[hr]=-1;
7984                 regs[i].regmap[hr]=-1;
7985                 //Don't clear regs in the delay slot as the branch might need them
7986                 //current.regmap[hr]=-1;
7987               }else
7988                 regs[i].regmap_entry[hr]=r;
7989             }
7990             else {
7991               if((current.uu>>(r&63))&1) {
7992                 regs[i].regmap_entry[hr]=-1;
7993                 regs[i].regmap[hr]=-1;
7994                 //Don't clear regs in the delay slot as the branch might need them
7995                 //current.regmap[hr]=-1;
7996               }else
7997                 regs[i].regmap_entry[hr]=r;
7998             }
7999           }
8000         } else {
8001           // First instruction expects CCREG to be allocated
8002           if(i==0&&hr==HOST_CCREG)
8003             regs[i].regmap_entry[hr]=CCREG;
8004           else
8005             regs[i].regmap_entry[hr]=-1;
8006         }
8007       }
8008     }
8009     else { // Not delay slot
8010       switch(itype[i]) {
8011         case UJUMP:
8012           //current.isconst=0; // DEBUG
8013           //current.wasconst=0; // DEBUG
8014           //regs[i].wasconst=0; // DEBUG
8015           clear_const(&current,rt1[i]);
8016           alloc_cc(&current,i);
8017           dirty_reg(&current,CCREG);
8018           if (rt1[i]==31) {
8019             alloc_reg(&current,i,31);
8020             dirty_reg(&current,31);
8021             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8022             //assert(rt1[i+1]!=rt1[i]);
8023             #ifdef REG_PREFETCH
8024             alloc_reg(&current,i,PTEMP);
8025             #endif
8026             //current.is32|=1LL<<rt1[i];
8027           }
8028           ooo[i]=1;
8029           delayslot_alloc(&current,i+1);
8030           //current.isconst=0; // DEBUG
8031           ds=1;
8032           //printf("i=%d, isconst=%x\n",i,current.isconst);
8033           break;
8034         case RJUMP:
8035           //current.isconst=0;
8036           //current.wasconst=0;
8037           //regs[i].wasconst=0;
8038           clear_const(&current,rs1[i]);
8039           clear_const(&current,rt1[i]);
8040           alloc_cc(&current,i);
8041           dirty_reg(&current,CCREG);
8042           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8043             alloc_reg(&current,i,rs1[i]);
8044             if (rt1[i]!=0) {
8045               alloc_reg(&current,i,rt1[i]);
8046               dirty_reg(&current,rt1[i]);
8047               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
8048               assert(rt1[i+1]!=rt1[i]);
8049               #ifdef REG_PREFETCH
8050               alloc_reg(&current,i,PTEMP);
8051               #endif
8052             }
8053             #ifdef USE_MINI_HT
8054             if(rs1[i]==31) { // JALR
8055               alloc_reg(&current,i,RHASH);
8056               #ifndef HOST_IMM_ADDR32
8057               alloc_reg(&current,i,RHTBL);
8058               #endif
8059             }
8060             #endif
8061             delayslot_alloc(&current,i+1);
8062           } else {
8063             // The delay slot overwrites our source register,
8064             // allocate a temporary register to hold the old value.
8065             current.isconst=0;
8066             current.wasconst=0;
8067             regs[i].wasconst=0;
8068             delayslot_alloc(&current,i+1);
8069             current.isconst=0;
8070             alloc_reg(&current,i,RTEMP);
8071           }
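          // (Hypothetical example: "jr $t0" with "addiu $t0,$zero,1" in its
          // delay slot - the jump must use the pre-delay-slot value of $t0,
          // which is kept in RTEMP.)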
8072           //current.isconst=0; // DEBUG
8073           ooo[i]=1;
8074           ds=1;
8075           break;
8076         case CJUMP:
8077           //current.isconst=0;
8078           //current.wasconst=0;
8079           //regs[i].wasconst=0;
8080           clear_const(&current,rs1[i]);
8081           clear_const(&current,rs2[i]);
8082           if((opcode[i]&0x3E)==4) // BEQ/BNE
8083           {
8084             alloc_cc(&current,i);
8085             dirty_reg(&current,CCREG);
8086             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8087             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8088             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8089             {
8090               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8091               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8092             }
8093             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8094                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8095               // The delay slot overwrites one of our conditions.
8096               // Allocate the branch condition registers instead.
8097               current.isconst=0;
8098               current.wasconst=0;
8099               regs[i].wasconst=0;
8100               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8101               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8102               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8103               {
8104                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8105                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8106               }
8107             }
8108             else
8109             {
8110               ooo[i]=1;
8111               delayslot_alloc(&current,i+1);
8112             }
8113           }
8114           else
8115           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8116           {
8117             alloc_cc(&current,i);
8118             dirty_reg(&current,CCREG);
8119             alloc_reg(&current,i,rs1[i]);
8120             if(!(current.is32>>rs1[i]&1))
8121             {
8122               alloc_reg64(&current,i,rs1[i]);
8123             }
8124             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8125               // The delay slot overwrites one of our conditions.
8126               // Allocate the branch condition registers instead.
8127               current.isconst=0;
8128               current.wasconst=0;
8129               regs[i].wasconst=0;
8130               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8131               if(!((current.is32>>rs1[i])&1))
8132               {
8133                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8134               }
8135             }
8136             else
8137             {
8138               ooo[i]=1;
8139               delayslot_alloc(&current,i+1);
8140             }
8141           }
8142           else
8143           // Don't alloc the delay slot yet because we might not execute it
8144           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8145           {
8146             current.isconst=0;
8147             current.wasconst=0;
8148             regs[i].wasconst=0;
8149             alloc_cc(&current,i);
8150             dirty_reg(&current,CCREG);
8151             alloc_reg(&current,i,rs1[i]);
8152             alloc_reg(&current,i,rs2[i]);
8153             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8154             {
8155               alloc_reg64(&current,i,rs1[i]);
8156               alloc_reg64(&current,i,rs2[i]);
8157             }
8158           }
8159           else
8160           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8161           {
8162             current.isconst=0;
8163             current.wasconst=0;
8164             regs[i].wasconst=0;
8165             alloc_cc(&current,i);
8166             dirty_reg(&current,CCREG);
8167             alloc_reg(&current,i,rs1[i]);
8168             if(!(current.is32>>rs1[i]&1))
8169             {
8170               alloc_reg64(&current,i,rs1[i]);
8171             }
8172           }
8173           ds=1;
8174           //current.isconst=0;
8175           break;
8176         case SJUMP:
8177           //current.isconst=0;
8178           //current.wasconst=0;
8179           //regs[i].wasconst=0;
8180           clear_const(&current,rs1[i]);
8181           clear_const(&current,rt1[i]);
8182           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8183           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8184           {
8185             alloc_cc(&current,i);
8186             dirty_reg(&current,CCREG);
8187             alloc_reg(&current,i,rs1[i]);
8188             if(!(current.is32>>rs1[i]&1))
8189             {
8190               alloc_reg64(&current,i,rs1[i]);
8191             }
8192             if (rt1[i]==31) { // BLTZAL/BGEZAL
8193               alloc_reg(&current,i,31);
8194               dirty_reg(&current,31);
8195               //#ifdef REG_PREFETCH
8196               //alloc_reg(&current,i,PTEMP);
8197               //#endif
8198               //current.is32|=1LL<<rt1[i];
8199             }
8200             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
8201                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
8202               // Allocate the branch condition registers instead.
8203               current.isconst=0;
8204               current.wasconst=0;
8205               regs[i].wasconst=0;
8206               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8207               if(!((current.is32>>rs1[i])&1))
8208               {
8209                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8210               }
8211             }
8212             else
8213             {
8214               ooo[i]=1;
8215               delayslot_alloc(&current,i+1);
8216             }
8217           }
8218           else
8219           // Don't alloc the delay slot yet because we might not execute it
8220           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
8221           {
8222             current.isconst=0;
8223             current.wasconst=0;
8224             regs[i].wasconst=0;
8225             alloc_cc(&current,i);
8226             dirty_reg(&current,CCREG);
8227             alloc_reg(&current,i,rs1[i]);
8228             if(!(current.is32>>rs1[i]&1))
8229             {
8230               alloc_reg64(&current,i,rs1[i]);
8231             }
8232           }
8233           ds=1;
8234           //current.isconst=0;
8235           break;
8236         case FJUMP:
8237           current.isconst=0;
8238           current.wasconst=0;
8239           regs[i].wasconst=0;
8240           if(likely[i]==0) // BC1F/BC1T
8241           {
8242             // TODO: Theoretically we can run out of registers here on x86.
8243             // The delay slot can allocate up to six, and we need to check
8244             // CSREG before executing the delay slot.  Possibly we can drop
8245             // the cycle count and then reload it after checking that the
8246             // FPU is in a usable state, or simply not do out-of-order execution.
8247             alloc_cc(&current,i);
8248             dirty_reg(&current,CCREG);
8249             alloc_reg(&current,i,FSREG);
8250             alloc_reg(&current,i,CSREG);
8251             if(itype[i+1]==FCOMP) {
8252               // The delay slot overwrites the branch condition.
8253               // Allocate the branch condition registers instead.
8254               alloc_cc(&current,i);
8255               dirty_reg(&current,CCREG);
8256               alloc_reg(&current,i,CSREG);
8257               alloc_reg(&current,i,FSREG);
8258             }
8259             else {
8260               ooo[i]=1;
8261               delayslot_alloc(&current,i+1);
8262               alloc_reg(&current,i+1,CSREG);
8263             }
8264           }
8265           else
8266           // Don't alloc the delay slot yet because we might not execute it
8267           if(likely[i]) // BC1FL/BC1TL
8268           {
8269             alloc_cc(&current,i);
8270             dirty_reg(&current,CCREG);
8271             alloc_reg(&current,i,CSREG);
8272             alloc_reg(&current,i,FSREG);
8273           }
8274           ds=1;
8275           current.isconst=0;
8276           break;
8277         case IMM16:
8278           imm16_alloc(&current,i);
8279           break;
8280         case LOAD:
8281         case LOADLR:
8282           load_alloc(&current,i);
8283           break;
8284         case STORE:
8285         case STORELR:
8286           store_alloc(&current,i);
8287           break;
8288         case ALU:
8289           alu_alloc(&current,i);
8290           break;
8291         case SHIFT:
8292           shift_alloc(&current,i);
8293           break;
8294         case MULTDIV:
8295           multdiv_alloc(&current,i);
8296           break;
8297         case SHIFTIMM:
8298           shiftimm_alloc(&current,i);
8299           break;
8300         case MOV:
8301           mov_alloc(&current,i);
8302           break;
8303         case COP0:
8304           cop0_alloc(&current,i);
8305           break;
8306         case COP1:
8307         case COP2:
8308           cop1_alloc(&current,i);
8309           break;
8310         case C1LS:
8311           c1ls_alloc(&current,i);
8312           break;
8313         case C2LS:
8314           c2ls_alloc(&current,i);
8315           break;
8316         case C2OP:
8317           c2op_alloc(&current,i);
8318           break;
8319         case FCONV:
8320           fconv_alloc(&current,i);
8321           break;
8322         case FLOAT:
8323           float_alloc(&current,i);
8324           break;
8325         case FCOMP:
8326           fcomp_alloc(&current,i);
8327           break;
8328         case SYSCALL:
8329         case HLECALL:
8330         case INTCALL:
8331           syscall_alloc(&current,i);
8332           break;
8333         case SPAN:
8334           pagespan_alloc(&current,i);
8335           break;
8336       }
8337
8338       // Drop the upper half of registers that have become 32-bit
8339       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8340       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8341         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8342         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8343         current.uu|=1;
8344       } else {
8345         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8346         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8347         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8348         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8349         current.uu|=1;
8350       }
8351
8352       // Create entry (branch target) regmap
8353       for(hr=0;hr<HOST_REGS;hr++)
8354       {
8355         int r,or,er;
8356         r=current.regmap[hr];
8357         if(r>=0) {
8358           if(r!=regmap_pre[i][hr]) {
8359             // TODO: delay slot (?)
8360             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8361             if(or<0||(r&63)>=TEMPREG){
8362               regs[i].regmap_entry[hr]=-1;
8363             }
8364             else
8365             {
8366               // Just move it to a different register
8367               regs[i].regmap_entry[hr]=r;
8368               // If it was dirty before, it's still dirty
8369               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8370             }
8371           }
8372           else
8373           {
8374             // Unneeded
8375             if(r==0){
8376               regs[i].regmap_entry[hr]=0;
8377             }
8378             else
8379             if(r<64){
8380               if((current.u>>r)&1) {
8381                 regs[i].regmap_entry[hr]=-1;
8382                 //regs[i].regmap[hr]=-1;
8383                 current.regmap[hr]=-1;
8384               }else
8385                 regs[i].regmap_entry[hr]=r;
8386             }
8387             else {
8388               if((current.uu>>(r&63))&1) {
8389                 regs[i].regmap_entry[hr]=-1;
8390                 //regs[i].regmap[hr]=-1;
8391                 current.regmap[hr]=-1;
8392               }else
8393                 regs[i].regmap_entry[hr]=r;
8394             }
8395           }
8396         } else {
8397           // Branches expect CCREG to be allocated at the target
8398           if(regmap_pre[i][hr]==CCREG)
8399             regs[i].regmap_entry[hr]=CCREG;
8400           else
8401             regs[i].regmap_entry[hr]=-1;
8402         }
8403       }
8404       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8405     }
8406
8407     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
8408       current.waswritten|=1<<rs1[i-1];
8409     current.waswritten&=~(1<<rt1[i]);
8410     current.waswritten&=~(1<<rt2[i]);
8411     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
8412       current.waswritten&=~(1<<rs1[i]);
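    // current.waswritten: one bit per MIPS register that was just used as the
    // base of a store with a small (<0x800) offset; the bit is dropped once
    // the register is overwritten or used as a store base with a large
    // offset.  (Opcode 0x3a is SWC2.)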
8413
8414     /* Branch post-alloc */
8415     if(i>0)
8416     {
8417       current.was32=current.is32;
8418       current.wasdirty=current.dirty;
8419       switch(itype[i-1]) {
8420         case UJUMP:
8421           memcpy(&branch_regs[i-1],&current,sizeof(current));
8422           branch_regs[i-1].isconst=0;
8423           branch_regs[i-1].wasconst=0;
8424           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8425           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8426           alloc_cc(&branch_regs[i-1],i-1);
8427           dirty_reg(&branch_regs[i-1],CCREG);
8428           if(rt1[i-1]==31) { // JAL
8429             alloc_reg(&branch_regs[i-1],i-1,31);
8430             dirty_reg(&branch_regs[i-1],31);
8431             branch_regs[i-1].is32|=1LL<<31;
8432           }
8433           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8434           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8435           break;
8436         case RJUMP:
8437           memcpy(&branch_regs[i-1],&current,sizeof(current));
8438           branch_regs[i-1].isconst=0;
8439           branch_regs[i-1].wasconst=0;
8440           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8441           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8442           alloc_cc(&branch_regs[i-1],i-1);
8443           dirty_reg(&branch_regs[i-1],CCREG);
8444           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8445           if(rt1[i-1]!=0) { // JALR
8446             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
8447             dirty_reg(&branch_regs[i-1],rt1[i-1]);
8448             branch_regs[i-1].is32|=1LL<<rt1[i-1];
8449           }
8450           #ifdef USE_MINI_HT
8451           if(rs1[i-1]==31) { // JALR
8452             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8453             #ifndef HOST_IMM_ADDR32
8454             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8455             #endif
8456           }
8457           #endif
8458           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8459           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8460           break;
8461         case CJUMP:
8462           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8463           {
8464             alloc_cc(&current,i-1);
8465             dirty_reg(&current,CCREG);
8466             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8467                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8468               // The delay slot overwrote one of our conditions
8469               // Delay slot goes after the test (in order)
8470               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8471               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8472               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8473               current.u|=1;
8474               current.uu|=1;
8475               delayslot_alloc(&current,i);
8476               current.isconst=0;
8477             }
8478             else
8479             {
8480               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8481               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8482               // Alloc the branch condition registers
8483               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8484               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8485               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8486               {
8487                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8488                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8489               }
8490             }
8491             memcpy(&branch_regs[i-1],&current,sizeof(current));
8492             branch_regs[i-1].isconst=0;
8493             branch_regs[i-1].wasconst=0;
8494             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8495             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8496           }
8497           else
8498           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8499           {
8500             alloc_cc(&current,i-1);
8501             dirty_reg(&current,CCREG);
8502             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8503               // The delay slot overwrote the branch condition
8504               // Delay slot goes after the test (in order)
8505               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8506               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8507               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8508               current.u|=1;
8509               current.uu|=1;
8510               delayslot_alloc(&current,i);
8511               current.isconst=0;
8512             }
8513             else
8514             {
8515               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8516               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8517               // Alloc the branch condition register
8518               alloc_reg(&current,i-1,rs1[i-1]);
8519               if(!(current.is32>>rs1[i-1]&1))
8520               {
8521                 alloc_reg64(&current,i-1,rs1[i-1]);
8522               }
8523             }
8524             memcpy(&branch_regs[i-1],&current,sizeof(current));
8525             branch_regs[i-1].isconst=0;
8526             branch_regs[i-1].wasconst=0;
8527             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8528             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8529           }
8530           else
8531           // Alloc the delay slot in case the branch is taken
8532           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8533           {
8534             memcpy(&branch_regs[i-1],&current,sizeof(current));
8535             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8536             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8537             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8538             alloc_cc(&branch_regs[i-1],i);
8539             dirty_reg(&branch_regs[i-1],CCREG);
8540             delayslot_alloc(&branch_regs[i-1],i);
8541             branch_regs[i-1].isconst=0;
8542             alloc_reg(&current,i,CCREG); // Not taken path
8543             dirty_reg(&current,CCREG);
8544             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8545           }
8546           else
8547           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8548           {
8549             memcpy(&branch_regs[i-1],&current,sizeof(current));
8550             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8551             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8552             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8553             alloc_cc(&branch_regs[i-1],i);
8554             dirty_reg(&branch_regs[i-1],CCREG);
8555             delayslot_alloc(&branch_regs[i-1],i);
8556             branch_regs[i-1].isconst=0;
8557             alloc_reg(&current,i,CCREG); // Not taken path
8558             dirty_reg(&current,CCREG);
8559             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8560           }
8561           break;
8562         case SJUMP:
8563           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8564           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8565           {
8566             alloc_cc(&current,i-1);
8567             dirty_reg(&current,CCREG);
8568             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8569               // The delay slot overwrote the branch condition
8570               // Delay slot goes after the test (in order)
8571               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8572               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8573               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8574               current.u|=1;
8575               current.uu|=1;
8576               delayslot_alloc(&current,i);
8577               current.isconst=0;
8578             }
8579             else
8580             {
8581               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8582               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8583               // Alloc the branch condition register
8584               alloc_reg(&current,i-1,rs1[i-1]);
8585               if(!(current.is32>>rs1[i-1]&1))
8586               {
8587                 alloc_reg64(&current,i-1,rs1[i-1]);
8588               }
8589             }
8590             memcpy(&branch_regs[i-1],&current,sizeof(current));
8591             branch_regs[i-1].isconst=0;
8592             branch_regs[i-1].wasconst=0;
8593             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8594             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8595           }
8596           else
8597           // Alloc the delay slot in case the branch is taken
8598           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8599           {
8600             memcpy(&branch_regs[i-1],&current,sizeof(current));
8601             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8602             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8603             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8604             alloc_cc(&branch_regs[i-1],i);
8605             dirty_reg(&branch_regs[i-1],CCREG);
8606             delayslot_alloc(&branch_regs[i-1],i);
8607             branch_regs[i-1].isconst=0;
8608             alloc_reg(&current,i,CCREG); // Not taken path
8609             dirty_reg(&current,CCREG);
8610             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8611           }
8612           // FIXME: BLTZAL/BGEZAL
8613           if(opcode2[i-1]&0x10) { // BxxZAL
8614             alloc_reg(&branch_regs[i-1],i-1,31);
8615             dirty_reg(&branch_regs[i-1],31);
8616             branch_regs[i-1].is32|=1LL<<31;
8617           }
8618           break;
8619         case FJUMP:
8620           if(likely[i-1]==0) // BC1F/BC1T
8621           {
8622             alloc_cc(&current,i-1);
8623             dirty_reg(&current,CCREG);
8624             if(itype[i]==FCOMP) {
8625               // The delay slot overwrote the branch condition
8626               // Delay slot goes after the test (in order)
8627               delayslot_alloc(&current,i);
8628               current.isconst=0;
8629             }
8630             else
8631             {
8632               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8633               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8634               // Alloc the branch condition register
8635               alloc_reg(&current,i-1,FSREG);
8636             }
8637             memcpy(&branch_regs[i-1],&current,sizeof(current));
8638             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8639           }
8640           else // BC1FL/BC1TL
8641           {
8642             // Alloc the delay slot in case the branch is taken
8643             memcpy(&branch_regs[i-1],&current,sizeof(current));
8644             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8645             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8646             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8647             alloc_cc(&branch_regs[i-1],i);
8648             dirty_reg(&branch_regs[i-1],CCREG);
8649             delayslot_alloc(&branch_regs[i-1],i);
8650             branch_regs[i-1].isconst=0;
8651             alloc_reg(&current,i,CCREG); // Not taken path
8652             dirty_reg(&current,CCREG);
8653             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8654           }
8655           break;
8656       }
8657
8658       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
8659       {
8660         if(rt1[i-1]==31) // JAL/JALR
8661         {
8662           // Subroutine call will return here, don't alloc any registers
8663           current.is32=1;
8664           current.dirty=0;
8665           clear_all_regs(current.regmap);
8666           alloc_reg(&current,i,CCREG);
8667           dirty_reg(&current,CCREG);
8668         }
8669         else if(i+1<slen)
8670         {
8671           // Internal branch will jump here, match registers to caller
8672           current.is32=0x3FFFFFFFFLL;
8673           current.dirty=0;
8674           clear_all_regs(current.regmap);
8675           alloc_reg(&current,i,CCREG);
8676           dirty_reg(&current,CCREG);
8677           for(j=i-1;j>=0;j--)
8678           {
8679             if(ba[j]==start+i*4+4) {
8680               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
8681               current.is32=branch_regs[j].is32;
8682               current.dirty=branch_regs[j].dirty;
8683               break;
8684             }
8685           }
8686           while(j>=0) {
8687             if(ba[j]==start+i*4+4) {
8688               for(hr=0;hr<HOST_REGS;hr++) {
8689                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
8690                   current.regmap[hr]=-1;
8691                 }
8692                 current.is32&=branch_regs[j].is32;
8693                 current.dirty&=branch_regs[j].dirty;
8694               }
8695             }
8696             j--;
8697           }
8698         }
8699       }
8700     }
8701
8702     // Count cycles in between branches
8703     ccadj[i]=cc;
8704     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
8705     {
8706       cc=0;
8707     }
8708 #if !defined(DRC_DBG)
8709     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
8710     {
8711       // GTE runs in parallel until accessed, divide by 2 for a rough guess
8712       cc+=gte_cycletab[source[i]&0x3f]/2;
8713     }
8714     else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // charging plain load/store here causes weird timing issues, so only C1LS pays the penalty
8715     {
8716       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
8717     }
8718     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
8719     {
8720       cc+=4;
8721     }
8722     else if(itype[i]==C2LS)
8723     {
8724       cc+=4;
8725     }
8726 #endif
8727     else
8728     {
8729       cc++;
8730     }
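    // Rough per-instruction cycle model (when DRC_DBG is not set): one cycle
    // by default; a GTE op (C2OP) with a table cost above 2 is charged half
    // that cost since the GTE runs in parallel; C1LS costs 2, C2LS costs 4,
    // and the third store in an unbroken run of stores costs 4.  E.g. a GTE
    // op with a gte_cycletab entry of 14 advances cc by 7 instead of 1.  cc
    // is zeroed right after a branch and at SYSCALL/HLECALL.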
8731
8732     flush_dirty_uppers(&current);
8733     if(!is_ds[i]) {
8734       regs[i].is32=current.is32;
8735       regs[i].dirty=current.dirty;
8736       regs[i].isconst=current.isconst;
8737       memcpy(constmap[i],current_constmap,sizeof(current_constmap));
8738     }
8739     for(hr=0;hr<HOST_REGS;hr++) {
8740       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
8741         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
8742           regs[i].wasconst&=~(1<<hr);
8743         }
8744       }
8745     }
8746     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
8747     regs[i].waswritten=current.waswritten;
8748   }
8749
8750   /* Pass 4 - Cull unused host registers */
8751
8752   uint64_t nr=0;
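  // nr is a bitmask over host registers: bit hr set means the value host reg
  // hr holds on entry to instruction i is still needed later (by a following
  // instruction, the delay slot, or a branch target).  The pass walks the
  // block backwards, updating the mask per instruction and saving it in
  // needed_reg[i]; anything left unneeded is deallocated below.  For example,
  // for "addu $t0,$t1,$t2" the host regs mapped to $t1/$t2 get their bits
  // set, while the one mapped to $t0 is cleared since $t0 is overwritten.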
8753
8754   for (i=slen-1;i>=0;i--)
8755   {
8756     int hr;
8757     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8758     {
8759       if(ba[i]<start || ba[i]>=(start+slen*4))
8760       {
8761         // Branch out of this block, don't need anything
8762         nr=0;
8763       }
8764       else
8765       {
8766         // Internal branch
8767         // Need whatever matches the target
8768         nr=0;
8769         int t=(ba[i]-start)>>2;
8770         for(hr=0;hr<HOST_REGS;hr++)
8771         {
8772           if(regs[i].regmap_entry[hr]>=0) {
8773             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
8774           }
8775         }
8776       }
8777       // Conditional branch may need registers for following instructions
8778       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8779       {
8780         if(i<slen-2) {
8781           nr|=needed_reg[i+2];
8782           for(hr=0;hr<HOST_REGS;hr++)
8783           {
8784             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
8785             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
8786           }
8787         }
8788       }
8789       // Don't need stuff which is overwritten
8790       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8791       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8792       // Merge in delay slot
8793       for(hr=0;hr<HOST_REGS;hr++)
8794       {
8795         if(!likely[i]) {
8796           // These are overwritten unless the branch is "likely"
8797           // and the delay slot is nullified if not taken
8798           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8799           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8800         }
8801         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8802         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8803         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8804         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8805         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8806         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8807         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8808         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8809         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
8810           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8811           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8812         }
8813         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
8814           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8815           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8816         }
8817         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
8818           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8819           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8820         }
8821       }
8822     }
8823     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
8824     {
8825       // SYSCALL instruction (software interrupt)
8826       nr=0;
8827     }
8828     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
8829     {
8830       // ERET instruction (return from interrupt)
8831       nr=0;
8832     }
8833     else // Non-branch
8834     {
8835       if(i<slen-1) {
8836         for(hr=0;hr<HOST_REGS;hr++) {
8837           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
8838           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
8839           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8840           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8841         }
8842       }
8843     }
8844     for(hr=0;hr<HOST_REGS;hr++)
8845     {
8846       // Overwritten registers are not needed
8847       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8848       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8849       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8850       // Source registers are needed
8851       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8852       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8853       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
8854       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
8855       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8856       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8857       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8858       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8859       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
8860         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8861         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8862       }
8863       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
8864         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8865         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8866       }
8867       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
8868         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8869         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8870       }
8871       // Don't store a register back immediately after writing it,
8872       // as that may prevent dual-issue.
8873       // But do store it if this is a branch target; otherwise we
8874       // might have to reload the register before the branch.
8875       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
8876         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
8877            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
8878           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8879           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8880         }
8881         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
8882            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
8883           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8884           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8885         }
8886       }
8887     }
8888     // Cycle count is needed at branches.  Assume it is needed at the target too.
8889     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
8890       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
8891       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
8892     }
8893     // Save it
8894     needed_reg[i]=nr;
8895
8896     // Deallocate unneeded registers
8897     for(hr=0;hr<HOST_REGS;hr++)
8898     {
8899       if(!((nr>>hr)&1)) {
8900         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
8901         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
8902            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8903            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
8904         {
8905           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8906           {
8907             if(likely[i]) {
8908               regs[i].regmap[hr]=-1;
8909               regs[i].isconst&=~(1<<hr);
8910               if(i<slen-2) {
8911                 regmap_pre[i+2][hr]=-1;
8912                 regs[i+2].wasconst&=~(1<<hr);
8913               }
8914             }
8915           }
8916         }
8917         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8918         {
8919           int d1=0,d2=0,map=0,temp=0;
8920           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
8921           {
8922             d1=dep1[i+1];
8923             d2=dep2[i+1];
8924           }
8925           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
8926              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
8927             map=INVCP;
8928           }
8929           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
8930              itype[i+1]==C1LS || itype[i+1]==C2LS)
8931             temp=FTEMP;
8932           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
8933              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8934              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
8935              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
8936              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
8937              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
8938              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
8939              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
8940              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
8941              regs[i].regmap[hr]!=map )
8942           {
8943             regs[i].regmap[hr]=-1;
8944             regs[i].isconst&=~(1<<hr);
8945             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
8946                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
8947                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
8948                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
8949                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
8950                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
8951                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
8952                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
8953                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
8954                branch_regs[i].regmap[hr]!=map)
8955             {
8956               branch_regs[i].regmap[hr]=-1;
8957               branch_regs[i].regmap_entry[hr]=-1;
8958               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8959               {
8960                 if(!likely[i]&&i<slen-2) {
8961                   regmap_pre[i+2][hr]=-1;
8962                   regs[i+2].wasconst&=~(1<<hr);
8963                 }
8964               }
8965             }
8966           }
8967         }
8968         else
8969         {
8970           // Non-branch
8971           if(i>0)
8972           {
8973             int d1=0,d2=0,map=-1,temp=-1;
8974             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
8975             {
8976               d1=dep1[i];
8977               d2=dep2[i];
8978             }
8979             if(itype[i]==STORE || itype[i]==STORELR ||
8980                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
8981               map=INVCP;
8982             }
8983             if(itype[i]==LOADLR || itype[i]==STORELR ||
8984                itype[i]==C1LS || itype[i]==C2LS)
8985               temp=FTEMP;
8986             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8987                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
8988                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
8989                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
8990                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
8991                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
8992             {
8993               if(i<slen-1&&!is_ds[i]) {
8994                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
8995                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
8996                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
8997                 {
8998                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
8999                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9000                 }
9001                 regmap_pre[i+1][hr]=-1;
9002                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9003                 regs[i+1].wasconst&=~(1<<hr);
9004               }
9005               regs[i].regmap[hr]=-1;
9006               regs[i].isconst&=~(1<<hr);
9007             }
9008           }
9009         }
9010       }
9011     }
9012   }
9013
9014   /* Pass 5 - Pre-allocate registers */
9015
9016   // If a register is allocated during a loop, try to allocate it for the
9017   // entire loop, if possible.  This avoids loading/storing registers
9018   // inside of the loop.
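  // A hypothetical example of what this buys:
  //
  //   L: lw    $t0, 0($s0)
  //      addu  $s1, $s1, $t0
  //      addiu $s0, $s0, 4
  //      bne   $s0, $s2, L
  //      nop
  //
  // If $s1/$s2 only get host registers somewhere inside the body, extending
  // that allocation back to the loop head (the branch target) lets the whole
  // loop keep them in host registers instead of reloading and spilling them
  // on every iteration.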
9019
9020   signed char f_regmap[HOST_REGS];
9021   clear_all_regs(f_regmap);
9022   for(i=0;i<slen-1;i++)
9023   {
9024     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9025     {
9026       if(ba[i]>=start && ba[i]<(start+i*4))
9027       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9028       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9029       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9030       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9031       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
9032       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
9033       {
9034         int t=(ba[i]-start)>>2;
9035         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9036         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
9037         for(hr=0;hr<HOST_REGS;hr++)
9038         {
9039           if(regs[i].regmap[hr]>64) {
9040             if(!((regs[i].dirty>>hr)&1))
9041               f_regmap[hr]=regs[i].regmap[hr];
9042             else f_regmap[hr]=-1;
9043           }
9044           else if(regs[i].regmap[hr]>=0) {
9045             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9046               // dealloc old register
9047               int n;
9048               for(n=0;n<HOST_REGS;n++)
9049               {
9050                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9051               }
9052               // and alloc new one
9053               f_regmap[hr]=regs[i].regmap[hr];
9054             }
9055           }
9056           if(branch_regs[i].regmap[hr]>64) {
9057             if(!((branch_regs[i].dirty>>hr)&1))
9058               f_regmap[hr]=branch_regs[i].regmap[hr];
9059             else f_regmap[hr]=-1;
9060           }
9061           else if(branch_regs[i].regmap[hr]>=0) {
9062             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
9063               // dealloc old register
9064               int n;
9065               for(n=0;n<HOST_REGS;n++)
9066               {
9067                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
9068               }
9069               // and alloc new one
9070               f_regmap[hr]=branch_regs[i].regmap[hr];
9071             }
9072           }
9073           if(ooo[i]) {
9074             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
9075               f_regmap[hr]=branch_regs[i].regmap[hr];
9076           }else{
9077             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
9078               f_regmap[hr]=branch_regs[i].regmap[hr];
9079           }
9080           // Avoid dirty->clean transition
9081           #ifdef DESTRUCTIVE_WRITEBACK
9082           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9083           #endif
9084           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
9085           // case above; however, it's always a good idea.  We can't hoist the
9086           // load if the register was already allocated, so there's no point
9087           // wasting time analyzing most of these cases.  It only "succeeds"
9088           // when the mapping was different and the load can be replaced with
9089           // a mov, which is of negligible benefit.  So such cases are
9090           // skipped below.
9091           if(f_regmap[hr]>0) {
9092             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
9093               int r=f_regmap[hr];
9094               for(j=t;j<=i;j++)
9095               {
9096                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9097                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9098                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9099                 if(r>63) {
9100                   // NB This can exclude the case where the upper-half
9101                   // register is lower numbered than the lower-half
9102                   // register.  Not sure if it's worth fixing...
9103                   if(get_reg(regs[j].regmap,r&63)<0) break;
9104                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
9105                   if(regs[j].is32&(1LL<<(r&63))) break;
9106                 }
9107                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9108                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9109                   int k;
9110                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9111                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9112                     if(r>63) {
9113                       if(get_reg(regs[i].regmap,r&63)<0) break;
9114                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9115                     }
9116                     k=i;
9117                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9118                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9119                         //printf("no free regs for store %x\n",start+(k-1)*4);
9120                         break;
9121                       }
9122                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9123                         //printf("no-match due to different register\n");
9124                         break;
9125                       }
9126                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9127                         //printf("no-match due to branch\n");
9128                         break;
9129                       }
9130                       // call/ret fast path assumes no registers allocated
9131                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
9132                         break;
9133                       }
9134                       if(r>63) {
9135                         // NB This can exclude the case where the upper-half
9136                         // register is lower numbered than the lower-half
9137                         // register.  Not sure if it's worth fixing...
9138                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9139                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9140                       }
9141                       k--;
9142                     }
9143                     if(i<slen-1) {
9144                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9145                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9146                         //printf("bad match after branch\n");
9147                         break;
9148                       }
9149                     }
9150                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9151                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
9152                       while(k<i) {
9153                         regs[k].regmap_entry[hr]=f_regmap[hr];
9154                         regs[k].regmap[hr]=f_regmap[hr];
9155                         regmap_pre[k+1][hr]=f_regmap[hr];
9156                         regs[k].wasdirty&=~(1<<hr);
9157                         regs[k].dirty&=~(1<<hr);
9158                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9159                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9160                         regs[k].wasconst&=~(1<<hr);
9161                         regs[k].isconst&=~(1<<hr);
9162                         k++;
9163                       }
9164                     }
9165                     else {
9166                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9167                       break;
9168                     }
9169                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9170                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9171                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9172                       regs[i].regmap_entry[hr]=f_regmap[hr];
9173                       regs[i].regmap[hr]=f_regmap[hr];
9174                       regs[i].wasdirty&=~(1<<hr);
9175                       regs[i].dirty&=~(1<<hr);
9176                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9177                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9178                       regs[i].wasconst&=~(1<<hr);
9179                       regs[i].isconst&=~(1<<hr);
9180                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9181                       branch_regs[i].wasdirty&=~(1<<hr);
9182                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9183                       branch_regs[i].regmap[hr]=f_regmap[hr];
9184                       branch_regs[i].dirty&=~(1<<hr);
9185                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9186                       branch_regs[i].wasconst&=~(1<<hr);
9187                       branch_regs[i].isconst&=~(1<<hr);
9188                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9189                         regmap_pre[i+2][hr]=f_regmap[hr];
9190                         regs[i+2].wasdirty&=~(1<<hr);
9191                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9192                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9193                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9194                       }
9195                     }
9196                   }
9197                   for(k=t;k<j;k++) {
9198                     // Alloc register clean at beginning of loop,
9199                     // but may dirty it in pass 6
9200                     regs[k].regmap_entry[hr]=f_regmap[hr];
9201                     regs[k].regmap[hr]=f_regmap[hr];
9202                     regs[k].dirty&=~(1<<hr);
9203                     regs[k].wasconst&=~(1<<hr);
9204                     regs[k].isconst&=~(1<<hr);
9205                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
9206                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
9207                       branch_regs[k].regmap[hr]=f_regmap[hr];
9208                       branch_regs[k].dirty&=~(1<<hr);
9209                       branch_regs[k].wasconst&=~(1<<hr);
9210                       branch_regs[k].isconst&=~(1<<hr);
9211                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
9212                         regmap_pre[k+2][hr]=f_regmap[hr];
9213                         regs[k+2].wasdirty&=~(1<<hr);
9214                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
9215                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
9216                       }
9217                     }
9218                     else
9219                     {
9220                       regmap_pre[k+1][hr]=f_regmap[hr];
9221                       regs[k+1].wasdirty&=~(1<<hr);
9222                     }
9223                   }
9224                   if(regs[j].regmap[hr]==f_regmap[hr])
9225                     regs[j].regmap_entry[hr]=f_regmap[hr];
9226                   break;
9227                 }
9228                 if(j==i) break;
9229                 if(regs[j].regmap[hr]>=0)
9230                   break;
9231                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
9232                   //printf("no-match due to different register\n");
9233                   break;
9234                 }
9235                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
9236                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
9237                   break;
9238                 }
9239                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9240                 {
9241                   // Stop on unconditional branch
9242                   break;
9243                 }
9244                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
9245                 {
9246                   if(ooo[j]) {
9247                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
9248                       break;
9249                   }else{
9250                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
9251                       break;
9252                   }
9253                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
9254                     //printf("no-match due to different register (branch)\n");
9255                     break;
9256                   }
9257                 }
9258                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9259                   //printf("No free regs for store %x\n",start+j*4);
9260                   break;
9261                 }
9262                 if(f_regmap[hr]>=64) {
9263                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
9264                     break;
9265                   }
9266                   else
9267                   {
9268                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
9269                       break;
9270                     }
9271                   }
9272                 }
9273               }
9274             }
9275           }
9276         }
9277       }
9278     }else{
9279       // Non branch or undetermined branch target
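      // Remember which MIPS register each host register currently holds in
      // f_regmap, so that when a backward branch is reached later, the
      // matching code above can try to keep these mappings live across the
      // whole loop instead of reloading them at the branch target.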
9280       for(hr=0;hr<HOST_REGS;hr++)
9281       {
9282         if(hr!=EXCLUDE_REG) {
9283           if(regs[i].regmap[hr]>64) {
9284             if(!((regs[i].dirty>>hr)&1))
9285               f_regmap[hr]=regs[i].regmap[hr];
9286           }
9287           else if(regs[i].regmap[hr]>=0) {
9288             if(f_regmap[hr]!=regs[i].regmap[hr]) {
9289               // dealloc old register
9290               int n;
9291               for(n=0;n<HOST_REGS;n++)
9292               {
9293                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
9294               }
9295               // and alloc new one
9296               f_regmap[hr]=regs[i].regmap[hr];
9297             }
9298           }
9299         }
9300       }
9301       // Try to restore cycle count at branch targets
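      // Look ahead for the next point where the cycle counter is mapped;
      // if CCREG survives in HOST_CCREG until then, keep it allocated
      // through this branch target rather than spilling and reloading it.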
9302       if(bt[i]) {
9303         for(j=i;j<slen-1;j++) {
9304           if(regs[j].regmap[HOST_CCREG]!=-1) break;
9305           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
9306             //printf("no free regs for store %x\n",start+j*4);
9307             break;
9308           }
9309         }
9310         if(regs[j].regmap[HOST_CCREG]==CCREG) {
9311           int k=i;
9312           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
9313           while(k<j) {
9314             regs[k].regmap_entry[HOST_CCREG]=CCREG;
9315             regs[k].regmap[HOST_CCREG]=CCREG;
9316             regmap_pre[k+1][HOST_CCREG]=CCREG;
9317             regs[k+1].wasdirty|=1<<HOST_CCREG;
9318             regs[k].dirty|=1<<HOST_CCREG;
9319             regs[k].wasconst&=~(1<<HOST_CCREG);
9320             regs[k].isconst&=~(1<<HOST_CCREG);
9321             k++;
9322           }
9323           regs[j].regmap_entry[HOST_CCREG]=CCREG;
9324         }
9325         // Work backwards from the branch target
9326         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9327         {
9328           //printf("Extend backwards\n");
9329           int k;
9330           k=i;
9331           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9332             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9333               //printf("no free regs for store %x\n",start+(k-1)*4);
9334               break;
9335             }
9336             k--;
9337           }
9338           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9339             //printf("Extend CC, %x ->\n",start+k*4);
9340             while(k<=i) {
9341               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9342               regs[k].regmap[HOST_CCREG]=CCREG;
9343               regmap_pre[k+1][HOST_CCREG]=CCREG;
9344               regs[k+1].wasdirty|=1<<HOST_CCREG;
9345               regs[k].dirty|=1<<HOST_CCREG;
9346               regs[k].wasconst&=~(1<<HOST_CCREG);
9347               regs[k].isconst&=~(1<<HOST_CCREG);
9348               k++;
9349             }
9350           }
9351           else {
9352             //printf("Fail Extend CC, %x ->\n",start+k*4);
9353           }
9354         }
9355       }
9356       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9357          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9358          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9359          itype[i]!=FCONV&&itype[i]!=FCOMP)
9360       {
9361         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9362       }
9363     }
9364   }
9365
9366   // Cache memory offset or tlb map pointer if a register is available
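  // Scan forward from each load/store and score how many nearby memory
  // accesses could share a host register holding the memory offset (ROREG);
  // the highest-scoring register is then pre-allocated over that range,
  // with loop-invariant uses at backward branch targets counting extra.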
9367   #ifndef HOST_IMM_ADDR32
9368   #ifndef RAM_OFFSET
9369   if(0)
9370   #endif
9371   {
9372     int earliest_available[HOST_REGS];
9373     int loop_start[HOST_REGS];
9374     int score[HOST_REGS];
9375     int end[HOST_REGS];
9376     int reg=ROREG;
9377
9378     // Init
9379     for(hr=0;hr<HOST_REGS;hr++) {
9380       score[hr]=0;earliest_available[hr]=0;
9381       loop_start[hr]=MAXBLOCK;
9382     }
9383     for(i=0;i<slen-1;i++)
9384     {
9385       // Can't do anything if no registers are available
9386       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
9387         for(hr=0;hr<HOST_REGS;hr++) {
9388           score[hr]=0;earliest_available[hr]=i+1;
9389           loop_start[hr]=MAXBLOCK;
9390         }
9391       }
9392       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9393         if(!ooo[i]) {
9394           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
9395             for(hr=0;hr<HOST_REGS;hr++) {
9396               score[hr]=0;earliest_available[hr]=i+1;
9397               loop_start[hr]=MAXBLOCK;
9398             }
9399           }
9400         }else{
9401           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
9402             for(hr=0;hr<HOST_REGS;hr++) {
9403               score[hr]=0;earliest_available[hr]=i+1;
9404               loop_start[hr]=MAXBLOCK;
9405             }
9406           }
9407         }
9408       }
9409       // Mark unavailable registers
9410       for(hr=0;hr<HOST_REGS;hr++) {
9411         if(regs[i].regmap[hr]>=0) {
9412           score[hr]=0;earliest_available[hr]=i+1;
9413           loop_start[hr]=MAXBLOCK;
9414         }
9415         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9416           if(branch_regs[i].regmap[hr]>=0) {
9417             score[hr]=0;earliest_available[hr]=i+2;
9418             loop_start[hr]=MAXBLOCK;
9419           }
9420         }
9421       }
9422       // No register allocations after unconditional jumps
9423       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
9424       {
9425         for(hr=0;hr<HOST_REGS;hr++) {
9426           score[hr]=0;earliest_available[hr]=i+2;
9427           loop_start[hr]=MAXBLOCK;
9428         }
9429         i++; // Skip delay slot too
9430         //printf("skip delay slot: %x\n",start+i*4);
9431       }
9432       else
9433       // Possible match
9434       if(itype[i]==LOAD||itype[i]==LOADLR||
9435          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
9436         for(hr=0;hr<HOST_REGS;hr++) {
9437           if(hr!=EXCLUDE_REG) {
9438             end[hr]=i-1;
9439             for(j=i;j<slen-1;j++) {
9440               if(regs[j].regmap[hr]>=0) break;
9441               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9442                 if(branch_regs[j].regmap[hr]>=0) break;
9443                 if(ooo[j]) {
9444                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
9445                 }else{
9446                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
9447                 }
9448               }
9449               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
9450               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9451                 int t=(ba[j]-start)>>2;
9452                 if(t<j&&t>=earliest_available[hr]) {
9453                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
9454                     // Score a point for hoisting loop invariant
9455                     if(t<loop_start[hr]) loop_start[hr]=t;
9456                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
9457                     score[hr]++;
9458                     end[hr]=j;
9459                   }
9460                 }
9461                 else if(t<j) {
9462                   if(regs[t].regmap[hr]==reg) {
9463                     // Score a point if the branch target matches this register
9464                     score[hr]++;
9465                     end[hr]=j;
9466                   }
9467                 }
9468                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
9469                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
9470                   score[hr]++;
9471                   end[hr]=j;
9472                 }
9473               }
9474               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
9475               {
9476                 // Stop on unconditional branch
9477                 break;
9478               }
9479               else
9480               if(itype[j]==LOAD||itype[j]==LOADLR||
9481                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
9482                 score[hr]++;
9483                 end[hr]=j;
9484               }
9485             }
9486           }
9487         }
9488         // Find highest score and allocate that register
9489         int maxscore=0;
9490         for(hr=0;hr<HOST_REGS;hr++) {
9491           if(hr!=EXCLUDE_REG) {
9492             if(score[hr]>score[maxscore]) {
9493               maxscore=hr;
9494               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
9495             }
9496           }
9497         }
9498         if(score[maxscore]>1)
9499         {
9500           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
9501           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
9502             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
9503             assert(regs[j].regmap[maxscore]<0);
9504             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
9505             regs[j].regmap[maxscore]=reg;
9506             regs[j].dirty&=~(1<<maxscore);
9507             regs[j].wasconst&=~(1<<maxscore);
9508             regs[j].isconst&=~(1<<maxscore);
9509             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
9510               branch_regs[j].regmap[maxscore]=reg;
9511               branch_regs[j].wasdirty&=~(1<<maxscore);
9512               branch_regs[j].dirty&=~(1<<maxscore);
9513               branch_regs[j].wasconst&=~(1<<maxscore);
9514               branch_regs[j].isconst&=~(1<<maxscore);
9515               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
9516                 regmap_pre[j+2][maxscore]=reg;
9517                 regs[j+2].wasdirty&=~(1<<maxscore);
9518               }
9519               // loop optimization (loop_preload)
9520               int t=(ba[j]-start)>>2;
9521               if(t==loop_start[maxscore]) {
9522                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
9523                   regs[t].regmap_entry[maxscore]=reg;
9524               }
9525             }
9526             else
9527             {
9528               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
9529                 regmap_pre[j+1][maxscore]=reg;
9530                 regs[j+1].wasdirty&=~(1<<maxscore);
9531               }
9532             }
9533           }
9534           i=j-1;
9535           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
9536           for(hr=0;hr<HOST_REGS;hr++) {
9537             score[hr]=0;earliest_available[hr]=i+1;
9538             loop_start[hr]=MAXBLOCK;
9539           }
9540         }
9541       }
9542     }
9543   }
9544   #endif
9545
9546   // This allocates registers (if possible) one instruction prior
9547   // to use, which can avoid a load-use penalty on certain CPUs.
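  // Concretely: if the next instruction's source already has a host
  // register assigned and that register is free here, copy the mapping
  // back one instruction so the value is loaded a cycle earlier.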
9548   for(i=0;i<slen-1;i++)
9549   {
9550     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9551     {
9552       if(!bt[i+1])
9553       {
9554         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
9555            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
9556         {
9557           if(rs1[i+1]) {
9558             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9559             {
9560               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9561               {
9562                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9563                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9564                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9565                 regs[i].isconst&=~(1<<hr);
9566                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9567                 constmap[i][hr]=constmap[i+1][hr];
9568                 regs[i+1].wasdirty&=~(1<<hr);
9569                 regs[i].dirty&=~(1<<hr);
9570               }
9571             }
9572           }
9573           if(rs2[i+1]) {
9574             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
9575             {
9576               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9577               {
9578                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9579                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9580                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9581                 regs[i].isconst&=~(1<<hr);
9582                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9583                 constmap[i][hr]=constmap[i+1][hr];
9584                 regs[i+1].wasdirty&=~(1<<hr);
9585                 regs[i].dirty&=~(1<<hr);
9586               }
9587             }
9588           }
9589           // Preload target address for load instruction (non-constant)
9590           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9591             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9592             {
9593               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9594               {
9595                 regs[i].regmap[hr]=rs1[i+1];
9596                 regmap_pre[i+1][hr]=rs1[i+1];
9597                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9598                 regs[i].isconst&=~(1<<hr);
9599                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9600                 constmap[i][hr]=constmap[i+1][hr];
9601                 regs[i+1].wasdirty&=~(1<<hr);
9602                 regs[i].dirty&=~(1<<hr);
9603               }
9604             }
9605           }
9606           // Load source into target register
9607           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9608             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9609             {
9610               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9611               {
9612                 regs[i].regmap[hr]=rs1[i+1];
9613                 regmap_pre[i+1][hr]=rs1[i+1];
9614                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9615                 regs[i].isconst&=~(1<<hr);
9616                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9617                 constmap[i][hr]=constmap[i+1][hr];
9618                 regs[i+1].wasdirty&=~(1<<hr);
9619                 regs[i].dirty&=~(1<<hr);
9620               }
9621             }
9622           }
9623           // Address for store instruction (non-constant)
9624           if(itype[i+1]==STORE||itype[i+1]==STORELR
9625              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
9626             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9627               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
9628               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9629               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
9630               assert(hr>=0);
9631               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9632               {
9633                 regs[i].regmap[hr]=rs1[i+1];
9634                 regmap_pre[i+1][hr]=rs1[i+1];
9635                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9636                 regs[i].isconst&=~(1<<hr);
9637                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9638                 constmap[i][hr]=constmap[i+1][hr];
9639                 regs[i+1].wasdirty&=~(1<<hr);
9640                 regs[i].dirty&=~(1<<hr);
9641               }
9642             }
9643           }
9644           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
9645             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9646               int nr;
9647               hr=get_reg(regs[i+1].regmap,FTEMP);
9648               assert(hr>=0);
9649               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9650               {
9651                 regs[i].regmap[hr]=rs1[i+1];
9652                 regmap_pre[i+1][hr]=rs1[i+1];
9653                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9654                 regs[i].isconst&=~(1<<hr);
9655                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9656                 constmap[i][hr]=constmap[i+1][hr];
9657                 regs[i+1].wasdirty&=~(1<<hr);
9658                 regs[i].dirty&=~(1<<hr);
9659               }
9660               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9661               {
9662                 // move it to another register
9663                 regs[i+1].regmap[hr]=-1;
9664                 regmap_pre[i+2][hr]=-1;
9665                 regs[i+1].regmap[nr]=FTEMP;
9666                 regmap_pre[i+2][nr]=FTEMP;
9667                 regs[i].regmap[nr]=rs1[i+1];
9668                 regmap_pre[i+1][nr]=rs1[i+1];
9669                 regs[i+1].regmap_entry[nr]=rs1[i+1];
9670                 regs[i].isconst&=~(1<<nr);
9671                 regs[i+1].isconst&=~(1<<nr);
9672                 regs[i].dirty&=~(1<<nr);
9673                 regs[i+1].wasdirty&=~(1<<nr);
9674                 regs[i+1].dirty&=~(1<<nr);
9675                 regs[i+2].wasdirty&=~(1<<nr);
9676               }
9677             }
9678           }
9679           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
9680             if(itype[i+1]==LOAD)
9681               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
9682             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
9683               hr=get_reg(regs[i+1].regmap,FTEMP);
9684             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
9685               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
9686               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9687             }
9688             if(hr>=0&&regs[i].regmap[hr]<0) {
9689               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
9690               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
9691                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
9692                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
9693                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
9694                 regs[i].isconst&=~(1<<hr);
9695                 regs[i+1].wasdirty&=~(1<<hr);
9696                 regs[i].dirty&=~(1<<hr);
9697               }
9698             }
9699           }
9700         }
9701       }
9702     }
9703   }
9704
9705   /* Pass 6 - Optimize clean/dirty state */
9706   clean_registers(0,slen-1,1);
9707
9708   /* Pass 7 - Identify 32-bit registers */
9709   for (i=slen-1;i>=0;i--)
9710   {
9711     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9712     {
9713       // Conditional branch
9714       if((source[i]>>16)!=0x1000&&i<slen-2) {
9715         // Mark this address as a branch target since it may be called
9716         // upon return from interrupt
9717         bt[i+2]=1;
9718       }
9719     }
9720   }
9721
9722   if(itype[slen-1]==SPAN) {
9723     bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
9724   }
9725
9726 #ifdef DISASM
9727   /* Debug/disassembly */
9728   for(i=0;i<slen;i++)
9729   {
9730     printf("U:");
9731     int r;
9732     for(r=1;r<=CCREG;r++) {
9733       if((unneeded_reg[i]>>r)&1) {
9734         if(r==HIREG) printf(" HI");
9735         else if(r==LOREG) printf(" LO");
9736         else printf(" r%d",r);
9737       }
9738     }
9739     printf("\n");
9740     #if defined(__i386__) || defined(__x86_64__)
9741     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
9742     #endif
9743     #ifdef __arm__
9744     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
9745     #endif
9746     printf("needs: ");
9747     if(needed_reg[i]&1) printf("eax ");
9748     if((needed_reg[i]>>1)&1) printf("ecx ");
9749     if((needed_reg[i]>>2)&1) printf("edx ");
9750     if((needed_reg[i]>>3)&1) printf("ebx ");
9751     if((needed_reg[i]>>5)&1) printf("ebp ");
9752     if((needed_reg[i]>>6)&1) printf("esi ");
9753     if((needed_reg[i]>>7)&1) printf("edi ");
9754     printf("\n");
9755     #if defined(__i386__) || defined(__x86_64__)
9756     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
9757     printf("dirty: ");
9758     if(regs[i].wasdirty&1) printf("eax ");
9759     if((regs[i].wasdirty>>1)&1) printf("ecx ");
9760     if((regs[i].wasdirty>>2)&1) printf("edx ");
9761     if((regs[i].wasdirty>>3)&1) printf("ebx ");
9762     if((regs[i].wasdirty>>5)&1) printf("ebp ");
9763     if((regs[i].wasdirty>>6)&1) printf("esi ");
9764     if((regs[i].wasdirty>>7)&1) printf("edi ");
9765     #endif
9766     #ifdef __arm__
9767     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
9768     printf("dirty: ");
9769     if(regs[i].wasdirty&1) printf("r0 ");
9770     if((regs[i].wasdirty>>1)&1) printf("r1 ");
9771     if((regs[i].wasdirty>>2)&1) printf("r2 ");
9772     if((regs[i].wasdirty>>3)&1) printf("r3 ");
9773     if((regs[i].wasdirty>>4)&1) printf("r4 ");
9774     if((regs[i].wasdirty>>5)&1) printf("r5 ");
9775     if((regs[i].wasdirty>>6)&1) printf("r6 ");
9776     if((regs[i].wasdirty>>7)&1) printf("r7 ");
9777     if((regs[i].wasdirty>>8)&1) printf("r8 ");
9778     if((regs[i].wasdirty>>9)&1) printf("r9 ");
9779     if((regs[i].wasdirty>>10)&1) printf("r10 ");
9780     if((regs[i].wasdirty>>12)&1) printf("r12 ");
9781     #endif
9782     printf("\n");
9783     disassemble_inst(i);
9784     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
9785     #if defined(__i386__) || defined(__x86_64__)
9786     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
9787     if(regs[i].dirty&1) printf("eax ");
9788     if((regs[i].dirty>>1)&1) printf("ecx ");
9789     if((regs[i].dirty>>2)&1) printf("edx ");
9790     if((regs[i].dirty>>3)&1) printf("ebx ");
9791     if((regs[i].dirty>>5)&1) printf("ebp ");
9792     if((regs[i].dirty>>6)&1) printf("esi ");
9793     if((regs[i].dirty>>7)&1) printf("edi ");
9794     #endif
9795     #ifdef __arm__
9796     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
9797     if(regs[i].dirty&1) printf("r0 ");
9798     if((regs[i].dirty>>1)&1) printf("r1 ");
9799     if((regs[i].dirty>>2)&1) printf("r2 ");
9800     if((regs[i].dirty>>3)&1) printf("r3 ");
9801     if((regs[i].dirty>>4)&1) printf("r4 ");
9802     if((regs[i].dirty>>5)&1) printf("r5 ");
9803     if((regs[i].dirty>>6)&1) printf("r6 ");
9804     if((regs[i].dirty>>7)&1) printf("r7 ");
9805     if((regs[i].dirty>>8)&1) printf("r8 ");
9806     if((regs[i].dirty>>9)&1) printf("r9 ");
9807     if((regs[i].dirty>>10)&1) printf("r10 ");
9808     if((regs[i].dirty>>12)&1) printf("r12 ");
9809     #endif
9810     printf("\n");
9811     if(regs[i].isconst) {
9812       printf("constants: ");
9813       #if defined(__i386__) || defined(__x86_64__)
9814       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
9815       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
9816       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
9817       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
9818       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
9819       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
9820       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
9821       #endif
9822       #ifdef __arm__
9823       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
9824       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
9825       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
9826       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
9827       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
9828       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
9829       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
9830       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
9831       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
9832       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
9833       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
9834       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
9835       #endif
9836       printf("\n");
9837     }
9838     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9839       #if defined(__i386__) || defined(__x86_64__)
9840       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
9841       if(branch_regs[i].dirty&1) printf("eax ");
9842       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
9843       if((branch_regs[i].dirty>>2)&1) printf("edx ");
9844       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
9845       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
9846       if((branch_regs[i].dirty>>6)&1) printf("esi ");
9847       if((branch_regs[i].dirty>>7)&1) printf("edi ");
9848       #endif
9849       #ifdef __arm__
9850       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
9851       if(branch_regs[i].dirty&1) printf("r0 ");
9852       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
9853       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
9854       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
9855       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
9856       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
9857       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
9858       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
9859       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
9860       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
9861       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
9862       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
9863       #endif
9864     }
9865   }
9866 #endif // DISASM
9867
9868   /* Pass 8 - Assembly */
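  // For each instruction: write back or invalidate stale registers, load
  // the registers and constants required at entry, then dispatch on itype
  // to the per-instruction assembler routines.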
9869   linkcount=0;stubcount=0;
9870   ds=0;is_delayslot=0;
9871   cop1_usable=0;
9872   uint64_t is32_pre=0;
9873   u_int dirty_pre=0;
9874   u_int beginning=(u_int)out;
9875   if((u_int)addr&1) {
9876     ds=1;
9877     pagespan_ds();
9878   }
9879   u_int instr_addr0_override=0;
9880
9881   if (start == 0x80030000) {
9882     // nasty hack for fastbios thing
9883     // override block entry to this code
9884     instr_addr0_override=(u_int)out;
9885     emit_movimm(start,0);
9886     // abuse io address var as a flag that we
9887     // have already returned here once
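    // on the first entry the stored value differs from start, so we jump
    // back to the dispatcher; the second entry falls through into the block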
9888     emit_readword((int)&address,1);
9889     emit_writeword(0,(int)&pcaddr);
9890     emit_writeword(0,(int)&address);
9891     emit_cmp(0,1);
9892     emit_jne((int)new_dyna_leave);
9893   }
9894   for(i=0;i<slen;i++)
9895   {
9896     //if(ds) printf("ds: ");
9897     disassemble_inst(i);
9898     if(ds) {
9899       ds=0; // Skip delay slot
9900       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
9901       instr_addr[i]=0;
9902     } else {
9903       speculate_register_values(i);
9904       #ifndef DESTRUCTIVE_WRITEBACK
9905       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
9906       {
9907         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
9908               unneeded_reg[i],unneeded_reg_upper[i]);
9909       }
9910       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
9911         is32_pre=branch_regs[i].is32;
9912         dirty_pre=branch_regs[i].dirty;
9913       }else{
9914         is32_pre=regs[i].is32;
9915         dirty_pre=regs[i].dirty;
9916       }
9917       #endif
9918       // write back
9919       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
9920       {
9921         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
9922                       unneeded_reg[i],unneeded_reg_upper[i]);
9923         loop_preload(regmap_pre[i],regs[i].regmap_entry);
9924       }
9925       // branch target entry point
9926       instr_addr[i]=(u_int)out;
9927       assem_debug("<->\n");
9928       // load regs
9929       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
9930         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
9931       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
9932       address_generation(i,&regs[i],regs[i].regmap_entry);
9933       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
9934       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9935       {
9936         // Load the delay slot registers if necessary
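        // (only sources the branch itself hasn't already loaded and does
        //  not overwrite, so nothing is loaded twice or clobbered)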
9937         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
9938           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
9939         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
9940           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
9941         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
9942           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
9943       }
9944       else if(i+1<slen)
9945       {
9946         // Preload registers for following instruction
9947         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
9948           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
9949             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
9950         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
9951           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
9952             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
9953       }
9954       // TODO: if(is_ooo(i)) address_generation(i+1);
9955       if(itype[i]==CJUMP||itype[i]==FJUMP)
9956         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
9957       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
9958         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
9959       if(bt[i]) cop1_usable=0;
9960       // assemble
9961       switch(itype[i]) {
9962         case ALU:
9963           alu_assemble(i,&regs[i]);break;
9964         case IMM16:
9965           imm16_assemble(i,&regs[i]);break;
9966         case SHIFT:
9967           shift_assemble(i,&regs[i]);break;
9968         case SHIFTIMM:
9969           shiftimm_assemble(i,&regs[i]);break;
9970         case LOAD:
9971           load_assemble(i,&regs[i]);break;
9972         case LOADLR:
9973           loadlr_assemble(i,&regs[i]);break;
9974         case STORE:
9975           store_assemble(i,&regs[i]);break;
9976         case STORELR:
9977           storelr_assemble(i,&regs[i]);break;
9978         case COP0:
9979           cop0_assemble(i,&regs[i]);break;
9980         case COP1:
9981           cop1_assemble(i,&regs[i]);break;
9982         case C1LS:
9983           c1ls_assemble(i,&regs[i]);break;
9984         case COP2:
9985           cop2_assemble(i,&regs[i]);break;
9986         case C2LS:
9987           c2ls_assemble(i,&regs[i]);break;
9988         case C2OP:
9989           c2op_assemble(i,&regs[i]);break;
9990         case FCONV:
9991           fconv_assemble(i,&regs[i]);break;
9992         case FLOAT:
9993           float_assemble(i,&regs[i]);break;
9994         case FCOMP:
9995           fcomp_assemble(i,&regs[i]);break;
9996         case MULTDIV:
9997           multdiv_assemble(i,&regs[i]);break;
9998         case MOV:
9999           mov_assemble(i,&regs[i]);break;
10000         case SYSCALL:
10001           syscall_assemble(i,&regs[i]);break;
10002         case HLECALL:
10003           hlecall_assemble(i,&regs[i]);break;
10004         case INTCALL:
10005           intcall_assemble(i,&regs[i]);break;
10006         case UJUMP:
10007           ujump_assemble(i,&regs[i]);ds=1;break;
10008         case RJUMP:
10009           rjump_assemble(i,&regs[i]);ds=1;break;
10010         case CJUMP:
10011           cjump_assemble(i,&regs[i]);ds=1;break;
10012         case SJUMP:
10013           sjump_assemble(i,&regs[i]);ds=1;break;
10014         case FJUMP:
10015           fjump_assemble(i,&regs[i]);ds=1;break;
10016         case SPAN:
10017           pagespan_assemble(i,&regs[i]);break;
10018       }
10019       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10020         literal_pool(1024);
10021       else
10022         literal_pool_jumpover(256);
10023     }
10024   }
10025   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10026   // If the block did not end with an unconditional branch,
10027   // add a jump to the next instruction.
10028   if(i>1) {
10029     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10030       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10031       assert(i==slen);
10032       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10033         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10034         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10035           emit_loadreg(CCREG,HOST_CCREG);
10036         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10037       }
10038       else if(!likely[i-2])
10039       {
10040         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10041         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10042       }
10043       else
10044       {
10045         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10046         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10047       }
10048       add_to_linker((int)out,start+i*4,0);
10049       emit_jmp(0);
10050     }
10051   }
10052   else
10053   {
10054     assert(i>0);
10055     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10056     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10057     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10058       emit_loadreg(CCREG,HOST_CCREG);
10059     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
10060     add_to_linker((int)out,start+i*4,0);
10061     emit_jmp(0);
10062   }
10063
10064   // TODO: delay slot stubs?
10065   // Stubs
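  // Out-of-line code emitted after the main block: slow paths for memory
  // accesses, cycle count / exception checks, invalid-code checks, FPU
  // unusable exceptions and unaligned stores.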
10066   for(i=0;i<stubcount;i++)
10067   {
10068     switch(stubs[i][0])
10069     {
10070       case LOADB_STUB:
10071       case LOADH_STUB:
10072       case LOADW_STUB:
10073       case LOADD_STUB:
10074       case LOADBU_STUB:
10075       case LOADHU_STUB:
10076         do_readstub(i);break;
10077       case STOREB_STUB:
10078       case STOREH_STUB:
10079       case STOREW_STUB:
10080       case STORED_STUB:
10081         do_writestub(i);break;
10082       case CC_STUB:
10083         do_ccstub(i);break;
10084       case INVCODE_STUB:
10085         do_invstub(i);break;
10086       case FP_STUB:
10087         do_cop1stub(i);break;
10088       case STORELR_STUB:
10089         do_unalignedwritestub(i);break;
10090     }
10091   }
10092
10093   if (instr_addr0_override)
10094     instr_addr[0] = instr_addr0_override;
10095
10096   /* Pass 9 - Linker */
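  // Resolve branches recorded by add_to_linker(): internal branches are
  // patched straight to the target instruction, external branches to an
  // already-compiled block if check_addr() finds one, otherwise to a
  // trampoline that compiles the target on first use.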
10097   for(i=0;i<linkcount;i++)
10098   {
10099     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10100     literal_pool(64);
10101     if(!link_addr[i][2])
10102     {
10103       void *stub=out;
10104       void *addr=check_addr(link_addr[i][1]);
10105       emit_extjump(link_addr[i][0],link_addr[i][1]);
10106       if(addr) {
10107         set_jump_target(link_addr[i][0],(int)addr);
10108         add_link(link_addr[i][1],stub);
10109       }
10110       else set_jump_target(link_addr[i][0],(int)stub);
10111     }
10112     else
10113     {
10114       // Internal branch
10115       int target=(link_addr[i][1]-start)>>2;
10116       assert(target>=0&&target<slen);
10117       assert(instr_addr[target]);
10118       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10119       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
10120       //#else
10121       set_jump_target(link_addr[i][0],instr_addr[target]);
10122       //#endif
10123     }
10124   }
10125   // External Branch Targets (jump_in)
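  // Register an entry point for the block start and every branch target:
  // a dirty-check stub is added to jump_dirty, the verified entry point to
  // jump_in, and any existing hash table slot for the address is updated
  // to point at the new code.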
10126   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
10127   for(i=0;i<slen;i++)
10128   {
10129     if(bt[i]||i==0)
10130     {
10131       if(instr_addr[i]) // TODO - delay slots (=null)
10132       {
10133         u_int vaddr=start+i*4;
10134         u_int page=get_page(vaddr);
10135         u_int vpage=get_vpage(vaddr);
10136         literal_pool(256);
10137         {
10138           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10139           assem_debug("jump_in: %x\n",start+i*4);
10140           ll_add(jump_dirty+vpage,vaddr,(void *)out);
10141           int entry_point=do_dirty_stub(i);
10142           ll_add_flags(jump_in+page,vaddr,state_rflags,(void *)entry_point);
10143           // If there was an existing entry in the hash table,
10144           // replace it with the new address.
10145           // Don't add new entries.  We'll insert the
10146           // ones that actually get used in check_addr().
10147           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
10148           if(ht_bin[0]==vaddr) {
10149             ht_bin[1]=entry_point;
10150           }
10151           if(ht_bin[2]==vaddr) {
10152             ht_bin[3]=entry_point;
10153           }
10154         }
10155       }
10156     }
10157   }
10158   // Write out the literal pool if necessary
10159   literal_pool(0);
10160   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10161   // Align code
10162   if(((u_int)out)&7) emit_addnop(13);
10163   #endif
10164   assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
10165   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
10166   memcpy(copy,source,slen*4);
10167   copy+=slen*4;
10168
10169   #ifdef __arm__
10170   __clear_cache((void *)beginning,out);
10171   #endif
10172
10173   // If we're within 256K of the end of the buffer,
10174   // start over from the beginning. (Is 256K enough?)
10175   if((u_int)out>(u_int)BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
10176
10177   // Trap writes to any of the pages we compiled
10178   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
10179     invalid_code[i]=0;
10180   }
10181   inv_code_start=inv_code_end=~0;
10182
10183   // for PCSX we need to mark all mirrors too
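  // (the 2MB of RAM is mirrored at 0x00000000/0x80000000/0xa0000000, so
  //  clear the invalid_code flag for the same page in every mirror)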
10184   if(get_page(start)<(RAM_SIZE>>12))
10185     for(i=start>>12;i<=(start+slen*4)>>12;i++)
10186       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
10187       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
10188       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
10189
10190   /* Pass 10 - Free memory by expiring oldest blocks */
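  // The translation cache is treated as a ring buffer: expirep sweeps ahead
  // of the output pointer, dividing the cache into eight regions and
  // removing jump_in/jump_dirty entries, stale jump_out links and hash
  // table entries that point into the region about to be overwritten.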
10191
10192   int end=((((int)out-(int)BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
10193   while(expirep!=end)
10194   {
10195     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
10196     int base=(int)BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
10197     inv_debug("EXP: Phase %d\n",expirep);
10198     switch((expirep>>11)&3)
10199     {
10200       case 0:
10201         // Clear jump_in and jump_dirty
10202         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
10203         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
10204         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
10205         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
10206         break;
10207       case 1:
10208         // Clear pointers
10209         ll_kill_pointers(jump_out[expirep&2047],base,shift);
10210         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
10211         break;
10212       case 2:
10213         // Clear hash table
10214         for(i=0;i<32;i++) {
10215           int *ht_bin=hash_table[((expirep&2047)<<5)+i];
10216           if((ht_bin[3]>>shift)==(base>>shift) ||
10217              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10218             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
10219             ht_bin[2]=ht_bin[3]=-1;
10220           }
10221           if((ht_bin[1]>>shift)==(base>>shift) ||
10222              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10223             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
10224             ht_bin[0]=ht_bin[2];
10225             ht_bin[1]=ht_bin[3];
10226             ht_bin[2]=ht_bin[3]=-1;
10227           }
10228         }
10229         break;
10230       case 3:
10231         // Clear jump_out
10232         #ifdef __arm__
10233         if((expirep&2047)==0)
10234           do_clear_cache();
10235         #endif
10236         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
10237         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
10238         break;
10239     }
10240     expirep=(expirep+1)&65535;
10241   }
10242   return 0;
10243 }
10244
10245 // vim:shiftwidth=2:expandtab