(VITA) More dynarec fixes
[pcsx_rearmed.git] / libpcsxcore/new_dynarec/new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26
27 #include "emu_if.h" //emulator interface
28
29 //#define DISASM
30 //#define assem_debug printf
31 //#define inv_debug printf
32 #define assem_debug(...)
33 #define inv_debug(...)
34
35 #ifdef __i386__
36 #include "assem_x86.h"
37 #endif
38 #ifdef __x86_64__
39 #include "assem_x64.h"
40 #endif
41 #ifdef __arm__
42 #include "assem_arm.h"
43 #endif
44
45 #ifdef __BLACKBERRY_QNX__
46 #undef __clear_cache
47 #define __clear_cache(start,end) msync(start, (size_t)((void*)end - (void*)start), MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
48 #elif defined(__MACH__)
49 #include <libkern/OSCacheControl.h>
50 #define __clear_cache mach_clear_cache
51 static void __clear_cache(void *start, void *end) {
52   size_t len = (char *)end - (char *)start;
53   sys_dcache_flush(start, len);
54   sys_icache_invalidate(start, len);
55 }
56 #elif defined(_3DS)
57 #include "3ds_utils.h"
58 #define __clear_cache(start,end) svcFlushProcessDataCache(0xFFFF8001, start, (u32)(end)-(u32)(start))
59 #elif defined(VITA)
60 #define __clear_cache vita_clear_cache
61 static void __clear_cache(void *start, void *end) {
62   size_t len = (char *)end - (char *)start;
63   int block = sceKernelFindMemBlockByAddr(start,len);
64   sceKernelSyncVMDomain(block, start, len);
65 }
66 #endif
67
68 #define MAXBLOCK 4096
69 #define MAX_OUTPUT_BLOCK_SIZE 262144
70
71 struct regstat
72 {
73   signed char regmap_entry[HOST_REGS];
74   signed char regmap[HOST_REGS];
75   uint64_t was32;                // MIPS regs holding sign-extended 32-bit values, before this insn
76   uint64_t is32;                 // same, after this insn
77   uint64_t wasdirty;             // host regs with values not yet written back, before this insn
78   uint64_t dirty;                // same, after this insn
79   uint64_t u;                    // MIPS regs whose values are not needed afterwards
80   uint64_t uu;                   // same, for the upper 32-bit halves
81   u_int wasconst;
82   u_int isconst;
83   u_int loadedconst;             // host regs that have constants loaded
84   u_int waswritten;              // MIPS regs that were used as store base before
85 };
86
87 // note: asm depends on this layout
88 struct ll_entry
89 {
90   u_int vaddr;
91   u_int reg_sv_flags;
92   void *addr;
93   struct ll_entry *next;
94 };
95
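// Per-block compilation state: start/source describe the guest code being
// compiled, and the arrays below hold per-instruction analysis results,
// indexed 0..slen-1.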
96   u_int start;
97   u_int *source;
98   char insn[MAXBLOCK][10];
99   u_char itype[MAXBLOCK];
100   u_char opcode[MAXBLOCK];
101   u_char opcode2[MAXBLOCK];
102   u_char bt[MAXBLOCK];
103   u_char rs1[MAXBLOCK];
104   u_char rs2[MAXBLOCK];
105   u_char rt1[MAXBLOCK];
106   u_char rt2[MAXBLOCK];
107   u_char us1[MAXBLOCK];
108   u_char us2[MAXBLOCK];
109   u_char dep1[MAXBLOCK];
110   u_char dep2[MAXBLOCK];
111   u_char lt1[MAXBLOCK];
112   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
113   static uint64_t gte_rt[MAXBLOCK];
114   static uint64_t gte_unneeded[MAXBLOCK];
115   static u_int smrv[32]; // speculated MIPS register values
116   static u_int smrv_strong; // mask of regs that are likely to have correct values
117   static u_int smrv_weak; // same, but somewhat less likely
118   static u_int smrv_strong_next; // same, but after current insn executes
119   static u_int smrv_weak_next;
120   int imm[MAXBLOCK];
121   u_int ba[MAXBLOCK];
122   char likely[MAXBLOCK];
123   char is_ds[MAXBLOCK];
124   char ooo[MAXBLOCK];
125   uint64_t unneeded_reg[MAXBLOCK];
126   uint64_t unneeded_reg_upper[MAXBLOCK];
127   uint64_t branch_unneeded_reg[MAXBLOCK];
128   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
129   uint64_t p32[MAXBLOCK];
130   uint64_t pr32[MAXBLOCK];
131   signed char regmap_pre[MAXBLOCK][HOST_REGS];
132   static uint64_t current_constmap[HOST_REGS];
133   static uint64_t constmap[MAXBLOCK][HOST_REGS];
134   static struct regstat regs[MAXBLOCK];
135   static struct regstat branch_regs[MAXBLOCK];
136   signed char minimum_free_regs[MAXBLOCK];
137   u_int needed_reg[MAXBLOCK];
138   uint64_t requires_32bit[MAXBLOCK];
139   u_int wont_dirty[MAXBLOCK];
140   u_int will_dirty[MAXBLOCK];
141   int ccadj[MAXBLOCK];
142   int slen;
143   u_int instr_addr[MAXBLOCK];
144   static u_int link_addr[MAXBLOCK][3];
145   int linkcount;
146   u_int stubs[MAXBLOCK*3][8];
147   int stubcount;
148   u_int literals[1024][2];
149   int literalcount;
150   int is_delayslot;
151   int cop1_usable;
152   u_char *out;
153   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
154   struct ll_entry *jump_out[4096];
155   struct ll_entry *jump_dirty[4096];
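// 2-way lookup cache: each bin stores two {vaddr, code pointer} pairs,
// in [0]/[1] (most recent) and [2]/[3] (older); see get_addr_ht().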
156   u_int hash_table[65536][4]  __attribute__((aligned(16)));
157   char shadow[1048576]  __attribute__((aligned(16)));
158   void *copy;
159   int expirep;
160 #ifndef PCSX
161   u_int using_tlb;
162 #else
163   static const u_int using_tlb=0;
164 #endif
165   int new_dynarec_did_compile;
166   int new_dynarec_hacks;
167   u_int stop_after_jal;
168 #ifndef RAM_FIXED
169   static u_int ram_offset;
170 #else
171   static const u_int ram_offset=0;
172 #endif
173   extern u_char restore_candidate[512];
174   extern int cycle_count;
175
176   /* registers that may be allocated */
177   /* 1-31 gpr */
178 #define HIREG 32 // hi
179 #define LOREG 33 // lo
180 #define FSREG 34 // FPU status (FCSR)
181 #define CSREG 35 // Coprocessor status
182 #define CCREG 36 // Cycle count
183 #define INVCP 37 // Pointer to invalid_code
184 #define MMREG 38 // Pointer to memory_map
185 #define ROREG 39 // ram offset (if rdram!=0x80000000)
186 #define TEMPREG 40
187 #define FTEMP 40 // FPU temporary register
188 #define PTEMP 41 // Prefetch temporary register
189 #define TLREG 42 // TLB mapping offset
190 #define RHASH 43 // Return address hash
191 #define RHTBL 44 // Return address hash table address
192 #define RTEMP 45 // JR/JALR address register
193 #define MAXREG 45
194 #define AGEN1 46 // Address generation temporary register
195 #define AGEN2 47 // Address generation temporary register
196 #define MGEN1 48 // Maptable address generation temporary register
197 #define MGEN2 49 // Maptable address generation temporary register
198 #define BTREG 50 // Branch target temporary register
199
200   /* instruction types */
201 #define NOP 0     // No operation
202 #define LOAD 1    // Load
203 #define STORE 2   // Store
204 #define LOADLR 3  // Unaligned load
205 #define STORELR 4 // Unaligned store
206 #define MOV 5     // Move
207 #define ALU 6     // Arithmetic/logic
208 #define MULTDIV 7 // Multiply/divide
209 #define SHIFT 8   // Shift by register
210 #define SHIFTIMM 9 // Shift by immediate
211 #define IMM16 10  // 16-bit immediate
212 #define RJUMP 11  // Unconditional jump to register
213 #define UJUMP 12  // Unconditional jump
214 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
215 #define SJUMP 14  // Conditional branch (regimm format)
216 #define COP0 15   // Coprocessor 0
217 #define COP1 16   // Coprocessor 1
218 #define C1LS 17   // Coprocessor 1 load/store
219 #define FJUMP 18  // Conditional branch (floating point)
220 #define FLOAT 19  // Floating point unit
221 #define FCONV 20  // Convert integer to float
222 #define FCOMP 21  // Floating point compare (sets FSREG)
223 #define SYSCALL 22 // SYSCALL
224 #define OTHER 23  // Other
225 #define SPAN 24   // Branch/delay slot spans 2 pages
226 #define NI 25     // Not implemented
227 #define HLECALL 26 // PCSX fake opcodes for HLE
228 #define COP2 27   // Coprocessor 2 move
229 #define C2LS 28   // Coprocessor 2 load/store
230 #define C2OP 29   // Coprocessor 2 operation
231 #define INTCALL 30 // Call interpreter to handle rare corner cases
232
233   /* stubs */
234 #define CC_STUB 1
235 #define FP_STUB 2
236 #define LOADB_STUB 3
237 #define LOADH_STUB 4
238 #define LOADW_STUB 5
239 #define LOADD_STUB 6
240 #define LOADBU_STUB 7
241 #define LOADHU_STUB 8
242 #define STOREB_STUB 9
243 #define STOREH_STUB 10
244 #define STOREW_STUB 11
245 #define STORED_STUB 12
246 #define STORELR_STUB 13
247 #define INVCODE_STUB 14
248
249   /* branch codes */
250 #define TAKEN 1
251 #define NOTTAKEN 2
252 #define NULLDS 3
253
254 // asm linkage
255 int new_recompile_block(int addr);
256 void *get_addr_ht(u_int vaddr);
257 void invalidate_block(u_int block);
258 void invalidate_addr(u_int addr);
259 void remove_hash(int vaddr);
260 void jump_vaddr();
261 void dyna_linker();
262 void dyna_linker_ds();
263 void verify_code();
264 void verify_code_vm();
265 void verify_code_ds();
266 void cc_interrupt();
267 void fp_exception();
268 void fp_exception_ds();
269 void jump_syscall();
270 void jump_syscall_hle();
271 void jump_eret();
272 void jump_hlecall();
273 void jump_intcall();
274 void new_dyna_leave();
275
276 // TLB
277 void TLBWI_new();
278 void TLBWR_new();
279 void read_nomem_new();
280 void read_nomemb_new();
281 void read_nomemh_new();
282 void read_nomemd_new();
283 void write_nomem_new();
284 void write_nomemb_new();
285 void write_nomemh_new();
286 void write_nomemd_new();
287 void write_rdram_new();
288 void write_rdramb_new();
289 void write_rdramh_new();
290 void write_rdramd_new();
291 extern u_int memory_map[1048576];
292
293 // Needed by assembler
294 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
295 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
296 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
297 void load_all_regs(signed char i_regmap[]);
298 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
299 void load_regs_entry(int t);
300 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
301
302 int tracedebug=0;
303
304 //#define DEBUG_CYCLE_COUNT 1
305
306 #define NO_CYCLE_PENALTY_THR 12
307
308 int cycle_multiplier; // 100 for 1.0
309
310 static int CLOCK_ADJUST(int x)
311 {
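  // s is +1 for x>=0 and -1 for x<0, so adding s*50 before the /100
  // rounds the scaled cycle count to the nearest integer.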
312   int s=(x>>31)|1;
313   return (x * cycle_multiplier + s * 50) / 100;
314 }
315
316 static void tlb_hacks()
317 {
318 #ifndef DISABLE_TLB
319   // Goldeneye hack
320   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
321   {
322     u_int addr;
323     int n;
324     switch (ROM_HEADER->Country_code&0xFF)
325     {
326       case 0x45: // U
327         addr=0x34b30;
328         break;
329       case 0x4A: // J
330         addr=0x34b70;
331         break;
332       case 0x50: // E
333         addr=0x329f0;
334         break;
335       default:
336         // Unknown country code
337         addr=0;
338         break;
339     }
340     u_int rom_addr=(u_int)rom;
341     #ifdef ROM_COPY
342     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
343     // in the lower 4G of memory to use this hack.  Copy it if necessary.
344     if((void *)rom>(void *)0xffffffff) {
345       munmap(ROM_COPY, 67108864);
346       if(mmap(ROM_COPY, 12582912,
347               PROT_READ | PROT_WRITE,
348               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
349               -1, 0) == MAP_FAILED) {printf("mmap() failed\n");}
350       memcpy(ROM_COPY,rom,12582912);
351       rom_addr=(u_int)ROM_COPY;
352     }
353     #endif
354     if(addr) {
355       for(n=0x7F000;n<0x80000;n++) {
356         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
357       }
358     }
359   }
360 #endif
361 }
362
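// Map a virtual address to an index into the jump_in/jump_out/jump_dirty
// page tables; pages above 2048 are folded into a shared overflow area.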
363 static u_int get_page(u_int vaddr)
364 {
365 #ifndef PCSX
366   u_int page=(vaddr^0x80000000)>>12;
367 #else
368   u_int page=vaddr&~0xe0000000;
369   if (page < 0x1000000)
370     page &= ~0x0e00000; // RAM mirrors
371   page>>=12;
372 #endif
373 #ifndef DISABLE_TLB
374   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
375 #endif
376   if(page>2048) page=2048+(page&2047);
377   return page;
378 }
379
380 #ifndef PCSX
381 static u_int get_vpage(u_int vaddr)
382 {
383   u_int vpage=(vaddr^0x80000000)>>12;
384 #ifndef DISABLE_TLB
385   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
386 #endif
387   if(vpage>2048) vpage=2048+(vpage&2047);
388   return vpage;
389 }
390 #else
391 // no virtual mem in PCSX
392 static u_int get_vpage(u_int vaddr)
393 {
394   return get_page(vaddr);
395 }
396 #endif
397
398 // Get address from virtual address
399 // This is called from the recompiled JR/JALR instructions
400 void *get_addr(u_int vaddr)
401 {
402   u_int page=get_page(vaddr);
403   u_int vpage=get_vpage(vaddr);
404   struct ll_entry *head;
405   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
406   head=jump_in[page];
407   while(head!=NULL) {
408     if(head->vaddr==vaddr) {
409   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
410       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
411       ht_bin[3]=ht_bin[1];
412       ht_bin[2]=ht_bin[0];
413       ht_bin[1]=(int)head->addr;
414       ht_bin[0]=vaddr;
415       return head->addr;
416     }
417     head=head->next;
418   }
419   head=jump_dirty[vpage];
420   while(head!=NULL) {
421     if(head->vaddr==vaddr) {
422       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
423       // Don't restore blocks which are about to expire from the cache
424       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
425       if(verify_dirty(head->addr)) {
426         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
427         invalid_code[vaddr>>12]=0;
428         inv_code_start=inv_code_end=~0;
429 #ifndef DISABLE_TLB
430         memory_map[vaddr>>12]|=0x40000000;
431 #endif
432         if(vpage<2048) {
433 #ifndef DISABLE_TLB
434           if(tlb_LUT_r[vaddr>>12]) {
435             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
436             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
437           }
438 #endif
439           restore_candidate[vpage>>3]|=1<<(vpage&7);
440         }
441         else restore_candidate[page>>3]|=1<<(page&7);
442         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
443         if(ht_bin[0]==vaddr) {
444           ht_bin[1]=(int)head->addr; // Replace existing entry
445         }
446         else
447         {
448           ht_bin[3]=ht_bin[1];
449           ht_bin[2]=ht_bin[0];
450           ht_bin[1]=(int)head->addr;
451           ht_bin[0]=vaddr;
452         }
453         return head->addr;
454       }
455     }
456     head=head->next;
457   }
458   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
459   int r=new_recompile_block(vaddr);
460   if(r==0) return get_addr(vaddr);
461   // Execute in unmapped page, generate pagefault exception
462   Status|=2;
463   Cause=(vaddr<<31)|0x8;
464   EPC=(vaddr&1)?vaddr-5:vaddr;
465   BadVAddr=(vaddr&~1);
466   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
467   EntryHi=BadVAddr&0xFFFFE000;
468   return get_addr_ht(0x80000000);
469 }
470 // Look up address in hash table first
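// The bin index ((vaddr>>16)^vaddr)&0xFFFF folds the two halfwords of the address together.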
471 void *get_addr_ht(u_int vaddr)
472 {
473   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
474   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
475   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
476   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
477   return get_addr(vaddr);
478 }
479
480 void clear_all_regs(signed char regmap[])
481 {
482   int hr;
483   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
484 }
485
486 signed char get_reg(signed char regmap[],int r)
487 {
488   int hr;
489   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
490   return -1;
491 }
492
493 // Find a register that is available for two consecutive cycles
494 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
495 {
496   int hr;
497   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
498   return -1;
499 }
500
501 int count_free_regs(signed char regmap[])
502 {
503   int count=0;
504   int hr;
505   for(hr=0;hr<HOST_REGS;hr++)
506   {
507     if(hr!=EXCLUDE_REG) {
508       if(regmap[hr]<0) count++;
509     }
510   }
511   return count;
512 }
513
514 void dirty_reg(struct regstat *cur,signed char reg)
515 {
516   int hr;
517   if(!reg) return;
518   for (hr=0;hr<HOST_REGS;hr++) {
519     if((cur->regmap[hr]&63)==reg) {
520       cur->dirty|=1<<hr;
521     }
522   }
523 }
524
525 // If we dirty the lower half of a 64 bit register which is now being
526 // sign-extended, we need to dump the upper half.
527 // Note: Do this only after completion of the instruction, because
528 // some instructions may need to read the full 64-bit value even if
529 // overwriting it (eg SLTI, DSRA32).
530 static void flush_dirty_uppers(struct regstat *cur)
531 {
532   int hr,reg;
533   for (hr=0;hr<HOST_REGS;hr++) {
534     if((cur->dirty>>hr)&1) {
535       reg=cur->regmap[hr];
536       if(reg>=64)
537         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
538     }
539   }
540 }
541
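// Constant propagation: isconst flags host registers whose value is known at
// compile time, with the value kept in current_constmap[]. regmap entries of
// the form reg|64 refer to the upper half of a 64-bit MIPS register, hence
// the (regmap[hr]^64)==reg match below.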
542 void set_const(struct regstat *cur,signed char reg,uint64_t value)
543 {
544   int hr;
545   if(!reg) return;
546   for (hr=0;hr<HOST_REGS;hr++) {
547     if(cur->regmap[hr]==reg) {
548       cur->isconst|=1<<hr;
549       current_constmap[hr]=value;
550     }
551     else if((cur->regmap[hr]^64)==reg) {
552       cur->isconst|=1<<hr;
553       current_constmap[hr]=value>>32;
554     }
555   }
556 }
557
558 void clear_const(struct regstat *cur,signed char reg)
559 {
560   int hr;
561   if(!reg) return;
562   for (hr=0;hr<HOST_REGS;hr++) {
563     if((cur->regmap[hr]&63)==reg) {
564       cur->isconst&=~(1<<hr);
565     }
566   }
567 }
568
569 int is_const(struct regstat *cur,signed char reg)
570 {
571   int hr;
572   if(reg<0) return 0;
573   if(!reg) return 1;
574   for (hr=0;hr<HOST_REGS;hr++) {
575     if((cur->regmap[hr]&63)==reg) {
576       return (cur->isconst>>hr)&1;
577     }
578   }
579   return 0;
580 }
581 uint64_t get_const(struct regstat *cur,signed char reg)
582 {
583   int hr;
584   if(!reg) return 0;
585   for (hr=0;hr<HOST_REGS;hr++) {
586     if(cur->regmap[hr]==reg) {
587       return current_constmap[hr];
588     }
589   }
590   SysPrintf("Unknown constant in r%d\n",reg);
591   exit(1);
592 }
593
594 // Least soon needed registers
595 // Look at the next ten instructions and see which registers
596 // will be used.  Try not to reallocate these.
597 void lsn(u_char hsn[], int i, int *preferred_reg)
598 {
599   int j;
600   int b=-1;
601   for(j=0;j<9;j++)
602   {
603     if(i+j>=slen) {
604       j=slen-i-1;
605       break;
606     }
607     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
608     {
609       // Don't go past an unconditional jump
610       j++;
611       break;
612     }
613   }
614   for(;j>=0;j--)
615   {
616     if(rs1[i+j]) hsn[rs1[i+j]]=j;
617     if(rs2[i+j]) hsn[rs2[i+j]]=j;
618     if(rt1[i+j]) hsn[rt1[i+j]]=j;
619     if(rt2[i+j]) hsn[rt2[i+j]]=j;
620     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
621       // Stores can allocate zero
622       hsn[rs1[i+j]]=j;
623       hsn[rs2[i+j]]=j;
624     }
625     // On some architectures stores need invc_ptr
626     #if defined(HOST_IMM8)
627     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
628       hsn[INVCP]=j;
629     }
630     #endif
631     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
632     {
633       hsn[CCREG]=j;
634       b=j;
635     }
636   }
637   if(b>=0)
638   {
639     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
640     {
641       // Follow first branch
642       int t=(ba[i+b]-start)>>2;
643       j=7-b;if(t+j>=slen) j=slen-t-1;
644       for(;j>=0;j--)
645       {
646         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
647         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
648         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
649         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
650       }
651     }
652     // TODO: preferred register based on backward branch
653   }
654   // Delay slot should preferably not overwrite branch conditions or cycle count
655   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
656     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
657     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
658     hsn[CCREG]=1;
659     // ...or hash tables
660     hsn[RHASH]=1;
661     hsn[RHTBL]=1;
662   }
663   // Coprocessor load/store needs FTEMP, even if not declared
664   if(itype[i]==C1LS||itype[i]==C2LS) {
665     hsn[FTEMP]=0;
666   }
667   // Load L/R also uses FTEMP as a temporary register
668   if(itype[i]==LOADLR) {
669     hsn[FTEMP]=0;
670   }
671   // Also SWL/SWR/SDL/SDR
672   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
673     hsn[FTEMP]=0;
674   }
675   // Don't remove the TLB registers either
676   if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS || itype[i]==C2LS) {
677     hsn[TLREG]=0;
678   }
679   // Don't remove the miniht registers
680   if(itype[i]==UJUMP||itype[i]==RJUMP)
681   {
682     hsn[RHASH]=0;
683     hsn[RHTBL]=0;
684   }
685 }
686
687 // We only want to allocate registers if we're going to use them again soon
688 int needed_again(int r, int i)
689 {
690   int j;
691   int b=-1;
692   int rn=10;
693
694   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
695   {
696     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
697       return 0; // Don't need any registers if exiting the block
698   }
699   for(j=0;j<9;j++)
700   {
701     if(i+j>=slen) {
702       j=slen-i-1;
703       break;
704     }
705     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
706     {
707       // Don't go past an unconditional jump
708       j++;
709       break;
710     }
711     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
712     {
713       break;
714     }
715   }
716   for(;j>=1;j--)
717   {
718     if(rs1[i+j]==r) rn=j;
719     if(rs2[i+j]==r) rn=j;
720     if((unneeded_reg[i+j]>>r)&1) rn=10;
721     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
722     {
723       b=j;
724     }
725   }
726   /*
727   if(b>=0)
728   {
729     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
730     {
731       // Follow first branch
732       int o=rn;
733       int t=(ba[i+b]-start)>>2;
734       j=7-b;if(t+j>=slen) j=slen-t-1;
735       for(;j>=0;j--)
736       {
737         if(!((unneeded_reg[t+j]>>r)&1)) {
738           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
739           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
740         }
741         else rn=o;
742       }
743     }
744   }*/
745   if(rn<10) return 1;
746   return 0;
747 }
748
749 // Try to match register allocations at the end of a loop with those
750 // at the beginning
751 int loop_reg(int i, int r, int hr)
752 {
753   int j,k;
754   for(j=0;j<9;j++)
755   {
756     if(i+j>=slen) {
757       j=slen-i-1;
758       break;
759     }
760     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
761     {
762       // Don't go past an unconditional jump
763       j++;
764       break;
765     }
766   }
767   k=0;
768   if(i>0){
769     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
770       k--;
771   }
772   for(;k<j;k++)
773   {
774     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
775     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
776     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
777     {
778       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
779       {
780         int t=(ba[i+k]-start)>>2;
781         int reg=get_reg(regs[t].regmap_entry,r);
782         if(reg>=0) return reg;
783         //reg=get_reg(regs[t+1].regmap_entry,r);
784         //if(reg>=0) return reg;
785       }
786     }
787   }
788   return hr;
789 }
790
791
792 // Allocate every register, preserving source/target regs
793 void alloc_all(struct regstat *cur,int i)
794 {
795   int hr;
796
797   for(hr=0;hr<HOST_REGS;hr++) {
798     if(hr!=EXCLUDE_REG) {
799       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
800          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
801       {
802         cur->regmap[hr]=-1;
803         cur->dirty&=~(1<<hr);
804       }
805       // Don't need zeros
806       if((cur->regmap[hr]&63)==0)
807       {
808         cur->regmap[hr]=-1;
809         cur->dirty&=~(1<<hr);
810       }
811     }
812   }
813 }
814
815 #ifndef FORCE32
816 void div64(int64_t dividend,int64_t divisor)
817 {
818   lo=dividend/divisor;
819   hi=dividend%divisor;
820   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
821   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
822 }
823 void divu64(uint64_t dividend,uint64_t divisor)
824 {
825   lo=dividend/divisor;
826   hi=dividend%divisor;
827   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
828   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
829 }
830
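// Signed 64x64 multiply (DMULT): build the 128-bit product from four 32x32
// partial products, apply the sign, and return the result in hi/lo.
// The parameters must be signed for the m1<0 / m2<0 tests below to work.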
831 void mult64(int64_t m1,int64_t m2)
832 {
833    unsigned long long int op1, op2, op3, op4;
834    unsigned long long int result1, result2, result3, result4;
835    unsigned long long int temp1, temp2, temp3, temp4;
836    int sign = 0;
837
838    if (m1 < 0)
839      {
840     op2 = -m1;
841     sign = 1 - sign;
842      }
843    else op2 = m1;
844    if (m2 < 0)
845      {
846     op4 = -m2;
847     sign = 1 - sign;
848      }
849    else op4 = m2;
850
851    op1 = op2 & 0xFFFFFFFF;
852    op2 = (op2 >> 32) & 0xFFFFFFFF;
853    op3 = op4 & 0xFFFFFFFF;
854    op4 = (op4 >> 32) & 0xFFFFFFFF;
855
856    temp1 = op1 * op3;
857    temp2 = (temp1 >> 32) + op1 * op4;
858    temp3 = op2 * op3;
859    temp4 = (temp3 >> 32) + op2 * op4;
860
861    result1 = temp1 & 0xFFFFFFFF;
862    result2 = temp2 + (temp3 & 0xFFFFFFFF);
863    result3 = (result2 >> 32) + temp4;
864    result4 = (result3 >> 32);
865
866    lo = result1 | (result2 << 32);
867    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
868    if (sign)
869      {
870     hi = ~hi;
871     if (!lo) hi++;
872     else lo = ~lo + 1;
873      }
874 }
875
876 void multu64(uint64_t m1,uint64_t m2)
877 {
878    unsigned long long int op1, op2, op3, op4;
879    unsigned long long int result1, result2, result3, result4;
880    unsigned long long int temp1, temp2, temp3, temp4;
881
882    op1 = m1 & 0xFFFFFFFF;
883    op2 = (m1 >> 32) & 0xFFFFFFFF;
884    op3 = m2 & 0xFFFFFFFF;
885    op4 = (m2 >> 32) & 0xFFFFFFFF;
886
887    temp1 = op1 * op3;
888    temp2 = (temp1 >> 32) + op1 * op4;
889    temp3 = op2 * op3;
890    temp4 = (temp3 >> 32) + op2 * op4;
891
892    result1 = temp1 & 0xFFFFFFFF;
893    result2 = temp2 + (temp3 & 0xFFFFFFFF);
894    result3 = (result2 >> 32) + temp4;
895    result4 = (result3 >> 32);
896
897    lo = result1 | (result2 << 32);
898    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
899
900   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
901   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
902 }
903
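// Helpers for unaligned LDL/LDR: merge the loaded doubleword with the part
// of the original register value that the instruction must preserve.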
904 uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
905 {
906   if(bits) {
907     original<<=64-bits;
908     original>>=64-bits;
909     loaded<<=bits;
910     original|=loaded;
911   }
912   else original=loaded;
913   return original;
914 }
915 uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
916 {
917   if(bits^56) {
918     original>>=64-(bits^56);
919     original<<=64-(bits^56);
920     loaded>>=bits^56;
921     original|=loaded;
922   }
923   else original=loaded;
924   return original;
925 }
926 #endif
927
928 #ifdef __i386__
929 #include "assem_x86.c"
930 #endif
931 #ifdef __x86_64__
932 #include "assem_x64.c"
933 #endif
934 #ifdef __arm__
935 #include "assem_arm.c"
936 #endif
937
938 // Add virtual address mapping to linked list
939 void ll_add(struct ll_entry **head,int vaddr,void *addr)
940 {
941   struct ll_entry *new_entry;
942   new_entry=malloc(sizeof(struct ll_entry));
943   assert(new_entry!=NULL);
944   new_entry->vaddr=vaddr;
945   new_entry->reg_sv_flags=0;
946   new_entry->addr=addr;
947   new_entry->next=*head;
948   *head=new_entry;
949 }
950
951 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
952 {
953   ll_add(head,vaddr,addr);
954   (*head)->reg_sv_flags=reg_sv_flags;
955 }
956
957 // Check if an address is already compiled
958 // but don't return addresses which are about to expire from the cache
959 void *check_addr(u_int vaddr)
960 {
961   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
962   if(ht_bin[0]==vaddr) {
963     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
964       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
965   }
966   if(ht_bin[2]==vaddr) {
967     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
968       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
969   }
970   u_int page=get_page(vaddr);
971   struct ll_entry *head;
972   head=jump_in[page];
973   while(head!=NULL) {
974     if(head->vaddr==vaddr) {
975       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
976         // Update existing entry with current address
977         if(ht_bin[0]==vaddr) {
978           ht_bin[1]=(int)head->addr;
979           return head->addr;
980         }
981         if(ht_bin[2]==vaddr) {
982           ht_bin[3]=(int)head->addr;
983           return head->addr;
984         }
985         // Insert into hash table with low priority.
986         // Don't evict existing entries, as they are probably
987         // addresses that are being accessed frequently.
988         if(ht_bin[0]==-1) {
989           ht_bin[1]=(int)head->addr;
990           ht_bin[0]=vaddr;
991         }else if(ht_bin[2]==-1) {
992           ht_bin[3]=(int)head->addr;
993           ht_bin[2]=vaddr;
994         }
995         return head->addr;
996       }
997     }
998     head=head->next;
999   }
1000   return 0;
1001 }
1002
1003 void remove_hash(int vaddr)
1004 {
1005   //printf("remove hash: %x\n",vaddr);
1006   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
1007   if(ht_bin[2]==vaddr) {
1008     ht_bin[2]=ht_bin[3]=-1;
1009   }
1010   if(ht_bin[0]==vaddr) {
1011     ht_bin[0]=ht_bin[2];
1012     ht_bin[1]=ht_bin[3];
1013     ht_bin[2]=ht_bin[3]=-1;
1014   }
1015 }
1016
1017 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1018 {
1019   struct ll_entry *next;
1020   while(*head) {
1021     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) ||
1022        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1023     {
1024       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1025       remove_hash((*head)->vaddr);
1026       next=(*head)->next;
1027       free(*head);
1028       *head=next;
1029     }
1030     else
1031     {
1032       head=&((*head)->next);
1033     }
1034   }
1035 }
1036
1037 // Remove all entries from linked list
1038 void ll_clear(struct ll_entry **head)
1039 {
1040   struct ll_entry *cur;
1041   struct ll_entry *next;
1042   if(cur=*head) {
1043     *head=0;
1044     while(cur) {
1045       next=cur->next;
1046       free(cur);
1047       cur=next;
1048     }
1049   }
1050 }
1051
1052 // Dereference each pointer and undo the link if it points into the given range
1053 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1054 {
1055   while(head) {
1056     int ptr=get_pointer(head->addr);
1057     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1058     if(((ptr>>shift)==(addr>>shift)) ||
1059        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1060     {
1061       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1062       u_int host_addr=(u_int)kill_pointer(head->addr);
1063       #ifdef __arm__
1064         needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1065       #endif
1066     }
1067     head=head->next;
1068   }
1069 }
1070
1071 // This is called when we write to a compiled block (see do_invstub)
1072 void invalidate_page(u_int page)
1073 {
1074   struct ll_entry *head;
1075   struct ll_entry *next;
1076   head=jump_in[page];
1077   jump_in[page]=0;
1078   while(head!=NULL) {
1079     inv_debug("INVALIDATE: %x\n",head->vaddr);
1080     remove_hash(head->vaddr);
1081     next=head->next;
1082     free(head);
1083     head=next;
1084   }
1085   head=jump_out[page];
1086   jump_out[page]=0;
1087   while(head!=NULL) {
1088     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1089     u_int host_addr=(u_int)kill_pointer(head->addr);
1090     #ifdef __arm__
1091       needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1092     #endif
1093     next=head->next;
1094     free(head);
1095     head=next;
1096   }
1097 }
1098
1099 static void invalidate_block_range(u_int block, u_int first, u_int last)
1100 {
1101   u_int page=get_page(block<<12);
1102   //printf("first=%d last=%d\n",first,last);
1103   invalidate_page(page);
1104   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1105   assert(last<page+5);
1106   // Invalidate the adjacent pages if a block crosses a 4K boundary
1107   while(first<page) {
1108     invalidate_page(first);
1109     first++;
1110   }
1111   for(first=page+1;first<last;first++) {
1112     invalidate_page(first);
1113   }
1114   #ifdef __arm__
1115     do_clear_cache();
1116   #endif
1117
1118   // Don't trap writes
1119   invalid_code[block]=1;
1120 #ifndef DISABLE_TLB
1121   // If there is a valid TLB entry for this page, remove write protect
1122   if(tlb_LUT_w[block]) {
1123     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1124     // CHECK: Is this right?
1125     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
1126     u_int real_block=tlb_LUT_w[block]>>12;
1127     invalid_code[real_block]=1;
1128     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1129   }
1130   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1131 #endif
1132
1133   #ifdef USE_MINI_HT
1134   memset(mini_ht,-1,sizeof(mini_ht));
1135   #endif
1136 }
1137
1138 void invalidate_block(u_int block)
1139 {
1140   u_int page=get_page(block<<12);
1141   u_int vpage=get_vpage(block<<12);
1142   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1143   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1144   u_int first,last;
1145   first=last=page;
1146   struct ll_entry *head;
1147   head=jump_dirty[vpage];
1148   //printf("page=%d vpage=%d\n",page,vpage);
1149   while(head!=NULL) {
1150     u_int start,end;
1151     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1152       get_bounds((int)head->addr,&start,&end);
1153       //printf("start: %x end: %x\n",start,end);
1154       if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE) {
1155         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1156           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1157           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1158         }
1159       }
1160 #ifndef DISABLE_TLB
1161       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1162         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1163           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1164           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1165         }
1166       }
1167 #endif
1168     }
1169     head=head->next;
1170   }
1171   invalidate_block_range(block,first,last);
1172 }
1173
1174 void invalidate_addr(u_int addr)
1175 {
1176 #ifdef PCSX
1177   //static int rhits;
1178   // this check is done by the caller
1179   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
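  // Scan dirty blocks on this page (and the previous one, since a block may
  // span pages). If a compiled block covers the written address, invalidate
  // the affected range; otherwise record a code-free window in
  // inv_code_start/inv_code_end so later writes there can be skipped.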
1180   u_int page=get_vpage(addr);
1181   if(page<2048) { // RAM
1182     struct ll_entry *head;
1183     u_int addr_min=~0, addr_max=0;
1184     u_int mask=RAM_SIZE-1;
1185     u_int addr_main=0x80000000|(addr&mask);
1186     int pg1;
1187     inv_code_start=addr_main&~0xfff;
1188     inv_code_end=addr_main|0xfff;
1189     pg1=page;
1190     if (pg1>0) {
1191       // must check previous page too because of spans..
1192       pg1--;
1193       inv_code_start-=0x1000;
1194     }
1195     for(;pg1<=page;pg1++) {
1196       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
1197         u_int start,end;
1198         get_bounds((int)head->addr,&start,&end);
1199         if(ram_offset) {
1200           start-=ram_offset;
1201           end-=ram_offset;
1202         }
1203         if(start<=addr_main&&addr_main<end) {
1204           if(start<addr_min) addr_min=start;
1205           if(end>addr_max) addr_max=end;
1206         }
1207         else if(addr_main<start) {
1208           if(start<inv_code_end)
1209             inv_code_end=start-1;
1210         }
1211         else {
1212           if(end>inv_code_start)
1213             inv_code_start=end;
1214         }
1215       }
1216     }
1217     if (addr_min!=~0) {
1218       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
1219       inv_code_start=inv_code_end=~0;
1220       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
1221       return;
1222     }
1223     else {
1224       inv_code_start=(addr&~mask)|(inv_code_start&mask);
1225       inv_code_end=(addr&~mask)|(inv_code_end&mask);
1226       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
1227       return;
1228     }
1229   }
1230 #endif
1231   invalidate_block(addr>>12);
1232 }
1233
1234 // This is called when loading a save state.
1235 // Anything could have changed, so invalidate everything.
1236 void invalidate_all_pages()
1237 {
1238   u_int page,n;
1239   #if defined(VITA)
1240     sceKernelOpenVMDomain();
1241   #endif
1242   for(page=0;page<4096;page++)
1243     invalidate_page(page);
1244   for(page=0;page<1048576;page++)
1245     if(!invalid_code[page]) {
1246       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1247       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1248     }
1249   #ifdef __arm__
1250   #if defined(VITA)
1251     sceKernelCloseVMDomain();
1252   #endif
1253   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1254   #endif
1255   #ifdef USE_MINI_HT
1256   memset(mini_ht,-1,sizeof(mini_ht));
1257   #endif
1258   #ifndef DISABLE_TLB
1259   // TLB
1260   for(page=0;page<0x100000;page++) {
1261     if(tlb_LUT_r[page]) {
1262       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1263       if(!tlb_LUT_w[page]||!invalid_code[page])
1264         memory_map[page]|=0x40000000; // Write protect
1265     }
1266     else memory_map[page]=-1;
1267     if(page==0x80000) page=0xC0000;
1268   }
1269   tlb_hacks();
1270   #endif
1271 }
1272
1273 // Add an entry to jump_out after making a link
1274 void add_link(u_int vaddr,void *src)
1275 {
1276   u_int page=get_page(vaddr);
1277   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1278   int *ptr=(int *)(src+4);
1279   assert((*ptr&0x0fff0000)==0x059f0000);
1280   ll_add(jump_out+page,vaddr,src);
1281   //int ptr=get_pointer(src);
1282   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1283 }
1284
1285 // If a code block was found to be unmodified (bit was set in
1286 // restore_candidate) and it remains unmodified (bit is clear
1287 // in invalid_code) then move the entries for that 4K page from
1288 // the dirty list to the clean list.
1289 void clean_blocks(u_int page)
1290 {
1291   struct ll_entry *head;
1292   inv_debug("INV: clean_blocks page=%d\n",page);
1293   head=jump_dirty[page];
1294   while(head!=NULL) {
1295     if(!invalid_code[head->vaddr>>12]) {
1296       // Don't restore blocks which are about to expire from the cache
1297       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1298         u_int start,end;
1299         if(verify_dirty((int)head->addr)) {
1300           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1301           u_int i;
1302           u_int inv=0;
1303           get_bounds((int)head->addr,&start,&end);
1304           if(start-(u_int)rdram<RAM_SIZE) {
1305             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1306               inv|=invalid_code[i];
1307             }
1308           }
1309 #ifndef DISABLE_TLB
1310           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1311             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1312             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1313             if(addr<start||addr>=end) inv=1;
1314           }
1315 #endif
1316           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1317             inv=1;
1318           }
1319           if(!inv) {
1320             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1321             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1322               u_int ppage=page;
1323 #ifndef DISABLE_TLB
1324               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1325 #endif
1326               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1327               //printf("page=%x, addr=%x\n",page,head->vaddr);
1328               //assert(head->vaddr>>12==(page|0x80000));
1329               ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
1330               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1331               if(ht_bin[0]==head->vaddr) {
1332                 ht_bin[1]=(int)clean_addr; // Replace existing entry
1333               }
1334               if(ht_bin[2]==head->vaddr) {
1335                 ht_bin[3]=(int)clean_addr; // Replace existing entry
1336               }
1337             }
1338           }
1339         }
1340       }
1341     }
1342     head=head->next;
1343   }
1344 }
1345
1346
1347 void mov_alloc(struct regstat *current,int i)
1348 {
1349   // Note: Don't need to actually alloc the source registers
1350   if((~current->is32>>rs1[i])&1) {
1351     //alloc_reg64(current,i,rs1[i]);
1352     alloc_reg64(current,i,rt1[i]);
1353     current->is32&=~(1LL<<rt1[i]);
1354   } else {
1355     //alloc_reg(current,i,rs1[i]);
1356     alloc_reg(current,i,rt1[i]);
1357     current->is32|=(1LL<<rt1[i]);
1358   }
1359   clear_const(current,rs1[i]);
1360   clear_const(current,rt1[i]);
1361   dirty_reg(current,rt1[i]);
1362 }
1363
1364 void shiftimm_alloc(struct regstat *current,int i)
1365 {
1366   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1367   {
1368     if(rt1[i]) {
1369       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1370       else lt1[i]=rs1[i];
1371       alloc_reg(current,i,rt1[i]);
1372       current->is32|=1LL<<rt1[i];
1373       dirty_reg(current,rt1[i]);
1374       if(is_const(current,rs1[i])) {
1375         int v=get_const(current,rs1[i]);
1376         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1377         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1378         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1379       }
1380       else clear_const(current,rt1[i]);
1381     }
1382   }
1383   else
1384   {
1385     clear_const(current,rs1[i]);
1386     clear_const(current,rt1[i]);
1387   }
1388
1389   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1390   {
1391     if(rt1[i]) {
1392       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1393       alloc_reg64(current,i,rt1[i]);
1394       current->is32&=~(1LL<<rt1[i]);
1395       dirty_reg(current,rt1[i]);
1396     }
1397   }
1398   if(opcode2[i]==0x3c) // DSLL32
1399   {
1400     if(rt1[i]) {
1401       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1402       alloc_reg64(current,i,rt1[i]);
1403       current->is32&=~(1LL<<rt1[i]);
1404       dirty_reg(current,rt1[i]);
1405     }
1406   }
1407   if(opcode2[i]==0x3e) // DSRL32
1408   {
1409     if(rt1[i]) {
1410       alloc_reg64(current,i,rs1[i]);
1411       if(imm[i]==32) {
1412         alloc_reg64(current,i,rt1[i]);
1413         current->is32&=~(1LL<<rt1[i]);
1414       } else {
1415         alloc_reg(current,i,rt1[i]);
1416         current->is32|=1LL<<rt1[i];
1417       }
1418       dirty_reg(current,rt1[i]);
1419     }
1420   }
1421   if(opcode2[i]==0x3f) // DSRA32
1422   {
1423     if(rt1[i]) {
1424       alloc_reg64(current,i,rs1[i]);
1425       alloc_reg(current,i,rt1[i]);
1426       current->is32|=1LL<<rt1[i];
1427       dirty_reg(current,rt1[i]);
1428     }
1429   }
1430 }
1431
1432 void shift_alloc(struct regstat *current,int i)
1433 {
1434   if(rt1[i]) {
1435     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1436     {
1437       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1438       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1439       alloc_reg(current,i,rt1[i]);
1440       if(rt1[i]==rs2[i]) {
1441         alloc_reg_temp(current,i,-1);
1442         minimum_free_regs[i]=1;
1443       }
1444       current->is32|=1LL<<rt1[i];
1445     } else { // DSLLV/DSRLV/DSRAV
1446       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1447       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1448       alloc_reg64(current,i,rt1[i]);
1449       current->is32&=~(1LL<<rt1[i]);
1450       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1451       {
1452         alloc_reg_temp(current,i,-1);
1453         minimum_free_regs[i]=1;
1454       }
1455     }
1456     clear_const(current,rs1[i]);
1457     clear_const(current,rs2[i]);
1458     clear_const(current,rt1[i]);
1459     dirty_reg(current,rt1[i]);
1460   }
1461 }
1462
1463 void alu_alloc(struct regstat *current,int i)
1464 {
1465   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1466     if(rt1[i]) {
1467       if(rs1[i]&&rs2[i]) {
1468         alloc_reg(current,i,rs1[i]);
1469         alloc_reg(current,i,rs2[i]);
1470       }
1471       else {
1472         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1473         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1474       }
1475       alloc_reg(current,i,rt1[i]);
1476     }
1477     current->is32|=1LL<<rt1[i];
1478   }
1479   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1480     if(rt1[i]) {
1481       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1482       {
1483         alloc_reg64(current,i,rs1[i]);
1484         alloc_reg64(current,i,rs2[i]);
1485         alloc_reg(current,i,rt1[i]);
1486       } else {
1487         alloc_reg(current,i,rs1[i]);
1488         alloc_reg(current,i,rs2[i]);
1489         alloc_reg(current,i,rt1[i]);
1490       }
1491     }
1492     current->is32|=1LL<<rt1[i];
1493   }
1494   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1495     if(rt1[i]) {
1496       if(rs1[i]&&rs2[i]) {
1497         alloc_reg(current,i,rs1[i]);
1498         alloc_reg(current,i,rs2[i]);
1499       }
1500       else
1501       {
1502         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1503         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1504       }
1505       alloc_reg(current,i,rt1[i]);
1506       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1507       {
1508         if(!((current->uu>>rt1[i])&1)) {
1509           alloc_reg64(current,i,rt1[i]);
1510         }
1511         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1512           if(rs1[i]&&rs2[i]) {
1513             alloc_reg64(current,i,rs1[i]);
1514             alloc_reg64(current,i,rs2[i]);
1515           }
1516           else
1517           {
1518             // Is it really worth it to keep 64-bit values in registers?
1519             #ifdef NATIVE_64BIT
1520             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1521             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1522             #endif
1523           }
1524         }
1525         current->is32&=~(1LL<<rt1[i]);
1526       } else {
1527         current->is32|=1LL<<rt1[i];
1528       }
1529     }
1530   }
1531   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1532     if(rt1[i]) {
1533       if(rs1[i]&&rs2[i]) {
1534         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1535           alloc_reg64(current,i,rs1[i]);
1536           alloc_reg64(current,i,rs2[i]);
1537           alloc_reg64(current,i,rt1[i]);
1538         } else {
1539           alloc_reg(current,i,rs1[i]);
1540           alloc_reg(current,i,rs2[i]);
1541           alloc_reg(current,i,rt1[i]);
1542         }
1543       }
1544       else {
1545         alloc_reg(current,i,rt1[i]);
1546         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1547           // DADD used as move, or zeroing
1548           // If we have a 64-bit source, then make the target 64 bits too
1549           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1550             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1551             alloc_reg64(current,i,rt1[i]);
1552           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1553             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1554             alloc_reg64(current,i,rt1[i]);
1555           }
1556           if(opcode2[i]>=0x2e&&rs2[i]) {
1557             // DSUB used as negation - 64-bit result
1558             // If we have a 32-bit register, extend it to 64 bits
1559             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1560             alloc_reg64(current,i,rt1[i]);
1561           }
1562         }
1563       }
1564       if(rs1[i]&&rs2[i]) {
1565         current->is32&=~(1LL<<rt1[i]);
1566       } else if(rs1[i]) {
1567         current->is32&=~(1LL<<rt1[i]);
1568         if((current->is32>>rs1[i])&1)
1569           current->is32|=1LL<<rt1[i];
1570       } else if(rs2[i]) {
1571         current->is32&=~(1LL<<rt1[i]);
1572         if((current->is32>>rs2[i])&1)
1573           current->is32|=1LL<<rt1[i];
1574       } else {
1575         current->is32|=1LL<<rt1[i];
1576       }
1577     }
1578   }
1579   clear_const(current,rs1[i]);
1580   clear_const(current,rs2[i]);
1581   clear_const(current,rt1[i]);
1582   dirty_reg(current,rt1[i]);
1583 }
1584
1585 void imm16_alloc(struct regstat *current,int i)
1586 {
1587   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1588   else lt1[i]=rs1[i];
1589   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1590   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1591     current->is32&=~(1LL<<rt1[i]);
1592     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1593       // TODO: Could preserve the 32-bit flag if the immediate is zero
1594       alloc_reg64(current,i,rt1[i]);
1595       alloc_reg64(current,i,rs1[i]);
1596     }
1597     clear_const(current,rs1[i]);
1598     clear_const(current,rt1[i]);
1599   }
1600   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1601     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1602     current->is32|=1LL<<rt1[i];
1603     clear_const(current,rs1[i]);
1604     clear_const(current,rt1[i]);
1605   }
1606   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1607     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1608       if(rs1[i]!=rt1[i]) {
1609         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1610         alloc_reg64(current,i,rt1[i]);
1611         current->is32&=~(1LL<<rt1[i]);
1612       }
1613     }
1614     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1615     if(is_const(current,rs1[i])) {
1616       int v=get_const(current,rs1[i]);
1617       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1618       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1619       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1620     }
1621     else clear_const(current,rt1[i]);
1622   }
1623   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1624     if(is_const(current,rs1[i])) {
1625       int v=get_const(current,rs1[i]);
1626       set_const(current,rt1[i],v+imm[i]);
1627     }
1628     else clear_const(current,rt1[i]);
1629     current->is32|=1LL<<rt1[i];
1630   }
1631   else {
1632     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1633     current->is32|=1LL<<rt1[i];
1634   }
1635   dirty_reg(current,rt1[i]);
1636 }
1637
1638 void load_alloc(struct regstat *current,int i)
1639 {
1640   clear_const(current,rt1[i]);
1641   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1642   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1643   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1644   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1645     alloc_reg(current,i,rt1[i]);
1646     assert(get_reg(current->regmap,rt1[i])>=0);
1647     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1648     {
1649       current->is32&=~(1LL<<rt1[i]);
1650       alloc_reg64(current,i,rt1[i]);
1651     }
1652     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1653     {
1654       current->is32&=~(1LL<<rt1[i]);
1655       alloc_reg64(current,i,rt1[i]);
1656       alloc_all(current,i);
1657       alloc_reg64(current,i,FTEMP);
1658       minimum_free_regs[i]=HOST_REGS;
1659     }
1660     else current->is32|=1LL<<rt1[i];
1661     dirty_reg(current,rt1[i]);
1662     // If using TLB, need a register for pointer to the mapping table
1663     if(using_tlb) alloc_reg(current,i,TLREG);
1664     // LWL/LWR need a temporary register for the old value
1665     if(opcode[i]==0x22||opcode[i]==0x26)
1666     {
1667       alloc_reg(current,i,FTEMP);
1668       alloc_reg_temp(current,i,-1);
1669       minimum_free_regs[i]=1;
1670     }
1671   }
1672   else
1673   {
1674     // Load to r0 or unneeded register (dummy load)
1675     // but we still need a register to calculate the address
1676     if(opcode[i]==0x22||opcode[i]==0x26)
1677     {
1678       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1679     }
1680     // If using TLB, need a register for pointer to the mapping table
1681     if(using_tlb) alloc_reg(current,i,TLREG);
1682     alloc_reg_temp(current,i,-1);
1683     minimum_free_regs[i]=1;
1684     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1685     {
1686       alloc_all(current,i);
1687       alloc_reg64(current,i,FTEMP);
1688       minimum_free_regs[i]=HOST_REGS;
1689     }
1690   }
1691 }
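// Allocation summary for loads: the base register is kept in a host
// register only if it is needed again, the target gets a host register
// (a 64-bit pair for LWU/LD), LWL/LWR additionally reserve FTEMP plus a
// scratch register for merging with the old value, and LDL/LDR fall back
// to alloc_all().  A load whose result is unneeded (e.g. lw $zero,0($t0))
// still allocates an address temporary, since the access itself may still
// have to be performed (I/O or fault checks).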
1692
1693 void store_alloc(struct regstat *current,int i)
1694 {
1695   clear_const(current,rs2[i]);
1696   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1697   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1698   alloc_reg(current,i,rs2[i]);
1699   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1700     alloc_reg64(current,i,rs2[i]);
1701     if(rs2[i]) alloc_reg(current,i,FTEMP);
1702   }
1703   // If using TLB, need a register for pointer to the mapping table
1704   if(using_tlb) alloc_reg(current,i,TLREG);
1705   #if defined(HOST_IMM8)
1706   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1707   else alloc_reg(current,i,INVCP);
1708   #endif
1709   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1710     alloc_reg(current,i,FTEMP);
1711   }
1712   // We need a temporary register for address generation
1713   alloc_reg_temp(current,i,-1);
1714   minimum_free_regs[i]=1;
1715 }
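// Stores keep the base (if needed again) and the data register live;
// SWL/SWR/SDL/SDR reserve FTEMP as an extra temporary, and on HOST_IMM8
// hosts (no 32-bit immediates) a register is reserved to point at
// invalid_code for the self-modifying-code check emitted after the write.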
1716
1717 void c1ls_alloc(struct regstat *current,int i)
1718 {
1719   //clear_const(current,rs1[i]); // FIXME
1720   clear_const(current,rt1[i]);
1721   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1722   alloc_reg(current,i,CSREG); // Status
1723   alloc_reg(current,i,FTEMP);
1724   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1725     alloc_reg64(current,i,FTEMP);
1726   }
1727   // If using TLB, need a register for pointer to the mapping table
1728   if(using_tlb) alloc_reg(current,i,TLREG);
1729   #if defined(HOST_IMM8)
1730   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1731   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1732     alloc_reg(current,i,INVCP);
1733   #endif
1734   // We need a temporary register for address generation
1735   alloc_reg_temp(current,i,-1);
1736 }
1737
1738 void c2ls_alloc(struct regstat *current,int i)
1739 {
1740   clear_const(current,rt1[i]);
1741   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1742   alloc_reg(current,i,FTEMP);
1743   // If using TLB, need a register for pointer to the mapping table
1744   if(using_tlb) alloc_reg(current,i,TLREG);
1745   #if defined(HOST_IMM8)
1746   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1747   else if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1748     alloc_reg(current,i,INVCP);
1749   #endif
1750   // We need a temporary register for address generation
1751   alloc_reg_temp(current,i,-1);
1752   minimum_free_regs[i]=1;
1753 }
1754
1755 #ifndef multdiv_alloc
1756 void multdiv_alloc(struct regstat *current,int i)
1757 {
1758   //  case 0x18: MULT
1759   //  case 0x19: MULTU
1760   //  case 0x1A: DIV
1761   //  case 0x1B: DIVU
1762   //  case 0x1C: DMULT
1763   //  case 0x1D: DMULTU
1764   //  case 0x1E: DDIV
1765   //  case 0x1F: DDIVU
1766   clear_const(current,rs1[i]);
1767   clear_const(current,rs2[i]);
1768   if(rs1[i]&&rs2[i])
1769   {
1770     if((opcode2[i]&4)==0) // 32-bit
1771     {
1772       current->u&=~(1LL<<HIREG);
1773       current->u&=~(1LL<<LOREG);
1774       alloc_reg(current,i,HIREG);
1775       alloc_reg(current,i,LOREG);
1776       alloc_reg(current,i,rs1[i]);
1777       alloc_reg(current,i,rs2[i]);
1778       current->is32|=1LL<<HIREG;
1779       current->is32|=1LL<<LOREG;
1780       dirty_reg(current,HIREG);
1781       dirty_reg(current,LOREG);
1782     }
1783     else // 64-bit
1784     {
1785       current->u&=~(1LL<<HIREG);
1786       current->u&=~(1LL<<LOREG);
1787       current->uu&=~(1LL<<HIREG);
1788       current->uu&=~(1LL<<LOREG);
1789       alloc_reg64(current,i,HIREG);
1790       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1791       alloc_reg64(current,i,rs1[i]);
1792       alloc_reg64(current,i,rs2[i]);
1793       alloc_all(current,i);
1794       current->is32&=~(1LL<<HIREG);
1795       current->is32&=~(1LL<<LOREG);
1796       dirty_reg(current,HIREG);
1797       dirty_reg(current,LOREG);
1798       minimum_free_regs[i]=HOST_REGS;
1799     }
1800   }
1801   else
1802   {
1803     // Multiply by zero is zero.
1804     // MIPS does not have a divide-by-zero exception.
1805     // The result is undefined, so we return zero.
1806     alloc_reg(current,i,HIREG);
1807     alloc_reg(current,i,LOREG);
1808     current->is32|=1LL<<HIREG;
1809     current->is32|=1LL<<LOREG;
1810     dirty_reg(current,HIREG);
1811     dirty_reg(current,LOREG);
1812   }
1813 }
1814 #endif
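// For 32-bit MULT/MULTU/DIV/DIVU both sources plus HI and LO are kept in
// host registers; the 64-bit forms fall back to alloc_all() and may
// clobber everything.  When either operand is the zero register only
// HI/LO are allocated, and the corresponding assemble step can presumably
// just store zero: multiplying by zero is zero, and since MIPS leaves
// divide-by-zero undefined, zero is returned there as well.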
1815
1816 void cop0_alloc(struct regstat *current,int i)
1817 {
1818   if(opcode2[i]==0) // MFC0
1819   {
1820     if(rt1[i]) {
1821       clear_const(current,rt1[i]);
1822       alloc_all(current,i);
1823       alloc_reg(current,i,rt1[i]);
1824       current->is32|=1LL<<rt1[i];
1825       dirty_reg(current,rt1[i]);
1826     }
1827   }
1828   else if(opcode2[i]==4) // MTC0
1829   {
1830     if(rs1[i]){
1831       clear_const(current,rs1[i]);
1832       alloc_reg(current,i,rs1[i]);
1833       alloc_all(current,i);
1834     }
1835     else {
1836       alloc_all(current,i); // FIXME: Keep r0
1837       current->u&=~1LL;
1838       alloc_reg(current,i,0);
1839     }
1840   }
1841   else
1842   {
1843     // TLBR/TLBWI/TLBWR/TLBP/ERET
1844     assert(opcode2[i]==0x10);
1845     alloc_all(current,i);
1846   }
1847   minimum_free_regs[i]=HOST_REGS;
1848 }
1849
1850 void cop1_alloc(struct regstat *current,int i)
1851 {
1852   alloc_reg(current,i,CSREG); // Load status
1853   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1854   {
1855     if(rt1[i]){
1856       clear_const(current,rt1[i]);
1857       if(opcode2[i]==1) {
1858         alloc_reg64(current,i,rt1[i]); // DMFC1
1859         current->is32&=~(1LL<<rt1[i]);
1860       }else{
1861         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1862         current->is32|=1LL<<rt1[i];
1863       }
1864       dirty_reg(current,rt1[i]);
1865     }
1866     alloc_reg_temp(current,i,-1);
1867   }
1868   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1869   {
1870     if(rs1[i]){
1871       clear_const(current,rs1[i]);
1872       if(opcode2[i]==5)
1873         alloc_reg64(current,i,rs1[i]); // DMTC1
1874       else
1875         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1876       alloc_reg_temp(current,i,-1);
1877     }
1878     else {
1879       current->u&=~1LL;
1880       alloc_reg(current,i,0);
1881       alloc_reg_temp(current,i,-1);
1882     }
1883   }
1884   minimum_free_regs[i]=1;
1885 }
1886 void fconv_alloc(struct regstat *current,int i)
1887 {
1888   alloc_reg(current,i,CSREG); // Load status
1889   alloc_reg_temp(current,i,-1);
1890   minimum_free_regs[i]=1;
1891 }
1892 void float_alloc(struct regstat *current,int i)
1893 {
1894   alloc_reg(current,i,CSREG); // Load status
1895   alloc_reg_temp(current,i,-1);
1896   minimum_free_regs[i]=1;
1897 }
1898 void c2op_alloc(struct regstat *current,int i)
1899 {
1900   alloc_reg_temp(current,i,-1);
1901 }
1902 void fcomp_alloc(struct regstat *current,int i)
1903 {
1904   alloc_reg(current,i,CSREG); // Load status
1905   alloc_reg(current,i,FSREG); // Load flags
1906   dirty_reg(current,FSREG); // Flag will be modified
1907   alloc_reg_temp(current,i,-1);
1908   minimum_free_regs[i]=1;
1909 }
1910
1911 void syscall_alloc(struct regstat *current,int i)
1912 {
1913   alloc_cc(current,i);
1914   dirty_reg(current,CCREG);
1915   alloc_all(current,i);
1916   minimum_free_regs[i]=HOST_REGS;
1917   current->isconst=0;
1918 }
1919
1920 void delayslot_alloc(struct regstat *current,int i)
1921 {
1922   switch(itype[i]) {
1923     case UJUMP:
1924     case CJUMP:
1925     case SJUMP:
1926     case RJUMP:
1927     case FJUMP:
1928     case SYSCALL:
1929     case HLECALL:
1930     case SPAN:
1931       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1932       SysPrintf("Disabled speculative precompilation\n");
1933       stop_after_jal=1;
1934       break;
1935     case IMM16:
1936       imm16_alloc(current,i);
1937       break;
1938     case LOAD:
1939     case LOADLR:
1940       load_alloc(current,i);
1941       break;
1942     case STORE:
1943     case STORELR:
1944       store_alloc(current,i);
1945       break;
1946     case ALU:
1947       alu_alloc(current,i);
1948       break;
1949     case SHIFT:
1950       shift_alloc(current,i);
1951       break;
1952     case MULTDIV:
1953       multdiv_alloc(current,i);
1954       break;
1955     case SHIFTIMM:
1956       shiftimm_alloc(current,i);
1957       break;
1958     case MOV:
1959       mov_alloc(current,i);
1960       break;
1961     case COP0:
1962       cop0_alloc(current,i);
1963       break;
1964     case COP1:
1965     case COP2:
1966       cop1_alloc(current,i);
1967       break;
1968     case C1LS:
1969       c1ls_alloc(current,i);
1970       break;
1971     case C2LS:
1972       c2ls_alloc(current,i);
1973       break;
1974     case FCONV:
1975       fconv_alloc(current,i);
1976       break;
1977     case FLOAT:
1978       float_alloc(current,i);
1979       break;
1980     case FCOMP:
1981       fcomp_alloc(current,i);
1982       break;
1983     case C2OP:
1984       c2op_alloc(current,i);
1985       break;
1986   }
1987 }
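// Delay-slot instructions reuse the per-type allocators above.  A branch
// (or syscall/HLE call) appearing in a delay slot cannot be handled here,
// so speculative precompilation past JAL is disabled via stop_after_jal
// instead of aborting.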
1988
1989 // Special case where a branch and delay slot span two pages in virtual memory
1990 static void pagespan_alloc(struct regstat *current,int i)
1991 {
1992   current->isconst=0;
1993   current->wasconst=0;
1994   regs[i].wasconst=0;
1995   minimum_free_regs[i]=HOST_REGS;
1996   alloc_all(current,i);
1997   alloc_cc(current,i);
1998   dirty_reg(current,CCREG);
1999   if(opcode[i]==3) // JAL
2000   {
2001     alloc_reg(current,i,31);
2002     dirty_reg(current,31);
2003   }
2004   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
2005   {
2006     alloc_reg(current,i,rs1[i]);
2007     if (rt1[i]!=0) {
2008       alloc_reg(current,i,rt1[i]);
2009       dirty_reg(current,rt1[i]);
2010     }
2011   }
2012   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
2013   {
2014     if(rs1[i]) alloc_reg(current,i,rs1[i]);
2015     if(rs2[i]) alloc_reg(current,i,rs2[i]);
2016     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
2017     {
2018       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
2019       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
2020     }
2021   }
2022   else
2023   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
2024   {
2025     if(rs1[i]) alloc_reg(current,i,rs1[i]);
2026     if(!((current->is32>>rs1[i])&1))
2027     {
2028       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
2029     }
2030   }
2031   else
2032   if(opcode[i]==0x11) // BC1
2033   {
2034     alloc_reg(current,i,FSREG);
2035     alloc_reg(current,i,CSREG);
2036   }
2037   //else ...
2038 }
2039
2040 void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
2041 {
2042   stubs[stubcount][0]=type;
2043   stubs[stubcount][1]=addr;
2044   stubs[stubcount][2]=retaddr;
2045   stubs[stubcount][3]=a;
2046   stubs[stubcount][4]=b;
2047   stubs[stubcount][5]=c;
2048   stubs[stubcount][6]=d;
2049   stubs[stubcount][7]=e;
2050   stubcount++;
2051 }
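// Stubs record the stub type, the branch to patch, a return address and
// up to five extra arguments for the out-of-line slow paths generated
// later.  Typical use, as seen in the assemble functions below:
//     add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
// where jaddr is the conditional branch taken on a fast-path miss and
// (int)out records the code position to come back to.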
2052
2053 // Write out a single register
2054 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
2055 {
2056   int hr;
2057   for(hr=0;hr<HOST_REGS;hr++) {
2058     if(hr!=EXCLUDE_REG) {
2059       if((regmap[hr]&63)==r) {
2060         if((dirty>>hr)&1) {
2061           if(regmap[hr]<64) {
2062             emit_storereg(r,hr);
2063 #ifndef FORCE32
2064             if((is32>>regmap[hr])&1) {
2065               emit_sarimm(hr,31,hr);
2066               emit_storereg(r|64,hr);
2067             }
2068 #endif
2069           }else{
2070             emit_storereg(r|64,hr);
2071           }
2072         }
2073       }
2074     }
2075   }
2076 }
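// Writes a single cached MIPS register back to its memory slot if the
// host copy is dirty.  The sign-extension of the upper 32 bits only
// applies to 64-bit register builds; under FORCE32 it is compiled out.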
2077
2078 int mchecksum()
2079 {
2080   //if(!tracedebug) return 0;
2081   int i;
2082   int sum=0;
2083   for(i=0;i<2097152;i++) {
2084     unsigned int temp=sum;
2085     sum<<=1;
2086     sum|=(~temp)>>31;
2087     sum^=((u_int *)rdram)[i];
2088   }
2089   return sum;
2090 }
2091 int rchecksum()
2092 {
2093   int i;
2094   int sum=0;
2095   for(i=0;i<64;i++)
2096     sum^=((u_int *)reg)[i];
2097   return sum;
2098 }
2099 void rlist()
2100 {
2101   int i;
2102   printf("TRACE: ");
2103   for(i=0;i<32;i++)
2104     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
2105   printf("\n");
2106 #ifndef DISABLE_COP1
2107   printf("TRACE: ");
2108   for(i=0;i<32;i++)
2109     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
2110   printf("\n");
2111 #endif
2112 }
2113
2114 void enabletrace()
2115 {
2116   tracedebug=1;
2117 }
2118
2119 void memdebug(int i)
2120 {
2121   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
2122   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
2123   //rlist();
2124   //if(tracedebug) {
2125   //if(Count>=-2084597794) {
2126   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
2127   //if(0) {
2128     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
2129     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
2130     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
2131     rlist();
2132     #ifdef __i386__
2133     printf("TRACE: %x\n",(&i)[-1]);
2134     #endif
2135     #ifdef __arm__
2136     int j;
2137     printf("TRACE: %x \n",(&j)[10]);
2138     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2139     #endif
2140     //fflush(stdout);
2141   }
2142   //printf("TRACE: %x\n",(&i)[-1]);
2143 }
2144
2145 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2146 {
2147   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2148 }
2149
2150 void alu_assemble(int i,struct regstat *i_regs)
2151 {
2152   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2153     if(rt1[i]) {
2154       signed char s1,s2,t;
2155       t=get_reg(i_regs->regmap,rt1[i]);
2156       if(t>=0) {
2157         s1=get_reg(i_regs->regmap,rs1[i]);
2158         s2=get_reg(i_regs->regmap,rs2[i]);
2159         if(rs1[i]&&rs2[i]) {
2160           assert(s1>=0);
2161           assert(s2>=0);
2162           if(opcode2[i]&2) emit_sub(s1,s2,t);
2163           else emit_add(s1,s2,t);
2164         }
2165         else if(rs1[i]) {
2166           if(s1>=0) emit_mov(s1,t);
2167           else emit_loadreg(rs1[i],t);
2168         }
2169         else if(rs2[i]) {
2170           if(s2>=0) {
2171             if(opcode2[i]&2) emit_neg(s2,t);
2172             else emit_mov(s2,t);
2173           }
2174           else {
2175             emit_loadreg(rs2[i],t);
2176             if(opcode2[i]&2) emit_neg(t,t);
2177           }
2178         }
2179         else emit_zeroreg(t);
2180       }
2181     }
2182   }
2183   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2184     if(rt1[i]) {
2185       signed char s1l,s2l,s1h,s2h,tl,th;
2186       tl=get_reg(i_regs->regmap,rt1[i]);
2187       th=get_reg(i_regs->regmap,rt1[i]|64);
2188       if(tl>=0) {
2189         s1l=get_reg(i_regs->regmap,rs1[i]);
2190         s2l=get_reg(i_regs->regmap,rs2[i]);
2191         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2192         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2193         if(rs1[i]&&rs2[i]) {
2194           assert(s1l>=0);
2195           assert(s2l>=0);
2196           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2197           else emit_adds(s1l,s2l,tl);
2198           if(th>=0) {
2199             #ifdef INVERTED_CARRY
2200             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2201             #else
2202             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2203             #endif
2204             else emit_add(s1h,s2h,th);
2205           }
2206         }
2207         else if(rs1[i]) {
2208           if(s1l>=0) emit_mov(s1l,tl);
2209           else emit_loadreg(rs1[i],tl);
2210           if(th>=0) {
2211             if(s1h>=0) emit_mov(s1h,th);
2212             else emit_loadreg(rs1[i]|64,th);
2213           }
2214         }
2215         else if(rs2[i]) {
2216           if(s2l>=0) {
2217             if(opcode2[i]&2) emit_negs(s2l,tl);
2218             else emit_mov(s2l,tl);
2219           }
2220           else {
2221             emit_loadreg(rs2[i],tl);
2222             if(opcode2[i]&2) emit_negs(tl,tl);
2223           }
2224           if(th>=0) {
2225             #ifdef INVERTED_CARRY
2226             if(s2h>=0) emit_mov(s2h,th);
2227             else emit_loadreg(rs2[i]|64,th);
2228             if(opcode2[i]&2) {
2229               emit_adcimm(-1,th); // x86 has inverted carry flag
2230               emit_not(th,th);
2231             }
2232             #else
2233             if(opcode2[i]&2) {
2234               if(s2h>=0) emit_rscimm(s2h,0,th);
2235               else {
2236                 emit_loadreg(rs2[i]|64,th);
2237                 emit_rscimm(th,0,th);
2238               }
2239             }else{
2240               if(s2h>=0) emit_mov(s2h,th);
2241               else emit_loadreg(rs2[i]|64,th);
2242             }
2243             #endif
2244           }
2245         }
2246         else {
2247           emit_zeroreg(tl);
2248           if(th>=0) emit_zeroreg(th);
2249         }
2250       }
2251     }
2252   }
2253   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2254     if(rt1[i]) {
2255       signed char s1l,s1h,s2l,s2h,t;
2256       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2257       {
2258         t=get_reg(i_regs->regmap,rt1[i]);
2259         //assert(t>=0);
2260         if(t>=0) {
2261           s1l=get_reg(i_regs->regmap,rs1[i]);
2262           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2263           s2l=get_reg(i_regs->regmap,rs2[i]);
2264           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2265           if(rs2[i]==0) // rx<r0
2266           {
2267             assert(s1h>=0);
2268             if(opcode2[i]==0x2a) // SLT
2269               emit_shrimm(s1h,31,t);
2270             else // SLTU (unsigned can not be less than zero)
2271               emit_zeroreg(t);
2272           }
2273           else if(rs1[i]==0) // r0<rx
2274           {
2275             assert(s2h>=0);
2276             if(opcode2[i]==0x2a) // SLT
2277               emit_set_gz64_32(s2h,s2l,t);
2278             else // SLTU (set if not zero)
2279               emit_set_nz64_32(s2h,s2l,t);
2280           }
2281           else {
2282             assert(s1l>=0);assert(s1h>=0);
2283             assert(s2l>=0);assert(s2h>=0);
2284             if(opcode2[i]==0x2a) // SLT
2285               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2286             else // SLTU
2287               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2288           }
2289         }
2290       } else {
2291         t=get_reg(i_regs->regmap,rt1[i]);
2292         //assert(t>=0);
2293         if(t>=0) {
2294           s1l=get_reg(i_regs->regmap,rs1[i]);
2295           s2l=get_reg(i_regs->regmap,rs2[i]);
2296           if(rs2[i]==0) // rx<r0
2297           {
2298             assert(s1l>=0);
2299             if(opcode2[i]==0x2a) // SLT
2300               emit_shrimm(s1l,31,t);
2301             else // SLTU (unsigned can not be less than zero)
2302               emit_zeroreg(t);
2303           }
2304           else if(rs1[i]==0) // r0<rx
2305           {
2306             assert(s2l>=0);
2307             if(opcode2[i]==0x2a) // SLT
2308               emit_set_gz32(s2l,t);
2309             else // SLTU (set if not zero)
2310               emit_set_nz32(s2l,t);
2311           }
2312           else{
2313             assert(s1l>=0);assert(s2l>=0);
2314             if(opcode2[i]==0x2a) // SLT
2315               emit_set_if_less32(s1l,s2l,t);
2316             else // SLTU
2317               emit_set_if_carry32(s1l,s2l,t);
2318           }
2319         }
2320       }
2321     }
2322   }
2323   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2324     if(rt1[i]) {
2325       signed char s1l,s1h,s2l,s2h,th,tl;
2326       tl=get_reg(i_regs->regmap,rt1[i]);
2327       th=get_reg(i_regs->regmap,rt1[i]|64);
2328       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2329       {
2330         assert(tl>=0);
2331         if(tl>=0) {
2332           s1l=get_reg(i_regs->regmap,rs1[i]);
2333           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2334           s2l=get_reg(i_regs->regmap,rs2[i]);
2335           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2336           if(rs1[i]&&rs2[i]) {
2337             assert(s1l>=0);assert(s1h>=0);
2338             assert(s2l>=0);assert(s2h>=0);
2339             if(opcode2[i]==0x24) { // AND
2340               emit_and(s1l,s2l,tl);
2341               emit_and(s1h,s2h,th);
2342             } else
2343             if(opcode2[i]==0x25) { // OR
2344               emit_or(s1l,s2l,tl);
2345               emit_or(s1h,s2h,th);
2346             } else
2347             if(opcode2[i]==0x26) { // XOR
2348               emit_xor(s1l,s2l,tl);
2349               emit_xor(s1h,s2h,th);
2350             } else
2351             if(opcode2[i]==0x27) { // NOR
2352               emit_or(s1l,s2l,tl);
2353               emit_or(s1h,s2h,th);
2354               emit_not(tl,tl);
2355               emit_not(th,th);
2356             }
2357           }
2358           else
2359           {
2360             if(opcode2[i]==0x24) { // AND
2361               emit_zeroreg(tl);
2362               emit_zeroreg(th);
2363             } else
2364             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2365               if(rs1[i]){
2366                 if(s1l>=0) emit_mov(s1l,tl);
2367                 else emit_loadreg(rs1[i],tl);
2368                 if(s1h>=0) emit_mov(s1h,th);
2369                 else emit_loadreg(rs1[i]|64,th);
2370               }
2371               else
2372               if(rs2[i]){
2373                 if(s2l>=0) emit_mov(s2l,tl);
2374                 else emit_loadreg(rs2[i],tl);
2375                 if(s2h>=0) emit_mov(s2h,th);
2376                 else emit_loadreg(rs2[i]|64,th);
2377               }
2378               else{
2379                 emit_zeroreg(tl);
2380                 emit_zeroreg(th);
2381               }
2382             } else
2383             if(opcode2[i]==0x27) { // NOR
2384               if(rs1[i]){
2385                 if(s1l>=0) emit_not(s1l,tl);
2386                 else{
2387                   emit_loadreg(rs1[i],tl);
2388                   emit_not(tl,tl);
2389                 }
2390                 if(s1h>=0) emit_not(s1h,th);
2391                 else{
2392                   emit_loadreg(rs1[i]|64,th);
2393                   emit_not(th,th);
2394                 }
2395               }
2396               else
2397               if(rs2[i]){
2398                 if(s2l>=0) emit_not(s2l,tl);
2399                 else{
2400                   emit_loadreg(rs2[i],tl);
2401                   emit_not(tl,tl);
2402                 }
2403                 if(s2h>=0) emit_not(s2h,th);
2404                 else{
2405                   emit_loadreg(rs2[i]|64,th);
2406                   emit_not(th,th);
2407                 }
2408               }
2409               else {
2410                 emit_movimm(-1,tl);
2411                 emit_movimm(-1,th);
2412               }
2413             }
2414           }
2415         }
2416       }
2417       else
2418       {
2419         // 32 bit
2420         if(tl>=0) {
2421           s1l=get_reg(i_regs->regmap,rs1[i]);
2422           s2l=get_reg(i_regs->regmap,rs2[i]);
2423           if(rs1[i]&&rs2[i]) {
2424             assert(s1l>=0);
2425             assert(s2l>=0);
2426             if(opcode2[i]==0x24) { // AND
2427               emit_and(s1l,s2l,tl);
2428             } else
2429             if(opcode2[i]==0x25) { // OR
2430               emit_or(s1l,s2l,tl);
2431             } else
2432             if(opcode2[i]==0x26) { // XOR
2433               emit_xor(s1l,s2l,tl);
2434             } else
2435             if(opcode2[i]==0x27) { // NOR
2436               emit_or(s1l,s2l,tl);
2437               emit_not(tl,tl);
2438             }
2439           }
2440           else
2441           {
2442             if(opcode2[i]==0x24) { // AND
2443               emit_zeroreg(tl);
2444             } else
2445             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2446               if(rs1[i]){
2447                 if(s1l>=0) emit_mov(s1l,tl);
2448                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2449               }
2450               else
2451               if(rs2[i]){
2452                 if(s2l>=0) emit_mov(s2l,tl);
2453                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2454               }
2455               else emit_zeroreg(tl);
2456             } else
2457             if(opcode2[i]==0x27) { // NOR
2458               if(rs1[i]){
2459                 if(s1l>=0) emit_not(s1l,tl);
2460                 else {
2461                   emit_loadreg(rs1[i],tl);
2462                   emit_not(tl,tl);
2463                 }
2464               }
2465               else
2466               if(rs2[i]){
2467                 if(s2l>=0) emit_not(s2l,tl);
2468                 else {
2469                   emit_loadreg(rs2[i],tl);
2470                   emit_not(tl,tl);
2471                 }
2472               }
2473               else emit_movimm(-1,tl);
2474             }
2475           }
2476         }
2477       }
2478     }
2479   }
2480 }
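// Example of the 32-bit ALU path above: for addu $t2,$t0,$t1 with both
// sources already in host registers a single emit_add is generated; if
// one source is $zero the operation degenerates into a move (or a negate
// for SUBU), and if both are $zero the target is simply zeroed.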
2481
2482 void imm16_assemble(int i,struct regstat *i_regs)
2483 {
2484   if (opcode[i]==0x0f) { // LUI
2485     if(rt1[i]) {
2486       signed char t;
2487       t=get_reg(i_regs->regmap,rt1[i]);
2488       //assert(t>=0);
2489       if(t>=0) {
2490         if(!((i_regs->isconst>>t)&1))
2491           emit_movimm(imm[i]<<16,t);
2492       }
2493     }
2494   }
2495   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2496     if(rt1[i]) {
2497       signed char s,t;
2498       t=get_reg(i_regs->regmap,rt1[i]);
2499       s=get_reg(i_regs->regmap,rs1[i]);
2500       if(rs1[i]) {
2501         //assert(t>=0);
2502         //assert(s>=0);
2503         if(t>=0) {
2504           if(!((i_regs->isconst>>t)&1)) {
2505             if(s<0) {
2506               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2507               emit_addimm(t,imm[i],t);
2508             }else{
2509               if(!((i_regs->wasconst>>s)&1))
2510                 emit_addimm(s,imm[i],t);
2511               else
2512                 emit_movimm(constmap[i][s]+imm[i],t);
2513             }
2514           }
2515         }
2516       } else {
2517         if(t>=0) {
2518           if(!((i_regs->isconst>>t)&1))
2519             emit_movimm(imm[i],t);
2520         }
2521       }
2522     }
2523   }
2524   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2525     if(rt1[i]) {
2526       signed char sh,sl,th,tl;
2527       th=get_reg(i_regs->regmap,rt1[i]|64);
2528       tl=get_reg(i_regs->regmap,rt1[i]);
2529       sh=get_reg(i_regs->regmap,rs1[i]|64);
2530       sl=get_reg(i_regs->regmap,rs1[i]);
2531       if(tl>=0) {
2532         if(rs1[i]) {
2533           assert(sh>=0);
2534           assert(sl>=0);
2535           if(th>=0) {
2536             emit_addimm64_32(sh,sl,imm[i],th,tl);
2537           }
2538           else {
2539             emit_addimm(sl,imm[i],tl);
2540           }
2541         } else {
2542           emit_movimm(imm[i],tl);
2543           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2544         }
2545       }
2546     }
2547   }
2548   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2549     if(rt1[i]) {
2550       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2551       signed char sh,sl,t;
2552       t=get_reg(i_regs->regmap,rt1[i]);
2553       sh=get_reg(i_regs->regmap,rs1[i]|64);
2554       sl=get_reg(i_regs->regmap,rs1[i]);
2555       //assert(t>=0);
2556       if(t>=0) {
2557         if(rs1[i]>0) {
2558           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2559           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2560             if(opcode[i]==0x0a) { // SLTI
2561               if(sl<0) {
2562                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2563                 emit_slti32(t,imm[i],t);
2564               }else{
2565                 emit_slti32(sl,imm[i],t);
2566               }
2567             }
2568             else { // SLTIU
2569               if(sl<0) {
2570                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2571                 emit_sltiu32(t,imm[i],t);
2572               }else{
2573                 emit_sltiu32(sl,imm[i],t);
2574               }
2575             }
2576           }else{ // 64-bit
2577             assert(sl>=0);
2578             if(opcode[i]==0x0a) // SLTI
2579               emit_slti64_32(sh,sl,imm[i],t);
2580             else // SLTIU
2581               emit_sltiu64_32(sh,sl,imm[i],t);
2582           }
2583         }else{
2584           // SLTI(U) with r0 is just stupid,
2585           // nonetheless examples can be found
2586           if(opcode[i]==0x0a) // SLTI
2587             if(0<imm[i]) emit_movimm(1,t);
2588             else emit_zeroreg(t);
2589           else // SLTIU
2590           {
2591             if(imm[i]) emit_movimm(1,t);
2592             else emit_zeroreg(t);
2593           }
2594         }
2595       }
2596     }
2597   }
2598   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2599     if(rt1[i]) {
2600       signed char sh,sl,th,tl;
2601       th=get_reg(i_regs->regmap,rt1[i]|64);
2602       tl=get_reg(i_regs->regmap,rt1[i]);
2603       sh=get_reg(i_regs->regmap,rs1[i]|64);
2604       sl=get_reg(i_regs->regmap,rs1[i]);
2605       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2606         if(opcode[i]==0x0c) //ANDI
2607         {
2608           if(rs1[i]) {
2609             if(sl<0) {
2610               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2611               emit_andimm(tl,imm[i],tl);
2612             }else{
2613               if(!((i_regs->wasconst>>sl)&1))
2614                 emit_andimm(sl,imm[i],tl);
2615               else
2616                 emit_movimm(constmap[i][sl]&imm[i],tl);
2617             }
2618           }
2619           else
2620             emit_zeroreg(tl);
2621           if(th>=0) emit_zeroreg(th);
2622         }
2623         else
2624         {
2625           if(rs1[i]) {
2626             if(sl<0) {
2627               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2628             }
2629             if(th>=0) {
2630               if(sh<0) {
2631                 emit_loadreg(rs1[i]|64,th);
2632               }else{
2633                 emit_mov(sh,th);
2634               }
2635             }
2636             if(opcode[i]==0x0d) //ORI
2637             if(sl<0) {
2638               emit_orimm(tl,imm[i],tl);
2639             }else{
2640               if(!((i_regs->wasconst>>sl)&1))
2641                 emit_orimm(sl,imm[i],tl);
2642               else
2643                 emit_movimm(constmap[i][sl]|imm[i],tl);
2644             }
2645             if(opcode[i]==0x0e) //XORI
2646             if(sl<0) {
2647               emit_xorimm(tl,imm[i],tl);
2648             }else{
2649               if(!((i_regs->wasconst>>sl)&1))
2650                 emit_xorimm(sl,imm[i],tl);
2651               else
2652                 emit_movimm(constmap[i][sl]^imm[i],tl);
2653             }
2654           }
2655           else {
2656             emit_movimm(imm[i],tl);
2657             if(th>=0) emit_zeroreg(th);
2658           }
2659         }
2660       }
2661     }
2662   }
2663 }
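// Constant folding example: when the source of an ANDI is known constant
// (wasconst), the result is materialized directly with
// emit_movimm(constmap[i][sl]&imm[i],tl) instead of emitting an AND;
// ORI/XORI fold the same way, and ADDI/ADDIU folds via
// emit_movimm(constmap[i][s]+imm[i],t).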
2664
2665 void shiftimm_assemble(int i,struct regstat *i_regs)
2666 {
2667   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2668   {
2669     if(rt1[i]) {
2670       signed char s,t;
2671       t=get_reg(i_regs->regmap,rt1[i]);
2672       s=get_reg(i_regs->regmap,rs1[i]);
2673       //assert(t>=0);
2674       if(t>=0&&!((i_regs->isconst>>t)&1)){
2675         if(rs1[i]==0)
2676         {
2677           emit_zeroreg(t);
2678         }
2679         else
2680         {
2681           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2682           if(imm[i]) {
2683             if(opcode2[i]==0) // SLL
2684             {
2685               emit_shlimm(s<0?t:s,imm[i],t);
2686             }
2687             if(opcode2[i]==2) // SRL
2688             {
2689               emit_shrimm(s<0?t:s,imm[i],t);
2690             }
2691             if(opcode2[i]==3) // SRA
2692             {
2693               emit_sarimm(s<0?t:s,imm[i],t);
2694             }
2695           }else{
2696             // Shift by zero
2697             if(s>=0 && s!=t) emit_mov(s,t);
2698           }
2699         }
2700       }
2701       //emit_storereg(rt1[i],t); //DEBUG
2702     }
2703   }
2704   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2705   {
2706     if(rt1[i]) {
2707       signed char sh,sl,th,tl;
2708       th=get_reg(i_regs->regmap,rt1[i]|64);
2709       tl=get_reg(i_regs->regmap,rt1[i]);
2710       sh=get_reg(i_regs->regmap,rs1[i]|64);
2711       sl=get_reg(i_regs->regmap,rs1[i]);
2712       if(tl>=0) {
2713         if(rs1[i]==0)
2714         {
2715           emit_zeroreg(tl);
2716           if(th>=0) emit_zeroreg(th);
2717         }
2718         else
2719         {
2720           assert(sl>=0);
2721           assert(sh>=0);
2722           if(imm[i]) {
2723             if(opcode2[i]==0x38) // DSLL
2724             {
2725               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2726               emit_shlimm(sl,imm[i],tl);
2727             }
2728             if(opcode2[i]==0x3a) // DSRL
2729             {
2730               emit_shrdimm(sl,sh,imm[i],tl);
2731               if(th>=0) emit_shrimm(sh,imm[i],th);
2732             }
2733             if(opcode2[i]==0x3b) // DSRA
2734             {
2735               emit_shrdimm(sl,sh,imm[i],tl);
2736               if(th>=0) emit_sarimm(sh,imm[i],th);
2737             }
2738           }else{
2739             // Shift by zero
2740             if(sl!=tl) emit_mov(sl,tl);
2741             if(th>=0&&sh!=th) emit_mov(sh,th);
2742           }
2743         }
2744       }
2745     }
2746   }
2747   if(opcode2[i]==0x3c) // DSLL32
2748   {
2749     if(rt1[i]) {
2750       signed char sl,tl,th;
2751       tl=get_reg(i_regs->regmap,rt1[i]);
2752       th=get_reg(i_regs->regmap,rt1[i]|64);
2753       sl=get_reg(i_regs->regmap,rs1[i]);
2754       if(th>=0||tl>=0){
2755         assert(tl>=0);
2756         assert(th>=0);
2757         assert(sl>=0);
2758         emit_mov(sl,th);
2759         emit_zeroreg(tl);
2760         if(imm[i]>32)
2761         {
2762           emit_shlimm(th,imm[i]&31,th);
2763         }
2764       }
2765     }
2766   }
2767   if(opcode2[i]==0x3e) // DSRL32
2768   {
2769     if(rt1[i]) {
2770       signed char sh,tl,th;
2771       tl=get_reg(i_regs->regmap,rt1[i]);
2772       th=get_reg(i_regs->regmap,rt1[i]|64);
2773       sh=get_reg(i_regs->regmap,rs1[i]|64);
2774       if(tl>=0){
2775         assert(sh>=0);
2776         emit_mov(sh,tl);
2777         if(th>=0) emit_zeroreg(th);
2778         if(imm[i]>32)
2779         {
2780           emit_shrimm(tl,imm[i]&31,tl);
2781         }
2782       }
2783     }
2784   }
2785   if(opcode2[i]==0x3f) // DSRA32
2786   {
2787     if(rt1[i]) {
2788       signed char sh,tl;
2789       tl=get_reg(i_regs->regmap,rt1[i]);
2790       sh=get_reg(i_regs->regmap,rs1[i]|64);
2791       if(tl>=0){
2792         assert(sh>=0);
2793         emit_mov(sh,tl);
2794         if(imm[i]>32)
2795         {
2796           emit_sarimm(tl,imm[i]&31,tl);
2797         }
2798       }
2799     }
2800   }
2801 }
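// A shift by zero is emitted as a plain register move.  The DSLL32/
// DSRL32/DSRA32 forms move the source into the other half of the 64-bit
// pair and then apply an extra shift of imm[i]&31 when imm[i]>32.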
2802
2803 #ifndef shift_assemble
2804 void shift_assemble(int i,struct regstat *i_regs)
2805 {
2806   printf("Need shift_assemble for this architecture.\n");
2807   exit(1);
2808 }
2809 #endif
2810
2811 void load_assemble(int i,struct regstat *i_regs)
2812 {
2813   int s,th,tl,addr,map=-1;
2814   int offset;
2815   int jaddr=0;
2816   int memtarget=0,c=0;
2817   int fastload_reg_override=0;
2818   u_int hr,reglist=0;
2819   th=get_reg(i_regs->regmap,rt1[i]|64);
2820   tl=get_reg(i_regs->regmap,rt1[i]);
2821   s=get_reg(i_regs->regmap,rs1[i]);
2822   offset=imm[i];
2823   for(hr=0;hr<HOST_REGS;hr++) {
2824     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2825   }
2826   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2827   if(s>=0) {
2828     c=(i_regs->wasconst>>s)&1;
2829     if (c) {
2830       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2831       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2832     }
2833   }
2834   //printf("load_assemble: c=%d\n",c);
2835   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2836   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2837 #ifdef PCSX
2838   if(tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80)
2839     ||rt1[i]==0) {
2840       // could be FIFO, must perform the read
2841       // ||dummy read
2842       assem_debug("(forced read)\n");
2843       tl=get_reg(i_regs->regmap,-1);
2844       assert(tl>=0);
2845   }
2846 #endif
2847   if(offset||s<0||c) addr=tl;
2848   else addr=s;
2849   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2850  if(tl>=0) {
2851   //printf("load_assemble: c=%d\n",c);
2852   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2853   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2854   reglist&=~(1<<tl);
2855   if(th>=0) reglist&=~(1<<th);
2856   if(!using_tlb) {
2857     if(!c) {
2858       #ifdef RAM_OFFSET
2859       map=get_reg(i_regs->regmap,ROREG);
2860       if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2861       #endif
2862 //#define R29_HACK 1
2863       #ifdef R29_HACK
2864       // Strmnnrmn's speed hack
2865       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2866       #endif
2867       {
2868         jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
2869       }
2870     }
2871     else if(ram_offset&&memtarget) {
2872       emit_addimm(addr,ram_offset,HOST_TEMPREG);
2873       fastload_reg_override=HOST_TEMPREG;
2874     }
2875   }else{ // using tlb
2876     int x=0;
2877     if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2878     if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2879     map=get_reg(i_regs->regmap,TLREG);
2880     assert(map>=0);
2881     reglist&=~(1<<map);
2882     map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2883     do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2884   }
2885   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2886   if (opcode[i]==0x20) { // LB
2887     if(!c||memtarget) {
2888       if(!dummy) {
2889         #ifdef HOST_IMM_ADDR32
2890         if(c)
2891           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2892         else
2893         #endif
2894         {
2895           //emit_xorimm(addr,3,tl);
2896           //gen_tlb_addr_r(tl,map);
2897           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2898           int x=0,a=tl;
2899 #ifdef BIG_ENDIAN_MIPS
2900           if(!c) emit_xorimm(addr,3,tl);
2901           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2902 #else
2903           if(!c) a=addr;
2904 #endif
2905           if(fastload_reg_override) a=fastload_reg_override;
2906
2907           emit_movsbl_indexed_tlb(x,a,map,tl);
2908         }
2909       }
2910       if(jaddr)
2911         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2912     }
2913     else
2914       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2915   }
2916   if (opcode[i]==0x21) { // LH
2917     if(!c||memtarget) {
2918       if(!dummy) {
2919         #ifdef HOST_IMM_ADDR32
2920         if(c)
2921           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2922         else
2923         #endif
2924         {
2925           int x=0,a=tl;
2926 #ifdef BIG_ENDIAN_MIPS
2927           if(!c) emit_xorimm(addr,2,tl);
2928           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2929 #else
2930           if(!c) a=addr;
2931 #endif
2932           if(fastload_reg_override) a=fastload_reg_override;
2933           //#ifdef
2934           //emit_movswl_indexed_tlb(x,tl,map,tl);
2935           //else
2936           if(map>=0) {
2937             gen_tlb_addr_r(a,map);
2938             emit_movswl_indexed(x,a,tl);
2939           }else{
2940             #if 1 //def RAM_OFFSET
2941             emit_movswl_indexed(x,a,tl);
2942             #else
2943             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
2944             #endif
2945           }
2946         }
2947       }
2948       if(jaddr)
2949         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2950     }
2951     else
2952       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2953   }
2954   if (opcode[i]==0x23) { // LW
2955     if(!c||memtarget) {
2956       if(!dummy) {
2957         int a=addr;
2958         if(fastload_reg_override) a=fastload_reg_override;
2959         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2960         #ifdef HOST_IMM_ADDR32
2961         if(c)
2962           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2963         else
2964         #endif
2965         emit_readword_indexed_tlb(0,a,map,tl);
2966       }
2967       if(jaddr)
2968         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2969     }
2970     else
2971       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2972   }
2973   if (opcode[i]==0x24) { // LBU
2974     if(!c||memtarget) {
2975       if(!dummy) {
2976         #ifdef HOST_IMM_ADDR32
2977         if(c)
2978           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2979         else
2980         #endif
2981         {
2982           //emit_xorimm(addr,3,tl);
2983           //gen_tlb_addr_r(tl,map);
2984           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2985           int x=0,a=tl;
2986 #ifdef BIG_ENDIAN_MIPS
2987           if(!c) emit_xorimm(addr,3,tl);
2988           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2989 #else
2990           if(!c) a=addr;
2991 #endif
2992           if(fastload_reg_override) a=fastload_reg_override;
2993
2994           emit_movzbl_indexed_tlb(x,a,map,tl);
2995         }
2996       }
2997       if(jaddr)
2998         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2999     }
3000     else
3001       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3002   }
3003   if (opcode[i]==0x25) { // LHU
3004     if(!c||memtarget) {
3005       if(!dummy) {
3006         #ifdef HOST_IMM_ADDR32
3007         if(c)
3008           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
3009         else
3010         #endif
3011         {
3012           int x=0,a=tl;
3013 #ifdef BIG_ENDIAN_MIPS
3014           if(!c) emit_xorimm(addr,2,tl);
3015           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3016 #else
3017           if(!c) a=addr;
3018 #endif
3019           if(fastload_reg_override) a=fastload_reg_override;
3020           //#ifdef
3021           //emit_movzwl_indexed_tlb(x,tl,map,tl);
3022           //#else
3023           if(map>=0) {
3024             gen_tlb_addr_r(a,map);
3025             emit_movzwl_indexed(x,a,tl);
3026           }else{
3027             #if 1 //def RAM_OFFSET
3028             emit_movzwl_indexed(x,a,tl);
3029             #else
3030             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
3031             #endif
3032           }
3033         }
3034       }
3035       if(jaddr)
3036         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3037     }
3038     else
3039       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3040   }
3041   if (opcode[i]==0x27) { // LWU
3042     assert(th>=0);
3043     if(!c||memtarget) {
3044       if(!dummy) {
3045         int a=addr;
3046         if(fastload_reg_override) a=fastload_reg_override;
3047         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
3048         #ifdef HOST_IMM_ADDR32
3049         if(c)
3050           emit_readword_tlb(constmap[i][s]+offset,map,tl);
3051         else
3052         #endif
3053         emit_readword_indexed_tlb(0,a,map,tl);
3054       }
3055       if(jaddr)
3056         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3057     }
3058     else {
3059       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3060     }
3061     emit_zeroreg(th);
3062   }
3063   if (opcode[i]==0x37) { // LD
3064     if(!c||memtarget) {
3065       if(!dummy) {
3066         int a=addr;
3067         if(fastload_reg_override) a=fastload_reg_override;
3068         //gen_tlb_addr_r(tl,map);
3069         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
3070         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
3071         #ifdef HOST_IMM_ADDR32
3072         if(c)
3073           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3074         else
3075         #endif
3076         emit_readdword_indexed_tlb(0,a,map,th,tl);
3077       }
3078       if(jaddr)
3079         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3080     }
3081     else
3082       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3083   }
3084  }
3085   //emit_storereg(rt1[i],tl); // DEBUG
3086   //if(opcode[i]==0x23)
3087   //if(opcode[i]==0x24)
3088   //if(opcode[i]==0x23||opcode[i]==0x24)
3089   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
3090   {
3091     //emit_pusha();
3092     save_regs(0x100f);
3093         emit_readword((int)&last_count,ECX);
3094         #ifdef __i386__
3095         if(get_reg(i_regs->regmap,CCREG)<0)
3096           emit_loadreg(CCREG,HOST_CCREG);
3097         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3098         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3099         emit_writeword(HOST_CCREG,(int)&Count);
3100         #endif
3101         #ifdef __arm__
3102         if(get_reg(i_regs->regmap,CCREG)<0)
3103           emit_loadreg(CCREG,0);
3104         else
3105           emit_mov(HOST_CCREG,0);
3106         emit_add(0,ECX,0);
3107         emit_addimm(0,2*ccadj[i],0);
3108         emit_writeword(0,(int)&Count);
3109         #endif
3110     emit_call((int)memdebug);
3111     //emit_popa();
3112     restore_regs(0x100f);
3113   }/**/
3114 }
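// Load fast path, illustrated for LW: when the address is not a known
// constant, emit_fastpath_cmp_jump() emits the fast-path address check
// and returns the branch location, which add_stub() records as a
// LOADW_STUB slow path; a constant address inside RAM is read directly,
// and a constant address outside RAM goes through inline_readstub().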
3115
3116 #ifndef loadlr_assemble
3117 void loadlr_assemble(int i,struct regstat *i_regs)
3118 {
3119   printf("Need loadlr_assemble for this architecture.\n");
3120   exit(1);
3121 }
3122 #endif
3123
3124 void store_assemble(int i,struct regstat *i_regs)
3125 {
3126   int s,th,tl,map=-1;
3127   int addr,temp;
3128   int offset;
3129   int jaddr=0,jaddr2,type;
3130   int memtarget=0,c=0;
3131   int agr=AGEN1+(i&1);
3132   int faststore_reg_override=0;
3133   u_int hr,reglist=0;
3134   th=get_reg(i_regs->regmap,rs2[i]|64);
3135   tl=get_reg(i_regs->regmap,rs2[i]);
3136   s=get_reg(i_regs->regmap,rs1[i]);
3137   temp=get_reg(i_regs->regmap,agr);
3138   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3139   offset=imm[i];
3140   if(s>=0) {
3141     c=(i_regs->wasconst>>s)&1;
3142     if(c) {
3143       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3144       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3145     }
3146   }
3147   assert(tl>=0);
3148   assert(temp>=0);
3149   for(hr=0;hr<HOST_REGS;hr++) {
3150     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3151   }
3152   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3153   if(offset||s<0||c) addr=temp;
3154   else addr=s;
3155   if(!using_tlb) {
3156     if(!c) {
3157       #ifndef PCSX
3158       #ifdef R29_HACK
3159       // Strmnnrmn's speed hack
3160       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3161       #endif
3162       emit_cmpimm(addr,RAM_SIZE);
3163       #ifdef DESTRUCTIVE_SHIFT
3164       if(s==addr) emit_mov(s,temp);
3165       #endif
3166       #ifdef R29_HACK
3167       memtarget=1;
3168       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3169       #endif
3170       {
3171         jaddr=(int)out;
3172         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
3173         // Hint to branch predictor that the branch is unlikely to be taken
3174         if(rs1[i]>=28)
3175           emit_jno_unlikely(0);
3176         else
3177         #endif
3178         emit_jno(0);
3179       }
3180       #else
3181         jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
3182       #endif
3183     }
3184     else if(ram_offset&&memtarget) {
3185       emit_addimm(addr,ram_offset,HOST_TEMPREG);
3186       faststore_reg_override=HOST_TEMPREG;
3187     }
3188   }else{ // using tlb
3189     int x=0;
3190     if (opcode[i]==0x28) x=3; // SB
3191     if (opcode[i]==0x29) x=2; // SH
3192     map=get_reg(i_regs->regmap,TLREG);
3193     assert(map>=0);
3194     reglist&=~(1<<map);
3195     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
3196     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3197   }
3198
3199   if (opcode[i]==0x28) { // SB
3200     if(!c||memtarget) {
3201       int x=0,a=temp;
3202 #ifdef BIG_ENDIAN_MIPS
3203       if(!c) emit_xorimm(addr,3,temp);
3204       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3205 #else
3206       if(!c) a=addr;
3207 #endif
3208       if(faststore_reg_override) a=faststore_reg_override;
3209       //gen_tlb_addr_w(temp,map);
3210       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
3211       emit_writebyte_indexed_tlb(tl,x,a,map,a);
3212     }
3213     type=STOREB_STUB;
3214   }
3215   if (opcode[i]==0x29) { // SH
3216     if(!c||memtarget) {
3217       int x=0,a=temp;
3218 #ifdef BIG_ENDIAN_MIPS
3219       if(!c) emit_xorimm(addr,2,temp);
3220       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3221 #else
3222       if(!c) a=addr;
3223 #endif
3224       if(faststore_reg_override) a=faststore_reg_override;
3225       //#ifdef
3226       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
3227       //#else
3228       if(map>=0) {
3229         gen_tlb_addr_w(a,map);
3230         emit_writehword_indexed(tl,x,a);
3231       }else
3232         //emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
3233         emit_writehword_indexed(tl,x,a);
3234     }
3235     type=STOREH_STUB;
3236   }
3237   if (opcode[i]==0x2B) { // SW
3238     if(!c||memtarget) {
3239       int a=addr;
3240       if(faststore_reg_override) a=faststore_reg_override;
3241       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3242       emit_writeword_indexed_tlb(tl,0,a,map,temp);
3243     }
3244     type=STOREW_STUB;
3245   }
3246   if (opcode[i]==0x3F) { // SD
3247     if(!c||memtarget) {
3248       int a=addr;
3249       if(faststore_reg_override) a=faststore_reg_override;
3250       if(rs2[i]) {
3251         assert(th>=0);
3252         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3253         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3254         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
3255       }else{
3256         // Store zero
3257         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3258         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3259         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
3260       }
3261     }
3262     type=STORED_STUB;
3263   }
3264 #ifdef PCSX
3265   if(jaddr) {
3266     // PCSX store handlers don't check invcode again
3267     reglist|=1<<addr;
3268     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3269     jaddr=0;
3270   }
3271 #endif
3272   if(!using_tlb&&!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3273     if(!c||memtarget) {
3274       #ifdef DESTRUCTIVE_SHIFT
3275       // The x86 shift operation is 'destructive'; it overwrites the
3276       // source register, so we need to make a copy first and use that.
3277       addr=temp;
3278       #endif
3279       #if defined(HOST_IMM8)
3280       int ir=get_reg(i_regs->regmap,INVCP);
3281       assert(ir>=0);
3282       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3283       #else
3284       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3285       #endif
3286       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3287       emit_callne(invalidate_addr_reg[addr]);
3288       #else
3289       jaddr2=(int)out;
3290       emit_jne(0);
3291       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3292       #endif
3293     }
3294   }
3295   u_int addr_val=constmap[i][s]+offset;
3296   if(jaddr) {
3297     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3298   } else if(c&&!memtarget) {
3299     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
3300   }
3301   // basic detection of writes that modify the currently compiled block;
3302   // not looking back, as earlier code should already be in the MIPS i-cache
3303   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
3304     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
3305     assert(i_regs->regmap==regs[i].regmap); // not delay slot
3306     if(i_regs->regmap==regs[i].regmap) {
3307       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
3308       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
3309       emit_movimm(start+i*4+4,0);
3310       emit_writeword(0,(int)&pcaddr);
3311       emit_jmp((int)do_interrupt);
3312     }
3313   }
3314   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3315   //if(opcode[i]==0x2B || opcode[i]==0x28)
3316   //if(opcode[i]==0x2B || opcode[i]==0x29)
3317   //if(opcode[i]==0x2B)
3318   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3319   {
3320     #ifdef __i386__
3321     emit_pusha();
3322     #endif
3323     #ifdef __arm__
3324     save_regs(0x100f);
3325     #endif
3326         emit_readword((int)&last_count,ECX);
3327         #ifdef __i386__
3328         if(get_reg(i_regs->regmap,CCREG)<0)
3329           emit_loadreg(CCREG,HOST_CCREG);
3330         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3331         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3332         emit_writeword(HOST_CCREG,(int)&Count);
3333         #endif
3334         #ifdef __arm__
3335         if(get_reg(i_regs->regmap,CCREG)<0)
3336           emit_loadreg(CCREG,0);
3337         else
3338           emit_mov(HOST_CCREG,0);
3339         emit_add(0,ECX,0);
3340         emit_addimm(0,2*ccadj[i],0);
3341         emit_writeword(0,(int)&Count);
3342         #endif
3343     emit_call((int)memdebug);
3344     #ifdef __i386__
3345     emit_popa();
3346     #endif
3347     #ifdef __arm__
3348     restore_regs(0x100f);
3349     #endif
3350   }/**/
3351 }
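// Besides the store itself, two extra checks are emitted above: the
// invalid_code compare (INVCODE_STUB) that invalidates translated blocks
// when the written page contains code, and a compile-time check that
// jumps to do_interrupt when a constant-address store is known to hit
// the block currently being compiled.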
3352
3353 void storelr_assemble(int i,struct regstat *i_regs)
3354 {
3355   int s,th,tl;
3356   int temp;
3357   int temp2;
3358   int offset;
3359   int jaddr=0,jaddr2;
3360   int case1,case2,case3;
3361   int done0,done1,done2;
3362   int memtarget=0,c=0;
3363   int agr=AGEN1+(i&1);
3364   u_int hr,reglist=0;
3365   th=get_reg(i_regs->regmap,rs2[i]|64);
3366   tl=get_reg(i_regs->regmap,rs2[i]);
3367   s=get_reg(i_regs->regmap,rs1[i]);
3368   temp=get_reg(i_regs->regmap,agr);
3369   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3370   offset=imm[i];
3371   if(s>=0) {
3372     c=(i_regs->isconst>>s)&1;
3373     if(c) {
3374       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3375       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3376     }
3377   }
3378   assert(tl>=0);
3379   for(hr=0;hr<HOST_REGS;hr++) {
3380     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3381   }
3382   assert(temp>=0);
3383   if(!using_tlb) {
3384     if(!c) {
3385       emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3386       if(!offset&&s!=temp) emit_mov(s,temp);
3387       jaddr=(int)out;
3388       emit_jno(0);
3389     }
3390     else
3391     {
3392       if(!memtarget||!rs1[i]) {
3393         jaddr=(int)out;
3394         emit_jmp(0);
3395       }
3396     }
3397     #ifdef RAM_OFFSET
3398     int map=get_reg(i_regs->regmap,ROREG);
3399     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3400     gen_tlb_addr_w(temp,map);
3401     #else
3402     if((u_int)rdram!=0x80000000)
3403       emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3404     #endif
3405   }else{ // using tlb
3406     int map=get_reg(i_regs->regmap,TLREG);
3407     assert(map>=0);
3408     reglist&=~(1<<map);
3409     map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3410     if(!c&&!offset&&s>=0) emit_mov(s,temp);
3411     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3412     if(!jaddr&&!memtarget) {
3413       jaddr=(int)out;
3414       emit_jmp(0);
3415     }
3416     gen_tlb_addr_w(temp,map);
3417   }
3418
3419   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3420     temp2=get_reg(i_regs->regmap,FTEMP);
3421     if(!rs2[i]) temp2=th=tl;
3422   }
3423
3424 #ifndef BIG_ENDIAN_MIPS
3425     emit_xorimm(temp,3,temp);
3426 #endif
3427   emit_testimm(temp,2);
3428   case2=(int)out;
3429   emit_jne(0);
3430   emit_testimm(temp,1);
3431   case1=(int)out;
3432   emit_jne(0);
3433   // 0
3434   if (opcode[i]==0x2A) { // SWL
3435     emit_writeword_indexed(tl,0,temp);
3436   }
3437   if (opcode[i]==0x2E) { // SWR
3438     emit_writebyte_indexed(tl,3,temp);
3439   }
3440   if (opcode[i]==0x2C) { // SDL
3441     emit_writeword_indexed(th,0,temp);
3442     if(rs2[i]) emit_mov(tl,temp2);
3443   }
3444   if (opcode[i]==0x2D) { // SDR
3445     emit_writebyte_indexed(tl,3,temp);
3446     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3447   }
3448   done0=(int)out;
3449   emit_jmp(0);
3450   // 1
3451   set_jump_target(case1,(int)out);
3452   if (opcode[i]==0x2A) { // SWL
3453     // Write 3 msb into three least significant bytes
3454     if(rs2[i]) emit_rorimm(tl,8,tl);
3455     emit_writehword_indexed(tl,-1,temp);
3456     if(rs2[i]) emit_rorimm(tl,16,tl);
3457     emit_writebyte_indexed(tl,1,temp);
3458     if(rs2[i]) emit_rorimm(tl,8,tl);
3459   }
3460   if (opcode[i]==0x2E) { // SWR
3461     // Write two lsb into two most significant bytes
3462     emit_writehword_indexed(tl,1,temp);
3463   }
3464   if (opcode[i]==0x2C) { // SDL
3465     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3466     // Write 3 msb into three least significant bytes
3467     if(rs2[i]) emit_rorimm(th,8,th);
3468     emit_writehword_indexed(th,-1,temp);
3469     if(rs2[i]) emit_rorimm(th,16,th);
3470     emit_writebyte_indexed(th,1,temp);
3471     if(rs2[i]) emit_rorimm(th,8,th);
3472   }
3473   if (opcode[i]==0x2D) { // SDR
3474     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3475     // Write two lsb into two most significant bytes
3476     emit_writehword_indexed(tl,1,temp);
3477   }
3478   done1=(int)out;
3479   emit_jmp(0);
3480   // 2
3481   set_jump_target(case2,(int)out);
3482   emit_testimm(temp,1);
3483   case3=(int)out;
3484   emit_jne(0);
3485   if (opcode[i]==0x2A) { // SWL
3486     // Write two msb into two least significant bytes
3487     if(rs2[i]) emit_rorimm(tl,16,tl);
3488     emit_writehword_indexed(tl,-2,temp);
3489     if(rs2[i]) emit_rorimm(tl,16,tl);
3490   }
3491   if (opcode[i]==0x2E) { // SWR
3492     // Write 3 lsb into three most significant bytes
3493     emit_writebyte_indexed(tl,-1,temp);
3494     if(rs2[i]) emit_rorimm(tl,8,tl);
3495     emit_writehword_indexed(tl,0,temp);
3496     if(rs2[i]) emit_rorimm(tl,24,tl);
3497   }
3498   if (opcode[i]==0x2C) { // SDL
3499     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3500     // Write two msb into two least significant bytes
3501     if(rs2[i]) emit_rorimm(th,16,th);
3502     emit_writehword_indexed(th,-2,temp);
3503     if(rs2[i]) emit_rorimm(th,16,th);
3504   }
3505   if (opcode[i]==0x2D) { // SDR
3506     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3507     // Write 3 lsb into three most significant bytes
3508     emit_writebyte_indexed(tl,-1,temp);
3509     if(rs2[i]) emit_rorimm(tl,8,tl);
3510     emit_writehword_indexed(tl,0,temp);
3511     if(rs2[i]) emit_rorimm(tl,24,tl);
3512   }
3513   done2=(int)out;
3514   emit_jmp(0);
3515   // 3
3516   set_jump_target(case3,(int)out);
3517   if (opcode[i]==0x2A) { // SWL
3518     // Write msb into least significant byte
3519     if(rs2[i]) emit_rorimm(tl,24,tl);
3520     emit_writebyte_indexed(tl,-3,temp);
3521     if(rs2[i]) emit_rorimm(tl,8,tl);
3522   }
3523   if (opcode[i]==0x2E) { // SWR
3524     // Write entire word
3525     emit_writeword_indexed(tl,-3,temp);
3526   }
3527   if (opcode[i]==0x2C) { // SDL
3528     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3529     // Write msb into least significant byte
3530     if(rs2[i]) emit_rorimm(th,24,th);
3531     emit_writebyte_indexed(th,-3,temp);
3532     if(rs2[i]) emit_rorimm(th,8,th);
3533   }
3534   if (opcode[i]==0x2D) { // SDR
3535     if(rs2[i]) emit_mov(th,temp2);
3536     // Write entire word
3537     emit_writeword_indexed(tl,-3,temp);
3538   }
3539   set_jump_target(done0,(int)out);
3540   set_jump_target(done1,(int)out);
3541   set_jump_target(done2,(int)out);
3542   if (opcode[i]==0x2C) { // SDL
3543     emit_testimm(temp,4);
3544     done0=(int)out;
3545     emit_jne(0);
3546     emit_andimm(temp,~3,temp);
3547     emit_writeword_indexed(temp2,4,temp);
3548     set_jump_target(done0,(int)out);
3549   }
3550   if (opcode[i]==0x2D) { // SDR
3551     emit_testimm(temp,4);
3552     done0=(int)out;
3553     emit_jeq(0);
3554     emit_andimm(temp,~3,temp);
3555     emit_writeword_indexed(temp2,-4,temp);
3556     set_jump_target(done0,(int)out);
3557   }
3558   if(!c||!memtarget)
3559     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3560   if(!using_tlb&&!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3561     #ifdef RAM_OFFSET
3562     int map=get_reg(i_regs->regmap,ROREG);
3563     if(map<0) map=HOST_TEMPREG;
3564     gen_orig_addr_w(temp,map);
3565     #else
3566     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3567     #endif
3568     #if defined(HOST_IMM8)
3569     int ir=get_reg(i_regs->regmap,INVCP);
3570     assert(ir>=0);
3571     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3572     #else
3573     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3574     #endif
3575     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3576     emit_callne(invalidate_addr_reg[temp]);
3577     #else
3578     jaddr2=(int)out;
3579     emit_jne(0);
3580     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3581     #endif
3582   }
3583   /*
3584     emit_pusha();
3585     //save_regs(0x100f);
3586         emit_readword((int)&last_count,ECX);
3587         if(get_reg(i_regs->regmap,CCREG)<0)
3588           emit_loadreg(CCREG,HOST_CCREG);
3589         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3590         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3591         emit_writeword(HOST_CCREG,(int)&Count);
3592     emit_call((int)memdebug);
3593     emit_popa();
3594     //restore_regs(0x100f);
3595   /**/
3596 }
3597
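// Assemble LWC1/LDC1/SWC1/SDC1 (COP1 loads and stores).  Emits the COP1
// usable check, fetches the FPR address from reg_cop1_simple/reg_cop1_double,
// then does the memory access with the usual fast path and stub fallback.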
3598 void c1ls_assemble(int i,struct regstat *i_regs)
3599 {
3600 #ifndef DISABLE_COP1
3601   int s,th,tl;
3602   int temp,ar;
3603   int map=-1;
3604   int offset;
3605   int c=0;
3606   int jaddr,jaddr2=0,jaddr3,type;
3607   int agr=AGEN1+(i&1);
3608   u_int hr,reglist=0;
3609   th=get_reg(i_regs->regmap,FTEMP|64);
3610   tl=get_reg(i_regs->regmap,FTEMP);
3611   s=get_reg(i_regs->regmap,rs1[i]);
3612   temp=get_reg(i_regs->regmap,agr);
3613   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3614   offset=imm[i];
3615   assert(tl>=0);
3616   assert(rs1[i]>0);
3617   assert(temp>=0);
3618   for(hr=0;hr<HOST_REGS;hr++) {
3619     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3620   }
3621   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3622   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3623   {
3624     // Loads use a temporary register which we need to save
3625     reglist|=1<<temp;
3626   }
3627   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3628     ar=temp;
3629   else // LWC1/LDC1
3630     ar=tl;
3631   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3632   //else c=(i_regs->wasconst>>s)&1;
3633   if(s>=0) c=(i_regs->wasconst>>s)&1;
3634   // Check cop1 unusable
3635   if(!cop1_usable) {
3636     signed char rs=get_reg(i_regs->regmap,CSREG);
3637     assert(rs>=0);
3638     emit_testimm(rs,0x20000000);
3639     jaddr=(int)out;
3640     emit_jeq(0);
3641     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3642     cop1_usable=1;
3643   }
3644   if (opcode[i]==0x39) { // SWC1 (get float address)
3645     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3646   }
3647   if (opcode[i]==0x3D) { // SDC1 (get double address)
3648     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3649   }
3650   // Generate address + offset
3651   if(!using_tlb) {
3652     if(!c)
3653       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3654   }
3655   else
3656   {
3657     map=get_reg(i_regs->regmap,TLREG);
3658     assert(map>=0);
3659     reglist&=~(1<<map);
3660     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3661       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3662     }
3663     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3664       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3665     }
3666   }
3667   if (opcode[i]==0x39) { // SWC1 (read float)
3668     emit_readword_indexed(0,tl,tl);
3669   }
3670   if (opcode[i]==0x3D) { // SDC1 (read double)
3671     emit_readword_indexed(4,tl,th);
3672     emit_readword_indexed(0,tl,tl);
3673   }
3674   if (opcode[i]==0x31) { // LWC1 (get target address)
3675     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3676   }
3677   if (opcode[i]==0x35) { // LDC1 (get target address)
3678     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3679   }
3680   if(!using_tlb) {
3681     if(!c) {
3682       jaddr2=(int)out;
3683       emit_jno(0);
3684     }
3685     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80000000+RAM_SIZE) {
3686       jaddr2=(int)out;
3687       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3688     }
3689     #ifdef DESTRUCTIVE_SHIFT
3690     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3691       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3692     }
3693     #endif
3694   }else{
3695     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3696       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3697     }
3698     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3699       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3700     }
3701   }
3702   if (opcode[i]==0x31) { // LWC1
3703     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3704     //gen_tlb_addr_r(ar,map);
3705     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3706     #ifdef HOST_IMM_ADDR32
3707     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3708     else
3709     #endif
3710     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3711     type=LOADW_STUB;
3712   }
3713   if (opcode[i]==0x35) { // LDC1
3714     assert(th>=0);
3715     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3716     //gen_tlb_addr_r(ar,map);
3717     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3718     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3719     #ifdef HOST_IMM_ADDR32
3720     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3721     else
3722     #endif
3723     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3724     type=LOADD_STUB;
3725   }
3726   if (opcode[i]==0x39) { // SWC1
3727     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3728     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3729     type=STOREW_STUB;
3730   }
3731   if (opcode[i]==0x3D) { // SDC1
3732     assert(th>=0);
3733     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3734     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3735     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3736     type=STORED_STUB;
3737   }
3738   if(!using_tlb&&!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3739     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3740       #ifndef DESTRUCTIVE_SHIFT
3741       temp=offset||c||s<0?ar:s;
3742       #endif
3743       #if defined(HOST_IMM8)
3744       int ir=get_reg(i_regs->regmap,INVCP);
3745       assert(ir>=0);
3746       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3747       #else
3748       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3749       #endif
3750       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3751       emit_callne(invalidate_addr_reg[temp]);
3752       #else
3753       jaddr3=(int)out;
3754       emit_jne(0);
3755       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3756       #endif
3757     }
3758   }
3759   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
3760   if (opcode[i]==0x31) { // LWC1 (write float)
3761     emit_writeword_indexed(tl,0,temp);
3762   }
3763   if (opcode[i]==0x35) { // LDC1 (write double)
3764     emit_writeword_indexed(th,4,temp);
3765     emit_writeword_indexed(tl,0,temp);
3766   }
3767   //if(opcode[i]==0x39)
3768   /*if(opcode[i]==0x39||opcode[i]==0x31)
3769   {
3770     emit_pusha();
3771         emit_readword((int)&last_count,ECX);
3772         if(get_reg(i_regs->regmap,CCREG)<0)
3773           emit_loadreg(CCREG,HOST_CCREG);
3774         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3775         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3776         emit_writeword(HOST_CCREG,(int)&Count);
3777     emit_call((int)memdebug);
3778     emit_popa();
3779   }/**/
3780 #else
3781   cop1_unusable(i, i_regs);
3782 #endif
3783 }
3784
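// Assemble LWC2/SWC2 (GTE data register load/store, PSX only - no TLB).
// SWC2 fetches the GTE register with cop2_get_dreg before the store,
// LWC2 writes the loaded value back with cop2_put_dreg afterwards.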
3785 void c2ls_assemble(int i,struct regstat *i_regs)
3786 {
3787   int s,tl;
3788   int ar;
3789   int offset;
3790   int memtarget=0,c=0;
3791   int jaddr2=0,jaddr3,type;
3792   int agr=AGEN1+(i&1);
3793   int fastio_reg_override=0;
3794   u_int hr,reglist=0;
3795   u_int copr=(source[i]>>16)&0x1f;
3796   s=get_reg(i_regs->regmap,rs1[i]);
3797   tl=get_reg(i_regs->regmap,FTEMP);
3798   offset=imm[i];
3799   assert(rs1[i]>0);
3800   assert(tl>=0);
3801   assert(!using_tlb);
3802
3803   for(hr=0;hr<HOST_REGS;hr++) {
3804     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3805   }
3806   if(i_regs->regmap[HOST_CCREG]==CCREG)
3807     reglist&=~(1<<HOST_CCREG);
3808
3809   // get the address
3810   if (opcode[i]==0x3a) { // SWC2
3811     ar=get_reg(i_regs->regmap,agr);
3812     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3813     reglist|=1<<ar;
3814   } else { // LWC2
3815     ar=tl;
3816   }
3817   if(s>=0) c=(i_regs->wasconst>>s)&1;
3818   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3819   if (!offset&&!c&&s>=0) ar=s;
3820   assert(ar>=0);
3821
3822   if (opcode[i]==0x3a) { // SWC2
3823     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3824     type=STOREW_STUB;
3825   }
3826   else
3827     type=LOADW_STUB;
3828
3829   if(c&&!memtarget) {
3830     jaddr2=(int)out;
3831     emit_jmp(0); // inline_readstub/inline_writestub?
3832   }
3833   else {
3834     if(!c) {
3835       jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
3836     }
3837     else if(ram_offset&&memtarget) {
3838       emit_addimm(ar,ram_offset,HOST_TEMPREG);
3839       fastio_reg_override=HOST_TEMPREG;
3840     }
3841     if (opcode[i]==0x32) { // LWC2
3842       #ifdef HOST_IMM_ADDR32
3843       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3844       else
3845       #endif
3846       int a=ar;
3847       if(fastio_reg_override) a=fastio_reg_override;
3848       emit_readword_indexed(0,a,tl);
3849     }
3850     if (opcode[i]==0x3a) { // SWC2
3851       #ifdef DESTRUCTIVE_SHIFT
3852       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3853       #endif
3854       int a=ar;
3855       if(fastio_reg_override) a=fastio_reg_override;
3856       emit_writeword_indexed(tl,0,a);
3857     }
3858   }
3859   if(jaddr2)
3860     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3861   if(opcode[i]==0x3a) // SWC2
3862   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3863 #if defined(HOST_IMM8)
3864     int ir=get_reg(i_regs->regmap,INVCP);
3865     assert(ir>=0);
3866     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3867 #else
3868     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3869 #endif
3870     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3871     emit_callne(invalidate_addr_reg[ar]);
3872     #else
3873     jaddr3=(int)out;
3874     emit_jne(0);
3875     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3876     #endif
3877   }
3878   if (opcode[i]==0x32) { // LWC2
3879     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3880   }
3881 }
3882
3883 #ifndef multdiv_assemble
3884 void multdiv_assemble(int i,struct regstat *i_regs)
3885 {
3886   printf("Need multdiv_assemble for this architecture.\n");
3887   exit(1);
3888 }
3889 #endif
3890
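// Assemble MFHI/MFLO/MTHI/MTLO-style moves: copy the source register
// (and its upper half if mapped) to the target, loading from memory
// if the source is not currently in a host register.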
3891 void mov_assemble(int i,struct regstat *i_regs)
3892 {
3893   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3894   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3895   if(rt1[i]) {
3896     signed char sh,sl,th,tl;
3897     th=get_reg(i_regs->regmap,rt1[i]|64);
3898     tl=get_reg(i_regs->regmap,rt1[i]);
3899     //assert(tl>=0);
3900     if(tl>=0) {
3901       sh=get_reg(i_regs->regmap,rs1[i]|64);
3902       sl=get_reg(i_regs->regmap,rs1[i]);
3903       if(sl>=0) emit_mov(sl,tl);
3904       else emit_loadreg(rs1[i],tl);
3905       if(th>=0) {
3906         if(sh>=0) emit_mov(sh,th);
3907         else emit_loadreg(rs1[i]|64,th);
3908       }
3909     }
3910   }
3911 }
3912
3913 #ifndef fconv_assemble
3914 void fconv_assemble(int i,struct regstat *i_regs)
3915 {
3916   printf("Need fconv_assemble for this architecture.\n");
3917   exit(1);
3918 }
3919 #endif
3920
3921 #if 0
3922 void float_assemble(int i,struct regstat *i_regs)
3923 {
3924   printf("Need float_assemble for this architecture.\n");
3925   exit(1);
3926 }
3927 #endif
3928
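// SYSCALL, HLE call and interpreter-call exits: load the PC into a
// register, add the accumulated cycles to the cycle count, and jump out
// of the translated code to jump_syscall_hle / jump_hlecall / jump_intcall.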
3929 void syscall_assemble(int i,struct regstat *i_regs)
3930 {
3931   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3932   assert(ccreg==HOST_CCREG);
3933   assert(!is_delayslot);
3934   emit_movimm(start+i*4,EAX); // Get PC
3935   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3936   emit_jmp((int)jump_syscall_hle); // XXX
3937 }
3938
3939 void hlecall_assemble(int i,struct regstat *i_regs)
3940 {
3941   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3942   assert(ccreg==HOST_CCREG);
3943   assert(!is_delayslot);
3944   emit_movimm(start+i*4+4,0); // Get PC
3945   emit_movimm((int)psxHLEt[source[i]&7],1);
3946   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
3947   emit_jmp((int)jump_hlecall);
3948 }
3949
3950 void intcall_assemble(int i,struct regstat *i_regs)
3951 {
3952   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3953   assert(ccreg==HOST_CCREG);
3954   assert(!is_delayslot);
3955   emit_movimm(start+i*4,0); // Get PC
3956   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
3957   emit_jmp((int)jump_intcall);
3958 }
3959
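// Assemble the instruction in a branch delay slot.  Sets is_delayslot so
// stub/exception paths know they are in a delay slot, then dispatches on
// the instruction type like the main assembly loop does.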
3960 void ds_assemble(int i,struct regstat *i_regs)
3961 {
3962   speculate_register_values(i);
3963   is_delayslot=1;
3964   switch(itype[i]) {
3965     case ALU:
3966       alu_assemble(i,i_regs);break;
3967     case IMM16:
3968       imm16_assemble(i,i_regs);break;
3969     case SHIFT:
3970       shift_assemble(i,i_regs);break;
3971     case SHIFTIMM:
3972       shiftimm_assemble(i,i_regs);break;
3973     case LOAD:
3974       load_assemble(i,i_regs);break;
3975     case LOADLR:
3976       loadlr_assemble(i,i_regs);break;
3977     case STORE:
3978       store_assemble(i,i_regs);break;
3979     case STORELR:
3980       storelr_assemble(i,i_regs);break;
3981     case COP0:
3982       cop0_assemble(i,i_regs);break;
3983     case COP1:
3984       cop1_assemble(i,i_regs);break;
3985     case C1LS:
3986       c1ls_assemble(i,i_regs);break;
3987     case COP2:
3988       cop2_assemble(i,i_regs);break;
3989     case C2LS:
3990       c2ls_assemble(i,i_regs);break;
3991     case C2OP:
3992       c2op_assemble(i,i_regs);break;
3993     case FCONV:
3994       fconv_assemble(i,i_regs);break;
3995     case FLOAT:
3996       float_assemble(i,i_regs);break;
3997     case FCOMP:
3998       fcomp_assemble(i,i_regs);break;
3999     case MULTDIV:
4000       multdiv_assemble(i,i_regs);break;
4001     case MOV:
4002       mov_assemble(i,i_regs);break;
4003     case SYSCALL:
4004     case HLECALL:
4005     case INTCALL:
4006     case SPAN:
4007     case UJUMP:
4008     case RJUMP:
4009     case CJUMP:
4010     case SJUMP:
4011     case FJUMP:
4012       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
4013   }
4014   is_delayslot=0;
4015 }
4016
4017 // Is the branch target a valid internal jump?
4018 int internal_branch(uint64_t i_is32,int addr)
4019 {
4020   if(addr&1) return 0; // Indirect (register) jump
4021   if(addr>=start && addr<start+slen*4-4)
4022   {
4023     int t=(addr-start)>>2;
4024     // Delay slots are not valid branch targets
4025     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4026     // 64 -> 32 bit transition requires a recompile
4027     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
4028     {
4029       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
4030       else printf("optimizable: yes\n");
4031     }*/
4032     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4033 #ifndef FORCE32
4034     if(requires_32bit[t]&~i_is32) return 0;
4035     else
4036 #endif
4037       return 1;
4038   }
4039   return 0;
4040 }
4041
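// Write back dirty values held in host registers whose mapping is about
// to change: values with no place in the new mapping are stored to memory
// (unless unneeded), values that simply move to another host register are
// copied with emit_mov.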
4042 #ifndef wb_invalidate
4043 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
4044   uint64_t u,uint64_t uu)
4045 {
4046   int hr;
4047   for(hr=0;hr<HOST_REGS;hr++) {
4048     if(hr!=EXCLUDE_REG) {
4049       if(pre[hr]!=entry[hr]) {
4050         if(pre[hr]>=0) {
4051           if((dirty>>hr)&1) {
4052             if(get_reg(entry,pre[hr])<0) {
4053               if(pre[hr]<64) {
4054                 if(!((u>>pre[hr])&1)) {
4055                   emit_storereg(pre[hr],hr);
4056                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
4057                     emit_sarimm(hr,31,hr);
4058                     emit_storereg(pre[hr]|64,hr);
4059                   }
4060                 }
4061               }else{
4062                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
4063                   emit_storereg(pre[hr],hr);
4064                 }
4065               }
4066             }
4067           }
4068         }
4069       }
4070     }
4071   }
4072   // Move from one register to another (no writeback)
4073   for(hr=0;hr<HOST_REGS;hr++) {
4074     if(hr!=EXCLUDE_REG) {
4075       if(pre[hr]!=entry[hr]) {
4076         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
4077           int nr;
4078           if((nr=get_reg(entry,pre[hr]))>=0) {
4079             emit_mov(hr,nr);
4080           }
4081         }
4082       }
4083     }
4084   }
4085 }
4086 #endif
4087
4088 // Load the specified registers
4089 // This only loads the registers given as arguments because
4090 // we don't want to load things that will be overwritten
4091 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
4092 {
4093   int hr;
4094   // Load 32-bit regs
4095   for(hr=0;hr<HOST_REGS;hr++) {
4096     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4097       if(entry[hr]!=regmap[hr]) {
4098         if(regmap[hr]==rs1||regmap[hr]==rs2)
4099         {
4100           if(regmap[hr]==0) {
4101             emit_zeroreg(hr);
4102           }
4103           else
4104           {
4105             emit_loadreg(regmap[hr],hr);
4106           }
4107         }
4108       }
4109     }
4110   }
4111   // Load 64-bit regs
4112   for(hr=0;hr<HOST_REGS;hr++) {
4113     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4114       if(entry[hr]!=regmap[hr]) {
4115         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
4116         {
4117           assert(regmap[hr]!=64);
4118           if((is32>>(regmap[hr]&63))&1) {
4119             int lr=get_reg(regmap,regmap[hr]-64);
4120             if(lr>=0)
4121               emit_sarimm(lr,31,hr);
4122             else
4123               emit_loadreg(regmap[hr],hr);
4124           }
4125           else
4126           {
4127             emit_loadreg(regmap[hr],hr);
4128           }
4129         }
4130       }
4131     }
4132   }
4133 }
4134
4135 // Load registers prior to the start of a loop
4136 // so that they are not loaded within the loop
4137 static void loop_preload(signed char pre[],signed char entry[])
4138 {
4139   int hr;
4140   for(hr=0;hr<HOST_REGS;hr++) {
4141     if(hr!=EXCLUDE_REG) {
4142       if(pre[hr]!=entry[hr]) {
4143         if(entry[hr]>=0) {
4144           if(get_reg(pre,entry[hr])<0) {
4145             assem_debug("loop preload:\n");
4146             //printf("loop preload: %d\n",hr);
4147             if(entry[hr]==0) {
4148               emit_zeroreg(hr);
4149             }
4150             else if(entry[hr]<TEMPREG)
4151             {
4152               emit_loadreg(entry[hr],hr);
4153             }
4154             else if(entry[hr]-64<TEMPREG)
4155             {
4156               emit_loadreg(entry[hr],hr);
4157             }
4158           }
4159         }
4160       }
4161     }
4162   }
4163 }
4164
4165 // Generate address for load/store instruction
4166 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
4167 void address_generation(int i,struct regstat *i_regs,signed char entry[])
4168 {
4169   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
4170     int ra=-1;
4171     int agr=AGEN1+(i&1);
4172     int mgr=MGEN1+(i&1);
4173     if(itype[i]==LOAD) {
4174       ra=get_reg(i_regs->regmap,rt1[i]);
4175       if(ra<0) ra=get_reg(i_regs->regmap,-1);
4176       assert(ra>=0);
4177     }
4178     if(itype[i]==LOADLR) {
4179       ra=get_reg(i_regs->regmap,FTEMP);
4180     }
4181     if(itype[i]==STORE||itype[i]==STORELR) {
4182       ra=get_reg(i_regs->regmap,agr);
4183       if(ra<0) ra=get_reg(i_regs->regmap,-1);
4184     }
4185     if(itype[i]==C1LS||itype[i]==C2LS) {
4186       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
4187         ra=get_reg(i_regs->regmap,FTEMP);
4188       else { // SWC1/SDC1/SWC2/SDC2
4189         ra=get_reg(i_regs->regmap,agr);
4190         if(ra<0) ra=get_reg(i_regs->regmap,-1);
4191       }
4192     }
4193     int rs=get_reg(i_regs->regmap,rs1[i]);
4194     int rm=get_reg(i_regs->regmap,TLREG);
4195     if(ra>=0) {
4196       int offset=imm[i];
4197       int c=(i_regs->wasconst>>rs)&1;
4198       if(rs1[i]==0) {
4199         // Using r0 as a base address
4200         /*if(rm>=0) {
4201           if(!entry||entry[rm]!=mgr) {
4202             generate_map_const(offset,rm);
4203           } // else did it in the previous cycle
4204         }*/
4205         if(!entry||entry[ra]!=agr) {
4206           if (opcode[i]==0x22||opcode[i]==0x26) {
4207             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4208           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4209             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4210           }else{
4211             emit_movimm(offset,ra);
4212           }
4213         } // else did it in the previous cycle
4214       }
4215       else if(rs<0) {
4216         if(!entry||entry[ra]!=rs1[i])
4217           emit_loadreg(rs1[i],ra);
4218         //if(!entry||entry[ra]!=rs1[i])
4219         //  printf("poor load scheduling!\n");
4220       }
4221       else if(c) {
4222 #ifndef DISABLE_TLB
4223         if(rm>=0) {
4224           if(!entry||entry[rm]!=mgr) {
4225             if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a) {
4226               // Stores to memory go thru the mapper to detect self-modifying
4227               // code, loads don't.
4228               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
4229                  (unsigned int)(constmap[i][rs]+offset)<0x80000000+RAM_SIZE )
4230                 generate_map_const(constmap[i][rs]+offset,rm);
4231             }else{
4232               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
4233                 generate_map_const(constmap[i][rs]+offset,rm);
4234             }
4235           }
4236         }
4237 #endif
4238         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
4239           if(!entry||entry[ra]!=agr) {
4240             if (opcode[i]==0x22||opcode[i]==0x26) {
4241               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4242             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4243               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4244             }else{
4245               #ifdef HOST_IMM_ADDR32
4246               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4247                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
4248               #endif
4249               emit_movimm(constmap[i][rs]+offset,ra);
4250               regs[i].loadedconst|=1<<ra;
4251             }
4252           } // else did it in the previous cycle
4253         } // else load_consts already did it
4254       }
4255       if(offset&&!c&&rs1[i]) {
4256         if(rs>=0) {
4257           emit_addimm(rs,offset,ra);
4258         }else{
4259           emit_addimm(ra,offset,ra);
4260         }
4261       }
4262     }
4263   }
4264   // Preload constants for next instruction
4265   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
4266     int agr,ra;
4267     #if !defined(HOST_IMM_ADDR32) && !defined(DISABLE_TLB)
4268     // Mapper entry
4269     agr=MGEN1+((i+1)&1);
4270     ra=get_reg(i_regs->regmap,agr);
4271     if(ra>=0) {
4272       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4273       int offset=imm[i+1];
4274       int c=(regs[i+1].wasconst>>rs)&1;
4275       if(c) {
4276         if(itype[i+1]==STORE||itype[i+1]==STORELR
4277            ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1, SWC2/SDC2
4278           // Stores to memory go thru the mapper to detect self-modifying
4279           // code, loads don't.
4280           if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 ||
4281              (unsigned int)(constmap[i+1][rs]+offset)<0x80000000+RAM_SIZE )
4282             generate_map_const(constmap[i+1][rs]+offset,ra);
4283         }else{
4284           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
4285             generate_map_const(constmap[i+1][rs]+offset,ra);
4286         }
4287       }
4288       /*else if(rs1[i]==0) {
4289         generate_map_const(offset,ra);
4290       }*/
4291     }
4292     #endif
4293     // Actual address
4294     agr=AGEN1+((i+1)&1);
4295     ra=get_reg(i_regs->regmap,agr);
4296     if(ra>=0) {
4297       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4298       int offset=imm[i+1];
4299       int c=(regs[i+1].wasconst>>rs)&1;
4300       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
4301         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4302           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4303         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4304           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4305         }else{
4306           #ifdef HOST_IMM_ADDR32
4307           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4308              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
4309           #endif
4310           emit_movimm(constmap[i+1][rs]+offset,ra);
4311           regs[i+1].loadedconst|=1<<ra;
4312         }
4313       }
4314       else if(rs1[i+1]==0) {
4315         // Using r0 as a base address
4316         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4317           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4318         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4319           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4320         }else{
4321           emit_movimm(offset,ra);
4322         }
4323       }
4324     }
4325   }
4326 }
4327
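// Determine the constant that host register hr should finally contain.
// Scans forward while the register keeps the same constant mapping so the
// value only has to be loaded once, possibly folding in the offset of an
// upcoming load (address precomputation).  Returns 0 if the constant
// doesn't need to be loaded at all.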
4328 int get_final_value(int hr, int i, int *value)
4329 {
4330   int reg=regs[i].regmap[hr];
4331   while(i<slen-1) {
4332     if(regs[i+1].regmap[hr]!=reg) break;
4333     if(!((regs[i+1].isconst>>hr)&1)) break;
4334     if(bt[i+1]) break;
4335     i++;
4336   }
4337   if(i<slen-1) {
4338     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
4339       *value=constmap[i][hr];
4340       return 1;
4341     }
4342     if(!bt[i+1]) {
4343       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
4344         // Load in delay slot, out-of-order execution
4345         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
4346         {
4347           #ifdef HOST_IMM_ADDR32
4348           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
4349           #endif
4350           // Precompute load address
4351           *value=constmap[i][hr]+imm[i+2];
4352           return 1;
4353         }
4354       }
4355       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
4356       {
4357         #ifdef HOST_IMM_ADDR32
4358         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
4359         #endif
4360         // Precompute load address
4361         *value=constmap[i][hr]+imm[i+1];
4362         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
4363         return 1;
4364       }
4365     }
4366   }
4367   *value=constmap[i][hr];
4368   //printf("c=%x\n",(int)constmap[i][hr]);
4369   if(i==slen-1) return 1;
4370   if(reg<64) {
4371     return !((unneeded_reg[i+1]>>reg)&1);
4372   }else{
4373     return !((unneeded_reg_upper[i+1]>>reg)&1);
4374   }
4375 }
4376
4377 // Load registers with known constants
4378 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
4379 {
4380   int hr,hr2;
4381   // propagate loaded constant flags
4382   if(i==0||bt[i])
4383     regs[i].loadedconst=0;
4384   else {
4385     for(hr=0;hr<HOST_REGS;hr++) {
4386       if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
4387          &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
4388       {
4389         regs[i].loadedconst|=1<<hr;
4390       }
4391     }
4392   }
4393   // Load 32-bit regs
4394   for(hr=0;hr<HOST_REGS;hr++) {
4395     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4396       //if(entry[hr]!=regmap[hr]) {
4397       if(!((regs[i].loadedconst>>hr)&1)) {
4398         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4399           int value,similar=0;
4400           if(get_final_value(hr,i,&value)) {
4401             // see if some other register has similar value
4402             for(hr2=0;hr2<HOST_REGS;hr2++) {
4403               if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
4404                 if(is_similar_value(value,constmap[i][hr2])) {
4405                   similar=1;
4406                   break;
4407                 }
4408               }
4409             }
4410             if(similar) {
4411               int value2;
4412               if(get_final_value(hr2,i,&value2)) // is this needed?
4413                 emit_movimm_from(value2,hr2,value,hr);
4414               else
4415                 emit_movimm(value,hr);
4416             }
4417             else if(value==0) {
4418               emit_zeroreg(hr);
4419             }
4420             else {
4421               emit_movimm(value,hr);
4422             }
4423           }
4424           regs[i].loadedconst|=1<<hr;
4425         }
4426       }
4427     }
4428   }
4429   // Load 64-bit regs
4430   for(hr=0;hr<HOST_REGS;hr++) {
4431     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4432       //if(entry[hr]!=regmap[hr]) {
4433       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4434         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4435           if((is32>>(regmap[hr]&63))&1) {
4436             int lr=get_reg(regmap,regmap[hr]-64);
4437             assert(lr>=0);
4438             emit_sarimm(lr,31,hr);
4439           }
4440           else
4441           {
4442             int value;
4443             if(get_final_value(hr,i,&value)) {
4444               if(value==0) {
4445                 emit_zeroreg(hr);
4446               }
4447               else {
4448                 emit_movimm(value,hr);
4449               }
4450             }
4451           }
4452         }
4453       }
4454     }
4455   }
4456 }
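// Load constants into all dirty host registers that hold known constants
// (used together with wb_dirtys when the full register state must be made
// current, e.g. on the self-modifying code exit path of the stores above).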
4457 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
4458 {
4459   int hr;
4460   // Load 32-bit regs
4461   for(hr=0;hr<HOST_REGS;hr++) {
4462     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4463       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4464         int value=constmap[i][hr];
4465         if(value==0) {
4466           emit_zeroreg(hr);
4467         }
4468         else {
4469           emit_movimm(value,hr);
4470         }
4471       }
4472     }
4473   }
4474   // Load 64-bit regs
4475   for(hr=0;hr<HOST_REGS;hr++) {
4476     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4477       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4478         if((is32>>(regmap[hr]&63))&1) {
4479           int lr=get_reg(regmap,regmap[hr]-64);
4480           assert(lr>=0);
4481           emit_sarimm(lr,31,hr);
4482         }
4483         else
4484         {
4485           int value=constmap[i][hr];
4486           if(value==0) {
4487             emit_zeroreg(hr);
4488           }
4489           else {
4490             emit_movimm(value,hr);
4491           }
4492         }
4493       }
4494     }
4495   }
4496 }
4497
4498 // Write out all dirty registers (except cycle count)
4499 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4500 {
4501   int hr;
4502   for(hr=0;hr<HOST_REGS;hr++) {
4503     if(hr!=EXCLUDE_REG) {
4504       if(i_regmap[hr]>0) {
4505         if(i_regmap[hr]!=CCREG) {
4506           if((i_dirty>>hr)&1) {
4507             if(i_regmap[hr]<64) {
4508               emit_storereg(i_regmap[hr],hr);
4509 #ifndef FORCE32
4510               if( ((i_is32>>i_regmap[hr])&1) ) {
4511                 #ifdef DESTRUCTIVE_WRITEBACK
4512                 emit_sarimm(hr,31,hr);
4513                 emit_storereg(i_regmap[hr]|64,hr);
4514                 #else
4515                 emit_sarimm(hr,31,HOST_TEMPREG);
4516                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4517                 #endif
4518               }
4519 #endif
4520             }else{
4521               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4522                 emit_storereg(i_regmap[hr],hr);
4523               }
4524             }
4525           }
4526         }
4527       }
4528     }
4529   }
4530 }
4531 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4532 // This writes the registers not written by store_regs_bt
4533 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4534 {
4535   int hr;
4536   int t=(addr-start)>>2;
4537   for(hr=0;hr<HOST_REGS;hr++) {
4538     if(hr!=EXCLUDE_REG) {
4539       if(i_regmap[hr]>0) {
4540         if(i_regmap[hr]!=CCREG) {
4541           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4542             if((i_dirty>>hr)&1) {
4543               if(i_regmap[hr]<64) {
4544                 emit_storereg(i_regmap[hr],hr);
4545 #ifndef FORCE32
4546                 if( ((i_is32>>i_regmap[hr])&1) ) {
4547                   #ifdef DESTRUCTIVE_WRITEBACK
4548                   emit_sarimm(hr,31,hr);
4549                   emit_storereg(i_regmap[hr]|64,hr);
4550                   #else
4551                   emit_sarimm(hr,31,HOST_TEMPREG);
4552                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4553                   #endif
4554                 }
4555 #endif
4556               }else{
4557                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4558                   emit_storereg(i_regmap[hr],hr);
4559                 }
4560               }
4561             }
4562           }
4563         }
4564       }
4565     }
4566   }
4567 }
4568
4569 // Load all registers (except cycle count)
4570 void load_all_regs(signed char i_regmap[])
4571 {
4572   int hr;
4573   for(hr=0;hr<HOST_REGS;hr++) {
4574     if(hr!=EXCLUDE_REG) {
4575       if(i_regmap[hr]==0) {
4576         emit_zeroreg(hr);
4577       }
4578       else
4579       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4580       {
4581         emit_loadreg(i_regmap[hr],hr);
4582       }
4583     }
4584   }
4585 }
4586
4587 // Load all current registers also needed by next instruction
4588 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4589 {
4590   int hr;
4591   for(hr=0;hr<HOST_REGS;hr++) {
4592     if(hr!=EXCLUDE_REG) {
4593       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4594         if(i_regmap[hr]==0) {
4595           emit_zeroreg(hr);
4596         }
4597         else
4598         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4599         {
4600           emit_loadreg(i_regmap[hr],hr);
4601         }
4602       }
4603     }
4604   }
4605 }
4606
4607 // Load all regs, storing cycle count if necessary
4608 void load_regs_entry(int t)
4609 {
4610   int hr;
4611   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
4612   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
4613   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4614     emit_storereg(CCREG,HOST_CCREG);
4615   }
4616   // Load 32-bit regs
4617   for(hr=0;hr<HOST_REGS;hr++) {
4618     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4619       if(regs[t].regmap_entry[hr]==0) {
4620         emit_zeroreg(hr);
4621       }
4622       else if(regs[t].regmap_entry[hr]!=CCREG)
4623       {
4624         emit_loadreg(regs[t].regmap_entry[hr],hr);
4625       }
4626     }
4627   }
4628   // Load 64-bit regs
4629   for(hr=0;hr<HOST_REGS;hr++) {
4630     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4631       assert(regs[t].regmap_entry[hr]!=64);
4632       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4633         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4634         if(lr<0) {
4635           emit_loadreg(regs[t].regmap_entry[hr],hr);
4636         }
4637         else
4638         {
4639           emit_sarimm(lr,31,hr);
4640         }
4641       }
4642       else
4643       {
4644         emit_loadreg(regs[t].regmap_entry[hr],hr);
4645       }
4646     }
4647   }
4648 }
4649
4650 // Store dirty registers prior to branch
4651 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4652 {
4653   if(internal_branch(i_is32,addr))
4654   {
4655     int t=(addr-start)>>2;
4656     int hr;
4657     for(hr=0;hr<HOST_REGS;hr++) {
4658       if(hr!=EXCLUDE_REG) {
4659         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4660           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4661             if((i_dirty>>hr)&1) {
4662               if(i_regmap[hr]<64) {
4663                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4664                   emit_storereg(i_regmap[hr],hr);
4665                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4666                     #ifdef DESTRUCTIVE_WRITEBACK
4667                     emit_sarimm(hr,31,hr);
4668                     emit_storereg(i_regmap[hr]|64,hr);
4669                     #else
4670                     emit_sarimm(hr,31,HOST_TEMPREG);
4671                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4672                     #endif
4673                   }
4674                 }
4675               }else{
4676                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4677                   emit_storereg(i_regmap[hr],hr);
4678                 }
4679               }
4680             }
4681           }
4682         }
4683       }
4684     }
4685   }
4686   else
4687   {
4688     // Branch out of this block, write out all dirty regs
4689     wb_dirtys(i_regmap,i_is32,i_dirty);
4690   }
4691 }
4692
4693 // Load all needed registers for branch target
4694 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4695 {
4696   //if(addr>=start && addr<(start+slen*4))
4697   if(internal_branch(i_is32,addr))
4698   {
4699     int t=(addr-start)>>2;
4700     int hr;
4701     // Store the cycle count before loading something else
4702     if(i_regmap[HOST_CCREG]!=CCREG) {
4703       assert(i_regmap[HOST_CCREG]==-1);
4704     }
4705     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4706       emit_storereg(CCREG,HOST_CCREG);
4707     }
4708     // Load 32-bit regs
4709     for(hr=0;hr<HOST_REGS;hr++) {
4710       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4711         #ifdef DESTRUCTIVE_WRITEBACK
4712         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4713         #else
4714         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4715         #endif
4716           if(regs[t].regmap_entry[hr]==0) {
4717             emit_zeroreg(hr);
4718           }
4719           else if(regs[t].regmap_entry[hr]!=CCREG)
4720           {
4721             emit_loadreg(regs[t].regmap_entry[hr],hr);
4722           }
4723         }
4724       }
4725     }
4726     // Load 64-bit regs
4727     for(hr=0;hr<HOST_REGS;hr++) {
4728       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4729         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4730           assert(regs[t].regmap_entry[hr]!=64);
4731           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4732             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4733             if(lr<0) {
4734               emit_loadreg(regs[t].regmap_entry[hr],hr);
4735             }
4736             else
4737             {
4738               emit_sarimm(lr,31,hr);
4739             }
4740           }
4741           else
4742           {
4743             emit_loadreg(regs[t].regmap_entry[hr],hr);
4744           }
4745         }
4746         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4747           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4748           assert(lr>=0);
4749           emit_sarimm(lr,31,hr);
4750         }
4751       }
4752     }
4753   }
4754 }
4755
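// Can we jump directly to the compiled code at addr?  Returns 1 if the
// current register mapping and dirty state are compatible with the state
// expected at the target; for targets outside this block everything except
// the cycle count must already be written back.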
4756 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4757 {
4758   if(addr>=start && addr<start+slen*4-4)
4759   {
4760     int t=(addr-start)>>2;
4761     int hr;
4762     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4763     for(hr=0;hr<HOST_REGS;hr++)
4764     {
4765       if(hr!=EXCLUDE_REG)
4766       {
4767         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4768         {
4769           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4770           {
4771             return 0;
4772           }
4773           else
4774           if((i_dirty>>hr)&1)
4775           {
4776             if(i_regmap[hr]<TEMPREG)
4777             {
4778               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4779                 return 0;
4780             }
4781             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4782             {
4783               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4784                 return 0;
4785             }
4786           }
4787         }
4788         else // Same register but is it 32-bit or dirty?
4789         if(i_regmap[hr]>=0)
4790         {
4791           if(!((regs[t].dirty>>hr)&1))
4792           {
4793             if((i_dirty>>hr)&1)
4794             {
4795               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4796               {
4797                 //printf("%x: dirty no match\n",addr);
4798                 return 0;
4799               }
4800             }
4801           }
4802           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4803           {
4804             //printf("%x: is32 no match\n",addr);
4805             return 0;
4806           }
4807         }
4808       }
4809     }
4810     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4811 #ifndef FORCE32
4812     if(requires_32bit[t]&~i_is32) return 0;
4813 #endif
4814     // Delay slots are not valid branch targets
4815     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4816     // Delay slots require additional processing, so do not match
4817     if(is_ds[t]) return 0;
4818   }
4819   else
4820   {
4821     int hr;
4822     for(hr=0;hr<HOST_REGS;hr++)
4823     {
4824       if(hr!=EXCLUDE_REG)
4825       {
4826         if(i_regmap[hr]>=0)
4827         {
4828           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4829           {
4830             if((i_dirty>>hr)&1)
4831             {
4832               return 0;
4833             }
4834           }
4835         }
4836       }
4837     }
4838   }
4839   return 1;
4840 }
4841
4842 // Used when a branch jumps into the delay slot of another branch
4843 void ds_assemble_entry(int i)
4844 {
4845   int t=(ba[i]-start)>>2;
4846   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4847   assem_debug("Assemble delay slot at %x\n",ba[i]);
4848   assem_debug("<->\n");
4849   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4850     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4851   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4852   address_generation(t,&regs[t],regs[t].regmap_entry);
4853   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4854     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4855   cop1_usable=0;
4856   is_delayslot=0;
4857   switch(itype[t]) {
4858     case ALU:
4859       alu_assemble(t,&regs[t]);break;
4860     case IMM16:
4861       imm16_assemble(t,&regs[t]);break;
4862     case SHIFT:
4863       shift_assemble(t,&regs[t]);break;
4864     case SHIFTIMM:
4865       shiftimm_assemble(t,&regs[t]);break;
4866     case LOAD:
4867       load_assemble(t,&regs[t]);break;
4868     case LOADLR:
4869       loadlr_assemble(t,&regs[t]);break;
4870     case STORE:
4871       store_assemble(t,&regs[t]);break;
4872     case STORELR:
4873       storelr_assemble(t,&regs[t]);break;
4874     case COP0:
4875       cop0_assemble(t,&regs[t]);break;
4876     case COP1:
4877       cop1_assemble(t,&regs[t]);break;
4878     case C1LS:
4879       c1ls_assemble(t,&regs[t]);break;
4880     case COP2:
4881       cop2_assemble(t,&regs[t]);break;
4882     case C2LS:
4883       c2ls_assemble(t,&regs[t]);break;
4884     case C2OP:
4885       c2op_assemble(t,&regs[t]);break;
4886     case FCONV:
4887       fconv_assemble(t,&regs[t]);break;
4888     case FLOAT:
4889       float_assemble(t,&regs[t]);break;
4890     case FCOMP:
4891       fcomp_assemble(t,&regs[t]);break;
4892     case MULTDIV:
4893       multdiv_assemble(t,&regs[t]);break;
4894     case MOV:
4895       mov_assemble(t,&regs[t]);break;
4896     case SYSCALL:
4897     case HLECALL:
4898     case INTCALL:
4899     case SPAN:
4900     case UJUMP:
4901     case RJUMP:
4902     case CJUMP:
4903     case SJUMP:
4904     case FJUMP:
4905       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
4906   }
4907   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4908   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4909   if(internal_branch(regs[t].is32,ba[i]+4))
4910     assem_debug("branch: internal\n");
4911   else
4912     assem_debug("branch: external\n");
4913   assert(internal_branch(regs[t].is32,ba[i]+4));
4914   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4915   emit_jmp(0);
4916 }
4917
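// Emit the cycle count update and check for a branch: add the accumulated
// cycles to HOST_CCREG, detect simple idle loops (branch to self with a
// nop delay slot) and register a CC_STUB to be taken when the count runs
// out.  *adj is set to the cycle adjustment of an internal branch target
// (-1 when branching into a delay slot).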
4918 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4919 {
4920   int count;
4921   int jaddr;
4922   int idle=0;
4923   int t=0;
4924   if(itype[i]==RJUMP)
4925   {
4926     *adj=0;
4927   }
4928   //if(ba[i]>=start && ba[i]<(start+slen*4))
4929   if(internal_branch(branch_regs[i].is32,ba[i]))
4930   {
4931     t=(ba[i]-start)>>2;
4932     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4933     else *adj=ccadj[t];
4934   }
4935   else
4936   {
4937     *adj=0;
4938   }
4939   count=ccadj[i];
4940   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4941     // Idle loop
4942     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4943     idle=(int)out;
4944     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4945     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4946     jaddr=(int)out;
4947     emit_jmp(0);
4948   }
4949   else if(*adj==0||invert) {
4950     int cycles=CLOCK_ADJUST(count+2);
4951     // faster loop HACK
4952     if (t&&*adj) {
4953       int rel=t-i;
4954       if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
4955         cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
4956     }
4957     emit_addimm_and_set_flags(cycles,HOST_CCREG);
4958     jaddr=(int)out;
4959     emit_jns(0);
4960   }
4961   else
4962   {
4963     emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
4964     jaddr=(int)out;
4965     emit_jns(0);
4966   }
4967   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4968 }
4969
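// Assemble the out-of-line stub taken when the cycle count runs out at a
// branch: write back the appropriate dirty registers, then compute the
// return PC - either the known address, or, for conditional branches, the
// result of re-evaluating the branch condition here.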
4970 void do_ccstub(int n)
4971 {
4972   literal_pool(256);
4973   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4974   set_jump_target(stubs[n][1],(int)out);
4975   int i=stubs[n][4];
4976   if(stubs[n][6]==NULLDS) {
4977     // Delay slot instruction is nullified ("likely" branch)
4978     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4979   }
4980   else if(stubs[n][6]!=TAKEN) {
4981     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4982   }
4983   else {
4984     if(internal_branch(branch_regs[i].is32,ba[i]))
4985       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4986   }
4987   if(stubs[n][5]!=-1)
4988   {
4989     // Save PC as return address
4990     emit_movimm(stubs[n][5],EAX);
4991     emit_writeword(EAX,(int)&pcaddr);
4992   }
4993   else
4994   {
4995     // Return address depends on which way the branch goes
4996     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4997     {
4998       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4999       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5000       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5001       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5002       if(rs1[i]==0)
5003       {
5004         s1l=s2l;s1h=s2h;
5005         s2l=s2h=-1;
5006       }
5007       else if(rs2[i]==0)
5008       {
5009         s2l=s2h=-1;
5010       }
5011       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
5012         s1h=s2h=-1;
5013       }
5014       assert(s1l>=0);
5015       #ifdef DESTRUCTIVE_WRITEBACK
5016       if(rs1[i]) {
5017         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
5018           emit_loadreg(rs1[i],s1l);
5019       }
5020       else {
5021         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
5022           emit_loadreg(rs2[i],s1l);
5023       }
5024       if(s2l>=0)
5025         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
5026           emit_loadreg(rs2[i],s2l);
5027       #endif
5028       int hr=0;
5029       int addr=-1,alt=-1,ntaddr=-1;
5030       while(hr<HOST_REGS)
5031       {
5032         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5033            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
5034            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
5035         {
5036           addr=hr++;break;
5037         }
5038         hr++;
5039       }
5040       while(hr<HOST_REGS)
5041       {
5042         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5043            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
5044            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
5045         {
5046           alt=hr++;break;
5047         }
5048         hr++;
5049       }
5050       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5051       {
5052         while(hr<HOST_REGS)
5053         {
5054           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5055              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
5056              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
5057           {
5058             ntaddr=hr;break;
5059           }
5060           hr++;
5061         }
5062         assert(hr<HOST_REGS);
5063       }
5064       if((opcode[i]&0x2f)==4) // BEQ
5065       {
5066         #ifdef HAVE_CMOV_IMM
5067         if(s1h<0) {
5068           if(s2l>=0) emit_cmp(s1l,s2l);
5069           else emit_test(s1l,s1l);
5070           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
5071         }
5072         else
5073         #endif
5074         {
5075           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5076           if(s1h>=0) {
5077             if(s2h>=0) emit_cmp(s1h,s2h);
5078             else emit_test(s1h,s1h);
5079             emit_cmovne_reg(alt,addr);
5080           }
5081           if(s2l>=0) emit_cmp(s1l,s2l);
5082           else emit_test(s1l,s1l);
5083           emit_cmovne_reg(alt,addr);
5084         }
5085       }
5086       if((opcode[i]&0x2f)==5) // BNE
5087       {
5088         #ifdef HAVE_CMOV_IMM
5089         if(s1h<0) {
5090           if(s2l>=0) emit_cmp(s1l,s2l);
5091           else emit_test(s1l,s1l);
5092           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
5093         }
5094         else
5095         #endif
5096         {
5097           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5098           if(s1h>=0) {
5099             if(s2h>=0) emit_cmp(s1h,s2h);
5100             else emit_test(s1h,s1h);
5101             emit_cmovne_reg(alt,addr);
5102           }
5103           if(s2l>=0) emit_cmp(s1l,s2l);
5104           else emit_test(s1l,s1l);
5105           emit_cmovne_reg(alt,addr);
5106         }
5107       }
5108       if((opcode[i]&0x2f)==6) // BLEZ
5109       {
5110         //emit_movimm(ba[i],alt);
5111         //emit_movimm(start+i*4+8,addr);
5112         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5113         emit_cmpimm(s1l,1);
5114         if(s1h>=0) emit_mov(addr,ntaddr);
5115         emit_cmovl_reg(alt,addr);
5116         if(s1h>=0) {
5117           emit_test(s1h,s1h);
5118           emit_cmovne_reg(ntaddr,addr);
5119           emit_cmovs_reg(alt,addr);
5120         }
5121       }
5122       if((opcode[i]&0x2f)==7) // BGTZ
5123       {
5124         //emit_movimm(ba[i],addr);
5125         //emit_movimm(start+i*4+8,ntaddr);
5126         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
5127         emit_cmpimm(s1l,1);
5128         if(s1h>=0) emit_mov(addr,alt);
5129         emit_cmovl_reg(ntaddr,addr);
5130         if(s1h>=0) {
5131           emit_test(s1h,s1h);
5132           emit_cmovne_reg(alt,addr);
5133           emit_cmovs_reg(ntaddr,addr);
5134         }
5135       }
5136       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
5137       {
5138         //emit_movimm(ba[i],alt);
5139         //emit_movimm(start+i*4+8,addr);
5140         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5141         if(s1h>=0) emit_test(s1h,s1h);
5142         else emit_test(s1l,s1l);
5143         emit_cmovs_reg(alt,addr);
5144       }
5145       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
5146       {
5147         //emit_movimm(ba[i],addr);
5148         //emit_movimm(start+i*4+8,alt);
5149         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5150         if(s1h>=0) emit_test(s1h,s1h);
5151         else emit_test(s1l,s1l);
5152         emit_cmovs_reg(alt,addr);
5153       }
5154       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
5155         if(source[i]&0x10000) // BC1T
5156         {
5157           //emit_movimm(ba[i],alt);
5158           //emit_movimm(start+i*4+8,addr);
5159           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5160           emit_testimm(s1l,0x800000);
5161           emit_cmovne_reg(alt,addr);
5162         }
5163         else // BC1F
5164         {
5165           //emit_movimm(ba[i],addr);
5166           //emit_movimm(start+i*4+8,alt);
5167           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5168           emit_testimm(s1l,0x800000);
5169           emit_cmovne_reg(alt,addr);
5170         }
5171       }
5172       emit_writeword(addr,(int)&pcaddr);
5173     }
5174     else
5175     if(itype[i]==RJUMP)
5176     {
5177       int r=get_reg(branch_regs[i].regmap,rs1[i]);
5178       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5179         r=get_reg(branch_regs[i].regmap,RTEMP);
5180       }
5181       emit_writeword(r,(int)&pcaddr);
5182     }
5183     else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);}
5184   }
5185   // Update cycle count
5186   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
5187   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
5188   emit_call((int)cc_interrupt);
5189   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
5190   if(stubs[n][6]==TAKEN) {
5191     if(internal_branch(branch_regs[i].is32,ba[i]))
5192       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
5193     else if(itype[i]==RJUMP) {
5194       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
5195         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
5196       else
5197         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
5198     }
5199   }else if(stubs[n][6]==NOTTAKEN) {
5200     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
5201     else load_all_regs(branch_regs[i].regmap);
5202   }else if(stubs[n][6]==NULLDS) {
5203     // Delay slot instruction is nullified ("likely" branch)
5204     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
5205     else load_all_regs(regs[i].regmap);
5206   }else{
5207     load_all_regs(branch_regs[i].regmap);
5208   }
5209   emit_jmp(stubs[n][2]); // return address
5210
5211   /* This works but uses a lot of memory...
5212   emit_readword((int)&last_count,ECX);
5213   emit_add(HOST_CCREG,ECX,EAX);
5214   emit_writeword(EAX,(int)&Count);
5215   emit_call((int)gen_interupt);
5216   emit_readword((int)&Count,HOST_CCREG);
5217   emit_readword((int)&next_interupt,EAX);
5218   emit_readword((int)&pending_exception,EBX);
5219   emit_writeword(EAX,(int)&last_count);
5220   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
5221   emit_test(EBX,EBX);
5222   int jne_instr=(int)out;
5223   emit_jne(0);
5224   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
5225   load_all_regs(branch_regs[i].regmap);
5226   emit_jmp(stubs[n][2]); // return address
5227   set_jump_target(jne_instr,(int)out);
5228   emit_readword((int)&pcaddr,EAX);
5229   // Call get_addr_ht instead of doing the hash table here.
5230   // This code is executed infrequently and takes up a lot of space
5231   // so smaller is better.
5232   emit_storereg(CCREG,HOST_CCREG);
5233   emit_pushreg(EAX);
5234   emit_call((int)get_addr_ht);
5235   emit_loadreg(CCREG,HOST_CCREG);
5236   emit_addimm(ESP,4,ESP);
5237   emit_jmpreg(EAX);*/
5238 }
5239
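// Record an emitted jump at 'addr' targeting virtual address 'target' so
// the block linker can patch it later; callers below pass the
// internal-branch flag as the third argument.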
5240 static void add_to_linker(int addr,int target,int ext)
5241 {
5242   link_addr[linkcount][0]=addr;
5243   link_addr[linkcount][1]=target;
5244   link_addr[linkcount][2]=ext;
5245   linkcount++;
5246 }
5247
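// Write the return address (start+i*4+8) into the host register holding
// $31 for JAL; under the prefetch build options it also primes the hash
// table lookup for that address.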
5248 static void ujump_assemble_write_ra(int i)
5249 {
5250   int rt;
5251   unsigned int return_address;
5252   rt=get_reg(branch_regs[i].regmap,31);
5253   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5254   //assert(rt>=0);
5255   return_address=start+i*4+8;
5256   if(rt>=0) {
5257     #ifdef USE_MINI_HT
5258     if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
5259       int temp=-1; // note: must be ds-safe
5260       #ifdef HOST_TEMPREG
5261       temp=HOST_TEMPREG;
5262       #endif
5263       if(temp>=0) do_miniht_insert(return_address,rt,temp);
5264       else emit_movimm(return_address,rt);
5265     }
5266     else
5267     #endif
5268     {
5269       #ifdef REG_PREFETCH
5270       if(temp>=0)
5271       {
5272         if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5273       }
5274       #endif
5275       emit_movimm(return_address,rt); // PC into link register
5276       #ifdef IMM_PREFETCH
5277       emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5278       #endif
5279     }
5280   }
5281 }
5282
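// Assemble J/JAL: emit the delay slot, write $ra if needed, write back and
// load registers for the target, update the cycle count, then either
// assemble a special entry if the target is a delay slot within this block
// (ds_assemble_entry) or emit a jump to be patched by the linker.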
5283 void ujump_assemble(int i,struct regstat *i_regs)
5284 {
5285   signed char *i_regmap=i_regs->regmap;
5286   int ra_done=0;
5287   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5288   address_generation(i+1,i_regs,regs[i].regmap_entry);
5289   #ifdef REG_PREFETCH
5290   int temp=get_reg(branch_regs[i].regmap,PTEMP);
5291   if(rt1[i]==31&&temp>=0)
5292   {
5293     int return_address=start+i*4+8;
5294     if(get_reg(branch_regs[i].regmap,31)>0)
5295     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5296   }
5297   #endif
5298   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
5299     ujump_assemble_write_ra(i); // writeback ra for DS
5300     ra_done=1;
5301   }
5302   ds_assemble(i+1,i_regs);
5303   uint64_t bc_unneeded=branch_regs[i].u;
5304   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5305   bc_unneeded|=1|(1LL<<rt1[i]);
5306   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5307   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5308                 bc_unneeded,bc_unneeded_upper);
5309   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5310   if(!ra_done&&rt1[i]==31)
5311     ujump_assemble_write_ra(i);
5312   int cc,adj;
5313   cc=get_reg(branch_regs[i].regmap,CCREG);
5314   assert(cc==HOST_CCREG);
5315   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5316   #ifdef REG_PREFETCH
5317   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5318   #endif
5319   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5320   if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5321   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5322   if(internal_branch(branch_regs[i].is32,ba[i]))
5323     assem_debug("branch: internal\n");
5324   else
5325     assem_debug("branch: external\n");
5326   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
5327     ds_assemble_entry(i);
5328   }
5329   else {
5330     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
5331     emit_jmp(0);
5332   }
5333 }
5334
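// Same as above but for JALR, where the link register is rt1[i] rather
// than always $31.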
5335 static void rjump_assemble_write_ra(int i)
5336 {
5337   int rt,return_address;
5338   assert(rt1[i+1]!=rt1[i]);
5339   assert(rt2[i+1]!=rt1[i]);
5340   rt=get_reg(branch_regs[i].regmap,rt1[i]);
5341   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5342   assert(rt>=0);
5343   return_address=start+i*4+8;
5344   #ifdef REG_PREFETCH
5345   if(temp>=0)
5346   {
5347     if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5348   }
5349   #endif
5350   emit_movimm(return_address,rt); // PC into link register
5351   #ifdef IMM_PREFETCH
5352   emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5353   #endif
5354 }
5355
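// Assemble JR/JALR: the target address comes from a register, so after the
// delay slot the cycle counter is updated and control is transferred
// through jump_vaddr_reg[] (or the mini hash table for jr $ra when
// USE_MINI_HT is enabled) to locate the compiled target.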
5356 void rjump_assemble(int i,struct regstat *i_regs)
5357 {
5358   signed char *i_regmap=i_regs->regmap;
5359   int temp;
5360   int rs,cc,adj;
5361   int ra_done=0;
5362   rs=get_reg(branch_regs[i].regmap,rs1[i]);
5363   assert(rs>=0);
5364   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5365     // Delay slot abuse, make a copy of the branch address register
5366     temp=get_reg(branch_regs[i].regmap,RTEMP);
5367     assert(temp>=0);
5368     assert(regs[i].regmap[temp]==RTEMP);
5369     emit_mov(rs,temp);
5370     rs=temp;
5371   }
5372   address_generation(i+1,i_regs,regs[i].regmap_entry);
5373   #ifdef REG_PREFETCH
5374   if(rt1[i]==31)
5375   {
5376     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
5377       int return_address=start+i*4+8;
5378       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5379     }
5380   }
5381   #endif
5382   #ifdef USE_MINI_HT
5383   if(rs1[i]==31) {
5384     int rh=get_reg(regs[i].regmap,RHASH);
5385     if(rh>=0) do_preload_rhash(rh);
5386   }
5387   #endif
5388   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
5389     rjump_assemble_write_ra(i);
5390     ra_done=1;
5391   }
5392   ds_assemble(i+1,i_regs);
5393   uint64_t bc_unneeded=branch_regs[i].u;
5394   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5395   bc_unneeded|=1|(1LL<<rt1[i]);
5396   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5397   bc_unneeded&=~(1LL<<rs1[i]);
5398   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5399                 bc_unneeded,bc_unneeded_upper);
5400   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
5401   if(!ra_done&&rt1[i]!=0)
5402     rjump_assemble_write_ra(i);
5403   cc=get_reg(branch_regs[i].regmap,CCREG);
5404   assert(cc==HOST_CCREG);
5405   #ifdef USE_MINI_HT
5406   int rh=get_reg(branch_regs[i].regmap,RHASH);
5407   int ht=get_reg(branch_regs[i].regmap,RHTBL);
5408   if(rs1[i]==31) {
5409     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
5410     do_preload_rhtbl(ht);
5411     do_rhash(rs,rh);
5412   }
5413   #endif
5414   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5415   #ifdef DESTRUCTIVE_WRITEBACK
5416   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
5417     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
5418       emit_loadreg(rs1[i],rs);
5419     }
5420   }
5421   #endif
5422   #ifdef REG_PREFETCH
5423   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5424   #endif
5425   #ifdef USE_MINI_HT
5426   if(rs1[i]==31) {
5427     do_miniht_load(ht,rh);
5428   }
5429   #endif
5430   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
5431   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
5432   //assert(adj==0);
5433   emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5434   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
5435 #ifdef PCSX
5436   if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
5437     // special case for RFE
5438     emit_jmp(0);
5439   else
5440 #endif
5441   emit_jns(0);
5442   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5443   #ifdef USE_MINI_HT
5444   if(rs1[i]==31) {
5445     do_miniht_jump(rs,rh,ht);
5446   }
5447   else
5448   #endif
5449   {
5450     //if(rs!=EAX) emit_mov(rs,EAX);
5451     //emit_jmp((int)jump_vaddr_eax);
5452     emit_jmp(jump_vaddr_reg[rs]);
5453   }
5454   /* Check hash table
5455   temp=!rs;
5456   emit_mov(rs,temp);
5457   emit_shrimm(rs,16,rs);
5458   emit_xor(temp,rs,rs);
5459   emit_movzwl_reg(rs,rs);
5460   emit_shlimm(rs,4,rs);
5461   emit_cmpmem_indexed((int)hash_table,rs,temp);
5462   emit_jne((int)out+14);
5463   emit_readword_indexed((int)hash_table+4,rs,rs);
5464   emit_jmpreg(rs);
5465   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
5466   emit_addimm_no_flags(8,rs);
5467   emit_jeq((int)out-17);
5468   // No hit on hash table, call compiler
5469   emit_pushreg(temp);
5470 //DEBUG >
5471 #ifdef DEBUG_CYCLE_COUNT
5472   emit_readword((int)&last_count,ECX);
5473   emit_add(HOST_CCREG,ECX,HOST_CCREG);
5474   emit_readword((int)&next_interupt,ECX);
5475   emit_writeword(HOST_CCREG,(int)&Count);
5476   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
5477   emit_writeword(ECX,(int)&last_count);
5478 #endif
5479 //DEBUG <
5480   emit_storereg(CCREG,HOST_CCREG);
5481   emit_call((int)get_addr);
5482   emit_loadreg(CCREG,HOST_CCREG);
5483   emit_addimm(ESP,4,ESP);
5484   emit_jmpreg(EAX);*/
5485   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5486   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
5487   #endif
5488 }
5489
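// Assemble BEQ/BNE/BLEZ/BGTZ and their "likely" variants.  Depending on
// ooo[i] the delay slot is assembled before the compare (out-of-order path)
// or after it, duplicated on the taken and not-taken paths (in-order path);
// "likely" branches skip the delay slot on the not-taken path.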
5490 void cjump_assemble(int i,struct regstat *i_regs)
5491 {
5492   signed char *i_regmap=i_regs->regmap;
5493   int cc;
5494   int match;
5495   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5496   assem_debug("match=%d\n",match);
5497   int s1h,s1l,s2h,s2l;
5498   int prev_cop1_usable=cop1_usable;
5499   int unconditional=0,nop=0;
5500   int only32=0;
5501   int invert=0;
5502   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5503   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5504   if(!match) invert=1;
5505   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5506   if(i>(ba[i]-start)>>2) invert=1;
5507   #endif
5508
5509   if(ooo[i]) {
5510     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5511     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5512     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5513     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5514   }
5515   else {
5516     s1l=get_reg(i_regmap,rs1[i]);
5517     s1h=get_reg(i_regmap,rs1[i]|64);
5518     s2l=get_reg(i_regmap,rs2[i]);
5519     s2h=get_reg(i_regmap,rs2[i]|64);
5520   }
5521   if(rs1[i]==0&&rs2[i]==0)
5522   {
5523     if(opcode[i]&1) nop=1;
5524     else unconditional=1;
5525     //assert(opcode[i]!=5);
5526     //assert(opcode[i]!=7);
5527     //assert(opcode[i]!=0x15);
5528     //assert(opcode[i]!=0x17);
5529   }
5530   else if(rs1[i]==0)
5531   {
5532     s1l=s2l;s1h=s2h;
5533     s2l=s2h=-1;
5534     only32=(regs[i].was32>>rs2[i])&1;
5535   }
5536   else if(rs2[i]==0)
5537   {
5538     s2l=s2h=-1;
5539     only32=(regs[i].was32>>rs1[i])&1;
5540   }
5541   else {
5542     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5543   }
5544
5545   if(ooo[i]) {
5546     // Out of order execution (delay slot first)
5547     //printf("OOOE\n");
5548     address_generation(i+1,i_regs,regs[i].regmap_entry);
5549     ds_assemble(i+1,i_regs);
5550     int adj;
5551     uint64_t bc_unneeded=branch_regs[i].u;
5552     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5553     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5554     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5555     bc_unneeded|=1;
5556     bc_unneeded_upper|=1;
5557     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5558                   bc_unneeded,bc_unneeded_upper);
5559     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5560     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5561     cc=get_reg(branch_regs[i].regmap,CCREG);
5562     assert(cc==HOST_CCREG);
5563     if(unconditional)
5564       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5565     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5566     //assem_debug("cycle count (adj)\n");
5567     if(unconditional) {
5568       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5569       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5570         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5571         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5572         if(internal)
5573           assem_debug("branch: internal\n");
5574         else
5575           assem_debug("branch: external\n");
5576         if(internal&&is_ds[(ba[i]-start)>>2]) {
5577           ds_assemble_entry(i);
5578         }
5579         else {
5580           add_to_linker((int)out,ba[i],internal);
5581           emit_jmp(0);
5582         }
5583         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5584         if(((u_int)out)&7) emit_addnop(0);
5585         #endif
5586       }
5587     }
5588     else if(nop) {
5589       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5590       int jaddr=(int)out;
5591       emit_jns(0);
5592       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5593     }
5594     else {
5595       int taken=0,nottaken=0,nottaken1=0;
5596       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5597       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5598       if(!only32)
5599       {
5600         assert(s1h>=0);
5601         if(opcode[i]==4) // BEQ
5602         {
5603           if(s2h>=0) emit_cmp(s1h,s2h);
5604           else emit_test(s1h,s1h);
5605           nottaken1=(int)out;
5606           emit_jne(1);
5607         }
5608         if(opcode[i]==5) // BNE
5609         {
5610           if(s2h>=0) emit_cmp(s1h,s2h);
5611           else emit_test(s1h,s1h);
5612           if(invert) taken=(int)out;
5613           else add_to_linker((int)out,ba[i],internal);
5614           emit_jne(0);
5615         }
5616         if(opcode[i]==6) // BLEZ
5617         {
5618           emit_test(s1h,s1h);
5619           if(invert) taken=(int)out;
5620           else add_to_linker((int)out,ba[i],internal);
5621           emit_js(0);
5622           nottaken1=(int)out;
5623           emit_jne(1);
5624         }
5625         if(opcode[i]==7) // BGTZ
5626         {
5627           emit_test(s1h,s1h);
5628           nottaken1=(int)out;
5629           emit_js(1);
5630           if(invert) taken=(int)out;
5631           else add_to_linker((int)out,ba[i],internal);
5632           emit_jne(0);
5633         }
5634       } // if(!only32)
5635
5636       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5637       assert(s1l>=0);
5638       if(opcode[i]==4) // BEQ
5639       {
5640         if(s2l>=0) emit_cmp(s1l,s2l);
5641         else emit_test(s1l,s1l);
5642         if(invert){
5643           nottaken=(int)out;
5644           emit_jne(1);
5645         }else{
5646           add_to_linker((int)out,ba[i],internal);
5647           emit_jeq(0);
5648         }
5649       }
5650       if(opcode[i]==5) // BNE
5651       {
5652         if(s2l>=0) emit_cmp(s1l,s2l);
5653         else emit_test(s1l,s1l);
5654         if(invert){
5655           nottaken=(int)out;
5656           emit_jeq(1);
5657         }else{
5658           add_to_linker((int)out,ba[i],internal);
5659           emit_jne(0);
5660         }
5661       }
5662       if(opcode[i]==6) // BLEZ
5663       {
5664         emit_cmpimm(s1l,1);
5665         if(invert){
5666           nottaken=(int)out;
5667           emit_jge(1);
5668         }else{
5669           add_to_linker((int)out,ba[i],internal);
5670           emit_jl(0);
5671         }
5672       }
5673       if(opcode[i]==7) // BGTZ
5674       {
5675         emit_cmpimm(s1l,1);
5676         if(invert){
5677           nottaken=(int)out;
5678           emit_jl(1);
5679         }else{
5680           add_to_linker((int)out,ba[i],internal);
5681           emit_jge(0);
5682         }
5683       }
5684       if(invert) {
5685         if(taken) set_jump_target(taken,(int)out);
5686         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5687         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5688           if(adj) {
5689             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5690             add_to_linker((int)out,ba[i],internal);
5691           }else{
5692             emit_addnop(13);
5693             add_to_linker((int)out,ba[i],internal*2);
5694           }
5695           emit_jmp(0);
5696         }else
5697         #endif
5698         {
5699           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5700           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5701           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5702           if(internal)
5703             assem_debug("branch: internal\n");
5704           else
5705             assem_debug("branch: external\n");
5706           if(internal&&is_ds[(ba[i]-start)>>2]) {
5707             ds_assemble_entry(i);
5708           }
5709           else {
5710             add_to_linker((int)out,ba[i],internal);
5711             emit_jmp(0);
5712           }
5713         }
5714         set_jump_target(nottaken,(int)out);
5715       }
5716
5717       if(nottaken1) set_jump_target(nottaken1,(int)out);
5718       if(adj) {
5719         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5720       }
5721     } // (!unconditional)
5722   } // if(ooo)
5723   else
5724   {
5725     // In-order execution (branch first)
5726     //if(likely[i]) printf("IOL\n");
5727     //else
5728     //printf("IOE\n");
5729     int taken=0,nottaken=0,nottaken1=0;
5730     if(!unconditional&&!nop) {
5731       if(!only32)
5732       {
5733         assert(s1h>=0);
5734         if((opcode[i]&0x2f)==4) // BEQ
5735         {
5736           if(s2h>=0) emit_cmp(s1h,s2h);
5737           else emit_test(s1h,s1h);
5738           nottaken1=(int)out;
5739           emit_jne(2);
5740         }
5741         if((opcode[i]&0x2f)==5) // BNE
5742         {
5743           if(s2h>=0) emit_cmp(s1h,s2h);
5744           else emit_test(s1h,s1h);
5745           taken=(int)out;
5746           emit_jne(1);
5747         }
5748         if((opcode[i]&0x2f)==6) // BLEZ
5749         {
5750           emit_test(s1h,s1h);
5751           taken=(int)out;
5752           emit_js(1);
5753           nottaken1=(int)out;
5754           emit_jne(2);
5755         }
5756         if((opcode[i]&0x2f)==7) // BGTZ
5757         {
5758           emit_test(s1h,s1h);
5759           nottaken1=(int)out;
5760           emit_js(2);
5761           taken=(int)out;
5762           emit_jne(1);
5763         }
5764       } // if(!only32)
5765
5766       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5767       assert(s1l>=0);
5768       if((opcode[i]&0x2f)==4) // BEQ
5769       {
5770         if(s2l>=0) emit_cmp(s1l,s2l);
5771         else emit_test(s1l,s1l);
5772         nottaken=(int)out;
5773         emit_jne(2);
5774       }
5775       if((opcode[i]&0x2f)==5) // BNE
5776       {
5777         if(s2l>=0) emit_cmp(s1l,s2l);
5778         else emit_test(s1l,s1l);
5779         nottaken=(int)out;
5780         emit_jeq(2);
5781       }
5782       if((opcode[i]&0x2f)==6) // BLEZ
5783       {
5784         emit_cmpimm(s1l,1);
5785         nottaken=(int)out;
5786         emit_jge(2);
5787       }
5788       if((opcode[i]&0x2f)==7) // BGTZ
5789       {
5790         emit_cmpimm(s1l,1);
5791         nottaken=(int)out;
5792         emit_jl(2);
5793       }
5794     } // if(!unconditional)
5795     int adj;
5796     uint64_t ds_unneeded=branch_regs[i].u;
5797     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5798     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5799     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5800     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5801     ds_unneeded|=1;
5802     ds_unneeded_upper|=1;
5803     // branch taken
5804     if(!nop) {
5805       if(taken) set_jump_target(taken,(int)out);
5806       assem_debug("1:\n");
5807       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5808                     ds_unneeded,ds_unneeded_upper);
5809       // load regs
5810       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5811       address_generation(i+1,&branch_regs[i],0);
5812       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5813       ds_assemble(i+1,&branch_regs[i]);
5814       cc=get_reg(branch_regs[i].regmap,CCREG);
5815       if(cc==-1) {
5816         emit_loadreg(CCREG,cc=HOST_CCREG);
5817         // CHECK: Is the following instruction (fall thru) allocated ok?
5818       }
5819       assert(cc==HOST_CCREG);
5820       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5821       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5822       assem_debug("cycle count (adj)\n");
5823       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5824       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5825       if(internal)
5826         assem_debug("branch: internal\n");
5827       else
5828         assem_debug("branch: external\n");
5829       if(internal&&is_ds[(ba[i]-start)>>2]) {
5830         ds_assemble_entry(i);
5831       }
5832       else {
5833         add_to_linker((int)out,ba[i],internal);
5834         emit_jmp(0);
5835       }
5836     }
5837     // branch not taken
5838     cop1_usable=prev_cop1_usable;
5839     if(!unconditional) {
5840       if(nottaken1) set_jump_target(nottaken1,(int)out);
5841       set_jump_target(nottaken,(int)out);
5842       assem_debug("2:\n");
5843       if(!likely[i]) {
5844         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5845                       ds_unneeded,ds_unneeded_upper);
5846         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5847         address_generation(i+1,&branch_regs[i],0);
5848         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5849         ds_assemble(i+1,&branch_regs[i]);
5850       }
5851       cc=get_reg(branch_regs[i].regmap,CCREG);
5852       if(cc==-1&&!likely[i]) {
5853         // Cycle count isn't in a register, temporarily load it then write it out
5854         emit_loadreg(CCREG,HOST_CCREG);
5855         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5856         int jaddr=(int)out;
5857         emit_jns(0);
5858         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5859         emit_storereg(CCREG,HOST_CCREG);
5860       }
5861       else{
5862         cc=get_reg(i_regmap,CCREG);
5863         assert(cc==HOST_CCREG);
5864         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5865         int jaddr=(int)out;
5866         emit_jns(0);
5867         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5868       }
5869     }
5870   }
5871 }
5872
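// Assemble the REGIMM branches (BLTZ/BGEZ and their AL/likely variants):
// a single register is compared against zero, and the AL forms also write
// the return address to $31.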
5873 void sjump_assemble(int i,struct regstat *i_regs)
5874 {
5875   signed char *i_regmap=i_regs->regmap;
5876   int cc;
5877   int match;
5878   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5879   assem_debug("smatch=%d\n",match);
5880   int s1h,s1l;
5881   int prev_cop1_usable=cop1_usable;
5882   int unconditional=0,nevertaken=0;
5883   int only32=0;
5884   int invert=0;
5885   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5886   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5887   if(!match) invert=1;
5888   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5889   if(i>(ba[i]-start)>>2) invert=1;
5890   #endif
5891
5892   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5893   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5894
5895   if(ooo[i]) {
5896     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5897     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5898   }
5899   else {
5900     s1l=get_reg(i_regmap,rs1[i]);
5901     s1h=get_reg(i_regmap,rs1[i]|64);
5902   }
5903   if(rs1[i]==0)
5904   {
5905     if(opcode2[i]&1) unconditional=1;
5906     else nevertaken=1;
5907     // These are never taken (r0 is never less than zero)
5908     //assert(opcode2[i]!=0);
5909     //assert(opcode2[i]!=2);
5910     //assert(opcode2[i]!=0x10);
5911     //assert(opcode2[i]!=0x12);
5912   }
5913   else {
5914     only32=(regs[i].was32>>rs1[i])&1;
5915   }
5916
5917   if(ooo[i]) {
5918     // Out of order execution (delay slot first)
5919     //printf("OOOE\n");
5920     address_generation(i+1,i_regs,regs[i].regmap_entry);
5921     ds_assemble(i+1,i_regs);
5922     int adj;
5923     uint64_t bc_unneeded=branch_regs[i].u;
5924     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5925     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5926     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5927     bc_unneeded|=1;
5928     bc_unneeded_upper|=1;
5929     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5930                   bc_unneeded,bc_unneeded_upper);
5931     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5932     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5933     if(rt1[i]==31) {
5934       int rt,return_address;
5935       rt=get_reg(branch_regs[i].regmap,31);
5936       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5937       if(rt>=0) {
5938         // Save the PC even if the branch is not taken
5939         return_address=start+i*4+8;
5940         emit_movimm(return_address,rt); // PC into link register
5941         #ifdef IMM_PREFETCH
5942         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5943         #endif
5944       }
5945     }
5946     cc=get_reg(branch_regs[i].regmap,CCREG);
5947     assert(cc==HOST_CCREG);
5948     if(unconditional)
5949       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5950     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5951     assem_debug("cycle count (adj)\n");
5952     if(unconditional) {
5953       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5954       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5955         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5956         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5957         if(internal)
5958           assem_debug("branch: internal\n");
5959         else
5960           assem_debug("branch: external\n");
5961         if(internal&&is_ds[(ba[i]-start)>>2]) {
5962           ds_assemble_entry(i);
5963         }
5964         else {
5965           add_to_linker((int)out,ba[i],internal);
5966           emit_jmp(0);
5967         }
5968         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5969         if(((u_int)out)&7) emit_addnop(0);
5970         #endif
5971       }
5972     }
5973     else if(nevertaken) {
5974       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5975       int jaddr=(int)out;
5976       emit_jns(0);
5977       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5978     }
5979     else {
5980       int nottaken=0;
5981       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5982       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5983       if(!only32)
5984       {
5985         assert(s1h>=0);
5986         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5987         {
5988           emit_test(s1h,s1h);
5989           if(invert){
5990             nottaken=(int)out;
5991             emit_jns(1);
5992           }else{
5993             add_to_linker((int)out,ba[i],internal);
5994             emit_js(0);
5995           }
5996         }
5997         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5998         {
5999           emit_test(s1h,s1h);
6000           if(invert){
6001             nottaken=(int)out;
6002             emit_js(1);
6003           }else{
6004             add_to_linker((int)out,ba[i],internal);
6005             emit_jns(0);
6006           }
6007         }
6008       } // if(!only32)
6009       else
6010       {
6011         assert(s1l>=0);
6012         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
6013         {
6014           emit_test(s1l,s1l);
6015           if(invert){
6016             nottaken=(int)out;
6017             emit_jns(1);
6018           }else{
6019             add_to_linker((int)out,ba[i],internal);
6020             emit_js(0);
6021           }
6022         }
6023         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
6024         {
6025           emit_test(s1l,s1l);
6026           if(invert){
6027             nottaken=(int)out;
6028             emit_js(1);
6029           }else{
6030             add_to_linker((int)out,ba[i],internal);
6031             emit_jns(0);
6032           }
6033         }
6034       } // if(!only32)
6035
6036       if(invert) {
6037         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6038         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
6039           if(adj) {
6040             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
6041             add_to_linker((int)out,ba[i],internal);
6042           }else{
6043             emit_addnop(13);
6044             add_to_linker((int)out,ba[i],internal*2);
6045           }
6046           emit_jmp(0);
6047         }else
6048         #endif
6049         {
6050           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
6051           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6052           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6053           if(internal)
6054             assem_debug("branch: internal\n");
6055           else
6056             assem_debug("branch: external\n");
6057           if(internal&&is_ds[(ba[i]-start)>>2]) {
6058             ds_assemble_entry(i);
6059           }
6060           else {
6061             add_to_linker((int)out,ba[i],internal);
6062             emit_jmp(0);
6063           }
6064         }
6065         set_jump_target(nottaken,(int)out);
6066       }
6067
6068       if(adj) {
6069         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
6070       }
6071     } // (!unconditional)
6072   } // if(ooo)
6073   else
6074   {
6075     // In-order execution (branch first)
6076     //printf("IOE\n");
6077     int nottaken=0;
6078     if(rt1[i]==31) {
6079       int rt,return_address;
6080       rt=get_reg(branch_regs[i].regmap,31);
6081       if(rt>=0) {
6082         // Save the PC even if the branch is not taken
6083         return_address=start+i*4+8;
6084         emit_movimm(return_address,rt); // PC into link register
6085         #ifdef IMM_PREFETCH
6086         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
6087         #endif
6088       }
6089     }
6090     if(!unconditional) {
6091       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6092       if(!only32)
6093       {
6094         assert(s1h>=0);
6095         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
6096         {
6097           emit_test(s1h,s1h);
6098           nottaken=(int)out;
6099           emit_jns(1);
6100         }
6101         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
6102         {
6103           emit_test(s1h,s1h);
6104           nottaken=(int)out;
6105           emit_js(1);
6106         }
6107       } // if(!only32)
6108       else
6109       {
6110         assert(s1l>=0);
6111         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
6112         {
6113           emit_test(s1l,s1l);
6114           nottaken=(int)out;
6115           emit_jns(1);
6116         }
6117         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
6118         {
6119           emit_test(s1l,s1l);
6120           nottaken=(int)out;
6121           emit_js(1);
6122         }
6123       }
6124     } // if(!unconditional)
6125     int adj;
6126     uint64_t ds_unneeded=branch_regs[i].u;
6127     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6128     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6129     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6130     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6131     ds_unneeded|=1;
6132     ds_unneeded_upper|=1;
6133     // branch taken
6134     if(!nevertaken) {
6135       //assem_debug("1:\n");
6136       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6137                     ds_unneeded,ds_unneeded_upper);
6138       // load regs
6139       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6140       address_generation(i+1,&branch_regs[i],0);
6141       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6142       ds_assemble(i+1,&branch_regs[i]);
6143       cc=get_reg(branch_regs[i].regmap,CCREG);
6144       if(cc==-1) {
6145         emit_loadreg(CCREG,cc=HOST_CCREG);
6146         // CHECK: Is the following instruction (fall thru) allocated ok?
6147       }
6148       assert(cc==HOST_CCREG);
6149       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6150       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6151       assem_debug("cycle count (adj)\n");
6152       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
6153       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6154       if(internal)
6155         assem_debug("branch: internal\n");
6156       else
6157         assem_debug("branch: external\n");
6158       if(internal&&is_ds[(ba[i]-start)>>2]) {
6159         ds_assemble_entry(i);
6160       }
6161       else {
6162         add_to_linker((int)out,ba[i],internal);
6163         emit_jmp(0);
6164       }
6165     }
6166     // branch not taken
6167     cop1_usable=prev_cop1_usable;
6168     if(!unconditional) {
6169       set_jump_target(nottaken,(int)out);
6170       assem_debug("1:\n");
6171       if(!likely[i]) {
6172         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6173                       ds_unneeded,ds_unneeded_upper);
6174         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6175         address_generation(i+1,&branch_regs[i],0);
6176         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6177         ds_assemble(i+1,&branch_regs[i]);
6178       }
6179       cc=get_reg(branch_regs[i].regmap,CCREG);
6180       if(cc==-1&&!likely[i]) {
6181         // Cycle count isn't in a register, temporarily load it then write it out
6182         emit_loadreg(CCREG,HOST_CCREG);
6183         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
6184         int jaddr=(int)out;
6185         emit_jns(0);
6186         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6187         emit_storereg(CCREG,HOST_CCREG);
6188       }
6189       else{
6190         cc=get_reg(i_regmap,CCREG);
6191         assert(cc==HOST_CCREG);
6192         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
6193         int jaddr=(int)out;
6194         emit_jns(0);
6195         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6196       }
6197     }
6198   }
6199 }
6200
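// Assemble BC1T/BC1F: first make sure COP1 is usable (FP_STUB otherwise),
// then test the FP condition flag (bit 0x800000 of the status word cached
// in FSREG) to decide the branch.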
6201 void fjump_assemble(int i,struct regstat *i_regs)
6202 {
6203   signed char *i_regmap=i_regs->regmap;
6204   int cc;
6205   int match;
6206   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6207   assem_debug("fmatch=%d\n",match);
6208   int fs,cs;
6209   int eaddr;
6210   int invert=0;
6211   int internal=internal_branch(branch_regs[i].is32,ba[i]);
6212   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
6213   if(!match) invert=1;
6214   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6215   if(i>(ba[i]-start)>>2) invert=1;
6216   #endif
6217
6218   if(ooo[i]) {
6219     fs=get_reg(branch_regs[i].regmap,FSREG);
6220     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
6221   }
6222   else {
6223     fs=get_reg(i_regmap,FSREG);
6224   }
6225
6226   // Check cop1 unusable
6227   if(!cop1_usable) {
6228     cs=get_reg(i_regmap,CSREG);
6229     assert(cs>=0);
6230     emit_testimm(cs,0x20000000);
6231     eaddr=(int)out;
6232     emit_jeq(0);
6233     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
6234     cop1_usable=1;
6235   }
6236
6237   if(ooo[i]) {
6238     // Out of order execution (delay slot first)
6239     //printf("OOOE\n");
6240     ds_assemble(i+1,i_regs);
6241     int adj;
6242     uint64_t bc_unneeded=branch_regs[i].u;
6243     uint64_t bc_unneeded_upper=branch_regs[i].uu;
6244     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6245     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
6246     bc_unneeded|=1;
6247     bc_unneeded_upper|=1;
6248     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6249                   bc_unneeded,bc_unneeded_upper);
6250     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
6251     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6252     cc=get_reg(branch_regs[i].regmap,CCREG);
6253     assert(cc==HOST_CCREG);
6254     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
6255     assem_debug("cycle count (adj)\n");
6256     if(1) {
6257       int nottaken=0;
6258       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
6259       if(1) {
6260         assert(fs>=0);
6261         emit_testimm(fs,0x800000);
6262         if(source[i]&0x10000) // BC1T
6263         {
6264           if(invert){
6265             nottaken=(int)out;
6266             emit_jeq(1);
6267           }else{
6268             add_to_linker((int)out,ba[i],internal);
6269             emit_jne(0);
6270           }
6271         }
6272         else // BC1F
6273         {
6274           if(invert){
6275             nottaken=(int)out;
6276             emit_jne(1);
6277           }else{
6278             add_to_linker((int)out,ba[i],internal);
6279             emit_jeq(0);
6280           }
6281         }
6282       } // if(!only32)
6283
6284       if(invert) {
6285         if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
6286         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6287         else if(match) emit_addnop(13);
6288         #endif
6289         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6290         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6291         if(internal)
6292           assem_debug("branch: internal\n");
6293         else
6294           assem_debug("branch: external\n");
6295         if(internal&&is_ds[(ba[i]-start)>>2]) {
6296           ds_assemble_entry(i);
6297         }
6298         else {
6299           add_to_linker((int)out,ba[i],internal);
6300           emit_jmp(0);
6301         }
6302         set_jump_target(nottaken,(int)out);
6303       }
6304
6305       if(adj) {
6306         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
6307       }
6308     } // (!unconditional)
6309   } // if(ooo)
6310   else
6311   {
6312     // In-order execution (branch first)
6313     //printf("IOE\n");
6314     int nottaken=0;
6315     if(1) {
6316       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6317       if(1) {
6318         assert(fs>=0);
6319         emit_testimm(fs,0x800000);
6320         if(source[i]&0x10000) // BC1T
6321         {
6322           nottaken=(int)out;
6323           emit_jeq(1);
6324         }
6325         else // BC1F
6326         {
6327           nottaken=(int)out;
6328           emit_jne(1);
6329         }
6330       }
6331     } // if(!unconditional)
6332     int adj;
6333     uint64_t ds_unneeded=branch_regs[i].u;
6334     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6335     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6336     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6337     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6338     ds_unneeded|=1;
6339     ds_unneeded_upper|=1;
6340     // branch taken
6341     //assem_debug("1:\n");
6342     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6343                   ds_unneeded,ds_unneeded_upper);
6344     // load regs
6345     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6346     address_generation(i+1,&branch_regs[i],0);
6347     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6348     ds_assemble(i+1,&branch_regs[i]);
6349     cc=get_reg(branch_regs[i].regmap,CCREG);
6350     if(cc==-1) {
6351       emit_loadreg(CCREG,cc=HOST_CCREG);
6352       // CHECK: Is the following instruction (fall thru) allocated ok?
6353     }
6354     assert(cc==HOST_CCREG);
6355     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6356     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6357     assem_debug("cycle count (adj)\n");
6358     if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
6359     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6360     if(internal)
6361       assem_debug("branch: internal\n");
6362     else
6363       assem_debug("branch: external\n");
6364     if(internal&&is_ds[(ba[i]-start)>>2]) {
6365       ds_assemble_entry(i);
6366     }
6367     else {
6368       add_to_linker((int)out,ba[i],internal);
6369       emit_jmp(0);
6370     }
6371
6372     // branch not taken
6373     if(1) { // <- FIXME (don't need this)
6374       set_jump_target(nottaken,(int)out);
6375       assem_debug("1:\n");
6376       if(!likely[i]) {
6377         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6378                       ds_unneeded,ds_unneeded_upper);
6379         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6380         address_generation(i+1,&branch_regs[i],0);
6381         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6382         ds_assemble(i+1,&branch_regs[i]);
6383       }
6384       cc=get_reg(branch_regs[i].regmap,CCREG);
6385       if(cc==-1&&!likely[i]) {
6386         // Cycle count isn't in a register, temporarily load it then write it out
6387         emit_loadreg(CCREG,HOST_CCREG);
6388         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
6389         int jaddr=(int)out;
6390         emit_jns(0);
6391         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6392         emit_storereg(CCREG,HOST_CCREG);
6393       }
6394       else{
6395         cc=get_reg(i_regmap,CCREG);
6396         assert(cc==HOST_CCREG);
6397         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
6398         int jaddr=(int)out;
6399         emit_jns(0);
6400         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6401       }
6402     }
6403   }
6404 }
6405
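// Assemble a branch whose delay slot falls on the next virtual page: the
// chosen target is placed in HOST_BTREG and control leaves the block
// through an external jump; the delay slot itself is compiled as a
// separate entry point (see pagespan_ds below).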
6406 static void pagespan_assemble(int i,struct regstat *i_regs)
6407 {
6408   int s1l=get_reg(i_regs->regmap,rs1[i]);
6409   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
6410   int s2l=get_reg(i_regs->regmap,rs2[i]);
6411   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
6412   void *nt_branch=NULL;
6413   int taken=0;
6414   int nottaken=0;
6415   int unconditional=0;
6416   if(rs1[i]==0)
6417   {
6418     s1l=s2l;s1h=s2h;
6419     s2l=s2h=-1;
6420   }
6421   else if(rs2[i]==0)
6422   {
6423     s2l=s2h=-1;
6424   }
6425   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
6426     s1h=s2h=-1;
6427   }
6428   int hr=0;
6429   int addr,alt,ntaddr;
6430   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
6431   else {
6432     while(hr<HOST_REGS)
6433     {
6434       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
6435          (i_regs->regmap[hr]&63)!=rs1[i] &&
6436          (i_regs->regmap[hr]&63)!=rs2[i] )
6437       {
6438         addr=hr++;break;
6439       }
6440       hr++;
6441     }
6442   }
6443   while(hr<HOST_REGS)
6444   {
6445     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6446        (i_regs->regmap[hr]&63)!=rs1[i] &&
6447        (i_regs->regmap[hr]&63)!=rs2[i] )
6448     {
6449       alt=hr++;break;
6450     }
6451     hr++;
6452   }
6453   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
6454   {
6455     while(hr<HOST_REGS)
6456     {
6457       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6458          (i_regs->regmap[hr]&63)!=rs1[i] &&
6459          (i_regs->regmap[hr]&63)!=rs2[i] )
6460       {
6461         ntaddr=hr;break;
6462       }
6463       hr++;
6464     }
6465   }
6466   assert(hr<HOST_REGS);
6467   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
6468     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
6469   }
6470   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
6471   if(opcode[i]==2) // J
6472   {
6473     unconditional=1;
6474   }
6475   if(opcode[i]==3) // JAL
6476   {
6477     // TODO: mini_ht
6478     int rt=get_reg(i_regs->regmap,31);
6479     emit_movimm(start+i*4+8,rt);
6480     unconditional=1;
6481   }
6482   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6483   {
6484     emit_mov(s1l,addr);
6485     if(opcode2[i]==9) // JALR
6486     {
6487       int rt=get_reg(i_regs->regmap,rt1[i]);
6488       emit_movimm(start+i*4+8,rt);
6489     }
6490   }
6491   if((opcode[i]&0x3f)==4) // BEQ
6492   {
6493     if(rs1[i]==rs2[i])
6494     {
6495       unconditional=1;
6496     }
6497     else
6498     #ifdef HAVE_CMOV_IMM
6499     if(s1h<0) {
6500       if(s2l>=0) emit_cmp(s1l,s2l);
6501       else emit_test(s1l,s1l);
6502       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6503     }
6504     else
6505     #endif
6506     {
6507       assert(s1l>=0);
6508       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6509       if(s1h>=0) {
6510         if(s2h>=0) emit_cmp(s1h,s2h);
6511         else emit_test(s1h,s1h);
6512         emit_cmovne_reg(alt,addr);
6513       }
6514       if(s2l>=0) emit_cmp(s1l,s2l);
6515       else emit_test(s1l,s1l);
6516       emit_cmovne_reg(alt,addr);
6517     }
6518   }
6519   if((opcode[i]&0x3f)==5) // BNE
6520   {
6521     #ifdef HAVE_CMOV_IMM
6522     if(s1h<0) {
6523       if(s2l>=0) emit_cmp(s1l,s2l);
6524       else emit_test(s1l,s1l);
6525       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6526     }
6527     else
6528     #endif
6529     {
6530       assert(s1l>=0);
6531       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6532       if(s1h>=0) {
6533         if(s2h>=0) emit_cmp(s1h,s2h);
6534         else emit_test(s1h,s1h);
6535         emit_cmovne_reg(alt,addr);
6536       }
6537       if(s2l>=0) emit_cmp(s1l,s2l);
6538       else emit_test(s1l,s1l);
6539       emit_cmovne_reg(alt,addr);
6540     }
6541   }
6542   if((opcode[i]&0x3f)==0x14) // BEQL
6543   {
6544     if(s1h>=0) {
6545       if(s2h>=0) emit_cmp(s1h,s2h);
6546       else emit_test(s1h,s1h);
6547       nottaken=(int)out;
6548       emit_jne(0);
6549     }
6550     if(s2l>=0) emit_cmp(s1l,s2l);
6551     else emit_test(s1l,s1l);
6552     if(nottaken) set_jump_target(nottaken,(int)out);
6553     nottaken=(int)out;
6554     emit_jne(0);
6555   }
6556   if((opcode[i]&0x3f)==0x15) // BNEL
6557   {
6558     if(s1h>=0) {
6559       if(s2h>=0) emit_cmp(s1h,s2h);
6560       else emit_test(s1h,s1h);
6561       taken=(int)out;
6562       emit_jne(0);
6563     }
6564     if(s2l>=0) emit_cmp(s1l,s2l);
6565     else emit_test(s1l,s1l);
6566     nottaken=(int)out;
6567     emit_jeq(0);
6568     if(taken) set_jump_target(taken,(int)out);
6569   }
6570   if((opcode[i]&0x3f)==6) // BLEZ
6571   {
6572     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6573     emit_cmpimm(s1l,1);
6574     if(s1h>=0) emit_mov(addr,ntaddr);
6575     emit_cmovl_reg(alt,addr);
6576     if(s1h>=0) {
6577       emit_test(s1h,s1h);
6578       emit_cmovne_reg(ntaddr,addr);
6579       emit_cmovs_reg(alt,addr);
6580     }
6581   }
6582   if((opcode[i]&0x3f)==7) // BGTZ
6583   {
6584     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6585     emit_cmpimm(s1l,1);
6586     if(s1h>=0) emit_mov(addr,alt);
6587     emit_cmovl_reg(ntaddr,addr);
6588     if(s1h>=0) {
6589       emit_test(s1h,s1h);
6590       emit_cmovne_reg(alt,addr);
6591       emit_cmovs_reg(ntaddr,addr);
6592     }
6593   }
6594   if((opcode[i]&0x3f)==0x16) // BLEZL
6595   {
6596     assert((opcode[i]&0x3f)!=0x16);
6597   }
6598   if((opcode[i]&0x3f)==0x17) // BGTZL
6599   {
6600     assert((opcode[i]&0x3f)!=0x17);
6601   }
6602   assert(opcode[i]!=1); // BLTZ/BGEZ
6603
6604   //FIXME: Check CSREG
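  // For BC1x branches s1l holds FCR31; 0x800000 is bit 23, the FPU
  // compare/condition flag that these branches test.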
6605   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6606     if((source[i]&0x30000)==0) // BC1F
6607     {
6608       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6609       emit_testimm(s1l,0x800000);
6610       emit_cmovne_reg(alt,addr);
6611     }
6612     if((source[i]&0x30000)==0x10000) // BC1T
6613     {
6614       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6615       emit_testimm(s1l,0x800000);
6616       emit_cmovne_reg(alt,addr);
6617     }
6618     if((source[i]&0x30000)==0x20000) // BC1FL
6619     {
6620       emit_testimm(s1l,0x800000);
6621       nottaken=(int)out;
6622       emit_jne(0);
6623     }
6624     if((source[i]&0x30000)==0x30000) // BC1TL
6625     {
6626       emit_testimm(s1l,0x800000);
6627       nottaken=(int)out;
6628       emit_jeq(0);
6629     }
6630   }
6631
6632   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6633   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6634   if(likely[i]||unconditional)
6635   {
6636     emit_movimm(ba[i],HOST_BTREG);
6637   }
6638   else if(addr!=HOST_BTREG)
6639   {
6640     emit_mov(addr,HOST_BTREG);
6641   }
6642   void *branch_addr=out;
6643   emit_jmp(0);
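  // start+i*4+4 is the first instruction of the next page (this branch's
  // delay slot); the +1 apparently tags it as a delay-slot entry so it
  // doesn't collide with a normal block entry at the same address
  // (cf. pagespan_ds, which registers its stub under vaddr=start+1).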
6644   int target_addr=start+i*4+5;
6645   void *stub=out;
6646   void *compiled_target_addr=check_addr(target_addr);
6647   emit_extjump_ds((int)branch_addr,target_addr);
6648   if(compiled_target_addr) {
6649     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6650     add_link(target_addr,stub);
6651   }
6652   else set_jump_target((int)branch_addr,(int)stub);
6653   if(likely[i]) {
6654     // Not-taken path
6655     set_jump_target((int)nottaken,(int)out);
6656     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6657     void *branch_addr=out;
6658     emit_jmp(0);
6659     int target_addr=start+i*4+8;
6660     void *stub=out;
6661     void *compiled_target_addr=check_addr(target_addr);
6662     emit_extjump_ds((int)branch_addr,target_addr);
6663     if(compiled_target_addr) {
6664       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6665       add_link(target_addr,stub);
6666     }
6667     else set_jump_target((int)branch_addr,(int)stub);
6668   }
6669 }
6670
6671 // Assemble the delay slot for the above
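// The page-spanning delay slot gets its own entries in jump_dirty/jump_in,
// keyed by start+1 (low bit set) rather than start, so it can be looked up
// separately from the normal entry point of the block.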
6672 static void pagespan_ds()
6673 {
6674   assem_debug("initial delay slot:\n");
6675   u_int vaddr=start+1;
6676   u_int page=get_page(vaddr);
6677   u_int vpage=get_vpage(vaddr);
6678   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6679   do_dirty_stub_ds();
6680   ll_add(jump_in+page,vaddr,(void *)out);
6681   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6682   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6683     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6684   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6685     emit_writeword(HOST_BTREG,(int)&branch_target);
6686   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6687   address_generation(0,&regs[0],regs[0].regmap_entry);
6688   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6689     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6690   cop1_usable=0;
6691   is_delayslot=0;
6692   switch(itype[0]) {
6693     case ALU:
6694       alu_assemble(0,&regs[0]);break;
6695     case IMM16:
6696       imm16_assemble(0,&regs[0]);break;
6697     case SHIFT:
6698       shift_assemble(0,&regs[0]);break;
6699     case SHIFTIMM:
6700       shiftimm_assemble(0,&regs[0]);break;
6701     case LOAD:
6702       load_assemble(0,&regs[0]);break;
6703     case LOADLR:
6704       loadlr_assemble(0,&regs[0]);break;
6705     case STORE:
6706       store_assemble(0,&regs[0]);break;
6707     case STORELR:
6708       storelr_assemble(0,&regs[0]);break;
6709     case COP0:
6710       cop0_assemble(0,&regs[0]);break;
6711     case COP1:
6712       cop1_assemble(0,&regs[0]);break;
6713     case C1LS:
6714       c1ls_assemble(0,&regs[0]);break;
6715     case COP2:
6716       cop2_assemble(0,&regs[0]);break;
6717     case C2LS:
6718       c2ls_assemble(0,&regs[0]);break;
6719     case C2OP:
6720       c2op_assemble(0,&regs[0]);break;
6721     case FCONV:
6722       fconv_assemble(0,&regs[0]);break;
6723     case FLOAT:
6724       float_assemble(0,&regs[0]);break;
6725     case FCOMP:
6726       fcomp_assemble(0,&regs[0]);break;
6727     case MULTDIV:
6728       multdiv_assemble(0,&regs[0]);break;
6729     case MOV:
6730       mov_assemble(0,&regs[0]);break;
6731     case SYSCALL:
6732     case HLECALL:
6733     case INTCALL:
6734     case SPAN:
6735     case UJUMP:
6736     case RJUMP:
6737     case CJUMP:
6738     case SJUMP:
6739     case FJUMP:
6740       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
6741   }
6742   int btaddr=get_reg(regs[0].regmap,BTREG);
6743   if(btaddr<0) {
6744     btaddr=get_reg(regs[0].regmap,-1);
6745     emit_readword((int)&branch_target,btaddr);
6746   }
6747   assert(btaddr!=HOST_CCREG);
6748   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6749 #ifdef HOST_IMM8
6750   emit_movimm(start+4,HOST_TEMPREG);
6751   emit_cmp(btaddr,HOST_TEMPREG);
6752 #else
6753   emit_cmpimm(btaddr,start+4);
6754 #endif
6755   int branch=(int)out;
6756   emit_jeq(0);
6757   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6758   emit_jmp(jump_vaddr_reg[btaddr]);
6759   set_jump_target(branch,(int)out);
6760   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6761   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6762 }
6763
6764 // Basic liveness analysis for MIPS registers
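// Walks the block backwards from iend to istart.  unneeded_reg[i] is a
// bitmask of MIPS registers whose value is dead at instruction i (bit 0,
// $zero, is always set); unneeded_reg_upper[i] tracks the upper 32 bits
// separately, and gte_unneeded[i] does the same for GTE (COP2) registers.
// r limits the recursion depth used for backward branches.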
6765 void unneeded_registers(int istart,int iend,int r)
6766 {
6767   int i;
6768   uint64_t u,uu,gte_u,b,bu,gte_bu;
6769   uint64_t temp_u,temp_uu,temp_gte_u=0;
6770   uint64_t tdep;
6771   uint64_t gte_u_unknown=0;
6772   if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
6773     gte_u_unknown=~0ll;
6774   if(iend==slen-1) {
6775     u=1;uu=1;
6776     gte_u=gte_u_unknown;
6777   }else{
6778     u=unneeded_reg[iend+1];
6779     uu=unneeded_reg_upper[iend+1];
6780     u=1;uu=1;
6781     gte_u=gte_unneeded[iend+1];
6782   }
6783
6784   for (i=iend;i>=istart;i--)
6785   {
6786     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6787     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6788     {
6789       // If subroutine call, flag return address as a possible branch target
6790       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6791
6792       if(ba[i]<start || ba[i]>=(start+slen*4))
6793       {
6794         // Branch out of this block, flush all regs
6795         u=1;
6796         uu=1;
6797         gte_u=gte_u_unknown;
6798         /* Hexagon hack
6799         if(itype[i]==UJUMP&&rt1[i]==31)
6800         {
6801           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6802         }
6803         if(itype[i]==RJUMP&&rs1[i]==31)
6804         {
6805           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6806         }
6807         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6808           if(itype[i]==UJUMP&&rt1[i]==31)
6809           {
6810             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6811             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6812           }
6813           if(itype[i]==RJUMP&&rs1[i]==31)
6814           {
6815             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6816             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6817           }
6818         }*/
6819         branch_unneeded_reg[i]=u;
6820         branch_unneeded_reg_upper[i]=uu;
6821         // Merge in delay slot
6822         tdep=(~uu>>rt1[i+1])&1;
6823         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6824         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6825         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6826         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6827         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6828         u|=1;uu|=1;
6829         gte_u|=gte_rt[i+1];
6830         gte_u&=~gte_rs[i+1];
6831         // If branch is "likely" (and conditional)
6832         // then we skip the delay slot on the fall-thru path
6833         if(likely[i]) {
6834           if(i<slen-1) {
6835             u&=unneeded_reg[i+2];
6836             uu&=unneeded_reg_upper[i+2];
6837             gte_u&=gte_unneeded[i+2];
6838           }
6839           else
6840           {
6841             u=1;
6842             uu=1;
6843             gte_u=gte_u_unknown;
6844           }
6845         }
6846       }
6847       else
6848       {
6849         // Internal branch, flag target
6850         bt[(ba[i]-start)>>2]=1;
6851         if(ba[i]<=start+i*4) {
6852           // Backward branch
6853           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6854           {
6855             // Unconditional branch
6856             temp_u=1;temp_uu=1;
6857             temp_gte_u=0;
6858           } else {
6859             // Conditional branch (not taken case)
6860             temp_u=unneeded_reg[i+2];
6861             temp_uu=unneeded_reg_upper[i+2];
6862             temp_gte_u&=gte_unneeded[i+2];
6863           }
6864           // Merge in delay slot
6865           tdep=(~temp_uu>>rt1[i+1])&1;
6866           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6867           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6868           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6869           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6870           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6871           temp_u|=1;temp_uu|=1;
6872           temp_gte_u|=gte_rt[i+1];
6873           temp_gte_u&=~gte_rs[i+1];
6874           // If branch is "likely" (and conditional)
6875           // then we skip the delay slot on the fall-thru path
6876           if(likely[i]) {
6877             if(i<slen-1) {
6878               temp_u&=unneeded_reg[i+2];
6879               temp_uu&=unneeded_reg_upper[i+2];
6880               temp_gte_u&=gte_unneeded[i+2];
6881             }
6882             else
6883             {
6884               temp_u=1;
6885               temp_uu=1;
6886               temp_gte_u=gte_u_unknown;
6887             }
6888           }
6889           tdep=(~temp_uu>>rt1[i])&1;
6890           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6891           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6892           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6893           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6894           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6895           temp_u|=1;temp_uu|=1;
6896           temp_gte_u|=gte_rt[i];
6897           temp_gte_u&=~gte_rs[i];
6898           unneeded_reg[i]=temp_u;
6899           unneeded_reg_upper[i]=temp_uu;
6900           gte_unneeded[i]=temp_gte_u;
6901           // Only go three levels deep.  This recursion can take an
6902           // excessive amount of time if there are a lot of nested loops.
6903           if(r<2) {
6904             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6905           }else{
6906             unneeded_reg[(ba[i]-start)>>2]=1;
6907             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6908             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
6909           }
6910         } /*else*/ if(1) {
6911           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6912           {
6913             // Unconditional branch
6914             u=unneeded_reg[(ba[i]-start)>>2];
6915             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6916             gte_u=gte_unneeded[(ba[i]-start)>>2];
6917             branch_unneeded_reg[i]=u;
6918             branch_unneeded_reg_upper[i]=uu;
6919         //u=1;
6920         //uu=1;
6921         //branch_unneeded_reg[i]=u;
6922         //branch_unneeded_reg_upper[i]=uu;
6923             // Merge in delay slot
6924             tdep=(~uu>>rt1[i+1])&1;
6925             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6926             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6927             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6928             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6929             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6930             u|=1;uu|=1;
6931             gte_u|=gte_rt[i+1];
6932             gte_u&=~gte_rs[i+1];
6933           } else {
6934             // Conditional branch
6935             b=unneeded_reg[(ba[i]-start)>>2];
6936             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6937             gte_bu=gte_unneeded[(ba[i]-start)>>2];
6938             branch_unneeded_reg[i]=b;
6939             branch_unneeded_reg_upper[i]=bu;
6940         //b=1;
6941         //bu=1;
6942         //branch_unneeded_reg[i]=b;
6943         //branch_unneeded_reg_upper[i]=bu;
6944             // Branch delay slot
6945             tdep=(~uu>>rt1[i+1])&1;
6946             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6947             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6948             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6949             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6950             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6951             b|=1;bu|=1;
6952             gte_bu|=gte_rt[i+1];
6953             gte_bu&=~gte_rs[i+1];
6954             // If branch is "likely" then we skip the
6955             // delay slot on the fall-thru path
6956             if(likely[i]) {
6957               u=b;
6958               uu=bu;
6959               gte_u=gte_bu;
6960               if(i<slen-1) {
6961                 u&=unneeded_reg[i+2];
6962                 uu&=unneeded_reg_upper[i+2];
6963                 gte_u&=gte_unneeded[i+2];
6964         //u=1;
6965         //uu=1;
6966               }
6967             } else {
6968               u&=b;
6969               uu&=bu;
6970               gte_u&=gte_bu;
6971         //u=1;
6972         //uu=1;
6973             }
6974             if(i<slen-1) {
6975               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6976               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6977         //branch_unneeded_reg[i]=1;
6978         //branch_unneeded_reg_upper[i]=1;
6979             } else {
6980               branch_unneeded_reg[i]=1;
6981               branch_unneeded_reg_upper[i]=1;
6982             }
6983           }
6984         }
6985       }
6986     }
6987     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6988     {
6989       // SYSCALL instruction (software interrupt)
6990       u=1;
6991       uu=1;
6992     }
6993     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6994     {
6995       // ERET instruction (return from interrupt)
6996       u=1;
6997       uu=1;
6998     }
6999     //u=uu=1; // DEBUG
7000     tdep=(~uu>>rt1[i])&1;
7001     // Written registers are unneeded
7002     u|=1LL<<rt1[i];
7003     u|=1LL<<rt2[i];
7004     uu|=1LL<<rt1[i];
7005     uu|=1LL<<rt2[i];
7006     gte_u|=gte_rt[i];
7007     // Accessed registers are needed
7008     u&=~(1LL<<rs1[i]);
7009     u&=~(1LL<<rs2[i]);
7010     uu&=~(1LL<<us1[i]);
7011     uu&=~(1LL<<us2[i]);
7012     gte_u&=~gte_rs[i];
7013     if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
7014       gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
7015     // Source-target dependencies
7016     uu&=~(tdep<<dep1[i]);
7017     uu&=~(tdep<<dep2[i]);
7018     // R0 is always unneeded
7019     u|=1;uu|=1;
7020     // Save it
7021     unneeded_reg[i]=u;
7022     unneeded_reg_upper[i]=uu;
7023     gte_unneeded[i]=gte_u;
7024     /*
7025     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
7026     printf("U:");
7027     int r;
7028     for(r=1;r<=CCREG;r++) {
7029       if((unneeded_reg[i]>>r)&1) {
7030         if(r==HIREG) printf(" HI");
7031         else if(r==LOREG) printf(" LO");
7032         else printf(" r%d",r);
7033       }
7034     }
7035     printf(" UU:");
7036     for(r=1;r<=CCREG;r++) {
7037       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
7038         if(r==HIREG) printf(" HI");
7039         else if(r==LOREG) printf(" LO");
7040         else printf(" r%d",r);
7041       }
7042     }
7043     printf("\n");*/
7044   }
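// On 32-bit-only (FORCE32) builds the upper register halves are never needed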
7045 #ifdef FORCE32
7046   for (i=iend;i>=istart;i--)
7047   {
7048     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
7049   }
7050 #endif
7051 }
7052
7053 // Identify registers which are likely to contain 32-bit values
7054 // This is used to predict whether any branches will jump to a
7055 // location with 64-bit values in registers.
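// The result is written to p32[]: bit r set means register r can be
// assumed to hold a sign-extended 32-bit value after instruction i.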
7056 static void provisional_32bit()
7057 {
7058   int i,j;
7059   uint64_t is32=1;
7060   uint64_t lastbranch=1;
7061
7062   for(i=0;i<slen;i++)
7063   {
7064     if(i>0) {
7065       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
7066         if(i>1) is32=lastbranch;
7067         else is32=1;
7068       }
7069     }
7070     if(i>1)
7071     {
7072       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
7073         if(likely[i-2]) {
7074           if(i>2) is32=lastbranch;
7075           else is32=1;
7076         }
7077       }
7078       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
7079       {
7080         if(rs1[i-2]==0||rs2[i-2]==0)
7081         {
7082           if(rs1[i-2]) {
7083             is32|=1LL<<rs1[i-2];
7084           }
7085           if(rs2[i-2]) {
7086             is32|=1LL<<rs2[i-2];
7087           }
7088         }
7089       }
7090     }
7091     // If something jumps here with 64-bit values
7092     // then promote those registers to 64 bits
7093     if(bt[i])
7094     {
7095       uint64_t temp_is32=is32;
7096       for(j=i-1;j>=0;j--)
7097       {
7098         if(ba[j]==start+i*4)
7099           //temp_is32&=branch_regs[j].is32;
7100           temp_is32&=p32[j];
7101       }
7102       for(j=i;j<slen;j++)
7103       {
7104         if(ba[j]==start+i*4)
7105           temp_is32=1;
7106       }
7107       is32=temp_is32;
7108     }
7109     int type=itype[i];
7110     int op=opcode[i];
7111     int op2=opcode2[i];
7112     int rt=rt1[i];
7113     int s1=rs1[i];
7114     int s2=rs2[i];
7115     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
7116       // Branches don't write registers, so consider the delay slot instead.
7117       type=itype[i+1];
7118       op=opcode[i+1];
7119       op2=opcode2[i+1];
7120       rt=rt1[i+1];
7121       s1=rs1[i+1];
7122       s2=rs2[i+1];
7123       lastbranch=is32;
7124     }
7125     switch(type) {
7126       case LOAD:
7127         if(opcode[i]==0x27||opcode[i]==0x37|| // LWU/LD
7128            opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
7129           is32&=~(1LL<<rt);
7130         else
7131           is32|=1LL<<rt;
7132         break;
7133       case STORE:
7134       case STORELR:
7135         break;
7136       case LOADLR:
7137         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
7138         if(op==0x22) is32|=1LL<<rt; // LWL
7139         break;
7140       case IMM16:
7141         if (op==0x08||op==0x09|| // ADDI/ADDIU
7142             op==0x0a||op==0x0b|| // SLTI/SLTIU
7143             op==0x0c|| // ANDI
7144             op==0x0f)  // LUI
7145         {
7146           is32|=1LL<<rt;
7147         }
7148         if(op==0x18||op==0x19) { // DADDI/DADDIU
7149           is32&=~(1LL<<rt);
7150           //if(imm[i]==0)
7151           //  is32|=((is32>>s1)&1LL)<<rt;
7152         }
7153         if(op==0x0d||op==0x0e) { // ORI/XORI
7154           uint64_t sr=((is32>>s1)&1LL);
7155           is32&=~(1LL<<rt);
7156           is32|=sr<<rt;
7157         }
7158         break;
7159       case UJUMP:
7160         break;
7161       case RJUMP:
7162         break;
7163       case CJUMP:
7164         break;
7165       case SJUMP:
7166         break;
7167       case FJUMP:
7168         break;
7169       case ALU:
7170         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
7171           is32|=1LL<<rt;
7172         }
7173         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7174           is32|=1LL<<rt;
7175         }
7176         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7177           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
7178           is32&=~(1LL<<rt);
7179           is32|=sr<<rt;
7180         }
7181         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
7182           if(s1==0&&s2==0) {
7183             is32|=1LL<<rt;
7184           }
7185           else if(s2==0) {
7186             uint64_t sr=((is32>>s1)&1LL);
7187             is32&=~(1LL<<rt);
7188             is32|=sr<<rt;
7189           }
7190           else if(s1==0) {
7191             uint64_t sr=((is32>>s2)&1LL);
7192             is32&=~(1LL<<rt);
7193             is32|=sr<<rt;
7194           }
7195           else {
7196             is32&=~(1LL<<rt);
7197           }
7198         }
7199         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
7200           if(s1==0&&s2==0) {
7201             is32|=1LL<<rt;
7202           }
7203           else if(s2==0) {
7204             uint64_t sr=((is32>>s1)&1LL);
7205             is32&=~(1LL<<rt);
7206             is32|=sr<<rt;
7207           }
7208           else {
7209             is32&=~(1LL<<rt);
7210           }
7211         }
7212         break;
7213       case MULTDIV:
7214         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7215           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
7216         }
7217         else {
7218           is32|=(1LL<<HIREG)|(1LL<<LOREG);
7219         }
7220         break;
7221       case MOV:
7222         {
7223           uint64_t sr=((is32>>s1)&1LL);
7224           is32&=~(1LL<<rt);
7225           is32|=sr<<rt;
7226         }
7227         break;
7228       case SHIFT:
7229         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
7230         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
7231         break;
7232       case SHIFTIMM:
7233         is32|=1LL<<rt;
7234         // DSLL/DSRL/DSRA/DSLL32/DSRL32 but not DSRA32 have 64-bit result
7235         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
7236         break;
7237       case COP0:
7238         if(op2==0) is32|=1LL<<rt; // MFC0
7239         break;
7240       case COP1:
7241       case COP2:
7242         if(op2==0) is32|=1LL<<rt; // MFC1
7243         if(op2==1) is32&=~(1LL<<rt); // DMFC1
7244         if(op2==2) is32|=1LL<<rt; // CFC1
7245         break;
7246       case C1LS:
7247       case C2LS:
7248         break;
7249       case FLOAT:
7250       case FCONV:
7251         break;
7252       case FCOMP:
7253         break;
7254       case C2OP:
7255       case SYSCALL:
7256       case HLECALL:
7257         break;
7258       default:
7259         break;
7260     }
7261     is32|=1;
7262     p32[i]=is32;
7263
7264     if(i>0)
7265     {
7266       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
7267       {
7268         if(rt1[i-1]==31) // JAL/JALR
7269         {
7270           // Subroutine call will return here, don't alloc any registers
7271           is32=1;
7272         }
7273         else if(i+1<slen)
7274         {
7275           // Internal branch will jump here, match registers to caller
7276           is32=0x3FFFFFFFFLL;
7277         }
7278       }
7279     }
7280   }
7281 }
7282
7283 // Identify registers which may be assumed to contain 32-bit values
7284 // and where optimizations will rely on this.
7285 // This is used to determine whether backward branches can safely
7286 // jump to a location with 64-bit values in registers.
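// The result is written to pr32[] (a provisional requires_32bit[]): bits
// for registers that later code relies on still holding valid 32-bit values.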
7287 static void provisional_r32()
7288 {
7289   u_int r32=0;
7290   int i;
7291
7292   for (i=slen-1;i>=0;i--)
7293   {
7294     int hr;
7295     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7296     {
7297       if(ba[i]<start || ba[i]>=(start+slen*4))
7298       {
7299         // Branch out of this block, don't need anything
7300         r32=0;
7301       }
7302       else
7303       {
7304         // Internal branch
7305         // Need whatever matches the target
7306         // (and doesn't get overwritten by the delay slot instruction)
7307         r32=0;
7308         int t=(ba[i]-start)>>2;
7309         if(ba[i]>start+i*4) {
7310           // Forward branch
7311           //if(!(requires_32bit[t]&~regs[i].was32))
7312           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7313           if(!(pr32[t]&~regs[i].was32))
7314             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7315         }else{
7316           // Backward branch
7317           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
7318             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7319         }
7320       }
7321       // Conditional branch may need registers for following instructions
7322       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
7323       {
7324         if(i<slen-2) {
7325           //r32|=requires_32bit[i+2];
7326           r32|=pr32[i+2];
7327           r32&=regs[i].was32;
7328           // Mark this address as a branch target since it may be called
7329           // upon return from interrupt
7330           //bt[i+2]=1;
7331         }
7332       }
7333       // Merge in delay slot
7334       if(!likely[i]) {
7335         // These are overwritten unless the branch is "likely"
7336         // and the delay slot is nullified if not taken
7337         r32&=~(1LL<<rt1[i+1]);
7338         r32&=~(1LL<<rt2[i+1]);
7339       }
7340       // Assume these are needed (delay slot)
7341       if(us1[i+1]>0)
7342       {
7343         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
7344       }
7345       if(us2[i+1]>0)
7346       {
7347         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
7348       }
7349       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
7350       {
7351         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
7352       }
7353       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
7354       {
7355         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
7356       }
7357     }
7358     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7359     {
7360       // SYSCALL instruction (software interrupt)
7361       r32=0;
7362     }
7363     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7364     {
7365       // ERET instruction (return from interrupt)
7366       r32=0;
7367     }
7368     // Check 32 bits
7369     r32&=~(1LL<<rt1[i]);
7370     r32&=~(1LL<<rt2[i]);
7371     if(us1[i]>0)
7372     {
7373       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
7374     }
7375     if(us2[i]>0)
7376     {
7377       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
7378     }
7379     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
7380     {
7381       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
7382     }
7383     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
7384     {
7385       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
7386     }
7387     //requires_32bit[i]=r32;
7388     pr32[i]=r32;
7389
7390     // Dirty registers which are 32-bit require 32-bit input,
7391     // as they will be written back as 32-bit values
7392     for(hr=0;hr<HOST_REGS;hr++)
7393     {
7394       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
7395         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
7396           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
7397             pr32[i]|=1LL<<regs[i].regmap_entry[hr];
7398           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
7399         }
7400       }
7401     }
7402   }
7403 }
7404
7405 // Write back dirty registers as soon as we will no longer modify them,
7406 // so that we don't end up with lots of writes at the branches.
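// Backward pass from iend to istart.  will_dirty[i] and wont_dirty[i] are
// host-register bitmasks; when wr is nonzero the dirty/wasdirty flags in
// regs[]/branch_regs[] are updated in place, otherwise only the masks are
// computed (used when following branch targets recursively with wr=0).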
7407 void clean_registers(int istart,int iend,int wr)
7408 {
7409   int i;
7410   int r;
7411   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
7412   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
7413   if(iend==slen-1) {
7414     will_dirty_i=will_dirty_next=0;
7415     wont_dirty_i=wont_dirty_next=0;
7416   }else{
7417     will_dirty_i=will_dirty_next=will_dirty[iend+1];
7418     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
7419   }
7420   for (i=iend;i>=istart;i--)
7421   {
7422     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7423     {
7424       if(ba[i]<start || ba[i]>=(start+slen*4))
7425       {
7426         // Branch out of this block, flush all regs
7427         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7428         {
7429           // Unconditional branch
7430           will_dirty_i=0;
7431           wont_dirty_i=0;
7432           // Merge in delay slot (will dirty)
7433           for(r=0;r<HOST_REGS;r++) {
7434             if(r!=EXCLUDE_REG) {
7435               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7436               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7437               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7438               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7439               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7440               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7441               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7442               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7443               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7444               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7445               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7446               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7447               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7448               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7449             }
7450           }
7451         }
7452         else
7453         {
7454           // Conditional branch
7455           will_dirty_i=0;
7456           wont_dirty_i=wont_dirty_next;
7457           // Merge in delay slot (will dirty)
7458           for(r=0;r<HOST_REGS;r++) {
7459             if(r!=EXCLUDE_REG) {
7460               if(!likely[i]) {
7461                 // Might not dirty if likely branch is not taken
7462                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7463                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7464                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7465                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7466                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7467                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
7468                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7469                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7470                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7471                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7472                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7473                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7474                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7475                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7476               }
7477             }
7478           }
7479         }
7480         // Merge in delay slot (wont dirty)
7481         for(r=0;r<HOST_REGS;r++) {
7482           if(r!=EXCLUDE_REG) {
7483             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7484             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7485             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7486             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7487             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7488             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7489             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7490             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7491             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7492             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7493           }
7494         }
7495         if(wr) {
7496           #ifndef DESTRUCTIVE_WRITEBACK
7497           branch_regs[i].dirty&=wont_dirty_i;
7498           #endif
7499           branch_regs[i].dirty|=will_dirty_i;
7500         }
7501       }
7502       else
7503       {
7504         // Internal branch
7505         if(ba[i]<=start+i*4) {
7506           // Backward branch
7507           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7508           {
7509             // Unconditional branch
7510             temp_will_dirty=0;
7511             temp_wont_dirty=0;
7512             // Merge in delay slot (will dirty)
7513             for(r=0;r<HOST_REGS;r++) {
7514               if(r!=EXCLUDE_REG) {
7515                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7516                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7517                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7518                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7519                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7520                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7521                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7522                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7523                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7524                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7525                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7526                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7527                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7528                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7529               }
7530             }
7531           } else {
7532             // Conditional branch (not taken case)
7533             temp_will_dirty=will_dirty_next;
7534             temp_wont_dirty=wont_dirty_next;
7535             // Merge in delay slot (will dirty)
7536             for(r=0;r<HOST_REGS;r++) {
7537               if(r!=EXCLUDE_REG) {
7538                 if(!likely[i]) {
7539                   // Will not dirty if likely branch is not taken
7540                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7541                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7542                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7543                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7544                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7545                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7546                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7547                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7548                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7549                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7550                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7551                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7552                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7553                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7554                 }
7555               }
7556             }
7557           }
7558           // Merge in delay slot (wont dirty)
7559           for(r=0;r<HOST_REGS;r++) {
7560             if(r!=EXCLUDE_REG) {
7561               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7562               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7563               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7564               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7565               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7566               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7567               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7568               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7569               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7570               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7571             }
7572           }
7573           // Deal with changed mappings
7574           if(i<iend) {
7575             for(r=0;r<HOST_REGS;r++) {
7576               if(r!=EXCLUDE_REG) {
7577                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7578                   temp_will_dirty&=~(1<<r);
7579                   temp_wont_dirty&=~(1<<r);
7580                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7581                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7582                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7583                   } else {
7584                     temp_will_dirty|=1<<r;
7585                     temp_wont_dirty|=1<<r;
7586                   }
7587                 }
7588               }
7589             }
7590           }
7591           if(wr) {
7592             will_dirty[i]=temp_will_dirty;
7593             wont_dirty[i]=temp_wont_dirty;
7594             clean_registers((ba[i]-start)>>2,i-1,0);
7595           }else{
7596             // Limit recursion.  It can take an excessive amount
7597             // of time if there are a lot of nested loops.
7598             will_dirty[(ba[i]-start)>>2]=0;
7599             wont_dirty[(ba[i]-start)>>2]=-1;
7600           }
7601         }
7602         /*else*/ if(1)
7603         {
7604           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7605           {
7606             // Unconditional branch
7607             will_dirty_i=0;
7608             wont_dirty_i=0;
7609           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7610             for(r=0;r<HOST_REGS;r++) {
7611               if(r!=EXCLUDE_REG) {
7612                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7613                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7614                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7615                 }
7616                 if(branch_regs[i].regmap[r]>=0) {
7617                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
7618                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
7619                 }
7620               }
7621             }
7622           //}
7623             // Merge in delay slot
7624             for(r=0;r<HOST_REGS;r++) {
7625               if(r!=EXCLUDE_REG) {
7626                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7627                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7628                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7629                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7630                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7631                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7632                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7633                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7634                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7635                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7636                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7637                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7638                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7639                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7640               }
7641             }
7642           } else {
7643             // Conditional branch
7644             will_dirty_i=will_dirty_next;
7645             wont_dirty_i=wont_dirty_next;
7646           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7647             for(r=0;r<HOST_REGS;r++) {
7648               if(r!=EXCLUDE_REG) {
7649                 signed char target_reg=branch_regs[i].regmap[r];
7650                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7651                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7652                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7653                 }
7654                 else if(target_reg>=0) {
7655                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
7656                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
7657                 }
7658                 // Treat delay slot as part of branch too
7659                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7660                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7661                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7662                 }
7663                 else
7664                 {
7665                   will_dirty[i+1]&=~(1<<r);
7666                 }*/
7667               }
7668             }
7669           //}
7670             // Merge in delay slot
7671             for(r=0;r<HOST_REGS;r++) {
7672               if(r!=EXCLUDE_REG) {
7673                 if(!likely[i]) {
7674                   // Might not dirty if likely branch is not taken
7675                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7676                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7677                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7678                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7679                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7680                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7681                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7682                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7683                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7684                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7685                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7686                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7687                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7688                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7689                 }
7690               }
7691             }
7692           }
7693           // Merge in delay slot (won't dirty)
7694           for(r=0;r<HOST_REGS;r++) {
7695             if(r!=EXCLUDE_REG) {
7696               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7697               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7698               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7699               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7700               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7701               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7702               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7703               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7704               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7705               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7706             }
7707           }
7708           if(wr) {
7709             #ifndef DESTRUCTIVE_WRITEBACK
7710             branch_regs[i].dirty&=wont_dirty_i;
7711             #endif
7712             branch_regs[i].dirty|=will_dirty_i;
7713           }
7714         }
7715       }
7716     }
7717     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7718     {
7719       // SYSCALL instruction (software interrupt)
7720       will_dirty_i=0;
7721       wont_dirty_i=0;
7722     }
7723     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7724     {
7725       // ERET instruction (return from interrupt)
7726       will_dirty_i=0;
7727       wont_dirty_i=0;
7728     }
7729     will_dirty_next=will_dirty_i;
7730     wont_dirty_next=wont_dirty_i;
7731     for(r=0;r<HOST_REGS;r++) {
7732       if(r!=EXCLUDE_REG) {
7733         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7734         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7735         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7736         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7737         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7738         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7739         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7740         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7741         if(i>istart) {
7742           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP)
7743           {
7744             // Don't store a register immediately after writing it;
7745             // doing so may prevent dual-issue.
7746             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7747             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7748           }
7749         }
7750       }
7751     }
7752     // Save it
7753     will_dirty[i]=will_dirty_i;
7754     wont_dirty[i]=wont_dirty_i;
7755     // Mark registers that won't be dirtied as not dirty
7756     if(wr) {
7757       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7758       for(r=0;r<HOST_REGS;r++) {
7759         if((will_dirty_i>>r)&1) {
7760           printf(" r%d",r);
7761         }
7762       }
7763       printf("\n");*/
7764
7765       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7766         regs[i].dirty|=will_dirty_i;
7767         #ifndef DESTRUCTIVE_WRITEBACK
7768         regs[i].dirty&=wont_dirty_i;
7769         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7770         {
7771           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7772             for(r=0;r<HOST_REGS;r++) {
7773               if(r!=EXCLUDE_REG) {
7774                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7775                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7776                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
7777               }
7778             }
7779           }
7780         }
7781         else
7782         {
7783           if(i<iend) {
7784             for(r=0;r<HOST_REGS;r++) {
7785               if(r!=EXCLUDE_REG) {
7786                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7787                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7788                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
7789               }
7790             }
7791           }
7792         }
7793         #endif
7794       //}
7795     }
7796     // Deal with changed mappings
7797     temp_will_dirty=will_dirty_i;
7798     temp_wont_dirty=wont_dirty_i;
7799     for(r=0;r<HOST_REGS;r++) {
7800       if(r!=EXCLUDE_REG) {
7801         int nr;
7802         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7803           if(wr) {
7804             #ifndef DESTRUCTIVE_WRITEBACK
7805             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7806             #endif
7807             regs[i].wasdirty|=will_dirty_i&(1<<r);
7808           }
7809         }
7810         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7811           // Register moved to a different register
7812           will_dirty_i&=~(1<<r);
7813           wont_dirty_i&=~(1<<r);
7814           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7815           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7816           if(wr) {
7817             #ifndef DESTRUCTIVE_WRITEBACK
7818             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7819             #endif
7820             regs[i].wasdirty|=will_dirty_i&(1<<r);
7821           }
7822         }
7823         else {
7824           will_dirty_i&=~(1<<r);
7825           wont_dirty_i&=~(1<<r);
7826           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7827             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7828             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7829           } else {
7830             wont_dirty_i|=1<<r;
7831             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);*/ /*assert(!((will_dirty>>r)&1));*/
7832           }
7833         }
7834       }
7835     }
7836   }
7837 }
7838
7839 #ifdef DISASM
7840   /* disassembly */
7841 void disassemble_inst(int i)
7842 {
7843     if (bt[i]) printf("*"); else printf(" ");
7844     switch(itype[i]) {
7845       case UJUMP:
7846         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7847       case CJUMP:
7848         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7849       case SJUMP:
7850         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7851       case FJUMP:
7852         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7853       case RJUMP:
7854         if (opcode[i]==0x9&&rt1[i]!=31)
7855           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
7856         else
7857           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7858         break;
7859       case SPAN:
7860         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7861       case IMM16:
7862         if(opcode[i]==0xf) //LUI
7863           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7864         else
7865           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7866         break;
7867       case LOAD:
7868       case LOADLR:
7869         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7870         break;
7871       case STORE:
7872       case STORELR:
7873         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7874         break;
7875       case ALU:
7876       case SHIFT:
7877         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7878         break;
7879       case MULTDIV:
7880         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7881         break;
7882       case SHIFTIMM:
7883         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7884         break;
7885       case MOV:
7886         if((opcode2[i]&0x1d)==0x10)
7887           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7888         else if((opcode2[i]&0x1d)==0x11)
7889           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7890         else
7891           printf (" %x: %s\n",start+i*4,insn[i]);
7892         break;
7893       case COP0:
7894         if(opcode2[i]==0)
7895           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7896         else if(opcode2[i]==4)
7897           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7898         else printf (" %x: %s\n",start+i*4,insn[i]);
7899         break;
7900       case COP1:
7901         if(opcode2[i]<3)
7902           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7903         else if(opcode2[i]>3)
7904           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7905         else printf (" %x: %s\n",start+i*4,insn[i]);
7906         break;
7907       case COP2:
7908         if(opcode2[i]<3)
7909           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
7910         else if(opcode2[i]>3)
7911           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
7912         else printf (" %x: %s\n",start+i*4,insn[i]);
7913         break;
7914       case C1LS:
7915         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7916         break;
7917       case C2LS:
7918         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7919         break;
7920       case INTCALL:
7921         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
7922         break;
7923       default:
7924         //printf (" %s %8x\n",insn[i],source[i]);
7925         printf (" %x: %s\n",start+i*4,insn[i]);
7926     }
7927 }
7928 #else
7929 static void disassemble_inst(int i) {}
7930 #endif // DISASM
7931
7932 #define DRC_TEST_VAL 0x74657374
7933
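// Emit a tiny stub that loads DRC_TEST_VAL (ASCII "test") into host
// register 0 and returns through register 14 (the link register on ARM),
// then call it to verify that code in the translation cache actually runs.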
7934 static int new_dynarec_test(void)
7935 {
7936   #if defined(VITA)
7937     sceKernelOpenVMDomain();
7938   #endif
7939   int (*testfunc)(void) = (void *)out;
7940   int ret;
7941   emit_movimm(DRC_TEST_VAL,0); // test
7942   emit_jmpreg(14);
7943   literal_pool(0);
7944 #ifdef __arm__
7945 #if defined(VITA)
7946   sceKernelCloseVMDomain();
7947 #endif
7948   __clear_cache((void *)testfunc, out);
7949 #endif
7950   SysPrintf("testing if we can run recompiled code..\n");
7951   ret = testfunc();
7952   if (ret == DRC_TEST_VAL)
7953     SysPrintf("test passed.\n");
7954   else
7955     SysPrintf("test failed: %08x\n", ret);
7956   out=(u_char *)BASE_ADDR;
7957   return ret == DRC_TEST_VAL;
7958 }
7959
7960 // clear the state completely, instead of just marking
7961 // things invalid like invalidate_all_pages() does
7962 void new_dynarec_clear_full()
7963 {
7964   int n;
7965   out=(u_char *)BASE_ADDR;
7966   memset(invalid_code,1,sizeof(invalid_code));
7967   memset(hash_table,0xff,sizeof(hash_table));
7968   memset(mini_ht,-1,sizeof(mini_ht));
7969   memset(restore_candidate,0,sizeof(restore_candidate));
7970   memset(shadow,0,sizeof(shadow));
7971   copy=shadow;
7972   expirep=16384; // Expiry pointer, +2 blocks
7973   pending_exception=0;
7974   literalcount=0;
7975   stop_after_jal=0;
7976   inv_code_start=inv_code_end=~0;
7977   // TLB
7978 #ifndef DISABLE_TLB
7979   using_tlb=0;
7980   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
7981     memory_map[n]=-1;
7982   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
7983     memory_map[n]=((u_int)rdram-0x80000000)>>2;
7984   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
7985     memory_map[n]=-1;
7986 #endif
7987   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7988   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7989   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7990 }
7991
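// One-time startup: map or mprotect the translation cache as executable,
// reset all recompiler state, set up the Mupen64-only memory handler
// tables, and run the self-test stub.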
7992 void new_dynarec_init()
7993 {
7994   SysPrintf("Init new dynarec\n");
7995
7996 #if defined(VITA)
7997   out=(u_char *)mmap(translation_cache, 1<<TARGET_SIZE_2,
7998             0,
7999             0,
8000             -1, 0);
8001   if (out<=0) {
8002     SysPrintf("mmap() failed: %s\n", strerror(errno));
8003   }
8004   out=(u_char *)BASE_ADDR;
8005 #else
8006   out=(u_char *)BASE_ADDR;
8007   #if BASE_ADDR_FIXED
8008     if (mmap (out, 1<<TARGET_SIZE_2,
8009               PROT_READ | PROT_WRITE | PROT_EXEC,
8010               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
8011               -1, 0) <= 0) {
8012       SysPrintf("mmap() failed: %s\n", strerror(errno));
8013     }
8014   #else
8015     // not all systems allow execute in data segment by default
8016     if (mprotect(out, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
8017       SysPrintf("mprotect() failed: %s\n", strerror(errno));
8018   #endif
8019 #endif
8020 #ifdef MUPEN64
8021   rdword=&readmem_dword;
8022   fake_pc.f.r.rs=&readmem_dword;
8023   fake_pc.f.r.rt=&readmem_dword;
8024   fake_pc.f.r.rd=&readmem_dword;
8025 #endif
8026   int n;
8027   cycle_multiplier=200;
8028   new_dynarec_clear_full();
8029 #ifdef HOST_IMM8
8030   // Copy this into local area so we don't have to put it in every literal pool
8031   invc_ptr=invalid_code;
8032 #endif
8033 #ifdef MUPEN64
8034   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
8035     writemem[n] = write_nomem_new;
8036     writememb[n] = write_nomemb_new;
8037     writememh[n] = write_nomemh_new;
8038 #ifndef FORCE32
8039     writememd[n] = write_nomemd_new;
8040 #endif
8041     readmem[n] = read_nomem_new;
8042     readmemb[n] = read_nomemb_new;
8043     readmemh[n] = read_nomemh_new;
8044 #ifndef FORCE32
8045     readmemd[n] = read_nomemd_new;
8046 #endif
8047   }
8048   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
8049     writemem[n] = write_rdram_new;
8050     writememb[n] = write_rdramb_new;
8051     writememh[n] = write_rdramh_new;
8052 #ifndef FORCE32
8053     writememd[n] = write_rdramd_new;
8054 #endif
8055   }
8056   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
8057     writemem[n] = write_nomem_new;
8058     writememb[n] = write_nomemb_new;
8059     writememh[n] = write_nomemh_new;
8060 #ifndef FORCE32
8061     writememd[n] = write_nomemd_new;
8062 #endif
8063     readmem[n] = read_nomem_new;
8064     readmemb[n] = read_nomemb_new;
8065     readmemh[n] = read_nomemh_new;
8066 #ifndef FORCE32
8067     readmemd[n] = read_nomemd_new;
8068 #endif
8069   }
8070 #endif
8071   tlb_hacks();
8072   arch_init();
8073   new_dynarec_test();
8074 #ifndef RAM_FIXED
8075   ram_offset=(u_int)rdram-0x80000000;
8076 #endif
8077   if (ram_offset!=0)
8078     SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
8079 }
8080
8081 void new_dynarec_cleanup()
8082 {
8083   int n;
8084   #if BASE_ADDR_FIXED || defined(VITA)
8085   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {SysPrintf("munmap() failed\n");}
8086   #endif
8087   for(n=0;n<4096;n++) ll_clear(jump_in+n);
8088   for(n=0;n<4096;n++) ll_clear(jump_out+n);
8089   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
8090   #ifdef ROM_COPY
8091   if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
8092   #endif
8093 }
8094
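     // Map a PSX code address to a host pointer into the memory backing it:
     // RAM through its KUSEG/KSEG0/KSEG1 mirrors, or the BIOS ROM (psxR)
     // when HLE is disabled.  *limit receives the first address past that
     // region so callers know how far they may read; unhandled addresses
     // return NULL.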
8095 static u_int *get_source_start(u_int addr, u_int *limit)
8096 {
8097   if (addr < 0x00200000 ||
8098     (0xa0000000 <= addr && addr < 0xa0200000)) {
8099     // used for BIOS calls mostly?
8100     *limit = (addr&0xa0000000)|0x00200000;
8101     return (u_int *)((u_int)rdram + (addr&0x1fffff));
8102   }
8103   else if (!Config.HLE && (
8104     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
8105     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
8106     // BIOS
8107     *limit = (addr & 0xfff00000) | 0x80000;
8108     return (u_int *)((u_int)psxR + (addr&0x7ffff));
8109   }
8110   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
8111     *limit = (addr & 0x80600000) + 0x00200000;
8112     return (u_int *)((u_int)rdram + (addr&0x1fffff));
8113   }
       return NULL;
8114 }
8115
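     // Rough block-length estimate used when saving block info: scan ahead
     // (at most 0x1000 bytes, clamped to the region limit) for a "jr $ra"
     // (0x03e00008) and return the address just past its delay slot, or how
     // far the scan got if none is found.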
8116 static u_int scan_for_ret(u_int addr)
8117 {
8118   u_int limit = 0;
8119   u_int *mem;
8120
8121   mem = get_source_start(addr, &limit);
8122   if (mem == NULL)
8123     return addr;
8124
8125   if (limit > addr + 0x1000)
8126     limit = addr + 0x1000;
8127   for (; addr < limit; addr += 4, mem++) {
8128     if (*mem == 0x03e00008) // jr $ra
8129       return addr + 8;
8130   }
       return addr;
8131 }
8132
8133 struct savestate_block {
8134   uint32_t addr;
8135   uint32_t regflags;
8136 };
8137
8138 static int addr_cmp(const void *p1_, const void *p2_)
8139 {
8140   const struct savestate_block *p1 = p1_, *p2 = p2_;
8141   return p1->addr - p2->addr;
8142 }
8143
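     // Record the entry points of currently compiled blocks (address plus
     // the speculative register flags captured at compile time) into a
     // savestate buffer.  Each jump_in bucket is sorted by address and
     // entries falling inside an already recorded block are skipped.
     // Returns the number of bytes written.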
8144 int new_dynarec_save_blocks(void *save, int size)
8145 {
8146   struct savestate_block *blocks = save;
8147   int maxcount = size / sizeof(blocks[0]);
8148   struct savestate_block tmp_blocks[1024];
8149   struct ll_entry *head;
8150   int p, s, d, o, bcnt;
8151   u_int addr;
8152
8153   o = 0;
8154   for (p = 0; p < sizeof(jump_in) / sizeof(jump_in[0]); p++) {
8155     bcnt = 0;
8156     for (head = jump_in[p]; head != NULL; head = head->next) {
8157       tmp_blocks[bcnt].addr = head->vaddr;
8158       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
8159       bcnt++;
8160     }
8161     if (bcnt < 1)
8162       continue;
8163     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
8164
8165     addr = tmp_blocks[0].addr;
8166     for (s = d = 0; s < bcnt; s++) {
8167       if (tmp_blocks[s].addr < addr)
8168         continue;
8169       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
8170         tmp_blocks[d++] = tmp_blocks[s];
8171       addr = scan_for_ret(tmp_blocks[s].addr);
8172     }
8173
8174     if (o + d > maxcount)
8175       d = maxcount - o;
8176     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
8177     o += d;
8178   }
8179
8180   return o * sizeof(blocks[0]);
8181 }
8182
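     // Recompile the blocks recorded by new_dynarec_save_blocks after a
     // savestate load.  GPRs are temporarily forced to RAM-like (0x80000000)
     // or scratchpad-like (0x1f800000) values so per-register speculation
     // during compilation roughly matches the original run, then the real
     // register values are restored.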
8183 void new_dynarec_load_blocks(const void *save, int size)
8184 {
8185   const struct savestate_block *blocks = save;
8186   int count = size / sizeof(blocks[0]);
8187   u_int regs_save[32];
8188   uint32_t f;
8189   int i, b;
8190
8191   get_addr(psxRegs.pc);
8192
8193   // temporarily change GPRs so compile-time speculation at least partially works
8194   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
8195   for (i = 1; i < 32; i++)
8196     psxRegs.GPR.r[i] = 0x80000000;
8197
8198   for (b = 0; b < count; b++) {
8199     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
8200       if (f & 1)
8201         psxRegs.GPR.r[i] = 0x1f800000;
8202     }
8203
8204     get_addr(blocks[b].addr);
8205
8206     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
8207       if (f & 1)
8208         psxRegs.GPR.r[i] = 0x80000000;
8209     }
8210   }
8211
8212   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
8213 }
8214
8215 int new_recompile_block(int addr)
8216 {
8217   u_int pagelimit = 0;
8218   u_int state_rflags = 0;
8219   int i;
8220
8221   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
8222   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
8223   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
8224   //if(debug)
8225   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
8226   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
8227   /*if(Count>=312978186) {
8228     rlist();
8229   }*/
8230   //rlist();
8231
8232   // this is just for speculation
8233   for (i = 1; i < 32; i++) {
8234     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
8235       state_rflags |= 1 << i;
8236   }
8237
8238   start = (u_int)addr&~3;
8239   //assert(((u_int)addr&1)==0);
8240   new_dynarec_did_compile=1;
8241 #if defined(VITA)
8242   sceKernelOpenVMDomain();
8243 #endif
8244   if (Config.HLE && start == 0x80001000) // hlecall
8245   {
8246     // XXX: is this enough? Maybe check hleSoftCall?
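         // Emit only a small stub: record the target PC in pcaddr and return
         // to the dispatcher through new_dyna_leave, presumably so the HLE
         // BIOS call is handled on the C side rather than in recompiled code.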
8247     u_int beginning=(u_int)out;
8248     u_int page=get_page(start);
8249     invalid_code[start>>12]=0;
8250     emit_movimm(start,0);
8251     emit_writeword(0,(int)&pcaddr);
8252     emit_jmp((int)new_dyna_leave);
8253     literal_pool(0);
8254 #ifdef __arm__
8255   #if defined(VITA)
8256     sceKernelCloseVMDomain();
8257   #endif
8258     __clear_cache((void *)beginning,out);
8259 #endif
8260     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
8261     return 0;
8262   }
8263
8264   source = get_source_start(start, &pagelimit);
8265   if (source == NULL) {
8266     SysPrintf("Compile at bogus memory address: %08x\n", addr);
8267     exit(1);
8268   }
8269
8270   /* Pass 1: disassemble */
8271   /* Pass 2: register dependencies, branch targets */
8272   /* Pass 3: register allocation */
8273   /* Pass 4: branch dependencies */
8274   /* Pass 5: pre-alloc */
8275   /* Pass 6: optimize clean/dirty state */
8276   /* Pass 7: flag 32-bit registers */
8277   /* Pass 8: assembly */
8278   /* Pass 9: linker */
8279   /* Pass 10: garbage collection / free memory */
8280
8281   int j;
8282   int done=0;
8283   unsigned int type,op,op2;
8284
8285   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
8286
8287   /* Pass 1 disassembly */
8288
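       // Decode one MIPS word per iteration: the primary opcode is bits
       // 31:26, op2 holds the secondary field (funct for SPECIAL, rt for
       // REGIMM, rs for coprocessor ops), and every instruction is given an
       // itype classification that the later passes key off.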
8289   for(i=0;!done;i++) {
8290     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
8291     minimum_free_regs[i]=0;
8292     opcode[i]=op=source[i]>>26;
8293     switch(op)
8294     {
8295       case 0x00: strcpy(insn[i],"special"); type=NI;
8296         op2=source[i]&0x3f;
8297         switch(op2)
8298         {
8299           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
8300           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
8301           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
8302           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
8303           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
8304           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
8305           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
8306           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
8307           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
8308           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
8309           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
8310           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
8311           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
8312           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
8313           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
8314           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
8315           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
8316           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
8317           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
8318           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
8319           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
8320           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
8321           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
8322           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
8323           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
8324           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
8325           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
8326           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
8327           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
8328           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
8329           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
8330           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
8331           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
8332           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
8333           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
8334 #ifndef FORCE32
8335           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
8336           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
8337           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
8338           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
8339           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
8340           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
8341           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
8342           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
8343           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
8344           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
8345           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
8346           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
8347           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
8348           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
8349           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
8350           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
8351           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
8352 #endif
8353         }
8354         break;
8355       case 0x01: strcpy(insn[i],"regimm"); type=NI;
8356         op2=(source[i]>>16)&0x1f;
8357         switch(op2)
8358         {
8359           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
8360           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
8361           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
8362           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
8363           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
8364           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
8365           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
8366           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
8367           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
8368           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
8369           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
8370           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
8371           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
8372           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
8373         }
8374         break;
8375       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
8376       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
8377       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
8378       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
8379       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
8380       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
8381       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
8382       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
8383       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
8384       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
8385       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
8386       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
8387       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
8388       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
8389       case 0x10: strcpy(insn[i],"cop0"); type=NI;
8390         op2=(source[i]>>21)&0x1f;
8391         switch(op2)
8392         {
8393           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
8394           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
8395           case 0x10: strcpy(insn[i],"tlb"); type=NI;
8396           switch(source[i]&0x3f)
8397           {
8398             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
8399             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
8400             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
8401             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
8402 #ifdef PCSX
8403             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
8404 #else
8405             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
8406 #endif
8407           }
8408         }
8409         break;
8410       case 0x11: strcpy(insn[i],"cop1"); type=NI;
8411         op2=(source[i]>>21)&0x1f;
8412         switch(op2)
8413         {
8414           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
8415           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
8416           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
8417           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
8418           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
8419           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
8420           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
8421           switch((source[i]>>16)&0x3)
8422           {
8423             case 0x00: strcpy(insn[i],"BC1F"); break;
8424             case 0x01: strcpy(insn[i],"BC1T"); break;
8425             case 0x02: strcpy(insn[i],"BC1FL"); break;
8426             case 0x03: strcpy(insn[i],"BC1TL"); break;
8427           }
8428           break;
8429           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
8430           switch(source[i]&0x3f)
8431           {
8432             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
8433             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
8434             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
8435             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
8436             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
8437             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
8438             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
8439             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
8440             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
8441             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
8442             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
8443             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
8444             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
8445             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
8446             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
8447             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
8448             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
8449             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
8450             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
8451             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
8452             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
8453             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
8454             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
8455             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
8456             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
8457             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
8458             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
8459             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
8460             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
8461             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
8462             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
8463             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
8464             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
8465             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
8466             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
8467           }
8468           break;
8469           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
8470           switch(source[i]&0x3f)
8471           {
8472             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
8473             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
8474             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
8475             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
8476             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
8477             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
8478             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
8479             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
8480             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
8481             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
8482             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
8483             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
8484             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
8485             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
8486             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
8487             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
8488             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
8489             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
8490             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
8491             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
8492             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
8493             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
8494             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
8495             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
8496             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
8497             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
8498             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
8499             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
8500             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
8501             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
8502             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
8503             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
8504             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
8505             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
8506             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
8507           }
8508           break;
8509           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
8510           switch(source[i]&0x3f)
8511           {
8512             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
8513             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
8514           }
8515           break;
8516           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
8517           switch(source[i]&0x3f)
8518           {
8519             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
8520             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
8521           }
8522           break;
8523         }
8524         break;
8525 #ifndef FORCE32
8526       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
8527       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
8528       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
8529       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
8530       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
8531       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
8532       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
8533       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
8534 #endif
8535       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
8536       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
8537       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
8538       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
8539       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
8540       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
8541       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
8542 #ifndef FORCE32
8543       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
8544 #endif
8545       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
8546       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
8547       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
8548       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
8549 #ifndef FORCE32
8550       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
8551       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
8552 #endif
8553       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
8554       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
8555       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
8556       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
8557 #ifndef FORCE32
8558       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
8559       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
8560       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
8561 #endif
8562       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
8563       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
8564 #ifndef FORCE32
8565       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
8566       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
8567       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
8568 #endif
8569 #ifdef PCSX
8570       case 0x12: strcpy(insn[i],"COP2"); type=NI;
8571         op2=(source[i]>>21)&0x1f;
8572         //if (op2 & 0x10) {
8573         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
8574           if (gte_handlers[source[i]&0x3f]!=NULL) {
8575             if (gte_regnames[source[i]&0x3f]!=NULL)
8576               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
8577             else
8578               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
8579             type=C2OP;
8580           }
8581         }
8582         else switch(op2)
8583         {
8584           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
8585           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
8586           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
8587           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
8588         }
8589         break;
8590       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
8591       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
8592       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
8593 #endif
8594       default: strcpy(insn[i],"???"); type=NI;
8595         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
8596         break;
8597     }
8598     itype[i]=type;
8599     opcode2[i]=op2;
8600     /* Get registers/immediates */
8601     lt1[i]=0;
8602     us1[i]=0;
8603     us2[i]=0;
8604     dep1[i]=0;
8605     dep2[i]=0;
8606     gte_rs[i]=gte_rt[i]=0;
8607     switch(type) {
8608       case LOAD:
8609         rs1[i]=(source[i]>>21)&0x1f;
8610         rs2[i]=0;
8611         rt1[i]=(source[i]>>16)&0x1f;
8612         rt2[i]=0;
8613         imm[i]=(short)source[i];
8614         break;
8615       case STORE:
8616       case STORELR:
8617         rs1[i]=(source[i]>>21)&0x1f;
8618         rs2[i]=(source[i]>>16)&0x1f;
8619         rt1[i]=0;
8620         rt2[i]=0;
8621         imm[i]=(short)source[i];
8622         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
8623         break;
8624       case LOADLR:
8625         // LWL/LWR only load part of the register,
8626         // therefore the target register must be treated as a source too
8627         rs1[i]=(source[i]>>21)&0x1f;
8628         rs2[i]=(source[i]>>16)&0x1f;
8629         rt1[i]=(source[i]>>16)&0x1f;
8630         rt2[i]=0;
8631         imm[i]=(short)source[i];
8632         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
8633         if(op==0x26) dep1[i]=rt1[i]; // LWR
8634         break;
8635       case IMM16:
8636         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
8637         else rs1[i]=(source[i]>>21)&0x1f;
8638         rs2[i]=0;
8639         rt1[i]=(source[i]>>16)&0x1f;
8640         rt2[i]=0;
8641         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
8642           imm[i]=(unsigned short)source[i];
8643         }else{
8644           imm[i]=(short)source[i];
8645         }
8646         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
8647         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
8648         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
8649         break;
8650       case UJUMP:
8651         rs1[i]=0;
8652         rs2[i]=0;
8653         rt1[i]=0;
8654         rt2[i]=0;
8655         // The JAL instruction writes to r31.
8656         if (op&1) {
8657           rt1[i]=31;
8658         }
8659         rs2[i]=CCREG;
8660         break;
8661       case RJUMP:
8662         rs1[i]=(source[i]>>21)&0x1f;
8663         rs2[i]=0;
8664         rt1[i]=0;
8665         rt2[i]=0;
8666         // The JALR instruction writes to rd.
8667         if (op2&1) {
8668           rt1[i]=(source[i]>>11)&0x1f;
8669         }
8670         rs2[i]=CCREG;
8671         break;
8672       case CJUMP:
8673         rs1[i]=(source[i]>>21)&0x1f;
8674         rs2[i]=(source[i]>>16)&0x1f;
8675         rt1[i]=0;
8676         rt2[i]=0;
8677         if(op&2) { // BGTZ/BLEZ
8678           rs2[i]=0;
8679         }
8680         us1[i]=rs1[i];
8681         us2[i]=rs2[i];
8682         likely[i]=op>>4;
8683         break;
8684       case SJUMP:
8685         rs1[i]=(source[i]>>21)&0x1f;
8686         rs2[i]=CCREG;
8687         rt1[i]=0;
8688         rt2[i]=0;
8689         us1[i]=rs1[i];
8690         if(op2&0x10) { // BxxAL
8691           rt1[i]=31;
8692           // NOTE: If the branch is not taken, r31 is still overwritten
8693         }
8694         likely[i]=(op2&2)>>1;
8695         break;
8696       case FJUMP:
8697         rs1[i]=FSREG;
8698         rs2[i]=CSREG;
8699         rt1[i]=0;
8700         rt2[i]=0;
8701         likely[i]=((source[i])>>17)&1;
8702         break;
8703       case ALU:
8704         rs1[i]=(source[i]>>21)&0x1f; // source
8705         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
8706         rt1[i]=(source[i]>>11)&0x1f; // destination
8707         rt2[i]=0;
8708         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
8709           us1[i]=rs1[i];us2[i]=rs2[i];
8710         }
8711         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
8712           dep1[i]=rs1[i];dep2[i]=rs2[i];
8713         }
8714         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
8715           dep1[i]=rs1[i];dep2[i]=rs2[i];
8716         }
8717         break;
8718       case MULTDIV:
8719         rs1[i]=(source[i]>>21)&0x1f; // source
8720         rs2[i]=(source[i]>>16)&0x1f; // divisor
8721         rt1[i]=HIREG;
8722         rt2[i]=LOREG;
8723         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
8724           us1[i]=rs1[i];us2[i]=rs2[i];
8725         }
8726         break;
8727       case MOV:
8728         rs1[i]=0;
8729         rs2[i]=0;
8730         rt1[i]=0;
8731         rt2[i]=0;
8732         if(op2==0x10) rs1[i]=HIREG; // MFHI
8733         if(op2==0x11) rt1[i]=HIREG; // MTHI
8734         if(op2==0x12) rs1[i]=LOREG; // MFLO
8735         if(op2==0x13) rt1[i]=LOREG; // MTLO
8736         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
8737         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
8738         dep1[i]=rs1[i];
8739         break;
8740       case SHIFT:
8741         rs1[i]=(source[i]>>16)&0x1f; // target of shift
8742         rs2[i]=(source[i]>>21)&0x1f; // shift amount
8743         rt1[i]=(source[i]>>11)&0x1f; // destination
8744         rt2[i]=0;
8745         // DSLLV/DSRLV/DSRAV are 64-bit
8746         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
8747         break;
8748       case SHIFTIMM:
8749         rs1[i]=(source[i]>>16)&0x1f;
8750         rs2[i]=0;
8751         rt1[i]=(source[i]>>11)&0x1f;
8752         rt2[i]=0;
8753         imm[i]=(source[i]>>6)&0x1f;
8754         // DSxx32 instructions
8755         if(op2>=0x3c) imm[i]|=0x20;
8756         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
8757         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
8758         break;
8759       case COP0:
8760         rs1[i]=0;
8761         rs2[i]=0;
8762         rt1[i]=0;
8763         rt2[i]=0;
8764         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
8765         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
8766         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
8767         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
8768         break;
8769       case COP1:
8770         rs1[i]=0;
8771         rs2[i]=0;
8772         rt1[i]=0;
8773         rt2[i]=0;
8774         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
8775         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
8776         if(op2==5) us1[i]=rs1[i]; // DMTC1
8777         rs2[i]=CSREG;
8778         break;
8779       case COP2:
8780         rs1[i]=0;
8781         rs2[i]=0;
8782         rt1[i]=0;
8783         rt2[i]=0;
8784         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
8785         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
8786         rs2[i]=CSREG;
8787         int gr=(source[i]>>11)&0x1F;
8788         switch(op2)
8789         {
8790           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
8791           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
8792           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
8793           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
8794         }
8795         break;
8796       case C1LS:
8797         rs1[i]=(source[i]>>21)&0x1F;
8798         rs2[i]=CSREG;
8799         rt1[i]=0;
8800         rt2[i]=0;
8801         imm[i]=(short)source[i];
8802         break;
8803       case C2LS:
8804         rs1[i]=(source[i]>>21)&0x1F;
8805         rs2[i]=0;
8806         rt1[i]=0;
8807         rt2[i]=0;
8808         imm[i]=(short)source[i];
8809         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
8810         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
8811         break;
8812       case C2OP:
8813         rs1[i]=0;
8814         rs2[i]=0;
8815         rt1[i]=0;
8816         rt2[i]=0;
8817         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
8818         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
8819         gte_rt[i]|=1ll<<63; // every op changes flags
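             // MVMVA only reads the vector selected by bits 16:15: V0/V1/V2
             // occupy GTE data regs 0-5 (two regs each) and IR1-IR3 (v==3)
             // are regs 9-11, so narrow the generic read mask accordingly.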
8820         if((source[i]&0x3f)==GTE_MVMVA) {
8821           int v = (source[i] >> 15) & 3;
8822           gte_rs[i]&=~0xe3fll;
8823           if(v==3) gte_rs[i]|=0xe00ll;
8824           else gte_rs[i]|=3ll<<(v*2);
8825         }
8826         break;
8827       case FLOAT:
8828       case FCONV:
8829         rs1[i]=0;
8830         rs2[i]=CSREG;
8831         rt1[i]=0;
8832         rt2[i]=0;
8833         break;
8834       case FCOMP:
8835         rs1[i]=FSREG;
8836         rs2[i]=CSREG;
8837         rt1[i]=FSREG;
8838         rt2[i]=0;
8839         break;
8840       case SYSCALL:
8841       case HLECALL:
8842       case INTCALL:
8843         rs1[i]=CCREG;
8844         rs2[i]=0;
8845         rt1[i]=0;
8846         rt2[i]=0;
8847         break;
8848       default:
8849         rs1[i]=0;
8850         rs2[i]=0;
8851         rt1[i]=0;
8852         rt2[i]=0;
8853     }
8854     /* Calculate branch target addresses */
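         // J/JAL:  target = (address of delay slot & 0xF0000000) | (index<<2);
         //         (source<<6)>>4 keeps the low 26 bits and shifts them left 2.
         // Bxx:    target = PC+4 + sign_extend(imm16)*4, computed in one step
         //         by ((signed)(source<<16))>>14.  For example, a BEQ at
         //         0x80001000 with imm16 = 0xfffe gives 0x80001004 + (-2<<2)
         //         = 0x80000ffc.  Non-branch instructions get ba[i] = -1.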
8855     if(type==UJUMP)
8856       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8857     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8858       ba[i]=start+i*4+8; // Ignore never taken branch
8859     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8860       ba[i]=start+i*4+8; // Ignore never taken branch
8861     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8862       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8863     else ba[i]=-1;
8864 #ifdef PCSX
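         // PSX-only hazard checks on the instruction after a branch: another
         // branch sitting in the delay slot, or code that appears to depend
         // on MIPS-I load-delay behaviour at the branch target (plus a v0
         // overwrite heuristic), can't be recompiled safely.  In those cases
         // the preceding branch is rewritten as INTCALL so execution falls
         // back to the interpreter, and the block ends there.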
8865     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
8866       int do_in_intrp=0;
8867       // branch in delay slot?
8868       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
8869         // don't compile the first branch; fall back to the interpreter when it's hit
8870         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
8871         do_in_intrp=1;
8872       }
8873       // basic load delay detection
8874       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
8875         int t=(ba[i-1]-start)/4;
8876         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
8877           // jump target wants DS result - potential load delay effect
8878           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
8879           do_in_intrp=1;
8880           bt[t+1]=1; // expected return from interpreter
8881         }
8882         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
8883               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
8884           // v0 overwrite like this is a sign of trouble, bail out
8885           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
8886           do_in_intrp=1;
8887         }
8888       }
8889       if(do_in_intrp) {
8890         rs1[i-1]=CCREG;
8891         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
8892         ba[i-1]=-1;
8893         itype[i-1]=INTCALL;
8894         done=2;
8895         i--; // don't compile the DS
8896       }
8897     }
8898 #endif
8899     /* Is this the end of the block? */
8900     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8901       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
8902         done=2;
8903       }
8904       else {
8905         if(stop_after_jal) done=1;
8906         // Stop on BREAK
8907         if((source[i+1]&0xfc00003f)==0x0d) done=1;
8908       }
8909       // Don't recompile stuff that's already compiled
8910       if(check_addr(start+i*4+4)) done=1;
8911       // Don't get too close to the limit
8912       if(i>MAXBLOCK/2) done=1;
8913     }
8914     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
8915     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
8916     if(done==2) {
8917       // Does the block continue due to a branch?
8918       for(j=i-1;j>=0;j--)
8919       {
8920         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
8921         if(ba[j]==start+i*4+4) done=j=0;
8922         if(ba[j]==start+i*4+8) done=j=0;
8923       }
8924     }
8925     //assert(i<MAXBLOCK-1);
8926     if(start+i*4==pagelimit-4) done=1;
8927     assert(start+i*4<pagelimit);
8928     if (i==MAXBLOCK-1) done=1;
8929     // Stop if we're compiling junk
8930     if(itype[i]==NI&&opcode[i]==0x11) {
8931       done=stop_after_jal=1;
8932       SysPrintf("Disabled speculative precompilation\n");
8933     }
8934   }
8935   slen=i;
8936   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8937     if(start+i*4==pagelimit) {
8938       itype[i-1]=SPAN;
8939     }
8940   }
8941   assert(slen>0);
8942
8943   /* Pass 2 - Register dependencies and branch targets */
8944
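       // Fills unneeded_reg[] / unneeded_reg_upper[]: per-instruction masks
       // of guest registers whose values are dead from that point on, which
       // the allocator below uses to avoid loading or writing them back.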
8945   unneeded_registers(0,slen-1,0);
8946
8947   /* Pass 3 - Register allocation */
8948
8949   struct regstat current; // Current register allocations/status
8950   current.is32=1;
8951   current.dirty=0;
8952   current.u=unneeded_reg[0];
8953   current.uu=unneeded_reg_upper[0];
8954   clear_all_regs(current.regmap);
8955   alloc_reg(&current,0,CCREG);
8956   dirty_reg(&current,CCREG);
8957   current.isconst=0;
8958   current.wasconst=0;
8959   current.waswritten=0;
8960   int ds=0;
8961   int cc=0;
8962   int hr=-1;
8963
8964 #ifndef FORCE32
8965   provisional_32bit();
8966 #endif
8967   if((u_int)addr&1) {
8968     // First instruction is delay slot
8969     cc=-1;
8970     bt[1]=1;
8971     ds=1;
8972     unneeded_reg[0]=1;
8973     unneeded_reg_upper[0]=1;
8974     current.regmap[HOST_BTREG]=BTREG;
8975   }
8976
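       // Walk the block once while simulating the register file: 'current'
       // tracks which guest register each host register holds plus dirty,
       // constant and 32/64-bit state, and per-instruction snapshots are
       // stored in regmap_pre[]/regs[] for the later passes and the
       // assembler.  Branch targets (bt[i]) discard constant knowledge.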
8977   for(i=0;i<slen;i++)
8978   {
8979     if(bt[i])
8980     {
8981       int hr;
8982       for(hr=0;hr<HOST_REGS;hr++)
8983       {
8984         // Is this really necessary?
8985         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8986       }
8987       current.isconst=0;
8988       current.waswritten=0;
8989     }
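         // BNE/BNEL against $zero two slots back: on the fall-through path
         // the compared register equalled zero, so it can be treated as
         // 32-bit and any host register holding its upper half is released.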
8990     if(i>1)
8991     {
8992       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8993       {
8994         if(rs1[i-2]==0||rs2[i-2]==0)
8995         {
8996           if(rs1[i-2]) {
8997             current.is32|=1LL<<rs1[i-2];
8998             int hr=get_reg(current.regmap,rs1[i-2]|64);
8999             if(hr>=0) current.regmap[hr]=-1;
9000           }
9001           if(rs2[i-2]) {
9002             current.is32|=1LL<<rs2[i-2];
9003             int hr=get_reg(current.regmap,rs2[i-2]|64);
9004             if(hr>=0) current.regmap[hr]=-1;
9005           }
9006         }
9007       }
9008     }
9009 #ifndef FORCE32
9010     // If something jumps here with 64-bit values
9011     // then promote those registers to 64 bits
9012     if(bt[i])
9013     {
9014       uint64_t temp_is32=current.is32;
9015       for(j=i-1;j>=0;j--)
9016       {
9017         if(ba[j]==start+i*4)
9018           temp_is32&=branch_regs[j].is32;
9019       }
9020       for(j=i;j<slen;j++)
9021       {
9022         if(ba[j]==start+i*4)
9023           //temp_is32=1;
9024           temp_is32&=p32[j];
9025       }
9026       if(temp_is32!=current.is32) {
9027         //printf("dumping 32-bit regs (%x)\n",start+i*4);
9028         #ifndef DESTRUCTIVE_WRITEBACK
9029         if(ds)
9030         #endif
9031         for(hr=0;hr<HOST_REGS;hr++)
9032         {
9033           int r=current.regmap[hr];
9034           if(r>0&&r<64)
9035           {
9036             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
9037               temp_is32|=1LL<<r;
9038               //printf("restore %d\n",r);
9039             }
9040           }
9041         }
9042         current.is32=temp_is32;
9043       }
9044     }
9045 #else
9046     current.is32=-1LL;
9047 #endif
9048
9049     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
9050     regs[i].wasconst=current.isconst;
9051     regs[i].was32=current.is32;
9052     regs[i].wasdirty=current.dirty;
9053     regs[i].loadedconst=0;
9054     #if defined(DESTRUCTIVE_WRITEBACK) && !defined(FORCE32)
9055     // To change a dirty register from 32 to 64 bits, we must write
9056     // it out during the previous cycle (for branches, 2 cycles)
9057     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
9058     {
9059       uint64_t temp_is32=current.is32;
9060       for(j=i-1;j>=0;j--)
9061       {
9062         if(ba[j]==start+i*4+4)
9063           temp_is32&=branch_regs[j].is32;
9064       }
9065       for(j=i;j<slen;j++)
9066       {
9067         if(ba[j]==start+i*4+4)
9068           //temp_is32=1;
9069           temp_is32&=p32[j];
9070       }
9071       if(temp_is32!=current.is32) {
9072         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
9073         for(hr=0;hr<HOST_REGS;hr++)
9074         {
9075           int r=current.regmap[hr];
9076           if(r>0)
9077           {
9078             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
9079               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
9080               {
9081                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
9082                 {
9083                   //printf("dump %d/r%d\n",hr,r);
9084                   current.regmap[hr]=-1;
9085                   if(get_reg(current.regmap,r|64)>=0)
9086                     current.regmap[get_reg(current.regmap,r|64)]=-1;
9087                 }
9088               }
9089             }
9090           }
9091         }
9092       }
9093     }
9094     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
9095     {
9096       uint64_t temp_is32=current.is32;
9097       for(j=i-1;j>=0;j--)
9098       {
9099         if(ba[j]==start+i*4+8)
9100           temp_is32&=branch_regs[j].is32;
9101       }
9102       for(j=i;j<slen;j++)
9103       {
9104         if(ba[j]==start+i*4+8)
9105           //temp_is32=1;
9106           temp_is32&=p32[j];
9107       }
9108       if(temp_is32!=current.is32) {
9109         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
9110         for(hr=0;hr<HOST_REGS;hr++)
9111         {
9112           int r=current.regmap[hr];
9113           if(r>0)
9114           {
9115             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
9116               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
9117               {
9118                 //printf("dump %d/r%d\n",hr,r);
9119                 current.regmap[hr]=-1;
9120                 if(get_reg(current.regmap,r|64)>=0)
9121                   current.regmap[get_reg(current.regmap,r|64)]=-1;
9122               }
9123             }
9124           }
9125         }
9126       }
9127     }
9128     #endif
9129     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
9130       if(i+1<slen) {
9131         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9132         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9133         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9134         current.u|=1;
9135         current.uu|=1;
9136       } else {
9137         current.u=1;
9138         current.uu=1;
9139       }
9140     } else {
9141       if(i+1<slen) {
9142         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
9143         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
9144         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
9145         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
9146         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9147         current.u|=1;
9148         current.uu|=1;
9149       } else { SysPrintf("oops, branch at end of block with no delay slot\n");exit(1); }
9150     }
9151     is_ds[i]=ds;
9152     if(ds) {
9153       ds=0; // Skip delay slot, already allocated as part of branch
9154       // ...but we need to alloc it in case something jumps here
9155       if(i+1<slen) {
9156         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
9157         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
9158       }else{
9159         current.u=branch_unneeded_reg[i-1];
9160         current.uu=branch_unneeded_reg_upper[i-1];
9161       }
9162       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
9163       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9164       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9165       current.u|=1;
9166       current.uu|=1;
9167       struct regstat temp;
9168       memcpy(&temp,&current,sizeof(current));
9169       temp.wasdirty=temp.dirty;
9170       temp.was32=temp.is32;
9171       // TODO: Take into account unconditional branches, as below
9172       delayslot_alloc(&temp,i);
9173       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
9174       regs[i].wasdirty=temp.wasdirty;
9175       regs[i].was32=temp.was32;
9176       regs[i].dirty=temp.dirty;
9177       regs[i].is32=temp.is32;
9178       regs[i].isconst=0;
9179       regs[i].wasconst=0;
9180       current.isconst=0;
9181       // Create entry (branch target) regmap
9182       for(hr=0;hr<HOST_REGS;hr++)
9183       {
9184         int r=temp.regmap[hr];
9185         if(r>=0) {
9186           if(r!=regmap_pre[i][hr]) {
9187             regs[i].regmap_entry[hr]=-1;
9188           }
9189           else
9190           {
9191             if(r<64){
9192               if((current.u>>r)&1) {
9193                 regs[i].regmap_entry[hr]=-1;
9194                 regs[i].regmap[hr]=-1;
9195                 //Don't clear regs in the delay slot as the branch might need them
9196                 //current.regmap[hr]=-1;
9197               }else
9198                 regs[i].regmap_entry[hr]=r;
9199             }
9200             else {
9201               if((current.uu>>(r&63))&1) {
9202                 regs[i].regmap_entry[hr]=-1;
9203                 regs[i].regmap[hr]=-1;
9204                 //Don't clear regs in the delay slot as the branch might need them
9205                 //current.regmap[hr]=-1;
9206               }else
9207                 regs[i].regmap_entry[hr]=r;
9208             }
9209           }
9210         } else {
9211           // First instruction expects CCREG to be allocated
9212           if(i==0&&hr==HOST_CCREG)
9213             regs[i].regmap_entry[hr]=CCREG;
9214           else
9215             regs[i].regmap_entry[hr]=-1;
9216         }
9217       }
9218     }
9219     else { // Not delay slot
9220       switch(itype[i]) {
9221         case UJUMP:
9222           //current.isconst=0; // DEBUG
9223           //current.wasconst=0; // DEBUG
9224           //regs[i].wasconst=0; // DEBUG
9225           clear_const(&current,rt1[i]);
9226           alloc_cc(&current,i);
9227           dirty_reg(&current,CCREG);
9228           if (rt1[i]==31) {
9229             alloc_reg(&current,i,31);
9230             dirty_reg(&current,31);
9231             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
9232             //assert(rt1[i+1]!=rt1[i]);
9233             #ifdef REG_PREFETCH
9234             alloc_reg(&current,i,PTEMP);
9235             #endif
9236             //current.is32|=1LL<<rt1[i];
9237           }
9238           ooo[i]=1;
9239           delayslot_alloc(&current,i+1);
9240           //current.isconst=0; // DEBUG
9241           ds=1;
9242           //printf("i=%d, isconst=%x\n",i,current.isconst);
9243           break;
9244         case RJUMP:
9245           //current.isconst=0;
9246           //current.wasconst=0;
9247           //regs[i].wasconst=0;
9248           clear_const(&current,rs1[i]);
9249           clear_const(&current,rt1[i]);
9250           alloc_cc(&current,i);
9251           dirty_reg(&current,CCREG);
9252           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
9253             alloc_reg(&current,i,rs1[i]);
9254             if (rt1[i]!=0) {
9255               alloc_reg(&current,i,rt1[i]);
9256               dirty_reg(&current,rt1[i]);
9257               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
9258               assert(rt1[i+1]!=rt1[i]);
9259               #ifdef REG_PREFETCH
9260               alloc_reg(&current,i,PTEMP);
9261               #endif
9262             }
9263             #ifdef USE_MINI_HT
9264             if(rs1[i]==31) { // JALR
9265               alloc_reg(&current,i,RHASH);
9266               #ifndef HOST_IMM_ADDR32
9267               alloc_reg(&current,i,RHTBL);
9268               #endif
9269             }
9270             #endif
9271             delayslot_alloc(&current,i+1);
9272           } else {
9273             // The delay slot overwrites our source register,
9274             // allocate a temporary register to hold the old value.
9275             current.isconst=0;
9276             current.wasconst=0;
9277             regs[i].wasconst=0;
9278             delayslot_alloc(&current,i+1);
9279             current.isconst=0;
9280             alloc_reg(&current,i,RTEMP);
9281           }
9282           //current.isconst=0; // DEBUG
9283           ooo[i]=1;
9284           ds=1;
9285           break;
9286         case CJUMP:
9287           //current.isconst=0;
9288           //current.wasconst=0;
9289           //regs[i].wasconst=0;
9290           clear_const(&current,rs1[i]);
9291           clear_const(&current,rs2[i]);
9292           if((opcode[i]&0x3E)==4) // BEQ/BNE
9293           {
9294             alloc_cc(&current,i);
9295             dirty_reg(&current,CCREG);
9296             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9297             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
9298             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9299             {
9300               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9301               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
9302             }
9303             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
9304                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
9305               // The delay slot overwrites one of our conditions.
9306               // Allocate the branch condition registers instead.
9307               current.isconst=0;
9308               current.wasconst=0;
9309               regs[i].wasconst=0;
9310               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9311               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
9312               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9313               {
9314                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9315                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
9316               }
9317             }
9318             else
9319             {
9320               ooo[i]=1;
9321               delayslot_alloc(&current,i+1);
9322             }
9323           }
9324           else
9325           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
9326           {
9327             alloc_cc(&current,i);
9328             dirty_reg(&current,CCREG);
9329             alloc_reg(&current,i,rs1[i]);
9330             if(!(current.is32>>rs1[i]&1))
9331             {
9332               alloc_reg64(&current,i,rs1[i]);
9333             }
9334             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
9335               // The delay slot overwrites one of our conditions.
9336               // Allocate the branch condition registers instead.
9337               current.isconst=0;
9338               current.wasconst=0;
9339               regs[i].wasconst=0;
9340               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9341               if(!((current.is32>>rs1[i])&1))
9342               {
9343                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9344               }
9345             }
9346             else
9347             {
9348               ooo[i]=1;
9349               delayslot_alloc(&current,i+1);
9350             }
9351           }
9352           else
9353           // Don't alloc the delay slot yet because we might not execute it
9354           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
9355           {
9356             current.isconst=0;
9357             current.wasconst=0;
9358             regs[i].wasconst=0;
9359             alloc_cc(&current,i);
9360             dirty_reg(&current,CCREG);
9361             alloc_reg(&current,i,rs1[i]);
9362             alloc_reg(&current,i,rs2[i]);
9363             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9364             {
9365               alloc_reg64(&current,i,rs1[i]);
9366               alloc_reg64(&current,i,rs2[i]);
9367             }
9368           }
9369           else
9370           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
9371           {
9372             current.isconst=0;
9373             current.wasconst=0;
9374             regs[i].wasconst=0;
9375             alloc_cc(&current,i);
9376             dirty_reg(&current,CCREG);
9377             alloc_reg(&current,i,rs1[i]);
9378             if(!(current.is32>>rs1[i]&1))
9379             {
9380               alloc_reg64(&current,i,rs1[i]);
9381             }
9382           }
9383           ds=1;
9384           //current.isconst=0;
9385           break;
9386         case SJUMP:
9387           //current.isconst=0;
9388           //current.wasconst=0;
9389           //regs[i].wasconst=0;
9390           clear_const(&current,rs1[i]);
9391           clear_const(&current,rt1[i]);
9392           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
9393           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
9394           {
9395             alloc_cc(&current,i);
9396             dirty_reg(&current,CCREG);
9397             alloc_reg(&current,i,rs1[i]);
9398             if(!(current.is32>>rs1[i]&1))
9399             {
9400               alloc_reg64(&current,i,rs1[i]);
9401             }
9402             if (rt1[i]==31) { // BLTZAL/BGEZAL
9403               alloc_reg(&current,i,31);
9404               dirty_reg(&current,31);
9405               //#ifdef REG_PREFETCH
9406               //alloc_reg(&current,i,PTEMP);
9407               //#endif
9408               //current.is32|=1LL<<rt1[i];
9409             }
9410             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
9411                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
9412               // Allocate the branch condition registers instead.
9413               current.isconst=0;
9414               current.wasconst=0;
9415               regs[i].wasconst=0;
9416               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9417               if(!((current.is32>>rs1[i])&1))
9418               {
9419                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9420               }
9421             }
9422             else
9423             {
9424               ooo[i]=1;
9425               delayslot_alloc(&current,i+1);
9426             }
9427           }
9428           else
9429           // Don't alloc the delay slot yet because we might not execute it
9430           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
9431           {
9432             current.isconst=0;
9433             current.wasconst=0;
9434             regs[i].wasconst=0;
9435             alloc_cc(&current,i);
9436             dirty_reg(&current,CCREG);
9437             alloc_reg(&current,i,rs1[i]);
9438             if(!(current.is32>>rs1[i]&1))
9439             {
9440               alloc_reg64(&current,i,rs1[i]);
9441             }
9442           }
9443           ds=1;
9444           //current.isconst=0;
9445           break;
9446         case FJUMP:
9447           current.isconst=0;
9448           current.wasconst=0;
9449           regs[i].wasconst=0;
9450           if(likely[i]==0) // BC1F/BC1T
9451           {
9452             // TODO: Theoretically we can run out of registers here on x86.
9453             // The delay slot can allocate up to six, and we need to check
9454             // CSREG before executing the delay slot.  Possibly we could drop
9455             // the cycle count and reload it after checking that the FPU is in
9456             // a usable state, or simply avoid out-of-order execution here.
9457             alloc_cc(&current,i);
9458             dirty_reg(&current,CCREG);
9459             alloc_reg(&current,i,FSREG);
9460             alloc_reg(&current,i,CSREG);
9461             if(itype[i+1]==FCOMP) {
9462               // The delay slot overwrites the branch condition.
9463               // Allocate the branch condition registers instead.
9464               alloc_cc(&current,i);
9465               dirty_reg(&current,CCREG);
9466               alloc_reg(&current,i,CSREG);
9467               alloc_reg(&current,i,FSREG);
9468             }
9469             else {
9470               ooo[i]=1;
9471               delayslot_alloc(&current,i+1);
9472               alloc_reg(&current,i+1,CSREG);
9473             }
9474           }
9475           else
9476           // Don't alloc the delay slot yet because we might not execute it
9477           if(likely[i]) // BC1FL/BC1TL
9478           {
9479             alloc_cc(&current,i);
9480             dirty_reg(&current,CCREG);
9481             alloc_reg(&current,i,CSREG);
9482             alloc_reg(&current,i,FSREG);
9483           }
9484           ds=1;
9485           current.isconst=0;
9486           break;
9487         case IMM16:
9488           imm16_alloc(&current,i);
9489           break;
9490         case LOAD:
9491         case LOADLR:
9492           load_alloc(&current,i);
9493           break;
9494         case STORE:
9495         case STORELR:
9496           store_alloc(&current,i);
9497           break;
9498         case ALU:
9499           alu_alloc(&current,i);
9500           break;
9501         case SHIFT:
9502           shift_alloc(&current,i);
9503           break;
9504         case MULTDIV:
9505           multdiv_alloc(&current,i);
9506           break;
9507         case SHIFTIMM:
9508           shiftimm_alloc(&current,i);
9509           break;
9510         case MOV:
9511           mov_alloc(&current,i);
9512           break;
9513         case COP0:
9514           cop0_alloc(&current,i);
9515           break;
9516         case COP1:
9517         case COP2:
9518           cop1_alloc(&current,i);
9519           break;
9520         case C1LS:
9521           c1ls_alloc(&current,i);
9522           break;
9523         case C2LS:
9524           c2ls_alloc(&current,i);
9525           break;
9526         case C2OP:
9527           c2op_alloc(&current,i);
9528           break;
9529         case FCONV:
9530           fconv_alloc(&current,i);
9531           break;
9532         case FLOAT:
9533           float_alloc(&current,i);
9534           break;
9535         case FCOMP:
9536           fcomp_alloc(&current,i);
9537           break;
9538         case SYSCALL:
9539         case HLECALL:
9540         case INTCALL:
9541           syscall_alloc(&current,i);
9542           break;
9543         case SPAN:
9544           pagespan_alloc(&current,i);
9545           break;
9546       }
9547
9548       // Drop the upper half of registers that have become 32-bit
9549       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
9550       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
9551         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9552         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9553         current.uu|=1;
9554       } else {
9555         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
9556         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
9557         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
9558         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9559         current.uu|=1;
9560       }
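      // Note added for readability (informal): current.u / current.uu are
      // "unneeded" bitmaps over the MIPS registers (whole value / upper half),
      // and current.is32 marks registers known to hold sign-extended 32-bit
      // values, so their upper halves need not be kept live.  Bit 0 ($zero)
      // is always marked unneeded.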
9561
9562       // Create entry (branch target) regmap
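      // (Added note: regs[i].regmap_entry[hr] is the mapping a jump into this
      // instruction is expected to find in host register hr; -1 roughly means
      // "nothing required here, reload on demand".)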
9563       for(hr=0;hr<HOST_REGS;hr++)
9564       {
9565         int r,or,er;
9566         r=current.regmap[hr];
9567         if(r>=0) {
9568           if(r!=regmap_pre[i][hr]) {
9569             // TODO: delay slot (?)
9570             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
9571             if(or<0||(r&63)>=TEMPREG){
9572               regs[i].regmap_entry[hr]=-1;
9573             }
9574             else
9575             {
9576               // Just move it to a different register
9577               regs[i].regmap_entry[hr]=r;
9578               // If it was dirty before, it's still dirty
9579               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
9580             }
9581           }
9582           else
9583           {
9584             // Unneeded
9585             if(r==0){
9586               regs[i].regmap_entry[hr]=0;
9587             }
9588             else
9589             if(r<64){
9590               if((current.u>>r)&1) {
9591                 regs[i].regmap_entry[hr]=-1;
9592                 //regs[i].regmap[hr]=-1;
9593                 current.regmap[hr]=-1;
9594               }else
9595                 regs[i].regmap_entry[hr]=r;
9596             }
9597             else {
9598               if((current.uu>>(r&63))&1) {
9599                 regs[i].regmap_entry[hr]=-1;
9600                 //regs[i].regmap[hr]=-1;
9601                 current.regmap[hr]=-1;
9602               }else
9603                 regs[i].regmap_entry[hr]=r;
9604             }
9605           }
9606         } else {
9607           // Branches expect CCREG to be allocated at the target
9608           if(regmap_pre[i][hr]==CCREG)
9609             regs[i].regmap_entry[hr]=CCREG;
9610           else
9611             regs[i].regmap_entry[hr]=-1;
9612         }
9613       }
9614       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
9615     }
9616
9617     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
9618       current.waswritten|=1<<rs1[i-1];
9619     current.waswritten&=~(1<<rt1[i]);
9620     current.waswritten&=~(1<<rt2[i]);
9621     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
9622       current.waswritten&=~(1<<rs1[i]);
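    // (Added note: waswritten tracks MIPS registers recently used as a store
    // base with a small immediate offset, cleared when the register is
    // overwritten or used with a large offset; presumably this lets later
    // stores through the same base skip redundant invalidation checks.)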
9623
9624     /* Branch post-alloc */
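    // (Added note, informal: branch_regs[i-1] records the allocation state in
    // effect when the branch at i-1 itself executes -- i.e. after its delay
    // slot at i -- and is used for the taken path, while `current` carries on
    // along the not-taken / fall-through path.)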
9625     if(i>0)
9626     {
9627       current.was32=current.is32;
9628       current.wasdirty=current.dirty;
9629       switch(itype[i-1]) {
9630         case UJUMP:
9631           memcpy(&branch_regs[i-1],&current,sizeof(current));
9632           branch_regs[i-1].isconst=0;
9633           branch_regs[i-1].wasconst=0;
9634           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9635           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9636           alloc_cc(&branch_regs[i-1],i-1);
9637           dirty_reg(&branch_regs[i-1],CCREG);
9638           if(rt1[i-1]==31) { // JAL
9639             alloc_reg(&branch_regs[i-1],i-1,31);
9640             dirty_reg(&branch_regs[i-1],31);
9641             branch_regs[i-1].is32|=1LL<<31;
9642           }
9643           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9644           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
9645           break;
9646         case RJUMP:
9647           memcpy(&branch_regs[i-1],&current,sizeof(current));
9648           branch_regs[i-1].isconst=0;
9649           branch_regs[i-1].wasconst=0;
9650           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9651           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9652           alloc_cc(&branch_regs[i-1],i-1);
9653           dirty_reg(&branch_regs[i-1],CCREG);
9654           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
9655           if(rt1[i-1]!=0) { // JALR
9656             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
9657             dirty_reg(&branch_regs[i-1],rt1[i-1]);
9658             branch_regs[i-1].is32|=1LL<<rt1[i-1];
9659           }
9660           #ifdef USE_MINI_HT
9661           if(rs1[i-1]==31) { // JALR
9662             alloc_reg(&branch_regs[i-1],i-1,RHASH);
9663             #ifndef HOST_IMM_ADDR32
9664             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
9665             #endif
9666           }
9667           #endif
9668           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9669           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
9670           break;
9671         case CJUMP:
9672           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
9673           {
9674             alloc_cc(&current,i-1);
9675             dirty_reg(&current,CCREG);
9676             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
9677                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
9678               // The delay slot overwrote one of our conditions
9679               // Delay slot goes after the test (in order)
9680               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9681               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9682               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9683               current.u|=1;
9684               current.uu|=1;
9685               delayslot_alloc(&current,i);
9686               current.isconst=0;
9687             }
9688             else
9689             {
9690               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9691               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9692               // Alloc the branch condition registers
9693               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
9694               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
9695               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
9696               {
9697                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
9698                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
9699               }
9700             }
9701             memcpy(&branch_regs[i-1],&current,sizeof(current));
9702             branch_regs[i-1].isconst=0;
9703             branch_regs[i-1].wasconst=0;
9704             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9705             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
9706           }
9707           else
9708           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
9709           {
9710             alloc_cc(&current,i-1);
9711             dirty_reg(&current,CCREG);
9712             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9713               // The delay slot overwrote the branch condition
9714               // Delay slot goes after the test (in order)
9715               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9716               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9717               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9718               current.u|=1;
9719               current.uu|=1;
9720               delayslot_alloc(&current,i);
9721               current.isconst=0;
9722             }
9723             else
9724             {
9725               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9726               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9727               // Alloc the branch condition register
9728               alloc_reg(&current,i-1,rs1[i-1]);
9729               if(!(current.is32>>rs1[i-1]&1))
9730               {
9731                 alloc_reg64(&current,i-1,rs1[i-1]);
9732               }
9733             }
9734             memcpy(&branch_regs[i-1],&current,sizeof(current));
9735             branch_regs[i-1].isconst=0;
9736             branch_regs[i-1].wasconst=0;
9737             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9738             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
9739           }
9740           else
9741           // Alloc the delay slot in case the branch is taken
9742           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
9743           {
9744             memcpy(&branch_regs[i-1],&current,sizeof(current));
9745             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9746             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9747             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9748             alloc_cc(&branch_regs[i-1],i);
9749             dirty_reg(&branch_regs[i-1],CCREG);
9750             delayslot_alloc(&branch_regs[i-1],i);
9751             branch_regs[i-1].isconst=0;
9752             alloc_reg(&current,i,CCREG); // Not taken path
9753             dirty_reg(&current,CCREG);
9754             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9755           }
9756           else
9757           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
9758           {
9759             memcpy(&branch_regs[i-1],&current,sizeof(current));
9760             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9761             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9762             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9763             alloc_cc(&branch_regs[i-1],i);
9764             dirty_reg(&branch_regs[i-1],CCREG);
9765             delayslot_alloc(&branch_regs[i-1],i);
9766             branch_regs[i-1].isconst=0;
9767             alloc_reg(&current,i,CCREG); // Not taken path
9768             dirty_reg(&current,CCREG);
9769             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9770           }
9771           break;
9772         case SJUMP:
9773           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
9774           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
9775           {
9776             alloc_cc(&current,i-1);
9777             dirty_reg(&current,CCREG);
9778             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9779               // The delay slot overwrote the branch condition
9780               // Delay slot goes after the test (in order)
9781               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9782               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9783               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9784               current.u|=1;
9785               current.uu|=1;
9786               delayslot_alloc(&current,i);
9787               current.isconst=0;
9788             }
9789             else
9790             {
9791               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9792               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9793               // Alloc the branch condition register
9794               alloc_reg(&current,i-1,rs1[i-1]);
9795               if(!(current.is32>>rs1[i-1]&1))
9796               {
9797                 alloc_reg64(&current,i-1,rs1[i-1]);
9798               }
9799             }
9800             memcpy(&branch_regs[i-1],&current,sizeof(current));
9801             branch_regs[i-1].isconst=0;
9802             branch_regs[i-1].wasconst=0;
9803             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9804             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
9805           }
9806           else
9807           // Alloc the delay slot in case the branch is taken
9808           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
9809           {
9810             memcpy(&branch_regs[i-1],&current,sizeof(current));
9811             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9812             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9813             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9814             alloc_cc(&branch_regs[i-1],i);
9815             dirty_reg(&branch_regs[i-1],CCREG);
9816             delayslot_alloc(&branch_regs[i-1],i);
9817             branch_regs[i-1].isconst=0;
9818             alloc_reg(&current,i,CCREG); // Not taken path
9819             dirty_reg(&current,CCREG);
9820             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9821           }
9822           // FIXME: BLTZAL/BGEZAL
9823           if(opcode2[i-1]&0x10) { // BxxZAL
9824             alloc_reg(&branch_regs[i-1],i-1,31);
9825             dirty_reg(&branch_regs[i-1],31);
9826             branch_regs[i-1].is32|=1LL<<31;
9827           }
9828           break;
9829         case FJUMP:
9830           if(likely[i-1]==0) // BC1F/BC1T
9831           {
9832             alloc_cc(&current,i-1);
9833             dirty_reg(&current,CCREG);
9834             if(itype[i]==FCOMP) {
9835               // The delay slot overwrote the branch condition
9836               // Delay slot goes after the test (in order)
9837               delayslot_alloc(&current,i);
9838               current.isconst=0;
9839             }
9840             else
9841             {
9842               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9843               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9844               // Alloc the branch condition register
9845               alloc_reg(&current,i-1,FSREG);
9846             }
9847             memcpy(&branch_regs[i-1],&current,sizeof(current));
9848             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9849           }
9850           else // BC1FL/BC1TL
9851           {
9852             // Alloc the delay slot in case the branch is taken
9853             memcpy(&branch_regs[i-1],&current,sizeof(current));
9854             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9855             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9856             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9857             alloc_cc(&branch_regs[i-1],i);
9858             dirty_reg(&branch_regs[i-1],CCREG);
9859             delayslot_alloc(&branch_regs[i-1],i);
9860             branch_regs[i-1].isconst=0;
9861             alloc_reg(&current,i,CCREG); // Not taken path
9862             dirty_reg(&current,CCREG);
9863             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9864           }
9865           break;
9866       }
9867
9868       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
9869       {
9870         if(rt1[i-1]==31) // JAL/JALR
9871         {
9872           // Subroutine call will return here, don't alloc any registers
9873           current.is32=1;
9874           current.dirty=0;
9875           clear_all_regs(current.regmap);
9876           alloc_reg(&current,i,CCREG);
9877           dirty_reg(&current,CCREG);
9878         }
9879         else if(i+1<slen)
9880         {
9881           // Internal branch will jump here, match registers to caller
9882           current.is32=0x3FFFFFFFFLL;
9883           current.dirty=0;
9884           clear_all_regs(current.regmap);
9885           alloc_reg(&current,i,CCREG);
9886           dirty_reg(&current,CCREG);
9887           for(j=i-1;j>=0;j--)
9888           {
9889             if(ba[j]==start+i*4+4) {
9890               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
9891               current.is32=branch_regs[j].is32;
9892               current.dirty=branch_regs[j].dirty;
9893               break;
9894             }
9895           }
9896           while(j>=0) {
9897             if(ba[j]==start+i*4+4) {
9898               for(hr=0;hr<HOST_REGS;hr++) {
9899                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
9900                   current.regmap[hr]=-1;
9901                 }
9902                 current.is32&=branch_regs[j].is32;
9903                 current.dirty&=branch_regs[j].dirty;
9904               }
9905             }
9906             j--;
9907           }
9908         }
9909       }
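      // (Added note: after an unconditional jump the allocation state is
      // reset -- only the cycle count survives a JAL/JALR return point, and
      // an internal jump target keeps only what every branch targeting it
      // agrees on, via the intersection loop above.)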
9910     }
9911
9912     // Count cycles in between branches
9913     ccadj[i]=cc;
9914     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
9915     {
9916       cc=0;
9917     }
9918 #if defined(PCSX) && !defined(DRC_DBG)
9919     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
9920     {
9921       // GTE runs in parallel until accessed, so divide by 2 for a rough guess
9922       cc+=gte_cycletab[source[i]&0x3f]/2;
9923     }
9924     else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // load,store causes weird timing issues
9925     {
9926       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
9927     }
9928     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
9929     {
9930       cc+=4;
9931     }
9932     else if(itype[i]==C2LS)
9933     {
9934       cc+=4;
9935     }
9936 #endif
9937     else
9938     {
9939       cc++;
9940     }
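    // (Added note: ccadj[i] above is the estimated cycle count accumulated
    // since the last branch; the PCSX-specific cases are rough timing fudge
    // factors for GTE ops and coprocessor loads/stores rather than exact
    // cycle counts.)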
9941
9942     flush_dirty_uppers(&current);
9943     if(!is_ds[i]) {
9944       regs[i].is32=current.is32;
9945       regs[i].dirty=current.dirty;
9946       regs[i].isconst=current.isconst;
9947       memcpy(constmap[i],current_constmap,sizeof(current_constmap));
9948     }
9949     for(hr=0;hr<HOST_REGS;hr++) {
9950       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9951         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9952           regs[i].wasconst&=~(1<<hr);
9953         }
9954       }
9955     }
9956     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9957     regs[i].waswritten=current.waswritten;
9958   }
9959
9960   /* Pass 4 - Cull unused host registers */
9961
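  // Informal summary (added): this pass walks the block backwards keeping a
  // per-host-register "needed" bitmask in nr.  A set bit means the value
  // cached in that host register is still live; everything else may be
  // culled.  The two basic operations used throughout are simply:
  //   nr |= 1 << hr;    /* value in host reg hr must be preserved */
  //   nr &= ~(1 << hr); /* value in host reg hr can be discarded  */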
9962   uint64_t nr=0;
9963
9964   for (i=slen-1;i>=0;i--)
9965   {
9966     int hr;
9967     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9968     {
9969       if(ba[i]<start || ba[i]>=(start+slen*4))
9970       {
9971         // Branch out of this block, don't need anything
9972         nr=0;
9973       }
9974       else
9975       {
9976         // Internal branch
9977         // Need whatever matches the target
9978         nr=0;
9979         int t=(ba[i]-start)>>2;
9980         for(hr=0;hr<HOST_REGS;hr++)
9981         {
9982           if(regs[i].regmap_entry[hr]>=0) {
9983             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9984           }
9985         }
9986       }
9987       // Conditional branch may need registers for following instructions
9988       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9989       {
9990         if(i<slen-2) {
9991           nr|=needed_reg[i+2];
9992           for(hr=0;hr<HOST_REGS;hr++)
9993           {
9994             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9995             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9996           }
9997         }
9998       }
9999       // Don't need stuff which is overwritten
10000       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
10001       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
10002       // Merge in delay slot
10003       for(hr=0;hr<HOST_REGS;hr++)
10004       {
10005         if(!likely[i]) {
10006           // These are overwritten unless the branch is "likely",
10007           // in which case the delay slot is nullified when not taken
10008           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
10009           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
10010         }
10011         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10012         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10013         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
10014         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
10015         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10016         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10017         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
10018         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
10019         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
10020           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10021           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10022         }
10023         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
10024           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10025           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10026         }
10027         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
10028           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
10029           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
10030         }
10031       }
10032     }
10033     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
10034     {
10035       // SYSCALL instruction (software interrupt)
10036       nr=0;
10037     }
10038     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
10039     {
10040       // ERET instruction (return from interrupt)
10041       nr=0;
10042     }
10043     else // Non-branch
10044     {
10045       if(i<slen-1) {
10046         for(hr=0;hr<HOST_REGS;hr++) {
10047           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
10048           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
10049           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
10050           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
10051         }
10052       }
10053     }
10054     for(hr=0;hr<HOST_REGS;hr++)
10055     {
10056       // Overwritten registers are not needed
10057       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
10058       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
10059       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
10060       // Source registers are needed
10061       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10062       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10063       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
10064       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
10065       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10066       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10067       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
10068       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
10069       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
10070         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10071         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10072       }
10073       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
10074         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10075         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10076       }
10077       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
10078         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
10079         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
10080       }
10081       // Don't store a register immediately after writing it,
10082       // as that may prevent dual-issue.
10083       // But do so if this is a branch target, otherwise we
10084       // might have to load the register before the branch.
10085       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
10086         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
10087            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
10088           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10089           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10090         }
10091         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
10092            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
10093           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10094           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10095         }
10096       }
10097     }
10098     // Cycle count is needed at branches.  Assume it is needed at the target too.
10099     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
10100       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
10101       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
10102     }
10103     // Save it
10104     needed_reg[i]=nr;
10105
10106     // Deallocate unneeded registers
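    // (Added note: the long condition chains below spell out the exceptions
    // -- sources and targets of this instruction and its delay slot,
    // temporaries such as FTEMP/PTEMP/RTEMP, CCREG and the map/invalidation
    // pointers -- which must not be dropped even when their bit in nr is
    // clear.)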
10107     for(hr=0;hr<HOST_REGS;hr++)
10108     {
10109       if(!((nr>>hr)&1)) {
10110         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
10111         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
10112            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
10113            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
10114         {
10115           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10116           {
10117             if(likely[i]) {
10118               regs[i].regmap[hr]=-1;
10119               regs[i].isconst&=~(1<<hr);
10120               if(i<slen-2) {
10121                 regmap_pre[i+2][hr]=-1;
10122                 regs[i+2].wasconst&=~(1<<hr);
10123               }
10124             }
10125           }
10126         }
10127         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10128         {
10129           int d1=0,d2=0,map=0,temp=0;
10130           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
10131           {
10132             d1=dep1[i+1];
10133             d2=dep2[i+1];
10134           }
10135           if(using_tlb) {
10136             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
10137                itype[i+1]==STORE || itype[i+1]==STORELR ||
10138                itype[i+1]==C1LS || itype[i+1]==C2LS)
10139             map=TLREG;
10140           } else
10141           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
10142              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
10143             map=INVCP;
10144           }
10145           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
10146              itype[i+1]==C1LS || itype[i+1]==C2LS)
10147             temp=FTEMP;
10148           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
10149              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
10150              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
10151              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
10152              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
10153              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
10154              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
10155              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
10156              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
10157              regs[i].regmap[hr]!=map )
10158           {
10159             regs[i].regmap[hr]=-1;
10160             regs[i].isconst&=~(1<<hr);
10161             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
10162                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
10163                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
10164                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
10165                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
10166                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
10167                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
10168                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
10169                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
10170                branch_regs[i].regmap[hr]!=map)
10171             {
10172               branch_regs[i].regmap[hr]=-1;
10173               branch_regs[i].regmap_entry[hr]=-1;
10174               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10175               {
10176                 if(!likely[i]&&i<slen-2) {
10177                   regmap_pre[i+2][hr]=-1;
10178                   regs[i+2].wasconst&=~(1<<hr);
10179                 }
10180               }
10181             }
10182           }
10183         }
10184         else
10185         {
10186           // Non-branch
10187           if(i>0)
10188           {
10189             int d1=0,d2=0,map=-1,temp=-1;
10190             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
10191             {
10192               d1=dep1[i];
10193               d2=dep2[i];
10194             }
10195             if(using_tlb) {
10196               if(itype[i]==LOAD || itype[i]==LOADLR ||
10197                  itype[i]==STORE || itype[i]==STORELR ||
10198                  itype[i]==C1LS || itype[i]==C2LS)
10199               map=TLREG;
10200             } else if(itype[i]==STORE || itype[i]==STORELR ||
10201                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
10202               map=INVCP;
10203             }
10204             if(itype[i]==LOADLR || itype[i]==STORELR ||
10205                itype[i]==C1LS || itype[i]==C2LS)
10206               temp=FTEMP;
10207             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
10208                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
10209                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
10210                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
10211                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
10212                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
10213             {
10214               if(i<slen-1&&!is_ds[i]) {
10215                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
10216                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
10217                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
10218                 {
10219                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
10220                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
10221                 }
10222                 regmap_pre[i+1][hr]=-1;
10223                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
10224                 regs[i+1].wasconst&=~(1<<hr);
10225               }
10226               regs[i].regmap[hr]=-1;
10227               regs[i].isconst&=~(1<<hr);
10228             }
10229           }
10230         }
10231       }
10232     }
10233   }
10234
10235   /* Pass 5 - Pre-allocate registers */
10236
10237   // If a register is allocated during a loop, try to allocate it for the
10238   // entire loop, if possible.  This avoids loading/storing registers
10239   // inside of the loop.
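  // Informal example (added; the MIPS snippet is hypothetical): for a loop
  // such as
  //     t: lw   $v0, 0($a0)
  //        ...
  //     i: bne  $v0, $v1, t
  // the scan below tries to keep $a0's host register allocated over the whole
  // range [t, i] so it is not reloaded on every iteration.  f_regmap holds
  // the tentative per-host-register choice being extended.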
10240
10241   signed char f_regmap[HOST_REGS];
10242   clear_all_regs(f_regmap);
10243   for(i=0;i<slen-1;i++)
10244   {
10245     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10246     {
10247       if(ba[i]>=start && ba[i]<(start+i*4))
10248       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
10249       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
10250       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
10251       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
10252       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
10253       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
10254       {
10255         int t=(ba[i]-start)>>2;
10256         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
10257         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
10258         for(hr=0;hr<HOST_REGS;hr++)
10259         {
10260           if(regs[i].regmap[hr]>64) {
10261             if(!((regs[i].dirty>>hr)&1))
10262               f_regmap[hr]=regs[i].regmap[hr];
10263             else f_regmap[hr]=-1;
10264           }
10265           else if(regs[i].regmap[hr]>=0) {
10266             if(f_regmap[hr]!=regs[i].regmap[hr]) {
10267               // dealloc old register
10268               int n;
10269               for(n=0;n<HOST_REGS;n++)
10270               {
10271                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
10272               }
10273               // and alloc new one
10274               f_regmap[hr]=regs[i].regmap[hr];
10275             }
10276           }
10277           if(branch_regs[i].regmap[hr]>64) {
10278             if(!((branch_regs[i].dirty>>hr)&1))
10279               f_regmap[hr]=branch_regs[i].regmap[hr];
10280             else f_regmap[hr]=-1;
10281           }
10282           else if(branch_regs[i].regmap[hr]>=0) {
10283             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
10284               // dealloc old register
10285               int n;
10286               for(n=0;n<HOST_REGS;n++)
10287               {
10288                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
10289               }
10290               // and alloc new one
10291               f_regmap[hr]=branch_regs[i].regmap[hr];
10292             }
10293           }
10294           if(ooo[i]) {
10295             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
10296               f_regmap[hr]=branch_regs[i].regmap[hr];
10297           }else{
10298             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
10299               f_regmap[hr]=branch_regs[i].regmap[hr];
10300           }
10301           // Avoid dirty->clean transition
10302           #ifdef DESTRUCTIVE_WRITEBACK
10303           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
10304           #endif
10305           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
10306           // case above, but it's always a good idea.  We can't hoist the
10307           // load if the register was already allocated, so there's no point
10308           // wasting time analyzing most of these cases.  It only "succeeds"
10309           // when the mapping was different and the load can be replaced with
10310           // a mov, which is of negligible benefit.  So such cases are
10311           // skipped below.
10312           if(f_regmap[hr]>0) {
10313             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
10314               int r=f_regmap[hr];
10315               for(j=t;j<=i;j++)
10316               {
10317                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
10318                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
10319                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
10320                 if(r>63) {
10321                   // NB This can exclude the case where the upper-half
10322                   // register is lower numbered than the lower-half
10323                   // register.  Not sure if it's worth fixing...
10324                   if(get_reg(regs[j].regmap,r&63)<0) break;
10325                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
10326                   if(regs[j].is32&(1LL<<(r&63))) break;
10327                 }
10328                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
10329                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
10330                   int k;
10331                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
10332                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
10333                     if(r>63) {
10334                       if(get_reg(regs[i].regmap,r&63)<0) break;
10335                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
10336                     }
10337                     k=i;
10338                     while(k>1&&regs[k-1].regmap[hr]==-1) {
10339                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
10340                         //printf("no free regs for store %x\n",start+(k-1)*4);
10341                         break;
10342                       }
10343                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
10344                         //printf("no-match due to different register\n");
10345                         break;
10346                       }
10347                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
10348                         //printf("no-match due to branch\n");
10349                         break;
10350                       }
10351                       // call/ret fast path assumes no registers allocated
10352                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
10353                         break;
10354                       }
10355                       if(r>63) {
10356                         // NB This can exclude the case where the upper-half
10357                         // register is lower numbered than the lower-half
10358                         // register.  Not sure if it's worth fixing...
10359                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
10360                         if(regs[k-1].is32&(1LL<<(r&63))) break;
10361                       }
10362                       k--;
10363                     }
10364                     if(i<slen-1) {
10365                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
10366                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
10367                         //printf("bad match after branch\n");
10368                         break;
10369                       }
10370                     }
10371                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
10372                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
10373                       while(k<i) {
10374                         regs[k].regmap_entry[hr]=f_regmap[hr];
10375                         regs[k].regmap[hr]=f_regmap[hr];
10376                         regmap_pre[k+1][hr]=f_regmap[hr];
10377                         regs[k].wasdirty&=~(1<<hr);
10378                         regs[k].dirty&=~(1<<hr);
10379                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
10380                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
10381                         regs[k].wasconst&=~(1<<hr);
10382                         regs[k].isconst&=~(1<<hr);
10383                         k++;
10384                       }
10385                     }
10386                     else {
10387                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
10388                       break;
10389                     }
10390                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
10391                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
10392                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
10393                       regs[i].regmap_entry[hr]=f_regmap[hr];
10394                       regs[i].regmap[hr]=f_regmap[hr];
10395                       regs[i].wasdirty&=~(1<<hr);
10396                       regs[i].dirty&=~(1<<hr);
10397                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
10398                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
10399                       regs[i].wasconst&=~(1<<hr);
10400                       regs[i].isconst&=~(1<<hr);
10401                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
10402                       branch_regs[i].wasdirty&=~(1<<hr);
10403                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
10404                       branch_regs[i].regmap[hr]=f_regmap[hr];
10405                       branch_regs[i].dirty&=~(1<<hr);
10406                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
10407                       branch_regs[i].wasconst&=~(1<<hr);
10408                       branch_regs[i].isconst&=~(1<<hr);
10409                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
10410                         regmap_pre[i+2][hr]=f_regmap[hr];
10411                         regs[i+2].wasdirty&=~(1<<hr);
10412                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
10413                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
10414                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
10415                       }
10416                     }
10417                   }
10418                   for(k=t;k<j;k++) {
10419                     // Alloc register clean at beginning of loop,
10420                     // but may dirty it in pass 6
10421                     regs[k].regmap_entry[hr]=f_regmap[hr];
10422                     regs[k].regmap[hr]=f_regmap[hr];
10423                     regs[k].dirty&=~(1<<hr);
10424                     regs[k].wasconst&=~(1<<hr);
10425                     regs[k].isconst&=~(1<<hr);
10426                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
10427                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
10428                       branch_regs[k].regmap[hr]=f_regmap[hr];
10429                       branch_regs[k].dirty&=~(1<<hr);
10430                       branch_regs[k].wasconst&=~(1<<hr);
10431                       branch_regs[k].isconst&=~(1<<hr);
10432                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
10433                         regmap_pre[k+2][hr]=f_regmap[hr];
10434                         regs[k+2].wasdirty&=~(1<<hr);
10435                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
10436                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
10437                       }
10438                     }
10439                     else
10440                     {
10441                       regmap_pre[k+1][hr]=f_regmap[hr];
10442                       regs[k+1].wasdirty&=~(1<<hr);
10443                     }
10444                   }
10445                   if(regs[j].regmap[hr]==f_regmap[hr])
10446                     regs[j].regmap_entry[hr]=f_regmap[hr];
10447                   break;
10448                 }
10449                 if(j==i) break;
10450                 if(regs[j].regmap[hr]>=0)
10451                   break;
10452                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
10453                   //printf("no-match due to different register\n");
10454                   break;
10455                 }
10456                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
10457                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
10458                   break;
10459                 }
10460                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10461                 {
10462                   // Stop on unconditional branch
10463                   break;
10464                 }
10465                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
10466                 {
10467                   if(ooo[j]) {
10468                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
10469                       break;
10470                   }else{
10471                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
10472                       break;
10473                   }
10474                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
10475                     //printf("no-match due to different register (branch)\n");
10476                     break;
10477                   }
10478                 }
10479                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10480                   //printf("No free regs for store %x\n",start+j*4);
10481                   break;
10482                 }
10483                 if(f_regmap[hr]>=64) {
10484                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
10485                     break;
10486                   }
10487                   else
10488                   {
10489                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
10490                       break;
10491                     }
10492                   }
10493                 }
10494               }
10495             }
10496           }
10497         }
10498       }
10499     }else{
10500       // Non-branch or undetermined branch target
10501       for(hr=0;hr<HOST_REGS;hr++)
10502       {
10503         if(hr!=EXCLUDE_REG) {
10504           if(regs[i].regmap[hr]>64) {
10505             if(!((regs[i].dirty>>hr)&1))
10506               f_regmap[hr]=regs[i].regmap[hr];
10507           }
10508           else if(regs[i].regmap[hr]>=0) {
10509             if(f_regmap[hr]!=regs[i].regmap[hr]) {
10510               // dealloc old register
10511               int n;
10512               for(n=0;n<HOST_REGS;n++)
10513               {
10514                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
10515               }
10516               // and alloc new one
10517               f_regmap[hr]=regs[i].regmap[hr];
10518             }
10519           }
10520         }
10521       }
10522       // Try to restore cycle count at branch targets
10523       if(bt[i]) {
10524         for(j=i;j<slen-1;j++) {
10525           if(regs[j].regmap[HOST_CCREG]!=-1) break;
10526           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10527             //printf("no free regs for store %x\n",start+j*4);
10528             break;
10529           }
10530         }
10531         if(regs[j].regmap[HOST_CCREG]==CCREG) {
10532           int k=i;
10533           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
10534           while(k<j) {
10535             regs[k].regmap_entry[HOST_CCREG]=CCREG;
10536             regs[k].regmap[HOST_CCREG]=CCREG;
10537             regmap_pre[k+1][HOST_CCREG]=CCREG;
10538             regs[k+1].wasdirty|=1<<HOST_CCREG;
10539             regs[k].dirty|=1<<HOST_CCREG;
10540             regs[k].wasconst&=~(1<<HOST_CCREG);
10541             regs[k].isconst&=~(1<<HOST_CCREG);
10542             k++;
10543           }
10544           regs[j].regmap_entry[HOST_CCREG]=CCREG;
10545         }
10546         // Work backwards from the branch target
10547         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
10548         {
10549           //printf("Extend backwards\n");
10550           int k;
10551           k=i;
10552           while(regs[k-1].regmap[HOST_CCREG]==-1) {
10553             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
10554               //printf("no free regs for store %x\n",start+(k-1)*4);
10555               break;
10556             }
10557             k--;
10558           }
10559           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
10560             //printf("Extend CC, %x ->\n",start+k*4);
10561             while(k<=i) {
10562               regs[k].regmap_entry[HOST_CCREG]=CCREG;
10563               regs[k].regmap[HOST_CCREG]=CCREG;
10564               regmap_pre[k+1][HOST_CCREG]=CCREG;
10565               regs[k+1].wasdirty|=1<<HOST_CCREG;
10566               regs[k].dirty|=1<<HOST_CCREG;
10567               regs[k].wasconst&=~(1<<HOST_CCREG);
10568               regs[k].isconst&=~(1<<HOST_CCREG);
10569               k++;
10570             }
10571           }
10572           else {
10573             //printf("Fail Extend CC, %x ->\n",start+k*4);
10574           }
10575         }
10576       }
10577       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
10578          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
10579          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
10580          itype[i]!=FCONV&&itype[i]!=FCOMP)
10581       {
10582         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
10583       }
10584     }
10585   }
10586
10587   // Cache memory offset or tlb map pointer if a register is available
10588   #ifndef HOST_IMM_ADDR32
10589   #ifndef RAM_OFFSET
10590   if(using_tlb)
10591   #endif
10592   {
10593     int earliest_available[HOST_REGS];
10594     int loop_start[HOST_REGS];
10595     int score[HOST_REGS];
10596     int end[HOST_REGS];
10597     int reg=using_tlb?MMREG:ROREG;
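    // (Added note, informal: `reg` is the invariant pointer being cached --
    // the TLB map pointer (MMREG) or the RAM offset (ROREG).  score[hr]
    // counts how many upcoming loads/stores could be reached while host
    // register hr stays free, with bonus points for hoisting out of loops;
    // if the best score is at least 2, that register is allocated over
    // [loop_start, end].)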
10598
10599     // Init
10600     for(hr=0;hr<HOST_REGS;hr++) {
10601       score[hr]=0;earliest_available[hr]=0;
10602       loop_start[hr]=MAXBLOCK;
10603     }
10604     for(i=0;i<slen-1;i++)
10605     {
10606       // Can't do anything if no registers are available
10607       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
10608         for(hr=0;hr<HOST_REGS;hr++) {
10609           score[hr]=0;earliest_available[hr]=i+1;
10610           loop_start[hr]=MAXBLOCK;
10611         }
10612       }
10613       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10614         if(!ooo[i]) {
10615           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
10616             for(hr=0;hr<HOST_REGS;hr++) {
10617               score[hr]=0;earliest_available[hr]=i+1;
10618               loop_start[hr]=MAXBLOCK;
10619             }
10620           }
10621         }else{
10622           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
10623             for(hr=0;hr<HOST_REGS;hr++) {
10624               score[hr]=0;earliest_available[hr]=i+1;
10625               loop_start[hr]=MAXBLOCK;
10626             }
10627           }
10628         }
10629       }
10630       // Mark unavailable registers
10631       for(hr=0;hr<HOST_REGS;hr++) {
10632         if(regs[i].regmap[hr]>=0) {
10633           score[hr]=0;earliest_available[hr]=i+1;
10634           loop_start[hr]=MAXBLOCK;
10635         }
10636         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10637           if(branch_regs[i].regmap[hr]>=0) {
10638             score[hr]=0;earliest_available[hr]=i+2;
10639             loop_start[hr]=MAXBLOCK;
10640           }
10641         }
10642       }
10643       // No register allocations after unconditional jumps
10644       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10645       {
10646         for(hr=0;hr<HOST_REGS;hr++) {
10647           score[hr]=0;earliest_available[hr]=i+2;
10648           loop_start[hr]=MAXBLOCK;
10649         }
10650         i++; // Skip delay slot too
10651         //printf("skip delay slot: %x\n",start+i*4);
10652       }
10653       else
10654       // Possible match
10655       if(itype[i]==LOAD||itype[i]==LOADLR||
10656          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
10657         for(hr=0;hr<HOST_REGS;hr++) {
10658           if(hr!=EXCLUDE_REG) {
10659             end[hr]=i-1;
10660             for(j=i;j<slen-1;j++) {
10661               if(regs[j].regmap[hr]>=0) break;
10662               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10663                 if(branch_regs[j].regmap[hr]>=0) break;
10664                 if(ooo[j]) {
10665                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
10666                 }else{
10667                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
10668                 }
10669               }
10670               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
10671               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10672                 int t=(ba[j]-start)>>2;
10673                 if(t<j&&t>=earliest_available[hr]) {
10674                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
10675                     // Score a point for hoisting loop invariant
10676                     if(t<loop_start[hr]) loop_start[hr]=t;
10677                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
10678                     score[hr]++;
10679                     end[hr]=j;
10680                   }
10681                 }
10682                 else if(t<j) {
10683                   if(regs[t].regmap[hr]==reg) {
10684                     // Score a point if the branch target matches this register
10685                     score[hr]++;
10686                     end[hr]=j;
10687                   }
10688                 }
10689                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
10690                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
10691                   score[hr]++;
10692                   end[hr]=j;
10693                 }
10694               }
10695               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10696               {
10697                 // Stop on unconditional branch
10698                 break;
10699               }
10700               else
10701               if(itype[j]==LOAD||itype[j]==LOADLR||
10702                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
10703                 score[hr]++;
10704                 end[hr]=j;
10705               }
10706             }
10707           }
10708         }
10709         // Find highest score and allocate that register
10710         int maxscore=0;
10711         for(hr=0;hr<HOST_REGS;hr++) {
10712           if(hr!=EXCLUDE_REG) {
10713             if(score[hr]>score[maxscore]) {
10714               maxscore=hr;
10715               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
10716             }
10717           }
10718         }
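              // Only worth allocating if the register would be reused more
              // than once; on a tie the lowest-numbered host register wins
              // because maxscore starts at 0.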
10719         if(score[maxscore]>1)
10720         {
10721           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
10722           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
10723             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
10724             assert(regs[j].regmap[maxscore]<0);
10725             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
10726             regs[j].regmap[maxscore]=reg;
10727             regs[j].dirty&=~(1<<maxscore);
10728             regs[j].wasconst&=~(1<<maxscore);
10729             regs[j].isconst&=~(1<<maxscore);
10730             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10731               branch_regs[j].regmap[maxscore]=reg;
10732               branch_regs[j].wasdirty&=~(1<<maxscore);
10733               branch_regs[j].dirty&=~(1<<maxscore);
10734               branch_regs[j].wasconst&=~(1<<maxscore);
10735               branch_regs[j].isconst&=~(1<<maxscore);
10736               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
10737                 regmap_pre[j+2][maxscore]=reg;
10738                 regs[j+2].wasdirty&=~(1<<maxscore);
10739               }
10740               // loop optimization (loop_preload)
10741               int t=(ba[j]-start)>>2;
10742               if(t==loop_start[maxscore]) {
10743                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
10744                   regs[t].regmap_entry[maxscore]=reg;
10745               }
10746             }
10747             else
10748             {
10749               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
10750                 regmap_pre[j+1][maxscore]=reg;
10751                 regs[j+1].wasdirty&=~(1<<maxscore);
10752               }
10753             }
10754           }
10755           i=j-1;
10756           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
10757           for(hr=0;hr<HOST_REGS;hr++) {
10758             score[hr]=0;earliest_available[hr]=i+1;
10759             loop_start[hr]=MAXBLOCK;
10760           }
10761         }
10762       }
10763     }
10764   }
10765   #endif
10766
10767   // This allocates registers (if possible) one instruction prior
10768   // to use, which can avoid a load-use penalty on certain CPUs.
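        // Illustrative example: if instruction i is "addiu r2,r2,1" and
        // instruction i+1 is "lw r3,0(r4)", the host register that i+1 uses
        // for r4 is already mapped (and loaded) during the addiu, hiding the
        // load-use latency of fetching r4 from the register file.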
10769   for(i=0;i<slen-1;i++)
10770   {
10771     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
10772     {
10773       if(!bt[i+1])
10774       {
10775         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
10776            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
10777         {
10778           if(rs1[i+1]) {
10779             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
10780             {
10781               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10782               {
10783                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10784                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10785                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10786                 regs[i].isconst&=~(1<<hr);
10787                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10788                 constmap[i][hr]=constmap[i+1][hr];
10789                 regs[i+1].wasdirty&=~(1<<hr);
10790                 regs[i].dirty&=~(1<<hr);
10791               }
10792             }
10793           }
10794           if(rs2[i+1]) {
10795             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
10796             {
10797               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10798               {
10799                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10800                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10801                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10802                 regs[i].isconst&=~(1<<hr);
10803                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10804                 constmap[i][hr]=constmap[i+1][hr];
10805                 regs[i+1].wasdirty&=~(1<<hr);
10806                 regs[i].dirty&=~(1<<hr);
10807               }
10808             }
10809           }
10810           // Preload target address for load instruction (non-constant)
10811           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10812             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10813             {
10814               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10815               {
10816                 regs[i].regmap[hr]=rs1[i+1];
10817                 regmap_pre[i+1][hr]=rs1[i+1];
10818                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10819                 regs[i].isconst&=~(1<<hr);
10820                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10821                 constmap[i][hr]=constmap[i+1][hr];
10822                 regs[i+1].wasdirty&=~(1<<hr);
10823                 regs[i].dirty&=~(1<<hr);
10824               }
10825             }
10826           }
10827           // Load source into target register
10828           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10829             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10830             {
10831               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10832               {
10833                 regs[i].regmap[hr]=rs1[i+1];
10834                 regmap_pre[i+1][hr]=rs1[i+1];
10835                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10836                 regs[i].isconst&=~(1<<hr);
10837                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10838                 constmap[i][hr]=constmap[i+1][hr];
10839                 regs[i+1].wasdirty&=~(1<<hr);
10840                 regs[i].dirty&=~(1<<hr);
10841               }
10842             }
10843           }
10844           // Preload map address
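                // TLREG caches the memory map / tlb pointer used for address
                // translation; MGEN1 (alternating with MGEN1+1 on instruction
                // parity) seems to serve as a scratch marker here so the
                // pointer can be computed one instruction early without
                // clobbering the live TLREG mapping.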
10845           #ifndef HOST_IMM_ADDR32
10846           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
10847             hr=get_reg(regs[i+1].regmap,TLREG);
10848             if(hr>=0) {
10849               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
10850               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
10851                 int nr;
10852                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10853                 {
10854                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
10855                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
10856                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
10857                   regs[i].isconst&=~(1<<hr);
10858                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10859                   constmap[i][hr]=constmap[i+1][hr];
10860                   regs[i+1].wasdirty&=~(1<<hr);
10861                   regs[i].dirty&=~(1<<hr);
10862                 }
10863                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10864                 {
10865                   // move it to another register
10866                   regs[i+1].regmap[hr]=-1;
10867                   regmap_pre[i+2][hr]=-1;
10868                   regs[i+1].regmap[nr]=TLREG;
10869                   regmap_pre[i+2][nr]=TLREG;
10870                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
10871                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
10872                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
10873                   regs[i].isconst&=~(1<<nr);
10874                   regs[i+1].isconst&=~(1<<nr);
10875                   regs[i].dirty&=~(1<<nr);
10876                   regs[i+1].wasdirty&=~(1<<nr);
10877                   regs[i+1].dirty&=~(1<<nr);
10878                   regs[i+2].wasdirty&=~(1<<nr);
10879                 }
10880               }
10881             }
10882           }
10883           #endif
10884           // Address for store instruction (non-constant)
10885           if(itype[i+1]==STORE||itype[i+1]==STORELR
10886              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
10887             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10888               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
10889               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10890               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
10891               assert(hr>=0);
10892               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10893               {
10894                 regs[i].regmap[hr]=rs1[i+1];
10895                 regmap_pre[i+1][hr]=rs1[i+1];
10896                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10897                 regs[i].isconst&=~(1<<hr);
10898                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10899                 constmap[i][hr]=constmap[i+1][hr];
10900                 regs[i+1].wasdirty&=~(1<<hr);
10901                 regs[i].dirty&=~(1<<hr);
10902               }
10903             }
10904           }
10905           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
10906             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10907               int nr;
10908               hr=get_reg(regs[i+1].regmap,FTEMP);
10909               assert(hr>=0);
10910               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10911               {
10912                 regs[i].regmap[hr]=rs1[i+1];
10913                 regmap_pre[i+1][hr]=rs1[i+1];
10914                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10915                 regs[i].isconst&=~(1<<hr);
10916                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10917                 constmap[i][hr]=constmap[i+1][hr];
10918                 regs[i+1].wasdirty&=~(1<<hr);
10919                 regs[i].dirty&=~(1<<hr);
10920               }
10921               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10922               {
10923                 // move it to another register
10924                 regs[i+1].regmap[hr]=-1;
10925                 regmap_pre[i+2][hr]=-1;
10926                 regs[i+1].regmap[nr]=FTEMP;
10927                 regmap_pre[i+2][nr]=FTEMP;
10928                 regs[i].regmap[nr]=rs1[i+1];
10929                 regmap_pre[i+1][nr]=rs1[i+1];
10930                 regs[i+1].regmap_entry[nr]=rs1[i+1];
10931                 regs[i].isconst&=~(1<<nr);
10932                 regs[i+1].isconst&=~(1<<nr);
10933                 regs[i].dirty&=~(1<<nr);
10934                 regs[i+1].wasdirty&=~(1<<nr);
10935                 regs[i+1].dirty&=~(1<<nr);
10936                 regs[i+2].wasdirty&=~(1<<nr);
10937               }
10938             }
10939           }
10940           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
10941             if(itype[i+1]==LOAD)
10942               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
10943             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
10944               hr=get_reg(regs[i+1].regmap,FTEMP);
10945             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
10946               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
10947               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10948             }
10949             if(hr>=0&&regs[i].regmap[hr]<0) {
10950               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
10951               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
10952                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
10953                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
10954                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
10955                 regs[i].isconst&=~(1<<hr);
10956                 regs[i+1].wasdirty&=~(1<<hr);
10957                 regs[i].dirty&=~(1<<hr);
10958               }
10959             }
10960           }
10961         }
10962       }
10963     }
10964   }
10965
10966   /* Pass 6 - Optimize clean/dirty state */
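        // clean_registers() propagates dirty/clean state across the block so
        // that writeback code is only generated where a cached value may
        // actually still differ from the in-memory register file.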
10967   clean_registers(0,slen-1,1);
10968
10969   /* Pass 7 - Identify 32-bit registers */
10970 #ifndef FORCE32
10971   provisional_r32();
10972
10973   u_int r32=0;
10974
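        // Walk the block backwards building requires_32bit[]: r32 is a bitmask
        // of MIPS registers for which a valid 32-bit representation is still
        // needed.  Writes clear the corresponding bit (r32&=~(1LL<<rt1[i])),
        // uses of registers that were 32-bit set it again, and internal
        // branches merge in whatever the target position requires.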
10975   for (i=slen-1;i>=0;i--)
10976   {
10977     int hr;
10978     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10979     {
10980       if(ba[i]<start || ba[i]>=(start+slen*4))
10981       {
10982         // Branch out of this block, don't need anything
10983         r32=0;
10984       }
10985       else
10986       {
10987         // Internal branch
10988         // Need whatever matches the target
10989         // (and doesn't get overwritten by the delay slot instruction)
10990         r32=0;
10991         int t=(ba[i]-start)>>2;
10992         if(ba[i]>start+i*4) {
10993           // Forward branch
10994           if(!(requires_32bit[t]&~regs[i].was32))
10995             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10996         }else{
10997           // Backward branch
10998           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
10999           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
11000           if(!(pr32[t]&~regs[i].was32))
11001             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
11002         }
11003       }
11004       // Conditional branch may need registers for following instructions
11005       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
11006       {
11007         if(i<slen-2) {
11008           r32|=requires_32bit[i+2];
11009           r32&=regs[i].was32;
11010           // Mark this address as a branch target since it may be called
11011           // upon return from interrupt
11012           bt[i+2]=1;
11013         }
11014       }
11015       // Merge in delay slot
11016       if(!likely[i]) {
11017         // These are overwritten unless the branch is "likely"
11018         // and the delay slot is nullified if not taken
11019         r32&=~(1LL<<rt1[i+1]);
11020         r32&=~(1LL<<rt2[i+1]);
11021       }
11022       // Assume these are needed (delay slot)
11023       if(us1[i+1]>0)
11024       {
11025         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
11026       }
11027       if(us2[i+1]>0)
11028       {
11029         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
11030       }
11031       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
11032       {
11033         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
11034       }
11035       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
11036       {
11037         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
11038       }
11039     }
11040     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
11041     {
11042       // SYSCALL instruction (software interrupt)
11043       r32=0;
11044     }
11045     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
11046     {
11047       // ERET instruction (return from interrupt)
11048       r32=0;
11049     }
11050     // Check 32 bits
11051     r32&=~(1LL<<rt1[i]);
11052     r32&=~(1LL<<rt2[i]);
11053     if(us1[i]>0)
11054     {
11055       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
11056     }
11057     if(us2[i]>0)
11058     {
11059       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
11060     }
11061     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
11062     {
11063       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
11064     }
11065     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
11066     {
11067       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
11068     }
11069     requires_32bit[i]=r32;
11070
11071     // Dirty registers which are 32-bit require 32-bit input
11072     // as they will be written as 32-bit values
11073     for(hr=0;hr<HOST_REGS;hr++)
11074     {
11075       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
11076         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
11077           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
11078           requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
11079         }
11080       }
11081     }
11082     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
11083   }
11084 #else
11085   for (i=slen-1;i>=0;i--)
11086   {
11087     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
11088     {
11089       // Conditional branch
11090       if((source[i]>>16)!=0x1000&&i<slen-2) {
11091         // Mark this address as a branch target since it may be called
11092         // upon return from interrupt
11093         bt[i+2]=1;
11094       }
11095     }
11096   }
11097 #endif
11098
11099   if(itype[slen-1]==SPAN) {
11100     bt[slen-1]=1; // Mark as a branch target so the instruction can restart after an exception
11101   }
11102
11103 #ifdef DISASM
11104   /* Debug/disassembly */
11105   for(i=0;i<slen;i++)
11106   {
11107     printf("U:");
11108     int r;
11109     for(r=1;r<=CCREG;r++) {
11110       if((unneeded_reg[i]>>r)&1) {
11111         if(r==HIREG) printf(" HI");
11112         else if(r==LOREG) printf(" LO");
11113         else printf(" r%d",r);
11114       }
11115     }
11116 #ifndef FORCE32
11117     printf(" UU:");
11118     for(r=1;r<=CCREG;r++) {
11119       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
11120         if(r==HIREG) printf(" HI");
11121         else if(r==LOREG) printf(" LO");
11122         else printf(" r%d",r);
11123       }
11124     }
11125     printf(" 32:");
11126     for(r=0;r<=CCREG;r++) {
11127       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
11128       if((regs[i].was32>>r)&1) {
11129         if(r==CCREG) printf(" CC");
11130         else if(r==HIREG) printf(" HI");
11131         else if(r==LOREG) printf(" LO");
11132         else printf(" r%d",r);
11133       }
11134     }
11135 #endif
11136     printf("\n");
11137     #if defined(__i386__) || defined(__x86_64__)
11138     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
11139     #endif
11140     #ifdef __arm__
11141     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
11142     #endif
11143     printf("needs: ");
11144     if(needed_reg[i]&1) printf("eax ");
11145     if((needed_reg[i]>>1)&1) printf("ecx ");
11146     if((needed_reg[i]>>2)&1) printf("edx ");
11147     if((needed_reg[i]>>3)&1) printf("ebx ");
11148     if((needed_reg[i]>>5)&1) printf("ebp ");
11149     if((needed_reg[i]>>6)&1) printf("esi ");
11150     if((needed_reg[i]>>7)&1) printf("edi ");
11151     printf("r:");
11152     for(r=0;r<=CCREG;r++) {
11153       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
11154       if((requires_32bit[i]>>r)&1) {
11155         if(r==CCREG) printf(" CC");
11156         else if(r==HIREG) printf(" HI");
11157         else if(r==LOREG) printf(" LO");
11158         else printf(" r%d",r);
11159       }
11160     }
11161     printf("\n");
11162     /*printf("pr:");
11163     for(r=0;r<=CCREG;r++) {
11164       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
11165       if((pr32[i]>>r)&1) {
11166         if(r==CCREG) printf(" CC");
11167         else if(r==HIREG) printf(" HI");
11168         else if(r==LOREG) printf(" LO");
11169         else printf(" r%d",r);
11170       }
11171     }
11172     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
11173     printf("\n");*/
11174     #if defined(__i386__) || defined(__x86_64__)
11175     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
11176     printf("dirty: ");
11177     if(regs[i].wasdirty&1) printf("eax ");
11178     if((regs[i].wasdirty>>1)&1) printf("ecx ");
11179     if((regs[i].wasdirty>>2)&1) printf("edx ");
11180     if((regs[i].wasdirty>>3)&1) printf("ebx ");
11181     if((regs[i].wasdirty>>5)&1) printf("ebp ");
11182     if((regs[i].wasdirty>>6)&1) printf("esi ");
11183     if((regs[i].wasdirty>>7)&1) printf("edi ");
11184     #endif
11185     #ifdef __arm__
11186     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
11187     printf("dirty: ");
11188     if(regs[i].wasdirty&1) printf("r0 ");
11189     if((regs[i].wasdirty>>1)&1) printf("r1 ");
11190     if((regs[i].wasdirty>>2)&1) printf("r2 ");
11191     if((regs[i].wasdirty>>3)&1) printf("r3 ");
11192     if((regs[i].wasdirty>>4)&1) printf("r4 ");
11193     if((regs[i].wasdirty>>5)&1) printf("r5 ");
11194     if((regs[i].wasdirty>>6)&1) printf("r6 ");
11195     if((regs[i].wasdirty>>7)&1) printf("r7 ");
11196     if((regs[i].wasdirty>>8)&1) printf("r8 ");
11197     if((regs[i].wasdirty>>9)&1) printf("r9 ");
11198     if((regs[i].wasdirty>>10)&1) printf("r10 ");
11199     if((regs[i].wasdirty>>12)&1) printf("r12 ");
11200     #endif
11201     printf("\n");
11202     disassemble_inst(i);
11203     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
11204     #if defined(__i386__) || defined(__x86_64__)
11205     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
11206     if(regs[i].dirty&1) printf("eax ");
11207     if((regs[i].dirty>>1)&1) printf("ecx ");
11208     if((regs[i].dirty>>2)&1) printf("edx ");
11209     if((regs[i].dirty>>3)&1) printf("ebx ");
11210     if((regs[i].dirty>>5)&1) printf("ebp ");
11211     if((regs[i].dirty>>6)&1) printf("esi ");
11212     if((regs[i].dirty>>7)&1) printf("edi ");
11213     #endif
11214     #ifdef __arm__
11215     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
11216     if(regs[i].dirty&1) printf("r0 ");
11217     if((regs[i].dirty>>1)&1) printf("r1 ");
11218     if((regs[i].dirty>>2)&1) printf("r2 ");
11219     if((regs[i].dirty>>3)&1) printf("r3 ");
11220     if((regs[i].dirty>>4)&1) printf("r4 ");
11221     if((regs[i].dirty>>5)&1) printf("r5 ");
11222     if((regs[i].dirty>>6)&1) printf("r6 ");
11223     if((regs[i].dirty>>7)&1) printf("r7 ");
11224     if((regs[i].dirty>>8)&1) printf("r8 ");
11225     if((regs[i].dirty>>9)&1) printf("r9 ");
11226     if((regs[i].dirty>>10)&1) printf("r10 ");
11227     if((regs[i].dirty>>12)&1) printf("r12 ");
11228     #endif
11229     printf("\n");
11230     if(regs[i].isconst) {
11231       printf("constants: ");
11232       #if defined(__i386__) || defined(__x86_64__)
11233       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
11234       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
11235       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
11236       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
11237       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
11238       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
11239       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
11240       #endif
11241       #ifdef __arm__
11242       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
11243       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
11244       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
11245       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
11246       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
11247       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
11248       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
11249       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
11250       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
11251       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
11252       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
11253       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
11254       #endif
11255       printf("\n");
11256     }
11257 #ifndef FORCE32
11258     printf(" 32:");
11259     for(r=0;r<=CCREG;r++) {
11260       if((regs[i].is32>>r)&1) {
11261         if(r==CCREG) printf(" CC");
11262         else if(r==HIREG) printf(" HI");
11263         else if(r==LOREG) printf(" LO");
11264         else printf(" r%d",r);
11265       }
11266     }
11267     printf("\n");
11268 #endif
11269     /*printf(" p32:");
11270     for(r=0;r<=CCREG;r++) {
11271       if((p32[i]>>r)&1) {
11272         if(r==CCREG) printf(" CC");
11273         else if(r==HIREG) printf(" HI");
11274         else if(r==LOREG) printf(" LO");
11275         else printf(" r%d",r);
11276       }
11277     }
11278     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
11279     else printf("\n");*/
11280     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
11281       #if defined(__i386__) || defined(__x86_64__)
11282       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
11283       if(branch_regs[i].dirty&1) printf("eax ");
11284       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
11285       if((branch_regs[i].dirty>>2)&1) printf("edx ");
11286       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
11287       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
11288       if((branch_regs[i].dirty>>6)&1) printf("esi ");
11289       if((branch_regs[i].dirty>>7)&1) printf("edi ");
11290       #endif
11291       #ifdef __arm__
11292       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
11293       if(branch_regs[i].dirty&1) printf("r0 ");
11294       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
11295       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
11296       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
11297       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
11298       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
11299       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
11300       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
11301       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
11302       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
11303       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
11304       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
11305       #endif
11306 #ifndef FORCE32
11307       printf(" 32:");
11308       for(r=0;r<=CCREG;r++) {
11309         if((branch_regs[i].is32>>r)&1) {
11310           if(r==CCREG) printf(" CC");
11311           else if(r==HIREG) printf(" HI");
11312           else if(r==LOREG) printf(" LO");
11313           else printf(" r%d",r);
11314         }
11315       }
11316       printf("\n");
11317 #endif
11318     }
11319   }
11320 #endif // DISASM
11321
11322   /* Pass 8 - Assembly */
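        // For each instruction: write back host registers that no longer hold
        // the same guest register, record instr_addr[i] as the branch target
        // entry point, load the registers named in regmap_entry, then emit
        // native code through the itype switch below.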
11323   linkcount=0;stubcount=0;
11324   ds=0;is_delayslot=0;
11325   cop1_usable=0;
11326   uint64_t is32_pre=0;
11327   u_int dirty_pre=0;
11328   u_int beginning=(u_int)out;
11329   if((u_int)addr&1) {
11330     ds=1;
11331     pagespan_ds();
11332   }
11333   u_int instr_addr0_override=0;
11334
11335 #ifdef PCSX
11336   if (start == 0x80030000) {
11337     // nasty hack for fastbios thing
11338     // override block entry to this code
11339     instr_addr0_override=(u_int)out;
11340     emit_movimm(start,0);
11341     // abuse io address var as a flag that we
11342     // have already returned here once
11343     emit_readword((int)&address,1);
11344     emit_writeword(0,(int)&pcaddr);
11345     emit_writeword(0,(int)&address);
11346     emit_cmp(0,1);
11347     emit_jne((int)new_dyna_leave);
11348   }
11349 #endif
11350   for(i=0;i<slen;i++)
11351   {
11352     //if(ds) printf("ds: ");
11353     disassemble_inst(i);
11354     if(ds) {
11355       ds=0; // Skip delay slot
11356       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
11357       instr_addr[i]=0;
11358     } else {
11359       speculate_register_values(i);
11360       #ifndef DESTRUCTIVE_WRITEBACK
11361       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
11362       {
11363         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
11364               unneeded_reg[i],unneeded_reg_upper[i]);
11365         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
11366               unneeded_reg[i],unneeded_reg_upper[i]);
11367       }
11368       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
11369         is32_pre=branch_regs[i].is32;
11370         dirty_pre=branch_regs[i].dirty;
11371       }else{
11372         is32_pre=regs[i].is32;
11373         dirty_pre=regs[i].dirty;
11374       }
11375       #endif
11376       // write back
11377       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
11378       {
11379         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
11380                       unneeded_reg[i],unneeded_reg_upper[i]);
11381         loop_preload(regmap_pre[i],regs[i].regmap_entry);
11382       }
11383       // branch target entry point
11384       instr_addr[i]=(u_int)out;
11385       assem_debug("<->\n");
11386       // load regs
11387       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
11388         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
11389       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
11390       address_generation(i,&regs[i],regs[i].regmap_entry);
11391       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
11392       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
11393       {
11394         // Load the delay slot registers if necessary
11395         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
11396           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
11397         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
11398           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
11399         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
11400           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
11401       }
11402       else if(i+1<slen)
11403       {
11404         // Preload registers for following instruction
11405         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
11406           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
11407             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
11408         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
11409           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
11410             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
11411       }
11412       // TODO: if(is_ooo(i)) address_generation(i+1);
11413       if(itype[i]==CJUMP||itype[i]==FJUMP)
11414         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
11415       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
11416         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
11417       if(bt[i]) cop1_usable=0;
11418       // assemble
11419       switch(itype[i]) {
11420         case ALU:
11421           alu_assemble(i,&regs[i]);break;
11422         case IMM16:
11423           imm16_assemble(i,&regs[i]);break;
11424         case SHIFT:
11425           shift_assemble(i,&regs[i]);break;
11426         case SHIFTIMM:
11427           shiftimm_assemble(i,&regs[i]);break;
11428         case LOAD:
11429           load_assemble(i,&regs[i]);break;
11430         case LOADLR:
11431           loadlr_assemble(i,&regs[i]);break;
11432         case STORE:
11433           store_assemble(i,&regs[i]);break;
11434         case STORELR:
11435           storelr_assemble(i,&regs[i]);break;
11436         case COP0:
11437           cop0_assemble(i,&regs[i]);break;
11438         case COP1:
11439           cop1_assemble(i,&regs[i]);break;
11440         case C1LS:
11441           c1ls_assemble(i,&regs[i]);break;
11442         case COP2:
11443           cop2_assemble(i,&regs[i]);break;
11444         case C2LS:
11445           c2ls_assemble(i,&regs[i]);break;
11446         case C2OP:
11447           c2op_assemble(i,&regs[i]);break;
11448         case FCONV:
11449           fconv_assemble(i,&regs[i]);break;
11450         case FLOAT:
11451           float_assemble(i,&regs[i]);break;
11452         case FCOMP:
11453           fcomp_assemble(i,&regs[i]);break;
11454         case MULTDIV:
11455           multdiv_assemble(i,&regs[i]);break;
11456         case MOV:
11457           mov_assemble(i,&regs[i]);break;
11458         case SYSCALL:
11459           syscall_assemble(i,&regs[i]);break;
11460         case HLECALL:
11461           hlecall_assemble(i,&regs[i]);break;
11462         case INTCALL:
11463           intcall_assemble(i,&regs[i]);break;
11464         case UJUMP:
11465           ujump_assemble(i,&regs[i]);ds=1;break;
11466         case RJUMP:
11467           rjump_assemble(i,&regs[i]);ds=1;break;
11468         case CJUMP:
11469           cjump_assemble(i,&regs[i]);ds=1;break;
11470         case SJUMP:
11471           sjump_assemble(i,&regs[i]);ds=1;break;
11472         case FJUMP:
11473           fjump_assemble(i,&regs[i]);ds=1;break;
11474         case SPAN:
11475           pagespan_assemble(i,&regs[i]);break;
11476       }
11477       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
11478         literal_pool(1024);
11479       else
11480         literal_pool_jumpover(256);
11481     }
11482   }
11483   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
11484   // If the block did not end with an unconditional branch,
11485   // add a jump to the next instruction.
11486   if(i>1) {
11487     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
11488       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
11489       assert(i==slen);
11490       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
11491         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
11492         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
11493           emit_loadreg(CCREG,HOST_CCREG);
11494         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
11495       }
11496       else if(!likely[i-2])
11497       {
11498         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
11499         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
11500       }
11501       else
11502       {
11503         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
11504         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
11505       }
11506       add_to_linker((int)out,start+i*4,0);
11507       emit_jmp(0);
11508     }
11509   }
11510   else
11511   {
11512     assert(i>0);
11513     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
11514     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
11515     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
11516       emit_loadreg(CCREG,HOST_CCREG);
11517     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
11518     add_to_linker((int)out,start+i*4,0);
11519     emit_jmp(0);
11520   }
11521
11522   // TODO: delay slot stubs?
11523   // Stubs
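        // Stubs are out-of-line slow paths emitted after the main block code:
        // memory access handlers for the read/write stubs, cycle count /
        // interrupt checks (CC_STUB), block invalidation on self-modifying
        // writes (INVCODE_STUB), a COP1-unusable path (FP_STUB) and unaligned
        // store handling (STORELR_STUB).  stubs[i][0] holds the type and
        // selects the generator below.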
11524   for(i=0;i<stubcount;i++)
11525   {
11526     switch(stubs[i][0])
11527     {
11528       case LOADB_STUB:
11529       case LOADH_STUB:
11530       case LOADW_STUB:
11531       case LOADD_STUB:
11532       case LOADBU_STUB:
11533       case LOADHU_STUB:
11534         do_readstub(i);break;
11535       case STOREB_STUB:
11536       case STOREH_STUB:
11537       case STOREW_STUB:
11538       case STORED_STUB:
11539         do_writestub(i);break;
11540       case CC_STUB:
11541         do_ccstub(i);break;
11542       case INVCODE_STUB:
11543         do_invstub(i);break;
11544       case FP_STUB:
11545         do_cop1stub(i);break;
11546       case STORELR_STUB:
11547         do_unalignedwritestub(i);break;
11548     }
11549   }
11550
11551   if (instr_addr0_override)
11552     instr_addr[0] = instr_addr0_override;
11553
11554   /* Pass 9 - Linker */
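        // link_addr[i] describes one emitted branch: [0] is the location of
        // the branch in the output buffer, [1] the target PC, [2] whether the
        // target is inside this block.  Internal branches are patched straight
        // to instr_addr[target]; external ones get an extjump stub, and are
        // short-circuited to the target block immediately if check_addr()
        // finds it already compiled.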
11555   for(i=0;i<linkcount;i++)
11556   {
11557     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
11558     literal_pool(64);
11559     if(!link_addr[i][2])
11560     {
11561       void *stub=out;
11562       void *addr=check_addr(link_addr[i][1]);
11563       emit_extjump(link_addr[i][0],link_addr[i][1]);
11564       if(addr) {
11565         set_jump_target(link_addr[i][0],(int)addr);
11566         add_link(link_addr[i][1],stub);
11567       }
11568       else set_jump_target(link_addr[i][0],(int)stub);
11569     }
11570     else
11571     {
11572       // Internal branch
11573       int target=(link_addr[i][1]-start)>>2;
11574       assert(target>=0&&target<slen);
11575       assert(instr_addr[target]);
11576       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11577       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
11578       //#else
11579       set_jump_target(link_addr[i][0],instr_addr[target]);
11580       //#endif
11581     }
11582   }
11583   // External Branch Targets (jump_in)
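        // Every branch target (and the block entry) gets a dirty-check stub
        // which verifies the source against the shadow copy before entering
        // the block.  The stub is recorded in jump_dirty and jump_in, and any
        // existing hash table entry for this vaddr is updated to the new
        // entry point.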
11584   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
11585   for(i=0;i<slen;i++)
11586   {
11587     if(bt[i]||i==0)
11588     {
11589       if(instr_addr[i]) // TODO - delay slots (=null)
11590       {
11591         u_int vaddr=start+i*4;
11592         u_int page=get_page(vaddr);
11593         u_int vpage=get_vpage(vaddr);
11594         literal_pool(256);
11595         {
11596           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11597           assem_debug("jump_in: %x\n",start+i*4);
11598           ll_add(jump_dirty+vpage,vaddr,(void *)out);
11599           int entry_point=do_dirty_stub(i);
11600           ll_add_flags(jump_in+page,vaddr,state_rflags,(void *)entry_point);
11601           // If there was an existing entry in the hash table,
11602           // replace it with the new address.
11603           // Don't add new entries.  We'll insert the
11604           // ones that actually get used in check_addr().
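                // each hash bin holds two {vaddr, entry point} pairs:
                // ht_bin[0]/[1] and ht_bin[2]/[3]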
11605           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
11606           if(ht_bin[0]==vaddr) {
11607             ht_bin[1]=entry_point;
11608           }
11609           if(ht_bin[2]==vaddr) {
11610             ht_bin[3]=entry_point;
11611           }
11612         }
11613       }
11614     }
11615   }
11616   // Write out the literal pool if necessary
11617   literal_pool(0);
11618   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11619   // Align code
11620   if(((u_int)out)&7) emit_addnop(13);
11621   #endif
11622   assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
11623   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
11624   memcpy(copy,source,slen*4);
11625   copy+=slen*4;
11626
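        // ARM does not keep the instruction cache coherent with newly written
        // data, so the emitted block has to be flushed/invalidated before it
        // can run; on VITA the kernel VM domain is also closed again before
        // the flush.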
11627   #ifdef __arm__
11628   #if defined(VITA)
11629     sceKernelCloseVMDomain();
11630   #endif
11631   __clear_cache((void *)beginning,out);
11632   #endif
11633
11634   // If we're within 256K of the end of the buffer,
11635   // start over from the beginning. (Is 256K enough?)
11636   if((u_int)out>(u_int)BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
11637
11638   // Trap writes to any of the pages we compiled
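        // Clearing invalid_code[] marks the page as containing compiled code.
        // Setting bit 0x40000000 in memory_map[] appears to act as the
        // write-protect flag that the generated store path checks, so that
        // writes into these pages invalidate the affected blocks.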
11639   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
11640     invalid_code[i]=0;
11641 #ifndef DISABLE_TLB
11642     memory_map[i]|=0x40000000;
11643     if((signed int)start>=(signed int)0xC0000000) {
11644       assert(using_tlb);
11645       j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
11646       invalid_code[j]=0;
11647       memory_map[j]|=0x40000000;
11648       //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
11649     }
11650 #endif
11651   }
11652   inv_code_start=inv_code_end=~0;
11653 #ifdef PCSX
11654   // for PCSX we need to mark all mirrors too
11655   if(get_page(start)<(RAM_SIZE>>12))
11656     for(i=start>>12;i<=(start+slen*4)>>12;i++)
11657       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
11658       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
11659       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
11660 #endif
11661
11662   /* Pass 10 - Free memory by expiring oldest blocks */
11663
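        // The translation cache is treated as a ring buffer: expirep chases
        // the output pointer in 65536 steps, and each step clears one slice of
        // one of four structures selected by (expirep>>11)&3 (jump_in and
        // jump_dirty lists, jump_out pointers, the hash table, jump_out lists)
        // for the part of the cache that is about to be reused.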
11664   int end=((((int)out-(int)BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
11665   while(expirep!=end)
11666   {
11667     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
11668     int base=(int)BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
11669     inv_debug("EXP: Phase %d\n",expirep);
11670     switch((expirep>>11)&3)
11671     {
11672       case 0:
11673         // Clear jump_in and jump_dirty
11674         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
11675         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
11676         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
11677         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
11678         break;
11679       case 1:
11680         // Clear pointers
11681         ll_kill_pointers(jump_out[expirep&2047],base,shift);
11682         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
11683         break;
11684       case 2:
11685         // Clear hash table
11686         for(i=0;i<32;i++) {
11687           int *ht_bin=hash_table[((expirep&2047)<<5)+i];
11688           if((ht_bin[3]>>shift)==(base>>shift) ||
11689              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11690             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
11691             ht_bin[2]=ht_bin[3]=-1;
11692           }
11693           if((ht_bin[1]>>shift)==(base>>shift) ||
11694              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11695             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
11696             ht_bin[0]=ht_bin[2];
11697             ht_bin[1]=ht_bin[3];
11698             ht_bin[2]=ht_bin[3]=-1;
11699           }
11700         }
11701         break;
11702       case 3:
11703         // Clear jump_out
11704         #ifdef __arm__
11705         if((expirep&2047)==0)
11706           do_clear_cache();
11707         #endif
11708         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
11709         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
11710         break;
11711     }
11712     expirep=(expirep+1)&65535;
11713   }
11714   return 0;
11715 }
11716
11717 // vim:shiftwidth=2:expandtab