1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <sys/mman.h>
25
26 #include "emu_if.h" //emulator interface
27
28 //#define DISASM
29 //#define assem_debug printf
30 //#define inv_debug printf
31 #define assem_debug(...)
32 #define inv_debug(...)
33
34 #ifdef __i386__
35 #include "assem_x86.h"
36 #endif
37 #ifdef __x86_64__
38 #include "assem_x64.h"
39 #endif
40 #ifdef __arm__
41 #include "assem_arm.h"
42 #endif
43
44 #ifdef __BLACKBERRY_QNX__
45 #undef __clear_cache
46 #define __clear_cache(start,end) msync(start, (size_t)((void*)end - (void*)start), MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
47 #endif
48
49 #define MAXBLOCK 4096
50 #define MAX_OUTPUT_BLOCK_SIZE 262144
51
52 struct regstat
53 {
54   signed char regmap_entry[HOST_REGS];
55   signed char regmap[HOST_REGS];
56   uint64_t was32;
57   uint64_t is32;
58   uint64_t wasdirty;
59   uint64_t dirty;
60   uint64_t u;
61   uint64_t uu;
62   u_int wasconst;
63   u_int isconst;
64   u_int loadedconst;             // host regs that have constants loaded
65   u_int waswritten;              // MIPS regs that were used as store base before
66 };
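/* Rough meaning of the regstat fields, as they are used throughout this file:
 *   regmap_entry/regmap - which MIPS register each host register holds
 *                         (-1 = free, 64+r = upper half of 64-bit reg r)
 *   was32/is32          - bitmask of MIPS regs known to hold 32-bit
 *                         (sign-extended) values
 *   wasdirty/dirty      - host regs whose value must be written back to the
 *                         MIPS register file
 *   u/uu                - MIPS regs (lower/upper half) whose value is unneeded
 *   wasconst/isconst    - host regs currently holding a known constant
 *                         (the values live in constmap/current_constmap) */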
67
68 struct ll_entry
69 {
70   u_int vaddr;
71   u_int reg32;
72   void *addr;
73   struct ll_entry *next;
74 };
75
76   u_int start;
77   u_int *source;
78   u_int pagelimit;
79   char insn[MAXBLOCK][10];
80   u_char itype[MAXBLOCK];
81   u_char opcode[MAXBLOCK];
82   u_char opcode2[MAXBLOCK];
83   u_char bt[MAXBLOCK];
84   u_char rs1[MAXBLOCK];
85   u_char rs2[MAXBLOCK];
86   u_char rt1[MAXBLOCK];
87   u_char rt2[MAXBLOCK];
88   u_char us1[MAXBLOCK];
89   u_char us2[MAXBLOCK];
90   u_char dep1[MAXBLOCK];
91   u_char dep2[MAXBLOCK];
92   u_char lt1[MAXBLOCK];
93   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
94   static uint64_t gte_rt[MAXBLOCK];
95   static uint64_t gte_unneeded[MAXBLOCK];
96   static u_int smrv[32]; // speculated MIPS register values
97   static u_int smrv_strong; // mask of regs that are likely to have correct values
98   static u_int smrv_weak; // same, but somewhat less likely
99   static u_int smrv_strong_next; // same, but after current insn executes
100   static u_int smrv_weak_next;
101   int imm[MAXBLOCK];
102   u_int ba[MAXBLOCK];
103   char likely[MAXBLOCK];
104   char is_ds[MAXBLOCK];
105   char ooo[MAXBLOCK];
106   uint64_t unneeded_reg[MAXBLOCK];
107   uint64_t unneeded_reg_upper[MAXBLOCK];
108   uint64_t branch_unneeded_reg[MAXBLOCK];
109   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
110   uint64_t p32[MAXBLOCK];
111   uint64_t pr32[MAXBLOCK];
112   signed char regmap_pre[MAXBLOCK][HOST_REGS];
113   static uint64_t current_constmap[HOST_REGS];
114   static uint64_t constmap[MAXBLOCK][HOST_REGS];
115   static struct regstat regs[MAXBLOCK];
116   static struct regstat branch_regs[MAXBLOCK];
117   signed char minimum_free_regs[MAXBLOCK];
118   u_int needed_reg[MAXBLOCK];
119   uint64_t requires_32bit[MAXBLOCK];
120   u_int wont_dirty[MAXBLOCK];
121   u_int will_dirty[MAXBLOCK];
122   int ccadj[MAXBLOCK];
123   int slen;
124   u_int instr_addr[MAXBLOCK];
125   u_int link_addr[MAXBLOCK][3];
126   int linkcount;
127   u_int stubs[MAXBLOCK*3][8];
128   int stubcount;
129   u_int literals[1024][2];
130   int literalcount;
131   int is_delayslot;
132   int cop1_usable;
133   u_char *out;
134   struct ll_entry *jump_in[4096];
135   struct ll_entry *jump_out[4096];
136   struct ll_entry *jump_dirty[4096];
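  /* jump_in[page]     - clean, directly executable blocks starting in 'page'
   * jump_out[page]    - branch sites elsewhere that were hard-linked into
   *                     'page' (unlinked again via invalidate_page/kill_pointer)
   * jump_dirty[vpage] - blocks whose source may have been overwritten; they
   *                     are only reused after verify_dirty() confirms the code
   *                     still matches (see get_addr and clean_blocks below) */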
137   u_int hash_table[65536][4]  __attribute__((aligned(16)));
138   char shadow[1048576]  __attribute__((aligned(16)));
139   void *copy;
140   int expirep;
141 #ifndef PCSX
142   u_int using_tlb;
143 #else
144   static const u_int using_tlb=0;
145 #endif
146   int new_dynarec_did_compile;
147   int new_dynarec_hacks;
148   u_int stop_after_jal;
149 #ifndef RAM_FIXED
150   static u_int ram_offset;
151 #else
152   static const u_int ram_offset=0;
153 #endif
154   extern u_char restore_candidate[512];
155   extern int cycle_count;
156
157   /* registers that may be allocated */
158   /* 1-31 gpr */
159 #define HIREG 32 // hi
160 #define LOREG 33 // lo
161 #define FSREG 34 // FPU status (FCSR)
162 #define CSREG 35 // Coprocessor status
163 #define CCREG 36 // Cycle count
164 #define INVCP 37 // Pointer to invalid_code
165 #define MMREG 38 // Pointer to memory_map
166 #define ROREG 39 // ram offset (if rdram!=0x80000000)
167 #define TEMPREG 40
168 #define FTEMP 40 // FPU temporary register
169 #define PTEMP 41 // Prefetch temporary register
170 #define TLREG 42 // TLB mapping offset
171 #define RHASH 43 // Return address hash
172 #define RHTBL 44 // Return address hash table address
173 #define RTEMP 45 // JR/JALR address register
174 #define MAXREG 45
175 #define AGEN1 46 // Address generation temporary register
176 #define AGEN2 47 // Address generation temporary register
177 #define MGEN1 48 // Maptable address generation temporary register
178 #define MGEN2 49 // Maptable address generation temporary register
179 #define BTREG 50 // Branch target temporary register
180
181   /* instruction types */
182 #define NOP 0     // No operation
183 #define LOAD 1    // Load
184 #define STORE 2   // Store
185 #define LOADLR 3  // Unaligned load
186 #define STORELR 4 // Unaligned store
187 #define MOV 5     // Move 
188 #define ALU 6     // Arithmetic/logic
189 #define MULTDIV 7 // Multiply/divide
190 #define SHIFT 8   // Shift by register
191 #define SHIFTIMM 9// Shift by immediate
192 #define IMM16 10  // 16-bit immediate
193 #define RJUMP 11  // Unconditional jump to register
194 #define UJUMP 12  // Unconditional jump
195 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
196 #define SJUMP 14  // Conditional branch (regimm format)
197 #define COP0 15   // Coprocessor 0
198 #define COP1 16   // Coprocessor 1
199 #define C1LS 17   // Coprocessor 1 load/store
200 #define FJUMP 18  // Conditional branch (floating point)
201 #define FLOAT 19  // Floating point unit
202 #define FCONV 20  // Convert integer to float
203 #define FCOMP 21  // Floating point compare (sets FSREG)
204 #define SYSCALL 22// SYSCALL
205 #define OTHER 23  // Other
206 #define SPAN 24   // Branch/delay slot spans 2 pages
207 #define NI 25     // Not implemented
208 #define HLECALL 26// PCSX fake opcodes for HLE
209 #define COP2 27   // Coprocessor 2 move
210 #define C2LS 28   // Coprocessor 2 load/store
211 #define C2OP 29   // Coprocessor 2 operation
212 #define INTCALL 30// Call interpreter to handle rare corner cases
213
214   /* stubs */
215 #define CC_STUB 1
216 #define FP_STUB 2
217 #define LOADB_STUB 3
218 #define LOADH_STUB 4
219 #define LOADW_STUB 5
220 #define LOADD_STUB 6
221 #define LOADBU_STUB 7
222 #define LOADHU_STUB 8
223 #define STOREB_STUB 9
224 #define STOREH_STUB 10
225 #define STOREW_STUB 11
226 #define STORED_STUB 12
227 #define STORELR_STUB 13
228 #define INVCODE_STUB 14
229
230   /* branch codes */
231 #define TAKEN 1
232 #define NOTTAKEN 2
233 #define NULLDS 3
234
235 // asm linkage
236 int new_recompile_block(int addr);
237 void *get_addr_ht(u_int vaddr);
238 void invalidate_block(u_int block);
239 void invalidate_addr(u_int addr);
240 void remove_hash(int vaddr);
241 void jump_vaddr();
242 void dyna_linker();
243 void dyna_linker_ds();
244 void verify_code();
245 void verify_code_vm();
246 void verify_code_ds();
247 void cc_interrupt();
248 void fp_exception();
249 void fp_exception_ds();
250 void jump_syscall();
251 void jump_syscall_hle();
252 void jump_eret();
253 void jump_hlecall();
254 void jump_intcall();
255 void new_dyna_leave();
256
257 // TLB
258 void TLBWI_new();
259 void TLBWR_new();
260 void read_nomem_new();
261 void read_nomemb_new();
262 void read_nomemh_new();
263 void read_nomemd_new();
264 void write_nomem_new();
265 void write_nomemb_new();
266 void write_nomemh_new();
267 void write_nomemd_new();
268 void write_rdram_new();
269 void write_rdramb_new();
270 void write_rdramh_new();
271 void write_rdramd_new();
272 extern u_int memory_map[1048576];
273
274 // Needed by assembler
275 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
276 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
277 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
278 void load_all_regs(signed char i_regmap[]);
279 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
280 void load_regs_entry(int t);
281 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
282
283 int tracedebug=0;
284
285 //#define DEBUG_CYCLE_COUNT 1
286
287 #define NO_CYCLE_PENALTY_THR 12
288
289 int cycle_multiplier; // 100 for 1.0
290
291 static int CLOCK_ADJUST(int x)
292 {
293   int s=(x>>31)|1;
294   return (x * cycle_multiplier + s * 50) / 100;
295 }
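/* CLOCK_ADJUST scales a cycle count by cycle_multiplier/100, rounded to the
 * nearest integer (ties away from zero): (x>>31)|1 evaluates to +1 for x>=0
 * and -1 for x<0 on the two's-complement, arithmetic-shift targets this file
 * supports, so the +-50 nudges the truncating division.  For example, with
 * cycle_multiplier=125, CLOCK_ADJUST(2) = (250+50)/100 = 3 and
 * CLOCK_ADJUST(-2) = (-250-50)/100 = -3. */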
296
297 static void tlb_hacks()
298 {
299 #ifndef DISABLE_TLB
300   // Goldeneye hack
301   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
302   {
303     u_int addr;
304     int n;
305     switch (ROM_HEADER->Country_code&0xFF) 
306     {
307       case 0x45: // U
308         addr=0x34b30;
309         break;                   
310       case 0x4A: // J 
311         addr=0x34b70;    
312         break;    
313       case 0x50: // E 
314         addr=0x329f0;
315         break;                        
316       default: 
317         // Unknown country code
318         addr=0;
319         break;
320     }
321     u_int rom_addr=(u_int)rom;
322     #ifdef ROM_COPY
323     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
324     // in the lower 4G of memory to use this hack.  Copy it if necessary.
325     if((void *)rom>(void *)0xffffffff) {
326       munmap(ROM_COPY, 67108864);
327       if(mmap(ROM_COPY, 12582912,
328               PROT_READ | PROT_WRITE,
329               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
330               -1, 0) <= 0) {printf("mmap() failed\n");}
331       memcpy(ROM_COPY,rom,12582912);
332       rom_addr=(u_int)ROM_COPY;
333     }
334     #endif
335     if(addr) {
336       for(n=0x7F000;n<0x80000;n++) {
337         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
338       }
339     }
340   }
341 #endif
342 }
343
344 static u_int get_page(u_int vaddr)
345 {
346 #ifndef PCSX
347   u_int page=(vaddr^0x80000000)>>12;
348 #else
349   u_int page=vaddr&~0xe0000000;
350   if (page < 0x1000000)
351     page &= ~0x0e00000; // RAM mirrors
352   page>>=12;
353 #endif
354 #ifndef DISABLE_TLB
355   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
356 #endif
357   if(page>2048) page=2048+(page&2047);
358   return page;
359 }
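/* Worked example for the PCSX build: vaddr 0x80201234 -> &~0xe0000000 gives
 * 0x00201234, the RAM-mirror mask folds it to 0x00001234, and >>12 yields
 * page 1 - the same page as 0x80001234, so all RAM mirrors share one page.
 * A non-RAM address such as 0xbfc00000 (BIOS) comes out above 2048 and is
 * folded into the 2048..4095 range by the final check (page 3072 here). */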
360
361 #ifndef PCSX
362 static u_int get_vpage(u_int vaddr)
363 {
364   u_int vpage=(vaddr^0x80000000)>>12;
365 #ifndef DISABLE_TLB
366   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
367 #endif
368   if(vpage>2048) vpage=2048+(vpage&2047);
369   return vpage;
370 }
371 #else
372 // no virtual mem in PCSX
373 static u_int get_vpage(u_int vaddr)
374 {
375   return get_page(vaddr);
376 }
377 #endif
378
379 // Get address from virtual address
380 // This is called from the recompiled JR/JALR instructions
381 void *get_addr(u_int vaddr)
382 {
383   u_int page=get_page(vaddr);
384   u_int vpage=get_vpage(vaddr);
385   struct ll_entry *head;
386   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
387   head=jump_in[page];
388   while(head!=NULL) {
389     if(head->vaddr==vaddr&&head->reg32==0) {
390   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
391       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
392       ht_bin[3]=ht_bin[1];
393       ht_bin[2]=ht_bin[0];
394       ht_bin[1]=(int)head->addr;
395       ht_bin[0]=vaddr;
396       return head->addr;
397     }
398     head=head->next;
399   }
400   head=jump_dirty[vpage];
401   while(head!=NULL) {
402     if(head->vaddr==vaddr&&head->reg32==0) {
403       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
404       // Don't restore blocks which are about to expire from the cache
405       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
406       if(verify_dirty(head->addr)) {
407         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
408         invalid_code[vaddr>>12]=0;
409         inv_code_start=inv_code_end=~0;
410 #ifndef DISABLE_TLB
411         memory_map[vaddr>>12]|=0x40000000;
412 #endif
413         if(vpage<2048) {
414 #ifndef DISABLE_TLB
415           if(tlb_LUT_r[vaddr>>12]) {
416             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
417             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
418           }
419 #endif
420           restore_candidate[vpage>>3]|=1<<(vpage&7);
421         }
422         else restore_candidate[page>>3]|=1<<(page&7);
423         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
424         if(ht_bin[0]==vaddr) {
425           ht_bin[1]=(int)head->addr; // Replace existing entry
426         }
427         else
428         {
429           ht_bin[3]=ht_bin[1];
430           ht_bin[2]=ht_bin[0];
431           ht_bin[1]=(int)head->addr;
432           ht_bin[0]=vaddr;
433         }
434         return head->addr;
435       }
436     }
437     head=head->next;
438   }
439   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
440   int r=new_recompile_block(vaddr);
441   if(r==0) return get_addr(vaddr);
442   // Execute in unmapped page, generate page fault exception
443   Status|=2;
444   Cause=(vaddr<<31)|0x8;
445   EPC=(vaddr&1)?vaddr-5:vaddr;
446   BadVAddr=(vaddr&~1);
447   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
448   EntryHi=BadVAddr&0xFFFFE000;
449   return get_addr_ht(0x80000000);
450 }
451 // Look up address in hash table first
452 void *get_addr_ht(u_int vaddr)
453 {
454   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
455   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
456   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
457   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
458   return get_addr(vaddr);
459 }
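/* The hash table is a 2-way cache indexed by ((vaddr>>16)^vaddr)&0xFFFF:
 *   ht_bin[0]/ht_bin[1] = vaddr/compiled address of the most recent entry
 *   ht_bin[2]/ht_bin[3] = vaddr/compiled address of the previous entry
 * get_addr() promotes a fresh match into slots 0/1 and demotes the old pair
 * to 2/3; get_addr_ht() just probes both pairs before falling back to the
 * full page-list search (and possibly recompiling) in get_addr(). */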
460
461 void *get_addr_32(u_int vaddr,u_int flags)
462 {
463 #ifdef FORCE32
464   return get_addr(vaddr);
465 #else
466   //printf("TRACE: count=%d next=%d (get_addr_32 %x,flags %x)\n",Count,next_interupt,vaddr,flags);
467   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
468   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
469   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
470   u_int page=get_page(vaddr);
471   u_int vpage=get_vpage(vaddr);
472   struct ll_entry *head;
473   head=jump_in[page];
474   while(head!=NULL) {
475     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
476       //printf("TRACE: count=%d next=%d (get_addr_32 match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
477       if(head->reg32==0) {
478         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
479         if(ht_bin[0]==-1) {
480           ht_bin[1]=(int)head->addr;
481           ht_bin[0]=vaddr;
482         }else if(ht_bin[2]==-1) {
483           ht_bin[3]=(int)head->addr;
484           ht_bin[2]=vaddr;
485         }
486         //ht_bin[3]=ht_bin[1];
487         //ht_bin[2]=ht_bin[0];
488         //ht_bin[1]=(int)head->addr;
489         //ht_bin[0]=vaddr;
490       }
491       return head->addr;
492     }
493     head=head->next;
494   }
495   head=jump_dirty[vpage];
496   while(head!=NULL) {
497     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
498       //printf("TRACE: count=%d next=%d (get_addr_32 match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
499       // Don't restore blocks which are about to expire from the cache
500       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
501       if(verify_dirty(head->addr)) {
502         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
503         invalid_code[vaddr>>12]=0;
504         inv_code_start=inv_code_end=~0;
505         memory_map[vaddr>>12]|=0x40000000;
506         if(vpage<2048) {
507 #ifndef DISABLE_TLB
508           if(tlb_LUT_r[vaddr>>12]) {
509             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
510             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
511           }
512 #endif
513           restore_candidate[vpage>>3]|=1<<(vpage&7);
514         }
515         else restore_candidate[page>>3]|=1<<(page&7);
516         if(head->reg32==0) {
517           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
518           if(ht_bin[0]==-1) {
519             ht_bin[1]=(int)head->addr;
520             ht_bin[0]=vaddr;
521           }else if(ht_bin[2]==-1) {
522             ht_bin[3]=(int)head->addr;
523             ht_bin[2]=vaddr;
524           }
525           //ht_bin[3]=ht_bin[1];
526           //ht_bin[2]=ht_bin[0];
527           //ht_bin[1]=(int)head->addr;
528           //ht_bin[0]=vaddr;
529         }
530         return head->addr;
531       }
532     }
533     head=head->next;
534   }
535   //printf("TRACE: count=%d next=%d (get_addr_32 no-match %x,flags %x)\n",Count,next_interupt,vaddr,flags);
536   int r=new_recompile_block(vaddr);
537   if(r==0) return get_addr(vaddr);
538   // Execute in unmapped page, generate page fault exception
539   Status|=2;
540   Cause=(vaddr<<31)|0x8;
541   EPC=(vaddr&1)?vaddr-5:vaddr;
542   BadVAddr=(vaddr&~1);
543   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
544   EntryHi=BadVAddr&0xFFFFE000;
545   return get_addr_ht(0x80000000);
546 #endif
547 }
548
549 void clear_all_regs(signed char regmap[])
550 {
551   int hr;
552   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
553 }
554
555 signed char get_reg(signed char regmap[],int r)
556 {
557   int hr;
558   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
559   return -1;
560 }
561
562 // Find a register that is available for two consecutive cycles
563 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
564 {
565   int hr;
566   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
567   return -1;
568 }
569
570 int count_free_regs(signed char regmap[])
571 {
572   int count=0;
573   int hr;
574   for(hr=0;hr<HOST_REGS;hr++)
575   {
576     if(hr!=EXCLUDE_REG) {
577       if(regmap[hr]<0) count++;
578     }
579   }
580   return count;
581 }
582
583 void dirty_reg(struct regstat *cur,signed char reg)
584 {
585   int hr;
586   if(!reg) return;
587   for (hr=0;hr<HOST_REGS;hr++) {
588     if((cur->regmap[hr]&63)==reg) {
589       cur->dirty|=1<<hr;
590     }
591   }
592 }
593
594 // If we dirty the lower half of a 64-bit register which is now being
595 // sign-extended, we need to dump the upper half.
596 // Note: Do this only after completion of the instruction, because
597 // some instructions may need to read the full 64-bit value even if
598 // overwriting it (e.g. SLTI, DSRA32).
599 static void flush_dirty_uppers(struct regstat *cur)
600 {
601   int hr,reg;
602   for (hr=0;hr<HOST_REGS;hr++) {
603     if((cur->dirty>>hr)&1) {
604       reg=cur->regmap[hr];
605       if(reg>=64) 
606         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
607     }
608   }
609 }
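/* Example: ADDU leaves a sign-extended 32-bit result, so if the upper half of
 * the target register was mapped to some host register (regmap entries >= 64
 * name the upper half of MIPS reg (entry&63)), that host register is simply
 * released here instead of being written back with a now-meaningless value. */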
610
611 void set_const(struct regstat *cur,signed char reg,uint64_t value)
612 {
613   int hr;
614   if(!reg) return;
615   for (hr=0;hr<HOST_REGS;hr++) {
616     if(cur->regmap[hr]==reg) {
617       cur->isconst|=1<<hr;
618       current_constmap[hr]=value;
619     }
620     else if((cur->regmap[hr]^64)==reg) {
621       cur->isconst|=1<<hr;
622       current_constmap[hr]=value>>32;
623     }
624   }
625 }
626
627 void clear_const(struct regstat *cur,signed char reg)
628 {
629   int hr;
630   if(!reg) return;
631   for (hr=0;hr<HOST_REGS;hr++) {
632     if((cur->regmap[hr]&63)==reg) {
633       cur->isconst&=~(1<<hr);
634     }
635   }
636 }
637
638 int is_const(struct regstat *cur,signed char reg)
639 {
640   int hr;
641   if(reg<0) return 0;
642   if(!reg) return 1;
643   for (hr=0;hr<HOST_REGS;hr++) {
644     if((cur->regmap[hr]&63)==reg) {
645       return (cur->isconst>>hr)&1;
646     }
647   }
648   return 0;
649 }
650 uint64_t get_const(struct regstat *cur,signed char reg)
651 {
652   int hr;
653   if(!reg) return 0;
654   for (hr=0;hr<HOST_REGS;hr++) {
655     if(cur->regmap[hr]==reg) {
656       return current_constmap[hr];
657     }
658   }
659   SysPrintf("Unknown constant in r%d\n",reg);
660   exit(1);
661 }
662
663 // Least soon needed registers
664 // Look at the next ten instructions and see which registers
665 // will be used.  Try not to reallocate these.
666 void lsn(u_char hsn[], int i, int *preferred_reg)
667 {
668   int j;
669   int b=-1;
670   for(j=0;j<9;j++)
671   {
672     if(i+j>=slen) {
673       j=slen-i-1;
674       break;
675     }
676     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
677     {
678     // Don't go past an unconditional jump
679       j++;
680       break;
681     }
682   }
683   for(;j>=0;j--)
684   {
685     if(rs1[i+j]) hsn[rs1[i+j]]=j;
686     if(rs2[i+j]) hsn[rs2[i+j]]=j;
687     if(rt1[i+j]) hsn[rt1[i+j]]=j;
688     if(rt2[i+j]) hsn[rt2[i+j]]=j;
689     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
690       // Stores can allocate zero
691       hsn[rs1[i+j]]=j;
692       hsn[rs2[i+j]]=j;
693     }
694     // On some architectures stores need invc_ptr
695     #if defined(HOST_IMM8)
696     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
697       hsn[INVCP]=j;
698     }
699     #endif
700     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
701     {
702       hsn[CCREG]=j;
703       b=j;
704     }
705   }
706   if(b>=0)
707   {
708     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
709     {
710       // Follow first branch
711       int t=(ba[i+b]-start)>>2;
712       j=7-b;if(t+j>=slen) j=slen-t-1;
713       for(;j>=0;j--)
714       {
715         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
716         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
717         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
718         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
719       }
720     }
721     // TODO: preferred register based on backward branch
722   }
723   // Delay slot should preferably not overwrite branch conditions or cycle count
724   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
725     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
726     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
727     hsn[CCREG]=1;
728     // ...or hash tables
729     hsn[RHASH]=1;
730     hsn[RHTBL]=1;
731   }
732   // Coprocessor load/store needs FTEMP, even if not declared
733   if(itype[i]==C1LS||itype[i]==C2LS) {
734     hsn[FTEMP]=0;
735   }
736   // Load L/R also uses FTEMP as a temporary register
737   if(itype[i]==LOADLR) {
738     hsn[FTEMP]=0;
739   }
740   // Also SWL/SWR/SDL/SDR
741   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
742     hsn[FTEMP]=0;
743   }
744   // Don't remove the TLB registers either
745   if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS || itype[i]==C2LS) {
746     hsn[TLREG]=0;
747   }
748   // Don't remove the miniht registers
749   if(itype[i]==UJUMP||itype[i]==RJUMP)
750   {
751     hsn[RHASH]=0;
752     hsn[RHTBL]=0;
753   }
754 }
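/* On return, hsn[r] holds the distance (in instructions) to the next use of
 * MIPS register r within the lookahead window; registers never seen keep
 * their caller-initialized value.  The allocator treats a small hsn value as
 * "needed soon, don't evict" - e.g. a base register read two instructions
 * ahead ends up with hsn[r]=2, while CCREG/RHASH/RHTBL are pinned to 0 or 1
 * around branches and delay slots as handled above. */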
755
756 // We only want to allocate registers if we're going to use them again soon
757 int needed_again(int r, int i)
758 {
759   int j;
760   int b=-1;
761   int rn=10;
762   
763   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
764   {
765     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
766       return 0; // Don't need any registers if exiting the block
767   }
768   for(j=0;j<9;j++)
769   {
770     if(i+j>=slen) {
771       j=slen-i-1;
772       break;
773     }
774     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
775     {
776       // Don't go past an unconditional jump
777       j++;
778       break;
779     }
780     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
781     {
782       break;
783     }
784   }
785   for(;j>=1;j--)
786   {
787     if(rs1[i+j]==r) rn=j;
788     if(rs2[i+j]==r) rn=j;
789     if((unneeded_reg[i+j]>>r)&1) rn=10;
790     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
791     {
792       b=j;
793     }
794   }
795   /*
796   if(b>=0)
797   {
798     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
799     {
800       // Follow first branch
801       int o=rn;
802       int t=(ba[i+b]-start)>>2;
803       j=7-b;if(t+j>=slen) j=slen-t-1;
804       for(;j>=0;j--)
805       {
806         if(!((unneeded_reg[t+j]>>r)&1)) {
807           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
808           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
809         }
810         else rn=o;
811       }
812     }
813   }*/
814   if(rn<10) return 1;
815   return 0;
816 }
817
818 // Try to match register allocations at the end of a loop with those
819 // at the beginning
820 int loop_reg(int i, int r, int hr)
821 {
822   int j,k;
823   for(j=0;j<9;j++)
824   {
825     if(i+j>=slen) {
826       j=slen-i-1;
827       break;
828     }
829     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
830     {
831     // Don't go past an unconditional jump
832       j++;
833       break;
834     }
835   }
836   k=0;
837   if(i>0){
838     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
839       k--;
840   }
841   for(;k<j;k++)
842   {
843     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
844     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
845     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
846     {
847       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
848       {
849         int t=(ba[i+k]-start)>>2;
850         int reg=get_reg(regs[t].regmap_entry,r);
851         if(reg>=0) return reg;
852         //reg=get_reg(regs[t+1].regmap_entry,r);
853         //if(reg>=0) return reg;
854       }
855     }
856   }
857   return hr;
858 }
859
860
861 // Allocate every register, preserving source/target regs
862 void alloc_all(struct regstat *cur,int i)
863 {
864   int hr;
865   
866   for(hr=0;hr<HOST_REGS;hr++) {
867     if(hr!=EXCLUDE_REG) {
868       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
869          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
870       {
871         cur->regmap[hr]=-1;
872         cur->dirty&=~(1<<hr);
873       }
874       // Don't need zeros
875       if((cur->regmap[hr]&63)==0)
876       {
877         cur->regmap[hr]=-1;
878         cur->dirty&=~(1<<hr);
879       }
880     }
881   }
882 }
883
884 #ifndef FORCE32
885 void div64(int64_t dividend,int64_t divisor)
886 {
887   lo=dividend/divisor;
888   hi=dividend%divisor;
889   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
890   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
891 }
892 void divu64(uint64_t dividend,uint64_t divisor)
893 {
894   lo=dividend/divisor;
895   hi=dividend%divisor;
896   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
897   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
898 }
899
900 void mult64(uint64_t m1,uint64_t m2)
901 {
902    unsigned long long int op1, op2, op3, op4;
903    unsigned long long int result1, result2, result3, result4;
904    unsigned long long int temp1, temp2, temp3, temp4;
905    int sign = 0;
906    
907    if (m1 < 0)
908      {
909     op2 = -m1;
910     sign = 1 - sign;
911      }
912    else op2 = m1;
913    if (m2 < 0)
914      {
915     op4 = -m2;
916     sign = 1 - sign;
917      }
918    else op4 = m2;
919    
920    op1 = op2 & 0xFFFFFFFF;
921    op2 = (op2 >> 32) & 0xFFFFFFFF;
922    op3 = op4 & 0xFFFFFFFF;
923    op4 = (op4 >> 32) & 0xFFFFFFFF;
924    
925    temp1 = op1 * op3;
926    temp2 = (temp1 >> 32) + op1 * op4;
927    temp3 = op2 * op3;
928    temp4 = (temp3 >> 32) + op2 * op4;
929    
930    result1 = temp1 & 0xFFFFFFFF;
931    result2 = temp2 + (temp3 & 0xFFFFFFFF);
932    result3 = (result2 >> 32) + temp4;
933    result4 = (result3 >> 32);
934    
935    lo = result1 | (result2 << 32);
936    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
937    if (sign)
938      {
939     hi = ~hi;
940     if (!lo) hi++;
941     else lo = ~lo + 1;
942      }
943 }
944
945 void multu64(uint64_t m1,uint64_t m2)
946 {
947    unsigned long long int op1, op2, op3, op4;
948    unsigned long long int result1, result2, result3, result4;
949    unsigned long long int temp1, temp2, temp3, temp4;
950    
951    op1 = m1 & 0xFFFFFFFF;
952    op2 = (m1 >> 32) & 0xFFFFFFFF;
953    op3 = m2 & 0xFFFFFFFF;
954    op4 = (m2 >> 32) & 0xFFFFFFFF;
955    
956    temp1 = op1 * op3;
957    temp2 = (temp1 >> 32) + op1 * op4;
958    temp3 = op2 * op3;
959    temp4 = (temp3 >> 32) + op2 * op4;
960    
961    result1 = temp1 & 0xFFFFFFFF;
962    result2 = temp2 + (temp3 & 0xFFFFFFFF);
963    result3 = (result2 >> 32) + temp4;
964    result4 = (result3 >> 32);
965    
966    lo = result1 | (result2 << 32);
967    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
968    
969   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
970   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
971 }
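#if 0
/* Illustrative cross-check, kept out of the build: multu64() above forms the
 * 128-bit product from four 32x32 partial products.  On compilers that offer
 * the unsigned __int128 extension the same result can be computed directly,
 * which is handy when sanity-checking changes to the routine.  The helper
 * name multu64_check() is hypothetical and not part of the emulator. */
static void multu64_check(uint64_t m1,uint64_t m2)
{
  unsigned __int128 p=(unsigned __int128)m1*m2;
  multu64(m1,m2);
  assert(lo==(uint64_t)p);        // low 64 bits of the product
  assert(hi==(uint64_t)(p>>64));  // high 64 bits of the product
}
#endif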
972
973 uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
974 {
975   if(bits) {
976     original<<=64-bits;
977     original>>=64-bits;
978     loaded<<=bits;
979     original|=loaded;
980   }
981   else original=loaded;
982   return original;
983 }
984 uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
985 {
986   if(bits^56) {
987     original>>=64-(bits^56);
988     original<<=64-(bits^56);
989     loaded>>=bits^56;
990     original|=loaded;
991   }
992   else original=loaded;
993   return original;
994 }
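/* Worked example: ldl_merge(0x1111111111111111,0x2222222222222222,16) keeps
 * the low 16 bits of the original value and fills the rest from the loaded
 * doubleword shifted into place, giving 0x2222222222221111.  ldr_merge() is
 * the mirror image for the right-unaligned case, keyed on bits^56. */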
995 #endif
996
997 #ifdef __i386__
998 #include "assem_x86.c"
999 #endif
1000 #ifdef __x86_64__
1001 #include "assem_x64.c"
1002 #endif
1003 #ifdef __arm__
1004 #include "assem_arm.c"
1005 #endif
1006
1007 // Add virtual address mapping to linked list
1008 void ll_add(struct ll_entry **head,int vaddr,void *addr)
1009 {
1010   struct ll_entry *new_entry;
1011   new_entry=malloc(sizeof(struct ll_entry));
1012   assert(new_entry!=NULL);
1013   new_entry->vaddr=vaddr;
1014   new_entry->reg32=0;
1015   new_entry->addr=addr;
1016   new_entry->next=*head;
1017   *head=new_entry;
1018 }
1019
1020 // Add virtual address mapping for 32-bit compiled block
1021 void ll_add_32(struct ll_entry **head,int vaddr,u_int reg32,void *addr)
1022 {
1023   ll_add(head,vaddr,addr);
1024 #ifndef FORCE32
1025   (*head)->reg32=reg32;
1026 #endif
1027 }
1028
1029 // Check if an address is already compiled
1030 // but don't return addresses which are about to expire from the cache
1031 void *check_addr(u_int vaddr)
1032 {
1033   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
1034   if(ht_bin[0]==vaddr) {
1035     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1036       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
1037   }
1038   if(ht_bin[2]==vaddr) {
1039     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
1040       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
1041   }
1042   u_int page=get_page(vaddr);
1043   struct ll_entry *head;
1044   head=jump_in[page];
1045   while(head!=NULL) {
1046     if(head->vaddr==vaddr&&head->reg32==0) {
1047       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1048         // Update existing entry with current address
1049         if(ht_bin[0]==vaddr) {
1050           ht_bin[1]=(int)head->addr;
1051           return head->addr;
1052         }
1053         if(ht_bin[2]==vaddr) {
1054           ht_bin[3]=(int)head->addr;
1055           return head->addr;
1056         }
1057         // Insert into hash table with low priority.
1058         // Don't evict existing entries, as they are probably
1059         // addresses that are being accessed frequently.
1060         if(ht_bin[0]==-1) {
1061           ht_bin[1]=(int)head->addr;
1062           ht_bin[0]=vaddr;
1063         }else if(ht_bin[2]==-1) {
1064           ht_bin[3]=(int)head->addr;
1065           ht_bin[2]=vaddr;
1066         }
1067         return head->addr;
1068       }
1069     }
1070     head=head->next;
1071   }
1072   return 0;
1073 }
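/* Roughly, the "about to expire" test above (also used by get_addr and
 * clean_blocks) treats the 2^TARGET_SIZE_2-byte translation cache as a ring:
 * the block's offset from the current output pointer 'out', rotated into the
 * top bits, must stay clear of the window that is next in line to be
 * reclaimed, otherwise the returned pointer could become stale almost
 * immediately. */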
1074
1075 void remove_hash(int vaddr)
1076 {
1077   //printf("remove hash: %x\n",vaddr);
1078   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
1079   if(ht_bin[2]==vaddr) {
1080     ht_bin[2]=ht_bin[3]=-1;
1081   }
1082   if(ht_bin[0]==vaddr) {
1083     ht_bin[0]=ht_bin[2];
1084     ht_bin[1]=ht_bin[3];
1085     ht_bin[2]=ht_bin[3]=-1;
1086   }
1087 }
1088
1089 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1090 {
1091   struct ll_entry *next;
1092   while(*head) {
1093     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || 
1094        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1095     {
1096       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1097       remove_hash((*head)->vaddr);
1098       next=(*head)->next;
1099       free(*head);
1100       *head=next;
1101     }
1102     else
1103     {
1104       head=&((*head)->next);
1105     }
1106   }
1107 }
1108
1109 // Remove all entries from linked list
1110 void ll_clear(struct ll_entry **head)
1111 {
1112   struct ll_entry *cur;
1113   struct ll_entry *next;
1114   if((cur=*head)) {
1115     *head=0;
1116     while(cur) {
1117       next=cur->next;
1118       free(cur);
1119       cur=next;
1120     }
1121   }
1122 }
1123
1124 // Dereference the pointers and remove the entry if it matches
1125 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1126 {
1127   while(head) {
1128     int ptr=get_pointer(head->addr);
1129     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1130     if(((ptr>>shift)==(addr>>shift)) ||
1131        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1132     {
1133       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1134       u_int host_addr=(u_int)kill_pointer(head->addr);
1135       #ifdef __arm__
1136         needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1137       #endif
1138     }
1139     head=head->next;
1140   }
1141 }
1142
1143 // This is called when we write to a compiled block (see do_invstub)
1144 void invalidate_page(u_int page)
1145 {
1146   struct ll_entry *head;
1147   struct ll_entry *next;
1148   head=jump_in[page];
1149   jump_in[page]=0;
1150   while(head!=NULL) {
1151     inv_debug("INVALIDATE: %x\n",head->vaddr);
1152     remove_hash(head->vaddr);
1153     next=head->next;
1154     free(head);
1155     head=next;
1156   }
1157   head=jump_out[page];
1158   jump_out[page]=0;
1159   while(head!=NULL) {
1160     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1161     u_int host_addr=(u_int)kill_pointer(head->addr);
1162     #ifdef __arm__
1163       needs_clear_cache[(host_addr-(u_int)BASE_ADDR)>>17]|=1<<(((host_addr-(u_int)BASE_ADDR)>>12)&31);
1164     #endif
1165     next=head->next;
1166     free(head);
1167     head=next;
1168   }
1169 }
1170
1171 static void invalidate_block_range(u_int block, u_int first, u_int last)
1172 {
1173   u_int page=get_page(block<<12);
1174   //printf("first=%d last=%d\n",first,last);
1175   invalidate_page(page);
1176   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1177   assert(last<page+5);
1178   // Invalidate the adjacent pages if a block crosses a 4K boundary
1179   while(first<page) {
1180     invalidate_page(first);
1181     first++;
1182   }
1183   for(first=page+1;first<last;first++) {
1184     invalidate_page(first);
1185   }
1186   #ifdef __arm__
1187     do_clear_cache();
1188   #endif
1189   
1190   // Don't trap writes
1191   invalid_code[block]=1;
1192 #ifndef DISABLE_TLB
1193   // If there is a valid TLB entry for this page, remove write protect
1194   if(tlb_LUT_w[block]) {
1195     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1196     // CHECK: Is this right?
1197     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
1198     u_int real_block=tlb_LUT_w[block]>>12;
1199     invalid_code[real_block]=1;
1200     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1201   }
1202   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1203 #endif
1204
1205   #ifdef USE_MINI_HT
1206   memset(mini_ht,-1,sizeof(mini_ht));
1207   #endif
1208 }
1209
1210 void invalidate_block(u_int block)
1211 {
1212   u_int page=get_page(block<<12);
1213   u_int vpage=get_vpage(block<<12);
1214   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1215   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1216   u_int first,last;
1217   first=last=page;
1218   struct ll_entry *head;
1219   head=jump_dirty[vpage];
1220   //printf("page=%d vpage=%d\n",page,vpage);
1221   while(head!=NULL) {
1222     u_int start,end;
1223     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1224       get_bounds((int)head->addr,&start,&end);
1225       //printf("start: %x end: %x\n",start,end);
1226       if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE) {
1227         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1228           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1229           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1230         }
1231       }
1232 #ifndef DISABLE_TLB
1233       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1234         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1235           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1236           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1237         }
1238       }
1239 #endif
1240     }
1241     head=head->next;
1242   }
1243   invalidate_block_range(block,first,last);
1244 }
1245
1246 void invalidate_addr(u_int addr)
1247 {
1248 #ifdef PCSX
1249   //static int rhits;
1250   // this check is done by the caller
1251   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
1252   u_int page=get_vpage(addr);
1253   if(page<2048) { // RAM
1254     struct ll_entry *head;
1255     u_int addr_min=~0, addr_max=0;
1256     u_int mask=RAM_SIZE-1;
1257     u_int addr_main=0x80000000|(addr&mask);
1258     int pg1;
1259     inv_code_start=addr_main&~0xfff;
1260     inv_code_end=addr_main|0xfff;
1261     pg1=page;
1262     if (pg1>0) {
1263       // must check previous page too because of spans..
1264       pg1--;
1265       inv_code_start-=0x1000;
1266     }
1267     for(;pg1<=page;pg1++) {
1268       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
1269         u_int start,end;
1270         get_bounds((int)head->addr,&start,&end);
1271         if(ram_offset) {
1272           start-=ram_offset;
1273           end-=ram_offset;
1274         }
1275         if(start<=addr_main&&addr_main<end) {
1276           if(start<addr_min) addr_min=start;
1277           if(end>addr_max) addr_max=end;
1278         }
1279         else if(addr_main<start) {
1280           if(start<inv_code_end)
1281             inv_code_end=start-1;
1282         }
1283         else {
1284           if(end>inv_code_start)
1285             inv_code_start=end;
1286         }
1287       }
1288     }
1289     if (addr_min!=~0) {
1290       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
1291       inv_code_start=inv_code_end=~0;
1292       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
1293       return;
1294     }
1295     else {
1296       inv_code_start=(addr&~mask)|(inv_code_start&mask);
1297       inv_code_end=(addr&~mask)|(inv_code_end&mask);
1298       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
1299       return;
1300     }
1301   }
1302 #endif
1303   invalidate_block(addr>>12);
1304 }
1305
1306 // This is called when loading a save state.
1307 // Anything could have changed, so invalidate everything.
1308 void invalidate_all_pages()
1309 {
1310   u_int page,n;
1311   for(page=0;page<4096;page++)
1312     invalidate_page(page);
1313   for(page=0;page<1048576;page++)
1314     if(!invalid_code[page]) {
1315       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1316       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1317     }
1318   #ifdef __arm__
1319   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1320   #endif
1321   #ifdef USE_MINI_HT
1322   memset(mini_ht,-1,sizeof(mini_ht));
1323   #endif
1324   #ifndef DISABLE_TLB
1325   // TLB
1326   for(page=0;page<0x100000;page++) {
1327     if(tlb_LUT_r[page]) {
1328       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1329       if(!tlb_LUT_w[page]||!invalid_code[page])
1330         memory_map[page]|=0x40000000; // Write protect
1331     }
1332     else memory_map[page]=-1;
1333     if(page==0x80000) page=0xC0000;
1334   }
1335   tlb_hacks();
1336   #endif
1337 }
1338
1339 // Add an entry to jump_out after making a link
1340 void add_link(u_int vaddr,void *src)
1341 {
1342   u_int page=get_page(vaddr);
1343   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1344   int *ptr=(int *)(src+4);
1345   assert((*ptr&0x0fff0000)==0x059f0000);
1346   ll_add(jump_out+page,vaddr,src);
1347   //int ptr=get_pointer(src);
1348   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1349 }
1350
1351 // If a code block was found to be unmodified (bit was set in
1352 // restore_candidate) and it remains unmodified (bit is clear
1353 // in invalid_code) then move the entries for that 4K page from
1354 // the dirty list to the clean list.
1355 void clean_blocks(u_int page)
1356 {
1357   struct ll_entry *head;
1358   inv_debug("INV: clean_blocks page=%d\n",page);
1359   head=jump_dirty[page];
1360   while(head!=NULL) {
1361     if(!invalid_code[head->vaddr>>12]) {
1362       // Don't restore blocks which are about to expire from the cache
1363       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1364         u_int start,end;
1365         if(verify_dirty((int)head->addr)) {
1366           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1367           u_int i;
1368           u_int inv=0;
1369           get_bounds((int)head->addr,&start,&end);
1370           if(start-(u_int)rdram<RAM_SIZE) {
1371             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1372               inv|=invalid_code[i];
1373             }
1374           }
1375 #ifndef DISABLE_TLB
1376           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1377             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1378             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1379             if(addr<start||addr>=end) inv=1;
1380           }
1381 #endif
1382           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1383             inv=1;
1384           }
1385           if(!inv) {
1386             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1387             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1388               u_int ppage=page;
1389 #ifndef DISABLE_TLB
1390               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1391 #endif
1392               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1393               //printf("page=%x, addr=%x\n",page,head->vaddr);
1394               //assert(head->vaddr>>12==(page|0x80000));
1395               ll_add_32(jump_in+ppage,head->vaddr,head->reg32,clean_addr);
1396               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1397               if(!head->reg32) {
1398                 if(ht_bin[0]==head->vaddr) {
1399                   ht_bin[1]=(int)clean_addr; // Replace existing entry
1400                 }
1401                 if(ht_bin[2]==head->vaddr) {
1402                   ht_bin[3]=(int)clean_addr; // Replace existing entry
1403                 }
1404               }
1405             }
1406           }
1407         }
1408       }
1409     }
1410     head=head->next;
1411   }
1412 }
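/* restore_candidate is a bitmap over pages: page p is marked with
 * restore_candidate[p>>3] |= 1<<(p&7) once a dirty block in it passes
 * verify_dirty(), and clean_blocks(p) is invoked later for marked pages that
 * are still valid, migrating their entries from jump_dirty back to jump_in. */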
1413
1414
1415 void mov_alloc(struct regstat *current,int i)
1416 {
1417   // Note: Don't need to actually alloc the source registers
1418   if((~current->is32>>rs1[i])&1) {
1419     //alloc_reg64(current,i,rs1[i]);
1420     alloc_reg64(current,i,rt1[i]);
1421     current->is32&=~(1LL<<rt1[i]);
1422   } else {
1423     //alloc_reg(current,i,rs1[i]);
1424     alloc_reg(current,i,rt1[i]);
1425     current->is32|=(1LL<<rt1[i]);
1426   }
1427   clear_const(current,rs1[i]);
1428   clear_const(current,rt1[i]);
1429   dirty_reg(current,rt1[i]);
1430 }
1431
1432 void shiftimm_alloc(struct regstat *current,int i)
1433 {
1434   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1435   {
1436     if(rt1[i]) {
1437       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1438       else lt1[i]=rs1[i];
1439       alloc_reg(current,i,rt1[i]);
1440       current->is32|=1LL<<rt1[i];
1441       dirty_reg(current,rt1[i]);
1442       if(is_const(current,rs1[i])) {
1443         int v=get_const(current,rs1[i]);
1444         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1445         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1446         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1447       }
1448       else clear_const(current,rt1[i]);
1449     }
1450   }
1451   else
1452   {
1453     clear_const(current,rs1[i]);
1454     clear_const(current,rt1[i]);
1455   }
1456
1457   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1458   {
1459     if(rt1[i]) {
1460       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1461       alloc_reg64(current,i,rt1[i]);
1462       current->is32&=~(1LL<<rt1[i]);
1463       dirty_reg(current,rt1[i]);
1464     }
1465   }
1466   if(opcode2[i]==0x3c) // DSLL32
1467   {
1468     if(rt1[i]) {
1469       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1470       alloc_reg64(current,i,rt1[i]);
1471       current->is32&=~(1LL<<rt1[i]);
1472       dirty_reg(current,rt1[i]);
1473     }
1474   }
1475   if(opcode2[i]==0x3e) // DSRL32
1476   {
1477     if(rt1[i]) {
1478       alloc_reg64(current,i,rs1[i]);
1479       if(imm[i]==32) {
1480         alloc_reg64(current,i,rt1[i]);
1481         current->is32&=~(1LL<<rt1[i]);
1482       } else {
1483         alloc_reg(current,i,rt1[i]);
1484         current->is32|=1LL<<rt1[i];
1485       }
1486       dirty_reg(current,rt1[i]);
1487     }
1488   }
1489   if(opcode2[i]==0x3f) // DSRA32
1490   {
1491     if(rt1[i]) {
1492       alloc_reg64(current,i,rs1[i]);
1493       alloc_reg(current,i,rt1[i]);
1494       current->is32|=1LL<<rt1[i];
1495       dirty_reg(current,rt1[i]);
1496     }
1497   }
1498 }
1499
1500 void shift_alloc(struct regstat *current,int i)
1501 {
1502   if(rt1[i]) {
1503     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1504     {
1505       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1506       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1507       alloc_reg(current,i,rt1[i]);
1508       if(rt1[i]==rs2[i]) {
1509         alloc_reg_temp(current,i,-1);
1510         minimum_free_regs[i]=1;
1511       }
1512       current->is32|=1LL<<rt1[i];
1513     } else { // DSLLV/DSRLV/DSRAV
1514       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1515       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1516       alloc_reg64(current,i,rt1[i]);
1517       current->is32&=~(1LL<<rt1[i]);
1518       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1519       {
1520         alloc_reg_temp(current,i,-1);
1521         minimum_free_regs[i]=1;
1522       }
1523     }
1524     clear_const(current,rs1[i]);
1525     clear_const(current,rs2[i]);
1526     clear_const(current,rt1[i]);
1527     dirty_reg(current,rt1[i]);
1528   }
1529 }
1530
1531 void alu_alloc(struct regstat *current,int i)
1532 {
1533   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1534     if(rt1[i]) {
1535       if(rs1[i]&&rs2[i]) {
1536         alloc_reg(current,i,rs1[i]);
1537         alloc_reg(current,i,rs2[i]);
1538       }
1539       else {
1540         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1541         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1542       }
1543       alloc_reg(current,i,rt1[i]);
1544     }
1545     current->is32|=1LL<<rt1[i];
1546   }
1547   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1548     if(rt1[i]) {
1549       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1550       {
1551         alloc_reg64(current,i,rs1[i]);
1552         alloc_reg64(current,i,rs2[i]);
1553         alloc_reg(current,i,rt1[i]);
1554       } else {
1555         alloc_reg(current,i,rs1[i]);
1556         alloc_reg(current,i,rs2[i]);
1557         alloc_reg(current,i,rt1[i]);
1558       }
1559     }
1560     current->is32|=1LL<<rt1[i];
1561   }
1562   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1563     if(rt1[i]) {
1564       if(rs1[i]&&rs2[i]) {
1565         alloc_reg(current,i,rs1[i]);
1566         alloc_reg(current,i,rs2[i]);
1567       }
1568       else
1569       {
1570         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1571         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1572       }
1573       alloc_reg(current,i,rt1[i]);
1574       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1575       {
1576         if(!((current->uu>>rt1[i])&1)) {
1577           alloc_reg64(current,i,rt1[i]);
1578         }
1579         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1580           if(rs1[i]&&rs2[i]) {
1581             alloc_reg64(current,i,rs1[i]);
1582             alloc_reg64(current,i,rs2[i]);
1583           }
1584           else
1585           {
1586             // Is it really worth it to keep 64-bit values in registers?
1587             #ifdef NATIVE_64BIT
1588             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1589             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1590             #endif
1591           }
1592         }
1593         current->is32&=~(1LL<<rt1[i]);
1594       } else {
1595         current->is32|=1LL<<rt1[i];
1596       }
1597     }
1598   }
1599   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1600     if(rt1[i]) {
1601       if(rs1[i]&&rs2[i]) {
1602         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1603           alloc_reg64(current,i,rs1[i]);
1604           alloc_reg64(current,i,rs2[i]);
1605           alloc_reg64(current,i,rt1[i]);
1606         } else {
1607           alloc_reg(current,i,rs1[i]);
1608           alloc_reg(current,i,rs2[i]);
1609           alloc_reg(current,i,rt1[i]);
1610         }
1611       }
1612       else {
1613         alloc_reg(current,i,rt1[i]);
1614         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1615           // DADD used as move, or zeroing
1616           // If we have a 64-bit source, then make the target 64 bits too
1617           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1618             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1619             alloc_reg64(current,i,rt1[i]);
1620           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1621             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1622             alloc_reg64(current,i,rt1[i]);
1623           }
1624           if(opcode2[i]>=0x2e&&rs2[i]) {
1625             // DSUB used as negation - 64-bit result
1626             // If we have a 32-bit register, extend it to 64 bits
1627             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1628             alloc_reg64(current,i,rt1[i]);
1629           }
1630         }
1631       }
1632       if(rs1[i]&&rs2[i]) {
1633         current->is32&=~(1LL<<rt1[i]);
1634       } else if(rs1[i]) {
1635         current->is32&=~(1LL<<rt1[i]);
1636         if((current->is32>>rs1[i])&1)
1637           current->is32|=1LL<<rt1[i];
1638       } else if(rs2[i]) {
1639         current->is32&=~(1LL<<rt1[i]);
1640         if((current->is32>>rs2[i])&1)
1641           current->is32|=1LL<<rt1[i];
1642       } else {
1643         current->is32|=1LL<<rt1[i];
1644       }
1645     }
1646   }
1647   clear_const(current,rs1[i]);
1648   clear_const(current,rs2[i]);
1649   clear_const(current,rt1[i]);
1650   dirty_reg(current,rt1[i]);
1651 }
1652
1653 void imm16_alloc(struct regstat *current,int i)
1654 {
1655   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1656   else lt1[i]=rs1[i];
1657   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1658   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1659     current->is32&=~(1LL<<rt1[i]);
1660     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1661       // TODO: Could preserve the 32-bit flag if the immediate is zero
1662       alloc_reg64(current,i,rt1[i]);
1663       alloc_reg64(current,i,rs1[i]);
1664     }
1665     clear_const(current,rs1[i]);
1666     clear_const(current,rt1[i]);
1667   }
1668   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1669     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1670     current->is32|=1LL<<rt1[i];
1671     clear_const(current,rs1[i]);
1672     clear_const(current,rt1[i]);
1673   }
1674   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1675     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1676       if(rs1[i]!=rt1[i]) {
1677         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1678         alloc_reg64(current,i,rt1[i]);
1679         current->is32&=~(1LL<<rt1[i]);
1680       }
1681     }
1682     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1683     if(is_const(current,rs1[i])) {
1684       int v=get_const(current,rs1[i]);
1685       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1686       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1687       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1688     }
1689     else clear_const(current,rt1[i]);
1690   }
1691   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1692     if(is_const(current,rs1[i])) {
1693       int v=get_const(current,rs1[i]);
1694       set_const(current,rt1[i],v+imm[i]);
1695     }
1696     else clear_const(current,rt1[i]);
1697     current->is32|=1LL<<rt1[i];
1698   }
1699   else {
1700     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1701     current->is32|=1LL<<rt1[i];
1702   }
1703   dirty_reg(current,rt1[i]);
1704 }
1705
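// Register allocation for loads.  Loads to r0 or to an otherwise unneeded
// register are still assembled (the address has to be checked for faults
// and I/O), so a temporary is kept for address generation.  LWL/LWR need
// FTEMP for the old value; LDL/LDR claim all host registers.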
1706 void load_alloc(struct regstat *current,int i)
1707 {
1708   clear_const(current,rt1[i]);
1709   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1710   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1711   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1712   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1713     alloc_reg(current,i,rt1[i]);
1714     assert(get_reg(current->regmap,rt1[i])>=0);
1715     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1716     {
1717       current->is32&=~(1LL<<rt1[i]);
1718       alloc_reg64(current,i,rt1[i]);
1719     }
1720     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1721     {
1722       current->is32&=~(1LL<<rt1[i]);
1723       alloc_reg64(current,i,rt1[i]);
1724       alloc_all(current,i);
1725       alloc_reg64(current,i,FTEMP);
1726       minimum_free_regs[i]=HOST_REGS;
1727     }
1728     else current->is32|=1LL<<rt1[i];
1729     dirty_reg(current,rt1[i]);
1730     // If using TLB, need a register for pointer to the mapping table
1731     if(using_tlb) alloc_reg(current,i,TLREG);
1732     // LWL/LWR need a temporary register for the old value
1733     if(opcode[i]==0x22||opcode[i]==0x26)
1734     {
1735       alloc_reg(current,i,FTEMP);
1736       alloc_reg_temp(current,i,-1);
1737       minimum_free_regs[i]=1;
1738     }
1739   }
1740   else
1741   {
1742     // Load to r0 or unneeded register (dummy load)
1743     // but we still need a register to calculate the address
1744     if(opcode[i]==0x22||opcode[i]==0x26)
1745     {
1746       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1747     }
1748     // If using TLB, need a register for pointer to the mapping table
1749     if(using_tlb) alloc_reg(current,i,TLREG);
1750     alloc_reg_temp(current,i,-1);
1751     minimum_free_regs[i]=1;
1752     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1753     {
1754       alloc_all(current,i);
1755       alloc_reg64(current,i,FTEMP);
1756       minimum_free_regs[i]=HOST_REGS;
1757     }
1758   }
1759 }
1760
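// Register allocation for stores: base and data registers, FTEMP for the
// unaligned/64-bit cases, and a temporary for address generation.  On
// HOST_IMM8 targets (no 32-bit immediates) INVCP is also reserved for the
// invalid_code check.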
1761 void store_alloc(struct regstat *current,int i)
1762 {
1763   clear_const(current,rs2[i]);
1764   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1765   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1766   alloc_reg(current,i,rs2[i]);
1767   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1768     alloc_reg64(current,i,rs2[i]);
1769     if(rs2[i]) alloc_reg(current,i,FTEMP);
1770   }
1771   // If using TLB, need a register for pointer to the mapping table
1772   if(using_tlb) alloc_reg(current,i,TLREG);
1773   #if defined(HOST_IMM8)
1774   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1775   else alloc_reg(current,i,INVCP);
1776   #endif
1777   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1778     alloc_reg(current,i,FTEMP);
1779   }
1780   // We need a temporary register for address generation
1781   alloc_reg_temp(current,i,-1);
1782   minimum_free_regs[i]=1;
1783 }
1784
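// COP1 loads and stores (LWC1/SWC1/LDC1/SDC1): the COP1 status register
// (CSREG) and FTEMP are allocated, plus a temporary for address generation;
// the 64-bit LDC1/SDC1 forms need the upper half of FTEMP as well.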
1785 void c1ls_alloc(struct regstat *current,int i)
1786 {
1787   //clear_const(current,rs1[i]); // FIXME
1788   clear_const(current,rt1[i]);
1789   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1790   alloc_reg(current,i,CSREG); // Status
1791   alloc_reg(current,i,FTEMP);
1792   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1793     alloc_reg64(current,i,FTEMP);
1794   }
1795   // If using TLB, need a register for pointer to the mapping table
1796   if(using_tlb) alloc_reg(current,i,TLREG);
1797   #if defined(HOST_IMM8)
1798   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1799   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1800     alloc_reg(current,i,INVCP);
1801   #endif
1802   // We need a temporary register for address generation
1803   alloc_reg_temp(current,i,-1);
1804 }
1805
1806 void c2ls_alloc(struct regstat *current,int i)
1807 {
1808   clear_const(current,rt1[i]);
1809   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1810   alloc_reg(current,i,FTEMP);
1811   // If using TLB, need a register for pointer to the mapping table
1812   if(using_tlb) alloc_reg(current,i,TLREG);
1813   #if defined(HOST_IMM8)
1814   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1815   else if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1816     alloc_reg(current,i,INVCP);
1817   #endif
1818   // We need a temporary register for address generation
1819   alloc_reg_temp(current,i,-1);
1820   minimum_free_regs[i]=1;
1821 }
1822
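// Generic MULT/DIV/DMULT/DDIV allocation; an architecture-specific version
// can be supplied by defining multdiv_alloc in the assembler backend.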
1823 #ifndef multdiv_alloc
1824 void multdiv_alloc(struct regstat *current,int i)
1825 {
1826   //  case 0x18: MULT
1827   //  case 0x19: MULTU
1828   //  case 0x1A: DIV
1829   //  case 0x1B: DIVU
1830   //  case 0x1C: DMULT
1831   //  case 0x1D: DMULTU
1832   //  case 0x1E: DDIV
1833   //  case 0x1F: DDIVU
1834   clear_const(current,rs1[i]);
1835   clear_const(current,rs2[i]);
1836   if(rs1[i]&&rs2[i])
1837   {
1838     if((opcode2[i]&4)==0) // 32-bit
1839     {
1840       current->u&=~(1LL<<HIREG);
1841       current->u&=~(1LL<<LOREG);
1842       alloc_reg(current,i,HIREG);
1843       alloc_reg(current,i,LOREG);
1844       alloc_reg(current,i,rs1[i]);
1845       alloc_reg(current,i,rs2[i]);
1846       current->is32|=1LL<<HIREG;
1847       current->is32|=1LL<<LOREG;
1848       dirty_reg(current,HIREG);
1849       dirty_reg(current,LOREG);
1850     }
1851     else // 64-bit
1852     {
1853       current->u&=~(1LL<<HIREG);
1854       current->u&=~(1LL<<LOREG);
1855       current->uu&=~(1LL<<HIREG);
1856       current->uu&=~(1LL<<LOREG);
1857       alloc_reg64(current,i,HIREG);
1858       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1859       alloc_reg64(current,i,rs1[i]);
1860       alloc_reg64(current,i,rs2[i]);
1861       alloc_all(current,i);
1862       current->is32&=~(1LL<<HIREG);
1863       current->is32&=~(1LL<<LOREG);
1864       dirty_reg(current,HIREG);
1865       dirty_reg(current,LOREG);
1866       minimum_free_regs[i]=HOST_REGS;
1867     }
1868   }
1869   else
1870   {
1871     // Multiply by zero is zero.
1872     // MIPS does not have a divide-by-zero exception;
1873     // the result is undefined, so we return zero.
1874     alloc_reg(current,i,HIREG);
1875     alloc_reg(current,i,LOREG);
1876     current->is32|=1LL<<HIREG;
1877     current->is32|=1LL<<LOREG;
1878     dirty_reg(current,HIREG);
1879     dirty_reg(current,LOREG);
1880   }
1881 }
1882 #endif
1883
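// COP0 moves (MFC0/MTC0) and the TLB/ERET group.  These can change the
// Status register and pending interrupts, so alloc_all claims every host
// register and minimum_free_regs is set to HOST_REGS.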
1884 void cop0_alloc(struct regstat *current,int i)
1885 {
1886   if(opcode2[i]==0) // MFC0
1887   {
1888     if(rt1[i]) {
1889       clear_const(current,rt1[i]);
1890       alloc_all(current,i);
1891       alloc_reg(current,i,rt1[i]);
1892       current->is32|=1LL<<rt1[i];
1893       dirty_reg(current,rt1[i]);
1894     }
1895   }
1896   else if(opcode2[i]==4) // MTC0
1897   {
1898     if(rs1[i]){
1899       clear_const(current,rs1[i]);
1900       alloc_reg(current,i,rs1[i]);
1901       alloc_all(current,i);
1902     }
1903     else {
1904       alloc_all(current,i); // FIXME: Keep r0
1905       current->u&=~1LL;
1906       alloc_reg(current,i,0);
1907     }
1908   }
1909   else
1910   {
1911     // TLBR/TLBWI/TLBWR/TLBP/ERET
1912     assert(opcode2[i]==0x10);
1913     alloc_all(current,i);
1914   }
1915   minimum_free_regs[i]=HOST_REGS;
1916 }
1917
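// COP1 register moves (MFC1/DMFC1/CFC1 and MTC1/DMTC1/CTC1).  CSREG is
// loaded to have the COP1 status available; the doubleword forms
// (DMFC1/DMTC1) need 64-bit allocations.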
1918 void cop1_alloc(struct regstat *current,int i)
1919 {
1920   alloc_reg(current,i,CSREG); // Load status
1921   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1922   {
1923     if(rt1[i]){
1924       clear_const(current,rt1[i]);
1925       if(opcode2[i]==1) {
1926         alloc_reg64(current,i,rt1[i]); // DMFC1
1927         current->is32&=~(1LL<<rt1[i]);
1928       }else{
1929         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1930         current->is32|=1LL<<rt1[i];
1931       }
1932       dirty_reg(current,rt1[i]);
1933     }
1934     alloc_reg_temp(current,i,-1);
1935   }
1936   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1937   {
1938     if(rs1[i]){
1939       clear_const(current,rs1[i]);
1940       if(opcode2[i]==5)
1941         alloc_reg64(current,i,rs1[i]); // DMTC1
1942       else
1943         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1944       alloc_reg_temp(current,i,-1);
1945     }
1946     else {
1947       current->u&=~1LL;
1948       alloc_reg(current,i,0);
1949       alloc_reg_temp(current,i,-1);
1950     }
1951   }
1952   minimum_free_regs[i]=1;
1953 }
1954 void fconv_alloc(struct regstat *current,int i)
1955 {
1956   alloc_reg(current,i,CSREG); // Load status
1957   alloc_reg_temp(current,i,-1);
1958   minimum_free_regs[i]=1;
1959 }
1960 void float_alloc(struct regstat *current,int i)
1961 {
1962   alloc_reg(current,i,CSREG); // Load status
1963   alloc_reg_temp(current,i,-1);
1964   minimum_free_regs[i]=1;
1965 }
1966 void c2op_alloc(struct regstat *current,int i)
1967 {
1968   alloc_reg_temp(current,i,-1);
1969 }
1970 void fcomp_alloc(struct regstat *current,int i)
1971 {
1972   alloc_reg(current,i,CSREG); // Load status
1973   alloc_reg(current,i,FSREG); // Load flags
1974   dirty_reg(current,FSREG); // Flag will be modified
1975   alloc_reg_temp(current,i,-1);
1976   minimum_free_regs[i]=1;
1977 }
1978
1979 void syscall_alloc(struct regstat *current,int i)
1980 {
1981   alloc_cc(current,i);
1982   dirty_reg(current,CCREG);
1983   alloc_all(current,i);
1984   minimum_free_regs[i]=HOST_REGS;
1985   current->isconst=0;
1986 }
1987
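// Allocate registers for the instruction in a branch delay slot by
// dispatching on its type.  A jump in the delay slot is not supported:
// stop_after_jal is set and speculative precompilation is disabled.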
1988 void delayslot_alloc(struct regstat *current,int i)
1989 {
1990   switch(itype[i]) {
1991     case UJUMP:
1992     case CJUMP:
1993     case SJUMP:
1994     case RJUMP:
1995     case FJUMP:
1996     case SYSCALL:
1997     case HLECALL:
1998     case SPAN:
1999       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
2000       SysPrintf("Disabled speculative precompilation\n");
2001       stop_after_jal=1;
2002       break;
2003     case IMM16:
2004       imm16_alloc(current,i);
2005       break;
2006     case LOAD:
2007     case LOADLR:
2008       load_alloc(current,i);
2009       break;
2010     case STORE:
2011     case STORELR:
2012       store_alloc(current,i);
2013       break;
2014     case ALU:
2015       alu_alloc(current,i);
2016       break;
2017     case SHIFT:
2018       shift_alloc(current,i);
2019       break;
2020     case MULTDIV:
2021       multdiv_alloc(current,i);
2022       break;
2023     case SHIFTIMM:
2024       shiftimm_alloc(current,i);
2025       break;
2026     case MOV:
2027       mov_alloc(current,i);
2028       break;
2029     case COP0:
2030       cop0_alloc(current,i);
2031       break;
2032     case COP1:
2033     case COP2:
2034       cop1_alloc(current,i);
2035       break;
2036     case C1LS:
2037       c1ls_alloc(current,i);
2038       break;
2039     case C2LS:
2040       c2ls_alloc(current,i);
2041       break;
2042     case FCONV:
2043       fconv_alloc(current,i);
2044       break;
2045     case FLOAT:
2046       float_alloc(current,i);
2047       break;
2048     case FCOMP:
2049       fcomp_alloc(current,i);
2050       break;
2051     case C2OP:
2052       c2op_alloc(current,i);
2053       break;
2054   }
2055 }
2056
2057 // Special case where a branch and delay slot span two pages in virtual memory
2058 static void pagespan_alloc(struct regstat *current,int i)
2059 {
2060   current->isconst=0;
2061   current->wasconst=0;
2062   regs[i].wasconst=0;
2063   minimum_free_regs[i]=HOST_REGS;
2064   alloc_all(current,i);
2065   alloc_cc(current,i);
2066   dirty_reg(current,CCREG);
2067   if(opcode[i]==3) // JAL
2068   {
2069     alloc_reg(current,i,31);
2070     dirty_reg(current,31);
2071   }
2072   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
2073   {
2074     alloc_reg(current,i,rs1[i]);
2075     if (rt1[i]!=0) {
2076       alloc_reg(current,i,rt1[i]);
2077       dirty_reg(current,rt1[i]);
2078     }
2079   }
2080   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
2081   {
2082     if(rs1[i]) alloc_reg(current,i,rs1[i]);
2083     if(rs2[i]) alloc_reg(current,i,rs2[i]);
2084     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
2085     {
2086       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
2087       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
2088     }
2089   }
2090   else
2091   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
2092   {
2093     if(rs1[i]) alloc_reg(current,i,rs1[i]);
2094     if(!((current->is32>>rs1[i])&1))
2095     {
2096       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
2097     }
2098   }
2099   else
2100   if(opcode[i]==0x11) // BC1
2101   {
2102     alloc_reg(current,i,FSREG);
2103     alloc_reg(current,i,CSREG);
2104   }
2105   //else ...
2106 }
2107
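// Queue an out-of-line stub (slow path) to be emitted after the main block.
// 'type' selects the handler, 'addr' is the location of the branch to patch,
// 'retaddr' is where the stub returns to, and a..e are type-specific
// arguments (for the memory stubs: instruction index, address register,
// regstat pointer, cycle adjustment and register list).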
2108 void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
2109 {
2110   stubs[stubcount][0]=type;
2111   stubs[stubcount][1]=addr;
2112   stubs[stubcount][2]=retaddr;
2113   stubs[stubcount][3]=a;
2114   stubs[stubcount][4]=b;
2115   stubs[stubcount][5]=c;
2116   stubs[stubcount][6]=d;
2117   stubs[stubcount][7]=e;
2118   stubcount++;
2119 }
2120
2121 // Write out a single register
2122 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
2123 {
2124   int hr;
2125   for(hr=0;hr<HOST_REGS;hr++) {
2126     if(hr!=EXCLUDE_REG) {
2127       if((regmap[hr]&63)==r) {
2128         if((dirty>>hr)&1) {
2129           if(regmap[hr]<64) {
2130             emit_storereg(r,hr);
2131 #ifndef FORCE32
2132             if((is32>>regmap[hr])&1) {
2133               emit_sarimm(hr,31,hr);
2134               emit_storereg(r|64,hr);
2135             }
2136 #endif
2137           }else{
2138             emit_storereg(r|64,hr);
2139           }
2140         }
2141       }
2142     }
2143   }
2144 }
2145
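// Debug/trace helpers: checksums over RDRAM and the register file, register
// dumps, and the memdebug hook that the commented-out blocks in the
// assemblers can call from generated code.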
2146 int mchecksum()
2147 {
2148   //if(!tracedebug) return 0;
2149   int i;
2150   int sum=0;
2151   for(i=0;i<2097152;i++) {
2152     unsigned int temp=sum;
2153     sum<<=1;
2154     sum|=(~temp)>>31;
2155     sum^=((u_int *)rdram)[i];
2156   }
2157   return sum;
2158 }
2159 int rchecksum()
2160 {
2161   int i;
2162   int sum=0;
2163   for(i=0;i<64;i++)
2164     sum^=((u_int *)reg)[i];
2165   return sum;
2166 }
2167 void rlist()
2168 {
2169   int i;
2170   printf("TRACE: ");
2171   for(i=0;i<32;i++)
2172     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
2173   printf("\n");
2174 #ifndef DISABLE_COP1
2175   printf("TRACE: ");
2176   for(i=0;i<32;i++)
2177     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
2178   printf("\n");
2179 #endif
2180 }
2181
2182 void enabletrace()
2183 {
2184   tracedebug=1;
2185 }
2186
2187 void memdebug(int i)
2188 {
2189   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
2190   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
2191   //rlist();
2192   //if(tracedebug) {
2193   //if(Count>=-2084597794) {
2194   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
2195   //if(0) {
2196     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
2197     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
2198     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
2199     rlist();
2200     #ifdef __i386__
2201     printf("TRACE: %x\n",(&i)[-1]);
2202     #endif
2203     #ifdef __arm__
2204     int j;
2205     printf("TRACE: %x \n",(&j)[10]);
2206     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2207     #endif
2208     //fflush(stdout);
2209   }
2210   //printf("TRACE: %x\n",(&i)[-1]);
2211 }
2212
2213 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2214 {
2215   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2216 }
2217
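// Emit code for the SPECIAL-opcode ALU operations, selected by the function
// field (opcode2): ADD/ADDU/SUB/SUBU, DADD/DADDU/DSUB/DSUBU, SLT/SLTU and
// AND/OR/XOR/NOR.  An r0 operand is handled as a move, negation or constant
// rather than by loading the zero register.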
2218 void alu_assemble(int i,struct regstat *i_regs)
2219 {
2220   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2221     if(rt1[i]) {
2222       signed char s1,s2,t;
2223       t=get_reg(i_regs->regmap,rt1[i]);
2224       if(t>=0) {
2225         s1=get_reg(i_regs->regmap,rs1[i]);
2226         s2=get_reg(i_regs->regmap,rs2[i]);
2227         if(rs1[i]&&rs2[i]) {
2228           assert(s1>=0);
2229           assert(s2>=0);
2230           if(opcode2[i]&2) emit_sub(s1,s2,t);
2231           else emit_add(s1,s2,t);
2232         }
2233         else if(rs1[i]) {
2234           if(s1>=0) emit_mov(s1,t);
2235           else emit_loadreg(rs1[i],t);
2236         }
2237         else if(rs2[i]) {
2238           if(s2>=0) {
2239             if(opcode2[i]&2) emit_neg(s2,t);
2240             else emit_mov(s2,t);
2241           }
2242           else {
2243             emit_loadreg(rs2[i],t);
2244             if(opcode2[i]&2) emit_neg(t,t);
2245           }
2246         }
2247         else emit_zeroreg(t);
2248       }
2249     }
2250   }
2251   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2252     if(rt1[i]) {
2253       signed char s1l,s2l,s1h,s2h,tl,th;
2254       tl=get_reg(i_regs->regmap,rt1[i]);
2255       th=get_reg(i_regs->regmap,rt1[i]|64);
2256       if(tl>=0) {
2257         s1l=get_reg(i_regs->regmap,rs1[i]);
2258         s2l=get_reg(i_regs->regmap,rs2[i]);
2259         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2260         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2261         if(rs1[i]&&rs2[i]) {
2262           assert(s1l>=0);
2263           assert(s2l>=0);
2264           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2265           else emit_adds(s1l,s2l,tl);
2266           if(th>=0) {
2267             #ifdef INVERTED_CARRY
2268             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2269             #else
2270             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2271             #endif
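            // CHECK: should this be an add-with-carry?  emit_adds above sets
            // the carry from the low word of the 64-bit addition.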
2272             else emit_add(s1h,s2h,th);
2273           }
2274         }
2275         else if(rs1[i]) {
2276           if(s1l>=0) emit_mov(s1l,tl);
2277           else emit_loadreg(rs1[i],tl);
2278           if(th>=0) {
2279             if(s1h>=0) emit_mov(s1h,th);
2280             else emit_loadreg(rs1[i]|64,th);
2281           }
2282         }
2283         else if(rs2[i]) {
2284           if(s2l>=0) {
2285             if(opcode2[i]&2) emit_negs(s2l,tl);
2286             else emit_mov(s2l,tl);
2287           }
2288           else {
2289             emit_loadreg(rs2[i],tl);
2290             if(opcode2[i]&2) emit_negs(tl,tl);
2291           }
2292           if(th>=0) {
2293             #ifdef INVERTED_CARRY
2294             if(s2h>=0) emit_mov(s2h,th);
2295             else emit_loadreg(rs2[i]|64,th);
2296             if(opcode2[i]&2) {
2297               emit_adcimm(-1,th); // x86 has inverted carry flag
2298               emit_not(th,th);
2299             }
2300             #else
2301             if(opcode2[i]&2) {
2302               if(s2h>=0) emit_rscimm(s2h,0,th);
2303               else {
2304                 emit_loadreg(rs2[i]|64,th);
2305                 emit_rscimm(th,0,th);
2306               }
2307             }else{
2308               if(s2h>=0) emit_mov(s2h,th);
2309               else emit_loadreg(rs2[i]|64,th);
2310             }
2311             #endif
2312           }
2313         }
2314         else {
2315           emit_zeroreg(tl);
2316           if(th>=0) emit_zeroreg(th);
2317         }
2318       }
2319     }
2320   }
2321   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2322     if(rt1[i]) {
2323       signed char s1l,s1h,s2l,s2h,t;
2324       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2325       {
2326         t=get_reg(i_regs->regmap,rt1[i]);
2327         //assert(t>=0);
2328         if(t>=0) {
2329           s1l=get_reg(i_regs->regmap,rs1[i]);
2330           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2331           s2l=get_reg(i_regs->regmap,rs2[i]);
2332           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2333           if(rs2[i]==0) // rx<r0
2334           {
2335             assert(s1h>=0);
2336             if(opcode2[i]==0x2a) // SLT
2337               emit_shrimm(s1h,31,t);
2338             else // SLTU (unsigned cannot be less than zero)
2339               emit_zeroreg(t);
2340           }
2341           else if(rs1[i]==0) // r0<rx
2342           {
2343             assert(s2h>=0);
2344             if(opcode2[i]==0x2a) // SLT
2345               emit_set_gz64_32(s2h,s2l,t);
2346             else // SLTU (set if not zero)
2347               emit_set_nz64_32(s2h,s2l,t);
2348           }
2349           else {
2350             assert(s1l>=0);assert(s1h>=0);
2351             assert(s2l>=0);assert(s2h>=0);
2352             if(opcode2[i]==0x2a) // SLT
2353               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2354             else // SLTU
2355               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2356           }
2357         }
2358       } else {
2359         t=get_reg(i_regs->regmap,rt1[i]);
2360         //assert(t>=0);
2361         if(t>=0) {
2362           s1l=get_reg(i_regs->regmap,rs1[i]);
2363           s2l=get_reg(i_regs->regmap,rs2[i]);
2364           if(rs2[i]==0) // rx<r0
2365           {
2366             assert(s1l>=0);
2367             if(opcode2[i]==0x2a) // SLT
2368               emit_shrimm(s1l,31,t);
2369             else // SLTU (unsigned cannot be less than zero)
2370               emit_zeroreg(t);
2371           }
2372           else if(rs1[i]==0) // r0<rx
2373           {
2374             assert(s2l>=0);
2375             if(opcode2[i]==0x2a) // SLT
2376               emit_set_gz32(s2l,t);
2377             else // SLTU (set if not zero)
2378               emit_set_nz32(s2l,t);
2379           }
2380           else{
2381             assert(s1l>=0);assert(s2l>=0);
2382             if(opcode2[i]==0x2a) // SLT
2383               emit_set_if_less32(s1l,s2l,t);
2384             else // SLTU
2385               emit_set_if_carry32(s1l,s2l,t);
2386           }
2387         }
2388       }
2389     }
2390   }
2391   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2392     if(rt1[i]) {
2393       signed char s1l,s1h,s2l,s2h,th,tl;
2394       tl=get_reg(i_regs->regmap,rt1[i]);
2395       th=get_reg(i_regs->regmap,rt1[i]|64);
2396       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2397       {
2398         assert(tl>=0);
2399         if(tl>=0) {
2400           s1l=get_reg(i_regs->regmap,rs1[i]);
2401           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2402           s2l=get_reg(i_regs->regmap,rs2[i]);
2403           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2404           if(rs1[i]&&rs2[i]) {
2405             assert(s1l>=0);assert(s1h>=0);
2406             assert(s2l>=0);assert(s2h>=0);
2407             if(opcode2[i]==0x24) { // AND
2408               emit_and(s1l,s2l,tl);
2409               emit_and(s1h,s2h,th);
2410             } else
2411             if(opcode2[i]==0x25) { // OR
2412               emit_or(s1l,s2l,tl);
2413               emit_or(s1h,s2h,th);
2414             } else
2415             if(opcode2[i]==0x26) { // XOR
2416               emit_xor(s1l,s2l,tl);
2417               emit_xor(s1h,s2h,th);
2418             } else
2419             if(opcode2[i]==0x27) { // NOR
2420               emit_or(s1l,s2l,tl);
2421               emit_or(s1h,s2h,th);
2422               emit_not(tl,tl);
2423               emit_not(th,th);
2424             }
2425           }
2426           else
2427           {
2428             if(opcode2[i]==0x24) { // AND
2429               emit_zeroreg(tl);
2430               emit_zeroreg(th);
2431             } else
2432             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2433               if(rs1[i]){
2434                 if(s1l>=0) emit_mov(s1l,tl);
2435                 else emit_loadreg(rs1[i],tl);
2436                 if(s1h>=0) emit_mov(s1h,th);
2437                 else emit_loadreg(rs1[i]|64,th);
2438               }
2439               else
2440               if(rs2[i]){
2441                 if(s2l>=0) emit_mov(s2l,tl);
2442                 else emit_loadreg(rs2[i],tl);
2443                 if(s2h>=0) emit_mov(s2h,th);
2444                 else emit_loadreg(rs2[i]|64,th);
2445               }
2446               else{
2447                 emit_zeroreg(tl);
2448                 emit_zeroreg(th);
2449               }
2450             } else
2451             if(opcode2[i]==0x27) { // NOR
2452               if(rs1[i]){
2453                 if(s1l>=0) emit_not(s1l,tl);
2454                 else{
2455                   emit_loadreg(rs1[i],tl);
2456                   emit_not(tl,tl);
2457                 }
2458                 if(s1h>=0) emit_not(s1h,th);
2459                 else{
2460                   emit_loadreg(rs1[i]|64,th);
2461                   emit_not(th,th);
2462                 }
2463               }
2464               else
2465               if(rs2[i]){
2466                 if(s2l>=0) emit_not(s2l,tl);
2467                 else{
2468                   emit_loadreg(rs2[i],tl);
2469                   emit_not(tl,tl);
2470                 }
2471                 if(s2h>=0) emit_not(s2h,th);
2472                 else{
2473                   emit_loadreg(rs2[i]|64,th);
2474                   emit_not(th,th);
2475                 }
2476               }
2477               else {
2478                 emit_movimm(-1,tl);
2479                 emit_movimm(-1,th);
2480               }
2481             }
2482           }
2483         }
2484       }
2485       else
2486       {
2487         // 32 bit
2488         if(tl>=0) {
2489           s1l=get_reg(i_regs->regmap,rs1[i]);
2490           s2l=get_reg(i_regs->regmap,rs2[i]);
2491           if(rs1[i]&&rs2[i]) {
2492             assert(s1l>=0);
2493             assert(s2l>=0);
2494             if(opcode2[i]==0x24) { // AND
2495               emit_and(s1l,s2l,tl);
2496             } else
2497             if(opcode2[i]==0x25) { // OR
2498               emit_or(s1l,s2l,tl);
2499             } else
2500             if(opcode2[i]==0x26) { // XOR
2501               emit_xor(s1l,s2l,tl);
2502             } else
2503             if(opcode2[i]==0x27) { // NOR
2504               emit_or(s1l,s2l,tl);
2505               emit_not(tl,tl);
2506             }
2507           }
2508           else
2509           {
2510             if(opcode2[i]==0x24) { // AND
2511               emit_zeroreg(tl);
2512             } else
2513             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2514               if(rs1[i]){
2515                 if(s1l>=0) emit_mov(s1l,tl);
2516                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2517               }
2518               else
2519               if(rs2[i]){
2520                 if(s2l>=0) emit_mov(s2l,tl);
2521                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2522               }
2523               else emit_zeroreg(tl);
2524             } else
2525             if(opcode2[i]==0x27) { // NOR
2526               if(rs1[i]){
2527                 if(s1l>=0) emit_not(s1l,tl);
2528                 else {
2529                   emit_loadreg(rs1[i],tl);
2530                   emit_not(tl,tl);
2531                 }
2532               }
2533               else
2534               if(rs2[i]){
2535                 if(s2l>=0) emit_not(s2l,tl);
2536                 else {
2537                   emit_loadreg(rs2[i],tl);
2538                   emit_not(tl,tl);
2539                 }
2540               }
2541               else emit_movimm(-1,tl);
2542             }
2543           }
2544         }
2545       }
2546     }
2547   }
2548 }
2549
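// Emit code for the immediate-type instructions set up by imm16_alloc:
// LUI, ADDI/ADDIU, DADDI/DADDIU, SLTI/SLTIU and ANDI/ORI/XORI.  When the
// source register holds a known constant (constmap), the result is folded
// at compile time and emitted as a single movimm.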
2550 void imm16_assemble(int i,struct regstat *i_regs)
2551 {
2552   if (opcode[i]==0x0f) { // LUI
2553     if(rt1[i]) {
2554       signed char t;
2555       t=get_reg(i_regs->regmap,rt1[i]);
2556       //assert(t>=0);
2557       if(t>=0) {
2558         if(!((i_regs->isconst>>t)&1))
2559           emit_movimm(imm[i]<<16,t);
2560       }
2561     }
2562   }
2563   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2564     if(rt1[i]) {
2565       signed char s,t;
2566       t=get_reg(i_regs->regmap,rt1[i]);
2567       s=get_reg(i_regs->regmap,rs1[i]);
2568       if(rs1[i]) {
2569         //assert(t>=0);
2570         //assert(s>=0);
2571         if(t>=0) {
2572           if(!((i_regs->isconst>>t)&1)) {
2573             if(s<0) {
2574               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2575               emit_addimm(t,imm[i],t);
2576             }else{
2577               if(!((i_regs->wasconst>>s)&1))
2578                 emit_addimm(s,imm[i],t);
2579               else
2580                 emit_movimm(constmap[i][s]+imm[i],t);
2581             }
2582           }
2583         }
2584       } else {
2585         if(t>=0) {
2586           if(!((i_regs->isconst>>t)&1))
2587             emit_movimm(imm[i],t);
2588         }
2589       }
2590     }
2591   }
2592   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2593     if(rt1[i]) {
2594       signed char sh,sl,th,tl;
2595       th=get_reg(i_regs->regmap,rt1[i]|64);
2596       tl=get_reg(i_regs->regmap,rt1[i]);
2597       sh=get_reg(i_regs->regmap,rs1[i]|64);
2598       sl=get_reg(i_regs->regmap,rs1[i]);
2599       if(tl>=0) {
2600         if(rs1[i]) {
2601           assert(sh>=0);
2602           assert(sl>=0);
2603           if(th>=0) {
2604             emit_addimm64_32(sh,sl,imm[i],th,tl);
2605           }
2606           else {
2607             emit_addimm(sl,imm[i],tl);
2608           }
2609         } else {
2610           emit_movimm(imm[i],tl);
2611           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2612         }
2613       }
2614     }
2615   }
2616   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2617     if(rt1[i]) {
2618       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2619       signed char sh,sl,t;
2620       t=get_reg(i_regs->regmap,rt1[i]);
2621       sh=get_reg(i_regs->regmap,rs1[i]|64);
2622       sl=get_reg(i_regs->regmap,rs1[i]);
2623       //assert(t>=0);
2624       if(t>=0) {
2625         if(rs1[i]>0) {
2626           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2627           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2628             if(opcode[i]==0x0a) { // SLTI
2629               if(sl<0) {
2630                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2631                 emit_slti32(t,imm[i],t);
2632               }else{
2633                 emit_slti32(sl,imm[i],t);
2634               }
2635             }
2636             else { // SLTIU
2637               if(sl<0) {
2638                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2639                 emit_sltiu32(t,imm[i],t);
2640               }else{
2641                 emit_sltiu32(sl,imm[i],t);
2642               }
2643             }
2644           }else{ // 64-bit
2645             assert(sl>=0);
2646             if(opcode[i]==0x0a) // SLTI
2647               emit_slti64_32(sh,sl,imm[i],t);
2648             else // SLTIU
2649               emit_sltiu64_32(sh,sl,imm[i],t);
2650           }
2651         }else{
2652           // SLTI(U) with r0 as the source is pointless,
2653           // but it does show up in real code
2654           if(opcode[i]==0x0a) { // SLTI
2655             if(0<imm[i]) emit_movimm(1,t);
2656             else emit_zeroreg(t);
2657           }
2658           else { // SLTIU
2659             if(imm[i]) emit_movimm(1,t);
2660             else emit_zeroreg(t);
2661           }
2662         }
2663       }
2664     }
2665   }
2666   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2667     if(rt1[i]) {
2668       signed char sh,sl,th,tl;
2669       th=get_reg(i_regs->regmap,rt1[i]|64);
2670       tl=get_reg(i_regs->regmap,rt1[i]);
2671       sh=get_reg(i_regs->regmap,rs1[i]|64);
2672       sl=get_reg(i_regs->regmap,rs1[i]);
2673       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2674         if(opcode[i]==0x0c) //ANDI
2675         {
2676           if(rs1[i]) {
2677             if(sl<0) {
2678               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2679               emit_andimm(tl,imm[i],tl);
2680             }else{
2681               if(!((i_regs->wasconst>>sl)&1))
2682                 emit_andimm(sl,imm[i],tl);
2683               else
2684                 emit_movimm(constmap[i][sl]&imm[i],tl);
2685             }
2686           }
2687           else
2688             emit_zeroreg(tl);
2689           if(th>=0) emit_zeroreg(th);
2690         }
2691         else
2692         {
2693           if(rs1[i]) {
2694             if(sl<0) {
2695               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2696             }
2697             if(th>=0) {
2698               if(sh<0) {
2699                 emit_loadreg(rs1[i]|64,th);
2700               }else{
2701                 emit_mov(sh,th);
2702               }
2703             }
2704             if(opcode[i]==0x0d) //ORI
2705             if(sl<0) {
2706               emit_orimm(tl,imm[i],tl);
2707             }else{
2708               if(!((i_regs->wasconst>>sl)&1))
2709                 emit_orimm(sl,imm[i],tl);
2710               else
2711                 emit_movimm(constmap[i][sl]|imm[i],tl);
2712             }
2713             if(opcode[i]==0x0e) //XORI
2714             if(sl<0) {
2715               emit_xorimm(tl,imm[i],tl);
2716             }else{
2717               if(!((i_regs->wasconst>>sl)&1))
2718                 emit_xorimm(sl,imm[i],tl);
2719               else
2720                 emit_movimm(constmap[i][sl]^imm[i],tl);
2721             }
2722           }
2723           else {
2724             emit_movimm(imm[i],tl);
2725             if(th>=0) emit_zeroreg(th);
2726           }
2727         }
2728       }
2729     }
2730   }
2731 }
2732
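// Shift-by-immediate: SLL/SRL/SRA and the doubleword DSLL/DSRL/DSRA and
// DSLL32/DSRL32/DSRA32 forms.  A shift by zero degenerates into a register
// move, and shifting r0 just zeroes the target.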
2733 void shiftimm_assemble(int i,struct regstat *i_regs)
2734 {
2735   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2736   {
2737     if(rt1[i]) {
2738       signed char s,t;
2739       t=get_reg(i_regs->regmap,rt1[i]);
2740       s=get_reg(i_regs->regmap,rs1[i]);
2741       //assert(t>=0);
2742       if(t>=0&&!((i_regs->isconst>>t)&1)){
2743         if(rs1[i]==0)
2744         {
2745           emit_zeroreg(t);
2746         }
2747         else
2748         {
2749           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2750           if(imm[i]) {
2751             if(opcode2[i]==0) // SLL
2752             {
2753               emit_shlimm(s<0?t:s,imm[i],t);
2754             }
2755             if(opcode2[i]==2) // SRL
2756             {
2757               emit_shrimm(s<0?t:s,imm[i],t);
2758             }
2759             if(opcode2[i]==3) // SRA
2760             {
2761               emit_sarimm(s<0?t:s,imm[i],t);
2762             }
2763           }else{
2764             // Shift by zero
2765             if(s>=0 && s!=t) emit_mov(s,t);
2766           }
2767         }
2768       }
2769       //emit_storereg(rt1[i],t); //DEBUG
2770     }
2771   }
2772   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2773   {
2774     if(rt1[i]) {
2775       signed char sh,sl,th,tl;
2776       th=get_reg(i_regs->regmap,rt1[i]|64);
2777       tl=get_reg(i_regs->regmap,rt1[i]);
2778       sh=get_reg(i_regs->regmap,rs1[i]|64);
2779       sl=get_reg(i_regs->regmap,rs1[i]);
2780       if(tl>=0) {
2781         if(rs1[i]==0)
2782         {
2783           emit_zeroreg(tl);
2784           if(th>=0) emit_zeroreg(th);
2785         }
2786         else
2787         {
2788           assert(sl>=0);
2789           assert(sh>=0);
2790           if(imm[i]) {
2791             if(opcode2[i]==0x38) // DSLL
2792             {
2793               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2794               emit_shlimm(sl,imm[i],tl);
2795             }
2796             if(opcode2[i]==0x3a) // DSRL
2797             {
2798               emit_shrdimm(sl,sh,imm[i],tl);
2799               if(th>=0) emit_shrimm(sh,imm[i],th);
2800             }
2801             if(opcode2[i]==0x3b) // DSRA
2802             {
2803               emit_shrdimm(sl,sh,imm[i],tl);
2804               if(th>=0) emit_sarimm(sh,imm[i],th);
2805             }
2806           }else{
2807             // Shift by zero
2808             if(sl!=tl) emit_mov(sl,tl);
2809             if(th>=0&&sh!=th) emit_mov(sh,th);
2810           }
2811         }
2812       }
2813     }
2814   }
2815   if(opcode2[i]==0x3c) // DSLL32
2816   {
2817     if(rt1[i]) {
2818       signed char sl,tl,th;
2819       tl=get_reg(i_regs->regmap,rt1[i]);
2820       th=get_reg(i_regs->regmap,rt1[i]|64);
2821       sl=get_reg(i_regs->regmap,rs1[i]);
2822       if(th>=0||tl>=0){
2823         assert(tl>=0);
2824         assert(th>=0);
2825         assert(sl>=0);
2826         emit_mov(sl,th);
2827         emit_zeroreg(tl);
2828         if(imm[i]>32)
2829         {
2830           emit_shlimm(th,imm[i]&31,th);
2831         }
2832       }
2833     }
2834   }
2835   if(opcode2[i]==0x3e) // DSRL32
2836   {
2837     if(rt1[i]) {
2838       signed char sh,tl,th;
2839       tl=get_reg(i_regs->regmap,rt1[i]);
2840       th=get_reg(i_regs->regmap,rt1[i]|64);
2841       sh=get_reg(i_regs->regmap,rs1[i]|64);
2842       if(tl>=0){
2843         assert(sh>=0);
2844         emit_mov(sh,tl);
2845         if(th>=0) emit_zeroreg(th);
2846         if(imm[i]>32)
2847         {
2848           emit_shrimm(tl,imm[i]&31,tl);
2849         }
2850       }
2851     }
2852   }
2853   if(opcode2[i]==0x3f) // DSRA32
2854   {
2855     if(rt1[i]) {
2856       signed char sh,tl;
2857       tl=get_reg(i_regs->regmap,rt1[i]);
2858       sh=get_reg(i_regs->regmap,rs1[i]|64);
2859       if(tl>=0){
2860         assert(sh>=0);
2861         emit_mov(sh,tl);
2862         if(imm[i]>32)
2863         {
2864           emit_sarimm(tl,imm[i]&31,tl);
2865         }
2866       }
2867     }
2868   }
2869 }
2870
2871 #ifndef shift_assemble
2872 void shift_assemble(int i,struct regstat *i_regs)
2873 {
2874   printf("Need shift_assemble for this architecture.\n");
2875   exit(1);
2876 }
2877 #endif
2878
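// Emit code for loads.  The common case is an inline fast path into RAM
// with a conditional branch to an out-of-line stub (add_stub) for I/O and
// invalid addresses.  When the address is a known constant, the access is
// emitted directly if it targets memory, or replaced by inline_readstub
// if it does not.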
2879 void load_assemble(int i,struct regstat *i_regs)
2880 {
2881   int s,th,tl,addr,map=-1;
2882   int offset;
2883   int jaddr=0;
2884   int memtarget=0,c=0;
2885   int fastload_reg_override=0;
2886   u_int hr,reglist=0;
2887   th=get_reg(i_regs->regmap,rt1[i]|64);
2888   tl=get_reg(i_regs->regmap,rt1[i]);
2889   s=get_reg(i_regs->regmap,rs1[i]);
2890   offset=imm[i];
2891   for(hr=0;hr<HOST_REGS;hr++) {
2892     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2893   }
2894   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2895   if(s>=0) {
2896     c=(i_regs->wasconst>>s)&1;
2897     if (c) {
2898       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2899       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2900     }
2901   }
2902   //printf("load_assemble: c=%d\n",c);
2903   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2904   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2905 #ifdef PCSX
2906   if(tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80)
2907     ||rt1[i]==0) {
2908       // the target is unneeded but the address may be a hardware FIFO,
2909       // so the read must still be performed (also covers dummy reads to r0)
2910       assem_debug("(forced read)\n");
2911       tl=get_reg(i_regs->regmap,-1);
2912       assert(tl>=0);
2913   }
2914 #endif
2915   if(offset||s<0||c) addr=tl;
2916   else addr=s;
2917   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2918  if(tl>=0) {
2919   //printf("load_assemble: c=%d\n",c);
2920   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2921   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2922   reglist&=~(1<<tl);
2923   if(th>=0) reglist&=~(1<<th);
2924   if(!using_tlb) {
2925     if(!c) {
2926       #ifdef RAM_OFFSET
2927       map=get_reg(i_regs->regmap,ROREG);
2928       if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2929       #endif
2930 //#define R29_HACK 1
2931       #ifdef R29_HACK
2932       // Strmnnrmn's speed hack
2933       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2934       #endif
2935       {
2936         jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
2937       }
2938     }
2939     else if(ram_offset&&memtarget) {
2940       emit_addimm(addr,ram_offset,HOST_TEMPREG);
2941       fastload_reg_override=HOST_TEMPREG;
2942     }
2943   }else{ // using tlb
2944     int x=0;
2945     if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2946     if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2947     map=get_reg(i_regs->regmap,TLREG);
2948     assert(map>=0);
2949     reglist&=~(1<<map);
2950     map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2951     do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2952   }
2953   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2954   if (opcode[i]==0x20) { // LB
2955     if(!c||memtarget) {
2956       if(!dummy) {
2957         #ifdef HOST_IMM_ADDR32
2958         if(c)
2959           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2960         else
2961         #endif
2962         {
2963           //emit_xorimm(addr,3,tl);
2964           //gen_tlb_addr_r(tl,map);
2965           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2966           int x=0,a=tl;
2967 #ifdef BIG_ENDIAN_MIPS
2968           if(!c) emit_xorimm(addr,3,tl);
2969           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2970 #else
2971           if(!c) a=addr;
2972 #endif
2973           if(fastload_reg_override) a=fastload_reg_override;
2974
2975           emit_movsbl_indexed_tlb(x,a,map,tl);
2976         }
2977       }
2978       if(jaddr)
2979         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2980     }
2981     else
2982       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2983   }
2984   if (opcode[i]==0x21) { // LH
2985     if(!c||memtarget) {
2986       if(!dummy) {
2987         #ifdef HOST_IMM_ADDR32
2988         if(c)
2989           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2990         else
2991         #endif
2992         {
2993           int x=0,a=tl;
2994 #ifdef BIG_ENDIAN_MIPS
2995           if(!c) emit_xorimm(addr,2,tl);
2996           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2997 #else
2998           if(!c) a=addr;
2999 #endif
3000           if(fastload_reg_override) a=fastload_reg_override;
3001           //#ifdef
3002           //emit_movswl_indexed_tlb(x,tl,map,tl);
3003           //else
3004           if(map>=0) {
3005             gen_tlb_addr_r(a,map);
3006             emit_movswl_indexed(x,a,tl);
3007           }else{
3008             #if 1 //def RAM_OFFSET
3009             emit_movswl_indexed(x,a,tl);
3010             #else
3011             emit_movswl_indexed((int)rdram-0x80000000+x,a,tl);
3012             #endif
3013           }
3014         }
3015       }
3016       if(jaddr)
3017         add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3018     }
3019     else
3020       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3021   }
3022   if (opcode[i]==0x23) { // LW
3023     if(!c||memtarget) {
3024       if(!dummy) {
3025         int a=addr;
3026         if(fastload_reg_override) a=fastload_reg_override;
3027         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
3028         #ifdef HOST_IMM_ADDR32
3029         if(c)
3030           emit_readword_tlb(constmap[i][s]+offset,map,tl);
3031         else
3032         #endif
3033         emit_readword_indexed_tlb(0,a,map,tl);
3034       }
3035       if(jaddr)
3036         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3037     }
3038     else
3039       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3040   }
3041   if (opcode[i]==0x24) { // LBU
3042     if(!c||memtarget) {
3043       if(!dummy) {
3044         #ifdef HOST_IMM_ADDR32
3045         if(c)
3046           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
3047         else
3048         #endif
3049         {
3050           //emit_xorimm(addr,3,tl);
3051           //gen_tlb_addr_r(tl,map);
3052           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
3053           int x=0,a=tl;
3054 #ifdef BIG_ENDIAN_MIPS
3055           if(!c) emit_xorimm(addr,3,tl);
3056           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3057 #else
3058           if(!c) a=addr;
3059 #endif
3060           if(fastload_reg_override) a=fastload_reg_override;
3061
3062           emit_movzbl_indexed_tlb(x,a,map,tl);
3063         }
3064       }
3065       if(jaddr)
3066         add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3067     }
3068     else
3069       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3070   }
3071   if (opcode[i]==0x25) { // LHU
3072     if(!c||memtarget) {
3073       if(!dummy) {
3074         #ifdef HOST_IMM_ADDR32
3075         if(c)
3076           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
3077         else
3078         #endif
3079         {
3080           int x=0,a=tl;
3081 #ifdef BIG_ENDIAN_MIPS
3082           if(!c) emit_xorimm(addr,2,tl);
3083           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3084 #else
3085           if(!c) a=addr;
3086 #endif
3087           if(fastload_reg_override) a=fastload_reg_override;
3088           //#ifdef
3089           //emit_movzwl_indexed_tlb(x,tl,map,tl);
3090           //#else
3091           if(map>=0) {
3092             gen_tlb_addr_r(a,map);
3093             emit_movzwl_indexed(x,a,tl);
3094           }else{
3095             #if 1 //def RAM_OFFSET
3096             emit_movzwl_indexed(x,a,tl);
3097             #else
3098             emit_movzwl_indexed((int)rdram-0x80000000+x,a,tl);
3099             #endif
3100           }
3101         }
3102       }
3103       if(jaddr)
3104         add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3105     }
3106     else
3107       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3108   }
3109   if (opcode[i]==0x27) { // LWU
3110     assert(th>=0);
3111     if(!c||memtarget) {
3112       if(!dummy) {
3113         int a=addr;
3114         if(fastload_reg_override) a=fastload_reg_override;
3115         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
3116         #ifdef HOST_IMM_ADDR32
3117         if(c)
3118           emit_readword_tlb(constmap[i][s]+offset,map,tl);
3119         else
3120         #endif
3121         emit_readword_indexed_tlb(0,a,map,tl);
3122       }
3123       if(jaddr)
3124         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3125     }
3126     else {
3127       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3128     }
3129     emit_zeroreg(th);
3130   }
3131   if (opcode[i]==0x37) { // LD
3132     if(!c||memtarget) {
3133       if(!dummy) {
3134         int a=addr;
3135         if(fastload_reg_override) a=fastload_reg_override;
3136         //gen_tlb_addr_r(tl,map);
3137         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
3138         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
3139         #ifdef HOST_IMM_ADDR32
3140         if(c)
3141           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3142         else
3143         #endif
3144         emit_readdword_indexed_tlb(0,a,map,th,tl);
3145       }
3146       if(jaddr)
3147         add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3148     }
3149     else
3150       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
3151   }
3152  }
3153   //emit_storereg(rt1[i],tl); // DEBUG
3154   //if(opcode[i]==0x23)
3155   //if(opcode[i]==0x24)
3156   //if(opcode[i]==0x23||opcode[i]==0x24)
3157   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
3158   {
3159     //emit_pusha();
3160     save_regs(0x100f);
3161         emit_readword((int)&last_count,ECX);
3162         #ifdef __i386__
3163         if(get_reg(i_regs->regmap,CCREG)<0)
3164           emit_loadreg(CCREG,HOST_CCREG);
3165         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3166         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3167         emit_writeword(HOST_CCREG,(int)&Count);
3168         #endif
3169         #ifdef __arm__
3170         if(get_reg(i_regs->regmap,CCREG)<0)
3171           emit_loadreg(CCREG,0);
3172         else
3173           emit_mov(HOST_CCREG,0);
3174         emit_add(0,ECX,0);
3175         emit_addimm(0,2*ccadj[i],0);
3176         emit_writeword(0,(int)&Count);
3177         #endif
3178     emit_call((int)memdebug);
3179     //emit_popa();
3180     restore_regs(0x100f);
3181   }/**/
3182 }
3183
3184 #ifndef loadlr_assemble
3185 void loadlr_assemble(int i,struct regstat *i_regs)
3186 {
3187   printf("Need loadlr_assemble for this architecture.\n");
3188   exit(1);
3189 }
3190 #endif
3191
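// Emit code for stores, mirroring load_assemble: an inline fast path with
// a stub for the slow case, plus a check against invalid_code so that
// writes into already-compiled blocks invalidate them.  A constant store
// address that falls inside the current block writes back the state and
// exits through do_interrupt.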
3192 void store_assemble(int i,struct regstat *i_regs)
3193 {
3194   int s,th,tl,map=-1;
3195   int addr,temp;
3196   int offset;
3197   int jaddr=0,jaddr2,type;
3198   int memtarget=0,c=0;
3199   int agr=AGEN1+(i&1);
3200   int faststore_reg_override=0;
3201   u_int hr,reglist=0;
3202   th=get_reg(i_regs->regmap,rs2[i]|64);
3203   tl=get_reg(i_regs->regmap,rs2[i]);
3204   s=get_reg(i_regs->regmap,rs1[i]);
3205   temp=get_reg(i_regs->regmap,agr);
3206   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3207   offset=imm[i];
3208   if(s>=0) {
3209     c=(i_regs->wasconst>>s)&1;
3210     if(c) {
3211       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3212       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3213     }
3214   }
3215   assert(tl>=0);
3216   assert(temp>=0);
3217   for(hr=0;hr<HOST_REGS;hr++) {
3218     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3219   }
3220   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3221   if(offset||s<0||c) addr=temp;
3222   else addr=s;
3223   if(!using_tlb) {
3224     if(!c) {
3225       #ifndef PCSX
3226       #ifdef R29_HACK
3227       // Strmnnrmn's speed hack
3228       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3229       #endif
3230       emit_cmpimm(addr,RAM_SIZE);
3231       #ifdef DESTRUCTIVE_SHIFT
3232       if(s==addr) emit_mov(s,temp);
3233       #endif
3234       #ifdef R29_HACK
3235       memtarget=1;
3236       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3237       #endif
3238       {
3239         jaddr=(int)out;
3240         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
3241         // Hint to branch predictor that the branch is unlikely to be taken
3242         if(rs1[i]>=28)
3243           emit_jno_unlikely(0);
3244         else
3245         #endif
3246         emit_jno(0);
3247       }
3248       #else
3249         jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
3250       #endif
3251     }
3252     else if(ram_offset&&memtarget) {
3253       emit_addimm(addr,ram_offset,HOST_TEMPREG);
3254       faststore_reg_override=HOST_TEMPREG;
3255     }
3256   }else{ // using tlb
3257     int x=0;
3258     if (opcode[i]==0x28) x=3; // SB
3259     if (opcode[i]==0x29) x=2; // SH
3260     map=get_reg(i_regs->regmap,TLREG);
3261     assert(map>=0);
3262     reglist&=~(1<<map);
3263     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
3264     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3265   }
3266
3267   if (opcode[i]==0x28) { // SB
3268     if(!c||memtarget) {
3269       int x=0,a=temp;
3270 #ifdef BIG_ENDIAN_MIPS
3271       if(!c) emit_xorimm(addr,3,temp);
3272       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3273 #else
3274       if(!c) a=addr;
3275 #endif
3276       if(faststore_reg_override) a=faststore_reg_override;
3277       //gen_tlb_addr_w(temp,map);
3278       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
3279       emit_writebyte_indexed_tlb(tl,x,a,map,a);
3280     }
3281     type=STOREB_STUB;
3282   }
3283   if (opcode[i]==0x29) { // SH
3284     if(!c||memtarget) {
3285       int x=0,a=temp;
3286 #ifdef BIG_ENDIAN_MIPS
3287       if(!c) emit_xorimm(addr,2,temp);
3288       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3289 #else
3290       if(!c) a=addr;
3291 #endif
3292       if(faststore_reg_override) a=faststore_reg_override;
3293       //#ifdef
3294       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
3295       //#else
3296       if(map>=0) {
3297         gen_tlb_addr_w(a,map);
3298         emit_writehword_indexed(tl,x,a);
3299       }else
3300         //emit_writehword_indexed(tl,(int)rdram-0x80000000+x,a);
3301         emit_writehword_indexed(tl,x,a);
3302     }
3303     type=STOREH_STUB;
3304   }
3305   if (opcode[i]==0x2B) { // SW
3306     if(!c||memtarget) {
3307       int a=addr;
3308       if(faststore_reg_override) a=faststore_reg_override;
3309       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3310       emit_writeword_indexed_tlb(tl,0,a,map,temp);
3311     }
3312     type=STOREW_STUB;
3313   }
3314   if (opcode[i]==0x3F) { // SD
3315     if(!c||memtarget) {
3316       int a=addr;
3317       if(faststore_reg_override) a=faststore_reg_override;
3318       if(rs2[i]) {
3319         assert(th>=0);
3320         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3321         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3322         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
3323       }else{
3324         // Store zero
3325         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3326         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3327         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
3328       }
3329     }
3330     type=STORED_STUB;
3331   }
3332 #ifdef PCSX
3333   if(jaddr) {
3334     // PCSX store handlers don't check invcode again
3335     reglist|=1<<addr;
3336     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3337     jaddr=0;
3338   }
3339 #endif
3340   if(!using_tlb&&!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3341     if(!c||memtarget) {
3342       #ifdef DESTRUCTIVE_SHIFT
3343       // The x86 shift operation is 'destructive'; it overwrites the
3344       // source register, so we need to make a copy first and use that.
3345       addr=temp;
3346       #endif
3347       #if defined(HOST_IMM8)
3348       int ir=get_reg(i_regs->regmap,INVCP);
3349       assert(ir>=0);
3350       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3351       #else
3352       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3353       #endif
3354       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3355       emit_callne(invalidate_addr_reg[addr]);
3356       #else
3357       jaddr2=(int)out;
3358       emit_jne(0);
3359       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3360       #endif
3361     }
3362   }
3363   u_int addr_val=constmap[i][s]+offset;
3364   if(jaddr) {
3365     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3366   } else if(c&&!memtarget) {
3367     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
3368   }
3369   // Basic detection of stores that modify the current block.
3370   // We don't look backwards, as that code should already be in the MIPS cache.
3371   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
3372     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
3373     assert(i_regs->regmap==regs[i].regmap); // not delay slot
3374     if(i_regs->regmap==regs[i].regmap) {
3375       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
3376       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
3377       emit_movimm(start+i*4+4,0);
3378       emit_writeword(0,(int)&pcaddr);
3379       emit_jmp((int)do_interrupt);
3380     }
3381   }
3382   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3383   //if(opcode[i]==0x2B || opcode[i]==0x28)
3384   //if(opcode[i]==0x2B || opcode[i]==0x29)
3385   //if(opcode[i]==0x2B)
3386   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3387   {
3388     #ifdef __i386__
3389     emit_pusha();
3390     #endif
3391     #ifdef __arm__
3392     save_regs(0x100f);
3393     #endif
3394         emit_readword((int)&last_count,ECX);
3395         #ifdef __i386__
3396         if(get_reg(i_regs->regmap,CCREG)<0)
3397           emit_loadreg(CCREG,HOST_CCREG);
3398         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3399         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3400         emit_writeword(HOST_CCREG,(int)&Count);
3401         #endif
3402         #ifdef __arm__
3403         if(get_reg(i_regs->regmap,CCREG)<0)
3404           emit_loadreg(CCREG,0);
3405         else
3406           emit_mov(HOST_CCREG,0);
3407         emit_add(0,ECX,0);
3408         emit_addimm(0,2*ccadj[i],0);
3409         emit_writeword(0,(int)&Count);
3410         #endif
3411     emit_call((int)memdebug);
3412     #ifdef __i386__
3413     emit_popa();
3414     #endif
3415     #ifdef __arm__
3416     restore_regs(0x100f);
3417     #endif
3418   }/**/
3419 }
3420
3421 void storelr_assemble(int i,struct regstat *i_regs)
3422 {
3423   int s,th,tl;
3424   int temp;
3425   int temp2;
3426   int offset;
3427   int jaddr=0,jaddr2;
3428   int case1,case2,case3;
3429   int done0,done1,done2;
3430   int memtarget=0,c=0;
3431   int agr=AGEN1+(i&1);
3432   u_int hr,reglist=0;
3433   th=get_reg(i_regs->regmap,rs2[i]|64);
3434   tl=get_reg(i_regs->regmap,rs2[i]);
3435   s=get_reg(i_regs->regmap,rs1[i]);
3436   temp=get_reg(i_regs->regmap,agr);
3437   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3438   offset=imm[i];
3439   if(s>=0) {
3440     c=(i_regs->isconst>>s)&1;
3441     if(c) {
3442       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3443       if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3444     }
3445   }
3446   assert(tl>=0);
3447   for(hr=0;hr<HOST_REGS;hr++) {
3448     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3449   }
3450   assert(temp>=0);
3451   if(!using_tlb) {
3452     if(!c) {
3453       emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3454       if(!offset&&s!=temp) emit_mov(s,temp);
3455       jaddr=(int)out;
3456       emit_jno(0);
3457     }
3458     else
3459     {
3460       if(!memtarget||!rs1[i]) {
3461         jaddr=(int)out;
3462         emit_jmp(0);
3463       }
3464     }
3465     #ifdef RAM_OFFSET
3466     int map=get_reg(i_regs->regmap,ROREG);
3467     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3468     gen_tlb_addr_w(temp,map);
3469     #else
3470     if((u_int)rdram!=0x80000000) 
3471       emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3472     #endif
3473   }else{ // using tlb
3474     int map=get_reg(i_regs->regmap,TLREG);
3475     assert(map>=0);
3476     reglist&=~(1<<map);
3477     map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3478     if(!c&&!offset&&s>=0) emit_mov(s,temp);
3479     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3480     if(!jaddr&&!memtarget) {
3481       jaddr=(int)out;
3482       emit_jmp(0);
3483     }
3484     gen_tlb_addr_w(temp,map);
3485   }
3486
3487   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3488     temp2=get_reg(i_regs->regmap,FTEMP);
3489     if(!rs2[i]) temp2=th=tl;
3490   }
3491
3492 #ifndef BIG_ENDIAN_MIPS
3493     emit_xorimm(temp,3,temp);
3494 #endif
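  // Dispatch on the two low address bits: if bit 1 is set take case 2
  // (which then tests bit 0 for case 3), otherwise bit 0 picks case 1 over
  // case 0.  Each case stores only the byte lanes the unaligned
  // SWL/SWR/SDL/SDR covers; the rotate-before-store sequences move the
  // wanted bytes of the source register into position first.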
3495   emit_testimm(temp,2);
3496   case2=(int)out;
3497   emit_jne(0);
3498   emit_testimm(temp,1);
3499   case1=(int)out;
3500   emit_jne(0);
3501   // 0
3502   if (opcode[i]==0x2A) { // SWL
3503     emit_writeword_indexed(tl,0,temp);
3504   }
3505   if (opcode[i]==0x2E) { // SWR
3506     emit_writebyte_indexed(tl,3,temp);
3507   }
3508   if (opcode[i]==0x2C) { // SDL
3509     emit_writeword_indexed(th,0,temp);
3510     if(rs2[i]) emit_mov(tl,temp2);
3511   }
3512   if (opcode[i]==0x2D) { // SDR
3513     emit_writebyte_indexed(tl,3,temp);
3514     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3515   }
3516   done0=(int)out;
3517   emit_jmp(0);
3518   // 1
3519   set_jump_target(case1,(int)out);
3520   if (opcode[i]==0x2A) { // SWL
3521     // Write 3 msb into three least significant bytes
3522     if(rs2[i]) emit_rorimm(tl,8,tl);
3523     emit_writehword_indexed(tl,-1,temp);
3524     if(rs2[i]) emit_rorimm(tl,16,tl);
3525     emit_writebyte_indexed(tl,1,temp);
3526     if(rs2[i]) emit_rorimm(tl,8,tl);
3527   }
3528   if (opcode[i]==0x2E) { // SWR
3529     // Write two lsb into two most significant bytes
3530     emit_writehword_indexed(tl,1,temp);
3531   }
3532   if (opcode[i]==0x2C) { // SDL
3533     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3534     // Write 3 msb into three least significant bytes
3535     if(rs2[i]) emit_rorimm(th,8,th);
3536     emit_writehword_indexed(th,-1,temp);
3537     if(rs2[i]) emit_rorimm(th,16,th);
3538     emit_writebyte_indexed(th,1,temp);
3539     if(rs2[i]) emit_rorimm(th,8,th);
3540   }
3541   if (opcode[i]==0x2D) { // SDR
3542     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3543     // Write two lsb into two most significant bytes
3544     emit_writehword_indexed(tl,1,temp);
3545   }
3546   done1=(int)out;
3547   emit_jmp(0);
3548   // 2
3549   set_jump_target(case2,(int)out);
3550   emit_testimm(temp,1);
3551   case3=(int)out;
3552   emit_jne(0);
3553   if (opcode[i]==0x2A) { // SWL
3554     // Write two msb into two least significant bytes
3555     if(rs2[i]) emit_rorimm(tl,16,tl);
3556     emit_writehword_indexed(tl,-2,temp);
3557     if(rs2[i]) emit_rorimm(tl,16,tl);
3558   }
3559   if (opcode[i]==0x2E) { // SWR
3560     // Write 3 lsb into three most significant bytes
3561     emit_writebyte_indexed(tl,-1,temp);
3562     if(rs2[i]) emit_rorimm(tl,8,tl);
3563     emit_writehword_indexed(tl,0,temp);
3564     if(rs2[i]) emit_rorimm(tl,24,tl);
3565   }
3566   if (opcode[i]==0x2C) { // SDL
3567     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3568     // Write two msb into two least significant bytes
3569     if(rs2[i]) emit_rorimm(th,16,th);
3570     emit_writehword_indexed(th,-2,temp);
3571     if(rs2[i]) emit_rorimm(th,16,th);
3572   }
3573   if (opcode[i]==0x2D) { // SDR
3574     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3575     // Write 3 lsb into three most significant bytes
3576     emit_writebyte_indexed(tl,-1,temp);
3577     if(rs2[i]) emit_rorimm(tl,8,tl);
3578     emit_writehword_indexed(tl,0,temp);
3579     if(rs2[i]) emit_rorimm(tl,24,tl);
3580   }
3581   done2=(int)out;
3582   emit_jmp(0);
3583   // 3
3584   set_jump_target(case3,(int)out);
3585   if (opcode[i]==0x2A) { // SWL
3586     // Write msb into least significant byte
3587     if(rs2[i]) emit_rorimm(tl,24,tl);
3588     emit_writebyte_indexed(tl,-3,temp);
3589     if(rs2[i]) emit_rorimm(tl,8,tl);
3590   }
3591   if (opcode[i]==0x2E) { // SWR
3592     // Write entire word
3593     emit_writeword_indexed(tl,-3,temp);
3594   }
3595   if (opcode[i]==0x2C) { // SDL
3596     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3597     // Write msb into least significant byte
3598     if(rs2[i]) emit_rorimm(th,24,th);
3599     emit_writebyte_indexed(th,-3,temp);
3600     if(rs2[i]) emit_rorimm(th,8,th);
3601   }
3602   if (opcode[i]==0x2D) { // SDR
3603     if(rs2[i]) emit_mov(th,temp2);
3604     // Write entire word
3605     emit_writeword_indexed(tl,-3,temp);
3606   }
3607   set_jump_target(done0,(int)out);
3608   set_jump_target(done1,(int)out);
3609   set_jump_target(done2,(int)out);
3610   if (opcode[i]==0x2C) { // SDL
3611     emit_testimm(temp,4);
3612     done0=(int)out;
3613     emit_jne(0);
3614     emit_andimm(temp,~3,temp);
3615     emit_writeword_indexed(temp2,4,temp);
3616     set_jump_target(done0,(int)out);
3617   }
3618   if (opcode[i]==0x2D) { // SDR
3619     emit_testimm(temp,4);
3620     done0=(int)out;
3621     emit_jeq(0);
3622     emit_andimm(temp,~3,temp);
3623     emit_writeword_indexed(temp2,-4,temp);
3624     set_jump_target(done0,(int)out);
3625   }
3626   if(!c||!memtarget)
3627     add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3628   if(!using_tlb&&!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3629     #ifdef RAM_OFFSET
3630     int map=get_reg(i_regs->regmap,ROREG);
3631     if(map<0) map=HOST_TEMPREG;
3632     gen_orig_addr_w(temp,map);
3633     #else
3634     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3635     #endif
3636     #if defined(HOST_IMM8)
3637     int ir=get_reg(i_regs->regmap,INVCP);
3638     assert(ir>=0);
3639     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3640     #else
3641     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3642     #endif
3643     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3644     emit_callne(invalidate_addr_reg[temp]);
3645     #else
3646     jaddr2=(int)out;
3647     emit_jne(0);
3648     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3649     #endif
3650   }
3651   /*
3652     emit_pusha();
3653     //save_regs(0x100f);
3654         emit_readword((int)&last_count,ECX);
3655         if(get_reg(i_regs->regmap,CCREG)<0)
3656           emit_loadreg(CCREG,HOST_CCREG);
3657         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3658         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3659         emit_writeword(HOST_CCREG,(int)&Count);
3660     emit_call((int)memdebug);
3661     emit_popa();
3662     //restore_regs(0x100f);
3663   /**/
3664 }
3665
3666 void c1ls_assemble(int i,struct regstat *i_regs)
3667 {
3668 #ifndef DISABLE_COP1
3669   int s,th,tl;
3670   int temp,ar;
3671   int map=-1;
3672   int offset;
3673   int c=0;
3674   int jaddr,jaddr2=0,jaddr3,type;
3675   int agr=AGEN1+(i&1);
3676   u_int hr,reglist=0;
3677   th=get_reg(i_regs->regmap,FTEMP|64);
3678   tl=get_reg(i_regs->regmap,FTEMP);
3679   s=get_reg(i_regs->regmap,rs1[i]);
3680   temp=get_reg(i_regs->regmap,agr);
3681   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3682   offset=imm[i];
3683   assert(tl>=0);
3684   assert(rs1[i]>0);
3685   assert(temp>=0);
3686   for(hr=0;hr<HOST_REGS;hr++) {
3687     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3688   }
3689   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3690   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3691   {
3692     // Loads use a temporary register which we need to save
3693     reglist|=1<<temp;
3694   }
3695   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3696     ar=temp;
3697   else // LWC1/LDC1
3698     ar=tl;
3699   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3700   //else c=(i_regs->wasconst>>s)&1;
3701   if(s>=0) c=(i_regs->wasconst>>s)&1;
3702   // Check cop1 unusable
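  // Status bit 29 (CU1) enables the FPU.  If it may be clear at this point,
  // emit a test plus an FP_STUB to take the coprocessor-unusable path;
  // cop1_usable records that the check has been emitted so later FPU
  // instructions in this block can skip it.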
3703   if(!cop1_usable) {
3704     signed char rs=get_reg(i_regs->regmap,CSREG);
3705     assert(rs>=0);
3706     emit_testimm(rs,0x20000000);
3707     jaddr=(int)out;
3708     emit_jeq(0);
3709     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3710     cop1_usable=1;
3711   }
3712   if (opcode[i]==0x39) { // SWC1 (get float address)
3713     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3714   }
3715   if (opcode[i]==0x3D) { // SDC1 (get double address)
3716     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3717   }
3718   // Generate address + offset
3719   if(!using_tlb) {
3720     if(!c)
3721       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3722   }
3723   else
3724   {
3725     map=get_reg(i_regs->regmap,TLREG);
3726     assert(map>=0);
3727     reglist&=~(1<<map);
3728     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3729       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3730     }
3731     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3732       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3733     }
3734   }
3735   if (opcode[i]==0x39) { // SWC1 (read float)
3736     emit_readword_indexed(0,tl,tl);
3737   }
3738   if (opcode[i]==0x3D) { // SDC1 (read double)
3739     emit_readword_indexed(4,tl,th);
3740     emit_readword_indexed(0,tl,tl);
3741   }
3742   if (opcode[i]==0x31) { // LWC1 (get target address)
3743     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3744   }
3745   if (opcode[i]==0x35) { // LDC1 (get target address)
3746     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3747   }
3748   if(!using_tlb) {
3749     if(!c) {
3750       jaddr2=(int)out;
3751       emit_jno(0);
3752     }
3753     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80000000+RAM_SIZE) {
3754       jaddr2=(int)out;
3755       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3756     }
3757     #ifdef DESTRUCTIVE_SHIFT
3758     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3759       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3760     }
3761     #endif
3762   }else{
3763     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3764       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3765     }
3766     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3767       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3768     }
3769   }
3770   if (opcode[i]==0x31) { // LWC1
3771     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3772     //gen_tlb_addr_r(ar,map);
3773     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3774     #ifdef HOST_IMM_ADDR32
3775     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3776     else
3777     #endif
3778     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3779     type=LOADW_STUB;
3780   }
3781   if (opcode[i]==0x35) { // LDC1
3782     assert(th>=0);
3783     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3784     //gen_tlb_addr_r(ar,map);
3785     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3786     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3787     #ifdef HOST_IMM_ADDR32
3788     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3789     else
3790     #endif
3791     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3792     type=LOADD_STUB;
3793   }
3794   if (opcode[i]==0x39) { // SWC1
3795     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3796     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3797     type=STOREW_STUB;
3798   }
3799   if (opcode[i]==0x3D) { // SDC1
3800     assert(th>=0);
3801     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3802     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3803     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3804     type=STORED_STUB;
3805   }
3806   if(!using_tlb&&!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3807     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3808       #ifndef DESTRUCTIVE_SHIFT
3809       temp=offset||c||s<0?ar:s;
3810       #endif
3811       #if defined(HOST_IMM8)
3812       int ir=get_reg(i_regs->regmap,INVCP);
3813       assert(ir>=0);
3814       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3815       #else
3816       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3817       #endif
3818       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3819       emit_callne(invalidate_addr_reg[temp]);
3820       #else
3821       jaddr3=(int)out;
3822       emit_jne(0);
3823       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3824       #endif
3825     }
3826   }
3827   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
3828   if (opcode[i]==0x31) { // LWC1 (write float)
3829     emit_writeword_indexed(tl,0,temp);
3830   }
3831   if (opcode[i]==0x35) { // LDC1 (write double)
3832     emit_writeword_indexed(th,4,temp);
3833     emit_writeword_indexed(tl,0,temp);
3834   }
3835   //if(opcode[i]==0x39)
3836   /*if(opcode[i]==0x39||opcode[i]==0x31)
3837   {
3838     emit_pusha();
3839         emit_readword((int)&last_count,ECX);
3840         if(get_reg(i_regs->regmap,CCREG)<0)
3841           emit_loadreg(CCREG,HOST_CCREG);
3842         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3843         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3844         emit_writeword(HOST_CCREG,(int)&Count);
3845     emit_call((int)memdebug);
3846     emit_popa();
3847   }/**/
3848 #else
3849   cop1_unusable(i, i_regs);
3850 #endif
3851 }
3852
3853 void c2ls_assemble(int i,struct regstat *i_regs)
3854 {
3855   int s,tl;
3856   int ar;
3857   int offset;
3858   int memtarget=0,c=0;
3859   int jaddr2=0,jaddr3,type;
3860   int agr=AGEN1+(i&1);
3861   int fastio_reg_override=0;
3862   u_int hr,reglist=0;
3863   u_int copr=(source[i]>>16)&0x1f;
3864   s=get_reg(i_regs->regmap,rs1[i]);
3865   tl=get_reg(i_regs->regmap,FTEMP);
3866   offset=imm[i];
3867   assert(rs1[i]>0);
3868   assert(tl>=0);
3869   assert(!using_tlb);
3870
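  // LWC2/SWC2 transfer one word between memory and GTE (cop2) data
  // register 'copr'.  cop2_get_dreg/cop2_put_dreg handle the register
  // side; the memory side uses the same fast path + stub scheme as
  // ordinary loads and stores.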
3871   for(hr=0;hr<HOST_REGS;hr++) {
3872     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3873   }
3874   if(i_regs->regmap[HOST_CCREG]==CCREG)
3875     reglist&=~(1<<HOST_CCREG);
3876
3877   // get the address
3878   if (opcode[i]==0x3a) { // SWC2
3879     ar=get_reg(i_regs->regmap,agr);
3880     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3881     reglist|=1<<ar;
3882   } else { // LWC2
3883     ar=tl;
3884   }
3885   if(s>=0) c=(i_regs->wasconst>>s)&1;
3886   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3887   if (!offset&&!c&&s>=0) ar=s;
3888   assert(ar>=0);
3889
3890   if (opcode[i]==0x3a) { // SWC2
3891     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3892     type=STOREW_STUB;
3893   }
3894   else
3895     type=LOADW_STUB;
3896
3897   if(c&&!memtarget) {
3898     jaddr2=(int)out;
3899     emit_jmp(0); // inline_readstub/inline_writestub?
3900   }
3901   else {
3902     if(!c) {
3903       jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
3904     }
3905     else if(ram_offset&&memtarget) {
3906       emit_addimm(ar,ram_offset,HOST_TEMPREG);
3907       fastio_reg_override=HOST_TEMPREG;
3908     }
3909     if (opcode[i]==0x32) { // LWC2
3910       #ifdef HOST_IMM_ADDR32
3911       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3912       else
3913       #endif
3914       int a=ar;
3915       if(fastio_reg_override) a=fastio_reg_override;
3916       emit_readword_indexed(0,a,tl);
3917     }
3918     if (opcode[i]==0x3a) { // SWC2
3919       #ifdef DESTRUCTIVE_SHIFT
3920       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3921       #endif
3922       int a=ar;
3923       if(fastio_reg_override) a=fastio_reg_override;
3924       emit_writeword_indexed(tl,0,a);
3925     }
3926   }
3927   if(jaddr2)
3928     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3929   if(opcode[i]==0x3a) // SWC2
3930   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3931 #if defined(HOST_IMM8)
3932     int ir=get_reg(i_regs->regmap,INVCP);
3933     assert(ir>=0);
3934     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3935 #else
3936     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3937 #endif
3938     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3939     emit_callne(invalidate_addr_reg[ar]);
3940     #else
3941     jaddr3=(int)out;
3942     emit_jne(0);
3943     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3944     #endif
3945   }
3946   if (opcode[i]==0x32) { // LWC2
3947     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3948   }
3949 }
3950
3951 #ifndef multdiv_assemble
3952 void multdiv_assemble(int i,struct regstat *i_regs)
3953 {
3954   printf("Need multdiv_assemble for this architecture.\n");
3955   exit(1);
3956 }
3957 #endif
3958
3959 void mov_assemble(int i,struct regstat *i_regs)
3960 {
3961   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3962   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
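  // Simple register-to-register move (the MOV itype covers the HI/LO moves
  // checked above): copy the low word from rs1 into rt1 and, if an upper
  // half is allocated, the high word as well.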
3963   if(rt1[i]) {
3964     signed char sh,sl,th,tl;
3965     th=get_reg(i_regs->regmap,rt1[i]|64);
3966     tl=get_reg(i_regs->regmap,rt1[i]);
3967     //assert(tl>=0);
3968     if(tl>=0) {
3969       sh=get_reg(i_regs->regmap,rs1[i]|64);
3970       sl=get_reg(i_regs->regmap,rs1[i]);
3971       if(sl>=0) emit_mov(sl,tl);
3972       else emit_loadreg(rs1[i],tl);
3973       if(th>=0) {
3974         if(sh>=0) emit_mov(sh,th);
3975         else emit_loadreg(rs1[i]|64,th);
3976       }
3977     }
3978   }
3979 }
3980
3981 #ifndef fconv_assemble
3982 void fconv_assemble(int i,struct regstat *i_regs)
3983 {
3984   printf("Need fconv_assemble for this architecture.\n");
3985   exit(1);
3986 }
3987 #endif
3988
3989 #if 0
3990 void float_assemble(int i,struct regstat *i_regs)
3991 {
3992   printf("Need float_assemble for this architecture.\n");
3993   exit(1);
3994 }
3995 #endif
3996
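// SYSCALL/HLECALL/INTCALL all leave the block the same way: load the
// relevant PC into a host register, add the accumulated cycle count to
// HOST_CCREG, then jump to the matching C handler (jump_syscall_hle,
// jump_hlecall or jump_intcall).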
3997 void syscall_assemble(int i,struct regstat *i_regs)
3998 {
3999   signed char ccreg=get_reg(i_regs->regmap,CCREG);
4000   assert(ccreg==HOST_CCREG);
4001   assert(!is_delayslot);
4002   emit_movimm(start+i*4,EAX); // Get PC
4003   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
4004   emit_jmp((int)jump_syscall_hle); // XXX
4005 }
4006
4007 void hlecall_assemble(int i,struct regstat *i_regs)
4008 {
4009   signed char ccreg=get_reg(i_regs->regmap,CCREG);
4010   assert(ccreg==HOST_CCREG);
4011   assert(!is_delayslot);
4012   emit_movimm(start+i*4+4,0); // Get PC
4013   emit_movimm((int)psxHLEt[source[i]&7],1);
4014   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
4015   emit_jmp((int)jump_hlecall);
4016 }
4017
4018 void intcall_assemble(int i,struct regstat *i_regs)
4019 {
4020   signed char ccreg=get_reg(i_regs->regmap,CCREG);
4021   assert(ccreg==HOST_CCREG);
4022   assert(!is_delayslot);
4023   emit_movimm(start+i*4,0); // Get PC
4024   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
4025   emit_jmp((int)jump_intcall);
4026 }
4027
4028 void ds_assemble(int i,struct regstat *i_regs)
4029 {
4030   speculate_register_values(i);
4031   is_delayslot=1;
4032   switch(itype[i]) {
4033     case ALU:
4034       alu_assemble(i,i_regs);break;
4035     case IMM16:
4036       imm16_assemble(i,i_regs);break;
4037     case SHIFT:
4038       shift_assemble(i,i_regs);break;
4039     case SHIFTIMM:
4040       shiftimm_assemble(i,i_regs);break;
4041     case LOAD:
4042       load_assemble(i,i_regs);break;
4043     case LOADLR:
4044       loadlr_assemble(i,i_regs);break;
4045     case STORE:
4046       store_assemble(i,i_regs);break;
4047     case STORELR:
4048       storelr_assemble(i,i_regs);break;
4049     case COP0:
4050       cop0_assemble(i,i_regs);break;
4051     case COP1:
4052       cop1_assemble(i,i_regs);break;
4053     case C1LS:
4054       c1ls_assemble(i,i_regs);break;
4055     case COP2:
4056       cop2_assemble(i,i_regs);break;
4057     case C2LS:
4058       c2ls_assemble(i,i_regs);break;
4059     case C2OP:
4060       c2op_assemble(i,i_regs);break;
4061     case FCONV:
4062       fconv_assemble(i,i_regs);break;
4063     case FLOAT:
4064       float_assemble(i,i_regs);break;
4065     case FCOMP:
4066       fcomp_assemble(i,i_regs);break;
4067     case MULTDIV:
4068       multdiv_assemble(i,i_regs);break;
4069     case MOV:
4070       mov_assemble(i,i_regs);break;
4071     case SYSCALL:
4072     case HLECALL:
4073     case INTCALL:
4074     case SPAN:
4075     case UJUMP:
4076     case RJUMP:
4077     case CJUMP:
4078     case SJUMP:
4079     case FJUMP:
4080       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
4081   }
4082   is_delayslot=0;
4083 }
4084
4085 // Is the branch target a valid internal jump?
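// Targets with bit 0 set are register-indirect jumps and can never be
// linked directly.  Otherwise a target counts as internal if it lies
// inside the block currently being compiled (the last word is excluded,
// presumably because its delay slot would fall outside the block).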
4086 int internal_branch(uint64_t i_is32,int addr)
4087 {
4088   if(addr&1) return 0; // Indirect (register) jump
4089   if(addr>=start && addr<start+slen*4-4)
4090   {
4091     int t=(addr-start)>>2;
4092     // Delay slots are not valid branch targets
4093     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4094     // 64 -> 32 bit transition requires a recompile
4095     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
4096     {
4097       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
4098       else printf("optimizable: yes\n");
4099     }*/
4100     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4101 #ifndef FORCE32
4102     if(requires_32bit[t]&~i_is32) return 0;
4103     else
4104 #endif
4105       return 1;
4106   }
4107   return 0;
4108 }
4109
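// Write back the dirty value of any host register whose mapping changes
// between 'pre' and 'entry', unless the MIPS register is unneeded (u/uu);
// afterwards, remappings that amount to a plain register-to-register move
// are satisfied with emit_mov instead of a reload.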
4110 #ifndef wb_invalidate
4111 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
4112   uint64_t u,uint64_t uu)
4113 {
4114   int hr;
4115   for(hr=0;hr<HOST_REGS;hr++) {
4116     if(hr!=EXCLUDE_REG) {
4117       if(pre[hr]!=entry[hr]) {
4118         if(pre[hr]>=0) {
4119           if((dirty>>hr)&1) {
4120             if(get_reg(entry,pre[hr])<0) {
4121               if(pre[hr]<64) {
4122                 if(!((u>>pre[hr])&1)) {
4123                   emit_storereg(pre[hr],hr);
4124                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
4125                     emit_sarimm(hr,31,hr);
4126                     emit_storereg(pre[hr]|64,hr);
4127                   }
4128                 }
4129               }else{
4130                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
4131                   emit_storereg(pre[hr],hr);
4132                 }
4133               }
4134             }
4135           }
4136         }
4137       }
4138     }
4139   }
4140   // Move from one register to another (no writeback)
4141   for(hr=0;hr<HOST_REGS;hr++) {
4142     if(hr!=EXCLUDE_REG) {
4143       if(pre[hr]!=entry[hr]) {
4144         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
4145           int nr;
4146           if((nr=get_reg(entry,pre[hr]))>=0) {
4147             emit_mov(hr,nr);
4148           }
4149         }
4150       }
4151     }
4152   }
4153 }
4154 #endif
4155
4156 // Load the specified registers
4157 // This only loads the registers given as arguments because
4158 // we don't want to load things that will be overwritten
4159 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
4160 {
4161   int hr;
4162   // Load 32-bit regs
4163   for(hr=0;hr<HOST_REGS;hr++) {
4164     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4165       if(entry[hr]!=regmap[hr]) {
4166         if(regmap[hr]==rs1||regmap[hr]==rs2)
4167         {
4168           if(regmap[hr]==0) {
4169             emit_zeroreg(hr);
4170           }
4171           else
4172           {
4173             emit_loadreg(regmap[hr],hr);
4174           }
4175         }
4176       }
4177     }
4178   }
4179   // Load 64-bit regs
4180   for(hr=0;hr<HOST_REGS;hr++) {
4181     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4182       if(entry[hr]!=regmap[hr]) {
4183         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
4184         {
4185           assert(regmap[hr]!=64);
4186           if((is32>>(regmap[hr]&63))&1) {
4187             int lr=get_reg(regmap,regmap[hr]-64);
4188             if(lr>=0)
4189               emit_sarimm(lr,31,hr);
4190             else
4191               emit_loadreg(regmap[hr],hr);
4192           }
4193           else
4194           {
4195             emit_loadreg(regmap[hr],hr);
4196           }
4197         }
4198       }
4199     }
4200   }
4201 }
4202
4203 // Load registers prior to the start of a loop
4204 // so that they are not loaded within the loop
4205 static void loop_preload(signed char pre[],signed char entry[])
4206 {
4207   int hr;
4208   for(hr=0;hr<HOST_REGS;hr++) {
4209     if(hr!=EXCLUDE_REG) {
4210       if(pre[hr]!=entry[hr]) {
4211         if(entry[hr]>=0) {
4212           if(get_reg(pre,entry[hr])<0) {
4213             assem_debug("loop preload:\n");
4214             //printf("loop preload: %d\n",hr);
4215             if(entry[hr]==0) {
4216               emit_zeroreg(hr);
4217             }
4218             else if(entry[hr]<TEMPREG)
4219             {
4220               emit_loadreg(entry[hr],hr);
4221             }
4222             else if(entry[hr]-64<TEMPREG)
4223             {
4224               emit_loadreg(entry[hr],hr);
4225             }
4226           }
4227         }
4228       }
4229     }
4230   }
4231 }
4232
4233 // Generate address for load/store instruction
4234 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
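// The AGEN/MGEN temporaries alternate with (i&1) so the address for
// instruction i+1 can be precomputed here without clobbering the one
// instruction i still needs (see "Preload constants for next instruction"
// below).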
4235 void address_generation(int i,struct regstat *i_regs,signed char entry[])
4236 {
4237   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
4238     int ra=-1;
4239     int agr=AGEN1+(i&1);
4240     int mgr=MGEN1+(i&1);
4241     if(itype[i]==LOAD) {
4242       ra=get_reg(i_regs->regmap,rt1[i]);
4243       if(ra<0) ra=get_reg(i_regs->regmap,-1); 
4244       assert(ra>=0);
4245     }
4246     if(itype[i]==LOADLR) {
4247       ra=get_reg(i_regs->regmap,FTEMP);
4248     }
4249     if(itype[i]==STORE||itype[i]==STORELR) {
4250       ra=get_reg(i_regs->regmap,agr);
4251       if(ra<0) ra=get_reg(i_regs->regmap,-1);
4252     }
4253     if(itype[i]==C1LS||itype[i]==C2LS) {
4254       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
4255         ra=get_reg(i_regs->regmap,FTEMP);
4256       else { // SWC1/SDC1/SWC2/SDC2
4257         ra=get_reg(i_regs->regmap,agr);
4258         if(ra<0) ra=get_reg(i_regs->regmap,-1);
4259       }
4260     }
4261     int rs=get_reg(i_regs->regmap,rs1[i]);
4262     int rm=get_reg(i_regs->regmap,TLREG);
4263     if(ra>=0) {
4264       int offset=imm[i];
4265       int c=(i_regs->wasconst>>rs)&1;
4266       if(rs1[i]==0) {
4267         // Using r0 as a base address
4268         /*if(rm>=0) {
4269           if(!entry||entry[rm]!=mgr) {
4270             generate_map_const(offset,rm);
4271           } // else did it in the previous cycle
4272         }*/
4273         if(!entry||entry[ra]!=agr) {
4274           if (opcode[i]==0x22||opcode[i]==0x26) {
4275             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4276           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4277             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4278           }else{
4279             emit_movimm(offset,ra);
4280           }
4281         } // else did it in the previous cycle
4282       }
4283       else if(rs<0) {
4284         if(!entry||entry[ra]!=rs1[i])
4285           emit_loadreg(rs1[i],ra);
4286         //if(!entry||entry[ra]!=rs1[i])
4287         //  printf("poor load scheduling!\n");
4288       }
4289       else if(c) {
4290 #ifndef DISABLE_TLB
4291         if(rm>=0) {
4292           if(!entry||entry[rm]!=mgr) {
4293             if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a) {
4294               // Stores to memory go through the mapper to detect self-modifying
4295               // code; loads don't.
4296               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
4297                  (unsigned int)(constmap[i][rs]+offset)<0x80000000+RAM_SIZE )
4298                 generate_map_const(constmap[i][rs]+offset,rm);
4299             }else{
4300               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
4301                 generate_map_const(constmap[i][rs]+offset,rm);
4302             }
4303           }
4304         }
4305 #endif
4306         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
4307           if(!entry||entry[ra]!=agr) {
4308             if (opcode[i]==0x22||opcode[i]==0x26) {
4309               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4310             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4311               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4312             }else{
4313               #ifdef HOST_IMM_ADDR32
4314               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4315                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
4316               #endif
4317               emit_movimm(constmap[i][rs]+offset,ra);
4318               regs[i].loadedconst|=1<<ra;
4319             }
4320           } // else did it in the previous cycle
4321         } // else load_consts already did it
4322       }
4323       if(offset&&!c&&rs1[i]) {
4324         if(rs>=0) {
4325           emit_addimm(rs,offset,ra);
4326         }else{
4327           emit_addimm(ra,offset,ra);
4328         }
4329       }
4330     }
4331   }
4332   // Preload constants for next instruction
4333   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
4334     int agr,ra;
4335     #if !defined(HOST_IMM_ADDR32) && !defined(DISABLE_TLB)
4336     // Mapper entry
4337     agr=MGEN1+((i+1)&1);
4338     ra=get_reg(i_regs->regmap,agr);
4339     if(ra>=0) {
4340       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4341       int offset=imm[i+1];
4342       int c=(regs[i+1].wasconst>>rs)&1;
4343       if(c) {
4344         if(itype[i+1]==STORE||itype[i+1]==STORELR
4345            ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1, SWC2/SDC2
4346           // Stores to memory go through the mapper to detect self-modifying
4347           // code; loads don't.
4348           if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 ||
4349              (unsigned int)(constmap[i+1][rs]+offset)<0x80000000+RAM_SIZE )
4350             generate_map_const(constmap[i+1][rs]+offset,ra);
4351         }else{
4352           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
4353             generate_map_const(constmap[i+1][rs]+offset,ra);
4354         }
4355       }
4356       /*else if(rs1[i]==0) {
4357         generate_map_const(offset,ra);
4358       }*/
4359     }
4360     #endif
4361     // Actual address
4362     agr=AGEN1+((i+1)&1);
4363     ra=get_reg(i_regs->regmap,agr);
4364     if(ra>=0) {
4365       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4366       int offset=imm[i+1];
4367       int c=(regs[i+1].wasconst>>rs)&1;
4368       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
4369         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4370           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4371         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4372           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4373         }else{
4374           #ifdef HOST_IMM_ADDR32
4375           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4376              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
4377           #endif
4378           emit_movimm(constmap[i+1][rs]+offset,ra);
4379           regs[i+1].loadedconst|=1<<ra;
4380         }
4381       }
4382       else if(rs1[i+1]==0) {
4383         // Using r0 as a base address
4384         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4385           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4386         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4387           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4388         }else{
4389           emit_movimm(offset,ra);
4390         }
4391       }
4392     }
4393   }
4394 }
4395
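// For a host register currently holding a constant, scan ahead while the
// mapping stays constant (and no branch target intervenes) and report the
// value it will ultimately need, so one immediate load can serve the whole
// run; returns 0 when the value turns out not to be needed.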
4396 int get_final_value(int hr, int i, int *value)
4397 {
4398   int reg=regs[i].regmap[hr];
4399   while(i<slen-1) {
4400     if(regs[i+1].regmap[hr]!=reg) break;
4401     if(!((regs[i+1].isconst>>hr)&1)) break;
4402     if(bt[i+1]) break;
4403     i++;
4404   }
4405   if(i<slen-1) {
4406     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
4407       *value=constmap[i][hr];
4408       return 1;
4409     }
4410     if(!bt[i+1]) {
4411       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
4412         // Load in delay slot, out-of-order execution
4413         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
4414         {
4415           #ifdef HOST_IMM_ADDR32
4416           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
4417           #endif
4418           // Precompute load address
4419           *value=constmap[i][hr]+imm[i+2];
4420           return 1;
4421         }
4422       }
4423       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
4424       {
4425         #ifdef HOST_IMM_ADDR32
4426         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
4427         #endif
4428         // Precompute load address
4429         *value=constmap[i][hr]+imm[i+1];
4430         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
4431         return 1;
4432       }
4433     }
4434   }
4435   *value=constmap[i][hr];
4436   //printf("c=%x\n",(int)constmap[i][hr]);
4437   if(i==slen-1) return 1;
4438   if(reg<64) {
4439     return !((unneeded_reg[i+1]>>reg)&1);
4440   }else{
4441     return !((unneeded_reg_upper[i+1]>>reg)&1);
4442   }
4443 }
4444
4445 // Load registers with known constants
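// If some other host register already holds a "similar" constant (as
// judged by is_similar_value), emit_movimm_from derives the new value from
// it, which is usually cheaper than materialising a fresh 32-bit
// immediate.  loadedconst tracks which host registers already contain
// their constant so they are not reloaded on every instruction.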
4446 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
4447 {
4448   int hr,hr2;
4449   // propagate loaded constant flags
4450   if(i==0||bt[i])
4451     regs[i].loadedconst=0;
4452   else {
4453     for(hr=0;hr<HOST_REGS;hr++) {
4454       if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
4455          &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
4456       {
4457         regs[i].loadedconst|=1<<hr;
4458       }
4459     }
4460   }
4461   // Load 32-bit regs
4462   for(hr=0;hr<HOST_REGS;hr++) {
4463     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4464       //if(entry[hr]!=regmap[hr]) {
4465       if(!((regs[i].loadedconst>>hr)&1)) {
4466         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4467           int value,similar=0;
4468           if(get_final_value(hr,i,&value)) {
4469             // see if some other register has similar value
4470             for(hr2=0;hr2<HOST_REGS;hr2++) {
4471               if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
4472                 if(is_similar_value(value,constmap[i][hr2])) {
4473                   similar=1;
4474                   break;
4475                 }
4476               }
4477             }
4478             if(similar) {
4479               int value2;
4480               if(get_final_value(hr2,i,&value2)) // is this needed?
4481                 emit_movimm_from(value2,hr2,value,hr);
4482               else
4483                 emit_movimm(value,hr);
4484             }
4485             else if(value==0) {
4486               emit_zeroreg(hr);
4487             }
4488             else {
4489               emit_movimm(value,hr);
4490             }
4491           }
4492           regs[i].loadedconst|=1<<hr;
4493         }
4494       }
4495     }
4496   }
4497   // Load 64-bit regs
4498   for(hr=0;hr<HOST_REGS;hr++) {
4499     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4500       //if(entry[hr]!=regmap[hr]) {
4501       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4502         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4503           if((is32>>(regmap[hr]&63))&1) {
4504             int lr=get_reg(regmap,regmap[hr]-64);
4505             assert(lr>=0);
4506             emit_sarimm(lr,31,hr);
4507           }
4508           else
4509           {
4510             int value;
4511             if(get_final_value(hr,i,&value)) {
4512               if(value==0) {
4513                 emit_zeroreg(hr);
4514               }
4515               else {
4516                 emit_movimm(value,hr);
4517               }
4518             }
4519           }
4520         }
4521       }
4522     }
4523   }
4524 }
4525 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
4526 {
4527   int hr;
4528   // Load 32-bit regs
4529   for(hr=0;hr<HOST_REGS;hr++) {
4530     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4531       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4532         int value=constmap[i][hr];
4533         if(value==0) {
4534           emit_zeroreg(hr);
4535         }
4536         else {
4537           emit_movimm(value,hr);
4538         }
4539       }
4540     }
4541   }
4542   // Load 64-bit regs
4543   for(hr=0;hr<HOST_REGS;hr++) {
4544     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4545       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4546         if((is32>>(regmap[hr]&63))&1) {
4547           int lr=get_reg(regmap,regmap[hr]-64);
4548           assert(lr>=0);
4549           emit_sarimm(lr,31,hr);
4550         }
4551         else
4552         {
4553           int value=constmap[i][hr];
4554           if(value==0) {
4555             emit_zeroreg(hr);
4556           }
4557           else {
4558             emit_movimm(value,hr);
4559           }
4560         }
4561       }
4562     }
4563   }
4564 }
4565
4566 // Write out all dirty registers (except cycle count)
4567 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4568 {
4569   int hr;
4570   for(hr=0;hr<HOST_REGS;hr++) {
4571     if(hr!=EXCLUDE_REG) {
4572       if(i_regmap[hr]>0) {
4573         if(i_regmap[hr]!=CCREG) {
4574           if((i_dirty>>hr)&1) {
4575             if(i_regmap[hr]<64) {
4576               emit_storereg(i_regmap[hr],hr);
4577 #ifndef FORCE32
4578               if( ((i_is32>>i_regmap[hr])&1) ) {
4579                 #ifdef DESTRUCTIVE_WRITEBACK
4580                 emit_sarimm(hr,31,hr);
4581                 emit_storereg(i_regmap[hr]|64,hr);
4582                 #else
4583                 emit_sarimm(hr,31,HOST_TEMPREG);
4584                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4585                 #endif
4586               }
4587 #endif
4588             }else{
4589               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4590                 emit_storereg(i_regmap[hr],hr);
4591               }
4592             }
4593           }
4594         }
4595       }
4596     }
4597   }
4598 }
4599 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4600 // This writes the registers not written by store_regs_bt
4601 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4602 {
4603   int hr;
4604   int t=(addr-start)>>2;
4605   for(hr=0;hr<HOST_REGS;hr++) {
4606     if(hr!=EXCLUDE_REG) {
4607       if(i_regmap[hr]>0) {
4608         if(i_regmap[hr]!=CCREG) {
4609           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4610             if((i_dirty>>hr)&1) {
4611               if(i_regmap[hr]<64) {
4612                 emit_storereg(i_regmap[hr],hr);
4613 #ifndef FORCE32
4614                 if( ((i_is32>>i_regmap[hr])&1) ) {
4615                   #ifdef DESTRUCTIVE_WRITEBACK
4616                   emit_sarimm(hr,31,hr);
4617                   emit_storereg(i_regmap[hr]|64,hr);
4618                   #else
4619                   emit_sarimm(hr,31,HOST_TEMPREG);
4620                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4621                   #endif
4622                 }
4623 #endif
4624               }else{
4625                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4626                   emit_storereg(i_regmap[hr],hr);
4627                 }
4628               }
4629             }
4630           }
4631         }
4632       }
4633     }
4634   }
4635 }
4636
4637 // Load all registers (except cycle count)
4638 void load_all_regs(signed char i_regmap[])
4639 {
4640   int hr;
4641   for(hr=0;hr<HOST_REGS;hr++) {
4642     if(hr!=EXCLUDE_REG) {
4643       if(i_regmap[hr]==0) {
4644         emit_zeroreg(hr);
4645       }
4646       else
4647       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4648       {
4649         emit_loadreg(i_regmap[hr],hr);
4650       }
4651     }
4652   }
4653 }
4654
4655 // Load all current registers also needed by next instruction
4656 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4657 {
4658   int hr;
4659   for(hr=0;hr<HOST_REGS;hr++) {
4660     if(hr!=EXCLUDE_REG) {
4661       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4662         if(i_regmap[hr]==0) {
4663           emit_zeroreg(hr);
4664         }
4665         else
4666         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
4667         {
4668           emit_loadreg(i_regmap[hr],hr);
4669         }
4670       }
4671     }
4672   }
4673 }
4674
4675 // Load all regs, storing cycle count if necessary
4676 void load_regs_entry(int t)
4677 {
4678   int hr;
4679   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
4680   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
4681   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4682     emit_storereg(CCREG,HOST_CCREG);
4683   }
4684   // Load 32-bit regs
4685   for(hr=0;hr<HOST_REGS;hr++) {
4686     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4687       if(regs[t].regmap_entry[hr]==0) {
4688         emit_zeroreg(hr);
4689       }
4690       else if(regs[t].regmap_entry[hr]!=CCREG)
4691       {
4692         emit_loadreg(regs[t].regmap_entry[hr],hr);
4693       }
4694     }
4695   }
4696   // Load 64-bit regs
4697   for(hr=0;hr<HOST_REGS;hr++) {
4698     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4699       assert(regs[t].regmap_entry[hr]!=64);
4700       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4701         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4702         if(lr<0) {
4703           emit_loadreg(regs[t].regmap_entry[hr],hr);
4704         }
4705         else
4706         {
4707           emit_sarimm(lr,31,hr);
4708         }
4709       }
4710       else
4711       {
4712         emit_loadreg(regs[t].regmap_entry[hr],hr);
4713       }
4714     }
4715   }
4716 }
4717
4718 // Store dirty registers prior to branch
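// For an internal branch, only registers the target block does not already
// expect in place (different mapping, not dirty at its entry, or a 32/64
// bit state mismatch) and that it still needs are written back; branches
// out of the block write back everything via wb_dirtys().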
4719 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4720 {
4721   if(internal_branch(i_is32,addr))
4722   {
4723     int t=(addr-start)>>2;
4724     int hr;
4725     for(hr=0;hr<HOST_REGS;hr++) {
4726       if(hr!=EXCLUDE_REG) {
4727         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4728           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4729             if((i_dirty>>hr)&1) {
4730               if(i_regmap[hr]<64) {
4731                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4732                   emit_storereg(i_regmap[hr],hr);
4733                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4734                     #ifdef DESTRUCTIVE_WRITEBACK
4735                     emit_sarimm(hr,31,hr);
4736                     emit_storereg(i_regmap[hr]|64,hr);
4737                     #else
4738                     emit_sarimm(hr,31,HOST_TEMPREG);
4739                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4740                     #endif
4741                   }
4742                 }
4743               }else{
4744                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4745                   emit_storereg(i_regmap[hr],hr);
4746                 }
4747               }
4748             }
4749           }
4750         }
4751       }
4752     }
4753   }
4754   else
4755   {
4756     // Branch out of this block, write out all dirty regs
4757     wb_dirtys(i_regmap,i_is32,i_dirty);
4758   }
4759 }
4760
4761 // Load all needed registers for branch target
4762 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4763 {
4764   //if(addr>=start && addr<(start+slen*4))
4765   if(internal_branch(i_is32,addr))
4766   {
4767     int t=(addr-start)>>2;
4768     int hr;
4769     // Store the cycle count before loading something else
4770     if(i_regmap[HOST_CCREG]!=CCREG) {
4771       assert(i_regmap[HOST_CCREG]==-1);
4772     }
4773     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4774       emit_storereg(CCREG,HOST_CCREG);
4775     }
4776     // Load 32-bit regs
4777     for(hr=0;hr<HOST_REGS;hr++) {
4778       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
4779         #ifdef DESTRUCTIVE_WRITEBACK
4780         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4781         #else
4782         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4783         #endif
4784           if(regs[t].regmap_entry[hr]==0) {
4785             emit_zeroreg(hr);
4786           }
4787           else if(regs[t].regmap_entry[hr]!=CCREG)
4788           {
4789             emit_loadreg(regs[t].regmap_entry[hr],hr);
4790           }
4791         }
4792       }
4793     }
4794     // Load 64-bit regs
4795     for(hr=0;hr<HOST_REGS;hr++) {
4796       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
4797         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4798           assert(regs[t].regmap_entry[hr]!=64);
4799           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4800             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4801             if(lr<0) {
4802               emit_loadreg(regs[t].regmap_entry[hr],hr);
4803             }
4804             else
4805             {
4806               emit_sarimm(lr,31,hr);
4807             }
4808           }
4809           else
4810           {
4811             emit_loadreg(regs[t].regmap_entry[hr],hr);
4812           }
4813         }
4814         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4815           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4816           assert(lr>=0);
4817           emit_sarimm(lr,31,hr);
4818         }
4819       }
4820     }
4821   }
4822 }
4823
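// Decide whether the register state at the branch site matches the
// recorded entry state of the target block closely enough that the branch
// can be linked without any writeback or reload; delay-slot targets never
// match since they need extra handling.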
4824 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4825 {
4826   if(addr>=start && addr<start+slen*4-4)
4827   {
4828     int t=(addr-start)>>2;
4829     int hr;
4830     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4831     for(hr=0;hr<HOST_REGS;hr++)
4832     {
4833       if(hr!=EXCLUDE_REG)
4834       {
4835         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4836         {
4837           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
4838           {
4839             return 0;
4840           }
4841           else 
4842           if((i_dirty>>hr)&1)
4843           {
4844             if(i_regmap[hr]<TEMPREG)
4845             {
4846               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4847                 return 0;
4848             }
4849             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
4850             {
4851               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4852                 return 0;
4853             }
4854           }
4855         }
4856         else // Same register but is it 32-bit or dirty?
4857         if(i_regmap[hr]>=0)
4858         {
4859           if(!((regs[t].dirty>>hr)&1))
4860           {
4861             if((i_dirty>>hr)&1)
4862             {
4863               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4864               {
4865                 //printf("%x: dirty no match\n",addr);
4866                 return 0;
4867               }
4868             }
4869           }
4870           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4871           {
4872             //printf("%x: is32 no match\n",addr);
4873             return 0;
4874           }
4875         }
4876       }
4877     }
4878     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4879 #ifndef FORCE32
4880     if(requires_32bit[t]&~i_is32) return 0;
4881 #endif
4882     // Delay slots are not valid branch targets
4883     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4884     // Delay slots require additional processing, so do not match
4885     if(is_ds[t]) return 0;
4886   }
4887   else
4888   {
4889     int hr;
4890     for(hr=0;hr<HOST_REGS;hr++)
4891     {
4892       if(hr!=EXCLUDE_REG)
4893       {
4894         if(i_regmap[hr]>=0)
4895         {
4896           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4897           {
4898             if((i_dirty>>hr)&1)
4899             {
4900               return 0;
4901             }
4902           }
4903         }
4904       }
4905     }
4906   }
4907   return 1;
4908 }
4909
4910 // Used when a branch jumps into the delay slot of another branch
4911 void ds_assemble_entry(int i)
4912 {
4913   int t=(ba[i]-start)>>2;
4914   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4915   assem_debug("Assemble delay slot at %x\n",ba[i]);
4916   assem_debug("<->\n");
4917   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4918     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4919   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4920   address_generation(t,&regs[t],regs[t].regmap_entry);
4921   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4922     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4923   cop1_usable=0;
4924   is_delayslot=0;
4925   switch(itype[t]) {
4926     case ALU:
4927       alu_assemble(t,&regs[t]);break;
4928     case IMM16:
4929       imm16_assemble(t,&regs[t]);break;
4930     case SHIFT:
4931       shift_assemble(t,&regs[t]);break;
4932     case SHIFTIMM:
4933       shiftimm_assemble(t,&regs[t]);break;
4934     case LOAD:
4935       load_assemble(t,&regs[t]);break;
4936     case LOADLR:
4937       loadlr_assemble(t,&regs[t]);break;
4938     case STORE:
4939       store_assemble(t,&regs[t]);break;
4940     case STORELR:
4941       storelr_assemble(t,&regs[t]);break;
4942     case COP0:
4943       cop0_assemble(t,&regs[t]);break;
4944     case COP1:
4945       cop1_assemble(t,&regs[t]);break;
4946     case C1LS:
4947       c1ls_assemble(t,&regs[t]);break;
4948     case COP2:
4949       cop2_assemble(t,&regs[t]);break;
4950     case C2LS:
4951       c2ls_assemble(t,&regs[t]);break;
4952     case C2OP:
4953       c2op_assemble(t,&regs[t]);break;
4954     case FCONV:
4955       fconv_assemble(t,&regs[t]);break;
4956     case FLOAT:
4957       float_assemble(t,&regs[t]);break;
4958     case FCOMP:
4959       fcomp_assemble(t,&regs[t]);break;
4960     case MULTDIV:
4961       multdiv_assemble(t,&regs[t]);break;
4962     case MOV:
4963       mov_assemble(t,&regs[t]);break;
4964     case SYSCALL:
4965     case HLECALL:
4966     case INTCALL:
4967     case SPAN:
4968     case UJUMP:
4969     case RJUMP:
4970     case CJUMP:
4971     case SJUMP:
4972     case FJUMP:
4973       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
4974   }
4975   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4976   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4977   if(internal_branch(regs[t].is32,ba[i]+4))
4978     assem_debug("branch: internal\n");
4979   else
4980     assem_debug("branch: external\n");
4981   assert(internal_branch(regs[t].is32,ba[i]+4));
4982   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4983   emit_jmp(0);
4984 }
4985
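// Emit the cycle-count check made when leaving a block: add the scaled
// cycle total to HOST_CCREG and branch to a CC_STUB once the counter
// becomes non-negative (an event is due).  A branch-to-self with a NOP in
// the delay slot is treated as an idle loop: most of the remaining cycle
// budget is consumed at once so emulation can skip ahead to the next event
// instead of spinning.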
4986 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4987 {
4988   int count;
4989   int jaddr;
4990   int idle=0;
4991   int t=0;
4992   if(itype[i]==RJUMP)
4993   {
4994     *adj=0;
4995   }
4996   //if(ba[i]>=start && ba[i]<(start+slen*4))
4997   if(internal_branch(branch_regs[i].is32,ba[i]))
4998   {
4999     t=(ba[i]-start)>>2;
5000     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
5001     else *adj=ccadj[t];
5002   }
5003   else
5004   {
5005     *adj=0;
5006   }
5007   count=ccadj[i];
5008   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
5009     // Idle loop
5010     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
5011     idle=(int)out;
5012     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
5013     emit_andimm(HOST_CCREG,3,HOST_CCREG);
5014     jaddr=(int)out;
5015     emit_jmp(0);
5016   }
5017   else if(*adj==0||invert) {
5018     int cycles=CLOCK_ADJUST(count+2);
5019     // faster loop HACK
5020     if (t&&*adj) {
5021       int rel=t-i;
5022       if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
5023         cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
5024     }
5025     emit_addimm_and_set_flags(cycles,HOST_CCREG);
5026     jaddr=(int)out;
5027     emit_jns(0);
5028   }
5029   else
5030   {
5031     emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
5032     jaddr=(int)out;
5033     emit_jns(0);
5034   }
5035   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
5036 }
5037
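// Out-of-line continuation for a CC_STUB scheduled by do_cc.  As used
// here, the stub slots are: [1] address of the jump to patch, [2] return
// address, [3] cycle adjustment, [4] instruction index, [5] PC to store
// in pcaddr (-1 if it depends on the branch outcome), [6] TAKEN/NOTTAKEN/
// NULLDS.  Dirty registers are written back, pcaddr is set, cc_interrupt
// is called, then whatever the continuation point expects is reloaded.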
5038 void do_ccstub(int n)
5039 {
5040   literal_pool(256);
5041   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
5042   set_jump_target(stubs[n][1],(int)out);
5043   int i=stubs[n][4];
5044   if(stubs[n][6]==NULLDS) {
5045     // Delay slot instruction is nullified ("likely" branch)
5046     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
5047   }
5048   else if(stubs[n][6]!=TAKEN) {
5049     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
5050   }
5051   else {
5052     if(internal_branch(branch_regs[i].is32,ba[i]))
5053       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5054   }
5055   if(stubs[n][5]!=-1)
5056   {
5057     // Save PC as return address
5058     emit_movimm(stubs[n][5],EAX);
5059     emit_writeword(EAX,(int)&pcaddr);
5060   }
5061   else
5062   {
5063     // Return address depends on which way the branch goes
5064     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
5065     {
5066       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5067       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5068       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5069       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5070       if(rs1[i]==0)
5071       {
5072         s1l=s2l;s1h=s2h;
5073         s2l=s2h=-1;
5074       }
5075       else if(rs2[i]==0)
5076       {
5077         s2l=s2h=-1;
5078       }
5079       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
5080         s1h=s2h=-1;
5081       }
5082       assert(s1l>=0);
5083       #ifdef DESTRUCTIVE_WRITEBACK
5084       if(rs1[i]) {
5085         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
5086           emit_loadreg(rs1[i],s1l);
5087       } 
5088       else {
5089         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
5090           emit_loadreg(rs2[i],s1l);
5091       }
5092       if(s2l>=0)
5093         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
5094           emit_loadreg(rs2[i],s2l);
5095       #endif
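      // Pick two scratch host registers (a third for BLEZ/BGTZ) that do
      // not hold rs1/rs2, to build the branch-dependent target address in.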
5096       int hr=0;
5097       int addr=-1,alt=-1,ntaddr=-1;
5098       while(hr<HOST_REGS)
5099       {
5100         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5101            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
5102            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
5103         {
5104           addr=hr++;break;
5105         }
5106         hr++;
5107       }
5108       while(hr<HOST_REGS)
5109       {
5110         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5111            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
5112            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
5113         {
5114           alt=hr++;break;
5115         }
5116         hr++;
5117       }
5118       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5119       {
5120         while(hr<HOST_REGS)
5121         {
5122           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5123              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
5124              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
5125           {
5126             ntaddr=hr;break;
5127           }
5128           hr++;
5129         }
5130         assert(hr<HOST_REGS);
5131       }
5132       if((opcode[i]&0x2f)==4) // BEQ
5133       {
5134         #ifdef HAVE_CMOV_IMM
5135         if(s1h<0) {
5136           if(s2l>=0) emit_cmp(s1l,s2l);
5137           else emit_test(s1l,s1l);
5138           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
5139         }
5140         else
5141         #endif
5142         {
5143           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5144           if(s1h>=0) {
5145             if(s2h>=0) emit_cmp(s1h,s2h);
5146             else emit_test(s1h,s1h);
5147             emit_cmovne_reg(alt,addr);
5148           }
5149           if(s2l>=0) emit_cmp(s1l,s2l);
5150           else emit_test(s1l,s1l);
5151           emit_cmovne_reg(alt,addr);
5152         }
5153       }
5154       if((opcode[i]&0x2f)==5) // BNE
5155       {
5156         #ifdef HAVE_CMOV_IMM
5157         if(s1h<0) {
5158           if(s2l>=0) emit_cmp(s1l,s2l);
5159           else emit_test(s1l,s1l);
5160           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
5161         }
5162         else
5163         #endif
5164         {
5165           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5166           if(s1h>=0) {
5167             if(s2h>=0) emit_cmp(s1h,s2h);
5168             else emit_test(s1h,s1h);
5169             emit_cmovne_reg(alt,addr);
5170           }
5171           if(s2l>=0) emit_cmp(s1l,s2l);
5172           else emit_test(s1l,s1l);
5173           emit_cmovne_reg(alt,addr);
5174         }
5175       }
5176       if((opcode[i]&0x2f)==6) // BLEZ
5177       {
5178         //emit_movimm(ba[i],alt);
5179         //emit_movimm(start+i*4+8,addr);
5180         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5181         emit_cmpimm(s1l,1);
5182         if(s1h>=0) emit_mov(addr,ntaddr);
5183         emit_cmovl_reg(alt,addr);
5184         if(s1h>=0) {
5185           emit_test(s1h,s1h);
5186           emit_cmovne_reg(ntaddr,addr);
5187           emit_cmovs_reg(alt,addr);
5188         }
5189       }
5190       if((opcode[i]&0x2f)==7) // BGTZ
5191       {
5192         //emit_movimm(ba[i],addr);
5193         //emit_movimm(start+i*4+8,ntaddr);
5194         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
5195         emit_cmpimm(s1l,1);
5196         if(s1h>=0) emit_mov(addr,alt);
5197         emit_cmovl_reg(ntaddr,addr);
5198         if(s1h>=0) {
5199           emit_test(s1h,s1h);
5200           emit_cmovne_reg(alt,addr);
5201           emit_cmovs_reg(ntaddr,addr);
5202         }
5203       }
5204       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
5205       {
5206         //emit_movimm(ba[i],alt);
5207         //emit_movimm(start+i*4+8,addr);
5208         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5209         if(s1h>=0) emit_test(s1h,s1h);
5210         else emit_test(s1l,s1l);
5211         emit_cmovs_reg(alt,addr);
5212       }
5213       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
5214       {
5215         //emit_movimm(ba[i],addr);
5216         //emit_movimm(start+i*4+8,alt);
5217         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5218         if(s1h>=0) emit_test(s1h,s1h);
5219         else emit_test(s1l,s1l);
5220         emit_cmovs_reg(alt,addr);
5221       }
5222       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
5223         if(source[i]&0x10000) // BC1T
5224         {
5225           //emit_movimm(ba[i],alt);
5226           //emit_movimm(start+i*4+8,addr);
5227           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5228           emit_testimm(s1l,0x800000);
5229           emit_cmovne_reg(alt,addr);
5230         }
5231         else // BC1F
5232         {
5233           //emit_movimm(ba[i],addr);
5234           //emit_movimm(start+i*4+8,alt);
5235           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5236           emit_testimm(s1l,0x800000);
5237           emit_cmovne_reg(alt,addr);
5238         }
5239       }
5240       emit_writeword(addr,(int)&pcaddr);
5241     }
5242     else
5243     if(itype[i]==RJUMP)
5244     {
5245       int r=get_reg(branch_regs[i].regmap,rs1[i]);
5246       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5247         r=get_reg(branch_regs[i].regmap,RTEMP);
5248       }
5249       emit_writeword(r,(int)&pcaddr);
5250     }
5251     else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);}
5252   }
5253   // Update cycle count
5254   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
5255   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
5256   emit_call((int)cc_interrupt);
5257   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((int)stubs[n][3]),HOST_CCREG);
5258   if(stubs[n][6]==TAKEN) {
5259     if(internal_branch(branch_regs[i].is32,ba[i]))
5260       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
5261     else if(itype[i]==RJUMP) {
5262       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
5263         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
5264       else
5265         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
5266     }
5267   }else if(stubs[n][6]==NOTTAKEN) {
5268     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
5269     else load_all_regs(branch_regs[i].regmap);
5270   }else if(stubs[n][6]==NULLDS) {
5271     // Delay slot instruction is nullified ("likely" branch)
5272     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
5273     else load_all_regs(regs[i].regmap);
5274   }else{
5275     load_all_regs(branch_regs[i].regmap);
5276   }
5277   emit_jmp(stubs[n][2]); // return address
5278   
5279   /* This works but uses a lot of memory...
5280   emit_readword((int)&last_count,ECX);
5281   emit_add(HOST_CCREG,ECX,EAX);
5282   emit_writeword(EAX,(int)&Count);
5283   emit_call((int)gen_interupt);
5284   emit_readword((int)&Count,HOST_CCREG);
5285   emit_readword((int)&next_interupt,EAX);
5286   emit_readword((int)&pending_exception,EBX);
5287   emit_writeword(EAX,(int)&last_count);
5288   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
5289   emit_test(EBX,EBX);
5290   int jne_instr=(int)out;
5291   emit_jne(0);
5292   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
5293   load_all_regs(branch_regs[i].regmap);
5294   emit_jmp(stubs[n][2]); // return address
5295   set_jump_target(jne_instr,(int)out);
5296   emit_readword((int)&pcaddr,EAX);
5297   // Call get_addr_ht instead of doing the hash table here.
5298   // This code is executed infrequently and takes up a lot of space
5299   // so smaller is better.
5300   emit_storereg(CCREG,HOST_CCREG);
5301   emit_pushreg(EAX);
5302   emit_call((int)get_addr_ht);
5303   emit_loadreg(CCREG,HOST_CCREG);
5304   emit_addimm(ESP,4,ESP);
5305   emit_jmpreg(EAX);*/
5306 }
5307
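// Record a jump emitted at addr so it can be patched to target later; the
// ext flag is derived from internal_branch() by the callers and is stored
// alongside for the linker pass.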
5308 add_to_linker(int addr,int target,int ext) // note: relies on implicit int return type
5309 {
5310   link_addr[linkcount][0]=addr;
5311   link_addr[linkcount][1]=target;
5312   link_addr[linkcount][2]=ext;  
5313   linkcount++;
5314 }
5315
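// Write the return address (PC of the instruction after the delay slot)
// into $ra for JAL-type jumps, optionally via the mini hash table or the
// prefetch paths when those are compiled in.  Note the REG_PREFETCH block
// references temp/i_regmap that are not in scope in this function.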
5316 static void ujump_assemble_write_ra(int i)
5317 {
5318   int rt;
5319   unsigned int return_address;
5320   rt=get_reg(branch_regs[i].regmap,31);
5321   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5322   //assert(rt>=0);
5323   return_address=start+i*4+8;
5324   if(rt>=0) {
5325     #ifdef USE_MINI_HT
5326     if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
5327       int temp=-1; // note: must be ds-safe
5328       #ifdef HOST_TEMPREG
5329       temp=HOST_TEMPREG;
5330       #endif
5331       if(temp>=0) do_miniht_insert(return_address,rt,temp);
5332       else emit_movimm(return_address,rt);
5333     }
5334     else
5335     #endif
5336     {
5337       #ifdef REG_PREFETCH
5338       if(temp>=0) 
5339       {
5340         if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5341       }
5342       #endif
5343       emit_movimm(return_address,rt); // PC into link register
5344       #ifdef IMM_PREFETCH
5345       emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5346       #endif
5347     }
5348   }
5349 }
5350
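// Assemble an unconditional jump (J/JAL): assemble the delay slot, write
// $ra if needed, flush/reload registers for the target, do the cycle-count
// check, then either fall into the target's delay-slot entry (internal
// branch into a delay slot) or emit a jump for the linker to patch.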
5351 void ujump_assemble(int i,struct regstat *i_regs)
5352 {
5353   signed char *i_regmap=i_regs->regmap;
5354   int ra_done=0;
5355   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5356   address_generation(i+1,i_regs,regs[i].regmap_entry);
5357   #ifdef REG_PREFETCH
5358   int temp=get_reg(branch_regs[i].regmap,PTEMP);
5359   if(rt1[i]==31&&temp>=0) 
5360   {
5361     int return_address=start+i*4+8;
5362     if(get_reg(branch_regs[i].regmap,31)>0) 
5363     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5364   }
5365   #endif
5366   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
5367     ujump_assemble_write_ra(i); // writeback ra for DS
5368     ra_done=1;
5369   }
5370   ds_assemble(i+1,i_regs);
5371   uint64_t bc_unneeded=branch_regs[i].u;
5372   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5373   bc_unneeded|=1|(1LL<<rt1[i]);
5374   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5375   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5376                 bc_unneeded,bc_unneeded_upper);
5377   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5378   if(!ra_done&&rt1[i]==31)
5379     ujump_assemble_write_ra(i);
5380   int cc,adj;
5381   cc=get_reg(branch_regs[i].regmap,CCREG);
5382   assert(cc==HOST_CCREG);
5383   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5384   #ifdef REG_PREFETCH
5385   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5386   #endif
5387   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5388   if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5389   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5390   if(internal_branch(branch_regs[i].is32,ba[i]))
5391     assem_debug("branch: internal\n");
5392   else
5393     assem_debug("branch: external\n");
5394   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
5395     ds_assemble_entry(i);
5396   }
5397   else {
5398     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
5399     emit_jmp(0);
5400   }
5401 }
5402
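// Write the return address into the link register for JALR.  As in
// ujump_assemble_write_ra, the REG_PREFETCH block references temp and
// i_regmap that are not declared here, so it appears it would not compile
// with REG_PREFETCH defined.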
5403 static void rjump_assemble_write_ra(int i)
5404 {
5405   int rt,return_address;
5406   assert(rt1[i+1]!=rt1[i]);
5407   assert(rt2[i+1]!=rt1[i]);
5408   rt=get_reg(branch_regs[i].regmap,rt1[i]);
5409   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5410   assert(rt>=0);
5411   return_address=start+i*4+8;
5412   #ifdef REG_PREFETCH
5413   if(temp>=0) 
5414   {
5415     if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5416   }
5417   #endif
5418   emit_movimm(return_address,rt); // PC into link register
5419   #ifdef IMM_PREFETCH
5420   emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5421   #endif
5422 }
5423
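// Assemble a register jump (JR/JALR).  If the delay slot overwrites the
// jump register, the address is first copied to RTEMP; after the cycle
// check, control transfers through the jump_vaddr_reg[] handler for the
// register holding the target (or the mini hash table path when
// USE_MINI_HT is defined).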
5424 void rjump_assemble(int i,struct regstat *i_regs)
5425 {
5426   signed char *i_regmap=i_regs->regmap;
5427   int temp;
5428   int rs,cc,adj;
5429   int ra_done=0;
5430   rs=get_reg(branch_regs[i].regmap,rs1[i]);
5431   assert(rs>=0);
5432   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5433     // Delay slot abuse, make a copy of the branch address register
5434     temp=get_reg(branch_regs[i].regmap,RTEMP);
5435     assert(temp>=0);
5436     assert(regs[i].regmap[temp]==RTEMP);
5437     emit_mov(rs,temp);
5438     rs=temp;
5439   }
5440   address_generation(i+1,i_regs,regs[i].regmap_entry);
5441   #ifdef REG_PREFETCH
5442   if(rt1[i]==31) 
5443   {
5444     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
5445       int return_address=start+i*4+8;
5446       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5447     }
5448   }
5449   #endif
5450   #ifdef USE_MINI_HT
5451   if(rs1[i]==31) {
5452     int rh=get_reg(regs[i].regmap,RHASH);
5453     if(rh>=0) do_preload_rhash(rh);
5454   }
5455   #endif
5456   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
5457     rjump_assemble_write_ra(i);
5458     ra_done=1;
5459   }
5460   ds_assemble(i+1,i_regs);
5461   uint64_t bc_unneeded=branch_regs[i].u;
5462   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5463   bc_unneeded|=1|(1LL<<rt1[i]);
5464   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5465   bc_unneeded&=~(1LL<<rs1[i]);
5466   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5467                 bc_unneeded,bc_unneeded_upper);
5468   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
5469   if(!ra_done&&rt1[i]!=0)
5470     rjump_assemble_write_ra(i);
5471   cc=get_reg(branch_regs[i].regmap,CCREG);
5472   assert(cc==HOST_CCREG);
5473   #ifdef USE_MINI_HT
5474   int rh=get_reg(branch_regs[i].regmap,RHASH);
5475   int ht=get_reg(branch_regs[i].regmap,RHTBL);
5476   if(rs1[i]==31) {
5477     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
5478     do_preload_rhtbl(ht);
5479     do_rhash(rs,rh);
5480   }
5481   #endif
5482   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5483   #ifdef DESTRUCTIVE_WRITEBACK
5484   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
5485     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
5486       emit_loadreg(rs1[i],rs);
5487     }
5488   }
5489   #endif
5490   #ifdef REG_PREFETCH
5491   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5492   #endif
5493   #ifdef USE_MINI_HT
5494   if(rs1[i]==31) {
5495     do_miniht_load(ht,rh);
5496   }
5497   #endif
5498   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
5499   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
5500   //assert(adj==0);
5501   emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5502   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
5503 #ifdef PCSX
5504   if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
5505     // special case for RFE
5506     emit_jmp(0);
5507   else
5508 #endif
5509   emit_jns(0);
5510   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5511   #ifdef USE_MINI_HT
5512   if(rs1[i]==31) {
5513     do_miniht_jump(rs,rh,ht);
5514   }
5515   else
5516   #endif
5517   {
5518     //if(rs!=EAX) emit_mov(rs,EAX);
5519     //emit_jmp((int)jump_vaddr_eax);
5520     emit_jmp(jump_vaddr_reg[rs]);
5521   }
5522   /* Check hash table
5523   temp=!rs;
5524   emit_mov(rs,temp);
5525   emit_shrimm(rs,16,rs);
5526   emit_xor(temp,rs,rs);
5527   emit_movzwl_reg(rs,rs);
5528   emit_shlimm(rs,4,rs);
5529   emit_cmpmem_indexed((int)hash_table,rs,temp);
5530   emit_jne((int)out+14);
5531   emit_readword_indexed((int)hash_table+4,rs,rs);
5532   emit_jmpreg(rs);
5533   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
5534   emit_addimm_no_flags(8,rs);
5535   emit_jeq((int)out-17);
5536   // No hit on hash table, call compiler
5537   emit_pushreg(temp);
5538 //DEBUG >
5539 #ifdef DEBUG_CYCLE_COUNT
5540   emit_readword((int)&last_count,ECX);
5541   emit_add(HOST_CCREG,ECX,HOST_CCREG);
5542   emit_readword((int)&next_interupt,ECX);
5543   emit_writeword(HOST_CCREG,(int)&Count);
5544   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
5545   emit_writeword(ECX,(int)&last_count);
5546 #endif
5547 //DEBUG <
5548   emit_storereg(CCREG,HOST_CCREG);
5549   emit_call((int)get_addr);
5550   emit_loadreg(CCREG,HOST_CCREG);
5551   emit_addimm(ESP,4,ESP);
5552   emit_jmpreg(EAX);*/
5553   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5554   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
5555   #endif
5556 }
5557
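// Assemble a two-register conditional branch (BEQ/BNE/BLEZ/BGTZ and their
// "likely" forms).  Two strategies: out-of-order (the delay slot is
// assembled before the compare) and in-order (compare first, then the
// delay slot separately on the taken and not-taken paths).  64-bit values
// test the upper halves first unless both operands are known to be 32-bit
// (only32).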
5558 void cjump_assemble(int i,struct regstat *i_regs)
5559 {
5560   signed char *i_regmap=i_regs->regmap;
5561   int cc;
5562   int match;
5563   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5564   assem_debug("match=%d\n",match);
5565   int s1h,s1l,s2h,s2l;
5566   int prev_cop1_usable=cop1_usable;
5567   int unconditional=0,nop=0;
5568   int only32=0;
5569   int invert=0;
5570   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5571   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5572   if(!match) invert=1;
5573   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5574   if(i>(ba[i]-start)>>2) invert=1;
5575   #endif
5576   
5577   if(ooo[i]) {
5578     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5579     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5580     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5581     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5582   }
5583   else {
5584     s1l=get_reg(i_regmap,rs1[i]);
5585     s1h=get_reg(i_regmap,rs1[i]|64);
5586     s2l=get_reg(i_regmap,rs2[i]);
5587     s2h=get_reg(i_regmap,rs2[i]|64);
5588   }
5589   if(rs1[i]==0&&rs2[i]==0)
5590   {
5591     if(opcode[i]&1) nop=1;
5592     else unconditional=1;
5593     //assert(opcode[i]!=5);
5594     //assert(opcode[i]!=7);
5595     //assert(opcode[i]!=0x15);
5596     //assert(opcode[i]!=0x17);
5597   }
5598   else if(rs1[i]==0)
5599   {
5600     s1l=s2l;s1h=s2h;
5601     s2l=s2h=-1;
5602     only32=(regs[i].was32>>rs2[i])&1;
5603   }
5604   else if(rs2[i]==0)
5605   {
5606     s2l=s2h=-1;
5607     only32=(regs[i].was32>>rs1[i])&1;
5608   }
5609   else {
5610     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5611   }
5612
5613   if(ooo[i]) {
5614     // Out of order execution (delay slot first)
5615     //printf("OOOE\n");
5616     address_generation(i+1,i_regs,regs[i].regmap_entry);
5617     ds_assemble(i+1,i_regs);
5618     int adj;
5619     uint64_t bc_unneeded=branch_regs[i].u;
5620     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5621     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5622     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5623     bc_unneeded|=1;
5624     bc_unneeded_upper|=1;
5625     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5626                   bc_unneeded,bc_unneeded_upper);
5627     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5628     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5629     cc=get_reg(branch_regs[i].regmap,CCREG);
5630     assert(cc==HOST_CCREG);
5631     if(unconditional) 
5632       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5633     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5634     //assem_debug("cycle count (adj)\n");
5635     if(unconditional) {
5636       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5637       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5638         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5639         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5640         if(internal)
5641           assem_debug("branch: internal\n");
5642         else
5643           assem_debug("branch: external\n");
5644         if(internal&&is_ds[(ba[i]-start)>>2]) {
5645           ds_assemble_entry(i);
5646         }
5647         else {
5648           add_to_linker((int)out,ba[i],internal);
5649           emit_jmp(0);
5650         }
5651         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5652         if(((u_int)out)&7) emit_addnop(0);
5653         #endif
5654       }
5655     }
5656     else if(nop) {
5657       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5658       int jaddr=(int)out;
5659       emit_jns(0);
5660       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5661     }
5662     else {
5663       int taken=0,nottaken=0,nottaken1=0;
5664       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5665       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5666       if(!only32)
5667       {
5668         assert(s1h>=0);
5669         if(opcode[i]==4) // BEQ
5670         {
5671           if(s2h>=0) emit_cmp(s1h,s2h);
5672           else emit_test(s1h,s1h);
5673           nottaken1=(int)out;
5674           emit_jne(1);
5675         }
5676         if(opcode[i]==5) // BNE
5677         {
5678           if(s2h>=0) emit_cmp(s1h,s2h);
5679           else emit_test(s1h,s1h);
5680           if(invert) taken=(int)out;
5681           else add_to_linker((int)out,ba[i],internal);
5682           emit_jne(0);
5683         }
5684         if(opcode[i]==6) // BLEZ
5685         {
5686           emit_test(s1h,s1h);
5687           if(invert) taken=(int)out;
5688           else add_to_linker((int)out,ba[i],internal);
5689           emit_js(0);
5690           nottaken1=(int)out;
5691           emit_jne(1);
5692         }
5693         if(opcode[i]==7) // BGTZ
5694         {
5695           emit_test(s1h,s1h);
5696           nottaken1=(int)out;
5697           emit_js(1);
5698           if(invert) taken=(int)out;
5699           else add_to_linker((int)out,ba[i],internal);
5700           emit_jne(0);
5701         }
5702       } // if(!only32)
5703           
5704       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5705       assert(s1l>=0);
5706       if(opcode[i]==4) // BEQ
5707       {
5708         if(s2l>=0) emit_cmp(s1l,s2l);
5709         else emit_test(s1l,s1l);
5710         if(invert){
5711           nottaken=(int)out;
5712           emit_jne(1);
5713         }else{
5714           add_to_linker((int)out,ba[i],internal);
5715           emit_jeq(0);
5716         }
5717       }
5718       if(opcode[i]==5) // BNE
5719       {
5720         if(s2l>=0) emit_cmp(s1l,s2l);
5721         else emit_test(s1l,s1l);
5722         if(invert){
5723           nottaken=(int)out;
5724           emit_jeq(1);
5725         }else{
5726           add_to_linker((int)out,ba[i],internal);
5727           emit_jne(0);
5728         }
5729       }
5730       if(opcode[i]==6) // BLEZ
5731       {
5732         emit_cmpimm(s1l,1);
5733         if(invert){
5734           nottaken=(int)out;
5735           emit_jge(1);
5736         }else{
5737           add_to_linker((int)out,ba[i],internal);
5738           emit_jl(0);
5739         }
5740       }
5741       if(opcode[i]==7) // BGTZ
5742       {
5743         emit_cmpimm(s1l,1);
5744         if(invert){
5745           nottaken=(int)out;
5746           emit_jl(1);
5747         }else{
5748           add_to_linker((int)out,ba[i],internal);
5749           emit_jge(0);
5750         }
5751       }
5752       if(invert) {
5753         if(taken) set_jump_target(taken,(int)out);
5754         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5755         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5756           if(adj) {
5757             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5758             add_to_linker((int)out,ba[i],internal);
5759           }else{
5760             emit_addnop(13);
5761             add_to_linker((int)out,ba[i],internal*2);
5762           }
5763           emit_jmp(0);
5764         }else
5765         #endif
5766         {
5767           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5768           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5769           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5770           if(internal)
5771             assem_debug("branch: internal\n");
5772           else
5773             assem_debug("branch: external\n");
5774           if(internal&&is_ds[(ba[i]-start)>>2]) {
5775             ds_assemble_entry(i);
5776           }
5777           else {
5778             add_to_linker((int)out,ba[i],internal);
5779             emit_jmp(0);
5780           }
5781         }
5782         set_jump_target(nottaken,(int)out);
5783       }
5784
5785       if(nottaken1) set_jump_target(nottaken1,(int)out);
5786       if(adj) {
5787         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5788       }
5789     } // (!unconditional)
5790   } // if(ooo)
5791   else
5792   {
5793     // In-order execution (branch first)
5794     //if(likely[i]) printf("IOL\n");
5795     //else
5796     //printf("IOE\n");
5797     int taken=0,nottaken=0,nottaken1=0;
5798     if(!unconditional&&!nop) {
5799       if(!only32)
5800       {
5801         assert(s1h>=0);
5802         if((opcode[i]&0x2f)==4) // BEQ
5803         {
5804           if(s2h>=0) emit_cmp(s1h,s2h);
5805           else emit_test(s1h,s1h);
5806           nottaken1=(int)out;
5807           emit_jne(2);
5808         }
5809         if((opcode[i]&0x2f)==5) // BNE
5810         {
5811           if(s2h>=0) emit_cmp(s1h,s2h);
5812           else emit_test(s1h,s1h);
5813           taken=(int)out;
5814           emit_jne(1);
5815         }
5816         if((opcode[i]&0x2f)==6) // BLEZ
5817         {
5818           emit_test(s1h,s1h);
5819           taken=(int)out;
5820           emit_js(1);
5821           nottaken1=(int)out;
5822           emit_jne(2);
5823         }
5824         if((opcode[i]&0x2f)==7) // BGTZ
5825         {
5826           emit_test(s1h,s1h);
5827           nottaken1=(int)out;
5828           emit_js(2);
5829           taken=(int)out;
5830           emit_jne(1);
5831         }
5832       } // if(!only32)
5833           
5834       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5835       assert(s1l>=0);
5836       if((opcode[i]&0x2f)==4) // BEQ
5837       {
5838         if(s2l>=0) emit_cmp(s1l,s2l);
5839         else emit_test(s1l,s1l);
5840         nottaken=(int)out;
5841         emit_jne(2);
5842       }
5843       if((opcode[i]&0x2f)==5) // BNE
5844       {
5845         if(s2l>=0) emit_cmp(s1l,s2l);
5846         else emit_test(s1l,s1l);
5847         nottaken=(int)out;
5848         emit_jeq(2);
5849       }
5850       if((opcode[i]&0x2f)==6) // BLEZ
5851       {
5852         emit_cmpimm(s1l,1);
5853         nottaken=(int)out;
5854         emit_jge(2);
5855       }
5856       if((opcode[i]&0x2f)==7) // BGTZ
5857       {
5858         emit_cmpimm(s1l,1);
5859         nottaken=(int)out;
5860         emit_jl(2);
5861       }
5862     } // if(!unconditional)
5863     int adj;
5864     uint64_t ds_unneeded=branch_regs[i].u;
5865     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5866     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5867     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5868     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5869     ds_unneeded|=1;
5870     ds_unneeded_upper|=1;
5871     // branch taken
5872     if(!nop) {
5873       if(taken) set_jump_target(taken,(int)out);
5874       assem_debug("1:\n");
5875       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5876                     ds_unneeded,ds_unneeded_upper);
5877       // load regs
5878       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5879       address_generation(i+1,&branch_regs[i],0);
5880       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5881       ds_assemble(i+1,&branch_regs[i]);
5882       cc=get_reg(branch_regs[i].regmap,CCREG);
5883       if(cc==-1) {
5884         emit_loadreg(CCREG,cc=HOST_CCREG);
5885         // CHECK: Is the following instruction (fall thru) allocated ok?
5886       }
5887       assert(cc==HOST_CCREG);
5888       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5889       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5890       assem_debug("cycle count (adj)\n");
5891       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5892       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5893       if(internal)
5894         assem_debug("branch: internal\n");
5895       else
5896         assem_debug("branch: external\n");
5897       if(internal&&is_ds[(ba[i]-start)>>2]) {
5898         ds_assemble_entry(i);
5899       }
5900       else {
5901         add_to_linker((int)out,ba[i],internal);
5902         emit_jmp(0);
5903       }
5904     }
5905     // branch not taken
5906     cop1_usable=prev_cop1_usable;
5907     if(!unconditional) {
5908       if(nottaken1) set_jump_target(nottaken1,(int)out);
5909       set_jump_target(nottaken,(int)out);
5910       assem_debug("2:\n");
5911       if(!likely[i]) {
5912         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5913                       ds_unneeded,ds_unneeded_upper);
5914         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5915         address_generation(i+1,&branch_regs[i],0);
5916         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5917         ds_assemble(i+1,&branch_regs[i]);
5918       }
5919       cc=get_reg(branch_regs[i].regmap,CCREG);
5920       if(cc==-1&&!likely[i]) {
5921         // Cycle count isn't in a register, temporarily load it then write it out
5922         emit_loadreg(CCREG,HOST_CCREG);
5923         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5924         int jaddr=(int)out;
5925         emit_jns(0);
5926         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5927         emit_storereg(CCREG,HOST_CCREG);
5928       }
5929       else{
5930         cc=get_reg(i_regmap,CCREG);
5931         assert(cc==HOST_CCREG);
5932         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5933         int jaddr=(int)out;
5934         emit_jns(0);
5935         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5936       }
5937     }
5938   }
5939 }
5940
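// Assemble a REGIMM branch on the sign of rs (BLTZ/BGEZ and the -AL link
// variants).  Branches on $zero are folded: the BGEZ forms become
// unconditional and the BLTZ forms are never taken.  For the linking
// forms, $ra is written even when the branch is not taken, matching MIPS
// behaviour.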
5941 void sjump_assemble(int i,struct regstat *i_regs)
5942 {
5943   signed char *i_regmap=i_regs->regmap;
5944   int cc;
5945   int match;
5946   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5947   assem_debug("smatch=%d\n",match);
5948   int s1h,s1l;
5949   int prev_cop1_usable=cop1_usable;
5950   int unconditional=0,nevertaken=0;
5951   int only32=0;
5952   int invert=0;
5953   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5954   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5955   if(!match) invert=1;
5956   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5957   if(i>(ba[i]-start)>>2) invert=1;
5958   #endif
5959
5960   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5961   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5962
5963   if(ooo[i]) {
5964     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5965     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5966   }
5967   else {
5968     s1l=get_reg(i_regmap,rs1[i]);
5969     s1h=get_reg(i_regmap,rs1[i]|64);
5970   }
5971   if(rs1[i]==0)
5972   {
5973     if(opcode2[i]&1) unconditional=1;
5974     else nevertaken=1;
5975     // These are never taken (r0 is never less than zero)
5976     //assert(opcode2[i]!=0);
5977     //assert(opcode2[i]!=2);
5978     //assert(opcode2[i]!=0x10);
5979     //assert(opcode2[i]!=0x12);
5980   }
5981   else {
5982     only32=(regs[i].was32>>rs1[i])&1;
5983   }
5984
5985   if(ooo[i]) {
5986     // Out of order execution (delay slot first)
5987     //printf("OOOE\n");
5988     address_generation(i+1,i_regs,regs[i].regmap_entry);
5989     ds_assemble(i+1,i_regs);
5990     int adj;
5991     uint64_t bc_unneeded=branch_regs[i].u;
5992     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5993     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5994     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5995     bc_unneeded|=1;
5996     bc_unneeded_upper|=1;
5997     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5998                   bc_unneeded,bc_unneeded_upper);
5999     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
6000     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6001     if(rt1[i]==31) {
6002       int rt,return_address;
6003       rt=get_reg(branch_regs[i].regmap,31);
6004       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6005       if(rt>=0) {
6006         // Save the PC even if the branch is not taken
6007         return_address=start+i*4+8;
6008         emit_movimm(return_address,rt); // PC into link register
6009         #ifdef IMM_PREFETCH
6010         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
6011         #endif
6012       }
6013     }
6014     cc=get_reg(branch_regs[i].regmap,CCREG);
6015     assert(cc==HOST_CCREG);
6016     if(unconditional) 
6017       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6018     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
6019     assem_debug("cycle count (adj)\n");
6020     if(unconditional) {
6021       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
6022       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
6023         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
6024         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6025         if(internal)
6026           assem_debug("branch: internal\n");
6027         else
6028           assem_debug("branch: external\n");
6029         if(internal&&is_ds[(ba[i]-start)>>2]) {
6030           ds_assemble_entry(i);
6031         }
6032         else {
6033           add_to_linker((int)out,ba[i],internal);
6034           emit_jmp(0);
6035         }
6036         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6037         if(((u_int)out)&7) emit_addnop(0);
6038         #endif
6039       }
6040     }
6041     else if(nevertaken) {
6042       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
6043       int jaddr=(int)out;
6044       emit_jns(0);
6045       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6046     }
6047     else {
6048       int nottaken=0;
6049       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
6050       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
6051       if(!only32)
6052       {
6053         assert(s1h>=0);
6054         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
6055         {
6056           emit_test(s1h,s1h);
6057           if(invert){
6058             nottaken=(int)out;
6059             emit_jns(1);
6060           }else{
6061             add_to_linker((int)out,ba[i],internal);
6062             emit_js(0);
6063           }
6064         }
6065         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
6066         {
6067           emit_test(s1h,s1h);
6068           if(invert){
6069             nottaken=(int)out;
6070             emit_js(1);
6071           }else{
6072             add_to_linker((int)out,ba[i],internal);
6073             emit_jns(0);
6074           }
6075         }
6076       } // if(!only32)
6077       else
6078       {
6079         assert(s1l>=0);
6080         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
6081         {
6082           emit_test(s1l,s1l);
6083           if(invert){
6084             nottaken=(int)out;
6085             emit_jns(1);
6086           }else{
6087             add_to_linker((int)out,ba[i],internal);
6088             emit_js(0);
6089           }
6090         }
6091         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
6092         {
6093           emit_test(s1l,s1l);
6094           if(invert){
6095             nottaken=(int)out;
6096             emit_js(1);
6097           }else{
6098             add_to_linker((int)out,ba[i],internal);
6099             emit_jns(0);
6100           }
6101         }
6102       } // if(!only32)
6103           
6104       if(invert) {
6105         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6106         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
6107           if(adj) {
6108             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
6109             add_to_linker((int)out,ba[i],internal);
6110           }else{
6111             emit_addnop(13);
6112             add_to_linker((int)out,ba[i],internal*2);
6113           }
6114           emit_jmp(0);
6115         }else
6116         #endif
6117         {
6118           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
6119           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6120           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6121           if(internal)
6122             assem_debug("branch: internal\n");
6123           else
6124             assem_debug("branch: external\n");
6125           if(internal&&is_ds[(ba[i]-start)>>2]) {
6126             ds_assemble_entry(i);
6127           }
6128           else {
6129             add_to_linker((int)out,ba[i],internal);
6130             emit_jmp(0);
6131           }
6132         }
6133         set_jump_target(nottaken,(int)out);
6134       }
6135
6136       if(adj) {
6137         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
6138       }
6139     } // (!unconditional)
6140   } // if(ooo)
6141   else
6142   {
6143     // In-order execution (branch first)
6144     //printf("IOE\n");
6145     int nottaken=0;
6146     if(rt1[i]==31) {
6147       int rt,return_address;
6148       rt=get_reg(branch_regs[i].regmap,31);
6149       if(rt>=0) {
6150         // Save the PC even if the branch is not taken
6151         return_address=start+i*4+8;
6152         emit_movimm(return_address,rt); // PC into link register
6153         #ifdef IMM_PREFETCH
6154         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
6155         #endif
6156       }
6157     }
6158     if(!unconditional) {
6159       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6160       if(!only32)
6161       {
6162         assert(s1h>=0);
6163         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
6164         {
6165           emit_test(s1h,s1h);
6166           nottaken=(int)out;
6167           emit_jns(1);
6168         }
6169         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
6170         {
6171           emit_test(s1h,s1h);
6172           nottaken=(int)out;
6173           emit_js(1);
6174         }
6175       } // if(!only32)
6176       else
6177       {
6178         assert(s1l>=0);
6179         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
6180         {
6181           emit_test(s1l,s1l);
6182           nottaken=(int)out;
6183           emit_jns(1);
6184         }
6185         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
6186         {
6187           emit_test(s1l,s1l);
6188           nottaken=(int)out;
6189           emit_js(1);
6190         }
6191       }
6192     } // if(!unconditional)
6193     int adj;
6194     uint64_t ds_unneeded=branch_regs[i].u;
6195     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6196     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6197     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6198     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6199     ds_unneeded|=1;
6200     ds_unneeded_upper|=1;
6201     // branch taken
6202     if(!nevertaken) {
6203       //assem_debug("1:\n");
6204       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6205                     ds_unneeded,ds_unneeded_upper);
6206       // load regs
6207       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6208       address_generation(i+1,&branch_regs[i],0);
6209       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6210       ds_assemble(i+1,&branch_regs[i]);
6211       cc=get_reg(branch_regs[i].regmap,CCREG);
6212       if(cc==-1) {
6213         emit_loadreg(CCREG,cc=HOST_CCREG);
6214         // CHECK: Is the following instruction (fall thru) allocated ok?
6215       }
6216       assert(cc==HOST_CCREG);
6217       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6218       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6219       assem_debug("cycle count (adj)\n");
6220       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
6221       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6222       if(internal)
6223         assem_debug("branch: internal\n");
6224       else
6225         assem_debug("branch: external\n");
6226       if(internal&&is_ds[(ba[i]-start)>>2]) {
6227         ds_assemble_entry(i);
6228       }
6229       else {
6230         add_to_linker((int)out,ba[i],internal);
6231         emit_jmp(0);
6232       }
6233     }
6234     // branch not taken
6235     cop1_usable=prev_cop1_usable;
6236     if(!unconditional) {
6237       set_jump_target(nottaken,(int)out);
6238       assem_debug("1:\n");
6239       if(!likely[i]) {
6240         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6241                       ds_unneeded,ds_unneeded_upper);
6242         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6243         address_generation(i+1,&branch_regs[i],0);
6244         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6245         ds_assemble(i+1,&branch_regs[i]);
6246       }
6247       cc=get_reg(branch_regs[i].regmap,CCREG);
6248       if(cc==-1&&!likely[i]) {
6249         // Cycle count isn't in a register, temporarily load it then write it out
6250         emit_loadreg(CCREG,HOST_CCREG);
6251         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
6252         int jaddr=(int)out;
6253         emit_jns(0);
6254         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6255         emit_storereg(CCREG,HOST_CCREG);
6256       }
6257       else{
6258         cc=get_reg(i_regmap,CCREG);
6259         assert(cc==HOST_CCREG);
6260         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
6261         int jaddr=(int)out;
6262         emit_jns(0);
6263         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6264       }
6265     }
6266   }
6267 }
6268
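// Assemble a COP1 condition branch (BC1T/BC1F) by testing the FP condition
// bit (0x800000) in the register allocated for FSREG.  If this is the
// first FPU use in the block, a coprocessor-unusable check (FP_STUB) is
// emitted first.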
6269 void fjump_assemble(int i,struct regstat *i_regs)
6270 {
6271   signed char *i_regmap=i_regs->regmap;
6272   int cc;
6273   int match;
6274   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6275   assem_debug("fmatch=%d\n",match);
6276   int fs,cs;
6277   int eaddr;
6278   int invert=0;
6279   int internal=internal_branch(branch_regs[i].is32,ba[i]);
6280   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
6281   if(!match) invert=1;
6282   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6283   if(i>(ba[i]-start)>>2) invert=1;
6284   #endif
6285
6286   if(ooo[i]) {
6287     fs=get_reg(branch_regs[i].regmap,FSREG);
6288     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
6289   }
6290   else {
6291     fs=get_reg(i_regmap,FSREG);
6292   }
6293
6294   // Check cop1 unusable
6295   if(!cop1_usable) {
6296     cs=get_reg(i_regmap,CSREG);
6297     assert(cs>=0);
6298     emit_testimm(cs,0x20000000);
6299     eaddr=(int)out;
6300     emit_jeq(0);
6301     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
6302     cop1_usable=1;
6303   }
6304
6305   if(ooo[i]) {
6306     // Out of order execution (delay slot first)
6307     //printf("OOOE\n");
6308     ds_assemble(i+1,i_regs);
6309     int adj;
6310     uint64_t bc_unneeded=branch_regs[i].u;
6311     uint64_t bc_unneeded_upper=branch_regs[i].uu;
6312     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6313     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
6314     bc_unneeded|=1;
6315     bc_unneeded_upper|=1;
6316     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6317                   bc_unneeded,bc_unneeded_upper);
6318     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
6319     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6320     cc=get_reg(branch_regs[i].regmap,CCREG);
6321     assert(cc==HOST_CCREG);
6322     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
6323     assem_debug("cycle count (adj)\n");
6324     if(1) {
6325       int nottaken=0;
6326       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
6327       if(1) {
6328         assert(fs>=0);
6329         emit_testimm(fs,0x800000);
6330         if(source[i]&0x10000) // BC1T
6331         {
6332           if(invert){
6333             nottaken=(int)out;
6334             emit_jeq(1);
6335           }else{
6336             add_to_linker((int)out,ba[i],internal);
6337             emit_jne(0);
6338           }
6339         }
6340         else // BC1F
6341         {
6342           if(invert){
6343             nottaken=(int)out;
6344             emit_jne(1);
6345           }else{
6346             add_to_linker((int)out,ba[i],internal);
6347             emit_jeq(0);
6348           }
6349         }
6350       }
6351           
6352       if(invert) {
6353         if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
6354         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6355         else if(match) emit_addnop(13);
6356         #endif
6357         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6358         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6359         if(internal)
6360           assem_debug("branch: internal\n");
6361         else
6362           assem_debug("branch: external\n");
6363         if(internal&&is_ds[(ba[i]-start)>>2]) {
6364           ds_assemble_entry(i);
6365         }
6366         else {
6367           add_to_linker((int)out,ba[i],internal);
6368           emit_jmp(0);
6369         }
6370         set_jump_target(nottaken,(int)out);
6371       }
6372
6373       if(adj) {
6374         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
6375       }
6376     } // (!unconditional)
6377   } // if(ooo)
6378   else
6379   {
6380     // In-order execution (branch first)
6381     //printf("IOE\n");
6382     int nottaken=0;
6383     if(1) {
6384       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6385       if(1) {
6386         assert(fs>=0);
6387         emit_testimm(fs,0x800000);
6388         if(source[i]&0x10000) // BC1T
6389         {
6390           nottaken=(int)out;
6391           emit_jeq(1);
6392         }
6393         else // BC1F
6394         {
6395           nottaken=(int)out;
6396           emit_jne(1);
6397         }
6398       }
6399     } // if(!unconditional)
6400     int adj;
6401     uint64_t ds_unneeded=branch_regs[i].u;
6402     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6403     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6404     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6405     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6406     ds_unneeded|=1;
6407     ds_unneeded_upper|=1;
6408     // branch taken
6409     //assem_debug("1:\n");
6410     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6411                   ds_unneeded,ds_unneeded_upper);
6412     // load regs
6413     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6414     address_generation(i+1,&branch_regs[i],0);
6415     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6416     ds_assemble(i+1,&branch_regs[i]);
6417     cc=get_reg(branch_regs[i].regmap,CCREG);
6418     if(cc==-1) {
6419       emit_loadreg(CCREG,cc=HOST_CCREG);
6420       // CHECK: Is the following instruction (fall thru) allocated ok?
6421     }
6422     assert(cc==HOST_CCREG);
6423     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6424     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6425     assem_debug("cycle count (adj)\n");
6426     if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
6427     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6428     if(internal)
6429       assem_debug("branch: internal\n");
6430     else
6431       assem_debug("branch: external\n");
6432     if(internal&&is_ds[(ba[i]-start)>>2]) {
6433       ds_assemble_entry(i);
6434     }
6435     else {
6436       add_to_linker((int)out,ba[i],internal);
6437       emit_jmp(0);
6438     }
6439
6440     // branch not taken
6441     if(1) { // <- FIXME (don't need this)
6442       set_jump_target(nottaken,(int)out);
6443       assem_debug("1:\n");
6444       if(!likely[i]) {
6445         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6446                       ds_unneeded,ds_unneeded_upper);
6447         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6448         address_generation(i+1,&branch_regs[i],0);
6449         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6450         ds_assemble(i+1,&branch_regs[i]);
6451       }
6452       cc=get_reg(branch_regs[i].regmap,CCREG);
6453       if(cc==-1&&!likely[i]) {
6454         // Cycle count isn't in a register, temporarily load it then write it out
6455         emit_loadreg(CCREG,HOST_CCREG);
6456         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
6457         int jaddr=(int)out;
6458         emit_jns(0);
6459         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6460         emit_storereg(CCREG,HOST_CCREG);
6461       }
6462       else{
6463         cc=get_reg(i_regmap,CCREG);
6464         assert(cc==HOST_CCREG);
6465         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
6466         int jaddr=(int)out;
6467         emit_jns(0);
6468         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6469       }
6470     }
6471   }
6472 }
6473
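// Assemble a branch that cannot be linked directly because its delay slot
// appears to fall on the next virtual page (SPAN).  The branch outcome is
// computed into a host register with conditional moves between ba[i] and
// the fall-through address rather than emitting patchable jumps.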
6474 static void pagespan_assemble(int i,struct regstat *i_regs)
6475 {
6476   int s1l=get_reg(i_regs->regmap,rs1[i]);
6477   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
6478   int s2l=get_reg(i_regs->regmap,rs2[i]);
6479   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
6480   void *nt_branch=NULL;
6481   int taken=0;
6482   int nottaken=0;
6483   int unconditional=0;
6484   if(rs1[i]==0)
6485   {
6486     s1l=s2l;s1h=s2h;
6487     s2l=s2h=-1;
6488   }
6489   else if(rs2[i]==0)
6490   {
6491     s2l=s2h=-1;
6492   }
6493   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
6494     s1h=s2h=-1;
6495   }
6496   int hr=0;
6497   int addr,alt,ntaddr;
6498   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
6499   else {
6500     while(hr<HOST_REGS)
6501     {
6502       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
6503          (i_regs->regmap[hr]&63)!=rs1[i] &&
6504          (i_regs->regmap[hr]&63)!=rs2[i] )
6505       {
6506         addr=hr++;break;
6507       }
6508       hr++;
6509     }
6510   }
6511   while(hr<HOST_REGS)
6512   {
6513     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6514        (i_regs->regmap[hr]&63)!=rs1[i] &&
6515        (i_regs->regmap[hr]&63)!=rs2[i] )
6516     {
6517       alt=hr++;break;
6518     }
6519     hr++;
6520   }
6521   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
6522   {
6523     while(hr<HOST_REGS)
6524     {
6525       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6526          (i_regs->regmap[hr]&63)!=rs1[i] &&
6527          (i_regs->regmap[hr]&63)!=rs2[i] )
6528       {
6529         ntaddr=hr;break;
6530       }
6531       hr++;
6532     }
6533   }
6534   assert(hr<HOST_REGS);
6535   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
6536     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
6537   }
6538   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
6539   if(opcode[i]==2) // J
6540   {
6541     unconditional=1;
6542   }
6543   if(opcode[i]==3) // JAL
6544   {
6545     // TODO: mini_ht
6546     int rt=get_reg(i_regs->regmap,31);
6547     emit_movimm(start+i*4+8,rt);
6548     unconditional=1;
6549   }
6550   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6551   {
6552     emit_mov(s1l,addr);
6553     if(opcode2[i]==9) // JALR
6554     {
6555       int rt=get_reg(i_regs->regmap,rt1[i]);
6556       emit_movimm(start+i*4+8,rt);
6557     }
6558   }
6559   if((opcode[i]&0x3f)==4) // BEQ
6560   {
6561     if(rs1[i]==rs2[i])
6562     {
6563       unconditional=1;
6564     }
6565     else
6566     #ifdef HAVE_CMOV_IMM
6567     if(s1h<0) {
6568       if(s2l>=0) emit_cmp(s1l,s2l);
6569       else emit_test(s1l,s1l);
6570       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6571     }
6572     else
6573     #endif
6574     {
6575       assert(s1l>=0);
6576       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6577       if(s1h>=0) {
6578         if(s2h>=0) emit_cmp(s1h,s2h);
6579         else emit_test(s1h,s1h);
6580         emit_cmovne_reg(alt,addr);
6581       }
6582       if(s2l>=0) emit_cmp(s1l,s2l);
6583       else emit_test(s1l,s1l);
6584       emit_cmovne_reg(alt,addr);
6585     }
6586   }
6587   if((opcode[i]&0x3f)==5) // BNE
6588   {
6589     #ifdef HAVE_CMOV_IMM
6590     if(s1h<0) {
6591       if(s2l>=0) emit_cmp(s1l,s2l);
6592       else emit_test(s1l,s1l);
6593       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6594     }
6595     else
6596     #endif
6597     {
6598       assert(s1l>=0);
6599       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6600       if(s1h>=0) {
6601         if(s2h>=0) emit_cmp(s1h,s2h);
6602         else emit_test(s1h,s1h);
6603         emit_cmovne_reg(alt,addr);
6604       }
6605       if(s2l>=0) emit_cmp(s1l,s2l);
6606       else emit_test(s1l,s1l);
6607       emit_cmovne_reg(alt,addr);
6608     }
6609   }
6610   if((opcode[i]&0x3f)==0x14) // BEQL
6611   {
6612     if(s1h>=0) {
6613       if(s2h>=0) emit_cmp(s1h,s2h);
6614       else emit_test(s1h,s1h);
6615       nottaken=(int)out;
6616       emit_jne(0);
6617     }
6618     if(s2l>=0) emit_cmp(s1l,s2l);
6619     else emit_test(s1l,s1l);
6620     if(nottaken) set_jump_target(nottaken,(int)out);
6621     nottaken=(int)out;
6622     emit_jne(0);
6623   }
6624   if((opcode[i]&0x3f)==0x15) // BNEL
6625   {
6626     if(s1h>=0) {
6627       if(s2h>=0) emit_cmp(s1h,s2h);
6628       else emit_test(s1h,s1h);
6629       taken=(int)out;
6630       emit_jne(0);
6631     }
6632     if(s2l>=0) emit_cmp(s1l,s2l);
6633     else emit_test(s1l,s1l);
6634     nottaken=(int)out;
6635     emit_jeq(0);
6636     if(taken) set_jump_target(taken,(int)out);
6637   }
6638   if((opcode[i]&0x3f)==6) // BLEZ
6639   {
6640     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6641     emit_cmpimm(s1l,1);
6642     if(s1h>=0) emit_mov(addr,ntaddr);
6643     emit_cmovl_reg(alt,addr);
6644     if(s1h>=0) {
6645       emit_test(s1h,s1h);
6646       emit_cmovne_reg(ntaddr,addr);
6647       emit_cmovs_reg(alt,addr);
6648     }
6649   }
6650   if((opcode[i]&0x3f)==7) // BGTZ
6651   {
6652     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6653     emit_cmpimm(s1l,1);
6654     if(s1h>=0) emit_mov(addr,alt);
6655     emit_cmovl_reg(ntaddr,addr);
6656     if(s1h>=0) {
6657       emit_test(s1h,s1h);
6658       emit_cmovne_reg(alt,addr);
6659       emit_cmovs_reg(ntaddr,addr);
6660     }
6661   }
6662   if((opcode[i]&0x3f)==0x16) // BLEZL
6663   {
6664     assert((opcode[i]&0x3f)!=0x16);
6665   }
6666   if((opcode[i]&0x3f)==0x17) // BGTZL
6667   {
6668     assert((opcode[i]&0x3f)!=0x17);
6669   }
6670   assert(opcode[i]!=1); // BLTZ/BGEZ
6671
6672   //FIXME: Check CSREG
6673   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6674     if((source[i]&0x30000)==0) // BC1F
6675     {
6676       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6677       emit_testimm(s1l,0x800000);
6678       emit_cmovne_reg(alt,addr);
6679     }
6680     if((source[i]&0x30000)==0x10000) // BC1T
6681     {
6682       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6683       emit_testimm(s1l,0x800000);
6684       emit_cmovne_reg(alt,addr);
6685     }
6686     if((source[i]&0x30000)==0x20000) // BC1FL
6687     {
6688       emit_testimm(s1l,0x800000);
6689       nottaken=(int)out;
6690       emit_jne(0);
6691     }
6692     if((source[i]&0x30000)==0x30000) // BC1TL
6693     {
6694       emit_testimm(s1l,0x800000);
6695       nottaken=(int)out;
6696       emit_jeq(0);
6697     }
6698   }
6699
6700   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6701   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6702   if(likely[i]||unconditional)
6703   {
6704     emit_movimm(ba[i],HOST_BTREG);
6705   }
6706   else if(addr!=HOST_BTREG)
6707   {
6708     emit_mov(addr,HOST_BTREG);
6709   }
6710   void *branch_addr=out;
6711   emit_jmp(0);
6712   int target_addr=start+i*4+5;
6713   void *stub=out;
6714   void *compiled_target_addr=check_addr(target_addr);
6715   emit_extjump_ds((int)branch_addr,target_addr);
6716   if(compiled_target_addr) {
6717     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6718     add_link(target_addr,stub);
6719   }
6720   else set_jump_target((int)branch_addr,(int)stub);
6721   if(likely[i]) {
6722     // Not-taken path
6723     set_jump_target((int)nottaken,(int)out);
6724     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6725     void *branch_addr=out;
6726     emit_jmp(0);
6727     int target_addr=start+i*4+8;
6728     void *stub=out;
6729     void *compiled_target_addr=check_addr(target_addr);
6730     emit_extjump_ds((int)branch_addr,target_addr);
6731     if(compiled_target_addr) {
6732       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6733       add_link(target_addr,stub);
6734     }
6735     else set_jump_target((int)branch_addr,(int)stub);
6736   }
6737 }
6738
6739 // Assemble the delay slot for the above
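// Roughly: the delay slot is compiled as a tiny block of its own at the top
// of the new page, registered under vaddr start+1 (the pagespan marker).
// It restores CCREG/BTREG if needed, assembles the single delay-slot
// instruction, then compares BTREG against start+4: equal means the branch
// fell through, so execution continues sequentially; anything else is
// dispatched through jump_vaddr_reg.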
6740 static void pagespan_ds()
6741 {
6742   assem_debug("initial delay slot:\n");
6743   u_int vaddr=start+1;
6744   u_int page=get_page(vaddr);
6745   u_int vpage=get_vpage(vaddr);
6746   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6747   do_dirty_stub_ds();
6748   ll_add(jump_in+page,vaddr,(void *)out);
6749   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6750   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6751     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6752   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6753     emit_writeword(HOST_BTREG,(int)&branch_target);
6754   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6755   address_generation(0,&regs[0],regs[0].regmap_entry);
6756   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6757     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6758   cop1_usable=0;
6759   is_delayslot=0;
6760   switch(itype[0]) {
6761     case ALU:
6762       alu_assemble(0,&regs[0]);break;
6763     case IMM16:
6764       imm16_assemble(0,&regs[0]);break;
6765     case SHIFT:
6766       shift_assemble(0,&regs[0]);break;
6767     case SHIFTIMM:
6768       shiftimm_assemble(0,&regs[0]);break;
6769     case LOAD:
6770       load_assemble(0,&regs[0]);break;
6771     case LOADLR:
6772       loadlr_assemble(0,&regs[0]);break;
6773     case STORE:
6774       store_assemble(0,&regs[0]);break;
6775     case STORELR:
6776       storelr_assemble(0,&regs[0]);break;
6777     case COP0:
6778       cop0_assemble(0,&regs[0]);break;
6779     case COP1:
6780       cop1_assemble(0,&regs[0]);break;
6781     case C1LS:
6782       c1ls_assemble(0,&regs[0]);break;
6783     case COP2:
6784       cop2_assemble(0,&regs[0]);break;
6785     case C2LS:
6786       c2ls_assemble(0,&regs[0]);break;
6787     case C2OP:
6788       c2op_assemble(0,&regs[0]);break;
6789     case FCONV:
6790       fconv_assemble(0,&regs[0]);break;
6791     case FLOAT:
6792       float_assemble(0,&regs[0]);break;
6793     case FCOMP:
6794       fcomp_assemble(0,&regs[0]);break;
6795     case MULTDIV:
6796       multdiv_assemble(0,&regs[0]);break;
6797     case MOV:
6798       mov_assemble(0,&regs[0]);break;
6799     case SYSCALL:
6800     case HLECALL:
6801     case INTCALL:
6802     case SPAN:
6803     case UJUMP:
6804     case RJUMP:
6805     case CJUMP:
6806     case SJUMP:
6807     case FJUMP:
6808       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
6809   }
6810   int btaddr=get_reg(regs[0].regmap,BTREG);
6811   if(btaddr<0) {
6812     btaddr=get_reg(regs[0].regmap,-1);
6813     emit_readword((int)&branch_target,btaddr);
6814   }
6815   assert(btaddr!=HOST_CCREG);
6816   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6817 #ifdef HOST_IMM8
6818   emit_movimm(start+4,HOST_TEMPREG);
6819   emit_cmp(btaddr,HOST_TEMPREG);
6820 #else
6821   emit_cmpimm(btaddr,start+4);
6822 #endif
6823   int branch=(int)out;
6824   emit_jeq(0);
6825   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6826   emit_jmp(jump_vaddr_reg[btaddr]);
6827   set_jump_target(branch,(int)out);
6828   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6829   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6830 }
6831
6832 // Basic liveness analysis for MIPS registers
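// unneeded_registers() scans the block backwards and records, per
// instruction, a bitmask of MIPS registers whose current values are never
// read again before being overwritten, so later passes can skip writing
// them back.  u covers the register values themselves, uu the upper 32 bits
// of 64-bit registers, gte_u the GTE (COP2) registers.
#if 0
/* Minimal sketch of the per-instruction update applied by the backward scan.
   Illustration only: unneeded_after() and its parameter names are
   hypothetical and not part of the recompiler. */
static uint64_t unneeded_after(uint64_t u, int rt, int rs_a, int rs_b)
{
  u |= 1LL << rt;       // the value written here is dead above this point
  u &= ~(1LL << rs_a);  // values read here are needed above this point
  u &= ~(1LL << rs_b);
  u |= 1;               // $zero is always unneeded
  return u;
}
#endif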
6833 void unneeded_registers(int istart,int iend,int r)
6834 {
6835   int i;
6836   uint64_t u,uu,gte_u,b,bu,gte_bu;
6837   uint64_t temp_u,temp_uu,temp_gte_u=0;
6838   uint64_t tdep;
6839   uint64_t gte_u_unknown=0;
6840   if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
6841     gte_u_unknown=~0ll;
6842   if(iend==slen-1) {
6843     u=1;uu=1;
6844     gte_u=gte_u_unknown;
6845   }else{
6846     u=unneeded_reg[iend+1];
6847     uu=unneeded_reg_upper[iend+1];
6848     u=1;uu=1;
6849     gte_u=gte_unneeded[iend+1];
6850   }
6851
6852   for (i=iend;i>=istart;i--)
6853   {
6854     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6855     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6856     {
6857       // If subroutine call, flag return address as a possible branch target
6858       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6859       
6860       if(ba[i]<start || ba[i]>=(start+slen*4))
6861       {
6862         // Branch out of this block, flush all regs
6863         u=1;
6864         uu=1;
6865         gte_u=gte_u_unknown;
6866         /* Hexagon hack 
6867         if(itype[i]==UJUMP&&rt1[i]==31)
6868         {
6869           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6870         }
6871         if(itype[i]==RJUMP&&rs1[i]==31)
6872         {
6873           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6874         }
6875         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6876           if(itype[i]==UJUMP&&rt1[i]==31)
6877           {
6878             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6879             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6880           }
6881           if(itype[i]==RJUMP&&rs1[i]==31)
6882           {
6883             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6884             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6885           }
6886         }*/
6887         branch_unneeded_reg[i]=u;
6888         branch_unneeded_reg_upper[i]=uu;
6889         // Merge in delay slot
6890         tdep=(~uu>>rt1[i+1])&1;
6891         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6892         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6893         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6894         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6895         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6896         u|=1;uu|=1;
6897         gte_u|=gte_rt[i+1];
6898         gte_u&=~gte_rs[i+1];
6899         // If branch is "likely" (and conditional)
6900         // then we skip the delay slot on the fall-thru path
6901         if(likely[i]) {
6902           if(i<slen-1) {
6903             u&=unneeded_reg[i+2];
6904             uu&=unneeded_reg_upper[i+2];
6905             gte_u&=gte_unneeded[i+2];
6906           }
6907           else
6908           {
6909             u=1;
6910             uu=1;
6911             gte_u=gte_u_unknown;
6912           }
6913         }
6914       }
6915       else
6916       {
6917         // Internal branch, flag target
6918         bt[(ba[i]-start)>>2]=1;
6919         if(ba[i]<=start+i*4) {
6920           // Backward branch
6921           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6922           {
6923             // Unconditional branch
6924             temp_u=1;temp_uu=1;
6925             temp_gte_u=0;
6926           } else {
6927             // Conditional branch (not taken case)
6928             temp_u=unneeded_reg[i+2];
6929             temp_uu=unneeded_reg_upper[i+2];
6930             temp_gte_u&=gte_unneeded[i+2];
6931           }
6932           // Merge in delay slot
6933           tdep=(~temp_uu>>rt1[i+1])&1;
6934           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6935           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6936           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6937           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6938           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6939           temp_u|=1;temp_uu|=1;
6940           temp_gte_u|=gte_rt[i+1];
6941           temp_gte_u&=~gte_rs[i+1];
6942           // If branch is "likely" (and conditional)
6943           // then we skip the delay slot on the fall-thru path
6944           if(likely[i]) {
6945             if(i<slen-1) {
6946               temp_u&=unneeded_reg[i+2];
6947               temp_uu&=unneeded_reg_upper[i+2];
6948               temp_gte_u&=gte_unneeded[i+2];
6949             }
6950             else
6951             {
6952               temp_u=1;
6953               temp_uu=1;
6954               temp_gte_u=gte_u_unknown;
6955             }
6956           }
6957           tdep=(~temp_uu>>rt1[i])&1;
6958           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6959           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6960           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6961           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6962           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6963           temp_u|=1;temp_uu|=1;
6964           temp_gte_u|=gte_rt[i];
6965           temp_gte_u&=~gte_rs[i];
6966           unneeded_reg[i]=temp_u;
6967           unneeded_reg_upper[i]=temp_uu;
6968           gte_unneeded[i]=temp_gte_u;
6969           // Only go three levels deep.  This recursion can take an
6970           // excessive amount of time if there are a lot of nested loops.
6971           if(r<2) {
6972             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6973           }else{
6974             unneeded_reg[(ba[i]-start)>>2]=1;
6975             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6976             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
6977           }
6978         } /*else*/ if(1) {
6979           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6980           {
6981             // Unconditional branch
6982             u=unneeded_reg[(ba[i]-start)>>2];
6983             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6984             gte_u=gte_unneeded[(ba[i]-start)>>2];
6985             branch_unneeded_reg[i]=u;
6986             branch_unneeded_reg_upper[i]=uu;
6987         //u=1;
6988         //uu=1;
6989         //branch_unneeded_reg[i]=u;
6990         //branch_unneeded_reg_upper[i]=uu;
6991             // Merge in delay slot
6992             tdep=(~uu>>rt1[i+1])&1;
6993             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6994             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6995             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6996             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6997             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6998             u|=1;uu|=1;
6999             gte_u|=gte_rt[i+1];
7000             gte_u&=~gte_rs[i+1];
7001           } else {
7002             // Conditional branch
7003             b=unneeded_reg[(ba[i]-start)>>2];
7004             bu=unneeded_reg_upper[(ba[i]-start)>>2];
7005             gte_bu=gte_unneeded[(ba[i]-start)>>2];
7006             branch_unneeded_reg[i]=b;
7007             branch_unneeded_reg_upper[i]=bu;
7008         //b=1;
7009         //bu=1;
7010         //branch_unneeded_reg[i]=b;
7011         //branch_unneeded_reg_upper[i]=bu;
7012             // Branch delay slot
7013             tdep=(~uu>>rt1[i+1])&1;
7014             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
7015             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
7016             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
7017             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
7018             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
7019             b|=1;bu|=1;
7020             gte_bu|=gte_rt[i+1];
7021             gte_bu&=~gte_rs[i+1];
7022             // If branch is "likely" then we skip the
7023             // delay slot on the fall-thru path
7024             if(likely[i]) {
7025               u=b;
7026               uu=bu;
7027               gte_u=gte_bu;
7028               if(i<slen-1) {
7029                 u&=unneeded_reg[i+2];
7030                 uu&=unneeded_reg_upper[i+2];
7031                 gte_u&=gte_unneeded[i+2];
7032         //u=1;
7033         //uu=1;
7034               }
7035             } else {
7036               u&=b;
7037               uu&=bu;
7038               gte_u&=gte_bu;
7039         //u=1;
7040         //uu=1;
7041             }
7042             if(i<slen-1) {
7043               branch_unneeded_reg[i]&=unneeded_reg[i+2];
7044               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
7045         //branch_unneeded_reg[i]=1;
7046         //branch_unneeded_reg_upper[i]=1;
7047             } else {
7048               branch_unneeded_reg[i]=1;
7049               branch_unneeded_reg_upper[i]=1;
7050             }
7051           }
7052         }
7053       }
7054     }
7055     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7056     {
7057       // SYSCALL instruction (software interrupt)
7058       u=1;
7059       uu=1;
7060     }
7061     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7062     {
7063       // ERET instruction (return from interrupt)
7064       u=1;
7065       uu=1;
7066     }
7067     //u=uu=1; // DEBUG
7068     tdep=(~uu>>rt1[i])&1;
7069     // Written registers are unneeded
7070     u|=1LL<<rt1[i];
7071     u|=1LL<<rt2[i];
7072     uu|=1LL<<rt1[i];
7073     uu|=1LL<<rt2[i];
7074     gte_u|=gte_rt[i];
7075     // Accessed registers are needed
7076     u&=~(1LL<<rs1[i]);
7077     u&=~(1LL<<rs2[i]);
7078     uu&=~(1LL<<us1[i]);
7079     uu&=~(1LL<<us2[i]);
7080     gte_u&=~gte_rs[i];
7081     if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
7082       gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
7083     // Source-target dependencies
7084     uu&=~(tdep<<dep1[i]);
7085     uu&=~(tdep<<dep2[i]);
7086     // R0 is always unneeded
7087     u|=1;uu|=1;
7088     // Save it
7089     unneeded_reg[i]=u;
7090     unneeded_reg_upper[i]=uu;
7091     gte_unneeded[i]=gte_u;
7092     /*
7093     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
7094     printf("U:");
7095     int r;
7096     for(r=1;r<=CCREG;r++) {
7097       if((unneeded_reg[i]>>r)&1) {
7098         if(r==HIREG) printf(" HI");
7099         else if(r==LOREG) printf(" LO");
7100         else printf(" r%d",r);
7101       }
7102     }
7103     printf(" UU:");
7104     for(r=1;r<=CCREG;r++) {
7105       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
7106         if(r==HIREG) printf(" HI");
7107         else if(r==LOREG) printf(" LO");
7108         else printf(" r%d",r);
7109       }
7110     }
7111     printf("\n");*/
7112   }
7113 #ifdef FORCE32
7114   for (i=iend;i>=istart;i--)
7115   {
7116     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
7117   }
7118 #endif
7119 }
7120
7121 // Identify registers which are likely to contain 32-bit values
7122 // This is used to predict whether any branches will jump to a
7123 // location with 64-bit values in registers.
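// provisional_32bit() makes a forward estimate: bit r of is32 is set when
// MIPS register r is believed to hold a sign-extended 32-bit value at that
// point.  p32[i] stores the per-instruction estimate; at branch targets the
// estimate is intersected with that of earlier branches to the same address
// and reset to a conservative value if a later (not yet scanned) branch
// also targets it.
#if 0
/* Illustration of one of the propagation rules (ORI/XORI below): the result
   is 32-bit exactly when the source register is.  Hypothetical helper, not
   used by the recompiler. */
static uint64_t is32_after_ori(uint64_t is32, int rt, int rs)
{
  uint64_t sr = (is32 >> rs) & 1LL;  // is the source known to be 32-bit?
  is32 &= ~(1LL << rt);
  is32 |= sr << rt;
  return is32;
}
#endif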
7124 static void provisional_32bit()
7125 {
7126   int i,j;
7127   uint64_t is32=1;
7128   uint64_t lastbranch=1;
7129   
7130   for(i=0;i<slen;i++)
7131   {
7132     if(i>0) {
7133       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
7134         if(i>1) is32=lastbranch;
7135         else is32=1;
7136       }
7137     }
7138     if(i>1)
7139     {
7140       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
7141         if(likely[i-2]) {
7142           if(i>2) is32=lastbranch;
7143           else is32=1;
7144         }
7145       }
7146       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
7147       {
7148         if(rs1[i-2]==0||rs2[i-2]==0)
7149         {
7150           if(rs1[i-2]) {
7151             is32|=1LL<<rs1[i-2];
7152           }
7153           if(rs2[i-2]) {
7154             is32|=1LL<<rs2[i-2];
7155           }
7156         }
7157       }
7158     }
7159     // If something jumps here with 64-bit values
7160     // then promote those registers to 64 bits
7161     if(bt[i])
7162     {
7163       uint64_t temp_is32=is32;
7164       for(j=i-1;j>=0;j--)
7165       {
7166         if(ba[j]==start+i*4) 
7167           //temp_is32&=branch_regs[j].is32;
7168           temp_is32&=p32[j];
7169       }
7170       for(j=i;j<slen;j++)
7171       {
7172         if(ba[j]==start+i*4) 
7173           temp_is32=1;
7174       }
7175       is32=temp_is32;
7176     }
7177     int type=itype[i];
7178     int op=opcode[i];
7179     int op2=opcode2[i];
7180     int rt=rt1[i];
7181     int s1=rs1[i];
7182     int s2=rs2[i];
7183     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
7184       // Branches don't write registers; consider the delay slot instead.
7185       type=itype[i+1];
7186       op=opcode[i+1];
7187       op2=opcode2[i+1];
7188       rt=rt1[i+1];
7189       s1=rs1[i+1];
7190       s2=rs2[i+1];
7191       lastbranch=is32;
7192     }
7193     switch(type) {
7194       case LOAD:
7195         if(opcode[i]==0x27||opcode[i]==0x37|| // LWU/LD
7196            opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
7197           is32&=~(1LL<<rt);
7198         else
7199           is32|=1LL<<rt;
7200         break;
7201       case STORE:
7202       case STORELR:
7203         break;
7204       case LOADLR:
7205         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
7206         if(op==0x22) is32|=1LL<<rt; // LWL
7207         break;
7208       case IMM16:
7209         if (op==0x08||op==0x09|| // ADDI/ADDIU
7210             op==0x0a||op==0x0b|| // SLTI/SLTIU
7211             op==0x0c|| // ANDI
7212             op==0x0f)  // LUI
7213         {
7214           is32|=1LL<<rt;
7215         }
7216         if(op==0x18||op==0x19) { // DADDI/DADDIU
7217           is32&=~(1LL<<rt);
7218           //if(imm[i]==0)
7219           //  is32|=((is32>>s1)&1LL)<<rt;
7220         }
7221         if(op==0x0d||op==0x0e) { // ORI/XORI
7222           uint64_t sr=((is32>>s1)&1LL);
7223           is32&=~(1LL<<rt);
7224           is32|=sr<<rt;
7225         }
7226         break;
7227       case UJUMP:
7228         break;
7229       case RJUMP:
7230         break;
7231       case CJUMP:
7232         break;
7233       case SJUMP:
7234         break;
7235       case FJUMP:
7236         break;
7237       case ALU:
7238         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
7239           is32|=1LL<<rt;
7240         }
7241         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7242           is32|=1LL<<rt;
7243         }
7244         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7245           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
7246           is32&=~(1LL<<rt);
7247           is32|=sr<<rt;
7248         }
7249         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
7250           if(s1==0&&s2==0) {
7251             is32|=1LL<<rt;
7252           }
7253           else if(s2==0) {
7254             uint64_t sr=((is32>>s1)&1LL);
7255             is32&=~(1LL<<rt);
7256             is32|=sr<<rt;
7257           }
7258           else if(s1==0) {
7259             uint64_t sr=((is32>>s2)&1LL);
7260             is32&=~(1LL<<rt);
7261             is32|=sr<<rt;
7262           }
7263           else {
7264             is32&=~(1LL<<rt);
7265           }
7266         }
7267         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
7268           if(s1==0&&s2==0) {
7269             is32|=1LL<<rt;
7270           }
7271           else if(s2==0) {
7272             uint64_t sr=((is32>>s1)&1LL);
7273             is32&=~(1LL<<rt);
7274             is32|=sr<<rt;
7275           }
7276           else {
7277             is32&=~(1LL<<rt);
7278           }
7279         }
7280         break;
7281       case MULTDIV:
7282         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7283           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
7284         }
7285         else {
7286           is32|=(1LL<<HIREG)|(1LL<<LOREG);
7287         }
7288         break;
7289       case MOV:
7290         {
7291           uint64_t sr=((is32>>s1)&1LL);
7292           is32&=~(1LL<<rt);
7293           is32|=sr<<rt;
7294         }
7295         break;
7296       case SHIFT:
7297         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
7298         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
7299         break;
7300       case SHIFTIMM:
7301         is32|=1LL<<rt;
7302         // DSLL/DSRL/DSRA/DSLL32/DSRL32 but not DSRA32 have 64-bit result
7303         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
7304         break;
7305       case COP0:
7306         if(op2==0) is32|=1LL<<rt; // MFC0
7307         break;
7308       case COP1:
7309       case COP2:
7310         if(op2==0) is32|=1LL<<rt; // MFC1
7311         if(op2==1) is32&=~(1LL<<rt); // DMFC1
7312         if(op2==2) is32|=1LL<<rt; // CFC1
7313         break;
7314       case C1LS:
7315       case C2LS:
7316         break;
7317       case FLOAT:
7318       case FCONV:
7319         break;
7320       case FCOMP:
7321         break;
7322       case C2OP:
7323       case SYSCALL:
7324       case HLECALL:
7325         break;
7326       default:
7327         break;
7328     }
7329     is32|=1;
7330     p32[i]=is32;
7331
7332     if(i>0)
7333     {
7334       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
7335       {
7336         if(rt1[i-1]==31) // JAL/JALR
7337         {
7338           // Subroutine call will return here, don't alloc any registers
7339           is32=1;
7340         }
7341         else if(i+1<slen)
7342         {
7343           // Internal branch will jump here, match registers to caller
7344           is32=0x3FFFFFFFFLL;
7345         }
7346       }
7347     }
7348   }
7349 }
7350
7351 // Identify registers which may be assumed to contain 32-bit values
7352 // and where optimizations will rely on this.
7353 // This is used to determine whether backward branches can safely
7354 // jump to a location with 64-bit values in registers.
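// Roughly: provisional_r32() is the companion backward pass.  pr32[i]
// collects the registers whose values must genuinely be kept 32-bit at
// instruction i, either because a later use relies on it or because a dirty
// 32-bit host register would otherwise be written back with the wrong width.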
7355 static void provisional_r32()
7356 {
7357   u_int r32=0;
7358   int i;
7359   
7360   for (i=slen-1;i>=0;i--)
7361   {
7362     int hr;
7363     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7364     {
7365       if(ba[i]<start || ba[i]>=(start+slen*4))
7366       {
7367         // Branch out of this block, don't need anything
7368         r32=0;
7369       }
7370       else
7371       {
7372         // Internal branch
7373         // Need whatever matches the target
7374         // (and doesn't get overwritten by the delay slot instruction)
7375         r32=0;
7376         int t=(ba[i]-start)>>2;
7377         if(ba[i]>start+i*4) {
7378           // Forward branch
7379           //if(!(requires_32bit[t]&~regs[i].was32))
7380           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7381           if(!(pr32[t]&~regs[i].was32))
7382             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7383         }else{
7384           // Backward branch
7385           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
7386             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7387         }
7388       }
7389       // Conditional branch may need registers for following instructions
7390       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
7391       {
7392         if(i<slen-2) {
7393           //r32|=requires_32bit[i+2];
7394           r32|=pr32[i+2];
7395           r32&=regs[i].was32;
7396           // Mark this address as a branch target since it may be called
7397           // upon return from interrupt
7398           //bt[i+2]=1;
7399         }
7400       }
7401       // Merge in delay slot
7402       if(!likely[i]) {
7403         // These are overwritten unless the branch is "likely"
7404         // and the delay slot is nullified if not taken
7405         r32&=~(1LL<<rt1[i+1]);
7406         r32&=~(1LL<<rt2[i+1]);
7407       }
7408       // Assume these are needed (delay slot)
7409       if(us1[i+1]>0)
7410       {
7411         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
7412       }
7413       if(us2[i+1]>0)
7414       {
7415         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
7416       }
7417       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
7418       {
7419         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
7420       }
7421       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
7422       {
7423         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
7424       }
7425     }
7426     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7427     {
7428       // SYSCALL instruction (software interrupt)
7429       r32=0;
7430     }
7431     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7432     {
7433       // ERET instruction (return from interrupt)
7434       r32=0;
7435     }
7436     // Check 32 bits
7437     r32&=~(1LL<<rt1[i]);
7438     r32&=~(1LL<<rt2[i]);
7439     if(us1[i]>0)
7440     {
7441       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
7442     }
7443     if(us2[i]>0)
7444     {
7445       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
7446     }
7447     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
7448     {
7449       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
7450     }
7451     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
7452     {
7453       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
7454     }
7455     //requires_32bit[i]=r32;
7456     pr32[i]=r32;
7457     
7458     // Dirty registers which are 32-bit require 32-bit input,
7459     // as they will be written back as 32-bit values
7460     for(hr=0;hr<HOST_REGS;hr++)
7461     {
7462       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
7463         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
7464           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
7465             pr32[i]|=1LL<<regs[i].regmap_entry[hr];
7466           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
7467         }
7468       }
7469     }
7470   }
7471 }
7472
7473 // Write back dirty registers as soon as we will no longer modify them,
7474 // so that we don't end up with lots of writes at the branches.
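// Roughly: clean_registers() walks the block backwards computing two host-
// register bitmasks per instruction, will_dirty[] and wont_dirty[], merging
// information across branches and delay slots (with limited recursion for
// backward branches).  When wr is nonzero the results are folded into
// regs[].dirty / regs[].wasdirty and branch_regs[].dirty so that values can
// be written back at the last point they are modified rather than flushed
// in bulk at block exits.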
7475 void clean_registers(int istart,int iend,int wr)
7476 {
7477   int i;
7478   int r;
7479   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
7480   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
7481   if(iend==slen-1) {
7482     will_dirty_i=will_dirty_next=0;
7483     wont_dirty_i=wont_dirty_next=0;
7484   }else{
7485     will_dirty_i=will_dirty_next=will_dirty[iend+1];
7486     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
7487   }
7488   for (i=iend;i>=istart;i--)
7489   {
7490     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7491     {
7492       if(ba[i]<start || ba[i]>=(start+slen*4))
7493       {
7494         // Branch out of this block, flush all regs
7495         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7496         {
7497           // Unconditional branch
7498           will_dirty_i=0;
7499           wont_dirty_i=0;
7500           // Merge in delay slot (will dirty)
7501           for(r=0;r<HOST_REGS;r++) {
7502             if(r!=EXCLUDE_REG) {
7503               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7504               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7505               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7506               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7507               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7508               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7509               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7510               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7511               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7512               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7513               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7514               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7515               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7516               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7517             }
7518           }
7519         }
7520         else
7521         {
7522           // Conditional branch
7523           will_dirty_i=0;
7524           wont_dirty_i=wont_dirty_next;
7525           // Merge in delay slot (will dirty)
7526           for(r=0;r<HOST_REGS;r++) {
7527             if(r!=EXCLUDE_REG) {
7528               if(!likely[i]) {
7529                 // Might not dirty if likely branch is not taken
7530                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7531                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7532                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7533                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7534                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7535                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
7536                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7537                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7538                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7539                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7540                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7541                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7542                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7543                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7544               }
7545             }
7546           }
7547         }
7548         // Merge in delay slot (won't dirty)
7549         for(r=0;r<HOST_REGS;r++) {
7550           if(r!=EXCLUDE_REG) {
7551             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7552             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7553             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7554             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7555             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7556             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7557             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7558             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7559             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7560             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7561           }
7562         }
7563         if(wr) {
7564           #ifndef DESTRUCTIVE_WRITEBACK
7565           branch_regs[i].dirty&=wont_dirty_i;
7566           #endif
7567           branch_regs[i].dirty|=will_dirty_i;
7568         }
7569       }
7570       else
7571       {
7572         // Internal branch
7573         if(ba[i]<=start+i*4) {
7574           // Backward branch
7575           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7576           {
7577             // Unconditional branch
7578             temp_will_dirty=0;
7579             temp_wont_dirty=0;
7580             // Merge in delay slot (will dirty)
7581             for(r=0;r<HOST_REGS;r++) {
7582               if(r!=EXCLUDE_REG) {
7583                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7584                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7585                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7586                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7587                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7588                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7589                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7590                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7591                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7592                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7593                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7594                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7595                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7596                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7597               }
7598             }
7599           } else {
7600             // Conditional branch (not taken case)
7601             temp_will_dirty=will_dirty_next;
7602             temp_wont_dirty=wont_dirty_next;
7603             // Merge in delay slot (will dirty)
7604             for(r=0;r<HOST_REGS;r++) {
7605               if(r!=EXCLUDE_REG) {
7606                 if(!likely[i]) {
7607                   // Will not dirty if likely branch is not taken
7608                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7609                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7610                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7611                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7612                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7613                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7614                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7615                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7616                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7617                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7618                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7619                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7620                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7621                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7622                 }
7623               }
7624             }
7625           }
7626           // Merge in delay slot (won't dirty)
7627           for(r=0;r<HOST_REGS;r++) {
7628             if(r!=EXCLUDE_REG) {
7629               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7630               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7631               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7632               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7633               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7634               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7635               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7636               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7637               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7638               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7639             }
7640           }
7641           // Deal with changed mappings
7642           if(i<iend) {
7643             for(r=0;r<HOST_REGS;r++) {
7644               if(r!=EXCLUDE_REG) {
7645                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7646                   temp_will_dirty&=~(1<<r);
7647                   temp_wont_dirty&=~(1<<r);
7648                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7649                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7650                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7651                   } else {
7652                     temp_will_dirty|=1<<r;
7653                     temp_wont_dirty|=1<<r;
7654                   }
7655                 }
7656               }
7657             }
7658           }
7659           if(wr) {
7660             will_dirty[i]=temp_will_dirty;
7661             wont_dirty[i]=temp_wont_dirty;
7662             clean_registers((ba[i]-start)>>2,i-1,0);
7663           }else{
7664             // Limit recursion.  It can take an excessive amount
7665             // of time if there are a lot of nested loops.
7666             will_dirty[(ba[i]-start)>>2]=0;
7667             wont_dirty[(ba[i]-start)>>2]=-1;
7668           }
7669         }
7670         /*else*/ if(1)
7671         {
7672           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7673           {
7674             // Unconditional branch
7675             will_dirty_i=0;
7676             wont_dirty_i=0;
7677           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7678             for(r=0;r<HOST_REGS;r++) {
7679               if(r!=EXCLUDE_REG) {
7680                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7681                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7682                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7683                 }
7684                 if(branch_regs[i].regmap[r]>=0) {
7685                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
7686                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
7687                 }
7688               }
7689             }
7690           //}
7691             // Merge in delay slot
7692             for(r=0;r<HOST_REGS;r++) {
7693               if(r!=EXCLUDE_REG) {
7694                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7695                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7696                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7697                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7698                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7699                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7700                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7701                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7702                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7703                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7704                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7705                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7706                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7707                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7708               }
7709             }
7710           } else {
7711             // Conditional branch
7712             will_dirty_i=will_dirty_next;
7713             wont_dirty_i=wont_dirty_next;
7714           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7715             for(r=0;r<HOST_REGS;r++) {
7716               if(r!=EXCLUDE_REG) {
7717                 signed char target_reg=branch_regs[i].regmap[r];
7718                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7719                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7720                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7721                 }
7722                 else if(target_reg>=0) {
7723                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
7724                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
7725                 }
7726                 // Treat delay slot as part of branch too
7727                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7728                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7729                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7730                 }
7731                 else
7732                 {
7733                   will_dirty[i+1]&=~(1<<r);
7734                 }*/
7735               }
7736             }
7737           //}
7738             // Merge in delay slot
7739             for(r=0;r<HOST_REGS;r++) {
7740               if(r!=EXCLUDE_REG) {
7741                 if(!likely[i]) {
7742                   // Might not dirty if likely branch is not taken
7743                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7744                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7745                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7746                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7747                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7748                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7749                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7750                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7751                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7752                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7753                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7754                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7755                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7756                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7757                 }
7758               }
7759             }
7760           }
7761           // Merge in delay slot (won't dirty)
7762           for(r=0;r<HOST_REGS;r++) {
7763             if(r!=EXCLUDE_REG) {
7764               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7765               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7766               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7767               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7768               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7769               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7770               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7771               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7772               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7773               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7774             }
7775           }
7776           if(wr) {
7777             #ifndef DESTRUCTIVE_WRITEBACK
7778             branch_regs[i].dirty&=wont_dirty_i;
7779             #endif
7780             branch_regs[i].dirty|=will_dirty_i;
7781           }
7782         }
7783       }
7784     }
7785     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
7786     {
7787       // SYSCALL instruction (software interrupt)
7788       will_dirty_i=0;
7789       wont_dirty_i=0;
7790     }
7791     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7792     {
7793       // ERET instruction (return from interrupt)
7794       will_dirty_i=0;
7795       wont_dirty_i=0;
7796     }
7797     will_dirty_next=will_dirty_i;
7798     wont_dirty_next=wont_dirty_i;
7799     for(r=0;r<HOST_REGS;r++) {
7800       if(r!=EXCLUDE_REG) {
7801         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7802         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7803         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7804         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7805         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7806         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7807         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7808         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7809         if(i>istart) {
7810           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) 
7811           {
7812             // Don't store a register immediately after writing it,
7813             // as doing so may prevent dual-issue.
7814             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7815             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7816           }
7817         }
7818       }
7819     }
7820     // Save it
7821     will_dirty[i]=will_dirty_i;
7822     wont_dirty[i]=wont_dirty_i;
7823     // Mark registers that won't be dirtied as not dirty
7824     if(wr) {
7825       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7826       for(r=0;r<HOST_REGS;r++) {
7827         if((will_dirty_i>>r)&1) {
7828           printf(" r%d",r);
7829         }
7830       }
7831       printf("\n");*/
7832
7833       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7834         regs[i].dirty|=will_dirty_i;
7835         #ifndef DESTRUCTIVE_WRITEBACK
7836         regs[i].dirty&=wont_dirty_i;
7837         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7838         {
7839           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7840             for(r=0;r<HOST_REGS;r++) {
7841               if(r!=EXCLUDE_REG) {
7842                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7843                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7844                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
7845               }
7846             }
7847           }
7848         }
7849         else
7850         {
7851           if(i<iend) {
7852             for(r=0;r<HOST_REGS;r++) {
7853               if(r!=EXCLUDE_REG) {
7854                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7855                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7856                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
7857               }
7858             }
7859           }
7860         }
7861         #endif
7862       //}
7863     }
7864     // Deal with changed mappings
7865     temp_will_dirty=will_dirty_i;
7866     temp_wont_dirty=wont_dirty_i;
7867     for(r=0;r<HOST_REGS;r++) {
7868       if(r!=EXCLUDE_REG) {
7869         int nr;
7870         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7871           if(wr) {
7872             #ifndef DESTRUCTIVE_WRITEBACK
7873             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7874             #endif
7875             regs[i].wasdirty|=will_dirty_i&(1<<r);
7876           }
7877         }
7878         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7879           // Register moved to a different register
7880           will_dirty_i&=~(1<<r);
7881           wont_dirty_i&=~(1<<r);
7882           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7883           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7884           if(wr) {
7885             #ifndef DESTRUCTIVE_WRITEBACK
7886             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7887             #endif
7888             regs[i].wasdirty|=will_dirty_i&(1<<r);
7889           }
7890         }
7891         else {
7892           will_dirty_i&=~(1<<r);
7893           wont_dirty_i&=~(1<<r);
7894           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7895             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7896             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7897           } else {
7898             wont_dirty_i|=1<<r;
7899             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);*/ /*assert(!((will_dirty>>r)&1));*/
7900           }
7901         }
7902       }
7903     }
7904   }
7905 }
7906
7907 #ifdef DISASM
7908   /* disassembly */
7909 void disassemble_inst(int i)
7910 {
7911     if (bt[i]) printf("*"); else printf(" ");
7912     switch(itype[i]) {
7913       case UJUMP:
7914         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7915       case CJUMP:
7916         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7917       case SJUMP:
7918         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7919       case FJUMP:
7920         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7921       case RJUMP:
7922         if (opcode[i]==0x9&&rt1[i]!=31)
7923           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
7924         else
7925           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7926         break;
7927       case SPAN:
7928         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7929       case IMM16:
7930         if(opcode[i]==0xf) //LUI
7931           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7932         else
7933           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7934         break;
7935       case LOAD:
7936       case LOADLR:
7937         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7938         break;
7939       case STORE:
7940       case STORELR:
7941         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7942         break;
7943       case ALU:
7944       case SHIFT:
7945         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7946         break;
7947       case MULTDIV:
7948         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7949         break;
7950       case SHIFTIMM:
7951         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7952         break;
7953       case MOV:
7954         if((opcode2[i]&0x1d)==0x10)
7955           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7956         else if((opcode2[i]&0x1d)==0x11)
7957           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7958         else
7959           printf (" %x: %s\n",start+i*4,insn[i]);
7960         break;
7961       case COP0:
7962         if(opcode2[i]==0)
7963           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7964         else if(opcode2[i]==4)
7965           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7966         else printf (" %x: %s\n",start+i*4,insn[i]);
7967         break;
7968       case COP1:
7969         if(opcode2[i]<3)
7970           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7971         else if(opcode2[i]>3)
7972           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7973         else printf (" %x: %s\n",start+i*4,insn[i]);
7974         break;
7975       case COP2:
7976         if(opcode2[i]<3)
7977           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
7978         else if(opcode2[i]>3)
7979           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
7980         else printf (" %x: %s\n",start+i*4,insn[i]);
7981         break;
7982       case C1LS:
7983         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7984         break;
7985       case C2LS:
7986         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7987         break;
7988       case INTCALL:
7989         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
7990         break;
7991       default:
7992         //printf (" %s %8x\n",insn[i],source[i]);
7993         printf (" %x: %s\n",start+i*4,insn[i]);
7994     }
7995 }
7996 #else
7997 static void disassemble_inst(int i) {}
7998 #endif // DISASM
7999
8000 // clear the state completely, instead of just marking
8001 // things invalid like invalidate_all_pages() does
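// Everything derived from previously translated code is discarded: the
// output pointer returns to the start of the translation cache, the hash
// tables and jump_in/jump_out/jump_dirty lists are emptied, and (on builds
// with a TLB) the memory map is rebuilt.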
8002 void new_dynarec_clear_full()
8003 {
8004   int n;
8005   out=(u_char *)BASE_ADDR;
8006   memset(invalid_code,1,sizeof(invalid_code));
8007   memset(hash_table,0xff,sizeof(hash_table));
8008   memset(mini_ht,-1,sizeof(mini_ht));
8009   memset(restore_candidate,0,sizeof(restore_candidate));
8010   memset(shadow,0,sizeof(shadow));
8011   copy=shadow;
8012   expirep=16384; // Expiry pointer, +2 blocks
8013   pending_exception=0;
8014   literalcount=0;
8015   stop_after_jal=0;
8016   inv_code_start=inv_code_end=~0;
8017   // TLB
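  // memory_map[] has one entry per 4KB page: the (host - guest) address delta
  // for that page shifted right by 2, or -1 when unmapped.  524288 entries
  // cover the low 2GB, the next 2048 map the 8MB of RDRAM at 0x80000000, and
  // everything above stays unmapped.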
8018 #ifndef DISABLE_TLB
8019   using_tlb=0;
8020   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
8021     memory_map[n]=-1;
8022   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
8023     memory_map[n]=((u_int)rdram-0x80000000)>>2;
8024   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
8025     memory_map[n]=-1;
8026 #endif
8027   for(n=0;n<4096;n++) ll_clear(jump_in+n);
8028   for(n=0;n<4096;n++) ll_clear(jump_out+n);
8029   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
8030 }
8031
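/* One-time startup: make the translation cache at BASE_ADDR writable and
 * executable, reset all recompiler state via new_dynarec_clear_full(), and
 * (in the mupen64 build) install the default memory handler tables. */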
8032 void new_dynarec_init()
8033 {
8034   printf("Init new dynarec\n");
8035   out=(u_char *)BASE_ADDR;
8036 #if BASE_ADDR_FIXED
8037   if (mmap (out, 1<<TARGET_SIZE_2,
8038             PROT_READ | PROT_WRITE | PROT_EXEC,
8039             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
8040             -1, 0) == MAP_FAILED) {SysPrintf("mmap() failed\n");}
8041 #else
8042   // not all systems allow executing code in the data segment by default
8043   if (mprotect(out, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
8044     SysPrintf("mprotect() failed\n");
8045 #endif
8046 #ifdef MUPEN64
8047   rdword=&readmem_dword;
8048   fake_pc.f.r.rs=&readmem_dword;
8049   fake_pc.f.r.rt=&readmem_dword;
8050   fake_pc.f.r.rd=&readmem_dword;
8051 #endif
8052   int n;
8053   cycle_multiplier=200;
8054   new_dynarec_clear_full();
8055 #ifdef HOST_IMM8
8056   // Copy this into local area so we don't have to put it in every literal pool
8057   invc_ptr=invalid_code;
8058 #endif
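  // mupen64 build only: route guest memory accesses through per-64KB handler
  // tables; RDRAM pages get direct handlers, everything else falls back to
  // the *_nomem_* handlers.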
8059 #ifdef MUPEN64
8060   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
8061     writemem[n] = write_nomem_new;
8062     writememb[n] = write_nomemb_new;
8063     writememh[n] = write_nomemh_new;
8064 #ifndef FORCE32
8065     writememd[n] = write_nomemd_new;
8066 #endif
8067     readmem[n] = read_nomem_new;
8068     readmemb[n] = read_nomemb_new;
8069     readmemh[n] = read_nomemh_new;
8070 #ifndef FORCE32
8071     readmemd[n] = read_nomemd_new;
8072 #endif
8073   }
8074   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
8075     writemem[n] = write_rdram_new;
8076     writememb[n] = write_rdramb_new;
8077     writememh[n] = write_rdramh_new;
8078 #ifndef FORCE32
8079     writememd[n] = write_rdramd_new;
8080 #endif
8081   }
8082   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
8083     writemem[n] = write_nomem_new;
8084     writememb[n] = write_nomemb_new;
8085     writememh[n] = write_nomemh_new;
8086 #ifndef FORCE32
8087     writememd[n] = write_nomemd_new;
8088 #endif
8089     readmem[n] = read_nomem_new;
8090     readmemb[n] = read_nomemb_new;
8091     readmemh[n] = read_nomemh_new;
8092 #ifndef FORCE32
8093     readmemd[n] = read_nomemd_new;
8094 #endif
8095   }
8096 #endif
8097   tlb_hacks();
8098   arch_init();
8099 #ifndef RAM_FIXED
8100   ram_offset=(u_int)rdram-0x80000000;
8101 #endif
8102   if (ram_offset!=0)
8103     SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
8104 }
8105
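/* Tear-down: unmap the translation cache (when it was mmap'd at a fixed
 * base), drop all block lists, and release the ROM copy if one was made. */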
8106 void new_dynarec_cleanup()
8107 {
8108   int n;
8109   #if BASE_ADDR_FIXED
8110   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {SysPrintf("munmap() failed\n");}
8111   #endif
8112   for(n=0;n<4096;n++) ll_clear(jump_in+n);
8113   for(n=0;n<4096;n++) ll_clear(jump_out+n);
8114   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
8115   #ifdef ROM_COPY
8116   if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
8117   #endif
8118 }
8119
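/* Recompile one block of guest MIPS code starting at 'addr' into native code
 * at 'out'.  Returns 0 on success, or -1 if the address is unmapped so the
 * caller can raise an exception instead.  The work is split into the
 * numbered passes listed below. */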
8120 int new_recompile_block(int addr)
8121 {
8122 /*
8123   if(addr==0x800cd050) {
8124     int block;
8125     for(block=0x80000;block<0x80800;block++) invalidate_block(block);
8126     int n;
8127     for(n=0;n<=2048;n++) ll_clear(jump_dirty+n);
8128   }
8129 */
8130   //if(Count==365117028) tracedebug=1;
8131   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
8132   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
8133   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
8134   //if(debug) 
8135   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
8136   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
8137   /*if(Count>=312978186) {
8138     rlist();
8139   }*/
8140   //rlist();
8141   start = (u_int)addr&~3;
8142   //assert(((u_int)addr&1)==0);
8143   new_dynarec_did_compile=1;
8144 #ifdef PCSX
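  // PCSX build: special-case the HLE BIOS trap address with a tiny stub that
  // stores pcaddr and leaves the recompiled code, then pick 'source' (host
  // copy of the guest code) and 'pagelimit' based on the address region.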
8145   if (Config.HLE && start == 0x80001000) // hlecall
8146   {
8147     // XXX: is this enough? Maybe check hleSoftCall?
8148     u_int beginning=(u_int)out;
8149     u_int page=get_page(start);
8150     invalid_code[start>>12]=0;
8151     emit_movimm(start,0);
8152     emit_writeword(0,(int)&pcaddr);
8153     emit_jmp((int)new_dyna_leave);
8154     literal_pool(0);
8155 #ifdef __arm__
8156     __clear_cache((void *)beginning,out);
8157 #endif
8158     ll_add(jump_in+page,start,(void *)beginning);
8159     return 0;
8160   }
8161   else if ((u_int)addr < 0x00200000 ||
8162     (0xa0000000 <= addr && addr < 0xa0200000)) {
8163     // used for BIOS calls mostly?
8164     source = (u_int *)((u_int)rdram+(start&0x1fffff));
8165     pagelimit = (addr&0xa0000000)|0x00200000;
8166   }
8167   else if (!Config.HLE && (
8168 /*    (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
8169     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
8170     // BIOS
8171     source = (u_int *)((u_int)psxR+(start&0x7ffff));
8172     pagelimit = (addr&0xfff00000)|0x80000;
8173   }
8174   else
8175 #endif
8176 #ifdef MUPEN64
8177   if ((int)addr >= 0xa4000000 && (int)addr < 0xa4001000) {
8178     source = (u_int *)((u_int)SP_DMEM+start-0xa4000000);
8179     pagelimit = 0xa4001000;
8180   }
8181   else
8182 #endif
8183   if ((int)addr >= 0x80000000 && (int)addr < 0x80000000+RAM_SIZE) {
8184     source = (u_int *)((u_int)rdram+start-0x80000000);
8185     pagelimit = 0x80000000+RAM_SIZE;
8186   }
8187 #ifndef DISABLE_TLB
8188   else if ((signed int)addr >= (signed int)0xC0000000) {
8189     //printf("addr=%x mm=%x\n",(u_int)addr,(memory_map[start>>12]<<2));
8190     //if(tlb_LUT_r[start>>12])
8191       //source = (u_int *)(((int)rdram)+(tlb_LUT_r[start>>12]&0xFFFFF000)+(((int)addr)&0xFFF)-0x80000000);
8192     if((signed int)memory_map[start>>12]>=0) {
8193       source = (u_int *)((u_int)(start+(memory_map[start>>12]<<2)));
8194       pagelimit=(start+4096)&0xFFFFF000;
8195       int map=memory_map[start>>12];
8196       int i;
8197       for(i=0;i<5;i++) {
8198         //printf("start: %x next: %x\n",map,memory_map[pagelimit>>12]);
8199         if((map&0xBFFFFFFF)==(memory_map[pagelimit>>12]&0xBFFFFFFF)) pagelimit+=4096;
8200       }
8201       assem_debug("pagelimit=%x\n",pagelimit);
8202       assem_debug("mapping=%x (%x)\n",memory_map[start>>12],(memory_map[start>>12]<<2)+start);
8203     }
8204     else {
8205       assem_debug("Compile at unmapped memory address: %x \n", (int)addr);
8206       //assem_debug("start: %x next: %x\n",memory_map[start>>12],memory_map[(start+4096)>>12]);
8207       return -1; // Caller will invoke exception handler
8208     }
8209     //printf("source= %x\n",(int)source);
8210   }
8211 #endif
8212   else {
8213     SysPrintf("Compile at bogus memory address: %x \n", (int)addr);
8214     exit(1);
8215   }
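  // From here on, 'source' points at the host-side copy of the guest code for
  // this block and 'pagelimit' bounds how far pass 1 is allowed to scan.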
8216
8217   /* Pass 1: disassemble */
8218   /* Pass 2: register dependencies, branch targets */
8219   /* Pass 3: register allocation */
8220   /* Pass 4: branch dependencies */
8221   /* Pass 5: pre-alloc */
8222   /* Pass 6: optimize clean/dirty state */
8223   /* Pass 7: flag 32-bit registers */
8224   /* Pass 8: assembly */
8225   /* Pass 9: linker */
8226   /* Pass 10: garbage collection / free memory */
8227
8228   int i,j;
8229   int done=0;
8230   unsigned int type,op,op2;
8231
8232   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
8233   
8234   /* Pass 1 disassembly */
8235
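  // Decode one instruction per iteration: fill insn[] (mnemonic), itype[]
  // (instruction class) and opcode/opcode2, then extract the register and
  // immediate fields below.  'done' is set once the block should end.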
8236   for(i=0;!done;i++) {
8237     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
8238     minimum_free_regs[i]=0;
8239     opcode[i]=op=source[i]>>26;
8240     switch(op)
8241     {
8242       case 0x00: strcpy(insn[i],"special"); type=NI;
8243         op2=source[i]&0x3f;
8244         switch(op2)
8245         {
8246           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
8247           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
8248           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
8249           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
8250           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
8251           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
8252           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
8253           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
8254           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
8255           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
8256           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
8257           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
8258           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
8259           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
8260           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
8261           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
8262           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
8263           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
8264           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
8265           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
8266           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
8267           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
8268           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
8269           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
8270           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
8271           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
8272           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
8273           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
8274           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
8275           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
8276           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
8277           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
8278           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
8279           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
8280           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
8281 #ifndef FORCE32
8282           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
8283           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
8284           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
8285           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
8286           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
8287           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
8288           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
8289           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
8290           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
8291           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
8292           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
8293           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
8294           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
8295           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
8296           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
8297           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
8298           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
8299 #endif
8300         }
8301         break;
8302       case 0x01: strcpy(insn[i],"regimm"); type=NI;
8303         op2=(source[i]>>16)&0x1f;
8304         switch(op2)
8305         {
8306           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
8307           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
8308           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
8309           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
8310           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
8311           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
8312           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
8313           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
8314           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
8315           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
8316           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
8317           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
8318           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
8319           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
8320         }
8321         break;
8322       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
8323       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
8324       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
8325       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
8326       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
8327       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
8328       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
8329       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
8330       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
8331       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
8332       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
8333       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
8334       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
8335       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
8336       case 0x10: strcpy(insn[i],"cop0"); type=NI;
8337         op2=(source[i]>>21)&0x1f;
8338         switch(op2)
8339         {
8340           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
8341           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
8342           case 0x10: strcpy(insn[i],"tlb"); type=NI;
8343           switch(source[i]&0x3f)
8344           {
8345             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
8346             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
8347             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
8348             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
8349 #ifdef PCSX
8350             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
8351 #else
8352             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
8353 #endif
8354           }
8355         }
8356         break;
8357       case 0x11: strcpy(insn[i],"cop1"); type=NI;
8358         op2=(source[i]>>21)&0x1f;
8359         switch(op2)
8360         {
8361           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
8362           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
8363           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
8364           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
8365           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
8366           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
8367           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
8368           switch((source[i]>>16)&0x3)
8369           {
8370             case 0x00: strcpy(insn[i],"BC1F"); break;
8371             case 0x01: strcpy(insn[i],"BC1T"); break;
8372             case 0x02: strcpy(insn[i],"BC1FL"); break;
8373             case 0x03: strcpy(insn[i],"BC1TL"); break;
8374           }
8375           break;
8376           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
8377           switch(source[i]&0x3f)
8378           {
8379             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
8380             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
8381             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
8382             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
8383             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
8384             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
8385             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
8386             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
8387             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
8388             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
8389             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
8390             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
8391             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
8392             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
8393             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
8394             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
8395             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
8396             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
8397             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
8398             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
8399             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
8400             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
8401             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
8402             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
8403             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
8404             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
8405             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
8406             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
8407             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
8408             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
8409             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
8410             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
8411             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
8412             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
8413             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
8414           }
8415           break;
8416           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
8417           switch(source[i]&0x3f)
8418           {
8419             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
8420             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
8421             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
8422             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
8423             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
8424             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
8425             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
8426             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
8427             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
8428             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
8429             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
8430             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
8431             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
8432             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
8433             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
8434             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
8435             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
8436             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
8437             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
8438             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
8439             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
8440             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
8441             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
8442             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
8443             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
8444             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
8445             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
8446             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
8447             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
8448             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
8449             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
8450             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
8451             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
8452             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
8453             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
8454           }
8455           break;
8456           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
8457           switch(source[i]&0x3f)
8458           {
8459             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
8460             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
8461           }
8462           break;
8463           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
8464           switch(source[i]&0x3f)
8465           {
8466             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
8467             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
8468           }
8469           break;
8470         }
8471         break;
8472 #ifndef FORCE32
8473       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
8474       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
8475       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
8476       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
8477       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
8478       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
8479       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
8480       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
8481 #endif
8482       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
8483       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
8484       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
8485       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
8486       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
8487       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
8488       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
8489 #ifndef FORCE32
8490       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
8491 #endif
8492       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
8493       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
8494       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
8495       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
8496 #ifndef FORCE32
8497       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
8498       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
8499 #endif
8500       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
8501       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
8502       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
8503       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
8504 #ifndef FORCE32
8505       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
8506       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
8507       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
8508 #endif
8509       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
8510       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
8511 #ifndef FORCE32
8512       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
8513       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
8514       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
8515 #endif
8516 #ifdef PCSX
8517       case 0x12: strcpy(insn[i],"COP2"); type=NI;
8518         op2=(source[i]>>21)&0x1f;
8519         //if (op2 & 0x10) {
8520         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
8521           if (gte_handlers[source[i]&0x3f]!=NULL) {
8522             if (gte_regnames[source[i]&0x3f]!=NULL)
8523               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
8524             else
8525               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
8526             type=C2OP;
8527           }
8528         }
8529         else switch(op2)
8530         {
8531           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
8532           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
8533           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
8534           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
8535         }
8536         break;
8537       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
8538       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
8539       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
8540 #endif
8541       default: strcpy(insn[i],"???"); type=NI;
8542         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
8543         break;
8544     }
8545     itype[i]=type;
8546     opcode2[i]=op2;
8547     /* Get registers/immediates */
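    // rs1/rs2 are source registers, rt1/rt2 destinations.  Indices above 31
    // are internal pseudo-registers, e.g. CCREG (cycle count) and CSREG
    // (cop0 Status, see the MTC0 case), handled by the allocator like
    // ordinary registers.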
8548     lt1[i]=0;
8549     us1[i]=0;
8550     us2[i]=0;
8551     dep1[i]=0;
8552     dep2[i]=0;
8553     gte_rs[i]=gte_rt[i]=0;
8554     switch(type) {
8555       case LOAD:
8556         rs1[i]=(source[i]>>21)&0x1f;
8557         rs2[i]=0;
8558         rt1[i]=(source[i]>>16)&0x1f;
8559         rt2[i]=0;
8560         imm[i]=(short)source[i];
8561         break;
8562       case STORE:
8563       case STORELR:
8564         rs1[i]=(source[i]>>21)&0x1f;
8565         rs2[i]=(source[i]>>16)&0x1f;
8566         rt1[i]=0;
8567         rt2[i]=0;
8568         imm[i]=(short)source[i];
8569         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
8570         break;
8571       case LOADLR:
8572         // LWL/LWR only load part of the register,
8573         // therefore the target register must be treated as a source too
8574         rs1[i]=(source[i]>>21)&0x1f;
8575         rs2[i]=(source[i]>>16)&0x1f;
8576         rt1[i]=(source[i]>>16)&0x1f;
8577         rt2[i]=0;
8578         imm[i]=(short)source[i];
8579         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // 64-bit LDL/LDR
8580         if(op==0x26) dep1[i]=rt1[i]; // LWR
8581         break;
8582       case IMM16:
8583         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
8584         else rs1[i]=(source[i]>>21)&0x1f;
8585         rs2[i]=0;
8586         rt1[i]=(source[i]>>16)&0x1f;
8587         rt2[i]=0;
8588         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
8589           imm[i]=(unsigned short)source[i];
8590         }else{
8591           imm[i]=(short)source[i];
8592         }
8593         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
8594         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
8595         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
8596         break;
8597       case UJUMP:
8598         rs1[i]=0;
8599         rs2[i]=0;
8600         rt1[i]=0;
8601         rt2[i]=0;
8602         // The JAL instruction writes to r31.
8603         if (op&1) {
8604           rt1[i]=31;
8605         }
8606         rs2[i]=CCREG;
8607         break;
8608       case RJUMP:
8609         rs1[i]=(source[i]>>21)&0x1f;
8610         rs2[i]=0;
8611         rt1[i]=0;
8612         rt2[i]=0;
8613         // The JALR instruction writes to rd.
8614         if (op2&1) {
8615           rt1[i]=(source[i]>>11)&0x1f;
8616         }
8617         rs2[i]=CCREG;
8618         break;
8619       case CJUMP:
8620         rs1[i]=(source[i]>>21)&0x1f;
8621         rs2[i]=(source[i]>>16)&0x1f;
8622         rt1[i]=0;
8623         rt2[i]=0;
8624         if(op&2) { // BGTZ/BLEZ
8625           rs2[i]=0;
8626         }
8627         us1[i]=rs1[i];
8628         us2[i]=rs2[i];
8629         likely[i]=op>>4;
8630         break;
8631       case SJUMP:
8632         rs1[i]=(source[i]>>21)&0x1f;
8633         rs2[i]=CCREG;
8634         rt1[i]=0;
8635         rt2[i]=0;
8636         us1[i]=rs1[i];
8637         if(op2&0x10) { // BxxAL
8638           rt1[i]=31;
8639           // NOTE: If the branch is not taken, r31 is still overwritten
8640         }
8641         likely[i]=(op2&2)>>1;
8642         break;
8643       case FJUMP:
8644         rs1[i]=FSREG;
8645         rs2[i]=CSREG;
8646         rt1[i]=0;
8647         rt2[i]=0;
8648         likely[i]=((source[i])>>17)&1;
8649         break;
8650       case ALU:
8651         rs1[i]=(source[i]>>21)&0x1f; // source
8652         rs2[i]=(source[i]>>16)&0x1f; // second source operand (rt)
8653         rt1[i]=(source[i]>>11)&0x1f; // destination
8654         rt2[i]=0;
8655         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
8656           us1[i]=rs1[i];us2[i]=rs2[i];
8657         }
8658         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
8659           dep1[i]=rs1[i];dep2[i]=rs2[i];
8660         }
8661         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
8662           dep1[i]=rs1[i];dep2[i]=rs2[i];
8663         }
8664         break;
8665       case MULTDIV:
8666         rs1[i]=(source[i]>>21)&0x1f; // source
8667         rs2[i]=(source[i]>>16)&0x1f; // second operand (multiplier/divisor)
8668         rt1[i]=HIREG;
8669         rt2[i]=LOREG;
8670         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
8671           us1[i]=rs1[i];us2[i]=rs2[i];
8672         }
8673         break;
8674       case MOV:
8675         rs1[i]=0;
8676         rs2[i]=0;
8677         rt1[i]=0;
8678         rt2[i]=0;
8679         if(op2==0x10) rs1[i]=HIREG; // MFHI
8680         if(op2==0x11) rt1[i]=HIREG; // MTHI
8681         if(op2==0x12) rs1[i]=LOREG; // MFLO
8682         if(op2==0x13) rt1[i]=LOREG; // MTLO
8683         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
8684         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
8685         dep1[i]=rs1[i];
8686         break;
8687       case SHIFT:
8688         rs1[i]=(source[i]>>16)&0x1f; // target of shift
8689         rs2[i]=(source[i]>>21)&0x1f; // shift amount
8690         rt1[i]=(source[i]>>11)&0x1f; // destination
8691         rt2[i]=0;
8692         // DSLLV/DSRLV/DSRAV are 64-bit
8693         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
8694         break;
8695       case SHIFTIMM:
8696         rs1[i]=(source[i]>>16)&0x1f;
8697         rs2[i]=0;
8698         rt1[i]=(source[i]>>11)&0x1f;
8699         rt2[i]=0;
8700         imm[i]=(source[i]>>6)&0x1f;
8701         // DSxx32 instructions
8702         if(op2>=0x3c) imm[i]|=0x20;
8703         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
8704         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
8705         break;
8706       case COP0:
8707         rs1[i]=0;
8708         rs2[i]=0;
8709         rt1[i]=0;
8710         rt2[i]=0;
8711         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
8712         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
8713         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
8714         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
8715         break;
8716       case COP1:
8717         rs1[i]=0;
8718         rs2[i]=0;
8719         rt1[i]=0;
8720         rt2[i]=0;
8721         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
8722         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
8723         if(op2==5) us1[i]=rs1[i]; // DMTC1
8724         rs2[i]=CSREG;
8725         break;
8726       case COP2:
8727         rs1[i]=0;
8728         rs2[i]=0;
8729         rt1[i]=0;
8730         rt2[i]=0;
8731         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
8732         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
8733         rs2[i]=CSREG;
8734         int gr=(source[i]>>11)&0x1F;
8735         switch(op2)
8736         {
8737           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
8738           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
8739           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
8740           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
8741         }
8742         break;
8743       case C1LS:
8744         rs1[i]=(source[i]>>21)&0x1F;
8745         rs2[i]=CSREG;
8746         rt1[i]=0;
8747         rt2[i]=0;
8748         imm[i]=(short)source[i];
8749         break;
8750       case C2LS:
8751         rs1[i]=(source[i]>>21)&0x1F;
8752         rs2[i]=0;
8753         rt1[i]=0;
8754         rt2[i]=0;
8755         imm[i]=(short)source[i];
8756         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
8757         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
8758         break;
8759       case C2OP:
8760         rs1[i]=0;
8761         rs2[i]=0;
8762         rt1[i]=0;
8763         rt2[i]=0;
8764         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
8765         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
8766         gte_rt[i]|=1ll<<63; // every op changes flags
8767         if((source[i]&0x3f)==GTE_MVMVA) {
8768           int v = (source[i] >> 15) & 3;
8769           gte_rs[i]&=~0xe3fll;
8770           if(v==3) gte_rs[i]|=0xe00ll;
8771           else gte_rs[i]|=3ll<<(v*2);
8772         }
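        // MVMVA reads only the selected multiply vector: V0/V1/V2 occupy GTE
        // data regs 0-5 (two regs per vector) and IR1-IR3 are regs 9-11,
        // hence the 0xe3f mask that gets narrowed above.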
8773         break;
8774       case FLOAT:
8775       case FCONV:
8776         rs1[i]=0;
8777         rs2[i]=CSREG;
8778         rt1[i]=0;
8779         rt2[i]=0;
8780         break;
8781       case FCOMP:
8782         rs1[i]=FSREG;
8783         rs2[i]=CSREG;
8784         rt1[i]=FSREG;
8785         rt2[i]=0;
8786         break;
8787       case SYSCALL:
8788       case HLECALL:
8789       case INTCALL:
8790         rs1[i]=CCREG;
8791         rs2[i]=0;
8792         rt1[i]=0;
8793         rt2[i]=0;
8794         break;
8795       default:
8796         rs1[i]=0;
8797         rs2[i]=0;
8798         rt1[i]=0;
8799         rt2[i]=0;
8800     }
8801     /* Calculate branch target addresses */
8802     if(type==UJUMP)
8803       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8804     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8805       ba[i]=start+i*4+8; // Ignore never taken branch
8806     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8807       ba[i]=start+i*4+8; // Ignore never taken branch
8808     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8809       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8810     else ba[i]=-1;
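    // J/JAL: 26-bit target index scaled by 4 and merged with the top nibble
    // of PC+4.  Conditional branches: 16-bit signed offset scaled by 4,
    // relative to PC+4.  E.g. an offset field of 0xfffe (-2) at 0x80010000
    // gives ba = 0x80010004 + (-2<<2) = 0x8000fffc.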
8811 #ifdef PCSX
8812     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
8813       int do_in_intrp=0;
8814       // branch in delay slot?
8815       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
8816         // don't compile the first branch; let the interpreter handle it if this code is ever reached
8817         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
8818         do_in_intrp=1;
8819       }
8820       // basic load delay detection
8821       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
8822         int t=(ba[i-1]-start)/4;
8823         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
8824           // jump target wants DS result - potential load delay effect
8825           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
8826           do_in_intrp=1;
8827           bt[t+1]=1; // expected return from interpreter
8828         }
8829         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
8830               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
8831           // v0 overwrite like this is a sign of trouble, bail out
8832           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
8833           do_in_intrp=1;
8834         }
8835       }
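      // Give up on this pattern: retype the branch as INTCALL so the block
      // exits to the interpreter there, and don't compile the delay slot.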
8836       if(do_in_intrp) {
8837         rs1[i-1]=CCREG;
8838         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
8839         ba[i-1]=-1;
8840         itype[i-1]=INTCALL;
8841         done=2;
8842         i--; // don't compile the DS
8843       }
8844     }
8845 #endif
8846     /* Is this the end of the block? */
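    // done==1 ends the block unconditionally; done==2 ends it only if no
    // branch in this block targets one of the next few instructions
    // (checked a little further down).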
8847     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8848       if(rt1[i-1]==0) { // plain jump, no link register: end the block here
8849         done=2;
8850       }
8851       else { // JAL/JALR: continue compiling past the subroutine call
8852         if(stop_after_jal) done=1;
8853         // Stop on BREAK
8854         if((source[i+1]&0xfc00003f)==0x0d) done=1;
8855       }
8856       // Don't recompile stuff that's already compiled
8857       if(check_addr(start+i*4+4)) done=1;
8858       // Don't get too close to the limit
8859       if(i>MAXBLOCK/2) done=1;
8860     }
8861     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
8862     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
8863     if(done==2) {
8864       // Does the block continue due to a branch?
8865       for(j=i-1;j>=0;j--)
8866       {
8867         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
8868         if(ba[j]==start+i*4+4) done=j=0;
8869         if(ba[j]==start+i*4+8) done=j=0;
8870       }
8871     }
8872     //assert(i<MAXBLOCK-1);
8873     if(start+i*4==pagelimit-4) done=1;
8874     assert(start+i*4<pagelimit);
8875     if (i==MAXBLOCK-1) done=1;
8876     // Stop if we're compiling junk
8877     if(itype[i]==NI&&opcode[i]==0x11) {
8878       done=stop_after_jal=1;
8879       SysPrintf("Disabled speculative precompilation\n");
8880     }
8881   }
8882   slen=i;
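  // A branch as the final decoded instruction right at the page limit is
  // retyped as SPAN ("pagespan") so it can be assembled to continue into the
  // next page.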
8883   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8884     if(start+i*4==pagelimit) {
8885       itype[i-1]=SPAN;
8886     }
8887   }
8888   assert(slen>0);
8889
8890   /* Pass 2 - Register dependencies and branch targets */
8891
8892   unneeded_registers(0,slen-1,0);
8893   
8894   /* Pass 3 - Register allocation */
8895
8896   struct regstat current; // Current register allocations/status
8897   current.is32=1;
8898   current.dirty=0;
8899   current.u=unneeded_reg[0];
8900   current.uu=unneeded_reg_upper[0];
8901   clear_all_regs(current.regmap);
8902   alloc_reg(&current,0,CCREG);
8903   dirty_reg(&current,CCREG);
8904   current.isconst=0;
8905   current.wasconst=0;
8906   current.waswritten=0;
8907   int ds=0;
8908   int cc=0;
8909   int hr=-1;
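  // Walk the block once, recording in regs[i] the allocator state on entry
  // to each instruction and updating 'current' according to the instruction
  // type (big switch below).  Delay slots are allocated together with their
  // branch and then skipped via the 'ds' flag.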
8910
8911 #ifndef FORCE32
8912   provisional_32bit();
8913 #endif
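  // A block address with bit 0 set means compilation starts in a branch
  // delay slot; the branch target is then expected in BTREG.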
8914   if((u_int)addr&1) {
8915     // First instruction is delay slot
8916     cc=-1;
8917     bt[1]=1;
8918     ds=1;
8919     unneeded_reg[0]=1;
8920     unneeded_reg_upper[0]=1;
8921     current.regmap[HOST_BTREG]=BTREG;
8922   }
8923   
8924   for(i=0;i<slen;i++)
8925   {
8926     if(bt[i])
8927     {
8928       int hr;
8929       for(hr=0;hr<HOST_REGS;hr++)
8930       {
8931         // Is this really necessary?
8932         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8933       }
8934       current.isconst=0;
8935       current.waswritten=0;
8936     }
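    // If a BNE/BNEL two instructions back compared a register against $zero,
    // then on this fall-through path that register equals zero, so it can be
    // marked 32-bit and its upper-half mapping dropped.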
8937     if(i>1)
8938     {
8939       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8940       {
8941         if(rs1[i-2]==0||rs2[i-2]==0)
8942         {
8943           if(rs1[i-2]) {
8944             current.is32|=1LL<<rs1[i-2];
8945             int hr=get_reg(current.regmap,rs1[i-2]|64);
8946             if(hr>=0) current.regmap[hr]=-1;
8947           }
8948           if(rs2[i-2]) {
8949             current.is32|=1LL<<rs2[i-2];
8950             int hr=get_reg(current.regmap,rs2[i-2]|64);
8951             if(hr>=0) current.regmap[hr]=-1;
8952           }
8953         }
8954       }
8955     }
8956 #ifndef FORCE32
8957     // If something jumps here with 64-bit values
8958     // then promote those registers to 64 bits
8959     if(bt[i])
8960     {
8961       uint64_t temp_is32=current.is32;
8962       for(j=i-1;j>=0;j--)
8963       {
8964         if(ba[j]==start+i*4) 
8965           temp_is32&=branch_regs[j].is32;
8966       }
8967       for(j=i;j<slen;j++)
8968       {
8969         if(ba[j]==start+i*4) 
8970           //temp_is32=1;
8971           temp_is32&=p32[j];
8972       }
8973       if(temp_is32!=current.is32) {
8974         //printf("dumping 32-bit regs (%x)\n",start+i*4);
8975         #ifndef DESTRUCTIVE_WRITEBACK
8976         if(ds)
8977         #endif
8978         for(hr=0;hr<HOST_REGS;hr++)
8979         {
8980           int r=current.regmap[hr];
8981           if(r>0&&r<64)
8982           {
8983             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
8984               temp_is32|=1LL<<r;
8985               //printf("restore %d\n",r);
8986             }
8987           }
8988         }
8989         current.is32=temp_is32;
8990       }
8991     }
8992 #else
8993     current.is32=-1LL;
8994 #endif
8995
8996     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8997     regs[i].wasconst=current.isconst;
8998     regs[i].was32=current.is32;
8999     regs[i].wasdirty=current.dirty;
9000     regs[i].loadedconst=0;
9001     #if defined(DESTRUCTIVE_WRITEBACK) && !defined(FORCE32)
9002     // To change a dirty register from 32 to 64 bits, we must write
9003     // it out during the previous cycle (for branches, 2 cycles)
9004     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
9005     {
9006       uint64_t temp_is32=current.is32;
9007       for(j=i-1;j>=0;j--)
9008       {
9009         if(ba[j]==start+i*4+4) 
9010           temp_is32&=branch_regs[j].is32;
9011       }
9012       for(j=i;j<slen;j++)
9013       {
9014         if(ba[j]==start+i*4+4) 
9015           //temp_is32=1;
9016           temp_is32&=p32[j];
9017       }
9018       if(temp_is32!=current.is32) {
9019         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
9020         for(hr=0;hr<HOST_REGS;hr++)
9021         {
9022           int r=current.regmap[hr];
9023           if(r>0)
9024           {
9025             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
9026               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
9027               {
9028                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
9029                 {
9030                   //printf("dump %d/r%d\n",hr,r);
9031                   current.regmap[hr]=-1;
9032                   if(get_reg(current.regmap,r|64)>=0) 
9033                     current.regmap[get_reg(current.regmap,r|64)]=-1;
9034                 }
9035               }
9036             }
9037           }
9038         }
9039       }
9040     }
9041     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
9042     {
9043       uint64_t temp_is32=current.is32;
9044       for(j=i-1;j>=0;j--)
9045       {
9046         if(ba[j]==start+i*4+8) 
9047           temp_is32&=branch_regs[j].is32;
9048       }
9049       for(j=i;j<slen;j++)
9050       {
9051         if(ba[j]==start+i*4+8) 
9052           //temp_is32=1;
9053           temp_is32&=p32[j];
9054       }
9055       if(temp_is32!=current.is32) {
9056         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
9057         for(hr=0;hr<HOST_REGS;hr++)
9058         {
9059           int r=current.regmap[hr];
9060           if(r>0)
9061           {
9062             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
9063               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
9064               {
9065                 //printf("dump %d/r%d\n",hr,r);
9066                 current.regmap[hr]=-1;
9067                 if(get_reg(current.regmap,r|64)>=0) 
9068                   current.regmap[get_reg(current.regmap,r|64)]=-1;
9069               }
9070             }
9071           }
9072         }
9073       }
9074     }
9075     #endif
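    // current.u / current.uu are bitmasks of MIPS registers whose 32-bit
    // value (resp. upper half) is not needed past this point; bit 0 ($zero)
    // is always unneeded.  Branches use the precomputed
    // branch_unneeded_reg[] masks and also keep their delay slot's sources.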
9076     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
9077       if(i+1<slen) {
9078         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9079         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9080         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9081         current.u|=1;
9082         current.uu|=1;
9083       } else {
9084         current.u=1;
9085         current.uu=1;
9086       }
9087     } else {
9088       if(i+1<slen) {
9089         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
9090         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
9091         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
9092         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
9093         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9094         current.u|=1;
9095         current.uu|=1;
9096       } else { SysPrintf("oops, branch at end of block with no delay slot\n");exit(1); }
9097     }
9098     is_ds[i]=ds;
9099     if(ds) {
9100       ds=0; // Skip delay slot, already allocated as part of branch
9101       // ...but we need to alloc it in case something jumps here
9102       if(i+1<slen) {
9103         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
9104         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
9105       }else{
9106         current.u=branch_unneeded_reg[i-1];
9107         current.uu=branch_unneeded_reg_upper[i-1];
9108       }
9109       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
9110       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9111       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9112       current.u|=1;
9113       current.uu|=1;
9114       struct regstat temp;
9115       memcpy(&temp,&current,sizeof(current));
9116       temp.wasdirty=temp.dirty;
9117       temp.was32=temp.is32;
9118       // TODO: Take into account unconditional branches, as below
9119       delayslot_alloc(&temp,i);
9120       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
9121       regs[i].wasdirty=temp.wasdirty;
9122       regs[i].was32=temp.was32;
9123       regs[i].dirty=temp.dirty;
9124       regs[i].is32=temp.is32;
9125       regs[i].isconst=0;
9126       regs[i].wasconst=0;
9127       current.isconst=0;
9128       // Create entry (branch target) regmap
9129       for(hr=0;hr<HOST_REGS;hr++)
9130       {
9131         int r=temp.regmap[hr];
9132         if(r>=0) {
9133           if(r!=regmap_pre[i][hr]) {
9134             regs[i].regmap_entry[hr]=-1;
9135           }
9136           else
9137           {
9138             if(r<64){
9139               if((current.u>>r)&1) {
9140                 regs[i].regmap_entry[hr]=-1;
9141                 regs[i].regmap[hr]=-1;
9142                 //Don't clear regs in the delay slot as the branch might need them
9143                 //current.regmap[hr]=-1;
9144               }else
9145                 regs[i].regmap_entry[hr]=r;
9146             }
9147             else {
9148               if((current.uu>>(r&63))&1) {
9149                 regs[i].regmap_entry[hr]=-1;
9150                 regs[i].regmap[hr]=-1;
9151                 //Don't clear regs in the delay slot as the branch might need them
9152                 //current.regmap[hr]=-1;
9153               }else
9154                 regs[i].regmap_entry[hr]=r;
9155             }
9156           }
9157         } else {
9158           // First instruction expects CCREG to be allocated
9159           if(i==0&&hr==HOST_CCREG) 
9160             regs[i].regmap_entry[hr]=CCREG;
9161           else
9162             regs[i].regmap_entry[hr]=-1;
9163         }
9164       }
9165     }
9166     else { // Not delay slot
9167       switch(itype[i]) {
9168         case UJUMP:
9169           //current.isconst=0; // DEBUG
9170           //current.wasconst=0; // DEBUG
9171           //regs[i].wasconst=0; // DEBUG
9172           clear_const(&current,rt1[i]);
9173           alloc_cc(&current,i);
9174           dirty_reg(&current,CCREG);
9175           if (rt1[i]==31) {
9176             alloc_reg(&current,i,31);
9177             dirty_reg(&current,31);
9178             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
9179             //assert(rt1[i+1]!=rt1[i]);
9180             #ifdef REG_PREFETCH
9181             alloc_reg(&current,i,PTEMP);
9182             #endif
9183             //current.is32|=1LL<<rt1[i];
9184           }
9185           ooo[i]=1;
9186           delayslot_alloc(&current,i+1);
9187           //current.isconst=0; // DEBUG
9188           ds=1;
9189           //printf("i=%d, isconst=%x\n",i,current.isconst);
9190           break;
9191         case RJUMP:
9192           //current.isconst=0;
9193           //current.wasconst=0;
9194           //regs[i].wasconst=0;
9195           clear_const(&current,rs1[i]);
9196           clear_const(&current,rt1[i]);
9197           alloc_cc(&current,i);
9198           dirty_reg(&current,CCREG);
9199           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
9200             alloc_reg(&current,i,rs1[i]);
9201             if (rt1[i]!=0) {
9202               alloc_reg(&current,i,rt1[i]);
9203               dirty_reg(&current,rt1[i]);
9204               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
9205               assert(rt1[i+1]!=rt1[i]);
9206               #ifdef REG_PREFETCH
9207               alloc_reg(&current,i,PTEMP);
9208               #endif
9209             }
9210             #ifdef USE_MINI_HT
9211             if(rs1[i]==31) { // JALR
9212               alloc_reg(&current,i,RHASH);
9213               #ifndef HOST_IMM_ADDR32
9214               alloc_reg(&current,i,RHTBL);
9215               #endif
9216             }
9217             #endif
9218             delayslot_alloc(&current,i+1);
9219           } else {
9220             // The delay slot overwrites our source register,
9221             // allocate a temporary register to hold the old value.
9222             current.isconst=0;
9223             current.wasconst=0;
9224             regs[i].wasconst=0;
9225             delayslot_alloc(&current,i+1);
9226             current.isconst=0;
9227             alloc_reg(&current,i,RTEMP);
9228           }
9229           //current.isconst=0; // DEBUG
9230           ooo[i]=1;
9231           ds=1;
9232           break;
9233         case CJUMP:
9234           //current.isconst=0;
9235           //current.wasconst=0;
9236           //regs[i].wasconst=0;
9237           clear_const(&current,rs1[i]);
9238           clear_const(&current,rs2[i]);
9239           if((opcode[i]&0x3E)==4) // BEQ/BNE
9240           {
9241             alloc_cc(&current,i);
9242             dirty_reg(&current,CCREG);
9243             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9244             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
9245             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9246             {
9247               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9248               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
9249             }
9250             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
9251                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
9252               // The delay slot overwrites one of our conditions.
9253               // Allocate the branch condition registers instead.
9254               current.isconst=0;
9255               current.wasconst=0;
9256               regs[i].wasconst=0;
9257               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9258               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
9259               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9260               {
9261                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9262                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
9263               }
9264             }
9265             else
9266             {
9267               ooo[i]=1;
9268               delayslot_alloc(&current,i+1);
9269             }
9270           }
9271           else
9272           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
9273           {
9274             alloc_cc(&current,i);
9275             dirty_reg(&current,CCREG);
9276             alloc_reg(&current,i,rs1[i]);
9277             if(!(current.is32>>rs1[i]&1))
9278             {
9279               alloc_reg64(&current,i,rs1[i]);
9280             }
9281             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
9282               // The delay slot overwrites one of our conditions.
9283               // Allocate the branch condition registers instead.
9284               current.isconst=0;
9285               current.wasconst=0;
9286               regs[i].wasconst=0;
9287               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9288               if(!((current.is32>>rs1[i])&1))
9289               {
9290                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9291               }
9292             }
9293             else
9294             {
9295               ooo[i]=1;
9296               delayslot_alloc(&current,i+1);
9297             }
9298           }
9299           else
9300           // Don't alloc the delay slot yet because we might not execute it
9301           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
9302           {
9303             current.isconst=0;
9304             current.wasconst=0;
9305             regs[i].wasconst=0;
9306             alloc_cc(&current,i);
9307             dirty_reg(&current,CCREG);
9308             alloc_reg(&current,i,rs1[i]);
9309             alloc_reg(&current,i,rs2[i]);
9310             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
9311             {
9312               alloc_reg64(&current,i,rs1[i]);
9313               alloc_reg64(&current,i,rs2[i]);
9314             }
9315           }
9316           else
9317           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
9318           {
9319             current.isconst=0;
9320             current.wasconst=0;
9321             regs[i].wasconst=0;
9322             alloc_cc(&current,i);
9323             dirty_reg(&current,CCREG);
9324             alloc_reg(&current,i,rs1[i]);
9325             if(!(current.is32>>rs1[i]&1))
9326             {
9327               alloc_reg64(&current,i,rs1[i]);
9328             }
9329           }
9330           ds=1;
9331           //current.isconst=0;
9332           break;
9333         case SJUMP:
9334           //current.isconst=0;
9335           //current.wasconst=0;
9336           //regs[i].wasconst=0;
9337           clear_const(&current,rs1[i]);
9338           clear_const(&current,rt1[i]);
9339           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
9340           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ (also BLTZAL/BGEZAL)
9341           {
9342             alloc_cc(&current,i);
9343             dirty_reg(&current,CCREG);
9344             alloc_reg(&current,i,rs1[i]);
9345             if(!(current.is32>>rs1[i]&1))
9346             {
9347               alloc_reg64(&current,i,rs1[i]);
9348             }
9349             if (rt1[i]==31) { // BLTZAL/BGEZAL
9350               alloc_reg(&current,i,31);
9351               dirty_reg(&current,31);
9352               //#ifdef REG_PREFETCH
9353               //alloc_reg(&current,i,PTEMP);
9354               //#endif
9355               //current.is32|=1LL<<rt1[i];
9356             }
9357             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
9358                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
9359               // Allocate the branch condition registers instead.
9360               current.isconst=0;
9361               current.wasconst=0;
9362               regs[i].wasconst=0;
9363               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
9364               if(!((current.is32>>rs1[i])&1))
9365               {
9366                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
9367               }
9368             }
9369             else
9370             {
9371               ooo[i]=1;
9372               delayslot_alloc(&current,i+1);
9373             }
9374           }
9375           else
9376           // Don't alloc the delay slot yet because we might not execute it
9377           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
9378           {
9379             current.isconst=0;
9380             current.wasconst=0;
9381             regs[i].wasconst=0;
9382             alloc_cc(&current,i);
9383             dirty_reg(&current,CCREG);
9384             alloc_reg(&current,i,rs1[i]);
9385             if(!(current.is32>>rs1[i]&1))
9386             {
9387               alloc_reg64(&current,i,rs1[i]);
9388             }
9389           }
9390           ds=1;
9391           //current.isconst=0;
9392           break;
9393         case FJUMP:
9394           current.isconst=0;
9395           current.wasconst=0;
9396           regs[i].wasconst=0;
9397           if(likely[i]==0) // BC1F/BC1T
9398           {
9399             // TODO: Theoretically we can run out of registers here on x86.
9400             // The delay slot can allocate up to six, and we need to check
9401             // CSREG before executing the delay slot.  Possibly we can drop
9402             // the cycle count and then reload it after checking that the
9403             // FPU is in a usable state, or don't do out-of-order execution.
9404             alloc_cc(&current,i);
9405             dirty_reg(&current,CCREG);
9406             alloc_reg(&current,i,FSREG);
9407             alloc_reg(&current,i,CSREG);
9408             if(itype[i+1]==FCOMP) {
9409               // The delay slot overwrites the branch condition.
9410               // Allocate the branch condition registers instead.
9411               alloc_cc(&current,i);
9412               dirty_reg(&current,CCREG);
9413               alloc_reg(&current,i,CSREG);
9414               alloc_reg(&current,i,FSREG);
9415             }
9416             else {
9417               ooo[i]=1;
9418               delayslot_alloc(&current,i+1);
9419               alloc_reg(&current,i+1,CSREG);
9420             }
9421           }
9422           else
9423           // Don't alloc the delay slot yet because we might not execute it
9424           if(likely[i]) // BC1FL/BC1TL
9425           {
9426             alloc_cc(&current,i);
9427             dirty_reg(&current,CCREG);
9428             alloc_reg(&current,i,CSREG);
9429             alloc_reg(&current,i,FSREG);
9430           }
9431           ds=1;
9432           current.isconst=0;
9433           break;
9434         case IMM16:
9435           imm16_alloc(&current,i);
9436           break;
9437         case LOAD:
9438         case LOADLR:
9439           load_alloc(&current,i);
9440           break;
9441         case STORE:
9442         case STORELR:
9443           store_alloc(&current,i);
9444           break;
9445         case ALU:
9446           alu_alloc(&current,i);
9447           break;
9448         case SHIFT:
9449           shift_alloc(&current,i);
9450           break;
9451         case MULTDIV:
9452           multdiv_alloc(&current,i);
9453           break;
9454         case SHIFTIMM:
9455           shiftimm_alloc(&current,i);
9456           break;
9457         case MOV:
9458           mov_alloc(&current,i);
9459           break;
9460         case COP0:
9461           cop0_alloc(&current,i);
9462           break;
9463         case COP1:
9464         case COP2:
9465           cop1_alloc(&current,i);
9466           break;
9467         case C1LS:
9468           c1ls_alloc(&current,i);
9469           break;
9470         case C2LS:
9471           c2ls_alloc(&current,i);
9472           break;
9473         case C2OP:
9474           c2op_alloc(&current,i);
9475           break;
9476         case FCONV:
9477           fconv_alloc(&current,i);
9478           break;
9479         case FLOAT:
9480           float_alloc(&current,i);
9481           break;
9482         case FCOMP:
9483           fcomp_alloc(&current,i);
9484           break;
9485         case SYSCALL:
9486         case HLECALL:
9487         case INTCALL:
9488           syscall_alloc(&current,i);
9489           break;
9490         case SPAN:
9491           pagespan_alloc(&current,i);
9492           break;
9493       }
9494       
9495       // Drop the upper half of registers that have become 32-bit
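      // (Rough reading: is32 marks MIPS regs whose value currently fits in 32
      // bits and uu marks regs whose upper 32 bits are unneeded, so flagging a
      // fresh 32-bit result in uu lets its upper-half host register be dropped.)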
9496       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
9497       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
9498         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9499         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9500         current.uu|=1;
9501       } else {
9502         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
9503         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
9504         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
9505         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9506         current.uu|=1;
9507       }
9508
9509       // Create entry (branch target) regmap
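      // regmap_entry is the mapping a jump into this instruction is expected to
      // provide; entries left at -1 impose no requirement on the incoming code.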
9510       for(hr=0;hr<HOST_REGS;hr++)
9511       {
9512         int r,or,er;
9513         r=current.regmap[hr];
9514         if(r>=0) {
9515           if(r!=regmap_pre[i][hr]) {
9516             // TODO: delay slot (?)
9517             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
9518             if(or<0||(r&63)>=TEMPREG){
9519               regs[i].regmap_entry[hr]=-1;
9520             }
9521             else
9522             {
9523               // Just move it to a different register
9524               regs[i].regmap_entry[hr]=r;
9525               // If it was dirty before, it's still dirty
9526               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
9527             }
9528           }
9529           else
9530           {
9531             // Unneeded
9532             if(r==0){
9533               regs[i].regmap_entry[hr]=0;
9534             }
9535             else
9536             if(r<64){
9537               if((current.u>>r)&1) {
9538                 regs[i].regmap_entry[hr]=-1;
9539                 //regs[i].regmap[hr]=-1;
9540                 current.regmap[hr]=-1;
9541               }else
9542                 regs[i].regmap_entry[hr]=r;
9543             }
9544             else {
9545               if((current.uu>>(r&63))&1) {
9546                 regs[i].regmap_entry[hr]=-1;
9547                 //regs[i].regmap[hr]=-1;
9548                 current.regmap[hr]=-1;
9549               }else
9550                 regs[i].regmap_entry[hr]=r;
9551             }
9552           }
9553         } else {
9554           // Branches expect CCREG to be allocated at the target
9555           if(regmap_pre[i][hr]==CCREG) 
9556             regs[i].regmap_entry[hr]=CCREG;
9557           else
9558             regs[i].regmap_entry[hr]=-1;
9559         }
9560       }
9561       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
9562     }
9563
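    // waswritten tracks MIPS regs recently used as the base of a store with a
    // small (<0x800) offset; the bit is cleared once the register is
    // overwritten or reused as a store base with a large offset.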
9564     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
9565       current.waswritten|=1<<rs1[i-1];
9566     current.waswritten&=~(1<<rt1[i]);
9567     current.waswritten&=~(1<<rt2[i]);
9568     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
9569       current.waswritten&=~(1<<rs1[i]);
9570
9571     /* Branch post-alloc */
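    // (Sketch of intent: branch_regs[i-1] holds the register state used on the
    // taken path of the branch at i-1; it can only be filled in now that the
    // delay slot at i has been allocated above.)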
9572     if(i>0)
9573     {
9574       current.was32=current.is32;
9575       current.wasdirty=current.dirty;
9576       switch(itype[i-1]) {
9577         case UJUMP:
9578           memcpy(&branch_regs[i-1],&current,sizeof(current));
9579           branch_regs[i-1].isconst=0;
9580           branch_regs[i-1].wasconst=0;
9581           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9582           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9583           alloc_cc(&branch_regs[i-1],i-1);
9584           dirty_reg(&branch_regs[i-1],CCREG);
9585           if(rt1[i-1]==31) { // JAL
9586             alloc_reg(&branch_regs[i-1],i-1,31);
9587             dirty_reg(&branch_regs[i-1],31);
9588             branch_regs[i-1].is32|=1LL<<31;
9589           }
9590           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9591           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
9592           break;
9593         case RJUMP:
9594           memcpy(&branch_regs[i-1],&current,sizeof(current));
9595           branch_regs[i-1].isconst=0;
9596           branch_regs[i-1].wasconst=0;
9597           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9598           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9599           alloc_cc(&branch_regs[i-1],i-1);
9600           dirty_reg(&branch_regs[i-1],CCREG);
9601           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
9602           if(rt1[i-1]!=0) { // JALR
9603             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
9604             dirty_reg(&branch_regs[i-1],rt1[i-1]);
9605             branch_regs[i-1].is32|=1LL<<rt1[i-1];
9606           }
9607           #ifdef USE_MINI_HT
9608           if(rs1[i-1]==31) { // JALR
9609             alloc_reg(&branch_regs[i-1],i-1,RHASH);
9610             #ifndef HOST_IMM_ADDR32
9611             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
9612             #endif
9613           }
9614           #endif
9615           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9616           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
9617           break;
9618         case CJUMP:
9619           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
9620           {
9621             alloc_cc(&current,i-1);
9622             dirty_reg(&current,CCREG);
9623             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
9624                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
9625               // The delay slot overwrote one of our conditions
9626               // Delay slot goes after the test (in order)
9627               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9628               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9629               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9630               current.u|=1;
9631               current.uu|=1;
9632               delayslot_alloc(&current,i);
9633               current.isconst=0;
9634             }
9635             else
9636             {
9637               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9638               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9639               // Alloc the branch condition registers
9640               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
9641               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
9642               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
9643               {
9644                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
9645                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
9646               }
9647             }
9648             memcpy(&branch_regs[i-1],&current,sizeof(current));
9649             branch_regs[i-1].isconst=0;
9650             branch_regs[i-1].wasconst=0;
9651             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9652             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
9653           }
9654           else
9655           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
9656           {
9657             alloc_cc(&current,i-1);
9658             dirty_reg(&current,CCREG);
9659             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9660               // The delay slot overwrote the branch condition
9661               // Delay slot goes after the test (in order)
9662               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9663               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9664               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9665               current.u|=1;
9666               current.uu|=1;
9667               delayslot_alloc(&current,i);
9668               current.isconst=0;
9669             }
9670             else
9671             {
9672               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9673               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9674               // Alloc the branch condition register
9675               alloc_reg(&current,i-1,rs1[i-1]);
9676               if(!(current.is32>>rs1[i-1]&1))
9677               {
9678                 alloc_reg64(&current,i-1,rs1[i-1]);
9679               }
9680             }
9681             memcpy(&branch_regs[i-1],&current,sizeof(current));
9682             branch_regs[i-1].isconst=0;
9683             branch_regs[i-1].wasconst=0;
9684             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9685             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
9686           }
9687           else
9688           // Alloc the delay slot in case the branch is taken
9689           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
9690           {
9691             memcpy(&branch_regs[i-1],&current,sizeof(current));
9692             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9693             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9694             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9695             alloc_cc(&branch_regs[i-1],i);
9696             dirty_reg(&branch_regs[i-1],CCREG);
9697             delayslot_alloc(&branch_regs[i-1],i);
9698             branch_regs[i-1].isconst=0;
9699             alloc_reg(&current,i,CCREG); // Not taken path
9700             dirty_reg(&current,CCREG);
9701             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9702           }
9703           else
9704           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
9705           {
9706             memcpy(&branch_regs[i-1],&current,sizeof(current));
9707             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9708             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9709             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9710             alloc_cc(&branch_regs[i-1],i);
9711             dirty_reg(&branch_regs[i-1],CCREG);
9712             delayslot_alloc(&branch_regs[i-1],i);
9713             branch_regs[i-1].isconst=0;
9714             alloc_reg(&current,i,CCREG); // Not taken path
9715             dirty_reg(&current,CCREG);
9716             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9717           }
9718           break;
9719         case SJUMP:
9720           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
9721           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
9722           {
9723             alloc_cc(&current,i-1);
9724             dirty_reg(&current,CCREG);
9725             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9726               // The delay slot overwrote the branch condition
9727               // Delay slot goes after the test (in order)
9728               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9729               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9730               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9731               current.u|=1;
9732               current.uu|=1;
9733               delayslot_alloc(&current,i);
9734               current.isconst=0;
9735             }
9736             else
9737             {
9738               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9739               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9740               // Alloc the branch condition register
9741               alloc_reg(&current,i-1,rs1[i-1]);
9742               if(!(current.is32>>rs1[i-1]&1))
9743               {
9744                 alloc_reg64(&current,i-1,rs1[i-1]);
9745               }
9746             }
9747             memcpy(&branch_regs[i-1],&current,sizeof(current));
9748             branch_regs[i-1].isconst=0;
9749             branch_regs[i-1].wasconst=0;
9750             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9751             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
9752           }
9753           else
9754           // Alloc the delay slot in case the branch is taken
9755           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
9756           {
9757             memcpy(&branch_regs[i-1],&current,sizeof(current));
9758             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9759             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9760             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9761             alloc_cc(&branch_regs[i-1],i);
9762             dirty_reg(&branch_regs[i-1],CCREG);
9763             delayslot_alloc(&branch_regs[i-1],i);
9764             branch_regs[i-1].isconst=0;
9765             alloc_reg(&current,i,CCREG); // Not taken path
9766             dirty_reg(&current,CCREG);
9767             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9768           }
9769           // FIXME: BLTZAL/BGEZAL
9770           if(opcode2[i-1]&0x10) { // BxxZAL
9771             alloc_reg(&branch_regs[i-1],i-1,31);
9772             dirty_reg(&branch_regs[i-1],31);
9773             branch_regs[i-1].is32|=1LL<<31;
9774           }
9775           break;
9776         case FJUMP:
9777           if(likely[i-1]==0) // BC1F/BC1T
9778           {
9779             alloc_cc(&current,i-1);
9780             dirty_reg(&current,CCREG);
9781             if(itype[i]==FCOMP) {
9782               // The delay slot overwrote the branch condition
9783               // Delay slot goes after the test (in order)
9784               delayslot_alloc(&current,i);
9785               current.isconst=0;
9786             }
9787             else
9788             {
9789               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9790               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9791               // Alloc the branch condition register
9792               alloc_reg(&current,i-1,FSREG);
9793             }
9794             memcpy(&branch_regs[i-1],&current,sizeof(current));
9795             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9796           }
9797           else // BC1FL/BC1TL
9798           {
9799             // Alloc the delay slot in case the branch is taken
9800             memcpy(&branch_regs[i-1],&current,sizeof(current));
9801             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9802             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9803             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9804             alloc_cc(&branch_regs[i-1],i);
9805             dirty_reg(&branch_regs[i-1],CCREG);
9806             delayslot_alloc(&branch_regs[i-1],i);
9807             branch_regs[i-1].isconst=0;
9808             alloc_reg(&current,i,CCREG); // Not taken path
9809             dirty_reg(&current,CCREG);
9810             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9811           }
9812           break;
9813       }
9814
9815       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
9816       {
9817         if(rt1[i-1]==31) // JAL/JALR
9818         {
9819           // Subroutine call will return here, don't alloc any registers
9820           current.is32=1;
9821           current.dirty=0;
9822           clear_all_regs(current.regmap);
9823           alloc_reg(&current,i,CCREG);
9824           dirty_reg(&current,CCREG);
9825         }
9826         else if(i+1<slen)
9827         {
9828           // Internal branch will jump here, match registers to caller
9829           current.is32=0x3FFFFFFFFLL;
9830           current.dirty=0;
9831           clear_all_regs(current.regmap);
9832           alloc_reg(&current,i,CCREG);
9833           dirty_reg(&current,CCREG);
9834           for(j=i-1;j>=0;j--)
9835           {
9836             if(ba[j]==start+i*4+4) {
9837               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
9838               current.is32=branch_regs[j].is32;
9839               current.dirty=branch_regs[j].dirty;
9840               break;
9841             }
9842           }
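          // Several branches may target this point; intersect with all of them
          // so only registers mapped identically by every predecessor survive.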
9843           while(j>=0) {
9844             if(ba[j]==start+i*4+4) {
9845               for(hr=0;hr<HOST_REGS;hr++) {
9846                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
9847                   current.regmap[hr]=-1;
9848                 }
9849                 current.is32&=branch_regs[j].is32;
9850                 current.dirty&=branch_regs[j].dirty;
9851               }
9852             }
9853             j--;
9854           }
9855         }
9856       }
9857     }
9858
9859     // Count cycles in between branches
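    // ccadj[i] is the cycle count accumulated since the last branch; it resets
    // after branches and at syscalls, with rough extra charges below (PCSX
    // builds) for GTE operations and coprocessor loads/stores.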
9860     ccadj[i]=cc;
9861     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
9862     {
9863       cc=0;
9864     }
9865 #if defined(PCSX) && !defined(DRC_DBG)
9866     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
9867     {
9868       // GTE runs in parallel until accessed, divide by 2 for a rough guess
9869       cc+=gte_cycletab[source[i]&0x3f]/2;
9870     }
9871     else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // load,store cause weird timing issues
9872     {
9873       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
9874     }
9875     else if(itype[i]==C2LS)
9876     {
9877       cc+=4;
9878     }
9879 #endif
9880     else
9881     {
9882       cc++;
9883     }
9884
9885     flush_dirty_uppers(&current);
9886     if(!is_ds[i]) {
9887       regs[i].is32=current.is32;
9888       regs[i].dirty=current.dirty;
9889       regs[i].isconst=current.isconst;
9890       memcpy(constmap[i],current_constmap,sizeof(current_constmap));
9891     }
9892     for(hr=0;hr<HOST_REGS;hr++) {
9893       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9894         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9895           regs[i].wasconst&=~(1<<hr);
9896         }
9897       }
9898     }
9899     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9900     regs[i].waswritten=current.waswritten;
9901   }
9902   
9903   /* Pass 4 - Cull unused host registers */
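  // Backwards scan over the block: nr is a bitmap of host registers still
  // needed at instruction i; registers not marked in nr get deallocated in the
  // loop at the bottom of this pass.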
9904   
9905   uint64_t nr=0;
9906   
9907   for (i=slen-1;i>=0;i--)
9908   {
9909     int hr;
9910     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9911     {
9912       if(ba[i]<start || ba[i]>=(start+slen*4))
9913       {
9914         // Branch out of this block, don't need anything
9915         nr=0;
9916       }
9917       else
9918       {
9919         // Internal branch
9920         // Need whatever matches the target
9921         nr=0;
9922         int t=(ba[i]-start)>>2;
9923         for(hr=0;hr<HOST_REGS;hr++)
9924         {
9925           if(regs[i].regmap_entry[hr]>=0) {
9926             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9927           }
9928         }
9929       }
9930       // Conditional branch may need registers for following instructions
9931       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9932       {
9933         if(i<slen-2) {
9934           nr|=needed_reg[i+2];
9935           for(hr=0;hr<HOST_REGS;hr++)
9936           {
9937             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9938             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9939           }
9940         }
9941       }
9942       // Don't need stuff which is overwritten
9943       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9944       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9945       // Merge in delay slot
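      // The delay slot still runs before the branch takes effect, so any host
      // register holding one of its source operands must remain needed here.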
9946       for(hr=0;hr<HOST_REGS;hr++)
9947       {
9948         if(!likely[i]) {
9949           // These are overwritten unless the branch is "likely"
9950           // and the delay slot is nullified if not taken
9951           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9952           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9953         }
9954         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9955         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9956         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9957         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9958         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9959         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9960         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9961         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9962         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
9963           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9964           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9965         }
9966         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
9967           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9968           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9969         }
9970         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
9971           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9972           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9973         }
9974       }
9975     }
9976     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
9977     {
9978       // SYSCALL instruction (software interrupt)
9979       nr=0;
9980     }
9981     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9982     {
9983       // ERET instruction (return from interrupt)
9984       nr=0;
9985     }
9986     else // Non-branch
9987     {
9988       if(i<slen-1) {
9989         for(hr=0;hr<HOST_REGS;hr++) {
9990           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9991           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9992           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9993           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9994         }
9995       }
9996     }
9997     for(hr=0;hr<HOST_REGS;hr++)
9998     {
9999       // Overwritten registers are not needed
10000       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
10001       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
10002       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
10003       // Source registers are needed
10004       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10005       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10006       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
10007       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
10008       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10009       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10010       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
10011       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
10012       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
10013         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10014         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10015       }
10016       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
10017         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10018         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10019       }
10020       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
10021         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
10022         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
10023       }
10024       // Don't store a register immediately after writing it,
10025       // as doing so may prevent dual-issue.
10026       // But do so if this is a branch target, otherwise we
10027       // might have to load the register before the branch.
10028       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
10029         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
10030            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
10031           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10032           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
10033         }
10034         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
10035            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
10036           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10037           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
10038         }
10039       }
10040     }
10041     // Cycle count is needed at branches.  Assume it is needed at the target too.
10042     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
10043       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
10044       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
10045     }
10046     // Save it
10047     needed_reg[i]=nr;
10048     
10049     // Deallocate unneeded registers
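    // (Hedged note: the long exclusion lists below appear to keep registers
    // that the branch, its delay slot or the address-generation temporaries may
    // still reference even though the liveness bitmap no longer needs them.)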
10050     for(hr=0;hr<HOST_REGS;hr++)
10051     {
10052       if(!((nr>>hr)&1)) {
10053         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
10054         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
10055            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
10056            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
10057         {
10058           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10059           {
10060             if(likely[i]) {
10061               regs[i].regmap[hr]=-1;
10062               regs[i].isconst&=~(1<<hr);
10063               if(i<slen-2) {
10064                 regmap_pre[i+2][hr]=-1;
10065                 regs[i+2].wasconst&=~(1<<hr);
10066               }
10067             }
10068           }
10069         }
10070         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10071         {
10072           int d1=0,d2=0,map=0,temp=0;
10073           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
10074           {
10075             d1=dep1[i+1];
10076             d2=dep2[i+1];
10077           }
10078           if(using_tlb) {
10079             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
10080                itype[i+1]==STORE || itype[i+1]==STORELR ||
10081                itype[i+1]==C1LS || itype[i+1]==C2LS)
10082             map=TLREG;
10083           } else
10084           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
10085              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
10086             map=INVCP;
10087           }
10088           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
10089              itype[i+1]==C1LS || itype[i+1]==C2LS)
10090             temp=FTEMP;
10091           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
10092              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
10093              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
10094              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
10095              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
10096              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
10097              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
10098              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
10099              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
10100              regs[i].regmap[hr]!=map )
10101           {
10102             regs[i].regmap[hr]=-1;
10103             regs[i].isconst&=~(1<<hr);
10104             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
10105                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
10106                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
10107                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
10108                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
10109                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
10110                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
10111                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
10112                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
10113                branch_regs[i].regmap[hr]!=map)
10114             {
10115               branch_regs[i].regmap[hr]=-1;
10116               branch_regs[i].regmap_entry[hr]=-1;
10117               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10118               {
10119                 if(!likely[i]&&i<slen-2) {
10120                   regmap_pre[i+2][hr]=-1;
10121                   regs[i+2].wasconst&=~(1<<hr);
10122                 }
10123               }
10124             }
10125           }
10126         }
10127         else
10128         {
10129           // Non-branch
10130           if(i>0)
10131           {
10132             int d1=0,d2=0,map=-1,temp=-1;
10133             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
10134             {
10135               d1=dep1[i];
10136               d2=dep2[i];
10137             }
10138             if(using_tlb) {
10139               if(itype[i]==LOAD || itype[i]==LOADLR ||
10140                  itype[i]==STORE || itype[i]==STORELR ||
10141                  itype[i]==C1LS || itype[i]==C2LS)
10142               map=TLREG;
10143             } else if(itype[i]==STORE || itype[i]==STORELR ||
10144                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
10145               map=INVCP;
10146             }
10147             if(itype[i]==LOADLR || itype[i]==STORELR ||
10148                itype[i]==C1LS || itype[i]==C2LS)
10149               temp=FTEMP;
10150             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
10151                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
10152                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
10153                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
10154                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
10155                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
10156             {
10157               if(i<slen-1&&!is_ds[i]) {
10158                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
10159                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
10160                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
10161                 {
10162                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
10163                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
10164                 }
10165                 regmap_pre[i+1][hr]=-1;
10166                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
10167                 regs[i+1].wasconst&=~(1<<hr);
10168               }
10169               regs[i].regmap[hr]=-1;
10170               regs[i].isconst&=~(1<<hr);
10171             }
10172           }
10173         }
10174       }
10175     }
10176   }
10177   
10178   /* Pass 5 - Pre-allocate registers */
10179   
10180   // If a register is allocated during a loop, try to allocate it for the
10181   // entire loop, if possible.  This avoids loading/storing registers
10182   // inside of the loop.
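  // f_regmap holds the candidate whole-loop mapping; a backwards branch
  // (ba[i] between start and the branch itself) marks a loop, and the code
  // below tries to extend the allocation from the loop head down to the branch
  // so the value never has to be reloaded inside the loop body.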
10183   
10184   signed char f_regmap[HOST_REGS];
10185   clear_all_regs(f_regmap);
10186   for(i=0;i<slen-1;i++)
10187   {
10188     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10189     {
10190       if(ba[i]>=start && ba[i]<(start+i*4)) 
10191       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
10192       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
10193       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
10194       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
10195       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
10196       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
10197       {
10198         int t=(ba[i]-start)>>2;
10199         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
10200         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
10201         for(hr=0;hr<HOST_REGS;hr++)
10202         {
10203           if(regs[i].regmap[hr]>64) {
10204             if(!((regs[i].dirty>>hr)&1))
10205               f_regmap[hr]=regs[i].regmap[hr];
10206             else f_regmap[hr]=-1;
10207           }
10208           else if(regs[i].regmap[hr]>=0) {
10209             if(f_regmap[hr]!=regs[i].regmap[hr]) {
10210               // dealloc old register
10211               int n;
10212               for(n=0;n<HOST_REGS;n++)
10213               {
10214                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
10215               }
10216               // and alloc new one
10217               f_regmap[hr]=regs[i].regmap[hr];
10218             }
10219           }
10220           if(branch_regs[i].regmap[hr]>64) {
10221             if(!((branch_regs[i].dirty>>hr)&1))
10222               f_regmap[hr]=branch_regs[i].regmap[hr];
10223             else f_regmap[hr]=-1;
10224           }
10225           else if(branch_regs[i].regmap[hr]>=0) {
10226             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
10227               // dealloc old register
10228               int n;
10229               for(n=0;n<HOST_REGS;n++)
10230               {
10231                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
10232               }
10233               // and alloc new one
10234               f_regmap[hr]=branch_regs[i].regmap[hr];
10235             }
10236           }
10237           if(ooo[i]) {
10238             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) 
10239               f_regmap[hr]=branch_regs[i].regmap[hr];
10240           }else{
10241             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) 
10242               f_regmap[hr]=branch_regs[i].regmap[hr];
10243           }
10244           // Avoid dirty->clean transition
10245           #ifdef DESTRUCTIVE_WRITEBACK
10246           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
10247           #endif
10248           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
10249           // case above; however, it is always a good idea.  We can't hoist the
10250           // load if the register was already allocated, so there's no point
10251           // wasting time analyzing most of these cases.  It only "succeeds"
10252           // when the mapping was different and the load can be replaced with
10253           // a mov, which is of negligible benefit.  So such cases are
10254           // skipped below.
10255           if(f_regmap[hr]>0) {
10256             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
10257               int r=f_regmap[hr];
10258               for(j=t;j<=i;j++)
10259               {
10260                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
10261                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
10262                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
10263                 if(r>63) {
10264                   // NB This can exclude the case where the upper-half
10265                   // register is lower numbered than the lower-half
10266                   // register.  Not sure if it's worth fixing...
10267                   if(get_reg(regs[j].regmap,r&63)<0) break;
10268                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
10269                   if(regs[j].is32&(1LL<<(r&63))) break;
10270                 }
10271                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
10272                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
10273                   int k;
10274                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
10275                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
10276                     if(r>63) {
10277                       if(get_reg(regs[i].regmap,r&63)<0) break;
10278                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
10279                     }
10280                     k=i;
10281                     while(k>1&&regs[k-1].regmap[hr]==-1) {
10282                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
10283                         //printf("no free regs for store %x\n",start+(k-1)*4);
10284                         break;
10285                       }
10286                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
10287                         //printf("no-match due to different register\n");
10288                         break;
10289                       }
10290                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
10291                         //printf("no-match due to branch\n");
10292                         break;
10293                       }
10294                       // call/ret fast path assumes no registers allocated
10295                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
10296                         break;
10297                       }
10298                       if(r>63) {
10299                         // NB This can exclude the case where the upper-half
10300                         // register is lower numbered than the lower-half
10301                         // register.  Not sure if it's worth fixing...
10302                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
10303                         if(regs[k-1].is32&(1LL<<(r&63))) break;
10304                       }
10305                       k--;
10306                     }
10307                     if(i<slen-1) {
10308                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
10309                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
10310                         //printf("bad match after branch\n");
10311                         break;
10312                       }
10313                     }
10314                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
10315                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
10316                       while(k<i) {
10317                         regs[k].regmap_entry[hr]=f_regmap[hr];
10318                         regs[k].regmap[hr]=f_regmap[hr];
10319                         regmap_pre[k+1][hr]=f_regmap[hr];
10320                         regs[k].wasdirty&=~(1<<hr);
10321                         regs[k].dirty&=~(1<<hr);
10322                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
10323                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
10324                         regs[k].wasconst&=~(1<<hr);
10325                         regs[k].isconst&=~(1<<hr);
10326                         k++;
10327                       }
10328                     }
10329                     else {
10330                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
10331                       break;
10332                     }
10333                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
10334                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
10335                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
10336                       regs[i].regmap_entry[hr]=f_regmap[hr];
10337                       regs[i].regmap[hr]=f_regmap[hr];
10338                       regs[i].wasdirty&=~(1<<hr);
10339                       regs[i].dirty&=~(1<<hr);
10340                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
10341                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
10342                       regs[i].wasconst&=~(1<<hr);
10343                       regs[i].isconst&=~(1<<hr);
10344                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
10345                       branch_regs[i].wasdirty&=~(1<<hr);
10346                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
10347                       branch_regs[i].regmap[hr]=f_regmap[hr];
10348                       branch_regs[i].dirty&=~(1<<hr);
10349                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
10350                       branch_regs[i].wasconst&=~(1<<hr);
10351                       branch_regs[i].isconst&=~(1<<hr);
10352                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
10353                         regmap_pre[i+2][hr]=f_regmap[hr];
10354                         regs[i+2].wasdirty&=~(1<<hr);
10355                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
10356                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
10357                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
10358                       }
10359                     }
10360                   }
10361                   for(k=t;k<j;k++) {
10362                     // Alloc register clean at beginning of loop,
10363                     // but may dirty it in pass 6
10364                     regs[k].regmap_entry[hr]=f_regmap[hr];
10365                     regs[k].regmap[hr]=f_regmap[hr];
10366                     regs[k].dirty&=~(1<<hr);
10367                     regs[k].wasconst&=~(1<<hr);
10368                     regs[k].isconst&=~(1<<hr);
10369                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
10370                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
10371                       branch_regs[k].regmap[hr]=f_regmap[hr];
10372                       branch_regs[k].dirty&=~(1<<hr);
10373                       branch_regs[k].wasconst&=~(1<<hr);
10374                       branch_regs[k].isconst&=~(1<<hr);
10375                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
10376                         regmap_pre[k+2][hr]=f_regmap[hr];
10377                         regs[k+2].wasdirty&=~(1<<hr);
10378                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
10379                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
10380                       }
10381                     }
10382                     else
10383                     {
10384                       regmap_pre[k+1][hr]=f_regmap[hr];
10385                       regs[k+1].wasdirty&=~(1<<hr);
10386                     }
10387                   }
10388                   if(regs[j].regmap[hr]==f_regmap[hr])
10389                     regs[j].regmap_entry[hr]=f_regmap[hr];
10390                   break;
10391                 }
10392                 if(j==i) break;
10393                 if(regs[j].regmap[hr]>=0)
10394                   break;
10395                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
10396                   //printf("no-match due to different register\n");
10397                   break;
10398                 }
10399                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
10400                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
10401                   break;
10402                 }
10403                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10404                 {
10405                   // Stop on unconditional branch
10406                   break;
10407                 }
10408                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
10409                 {
10410                   if(ooo[j]) {
10411                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) 
10412                       break;
10413                   }else{
10414                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) 
10415                       break;
10416                   }
10417                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
10418                     //printf("no-match due to different register (branch)\n");
10419                     break;
10420                   }
10421                 }
10422                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10423                   //printf("No free regs for store %x\n",start+j*4);
10424                   break;
10425                 }
10426                 if(f_regmap[hr]>=64) {
10427                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
10428                     break;
10429                   }
10430                   else
10431                   {
10432                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
10433                       break;
10434                     }
10435                   }
10436                 }
10437               }
10438             }
10439           }
10440         }
10441       }
10442     }else{
10443       // Non branch or undetermined branch target
10444       for(hr=0;hr<HOST_REGS;hr++)
10445       {
10446         if(hr!=EXCLUDE_REG) {
10447           if(regs[i].regmap[hr]>64) {
10448             if(!((regs[i].dirty>>hr)&1))
10449               f_regmap[hr]=regs[i].regmap[hr];
10450           }
10451           else if(regs[i].regmap[hr]>=0) {
10452             if(f_regmap[hr]!=regs[i].regmap[hr]) {
10453               // dealloc old register
10454               int n;
10455               for(n=0;n<HOST_REGS;n++)
10456               {
10457                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
10458               }
10459               // and alloc new one
10460               f_regmap[hr]=regs[i].regmap[hr];
10461             }
10462           }
10463         }
10464       }
10465       // Try to restore cycle count at branch targets
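      // Scan forward to the next point where CCREG already lives in a host
      // register; if it does (and enough registers stay free on the way), the
      // mapping is extended back to this branch target so the cycle count need
      // not be reloaded here.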
10466       if(bt[i]) {
10467         for(j=i;j<slen-1;j++) {
10468           if(regs[j].regmap[HOST_CCREG]!=-1) break;
10469           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
10470             //printf("no free regs for store %x\n",start+j*4);
10471             break;
10472           }
10473         }
10474         if(regs[j].regmap[HOST_CCREG]==CCREG) {
10475           int k=i;
10476           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
10477           while(k<j) {
10478             regs[k].regmap_entry[HOST_CCREG]=CCREG;
10479             regs[k].regmap[HOST_CCREG]=CCREG;
10480             regmap_pre[k+1][HOST_CCREG]=CCREG;
10481             regs[k+1].wasdirty|=1<<HOST_CCREG;
10482             regs[k].dirty|=1<<HOST_CCREG;
10483             regs[k].wasconst&=~(1<<HOST_CCREG);
10484             regs[k].isconst&=~(1<<HOST_CCREG);
10485             k++;
10486           }
10487           regs[j].regmap_entry[HOST_CCREG]=CCREG;          
10488         }
10489         // Work backwards from the branch target
10490         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
10491         {
10492           //printf("Extend backwards\n");
10493           int k;
10494           k=i;
10495           while(regs[k-1].regmap[HOST_CCREG]==-1) {
10496             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
10497               //printf("no free regs for store %x\n",start+(k-1)*4);
10498               break;
10499             }
10500             k--;
10501           }
10502           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
10503             //printf("Extend CC, %x ->\n",start+k*4);
10504             while(k<=i) {
10505               regs[k].regmap_entry[HOST_CCREG]=CCREG;
10506               regs[k].regmap[HOST_CCREG]=CCREG;
10507               regmap_pre[k+1][HOST_CCREG]=CCREG;
10508               regs[k+1].wasdirty|=1<<HOST_CCREG;
10509               regs[k].dirty|=1<<HOST_CCREG;
10510               regs[k].wasconst&=~(1<<HOST_CCREG);
10511               regs[k].isconst&=~(1<<HOST_CCREG);
10512               k++;
10513             }
10514           }
10515           else {
10516             //printf("Fail Extend CC, %x ->\n",start+k*4);
10517           }
10518         }
10519       }
10520       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
10521          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
10522          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
10523          itype[i]!=FCONV&&itype[i]!=FCOMP)
10524       {
10525         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
10526       }
10527     }
10528   }
10529   
10530   // Cache memory offset or tlb map pointer if a register is available
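  // (Rough description: for each host register, scan forward counting memory
  // accesses that could use a cached MMREG/ROREG pointer, scoring extra for
  // loop-invariant placement; if the best register scores more than one point
  // it is allocated to that pointer over the scored range below.)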
10531   #ifndef HOST_IMM_ADDR32
10532   #ifndef RAM_OFFSET
10533   if(using_tlb)
10534   #endif
10535   {
10536     int earliest_available[HOST_REGS];
10537     int loop_start[HOST_REGS];
10538     int score[HOST_REGS];
10539     int end[HOST_REGS];
10540     int reg=using_tlb?MMREG:ROREG;
10541
10542     // Init
10543     for(hr=0;hr<HOST_REGS;hr++) {
10544       score[hr]=0;earliest_available[hr]=0;
10545       loop_start[hr]=MAXBLOCK;
10546     }
10547     for(i=0;i<slen-1;i++)
10548     {
10549       // Can't do anything if no registers are available
10550       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
10551         for(hr=0;hr<HOST_REGS;hr++) {
10552           score[hr]=0;earliest_available[hr]=i+1;
10553           loop_start[hr]=MAXBLOCK;
10554         }
10555       }
10556       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10557         if(!ooo[i]) {
10558           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
10559             for(hr=0;hr<HOST_REGS;hr++) {
10560               score[hr]=0;earliest_available[hr]=i+1;
10561               loop_start[hr]=MAXBLOCK;
10562             }
10563           }
10564         }else{
10565           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
10566             for(hr=0;hr<HOST_REGS;hr++) {
10567               score[hr]=0;earliest_available[hr]=i+1;
10568               loop_start[hr]=MAXBLOCK;
10569             }
10570           }
10571         }
10572       }
10573       // Mark unavailable registers
10574       for(hr=0;hr<HOST_REGS;hr++) {
10575         if(regs[i].regmap[hr]>=0) {
10576           score[hr]=0;earliest_available[hr]=i+1;
10577           loop_start[hr]=MAXBLOCK;
10578         }
10579         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10580           if(branch_regs[i].regmap[hr]>=0) {
10581             score[hr]=0;earliest_available[hr]=i+2;
10582             loop_start[hr]=MAXBLOCK;
10583           }
10584         }
10585       }
10586       // No register allocations after unconditional jumps
10587       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10588       {
10589         for(hr=0;hr<HOST_REGS;hr++) {
10590           score[hr]=0;earliest_available[hr]=i+2;
10591           loop_start[hr]=MAXBLOCK;
10592         }
10593         i++; // Skip delay slot too
10594         //printf("skip delay slot: %x\n",start+i*4);
10595       }
10596       else
10597       // Possible match
10598       if(itype[i]==LOAD||itype[i]==LOADLR||
10599          itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS) {
10600         for(hr=0;hr<HOST_REGS;hr++) {
10601           if(hr!=EXCLUDE_REG) {
10602             end[hr]=i-1;
10603             for(j=i;j<slen-1;j++) {
10604               if(regs[j].regmap[hr]>=0) break;
10605               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10606                 if(branch_regs[j].regmap[hr]>=0) break;
10607                 if(ooo[j]) {
10608                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
10609                 }else{
10610                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
10611                 }
10612               }
10613               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
10614               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10615                 int t=(ba[j]-start)>>2;
10616                 if(t<j&&t>=earliest_available[hr]) {
10617                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) { // call/ret assumes no registers allocated
10618                     // Score a point for hoisting loop invariant
10619                     if(t<loop_start[hr]) loop_start[hr]=t;
10620                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*4,start+j*4,start+t*4);
10621                     score[hr]++;
10622                     end[hr]=j;
10623                   }
10624                 }
10625                 else if(t<j) {
10626                   if(regs[t].regmap[hr]==reg) {
10627                     // Score a point if the branch target matches this register
10628                     score[hr]++;
10629                     end[hr]=j;
10630                   }
10631                 }
10632                 if(itype[j+1]==LOAD||itype[j+1]==LOADLR||
10633                    itype[j+1]==STORE||itype[j+1]==STORELR||itype[j+1]==C1LS) {
10634                   score[hr]++;
10635                   end[hr]=j;
10636                 }
10637               }
10638               if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
10639               {
10640                 // Stop on unconditional branch
10641                 break;
10642               }
10643               else
10644               if(itype[j]==LOAD||itype[j]==LOADLR||
10645                  itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS) {
10646                 score[hr]++;
10647                 end[hr]=j;
10648               }
10649             }
10650           }
10651         }
10652         // Find highest score and allocate that register
10653         int maxscore=0;
10654         for(hr=0;hr<HOST_REGS;hr++) {
10655           if(hr!=EXCLUDE_REG) {
10656             if(score[hr]>score[maxscore]) {
10657               maxscore=hr;
10658               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*4,start+end[hr]*4);
10659             }
10660           }
10661         }
10662         if(score[maxscore]>1)
10663         {
10664           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
10665           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
10666             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*4+start,j*4+start,maxscore,regs[j].regmap[maxscore]);}
10667             assert(regs[j].regmap[maxscore]<0);
10668             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
10669             regs[j].regmap[maxscore]=reg;
10670             regs[j].dirty&=~(1<<maxscore);
10671             regs[j].wasconst&=~(1<<maxscore);
10672             regs[j].isconst&=~(1<<maxscore);
10673             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP) {
10674               branch_regs[j].regmap[maxscore]=reg;
10675               branch_regs[j].wasdirty&=~(1<<maxscore);
10676               branch_regs[j].dirty&=~(1<<maxscore);
10677               branch_regs[j].wasconst&=~(1<<maxscore);
10678               branch_regs[j].isconst&=~(1<<maxscore);
10679               if(itype[j]!=RJUMP&&itype[j]!=UJUMP&&(source[j]>>16)!=0x1000) {
10680                 regmap_pre[j+2][maxscore]=reg;
10681                 regs[j+2].wasdirty&=~(1<<maxscore);
10682               }
10683               // loop optimization (loop_preload)
10684               int t=(ba[j]-start)>>2;
10685               if(t==loop_start[maxscore]) {
10686                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=31)) // call/ret assumes no registers allocated
10687                   regs[t].regmap_entry[maxscore]=reg;
10688               }
10689             }
10690             else
10691             {
10692               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=CJUMP&&itype[j-1]!=SJUMP&&itype[j-1]!=FJUMP)) {
10693                 regmap_pre[j+1][maxscore]=reg;
10694                 regs[j+1].wasdirty&=~(1<<maxscore);
10695               }
10696             }
10697           }
10698           i=j-1;
10699           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==CJUMP||itype[j-1]==SJUMP||itype[j-1]==FJUMP) i++; // skip delay slot
10700           for(hr=0;hr<HOST_REGS;hr++) {
10701             score[hr]=0;earliest_available[hr]=i+1;
10702             loop_start[hr]=MAXBLOCK;
10703           }
10704         }
10705       }
10706     }
10707   }
10708   #endif
10709   
10710   // This allocates registers (if possible) one instruction prior
10711   // to use, which can avoid a load-use penalty on certain CPUs.
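  // For example, if the next instruction reads a MIPS register that is mapped
  // to host register hr there, and hr is still unused here, the mapping is
  // extended back one instruction so the value is already loaded when needed.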
10712   for(i=0;i<slen-1;i++)
10713   {
10714     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
10715     {
10716       if(!bt[i+1])
10717       {
10718         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
10719            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
10720         {
10721           if(rs1[i+1]) {
10722             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
10723             {
10724               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10725               {
10726                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10727                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10728                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10729                 regs[i].isconst&=~(1<<hr);
10730                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10731                 constmap[i][hr]=constmap[i+1][hr];
10732                 regs[i+1].wasdirty&=~(1<<hr);
10733                 regs[i].dirty&=~(1<<hr);
10734               }
10735             }
10736           }
10737           if(rs2[i+1]) {
10738             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
10739             {
10740               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10741               {
10742                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10743                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10744                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10745                 regs[i].isconst&=~(1<<hr);
10746                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10747                 constmap[i][hr]=constmap[i+1][hr];
10748                 regs[i+1].wasdirty&=~(1<<hr);
10749                 regs[i].dirty&=~(1<<hr);
10750               }
10751             }
10752           }
10753           // Preload target address for load instruction (non-constant)
10754           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10755             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10756             {
10757               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10758               {
10759                 regs[i].regmap[hr]=rs1[i+1];
10760                 regmap_pre[i+1][hr]=rs1[i+1];
10761                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10762                 regs[i].isconst&=~(1<<hr);
10763                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10764                 constmap[i][hr]=constmap[i+1][hr];
10765                 regs[i+1].wasdirty&=~(1<<hr);
10766                 regs[i].dirty&=~(1<<hr);
10767               }
10768             }
10769           }
10770           // Load source into target register 
10771           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10772             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10773             {
10774               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10775               {
10776                 regs[i].regmap[hr]=rs1[i+1];
10777                 regmap_pre[i+1][hr]=rs1[i+1];
10778                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10779                 regs[i].isconst&=~(1<<hr);
10780                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10781                 constmap[i][hr]=constmap[i+1][hr];
10782                 regs[i+1].wasdirty&=~(1<<hr);
10783                 regs[i].dirty&=~(1<<hr);
10784               }
10785             }
10786           }
10787           // Preload map address
10788           #ifndef HOST_IMM_ADDR32
10789           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
10790             hr=get_reg(regs[i+1].regmap,TLREG);
10791             if(hr>=0) {
10792               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
10793               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
10794                 int nr;
10795                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10796                 {
10797                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
10798                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
10799                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
10800                   regs[i].isconst&=~(1<<hr);
10801                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10802                   constmap[i][hr]=constmap[i+1][hr];
10803                   regs[i+1].wasdirty&=~(1<<hr);
10804                   regs[i].dirty&=~(1<<hr);
10805                 }
10806                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10807                 {
10808                   // move it to another register
10809                   regs[i+1].regmap[hr]=-1;
10810                   regmap_pre[i+2][hr]=-1;
10811                   regs[i+1].regmap[nr]=TLREG;
10812                   regmap_pre[i+2][nr]=TLREG;
10813                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
10814                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
10815                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
10816                   regs[i].isconst&=~(1<<nr);
10817                   regs[i+1].isconst&=~(1<<nr);
10818                   regs[i].dirty&=~(1<<nr);
10819                   regs[i+1].wasdirty&=~(1<<nr);
10820                   regs[i+1].dirty&=~(1<<nr);
10821                   regs[i+2].wasdirty&=~(1<<nr);
10822                 }
10823               }
10824             }
10825           }
10826           #endif
10827           // Address for store instruction (non-constant)
10828           if(itype[i+1]==STORE||itype[i+1]==STORELR
10829              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
10830             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10831               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
10832               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10833               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
10834               assert(hr>=0);
10835               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10836               {
10837                 regs[i].regmap[hr]=rs1[i+1];
10838                 regmap_pre[i+1][hr]=rs1[i+1];
10839                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10840                 regs[i].isconst&=~(1<<hr);
10841                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10842                 constmap[i][hr]=constmap[i+1][hr];
10843                 regs[i+1].wasdirty&=~(1<<hr);
10844                 regs[i].dirty&=~(1<<hr);
10845               }
10846             }
10847           }
10848           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
10849             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10850               int nr;
10851               hr=get_reg(regs[i+1].regmap,FTEMP);
10852               assert(hr>=0);
10853               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10854               {
10855                 regs[i].regmap[hr]=rs1[i+1];
10856                 regmap_pre[i+1][hr]=rs1[i+1];
10857                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10858                 regs[i].isconst&=~(1<<hr);
10859                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10860                 constmap[i][hr]=constmap[i+1][hr];
10861                 regs[i+1].wasdirty&=~(1<<hr);
10862                 regs[i].dirty&=~(1<<hr);
10863               }
10864               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10865               {
10866                 // move it to another register
10867                 regs[i+1].regmap[hr]=-1;
10868                 regmap_pre[i+2][hr]=-1;
10869                 regs[i+1].regmap[nr]=FTEMP;
10870                 regmap_pre[i+2][nr]=FTEMP;
10871                 regs[i].regmap[nr]=rs1[i+1];
10872                 regmap_pre[i+1][nr]=rs1[i+1];
10873                 regs[i+1].regmap_entry[nr]=rs1[i+1];
10874                 regs[i].isconst&=~(1<<nr);
10875                 regs[i+1].isconst&=~(1<<nr);
10876                 regs[i].dirty&=~(1<<nr);
10877                 regs[i+1].wasdirty&=~(1<<nr);
10878                 regs[i+1].dirty&=~(1<<nr);
10879                 regs[i+2].wasdirty&=~(1<<nr);
10880               }
10881             }
10882           }
10883           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
10884             if(itype[i+1]==LOAD) 
10885               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
10886             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
10887               hr=get_reg(regs[i+1].regmap,FTEMP);
10888             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
10889               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
10890               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10891             }
10892             if(hr>=0&&regs[i].regmap[hr]<0) {
10893               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
10894               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
10895                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
10896                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
10897                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
10898                 regs[i].isconst&=~(1<<hr);
10899                 regs[i+1].wasdirty&=~(1<<hr);
10900                 regs[i].dirty&=~(1<<hr);
10901               }
10902             }
10903           }
10904         }
10905       }
10906     }
10907   }
10908   
10909   /* Pass 6 - Optimize clean/dirty state */
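  // Roughly: clean_registers() works out, for each instruction and branch
  // target, which cached registers still differ from the MIPS register file
  // (dirty) and therefore need a writeback, and where writeback can be skipped.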
10910   clean_registers(0,slen-1,1);
10911   
10912   /* Pass 7 - Identify 32-bit registers */
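// Scanning backwards, requires_32bit[i] collects the MIPS registers that must
// still be held as proper sign-extended 32-bit values at instruction i:
// writes clear a register's bit, sources that depend on 32-bit values set it,
// and branches merge in the requirements of their target.  In the FORCE32
// (32-bit-only) build this analysis is unnecessary and the pass only marks
// the instruction following a conditional branch's delay slot as a branch target.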
10913 #ifndef FORCE32
10914   provisional_r32();
10915
10916   u_int r32=0;
10917   
10918   for (i=slen-1;i>=0;i--)
10919   {
10920     int hr;
10921     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10922     {
10923       if(ba[i]<start || ba[i]>=(start+slen*4))
10924       {
10925         // Branch out of this block, don't need anything
10926         r32=0;
10927       }
10928       else
10929       {
10930         // Internal branch
10931         // Need whatever matches the target
10932         // (and doesn't get overwritten by the delay slot instruction)
10933         r32=0;
10934         int t=(ba[i]-start)>>2;
10935         if(ba[i]>start+i*4) {
10936           // Forward branch
10937           if(!(requires_32bit[t]&~regs[i].was32))
10938             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10939         }else{
10940           // Backward branch
10941           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
10942           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10943           if(!(pr32[t]&~regs[i].was32))
10944             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10945         }
10946       }
10947       // Conditional branch may need registers for following instructions
10948       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10949       {
10950         if(i<slen-2) {
10951           r32|=requires_32bit[i+2];
10952           r32&=regs[i].was32;
10953           // Mark this address as a branch target since it may be called
10954           // upon return from interrupt
10955           bt[i+2]=1;
10956         }
10957       }
10958       // Merge in delay slot
10959       if(!likely[i]) {
10960         // These are overwritten unless the branch is "likely",
10961         // in which case the delay slot is nullified when not taken
10962         r32&=~(1LL<<rt1[i+1]);
10963         r32&=~(1LL<<rt2[i+1]);
10964       }
10965       // Assume these are needed (delay slot)
10966       if(us1[i+1]>0)
10967       {
10968         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
10969       }
10970       if(us2[i+1]>0)
10971       {
10972         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
10973       }
10974       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
10975       {
10976         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
10977       }
10978       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
10979       {
10980         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
10981       }
10982     }
10983     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
10984     {
10985       // SYSCALL instruction (software interrupt)
10986       r32=0;
10987     }
10988     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
10989     {
10990       // ERET instruction (return from interrupt)
10991       r32=0;
10992     }
10993     // Check 32 bits
10994     r32&=~(1LL<<rt1[i]);
10995     r32&=~(1LL<<rt2[i]);
10996     if(us1[i]>0)
10997     {
10998       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
10999     }
11000     if(us2[i]>0)
11001     {
11002       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
11003     }
11004     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
11005     {
11006       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
11007     }
11008     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
11009     {
11010       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
11011     }
11012     requires_32bit[i]=r32;
11013     
11014     // Dirty registers which are 32-bit require 32-bit input,
11015     // as they will be written back as 32-bit values
11016     for(hr=0;hr<HOST_REGS;hr++)
11017     {
11018       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
11019         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
11020           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
11021             requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
11022         }
11023       }
11024     }
11025     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
11026   }
11027 #else
11028   for (i=slen-1;i>=0;i--)
11029   {
11030     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
11031     {
11032       // Conditional branch
11033       if((source[i]>>16)!=0x1000&&i<slen-2) {
11034         // Mark this address as a branch target since it may be called
11035         // upon return from interrupt
11036         bt[i+2]=1;
11037       }
11038     }
11039   }
11040 #endif
11041
11042   if(itype[slen-1]==SPAN) {
11043     bt[slen-1]=1; // Mark as a branch target so the instruction can restart after an exception
11044   }
11045
11046 #ifdef DISASM
11047   /* Debug/disassembly */
11048   for(i=0;i<slen;i++)
11049   {
11050     printf("U:");
11051     int r;
11052     for(r=1;r<=CCREG;r++) {
11053       if((unneeded_reg[i]>>r)&1) {
11054         if(r==HIREG) printf(" HI");
11055         else if(r==LOREG) printf(" LO");
11056         else printf(" r%d",r);
11057       }
11058     }
11059 #ifndef FORCE32
11060     printf(" UU:");
11061     for(r=1;r<=CCREG;r++) {
11062       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
11063         if(r==HIREG) printf(" HI");
11064         else if(r==LOREG) printf(" LO");
11065         else printf(" r%d",r);
11066       }
11067     }
11068     printf(" 32:");
11069     for(r=0;r<=CCREG;r++) {
11070       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
11071       if((regs[i].was32>>r)&1) {
11072         if(r==CCREG) printf(" CC");
11073         else if(r==HIREG) printf(" HI");
11074         else if(r==LOREG) printf(" LO");
11075         else printf(" r%d",r);
11076       }
11077     }
11078 #endif
11079     printf("\n");
11080     #if defined(__i386__) || defined(__x86_64__)
11081     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
11082     #endif
11083     #ifdef __arm__
11084     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
11085     #endif
11086     printf("needs: ");
11087     if(needed_reg[i]&1) printf("eax ");
11088     if((needed_reg[i]>>1)&1) printf("ecx ");
11089     if((needed_reg[i]>>2)&1) printf("edx ");
11090     if((needed_reg[i]>>3)&1) printf("ebx ");
11091     if((needed_reg[i]>>5)&1) printf("ebp ");
11092     if((needed_reg[i]>>6)&1) printf("esi ");
11093     if((needed_reg[i]>>7)&1) printf("edi ");
11094     printf("r:");
11095     for(r=0;r<=CCREG;r++) {
11096       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
11097       if((requires_32bit[i]>>r)&1) {
11098         if(r==CCREG) printf(" CC");
11099         else if(r==HIREG) printf(" HI");
11100         else if(r==LOREG) printf(" LO");
11101         else printf(" r%d",r);
11102       }
11103     }
11104     printf("\n");
11105     /*printf("pr:");
11106     for(r=0;r<=CCREG;r++) {
11107       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
11108       if((pr32[i]>>r)&1) {
11109         if(r==CCREG) printf(" CC");
11110         else if(r==HIREG) printf(" HI");
11111         else if(r==LOREG) printf(" LO");
11112         else printf(" r%d",r);
11113       }
11114     }
11115     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
11116     printf("\n");*/
11117     #if defined(__i386__) || defined(__x86_64__)
11118     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
11119     printf("dirty: ");
11120     if(regs[i].wasdirty&1) printf("eax ");
11121     if((regs[i].wasdirty>>1)&1) printf("ecx ");
11122     if((regs[i].wasdirty>>2)&1) printf("edx ");
11123     if((regs[i].wasdirty>>3)&1) printf("ebx ");
11124     if((regs[i].wasdirty>>5)&1) printf("ebp ");
11125     if((regs[i].wasdirty>>6)&1) printf("esi ");
11126     if((regs[i].wasdirty>>7)&1) printf("edi ");
11127     #endif
11128     #ifdef __arm__
11129     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
11130     printf("dirty: ");
11131     if(regs[i].wasdirty&1) printf("r0 ");
11132     if((regs[i].wasdirty>>1)&1) printf("r1 ");
11133     if((regs[i].wasdirty>>2)&1) printf("r2 ");
11134     if((regs[i].wasdirty>>3)&1) printf("r3 ");
11135     if((regs[i].wasdirty>>4)&1) printf("r4 ");
11136     if((regs[i].wasdirty>>5)&1) printf("r5 ");
11137     if((regs[i].wasdirty>>6)&1) printf("r6 ");
11138     if((regs[i].wasdirty>>7)&1) printf("r7 ");
11139     if((regs[i].wasdirty>>8)&1) printf("r8 ");
11140     if((regs[i].wasdirty>>9)&1) printf("r9 ");
11141     if((regs[i].wasdirty>>10)&1) printf("r10 ");
11142     if((regs[i].wasdirty>>12)&1) printf("r12 ");
11143     #endif
11144     printf("\n");
11145     disassemble_inst(i);
11146     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
11147     #if defined(__i386__) || defined(__x86_64__)
11148     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
11149     if(regs[i].dirty&1) printf("eax ");
11150     if((regs[i].dirty>>1)&1) printf("ecx ");
11151     if((regs[i].dirty>>2)&1) printf("edx ");
11152     if((regs[i].dirty>>3)&1) printf("ebx ");
11153     if((regs[i].dirty>>5)&1) printf("ebp ");
11154     if((regs[i].dirty>>6)&1) printf("esi ");
11155     if((regs[i].dirty>>7)&1) printf("edi ");
11156     #endif
11157     #ifdef __arm__
11158     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
11159     if(regs[i].dirty&1) printf("r0 ");
11160     if((regs[i].dirty>>1)&1) printf("r1 ");
11161     if((regs[i].dirty>>2)&1) printf("r2 ");
11162     if((regs[i].dirty>>3)&1) printf("r3 ");
11163     if((regs[i].dirty>>4)&1) printf("r4 ");
11164     if((regs[i].dirty>>5)&1) printf("r5 ");
11165     if((regs[i].dirty>>6)&1) printf("r6 ");
11166     if((regs[i].dirty>>7)&1) printf("r7 ");
11167     if((regs[i].dirty>>8)&1) printf("r8 ");
11168     if((regs[i].dirty>>9)&1) printf("r9 ");
11169     if((regs[i].dirty>>10)&1) printf("r10 ");
11170     if((regs[i].dirty>>12)&1) printf("r12 ");
11171     #endif
11172     printf("\n");
11173     if(regs[i].isconst) {
11174       printf("constants: ");
11175       #if defined(__i386__) || defined(__x86_64__)
11176       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
11177       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
11178       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
11179       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
11180       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
11181       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
11182       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
11183       #endif
11184       #ifdef __arm__
11185       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
11186       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
11187       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
11188       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
11189       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
11190       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
11191       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
11192       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
11193       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
11194       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
11195       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
11196       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
11197       #endif
11198       printf("\n");
11199     }
11200 #ifndef FORCE32
11201     printf(" 32:");
11202     for(r=0;r<=CCREG;r++) {
11203       if((regs[i].is32>>r)&1) {
11204         if(r==CCREG) printf(" CC");
11205         else if(r==HIREG) printf(" HI");
11206         else if(r==LOREG) printf(" LO");
11207         else printf(" r%d",r);
11208       }
11209     }
11210     printf("\n");
11211 #endif
11212     /*printf(" p32:");
11213     for(r=0;r<=CCREG;r++) {
11214       if((p32[i]>>r)&1) {
11215         if(r==CCREG) printf(" CC");
11216         else if(r==HIREG) printf(" HI");
11217         else if(r==LOREG) printf(" LO");
11218         else printf(" r%d",r);
11219       }
11220     }
11221     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
11222     else printf("\n");*/
11223     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
11224       #if defined(__i386__) || defined(__x86_64__)
11225       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
11226       if(branch_regs[i].dirty&1) printf("eax ");
11227       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
11228       if((branch_regs[i].dirty>>2)&1) printf("edx ");
11229       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
11230       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
11231       if((branch_regs[i].dirty>>6)&1) printf("esi ");
11232       if((branch_regs[i].dirty>>7)&1) printf("edi ");
11233       #endif
11234       #ifdef __arm__
11235       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
11236       if(branch_regs[i].dirty&1) printf("r0 ");
11237       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
11238       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
11239       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
11240       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
11241       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
11242       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
11243       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
11244       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
11245       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
11246       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
11247       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
11248       #endif
11249 #ifndef FORCE32
11250       printf(" 32:");
11251       for(r=0;r<=CCREG;r++) {
11252         if((branch_regs[i].is32>>r)&1) {
11253           if(r==CCREG) printf(" CC");
11254           else if(r==HIREG) printf(" HI");
11255           else if(r==LOREG) printf(" LO");
11256           else printf(" r%d",r);
11257         }
11258       }
11259       printf("\n");
11260 #endif
11261     }
11262   }
11263 #endif // DISASM
11264
11265   /* Pass 8 - Assembly */
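  // For each instruction: write back or reload host registers so the state
  // matches the maps computed by the earlier passes, record the current
  // output address in instr_addr[] (the entry point used for branch targets),
  // then dispatch on itype[] to the per-instruction assembler.  Branches
  // assemble their own delay slot, so ds is set and the next iteration skips it.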
11266   linkcount=0;stubcount=0;
11267   ds=0;is_delayslot=0;
11268   cop1_usable=0;
11269   uint64_t is32_pre=0;
11270   u_int dirty_pre=0;
11271   u_int beginning=(u_int)out;
11272   if((u_int)addr&1) {
11273     ds=1;
11274     pagespan_ds();
11275   }
11276   u_int instr_addr0_override=0;
11277
11278 #ifdef PCSX
11279   if (start == 0x80030000) {
11280     // nasty hack for fastbios thing
11281     // override block entry to this code
11282     instr_addr0_override=(u_int)out;
11283     emit_movimm(start,0);
11284     // abuse io address var as a flag that we
11285     // have already returned here once
11286     emit_readword((int)&address,1);
11287     emit_writeword(0,(int)&pcaddr);
11288     emit_writeword(0,(int)&address);
11289     emit_cmp(0,1);
11290     emit_jne((int)new_dyna_leave);
11291   }
11292 #endif
11293   for(i=0;i<slen;i++)
11294   {
11295     //if(ds) printf("ds: ");
11296     disassemble_inst(i);
11297     if(ds) {
11298       ds=0; // Skip delay slot
11299       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
11300       instr_addr[i]=0;
11301     } else {
11302       speculate_register_values(i);
11303       #ifndef DESTRUCTIVE_WRITEBACK
11304       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
11305       {
11306         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
11307               unneeded_reg[i],unneeded_reg_upper[i]);
11308         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
11309               unneeded_reg[i],unneeded_reg_upper[i]);
11310       }
11311       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
11312         is32_pre=branch_regs[i].is32;
11313         dirty_pre=branch_regs[i].dirty;
11314       }else{
11315         is32_pre=regs[i].is32;
11316         dirty_pre=regs[i].dirty;
11317       }
11318       #endif
11319       // write back
11320       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
11321       {
11322         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
11323                       unneeded_reg[i],unneeded_reg_upper[i]);
11324         loop_preload(regmap_pre[i],regs[i].regmap_entry);
11325       }
11326       // branch target entry point
11327       instr_addr[i]=(u_int)out;
11328       assem_debug("<->\n");
11329       // load regs
11330       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
11331         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
11332       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
11333       address_generation(i,&regs[i],regs[i].regmap_entry);
11334       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
11335       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
11336       {
11337         // Load the delay slot registers if necessary
11338         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
11339           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
11340         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
11341           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
11342         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
11343           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
11344       }
11345       else if(i+1<slen)
11346       {
11347         // Preload registers for following instruction
11348         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
11349           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
11350             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
11351         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
11352           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
11353             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
11354       }
11355       // TODO: if(is_ooo(i)) address_generation(i+1);
11356       if(itype[i]==CJUMP||itype[i]==FJUMP)
11357         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
11358       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
11359         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
11360       if(bt[i]) cop1_usable=0;
11361       // assemble
11362       switch(itype[i]) {
11363         case ALU:
11364           alu_assemble(i,&regs[i]);break;
11365         case IMM16:
11366           imm16_assemble(i,&regs[i]);break;
11367         case SHIFT:
11368           shift_assemble(i,&regs[i]);break;
11369         case SHIFTIMM:
11370           shiftimm_assemble(i,&regs[i]);break;
11371         case LOAD:
11372           load_assemble(i,&regs[i]);break;
11373         case LOADLR:
11374           loadlr_assemble(i,&regs[i]);break;
11375         case STORE:
11376           store_assemble(i,&regs[i]);break;
11377         case STORELR:
11378           storelr_assemble(i,&regs[i]);break;
11379         case COP0:
11380           cop0_assemble(i,&regs[i]);break;
11381         case COP1:
11382           cop1_assemble(i,&regs[i]);break;
11383         case C1LS:
11384           c1ls_assemble(i,&regs[i]);break;
11385         case COP2:
11386           cop2_assemble(i,&regs[i]);break;
11387         case C2LS:
11388           c2ls_assemble(i,&regs[i]);break;
11389         case C2OP:
11390           c2op_assemble(i,&regs[i]);break;
11391         case FCONV:
11392           fconv_assemble(i,&regs[i]);break;
11393         case FLOAT:
11394           float_assemble(i,&regs[i]);break;
11395         case FCOMP:
11396           fcomp_assemble(i,&regs[i]);break;
11397         case MULTDIV:
11398           multdiv_assemble(i,&regs[i]);break;
11399         case MOV:
11400           mov_assemble(i,&regs[i]);break;
11401         case SYSCALL:
11402           syscall_assemble(i,&regs[i]);break;
11403         case HLECALL:
11404           hlecall_assemble(i,&regs[i]);break;
11405         case INTCALL:
11406           intcall_assemble(i,&regs[i]);break;
11407         case UJUMP:
11408           ujump_assemble(i,&regs[i]);ds=1;break;
11409         case RJUMP:
11410           rjump_assemble(i,&regs[i]);ds=1;break;
11411         case CJUMP:
11412           cjump_assemble(i,&regs[i]);ds=1;break;
11413         case SJUMP:
11414           sjump_assemble(i,&regs[i]);ds=1;break;
11415         case FJUMP:
11416           fjump_assemble(i,&regs[i]);ds=1;break;
11417         case SPAN:
11418           pagespan_assemble(i,&regs[i]);break;
11419       }
11420       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
11421         literal_pool(1024);
11422       else
11423         literal_pool_jumpover(256);
11424     }
11425   }
11426   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
11427   // If the block did not end with an unconditional branch,
11428   // add a jump to the next instruction.
11429   if(i>1) {
11430     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
11431       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
11432       assert(i==slen);
11433       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
11434         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
11435         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
11436           emit_loadreg(CCREG,HOST_CCREG);
11437         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
11438       }
11439       else if(!likely[i-2])
11440       {
11441         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
11442         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
11443       }
11444       else
11445       {
11446         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
11447         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
11448       }
11449       add_to_linker((int)out,start+i*4,0);
11450       emit_jmp(0);
11451     }
11452   }
11453   else
11454   {
11455     assert(i>0);
11456     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
11457     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
11458     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
11459       emit_loadreg(CCREG,HOST_CCREG);
11460     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
11461     add_to_linker((int)out,start+i*4,0);
11462     emit_jmp(0);
11463   }
11464
11465   // TODO: delay slot stubs?
11466   // Stubs
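  // stubs[] was filled during pass 8 with out-of-line slow paths: memory
  // accesses that fall outside the fast path, cycle-count checks (CC_STUB),
  // self-modifying-code checks for stores (INVCODE_STUB), coprocessor-unusable
  // exceptions (FP_STUB) and unaligned stores.  Emit them after the main code.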
11467   for(i=0;i<stubcount;i++)
11468   {
11469     switch(stubs[i][0])
11470     {
11471       case LOADB_STUB:
11472       case LOADH_STUB:
11473       case LOADW_STUB:
11474       case LOADD_STUB:
11475       case LOADBU_STUB:
11476       case LOADHU_STUB:
11477         do_readstub(i);break;
11478       case STOREB_STUB:
11479       case STOREH_STUB:
11480       case STOREW_STUB:
11481       case STORED_STUB:
11482         do_writestub(i);break;
11483       case CC_STUB:
11484         do_ccstub(i);break;
11485       case INVCODE_STUB:
11486         do_invstub(i);break;
11487       case FP_STUB:
11488         do_cop1stub(i);break;
11489       case STORELR_STUB:
11490         do_unalignedwritestub(i);break;
11491     }
11492   }
11493
11494   if (instr_addr0_override)
11495     instr_addr[0] = instr_addr0_override;
11496
11497   /* Pass 9 - Linker */
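  // link_addr[] records branches whose target code was unknown when they were
  // emitted.  For targets outside this block an extjump stub is generated; if
  // check_addr() already finds compiled code the branch is pointed straight at
  // it and the stub is kept via add_link() so it can be re-pointed later,
  // otherwise the branch jumps to the stub.  Internal targets are patched
  // directly to instr_addr[target].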
11498   for(i=0;i<linkcount;i++)
11499   {
11500     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
11501     literal_pool(64);
11502     if(!link_addr[i][2])
11503     {
11504       void *stub=out;
11505       void *addr=check_addr(link_addr[i][1]);
11506       emit_extjump(link_addr[i][0],link_addr[i][1]);
11507       if(addr) {
11508         set_jump_target(link_addr[i][0],(int)addr);
11509         add_link(link_addr[i][1],stub);
11510       }
11511       else set_jump_target(link_addr[i][0],(int)stub);
11512     }
11513     else
11514     {
11515       // Internal branch
11516       int target=(link_addr[i][1]-start)>>2;
11517       assert(target>=0&&target<slen);
11518       assert(instr_addr[target]);
11519       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11520       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
11521       //#else
11522       set_jump_target(link_addr[i][0],instr_addr[target]);
11523       //#endif
11524     }
11525   }
11526   // External Branch Targets (jump_in)
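  // Give every branch target (and the block start) an externally reachable
  // entry point: emit a dirty-check stub, register it in the jump_dirty and
  // jump_in lists for the target's page, and update any existing hash table
  // entry for that address to the new entry point.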
11527   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
11528   for(i=0;i<slen;i++)
11529   {
11530     if(bt[i]||i==0)
11531     {
11532       if(instr_addr[i]) // TODO - delay slots (=null)
11533       {
11534         u_int vaddr=start+i*4;
11535         u_int page=get_page(vaddr);
11536         u_int vpage=get_vpage(vaddr);
11537         literal_pool(256);
11538         //if(!(is32[i]&(~unneeded_reg_upper[i])&~(1LL<<CCREG)))
11539 #ifndef FORCE32
11540         if(!requires_32bit[i])
11541 #else
11542         if(1)
11543 #endif
11544         {
11545           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11546           assem_debug("jump_in: %x\n",start+i*4);
11547           ll_add(jump_dirty+vpage,vaddr,(void *)out);
11548           int entry_point=do_dirty_stub(i);
11549           ll_add(jump_in+page,vaddr,(void *)entry_point);
11550           // If there was an existing entry in the hash table,
11551           // replace it with the new address.
11552           // Don't add new entries.  We'll insert the
11553           // ones that actually get used in check_addr().
11554           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
11555           if(ht_bin[0]==vaddr) {
11556             ht_bin[1]=entry_point;
11557           }
11558           if(ht_bin[2]==vaddr) {
11559             ht_bin[3]=entry_point;
11560           }
11561         }
11562         else
11563         {
11564           u_int r=requires_32bit[i]|!!(requires_32bit[i]>>32);
11565           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
11566           assem_debug("jump_in: %x (restricted - %x)\n",start+i*4,r);
11567           //int entry_point=(int)out;
11568           ////assem_debug("entry_point: %x\n",entry_point);
11569           //load_regs_entry(i);
11570           //if(entry_point==(int)out)
11571           //  entry_point=instr_addr[i];
11572           //else
11573           //  emit_jmp(instr_addr[i]);
11574           //ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
11575           ll_add_32(jump_dirty+vpage,vaddr,r,(void *)out);
11576           int entry_point=do_dirty_stub(i);
11577           ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
11578         }
11579       }
11580     }
11581   }
11582   // Write out the literal pool if necessary
11583   literal_pool(0);
11584   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
11585   // Align code
11586   if(((u_int)out)&7) emit_addnop(13);
11587   #endif
11588   assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
11589   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
11590   memcpy(copy,source,slen*4);
11591   copy+=slen*4;
11592   
11593   #ifdef __arm__
11594   __clear_cache((void *)beginning,out);
11595   #endif
11596   
11597   // If we're within 256K (MAX_OUTPUT_BLOCK_SIZE) of the end of the buffer,
11598   // start over from the beginning so the next block is sure to fit.
11599   if((u_int)out>(u_int)BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
11600   
11601   // Trap writes to any of the pages we compiled
11602   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
11603     invalid_code[i]=0;
11604 #ifndef DISABLE_TLB
11605     memory_map[i]|=0x40000000;
11606     if((signed int)start>=(signed int)0xC0000000) {
11607       assert(using_tlb);
11608       j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
11609       invalid_code[j]=0;
11610       memory_map[j]|=0x40000000;
11611       //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
11612     }
11613 #endif
11614   }
11615   inv_code_start=inv_code_end=~0;
11616 #ifdef PCSX
11617   // for PCSX we need to mark all mirrors too
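  // (PSX RAM is 2MB and is mirrored at 0x00000000, 0x80000000 and 0xa0000000,
  // hence the &0x1ff page mask and the three invalid_code bases below.)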
11618   if(get_page(start)<(RAM_SIZE>>12))
11619     for(i=start>>12;i<=(start+slen*4)>>12;i++)
11620       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
11621       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
11622       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
11623 #endif
11624   
11625   /* Pass 10 - Free memory by expiring oldest blocks */
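  // expirep is a 16-bit cursor that sweeps the translation cache ahead of the
  // output pointer: its top three bits select one of eight equal regions, bits
  // 11-12 select what gets purged for that region (jump_in/jump_dirty lists,
  // jump_out pointers, hash table entries, jump_out lists), and the low bits
  // index within those tables.  The +16384 below keeps expiry roughly a
  // quarter of the cache ahead of out, so old blocks are freed before new
  // code can reach them.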
11626   
11627   int end=((((int)out-(int)BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
11628   while(expirep!=end)
11629   {
11630     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
11631     int base=(int)BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
11632     inv_debug("EXP: Phase %d\n",expirep);
11633     switch((expirep>>11)&3)
11634     {
11635       case 0:
11636         // Clear jump_in and jump_dirty
11637         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
11638         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
11639         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
11640         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
11641         break;
11642       case 1:
11643         // Clear pointers
11644         ll_kill_pointers(jump_out[expirep&2047],base,shift);
11645         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
11646         break;
11647       case 2:
11648         // Clear hash table
11649         for(i=0;i<32;i++) {
11650           int *ht_bin=hash_table[((expirep&2047)<<5)+i];
11651           if((ht_bin[3]>>shift)==(base>>shift) ||
11652              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11653             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
11654             ht_bin[2]=ht_bin[3]=-1;
11655           }
11656           if((ht_bin[1]>>shift)==(base>>shift) ||
11657              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
11658             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
11659             ht_bin[0]=ht_bin[2];
11660             ht_bin[1]=ht_bin[3];
11661             ht_bin[2]=ht_bin[3]=-1;
11662           }
11663         }
11664         break;
11665       case 3:
11666         // Clear jump_out
11667         #ifdef __arm__
11668         if((expirep&2047)==0) 
11669           do_clear_cache();
11670         #endif
11671         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
11672         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
11673         break;
11674     }
11675     expirep=(expirep+1)&65535;
11676   }
11677   return 0;
11678 }
11679
11680 // vim:shiftwidth=2:expandtab