1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2010 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24
25 #include "emu_if.h" //emulator interface
26
27 #include <sys/mman.h>
28
29 #ifdef __i386__
30 #include "assem_x86.h"
31 #endif
32 #ifdef __x86_64__
33 #include "assem_x64.h"
34 #endif
35 #ifdef __arm__
36 #include "assem_arm.h"
37 #endif
38
39 #define MAXBLOCK 4096
40 #define MAX_OUTPUT_BLOCK_SIZE 262144
41 #define CLOCK_DIVIDER 2
42
43 struct regstat
44 {
45   signed char regmap_entry[HOST_REGS];
46   signed char regmap[HOST_REGS];
47   uint64_t was32;
48   uint64_t is32;
49   uint64_t wasdirty;
50   uint64_t dirty;
51   uint64_t u;
52   uint64_t uu;
53   u_int wasconst;
54   u_int isconst;
55   uint64_t constmap[HOST_REGS];
56 };
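/* Illustrative note (not part of the original source): regstat is a snapshot of
   the register allocator's state at one point in the block.  regmap[hr] holds
   the guest register currently cached in host register hr (-1 = free); dirty,
   wasdirty, isconst and wasconst are per-host-register bitmasks; was32/is32
   track which guest registers hold 32-bit sign-extended values; u/uu mark guest
   registers whose lower/upper halves are dead.  A minimal sketch of reading
   such a snapshot, assuming a hypothetical 'st':

     int hr;
     for(hr=0;hr<HOST_REGS;hr++)
       if(st.regmap[hr]>=0&&((st.dirty>>hr)&1))
         printf("host r%d caches guest r%d and needs writeback\n",hr,st.regmap[hr]&63);
*/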
57
58 struct ll_entry
59 {
60   u_int vaddr;
61   u_int reg32;
62   void *addr;
63   struct ll_entry *next;
64 };
65
66   u_int start;
67   u_int *source;
68   u_int pagelimit;
69   char insn[MAXBLOCK][10];
70   u_char itype[MAXBLOCK];
71   u_char opcode[MAXBLOCK];
72   u_char opcode2[MAXBLOCK];
73   u_char bt[MAXBLOCK];
74   u_char rs1[MAXBLOCK];
75   u_char rs2[MAXBLOCK];
76   u_char rt1[MAXBLOCK];
77   u_char rt2[MAXBLOCK];
78   u_char us1[MAXBLOCK];
79   u_char us2[MAXBLOCK];
80   u_char dep1[MAXBLOCK];
81   u_char dep2[MAXBLOCK];
82   u_char lt1[MAXBLOCK];
83   int imm[MAXBLOCK];
84   u_int ba[MAXBLOCK];
85   char likely[MAXBLOCK];
86   char is_ds[MAXBLOCK];
87   uint64_t unneeded_reg[MAXBLOCK];
88   uint64_t unneeded_reg_upper[MAXBLOCK];
89   uint64_t branch_unneeded_reg[MAXBLOCK];
90   uint64_t branch_unneeded_reg_upper[MAXBLOCK];
91   uint64_t p32[MAXBLOCK];
92   uint64_t pr32[MAXBLOCK];
93   signed char regmap_pre[MAXBLOCK][HOST_REGS];
94   signed char regmap[MAXBLOCK][HOST_REGS];
95   signed char regmap_entry[MAXBLOCK][HOST_REGS];
96   uint64_t constmap[MAXBLOCK][HOST_REGS];
97   uint64_t known_value[HOST_REGS];
98   u_int known_reg;
99   struct regstat regs[MAXBLOCK];
100   struct regstat branch_regs[MAXBLOCK];
101   u_int needed_reg[MAXBLOCK];
102   uint64_t requires_32bit[MAXBLOCK];
103   u_int wont_dirty[MAXBLOCK];
104   u_int will_dirty[MAXBLOCK];
105   int ccadj[MAXBLOCK];
106   int slen;
107   u_int instr_addr[MAXBLOCK];
108   u_int link_addr[MAXBLOCK][3];
109   int linkcount;
110   u_int stubs[MAXBLOCK*3][8];
111   int stubcount;
112   u_int literals[1024][2];
113   int literalcount;
114   int is_delayslot;
115   int cop1_usable;
116   u_char *out;
117   struct ll_entry *jump_in[4096];
118   struct ll_entry *jump_out[4096];
119   struct ll_entry *jump_dirty[4096];
120   u_int hash_table[65536][4]  __attribute__((aligned(16)));
121   char shadow[1048576]  __attribute__((aligned(16)));
122   void *copy;
123   int expirep;
124   u_int using_tlb;
125   u_int stop_after_jal;
126   extern u_char restore_candidate[512];
127   extern int cycle_count;
128
129   /* registers that may be allocated */
130   /* 1-31 gpr */
131 #define HIREG 32 // hi
132 #define LOREG 33 // lo
133 #define FSREG 34 // FPU status (FCSR)
134 #define CSREG 35 // Coprocessor status
135 #define CCREG 36 // Cycle count
136 #define INVCP 37 // Pointer to invalid_code
137 #define TEMPREG 38
138 #define FTEMP 38 // FPU/LDL/LDR temporary register
139 #define PTEMP 39 // Prefetch temporary register
140 #define TLREG 40 // TLB mapping offset
141 #define RHASH 41 // Return address hash
142 #define RHTBL 42 // Return address hash table address
143 #define RTEMP 43 // JR/JALR address register
144 #define MAXREG 43
145 #define AGEN1 44 // Address generation temporary register
146 #define AGEN2 45 // Address generation temporary register
147 #define MGEN1 46 // Maptable address generation temporary register
148 #define MGEN2 47 // Maptable address generation temporary register
149 #define BTREG 48 // Branch target temporary register
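/* Illustrative note (not part of the original source): guest register numbers
   1-31 are the MIPS GPRs; 32 and up are internal pseudo-registers (HI/LO, the
   coprocessor status, the cycle counter, assorted temporaries) that are run
   through the same allocator as ordinary registers.  A sketch of looking one
   up in a register map produced by the allocator below:

     signed char hr=get_reg(regs[i].regmap,CCREG); // host reg caching the cycle count
     if(hr>=0) { ... emit code that uses host register hr ... }
     else      { ... CCREG is not currently cached ... }
*/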
150
151   /* instruction types */
152 #define NOP 0     // No operation
153 #define LOAD 1    // Load
154 #define STORE 2   // Store
155 #define LOADLR 3  // Unaligned load
156 #define STORELR 4 // Unaligned store
157 #define MOV 5     // Move 
158 #define ALU 6     // Arithmetic/logic
159 #define MULTDIV 7 // Multiply/divide
160 #define SHIFT 8   // Shift by register
161 #define SHIFTIMM 9 // Shift by immediate
162 #define IMM16 10  // 16-bit immediate
163 #define RJUMP 11  // Unconditional jump to register
164 #define UJUMP 12  // Unconditional jump
165 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
166 #define SJUMP 14  // Conditional branch (regimm format)
167 #define COP0 15   // Coprocessor 0
168 #define COP1 16   // Coprocessor 1
169 #define C1LS 17   // Coprocessor 1 load/store
170 #define FJUMP 18  // Conditional branch (floating point)
171 #define FLOAT 19  // Floating point unit
172 #define FCONV 20  // Convert integer to float
173 #define FCOMP 21  // Floating point compare (sets FSREG)
174 #define SYSCALL 22 // SYSCALL
175 #define OTHER 23  // Other
176 #define SPAN 24   // Branch/delay slot spans 2 pages
177 #define NI 25     // Not implemented
178 #define HLECALL 26 // PCSX fake opcodes for HLE
179 #define COP2 27   // Coprocessor 2 move
180 #define C2LS 28   // Coprocessor 2 load/store
181 #define C2OP 29   // Coprocessor 2 operation
182
183   /* stubs */
184 #define CC_STUB 1
185 #define FP_STUB 2
186 #define LOADB_STUB 3
187 #define LOADH_STUB 4
188 #define LOADW_STUB 5
189 #define LOADD_STUB 6
190 #define LOADBU_STUB 7
191 #define LOADHU_STUB 8
192 #define STOREB_STUB 9
193 #define STOREH_STUB 10
194 #define STOREW_STUB 11
195 #define STORED_STUB 12
196 #define STORELR_STUB 13
197 #define INVCODE_STUB 14
198
199   /* branch codes */
200 #define TAKEN 1
201 #define NOTTAKEN 2
202 #define NULLDS 3
203
204 // asm linkage
205 int new_recompile_block(int addr);
206 void *get_addr_ht(u_int vaddr);
207 void invalidate_block(u_int block);
208 void invalidate_addr(u_int addr);
209 void remove_hash(int vaddr);
210 void jump_vaddr();
211 void dyna_linker();
212 void dyna_linker_ds();
213 void verify_code();
214 void verify_code_vm();
215 void verify_code_ds();
216 void cc_interrupt();
217 void fp_exception();
218 void fp_exception_ds();
219 void jump_syscall();
220 void jump_syscall_hle();
221 void jump_eret();
222 void jump_hlecall();
223 void new_dyna_leave();
224
225 // TLB
226 void TLBWI_new();
227 void TLBWR_new();
228 void read_nomem_new();
229 void read_nomemb_new();
230 void read_nomemh_new();
231 void read_nomemd_new();
232 void write_nomem_new();
233 void write_nomemb_new();
234 void write_nomemh_new();
235 void write_nomemd_new();
236 void write_rdram_new();
237 void write_rdramb_new();
238 void write_rdramh_new();
239 void write_rdramd_new();
240 extern u_int memory_map[1048576];
241
242 // Needed by assembler
243 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
244 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
245 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
246 void load_all_regs(signed char i_regmap[]);
247 void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
248 void load_regs_entry(int t);
249 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
250
251 int tracedebug=0;
252
253 //#define DEBUG_CYCLE_COUNT 1
254
255 void nullf() {}
256 //#define assem_debug printf
257 //#define inv_debug printf
258 #define assem_debug nullf
259 #define inv_debug nullf
260
261 static void tlb_hacks()
262 {
263 #ifndef DISABLE_TLB
264   // Goldeneye hack
265   if (strncmp((char *) ROM_HEADER->nom, "GOLDENEYE",9) == 0)
266   {
267     u_int addr;
268     int n;
269     switch (ROM_HEADER->Country_code&0xFF) 
270     {
271       case 0x45: // U
272         addr=0x34b30;
273         break;                   
274       case 0x4A: // J 
275         addr=0x34b70;    
276         break;    
277       case 0x50: // E 
278         addr=0x329f0;
279         break;                        
280       default: 
281         // Unknown country code
282         addr=0;
283         break;
284     }
285     u_int rom_addr=(u_int)rom;
286     #ifdef ROM_COPY
287     // Since memory_map is 32-bit, on 64-bit systems the rom needs to be
288     // in the lower 4G of memory to use this hack.  Copy it if necessary.
289     if((void *)rom>(void *)0xffffffff) {
290       munmap(ROM_COPY, 67108864);
291       if(mmap(ROM_COPY, 12582912,
292               PROT_READ | PROT_WRITE,
293               MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
294               -1, 0) == MAP_FAILED) {printf("mmap() failed\n");}
295       memcpy(ROM_COPY,rom,12582912);
296       rom_addr=(u_int)ROM_COPY;
297     }
298     #endif
299     if(addr) {
300       for(n=0x7F000;n<0x80000;n++) {
301         memory_map[n]=(((u_int)(rom_addr+addr-0x7F000000))>>2)|0x40000000;
302       }
303     }
304   }
305 #endif
306 }
307
308 static u_int get_page(u_int vaddr)
309 {
310   u_int page=(vaddr^0x80000000)>>12;
311 #ifndef DISABLE_TLB
312   if(page>262143&&tlb_LUT_r[vaddr>>12]) page=(tlb_LUT_r[vaddr>>12]^0x80000000)>>12;
313 #endif
314   if(page>2048) page=2048+(page&2047);
315   return page;
316 }
317
318 static u_int get_vpage(u_int vaddr)
319 {
320   u_int vpage=(vaddr^0x80000000)>>12;
321 #ifndef DISABLE_TLB
322   if(vpage>262143&&tlb_LUT_r[vaddr>>12]) vpage&=2047; // jump_dirty uses a hash of the virtual address instead
323 #endif
324   if(vpage>2048) vpage=2048+(vpage&2047);
325   return vpage;
326 }
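/* Illustrative example (not part of the original source): for an ordinary
   KSEG0 RAM address the page index is simply the 4K page of the physical
   address, e.g.
     get_page(0x80010000) -> (0x80010000^0x80000000)>>12 = 0x10 (page 16)
   Anything that lands above page 2047 (TLB-mapped or high regions) is folded
   into pages 2048-4095 by the final "page=2048+(page&2047)" step, so the
   jump_in/jump_out/jump_dirty tables always stay within 4096 buckets. */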
327
328 // Get address from virtual address
329 // This is called from the recompiled JR/JALR instructions
330 void *get_addr(u_int vaddr)
331 {
332   u_int page=get_page(vaddr);
333   u_int vpage=get_vpage(vaddr);
334   struct ll_entry *head;
335   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
336   head=jump_in[page];
337   while(head!=NULL) {
338     if(head->vaddr==vaddr&&head->reg32==0) {
339   //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
340       int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
341       ht_bin[3]=ht_bin[1];
342       ht_bin[2]=ht_bin[0];
343       ht_bin[1]=(int)head->addr;
344       ht_bin[0]=vaddr;
345       return head->addr;
346     }
347     head=head->next;
348   }
349   head=jump_dirty[vpage];
350   while(head!=NULL) {
351     if(head->vaddr==vaddr&&head->reg32==0) {
352       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
353       // Don't restore blocks which are about to expire from the cache
354       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
355       if(verify_dirty(head->addr)) {
356         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
357         invalid_code[vaddr>>12]=0;
358         memory_map[vaddr>>12]|=0x40000000;
359         if(vpage<2048) {
360 #ifndef DISABLE_TLB
361           if(tlb_LUT_r[vaddr>>12]) {
362             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
363             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
364           }
365 #endif
366           restore_candidate[vpage>>3]|=1<<(vpage&7);
367         }
368         else restore_candidate[page>>3]|=1<<(page&7);
369         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
370         if(ht_bin[0]==vaddr) {
371           ht_bin[1]=(int)head->addr; // Replace existing entry
372         }
373         else
374         {
375           ht_bin[3]=ht_bin[1];
376           ht_bin[2]=ht_bin[0];
377           ht_bin[1]=(int)head->addr;
378           ht_bin[0]=vaddr;
379         }
380         return head->addr;
381       }
382     }
383     head=head->next;
384   }
385   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
386   int r=new_recompile_block(vaddr);
387   if(r==0) return get_addr(vaddr);
388 #ifdef PCSX
389   return (void *)r;
390 #else
391 // Execute in unmapped page, generate pagefault exception
392   Status|=2;
393   Cause=(vaddr<<31)|0x8;
394   EPC=(vaddr&1)?vaddr-5:vaddr;
395   BadVAddr=(vaddr&~1);
396   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
397   EntryHi=BadVAddr&0xFFFFE000;
398   return get_addr_ht(0x80000000);
399 #endif
400 }
401 // Look up address in hash table first
402 void *get_addr_ht(u_int vaddr)
403 {
404   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
405   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
406   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
407   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
408   return get_addr(vaddr);
409 }
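/* Illustrative sketch (not part of the original source): each hash_table bin
   holds two (vaddr, compiled address) pairs - [0]/[1] is the most recently
   used pair and [2]/[3] the older one, which is why get_addr() shifts the old
   pair down before installing a new entry.  A hypothetical probe-only helper
   that mirrors the lookup above, kept disabled here: */
#if 0
static void *ht_probe(u_int vaddr)
{
  u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
  if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
  if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
  return NULL; // not cached; callers fall back to the jump_in lists
}
#endif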
410
411 void *get_addr_32(u_int vaddr,u_int flags)
412 {
413 #ifdef FORCE32
414   return get_addr(vaddr);
415 #else
416   //printf("TRACE: count=%d next=%d (get_addr_32 %x,flags %x)\n",Count,next_interupt,vaddr,flags);
417   int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
418   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
419   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
420   u_int page=get_page(vaddr);
421   u_int vpage=get_vpage(vaddr);
422   struct ll_entry *head;
423   head=jump_in[page];
424   while(head!=NULL) {
425     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
426       //printf("TRACE: count=%d next=%d (get_addr_32 match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
427       if(head->reg32==0) {
428         int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
429         if(ht_bin[0]==-1) {
430           ht_bin[1]=(int)head->addr;
431           ht_bin[0]=vaddr;
432         }else if(ht_bin[2]==-1) {
433           ht_bin[3]=(int)head->addr;
434           ht_bin[2]=vaddr;
435         }
436         //ht_bin[3]=ht_bin[1];
437         //ht_bin[2]=ht_bin[0];
438         //ht_bin[1]=(int)head->addr;
439         //ht_bin[0]=vaddr;
440       }
441       return head->addr;
442     }
443     head=head->next;
444   }
445   head=jump_dirty[vpage];
446   while(head!=NULL) {
447     if(head->vaddr==vaddr&&(head->reg32&flags)==0) {
448       //printf("TRACE: count=%d next=%d (get_addr_32 match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
449       // Don't restore blocks which are about to expire from the cache
450       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
451       if(verify_dirty(head->addr)) {
452         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
453         invalid_code[vaddr>>12]=0;
454         memory_map[vaddr>>12]|=0x40000000;
455         if(vpage<2048) {
456 #ifndef DISABLE_TLB
457           if(tlb_LUT_r[vaddr>>12]) {
458             invalid_code[tlb_LUT_r[vaddr>>12]>>12]=0;
459             memory_map[tlb_LUT_r[vaddr>>12]>>12]|=0x40000000;
460           }
461 #endif
462           restore_candidate[vpage>>3]|=1<<(vpage&7);
463         }
464         else restore_candidate[page>>3]|=1<<(page&7);
465         if(head->reg32==0) {
466           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
467           if(ht_bin[0]==-1) {
468             ht_bin[1]=(int)head->addr;
469             ht_bin[0]=vaddr;
470           }else if(ht_bin[2]==-1) {
471             ht_bin[3]=(int)head->addr;
472             ht_bin[2]=vaddr;
473           }
474           //ht_bin[3]=ht_bin[1];
475           //ht_bin[2]=ht_bin[0];
476           //ht_bin[1]=(int)head->addr;
477           //ht_bin[0]=vaddr;
478         }
479         return head->addr;
480       }
481     }
482     head=head->next;
483   }
484   //printf("TRACE: count=%d next=%d (get_addr_32 no-match %x,flags %x)\n",Count,next_interupt,vaddr,flags);
485   int r=new_recompile_block(vaddr);
486   if(r==0) return get_addr(vaddr);
487   // Execute in unmapped page, generate pagefault exception
488   Status|=2;
489   Cause=(vaddr<<31)|0x8;
490   EPC=(vaddr&1)?vaddr-5:vaddr;
491   BadVAddr=(vaddr&~1);
492   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
493   EntryHi=BadVAddr&0xFFFFE000;
494   return get_addr_ht(0x80000000);
495 #endif
496 }
497
498 void clear_all_regs(signed char regmap[])
499 {
500   int hr;
501   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
502 }
503
504 signed char get_reg(signed char regmap[],int r)
505 {
506   int hr;
507   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
508   return -1;
509 }
510
511 // Find a register that is available for two consecutive cycles
512 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
513 {
514   int hr;
515   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
516   return -1;
517 }
518
519 int count_free_regs(signed char regmap[])
520 {
521   int count=0;
522   int hr;
523   for(hr=0;hr<HOST_REGS;hr++)
524   {
525     if(hr!=EXCLUDE_REG) {
526       if(regmap[hr]<0) count++;
527     }
528   }
529   return count;
530 }
531
532 void dirty_reg(struct regstat *cur,signed char reg)
533 {
534   int hr;
535   if(!reg) return;
536   for (hr=0;hr<HOST_REGS;hr++) {
537     if((cur->regmap[hr]&63)==reg) {
538       cur->dirty|=1<<hr;
539     }
540   }
541 }
542
543 // If we dirty the lower half of a 64 bit register which is now being
544 // sign-extended, we need to dump the upper half.
545 // Note: Do this only after completion of the instruction, because
546 // some instructions may need to read the full 64-bit value even if
547 // overwriting it (eg SLTI, DSRA32).
548 static void flush_dirty_uppers(struct regstat *cur)
549 {
550   int hr,reg;
551   for (hr=0;hr<HOST_REGS;hr++) {
552     if((cur->dirty>>hr)&1) {
553       reg=cur->regmap[hr];
554       if(reg>=64) 
555         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
556     }
557   }
558 }
559
560 void set_const(struct regstat *cur,signed char reg,uint64_t value)
561 {
562   int hr;
563   if(!reg) return;
564   for (hr=0;hr<HOST_REGS;hr++) {
565     if(cur->regmap[hr]==reg) {
566       cur->isconst|=1<<hr;
567       cur->constmap[hr]=value;
568     }
569     else if((cur->regmap[hr]^64)==reg) {
570       cur->isconst|=1<<hr;
571       cur->constmap[hr]=value>>32;
572     }
573   }
574 }
575
576 void clear_const(struct regstat *cur,signed char reg)
577 {
578   int hr;
579   if(!reg) return;
580   for (hr=0;hr<HOST_REGS;hr++) {
581     if((cur->regmap[hr]&63)==reg) {
582       cur->isconst&=~(1<<hr);
583     }
584   }
585 }
586
587 int is_const(struct regstat *cur,signed char reg)
588 {
589   int hr;
590   if(!reg) return 1;
591   for (hr=0;hr<HOST_REGS;hr++) {
592     if((cur->regmap[hr]&63)==reg) {
593       return (cur->isconst>>hr)&1;
594     }
595   }
596   return 0;
597 }
598 uint64_t get_const(struct regstat *cur,signed char reg)
599 {
600   int hr;
601   if(!reg) return 0;
602   for (hr=0;hr<HOST_REGS;hr++) {
603     if(cur->regmap[hr]==reg) {
604       return cur->constmap[hr];
605     }
606   }
607   printf("Unknown constant in r%d\n",reg);
608   exit(1);
609 }
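/* Illustrative sketch (not part of the original source): set_const/is_const/
   get_const implement per-host-register constant propagation.  For example,
   when "lui t0,0x1f80" is followed by "ori t0,t0,0x1010" the allocator can do
   roughly:

     set_const(cur,rt1[i],0x1f800000);      // LUI: result is a known constant
     ...
     if(is_const(cur,rs1[i])) {             // ORI: source is known
       uint64_t v=get_const(cur,rs1[i]);
       set_const(cur,rt1[i],v|imm[i]);      // fold the OR at recompile time
     }

   which matches how imm16_alloc() below folds ANDI/ORI/XORI immediates. */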
610
611 // Least soon needed registers
612 // Look at the next ten instructions and see which registers
613 // will be used.  Try not to reallocate these.
614 void lsn(u_char hsn[], int i, int *preferred_reg)
615 {
616   int j;
617   int b=-1;
618   for(j=0;j<9;j++)
619   {
620     if(i+j>=slen) {
621       j=slen-i-1;
622       break;
623     }
624     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
625     {
626       // Don't go past an unconditional jump
627       j++;
628       break;
629     }
630   }
631   for(;j>=0;j--)
632   {
633     if(rs1[i+j]) hsn[rs1[i+j]]=j;
634     if(rs2[i+j]) hsn[rs2[i+j]]=j;
635     if(rt1[i+j]) hsn[rt1[i+j]]=j;
636     if(rt2[i+j]) hsn[rt2[i+j]]=j;
637     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
638       // Stores can allocate zero
639       hsn[rs1[i+j]]=j;
640       hsn[rs2[i+j]]=j;
641     }
642     // On some architectures stores need invc_ptr
643     #if defined(HOST_IMM8)
644     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
645       hsn[INVCP]=j;
646     }
647     #endif
648     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
649     {
650       hsn[CCREG]=j;
651       b=j;
652     }
653   }
654   if(b>=0)
655   {
656     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
657     {
658       // Follow first branch
659       int t=(ba[i+b]-start)>>2;
660       j=7-b;if(t+j>=slen) j=slen-t-1;
661       for(;j>=0;j--)
662       {
663         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
664         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
665         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
666         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
667       }
668     }
669     // TODO: preferred register based on backward branch
670   }
671   // Delay slot should preferably not overwrite branch conditions or cycle count
672   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
673     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
674     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
675     hsn[CCREG]=1;
676     // ...or hash tables
677     hsn[RHASH]=1;
678     hsn[RHTBL]=1;
679   }
680   // Coprocessor load/store needs FTEMP, even if not declared
681   if(itype[i]==C1LS||itype[i]==C2LS) {
682     hsn[FTEMP]=0;
683   }
684   // Load L/R also uses FTEMP as a temporary register
685   if(itype[i]==LOADLR) {
686     hsn[FTEMP]=0;
687   }
688   // Also SWL/SWR/SDL/SDR
689   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
690     hsn[FTEMP]=0;
691   }
692   // Don't remove the TLB registers either
693   if(itype[i]==LOAD || itype[i]==LOADLR || itype[i]==STORE || itype[i]==STORELR || itype[i]==C1LS || itype[i]==C2LS) {
694     hsn[TLREG]=0;
695   }
696   // Don't remove the miniht registers
697   if(itype[i]==UJUMP||itype[i]==RJUMP)
698   {
699     hsn[RHASH]=0;
700     hsn[RHTBL]=0;
701   }
702 }
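/* Illustrative note (not part of the original source): hsn[] is indexed by
   guest register and answers "how soon is this register needed" - 0 means the
   current instruction needs it, larger values mean further away, and the
   memset(hsn,10,...) done by callers marks "not needed within the lookahead
   window".  Typical caller pattern, as in needed_again() below:

     u_char hsn[MAXREG+1];
     memset(hsn,10,sizeof(hsn));
     lsn(hsn,i,&preferred_reg);
     // prefer to evict the host register whose guest register has the largest hsn
*/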
703
704 // We only want to allocate registers if we're going to use them again soon
705 int needed_again(int r, int i)
706 {
707   int j;
708   int b=-1;
709   int rn=10;
710   int hr;
711   u_char hsn[MAXREG+1];
712   int preferred_reg;
713   
714   memset(hsn,10,sizeof(hsn));
715   lsn(hsn,i,&preferred_reg);
716   
717   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
718   {
719     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
720       return 0; // Don't need any registers if exiting the block
721   }
722   for(j=0;j<9;j++)
723   {
724     if(i+j>=slen) {
725       j=slen-i-1;
726       break;
727     }
728     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
729     {
730       // Don't go past an unconditional jump
731       j++;
732       break;
733     }
734     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||((source[i+j]&0xfc00003f)==0x0d))
735     {
736       break;
737     }
738   }
739   for(;j>=1;j--)
740   {
741     if(rs1[i+j]==r) rn=j;
742     if(rs2[i+j]==r) rn=j;
743     if((unneeded_reg[i+j]>>r)&1) rn=10;
744     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
745     {
746       b=j;
747     }
748   }
749   /*
750   if(b>=0)
751   {
752     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
753     {
754       // Follow first branch
755       int o=rn;
756       int t=(ba[i+b]-start)>>2;
757       j=7-b;if(t+j>=slen) j=slen-t-1;
758       for(;j>=0;j--)
759       {
760         if(!((unneeded_reg[t+j]>>r)&1)) {
761           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
762           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
763         }
764         else rn=o;
765       }
766     }
767   }*/
768   for(hr=0;hr<HOST_REGS;hr++) {
769     if(hr!=EXCLUDE_REG) {
770       if(rn<hsn[hr]) return 1;
771     }
772   }
773   return 0;
774 }
775
776 // Try to match register allocations at the end of a loop with those
777 // at the beginning
778 int loop_reg(int i, int r, int hr)
779 {
780   int j,k;
781   for(j=0;j<9;j++)
782   {
783     if(i+j>=slen) {
784       j=slen-i-1;
785       break;
786     }
787     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
788     {
789       // Don't go past an unconditional jump
790       j++;
791       break;
792     }
793   }
794   k=0;
795   if(i>0){
796     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
797       k--;
798   }
799   for(;k<j;k++)
800   {
801     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
802     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
803     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
804     {
805       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
806       {
807         int t=(ba[i+k]-start)>>2;
808         int reg=get_reg(regs[t].regmap_entry,r);
809         if(reg>=0) return reg;
810         //reg=get_reg(regs[t+1].regmap_entry,r);
811         //if(reg>=0) return reg;
812       }
813     }
814   }
815   return hr;
816 }
817
818
819 // Allocate every register, preserving source/target regs
820 void alloc_all(struct regstat *cur,int i)
821 {
822   int hr;
823   
824   for(hr=0;hr<HOST_REGS;hr++) {
825     if(hr!=EXCLUDE_REG) {
826       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
827          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
828       {
829         cur->regmap[hr]=-1;
830         cur->dirty&=~(1<<hr);
831       }
832       // Don't need zeros
833       if((cur->regmap[hr]&63)==0)
834       {
835         cur->regmap[hr]=-1;
836         cur->dirty&=~(1<<hr);
837       }
838     }
839   }
840 }
841
842
843 void div64(int64_t dividend,int64_t divisor)
844 {
845   lo=dividend/divisor;
846   hi=dividend%divisor;
847   //printf("TRACE: ddiv %8x%8x %8x%8x\n" ,(int)reg[HIREG],(int)(reg[HIREG]>>32)
848   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
849 }
850 void divu64(uint64_t dividend,uint64_t divisor)
851 {
852   lo=dividend/divisor;
853   hi=dividend%divisor;
854   //printf("TRACE: ddivu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
855   //                                     ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
856 }
857
858 void mult64(int64_t m1,int64_t m2) // signed operands: the m1<0 / m2<0 tests below need a signed type
859 {
860    unsigned long long int op1, op2, op3, op4;
861    unsigned long long int result1, result2, result3, result4;
862    unsigned long long int temp1, temp2, temp3, temp4;
863    int sign = 0;
864    
865    if (m1 < 0)
866      {
867     op2 = -m1;
868     sign = 1 - sign;
869      }
870    else op2 = m1;
871    if (m2 < 0)
872      {
873     op4 = -m2;
874     sign = 1 - sign;
875      }
876    else op4 = m2;
877    
878    op1 = op2 & 0xFFFFFFFF;
879    op2 = (op2 >> 32) & 0xFFFFFFFF;
880    op3 = op4 & 0xFFFFFFFF;
881    op4 = (op4 >> 32) & 0xFFFFFFFF;
882    
883    temp1 = op1 * op3;
884    temp2 = (temp1 >> 32) + op1 * op4;
885    temp3 = op2 * op3;
886    temp4 = (temp3 >> 32) + op2 * op4;
887    
888    result1 = temp1 & 0xFFFFFFFF;
889    result2 = temp2 + (temp3 & 0xFFFFFFFF);
890    result3 = (result2 >> 32) + temp4;
891    result4 = (result3 >> 32);
892    
893    lo = result1 | (result2 << 32);
894    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
895    if (sign)
896      {
897     hi = ~hi;
898     if (!lo) hi++;
899     else lo = ~lo + 1;
900      }
901 }
902
903 void multu64(uint64_t m1,uint64_t m2)
904 {
905    unsigned long long int op1, op2, op3, op4;
906    unsigned long long int result1, result2, result3, result4;
907    unsigned long long int temp1, temp2, temp3, temp4;
908    
909    op1 = m1 & 0xFFFFFFFF;
910    op2 = (m1 >> 32) & 0xFFFFFFFF;
911    op3 = m2 & 0xFFFFFFFF;
912    op4 = (m2 >> 32) & 0xFFFFFFFF;
913    
914    temp1 = op1 * op3;
915    temp2 = (temp1 >> 32) + op1 * op4;
916    temp3 = op2 * op3;
917    temp4 = (temp3 >> 32) + op2 * op4;
918    
919    result1 = temp1 & 0xFFFFFFFF;
920    result2 = temp2 + (temp3 & 0xFFFFFFFF);
921    result3 = (result2 >> 32) + temp4;
922    result4 = (result3 >> 32);
923    
924    lo = result1 | (result2 << 32);
925    hi = (result3 & 0xFFFFFFFF) | (result4 << 32);
926    
927   //printf("TRACE: dmultu %8x%8x %8x%8x\n",(int)reg[HIREG],(int)(reg[HIREG]>>32)
928   //                                      ,(int)reg[LOREG],(int)(reg[LOREG]>>32));
929 }
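/* Illustrative cross-check (not part of the original source): on compilers
   that provide the unsigned __int128 extension (GCC/Clang), the 64x64->128
   partial-product sum above can be verified against a one-liner.  Kept
   disabled; multu64_check is a hypothetical helper. */
#if 0
static void multu64_check(uint64_t m1,uint64_t m2)
{
  unsigned __int128 p=(unsigned __int128)m1*m2;
  multu64(m1,m2);
  assert(lo==(uint64_t)p&&hi==(uint64_t)(p>>64));
}
#endif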
930
931 uint64_t ldl_merge(uint64_t original,uint64_t loaded,u_int bits)
932 {
933   if(bits) {
934     original<<=64-bits;
935     original>>=64-bits;
936     loaded<<=bits;
937     original|=loaded;
938   }
939   else original=loaded;
940   return original;
941 }
942 uint64_t ldr_merge(uint64_t original,uint64_t loaded,u_int bits)
943 {
944   if(bits^56) {
945     original>>=64-(bits^56);
946     original<<=64-(bits^56);
947     loaded>>=bits^56;
948     original|=loaded;
949   }
950   else original=loaded;
951   return original;
952 }
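/* Worked example (not part of the original source): ldl_merge() keeps the low
   'bits' bits of the original register value and splices the loaded doubleword
   in above them.  For instance with bits=24:

     original = 0x1122334455667788, loaded = 0xAABBCCDDEEFF0011
     ldl_merge(original,loaded,24)
       = (original & 0xFFFFFF) | (loaded<<24)
       = 0xDDEEFF0011667788

   ldr_merge() is the mirror image, keeping the high bits of the original. */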
953
954 #ifdef __i386__
955 #include "assem_x86.c"
956 #endif
957 #ifdef __x86_64__
958 #include "assem_x64.c"
959 #endif
960 #ifdef __arm__
961 #include "assem_arm.c"
962 #endif
963
964 // Add virtual address mapping to linked list
965 void ll_add(struct ll_entry **head,int vaddr,void *addr)
966 {
967   struct ll_entry *new_entry;
968   new_entry=malloc(sizeof(struct ll_entry));
969   assert(new_entry!=NULL);
970   new_entry->vaddr=vaddr;
971   new_entry->reg32=0;
972   new_entry->addr=addr;
973   new_entry->next=*head;
974   *head=new_entry;
975 }
976
977 // Add virtual address mapping for 32-bit compiled block
978 void ll_add_32(struct ll_entry **head,int vaddr,u_int reg32,void *addr)
979 {
980   ll_add(head,vaddr,addr);
981 #ifndef FORCE32
982   (*head)->reg32=reg32;
983 #endif
984 }
985
986 // Check if an address is already compiled
987 // but don't return addresses which are about to expire from the cache
988 void *check_addr(u_int vaddr)
989 {
990   u_int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
991   if(ht_bin[0]==vaddr) {
992     if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
993       if(isclean(ht_bin[1])) return (void *)ht_bin[1];
994   }
995   if(ht_bin[2]==vaddr) {
996     if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
997       if(isclean(ht_bin[3])) return (void *)ht_bin[3];
998   }
999   u_int page=get_page(vaddr);
1000   struct ll_entry *head;
1001   head=jump_in[page];
1002   while(head!=NULL) {
1003     if(head->vaddr==vaddr&&head->reg32==0) {
1004       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1005         // Update existing entry with current address
1006         if(ht_bin[0]==vaddr) {
1007           ht_bin[1]=(int)head->addr;
1008           return head->addr;
1009         }
1010         if(ht_bin[2]==vaddr) {
1011           ht_bin[3]=(int)head->addr;
1012           return head->addr;
1013         }
1014         // Insert into hash table with low priority.
1015         // Don't evict existing entries, as they are probably
1016         // addresses that are being accessed frequently.
1017         if(ht_bin[0]==-1) {
1018           ht_bin[1]=(int)head->addr;
1019           ht_bin[0]=vaddr;
1020         }else if(ht_bin[2]==-1) {
1021           ht_bin[3]=(int)head->addr;
1022           ht_bin[2]=vaddr;
1023         }
1024         return head->addr;
1025       }
1026     }
1027     head=head->next;
1028   }
1029   return 0;
1030 }
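/* Illustrative note (not part of the original source): the repeated test
     (((u_int)addr-(u_int)out)<<(32-TARGET_SIZE_2)) > 0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))
   computes a block's distance ahead of the current output pointer, wrapped to
   the size of the translation cache and scaled into the full 32-bit range.
   Blocks whose wrapped distance falls below that threshold are the next ones
   the expiry sweep will reclaim, so check_addr()/get_addr() refuse to hand
   them out or restore them. */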
1031
1032 void remove_hash(int vaddr)
1033 {
1034   //printf("remove hash: %x\n",vaddr);
1035   int *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
1036   if(ht_bin[2]==vaddr) {
1037     ht_bin[2]=ht_bin[3]=-1;
1038   }
1039   if(ht_bin[0]==vaddr) {
1040     ht_bin[0]=ht_bin[2];
1041     ht_bin[1]=ht_bin[3];
1042     ht_bin[2]=ht_bin[3]=-1;
1043   }
1044 }
1045
1046 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
1047 {
1048   struct ll_entry *next;
1049   while(*head) {
1050     if(((u_int)((*head)->addr)>>shift)==(addr>>shift) || 
1051        ((u_int)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
1052     {
1053       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
1054       remove_hash((*head)->vaddr);
1055       next=(*head)->next;
1056       free(*head);
1057       *head=next;
1058     }
1059     else
1060     {
1061       head=&((*head)->next);
1062     }
1063   }
1064 }
1065
1066 // Remove all entries from linked list
1067 void ll_clear(struct ll_entry **head)
1068 {
1069   struct ll_entry *cur;
1070   struct ll_entry *next;
1071   if((cur=*head)) {
1072     *head=0;
1073     while(cur) {
1074       next=cur->next;
1075       free(cur);
1076       cur=next;
1077     }
1078   }
1079 }
1080
1081 // Dereference the pointers and remove if it matches
1082 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
1083 {
1084   u_int old_host_addr=0;
1085   while(head) {
1086     int ptr=get_pointer(head->addr);
1087     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
1088     if(((ptr>>shift)==(addr>>shift)) ||
1089        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
1090     {
1091       printf("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
1092       u_int host_addr=(u_int)kill_pointer(head->addr);
1093
1094       if((host_addr>>12)!=(old_host_addr>>12)) {
1095         #ifdef __arm__
1096         __clear_cache((void *)(old_host_addr&~0xfff),(void *)(old_host_addr|0xfff));
1097         #endif
1098         old_host_addr=host_addr;
1099       }
1100     }
1101     head=head->next;
1102   }
1103   #ifdef __arm__
1104   if (old_host_addr)
1105     __clear_cache((void *)(old_host_addr&~0xfff),(void *)(old_host_addr|0xfff));
1106   #endif
1107 }
1108
1109 // This is called when we write to a compiled block (see do_invstub)
1110 void invalidate_page(u_int page)
1111 {
1112   struct ll_entry *head;
1113   struct ll_entry *next;
1114   u_int old_host_addr=0;
1115   head=jump_in[page];
1116   jump_in[page]=0;
1117   while(head!=NULL) {
1118     inv_debug("INVALIDATE: %x\n",head->vaddr);
1119     remove_hash(head->vaddr);
1120     next=head->next;
1121     free(head);
1122     head=next;
1123   }
1124   head=jump_out[page];
1125   jump_out[page]=0;
1126   while(head!=NULL) {
1127     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
1128     u_int host_addr=(u_int)kill_pointer(head->addr);
1129
1130     if((host_addr>>12)!=(old_host_addr>>12)) {
1131       #ifdef __arm__
1132       __clear_cache((void *)(old_host_addr&~0xfff),(void *)(old_host_addr|0xfff));
1133       #endif
1134       old_host_addr=host_addr;
1135     }
1136     next=head->next;
1137     free(head);
1138     head=next;
1139   }
1140   #ifdef __arm__
1141   if (old_host_addr)
1142     __clear_cache((void *)(old_host_addr&~0xfff),(void *)(old_host_addr|0xfff));
1143   #endif
1144 }
1145 void invalidate_block(u_int block)
1146 {
1147   u_int page=get_page(block<<12);
1148   u_int vpage=get_vpage(block<<12);
1149   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1150   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1151   u_int first,last;
1152   first=last=page;
1153   struct ll_entry *head;
1154   head=jump_dirty[vpage];
1155   //printf("page=%d vpage=%d\n",page,vpage);
1156   while(head!=NULL) {
1157     u_int start,end;
1158     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1159       get_bounds((int)head->addr,&start,&end);
1160       //printf("start: %x end: %x\n",start,end);
1161       if(page<2048&&start>=0x80000000&&end<0x80000000+RAM_SIZE) {
1162         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1163           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1164           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1165         }
1166       }
1167 #ifndef DISABLE_TLB
1168       if(page<2048&&(signed int)start>=(signed int)0xC0000000&&(signed int)end>=(signed int)0xC0000000) {
1169         if(((start+memory_map[start>>12]-(u_int)rdram)>>12)<=page&&((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)>=page) {
1170           if((((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047)<first) first=((start+memory_map[start>>12]-(u_int)rdram)>>12)&2047;
1171           if((((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047)>last) last=((end-1+memory_map[(end-1)>>12]-(u_int)rdram)>>12)&2047;
1172         }
1173       }
1174 #endif
1175     }
1176     head=head->next;
1177   }
1178   //printf("first=%d last=%d\n",first,last);
1179   invalidate_page(page);
1180   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
1181   assert(last<page+5);
1182   // Invalidate the adjacent pages if a block crosses a 4K boundary
1183   while(first<page) {
1184     invalidate_page(first);
1185     first++;
1186   }
1187   for(first=page+1;first<last;first++) {
1188     invalidate_page(first);
1189   }
1190   
1191   // Don't trap writes
1192   invalid_code[block]=1;
1193 #ifndef DISABLE_TLB
1194   // If there is a valid TLB entry for this page, remove write protect
1195   if(tlb_LUT_w[block]) {
1196     assert(tlb_LUT_r[block]==tlb_LUT_w[block]);
1197     // CHECK: Is this right?
1198     memory_map[block]=((tlb_LUT_w[block]&0xFFFFF000)-(block<<12)+(unsigned int)rdram-0x80000000)>>2;
1199     u_int real_block=tlb_LUT_w[block]>>12;
1200     invalid_code[real_block]=1;
1201     if(real_block>=0x80000&&real_block<0x80800) memory_map[real_block]=((u_int)rdram-0x80000000)>>2;
1202   }
1203   else if(block>=0x80000&&block<0x80800) memory_map[block]=((u_int)rdram-0x80000000)>>2;
1204 #endif
1205
1206   #ifdef USE_MINI_HT
1207   memset(mini_ht,-1,sizeof(mini_ht));
1208   #endif
1209 }
1210 void invalidate_addr(u_int addr)
1211 {
1212   invalidate_block(addr>>12);
1213 }
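/* Illustrative note (not part of the original source): these entry points are
   how self-modifying code is handled.  When a write hits a page that still has
   compiled blocks (invalid_code[page]==0), the write handlers / generated
   invstub code call invalidate_addr()/invalidate_block(), which tear down the
   jump_in/jump_out lists for the affected 4K page(s) and set invalid_code[] so
   further writes to the page are no longer trapped.  A hypothetical caller,
   just to show the shape of the check:

     if(!invalid_code[addr>>12]) invalidate_addr(addr);  // addr = guest address written
*/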
1214 void invalidate_all_pages()
1215 {
1216   u_int page,n;
1217   for(page=0;page<4096;page++)
1218     invalidate_page(page);
1219   for(page=0;page<1048576;page++)
1220     if(!invalid_code[page]) {
1221       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1222       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1223     }
1224   #ifdef __arm__
1225   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1226   #endif
1227   #ifdef USE_MINI_HT
1228   memset(mini_ht,-1,sizeof(mini_ht));
1229   #endif
1230   #ifndef DISABLE_TLB
1231   // TLB
1232   for(page=0;page<0x100000;page++) {
1233     if(tlb_LUT_r[page]) {
1234       memory_map[page]=((tlb_LUT_r[page]&0xFFFFF000)-(page<<12)+(unsigned int)rdram-0x80000000)>>2;
1235       if(!tlb_LUT_w[page]||!invalid_code[page])
1236         memory_map[page]|=0x40000000; // Write protect
1237     }
1238     else memory_map[page]=-1;
1239     if(page==0x80000) page=0xC0000;
1240   }
1241   tlb_hacks();
1242   #endif
1243 }
1244
1245 // Add an entry to jump_out after making a link
1246 void add_link(u_int vaddr,void *src)
1247 {
1248   u_int page=get_page(vaddr);
1249   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1250   ll_add(jump_out+page,vaddr,src);
1251   //int ptr=get_pointer(src);
1252   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1253 }
1254
1255 // If a code block was found to be unmodified (bit was set in
1256 // restore_candidate) and it remains unmodified (bit is clear
1257 // in invalid_code) then move the entries for that 4K page from
1258 // the dirty list to the clean list.
1259 void clean_blocks(u_int page)
1260 {
1261   struct ll_entry *head;
1262   inv_debug("INV: clean_blocks page=%d\n",page);
1263   head=jump_dirty[page];
1264   while(head!=NULL) {
1265     if(!invalid_code[head->vaddr>>12]) {
1266       // Don't restore blocks which are about to expire from the cache
1267       if((((u_int)head->addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1268         u_int start,end;
1269         if(verify_dirty((int)head->addr)) {
1270           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1271           u_int i;
1272           u_int inv=0;
1273           get_bounds((int)head->addr,&start,&end);
1274           if(start-(u_int)rdram<RAM_SIZE) {
1275             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1276               inv|=invalid_code[i];
1277             }
1278           }
1279           if((signed int)head->vaddr>=(signed int)0xC0000000) {
1280             u_int addr = (head->vaddr+(memory_map[head->vaddr>>12]<<2));
1281             //printf("addr=%x start=%x end=%x\n",addr,start,end);
1282             if(addr<start||addr>=end) inv=1;
1283           }
1284           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1285             inv=1;
1286           }
1287           if(!inv) {
1288             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1289             if((((u_int)clean_addr-(u_int)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1290               u_int ppage=page;
1291 #ifndef DISABLE_TLB
1292               if(page<2048&&tlb_LUT_r[head->vaddr>>12]) ppage=(tlb_LUT_r[head->vaddr>>12]^0x80000000)>>12;
1293 #endif
1294               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1295               //printf("page=%x, addr=%x\n",page,head->vaddr);
1296               //assert(head->vaddr>>12==(page|0x80000));
1297               ll_add_32(jump_in+ppage,head->vaddr,head->reg32,clean_addr);
1298               int *ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1299               if(!head->reg32) {
1300                 if(ht_bin[0]==head->vaddr) {
1301                   ht_bin[1]=(int)clean_addr; // Replace existing entry
1302                 }
1303                 if(ht_bin[2]==head->vaddr) {
1304                   ht_bin[3]=(int)clean_addr; // Replace existing entry
1305                 }
1306               }
1307             }
1308           }
1309         }
1310       }
1311     }
1312     head=head->next;
1313   }
1314 }
1315
1316
1317 void mov_alloc(struct regstat *current,int i)
1318 {
1319   // Note: Don't need to actually alloc the source registers
1320   if((~current->is32>>rs1[i])&1) {
1321     //alloc_reg64(current,i,rs1[i]);
1322     alloc_reg64(current,i,rt1[i]);
1323     current->is32&=~(1LL<<rt1[i]);
1324   } else {
1325     //alloc_reg(current,i,rs1[i]);
1326     alloc_reg(current,i,rt1[i]);
1327     current->is32|=(1LL<<rt1[i]);
1328   }
1329   clear_const(current,rs1[i]);
1330   clear_const(current,rt1[i]);
1331   dirty_reg(current,rt1[i]);
1332 }
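/* Illustrative note (not part of the original source): the *_alloc() routines
   below run at recompile time, one per decoded instruction.  Each one reserves
   host registers for the guest sources/targets and updates the 32/64-bit and
   constant tracking.  The common pattern, for some guest target rt:

     alloc_reg(current,i,rt);      // reserve a host register for rt
     current->is32|=1LL<<rt;       // result is 32-bit sign-extended
     clear_const(current,rt);      // result is no longer a known constant
     dirty_reg(current,rt);        // must eventually be written back
*/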
1333
1334 void shiftimm_alloc(struct regstat *current,int i)
1335 {
1336   clear_const(current,rs1[i]);
1337   clear_const(current,rt1[i]);
1338   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1339   {
1340     if(rt1[i]) {
1341       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1342       else lt1[i]=rs1[i];
1343       alloc_reg(current,i,rt1[i]);
1344       current->is32|=1LL<<rt1[i];
1345       dirty_reg(current,rt1[i]);
1346     }
1347   }
1348   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1349   {
1350     if(rt1[i]) {
1351       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1352       alloc_reg64(current,i,rt1[i]);
1353       current->is32&=~(1LL<<rt1[i]);
1354       dirty_reg(current,rt1[i]);
1355     }
1356   }
1357   if(opcode2[i]==0x3c) // DSLL32
1358   {
1359     if(rt1[i]) {
1360       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1361       alloc_reg64(current,i,rt1[i]);
1362       current->is32&=~(1LL<<rt1[i]);
1363       dirty_reg(current,rt1[i]);
1364     }
1365   }
1366   if(opcode2[i]==0x3e) // DSRL32
1367   {
1368     if(rt1[i]) {
1369       alloc_reg64(current,i,rs1[i]);
1370       if(imm[i]==32) {
1371         alloc_reg64(current,i,rt1[i]);
1372         current->is32&=~(1LL<<rt1[i]);
1373       } else {
1374         alloc_reg(current,i,rt1[i]);
1375         current->is32|=1LL<<rt1[i];
1376       }
1377       dirty_reg(current,rt1[i]);
1378     }
1379   }
1380   if(opcode2[i]==0x3f) // DSRA32
1381   {
1382     if(rt1[i]) {
1383       alloc_reg64(current,i,rs1[i]);
1384       alloc_reg(current,i,rt1[i]);
1385       current->is32|=1LL<<rt1[i];
1386       dirty_reg(current,rt1[i]);
1387     }
1388   }
1389 }
1390
1391 void shift_alloc(struct regstat *current,int i)
1392 {
1393   if(rt1[i]) {
1394     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1395     {
1396       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1397       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1398       alloc_reg(current,i,rt1[i]);
1399       if(rt1[i]==rs2[i]) alloc_reg_temp(current,i,-1);
1400       current->is32|=1LL<<rt1[i];
1401     } else { // DSLLV/DSRLV/DSRAV
1402       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1403       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1404       alloc_reg64(current,i,rt1[i]);
1405       current->is32&=~(1LL<<rt1[i]);
1406       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1407         alloc_reg_temp(current,i,-1);
1408     }
1409     clear_const(current,rs1[i]);
1410     clear_const(current,rs2[i]);
1411     clear_const(current,rt1[i]);
1412     dirty_reg(current,rt1[i]);
1413   }
1414 }
1415
1416 void alu_alloc(struct regstat *current,int i)
1417 {
1418   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1419     if(rt1[i]) {
1420       if(rs1[i]&&rs2[i]) {
1421         alloc_reg(current,i,rs1[i]);
1422         alloc_reg(current,i,rs2[i]);
1423       }
1424       else {
1425         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1426         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1427       }
1428       alloc_reg(current,i,rt1[i]);
1429     }
1430     current->is32|=1LL<<rt1[i];
1431   }
1432   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1433     if(rt1[i]) {
1434       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1435       {
1436         alloc_reg64(current,i,rs1[i]);
1437         alloc_reg64(current,i,rs2[i]);
1438         alloc_reg(current,i,rt1[i]);
1439       } else {
1440         alloc_reg(current,i,rs1[i]);
1441         alloc_reg(current,i,rs2[i]);
1442         alloc_reg(current,i,rt1[i]);
1443       }
1444     }
1445     current->is32|=1LL<<rt1[i];
1446   }
1447   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1448     if(rt1[i]) {
1449       if(rs1[i]&&rs2[i]) {
1450         alloc_reg(current,i,rs1[i]);
1451         alloc_reg(current,i,rs2[i]);
1452       }
1453       else
1454       {
1455         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1456         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1457       }
1458       alloc_reg(current,i,rt1[i]);
1459       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1460       {
1461         if(!((current->uu>>rt1[i])&1)) {
1462           alloc_reg64(current,i,rt1[i]);
1463         }
1464         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1465           if(rs1[i]&&rs2[i]) {
1466             alloc_reg64(current,i,rs1[i]);
1467             alloc_reg64(current,i,rs2[i]);
1468           }
1469           else
1470           {
1471             // Is it really worth it to keep 64-bit values in registers?
1472             #ifdef NATIVE_64BIT
1473             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1474             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1475             #endif
1476           }
1477         }
1478         current->is32&=~(1LL<<rt1[i]);
1479       } else {
1480         current->is32|=1LL<<rt1[i];
1481       }
1482     }
1483   }
1484   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1485     if(rt1[i]) {
1486       if(rs1[i]&&rs2[i]) {
1487         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1488           alloc_reg64(current,i,rs1[i]);
1489           alloc_reg64(current,i,rs2[i]);
1490           alloc_reg64(current,i,rt1[i]);
1491         } else {
1492           alloc_reg(current,i,rs1[i]);
1493           alloc_reg(current,i,rs2[i]);
1494           alloc_reg(current,i,rt1[i]);
1495         }
1496       }
1497       else {
1498         alloc_reg(current,i,rt1[i]);
1499         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1500           // DADD used as move, or zeroing
1501           // If we have a 64-bit source, then make the target 64 bits too
1502           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1503             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1504             alloc_reg64(current,i,rt1[i]);
1505           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1506             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1507             alloc_reg64(current,i,rt1[i]);
1508           }
1509           if(opcode2[i]>=0x2e&&rs2[i]) {
1510             // DSUB used as negation - 64-bit result
1511             // If we have a 32-bit register, extend it to 64 bits
1512             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1513             alloc_reg64(current,i,rt1[i]);
1514           }
1515         }
1516       }
1517       if(rs1[i]&&rs2[i]) {
1518         current->is32&=~(1LL<<rt1[i]);
1519       } else if(rs1[i]) {
1520         current->is32&=~(1LL<<rt1[i]);
1521         if((current->is32>>rs1[i])&1)
1522           current->is32|=1LL<<rt1[i];
1523       } else if(rs2[i]) {
1524         current->is32&=~(1LL<<rt1[i]);
1525         if((current->is32>>rs2[i])&1)
1526           current->is32|=1LL<<rt1[i];
1527       } else {
1528         current->is32|=1LL<<rt1[i];
1529       }
1530     }
1531   }
1532   clear_const(current,rs1[i]);
1533   clear_const(current,rs2[i]);
1534   clear_const(current,rt1[i]);
1535   dirty_reg(current,rt1[i]);
1536 }
1537
1538 void imm16_alloc(struct regstat *current,int i)
1539 {
1540   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1541   else lt1[i]=rs1[i];
1542   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1543   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1544     current->is32&=~(1LL<<rt1[i]);
1545     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1546       // TODO: Could preserve the 32-bit flag if the immediate is zero
1547       alloc_reg64(current,i,rt1[i]);
1548       alloc_reg64(current,i,rs1[i]);
1549     }
1550     clear_const(current,rs1[i]);
1551     clear_const(current,rt1[i]);
1552   }
1553   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1554     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1555     current->is32|=1LL<<rt1[i];
1556     clear_const(current,rs1[i]);
1557     clear_const(current,rt1[i]);
1558   }
1559   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1560     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1561       if(rs1[i]!=rt1[i]) {
1562         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1563         alloc_reg64(current,i,rt1[i]);
1564         current->is32&=~(1LL<<rt1[i]);
1565       }
1566     }
1567     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1568     if(is_const(current,rs1[i])) {
1569       int v=get_const(current,rs1[i]);
1570       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1571       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1572       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1573     }
1574     else clear_const(current,rt1[i]);
1575   }
1576   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1577     if(is_const(current,rs1[i])) {
1578       int v=get_const(current,rs1[i]);
1579       set_const(current,rt1[i],v+imm[i]);
1580     }
1581     else clear_const(current,rt1[i]);
1582     current->is32|=1LL<<rt1[i];
1583   }
1584   else {
1585     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1586     current->is32|=1LL<<rt1[i];
1587   }
1588   dirty_reg(current,rt1[i]);
1589 }
1590
1591 void load_alloc(struct regstat *current,int i)
1592 {
1593   clear_const(current,rt1[i]);
1594   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1595   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1596   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1597   if(rt1[i]) {
1598     alloc_reg(current,i,rt1[i]);
1599     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1600     {
1601       current->is32&=~(1LL<<rt1[i]);
1602       alloc_reg64(current,i,rt1[i]);
1603     }
1604     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1605     {
1606       current->is32&=~(1LL<<rt1[i]);
1607       alloc_reg64(current,i,rt1[i]);
1608       alloc_all(current,i);
1609       alloc_reg64(current,i,FTEMP);
1610     }
1611     else current->is32|=1LL<<rt1[i];
1612     dirty_reg(current,rt1[i]);
1613     // If using TLB, need a register for pointer to the mapping table
1614     if(using_tlb) alloc_reg(current,i,TLREG);
1615     // LWL/LWR need a temporary register for the old value
1616     if(opcode[i]==0x22||opcode[i]==0x26)
1617     {
1618       alloc_reg(current,i,FTEMP);
1619       alloc_reg_temp(current,i,-1);
1620     }
1621   }
1622   else
1623   {
1624     // Load to r0 (dummy load)
1625     // but we still need a register to calculate the address
1626     alloc_reg_temp(current,i,-1);
1627   }
1628 }
1629
1630 void store_alloc(struct regstat *current,int i)
1631 {
1632   clear_const(current,rs2[i]);
1633   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1634   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1635   alloc_reg(current,i,rs2[i]);
1636   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1637     alloc_reg64(current,i,rs2[i]);
1638     if(rs2[i]) alloc_reg(current,i,FTEMP);
1639   }
1640   // If using TLB, need a register for pointer to the mapping table
1641   if(using_tlb) alloc_reg(current,i,TLREG);
1642   #if defined(HOST_IMM8)
1643   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1644   else alloc_reg(current,i,INVCP);
1645   #endif
1646   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1647     alloc_reg(current,i,FTEMP);
1648   }
1649   // We need a temporary register for address generation
1650   alloc_reg_temp(current,i,-1);
1651 }
1652
1653 void c1ls_alloc(struct regstat *current,int i)
1654 {
1655   //clear_const(current,rs1[i]); // FIXME
1656   clear_const(current,rt1[i]);
1657   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1658   alloc_reg(current,i,CSREG); // Status
1659   alloc_reg(current,i,FTEMP);
1660   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1661     alloc_reg64(current,i,FTEMP);
1662   }
1663   // If using TLB, need a register for pointer to the mapping table
1664   if(using_tlb) alloc_reg(current,i,TLREG);
1665   #if defined(HOST_IMM8)
1666   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1667   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1668     alloc_reg(current,i,INVCP);
1669   #endif
1670   // We need a temporary register for address generation
1671   alloc_reg_temp(current,i,-1);
1672 }
1673
1674 void c2ls_alloc(struct regstat *current,int i)
1675 {
1676   clear_const(current,rt1[i]);
1677   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1678   alloc_reg(current,i,FTEMP);
1679   // If using TLB, need a register for pointer to the mapping table
1680   if(using_tlb) alloc_reg(current,i,TLREG);
1681   #if defined(HOST_IMM8)
1682   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1683   else if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1684     alloc_reg(current,i,INVCP);
1685   #endif
1686   // We need a temporary register for address generation
1687   alloc_reg_temp(current,i,-1);
1688 }
1689
1690 #ifndef multdiv_alloc
1691 void multdiv_alloc(struct regstat *current,int i)
1692 {
1693   //  case 0x18: MULT
1694   //  case 0x19: MULTU
1695   //  case 0x1A: DIV
1696   //  case 0x1B: DIVU
1697   //  case 0x1C: DMULT
1698   //  case 0x1D: DMULTU
1699   //  case 0x1E: DDIV
1700   //  case 0x1F: DDIVU
1701   clear_const(current,rs1[i]);
1702   clear_const(current,rs2[i]);
1703   if(rs1[i]&&rs2[i])
1704   {
1705     if((opcode2[i]&4)==0) // 32-bit
1706     {
1707       current->u&=~(1LL<<HIREG);
1708       current->u&=~(1LL<<LOREG);
1709       alloc_reg(current,i,HIREG);
1710       alloc_reg(current,i,LOREG);
1711       alloc_reg(current,i,rs1[i]);
1712       alloc_reg(current,i,rs2[i]);
1713       current->is32|=1LL<<HIREG;
1714       current->is32|=1LL<<LOREG;
1715       dirty_reg(current,HIREG);
1716       dirty_reg(current,LOREG);
1717     }
1718     else // 64-bit
1719     {
1720       current->u&=~(1LL<<HIREG);
1721       current->u&=~(1LL<<LOREG);
1722       current->uu&=~(1LL<<HIREG);
1723       current->uu&=~(1LL<<LOREG);
1724       alloc_reg64(current,i,HIREG);
1725       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1726       alloc_reg64(current,i,rs1[i]);
1727       alloc_reg64(current,i,rs2[i]);
1728       alloc_all(current,i);
1729       current->is32&=~(1LL<<HIREG);
1730       current->is32&=~(1LL<<LOREG);
1731       dirty_reg(current,HIREG);
1732       dirty_reg(current,LOREG);
1733     }
1734   }
1735   else
1736   {
1737     // Multiply by zero is zero.
1738     // MIPS does not have a divide-by-zero exception;
1739     // the result is undefined, so we return zero.
1740     alloc_reg(current,i,HIREG);
1741     alloc_reg(current,i,LOREG);
1742     current->is32|=1LL<<HIREG;
1743     current->is32|=1LL<<LOREG;
1744     dirty_reg(current,HIREG);
1745     dirty_reg(current,LOREG);
1746   }
1747 }
1748 #endif
1749
1750 void cop0_alloc(struct regstat *current,int i)
1751 {
1752   if(opcode2[i]==0) // MFC0
1753   {
1754     if(rt1[i]) {
1755       clear_const(current,rt1[i]);
1756       alloc_all(current,i);
1757       alloc_reg(current,i,rt1[i]);
1758       current->is32|=1LL<<rt1[i];
1759       dirty_reg(current,rt1[i]);
1760     }
1761   }
1762   else if(opcode2[i]==4) // MTC0
1763   {
1764     if(rs1[i]){
1765       clear_const(current,rs1[i]);
1766       alloc_reg(current,i,rs1[i]);
1767       alloc_all(current,i);
1768     }
1769     else {
1770       alloc_all(current,i); // FIXME: Keep r0
1771       current->u&=~1LL;
1772       alloc_reg(current,i,0);
1773     }
1774   }
1775   else
1776   {
1777     // TLBR/TLBWI/TLBWR/TLBP/ERET
1778     assert(opcode2[i]==0x10);
1779     alloc_all(current,i);
1780   }
1781 }
1782
1783 void cop1_alloc(struct regstat *current,int i)
1784 {
1785   alloc_reg(current,i,CSREG); // Load status
1786   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1787   {
1788     assert(rt1[i]);
1789     clear_const(current,rt1[i]);
1790     if(opcode2[i]==1) {
1791       alloc_reg64(current,i,rt1[i]); // DMFC1
1792       current->is32&=~(1LL<<rt1[i]);
1793     }else{
1794       alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1795       current->is32|=1LL<<rt1[i];
1796     }
1797     dirty_reg(current,rt1[i]);
1798     alloc_reg_temp(current,i,-1);
1799   }
1800   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1801   {
1802     if(rs1[i]){
1803       clear_const(current,rs1[i]);
1804       if(opcode2[i]==5)
1805         alloc_reg64(current,i,rs1[i]); // DMTC1
1806       else
1807         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1808       alloc_reg_temp(current,i,-1);
1809     }
1810     else {
1811       current->u&=~1LL;
1812       alloc_reg(current,i,0);
1813       alloc_reg_temp(current,i,-1);
1814     }
1815   }
1816 }
1817 void fconv_alloc(struct regstat *current,int i)
1818 {
1819   alloc_reg(current,i,CSREG); // Load status
1820   alloc_reg_temp(current,i,-1);
1821 }
1822 void float_alloc(struct regstat *current,int i)
1823 {
1824   alloc_reg(current,i,CSREG); // Load status
1825   alloc_reg_temp(current,i,-1);
1826 }
1827 void c2op_alloc(struct regstat *current,int i)
1828 {
1829   alloc_reg_temp(current,i,-1);
1830 }
1831 void fcomp_alloc(struct regstat *current,int i)
1832 {
1833   alloc_reg(current,i,CSREG); // Load status
1834   alloc_reg(current,i,FSREG); // Load flags
1835   dirty_reg(current,FSREG); // Flag will be modified
1836   alloc_reg_temp(current,i,-1);
1837 }
1838
1839 void syscall_alloc(struct regstat *current,int i)
1840 {
1841   alloc_cc(current,i);
1842   dirty_reg(current,CCREG);
1843   alloc_all(current,i);
1844   current->isconst=0;
1845 }
1846
1847 void delayslot_alloc(struct regstat *current,int i)
1848 {
1849   switch(itype[i]) {
1850     case UJUMP:
1851     case CJUMP:
1852     case SJUMP:
1853     case RJUMP:
1854     case FJUMP:
1855     case SYSCALL:
1856     case HLECALL:
1857     case SPAN:
1858       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1859       printf("Disabled speculative precompilation\n");
1860       stop_after_jal=1;
1861       break;
1862     case IMM16:
1863       imm16_alloc(current,i);
1864       break;
1865     case LOAD:
1866     case LOADLR:
1867       load_alloc(current,i);
1868       break;
1869     case STORE:
1870     case STORELR:
1871       store_alloc(current,i);
1872       break;
1873     case ALU:
1874       alu_alloc(current,i);
1875       break;
1876     case SHIFT:
1877       shift_alloc(current,i);
1878       break;
1879     case MULTDIV:
1880       multdiv_alloc(current,i);
1881       break;
1882     case SHIFTIMM:
1883       shiftimm_alloc(current,i);
1884       break;
1885     case MOV:
1886       mov_alloc(current,i);
1887       break;
1888     case COP0:
1889       cop0_alloc(current,i);
1890       break;
1891     case COP1:
1892     case COP2:
1893       cop1_alloc(current,i);
1894       break;
1895     case C1LS:
1896       c1ls_alloc(current,i);
1897       break;
1898     case C2LS:
1899       c2ls_alloc(current,i);
1900       break;
1901     case FCONV:
1902       fconv_alloc(current,i);
1903       break;
1904     case FLOAT:
1905       float_alloc(current,i);
1906       break;
1907     case FCOMP:
1908       fcomp_alloc(current,i);
1909       break;
1910     case C2OP:
1911       c2op_alloc(current,i);
1912       break;
1913   }
1914 }
1915
1916 // Special case where a branch and delay slot span two pages in virtual memory
1917 static void pagespan_alloc(struct regstat *current,int i)
1918 {
1919   current->isconst=0;
1920   current->wasconst=0;
1921   regs[i].wasconst=0;
1922   alloc_all(current,i);
1923   alloc_cc(current,i);
1924   dirty_reg(current,CCREG);
1925   if(opcode[i]==3) // JAL
1926   {
1927     alloc_reg(current,i,31);
1928     dirty_reg(current,31);
1929   }
1930   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1931   {
1932     alloc_reg(current,i,rs1[i]);
1933     if (rt1[i]!=0) {
1934       alloc_reg(current,i,rt1[i]);
1935       dirty_reg(current,rt1[i]);
1936     }
1937   }
1938   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1939   {
1940     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1941     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1942     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1943     {
1944       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1945       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1946     }
1947   }
1948   else
1949   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1950   {
1951     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1952     if(!((current->is32>>rs1[i])&1))
1953     {
1954       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1955     }
1956   }
1957   else
1958   if(opcode[i]==0x11) // BC1
1959   {
1960     alloc_reg(current,i,FSREG);
1961     alloc_reg(current,i,CSREG);
1962   }
1963   //else ...
1964 }
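
// Illustrative sketch (not used by the code): the situation pagespan_alloc
// handles is a branch sitting in the last word of one page with its delay
// slot in the first word of the next, assuming the 4KB granularity used by
// the block lookup.  The helper name is invented for illustration.
static int branch_spans_pages_example(u_int branch_vaddr)
{
  u_int delay_slot_vaddr=branch_vaddr+4;
  return (branch_vaddr>>12)!=(delay_slot_vaddr>>12);
}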
1965
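// Record a branch that was emitted with a zero offset, along with the
// context needed to generate its out-of-line handler later; the stub
// writer is expected to patch the recorded branch to point at that handler.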
1966 void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1967 {
1968   stubs[stubcount][0]=type;
1969   stubs[stubcount][1]=addr;
1970   stubs[stubcount][2]=retaddr;
1971   stubs[stubcount][3]=a;
1972   stubs[stubcount][4]=b;
1973   stubs[stubcount][5]=c;
1974   stubs[stubcount][6]=d;
1975   stubs[stubcount][7]=e;
1976   stubcount++;
1977 }
1978
1979 // Write out a single register
1980 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1981 {
1982   int hr;
1983   for(hr=0;hr<HOST_REGS;hr++) {
1984     if(hr!=EXCLUDE_REG) {
1985       if((regmap[hr]&63)==r) {
1986         if((dirty>>hr)&1) {
1987           if(regmap[hr]<64) {
1988             emit_storereg(r,hr);
1989 #ifndef FORCE32
1990             if((is32>>regmap[hr])&1) {
1991               emit_sarimm(hr,31,hr);
1992               emit_storereg(r|64,hr);
1993             }
1994 #endif
1995           }else{
1996             emit_storereg(r|64,hr);
1997           }
1998         }
1999       }
2000     }
2001   }
2002 }
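
// Sketch of what the 32-bit writeback above amounts to at run time
// (illustrative helper, not part of the dynarec): the upper word of the
// 64-bit register file entry is just the sign bit replicated, which is
// what the sarimm-by-31 sequence produces.
static uint64_t wb_sign_extend_example(uint32_t lo)
{
  uint32_t hi=(uint32_t)((int32_t)lo>>31); // 0 or 0xffffffff
  return ((uint64_t)hi<<32)|lo;            // same as (int64_t)(int32_t)lo
}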
2003
2004 int mchecksum()
2005 {
2006   //if(!tracedebug) return 0;
2007   int i;
2008   int sum=0;
2009   for(i=0;i<2097152;i++) {
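    // Rotate the running sum left by one bit, shifting in the complement
    // of the old top bit, then fold in the next word of RDRAM.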
2010     unsigned int temp=sum;
2011     sum<<=1;
2012     sum|=(~temp)>>31;
2013     sum^=((u_int *)rdram)[i];
2014   }
2015   return sum;
2016 }
2017 int rchecksum()
2018 {
2019   int i;
2020   int sum=0;
2021   for(i=0;i<64;i++)
2022     sum^=((u_int *)reg)[i];
2023   return sum;
2024 }
2025 void rlist()
2026 {
2027   int i;
2028   printf("TRACE: ");
2029   for(i=0;i<32;i++)
2030     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
2031   printf("\n");
2032 #ifndef DISABLE_COP1
2033   printf("TRACE: ");
2034   for(i=0;i<32;i++)
2035     printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
2036   printf("\n");
2037 #endif
2038 }
2039
2040 void enabletrace()
2041 {
2042   tracedebug=1;
2043 }
2044
2045 void memdebug(int i)
2046 {
2047   //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
2048   //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
2049   //rlist();
2050   //if(tracedebug) {
2051   //if(Count>=-2084597794) {
2052   if((signed int)Count>=-2084597794&&(signed int)Count<0) {
2053   //if(0) {
2054     printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
2055     //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
2056     //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
2057     rlist();
2058     #ifdef __i386__
2059     printf("TRACE: %x\n",(&i)[-1]);
2060     #endif
2061     #ifdef __arm__
2062     int j;
2063     printf("TRACE: %x \n",(&j)[10]);
2064     printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
2065     #endif
2066     //fflush(stdout);
2067   }
2068   //printf("TRACE: %x\n",(&i)[-1]);
2069 }
2070
2071 void tlb_debug(u_int cause, u_int addr, u_int iaddr)
2072 {
2073   printf("TLB Exception: instruction=%x addr=%x cause=%x\n",iaddr, addr, cause);
2074 }
2075
2076 void alu_assemble(int i,struct regstat *i_regs)
2077 {
2078   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
2079     if(rt1[i]) {
2080       signed char s1,s2,t;
2081       t=get_reg(i_regs->regmap,rt1[i]);
2082       if(t>=0) {
2083         s1=get_reg(i_regs->regmap,rs1[i]);
2084         s2=get_reg(i_regs->regmap,rs2[i]);
2085         if(rs1[i]&&rs2[i]) {
2086           assert(s1>=0);
2087           assert(s2>=0);
2088           if(opcode2[i]&2) emit_sub(s1,s2,t);
2089           else emit_add(s1,s2,t);
2090         }
2091         else if(rs1[i]) {
2092           if(s1>=0) emit_mov(s1,t);
2093           else emit_loadreg(rs1[i],t);
2094         }
2095         else if(rs2[i]) {
2096           if(s2>=0) {
2097             if(opcode2[i]&2) emit_neg(s2,t);
2098             else emit_mov(s2,t);
2099           }
2100           else {
2101             emit_loadreg(rs2[i],t);
2102             if(opcode2[i]&2) emit_neg(t,t);
2103           }
2104         }
2105         else emit_zeroreg(t);
2106       }
2107     }
2108   }
2109   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
2110     if(rt1[i]) {
2111       signed char s1l,s2l,s1h,s2h,tl,th;
2112       tl=get_reg(i_regs->regmap,rt1[i]);
2113       th=get_reg(i_regs->regmap,rt1[i]|64);
2114       if(tl>=0) {
2115         s1l=get_reg(i_regs->regmap,rs1[i]);
2116         s2l=get_reg(i_regs->regmap,rs2[i]);
2117         s1h=get_reg(i_regs->regmap,rs1[i]|64);
2118         s2h=get_reg(i_regs->regmap,rs2[i]|64);
2119         if(rs1[i]&&rs2[i]) {
2120           assert(s1l>=0);
2121           assert(s2l>=0);
2122           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
2123           else emit_adds(s1l,s2l,tl);
2124           if(th>=0) {
2125             #ifdef INVERTED_CARRY
2126             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
2127             #else
2128             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
2129             #endif
2130             else emit_add(s1h,s2h,th);
2131           }
2132         }
2133         else if(rs1[i]) {
2134           if(s1l>=0) emit_mov(s1l,tl);
2135           else emit_loadreg(rs1[i],tl);
2136           if(th>=0) {
2137             if(s1h>=0) emit_mov(s1h,th);
2138             else emit_loadreg(rs1[i]|64,th);
2139           }
2140         }
2141         else if(rs2[i]) {
2142           if(s2l>=0) {
2143             if(opcode2[i]&2) emit_negs(s2l,tl);
2144             else emit_mov(s2l,tl);
2145           }
2146           else {
2147             emit_loadreg(rs2[i],tl);
2148             if(opcode2[i]&2) emit_negs(tl,tl);
2149           }
2150           if(th>=0) {
2151             #ifdef INVERTED_CARRY
2152             if(s2h>=0) emit_mov(s2h,th);
2153             else emit_loadreg(rs2[i]|64,th);
2154             if(opcode2[i]&2) {
2155               emit_adcimm(-1,th); // x86 has inverted carry flag
2156               emit_not(th,th);
2157             }
2158             #else
2159             if(opcode2[i]&2) {
2160               if(s2h>=0) emit_rscimm(s2h,0,th);
2161               else {
2162                 emit_loadreg(rs2[i]|64,th);
2163                 emit_rscimm(th,0,th);
2164               }
2165             }else{
2166               if(s2h>=0) emit_mov(s2h,th);
2167               else emit_loadreg(rs2[i]|64,th);
2168             }
2169             #endif
2170           }
2171         }
2172         else {
2173           emit_zeroreg(tl);
2174           if(th>=0) emit_zeroreg(th);
2175         }
2176       }
2177     }
2178   }
2179   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2180     if(rt1[i]) {
2181       signed char s1l,s1h,s2l,s2h,t;
2182       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2183       {
2184         t=get_reg(i_regs->regmap,rt1[i]);
2185         //assert(t>=0);
2186         if(t>=0) {
2187           s1l=get_reg(i_regs->regmap,rs1[i]);
2188           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2189           s2l=get_reg(i_regs->regmap,rs2[i]);
2190           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2191           if(rs2[i]==0) // rx<r0
2192           {
2193             assert(s1h>=0);
2194             if(opcode2[i]==0x2a) // SLT
2195               emit_shrimm(s1h,31,t);
2196             else // SLTU (unsigned can not be less than zero)
2197               emit_zeroreg(t);
2198           }
2199           else if(rs1[i]==0) // r0<rx
2200           {
2201             assert(s2h>=0);
2202             if(opcode2[i]==0x2a) // SLT
2203               emit_set_gz64_32(s2h,s2l,t);
2204             else // SLTU (set if not zero)
2205               emit_set_nz64_32(s2h,s2l,t);
2206           }
2207           else {
2208             assert(s1l>=0);assert(s1h>=0);
2209             assert(s2l>=0);assert(s2h>=0);
2210             if(opcode2[i]==0x2a) // SLT
2211               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2212             else // SLTU
2213               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2214           }
2215         }
2216       } else {
2217         t=get_reg(i_regs->regmap,rt1[i]);
2218         //assert(t>=0);
2219         if(t>=0) {
2220           s1l=get_reg(i_regs->regmap,rs1[i]);
2221           s2l=get_reg(i_regs->regmap,rs2[i]);
2222           if(rs2[i]==0) // rx<r0
2223           {
2224             assert(s1l>=0);
2225             if(opcode2[i]==0x2a) // SLT
2226               emit_shrimm(s1l,31,t);
2227             else // SLTU (unsigned can not be less than zero)
2228               emit_zeroreg(t);
2229           }
2230           else if(rs1[i]==0) // r0<rx
2231           {
2232             assert(s2l>=0);
2233             if(opcode2[i]==0x2a) // SLT
2234               emit_set_gz32(s2l,t);
2235             else // SLTU (set if not zero)
2236               emit_set_nz32(s2l,t);
2237           }
2238           else{
2239             assert(s1l>=0);assert(s2l>=0);
2240             if(opcode2[i]==0x2a) // SLT
2241               emit_set_if_less32(s1l,s2l,t);
2242             else // SLTU
2243               emit_set_if_carry32(s1l,s2l,t);
2244           }
2245         }
2246       }
2247     }
2248   }
2249   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2250     if(rt1[i]) {
2251       signed char s1l,s1h,s2l,s2h,th,tl;
2252       tl=get_reg(i_regs->regmap,rt1[i]);
2253       th=get_reg(i_regs->regmap,rt1[i]|64);
2254       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2255       {
2256         assert(tl>=0);
2257         if(tl>=0) {
2258           s1l=get_reg(i_regs->regmap,rs1[i]);
2259           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2260           s2l=get_reg(i_regs->regmap,rs2[i]);
2261           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2262           if(rs1[i]&&rs2[i]) {
2263             assert(s1l>=0);assert(s1h>=0);
2264             assert(s2l>=0);assert(s2h>=0);
2265             if(opcode2[i]==0x24) { // AND
2266               emit_and(s1l,s2l,tl);
2267               emit_and(s1h,s2h,th);
2268             } else
2269             if(opcode2[i]==0x25) { // OR
2270               emit_or(s1l,s2l,tl);
2271               emit_or(s1h,s2h,th);
2272             } else
2273             if(opcode2[i]==0x26) { // XOR
2274               emit_xor(s1l,s2l,tl);
2275               emit_xor(s1h,s2h,th);
2276             } else
2277             if(opcode2[i]==0x27) { // NOR
2278               emit_or(s1l,s2l,tl);
2279               emit_or(s1h,s2h,th);
2280               emit_not(tl,tl);
2281               emit_not(th,th);
2282             }
2283           }
2284           else
2285           {
2286             if(opcode2[i]==0x24) { // AND
2287               emit_zeroreg(tl);
2288               emit_zeroreg(th);
2289             } else
2290             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2291               if(rs1[i]){
2292                 if(s1l>=0) emit_mov(s1l,tl);
2293                 else emit_loadreg(rs1[i],tl);
2294                 if(s1h>=0) emit_mov(s1h,th);
2295                 else emit_loadreg(rs1[i]|64,th);
2296               }
2297               else
2298               if(rs2[i]){
2299                 if(s2l>=0) emit_mov(s2l,tl);
2300                 else emit_loadreg(rs2[i],tl);
2301                 if(s2h>=0) emit_mov(s2h,th);
2302                 else emit_loadreg(rs2[i]|64,th);
2303               }
2304               else{
2305                 emit_zeroreg(tl);
2306                 emit_zeroreg(th);
2307               }
2308             } else
2309             if(opcode2[i]==0x27) { // NOR
2310               if(rs1[i]){
2311                 if(s1l>=0) emit_not(s1l,tl);
2312                 else{
2313                   emit_loadreg(rs1[i],tl);
2314                   emit_not(tl,tl);
2315                 }
2316                 if(s1h>=0) emit_not(s1h,th);
2317                 else{
2318                   emit_loadreg(rs1[i]|64,th);
2319                   emit_not(th,th);
2320                 }
2321               }
2322               else
2323               if(rs2[i]){
2324                 if(s2l>=0) emit_not(s2l,tl);
2325                 else{
2326                   emit_loadreg(rs2[i],tl);
2327                   emit_not(tl,tl);
2328                 }
2329                 if(s2h>=0) emit_not(s2h,th);
2330                 else{
2331                   emit_loadreg(rs2[i]|64,th);
2332                   emit_not(th,th);
2333                 }
2334               }
2335               else {
2336                 emit_movimm(-1,tl);
2337                 emit_movimm(-1,th);
2338               }
2339             }
2340           }
2341         }
2342       }
2343       else
2344       {
2345         // 32 bit
2346         if(tl>=0) {
2347           s1l=get_reg(i_regs->regmap,rs1[i]);
2348           s2l=get_reg(i_regs->regmap,rs2[i]);
2349           if(rs1[i]&&rs2[i]) {
2350             assert(s1l>=0);
2351             assert(s2l>=0);
2352             if(opcode2[i]==0x24) { // AND
2353               emit_and(s1l,s2l,tl);
2354             } else
2355             if(opcode2[i]==0x25) { // OR
2356               emit_or(s1l,s2l,tl);
2357             } else
2358             if(opcode2[i]==0x26) { // XOR
2359               emit_xor(s1l,s2l,tl);
2360             } else
2361             if(opcode2[i]==0x27) { // NOR
2362               emit_or(s1l,s2l,tl);
2363               emit_not(tl,tl);
2364             }
2365           }
2366           else
2367           {
2368             if(opcode2[i]==0x24) { // AND
2369               emit_zeroreg(tl);
2370             } else
2371             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2372               if(rs1[i]){
2373                 if(s1l>=0) emit_mov(s1l,tl);
2374                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2375               }
2376               else
2377               if(rs2[i]){
2378                 if(s2l>=0) emit_mov(s2l,tl);
2379                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2380               }
2381               else emit_zeroreg(tl);
2382             } else
2383             if(opcode2[i]==0x27) { // NOR
2384               if(rs1[i]){
2385                 if(s1l>=0) emit_not(s1l,tl);
2386                 else {
2387                   emit_loadreg(rs1[i],tl);
2388                   emit_not(tl,tl);
2389                 }
2390               }
2391               else
2392               if(rs2[i]){
2393                 if(s2l>=0) emit_not(s2l,tl);
2394                 else {
2395                   emit_loadreg(rs2[i],tl);
2396                   emit_not(tl,tl);
2397                 }
2398               }
2399               else emit_movimm(-1,tl);
2400             }
2401           }
2402         }
2403       }
2404     }
2405   }
2406 }
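
// Reference semantics for the 64-bit SLT/SLTU paths above, as plain C
// (illustrative helpers, not part of the dynarec): compare the high words
// first and fall back to an unsigned compare of the low words when equal.
static int slt64_example(int32_t s1h,uint32_t s1l,int32_t s2h,uint32_t s2l)
{
  if(s1h!=s2h) return s1h<s2h; // signed compare of the high words
  return s1l<s2l;              // low words always compare unsigned
}
static int sltu64_example(uint32_t s1h,uint32_t s1l,uint32_t s2h,uint32_t s2l)
{
  if(s1h!=s2h) return s1h<s2h;
  return s1l<s2l;
}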
2407
2408 void imm16_assemble(int i,struct regstat *i_regs)
2409 {
2410   if (opcode[i]==0x0f) { // LUI
2411     if(rt1[i]) {
2412       signed char t;
2413       t=get_reg(i_regs->regmap,rt1[i]);
2414       //assert(t>=0);
2415       if(t>=0) {
2416         if(!((i_regs->isconst>>t)&1))
2417           emit_movimm(imm[i]<<16,t);
2418       }
2419     }
2420   }
2421   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2422     if(rt1[i]) {
2423       signed char s,t;
2424       t=get_reg(i_regs->regmap,rt1[i]);
2425       s=get_reg(i_regs->regmap,rs1[i]);
2426       if(rs1[i]) {
2427         //assert(t>=0);
2428         //assert(s>=0);
2429         if(t>=0) {
2430           if(!((i_regs->isconst>>t)&1)) {
2431             if(s<0) {
2432               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2433               emit_addimm(t,imm[i],t);
2434             }else{
2435               if(!((i_regs->wasconst>>s)&1))
2436                 emit_addimm(s,imm[i],t);
2437               else
2438                 emit_movimm(constmap[i][s]+imm[i],t);
2439             }
2440           }
2441         }
2442       } else {
2443         if(t>=0) {
2444           if(!((i_regs->isconst>>t)&1))
2445             emit_movimm(imm[i],t);
2446         }
2447       }
2448     }
2449   }
2450   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2451     if(rt1[i]) {
2452       signed char sh,sl,th,tl;
2453       th=get_reg(i_regs->regmap,rt1[i]|64);
2454       tl=get_reg(i_regs->regmap,rt1[i]);
2455       sh=get_reg(i_regs->regmap,rs1[i]|64);
2456       sl=get_reg(i_regs->regmap,rs1[i]);
2457       if(tl>=0) {
2458         if(rs1[i]) {
2459           assert(sh>=0);
2460           assert(sl>=0);
2461           if(th>=0) {
2462             emit_addimm64_32(sh,sl,imm[i],th,tl);
2463           }
2464           else {
2465             emit_addimm(sl,imm[i],tl);
2466           }
2467         } else {
2468           emit_movimm(imm[i],tl);
2469           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2470         }
2471       }
2472     }
2473   }
2474   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2475     if(rt1[i]) {
2476       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2477       signed char sh,sl,t;
2478       t=get_reg(i_regs->regmap,rt1[i]);
2479       sh=get_reg(i_regs->regmap,rs1[i]|64);
2480       sl=get_reg(i_regs->regmap,rs1[i]);
2481       //assert(t>=0);
2482       if(t>=0) {
2483         if(rs1[i]>0) {
2484           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2485           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2486             if(opcode[i]==0x0a) { // SLTI
2487               if(sl<0) {
2488                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2489                 emit_slti32(t,imm[i],t);
2490               }else{
2491                 emit_slti32(sl,imm[i],t);
2492               }
2493             }
2494             else { // SLTIU
2495               if(sl<0) {
2496                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2497                 emit_sltiu32(t,imm[i],t);
2498               }else{
2499                 emit_sltiu32(sl,imm[i],t);
2500               }
2501             }
2502           }else{ // 64-bit
2503             assert(sl>=0);
2504             if(opcode[i]==0x0a) // SLTI
2505               emit_slti64_32(sh,sl,imm[i],t);
2506             else // SLTIU
2507               emit_sltiu64_32(sh,sl,imm[i],t);
2508           }
2509         }else{
2510           // SLTI(U) with r0 is just stupid,
2511           // nonetheless examples can be found
2512           if(opcode[i]==0x0a) { // SLTI
2513             if(0<imm[i]) emit_movimm(1,t);
2514             else emit_zeroreg(t);
2515           } else // SLTIU
2516           {
2517             if(imm[i]) emit_movimm(1,t);
2518             else emit_zeroreg(t);
2519           }
2520         }
2521       }
2522     }
2523   }
2524   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2525     if(rt1[i]) {
2526       signed char sh,sl,th,tl;
2527       th=get_reg(i_regs->regmap,rt1[i]|64);
2528       tl=get_reg(i_regs->regmap,rt1[i]);
2529       sh=get_reg(i_regs->regmap,rs1[i]|64);
2530       sl=get_reg(i_regs->regmap,rs1[i]);
2531       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2532         if(opcode[i]==0x0c) //ANDI
2533         {
2534           if(rs1[i]) {
2535             if(sl<0) {
2536               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2537               emit_andimm(tl,imm[i],tl);
2538             }else{
2539               if(!((i_regs->wasconst>>sl)&1))
2540                 emit_andimm(sl,imm[i],tl);
2541               else
2542                 emit_movimm(constmap[i][sl]&imm[i],tl);
2543             }
2544           }
2545           else
2546             emit_zeroreg(tl);
2547           if(th>=0) emit_zeroreg(th);
2548         }
2549         else
2550         {
2551           if(rs1[i]) {
2552             if(sl<0) {
2553               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2554             }
2555             if(th>=0) {
2556               if(sh<0) {
2557                 emit_loadreg(rs1[i]|64,th);
2558               }else{
2559                 emit_mov(sh,th);
2560               }
2561             }
2562             if(opcode[i]==0x0d) //ORI
2563             if(sl<0) {
2564               emit_orimm(tl,imm[i],tl);
2565             }else{
2566               if(!((i_regs->wasconst>>sl)&1))
2567                 emit_orimm(sl,imm[i],tl);
2568               else
2569                 emit_movimm(constmap[i][sl]|imm[i],tl);
2570             }
2571             if(opcode[i]==0x0e) //XORI
2572             if(sl<0) {
2573               emit_xorimm(tl,imm[i],tl);
2574             }else{
2575               if(!((i_regs->wasconst>>sl)&1))
2576                 emit_xorimm(sl,imm[i],tl);
2577               else
2578                 emit_movimm(constmap[i][sl]^imm[i],tl);
2579             }
2580           }
2581           else {
2582             emit_movimm(imm[i],tl);
2583             if(th>=0) emit_zeroreg(th);
2584           }
2585         }
2586       }
2587     }
2588   }
2589 }
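
// Sketch of the constant folding performed above when the source register
// value is known (wasconst): the result is computed at compile time and
// loaded with a single movimm instead of emitting an ALU operation.  The
// helper is illustrative only and assumes imm has already been sign- or
// zero-extended the way the decode stage prepares imm[].
static int fold_imm16_example(int op,int known_src,int imm)
{
  switch(op) {
    case 0x08: case 0x09: return known_src+imm; // ADDI/ADDIU
    case 0x0c: return known_src&imm;            // ANDI
    case 0x0d: return known_src|imm;            // ORI
    case 0x0e: return known_src^imm;            // XORI
  }
  return imm; // other opcodes are not folded here
}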
2590
2591 void shiftimm_assemble(int i,struct regstat *i_regs)
2592 {
2593   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2594   {
2595     if(rt1[i]) {
2596       signed char s,t;
2597       t=get_reg(i_regs->regmap,rt1[i]);
2598       s=get_reg(i_regs->regmap,rs1[i]);
2599       //assert(t>=0);
2600       if(t>=0){
2601         if(rs1[i]==0)
2602         {
2603           emit_zeroreg(t);
2604         }
2605         else
2606         {
2607           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2608           if(imm[i]) {
2609             if(opcode2[i]==0) // SLL
2610             {
2611               emit_shlimm(s<0?t:s,imm[i],t);
2612             }
2613             if(opcode2[i]==2) // SRL
2614             {
2615               emit_shrimm(s<0?t:s,imm[i],t);
2616             }
2617             if(opcode2[i]==3) // SRA
2618             {
2619               emit_sarimm(s<0?t:s,imm[i],t);
2620             }
2621           }else{
2622             // Shift by zero
2623             if(s>=0 && s!=t) emit_mov(s,t);
2624           }
2625         }
2626       }
2627       //emit_storereg(rt1[i],t); //DEBUG
2628     }
2629   }
2630   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2631   {
2632     if(rt1[i]) {
2633       signed char sh,sl,th,tl;
2634       th=get_reg(i_regs->regmap,rt1[i]|64);
2635       tl=get_reg(i_regs->regmap,rt1[i]);
2636       sh=get_reg(i_regs->regmap,rs1[i]|64);
2637       sl=get_reg(i_regs->regmap,rs1[i]);
2638       if(tl>=0) {
2639         if(rs1[i]==0)
2640         {
2641           emit_zeroreg(tl);
2642           if(th>=0) emit_zeroreg(th);
2643         }
2644         else
2645         {
2646           assert(sl>=0);
2647           assert(sh>=0);
2648           if(imm[i]) {
2649             if(opcode2[i]==0x38) // DSLL
2650             {
2651               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2652               emit_shlimm(sl,imm[i],tl);
2653             }
2654             if(opcode2[i]==0x3a) // DSRL
2655             {
2656               emit_shrdimm(sl,sh,imm[i],tl);
2657               if(th>=0) emit_shrimm(sh,imm[i],th);
2658             }
2659             if(opcode2[i]==0x3b) // DSRA
2660             {
2661               emit_shrdimm(sl,sh,imm[i],tl);
2662               if(th>=0) emit_sarimm(sh,imm[i],th);
2663             }
2664           }else{
2665             // Shift by zero
2666             if(sl!=tl) emit_mov(sl,tl);
2667             if(th>=0&&sh!=th) emit_mov(sh,th);
2668           }
2669         }
2670       }
2671     }
2672   }
2673   if(opcode2[i]==0x3c) // DSLL32
2674   {
2675     if(rt1[i]) {
2676       signed char sl,tl,th;
2677       tl=get_reg(i_regs->regmap,rt1[i]);
2678       th=get_reg(i_regs->regmap,rt1[i]|64);
2679       sl=get_reg(i_regs->regmap,rs1[i]);
2680       if(th>=0||tl>=0){
2681         assert(tl>=0);
2682         assert(th>=0);
2683         assert(sl>=0);
2684         emit_mov(sl,th);
2685         emit_zeroreg(tl);
2686         if(imm[i]>32)
2687         {
2688           emit_shlimm(th,imm[i]&31,th);
2689         }
2690       }
2691     }
2692   }
2693   if(opcode2[i]==0x3e) // DSRL32
2694   {
2695     if(rt1[i]) {
2696       signed char sh,tl,th;
2697       tl=get_reg(i_regs->regmap,rt1[i]);
2698       th=get_reg(i_regs->regmap,rt1[i]|64);
2699       sh=get_reg(i_regs->regmap,rs1[i]|64);
2700       if(tl>=0){
2701         assert(sh>=0);
2702         emit_mov(sh,tl);
2703         if(th>=0) emit_zeroreg(th);
2704         if(imm[i]>32)
2705         {
2706           emit_shrimm(tl,imm[i]&31,tl);
2707         }
2708       }
2709     }
2710   }
2711   if(opcode2[i]==0x3f) // DSRA32
2712   {
2713     if(rt1[i]) {
2714       signed char sh,tl;
2715       tl=get_reg(i_regs->regmap,rt1[i]);
2716       sh=get_reg(i_regs->regmap,rs1[i]|64);
2717       if(tl>=0){
2718         assert(sh>=0);
2719         emit_mov(sh,tl);
2720         if(imm[i]>32)
2721         {
2722           emit_sarimm(tl,imm[i]&31,tl);
2723         }
2724       }
2725     }
2726   }
2727 }
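
// Reference semantics for the DSLL32/DSRL32/DSRA32 cases above (illustrative
// only, sa being the 5-bit shift field): shifts of 32..63 bits move one
// 32-bit half into the other and then shift by the remaining sa bits, which
// is what the mov/zeroreg/shift sequences implement.
static uint64_t dsll32_example(uint64_t rs,unsigned sa) {return rs<<(32+(sa&31));}
static uint64_t dsrl32_example(uint64_t rs,unsigned sa) {return rs>>(32+(sa&31));}
static int64_t  dsra32_example(int64_t rs,unsigned sa)  {return rs>>(32+(sa&31));}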
2728
2729 #ifndef shift_assemble
2730 void shift_assemble(int i,struct regstat *i_regs)
2731 {
2732   printf("Need shift_assemble for this architecture.\n");
2733   exit(1);
2734 }
2735 #endif
2736
2737 void load_assemble(int i,struct regstat *i_regs)
2738 {
2739   int s,th,tl,addr,map=-1;
2740   int offset;
2741   int jaddr=0;
2742   int memtarget=0,c=0;
2743   u_int hr,reglist=0;
2744   th=get_reg(i_regs->regmap,rt1[i]|64);
2745   tl=get_reg(i_regs->regmap,rt1[i]);
2746   s=get_reg(i_regs->regmap,rs1[i]);
2747   offset=imm[i];
2748   for(hr=0;hr<HOST_REGS;hr++) {
2749     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2750   }
2751   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2752   if(s>=0) {
2753     c=(i_regs->wasconst>>s)&1;
2754     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2755     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
2756   }
2757   //printf("load_assemble: c=%d\n",c);
2758   //if(c) printf("load_assemble: const=%x\n",(int)constmap[i][s]+offset);
2759   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2760 #ifdef PCSX
2761   if(tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80)
2762     ||rt1[i]==0) {
2763       // could be a hardware FIFO, so the read must still be performed
2764       // (the ||rt1[i]==0 case is a dummy read to r0)
2765       assem_debug("(forced read)\n");
2766       tl=get_reg(i_regs->regmap,-1);
2767       assert(tl>=0);
2768   }
2769 #endif
2770   if(offset||s<0||c) addr=tl;
2771   else addr=s;
2772   if(tl>=0) {
2773     //assert(tl>=0);
2774     //assert(rt1[i]);
2775     reglist&=~(1<<tl);
2776     if(th>=0) reglist&=~(1<<th);
2777     if(!using_tlb) {
2778       if(!c) {
2779 //#define R29_HACK 1
2780         #ifdef R29_HACK
2781         // Strmnnrmn's speed hack
2782         if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2783         #endif
2784         {
2785           emit_cmpimm(addr,RAM_SIZE);
2786           jaddr=(int)out;
2787           #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
2788           // Hint to branch predictor that the branch is unlikely to be taken
2789           if(rs1[i]>=28)
2790             emit_jno_unlikely(0);
2791           else
2792           #endif
2793           emit_jno(0);
2794         }
2795       }
2796     }else{ // using tlb
2797       int x=0;
2798       if (opcode[i]==0x20||opcode[i]==0x24) x=3; // LB/LBU
2799       if (opcode[i]==0x21||opcode[i]==0x25) x=2; // LH/LHU
2800       map=get_reg(i_regs->regmap,TLREG);
2801       assert(map>=0);
2802       map=do_tlb_r(addr,tl,map,x,-1,-1,c,constmap[i][s]+offset);
2803       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr);
2804     }
2805     if (opcode[i]==0x20) { // LB
2806       if(!c||memtarget) {
2807         #ifdef HOST_IMM_ADDR32
2808         if(c)
2809           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2810         else
2811         #endif
2812         {
2813           //emit_xorimm(addr,3,tl);
2814           //gen_tlb_addr_r(tl,map);
2815           //emit_movsbl_indexed((int)rdram-0x80000000,tl,tl);
2816           int x=0;
2817 #ifdef BIG_ENDIAN_MIPS
2818           if(!c) emit_xorimm(addr,3,tl);
2819           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2820 #else
2821           if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
2822           else if (tl!=addr) emit_mov(addr,tl);
2823 #endif
2824           emit_movsbl_indexed_tlb(x,tl,map,tl);
2825         }
2826         if(jaddr)
2827           add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2828       }
2829       else
2830         inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2831     }
2832     if (opcode[i]==0x21) { // LH
2833       if(!c||memtarget) {
2834         #ifdef HOST_IMM_ADDR32
2835         if(c)
2836           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2837         else
2838         #endif
2839         {
2840           int x=0;
2841 #ifdef BIG_ENDIAN_MIPS
2842           if(!c) emit_xorimm(addr,2,tl);
2843           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2844 #else
2845           if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
2846           else if (tl!=addr) emit_mov(addr,tl);
2847 #endif
2848           //#ifdef
2849           //emit_movswl_indexed_tlb(x,tl,map,tl);
2850           //else
2851           if(map>=0) {
2852             gen_tlb_addr_r(tl,map);
2853             emit_movswl_indexed(x,tl,tl);
2854           }else
2855             emit_movswl_indexed((int)rdram-0x80000000+x,tl,tl);
2856         }
2857         if(jaddr)
2858           add_stub(LOADH_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2859       }
2860       else
2861         inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2862     }
2863     if (opcode[i]==0x23) { // LW
2864       if(!c||memtarget) {
2865         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2866         #ifdef HOST_IMM_ADDR32
2867         if(c)
2868           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2869         else
2870         #endif
2871         emit_readword_indexed_tlb(0,addr,map,tl);
2872         if(jaddr)
2873           add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2874       }
2875       else
2876         inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2877     }
2878     if (opcode[i]==0x24) { // LBU
2879       if(!c||memtarget) {
2880         #ifdef HOST_IMM_ADDR32
2881         if(c)
2882           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2883         else
2884         #endif
2885         {
2886           //emit_xorimm(addr,3,tl);
2887           //gen_tlb_addr_r(tl,map);
2888           //emit_movzbl_indexed((int)rdram-0x80000000,tl,tl);
2889           int x=0;
2890 #ifdef BIG_ENDIAN_MIPS
2891           if(!c) emit_xorimm(addr,3,tl);
2892           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2893 #else
2894           if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
2895           else if (tl!=addr) emit_mov(addr,tl);
2896 #endif
2897           emit_movzbl_indexed_tlb(x,tl,map,tl);
2898         }
2899         if(jaddr)
2900           add_stub(LOADBU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2901       }
2902       else
2903         inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2904     }
2905     if (opcode[i]==0x25) { // LHU
2906       if(!c||memtarget) {
2907         #ifdef HOST_IMM_ADDR32
2908         if(c)
2909           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2910         else
2911         #endif
2912         {
2913           int x=0;
2914 #ifdef BIG_ENDIAN_MIPS
2915           if(!c) emit_xorimm(addr,2,tl);
2916           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2917 #else
2918           if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
2919           else if (tl!=addr) emit_mov(addr,tl);
2920 #endif
2921           //#ifdef
2922           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2923           //#else
2924           if(map>=0) {
2925             gen_tlb_addr_r(tl,map);
2926             emit_movzwl_indexed(x,tl,tl);
2927           }else
2928             emit_movzwl_indexed((int)rdram-0x80000000+x,tl,tl);
2929           if(jaddr)
2930             add_stub(LOADHU_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2931         }
2932       }
2933       else
2934         inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2935     }
2936     if (opcode[i]==0x27) { // LWU
2937       assert(th>=0);
2938       if(!c||memtarget) {
2939         //emit_readword_indexed((int)rdram-0x80000000,addr,tl);
2940         #ifdef HOST_IMM_ADDR32
2941         if(c)
2942           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2943         else
2944         #endif
2945         emit_readword_indexed_tlb(0,addr,map,tl);
2946         if(jaddr)
2947           add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2948       }
2949       else {
2950         inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2951       }
2952       emit_zeroreg(th);
2953     }
2954     if (opcode[i]==0x37) { // LD
2955       if(!c||memtarget) {
2956         //gen_tlb_addr_r(tl,map);
2957         //if(th>=0) emit_readword_indexed((int)rdram-0x80000000,addr,th);
2958         //emit_readword_indexed((int)rdram-0x7FFFFFFC,addr,tl);
2959         #ifdef HOST_IMM_ADDR32
2960         if(c)
2961           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
2962         else
2963         #endif
2964         emit_readdword_indexed_tlb(0,addr,map,th,tl);
2965         if(jaddr)
2966           add_stub(LOADD_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2967       }
2968       else
2969         inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2970     }
2971     //emit_storereg(rt1[i],tl); // DEBUG
2972   }
2973   //if(opcode[i]==0x23)
2974   //if(opcode[i]==0x24)
2975   //if(opcode[i]==0x23||opcode[i]==0x24)
2976   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
2977   {
2978     //emit_pusha();
2979     save_regs(0x100f);
2980         emit_readword((int)&last_count,ECX);
2981         #ifdef __i386__
2982         if(get_reg(i_regs->regmap,CCREG)<0)
2983           emit_loadreg(CCREG,HOST_CCREG);
2984         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2985         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2986         emit_writeword(HOST_CCREG,(int)&Count);
2987         #endif
2988         #ifdef __arm__
2989         if(get_reg(i_regs->regmap,CCREG)<0)
2990           emit_loadreg(CCREG,0);
2991         else
2992           emit_mov(HOST_CCREG,0);
2993         emit_add(0,ECX,0);
2994         emit_addimm(0,2*ccadj[i],0);
2995         emit_writeword(0,(int)&Count);
2996         #endif
2997     emit_call((int)memdebug);
2998     //emit_popa();
2999     restore_regs(0x100f);
3000   }/**/
3001 }
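
// Sketch of the fast-path address check used above (illustrative helper):
// after 'cmp addr,RAM_SIZE' the signed overflow flag is set exactly when the
// address lies in the cached RAM window, so emit_jno() branches out to the
// stub for everything else and the inline RAM access falls through.
static int addr_is_fastpath_ram_example(u_int addr,u_int ram_size)
{
  return addr>=0x80000000&&addr<0x80000000+ram_size;
}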
3002
3003 #ifndef loadlr_assemble
3004 void loadlr_assemble(int i,struct regstat *i_regs)
3005 {
3006   printf("Need loadlr_assemble for this architecture.\n");
3007   exit(1);
3008 }
3009 #endif
3010
3011 void store_assemble(int i,struct regstat *i_regs)
3012 {
3013   int s,th,tl,map=-1;
3014   int addr,temp;
3015   int offset;
3016   int jaddr=0,jaddr2,type;
3017   int memtarget=0,c=0;
3018   int agr=AGEN1+(i&1);
3019   u_int hr,reglist=0;
3020   th=get_reg(i_regs->regmap,rs2[i]|64);
3021   tl=get_reg(i_regs->regmap,rs2[i]);
3022   s=get_reg(i_regs->regmap,rs1[i]);
3023   temp=get_reg(i_regs->regmap,agr);
3024   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3025   offset=imm[i];
3026   if(s>=0) {
3027     c=(i_regs->wasconst>>s)&1;
3028     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3029     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3030   }
3031   assert(tl>=0);
3032   assert(temp>=0);
3033   for(hr=0;hr<HOST_REGS;hr++) {
3034     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3035   }
3036   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3037   if(offset||s<0||c) addr=temp;
3038   else addr=s;
3039   if(!using_tlb) {
3040     if(!c) {
3041       #ifdef R29_HACK
3042       // Strmnnrmn's speed hack
3043       memtarget=1;
3044       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3045       #endif
3046       emit_cmpimm(addr,RAM_SIZE);
3047       #ifdef DESTRUCTIVE_SHIFT
3048       if(s==addr) emit_mov(s,temp);
3049       #endif
3050       #ifdef R29_HACK
3051       if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
3052       #endif
3053       {
3054         jaddr=(int)out;
3055         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
3056         // Hint to branch predictor that the branch is unlikely to be taken
3057         if(rs1[i]>=28)
3058           emit_jno_unlikely(0);
3059         else
3060         #endif
3061         emit_jno(0);
3062       }
3063     }
3064   }else{ // using tlb
3065     int x=0;
3066     if (opcode[i]==0x28) x=3; // SB
3067     if (opcode[i]==0x29) x=2; // SH
3068     map=get_reg(i_regs->regmap,TLREG);
3069     assert(map>=0);
3070     map=do_tlb_w(addr,temp,map,x,c,constmap[i][s]+offset);
3071     do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3072   }
3073
3074   if (opcode[i]==0x28) { // SB
3075     if(!c||memtarget) {
3076       int x=0;
3077 #ifdef BIG_ENDIAN_MIPS
3078       if(!c) emit_xorimm(addr,3,temp);
3079       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
3080 #else
3081       if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
3082       else if (addr!=temp) emit_mov(addr,temp);
3083 #endif
3084       //gen_tlb_addr_w(temp,map);
3085       //emit_writebyte_indexed(tl,(int)rdram-0x80000000,temp);
3086       emit_writebyte_indexed_tlb(tl,x,temp,map,temp);
3087     }
3088     type=STOREB_STUB;
3089   }
3090   if (opcode[i]==0x29) { // SH
3091     if(!c||memtarget) {
3092       int x=0;
3093 #ifdef BIG_ENDIAN_MIPS
3094       if(!c) emit_xorimm(addr,2,temp);
3095       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
3096 #else
3097       if(c) x=(constmap[i][s]+offset)-(constmap[i][s]+offset);
3098       else if (addr!=temp) emit_mov(addr,temp);
3099 #endif
3100       //#ifdef
3101       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
3102       //#else
3103       if(map>=0) {
3104         gen_tlb_addr_w(temp,map);
3105         emit_writehword_indexed(tl,x,temp);
3106       }else
3107         emit_writehword_indexed(tl,(int)rdram-0x80000000+x,temp);
3108     }
3109     type=STOREH_STUB;
3110   }
3111   if (opcode[i]==0x2B) { // SW
3112     if(!c||memtarget)
3113       //emit_writeword_indexed(tl,(int)rdram-0x80000000,addr);
3114       emit_writeword_indexed_tlb(tl,0,addr,map,temp);
3115     type=STOREW_STUB;
3116   }
3117   if (opcode[i]==0x3F) { // SD
3118     if(!c||memtarget) {
3119       if(rs2[i]) {
3120         assert(th>=0);
3121         //emit_writeword_indexed(th,(int)rdram-0x80000000,addr);
3122         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,addr);
3123         emit_writedword_indexed_tlb(th,tl,0,addr,map,temp);
3124       }else{
3125         // Store zero
3126         //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3127         //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3128         emit_writedword_indexed_tlb(tl,tl,0,addr,map,temp);
3129       }
3130     }
3131     type=STORED_STUB;
3132   }
3133   if(!using_tlb&&(!c||memtarget))
3134     // addr could be a temp, make sure it survives STORE*_STUB
3135     reglist|=1<<addr;
3136   if(jaddr) {
3137     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
3138   } else if(!memtarget) {
3139     inline_writestub(type,i,constmap[i][s]+offset,i_regs->regmap,rs2[i],ccadj[i],reglist);
3140   }
3141   if(!using_tlb) {
3142     if(!c||memtarget) {
3143       #ifdef DESTRUCTIVE_SHIFT
3144       // The x86 shift operation is 'destructive'; it overwrites the
3145       // source register, so we need to make a copy first and use that.
3146       addr=temp;
3147       #endif
3148       #if defined(HOST_IMM8)
3149       int ir=get_reg(i_regs->regmap,INVCP);
3150       assert(ir>=0);
3151       emit_cmpmem_indexedsr12_reg(ir,addr,1);
3152       #else
3153       emit_cmpmem_indexedsr12_imm((int)invalid_code,addr,1);
3154       #endif
3155       jaddr2=(int)out;
3156       emit_jne(0);
3157       add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),addr,0,0,0);
3158     }
3159   }
3160   //if(opcode[i]==0x2B || opcode[i]==0x3F)
3161   //if(opcode[i]==0x2B || opcode[i]==0x28)
3162   //if(opcode[i]==0x2B || opcode[i]==0x29)
3163   //if(opcode[i]==0x2B)
3164   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
3165   {
3166     //emit_pusha();
3167     save_regs(0x100f);
3168         emit_readword((int)&last_count,ECX);
3169         #ifdef __i386__
3170         if(get_reg(i_regs->regmap,CCREG)<0)
3171           emit_loadreg(CCREG,HOST_CCREG);
3172         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3173         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3174         emit_writeword(HOST_CCREG,(int)&Count);
3175         #endif
3176         #ifdef __arm__
3177         if(get_reg(i_regs->regmap,CCREG)<0)
3178           emit_loadreg(CCREG,0);
3179         else
3180           emit_mov(HOST_CCREG,0);
3181         emit_add(0,ECX,0);
3182         emit_addimm(0,2*ccadj[i],0);
3183         emit_writeword(0,(int)&Count);
3184         #endif
3185     emit_call((int)memdebug);
3186     //emit_popa();
3187     restore_regs(0x100f);
3188   }/**/
3189 }
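
// Sketch of the self-modifying-code test that the INVCP/INVCODE_STUB path
// above performs (illustrative helper; the table is passed in rather than
// naming the real array's type): invalid_code[] holds one byte per 4KB page,
// and a store only needs to go through the invalidation stub when that byte
// is not 1, i.e. when compiled code may still exist for the page being
// written.  The sr12 in emit_cmpmem_indexedsr12_* is the addr>>12 page index.
static int store_needs_invalidation_example(u_int addr,const u_char *invalid_code_tbl)
{
  return invalid_code_tbl[addr>>12]!=1;
}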
3190
3191 void storelr_assemble(int i,struct regstat *i_regs)
3192 {
3193   int s,th,tl;
3194   int temp;
3195   int temp2;
3196   int offset;
3197   int jaddr=0,jaddr2;
3198   int case1,case2,case3;
3199   int done0,done1,done2;
3200   int memtarget=0,c=0;
3201   int agr=AGEN1+(i&1);
3202   u_int hr,reglist=0;
3203   th=get_reg(i_regs->regmap,rs2[i]|64);
3204   tl=get_reg(i_regs->regmap,rs2[i]);
3205   s=get_reg(i_regs->regmap,rs1[i]);
3206   temp=get_reg(i_regs->regmap,agr);
3207   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3208   offset=imm[i];
3209   if(s>=0) {
3210     c=(i_regs->isconst>>s)&1;
3211     memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3212     if(using_tlb&&((signed int)(constmap[i][s]+offset))>=(signed int)0xC0000000) memtarget=1;
3213   }
3214   assert(tl>=0);
3215   for(hr=0;hr<HOST_REGS;hr++) {
3216     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3217   }
3218   if(tl>=0) {
3219     assert(temp>=0);
3220     if(!using_tlb) {
3221       if(!c) {
3222         emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3223         if(!offset&&s!=temp) emit_mov(s,temp);
3224         jaddr=(int)out;
3225         emit_jno(0);
3226       }
3227       else
3228       {
3229         if(!memtarget||!rs1[i]) {
3230           jaddr=(int)out;
3231           emit_jmp(0);
3232         }
3233       }
3234       if((u_int)rdram!=0x80000000) 
3235         emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3236     }else{ // using tlb
3237       int map=get_reg(i_regs->regmap,TLREG);
3238       assert(map>=0);
3239       map=do_tlb_w(c||s<0||offset?temp:s,temp,map,0,c,constmap[i][s]+offset);
3240       if(!c&&!offset&&s>=0) emit_mov(s,temp);
3241       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr);
3242       if(!jaddr&&!memtarget) {
3243         jaddr=(int)out;
3244         emit_jmp(0);
3245       }
3246       gen_tlb_addr_w(temp,map);
3247     }
3248
3249     if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3250       temp2=get_reg(i_regs->regmap,FTEMP);
3251       if(!rs2[i]) temp2=th=tl;
3252     }
3253
3254 #ifndef BIG_ENDIAN_MIPS
3255     emit_xorimm(temp,3,temp);
3256 #endif
3257     emit_testimm(temp,2);
3258     case2=(int)out;
3259     emit_jne(0);
3260     emit_testimm(temp,1);
3261     case1=(int)out;
3262     emit_jne(0);
3263     // 0
3264     if (opcode[i]==0x2A) { // SWL
3265       emit_writeword_indexed(tl,0,temp);
3266     }
3267     if (opcode[i]==0x2E) { // SWR
3268       emit_writebyte_indexed(tl,3,temp);
3269     }
3270     if (opcode[i]==0x2C) { // SDL
3271       emit_writeword_indexed(th,0,temp);
3272       if(rs2[i]) emit_mov(tl,temp2);
3273     }
3274     if (opcode[i]==0x2D) { // SDR
3275       emit_writebyte_indexed(tl,3,temp);
3276       if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3277     }
3278     done0=(int)out;
3279     emit_jmp(0);
3280     // 1
3281     set_jump_target(case1,(int)out);
3282     if (opcode[i]==0x2A) { // SWL
3283       // Write 3 msb into three least significant bytes
3284       if(rs2[i]) emit_rorimm(tl,8,tl);
3285       emit_writehword_indexed(tl,-1,temp);
3286       if(rs2[i]) emit_rorimm(tl,16,tl);
3287       emit_writebyte_indexed(tl,1,temp);
3288       if(rs2[i]) emit_rorimm(tl,8,tl);
3289     }
3290     if (opcode[i]==0x2E) { // SWR
3291       // Write two lsb into two most significant bytes
3292       emit_writehword_indexed(tl,1,temp);
3293     }
3294     if (opcode[i]==0x2C) { // SDL
3295       if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3296       // Write 3 msb into three least significant bytes
3297       if(rs2[i]) emit_rorimm(th,8,th);
3298       emit_writehword_indexed(th,-1,temp);
3299       if(rs2[i]) emit_rorimm(th,16,th);
3300       emit_writebyte_indexed(th,1,temp);
3301       if(rs2[i]) emit_rorimm(th,8,th);
3302     }
3303     if (opcode[i]==0x2D) { // SDR
3304       if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3305       // Write two lsb into two most significant bytes
3306       emit_writehword_indexed(tl,1,temp);
3307     }
3308     done1=(int)out;
3309     emit_jmp(0);
3310     // 2
3311     set_jump_target(case2,(int)out);
3312     emit_testimm(temp,1);
3313     case3=(int)out;
3314     emit_jne(0);
3315     if (opcode[i]==0x2A) { // SWL
3316       // Write two msb into two least significant bytes
3317       if(rs2[i]) emit_rorimm(tl,16,tl);
3318       emit_writehword_indexed(tl,-2,temp);
3319       if(rs2[i]) emit_rorimm(tl,16,tl);
3320     }
3321     if (opcode[i]==0x2E) { // SWR
3322       // Write 3 lsb into three most significant bytes
3323       emit_writebyte_indexed(tl,-1,temp);
3324       if(rs2[i]) emit_rorimm(tl,8,tl);
3325       emit_writehword_indexed(tl,0,temp);
3326       if(rs2[i]) emit_rorimm(tl,24,tl);
3327     }
3328     if (opcode[i]==0x2C) { // SDL
3329       if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3330       // Write two msb into two least significant bytes
3331       if(rs2[i]) emit_rorimm(th,16,th);
3332       emit_writehword_indexed(th,-2,temp);
3333       if(rs2[i]) emit_rorimm(th,16,th);
3334     }
3335     if (opcode[i]==0x2D) { // SDR
3336       if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3337       // Write 3 lsb into three most significant bytes
3338       emit_writebyte_indexed(tl,-1,temp);
3339       if(rs2[i]) emit_rorimm(tl,8,tl);
3340       emit_writehword_indexed(tl,0,temp);
3341       if(rs2[i]) emit_rorimm(tl,24,tl);
3342     }
3343     done2=(int)out;
3344     emit_jmp(0);
3345     // 3
3346     set_jump_target(case3,(int)out);
3347     if (opcode[i]==0x2A) { // SWL
3348       // Write msb into least significant byte
3349       if(rs2[i]) emit_rorimm(tl,24,tl);
3350       emit_writebyte_indexed(tl,-3,temp);
3351       if(rs2[i]) emit_rorimm(tl,8,tl);
3352     }
3353     if (opcode[i]==0x2E) { // SWR
3354       // Write entire word
3355       emit_writeword_indexed(tl,-3,temp);
3356     }
3357     if (opcode[i]==0x2C) { // SDL
3358       if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3359       // Write msb into least significant byte
3360       if(rs2[i]) emit_rorimm(th,24,th);
3361       emit_writebyte_indexed(th,-3,temp);
3362       if(rs2[i]) emit_rorimm(th,8,th);
3363     }
3364     if (opcode[i]==0x2D) { // SDR
3365       if(rs2[i]) emit_mov(th,temp2);
3366       // Write entire word
3367       emit_writeword_indexed(tl,-3,temp);
3368     }
3369     set_jump_target(done0,(int)out);
3370     set_jump_target(done1,(int)out);
3371     set_jump_target(done2,(int)out);
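    // SDL/SDR still have to store the other word of the 64-bit value
    // (assembled into temp2 above); bit 2 of the address selects whether
    // that word lies in the neighbouring aligned word (+4 for SDL, -4 for SDR).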
3372     if (opcode[i]==0x2C) { // SDL
3373       emit_testimm(temp,4);
3374       done0=(int)out;
3375       emit_jne(0);
3376       emit_andimm(temp,~3,temp);
3377       emit_writeword_indexed(temp2,4,temp);
3378       set_jump_target(done0,(int)out);
3379     }
3380     if (opcode[i]==0x2D) { // SDR
3381       emit_testimm(temp,4);
3382       done0=(int)out;
3383       emit_jeq(0);
3384       emit_andimm(temp,~3,temp);
3385       emit_writeword_indexed(temp2,-4,temp);
3386       set_jump_target(done0,(int)out);
3387     }
3388     if(!c||!memtarget)
3389       add_stub(STORELR_STUB,jaddr,(int)out,i,(int)i_regs,temp,ccadj[i],reglist);
3390   }
3391   if(!using_tlb) {
3392     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3393     #if defined(HOST_IMM8)
3394     int ir=get_reg(i_regs->regmap,INVCP);
3395     assert(ir>=0);
3396     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3397     #else
3398     emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3399     #endif
3400     jaddr2=(int)out;
3401     emit_jne(0);
3402     add_stub(INVCODE_STUB,jaddr2,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3403   }
3404   /*
3405     emit_pusha();
3406     //save_regs(0x100f);
3407         emit_readword((int)&last_count,ECX);
3408         if(get_reg(i_regs->regmap,CCREG)<0)
3409           emit_loadreg(CCREG,HOST_CCREG);
3410         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3411         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3412         emit_writeword(HOST_CCREG,(int)&Count);
3413     emit_call((int)memdebug);
3414     emit_popa();
3415     //restore_regs(0x100f);
3416   /**/
3417 }
3418
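// Assemble a COP1 load/store (LWC1/LDC1/SWC1/SDC1): check that COP1 is
// usable (otherwise branch to an FP exception stub), generate the address,
// and move the value through the reg_cop1_simple/reg_cop1_double pointer
// tables, taking a load/store stub when the access cannot be done inline.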
3419 void c1ls_assemble(int i,struct regstat *i_regs)
3420 {
3421 #ifndef DISABLE_COP1
3422   int s,th,tl;
3423   int temp,ar;
3424   int map=-1;
3425   int offset;
3426   int c=0;
3427   int jaddr,jaddr2=0,jaddr3,type;
3428   int agr=AGEN1+(i&1);
3429   u_int hr,reglist=0;
3430   th=get_reg(i_regs->regmap,FTEMP|64);
3431   tl=get_reg(i_regs->regmap,FTEMP);
3432   s=get_reg(i_regs->regmap,rs1[i]);
3433   temp=get_reg(i_regs->regmap,agr);
3434   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3435   offset=imm[i];
3436   assert(tl>=0);
3437   assert(rs1[i]>0);
3438   assert(temp>=0);
3439   for(hr=0;hr<HOST_REGS;hr++) {
3440     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3441   }
3442   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
3443   if (opcode[i]==0x31||opcode[i]==0x35) // LWC1/LDC1
3444   {
3445     // Loads use a temporary register which we need to save
3446     reglist|=1<<temp;
3447   }
3448   if (opcode[i]==0x39||opcode[i]==0x3D) // SWC1/SDC1
3449     ar=temp;
3450   else // LWC1/LDC1
3451     ar=tl;
3452   //if(s<0) emit_loadreg(rs1[i],ar); //address_generation does this now
3453   //else c=(i_regs->wasconst>>s)&1;
3454   if(s>=0) c=(i_regs->wasconst>>s)&1;
3455   // Check cop1 unusable
3456   if(!cop1_usable) {
3457     signed char rs=get_reg(i_regs->regmap,CSREG);
3458     assert(rs>=0);
3459     emit_testimm(rs,0x20000000);
3460     jaddr=(int)out;
3461     emit_jeq(0);
3462     add_stub(FP_STUB,jaddr,(int)out,i,rs,(int)i_regs,is_delayslot,0);
3463     cop1_usable=1;
3464   }
3465   if (opcode[i]==0x39) { // SWC1 (get float address)
3466     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],tl);
3467   }
3468   if (opcode[i]==0x3D) { // SDC1 (get double address)
3469     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],tl);
3470   }
3471   // Generate address + offset
3472   if(!using_tlb) {
3473     if(!c)
3474       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3475   }
3476   else
3477   {
3478     map=get_reg(i_regs->regmap,TLREG);
3479     assert(map>=0);
3480     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3481       map=do_tlb_r(offset||c||s<0?ar:s,ar,map,0,-1,-1,c,constmap[i][s]+offset);
3482     }
3483     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3484       map=do_tlb_w(offset||c||s<0?ar:s,ar,map,0,c,constmap[i][s]+offset);
3485     }
3486   }
3487   if (opcode[i]==0x39) { // SWC1 (read float)
3488     emit_readword_indexed(0,tl,tl);
3489   }
3490   if (opcode[i]==0x3D) { // SDC1 (read double)
3491     emit_readword_indexed(4,tl,th);
3492     emit_readword_indexed(0,tl,tl);
3493   }
3494   if (opcode[i]==0x31) { // LWC1 (get target address)
3495     emit_readword((int)&reg_cop1_simple[(source[i]>>16)&0x1f],temp);
3496   }
3497   if (opcode[i]==0x35) { // LDC1 (get target address)
3498     emit_readword((int)&reg_cop1_double[(source[i]>>16)&0x1f],temp);
3499   }
3500   if(!using_tlb) {
3501     if(!c) {
3502       jaddr2=(int)out;
3503       emit_jno(0);
3504     }
3505     else if(((signed int)(constmap[i][s]+offset))>=(signed int)0x80000000+RAM_SIZE) {
3506       jaddr2=(int)out;
3507       emit_jmp(0); // inline_readstub/inline_writestub?  Very rare case
3508     }
3509     #ifdef DESTRUCTIVE_SHIFT
3510     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3511       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3512     }
3513     #endif
3514   }else{
3515     if (opcode[i]==0x31||opcode[i]==0x35) { // LWC1/LDC1
3516       do_tlb_r_branch(map,c,constmap[i][s]+offset,&jaddr2);
3517     }
3518     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3519       do_tlb_w_branch(map,c,constmap[i][s]+offset,&jaddr2);
3520     }
3521   }
3522   if (opcode[i]==0x31) { // LWC1
3523     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3524     //gen_tlb_addr_r(ar,map);
3525     //emit_readword_indexed((int)rdram-0x80000000,tl,tl);
3526     #ifdef HOST_IMM_ADDR32
3527     if(c) emit_readword_tlb(constmap[i][s]+offset,map,tl);
3528     else
3529     #endif
3530     emit_readword_indexed_tlb(0,offset||c||s<0?tl:s,map,tl);
3531     type=LOADW_STUB;
3532   }
3533   if (opcode[i]==0x35) { // LDC1
3534     assert(th>=0);
3535     //if(s>=0&&!c&&!offset) emit_mov(s,tl);
3536     //gen_tlb_addr_r(ar,map);
3537     //emit_readword_indexed((int)rdram-0x80000000,tl,th);
3538     //emit_readword_indexed((int)rdram-0x7FFFFFFC,tl,tl);
3539     #ifdef HOST_IMM_ADDR32
3540     if(c) emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
3541     else
3542     #endif
3543     emit_readdword_indexed_tlb(0,offset||c||s<0?tl:s,map,th,tl);
3544     type=LOADD_STUB;
3545   }
3546   if (opcode[i]==0x39) { // SWC1
3547     //emit_writeword_indexed(tl,(int)rdram-0x80000000,temp);
3548     emit_writeword_indexed_tlb(tl,0,offset||c||s<0?temp:s,map,temp);
3549     type=STOREW_STUB;
3550   }
3551   if (opcode[i]==0x3D) { // SDC1
3552     assert(th>=0);
3553     //emit_writeword_indexed(th,(int)rdram-0x80000000,temp);
3554     //emit_writeword_indexed(tl,(int)rdram-0x7FFFFFFC,temp);
3555     emit_writedword_indexed_tlb(th,tl,0,offset||c||s<0?temp:s,map,temp);
3556     type=STORED_STUB;
3557   }
3558   if(!using_tlb) {
3559     if (opcode[i]==0x39||opcode[i]==0x3D) { // SWC1/SDC1
3560       #ifndef DESTRUCTIVE_SHIFT
3561       temp=offset||c||s<0?ar:s;
3562       #endif
3563       #if defined(HOST_IMM8)
3564       int ir=get_reg(i_regs->regmap,INVCP);
3565       assert(ir>=0);
3566       emit_cmpmem_indexedsr12_reg(ir,temp,1);
3567       #else
3568       emit_cmpmem_indexedsr12_imm((int)invalid_code,temp,1);
3569       #endif
3570       jaddr3=(int)out;
3571       emit_jne(0);
3572       add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3573     }
3574   }
3575   if(jaddr2) add_stub(type,jaddr2,(int)out,i,offset||c||s<0?ar:s,(int)i_regs,ccadj[i],reglist);
3576   if (opcode[i]==0x31) { // LWC1 (write float)
3577     emit_writeword_indexed(tl,0,temp);
3578   }
3579   if (opcode[i]==0x35) { // LDC1 (write double)
3580     emit_writeword_indexed(th,4,temp);
3581     emit_writeword_indexed(tl,0,temp);
3582   }
3583   //if(opcode[i]==0x39)
3584   /*if(opcode[i]==0x39||opcode[i]==0x31)
3585   {
3586     emit_pusha();
3587         emit_readword((int)&last_count,ECX);
3588         if(get_reg(i_regs->regmap,CCREG)<0)
3589           emit_loadreg(CCREG,HOST_CCREG);
3590         emit_add(HOST_CCREG,ECX,HOST_CCREG);
3591         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
3592         emit_writeword(HOST_CCREG,(int)&Count);
3593     emit_call((int)memdebug);
3594     emit_popa();
3595   }/**/
3596 #else
3597   cop1_unusable(i, i_regs);
3598 #endif
3599 }
3600
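// Assemble a GTE (COP2) load/store (LWC2/SWC2): the data register is moved
// to/from FTEMP with cop2_get_dreg/cop2_put_dreg, RAM accesses are done
// inline, and anything else goes through a LOADW/STOREW stub; stores also
// check invalid_code to catch writes over compiled code.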
3601 void c2ls_assemble(int i,struct regstat *i_regs)
3602 {
3603   int s,tl;
3604   int ar;
3605   int offset;
3606   int memtarget=0,c=0;
3607   int jaddr,jaddr2=0,jaddr3,type;
3608   int agr=AGEN1+(i&1);
3609   u_int hr,reglist=0;
3610   u_int copr=(source[i]>>16)&0x1f;
3611   s=get_reg(i_regs->regmap,rs1[i]);
3612   tl=get_reg(i_regs->regmap,FTEMP);
3613   offset=imm[i];
3614   assert(rs1[i]>0);
3615   assert(tl>=0);
3616   assert(!using_tlb);
3617
3618   for(hr=0;hr<HOST_REGS;hr++) {
3619     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3620   }
3621   if(i_regs->regmap[HOST_CCREG]==CCREG)
3622     reglist&=~(1<<HOST_CCREG);
3623
3624   // get the address
3625   if (opcode[i]==0x3a) { // SWC2
3626     ar=get_reg(i_regs->regmap,agr);
3627     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3628     reglist|=1<<ar;
3629   } else { // LWC2
3630     ar=tl;
3631   }
3632   if(s>=0) c=(i_regs->wasconst>>s)&1;
3633   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3634   if (!offset&&!c&&s>=0) ar=s;
3635   assert(ar>=0);
3636
3637   if (opcode[i]==0x3a) { // SWC2
3638     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3639     type=STOREW_STUB;
3640   }
3641   else
3642     type=LOADW_STUB;
3643
3644   if(c&&!memtarget) {
3645     jaddr2=(int)out;
3646     emit_jmp(0); // inline_readstub/inline_writestub?
3647   }
3648   else {
3649     if(!c) {
3650       emit_cmpimm(offset||c||s<0?ar:s,RAM_SIZE);
3651       jaddr2=(int)out;
3652       emit_jno(0);
3653     }
3654     if (opcode[i]==0x32) { // LWC2
3655       #ifdef HOST_IMM_ADDR32
3656       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3657       else
3658       #endif
3659       emit_readword_indexed(0,ar,tl);
3660     }
3661     if (opcode[i]==0x3a) { // SWC2
3662       #ifdef DESTRUCTIVE_SHIFT
3663       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3664       #endif
3665       emit_writeword_indexed(tl,0,ar);
3666     }
3667   }
3668   if(jaddr2)
3669     add_stub(type,jaddr2,(int)out,i,ar,(int)i_regs,ccadj[i],reglist);
3670   if (opcode[i]==0x3a) { // SWC2
3671 #if defined(HOST_IMM8)
3672     int ir=get_reg(i_regs->regmap,INVCP);
3673     assert(ir>=0);
3674     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3675 #else
3676     emit_cmpmem_indexedsr12_imm((int)invalid_code,ar,1);
3677 #endif
3678     jaddr3=(int)out;
3679     emit_jne(0);
3680     add_stub(INVCODE_STUB,jaddr3,(int)out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3681   }
3682   if (opcode[i]==0x32) { // LWC2
3683     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3684   }
3685 }
3686
3687 #ifndef multdiv_assemble
3688 void multdiv_assemble(int i,struct regstat *i_regs)
3689 {
3690   printf("Need multdiv_assemble for this architecture.\n");
3691   exit(1);
3692 }
3693 #endif
3694
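// Assemble MFHI/MFLO/MTHI/MTLO-style moves: copy the source register
// (and its upper half, if one is allocated) into the destination.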
3695 void mov_assemble(int i,struct regstat *i_regs)
3696 {
3697   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3698   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3699   assert(rt1[i]>0);
3700   if(rt1[i]) {
3701     signed char sh,sl,th,tl;
3702     th=get_reg(i_regs->regmap,rt1[i]|64);
3703     tl=get_reg(i_regs->regmap,rt1[i]);
3704     //assert(tl>=0);
3705     if(tl>=0) {
3706       sh=get_reg(i_regs->regmap,rs1[i]|64);
3707       sl=get_reg(i_regs->regmap,rs1[i]);
3708       if(sl>=0) emit_mov(sl,tl);
3709       else emit_loadreg(rs1[i],tl);
3710       if(th>=0) {
3711         if(sh>=0) emit_mov(sh,th);
3712         else emit_loadreg(rs1[i]|64,th);
3713       }
3714     }
3715   }
3716 }
3717
3718 #ifndef fconv_assemble
3719 void fconv_assemble(int i,struct regstat *i_regs)
3720 {
3721   printf("Need fconv_assemble for this architecture.\n");
3722   exit(1);
3723 }
3724 #endif
3725
3726 #if 0
3727 void float_assemble(int i,struct regstat *i_regs)
3728 {
3729   printf("Need float_assemble for this architecture.\n");
3730   exit(1);
3731 }
3732 #endif
3733
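// SYSCALL exits the compiled block: load the PC of the instruction,
// add the cycles accumulated so far, and jump to the HLE syscall handler.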
3734 void syscall_assemble(int i,struct regstat *i_regs)
3735 {
3736   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3737   assert(ccreg==HOST_CCREG);
3738   assert(!is_delayslot);
3739   emit_movimm(start+i*4,EAX); // Get PC
3740   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3741   emit_jmp((int)jump_syscall_hle); // XXX
3742 }
3743
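// HLE BIOS call: pass the address of the following instruction and the
// psxHLEt[] handler to jump_hlecall, charging the cycles accumulated so far.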
3744 void hlecall_assemble(int i,struct regstat *i_regs)
3745 {
3746   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3747   assert(ccreg==HOST_CCREG);
3748   assert(!is_delayslot);
3749   emit_movimm(start+i*4+4,0); // Get PC
3750   emit_movimm((int)psxHLEt[source[i]&7],1);
3751   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG); // XXX
3752   emit_jmp((int)jump_hlecall);
3753 }
3754
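// Assemble the instruction in a branch delay slot.  This just dispatches
// on itype with is_delayslot set; a branch in the delay slot is not
// supported and is reported as a probable bug.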
3755 void ds_assemble(int i,struct regstat *i_regs)
3756 {
3757   is_delayslot=1;
3758   switch(itype[i]) {
3759     case ALU:
3760       alu_assemble(i,i_regs);break;
3761     case IMM16:
3762       imm16_assemble(i,i_regs);break;
3763     case SHIFT:
3764       shift_assemble(i,i_regs);break;
3765     case SHIFTIMM:
3766       shiftimm_assemble(i,i_regs);break;
3767     case LOAD:
3768       load_assemble(i,i_regs);break;
3769     case LOADLR:
3770       loadlr_assemble(i,i_regs);break;
3771     case STORE:
3772       store_assemble(i,i_regs);break;
3773     case STORELR:
3774       storelr_assemble(i,i_regs);break;
3775     case COP0:
3776       cop0_assemble(i,i_regs);break;
3777     case COP1:
3778       cop1_assemble(i,i_regs);break;
3779     case C1LS:
3780       c1ls_assemble(i,i_regs);break;
3781     case COP2:
3782       cop2_assemble(i,i_regs);break;
3783     case C2LS:
3784       c2ls_assemble(i,i_regs);break;
3785     case C2OP:
3786       c2op_assemble(i,i_regs);break;
3787     case FCONV:
3788       fconv_assemble(i,i_regs);break;
3789     case FLOAT:
3790       float_assemble(i,i_regs);break;
3791     case FCOMP:
3792       fcomp_assemble(i,i_regs);break;
3793     case MULTDIV:
3794       multdiv_assemble(i,i_regs);break;
3795     case MOV:
3796       mov_assemble(i,i_regs);break;
3797     case SYSCALL:
3798     case HLECALL:
3799     case SPAN:
3800     case UJUMP:
3801     case RJUMP:
3802     case CJUMP:
3803     case SJUMP:
3804     case FJUMP:
3805       printf("Jump in the delay slot.  This is probably a bug.\n");
3806   }
3807   is_delayslot=0;
3808 }
3809
3810 // Is the branch target a valid internal jump?
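// A target is internal when it lies inside the block currently being
// compiled and is not a register-indirect jump (marked by the low address
// bit).  If the target requires 32-bit values that the current state does
// not guarantee (requires_32bit), the branch is treated as external.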
3811 int internal_branch(uint64_t i_is32,int addr)
3812 {
3813   if(addr&1) return 0; // Indirect (register) jump
3814   if(addr>=start && addr<start+slen*4-4)
3815   {
3816     int t=(addr-start)>>2;
3817     // Delay slots are not valid branch targets
3818     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3819     // 64 -> 32 bit transition requires a recompile
3820     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3821     {
3822       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3823       else printf("optimizable: yes\n");
3824     }*/
3825     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3826     if(requires_32bit[t]&~i_is32) return 0;
3827     else return 1;
3828   }
3829   return 0;
3830 }
3831
3832 #ifndef wb_invalidate
3833 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3834   uint64_t u,uint64_t uu)
3835 {
3836   int hr;
3837   for(hr=0;hr<HOST_REGS;hr++) {
3838     if(hr!=EXCLUDE_REG) {
3839       if(pre[hr]!=entry[hr]) {
3840         if(pre[hr]>=0) {
3841           if((dirty>>hr)&1) {
3842             if(get_reg(entry,pre[hr])<0) {
3843               if(pre[hr]<64) {
3844                 if(!((u>>pre[hr])&1)) {
3845                   emit_storereg(pre[hr],hr);
3846                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3847                     emit_sarimm(hr,31,hr);
3848                     emit_storereg(pre[hr]|64,hr);
3849                   }
3850                 }
3851               }else{
3852                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3853                   emit_storereg(pre[hr],hr);
3854                 }
3855               }
3856             }
3857           }
3858         }
3859       }
3860     }
3861   }
3862   // Move from one register to another (no writeback)
3863   for(hr=0;hr<HOST_REGS;hr++) {
3864     if(hr!=EXCLUDE_REG) {
3865       if(pre[hr]!=entry[hr]) {
3866         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3867           int nr;
3868           if((nr=get_reg(entry,pre[hr]))>=0) {
3869             emit_mov(hr,nr);
3870           }
3871         }
3872       }
3873     }
3874   }
3875 }
3876 #endif
3877
3878 // Load the specified registers
3879 // This only loads the registers given as arguments because
3880 // we don't want to load things that will be overwritten
3881 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3882 {
3883   int hr;
3884   // Load 32-bit regs
3885   for(hr=0;hr<HOST_REGS;hr++) {
3886     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3887       if(entry[hr]!=regmap[hr]) {
3888         if(regmap[hr]==rs1||regmap[hr]==rs2)
3889         {
3890           if(regmap[hr]==0) {
3891             emit_zeroreg(hr);
3892           }
3893           else
3894           {
3895             emit_loadreg(regmap[hr],hr);
3896           }
3897         }
3898       }
3899     }
3900   }
3901   // Load 64-bit regs

3902   for(hr=0;hr<HOST_REGS;hr++) {
3903     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3904       if(entry[hr]!=regmap[hr]) {
3905         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3906         {
3907           assert(regmap[hr]!=64);
3908           if((is32>>(regmap[hr]&63))&1) {
3909             int lr=get_reg(regmap,regmap[hr]-64);
3910             if(lr>=0)
3911               emit_sarimm(lr,31,hr);
3912             else
3913               emit_loadreg(regmap[hr],hr);
3914           }
3915           else
3916           {
3917             emit_loadreg(regmap[hr],hr);
3918           }
3919         }
3920       }
3921     }
3922   }
3923 }
3924
3925 // Load registers prior to the start of a loop
3926 // so that they are not loaded within the loop
3927 static void loop_preload(signed char pre[],signed char entry[])
3928 {
3929   int hr;
3930   for(hr=0;hr<HOST_REGS;hr++) {
3931     if(hr!=EXCLUDE_REG) {
3932       if(pre[hr]!=entry[hr]) {
3933         if(entry[hr]>=0) {
3934           if(get_reg(pre,entry[hr])<0) {
3935             assem_debug("loop preload:\n");
3936             //printf("loop preload: %d\n",hr);
3937             if(entry[hr]==0) {
3938               emit_zeroreg(hr);
3939             }
3940             else if(entry[hr]<TEMPREG)
3941             {
3942               emit_loadreg(entry[hr],hr);
3943             }
3944             else if(entry[hr]-64<TEMPREG)
3945             {
3946               emit_loadreg(entry[hr],hr);
3947             }
3948           }
3949         }
3950       }
3951     }
3952   }
3953 }
3954
3955 // Generate address for load/store instruction
3956 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
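// The address is base register + 16-bit immediate; constant bases are
// folded at compile time, and LWL/LWR / LDL/LDR addresses are masked down
// to word/doubleword alignment.  The address (and, with TLB, the mapper
// entry) for the following instruction is also precomputed here.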
3957 void address_generation(int i,struct regstat *i_regs,signed char entry[])
3958 {
3959   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
3960     int ra;
3961     int agr=AGEN1+(i&1);
3962     int mgr=MGEN1+(i&1);
3963     if(itype[i]==LOAD) {
3964       ra=get_reg(i_regs->regmap,rt1[i]);
3965       //if(rt1[i]) assert(ra>=0);
3966     }
3967     if(itype[i]==LOADLR) {
3968       ra=get_reg(i_regs->regmap,FTEMP);
3969     }
3970     if(itype[i]==STORE||itype[i]==STORELR) {
3971       ra=get_reg(i_regs->regmap,agr);
3972       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3973     }
3974     if(itype[i]==C1LS||itype[i]==C2LS) {
3975       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
3976         ra=get_reg(i_regs->regmap,FTEMP);
3977       else { // SWC1/SDC1/SWC2/SDC2
3978         ra=get_reg(i_regs->regmap,agr);
3979         if(ra<0) ra=get_reg(i_regs->regmap,-1);
3980       }
3981     }
3982     int rs=get_reg(i_regs->regmap,rs1[i]);
3983     int rm=get_reg(i_regs->regmap,TLREG);
3984     if(ra>=0) {
3985       int offset=imm[i];
3986       int c=(i_regs->wasconst>>rs)&1;
3987       if(rs1[i]==0) {
3988         // Using r0 as a base address
3989         /*if(rm>=0) {
3990           if(!entry||entry[rm]!=mgr) {
3991             generate_map_const(offset,rm);
3992           } // else did it in the previous cycle
3993         }*/
3994         if(!entry||entry[ra]!=agr) {
3995           if (opcode[i]==0x22||opcode[i]==0x26) {
3996             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3997           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3998             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3999           }else{
4000             emit_movimm(offset,ra);
4001           }
4002         } // else did it in the previous cycle
4003       }
4004       else if(rs<0) {
4005         if(!entry||entry[ra]!=rs1[i])
4006           emit_loadreg(rs1[i],ra);
4007         //if(!entry||entry[ra]!=rs1[i])
4008         //  printf("poor load scheduling!\n");
4009       }
4010       else if(c) {
4011         if(rm>=0) {
4012           if(!entry||entry[rm]!=mgr) {
4013             if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a) {
4014               // Stores to memory go through the mapper to detect self-modifying
4015               // code, loads don't.
4016               if((unsigned int)(constmap[i][rs]+offset)>=0xC0000000 ||
4017                  (unsigned int)(constmap[i][rs]+offset)<0x80000000+RAM_SIZE )
4018                 generate_map_const(constmap[i][rs]+offset,rm);
4019             }else{
4020               if((signed int)(constmap[i][rs]+offset)>=(signed int)0xC0000000)
4021                 generate_map_const(constmap[i][rs]+offset,rm);
4022             }
4023           }
4024         }
4025         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
4026           if(!entry||entry[ra]!=agr) {
4027             if (opcode[i]==0x22||opcode[i]==0x26) {
4028               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4029             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
4030               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4031             }else{
4032               #ifdef HOST_IMM_ADDR32
4033               if((itype[i]!=LOAD&&(opcode[i]&0x3b)!=0x31&&(opcode[i]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4034                  (using_tlb&&((signed int)constmap[i][rs]+offset)>=(signed int)0xC0000000))
4035               #endif
4036               emit_movimm(constmap[i][rs]+offset,ra);
4037             }
4038           } // else did it in the previous cycle
4039         } // else load_consts already did it
4040       }
4041       if(offset&&!c&&rs1[i]) {
4042         if(rs>=0) {
4043           emit_addimm(rs,offset,ra);
4044         }else{
4045           emit_addimm(ra,offset,ra);
4046         }
4047       }
4048     }
4049   }
4050   // Preload constants for next instruction
4051   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
4052     int agr,ra;
4053     #ifndef HOST_IMM_ADDR32
4054     // Mapper entry
4055     agr=MGEN1+((i+1)&1);
4056     ra=get_reg(i_regs->regmap,agr);
4057     if(ra>=0) {
4058       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4059       int offset=imm[i+1];
4060       int c=(regs[i+1].wasconst>>rs)&1;
4061       if(c) {
4062         if(itype[i+1]==STORE||itype[i+1]==STORELR
4063            ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1, SWC2/SDC2
4064           // Stores to memory go thru the mapper to detect self-modifying
4065           // code, loads don't.
4066           if((unsigned int)(constmap[i+1][rs]+offset)>=0xC0000000 ||
4067              (unsigned int)(constmap[i+1][rs]+offset)<0x80000000+RAM_SIZE )
4068             generate_map_const(constmap[i+1][rs]+offset,ra);
4069         }else{
4070           if((signed int)(constmap[i+1][rs]+offset)>=(signed int)0xC0000000)
4071             generate_map_const(constmap[i+1][rs]+offset,ra);
4072         }
4073       }
4074       /*else if(rs1[i]==0) {
4075         generate_map_const(offset,ra);
4076       }*/
4077     }
4078     #endif
4079     // Actual address
4080     agr=AGEN1+((i+1)&1);
4081     ra=get_reg(i_regs->regmap,agr);
4082     if(ra>=0) {
4083       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
4084       int offset=imm[i+1];
4085       int c=(regs[i+1].wasconst>>rs)&1;
4086       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
4087         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4088           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
4089         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4090           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
4091         }else{
4092           #ifdef HOST_IMM_ADDR32
4093           if((itype[i+1]!=LOAD&&(opcode[i+1]&0x3b)!=0x31&&(opcode[i+1]&0x3b)!=0x32) || // LWC1/LDC1/LWC2/LDC2
4094              (using_tlb&&((signed int)constmap[i+1][rs]+offset)>=(signed int)0xC0000000))
4095           #endif
4096           emit_movimm(constmap[i+1][rs]+offset,ra);
4097         }
4098       }
4099       else if(rs1[i+1]==0) {
4100         // Using r0 as a base address
4101         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
4102           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
4103         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
4104           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
4105         }else{
4106           emit_movimm(offset,ra);
4107         }
4108       }
4109     }
4110   }
4111 }
4112
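// Find the value a constant register should finally hold: scan forward
// while it stays mapped to the same host register, stays constant and no
// branch target intervenes, so intermediate constants that are immediately
// overwritten are never materialized.  Returns zero when the value does
// not actually need to be loaded.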
4113 int get_final_value(int hr, int i, int *value)
4114 {
4115   int reg=regs[i].regmap[hr];
4116   while(i<slen-1) {
4117     if(regs[i+1].regmap[hr]!=reg) break;
4118     if(!((regs[i+1].isconst>>hr)&1)) break;
4119     if(bt[i+1]) break;
4120     i++;
4121   }
4122   if(i<slen-1) {
4123     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
4124       *value=constmap[i][hr];
4125       return 1;
4126     }
4127     if(!bt[i+1]) {
4128       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
4129         // Load in delay slot, out-of-order execution
4130         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
4131         {
4132           #ifdef HOST_IMM_ADDR32
4133           if(!using_tlb||((signed int)constmap[i][hr]+imm[i+2])<(signed int)0xC0000000) return 0;
4134           #endif
4135           // Precompute load address
4136           *value=constmap[i][hr]+imm[i+2];
4137           return 1;
4138         }
4139       }
4140       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
4141       {
4142         #ifdef HOST_IMM_ADDR32
4143         if(!using_tlb||((signed int)constmap[i][hr]+imm[i+1])<(signed int)0xC0000000) return 0;
4144         #endif
4145         // Precompute load address
4146         *value=constmap[i][hr]+imm[i+1];
4147         //printf("c=%x imm=%x\n",(int)constmap[i][hr],imm[i+1]);
4148         return 1;
4149       }
4150     }
4151   }
4152   *value=constmap[i][hr];
4153   //printf("c=%x\n",(int)constmap[i][hr]);
4154   if(i==slen-1) return 1;
4155   if(reg<64) {
4156     return !((unneeded_reg[i+1]>>reg)&1);
4157   }else{
4158     return !((unneeded_reg_upper[i+1]>>reg)&1);
4159   }
4160 }
4161
4162 // Load registers with known constants
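// For every host register that now holds a known constant and was not
// already constant before this instruction, emit the immediate (zero gets
// emit_zeroreg); get_final_value() is used so only the last value of a
// constant run is loaded.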
4163 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
4164 {
4165   int hr;
4166   // Load 32-bit regs
4167   for(hr=0;hr<HOST_REGS;hr++) {
4168     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4169       //if(entry[hr]!=regmap[hr]) {
4170       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4171         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4172           int value;
4173           if(get_final_value(hr,i,&value)) {
4174             if(value==0) {
4175               emit_zeroreg(hr);
4176             }
4177             else {
4178               emit_movimm(value,hr);
4179             }
4180           }
4181         }
4182       }
4183     }
4184   }
4185   // Load 64-bit regs
4186   for(hr=0;hr<HOST_REGS;hr++) {
4187     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
4188       //if(entry[hr]!=regmap[hr]) {
4189       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
4190         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4191           if((is32>>(regmap[hr]&63))&1) {
4192             int lr=get_reg(regmap,regmap[hr]-64);
4193             assert(lr>=0);
4194             emit_sarimm(lr,31,hr);
4195           }
4196           else
4197           {
4198             int value;
4199             if(get_final_value(hr,i,&value)) {
4200               if(value==0) {
4201                 emit_zeroreg(hr);
4202               }
4203               else {
4204                 emit_movimm(value,hr);
4205               }
4206             }
4207           }
4208         }
4209       }
4210     }
4211   }
4212 }
4213 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
4214 {
4215   int hr;
4216   // Load 32-bit regs
4217   for(hr=0;hr<HOST_REGS;hr++) {
4218     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4219       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
4220         int value=constmap[i][hr];
4221         if(value==0) {
4222           emit_zeroreg(hr);
4223         }
4224         else {
4225           emit_movimm(value,hr);
4226         }
4227       }
4228     }
4229   }
4230   // Load 64-bit regs
4231   for(hr=0;hr<HOST_REGS;hr++) {
4232     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
4233       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
4234         if((is32>>(regmap[hr]&63))&1) {
4235           int lr=get_reg(regmap,regmap[hr]-64);
4236           assert(lr>=0);
4237           emit_sarimm(lr,31,hr);
4238         }
4239         else
4240         {
4241           int value=constmap[i][hr];
4242           if(value==0) {
4243             emit_zeroreg(hr);
4244           }
4245           else {
4246             emit_movimm(value,hr);
4247           }
4248         }
4249       }
4250     }
4251   }
4252 }
4253
4254 // Write out all dirty registers (except cycle count)
4255 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
4256 {
4257   int hr;
4258   for(hr=0;hr<HOST_REGS;hr++) {
4259     if(hr!=EXCLUDE_REG) {
4260       if(i_regmap[hr]>0) {
4261         if(i_regmap[hr]!=CCREG) {
4262           if((i_dirty>>hr)&1) {
4263             if(i_regmap[hr]<64) {
4264               emit_storereg(i_regmap[hr],hr);
4265 #ifndef FORCE32
4266               if( ((i_is32>>i_regmap[hr])&1) ) {
4267                 #ifdef DESTRUCTIVE_WRITEBACK
4268                 emit_sarimm(hr,31,hr);
4269                 emit_storereg(i_regmap[hr]|64,hr);
4270                 #else
4271                 emit_sarimm(hr,31,HOST_TEMPREG);
4272                 emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4273                 #endif
4274               }
4275 #endif
4276             }else{
4277               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4278                 emit_storereg(i_regmap[hr],hr);
4279               }
4280             }
4281           }
4282         }
4283       }
4284     }
4285   }
4286 }
4287 // Write out dirty registers that we need to reload (pair with load_needed_regs)
4288 // This writes the registers not written by store_regs_bt
4289 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4290 {
4291   int hr;
4292   int t=(addr-start)>>2;
4293   for(hr=0;hr<HOST_REGS;hr++) {
4294     if(hr!=EXCLUDE_REG) {
4295       if(i_regmap[hr]>0) {
4296         if(i_regmap[hr]!=CCREG) {
4297           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4298             if((i_dirty>>hr)&1) {
4299               if(i_regmap[hr]<64) {
4300                 emit_storereg(i_regmap[hr],hr);
4301 #ifndef FORCE32
4302                 if( ((i_is32>>i_regmap[hr])&1) ) {
4303                   #ifdef DESTRUCTIVE_WRITEBACK
4304                   emit_sarimm(hr,31,hr);
4305                   emit_storereg(i_regmap[hr]|64,hr);
4306                   #else
4307                   emit_sarimm(hr,31,HOST_TEMPREG);
4308                   emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4309                   #endif
4310                 }
4311 #endif
4312               }else{
4313                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
4314                   emit_storereg(i_regmap[hr],hr);
4315                 }
4316               }
4317             }
4318           }
4319         }
4320       }
4321     }
4322   }
4323 }
4324
4325 // Load all registers (except cycle count)
4326 void load_all_regs(signed char i_regmap[])
4327 {
4328   int hr;
4329   for(hr=0;hr<HOST_REGS;hr++) {
4330     if(hr!=EXCLUDE_REG) {
4331       if(i_regmap[hr]==0) {
4332         emit_zeroreg(hr);
4333       }
4334       else
4335       if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4336       {
4337         emit_loadreg(i_regmap[hr],hr);
4338       }
4339     }
4340   }
4341 }
4342
4343 // Load all current registers also needed by next instruction
4344 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
4345 {
4346   int hr;
4347   for(hr=0;hr<HOST_REGS;hr++) {
4348     if(hr!=EXCLUDE_REG) {
4349       if(get_reg(next_regmap,i_regmap[hr])>=0) {
4350         if(i_regmap[hr]==0) {
4351           emit_zeroreg(hr);
4352         }
4353         else
4354         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG)
4355         {
4356           emit_loadreg(i_regmap[hr],hr);
4357         }
4358       }
4359     }
4360   }
4361 }
4362
4363 // Load all regs, storing cycle count if necessary
4364 void load_regs_entry(int t)
4365 {
4366   int hr;
4367   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER,HOST_CCREG);
4368   else if(ccadj[t]) emit_addimm(HOST_CCREG,-ccadj[t]*CLOCK_DIVIDER,HOST_CCREG);
4369   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4370     emit_storereg(CCREG,HOST_CCREG);
4371   }
4372   // Load 32-bit regs
4373   for(hr=0;hr<HOST_REGS;hr++) {
4374     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4375       if(regs[t].regmap_entry[hr]==0) {
4376         emit_zeroreg(hr);
4377       }
4378       else if(regs[t].regmap_entry[hr]!=CCREG)
4379       {
4380         emit_loadreg(regs[t].regmap_entry[hr],hr);
4381       }
4382     }
4383   }
4384   // Load 64-bit regs
4385   for(hr=0;hr<HOST_REGS;hr++) {
4386     if(regs[t].regmap_entry[hr]>=64) {
4387       assert(regs[t].regmap_entry[hr]!=64);
4388       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
4389         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4390         if(lr<0) {
4391           emit_loadreg(regs[t].regmap_entry[hr],hr);
4392         }
4393         else
4394         {
4395           emit_sarimm(lr,31,hr);
4396         }
4397       }
4398       else
4399       {
4400         emit_loadreg(regs[t].regmap_entry[hr],hr);
4401       }
4402     }
4403   }
4404 }
4405
4406 // Store dirty registers prior to branch
4407 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4408 {
4409   if(internal_branch(i_is32,addr))
4410   {
4411     int t=(addr-start)>>2;
4412     int hr;
4413     for(hr=0;hr<HOST_REGS;hr++) {
4414       if(hr!=EXCLUDE_REG) {
4415         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
4416           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4417             if((i_dirty>>hr)&1) {
4418               if(i_regmap[hr]<64) {
4419                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
4420                   emit_storereg(i_regmap[hr],hr);
4421                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
4422                     #ifdef DESTRUCTIVE_WRITEBACK
4423                     emit_sarimm(hr,31,hr);
4424                     emit_storereg(i_regmap[hr]|64,hr);
4425                     #else
4426                     emit_sarimm(hr,31,HOST_TEMPREG);
4427                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
4428                     #endif
4429                   }
4430                 }
4431               }else{
4432                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
4433                   emit_storereg(i_regmap[hr],hr);
4434                 }
4435               }
4436             }
4437           }
4438         }
4439       }
4440     }
4441   }
4442   else
4443   {
4444     // Branch out of this block, write out all dirty regs
4445     wb_dirtys(i_regmap,i_is32,i_dirty);
4446   }
4447 }
4448
4449 // Load all needed registers for branch target
4450 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4451 {
4452   //if(addr>=start && addr<(start+slen*4))
4453   if(internal_branch(i_is32,addr))
4454   {
4455     int t=(addr-start)>>2;
4456     int hr;
4457     // Store the cycle count before loading something else
4458     if(i_regmap[HOST_CCREG]!=CCREG) {
4459       assert(i_regmap[HOST_CCREG]==-1);
4460     }
4461     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
4462       emit_storereg(CCREG,HOST_CCREG);
4463     }
4464     // Load 32-bit regs
4465     for(hr=0;hr<HOST_REGS;hr++) {
4466       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<64) {
4467         #ifdef DESTRUCTIVE_WRITEBACK
4468         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
4469         #else
4470         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
4471         #endif
4472           if(regs[t].regmap_entry[hr]==0) {
4473             emit_zeroreg(hr);
4474           }
4475           else if(regs[t].regmap_entry[hr]!=CCREG)
4476           {
4477             emit_loadreg(regs[t].regmap_entry[hr],hr);
4478           }
4479         }
4480       }
4481     }
4482     // Load 64-bit regs
4483     for(hr=0;hr<HOST_REGS;hr++) {
4484       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64) {
4485         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
4486           assert(regs[t].regmap_entry[hr]!=64);
4487           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4488             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4489             if(lr<0) {
4490               emit_loadreg(regs[t].regmap_entry[hr],hr);
4491             }
4492             else
4493             {
4494               emit_sarimm(lr,31,hr);
4495             }
4496           }
4497           else
4498           {
4499             emit_loadreg(regs[t].regmap_entry[hr],hr);
4500           }
4501         }
4502         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
4503           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
4504           assert(lr>=0);
4505           emit_sarimm(lr,31,hr);
4506         }
4507       }
4508     }
4509   }
4510 }
4511
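// Can a branch to addr use the target's main entry point directly?
// Returns 1 when the current register mapping, dirty bits and 32-bit state
// are compatible with what the target block expects; external targets only
// match when nothing except the cycle count is dirty.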
4512 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
4513 {
4514   if(addr>=start && addr<start+slen*4-4)
4515   {
4516     int t=(addr-start)>>2;
4517     int hr;
4518     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
4519     for(hr=0;hr<HOST_REGS;hr++)
4520     {
4521       if(hr!=EXCLUDE_REG)
4522       {
4523         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
4524         {
4525           if(regs[t].regmap_entry[hr]!=-1)
4526           {
4527             return 0;
4528           }
4529           else 
4530           if((i_dirty>>hr)&1)
4531           {
4532             if(i_regmap[hr]<64)
4533             {
4534               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4535                 return 0;
4536             }
4537             else
4538             {
4539               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
4540                 return 0;
4541             }
4542           }
4543         }
4544         else // Same register but is it 32-bit or dirty?
4545         if(i_regmap[hr]>=0)
4546         {
4547           if(!((regs[t].dirty>>hr)&1))
4548           {
4549             if((i_dirty>>hr)&1)
4550             {
4551               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
4552               {
4553                 //printf("%x: dirty no match\n",addr);
4554                 return 0;
4555               }
4556             }
4557           }
4558           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
4559           {
4560             //printf("%x: is32 no match\n",addr);
4561             return 0;
4562           }
4563         }
4564       }
4565     }
4566     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
4567     if(requires_32bit[t]&~i_is32) return 0;
4568     // Delay slots are not valid branch targets
4569     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
4570     // Delay slots require additional processing, so do not match
4571     if(is_ds[t]) return 0;
4572   }
4573   else
4574   {
4575     int hr;
4576     for(hr=0;hr<HOST_REGS;hr++)
4577     {
4578       if(hr!=EXCLUDE_REG)
4579       {
4580         if(i_regmap[hr]>=0)
4581         {
4582           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
4583           {
4584             if((i_dirty>>hr)&1)
4585             {
4586               return 0;
4587             }
4588           }
4589         }
4590       }
4591     }
4592   }
4593   return 1;
4594 }
4595
4596 // Used when a branch jumps into the delay slot of another branch
4597 void ds_assemble_entry(int i)
4598 {
4599   int t=(ba[i]-start)>>2;
4600   if(!instr_addr[t]) instr_addr[t]=(u_int)out;
4601   assem_debug("Assemble delay slot at %x\n",ba[i]);
4602   assem_debug("<->\n");
4603   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
4604     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
4605   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
4606   address_generation(t,&regs[t],regs[t].regmap_entry);
4607   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
4608     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
4609   cop1_usable=0;
4610   is_delayslot=0;
4611   switch(itype[t]) {
4612     case ALU:
4613       alu_assemble(t,&regs[t]);break;
4614     case IMM16:
4615       imm16_assemble(t,&regs[t]);break;
4616     case SHIFT:
4617       shift_assemble(t,&regs[t]);break;
4618     case SHIFTIMM:
4619       shiftimm_assemble(t,&regs[t]);break;
4620     case LOAD:
4621       load_assemble(t,&regs[t]);break;
4622     case LOADLR:
4623       loadlr_assemble(t,&regs[t]);break;
4624     case STORE:
4625       store_assemble(t,&regs[t]);break;
4626     case STORELR:
4627       storelr_assemble(t,&regs[t]);break;
4628     case COP0:
4629       cop0_assemble(t,&regs[t]);break;
4630     case COP1:
4631       cop1_assemble(t,&regs[t]);break;
4632     case C1LS:
4633       c1ls_assemble(t,&regs[t]);break;
4634     case COP2:
4635       cop2_assemble(t,&regs[t]);break;
4636     case C2LS:
4637       c2ls_assemble(t,&regs[t]);break;
4638     case C2OP:
4639       c2op_assemble(t,&regs[t]);break;
4640     case FCONV:
4641       fconv_assemble(t,&regs[t]);break;
4642     case FLOAT:
4643       float_assemble(t,&regs[t]);break;
4644     case FCOMP:
4645       fcomp_assemble(t,&regs[t]);break;
4646     case MULTDIV:
4647       multdiv_assemble(t,&regs[t]);break;
4648     case MOV:
4649       mov_assemble(t,&regs[t]);break;
4650     case SYSCALL:
4651     case HLECALL:
4652     case SPAN:
4653     case UJUMP:
4654     case RJUMP:
4655     case CJUMP:
4656     case SJUMP:
4657     case FJUMP:
4658       printf("Jump in the delay slot.  This is probably a bug.\n");
4659   }
4660   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4661   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
4662   if(internal_branch(regs[t].is32,ba[i]+4))
4663     assem_debug("branch: internal\n");
4664   else
4665     assem_debug("branch: external\n");
4666   assert(internal_branch(regs[t].is32,ba[i]+4));
4667   add_to_linker((int)out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
4668   emit_jmp(0);
4669 }
4670
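// Emit the cycle counter check for a branch: add the cycles consumed so
// far (times CLOCK_DIVIDER) to HOST_CCREG and branch to a CC_STUB when the
// count has expired.  Simple idle loops (a branch to itself with a NOP in
// the delay slot) are special-cased so they do not spin down the counter.
// *adj returns the cycle adjustment already applied at an internal target.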
4671 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
4672 {
4673   int count;
4674   int jaddr;
4675   int idle=0;
4676   if(itype[i]==RJUMP)
4677   {
4678     *adj=0;
4679   }
4680   //if(ba[i]>=start && ba[i]<(start+slen*4))
4681   if(internal_branch(branch_regs[i].is32,ba[i]))
4682   {
4683     int t=(ba[i]-start)>>2;
4684     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
4685     else *adj=ccadj[t];
4686   }
4687   else
4688   {
4689     *adj=0;
4690   }
4691   count=ccadj[i];
4692   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4693     // Idle loop
4694     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4695     idle=(int)out;
4696     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4697     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4698     jaddr=(int)out;
4699     emit_jmp(0);
4700   }
4701   else if(*adj==0||invert) {
4702     emit_addimm_and_set_flags(CLOCK_DIVIDER*(count+2),HOST_CCREG);
4703     jaddr=(int)out;
4704     emit_jns(0);
4705   }
4706   else
4707   {
4708     emit_cmpimm(HOST_CCREG,-2*(count+2));
4709     jaddr=(int)out;
4710     emit_jns(0);
4711   }
4712   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4713 }
4714
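// Out-of-line code taken when the cycle count expires at a branch: write
// back dirty registers, store the return PC (recomputing the branch
// direction for conditional branches, since it is not known here), call
// cc_interrupt, then reload registers and jump back into the compiled code.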
4715 void do_ccstub(int n)
4716 {
4717   literal_pool(256);
4718   assem_debug("do_ccstub %x\n",start+stubs[n][4]*4);
4719   set_jump_target(stubs[n][1],(int)out);
4720   int i=stubs[n][4];
4721   if(stubs[n][6]==NULLDS) {
4722     // Delay slot instruction is nullified ("likely" branch)
4723     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4724   }
4725   else if(stubs[n][6]!=TAKEN) {
4726     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4727   }
4728   else {
4729     if(internal_branch(branch_regs[i].is32,ba[i]))
4730       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4731   }
4732   if(stubs[n][5]!=-1)
4733   {
4734     // Save PC as return address
4735     emit_movimm(stubs[n][5],EAX);
4736     emit_writeword(EAX,(int)&pcaddr);
4737   }
4738   else
4739   {
4740     // Return address depends on which way the branch goes
4741     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4742     {
4743       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4744       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4745       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4746       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4747       if(rs1[i]==0)
4748       {
4749         s1l=s2l;s1h=s2h;
4750         s2l=s2h=-1;
4751       }
4752       else if(rs2[i]==0)
4753       {
4754         s2l=s2h=-1;
4755       }
4756       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4757         s1h=s2h=-1;
4758       }
4759       assert(s1l>=0);
4760       #ifdef DESTRUCTIVE_WRITEBACK
4761       if(rs1[i]) {
4762         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4763           emit_loadreg(rs1[i],s1l);
4764       } 
4765       else {
4766         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4767           emit_loadreg(rs2[i],s1l);
4768       }
4769       if(s2l>=0)
4770         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4771           emit_loadreg(rs2[i],s2l);
4772       #endif
4773       int hr=0;
4774       int addr,alt,ntaddr;
4775       while(hr<HOST_REGS)
4776       {
4777         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4778            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4779            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4780         {
4781           addr=hr++;break;
4782         }
4783         hr++;
4784       }
4785       while(hr<HOST_REGS)
4786       {
4787         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4788            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4789            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4790         {
4791           alt=hr++;break;
4792         }
4793         hr++;
4794       }
4795       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4796       {
4797         while(hr<HOST_REGS)
4798         {
4799           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4800              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4801              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4802           {
4803             ntaddr=hr;break;
4804           }
4805           hr++;
4806         }
4807         assert(hr<HOST_REGS);
4808       }
4809       if((opcode[i]&0x2f)==4) // BEQ
4810       {
4811         #ifdef HAVE_CMOV_IMM
4812         if(s1h<0) {
4813           if(s2l>=0) emit_cmp(s1l,s2l);
4814           else emit_test(s1l,s1l);
4815           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4816         }
4817         else
4818         #endif
4819         {
4820           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4821           if(s1h>=0) {
4822             if(s2h>=0) emit_cmp(s1h,s2h);
4823             else emit_test(s1h,s1h);
4824             emit_cmovne_reg(alt,addr);
4825           }
4826           if(s2l>=0) emit_cmp(s1l,s2l);
4827           else emit_test(s1l,s1l);
4828           emit_cmovne_reg(alt,addr);
4829         }
4830       }
4831       if((opcode[i]&0x2f)==5) // BNE
4832       {
4833         #ifdef HAVE_CMOV_IMM
4834         if(s1h<0) {
4835           if(s2l>=0) emit_cmp(s1l,s2l);
4836           else emit_test(s1l,s1l);
4837           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4838         }
4839         else
4840         #endif
4841         {
4842           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4843           if(s1h>=0) {
4844             if(s2h>=0) emit_cmp(s1h,s2h);
4845             else emit_test(s1h,s1h);
4846             emit_cmovne_reg(alt,addr);
4847           }
4848           if(s2l>=0) emit_cmp(s1l,s2l);
4849           else emit_test(s1l,s1l);
4850           emit_cmovne_reg(alt,addr);
4851         }
4852       }
4853       if((opcode[i]&0x2f)==6) // BLEZ
4854       {
4855         //emit_movimm(ba[i],alt);
4856         //emit_movimm(start+i*4+8,addr);
4857         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4858         emit_cmpimm(s1l,1);
4859         if(s1h>=0) emit_mov(addr,ntaddr);
4860         emit_cmovl_reg(alt,addr);
4861         if(s1h>=0) {
4862           emit_test(s1h,s1h);
4863           emit_cmovne_reg(ntaddr,addr);
4864           emit_cmovs_reg(alt,addr);
4865         }
4866       }
4867       if((opcode[i]&0x2f)==7) // BGTZ
4868       {
4869         //emit_movimm(ba[i],addr);
4870         //emit_movimm(start+i*4+8,ntaddr);
4871         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4872         emit_cmpimm(s1l,1);
4873         if(s1h>=0) emit_mov(addr,alt);
4874         emit_cmovl_reg(ntaddr,addr);
4875         if(s1h>=0) {
4876           emit_test(s1h,s1h);
4877           emit_cmovne_reg(alt,addr);
4878           emit_cmovs_reg(ntaddr,addr);
4879         }
4880       }
4881       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4882       {
4883         //emit_movimm(ba[i],alt);
4884         //emit_movimm(start+i*4+8,addr);
4885         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4886         if(s1h>=0) emit_test(s1h,s1h);
4887         else emit_test(s1l,s1l);
4888         emit_cmovs_reg(alt,addr);
4889       }
4890       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4891       {
4892         //emit_movimm(ba[i],addr);
4893         //emit_movimm(start+i*4+8,alt);
4894         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4895         if(s1h>=0) emit_test(s1h,s1h);
4896         else emit_test(s1l,s1l);
4897         emit_cmovs_reg(alt,addr);
4898       }
4899       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4900         if(source[i]&0x10000) // BC1T
4901         {
4902           //emit_movimm(ba[i],alt);
4903           //emit_movimm(start+i*4+8,addr);
4904           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4905           emit_testimm(s1l,0x800000);
4906           emit_cmovne_reg(alt,addr);
4907         }
4908         else // BC1F
4909         {
4910           //emit_movimm(ba[i],addr);
4911           //emit_movimm(start+i*4+8,alt);
4912           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4913           emit_testimm(s1l,0x800000);
4914           emit_cmovne_reg(alt,addr);
4915         }
4916       }
4917       emit_writeword(addr,(int)&pcaddr);
4918     }
4919     else
4920     if(itype[i]==RJUMP)
4921     {
4922       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4923       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4924         r=get_reg(branch_regs[i].regmap,RTEMP);
4925       }
4926       emit_writeword(r,(int)&pcaddr);
4927     }
4928     else {printf("Unknown branch type in do_ccstub\n");exit(1);}
4929   }
4930   // Update cycle count
4931   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4932   if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
4933   emit_call((int)cc_interrupt);
4934   if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
4935   if(stubs[n][6]==TAKEN) {
4936     if(internal_branch(branch_regs[i].is32,ba[i]))
4937       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4938     else if(itype[i]==RJUMP) {
4939       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4940         emit_readword((int)&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4941       else
4942         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4943     }
4944   }else if(stubs[n][6]==NOTTAKEN) {
4945     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4946     else load_all_regs(branch_regs[i].regmap);
4947   }else if(stubs[n][6]==NULLDS) {
4948     // Delay slot instruction is nullified ("likely" branch)
4949     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4950     else load_all_regs(regs[i].regmap);
4951   }else{
4952     load_all_regs(branch_regs[i].regmap);
4953   }
4954   emit_jmp(stubs[n][2]); // return address
4955   
4956   /* This works but uses a lot of memory...
4957   emit_readword((int)&last_count,ECX);
4958   emit_add(HOST_CCREG,ECX,EAX);
4959   emit_writeword(EAX,(int)&Count);
4960   emit_call((int)gen_interupt);
4961   emit_readword((int)&Count,HOST_CCREG);
4962   emit_readword((int)&next_interupt,EAX);
4963   emit_readword((int)&pending_exception,EBX);
4964   emit_writeword(EAX,(int)&last_count);
4965   emit_sub(HOST_CCREG,EAX,HOST_CCREG);
4966   emit_test(EBX,EBX);
4967   int jne_instr=(int)out;
4968   emit_jne(0);
4969   if(stubs[n][3]) emit_addimm(HOST_CCREG,-2*stubs[n][3],HOST_CCREG);
4970   load_all_regs(branch_regs[i].regmap);
4971   emit_jmp(stubs[n][2]); // return address
4972   set_jump_target(jne_instr,(int)out);
4973   emit_readword((int)&pcaddr,EAX);
4974   // Call get_addr_ht instead of doing the hash table here.
4975   // This code is executed infrequently and takes up a lot of space
4976   // so smaller is better.
4977   emit_storereg(CCREG,HOST_CCREG);
4978   emit_pushreg(EAX);
4979   emit_call((int)get_addr_ht);
4980   emit_loadreg(CCREG,HOST_CCREG);
4981   emit_addimm(ESP,4,ESP);
4982   emit_jmpreg(EAX);*/
4983 }
4984
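// Remember a jump that still needs to be resolved against its target block;
// callers pass the internal_branch() result as the third argument.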
4985 void add_to_linker(int addr,int target,int ext)
4986 {
4987   link_addr[linkcount][0]=addr;
4988   link_addr[linkcount][1]=target;
4989   link_addr[linkcount][2]=ext;  
4990   linkcount++;
4991 }
4992
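// Assemble an unconditional jump (J/JAL): the delay slot is assembled
// first, JAL writes the return address into r31 (via the mini hash table
// when enabled), then the cycle check is emitted and the jump is either
// assembled inline (branch into another branch's delay slot) or recorded
// with add_to_linker().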
4993 void ujump_assemble(int i,struct regstat *i_regs)
4994 {
4995   signed char *i_regmap=i_regs->regmap;
4996   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4997   address_generation(i+1,i_regs,regs[i].regmap_entry);
4998   #ifdef REG_PREFETCH
4999   int temp=get_reg(branch_regs[i].regmap,PTEMP);
5000   if(rt1[i]==31&&temp>=0) 
5001   {
5002     int return_address=start+i*4+8;
5003     if(get_reg(branch_regs[i].regmap,31)>0) 
5004     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5005   }
5006   #endif
5007   ds_assemble(i+1,i_regs);
5008   uint64_t bc_unneeded=branch_regs[i].u;
5009   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5010   bc_unneeded|=1|(1LL<<rt1[i]);
5011   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5012   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5013                 bc_unneeded,bc_unneeded_upper);
5014   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5015   if(rt1[i]==31) {
5016     int rt;
5017     unsigned int return_address;
5018     assert(rt1[i+1]!=31);
5019     assert(rt2[i+1]!=31);
5020     rt=get_reg(branch_regs[i].regmap,31);
5021     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5022     //assert(rt>=0);
5023     return_address=start+i*4+8;
5024     if(rt>=0) {
5025       #ifdef USE_MINI_HT
5026       if(internal_branch(branch_regs[i].is32,return_address)) {
5027         int temp=rt+1;
5028         if(temp==EXCLUDE_REG||temp>=HOST_REGS||
5029            branch_regs[i].regmap[temp]>=0)
5030         {
5031           temp=get_reg(branch_regs[i].regmap,-1);
5032         }
5033         #ifdef HOST_TEMPREG
5034         if(temp<0) temp=HOST_TEMPREG;
5035         #endif
5036         if(temp>=0) do_miniht_insert(return_address,rt,temp);
5037         else emit_movimm(return_address,rt);
5038       }
5039       else
5040       #endif
5041       {
5042         #ifdef REG_PREFETCH
5043         if(temp>=0) 
5044         {
5045           if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5046         }
5047         #endif
5048         emit_movimm(return_address,rt); // PC into link register
5049         #ifdef IMM_PREFETCH
5050         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5051         #endif
5052       }
5053     }
5054   }
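  /* Note: the link value written above is start+i*4+8, i.e. the address of the
   * instruction after the delay slot.  For example, with start=0x80010000 and
   * i=2 the JAL sits at 0x80010008, its delay slot at 0x8001000c, and $31
   * receives 0x80010010 -- the standard MIPS JAL/JALR link convention. */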
5055   int cc,adj;
5056   cc=get_reg(branch_regs[i].regmap,CCREG);
5057   assert(cc==HOST_CCREG);
5058   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5059   #ifdef REG_PREFETCH
5060   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5061   #endif
5062   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5063   if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5064   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5065   if(internal_branch(branch_regs[i].is32,ba[i]))
5066     assem_debug("branch: internal\n");
5067   else
5068     assem_debug("branch: external\n");
5069   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
5070     ds_assemble_entry(i);
5071   }
5072   else {
5073     add_to_linker((int)out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
5074     emit_jmp(0);
5075   }
5076 }
5077
5078 void rjump_assemble(int i,struct regstat *i_regs)
5079 {
5080   signed char *i_regmap=i_regs->regmap;
5081   int temp;
5082   int rs,cc,adj;
5083   rs=get_reg(branch_regs[i].regmap,rs1[i]);
5084   assert(rs>=0);
5085   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
5086     // Delay slot abuse, make a copy of the branch address register
5087     temp=get_reg(branch_regs[i].regmap,RTEMP);
5088     assert(temp>=0);
5089     assert(regs[i].regmap[temp]==RTEMP);
5090     emit_mov(rs,temp);
5091     rs=temp;
5092   }
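  /* Why the copy: if the delay slot overwrites the JR/JALR source register,
   * the generated code still has to jump using the value the register held
   * before the delay slot ran.  E.g. for "jr $t0" followed by
   * "addiu $t0,$t0,4" in the delay slot, the old $t0 is parked in RTEMP above
   * and the delay slot is assembled afterwards (sketch of the intent; the
   * allocator reserves RTEMP for exactly this case). */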
5093   address_generation(i+1,i_regs,regs[i].regmap_entry);
5094   #ifdef REG_PREFETCH
5095   if(rt1[i]==31) 
5096   {
5097     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
5098       int return_address=start+i*4+8;
5099       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5100     }
5101   }
5102   #endif
5103   #ifdef USE_MINI_HT
5104   if(rs1[i]==31) {
5105     int rh=get_reg(regs[i].regmap,RHASH);
5106     if(rh>=0) do_preload_rhash(rh);
5107   }
5108   #endif
5109   ds_assemble(i+1,i_regs);
5110   uint64_t bc_unneeded=branch_regs[i].u;
5111   uint64_t bc_unneeded_upper=branch_regs[i].uu;
5112   bc_unneeded|=1|(1LL<<rt1[i]);
5113   bc_unneeded_upper|=1|(1LL<<rt1[i]);
5114   bc_unneeded&=~(1LL<<rs1[i]);
5115   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5116                 bc_unneeded,bc_unneeded_upper);
5117   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
5118   if(rt1[i]!=0) {
5119     int rt,return_address;
5120     assert(rt1[i+1]!=rt1[i]);
5121     assert(rt2[i+1]!=rt1[i]);
5122     rt=get_reg(branch_regs[i].regmap,rt1[i]);
5123     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5124     assert(rt>=0);
5125     return_address=start+i*4+8;
5126     #ifdef REG_PREFETCH
5127     if(temp>=0) 
5128     {
5129       if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
5130     }
5131     #endif
5132     emit_movimm(return_address,rt); // PC into link register
5133     #ifdef IMM_PREFETCH
5134     emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5135     #endif
5136   }
5137   cc=get_reg(branch_regs[i].regmap,CCREG);
5138   assert(cc==HOST_CCREG);
5139   #ifdef USE_MINI_HT
5140   int rh=get_reg(branch_regs[i].regmap,RHASH);
5141   int ht=get_reg(branch_regs[i].regmap,RHTBL);
5142   if(rs1[i]==31) {
5143     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
5144     do_preload_rhtbl(ht);
5145     do_rhash(rs,rh);
5146   }
5147   #endif
5148   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5149   #ifdef DESTRUCTIVE_WRITEBACK
5150   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
5151     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
5152       emit_loadreg(rs1[i],rs);
5153     }
5154   }
5155   #endif
5156   #ifdef REG_PREFETCH
5157   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
5158   #endif
5159   #ifdef USE_MINI_HT
5160   if(rs1[i]==31) {
5161     do_miniht_load(ht,rh);
5162   }
5163   #endif
5164   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
5165   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
5166   //assert(adj==0);
5167   emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5168   add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
5169   emit_jns(0);
5170   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
5171   #ifdef USE_MINI_HT
5172   if(rs1[i]==31) {
5173     do_miniht_jump(rs,rh,ht);
5174   }
5175   else
5176   #endif
5177   {
5178     //if(rs!=EAX) emit_mov(rs,EAX);
5179     //emit_jmp((int)jump_vaddr_eax);
5180     emit_jmp(jump_vaddr_reg[rs]);
5181   }
5182   /* Check hash table
5183   temp=!rs;
5184   emit_mov(rs,temp);
5185   emit_shrimm(rs,16,rs);
5186   emit_xor(temp,rs,rs);
5187   emit_movzwl_reg(rs,rs);
5188   emit_shlimm(rs,4,rs);
5189   emit_cmpmem_indexed((int)hash_table,rs,temp);
5190   emit_jne((int)out+14);
5191   emit_readword_indexed((int)hash_table+4,rs,rs);
5192   emit_jmpreg(rs);
5193   emit_cmpmem_indexed((int)hash_table+8,rs,temp);
5194   emit_addimm_no_flags(8,rs);
5195   emit_jeq((int)out-17);
5196   // No hit on hash table, call compiler
5197   emit_pushreg(temp);
5198 //DEBUG >
5199 #ifdef DEBUG_CYCLE_COUNT
5200   emit_readword((int)&last_count,ECX);
5201   emit_add(HOST_CCREG,ECX,HOST_CCREG);
5202   emit_readword((int)&next_interupt,ECX);
5203   emit_writeword(HOST_CCREG,(int)&Count);
5204   emit_sub(HOST_CCREG,ECX,HOST_CCREG);
5205   emit_writeword(ECX,(int)&last_count);
5206 #endif
5207 //DEBUG <
5208   emit_storereg(CCREG,HOST_CCREG);
5209   emit_call((int)get_addr);
5210   emit_loadreg(CCREG,HOST_CCREG);
5211   emit_addimm(ESP,4,ESP);
5212   emit_jmpreg(EAX);*/
5213   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5214   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
5215   #endif
5216 }
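/* CORTEX_A8_BRANCH_PREDICTION_HACK note: emit_mov(13,13) assembles an ARM
 * "mov r13,r13", which just copies SP to itself and so acts as a 4-byte no-op
 * used purely as padding.  The (((u_int)out)&7) test pads the indirect-jump
 * sequence to an 8-byte boundary, presumably to keep the Cortex-A8 branch
 * predictor happy.  Simplified sketch of the alignment part of the condition:
 *
 *   if(((u_int)out)&7)   // output pointer not 8-byte aligned
 *     emit_mov(13,13);   // one no-op restores alignment
 */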
5217
5218 void cjump_assemble(int i,struct regstat *i_regs)
5219 {
5220   signed char *i_regmap=i_regs->regmap;
5221   int cc;
5222   int match;
5223   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5224   assem_debug("match=%d\n",match);
5225   int s1h,s1l,s2h,s2l;
5226   int prev_cop1_usable=cop1_usable;
5227   int unconditional=0,nop=0;
5228   int only32=0;
5229   int ooo=1;
5230   int invert=0;
5231   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5232   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5233   if(likely[i]) ooo=0;
5234   if(!match) invert=1;
5235   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5236   if(i>(ba[i]-start)>>2) invert=1;
5237   #endif
5238     
5239   if(ooo)
5240     if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
5241        (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1])))
5242   {
5243     // Write-after-read dependency prevents out of order execution
5244     // First test branch condition, then execute delay slot, then branch
5245     ooo=0;
5246   }
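  /* Example of the dependency being checked above: for
   *   beq   $t0,$zero,target
   *   addiu $t0,$t0,1        <- delay slot writes a register the branch reads
   * assembling the delay slot first (the usual out-of-order scheme) would make
   * the compare see the updated $t0, so the code falls back to testing the
   * condition first, then running the delay slot, then branching. */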
5247
5248   if(ooo) {
5249     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5250     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5251     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
5252     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
5253   }
5254   else {
5255     s1l=get_reg(i_regmap,rs1[i]);
5256     s1h=get_reg(i_regmap,rs1[i]|64);
5257     s2l=get_reg(i_regmap,rs2[i]);
5258     s2h=get_reg(i_regmap,rs2[i]|64);
5259   }
5260   if(rs1[i]==0&&rs2[i]==0)
5261   {
5262     if(opcode[i]&1) nop=1;
5263     else unconditional=1;
5264     //assert(opcode[i]!=5);
5265     //assert(opcode[i]!=7);
5266     //assert(opcode[i]!=0x15);
5267     //assert(opcode[i]!=0x17);
5268   }
5269   else if(rs1[i]==0)
5270   {
5271     s1l=s2l;s1h=s2h;
5272     s2l=s2h=-1;
5273     only32=(regs[i].was32>>rs2[i])&1;
5274   }
5275   else if(rs2[i]==0)
5276   {
5277     s2l=s2h=-1;
5278     only32=(regs[i].was32>>rs1[i])&1;
5279   }
5280   else {
5281     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
5282   }
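  /* was32/is32 are per-register bitmasks tracking which guest registers are
   * known to hold sign-extended 32-bit values.  When every source of the
   * compare is known 32-bit ("only32"), the upper-half host registers s1h/s2h
   * can be skipped and only the low 32 bits are compared below.  The test is
   * just a bit probe, e.g. (sketch):
   *
   *   int src_is_32bit = (regs[i].was32>>rs1[i])&1;
   */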
5283
5284   if(ooo) {
5285     // Out of order execution (delay slot first)
5286     //printf("OOOE\n");
5287     address_generation(i+1,i_regs,regs[i].regmap_entry);
5288     ds_assemble(i+1,i_regs);
5289     int adj;
5290     uint64_t bc_unneeded=branch_regs[i].u;
5291     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5292     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5293     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5294     bc_unneeded|=1;
5295     bc_unneeded_upper|=1;
5296     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5297                   bc_unneeded,bc_unneeded_upper);
5298     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
5299     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5300     cc=get_reg(branch_regs[i].regmap,CCREG);
5301     assert(cc==HOST_CCREG);
5302     if(unconditional) 
5303       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5304     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5305     //assem_debug("cycle count (adj)\n");
5306     if(unconditional) {
5307       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5308       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5309         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5310         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5311         if(internal)
5312           assem_debug("branch: internal\n");
5313         else
5314           assem_debug("branch: external\n");
5315         if(internal&&is_ds[(ba[i]-start)>>2]) {
5316           ds_assemble_entry(i);
5317         }
5318         else {
5319           add_to_linker((int)out,ba[i],internal);
5320           emit_jmp(0);
5321         }
5322         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5323         if(((u_int)out)&7) emit_addnop(0);
5324         #endif
5325       }
5326     }
5327     else if(nop) {
5328       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5329       int jaddr=(int)out;
5330       emit_jns(0);
5331       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5332     }
5333     else {
5334       int taken=0,nottaken=0,nottaken1=0;
5335       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5336       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5337       if(!only32)
5338       {
5339         assert(s1h>=0);
5340         if(opcode[i]==4) // BEQ
5341         {
5342           if(s2h>=0) emit_cmp(s1h,s2h);
5343           else emit_test(s1h,s1h);
5344           nottaken1=(int)out;
5345           emit_jne(1);
5346         }
5347         if(opcode[i]==5) // BNE
5348         {
5349           if(s2h>=0) emit_cmp(s1h,s2h);
5350           else emit_test(s1h,s1h);
5351           if(invert) taken=(int)out;
5352           else add_to_linker((int)out,ba[i],internal);
5353           emit_jne(0);
5354         }
5355         if(opcode[i]==6) // BLEZ
5356         {
5357           emit_test(s1h,s1h);
5358           if(invert) taken=(int)out;
5359           else add_to_linker((int)out,ba[i],internal);
5360           emit_js(0);
5361           nottaken1=(int)out;
5362           emit_jne(1);
5363         }
5364         if(opcode[i]==7) // BGTZ
5365         {
5366           emit_test(s1h,s1h);
5367           nottaken1=(int)out;
5368           emit_js(1);
5369           if(invert) taken=(int)out;
5370           else add_to_linker((int)out,ba[i],internal);
5371           emit_jne(0);
5372         }
5373       } // if(!only32)
5374           
5375       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5376       assert(s1l>=0);
5377       if(opcode[i]==4) // BEQ
5378       {
5379         if(s2l>=0) emit_cmp(s1l,s2l);
5380         else emit_test(s1l,s1l);
5381         if(invert){
5382           nottaken=(int)out;
5383           emit_jne(1);
5384         }else{
5385           add_to_linker((int)out,ba[i],internal);
5386           emit_jeq(0);
5387         }
5388       }
5389       if(opcode[i]==5) // BNE
5390       {
5391         if(s2l>=0) emit_cmp(s1l,s2l);
5392         else emit_test(s1l,s1l);
5393         if(invert){
5394           nottaken=(int)out;
5395           emit_jeq(1);
5396         }else{
5397           add_to_linker((int)out,ba[i],internal);
5398           emit_jne(0);
5399         }
5400       }
5401       if(opcode[i]==6) // BLEZ
5402       {
5403         emit_cmpimm(s1l,1);
5404         if(invert){
5405           nottaken=(int)out;
5406           emit_jge(1);
5407         }else{
5408           add_to_linker((int)out,ba[i],internal);
5409           emit_jl(0);
5410         }
5411       }
5412       if(opcode[i]==7) // BGTZ
5413       {
5414         emit_cmpimm(s1l,1);
5415         if(invert){
5416           nottaken=(int)out;
5417           emit_jl(1);
5418         }else{
5419           add_to_linker((int)out,ba[i],internal);
5420           emit_jge(0);
5421         }
5422       }
5423       if(invert) {
5424         if(taken) set_jump_target(taken,(int)out);
5425         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5426         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5427           if(adj) {
5428             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5429             add_to_linker((int)out,ba[i],internal);
5430           }else{
5431             emit_addnop(13);
5432             add_to_linker((int)out,ba[i],internal*2);
5433           }
5434           emit_jmp(0);
5435         }else
5436         #endif
5437         {
5438           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5439           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5440           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5441           if(internal)
5442             assem_debug("branch: internal\n");
5443           else
5444             assem_debug("branch: external\n");
5445           if(internal&&is_ds[(ba[i]-start)>>2]) {
5446             ds_assemble_entry(i);
5447           }
5448           else {
5449             add_to_linker((int)out,ba[i],internal);
5450             emit_jmp(0);
5451           }
5452         }
5453         set_jump_target(nottaken,(int)out);
5454       }
5455
5456       if(nottaken1) set_jump_target(nottaken1,(int)out);
5457       if(adj) {
5458         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5459       }
5460     } // (!unconditional)
5461   } // if(ooo)
5462   else
5463   {
5464     // In-order execution (branch first)
5465     //if(likely[i]) printf("IOL\n");
5466     //else
5467     //printf("IOE\n");
5468     int taken=0,nottaken=0,nottaken1=0;
5469     if(!unconditional&&!nop) {
5470       if(!only32)
5471       {
5472         assert(s1h>=0);
5473         if((opcode[i]&0x2f)==4) // BEQ
5474         {
5475           if(s2h>=0) emit_cmp(s1h,s2h);
5476           else emit_test(s1h,s1h);
5477           nottaken1=(int)out;
5478           emit_jne(2);
5479         }
5480         if((opcode[i]&0x2f)==5) // BNE
5481         {
5482           if(s2h>=0) emit_cmp(s1h,s2h);
5483           else emit_test(s1h,s1h);
5484           taken=(int)out;
5485           emit_jne(1);
5486         }
5487         if((opcode[i]&0x2f)==6) // BLEZ
5488         {
5489           emit_test(s1h,s1h);
5490           taken=(int)out;
5491           emit_js(1);
5492           nottaken1=(int)out;
5493           emit_jne(2);
5494         }
5495         if((opcode[i]&0x2f)==7) // BGTZ
5496         {
5497           emit_test(s1h,s1h);
5498           nottaken1=(int)out;
5499           emit_js(2);
5500           taken=(int)out;
5501           emit_jne(1);
5502         }
5503       } // if(!only32)
5504           
5505       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5506       assert(s1l>=0);
5507       if((opcode[i]&0x2f)==4) // BEQ
5508       {
5509         if(s2l>=0) emit_cmp(s1l,s2l);
5510         else emit_test(s1l,s1l);
5511         nottaken=(int)out;
5512         emit_jne(2);
5513       }
5514       if((opcode[i]&0x2f)==5) // BNE
5515       {
5516         if(s2l>=0) emit_cmp(s1l,s2l);
5517         else emit_test(s1l,s1l);
5518         nottaken=(int)out;
5519         emit_jeq(2);
5520       }
5521       if((opcode[i]&0x2f)==6) // BLEZ
5522       {
5523         emit_cmpimm(s1l,1);
5524         nottaken=(int)out;
5525         emit_jge(2);
5526       }
5527       if((opcode[i]&0x2f)==7) // BGTZ
5528       {
5529         emit_cmpimm(s1l,1);
5530         nottaken=(int)out;
5531         emit_jl(2);
5532       }
5533     } // if(!unconditional)
5534     int adj;
5535     uint64_t ds_unneeded=branch_regs[i].u;
5536     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5537     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5538     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5539     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5540     ds_unneeded|=1;
5541     ds_unneeded_upper|=1;
5542     // branch taken
5543     if(!nop) {
5544       if(taken) set_jump_target(taken,(int)out);
5545       assem_debug("1:\n");
5546       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5547                     ds_unneeded,ds_unneeded_upper);
5548       // load regs
5549       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5550       address_generation(i+1,&branch_regs[i],0);
5551       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5552       ds_assemble(i+1,&branch_regs[i]);
5553       cc=get_reg(branch_regs[i].regmap,CCREG);
5554       if(cc==-1) {
5555         emit_loadreg(CCREG,cc=HOST_CCREG);
5556         // CHECK: Is the following instruction (fall thru) allocated ok?
5557       }
5558       assert(cc==HOST_CCREG);
5559       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5560       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5561       assem_debug("cycle count (adj)\n");
5562       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5563       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5564       if(internal)
5565         assem_debug("branch: internal\n");
5566       else
5567         assem_debug("branch: external\n");
5568       if(internal&&is_ds[(ba[i]-start)>>2]) {
5569         ds_assemble_entry(i);
5570       }
5571       else {
5572         add_to_linker((int)out,ba[i],internal);
5573         emit_jmp(0);
5574       }
5575     }
5576     // branch not taken
5577     cop1_usable=prev_cop1_usable;
5578     if(!unconditional) {
5579       if(nottaken1) set_jump_target(nottaken1,(int)out);
5580       set_jump_target(nottaken,(int)out);
5581       assem_debug("2:\n");
5582       if(!likely[i]) {
5583         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5584                       ds_unneeded,ds_unneeded_upper);
5585         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5586         address_generation(i+1,&branch_regs[i],0);
5587         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5588         ds_assemble(i+1,&branch_regs[i]);
5589       }
5590       cc=get_reg(branch_regs[i].regmap,CCREG);
5591       if(cc==-1&&!likely[i]) {
5592         // Cycle count isn't in a register, temporarily load it then write it out
5593         emit_loadreg(CCREG,HOST_CCREG);
5594         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5595         int jaddr=(int)out;
5596         emit_jns(0);
5597         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5598         emit_storereg(CCREG,HOST_CCREG);
5599       }
5600       else{
5601         cc=get_reg(i_regmap,CCREG);
5602         assert(cc==HOST_CCREG);
5603         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5604         int jaddr=(int)out;
5605         emit_jns(0);
5606         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5607       }
5608     }
5609   }
5610 }
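/* The cycle-accounting idiom used in the branch assemblers above (sketch):
 *
 *   emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc); // charge cycles
 *   int jaddr=(int)out;
 *   emit_jns(0);                      // taken once the count goes non-negative
 *   add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
 *
 * The count appears to be kept negative and to count up toward zero; when it
 * becomes non-negative the jns is patched to enter a CC_STUB which (see
 * do_ccstub above) calls cc_interrupt() and then resumes at the recorded
 * return address. */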
5611
5612 void sjump_assemble(int i,struct regstat *i_regs)
5613 {
5614   signed char *i_regmap=i_regs->regmap;
5615   int cc;
5616   int match;
5617   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5618   assem_debug("smatch=%d\n",match);
5619   int s1h,s1l;
5620   int prev_cop1_usable=cop1_usable;
5621   int unconditional=0,nevertaken=0;
5622   int only32=0;
5623   int ooo=1;
5624   int invert=0;
5625   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5626   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5627   if(likely[i]) ooo=0;
5628   if(!match) invert=1;
5629   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5630   if(i>(ba[i]-start)>>2) invert=1;
5631   #endif
5632
5633   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
5634   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
5635
5636   if(ooo)
5637     if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))
5638   {
5639     // Write-after-read dependency prevents out of order execution
5640     // First test branch condition, then execute delay slot, then branch
5641     ooo=0;
5642   }
5643   assert(opcode2[i]<0x10||ooo); // FIXME (BxxZALL)
5644
5645   if(ooo) {
5646     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
5647     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
5648   }
5649   else {
5650     s1l=get_reg(i_regmap,rs1[i]);
5651     s1h=get_reg(i_regmap,rs1[i]|64);
5652   }
5653   if(rs1[i]==0)
5654   {
5655     if(opcode2[i]&1) unconditional=1;
5656     else nevertaken=1;
5657     // These are never taken (r0 is never less than zero)
5658     //assert(opcode2[i]!=0);
5659     //assert(opcode2[i]!=2);
5660     //assert(opcode2[i]!=0x10);
5661     //assert(opcode2[i]!=0x12);
5662   }
5663   else {
5664     only32=(regs[i].was32>>rs1[i])&1;
5665   }
5666
5667   if(ooo) {
5668     // Out of order execution (delay slot first)
5669     //printf("OOOE\n");
5670     address_generation(i+1,i_regs,regs[i].regmap_entry);
5671     ds_assemble(i+1,i_regs);
5672     int adj;
5673     uint64_t bc_unneeded=branch_regs[i].u;
5674     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5675     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5676     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5677     bc_unneeded|=1;
5678     bc_unneeded_upper|=1;
5679     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5680                   bc_unneeded,bc_unneeded_upper);
5681     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5682     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5683     if(rt1[i]==31) {
5684       int rt,return_address;
5685       assert(rt1[i+1]!=31);
5686       assert(rt2[i+1]!=31);
5687       rt=get_reg(branch_regs[i].regmap,31);
5688       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5689       if(rt>=0) {
5690         // Save the PC even if the branch is not taken
5691         return_address=start+i*4+8;
5692         emit_movimm(return_address,rt); // PC into link register
5693         #ifdef IMM_PREFETCH
5694         if(!nevertaken) emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
5695         #endif
5696       }
5697     }
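    /* BLTZAL/BGEZAL write the link register unconditionally on MIPS, which is
     * why the return address (start+i*4+8) is stored into $31 above before the
     * branch condition is even tested, e.g.:
     *
     *   bgezal $t0,target     # $31 <- PC+8 whether or not $t0 >= 0
     */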
5698     cc=get_reg(branch_regs[i].regmap,CCREG);
5699     assert(cc==HOST_CCREG);
5700     if(unconditional) 
5701       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5702     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
5703     assem_debug("cycle count (adj)\n");
5704     if(unconditional) {
5705       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
5706       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
5707         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5708         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5709         if(internal)
5710           assem_debug("branch: internal\n");
5711         else
5712           assem_debug("branch: external\n");
5713         if(internal&&is_ds[(ba[i]-start)>>2]) {
5714           ds_assemble_entry(i);
5715         }
5716         else {
5717           add_to_linker((int)out,ba[i],internal);
5718           emit_jmp(0);
5719         }
5720         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5721         if(((u_int)out)&7) emit_addnop(0);
5722         #endif
5723       }
5724     }
5725     else if(nevertaken) {
5726       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5727       int jaddr=(int)out;
5728       emit_jns(0);
5729       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5730     }
5731     else {
5732       int nottaken=0;
5733       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5734       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5735       if(!only32)
5736       {
5737         assert(s1h>=0);
5738         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5739         {
5740           emit_test(s1h,s1h);
5741           if(invert){
5742             nottaken=(int)out;
5743             emit_jns(1);
5744           }else{
5745             add_to_linker((int)out,ba[i],internal);
5746             emit_js(0);
5747           }
5748         }
5749         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5750         {
5751           emit_test(s1h,s1h);
5752           if(invert){
5753             nottaken=(int)out;
5754             emit_js(1);
5755           }else{
5756             add_to_linker((int)out,ba[i],internal);
5757             emit_jns(0);
5758           }
5759         }
5760       } // if(!only32)
5761       else
5762       {
5763         assert(s1l>=0);
5764         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5765         {
5766           emit_test(s1l,s1l);
5767           if(invert){
5768             nottaken=(int)out;
5769             emit_jns(1);
5770           }else{
5771             add_to_linker((int)out,ba[i],internal);
5772             emit_js(0);
5773           }
5774         }
5775         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5776         {
5777           emit_test(s1l,s1l);
5778           if(invert){
5779             nottaken=(int)out;
5780             emit_js(1);
5781           }else{
5782             add_to_linker((int)out,ba[i],internal);
5783             emit_jns(0);
5784           }
5785         }
5786       } // if(!only32)
5787           
5788       if(invert) {
5789         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5790         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5791           if(adj) {
5792             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5793             add_to_linker((int)out,ba[i],internal);
5794           }else{
5795             emit_addnop(13);
5796             add_to_linker((int)out,ba[i],internal*2);
5797           }
5798           emit_jmp(0);
5799         }else
5800         #endif
5801         {
5802           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
5803           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5804           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5805           if(internal)
5806             assem_debug("branch: internal\n");
5807           else
5808             assem_debug("branch: external\n");
5809           if(internal&&is_ds[(ba[i]-start)>>2]) {
5810             ds_assemble_entry(i);
5811           }
5812           else {
5813             add_to_linker((int)out,ba[i],internal);
5814             emit_jmp(0);
5815           }
5816         }
5817         set_jump_target(nottaken,(int)out);
5818       }
5819
5820       if(adj) {
5821         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
5822       }
5823     } // (!unconditional)
5824   } // if(ooo)
5825   else
5826   {
5827     // In-order execution (branch first)
5828     //printf("IOE\n");
5829     int nottaken=0;
5830     if(!unconditional) {
5831       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5832       if(!only32)
5833       {
5834         assert(s1h>=0);
5835         if((opcode2[i]&0x1d)==0) // BLTZ/BLTZL
5836         {
5837           emit_test(s1h,s1h);
5838           nottaken=(int)out;
5839           emit_jns(1);
5840         }
5841         if((opcode2[i]&0x1d)==1) // BGEZ/BGEZL
5842         {
5843           emit_test(s1h,s1h);
5844           nottaken=(int)out;
5845           emit_js(1);
5846         }
5847       } // if(!only32)
5848       else
5849       {
5850         assert(s1l>=0);
5851         if((opcode2[i]&0x1d)==0) // BLTZ/BLTZL
5852         {
5853           emit_test(s1l,s1l);
5854           nottaken=(int)out;
5855           emit_jns(1);
5856         }
5857         if((opcode2[i]&0x1d)==1) // BGEZ/BGEZL
5858         {
5859           emit_test(s1l,s1l);
5860           nottaken=(int)out;
5861           emit_js(1);
5862         }
5863       }
5864     } // if(!unconditional)
5865     int adj;
5866     uint64_t ds_unneeded=branch_regs[i].u;
5867     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5868     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5869     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5870     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5871     ds_unneeded|=1;
5872     ds_unneeded_upper|=1;
5873     // branch taken
5874     if(!nevertaken) {
5875       //assem_debug("1:\n");
5876       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5877                     ds_unneeded,ds_unneeded_upper);
5878       // load regs
5879       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5880       address_generation(i+1,&branch_regs[i],0);
5881       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5882       ds_assemble(i+1,&branch_regs[i]);
5883       cc=get_reg(branch_regs[i].regmap,CCREG);
5884       if(cc==-1) {
5885         emit_loadreg(CCREG,cc=HOST_CCREG);
5886         // CHECK: Is the following instruction (fall thru) allocated ok?
5887       }
5888       assert(cc==HOST_CCREG);
5889       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5890       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5891       assem_debug("cycle count (adj)\n");
5892       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
5893       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5894       if(internal)
5895         assem_debug("branch: internal\n");
5896       else
5897         assem_debug("branch: external\n");
5898       if(internal&&is_ds[(ba[i]-start)>>2]) {
5899         ds_assemble_entry(i);
5900       }
5901       else {
5902         add_to_linker((int)out,ba[i],internal);
5903         emit_jmp(0);
5904       }
5905     }
5906     // branch not taken
5907     cop1_usable=prev_cop1_usable;
5908     if(!unconditional) {
5909       set_jump_target(nottaken,(int)out);
5910       assem_debug("1:\n");
5911       if(!likely[i]) {
5912         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5913                       ds_unneeded,ds_unneeded_upper);
5914         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5915         address_generation(i+1,&branch_regs[i],0);
5916         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5917         ds_assemble(i+1,&branch_regs[i]);
5918       }
5919       cc=get_reg(branch_regs[i].regmap,CCREG);
5920       if(cc==-1&&!likely[i]) {
5921         // Cycle count isn't in a register, temporarily load it then write it out
5922         emit_loadreg(CCREG,HOST_CCREG);
5923         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
5924         int jaddr=(int)out;
5925         emit_jns(0);
5926         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
5927         emit_storereg(CCREG,HOST_CCREG);
5928       }
5929       else{
5930         cc=get_reg(i_regmap,CCREG);
5931         assert(cc==HOST_CCREG);
5932         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
5933         int jaddr=(int)out;
5934         emit_jns(0);
5935         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5936       }
5937     }
5938   }
5939 }
5940
5941 void fjump_assemble(int i,struct regstat *i_regs)
5942 {
5943   signed char *i_regmap=i_regs->regmap;
5944   int cc;
5945   int match;
5946   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5947   assem_debug("fmatch=%d\n",match);
5948   int fs,cs;
5949   int eaddr;
5950   int ooo=1;
5951   int invert=0;
5952   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5953   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5954   if(likely[i]) ooo=0;
5955   if(!match) invert=1;
5956   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5957   if(i>(ba[i]-start)>>2) invert=1;
5958   #endif
5959
5960   if(ooo)
5961     if(itype[i+1]==FCOMP)
5962   {
5963     // Write-after-read dependency prevents out of order execution
5964     // First test branch condition, then execute delay slot, then branch
5965     ooo=0;
5966   }
5967
5968   if(ooo) {
5969     fs=get_reg(branch_regs[i].regmap,FSREG);
5970     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5971   }
5972   else {
5973     fs=get_reg(i_regmap,FSREG);
5974   }
5975
5976   // Check cop1 unusable
5977   if(!cop1_usable) {
5978     cs=get_reg(i_regmap,CSREG);
5979     assert(cs>=0);
5980     emit_testimm(cs,0x20000000);
5981     eaddr=(int)out;
5982     emit_jeq(0);
5983     add_stub(FP_STUB,eaddr,(int)out,i,cs,(int)i_regs,0,0);
5984     cop1_usable=1;
5985   }
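  /* The test above checks bit 29 (0x20000000) of the CP0 Status copy held in
   * CSREG -- the CU1 "coprocessor 1 usable" bit.  If it is clear, control
   * diverts to an FP_STUB, which presumably raises the coprocessor-unusable
   * exception instead of executing the FP branch; cop1_usable caches the fact
   * that the check has already been emitted for this block. */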
5986
5987   if(ooo) {
5988     // Out of order execution (delay slot first)
5989     //printf("OOOE\n");
5990     ds_assemble(i+1,i_regs);
5991     int adj;
5992     uint64_t bc_unneeded=branch_regs[i].u;
5993     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5994     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5995     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5996     bc_unneeded|=1;
5997     bc_unneeded_upper|=1;
5998     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5999                   bc_unneeded,bc_unneeded_upper);
6000     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
6001     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6002     cc=get_reg(branch_regs[i].regmap,CCREG);
6003     assert(cc==HOST_CCREG);
6004     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
6005     assem_debug("cycle count (adj)\n");
6006     if(1) {
6007       int nottaken=0;
6008       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6009       if(1) {
6010         assert(fs>=0);
6011         emit_testimm(fs,0x800000);
6012         if(source[i]&0x10000) // BC1T
6013         {
6014           if(invert){
6015             nottaken=(int)out;
6016             emit_jeq(1);
6017           }else{
6018             add_to_linker((int)out,ba[i],internal);
6019             emit_jne(0);
6020           }
6021         }
6022         else // BC1F
6023         {
6024           if(invert){
6025             nottaken=(int)out;
6026             emit_jne(1);
6027           }else{
6028             add_to_linker((int)out,ba[i],internal);
6029             emit_jeq(0);
6030           }
6031         }
6032       } // if(!only32)
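      /* 0x800000 is bit 23 of the FP control/status word, the FPU condition
       * flag: BC1T branches when it is set, BC1F when it is clear, which is
       * exactly the emit_testimm(fs,0x800000) / jne-vs-jeq split above. */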
6033           
6034       if(invert) {
6035         if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
6036         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
6037         else if(match) emit_addnop(13);
6038         #endif
6039         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6040         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6041         if(internal)
6042           assem_debug("branch: internal\n");
6043         else
6044           assem_debug("branch: external\n");
6045         if(internal&&is_ds[(ba[i]-start)>>2]) {
6046           ds_assemble_entry(i);
6047         }
6048         else {
6049           add_to_linker((int)out,ba[i],internal);
6050           emit_jmp(0);
6051         }
6052         set_jump_target(nottaken,(int)out);
6053       }
6054
6055       if(adj) {
6056         if(!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
6057       }
6058     } // (!unconditional)
6059   } // if(ooo)
6060   else
6061   {
6062     // In-order execution (branch first)
6063     //printf("IOE\n");
6064     int nottaken=0;
6065     if(1) {
6066       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
6067       if(1) {
6068         assert(fs>=0);
6069         emit_testimm(fs,0x800000);
6070         if(source[i]&0x10000) // BC1T
6071         {
6072           nottaken=(int)out;
6073           emit_jeq(1);
6074         }
6075         else // BC1F
6076         {
6077           nottaken=(int)out;
6078           emit_jne(1);
6079         }
6080       }
6081     } // if(!unconditional)
6082     int adj;
6083     uint64_t ds_unneeded=branch_regs[i].u;
6084     uint64_t ds_unneeded_upper=branch_regs[i].uu;
6085     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6086     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6087     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
6088     ds_unneeded|=1;
6089     ds_unneeded_upper|=1;
6090     // branch taken
6091     //assem_debug("1:\n");
6092     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6093                   ds_unneeded,ds_unneeded_upper);
6094     // load regs
6095     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6096     address_generation(i+1,&branch_regs[i],0);
6097     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
6098     ds_assemble(i+1,&branch_regs[i]);
6099     cc=get_reg(branch_regs[i].regmap,CCREG);
6100     if(cc==-1) {
6101       emit_loadreg(CCREG,cc=HOST_CCREG);
6102       // CHECK: Is the following instruction (fall thru) allocated ok?
6103     }
6104     assert(cc==HOST_CCREG);
6105     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6106     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
6107     assem_debug("cycle count (adj)\n");
6108     if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
6109     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
6110     if(internal)
6111       assem_debug("branch: internal\n");
6112     else
6113       assem_debug("branch: external\n");
6114     if(internal&&is_ds[(ba[i]-start)>>2]) {
6115       ds_assemble_entry(i);
6116     }
6117     else {
6118       add_to_linker((int)out,ba[i],internal);
6119       emit_jmp(0);
6120     }
6121
6122     // branch not taken
6123     if(1) { // <- FIXME (don't need this)
6124       set_jump_target(nottaken,(int)out);
6125       assem_debug("1:\n");
6126       if(!likely[i]) {
6127         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
6128                       ds_unneeded,ds_unneeded_upper);
6129         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
6130         address_generation(i+1,&branch_regs[i],0);
6131         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
6132         ds_assemble(i+1,&branch_regs[i]);
6133       }
6134       cc=get_reg(branch_regs[i].regmap,CCREG);
6135       if(cc==-1&&!likely[i]) {
6136         // Cycle count isn't in a register, temporarily load it then write it out
6137         emit_loadreg(CCREG,HOST_CCREG);
6138         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6139         int jaddr=(int)out;
6140         emit_jns(0);
6141         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,NOTTAKEN,0);
6142         emit_storereg(CCREG,HOST_CCREG);
6143       }
6144       else{
6145         cc=get_reg(i_regmap,CCREG);
6146         assert(cc==HOST_CCREG);
6147         emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
6148         int jaddr=(int)out;
6149         emit_jns(0);
6150         add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
6151       }
6152     }
6153   }
6154 }
6155
6156 static void pagespan_assemble(int i,struct regstat *i_regs)
6157 {
6158   int s1l=get_reg(i_regs->regmap,rs1[i]);
6159   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
6160   int s2l=get_reg(i_regs->regmap,rs2[i]);
6161   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
6162   void *nt_branch=NULL;
6163   int taken=0;
6164   int nottaken=0;
6165   int unconditional=0;
6166   if(rs1[i]==0)
6167   {
6168     s1l=s2l;s1h=s2h;
6169     s2l=s2h=-1;
6170   }
6171   else if(rs2[i]==0)
6172   {
6173     s2l=s2h=-1;
6174   }
6175   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
6176     s1h=s2h=-1;
6177   }
6178   int hr=0;
6179   int addr,alt,ntaddr;
6180   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
6181   else {
6182     while(hr<HOST_REGS)
6183     {
6184       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
6185          (i_regs->regmap[hr]&63)!=rs1[i] &&
6186          (i_regs->regmap[hr]&63)!=rs2[i] )
6187       {
6188         addr=hr++;break;
6189       }
6190       hr++;
6191     }
6192   }
6193   while(hr<HOST_REGS)
6194   {
6195     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6196        (i_regs->regmap[hr]&63)!=rs1[i] &&
6197        (i_regs->regmap[hr]&63)!=rs2[i] )
6198     {
6199       alt=hr++;break;
6200     }
6201     hr++;
6202   }
6203   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
6204   {
6205     while(hr<HOST_REGS)
6206     {
6207       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
6208          (i_regs->regmap[hr]&63)!=rs1[i] &&
6209          (i_regs->regmap[hr]&63)!=rs2[i] )
6210       {
6211         ntaddr=hr;break;
6212       }
6213       hr++;
6214     }
6215   }
6216   assert(hr<HOST_REGS);
6217   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
6218     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
6219   }
6220   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i]+2),HOST_CCREG);
6221   if(opcode[i]==2) // J
6222   {
6223     unconditional=1;
6224   }
6225   if(opcode[i]==3) // JAL
6226   {
6227     // TODO: mini_ht
6228     int rt=get_reg(i_regs->regmap,31);
6229     emit_movimm(start+i*4+8,rt);
6230     unconditional=1;
6231   }
6232   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
6233   {
6234     emit_mov(s1l,addr);
6235     if(opcode2[i]==9) // JALR
6236     {
6237       int rt=get_reg(i_regs->regmap,rt1[i]);
6238       emit_movimm(start+i*4+8,rt);
6239     }
6240   }
6241   if((opcode[i]&0x3f)==4) // BEQ
6242   {
6243     if(rs1[i]==rs2[i])
6244     {
6245       unconditional=1;
6246     }
6247     else
6248     #ifdef HAVE_CMOV_IMM
6249     if(s1h<0) {
6250       if(s2l>=0) emit_cmp(s1l,s2l);
6251       else emit_test(s1l,s1l);
6252       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
6253     }
6254     else
6255     #endif
6256     {
6257       assert(s1l>=0);
6258       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6259       if(s1h>=0) {
6260         if(s2h>=0) emit_cmp(s1h,s2h);
6261         else emit_test(s1h,s1h);
6262         emit_cmovne_reg(alt,addr);
6263       }
6264       if(s2l>=0) emit_cmp(s1l,s2l);
6265       else emit_test(s1l,s1l);
6266       emit_cmovne_reg(alt,addr);
6267     }
6268   }
6269   if((opcode[i]&0x3f)==5) // BNE
6270   {
6271     #ifdef HAVE_CMOV_IMM
6272     if(s1h<0) {
6273       if(s2l>=0) emit_cmp(s1l,s2l);
6274       else emit_test(s1l,s1l);
6275       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
6276     }
6277     else
6278     #endif
6279     {
6280       assert(s1l>=0);
6281       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
6282       if(s1h>=0) {
6283         if(s2h>=0) emit_cmp(s1h,s2h);
6284         else emit_test(s1h,s1h);
6285         emit_cmovne_reg(alt,addr);
6286       }
6287       if(s2l>=0) emit_cmp(s1l,s2l);
6288       else emit_test(s1l,s1l);
6289       emit_cmovne_reg(alt,addr);
6290     }
6291   }
6292   if((opcode[i]&0x3f)==0x14) // BEQL
6293   {
6294     if(s1h>=0) {
6295       if(s2h>=0) emit_cmp(s1h,s2h);
6296       else emit_test(s1h,s1h);
6297       nottaken=(int)out;
6298       emit_jne(0);
6299     }
6300     if(s2l>=0) emit_cmp(s1l,s2l);
6301     else emit_test(s1l,s1l);
6302     if(nottaken) set_jump_target(nottaken,(int)out);
6303     nottaken=(int)out;
6304     emit_jne(0);
6305   }
6306   if((opcode[i]&0x3f)==0x15) // BNEL
6307   {
6308     if(s1h>=0) {
6309       if(s2h>=0) emit_cmp(s1h,s2h);
6310       else emit_test(s1h,s1h);
6311       taken=(int)out;
6312       emit_jne(0);
6313     }
6314     if(s2l>=0) emit_cmp(s1l,s2l);
6315     else emit_test(s1l,s1l);
6316     nottaken=(int)out;
6317     emit_jeq(0);
6318     if(taken) set_jump_target(taken,(int)out);
6319   }
6320   if((opcode[i]&0x3f)==6) // BLEZ
6321   {
6322     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6323     emit_cmpimm(s1l,1);
6324     if(s1h>=0) emit_mov(addr,ntaddr);
6325     emit_cmovl_reg(alt,addr);
6326     if(s1h>=0) {
6327       emit_test(s1h,s1h);
6328       emit_cmovne_reg(ntaddr,addr);
6329       emit_cmovs_reg(alt,addr);
6330     }
6331   }
6332   if((opcode[i]&0x3f)==7) // BGTZ
6333   {
6334     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
6335     emit_cmpimm(s1l,1);
6336     if(s1h>=0) emit_mov(addr,alt);
6337     emit_cmovl_reg(ntaddr,addr);
6338     if(s1h>=0) {
6339       emit_test(s1h,s1h);
6340       emit_cmovne_reg(alt,addr);
6341       emit_cmovs_reg(ntaddr,addr);
6342     }
6343   }
6344   if((opcode[i]&0x3f)==0x16) // BLEZL
6345   {
6346     assert((opcode[i]&0x3f)!=0x16);
6347   }
6348   if((opcode[i]&0x3f)==0x17) // BGTZL
6349   {
6350     assert((opcode[i]&0x3f)!=0x17);
6351   }
6352   assert(opcode[i]!=1); // BLTZ/BGEZ
6353
6354   //FIXME: Check CSREG
6355   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
6356     if((source[i]&0x30000)==0) // BC1F
6357     {
6358       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
6359       emit_testimm(s1l,0x800000);
6360       emit_cmovne_reg(alt,addr);
6361     }
6362     if((source[i]&0x30000)==0x10000) // BC1T
6363     {
6364       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
6365       emit_testimm(s1l,0x800000);
6366       emit_cmovne_reg(alt,addr);
6367     }
6368     if((source[i]&0x30000)==0x20000) // BC1FL
6369     {
6370       emit_testimm(s1l,0x800000);
6371       nottaken=(int)out;
6372       emit_jne(0);
6373     }
6374     if((source[i]&0x30000)==0x30000) // BC1TL
6375     {
6376       emit_testimm(s1l,0x800000);
6377       nottaken=(int)out;
6378       emit_jeq(0);
6379     }
6380   }
6381
6382   assert(i_regs->regmap[HOST_CCREG]==CCREG);
6383   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6384   if(likely[i]||unconditional)
6385   {
6386     emit_movimm(ba[i],HOST_BTREG);
6387   }
6388   else if(addr!=HOST_BTREG)
6389   {
6390     emit_mov(addr,HOST_BTREG);
6391   }
6392   void *branch_addr=out;
6393   emit_jmp(0);
6394   int target_addr=start+i*4+5;
6395   void *stub=out;
6396   void *compiled_target_addr=check_addr(target_addr);
6397   emit_extjump_ds((int)branch_addr,target_addr);
6398   if(compiled_target_addr) {
6399     set_jump_target((int)branch_addr,(int)compiled_target_addr);
6400     add_link(target_addr,stub);
6401   }
6402   else set_jump_target((int)branch_addr,(int)stub);
6403   if(likely[i]) {
6404     // Not-taken path
6405     set_jump_target((int)nottaken,(int)out);
6406     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
6407     void *branch_addr=out;
6408     emit_jmp(0);
6409     int target_addr=start+i*4+8;
6410     void *stub=out;
6411     void *compiled_target_addr=check_addr(target_addr);
6412     emit_extjump_ds((int)branch_addr,target_addr);
6413     if(compiled_target_addr) {
6414       set_jump_target((int)branch_addr,(int)compiled_target_addr);
6415       add_link(target_addr,stub);
6416     }
6417     else set_jump_target((int)branch_addr,(int)stub);
6418   }
6419 }
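/* pagespan_assemble appears to handle a branch whose delay slot lies at the
 * start of the next page/block: the chosen target is materialized in
 * HOST_BTREG, the dirty registers are written back, and the code exits through
 * an external-jump stub using the same link/patch pattern as normal branches
 * (sketch, mirroring the code above):
 *
 *   void *branch_addr=out;
 *   emit_jmp(0);                               // placeholder
 *   void *stub=out;
 *   void *compiled=check_addr(target_addr);    // target already compiled?
 *   emit_extjump_ds((int)branch_addr,target_addr);
 *   if(compiled) { set_jump_target((int)branch_addr,(int)compiled); add_link(target_addr,stub); }
 *   else set_jump_target((int)branch_addr,(int)stub);
 *
 * Note target_addr is start+i*4+5 rather than +4 -- the low bit appears to tag
 * the entry as a delay-slot entry point (compare pagespan_ds() below, which
 * registers itself at start+1). */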
6420
6421 // Assemble the delay slot for the above
6422 static void pagespan_ds()
6423 {
6424   assem_debug("initial delay slot:\n");
6425   u_int vaddr=start+1;
6426   u_int page=get_page(vaddr);
6427   u_int vpage=get_vpage(vaddr);
6428   ll_add(jump_dirty+vpage,vaddr,(void *)out);
6429   do_dirty_stub_ds();
6430   ll_add(jump_in+page,vaddr,(void *)out);
6431   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
6432   if(regs[0].regmap[HOST_CCREG]!=CCREG)
6433     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
6434   if(regs[0].regmap[HOST_BTREG]!=BTREG)
6435     emit_writeword(HOST_BTREG,(int)&branch_target);
6436   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
6437   address_generation(0,&regs[0],regs[0].regmap_entry);
6438   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
6439     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
6440   cop1_usable=0;
6441   is_delayslot=0;
6442   switch(itype[0]) {
6443     case ALU:
6444       alu_assemble(0,&regs[0]);break;
6445     case IMM16:
6446       imm16_assemble(0,&regs[0]);break;
6447     case SHIFT:
6448       shift_assemble(0,&regs[0]);break;
6449     case SHIFTIMM:
6450       shiftimm_assemble(0,&regs[0]);break;
6451     case LOAD:
6452       load_assemble(0,&regs[0]);break;
6453     case LOADLR:
6454       loadlr_assemble(0,&regs[0]);break;
6455     case STORE:
6456       store_assemble(0,&regs[0]);break;
6457     case STORELR:
6458       storelr_assemble(0,&regs[0]);break;
6459     case COP0:
6460       cop0_assemble(0,&regs[0]);break;
6461     case COP1:
6462       cop1_assemble(0,&regs[0]);break;
6463     case C1LS:
6464       c1ls_assemble(0,&regs[0]);break;
6465     case COP2:
6466       cop2_assemble(0,&regs[0]);break;
6467     case C2LS:
6468       c2ls_assemble(0,&regs[0]);break;
6469     case C2OP:
6470       c2op_assemble(0,&regs[0]);break;
6471     case FCONV:
6472       fconv_assemble(0,&regs[0]);break;
6473     case FLOAT:
6474       float_assemble(0,&regs[0]);break;
6475     case FCOMP:
6476       fcomp_assemble(0,&regs[0]);break;
6477     case MULTDIV:
6478       multdiv_assemble(0,&regs[0]);break;
6479     case MOV:
6480       mov_assemble(0,&regs[0]);break;
6481     case SYSCALL:
6482     case HLECALL:
6483     case SPAN:
6484     case UJUMP:
6485     case RJUMP:
6486     case CJUMP:
6487     case SJUMP:
6488     case FJUMP:
6489       printf("Jump in the delay slot.  This is probably a bug.\n");
6490   }
6491   int btaddr=get_reg(regs[0].regmap,BTREG);
6492   if(btaddr<0) {
6493     btaddr=get_reg(regs[0].regmap,-1);
6494     emit_readword((int)&branch_target,btaddr);
6495   }
6496   assert(btaddr!=HOST_CCREG);
6497   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
6498 #ifdef HOST_IMM8
6499   emit_movimm(start+4,HOST_TEMPREG);
6500   emit_cmp(btaddr,HOST_TEMPREG);
6501 #else
6502   emit_cmpimm(btaddr,start+4);
6503 #endif
6504   int branch=(int)out;
6505   emit_jeq(0);
6506   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
6507   emit_jmp(jump_vaddr_reg[btaddr]);
6508   set_jump_target(branch,(int)out);
6509   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6510   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
6511 }
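/* The entry generated above compares the saved branch target (branch_target /
 * BTREG) against start+4: if the branch from the previous page targeted the
 * next sequential instruction, execution continues at start+4 in this block;
 * otherwise it dispatches indirectly through jump_vaddr_reg[btaddr] to
 * wherever the branch actually went.  (Reading of the code above; the
 * comparison and both exits are the emit_cmpimm/emit_jeq/emit_jmp sequence.) */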
6512
6513 // Basic liveness analysis for MIPS registers
6514 void unneeded_registers(int istart,int iend,int r)
6515 {
6516   int i;
6517   uint64_t u,uu,b,bu;
6518   uint64_t temp_u,temp_uu;
6519   uint64_t tdep;
6520   if(iend==slen-1) {
6521     u=1;uu=1;
6522   }else{
6523     u=unneeded_reg[iend+1];
6524     uu=unneeded_reg_upper[iend+1];
6525     u=1;uu=1;
6526   }
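  /* Convention for the bitmasks below (sketch): bit r of 'u' set means the low
   * 32 bits of guest register r are not needed beyond this point, and 'uu'
   * tracks the upper halves.  Walking the block backwards, each instruction
   * updates the masks roughly as:
   *
   *   u |=  (1LL<<rt1[i])|(1LL<<rt2[i]);    // a value about to be overwritten
   *                                         // is dead above this point
   *   u &= ~((1LL<<rs1[i])|(1LL<<rs2[i]));  // a value read here is live above
   *   u |= 1;                               // $zero never needs to be kept
   *
   * Branches additionally merge in their delay slot and, for "likely"
   * branches, the fall-through path, as done case by case below. */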
6527   for (i=iend;i>=istart;i--)
6528   {
6529     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
6530     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6531     {
6532       // If subroutine call, flag return address as a possible branch target
6533       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
6534       
6535       if(ba[i]<start || ba[i]>=(start+slen*4))
6536       {
6537         // Branch out of this block, flush all regs
6538         u=1;
6539         uu=1;
6540         /* Hexagon hack 
6541         if(itype[i]==UJUMP&&rt1[i]==31)
6542         {
6543           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
6544         }
6545         if(itype[i]==RJUMP&&rs1[i]==31)
6546         {
6547           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
6548         }
6549         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
6550           if(itype[i]==UJUMP&&rt1[i]==31)
6551           {
6552             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
6553             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
6554           }
6555           if(itype[i]==RJUMP&&rs1[i]==31)
6556           {
6557             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
6558             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
6559           }
6560         }*/
6561         branch_unneeded_reg[i]=u;
6562         branch_unneeded_reg_upper[i]=uu;
6563         // Merge in delay slot
6564         tdep=(~uu>>rt1[i+1])&1;
6565         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6566         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6567         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6568         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6569         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6570         u|=1;uu|=1;
6571         // If branch is "likely" (and conditional)
6572         // then we skip the delay slot on the fall-thru path
6573         if(likely[i]) {
6574           if(i<slen-1) {
6575             u&=unneeded_reg[i+2];
6576             uu&=unneeded_reg_upper[i+2];
6577           }
6578           else
6579           {
6580             u=1;
6581             uu=1;
6582           }
6583         }
6584       }
6585       else
6586       {
6587         // Internal branch, flag target
6588         bt[(ba[i]-start)>>2]=1;
6589         if(ba[i]<=start+i*4) {
6590           // Backward branch
6591           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6592           {
6593             // Unconditional branch
6594             temp_u=1;temp_uu=1;
6595           } else {
6596             // Conditional branch (not taken case)
6597             temp_u=unneeded_reg[i+2];
6598             temp_uu=unneeded_reg_upper[i+2];
6599           }
6600           // Merge in delay slot
6601           tdep=(~temp_uu>>rt1[i+1])&1;
6602           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6603           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6604           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6605           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6606           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6607           temp_u|=1;temp_uu|=1;
6608           // If branch is "likely" (and conditional)
6609           // then we skip the delay slot on the fall-thru path
6610           if(likely[i]) {
6611             if(i<slen-1) {
6612               temp_u&=unneeded_reg[i+2];
6613               temp_uu&=unneeded_reg_upper[i+2];
6614             }
6615             else
6616             {
6617               temp_u=1;
6618               temp_uu=1;
6619             }
6620           }
6621           tdep=(~temp_uu>>rt1[i])&1;
6622           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6623           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
6624           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6625           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
6626           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
6627           temp_u|=1;temp_uu|=1;
6628           unneeded_reg[i]=temp_u;
6629           unneeded_reg_upper[i]=temp_uu;
6630           // Only go three levels deep.  This recursion can take an
6631           // excessive amount of time if there are a lot of nested loops.
6632           if(r<2) {
6633             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
6634           }else{
6635             unneeded_reg[(ba[i]-start)>>2]=1;
6636             unneeded_reg_upper[(ba[i]-start)>>2]=1;
6637           }
6638         } /*else*/ if(1) {
6639           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6640           {
6641             // Unconditional branch
6642             u=unneeded_reg[(ba[i]-start)>>2];
6643             uu=unneeded_reg_upper[(ba[i]-start)>>2];
6644             branch_unneeded_reg[i]=u;
6645             branch_unneeded_reg_upper[i]=uu;
6646         //u=1;
6647         //uu=1;
6648         //branch_unneeded_reg[i]=u;
6649         //branch_unneeded_reg_upper[i]=uu;
6650             // Merge in delay slot
6651             tdep=(~uu>>rt1[i+1])&1;
6652             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6653             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6654             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6655             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6656             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6657             u|=1;uu|=1;
6658           } else {
6659             // Conditional branch
6660             b=unneeded_reg[(ba[i]-start)>>2];
6661             bu=unneeded_reg_upper[(ba[i]-start)>>2];
6662             branch_unneeded_reg[i]=b;
6663             branch_unneeded_reg_upper[i]=bu;
6664         //b=1;
6665         //bu=1;
6666         //branch_unneeded_reg[i]=b;
6667         //branch_unneeded_reg_upper[i]=bu;
6668             // Branch delay slot
6669             tdep=(~uu>>rt1[i+1])&1;
6670             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6671             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
6672             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6673             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
6674             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
6675             b|=1;bu|=1;
6676             // If branch is "likely" then we skip the
6677             // delay slot on the fall-thru path
6678             if(likely[i]) {
6679               u=b;
6680               uu=bu;
6681               if(i<slen-1) {
6682                 u&=unneeded_reg[i+2];
6683                 uu&=unneeded_reg_upper[i+2];
6684         //u=1;
6685         //uu=1;
6686               }
6687             } else {
6688               u&=b;
6689               uu&=bu;
6690         //u=1;
6691         //uu=1;
6692             }
6693             if(i<slen-1) {
6694               branch_unneeded_reg[i]&=unneeded_reg[i+2];
6695               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
6696         //branch_unneeded_reg[i]=1;
6697         //branch_unneeded_reg_upper[i]=1;
6698             } else {
6699               branch_unneeded_reg[i]=1;
6700               branch_unneeded_reg_upper[i]=1;
6701             }
6702           }
6703         }
6704       }
6705     }
6706     else if(itype[i]==SYSCALL||itype[i]==HLECALL)
6707     {
6708       // SYSCALL instruction (software interrupt)
6709       u=1;
6710       uu=1;
6711     }
6712     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6713     {
6714       // ERET instruction (return from interrupt)
6715       u=1;
6716       uu=1;
6717     }
6718     //u=uu=1; // DEBUG
6719     tdep=(~uu>>rt1[i])&1;
6720     // Written registers are unneeded
6721     u|=1LL<<rt1[i];
6722     u|=1LL<<rt2[i];
6723     uu|=1LL<<rt1[i];
6724     uu|=1LL<<rt2[i];
6725     // Accessed registers are needed
6726     u&=~(1LL<<rs1[i]);
6727     u&=~(1LL<<rs2[i]);
6728     uu&=~(1LL<<us1[i]);
6729     uu&=~(1LL<<us2[i]);
6730     // Source-target dependencies
6731     uu&=~(tdep<<dep1[i]);
6732     uu&=~(tdep<<dep2[i]);
6733     // R0 is always unneeded
6734     u|=1;uu|=1;
6735     // Save it
6736     unneeded_reg[i]=u;
6737     unneeded_reg_upper[i]=uu;
6738     /*
6739     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6740     printf("U:");
6741     int r;
6742     for(r=1;r<=CCREG;r++) {
6743       if((unneeded_reg[i]>>r)&1) {
6744         if(r==HIREG) printf(" HI");
6745         else if(r==LOREG) printf(" LO");
6746         else printf(" r%d",r);
6747       }
6748     }
6749     printf(" UU:");
6750     for(r=1;r<=CCREG;r++) {
6751       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6752         if(r==HIREG) printf(" HI");
6753         else if(r==LOREG) printf(" LO");
6754         else printf(" r%d",r);
6755       }
6756     }
6757     printf("\n");*/
6758   }
6759 #ifdef FORCE32
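       // FORCE32 builds never need the upper register halves, so mark them all unneeded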
6760   for (i=iend;i>=istart;i--)
6761   {
6762     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
6763   }
6764 #endif
6765 }
6766
6767 // Identify registers which are likely to contain 32-bit values
6768 // This is used to predict whether any branches will jump to a
6769 // location with 64-bit values in registers.
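     // p32[i] is a provisional bitmap: bit r set means register r is expected to
     // hold a (sign-extended) 32-bit value after instruction i.  At a branch
     // target the guess is ANDed with the guess of every earlier branch into it,
     // and reset to the conservative value if a later (forward) branch also
     // jumps there.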
6770 static void provisional_32bit()
6771 {
6772   int i,j;
6773   uint64_t is32=1;
6774   uint64_t lastbranch=1;
6775   
6776   for(i=0;i<slen;i++)
6777   {
6778     if(i>0) {
6779       if(itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP) {
6780         if(i>1) is32=lastbranch;
6781         else is32=1;
6782       }
6783     }
6784     if(i>1)
6785     {
6786       if(itype[i-2]==CJUMP||itype[i-2]==SJUMP||itype[i-2]==FJUMP) {
6787         if(likely[i-2]) {
6788           if(i>2) is32=lastbranch;
6789           else is32=1;
6790         }
6791       }
6792       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
6793       {
6794         if(rs1[i-2]==0||rs2[i-2]==0)
6795         {
6796           if(rs1[i-2]) {
6797             is32|=1LL<<rs1[i-2];
6798           }
6799           if(rs2[i-2]) {
6800             is32|=1LL<<rs2[i-2];
6801           }
6802         }
6803       }
6804     }
6805     // If something jumps here with 64-bit values
6806     // then promote those registers to 64 bits
6807     if(bt[i])
6808     {
6809       uint64_t temp_is32=is32;
6810       for(j=i-1;j>=0;j--)
6811       {
6812         if(ba[j]==start+i*4) 
6813           //temp_is32&=branch_regs[j].is32;
6814           temp_is32&=p32[j];
6815       }
6816       for(j=i;j<slen;j++)
6817       {
6818         if(ba[j]==start+i*4) 
6819           temp_is32=1;
6820       }
6821       is32=temp_is32;
6822     }
6823     int type=itype[i];
6824     int op=opcode[i];
6825     int op2=opcode2[i];
6826     int rt=rt1[i];
6827     int s1=rs1[i];
6828     int s2=rs2[i];
6829     if(type==UJUMP||type==RJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
6830       // Branches don't write registers, consider the delay slot instead.
6831       type=itype[i+1];
6832       op=opcode[i+1];
6833       op2=opcode2[i+1];
6834       rt=rt1[i+1];
6835       s1=rs1[i+1];
6836       s2=rs2[i+1];
6837       lastbranch=is32;
6838     }
6839     switch(type) {
6840       case LOAD:
6841         if(opcode[i]==0x27||opcode[i]==0x37|| // LWU/LD
6842            opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
6843           is32&=~(1LL<<rt);
6844         else
6845           is32|=1LL<<rt;
6846         break;
6847       case STORE:
6848       case STORELR:
6849         break;
6850       case LOADLR:
6851         if(op==0x1a||op==0x1b) is32&=~(1LL<<rt); // LDR/LDL
6852         if(op==0x22) is32|=1LL<<rt; // LWL
6853         break;
6854       case IMM16:
6855         if (op==0x08||op==0x09|| // ADDI/ADDIU
6856             op==0x0a||op==0x0b|| // SLTI/SLTIU
6857             op==0x0c|| // ANDI
6858             op==0x0f)  // LUI
6859         {
6860           is32|=1LL<<rt;
6861         }
6862         if(op==0x18||op==0x19) { // DADDI/DADDIU
6863           is32&=~(1LL<<rt);
6864           //if(imm[i]==0)
6865           //  is32|=((is32>>s1)&1LL)<<rt;
6866         }
6867         if(op==0x0d||op==0x0e) { // ORI/XORI
6868           uint64_t sr=((is32>>s1)&1LL);
6869           is32&=~(1LL<<rt);
6870           is32|=sr<<rt;
6871         }
6872         break;
6873       case UJUMP:
6874         break;
6875       case RJUMP:
6876         break;
6877       case CJUMP:
6878         break;
6879       case SJUMP:
6880         break;
6881       case FJUMP:
6882         break;
6883       case ALU:
6884         if(op2>=0x20&&op2<=0x23) { // ADD/ADDU/SUB/SUBU
6885           is32|=1LL<<rt;
6886         }
6887         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
6888           is32|=1LL<<rt;
6889         }
6890         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
6891           uint64_t sr=((is32>>s1)&(is32>>s2)&1LL);
6892           is32&=~(1LL<<rt);
6893           is32|=sr<<rt;
6894         }
6895         else if(op2>=0x2c&&op2<=0x2d) { // DADD/DADDU
6896           if(s1==0&&s2==0) {
6897             is32|=1LL<<rt;
6898           }
6899           else if(s2==0) {
6900             uint64_t sr=((is32>>s1)&1LL);
6901             is32&=~(1LL<<rt);
6902             is32|=sr<<rt;
6903           }
6904           else if(s1==0) {
6905             uint64_t sr=((is32>>s2)&1LL);
6906             is32&=~(1LL<<rt);
6907             is32|=sr<<rt;
6908           }
6909           else {
6910             is32&=~(1LL<<rt);
6911           }
6912         }
6913         else if(op2>=0x2e&&op2<=0x2f) { // DSUB/DSUBU
6914           if(s1==0&&s2==0) {
6915             is32|=1LL<<rt;
6916           }
6917           else if(s2==0) {
6918             uint64_t sr=((is32>>s1)&1LL);
6919             is32&=~(1LL<<rt);
6920             is32|=sr<<rt;
6921           }
6922           else {
6923             is32&=~(1LL<<rt);
6924           }
6925         }
6926         break;
6927       case MULTDIV:
6928         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
6929           is32&=~((1LL<<HIREG)|(1LL<<LOREG));
6930         }
6931         else {
6932           is32|=(1LL<<HIREG)|(1LL<<LOREG);
6933         }
6934         break;
6935       case MOV:
6936         {
6937           uint64_t sr=((is32>>s1)&1LL);
6938           is32&=~(1LL<<rt);
6939           is32|=sr<<rt;
6940         }
6941         break;
6942       case SHIFT:
6943         if(op2>=0x14&&op2<=0x17) is32&=~(1LL<<rt); // DSLLV/DSRLV/DSRAV
6944         else is32|=1LL<<rt; // SLLV/SRLV/SRAV
6945         break;
6946       case SHIFTIMM:
6947         is32|=1LL<<rt;
6948         // DSLL/DSRL/DSRA/DSLL32/DSRL32 but not DSRA32 have 64-bit result
6949         if(op2>=0x38&&op2<0x3f) is32&=~(1LL<<rt);
6950         break;
6951       case COP0:
6952         if(op2==0) is32|=1LL<<rt; // MFC0
6953         break;
6954       case COP1:
6955       case COP2:
6956         if(op2==0) is32|=1LL<<rt; // MFC1
6957         if(op2==1) is32&=~(1LL<<rt); // DMFC1
6958         if(op2==2) is32|=1LL<<rt; // CFC1
6959         break;
6960       case C1LS:
6961       case C2LS:
6962         break;
6963       case FLOAT:
6964       case FCONV:
6965         break;
6966       case FCOMP:
6967         break;
6968       case C2OP:
6969       case SYSCALL:
6970       case HLECALL:
6971         break;
6972       default:
6973         break;
6974     }
6975     is32|=1;
6976     p32[i]=is32;
6977
6978     if(i>0)
6979     {
6980       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
6981       {
6982         if(rt1[i-1]==31) // JAL/JALR
6983         {
6984           // Subroutine call will return here, don't alloc any registers
6985           is32=1;
6986         }
6987         else if(i+1<slen)
6988         {
6989           // Internal branch will jump here, match registers to caller
6990           is32=0x3FFFFFFFFLL;
6991         }
6992       }
6993     }
6994   }
6995 }
6996
6997 // Identify registers which may be assumed to contain 32-bit values
6998 // and where optimizations will rely on this.
6999 // This is used to determine whether backward branches can safely
7000 // jump to a location with 64-bit values in registers.
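     // pr32[i] is computed by a backward pass: bit r set means the code at
     // instruction i is expected to rely on register r really holding a 32-bit
     // value (the commented-out "requires_32bit" name describes it).  Bits are
     // only added for registers that were 32-bit on entry (regs[i].was32);
     // dirty 32-bit host registers also demand 32-bit input, since they will be
     // written back as 32-bit values.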
7001 static void provisional_r32()
7002 {
7003   u_int r32=0;
7004   int i;
7005   
7006   for (i=slen-1;i>=0;i--)
7007   {
7008     int hr;
7009     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7010     {
7011       if(ba[i]<start || ba[i]>=(start+slen*4))
7012       {
7013         // Branch out of this block, don't need anything
7014         r32=0;
7015       }
7016       else
7017       {
7018         // Internal branch
7019         // Need whatever matches the target
7020         // (and doesn't get overwritten by the delay slot instruction)
7021         r32=0;
7022         int t=(ba[i]-start)>>2;
7023         if(ba[i]>start+i*4) {
7024           // Forward branch
7025           //if(!(requires_32bit[t]&~regs[i].was32))
7026           //  r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7027           if(!(pr32[t]&~regs[i].was32))
7028             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7029         }else{
7030           // Backward branch
7031           if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
7032             r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
7033         }
7034       }
7035       // Conditional branch may need registers for following instructions
7036       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
7037       {
7038         if(i<slen-2) {
7039           //r32|=requires_32bit[i+2];
7040           r32|=pr32[i+2];
7041           r32&=regs[i].was32;
7042           // Mark this address as a branch target since it may be called
7043           // upon return from interrupt
7044           //bt[i+2]=1;
7045         }
7046       }
7047       // Merge in delay slot
7048       if(!likely[i]) {
7049         // These are overwritten unless the branch is "likely"
7050         // and the delay slot is nullified if not taken
7051         r32&=~(1LL<<rt1[i+1]);
7052         r32&=~(1LL<<rt2[i+1]);
7053       }
7054       // Assume these are needed (delay slot)
7055       if(us1[i+1]>0)
7056       {
7057         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
7058       }
7059       if(us2[i+1]>0)
7060       {
7061         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
7062       }
7063       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
7064       {
7065         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
7066       }
7067       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
7068       {
7069         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
7070       }
7071     }
7072     else if(itype[i]==SYSCALL||itype[i]==HLECALL)
7073     {
7074       // SYSCALL instruction (software interrupt)
7075       r32=0;
7076     }
7077     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7078     {
7079       // ERET instruction (return from interrupt)
7080       r32=0;
7081     }
7082     // Check 32 bits
7083     r32&=~(1LL<<rt1[i]);
7084     r32&=~(1LL<<rt2[i]);
7085     if(us1[i]>0)
7086     {
7087       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
7088     }
7089     if(us2[i]>0)
7090     {
7091       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
7092     }
7093     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
7094     {
7095       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
7096     }
7097     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
7098     {
7099       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
7100     }
7101     //requires_32bit[i]=r32;
7102     pr32[i]=r32;
7103     
7104     // Dirty registers which are 32-bit, require 32-bit input
7105     // as they will be written as 32-bit values
7106     for(hr=0;hr<HOST_REGS;hr++)
7107     {
7108       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
7109         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
7110           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
7111             pr32[i]|=1LL<<regs[i].regmap_entry[hr];
7112           //requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
7113         }
7114       }
7115     }
7116   }
7117 }
7118
7119 // Write back dirty registers as soon as we will no longer modify them,
7120 // so that we don't end up with lots of writes at the branches.
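     // will_dirty[] and wont_dirty[] are bitmaps over host registers, filled in
     // by a backward pass (wr==0 runs the analysis only, e.g. for backward-branch
     // targets; wr!=0 also applies the result to regs[].dirty/wasdirty and
     // branch_regs[].dirty).  Roughly, a set bit in will_dirty[i] means that host
     // register is certain to be dirtied again at or after instruction i, so
     // writing it back earlier would be wasted work; wont_dirty[i] is used as the
     // mask of dirty flags to keep.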
7121 void clean_registers(int istart,int iend,int wr)
7122 {
7123   int i;
7124   int r;
7125   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
7126   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
7127   if(iend==slen-1) {
7128     will_dirty_i=will_dirty_next=0;
7129     wont_dirty_i=wont_dirty_next=0;
7130   }else{
7131     will_dirty_i=will_dirty_next=will_dirty[iend+1];
7132     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
7133   }
7134   for (i=iend;i>=istart;i--)
7135   {
7136     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7137     {
7138       if(ba[i]<start || ba[i]>=(start+slen*4))
7139       {
7140         // Branch out of this block, flush all regs
7141         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7142         {
7143           // Unconditional branch
7144           will_dirty_i=0;
7145           wont_dirty_i=0;
7146           // Merge in delay slot (will dirty)
7147           for(r=0;r<HOST_REGS;r++) {
7148             if(r!=EXCLUDE_REG) {
7149               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7150               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7151               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7152               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7153               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7154               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7155               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7156               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7157               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7158               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7159               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7160               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7161               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7162               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7163             }
7164           }
7165         }
7166         else
7167         {
7168           // Conditional branch
7169           will_dirty_i=0;
7170           wont_dirty_i=wont_dirty_next;
7171           // Merge in delay slot (will dirty)
7172           for(r=0;r<HOST_REGS;r++) {
7173             if(r!=EXCLUDE_REG) {
7174               if(!likely[i]) {
7175                 // Might not dirty if likely branch is not taken
7176                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7177                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7178                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7179                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7180                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7181                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
7182                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7183                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7184                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7185                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7186                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7187                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7188                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7189                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7190               }
7191             }
7192           }
7193         }
7194         // Merge in delay slot (wont dirty)
7195         for(r=0;r<HOST_REGS;r++) {
7196           if(r!=EXCLUDE_REG) {
7197             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7198             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7199             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7200             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7201             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7202             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7203             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7204             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7205             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7206             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7207           }
7208         }
7209         if(wr) {
7210           #ifndef DESTRUCTIVE_WRITEBACK
7211           branch_regs[i].dirty&=wont_dirty_i;
7212           #endif
7213           branch_regs[i].dirty|=will_dirty_i;
7214         }
7215       }
7216       else
7217       {
7218         // Internal branch
7219         if(ba[i]<=start+i*4) {
7220           // Backward branch
7221           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7222           {
7223             // Unconditional branch
7224             temp_will_dirty=0;
7225             temp_wont_dirty=0;
7226             // Merge in delay slot (will dirty)
7227             for(r=0;r<HOST_REGS;r++) {
7228               if(r!=EXCLUDE_REG) {
7229                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7230                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7231                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7232                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7233                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7234                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7235                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7236                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7237                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7238                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7239                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7240                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7241                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7242                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7243               }
7244             }
7245           } else {
7246             // Conditional branch (not taken case)
7247             temp_will_dirty=will_dirty_next;
7248             temp_wont_dirty=wont_dirty_next;
7249             // Merge in delay slot (will dirty)
7250             for(r=0;r<HOST_REGS;r++) {
7251               if(r!=EXCLUDE_REG) {
7252                 if(!likely[i]) {
7253                   // Will not dirty if likely branch is not taken
7254                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7255                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7256                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7257                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7258                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7259                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
7260                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7261                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
7262                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
7263                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
7264                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
7265                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
7266                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
7267                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
7268                 }
7269               }
7270             }
7271           }
7272           // Merge in delay slot (wont dirty)
7273           for(r=0;r<HOST_REGS;r++) {
7274             if(r!=EXCLUDE_REG) {
7275               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7276               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7277               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7278               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7279               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7280               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
7281               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
7282               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
7283               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
7284               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
7285             }
7286           }
7287           // Deal with changed mappings
7288           if(i<iend) {
7289             for(r=0;r<HOST_REGS;r++) {
7290               if(r!=EXCLUDE_REG) {
7291                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
7292                   temp_will_dirty&=~(1<<r);
7293                   temp_wont_dirty&=~(1<<r);
7294                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7295                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7296                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7297                   } else {
7298                     temp_will_dirty|=1<<r;
7299                     temp_wont_dirty|=1<<r;
7300                   }
7301                 }
7302               }
7303             }
7304           }
7305           if(wr) {
7306             will_dirty[i]=temp_will_dirty;
7307             wont_dirty[i]=temp_wont_dirty;
7308             clean_registers((ba[i]-start)>>2,i-1,0);
7309           }else{
7310             // Limit recursion.  It can take an excessive amount
7311             // of time if there are a lot of nested loops.
7312             will_dirty[(ba[i]-start)>>2]=0;
7313             wont_dirty[(ba[i]-start)>>2]=-1;
7314           }
7315         }
7316         /*else*/ if(1)
7317         {
7318           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
7319           {
7320             // Unconditional branch
7321             will_dirty_i=0;
7322             wont_dirty_i=0;
7323           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7324             for(r=0;r<HOST_REGS;r++) {
7325               if(r!=EXCLUDE_REG) {
7326                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7327                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
7328                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7329                 }
7330               }
7331             }
7332           //}
7333             // Merge in delay slot
7334             for(r=0;r<HOST_REGS;r++) {
7335               if(r!=EXCLUDE_REG) {
7336                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7337                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7338                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7339                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7340                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7341                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7342                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7343                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7344                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7345                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7346                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7347                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7348                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7349                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7350               }
7351             }
7352           } else {
7353             // Conditional branch
7354             will_dirty_i=will_dirty_next;
7355             wont_dirty_i=wont_dirty_next;
7356           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
7357             for(r=0;r<HOST_REGS;r++) {
7358               if(r!=EXCLUDE_REG) {
7359                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7360                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7361                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7362                 }
7363                 else
7364                 {
7365                   will_dirty_i&=~(1<<r);
7366                 }
7367                 // Treat delay slot as part of branch too
7368                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
7369                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
7370                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
7371                 }
7372                 else
7373                 {
7374                   will_dirty[i+1]&=~(1<<r);
7375                 }*/
7376               }
7377             }
7378           //}
7379             // Merge in delay slot
7380             for(r=0;r<HOST_REGS;r++) {
7381               if(r!=EXCLUDE_REG) {
7382                 if(!likely[i]) {
7383                   // Might not dirty if likely branch is not taken
7384                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7385                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7386                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7387                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7388                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7389                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7390                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7391                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7392                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7393                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
7394                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
7395                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7396                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7397                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7398                 }
7399               }
7400             }
7401           }
7402           // Merge in delay slot
7403           for(r=0;r<HOST_REGS;r++) {
7404             if(r!=EXCLUDE_REG) {
7405               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7406               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7407               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7408               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7409               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7410               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7411               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7412               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
7413               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
7414               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7415             }
7416           }
7417           if(wr) {
7418             #ifndef DESTRUCTIVE_WRITEBACK
7419             branch_regs[i].dirty&=wont_dirty_i;
7420             #endif
7421             branch_regs[i].dirty|=will_dirty_i;
7422           }
7423         }
7424       }
7425     }
7426     else if(itype[i]==SYSCALL||itype[i]==HLECALL)
7427     {
7428       // SYSCALL instruction (software interrupt)
7429       will_dirty_i=0;
7430       wont_dirty_i=0;
7431     }
7432     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
7433     {
7434       // ERET instruction (return from interrupt)
7435       will_dirty_i=0;
7436       wont_dirty_i=0;
7437     }
7438     will_dirty_next=will_dirty_i;
7439     wont_dirty_next=wont_dirty_i;
7440     for(r=0;r<HOST_REGS;r++) {
7441       if(r!=EXCLUDE_REG) {
7442         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
7443         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
7444         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
7445         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
7446         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
7447         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
7448         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
7449         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
7450         if(i>istart) {
7451           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP) 
7452           {
7453             // Don't store a register immediately after writing it,
7454             // as that may prevent dual-issue.
7455             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
7456             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
7457           }
7458         }
7459       }
7460     }
7461     // Save it
7462     will_dirty[i]=will_dirty_i;
7463     wont_dirty[i]=wont_dirty_i;
7464     // Mark registers that won't be dirtied as not dirty
7465     if(wr) {
7466       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
7467       for(r=0;r<HOST_REGS;r++) {
7468         if((will_dirty_i>>r)&1) {
7469           printf(" r%d",r);
7470         }
7471       }
7472       printf("\n");*/
7473
7474       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
7475         regs[i].dirty|=will_dirty_i;
7476         #ifndef DESTRUCTIVE_WRITEBACK
7477         regs[i].dirty&=wont_dirty_i;
7478         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
7479         {
7480           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
7481             for(r=0;r<HOST_REGS;r++) {
7482               if(r!=EXCLUDE_REG) {
7483                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
7484                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
7485                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
7486               }
7487             }
7488           }
7489         }
7490         else
7491         {
7492           if(i<iend) {
7493             for(r=0;r<HOST_REGS;r++) {
7494               if(r!=EXCLUDE_REG) {
7495                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
7496                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
7497                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);*/ /*assert(!((wont_dirty_i>>r)&1));*/}
7498               }
7499             }
7500           }
7501         }
7502         #endif
7503       //}
7504     }
7505     // Deal with changed mappings
7506     temp_will_dirty=will_dirty_i;
7507     temp_wont_dirty=wont_dirty_i;
7508     for(r=0;r<HOST_REGS;r++) {
7509       if(r!=EXCLUDE_REG) {
7510         int nr;
7511         if(regs[i].regmap[r]==regmap_pre[i][r]) {
7512           if(wr) {
7513             #ifndef DESTRUCTIVE_WRITEBACK
7514             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7515             #endif
7516             regs[i].wasdirty|=will_dirty_i&(1<<r);
7517           }
7518         }
7519         else if((nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
7520           // Register moved to a different register
7521           will_dirty_i&=~(1<<r);
7522           wont_dirty_i&=~(1<<r);
7523           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
7524           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
7525           if(wr) {
7526             #ifndef DESTRUCTIVE_WRITEBACK
7527             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
7528             #endif
7529             regs[i].wasdirty|=will_dirty_i&(1<<r);
7530           }
7531         }
7532         else {
7533           will_dirty_i&=~(1<<r);
7534           wont_dirty_i&=~(1<<r);
7535           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
7536             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7537             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
7538           } else {
7539             wont_dirty_i|=1<<r;
7540             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);*/ /*assert(!((will_dirty>>r)&1));*/
7541           }
7542         }
7543       }
7544     }
7545   }
7546 }
7547
7548   /* disassembly */
7549 void disassemble_inst(int i)
7550 {
7551     if (bt[i]) printf("*"); else printf(" ");
7552     switch(itype[i]) {
7553       case UJUMP:
7554         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7555       case CJUMP:
7556         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
7557       case SJUMP:
7558         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
7559       case FJUMP:
7560         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
7561       case RJUMP:
7562         if (opcode[i]==0x9&&rt1[i]!=31)
7563           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
7564         else
7565           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7566         break;
7567       case SPAN:
7568         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
7569       case IMM16:
7570         if(opcode[i]==0xf) //LUI
7571           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
7572         else
7573           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7574         break;
7575       case LOAD:
7576       case LOADLR:
7577         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7578         break;
7579       case STORE:
7580       case STORELR:
7581         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
7582         break;
7583       case ALU:
7584       case SHIFT:
7585         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
7586         break;
7587       case MULTDIV:
7588         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
7589         break;
7590       case SHIFTIMM:
7591         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
7592         break;
7593       case MOV:
7594         if((opcode2[i]&0x1d)==0x10)
7595           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
7596         else if((opcode2[i]&0x1d)==0x11)
7597           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
7598         else
7599           printf (" %x: %s\n",start+i*4,insn[i]);
7600         break;
7601       case COP0:
7602         if(opcode2[i]==0)
7603           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
7604         else if(opcode2[i]==4)
7605           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
7606         else printf (" %x: %s\n",start+i*4,insn[i]);
7607         break;
7608       case COP1:
7609         if(opcode2[i]<3)
7610           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
7611         else if(opcode2[i]>3)
7612           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
7613         else printf (" %x: %s\n",start+i*4,insn[i]);
7614         break;
7615       case COP2:
7616         if(opcode2[i]<3)
7617           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
7618         else if(opcode2[i]>3)
7619           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
7620         else printf (" %x: %s\n",start+i*4,insn[i]);
7621         break;
7622       case C1LS:
7623         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7624         break;
7625       case C2LS:
7626         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
7627         break;
7628       default:
7629         //printf (" %s %8x\n",insn[i],source[i]);
7630         printf (" %x: %s\n",start+i*4,insn[i]);
7631     }
7632 }
7633
7634 void new_dynarec_init()
7635 {
7636   printf("Init new dynarec\n");
7637   out=(u_char *)BASE_ADDR;
7638   if (mmap (out, 1<<TARGET_SIZE_2,
7639             PROT_READ | PROT_WRITE | PROT_EXEC,
7640             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
7641             -1, 0) == MAP_FAILED) {printf("mmap() failed\n");}
7642 #ifdef MUPEN64
7643   rdword=&readmem_dword;
7644   fake_pc.f.r.rs=&readmem_dword;
7645   fake_pc.f.r.rt=&readmem_dword;
7646   fake_pc.f.r.rd=&readmem_dword;
7647 #endif
7648   int n;
7649   for(n=0x80000;n<0x80800;n++)
7650     invalid_code[n]=1;
7651   for(n=0;n<65536;n++)
7652     hash_table[n][0]=hash_table[n][2]=-1;
7653   memset(mini_ht,-1,sizeof(mini_ht));
7654   memset(restore_candidate,0,sizeof(restore_candidate));
7655   copy=shadow;
7656   expirep=16384; // Expiry pointer, +2 blocks
7657   pending_exception=0;
7658   literalcount=0;
7659 #ifdef HOST_IMM8
7660   // Copy this into local area so we don't have to put it in every literal pool
7661   invc_ptr=invalid_code;
7662 #endif
7663   stop_after_jal=0;
7664   // TLB
7665   using_tlb=0;
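       // memory_map[] is indexed by guest page (vaddr>>12): -1 marks an
       // unmapped page, otherwise adding (entry<<2) to the guest address
       // yields the backing host address (this is how new_recompile_block
       // locates 'source' on the TLB path).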
7666   for(n=0;n<524288;n++) // 0 .. 0x7FFFFFFF
7667     memory_map[n]=-1;
7668   for(n=524288;n<526336;n++) // 0x80000000 .. 0x807FFFFF
7669     memory_map[n]=((u_int)rdram-0x80000000)>>2;
7670   for(n=526336;n<1048576;n++) // 0x80800000 .. 0xFFFFFFFF
7671     memory_map[n]=-1;
7672 #ifdef MUPEN64
7673   for(n=0;n<0x8000;n++) { // 0 .. 0x7FFFFFFF
7674     writemem[n] = write_nomem_new;
7675     writememb[n] = write_nomemb_new;
7676     writememh[n] = write_nomemh_new;
7677 #ifndef FORCE32
7678     writememd[n] = write_nomemd_new;
7679 #endif
7680     readmem[n] = read_nomem_new;
7681     readmemb[n] = read_nomemb_new;
7682     readmemh[n] = read_nomemh_new;
7683 #ifndef FORCE32
7684     readmemd[n] = read_nomemd_new;
7685 #endif
7686   }
7687   for(n=0x8000;n<0x8080;n++) { // 0x80000000 .. 0x807FFFFF
7688     writemem[n] = write_rdram_new;
7689     writememb[n] = write_rdramb_new;
7690     writememh[n] = write_rdramh_new;
7691 #ifndef FORCE32
7692     writememd[n] = write_rdramd_new;
7693 #endif
7694   }
7695   for(n=0xC000;n<0x10000;n++) { // 0xC0000000 .. 0xFFFFFFFF
7696     writemem[n] = write_nomem_new;
7697     writememb[n] = write_nomemb_new;
7698     writememh[n] = write_nomemh_new;
7699 #ifndef FORCE32
7700     writememd[n] = write_nomemd_new;
7701 #endif
7702     readmem[n] = read_nomem_new;
7703     readmemb[n] = read_nomemb_new;
7704     readmemh[n] = read_nomemh_new;
7705 #ifndef FORCE32
7706     readmemd[n] = read_nomemd_new;
7707 #endif
7708   }
7709 #endif
7710   tlb_hacks();
7711   arch_init();
7712 }
7713
7714 void new_dynarec_cleanup()
7715 {
7716   int n;
7717   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {printf("munmap() failed\n");}
7718   for(n=0;n<4096;n++) ll_clear(jump_in+n);
7719   for(n=0;n<4096;n++) ll_clear(jump_out+n);
7720   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
7721   #ifdef ROM_COPY
7722   if (munmap (ROM_COPY, 67108864) < 0) {printf("munmap() failed\n");}
7723   #endif
7724 }
7725
7726 int new_recompile_block(int addr)
7727 {
7728 /*
7729   if(addr==0x800cd050) {
7730     int block;
7731     for(block=0x80000;block<0x80800;block++) invalidate_block(block);
7732     int n;
7733     for(n=0;n<=2048;n++) ll_clear(jump_dirty+n);
7734   }
7735 */
7736   //if(Count==365117028) tracedebug=1;
7737   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7738   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
7739   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
7740   //if(debug) 
7741   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
7742   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
7743   /*if(Count>=312978186) {
7744     rlist();
7745   }*/
7746   //rlist();
7747   start = (u_int)addr&~3;
7748   //assert(((u_int)addr&1)==0);
7749 #ifdef PCSX
7750   if ((Config.HLE && start == 0x80001000) || // hlecall
7751       (/*psxRegs.pc != 0x80030000 &&*/ start == 0x80030000)) // fastbios thing
7752   {
7753     // XXX: is this enough? Maybe check hleSoftCall?
7754     u_int beginning=(u_int)out;
7755     u_int page=get_page(start);
7756     invalid_code[start>>12]=0;
7757     emit_movimm(start,0);
7758     emit_writeword(0,(int)&pcaddr);
7759     emit_jmp((int)new_dyna_leave);
7760 #ifdef __arm__
7761     __clear_cache((void *)beginning,out);
7762 #endif
7763     if (start == 0x80030000)
7764       return beginning;
7765     else
7766       ll_add(jump_in+page,start,(void *)beginning);
7767     return 0;
7768   }
7769   else if ((u_int)addr < 0x00200000 ||
7770     (0xa0000000 <= addr && addr < 0xa0200000)) {
7771     // used for BIOS calls mostly?
7772     source = (u_int *)((u_int)rdram+(start&0x1fffff));
7773     pagelimit = (addr&0xa0000000)|0x00200000;
7774   }
7775   else if (!Config.HLE && (
7776 /*    (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
7777     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
7778     // BIOS
7779     source = (u_int *)((u_int)psxR+(start&0x7ffff));
7780     pagelimit = (addr&0xfff00000)|0x80000;
7781   }
7782   else
7783 #endif
7784 #ifdef MUPEN64
7785   if ((int)addr >= 0xa4000000 && (int)addr < 0xa4001000) {
7786     source = (u_int *)((u_int)SP_DMEM+start-0xa4000000);
7787     pagelimit = 0xa4001000;
7788   }
7789   else
7790 #endif
7791   if ((int)addr >= 0x80000000 && (int)addr < 0x80000000+RAM_SIZE) {
7792     source = (u_int *)((u_int)rdram+start-0x80000000);
7793     pagelimit = 0x80000000+RAM_SIZE;
7794   }
7795 #ifndef DISABLE_TLB
7796   else if ((signed int)addr >= (signed int)0xC0000000) {
7797     //printf("addr=%x mm=%x\n",(u_int)addr,(memory_map[start>>12]<<2));
7798     //if(tlb_LUT_r[start>>12])
7799       //source = (u_int *)(((int)rdram)+(tlb_LUT_r[start>>12]&0xFFFFF000)+(((int)addr)&0xFFF)-0x80000000);
7800     if((signed int)memory_map[start>>12]>=0) {
7801       source = (u_int *)((u_int)(start+(memory_map[start>>12]<<2)));
7802       pagelimit=(start+4096)&0xFFFFF000;
7803       int map=memory_map[start>>12];
7804       int i;
7805       for(i=0;i<5;i++) {
7806         //printf("start: %x next: %x\n",map,memory_map[pagelimit>>12]);
7807         if((map&0xBFFFFFFF)==(memory_map[pagelimit>>12]&0xBFFFFFFF)) pagelimit+=4096;
7808       }
7809       assem_debug("pagelimit=%x\n",pagelimit);
7810       assem_debug("mapping=%x (%x)\n",memory_map[start>>12],(memory_map[start>>12]<<2)+start);
7811     }
7812     else {
7813       assem_debug("Compile at unmapped memory address: %x \n", (int)addr);
7814       //assem_debug("start: %x next: %x\n",memory_map[start>>12],memory_map[(start+4096)>>12]);
7815       return -1; // Caller will invoke exception handler
7816     }
7817     //printf("source= %x\n",(int)source);
7818   }
7819 #endif
7820   else {
7821     printf("Compile at bogus memory address: %x \n", (int)addr);
7822     exit(1);
7823   }
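       // At this point 'source' points at the host memory that backs the guest
       // code at 'start', and 'pagelimit' is (roughly) the first guest address
       // past the region that may safely be read while disassembling this block.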
7824
7825   /* Pass 1: disassemble */
7826   /* Pass 2: register dependencies, branch targets */
7827   /* Pass 3: register allocation */
7828   /* Pass 4: branch dependencies */
7829   /* Pass 5: pre-alloc */
7830   /* Pass 6: optimize clean/dirty state */
7831   /* Pass 7: flag 32-bit registers */
7832   /* Pass 8: assembly */
7833   /* Pass 9: linker */
7834   /* Pass 10: garbage collection / free memory */
7835
7836   int i,j;
7837   int done=0;
7838   unsigned int type,op,op2;
7839
7840   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
7841   
7842   /* Pass 1 disassembly */
7843
7844   for(i=0;!done;i++) {
7845     bt[i]=0;likely[i]=0;op2=0;
7846     opcode[i]=op=source[i]>>26;
7847     switch(op)
7848     {
7849       case 0x00: strcpy(insn[i],"special"); type=NI;
7850         op2=source[i]&0x3f;
7851         switch(op2)
7852         {
7853           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
7854           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
7855           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
7856           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
7857           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
7858           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
7859           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
7860           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
7861           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
7862           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
7863           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
7864           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
7865           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
7866           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
7867           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
7868           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
7869           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
7870           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
7871           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
7872           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
7873           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
7874           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
7875           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
7876           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
7877           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
7878           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
7879           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
7880           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
7881           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
7882           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
7883           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
7884           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
7885           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
7886           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
7887           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
7888           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
7889           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
7890           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
7891           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
7892           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
7893           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
7894           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
7895           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
7896           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
7897           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
7898           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
7899           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
7900           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
7901           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
7902           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
7903           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
7904           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
7905         }
7906         break;
7907       case 0x01: strcpy(insn[i],"regimm"); type=NI;
7908         op2=(source[i]>>16)&0x1f;
7909         switch(op2)
7910         {
7911           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
7912           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
7913           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
7914           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
7915           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
7916           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
7917           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
7918           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
7919           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
7920           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
7921           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
7922           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
7923           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
7924           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
7925         }
7926         break;
7927       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
7928       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
7929       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
7930       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
7931       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
7932       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
7933       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
7934       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
7935       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
7936       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
7937       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
7938       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
7939       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
7940       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
7941       case 0x10: strcpy(insn[i],"cop0"); type=NI;
7942         op2=(source[i]>>21)&0x1f;
7943         switch(op2)
7944         {
7945           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
7946           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
7947           case 0x10: strcpy(insn[i],"tlb"); type=NI;
7948           switch(source[i]&0x3f)
7949           {
7950             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
7951             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
7952             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
7953             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
7954 #ifdef PCSX
7955             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
7956 #else
7957             case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
7958 #endif
7959           }
7960         }
7961         break;
7962       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7963         op2=(source[i]>>21)&0x1f;
7964         switch(op2)
7965         {
7966           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7967           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7968           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7969           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7970           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7971           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7972           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7973           switch((source[i]>>16)&0x3)
7974           {
7975             case 0x00: strcpy(insn[i],"BC1F"); break;
7976             case 0x01: strcpy(insn[i],"BC1T"); break;
7977             case 0x02: strcpy(insn[i],"BC1FL"); break;
7978             case 0x03: strcpy(insn[i],"BC1TL"); break;
7979           }
7980           break;
7981           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7982           switch(source[i]&0x3f)
7983           {
7984             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7985             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7986             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7987             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7988             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7989             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7990             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7991             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7992             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7993             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7994             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7995             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7996             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7997             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7998             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7999             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
8000             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
8001             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
8002             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
8003             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
8004             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
8005             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
8006             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
8007             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
8008             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
8009             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
8010             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
8011             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
8012             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
8013             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
8014             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
8015             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
8016             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
8017             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
8018             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
8019           }
8020           break;
8021           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
8022           switch(source[i]&0x3f)
8023           {
8024             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
8025             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
8026             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
8027             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
8028             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
8029             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
8030             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
8031             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
8032             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
8033             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
8034             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
8035             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
8036             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
8037             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
8038             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
8039             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
8040             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
8041             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
8042             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
8043             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
8044             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
8045             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
8046             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
8047             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
8048             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
8049             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
8050             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
8051             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
8052             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
8053             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
8054             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
8055             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
8056             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
8057             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
8058             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
8059           }
8060           break;
8061           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
8062           switch(source[i]&0x3f)
8063           {
8064             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
8065             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
8066           }
8067           break;
8068           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
8069           switch(source[i]&0x3f)
8070           {
8071             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
8072             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
8073           }
8074           break;
8075         }
8076         break;
8077       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
8078       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
8079       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
8080       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
8081 #ifndef FORCE32
8082       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
8083       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
8084       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
8085       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
8086 #endif
8087       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
8088       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
8089       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
8090       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
8091       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
8092       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
8093       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
8094       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
8095       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
8096       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
8097       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
8098       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
8099 #ifndef FORCE32
8100       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
8101       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
8102 #endif
8103       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
8104       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
8105       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
8106       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
8107 #ifndef FORCE32
8108       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
8109       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
8110       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
8111 #endif
8112       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
8113       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
8114 #ifndef FORCE32
8115       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
8116       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
8117       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
8118 #endif
8119 #ifdef PCSX
8120       case 0x12: strcpy(insn[i],"COP2"); type=NI;
8121         op2=(source[i]>>21)&0x1f;
8122         switch(op2)
8123         {
8124           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
8125           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
8126           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
8127           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
8128           default:
8129             if (gte_handlers[source[i]&0x3f]!=NULL) {
8130               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
8131               type=C2OP;
8132             }
8133             break;
8134         }
8135         break;
8136       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
8137       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
8138       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
8139 #endif
8140       default: strcpy(insn[i],"???"); type=NI;
8141         printf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
8142         break;
8143     }
8144     itype[i]=type;
8145     opcode2[i]=op2;
8146     /* Get registers/immediates */
8147     lt1[i]=0;
8148     us1[i]=0;
8149     us2[i]=0;
8150     dep1[i]=0;
8151     dep2[i]=0;
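    // Decoded per-instruction metadata: rs1/rs2 are the MIPS registers read
    // and rt1/rt2 the registers written by instruction i (0 = none; internal
    // pseudo-regs such as CCREG/CSREG/FSREG/HIREG/LOREG are also used).
    // us1/us2 flag sources whose upper 32 bits matter, dep1/dep2 flag sources
    // whose upper half the result's upper half depends on, and imm[] holds
    // the decoded immediate.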
8152     switch(type) {
8153       case LOAD:
8154         rs1[i]=(source[i]>>21)&0x1f;
8155         rs2[i]=0;
8156         rt1[i]=(source[i]>>16)&0x1f;
8157         rt2[i]=0;
8158         imm[i]=(short)source[i];
8159         break;
8160       case STORE:
8161       case STORELR:
8162         rs1[i]=(source[i]>>21)&0x1f;
8163         rs2[i]=(source[i]>>16)&0x1f;
8164         rt1[i]=0;
8165         rt2[i]=0;
8166         imm[i]=(short)source[i];
8167         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
8168         break;
8169       case LOADLR:
8170         // LWL/LWR only load part of the register,
8171         // therefore the target register must be treated as a source too
8172         rs1[i]=(source[i]>>21)&0x1f;
8173         rs2[i]=(source[i]>>16)&0x1f;
8174         rt1[i]=(source[i]>>16)&0x1f;
8175         rt2[i]=0;
8176         imm[i]=(short)source[i];
8177         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
8178         if(op==0x26) dep1[i]=rt1[i]; // LWR
8179         break;
8180       case IMM16:
8181         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
8182         else rs1[i]=(source[i]>>21)&0x1f;
8183         rs2[i]=0;
8184         rt1[i]=(source[i]>>16)&0x1f;
8185         rt2[i]=0;
8186         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
8187           imm[i]=(unsigned short)source[i];
8188         }else{
8189           imm[i]=(short)source[i];
8190         }
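        // Note the casts above: ANDI/ORI/XORI zero-extend their 16-bit
        // immediate while the other I-type ops sign-extend it, e.g.
        // ADDIU r1,r0,0xffff yields 0xffffffff but ORI r1,r0,0xffff
        // yields 0x0000ffff.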
8191         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
8192         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
8193         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
8194         break;
8195       case UJUMP:
8196         rs1[i]=0;
8197         rs2[i]=0;
8198         rt1[i]=0;
8199         rt2[i]=0;
8200         // The JAL instruction writes to r31.
8201         if (op&1) {
8202           rt1[i]=31;
8203         }
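        // CCREG is the recompiler's cycle-count pseudo-register; listing it
        // as a source makes the allocator treat the cycle counter as live
        // at the branch.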
8204         rs2[i]=CCREG;
8205         break;
8206       case RJUMP:
8207         rs1[i]=(source[i]>>21)&0x1f;
8208         rs2[i]=0;
8209         rt1[i]=0;
8210         rt2[i]=0;
8211         // The JALR instruction writes to rd.
8212         if (op2&1) {
8213           rt1[i]=(source[i]>>11)&0x1f;
8214         }
8215         rs2[i]=CCREG;
8216         break;
8217       case CJUMP:
8218         rs1[i]=(source[i]>>21)&0x1f;
8219         rs2[i]=(source[i]>>16)&0x1f;
8220         rt1[i]=0;
8221         rt2[i]=0;
8222         if(op&2) { // BGTZ/BLEZ
8223           rs2[i]=0;
8224         }
8225         us1[i]=rs1[i];
8226         us2[i]=rs2[i];
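        // "Likely" branches (opcode bit 4 set: BEQL/BNEL/BLEZL/BGTZL) nullify
        // their delay slot when not taken; likely[] drives the lazy delay
        // slot allocation in pass 3 below.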
8227         likely[i]=op>>4;
8228         break;
8229       case SJUMP:
8230         rs1[i]=(source[i]>>21)&0x1f;
8231         rs2[i]=CCREG;
8232         rt1[i]=0;
8233         rt2[i]=0;
8234         us1[i]=rs1[i];
8235         if(op2&0x10) { // BxxAL
8236           rt1[i]=31;
8237           // NOTE: If the branch is not taken, r31 is still overwritten
8238         }
8239         likely[i]=(op2&2)>>1;
8240         break;
8241       case FJUMP:
8242         rs1[i]=FSREG;
8243         rs2[i]=CSREG;
8244         rt1[i]=0;
8245         rt2[i]=0;
8246         likely[i]=((source[i])>>17)&1;
8247         break;
8248       case ALU:
8249         rs1[i]=(source[i]>>21)&0x1f; // source
8250         rs2[i]=(source[i]>>16)&0x1f; // second operand (subtrahend for SUB/SUBU)
8251         rt1[i]=(source[i]>>11)&0x1f; // destination
8252         rt2[i]=0;
8253         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
8254           us1[i]=rs1[i];us2[i]=rs2[i];
8255         }
8256         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
8257           dep1[i]=rs1[i];dep2[i]=rs2[i];
8258         }
8259         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
8260           dep1[i]=rs1[i];dep2[i]=rs2[i];
8261         }
8262         break;
8263       case MULTDIV:
8264         rs1[i]=(source[i]>>21)&0x1f; // source
8265         rs2[i]=(source[i]>>16)&0x1f; // divisor
8266         rt1[i]=HIREG;
8267         rt2[i]=LOREG;
8268         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
8269           us1[i]=rs1[i];us2[i]=rs2[i];
8270         }
8271         break;
8272       case MOV:
8273         rs1[i]=0;
8274         rs2[i]=0;
8275         rt1[i]=0;
8276         rt2[i]=0;
8277         if(op2==0x10) rs1[i]=HIREG; // MFHI
8278         if(op2==0x11) rt1[i]=HIREG; // MTHI
8279         if(op2==0x12) rs1[i]=LOREG; // MFLO
8280         if(op2==0x13) rt1[i]=LOREG; // MTLO
8281         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
8282         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
8283         dep1[i]=rs1[i];
8284         break;
8285       case SHIFT:
8286         rs1[i]=(source[i]>>16)&0x1f; // target of shift
8287         rs2[i]=(source[i]>>21)&0x1f; // shift amount
8288         rt1[i]=(source[i]>>11)&0x1f; // destination
8289         rt2[i]=0;
8290         // DSLLV/DSRLV/DSRAV are 64-bit
8291         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
8292         break;
8293       case SHIFTIMM:
8294         rs1[i]=(source[i]>>16)&0x1f;
8295         rs2[i]=0;
8296         rt1[i]=(source[i]>>11)&0x1f;
8297         rt2[i]=0;
8298         imm[i]=(source[i]>>6)&0x1f;
8299         // DSxx32 instructions
8300         if(op2>=0x3c) imm[i]|=0x20;
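        // (the 32-suffixed variants shift by sa+32, so fold the extra bit
        // into the immediate)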
8301         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
8302         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
8303         break;
8304       case COP0:
8305         rs1[i]=0;
8306         rs2[i]=0;
8307         rt1[i]=0;
8308         rt2[i]=0;
8309         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
8310         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
8311         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
8312         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
8313         break;
8314       case COP1:
8315       case COP2:
8316         rs1[i]=0;
8317         rs2[i]=0;
8318         rt1[i]=0;
8319         rt2[i]=0;
8320         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
8321         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
8322         if(op2==5) us1[i]=rs1[i]; // DMTC1
8323         rs2[i]=CSREG;
8324         break;
8325       case C1LS:
8326         rs1[i]=(source[i]>>21)&0x1F;
8327         rs2[i]=CSREG;
8328         rt1[i]=0;
8329         rt2[i]=0;
8330         imm[i]=(short)source[i];
8331         break;
8332       case C2LS:
8333         rs1[i]=(source[i]>>21)&0x1F;
8334         rs2[i]=0;
8335         rt1[i]=0;
8336         rt2[i]=0;
8337         imm[i]=(short)source[i];
8338         break;
8339       case FLOAT:
8340       case FCONV:
8341         rs1[i]=0;
8342         rs2[i]=CSREG;
8343         rt1[i]=0;
8344         rt2[i]=0;
8345         break;
8346       case FCOMP:
8347         rs1[i]=FSREG;
8348         rs2[i]=CSREG;
8349         rt1[i]=FSREG;
8350         rt2[i]=0;
8351         break;
8352       case SYSCALL:
8353       case HLECALL:
8354         rs1[i]=CCREG;
8355         rs2[i]=0;
8356         rt1[i]=0;
8357         rt2[i]=0;
8358         break;
8359       default:
8360         rs1[i]=0;
8361         rs2[i]=0;
8362         rt1[i]=0;
8363         rt2[i]=0;
8364     }
8365     /* Calculate branch target addresses */
8366     if(type==UJUMP)
8367       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
8368     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
8369       ba[i]=start+i*4+8; // Ignore never-taken branch
8370     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
8371       ba[i]=start+i*4+8; // Ignore never-taken branch
8372     else if(type==CJUMP||type==SJUMP||type==FJUMP)
8373       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
8374     else ba[i]=-1;
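    // Worked example of the target math above (addresses are illustrative):
    // a J with instr_index 0x0100 and its delay slot at 0x80010004 targets
    // (0x80010004&0xF0000000)|(0x0100<<2) = 0x80000400, while BEQ-style
    // branches add the sign-extended 16-bit offset*4 to the delay-slot PC
    // (start+i*4+4).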
8375     /* Is this the end of the block? */
8376     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
8377       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
8378         done=1;
8379         // Does the block continue due to a branch?
8380         for(j=i-1;j>=0;j--)
8381         {
8382           if(ba[j]==start+i*4+4) done=j=0;
8383           if(ba[j]==start+i*4+8) done=j=0;
8384         }
8385       }
8386       else {
8387         if(stop_after_jal) done=1;
8388         // Stop on BREAK
8389         if((source[i+1]&0xfc00003f)==0x0d) done=1;
8390       }
8391       // Don't recompile stuff that's already compiled
8392       if(check_addr(start+i*4+4)) done=1;
8393       // Don't get too close to the limit
8394       if(i>MAXBLOCK/2) done=1;
8395     }
8396     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
8397     if(itype[i]==HLECALL) done=1;
8398     //assert(i<MAXBLOCK-1);
8399     if(start+i*4==pagelimit-4) done=1;
8400     assert(start+i*4<pagelimit);
8401     if (i==MAXBLOCK-1) done=1;
8402     // Stop if we're compiling junk
8403     if(itype[i]==NI&&opcode[i]==0x11) {
8404       done=stop_after_jal=1;
8405       printf("Disabled speculative precompilation\n");
8406     }
8407   }
8408   slen=i;
8409   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
8410     if(start+i*4==pagelimit) {
8411       itype[i-1]=SPAN;
8412     }
8413   }
8414   assert(slen>0);
8415
8416   /* Pass 2 - Register dependencies and branch targets */
8417
8418   unneeded_registers(0,slen-1,0);
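  // (Liveness analysis: fills unneeded_reg[]/unneeded_reg_upper[] with
  // bitmasks of GPRs whose low/high 32 bits are not needed past each
  // instruction; pass 3 uses this to avoid allocating or writing back
  // dead values.)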
8419   
8420   /* Pass 3 - Register allocation */
8421
8422   struct regstat current; // Current register allocations/status
8423   current.is32=1;
8424   current.dirty=0;
8425   current.u=unneeded_reg[0];
8426   current.uu=unneeded_reg_upper[0];
8427   clear_all_regs(current.regmap);
8428   alloc_reg(&current,0,CCREG);
8429   dirty_reg(&current,CCREG);
8430   current.isconst=0;
8431   current.wasconst=0;
8432   int ds=0;
8433   int cc=0;
8434   int hr;
8435   
8436   provisional_32bit();
8437   
8438   if((u_int)addr&1) {
8439     // First instruction is delay slot
8440     cc=-1;
8441     bt[1]=1;
8442     ds=1;
8443     unneeded_reg[0]=1;
8444     unneeded_reg_upper[0]=1;
8445     current.regmap[HOST_BTREG]=BTREG;
8446   }
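  // (A block entry address with bit 0 set marks a lone delay slot; the
  // branch that owns it presumably passes its target in BTREG, so pin
  // HOST_BTREG to it here.)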
8447   
8448   for(i=0;i<slen;i++)
8449   {
8450     if(bt[i])
8451     {
8452       int hr;
8453       for(hr=0;hr<HOST_REGS;hr++)
8454       {
8455         // Is this really necessary?
8456         if(current.regmap[hr]==0) current.regmap[hr]=-1;
8457       }
8458       current.isconst=0;
8459     }
8460     if(i>1)
8461     {
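      // Falling through a BNE/BNEL that compared a register against $zero
      // means that register was zero, so it is provably 32-bit: mark it and
      // drop any upper-half host register it occupied.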
8462       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
8463       {
8464         if(rs1[i-2]==0||rs2[i-2]==0)
8465         {
8466           if(rs1[i-2]) {
8467             current.is32|=1LL<<rs1[i-2];
8468             int hr=get_reg(current.regmap,rs1[i-2]|64);
8469             if(hr>=0) current.regmap[hr]=-1;
8470           }
8471           if(rs2[i-2]) {
8472             current.is32|=1LL<<rs2[i-2];
8473             int hr=get_reg(current.regmap,rs2[i-2]|64);
8474             if(hr>=0) current.regmap[hr]=-1;
8475           }
8476         }
8477       }
8478     }
8479     // If something jumps here with 64-bit values
8480     // then promote those registers to 64 bits
8481     if(bt[i])
8482     {
8483       uint64_t temp_is32=current.is32;
8484       for(j=i-1;j>=0;j--)
8485       {
8486         if(ba[j]==start+i*4) 
8487           temp_is32&=branch_regs[j].is32;
8488       }
8489       for(j=i;j<slen;j++)
8490       {
8491         if(ba[j]==start+i*4) 
8492           //temp_is32=1;
8493           temp_is32&=p32[j];
8494       }
8495       if(temp_is32!=current.is32) {
8496         //printf("dumping 32-bit regs (%x)\n",start+i*4);
8497         #ifdef DESTRUCTIVE_WRITEBACK
8498         for(hr=0;hr<HOST_REGS;hr++)
8499         {
8500           int r=current.regmap[hr];
8501           if(r>0&&r<64)
8502           {
8503             if((current.dirty>>hr)&((current.is32&~temp_is32)>>r)&1) {
8504               temp_is32|=1LL<<r;
8505               //printf("restore %d\n",r);
8506             }
8507           }
8508         }
8509         #endif
8510         current.is32=temp_is32;
8511       }
8512     }
8513 #ifdef FORCE32
8514     memset(p32, 0xff, sizeof(p32));
8515     current.is32=-1LL;
8516 #endif
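    // With FORCE32 (32-bit-only target, i.e. the PSX's R3000A here) every
    // GPR is 32 bits, so the 32-bit-ness tracking above is short-circuited
    // to "everything is 32-bit".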
8517
8518     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
8519     regs[i].wasconst=current.isconst;
8520     regs[i].was32=current.is32;
8521     regs[i].wasdirty=current.dirty;
8522     #ifdef DESTRUCTIVE_WRITEBACK
8523     // To change a dirty register from 32 to 64 bits, we must write
8524     // it out during the previous cycle (for branches, 2 cycles)
8525     if(i<slen-1&&bt[i+1]&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP)
8526     {
8527       uint64_t temp_is32=current.is32;
8528       for(j=i-1;j>=0;j--)
8529       {
8530         if(ba[j]==start+i*4+4) 
8531           temp_is32&=branch_regs[j].is32;
8532       }
8533       for(j=i;j<slen;j++)
8534       {
8535         if(ba[j]==start+i*4+4) 
8536           //temp_is32=1;
8537           temp_is32&=p32[j];
8538       }
8539       if(temp_is32!=current.is32) {
8540         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8541         for(hr=0;hr<HOST_REGS;hr++)
8542         {
8543           int r=current.regmap[hr];
8544           if(r>0)
8545           {
8546             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8547               if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP)
8548               {
8549                 if(rs1[i]!=(r&63)&&rs2[i]!=(r&63))
8550                 {
8551                   //printf("dump %d/r%d\n",hr,r);
8552                   current.regmap[hr]=-1;
8553                   if(get_reg(current.regmap,r|64)>=0) 
8554                     current.regmap[get_reg(current.regmap,r|64)]=-1;
8555                 }
8556               }
8557             }
8558           }
8559         }
8560       }
8561     }
8562     else if(i<slen-2&&bt[i+2]&&(source[i-1]>>16)!=0x1000&&(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP))
8563     {
8564       uint64_t temp_is32=current.is32;
8565       for(j=i-1;j>=0;j--)
8566       {
8567         if(ba[j]==start+i*4+8) 
8568           temp_is32&=branch_regs[j].is32;
8569       }
8570       for(j=i;j<slen;j++)
8571       {
8572         if(ba[j]==start+i*4+8) 
8573           //temp_is32=1;
8574           temp_is32&=p32[j];
8575       }
8576       if(temp_is32!=current.is32) {
8577         //printf("pre-dumping 32-bit regs (%x)\n",start+i*4);
8578         for(hr=0;hr<HOST_REGS;hr++)
8579         {
8580           int r=current.regmap[hr];
8581           if(r>0)
8582           {
8583             if((current.dirty>>hr)&((current.is32&~temp_is32)>>(r&63))&1) {
8584               if(rs1[i]!=(r&63)&&rs2[i]!=(r&63)&&rs1[i+1]!=(r&63)&&rs2[i+1]!=(r&63))
8585               {
8586                 //printf("dump %d/r%d\n",hr,r);
8587                 current.regmap[hr]=-1;
8588                 if(get_reg(current.regmap,r|64)>=0) 
8589                   current.regmap[get_reg(current.regmap,r|64)]=-1;
8590               }
8591             }
8592           }
8593         }
8594       }
8595     }
8596     #endif
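    // current.u / current.uu: bit r set means the low / high half of GPR r
    // is dead after this instruction.  Bit 0 is always set since $zero never
    // needs to be preserved.  For branches, the delay slot's sources are
    // folded in as well.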
8597     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8598       if(i+1<slen) {
8599         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8600         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8601         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8602         current.u|=1;
8603         current.uu|=1;
8604       } else {
8605         current.u=1;
8606         current.uu=1;
8607       }
8608     } else {
8609       if(i+1<slen) {
8610         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
8611         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8612         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8613         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8614         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8615         current.u|=1;
8616         current.uu|=1;
8617       } else { printf("oops, branch at end of block with no delay slot\n");exit(1); }
8618     }
8619     is_ds[i]=ds;
8620     if(ds) {
8621       ds=0; // Skip delay slot, already allocated as part of branch
8622       // ...but we need to alloc it in case something jumps here
8623       if(i+1<slen) {
8624         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
8625         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
8626       }else{
8627         current.u=branch_unneeded_reg[i-1];
8628         current.uu=branch_unneeded_reg_upper[i-1];
8629       }
8630       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
8631       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8632       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8633       current.u|=1;
8634       current.uu|=1;
8635       struct regstat temp;
8636       memcpy(&temp,&current,sizeof(current));
8637       temp.wasdirty=temp.dirty;
8638       temp.was32=temp.is32;
8639       // TODO: Take into account unconditional branches, as below
8640       delayslot_alloc(&temp,i);
8641       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
8642       regs[i].wasdirty=temp.wasdirty;
8643       regs[i].was32=temp.was32;
8644       regs[i].dirty=temp.dirty;
8645       regs[i].is32=temp.is32;
8646       regs[i].isconst=0;
8647       regs[i].wasconst=0;
8648       current.isconst=0;
8649       // Create entry (branch target) regmap
8650       for(hr=0;hr<HOST_REGS;hr++)
8651       {
8652         int r=temp.regmap[hr];
8653         if(r>=0) {
8654           if(r!=regmap_pre[i][hr]) {
8655             regs[i].regmap_entry[hr]=-1;
8656           }
8657           else
8658           {
8659             if(r<64){
8660               if((current.u>>r)&1) {
8661                 regs[i].regmap_entry[hr]=-1;
8662                 regs[i].regmap[hr]=-1;
8663                 // Don't clear regs in the delay slot as the branch might need them
8664                 //current.regmap[hr]=-1;
8665               }else
8666                 regs[i].regmap_entry[hr]=r;
8667             }
8668             else {
8669               if((current.uu>>(r&63))&1) {
8670                 regs[i].regmap_entry[hr]=-1;
8671                 regs[i].regmap[hr]=-1;
8672                 // Don't clear regs in the delay slot as the branch might need them
8673                 //current.regmap[hr]=-1;
8674               }else
8675                 regs[i].regmap_entry[hr]=r;
8676             }
8677           }
8678         } else {
8679           // First instruction expects CCREG to be allocated
8680           if(i==0&&hr==HOST_CCREG) 
8681             regs[i].regmap_entry[hr]=CCREG;
8682           else
8683             regs[i].regmap_entry[hr]=-1;
8684         }
8685       }
8686     }
8687     else { // Not delay slot
8688       switch(itype[i]) {
8689         case UJUMP:
8690           //current.isconst=0; // DEBUG
8691           //current.wasconst=0; // DEBUG
8692           //regs[i].wasconst=0; // DEBUG
8693           clear_const(&current,rt1[i]);
8694           alloc_cc(&current,i);
8695           dirty_reg(&current,CCREG);
8696           if (rt1[i]==31) {
8697             alloc_reg(&current,i,31);
8698             dirty_reg(&current,31);
8699             assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8700             #ifdef REG_PREFETCH
8701             alloc_reg(&current,i,PTEMP);
8702             #endif
8703             //current.is32|=1LL<<rt1[i];
8704           }
8705           delayslot_alloc(&current,i+1);
8706           //current.isconst=0; // DEBUG
8707           ds=1;
8708           //printf("i=%d, isconst=%x\n",i,current.isconst);
8709           break;
8710         case RJUMP:
8711           //current.isconst=0;
8712           //current.wasconst=0;
8713           //regs[i].wasconst=0;
8714           clear_const(&current,rs1[i]);
8715           clear_const(&current,rt1[i]);
8716           alloc_cc(&current,i);
8717           dirty_reg(&current,CCREG);
8718           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
8719             alloc_reg(&current,i,rs1[i]);
8720             if (rt1[i]!=0) {
8721               alloc_reg(&current,i,rt1[i]);
8722               dirty_reg(&current,rt1[i]);
8723               assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8724               #ifdef REG_PREFETCH
8725               alloc_reg(&current,i,PTEMP);
8726               #endif
8727             }
8728             #ifdef USE_MINI_HT
8729             if(rs1[i]==31) { // JALR
8730               alloc_reg(&current,i,RHASH);
8731               #ifndef HOST_IMM_ADDR32
8732               alloc_reg(&current,i,RHTBL);
8733               #endif
8734             }
8735             #endif
8736             delayslot_alloc(&current,i+1);
8737           } else {
8738             // The delay slot overwrites our source register, so
8739             // allocate a temporary register to hold the old value.
8740             current.isconst=0;
8741             current.wasconst=0;
8742             regs[i].wasconst=0;
8743             delayslot_alloc(&current,i+1);
8744             current.isconst=0;
8745             alloc_reg(&current,i,RTEMP);
8746           }
8747           //current.isconst=0; // DEBUG
8748           ds=1;
8749           break;
8750         case CJUMP:
8751           //current.isconst=0;
8752           //current.wasconst=0;
8753           //regs[i].wasconst=0;
8754           clear_const(&current,rs1[i]);
8755           clear_const(&current,rs2[i]);
8756           if((opcode[i]&0x3E)==4) // BEQ/BNE
8757           {
8758             alloc_cc(&current,i);
8759             dirty_reg(&current,CCREG);
8760             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8761             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8762             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8763             {
8764               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8765               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8766             }
8767             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
8768                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
8769               // The delay slot overwrites one of our conditions.
8770               // Allocate the branch condition registers instead.
8771               // Note that such a sequence of instructions could
8772               // be considered a bug since the branch cannot be
8773               // re-executed if an exception occurs.
8774               current.isconst=0;
8775               current.wasconst=0;
8776               regs[i].wasconst=0;
8777               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8778               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
8779               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8780               {
8781                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8782                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
8783               }
8784             }
8785             else delayslot_alloc(&current,i+1);
8786           }
8787           else
8788           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
8789           {
8790             alloc_cc(&current,i);
8791             dirty_reg(&current,CCREG);
8792             alloc_reg(&current,i,rs1[i]);
8793             if(!(current.is32>>rs1[i]&1))
8794             {
8795               alloc_reg64(&current,i,rs1[i]);
8796             }
8797             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8798               // The delay slot overwrites one of our conditions.
8799               // Allocate the branch condition registers instead.
8800               // Note that such a sequence of instructions could
8801               // be considered a bug since the branch cannot be
8802               // re-executed if an exception occurs.
8803               current.isconst=0;
8804               current.wasconst=0;
8805               regs[i].wasconst=0;
8806               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8807               if(!((current.is32>>rs1[i])&1))
8808               {
8809                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8810               }
8811             }
8812             else delayslot_alloc(&current,i+1);
8813           }
8814           else
8815           // Don't alloc the delay slot yet because we might not execute it
8816           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
8817           {
8818             current.isconst=0;
8819             current.wasconst=0;
8820             regs[i].wasconst=0;
8821             alloc_cc(&current,i);
8822             dirty_reg(&current,CCREG);
8823             alloc_reg(&current,i,rs1[i]);
8824             alloc_reg(&current,i,rs2[i]);
8825             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
8826             {
8827               alloc_reg64(&current,i,rs1[i]);
8828               alloc_reg64(&current,i,rs2[i]);
8829             }
8830           }
8831           else
8832           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
8833           {
8834             current.isconst=0;
8835             current.wasconst=0;
8836             regs[i].wasconst=0;
8837             alloc_cc(&current,i);
8838             dirty_reg(&current,CCREG);
8839             alloc_reg(&current,i,rs1[i]);
8840             if(!(current.is32>>rs1[i]&1))
8841             {
8842               alloc_reg64(&current,i,rs1[i]);
8843             }
8844           }
8845           ds=1;
8846           //current.isconst=0;
8847           break;
8848         case SJUMP:
8849           //current.isconst=0;
8850           //current.wasconst=0;
8851           //regs[i].wasconst=0;
8852           clear_const(&current,rs1[i]);
8853           clear_const(&current,rt1[i]);
8854           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
8855           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
8856           {
8857             alloc_cc(&current,i);
8858             dirty_reg(&current,CCREG);
8859             alloc_reg(&current,i,rs1[i]);
8860             if(!(current.is32>>rs1[i]&1))
8861             {
8862               alloc_reg64(&current,i,rs1[i]);
8863             }
8864             if (rt1[i]==31) { // BLTZAL/BGEZAL
8865               alloc_reg(&current,i,31);
8866               dirty_reg(&current,31);
8867               assert(rs1[i+1]!=31&&rs2[i+1]!=31);
8868               //#ifdef REG_PREFETCH
8869               //alloc_reg(&current,i,PTEMP);
8870               //#endif
8871               //current.is32|=1LL<<rt1[i];
8872             }
8873             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
8874               // The delay slot overwrites the branch condition.
8875               // Allocate the branch condition registers instead.
8876               // Note that such a sequence of instructions could
8877               // be considered a bug since the branch cannot be
8878               // re-executed if an exception occurs.
8879               current.isconst=0;
8880               current.wasconst=0;
8881               regs[i].wasconst=0;
8882               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
8883               if(!((current.is32>>rs1[i])&1))
8884               {
8885                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
8886               }
8887             }
8888             else delayslot_alloc(&current,i+1);
8889           }
8890           else
8891           // Don't alloc the delay slot yet because we might not execute it
8892           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
8893           {
8894             current.isconst=0;
8895             current.wasconst=0;
8896             regs[i].wasconst=0;
8897             alloc_cc(&current,i);
8898             dirty_reg(&current,CCREG);
8899             alloc_reg(&current,i,rs1[i]);
8900             if(!(current.is32>>rs1[i]&1))
8901             {
8902               alloc_reg64(&current,i,rs1[i]);
8903             }
8904           }
8905           ds=1;
8906           //current.isconst=0;
8907           break;
8908         case FJUMP:
8909           current.isconst=0;
8910           current.wasconst=0;
8911           regs[i].wasconst=0;
8912           if(likely[i]==0) // BC1F/BC1T
8913           {
8914             // TODO: Theoretically we can run out of registers here on x86.
8915             // The delay slot can allocate up to six, and we need to check
8916             // CSREG before executing the delay slot.  Possibly we can drop
8917             // the cycle count and then reload it after checking that the
8918             // FPU is in a usable state, or don't do out-of-order execution.
8919             alloc_cc(&current,i);
8920             dirty_reg(&current,CCREG);
8921             alloc_reg(&current,i,FSREG);
8922             alloc_reg(&current,i,CSREG);
8923             if(itype[i+1]==FCOMP) {
8924               // The delay slot overwrites the branch condition.
8925               // Allocate the branch condition registers instead.
8926               // Note that such a sequence of instructions could
8927               // be considered a bug since the branch cannot be
8928               // re-executed if an exception occurs.
8929               alloc_cc(&current,i);
8930               dirty_reg(&current,CCREG);
8931               alloc_reg(&current,i,CSREG);
8932               alloc_reg(&current,i,FSREG);
8933             }
8934             else {
8935               delayslot_alloc(&current,i+1);
8936               alloc_reg(&current,i+1,CSREG);
8937             }
8938           }
8939           else
8940           // Don't alloc the delay slot yet because we might not execute it
8941           if(likely[i]) // BC1FL/BC1TL
8942           {
8943             alloc_cc(&current,i);
8944             dirty_reg(&current,CCREG);
8945             alloc_reg(&current,i,CSREG);
8946             alloc_reg(&current,i,FSREG);
8947           }
8948           ds=1;
8949           current.isconst=0;
8950           break;
8951         case IMM16:
8952           imm16_alloc(&current,i);
8953           break;
8954         case LOAD:
8955         case LOADLR:
8956           load_alloc(&current,i);
8957           break;
8958         case STORE:
8959         case STORELR:
8960           store_alloc(&current,i);
8961           break;
8962         case ALU:
8963           alu_alloc(&current,i);
8964           break;
8965         case SHIFT:
8966           shift_alloc(&current,i);
8967           break;
8968         case MULTDIV:
8969           multdiv_alloc(&current,i);
8970           break;
8971         case SHIFTIMM:
8972           shiftimm_alloc(&current,i);
8973           break;
8974         case MOV:
8975           mov_alloc(&current,i);
8976           break;
8977         case COP0:
8978           cop0_alloc(&current,i);
8979           break;
8980         case COP1:
8981         case COP2:
8982           cop1_alloc(&current,i);
8983           break;
8984         case C1LS:
8985           c1ls_alloc(&current,i);
8986           break;
8987         case C2LS:
8988           c2ls_alloc(&current,i);
8989           break;
8990         case C2OP:
8991           c2op_alloc(&current,i);
8992           break;
8993         case FCONV:
8994           fconv_alloc(&current,i);
8995           break;
8996         case FLOAT:
8997           float_alloc(&current,i);
8998           break;
8999         case FCOMP:
9000           fcomp_alloc(&current,i);
9001           break;
9002         case SYSCALL:
9003         case HLECALL:
9004           syscall_alloc(&current,i);
9005           break;
9006         case SPAN:
9007           pagespan_alloc(&current,i);
9008           break;
9009       }
9010       
9011       // Drop the upper half of registers that have become 32-bit
9012       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
9013       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
9014         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9015         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9016         current.uu|=1;
9017       } else {
9018         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
9019         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
9020         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
9021         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
9022         current.uu|=1;
9023       }
9024
9025       // Create entry (branch target) regmap
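      // regmap_entry[] is the host-register mapping that code jumping to this
      // instruction must provide; regmap[] is the mapping in effect once this
      // instruction has been allocated.  Slots that moved to a different
      // register or that hold a dead value are marked -1 (unmapped).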
9026       for(hr=0;hr<HOST_REGS;hr++)
9027       {
9028         int r,or,er;
9029         r=current.regmap[hr];
9030         if(r>=0) {
9031           if(r!=regmap_pre[i][hr]) {
9032             // TODO: delay slot (?)
9033             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
9034             if(or<0||(r&63)>=TEMPREG){
9035               regs[i].regmap_entry[hr]=-1;
9036             }
9037             else
9038             {
9039               // Just move it to a different register
9040               regs[i].regmap_entry[hr]=r;
9041               // If it was dirty before, it's still dirty
9042               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
9043             }
9044           }
9045           else
9046           {
9047             // Unneeded
9048             if(r==0){
9049               regs[i].regmap_entry[hr]=0;
9050             }
9051             else
9052             if(r<64){
9053               if((current.u>>r)&1) {
9054                 regs[i].regmap_entry[hr]=-1;
9055                 //regs[i].regmap[hr]=-1;
9056                 current.regmap[hr]=-1;
9057               }else
9058                 regs[i].regmap_entry[hr]=r;
9059             }
9060             else {
9061               if((current.uu>>(r&63))&1) {
9062                 regs[i].regmap_entry[hr]=-1;
9063                 //regs[i].regmap[hr]=-1;
9064                 current.regmap[hr]=-1;
9065               }else
9066                 regs[i].regmap_entry[hr]=r;
9067             }
9068           }
9069         } else {
9070           // Branches expect CCREG to be allocated at the target
9071           if(regmap_pre[i][hr]==CCREG) 
9072             regs[i].regmap_entry[hr]=CCREG;
9073           else
9074             regs[i].regmap_entry[hr]=-1;
9075         }
9076       }
9077       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
9078     }
9079     /* Branch post-alloc */
9080     if(i>0)
9081     {
9082       current.was32=current.is32;
9083       current.wasdirty=current.dirty;
9084       switch(itype[i-1]) {
9085         case UJUMP:
9086           memcpy(&branch_regs[i-1],&current,sizeof(current));
9087           branch_regs[i-1].isconst=0;
9088           branch_regs[i-1].wasconst=0;
9089           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9090           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9091           alloc_cc(&branch_regs[i-1],i-1);
9092           dirty_reg(&branch_regs[i-1],CCREG);
9093           if(rt1[i-1]==31) { // JAL
9094             alloc_reg(&branch_regs[i-1],i-1,31);
9095             dirty_reg(&branch_regs[i-1],31);
9096             branch_regs[i-1].is32|=1LL<<31;
9097           }
9098           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9099           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9100           break;
9101         case RJUMP:
9102           memcpy(&branch_regs[i-1],&current,sizeof(current));
9103           branch_regs[i-1].isconst=0;
9104           branch_regs[i-1].wasconst=0;
9105           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9106           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9107           alloc_cc(&branch_regs[i-1],i-1);
9108           dirty_reg(&branch_regs[i-1],CCREG);
9109           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
9110           if(rt1[i-1]!=0) { // JALR
9111             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
9112             dirty_reg(&branch_regs[i-1],rt1[i-1]);
9113             branch_regs[i-1].is32|=1LL<<rt1[i-1];
9114           }
9115           #ifdef USE_MINI_HT
9116           if(rs1[i-1]==31) { // JALR
9117             alloc_reg(&branch_regs[i-1],i-1,RHASH);
9118             #ifndef HOST_IMM_ADDR32
9119             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
9120             #endif
9121           }
9122           #endif
9123           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9124           memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9125           break;
9126         case CJUMP:
9127           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
9128           {
9129             alloc_cc(&current,i-1);
9130             dirty_reg(&current,CCREG);
9131             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
9132                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
9133               // The delay slot overwrote one of our conditions
9134               // Delay slot goes after the test (in order)
9135               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9136               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9137               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9138               current.u|=1;
9139               current.uu|=1;
9140               delayslot_alloc(&current,i);
9141               current.isconst=0;
9142             }
9143             else
9144             {
9145               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
9146               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
9147               // Alloc the branch condition registers
9148               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
9149               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
9150               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
9151               {
9152                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
9153                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
9154               }
9155             }
9156             memcpy(&branch_regs[i-1],&current,sizeof(current));
9157             branch_regs[i-1].isconst=0;
9158             branch_regs[i-1].wasconst=0;
9159             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9160             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9161           }
9162           else
9163           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
9164           {
9165             alloc_cc(&current,i-1);
9166             dirty_reg(&current,CCREG);
9167             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9168               // The delay slot overwrote the branch condition
9169               // Delay slot goes after the test (in order)
9170               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9171               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9172               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9173               current.u|=1;
9174               current.uu|=1;
9175               delayslot_alloc(&current,i);
9176               current.isconst=0;
9177             }
9178             else
9179             {
9180               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9181               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9182               // Alloc the branch condition register
9183               alloc_reg(&current,i-1,rs1[i-1]);
9184               if(!(current.is32>>rs1[i-1]&1))
9185               {
9186                 alloc_reg64(&current,i-1,rs1[i-1]);
9187               }
9188             }
9189             memcpy(&branch_regs[i-1],&current,sizeof(current));
9190             branch_regs[i-1].isconst=0;
9191             branch_regs[i-1].wasconst=0;
9192             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9193             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9194           }
9195           else
9196           // Alloc the delay slot in case the branch is taken
9197           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
9198           {
9199             memcpy(&branch_regs[i-1],&current,sizeof(current));
9200             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9201             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9202             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9203             alloc_cc(&branch_regs[i-1],i);
9204             dirty_reg(&branch_regs[i-1],CCREG);
9205             delayslot_alloc(&branch_regs[i-1],i);
9206             branch_regs[i-1].isconst=0;
9207             alloc_reg(&current,i,CCREG); // Not taken path
9208             dirty_reg(&current,CCREG);
9209             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9210           }
9211           else
9212           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
9213           {
9214             memcpy(&branch_regs[i-1],&current,sizeof(current));
9215             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9216             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9217             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9218             alloc_cc(&branch_regs[i-1],i);
9219             dirty_reg(&branch_regs[i-1],CCREG);
9220             delayslot_alloc(&branch_regs[i-1],i);
9221             branch_regs[i-1].isconst=0;
9222             alloc_reg(&current,i,CCREG); // Not taken path
9223             dirty_reg(&current,CCREG);
9224             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9225           }
9226           break;
9227         case SJUMP:
9228           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
9229           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
9230           {
9231             alloc_cc(&current,i-1);
9232             dirty_reg(&current,CCREG);
9233             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
9234               // The delay slot overwrote the branch condition
9235               // Delay slot goes after the test (in order)
9236               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
9237               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
9238               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
9239               current.u|=1;
9240               current.uu|=1;
9241               delayslot_alloc(&current,i);
9242               current.isconst=0;
9243             }
9244             else
9245             {
9246               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9247               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9248               // Alloc the branch condition register
9249               alloc_reg(&current,i-1,rs1[i-1]);
9250               if(!(current.is32>>rs1[i-1]&1))
9251               {
9252                 alloc_reg64(&current,i-1,rs1[i-1]);
9253               }
9254             }
9255             memcpy(&branch_regs[i-1],&current,sizeof(current));
9256             branch_regs[i-1].isconst=0;
9257             branch_regs[i-1].wasconst=0;
9258             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9259             memcpy(constmap[i],constmap[i-1],sizeof(current.constmap));
9260           }
9261           else
9262           // Alloc the delay slot in case the branch is taken
9263           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
9264           {
9265             memcpy(&branch_regs[i-1],&current,sizeof(current));
9266             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9267             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9268             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9269             alloc_cc(&branch_regs[i-1],i);
9270             dirty_reg(&branch_regs[i-1],CCREG);
9271             delayslot_alloc(&branch_regs[i-1],i);
9272             branch_regs[i-1].isconst=0;
9273             alloc_reg(&current,i,CCREG); // Not taken path
9274             dirty_reg(&current,CCREG);
9275             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9276           }
9277           // FIXME: BLTZAL/BGEZAL
9278           if(opcode2[i-1]&0x10) { // BxxZAL
9279             alloc_reg(&branch_regs[i-1],i-1,31);
9280             dirty_reg(&branch_regs[i-1],31);
9281             branch_regs[i-1].is32|=1LL<<31;
9282           }
9283           break;
9284         case FJUMP:
9285           if(likely[i-1]==0) // BC1F/BC1T
9286           {
9287             alloc_cc(&current,i-1);
9288             dirty_reg(&current,CCREG);
9289             if(itype[i]==FCOMP) {
9290               // The delay slot overwrote the branch condition
9291               // Delay slot goes after the test (in order)
9292               delayslot_alloc(&current,i);
9293               current.isconst=0;
9294             }
9295             else
9296             {
9297               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
9298               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
9299               // Alloc the branch condition register
9300               alloc_reg(&current,i-1,FSREG);
9301             }
9302             memcpy(&branch_regs[i-1],&current,sizeof(current));
9303             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
9304           }
9305           else // BC1FL/BC1TL
9306           {
9307             // Alloc the delay slot in case the branch is taken
9308             memcpy(&branch_regs[i-1],&current,sizeof(current));
9309             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9310             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
9311             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
9312             alloc_cc(&branch_regs[i-1],i);
9313             dirty_reg(&branch_regs[i-1],CCREG);
9314             delayslot_alloc(&branch_regs[i-1],i);
9315             branch_regs[i-1].isconst=0;
9316             alloc_reg(&current,i,CCREG); // Not taken path
9317             dirty_reg(&current,CCREG);
9318             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
9319           }
9320           break;
9321       }
9322
9323       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
9324       {
9325         if(rt1[i-1]==31) // JAL/JALR
9326         {
9327           // Subroutine call will return here, don't alloc any registers
9328           current.is32=1;
9329           current.dirty=0;
9330           clear_all_regs(current.regmap);
9331           alloc_reg(&current,i,CCREG);
9332           dirty_reg(&current,CCREG);
9333         }
9334         else if(i+1<slen)
9335         {
9336           // Internal branch will jump here, match registers to caller
9337           current.is32=0x3FFFFFFFFLL;
9338           current.dirty=0;
9339           clear_all_regs(current.regmap);
9340           alloc_reg(&current,i,CCREG);
9341           dirty_reg(&current,CCREG);
9342           for(j=i-1;j>=0;j--)
9343           {
9344             if(ba[j]==start+i*4+4) {
9345               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
9346               current.is32=branch_regs[j].is32;
9347               current.dirty=branch_regs[j].dirty;
9348               break;
9349             }
9350           }
9351           while(j>=0) {
9352             if(ba[j]==start+i*4+4) {
9353               for(hr=0;hr<HOST_REGS;hr++) {
9354                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
9355                   current.regmap[hr]=-1;
9356                 }
9357                 current.is32&=branch_regs[j].is32;
9358                 current.dirty&=branch_regs[j].dirty;
9359               }
9360             }
9361             j--;
9362           }
9363         }
9364       }
9365     }
9366
9367     // Count cycles in between branches
9368     ccadj[i]=cc;
9369     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
9370     {
9371       cc=0;
9372     }
9373     else
9374     {
9375       cc++;
9376     }
9377
9378     flush_dirty_uppers(&current);
9379     if(!is_ds[i]) {
9380       regs[i].is32=current.is32;
9381       regs[i].dirty=current.dirty;
9382       regs[i].isconst=current.isconst;
9383       memcpy(constmap[i],current.constmap,sizeof(current.constmap));
9384     }
9385     for(hr=0;hr<HOST_REGS;hr++) {
9386       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
9387         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
9388           regs[i].wasconst&=~(1<<hr);
9389         }
9390       }
9391     }
9392     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
9393   }
9394   
9395   /* Pass 4 - Cull unused host registers */
9396   
9397   uint64_t nr=0;
9398   
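  /* Illustrative sketch (not part of the dynarec, hypothetical names): pass 4
   * walks the block backwards keeping a liveness bitmask with one bit per
   * host register.  A bit is set while the value cached in that host register
   * is still read by a later instruction, and cleared once the value is
   * overwritten or unmapped.
   */
  #if 0
  u_int live=0;                  /* one bit per host register (hypothetical)  */
  int hr_src=1, hr_dst=2;        /* example host register indices             */
  live|=1u<<hr_src;              /* a later instruction reads hr_src          */
  live&=~(1u<<hr_dst);           /* hr_dst gets overwritten, no longer needed */
  #endif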
9399   for (i=slen-1;i>=0;i--)
9400   {
9401     int hr;
9402     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9403     {
9404       if(ba[i]<start || ba[i]>=(start+slen*4))
9405       {
9406         // Branch out of this block, don't need anything
9407         nr=0;
9408       }
9409       else
9410       {
9411         // Internal branch
9412         // Need whatever matches the target
9413         nr=0;
9414         int t=(ba[i]-start)>>2;
9415         for(hr=0;hr<HOST_REGS;hr++)
9416         {
9417           if(regs[i].regmap_entry[hr]>=0) {
9418             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
9419           }
9420         }
9421       }
9422       // Conditional branch may need registers for following instructions
9423       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9424       {
9425         if(i<slen-2) {
9426           nr|=needed_reg[i+2];
9427           for(hr=0;hr<HOST_REGS;hr++)
9428           {
9429             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
9430             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
9431           }
9432         }
9433       }
9434       // Don't need stuff which is overwritten
9435       if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9436       if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
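      // NB: `hr` here still holds whatever value the loops above left in it
      // (HOST_REGS, or indeterminate on the branch-out path), so these two
      // checks appear to operate on a stale host register index.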
9437       // Merge in delay slot
9438       for(hr=0;hr<HOST_REGS;hr++)
9439       {
9440         if(!likely[i]) {
9441           // These are overwritten unless the branch is "likely"
9442           // and the delay slot is nullified if not taken
9443           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9444           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9445         }
9446         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9447         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9448         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9449         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
9450         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9451         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9452         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9453         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9454         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
9455           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9456           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9457         }
9458         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
9459           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9460           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9461         }
9462         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
9463           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9464           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9465         }
9466       }
9467     }
9468     else if(itype[i]==SYSCALL||itype[i]==HLECALL)
9469     {
9470       // SYSCALL instruction (software interrupt)
9471       nr=0;
9472     }
9473     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
9474     {
9475       // ERET instruction (return from interrupt)
9476       nr=0;
9477     }
9478     else // Non-branch
9479     {
9480       if(i<slen-1) {
9481         for(hr=0;hr<HOST_REGS;hr++) {
9482           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
9483           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
9484           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
9485           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
9486         }
9487       }
9488     }
9489     for(hr=0;hr<HOST_REGS;hr++)
9490     {
9491       // Overwritten registers are not needed
9492       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9493       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9494       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
9495       // Source registers are needed
9496       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9497       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9498       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
9499       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
9500       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9501       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9502       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9503       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
9504       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
9505         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9506         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9507       }
9508       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
9509         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9510         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9511       }
9512       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
9513         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
9514         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
9515       }
9516       // Don't store a register immediately after writing it,
9517       // as that may prevent dual-issue.
9518       // But do so if this is a branch target, otherwise we
9519       // might have to load the register before the branch.
9520       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
9521         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
9522            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
9523           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9524           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
9525         }
9526         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
9527            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
9528           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9529           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
9530         }
9531       }
9532     }
9533     // Cycle count is needed at branches.  Assume it is needed at the target too.
9534     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
9535       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9536       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
9537     }
9538     // Save it
9539     needed_reg[i]=nr;
9540     
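    /* Note on the register numbering used below (evident from this file's
     * usage, summarized here for readability): regmap entries store the guest
     * register in the low 6 bits and set bit 6 for the upper 32-bit half of a
     * 64-bit value, hence the &63, |64 and ^64 operations.
     */
    #if 0
    signed char mapped=5|64;       /* slot holds the upper half of guest r5 */
    int guest=mapped&63;           /* -> 5, the architectural register      */
    int upper=(mapped&64)!=0;      /* -> 1, this slot is the high word      */
    #endif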
9541     // Deallocate unneeded registers
9542     for(hr=0;hr<HOST_REGS;hr++)
9543     {
9544       if(!((nr>>hr)&1)) {
9545         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
9546         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9547            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9548            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
9549         {
9550           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9551           {
9552             if(likely[i]) {
9553               regs[i].regmap[hr]=-1;
9554               regs[i].isconst&=~(1<<hr);
9555               if(i<slen-2) regmap_pre[i+2][hr]=-1;
9556             }
9557           }
9558         }
9559         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9560         {
9561           int d1=0,d2=0,map=0,temp=0;
9562           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
9563           {
9564             d1=dep1[i+1];
9565             d2=dep2[i+1];
9566           }
9567           if(using_tlb) {
9568             if(itype[i+1]==LOAD || itype[i+1]==LOADLR ||
9569                itype[i+1]==STORE || itype[i+1]==STORELR ||
9570                itype[i+1]==C1LS || itype[i+1]==C2LS)
9571             map=TLREG;
9572           } else
9573           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
9574              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9575             map=INVCP;
9576           }
9577           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
9578              itype[i+1]==C1LS || itype[i+1]==C2LS)
9579             temp=FTEMP;
9580           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
9581              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9582              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
9583              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
9584              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9585              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
9586              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
9587              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
9588              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
9589              regs[i].regmap[hr]!=map )
9590           {
9591             regs[i].regmap[hr]=-1;
9592             regs[i].isconst&=~(1<<hr);
9593             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
9594                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
9595                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
9596                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
9597                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
9598                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
9599                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
9600                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
9601                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
9602                branch_regs[i].regmap[hr]!=map)
9603             {
9604               branch_regs[i].regmap[hr]=-1;
9605               branch_regs[i].regmap_entry[hr]=-1;
9606               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
9607               {
9608                 if(!likely[i]&&i<slen-2) {
9609                   regmap_pre[i+2][hr]=-1;
9610                 }
9611               }
9612             }
9613           }
9614         }
9615         else
9616         {
9617           // Non-branch
9618           if(i>0)
9619           {
9620             int d1=0,d2=0,map=-1,temp=-1;
9621             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
9622             {
9623               d1=dep1[i];
9624               d2=dep2[i];
9625             }
9626             if(using_tlb) {
9627               if(itype[i]==LOAD || itype[i]==LOADLR ||
9628                  itype[i]==STORE || itype[i]==STORELR ||
9629                  itype[i]==C1LS || itype[i]==C2LS)
9630               map=TLREG;
9631             } else if(itype[i]==STORE || itype[i]==STORELR ||
9632                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
9633               map=INVCP;
9634             }
9635             if(itype[i]==LOADLR || itype[i]==STORELR ||
9636                itype[i]==C1LS || itype[i]==C2LS)
9637               temp=FTEMP;
9638             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
9639                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
9640                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
9641                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
9642                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
9643                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
9644             {
9645               if(i<slen-1&&!is_ds[i]) {
9646                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
9647                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
9648                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
9649                 {
9650                   printf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
9651                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
9652                 }
9653                 regmap_pre[i+1][hr]=-1;
9654                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
9655               }
9656               regs[i].regmap[hr]=-1;
9657               regs[i].isconst&=~(1<<hr);
9658             }
9659           }
9660         }
9661       }
9662     }
9663   }
9664   
9665   /* Pass 5 - Pre-allocate registers */
9666   
9667   // If a register is allocated during a loop, try to allocate it for the
9668   // entire loop, if possible.  This avoids loading/storing registers
9669   // inside the loop.
9670
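  /* Simplified, hypothetical illustration of what this pass attempts: when a
   * backward branch forms a loop and a guest register is already held in a
   * host register at the branch, record that mapping in f_regmap and try to
   * extend it back to the loop head, so the value is not reloaded on every
   * iteration.
   */
  #if 0
  signed char loop_map[HOST_REGS];   /* candidate mapping, one per host reg */
  clear_all_regs(loop_map);          /* start with no candidates            */
  loop_map[3]=7;                     /* e.g. try to keep guest r7 in host 3 */
  #endif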
9671   signed char f_regmap[HOST_REGS];
9672   clear_all_regs(f_regmap);
9673   for(i=0;i<slen-1;i++)
9674   {
9675     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9676     {
9677       if(ba[i]>=start && ba[i]<(start+i*4)) 
9678       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
9679       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
9680       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9681       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9682       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
9683       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
9684       {
9685         int t=(ba[i]-start)>>2;
9686         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
9687         if(t<2||(itype[t-2]!=UJUMP)) // call/ret assumes no registers allocated
9688         for(hr=0;hr<HOST_REGS;hr++)
9689         {
9690           if(regs[i].regmap[hr]>64) {
9691             if(!((regs[i].dirty>>hr)&1))
9692               f_regmap[hr]=regs[i].regmap[hr];
9693             else f_regmap[hr]=-1;
9694           }
9695           else if(regs[i].regmap[hr]>=0) f_regmap[hr]=regs[i].regmap[hr];
9696           if(branch_regs[i].regmap[hr]>64) {
9697             if(!((branch_regs[i].dirty>>hr)&1))
9698               f_regmap[hr]=branch_regs[i].regmap[hr];
9699             else f_regmap[hr]=-1;
9700           }
9701           else if(branch_regs[i].regmap[hr]>=0) f_regmap[hr]=branch_regs[i].regmap[hr];
9702           if(itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
9703           ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
9704           ||itype[i+1]==FCOMP||itype[i+1]==FCONV
9705           ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
9706           {
9707             // Test both in case the delay slot is out of order (ooo);
9708             // this could be done better...
9709             if(count_free_regs(branch_regs[i].regmap)<2
9710              ||count_free_regs(regs[i].regmap)<2) 
9711               f_regmap[hr]=branch_regs[i].regmap[hr];
9712           }
9713           // Avoid dirty->clean transition
9714           // #ifdef DESTRUCTIVE_WRITEBACK here?
9715           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
9716           if(f_regmap[hr]>0) {
9717             if(regs[t].regmap_entry[hr]<0) {
9718               int r=f_regmap[hr];
9719               for(j=t;j<=i;j++)
9720               {
9721                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9722                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
9723                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
9724                 if(r>63) {
9725                   // NB This can exclude the case where the upper-half
9726                   // register is lower numbered than the lower-half
9727                   // register.  Not sure if it's worth fixing...
9728                   if(get_reg(regs[j].regmap,r&63)<0) break;
9729                   if(regs[j].is32&(1LL<<(r&63))) break;
9730                 }
9731                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
9732                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
9733                   int k;
9734                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
9735                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
9736                     if(r>63) {
9737                       if(get_reg(regs[i].regmap,r&63)<0) break;
9738                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
9739                     }
9740                     k=i;
9741                     while(k>1&&regs[k-1].regmap[hr]==-1) {
9742                       if(itype[k-1]==STORE||itype[k-1]==STORELR
9743                       ||itype[k-1]==C1LS||itype[k-1]==SHIFT||itype[k-1]==COP1
9744                       ||itype[k-1]==FLOAT||itype[k-1]==FCONV||itype[k-1]==FCOMP
9745                       ||itype[k-1]==COP2||itype[k-1]==C2LS||itype[k-1]==C2OP) {
9746                         if(count_free_regs(regs[k-1].regmap)<2) {
9747                           //printf("no free regs for store %x\n",start+(k-1)*4);
9748                           break;
9749                         }
9750                       }
9751                       else
9752                       if(itype[k-1]!=NOP&&itype[k-1]!=MOV&&itype[k-1]!=ALU&&itype[k-1]!=SHIFTIMM&&itype[k-1]!=IMM16&&itype[k-1]!=LOAD) break;
9753                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
9754                         //printf("no-match due to different register\n");
9755                         break;
9756                       }
9757                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
9758                         //printf("no-match due to branch\n");
9759                         break;
9760                       }
9761                       // call/ret fast path assumes no registers allocated
9762                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)) {
9763                         break;
9764                       }
9765                       if(r>63) {
9766                         // NB This can exclude the case where the upper-half
9767                         // register is lower numbered than the lower-half
9768                         // register.  Not sure if it's worth fixing...
9769                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
9770                         if(regs[k-1].is32&(1LL<<(r&63))) break;
9771                       }
9772                       k--;
9773                     }
9774                     if(i<slen-1) {
9775                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
9776                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
9777                         //printf("bad match after branch\n");
9778                         break;
9779                       }
9780                     }
9781                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
9782                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
9783                       while(k<i) {
9784                         regs[k].regmap_entry[hr]=f_regmap[hr];
9785                         regs[k].regmap[hr]=f_regmap[hr];
9786                         regmap_pre[k+1][hr]=f_regmap[hr];
9787                         regs[k].wasdirty&=~(1<<hr);
9788                         regs[k].dirty&=~(1<<hr);
9789                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
9790                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
9791                         regs[k].wasconst&=~(1<<hr);
9792                         regs[k].isconst&=~(1<<hr);
9793                         k++;
9794                       }
9795                     }
9796                     else {
9797                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
9798                       break;
9799                     }
9800                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
9801                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
9802                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
9803                       regs[i].regmap_entry[hr]=f_regmap[hr];
9804                       regs[i].regmap[hr]=f_regmap[hr];
9805                       regs[i].wasdirty&=~(1<<hr);
9806                       regs[i].dirty&=~(1<<hr);
9807                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
9808                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
9809                       regs[i].wasconst&=~(1<<hr);
9810                       regs[i].isconst&=~(1<<hr);
9811                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
9812                       branch_regs[i].wasdirty&=~(1<<hr);
9813                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
9814                       branch_regs[i].regmap[hr]=f_regmap[hr];
9815                       branch_regs[i].dirty&=~(1<<hr);
9816                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
9817                       branch_regs[i].wasconst&=~(1<<hr);
9818                       branch_regs[i].isconst&=~(1<<hr);
9819                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
9820                         regmap_pre[i+2][hr]=f_regmap[hr];
9821                         regs[i+2].wasdirty&=~(1<<hr);
9822                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
9823                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
9824                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
9825                       }
9826                     }
9827                   }
9828                   for(k=t;k<j;k++) {
9829                     regs[k].regmap_entry[hr]=f_regmap[hr];
9830                     regs[k].regmap[hr]=f_regmap[hr];
9831                     regmap_pre[k+1][hr]=f_regmap[hr];
9832                     regs[k+1].wasdirty&=~(1<<hr);
9833                     regs[k].dirty&=~(1<<hr);
9834                     regs[k].wasconst&=~(1<<hr);
9835                     regs[k].isconst&=~(1<<hr);
9836                   }
9837                   if(regs[j].regmap[hr]==f_regmap[hr])
9838                     regs[j].regmap_entry[hr]=f_regmap[hr];
9839                   break;
9840                 }
9841                 if(j==i) break;
9842                 if(regs[j].regmap[hr]>=0)
9843                   break;
9844                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
9845                   //printf("no-match due to different register\n");
9846                   break;
9847                 }
9848                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
9849                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
9850                   break;
9851                 }
9852                 if(itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS
9853                 ||itype[j]==SHIFT||itype[j]==COP1||itype[j]==FLOAT
9854                 ||itype[j]==FCOMP||itype[j]==FCONV
9855                 ||itype[j]==COP2||itype[j]==C2LS||itype[j]==C2OP) {
9856                   if(count_free_regs(regs[j].regmap)<2) {
9857                     //printf("No free regs for store %x\n",start+j*4);
9858                     break;
9859                   }
9860                 }
9861                 else if(itype[j]!=NOP&&itype[j]!=MOV&&itype[j]!=ALU&&itype[j]!=SHIFTIMM&&itype[j]!=IMM16&&itype[j]!=LOAD) break;
9862                 if(f_regmap[hr]>=64) {
9863                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
9864                     break;
9865                   }
9866                   else
9867                   {
9868                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
9869                       break;
9870                     }
9871                   }
9872                 }
9873               }
9874             }
9875           }
9876         }
9877       }
9878     }else{
9879       int count=0;
9880       for(hr=0;hr<HOST_REGS;hr++)
9881       {
9882         if(hr!=EXCLUDE_REG) {
9883           if(regs[i].regmap[hr]>64) {
9884             if(!((regs[i].dirty>>hr)&1))
9885               f_regmap[hr]=regs[i].regmap[hr];
9886           }
9887           else if(regs[i].regmap[hr]>=0) f_regmap[hr]=regs[i].regmap[hr];
9888           else if(regs[i].regmap[hr]<0) count++;
9889         }
9890       }
9891       // Try to restore cycle count at branch targets
9892       if(bt[i]) {
9893         for(j=i;j<slen-1;j++) {
9894           if(regs[j].regmap[HOST_CCREG]!=-1) break;
9895           if(itype[j]==STORE||itype[j]==STORELR||itype[j]==C1LS
9896           ||itype[j]==SHIFT||itype[j]==COP1||itype[j]==FLOAT
9897           ||itype[j]==FCOMP||itype[j]==FCONV
9898           ||itype[j]==COP2||itype[j]==C2LS||itype[j]==C2OP) {
9899             if(count_free_regs(regs[j].regmap)<2) {
9900               //printf("no free regs for store %x\n",start+j*4);
9901               break;
9902             }
9903           }
9904           else
9905           if(itype[j]!=NOP&&itype[j]!=MOV&&itype[j]!=ALU&&itype[j]!=SHIFTIMM&&itype[j]!=IMM16&&itype[j]!=LOAD) break;
9906         }
9907         if(regs[j].regmap[HOST_CCREG]==CCREG) {
9908           int k=i;
9909           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
9910           while(k<j) {
9911             regs[k].regmap_entry[HOST_CCREG]=CCREG;
9912             regs[k].regmap[HOST_CCREG]=CCREG;
9913             regmap_pre[k+1][HOST_CCREG]=CCREG;
9914             regs[k+1].wasdirty|=1<<HOST_CCREG;
9915             regs[k].dirty|=1<<HOST_CCREG;
9916             regs[k].wasconst&=~(1<<HOST_CCREG);
9917             regs[k].isconst&=~(1<<HOST_CCREG);
9918             k++;
9919           }
9920           regs[j].regmap_entry[HOST_CCREG]=CCREG;          
9921         }
9922         // Work backwards from the branch target
9923         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9924         {
9925           //printf("Extend backwards\n");
9926           int k;
9927           k=i;
9928           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9929             if(itype[k-1]==STORE||itype[k-1]==STORELR||itype[k-1]==C1LS
9930             ||itype[k-1]==SHIFT||itype[k-1]==COP1||itype[k-1]==FLOAT
9931             ||itype[k-1]==FCONV||itype[k-1]==FCOMP
9932             ||itype[k-1]==COP2||itype[k-1]==C2LS||itype[k-1]==C2OP) {
9933               if(count_free_regs(regs[k-1].regmap)<2) {
9934                 //printf("no free regs for store %x\n",start+(k-1)*4);
9935                 break;
9936               }
9937             }
9938             else
9939             if(itype[k-1]!=NOP&&itype[k-1]!=MOV&&itype[k-1]!=ALU&&itype[k-1]!=SHIFTIMM&&itype[k-1]!=IMM16&&itype[k-1]!=LOAD) break;
9940             k--;
9941           }
9942           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9943             //printf("Extend CC, %x ->\n",start+k*4);
9944             while(k<=i) {
9945               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9946               regs[k].regmap[HOST_CCREG]=CCREG;
9947               regmap_pre[k+1][HOST_CCREG]=CCREG;
9948               regs[k+1].wasdirty|=1<<HOST_CCREG;
9949               regs[k].dirty|=1<<HOST_CCREG;
9950               regs[k].wasconst&=~(1<<HOST_CCREG);
9951               regs[k].isconst&=~(1<<HOST_CCREG);
9952               k++;
9953             }
9954           }
9955           else {
9956             //printf("Fail Extend CC, %x ->\n",start+k*4);
9957           }
9958         }
9959       }
9960       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9961          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9962          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9963          itype[i]!=FCONV&&itype[i]!=FCOMP&&
9964          itype[i]!=COP2&&itype[i]!=C2LS&&itype[i]!=C2OP)
9965       {
9966         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9967       }
9968     }
9969   }
9970   
9971   // This allocates registers (if possible) one instruction prior
9972   // to use, which can avoid a load-use penalty on certain CPUs.
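  /* Hypothetical example of the transformation below: if instruction i+1
   * reads guest r5 from host register 2 and that host register is free at
   * instruction i, the mapping is pulled one slot earlier so the load has
   * completed before it is used.
   */
  #if 0
  regs[i].regmap[2]=5;             /* start holding guest r5 one insn early */
  regmap_pre[i+1][2]=5;            /* instruction i+1 now expects it there  */
  regs[i+1].regmap_entry[2]=5;     /* ...and records it in its entry map    */
  #endif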
9973   for(i=0;i<slen-1;i++)
9974   {
9975     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9976     {
9977       if(!bt[i+1])
9978       {
9979         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
9980            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
9981         {
9982           if(rs1[i+1]) {
9983             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9984             {
9985               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9986               {
9987                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9988                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9989                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9990                 regs[i].isconst&=~(1<<hr);
9991                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9992                 constmap[i][hr]=constmap[i+1][hr];
9993                 regs[i+1].wasdirty&=~(1<<hr);
9994                 regs[i].dirty&=~(1<<hr);
9995               }
9996             }
9997           }
9998           if(rs2[i+1]) {
9999             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
10000             {
10001               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10002               {
10003                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
10004                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
10005                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
10006                 regs[i].isconst&=~(1<<hr);
10007                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10008                 constmap[i][hr]=constmap[i+1][hr];
10009                 regs[i+1].wasdirty&=~(1<<hr);
10010                 regs[i].dirty&=~(1<<hr);
10011               }
10012             }
10013           }
10014           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10015             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10016             {
10017               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10018               {
10019                 regs[i].regmap[hr]=rs1[i+1];
10020                 regmap_pre[i+1][hr]=rs1[i+1];
10021                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10022                 regs[i].isconst&=~(1<<hr);
10023                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10024                 constmap[i][hr]=constmap[i+1][hr];
10025                 regs[i+1].wasdirty&=~(1<<hr);
10026                 regs[i].dirty&=~(1<<hr);
10027               }
10028             }
10029           }
10030           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10031             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
10032             {
10033               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10034               {
10035                 regs[i].regmap[hr]=rs1[i+1];
10036                 regmap_pre[i+1][hr]=rs1[i+1];
10037                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10038                 regs[i].isconst&=~(1<<hr);
10039                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10040                 constmap[i][hr]=constmap[i+1][hr];
10041                 regs[i+1].wasdirty&=~(1<<hr);
10042                 regs[i].dirty&=~(1<<hr);
10043               }
10044             }
10045           }
10046           #ifndef HOST_IMM_ADDR32
10047           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
10048             hr=get_reg(regs[i+1].regmap,TLREG);
10049             if(hr>=0) {
10050               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
10051               if(sr>=0&&((regs[i+1].wasconst>>sr)&1)) {
10052                 int nr;
10053                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10054                 {
10055                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
10056                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
10057                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
10058                   regs[i].isconst&=~(1<<hr);
10059                   regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10060                   constmap[i][hr]=constmap[i+1][hr];
10061                   regs[i+1].wasdirty&=~(1<<hr);
10062                   regs[i].dirty&=~(1<<hr);
10063                 }
10064                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10065                 {
10066                   // move it to another register
10067                   regs[i+1].regmap[hr]=-1;
10068                   regmap_pre[i+2][hr]=-1;
10069                   regs[i+1].regmap[nr]=TLREG;
10070                   regmap_pre[i+2][nr]=TLREG;
10071                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
10072                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
10073                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
10074                   regs[i].isconst&=~(1<<nr);
10075                   regs[i+1].isconst&=~(1<<nr);
10076                   regs[i].dirty&=~(1<<nr);
10077                   regs[i+1].wasdirty&=~(1<<nr);
10078                   regs[i+1].dirty&=~(1<<nr);
10079                   regs[i+2].wasdirty&=~(1<<nr);
10080                 }
10081               }
10082             }
10083           }
10084           #endif
10085           if(itype[i+1]==STORE||itype[i+1]==STORELR
10086              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
10087             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10088               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
10089               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10090               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
10091               assert(hr>=0);
10092               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10093               {
10094                 regs[i].regmap[hr]=rs1[i+1];
10095                 regmap_pre[i+1][hr]=rs1[i+1];
10096                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10097                 regs[i].isconst&=~(1<<hr);
10098                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10099                 constmap[i][hr]=constmap[i+1][hr];
10100                 regs[i+1].wasdirty&=~(1<<hr);
10101                 regs[i].dirty&=~(1<<hr);
10102               }
10103             }
10104           }
10105           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
10106             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
10107               int nr;
10108               hr=get_reg(regs[i+1].regmap,FTEMP);
10109               assert(hr>=0);
10110               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
10111               {
10112                 regs[i].regmap[hr]=rs1[i+1];
10113                 regmap_pre[i+1][hr]=rs1[i+1];
10114                 regs[i+1].regmap_entry[hr]=rs1[i+1];
10115                 regs[i].isconst&=~(1<<hr);
10116                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
10117                 constmap[i][hr]=constmap[i+1][hr];
10118                 regs[i+1].wasdirty&=~(1<<hr);
10119                 regs[i].dirty&=~(1<<hr);
10120               }
10121               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
10122               {
10123                 // move it to another register
10124                 regs[i+1].regmap[hr]=-1;
10125                 regmap_pre[i+2][hr]=-1;
10126                 regs[i+1].regmap[nr]=FTEMP;
10127                 regmap_pre[i+2][nr]=FTEMP;
10128                 regs[i].regmap[nr]=rs1[i+1];
10129                 regmap_pre[i+1][nr]=rs1[i+1];
10130                 regs[i+1].regmap_entry[nr]=rs1[i+1];
10131                 regs[i].isconst&=~(1<<nr);
10132                 regs[i+1].isconst&=~(1<<nr);
10133                 regs[i].dirty&=~(1<<nr);
10134                 regs[i+1].wasdirty&=~(1<<nr);
10135                 regs[i+1].dirty&=~(1<<nr);
10136                 regs[i+2].wasdirty&=~(1<<nr);
10137               }
10138             }
10139           }
10140           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
10141             if(itype[i+1]==LOAD) 
10142               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
10143             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
10144               hr=get_reg(regs[i+1].regmap,FTEMP);
10145             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
10146               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
10147               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
10148             }
10149             if(hr>=0&&regs[i].regmap[hr]<0) {
10150               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
10151               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
10152                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
10153                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
10154                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
10155                 regs[i].isconst&=~(1<<hr);
10156                 regs[i+1].wasdirty&=~(1<<hr);
10157                 regs[i].dirty&=~(1<<hr);
10158               }
10159             }
10160           }
10161         }
10162       }
10163     }
10164   }
10165   
10166   /* Pass 6 - Optimize clean/dirty state */
10167   clean_registers(0,slen-1,1);
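  /* Informal note: a "dirty" host register caches a guest value that has not
   * yet been written back to the in-memory register file; clean_registers()
   * propagates that state across the block so write-backs are emitted only
   * where they are actually needed.  The bookkeeping pattern used throughout
   * this file looks like:
   */
  #if 0
  regs[i].dirty|=1<<hr;     /* host reg hr now differs from the memory copy */
  regs[i].dirty&=~(1<<hr);  /* value written back or discarded, clean again */
  #endif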
10168   
10169   /* Pass 7 - Identify 32-bit registers */
10170   
10171   provisional_r32();
10172
10173   u_int r32=0;
10174   
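  /* Illustrative sketch (hypothetical guest register numbers): r32 collects,
   * scanning backwards, the guest registers whose 32-bit (sign-extended) form
   * is still required further on; sources add bits, overwritten destinations
   * drop them.
   */
  #if 0
  uint64_t need32=0;
  need32|=1LL<<4;           /* a later use needs guest r4 as a 32-bit value */
  need32&=~(1LL<<2);        /* guest r2 is overwritten, requirement dropped */
  #endif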
10175   for (i=slen-1;i>=0;i--)
10176   {
10177     int hr;
10178     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10179     {
10180       if(ba[i]<start || ba[i]>=(start+slen*4))
10181       {
10182         // Branch out of this block, don't need anything
10183         r32=0;
10184       }
10185       else
10186       {
10187         // Internal branch
10188         // Need whatever matches the target
10189         // (and doesn't get overwritten by the delay slot instruction)
10190         r32=0;
10191         int t=(ba[i]-start)>>2;
10192         if(ba[i]>start+i*4) {
10193           // Forward branch
10194           if(!(requires_32bit[t]&~regs[i].was32))
10195             r32|=requires_32bit[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10196         }else{
10197           // Backward branch
10198           //if(!(regs[t].was32&~unneeded_reg_upper[t]&~regs[i].was32))
10199           //  r32|=regs[t].was32&~unneeded_reg_upper[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10200           if(!(pr32[t]&~regs[i].was32))
10201             r32|=pr32[t]&(~(1LL<<rt1[i+1]))&(~(1LL<<rt2[i+1]));
10202         }
10203       }
10204       // Conditional branch may need registers for following instructions
10205       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
10206       {
10207         if(i<slen-2) {
10208           r32|=requires_32bit[i+2];
10209           r32&=regs[i].was32;
10210           // Mark this address as a branch target since it may be called
10211           // upon return from interrupt
10212           bt[i+2]=1;
10213         }
10214       }
10215       // Merge in delay slot
10216       if(!likely[i]) {
10217         // These are overwritten unless the branch is "likely"
10218         // and the delay slot is nullified if not taken
10219         r32&=~(1LL<<rt1[i+1]);
10220         r32&=~(1LL<<rt2[i+1]);
10221       }
10222       // Assume these are needed (delay slot)
10223       if(us1[i+1]>0)
10224       {
10225         if((regs[i].was32>>us1[i+1])&1) r32|=1LL<<us1[i+1];
10226       }
10227       if(us2[i+1]>0)
10228       {
10229         if((regs[i].was32>>us2[i+1])&1) r32|=1LL<<us2[i+1];
10230       }
10231       if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1))
10232       {
10233         if((regs[i].was32>>dep1[i+1])&1) r32|=1LL<<dep1[i+1];
10234       }
10235       if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1))
10236       {
10237         if((regs[i].was32>>dep2[i+1])&1) r32|=1LL<<dep2[i+1];
10238       }
10239     }
10240     else if(itype[i]==SYSCALL||itype[i]==HLECALL)
10241     {
10242       // SYSCALL instruction (software interrupt)
10243       r32=0;
10244     }
10245     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
10246     {
10247       // ERET instruction (return from interrupt)
10248       r32=0;
10249     }
10250     // Check 32 bits
10251     r32&=~(1LL<<rt1[i]);
10252     r32&=~(1LL<<rt2[i]);
10253     if(us1[i]>0)
10254     {
10255       if((regs[i].was32>>us1[i])&1) r32|=1LL<<us1[i];
10256     }
10257     if(us2[i]>0)
10258     {
10259       if((regs[i].was32>>us2[i])&1) r32|=1LL<<us2[i];
10260     }
10261     if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1))
10262     {
10263       if((regs[i].was32>>dep1[i])&1) r32|=1LL<<dep1[i];
10264     }
10265     if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1))
10266     {
10267       if((regs[i].was32>>dep2[i])&1) r32|=1LL<<dep2[i];
10268     }
10269     requires_32bit[i]=r32;
10270     
10271     // Dirty registers which are 32-bit, require 32-bit input
10272     // as they will be written as 32-bit values
10273     for(hr=0;hr<HOST_REGS;hr++)
10274     {
10275       if(regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64) {
10276         if((regs[i].was32>>regs[i].regmap_entry[hr])&(regs[i].wasdirty>>hr)&1) {
10277           if(!((unneeded_reg_upper[i]>>regs[i].regmap_entry[hr])&1))
10278           requires_32bit[i]|=1LL<<regs[i].regmap_entry[hr];
10279         }
10280       }
10281     }
10282     //requires_32bit[i]=is32[i]&~unneeded_reg_upper[i]; // DEBUG
10283   }
10284
10285   if(itype[slen-1]==SPAN) {
10286     bt[slen-1]=1; // Mark as a branch target so the instruction can restart after an exception
10287   }
10288   
10289   /* Debug/disassembly */
10290   if((void*)assem_debug==(void*)printf) 
10291   for(i=0;i<slen;i++)
10292   {
10293     printf("U:");
10294     int r;
10295     for(r=1;r<=CCREG;r++) {
10296       if((unneeded_reg[i]>>r)&1) {
10297         if(r==HIREG) printf(" HI");
10298         else if(r==LOREG) printf(" LO");
10299         else printf(" r%d",r);
10300       }
10301     }
10302 #ifndef FORCE32
10303     printf(" UU:");
10304     for(r=1;r<=CCREG;r++) {
10305       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
10306         if(r==HIREG) printf(" HI");
10307         else if(r==LOREG) printf(" LO");
10308         else printf(" r%d",r);
10309       }
10310     }
10311     printf(" 32:");
10312     for(r=0;r<=CCREG;r++) {
10313       //if(((is32[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10314       if((regs[i].was32>>r)&1) {
10315         if(r==CCREG) printf(" CC");
10316         else if(r==HIREG) printf(" HI");
10317         else if(r==LOREG) printf(" LO");
10318         else printf(" r%d",r);
10319       }
10320     }
10321 #endif
10322     printf("\n");
10323     #if defined(__i386__) || defined(__x86_64__)
10324     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
10325     #endif
10326     #ifdef __arm__
10327     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
10328     #endif
10329     printf("needs: ");
10330     if(needed_reg[i]&1) printf("eax ");
10331     if((needed_reg[i]>>1)&1) printf("ecx ");
10332     if((needed_reg[i]>>2)&1) printf("edx ");
10333     if((needed_reg[i]>>3)&1) printf("ebx ");
10334     if((needed_reg[i]>>5)&1) printf("ebp ");
10335     if((needed_reg[i]>>6)&1) printf("esi ");
10336     if((needed_reg[i]>>7)&1) printf("edi ");
10337     printf("r:");
10338     for(r=0;r<=CCREG;r++) {
10339       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10340       if((requires_32bit[i]>>r)&1) {
10341         if(r==CCREG) printf(" CC");
10342         else if(r==HIREG) printf(" HI");
10343         else if(r==LOREG) printf(" LO");
10344         else printf(" r%d",r);
10345       }
10346     }
10347     printf("\n");
10348     /*printf("pr:");
10349     for(r=0;r<=CCREG;r++) {
10350       //if(((requires_32bit[i]>>r)&(~unneeded_reg[i]>>r))&1) {
10351       if((pr32[i]>>r)&1) {
10352         if(r==CCREG) printf(" CC");
10353         else if(r==HIREG) printf(" HI");
10354         else if(r==LOREG) printf(" LO");
10355         else printf(" r%d",r);
10356       }
10357     }
10358     if(pr32[i]!=requires_32bit[i]) printf(" OOPS");
10359     printf("\n");*/
10360     #if defined(__i386__) || defined(__x86_64__)
10361     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
10362     printf("dirty: ");
10363     if(regs[i].wasdirty&1) printf("eax ");
10364     if((regs[i].wasdirty>>1)&1) printf("ecx ");
10365     if((regs[i].wasdirty>>2)&1) printf("edx ");
10366     if((regs[i].wasdirty>>3)&1) printf("ebx ");
10367     if((regs[i].wasdirty>>5)&1) printf("ebp ");
10368     if((regs[i].wasdirty>>6)&1) printf("esi ");
10369     if((regs[i].wasdirty>>7)&1) printf("edi ");
10370     #endif
10371     #ifdef __arm__
10372     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
10373     printf("dirty: ");
10374     if(regs[i].wasdirty&1) printf("r0 ");
10375     if((regs[i].wasdirty>>1)&1) printf("r1 ");
10376     if((regs[i].wasdirty>>2)&1) printf("r2 ");
10377     if((regs[i].wasdirty>>3)&1) printf("r3 ");
10378     if((regs[i].wasdirty>>4)&1) printf("r4 ");
10379     if((regs[i].wasdirty>>5)&1) printf("r5 ");
10380     if((regs[i].wasdirty>>6)&1) printf("r6 ");
10381     if((regs[i].wasdirty>>7)&1) printf("r7 ");
10382     if((regs[i].wasdirty>>8)&1) printf("r8 ");
10383     if((regs[i].wasdirty>>9)&1) printf("r9 ");
10384     if((regs[i].wasdirty>>10)&1) printf("r10 ");
10385     if((regs[i].wasdirty>>12)&1) printf("r12 ");
10386     #endif
10387     printf("\n");
10388     disassemble_inst(i);
10389     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
10390     #if defined(__i386__) || defined(__x86_64__)
10391     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
10392     if(regs[i].dirty&1) printf("eax ");
10393     if((regs[i].dirty>>1)&1) printf("ecx ");
10394     if((regs[i].dirty>>2)&1) printf("edx ");
10395     if((regs[i].dirty>>3)&1) printf("ebx ");
10396     if((regs[i].dirty>>5)&1) printf("ebp ");
10397     if((regs[i].dirty>>6)&1) printf("esi ");
10398     if((regs[i].dirty>>7)&1) printf("edi ");
10399     #endif
10400     #ifdef __arm__
10401     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
10402     if(regs[i].dirty&1) printf("r0 ");
10403     if((regs[i].dirty>>1)&1) printf("r1 ");
10404     if((regs[i].dirty>>2)&1) printf("r2 ");
10405     if((regs[i].dirty>>3)&1) printf("r3 ");
10406     if((regs[i].dirty>>4)&1) printf("r4 ");
10407     if((regs[i].dirty>>5)&1) printf("r5 ");
10408     if((regs[i].dirty>>6)&1) printf("r6 ");
10409     if((regs[i].dirty>>7)&1) printf("r7 ");
10410     if((regs[i].dirty>>8)&1) printf("r8 ");
10411     if((regs[i].dirty>>9)&1) printf("r9 ");
10412     if((regs[i].dirty>>10)&1) printf("r10 ");
10413     if((regs[i].dirty>>12)&1) printf("r12 ");
10414     #endif
10415     printf("\n");
10416     if(regs[i].isconst) {
10417       printf("constants: ");
10418       #if defined(__i386__) || defined(__x86_64__)
10419       if(regs[i].isconst&1) printf("eax=%x ",(int)constmap[i][0]);
10420       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(int)constmap[i][1]);
10421       if((regs[i].isconst>>2)&1) printf("edx=%x ",(int)constmap[i][2]);
10422       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(int)constmap[i][3]);
10423       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(int)constmap[i][5]);
10424       if((regs[i].isconst>>6)&1) printf("esi=%x ",(int)constmap[i][6]);
10425       if((regs[i].isconst>>7)&1) printf("edi=%x ",(int)constmap[i][7]);
10426       #endif
10427       #ifdef __arm__
10428       if(regs[i].isconst&1) printf("r0=%x ",(int)constmap[i][0]);
10429       if((regs[i].isconst>>1)&1) printf("r1=%x ",(int)constmap[i][1]);
10430       if((regs[i].isconst>>2)&1) printf("r2=%x ",(int)constmap[i][2]);
10431       if((regs[i].isconst>>3)&1) printf("r3=%x ",(int)constmap[i][3]);
10432       if((regs[i].isconst>>4)&1) printf("r4=%x ",(int)constmap[i][4]);
10433       if((regs[i].isconst>>5)&1) printf("r5=%x ",(int)constmap[i][5]);
10434       if((regs[i].isconst>>6)&1) printf("r6=%x ",(int)constmap[i][6]);
10435       if((regs[i].isconst>>7)&1) printf("r7=%x ",(int)constmap[i][7]);
10436       if((regs[i].isconst>>8)&1) printf("r8=%x ",(int)constmap[i][8]);
10437       if((regs[i].isconst>>9)&1) printf("r9=%x ",(int)constmap[i][9]);
10438       if((regs[i].isconst>>10)&1) printf("r10=%x ",(int)constmap[i][10]);
10439       if((regs[i].isconst>>12)&1) printf("r12=%x ",(int)constmap[i][12]);
10440       #endif
10441       printf("\n");
10442     }
10443 #ifndef FORCE32
10444     printf(" 32:");
10445     for(r=0;r<=CCREG;r++) {
10446       if((regs[i].is32>>r)&1) {
10447         if(r==CCREG) printf(" CC");
10448         else if(r==HIREG) printf(" HI");
10449         else if(r==LOREG) printf(" LO");
10450         else printf(" r%d",r);
10451       }
10452     }
10453     printf("\n");
10454 #endif
10455     /*printf(" p32:");
10456     for(r=0;r<=CCREG;r++) {
10457       if((p32[i]>>r)&1) {
10458         if(r==CCREG) printf(" CC");
10459         else if(r==HIREG) printf(" HI");
10460         else if(r==LOREG) printf(" LO");
10461         else printf(" r%d",r);
10462       }
10463     }
10464     if(p32[i]!=regs[i].is32) printf(" NO MATCH\n");
10465     else printf("\n");*/
10466     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
10467       #if defined(__i386__) || defined(__x86_64__)
10468       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
10469       if(branch_regs[i].dirty&1) printf("eax ");
10470       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
10471       if((branch_regs[i].dirty>>2)&1) printf("edx ");
10472       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
10473       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
10474       if((branch_regs[i].dirty>>6)&1) printf("esi ");
10475       if((branch_regs[i].dirty>>7)&1) printf("edi ");
10476       #endif
10477       #ifdef __arm__
10478       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
10479       if(branch_regs[i].dirty&1) printf("r0 ");
10480       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
10481       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
10482       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
10483       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
10484       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
10485       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
10486       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
10487       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
10488       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
10489       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
10490       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
10491       #endif
10492 #ifndef FORCE32
10493       printf(" 32:");
10494       for(r=0;r<=CCREG;r++) {
10495         if((branch_regs[i].is32>>r)&1) {
10496           if(r==CCREG) printf(" CC");
10497           else if(r==HIREG) printf(" HI");
10498           else if(r==LOREG) printf(" LO");
10499           else printf(" r%d",r);
10500         }
10501       }
10502       printf("\n");
10503 #endif
10504     }
10505   }
10506
10507   /* Pass 8 - Assembly */
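  // Overview of this pass: walk the block one instruction at a time, write
  // back any cached register state the new mapping no longer carries, load
  // the source registers and constants the instruction needs, then dispatch
  // to the per-itype assembler.  Branch types set ds=1 so the next iteration
  // skips the delay slot, which the branch assembler already emitted.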
10508   linkcount=0;stubcount=0;
10509   ds=0;is_delayslot=0;
10510   cop1_usable=0;
10511   uint64_t is32_pre=0;
10512   u_int dirty_pre=0;
10513   u_int beginning=(u_int)out;
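  // Note: a start address with bit 0 set means we are entering at the delay
  // slot of a branch that spans a page boundary; pagespan_ds() emits that
  // special delay-slot entry (the branch itself uses the SPAN itype).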
10514   if((u_int)addr&1) {
10515     ds=1;
10516     pagespan_ds();
10517   }
10518   for(i=0;i<slen;i++)
10519   {
10520     //if(ds) printf("ds: ");
10521     if((void*)assem_debug==(void*)printf) disassemble_inst(i);
10522     if(ds) {
10523       ds=0; // Skip delay slot
10524       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
10525       instr_addr[i]=0;
10526     } else {
10527       #ifndef DESTRUCTIVE_WRITEBACK
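      // Non-destructive writeback mode: before this instruction's own
      // writeback/invalidate below, flush state carried over from the previous
      // instruction (wb_sx appears to reconcile 32/64-bit sign-extension
      // status, wb_valid to store values that are still dirty and valid).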
10528       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10529       {
10530         wb_sx(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,is32_pre,regs[i].was32,
10531               unneeded_reg[i],unneeded_reg_upper[i]);
10532         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
10533               unneeded_reg[i],unneeded_reg_upper[i]);
10534       }
10535       is32_pre=regs[i].is32;
10536       dirty_pre=regs[i].dirty;
10537       #endif
10538       // write back
10539       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
10540       {
10541         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
10542                       unneeded_reg[i],unneeded_reg_upper[i]);
10543         loop_preload(regmap_pre[i],regs[i].regmap_entry);
10544       }
10545       // branch target entry point
10546       instr_addr[i]=(u_int)out;
10547       assem_debug("<->\n");
10548       // load regs
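      // If the entry mapping expects the cycle count in HOST_CCREG but the
      // body mapping does not keep it there, spill it first; then load this
      // instruction's source registers, do address generation for memory ops,
      // and materialize any constants that became register-mapped.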
10549       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
10550         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
10551       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
10552       address_generation(i,&regs[i],regs[i].regmap_entry);
10553       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
10554       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
10555       {
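        // The branch assemblers emit the delay slot inline with the branch,
        // so the delay slot's source registers must already be resident here.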
10556         // Load the delay slot registers if necessary
10557         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10558           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10559         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10560           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10561         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
10562           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
10563       }
10564       else if(i+1<slen)
10565       {
10566         // Preload registers for following instruction
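        // (but only if they are not already loaded as this instruction's
        //  sources, and not about to be overwritten by its destinations)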
10567         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
10568           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
10569             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
10570         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
10571           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
10572             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
10573       }
10574       // TODO: if(is_ooo(i)) address_generation(i+1);
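      // Conditional branches consume the cycle count, so make sure CCREG is
      // resident; stores also need INVCP (the invalid_code table pointer,
      // presumably used to detect writes into compiled pages).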
10575       if(itype[i]==CJUMP||itype[i]==FJUMP)
10576         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
10577       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
10578         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
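      // cop1_usable tracks whether a COP1-usable check has been emitted on
      // this path; control may enter at a branch target without passing one,
      // so reset it here and let the next COP1 instruction re-emit the check.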
10579       if(bt[i]) cop1_usable=0;
10580       // assemble
10581       switch(itype[i]) {
10582         case ALU:
10583           alu_assemble(i,&regs[i]);break;
10584         case IMM16:
10585           imm16_assemble(i,&regs[i]);break;
10586         case SHIFT:
10587           shift_assemble(i,&regs[i]);break;
10588         case SHIFTIMM:
10589           shiftimm_assemble(i,&regs[i]);break;
10590         case LOAD:
10591           load_assemble(i,&regs[i]);break;
10592         case LOADLR:
10593           loadlr_assemble(i,&regs[i]);break;
10594         case STORE:
10595           store_assemble(i,&regs[i]);break;
10596         case STORELR:
10597           storelr_assemble(i,&regs[i]);break;
10598         case COP0:
10599           cop0_assemble(i,&regs[i]);break;
10600         case COP1:
10601           cop1_assemble(i,&regs[i]);break;
10602         case C1LS:
10603           c1ls_assemble(i,&regs[i]);break;
10604         case COP2:
10605           cop2_assemble(i,&regs[i]);break;
10606         case C2LS:
10607           c2ls_assemble(i,&regs[i]);break;
10608         case C2OP:
10609           c2op_assemble(i,&regs[i]);break;
10610         case FCONV:
10611           fconv_assemble(i,&regs[i]);break;
10612         case FLOAT:
10613           float_assemble(i,&regs[i]);break;
10614         case FCOMP:
10615           fcomp_assemble(i,&regs[i]);break;
10616         case MULTDIV:
10617           multdiv_assemble(i,&regs[i]);break;
10618         case MOV:
10619           mov_assemble(i,&regs[i]);break;
10620         case SYSCALL:
10621           syscall_assemble(i,&regs[i]);break;
10622         case HLECALL:
10623           hlecall_assemble(i,&regs[i]);break;
10624         case UJUMP:
10625           ujump_assemble(i,&regs[i]);ds=1;break;
10626         case RJUMP:
10627           rjump_assemble(i,&regs[i]);ds=1;break;
10628         case CJUMP:
10629           cjump_assemble(i,&regs[i]);ds=1;break;
10630         case SJUMP:
10631           sjump_assemble(i,&regs[i]);ds=1;break;
10632         case FJUMP:
10633           fjump_assemble(i,&regs[i]);ds=1;break;
10634         case SPAN:
10635           pagespan_assemble(i,&regs[i]);break;
10636       }
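      // ARM literal pools: after an unconditional transfer nothing falls
      // through, so any pending constants can simply be dumped here; otherwise
      // the pool is only emitted (with a branch over it) when it is nearly
      // full.  The numeric arguments are presumably headroom thresholds.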
10637       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
10638         literal_pool(1024);
10639       else
10640         literal_pool_jumpover(256);
10641     }
10642   }
10643   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
10644   // If the block did not end with an unconditional branch,
10645   // add a jump to the next instruction.
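  // Fall-through epilogue: store any dirty registers using the branch-target
  // convention, bring the cycle counter up to date (adding
  // CLOCK_DIVIDER*(ccadj+1) when the previous instruction was not a branch),
  // and emit a jump recorded via add_to_linker() for pass 9 to patch.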
10646   if(i>1) {
10647     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
10648       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10649       assert(i==slen);
10650       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
10651         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10652         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10653           emit_loadreg(CCREG,HOST_CCREG);
10654         emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10655       }
10656       else if(!likely[i-2])
10657       {
10658         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
10659         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
10660       }
10661       else
10662       {
10663         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
10664         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
10665       }
10666       add_to_linker((int)out,start+i*4,0);
10667       emit_jmp(0);
10668     }
10669   }
10670   else
10671   {
10672     assert(i>0);
10673     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
10674     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
10675     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
10676       emit_loadreg(CCREG,HOST_CCREG);
10677     emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
10678     add_to_linker((int)out,start+i*4,0);
10679     emit_jmp(0);
10680   }
10681
10682   // TODO: delay slot stubs?
10683   // Stubs
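  // Out-of-line slow paths collected during assembly: memory accesses that
  // miss the fast path, cycle-count expiry checks (CC_STUB), invalid-code
  // checks (INVCODE_STUB), COP1-unusable exceptions (FP_STUB) and unaligned
  // stores (STORELR_STUB).  Each recorded stub is emitted here.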
10684   for(i=0;i<stubcount;i++)
10685   {
10686     switch(stubs[i][0])
10687     {
10688       case LOADB_STUB:
10689       case LOADH_STUB:
10690       case LOADW_STUB:
10691       case LOADD_STUB:
10692       case LOADBU_STUB:
10693       case LOADHU_STUB:
10694         do_readstub(i);break;
10695       case STOREB_STUB:
10696       case STOREH_STUB:
10697       case STOREW_STUB:
10698       case STORED_STUB:
10699         do_writestub(i);break;
10700       case CC_STUB:
10701         do_ccstub(i);break;
10702       case INVCODE_STUB:
10703         do_invstub(i);break;
10704       case FP_STUB:
10705         do_cop1stub(i);break;
10706       case STORELR_STUB:
10707         do_unalignedwritestub(i);break;
10708     }
10709   }
10710
10711   /* Pass 9 - Linker */
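  // Resolve the branches recorded by add_to_linker().  Internal branches are
  // patched directly to instr_addr[target].  External branches always get an
  // emit_extjump() trampoline: if check_addr() already finds compiled code for
  // the target, the branch is patched straight to it and registered via
  // add_link() so the link can be undone later; otherwise the branch simply
  // goes to the trampoline.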
10712   for(i=0;i<linkcount;i++)
10713   {
10714     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
10715     literal_pool(64);
10716     if(!link_addr[i][2])
10717     {
10718       void *stub=out;
10719       void *addr=check_addr(link_addr[i][1]);
10720       emit_extjump(link_addr[i][0],link_addr[i][1]);
10721       if(addr) {
10722         set_jump_target(link_addr[i][0],(int)addr);
10723         add_link(link_addr[i][1],stub);
10724       }
10725       else set_jump_target(link_addr[i][0],(int)stub);
10726     }
10727     else
10728     {
10729       // Internal branch
10730       int target=(link_addr[i][1]-start)>>2;
10731       assert(target>=0&&target<slen);
10732       assert(instr_addr[target]);
10733       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10734       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
10735       //#else
10736       set_jump_target(link_addr[i][0],instr_addr[target]);
10737       //#endif
10738     }
10739   }
10740   // External Branch Targets (jump_in)
10741   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
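  // Register every entry point of this block (its start and each branch
  // target).  Both jump_dirty and jump_in initially point at the stub emitted
  // by do_dirty_stub(), which checks the saved copy of the source code against
  // RAM before the block is entered.  Entry points that require certain
  // registers to be 32-bit use the ll_add_32 variants so that requirement is
  // recorded with the entry.  (Each hash table bin holds two vaddr/entry pairs.)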
10742   for(i=0;i<slen;i++)
10743   {
10744     if(bt[i]||i==0)
10745     {
10746       if(instr_addr[i]) // TODO - delay slots (=null)
10747       {
10748         u_int vaddr=start+i*4;
10749         u_int page=get_page(vaddr);
10750         u_int vpage=get_vpage(vaddr);
10751         literal_pool(256);
10752         //if(!(is32[i]&(~unneeded_reg_upper[i])&~(1LL<<CCREG)))
10753         if(!requires_32bit[i])
10754         {
10755           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10756           assem_debug("jump_in: %x\n",start+i*4);
10757           ll_add(jump_dirty+vpage,vaddr,(void *)out);
10758           int entry_point=do_dirty_stub(i);
10759           ll_add(jump_in+page,vaddr,(void *)entry_point);
10760           // If there was an existing entry in the hash table,
10761           // replace it with the new address.
10762           // Don't add new entries.  We'll insert the
10763           // ones that actually get used in check_addr().
10764           int *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
10765           if(ht_bin[0]==vaddr) {
10766             ht_bin[1]=entry_point;
10767           }
10768           if(ht_bin[2]==vaddr) {
10769             ht_bin[3]=entry_point;
10770           }
10771         }
10772         else
10773         {
10774           u_int r=requires_32bit[i]|!!(requires_32bit[i]>>32);
10775           assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*4);
10776           assem_debug("jump_in: %x (restricted - %x)\n",start+i*4,r);
10777           //int entry_point=(int)out;
10778           ////assem_debug("entry_point: %x\n",entry_point);
10779           //load_regs_entry(i);
10780           //if(entry_point==(int)out)
10781           //  entry_point=instr_addr[i];
10782           //else
10783           //  emit_jmp(instr_addr[i]);
10784           //ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
10785           ll_add_32(jump_dirty+vpage,vaddr,r,(void *)out);
10786           int entry_point=do_dirty_stub(i);
10787           ll_add_32(jump_in+page,vaddr,r,(void *)entry_point);
10788         }
10789       }
10790     }
10791   }
10792   // Write out the literal pool if necessary
10793   literal_pool(0);
10794   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
10795   // Align code
10796   if(((u_int)out)&7) emit_addnop(13);
10797   #endif
10798   assert((u_int)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
10799   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
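  // Save a copy of the source MIPS code; this is what the dirty-check stubs
  // registered above compare against RAM to detect self-modifying code.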
10800   memcpy(copy,source,slen*4);
10801   copy+=slen*4;
10802   
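  // On ARM the newly written code must be flushed from the data cache and the
  // instruction cache invalidated before it can be executed.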
10803   #ifdef __arm__
10804   __clear_cache((void *)beginning,out);
10805   #endif
10806   
10807   // If we're within MAX_OUTPUT_BLOCK_SIZE (256K) of the end of the buffer,
10808   // start over from the beginning (the assert above guarantees that no single block exceeds that size).
10809   if((int)out>BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE) out=(u_char *)BASE_ADDR;
10810   
10811   // Trap writes to any of the pages we compiled
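  // Clearing invalid_code[] marks these pages as containing compiled code, so
  // stores to them go through the invalidation check.  With TLB enabled the
  // 0x40000000 bit in memory_map apparently acts as the write-protect flag,
  // and for TLB-mapped addresses the backing physical page is protected too.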
10812   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
10813     invalid_code[i]=0;
10814 #ifndef DISABLE_TLB
10815     memory_map[i]|=0x40000000;
10816     if((signed int)start>=(signed int)0xC0000000) {
10817       assert(using_tlb);
10818       j=(((u_int)i<<12)+(memory_map[i]<<2)-(u_int)rdram+(u_int)0x80000000)>>12;
10819       invalid_code[j]=0;
10820       memory_map[j]|=0x40000000;
10821       //printf("write protect physical page: %x (virtual %x)\n",j<<12,start);
10822     }
10823 #endif
10824   }
10825   
10826   /* Pass 10 - Free memory by expiring oldest blocks */
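  // The translation cache is treated as a ring buffer.  expirep is a 16-bit
  // phase counter: bits 13-15 select one of 8 regions of the cache
  // (shift=TARGET_SIZE_2-3), bits 11-12 select which cleanup action to run,
  // and the low 11 bits pick the page or bin group within that table.  'end'
  // corresponds to the current output position offset by a quarter of the
  // cycle, so lookup entries pointing into the region that will be overwritten
  // next are dropped before the space is reused.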
10827   
10828   int end=((((int)out-BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
10829   while(expirep!=end)
10830   {
10831     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
10832     int base=BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
10833     inv_debug("EXP: Phase %d\n",expirep);
10834     switch((expirep>>11)&3)
10835     {
10836       case 0:
10837         // Clear jump_in and jump_dirty
10838         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
10839         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
10840         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
10841         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
10842         break;
10843       case 1:
10844         // Clear pointers
10845         ll_kill_pointers(jump_out[expirep&2047],base,shift);
10846         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
10847         break;
10848       case 2:
10849         // Clear hash table
10850         for(i=0;i<32;i++) {
10851           int *ht_bin=hash_table[((expirep&2047)<<5)+i];
10852           if((ht_bin[3]>>shift)==(base>>shift) ||
10853              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10854             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
10855             ht_bin[2]=ht_bin[3]=-1;
10856           }
10857           if((ht_bin[1]>>shift)==(base>>shift) ||
10858              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
10859             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
10860             ht_bin[0]=ht_bin[2];
10861             ht_bin[1]=ht_bin[3];
10862             ht_bin[2]=ht_bin[3]=-1;
10863           }
10864         }
10865         break;
10866       case 3:
10867         // Clear jump_out
10868         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
10869         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
10870         break;
10871     }
10872     expirep=(expirep+1)&65535;
10873   }
10874   return 0;
10875 }
10876
10877 // vim:shiftwidth=2:expandtab