drc: rework for 64bit, part 4
[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 #endif
36
37 #include "new_dynarec_config.h"
38 #include "../psxhle.h" //emulator interface
39 #include "emu_if.h" //emulator interface
40
41 #ifndef ARRAY_SIZE
42 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
43 #endif
44
45 //#define DISASM
46 //#define assem_debug printf
47 //#define inv_debug printf
48 #define assem_debug(...)
49 #define inv_debug(...)
50
51 #ifdef __i386__
52 #include "assem_x86.h"
53 #endif
54 #ifdef __x86_64__
55 #include "assem_x64.h"
56 #endif
57 #ifdef __arm__
58 #include "assem_arm.h"
59 #endif
60
61 #define MAXBLOCK 4096
62 #define MAX_OUTPUT_BLOCK_SIZE 262144
63
// stubs
// Kinds of out-of-line stub code recorded in the 'stubs' array and
// emitted after the main body of a compiled block.
enum stub_type {
  CC_STUB = 1,
  FP_STUB = 2,
  LOADB_STUB = 3,
  LOADH_STUB = 4,
  LOADW_STUB = 5,
  LOADD_STUB = 6,
  LOADBU_STUB = 7,
  LOADHU_STUB = 8,
  STOREB_STUB = 9,
  STOREH_STUB = 10,
  STOREW_STUB = 11,
  STORED_STUB = 12,
  STORELR_STUB = 13,
  INVCODE_STUB = 14,
};
81
// Per-instruction register-allocation state tracked by the recompiler.
// Host-reg indexed fields are bitmasks of host registers; guest-reg
// indexed fields are bitmasks of MIPS registers (upper halves at bit r
// for the reg|64 convention used elsewhere in this file).
struct regstat
{
  signed char regmap_entry[HOST_REGS]; // mapping expected on entry (used for branch targets, see loop_reg)
  signed char regmap[HOST_REGS];       // guest reg held by each host reg, -1 = free
  uint64_t was32;     // guest regs that were 32-bit (sign-extended) before the insn
  uint64_t is32;      // guest regs that are 32-bit after the insn
  uint64_t wasdirty;  // host regs dirty before the insn
  uint64_t dirty;     // host regs dirty after the insn (see dirty_reg)
  uint64_t u;         // unneeded guest regs, lower halves
  uint64_t uu;        // unneeded guest regs, upper halves
  u_int wasconst;     // host regs that held known constants before
  u_int isconst;      // host regs holding known constants now (see set_const)
  u_int loadedconst;             // host regs that have constants loaded
  u_int waswritten;              // MIPS regs that were used as store base before
};
97
// note: asm depends on this layout
// Linked-list node mapping a guest virtual address to compiled code;
// used for the jump_in/jump_out/jump_dirty page tables.
struct ll_entry
{
  u_int vaddr;             // guest virtual address of the block entry
  u_int reg_sv_flags;      // extra flags set via ll_add_flags (0 by default)
  void *addr;              // address of the code in the translation cache
  struct ll_entry *next;   // next node in this page's chain
};
106
// Two-way hash table bin: slot 0 is the most recently installed entry,
// slot 1 the previous one.  vaddr == -1 marks an empty slot (see remove_hash).
struct ht_entry
{
  u_int vaddr[2];    // guest addresses
  void *tcaddr[2];   // corresponding translation-cache addresses
};
112
// One pending out-of-line stub; queued in 'stubs' during assembly and
// emitted after the main block body.
struct code_stub
{
  enum stub_type type;  // what kind of stub to generate
  void *addr;           // location in the block that branches to the stub
  void *retaddr;        // where the stub returns to
  u_int a;              // type-specific arguments (see add_stub callers)
  uintptr_t b;
  uintptr_t c;
  u_int d;
  u_int e;
};
124
// Pending cross-block jump to be patched once the target is known.
struct link_entry
{
  void *addr;    // location of the jump instruction to patch
  u_int target;  // guest address the jump should reach
  u_int ext;     // passed through from add_to_linker; branch-type flag -- TODO confirm semantics at patch site
};
131
132   // used by asm:
133   u_char *out;
134   struct ht_entry hash_table[65536]  __attribute__((aligned(16)));
135   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
136   struct ll_entry *jump_dirty[4096];
137
138   static struct ll_entry *jump_out[4096];
139   static u_int start;
140   static u_int *source;
141   static char insn[MAXBLOCK][10];
142   static u_char itype[MAXBLOCK];
143   static u_char opcode[MAXBLOCK];
144   static u_char opcode2[MAXBLOCK];
145   static u_char bt[MAXBLOCK];
146   static u_char rs1[MAXBLOCK];
147   static u_char rs2[MAXBLOCK];
148   static u_char rt1[MAXBLOCK];
149   static u_char rt2[MAXBLOCK];
150   static u_char us1[MAXBLOCK];
151   static u_char us2[MAXBLOCK];
152   static u_char dep1[MAXBLOCK];
153   static u_char dep2[MAXBLOCK];
154   static u_char lt1[MAXBLOCK];
155   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
156   static uint64_t gte_rt[MAXBLOCK];
157   static uint64_t gte_unneeded[MAXBLOCK];
158   static u_int smrv[32]; // speculated MIPS register values
159   static u_int smrv_strong; // mask or regs that are likely to have correct values
160   static u_int smrv_weak; // same, but somewhat less likely
161   static u_int smrv_strong_next; // same, but after current insn executes
162   static u_int smrv_weak_next;
163   static int imm[MAXBLOCK];
164   static u_int ba[MAXBLOCK];
165   static char likely[MAXBLOCK];
166   static char is_ds[MAXBLOCK];
167   static char ooo[MAXBLOCK];
168   static uint64_t unneeded_reg[MAXBLOCK];
169   static uint64_t unneeded_reg_upper[MAXBLOCK];
170   static uint64_t branch_unneeded_reg[MAXBLOCK];
171   static uint64_t branch_unneeded_reg_upper[MAXBLOCK];
172   static signed char regmap_pre[MAXBLOCK][HOST_REGS];
173   static uint64_t current_constmap[HOST_REGS];
174   static uint64_t constmap[MAXBLOCK][HOST_REGS];
175   static struct regstat regs[MAXBLOCK];
176   static struct regstat branch_regs[MAXBLOCK];
177   static signed char minimum_free_regs[MAXBLOCK];
178   static u_int needed_reg[MAXBLOCK];
179   static u_int wont_dirty[MAXBLOCK];
180   static u_int will_dirty[MAXBLOCK];
181   static int ccadj[MAXBLOCK];
182   static int slen;
183   static void *instr_addr[MAXBLOCK];
184   static struct link_entry link_addr[MAXBLOCK];
185   static int linkcount;
186   static struct code_stub stubs[MAXBLOCK*3];
187   static int stubcount;
188   static u_int literals[1024][2];
189   static int literalcount;
190   static int is_delayslot;
191   static int cop1_usable;
192   static char shadow[1048576]  __attribute__((aligned(16)));
193   static void *copy;
194   static int expirep;
195   static u_int stop_after_jal;
196 #ifndef RAM_FIXED
197   static uintptr_t ram_offset;
198 #else
199   static const uintptr_t ram_offset=0;
200 #endif
201
202   int new_dynarec_hacks;
203   int new_dynarec_did_compile;
204   extern u_char restore_candidate[512];
205   extern int cycle_count;
206
207   /* registers that may be allocated */
208   /* 1-31 gpr */
209 #define HIREG 32 // hi
210 #define LOREG 33 // lo
211 #define FSREG 34 // FPU status (FCSR)
212 #define CSREG 35 // Coprocessor status
213 #define CCREG 36 // Cycle count
214 #define INVCP 37 // Pointer to invalid_code
215 //#define MMREG 38 // Pointer to memory_map
216 //#define ROREG 39 // ram offset (if rdram!=0x80000000)
217 #define TEMPREG 40
218 #define FTEMP 40 // FPU temporary register
219 #define PTEMP 41 // Prefetch temporary register
220 //#define TLREG 42 // TLB mapping offset
221 #define RHASH 43 // Return address hash
222 #define RHTBL 44 // Return address hash table address
223 #define RTEMP 45 // JR/JALR address register
224 #define MAXREG 45
225 #define AGEN1 46 // Address generation temporary register
226 //#define AGEN2 47 // Address generation temporary register
227 //#define MGEN1 48 // Maptable address generation temporary register
228 //#define MGEN2 49 // Maptable address generation temporary register
229 #define BTREG 50 // Branch target temporary register
230
231   /* instruction types */
232 #define NOP 0     // No operation
233 #define LOAD 1    // Load
234 #define STORE 2   // Store
235 #define LOADLR 3  // Unaligned load
236 #define STORELR 4 // Unaligned store
237 #define MOV 5     // Move
238 #define ALU 6     // Arithmetic/logic
239 #define MULTDIV 7 // Multiply/divide
240 #define SHIFT 8   // Shift by register
241 #define SHIFTIMM 9// Shift by immediate
242 #define IMM16 10  // 16-bit immediate
243 #define RJUMP 11  // Unconditional jump to register
244 #define UJUMP 12  // Unconditional jump
245 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
246 #define SJUMP 14  // Conditional branch (regimm format)
247 #define COP0 15   // Coprocessor 0
248 #define COP1 16   // Coprocessor 1
249 #define C1LS 17   // Coprocessor 1 load/store
250 #define FJUMP 18  // Conditional branch (floating point)
251 #define FLOAT 19  // Floating point unit
252 #define FCONV 20  // Convert integer to float
253 #define FCOMP 21  // Floating point compare (sets FSREG)
254 #define SYSCALL 22// SYSCALL
255 #define OTHER 23  // Other
256 #define SPAN 24   // Branch/delay slot spans 2 pages
257 #define NI 25     // Not implemented
258 #define HLECALL 26// PCSX fake opcodes for HLE
259 #define COP2 27   // Coprocessor 2 move
260 #define C2LS 28   // Coprocessor 2 load/store
261 #define C2OP 29   // Coprocessor 2 operation
262 #define INTCALL 30// Call interpreter to handle rare corner cases
263
264   /* branch codes */
265 #define TAKEN 1
266 #define NOTTAKEN 2
267 #define NULLDS 3
268
269 // asm linkage
270 int new_recompile_block(int addr);
271 void *get_addr_ht(u_int vaddr);
272 void invalidate_block(u_int block);
273 void invalidate_addr(u_int addr);
274 void remove_hash(int vaddr);
275 void dyna_linker();
276 void dyna_linker_ds();
277 void verify_code();
278 void verify_code_vm();
279 void verify_code_ds();
280 void cc_interrupt();
281 void fp_exception();
282 void fp_exception_ds();
283 void jump_syscall_hle();
284 void jump_hlecall();
285 void jump_intcall();
286 void new_dyna_leave();
287
288 // Needed by assembler
289 static void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
290 static void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
291 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
292 static void load_all_regs(signed char i_regmap[]);
293 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
294 static void load_regs_entry(int t);
295 static void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
296
297 static int verify_dirty(u_int *ptr);
298 static int get_final_value(int hr, int i, int *value);
299 static void add_stub(enum stub_type type, void *addr, void *retaddr,
300   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e);
301 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
302   int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist);
303 static void add_to_linker(void *addr, u_int target, int ext);
304
/* Toggle protection on a translation-cache range [start, end).
 * is_x != 0 makes it executable (read+exec), is_x == 0 makes it writable
 * (read+write).  Only does anything when NO_WRITE_EXEC is defined, i.e.
 * on platforms that forbid simultaneously writable and executable pages. */
static void mprotect_w_x(void *start, void *end, int is_x)
{
#ifdef NO_WRITE_EXEC
  #if defined(VITA)
  // *Open* enables write on all memory that was
  // allocated by sceKernelAllocMemBlockForVM()?
  if (is_x)
    sceKernelCloseVMDomain();
  else
    sceKernelOpenVMDomain();
  #else
  // Round the start down to page granularity as mprotect requires.
  u_long mstart = (u_long)start & ~4095ul;
  u_long mend = (u_long)end;
  if (mprotect((void *)mstart, mend - mstart,
               PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE)) != 0)
    SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
  #endif
#endif
}
324
// Make the translation-cache range [start, end) writable before
// emitting or patching code in it.
static void start_tcache_write(void *start, void *end)
{
  mprotect_w_x(start, end, 0);
}
329
// Finish writing to the translation-cache range [start, end):
// flush/invalidate the instruction cache for it (per-platform) and
// flip the pages back to executable.
static void end_tcache_write(void *start, void *end)
{
#ifdef __arm__
  size_t len = (char *)end - (char *)start;
  #if   defined(__BLACKBERRY_QNX__)
  msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
  #elif defined(__MACH__)
  sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
  #elif defined(VITA)
  sceKernelSyncVMDomain(sceBlock, start, len);
  #elif defined(_3DS)
  ctr_flush_invalidate_cache();
  #else
  // generic GCC builtin icache flush
  __clear_cache(start, end);
  #endif
  (void)len; // some branches above don't use it
#endif

  mprotect_w_x(start, end, 1);
}
350
// Begin emitting a new block at the current output pointer 'out':
// unprotect up to MAX_OUTPUT_BLOCK_SIZE bytes, clamped to the end of
// the translation cache.  Returns the block's start address.
static void *start_block(void)
{
  u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
  if (end > translation_cache + (1<<TARGET_SIZE_2))
    end = translation_cache + (1<<TARGET_SIZE_2);
  start_tcache_write(out, end);
  return out;
}
359
// Finish the block begun at 'start': everything emitted up to the
// current 'out' pointer is flushed and re-protected as executable.
static void end_block(void *start)
{
  end_tcache_write(start, out);
}
364
365 //#define DEBUG_CYCLE_COUNT 1
366
367 #define NO_CYCLE_PENALTY_THR 12
368
int cycle_multiplier; // 100 for 1.0

// Scale a raw cycle count by cycle_multiplier (a percentage), rounding
// the magnitude half away from zero.
static int CLOCK_ADJUST(int x)
{
  // The old form computed the sign as (x>>31)|1, but right-shifting a
  // negative int is implementation-defined in C; test the sign directly.
  int bias = (x < 0) ? -50 : 50;
  return (x * cycle_multiplier + bias) / 100;
}
376
// Map a guest virtual address to its page index in jump_in/jump_out.
// KSEG mirrors are stripped, RAM mirrors collapsed, and pages beyond
// 2048 are folded into the 2048..4095 range.
static u_int get_page(u_int vaddr)
{
  u_int masked = vaddr & ~0xe0000000u;   // drop KSEG bits
  if (masked < 0x1000000u)
    masked &= ~0x0e00000u;               // collapse RAM mirrors
  u_int page = masked >> 12;
  return (page > 2048u) ? 2048u + (page & 2047u) : page;
}
386
// no virtual mem in PCSX
// Virtual page index: identical to the physical page mapping here.
static u_int get_vpage(u_int vaddr)
{
  return get_page(vaddr);
}
392
393 static struct ht_entry *hash_table_get(u_int vaddr)
394 {
395   return &hash_table[((vaddr>>16)^vaddr)&0xFFFF];
396 }
397
// Install vaddr -> tcaddr in the bin's primary slot, demoting the
// previous primary entry to the secondary slot (the old secondary
// entry is evicted).  Order matters: slot 0 is saved before overwrite.
static void hash_table_add(struct ht_entry *ht_bin, u_int vaddr, void *tcaddr)
{
  ht_bin->vaddr[1] = ht_bin->vaddr[0];
  ht_bin->tcaddr[1] = ht_bin->tcaddr[0];
  ht_bin->vaddr[0] = vaddr;
  ht_bin->tcaddr[0] = tcaddr;
}
405
// some messy ari64's code, seems to rely on unsigned 32bit overflow
// Returns nonzero if 'tcaddr' is far enough from the current output
// pointer that it won't be evicted by cache expiry soon.  The distance
// is scaled so the whole cache maps onto the 32-bit range; wraparound
// of the subtraction is intentional.
static int doesnt_expire_soon(void *tcaddr)
{
  u_int diff = (u_int)((u_char *)tcaddr - out) << (32-TARGET_SIZE_2);
  return diff > (u_int)(0x60000000 + (MAX_OUTPUT_BLOCK_SIZE << (32-TARGET_SIZE_2)));
}
412
// Get address from virtual address
// This is called from the recompiled JR/JALR instructions
// Returns the translation-cache address for 'vaddr'.  Lookup order:
// clean blocks (jump_in), then revalidatable dirty blocks (jump_dirty),
// then compile on demand.  Hits are promoted into the hash table so
// get_addr_ht finds them fast next time.
void *get_addr(u_int vaddr)
{
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  struct ll_entry *head;
  //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
  // 1) clean, already-compiled blocks in this page
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
      // cache the hit for the fast path
      hash_table_add(hash_table_get(vaddr), vaddr, head->addr);
      return head->addr;
    }
    head=head->next;
  }
  // 2) dirty blocks: try to revalidate instead of recompiling
  head=jump_dirty[vpage];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
      // Don't restore blocks which are about to expire from the cache
      if (doesnt_expire_soon(head->addr))
      if (verify_dirty(head->addr)) {
        // source still matches: mark the page valid again and remember
        // this block as a restore candidate
        invalid_code[vaddr>>12]=0;
        inv_code_start=inv_code_end=~0;
        if(vpage<2048) {
          restore_candidate[vpage>>3]|=1<<(vpage&7);
        }
        else restore_candidate[page>>3]|=1<<(page&7);
        struct ht_entry *ht_bin = hash_table_get(vaddr);
        if (ht_bin->vaddr[0] == vaddr)
          ht_bin->tcaddr[0] = head->addr; // Replace existing entry
        else
          hash_table_add(ht_bin, vaddr, head->addr);

        return head->addr;
      }
    }
    head=head->next;
  }
  // 3) not found anywhere: compile the block now
  int r=new_recompile_block(vaddr);
  if(r==0) return get_addr(vaddr);
  // Execute in unmapped page, generate pagefault execption
  // (inherited from the mupen64plus TLB path; raises an exception and
  // resumes at the exception vector)
  Status|=2;
  Cause=(vaddr<<31)|0x8;
  EPC=(vaddr&1)?vaddr-5:vaddr;
  BadVAddr=(vaddr&~1);
  Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
  EntryHi=BadVAddr&0xFFFFE000;
  return get_addr_ht(0x80000000);
}
467 // Look up address in hash table first
468 void *get_addr_ht(u_int vaddr)
469 {
470   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
471   const struct ht_entry *ht_bin = hash_table_get(vaddr);
472   if (ht_bin->vaddr[0] == vaddr) return ht_bin->tcaddr[0];
473   if (ht_bin->vaddr[1] == vaddr) return ht_bin->tcaddr[1];
474   return get_addr(vaddr);
475 }
476
477 void clear_all_regs(signed char regmap[])
478 {
479   int hr;
480   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
481 }
482
483 signed char get_reg(signed char regmap[],int r)
484 {
485   int hr;
486   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
487   return -1;
488 }
489
490 // Find a register that is available for two consecutive cycles
491 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
492 {
493   int hr;
494   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
495   return -1;
496 }
497
498 int count_free_regs(signed char regmap[])
499 {
500   int count=0;
501   int hr;
502   for(hr=0;hr<HOST_REGS;hr++)
503   {
504     if(hr!=EXCLUDE_REG) {
505       if(regmap[hr]<0) count++;
506     }
507   }
508   return count;
509 }
510
511 void dirty_reg(struct regstat *cur,signed char reg)
512 {
513   int hr;
514   if(!reg) return;
515   for (hr=0;hr<HOST_REGS;hr++) {
516     if((cur->regmap[hr]&63)==reg) {
517       cur->dirty|=1<<hr;
518     }
519   }
520 }
521
// If we dirty the lower half of a 64 bit register which is now being
// sign-extended, we need to dump the upper half.
// Note: Do this only after completion of the instruction, because
// some instructions may need to read the full 64-bit value even if
// overwriting it (eg SLTI, DSRA32).
static void flush_dirty_uppers(struct regstat *cur)
{
  int hr,reg;
  for (hr=0;hr<HOST_REGS;hr++) {
    if((cur->dirty>>hr)&1) {
      reg=cur->regmap[hr];
      // reg>=64 means this host reg holds the upper half of guest reg
      // (reg&63); if that guest reg is now 32-bit, the mapping is stale
      // and is simply discarded.
      if(reg>=64)
        if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
    }
  }
}
538
// Record that guest register 'reg' holds the known constant 'value'.
// Any host reg mapped to reg gets the low 32 bits; a host reg mapped
// to reg|64 (the upper half; matched via ^64) gets the high 32 bits.
// r0 is ignored (always zero).
void set_const(struct regstat *cur,signed char reg,uint64_t value)
{
  int hr;
  if(!reg) return;
  for (hr=0;hr<HOST_REGS;hr++) {
    if(cur->regmap[hr]==reg) {
      cur->isconst|=1<<hr;
      current_constmap[hr]=value;
    }
    else if((cur->regmap[hr]^64)==reg) {
      // upper-half mapping: store the high word
      cur->isconst|=1<<hr;
      current_constmap[hr]=value>>32;
    }
  }
}
554
555 void clear_const(struct regstat *cur,signed char reg)
556 {
557   int hr;
558   if(!reg) return;
559   for (hr=0;hr<HOST_REGS;hr++) {
560     if((cur->regmap[hr]&63)==reg) {
561       cur->isconst&=~(1<<hr);
562     }
563   }
564 }
565
566 int is_const(struct regstat *cur,signed char reg)
567 {
568   int hr;
569   if(reg<0) return 0;
570   if(!reg) return 1;
571   for (hr=0;hr<HOST_REGS;hr++) {
572     if((cur->regmap[hr]&63)==reg) {
573       return (cur->isconst>>hr)&1;
574     }
575   }
576   return 0;
577 }
// Return the constant tracked for guest register 'reg' (r0 yields 0).
// The caller must have checked is_const() first: if no host reg maps
// to 'reg' this is a fatal internal error.
uint64_t get_const(struct regstat *cur,signed char reg)
{
  int hr;
  if(!reg) return 0;
  for (hr=0;hr<HOST_REGS;hr++) {
    if(cur->regmap[hr]==reg) {
      return current_constmap[hr];
    }
  }
  SysPrintf("Unknown constant in r%d\n",reg);
  exit(1);
}
590
// Least soon needed registers
// Look at the next ten instructions and see which registers
// will be used.  Try not to reallocate these.
// hsn[r] is set to the distance (in insns) to the next use of guest
// register r; smaller = needed sooner.  preferred_reg is currently unused.
void lsn(u_char hsn[], int i, int *preferred_reg)
{
  int j;
  int b=-1;
  // Determine how far ahead to scan: stop at block end, or just past
  // an unconditional jump (its delay slot is included via j++).
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
  }
  // Walk backwards so the nearest use wins.
  for(;j>=0;j--)
  {
    if(rs1[i+j]) hsn[rs1[i+j]]=j;
    if(rs2[i+j]) hsn[rs2[i+j]]=j;
    if(rt1[i+j]) hsn[rt1[i+j]]=j;
    if(rt2[i+j]) hsn[rt2[i+j]]=j;
    if(itype[i+j]==STORE || itype[i+j]==STORELR) {
      // Stores can allocate zero
      hsn[rs1[i+j]]=j;
      hsn[rs2[i+j]]=j;
    }
    // On some architectures stores need invc_ptr
    #if defined(HOST_IMM8)
    if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
      hsn[INVCP]=j;
    }
    #endif
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
    {
      // Branches need the cycle count register
      hsn[CCREG]=j;
      b=j;   // remember the last branch seen
    }
  }
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        // +2 accounts for the branch and its delay slot
        if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
        if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
        //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
        //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
      }
    }
    // TODO: preferred register based on backward branch
  }
  // Delay slot should preferably not overwrite branch conditions or cycle count
  if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
    if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
    if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
    hsn[CCREG]=1;
    // ...or hash tables
    hsn[RHASH]=1;
    hsn[RHTBL]=1;
  }
  // Coprocessor load/store needs FTEMP, even if not declared
  if(itype[i]==C1LS||itype[i]==C2LS) {
    hsn[FTEMP]=0;
  }
  // Load L/R also uses FTEMP as a temporary register
  if(itype[i]==LOADLR) {
    hsn[FTEMP]=0;
  }
  // Also SWL/SWR/SDL/SDR
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
    hsn[FTEMP]=0;
  }
  // Don't remove the miniht registers
  if(itype[i]==UJUMP||itype[i]==RJUMP)
  {
    hsn[RHASH]=0;
    hsn[RHTBL]=0;
  }
}
679
// We only want to allocate registers if we're going to use them again soon
// Returns nonzero if guest register r is read within the next few
// instructions (and not marked unneeded before that use).
int needed_again(int r, int i)
{
  int j;
  int b=-1;
  int rn=10;  // distance to next use; 10 == "not used soon"

  if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
  {
    if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
      return 0; // Don't need any registers if exiting the block
  }
  // Determine scan distance: stop at block end, past an unconditional
  // jump, or at a syscall/HLE call/interpreter fallback (0x0d = BREAK).
  for(j=0;j<9;j++)
  {
    if(i+j>=slen) {
      j=slen-i-1;
      break;
    }
    if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
    {
      // Don't go past an unconditonal jump
      j++;
      break;
    }
    if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
    {
      break;
    }
  }
  // Walk backwards; nearest read wins, but a point where the register
  // becomes unneeded resets the distance.
  for(;j>=1;j--)
  {
    if(rs1[i+j]==r) rn=j;
    if(rs2[i+j]==r) rn=j;
    if((unneeded_reg[i+j]>>r)&1) rn=10;
    if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
    {
      b=j;
    }
  }
  /*
  if(b>=0)
  {
    if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
    {
      // Follow first branch
      int o=rn;
      int t=(ba[i+b]-start)>>2;
      j=7-b;if(t+j>=slen) j=slen-t-1;
      for(;j>=0;j--)
      {
        if(!((unneeded_reg[t+j]>>r)&1)) {
          if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
          if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
        }
        else rn=o;
      }
    }
  }*/
  if(rn<10) return 1;
  (void)b;
  return 0;
}
742
743 // Try to match register allocations at the end of a loop with those
744 // at the beginning
745 int loop_reg(int i, int r, int hr)
746 {
747   int j,k;
748   for(j=0;j<9;j++)
749   {
750     if(i+j>=slen) {
751       j=slen-i-1;
752       break;
753     }
754     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
755     {
756       // Don't go past an unconditonal jump
757       j++;
758       break;
759     }
760   }
761   k=0;
762   if(i>0){
763     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
764       k--;
765   }
766   for(;k<j;k++)
767   {
768     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
769     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
770     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
771     {
772       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
773       {
774         int t=(ba[i+k]-start)>>2;
775         int reg=get_reg(regs[t].regmap_entry,r);
776         if(reg>=0) return reg;
777         //reg=get_reg(regs[t+1].regmap_entry,r);
778         //if(reg>=0) return reg;
779       }
780     }
781   }
782   return hr;
783 }
784
785
786 // Allocate every register, preserving source/target regs
787 void alloc_all(struct regstat *cur,int i)
788 {
789   int hr;
790
791   for(hr=0;hr<HOST_REGS;hr++) {
792     if(hr!=EXCLUDE_REG) {
793       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
794          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
795       {
796         cur->regmap[hr]=-1;
797         cur->dirty&=~(1<<hr);
798       }
799       // Don't need zeros
800       if((cur->regmap[hr]&63)==0)
801       {
802         cur->regmap[hr]=-1;
803         cur->dirty&=~(1<<hr);
804       }
805     }
806   }
807 }
808
809 #ifdef __i386__
810 #include "assem_x86.c"
811 #endif
812 #ifdef __x86_64__
813 #include "assem_x64.c"
814 #endif
815 #ifdef __arm__
816 #include "assem_arm.c"
817 #endif
818
819 // Add virtual address mapping to linked list
820 void ll_add(struct ll_entry **head,int vaddr,void *addr)
821 {
822   struct ll_entry *new_entry;
823   new_entry=malloc(sizeof(struct ll_entry));
824   assert(new_entry!=NULL);
825   new_entry->vaddr=vaddr;
826   new_entry->reg_sv_flags=0;
827   new_entry->addr=addr;
828   new_entry->next=*head;
829   *head=new_entry;
830 }
831
// Same as ll_add, but also sets reg_sv_flags on the newly added node
// (ll_add prepends, so *head is the new entry).
void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
{
  ll_add(head,vaddr,addr);
  (*head)->reg_sv_flags=reg_sv_flags;
}
837
// Check if an address is already compiled
// but don't return addresses which are about to expire from the cache
// Returns the translation-cache address or 0.  Hash hits are verified
// for cleanliness; jump_in hits are inserted into the hash table only
// if an empty slot exists (no eviction).
void *check_addr(u_int vaddr)
{
  struct ht_entry *ht_bin = hash_table_get(vaddr);
  size_t i;
  for (i = 0; i < ARRAY_SIZE(ht_bin->vaddr); i++) {
    if (ht_bin->vaddr[i] == vaddr)
      // bias by MAX_OUTPUT_BLOCK_SIZE so the whole block is safe
      if (doesnt_expire_soon((u_char *)ht_bin->tcaddr[i] - MAX_OUTPUT_BLOCK_SIZE))
        if (isclean(ht_bin->tcaddr[i]))
          return ht_bin->tcaddr[i];
  }
  u_int page=get_page(vaddr);
  struct ll_entry *head;
  head=jump_in[page];
  while (head != NULL) {
    if (head->vaddr == vaddr) {
      if (doesnt_expire_soon(head->addr)) {
        // Update existing entry with current address
        if (ht_bin->vaddr[0] == vaddr) {
          ht_bin->tcaddr[0] = head->addr;
          return head->addr;
        }
        if (ht_bin->vaddr[1] == vaddr) {
          ht_bin->tcaddr[1] = head->addr;
          return head->addr;
        }
        // Insert into hash table with low priority.
        // Don't evict existing entries, as they are probably
        // addresses that are being accessed frequently.
        if (ht_bin->vaddr[0] == -1) {
          ht_bin->vaddr[0] = vaddr;
          ht_bin->tcaddr[0] = head->addr;
        }
        else if (ht_bin->vaddr[1] == -1) {
          ht_bin->vaddr[1] = vaddr;
          ht_bin->tcaddr[1] = head->addr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  return 0;
}
883
// Remove all hash-table entries for 'vaddr'.  The secondary slot is
// cleared first; if the primary slot matches, the (possibly just
// cleared) secondary is promoted into it so slot 0 stays the valid one.
void remove_hash(int vaddr)
{
  //printf("remove hash: %x\n",vaddr);
  struct ht_entry *ht_bin = hash_table_get(vaddr);
  if (ht_bin->vaddr[1] == vaddr) {
    ht_bin->vaddr[1] = -1;  // -1 marks an empty slot
    ht_bin->tcaddr[1] = NULL;
  }
  if (ht_bin->vaddr[0] == vaddr) {
    ht_bin->vaddr[0] = ht_bin->vaddr[1];
    ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
    ht_bin->vaddr[1] = -1;
    ht_bin->tcaddr[1] = NULL;
  }
}
899
// Remove (and free) all list nodes whose code address falls in the
// same 'shift'-sized region as 'addr' (either directly or within
// MAX_OUTPUT_BLOCK_SIZE before it); their hash entries are dropped too.
// Used when a region of the translation cache expires.
void ll_remove_matching_addrs(struct ll_entry **head,uintptr_t addr,int shift)
{
  struct ll_entry *next;
  while(*head) {
    // NOTE(review): (*head)->addr-MAX_OUTPUT_BLOCK_SIZE is arithmetic on
    // void* — a GCC extension that treats it as byte arithmetic.
    if(((uintptr_t)((*head)->addr)>>shift)==(addr>>shift) ||
       ((uintptr_t)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
    {
      inv_debug("EXP: Remove pointer to %p (%x)\n",(*head)->addr,(*head)->vaddr);
      remove_hash((*head)->vaddr);
      next=(*head)->next;
      free(*head);
      *head=next;   // unlink in place; do not advance
    }
    else
    {
      head=&((*head)->next);
    }
  }
}
919
920 // Remove all entries from linked list
921 void ll_clear(struct ll_entry **head)
922 {
923   struct ll_entry *cur;
924   struct ll_entry *next;
925   if((cur=*head)) {
926     *head=0;
927     while(cur) {
928       next=cur->next;
929       free(cur);
930       cur=next;
931     }
932   }
933 }
934
// Dereference the pointers and remove if it matches
// For every node, if the external jump it points at targets the
// expiring 'shift'-sized region around 'addr', redirect that jump back
// to its own patch site (unlinking it from the doomed code).
static void ll_kill_pointers(struct ll_entry *head,uintptr_t addr,int shift)
{
  while(head) {
    uintptr_t ptr = (uintptr_t)get_pointer(head->addr);
    inv_debug("EXP: Lookup pointer to %lx at %p (%x)\n",(long)ptr,head->addr,head->vaddr);
    if(((ptr>>shift)==(addr>>shift)) ||
       (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
    {
      inv_debug("EXP: Kill pointer at %p (%x)\n",head->addr,head->vaddr);
      void *host_addr=find_extjump_insn(head->addr);
      #ifdef __arm__
        // modified code must be flushed from the icache later
        mark_clear_cache(host_addr);
      #endif
      set_jump_target(host_addr, head->addr);
    }
    head=head->next;
  }
}
954
// This is called when we write to a compiled block (see do_invstub)
// Drops every compiled block registered in 'page' (jump_in) and
// unlinks every external jump recorded for it (jump_out), freeing
// the list nodes and their hash entries.
void invalidate_page(u_int page)
{
  struct ll_entry *head;
  struct ll_entry *next;
  // 1) discard compiled entry points in this page
  head=jump_in[page];
  jump_in[page]=0;
  while(head!=NULL) {
    inv_debug("INVALIDATE: %x\n",head->vaddr);
    remove_hash(head->vaddr);
    next=head->next;
    free(head);
    head=next;
  }
  // 2) re-route jumps out of other blocks that linked into this page
  head=jump_out[page];
  jump_out[page]=0;
  while(head!=NULL) {
    inv_debug("INVALIDATE: kill pointer to %x (%p)\n",head->vaddr,head->addr);
    void *host_addr=find_extjump_insn(head->addr);
    #ifdef __arm__
      mark_clear_cache(host_addr);
    #endif
    set_jump_target(host_addr, head->addr);
    next=head->next;
    free(head);
    head=next;
  }
}
983
// Invalidate the page containing 'block' plus the adjacent pages
// [first, last] that blocks spanning a 4K boundary may touch.
static void invalidate_block_range(u_int block, u_int first, u_int last)
{
  u_int page=get_page(block<<12);
  //printf("first=%d last=%d\n",first,last);
  invalidate_page(page);
  assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
  assert(last<page+5);
  // Invalidate the adjacent pages if a block crosses a 4K boundary
  while(first<page) {
    invalidate_page(first);
    first++;
  }
  for(first=page+1;first<last;first++) {
    invalidate_page(first);
  }
  #ifdef __arm__
    do_clear_cache();
  #endif

  // Don't trap writes
  invalid_code[block]=1;

  #ifdef USE_MINI_HT
  memset(mini_ht,-1,sizeof(mini_ht));
  #endif
}
1010
// Invalidate the 4K guest page 'block' (page index, i.e. vaddr>>12).
// Dirty blocks overlapping the page are inspected to widen the range
// of RAM pages that must be invalidated alongside it.
void invalidate_block(u_int block)
{
  u_int page=get_page(block<<12);
  u_int vpage=get_vpage(block<<12);
  inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
  //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
  u_int first,last;
  first=last=page;
  struct ll_entry *head;
  head=jump_dirty[vpage];
  //printf("page=%d vpage=%d\n",page,vpage);
  while(head!=NULL) {
    if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
      u_char *start, *end;
      // bounds of the guest code this dirty block was compiled from
      get_bounds(head->addr, &start, &end);
      //printf("start: %p end: %p\n", start, end);
      if (page < 2048 && start >= rdram && end < rdram+RAM_SIZE) {
        if (((start-rdram)>>12) <= page && ((end-1-rdram)>>12) >= page) {
          // widen [first, last] to cover every RAM page the block spans
          if ((((start-rdram)>>12)&2047) < first) first = ((start-rdram)>>12)&2047;
          if ((((end-1-rdram)>>12)&2047) > last)  last = ((end-1-rdram)>>12)&2047;
        }
      }
    }
    head=head->next;
  }
  invalidate_block_range(block,first,last);
}
1038
// Invalidate compiled code overlapping a written address.  For RAM
// writes it also maintains [inv_code_start, inv_code_end], a cached
// range known to contain no compiled code, so future writes inside it
// can be skipped cheaply by the caller.
void invalidate_addr(u_int addr)
{
  //static int rhits;
  // this check is done by the caller
  //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
  u_int page=get_vpage(addr);
  if(page<2048) { // RAM
    struct ll_entry *head;
    u_int addr_min=~0, addr_max=0;
    u_int mask=RAM_SIZE-1;
    u_int addr_main=0x80000000|(addr&mask);
    int pg1;
    // Start with the whole 4K page as the candidate "no code here" range
    inv_code_start=addr_main&~0xfff;
    inv_code_end=addr_main|0xfff;
    pg1=page;
    if (pg1>0) {
      // must check previous page too because of spans..
      pg1--;
      inv_code_start-=0x1000;
    }
    for(;pg1<=page;pg1++) {
      for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
        u_char *start_h, *end_h;
        u_int start, end;
        get_bounds(head->addr, &start_h, &end_h);
        // Convert host pointers back to guest addresses
        start = (uintptr_t)start_h - ram_offset;
        end = (uintptr_t)end_h - ram_offset;
        if(start<=addr_main&&addr_main<end) {
          // Hit: grow the range of code that must be invalidated
          if(start<addr_min) addr_min=start;
          if(end>addr_max) addr_max=end;
        }
        else if(addr_main<start) {
          // Block lies above the write: shrink the cached code-free range
          if(start<inv_code_end)
            inv_code_end=start-1;
        }
        else {
          // Block lies below the write: shrink from the other side
          if(end>inv_code_start)
            inv_code_start=end;
        }
      }
    }
    if (addr_min!=~0) {
      inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
      inv_code_start=inv_code_end=~0;
      invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
      return;
    }
    else {
      // No code at this address; remember the range so the fast path can
      // skip future writes into it (rebased to the written mirror).
      inv_code_start=(addr&~mask)|(inv_code_start&mask);
      inv_code_end=(addr&~mask)|(inv_code_end&mask);
      inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
      return;
    }
  }
  invalidate_block(addr>>12);
}
1095
1096 // This is called when loading a save state.
1097 // Anything could have changed, so invalidate everything.
1098 void invalidate_all_pages()
1099 {
1100   u_int page;
1101   for(page=0;page<4096;page++)
1102     invalidate_page(page);
1103   for(page=0;page<1048576;page++)
1104     if(!invalid_code[page]) {
1105       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1106       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1107     }
1108   #ifdef USE_MINI_HT
1109   memset(mini_ht,-1,sizeof(mini_ht));
1110   #endif
1111 }
1112
1113 // Add an entry to jump_out after making a link
1114 void add_link(u_int vaddr,void *src)
1115 {
1116   u_int page=get_page(vaddr);
1117   inv_debug("add_link: %p -> %x (%d)\n",src,vaddr,page);
1118   int *ptr=(int *)(src+4);
1119   assert((*ptr&0x0fff0000)==0x059f0000);
1120   (void)ptr;
1121   ll_add(jump_out+page,vaddr,src);
1122   //void *ptr=get_pointer(src);
1123   //inv_debug("add_link: Pointer is to %p\n",ptr);
1124 }
1125
// If a code block was found to be unmodified (bit was set in
// restore_candidate) and it remains unmodified (bit is clear
// in invalid_code) then move the entries for that 4K page from
// the dirty list to the clean list.
void clean_blocks(u_int page)
{
  struct ll_entry *head;
  inv_debug("INV: clean_blocks page=%d\n",page);
  head=jump_dirty[page];
  while(head!=NULL) {
    if(!invalid_code[head->vaddr>>12]) {
      // Don't restore blocks which are about to expire from the cache
      if (doesnt_expire_soon(head->addr)) {
        // Verify the source code still matches what was compiled
        if(verify_dirty(head->addr)) {
          u_char *start, *end;
          //printf("Possibly Restore %x (%p)\n",head->vaddr, head->addr);
          u_int i;
          u_int inv=0;
          get_bounds(head->addr, &start, &end);
          if (start - rdram < RAM_SIZE) {
            // Every page the block spans must still be valid
            for (i = (start-rdram+0x80000000)>>12; i <= (end-1-rdram+0x80000000)>>12; i++) {
              inv|=invalid_code[i];
            }
          }
          else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
            // Outside RAM: can't verify, don't restore
            inv=1;
          }
          if(!inv) {
            void *clean_addr = get_clean_addr(head->addr);
            if (doesnt_expire_soon(clean_addr)) {
              u_int ppage=page;
              inv_debug("INV: Restored %x (%p/%p)\n",head->vaddr, head->addr, clean_addr);
              //printf("page=%x, addr=%x\n",page,head->vaddr);
              //assert(head->vaddr>>12==(page|0x80000));
              // Re-register the clean entry point and patch the hash table
              ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
              struct ht_entry *ht_bin = hash_table_get(head->vaddr);
              if (ht_bin->vaddr[0] == head->vaddr)
                ht_bin->tcaddr[0] = clean_addr; // Replace existing entry
              if (ht_bin->vaddr[1] == head->vaddr)
                ht_bin->tcaddr[1] = clean_addr; // Replace existing entry
            }
          }
        }
      }
    }
    head=head->next;
  }
}
1174
1175
1176 void mov_alloc(struct regstat *current,int i)
1177 {
1178   // Note: Don't need to actually alloc the source registers
1179   if((~current->is32>>rs1[i])&1) {
1180     //alloc_reg64(current,i,rs1[i]);
1181     alloc_reg64(current,i,rt1[i]);
1182     current->is32&=~(1LL<<rt1[i]);
1183   } else {
1184     //alloc_reg(current,i,rs1[i]);
1185     alloc_reg(current,i,rt1[i]);
1186     current->is32|=(1LL<<rt1[i]);
1187   }
1188   clear_const(current,rs1[i]);
1189   clear_const(current,rt1[i]);
1190   dirty_reg(current,rt1[i]);
1191 }
1192
// Register allocation for shift-by-immediate.  Only the 32-bit forms
// (SLL/SRL/SRA) are supported; the MIPS-III 64-bit forms hit assert(0)
// since they are never emitted for this target.
void shiftimm_alloc(struct regstat *current,int i)
{
  if(opcode2[i]<=0x3) // SLL/SRL/SRA
  {
    if(rt1[i]) {
      if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
      else lt1[i]=rs1[i];
      alloc_reg(current,i,rt1[i]);
      current->is32|=1LL<<rt1[i];
      dirty_reg(current,rt1[i]);
      // Constant-propagate through the shift when the source is known
      if(is_const(current,rs1[i])) {
        int v=get_const(current,rs1[i]);
        if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
        if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]); // logical shift
        if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]); // arithmetic shift
      }
      else clear_const(current,rt1[i]);
    }
  }
  else
  {
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }

  if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
  {
    assert(0);
  }
  if(opcode2[i]==0x3c) // DSLL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3e) // DSRL32
  {
    assert(0);
  }
  if(opcode2[i]==0x3f) // DSRA32
  {
    assert(0);
  }
}
1235
// Register allocation for shift-by-register (SLLV/SRLV/SRAV and the
// 64-bit D* variants).  32-bit results are marked in is32; the 64-bit
// right shifts additionally need a scratch register.
void shift_alloc(struct regstat *current,int i)
{
  if(rt1[i]) {
    if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
    {
      if(rs1[i]) alloc_reg(current,i,rs1[i]);
      if(rs2[i]) alloc_reg(current,i,rs2[i]);
      alloc_reg(current,i,rt1[i]);
      if(rt1[i]==rs2[i]) {
        // Destination aliases the shift amount: need a scratch register
        alloc_reg_temp(current,i,-1);
        minimum_free_regs[i]=1;
      }
      current->is32|=1LL<<rt1[i];
    } else { // DSLLV/DSRLV/DSRAV
      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
      if(rs2[i]) alloc_reg(current,i,rs2[i]);
      alloc_reg64(current,i,rt1[i]);
      current->is32&=~(1LL<<rt1[i]);
      if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
      {
        alloc_reg_temp(current,i,-1);
        minimum_free_regs[i]=1;
      }
    }
    clear_const(current,rs1[i]);
    clear_const(current,rs2[i]);
    clear_const(current,rt1[i]);
    dirty_reg(current,rt1[i]);
  }
}
1266
// Register allocation for three-register ALU ops (ADD/SUB family,
// SLT/SLTU, AND/OR/XOR/NOR, and the 64-bit DADD/DSUB family).  Besides
// allocating host registers this tracks whether the result fits in
// 32 bits via current->is32.
void alu_alloc(struct regstat *current,int i)
{
  if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
      }
      else {
        // One operand is r0: sources only needed if used again later
        if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
        if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
      }
      alloc_reg(current,i,rt1[i]);
    }
    // 32-bit add/sub always produces a 32-bit result
    current->is32|=1LL<<rt1[i];
  }
  if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
    if(rt1[i]) {
      // If either source is 64-bit, compare in 64 bits
      if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
      {
        alloc_reg64(current,i,rs1[i]);
        alloc_reg64(current,i,rs2[i]);
        alloc_reg(current,i,rt1[i]);
      } else {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
        alloc_reg(current,i,rt1[i]);
      }
    }
    // Comparison result is 0 or 1, always 32-bit
    current->is32|=1LL<<rt1[i];
  }
  if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        alloc_reg(current,i,rs1[i]);
        alloc_reg(current,i,rs2[i]);
      }
      else
      {
        if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
        if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
      }
      alloc_reg(current,i,rt1[i]);
      // If either source is 64-bit the result is 64-bit too
      if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
      {
        if(!((current->uu>>rt1[i])&1)) {
          // Upper half of the result is actually needed
          alloc_reg64(current,i,rt1[i]);
        }
        if(get_reg(current->regmap,rt1[i]|64)>=0) {
          if(rs1[i]&&rs2[i]) {
            alloc_reg64(current,i,rs1[i]);
            alloc_reg64(current,i,rs2[i]);
          }
          else
          {
            // Is it really worth it to keep 64-bit values in registers?
            #ifdef NATIVE_64BIT
            if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
            if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
            #endif
          }
        }
        current->is32&=~(1LL<<rt1[i]);
      } else {
        current->is32|=1LL<<rt1[i];
      }
    }
  }
  if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
    if(rt1[i]) {
      if(rs1[i]&&rs2[i]) {
        // Full 64-bit op only if the upper half of the result is live
        if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
          alloc_reg64(current,i,rs1[i]);
          alloc_reg64(current,i,rs2[i]);
          alloc_reg64(current,i,rt1[i]);
        } else {
          alloc_reg(current,i,rs1[i]);
          alloc_reg(current,i,rs2[i]);
          alloc_reg(current,i,rt1[i]);
        }
      }
      else {
        alloc_reg(current,i,rt1[i]);
        if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
          // DADD used as move, or zeroing
          // If we have a 64-bit source, then make the target 64 bits too
          if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
            if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
            alloc_reg64(current,i,rt1[i]);
          } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
            if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
            alloc_reg64(current,i,rt1[i]);
          }
          if(opcode2[i]>=0x2e&&rs2[i]) {
            // DSUB used as negation - 64-bit result
            // If we have a 32-bit register, extend it to 64 bits
            if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
            alloc_reg64(current,i,rt1[i]);
          }
        }
      }
      // Track the result width: 64-bit unless the single source was 32-bit
      if(rs1[i]&&rs2[i]) {
        current->is32&=~(1LL<<rt1[i]);
      } else if(rs1[i]) {
        current->is32&=~(1LL<<rt1[i]);
        if((current->is32>>rs1[i])&1)
          current->is32|=1LL<<rt1[i];
      } else if(rs2[i]) {
        current->is32&=~(1LL<<rt1[i]);
        if((current->is32>>rs2[i])&1)
          current->is32|=1LL<<rt1[i];
      } else {
        current->is32|=1LL<<rt1[i];
      }
    }
  }
  clear_const(current,rs1[i]);
  clear_const(current,rs2[i]);
  clear_const(current,rt1[i]);
  dirty_reg(current,rt1[i]);
}
1388
// Register allocation for I-type (16-bit immediate) instructions:
// DADDI(U), SLTI(U), ANDI/ORI/XORI, ADDI(U) and LUI.  Performs constant
// propagation where the source value is known.
void imm16_alloc(struct regstat *current,int i)
{
  if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  else lt1[i]=rs1[i];
  if(rt1[i]) alloc_reg(current,i,rt1[i]);
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    current->is32&=~(1LL<<rt1[i]);
    if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
      // TODO: Could preserve the 32-bit flag if the immediate is zero
      alloc_reg64(current,i,rt1[i]);
      alloc_reg64(current,i,rs1[i]);
    }
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    // Need the full 64-bit source value if it isn't known to be 32-bit
    if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
    current->is32|=1LL<<rt1[i];
    clear_const(current,rs1[i]);
    clear_const(current,rt1[i]);
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
      // ORI/XORI with a 64-bit source keeps the upper bits
      if(rs1[i]!=rt1[i]) {
        if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
        alloc_reg64(current,i,rt1[i]);
        current->is32&=~(1LL<<rt1[i]);
      }
    }
    else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
    if(is_const(current,rs1[i])) {
      int v=get_const(current,rs1[i]);
      if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
      if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
      if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
    }
    else clear_const(current,rt1[i]);
  }
  else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(is_const(current,rs1[i])) {
      int v=get_const(current,rs1[i]);
      set_const(current,rt1[i],v+imm[i]);
    }
    else clear_const(current,rt1[i]);
    current->is32|=1LL<<rt1[i];
  }
  else {
    // LUI result is always a known constant
    set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
    current->is32|=1LL<<rt1[i];
  }
  dirty_reg(current,rt1[i]);
}
1441
// Register allocation for loads.  Handles 32-bit loads, the 64-bit
// LWU/LD and LDL/LDR forms, and the unaligned LWL/LWR forms which need
// the old destination value in a temporary.
void load_alloc(struct regstat *current,int i)
{
  clear_const(current,rt1[i]);
  //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
  if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  if(rt1[i]&&!((current->u>>rt1[i])&1)) {
    alloc_reg(current,i,rt1[i]);
    assert(get_reg(current->regmap,rt1[i])>=0);
    if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
    {
      current->is32&=~(1LL<<rt1[i]);
      alloc_reg64(current,i,rt1[i]);
    }
    else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      // Unaligned 64-bit load: takes all registers plus FTEMP
      current->is32&=~(1LL<<rt1[i]);
      alloc_reg64(current,i,rt1[i]);
      alloc_all(current,i);
      alloc_reg64(current,i,FTEMP);
      minimum_free_regs[i]=HOST_REGS;
    }
    else current->is32|=1LL<<rt1[i];
    dirty_reg(current,rt1[i]);
    // LWL/LWR need a temporary register for the old value
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP);
      alloc_reg_temp(current,i,-1);
      minimum_free_regs[i]=1;
    }
  }
  else
  {
    // Load to r0 or unneeded register (dummy load)
    // but we still need a register to calculate the address
    if(opcode[i]==0x22||opcode[i]==0x26)
    {
      alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
    }
    alloc_reg_temp(current,i,-1);
    minimum_free_regs[i]=1;
    if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
    {
      alloc_all(current,i);
      alloc_reg64(current,i,FTEMP);
      minimum_free_regs[i]=HOST_REGS;
    }
  }
}
1492
// Register allocation for stores: base (rs1) and data (rs2) registers,
// FTEMP for unaligned/64-bit forms, and a temporary for address
// generation.
void store_alloc(struct regstat *current,int i)
{
  clear_const(current,rs2[i]);
  if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,rs2[i]);
  if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
    alloc_reg64(current,i,rs2[i]);
    if(rs2[i]) alloc_reg(current,i,FTEMP);
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else alloc_reg(current,i,INVCP);
  #endif
  if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
    alloc_reg(current,i,FTEMP);
  }
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  minimum_free_regs[i]=1;
}
1514
// Register allocation for COP1 loads/stores (LWC1/SWC1/LDC1/SDC1).
void c1ls_alloc(struct regstat *current,int i)
{
  //clear_const(current,rs1[i]); // FIXME
  clear_const(current,rt1[i]);
  if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
  alloc_reg(current,i,CSREG); // Status
  alloc_reg(current,i,FTEMP);
  if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
    alloc_reg64(current,i,FTEMP);
  }
  #if defined(HOST_IMM8)
  // On CPUs without 32-bit immediates we need a pointer to invalid_code
  else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
    alloc_reg(current,i,INVCP);
  #endif
  // We need a temporary register for address generation
  alloc_reg_temp(current,i,-1);
  // NOTE(review): unlike store_alloc/c2ls_alloc, minimum_free_regs[i]
  // is not set here — confirm whether that is intentional.
}
1533
1534 void c2ls_alloc(struct regstat *current,int i)
1535 {
1536   clear_const(current,rt1[i]);
1537   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1538   alloc_reg(current,i,FTEMP);
1539   #if defined(HOST_IMM8)
1540   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1541   if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1542     alloc_reg(current,i,INVCP);
1543   #endif
1544   // We need a temporary register for address generation
1545   alloc_reg_temp(current,i,-1);
1546   minimum_free_regs[i]=1;
1547 }
1548
#ifndef multdiv_alloc
// Register allocation for multiply/divide.  Results go to HI/LO; the
// 64-bit forms need every host register.  If either operand is r0 the
// result is forced to zero without doing the operation.
void multdiv_alloc(struct regstat *current,int i)
{
  //  case 0x18: MULT
  //  case 0x19: MULTU
  //  case 0x1A: DIV
  //  case 0x1B: DIVU
  //  case 0x1C: DMULT
  //  case 0x1D: DMULTU
  //  case 0x1E: DDIV
  //  case 0x1F: DDIVU
  clear_const(current,rs1[i]);
  clear_const(current,rs2[i]);
  if(rs1[i]&&rs2[i])
  {
    if((opcode2[i]&4)==0) // 32-bit
    {
      // Mark HI/LO as needed so they get allocated
      current->u&=~(1LL<<HIREG);
      current->u&=~(1LL<<LOREG);
      alloc_reg(current,i,HIREG);
      alloc_reg(current,i,LOREG);
      alloc_reg(current,i,rs1[i]);
      alloc_reg(current,i,rs2[i]);
      current->is32|=1LL<<HIREG;
      current->is32|=1LL<<LOREG;
      dirty_reg(current,HIREG);
      dirty_reg(current,LOREG);
    }
    else // 64-bit
    {
      current->u&=~(1LL<<HIREG);
      current->u&=~(1LL<<LOREG);
      current->uu&=~(1LL<<HIREG);
      current->uu&=~(1LL<<LOREG);
      alloc_reg64(current,i,HIREG);
      //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
      alloc_reg64(current,i,rs1[i]);
      alloc_reg64(current,i,rs2[i]);
      alloc_all(current,i);
      current->is32&=~(1LL<<HIREG);
      current->is32&=~(1LL<<LOREG);
      dirty_reg(current,HIREG);
      dirty_reg(current,LOREG);
      minimum_free_regs[i]=HOST_REGS;
    }
  }
  else
  {
    // Multiply by zero is zero.
    // MIPS does not have a divide by zero exception.
    // The result is undefined, we return zero.
    alloc_reg(current,i,HIREG);
    alloc_reg(current,i,LOREG);
    current->is32|=1LL<<HIREG;
    current->is32|=1LL<<LOREG;
    dirty_reg(current,HIREG);
    dirty_reg(current,LOREG);
  }
}
#endif
1609
// Register allocation for COP0 (system control) instructions.  These
// can trigger exceptions/interrupts, so all registers are flushed.
void cop0_alloc(struct regstat *current,int i)
{
  if(opcode2[i]==0) // MFC0
  {
    if(rt1[i]) {
      clear_const(current,rt1[i]);
      alloc_all(current,i);
      alloc_reg(current,i,rt1[i]);
      current->is32|=1LL<<rt1[i];
      dirty_reg(current,rt1[i]);
    }
  }
  else if(opcode2[i]==4) // MTC0
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      alloc_reg(current,i,rs1[i]);
      alloc_all(current,i);
    }
    else {
      // Writing r0: still need a zero-valued register to store from
      alloc_all(current,i); // FIXME: Keep r0
      current->u&=~1LL;
      alloc_reg(current,i,0);
    }
  }
  else
  {
    // TLBR/TLBWI/TLBWR/TLBP/ERET
    assert(opcode2[i]==0x10);
    alloc_all(current,i);
  }
  minimum_free_regs[i]=HOST_REGS;
}
1643
// Register allocation for COP1 register transfers (MFC1/DMFC1/CFC1 and
// MTC1/DMTC1/CTC1).  Always loads the coprocessor status register and
// takes a temporary.
void cop1_alloc(struct regstat *current,int i)
{
  alloc_reg(current,i,CSREG); // Load status
  if(opcode2[i]<3) // MFC1/DMFC1/CFC1
  {
    if(rt1[i]){
      clear_const(current,rt1[i]);
      if(opcode2[i]==1) {
        alloc_reg64(current,i,rt1[i]); // DMFC1
        current->is32&=~(1LL<<rt1[i]);
      }else{
        alloc_reg(current,i,rt1[i]); // MFC1/CFC1
        current->is32|=1LL<<rt1[i];
      }
      dirty_reg(current,rt1[i]);
    }
    alloc_reg_temp(current,i,-1);
  }
  else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
  {
    if(rs1[i]){
      clear_const(current,rs1[i]);
      if(opcode2[i]==5)
        alloc_reg64(current,i,rs1[i]); // DMTC1
      else
        alloc_reg(current,i,rs1[i]); // MTC1/CTC1
      alloc_reg_temp(current,i,-1);
    }
    else {
      // Moving r0: allocate a zero-valued register as the source
      current->u&=~1LL;
      alloc_reg(current,i,0);
      alloc_reg_temp(current,i,-1);
    }
  }
  minimum_free_regs[i]=1;
}
1680 void fconv_alloc(struct regstat *current,int i)
1681 {
1682   alloc_reg(current,i,CSREG); // Load status
1683   alloc_reg_temp(current,i,-1);
1684   minimum_free_regs[i]=1;
1685 }
1686 void float_alloc(struct regstat *current,int i)
1687 {
1688   alloc_reg(current,i,CSREG); // Load status
1689   alloc_reg_temp(current,i,-1);
1690   minimum_free_regs[i]=1;
1691 }
// GTE (COP2) operation: only a scratch register is required.
void c2op_alloc(struct regstat *current, int i)
{
  alloc_reg_temp(current, i, -1);
}
1696 void fcomp_alloc(struct regstat *current,int i)
1697 {
1698   alloc_reg(current,i,CSREG); // Load status
1699   alloc_reg(current,i,FSREG); // Load flags
1700   dirty_reg(current,FSREG); // Flag will be modified
1701   alloc_reg_temp(current,i,-1);
1702   minimum_free_regs[i]=1;
1703 }
1704
1705 void syscall_alloc(struct regstat *current,int i)
1706 {
1707   alloc_cc(current,i);
1708   dirty_reg(current,CCREG);
1709   alloc_all(current,i);
1710   minimum_free_regs[i]=HOST_REGS;
1711   current->isconst=0;
1712 }
1713
// Dispatch register allocation for the instruction in a branch delay
// slot.  A branch in a delay slot is not supported: speculative
// precompilation is disabled instead.
void delayslot_alloc(struct regstat *current,int i)
{
  switch(itype[i]) {
    case UJUMP:
    case CJUMP:
    case SJUMP:
    case RJUMP:
    case FJUMP:
    case SYSCALL:
    case HLECALL:
    case SPAN:
      assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
      SysPrintf("Disabled speculative precompilation\n");
      stop_after_jal=1;
      break;
    case IMM16:
      imm16_alloc(current,i);
      break;
    case LOAD:
    case LOADLR:
      load_alloc(current,i);
      break;
    case STORE:
    case STORELR:
      store_alloc(current,i);
      break;
    case ALU:
      alu_alloc(current,i);
      break;
    case SHIFT:
      shift_alloc(current,i);
      break;
    case MULTDIV:
      multdiv_alloc(current,i);
      break;
    case SHIFTIMM:
      shiftimm_alloc(current,i);
      break;
    case MOV:
      mov_alloc(current,i);
      break;
    case COP0:
      cop0_alloc(current,i);
      break;
    case COP1:
    case COP2:
      cop1_alloc(current,i);
      break;
    case C1LS:
      c1ls_alloc(current,i);
      break;
    case C2LS:
      c2ls_alloc(current,i);
      break;
    case FCONV:
      fconv_alloc(current,i);
      break;
    case FLOAT:
      float_alloc(current,i);
      break;
    case FCOMP:
      fcomp_alloc(current,i);
      break;
    case C2OP:
      c2op_alloc(current,i);
      break;
  }
}
1782
// Special case where a branch and delay slot span two pages in virtual memory
// All registers are allocated and constants dropped, since control may
// leave the block mid-instruction-pair.
static void pagespan_alloc(struct regstat *current,int i)
{
  current->isconst=0;
  current->wasconst=0;
  regs[i].wasconst=0;
  minimum_free_regs[i]=HOST_REGS;
  alloc_all(current,i);
  alloc_cc(current,i);
  dirty_reg(current,CCREG);
  if(opcode[i]==3) // JAL
  {
    // Link register (r31) receives the return address
    alloc_reg(current,i,31);
    dirty_reg(current,31);
  }
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    alloc_reg(current,i,rs1[i]);
    if (rt1[i]!=0) {
      alloc_reg(current,i,rt1[i]);
      dirty_reg(current,rt1[i]);
    }
  }
  if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
    if(rs2[i]) alloc_reg(current,i,rs2[i]);
    // Compare in 64 bits if either operand might be 64-bit
    if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
    {
      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
      if(rs2[i]) alloc_reg64(current,i,rs2[i]);
    }
  }
  else
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
  {
    if(rs1[i]) alloc_reg(current,i,rs1[i]);
    if(!((current->is32>>rs1[i])&1))
    {
      if(rs1[i]) alloc_reg64(current,i,rs1[i]);
    }
  }
  else
  if(opcode[i]==0x11) // BC1
  {
    alloc_reg(current,i,FSREG);
    alloc_reg(current,i,CSREG);
  }
  //else ...
}
1833
1834 static void add_stub(enum stub_type type, void *addr, void *retaddr,
1835   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e)
1836 {
1837   assert(a < ARRAY_SIZE(stubs));
1838   stubs[stubcount].type = type;
1839   stubs[stubcount].addr = addr;
1840   stubs[stubcount].retaddr = retaddr;
1841   stubs[stubcount].a = a;
1842   stubs[stubcount].b = b;
1843   stubs[stubcount].c = c;
1844   stubs[stubcount].d = d;
1845   stubs[stubcount].e = e;
1846   stubcount++;
1847 }
1848
1849 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
1850   int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist)
1851 {
1852   add_stub(type, addr, retaddr, i, addr_reg, (uintptr_t)i_regs, ccadj, reglist);
1853 }
1854
1855 // Write out a single register
1856 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1857 {
1858   int hr;
1859   for(hr=0;hr<HOST_REGS;hr++) {
1860     if(hr!=EXCLUDE_REG) {
1861       if((regmap[hr]&63)==r) {
1862         if((dirty>>hr)&1) {
1863           if(regmap[hr]<64) {
1864             emit_storereg(r,hr);
1865           }else{
1866             emit_storereg(r|64,hr);
1867           }
1868         }
1869       }
1870     }
1871   }
1872 }
1873
1874 void rlist()
1875 {
1876   int i;
1877   printf("TRACE: ");
1878   for(i=0;i<32;i++)
1879     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1880   printf("\n");
1881 }
1882
1883 void alu_assemble(int i,struct regstat *i_regs)
1884 {
1885   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1886     if(rt1[i]) {
1887       signed char s1,s2,t;
1888       t=get_reg(i_regs->regmap,rt1[i]);
1889       if(t>=0) {
1890         s1=get_reg(i_regs->regmap,rs1[i]);
1891         s2=get_reg(i_regs->regmap,rs2[i]);
1892         if(rs1[i]&&rs2[i]) {
1893           assert(s1>=0);
1894           assert(s2>=0);
1895           if(opcode2[i]&2) emit_sub(s1,s2,t);
1896           else emit_add(s1,s2,t);
1897         }
1898         else if(rs1[i]) {
1899           if(s1>=0) emit_mov(s1,t);
1900           else emit_loadreg(rs1[i],t);
1901         }
1902         else if(rs2[i]) {
1903           if(s2>=0) {
1904             if(opcode2[i]&2) emit_neg(s2,t);
1905             else emit_mov(s2,t);
1906           }
1907           else {
1908             emit_loadreg(rs2[i],t);
1909             if(opcode2[i]&2) emit_neg(t,t);
1910           }
1911         }
1912         else emit_zeroreg(t);
1913       }
1914     }
1915   }
1916   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1917     if(rt1[i]) {
1918       signed char s1l,s2l,s1h,s2h,tl,th;
1919       tl=get_reg(i_regs->regmap,rt1[i]);
1920       th=get_reg(i_regs->regmap,rt1[i]|64);
1921       if(tl>=0) {
1922         s1l=get_reg(i_regs->regmap,rs1[i]);
1923         s2l=get_reg(i_regs->regmap,rs2[i]);
1924         s1h=get_reg(i_regs->regmap,rs1[i]|64);
1925         s2h=get_reg(i_regs->regmap,rs2[i]|64);
1926         if(rs1[i]&&rs2[i]) {
1927           assert(s1l>=0);
1928           assert(s2l>=0);
1929           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
1930           else emit_adds(s1l,s2l,tl);
1931           if(th>=0) {
1932             #ifdef INVERTED_CARRY
1933             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
1934             #else
1935             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
1936             #endif
1937             else emit_add(s1h,s2h,th);
1938           }
1939         }
1940         else if(rs1[i]) {
1941           if(s1l>=0) emit_mov(s1l,tl);
1942           else emit_loadreg(rs1[i],tl);
1943           if(th>=0) {
1944             if(s1h>=0) emit_mov(s1h,th);
1945             else emit_loadreg(rs1[i]|64,th);
1946           }
1947         }
1948         else if(rs2[i]) {
1949           if(s2l>=0) {
1950             if(opcode2[i]&2) emit_negs(s2l,tl);
1951             else emit_mov(s2l,tl);
1952           }
1953           else {
1954             emit_loadreg(rs2[i],tl);
1955             if(opcode2[i]&2) emit_negs(tl,tl);
1956           }
1957           if(th>=0) {
1958             #ifdef INVERTED_CARRY
1959             if(s2h>=0) emit_mov(s2h,th);
1960             else emit_loadreg(rs2[i]|64,th);
1961             if(opcode2[i]&2) {
1962               emit_adcimm(-1,th); // x86 has inverted carry flag
1963               emit_not(th,th);
1964             }
1965             #else
1966             if(opcode2[i]&2) {
1967               if(s2h>=0) emit_rscimm(s2h,0,th);
1968               else {
1969                 emit_loadreg(rs2[i]|64,th);
1970                 emit_rscimm(th,0,th);
1971               }
1972             }else{
1973               if(s2h>=0) emit_mov(s2h,th);
1974               else emit_loadreg(rs2[i]|64,th);
1975             }
1976             #endif
1977           }
1978         }
1979         else {
1980           emit_zeroreg(tl);
1981           if(th>=0) emit_zeroreg(th);
1982         }
1983       }
1984     }
1985   }
1986   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1987     if(rt1[i]) {
1988       signed char s1l,s1h,s2l,s2h,t;
1989       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
1990       {
1991         t=get_reg(i_regs->regmap,rt1[i]);
1992         //assert(t>=0);
1993         if(t>=0) {
1994           s1l=get_reg(i_regs->regmap,rs1[i]);
1995           s1h=get_reg(i_regs->regmap,rs1[i]|64);
1996           s2l=get_reg(i_regs->regmap,rs2[i]);
1997           s2h=get_reg(i_regs->regmap,rs2[i]|64);
1998           if(rs2[i]==0) // rx<r0
1999           {
2000             assert(s1h>=0);
2001             if(opcode2[i]==0x2a) // SLT
2002               emit_shrimm(s1h,31,t);
2003             else // SLTU (unsigned can not be less than zero)
2004               emit_zeroreg(t);
2005           }
2006           else if(rs1[i]==0) // r0<rx
2007           {
2008             assert(s2h>=0);
2009             if(opcode2[i]==0x2a) // SLT
2010               emit_set_gz64_32(s2h,s2l,t);
2011             else // SLTU (set if not zero)
2012               emit_set_nz64_32(s2h,s2l,t);
2013           }
2014           else {
2015             assert(s1l>=0);assert(s1h>=0);
2016             assert(s2l>=0);assert(s2h>=0);
2017             if(opcode2[i]==0x2a) // SLT
2018               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2019             else // SLTU
2020               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2021           }
2022         }
2023       } else {
2024         t=get_reg(i_regs->regmap,rt1[i]);
2025         //assert(t>=0);
2026         if(t>=0) {
2027           s1l=get_reg(i_regs->regmap,rs1[i]);
2028           s2l=get_reg(i_regs->regmap,rs2[i]);
2029           if(rs2[i]==0) // rx<r0
2030           {
2031             assert(s1l>=0);
2032             if(opcode2[i]==0x2a) // SLT
2033               emit_shrimm(s1l,31,t);
2034             else // SLTU (unsigned can not be less than zero)
2035               emit_zeroreg(t);
2036           }
2037           else if(rs1[i]==0) // r0<rx
2038           {
2039             assert(s2l>=0);
2040             if(opcode2[i]==0x2a) // SLT
2041               emit_set_gz32(s2l,t);
2042             else // SLTU (set if not zero)
2043               emit_set_nz32(s2l,t);
2044           }
2045           else{
2046             assert(s1l>=0);assert(s2l>=0);
2047             if(opcode2[i]==0x2a) // SLT
2048               emit_set_if_less32(s1l,s2l,t);
2049             else // SLTU
2050               emit_set_if_carry32(s1l,s2l,t);
2051           }
2052         }
2053       }
2054     }
2055   }
2056   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2057     if(rt1[i]) {
2058       signed char s1l,s1h,s2l,s2h,th,tl;
2059       tl=get_reg(i_regs->regmap,rt1[i]);
2060       th=get_reg(i_regs->regmap,rt1[i]|64);
2061       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2062       {
2063         assert(tl>=0);
2064         if(tl>=0) {
2065           s1l=get_reg(i_regs->regmap,rs1[i]);
2066           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2067           s2l=get_reg(i_regs->regmap,rs2[i]);
2068           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2069           if(rs1[i]&&rs2[i]) {
2070             assert(s1l>=0);assert(s1h>=0);
2071             assert(s2l>=0);assert(s2h>=0);
2072             if(opcode2[i]==0x24) { // AND
2073               emit_and(s1l,s2l,tl);
2074               emit_and(s1h,s2h,th);
2075             } else
2076             if(opcode2[i]==0x25) { // OR
2077               emit_or(s1l,s2l,tl);
2078               emit_or(s1h,s2h,th);
2079             } else
2080             if(opcode2[i]==0x26) { // XOR
2081               emit_xor(s1l,s2l,tl);
2082               emit_xor(s1h,s2h,th);
2083             } else
2084             if(opcode2[i]==0x27) { // NOR
2085               emit_or(s1l,s2l,tl);
2086               emit_or(s1h,s2h,th);
2087               emit_not(tl,tl);
2088               emit_not(th,th);
2089             }
2090           }
2091           else
2092           {
2093             if(opcode2[i]==0x24) { // AND
2094               emit_zeroreg(tl);
2095               emit_zeroreg(th);
2096             } else
2097             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2098               if(rs1[i]){
2099                 if(s1l>=0) emit_mov(s1l,tl);
2100                 else emit_loadreg(rs1[i],tl);
2101                 if(s1h>=0) emit_mov(s1h,th);
2102                 else emit_loadreg(rs1[i]|64,th);
2103               }
2104               else
2105               if(rs2[i]){
2106                 if(s2l>=0) emit_mov(s2l,tl);
2107                 else emit_loadreg(rs2[i],tl);
2108                 if(s2h>=0) emit_mov(s2h,th);
2109                 else emit_loadreg(rs2[i]|64,th);
2110               }
2111               else{
2112                 emit_zeroreg(tl);
2113                 emit_zeroreg(th);
2114               }
2115             } else
2116             if(opcode2[i]==0x27) { // NOR
2117               if(rs1[i]){
2118                 if(s1l>=0) emit_not(s1l,tl);
2119                 else{
2120                   emit_loadreg(rs1[i],tl);
2121                   emit_not(tl,tl);
2122                 }
2123                 if(s1h>=0) emit_not(s1h,th);
2124                 else{
2125                   emit_loadreg(rs1[i]|64,th);
2126                   emit_not(th,th);
2127                 }
2128               }
2129               else
2130               if(rs2[i]){
2131                 if(s2l>=0) emit_not(s2l,tl);
2132                 else{
2133                   emit_loadreg(rs2[i],tl);
2134                   emit_not(tl,tl);
2135                 }
2136                 if(s2h>=0) emit_not(s2h,th);
2137                 else{
2138                   emit_loadreg(rs2[i]|64,th);
2139                   emit_not(th,th);
2140                 }
2141               }
2142               else {
2143                 emit_movimm(-1,tl);
2144                 emit_movimm(-1,th);
2145               }
2146             }
2147           }
2148         }
2149       }
2150       else
2151       {
2152         // 32 bit
2153         if(tl>=0) {
2154           s1l=get_reg(i_regs->regmap,rs1[i]);
2155           s2l=get_reg(i_regs->regmap,rs2[i]);
2156           if(rs1[i]&&rs2[i]) {
2157             assert(s1l>=0);
2158             assert(s2l>=0);
2159             if(opcode2[i]==0x24) { // AND
2160               emit_and(s1l,s2l,tl);
2161             } else
2162             if(opcode2[i]==0x25) { // OR
2163               emit_or(s1l,s2l,tl);
2164             } else
2165             if(opcode2[i]==0x26) { // XOR
2166               emit_xor(s1l,s2l,tl);
2167             } else
2168             if(opcode2[i]==0x27) { // NOR
2169               emit_or(s1l,s2l,tl);
2170               emit_not(tl,tl);
2171             }
2172           }
2173           else
2174           {
2175             if(opcode2[i]==0x24) { // AND
2176               emit_zeroreg(tl);
2177             } else
2178             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2179               if(rs1[i]){
2180                 if(s1l>=0) emit_mov(s1l,tl);
2181                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2182               }
2183               else
2184               if(rs2[i]){
2185                 if(s2l>=0) emit_mov(s2l,tl);
2186                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2187               }
2188               else emit_zeroreg(tl);
2189             } else
2190             if(opcode2[i]==0x27) { // NOR
2191               if(rs1[i]){
2192                 if(s1l>=0) emit_not(s1l,tl);
2193                 else {
2194                   emit_loadreg(rs1[i],tl);
2195                   emit_not(tl,tl);
2196                 }
2197               }
2198               else
2199               if(rs2[i]){
2200                 if(s2l>=0) emit_not(s2l,tl);
2201                 else {
2202                   emit_loadreg(rs2[i],tl);
2203                   emit_not(tl,tl);
2204                 }
2205               }
2206               else emit_movimm(-1,tl);
2207             }
2208           }
2209         }
2210       }
2211     }
2212   }
2213 }
2214
// Assemble I-type instructions with a 16-bit immediate:
// LUI, ADDI/ADDIU, DADDI/DADDIU, SLTI/SLTIU, ANDI/ORI/XORI.
// i:      index of the instruction within the block being compiled
// i_regs: register allocation state for this instruction
// Emission is skipped when the destination is not allocated to a host
// register (t/tl<0) or is already marked constant (isconst) -- the
// constant propagation pass will materialize it elsewhere.
void imm16_assemble(int i,struct regstat *i_regs)
{
  if (opcode[i]==0x0f) { // LUI
    if(rt1[i]) {
      signed char t;
      t=get_reg(i_regs->regmap,rt1[i]);
      //assert(t>=0);
      if(t>=0) {
        // skip if t already holds this constant
        if(!((i_regs->isconst>>t)&1))
          emit_movimm(imm[i]<<16,t);
      }
    }
  }
  if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
    if(rt1[i]) {
      signed char s,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      s=get_reg(i_regs->regmap,rs1[i]);
      if(rs1[i]) {
        //assert(t>=0);
        //assert(s>=0);
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1)) {
            if(s<0) {
              // source not in a host register: reload it into t first
              // (unless t already held rs1 on entry to this instruction)
              if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
              emit_addimm(t,imm[i],t);
            }else{
              if(!((i_regs->wasconst>>s)&1))
                emit_addimm(s,imm[i],t);
              else
                // source held a known constant: fold the add at compile time
                emit_movimm(constmap[i][s]+imm[i],t);
            }
          }
        }
      } else {
        // rs1 is r0: ADDI with zero is just an immediate load
        if(t>=0) {
          if(!((i_regs->isconst>>t)&1))
            emit_movimm(imm[i],t);
        }
      }
    }
  }
  if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
    if(rt1[i]) {
      // |64 addresses the high half of the 64-bit guest register
      signed char sh,sl,th,tl;
      th=get_reg(i_regs->regmap,rt1[i]|64);
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0) {
        if(rs1[i]) {
          assert(sh>=0);
          assert(sl>=0);
          if(th>=0) {
            emit_addimm64_32(sh,sl,imm[i],th,tl);
          }
          else {
            // high half not needed: plain 32-bit add
            emit_addimm(sl,imm[i],tl);
          }
        } else {
          emit_movimm(imm[i],tl);
          // sign-extend the immediate into the high half
          if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
        }
      }
    }
  }
  else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
    if(rt1[i]) {
      //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
      signed char sh,sl,t;
      t=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      //assert(t>=0);
      if(t>=0) {
        if(rs1[i]>0) {
          // no high half allocated implies the value is known 32-bit
          if(sh<0) assert((i_regs->was32>>rs1[i])&1);
          if(sh<0||((i_regs->was32>>rs1[i])&1)) {
            if(opcode[i]==0x0a) { // SLTI
              if(sl<0) {
                // reload the source into t, then compare in place
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_slti32(t,imm[i],t);
              }else{
                emit_slti32(sl,imm[i],t);
              }
            }
            else { // SLTIU
              if(sl<0) {
                if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
                emit_sltiu32(t,imm[i],t);
              }else{
                emit_sltiu32(sl,imm[i],t);
              }
            }
          }else{ // 64-bit
            assert(sl>=0);
            if(opcode[i]==0x0a) // SLTI
              emit_slti64_32(sh,sl,imm[i],t);
            else // SLTIU
              emit_sltiu64_32(sh,sl,imm[i],t);
          }
        }else{
          // SLTI(U) with r0 is just stupid,
          // nonetheless examples can be found
          // (result is a compile-time constant: 0<imm / imm!=0)
          if(opcode[i]==0x0a) // SLTI
            if(0<imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          else // SLTIU
          {
            if(imm[i]) emit_movimm(1,t);
            else emit_zeroreg(t);
          }
        }
      }
    }
  }
  else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
    if(rt1[i]) {
      signed char sh,sl,th,tl;
      th=get_reg(i_regs->regmap,rt1[i]|64);
      tl=get_reg(i_regs->regmap,rt1[i]);
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
        if(opcode[i]==0x0c) //ANDI
        {
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
              emit_andimm(tl,imm[i],tl);
            }else{
              if(!((i_regs->wasconst>>sl)&1))
                emit_andimm(sl,imm[i],tl);
              else
                // fold AND with known constant at compile time
                emit_movimm(constmap[i][sl]&imm[i],tl);
            }
          }
          else
            emit_zeroreg(tl);
          // ANDI with a 16-bit immediate clears the high half
          if(th>=0) emit_zeroreg(th);
        }
        else
        {
          if(rs1[i]) {
            if(sl<0) {
              if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
            }
            if(th>=0) {
              // OR/XOR with a zero-extended immediate leaves the high
              // half unchanged; just copy it through
              if(sh<0) {
                emit_loadreg(rs1[i]|64,th);
              }else{
                emit_mov(sh,th);
              }
            }
            if(opcode[i]==0x0d) { // ORI
              if(sl<0) {
                emit_orimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_orimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]|imm[i],tl);
              }
            }
            if(opcode[i]==0x0e) { // XORI
              if(sl<0) {
                emit_xorimm(tl,imm[i],tl);
              }else{
                if(!((i_regs->wasconst>>sl)&1))
                  emit_xorimm(sl,imm[i],tl);
                else
                  emit_movimm(constmap[i][sl]^imm[i],tl);
              }
            }
          }
          else {
            // rs1 is r0: OR/XOR with zero is an immediate load
            emit_movimm(imm[i],tl);
            if(th>=0) emit_zeroreg(th);
          }
        }
      }
    }
  }
}
2399
2400 void shiftimm_assemble(int i,struct regstat *i_regs)
2401 {
2402   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2403   {
2404     if(rt1[i]) {
2405       signed char s,t;
2406       t=get_reg(i_regs->regmap,rt1[i]);
2407       s=get_reg(i_regs->regmap,rs1[i]);
2408       //assert(t>=0);
2409       if(t>=0&&!((i_regs->isconst>>t)&1)){
2410         if(rs1[i]==0)
2411         {
2412           emit_zeroreg(t);
2413         }
2414         else
2415         {
2416           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2417           if(imm[i]) {
2418             if(opcode2[i]==0) // SLL
2419             {
2420               emit_shlimm(s<0?t:s,imm[i],t);
2421             }
2422             if(opcode2[i]==2) // SRL
2423             {
2424               emit_shrimm(s<0?t:s,imm[i],t);
2425             }
2426             if(opcode2[i]==3) // SRA
2427             {
2428               emit_sarimm(s<0?t:s,imm[i],t);
2429             }
2430           }else{
2431             // Shift by zero
2432             if(s>=0 && s!=t) emit_mov(s,t);
2433           }
2434         }
2435       }
2436       //emit_storereg(rt1[i],t); //DEBUG
2437     }
2438   }
2439   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2440   {
2441     assert(0);
2442   }
2443   if(opcode2[i]==0x3c) // DSLL32
2444   {
2445     assert(0);
2446   }
2447   if(opcode2[i]==0x3e) // DSRL32
2448   {
2449     assert(0);
2450   }
2451   if(opcode2[i]==0x3f) // DSRA32
2452   {
2453     assert(0);
2454   }
2455 }
2456
#ifndef shift_assemble
// Fallback for ports that do not provide an architecture-specific
// shift_assemble (shift-by-register, presumably SLLV/SRLV/SRAV --
// see the per-arch assem_* files).  Reaching this stub is a porting
// error: report on stderr (stdout may be redirected or buffered)
// and abort.
void shift_assemble(int i,struct regstat *i_regs)
{
  fprintf(stderr, "Need shift_assemble for this architecture.\n");
  exit(1);
}
#endif
2464
// Assemble a load instruction (LB/LH/LW/LBU/LHU/LWU; LD is unsupported).
// Fast path: inline read from RAM with a range check whose failure
// branches to a stub (add_stub_r).  A constant address that is known to
// miss RAM is handled entirely by inline_readstub instead.
void load_assemble(int i,struct regstat *i_regs)
{
  int s,th,tl,addr;
  int offset;
  void *jaddr=0;
  int memtarget=0,c=0;
  int fastload_reg_override=0;
  u_int hr,reglist=0;
  th=get_reg(i_regs->regmap,rt1[i]|64); // high half of the destination
  tl=get_reg(i_regs->regmap,rt1[i]);
  s=get_reg(i_regs->regmap,rs1[i]);     // base address register
  offset=imm[i];
  // collect the set of live host registers for the stubs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
  if(s>=0) {
    c=(i_regs->wasconst>>s)&1; // effective address is a known constant
    if (c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
  // FIXME: Even if the load is a NOP, we should check for pagefaults...
  // Reads from the 0x1f80xxxx range can have side effects (FIFO-style
  // hardware registers), so the access must happen even if the result
  // is discarded; borrow a scratch register for it.
  if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
    ||rt1[i]==0) {
      // could be FIFO, must perform the read
      // ||dummy read
      assem_debug("(forced read)\n");
      tl=get_reg(i_regs->regmap,-1);
      assert(tl>=0);
  }
  // address must be computed into tl unless the base register can be
  // used directly (no offset, allocated, not constant)
  if(offset||s<0||c) addr=tl;
  else addr=s;
  //if(tl<0) tl=get_reg(i_regs->regmap,-1);
 if(tl>=0) {
  //printf("load_assemble: c=%d\n",c);
  //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
  assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
  reglist&=~(1<<tl);
  if(th>=0) reglist&=~(1<<th);
  if(!c) {
    #ifdef R29_HACK
    // Strmnnrmn's speed hack
    if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
    #endif
    {
      // emit the RAM range check; jaddr is the slow-path branch
      jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
    }
  }
  else if(ram_offset&&memtarget) {
    // constant in-RAM address: pre-bias it by the RAM mapping offset
    emit_addimm(addr,ram_offset,HOST_TEMPREG);
    fastload_reg_override=HOST_TEMPREG;
  }
  int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
  if (opcode[i]==0x20) { // LB
    if(!c||memtarget) {
      if(!dummy) {
        {
          int x=0,a=tl;
          if(!c) a=addr;
          if(fastload_reg_override) a=fastload_reg_override;

          emit_movsbl_indexed(x,a,tl);
        }
      }
      if(jaddr)
        add_stub_r(LOADB_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      // constant address outside RAM: call the read handler inline
      inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x21) { // LH
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastload_reg_override) a=fastload_reg_override;
        emit_movswl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADH_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x23) { // LW
    if(!c||memtarget) {
      if(!dummy) {
        int a=addr;
        if(fastload_reg_override) a=fastload_reg_override;
        emit_readword_indexed(0,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x24) { // LBU
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastload_reg_override) a=fastload_reg_override;

        emit_movzbl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADBU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x25) { // LHU
    if(!c||memtarget) {
      if(!dummy) {
        int x=0,a=tl;
        if(!c) a=addr;
        if(fastload_reg_override) a=fastload_reg_override;
        emit_movzwl_indexed(x,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADHU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else
      inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
  }
  if (opcode[i]==0x27) { // LWU
    assert(th>=0);
    if(!c||memtarget) {
      if(!dummy) {
        int a=addr;
        if(fastload_reg_override) a=fastload_reg_override;
        emit_readword_indexed(0,a,tl);
      }
      if(jaddr)
        add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
    }
    else {
      inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
    }
    emit_zeroreg(th); // LWU zero-extends: clear the high half
  }
  if (opcode[i]==0x37) { // LD
    assert(0);
  }
 }
}
2615
#ifndef loadlr_assemble
// Fallback for ports that do not provide an architecture-specific
// loadlr_assemble (unaligned loads, presumably LWL/LWR -- see the
// per-arch assem_* files).  Reaching this stub is a porting error:
// report on stderr (stdout may be redirected or buffered) and abort.
void loadlr_assemble(int i,struct regstat *i_regs)
{
  fprintf(stderr, "Need loadlr_assemble for this architecture.\n");
  exit(1);
}
#endif
2623
2624 void store_assemble(int i,struct regstat *i_regs)
2625 {
2626   int s,tl;
2627   int addr,temp;
2628   int offset;
2629   void *jaddr=0;
2630   enum stub_type type;
2631   int memtarget=0,c=0;
2632   int agr=AGEN1+(i&1);
2633   int faststore_reg_override=0;
2634   u_int hr,reglist=0;
2635   tl=get_reg(i_regs->regmap,rs2[i]);
2636   s=get_reg(i_regs->regmap,rs1[i]);
2637   temp=get_reg(i_regs->regmap,agr);
2638   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2639   offset=imm[i];
2640   if(s>=0) {
2641     c=(i_regs->wasconst>>s)&1;
2642     if(c) {
2643       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2644     }
2645   }
2646   assert(tl>=0);
2647   assert(temp>=0);
2648   for(hr=0;hr<HOST_REGS;hr++) {
2649     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2650   }
2651   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2652   if(offset||s<0||c) addr=temp;
2653   else addr=s;
2654   if(!c) {
2655     jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
2656   }
2657   else if(ram_offset&&memtarget) {
2658     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2659     faststore_reg_override=HOST_TEMPREG;
2660   }
2661
2662   if (opcode[i]==0x28) { // SB
2663     if(!c||memtarget) {
2664       int x=0,a=temp;
2665       if(!c) a=addr;
2666       if(faststore_reg_override) a=faststore_reg_override;
2667       emit_writebyte_indexed(tl,x,a);
2668     }
2669     type=STOREB_STUB;
2670   }
2671   if (opcode[i]==0x29) { // SH
2672     if(!c||memtarget) {
2673       int x=0,a=temp;
2674       if(!c) a=addr;
2675       if(faststore_reg_override) a=faststore_reg_override;
2676       emit_writehword_indexed(tl,x,a);
2677     }
2678     type=STOREH_STUB;
2679   }
2680   if (opcode[i]==0x2B) { // SW
2681     if(!c||memtarget) {
2682       int a=addr;
2683       if(faststore_reg_override) a=faststore_reg_override;
2684       emit_writeword_indexed(tl,0,a);
2685     }
2686     type=STOREW_STUB;
2687   }
2688   if (opcode[i]==0x3F) { // SD
2689     assert(0);
2690     type=STORED_STUB;
2691   }
2692   if(jaddr) {
2693     // PCSX store handlers don't check invcode again
2694     reglist|=1<<addr;
2695     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2696     jaddr=0;
2697   }
2698   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
2699     if(!c||memtarget) {
2700       #ifdef DESTRUCTIVE_SHIFT
2701       // The x86 shift operation is 'destructive'; it overwrites the
2702       // source register, so we need to make a copy first and use that.
2703       addr=temp;
2704       #endif
2705       #if defined(HOST_IMM8)
2706       int ir=get_reg(i_regs->regmap,INVCP);
2707       assert(ir>=0);
2708       emit_cmpmem_indexedsr12_reg(ir,addr,1);
2709       #else
2710       emit_cmpmem_indexedsr12_imm(invalid_code,addr,1);
2711       #endif
2712       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
2713       emit_callne(invalidate_addr_reg[addr]);
2714       #else
2715       void *jaddr2 = out;
2716       emit_jne(0);
2717       add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),addr,0,0,0);
2718       #endif
2719     }
2720   }
2721   u_int addr_val=constmap[i][s]+offset;
2722   if(jaddr) {
2723     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2724   } else if(c&&!memtarget) {
2725     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
2726   }
2727   // basic current block modification detection..
2728   // not looking back as that should be in mips cache already
2729   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
2730     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
2731     assert(i_regs->regmap==regs[i].regmap); // not delay slot
2732     if(i_regs->regmap==regs[i].regmap) {
2733       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
2734       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
2735       emit_movimm(start+i*4+4,0);
2736       emit_writeword(0,&pcaddr);
2737       emit_jmp(do_interrupt);
2738     }
2739   }
2740 }
2741
// Assemble SWL/SWR (unaligned word store; SDL/SDR are unsupported).
// The store is dispatched into four cases based on the low two bits of
// the effective address; each case writes only the bytes the
// instruction affects, rotating the source register (emit_rorimm) to
// line the bytes up, and restoring its rotation afterwards.
// NOTE(review): this function tests i_regs->isconst for the base
// register while load_assemble/store_assemble test wasconst -- confirm
// the difference is intentional.
void storelr_assemble(int i,struct regstat *i_regs)
{
  int s,tl;
  int temp;
  int offset;
  void *jaddr=0;
  void *case1, *case2, *case3;
  void *done0, *done1, *done2;
  int memtarget=0,c=0;
  int agr=AGEN1+(i&1);               // address-generation scratch register
  u_int hr,reglist=0;
  tl=get_reg(i_regs->regmap,rs2[i]); // value to store
  s=get_reg(i_regs->regmap,rs1[i]);  // base address register
  temp=get_reg(i_regs->regmap,agr);
  if(temp<0) temp=get_reg(i_regs->regmap,-1);
  offset=imm[i];
  if(s>=0) {
    c=(i_regs->isconst>>s)&1;        // address is a known constant
    if(c) {
      memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
    }
  }
  assert(tl>=0);
  // collect the set of live host registers for the stub
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  assert(temp>=0);
  if(!c) {
    // range-check the address; out-of-RAM branches to the stub
    emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
    if(!offset&&s!=temp) emit_mov(s,temp);
    jaddr=out;
    emit_jno(0);
  }
  else
  {
    // constant address that misses RAM (or base r0): always take the stub
    if(!memtarget||!rs1[i]) {
      jaddr=out;
      emit_jmp(0);
    }
  }
  emit_addimm_no_flags(ram_offset,temp);

  if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
    assert(0);
  }

  // dispatch on the low two address bits (xor 3 flips them --
  // NOTE(review): presumably a byte-lane adjustment for host byte
  // order; verify against the loadlr path)
  emit_xorimm(temp,3,temp);
  emit_testimm(temp,2);
  case2=out;
  emit_jne(0);
  emit_testimm(temp,1);
  case1=out;
  emit_jne(0);
  // 0
  if (opcode[i]==0x2A) { // SWL
    emit_writeword_indexed(tl,0,temp);
  }
  if (opcode[i]==0x2E) { // SWR
    emit_writebyte_indexed(tl,3,temp);
  }
  if (opcode[i]==0x2C) { // SDL
    assert(0);
  }
  if (opcode[i]==0x2D) { // SDR
    assert(0);
  }
  done0=out;
  emit_jmp(0);
  // 1
  set_jump_target(case1, out);
  if (opcode[i]==0x2A) { // SWL
    // Write 3 msb into three least significant bytes
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writebyte_indexed(tl,1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl); // restore tl's original rotation
  }
  if (opcode[i]==0x2E) { // SWR
    // Write two lsb into two most significant bytes
    emit_writehword_indexed(tl,1,temp);
  }
  if (opcode[i]==0x2C) { // SDL
    assert(0);
  }
  if (opcode[i]==0x2D) { // SDR
    assert(0);
  }
  done1=out;
  emit_jmp(0);
  // 2
  set_jump_target(case2, out);
  emit_testimm(temp,1);
  case3=out;
  emit_jne(0);
  if (opcode[i]==0x2A) { // SWL
    // Write two msb into two least significant bytes
    if(rs2[i]) emit_rorimm(tl,16,tl);
    emit_writehword_indexed(tl,-2,temp);
    if(rs2[i]) emit_rorimm(tl,16,tl); // restore tl's original rotation
  }
  if (opcode[i]==0x2E) { // SWR
    // Write 3 lsb into three most significant bytes
    emit_writebyte_indexed(tl,-1,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl);
    emit_writehword_indexed(tl,0,temp);
    if(rs2[i]) emit_rorimm(tl,24,tl); // restore tl's original rotation
  }
  if (opcode[i]==0x2C) { // SDL
    assert(0);
  }
  if (opcode[i]==0x2D) { // SDR
    assert(0);
  }
  done2=out;
  emit_jmp(0);
  // 3
  set_jump_target(case3, out);
  if (opcode[i]==0x2A) { // SWL
    // Write msb into least significant byte
    if(rs2[i]) emit_rorimm(tl,24,tl);
    emit_writebyte_indexed(tl,-3,temp);
    if(rs2[i]) emit_rorimm(tl,8,tl); // restore tl's original rotation
  }
  if (opcode[i]==0x2E) { // SWR
    // Write entire word
    emit_writeword_indexed(tl,-3,temp);
  }
  if (opcode[i]==0x2C) { // SDL
    assert(0);
  }
  if (opcode[i]==0x2D) { // SDR
    assert(0);
  }
  set_jump_target(done0, out);
  set_jump_target(done1, out);
  set_jump_target(done2, out);
  if (opcode[i]==0x2C) { // SDL
    assert(0);
  }
  if (opcode[i]==0x2D) { // SDR
    assert(0);
  }
  if(!c||!memtarget)
    // slow path for addresses that failed (or skipped) the range check
    add_stub_r(STORELR_STUB,jaddr,out,i,temp,i_regs,ccadj[i],reglist);
  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
    // self-modifying-code check: undo the RAM offset bias, then compare
    // against invalid_code and invalidate the written block if needed
    emit_addimm_no_flags(-ram_offset,temp);
    #if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,temp,1);
    #else
    emit_cmpmem_indexedsr12_imm(invalid_code,temp,1);
    #endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[temp]);
    #else
    void *jaddr2 = out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),temp,0,0,0);
    #endif
  }
}
2905
// COP1 (FPU) load/store.  Routed straight to the coprocessor-unusable
// exception handler -- presumably because the target CPU has no FPU;
// see cop1_unusable for the exception it raises.
void c1ls_assemble(int i,struct regstat *i_regs)
{
  cop1_unusable(i, i_regs);
}
2910
// Assemble a GTE (COP2) load/store: LWC2 (load word into a GTE data
// register) or SWC2 (store word from a GTE data register). Emits an
// inline fast path for constant RAM targets and routes everything
// else through a LOADW/STOREW stub.
void c2ls_assemble(int i,struct regstat *i_regs)
{
  int s,tl;
  int ar;
  int offset;
  int memtarget=0,c=0;
  void *jaddr2=NULL;
  enum stub_type type;
  int agr=AGEN1+(i&1);
  int fastio_reg_override=0;
  u_int hr,reglist=0;
  u_int copr=(source[i]>>16)&0x1f;  // GTE data register number (rt field)
  s=get_reg(i_regs->regmap,rs1[i]);
  tl=get_reg(i_regs->regmap,FTEMP);
  offset=imm[i];
  assert(rs1[i]>0);
  assert(tl>=0);

  // Collect live host registers so stubs can save/restore them
  for(hr=0;hr<HOST_REGS;hr++) {
    if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
  }
  if(i_regs->regmap[HOST_CCREG]==CCREG)
    reglist&=~(1<<HOST_CCREG);

  // get the address
  if (opcode[i]==0x3a) { // SWC2
    ar=get_reg(i_regs->regmap,agr);
    if(ar<0) ar=get_reg(i_regs->regmap,-1);
    reglist|=1<<ar;
  } else { // LWC2
    ar=tl;
  }
  if(s>=0) c=(i_regs->wasconst>>s)&1;
  // c: base address is a known constant; memtarget: the constant hits RAM
  memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
  if (!offset&&!c&&s>=0) ar=s;
  assert(ar>=0);

  if (opcode[i]==0x3a) { // SWC2
    cop2_get_dreg(copr,tl,HOST_TEMPREG);
    type=STOREW_STUB;
  }
  else
    type=LOADW_STUB;

  if(c&&!memtarget) {
    // Constant address outside RAM: unconditionally take the stub path
    jaddr2=out;
    emit_jmp(0); // inline_readstub/inline_writestub?
  }
  else {
    if(!c) {
      // Runtime range check; jaddr2 branches to the stub on a miss
      jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
    }
    else if(ram_offset&&memtarget) {
      emit_addimm(ar,ram_offset,HOST_TEMPREG);
      fastio_reg_override=HOST_TEMPREG;
    }
    if (opcode[i]==0x32) { // LWC2
      int a=ar;
      if(fastio_reg_override) a=fastio_reg_override;
      emit_readword_indexed(0,a,tl);
    }
    if (opcode[i]==0x3a) { // SWC2
      #ifdef DESTRUCTIVE_SHIFT
      if(!offset&&!c&&s>=0) emit_mov(s,ar);
      #endif
      int a=ar;
      if(fastio_reg_override) a=fastio_reg_override;
      emit_writeword_indexed(tl,0,a);
    }
  }
  if(jaddr2)
    add_stub_r(type,jaddr2,out,i,ar,i_regs,ccadj[i],reglist);
  // Self-modifying-code check for stores: if the written address is
  // covered by compiled code, invalidate that block
  if(opcode[i]==0x3a) // SWC2
  if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
#if defined(HOST_IMM8)
    int ir=get_reg(i_regs->regmap,INVCP);
    assert(ir>=0);
    emit_cmpmem_indexedsr12_reg(ir,ar,1);
#else
    emit_cmpmem_indexedsr12_imm(invalid_code,ar,1);
#endif
    #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
    emit_callne(invalidate_addr_reg[ar]);
    #else
    void *jaddr3 = out;
    emit_jne(0);
    add_stub(INVCODE_STUB,jaddr3,out,reglist|(1<<HOST_CCREG),ar,0,0,0);
    #endif
  }
  if (opcode[i]==0x32) { // LWC2
    cop2_put_dreg(copr,tl,HOST_TEMPREG);
  }
}
3004
#ifndef multdiv_assemble
// Fallback when the target architecture provides no MULT/DIV assembler.
// A missing port is fatal: report it on stderr (not stdout, so the
// message survives output redirection) and abort.
void multdiv_assemble(int i,struct regstat *i_regs)
{
  fprintf(stderr, "Need multdiv_assemble for this architecture.\n");
  exit(1);
}
#endif
3012
// MFHI/MFLO/MTHI/MTLO: plain register-to-register moves (HI/LO are
// mapped like ordinary registers by the register allocator).
void mov_assemble(int i,struct regstat *i_regs)
{
  //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
  //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
  if(rt1[i]) {
    signed char sh,sl,th,tl;
    th=get_reg(i_regs->regmap,rt1[i]|64);  // |64 selects the upper 32-bit half
    tl=get_reg(i_regs->regmap,rt1[i]);
    //assert(tl>=0);
    if(tl>=0) {
      sh=get_reg(i_regs->regmap,rs1[i]|64);
      sl=get_reg(i_regs->regmap,rs1[i]);
      // Source may not be cached in a host register; load from memory then
      if(sl>=0) emit_mov(sl,tl);
      else emit_loadreg(rs1[i],tl);
      if(th>=0) {
        if(sh>=0) emit_mov(sh,th);
        else emit_loadreg(rs1[i]|64,th);
      }
    }
  }
}
3034
#ifndef fconv_assemble
// Fallback when the target architecture provides no FP-conversion
// assembler. A missing port is fatal: report it on stderr (not stdout,
// so the message survives output redirection) and abort.
void fconv_assemble(int i,struct regstat *i_regs)
{
  fprintf(stderr, "Need fconv_assemble for this architecture.\n");
  exit(1);
}
#endif
3042
#if 0
// Disabled: FLOAT ops never reach the assembler on PSX (no FPU).
void float_assemble(int i,struct regstat *i_regs)
{
  printf("Need float_assemble for this architecture.\n");
  exit(1);
}
#endif
3050
// SYSCALL: store the instruction's PC, charge the accumulated cycle
// count and jump to the syscall handler (does not return here).
void syscall_assemble(int i,struct regstat *i_regs)
{
  signed char ccreg=get_reg(i_regs->regmap,CCREG);
  assert(ccreg==HOST_CCREG);
  assert(!is_delayslot);
  (void)ccreg;
  emit_movimm(start+i*4,EAX); // Get PC
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
  emit_jmp(jump_syscall_hle); // XXX
}
3061
// HLE BIOS call: resolve the handler for the code embedded in the
// instruction's low 26 bits and jump to the HLE dispatcher with the
// return PC in reg 0 and the handler pointer in reg 1.
void hlecall_assemble(int i,struct regstat *i_regs)
{
  extern void psxNULL();
  signed char ccreg=get_reg(i_regs->regmap,CCREG);
  assert(ccreg==HOST_CCREG);
  assert(!is_delayslot);
  (void)ccreg;
  emit_movimm(start+i*4+4,0); // Get PC
  uint32_t hleCode = source[i] & 0x03ffffff;
  // Out-of-range codes fall back to the no-op handler
  if (hleCode >= ARRAY_SIZE(psxHLEt))
    emit_movimm((uintptr_t)psxNULL,1);
  else
    emit_movimm((uintptr_t)psxHLEt[hleCode],1);
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
  emit_jmp(jump_hlecall);
}
3078
// Fall back to the interpreter at this PC: store the PC, charge
// cycles and jump to the interpreter entry point.
void intcall_assemble(int i,struct regstat *i_regs)
{
  signed char ccreg=get_reg(i_regs->regmap,CCREG);
  assert(ccreg==HOST_CCREG);
  assert(!is_delayslot);
  (void)ccreg;
  emit_movimm(start+i*4,0); // Get PC
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
  emit_jmp(jump_intcall);
}
3089
// Assemble the instruction sitting in a branch delay slot. Sets the
// is_delayslot flag around the per-type dispatch so the individual
// assemblers can adjust; control-flow instructions are invalid in a
// delay slot and only get a warning.
void ds_assemble(int i,struct regstat *i_regs)
{
  speculate_register_values(i);
  is_delayslot=1;
  switch(itype[i]) {
    case ALU:
      alu_assemble(i,i_regs);break;
    case IMM16:
      imm16_assemble(i,i_regs);break;
    case SHIFT:
      shift_assemble(i,i_regs);break;
    case SHIFTIMM:
      shiftimm_assemble(i,i_regs);break;
    case LOAD:
      load_assemble(i,i_regs);break;
    case LOADLR:
      loadlr_assemble(i,i_regs);break;
    case STORE:
      store_assemble(i,i_regs);break;
    case STORELR:
      storelr_assemble(i,i_regs);break;
    case COP0:
      cop0_assemble(i,i_regs);break;
    case COP1:
      cop1_assemble(i,i_regs);break;
    case C1LS:
      c1ls_assemble(i,i_regs);break;
    case COP2:
      cop2_assemble(i,i_regs);break;
    case C2LS:
      c2ls_assemble(i,i_regs);break;
    case C2OP:
      c2op_assemble(i,i_regs);break;
    case FCONV:
      fconv_assemble(i,i_regs);break;
    case FLOAT:
      float_assemble(i,i_regs);break;
    case FCOMP:
      fcomp_assemble(i,i_regs);break;
    case MULTDIV:
      multdiv_assemble(i,i_regs);break;
    case MOV:
      mov_assemble(i,i_regs);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
    case FJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  is_delayslot=0;
}
3146
3147 // Is the branch target a valid internal jump?
3148 int internal_branch(uint64_t i_is32,int addr)
3149 {
3150   if(addr&1) return 0; // Indirect (register) jump
3151   if(addr>=start && addr<start+slen*4-4)
3152   {
3153     //int t=(addr-start)>>2;
3154     // Delay slots are not valid branch targets
3155     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3156     // 64 -> 32 bit transition requires a recompile
3157     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3158     {
3159       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3160       else printf("optimizable: yes\n");
3161     }*/
3162     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3163     return 1;
3164   }
3165   return 0;
3166 }
3167
#ifndef wb_invalidate
// Spill (write back) any dirty cached register whose mapping changes
// between 'pre' and 'entry' and is not cached elsewhere in 'entry',
// then move values that merely change host registers. Registers
// flagged unneeded in u (lower) / uu (upper) are skipped.
void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
  uint64_t u,uint64_t uu)
{
  int hr;
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG) {
      if(pre[hr]!=entry[hr]) {
        if(pre[hr]>=0) {
          if((dirty>>hr)&1) {
            if(get_reg(entry,pre[hr])<0) {
              if(pre[hr]<64) {
                if(!((u>>pre[hr])&1)) {
                  emit_storereg(pre[hr],hr);
                  // 32-bit value whose upper half is still needed:
                  // materialize the sign extension and store it too
                  if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
                    emit_sarimm(hr,31,hr);
                    emit_storereg(pre[hr]|64,hr);
                  }
                }
              }else{
                // Upper-half register (>=64): store only if neither
                // unneeded nor recomputable from the 32-bit half
                if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
                  emit_storereg(pre[hr],hr);
                }
              }
            }
          }
        }
      }
    }
  }
  // Move from one register to another (no writeback)
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG) {
      if(pre[hr]!=entry[hr]) {
        if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
          int nr;
          if((nr=get_reg(entry,pre[hr]))>=0) {
            emit_mov(hr,nr);
          }
        }
      }
    }
  }
}
#endif
3213
// Load the specified registers
// This only loads the registers given as arguments because
// we don't want to load things that will be overwritten.
// rs1/rs2 name the MIPS registers to load; 64-bit (upper-half)
// mappings are either loaded from memory or recomputed by
// sign-extending the live lower half when is32 says the value fits.
void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
{
  int hr;
  // Load 32-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
      if(entry[hr]!=regmap[hr]) {
        if(regmap[hr]==rs1||regmap[hr]==rs2)
        {
          if(regmap[hr]==0) {
            // MIPS r0 is constant zero
            emit_zeroreg(hr);
          }
          else
          {
            emit_loadreg(regmap[hr],hr);
          }
        }
      }
    }
  }
  //Load 64-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
      if(entry[hr]!=regmap[hr]) {
        if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
        {
          assert(regmap[hr]!=64);
          if((is32>>(regmap[hr]&63))&1) {
            // 32-bit value: derive the upper half from the lower half
            // if it's cached, otherwise load from memory
            int lr=get_reg(regmap,regmap[hr]-64);
            if(lr>=0)
              emit_sarimm(lr,31,hr);
            else
              emit_loadreg(regmap[hr],hr);
          }
          else
          {
            emit_loadreg(regmap[hr],hr);
          }
        }
      }
    }
  }
}
3260
3261 // Load registers prior to the start of a loop
3262 // so that they are not loaded within the loop
3263 static void loop_preload(signed char pre[],signed char entry[])
3264 {
3265   int hr;
3266   for(hr=0;hr<HOST_REGS;hr++) {
3267     if(hr!=EXCLUDE_REG) {
3268       if(pre[hr]!=entry[hr]) {
3269         if(entry[hr]>=0) {
3270           if(get_reg(pre,entry[hr])<0) {
3271             assem_debug("loop preload:\n");
3272             //printf("loop preload: %d\n",hr);
3273             if(entry[hr]==0) {
3274               emit_zeroreg(hr);
3275             }
3276             else if(entry[hr]<TEMPREG)
3277             {
3278               emit_loadreg(entry[hr],hr);
3279             }
3280             else if(entry[hr]-64<TEMPREG)
3281             {
3282               emit_loadreg(entry[hr],hr);
3283             }
3284           }
3285         }
3286       }
3287     }
3288   }
3289 }
3290
// Generate address for load/store instruction
// goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads.
// Computes base+offset into the chosen address register, using the
// constant map when the base is known, and pre-masks LWL/LWR (and
// LDL/LDR) addresses to their word/doubleword boundary. Also preloads
// the address constant for the next instruction when possible.
void address_generation(int i,struct regstat *i_regs,signed char entry[])
{
  if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
    int ra=-1;
    int agr=AGEN1+(i&1);
    // Pick the address register: loads reuse the destination, LOADLR
    // and cop1/2 loads use FTEMP, stores use the AGEN temporary
    if(itype[i]==LOAD) {
      ra=get_reg(i_regs->regmap,rt1[i]);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
      assert(ra>=0);
    }
    if(itype[i]==LOADLR) {
      ra=get_reg(i_regs->regmap,FTEMP);
    }
    if(itype[i]==STORE||itype[i]==STORELR) {
      ra=get_reg(i_regs->regmap,agr);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
    }
    if(itype[i]==C1LS||itype[i]==C2LS) {
      if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
        ra=get_reg(i_regs->regmap,FTEMP);
      else { // SWC1/SDC1/SWC2/SDC2
        ra=get_reg(i_regs->regmap,agr);
        if(ra<0) ra=get_reg(i_regs->regmap,-1);
      }
    }
    int rs=get_reg(i_regs->regmap,rs1[i]);
    if(ra>=0) {
      int offset=imm[i];
      int c=(i_regs->wasconst>>rs)&1;
      if(rs1[i]==0) {
        // Using r0 as a base address
        if(!entry||entry[ra]!=agr) {
          if (opcode[i]==0x22||opcode[i]==0x26) {
            emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
          }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
            emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
          }else{
            emit_movimm(offset,ra);
          }
        } // else did it in the previous cycle
      }
      else if(rs<0) {
        // Base register not cached: load it into the address register
        if(!entry||entry[ra]!=rs1[i])
          emit_loadreg(rs1[i],ra);
        //if(!entry||entry[ra]!=rs1[i])
        //  printf("poor load scheduling!\n");
      }
      else if(c) {
        // Base is a known constant: materialize the full address now
        if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
          if(!entry||entry[ra]!=agr) {
            if (opcode[i]==0x22||opcode[i]==0x26) {
              emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
            }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
              emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
            }else{
              emit_movimm(constmap[i][rs]+offset,ra);
              regs[i].loadedconst|=1<<ra;
            }
          } // else did it in the previous cycle
        } // else load_consts already did it
      }
      // Non-constant base with a nonzero offset: add it at runtime
      if(offset&&!c&&rs1[i]) {
        if(rs>=0) {
          emit_addimm(rs,offset,ra);
        }else{
          emit_addimm(ra,offset,ra);
        }
      }
    }
  }
  // Preload constants for next instruction
  if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
    int agr,ra;
    // Actual address
    agr=AGEN1+((i+1)&1);
    ra=get_reg(i_regs->regmap,agr);
    if(ra>=0) {
      int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
      int offset=imm[i+1];
      int c=(regs[i+1].wasconst>>rs)&1;
      if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
        if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
          emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
        }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
          emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
        }else{
          emit_movimm(constmap[i+1][rs]+offset,ra);
          regs[i+1].loadedconst|=1<<ra;
        }
      }
      else if(rs1[i+1]==0) {
        // Using r0 as a base address
        if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
          emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
        }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
          emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
        }else{
          emit_movimm(offset,ra);
        }
      }
    }
  }
}
3396
// Look ahead from instruction i to find the last constant value that
// host register hr will hold while its mapping stays unchanged, so a
// single immediate load can cover the whole run. Stores the value in
// *value; returns 1 when the value should be loaded, 0 when the
// register turns out to be unneeded afterwards.
static int get_final_value(int hr, int i, int *value)
{
  int reg=regs[i].regmap[hr];
  // Advance while the same register keeps the same constant and no
  // branch target interrupts the run
  while(i<slen-1) {
    if(regs[i+1].regmap[hr]!=reg) break;
    if(!((regs[i+1].isconst>>hr)&1)) break;
    if(bt[i+1]) break;
    i++;
  }
  if(i<slen-1) {
    if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
      *value=constmap[i][hr];
      return 1;
    }
    if(!bt[i+1]) {
      if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
        // Load in delay slot, out-of-order execution
        if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
        {
          // Precompute load address
          *value=constmap[i][hr]+imm[i+2];
          return 1;
        }
      }
      if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
      {
        // Precompute load address
        *value=constmap[i][hr]+imm[i+1];
        //printf("c=%x imm=%lx\n",(long)constmap[i][hr],imm[i+1]);
        return 1;
      }
    }
  }
  *value=constmap[i][hr];
  //printf("c=%lx\n",(long)constmap[i][hr]);
  if(i==slen-1) return 1;
  // Skip the load entirely when the register is unneeded afterwards
  if(reg<64) {
    return !((unneeded_reg[i+1]>>reg)&1);
  }else{
    return !((unneeded_reg_upper[i+1]>>reg)&1);
  }
}
3439
// Load registers with known constants
// Emits immediate loads for host registers that hold known-constant
// values at instruction i, skipping any whose constant was already
// loaded earlier (tracked via the loadedconst bitmap). When another
// register already holds a similar value, derives the new constant
// from it instead of a full immediate load.
void load_consts(signed char pre[],signed char regmap[],int is32,int i)
{
  int hr,hr2;
  // propagate loaded constant flags
  if(i==0||bt[i])
    regs[i].loadedconst=0;
  else {
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
         &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
      {
        regs[i].loadedconst|=1<<hr;
      }
    }
  }
  // Load 32-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
      //if(entry[hr]!=regmap[hr]) {
      if(!((regs[i].loadedconst>>hr)&1)) {
        if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
          int value,similar=0;
          if(get_final_value(hr,i,&value)) {
            // see if some other register has similar value
            for(hr2=0;hr2<HOST_REGS;hr2++) {
              if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
                if(is_similar_value(value,constmap[i][hr2])) {
                  similar=1;
                  break;
                }
              }
            }
            if(similar) {
              int value2;
              if(get_final_value(hr2,i,&value2)) // is this needed?
                emit_movimm_from(value2,hr2,value,hr);
              else
                emit_movimm(value,hr);
            }
            else if(value==0) {
              emit_zeroreg(hr);
            }
            else {
              emit_movimm(value,hr);
            }
          }
          regs[i].loadedconst|=1<<hr;
        }
      }
    }
  }
  // Load 64-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
      //if(entry[hr]!=regmap[hr]) {
      if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
        if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
          if((is32>>(regmap[hr]&63))&1) {
            // 32-bit value: upper half is just the sign extension
            int lr=get_reg(regmap,regmap[hr]-64);
            assert(lr>=0);
            emit_sarimm(lr,31,hr);
          }
          else
          {
            int value;
            if(get_final_value(hr,i,&value)) {
              if(value==0) {
                emit_zeroreg(hr);
              }
              else {
                emit_movimm(value,hr);
              }
            }
          }
        }
      }
    }
  }
}
// Load every dirty host register that holds a known constant at
// instruction i with an immediate, without the lookahead/dedup logic
// of load_consts.
void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
{
  int hr;
  // Load 32-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
      if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
        int value=constmap[i][hr];
        if(value==0) {
          emit_zeroreg(hr);
        }
        else {
          emit_movimm(value,hr);
        }
      }
    }
  }
  // Load 64-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
      if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
        if((is32>>(regmap[hr]&63))&1) {
          // 32-bit value: upper half is just the sign extension
          int lr=get_reg(regmap,regmap[hr]-64);
          assert(lr>=0);
          emit_sarimm(lr,31,hr);
        }
        else
        {
          int value=constmap[i][hr];
          if(value==0) {
            emit_zeroreg(hr);
          }
          else {
            emit_movimm(value,hr);
          }
        }
      }
    }
  }
}
3560
3561 // Write out all dirty registers (except cycle count)
3562 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
3563 {
3564   int hr;
3565   for(hr=0;hr<HOST_REGS;hr++) {
3566     if(hr!=EXCLUDE_REG) {
3567       if(i_regmap[hr]>0) {
3568         if(i_regmap[hr]!=CCREG) {
3569           if((i_dirty>>hr)&1) {
3570             if(i_regmap[hr]<64) {
3571               emit_storereg(i_regmap[hr],hr);
3572             }else{
3573               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
3574                 emit_storereg(i_regmap[hr],hr);
3575               }
3576             }
3577           }
3578         }
3579       }
3580     }
3581   }
3582 }
3583 // Write out dirty registers that we need to reload (pair with load_needed_regs)
3584 // This writes the registers not written by store_regs_bt
3585 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
3586 {
3587   int hr;
3588   int t=(addr-start)>>2;
3589   for(hr=0;hr<HOST_REGS;hr++) {
3590     if(hr!=EXCLUDE_REG) {
3591       if(i_regmap[hr]>0) {
3592         if(i_regmap[hr]!=CCREG) {
3593           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
3594             if((i_dirty>>hr)&1) {
3595               if(i_regmap[hr]<64) {
3596                 emit_storereg(i_regmap[hr],hr);
3597               }else{
3598                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
3599                   emit_storereg(i_regmap[hr],hr);
3600                 }
3601               }
3602             }
3603           }
3604         }
3605       }
3606     }
3607   }
3608 }
3609
3610 // Load all registers (except cycle count)
3611 void load_all_regs(signed char i_regmap[])
3612 {
3613   int hr;
3614   for(hr=0;hr<HOST_REGS;hr++) {
3615     if(hr!=EXCLUDE_REG) {
3616       if(i_regmap[hr]==0) {
3617         emit_zeroreg(hr);
3618       }
3619       else
3620       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
3621       {
3622         emit_loadreg(i_regmap[hr],hr);
3623       }
3624     }
3625   }
3626 }
3627
3628 // Load all current registers also needed by next instruction
3629 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
3630 {
3631   int hr;
3632   for(hr=0;hr<HOST_REGS;hr++) {
3633     if(hr!=EXCLUDE_REG) {
3634       if(get_reg(next_regmap,i_regmap[hr])>=0) {
3635         if(i_regmap[hr]==0) {
3636           emit_zeroreg(hr);
3637         }
3638         else
3639         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
3640         {
3641           emit_loadreg(i_regmap[hr],hr);
3642         }
3643       }
3644     }
3645   }
3646 }
3647
// Load all regs, storing cycle count if necessary
// Used at a block entry point t: adjusts the cycle counter, spills it
// if the entry doesn't keep it in HOST_CCREG, then loads every mapped
// register from memory (upper halves may be recomputed by sign
// extension when was32 says the value fits in 32 bits).
void load_regs_entry(int t)
{
  int hr;
  if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
  else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
  if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
    emit_storereg(CCREG,HOST_CCREG);
  }
  // Load 32-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
      if(regs[t].regmap_entry[hr]==0) {
        emit_zeroreg(hr);
      }
      else if(regs[t].regmap_entry[hr]!=CCREG)
      {
        emit_loadreg(regs[t].regmap_entry[hr],hr);
      }
    }
  }
  // Load 64-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
      assert(regs[t].regmap_entry[hr]!=64);
      if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
        // 32-bit value: sign-extend from the cached lower half if present
        int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
        if(lr<0) {
          emit_loadreg(regs[t].regmap_entry[hr],hr);
        }
        else
        {
          emit_sarimm(lr,31,hr);
        }
      }
      else
      {
        emit_loadreg(regs[t].regmap_entry[hr],hr);
      }
    }
  }
}
3690
// Store dirty registers prior to branch
// Internal branches only spill registers the target entry does not
// already expect (or whose width/dirtiness mismatch), skipping regs
// the target marks unneeded; external branches spill everything dirty.
void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
{
  if(internal_branch(i_is32,addr))
  {
    int t=(addr-start)>>2;
    int hr;
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG) {
        if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
          if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
            if((i_dirty>>hr)&1) {
              if(i_regmap[hr]<64) {
                if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
                  emit_storereg(i_regmap[hr],hr);
                  // 32-bit value whose upper half the target still
                  // needs: materialize the sign extension and store it
                  if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
                    #ifdef DESTRUCTIVE_WRITEBACK
                    emit_sarimm(hr,31,hr);
                    emit_storereg(i_regmap[hr]|64,hr);
                    #else
                    emit_sarimm(hr,31,HOST_TEMPREG);
                    emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
                    #endif
                  }
                }
              }else{
                if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
                  emit_storereg(i_regmap[hr],hr);
                }
              }
            }
          }
        }
      }
    }
  }
  else
  {
    // Branch out of this block, write out all dirty regs
    wb_dirtys(i_regmap,i_is32,i_dirty);
  }
}
3733
// Load all needed registers for branch target
// For internal branches, loads only the registers the target entry
// expects but which are not already in place; upper halves may be
// recomputed by sign extension when the value fits in 32 bits.
void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
{
  //if(addr>=start && addr<(start+slen*4))
  if(internal_branch(i_is32,addr))
  {
    int t=(addr-start)>>2;
    int hr;
    // Store the cycle count before loading something else
    if(i_regmap[HOST_CCREG]!=CCREG) {
      assert(i_regmap[HOST_CCREG]==-1);
    }
    if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
      emit_storereg(CCREG,HOST_CCREG);
    }
    // Load 32-bit regs
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
        #ifdef DESTRUCTIVE_WRITEBACK
        if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
        #else
        if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
        #endif
          if(regs[t].regmap_entry[hr]==0) {
            emit_zeroreg(hr);
          }
          else if(regs[t].regmap_entry[hr]!=CCREG)
          {
            emit_loadreg(regs[t].regmap_entry[hr],hr);
          }
        }
      }
    }
    //Load 64-bit regs
    for(hr=0;hr<HOST_REGS;hr++) {
      if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
        if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
          assert(regs[t].regmap_entry[hr]!=64);
          if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
            // 32-bit value: sign-extend from the cached lower half if present
            int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
            if(lr<0) {
              emit_loadreg(regs[t].regmap_entry[hr],hr);
            }
            else
            {
              emit_sarimm(lr,31,hr);
            }
          }
          else
          {
            emit_loadreg(regs[t].regmap_entry[hr],hr);
          }
        }
        else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
          // Same mapping, but value is now 32-bit: refresh the sign extension
          int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
          assert(lr>=0);
          emit_sarimm(lr,31,hr);
        }
      }
    }
  }
}
3796
// Can we branch to addr without any register shuffling/writeback?
// Returns 1 when the current register state (mapping, dirtiness,
// 32/64-bit width) is compatible with the compiled entry at addr, so
// a direct jump is safe; 0 when fixup code would be required.
int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
{
  if(addr>=start && addr<start+slen*4-4)
  {
    // Internal target: compare against that instruction's entry state
    int t=(addr-start)>>2;
    int hr;
    if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(hr!=EXCLUDE_REG)
      {
        if(i_regmap[hr]!=regs[t].regmap_entry[hr])
        {
          if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
          {
            // Target expects a different register here
            return 0;
          }
          else
          if((i_dirty>>hr)&1)
          {
            // We'd have to write this register back first
            if(i_regmap[hr]<TEMPREG)
            {
              if(!((unneeded_reg[t]>>i_regmap[hr])&1))
                return 0;
            }
            else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
            {
              if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
                return 0;
            }
          }
        }
        else // Same register but is it 32-bit or dirty?
        if(i_regmap[hr]>=0)
        {
          if(!((regs[t].dirty>>hr)&1))
          {
            if((i_dirty>>hr)&1)
            {
              if(!((unneeded_reg[t]>>i_regmap[hr])&1))
              {
                //printf("%x: dirty no match\n",addr);
                return 0;
              }
            }
          }
          if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
          {
            //printf("%x: is32 no match\n",addr);
            return 0;
          }
        }
      }
    }
    //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
    // Delay slots are not valid branch targets
    //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
    // Delay slots require additional processing, so do not match
    if(is_ds[t]) return 0;
  }
  else
  {
    // External target: everything dirty (except the cycle count)
    // would need a writeback, so require a clean state
    int hr;
    for(hr=0;hr<HOST_REGS;hr++)
    {
      if(hr!=EXCLUDE_REG)
      {
        if(i_regmap[hr]>=0)
        {
          if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
          {
            if((i_dirty>>hr)&1)
            {
              return 0;
            }
          }
        }
      }
    }
  }
  return 1;
}
3879
#ifdef DRC_DBG
// Debug builds: emit a call to do_insn_cmp before an instruction so
// recompiled state can be compared against a reference at runtime.
static void drc_dbg_emit_do_cmp(int i)
{
  extern void do_insn_cmp();
  extern int cycle;
  u_int hr,reglist=0;

  // Save every live host register around the C call
  for(hr=0;hr<HOST_REGS;hr++)
    if(regs[i].regmap[hr]>=0) reglist|=1<<hr;
  save_regs(reglist);
  emit_movimm(start+i*4,0);
  emit_writeword(0,&pcaddr);
  emit_call(do_insn_cmp);
  //emit_readword(&cycle,0);
  //emit_addimm(0,2,0);
  //emit_writeword(0,&cycle);
  restore_regs(reglist);
}
#else
// Compiled out in normal builds
#define drc_dbg_emit_do_cmp(x)
#endif
3901
// Used when a branch jumps into the delay slot of another branch:
// assemble the delay-slot instruction at ba[i] as a standalone block
// entry point, then emit an (internal-only) branch to the instruction
// following that delay slot (ba[i]+4).
void ds_assemble_entry(int i)
{
  int t=(ba[i]-start)>>2;  // index of the targeted delay-slot instruction
  if (!instr_addr[t])
    instr_addr[t] = out;   // record this as the compiled entry for t
  assem_debug("Assemble delay slot at %x\n",ba[i]);
  assem_debug("<->\n");
  drc_dbg_emit_do_cmp(t);
  // If the cycle count was expected in HOST_CCREG on entry but is not
  // allocated there for this instruction, write it back to memory first.
  if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
    wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
  load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
  address_generation(t,&regs[t],regs[t].regmap_entry);
  // Stores (and the SWC/SDC-style opcodes matched below) also need the
  // invalid-code pointer loaded for self-modifying-code detection.
  if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
    load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
  cop1_usable=0;
  is_delayslot=0;
  // Dispatch to the per-type assembler for the delay-slot instruction.
  switch(itype[t]) {
    case ALU:
      alu_assemble(t,&regs[t]);break;
    case IMM16:
      imm16_assemble(t,&regs[t]);break;
    case SHIFT:
      shift_assemble(t,&regs[t]);break;
    case SHIFTIMM:
      shiftimm_assemble(t,&regs[t]);break;
    case LOAD:
      load_assemble(t,&regs[t]);break;
    case LOADLR:
      loadlr_assemble(t,&regs[t]);break;
    case STORE:
      store_assemble(t,&regs[t]);break;
    case STORELR:
      storelr_assemble(t,&regs[t]);break;
    case COP0:
      cop0_assemble(t,&regs[t]);break;
    case COP1:
      cop1_assemble(t,&regs[t]);break;
    case C1LS:
      c1ls_assemble(t,&regs[t]);break;
    case COP2:
      cop2_assemble(t,&regs[t]);break;
    case C2LS:
      c2ls_assemble(t,&regs[t]);break;
    case C2OP:
      c2op_assemble(t,&regs[t]);break;
    case FCONV:
      fconv_assemble(t,&regs[t]);break;
    case FLOAT:
      float_assemble(t,&regs[t]);break;
    case FCOMP:
      fcomp_assemble(t,&regs[t]);break;
    case MULTDIV:
      multdiv_assemble(t,&regs[t]);break;
    case MOV:
      mov_assemble(t,&regs[t]);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
    case FJUMP:
      // A branch in a delay slot is invalid on MIPS; nothing is emitted.
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  // Write back dirty registers for the fall-through target and branch to
  // the instruction after the delay slot. This target must be internal
  // to the current block (asserted below).
  store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
  load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
  if(internal_branch(regs[t].is32,ba[i]+4))
    assem_debug("branch: internal\n");
  else
    assem_debug("branch: external\n");
  assert(internal_branch(regs[t].is32,ba[i]+4));
  add_to_linker(out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
  emit_jmp(0);
}
3979
// Emit the cycle-counter check for a branch at instruction i: add the
// cycles consumed since the last check to HOST_CCREG and register a
// CC_STUB that handles the case where the count expires (event/interrupt
// processing). *adj receives the cycle adjustment already accounted for
// at an internal branch target (0 for RJUMP or external targets).
// 'invert' selects the inverted-branch code layout used by the caller.
void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
{
  int count;
  void *jaddr;
  void *idle=NULL;
  int t=0;
  if(itype[i]==RJUMP)
  {
    // Register-indirect jump: target unknown, no adjustment possible.
    *adj=0;
  }
  //if(ba[i]>=start && ba[i]<(start+slen*4))
  if(internal_branch(branch_regs[i].is32,ba[i]))
  {
    t=(ba[i]-start)>>2;
    if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
    else *adj=ccadj[t];
  }
  else
  {
    *adj=0;
  }
  count=ccadj[i];
  if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
    // Idle loop (branch to self with a NOP delay slot): burn down the
    // cycle counter quickly instead of looping in generated code.
    if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
    idle=out;
    //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
    emit_andimm(HOST_CCREG,3,HOST_CCREG);
    jaddr=out;
    emit_jmp(0);
  }
  else if(*adj==0||invert) {
    int cycles=CLOCK_ADJUST(count+2);
    // faster loop HACK
    if (t&&*adj) {
      int rel=t-i;
      // Short backward branch: charge fewer cycles to speed up tight loops.
      if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
        cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
    }
    emit_addimm_and_set_flags(cycles,HOST_CCREG);
    jaddr=out;
    emit_jns(0);
  }
  else
  {
    // Adjustment is applied at the target; here only compare.
    emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
    jaddr=out;
    emit_jns(0);
  }
  add_stub(CC_STUB,jaddr,idle?idle:out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
}
4031
// Assemble the out-of-line cycle-count stub n: write back dirty guest
// registers, determine and store the return PC (which for conditional
// branches must be recomputed from the register operands, since the stub
// is reached before the branch direction is materialized), call
// cc_interrupt(), then reload registers and jump back into the block.
static void do_ccstub(int n)
{
  literal_pool(256);
  assem_debug("do_ccstub %x\n",start+stubs[n].b*4);
  set_jump_target(stubs[n].addr, out);
  int i=stubs[n].b;  // index of the branch instruction this stub belongs to
  if(stubs[n].d==NULLDS) {
    // Delay slot instruction is nullified ("likely" branch)
    wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
  }
  else if(stubs[n].d!=TAKEN) {
    wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
  }
  else {
    // Taken path: only registers needed at the target must be written back.
    if(internal_branch(branch_regs[i].is32,ba[i]))
      wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
  }
  if(stubs[n].c!=-1)
  {
    // Save PC as return address
    emit_movimm(stubs[n].c,EAX);
    emit_writeword(EAX,&pcaddr);
  }
  else
  {
    // Return address depends on which way the branch goes
    if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
    {
      // Locate the branch operands (low and high halves) in host registers.
      int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
      int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
      int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
      int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
      if(rs1[i]==0)
      {
        // rs1 is $zero: compare against rs2 alone.
        s1l=s2l;s1h=s2h;
        s2l=s2h=-1;
      }
      else if(rs2[i]==0)
      {
        s2l=s2h=-1;
      }
      if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
        // Both operands are 32-bit: upper halves are irrelevant.
        s1h=s2h=-1;
      }
      assert(s1l>=0);
      #ifdef DESTRUCTIVE_WRITEBACK
      // Writeback may have clobbered the operand registers; reload them.
      if(rs1[i]) {
        if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
          emit_loadreg(rs1[i],s1l);
      }
      else {
        if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
          emit_loadreg(rs2[i],s1l);
      }
      if(s2l>=0)
        if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
          emit_loadreg(rs2[i],s2l);
      #endif
      // Scavenge up to three scratch host registers (addr/alt/ntaddr)
      // that don't hold a branch operand or the cycle count.
      int hr=0;
      int addr=-1,alt=-1,ntaddr=-1;
      while(hr<HOST_REGS)
      {
        if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
           (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
           (branch_regs[i].regmap[hr]&63)!=rs2[i] )
        {
          addr=hr++;break;
        }
        hr++;
      }
      while(hr<HOST_REGS)
      {
        if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
           (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
           (branch_regs[i].regmap[hr]&63)!=rs2[i] )
        {
          alt=hr++;break;
        }
        hr++;
      }
      if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
      {
        while(hr<HOST_REGS)
        {
          if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
             (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
             (branch_regs[i].regmap[hr]&63)!=rs2[i] )
          {
            ntaddr=hr;break;
          }
          hr++;
        }
        assert(hr<HOST_REGS);
      }
      // For each branch type, materialize both possible targets and use
      // conditional moves to select the one matching the branch outcome.
      if((opcode[i]&0x2f)==4) // BEQ
      {
        #ifdef HAVE_CMOV_IMM
        if(s1h<0) {
          if(s2l>=0) emit_cmp(s1l,s2l);
          else emit_test(s1l,s1l);
          emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
        }
        else
        #endif
        {
          emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
          if(s1h>=0) {
            if(s2h>=0) emit_cmp(s1h,s2h);
            else emit_test(s1h,s1h);
            emit_cmovne_reg(alt,addr);
          }
          if(s2l>=0) emit_cmp(s1l,s2l);
          else emit_test(s1l,s1l);
          emit_cmovne_reg(alt,addr);
        }
      }
      if((opcode[i]&0x2f)==5) // BNE
      {
        #ifdef HAVE_CMOV_IMM
        if(s1h<0) {
          if(s2l>=0) emit_cmp(s1l,s2l);
          else emit_test(s1l,s1l);
          emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
        }
        else
        #endif
        {
          emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
          if(s1h>=0) {
            if(s2h>=0) emit_cmp(s1h,s2h);
            else emit_test(s1h,s1h);
            emit_cmovne_reg(alt,addr);
          }
          if(s2l>=0) emit_cmp(s1l,s2l);
          else emit_test(s1l,s1l);
          emit_cmovne_reg(alt,addr);
        }
      }
      if((opcode[i]&0x2f)==6) // BLEZ
      {
        //emit_movimm(ba[i],alt);
        //emit_movimm(start+i*4+8,addr);
        emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
        emit_cmpimm(s1l,1);
        if(s1h>=0) emit_mov(addr,ntaddr);
        emit_cmovl_reg(alt,addr);
        if(s1h>=0) {
          // 64-bit operand: sign comes from the upper half.
          emit_test(s1h,s1h);
          emit_cmovne_reg(ntaddr,addr);
          emit_cmovs_reg(alt,addr);
        }
      }
      if((opcode[i]&0x2f)==7) // BGTZ
      {
        //emit_movimm(ba[i],addr);
        //emit_movimm(start+i*4+8,ntaddr);
        emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
        emit_cmpimm(s1l,1);
        if(s1h>=0) emit_mov(addr,alt);
        emit_cmovl_reg(ntaddr,addr);
        if(s1h>=0) {
          emit_test(s1h,s1h);
          emit_cmovne_reg(alt,addr);
          emit_cmovs_reg(ntaddr,addr);
        }
      }
      if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
      {
        //emit_movimm(ba[i],alt);
        //emit_movimm(start+i*4+8,addr);
        emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
        if(s1h>=0) emit_test(s1h,s1h);
        else emit_test(s1l,s1l);
        emit_cmovs_reg(alt,addr);
      }
      if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
      {
        //emit_movimm(ba[i],addr);
        //emit_movimm(start+i*4+8,alt);
        emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
        if(s1h>=0) emit_test(s1h,s1h);
        else emit_test(s1l,s1l);
        emit_cmovs_reg(alt,addr);
      }
      if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
        // COP1 branch: test the FP condition bit in the status word.
        if(source[i]&0x10000) // BC1T
        {
          //emit_movimm(ba[i],alt);
          //emit_movimm(start+i*4+8,addr);
          emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
          emit_testimm(s1l,0x800000);
          emit_cmovne_reg(alt,addr);
        }
        else // BC1F
        {
          //emit_movimm(ba[i],addr);
          //emit_movimm(start+i*4+8,alt);
          emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
          emit_testimm(s1l,0x800000);
          emit_cmovne_reg(alt,addr);
        }
      }
      emit_writeword(addr,&pcaddr);
    }
    else
    if(itype[i]==RJUMP)
    {
      // Indirect jump: the target is simply the (possibly copied) source reg.
      int r=get_reg(branch_regs[i].regmap,rs1[i]);
      if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
        // Delay slot overwrote rs1; the saved copy lives in RTEMP.
        r=get_reg(branch_regs[i].regmap,RTEMP);
      }
      emit_writeword(r,&pcaddr);
    }
    else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);}
  }
  // Update cycle count
  assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
  // Temporarily add the pending cycles, service events, then remove them
  // again so the in-block accounting stays consistent.
  if(stubs[n].a) emit_addimm(HOST_CCREG,CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
  emit_call(cc_interrupt);
  if(stubs[n].a) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
  // Reload whatever register state the resume point expects.
  if(stubs[n].d==TAKEN) {
    if(internal_branch(branch_regs[i].is32,ba[i]))
      load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
    else if(itype[i]==RJUMP) {
      if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
        emit_readword(&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
      else
        emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
    }
  }else if(stubs[n].d==NOTTAKEN) {
    if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
    else load_all_regs(branch_regs[i].regmap);
  }else if(stubs[n].d==NULLDS) {
    // Delay slot instruction is nullified ("likely" branch)
    if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
    else load_all_regs(regs[i].regmap);
  }else{
    load_all_regs(branch_regs[i].regmap);
  }
  emit_jmp(stubs[n].retaddr);
}
4273
4274 static void add_to_linker(void *addr, u_int target, int ext)
4275 {
4276   assert(linkcount < ARRAY_SIZE(link_addr));
4277   link_addr[linkcount].addr = addr;
4278   link_addr[linkcount].target = target;
4279   link_addr[linkcount].ext = ext;
4280   linkcount++;
4281 }
4282
// For a JAL at instruction i: write the return address (PC of the
// instruction after the delay slot) into the host register holding $ra,
// if one is allocated. Optionally pre-populates the mini hash table /
// prefetches the hash-table entry when those features are enabled.
static void ujump_assemble_write_ra(int i)
{
  int rt;
  unsigned int return_address;
  rt=get_reg(branch_regs[i].regmap,31);  // host reg for $ra, or -1
  assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
  //assert(rt>=0);
  return_address=start+i*4+8;  // skip branch + delay slot
  if(rt>=0) {
    #ifdef USE_MINI_HT
    // Insert the return address into the mini hash table so the matching
    // JR $ra can dispatch quickly; skipped if the delay slot writes $ra.
    if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
      int temp=-1; // note: must be ds-safe
      #ifdef HOST_TEMPREG
      temp=HOST_TEMPREG;
      #endif
      if(temp>=0) do_miniht_insert(return_address,rt,temp);
      else emit_movimm(return_address,rt);
    }
    else
    #endif
    {
      #ifdef REG_PREFETCH
      if(temp>=0)
      {
        if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
      }
      #endif
      emit_movimm(return_address,rt); // PC into link register
      #ifdef IMM_PREFETCH
      emit_prefetch(hash_table_get(return_address));
      #endif
    }
  }
}
4317
// Assemble an unconditional jump (J/JAL) at instruction i: run the delay
// slot, write $ra for JAL, settle the cycle count, write back/reload
// registers for the target, then either fall through into an assembled
// delay-slot entry or emit a linkable jump.
void ujump_assemble(int i,struct regstat *i_regs)
{
  int ra_done=0;
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  address_generation(i+1,i_regs,regs[i].regmap_entry);
  #ifdef REG_PREFETCH
  int temp=get_reg(branch_regs[i].regmap,PTEMP);
  if(rt1[i]==31&&temp>=0)
  {
    signed char *i_regmap=i_regs->regmap;
    int return_address=start+i*4+8;
    if(get_reg(branch_regs[i].regmap,31)>0)
    if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
  }
  #endif
  // If the delay slot reads $ra, the link value must be written first.
  if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
    ujump_assemble_write_ra(i); // writeback ra for DS
    ra_done=1;
  }
  ds_assemble(i+1,i_regs);
  // Registers unneeded past the branch (including $ra, freshly written)
  // don't have to be written back before jumping.
  uint64_t bc_unneeded=branch_regs[i].u;
  uint64_t bc_unneeded_upper=branch_regs[i].uu;
  bc_unneeded|=1|(1LL<<rt1[i]);
  bc_unneeded_upper|=1|(1LL<<rt1[i]);
  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                bc_unneeded,bc_unneeded_upper);
  load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
  if(!ra_done&&rt1[i]==31)
    ujump_assemble_write_ra(i);
  int cc,adj;
  cc=get_reg(branch_regs[i].regmap,CCREG);
  assert(cc==HOST_CCREG);
  store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
  #ifdef REG_PREFETCH
  if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
  #endif
  do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
  // Apply any cycle adjustment not already accounted for at the target.
  if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
  load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
  if(internal_branch(branch_regs[i].is32,ba[i]))
    assem_debug("branch: internal\n");
  else
    assem_debug("branch: external\n");
  // Jumping into another branch's delay slot needs a special entry stub.
  if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
    ds_assemble_entry(i);
  }
  else {
    add_to_linker(out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
    emit_jmp(0);
  }
}
4369
// For a JALR at instruction i: write the return address (PC after the
// delay slot) into the host register allocated to the link register
// rt1[i]. The delay slot must not overwrite that register (asserted).
static void rjump_assemble_write_ra(int i)
{
  int rt,return_address;
  assert(rt1[i+1]!=rt1[i]);
  assert(rt2[i+1]!=rt1[i]);
  rt=get_reg(branch_regs[i].regmap,rt1[i]);
  assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
  assert(rt>=0);
  return_address=start+i*4+8;  // skip branch + delay slot
  #ifdef REG_PREFETCH
  if(temp>=0)
  {
    if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
  }
  #endif
  emit_movimm(return_address,rt); // PC into link register
  #ifdef IMM_PREFETCH
  emit_prefetch(hash_table_get(return_address));
  #endif
}
4390
// Assemble a register-indirect jump (JR/JALR) at instruction i: preserve
// the target register across the delay slot if needed, run the delay
// slot, write the link register, check the cycle count, and dispatch via
// the register (optionally through the mini hash table for JR $ra).
void rjump_assemble(int i,struct regstat *i_regs)
{
  int temp;
  int rs,cc;
  int ra_done=0;
  rs=get_reg(branch_regs[i].regmap,rs1[i]);
  assert(rs>=0);
  if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
    // Delay slot abuse, make a copy of the branch address register
    temp=get_reg(branch_regs[i].regmap,RTEMP);
    assert(temp>=0);
    assert(regs[i].regmap[temp]==RTEMP);
    emit_mov(rs,temp);
    rs=temp;
  }
  address_generation(i+1,i_regs,regs[i].regmap_entry);
  #ifdef REG_PREFETCH
  if(rt1[i]==31)
  {
    if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
      signed char *i_regmap=i_regs->regmap;
      int return_address=start+i*4+8;
      if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
    }
  }
  #endif
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    // JR $ra: start preloading the mini-hash-table lookup early.
    int rh=get_reg(regs[i].regmap,RHASH);
    if(rh>=0) do_preload_rhash(rh);
  }
  #endif
  // If the delay slot reads the link register, write it before the slot.
  if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
    rjump_assemble_write_ra(i);
    ra_done=1;
  }
  ds_assemble(i+1,i_regs);
  // Compute which registers are unneeded after the branch; the jump
  // target register rs1 must stay needed.
  uint64_t bc_unneeded=branch_regs[i].u;
  uint64_t bc_unneeded_upper=branch_regs[i].uu;
  bc_unneeded|=1|(1LL<<rt1[i]);
  bc_unneeded_upper|=1|(1LL<<rt1[i]);
  bc_unneeded&=~(1LL<<rs1[i]);
  wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                bc_unneeded,bc_unneeded_upper);
  load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
  if(!ra_done&&rt1[i]!=0)
    rjump_assemble_write_ra(i);
  cc=get_reg(branch_regs[i].regmap,CCREG);
  assert(cc==HOST_CCREG);
  (void)cc;
  #ifdef USE_MINI_HT
  int rh=get_reg(branch_regs[i].regmap,RHASH);
  int ht=get_reg(branch_regs[i].regmap,RHTBL);
  if(rs1[i]==31) {
    if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
    do_preload_rhtbl(ht);
    do_rhash(rs,rh);
  }
  #endif
  store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
  #ifdef DESTRUCTIVE_WRITEBACK
  // Writeback may have clobbered the target register; reload it unless
  // the RTEMP copy is in use.
  if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
    if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
      emit_loadreg(rs1[i],rs);
    }
  }
  #endif
  #ifdef REG_PREFETCH
  if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
  #endif
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    do_miniht_load(ht,rh);
  }
  #endif
  //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
  //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
  //assert(adj==0);
  // Cycle check: expired count goes to the CC stub with the target in rs.
  emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
  add_stub(CC_STUB,out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
  if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
    // special case for RFE
    emit_jmp(0);
  else
    emit_jns(0);
  //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
  #ifdef USE_MINI_HT
  if(rs1[i]==31) {
    do_miniht_jump(rs,rh,ht);
  }
  else
  #endif
  {
    emit_jmp(jump_vaddr_reg[rs]);
  }
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
  #endif
}
4490
4491 void cjump_assemble(int i,struct regstat *i_regs)
4492 {
4493   signed char *i_regmap=i_regs->regmap;
4494   int cc;
4495   int match;
4496   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4497   assem_debug("match=%d\n",match);
4498   int s1h,s1l,s2h,s2l;
4499   int prev_cop1_usable=cop1_usable;
4500   int unconditional=0,nop=0;
4501   int only32=0;
4502   int invert=0;
4503   int internal=internal_branch(branch_regs[i].is32,ba[i]);
4504   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4505   if(!match) invert=1;
4506   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4507   if(i>(ba[i]-start)>>2) invert=1;
4508   #endif
4509
4510   if(ooo[i]) {
4511     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4512     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4513     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4514     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4515   }
4516   else {
4517     s1l=get_reg(i_regmap,rs1[i]);
4518     s1h=get_reg(i_regmap,rs1[i]|64);
4519     s2l=get_reg(i_regmap,rs2[i]);
4520     s2h=get_reg(i_regmap,rs2[i]|64);
4521   }
4522   if(rs1[i]==0&&rs2[i]==0)
4523   {
4524     if(opcode[i]&1) nop=1;
4525     else unconditional=1;
4526     //assert(opcode[i]!=5);
4527     //assert(opcode[i]!=7);
4528     //assert(opcode[i]!=0x15);
4529     //assert(opcode[i]!=0x17);
4530   }
4531   else if(rs1[i]==0)
4532   {
4533     s1l=s2l;s1h=s2h;
4534     s2l=s2h=-1;
4535     only32=(regs[i].was32>>rs2[i])&1;
4536   }
4537   else if(rs2[i]==0)
4538   {
4539     s2l=s2h=-1;
4540     only32=(regs[i].was32>>rs1[i])&1;
4541   }
4542   else {
4543     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
4544   }
4545
4546   if(ooo[i]) {
4547     // Out of order execution (delay slot first)
4548     //printf("OOOE\n");
4549     address_generation(i+1,i_regs,regs[i].regmap_entry);
4550     ds_assemble(i+1,i_regs);
4551     int adj;
4552     uint64_t bc_unneeded=branch_regs[i].u;
4553     uint64_t bc_unneeded_upper=branch_regs[i].uu;
4554     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
4555     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
4556     bc_unneeded|=1;
4557     bc_unneeded_upper|=1;
4558     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4559                   bc_unneeded,bc_unneeded_upper);
4560     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
4561     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4562     cc=get_reg(branch_regs[i].regmap,CCREG);
4563     assert(cc==HOST_CCREG);
4564     if(unconditional)
4565       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4566     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
4567     //assem_debug("cycle count (adj)\n");
4568     if(unconditional) {
4569       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4570       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
4571         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4572         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4573         if(internal)
4574           assem_debug("branch: internal\n");
4575         else
4576           assem_debug("branch: external\n");
4577         if(internal&&is_ds[(ba[i]-start)>>2]) {
4578           ds_assemble_entry(i);
4579         }
4580         else {
4581           add_to_linker(out,ba[i],internal);
4582           emit_jmp(0);
4583         }
4584         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4585         if(((u_int)out)&7) emit_addnop(0);
4586         #endif
4587       }
4588     }
4589     else if(nop) {
4590       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
4591       void *jaddr=out;
4592       emit_jns(0);
4593       add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
4594     }
4595     else {
4596       void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
4597       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
4598       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4599       if(!only32)
4600       {
4601         assert(s1h>=0);
4602         if(opcode[i]==4) // BEQ
4603         {
4604           if(s2h>=0) emit_cmp(s1h,s2h);
4605           else emit_test(s1h,s1h);
4606           nottaken1=out;
4607           emit_jne((void *)1l);
4608         }
4609         if(opcode[i]==5) // BNE
4610         {
4611           if(s2h>=0) emit_cmp(s1h,s2h);
4612           else emit_test(s1h,s1h);
4613           if(invert) taken=out;
4614           else add_to_linker(out,ba[i],internal);
4615           emit_jne(0);
4616         }
4617         if(opcode[i]==6) // BLEZ
4618         {
4619           emit_test(s1h,s1h);
4620           if(invert) taken=out;
4621           else add_to_linker(out,ba[i],internal);
4622           emit_js(0);
4623           nottaken1=out;
4624           emit_jne((void *)1l);
4625         }
4626         if(opcode[i]==7) // BGTZ
4627         {
4628           emit_test(s1h,s1h);
4629           nottaken1=out;
4630           emit_js(1);
4631           if(invert) taken=out;
4632           else add_to_linker(out,ba[i],internal);
4633           emit_jne(0);
4634         }
4635       } // if(!only32)
4636
4637       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4638       assert(s1l>=0);
4639       if(opcode[i]==4) // BEQ
4640       {
4641         if(s2l>=0) emit_cmp(s1l,s2l);
4642         else emit_test(s1l,s1l);
4643         if(invert){
4644           nottaken=out;
4645           emit_jne((void *)1l);
4646         }else{
4647           add_to_linker(out,ba[i],internal);
4648           emit_jeq(0);
4649         }
4650       }
4651       if(opcode[i]==5) // BNE
4652       {
4653         if(s2l>=0) emit_cmp(s1l,s2l);
4654         else emit_test(s1l,s1l);
4655         if(invert){
4656           nottaken=out;
4657           emit_jeq(1);
4658         }else{
4659           add_to_linker(out,ba[i],internal);
4660           emit_jne(0);
4661         }
4662       }
4663       if(opcode[i]==6) // BLEZ
4664       {
4665         emit_cmpimm(s1l,1);
4666         if(invert){
4667           nottaken=out;
4668           emit_jge(1);
4669         }else{
4670           add_to_linker(out,ba[i],internal);
4671           emit_jl(0);
4672         }
4673       }
4674       if(opcode[i]==7) // BGTZ
4675       {
4676         emit_cmpimm(s1l,1);
4677         if(invert){
4678           nottaken=out;
4679           emit_jl(1);
4680         }else{
4681           add_to_linker(out,ba[i],internal);
4682           emit_jge(0);
4683         }
4684       }
4685       if(invert) {
4686         if(taken) set_jump_target(taken, out);
4687         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4688         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
4689           if(adj) {
4690             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
4691             add_to_linker(out,ba[i],internal);
4692           }else{
4693             emit_addnop(13);
4694             add_to_linker(out,ba[i],internal*2);
4695           }
4696           emit_jmp(0);
4697         }else
4698         #endif
4699         {
4700           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
4701           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4702           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4703           if(internal)
4704             assem_debug("branch: internal\n");
4705           else
4706             assem_debug("branch: external\n");
4707           if(internal&&is_ds[(ba[i]-start)>>2]) {
4708             ds_assemble_entry(i);
4709           }
4710           else {
4711             add_to_linker(out,ba[i],internal);
4712             emit_jmp(0);
4713           }
4714         }
4715         set_jump_target(nottaken, out);
4716       }
4717
4718       if(nottaken1) set_jump_target(nottaken1, out);
4719       if(adj) {
4720         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
4721       }
4722     } // (!unconditional)
4723   } // if(ooo)
4724   else
4725   {
4726     // In-order execution (branch first)
4727     //if(likely[i]) printf("IOL\n");
4728     //else
4729     //printf("IOE\n");
4730     void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
4731     if(!unconditional&&!nop) {
4732       if(!only32)
4733       {
4734         assert(s1h>=0);
4735         if((opcode[i]&0x2f)==4) // BEQ
4736         {
4737           if(s2h>=0) emit_cmp(s1h,s2h);
4738           else emit_test(s1h,s1h);
4739           nottaken1=out;
4740           emit_jne((void *)2l);
4741         }
4742         if((opcode[i]&0x2f)==5) // BNE
4743         {
4744           if(s2h>=0) emit_cmp(s1h,s2h);
4745           else emit_test(s1h,s1h);
4746           taken=out;
4747           emit_jne((void *)1l);
4748         }
4749         if((opcode[i]&0x2f)==6) // BLEZ
4750         {
4751           emit_test(s1h,s1h);
4752           taken=out;
4753           emit_js(1);
4754           nottaken1=out;
4755           emit_jne((void *)2l);
4756         }
4757         if((opcode[i]&0x2f)==7) // BGTZ
4758         {
4759           emit_test(s1h,s1h);
4760           nottaken1=out;
4761           emit_js(2);
4762           taken=out;
4763           emit_jne((void *)1l);
4764         }
4765       } // if(!only32)
4766
4767       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4768       assert(s1l>=0);
4769       if((opcode[i]&0x2f)==4) // BEQ
4770       {
4771         if(s2l>=0) emit_cmp(s1l,s2l);
4772         else emit_test(s1l,s1l);
4773         nottaken=out;
4774         emit_jne((void *)2l);
4775       }
4776       if((opcode[i]&0x2f)==5) // BNE
4777       {
4778         if(s2l>=0) emit_cmp(s1l,s2l);
4779         else emit_test(s1l,s1l);
4780         nottaken=out;
4781         emit_jeq(2);
4782       }
4783       if((opcode[i]&0x2f)==6) // BLEZ
4784       {
4785         emit_cmpimm(s1l,1);
4786         nottaken=out;
4787         emit_jge(2);
4788       }
4789       if((opcode[i]&0x2f)==7) // BGTZ
4790       {
4791         emit_cmpimm(s1l,1);
4792         nottaken=out;
4793         emit_jl(2);
4794       }
4795     } // if(!unconditional)
4796     int adj;
4797     uint64_t ds_unneeded=branch_regs[i].u;
4798     uint64_t ds_unneeded_upper=branch_regs[i].uu;
4799     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
4800     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
4801     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
4802     ds_unneeded|=1;
4803     ds_unneeded_upper|=1;
4804     // branch taken
4805     if(!nop) {
4806       if(taken) set_jump_target(taken, out);
4807       assem_debug("1:\n");
4808       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4809                     ds_unneeded,ds_unneeded_upper);
4810       // load regs
4811       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
4812       address_generation(i+1,&branch_regs[i],0);
4813       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
4814       ds_assemble(i+1,&branch_regs[i]);
4815       cc=get_reg(branch_regs[i].regmap,CCREG);
4816       if(cc==-1) {
4817         emit_loadreg(CCREG,cc=HOST_CCREG);
4818         // CHECK: Is the following instruction (fall thru) allocated ok?
4819       }
4820       assert(cc==HOST_CCREG);
4821       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4822       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
4823       assem_debug("cycle count (adj)\n");
4824       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4825       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4826       if(internal)
4827         assem_debug("branch: internal\n");
4828       else
4829         assem_debug("branch: external\n");
4830       if(internal&&is_ds[(ba[i]-start)>>2]) {
4831         ds_assemble_entry(i);
4832       }
4833       else {
4834         add_to_linker(out,ba[i],internal);
4835         emit_jmp(0);
4836       }
4837     }
4838     // branch not taken
4839     cop1_usable=prev_cop1_usable;
4840     if(!unconditional) {
4841       if(nottaken1) set_jump_target(nottaken1, out);
4842       set_jump_target(nottaken, out);
4843       assem_debug("2:\n");
4844       if(!likely[i]) {
4845         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4846                       ds_unneeded,ds_unneeded_upper);
4847         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
4848         address_generation(i+1,&branch_regs[i],0);
4849         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4850         ds_assemble(i+1,&branch_regs[i]);
4851       }
4852       cc=get_reg(branch_regs[i].regmap,CCREG);
4853       if(cc==-1&&!likely[i]) {
4854         // Cycle count isn't in a register, temporarily load it then write it out
4855         emit_loadreg(CCREG,HOST_CCREG);
4856         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
4857         void *jaddr=out;
4858         emit_jns(0);
4859         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
4860         emit_storereg(CCREG,HOST_CCREG);
4861       }
4862       else{
4863         cc=get_reg(i_regmap,CCREG);
4864         assert(cc==HOST_CCREG);
4865         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
4866         void *jaddr=out;
4867         emit_jns(0);
4868         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
4869       }
4870     }
4871   }
4872 }
4873
/* Assemble a MIPS regimm-class conditional branch (BLTZ/BGEZ and, per the
 * FIXME notes below, only partially the BxxZAL link variants) together with
 * its delay slot.  Two code shapes are produced:
 *   - out-of-order (ooo[i]): the delay slot is assembled first, then the
 *     branch test, so the not-taken path falls straight through;
 *   - in-order: the branch test comes first and the delay slot is emitted
 *     on both the taken and (unless branch-likely) not-taken paths.
 * i      - index of the branch instruction within the current block
 * i_regs - register allocation state at this instruction
 */
void sjump_assemble(int i,struct regstat *i_regs)
{
  signed char *i_regmap=i_regs->regmap;
  int cc;
  int match;
  // Does the register mapping at the branch target match the mapping here?
  match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
  assem_debug("smatch=%d\n",match);
  int s1h,s1l;
  int prev_cop1_usable=cop1_usable;
  int unconditional=0,nevertaken=0;
  int only32=0;
  int invert=0;
  int internal=internal_branch(branch_regs[i].is32,ba[i]);
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  // Mapping mismatch: invert the branch sense so the taken path can do
  // the required register writeback/reload before jumping away.
  if(!match) invert=1;
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(i>(ba[i]-start)>>2) invert=1;  // backward branch: invert for the A8 predictor
  #endif

  //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
  //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)

  // Locate host registers holding rs1 (low half, and |64 = upper 32 bits
  // of the 64-bit guest register).  When the delay slot runs first (ooo),
  // the post-delay-slot mapping applies.
  if(ooo[i]) {
    s1l=get_reg(branch_regs[i].regmap,rs1[i]);
    s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
  }
  else {
    s1l=get_reg(i_regmap,rs1[i]);
    s1h=get_reg(i_regmap,rs1[i]|64);
  }
  // r0 is hard-wired to zero: BGEZ-type (odd opcode2) is always taken,
  // BLTZ-type is never taken.
  if(rs1[i]==0)
  {
    if(opcode2[i]&1) unconditional=1;
    else nevertaken=1;
    // These are never taken (r0 is never less than zero)
    //assert(opcode2[i]!=0);
    //assert(opcode2[i]!=2);
    //assert(opcode2[i]!=0x10);
    //assert(opcode2[i]!=0x12);
  }
  else {
    // Source known to be 32-bit: the sign test only needs the low half.
    only32=(regs[i].was32>>rs1[i])&1;
  }

  if(ooo[i]) {
    // Out of order execution (delay slot first)
    //printf("OOOE\n");
    address_generation(i+1,i_regs,regs[i].regmap_entry);
    ds_assemble(i+1,i_regs);
    int adj;
    // Registers not needed after the branch; bit set => unneeded.
    // The branch's own sources must stay live; bit 0 (r0) is always set.
    uint64_t bc_unneeded=branch_regs[i].u;
    uint64_t bc_unneeded_upper=branch_regs[i].uu;
    bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
    bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
    bc_unneeded|=1;
    bc_unneeded_upper|=1;
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                  bc_unneeded,bc_unneeded_upper);
    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
    // Link variants (BLTZAL/BGEZAL): write the return address to r31
    // unconditionally, matching the architectural "link always" behavior.
    if(rt1[i]==31) {
      int rt,return_address;
      rt=get_reg(branch_regs[i].regmap,31);
      assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      if(rt>=0) {
        // Save the PC even if the branch is not taken
        return_address=start+i*4+8;
        emit_movimm(return_address,rt); // PC into link register
        #ifdef IMM_PREFETCH
        if(!nevertaken) emit_prefetch(hash_table_get(return_address));
        #endif
      }
    }
    cc=get_reg(branch_regs[i].regmap,CCREG);
    assert(cc==HOST_CCREG);
    if(unconditional)
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
    //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
    assem_debug("cycle count (adj)\n");
    if(unconditional) {
      // Always taken (BGEZ r0 etc.): emit a straight jump to the target.
      do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
      // Skip the jump entirely for a pure idle loop (branch-to-self with
      // a NOP delay slot).
      if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
        if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
        load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
        if(internal)
          assem_debug("branch: internal\n");
        else
          assem_debug("branch: external\n");
        if(internal&&is_ds[(ba[i]-start)>>2]) {
          ds_assemble_entry(i);
        }
        else {
          add_to_linker(out,ba[i],internal);
          emit_jmp(0);
        }
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(((u_int)out)&7) emit_addnop(0);
        #endif
      }
    }
    else if(nevertaken) {
      // Never taken (BLTZ r0): only account for cycles and test for an
      // expired cycle count via the CC stub.
      emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
      void *jaddr=out;
      emit_jns(0);
      add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
    }
    else {
      // Conditional case: test the sign of rs1.
      void *nottaken = NULL;
      do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
      if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      if(!only32)
      {
        // 64-bit value: the sign lives in the upper half.
        assert(s1h>=0);
        if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
        {
          emit_test(s1h,s1h);
          if(invert){
            nottaken=out;
            emit_jns(1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_js(0);
          }
        }
        if((opcode2[i]&0xf)==1) // BGEZ/BLTZAL
        {
          emit_test(s1h,s1h);
          if(invert){
            nottaken=out;
            emit_js(1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_jns(0);
          }
        }
      } // if(!only32)
      else
      {
        // 32-bit value: test the low half directly.
        assert(s1l>=0);
        if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
        {
          emit_test(s1l,s1l);
          if(invert){
            nottaken=out;
            emit_jns(1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_js(0);
          }
        }
        if((opcode2[i]&0xf)==1) // BGEZ/BLTZAL
        {
          emit_test(s1l,s1l);
          if(invert){
            nottaken=out;
            emit_js(1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_jns(0);
          }
        }
      } // if(!only32)

      if(invert) {
        // Inverted branch: the straight-line code is the taken path; emit
        // writeback/reload for the target, jump there, then patch the
        // inverted conditional to land here (the not-taken continuation).
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
          if(adj) {
            emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
            add_to_linker(out,ba[i],internal);
          }else{
            emit_addnop(13);
            add_to_linker(out,ba[i],internal*2);
          }
          emit_jmp(0);
        }else
        #endif
        {
          if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
          store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
          load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
          if(internal)
            assem_debug("branch: internal\n");
          else
            assem_debug("branch: external\n");
          if(internal&&is_ds[(ba[i]-start)>>2]) {
            ds_assemble_entry(i);
          }
          else {
            add_to_linker(out,ba[i],internal);
            emit_jmp(0);
          }
        }
        set_jump_target(nottaken, out);
      }

      if(adj) {
        // Undo the early cycle adjustment on the fall-through path.
        if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
      }
    } // (!unconditional)
  } // if(ooo)
  else
  {
    // In-order execution (branch first)
    //printf("IOE\n");
    void *nottaken = NULL;
    if(rt1[i]==31) {
      int rt,return_address;
      rt=get_reg(branch_regs[i].regmap,31);
      if(rt>=0) {
        // Save the PC even if the branch is not taken
        return_address=start+i*4+8;
        emit_movimm(return_address,rt); // PC into link register
        #ifdef IMM_PREFETCH
        emit_prefetch(hash_table_get(return_address));
        #endif
      }
    }
    if(!unconditional) {
      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      // Emit the (non-inverted) condition test; jump over the taken path
      // when the condition fails.
      if(!only32)
      {
        assert(s1h>=0);
        if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
        {
          emit_test(s1h,s1h);
          nottaken=out;
          emit_jns(1);
        }
        if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
        {
          emit_test(s1h,s1h);
          nottaken=out;
          emit_js(1);
        }
      } // if(!only32)
      else
      {
        assert(s1l>=0);
        if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
        {
          emit_test(s1l,s1l);
          nottaken=out;
          emit_jns(1);
        }
        if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
        {
          emit_test(s1l,s1l);
          nottaken=out;
          emit_js(1);
        }
      }
    } // if(!unconditional)
    int adj;
    // Registers unneeded after the delay slot (the delay slot's own
    // sources are kept); dep1/dep2 upper halves stay live if the delay
    // slot's destination still needs its upper word.
    uint64_t ds_unneeded=branch_regs[i].u;
    uint64_t ds_unneeded_upper=branch_regs[i].uu;
    ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
    ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
    if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
    ds_unneeded|=1;
    ds_unneeded_upper|=1;
    // branch taken
    if(!nevertaken) {
      //assem_debug("1:\n");
      wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                    ds_unneeded,ds_unneeded_upper);
      // load regs
      load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
      address_generation(i+1,&branch_regs[i],0);
      load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
      ds_assemble(i+1,&branch_regs[i]);
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1) {
        emit_loadreg(CCREG,cc=HOST_CCREG);
        // CHECK: Is the following instruction (fall thru) allocated ok?
      }
      assert(cc==HOST_CCREG);
      store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
      do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
      assem_debug("cycle count (adj)\n");
      if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
      if(internal)
        assem_debug("branch: internal\n");
      else
        assem_debug("branch: external\n");
      if(internal&&is_ds[(ba[i]-start)>>2]) {
        ds_assemble_entry(i);
      }
      else {
        add_to_linker(out,ba[i],internal);
        emit_jmp(0);
      }
    }
    // branch not taken
    // Restore cop1 state: anything the taken-path delay slot did to it
    // does not apply on this path.
    cop1_usable=prev_cop1_usable;
    if(!unconditional) {
      set_jump_target(nottaken, out);
      assem_debug("1:\n");
      // Branch-likely nullifies the delay slot when not taken, so only
      // re-assemble it here for ordinary branches.
      if(!likely[i]) {
        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                      ds_unneeded,ds_unneeded_upper);
        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
        address_generation(i+1,&branch_regs[i],0);
        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
        ds_assemble(i+1,&branch_regs[i]);
      }
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1&&!likely[i]) {
        // Cycle count isn't in a register, temporarily load it then write it out
        emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
        emit_storereg(CCREG,HOST_CCREG);
      }
      else{
        cc=get_reg(i_regmap,CCREG);
        assert(cc==HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
      }
    }
  }
}
5201
/* Assemble a COP1 conditional branch (BC1T/BC1F) plus its delay slot.
 * The condition is bit 0x800000 of the value held in FSREG (tested with
 * emit_testimm below); source[i]&0x10000 distinguishes BC1T from BC1F.
 * A coprocessor-usable check (FP_STUB) is emitted first if cop1 has not
 * yet been proven usable in this block.
 * i      - index of the branch instruction within the current block
 * i_regs - register allocation state at this instruction
 */
void fjump_assemble(int i,struct regstat *i_regs)
{
  signed char *i_regmap=i_regs->regmap;
  int cc;
  int match;
  // Does the register mapping at the branch target match the mapping here?
  match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
  assem_debug("fmatch=%d\n",match);
  int fs,cs;
  void *eaddr;
  int invert=0;
  int internal=internal_branch(branch_regs[i].is32,ba[i]);
  if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
  // Mapping mismatch: invert the branch so the taken path can write back
  // and reload registers before jumping.
  if(!match) invert=1;
  #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
  if(i>(ba[i]-start)>>2) invert=1;  // backward branch: invert for the A8 predictor
  #endif

  // Host register holding the FP status/condition word.
  if(ooo[i]) {
    fs=get_reg(branch_regs[i].regmap,FSREG);
    address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
  }
  else {
    fs=get_reg(i_regmap,FSREG);
  }

  // Check cop1 unusable
  // Tests the CU1 bit (0x20000000) of the status register held in CSREG;
  // jumps to an FP_STUB exception path if the coprocessor is disabled.
  if(!cop1_usable) {
    cs=get_reg(i_regmap,CSREG);
    assert(cs>=0);
    emit_testimm(cs,0x20000000);
    eaddr=out;
    emit_jeq(0);
    add_stub_r(FP_STUB,eaddr,out,i,cs,i_regs,0,0);
    cop1_usable=1;
  }

  if(ooo[i]) {
    // Out of order execution (delay slot first)
    //printf("OOOE\n");
    ds_assemble(i+1,i_regs);
    int adj;
    // Registers not needed after the branch; bit set => unneeded.
    uint64_t bc_unneeded=branch_regs[i].u;
    uint64_t bc_unneeded_upper=branch_regs[i].uu;
    bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
    bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
    bc_unneeded|=1;
    bc_unneeded_upper|=1;
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                  bc_unneeded,bc_unneeded_upper);
    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
    cc=get_reg(branch_regs[i].regmap,CCREG);
    assert(cc==HOST_CCREG);
    do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
    assem_debug("cycle count (adj)\n");
    // NOTE(review): the if(1) wrappers below look like scaffolding left
    // over from a more general (unconditional-capable) template.
    if(1) {
      void *nottaken = NULL;
      if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
      if(1) {
        assert(fs>=0);
        // Test the FP condition bit.
        emit_testimm(fs,0x800000);
        if(source[i]&0x10000) // BC1T
        {
          if(invert){
            nottaken=out;
            emit_jeq(1);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_jne(0);
          }
        }
        else // BC1F
          if(invert){
            nottaken=out;
            emit_jne((void *)1l);
          }else{
            add_to_linker(out,ba[i],internal);
            emit_jeq(0);
          }
        {
        // (empty leftover block — no effect)
        }
      } // if(!only32)

      if(invert) {
        // Inverted branch: straight-line code is the taken path; write
        // back/reload for the target, jump, then patch the inverted
        // conditional to land here (the not-taken continuation).
        if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
        #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
        else if(match) emit_addnop(13);
        #endif
        store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
        load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
        if(internal)
          assem_debug("branch: internal\n");
        else
          assem_debug("branch: external\n");
        if(internal&&is_ds[(ba[i]-start)>>2]) {
          ds_assemble_entry(i);
        }
        else {
          add_to_linker(out,ba[i],internal);
          emit_jmp(0);
        }
        set_jump_target(nottaken, out);
      }

      if(adj) {
        // Undo the early cycle adjustment on the fall-through path.
        if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
      }
    } // (!unconditional)
  } // if(ooo)
  else
  {
    // In-order execution (branch first)
    //printf("IOE\n");
    void *nottaken = NULL;
    if(1) {
      //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
      if(1) {
        assert(fs>=0);
        // Test the FP condition bit; fall through to the taken path,
        // jump over it when the condition fails.
        emit_testimm(fs,0x800000);
        if(source[i]&0x10000) // BC1T
        {
          nottaken=out;
          emit_jeq(1);
        }
        else // BC1F
        {
          nottaken=out;
          emit_jne((void *)1l);
        }
      }
    } // if(!unconditional)
    int adj;
    // Registers unneeded after the delay slot (its own sources kept live).
    uint64_t ds_unneeded=branch_regs[i].u;
    uint64_t ds_unneeded_upper=branch_regs[i].uu;
    ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
    ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
    if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
    ds_unneeded|=1;
    ds_unneeded_upper|=1;
    // branch taken
    //assem_debug("1:\n");
    wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                  ds_unneeded,ds_unneeded_upper);
    // load regs
    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
    address_generation(i+1,&branch_regs[i],0);
    load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
    ds_assemble(i+1,&branch_regs[i]);
    cc=get_reg(branch_regs[i].regmap,CCREG);
    if(cc==-1) {
      emit_loadreg(CCREG,cc=HOST_CCREG);
      // CHECK: Is the following instruction (fall thru) allocated ok?
    }
    assert(cc==HOST_CCREG);
    store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
    do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
    assem_debug("cycle count (adj)\n");
    if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
    load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
    if(internal)
      assem_debug("branch: internal\n");
    else
      assem_debug("branch: external\n");
    if(internal&&is_ds[(ba[i]-start)>>2]) {
      ds_assemble_entry(i);
    }
    else {
      add_to_linker(out,ba[i],internal);
      emit_jmp(0);
    }

    // branch not taken
    if(1) { // <- FIXME (don't need this)
      set_jump_target(nottaken, out);
      assem_debug("1:\n");
      // Branch-likely nullifies the delay slot when not taken.
      if(!likely[i]) {
        wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
                      ds_unneeded,ds_unneeded_upper);
        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
        address_generation(i+1,&branch_regs[i],0);
        load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
        ds_assemble(i+1,&branch_regs[i]);
      }
      cc=get_reg(branch_regs[i].regmap,CCREG);
      if(cc==-1&&!likely[i]) {
        // Cycle count isn't in a register, temporarily load it then write it out
        emit_loadreg(CCREG,HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
        emit_storereg(CCREG,HOST_CCREG);
      }
      else{
        cc=get_reg(i_regmap,CCREG);
        assert(cc==HOST_CCREG);
        emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
        void *jaddr=out;
        emit_jns(0);
        add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
      }
    }
  }
}
5406
/* Assemble a branch whose delay slot spans into the next virtual page.
 * Instead of emitting taken/not-taken paths with inline delay slots, this
 * computes the eventual branch target into HOST_BTREG (branch_target) and
 * jumps to the delay-slot entry stub (see pagespan_ds), mostly using
 * conditional moves so no real branching is needed for the common cases.
 * i      - index of the branch instruction within the current block
 * i_regs - register allocation state at this instruction
 */
static void pagespan_assemble(int i,struct regstat *i_regs)
{
  // Host registers holding rs1/rs2 (low halves and |64 upper halves).
  int s1l=get_reg(i_regs->regmap,rs1[i]);
  int s1h=get_reg(i_regs->regmap,rs1[i]|64);
  int s2l=get_reg(i_regs->regmap,rs2[i]);
  int s2h=get_reg(i_regs->regmap,rs2[i]|64);
  void *taken = NULL;
  void *nottaken = NULL;
  int unconditional=0;
  // Normalize the zero register: fold r0 operands away so the compare
  // code below can treat a negative register index as "compare with 0".
  if(rs1[i]==0)
  {
    s1l=s2l;s1h=s2h;
    s2l=s2h=-1;
  }
  else if(rs2[i]==0)
  {
    s2l=s2h=-1;
  }
  // Both operands known 32-bit: no upper-half comparison needed.
  if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
    s1h=s2h=-1;
  }
  // Pick scratch host registers for the target address computation:
  // addr   - where the chosen target ends up (prefer HOST_BTREG if free)
  // alt    - alternate target for cmov-based selection
  // ntaddr - extra register needed by BLEZ/BGTZ three-way selection
  int hr=0;
  int addr=-1,alt=-1,ntaddr=-1;
  if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
  else {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
         (i_regs->regmap[hr]&63)!=rs1[i] &&
         (i_regs->regmap[hr]&63)!=rs2[i] )
      {
        addr=hr++;break;
      }
      hr++;
    }
  }
  while(hr<HOST_REGS)
  {
    if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
       (i_regs->regmap[hr]&63)!=rs1[i] &&
       (i_regs->regmap[hr]&63)!=rs2[i] )
    {
      alt=hr++;break;
    }
    hr++;
  }
  if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
  {
    while(hr<HOST_REGS)
    {
      if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
         (i_regs->regmap[hr]&63)!=rs1[i] &&
         (i_regs->regmap[hr]&63)!=rs2[i] )
      {
        ntaddr=hr;break;
      }
      hr++;
    }
  }
  assert(hr<HOST_REGS);
  if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
    load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
  }
  // Charge the cycles for this instruction pair up front.
  emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
  if(opcode[i]==2) // J
  {
    unconditional=1;
  }
  if(opcode[i]==3) // JAL
  {
    // TODO: mini_ht
    int rt=get_reg(i_regs->regmap,31);
    emit_movimm(start+i*4+8,rt);
    unconditional=1;
  }
  if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
  {
    // Register-indirect target: just copy it into addr.
    emit_mov(s1l,addr);
    if(opcode2[i]==9) // JALR
    {
      int rt=get_reg(i_regs->regmap,rt1[i]);
      emit_movimm(start+i*4+8,rt);
    }
  }
  if((opcode[i]&0x3f)==4) // BEQ
  {
    if(rs1[i]==rs2[i])
    {
      unconditional=1;
    }
    else
    #ifdef HAVE_CMOV_IMM
    if(s1h<0) {
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
    }
    else
    #endif
    {
      // Select target via cmov: start with taken address in addr,
      // replace it with the fall-through address on any mismatch
      // (upper half first for 64-bit values).
      assert(s1l>=0);
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      if(s1h>=0) {
        if(s2h>=0) emit_cmp(s1h,s2h);
        else emit_test(s1h,s1h);
        emit_cmovne_reg(alt,addr);
      }
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmovne_reg(alt,addr);
    }
  }
  if((opcode[i]&0x3f)==5) // BNE
  {
    #ifdef HAVE_CMOV_IMM
    if(s1h<0) {
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
    }
    else
    #endif
    {
      // Mirror of BEQ: fall-through address first, cmov to the taken
      // address on any half mismatching.
      assert(s1l>=0);
      emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
      if(s1h>=0) {
        if(s2h>=0) emit_cmp(s1h,s2h);
        else emit_test(s1h,s1h);
        emit_cmovne_reg(alt,addr);
      }
      if(s2l>=0) emit_cmp(s1l,s2l);
      else emit_test(s1l,s1l);
      emit_cmovne_reg(alt,addr);
    }
  }
  // Branch-likely forms use real jumps (nottaken/taken labels) because
  // the delay slot must be nullified when not taken.
  if((opcode[i]&0x3f)==0x14) // BEQL
  {
    if(s1h>=0) {
      if(s2h>=0) emit_cmp(s1h,s2h);
      else emit_test(s1h,s1h);
      nottaken=out;
      emit_jne(0);
    }
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    if(nottaken) set_jump_target(nottaken, out);
    nottaken=out;
    emit_jne(0);
  }
  if((opcode[i]&0x3f)==0x15) // BNEL
  {
    if(s1h>=0) {
      if(s2h>=0) emit_cmp(s1h,s2h);
      else emit_test(s1h,s1h);
      taken=out;
      emit_jne(0);
    }
    if(s2l>=0) emit_cmp(s1l,s2l);
    else emit_test(s1l,s1l);
    nottaken=out;
    emit_jeq(0);
    if(taken) set_jump_target(taken, out);
  }
  if((opcode[i]&0x3f)==6) // BLEZ
  {
    // Three-way selection: <=0 taken, >0 not taken; for 64-bit values
    // the upper half decides unless it is zero.
    emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
    emit_cmpimm(s1l,1);
    if(s1h>=0) emit_mov(addr,ntaddr);
    emit_cmovl_reg(alt,addr);
    if(s1h>=0) {
      emit_test(s1h,s1h);
      emit_cmovne_reg(ntaddr,addr);
      emit_cmovs_reg(alt,addr);
    }
  }
  if((opcode[i]&0x3f)==7) // BGTZ
  {
    emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
    emit_cmpimm(s1l,1);
    if(s1h>=0) emit_mov(addr,alt);
    emit_cmovl_reg(ntaddr,addr);
    if(s1h>=0) {
      emit_test(s1h,s1h);
      emit_cmovne_reg(alt,addr);
      emit_cmovs_reg(ntaddr,addr);
    }
  }
  if((opcode[i]&0x3f)==0x16) // BLEZL
  {
    assert((opcode[i]&0x3f)!=0x16);
  }
  if((opcode[i]&0x3f)==0x17) // BGTZL
  {
    assert((opcode[i]&0x3f)!=0x17);
  }
  assert(opcode[i]!=1); // BLTZ/BGEZ

  //FIXME: Check CSREG
  if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
    // COP1 branches: select target on the FP condition bit (0x800000).
    if((source[i]&0x30000)==0) // BC1F
    {
      emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
      emit_testimm(s1l,0x800000);
      emit_cmovne_reg(alt,addr);
    }
    if((source[i]&0x30000)==0x10000) // BC1T
    {
      emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
      emit_testimm(s1l,0x800000);
      emit_cmovne_reg(alt,addr);
    }
    if((source[i]&0x30000)==0x20000) // BC1FL
    {
      emit_testimm(s1l,0x800000);
      nottaken=out;
      emit_jne(0);
    }
    if((source[i]&0x30000)==0x30000) // BC1TL
    {
      emit_testimm(s1l,0x800000);
      nottaken=out;
      emit_jeq(0);
    }
  }

  assert(i_regs->regmap[HOST_CCREG]==CCREG);
  wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
  // Put the resolved target into HOST_BTREG (branch_target) for the
  // delay-slot entry to consume.
  if(likely[i]||unconditional)
  {
    emit_movimm(ba[i],HOST_BTREG);
  }
  else if(addr!=HOST_BTREG)
  {
    emit_mov(addr,HOST_BTREG);
  }
  void *branch_addr=out;
  emit_jmp(0);
  // NOTE(review): start+i*4+5 is start+i*4+4 with bit 0 set — presumably
  // the low bit tags a delay-slot entry (cf. vaddr=start+1 in
  // pagespan_ds); confirm against get_page/check_addr conventions.
  int target_addr=start+i*4+5;
  void *stub=out;
  void *compiled_target_addr=check_addr(target_addr);
  emit_extjump_ds(branch_addr, target_addr);
  if(compiled_target_addr) {
    // Already compiled: jump straight there and register for linking.
    set_jump_target(branch_addr, compiled_target_addr);
    add_link(target_addr,stub);
  }
  else set_jump_target(branch_addr, stub);
  if(likely[i]) {
    // Not-taken path
    // Branch-likely: delay slot is skipped, continue at the next
    // instruction pair (start+i*4+8) via the same stub mechanism.
    set_jump_target(nottaken, out);
    wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
    void *branch_addr=out;
    emit_jmp(0);
    int target_addr=start+i*4+8;
    void *stub=out;
    void *compiled_target_addr=check_addr(target_addr);
    emit_extjump_ds(branch_addr, target_addr);
    if(compiled_target_addr) {
      set_jump_target(branch_addr, compiled_target_addr);
      add_link(target_addr,stub);
    }
    else set_jump_target(branch_addr, stub);
  }
}
5670
// Assemble the delay slot for the above
// Compiles the single instruction at the start of this block as a delay
// slot (entered when a branch spanning a page boundary jumps here), then
// dispatches to the real branch target held in BTREG.
static void pagespan_ds()
{
  assem_debug("initial delay slot:\n");
  u_int vaddr=start+1; // +1 marks this as the delay-slot entry point,
                       // distinct from the normal entry at 'start'
                       // (NOTE(review): assumed from usage — confirm)
  u_int page=get_page(vaddr);
  u_int vpage=get_vpage(vaddr);
  // Register this entry in the dirty-check and jump-in lookup lists
  ll_add(jump_dirty+vpage,vaddr,(void *)out);
  do_dirty_stub_ds();
  ll_add(jump_in+page,vaddr,(void *)out);
  assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
  // If the cycle count won't stay in its host register, write it back now
  if(regs[0].regmap[HOST_CCREG]!=CCREG)
    wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
  // If the branch target won't stay in its host register, spill it to memory
  if(regs[0].regmap[HOST_BTREG]!=BTREG)
    emit_writeword(HOST_BTREG,&branch_target);
  // Load the delay-slot instruction's source registers and generate addresses
  load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
  address_generation(0,&regs[0],regs[0].regmap_entry);
  // Stores also need the invalid-code pointer for self-modifying-code checks
  if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
    load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
  cop1_usable=0;
  is_delayslot=0;
  // Assemble the delay-slot instruction itself, by type
  switch(itype[0]) {
    case ALU:
      alu_assemble(0,&regs[0]);break;
    case IMM16:
      imm16_assemble(0,&regs[0]);break;
    case SHIFT:
      shift_assemble(0,&regs[0]);break;
    case SHIFTIMM:
      shiftimm_assemble(0,&regs[0]);break;
    case LOAD:
      load_assemble(0,&regs[0]);break;
    case LOADLR:
      loadlr_assemble(0,&regs[0]);break;
    case STORE:
      store_assemble(0,&regs[0]);break;
    case STORELR:
      storelr_assemble(0,&regs[0]);break;
    case COP0:
      cop0_assemble(0,&regs[0]);break;
    case COP1:
      cop1_assemble(0,&regs[0]);break;
    case C1LS:
      c1ls_assemble(0,&regs[0]);break;
    case COP2:
      cop2_assemble(0,&regs[0]);break;
    case C2LS:
      c2ls_assemble(0,&regs[0]);break;
    case C2OP:
      c2op_assemble(0,&regs[0]);break;
    case FCONV:
      fconv_assemble(0,&regs[0]);break;
    case FLOAT:
      float_assemble(0,&regs[0]);break;
    case FCOMP:
      fcomp_assemble(0,&regs[0]);break;
    case MULTDIV:
      multdiv_assemble(0,&regs[0]);break;
    case MOV:
      mov_assemble(0,&regs[0]);break;
    case SYSCALL:
    case HLECALL:
    case INTCALL:
    case SPAN:
    case UJUMP:
    case RJUMP:
    case CJUMP:
    case SJUMP:
    case FJUMP:
      SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
  }
  // Locate the branch target: from the host register mapped to BTREG,
  // or reload it from memory into any free register
  int btaddr=get_reg(regs[0].regmap,BTREG);
  if(btaddr<0) {
    btaddr=get_reg(regs[0].regmap,-1);
    emit_readword(&branch_target,btaddr);
  }
  assert(btaddr!=HOST_CCREG);
  if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
  // Compare the branch target against start+4 (the instruction after the
  // delay slot): equal means fall through into this block's own code
#ifdef HOST_IMM8
  // Host can't encode a 32-bit immediate compare; use a temp register
  emit_movimm(start+4,HOST_TEMPREG);
  emit_cmp(btaddr,HOST_TEMPREG);
#else
  emit_cmpimm(btaddr,start+4);
#endif
  void *branch = out;
  emit_jeq(0); // placeholder target, patched below
  // Target is elsewhere: flush dirty regs and jump indirectly via the
  // per-register virtual-address dispatcher
  store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
  emit_jmp(jump_vaddr_reg[btaddr]);
  // Fall-through path: continue at start+4 within this block
  set_jump_target(branch, out);
  store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
  load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
}
5763
// Basic liveness analysis for MIPS registers
// Walks instructions [istart,iend] backwards and computes, per instruction,
// bitmaps of registers whose values are NOT needed by any later code:
//   unneeded_reg[i]       - lower 32 bits of MIPS reg r (bit r set = unneeded)
//   unneeded_reg_upper[i] - upper 32 bits (tracked separately for 64-bit ops)
//   gte_unneeded[i]       - GTE (COP2) registers
// A written register becomes unneeded before the write; a read register
// becomes needed.  'r' is the recursion depth, used to bound re-analysis
// of backward-branch targets (three levels deep maximum).
void unneeded_registers(int istart,int iend,int r)
{
  int i;
  uint64_t u,uu,gte_u,b,bu,gte_bu;
  uint64_t temp_u,temp_uu,temp_gte_u=0;
  uint64_t tdep;
  uint64_t gte_u_unknown=0;
  if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
    gte_u_unknown=~0ll; // hack: assume all GTE regs dead at unknown exits
  if(iend==slen-1) {
    // End of block: nothing after it in this block needs anything
    // (bit 0 / R0 is always set, see "R0 is always unneeded" below)
    u=1;uu=1;
    gte_u=gte_u_unknown;
  }else{
    u=unneeded_reg[iend+1];
    uu=unneeded_reg_upper[iend+1];
    u=1;uu=1; // NOTE(review): discards the two loads just above, keeping only
              // the conservative "everything needed" state; presumably an
              // intentional step of the 64-bit rework — confirm
    gte_u=gte_unneeded[iend+1];
  }

  for (i=iend;i>=istart;i--)
  {
    //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
    if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
    {
      // If subroutine call, flag return address as a possible branch target
      if(rt1[i]==31 && i<slen-2) bt[i+2]=1;

      if(ba[i]<start || ba[i]>=(start+slen*4))
      {
        // Branch out of this block, flush all regs
        u=1;
        uu=1;
        gte_u=gte_u_unknown;
        /* Hexagon hack
        if(itype[i]==UJUMP&&rt1[i]==31)
        {
          uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
        }
        if(itype[i]==RJUMP&&rs1[i]==31)
        {
          uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
        }
        if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
          if(itype[i]==UJUMP&&rt1[i]==31)
          {
            //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
            uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
          }
          if(itype[i]==RJUMP&&rs1[i]==31)
          {
            //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
            uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
          }
        }*/
        branch_unneeded_reg[i]=u;
        branch_unneeded_reg_upper[i]=uu;
        // Merge in delay slot
        // tdep=1 when the delay slot's target upper half is still needed,
        // in which case its dependency sources' upper halves are needed too
        tdep=(~uu>>rt1[i+1])&1;
        u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);   // written regs become unneeded
        uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
        u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1])); // read regs become needed
        uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
        uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
        u|=1;uu|=1;
        gte_u|=gte_rt[i+1];
        gte_u&=~gte_rs[i+1];
        // If branch is "likely" (and conditional)
        // then we skip the delay slot on the fall-thru path
        if(likely[i]) {
          if(i<slen-1) {
            u&=unneeded_reg[i+2];
            uu&=unneeded_reg_upper[i+2];
            gte_u&=gte_unneeded[i+2];
          }
          else
          {
            u=1;
            uu=1;
            gte_u=gte_u_unknown;
          }
        }
      }
      else
      {
        // Internal branch, flag target
        bt[(ba[i]-start)>>2]=1;
        if(ba[i]<=start+i*4) {
          // Backward branch
          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
          {
            // Unconditional branch
            temp_u=1;temp_uu=1;
            temp_gte_u=0;
          } else {
            // Conditional branch (not taken case)
            temp_u=unneeded_reg[i+2];
            temp_uu=unneeded_reg_upper[i+2];
            temp_gte_u&=gte_unneeded[i+2];
            // NOTE(review): '&=' here (vs '=' for temp_u/temp_uu) keeps the
            // stale temp_gte_u — conservative (marks fewer regs unneeded),
            // so safe, but possibly unintended — confirm
          }
          // Merge in delay slot
          tdep=(~temp_uu>>rt1[i+1])&1;
          temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
          temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
          temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
          temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
          temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
          temp_u|=1;temp_uu|=1;
          temp_gte_u|=gte_rt[i+1];
          temp_gte_u&=~gte_rs[i+1];
          // If branch is "likely" (and conditional)
          // then we skip the delay slot on the fall-thru path
          if(likely[i]) {
            if(i<slen-1) {
              temp_u&=unneeded_reg[i+2];
              temp_uu&=unneeded_reg_upper[i+2];
              temp_gte_u&=gte_unneeded[i+2];
            }
            else
            {
              temp_u=1;
              temp_uu=1;
              temp_gte_u=gte_u_unknown;
            }
          }
          // Merge in the branch instruction itself
          tdep=(~temp_uu>>rt1[i])&1;
          temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
          temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
          temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
          temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
          temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
          temp_u|=1;temp_uu|=1;
          temp_gte_u|=gte_rt[i];
          temp_gte_u&=~gte_rs[i];
          unneeded_reg[i]=temp_u;
          unneeded_reg_upper[i]=temp_uu;
          gte_unneeded[i]=temp_gte_u;
          // Only go three levels deep.  This recursion can take an
          // excessive amount of time if there are a lot of nested loops.
          if(r<2) {
            unneeded_registers((ba[i]-start)>>2,i-1,r+1);
          }else{
            // Recursion limit hit: assume everything needed at the target
            unneeded_reg[(ba[i]-start)>>2]=1;
            unneeded_reg_upper[(ba[i]-start)>>2]=1;
            gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
          }
        } /*else*/ if(1) {
          // Forward (or re-processed backward) internal branch: propagate
          // the target's liveness back through this branch
          if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
          {
            // Unconditional branch
            u=unneeded_reg[(ba[i]-start)>>2];
            uu=unneeded_reg_upper[(ba[i]-start)>>2];
            gte_u=gte_unneeded[(ba[i]-start)>>2];
            branch_unneeded_reg[i]=u;
            branch_unneeded_reg_upper[i]=uu;
        //u=1;
        //uu=1;
        //branch_unneeded_reg[i]=u;
        //branch_unneeded_reg_upper[i]=uu;
            // Merge in delay slot
            tdep=(~uu>>rt1[i+1])&1;
            u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
            uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
            u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
            uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
            uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
            u|=1;uu|=1;
            gte_u|=gte_rt[i+1];
            gte_u&=~gte_rs[i+1];
          } else {
            // Conditional branch: taken-path liveness in b/bu/gte_bu
            b=unneeded_reg[(ba[i]-start)>>2];
            bu=unneeded_reg_upper[(ba[i]-start)>>2];
            gte_bu=gte_unneeded[(ba[i]-start)>>2];
            branch_unneeded_reg[i]=b;
            branch_unneeded_reg_upper[i]=bu;
        //b=1;
        //bu=1;
        //branch_unneeded_reg[i]=b;
        //branch_unneeded_reg_upper[i]=bu;
            // Branch delay slot
            tdep=(~uu>>rt1[i+1])&1;
            b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
            bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
            b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
            bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
            bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
            b|=1;bu|=1;
            gte_bu|=gte_rt[i+1];
            gte_bu&=~gte_rs[i+1];
            // If branch is "likely" then we skip the
            // delay slot on the fall-thru path
            if(likely[i]) {
              u=b;
              uu=bu;
              gte_u=gte_bu;
              if(i<slen-1) {
                u&=unneeded_reg[i+2];
                uu&=unneeded_reg_upper[i+2];
                gte_u&=gte_unneeded[i+2];
        //u=1;
        //uu=1;
              }
            } else {
              // Unneeded only if unneeded on BOTH paths (taken & fall-thru)
              u&=b;
              uu&=bu;
              gte_u&=gte_bu;
        //u=1;
        //uu=1;
            }
            if(i<slen-1) {
              branch_unneeded_reg[i]&=unneeded_reg[i+2];
              branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
        //branch_unneeded_reg[i]=1;
        //branch_unneeded_reg_upper[i]=1;
            } else {
              branch_unneeded_reg[i]=1;
              branch_unneeded_reg_upper[i]=1;
            }
          }
        }
      }
    }
    else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
    {
      // SYSCALL instruction (software interrupt)
      u=1;
      uu=1;
    }
    else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
    {
      // ERET instruction (return from interrupt)
      u=1;
      uu=1;
    }
    //u=uu=1; // DEBUG
    tdep=(~uu>>rt1[i])&1;
    // Written registers are unneeded
    u|=1LL<<rt1[i];
    u|=1LL<<rt2[i];
    uu|=1LL<<rt1[i];
    uu|=1LL<<rt2[i];
    gte_u|=gte_rt[i];
    // Accessed registers are needed
    u&=~(1LL<<rs1[i]);
    u&=~(1LL<<rs2[i]);
    uu&=~(1LL<<us1[i]);
    uu&=~(1LL<<us2[i]);
    gte_u&=~gte_rs[i];
    if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
      gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
    // Source-target dependencies
    uu&=~(tdep<<dep1[i]);
    uu&=~(tdep<<dep2[i]);
    // R0 is always unneeded
    u|=1;uu|=1;
    // Save it
    unneeded_reg[i]=u;
    unneeded_reg_upper[i]=uu;
    gte_unneeded[i]=gte_u;
    /*
    printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
    printf("U:");
    int r;
    for(r=1;r<=CCREG;r++) {
      if((unneeded_reg[i]>>r)&1) {
        if(r==HIREG) printf(" HI");
        else if(r==LOREG) printf(" LO");
        else printf(" r%d",r);
      }
    }
    printf(" UU:");
    for(r=1;r<=CCREG;r++) {
      if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
        if(r==HIREG) printf(" HI");
        else if(r==LOREG) printf(" LO");
        else printf(" r%d",r);
      }
    }
    printf("\n");*/
  }
  // NOTE(review): this final pass overrides all the upper-half results
  // computed above, marking every upper half as unneeded — presumably the
  // 64-bit rework dropping 64-bit tracking (the PSX R3000A is 32-bit);
  // confirm before relying on unneeded_reg_upper elsewhere
  for (i=iend;i>=istart;i--)
  {
    unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
  }
}
6050
6051 // Write back dirty registers as soon as we will no longer modify them,
6052 // so that we don't end up with lots of writes at the branches.
6053 void clean_registers(int istart,int iend,int wr)
6054 {
6055   int i;
6056   int r;
6057   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6058   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6059   if(iend==slen-1) {
6060     will_dirty_i=will_dirty_next=0;
6061     wont_dirty_i=wont_dirty_next=0;
6062   }else{
6063     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6064     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6065   }
6066   for (i=iend;i>=istart;i--)
6067   {
6068     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6069     {
6070       if(ba[i]<start || ba[i]>=(start+slen*4))
6071       {
6072         // Branch out of this block, flush all regs
6073         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6074         {
6075           // Unconditional branch
6076           will_dirty_i=0;
6077           wont_dirty_i=0;
6078           // Merge in delay slot (will dirty)
6079           for(r=0;r<HOST_REGS;r++) {
6080             if(r!=EXCLUDE_REG) {
6081               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6082               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6083               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6084               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6085               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6086               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6087               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6088               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6089               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6090               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6091               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6092               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6093               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6094               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6095             }
6096           }
6097         }
6098         else
6099         {
6100           // Conditional branch
6101           will_dirty_i=0;
6102           wont_dirty_i=wont_dirty_next;
6103           // Merge in delay slot (will dirty)
6104           for(r=0;r<HOST_REGS;r++) {
6105             if(r!=EXCLUDE_REG) {
6106               if(!likely[i]) {
6107                 // Might not dirty if likely branch is not taken
6108                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6109                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6110                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6111                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6112                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6113                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6114                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6115                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6116                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6117                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6118                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6119                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6120                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6121                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6122               }
6123             }
6124           }
6125         }
6126         // Merge in delay slot (wont dirty)
6127         for(r=0;r<HOST_REGS;r++) {
6128           if(r!=EXCLUDE_REG) {
6129             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6130             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6131             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6132             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6133             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6134             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6135             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6136             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6137             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6138             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6139           }
6140         }
6141         if(wr) {
6142           #ifndef DESTRUCTIVE_WRITEBACK
6143           branch_regs[i].dirty&=wont_dirty_i;
6144           #endif
6145           branch_regs[i].dirty|=will_dirty_i;
6146         }
6147       }
6148       else
6149       {
6150         // Internal branch
6151         if(ba[i]<=start+i*4) {
6152           // Backward branch
6153           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6154           {
6155             // Unconditional branch
6156             temp_will_dirty=0;
6157             temp_wont_dirty=0;
6158             // Merge in delay slot (will dirty)
6159             for(r=0;r<HOST_REGS;r++) {
6160               if(r!=EXCLUDE_REG) {
6161                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6162                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6163                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6164                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6165                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6166                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6167                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6168                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6169                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6170                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6171                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6172                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6173                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6174                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6175               }
6176             }
6177           } else {
6178             // Conditional branch (not taken case)
6179             temp_will_dirty=will_dirty_next;
6180             temp_wont_dirty=wont_dirty_next;
6181             // Merge in delay slot (will dirty)
6182             for(r=0;r<HOST_REGS;r++) {
6183               if(r!=EXCLUDE_REG) {
6184                 if(!likely[i]) {
6185                   // Will not dirty if likely branch is not taken
6186                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6187                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6188                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6189                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6190                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6191                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
6192                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6193                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6194                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6195                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6196                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6197                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6198                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6199                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6200                 }
6201               }
6202             }
6203           }
6204           // Merge in delay slot (wont dirty)
6205           for(r=0;r<HOST_REGS;r++) {
6206             if(r!=EXCLUDE_REG) {
6207               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6208               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6209               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6210               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6211               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6212               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6213               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6214               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6215               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6216               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6217             }
6218           }
6219           // Deal with changed mappings
6220           if(i<iend) {
6221             for(r=0;r<HOST_REGS;r++) {
6222               if(r!=EXCLUDE_REG) {
6223                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
6224                   temp_will_dirty&=~(1<<r);
6225                   temp_wont_dirty&=~(1<<r);
6226                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6227                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6228                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6229                   } else {
6230                     temp_will_dirty|=1<<r;
6231                     temp_wont_dirty|=1<<r;
6232                   }
6233                 }
6234               }
6235             }
6236           }
6237           if(wr) {
6238             will_dirty[i]=temp_will_dirty;
6239             wont_dirty[i]=temp_wont_dirty;
6240             clean_registers((ba[i]-start)>>2,i-1,0);
6241           }else{
6242             // Limit recursion.  It can take an excessive amount
6243             // of time if there are a lot of nested loops.
6244             will_dirty[(ba[i]-start)>>2]=0;
6245             wont_dirty[(ba[i]-start)>>2]=-1;
6246           }
6247         }
6248         /*else*/ if(1)
6249         {
6250           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6251           {
6252             // Unconditional branch
6253             will_dirty_i=0;
6254             wont_dirty_i=0;
6255           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6256             for(r=0;r<HOST_REGS;r++) {
6257               if(r!=EXCLUDE_REG) {
6258                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6259                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
6260                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6261                 }
6262                 if(branch_regs[i].regmap[r]>=0) {
6263                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6264                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6265                 }
6266               }
6267             }
6268           //}
6269             // Merge in delay slot
6270             for(r=0;r<HOST_REGS;r++) {
6271               if(r!=EXCLUDE_REG) {
6272                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6273                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6274                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6275                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6276                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6277                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6278                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6279                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6280                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6281                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6282                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6283                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6284                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6285                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6286               }
6287             }
6288           } else {
6289             // Conditional branch
6290             will_dirty_i=will_dirty_next;
6291             wont_dirty_i=wont_dirty_next;
6292           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6293             for(r=0;r<HOST_REGS;r++) {
6294               if(r!=EXCLUDE_REG) {
6295                 signed char target_reg=branch_regs[i].regmap[r];
6296                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6297                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6298                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6299                 }
6300                 else if(target_reg>=0) {
6301                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6302                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6303                 }
6304                 // Treat delay slot as part of branch too
6305                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6306                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6307                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6308                 }
6309                 else
6310                 {
6311                   will_dirty[i+1]&=~(1<<r);
6312                 }*/
6313               }
6314             }
6315           //}
6316             // Merge in delay slot
6317             for(r=0;r<HOST_REGS;r++) {
6318               if(r!=EXCLUDE_REG) {
6319                 if(!likely[i]) {
6320                   // Might not dirty if likely branch is not taken
6321                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6322                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6323                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6324                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6325                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6326                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6327                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6328                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6329                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6330                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6331                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6332                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6333                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6334                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6335                 }
6336               }
6337             }
6338           }
6339           // Merge in delay slot (won't dirty)
6340           for(r=0;r<HOST_REGS;r++) {
6341             if(r!=EXCLUDE_REG) {
6342               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6343               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6344               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6345               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6346               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6347               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6348               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6349               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6350               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6351               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6352             }
6353           }
6354           if(wr) {
6355             #ifndef DESTRUCTIVE_WRITEBACK
6356             branch_regs[i].dirty&=wont_dirty_i;
6357             #endif
6358             branch_regs[i].dirty|=will_dirty_i;
6359           }
6360         }
6361       }
6362     }
6363     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6364     {
6365       // SYSCALL instruction (software interrupt)
6366       will_dirty_i=0;
6367       wont_dirty_i=0;
6368     }
6369     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6370     {
6371       // ERET instruction (return from interrupt)
6372       will_dirty_i=0;
6373       wont_dirty_i=0;
6374     }
6375     will_dirty_next=will_dirty_i;
6376     wont_dirty_next=wont_dirty_i;
6377     for(r=0;r<HOST_REGS;r++) {
6378       if(r!=EXCLUDE_REG) {
6379         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6380         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6381         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6382         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6383         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6384         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6385         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6386         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6387         if(i>istart) {
6388           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP)
6389           {
6390             // Don't store a register immediately after writing it,
6391             // may prevent dual-issue.
6392             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
6393             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
6394           }
6395         }
6396       }
6397     }
6398     // Save it
6399     will_dirty[i]=will_dirty_i;
6400     wont_dirty[i]=wont_dirty_i;
6401     // Mark registers that won't be dirtied as not dirty
6402     if(wr) {
6403       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
6404       for(r=0;r<HOST_REGS;r++) {
6405         if((will_dirty_i>>r)&1) {
6406           printf(" r%d",r);
6407         }
6408       }
6409       printf("\n");*/
6410
6411       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
6412         regs[i].dirty|=will_dirty_i;
6413         #ifndef DESTRUCTIVE_WRITEBACK
6414         regs[i].dirty&=wont_dirty_i;
6415         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6416         {
6417           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
6418             for(r=0;r<HOST_REGS;r++) {
6419               if(r!=EXCLUDE_REG) {
6420                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
6421                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
6422                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6423               }
6424             }
6425           }
6426         }
6427         else
6428         {
6429           if(i<iend) {
6430             for(r=0;r<HOST_REGS;r++) {
6431               if(r!=EXCLUDE_REG) {
6432                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
6433                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
6434                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6435               }
6436             }
6437           }
6438         }
6439         #endif
6440       //}
6441     }
6442     // Deal with changed mappings
6443     temp_will_dirty=will_dirty_i;
6444     temp_wont_dirty=wont_dirty_i;
6445     for(r=0;r<HOST_REGS;r++) {
6446       if(r!=EXCLUDE_REG) {
6447         int nr;
6448         if(regs[i].regmap[r]==regmap_pre[i][r]) {
6449           if(wr) {
6450             #ifndef DESTRUCTIVE_WRITEBACK
6451             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6452             #endif
6453             regs[i].wasdirty|=will_dirty_i&(1<<r);
6454           }
6455         }
6456         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
6457           // Register moved to a different register
6458           will_dirty_i&=~(1<<r);
6459           wont_dirty_i&=~(1<<r);
6460           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
6461           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
6462           if(wr) {
6463             #ifndef DESTRUCTIVE_WRITEBACK
6464             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6465             #endif
6466             regs[i].wasdirty|=will_dirty_i&(1<<r);
6467           }
6468         }
6469         else {
6470           will_dirty_i&=~(1<<r);
6471           wont_dirty_i&=~(1<<r);
6472           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6473             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6474             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6475           } else {
6476             wont_dirty_i|=1<<r;
6477             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
6478           }
6479         }
6480       }
6481     }
6482   }
6483 }
6484
6485 #ifdef DISASM
6486   /* disassembly */
/* Print one line of disassembly for decoded instruction index i (DISASM
 * builds only).  Reads the pass-1 decode arrays: itype[]/opcode[]/opcode2[]
 * hold the decoded type and sub-opcodes, insn[] the mnemonic string,
 * rs1/rs2/rt1/imm the decoded operands, ba[] precomputed branch targets and
 * source[] the raw opcode words.  bt[i] marks branch targets; they get a
 * leading '*' in the listing. */
void disassemble_inst(int i)
{
    if (bt[i]) printf("*"); else printf(" ");
    switch(itype[i]) {
      case UJUMP:
        printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
      case CJUMP:
        // Target = PC+4 + sign-extended 16-bit offset * 4; the <<16 then
        // arithmetic >>14 pair both sign-extends and scales the offset.
        // NOTE(review): at i==0 this falls back to *ba (i.e. ba[0]) instead
        // of computing from source[0] — presumably deliberate, confirm.
        printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
      case SJUMP:
        // regimm branch: single source register, same offset computation
        printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
      case FJUMP:
        printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
      case RJUMP:
        // JALR (opcode2 0x9) with a non-$ra destination prints both regs
        if (opcode[i]==0x9&&rt1[i]!=31)
          printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
        else
          printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
        break;
      case SPAN:
        printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
      case IMM16:
        if(opcode[i]==0xf) //LUI
          printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
        else
          printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case LOAD:
      case LOADLR:
        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case STORE:
      case STORELR:
        printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
        break;
      case ALU:
      case SHIFT:
        printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
        break;
      case MULTDIV:
        printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
        break;
      case SHIFTIMM:
        printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
        break;
      case MOV:
        // opcode2&0x1d==0x10 matches MFHI/MFLO (write rt1);
        // opcode2&0x1d==0x11 matches MTHI/MTLO (read rs1)
        if((opcode2[i]&0x1d)==0x10)
          printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
        else if((opcode2[i]&0x1d)==0x11)
          printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
        else
          printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP0:
        // bits 11-15 of the raw word select the coprocessor register
        if(opcode2[i]==0)
          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
        else if(opcode2[i]==4)
          printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP1:
        if(opcode2[i]<3)
          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
        else if(opcode2[i]>3)
          printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case COP2:
        if(opcode2[i]<3)
          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
        else if(opcode2[i]>3)
          printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
        else printf (" %x: %s\n",start+i*4,insn[i]);
        break;
      case C1LS:
        // coprocessor load/store: target reg is in bits 16-20 of the word
        printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
        break;
      case C2LS:
        printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
        break;
      case INTCALL:
        printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
        break;
      default:
        //printf (" %s %8x\n",insn[i],source[i]);
        printf (" %x: %s\n",start+i*4,insn[i]);
    }
}
6574 #else
6575 static void disassemble_inst(int i) {}
6576 #endif // DISASM
6577
6578 #define DRC_TEST_VAL 0x74657374
6579
6580 static int new_dynarec_test(void)
6581 {
6582   int (*testfunc)(void) = (void *)out;
6583   void *beginning;
6584   int ret;
6585
6586   beginning = start_block();
6587   emit_movimm(DRC_TEST_VAL,0); // test
6588   emit_jmpreg(14);
6589   literal_pool(0);
6590   end_block(beginning);
6591   SysPrintf("testing if we can run recompiled code..\n");
6592   ret = testfunc();
6593   if (ret == DRC_TEST_VAL)
6594     SysPrintf("test passed.\n");
6595   else
6596     SysPrintf("test failed: %08x\n", ret);
6597   out = translation_cache;
6598   return ret == DRC_TEST_VAL;
6599 }
6600
6601 // clear the state completely, instead of just marking
6602 // things invalid like invalidate_all_pages() does
6603 void new_dynarec_clear_full()
6604 {
6605   int n;
6606   out = translation_cache;
6607   memset(invalid_code,1,sizeof(invalid_code));
6608   memset(hash_table,0xff,sizeof(hash_table));
6609   memset(mini_ht,-1,sizeof(mini_ht));
6610   memset(restore_candidate,0,sizeof(restore_candidate));
6611   memset(shadow,0,sizeof(shadow));
6612   copy=shadow;
6613   expirep=16384; // Expiry pointer, +2 blocks
6614   pending_exception=0;
6615   literalcount=0;
6616   stop_after_jal=0;
6617   inv_code_start=inv_code_end=~0;
6618   // TLB
6619   for(n=0;n<4096;n++) ll_clear(jump_in+n);
6620   for(n=0;n<4096;n++) ll_clear(jump_out+n);
6621   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
6622 }
6623
/* One-time initialization of the recompiler: obtain an RWX translation
 * cache buffer (three strategies below, selected at build time), reset all
 * recompiler state and run a smoke test of generated code. */
void new_dynarec_init()
{
  SysPrintf("Init new dynarec\n");

  // allocate/prepare a buffer for translation cache
  // see assem_arm.h for some explanation
#if   defined(BASE_ADDR_FIXED)
  // map the cache at a compile-time fixed address
  // NOTE(review): no MAP_FIXED here — relies on the kernel honoring the
  // hint address exactly; the != check catches when it does not
  if (mmap(translation_cache, 1 << TARGET_SIZE_2,
            PROT_READ | PROT_WRITE | PROT_EXEC,
            MAP_PRIVATE | MAP_ANONYMOUS,
            -1, 0) != translation_cache) {
    SysPrintf("mmap() failed: %s\n", strerror(errno));
    SysPrintf("disable BASE_ADDR_FIXED and recompile\n");
    abort();
  }
#elif defined(BASE_ADDR_DYNAMIC)
  // let the OS choose the cache address
  #ifdef VITA
  sceBlock = sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
  if (sceBlock < 0)
    SysPrintf("sceKernelAllocMemBlockForVM failed\n");
  int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&translation_cache);
  if (ret < 0)
    SysPrintf("sceKernelGetMemBlockBase failed\n");
  #else
  translation_cache = mmap (NULL, 1 << TARGET_SIZE_2,
            PROT_READ | PROT_WRITE | PROT_EXEC,
            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (translation_cache == MAP_FAILED) {
    SysPrintf("mmap() failed: %s\n", strerror(errno));
    abort();
  }
  #endif
#else
  // cache is a static array; just make it executable
  #ifndef NO_WRITE_EXEC
  // not all systems allow execute in data segment by default
  if (mprotect(translation_cache, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
    SysPrintf("mprotect() failed: %s\n", strerror(errno));
  #endif
#endif
  out = translation_cache;
  cycle_multiplier=200; // presumably a percentage (2.00x) — confirm at use sites
  new_dynarec_clear_full();
#ifdef HOST_IMM8
  // Copy this into local area so we don't have to put it in every literal pool
  invc_ptr=invalid_code;
#endif
  arch_init();
  new_dynarec_test();
#ifndef RAM_FIXED
  // offset added to PSX addresses to reach host rdram when RAM is not
  // mapped at its "natural" place
  ram_offset=(uintptr_t)rdram-0x80000000;
#endif
  if (ram_offset!=0)
    SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
}
6678
6679 void new_dynarec_cleanup()
6680 {
6681   int n;
6682 #if defined(BASE_ADDR_FIXED) || defined(BASE_ADDR_DYNAMIC)
6683   #ifdef VITA
6684   sceKernelFreeMemBlock(sceBlock);
6685   sceBlock = -1;
6686   #else
6687   if (munmap(translation_cache, 1<<TARGET_SIZE_2) < 0)
6688     SysPrintf("munmap() failed\n");
6689   #endif
6690 #endif
6691   for(n=0;n<4096;n++) ll_clear(jump_in+n);
6692   for(n=0;n<4096;n++) ll_clear(jump_out+n);
6693   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
6694   #ifdef ROM_COPY
6695   if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
6696   #endif
6697 }
6698
6699 static u_int *get_source_start(u_int addr, u_int *limit)
6700 {
6701   if (addr < 0x00200000 ||
6702     (0xa0000000 <= addr && addr < 0xa0200000)) {
6703     // used for BIOS calls mostly?
6704     *limit = (addr&0xa0000000)|0x00200000;
6705     return (u_int *)(rdram + (addr&0x1fffff));
6706   }
6707   else if (!Config.HLE && (
6708     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
6709     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
6710     // BIOS
6711     *limit = (addr & 0xfff00000) | 0x80000;
6712     return (u_int *)((u_char *)psxR + (addr&0x7ffff));
6713   }
6714   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
6715     *limit = (addr & 0x80600000) + 0x00200000;
6716     return (u_int *)(rdram + (addr&0x1fffff));
6717   }
6718   return NULL;
6719 }
6720
6721 static u_int scan_for_ret(u_int addr)
6722 {
6723   u_int limit = 0;
6724   u_int *mem;
6725
6726   mem = get_source_start(addr, &limit);
6727   if (mem == NULL)
6728     return addr;
6729
6730   if (limit > addr + 0x1000)
6731     limit = addr + 0x1000;
6732   for (; addr < limit; addr += 4, mem++) {
6733     if (*mem == 0x03e00008) // jr $ra
6734       return addr + 8;
6735   }
6736   return addr;
6737 }
6738
/* One entry of the compiled-block list stored in savestates. */
struct savestate_block {
  uint32_t addr;     // block start address (PSX virtual)
  uint32_t regflags; // GPR bitmask used for speculation on load
};

/* qsort comparator: order savestate blocks by start address.
 * Uses explicit comparisons rather than `p1->addr - p2->addr`: the
 * unsigned difference of two 32-bit addresses can exceed INT_MAX, and
 * truncating it to int flips the sign (e.g. 0 vs 0x90000000), giving
 * qsort an inconsistent ordering. */
static int addr_cmp(const void *p1_, const void *p2_)
{
  const struct savestate_block *p1 = p1_, *p2 = p2_;
  return (p1->addr > p2->addr) - (p1->addr < p2->addr);
}
6749
6750 int new_dynarec_save_blocks(void *save, int size)
6751 {
6752   struct savestate_block *blocks = save;
6753   int maxcount = size / sizeof(blocks[0]);
6754   struct savestate_block tmp_blocks[1024];
6755   struct ll_entry *head;
6756   int p, s, d, o, bcnt;
6757   u_int addr;
6758
6759   o = 0;
6760   for (p = 0; p < ARRAY_SIZE(jump_in); p++) {
6761     bcnt = 0;
6762     for (head = jump_in[p]; head != NULL; head = head->next) {
6763       tmp_blocks[bcnt].addr = head->vaddr;
6764       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
6765       bcnt++;
6766     }
6767     if (bcnt < 1)
6768       continue;
6769     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
6770
6771     addr = tmp_blocks[0].addr;
6772     for (s = d = 0; s < bcnt; s++) {
6773       if (tmp_blocks[s].addr < addr)
6774         continue;
6775       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
6776         tmp_blocks[d++] = tmp_blocks[s];
6777       addr = scan_for_ret(tmp_blocks[s].addr);
6778     }
6779
6780     if (o + d > maxcount)
6781       d = maxcount - o;
6782     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
6783     o += d;
6784   }
6785
6786   return o * sizeof(blocks[0]);
6787 }
6788
6789 void new_dynarec_load_blocks(const void *save, int size)
6790 {
6791   const struct savestate_block *blocks = save;
6792   int count = size / sizeof(blocks[0]);
6793   u_int regs_save[32];
6794   uint32_t f;
6795   int i, b;
6796
6797   get_addr(psxRegs.pc);
6798
6799   // change GPRs for speculation to at least partially work..
6800   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
6801   for (i = 1; i < 32; i++)
6802     psxRegs.GPR.r[i] = 0x80000000;
6803
6804   for (b = 0; b < count; b++) {
6805     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
6806       if (f & 1)
6807         psxRegs.GPR.r[i] = 0x1f800000;
6808     }
6809
6810     get_addr(blocks[b].addr);
6811
6812     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
6813       if (f & 1)
6814         psxRegs.GPR.r[i] = 0x80000000;
6815     }
6816   }
6817
6818   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
6819 }
6820
6821 int new_recompile_block(int addr)
6822 {
6823   u_int pagelimit = 0;
6824   u_int state_rflags = 0;
6825   int i;
6826
6827   assem_debug("NOTCOMPILED: addr = %x -> %p\n", addr, out);
6828   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
6829   //if(debug)
6830   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
6831
6832   // this is just for speculation
6833   for (i = 1; i < 32; i++) {
6834     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
6835       state_rflags |= 1 << i;
6836   }
6837
6838   start = (u_int)addr&~3;
6839   //assert(((u_int)addr&1)==0);
6840   new_dynarec_did_compile=1;
6841   if (Config.HLE && start == 0x80001000) // hlecall
6842   {
6843     // XXX: is this enough? Maybe check hleSoftCall?
6844     void *beginning=start_block();
6845     u_int page=get_page(start);
6846
6847     invalid_code[start>>12]=0;
6848     emit_movimm(start,0);
6849     emit_writeword(0,&pcaddr);
6850     emit_jmp(new_dyna_leave);
6851     literal_pool(0);
6852     end_block(beginning);
6853     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
6854     return 0;
6855   }
6856
6857   source = get_source_start(start, &pagelimit);
6858   if (source == NULL) {
6859     SysPrintf("Compile at bogus memory address: %08x\n", addr);
6860     exit(1);
6861   }
6862
6863   /* Pass 1: disassemble */
6864   /* Pass 2: register dependencies, branch targets */
6865   /* Pass 3: register allocation */
6866   /* Pass 4: branch dependencies */
6867   /* Pass 5: pre-alloc */
6868   /* Pass 6: optimize clean/dirty state */
6869   /* Pass 7: flag 32-bit registers */
6870   /* Pass 8: assembly */
6871   /* Pass 9: linker */
6872   /* Pass 10: garbage collection / free memory */
6873
6874   int j;
6875   int done=0;
6876   unsigned int type,op,op2;
6877
6878   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
6879
6880   /* Pass 1 disassembly */
6881
6882   for(i=0;!done;i++) {
6883     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
6884     minimum_free_regs[i]=0;
6885     opcode[i]=op=source[i]>>26;
6886     switch(op)
6887     {
6888       case 0x00: strcpy(insn[i],"special"); type=NI;
6889         op2=source[i]&0x3f;
6890         switch(op2)
6891         {
6892           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
6893           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
6894           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
6895           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
6896           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
6897           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
6898           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
6899           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
6900           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
6901           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
6902           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
6903           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
6904           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
6905           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
6906           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
6907           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
6908           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
6909           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
6910           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
6911           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
6912           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
6913           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
6914           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
6915           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
6916           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
6917           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
6918           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
6919           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
6920           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
6921           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
6922           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
6923           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
6924           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
6925           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
6926           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
6927 #if 0
6928           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
6929           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
6930           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
6931           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
6932           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
6933           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
6934           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
6935           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
6936           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
6937           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
6938           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
6939           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
6940           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
6941           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
6942           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
6943           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
6944           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
6945 #endif
6946         }
6947         break;
6948       case 0x01: strcpy(insn[i],"regimm"); type=NI;
6949         op2=(source[i]>>16)&0x1f;
6950         switch(op2)
6951         {
6952           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
6953           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
6954           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
6955           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
6956           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
6957           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
6958           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
6959           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
6960           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
6961           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
6962           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
6963           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
6964           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
6965           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
6966         }
6967         break;
6968       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
6969       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
6970       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
6971       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
6972       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
6973       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
6974       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
6975       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
6976       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
6977       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
6978       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
6979       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
6980       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
6981       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
6982       case 0x10: strcpy(insn[i],"cop0"); type=NI;
6983         op2=(source[i]>>21)&0x1f;
6984         switch(op2)
6985         {
6986           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
6987           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
6988           case 0x10: strcpy(insn[i],"tlb"); type=NI;
6989           switch(source[i]&0x3f)
6990           {
6991             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
6992             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
6993             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
6994             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
6995             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
6996             //case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
6997           }
6998         }
6999         break;
7000       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7001         op2=(source[i]>>21)&0x1f;
7002         switch(op2)
7003         {
7004           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7005           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7006           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7007           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7008           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7009           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7010           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7011           switch((source[i]>>16)&0x3)
7012           {
7013             case 0x00: strcpy(insn[i],"BC1F"); break;
7014             case 0x01: strcpy(insn[i],"BC1T"); break;
7015             case 0x02: strcpy(insn[i],"BC1FL"); break;
7016             case 0x03: strcpy(insn[i],"BC1TL"); break;
7017           }
7018           break;
7019           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7020           switch(source[i]&0x3f)
7021           {
7022             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7023             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7024             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7025             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7026             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7027             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7028             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7029             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7030             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7031             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7032             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7033             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7034             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7035             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7036             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7037             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7038             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7039             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7040             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7041             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7042             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7043             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7044             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7045             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7046             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7047             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7048             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7049             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7050             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7051             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7052             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7053             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7054             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7055             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7056             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7057           }
7058           break;
7059           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7060           switch(source[i]&0x3f)
7061           {
7062             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7063             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7064             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7065             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7066             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7067             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7068             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7069             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7070             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7071             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7072             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7073             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7074             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7075             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7076             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7077             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7078             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7079             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7080             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7081             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7082             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7083             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7084             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7085             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7086             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7087             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7088             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7089             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7090             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7091             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7092             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7093             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7094             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7095             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7096             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7097           }
7098           break;
7099           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7100           switch(source[i]&0x3f)
7101           {
7102             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7103             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7104           }
7105           break;
7106           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7107           switch(source[i]&0x3f)
7108           {
7109             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7110             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7111           }
7112           break;
7113         }
7114         break;
7115 #if 0
7116       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7117       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7118       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7119       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7120       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7121       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7122       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7123       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7124 #endif
7125       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7126       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7127       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7128       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7129       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7130       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7131       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7132 #if 0
7133       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7134 #endif
7135       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7136       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7137       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7138       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7139 #if 0
7140       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7141       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7142 #endif
7143       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7144       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7145       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7146       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7147 #if 0
7148       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7149       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7150       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7151 #endif
7152       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7153       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7154 #if 0
7155       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7156       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7157       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7158 #endif
7159       case 0x12: strcpy(insn[i],"COP2"); type=NI;
7160         op2=(source[i]>>21)&0x1f;
7161         //if (op2 & 0x10) {
7162         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
7163           if (gte_handlers[source[i]&0x3f]!=NULL) {
7164             if (gte_regnames[source[i]&0x3f]!=NULL)
7165               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
7166             else
7167               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
7168             type=C2OP;
7169           }
7170         }
7171         else switch(op2)
7172         {
7173           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
7174           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
7175           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
7176           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
7177         }
7178         break;
7179       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
7180       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
7181       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7182       default: strcpy(insn[i],"???"); type=NI;
7183         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
7184         break;
7185     }
7186     itype[i]=type;
7187     opcode2[i]=op2;
7188     /* Get registers/immediates */
7189     lt1[i]=0;
7190     us1[i]=0;
7191     us2[i]=0;
7192     dep1[i]=0;
7193     dep2[i]=0;
7194     gte_rs[i]=gte_rt[i]=0;
7195     switch(type) {
7196       case LOAD:
7197         rs1[i]=(source[i]>>21)&0x1f;
7198         rs2[i]=0;
7199         rt1[i]=(source[i]>>16)&0x1f;
7200         rt2[i]=0;
7201         imm[i]=(short)source[i];
7202         break;
7203       case STORE:
7204       case STORELR:
7205         rs1[i]=(source[i]>>21)&0x1f;
7206         rs2[i]=(source[i]>>16)&0x1f;
7207         rt1[i]=0;
7208         rt2[i]=0;
7209         imm[i]=(short)source[i];
7210         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7211         break;
7212       case LOADLR:
7213         // LWL/LWR only load part of the register,
7214         // therefore the target register must be treated as a source too
7215         rs1[i]=(source[i]>>21)&0x1f;
7216         rs2[i]=(source[i]>>16)&0x1f;
7217         rt1[i]=(source[i]>>16)&0x1f;
7218         rt2[i]=0;
7219         imm[i]=(short)source[i];
7220         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7221         if(op==0x26) dep1[i]=rt1[i]; // LWR
7222         break;
7223       case IMM16:
7224         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7225         else rs1[i]=(source[i]>>21)&0x1f;
7226         rs2[i]=0;
7227         rt1[i]=(source[i]>>16)&0x1f;
7228         rt2[i]=0;
7229         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7230           imm[i]=(unsigned short)source[i];
7231         }else{
7232           imm[i]=(short)source[i];
7233         }
7234         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7235         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7236         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7237         break;
7238       case UJUMP:
7239         rs1[i]=0;
7240         rs2[i]=0;
7241         rt1[i]=0;
7242         rt2[i]=0;
7243         // The JAL instruction writes to r31.
7244         if (op&1) {
7245           rt1[i]=31;
7246         }
7247         rs2[i]=CCREG;
7248         break;
7249       case RJUMP:
7250         rs1[i]=(source[i]>>21)&0x1f;
7251         rs2[i]=0;
7252         rt1[i]=0;
7253         rt2[i]=0;
7254         // The JALR instruction writes to rd.
7255         if (op2&1) {
7256           rt1[i]=(source[i]>>11)&0x1f;
7257         }
7258         rs2[i]=CCREG;
7259         break;
7260       case CJUMP:
7261         rs1[i]=(source[i]>>21)&0x1f;
7262         rs2[i]=(source[i]>>16)&0x1f;
7263         rt1[i]=0;
7264         rt2[i]=0;
7265         if(op&2) { // BGTZ/BLEZ
7266           rs2[i]=0;
7267         }
7268         us1[i]=rs1[i];
7269         us2[i]=rs2[i];
7270         likely[i]=op>>4;
7271         break;
7272       case SJUMP:
7273         rs1[i]=(source[i]>>21)&0x1f;
7274         rs2[i]=CCREG;
7275         rt1[i]=0;
7276         rt2[i]=0;
7277         us1[i]=rs1[i];
7278         if(op2&0x10) { // BxxAL
7279           rt1[i]=31;
7280           // NOTE: If the branch is not taken, r31 is still overwritten
7281         }
7282         likely[i]=(op2&2)>>1;
7283         break;
7284       case FJUMP:
7285         rs1[i]=FSREG;
7286         rs2[i]=CSREG;
7287         rt1[i]=0;
7288         rt2[i]=0;
7289         likely[i]=((source[i])>>17)&1;
7290         break;
7291       case ALU:
7292         rs1[i]=(source[i]>>21)&0x1f; // source
7293         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
7294         rt1[i]=(source[i]>>11)&0x1f; // destination
7295         rt2[i]=0;
7296         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7297           us1[i]=rs1[i];us2[i]=rs2[i];
7298         }
7299         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7300           dep1[i]=rs1[i];dep2[i]=rs2[i];
7301         }
7302         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7303           dep1[i]=rs1[i];dep2[i]=rs2[i];
7304         }
7305         break;
7306       case MULTDIV:
7307         rs1[i]=(source[i]>>21)&0x1f; // source
7308         rs2[i]=(source[i]>>16)&0x1f; // divisor
7309         rt1[i]=HIREG;
7310         rt2[i]=LOREG;
7311         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7312           us1[i]=rs1[i];us2[i]=rs2[i];
7313         }
7314         break;
7315       case MOV:
7316         rs1[i]=0;
7317         rs2[i]=0;
7318         rt1[i]=0;
7319         rt2[i]=0;
7320         if(op2==0x10) rs1[i]=HIREG; // MFHI
7321         if(op2==0x11) rt1[i]=HIREG; // MTHI
7322         if(op2==0x12) rs1[i]=LOREG; // MFLO
7323         if(op2==0x13) rt1[i]=LOREG; // MTLO
7324         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7325         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7326         dep1[i]=rs1[i];
7327         break;
7328       case SHIFT:
7329         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7330         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7331         rt1[i]=(source[i]>>11)&0x1f; // destination
7332         rt2[i]=0;
7333         // DSLLV/DSRLV/DSRAV are 64-bit
7334         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
7335         break;
7336       case SHIFTIMM:
7337         rs1[i]=(source[i]>>16)&0x1f;
7338         rs2[i]=0;
7339         rt1[i]=(source[i]>>11)&0x1f;
7340         rt2[i]=0;
7341         imm[i]=(source[i]>>6)&0x1f;
7342         // DSxx32 instructions
7343         if(op2>=0x3c) imm[i]|=0x20;
7344         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
7345         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
7346         break;
7347       case COP0:
7348         rs1[i]=0;
7349         rs2[i]=0;
7350         rt1[i]=0;
7351         rt2[i]=0;
7352         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
7353         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
7354         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7355         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7356         break;
7357       case COP1:
7358         rs1[i]=0;
7359         rs2[i]=0;
7360         rt1[i]=0;
7361         rt2[i]=0;
7362         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7363         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7364         if(op2==5) us1[i]=rs1[i]; // DMTC1
7365         rs2[i]=CSREG;
7366         break;
7367       case COP2:
7368         rs1[i]=0;
7369         rs2[i]=0;
7370         rt1[i]=0;
7371         rt2[i]=0;
7372         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
7373         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
7374         rs2[i]=CSREG;
7375         int gr=(source[i]>>11)&0x1F;
7376         switch(op2)
7377         {
7378           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
7379           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
7380           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
7381           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
7382         }
7383         break;
7384       case C1LS:
7385         rs1[i]=(source[i]>>21)&0x1F;
7386         rs2[i]=CSREG;
7387         rt1[i]=0;
7388         rt2[i]=0;
7389         imm[i]=(short)source[i];
7390         break;
7391       case C2LS:
7392         rs1[i]=(source[i]>>21)&0x1F;
7393         rs2[i]=0;
7394         rt1[i]=0;
7395         rt2[i]=0;
7396         imm[i]=(short)source[i];
7397         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7398         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7399         break;
7400       case C2OP:
7401         rs1[i]=0;
7402         rs2[i]=0;
7403         rt1[i]=0;
7404         rt2[i]=0;
7405         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
7406         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
7407         gte_rt[i]|=1ll<<63; // every op changes flags
7408         if((source[i]&0x3f)==GTE_MVMVA) {
7409           int v = (source[i] >> 15) & 3;
7410           gte_rs[i]&=~0xe3fll;
7411           if(v==3) gte_rs[i]|=0xe00ll;
7412           else gte_rs[i]|=3ll<<(v*2);
7413         }
7414         break;
7415       case FLOAT:
7416       case FCONV:
7417         rs1[i]=0;
7418         rs2[i]=CSREG;
7419         rt1[i]=0;
7420         rt2[i]=0;
7421         break;
7422       case FCOMP:
7423         rs1[i]=FSREG;
7424         rs2[i]=CSREG;
7425         rt1[i]=FSREG;
7426         rt2[i]=0;
7427         break;
7428       case SYSCALL:
7429       case HLECALL:
7430       case INTCALL:
7431         rs1[i]=CCREG;
7432         rs2[i]=0;
7433         rt1[i]=0;
7434         rt2[i]=0;
7435         break;
7436       default:
7437         rs1[i]=0;
7438         rs2[i]=0;
7439         rt1[i]=0;
7440         rt2[i]=0;
7441     }
7442     /* Calculate branch target addresses */
7443     if(type==UJUMP)
7444       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
7445     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
7446       ba[i]=start+i*4+8; // Ignore never taken branch
7447     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
7448       ba[i]=start+i*4+8; // Ignore never taken branch
7449     else if(type==CJUMP||type==SJUMP||type==FJUMP)
7450       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
7451     else ba[i]=-1;
7452     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
7453       int do_in_intrp=0;
7454       // branch in delay slot?
7455       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
7456         // don't handle first branch and call interpreter if it's hit
7457         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
7458         do_in_intrp=1;
7459       }
7460       // basic load delay detection
7461       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
7462         int t=(ba[i-1]-start)/4;
7463         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
7464           // jump target wants DS result - potential load delay effect
7465           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
7466           do_in_intrp=1;
7467           bt[t+1]=1; // expected return from interpreter
7468         }
7469         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
7470               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
7471           // v0 overwrite like this is a sign of trouble, bail out
7472           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
7473           do_in_intrp=1;
7474         }
7475       }
7476       if(do_in_intrp) {
7477         rs1[i-1]=CCREG;
7478         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
7479         ba[i-1]=-1;
7480         itype[i-1]=INTCALL;
7481         done=2;
7482         i--; // don't compile the DS
7483       }
7484     }
7485     /* Is this the end of the block? */
7486     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
7487       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
7488         done=2;
7489       }
7490       else {
7491         if(stop_after_jal) done=1;
7492         // Stop on BREAK
7493         if((source[i+1]&0xfc00003f)==0x0d) done=1;
7494       }
7495       // Don't recompile stuff that's already compiled
7496       if(check_addr(start+i*4+4)) done=1;
7497       // Don't get too close to the limit
7498       if(i>MAXBLOCK/2) done=1;
7499     }
7500     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
7501     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
7502     if(done==2) {
7503       // Does the block continue due to a branch?
7504       for(j=i-1;j>=0;j--)
7505       {
7506         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
7507         if(ba[j]==start+i*4+4) done=j=0;
7508         if(ba[j]==start+i*4+8) done=j=0;
7509       }
7510     }
7511     //assert(i<MAXBLOCK-1);
7512     if(start+i*4==pagelimit-4) done=1;
7513     assert(start+i*4<pagelimit);
7514     if (i==MAXBLOCK-1) done=1;
7515     // Stop if we're compiling junk
7516     if(itype[i]==NI&&opcode[i]==0x11) {
7517       done=stop_after_jal=1;
7518       SysPrintf("Disabled speculative precompilation\n");
7519     }
7520   }
7521   slen=i;
7522   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
7523     if(start+i*4==pagelimit) {
7524       itype[i-1]=SPAN;
7525     }
7526   }
7527   assert(slen>0);
7528
7529   /* Pass 2 - Register dependencies and branch targets */
7530
7531   unneeded_registers(0,slen-1,0);
7532
7533   /* Pass 3 - Register allocation */
7534
7535   struct regstat current; // Current register allocations/status
7536   current.is32=1;
7537   current.dirty=0;
7538   current.u=unneeded_reg[0];
7539   current.uu=unneeded_reg_upper[0];
7540   clear_all_regs(current.regmap);
7541   alloc_reg(&current,0,CCREG);
7542   dirty_reg(&current,CCREG);
7543   current.isconst=0;
7544   current.wasconst=0;
7545   current.waswritten=0;
7546   int ds=0;
7547   int cc=0;
7548   int hr=-1;
7549
7550   if((u_int)addr&1) {
7551     // First instruction is delay slot
7552     cc=-1;
7553     bt[1]=1;
7554     ds=1;
7555     unneeded_reg[0]=1;
7556     unneeded_reg_upper[0]=1;
7557     current.regmap[HOST_BTREG]=BTREG;
7558   }
7559
7560   for(i=0;i<slen;i++)
7561   {
7562     if(bt[i])
7563     {
7564       int hr;
7565       for(hr=0;hr<HOST_REGS;hr++)
7566       {
7567         // Is this really necessary?
7568         if(current.regmap[hr]==0) current.regmap[hr]=-1;
7569       }
7570       current.isconst=0;
7571       current.waswritten=0;
7572     }
7573     if(i>1)
7574     {
7575       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
7576       {
7577         if(rs1[i-2]==0||rs2[i-2]==0)
7578         {
7579           if(rs1[i-2]) {
7580             current.is32|=1LL<<rs1[i-2];
7581             int hr=get_reg(current.regmap,rs1[i-2]|64);
7582             if(hr>=0) current.regmap[hr]=-1;
7583           }
7584           if(rs2[i-2]) {
7585             current.is32|=1LL<<rs2[i-2];
7586             int hr=get_reg(current.regmap,rs2[i-2]|64);
7587             if(hr>=0) current.regmap[hr]=-1;
7588           }
7589         }
7590       }
7591     }
7592     current.is32=-1LL;
7593
7594     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
7595     regs[i].wasconst=current.isconst;
7596     regs[i].was32=current.is32;
7597     regs[i].wasdirty=current.dirty;
7598     regs[i].loadedconst=0;
7599     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
7600       if(i+1<slen) {
7601         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
7602         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
7603         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
7604         current.u|=1;
7605         current.uu|=1;
7606       } else {
7607         current.u=1;
7608         current.uu=1;
7609       }
7610     } else {
7611       if(i+1<slen) {
7612         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
7613         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
7614         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
7615         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7616         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
7617         current.u|=1;
7618         current.uu|=1;
7619       } else { SysPrintf("oops, branch at end of block with no delay slot\n");exit(1); }
7620     }
7621     is_ds[i]=ds;
7622     if(ds) {
7623       ds=0; // Skip delay slot, already allocated as part of branch
7624       // ...but we need to alloc it in case something jumps here
7625       if(i+1<slen) {
7626         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
7627         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
7628       }else{
7629         current.u=branch_unneeded_reg[i-1];
7630         current.uu=branch_unneeded_reg_upper[i-1];
7631       }
7632       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7633       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
7634       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
7635       current.u|=1;
7636       current.uu|=1;
7637       struct regstat temp;
7638       memcpy(&temp,&current,sizeof(current));
7639       temp.wasdirty=temp.dirty;
7640       temp.was32=temp.is32;
7641       // TODO: Take into account unconditional branches, as below
7642       delayslot_alloc(&temp,i);
7643       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
7644       regs[i].wasdirty=temp.wasdirty;
7645       regs[i].was32=temp.was32;
7646       regs[i].dirty=temp.dirty;
7647       regs[i].is32=temp.is32;
7648       regs[i].isconst=0;
7649       regs[i].wasconst=0;
7650       current.isconst=0;
7651       // Create entry (branch target) regmap
7652       for(hr=0;hr<HOST_REGS;hr++)
7653       {
7654         int r=temp.regmap[hr];
7655         if(r>=0) {
7656           if(r!=regmap_pre[i][hr]) {
7657             regs[i].regmap_entry[hr]=-1;
7658           }
7659           else
7660           {
7661             if(r<64){
7662               if((current.u>>r)&1) {
7663                 regs[i].regmap_entry[hr]=-1;
7664                 regs[i].regmap[hr]=-1;
7665                 //Don't clear regs in the delay slot as the branch might need them
7666                 //current.regmap[hr]=-1;
7667               }else
7668                 regs[i].regmap_entry[hr]=r;
7669             }
7670             else {
7671               if((current.uu>>(r&63))&1) {
7672                 regs[i].regmap_entry[hr]=-1;
7673                 regs[i].regmap[hr]=-1;
7674                 //Don't clear regs in the delay slot as the branch might need them
7675                 //current.regmap[hr]=-1;
7676               }else
7677                 regs[i].regmap_entry[hr]=r;
7678             }
7679           }
7680         } else {
7681           // First instruction expects CCREG to be allocated
7682           if(i==0&&hr==HOST_CCREG)
7683             regs[i].regmap_entry[hr]=CCREG;
7684           else
7685             regs[i].regmap_entry[hr]=-1;
7686         }
7687       }
7688     }
7689     else { // Not delay slot
7690       switch(itype[i]) {
7691         case UJUMP:
7692           //current.isconst=0; // DEBUG
7693           //current.wasconst=0; // DEBUG
7694           //regs[i].wasconst=0; // DEBUG
7695           clear_const(&current,rt1[i]);
7696           alloc_cc(&current,i);
7697           dirty_reg(&current,CCREG);
7698           if (rt1[i]==31) {
7699             alloc_reg(&current,i,31);
7700             dirty_reg(&current,31);
7701             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
7702             //assert(rt1[i+1]!=rt1[i]);
7703             #ifdef REG_PREFETCH
7704             alloc_reg(&current,i,PTEMP);
7705             #endif
7706             //current.is32|=1LL<<rt1[i];
7707           }
7708           ooo[i]=1;
7709           delayslot_alloc(&current,i+1);
7710           //current.isconst=0; // DEBUG
7711           ds=1;
7712           //printf("i=%d, isconst=%x\n",i,current.isconst);
7713           break;
7714         case RJUMP:
7715           //current.isconst=0;
7716           //current.wasconst=0;
7717           //regs[i].wasconst=0;
7718           clear_const(&current,rs1[i]);
7719           clear_const(&current,rt1[i]);
7720           alloc_cc(&current,i);
7721           dirty_reg(&current,CCREG);
7722           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
7723             alloc_reg(&current,i,rs1[i]);
7724             if (rt1[i]!=0) {
7725               alloc_reg(&current,i,rt1[i]);
7726               dirty_reg(&current,rt1[i]);
7727               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
7728               assert(rt1[i+1]!=rt1[i]);
7729               #ifdef REG_PREFETCH
7730               alloc_reg(&current,i,PTEMP);
7731               #endif
7732             }
7733             #ifdef USE_MINI_HT
7734             if(rs1[i]==31) { // JALR
7735               alloc_reg(&current,i,RHASH);
7736               alloc_reg(&current,i,RHTBL);
7737             }
7738             #endif
7739             delayslot_alloc(&current,i+1);
7740           } else {
7741             // The delay slot overwrites our source register,
7742             // allocate a temporary register to hold the old value.
7743             current.isconst=0;
7744             current.wasconst=0;
7745             regs[i].wasconst=0;
7746             delayslot_alloc(&current,i+1);
7747             current.isconst=0;
7748             alloc_reg(&current,i,RTEMP);
7749           }
7750           //current.isconst=0; // DEBUG
7751           ooo[i]=1;
7752           ds=1;
7753           break;
7754         case CJUMP:
7755           //current.isconst=0;
7756           //current.wasconst=0;
7757           //regs[i].wasconst=0;
7758           clear_const(&current,rs1[i]);
7759           clear_const(&current,rs2[i]);
7760           if((opcode[i]&0x3E)==4) // BEQ/BNE
7761           {
7762             alloc_cc(&current,i);
7763             dirty_reg(&current,CCREG);
7764             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7765             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
7766             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
7767             {
7768               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
7769               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
7770             }
7771             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
7772                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
7773               // The delay slot overwrites one of our conditions.
7774               // Allocate the branch condition registers instead.
7775               current.isconst=0;
7776               current.wasconst=0;
7777               regs[i].wasconst=0;
7778               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7779               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
7780               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
7781               {
7782                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
7783                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
7784               }
7785             }
7786             else
7787             {
7788               ooo[i]=1;
7789               delayslot_alloc(&current,i+1);
7790             }
7791           }
7792           else
7793           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
7794           {
7795             alloc_cc(&current,i);
7796             dirty_reg(&current,CCREG);
7797             alloc_reg(&current,i,rs1[i]);
7798             if(!(current.is32>>rs1[i]&1))
7799             {
7800               alloc_reg64(&current,i,rs1[i]);
7801             }
7802             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
7803               // The delay slot overwrites one of our conditions.
7804               // Allocate the branch condition registers instead.
7805               current.isconst=0;
7806               current.wasconst=0;
7807               regs[i].wasconst=0;
7808               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7809               if(!((current.is32>>rs1[i])&1))
7810               {
7811                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
7812               }
7813             }
7814             else
7815             {
7816               ooo[i]=1;
7817               delayslot_alloc(&current,i+1);
7818             }
7819           }
7820           else
7821           // Don't alloc the delay slot yet because we might not execute it
7822           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
7823           {
7824             current.isconst=0;
7825             current.wasconst=0;
7826             regs[i].wasconst=0;
7827             alloc_cc(&current,i);
7828             dirty_reg(&current,CCREG);
7829             alloc_reg(&current,i,rs1[i]);
7830             alloc_reg(&current,i,rs2[i]);
7831             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
7832             {
7833               alloc_reg64(&current,i,rs1[i]);
7834               alloc_reg64(&current,i,rs2[i]);
7835             }
7836           }
7837           else
7838           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
7839           {
7840             current.isconst=0;
7841             current.wasconst=0;
7842             regs[i].wasconst=0;
7843             alloc_cc(&current,i);
7844             dirty_reg(&current,CCREG);
7845             alloc_reg(&current,i,rs1[i]);
7846             if(!(current.is32>>rs1[i]&1))
7847             {
7848               alloc_reg64(&current,i,rs1[i]);
7849             }
7850           }
7851           ds=1;
7852           //current.isconst=0;
7853           break;
7854         case SJUMP:
7855           //current.isconst=0;
7856           //current.wasconst=0;
7857           //regs[i].wasconst=0;
7858           clear_const(&current,rs1[i]);
7859           clear_const(&current,rt1[i]);
7860           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
7861           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
7862           {
7863             alloc_cc(&current,i);
7864             dirty_reg(&current,CCREG);
7865             alloc_reg(&current,i,rs1[i]);
7866             if(!(current.is32>>rs1[i]&1))
7867             {
7868               alloc_reg64(&current,i,rs1[i]);
7869             }
7870             if (rt1[i]==31) { // BLTZAL/BGEZAL
7871               alloc_reg(&current,i,31);
7872               dirty_reg(&current,31);
7873               //#ifdef REG_PREFETCH
7874               //alloc_reg(&current,i,PTEMP);
7875               //#endif
7876               //current.is32|=1LL<<rt1[i];
7877             }
7878             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
7879                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
7880               // Allocate the branch condition registers instead.
7881               current.isconst=0;
7882               current.wasconst=0;
7883               regs[i].wasconst=0;
7884               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7885               if(!((current.is32>>rs1[i])&1))
7886               {
7887                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
7888               }
7889             }
7890             else
7891             {
7892               ooo[i]=1;
7893               delayslot_alloc(&current,i+1);
7894             }
7895           }
7896           else
7897           // Don't alloc the delay slot yet because we might not execute it
7898           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
7899           {
7900             current.isconst=0;
7901             current.wasconst=0;
7902             regs[i].wasconst=0;
7903             alloc_cc(&current,i);
7904             dirty_reg(&current,CCREG);
7905             alloc_reg(&current,i,rs1[i]);
7906             if(!(current.is32>>rs1[i]&1))
7907             {
7908               alloc_reg64(&current,i,rs1[i]);
7909             }
7910           }
7911           ds=1;
7912           //current.isconst=0;
7913           break;
7914         case FJUMP:
7915           current.isconst=0;
7916           current.wasconst=0;
7917           regs[i].wasconst=0;
7918           if(likely[i]==0) // BC1F/BC1T
7919           {
7920             // TODO: Theoretically we can run out of registers here on x86.
7921             // The delay slot can allocate up to six, and we need to check
7922             // CSREG before executing the delay slot.  Possibly we can drop
7923             // the cycle count and then reload it after checking that the
7924             // FPU is in a usable state, or don't do out-of-order execution.
7925             alloc_cc(&current,i);
7926             dirty_reg(&current,CCREG);
7927             alloc_reg(&current,i,FSREG);
7928             alloc_reg(&current,i,CSREG);
7929             if(itype[i+1]==FCOMP) {
7930               // The delay slot overwrites the branch condition.
7931               // Allocate the branch condition registers instead.
7932               alloc_cc(&current,i);
7933               dirty_reg(&current,CCREG);
7934               alloc_reg(&current,i,CSREG);
7935               alloc_reg(&current,i,FSREG);
7936             }
7937             else {
7938               ooo[i]=1;
7939               delayslot_alloc(&current,i+1);
7940               alloc_reg(&current,i+1,CSREG);
7941             }
7942           }
7943           else
7944           // Don't alloc the delay slot yet because we might not execute it
7945           if(likely[i]) // BC1FL/BC1TL
7946           {
7947             alloc_cc(&current,i);
7948             dirty_reg(&current,CCREG);
7949             alloc_reg(&current,i,CSREG);
7950             alloc_reg(&current,i,FSREG);
7951           }
7952           ds=1;
7953           current.isconst=0;
7954           break;
7955         case IMM16:
7956           imm16_alloc(&current,i);
7957           break;
7958         case LOAD:
7959         case LOADLR:
7960           load_alloc(&current,i);
7961           break;
7962         case STORE:
7963         case STORELR:
7964           store_alloc(&current,i);
7965           break;
7966         case ALU:
7967           alu_alloc(&current,i);
7968           break;
7969         case SHIFT:
7970           shift_alloc(&current,i);
7971           break;
7972         case MULTDIV:
7973           multdiv_alloc(&current,i);
7974           break;
7975         case SHIFTIMM:
7976           shiftimm_alloc(&current,i);
7977           break;
7978         case MOV:
7979           mov_alloc(&current,i);
7980           break;
7981         case COP0:
7982           cop0_alloc(&current,i);
7983           break;
7984         case COP1:
7985         case COP2:
7986           cop1_alloc(&current,i);
7987           break;
7988         case C1LS:
7989           c1ls_alloc(&current,i);
7990           break;
7991         case C2LS:
7992           c2ls_alloc(&current,i);
7993           break;
7994         case C2OP:
7995           c2op_alloc(&current,i);
7996           break;
7997         case FCONV:
7998           fconv_alloc(&current,i);
7999           break;
8000         case FLOAT:
8001           float_alloc(&current,i);
8002           break;
8003         case FCOMP:
8004           fcomp_alloc(&current,i);
8005           break;
8006         case SYSCALL:
8007         case HLECALL:
8008         case INTCALL:
8009           syscall_alloc(&current,i);
8010           break;
8011         case SPAN:
8012           pagespan_alloc(&current,i);
8013           break;
8014       }
8015
8016       // Drop the upper half of registers that have become 32-bit
8017       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8018       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8019         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8020         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8021         current.uu|=1;
8022       } else {
8023         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8024         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8025         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8026         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8027         current.uu|=1;
8028       }
8029
8030       // Create entry (branch target) regmap
8031       for(hr=0;hr<HOST_REGS;hr++)
8032       {
8033         int r,or;
8034         r=current.regmap[hr];
8035         if(r>=0) {
8036           if(r!=regmap_pre[i][hr]) {
8037             // TODO: delay slot (?)
8038             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8039             if(or<0||(r&63)>=TEMPREG){
8040               regs[i].regmap_entry[hr]=-1;
8041             }
8042             else
8043             {
8044               // Just move it to a different register
8045               regs[i].regmap_entry[hr]=r;
8046               // If it was dirty before, it's still dirty
8047               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8048             }
8049           }
8050           else
8051           {
8052             // Unneeded
8053             if(r==0){
8054               regs[i].regmap_entry[hr]=0;
8055             }
8056             else
8057             if(r<64){
8058               if((current.u>>r)&1) {
8059                 regs[i].regmap_entry[hr]=-1;
8060                 //regs[i].regmap[hr]=-1;
8061                 current.regmap[hr]=-1;
8062               }else
8063                 regs[i].regmap_entry[hr]=r;
8064             }
8065             else {
8066               if((current.uu>>(r&63))&1) {
8067                 regs[i].regmap_entry[hr]=-1;
8068                 //regs[i].regmap[hr]=-1;
8069                 current.regmap[hr]=-1;
8070               }else
8071                 regs[i].regmap_entry[hr]=r;
8072             }
8073           }
8074         } else {
8075           // Branches expect CCREG to be allocated at the target
8076           if(regmap_pre[i][hr]==CCREG)
8077             regs[i].regmap_entry[hr]=CCREG;
8078           else
8079             regs[i].regmap_entry[hr]=-1;
8080         }
8081       }
8082       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8083     }
8084
8085     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
8086       current.waswritten|=1<<rs1[i-1];
8087     current.waswritten&=~(1<<rt1[i]);
8088     current.waswritten&=~(1<<rt2[i]);
8089     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
8090       current.waswritten&=~(1<<rs1[i]);
8091
8092     /* Branch post-alloc */
8093     if(i>0)
8094     {
8095       current.was32=current.is32;
8096       current.wasdirty=current.dirty;
8097       switch(itype[i-1]) {
8098         case UJUMP:
8099           memcpy(&branch_regs[i-1],&current,sizeof(current));
8100           branch_regs[i-1].isconst=0;
8101           branch_regs[i-1].wasconst=0;
8102           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8103           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8104           alloc_cc(&branch_regs[i-1],i-1);
8105           dirty_reg(&branch_regs[i-1],CCREG);
8106           if(rt1[i-1]==31) { // JAL
8107             alloc_reg(&branch_regs[i-1],i-1,31);
8108             dirty_reg(&branch_regs[i-1],31);
8109             branch_regs[i-1].is32|=1LL<<31;
8110           }
8111           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8112           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8113           break;
8114         case RJUMP:
8115           memcpy(&branch_regs[i-1],&current,sizeof(current));
8116           branch_regs[i-1].isconst=0;
8117           branch_regs[i-1].wasconst=0;
8118           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8119           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8120           alloc_cc(&branch_regs[i-1],i-1);
8121           dirty_reg(&branch_regs[i-1],CCREG);
8122           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8123           if(rt1[i-1]!=0) { // JALR
8124             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
8125             dirty_reg(&branch_regs[i-1],rt1[i-1]);
8126             branch_regs[i-1].is32|=1LL<<rt1[i-1];
8127           }
8128           #ifdef USE_MINI_HT
8129           if(rs1[i-1]==31) { // JALR
8130             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8131             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8132           }
8133           #endif
8134           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8135           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8136           break;
8137         case CJUMP:
8138           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8139           {
8140             alloc_cc(&current,i-1);
8141             dirty_reg(&current,CCREG);
8142             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8143                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8144               // The delay slot overwrote one of our conditions
8145               // Delay slot goes after the test (in order)
8146               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8147               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8148               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8149               current.u|=1;
8150               current.uu|=1;
8151               delayslot_alloc(&current,i);
8152               current.isconst=0;
8153             }
8154             else
8155             {
8156               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8157               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8158               // Alloc the branch condition registers
8159               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8160               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8161               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8162               {
8163                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8164                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8165               }
8166             }
8167             memcpy(&branch_regs[i-1],&current,sizeof(current));
8168             branch_regs[i-1].isconst=0;
8169             branch_regs[i-1].wasconst=0;
8170             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8171             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8172           }
8173           else
8174           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8175           {
8176             alloc_cc(&current,i-1);
8177             dirty_reg(&current,CCREG);
8178             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8179               // The delay slot overwrote the branch condition
8180               // Delay slot goes after the test (in order)
8181               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8182               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8183               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8184               current.u|=1;
8185               current.uu|=1;
8186               delayslot_alloc(&current,i);
8187               current.isconst=0;
8188             }
8189             else
8190             {
8191               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8192               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8193               // Alloc the branch condition register
8194               alloc_reg(&current,i-1,rs1[i-1]);
8195               if(!(current.is32>>rs1[i-1]&1))
8196               {
8197                 alloc_reg64(&current,i-1,rs1[i-1]);
8198               }
8199             }
8200             memcpy(&branch_regs[i-1],&current,sizeof(current));
8201             branch_regs[i-1].isconst=0;
8202             branch_regs[i-1].wasconst=0;
8203             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8204             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8205           }
8206           else
8207           // Alloc the delay slot in case the branch is taken
8208           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8209           {
8210             memcpy(&branch_regs[i-1],&current,sizeof(current));
8211             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8212             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8213             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8214             alloc_cc(&branch_regs[i-1],i);
8215             dirty_reg(&branch_regs[i-1],CCREG);
8216             delayslot_alloc(&branch_regs[i-1],i);
8217             branch_regs[i-1].isconst=0;
8218             alloc_reg(&current,i,CCREG); // Not taken path
8219             dirty_reg(&current,CCREG);
8220             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8221           }
8222           else
8223           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8224           {
8225             memcpy(&branch_regs[i-1],&current,sizeof(current));
8226             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8227             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8228             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8229             alloc_cc(&branch_regs[i-1],i);
8230             dirty_reg(&branch_regs[i-1],CCREG);
8231             delayslot_alloc(&branch_regs[i-1],i);
8232             branch_regs[i-1].isconst=0;
8233             alloc_reg(&current,i,CCREG); // Not taken path
8234             dirty_reg(&current,CCREG);
8235             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8236           }
8237           break;
8238         case SJUMP:
8239           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8240           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8241           {
8242             alloc_cc(&current,i-1);
8243             dirty_reg(&current,CCREG);
8244             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8245               // The delay slot overwrote the branch condition
8246               // Delay slot goes after the test (in order)
8247               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8248               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8249               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8250               current.u|=1;
8251               current.uu|=1;
8252               delayslot_alloc(&current,i);
8253               current.isconst=0;
8254             }
8255             else
8256             {
8257               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8258               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8259               // Alloc the branch condition register
8260               alloc_reg(&current,i-1,rs1[i-1]);
8261               if(!(current.is32>>rs1[i-1]&1))
8262               {
8263                 alloc_reg64(&current,i-1,rs1[i-1]);
8264               }
8265             }
8266             memcpy(&branch_regs[i-1],&current,sizeof(current));
8267             branch_regs[i-1].isconst=0;
8268             branch_regs[i-1].wasconst=0;
8269             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8270             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8271           }
8272           else
8273           // Alloc the delay slot in case the branch is taken
8274           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8275           {
8276             memcpy(&branch_regs[i-1],&current,sizeof(current));
8277             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8278             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8279             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8280             alloc_cc(&branch_regs[i-1],i);
8281             dirty_reg(&branch_regs[i-1],CCREG);
8282             delayslot_alloc(&branch_regs[i-1],i);
8283             branch_regs[i-1].isconst=0;
8284             alloc_reg(&current,i,CCREG); // Not taken path
8285             dirty_reg(&current,CCREG);
8286             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8287           }
8288           // FIXME: BLTZAL/BGEZAL
8289           if(opcode2[i-1]&0x10) { // BxxZAL
8290             alloc_reg(&branch_regs[i-1],i-1,31);
8291             dirty_reg(&branch_regs[i-1],31);
8292             branch_regs[i-1].is32|=1LL<<31;
8293           }
8294           break;
8295         case FJUMP:
8296           if(likely[i-1]==0) // BC1F/BC1T
8297           {
8298             alloc_cc(&current,i-1);
8299             dirty_reg(&current,CCREG);
8300             if(itype[i]==FCOMP) {
8301               // The delay slot overwrote the branch condition
8302               // Delay slot goes after the test (in order)
8303               delayslot_alloc(&current,i);
8304               current.isconst=0;
8305             }
8306             else
8307             {
8308               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8309               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8310               // Alloc the branch condition register
8311               alloc_reg(&current,i-1,FSREG);
8312             }
8313             memcpy(&branch_regs[i-1],&current,sizeof(current));
8314             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8315           }
8316           else // BC1FL/BC1TL
8317           {
8318             // Alloc the delay slot in case the branch is taken
8319             memcpy(&branch_regs[i-1],&current,sizeof(current));
8320             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8321             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8322             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8323             alloc_cc(&branch_regs[i-1],i);
8324             dirty_reg(&branch_regs[i-1],CCREG);
8325             delayslot_alloc(&branch_regs[i-1],i);
8326             branch_regs[i-1].isconst=0;
8327             alloc_reg(&current,i,CCREG); // Not taken path
8328             dirty_reg(&current,CCREG);
8329             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8330           }
8331           break;
8332       }
8333
8334       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
8335       {
8336         if(rt1[i-1]==31) // JAL/JALR
8337         {
8338           // Subroutine call will return here, don't alloc any registers
8339           current.is32=1;
8340           current.dirty=0;
8341           clear_all_regs(current.regmap);
8342           alloc_reg(&current,i,CCREG);
8343           dirty_reg(&current,CCREG);
8344         }
8345         else if(i+1<slen)
8346         {
8347           // Internal branch will jump here, match registers to caller
8348           current.is32=0x3FFFFFFFFLL;
8349           current.dirty=0;
8350           clear_all_regs(current.regmap);
8351           alloc_reg(&current,i,CCREG);
8352           dirty_reg(&current,CCREG);
8353           for(j=i-1;j>=0;j--)
8354           {
8355             if(ba[j]==start+i*4+4) {
8356               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
8357               current.is32=branch_regs[j].is32;
8358               current.dirty=branch_regs[j].dirty;
8359               break;
8360             }
8361           }
8362           while(j>=0) {
8363             if(ba[j]==start+i*4+4) {
8364               for(hr=0;hr<HOST_REGS;hr++) {
8365                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
8366                   current.regmap[hr]=-1;
8367                 }
8368                 current.is32&=branch_regs[j].is32;
8369                 current.dirty&=branch_regs[j].dirty;
8370               }
8371             }
8372             j--;
8373           }
8374         }
8375       }
8376     }
8377
8378     // Count cycles in between branches
8379     ccadj[i]=cc;
8380     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
8381     {
8382       cc=0;
8383     }
8384 #if !defined(DRC_DBG)
8385     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
8386     {
8387       // GTE runs in parallel until accessed, divide by 2 for a rough guess
8388       cc+=gte_cycletab[source[i]&0x3f]/2;
8389     }
8390     else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // load,store causes weird timing issues
8391     {
8392       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
8393     }
8394     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
8395     {
8396       cc+=4;
8397     }
8398     else if(itype[i]==C2LS)
8399     {
8400       cc+=4;
8401     }
8402 #endif
8403     else
8404     {
8405       cc++;
8406     }
8407
8408     flush_dirty_uppers(&current);
8409     if(!is_ds[i]) {
8410       regs[i].is32=current.is32;
8411       regs[i].dirty=current.dirty;
8412       regs[i].isconst=current.isconst;
8413       memcpy(constmap[i],current_constmap,sizeof(current_constmap));
8414     }
8415     for(hr=0;hr<HOST_REGS;hr++) {
8416       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
8417         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
8418           regs[i].wasconst&=~(1<<hr);
8419         }
8420       }
8421     }
8422     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
8423     regs[i].waswritten=current.waswritten;
8424   }
8425
8426   /* Pass 4 - Cull unused host registers */
8427
8428   uint64_t nr=0;
8429
8430   for (i=slen-1;i>=0;i--)
8431   {
8432     int hr;
8433     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8434     {
8435       if(ba[i]<start || ba[i]>=(start+slen*4))
8436       {
8437         // Branch out of this block, don't need anything
8438         nr=0;
8439       }
8440       else
8441       {
8442         // Internal branch
8443         // Need whatever matches the target
8444         nr=0;
8445         int t=(ba[i]-start)>>2;
8446         for(hr=0;hr<HOST_REGS;hr++)
8447         {
8448           if(regs[i].regmap_entry[hr]>=0) {
8449             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
8450           }
8451         }
8452       }
8453       // Conditional branch may need registers for following instructions
8454       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8455       {
8456         if(i<slen-2) {
8457           nr|=needed_reg[i+2];
8458           for(hr=0;hr<HOST_REGS;hr++)
8459           {
8460             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
8461             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
8462           }
8463         }
8464       }
8465       // Don't need stuff which is overwritten
8466       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8467       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8468       // Merge in delay slot
8469       for(hr=0;hr<HOST_REGS;hr++)
8470       {
8471         if(!likely[i]) {
8472           // These are overwritten unless the branch is "likely"
8473           // and the delay slot is nullified if not taken
8474           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8475           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8476         }
8477         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8478         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8479         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8480         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8481         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8482         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8483         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8484         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8485         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
8486           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8487           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8488         }
8489         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
8490           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8491           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8492         }
8493         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
8494           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8495           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8496         }
8497       }
8498     }
8499     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
8500     {
8501       // SYSCALL instruction (software interrupt)
8502       nr=0;
8503     }
8504     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
8505     {
8506       // ERET instruction (return from interrupt)
8507       nr=0;
8508     }
8509     else // Non-branch
8510     {
8511       if(i<slen-1) {
8512         for(hr=0;hr<HOST_REGS;hr++) {
8513           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
8514           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
8515           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8516           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8517         }
8518       }
8519     }
8520     for(hr=0;hr<HOST_REGS;hr++)
8521     {
8522       // Overwritten registers are not needed
8523       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8524       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8525       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8526       // Source registers are needed
8527       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8528       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8529       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
8530       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
8531       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8532       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8533       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8534       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8535       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
8536         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8537         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8538       }
8539       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
8540         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8541         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8542       }
8543       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
8544         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8545         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8546       }
8547       // Don't store a register immediately after writing it,
8548       // may prevent dual-issue.
8549       // But do so if this is a branch target, otherwise we
8550       // might have to load the register before the branch.
8551       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
8552         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
8553            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
8554           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8555           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8556         }
8557         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
8558            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
8559           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8560           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8561         }
8562       }
8563     }
8564     // Cycle count is needed at branches.  Assume it is needed at the target too.
8565     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
8566       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
8567       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
8568     }
8569     // Save it
8570     needed_reg[i]=nr;
8571
8572     // Deallocate unneeded registers
8573     for(hr=0;hr<HOST_REGS;hr++)
8574     {
8575       if(!((nr>>hr)&1)) {
8576         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
8577         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
8578            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8579            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
8580         {
8581           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8582           {
8583             if(likely[i]) {
8584               regs[i].regmap[hr]=-1;
8585               regs[i].isconst&=~(1<<hr);
8586               if(i<slen-2) {
8587                 regmap_pre[i+2][hr]=-1;
8588                 regs[i+2].wasconst&=~(1<<hr);
8589               }
8590             }
8591           }
8592         }
8593         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8594         {
8595           int d1=0,d2=0,map=0,temp=0;
8596           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
8597           {
8598             d1=dep1[i+1];
8599             d2=dep2[i+1];
8600           }
8601           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
8602              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
8603             map=INVCP;
8604           }
8605           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
8606              itype[i+1]==C1LS || itype[i+1]==C2LS)
8607             temp=FTEMP;
8608           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
8609              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8610              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
8611              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
8612              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
8613              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
8614              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
8615              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
8616              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
8617              regs[i].regmap[hr]!=map )
8618           {
8619             regs[i].regmap[hr]=-1;
8620             regs[i].isconst&=~(1<<hr);
8621             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
8622                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
8623                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
8624                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
8625                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
8626                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
8627                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
8628                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
8629                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
8630                branch_regs[i].regmap[hr]!=map)
8631             {
8632               branch_regs[i].regmap[hr]=-1;
8633               branch_regs[i].regmap_entry[hr]=-1;
8634               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8635               {
8636                 if(!likely[i]&&i<slen-2) {
8637                   regmap_pre[i+2][hr]=-1;
8638                   regs[i+2].wasconst&=~(1<<hr);
8639                 }
8640               }
8641             }
8642           }
8643         }
8644         else
8645         {
8646           // Non-branch
8647           if(i>0)
8648           {
8649             int d1=0,d2=0,map=-1,temp=-1;
8650             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
8651             {
8652               d1=dep1[i];
8653               d2=dep2[i];
8654             }
8655             if(itype[i]==STORE || itype[i]==STORELR ||
8656                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
8657               map=INVCP;
8658             }
8659             if(itype[i]==LOADLR || itype[i]==STORELR ||
8660                itype[i]==C1LS || itype[i]==C2LS)
8661               temp=FTEMP;
8662             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8663                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
8664                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
8665                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
8666                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
8667                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
8668             {
8669               if(i<slen-1&&!is_ds[i]) {
8670                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
8671                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
8672                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
8673                 {
8674                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
8675                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
8676                 }
8677                 regmap_pre[i+1][hr]=-1;
8678                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
8679                 regs[i+1].wasconst&=~(1<<hr);
8680               }
8681               regs[i].regmap[hr]=-1;
8682               regs[i].isconst&=~(1<<hr);
8683             }
8684           }
8685         }
8686       }
8687     }
8688   }
8689
  /* Pass 5 - Pre-allocate registers */

  // If a register is allocated during a loop, try to allocate it for the
  // entire loop, if possible.  This avoids loading/storing registers
  // inside of the loop.

  // f_regmap is the candidate host->guest mapping we try to keep live
  // across a whole loop body; -1 means "no candidate" for that host reg.
  signed char f_regmap[HOST_REGS];
  clear_all_regs(f_regmap);
  for(i=0;i<slen-1;i++)
  {
    if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
    {
      // Only consider backward branches whose target lies inside this
      // block before the branch itself, i.e. a loop back-edge.
      if(ba[i]>=start && ba[i]<(start+i*4))
      // The delay slot must be one of these "simple" instruction types
      // for the preload to be safe.
      if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
      ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
      ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
      ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
      ||itype[i+1]==FCOMP||itype[i+1]==FCONV
      ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
      {
        int t=(ba[i]-start)>>2; // instruction index of the loop's target
        if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
        if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
        for(hr=0;hr<HOST_REGS;hr++)
        {
          // regmap values above 63 denote the upper half of a 64-bit
          // guest register (cf. unneeded_reg_upper below); only keep such
          // a mapping as a candidate while it is clean.
          // NOTE(review): the test is >64 rather than >=64 — looks like
          // value 64 (upper half of r0) can never occur, but confirm.
          if(regs[i].regmap[hr]>64) {
            if(!((regs[i].dirty>>hr)&1))
              f_regmap[hr]=regs[i].regmap[hr];
            else f_regmap[hr]=-1;
          }
          else if(regs[i].regmap[hr]>=0) {
            if(f_regmap[hr]!=regs[i].regmap[hr]) {
              // dealloc old register
              int n;
              for(n=0;n<HOST_REGS;n++)
              {
                if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
              }
              // and alloc new one
              f_regmap[hr]=regs[i].regmap[hr];
            }
          }
          // Same again for the register state in the branch's delay slot.
          if(branch_regs[i].regmap[hr]>64) {
            if(!((branch_regs[i].dirty>>hr)&1))
              f_regmap[hr]=branch_regs[i].regmap[hr];
            else f_regmap[hr]=-1;
          }
          else if(branch_regs[i].regmap[hr]>=0) {
            if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
              // dealloc old register
              int n;
              for(n=0;n<HOST_REGS;n++)
              {
                if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
              }
              // and alloc new one
              f_regmap[hr]=branch_regs[i].regmap[hr];
            }
          }
          if(ooo[i]) {
            if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
              f_regmap[hr]=branch_regs[i].regmap[hr];
          }else{
            if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
              f_regmap[hr]=branch_regs[i].regmap[hr];
          }
          // Avoid dirty->clean transition
          #ifdef DESTRUCTIVE_WRITEBACK
          if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
          #endif
          // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
          // case above, however it's always a good idea.  We can't hoist the
          // load if the register was already allocated, so there's no point
          // wasting time analyzing most of these cases.  It only "succeeds"
          // when the mapping was different and the load can be replaced with
          // a mov, which is of negligible benefit.  So such cases are
          // skipped below.
          if(f_regmap[hr]>0) {
            if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
              int r=f_regmap[hr];
              // Walk from the loop target up to the branch and check the
              // mapping can be held (or established) over the whole range.
              for(j=t;j<=i;j++)
              {
                //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
                // NOTE(review): lower-half liveness is only checked for
                // r<34 (presumably GPRs + HI/LO) — confirm 34 is the
                // intended bound here.
                if(r<34&&((unneeded_reg[j]>>r)&1)) break;
                if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
                if(r>63) {
                  // NB This can exclude the case where the upper-half
                  // register is lower numbered than the lower-half
                  // register.  Not sure if it's worth fixing...
                  if(get_reg(regs[j].regmap,r&63)<0) break;
                  if(get_reg(regs[j].regmap_entry,r&63)<0) break;
                  if(regs[j].is32&(1LL<<(r&63))) break;
                }
                if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
                  //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
                  int k;
                  if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
                    if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
                    if(r>63) {
                      if(get_reg(regs[i].regmap,r&63)<0) break;
                      if(get_reg(branch_regs[i].regmap,r&63)<0) break;
                    }
                    // Scan backwards from the branch for the earliest
                    // instruction the allocation can be extended to.
                    k=i;
                    while(k>1&&regs[k-1].regmap[hr]==-1) {
                      if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
                        //printf("no free regs for store %x\n",start+(k-1)*4);
                        break;
                      }
                      if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
                        //printf("no-match due to different register\n");
                        break;
                      }
                      if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
                        //printf("no-match due to branch\n");
                        break;
                      }
                      // call/ret fast path assumes no registers allocated
                      if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
                        break;
                      }
                      if(r>63) {
                        // NB This can exclude the case where the upper-half
                        // register is lower numbered than the lower-half
                        // register.  Not sure if it's worth fixing...
                        if(get_reg(regs[k-1].regmap,r&63)<0) break;
                        if(regs[k-1].is32&(1LL<<(r&63))) break;
                      }
                      k--;
                    }
                    if(i<slen-1) {
                      if((regs[k].is32&(1LL<<f_regmap[hr]))!=
                        (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
                        //printf("bad match after branch\n");
                        break;
                      }
                    }
                    if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
                      //printf("Extend r%d, %x ->\n",hr,start+k*4);
                      // Fill the allocation forward from k up to (but not
                      // including) the branch.
                      while(k<i) {
                        regs[k].regmap_entry[hr]=f_regmap[hr];
                        regs[k].regmap[hr]=f_regmap[hr];
                        regmap_pre[k+1][hr]=f_regmap[hr];
                        regs[k].wasdirty&=~(1<<hr);
                        regs[k].dirty&=~(1<<hr);
                        regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
                        regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
                        regs[k].wasconst&=~(1<<hr);
                        regs[k].isconst&=~(1<<hr);
                        k++;
                      }
                    }
                    else {
                      //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
                      break;
                    }
                    assert(regs[i-1].regmap[hr]==f_regmap[hr]);
                    if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
                      //printf("OK fill %x (r%d)\n",start+i*4,hr);
                      // Propagate the allocation into the branch itself
                      // and its delay slot state.
                      regs[i].regmap_entry[hr]=f_regmap[hr];
                      regs[i].regmap[hr]=f_regmap[hr];
                      regs[i].wasdirty&=~(1<<hr);
                      regs[i].dirty&=~(1<<hr);
                      regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
                      regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
                      regs[i].wasconst&=~(1<<hr);
                      regs[i].isconst&=~(1<<hr);
                      branch_regs[i].regmap_entry[hr]=f_regmap[hr];
                      branch_regs[i].wasdirty&=~(1<<hr);
                      branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
                      branch_regs[i].regmap[hr]=f_regmap[hr];
                      branch_regs[i].dirty&=~(1<<hr);
                      branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
                      branch_regs[i].wasconst&=~(1<<hr);
                      branch_regs[i].isconst&=~(1<<hr);
                      // (source[i]>>16)==0x1000 is "beq $0,$0", i.e. an
                      // effectively unconditional branch: only when the
                      // branch can fall through does the state also reach
                      // the instruction after the delay slot (i+2).
                      if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
                        regmap_pre[i+2][hr]=f_regmap[hr];
                        regs[i+2].wasdirty&=~(1<<hr);
                        regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
                        assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
                          (regs[i+2].was32&(1LL<<f_regmap[hr])));
                      }
                    }
                  }
                  for(k=t;k<j;k++) {
                    // Alloc register clean at beginning of loop,
                    // but may dirty it in pass 6
                    regs[k].regmap_entry[hr]=f_regmap[hr];
                    regs[k].regmap[hr]=f_regmap[hr];
                    regs[k].dirty&=~(1<<hr);
                    regs[k].wasconst&=~(1<<hr);
                    regs[k].isconst&=~(1<<hr);
                    if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
                      branch_regs[k].regmap_entry[hr]=f_regmap[hr];
                      branch_regs[k].regmap[hr]=f_regmap[hr];
                      branch_regs[k].dirty&=~(1<<hr);
                      branch_regs[k].wasconst&=~(1<<hr);
                      branch_regs[k].isconst&=~(1<<hr);
                      if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
                        regmap_pre[k+2][hr]=f_regmap[hr];
                        regs[k+2].wasdirty&=~(1<<hr);
                        assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
                          (regs[k+2].was32&(1LL<<f_regmap[hr])));
                      }
                    }
                    else
                    {
                      regmap_pre[k+1][hr]=f_regmap[hr];
                      regs[k+1].wasdirty&=~(1<<hr);
                    }
                  }
                  if(regs[j].regmap[hr]==f_regmap[hr])
                    regs[j].regmap_entry[hr]=f_regmap[hr];
                  break;
                }
                // Reached the branch without a hit: give up on this reg.
                if(j==i) break;
                if(regs[j].regmap[hr]>=0)
                  break;
                if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
                  //printf("no-match due to different register\n");
                  break;
                }
                if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
                  //printf("32/64 mismatch %x %d\n",start+j*4,hr);
                  break;
                }
                if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
                {
                  // Stop on unconditional branch
                  break;
                }
                if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
                {
                  if(ooo[j]) {
                    if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
                      break;
                  }else{
                    if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
                      break;
                  }
                  if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
                    //printf("no-match due to different register (branch)\n");
                    break;
                  }
                }
                if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
                  //printf("No free regs for store %x\n",start+j*4);
                  break;
                }
                if(f_regmap[hr]>=64) {
                  // 64-bit upper half: the register must not be flagged as
                  // 32-bit here, and its lower half must still be mapped.
                  if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
                    break;
                  }
                  else
                  {
                    if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
                      break;
                    }
                  }
                }
              }
            }
          }
        }
      }
    }else{
      // Non branch or undetermined branch target
      for(hr=0;hr<HOST_REGS;hr++)
      {
        if(hr!=EXCLUDE_REG) {
          if(regs[i].regmap[hr]>64) {
            if(!((regs[i].dirty>>hr)&1))
              f_regmap[hr]=regs[i].regmap[hr];
          }
          else if(regs[i].regmap[hr]>=0) {
            if(f_regmap[hr]!=regs[i].regmap[hr]) {
              // dealloc old register
              int n;
              for(n=0;n<HOST_REGS;n++)
              {
                if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
              }
              // and alloc new one
              f_regmap[hr]=regs[i].regmap[hr];
            }
          }
        }
      }
      // Try to restore cycle count at branch targets
      if(bt[i]) {
        // Find the next point where HOST_CCREG is (re)allocated, stopping
        // where there is no spare register to keep it in.
        for(j=i;j<slen-1;j++) {
          if(regs[j].regmap[HOST_CCREG]!=-1) break;
          if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
            //printf("no free regs for store %x\n",start+j*4);
            break;
          }
        }
        if(regs[j].regmap[HOST_CCREG]==CCREG) {
          int k=i;
          //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
          // Keep the cycle count in HOST_CCREG from the branch target up
          // to the point where it was already allocated.
          while(k<j) {
            regs[k].regmap_entry[HOST_CCREG]=CCREG;
            regs[k].regmap[HOST_CCREG]=CCREG;
            regmap_pre[k+1][HOST_CCREG]=CCREG;
            regs[k+1].wasdirty|=1<<HOST_CCREG;
            regs[k].dirty|=1<<HOST_CCREG;
            regs[k].wasconst&=~(1<<HOST_CCREG);
            regs[k].isconst&=~(1<<HOST_CCREG);
            k++;
          }
          regs[j].regmap_entry[HOST_CCREG]=CCREG;
        }
        // Work backwards from the branch target
        if(j>i&&f_regmap[HOST_CCREG]==CCREG)
        {
          //printf("Extend backwards\n");
          int k;
          k=i;
          while(regs[k-1].regmap[HOST_CCREG]==-1) {
            if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
              //printf("no free regs for store %x\n",start+(k-1)*4);
              break;
            }
            k--;
          }
          if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
            //printf("Extend CC, %x ->\n",start+k*4);
            while(k<=i) {
              regs[k].regmap_entry[HOST_CCREG]=CCREG;
              regs[k].regmap[HOST_CCREG]=CCREG;
              regmap_pre[k+1][HOST_CCREG]=CCREG;
              regs[k+1].wasdirty|=1<<HOST_CCREG;
              regs[k].dirty|=1<<HOST_CCREG;
              regs[k].wasconst&=~(1<<HOST_CCREG);
              regs[k].isconst&=~(1<<HOST_CCREG);
              k++;
            }
          }
          else {
            //printf("Fail Extend CC, %x ->\n",start+k*4);
          }
        }
      }
      // For any instruction type not in this "simple" list, resync the
      // candidate map to the current allocation, discarding stale
      // candidates the instruction may have invalidated.
      if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
         itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
         itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
         itype[i]!=FCONV&&itype[i]!=FCOMP)
      {
        memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
      }
    }
  }
9041
  // This allocates registers (if possible) one instruction prior
  // to use, which can avoid a load-use penalty on certain CPUs.
  for(i=0;i<slen-1;i++)
  {
    // Skip when the previous instruction is a branch: instruction i is
    // then a delay slot whose allocation must not be disturbed.
    if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
    {
      // Also skip when the next instruction is a branch target, since its
      // entry register state must stay as other paths expect it.
      if(!bt[i+1])
      {
        if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
           ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
        {
          // Pull the next instruction's first source-register allocation
          // back into this instruction, if the host reg is free here.
          if(rs1[i+1]) {
            if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
            {
              if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
              {
                regs[i].regmap[hr]=regs[i+1].regmap[hr];
                regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
                regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
                regs[i].isconst&=~(1<<hr);
                regs[i].isconst|=regs[i+1].isconst&(1<<hr);
                constmap[i][hr]=constmap[i+1][hr];
                regs[i+1].wasdirty&=~(1<<hr);
                regs[i].dirty&=~(1<<hr);
              }
            }
          }
          // Same for the second source register.
          if(rs2[i+1]) {
            if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
            {
              if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
              {
                regs[i].regmap[hr]=regs[i+1].regmap[hr];
                regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
                regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
                regs[i].isconst&=~(1<<hr);
                regs[i].isconst|=regs[i+1].isconst&(1<<hr);
                constmap[i][hr]=constmap[i+1][hr];
                regs[i+1].wasdirty&=~(1<<hr);
                regs[i].dirty&=~(1<<hr);
              }
            }
          }
          // Preload target address for load instruction (non-constant)
          // (base reg not allocated -> stage it in the load's target reg).
          if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
            if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
            {
              if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
              {
                regs[i].regmap[hr]=rs1[i+1];
                regmap_pre[i+1][hr]=rs1[i+1];
                regs[i+1].regmap_entry[hr]=rs1[i+1];
                regs[i].isconst&=~(1<<hr);
                regs[i].isconst|=regs[i+1].isconst&(1<<hr);
                constmap[i][hr]=constmap[i+1][hr];
                regs[i+1].wasdirty&=~(1<<hr);
                regs[i].dirty&=~(1<<hr);
              }
            }
          }
          // Load source into target register
          if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
            if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
            {
              if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
              {
                regs[i].regmap[hr]=rs1[i+1];
                regmap_pre[i+1][hr]=rs1[i+1];
                regs[i+1].regmap_entry[hr]=rs1[i+1];
                regs[i].isconst&=~(1<<hr);
                regs[i].isconst|=regs[i+1].isconst&(1<<hr);
                constmap[i][hr]=constmap[i+1][hr];
                regs[i+1].wasdirty&=~(1<<hr);
                regs[i].dirty&=~(1<<hr);
              }
            }
          }
          // Address for store instruction (non-constant)
          if(itype[i+1]==STORE||itype[i+1]==STORELR
             ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
            if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
              // Prefer a host reg free in both instructions; otherwise
              // take one free in i+1 and claim it as an address generator.
              hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
              if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
              else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
              assert(hr>=0);
              if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
              {
                regs[i].regmap[hr]=rs1[i+1];
                regmap_pre[i+1][hr]=rs1[i+1];
                regs[i+1].regmap_entry[hr]=rs1[i+1];
                regs[i].isconst&=~(1<<hr);
                regs[i].isconst|=regs[i+1].isconst&(1<<hr);
                constmap[i][hr]=constmap[i+1][hr];
                regs[i+1].wasdirty&=~(1<<hr);
                regs[i].dirty&=~(1<<hr);
              }
            }
          }
          if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
            if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
              int nr;
              hr=get_reg(regs[i+1].regmap,FTEMP);
              assert(hr>=0);
              if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
              {
                regs[i].regmap[hr]=rs1[i+1];
                regmap_pre[i+1][hr]=rs1[i+1];
                regs[i+1].regmap_entry[hr]=rs1[i+1];
                regs[i].isconst&=~(1<<hr);
                regs[i].isconst|=regs[i+1].isconst&(1<<hr);
                constmap[i][hr]=constmap[i+1][hr];
                regs[i+1].wasdirty&=~(1<<hr);
                regs[i].dirty&=~(1<<hr);
              }
              else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
              {
                // move it to another register
                regs[i+1].regmap[hr]=-1;
                regmap_pre[i+2][hr]=-1;
                regs[i+1].regmap[nr]=FTEMP;
                regmap_pre[i+2][nr]=FTEMP;
                regs[i].regmap[nr]=rs1[i+1];
                regmap_pre[i+1][nr]=rs1[i+1];
                regs[i+1].regmap_entry[nr]=rs1[i+1];
                regs[i].isconst&=~(1<<nr);
                regs[i+1].isconst&=~(1<<nr);
                regs[i].dirty&=~(1<<nr);
                regs[i+1].wasdirty&=~(1<<nr);
                regs[i+1].dirty&=~(1<<nr);
                regs[i+2].wasdirty&=~(1<<nr);
              }
            }
          }
          if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||||itype[i+1]==C2LS*/) {
            // When the base address is a known constant, pre-allocate an
            // address-generation register so the address can be computed
            // one instruction early.
            if(itype[i+1]==LOAD)
              hr=get_reg(regs[i+1].regmap,rt1[i+1]);
            if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
              hr=get_reg(regs[i+1].regmap,FTEMP);
            if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
              hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
              if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
            }
            if(hr>=0&&regs[i].regmap[hr]<0) {
              int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
              if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
                regs[i].regmap[hr]=AGEN1+((i+1)&1);
                regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
                regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
                regs[i].isconst&=~(1<<hr);
                regs[i+1].wasdirty&=~(1<<hr);
                regs[i].dirty&=~(1<<hr);
              }
            }
          }
        }
      }
    }
  }
9200
  /* Pass 6 - Optimize clean/dirty state */
  // Propagate dirty flags across the whole block (final pass, arg 3 = 1).
  clean_registers(0,slen-1,1);
9203
  /* Pass 7 - Identify 32-bit registers */
  for (i=slen-1;i>=0;i--)
  {
    if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
    {
      // Conditional branch
      // (source[i]>>16)==0x1000 would be "beq $0,$0", which never falls
      // through, so only genuinely conditional branches are handled here.
      if((source[i]>>16)!=0x1000&&i<slen-2) {
        // Mark this address as a branch target since it may be called
        // upon return from interrupt
        bt[i+2]=1;
      }
    }
  }

  if(itype[slen-1]==SPAN) {
    bt[slen-1]=1; // Mark as a branch target so instruction can restart after exception
  }
9221
9222 #ifdef DISASM
9223   /* Debug/disassembly */
9224   for(i=0;i<slen;i++)
9225   {
9226     printf("U:");
9227     int r;
9228     for(r=1;r<=CCREG;r++) {
9229       if((unneeded_reg[i]>>r)&1) {
9230         if(r==HIREG) printf(" HI");
9231         else if(r==LOREG) printf(" LO");
9232         else printf(" r%d",r);
9233       }
9234     }
9235     printf("\n");
9236     #if defined(__i386__) || defined(__x86_64__)
9237     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
9238     #endif
9239     #ifdef __arm__
9240     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
9241     #endif
9242     printf("needs: ");
9243     if(needed_reg[i]&1) printf("eax ");
9244     if((needed_reg[i]>>1)&1) printf("ecx ");
9245     if((needed_reg[i]>>2)&1) printf("edx ");
9246     if((needed_reg[i]>>3)&1) printf("ebx ");
9247     if((needed_reg[i]>>5)&1) printf("ebp ");
9248     if((needed_reg[i]>>6)&1) printf("esi ");
9249     if((needed_reg[i]>>7)&1) printf("edi ");
9250     printf("\n");
9251     #if defined(__i386__) || defined(__x86_64__)
9252     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
9253     printf("dirty: ");
9254     if(regs[i].wasdirty&1) printf("eax ");
9255     if((regs[i].wasdirty>>1)&1) printf("ecx ");
9256     if((regs[i].wasdirty>>2)&1) printf("edx ");
9257     if((regs[i].wasdirty>>3)&1) printf("ebx ");
9258     if((regs[i].wasdirty>>5)&1) printf("ebp ");
9259     if((regs[i].wasdirty>>6)&1) printf("esi ");
9260     if((regs[i].wasdirty>>7)&1) printf("edi ");
9261     #endif
9262     #ifdef __arm__
9263     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
9264     printf("dirty: ");
9265     if(regs[i].wasdirty&1) printf("r0 ");
9266     if((regs[i].wasdirty>>1)&1) printf("r1 ");
9267     if((regs[i].wasdirty>>2)&1) printf("r2 ");
9268     if((regs[i].wasdirty>>3)&1) printf("r3 ");
9269     if((regs[i].wasdirty>>4)&1) printf("r4 ");
9270     if((regs[i].wasdirty>>5)&1) printf("r5 ");
9271     if((regs[i].wasdirty>>6)&1) printf("r6 ");
9272     if((regs[i].wasdirty>>7)&1) printf("r7 ");
9273     if((regs[i].wasdirty>>8)&1) printf("r8 ");
9274     if((regs[i].wasdirty>>9)&1) printf("r9 ");
9275     if((regs[i].wasdirty>>10)&1) printf("r10 ");
9276     if((regs[i].wasdirty>>12)&1) printf("r12 ");
9277     #endif
9278     printf("\n");
9279     disassemble_inst(i);
9280     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
9281     #if defined(__i386__) || defined(__x86_64__)
9282     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
9283     if(regs[i].dirty&1) printf("eax ");
9284     if((regs[i].dirty>>1)&1) printf("ecx ");
9285     if((regs[i].dirty>>2)&1) printf("edx ");
9286     if((regs[i].dirty>>3)&1) printf("ebx ");
9287     if((regs[i].dirty>>5)&1) printf("ebp ");
9288     if((regs[i].dirty>>6)&1) printf("esi ");
9289     if((regs[i].dirty>>7)&1) printf("edi ");
9290     #endif
9291     #ifdef __arm__
9292     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
9293     if(regs[i].dirty&1) printf("r0 ");
9294     if((regs[i].dirty>>1)&1) printf("r1 ");
9295     if((regs[i].dirty>>2)&1) printf("r2 ");
9296     if((regs[i].dirty>>3)&1) printf("r3 ");
9297     if((regs[i].dirty>>4)&1) printf("r4 ");
9298     if((regs[i].dirty>>5)&1) printf("r5 ");
9299     if((regs[i].dirty>>6)&1) printf("r6 ");
9300     if((regs[i].dirty>>7)&1) printf("r7 ");
9301     if((regs[i].dirty>>8)&1) printf("r8 ");
9302     if((regs[i].dirty>>9)&1) printf("r9 ");
9303     if((regs[i].dirty>>10)&1) printf("r10 ");
9304     if((regs[i].dirty>>12)&1) printf("r12 ");
9305     #endif
9306     printf("\n");
9307     if(regs[i].isconst) {
9308       printf("constants: ");
9309       #if defined(__i386__) || defined(__x86_64__)
9310       if(regs[i].isconst&1) printf("eax=%x ",(u_int)constmap[i][0]);
9311       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(u_int)constmap[i][1]);
9312       if((regs[i].isconst>>2)&1) printf("edx=%x ",(u_int)constmap[i][2]);
9313       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(u_int)constmap[i][3]);
9314       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(u_int)constmap[i][5]);
9315       if((regs[i].isconst>>6)&1) printf("esi=%x ",(u_int)constmap[i][6]);
9316       if((regs[i].isconst>>7)&1) printf("edi=%x ",(u_int)constmap[i][7]);
9317       #endif
9318       #ifdef __arm__
9319       int r;
9320       for (r = 0; r < ARRAY_SIZE(constmap[i]); r++)
9321         if ((regs[i].isconst >> r) & 1)
9322           printf(" r%d=%x", r, (u_int)constmap[i][r]);
9323       #endif
9324       printf("\n");
9325     }
9326     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9327       #if defined(__i386__) || defined(__x86_64__)
9328       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
9329       if(branch_regs[i].dirty&1) printf("eax ");
9330       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
9331       if((branch_regs[i].dirty>>2)&1) printf("edx ");
9332       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
9333       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
9334       if((branch_regs[i].dirty>>6)&1) printf("esi ");
9335       if((branch_regs[i].dirty>>7)&1) printf("edi ");
9336       #endif
9337       #ifdef __arm__
9338       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
9339       if(branch_regs[i].dirty&1) printf("r0 ");
9340       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
9341       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
9342       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
9343       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
9344       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
9345       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
9346       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
9347       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
9348       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
9349       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
9350       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
9351       #endif
9352     }
9353   }
9354 #endif // DISASM
9355
9356   /* Pass 8 - Assembly */
9357   linkcount=0;stubcount=0;
9358   ds=0;is_delayslot=0;
9359   cop1_usable=0;
9360   uint64_t is32_pre=0;
9361   u_int dirty_pre=0;
9362   void *beginning=start_block();
9363   if((u_int)addr&1) {
9364     ds=1;
9365     pagespan_ds();
9366   }
9367   void *instr_addr0_override = NULL;
9368
9369   if (start == 0x80030000) {
9370     // nasty hack for fastbios thing
9371     // override block entry to this code
9372     instr_addr0_override = out;
9373     emit_movimm(start,0);
9374     // abuse io address var as a flag that we
9375     // have already returned here once
9376     emit_readword(&address,1);
9377     emit_writeword(0,&pcaddr);
9378     emit_writeword(0,&address);
9379     emit_cmp(0,1);
9380     emit_jne(new_dyna_leave);
9381   }
9382   for(i=0;i<slen;i++)
9383   {
9384     //if(ds) printf("ds: ");
9385     disassemble_inst(i);
9386     if(ds) {
9387       ds=0; // Skip delay slot
9388       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
9389       instr_addr[i] = NULL;
9390     } else {
9391       speculate_register_values(i);
9392       #ifndef DESTRUCTIVE_WRITEBACK
9393       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
9394       {
9395         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
9396               unneeded_reg[i],unneeded_reg_upper[i]);
9397       }
9398       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
9399         is32_pre=branch_regs[i].is32;
9400         dirty_pre=branch_regs[i].dirty;
9401       }else{
9402         is32_pre=regs[i].is32;
9403         dirty_pre=regs[i].dirty;
9404       }
9405       #endif
9406       // write back
9407       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
9408       {
9409         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
9410                       unneeded_reg[i],unneeded_reg_upper[i]);
9411         loop_preload(regmap_pre[i],regs[i].regmap_entry);
9412       }
9413       // branch target entry point
9414       instr_addr[i] = out;
9415       assem_debug("<->\n");
9416       drc_dbg_emit_do_cmp(i);
9417
9418       // load regs
9419       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
9420         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
9421       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
9422       address_generation(i,&regs[i],regs[i].regmap_entry);
9423       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
9424       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9425       {
9426         // Load the delay slot registers if necessary
9427         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
9428           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
9429         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
9430           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
9431         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
9432           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
9433       }
9434       else if(i+1<slen)
9435       {
9436         // Preload registers for following instruction
9437         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
9438           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
9439             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
9440         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
9441           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
9442             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
9443       }
9444       // TODO: if(is_ooo(i)) address_generation(i+1);
9445       if(itype[i]==CJUMP||itype[i]==FJUMP)
9446         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
9447       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
9448         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
9449       if(bt[i]) cop1_usable=0;
9450       // assemble
9451       switch(itype[i]) {
9452         case ALU:
9453           alu_assemble(i,&regs[i]);break;
9454         case IMM16:
9455           imm16_assemble(i,&regs[i]);break;
9456         case SHIFT:
9457           shift_assemble(i,&regs[i]);break;
9458         case SHIFTIMM:
9459           shiftimm_assemble(i,&regs[i]);break;
9460         case LOAD:
9461           load_assemble(i,&regs[i]);break;
9462         case LOADLR:
9463           loadlr_assemble(i,&regs[i]);break;
9464         case STORE:
9465           store_assemble(i,&regs[i]);break;
9466         case STORELR:
9467           storelr_assemble(i,&regs[i]);break;
9468         case COP0:
9469           cop0_assemble(i,&regs[i]);break;
9470         case COP1:
9471           cop1_assemble(i,&regs[i]);break;
9472         case C1LS:
9473           c1ls_assemble(i,&regs[i]);break;
9474         case COP2:
9475           cop2_assemble(i,&regs[i]);break;
9476         case C2LS:
9477           c2ls_assemble(i,&regs[i]);break;
9478         case C2OP:
9479           c2op_assemble(i,&regs[i]);break;
9480         case FCONV:
9481           fconv_assemble(i,&regs[i]);break;
9482         case FLOAT:
9483           float_assemble(i,&regs[i]);break;
9484         case FCOMP:
9485           fcomp_assemble(i,&regs[i]);break;
9486         case MULTDIV:
9487           multdiv_assemble(i,&regs[i]);break;
9488         case MOV:
9489           mov_assemble(i,&regs[i]);break;
9490         case SYSCALL:
9491           syscall_assemble(i,&regs[i]);break;
9492         case HLECALL:
9493           hlecall_assemble(i,&regs[i]);break;
9494         case INTCALL:
9495           intcall_assemble(i,&regs[i]);break;
9496         case UJUMP:
9497           ujump_assemble(i,&regs[i]);ds=1;break;
9498         case RJUMP:
9499           rjump_assemble(i,&regs[i]);ds=1;break;
9500         case CJUMP:
9501           cjump_assemble(i,&regs[i]);ds=1;break;
9502         case SJUMP:
9503           sjump_assemble(i,&regs[i]);ds=1;break;
9504         case FJUMP:
9505           fjump_assemble(i,&regs[i]);ds=1;break;
9506         case SPAN:
9507           pagespan_assemble(i,&regs[i]);break;
9508       }
9509       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
9510         literal_pool(1024);
9511       else
9512         literal_pool_jumpover(256);
9513     }
9514   }
9515   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
9516   // If the block did not end with an unconditional branch,
9517   // add a jump to the next instruction.
9518   if(i>1) {
9519     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
9520       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
9521       assert(i==slen);
9522       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
9523         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
9524         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
9525           emit_loadreg(CCREG,HOST_CCREG);
9526         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
9527       }
9528       else if(!likely[i-2])
9529       {
9530         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
9531         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
9532       }
9533       else
9534       {
9535         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
9536         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
9537       }
9538       add_to_linker(out,start+i*4,0);
9539       emit_jmp(0);
9540     }
9541   }
9542   else
9543   {
9544     assert(i>0);
9545     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
9546     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
9547     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
9548       emit_loadreg(CCREG,HOST_CCREG);
9549     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
9550     add_to_linker(out,start+i*4,0);
9551     emit_jmp(0);
9552   }
9553
9554   // TODO: delay slot stubs?
9555   // Stubs
9556   for(i=0;i<stubcount;i++)
9557   {
9558     switch(stubs[i].type)
9559     {
9560       case LOADB_STUB:
9561       case LOADH_STUB:
9562       case LOADW_STUB:
9563       case LOADD_STUB:
9564       case LOADBU_STUB:
9565       case LOADHU_STUB:
9566         do_readstub(i);break;
9567       case STOREB_STUB:
9568       case STOREH_STUB:
9569       case STOREW_STUB:
9570       case STORED_STUB:
9571         do_writestub(i);break;
9572       case CC_STUB:
9573         do_ccstub(i);break;
9574       case INVCODE_STUB:
9575         do_invstub(i);break;
9576       case FP_STUB:
9577         do_cop1stub(i);break;
9578       case STORELR_STUB:
9579         do_unalignedwritestub(i);break;
9580     }
9581   }
9582
9583   if (instr_addr0_override)
9584     instr_addr[0] = instr_addr0_override;
9585
9586   /* Pass 9 - Linker */
9587   for(i=0;i<linkcount;i++)
9588   {
9589     assem_debug("%p -> %8x\n",link_addr[i].addr,link_addr[i].target);
9590     literal_pool(64);
9591     if (!link_addr[i].ext)
9592     {
9593       void *stub = out;
9594       void *addr = check_addr(link_addr[i].target);
9595       emit_extjump(link_addr[i].addr, link_addr[i].target);
9596       if (addr) {
9597         set_jump_target(link_addr[i].addr, addr);
9598         add_link(link_addr[i].target,stub);
9599       }
9600       else
9601         set_jump_target(link_addr[i].addr, stub);
9602     }
9603     else
9604     {
9605       // Internal branch
9606       int target=(link_addr[i].target-start)>>2;
9607       assert(target>=0&&target<slen);
9608       assert(instr_addr[target]);
9609       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
9610       //set_jump_target_fillslot(link_addr[i].addr,instr_addr[target],link_addr[i].ext>>1);
9611       //#else
9612       set_jump_target(link_addr[i].addr, instr_addr[target]);
9613       //#endif
9614     }
9615   }
9616   // External Branch Targets (jump_in)
9617   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
9618   for(i=0;i<slen;i++)
9619   {
9620     if(bt[i]||i==0)
9621     {
9622       if(instr_addr[i]) // TODO - delay slots (=null)
9623       {
9624         u_int vaddr=start+i*4;
9625         u_int page=get_page(vaddr);
9626         u_int vpage=get_vpage(vaddr);
9627         literal_pool(256);
9628         {
9629           assem_debug("%p (%d) <- %8x\n",instr_addr[i],i,start+i*4);
9630           assem_debug("jump_in: %x\n",start+i*4);
9631           ll_add(jump_dirty+vpage,vaddr,out);
9632           void *entry_point = do_dirty_stub(i);
9633           ll_add_flags(jump_in+page,vaddr,state_rflags,entry_point);
9634           // If there was an existing entry in the hash table,
9635           // replace it with the new address.
9636           // Don't add new entries.  We'll insert the
9637           // ones that actually get used in check_addr().
9638           struct ht_entry *ht_bin = hash_table_get(vaddr);
9639           if (ht_bin->vaddr[0] == vaddr)
9640             ht_bin->tcaddr[0] = entry_point;
9641           if (ht_bin->vaddr[1] == vaddr)
9642             ht_bin->tcaddr[1] = entry_point;
9643         }
9644       }
9645     }
9646   }
9647   // Write out the literal pool if necessary
9648   literal_pool(0);
9649   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
9650   // Align code
9651   if(((u_int)out)&7) emit_addnop(13);
9652   #endif
9653   assert(out - (u_char *)beginning < MAX_OUTPUT_BLOCK_SIZE);
9654   //printf("shadow buffer: %p-%p\n",copy,(u_char *)copy+slen*4);
9655   memcpy(copy,source,slen*4);
9656   copy+=slen*4;
9657
9658   end_block(beginning);
9659
9660   // If we're within 256K of the end of the buffer,
9661   // start over from the beginning. (Is 256K enough?)
9662   if (out > translation_cache+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE)
9663     out = translation_cache;
9664
9665   // Trap writes to any of the pages we compiled
9666   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
9667     invalid_code[i]=0;
9668   }
9669   inv_code_start=inv_code_end=~0;
9670
9671   // for PCSX we need to mark all mirrors too
9672   if(get_page(start)<(RAM_SIZE>>12))
9673     for(i=start>>12;i<=(start+slen*4)>>12;i++)
9674       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
9675       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
9676       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
9677
9678   /* Pass 10 - Free memory by expiring oldest blocks */
9679
9680   int end=(((out-translation_cache)>>(TARGET_SIZE_2-16))+16384)&65535;
9681   while(expirep!=end)
9682   {
9683     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
9684     uintptr_t base=(uintptr_t)translation_cache+((expirep>>13)<<shift); // Base address of this block
9685     inv_debug("EXP: Phase %d\n",expirep);
9686     switch((expirep>>11)&3)
9687     {
9688       case 0:
9689         // Clear jump_in and jump_dirty
9690         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
9691         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
9692         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
9693         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
9694         break;
9695       case 1:
9696         // Clear pointers
9697         ll_kill_pointers(jump_out[expirep&2047],base,shift);
9698         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
9699         break;
9700       case 2:
9701         // Clear hash table
9702         for(i=0;i<32;i++) {
9703           struct ht_entry *ht_bin = &hash_table[((expirep&2047)<<5)+i];
9704           if (((uintptr_t)ht_bin->tcaddr[1]>>shift) == (base>>shift) ||
9705              (((uintptr_t)ht_bin->tcaddr[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
9706             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[1],ht_bin->tcaddr[1]);
9707             ht_bin->vaddr[1] = -1;
9708             ht_bin->tcaddr[1] = NULL;
9709           }
9710           if (((uintptr_t)ht_bin->tcaddr[0]>>shift) == (base>>shift) ||
9711              (((uintptr_t)ht_bin->tcaddr[0]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
9712             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[0],ht_bin->tcaddr[0]);
9713             ht_bin->vaddr[0] = ht_bin->vaddr[1];
9714             ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
9715             ht_bin->vaddr[1] = -1;
9716             ht_bin->tcaddr[1] = NULL;
9717           }
9718         }
9719         break;
9720       case 3:
9721         // Clear jump_out
9722         #ifdef __arm__
9723         if((expirep&2047)==0)
9724           do_clear_cache();
9725         #endif
9726         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
9727         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
9728         break;
9729     }
9730     expirep=(expirep+1)&65535;
9731   }
9732   return 0;
9733 }
9734
9735 // vim:shiftwidth=2:expandtab