[pcsx_rearmed.git] / libpcsxcore / new_dynarec / new_dynarec.c
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> //include for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 #endif
36
37 #include "new_dynarec_config.h"
38 #include "../psxhle.h" //emulator interface
39 #include "emu_if.h" //emulator interface
40
41 #ifndef ARRAY_SIZE
42 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
43 #endif
44
45 //#define DISASM
46 //#define assem_debug printf
47 //#define inv_debug printf
48 #define assem_debug(...)
49 #define inv_debug(...)
50
51 #ifdef __i386__
52 #include "assem_x86.h"
53 #endif
54 #ifdef __x86_64__
55 #include "assem_x64.h"
56 #endif
57 #ifdef __arm__
58 #include "assem_arm.h"
59 #endif
60
61 #define MAXBLOCK 4096
62 #define MAX_OUTPUT_BLOCK_SIZE 262144
63
64 // stubs
65 enum stub_type {
66   CC_STUB = 1,
67   FP_STUB = 2,
68   LOADB_STUB = 3,
69   LOADH_STUB = 4,
70   LOADW_STUB = 5,
71   LOADD_STUB = 6,
72   LOADBU_STUB = 7,
73   LOADHU_STUB = 8,
74   STOREB_STUB = 9,
75   STOREH_STUB = 10,
76   STOREW_STUB = 11,
77   STORED_STUB = 12,
78   STORELR_STUB = 13,
79   INVCODE_STUB = 14,
80 };
81
82 struct regstat
83 {
84   signed char regmap_entry[HOST_REGS];
85   signed char regmap[HOST_REGS];
86   uint64_t was32;
87   uint64_t is32;
88   uint64_t wasdirty;
89   uint64_t dirty;
90   uint64_t u;
91   uint64_t uu;
92   u_int wasconst;
93   u_int isconst;
94   u_int loadedconst;             // host regs that have constants loaded
95   u_int waswritten;              // MIPS regs that were used as store base before
96 };
97
98 // note: asm depends on this layout
99 struct ll_entry
100 {
101   u_int vaddr;
102   u_int reg_sv_flags;
103   void *addr;
104   struct ll_entry *next;
105 };
106
107 struct ht_entry
108 {
109   u_int vaddr[2];
110   void *tcaddr[2];
111 };
112
113 struct code_stub
114 {
115   enum stub_type type;
116   void *addr;
117   void *retaddr;
118   u_int a;
119   uintptr_t b;
120   uintptr_t c;
121   u_int d;
122   u_int e;
123 };
124
125 struct link_entry
126 {
127   void *addr;
128   u_int target;
129   u_int ext;
130 };
131
132   // used by asm:
133   u_char *out;
134   struct ht_entry hash_table[65536]  __attribute__((aligned(16)));
135   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
136   struct ll_entry *jump_dirty[4096];
137
138   static struct ll_entry *jump_out[4096];
139   static u_int start;
140   static u_int *source;
141   static char insn[MAXBLOCK][10];
142   static u_char itype[MAXBLOCK];
143   static u_char opcode[MAXBLOCK];
144   static u_char opcode2[MAXBLOCK];
145   static u_char bt[MAXBLOCK];
146   static u_char rs1[MAXBLOCK];
147   static u_char rs2[MAXBLOCK];
148   static u_char rt1[MAXBLOCK];
149   static u_char rt2[MAXBLOCK];
150   static u_char us1[MAXBLOCK];
151   static u_char us2[MAXBLOCK];
152   static u_char dep1[MAXBLOCK];
153   static u_char dep2[MAXBLOCK];
154   static u_char lt1[MAXBLOCK];
155   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
156   static uint64_t gte_rt[MAXBLOCK];
157   static uint64_t gte_unneeded[MAXBLOCK];
158   static u_int smrv[32]; // speculated MIPS register values
159   static u_int smrv_strong; // mask of regs that are likely to have correct values
160   static u_int smrv_weak; // same, but somewhat less likely
161   static u_int smrv_strong_next; // same, but after current insn executes
162   static u_int smrv_weak_next;
163   static int imm[MAXBLOCK];
164   static u_int ba[MAXBLOCK];
165   static char likely[MAXBLOCK];
166   static char is_ds[MAXBLOCK];
167   static char ooo[MAXBLOCK];
168   static uint64_t unneeded_reg[MAXBLOCK];
169   static uint64_t unneeded_reg_upper[MAXBLOCK];
170   static uint64_t branch_unneeded_reg[MAXBLOCK];
171   static uint64_t branch_unneeded_reg_upper[MAXBLOCK];
172   static signed char regmap_pre[MAXBLOCK][HOST_REGS];
173   static uint64_t current_constmap[HOST_REGS];
174   static uint64_t constmap[MAXBLOCK][HOST_REGS];
175   static struct regstat regs[MAXBLOCK];
176   static struct regstat branch_regs[MAXBLOCK];
177   static signed char minimum_free_regs[MAXBLOCK];
178   static u_int needed_reg[MAXBLOCK];
179   static u_int wont_dirty[MAXBLOCK];
180   static u_int will_dirty[MAXBLOCK];
181   static int ccadj[MAXBLOCK];
182   static int slen;
183   static void *instr_addr[MAXBLOCK];
184   static struct link_entry link_addr[MAXBLOCK];
185   static int linkcount;
186   static struct code_stub stubs[MAXBLOCK*3];
187   static int stubcount;
188   static u_int literals[1024][2];
189   static int literalcount;
190   static int is_delayslot;
191   static int cop1_usable;
192   static char shadow[1048576]  __attribute__((aligned(16)));
193   static void *copy;
194   static int expirep;
195   static u_int stop_after_jal;
196 #ifndef RAM_FIXED
197   static u_int ram_offset;
198 #else
199   static const u_int ram_offset=0;
200 #endif
201
202   int new_dynarec_hacks;
203   int new_dynarec_did_compile;
204   extern u_char restore_candidate[512];
205   extern int cycle_count;
206
207   /* registers that may be allocated */
208   /* 1-31 gpr */
209 #define HIREG 32 // hi
210 #define LOREG 33 // lo
211 #define FSREG 34 // FPU status (FCSR)
212 #define CSREG 35 // Coprocessor status
213 #define CCREG 36 // Cycle count
214 #define INVCP 37 // Pointer to invalid_code
215 //#define MMREG 38 // Pointer to memory_map
216 //#define ROREG 39 // ram offset (if rdram!=0x80000000)
217 #define TEMPREG 40
218 #define FTEMP 40 // FPU temporary register
219 #define PTEMP 41 // Prefetch temporary register
220 //#define TLREG 42 // TLB mapping offset
221 #define RHASH 43 // Return address hash
222 #define RHTBL 44 // Return address hash table address
223 #define RTEMP 45 // JR/JALR address register
224 #define MAXREG 45
225 #define AGEN1 46 // Address generation temporary register
226 //#define AGEN2 47 // Address generation temporary register
227 //#define MGEN1 48 // Maptable address generation temporary register
228 //#define MGEN2 49 // Maptable address generation temporary register
229 #define BTREG 50 // Branch target temporary register
230
231   /* instruction types */
232 #define NOP 0     // No operation
233 #define LOAD 1    // Load
234 #define STORE 2   // Store
235 #define LOADLR 3  // Unaligned load
236 #define STORELR 4 // Unaligned store
237 #define MOV 5     // Move
238 #define ALU 6     // Arithmetic/logic
239 #define MULTDIV 7 // Multiply/divide
240 #define SHIFT 8   // Shift by register
241 #define SHIFTIMM 9// Shift by immediate
242 #define IMM16 10  // 16-bit immediate
243 #define RJUMP 11  // Unconditional jump to register
244 #define UJUMP 12  // Unconditional jump
245 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
246 #define SJUMP 14  // Conditional branch (regimm format)
247 #define COP0 15   // Coprocessor 0
248 #define COP1 16   // Coprocessor 1
249 #define C1LS 17   // Coprocessor 1 load/store
250 #define FJUMP 18  // Conditional branch (floating point)
251 #define FLOAT 19  // Floating point unit
252 #define FCONV 20  // Convert integer to float
253 #define FCOMP 21  // Floating point compare (sets FSREG)
254 #define SYSCALL 22// SYSCALL
255 #define OTHER 23  // Other
256 #define SPAN 24   // Branch/delay slot spans 2 pages
257 #define NI 25     // Not implemented
258 #define HLECALL 26// PCSX fake opcodes for HLE
259 #define COP2 27   // Coprocessor 2 move
260 #define C2LS 28   // Coprocessor 2 load/store
261 #define C2OP 29   // Coprocessor 2 operation
262 #define INTCALL 30// Call interpreter to handle rare corner cases
263
264   /* branch codes */
265 #define TAKEN 1
266 #define NOTTAKEN 2
267 #define NULLDS 3
268
269 // asm linkage
270 int new_recompile_block(int addr);
271 void *get_addr_ht(u_int vaddr);
272 void invalidate_block(u_int block);
273 void invalidate_addr(u_int addr);
274 void remove_hash(int vaddr);
275 void dyna_linker();
276 void dyna_linker_ds();
277 void verify_code();
278 void verify_code_vm();
279 void verify_code_ds();
280 void cc_interrupt();
281 void fp_exception();
282 void fp_exception_ds();
283 void jump_syscall_hle();
284 void jump_hlecall();
285 void jump_intcall();
286 void new_dyna_leave();
287
288 // Needed by assembler
289 static void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
290 static void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
291 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
292 static void load_all_regs(signed char i_regmap[]);
293 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
294 static void load_regs_entry(int t);
295 static void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
296
297 static int verify_dirty(u_int *ptr);
298 static int get_final_value(int hr, int i, int *value);
299 static void add_stub(enum stub_type type, void *addr, void *retaddr,
300   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e);
301 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
302   int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist);
303 static void add_to_linker(void *addr, u_int target, int ext);
304
305 static void mprotect_w_x(void *start, void *end, int is_x)
306 {
307 #ifdef NO_WRITE_EXEC
308   #if defined(VITA)
309   // *Open* enables write on all memory that was
310   // allocated by sceKernelAllocMemBlockForVM()?
311   if (is_x)
312     sceKernelCloseVMDomain();
313   else
314     sceKernelOpenVMDomain();
315   #else
316   u_long mstart = (u_long)start & ~4095ul;
317   u_long mend = (u_long)end;
318   if (mprotect((void *)mstart, mend - mstart,
319                PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE)) != 0)
320     SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
321   #endif
322 #endif
323 }
324
325 static void start_tcache_write(void *start, void *end)
326 {
327   mprotect_w_x(start, end, 0);
328 }
329
330 static void end_tcache_write(void *start, void *end)
331 {
332 #ifdef __arm__
333   size_t len = (char *)end - (char *)start;
334   #if   defined(__BLACKBERRY_QNX__)
335   msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
336   #elif defined(__MACH__)
337   sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
338   #elif defined(VITA)
339   sceKernelSyncVMDomain(sceBlock, start, len);
340   #elif defined(_3DS)
341   ctr_flush_invalidate_cache();
342   #else
343   __clear_cache(start, end);
344   #endif
345   (void)len;
346 #endif
347
348   mprotect_w_x(start, end, 1);
349 }
350
351 static void *start_block(void)
352 {
353   u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
354   if (end > translation_cache + (1<<TARGET_SIZE_2))
355     end = translation_cache + (1<<TARGET_SIZE_2);
356   start_tcache_write(out, end);
357   return out;
358 }
359
360 static void end_block(void *start)
361 {
362   end_tcache_write(start, out);
363 }
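/* Minimal usage sketch (illustrative only, not part of the original source):
 * under NO_WRITE_EXEC the translation cache is never writable and executable
 * at the same time, so every emit sequence is bracketed by the two helpers
 * defined above. */
#if 0
static void example_emit_sequence(void)
{
  void *block = start_block();  // make the window around 'out' writable
  /* ... emit code here, advancing the global 'out' pointer ... */
  end_block(block);             // flush the icache and flip the pages back to exec
}
#endif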
364
365 //#define DEBUG_CYCLE_COUNT 1
366
367 #define NO_CYCLE_PENALTY_THR 12
368
369 int cycle_multiplier; // 100 for 1.0
370
371 static int CLOCK_ADJUST(int x)
372 {
373   int s=(x>>31)|1;
374   return (x * cycle_multiplier + s * 50) / 100;
375 }
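/* Worked example of the rounding above (illustrative values): s=(x>>31)|1 is
 * +1 for x>=0 and -1 for x<0, so the s*50 term rounds the scaled count to the
 * nearest cycle away from zero.  With cycle_multiplier=150 (1.5x):
 *   CLOCK_ADJUST(7)  = ( 7*150 + 50)/100 =  11
 *   CLOCK_ADJUST(-7) = (-7*150 - 50)/100 = -11
 */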
376
377 static u_int get_page(u_int vaddr)
378 {
379   u_int page=vaddr&~0xe0000000;
380   if (page < 0x1000000)
381     page &= ~0x0e00000; // RAM mirrors
382   page>>=12;
383   if(page>2048) page=2048+(page&2047);
384   return page;
385 }
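/* Worked examples for the page mapping above (PSX address map, illustrative):
 *   0x80030000 (KSEG0 RAM)  -> 0x00030000            -> page 48
 *   0x00230000 (RAM mirror) -> mirror bits masked    -> page 48 as well
 *   0xbfc00000 (BIOS)       -> 0x1fc00000, >2048     -> 2048+(0x1fc00&2047) = page 3072
 * RAM and its mirrors land in the low pages; everything above 16MB is folded
 * into pages 2048..4095.
 */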
386
387 // no virtual mem in PCSX
388 static u_int get_vpage(u_int vaddr)
389 {
390   return get_page(vaddr);
391 }
392
393 static struct ht_entry *hash_table_get(u_int vaddr)
394 {
395   return &hash_table[((vaddr>>16)^vaddr)&0xFFFF];
396 }
397
398 static void hash_table_add(struct ht_entry *ht_bin, u_int vaddr, void *tcaddr)
399 {
400   ht_bin->vaddr[1] = ht_bin->vaddr[0];
401   ht_bin->tcaddr[1] = ht_bin->tcaddr[0];
402   ht_bin->vaddr[0] = vaddr;
403   ht_bin->tcaddr[0] = tcaddr;
404 }
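/* Illustrative sketch (not part of the build): each bin holds two entries and
 * behaves as a tiny MRU cache.  0x80030000 and 0x80070004 both hash to bin
 * 0x8003, since ((vaddr>>16)^vaddr)&0xFFFF is the same for both addresses. */
#if 0
static void example_hash_bin(void)
{
  struct ht_entry *bin = hash_table_get(0x80030000);
  hash_table_add(bin, 0x80030000, (void *)0x1000); // slot 0 <- 0x80030000
  hash_table_add(bin, 0x80070004, (void *)0x2000); // slot 0 <- 0x80070004, slot 1 <- 0x80030000
  /* a third add to this bin would push 0x80030000 out of slot 1 */
}
#endif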
405
406 // some messy code from Ari64; it seems to rely on unsigned 32-bit overflow
407 static int doesnt_expire_soon(void *tcaddr)
408 {
409   u_int diff = (u_int)((u_char *)tcaddr - out) << (32-TARGET_SIZE_2);
410   return diff > (u_int)(0x60000000 + (MAX_OUTPUT_BLOCK_SIZE << (32-TARGET_SIZE_2)));
411 }
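/* Sketch of the overflow trick: the subtraction wraps, and the shift rescales
 * the translation cache (1<<TARGET_SIZE_2 bytes) onto the full 32-bit range,
 * so 'diff' is the circular distance from the current emit pointer 'out' to
 * the block.  The block is treated as safe only when that distance exceeds
 * roughly 3/8 of the cache (0x60000000 out of 2^32) plus one maximal output
 * block, i.e. it lies well clear of the region the expiry sweep reclaims next. */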
412
413 // Get address from virtual address
414 // This is called from the recompiled JR/JALR instructions
415 void *get_addr(u_int vaddr)
416 {
417   u_int page=get_page(vaddr);
418   u_int vpage=get_vpage(vaddr);
419   struct ll_entry *head;
420   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
421   head=jump_in[page];
422   while(head!=NULL) {
423     if(head->vaddr==vaddr) {
424   //printf("TRACE: count=%d next=%d (get_addr match %x: %p)\n",Count,next_interupt,vaddr,head->addr);
425       hash_table_add(hash_table_get(vaddr), vaddr, head->addr);
426       return head->addr;
427     }
428     head=head->next;
429   }
430   head=jump_dirty[vpage];
431   while(head!=NULL) {
432     if(head->vaddr==vaddr) {
433       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %p)\n",Count,next_interupt,vaddr,head->addr);
434       // Don't restore blocks which are about to expire from the cache
435       if (doesnt_expire_soon(head->addr))
436       if (verify_dirty(head->addr)) {
437         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
438         invalid_code[vaddr>>12]=0;
439         inv_code_start=inv_code_end=~0;
440         if(vpage<2048) {
441           restore_candidate[vpage>>3]|=1<<(vpage&7);
442         }
443         else restore_candidate[page>>3]|=1<<(page&7);
444         struct ht_entry *ht_bin = hash_table_get(vaddr);
445         if (ht_bin->vaddr[0] == vaddr)
446           ht_bin->tcaddr[0] = head->addr; // Replace existing entry
447         else
448           hash_table_add(ht_bin, vaddr, head->addr);
449
450         return head->addr;
451       }
452     }
453     head=head->next;
454   }
455   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
456   int r=new_recompile_block(vaddr);
457   if(r==0) return get_addr(vaddr);
458   // Execute in unmapped page, generate a page fault exception
459   Status|=2;
460   Cause=(vaddr<<31)|0x8;
461   EPC=(vaddr&1)?vaddr-5:vaddr;
462   BadVAddr=(vaddr&~1);
463   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
464   EntryHi=BadVAddr&0xFFFFE000;
465   return get_addr_ht(0x80000000);
466 }
467 // Look up address in hash table first
468 void *get_addr_ht(u_int vaddr)
469 {
470   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
471   const struct ht_entry *ht_bin = hash_table_get(vaddr);
472   if (ht_bin->vaddr[0] == vaddr) return ht_bin->tcaddr[0];
473   if (ht_bin->vaddr[1] == vaddr) return ht_bin->tcaddr[1];
474   return get_addr(vaddr);
475 }
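/* Overall lookup order when recompiled code needs a target address (sketch):
 *   1. the two-entry hash bin for vaddr (get_addr_ht above),
 *   2. the jump_in[page] list of clean compiled blocks,
 *   3. the jump_dirty[vpage] list, re-validated with verify_dirty() and
 *      re-published in the hash table on success,
 *   4. new_recompile_block() if nothing usable was found.
 */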
476
477 void clear_all_regs(signed char regmap[])
478 {
479   int hr;
480   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
481 }
482
483 signed char get_reg(signed char regmap[],int r)
484 {
485   int hr;
486   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
487   return -1;
488 }
489
490 // Find a register that is available for two consecutive cycles
491 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
492 {
493   int hr;
494   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
495   return -1;
496 }
497
498 int count_free_regs(signed char regmap[])
499 {
500   int count=0;
501   int hr;
502   for(hr=0;hr<HOST_REGS;hr++)
503   {
504     if(hr!=EXCLUDE_REG) {
505       if(regmap[hr]<0) count++;
506     }
507   }
508   return count;
509 }
510
511 void dirty_reg(struct regstat *cur,signed char reg)
512 {
513   int hr;
514   if(!reg) return;
515   for (hr=0;hr<HOST_REGS;hr++) {
516     if((cur->regmap[hr]&63)==reg) {
517       cur->dirty|=1<<hr;
518     }
519   }
520 }
521
522 // If we dirty the lower half of a 64 bit register which is now being
523 // sign-extended, we need to dump the upper half.
524 // Note: Do this only after completion of the instruction, because
525 // some instructions may need to read the full 64-bit value even if
526 // overwriting it (eg SLTI, DSRA32).
527 static void flush_dirty_uppers(struct regstat *cur)
528 {
529   int hr,reg;
530   for (hr=0;hr<HOST_REGS;hr++) {
531     if((cur->dirty>>hr)&1) {
532       reg=cur->regmap[hr];
533       if(reg>=64)
534         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
535     }
536   }
537 }
538
539 void set_const(struct regstat *cur,signed char reg,uint64_t value)
540 {
541   int hr;
542   if(!reg) return;
543   for (hr=0;hr<HOST_REGS;hr++) {
544     if(cur->regmap[hr]==reg) {
545       cur->isconst|=1<<hr;
546       current_constmap[hr]=value;
547     }
548     else if((cur->regmap[hr]^64)==reg) {
549       cur->isconst|=1<<hr;
550       current_constmap[hr]=value>>32;
551     }
552   }
553 }
554
555 void clear_const(struct regstat *cur,signed char reg)
556 {
557   int hr;
558   if(!reg) return;
559   for (hr=0;hr<HOST_REGS;hr++) {
560     if((cur->regmap[hr]&63)==reg) {
561       cur->isconst&=~(1<<hr);
562     }
563   }
564 }
565
566 int is_const(struct regstat *cur,signed char reg)
567 {
568   int hr;
569   if(reg<0) return 0;
570   if(!reg) return 1;
571   for (hr=0;hr<HOST_REGS;hr++) {
572     if((cur->regmap[hr]&63)==reg) {
573       return (cur->isconst>>hr)&1;
574     }
575   }
576   return 0;
577 }
578 uint64_t get_const(struct regstat *cur,signed char reg)
579 {
580   int hr;
581   if(!reg) return 0;
582   for (hr=0;hr<HOST_REGS;hr++) {
583     if(cur->regmap[hr]==reg) {
584       return current_constmap[hr];
585     }
586   }
587   SysPrintf("Unknown constant in r%d\n",reg);
588   exit(1);
589 }
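/* Illustrative sketch (not part of the build) of how constants are tracked
 * per host register; it assumes MIPS regs 5 and 6 are currently mapped in
 * cur->regmap, since set_const/get_const only record values for registers
 * that have a host register assigned. */
#if 0
static void example_const_tracking(struct regstat *cur)
{
  set_const(cur, 5, 0x1f800000);     // r5 now holds a known value
  if (is_const(cur, 5)) {
    uint64_t v = get_const(cur, 5);  // 0x1f800000
    set_const(cur, 6, v + 0x10);     // e.g. the result of ADDIU r6, r5, 0x10
  }
  clear_const(cur, 5);               // r5 later overwritten with an unknown value
}
#endif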
590
591 // Least soon needed registers
592 // Look at the next ten instructions and see which registers
593 // will be used.  Try not to reallocate these.
594 void lsn(u_char hsn[], int i, int *preferred_reg)
595 {
596   int j;
597   int b=-1;
598   for(j=0;j<9;j++)
599   {
600     if(i+j>=slen) {
601       j=slen-i-1;
602       break;
603     }
604     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
605     {
606       // Don't go past an unconditional jump
607       j++;
608       break;
609     }
610   }
611   for(;j>=0;j--)
612   {
613     if(rs1[i+j]) hsn[rs1[i+j]]=j;
614     if(rs2[i+j]) hsn[rs2[i+j]]=j;
615     if(rt1[i+j]) hsn[rt1[i+j]]=j;
616     if(rt2[i+j]) hsn[rt2[i+j]]=j;
617     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
618       // Stores can allocate zero
619       hsn[rs1[i+j]]=j;
620       hsn[rs2[i+j]]=j;
621     }
622     // On some architectures stores need invc_ptr
623     #if defined(HOST_IMM8)
624     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
625       hsn[INVCP]=j;
626     }
627     #endif
628     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
629     {
630       hsn[CCREG]=j;
631       b=j;
632     }
633   }
634   if(b>=0)
635   {
636     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
637     {
638       // Follow first branch
639       int t=(ba[i+b]-start)>>2;
640       j=7-b;if(t+j>=slen) j=slen-t-1;
641       for(;j>=0;j--)
642       {
643         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
644         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
645         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
646         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
647       }
648     }
649     // TODO: preferred register based on backward branch
650   }
651   // Delay slot should preferably not overwrite branch conditions or cycle count
652   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
653     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
654     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
655     hsn[CCREG]=1;
656     // ...or hash tables
657     hsn[RHASH]=1;
658     hsn[RHTBL]=1;
659   }
660   // Coprocessor load/store needs FTEMP, even if not declared
661   if(itype[i]==C1LS||itype[i]==C2LS) {
662     hsn[FTEMP]=0;
663   }
664   // Load L/R also uses FTEMP as a temporary register
665   if(itype[i]==LOADLR) {
666     hsn[FTEMP]=0;
667   }
668   // Also SWL/SWR/SDL/SDR
669   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
670     hsn[FTEMP]=0;
671   }
672   // Don't remove the miniht registers
673   if(itype[i]==UJUMP||itype[i]==RJUMP)
674   {
675     hsn[RHASH]=0;
676     hsn[RHTBL]=0;
677   }
678 }
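/* Result of lsn() in short: hsn[r] ends up holding the distance, in
 * instructions, to the next use of register r within the lookahead window
 * (smaller means needed sooner), so the allocator prefers to evict registers
 * with larger hsn[] values.  CCREG, RHASH, RHTBL and FTEMP are special-cased
 * above so they are not evicted right before they are needed. */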
679
680 // We only want to allocate registers if we're going to use them again soon
681 int needed_again(int r, int i)
682 {
683   int j;
684   int b=-1;
685   int rn=10;
686
687   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
688   {
689     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
690       return 0; // Don't need any registers if exiting the block
691   }
692   for(j=0;j<9;j++)
693   {
694     if(i+j>=slen) {
695       j=slen-i-1;
696       break;
697     }
698     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
699     {
700       // Don't go past an unconditional jump
701       j++;
702       break;
703     }
704     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
705     {
706       break;
707     }
708   }
709   for(;j>=1;j--)
710   {
711     if(rs1[i+j]==r) rn=j;
712     if(rs2[i+j]==r) rn=j;
713     if((unneeded_reg[i+j]>>r)&1) rn=10;
714     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
715     {
716       b=j;
717     }
718   }
719   /*
720   if(b>=0)
721   {
722     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
723     {
724       // Follow first branch
725       int o=rn;
726       int t=(ba[i+b]-start)>>2;
727       j=7-b;if(t+j>=slen) j=slen-t-1;
728       for(;j>=0;j--)
729       {
730         if(!((unneeded_reg[t+j]>>r)&1)) {
731           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
732           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
733         }
734         else rn=o;
735       }
736     }
737   }*/
738   if(rn<10) return 1;
739   (void)b;
740   return 0;
741 }
742
743 // Try to match register allocations at the end of a loop with those
744 // at the beginning
745 int loop_reg(int i, int r, int hr)
746 {
747   int j,k;
748   for(j=0;j<9;j++)
749   {
750     if(i+j>=slen) {
751       j=slen-i-1;
752       break;
753     }
754     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
755     {
756       // Don't go past an unconditional jump
757       j++;
758       break;
759     }
760   }
761   k=0;
762   if(i>0){
763     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
764       k--;
765   }
766   for(;k<j;k++)
767   {
768     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
769     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
770     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
771     {
772       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
773       {
774         int t=(ba[i+k]-start)>>2;
775         int reg=get_reg(regs[t].regmap_entry,r);
776         if(reg>=0) return reg;
777         //reg=get_reg(regs[t+1].regmap_entry,r);
778         //if(reg>=0) return reg;
779       }
780     }
781   }
782   return hr;
783 }
784
785
786 // Allocate every register, preserving source/target regs
787 void alloc_all(struct regstat *cur,int i)
788 {
789   int hr;
790
791   for(hr=0;hr<HOST_REGS;hr++) {
792     if(hr!=EXCLUDE_REG) {
793       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
794          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
795       {
796         cur->regmap[hr]=-1;
797         cur->dirty&=~(1<<hr);
798       }
799       // Don't need zeros
800       if((cur->regmap[hr]&63)==0)
801       {
802         cur->regmap[hr]=-1;
803         cur->dirty&=~(1<<hr);
804       }
805     }
806   }
807 }
808
809 #ifdef __i386__
810 #include "assem_x86.c"
811 #endif
812 #ifdef __x86_64__
813 #include "assem_x64.c"
814 #endif
815 #ifdef __arm__
816 #include "assem_arm.c"
817 #endif
818
819 // Add virtual address mapping to linked list
820 void ll_add(struct ll_entry **head,int vaddr,void *addr)
821 {
822   struct ll_entry *new_entry;
823   new_entry=malloc(sizeof(struct ll_entry));
824   assert(new_entry!=NULL);
825   new_entry->vaddr=vaddr;
826   new_entry->reg_sv_flags=0;
827   new_entry->addr=addr;
828   new_entry->next=*head;
829   *head=new_entry;
830 }
831
832 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
833 {
834   ll_add(head,vaddr,addr);
835   (*head)->reg_sv_flags=reg_sv_flags;
836 }
837
838 // Check if an address is already compiled
839 // but don't return addresses which are about to expire from the cache
840 void *check_addr(u_int vaddr)
841 {
842   struct ht_entry *ht_bin = hash_table_get(vaddr);
843   size_t i;
844   for (i = 0; i < ARRAY_SIZE(ht_bin->vaddr); i++) {
845     if (ht_bin->vaddr[i] == vaddr)
846       if (doesnt_expire_soon((u_char *)ht_bin->tcaddr[i] - MAX_OUTPUT_BLOCK_SIZE))
847         if (isclean(ht_bin->tcaddr[i]))
848           return ht_bin->tcaddr[i];
849   }
850   u_int page=get_page(vaddr);
851   struct ll_entry *head;
852   head=jump_in[page];
853   while (head != NULL) {
854     if (head->vaddr == vaddr) {
855       if (doesnt_expire_soon(head->addr)) {
856         // Update existing entry with current address
857         if (ht_bin->vaddr[0] == vaddr) {
858           ht_bin->tcaddr[0] = head->addr;
859           return head->addr;
860         }
861         if (ht_bin->vaddr[1] == vaddr) {
862           ht_bin->tcaddr[1] = head->addr;
863           return head->addr;
864         }
865         // Insert into hash table with low priority.
866         // Don't evict existing entries, as they are probably
867         // addresses that are being accessed frequently.
868         if (ht_bin->vaddr[0] == -1) {
869           ht_bin->vaddr[0] = vaddr;
870           ht_bin->tcaddr[0] = head->addr;
871         }
872         else if (ht_bin->vaddr[1] == -1) {
873           ht_bin->vaddr[1] = vaddr;
874           ht_bin->tcaddr[1] = head->addr;
875         }
876         return head->addr;
877       }
878     }
879     head=head->next;
880   }
881   return 0;
882 }
883
884 void remove_hash(int vaddr)
885 {
886   //printf("remove hash: %x\n",vaddr);
887   struct ht_entry *ht_bin = hash_table_get(vaddr);
888   if (ht_bin->vaddr[1] == vaddr) {
889     ht_bin->vaddr[1] = -1;
890     ht_bin->tcaddr[1] = NULL;
891   }
892   if (ht_bin->vaddr[0] == vaddr) {
893     ht_bin->vaddr[0] = ht_bin->vaddr[1];
894     ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
895     ht_bin->vaddr[1] = -1;
896     ht_bin->tcaddr[1] = NULL;
897   }
898 }
899
900 void ll_remove_matching_addrs(struct ll_entry **head,uintptr_t addr,int shift)
901 {
902   struct ll_entry *next;
903   while(*head) {
904     if(((uintptr_t)((*head)->addr)>>shift)==(addr>>shift) ||
905        ((uintptr_t)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
906     {
907       inv_debug("EXP: Remove pointer to %p (%x)\n",(*head)->addr,(*head)->vaddr);
908       remove_hash((*head)->vaddr);
909       next=(*head)->next;
910       free(*head);
911       *head=next;
912     }
913     else
914     {
915       head=&((*head)->next);
916     }
917   }
918 }
919
920 // Remove all entries from linked list
921 void ll_clear(struct ll_entry **head)
922 {
923   struct ll_entry *cur;
924   struct ll_entry *next;
925   if((cur=*head)) {
926     *head=0;
927     while(cur) {
928       next=cur->next;
929       free(cur);
930       cur=next;
931     }
932   }
933 }
934
935 // Dereference the pointers and remove if it matches
936 static void ll_kill_pointers(struct ll_entry *head,uintptr_t addr,int shift)
937 {
938   while(head) {
939     uintptr_t ptr = (uintptr_t)get_pointer(head->addr);
940     inv_debug("EXP: Lookup pointer to %lx at %p (%x)\n",(long)ptr,head->addr,head->vaddr);
941     if(((ptr>>shift)==(addr>>shift)) ||
942        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
943     {
944       inv_debug("EXP: Kill pointer at %p (%x)\n",head->addr,head->vaddr);
945       void *host_addr=find_extjump_insn(head->addr);
946       #ifdef __arm__
947         mark_clear_cache(host_addr);
948       #endif
949       set_jump_target(host_addr, head->addr);
950     }
951     head=head->next;
952   }
953 }
954
955 // This is called when we write to a compiled block (see do_invstub)
956 void invalidate_page(u_int page)
957 {
958   struct ll_entry *head;
959   struct ll_entry *next;
960   head=jump_in[page];
961   jump_in[page]=0;
962   while(head!=NULL) {
963     inv_debug("INVALIDATE: %x\n",head->vaddr);
964     remove_hash(head->vaddr);
965     next=head->next;
966     free(head);
967     head=next;
968   }
969   head=jump_out[page];
970   jump_out[page]=0;
971   while(head!=NULL) {
972     inv_debug("INVALIDATE: kill pointer to %x (%p)\n",head->vaddr,head->addr);
973     void *host_addr=find_extjump_insn(head->addr);
974     #ifdef __arm__
975       mark_clear_cache(host_addr);
976     #endif
977     set_jump_target(host_addr, head->addr);
978     next=head->next;
979     free(head);
980     head=next;
981   }
982 }
983
984 static void invalidate_block_range(u_int block, u_int first, u_int last)
985 {
986   u_int page=get_page(block<<12);
987   //printf("first=%d last=%d\n",first,last);
988   invalidate_page(page);
989   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
990   assert(last<page+5);
991   // Invalidate the adjacent pages if a block crosses a 4K boundary
992   while(first<page) {
993     invalidate_page(first);
994     first++;
995   }
996   for(first=page+1;first<last;first++) {
997     invalidate_page(first);
998   }
999   #ifdef __arm__
1000     do_clear_cache();
1001   #endif
1002
1003   // Don't trap writes
1004   invalid_code[block]=1;
1005
1006   #ifdef USE_MINI_HT
1007   memset(mini_ht,-1,sizeof(mini_ht));
1008   #endif
1009 }
1010
1011 void invalidate_block(u_int block)
1012 {
1013   u_int page=get_page(block<<12);
1014   u_int vpage=get_vpage(block<<12);
1015   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1016   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1017   u_int first,last;
1018   first=last=page;
1019   struct ll_entry *head;
1020   head=jump_dirty[vpage];
1021   //printf("page=%d vpage=%d\n",page,vpage);
1022   while(head!=NULL) {
1023     u_int start,end;
1024     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1025       get_bounds(head->addr,&start,&end);
1026       //printf("start: %x end: %x\n",start,end);
1027       if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE) {
1028         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1029           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1030           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1031         }
1032       }
1033     }
1034     head=head->next;
1035   }
1036   invalidate_block_range(block,first,last);
1037 }
1038
1039 void invalidate_addr(u_int addr)
1040 {
1041   //static int rhits;
1042   // this check is done by the caller
1043   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
1044   u_int page=get_vpage(addr);
1045   if(page<2048) { // RAM
1046     struct ll_entry *head;
1047     u_int addr_min=~0, addr_max=0;
1048     u_int mask=RAM_SIZE-1;
1049     u_int addr_main=0x80000000|(addr&mask);
1050     int pg1;
1051     inv_code_start=addr_main&~0xfff;
1052     inv_code_end=addr_main|0xfff;
1053     pg1=page;
1054     if (pg1>0) {
1055       // must check previous page too because of spans..
1056       pg1--;
1057       inv_code_start-=0x1000;
1058     }
1059     for(;pg1<=page;pg1++) {
1060       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
1061         u_int start,end;
1062         get_bounds(head->addr,&start,&end);
1063         if(ram_offset) {
1064           start-=ram_offset;
1065           end-=ram_offset;
1066         }
1067         if(start<=addr_main&&addr_main<end) {
1068           if(start<addr_min) addr_min=start;
1069           if(end>addr_max) addr_max=end;
1070         }
1071         else if(addr_main<start) {
1072           if(start<inv_code_end)
1073             inv_code_end=start-1;
1074         }
1075         else {
1076           if(end>inv_code_start)
1077             inv_code_start=end;
1078         }
1079       }
1080     }
1081     if (addr_min!=~0) {
1082       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
1083       inv_code_start=inv_code_end=~0;
1084       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
1085       return;
1086     }
1087     else {
1088       inv_code_start=(addr&~mask)|(inv_code_start&mask);
1089       inv_code_end=(addr&~mask)|(inv_code_end&mask);
1090       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
1091       return;
1092     }
1093   }
1094   invalidate_block(addr>>12);
1095 }
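/* Write-invalidation flow in short (sketch): a store into a page that holds
 * compiled code (invalid_code[page]==0) ends up in invalidate_addr(), which
 * narrows the write down via the jump_dirty block bounds and calls
 * invalidate_block_range().  That drops the page's jump_in/jump_out entries,
 * unlinks the direct jumps recorded in jump_out so they fall back to the
 * dynamic linker, and sets invalid_code[] so further writes to the page are
 * no longer trapped.  If the code later proves unmodified, clean_blocks()
 * can move it back to the clean list via the restore_candidate bitmap. */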
1096
1097 // This is called when loading a save state.
1098 // Anything could have changed, so invalidate everything.
1099 void invalidate_all_pages()
1100 {
1101   u_int page;
1102   for(page=0;page<4096;page++)
1103     invalidate_page(page);
1104   for(page=0;page<1048576;page++)
1105     if(!invalid_code[page]) {
1106       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1107       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1108     }
1109   #ifdef USE_MINI_HT
1110   memset(mini_ht,-1,sizeof(mini_ht));
1111   #endif
1112 }
1113
1114 // Add an entry to jump_out after making a link
1115 void add_link(u_int vaddr,void *src)
1116 {
1117   u_int page=get_page(vaddr);
1118   inv_debug("add_link: %p -> %x (%d)\n",src,vaddr,page);
1119   int *ptr=(int *)(src+4);
1120   assert((*ptr&0x0fff0000)==0x059f0000);
1121   (void)ptr;
1122   ll_add(jump_out+page,vaddr,src);
1123   //void *ptr=get_pointer(src);
1124   //inv_debug("add_link: Pointer is to %p\n",ptr);
1125 }
1126
1127 // If a code block was found to be unmodified (bit was set in
1128 // restore_candidate) and it remains unmodified (bit is clear
1129 // in invalid_code) then move the entries for that 4K page from
1130 // the dirty list to the clean list.
1131 void clean_blocks(u_int page)
1132 {
1133   struct ll_entry *head;
1134   inv_debug("INV: clean_blocks page=%d\n",page);
1135   head=jump_dirty[page];
1136   while(head!=NULL) {
1137     if(!invalid_code[head->vaddr>>12]) {
1138       // Don't restore blocks which are about to expire from the cache
1139       if (doesnt_expire_soon(head->addr)) {
1140         u_int start,end;
1141         if(verify_dirty(head->addr)) {
1142           //printf("Possibly Restore %x (%p)\n",head->vaddr, head->addr);
1143           u_int i;
1144           u_int inv=0;
1145           get_bounds(head->addr,&start,&end);
1146           if(start-(u_int)rdram<RAM_SIZE) {
1147             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1148               inv|=invalid_code[i];
1149             }
1150           }
1151           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1152             inv=1;
1153           }
1154           if(!inv) {
1155             void *clean_addr = get_clean_addr(head->addr);
1156             if (doesnt_expire_soon(clean_addr)) {
1157               u_int ppage=page;
1158               inv_debug("INV: Restored %x (%p/%p)\n",head->vaddr, head->addr, clean_addr);
1159               //printf("page=%x, addr=%x\n",page,head->vaddr);
1160               //assert(head->vaddr>>12==(page|0x80000));
1161               ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
1162               struct ht_entry *ht_bin = hash_table_get(head->vaddr);
1163               if (ht_bin->vaddr[0] == head->vaddr)
1164                 ht_bin->tcaddr[0] = clean_addr; // Replace existing entry
1165               if (ht_bin->vaddr[1] == head->vaddr)
1166                 ht_bin->tcaddr[1] = clean_addr; // Replace existing entry
1167             }
1168           }
1169         }
1170       }
1171     }
1172     head=head->next;
1173   }
1174 }
1175
1176
1177 void mov_alloc(struct regstat *current,int i)
1178 {
1179   // Note: Don't need to actually alloc the source registers
1180   if((~current->is32>>rs1[i])&1) {
1181     //alloc_reg64(current,i,rs1[i]);
1182     alloc_reg64(current,i,rt1[i]);
1183     current->is32&=~(1LL<<rt1[i]);
1184   } else {
1185     //alloc_reg(current,i,rs1[i]);
1186     alloc_reg(current,i,rt1[i]);
1187     current->is32|=(1LL<<rt1[i]);
1188   }
1189   clear_const(current,rs1[i]);
1190   clear_const(current,rt1[i]);
1191   dirty_reg(current,rt1[i]);
1192 }
1193
1194 void shiftimm_alloc(struct regstat *current,int i)
1195 {
1196   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1197   {
1198     if(rt1[i]) {
1199       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1200       else lt1[i]=rs1[i];
1201       alloc_reg(current,i,rt1[i]);
1202       current->is32|=1LL<<rt1[i];
1203       dirty_reg(current,rt1[i]);
1204       if(is_const(current,rs1[i])) {
1205         int v=get_const(current,rs1[i]);
1206         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1207         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1208         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1209       }
1210       else clear_const(current,rt1[i]);
1211     }
1212   }
1213   else
1214   {
1215     clear_const(current,rs1[i]);
1216     clear_const(current,rt1[i]);
1217   }
1218
1219   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1220   {
1221     assert(0);
1222   }
1223   if(opcode2[i]==0x3c) // DSLL32
1224   {
1225     assert(0);
1226   }
1227   if(opcode2[i]==0x3e) // DSRL32
1228   {
1229     assert(0);
1230   }
1231   if(opcode2[i]==0x3f) // DSRA32
1232   {
1233     assert(0);
1234   }
1235 }
1236
1237 void shift_alloc(struct regstat *current,int i)
1238 {
1239   if(rt1[i]) {
1240     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1241     {
1242       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1243       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1244       alloc_reg(current,i,rt1[i]);
1245       if(rt1[i]==rs2[i]) {
1246         alloc_reg_temp(current,i,-1);
1247         minimum_free_regs[i]=1;
1248       }
1249       current->is32|=1LL<<rt1[i];
1250     } else { // DSLLV/DSRLV/DSRAV
1251       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1252       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1253       alloc_reg64(current,i,rt1[i]);
1254       current->is32&=~(1LL<<rt1[i]);
1255       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1256       {
1257         alloc_reg_temp(current,i,-1);
1258         minimum_free_regs[i]=1;
1259       }
1260     }
1261     clear_const(current,rs1[i]);
1262     clear_const(current,rs2[i]);
1263     clear_const(current,rt1[i]);
1264     dirty_reg(current,rt1[i]);
1265   }
1266 }
1267
1268 void alu_alloc(struct regstat *current,int i)
1269 {
1270   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1271     if(rt1[i]) {
1272       if(rs1[i]&&rs2[i]) {
1273         alloc_reg(current,i,rs1[i]);
1274         alloc_reg(current,i,rs2[i]);
1275       }
1276       else {
1277         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1278         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1279       }
1280       alloc_reg(current,i,rt1[i]);
1281     }
1282     current->is32|=1LL<<rt1[i];
1283   }
1284   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1285     if(rt1[i]) {
1286       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1287       {
1288         alloc_reg64(current,i,rs1[i]);
1289         alloc_reg64(current,i,rs2[i]);
1290         alloc_reg(current,i,rt1[i]);
1291       } else {
1292         alloc_reg(current,i,rs1[i]);
1293         alloc_reg(current,i,rs2[i]);
1294         alloc_reg(current,i,rt1[i]);
1295       }
1296     }
1297     current->is32|=1LL<<rt1[i];
1298   }
1299   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1300     if(rt1[i]) {
1301       if(rs1[i]&&rs2[i]) {
1302         alloc_reg(current,i,rs1[i]);
1303         alloc_reg(current,i,rs2[i]);
1304       }
1305       else
1306       {
1307         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1308         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1309       }
1310       alloc_reg(current,i,rt1[i]);
1311       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1312       {
1313         if(!((current->uu>>rt1[i])&1)) {
1314           alloc_reg64(current,i,rt1[i]);
1315         }
1316         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1317           if(rs1[i]&&rs2[i]) {
1318             alloc_reg64(current,i,rs1[i]);
1319             alloc_reg64(current,i,rs2[i]);
1320           }
1321           else
1322           {
1323             // Is it really worth it to keep 64-bit values in registers?
1324             #ifdef NATIVE_64BIT
1325             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1326             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1327             #endif
1328           }
1329         }
1330         current->is32&=~(1LL<<rt1[i]);
1331       } else {
1332         current->is32|=1LL<<rt1[i];
1333       }
1334     }
1335   }
1336   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1337     if(rt1[i]) {
1338       if(rs1[i]&&rs2[i]) {
1339         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1340           alloc_reg64(current,i,rs1[i]);
1341           alloc_reg64(current,i,rs2[i]);
1342           alloc_reg64(current,i,rt1[i]);
1343         } else {
1344           alloc_reg(current,i,rs1[i]);
1345           alloc_reg(current,i,rs2[i]);
1346           alloc_reg(current,i,rt1[i]);
1347         }
1348       }
1349       else {
1350         alloc_reg(current,i,rt1[i]);
1351         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1352           // DADD used as move, or zeroing
1353           // If we have a 64-bit source, then make the target 64 bits too
1354           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1355             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1356             alloc_reg64(current,i,rt1[i]);
1357           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1358             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1359             alloc_reg64(current,i,rt1[i]);
1360           }
1361           if(opcode2[i]>=0x2e&&rs2[i]) {
1362             // DSUB used as negation - 64-bit result
1363             // If we have a 32-bit register, extend it to 64 bits
1364             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1365             alloc_reg64(current,i,rt1[i]);
1366           }
1367         }
1368       }
1369       if(rs1[i]&&rs2[i]) {
1370         current->is32&=~(1LL<<rt1[i]);
1371       } else if(rs1[i]) {
1372         current->is32&=~(1LL<<rt1[i]);
1373         if((current->is32>>rs1[i])&1)
1374           current->is32|=1LL<<rt1[i];
1375       } else if(rs2[i]) {
1376         current->is32&=~(1LL<<rt1[i]);
1377         if((current->is32>>rs2[i])&1)
1378           current->is32|=1LL<<rt1[i];
1379       } else {
1380         current->is32|=1LL<<rt1[i];
1381       }
1382     }
1383   }
1384   clear_const(current,rs1[i]);
1385   clear_const(current,rs2[i]);
1386   clear_const(current,rt1[i]);
1387   dirty_reg(current,rt1[i]);
1388 }
1389
1390 void imm16_alloc(struct regstat *current,int i)
1391 {
1392   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1393   else lt1[i]=rs1[i];
1394   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1395   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1396     current->is32&=~(1LL<<rt1[i]);
1397     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1398       // TODO: Could preserve the 32-bit flag if the immediate is zero
1399       alloc_reg64(current,i,rt1[i]);
1400       alloc_reg64(current,i,rs1[i]);
1401     }
1402     clear_const(current,rs1[i]);
1403     clear_const(current,rt1[i]);
1404   }
1405   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1406     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1407     current->is32|=1LL<<rt1[i];
1408     clear_const(current,rs1[i]);
1409     clear_const(current,rt1[i]);
1410   }
1411   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1412     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1413       if(rs1[i]!=rt1[i]) {
1414         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1415         alloc_reg64(current,i,rt1[i]);
1416         current->is32&=~(1LL<<rt1[i]);
1417       }
1418     }
1419     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1420     if(is_const(current,rs1[i])) {
1421       int v=get_const(current,rs1[i]);
1422       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1423       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1424       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1425     }
1426     else clear_const(current,rt1[i]);
1427   }
1428   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1429     if(is_const(current,rs1[i])) {
1430       int v=get_const(current,rs1[i]);
1431       set_const(current,rt1[i],v+imm[i]);
1432     }
1433     else clear_const(current,rt1[i]);
1434     current->is32|=1LL<<rt1[i];
1435   }
1436   else {
1437     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1438     current->is32|=1LL<<rt1[i];
1439   }
1440   dirty_reg(current,rt1[i]);
1441 }
1442
1443 void load_alloc(struct regstat *current,int i)
1444 {
1445   clear_const(current,rt1[i]);
1446   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1447   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1448   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1449   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1450     alloc_reg(current,i,rt1[i]);
1451     assert(get_reg(current->regmap,rt1[i])>=0);
1452     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1453     {
1454       current->is32&=~(1LL<<rt1[i]);
1455       alloc_reg64(current,i,rt1[i]);
1456     }
1457     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1458     {
1459       current->is32&=~(1LL<<rt1[i]);
1460       alloc_reg64(current,i,rt1[i]);
1461       alloc_all(current,i);
1462       alloc_reg64(current,i,FTEMP);
1463       minimum_free_regs[i]=HOST_REGS;
1464     }
1465     else current->is32|=1LL<<rt1[i];
1466     dirty_reg(current,rt1[i]);
1467     // LWL/LWR need a temporary register for the old value
1468     if(opcode[i]==0x22||opcode[i]==0x26)
1469     {
1470       alloc_reg(current,i,FTEMP);
1471       alloc_reg_temp(current,i,-1);
1472       minimum_free_regs[i]=1;
1473     }
1474   }
1475   else
1476   {
1477     // Load to r0 or unneeded register (dummy load)
1478     // but we still need a register to calculate the address
1479     if(opcode[i]==0x22||opcode[i]==0x26)
1480     {
1481       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1482     }
1483     alloc_reg_temp(current,i,-1);
1484     minimum_free_regs[i]=1;
1485     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1486     {
1487       alloc_all(current,i);
1488       alloc_reg64(current,i,FTEMP);
1489       minimum_free_regs[i]=HOST_REGS;
1490     }
1491   }
1492 }
1493
1494 void store_alloc(struct regstat *current,int i)
1495 {
1496   clear_const(current,rs2[i]);
1497   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1498   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1499   alloc_reg(current,i,rs2[i]);
1500   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1501     alloc_reg64(current,i,rs2[i]);
1502     if(rs2[i]) alloc_reg(current,i,FTEMP);
1503   }
1504   #if defined(HOST_IMM8)
1505   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1506   else alloc_reg(current,i,INVCP);
1507   #endif
1508   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1509     alloc_reg(current,i,FTEMP);
1510   }
1511   // We need a temporary register for address generation
1512   alloc_reg_temp(current,i,-1);
1513   minimum_free_regs[i]=1;
1514 }
1515
1516 void c1ls_alloc(struct regstat *current,int i)
1517 {
1518   //clear_const(current,rs1[i]); // FIXME
1519   clear_const(current,rt1[i]);
1520   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1521   alloc_reg(current,i,CSREG); // Status
1522   alloc_reg(current,i,FTEMP);
1523   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1524     alloc_reg64(current,i,FTEMP);
1525   }
1526   #if defined(HOST_IMM8)
1527   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1528   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1529     alloc_reg(current,i,INVCP);
1530   #endif
1531   // We need a temporary register for address generation
1532   alloc_reg_temp(current,i,-1);
1533 }
1534
1535 void c2ls_alloc(struct regstat *current,int i)
1536 {
1537   clear_const(current,rt1[i]);
1538   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1539   alloc_reg(current,i,FTEMP);
1540   #if defined(HOST_IMM8)
1541   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1542   if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1543     alloc_reg(current,i,INVCP);
1544   #endif
1545   // We need a temporary register for address generation
1546   alloc_reg_temp(current,i,-1);
1547   minimum_free_regs[i]=1;
1548 }
1549
1550 #ifndef multdiv_alloc
1551 void multdiv_alloc(struct regstat *current,int i)
1552 {
1553   //  case 0x18: MULT
1554   //  case 0x19: MULTU
1555   //  case 0x1A: DIV
1556   //  case 0x1B: DIVU
1557   //  case 0x1C: DMULT
1558   //  case 0x1D: DMULTU
1559   //  case 0x1E: DDIV
1560   //  case 0x1F: DDIVU
1561   clear_const(current,rs1[i]);
1562   clear_const(current,rs2[i]);
1563   if(rs1[i]&&rs2[i])
1564   {
1565     if((opcode2[i]&4)==0) // 32-bit
1566     {
1567       current->u&=~(1LL<<HIREG);
1568       current->u&=~(1LL<<LOREG);
1569       alloc_reg(current,i,HIREG);
1570       alloc_reg(current,i,LOREG);
1571       alloc_reg(current,i,rs1[i]);
1572       alloc_reg(current,i,rs2[i]);
1573       current->is32|=1LL<<HIREG;
1574       current->is32|=1LL<<LOREG;
1575       dirty_reg(current,HIREG);
1576       dirty_reg(current,LOREG);
1577     }
1578     else // 64-bit
1579     {
1580       current->u&=~(1LL<<HIREG);
1581       current->u&=~(1LL<<LOREG);
1582       current->uu&=~(1LL<<HIREG);
1583       current->uu&=~(1LL<<LOREG);
1584       alloc_reg64(current,i,HIREG);
1585       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1586       alloc_reg64(current,i,rs1[i]);
1587       alloc_reg64(current,i,rs2[i]);
1588       alloc_all(current,i);
1589       current->is32&=~(1LL<<HIREG);
1590       current->is32&=~(1LL<<LOREG);
1591       dirty_reg(current,HIREG);
1592       dirty_reg(current,LOREG);
1593       minimum_free_regs[i]=HOST_REGS;
1594     }
1595   }
1596   else
1597   {
1598     // Multiply by zero is zero.
1599     // MIPS does not have a divide by zero exception.
1600     // The result is undefined, so we return zero.
1601     alloc_reg(current,i,HIREG);
1602     alloc_reg(current,i,LOREG);
1603     current->is32|=1LL<<HIREG;
1604     current->is32|=1LL<<LOREG;
1605     dirty_reg(current,HIREG);
1606     dirty_reg(current,LOREG);
1607   }
1608 }
1609 #endif
1610
1611 void cop0_alloc(struct regstat *current,int i)
1612 {
1613   if(opcode2[i]==0) // MFC0
1614   {
1615     if(rt1[i]) {
1616       clear_const(current,rt1[i]);
1617       alloc_all(current,i);
1618       alloc_reg(current,i,rt1[i]);
1619       current->is32|=1LL<<rt1[i];
1620       dirty_reg(current,rt1[i]);
1621     }
1622   }
1623   else if(opcode2[i]==4) // MTC0
1624   {
1625     if(rs1[i]){
1626       clear_const(current,rs1[i]);
1627       alloc_reg(current,i,rs1[i]);
1628       alloc_all(current,i);
1629     }
1630     else {
1631       alloc_all(current,i); // FIXME: Keep r0
1632       current->u&=~1LL;
1633       alloc_reg(current,i,0);
1634     }
1635   }
1636   else
1637   {
1638     // TLBR/TLBWI/TLBWR/TLBP/ERET
1639     assert(opcode2[i]==0x10);
1640     alloc_all(current,i);
1641   }
1642   minimum_free_regs[i]=HOST_REGS;
1643 }
1644
1645 void cop1_alloc(struct regstat *current,int i)
1646 {
1647   alloc_reg(current,i,CSREG); // Load status
1648   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1649   {
1650     if(rt1[i]){
1651       clear_const(current,rt1[i]);
1652       if(opcode2[i]==1) {
1653         alloc_reg64(current,i,rt1[i]); // DMFC1
1654         current->is32&=~(1LL<<rt1[i]);
1655       }else{
1656         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1657         current->is32|=1LL<<rt1[i];
1658       }
1659       dirty_reg(current,rt1[i]);
1660     }
1661     alloc_reg_temp(current,i,-1);
1662   }
1663   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1664   {
1665     if(rs1[i]){
1666       clear_const(current,rs1[i]);
1667       if(opcode2[i]==5)
1668         alloc_reg64(current,i,rs1[i]); // DMTC1
1669       else
1670         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1671       alloc_reg_temp(current,i,-1);
1672     }
1673     else {
1674       current->u&=~1LL;
1675       alloc_reg(current,i,0);
1676       alloc_reg_temp(current,i,-1);
1677     }
1678   }
1679   minimum_free_regs[i]=1;
1680 }
1681 void fconv_alloc(struct regstat *current,int i)
1682 {
1683   alloc_reg(current,i,CSREG); // Load status
1684   alloc_reg_temp(current,i,-1);
1685   minimum_free_regs[i]=1;
1686 }
1687 void float_alloc(struct regstat *current,int i)
1688 {
1689   alloc_reg(current,i,CSREG); // Load status
1690   alloc_reg_temp(current,i,-1);
1691   minimum_free_regs[i]=1;
1692 }
1693 void c2op_alloc(struct regstat *current,int i)
1694 {
1695   alloc_reg_temp(current,i,-1);
1696 }
1697 void fcomp_alloc(struct regstat *current,int i)
1698 {
1699   alloc_reg(current,i,CSREG); // Load status
1700   alloc_reg(current,i,FSREG); // Load flags
1701   dirty_reg(current,FSREG); // Flag will be modified
1702   alloc_reg_temp(current,i,-1);
1703   minimum_free_regs[i]=1;
1704 }
1705
1706 void syscall_alloc(struct regstat *current,int i)
1707 {
1708   alloc_cc(current,i);
1709   dirty_reg(current,CCREG);
1710   alloc_all(current,i);
1711   minimum_free_regs[i]=HOST_REGS;
1712   current->isconst=0;
1713 }
1714
1715 void delayslot_alloc(struct regstat *current,int i)
1716 {
1717   switch(itype[i]) {
1718     case UJUMP:
1719     case CJUMP:
1720     case SJUMP:
1721     case RJUMP:
1722     case FJUMP:
1723     case SYSCALL:
1724     case HLECALL:
1725     case SPAN:
1726       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1727       SysPrintf("Disabled speculative precompilation\n");
1728       stop_after_jal=1;
1729       break;
1730     case IMM16:
1731       imm16_alloc(current,i);
1732       break;
1733     case LOAD:
1734     case LOADLR:
1735       load_alloc(current,i);
1736       break;
1737     case STORE:
1738     case STORELR:
1739       store_alloc(current,i);
1740       break;
1741     case ALU:
1742       alu_alloc(current,i);
1743       break;
1744     case SHIFT:
1745       shift_alloc(current,i);
1746       break;
1747     case MULTDIV:
1748       multdiv_alloc(current,i);
1749       break;
1750     case SHIFTIMM:
1751       shiftimm_alloc(current,i);
1752       break;
1753     case MOV:
1754       mov_alloc(current,i);
1755       break;
1756     case COP0:
1757       cop0_alloc(current,i);
1758       break;
1759     case COP1:
1760     case COP2:
1761       cop1_alloc(current,i);
1762       break;
1763     case C1LS:
1764       c1ls_alloc(current,i);
1765       break;
1766     case C2LS:
1767       c2ls_alloc(current,i);
1768       break;
1769     case FCONV:
1770       fconv_alloc(current,i);
1771       break;
1772     case FLOAT:
1773       float_alloc(current,i);
1774       break;
1775     case FCOMP:
1776       fcomp_alloc(current,i);
1777       break;
1778     case C2OP:
1779       c2op_alloc(current,i);
1780       break;
1781   }
1782 }
1783
1784 // Special case where a branch and delay slot span two pages in virtual memory
1785 static void pagespan_alloc(struct regstat *current,int i)
1786 {
1787   current->isconst=0;
1788   current->wasconst=0;
1789   regs[i].wasconst=0;
1790   minimum_free_regs[i]=HOST_REGS;
1791   alloc_all(current,i);
1792   alloc_cc(current,i);
1793   dirty_reg(current,CCREG);
1794   if(opcode[i]==3) // JAL
1795   {
1796     alloc_reg(current,i,31);
1797     dirty_reg(current,31);
1798   }
1799   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1800   {
1801     alloc_reg(current,i,rs1[i]);
1802     if (rt1[i]!=0) {
1803       alloc_reg(current,i,rt1[i]);
1804       dirty_reg(current,rt1[i]);
1805     }
1806   }
1807   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1808   {
1809     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1810     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1811     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1812     {
1813       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1814       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1815     }
1816   }
1817   else
1818   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1819   {
1820     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1821     if(!((current->is32>>rs1[i])&1))
1822     {
1823       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1824     }
1825   }
1826   else
1827   if(opcode[i]==0x11) // BC1
1828   {
1829     alloc_reg(current,i,FSREG);
1830     alloc_reg(current,i,CSREG);
1831   }
1832   //else ...
1833 }
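/*
 * Editorial note: this is the case of a branch whose delay slot lies on the
 * following virtual page (e.g. a branch at 0x...ffc with its slot at
 * 0x...000; addresses illustrative).  Since the two instructions belong to
 * different pages, allocation is fully conservative: constants are discarded
 * (isconst/wasconst cleared), alloc_all() keeps every guest register
 * spillable, and the cycle counter is made dirty.
 */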
1834
1835 static void add_stub(enum stub_type type, void *addr, void *retaddr,
1836   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e)
1837 {
1838   assert(a < ARRAY_SIZE(stubs));
1839   stubs[stubcount].type = type;
1840   stubs[stubcount].addr = addr;
1841   stubs[stubcount].retaddr = retaddr;
1842   stubs[stubcount].a = a;
1843   stubs[stubcount].b = b;
1844   stubs[stubcount].c = c;
1845   stubs[stubcount].d = d;
1846   stubs[stubcount].e = e;
1847   stubcount++;
1848 }
1849
1850 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
1851   int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist)
1852 {
1853   add_stub(type, addr, retaddr, i, addr_reg, (uintptr_t)i_regs, ccadj, reglist);
1854 }
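/*
 * Editorial note: stubs are out-of-line slow paths emitted after the main
 * body of the block; the jump at 'addr' is later patched to reach them and
 * they return to 'retaddr'.  The a..e fields are opaque and interpreted per
 * stub type; add_stub_r() packs the common memory-access layout used by the
 * assemblers below, e.g. (taken from load_assemble()):
 *
 *   add_stub_r(LOADB_STUB, jaddr, out, i, addr, i_regs, ccadj[i], reglist);
 *
 * i.e. instruction index, address register, register state, cycle
 * adjustment, and the set of live host registers to preserve.
 */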
1855
1856 // Write out a single register
1857 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1858 {
1859   int hr;
1860   for(hr=0;hr<HOST_REGS;hr++) {
1861     if(hr!=EXCLUDE_REG) {
1862       if((regmap[hr]&63)==r) {
1863         if((dirty>>hr)&1) {
1864           if(regmap[hr]<64) {
1865             emit_storereg(r,hr);
1866           }else{
1867             emit_storereg(r|64,hr);
1868           }
1869         }
1870       }
1871     }
1872   }
1873 }
1874
1875 void rlist()
1876 {
1877   int i;
1878   printf("TRACE: ");
1879   for(i=0;i<32;i++)
1880     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1881   printf("\n");
1882 }
1883
1884 void alu_assemble(int i,struct regstat *i_regs)
1885 {
1886   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1887     if(rt1[i]) {
1888       signed char s1,s2,t;
1889       t=get_reg(i_regs->regmap,rt1[i]);
1890       if(t>=0) {
1891         s1=get_reg(i_regs->regmap,rs1[i]);
1892         s2=get_reg(i_regs->regmap,rs2[i]);
1893         if(rs1[i]&&rs2[i]) {
1894           assert(s1>=0);
1895           assert(s2>=0);
1896           if(opcode2[i]&2) emit_sub(s1,s2,t);
1897           else emit_add(s1,s2,t);
1898         }
1899         else if(rs1[i]) {
1900           if(s1>=0) emit_mov(s1,t);
1901           else emit_loadreg(rs1[i],t);
1902         }
1903         else if(rs2[i]) {
1904           if(s2>=0) {
1905             if(opcode2[i]&2) emit_neg(s2,t);
1906             else emit_mov(s2,t);
1907           }
1908           else {
1909             emit_loadreg(rs2[i],t);
1910             if(opcode2[i]&2) emit_neg(t,t);
1911           }
1912         }
1913         else emit_zeroreg(t);
1914       }
1915     }
1916   }
1917   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1918     if(rt1[i]) {
1919       signed char s1l,s2l,s1h,s2h,tl,th;
1920       tl=get_reg(i_regs->regmap,rt1[i]);
1921       th=get_reg(i_regs->regmap,rt1[i]|64);
1922       if(tl>=0) {
1923         s1l=get_reg(i_regs->regmap,rs1[i]);
1924         s2l=get_reg(i_regs->regmap,rs2[i]);
1925         s1h=get_reg(i_regs->regmap,rs1[i]|64);
1926         s2h=get_reg(i_regs->regmap,rs2[i]|64);
1927         if(rs1[i]&&rs2[i]) {
1928           assert(s1l>=0);
1929           assert(s2l>=0);
1930           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
1931           else emit_adds(s1l,s2l,tl);
1932           if(th>=0) {
1933             #ifdef INVERTED_CARRY
1934             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
1935             #else
1936             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
1937             #endif
1938             else emit_add(s1h,s2h,th);
1939           }
1940         }
1941         else if(rs1[i]) {
1942           if(s1l>=0) emit_mov(s1l,tl);
1943           else emit_loadreg(rs1[i],tl);
1944           if(th>=0) {
1945             if(s1h>=0) emit_mov(s1h,th);
1946             else emit_loadreg(rs1[i]|64,th);
1947           }
1948         }
1949         else if(rs2[i]) {
1950           if(s2l>=0) {
1951             if(opcode2[i]&2) emit_negs(s2l,tl);
1952             else emit_mov(s2l,tl);
1953           }
1954           else {
1955             emit_loadreg(rs2[i],tl);
1956             if(opcode2[i]&2) emit_negs(tl,tl);
1957           }
1958           if(th>=0) {
1959             #ifdef INVERTED_CARRY
1960             if(s2h>=0) emit_mov(s2h,th);
1961             else emit_loadreg(rs2[i]|64,th);
1962             if(opcode2[i]&2) {
1963               emit_adcimm(-1,th); // x86 has inverted carry flag
1964               emit_not(th,th);
1965             }
1966             #else
1967             if(opcode2[i]&2) {
1968               if(s2h>=0) emit_rscimm(s2h,0,th);
1969               else {
1970                 emit_loadreg(rs2[i]|64,th);
1971                 emit_rscimm(th,0,th);
1972               }
1973             }else{
1974               if(s2h>=0) emit_mov(s2h,th);
1975               else emit_loadreg(rs2[i]|64,th);
1976             }
1977             #endif
1978           }
1979         }
1980         else {
1981           emit_zeroreg(tl);
1982           if(th>=0) emit_zeroreg(th);
1983         }
1984       }
1985     }
1986   }
1987   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1988     if(rt1[i]) {
1989       signed char s1l,s1h,s2l,s2h,t;
1990       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
1991       {
1992         t=get_reg(i_regs->regmap,rt1[i]);
1993         //assert(t>=0);
1994         if(t>=0) {
1995           s1l=get_reg(i_regs->regmap,rs1[i]);
1996           s1h=get_reg(i_regs->regmap,rs1[i]|64);
1997           s2l=get_reg(i_regs->regmap,rs2[i]);
1998           s2h=get_reg(i_regs->regmap,rs2[i]|64);
1999           if(rs2[i]==0) // rx<r0
2000           {
2001             assert(s1h>=0);
2002             if(opcode2[i]==0x2a) // SLT
2003               emit_shrimm(s1h,31,t);
2004             else // SLTU (unsigned cannot be less than zero)
2005               emit_zeroreg(t);
2006           }
2007           else if(rs1[i]==0) // r0<rx
2008           {
2009             assert(s2h>=0);
2010             if(opcode2[i]==0x2a) // SLT
2011               emit_set_gz64_32(s2h,s2l,t);
2012             else // SLTU (set if not zero)
2013               emit_set_nz64_32(s2h,s2l,t);
2014           }
2015           else {
2016             assert(s1l>=0);assert(s1h>=0);
2017             assert(s2l>=0);assert(s2h>=0);
2018             if(opcode2[i]==0x2a) // SLT
2019               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2020             else // SLTU
2021               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2022           }
2023         }
2024       } else {
2025         t=get_reg(i_regs->regmap,rt1[i]);
2026         //assert(t>=0);
2027         if(t>=0) {
2028           s1l=get_reg(i_regs->regmap,rs1[i]);
2029           s2l=get_reg(i_regs->regmap,rs2[i]);
2030           if(rs2[i]==0) // rx<r0
2031           {
2032             assert(s1l>=0);
2033             if(opcode2[i]==0x2a) // SLT
2034               emit_shrimm(s1l,31,t);
2035             else // SLTU (unsigned cannot be less than zero)
2036               emit_zeroreg(t);
2037           }
2038           else if(rs1[i]==0) // r0<rx
2039           {
2040             assert(s2l>=0);
2041             if(opcode2[i]==0x2a) // SLT
2042               emit_set_gz32(s2l,t);
2043             else // SLTU (set if not zero)
2044               emit_set_nz32(s2l,t);
2045           }
2046           else{
2047             assert(s1l>=0);assert(s2l>=0);
2048             if(opcode2[i]==0x2a) // SLT
2049               emit_set_if_less32(s1l,s2l,t);
2050             else // SLTU
2051               emit_set_if_carry32(s1l,s2l,t);
2052           }
2053         }
2054       }
2055     }
2056   }
2057   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2058     if(rt1[i]) {
2059       signed char s1l,s1h,s2l,s2h,th,tl;
2060       tl=get_reg(i_regs->regmap,rt1[i]);
2061       th=get_reg(i_regs->regmap,rt1[i]|64);
2062       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2063       {
2064         assert(tl>=0);
2065         if(tl>=0) {
2066           s1l=get_reg(i_regs->regmap,rs1[i]);
2067           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2068           s2l=get_reg(i_regs->regmap,rs2[i]);
2069           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2070           if(rs1[i]&&rs2[i]) {
2071             assert(s1l>=0);assert(s1h>=0);
2072             assert(s2l>=0);assert(s2h>=0);
2073             if(opcode2[i]==0x24) { // AND
2074               emit_and(s1l,s2l,tl);
2075               emit_and(s1h,s2h,th);
2076             } else
2077             if(opcode2[i]==0x25) { // OR
2078               emit_or(s1l,s2l,tl);
2079               emit_or(s1h,s2h,th);
2080             } else
2081             if(opcode2[i]==0x26) { // XOR
2082               emit_xor(s1l,s2l,tl);
2083               emit_xor(s1h,s2h,th);
2084             } else
2085             if(opcode2[i]==0x27) { // NOR
2086               emit_or(s1l,s2l,tl);
2087               emit_or(s1h,s2h,th);
2088               emit_not(tl,tl);
2089               emit_not(th,th);
2090             }
2091           }
2092           else
2093           {
2094             if(opcode2[i]==0x24) { // AND
2095               emit_zeroreg(tl);
2096               emit_zeroreg(th);
2097             } else
2098             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2099               if(rs1[i]){
2100                 if(s1l>=0) emit_mov(s1l,tl);
2101                 else emit_loadreg(rs1[i],tl);
2102                 if(s1h>=0) emit_mov(s1h,th);
2103                 else emit_loadreg(rs1[i]|64,th);
2104               }
2105               else
2106               if(rs2[i]){
2107                 if(s2l>=0) emit_mov(s2l,tl);
2108                 else emit_loadreg(rs2[i],tl);
2109                 if(s2h>=0) emit_mov(s2h,th);
2110                 else emit_loadreg(rs2[i]|64,th);
2111               }
2112               else{
2113                 emit_zeroreg(tl);
2114                 emit_zeroreg(th);
2115               }
2116             } else
2117             if(opcode2[i]==0x27) { // NOR
2118               if(rs1[i]){
2119                 if(s1l>=0) emit_not(s1l,tl);
2120                 else{
2121                   emit_loadreg(rs1[i],tl);
2122                   emit_not(tl,tl);
2123                 }
2124                 if(s1h>=0) emit_not(s1h,th);
2125                 else{
2126                   emit_loadreg(rs1[i]|64,th);
2127                   emit_not(th,th);
2128                 }
2129               }
2130               else
2131               if(rs2[i]){
2132                 if(s2l>=0) emit_not(s2l,tl);
2133                 else{
2134                   emit_loadreg(rs2[i],tl);
2135                   emit_not(tl,tl);
2136                 }
2137                 if(s2h>=0) emit_not(s2h,th);
2138                 else{
2139                   emit_loadreg(rs2[i]|64,th);
2140                   emit_not(th,th);
2141                 }
2142               }
2143               else {
2144                 emit_movimm(-1,tl);
2145                 emit_movimm(-1,th);
2146               }
2147             }
2148           }
2149         }
2150       }
2151       else
2152       {
2153         // 32 bit
2154         if(tl>=0) {
2155           s1l=get_reg(i_regs->regmap,rs1[i]);
2156           s2l=get_reg(i_regs->regmap,rs2[i]);
2157           if(rs1[i]&&rs2[i]) {
2158             assert(s1l>=0);
2159             assert(s2l>=0);
2160             if(opcode2[i]==0x24) { // AND
2161               emit_and(s1l,s2l,tl);
2162             } else
2163             if(opcode2[i]==0x25) { // OR
2164               emit_or(s1l,s2l,tl);
2165             } else
2166             if(opcode2[i]==0x26) { // XOR
2167               emit_xor(s1l,s2l,tl);
2168             } else
2169             if(opcode2[i]==0x27) { // NOR
2170               emit_or(s1l,s2l,tl);
2171               emit_not(tl,tl);
2172             }
2173           }
2174           else
2175           {
2176             if(opcode2[i]==0x24) { // AND
2177               emit_zeroreg(tl);
2178             } else
2179             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2180               if(rs1[i]){
2181                 if(s1l>=0) emit_mov(s1l,tl);
2182                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2183               }
2184               else
2185               if(rs2[i]){
2186                 if(s2l>=0) emit_mov(s2l,tl);
2187                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2188               }
2189               else emit_zeroreg(tl);
2190             } else
2191             if(opcode2[i]==0x27) { // NOR
2192               if(rs1[i]){
2193                 if(s1l>=0) emit_not(s1l,tl);
2194                 else {
2195                   emit_loadreg(rs1[i],tl);
2196                   emit_not(tl,tl);
2197                 }
2198               }
2199               else
2200               if(rs2[i]){
2201                 if(s2l>=0) emit_not(s2l,tl);
2202                 else {
2203                   emit_loadreg(rs2[i],tl);
2204                   emit_not(tl,tl);
2205                 }
2206               }
2207               else emit_movimm(-1,tl);
2208             }
2209           }
2210         }
2211       }
2212     }
2213   }
2214 }
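/*
 * Editorial example (illustrative; the exact host instructions depend on the
 * emit_* backend): because r0 is hardwired to zero, the special cases above
 * turn degenerate ALU forms into cheaper host operations, e.g.
 *
 *   addu $t2, $t0, $zero   ->  plain register move       (emit_mov)
 *   subu $t2, $zero, $t1   ->  host negate               (emit_neg)
 *   and  $t2, $t0, $zero   ->  zero the destination      (emit_zeroreg)
 */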
2215
2216 void imm16_assemble(int i,struct regstat *i_regs)
2217 {
2218   if (opcode[i]==0x0f) { // LUI
2219     if(rt1[i]) {
2220       signed char t;
2221       t=get_reg(i_regs->regmap,rt1[i]);
2222       //assert(t>=0);
2223       if(t>=0) {
2224         if(!((i_regs->isconst>>t)&1))
2225           emit_movimm(imm[i]<<16,t);
2226       }
2227     }
2228   }
2229   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2230     if(rt1[i]) {
2231       signed char s,t;
2232       t=get_reg(i_regs->regmap,rt1[i]);
2233       s=get_reg(i_regs->regmap,rs1[i]);
2234       if(rs1[i]) {
2235         //assert(t>=0);
2236         //assert(s>=0);
2237         if(t>=0) {
2238           if(!((i_regs->isconst>>t)&1)) {
2239             if(s<0) {
2240               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2241               emit_addimm(t,imm[i],t);
2242             }else{
2243               if(!((i_regs->wasconst>>s)&1))
2244                 emit_addimm(s,imm[i],t);
2245               else
2246                 emit_movimm(constmap[i][s]+imm[i],t);
2247             }
2248           }
2249         }
2250       } else {
2251         if(t>=0) {
2252           if(!((i_regs->isconst>>t)&1))
2253             emit_movimm(imm[i],t);
2254         }
2255       }
2256     }
2257   }
2258   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2259     if(rt1[i]) {
2260       signed char sh,sl,th,tl;
2261       th=get_reg(i_regs->regmap,rt1[i]|64);
2262       tl=get_reg(i_regs->regmap,rt1[i]);
2263       sh=get_reg(i_regs->regmap,rs1[i]|64);
2264       sl=get_reg(i_regs->regmap,rs1[i]);
2265       if(tl>=0) {
2266         if(rs1[i]) {
2267           assert(sh>=0);
2268           assert(sl>=0);
2269           if(th>=0) {
2270             emit_addimm64_32(sh,sl,imm[i],th,tl);
2271           }
2272           else {
2273             emit_addimm(sl,imm[i],tl);
2274           }
2275         } else {
2276           emit_movimm(imm[i],tl);
2277           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2278         }
2279       }
2280     }
2281   }
2282   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2283     if(rt1[i]) {
2284       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2285       signed char sh,sl,t;
2286       t=get_reg(i_regs->regmap,rt1[i]);
2287       sh=get_reg(i_regs->regmap,rs1[i]|64);
2288       sl=get_reg(i_regs->regmap,rs1[i]);
2289       //assert(t>=0);
2290       if(t>=0) {
2291         if(rs1[i]>0) {
2292           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2293           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2294             if(opcode[i]==0x0a) { // SLTI
2295               if(sl<0) {
2296                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2297                 emit_slti32(t,imm[i],t);
2298               }else{
2299                 emit_slti32(sl,imm[i],t);
2300               }
2301             }
2302             else { // SLTIU
2303               if(sl<0) {
2304                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2305                 emit_sltiu32(t,imm[i],t);
2306               }else{
2307                 emit_sltiu32(sl,imm[i],t);
2308               }
2309             }
2310           }else{ // 64-bit
2311             assert(sl>=0);
2312             if(opcode[i]==0x0a) // SLTI
2313               emit_slti64_32(sh,sl,imm[i],t);
2314             else // SLTIU
2315               emit_sltiu64_32(sh,sl,imm[i],t);
2316           }
2317         }else{
2318           // SLTI(U) with r0 as the source makes little sense,
2319           // but such sequences do show up in real code
2320           if(opcode[i]==0x0a) { // SLTI
2321             if(0<imm[i]) emit_movimm(1,t);
2322             else emit_zeroreg(t);
2323           }
2324           else { // SLTIU
2325             if(imm[i]) emit_movimm(1,t);
2326             else emit_zeroreg(t);
2327           }
2328         }
2329       }
2330     }
2331   }
2332   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2333     if(rt1[i]) {
2334       signed char sh,sl,th,tl;
2335       th=get_reg(i_regs->regmap,rt1[i]|64);
2336       tl=get_reg(i_regs->regmap,rt1[i]);
2337       sh=get_reg(i_regs->regmap,rs1[i]|64);
2338       sl=get_reg(i_regs->regmap,rs1[i]);
2339       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2340         if(opcode[i]==0x0c) //ANDI
2341         {
2342           if(rs1[i]) {
2343             if(sl<0) {
2344               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2345               emit_andimm(tl,imm[i],tl);
2346             }else{
2347               if(!((i_regs->wasconst>>sl)&1))
2348                 emit_andimm(sl,imm[i],tl);
2349               else
2350                 emit_movimm(constmap[i][sl]&imm[i],tl);
2351             }
2352           }
2353           else
2354             emit_zeroreg(tl);
2355           if(th>=0) emit_zeroreg(th);
2356         }
2357         else
2358         {
2359           if(rs1[i]) {
2360             if(sl<0) {
2361               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2362             }
2363             if(th>=0) {
2364               if(sh<0) {
2365                 emit_loadreg(rs1[i]|64,th);
2366               }else{
2367                 emit_mov(sh,th);
2368               }
2369             }
2370             if(opcode[i]==0x0d) { // ORI
2371               if(sl<0) {
2372                 emit_orimm(tl,imm[i],tl);
2373               }else{
2374                 if(!((i_regs->wasconst>>sl)&1))
2375                   emit_orimm(sl,imm[i],tl);
2376                 else
2377                   emit_movimm(constmap[i][sl]|imm[i],tl);
2378               }
2379             }
2380             if(opcode[i]==0x0e) { // XORI
2381               if(sl<0) {
2382                 emit_xorimm(tl,imm[i],tl);
2383               }else{
2384                 if(!((i_regs->wasconst>>sl)&1))
2385                   emit_xorimm(sl,imm[i],tl);
2386                 else
2387                   emit_movimm(constmap[i][sl]^imm[i],tl);
2388               }
2389             }
2390           }
2391           else {
2392             emit_movimm(imm[i],tl);
2393             if(th>=0) emit_zeroreg(th);
2394           }
2395         }
2396       }
2397     }
2398   }
2399 }
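/*
 * Editorial example of the constant folding above (a sketch; the actual
 * output also depends on the isconst/loadedconst tracking in load_consts()):
 * when the source register holds a known constant, the immediate operation
 * collapses into a single immediate load, e.g.
 *
 *   lui $t0, 0x1f80
 *   ori $t0, $t0, 0x1070    ->  emit_movimm(0x1f801070, t0_host)
 *
 * instead of materialising 0x1f800000 and then OR-ing in the low half.
 */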
2400
2401 void shiftimm_assemble(int i,struct regstat *i_regs)
2402 {
2403   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2404   {
2405     if(rt1[i]) {
2406       signed char s,t;
2407       t=get_reg(i_regs->regmap,rt1[i]);
2408       s=get_reg(i_regs->regmap,rs1[i]);
2409       //assert(t>=0);
2410       if(t>=0&&!((i_regs->isconst>>t)&1)){
2411         if(rs1[i]==0)
2412         {
2413           emit_zeroreg(t);
2414         }
2415         else
2416         {
2417           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2418           if(imm[i]) {
2419             if(opcode2[i]==0) // SLL
2420             {
2421               emit_shlimm(s<0?t:s,imm[i],t);
2422             }
2423             if(opcode2[i]==2) // SRL
2424             {
2425               emit_shrimm(s<0?t:s,imm[i],t);
2426             }
2427             if(opcode2[i]==3) // SRA
2428             {
2429               emit_sarimm(s<0?t:s,imm[i],t);
2430             }
2431           }else{
2432             // Shift by zero
2433             if(s>=0 && s!=t) emit_mov(s,t);
2434           }
2435         }
2436       }
2437       //emit_storereg(rt1[i],t); //DEBUG
2438     }
2439   }
2440   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2441   {
2442     assert(0);
2443   }
2444   if(opcode2[i]==0x3c) // DSLL32
2445   {
2446     assert(0);
2447   }
2448   if(opcode2[i]==0x3e) // DSRL32
2449   {
2450     assert(0);
2451   }
2452   if(opcode2[i]==0x3f) // DSRA32
2453   {
2454     assert(0);
2455   }
2456 }
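/*
 * Editorial note: the doubleword shifts (DSLL/DSRL/DSRA/DSLL32/DSRL32/
 * DSRA32) only assert(0) here because the PSX R3000A is a purely 32-bit
 * CPU; these cases are leftovers from the Mupen64plus (N64/R4300i) origin
 * of this recompiler and should be unreachable on valid PSX code.
 */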
2457
2458 #ifndef shift_assemble
2459 void shift_assemble(int i,struct regstat *i_regs)
2460 {
2461   printf("Need shift_assemble for this architecture.\n");
2462   exit(1);
2463 }
2464 #endif
2465
2466 void load_assemble(int i,struct regstat *i_regs)
2467 {
2468   int s,th,tl,addr;
2469   int offset;
2470   void *jaddr=0;
2471   int memtarget=0,c=0;
2472   int fastload_reg_override=0;
2473   u_int hr,reglist=0;
2474   th=get_reg(i_regs->regmap,rt1[i]|64);
2475   tl=get_reg(i_regs->regmap,rt1[i]);
2476   s=get_reg(i_regs->regmap,rs1[i]);
2477   offset=imm[i];
2478   for(hr=0;hr<HOST_REGS;hr++) {
2479     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2480   }
2481   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2482   if(s>=0) {
2483     c=(i_regs->wasconst>>s)&1;
2484     if (c) {
2485       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2486     }
2487   }
2488   //printf("load_assemble: c=%d\n",c);
2489   //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
2490   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2491   if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
2492     ||rt1[i]==0) {
2493       // the target could be a hardware FIFO/I/O register, so the read must still be performed
2494       // (this also covers dummy reads, e.g. loads whose destination is r0)
2495       assem_debug("(forced read)\n");
2496       tl=get_reg(i_regs->regmap,-1);
2497       assert(tl>=0);
2498   }
2499   if(offset||s<0||c) addr=tl;
2500   else addr=s;
2501   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2502  if(tl>=0) {
2503   //printf("load_assemble: c=%d\n",c);
2504   //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
2505   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2506   reglist&=~(1<<tl);
2507   if(th>=0) reglist&=~(1<<th);
2508   if(!c) {
2509     #ifdef R29_HACK
2510     // Strmnnrmn's speed hack
2511     if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2512     #endif
2513     {
2514       jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
2515     }
2516   }
2517   else if(ram_offset&&memtarget) {
2518     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2519     fastload_reg_override=HOST_TEMPREG;
2520   }
2521   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2522   if (opcode[i]==0x20) { // LB
2523     if(!c||memtarget) {
2524       if(!dummy) {
2525         {
2526           int x=0,a=tl;
2527           if(!c) a=addr;
2528           if(fastload_reg_override) a=fastload_reg_override;
2529
2530           emit_movsbl_indexed(x,a,tl);
2531         }
2532       }
2533       if(jaddr)
2534         add_stub_r(LOADB_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2535     }
2536     else
2537       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2538   }
2539   if (opcode[i]==0x21) { // LH
2540     if(!c||memtarget) {
2541       if(!dummy) {
2542         int x=0,a=tl;
2543         if(!c) a=addr;
2544         if(fastload_reg_override) a=fastload_reg_override;
2545         emit_movswl_indexed(x,a,tl);
2546       }
2547       if(jaddr)
2548         add_stub_r(LOADH_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2549     }
2550     else
2551       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2552   }
2553   if (opcode[i]==0x23) { // LW
2554     if(!c||memtarget) {
2555       if(!dummy) {
2556         int a=addr;
2557         if(fastload_reg_override) a=fastload_reg_override;
2558         emit_readword_indexed(0,a,tl);
2559       }
2560       if(jaddr)
2561         add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2562     }
2563     else
2564       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2565   }
2566   if (opcode[i]==0x24) { // LBU
2567     if(!c||memtarget) {
2568       if(!dummy) {
2569         int x=0,a=tl;
2570         if(!c) a=addr;
2571         if(fastload_reg_override) a=fastload_reg_override;
2572
2573         emit_movzbl_indexed(x,a,tl);
2574       }
2575       if(jaddr)
2576         add_stub_r(LOADBU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2577     }
2578     else
2579       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2580   }
2581   if (opcode[i]==0x25) { // LHU
2582     if(!c||memtarget) {
2583       if(!dummy) {
2584         int x=0,a=tl;
2585         if(!c) a=addr;
2586         if(fastload_reg_override) a=fastload_reg_override;
2587         emit_movzwl_indexed(x,a,tl);
2588       }
2589       if(jaddr)
2590         add_stub_r(LOADHU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2591     }
2592     else
2593       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2594   }
2595   if (opcode[i]==0x27) { // LWU
2596     assert(th>=0);
2597     if(!c||memtarget) {
2598       if(!dummy) {
2599         int a=addr;
2600         if(fastload_reg_override) a=fastload_reg_override;
2601         emit_readword_indexed(0,a,tl);
2602       }
2603       if(jaddr)
2604         add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2605     }
2606     else {
2607       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2608     }
2609     emit_zeroreg(th);
2610   }
2611   if (opcode[i]==0x37) { // LD
2612     assert(0);
2613   }
2614  }
2615 }
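/*
 * Editorial sketch of the load fast path (illustrative addresses): for a
 * non-constant address, emit_fastpath_cmp_jump() emits an address check
 * that falls through to the inline RAM access and branches to the matching
 * LOAD*_STUB when the address is not ordinary RAM, e.g.
 *
 *   lw $t0, 0($t1)   # $t1 = 0x80001234 -> inline read from RAM
 *   lw $t0, 0($t1)   # $t1 = 0x1f801070 -> branch to LOADW_STUB (I/O)
 *
 * For addresses known at compile time the access is either emitted directly
 * (memtarget) or replaced with inline_readstub().
 */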
2616
2617 #ifndef loadlr_assemble
2618 void loadlr_assemble(int i,struct regstat *i_regs)
2619 {
2620   printf("Need loadlr_assemble for this architecture.\n");
2621   exit(1);
2622 }
2623 #endif
2624
2625 void store_assemble(int i,struct regstat *i_regs)
2626 {
2627   int s,tl;
2628   int addr,temp;
2629   int offset;
2630   void *jaddr=0;
2631   enum stub_type type;
2632   int memtarget=0,c=0;
2633   int agr=AGEN1+(i&1);
2634   int faststore_reg_override=0;
2635   u_int hr,reglist=0;
2636   tl=get_reg(i_regs->regmap,rs2[i]);
2637   s=get_reg(i_regs->regmap,rs1[i]);
2638   temp=get_reg(i_regs->regmap,agr);
2639   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2640   offset=imm[i];
2641   if(s>=0) {
2642     c=(i_regs->wasconst>>s)&1;
2643     if(c) {
2644       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2645     }
2646   }
2647   assert(tl>=0);
2648   assert(temp>=0);
2649   for(hr=0;hr<HOST_REGS;hr++) {
2650     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2651   }
2652   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2653   if(offset||s<0||c) addr=temp;
2654   else addr=s;
2655   if(!c) {
2656     jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
2657   }
2658   else if(ram_offset&&memtarget) {
2659     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2660     faststore_reg_override=HOST_TEMPREG;
2661   }
2662
2663   if (opcode[i]==0x28) { // SB
2664     if(!c||memtarget) {
2665       int x=0,a=temp;
2666       if(!c) a=addr;
2667       if(faststore_reg_override) a=faststore_reg_override;
2668       emit_writebyte_indexed(tl,x,a);
2669     }
2670     type=STOREB_STUB;
2671   }
2672   if (opcode[i]==0x29) { // SH
2673     if(!c||memtarget) {
2674       int x=0,a=temp;
2675       if(!c) a=addr;
2676       if(faststore_reg_override) a=faststore_reg_override;
2677       emit_writehword_indexed(tl,x,a);
2678     }
2679     type=STOREH_STUB;
2680   }
2681   if (opcode[i]==0x2B) { // SW
2682     if(!c||memtarget) {
2683       int a=addr;
2684       if(faststore_reg_override) a=faststore_reg_override;
2685       emit_writeword_indexed(tl,0,a);
2686     }
2687     type=STOREW_STUB;
2688   }
2689   if (opcode[i]==0x3F) { // SD
2690     assert(0);
2691     type=STORED_STUB;
2692   }
2693   if(jaddr) {
2694     // PCSX store handlers don't check invcode again
2695     reglist|=1<<addr;
2696     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2697     jaddr=0;
2698   }
2699   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
2700     if(!c||memtarget) {
2701       #ifdef DESTRUCTIVE_SHIFT
2702       // The x86 shift operation is 'destructive'; it overwrites the
2703       // source register, so we need to make a copy first and use that.
2704       addr=temp;
2705       #endif
2706       #if defined(HOST_IMM8)
2707       int ir=get_reg(i_regs->regmap,INVCP);
2708       assert(ir>=0);
2709       emit_cmpmem_indexedsr12_reg(ir,addr,1);
2710       #else
2711       emit_cmpmem_indexedsr12_imm(invalid_code,addr,1);
2712       #endif
2713       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
2714       emit_callne(invalidate_addr_reg[addr]);
2715       #else
2716       void *jaddr2 = out;
2717       emit_jne(0);
2718       add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),addr,0,0,0);
2719       #endif
2720     }
2721   }
2722   u_int addr_val=constmap[i][s]+offset;
2723   if(jaddr) {
2724     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2725   } else if(c&&!memtarget) {
2726     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
2727   }
2728   // Basic detection of stores that modify the currently executing block.
2729   // We don't look back, as earlier instructions should already be in the MIPS i-cache.
2730   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
2731     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
2732     assert(i_regs->regmap==regs[i].regmap); // not delay slot
2733     if(i_regs->regmap==regs[i].regmap) {
2734       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
2735       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
2736       emit_movimm(start+i*4+4,0);
2737       emit_writeword(0,&pcaddr);
2738       emit_jmp(do_interrupt);
2739     }
2740   }
2741 }
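/*
 * Editorial note on the self-modifying-code check above: after a store that
 * may hit RAM, the invalid_code[] byte for the target page is tested and,
 * if the page may still contain compiled blocks, control drops into an
 * INVCODE_STUB (or calls invalidate_addr_reg[] on targets with conditional
 * calls) so stale translations get thrown away.  The final constant-address
 * check additionally catches a store landing inside the block currently
 * being compiled and forces an early exit through do_interrupt.
 */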
2742
2743 void storelr_assemble(int i,struct regstat *i_regs)
2744 {
2745   int s,tl;
2746   int temp;
2747   int offset;
2748   void *jaddr=0;
2749   void *case1, *case2, *case3;
2750   void *done0, *done1, *done2;
2751   int memtarget=0,c=0;
2752   int agr=AGEN1+(i&1);
2753   u_int hr,reglist=0;
2754   tl=get_reg(i_regs->regmap,rs2[i]);
2755   s=get_reg(i_regs->regmap,rs1[i]);
2756   temp=get_reg(i_regs->regmap,agr);
2757   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2758   offset=imm[i];
2759   if(s>=0) {
2760     c=(i_regs->isconst>>s)&1;
2761     if(c) {
2762       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2763     }
2764   }
2765   assert(tl>=0);
2766   for(hr=0;hr<HOST_REGS;hr++) {
2767     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2768   }
2769   assert(temp>=0);
2770   if(!c) {
2771     emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
2772     if(!offset&&s!=temp) emit_mov(s,temp);
2773     jaddr=out;
2774     emit_jno(0);
2775   }
2776   else
2777   {
2778     if(!memtarget||!rs1[i]) {
2779       jaddr=out;
2780       emit_jmp(0);
2781     }
2782   }
2783   emit_addimm_no_flags(ram_offset,temp);
2784
2785   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
2786     assert(0);
2787   }
2788
2789   emit_xorimm(temp,3,temp);
2790   emit_testimm(temp,2);
2791   case2=out;
2792   emit_jne(0);
2793   emit_testimm(temp,1);
2794   case1=out;
2795   emit_jne(0);
2796   // 0
2797   if (opcode[i]==0x2A) { // SWL
2798     emit_writeword_indexed(tl,0,temp);
2799   }
2800   if (opcode[i]==0x2E) { // SWR
2801     emit_writebyte_indexed(tl,3,temp);
2802   }
2803   if (opcode[i]==0x2C) { // SDL
2804     assert(0);
2805   }
2806   if (opcode[i]==0x2D) { // SDR
2807     assert(0);
2808   }
2809   done0=out;
2810   emit_jmp(0);
2811   // 1
2812   set_jump_target(case1, out);
2813   if (opcode[i]==0x2A) { // SWL
2814     // Write the register's 3 most significant bytes into the three lower-addressed bytes
2815     if(rs2[i]) emit_rorimm(tl,8,tl);
2816     emit_writehword_indexed(tl,-1,temp);
2817     if(rs2[i]) emit_rorimm(tl,16,tl);
2818     emit_writebyte_indexed(tl,1,temp);
2819     if(rs2[i]) emit_rorimm(tl,8,tl);
2820   }
2821   if (opcode[i]==0x2E) { // SWR
2822     // Write the register's 2 least significant bytes into the two higher-addressed bytes
2823     emit_writehword_indexed(tl,1,temp);
2824   }
2825   if (opcode[i]==0x2C) { // SDL
2826     assert(0);
2827   }
2828   if (opcode[i]==0x2D) { // SDR
2829     assert(0);
2830   }
2831   done1=out;
2832   emit_jmp(0);
2833   // 2
2834   set_jump_target(case2, out);
2835   emit_testimm(temp,1);
2836   case3=out;
2837   emit_jne(0);
2838   if (opcode[i]==0x2A) { // SWL
2839     // Write the register's 2 most significant bytes into the two lower-addressed bytes
2840     if(rs2[i]) emit_rorimm(tl,16,tl);
2841     emit_writehword_indexed(tl,-2,temp);
2842     if(rs2[i]) emit_rorimm(tl,16,tl);
2843   }
2844   if (opcode[i]==0x2E) { // SWR
2845     // Write the register's 3 least significant bytes into the three higher-addressed bytes
2846     emit_writebyte_indexed(tl,-1,temp);
2847     if(rs2[i]) emit_rorimm(tl,8,tl);
2848     emit_writehword_indexed(tl,0,temp);
2849     if(rs2[i]) emit_rorimm(tl,24,tl);
2850   }
2851   if (opcode[i]==0x2C) { // SDL
2852     assert(0);
2853   }
2854   if (opcode[i]==0x2D) { // SDR
2855     assert(0);
2856   }
2857   done2=out;
2858   emit_jmp(0);
2859   // 3
2860   set_jump_target(case3, out);
2861   if (opcode[i]==0x2A) { // SWL
2862     // Write the register's most significant byte into the lowest-addressed byte
2863     if(rs2[i]) emit_rorimm(tl,24,tl);
2864     emit_writebyte_indexed(tl,-3,temp);
2865     if(rs2[i]) emit_rorimm(tl,8,tl);
2866   }
2867   if (opcode[i]==0x2E) { // SWR
2868     // Write entire word
2869     emit_writeword_indexed(tl,-3,temp);
2870   }
2871   if (opcode[i]==0x2C) { // SDL
2872     assert(0);
2873   }
2874   if (opcode[i]==0x2D) { // SDR
2875     assert(0);
2876   }
2877   set_jump_target(done0, out);
2878   set_jump_target(done1, out);
2879   set_jump_target(done2, out);
2880   if (opcode[i]==0x2C) { // SDL
2881     assert(0);
2882   }
2883   if (opcode[i]==0x2D) { // SDR
2884     assert(0);
2885   }
2886   if(!c||!memtarget)
2887     add_stub_r(STORELR_STUB,jaddr,out,i,temp,i_regs,ccadj[i],reglist);
2888   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
2889     emit_addimm_no_flags(-ram_offset,temp);
2890     #if defined(HOST_IMM8)
2891     int ir=get_reg(i_regs->regmap,INVCP);
2892     assert(ir>=0);
2893     emit_cmpmem_indexedsr12_reg(ir,temp,1);
2894     #else
2895     emit_cmpmem_indexedsr12_imm(invalid_code,temp,1);
2896     #endif
2897     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
2898     emit_callne(invalidate_addr_reg[temp]);
2899     #else
2900     void *jaddr2 = out;
2901     emit_jne(0);
2902     add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),temp,0,0,0);
2903     #endif
2904   }
2905 }
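/*
 * Editorial worked example of the SWL/SWR cases above (PSX memory is
 * little-endian; addresses are illustrative).  With n = addr & 3:
 *   SWL stores the (n+1) most significant bytes of rt at addr-n .. addr
 *   SWR stores the (4-n) least significant bytes of rt at addr .. addr|3
 * So for the usual unaligned-store idiom, with A & 3 == 1 and
 * $t0 = 0xDDCCBBAA:
 *   swr $t0, 0(A)   # writes AA BB CC to A, A+1, A+2
 *   swl $t0, 3(A)   # writes DD       to A+3
 * which together store the full word 0xDDCCBBAA at the unaligned address A.
 */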
2906
2907 void c1ls_assemble(int i,struct regstat *i_regs)
2908 {
2909   cop1_unusable(i, i_regs);
2910 }
2911
2912 void c2ls_assemble(int i,struct regstat *i_regs)
2913 {
2914   int s,tl;
2915   int ar;
2916   int offset;
2917   int memtarget=0,c=0;
2918   void *jaddr2=NULL;
2919   enum stub_type type;
2920   int agr=AGEN1+(i&1);
2921   int fastio_reg_override=0;
2922   u_int hr,reglist=0;
2923   u_int copr=(source[i]>>16)&0x1f;
2924   s=get_reg(i_regs->regmap,rs1[i]);
2925   tl=get_reg(i_regs->regmap,FTEMP);
2926   offset=imm[i];
2927   assert(rs1[i]>0);
2928   assert(tl>=0);
2929
2930   for(hr=0;hr<HOST_REGS;hr++) {
2931     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2932   }
2933   if(i_regs->regmap[HOST_CCREG]==CCREG)
2934     reglist&=~(1<<HOST_CCREG);
2935
2936   // get the address
2937   if (opcode[i]==0x3a) { // SWC2
2938     ar=get_reg(i_regs->regmap,agr);
2939     if(ar<0) ar=get_reg(i_regs->regmap,-1);
2940     reglist|=1<<ar;
2941   } else { // LWC2
2942     ar=tl;
2943   }
2944   if(s>=0) c=(i_regs->wasconst>>s)&1;
2945   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
2946   if (!offset&&!c&&s>=0) ar=s;
2947   assert(ar>=0);
2948
2949   if (opcode[i]==0x3a) { // SWC2
2950     cop2_get_dreg(copr,tl,HOST_TEMPREG);
2951     type=STOREW_STUB;
2952   }
2953   else
2954     type=LOADW_STUB;
2955
2956   if(c&&!memtarget) {
2957     jaddr2=out;
2958     emit_jmp(0); // inline_readstub/inline_writestub?
2959   }
2960   else {
2961     if(!c) {
2962       jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
2963     }
2964     else if(ram_offset&&memtarget) {
2965       emit_addimm(ar,ram_offset,HOST_TEMPREG);
2966       fastio_reg_override=HOST_TEMPREG;
2967     }
2968     if (opcode[i]==0x32) { // LWC2
2969       int a=ar;
2970       if(fastio_reg_override) a=fastio_reg_override;
2971       emit_readword_indexed(0,a,tl);
2972     }
2973     if (opcode[i]==0x3a) { // SWC2
2974       #ifdef DESTRUCTIVE_SHIFT
2975       if(!offset&&!c&&s>=0) emit_mov(s,ar);
2976       #endif
2977       int a=ar;
2978       if(fastio_reg_override) a=fastio_reg_override;
2979       emit_writeword_indexed(tl,0,a);
2980     }
2981   }
2982   if(jaddr2)
2983     add_stub_r(type,jaddr2,out,i,ar,i_regs,ccadj[i],reglist);
2984   if(opcode[i]==0x3a) // SWC2
2985   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
2986 #if defined(HOST_IMM8)
2987     int ir=get_reg(i_regs->regmap,INVCP);
2988     assert(ir>=0);
2989     emit_cmpmem_indexedsr12_reg(ir,ar,1);
2990 #else
2991     emit_cmpmem_indexedsr12_imm(invalid_code,ar,1);
2992 #endif
2993     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
2994     emit_callne(invalidate_addr_reg[ar]);
2995     #else
2996     void *jaddr3 = out;
2997     emit_jne(0);
2998     add_stub(INVCODE_STUB,jaddr3,out,reglist|(1<<HOST_CCREG),ar,0,0,0);
2999     #endif
3000   }
3001   if (opcode[i]==0x32) { // LWC2
3002     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3003   }
3004 }
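/*
 * Editorial note: LWC2/SWC2 move one 32-bit word between memory and a GTE
 * (COP2) data register; copr is the GTE register index taken from bits
 * 16..20 of the opcode.  cop2_get_dreg()/cop2_put_dreg() shuttle the value
 * between the FTEMP host register and the emulated GTE register file, while
 * the address and fast-path handling mirrors the plain load/store code above.
 */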
3005
3006 #ifndef multdiv_assemble
3007 void multdiv_assemble(int i,struct regstat *i_regs)
3008 {
3009   printf("Need multdiv_assemble for this architecture.\n");
3010   exit(1);
3011 }
3012 #endif
3013
3014 void mov_assemble(int i,struct regstat *i_regs)
3015 {
3016   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3017   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3018   if(rt1[i]) {
3019     signed char sh,sl,th,tl;
3020     th=get_reg(i_regs->regmap,rt1[i]|64);
3021     tl=get_reg(i_regs->regmap,rt1[i]);
3022     //assert(tl>=0);
3023     if(tl>=0) {
3024       sh=get_reg(i_regs->regmap,rs1[i]|64);
3025       sl=get_reg(i_regs->regmap,rs1[i]);
3026       if(sl>=0) emit_mov(sl,tl);
3027       else emit_loadreg(rs1[i],tl);
3028       if(th>=0) {
3029         if(sh>=0) emit_mov(sh,th);
3030         else emit_loadreg(rs1[i]|64,th);
3031       }
3032     }
3033   }
3034 }
3035
3036 #ifndef fconv_assemble
3037 void fconv_assemble(int i,struct regstat *i_regs)
3038 {
3039   printf("Need fconv_assemble for this architecture.\n");
3040   exit(1);
3041 }
3042 #endif
3043
3044 #if 0
3045 void float_assemble(int i,struct regstat *i_regs)
3046 {
3047   printf("Need float_assemble for this architecture.\n");
3048   exit(1);
3049 }
3050 #endif
3051
3052 void syscall_assemble(int i,struct regstat *i_regs)
3053 {
3054   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3055   assert(ccreg==HOST_CCREG);
3056   assert(!is_delayslot);
3057   (void)ccreg;
3058   emit_movimm(start+i*4,EAX); // Get PC
3059   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3060   emit_jmp(jump_syscall_hle); // XXX
3061 }
3062
3063 void hlecall_assemble(int i,struct regstat *i_regs)
3064 {
3065   extern void psxNULL();
3066   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3067   assert(ccreg==HOST_CCREG);
3068   assert(!is_delayslot);
3069   (void)ccreg;
3070   emit_movimm(start+i*4+4,0); // Get PC
3071   uint32_t hleCode = source[i] & 0x03ffffff;
3072   if (hleCode >= ARRAY_SIZE(psxHLEt))
3073     emit_movimm((uintptr_t)psxNULL,1);
3074   else
3075     emit_movimm((uintptr_t)psxHLEt[hleCode],1);
3076   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
3077   emit_jmp(jump_hlecall);
3078 }
3079
3080 void intcall_assemble(int i,struct regstat *i_regs)
3081 {
3082   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3083   assert(ccreg==HOST_CCREG);
3084   assert(!is_delayslot);
3085   (void)ccreg;
3086   emit_movimm(start+i*4,0); // Get PC
3087   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG);
3088   emit_jmp(jump_intcall);
3089 }
3090
3091 void ds_assemble(int i,struct regstat *i_regs)
3092 {
3093   speculate_register_values(i);
3094   is_delayslot=1;
3095   switch(itype[i]) {
3096     case ALU:
3097       alu_assemble(i,i_regs);break;
3098     case IMM16:
3099       imm16_assemble(i,i_regs);break;
3100     case SHIFT:
3101       shift_assemble(i,i_regs);break;
3102     case SHIFTIMM:
3103       shiftimm_assemble(i,i_regs);break;
3104     case LOAD:
3105       load_assemble(i,i_regs);break;
3106     case LOADLR:
3107       loadlr_assemble(i,i_regs);break;
3108     case STORE:
3109       store_assemble(i,i_regs);break;
3110     case STORELR:
3111       storelr_assemble(i,i_regs);break;
3112     case COP0:
3113       cop0_assemble(i,i_regs);break;
3114     case COP1:
3115       cop1_assemble(i,i_regs);break;
3116     case C1LS:
3117       c1ls_assemble(i,i_regs);break;
3118     case COP2:
3119       cop2_assemble(i,i_regs);break;
3120     case C2LS:
3121       c2ls_assemble(i,i_regs);break;
3122     case C2OP:
3123       c2op_assemble(i,i_regs);break;
3124     case FCONV:
3125       fconv_assemble(i,i_regs);break;
3126     case FLOAT:
3127       float_assemble(i,i_regs);break;
3128     case FCOMP:
3129       fcomp_assemble(i,i_regs);break;
3130     case MULTDIV:
3131       multdiv_assemble(i,i_regs);break;
3132     case MOV:
3133       mov_assemble(i,i_regs);break;
3134     case SYSCALL:
3135     case HLECALL:
3136     case INTCALL:
3137     case SPAN:
3138     case UJUMP:
3139     case RJUMP:
3140     case CJUMP:
3141     case SJUMP:
3142     case FJUMP:
3143       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
3144   }
3145   is_delayslot=0;
3146 }
3147
3148 // Is the branch target a valid internal jump?
3149 int internal_branch(uint64_t i_is32,int addr)
3150 {
3151   if(addr&1) return 0; // Indirect (register) jump
3152   if(addr>=start && addr<start+slen*4-4)
3153   {
3154     //int t=(addr-start)>>2;
3155     // Delay slots are not valid branch targets
3156     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3157     // 64 -> 32 bit transition requires a recompile
3158     /*if(is32[t]&~unneeded_reg_upper[t]&~i_is32)
3159     {
3160       if(requires_32bit[t]&~i_is32) printf("optimizable: no\n");
3161       else printf("optimizable: yes\n");
3162     }*/
3163     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3164     return 1;
3165   }
3166   return 0;
3167 }
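/*
 * Editorial example (values illustrative only): with start = 0x80010000 and
 * slen = 100 instructions, targets in [0x80010000, 0x80010000 + 100*4 - 4)
 * can be compiled as direct jumps inside this block; anything else,
 * including register-indirect targets (flagged by bit 0 of addr), is
 * resolved through the external jump lookup at run time instead.
 */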
3168
3169 #ifndef wb_invalidate
3170 void wb_invalidate(signed char pre[],signed char entry[],uint64_t dirty,uint64_t is32,
3171   uint64_t u,uint64_t uu)
3172 {
3173   int hr;
3174   for(hr=0;hr<HOST_REGS;hr++) {
3175     if(hr!=EXCLUDE_REG) {
3176       if(pre[hr]!=entry[hr]) {
3177         if(pre[hr]>=0) {
3178           if((dirty>>hr)&1) {
3179             if(get_reg(entry,pre[hr])<0) {
3180               if(pre[hr]<64) {
3181                 if(!((u>>pre[hr])&1)) {
3182                   emit_storereg(pre[hr],hr);
3183                   if( ((is32>>pre[hr])&1) && !((uu>>pre[hr])&1) ) {
3184                     emit_sarimm(hr,31,hr);
3185                     emit_storereg(pre[hr]|64,hr);
3186                   }
3187                 }
3188               }else{
3189                 if(!((uu>>(pre[hr]&63))&1) && !((is32>>(pre[hr]&63))&1)) {
3190                   emit_storereg(pre[hr],hr);
3191                 }
3192               }
3193             }
3194           }
3195         }
3196       }
3197     }
3198   }
3199   // Move from one register to another (no writeback)
3200   for(hr=0;hr<HOST_REGS;hr++) {
3201     if(hr!=EXCLUDE_REG) {
3202       if(pre[hr]!=entry[hr]) {
3203         if(pre[hr]>=0&&(pre[hr]&63)<TEMPREG) {
3204           int nr;
3205           if((nr=get_reg(entry,pre[hr]))>=0) {
3206             emit_mov(hr,nr);
3207           }
3208         }
3209       }
3210     }
3211   }
3212 }
3213 #endif
3214
3215 // Load the specified registers
3216 // This only loads the registers given as arguments because
3217 // we don't want to load things that will be overwritten
3218 void load_regs(signed char entry[],signed char regmap[],int is32,int rs1,int rs2)
3219 {
3220   int hr;
3221   // Load 32-bit regs
3222   for(hr=0;hr<HOST_REGS;hr++) {
3223     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3224       if(entry[hr]!=regmap[hr]) {
3225         if(regmap[hr]==rs1||regmap[hr]==rs2)
3226         {
3227           if(regmap[hr]==0) {
3228             emit_zeroreg(hr);
3229           }
3230           else
3231           {
3232             emit_loadreg(regmap[hr],hr);
3233           }
3234         }
3235       }
3236     }
3237   }
3238   // Load 64-bit regs
3239   for(hr=0;hr<HOST_REGS;hr++) {
3240     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3241       if(entry[hr]!=regmap[hr]) {
3242         if(regmap[hr]-64==rs1||regmap[hr]-64==rs2)
3243         {
3244           assert(regmap[hr]!=64);
3245           if((is32>>(regmap[hr]&63))&1) {
3246             int lr=get_reg(regmap,regmap[hr]-64);
3247             if(lr>=0)
3248               emit_sarimm(lr,31,hr);
3249             else
3250               emit_loadreg(regmap[hr],hr);
3251           }
3252           else
3253           {
3254             emit_loadreg(regmap[hr],hr);
3255           }
3256         }
3257       }
3258     }
3259   }
3260 }
3261
3262 // Load registers prior to the start of a loop
3263 // so that they are not loaded within the loop
3264 static void loop_preload(signed char pre[],signed char entry[])
3265 {
3266   int hr;
3267   for(hr=0;hr<HOST_REGS;hr++) {
3268     if(hr!=EXCLUDE_REG) {
3269       if(pre[hr]!=entry[hr]) {
3270         if(entry[hr]>=0) {
3271           if(get_reg(pre,entry[hr])<0) {
3272             assem_debug("loop preload:\n");
3273             //printf("loop preload: %d\n",hr);
3274             if(entry[hr]==0) {
3275               emit_zeroreg(hr);
3276             }
3277             else if(entry[hr]<TEMPREG)
3278             {
3279               emit_loadreg(entry[hr],hr);
3280             }
3281             else if(entry[hr]-64<TEMPREG)
3282             {
3283               emit_loadreg(entry[hr],hr);
3284             }
3285           }
3286         }
3287       }
3288     }
3289   }
3290 }
3291
3292 // Generate address for load/store instruction
3293 // goes to AGEN for writes, FTEMP for LOADLR and cop1/2 loads
3294 void address_generation(int i,struct regstat *i_regs,signed char entry[])
3295 {
3296   if(itype[i]==LOAD||itype[i]==LOADLR||itype[i]==STORE||itype[i]==STORELR||itype[i]==C1LS||itype[i]==C2LS) {
3297     int ra=-1;
3298     int agr=AGEN1+(i&1);
3299     if(itype[i]==LOAD) {
3300       ra=get_reg(i_regs->regmap,rt1[i]);
3301       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3302       assert(ra>=0);
3303     }
3304     if(itype[i]==LOADLR) {
3305       ra=get_reg(i_regs->regmap,FTEMP);
3306     }
3307     if(itype[i]==STORE||itype[i]==STORELR) {
3308       ra=get_reg(i_regs->regmap,agr);
3309       if(ra<0) ra=get_reg(i_regs->regmap,-1);
3310     }
3311     if(itype[i]==C1LS||itype[i]==C2LS) {
3312       if ((opcode[i]&0x3b)==0x31||(opcode[i]&0x3b)==0x32) // LWC1/LDC1/LWC2/LDC2
3313         ra=get_reg(i_regs->regmap,FTEMP);
3314       else { // SWC1/SDC1/SWC2/SDC2
3315         ra=get_reg(i_regs->regmap,agr);
3316         if(ra<0) ra=get_reg(i_regs->regmap,-1);
3317       }
3318     }
3319     int rs=get_reg(i_regs->regmap,rs1[i]);
3320     if(ra>=0) {
3321       int offset=imm[i];
3322       int c=(i_regs->wasconst>>rs)&1;
3323       if(rs1[i]==0) {
3324         // Using r0 as a base address
3325         if(!entry||entry[ra]!=agr) {
3326           if (opcode[i]==0x22||opcode[i]==0x26) {
3327             emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3328           }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3329             emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3330           }else{
3331             emit_movimm(offset,ra);
3332           }
3333         } // else did it in the previous cycle
3334       }
3335       else if(rs<0) {
3336         if(!entry||entry[ra]!=rs1[i])
3337           emit_loadreg(rs1[i],ra);
3338         //if(!entry||entry[ra]!=rs1[i])
3339         //  printf("poor load scheduling!\n");
3340       }
3341       else if(c) {
3342         if(rs1[i]!=rt1[i]||itype[i]!=LOAD) {
3343           if(!entry||entry[ra]!=agr) {
3344             if (opcode[i]==0x22||opcode[i]==0x26) {
3345               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3346             }else if (opcode[i]==0x1a||opcode[i]==0x1b) {
3347               emit_movimm((constmap[i][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3348             }else{
3349               emit_movimm(constmap[i][rs]+offset,ra);
3350               regs[i].loadedconst|=1<<ra;
3351             }
3352           } // else did it in the previous cycle
3353         } // else load_consts already did it
3354       }
3355       if(offset&&!c&&rs1[i]) {
3356         if(rs>=0) {
3357           emit_addimm(rs,offset,ra);
3358         }else{
3359           emit_addimm(ra,offset,ra);
3360         }
3361       }
3362     }
3363   }
3364   // Preload constants for next instruction
3365   if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS||itype[i+1]==C2LS) {
3366     int agr,ra;
3367     // Actual address
3368     agr=AGEN1+((i+1)&1);
3369     ra=get_reg(i_regs->regmap,agr);
3370     if(ra>=0) {
3371       int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
3372       int offset=imm[i+1];
3373       int c=(regs[i+1].wasconst>>rs)&1;
3374       if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
3375         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3376           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFFC,ra); // LWL/LWR
3377         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3378           emit_movimm((constmap[i+1][rs]+offset)&0xFFFFFFF8,ra); // LDL/LDR
3379         }else{
3380           emit_movimm(constmap[i+1][rs]+offset,ra);
3381           regs[i+1].loadedconst|=1<<ra;
3382         }
3383       }
3384       else if(rs1[i+1]==0) {
3385         // Using r0 as a base address
3386         if (opcode[i+1]==0x22||opcode[i+1]==0x26) {
3387           emit_movimm(offset&0xFFFFFFFC,ra); // LWL/LWR
3388         }else if (opcode[i+1]==0x1a||opcode[i+1]==0x1b) {
3389           emit_movimm(offset&0xFFFFFFF8,ra); // LDL/LDR
3390         }else{
3391           emit_movimm(offset,ra);
3392         }
3393       }
3394     }
3395   }
3396 }
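/*
 * Editorial sketch of the address generation above (illustrative; exact
 * emission depends on the constant tracking): loads build their address in
 * the destination's host register, stores use one of the two AGEN
 * temporaries (AGEN1+(i&1) alternates so the next instruction's address can
 * be preloaded while the current one is still live), and LOADLR / cop1+2
 * loads use FTEMP.  For example:
 *
 *   lw  $t0, 8($sp)   # non-const base -> emit_addimm(sp_host, 8, t0_host)
 *   sw  $t0, 8($sp)   # same computation, but into an AGEN temp
 *   lwl $t0, 3($a0)   # const or r0 base: address pre-masked with ~3
 */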
3397
3398 static int get_final_value(int hr, int i, int *value)
3399 {
3400   int reg=regs[i].regmap[hr];
3401   while(i<slen-1) {
3402     if(regs[i+1].regmap[hr]!=reg) break;
3403     if(!((regs[i+1].isconst>>hr)&1)) break;
3404     if(bt[i+1]) break;
3405     i++;
3406   }
3407   if(i<slen-1) {
3408     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
3409       *value=constmap[i][hr];
3410       return 1;
3411     }
3412     if(!bt[i+1]) {
3413       if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==CJUMP||itype[i+1]==SJUMP) {
3414         // Load in delay slot, out-of-order execution
3415         if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasconst>>hr)&1))
3416         {
3417           // Precompute load address
3418           *value=constmap[i][hr]+imm[i+2];
3419           return 1;
3420         }
3421       }
3422       if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
3423       {
3424         // Precompute load address
3425         *value=constmap[i][hr]+imm[i+1];
3426         //printf("c=%x imm=%lx\n",(long)constmap[i][hr],imm[i+1]);
3427         return 1;
3428       }
3429     }
3430   }
3431   *value=constmap[i][hr];
3432   //printf("c=%lx\n",(long)constmap[i][hr]);
3433   if(i==slen-1) return 1;
3434   if(reg<64) {
3435     return !((unneeded_reg[i+1]>>reg)&1);
3436   }else{
3437     return !((unneeded_reg_upper[i+1]>>reg)&1);
3438   }
3439 }
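/*
 * Editorial example: get_final_value() walks forward while the host register
 * keeps holding the propagated constant, so only the last value that is
 * actually needed gets materialised.  E.g. for
 *
 *   lui $t0, 0x1f80
 *   lw  $t0, 0x1070($t0)    # base and destination are the same register
 *
 * the intermediate 0x1f800000 can be skipped and the precomputed load
 * address 0x1f801070 loaded directly instead (the "Precompute load address"
 * cases above).
 */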
3440
3441 // Load registers with known constants
3442 void load_consts(signed char pre[],signed char regmap[],int is32,int i)
3443 {
3444   int hr,hr2;
3445   // propagate loaded constant flags
3446   if(i==0||bt[i])
3447     regs[i].loadedconst=0;
3448   else {
3449     for(hr=0;hr<HOST_REGS;hr++) {
3450       if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((regs[i-1].isconst>>hr)&1)&&pre[hr]==regmap[hr]
3451          &&regmap[hr]==regs[i-1].regmap[hr]&&((regs[i-1].loadedconst>>hr)&1))
3452       {
3453         regs[i].loadedconst|=1<<hr;
3454       }
3455     }
3456   }
3457   // Load 32-bit regs
3458   for(hr=0;hr<HOST_REGS;hr++) {
3459     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3460       //if(entry[hr]!=regmap[hr]) {
3461       if(!((regs[i].loadedconst>>hr)&1)) {
3462         if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3463           int value,similar=0;
3464           if(get_final_value(hr,i,&value)) {
3465             // see if some other register has similar value
3466             for(hr2=0;hr2<HOST_REGS;hr2++) {
3467               if(hr2!=EXCLUDE_REG&&((regs[i].loadedconst>>hr2)&1)) {
3468                 if(is_similar_value(value,constmap[i][hr2])) {
3469                   similar=1;
3470                   break;
3471                 }
3472               }
3473             }
3474             if(similar) {
3475               int value2;
3476               if(get_final_value(hr2,i,&value2)) // is this needed?
3477                 emit_movimm_from(value2,hr2,value,hr);
3478               else
3479                 emit_movimm(value,hr);
3480             }
3481             else if(value==0) {
3482               emit_zeroreg(hr);
3483             }
3484             else {
3485               emit_movimm(value,hr);
3486             }
3487           }
3488           regs[i].loadedconst|=1<<hr;
3489         }
3490       }
3491     }
3492   }
3493   // Load 64-bit regs
3494   for(hr=0;hr<HOST_REGS;hr++) {
3495     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3496       //if(entry[hr]!=regmap[hr]) {
3497       if(i==0||!((regs[i-1].isconst>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3498         if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3499           if((is32>>(regmap[hr]&63))&1) {
3500             int lr=get_reg(regmap,regmap[hr]-64);
3501             assert(lr>=0);
3502             emit_sarimm(lr,31,hr);
3503           }
3504           else
3505           {
3506             int value;
3507             if(get_final_value(hr,i,&value)) {
3508               if(value==0) {
3509                 emit_zeroreg(hr);
3510               }
3511               else {
3512                 emit_movimm(value,hr);
3513               }
3514             }
3515           }
3516         }
3517       }
3518     }
3519   }
3520 }
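// Same idea as load_consts above, but with no look-ahead or value sharing:
// every dirty register that maps to a known constant is reloaded straight
// from constmap.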
3521 void load_all_consts(signed char regmap[],int is32,u_int dirty,int i)
3522 {
3523   int hr;
3524   // Load 32-bit regs
3525   for(hr=0;hr<HOST_REGS;hr++) {
3526     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3527       if(((regs[i].isconst>>hr)&1)&&regmap[hr]<64&&regmap[hr]>0) {
3528         int value=constmap[i][hr];
3529         if(value==0) {
3530           emit_zeroreg(hr);
3531         }
3532         else {
3533           emit_movimm(value,hr);
3534         }
3535       }
3536     }
3537   }
3538   // Load 64-bit regs
3539   for(hr=0;hr<HOST_REGS;hr++) {
3540     if(hr!=EXCLUDE_REG&&regmap[hr]>=0&&((dirty>>hr)&1)) {
3541       if(((regs[i].isconst>>hr)&1)&&regmap[hr]>64) {
3542         if((is32>>(regmap[hr]&63))&1) {
3543           int lr=get_reg(regmap,regmap[hr]-64);
3544           assert(lr>=0);
3545           emit_sarimm(lr,31,hr);
3546         }
3547         else
3548         {
3549           int value=constmap[i][hr];
3550           if(value==0) {
3551             emit_zeroreg(hr);
3552           }
3553           else {
3554             emit_movimm(value,hr);
3555           }
3556         }
3557       }
3558     }
3559   }
3560 }
3561
3562 // Write out all dirty registers (except cycle count)
3563 void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty)
3564 {
3565   int hr;
3566   for(hr=0;hr<HOST_REGS;hr++) {
3567     if(hr!=EXCLUDE_REG) {
3568       if(i_regmap[hr]>0) {
3569         if(i_regmap[hr]!=CCREG) {
3570           if((i_dirty>>hr)&1) {
3571             if(i_regmap[hr]<64) {
3572               emit_storereg(i_regmap[hr],hr);
3573             }else{
3574               if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
3575                 emit_storereg(i_regmap[hr],hr);
3576               }
3577             }
3578           }
3579         }
3580       }
3581     }
3582   }
3583 }
3584 // Write out dirty registers that we need to reload (pair with load_needed_regs)
3585 // This writes the registers not written by store_regs_bt
3586 void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
3587 {
3588   int hr;
3589   int t=(addr-start)>>2;
3590   for(hr=0;hr<HOST_REGS;hr++) {
3591     if(hr!=EXCLUDE_REG) {
3592       if(i_regmap[hr]>0) {
3593         if(i_regmap[hr]!=CCREG) {
3594           if(i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1) && !(((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
3595             if((i_dirty>>hr)&1) {
3596               if(i_regmap[hr]<64) {
3597                 emit_storereg(i_regmap[hr],hr);
3598               }else{
3599                 if( !((i_is32>>(i_regmap[hr]&63))&1) ) {
3600                   emit_storereg(i_regmap[hr],hr);
3601                 }
3602               }
3603             }
3604           }
3605         }
3606       }
3607     }
3608   }
3609 }
3610
3611 // Load all registers (except cycle count)
3612 void load_all_regs(signed char i_regmap[])
3613 {
3614   int hr;
3615   for(hr=0;hr<HOST_REGS;hr++) {
3616     if(hr!=EXCLUDE_REG) {
3617       if(i_regmap[hr]==0) {
3618         emit_zeroreg(hr);
3619       }
3620       else
3621       if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
3622       {
3623         emit_loadreg(i_regmap[hr],hr);
3624       }
3625     }
3626   }
3627 }
3628
3629 // Load all current registers that are also needed by the next instruction
3630 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
3631 {
3632   int hr;
3633   for(hr=0;hr<HOST_REGS;hr++) {
3634     if(hr!=EXCLUDE_REG) {
3635       if(get_reg(next_regmap,i_regmap[hr])>=0) {
3636         if(i_regmap[hr]==0) {
3637           emit_zeroreg(hr);
3638         }
3639         else
3640         if(i_regmap[hr]>0 && (i_regmap[hr]&63)<TEMPREG && i_regmap[hr]!=CCREG)
3641         {
3642           emit_loadreg(i_regmap[hr],hr);
3643         }
3644       }
3645     }
3646   }
3647 }
3648
3649 // Load all regs, storing cycle count if necessary
3650 void load_regs_entry(int t)
3651 {
3652   int hr;
3653   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_ADJUST(1),HOST_CCREG);
3654   else if(ccadj[t]) emit_addimm(HOST_CCREG,-CLOCK_ADJUST(ccadj[t]),HOST_CCREG);
3655   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
3656     emit_storereg(CCREG,HOST_CCREG);
3657   }
3658   // Load 32-bit regs
3659   for(hr=0;hr<HOST_REGS;hr++) {
3660     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
3661       if(regs[t].regmap_entry[hr]==0) {
3662         emit_zeroreg(hr);
3663       }
3664       else if(regs[t].regmap_entry[hr]!=CCREG)
3665       {
3666         emit_loadreg(regs[t].regmap_entry[hr],hr);
3667       }
3668     }
3669   }
3670   // Load 64-bit regs
3671   for(hr=0;hr<HOST_REGS;hr++) {
3672     if(regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
3673       assert(regs[t].regmap_entry[hr]!=64);
3674       if((regs[t].was32>>(regs[t].regmap_entry[hr]&63))&1) {
3675         int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
3676         if(lr<0) {
3677           emit_loadreg(regs[t].regmap_entry[hr],hr);
3678         }
3679         else
3680         {
3681           emit_sarimm(lr,31,hr);
3682         }
3683       }
3684       else
3685       {
3686         emit_loadreg(regs[t].regmap_entry[hr],hr);
3687       }
3688     }
3689   }
3690 }
3691
3692 // Store dirty registers prior to branch
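// For a branch that stays inside the block, only the dirty registers the
// target does not already expect to hold (or that it considers clean), and
// that are not flagged unneeded there, are written back.  A branch that
// leaves the block flushes everything via wb_dirtys.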
3693 void store_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
3694 {
3695   if(internal_branch(i_is32,addr))
3696   {
3697     int t=(addr-start)>>2;
3698     int hr;
3699     for(hr=0;hr<HOST_REGS;hr++) {
3700       if(hr!=EXCLUDE_REG) {
3701         if(i_regmap[hr]>0 && i_regmap[hr]!=CCREG) {
3702           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
3703             if((i_dirty>>hr)&1) {
3704               if(i_regmap[hr]<64) {
3705                 if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
3706                   emit_storereg(i_regmap[hr],hr);
3707                   if( ((i_is32>>i_regmap[hr])&1) && !((unneeded_reg_upper[t]>>i_regmap[hr])&1) ) {
3708                     #ifdef DESTRUCTIVE_WRITEBACK
3709                     emit_sarimm(hr,31,hr);
3710                     emit_storereg(i_regmap[hr]|64,hr);
3711                     #else
3712                     emit_sarimm(hr,31,HOST_TEMPREG);
3713                     emit_storereg(i_regmap[hr]|64,HOST_TEMPREG);
3714                     #endif
3715                   }
3716                 }
3717               }else{
3718                 if( !((i_is32>>(i_regmap[hr]&63))&1) && !((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1) ) {
3719                   emit_storereg(i_regmap[hr],hr);
3720                 }
3721               }
3722             }
3723           }
3724         }
3725       }
3726     }
3727   }
3728   else
3729   {
3730     // Branch out of this block, write out all dirty regs
3731     wb_dirtys(i_regmap,i_is32,i_dirty);
3732   }
3733 }
3734
3735 // Load all needed registers for branch target
3736 void load_regs_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
3737 {
3738   //if(addr>=start && addr<(start+slen*4))
3739   if(internal_branch(i_is32,addr))
3740   {
3741     int t=(addr-start)>>2;
3742     int hr;
3743     // Store the cycle count before loading something else
3744     if(i_regmap[HOST_CCREG]!=CCREG) {
3745       assert(i_regmap[HOST_CCREG]==-1);
3746     }
3747     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
3748       emit_storereg(CCREG,HOST_CCREG);
3749     }
3750     // Load 32-bit regs
3751     for(hr=0;hr<HOST_REGS;hr++) {
3752       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
3753         #ifdef DESTRUCTIVE_WRITEBACK
3754         if(i_regmap[hr]!=regs[t].regmap_entry[hr] || ( !((regs[t].dirty>>hr)&1) && ((i_dirty>>hr)&1) && (((i_is32&~unneeded_reg_upper[t])>>i_regmap[hr])&1) ) || (((i_is32&~regs[t].was32&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)) {
3755         #else
3756         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
3757         #endif
3758           if(regs[t].regmap_entry[hr]==0) {
3759             emit_zeroreg(hr);
3760           }
3761           else if(regs[t].regmap_entry[hr]!=CCREG)
3762           {
3763             emit_loadreg(regs[t].regmap_entry[hr],hr);
3764           }
3765         }
3766       }
3767     }
3768     // Load 64-bit regs
3769     for(hr=0;hr<HOST_REGS;hr++) {
3770       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=64&&regs[t].regmap_entry[hr]<TEMPREG+64) {
3771         if(i_regmap[hr]!=regs[t].regmap_entry[hr]) {
3772           assert(regs[t].regmap_entry[hr]!=64);
3773           if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
3774             int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
3775             if(lr<0) {
3776               emit_loadreg(regs[t].regmap_entry[hr],hr);
3777             }
3778             else
3779             {
3780               emit_sarimm(lr,31,hr);
3781             }
3782           }
3783           else
3784           {
3785             emit_loadreg(regs[t].regmap_entry[hr],hr);
3786           }
3787         }
3788         else if((i_is32>>(regs[t].regmap_entry[hr]&63))&1) {
3789           int lr=get_reg(regs[t].regmap_entry,regs[t].regmap_entry[hr]-64);
3790           assert(lr>=0);
3791           emit_sarimm(lr,31,hr);
3792         }
3793       }
3794     }
3795   }
3796 }
3797
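// Check whether the current register state is compatible with the expected
// entry state at 'addr', i.e. whether a branch there needs no extra
// writeback/reload code.  Targets outside this block only match when
// nothing except the cycle count (kept in HOST_CCREG) is dirty.  Returns 1
// on a match.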
3798 int match_bt(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr)
3799 {
3800   if(addr>=start && addr<start+slen*4-4)
3801   {
3802     int t=(addr-start)>>2;
3803     int hr;
3804     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) return 0;
3805     for(hr=0;hr<HOST_REGS;hr++)
3806     {
3807       if(hr!=EXCLUDE_REG)
3808       {
3809         if(i_regmap[hr]!=regs[t].regmap_entry[hr])
3810         {
3811           if(regs[t].regmap_entry[hr]>=0&&(regs[t].regmap_entry[hr]|64)<TEMPREG+64)
3812           {
3813             return 0;
3814           }
3815           else
3816           if((i_dirty>>hr)&1)
3817           {
3818             if(i_regmap[hr]<TEMPREG)
3819             {
3820               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
3821                 return 0;
3822             }
3823             else if(i_regmap[hr]>=64&&i_regmap[hr]<TEMPREG+64)
3824             {
3825               if(!((unneeded_reg_upper[t]>>(i_regmap[hr]&63))&1))
3826                 return 0;
3827             }
3828           }
3829         }
3830         else // Same register but is it 32-bit or dirty?
3831         if(i_regmap[hr]>=0)
3832         {
3833           if(!((regs[t].dirty>>hr)&1))
3834           {
3835             if((i_dirty>>hr)&1)
3836             {
3837               if(!((unneeded_reg[t]>>i_regmap[hr])&1))
3838               {
3839                 //printf("%x: dirty no match\n",addr);
3840                 return 0;
3841               }
3842             }
3843           }
3844           if((((regs[t].was32^i_is32)&~unneeded_reg_upper[t])>>(i_regmap[hr]&63))&1)
3845           {
3846             //printf("%x: is32 no match\n",addr);
3847             return 0;
3848           }
3849         }
3850       }
3851     }
3852     //if(is32[t]&~unneeded_reg_upper[t]&~i_is32) return 0;
3853     // Delay slots are not valid branch targets
3854     //if(t>0&&(itype[t-1]==RJUMP||itype[t-1]==UJUMP||itype[t-1]==CJUMP||itype[t-1]==SJUMP||itype[t-1]==FJUMP)) return 0;
3855     // Delay slots require additional processing, so do not match
3856     if(is_ds[t]) return 0;
3857   }
3858   else
3859   {
3860     int hr;
3861     for(hr=0;hr<HOST_REGS;hr++)
3862     {
3863       if(hr!=EXCLUDE_REG)
3864       {
3865         if(i_regmap[hr]>=0)
3866         {
3867           if(hr!=HOST_CCREG||i_regmap[hr]!=CCREG)
3868           {
3869             if((i_dirty>>hr)&1)
3870             {
3871               return 0;
3872             }
3873           }
3874         }
3875       }
3876     }
3877   }
3878   return 1;
3879 }
3880
3881 #ifdef DRC_DBG
3882 static void drc_dbg_emit_do_cmp(int i)
3883 {
3884   extern void do_insn_cmp();
3885   extern int cycle;
3886   u_int hr,reglist=0;
3887
3888   for(hr=0;hr<HOST_REGS;hr++)
3889     if(regs[i].regmap[hr]>=0) reglist|=1<<hr;
3890   save_regs(reglist);
3891   emit_movimm(start+i*4,0);
3892   emit_writeword(0,&pcaddr);
3893   emit_call(do_insn_cmp);
3894   //emit_readword(&cycle,0);
3895   //emit_addimm(0,2,0);
3896   //emit_writeword(0,&cycle);
3897   restore_regs(reglist);
3898 }
3899 #else
3900 #define drc_dbg_emit_do_cmp(x)
3901 #endif
3902
3903 // Used when a branch jumps into the delay slot of another branch
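// The targeted delay slot gets its own entry point here: the instruction is
// assembled again with the target block's register state, then control
// continues at ba[i]+4, the instruction following that delay slot.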
3904 void ds_assemble_entry(int i)
3905 {
3906   int t=(ba[i]-start)>>2;
3907   if (!instr_addr[t])
3908     instr_addr[t] = out;
3909   assem_debug("Assemble delay slot at %x\n",ba[i]);
3910   assem_debug("<->\n");
3911   drc_dbg_emit_do_cmp(t);
3912   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
3913     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty,regs[t].was32);
3914   load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,rs1[t],rs2[t]);
3915   address_generation(t,&regs[t],regs[t].regmap_entry);
3916   if(itype[t]==STORE||itype[t]==STORELR||(opcode[t]&0x3b)==0x39||(opcode[t]&0x3b)==0x3a)
3917     load_regs(regs[t].regmap_entry,regs[t].regmap,regs[t].was32,INVCP,INVCP);
3918   cop1_usable=0;
3919   is_delayslot=0;
3920   switch(itype[t]) {
3921     case ALU:
3922       alu_assemble(t,&regs[t]);break;
3923     case IMM16:
3924       imm16_assemble(t,&regs[t]);break;
3925     case SHIFT:
3926       shift_assemble(t,&regs[t]);break;
3927     case SHIFTIMM:
3928       shiftimm_assemble(t,&regs[t]);break;
3929     case LOAD:
3930       load_assemble(t,&regs[t]);break;
3931     case LOADLR:
3932       loadlr_assemble(t,&regs[t]);break;
3933     case STORE:
3934       store_assemble(t,&regs[t]);break;
3935     case STORELR:
3936       storelr_assemble(t,&regs[t]);break;
3937     case COP0:
3938       cop0_assemble(t,&regs[t]);break;
3939     case COP1:
3940       cop1_assemble(t,&regs[t]);break;
3941     case C1LS:
3942       c1ls_assemble(t,&regs[t]);break;
3943     case COP2:
3944       cop2_assemble(t,&regs[t]);break;
3945     case C2LS:
3946       c2ls_assemble(t,&regs[t]);break;
3947     case C2OP:
3948       c2op_assemble(t,&regs[t]);break;
3949     case FCONV:
3950       fconv_assemble(t,&regs[t]);break;
3951     case FLOAT:
3952       float_assemble(t,&regs[t]);break;
3953     case FCOMP:
3954       fcomp_assemble(t,&regs[t]);break;
3955     case MULTDIV:
3956       multdiv_assemble(t,&regs[t]);break;
3957     case MOV:
3958       mov_assemble(t,&regs[t]);break;
3959     case SYSCALL:
3960     case HLECALL:
3961     case INTCALL:
3962     case SPAN:
3963     case UJUMP:
3964     case RJUMP:
3965     case CJUMP:
3966     case SJUMP:
3967     case FJUMP:
3968       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
3969   }
3970   store_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
3971   load_regs_bt(regs[t].regmap,regs[t].is32,regs[t].dirty,ba[i]+4);
3972   if(internal_branch(regs[t].is32,ba[i]+4))
3973     assem_debug("branch: internal\n");
3974   else
3975     assem_debug("branch: external\n");
3976   assert(internal_branch(regs[t].is32,ba[i]+4));
3977   add_to_linker(out,ba[i]+4,internal_branch(regs[t].is32,ba[i]+4));
3978   emit_jmp(0);
3979 }
3980
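// Emit the cycle-count check for a branch: the cycles consumed so far
// (CLOCK_ADJUST'd) are added to, or compared against, HOST_CCREG and a
// CC_STUB is taken once the counter runs out so pending events/interrupts
// get serviced.  For internal targets *adj returns the cycle adjustment the
// target position already accounts for, so the caller only charges the
// difference (roughly: avoids counting the same cycles twice).  A branch to
// itself with a nop delay slot is treated as an idle loop; its counter is
// collapsed so the stub is entered immediately instead of spinning.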
3981 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
3982 {
3983   int count;
3984   void *jaddr;
3985   void *idle=NULL;
3986   int t=0;
3987   if(itype[i]==RJUMP)
3988   {
3989     *adj=0;
3990   }
3991   //if(ba[i]>=start && ba[i]<(start+slen*4))
3992   if(internal_branch(branch_regs[i].is32,ba[i]))
3993   {
3994     t=(ba[i]-start)>>2;
3995     if(is_ds[t]) *adj=-1; // Branch into delay slot adds an extra cycle
3996     else *adj=ccadj[t];
3997   }
3998   else
3999   {
4000     *adj=0;
4001   }
4002   count=ccadj[i];
4003   if(taken==TAKEN && i==(ba[i]-start)>>2 && source[i+1]==0) {
4004     // Idle loop
4005     if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
4006     idle=out;
4007     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
4008     emit_andimm(HOST_CCREG,3,HOST_CCREG);
4009     jaddr=out;
4010     emit_jmp(0);
4011   }
4012   else if(*adj==0||invert) {
4013     int cycles=CLOCK_ADJUST(count+2);
4014     // faster loop HACK
4015     if (t&&*adj) {
4016       int rel=t-i;
4017       if(-NO_CYCLE_PENALTY_THR<rel&&rel<0)
4018         cycles=CLOCK_ADJUST(*adj)+count+2-*adj;
4019     }
4020     emit_addimm_and_set_flags(cycles,HOST_CCREG);
4021     jaddr=out;
4022     emit_jns(0);
4023   }
4024   else
4025   {
4026     emit_cmpimm(HOST_CCREG,-CLOCK_ADJUST(count+2));
4027     jaddr=out;
4028     emit_jns(0);
4029   }
4030   add_stub(CC_STUB,jaddr,idle?idle:out,(*adj==0||invert||idle)?0:(count+2),i,addr,taken,0);
4031 }
4032
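// Out-of-line path taken when the cycle count expires at a branch: write
// back dirty registers, store the return PC in pcaddr (conditional branches
// re-evaluate their condition here with conditional moves to pick between
// target and fall-through; JR/JALR store the register value), call
// cc_interrupt, then reload whatever the resumed path needs and jump back.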
4033 static void do_ccstub(int n)
4034 {
4035   literal_pool(256);
4036   assem_debug("do_ccstub %x\n",start+stubs[n].b*4);
4037   set_jump_target(stubs[n].addr, out);
4038   int i=stubs[n].b;
4039   if(stubs[n].d==NULLDS) {
4040     // Delay slot instruction is nullified ("likely" branch)
4041     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
4042   }
4043   else if(stubs[n].d!=TAKEN) {
4044     wb_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty);
4045   }
4046   else {
4047     if(internal_branch(branch_regs[i].is32,ba[i]))
4048       wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4049   }
4050   if(stubs[n].c!=-1)
4051   {
4052     // Save PC as return address
4053     emit_movimm(stubs[n].c,EAX);
4054     emit_writeword(EAX,&pcaddr);
4055   }
4056   else
4057   {
4058     // Return address depends on which way the branch goes
4059     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
4060     {
4061       int s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4062       int s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4063       int s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4064       int s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4065       if(rs1[i]==0)
4066       {
4067         s1l=s2l;s1h=s2h;
4068         s2l=s2h=-1;
4069       }
4070       else if(rs2[i]==0)
4071       {
4072         s2l=s2h=-1;
4073       }
4074       if((branch_regs[i].is32>>rs1[i])&(branch_regs[i].is32>>rs2[i])&1) {
4075         s1h=s2h=-1;
4076       }
4077       assert(s1l>=0);
4078       #ifdef DESTRUCTIVE_WRITEBACK
4079       if(rs1[i]) {
4080         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs1[i])&1)
4081           emit_loadreg(rs1[i],s1l);
4082       }
4083       else {
4084         if((branch_regs[i].dirty>>s1l)&(branch_regs[i].is32>>rs2[i])&1)
4085           emit_loadreg(rs2[i],s1l);
4086       }
4087       if(s2l>=0)
4088         if((branch_regs[i].dirty>>s2l)&(branch_regs[i].is32>>rs2[i])&1)
4089           emit_loadreg(rs2[i],s2l);
4090       #endif
4091       int hr=0;
4092       int addr=-1,alt=-1,ntaddr=-1;
4093       while(hr<HOST_REGS)
4094       {
4095         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4096            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4097            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4098         {
4099           addr=hr++;break;
4100         }
4101         hr++;
4102       }
4103       while(hr<HOST_REGS)
4104       {
4105         if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4106            (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4107            (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4108         {
4109           alt=hr++;break;
4110         }
4111         hr++;
4112       }
4113       if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
4114       {
4115         while(hr<HOST_REGS)
4116         {
4117           if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
4118              (branch_regs[i].regmap[hr]&63)!=rs1[i] &&
4119              (branch_regs[i].regmap[hr]&63)!=rs2[i] )
4120           {
4121             ntaddr=hr;break;
4122           }
4123           hr++;
4124         }
4125         assert(hr<HOST_REGS);
4126       }
4127       if((opcode[i]&0x2f)==4) // BEQ
4128       {
4129         #ifdef HAVE_CMOV_IMM
4130         if(s1h<0) {
4131           if(s2l>=0) emit_cmp(s1l,s2l);
4132           else emit_test(s1l,s1l);
4133           emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
4134         }
4135         else
4136         #endif
4137         {
4138           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4139           if(s1h>=0) {
4140             if(s2h>=0) emit_cmp(s1h,s2h);
4141             else emit_test(s1h,s1h);
4142             emit_cmovne_reg(alt,addr);
4143           }
4144           if(s2l>=0) emit_cmp(s1l,s2l);
4145           else emit_test(s1l,s1l);
4146           emit_cmovne_reg(alt,addr);
4147         }
4148       }
4149       if((opcode[i]&0x2f)==5) // BNE
4150       {
4151         #ifdef HAVE_CMOV_IMM
4152         if(s1h<0) {
4153           if(s2l>=0) emit_cmp(s1l,s2l);
4154           else emit_test(s1l,s1l);
4155           emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
4156         }
4157         else
4158         #endif
4159         {
4160           emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
4161           if(s1h>=0) {
4162             if(s2h>=0) emit_cmp(s1h,s2h);
4163             else emit_test(s1h,s1h);
4164             emit_cmovne_reg(alt,addr);
4165           }
4166           if(s2l>=0) emit_cmp(s1l,s2l);
4167           else emit_test(s1l,s1l);
4168           emit_cmovne_reg(alt,addr);
4169         }
4170       }
4171       if((opcode[i]&0x2f)==6) // BLEZ
4172       {
4173         //emit_movimm(ba[i],alt);
4174         //emit_movimm(start+i*4+8,addr);
4175         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4176         emit_cmpimm(s1l,1);
4177         if(s1h>=0) emit_mov(addr,ntaddr);
4178         emit_cmovl_reg(alt,addr);
4179         if(s1h>=0) {
4180           emit_test(s1h,s1h);
4181           emit_cmovne_reg(ntaddr,addr);
4182           emit_cmovs_reg(alt,addr);
4183         }
4184       }
4185       if((opcode[i]&0x2f)==7) // BGTZ
4186       {
4187         //emit_movimm(ba[i],addr);
4188         //emit_movimm(start+i*4+8,ntaddr);
4189         emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
4190         emit_cmpimm(s1l,1);
4191         if(s1h>=0) emit_mov(addr,alt);
4192         emit_cmovl_reg(ntaddr,addr);
4193         if(s1h>=0) {
4194           emit_test(s1h,s1h);
4195           emit_cmovne_reg(alt,addr);
4196           emit_cmovs_reg(ntaddr,addr);
4197         }
4198       }
4199       if((opcode[i]==1)&&(opcode2[i]&0x2D)==0) // BLTZ
4200       {
4201         //emit_movimm(ba[i],alt);
4202         //emit_movimm(start+i*4+8,addr);
4203         emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4204         if(s1h>=0) emit_test(s1h,s1h);
4205         else emit_test(s1l,s1l);
4206         emit_cmovs_reg(alt,addr);
4207       }
4208       if((opcode[i]==1)&&(opcode2[i]&0x2D)==1) // BGEZ
4209       {
4210         //emit_movimm(ba[i],addr);
4211         //emit_movimm(start+i*4+8,alt);
4212         emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4213         if(s1h>=0) emit_test(s1h,s1h);
4214         else emit_test(s1l,s1l);
4215         emit_cmovs_reg(alt,addr);
4216       }
4217       if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
4218         if(source[i]&0x10000) // BC1T
4219         {
4220           //emit_movimm(ba[i],alt);
4221           //emit_movimm(start+i*4+8,addr);
4222           emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
4223           emit_testimm(s1l,0x800000);
4224           emit_cmovne_reg(alt,addr);
4225         }
4226         else // BC1F
4227         {
4228           //emit_movimm(ba[i],addr);
4229           //emit_movimm(start+i*4+8,alt);
4230           emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
4231           emit_testimm(s1l,0x800000);
4232           emit_cmovne_reg(alt,addr);
4233         }
4234       }
4235       emit_writeword(addr,&pcaddr);
4236     }
4237     else
4238     if(itype[i]==RJUMP)
4239     {
4240       int r=get_reg(branch_regs[i].regmap,rs1[i]);
4241       if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4242         r=get_reg(branch_regs[i].regmap,RTEMP);
4243       }
4244       emit_writeword(r,&pcaddr);
4245     }
4246     else {SysPrintf("Unknown branch type in do_ccstub\n");exit(1);}
4247   }
4248   // Update cycle count
4249   assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
4250   if(stubs[n].a) emit_addimm(HOST_CCREG,CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
4251   emit_call(cc_interrupt);
4252   if(stubs[n].a) emit_addimm(HOST_CCREG,-CLOCK_ADJUST((signed int)stubs[n].a),HOST_CCREG);
4253   if(stubs[n].d==TAKEN) {
4254     if(internal_branch(branch_regs[i].is32,ba[i]))
4255       load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>2].regmap_entry);
4256     else if(itype[i]==RJUMP) {
4257       if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
4258         emit_readword(&pcaddr,get_reg(branch_regs[i].regmap,RTEMP));
4259       else
4260         emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
4261     }
4262   }else if(stubs[n].d==NOTTAKEN) {
4263     if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
4264     else load_all_regs(branch_regs[i].regmap);
4265   }else if(stubs[n].d==NULLDS) {
4266     // Delay slot instruction is nullified ("likely" branch)
4267     if(i<slen-2) load_needed_regs(regs[i].regmap,regmap_pre[i+2]);
4268     else load_all_regs(regs[i].regmap);
4269   }else{
4270     load_all_regs(branch_regs[i].regmap);
4271   }
4272   emit_jmp(stubs[n].retaddr);
4273 }
4274
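// Record a jump whose target still has to be resolved; the entries in
// link_addr are patched once assembly of the whole block has finished.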
4275 static void add_to_linker(void *addr, u_int target, int ext)
4276 {
4277   assert(linkcount < ARRAY_SIZE(link_addr));
4278   link_addr[linkcount].addr = addr;
4279   link_addr[linkcount].target = target;
4280   link_addr[linkcount].ext = ext;
4281   linkcount++;
4282 }
4283
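// Write the return address (PC of the instruction after the delay slot)
// into the link register for JAL; with USE_MINI_HT the translated return
// target can be inserted into the mini hash table at the same time.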
4284 static void ujump_assemble_write_ra(int i)
4285 {
4286   int rt;
4287   unsigned int return_address;
4288   rt=get_reg(branch_regs[i].regmap,31);
4289   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4290   //assert(rt>=0);
4291   return_address=start+i*4+8;
4292   if(rt>=0) {
4293     #ifdef USE_MINI_HT
4294     if(internal_branch(branch_regs[i].is32,return_address)&&rt1[i+1]!=31) {
4295       int temp=-1; // note: must be ds-safe
4296       #ifdef HOST_TEMPREG
4297       temp=HOST_TEMPREG;
4298       #endif
4299       if(temp>=0) do_miniht_insert(return_address,rt,temp);
4300       else emit_movimm(return_address,rt);
4301     }
4302     else
4303     #endif
4304     {
4305       #ifdef REG_PREFETCH
4306       if(temp>=0)
4307       {
4308         if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
4309       }
4310       #endif
4311       emit_movimm(return_address,rt); // PC into link register
4312       #ifdef IMM_PREFETCH
4313       emit_prefetch(hash_table_get(return_address));
4314       #endif
4315     }
4316   }
4317 }
4318
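// Assemble J/JAL.  The delay slot is emitted first, with $ra written before
// it only when the slot reads $ra; then the cycle count is checked and the
// jump is emitted (via ds_assemble_entry when the target is another
// branch's delay slot, otherwise through the block linker).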
4319 void ujump_assemble(int i,struct regstat *i_regs)
4320 {
4321   int ra_done=0;
4322   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4323   address_generation(i+1,i_regs,regs[i].regmap_entry);
4324   #ifdef REG_PREFETCH
4325   int temp=get_reg(branch_regs[i].regmap,PTEMP);
4326   if(rt1[i]==31&&temp>=0)
4327   {
4328     signed char *i_regmap=i_regs->regmap;
4329     int return_address=start+i*4+8;
4330     if(get_reg(branch_regs[i].regmap,31)>0)
4331     if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
4332   }
4333   #endif
4334   if(rt1[i]==31&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4335     ujump_assemble_write_ra(i); // writeback ra for DS
4336     ra_done=1;
4337   }
4338   ds_assemble(i+1,i_regs);
4339   uint64_t bc_unneeded=branch_regs[i].u;
4340   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4341   bc_unneeded|=1|(1LL<<rt1[i]);
4342   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4343   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4344                 bc_unneeded,bc_unneeded_upper);
4345   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4346   if(!ra_done&&rt1[i]==31)
4347     ujump_assemble_write_ra(i);
4348   int cc,adj;
4349   cc=get_reg(branch_regs[i].regmap,CCREG);
4350   assert(cc==HOST_CCREG);
4351   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4352   #ifdef REG_PREFETCH
4353   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4354   #endif
4355   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4356   if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4357   load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4358   if(internal_branch(branch_regs[i].is32,ba[i]))
4359     assem_debug("branch: internal\n");
4360   else
4361     assem_debug("branch: external\n");
4362   if(internal_branch(branch_regs[i].is32,ba[i])&&is_ds[(ba[i]-start)>>2]) {
4363     ds_assemble_entry(i);
4364   }
4365   else {
4366     add_to_linker(out,ba[i],internal_branch(branch_regs[i].is32,ba[i]));
4367     emit_jmp(0);
4368   }
4369 }
4370
4371 static void rjump_assemble_write_ra(int i)
4372 {
4373   int rt,return_address;
4374   assert(rt1[i+1]!=rt1[i]);
4375   assert(rt2[i+1]!=rt1[i]);
4376   rt=get_reg(branch_regs[i].regmap,rt1[i]);
4377   assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4378   assert(rt>=0);
4379   return_address=start+i*4+8;
4380   #ifdef REG_PREFETCH
4381   if(temp>=0)
4382   {
4383     if(i_regmap[temp]!=PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
4384   }
4385   #endif
4386   emit_movimm(return_address,rt); // PC into link register
4387   #ifdef IMM_PREFETCH
4388   emit_prefetch(hash_table_get(return_address));
4389   #endif
4390 }
4391
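// Assemble JR/JALR.  The target comes from a register, so nothing can be
// linked at compile time: if the delay slot overwrites the source register
// a copy is made in RTEMP first, and after the delay slot and cycle check
// the code dispatches through jump_vaddr_reg (or the mini hash table for
// 'jr $ra' when USE_MINI_HT is enabled).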
4392 void rjump_assemble(int i,struct regstat *i_regs)
4393 {
4394   int temp;
4395   int rs,cc;
4396   int ra_done=0;
4397   rs=get_reg(branch_regs[i].regmap,rs1[i]);
4398   assert(rs>=0);
4399   if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
4400     // Delay slot abuse, make a copy of the branch address register
4401     temp=get_reg(branch_regs[i].regmap,RTEMP);
4402     assert(temp>=0);
4403     assert(regs[i].regmap[temp]==RTEMP);
4404     emit_mov(rs,temp);
4405     rs=temp;
4406   }
4407   address_generation(i+1,i_regs,regs[i].regmap_entry);
4408   #ifdef REG_PREFETCH
4409   if(rt1[i]==31)
4410   {
4411     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
4412       signed char *i_regmap=i_regs->regmap;
4413       int return_address=start+i*4+8;
4414       if(i_regmap[temp]==PTEMP) emit_movimm((uintptr_t)hash_table_get(return_address),temp);
4415     }
4416   }
4417   #endif
4418   #ifdef USE_MINI_HT
4419   if(rs1[i]==31) {
4420     int rh=get_reg(regs[i].regmap,RHASH);
4421     if(rh>=0) do_preload_rhash(rh);
4422   }
4423   #endif
4424   if(rt1[i]!=0&&(rt1[i]==rs1[i+1]||rt1[i]==rs2[i+1])) {
4425     rjump_assemble_write_ra(i);
4426     ra_done=1;
4427   }
4428   ds_assemble(i+1,i_regs);
4429   uint64_t bc_unneeded=branch_regs[i].u;
4430   uint64_t bc_unneeded_upper=branch_regs[i].uu;
4431   bc_unneeded|=1|(1LL<<rt1[i]);
4432   bc_unneeded_upper|=1|(1LL<<rt1[i]);
4433   bc_unneeded&=~(1LL<<rs1[i]);
4434   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4435                 bc_unneeded,bc_unneeded_upper);
4436   load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],CCREG);
4437   if(!ra_done&&rt1[i]!=0)
4438     rjump_assemble_write_ra(i);
4439   cc=get_reg(branch_regs[i].regmap,CCREG);
4440   assert(cc==HOST_CCREG);
4441   (void)cc;
4442   #ifdef USE_MINI_HT
4443   int rh=get_reg(branch_regs[i].regmap,RHASH);
4444   int ht=get_reg(branch_regs[i].regmap,RHTBL);
4445   if(rs1[i]==31) {
4446     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
4447     do_preload_rhtbl(ht);
4448     do_rhash(rs,rh);
4449   }
4450   #endif
4451   store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4452   #ifdef DESTRUCTIVE_WRITEBACK
4453   if((branch_regs[i].dirty>>rs)&(branch_regs[i].is32>>rs1[i])&1) {
4454     if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
4455       emit_loadreg(rs1[i],rs);
4456     }
4457   }
4458   #endif
4459   #ifdef REG_PREFETCH
4460   if(rt1[i]==31&&temp>=0) emit_prefetchreg(temp);
4461   #endif
4462   #ifdef USE_MINI_HT
4463   if(rs1[i]==31) {
4464     do_miniht_load(ht,rh);
4465   }
4466   #endif
4467   //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
4468   //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
4469   //assert(adj==0);
4470   emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
4471   add_stub(CC_STUB,out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
4472   if(itype[i+1]==COP0&&(source[i+1]&0x3f)==0x10)
4473     // special case for RFE
4474     emit_jmp(0);
4475   else
4476     emit_jns(0);
4477   //load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,-1);
4478   #ifdef USE_MINI_HT
4479   if(rs1[i]==31) {
4480     do_miniht_jump(rs,rh,ht);
4481   }
4482   else
4483   #endif
4484   {
4485     emit_jmp(jump_vaddr_reg[rs]);
4486   }
4487   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4488   if(rt1[i]!=31&&i<slen-2&&(((u_int)out)&7)) emit_mov(13,13);
4489   #endif
4490 }
4491
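// Assemble BEQ/BNE/BLEZ/BGTZ and their "likely" forms.  In out-of-order
// mode the delay slot runs first and the comparison uses the operands saved
// in branch_regs; in in-order mode the comparison comes first and the delay
// slot is assembled on each path (skipped on the not-taken path for likely
// branches).  When the target's register state doesn't match (see match_bt)
// the condition is inverted and the writeback/reload plus jump for the
// taken case are emitted inline, with the inverted branch skipping them.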
4492 void cjump_assemble(int i,struct regstat *i_regs)
4493 {
4494   signed char *i_regmap=i_regs->regmap;
4495   int cc;
4496   int match;
4497   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4498   assem_debug("match=%d\n",match);
4499   int s1h,s1l,s2h,s2l;
4500   int prev_cop1_usable=cop1_usable;
4501   int unconditional=0,nop=0;
4502   int only32=0;
4503   int invert=0;
4504   int internal=internal_branch(branch_regs[i].is32,ba[i]);
4505   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4506   if(!match) invert=1;
4507   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4508   if(i>(ba[i]-start)>>2) invert=1;
4509   #endif
4510
4511   if(ooo[i]) {
4512     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4513     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4514     s2l=get_reg(branch_regs[i].regmap,rs2[i]);
4515     s2h=get_reg(branch_regs[i].regmap,rs2[i]|64);
4516   }
4517   else {
4518     s1l=get_reg(i_regmap,rs1[i]);
4519     s1h=get_reg(i_regmap,rs1[i]|64);
4520     s2l=get_reg(i_regmap,rs2[i]);
4521     s2h=get_reg(i_regmap,rs2[i]|64);
4522   }
4523   if(rs1[i]==0&&rs2[i]==0)
4524   {
4525     if(opcode[i]&1) nop=1;
4526     else unconditional=1;
4527     //assert(opcode[i]!=5);
4528     //assert(opcode[i]!=7);
4529     //assert(opcode[i]!=0x15);
4530     //assert(opcode[i]!=0x17);
4531   }
4532   else if(rs1[i]==0)
4533   {
4534     s1l=s2l;s1h=s2h;
4535     s2l=s2h=-1;
4536     only32=(regs[i].was32>>rs2[i])&1;
4537   }
4538   else if(rs2[i]==0)
4539   {
4540     s2l=s2h=-1;
4541     only32=(regs[i].was32>>rs1[i])&1;
4542   }
4543   else {
4544     only32=(regs[i].was32>>rs1[i])&(regs[i].was32>>rs2[i])&1;
4545   }
4546
4547   if(ooo[i]) {
4548     // Out of order execution (delay slot first)
4549     //printf("OOOE\n");
4550     address_generation(i+1,i_regs,regs[i].regmap_entry);
4551     ds_assemble(i+1,i_regs);
4552     int adj;
4553     uint64_t bc_unneeded=branch_regs[i].u;
4554     uint64_t bc_unneeded_upper=branch_regs[i].uu;
4555     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
4556     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
4557     bc_unneeded|=1;
4558     bc_unneeded_upper|=1;
4559     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4560                   bc_unneeded,bc_unneeded_upper);
4561     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
4562     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4563     cc=get_reg(branch_regs[i].regmap,CCREG);
4564     assert(cc==HOST_CCREG);
4565     if(unconditional)
4566       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4567     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
4568     //assem_debug("cycle count (adj)\n");
4569     if(unconditional) {
4570       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4571       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
4572         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4573         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4574         if(internal)
4575           assem_debug("branch: internal\n");
4576         else
4577           assem_debug("branch: external\n");
4578         if(internal&&is_ds[(ba[i]-start)>>2]) {
4579           ds_assemble_entry(i);
4580         }
4581         else {
4582           add_to_linker(out,ba[i],internal);
4583           emit_jmp(0);
4584         }
4585         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4586         if(((u_int)out)&7) emit_addnop(0);
4587         #endif
4588       }
4589     }
4590     else if(nop) {
4591       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
4592       void *jaddr=out;
4593       emit_jns(0);
4594       add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
4595     }
4596     else {
4597       void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
4598       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
4599       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4600       if(!only32)
4601       {
4602         assert(s1h>=0);
4603         if(opcode[i]==4) // BEQ
4604         {
4605           if(s2h>=0) emit_cmp(s1h,s2h);
4606           else emit_test(s1h,s1h);
4607           nottaken1=out;
4608           emit_jne((void *)1l);
4609         }
4610         if(opcode[i]==5) // BNE
4611         {
4612           if(s2h>=0) emit_cmp(s1h,s2h);
4613           else emit_test(s1h,s1h);
4614           if(invert) taken=out;
4615           else add_to_linker(out,ba[i],internal);
4616           emit_jne(0);
4617         }
4618         if(opcode[i]==6) // BLEZ
4619         {
4620           emit_test(s1h,s1h);
4621           if(invert) taken=out;
4622           else add_to_linker(out,ba[i],internal);
4623           emit_js(0);
4624           nottaken1=out;
4625           emit_jne((void *)1l);
4626         }
4627         if(opcode[i]==7) // BGTZ
4628         {
4629           emit_test(s1h,s1h);
4630           nottaken1=out;
4631           emit_js(1);
4632           if(invert) taken=out;
4633           else add_to_linker(out,ba[i],internal);
4634           emit_jne(0);
4635         }
4636       } // if(!only32)
4637
4638       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4639       assert(s1l>=0);
4640       if(opcode[i]==4) // BEQ
4641       {
4642         if(s2l>=0) emit_cmp(s1l,s2l);
4643         else emit_test(s1l,s1l);
4644         if(invert){
4645           nottaken=out;
4646           emit_jne((void *)1l);
4647         }else{
4648           add_to_linker(out,ba[i],internal);
4649           emit_jeq(0);
4650         }
4651       }
4652       if(opcode[i]==5) // BNE
4653       {
4654         if(s2l>=0) emit_cmp(s1l,s2l);
4655         else emit_test(s1l,s1l);
4656         if(invert){
4657           nottaken=out;
4658           emit_jeq(1);
4659         }else{
4660           add_to_linker(out,ba[i],internal);
4661           emit_jne(0);
4662         }
4663       }
4664       if(opcode[i]==6) // BLEZ
4665       {
4666         emit_cmpimm(s1l,1);
4667         if(invert){
4668           nottaken=out;
4669           emit_jge(1);
4670         }else{
4671           add_to_linker(out,ba[i],internal);
4672           emit_jl(0);
4673         }
4674       }
4675       if(opcode[i]==7) // BGTZ
4676       {
4677         emit_cmpimm(s1l,1);
4678         if(invert){
4679           nottaken=out;
4680           emit_jl(1);
4681         }else{
4682           add_to_linker(out,ba[i],internal);
4683           emit_jge(0);
4684         }
4685       }
4686       if(invert) {
4687         if(taken) set_jump_target(taken, out);
4688         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4689         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
4690           if(adj) {
4691             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
4692             add_to_linker(out,ba[i],internal);
4693           }else{
4694             emit_addnop(13);
4695             add_to_linker(out,ba[i],internal*2);
4696           }
4697           emit_jmp(0);
4698         }else
4699         #endif
4700         {
4701           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
4702           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4703           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4704           if(internal)
4705             assem_debug("branch: internal\n");
4706           else
4707             assem_debug("branch: external\n");
4708           if(internal&&is_ds[(ba[i]-start)>>2]) {
4709             ds_assemble_entry(i);
4710           }
4711           else {
4712             add_to_linker(out,ba[i],internal);
4713             emit_jmp(0);
4714           }
4715         }
4716         set_jump_target(nottaken, out);
4717       }
4718
4719       if(nottaken1) set_jump_target(nottaken1, out);
4720       if(adj) {
4721         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
4722       }
4723     } // (!unconditional)
4724   } // if(ooo)
4725   else
4726   {
4727     // In-order execution (branch first)
4728     //if(likely[i]) printf("IOL\n");
4729     //else
4730     //printf("IOE\n");
4731     void *taken = NULL, *nottaken = NULL, *nottaken1 = NULL;
4732     if(!unconditional&&!nop) {
4733       if(!only32)
4734       {
4735         assert(s1h>=0);
4736         if((opcode[i]&0x2f)==4) // BEQ
4737         {
4738           if(s2h>=0) emit_cmp(s1h,s2h);
4739           else emit_test(s1h,s1h);
4740           nottaken1=out;
4741           emit_jne((void *)2l);
4742         }
4743         if((opcode[i]&0x2f)==5) // BNE
4744         {
4745           if(s2h>=0) emit_cmp(s1h,s2h);
4746           else emit_test(s1h,s1h);
4747           taken=out;
4748           emit_jne((void *)1l);
4749         }
4750         if((opcode[i]&0x2f)==6) // BLEZ
4751         {
4752           emit_test(s1h,s1h);
4753           taken=out;
4754           emit_js(1);
4755           nottaken1=out;
4756           emit_jne((void *)2l);
4757         }
4758         if((opcode[i]&0x2f)==7) // BGTZ
4759         {
4760           emit_test(s1h,s1h);
4761           nottaken1=out;
4762           emit_js(2);
4763           taken=out;
4764           emit_jne((void *)1l);
4765         }
4766       } // if(!only32)
4767
4768       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4769       assert(s1l>=0);
4770       if((opcode[i]&0x2f)==4) // BEQ
4771       {
4772         if(s2l>=0) emit_cmp(s1l,s2l);
4773         else emit_test(s1l,s1l);
4774         nottaken=out;
4775         emit_jne((void *)2l);
4776       }
4777       if((opcode[i]&0x2f)==5) // BNE
4778       {
4779         if(s2l>=0) emit_cmp(s1l,s2l);
4780         else emit_test(s1l,s1l);
4781         nottaken=out;
4782         emit_jeq(2);
4783       }
4784       if((opcode[i]&0x2f)==6) // BLEZ
4785       {
4786         emit_cmpimm(s1l,1);
4787         nottaken=out;
4788         emit_jge(2);
4789       }
4790       if((opcode[i]&0x2f)==7) // BGTZ
4791       {
4792         emit_cmpimm(s1l,1);
4793         nottaken=out;
4794         emit_jl(2);
4795       }
4796     } // if(!unconditional)
4797     int adj;
4798     uint64_t ds_unneeded=branch_regs[i].u;
4799     uint64_t ds_unneeded_upper=branch_regs[i].uu;
4800     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
4801     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
4802     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
4803     ds_unneeded|=1;
4804     ds_unneeded_upper|=1;
4805     // branch taken
4806     if(!nop) {
4807       if(taken) set_jump_target(taken, out);
4808       assem_debug("1:\n");
4809       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4810                     ds_unneeded,ds_unneeded_upper);
4811       // load regs
4812       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
4813       address_generation(i+1,&branch_regs[i],0);
4814       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
4815       ds_assemble(i+1,&branch_regs[i]);
4816       cc=get_reg(branch_regs[i].regmap,CCREG);
4817       if(cc==-1) {
4818         emit_loadreg(CCREG,cc=HOST_CCREG);
4819         // CHECK: Is the following instruction (fall thru) allocated ok?
4820       }
4821       assert(cc==HOST_CCREG);
4822       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4823       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
4824       assem_debug("cycle count (adj)\n");
4825       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4826       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4827       if(internal)
4828         assem_debug("branch: internal\n");
4829       else
4830         assem_debug("branch: external\n");
4831       if(internal&&is_ds[(ba[i]-start)>>2]) {
4832         ds_assemble_entry(i);
4833       }
4834       else {
4835         add_to_linker(out,ba[i],internal);
4836         emit_jmp(0);
4837       }
4838     }
4839     // branch not taken
4840     cop1_usable=prev_cop1_usable;
4841     if(!unconditional) {
4842       if(nottaken1) set_jump_target(nottaken1, out);
4843       set_jump_target(nottaken, out);
4844       assem_debug("2:\n");
4845       if(!likely[i]) {
4846         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4847                       ds_unneeded,ds_unneeded_upper);
4848         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
4849         address_generation(i+1,&branch_regs[i],0);
4850         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4851         ds_assemble(i+1,&branch_regs[i]);
4852       }
4853       cc=get_reg(branch_regs[i].regmap,CCREG);
4854       if(cc==-1&&!likely[i]) {
4855         // Cycle count isn't in a register, temporarily load it then write it out
4856         emit_loadreg(CCREG,HOST_CCREG);
4857         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
4858         void *jaddr=out;
4859         emit_jns(0);
4860         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
4861         emit_storereg(CCREG,HOST_CCREG);
4862       }
4863       else{
4864         cc=get_reg(i_regmap,CCREG);
4865         assert(cc==HOST_CCREG);
4866         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
4867         void *jaddr=out;
4868         emit_jns(0);
4869         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
4870       }
4871     }
4872   }
4873 }
4874
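// Assemble the REGIMM branches (BLTZ/BGEZ, the AL linking forms and their
// "likely" variants).  The structure mirrors cjump_assemble, with the extra
// step of writing the return address into $31 for the linking forms even
// when the branch is not taken.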
4875 void sjump_assemble(int i,struct regstat *i_regs)
4876 {
4877   signed char *i_regmap=i_regs->regmap;
4878   int cc;
4879   int match;
4880   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4881   assem_debug("smatch=%d\n",match);
4882   int s1h,s1l;
4883   int prev_cop1_usable=cop1_usable;
4884   int unconditional=0,nevertaken=0;
4885   int only32=0;
4886   int invert=0;
4887   int internal=internal_branch(branch_regs[i].is32,ba[i]);
4888   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
4889   if(!match) invert=1;
4890   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4891   if(i>(ba[i]-start)>>2) invert=1;
4892   #endif
4893
4894   //if(opcode2[i]>=0x10) return; // FIXME (BxxZAL)
4895   //assert(opcode2[i]<0x10||rs1[i]==0); // FIXME (BxxZAL)
4896
4897   if(ooo[i]) {
4898     s1l=get_reg(branch_regs[i].regmap,rs1[i]);
4899     s1h=get_reg(branch_regs[i].regmap,rs1[i]|64);
4900   }
4901   else {
4902     s1l=get_reg(i_regmap,rs1[i]);
4903     s1h=get_reg(i_regmap,rs1[i]|64);
4904   }
4905   if(rs1[i]==0)
4906   {
4907     if(opcode2[i]&1) unconditional=1;
4908     else nevertaken=1;
4909     // These are never taken (r0 is never less than zero)
4910     //assert(opcode2[i]!=0);
4911     //assert(opcode2[i]!=2);
4912     //assert(opcode2[i]!=0x10);
4913     //assert(opcode2[i]!=0x12);
4914   }
4915   else {
4916     only32=(regs[i].was32>>rs1[i])&1;
4917   }
4918
4919   if(ooo[i]) {
4920     // Out of order execution (delay slot first)
4921     //printf("OOOE\n");
4922     address_generation(i+1,i_regs,regs[i].regmap_entry);
4923     ds_assemble(i+1,i_regs);
4924     int adj;
4925     uint64_t bc_unneeded=branch_regs[i].u;
4926     uint64_t bc_unneeded_upper=branch_regs[i].uu;
4927     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
4928     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
4929     bc_unneeded|=1;
4930     bc_unneeded_upper|=1;
4931     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
4932                   bc_unneeded,bc_unneeded_upper);
4933     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
4934     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
4935     if(rt1[i]==31) {
4936       int rt,return_address;
4937       rt=get_reg(branch_regs[i].regmap,31);
4938       assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4939       if(rt>=0) {
4940         // Save the PC even if the branch is not taken
4941         return_address=start+i*4+8;
4942         emit_movimm(return_address,rt); // PC into link register
4943         #ifdef IMM_PREFETCH
4944         if(!nevertaken) emit_prefetch(hash_table_get(return_address));
4945         #endif
4946       }
4947     }
4948     cc=get_reg(branch_regs[i].regmap,CCREG);
4949     assert(cc==HOST_CCREG);
4950     if(unconditional)
4951       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4952     //do_cc(i,branch_regs[i].regmap,&adj,unconditional?ba[i]:-1,unconditional);
4953     assem_debug("cycle count (adj)\n");
4954     if(unconditional) {
4955       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4956       if(i!=(ba[i]-start)>>2 || source[i+1]!=0) {
4957         if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4958         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
4959         if(internal)
4960           assem_debug("branch: internal\n");
4961         else
4962           assem_debug("branch: external\n");
4963         if(internal&&is_ds[(ba[i]-start)>>2]) {
4964           ds_assemble_entry(i);
4965         }
4966         else {
4967           add_to_linker(out,ba[i],internal);
4968           emit_jmp(0);
4969         }
4970         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4971         if(((u_int)out)&7) emit_addnop(0);
4972         #endif
4973       }
4974     }
4975     else if(nevertaken) {
4976       emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
4977       void *jaddr=out;
4978       emit_jns(0);
4979       add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
4980     }
4981     else {
4982       void *nottaken = NULL;
4983       do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
4984       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
4985       if(!only32)
4986       {
4987         assert(s1h>=0);
4988         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
4989         {
4990           emit_test(s1h,s1h);
4991           if(invert){
4992             nottaken=out;
4993             emit_jns(1);
4994           }else{
4995             add_to_linker(out,ba[i],internal);
4996             emit_js(0);
4997           }
4998         }
4999         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5000         {
5001           emit_test(s1h,s1h);
5002           if(invert){
5003             nottaken=out;
5004             emit_js(1);
5005           }else{
5006             add_to_linker(out,ba[i],internal);
5007             emit_jns(0);
5008           }
5009         }
5010       } // if(!only32)
5011       else
5012       {
5013         assert(s1l>=0);
5014         if((opcode2[i]&0xf)==0) // BLTZ/BLTZAL
5015         {
5016           emit_test(s1l,s1l);
5017           if(invert){
5018             nottaken=out;
5019             emit_jns(1);
5020           }else{
5021             add_to_linker(out,ba[i],internal);
5022             emit_js(0);
5023           }
5024         }
5025         if((opcode2[i]&0xf)==1) // BGEZ/BGEZAL
5026         {
5027           emit_test(s1l,s1l);
5028           if(invert){
5029             nottaken=out;
5030             emit_js(1);
5031           }else{
5032             add_to_linker(out,ba[i],internal);
5033             emit_jns(0);
5034           }
5035         }
5036       } // if(!only32)
5037
5038       if(invert) {
5039         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5040         if(match&&(!internal||!is_ds[(ba[i]-start)>>2])) {
5041           if(adj) {
5042             emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5043             add_to_linker(out,ba[i],internal);
5044           }else{
5045             emit_addnop(13);
5046             add_to_linker(out,ba[i],internal*2);
5047           }
5048           emit_jmp(0);
5049         }else
5050         #endif
5051         {
5052           if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5053           store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5054           load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5055           if(internal)
5056             assem_debug("branch: internal\n");
5057           else
5058             assem_debug("branch: external\n");
5059           if(internal&&is_ds[(ba[i]-start)>>2]) {
5060             ds_assemble_entry(i);
5061           }
5062           else {
5063             add_to_linker(out,ba[i],internal);
5064             emit_jmp(0);
5065           }
5066         }
5067         set_jump_target(nottaken, out);
5068       }
5069
5070       if(adj) {
5071         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5072       }
5073     } // (!unconditional)
5074   } // if(ooo)
5075   else
5076   {
5077     // In-order execution (branch first)
5078     //printf("IOE\n");
5079     void *nottaken = NULL;
5080     if(rt1[i]==31) {
5081       int rt,return_address;
5082       rt=get_reg(branch_regs[i].regmap,31);
5083       if(rt>=0) {
5084         // Save the PC even if the branch is not taken
5085         return_address=start+i*4+8;
5086         emit_movimm(return_address,rt); // PC into link register
5087         #ifdef IMM_PREFETCH
5088         emit_prefetch(hash_table_get(return_address));
5089         #endif
5090       }
5091     }
5092     if(!unconditional) {
5093       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5094       if(!only32)
5095       {
5096         assert(s1h>=0);
5097         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5098         {
5099           emit_test(s1h,s1h);
5100           nottaken=out;
5101           emit_jns(1);
5102         }
5103         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5104         {
5105           emit_test(s1h,s1h);
5106           nottaken=out;
5107           emit_js(1);
5108         }
5109       } // if(!only32)
5110       else
5111       {
5112         assert(s1l>=0);
5113         if((opcode2[i]&0x0d)==0) // BLTZ/BLTZL/BLTZAL/BLTZALL
5114         {
5115           emit_test(s1l,s1l);
5116           nottaken=out;
5117           emit_jns(1);
5118         }
5119         if((opcode2[i]&0x0d)==1) // BGEZ/BGEZL/BGEZAL/BGEZALL
5120         {
5121           emit_test(s1l,s1l);
5122           nottaken=out;
5123           emit_js(1);
5124         }
5125       }
5126     } // if(!unconditional)
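    // Compute which registers the delay slot does not need: start from the
    // post-branch unneeded set, then re-mark the delay slot's own sources as
    // needed so wb_invalidate() below doesn't discard them.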
5127     int adj;
5128     uint64_t ds_unneeded=branch_regs[i].u;
5129     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5130     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5131     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5132     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5133     ds_unneeded|=1;
5134     ds_unneeded_upper|=1;
5135     // branch taken
5136     if(!nevertaken) {
5137       //assem_debug("1:\n");
5138       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5139                     ds_unneeded,ds_unneeded_upper);
5140       // load regs
5141       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5142       address_generation(i+1,&branch_regs[i],0);
5143       load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5144       ds_assemble(i+1,&branch_regs[i]);
5145       cc=get_reg(branch_regs[i].regmap,CCREG);
5146       if(cc==-1) {
5147         emit_loadreg(CCREG,cc=HOST_CCREG);
5148         // CHECK: Is the following instruction (fall thru) allocated ok?
5149       }
5150       assert(cc==HOST_CCREG);
5151       store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5152       do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5153       assem_debug("cycle count (adj)\n");
5154       if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5155       load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5156       if(internal)
5157         assem_debug("branch: internal\n");
5158       else
5159         assem_debug("branch: external\n");
5160       if(internal&&is_ds[(ba[i]-start)>>2]) {
5161         ds_assemble_entry(i);
5162       }
5163       else {
5164         add_to_linker(out,ba[i],internal);
5165         emit_jmp(0);
5166       }
5167     }
5168     // branch not taken
5169     cop1_usable=prev_cop1_usable;
5170     if(!unconditional) {
5171       set_jump_target(nottaken, out);
5172       assem_debug("1:\n");
5173       if(!likely[i]) {
5174         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5175                       ds_unneeded,ds_unneeded_upper);
5176         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5177         address_generation(i+1,&branch_regs[i],0);
5178         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5179         ds_assemble(i+1,&branch_regs[i]);
5180       }
5181       cc=get_reg(branch_regs[i].regmap,CCREG);
5182       if(cc==-1&&!likely[i]) {
5183         // Cycle count isn't in a register, temporarily load it then write it out
5184         emit_loadreg(CCREG,HOST_CCREG);
5185         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5186         void *jaddr=out;
5187         emit_jns(0);
5188         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
5189         emit_storereg(CCREG,HOST_CCREG);
5190       }
5191       else{
5192         cc=get_reg(i_regmap,CCREG);
5193         assert(cc==HOST_CCREG);
5194         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5195         void *jaddr=out;
5196         emit_jns(0);
5197         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5198       }
5199     }
5200   }
5201 }
5202
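// Assemble a COP1 conditional branch (BC1T/BC1F).  The FP condition flag is
// bit 23 of FSREG; bit 16 of the instruction selects BC1T (set) vs BC1F
// (clear).  Delay-slot handling mirrors the integer branch assemblers above.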
5203 void fjump_assemble(int i,struct regstat *i_regs)
5204 {
5205   signed char *i_regmap=i_regs->regmap;
5206   int cc;
5207   int match;
5208   match=match_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5209   assem_debug("fmatch=%d\n",match);
5210   int fs,cs;
5211   void *eaddr;
5212   int invert=0;
5213   int internal=internal_branch(branch_regs[i].is32,ba[i]);
5214   if(i==(ba[i]-start)>>2) assem_debug("idle loop\n");
5215   if(!match) invert=1;
5216   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5217   if(i>(ba[i]-start)>>2) invert=1;
5218   #endif
5219
5220   if(ooo[i]) {
5221     fs=get_reg(branch_regs[i].regmap,FSREG);
5222     address_generation(i+1,i_regs,regs[i].regmap_entry); // Is this okay?
5223   }
5224   else {
5225     fs=get_reg(i_regmap,FSREG);
5226   }
5227
5228   // Check cop1 unusable
5229   if(!cop1_usable) {
5230     cs=get_reg(i_regmap,CSREG);
5231     assert(cs>=0);
5232     emit_testimm(cs,0x20000000);
5233     eaddr=out;
5234     emit_jeq(0);
5235     add_stub_r(FP_STUB,eaddr,out,i,cs,i_regs,0,0);
5236     cop1_usable=1;
5237   }
5238
5239   if(ooo[i]) {
5240     // Out of order execution (delay slot first)
5241     //printf("OOOE\n");
5242     ds_assemble(i+1,i_regs);
5243     int adj;
5244     uint64_t bc_unneeded=branch_regs[i].u;
5245     uint64_t bc_unneeded_upper=branch_regs[i].uu;
5246     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5247     bc_unneeded_upper&=~((1LL<<us1[i])|(1LL<<us2[i]));
5248     bc_unneeded|=1;
5249     bc_unneeded_upper|=1;
5250     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5251                   bc_unneeded,bc_unneeded_upper);
5252     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i],rs1[i]);
5253     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5254     cc=get_reg(branch_regs[i].regmap,CCREG);
5255     assert(cc==HOST_CCREG);
5256     do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
5257     assem_debug("cycle count (adj)\n");
5258     if(1) {
5259       void *nottaken = NULL;
5260       if(adj&&!invert) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5261       if(1) {
5262         assert(fs>=0);
5263         emit_testimm(fs,0x800000);
5264         if(source[i]&0x10000) // BC1T
5265         {
5266           if(invert){
5267             nottaken=out;
5268             emit_jeq(1);
5269           }else{
5270             add_to_linker(out,ba[i],internal);
5271             emit_jne(0);
5272           }
5273         }
5274         else // BC1F
5275         {
5276           if(invert){
5277             nottaken=out;
5278             emit_jne((void *)1l);
5279           }else{
5280             add_to_linker(out,ba[i],internal);
5281             emit_jeq(0);
5282           }
5283         }
5284       }
5285
5286       if(invert) {
5287         if(adj) emit_addimm(cc,-CLOCK_ADJUST(adj),cc);
5288         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
5289         else if(match) emit_addnop(13);
5290         #endif
5291         store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5292         load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5293         if(internal)
5294           assem_debug("branch: internal\n");
5295         else
5296           assem_debug("branch: external\n");
5297         if(internal&&is_ds[(ba[i]-start)>>2]) {
5298           ds_assemble_entry(i);
5299         }
5300         else {
5301           add_to_linker(out,ba[i],internal);
5302           emit_jmp(0);
5303         }
5304         set_jump_target(nottaken, out);
5305       }
5306
5307       if(adj) {
5308         if(!invert) emit_addimm(cc,CLOCK_ADJUST(adj),cc);
5309       }
5310     } // (!unconditional)
5311   } // if(ooo)
5312   else
5313   {
5314     // In-order execution (branch first)
5315     //printf("IOE\n");
5316     void *nottaken = NULL;
5317     if(1) {
5318       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
5319       if(1) {
5320         assert(fs>=0);
5321         emit_testimm(fs,0x800000);
5322         if(source[i]&0x10000) // BC1T
5323         {
5324           nottaken=out;
5325           emit_jeq(1);
5326         }
5327         else // BC1F
5328         {
5329           nottaken=out;
5330           emit_jne((void *)1l);
5331         }
5332       }
5333     } // if(!unconditional)
5334     int adj;
5335     uint64_t ds_unneeded=branch_regs[i].u;
5336     uint64_t ds_unneeded_upper=branch_regs[i].uu;
5337     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5338     ds_unneeded_upper&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5339     if((~ds_unneeded_upper>>rt1[i+1])&1) ds_unneeded_upper&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
5340     ds_unneeded|=1;
5341     ds_unneeded_upper|=1;
5342     // branch taken
5343     //assem_debug("1:\n");
5344     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5345                   ds_unneeded,ds_unneeded_upper);
5346     // load regs
5347     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5348     address_generation(i+1,&branch_regs[i],0);
5349     load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,INVCP);
5350     ds_assemble(i+1,&branch_regs[i]);
5351     cc=get_reg(branch_regs[i].regmap,CCREG);
5352     if(cc==-1) {
5353       emit_loadreg(CCREG,cc=HOST_CCREG);
5354       // CHECK: Is the following instruction (fall thru) allocated ok?
5355     }
5356     assert(cc==HOST_CCREG);
5357     store_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5358     do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
5359     assem_debug("cycle count (adj)\n");
5360     if(adj) emit_addimm(cc,CLOCK_ADJUST(ccadj[i]+2-adj),cc);
5361     load_regs_bt(branch_regs[i].regmap,branch_regs[i].is32,branch_regs[i].dirty,ba[i]);
5362     if(internal)
5363       assem_debug("branch: internal\n");
5364     else
5365       assem_debug("branch: external\n");
5366     if(internal&&is_ds[(ba[i]-start)>>2]) {
5367       ds_assemble_entry(i);
5368     }
5369     else {
5370       add_to_linker(out,ba[i],internal);
5371       emit_jmp(0);
5372     }
5373
5374     // branch not taken
5375     if(1) { // <- FIXME (don't need this)
5376       set_jump_target(nottaken, out);
5377       assem_debug("1:\n");
5378       if(!likely[i]) {
5379         wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,regs[i].is32,
5380                       ds_unneeded,ds_unneeded_upper);
5381         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,rs1[i+1],rs2[i+1]);
5382         address_generation(i+1,&branch_regs[i],0);
5383         load_regs(regs[i].regmap,branch_regs[i].regmap,regs[i].was32,CCREG,CCREG);
5384         ds_assemble(i+1,&branch_regs[i]);
5385       }
5386       cc=get_reg(branch_regs[i].regmap,CCREG);
5387       if(cc==-1&&!likely[i]) {
5388         // Cycle count isn't in a register, temporarily load it then write it out
5389         emit_loadreg(CCREG,HOST_CCREG);
5390         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5391         void *jaddr=out;
5392         emit_jns(0);
5393         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,NOTTAKEN,0);
5394         emit_storereg(CCREG,HOST_CCREG);
5395       }
5396       else{
5397         cc=get_reg(i_regmap,CCREG);
5398         assert(cc==HOST_CCREG);
5399         emit_addimm_and_set_flags(CLOCK_ADJUST(ccadj[i]+2),cc);
5400         void *jaddr=out;
5401         emit_jns(0);
5402         add_stub(CC_STUB,jaddr,out,0,i,start+i*4+8,likely[i]?NULLDS:NOTTAKEN,0);
5403       }
5404     }
5405   }
5406 }
5407
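// Assemble a branch that is the last instruction on a page, so its delay slot
// falls on the next (possibly not yet compiled) page.  The branch target is
// computed into HOST_BTREG and control falls through to the separately
// assembled delay slot (see pagespan_ds below), which dispatches on it.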
5408 static void pagespan_assemble(int i,struct regstat *i_regs)
5409 {
5410   int s1l=get_reg(i_regs->regmap,rs1[i]);
5411   int s1h=get_reg(i_regs->regmap,rs1[i]|64);
5412   int s2l=get_reg(i_regs->regmap,rs2[i]);
5413   int s2h=get_reg(i_regs->regmap,rs2[i]|64);
5414   void *taken = NULL;
5415   void *nottaken = NULL;
5416   int unconditional=0;
5417   if(rs1[i]==0)
5418   {
5419     s1l=s2l;s1h=s2h;
5420     s2l=s2h=-1;
5421   }
5422   else if(rs2[i]==0)
5423   {
5424     s2l=s2h=-1;
5425   }
5426   if((i_regs->is32>>rs1[i])&(i_regs->is32>>rs2[i])&1) {
5427     s1h=s2h=-1;
5428   }
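  // Pick scratch host registers: addr will hold the selected branch target,
  // alt the alternative (fall-through) address, and ntaddr is an extra
  // temporary for BLEZ/BGTZ.  HOST_BTREG is preferred for addr when free.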
5429   int hr=0;
5430   int addr=-1,alt=-1,ntaddr=-1;
5431   if(i_regs->regmap[HOST_BTREG]<0) {addr=HOST_BTREG;}
5432   else {
5433     while(hr<HOST_REGS)
5434     {
5435       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG &&
5436          (i_regs->regmap[hr]&63)!=rs1[i] &&
5437          (i_regs->regmap[hr]&63)!=rs2[i] )
5438       {
5439         addr=hr++;break;
5440       }
5441       hr++;
5442     }
5443   }
5444   while(hr<HOST_REGS)
5445   {
5446     if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5447        (i_regs->regmap[hr]&63)!=rs1[i] &&
5448        (i_regs->regmap[hr]&63)!=rs2[i] )
5449     {
5450       alt=hr++;break;
5451     }
5452     hr++;
5453   }
5454   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ needs another register
5455   {
5456     while(hr<HOST_REGS)
5457     {
5458       if(hr!=EXCLUDE_REG && hr!=HOST_CCREG && hr!=HOST_BTREG &&
5459          (i_regs->regmap[hr]&63)!=rs1[i] &&
5460          (i_regs->regmap[hr]&63)!=rs2[i] )
5461       {
5462         ntaddr=hr;break;
5463       }
5464       hr++;
5465     }
5466   }
5467   assert(hr<HOST_REGS);
5468   if((opcode[i]&0x2e)==4||opcode[i]==0x11) { // BEQ/BNE/BEQL/BNEL/BC1
5469     load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
5470   }
5471   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]+2),HOST_CCREG);
5472   if(opcode[i]==2) // J
5473   {
5474     unconditional=1;
5475   }
5476   if(opcode[i]==3) // JAL
5477   {
5478     // TODO: mini_ht
5479     int rt=get_reg(i_regs->regmap,31);
5480     emit_movimm(start+i*4+8,rt);
5481     unconditional=1;
5482   }
5483   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
5484   {
5485     emit_mov(s1l,addr);
5486     if(opcode2[i]==9) // JALR
5487     {
5488       int rt=get_reg(i_regs->regmap,rt1[i]);
5489       emit_movimm(start+i*4+8,rt);
5490     }
5491   }
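  // For BEQ/BNE (and the BC1 cases below) the target is chosen with
  // conditional moves, leaving addr holding ba[i] if taken or start+i*4+8 if
  // not.  The "likely" variants branch instead, since their delay slot must
  // be skipped on the not-taken path.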
5492   if((opcode[i]&0x3f)==4) // BEQ
5493   {
5494     if(rs1[i]==rs2[i])
5495     {
5496       unconditional=1;
5497     }
5498     else
5499     #ifdef HAVE_CMOV_IMM
5500     if(s1h<0) {
5501       if(s2l>=0) emit_cmp(s1l,s2l);
5502       else emit_test(s1l,s1l);
5503       emit_cmov2imm_e_ne_compact(ba[i],start+i*4+8,addr);
5504     }
5505     else
5506     #endif
5507     {
5508       assert(s1l>=0);
5509       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5510       if(s1h>=0) {
5511         if(s2h>=0) emit_cmp(s1h,s2h);
5512         else emit_test(s1h,s1h);
5513         emit_cmovne_reg(alt,addr);
5514       }
5515       if(s2l>=0) emit_cmp(s1l,s2l);
5516       else emit_test(s1l,s1l);
5517       emit_cmovne_reg(alt,addr);
5518     }
5519   }
5520   if((opcode[i]&0x3f)==5) // BNE
5521   {
5522     #ifdef HAVE_CMOV_IMM
5523     if(s1h<0) {
5524       if(s2l>=0) emit_cmp(s1l,s2l);
5525       else emit_test(s1l,s1l);
5526       emit_cmov2imm_e_ne_compact(start+i*4+8,ba[i],addr);
5527     }
5528     else
5529     #endif
5530     {
5531       assert(s1l>=0);
5532       emit_mov2imm_compact(start+i*4+8,addr,ba[i],alt);
5533       if(s1h>=0) {
5534         if(s2h>=0) emit_cmp(s1h,s2h);
5535         else emit_test(s1h,s1h);
5536         emit_cmovne_reg(alt,addr);
5537       }
5538       if(s2l>=0) emit_cmp(s1l,s2l);
5539       else emit_test(s1l,s1l);
5540       emit_cmovne_reg(alt,addr);
5541     }
5542   }
5543   if((opcode[i]&0x3f)==0x14) // BEQL
5544   {
5545     if(s1h>=0) {
5546       if(s2h>=0) emit_cmp(s1h,s2h);
5547       else emit_test(s1h,s1h);
5548       nottaken=out;
5549       emit_jne(0);
5550     }
5551     if(s2l>=0) emit_cmp(s1l,s2l);
5552     else emit_test(s1l,s1l);
5553     if(nottaken) set_jump_target(nottaken, out);
5554     nottaken=out;
5555     emit_jne(0);
5556   }
5557   if((opcode[i]&0x3f)==0x15) // BNEL
5558   {
5559     if(s1h>=0) {
5560       if(s2h>=0) emit_cmp(s1h,s2h);
5561       else emit_test(s1h,s1h);
5562       taken=out;
5563       emit_jne(0);
5564     }
5565     if(s2l>=0) emit_cmp(s1l,s2l);
5566     else emit_test(s1l,s1l);
5567     nottaken=out;
5568     emit_jeq(0);
5569     if(taken) set_jump_target(taken, out);
5570   }
5571   if((opcode[i]&0x3f)==6) // BLEZ
5572   {
5573     emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5574     emit_cmpimm(s1l,1);
5575     if(s1h>=0) emit_mov(addr,ntaddr);
5576     emit_cmovl_reg(alt,addr);
5577     if(s1h>=0) {
5578       emit_test(s1h,s1h);
5579       emit_cmovne_reg(ntaddr,addr);
5580       emit_cmovs_reg(alt,addr);
5581     }
5582   }
5583   if((opcode[i]&0x3f)==7) // BGTZ
5584   {
5585     emit_mov2imm_compact(ba[i],addr,start+i*4+8,ntaddr);
5586     emit_cmpimm(s1l,1);
5587     if(s1h>=0) emit_mov(addr,alt);
5588     emit_cmovl_reg(ntaddr,addr);
5589     if(s1h>=0) {
5590       emit_test(s1h,s1h);
5591       emit_cmovne_reg(alt,addr);
5592       emit_cmovs_reg(ntaddr,addr);
5593     }
5594   }
5595   if((opcode[i]&0x3f)==0x16) // BLEZL
5596   {
5597     assert((opcode[i]&0x3f)!=0x16);
5598   }
5599   if((opcode[i]&0x3f)==0x17) // BGTZL
5600   {
5601     assert((opcode[i]&0x3f)!=0x17);
5602   }
5603   assert(opcode[i]!=1); // BLTZ/BGEZ
5604
5605   //FIXME: Check CSREG
5606   if(opcode[i]==0x11 && opcode2[i]==0x08 ) {
5607     if((source[i]&0x30000)==0) // BC1F
5608     {
5609       emit_mov2imm_compact(ba[i],addr,start+i*4+8,alt);
5610       emit_testimm(s1l,0x800000);
5611       emit_cmovne_reg(alt,addr);
5612     }
5613     if((source[i]&0x30000)==0x10000) // BC1T
5614     {
5615       emit_mov2imm_compact(ba[i],alt,start+i*4+8,addr);
5616       emit_testimm(s1l,0x800000);
5617       emit_cmovne_reg(alt,addr);
5618     }
5619     if((source[i]&0x30000)==0x20000) // BC1FL
5620     {
5621       emit_testimm(s1l,0x800000);
5622       nottaken=out;
5623       emit_jne(0);
5624     }
5625     if((source[i]&0x30000)==0x30000) // BC1TL
5626     {
5627       emit_testimm(s1l,0x800000);
5628       nottaken=out;
5629       emit_jeq(0);
5630     }
5631   }
5632
5633   assert(i_regs->regmap[HOST_CCREG]==CCREG);
5634   wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
5635   if(likely[i]||unconditional)
5636   {
5637     emit_movimm(ba[i],HOST_BTREG);
5638   }
5639   else if(addr!=HOST_BTREG)
5640   {
5641     emit_mov(addr,HOST_BTREG);
5642   }
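  // Emit the jump into the next page.  The odd target address (delay-slot PC
  // plus one) appears to mark a page-spanning delay-slot entry, matching the
  // start+1 convention used by pagespan_ds.  If the target is already
  // compiled, link to it directly; otherwise go through an external jump stub.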
5643   void *branch_addr=out;
5644   emit_jmp(0);
5645   int target_addr=start+i*4+5;
5646   void *stub=out;
5647   void *compiled_target_addr=check_addr(target_addr);
5648   emit_extjump_ds(branch_addr, target_addr);
5649   if(compiled_target_addr) {
5650     set_jump_target(branch_addr, compiled_target_addr);
5651     add_link(target_addr,stub);
5652   }
5653   else set_jump_target(branch_addr, stub);
5654   if(likely[i]) {
5655     // Not-taken path
5656     set_jump_target(nottaken, out);
5657     wb_dirtys(regs[i].regmap,regs[i].is32,regs[i].dirty);
5658     void *branch_addr=out;
5659     emit_jmp(0);
5660     int target_addr=start+i*4+8;
5661     void *stub=out;
5662     void *compiled_target_addr=check_addr(target_addr);
5663     emit_extjump_ds(branch_addr, target_addr);
5664     if(compiled_target_addr) {
5665       set_jump_target(branch_addr, compiled_target_addr);
5666       add_link(target_addr,stub);
5667     }
5668     else set_jump_target(branch_addr, stub);
5669   }
5670 }
5671
5672 // Assemble the delay slot for the above
5673 static void pagespan_ds()
5674 {
5675   assem_debug("initial delay slot:\n");
5676   u_int vaddr=start+1;
5677   u_int page=get_page(vaddr);
5678   u_int vpage=get_vpage(vaddr);
5679   ll_add(jump_dirty+vpage,vaddr,(void *)out);
5680   do_dirty_stub_ds();
5681   ll_add(jump_in+page,vaddr,(void *)out);
5682   assert(regs[0].regmap_entry[HOST_CCREG]==CCREG);
5683   if(regs[0].regmap[HOST_CCREG]!=CCREG)
5684     wb_register(CCREG,regs[0].regmap_entry,regs[0].wasdirty,regs[0].was32);
5685   if(regs[0].regmap[HOST_BTREG]!=BTREG)
5686     emit_writeword(HOST_BTREG,&branch_target);
5687   load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,rs1[0],rs2[0]);
5688   address_generation(0,&regs[0],regs[0].regmap_entry);
5689   if(itype[0]==STORE||itype[0]==STORELR||(opcode[0]&0x3b)==0x39||(opcode[0]&0x3b)==0x3a)
5690     load_regs(regs[0].regmap_entry,regs[0].regmap,regs[0].was32,INVCP,INVCP);
5691   cop1_usable=0;
5692   is_delayslot=0;
5693   switch(itype[0]) {
5694     case ALU:
5695       alu_assemble(0,&regs[0]);break;
5696     case IMM16:
5697       imm16_assemble(0,&regs[0]);break;
5698     case SHIFT:
5699       shift_assemble(0,&regs[0]);break;
5700     case SHIFTIMM:
5701       shiftimm_assemble(0,&regs[0]);break;
5702     case LOAD:
5703       load_assemble(0,&regs[0]);break;
5704     case LOADLR:
5705       loadlr_assemble(0,&regs[0]);break;
5706     case STORE:
5707       store_assemble(0,&regs[0]);break;
5708     case STORELR:
5709       storelr_assemble(0,&regs[0]);break;
5710     case COP0:
5711       cop0_assemble(0,&regs[0]);break;
5712     case COP1:
5713       cop1_assemble(0,&regs[0]);break;
5714     case C1LS:
5715       c1ls_assemble(0,&regs[0]);break;
5716     case COP2:
5717       cop2_assemble(0,&regs[0]);break;
5718     case C2LS:
5719       c2ls_assemble(0,&regs[0]);break;
5720     case C2OP:
5721       c2op_assemble(0,&regs[0]);break;
5722     case FCONV:
5723       fconv_assemble(0,&regs[0]);break;
5724     case FLOAT:
5725       float_assemble(0,&regs[0]);break;
5726     case FCOMP:
5727       fcomp_assemble(0,&regs[0]);break;
5728     case MULTDIV:
5729       multdiv_assemble(0,&regs[0]);break;
5730     case MOV:
5731       mov_assemble(0,&regs[0]);break;
5732     case SYSCALL:
5733     case HLECALL:
5734     case INTCALL:
5735     case SPAN:
5736     case UJUMP:
5737     case RJUMP:
5738     case CJUMP:
5739     case SJUMP:
5740     case FJUMP:
5741       SysPrintf("Jump in the delay slot.  This is probably a bug.\n");
5742   }
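  // Dispatch on the branch target left in BTREG: if it differs from the
  // fall-through address (start+4) the branch was taken, so jump indirectly
  // via jump_vaddr; otherwise continue into the block at start+4.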
5743   int btaddr=get_reg(regs[0].regmap,BTREG);
5744   if(btaddr<0) {
5745     btaddr=get_reg(regs[0].regmap,-1);
5746     emit_readword(&branch_target,btaddr);
5747   }
5748   assert(btaddr!=HOST_CCREG);
5749   if(regs[0].regmap[HOST_CCREG]!=CCREG) emit_loadreg(CCREG,HOST_CCREG);
5750 #ifdef HOST_IMM8
5751   emit_movimm(start+4,HOST_TEMPREG);
5752   emit_cmp(btaddr,HOST_TEMPREG);
5753 #else
5754   emit_cmpimm(btaddr,start+4);
5755 #endif
5756   void *branch = out;
5757   emit_jeq(0);
5758   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,-1);
5759   emit_jmp(jump_vaddr_reg[btaddr]);
5760   set_jump_target(branch, out);
5761   store_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
5762   load_regs_bt(regs[0].regmap,regs[0].is32,regs[0].dirty,start+4);
5763 }
5764
5765 // Basic liveness analysis for MIPS registers
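// Works backwards from iend to istart.  For each instruction it records a
// bitmask of MIPS registers whose current value is never read again (bit r
// set means register r is dead): unneeded_reg[] for the low 32 bits,
// unneeded_reg_upper[] for the upper halves, gte_unneeded[] for GTE (COP2)
// registers.  The r parameter bounds recursion into backward branch targets.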
5766 void unneeded_registers(int istart,int iend,int r)
5767 {
5768   int i;
5769   uint64_t u,uu,gte_u,b,bu,gte_bu;
5770   uint64_t temp_u,temp_uu,temp_gte_u=0;
5771   uint64_t tdep;
5772   uint64_t gte_u_unknown=0;
5773   if(new_dynarec_hacks&NDHACK_GTE_UNNEEDED)
5774     gte_u_unknown=~0ll;
5775   if(iend==slen-1) {
5776     u=1;uu=1;
5777     gte_u=gte_u_unknown;
5778   }else{
5779     u=unneeded_reg[iend+1];
5780     uu=unneeded_reg_upper[iend+1];
5781     u=1;uu=1;
5782     gte_u=gte_unneeded[iend+1];
5783   }
5784
5785   for (i=iend;i>=istart;i--)
5786   {
5787     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
5788     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
5789     {
5790       // If subroutine call, flag return address as a possible branch target
5791       if(rt1[i]==31 && i<slen-2) bt[i+2]=1;
5792
5793       if(ba[i]<start || ba[i]>=(start+slen*4))
5794       {
5795         // Branch out of this block, flush all regs
5796         u=1;
5797         uu=1;
5798         gte_u=gte_u_unknown;
5799         /* Hexagon hack
5800         if(itype[i]==UJUMP&&rt1[i]==31)
5801         {
5802           uu=u=0x300C00F; // Discard at, v0-v1, t6-t9
5803         }
5804         if(itype[i]==RJUMP&&rs1[i]==31)
5805         {
5806           uu=u=0x300C0F3; // Discard at, a0-a3, t6-t9
5807         }
5808         if(start>0x80000400&&start<0x80000000+RAM_SIZE) {
5809           if(itype[i]==UJUMP&&rt1[i]==31)
5810           {
5811             //uu=u=0x30300FF0FLL; // Discard at, v0-v1, t0-t9, lo, hi
5812             uu=u=0x300FF0F; // Discard at, v0-v1, t0-t9
5813           }
5814           if(itype[i]==RJUMP&&rs1[i]==31)
5815           {
5816             //uu=u=0x30300FFF3LL; // Discard at, a0-a3, t0-t9, lo, hi
5817             uu=u=0x300FFF3; // Discard at, a0-a3, t0-t9
5818           }
5819         }*/
5820         branch_unneeded_reg[i]=u;
5821         branch_unneeded_reg_upper[i]=uu;
5822         // Merge in delay slot
5823         tdep=(~uu>>rt1[i+1])&1;
5824         u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
5825         uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
5826         u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5827         uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5828         uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
5829         u|=1;uu|=1;
5830         gte_u|=gte_rt[i+1];
5831         gte_u&=~gte_rs[i+1];
5832         // If branch is "likely" (and conditional)
5833         // then we skip the delay slot on the fall-thru path
5834         if(likely[i]) {
5835           if(i<slen-1) {
5836             u&=unneeded_reg[i+2];
5837             uu&=unneeded_reg_upper[i+2];
5838             gte_u&=gte_unneeded[i+2];
5839           }
5840           else
5841           {
5842             u=1;
5843             uu=1;
5844             gte_u=gte_u_unknown;
5845           }
5846         }
5847       }
5848       else
5849       {
5850         // Internal branch, flag target
5851         bt[(ba[i]-start)>>2]=1;
5852         if(ba[i]<=start+i*4) {
5853           // Backward branch
5854           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
5855           {
5856             // Unconditional branch
5857             temp_u=1;temp_uu=1;
5858             temp_gte_u=0;
5859           } else {
5860             // Conditional branch (not taken case)
5861             temp_u=unneeded_reg[i+2];
5862             temp_uu=unneeded_reg_upper[i+2];
5863             temp_gte_u&=gte_unneeded[i+2];
5864           }
5865           // Merge in delay slot
5866           tdep=(~temp_uu>>rt1[i+1])&1;
5867           temp_u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
5868           temp_uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
5869           temp_u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5870           temp_uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5871           temp_uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
5872           temp_u|=1;temp_uu|=1;
5873           temp_gte_u|=gte_rt[i+1];
5874           temp_gte_u&=~gte_rs[i+1];
5875           // If branch is "likely" (and conditional)
5876           // then we skip the delay slot on the fall-thru path
5877           if(likely[i]) {
5878             if(i<slen-1) {
5879               temp_u&=unneeded_reg[i+2];
5880               temp_uu&=unneeded_reg_upper[i+2];
5881               temp_gte_u&=gte_unneeded[i+2];
5882             }
5883             else
5884             {
5885               temp_u=1;
5886               temp_uu=1;
5887               temp_gte_u=gte_u_unknown;
5888             }
5889           }
5890           tdep=(~temp_uu>>rt1[i])&1;
5891           temp_u|=(1LL<<rt1[i])|(1LL<<rt2[i]);
5892           temp_uu|=(1LL<<rt1[i])|(1LL<<rt2[i]);
5893           temp_u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
5894           temp_uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
5895           temp_uu&=~((tdep<<dep1[i])|(tdep<<dep2[i]));
5896           temp_u|=1;temp_uu|=1;
5897           temp_gte_u|=gte_rt[i];
5898           temp_gte_u&=~gte_rs[i];
5899           unneeded_reg[i]=temp_u;
5900           unneeded_reg_upper[i]=temp_uu;
5901           gte_unneeded[i]=temp_gte_u;
5902           // Only go three levels deep.  This recursion can take an
5903           // excessive amount of time if there are a lot of nested loops.
5904           if(r<2) {
5905             unneeded_registers((ba[i]-start)>>2,i-1,r+1);
5906           }else{
5907             unneeded_reg[(ba[i]-start)>>2]=1;
5908             unneeded_reg_upper[(ba[i]-start)>>2]=1;
5909             gte_unneeded[(ba[i]-start)>>2]=gte_u_unknown;
5910           }
5911         } /*else*/ if(1) {
5912           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
5913           {
5914             // Unconditional branch
5915             u=unneeded_reg[(ba[i]-start)>>2];
5916             uu=unneeded_reg_upper[(ba[i]-start)>>2];
5917             gte_u=gte_unneeded[(ba[i]-start)>>2];
5918             branch_unneeded_reg[i]=u;
5919             branch_unneeded_reg_upper[i]=uu;
5920         //u=1;
5921         //uu=1;
5922         //branch_unneeded_reg[i]=u;
5923         //branch_unneeded_reg_upper[i]=uu;
5924             // Merge in delay slot
5925             tdep=(~uu>>rt1[i+1])&1;
5926             u|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
5927             uu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
5928             u&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5929             uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5930             uu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
5931             u|=1;uu|=1;
5932             gte_u|=gte_rt[i+1];
5933             gte_u&=~gte_rs[i+1];
5934           } else {
5935             // Conditional branch
5936             b=unneeded_reg[(ba[i]-start)>>2];
5937             bu=unneeded_reg_upper[(ba[i]-start)>>2];
5938             gte_bu=gte_unneeded[(ba[i]-start)>>2];
5939             branch_unneeded_reg[i]=b;
5940             branch_unneeded_reg_upper[i]=bu;
5941         //b=1;
5942         //bu=1;
5943         //branch_unneeded_reg[i]=b;
5944         //branch_unneeded_reg_upper[i]=bu;
5945             // Branch delay slot
5946             tdep=(~uu>>rt1[i+1])&1;
5947             b|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
5948             bu|=(1LL<<rt1[i+1])|(1LL<<rt2[i+1]);
5949             b&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
5950             bu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
5951             bu&=~((tdep<<dep1[i+1])|(tdep<<dep2[i+1]));
5952             b|=1;bu|=1;
5953             gte_bu|=gte_rt[i+1];
5954             gte_bu&=~gte_rs[i+1];
5955             // If branch is "likely" then we skip the
5956             // delay slot on the fall-thru path
5957             if(likely[i]) {
5958               u=b;
5959               uu=bu;
5960               gte_u=gte_bu;
5961               if(i<slen-1) {
5962                 u&=unneeded_reg[i+2];
5963                 uu&=unneeded_reg_upper[i+2];
5964                 gte_u&=gte_unneeded[i+2];
5965         //u=1;
5966         //uu=1;
5967               }
5968             } else {
5969               u&=b;
5970               uu&=bu;
5971               gte_u&=gte_bu;
5972         //u=1;
5973         //uu=1;
5974             }
5975             if(i<slen-1) {
5976               branch_unneeded_reg[i]&=unneeded_reg[i+2];
5977               branch_unneeded_reg_upper[i]&=unneeded_reg_upper[i+2];
5978         //branch_unneeded_reg[i]=1;
5979         //branch_unneeded_reg_upper[i]=1;
5980             } else {
5981               branch_unneeded_reg[i]=1;
5982               branch_unneeded_reg_upper[i]=1;
5983             }
5984           }
5985         }
5986       }
5987     }
5988     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
5989     {
5990       // SYSCALL instruction (software interrupt)
5991       u=1;
5992       uu=1;
5993     }
5994     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
5995     {
5996       // ERET instruction (return from interrupt)
5997       u=1;
5998       uu=1;
5999     }
6000     //u=uu=1; // DEBUG
6001     tdep=(~uu>>rt1[i])&1;
6002     // Written registers are unneeded
6003     u|=1LL<<rt1[i];
6004     u|=1LL<<rt2[i];
6005     uu|=1LL<<rt1[i];
6006     uu|=1LL<<rt2[i];
6007     gte_u|=gte_rt[i];
6008     // Accessed registers are needed
6009     u&=~(1LL<<rs1[i]);
6010     u&=~(1LL<<rs2[i]);
6011     uu&=~(1LL<<us1[i]);
6012     uu&=~(1LL<<us2[i]);
6013     gte_u&=~gte_rs[i];
6014     if(gte_rs[i]&&rt1[i]&&(unneeded_reg[i+1]&(1ll<<rt1[i])))
6015       gte_u|=gte_rs[i]&gte_unneeded[i+1]; // MFC2/CFC2 to dead register, unneeded
6016     // Source-target dependencies
6017     uu&=~(tdep<<dep1[i]);
6018     uu&=~(tdep<<dep2[i]);
6019     // R0 is always unneeded
6020     u|=1;uu|=1;
6021     // Save it
6022     unneeded_reg[i]=u;
6023     unneeded_reg_upper[i]=uu;
6024     gte_unneeded[i]=gte_u;
6025     /*
6026     printf("ur (%d,%d) %x: ",istart,iend,start+i*4);
6027     printf("U:");
6028     int r;
6029     for(r=1;r<=CCREG;r++) {
6030       if((unneeded_reg[i]>>r)&1) {
6031         if(r==HIREG) printf(" HI");
6032         else if(r==LOREG) printf(" LO");
6033         else printf(" r%d",r);
6034       }
6035     }
6036     printf(" UU:");
6037     for(r=1;r<=CCREG;r++) {
6038       if(((unneeded_reg_upper[i]&~unneeded_reg[i])>>r)&1) {
6039         if(r==HIREG) printf(" HI");
6040         else if(r==LOREG) printf(" LO");
6041         else printf(" r%d",r);
6042       }
6043     }
6044     printf("\n");*/
6045   }
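  // The PSX R3000A has no 64-bit registers, so the upper halves are always
  // unneeded.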
6046   for (i=iend;i>=istart;i--)
6047   {
6048     unneeded_reg_upper[i]=branch_unneeded_reg_upper[i]=-1LL;
6049   }
6050 }
6051
6052 // Write back dirty registers as soon as we will no longer modify them,
6053 // so that we don't end up with lots of writes at the branches.
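// Backward pass over [istart,iend] computing, per instruction, host-register
// masks will_dirty[] and wont_dirty[] describing which cached registers will
// still be modified before the block (or branch) exits.  With wr nonzero the
// masks are folded into regs[].dirty / regs[].wasdirty so that registers which
// will not be modified again can be written back early instead of all at the
// branch; the recursive calls for backward-branch targets pass wr=0 and only
// compute the masks.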
6054 void clean_registers(int istart,int iend,int wr)
6055 {
6056   int i;
6057   int r;
6058   u_int will_dirty_i,will_dirty_next,temp_will_dirty;
6059   u_int wont_dirty_i,wont_dirty_next,temp_wont_dirty;
6060   if(iend==slen-1) {
6061     will_dirty_i=will_dirty_next=0;
6062     wont_dirty_i=wont_dirty_next=0;
6063   }else{
6064     will_dirty_i=will_dirty_next=will_dirty[iend+1];
6065     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
6066   }
6067   for (i=iend;i>=istart;i--)
6068   {
6069     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6070     {
6071       if(ba[i]<start || ba[i]>=(start+slen*4))
6072       {
6073         // Branch out of this block, flush all regs
6074         if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6075         {
6076           // Unconditional branch
6077           will_dirty_i=0;
6078           wont_dirty_i=0;
6079           // Merge in delay slot (will dirty)
6080           for(r=0;r<HOST_REGS;r++) {
6081             if(r!=EXCLUDE_REG) {
6082               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6083               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6084               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6085               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6086               if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6087               if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6088               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6089               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6090               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6091               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6092               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6093               if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6094               if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6095               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6096             }
6097           }
6098         }
6099         else
6100         {
6101           // Conditional branch
6102           will_dirty_i=0;
6103           wont_dirty_i=wont_dirty_next;
6104           // Merge in delay slot (will dirty)
6105           for(r=0;r<HOST_REGS;r++) {
6106             if(r!=EXCLUDE_REG) {
6107               if(!likely[i]) {
6108                 // Might not dirty if likely branch is not taken
6109                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6110                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6111                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6112                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6113                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6114                 if(branch_regs[i].regmap[r]==0) will_dirty_i&=~(1<<r);
6115                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6116                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6117                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6118                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6119                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6120                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6121                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6122                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6123               }
6124             }
6125           }
6126         }
6127         // Merge in delay slot (wont dirty)
6128         for(r=0;r<HOST_REGS;r++) {
6129           if(r!=EXCLUDE_REG) {
6130             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6131             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6132             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6133             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6134             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6135             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6136             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6137             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6138             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6139             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6140           }
6141         }
6142         if(wr) {
6143           #ifndef DESTRUCTIVE_WRITEBACK
6144           branch_regs[i].dirty&=wont_dirty_i;
6145           #endif
6146           branch_regs[i].dirty|=will_dirty_i;
6147         }
6148       }
6149       else
6150       {
6151         // Internal branch
6152         if(ba[i]<=start+i*4) {
6153           // Backward branch
6154           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6155           {
6156             // Unconditional branch
6157             temp_will_dirty=0;
6158             temp_wont_dirty=0;
6159             // Merge in delay slot (will dirty)
6160             for(r=0;r<HOST_REGS;r++) {
6161               if(r!=EXCLUDE_REG) {
6162                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6163                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6164                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6165                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6166                 if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6167                 if(branch_regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6168                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6169                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6170                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6171                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6172                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6173                 if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6174                 if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6175                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6176               }
6177             }
6178           } else {
6179             // Conditional branch (not taken case)
6180             temp_will_dirty=will_dirty_next;
6181             temp_wont_dirty=wont_dirty_next;
6182             // Merge in delay slot (will dirty)
6183             for(r=0;r<HOST_REGS;r++) {
6184               if(r!=EXCLUDE_REG) {
6185                 if(!likely[i]) {
6186                   // Will not dirty if likely branch is not taken
6187                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6188                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6189                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6190                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6191                   if((branch_regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6192                   if(branch_regs[i].regmap[r]==0) temp_will_dirty&=~(1<<r);
6193                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6194                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
6195                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
6196                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
6197                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
6198                   if((regs[i].regmap[r]&63)>33) temp_will_dirty&=~(1<<r);
6199                   if(regs[i].regmap[r]<=0) temp_will_dirty&=~(1<<r);
6200                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
6201                 }
6202               }
6203             }
6204           }
6205           // Merge in delay slot (wont dirty)
6206           for(r=0;r<HOST_REGS;r++) {
6207             if(r!=EXCLUDE_REG) {
6208               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6209               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6210               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6211               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6212               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6213               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
6214               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
6215               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
6216               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
6217               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
6218             }
6219           }
6220           // Deal with changed mappings
6221           if(i<iend) {
6222             for(r=0;r<HOST_REGS;r++) {
6223               if(r!=EXCLUDE_REG) {
6224                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
6225                   temp_will_dirty&=~(1<<r);
6226                   temp_wont_dirty&=~(1<<r);
6227                   if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6228                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6229                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6230                   } else {
6231                     temp_will_dirty|=1<<r;
6232                     temp_wont_dirty|=1<<r;
6233                   }
6234                 }
6235               }
6236             }
6237           }
6238           if(wr) {
6239             will_dirty[i]=temp_will_dirty;
6240             wont_dirty[i]=temp_wont_dirty;
6241             clean_registers((ba[i]-start)>>2,i-1,0);
6242           }else{
6243             // Limit recursion.  It can take an excessive amount
6244             // of time if there are a lot of nested loops.
6245             will_dirty[(ba[i]-start)>>2]=0;
6246             wont_dirty[(ba[i]-start)>>2]=-1;
6247           }
6248         }
6249         /*else*/ if(1)
6250         {
6251           if(itype[i]==RJUMP||itype[i]==UJUMP||(source[i]>>16)==0x1000)
6252           {
6253             // Unconditional branch
6254             will_dirty_i=0;
6255             wont_dirty_i=0;
6256           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6257             for(r=0;r<HOST_REGS;r++) {
6258               if(r!=EXCLUDE_REG) {
6259                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6260                   will_dirty_i|=will_dirty[(ba[i]-start)>>2]&(1<<r);
6261                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6262                 }
6263                 if(branch_regs[i].regmap[r]>=0) {
6264                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6265                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(branch_regs[i].regmap[r]&63))&1)<<r;
6266                 }
6267               }
6268             }
6269           //}
6270             // Merge in delay slot
6271             for(r=0;r<HOST_REGS;r++) {
6272               if(r!=EXCLUDE_REG) {
6273                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6274                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6275                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6276                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6277                 if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6278                 if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6279                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6280                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6281                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6282                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6283                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6284                 if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6285                 if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6286                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6287               }
6288             }
6289           } else {
6290             // Conditional branch
6291             will_dirty_i=will_dirty_next;
6292             wont_dirty_i=wont_dirty_next;
6293           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
6294             for(r=0;r<HOST_REGS;r++) {
6295               if(r!=EXCLUDE_REG) {
6296                 signed char target_reg=branch_regs[i].regmap[r];
6297                 if(target_reg==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6298                   will_dirty_i&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6299                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6300                 }
6301                 else if(target_reg>=0) {
6302                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6303                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>2]>>(target_reg&63))&1)<<r;
6304                 }
6305                 // Treat delay slot as part of branch too
6306                 /*if(regs[i+1].regmap[r]==regs[(ba[i]-start)>>2].regmap_entry[r]) {
6307                   will_dirty[i+1]&=will_dirty[(ba[i]-start)>>2]&(1<<r);
6308                   wont_dirty[i+1]|=wont_dirty[(ba[i]-start)>>2]&(1<<r);
6309                 }
6310                 else
6311                 {
6312                   will_dirty[i+1]&=~(1<<r);
6313                 }*/
6314               }
6315             }
6316           //}
6317             // Merge in delay slot
6318             for(r=0;r<HOST_REGS;r++) {
6319               if(r!=EXCLUDE_REG) {
6320                 if(!likely[i]) {
6321                   // Might not dirty if likely branch is not taken
6322                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6323                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6324                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6325                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6326                   if((branch_regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6327                   if(branch_regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6328                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6329                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6330                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6331                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
6332                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
6333                   if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6334                   if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6335                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6336                 }
6337               }
6338             }
6339           }
6340           // Merge in delay slot (won't dirty)
6341           for(r=0;r<HOST_REGS;r++) {
6342             if(r!=EXCLUDE_REG) {
6343               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6344               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6345               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6346               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6347               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6348               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6349               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6350               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
6351               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
6352               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6353             }
6354           }
6355           if(wr) {
6356             #ifndef DESTRUCTIVE_WRITEBACK
6357             branch_regs[i].dirty&=wont_dirty_i;
6358             #endif
6359             branch_regs[i].dirty|=will_dirty_i;
6360           }
6361         }
6362       }
6363     }
6364     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
6365     {
6366       // SYSCALL instruction (software interrupt)
6367       will_dirty_i=0;
6368       wont_dirty_i=0;
6369     }
6370     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
6371     {
6372       // ERET instruction (return from interrupt)
6373       will_dirty_i=0;
6374       wont_dirty_i=0;
6375     }
6376     will_dirty_next=will_dirty_i;
6377     wont_dirty_next=wont_dirty_i;
6378     for(r=0;r<HOST_REGS;r++) {
6379       if(r!=EXCLUDE_REG) {
6380         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
6381         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
6382         if((regs[i].regmap[r]&63)>33) will_dirty_i&=~(1<<r);
6383         if(regs[i].regmap[r]<=0) will_dirty_i&=~(1<<r);
6384         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
6385         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
6386         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
6387         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
6388         if(i>istart) {
6389           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=FJUMP)
6390           {
6391             // Don't store a register immediately after writing it,
6392             // may prevent dual-issue.
6393             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
6394             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
6395           }
6396         }
6397       }
6398     }
6399     // Save it
6400     will_dirty[i]=will_dirty_i;
6401     wont_dirty[i]=wont_dirty_i;
6402     // Mark registers that won't be dirtied as not dirty
6403     if(wr) {
6404       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
6405       for(r=0;r<HOST_REGS;r++) {
6406         if((will_dirty_i>>r)&1) {
6407           printf(" r%d",r);
6408         }
6409       }
6410       printf("\n");*/
6411
6412       //if(i==istart||(itype[i-1]!=RJUMP&&itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=FJUMP)) {
6413         regs[i].dirty|=will_dirty_i;
6414         #ifndef DESTRUCTIVE_WRITEBACK
6415         regs[i].dirty&=wont_dirty_i;
6416         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
6417         {
6418           if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
6419             for(r=0;r<HOST_REGS;r++) {
6420               if(r!=EXCLUDE_REG) {
6421                 if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
6422                   regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
6423                 }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6424               }
6425             }
6426           }
6427         }
6428         else
6429         {
6430           if(i<iend) {
6431             for(r=0;r<HOST_REGS;r++) {
6432               if(r!=EXCLUDE_REG) {
6433                 if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
6434                   regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
6435                 }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
6436               }
6437             }
6438           }
6439         }
6440         #endif
6441       //}
6442     }
6443     // Deal with changed mappings
6444     temp_will_dirty=will_dirty_i;
6445     temp_wont_dirty=wont_dirty_i;
6446     for(r=0;r<HOST_REGS;r++) {
6447       if(r!=EXCLUDE_REG) {
6448         int nr;
6449         if(regs[i].regmap[r]==regmap_pre[i][r]) {
6450           if(wr) {
6451             #ifndef DESTRUCTIVE_WRITEBACK
6452             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6453             #endif
6454             regs[i].wasdirty|=will_dirty_i&(1<<r);
6455           }
6456         }
6457         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
6458           // Register moved to a different register
6459           will_dirty_i&=~(1<<r);
6460           wont_dirty_i&=~(1<<r);
6461           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
6462           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
6463           if(wr) {
6464             #ifndef DESTRUCTIVE_WRITEBACK
6465             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
6466             #endif
6467             regs[i].wasdirty|=will_dirty_i&(1<<r);
6468           }
6469         }
6470         else {
6471           will_dirty_i&=~(1<<r);
6472           wont_dirty_i&=~(1<<r);
6473           if((regmap_pre[i][r]&63)>0 && (regmap_pre[i][r]&63)<34) {
6474             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6475             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
6476           } else {
6477             wont_dirty_i|=1<<r;
6478             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
6479           }
6480         }
6481       }
6482     }
6483   }
6484 }
6485
6486 #ifdef DISASM
6487   /* disassembly */
6488 void disassemble_inst(int i)
6489 {
6490     if (bt[i]) printf("*"); else printf(" ");
6491     switch(itype[i]) {
6492       case UJUMP:
6493         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6494       case CJUMP:
6495         printf (" %x: %s r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],i?start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14):*ba);break;
6496       case SJUMP:
6497         printf (" %x: %s r%d,%8x\n",start+i*4,insn[i],rs1[i],start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14));break;
6498       case FJUMP:
6499         printf (" %x: %s %8x\n",start+i*4,insn[i],ba[i]);break;
6500       case RJUMP:
6501         if (opcode[i]==0x9&&rt1[i]!=31)
6502           printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i]);
6503         else
6504           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
6505         break;
6506       case SPAN:
6507         printf (" %x: %s (pagespan) r%d,r%d,%8x\n",start+i*4,insn[i],rs1[i],rs2[i],ba[i]);break;
6508       case IMM16:
6509         if(opcode[i]==0xf) //LUI
6510           printf (" %x: %s r%d,%4x0000\n",start+i*4,insn[i],rt1[i],imm[i]&0xffff);
6511         else
6512           printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6513         break;
6514       case LOAD:
6515       case LOADLR:
6516         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6517         break;
6518       case STORE:
6519       case STORELR:
6520         printf (" %x: %s r%d,r%d+%x\n",start+i*4,insn[i],rs2[i],rs1[i],imm[i]);
6521         break;
6522       case ALU:
6523       case SHIFT:
6524         printf (" %x: %s r%d,r%d,r%d\n",start+i*4,insn[i],rt1[i],rs1[i],rs2[i]);
6525         break;
6526       case MULTDIV:
6527         printf (" %x: %s r%d,r%d\n",start+i*4,insn[i],rs1[i],rs2[i]);
6528         break;
6529       case SHIFTIMM:
6530         printf (" %x: %s r%d,r%d,%d\n",start+i*4,insn[i],rt1[i],rs1[i],imm[i]);
6531         break;
6532       case MOV:
6533         if((opcode2[i]&0x1d)==0x10)
6534           printf (" %x: %s r%d\n",start+i*4,insn[i],rt1[i]);
6535         else if((opcode2[i]&0x1d)==0x11)
6536           printf (" %x: %s r%d\n",start+i*4,insn[i],rs1[i]);
6537         else
6538           printf (" %x: %s\n",start+i*4,insn[i]);
6539         break;
6540       case COP0:
6541         if(opcode2[i]==0)
6542           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC0
6543         else if(opcode2[i]==4)
6544           printf (" %x: %s r%d,cpr0[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC0
6545         else printf (" %x: %s\n",start+i*4,insn[i]);
6546         break;
6547       case COP1:
6548         if(opcode2[i]<3)
6549           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC1
6550         else if(opcode2[i]>3)
6551           printf (" %x: %s r%d,cpr1[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC1
6552         else printf (" %x: %s\n",start+i*4,insn[i]);
6553         break;
6554       case COP2:
6555         if(opcode2[i]<3)
6556           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rt1[i],(source[i]>>11)&0x1f); // MFC2
6557         else if(opcode2[i]>3)
6558           printf (" %x: %s r%d,cpr2[%d]\n",start+i*4,insn[i],rs1[i],(source[i]>>11)&0x1f); // MTC2
6559         else printf (" %x: %s\n",start+i*4,insn[i]);
6560         break;
6561       case C1LS:
6562         printf (" %x: %s cpr1[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
6563         break;
6564       case C2LS:
6565         printf (" %x: %s cpr2[%d],r%d+%x\n",start+i*4,insn[i],(source[i]>>16)&0x1f,rs1[i],imm[i]);
6566         break;
6567       case INTCALL:
6568         printf (" %x: %s (INTCALL)\n",start+i*4,insn[i]);
6569         break;
6570       default:
6571         //printf (" %s %8x\n",insn[i],source[i]);
6572         printf (" %x: %s\n",start+i*4,insn[i]);
6573     }
6574 }
6575 #else
6576 static void disassemble_inst(int i) {}
6577 #endif // DISASM
6578
6579 #define DRC_TEST_VAL 0x74657374
6580
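// Sanity check that the translation cache is really executable: emit a
// tiny block that loads DRC_TEST_VAL (ASCII "test") into host reg 0 and
// jumps through host reg 14 (on ARM this is the link register, so the
// jump acts as a return), then call it and verify the value comes back.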
6581 static int new_dynarec_test(void)
6582 {
6583   int (*testfunc)(void) = (void *)out;
6584   void *beginning;
6585   int ret;
6586
6587   beginning = start_block();
6588   emit_movimm(DRC_TEST_VAL,0); // test
6589   emit_jmpreg(14);
6590   literal_pool(0);
6591   end_block(beginning);
6592   SysPrintf("testing if we can run recompiled code..\n");
6593   ret = testfunc();
6594   if (ret == DRC_TEST_VAL)
6595     SysPrintf("test passed.\n");
6596   else
6597     SysPrintf("test failed: %08x\n", ret);
6598   out = translation_cache;
6599   return ret == DRC_TEST_VAL;
6600 }
6601
6602 // clear the state completely, instead of just marking
6603 // things invalid like invalidate_all_pages() does
6604 void new_dynarec_clear_full()
6605 {
6606   int n;
6607   out = translation_cache;
6608   memset(invalid_code,1,sizeof(invalid_code));
6609   memset(hash_table,0xff,sizeof(hash_table));
6610   memset(mini_ht,-1,sizeof(mini_ht));
6611   memset(restore_candidate,0,sizeof(restore_candidate));
6612   memset(shadow,0,sizeof(shadow));
6613   copy=shadow;
6614   expirep=16384; // Expiry pointer, +2 blocks
6615   pending_exception=0;
6616   literalcount=0;
6617   stop_after_jal=0;
6618   inv_code_start=inv_code_end=~0;
6619   // clear the lookup tables for compiled blocks (jump_in/jump_out/jump_dirty)
6620   for(n=0;n<4096;n++) ll_clear(jump_in+n);
6621   for(n=0;n<4096;n++) ll_clear(jump_out+n);
6622   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
6623 }
6624
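// One-time setup: obtain a read/write/execute region for the translation
// cache (fixed-address mmap, a Vita kernel block, or mprotect() on a
// static buffer, depending on the build), reset all dynarec state, run
// the self-test above, and compute ram_offset for builds where RAM is
// not mapped at the expected fixed address.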
6625 void new_dynarec_init()
6626 {
6627   SysPrintf("Init new dynarec\n");
6628
6629   // allocate/prepare a buffer for translation cache
6630   // see assem_arm.h for some explanation
6631 #if   defined(BASE_ADDR_FIXED)
6632   if (mmap(translation_cache, 1 << TARGET_SIZE_2,
6633             PROT_READ | PROT_WRITE | PROT_EXEC,
6634             MAP_PRIVATE | MAP_ANONYMOUS,
6635             -1, 0) != translation_cache) {
6636     SysPrintf("mmap() failed: %s\n", strerror(errno));
6637     SysPrintf("disable BASE_ADDR_FIXED and recompile\n");
6638     abort();
6639   }
6640 #elif defined(BASE_ADDR_DYNAMIC)
6641   #ifdef VITA
6642   sceBlock = sceKernelAllocMemBlockForVM("code", 1 << TARGET_SIZE_2);
6643   if (sceBlock < 0)
6644     SysPrintf("sceKernelAllocMemBlockForVM failed\n");
6645   int ret = sceKernelGetMemBlockBase(sceBlock, (void **)&translation_cache);
6646   if (ret < 0)
6647     SysPrintf("sceKernelGetMemBlockBase failed\n");
6648   #else
6649   translation_cache = mmap (NULL, 1 << TARGET_SIZE_2,
6650             PROT_READ | PROT_WRITE | PROT_EXEC,
6651             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
6652   if (translation_cache == MAP_FAILED) {
6653     SysPrintf("mmap() failed: %s\n", strerror(errno));
6654     abort();
6655   }
6656   #endif
6657 #else
6658   #ifndef NO_WRITE_EXEC
6659   // not all systems allow execute in data segment by default
6660   if (mprotect(translation_cache, 1<<TARGET_SIZE_2, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
6661     SysPrintf("mprotect() failed: %s\n", strerror(errno));
6662   #endif
6663 #endif
6664   out = translation_cache;
6665   cycle_multiplier=200;
6666   new_dynarec_clear_full();
6667 #ifdef HOST_IMM8
6668   // Copy this into local area so we don't have to put it in every literal pool
6669   invc_ptr=invalid_code;
6670 #endif
6671   arch_init();
6672   new_dynarec_test();
6673 #ifndef RAM_FIXED
6674   ram_offset=(u_int)rdram-0x80000000;
6675 #endif
6676   if (ram_offset!=0)
6677     SysPrintf("warning: RAM is not directly mapped, performance will suffer\n");
6678 }
6679
6680 void new_dynarec_cleanup()
6681 {
6682   int n;
6683 #if defined(BASE_ADDR_FIXED) || defined(BASE_ADDR_DYNAMIC)
6684   #ifdef VITA
6685   sceKernelFreeMemBlock(sceBlock);
6686   sceBlock = -1;
6687   #else
6688   if (munmap(translation_cache, 1<<TARGET_SIZE_2) < 0)
6689     SysPrintf("munmap() failed\n");
6690   #endif
6691 #endif
6692   for(n=0;n<4096;n++) ll_clear(jump_in+n);
6693   for(n=0;n<4096;n++) ll_clear(jump_out+n);
6694   for(n=0;n<4096;n++) ll_clear(jump_dirty+n);
6695   #ifdef ROM_COPY
6696   if (munmap (ROM_COPY, 67108864) < 0) {SysPrintf("munmap() failed\n");}
6697   #endif
6698 }
6699
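// Map a PSX code address to a host pointer we can fetch instructions
// from: the 2MB RAM (including its 0x80000000/0xa0000000 mirrors) or the
// 512KB BIOS ROM when HLE is off.  *limit is set to the end of the
// region so the compiler knows where it must stop; returns NULL for
// addresses we can't compile from.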
6700 static u_int *get_source_start(u_int addr, u_int *limit)
6701 {
6702   if (addr < 0x00200000 ||
6703     (0xa0000000 <= addr && addr < 0xa0200000)) {
6704     // used for BIOS calls mostly?
6705     *limit = (addr&0xa0000000)|0x00200000;
6706     return (u_int *)((u_int)rdram + (addr&0x1fffff));
6707   }
6708   else if (!Config.HLE && (
6709     /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/
6710     (0xbfc00000 <= addr && addr < 0xbfc80000))) {
6711     // BIOS
6712     *limit = (addr & 0xfff00000) | 0x80000;
6713     return (u_int *)((u_int)psxR + (addr&0x7ffff));
6714   }
6715   else if (addr >= 0x80000000 && addr < 0x80000000+RAM_SIZE) {
6716     *limit = (addr & 0x80600000) + 0x00200000;
6717     return (u_int *)((u_int)rdram + (addr&0x1fffff));
6718   }
6719   return NULL;
6720 }
6721
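// Rough estimate of where the function starting at addr ends: scan up to
// 0x1000 bytes ahead for a "jr $ra" (0x03e00008) and return the address
// just past its delay slot, or the start address if none is found.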
6722 static u_int scan_for_ret(u_int addr)
6723 {
6724   u_int limit = 0;
6725   u_int *mem;
6726
6727   mem = get_source_start(addr, &limit);
6728   if (mem == NULL)
6729     return addr;
6730
6731   if (limit > addr + 0x1000)
6732     limit = addr + 0x1000;
6733   for (; addr < limit; addr += 4, mem++) {
6734     if (*mem == 0x03e00008) // jr $ra
6735       return addr + 8;
6736   }
6737   return addr;
6738 }
6739
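// Per-block record stored in savestates: the block's start address plus
// a bitmask of GPRs that were assumed to hold scratchpad pointers when
// the block was compiled (reg_sv_flags), so the same speculation can be
// reproduced after the state is loaded.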
6740 struct savestate_block {
6741   uint32_t addr;
6742   uint32_t regflags;
6743 };
6744
6745 static int addr_cmp(const void *p1_, const void *p2_)
6746 {
6747   const struct savestate_block *p1 = p1_, *p2 = p2_;
6748   return p1->addr - p2->addr;
6749 }
6750
6751 int new_dynarec_save_blocks(void *save, int size)
6752 {
6753   struct savestate_block *blocks = save;
6754   int maxcount = size / sizeof(blocks[0]);
6755   struct savestate_block tmp_blocks[1024];
6756   struct ll_entry *head;
6757   int p, s, d, o, bcnt;
6758   u_int addr;
6759
6760   o = 0;
6761   for (p = 0; p < ARRAY_SIZE(jump_in); p++) {
6762     bcnt = 0;
6763     for (head = jump_in[p]; head != NULL; head = head->next) {
6764       tmp_blocks[bcnt].addr = head->vaddr;
6765       tmp_blocks[bcnt].regflags = head->reg_sv_flags;
6766       bcnt++;
6767     }
6768     if (bcnt < 1)
6769       continue;
6770     qsort(tmp_blocks, bcnt, sizeof(tmp_blocks[0]), addr_cmp);
6771
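    // keep roughly one entry per function: drop duplicates and any block
    // that starts before the estimated end (scan_for_ret) of the block
    // kept just before it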
6772     addr = tmp_blocks[0].addr;
6773     for (s = d = 0; s < bcnt; s++) {
6774       if (tmp_blocks[s].addr < addr)
6775         continue;
6776       if (d == 0 || tmp_blocks[d-1].addr != tmp_blocks[s].addr)
6777         tmp_blocks[d++] = tmp_blocks[s];
6778       addr = scan_for_ret(tmp_blocks[s].addr);
6779     }
6780
6781     if (o + d > maxcount)
6782       d = maxcount - o;
6783     memcpy(&blocks[o], tmp_blocks, d * sizeof(blocks[0]));
6784     o += d;
6785   }
6786
6787   return o * sizeof(blocks[0]);
6788 }
6789
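// Precompile the blocks recorded in a savestate.  GPRs are temporarily
// forced to a RAM-looking value (0x80000000), with the flagged ones set
// to the scratchpad base (0x1f800000), so get_addr() recompiles each
// block under the same speculation that was in effect when it was saved;
// the real GPR values are restored afterwards.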
6790 void new_dynarec_load_blocks(const void *save, int size)
6791 {
6792   const struct savestate_block *blocks = save;
6793   int count = size / sizeof(blocks[0]);
6794   u_int regs_save[32];
6795   uint32_t f;
6796   int i, b;
6797
6798   get_addr(psxRegs.pc);
6799
6800   // change GPRs so speculation can at least partially work
6801   memcpy(regs_save, &psxRegs.GPR, sizeof(regs_save));
6802   for (i = 1; i < 32; i++)
6803     psxRegs.GPR.r[i] = 0x80000000;
6804
6805   for (b = 0; b < count; b++) {
6806     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
6807       if (f & 1)
6808         psxRegs.GPR.r[i] = 0x1f800000;
6809     }
6810
6811     get_addr(blocks[b].addr);
6812
6813     for (f = blocks[b].regflags, i = 0; f; f >>= 1, i++) {
6814       if (f & 1)
6815         psxRegs.GPR.r[i] = 0x80000000;
6816     }
6817   }
6818
6819   memcpy(&psxRegs.GPR, regs_save, sizeof(regs_save));
6820 }
6821
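// Recompile one block of R3000A code starting at addr.  Bit 0 of addr
// flags a branch delay slot entry point (see the (addr & 1) check in the
// register allocation pass below).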
6822 int new_recompile_block(int addr)
6823 {
6824   u_int pagelimit = 0;
6825   u_int state_rflags = 0;
6826   int i;
6827
6828   assem_debug("NOTCOMPILED: addr = %x -> %p\n", addr, out);
6829   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
6830   //if(debug)
6831   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
6832
6833   // this is just for speculation: note which GPRs currently hold scratchpad (0x1f80xxxx) pointers
6834   for (i = 1; i < 32; i++) {
6835     if ((psxRegs.GPR.r[i] & 0xffff0000) == 0x1f800000)
6836       state_rflags |= 1 << i;
6837   }
6838
6839   start = (u_int)addr&~3;
6840   //assert(((u_int)addr&1)==0);
6841   new_dynarec_did_compile=1;
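  // With the HLE BIOS, start == 0x80001000 is the HLE call address: don't
  // recompile it, just emit a stub that stores the PC and exits to the
  // emulator (new_dyna_leave) so the call can be serviced in C.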
6842   if (Config.HLE && start == 0x80001000) // hlecall
6843   {
6844     // XXX: is this enough? Maybe check hleSoftCall?
6845     void *beginning=start_block();
6846     u_int page=get_page(start);
6847
6848     invalid_code[start>>12]=0;
6849     emit_movimm(start,0);
6850     emit_writeword(0,&pcaddr);
6851     emit_jmp(new_dyna_leave);
6852     literal_pool(0);
6853     end_block(beginning);
6854     ll_add_flags(jump_in+page,start,state_rflags,(void *)beginning);
6855     return 0;
6856   }
6857
6858   source = get_source_start(start, &pagelimit);
6859   if (source == NULL) {
6860     SysPrintf("Compile at bogus memory address: %08x\n", addr);
6861     exit(1);
6862   }
6863
6864   /* Pass 1: disassemble */
6865   /* Pass 2: register dependencies, branch targets */
6866   /* Pass 3: register allocation */
6867   /* Pass 4: branch dependencies */
6868   /* Pass 5: pre-alloc */
6869   /* Pass 6: optimize clean/dirty state */
6870   /* Pass 7: flag 32-bit registers */
6871   /* Pass 8: assembly */
6872   /* Pass 9: linker */
6873   /* Pass 10: garbage collection / free memory */
6874
6875   int j;
6876   int done=0;
6877   unsigned int type,op,op2;
6878
6879   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
6880
6881   /* Pass 1 disassembly */
6882
6883   for(i=0;!done;i++) {
6884     bt[i]=0;likely[i]=0;ooo[i]=0;op2=0;
6885     minimum_free_regs[i]=0;
6886     opcode[i]=op=source[i]>>26;
6887     switch(op)
6888     {
6889       case 0x00: strcpy(insn[i],"special"); type=NI;
6890         op2=source[i]&0x3f;
6891         switch(op2)
6892         {
6893           case 0x00: strcpy(insn[i],"SLL"); type=SHIFTIMM; break;
6894           case 0x02: strcpy(insn[i],"SRL"); type=SHIFTIMM; break;
6895           case 0x03: strcpy(insn[i],"SRA"); type=SHIFTIMM; break;
6896           case 0x04: strcpy(insn[i],"SLLV"); type=SHIFT; break;
6897           case 0x06: strcpy(insn[i],"SRLV"); type=SHIFT; break;
6898           case 0x07: strcpy(insn[i],"SRAV"); type=SHIFT; break;
6899           case 0x08: strcpy(insn[i],"JR"); type=RJUMP; break;
6900           case 0x09: strcpy(insn[i],"JALR"); type=RJUMP; break;
6901           case 0x0C: strcpy(insn[i],"SYSCALL"); type=SYSCALL; break;
6902           case 0x0D: strcpy(insn[i],"BREAK"); type=OTHER; break;
6903           case 0x0F: strcpy(insn[i],"SYNC"); type=OTHER; break;
6904           case 0x10: strcpy(insn[i],"MFHI"); type=MOV; break;
6905           case 0x11: strcpy(insn[i],"MTHI"); type=MOV; break;
6906           case 0x12: strcpy(insn[i],"MFLO"); type=MOV; break;
6907           case 0x13: strcpy(insn[i],"MTLO"); type=MOV; break;
6908           case 0x18: strcpy(insn[i],"MULT"); type=MULTDIV; break;
6909           case 0x19: strcpy(insn[i],"MULTU"); type=MULTDIV; break;
6910           case 0x1A: strcpy(insn[i],"DIV"); type=MULTDIV; break;
6911           case 0x1B: strcpy(insn[i],"DIVU"); type=MULTDIV; break;
6912           case 0x20: strcpy(insn[i],"ADD"); type=ALU; break;
6913           case 0x21: strcpy(insn[i],"ADDU"); type=ALU; break;
6914           case 0x22: strcpy(insn[i],"SUB"); type=ALU; break;
6915           case 0x23: strcpy(insn[i],"SUBU"); type=ALU; break;
6916           case 0x24: strcpy(insn[i],"AND"); type=ALU; break;
6917           case 0x25: strcpy(insn[i],"OR"); type=ALU; break;
6918           case 0x26: strcpy(insn[i],"XOR"); type=ALU; break;
6919           case 0x27: strcpy(insn[i],"NOR"); type=ALU; break;
6920           case 0x2A: strcpy(insn[i],"SLT"); type=ALU; break;
6921           case 0x2B: strcpy(insn[i],"SLTU"); type=ALU; break;
6922           case 0x30: strcpy(insn[i],"TGE"); type=NI; break;
6923           case 0x31: strcpy(insn[i],"TGEU"); type=NI; break;
6924           case 0x32: strcpy(insn[i],"TLT"); type=NI; break;
6925           case 0x33: strcpy(insn[i],"TLTU"); type=NI; break;
6926           case 0x34: strcpy(insn[i],"TEQ"); type=NI; break;
6927           case 0x36: strcpy(insn[i],"TNE"); type=NI; break;
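          // the block below covers 64-bit (doubleword) MIPS ops that the
          // PS1's R3000A does not have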
6928 #if 0
6929           case 0x14: strcpy(insn[i],"DSLLV"); type=SHIFT; break;
6930           case 0x16: strcpy(insn[i],"DSRLV"); type=SHIFT; break;
6931           case 0x17: strcpy(insn[i],"DSRAV"); type=SHIFT; break;
6932           case 0x1C: strcpy(insn[i],"DMULT"); type=MULTDIV; break;
6933           case 0x1D: strcpy(insn[i],"DMULTU"); type=MULTDIV; break;
6934           case 0x1E: strcpy(insn[i],"DDIV"); type=MULTDIV; break;
6935           case 0x1F: strcpy(insn[i],"DDIVU"); type=MULTDIV; break;
6936           case 0x2C: strcpy(insn[i],"DADD"); type=ALU; break;
6937           case 0x2D: strcpy(insn[i],"DADDU"); type=ALU; break;
6938           case 0x2E: strcpy(insn[i],"DSUB"); type=ALU; break;
6939           case 0x2F: strcpy(insn[i],"DSUBU"); type=ALU; break;
6940           case 0x38: strcpy(insn[i],"DSLL"); type=SHIFTIMM; break;
6941           case 0x3A: strcpy(insn[i],"DSRL"); type=SHIFTIMM; break;
6942           case 0x3B: strcpy(insn[i],"DSRA"); type=SHIFTIMM; break;
6943           case 0x3C: strcpy(insn[i],"DSLL32"); type=SHIFTIMM; break;
6944           case 0x3E: strcpy(insn[i],"DSRL32"); type=SHIFTIMM; break;
6945           case 0x3F: strcpy(insn[i],"DSRA32"); type=SHIFTIMM; break;
6946 #endif
6947         }
6948         break;
6949       case 0x01: strcpy(insn[i],"regimm"); type=NI;
6950         op2=(source[i]>>16)&0x1f;
6951         switch(op2)
6952         {
6953           case 0x00: strcpy(insn[i],"BLTZ"); type=SJUMP; break;
6954           case 0x01: strcpy(insn[i],"BGEZ"); type=SJUMP; break;
6955           case 0x02: strcpy(insn[i],"BLTZL"); type=SJUMP; break;
6956           case 0x03: strcpy(insn[i],"BGEZL"); type=SJUMP; break;
6957           case 0x08: strcpy(insn[i],"TGEI"); type=NI; break;
6958           case 0x09: strcpy(insn[i],"TGEIU"); type=NI; break;
6959           case 0x0A: strcpy(insn[i],"TLTI"); type=NI; break;
6960           case 0x0B: strcpy(insn[i],"TLTIU"); type=NI; break;
6961           case 0x0C: strcpy(insn[i],"TEQI"); type=NI; break;
6962           case 0x0E: strcpy(insn[i],"TNEI"); type=NI; break;
6963           case 0x10: strcpy(insn[i],"BLTZAL"); type=SJUMP; break;
6964           case 0x11: strcpy(insn[i],"BGEZAL"); type=SJUMP; break;
6965           case 0x12: strcpy(insn[i],"BLTZALL"); type=SJUMP; break;
6966           case 0x13: strcpy(insn[i],"BGEZALL"); type=SJUMP; break;
6967         }
6968         break;
6969       case 0x02: strcpy(insn[i],"J"); type=UJUMP; break;
6970       case 0x03: strcpy(insn[i],"JAL"); type=UJUMP; break;
6971       case 0x04: strcpy(insn[i],"BEQ"); type=CJUMP; break;
6972       case 0x05: strcpy(insn[i],"BNE"); type=CJUMP; break;
6973       case 0x06: strcpy(insn[i],"BLEZ"); type=CJUMP; break;
6974       case 0x07: strcpy(insn[i],"BGTZ"); type=CJUMP; break;
6975       case 0x08: strcpy(insn[i],"ADDI"); type=IMM16; break;
6976       case 0x09: strcpy(insn[i],"ADDIU"); type=IMM16; break;
6977       case 0x0A: strcpy(insn[i],"SLTI"); type=IMM16; break;
6978       case 0x0B: strcpy(insn[i],"SLTIU"); type=IMM16; break;
6979       case 0x0C: strcpy(insn[i],"ANDI"); type=IMM16; break;
6980       case 0x0D: strcpy(insn[i],"ORI"); type=IMM16; break;
6981       case 0x0E: strcpy(insn[i],"XORI"); type=IMM16; break;
6982       case 0x0F: strcpy(insn[i],"LUI"); type=IMM16; break;
6983       case 0x10: strcpy(insn[i],"cop0"); type=NI;
6984         op2=(source[i]>>21)&0x1f;
6985         switch(op2)
6986         {
6987           case 0x00: strcpy(insn[i],"MFC0"); type=COP0; break;
6988           case 0x04: strcpy(insn[i],"MTC0"); type=COP0; break;
6989           case 0x10: strcpy(insn[i],"tlb"); type=NI;
6990           switch(source[i]&0x3f)
6991           {
6992             case 0x01: strcpy(insn[i],"TLBR"); type=COP0; break;
6993             case 0x02: strcpy(insn[i],"TLBWI"); type=COP0; break;
6994             case 0x06: strcpy(insn[i],"TLBWR"); type=COP0; break;
6995             case 0x08: strcpy(insn[i],"TLBP"); type=COP0; break;
6996             case 0x10: strcpy(insn[i],"RFE"); type=COP0; break;
6997             //case 0x18: strcpy(insn[i],"ERET"); type=COP0; break;
6998           }
6999         }
7000         break;
7001       case 0x11: strcpy(insn[i],"cop1"); type=NI;
7002         op2=(source[i]>>21)&0x1f;
7003         switch(op2)
7004         {
7005           case 0x00: strcpy(insn[i],"MFC1"); type=COP1; break;
7006           case 0x01: strcpy(insn[i],"DMFC1"); type=COP1; break;
7007           case 0x02: strcpy(insn[i],"CFC1"); type=COP1; break;
7008           case 0x04: strcpy(insn[i],"MTC1"); type=COP1; break;
7009           case 0x05: strcpy(insn[i],"DMTC1"); type=COP1; break;
7010           case 0x06: strcpy(insn[i],"CTC1"); type=COP1; break;
7011           case 0x08: strcpy(insn[i],"BC1"); type=FJUMP;
7012           switch((source[i]>>16)&0x3)
7013           {
7014             case 0x00: strcpy(insn[i],"BC1F"); break;
7015             case 0x01: strcpy(insn[i],"BC1T"); break;
7016             case 0x02: strcpy(insn[i],"BC1FL"); break;
7017             case 0x03: strcpy(insn[i],"BC1TL"); break;
7018           }
7019           break;
7020           case 0x10: strcpy(insn[i],"C1.S"); type=NI;
7021           switch(source[i]&0x3f)
7022           {
7023             case 0x00: strcpy(insn[i],"ADD.S"); type=FLOAT; break;
7024             case 0x01: strcpy(insn[i],"SUB.S"); type=FLOAT; break;
7025             case 0x02: strcpy(insn[i],"MUL.S"); type=FLOAT; break;
7026             case 0x03: strcpy(insn[i],"DIV.S"); type=FLOAT; break;
7027             case 0x04: strcpy(insn[i],"SQRT.S"); type=FLOAT; break;
7028             case 0x05: strcpy(insn[i],"ABS.S"); type=FLOAT; break;
7029             case 0x06: strcpy(insn[i],"MOV.S"); type=FLOAT; break;
7030             case 0x07: strcpy(insn[i],"NEG.S"); type=FLOAT; break;
7031             case 0x08: strcpy(insn[i],"ROUND.L.S"); type=FCONV; break;
7032             case 0x09: strcpy(insn[i],"TRUNC.L.S"); type=FCONV; break;
7033             case 0x0A: strcpy(insn[i],"CEIL.L.S"); type=FCONV; break;
7034             case 0x0B: strcpy(insn[i],"FLOOR.L.S"); type=FCONV; break;
7035             case 0x0C: strcpy(insn[i],"ROUND.W.S"); type=FCONV; break;
7036             case 0x0D: strcpy(insn[i],"TRUNC.W.S"); type=FCONV; break;
7037             case 0x0E: strcpy(insn[i],"CEIL.W.S"); type=FCONV; break;
7038             case 0x0F: strcpy(insn[i],"FLOOR.W.S"); type=FCONV; break;
7039             case 0x21: strcpy(insn[i],"CVT.D.S"); type=FCONV; break;
7040             case 0x24: strcpy(insn[i],"CVT.W.S"); type=FCONV; break;
7041             case 0x25: strcpy(insn[i],"CVT.L.S"); type=FCONV; break;
7042             case 0x30: strcpy(insn[i],"C.F.S"); type=FCOMP; break;
7043             case 0x31: strcpy(insn[i],"C.UN.S"); type=FCOMP; break;
7044             case 0x32: strcpy(insn[i],"C.EQ.S"); type=FCOMP; break;
7045             case 0x33: strcpy(insn[i],"C.UEQ.S"); type=FCOMP; break;
7046             case 0x34: strcpy(insn[i],"C.OLT.S"); type=FCOMP; break;
7047             case 0x35: strcpy(insn[i],"C.ULT.S"); type=FCOMP; break;
7048             case 0x36: strcpy(insn[i],"C.OLE.S"); type=FCOMP; break;
7049             case 0x37: strcpy(insn[i],"C.ULE.S"); type=FCOMP; break;
7050             case 0x38: strcpy(insn[i],"C.SF.S"); type=FCOMP; break;
7051             case 0x39: strcpy(insn[i],"C.NGLE.S"); type=FCOMP; break;
7052             case 0x3A: strcpy(insn[i],"C.SEQ.S"); type=FCOMP; break;
7053             case 0x3B: strcpy(insn[i],"C.NGL.S"); type=FCOMP; break;
7054             case 0x3C: strcpy(insn[i],"C.LT.S"); type=FCOMP; break;
7055             case 0x3D: strcpy(insn[i],"C.NGE.S"); type=FCOMP; break;
7056             case 0x3E: strcpy(insn[i],"C.LE.S"); type=FCOMP; break;
7057             case 0x3F: strcpy(insn[i],"C.NGT.S"); type=FCOMP; break;
7058           }
7059           break;
7060           case 0x11: strcpy(insn[i],"C1.D"); type=NI;
7061           switch(source[i]&0x3f)
7062           {
7063             case 0x00: strcpy(insn[i],"ADD.D"); type=FLOAT; break;
7064             case 0x01: strcpy(insn[i],"SUB.D"); type=FLOAT; break;
7065             case 0x02: strcpy(insn[i],"MUL.D"); type=FLOAT; break;
7066             case 0x03: strcpy(insn[i],"DIV.D"); type=FLOAT; break;
7067             case 0x04: strcpy(insn[i],"SQRT.D"); type=FLOAT; break;
7068             case 0x05: strcpy(insn[i],"ABS.D"); type=FLOAT; break;
7069             case 0x06: strcpy(insn[i],"MOV.D"); type=FLOAT; break;
7070             case 0x07: strcpy(insn[i],"NEG.D"); type=FLOAT; break;
7071             case 0x08: strcpy(insn[i],"ROUND.L.D"); type=FCONV; break;
7072             case 0x09: strcpy(insn[i],"TRUNC.L.D"); type=FCONV; break;
7073             case 0x0A: strcpy(insn[i],"CEIL.L.D"); type=FCONV; break;
7074             case 0x0B: strcpy(insn[i],"FLOOR.L.D"); type=FCONV; break;
7075             case 0x0C: strcpy(insn[i],"ROUND.W.D"); type=FCONV; break;
7076             case 0x0D: strcpy(insn[i],"TRUNC.W.D"); type=FCONV; break;
7077             case 0x0E: strcpy(insn[i],"CEIL.W.D"); type=FCONV; break;
7078             case 0x0F: strcpy(insn[i],"FLOOR.W.D"); type=FCONV; break;
7079             case 0x20: strcpy(insn[i],"CVT.S.D"); type=FCONV; break;
7080             case 0x24: strcpy(insn[i],"CVT.W.D"); type=FCONV; break;
7081             case 0x25: strcpy(insn[i],"CVT.L.D"); type=FCONV; break;
7082             case 0x30: strcpy(insn[i],"C.F.D"); type=FCOMP; break;
7083             case 0x31: strcpy(insn[i],"C.UN.D"); type=FCOMP; break;
7084             case 0x32: strcpy(insn[i],"C.EQ.D"); type=FCOMP; break;
7085             case 0x33: strcpy(insn[i],"C.UEQ.D"); type=FCOMP; break;
7086             case 0x34: strcpy(insn[i],"C.OLT.D"); type=FCOMP; break;
7087             case 0x35: strcpy(insn[i],"C.ULT.D"); type=FCOMP; break;
7088             case 0x36: strcpy(insn[i],"C.OLE.D"); type=FCOMP; break;
7089             case 0x37: strcpy(insn[i],"C.ULE.D"); type=FCOMP; break;
7090             case 0x38: strcpy(insn[i],"C.SF.D"); type=FCOMP; break;
7091             case 0x39: strcpy(insn[i],"C.NGLE.D"); type=FCOMP; break;
7092             case 0x3A: strcpy(insn[i],"C.SEQ.D"); type=FCOMP; break;
7093             case 0x3B: strcpy(insn[i],"C.NGL.D"); type=FCOMP; break;
7094             case 0x3C: strcpy(insn[i],"C.LT.D"); type=FCOMP; break;
7095             case 0x3D: strcpy(insn[i],"C.NGE.D"); type=FCOMP; break;
7096             case 0x3E: strcpy(insn[i],"C.LE.D"); type=FCOMP; break;
7097             case 0x3F: strcpy(insn[i],"C.NGT.D"); type=FCOMP; break;
7098           }
7099           break;
7100           case 0x14: strcpy(insn[i],"C1.W"); type=NI;
7101           switch(source[i]&0x3f)
7102           {
7103             case 0x20: strcpy(insn[i],"CVT.S.W"); type=FCONV; break;
7104             case 0x21: strcpy(insn[i],"CVT.D.W"); type=FCONV; break;
7105           }
7106           break;
7107           case 0x15: strcpy(insn[i],"C1.L"); type=NI;
7108           switch(source[i]&0x3f)
7109           {
7110             case 0x20: strcpy(insn[i],"CVT.S.L"); type=FCONV; break;
7111             case 0x21: strcpy(insn[i],"CVT.D.L"); type=FCONV; break;
7112           }
7113           break;
7114         }
7115         break;
7116 #if 0
7117       case 0x14: strcpy(insn[i],"BEQL"); type=CJUMP; break;
7118       case 0x15: strcpy(insn[i],"BNEL"); type=CJUMP; break;
7119       case 0x16: strcpy(insn[i],"BLEZL"); type=CJUMP; break;
7120       case 0x17: strcpy(insn[i],"BGTZL"); type=CJUMP; break;
7121       case 0x18: strcpy(insn[i],"DADDI"); type=IMM16; break;
7122       case 0x19: strcpy(insn[i],"DADDIU"); type=IMM16; break;
7123       case 0x1A: strcpy(insn[i],"LDL"); type=LOADLR; break;
7124       case 0x1B: strcpy(insn[i],"LDR"); type=LOADLR; break;
7125 #endif
7126       case 0x20: strcpy(insn[i],"LB"); type=LOAD; break;
7127       case 0x21: strcpy(insn[i],"LH"); type=LOAD; break;
7128       case 0x22: strcpy(insn[i],"LWL"); type=LOADLR; break;
7129       case 0x23: strcpy(insn[i],"LW"); type=LOAD; break;
7130       case 0x24: strcpy(insn[i],"LBU"); type=LOAD; break;
7131       case 0x25: strcpy(insn[i],"LHU"); type=LOAD; break;
7132       case 0x26: strcpy(insn[i],"LWR"); type=LOADLR; break;
7133 #if 0
7134       case 0x27: strcpy(insn[i],"LWU"); type=LOAD; break;
7135 #endif
7136       case 0x28: strcpy(insn[i],"SB"); type=STORE; break;
7137       case 0x29: strcpy(insn[i],"SH"); type=STORE; break;
7138       case 0x2A: strcpy(insn[i],"SWL"); type=STORELR; break;
7139       case 0x2B: strcpy(insn[i],"SW"); type=STORE; break;
7140 #if 0
7141       case 0x2C: strcpy(insn[i],"SDL"); type=STORELR; break;
7142       case 0x2D: strcpy(insn[i],"SDR"); type=STORELR; break;
7143 #endif
7144       case 0x2E: strcpy(insn[i],"SWR"); type=STORELR; break;
7145       case 0x2F: strcpy(insn[i],"CACHE"); type=NOP; break;
7146       case 0x30: strcpy(insn[i],"LL"); type=NI; break;
7147       case 0x31: strcpy(insn[i],"LWC1"); type=C1LS; break;
7148 #if 0
7149       case 0x34: strcpy(insn[i],"LLD"); type=NI; break;
7150       case 0x35: strcpy(insn[i],"LDC1"); type=C1LS; break;
7151       case 0x37: strcpy(insn[i],"LD"); type=LOAD; break;
7152 #endif
7153       case 0x38: strcpy(insn[i],"SC"); type=NI; break;
7154       case 0x39: strcpy(insn[i],"SWC1"); type=C1LS; break;
7155 #if 0
7156       case 0x3C: strcpy(insn[i],"SCD"); type=NI; break;
7157       case 0x3D: strcpy(insn[i],"SDC1"); type=C1LS; break;
7158       case 0x3F: strcpy(insn[i],"SD"); type=STORE; break;
7159 #endif
7160       case 0x12: strcpy(insn[i],"COP2"); type=NI;
7161         op2=(source[i]>>21)&0x1f;
7162         //if (op2 & 0x10) {
7163         if (source[i]&0x3f) { // use this hack to support old savestates with patched gte insns
7164           if (gte_handlers[source[i]&0x3f]!=NULL) {
7165             if (gte_regnames[source[i]&0x3f]!=NULL)
7166               strcpy(insn[i],gte_regnames[source[i]&0x3f]);
7167             else
7168               snprintf(insn[i], sizeof(insn[i]), "COP2 %x", source[i]&0x3f);
7169             type=C2OP;
7170           }
7171         }
7172         else switch(op2)
7173         {
7174           case 0x00: strcpy(insn[i],"MFC2"); type=COP2; break;
7175           case 0x02: strcpy(insn[i],"CFC2"); type=COP2; break;
7176           case 0x04: strcpy(insn[i],"MTC2"); type=COP2; break;
7177           case 0x06: strcpy(insn[i],"CTC2"); type=COP2; break;
7178         }
7179         break;
7180       case 0x32: strcpy(insn[i],"LWC2"); type=C2LS; break;
7181       case 0x3A: strcpy(insn[i],"SWC2"); type=C2LS; break;
7182       case 0x3B: strcpy(insn[i],"HLECALL"); type=HLECALL; break;
7183       default: strcpy(insn[i],"???"); type=NI;
7184         SysPrintf("NI %08x @%08x (%08x)\n", source[i], addr + i*4, addr);
7185         break;
7186     }
7187     itype[i]=type;
7188     opcode2[i]=op2;
7189     /* Get registers/immediates */
7190     lt1[i]=0;
7191     us1[i]=0;
7192     us2[i]=0;
7193     dep1[i]=0;
7194     dep2[i]=0;
7195     gte_rs[i]=gte_rt[i]=0;
7196     switch(type) {
7197       case LOAD:
7198         rs1[i]=(source[i]>>21)&0x1f;
7199         rs2[i]=0;
7200         rt1[i]=(source[i]>>16)&0x1f;
7201         rt2[i]=0;
7202         imm[i]=(short)source[i];
7203         break;
7204       case STORE:
7205       case STORELR:
7206         rs1[i]=(source[i]>>21)&0x1f;
7207         rs2[i]=(source[i]>>16)&0x1f;
7208         rt1[i]=0;
7209         rt2[i]=0;
7210         imm[i]=(short)source[i];
7211         if(op==0x2c||op==0x2d||op==0x3f) us1[i]=rs2[i]; // 64-bit SDL/SDR/SD
7212         break;
7213       case LOADLR:
7214         // LWL/LWR only load part of the register,
7215         // therefore the target register must be treated as a source too
7216         rs1[i]=(source[i]>>21)&0x1f;
7217         rs2[i]=(source[i]>>16)&0x1f;
7218         rt1[i]=(source[i]>>16)&0x1f;
7219         rt2[i]=0;
7220         imm[i]=(short)source[i];
7221         if(op==0x1a||op==0x1b) us1[i]=rs2[i]; // LDR/LDL
7222         if(op==0x26) dep1[i]=rt1[i]; // LWR
7223         break;
7224       case IMM16:
7225         if (op==0x0f) rs1[i]=0; // LUI instruction has no source register
7226         else rs1[i]=(source[i]>>21)&0x1f;
7227         rs2[i]=0;
7228         rt1[i]=(source[i]>>16)&0x1f;
7229         rt2[i]=0;
7230         if(op>=0x0c&&op<=0x0e) { // ANDI/ORI/XORI
7231           imm[i]=(unsigned short)source[i];
7232         }else{
7233           imm[i]=(short)source[i];
7234         }
7235         if(op==0x18||op==0x19) us1[i]=rs1[i]; // DADDI/DADDIU
7236         if(op==0x0a||op==0x0b) us1[i]=rs1[i]; // SLTI/SLTIU
7237         if(op==0x0d||op==0x0e) dep1[i]=rs1[i]; // ORI/XORI
7238         break;
7239       case UJUMP:
7240         rs1[i]=0;
7241         rs2[i]=0;
7242         rt1[i]=0;
7243         rt2[i]=0;
7244         // The JAL instruction writes to r31.
7245         if (op&1) {
7246           rt1[i]=31;
7247         }
7248         rs2[i]=CCREG;
7249         break;
7250       case RJUMP:
7251         rs1[i]=(source[i]>>21)&0x1f;
7252         rs2[i]=0;
7253         rt1[i]=0;
7254         rt2[i]=0;
7255         // The JALR instruction writes to rd.
7256         if (op2&1) {
7257           rt1[i]=(source[i]>>11)&0x1f;
7258         }
7259         rs2[i]=CCREG;
7260         break;
7261       case CJUMP:
7262         rs1[i]=(source[i]>>21)&0x1f;
7263         rs2[i]=(source[i]>>16)&0x1f;
7264         rt1[i]=0;
7265         rt2[i]=0;
7266         if(op&2) { // BGTZ/BLEZ
7267           rs2[i]=0;
7268         }
7269         us1[i]=rs1[i];
7270         us2[i]=rs2[i];
7271         likely[i]=op>>4;
7272         break;
7273       case SJUMP:
7274         rs1[i]=(source[i]>>21)&0x1f;
7275         rs2[i]=CCREG;
7276         rt1[i]=0;
7277         rt2[i]=0;
7278         us1[i]=rs1[i];
7279         if(op2&0x10) { // BxxAL
7280           rt1[i]=31;
7281           // NOTE: If the branch is not taken, r31 is still overwritten
7282         }
7283         likely[i]=(op2&2)>>1;
7284         break;
7285       case FJUMP:
7286         rs1[i]=FSREG;
7287         rs2[i]=CSREG;
7288         rt1[i]=0;
7289         rt2[i]=0;
7290         likely[i]=((source[i])>>17)&1;
7291         break;
7292       case ALU:
7293         rs1[i]=(source[i]>>21)&0x1f; // source
7294         rs2[i]=(source[i]>>16)&0x1f; // subtract amount
7295         rt1[i]=(source[i]>>11)&0x1f; // destination
7296         rt2[i]=0;
7297         if(op2==0x2a||op2==0x2b) { // SLT/SLTU
7298           us1[i]=rs1[i];us2[i]=rs2[i];
7299         }
7300         else if(op2>=0x24&&op2<=0x27) { // AND/OR/XOR/NOR
7301           dep1[i]=rs1[i];dep2[i]=rs2[i];
7302         }
7303         else if(op2>=0x2c&&op2<=0x2f) { // DADD/DSUB
7304           dep1[i]=rs1[i];dep2[i]=rs2[i];
7305         }
7306         break;
7307       case MULTDIV:
7308         rs1[i]=(source[i]>>21)&0x1f; // source
7309         rs2[i]=(source[i]>>16)&0x1f; // divisor
7310         rt1[i]=HIREG;
7311         rt2[i]=LOREG;
7312         if (op2>=0x1c&&op2<=0x1f) { // DMULT/DMULTU/DDIV/DDIVU
7313           us1[i]=rs1[i];us2[i]=rs2[i];
7314         }
7315         break;
7316       case MOV:
7317         rs1[i]=0;
7318         rs2[i]=0;
7319         rt1[i]=0;
7320         rt2[i]=0;
7321         if(op2==0x10) rs1[i]=HIREG; // MFHI
7322         if(op2==0x11) rt1[i]=HIREG; // MTHI
7323         if(op2==0x12) rs1[i]=LOREG; // MFLO
7324         if(op2==0x13) rt1[i]=LOREG; // MTLO
7325         if((op2&0x1d)==0x10) rt1[i]=(source[i]>>11)&0x1f; // MFxx
7326         if((op2&0x1d)==0x11) rs1[i]=(source[i]>>21)&0x1f; // MTxx
7327         dep1[i]=rs1[i];
7328         break;
7329       case SHIFT:
7330         rs1[i]=(source[i]>>16)&0x1f; // target of shift
7331         rs2[i]=(source[i]>>21)&0x1f; // shift amount
7332         rt1[i]=(source[i]>>11)&0x1f; // destination
7333         rt2[i]=0;
7334         // DSLLV/DSRLV/DSRAV are 64-bit
7335         if(op2>=0x14&&op2<=0x17) us1[i]=rs1[i];
7336         break;
7337       case SHIFTIMM:
7338         rs1[i]=(source[i]>>16)&0x1f;
7339         rs2[i]=0;
7340         rt1[i]=(source[i]>>11)&0x1f;
7341         rt2[i]=0;
7342         imm[i]=(source[i]>>6)&0x1f;
7343         // DSxx32 instructions
7344         if(op2>=0x3c) imm[i]|=0x20;
7345         // DSLL/DSRL/DSRA/DSRA32/DSRL32 but not DSLL32 require 64-bit source
7346         if(op2>=0x38&&op2!=0x3c) us1[i]=rs1[i];
7347         break;
7348       case COP0:
7349         rs1[i]=0;
7350         rs2[i]=0;
7351         rt1[i]=0;
7352         rt2[i]=0;
7353         if(op2==0) rt1[i]=(source[i]>>16)&0x1F; // MFC0
7354         if(op2==4) rs1[i]=(source[i]>>16)&0x1F; // MTC0
7355         if(op2==4&&((source[i]>>11)&0x1f)==12) rt2[i]=CSREG; // Status
7356         if(op2==16) if((source[i]&0x3f)==0x18) rs2[i]=CCREG; // ERET
7357         break;
7358       case COP1:
7359         rs1[i]=0;
7360         rs2[i]=0;
7361         rt1[i]=0;
7362         rt2[i]=0;
7363         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC1/DMFC1/CFC1
7364         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC1/DMTC1/CTC1
7365         if(op2==5) us1[i]=rs1[i]; // DMTC1
7366         rs2[i]=CSREG;
7367         break;
7368       case COP2:
7369         rs1[i]=0;
7370         rs2[i]=0;
7371         rt1[i]=0;
7372         rt2[i]=0;
7373         if(op2<3) rt1[i]=(source[i]>>16)&0x1F; // MFC2/CFC2
7374         if(op2>3) rs1[i]=(source[i]>>16)&0x1F; // MTC2/CTC2
7375         rs2[i]=CSREG;
7376         int gr=(source[i]>>11)&0x1F;
7377         switch(op2)
7378         {
7379           case 0x00: gte_rs[i]=1ll<<gr; break; // MFC2
7380           case 0x04: gte_rt[i]=1ll<<gr; break; // MTC2
7381           case 0x02: gte_rs[i]=1ll<<(gr+32); break; // CFC2
7382           case 0x06: gte_rt[i]=1ll<<(gr+32); break; // CTC2
7383         }
7384         break;
7385       case C1LS:
7386         rs1[i]=(source[i]>>21)&0x1F;
7387         rs2[i]=CSREG;
7388         rt1[i]=0;
7389         rt2[i]=0;
7390         imm[i]=(short)source[i];
7391         break;
7392       case C2LS:
7393         rs1[i]=(source[i]>>21)&0x1F;
7394         rs2[i]=0;
7395         rt1[i]=0;
7396         rt2[i]=0;
7397         imm[i]=(short)source[i];
7398         if(op==0x32) gte_rt[i]=1ll<<((source[i]>>16)&0x1F); // LWC2
7399         else gte_rs[i]=1ll<<((source[i]>>16)&0x1F); // SWC2
7400         break;
7401       case C2OP:
7402         rs1[i]=0;
7403         rs2[i]=0;
7404         rt1[i]=0;
7405         rt2[i]=0;
7406         gte_rs[i]=gte_reg_reads[source[i]&0x3f];
7407         gte_rt[i]=gte_reg_writes[source[i]&0x3f];
7408         gte_rt[i]|=1ll<<63; // every op changes flags
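        // MVMVA only reads the short vector selected by its "v" field
        // (or IR1-IR3 when v==3), so narrow the generic per-op read mask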
7409         if((source[i]&0x3f)==GTE_MVMVA) {
7410           int v = (source[i] >> 15) & 3;
7411           gte_rs[i]&=~0xe3fll;
7412           if(v==3) gte_rs[i]|=0xe00ll;
7413           else gte_rs[i]|=3ll<<(v*2);
7414         }
7415         break;
7416       case FLOAT:
7417       case FCONV:
7418         rs1[i]=0;
7419         rs2[i]=CSREG;
7420         rt1[i]=0;
7421         rt2[i]=0;
7422         break;
7423       case FCOMP:
7424         rs1[i]=FSREG;
7425         rs2[i]=CSREG;
7426         rt1[i]=FSREG;
7427         rt2[i]=0;
7428         break;
7429       case SYSCALL:
7430       case HLECALL:
7431       case INTCALL:
7432         rs1[i]=CCREG;
7433         rs2[i]=0;
7434         rt1[i]=0;
7435         rt2[i]=0;
7436         break;
7437       default:
7438         rs1[i]=0;
7439         rs2[i]=0;
7440         rt1[i]=0;
7441         rt2[i]=0;
7442     }
7443     /* Calculate branch target addresses */
7444     if(type==UJUMP)
7445       ba[i]=((start+i*4+4)&0xF0000000)|(((unsigned int)source[i]<<6)>>4);
7446     else if(type==CJUMP&&rs1[i]==rs2[i]&&(op&1))
7447       ba[i]=start+i*4+8; // Ignore never taken branch
7448     else if(type==SJUMP&&rs1[i]==0&&!(op2&1))
7449       ba[i]=start+i*4+8; // Ignore never taken branch
7450     else if(type==CJUMP||type==SJUMP||type==FJUMP)
7451       ba[i]=start+i*4+4+((signed int)((unsigned int)source[i]<<16)>>14);
7452     else ba[i]=-1;
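    // The expressions above follow the MIPS encodings:
    //   J/JAL:    ba = ((PC+4) & 0xF0000000) | (target26 << 2)
    //             ((u_int)source<<6)>>4  ==  (source & 0x03ffffff) << 2
    //   branches: ba = (PC+4) + (sign-extended imm16 << 2)
    //             ((int)((u_int)source<<16))>>14 sign-extends and scales by 4
    // Never-taken forms (e.g. bne r,r or bltz $zero) just fall through (PC+8).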
7453     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
7454       int do_in_intrp=0;
7455       // branch in delay slot?
7456       if(type==RJUMP||type==UJUMP||type==CJUMP||type==SJUMP||type==FJUMP) {
7457         // don't compile past the outer branch; let the interpreter handle it when it's hit
7458         SysPrintf("branch in delay slot @%08x (%08x)\n", addr + i*4, addr);
7459         do_in_intrp=1;
7460       }
7461       // basic load delay detection
7462       else if((type==LOAD||type==LOADLR||type==COP0||type==COP2||type==C2LS)&&rt1[i]!=0) {
7463         int t=(ba[i-1]-start)/4;
7464         if(0 <= t && t < i &&(rt1[i]==rs1[t]||rt1[i]==rs2[t])&&itype[t]!=CJUMP&&itype[t]!=SJUMP) {
7465           // jump target wants DS result - potential load delay effect
7466           SysPrintf("load delay @%08x (%08x)\n", addr + i*4, addr);
7467           do_in_intrp=1;
7468           bt[t+1]=1; // expected return from interpreter
7469         }
7470         else if(i>=2&&rt1[i-2]==2&&rt1[i]==2&&rs1[i]!=2&&rs2[i]!=2&&rs1[i-1]!=2&&rs2[i-1]!=2&&
7471               !(i>=3&&(itype[i-3]==RJUMP||itype[i-3]==UJUMP||itype[i-3]==CJUMP||itype[i-3]==SJUMP))) {
7472           // v0 overwrite like this is a sign of trouble, bail out
7473           SysPrintf("v0 overwrite @%08x (%08x)\n", addr + i*4, addr);
7474           do_in_intrp=1;
7475         }
7476       }
7477       if(do_in_intrp) {
7478         rs1[i-1]=CCREG;
7479         rs2[i-1]=rt1[i-1]=rt2[i-1]=0;
7480         ba[i-1]=-1;
7481         itype[i-1]=INTCALL;
7482         done=2;
7483         i--; // don't compile the DS
7484       }
7485     }
7486     /* Is this the end of the block? */
7487     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)) {
7488       if(rt1[i-1]==0) { // Continue past subroutine call (JAL)
7489         done=2;
7490       }
7491       else {
7492         if(stop_after_jal) done=1;
7493         // Stop on BREAK
7494         if((source[i+1]&0xfc00003f)==0x0d) done=1;
7495       }
7496       // Don't recompile stuff that's already compiled
7497       if(check_addr(start+i*4+4)) done=1;
7498       // Don't get too close to the limit
7499       if(i>MAXBLOCK/2) done=1;
7500     }
7501     if(itype[i]==SYSCALL&&stop_after_jal) done=1;
7502     if(itype[i]==HLECALL||itype[i]==INTCALL) done=2;
7503     if(done==2) {
7504       // Does the block continue due to a branch?
7505       for(j=i-1;j>=0;j--)
7506       {
7507         if(ba[j]==start+i*4) done=j=0; // Branch into delay slot
7508         if(ba[j]==start+i*4+4) done=j=0;
7509         if(ba[j]==start+i*4+8) done=j=0;
7510       }
7511     }
7512     //assert(i<MAXBLOCK-1);
7513     if(start+i*4==pagelimit-4) done=1;
7514     assert(start+i*4<pagelimit);
7515     if (i==MAXBLOCK-1) done=1;
7516     // Stop if we're compiling junk
7517     if(itype[i]==NI&&opcode[i]==0x11) {
7518       done=stop_after_jal=1;
7519       SysPrintf("Disabled speculative precompilation\n");
7520     }
7521   }
7522   slen=i;
7523   if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==RJUMP||itype[i-1]==FJUMP) {
7524     if(start+i*4==pagelimit) {
7525       itype[i-1]=SPAN;
7526     }
7527   }
7528   assert(slen>0);
7529
7530   /* Pass 2 - Register dependencies and branch targets */
7531
7532   unneeded_registers(0,slen-1,0);
7533
7534   /* Pass 3 - Register allocation */
7535
7536   struct regstat current; // Current register allocations/status
7537   current.is32=1;
7538   current.dirty=0;
7539   current.u=unneeded_reg[0];
7540   current.uu=unneeded_reg_upper[0];
7541   clear_all_regs(current.regmap);
7542   alloc_reg(&current,0,CCREG);
7543   dirty_reg(&current,CCREG);
7544   current.isconst=0;
7545   current.wasconst=0;
7546   current.waswritten=0;
7547   int ds=0;
7548   int cc=0;
7549   int hr=-1;
7550
7551   if((u_int)addr&1) {
7552     // First instruction is delay slot
7553     cc=-1;
7554     bt[1]=1;
7555     ds=1;
7556     unneeded_reg[0]=1;
7557     unneeded_reg_upper[0]=1;
7558     current.regmap[HOST_BTREG]=BTREG;
7559   }
7560
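  // Walk the block once, carrying the running allocation state in
  // 'current'; regs[i] snapshots the register map, dirty and const state
  // around each instruction for use by the later passes.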
7561   for(i=0;i<slen;i++)
7562   {
7563     if(bt[i])
7564     {
7565       int hr;
7566       for(hr=0;hr<HOST_REGS;hr++)
7567       {
7568         // Is this really necessary?
7569         if(current.regmap[hr]==0) current.regmap[hr]=-1;
7570       }
7571       current.isconst=0;
7572       current.waswritten=0;
7573     }
7574     if(i>1)
7575     {
7576       if((opcode[i-2]&0x2f)==0x05) // BNE/BNEL
7577       {
7578         if(rs1[i-2]==0||rs2[i-2]==0)
7579         {
7580           if(rs1[i-2]) {
7581             current.is32|=1LL<<rs1[i-2];
7582             int hr=get_reg(current.regmap,rs1[i-2]|64);
7583             if(hr>=0) current.regmap[hr]=-1;
7584           }
7585           if(rs2[i-2]) {
7586             current.is32|=1LL<<rs2[i-2];
7587             int hr=get_reg(current.regmap,rs2[i-2]|64);
7588             if(hr>=0) current.regmap[hr]=-1;
7589           }
7590         }
7591       }
7592     }
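    // the PSX R3000A is a pure 32-bit CPU, so the 64-bit value tracking
    // is effectively disabled by marking every register as 32-bit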
7593     current.is32=-1LL;
7594
7595     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
7596     regs[i].wasconst=current.isconst;
7597     regs[i].was32=current.is32;
7598     regs[i].wasdirty=current.dirty;
7599     regs[i].loadedconst=0;
7600     if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
7601       if(i+1<slen) {
7602         current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
7603         current.uu=unneeded_reg_upper[i+1]&~((1LL<<us1[i])|(1LL<<us2[i]));
7604         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
7605         current.u|=1;
7606         current.uu|=1;
7607       } else {
7608         current.u=1;
7609         current.uu=1;
7610       }
7611     } else {
7612       if(i+1<slen) {
7613         current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
7614         current.uu=branch_unneeded_reg_upper[i]&~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
7615         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
7616         current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7617         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
7618         current.u|=1;
7619         current.uu|=1;
7620       } else { SysPrintf("oops, branch at end of block with no delay slot\n");exit(1); }
7621     }
7622     is_ds[i]=ds;
7623     if(ds) {
7624       ds=0; // Skip delay slot, already allocated as part of branch
7625       // ...but we need to alloc it in case something jumps here
7626       if(i+1<slen) {
7627         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
7628         current.uu=branch_unneeded_reg_upper[i-1]&unneeded_reg_upper[i+1];
7629       }else{
7630         current.u=branch_unneeded_reg[i-1];
7631         current.uu=branch_unneeded_reg_upper[i-1];
7632       }
7633       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
7634       current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
7635       if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
7636       current.u|=1;
7637       current.uu|=1;
7638       struct regstat temp;
7639       memcpy(&temp,&current,sizeof(current));
7640       temp.wasdirty=temp.dirty;
7641       temp.was32=temp.is32;
7642       // TODO: Take into account unconditional branches, as below
7643       delayslot_alloc(&temp,i);
7644       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
7645       regs[i].wasdirty=temp.wasdirty;
7646       regs[i].was32=temp.was32;
7647       regs[i].dirty=temp.dirty;
7648       regs[i].is32=temp.is32;
7649       regs[i].isconst=0;
7650       regs[i].wasconst=0;
7651       current.isconst=0;
7652       // Create entry (branch target) regmap
7653       for(hr=0;hr<HOST_REGS;hr++)
7654       {
7655         int r=temp.regmap[hr];
7656         if(r>=0) {
7657           if(r!=regmap_pre[i][hr]) {
7658             regs[i].regmap_entry[hr]=-1;
7659           }
7660           else
7661           {
7662             if(r<64){
7663               if((current.u>>r)&1) {
7664                 regs[i].regmap_entry[hr]=-1;
7665                 regs[i].regmap[hr]=-1;
7666                 //Don't clear regs in the delay slot as the branch might need them
7667                 //current.regmap[hr]=-1;
7668               }else
7669                 regs[i].regmap_entry[hr]=r;
7670             }
7671             else {
7672               if((current.uu>>(r&63))&1) {
7673                 regs[i].regmap_entry[hr]=-1;
7674                 regs[i].regmap[hr]=-1;
7675                 //Don't clear regs in the delay slot as the branch might need them
7676                 //current.regmap[hr]=-1;
7677               }else
7678                 regs[i].regmap_entry[hr]=r;
7679             }
7680           }
7681         } else {
7682           // First instruction expects CCREG to be allocated
7683           if(i==0&&hr==HOST_CCREG)
7684             regs[i].regmap_entry[hr]=CCREG;
7685           else
7686             regs[i].regmap_entry[hr]=-1;
7687         }
7688       }
7689     }
7690     else { // Not delay slot
7691       switch(itype[i]) {
7692         case UJUMP:
7693           //current.isconst=0; // DEBUG
7694           //current.wasconst=0; // DEBUG
7695           //regs[i].wasconst=0; // DEBUG
7696           clear_const(&current,rt1[i]);
7697           alloc_cc(&current,i);
7698           dirty_reg(&current,CCREG);
7699           if (rt1[i]==31) {
7700             alloc_reg(&current,i,31);
7701             dirty_reg(&current,31);
7702             //assert(rs1[i+1]!=31&&rs2[i+1]!=31);
7703             //assert(rt1[i+1]!=rt1[i]);
7704             #ifdef REG_PREFETCH
7705             alloc_reg(&current,i,PTEMP);
7706             #endif
7707             //current.is32|=1LL<<rt1[i];
7708           }
7709           ooo[i]=1;
7710           delayslot_alloc(&current,i+1);
7711           //current.isconst=0; // DEBUG
7712           ds=1;
7713           //printf("i=%d, isconst=%x\n",i,current.isconst);
7714           break;
7715         case RJUMP:
7716           //current.isconst=0;
7717           //current.wasconst=0;
7718           //regs[i].wasconst=0;
7719           clear_const(&current,rs1[i]);
7720           clear_const(&current,rt1[i]);
7721           alloc_cc(&current,i);
7722           dirty_reg(&current,CCREG);
7723           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
7724             alloc_reg(&current,i,rs1[i]);
7725             if (rt1[i]!=0) {
7726               alloc_reg(&current,i,rt1[i]);
7727               dirty_reg(&current,rt1[i]);
7728               assert(rs1[i+1]!=rt1[i]&&rs2[i+1]!=rt1[i]);
7729               assert(rt1[i+1]!=rt1[i]);
7730               #ifdef REG_PREFETCH
7731               alloc_reg(&current,i,PTEMP);
7732               #endif
7733             }
7734             #ifdef USE_MINI_HT
7735             if(rs1[i]==31) { // JALR
7736               alloc_reg(&current,i,RHASH);
7737               alloc_reg(&current,i,RHTBL);
7738             }
7739             #endif
7740             delayslot_alloc(&current,i+1);
7741           } else {
7742             // The delay slot overwrites our source register,
7743             // allocate a temporary register to hold the old value.
7744             current.isconst=0;
7745             current.wasconst=0;
7746             regs[i].wasconst=0;
7747             delayslot_alloc(&current,i+1);
7748             current.isconst=0;
7749             alloc_reg(&current,i,RTEMP);
7750           }
7751           //current.isconst=0; // DEBUG
7752           ooo[i]=1;
7753           ds=1;
7754           break;
7755         case CJUMP:
7756           //current.isconst=0;
7757           //current.wasconst=0;
7758           //regs[i].wasconst=0;
7759           clear_const(&current,rs1[i]);
7760           clear_const(&current,rs2[i]);
7761           if((opcode[i]&0x3E)==4) // BEQ/BNE
7762           {
7763             alloc_cc(&current,i);
7764             dirty_reg(&current,CCREG);
7765             if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7766             if(rs2[i]) alloc_reg(&current,i,rs2[i]);
7767             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
7768             {
7769               if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
7770               if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
7771             }
7772             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]))||
7773                (rs2[i]&&(rs2[i]==rt1[i+1]||rs2[i]==rt2[i+1]))) {
7774               // The delay slot overwrites one of our conditions.
7775               // Allocate the branch condition registers instead.
7776               current.isconst=0;
7777               current.wasconst=0;
7778               regs[i].wasconst=0;
7779               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7780               if(rs2[i]) alloc_reg(&current,i,rs2[i]);
7781               if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
7782               {
7783                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
7784                 if(rs2[i]) alloc_reg64(&current,i,rs2[i]);
7785               }
7786             }
7787             else
7788             {
7789               ooo[i]=1;
7790               delayslot_alloc(&current,i+1);
7791             }
7792           }
7793           else
7794           if((opcode[i]&0x3E)==6) // BLEZ/BGTZ
7795           {
7796             alloc_cc(&current,i);
7797             dirty_reg(&current,CCREG);
7798             alloc_reg(&current,i,rs1[i]);
7799             if(!(current.is32>>rs1[i]&1))
7800             {
7801               alloc_reg64(&current,i,rs1[i]);
7802             }
7803             if(rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) {
7804               // The delay slot overwrites one of our conditions.
7805               // Allocate the branch condition registers instead.
7806               current.isconst=0;
7807               current.wasconst=0;
7808               regs[i].wasconst=0;
7809               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7810               if(!((current.is32>>rs1[i])&1))
7811               {
7812                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
7813               }
7814             }
7815             else
7816             {
7817               ooo[i]=1;
7818               delayslot_alloc(&current,i+1);
7819             }
7820           }
7821           else
7822           // Don't alloc the delay slot yet because we might not execute it
7823           if((opcode[i]&0x3E)==0x14) // BEQL/BNEL
7824           {
7825             current.isconst=0;
7826             current.wasconst=0;
7827             regs[i].wasconst=0;
7828             alloc_cc(&current,i);
7829             dirty_reg(&current,CCREG);
7830             alloc_reg(&current,i,rs1[i]);
7831             alloc_reg(&current,i,rs2[i]);
7832             if(!((current.is32>>rs1[i])&(current.is32>>rs2[i])&1))
7833             {
7834               alloc_reg64(&current,i,rs1[i]);
7835               alloc_reg64(&current,i,rs2[i]);
7836             }
7837           }
7838           else
7839           if((opcode[i]&0x3E)==0x16) // BLEZL/BGTZL
7840           {
7841             current.isconst=0;
7842             current.wasconst=0;
7843             regs[i].wasconst=0;
7844             alloc_cc(&current,i);
7845             dirty_reg(&current,CCREG);
7846             alloc_reg(&current,i,rs1[i]);
7847             if(!(current.is32>>rs1[i]&1))
7848             {
7849               alloc_reg64(&current,i,rs1[i]);
7850             }
7851           }
7852           ds=1;
7853           //current.isconst=0;
7854           break;
7855         case SJUMP:
7856           //current.isconst=0;
7857           //current.wasconst=0;
7858           //regs[i].wasconst=0;
7859           clear_const(&current,rs1[i]);
7860           clear_const(&current,rt1[i]);
7861           //if((opcode2[i]&0x1E)==0x0) // BLTZ/BGEZ
7862           if((opcode2[i]&0x0E)==0x0) // BLTZ/BGEZ
7863           {
7864             alloc_cc(&current,i);
7865             dirty_reg(&current,CCREG);
7866             alloc_reg(&current,i,rs1[i]);
7867             if(!(current.is32>>rs1[i]&1))
7868             {
7869               alloc_reg64(&current,i,rs1[i]);
7870             }
7871             if (rt1[i]==31) { // BLTZAL/BGEZAL
7872               alloc_reg(&current,i,31);
7873               dirty_reg(&current,31);
7874               //#ifdef REG_PREFETCH
7875               //alloc_reg(&current,i,PTEMP);
7876               //#endif
7877               //current.is32|=1LL<<rt1[i];
7878             }
7879             if((rs1[i]&&(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1])) // The delay slot overwrites the branch condition.
7880                ||(rt1[i]==31&&(rs1[i+1]==31||rs2[i+1]==31||rt1[i+1]==31||rt2[i+1]==31))) { // DS touches $ra
7881               // Allocate the branch condition registers instead.
7882               current.isconst=0;
7883               current.wasconst=0;
7884               regs[i].wasconst=0;
7885               if(rs1[i]) alloc_reg(&current,i,rs1[i]);
7886               if(!((current.is32>>rs1[i])&1))
7887               {
7888                 if(rs1[i]) alloc_reg64(&current,i,rs1[i]);
7889               }
7890             }
7891             else
7892             {
7893               ooo[i]=1;
7894               delayslot_alloc(&current,i+1);
7895             }
7896           }
7897           else
7898           // Don't alloc the delay slot yet because we might not execute it
7899           if((opcode2[i]&0x1E)==0x2) // BLTZL/BGEZL
7900           {
7901             current.isconst=0;
7902             current.wasconst=0;
7903             regs[i].wasconst=0;
7904             alloc_cc(&current,i);
7905             dirty_reg(&current,CCREG);
7906             alloc_reg(&current,i,rs1[i]);
7907             if(!(current.is32>>rs1[i]&1))
7908             {
7909               alloc_reg64(&current,i,rs1[i]);
7910             }
7911           }
7912           ds=1;
7913           //current.isconst=0;
7914           break;
7915         case FJUMP:
7916           current.isconst=0;
7917           current.wasconst=0;
7918           regs[i].wasconst=0;
7919           if(likely[i]==0) // BC1F/BC1T
7920           {
7921             // TODO: Theoretically we can run out of registers here on x86.
7922             // The delay slot can allocate up to six, and we need to check
7923             // CSREG before executing the delay slot.  Possibly we can drop
7924             // the cycle count and then reload it after checking that the
7925             // FPU is in a usable state, or not do out-of-order execution.
7926             alloc_cc(&current,i);
7927             dirty_reg(&current,CCREG);
7928             alloc_reg(&current,i,FSREG);
7929             alloc_reg(&current,i,CSREG);
7930             if(itype[i+1]==FCOMP) {
7931               // The delay slot overwrites the branch condition.
7932               // Allocate the branch condition registers instead.
7933               alloc_cc(&current,i);
7934               dirty_reg(&current,CCREG);
7935               alloc_reg(&current,i,CSREG);
7936               alloc_reg(&current,i,FSREG);
7937             }
7938             else {
7939               ooo[i]=1;
7940               delayslot_alloc(&current,i+1);
7941               alloc_reg(&current,i+1,CSREG);
7942             }
7943           }
7944           else
7945           // Don't alloc the delay slot yet because we might not execute it
7946           if(likely[i]) // BC1FL/BC1TL
7947           {
7948             alloc_cc(&current,i);
7949             dirty_reg(&current,CCREG);
7950             alloc_reg(&current,i,CSREG);
7951             alloc_reg(&current,i,FSREG);
7952           }
7953           ds=1;
7954           current.isconst=0;
7955           break;
7956         case IMM16:
7957           imm16_alloc(&current,i);
7958           break;
7959         case LOAD:
7960         case LOADLR:
7961           load_alloc(&current,i);
7962           break;
7963         case STORE:
7964         case STORELR:
7965           store_alloc(&current,i);
7966           break;
7967         case ALU:
7968           alu_alloc(&current,i);
7969           break;
7970         case SHIFT:
7971           shift_alloc(&current,i);
7972           break;
7973         case MULTDIV:
7974           multdiv_alloc(&current,i);
7975           break;
7976         case SHIFTIMM:
7977           shiftimm_alloc(&current,i);
7978           break;
7979         case MOV:
7980           mov_alloc(&current,i);
7981           break;
7982         case COP0:
7983           cop0_alloc(&current,i);
7984           break;
7985         case COP1:
7986         case COP2:
7987           cop1_alloc(&current,i);
7988           break;
7989         case C1LS:
7990           c1ls_alloc(&current,i);
7991           break;
7992         case C2LS:
7993           c2ls_alloc(&current,i);
7994           break;
7995         case C2OP:
7996           c2op_alloc(&current,i);
7997           break;
7998         case FCONV:
7999           fconv_alloc(&current,i);
8000           break;
8001         case FLOAT:
8002           float_alloc(&current,i);
8003           break;
8004         case FCOMP:
8005           fcomp_alloc(&current,i);
8006           break;
8007         case SYSCALL:
8008         case HLECALL:
8009         case INTCALL:
8010           syscall_alloc(&current,i);
8011           break;
8012         case SPAN:
8013           pagespan_alloc(&current,i);
8014           break;
8015       }
8016
8017       // Drop the upper half of registers that have become 32-bit
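           // Bookkeeping for 64-bit register halves: current.is32 marks MIPS
           // registers known to hold sign-extended 32-bit values, and
           // current.uu marks registers whose upper 32 bits are not needed.
           // Destinations that are now 32-bit get their upper halves flagged
           // unneeded; the bits for us1/us2 (and, when the destination's upper
           // half is still needed, dep1/dep2) are cleared again because those
           // upper halves are read.  For a branch the delay slot (i+1) is
           // included, and bit 0 ($zero) is always left set.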
8018       current.uu|=current.is32&((1LL<<rt1[i])|(1LL<<rt2[i]));
8019       if(itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP&&itype[i]!=RJUMP&&itype[i]!=FJUMP) {
8020         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8021         if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8022         current.uu|=1;
8023       } else {
8024         current.uu|=current.is32&((1LL<<rt1[i+1])|(1LL<<rt2[i+1]));
8025         current.uu&=~((1LL<<us1[i+1])|(1LL<<us2[i+1]));
8026         if((~current.uu>>rt1[i+1])&1) current.uu&=~((1LL<<dep1[i+1])|(1LL<<dep2[i+1]));
8027         current.uu&=~((1LL<<us1[i])|(1LL<<us2[i]));
8028         current.uu|=1;
8029       }
8030
8031       // Create entry (branch target) regmap
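           // regs[i].regmap_entry[hr] records which MIPS register must already
           // be sitting in host register hr when this instruction is entered
           // from elsewhere (e.g. as a branch target); -1 means nothing in
           // particular is expected.  Freshly loaded values, temporaries and
           // unneeded registers get -1, so a jump to this point does not have
           // to provide them; a value that only moved here from another host
           // register is still expected on entry.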
8032       for(hr=0;hr<HOST_REGS;hr++)
8033       {
8034         int r,or;
8035         r=current.regmap[hr];
8036         if(r>=0) {
8037           if(r!=regmap_pre[i][hr]) {
8038             // TODO: delay slot (?)
8039             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
8040             if(or<0||(r&63)>=TEMPREG){
8041               regs[i].regmap_entry[hr]=-1;
8042             }
8043             else
8044             {
8045               // Just move it to a different register
8046               regs[i].regmap_entry[hr]=r;
8047               // If it was dirty before, it's still dirty
8048               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
8049             }
8050           }
8051           else
8052           {
8053             // Unneeded
8054             if(r==0){
8055               regs[i].regmap_entry[hr]=0;
8056             }
8057             else
8058             if(r<64){
8059               if((current.u>>r)&1) {
8060                 regs[i].regmap_entry[hr]=-1;
8061                 //regs[i].regmap[hr]=-1;
8062                 current.regmap[hr]=-1;
8063               }else
8064                 regs[i].regmap_entry[hr]=r;
8065             }
8066             else {
8067               if((current.uu>>(r&63))&1) {
8068                 regs[i].regmap_entry[hr]=-1;
8069                 //regs[i].regmap[hr]=-1;
8070                 current.regmap[hr]=-1;
8071               }else
8072                 regs[i].regmap_entry[hr]=r;
8073             }
8074           }
8075         } else {
8076           // Branches expect CCREG to be allocated at the target
8077           if(regmap_pre[i][hr]==CCREG)
8078             regs[i].regmap_entry[hr]=CCREG;
8079           else
8080             regs[i].regmap_entry[hr]=-1;
8081         }
8082       }
8083       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
8084     }
8085
8086     if(i>0&&(itype[i-1]==STORE||itype[i-1]==STORELR||(itype[i-1]==C2LS&&opcode[i-1]==0x3a))&&(u_int)imm[i-1]<0x800)
8087       current.waswritten|=1<<rs1[i-1];
8088     current.waswritten&=~(1<<rt1[i]);
8089     current.waswritten&=~(1<<rt2[i]);
8090     if((itype[i]==STORE||itype[i]==STORELR||(itype[i]==C2LS&&opcode[i]==0x3a))&&(u_int)imm[i]>=0x800)
8091       current.waswritten&=~(1<<rs1[i]);
8092
8093     /* Branch post-alloc */
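         // When the previous instruction was a branch, fill in branch_regs[i-1]:
         // the register state used by that branch and its taken path.  It starts
         // as a copy of the current state, then allocates the cycle count
         // (CCREG), the link register for JAL/JALR/BxxZAL, and either the branch
         // condition registers or the delay slot, depending on the branch type;
         // for "likely" branches the delay slot is allocated only on the taken
         // path.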
8094     if(i>0)
8095     {
8096       current.was32=current.is32;
8097       current.wasdirty=current.dirty;
8098       switch(itype[i-1]) {
8099         case UJUMP:
8100           memcpy(&branch_regs[i-1],&current,sizeof(current));
8101           branch_regs[i-1].isconst=0;
8102           branch_regs[i-1].wasconst=0;
8103           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8104           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8105           alloc_cc(&branch_regs[i-1],i-1);
8106           dirty_reg(&branch_regs[i-1],CCREG);
8107           if(rt1[i-1]==31) { // JAL
8108             alloc_reg(&branch_regs[i-1],i-1,31);
8109             dirty_reg(&branch_regs[i-1],31);
8110             branch_regs[i-1].is32|=1LL<<31;
8111           }
8112           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8113           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8114           break;
8115         case RJUMP:
8116           memcpy(&branch_regs[i-1],&current,sizeof(current));
8117           branch_regs[i-1].isconst=0;
8118           branch_regs[i-1].wasconst=0;
8119           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8120           branch_regs[i-1].uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8121           alloc_cc(&branch_regs[i-1],i-1);
8122           dirty_reg(&branch_regs[i-1],CCREG);
8123           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
8124           if(rt1[i-1]!=0) { // JALR
8125             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
8126             dirty_reg(&branch_regs[i-1],rt1[i-1]);
8127             branch_regs[i-1].is32|=1LL<<rt1[i-1];
8128           }
8129           #ifdef USE_MINI_HT
8130           if(rs1[i-1]==31) { // JALR
8131             alloc_reg(&branch_regs[i-1],i-1,RHASH);
8132             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
8133           }
8134           #endif
8135           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8136           memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8137           break;
8138         case CJUMP:
8139           if((opcode[i-1]&0x3E)==4) // BEQ/BNE
8140           {
8141             alloc_cc(&current,i-1);
8142             dirty_reg(&current,CCREG);
8143             if((rs1[i-1]&&(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]))||
8144                (rs2[i-1]&&(rs2[i-1]==rt1[i]||rs2[i-1]==rt2[i]))) {
8145               // The delay slot overwrote one of our conditions
8146               // Delay slot goes after the test (in order)
8147               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8148               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8149               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8150               current.u|=1;
8151               current.uu|=1;
8152               delayslot_alloc(&current,i);
8153               current.isconst=0;
8154             }
8155             else
8156             {
8157               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
8158               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i-1])|(1LL<<us2[i-1]));
8159               // Alloc the branch condition registers
8160               if(rs1[i-1]) alloc_reg(&current,i-1,rs1[i-1]);
8161               if(rs2[i-1]) alloc_reg(&current,i-1,rs2[i-1]);
8162               if(!((current.is32>>rs1[i-1])&(current.is32>>rs2[i-1])&1))
8163               {
8164                 if(rs1[i-1]) alloc_reg64(&current,i-1,rs1[i-1]);
8165                 if(rs2[i-1]) alloc_reg64(&current,i-1,rs2[i-1]);
8166               }
8167             }
8168             memcpy(&branch_regs[i-1],&current,sizeof(current));
8169             branch_regs[i-1].isconst=0;
8170             branch_regs[i-1].wasconst=0;
8171             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8172             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8173           }
8174           else
8175           if((opcode[i-1]&0x3E)==6) // BLEZ/BGTZ
8176           {
8177             alloc_cc(&current,i-1);
8178             dirty_reg(&current,CCREG);
8179             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8180               // The delay slot overwrote the branch condition
8181               // Delay slot goes after the test (in order)
8182               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8183               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8184               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8185               current.u|=1;
8186               current.uu|=1;
8187               delayslot_alloc(&current,i);
8188               current.isconst=0;
8189             }
8190             else
8191             {
8192               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8193               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8194               // Alloc the branch condition register
8195               alloc_reg(&current,i-1,rs1[i-1]);
8196               if(!(current.is32>>rs1[i-1]&1))
8197               {
8198                 alloc_reg64(&current,i-1,rs1[i-1]);
8199               }
8200             }
8201             memcpy(&branch_regs[i-1],&current,sizeof(current));
8202             branch_regs[i-1].isconst=0;
8203             branch_regs[i-1].wasconst=0;
8204             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8205             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8206           }
8207           else
8208           // Alloc the delay slot in case the branch is taken
8209           if((opcode[i-1]&0x3E)==0x14) // BEQL/BNEL
8210           {
8211             memcpy(&branch_regs[i-1],&current,sizeof(current));
8212             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8213             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8214             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8215             alloc_cc(&branch_regs[i-1],i);
8216             dirty_reg(&branch_regs[i-1],CCREG);
8217             delayslot_alloc(&branch_regs[i-1],i);
8218             branch_regs[i-1].isconst=0;
8219             alloc_reg(&current,i,CCREG); // Not taken path
8220             dirty_reg(&current,CCREG);
8221             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8222           }
8223           else
8224           if((opcode[i-1]&0x3E)==0x16) // BLEZL/BGTZL
8225           {
8226             memcpy(&branch_regs[i-1],&current,sizeof(current));
8227             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8228             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8229             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8230             alloc_cc(&branch_regs[i-1],i);
8231             dirty_reg(&branch_regs[i-1],CCREG);
8232             delayslot_alloc(&branch_regs[i-1],i);
8233             branch_regs[i-1].isconst=0;
8234             alloc_reg(&current,i,CCREG); // Not taken path
8235             dirty_reg(&current,CCREG);
8236             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8237           }
8238           break;
8239         case SJUMP:
8240           //if((opcode2[i-1]&0x1E)==0) // BLTZ/BGEZ
8241           if((opcode2[i-1]&0x0E)==0) // BLTZ/BGEZ
8242           {
8243             alloc_cc(&current,i-1);
8244             dirty_reg(&current,CCREG);
8245             if(rs1[i-1]==rt1[i]||rs1[i-1]==rt2[i]) {
8246               // The delay slot overwrote the branch condition
8247               // Delay slot goes after the test (in order)
8248               current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
8249               current.uu=branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i]));
8250               if((~current.uu>>rt1[i])&1) current.uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]));
8251               current.u|=1;
8252               current.uu|=1;
8253               delayslot_alloc(&current,i);
8254               current.isconst=0;
8255             }
8256             else
8257             {
8258               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8259               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8260               // Alloc the branch condition register
8261               alloc_reg(&current,i-1,rs1[i-1]);
8262               if(!(current.is32>>rs1[i-1]&1))
8263               {
8264                 alloc_reg64(&current,i-1,rs1[i-1]);
8265               }
8266             }
8267             memcpy(&branch_regs[i-1],&current,sizeof(current));
8268             branch_regs[i-1].isconst=0;
8269             branch_regs[i-1].wasconst=0;
8270             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8271             memcpy(constmap[i],constmap[i-1],sizeof(current_constmap));
8272           }
8273           else
8274           // Alloc the delay slot in case the branch is taken
8275           if((opcode2[i-1]&0x1E)==2) // BLTZL/BGEZL
8276           {
8277             memcpy(&branch_regs[i-1],&current,sizeof(current));
8278             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8279             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8280             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8281             alloc_cc(&branch_regs[i-1],i);
8282             dirty_reg(&branch_regs[i-1],CCREG);
8283             delayslot_alloc(&branch_regs[i-1],i);
8284             branch_regs[i-1].isconst=0;
8285             alloc_reg(&current,i,CCREG); // Not taken path
8286             dirty_reg(&current,CCREG);
8287             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8288           }
8289           // FIXME: BLTZAL/BGEZAL
8290           if(opcode2[i-1]&0x10) { // BxxZAL
8291             alloc_reg(&branch_regs[i-1],i-1,31);
8292             dirty_reg(&branch_regs[i-1],31);
8293             branch_regs[i-1].is32|=1LL<<31;
8294           }
8295           break;
8296         case FJUMP:
8297           if(likely[i-1]==0) // BC1F/BC1T
8298           {
8299             alloc_cc(&current,i-1);
8300             dirty_reg(&current,CCREG);
8301             if(itype[i]==FCOMP) {
8302               // The delay slot overwrote the branch condition
8303               // Delay slot goes after the test (in order)
8304               delayslot_alloc(&current,i);
8305               current.isconst=0;
8306             }
8307             else
8308             {
8309               current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
8310               current.uu=branch_unneeded_reg_upper[i-1]&~(1LL<<us1[i-1]);
8311               // Alloc the branch condition register
8312               alloc_reg(&current,i-1,FSREG);
8313             }
8314             memcpy(&branch_regs[i-1],&current,sizeof(current));
8315             memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
8316           }
8317           else // BC1FL/BC1TL
8318           {
8319             // Alloc the delay slot in case the branch is taken
8320             memcpy(&branch_regs[i-1],&current,sizeof(current));
8321             branch_regs[i-1].u=(branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8322             branch_regs[i-1].uu=(branch_unneeded_reg_upper[i-1]&~((1LL<<us1[i])|(1LL<<us2[i])|(1LL<<rt1[i])|(1LL<<rt2[i])))|1;
8323             if((~branch_regs[i-1].uu>>rt1[i])&1) branch_regs[i-1].uu&=~((1LL<<dep1[i])|(1LL<<dep2[i]))|1;
8324             alloc_cc(&branch_regs[i-1],i);
8325             dirty_reg(&branch_regs[i-1],CCREG);
8326             delayslot_alloc(&branch_regs[i-1],i);
8327             branch_regs[i-1].isconst=0;
8328             alloc_reg(&current,i,CCREG); // Not taken path
8329             dirty_reg(&current,CCREG);
8330             memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
8331           }
8332           break;
8333       }
8334
8335       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000)
8336       {
8337         if(rt1[i-1]==31) // JAL/JALR
8338         {
8339           // Subroutine call will return here, don't alloc any registers
8340           current.is32=1;
8341           current.dirty=0;
8342           clear_all_regs(current.regmap);
8343           alloc_reg(&current,i,CCREG);
8344           dirty_reg(&current,CCREG);
8345         }
8346         else if(i+1<slen)
8347         {
8348           // Internal branch will jump here, match registers to caller
8349           current.is32=0x3FFFFFFFFLL;
8350           current.dirty=0;
8351           clear_all_regs(current.regmap);
8352           alloc_reg(&current,i,CCREG);
8353           dirty_reg(&current,CCREG);
8354           for(j=i-1;j>=0;j--)
8355           {
8356             if(ba[j]==start+i*4+4) {
8357               memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
8358               current.is32=branch_regs[j].is32;
8359               current.dirty=branch_regs[j].dirty;
8360               break;
8361             }
8362           }
8363           while(j>=0) {
8364             if(ba[j]==start+i*4+4) {
8365               for(hr=0;hr<HOST_REGS;hr++) {
8366                 if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
8367                   current.regmap[hr]=-1;
8368                 }
8369                 current.is32&=branch_regs[j].is32;
8370                 current.dirty&=branch_regs[j].dirty;
8371               }
8372             }
8373             j--;
8374           }
8375         }
8376       }
8377     }
8378
8379     // Count cycles in between branches
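         // ccadj[i] is, roughly, the number of cycles accumulated since the
         // last branch; cc restarts after branches and syscalls and otherwise
         // grows by one per instruction, with larger charges below for GTE
         // ops, coprocessor loads/stores and runs of stores.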
8380     ccadj[i]=cc;
8381     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP||itype[i]==SYSCALL||itype[i]==HLECALL))
8382     {
8383       cc=0;
8384     }
8385 #if !defined(DRC_DBG)
8386     else if(itype[i]==C2OP&&gte_cycletab[source[i]&0x3f]>2)
8387     {
8388       // GTE runs in parallel until accessed, divide by 2 for a rough guess
8389       cc+=gte_cycletab[source[i]&0x3f]/2;
8390     }
8391     else if(/*itype[i]==LOAD||itype[i]==STORE||*/itype[i]==C1LS) // load/store causes weird timing issues
8392     {
8393       cc+=2; // 2 cycle penalty (after CLOCK_DIVIDER)
8394     }
8395     else if(i>1&&itype[i]==STORE&&itype[i-1]==STORE&&itype[i-2]==STORE&&!bt[i])
8396     {
8397       cc+=4;
8398     }
8399     else if(itype[i]==C2LS)
8400     {
8401       cc+=4;
8402     }
8403 #endif
8404     else
8405     {
8406       cc++;
8407     }
8408
8409     flush_dirty_uppers(&current);
8410     if(!is_ds[i]) {
8411       regs[i].is32=current.is32;
8412       regs[i].dirty=current.dirty;
8413       regs[i].isconst=current.isconst;
8414       memcpy(constmap[i],current_constmap,sizeof(current_constmap));
8415     }
8416     for(hr=0;hr<HOST_REGS;hr++) {
8417       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
8418         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
8419           regs[i].wasconst&=~(1<<hr);
8420         }
8421       }
8422     }
8423     if(current.regmap[HOST_BTREG]==BTREG) current.regmap[HOST_BTREG]=-1;
8424     regs[i].waswritten=current.waswritten;
8425   }
8426
8427   /* Pass 4 - Cull unused host registers */
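       // Walk the block backwards maintaining nr, a bitmask of host registers
       // whose contents are still needed: (nr>>hr)&1 stays set while a later
       // instruction, a branch target or a delay slot still reads the MIPS
       // register mapped to hr, and is cleared once that register is
       // overwritten.  Mappings whose bit ends up clear are released (set to
       // -1) so the host register can be reused.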
8428
8429   uint64_t nr=0;
8430
8431   for (i=slen-1;i>=0;i--)
8432   {
8433     int hr;
8434     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8435     {
8436       if(ba[i]<start || ba[i]>=(start+slen*4))
8437       {
8438         // Branch out of this block, don't need anything
8439         nr=0;
8440       }
8441       else
8442       {
8443         // Internal branch
8444         // Need whatever matches the target
8445         nr=0;
8446         int t=(ba[i]-start)>>2;
8447         for(hr=0;hr<HOST_REGS;hr++)
8448         {
8449           if(regs[i].regmap_entry[hr]>=0) {
8450             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
8451           }
8452         }
8453       }
8454       // Conditional branch may need registers for following instructions
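           // (needed_reg[i+2] is used because i+1 is the delay slot; whatever
           // the fall-through path needs two instructions ahead has to survive
           // the branch.)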
8455       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8456       {
8457         if(i<slen-2) {
8458           nr|=needed_reg[i+2];
8459           for(hr=0;hr<HOST_REGS;hr++)
8460           {
8461             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
8462             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*4,hr,regmap_entry[i+2][hr]);
8463           }
8464         }
8465       }
8466       // Don't need registers that are overwritten
8467       //if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8468       //if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8469       // Merge in delay slot
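           // The delay slot executes together with the branch, so its source
           // registers are counted as needed here too; its destinations only
           // cancel a need when the branch is not "likely" (a nullified delay
           // slot might never overwrite them).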
8470       for(hr=0;hr<HOST_REGS;hr++)
8471       {
8472         if(!likely[i]) {
8473           // These are overwritten unless the branch is "likely"
8474           // and the delay slot is nullified if not taken
8475           if(rt1[i+1]&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8476           if(rt2[i+1]&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8477         }
8478         if(us1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8479         if(us2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8480         if(rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8481         if(rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
8482         if(us1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8483         if(us2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8484         if(rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8485         if(rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8486         if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
8487           if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8488           if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8489         }
8490         if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
8491           if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8492           if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8493         }
8494         if(itype[i+1]==STORE || itype[i+1]==STORELR || (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) {
8495           if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8496           if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8497         }
8498       }
8499     }
8500     else if(itype[i]==SYSCALL||itype[i]==HLECALL||itype[i]==INTCALL)
8501     {
8502       // SYSCALL instruction (software interrupt)
8503       nr=0;
8504     }
8505     else if(itype[i]==COP0 && (source[i]&0x3f)==0x18)
8506     {
8507       // ERET instruction (return from interrupt)
8508       nr=0;
8509     }
8510     else // Non-branch
8511     {
8512       if(i<slen-1) {
8513         for(hr=0;hr<HOST_REGS;hr++) {
8514           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
8515           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
8516           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
8517           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
8518         }
8519       }
8520     }
8521     for(hr=0;hr<HOST_REGS;hr++)
8522     {
8523       // Overwritten registers are not needed
8524       if(rt1[i]&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8525       if(rt2[i]&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8526       if(FTEMP==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
8527       // Source registers are needed
8528       if(us1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8529       if(us2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8530       if(rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
8531       if(rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
8532       if(us1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8533       if(us2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8534       if(rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8535       if(rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
8536       if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
8537         if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8538         if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8539       }
8540       if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
8541         if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8542         if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8543       }
8544       if(itype[i]==STORE || itype[i]==STORELR || (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) {
8545         if(regmap_pre[i][hr]==INVCP) nr|=1<<hr;
8546         if(regs[i].regmap_entry[hr]==INVCP) nr|=1<<hr;
8547       }
8548         // Don't store a register immediately after writing it,
8549         // as that may prevent dual-issue.
8550         // But do so if this is a branch target; otherwise we
8551         // might have to load the register before the branch.
8552       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
8553         if((regmap_pre[i][hr]>0&&regmap_pre[i][hr]<64&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) ||
8554            (regmap_pre[i][hr]>64&&!((unneeded_reg_upper[i]>>(regmap_pre[i][hr]&63))&1)) ) {
8555           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8556           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
8557         }
8558         if((regs[i].regmap_entry[hr]>0&&regs[i].regmap_entry[hr]<64&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) ||
8559            (regs[i].regmap_entry[hr]>64&&!((unneeded_reg_upper[i]>>(regs[i].regmap_entry[hr]&63))&1)) ) {
8560           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8561           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
8562         }
8563       }
8564     }
8565     // Cycle count is needed at branches.  Assume it is needed at the target too.
8566     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==FJUMP||itype[i]==SPAN) {
8567       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
8568       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
8569     }
8570     // Save it
8571     needed_reg[i]=nr;
8572
8573     // Deallocate unneeded registers
8574     for(hr=0;hr<HOST_REGS;hr++)
8575     {
8576       if(!((nr>>hr)&1)) {
8577         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
8578         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
8579            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8580            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
8581         {
8582           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8583           {
8584             if(likely[i]) {
8585               regs[i].regmap[hr]=-1;
8586               regs[i].isconst&=~(1<<hr);
8587               if(i<slen-2) {
8588                 regmap_pre[i+2][hr]=-1;
8589                 regs[i+2].wasconst&=~(1<<hr);
8590               }
8591             }
8592           }
8593         }
8594         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8595         {
8596           int d1=0,d2=0,map=0,temp=0;
8597           if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
8598           {
8599             d1=dep1[i+1];
8600             d2=dep2[i+1];
8601           }
8602           if(itype[i+1]==STORE || itype[i+1]==STORELR ||
8603              (opcode[i+1]&0x3b)==0x39 || (opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
8604             map=INVCP;
8605           }
8606           if(itype[i+1]==LOADLR || itype[i+1]==STORELR ||
8607              itype[i+1]==C1LS || itype[i+1]==C2LS)
8608             temp=FTEMP;
8609           if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
8610              (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8611              (regs[i].regmap[hr]&63)!=rt1[i+1] && (regs[i].regmap[hr]&63)!=rt2[i+1] &&
8612              (regs[i].regmap[hr]^64)!=us1[i+1] && (regs[i].regmap[hr]^64)!=us2[i+1] &&
8613              (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
8614              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] &&
8615              (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=PTEMP &&
8616              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
8617              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=CCREG &&
8618              regs[i].regmap[hr]!=map )
8619           {
8620             regs[i].regmap[hr]=-1;
8621             regs[i].isconst&=~(1<<hr);
8622             if((branch_regs[i].regmap[hr]&63)!=rs1[i] && (branch_regs[i].regmap[hr]&63)!=rs2[i] &&
8623                (branch_regs[i].regmap[hr]&63)!=rt1[i] && (branch_regs[i].regmap[hr]&63)!=rt2[i] &&
8624                (branch_regs[i].regmap[hr]&63)!=rt1[i+1] && (branch_regs[i].regmap[hr]&63)!=rt2[i+1] &&
8625                (branch_regs[i].regmap[hr]^64)!=us1[i+1] && (branch_regs[i].regmap[hr]^64)!=us2[i+1] &&
8626                (branch_regs[i].regmap[hr]^64)!=d1 && (branch_regs[i].regmap[hr]^64)!=d2 &&
8627                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] &&
8628                (branch_regs[i].regmap[hr]&63)!=temp && branch_regs[i].regmap[hr]!=PTEMP &&
8629                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
8630                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=CCREG &&
8631                branch_regs[i].regmap[hr]!=map)
8632             {
8633               branch_regs[i].regmap[hr]=-1;
8634               branch_regs[i].regmap_entry[hr]=-1;
8635               if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000)
8636               {
8637                 if(!likely[i]&&i<slen-2) {
8638                   regmap_pre[i+2][hr]=-1;
8639                   regs[i+2].wasconst&=~(1<<hr);
8640                 }
8641               }
8642             }
8643           }
8644         }
8645         else
8646         {
8647           // Non-branch
8648           if(i>0)
8649           {
8650             int d1=0,d2=0,map=-1,temp=-1;
8651             if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
8652             {
8653               d1=dep1[i];
8654               d2=dep2[i];
8655             }
8656             if(itype[i]==STORE || itype[i]==STORELR ||
8657                       (opcode[i]&0x3b)==0x39 || (opcode[i]&0x3b)==0x3a) { // SWC1/SDC1 || SWC2/SDC2
8658               map=INVCP;
8659             }
8660             if(itype[i]==LOADLR || itype[i]==STORELR ||
8661                itype[i]==C1LS || itype[i]==C2LS)
8662               temp=FTEMP;
8663             if((regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
8664                (regs[i].regmap[hr]^64)!=us1[i] && (regs[i].regmap[hr]^64)!=us2[i] &&
8665                (regs[i].regmap[hr]^64)!=d1 && (regs[i].regmap[hr]^64)!=d2 &&
8666                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
8667                (regs[i].regmap[hr]&63)!=temp && regs[i].regmap[hr]!=map &&
8668                (itype[i]!=SPAN||regs[i].regmap[hr]!=CCREG))
8669             {
8670               if(i<slen-1&&!is_ds[i]) {
8671                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
8672                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
8673                 if(regs[i].regmap[hr]<64||!((regs[i].was32>>(regs[i].regmap[hr]&63))&1))
8674                 {
8675                   SysPrintf("fail: %x (%d %d!=%d)\n",start+i*4,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
8676                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
8677                 }
8678                 regmap_pre[i+1][hr]=-1;
8679                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
8680                 regs[i+1].wasconst&=~(1<<hr);
8681               }
8682               regs[i].regmap[hr]=-1;
8683               regs[i].isconst&=~(1<<hr);
8684             }
8685           }
8686         }
8687       }
8688     }
8689   }
8690
8691   /* Pass 5 - Pre-allocate registers */
8692
8693   // If a register is allocated during a loop, try to allocate it for the
8694   // entire loop, if possible.  This avoids loading/storing registers
8695   // inside of the loop.
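       // f_regmap[] holds, per host register, the MIPS register it should keep
       // holding across the loop.  For a backwards branch (ba[i] earlier in the
       // block) the code below tries to extend an existing assignment over the
       // whole loop body [t..i], filling in regmap/regmap_entry for each
       // instruction in between so the value is not stored and reloaded on
       // every iteration.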
8696
8697   signed char f_regmap[HOST_REGS];
8698   clear_all_regs(f_regmap);
8699   for(i=0;i<slen-1;i++)
8700   {
8701     if(itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
8702     {
8703       if(ba[i]>=start && ba[i]<(start+i*4))
8704       if(itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
8705       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM16||itype[i+1]==LOAD
8706       ||itype[i+1]==STORE||itype[i+1]==STORELR||itype[i+1]==C1LS
8707       ||itype[i+1]==SHIFT||itype[i+1]==COP1||itype[i+1]==FLOAT
8708       ||itype[i+1]==FCOMP||itype[i+1]==FCONV
8709       ||itype[i+1]==COP2||itype[i+1]==C2LS||itype[i+1]==C2OP)
8710       {
8711         int t=(ba[i]-start)>>2;
8712         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=CJUMP&&itype[t-1]!=SJUMP&&itype[t-1]!=FJUMP)) // loop_preload can't handle jumps into delay slots
8713         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=31) // call/ret assumes no registers allocated
8714         for(hr=0;hr<HOST_REGS;hr++)
8715         {
8716           if(regs[i].regmap[hr]>64) {
8717             if(!((regs[i].dirty>>hr)&1))
8718               f_regmap[hr]=regs[i].regmap[hr];
8719             else f_regmap[hr]=-1;
8720           }
8721           else if(regs[i].regmap[hr]>=0) {
8722             if(f_regmap[hr]!=regs[i].regmap[hr]) {
8723               // dealloc old register
8724               int n;
8725               for(n=0;n<HOST_REGS;n++)
8726               {
8727                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
8728               }
8729               // and alloc new one
8730               f_regmap[hr]=regs[i].regmap[hr];
8731             }
8732           }
8733           if(branch_regs[i].regmap[hr]>64) {
8734             if(!((branch_regs[i].dirty>>hr)&1))
8735               f_regmap[hr]=branch_regs[i].regmap[hr];
8736             else f_regmap[hr]=-1;
8737           }
8738           else if(branch_regs[i].regmap[hr]>=0) {
8739             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
8740               // dealloc old register
8741               int n;
8742               for(n=0;n<HOST_REGS;n++)
8743               {
8744                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
8745               }
8746               // and alloc new one
8747               f_regmap[hr]=branch_regs[i].regmap[hr];
8748             }
8749           }
8750           if(ooo[i]) {
8751             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
8752               f_regmap[hr]=branch_regs[i].regmap[hr];
8753           }else{
8754             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
8755               f_regmap[hr]=branch_regs[i].regmap[hr];
8756           }
8757           // Avoid dirty->clean transition
8758           #ifdef DESTRUCTIVE_WRITEBACK
8759           if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
8760           #endif
8761           // This check is only strictly required in the DESTRUCTIVE_WRITEBACK
8762           // case above; however, it's always a good idea.  We can't hoist the
8763           // load if the register was already allocated, so there's no point
8764           // wasting time analyzing most of these cases.  It only "succeeds"
8765           // when the mapping was different and the load can be replaced with
8766           // a mov, which is of negligible benefit.  So such cases are
8767           // skipped below.
8768           if(f_regmap[hr]>0) {
8769             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
8770               int r=f_regmap[hr];
8771               for(j=t;j<=i;j++)
8772               {
8773                 //printf("Test %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
8774                 if(r<34&&((unneeded_reg[j]>>r)&1)) break;
8775                 if(r>63&&((unneeded_reg_upper[j]>>(r&63))&1)) break;
8776                 if(r>63) {
8777                   // NB This can exclude the case where the upper-half
8778                   // register is lower numbered than the lower-half
8779                   // register.  Not sure if it's worth fixing...
8780                   if(get_reg(regs[j].regmap,r&63)<0) break;
8781                   if(get_reg(regs[j].regmap_entry,r&63)<0) break;
8782                   if(regs[j].is32&(1LL<<(r&63))) break;
8783                 }
8784                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
8785                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*4,ba[i],start+j*4,hr,r);
8786                   int k;
8787                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
8788                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
8789                     if(r>63) {
8790                       if(get_reg(regs[i].regmap,r&63)<0) break;
8791                       if(get_reg(branch_regs[i].regmap,r&63)<0) break;
8792                     }
8793                     k=i;
8794                     while(k>1&&regs[k-1].regmap[hr]==-1) {
8795                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
8796                         //printf("no free regs for store %x\n",start+(k-1)*4);
8797                         break;
8798                       }
8799                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
8800                         //printf("no-match due to different register\n");
8801                         break;
8802                       }
8803                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP||itype[k-2]==FJUMP) {
8804                         //printf("no-match due to branch\n");
8805                         break;
8806                       }
8807                       // call/ret fast path assumes no registers allocated
8808                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==31) {
8809                         break;
8810                       }
8811                       if(r>63) {
8812                         // NB This can exclude the case where the upper-half
8813                         // register is lower numbered than the lower-half
8814                         // register.  Not sure if it's worth fixing...
8815                         if(get_reg(regs[k-1].regmap,r&63)<0) break;
8816                         if(regs[k-1].is32&(1LL<<(r&63))) break;
8817                       }
8818                       k--;
8819                     }
8820                     if(i<slen-1) {
8821                       if((regs[k].is32&(1LL<<f_regmap[hr]))!=
8822                         (regs[i+2].was32&(1LL<<f_regmap[hr]))) {
8823                         //printf("bad match after branch\n");
8824                         break;
8825                       }
8826                     }
8827                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
8828                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
8829                       while(k<i) {
8830                         regs[k].regmap_entry[hr]=f_regmap[hr];
8831                         regs[k].regmap[hr]=f_regmap[hr];
8832                         regmap_pre[k+1][hr]=f_regmap[hr];
8833                         regs[k].wasdirty&=~(1<<hr);
8834                         regs[k].dirty&=~(1<<hr);
8835                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
8836                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
8837                         regs[k].wasconst&=~(1<<hr);
8838                         regs[k].isconst&=~(1<<hr);
8839                         k++;
8840                       }
8841                     }
8842                     else {
8843                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
8844                       break;
8845                     }
8846                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
8847                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
8848                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
8849                       regs[i].regmap_entry[hr]=f_regmap[hr];
8850                       regs[i].regmap[hr]=f_regmap[hr];
8851                       regs[i].wasdirty&=~(1<<hr);
8852                       regs[i].dirty&=~(1<<hr);
8853                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
8854                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
8855                       regs[i].wasconst&=~(1<<hr);
8856                       regs[i].isconst&=~(1<<hr);
8857                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
8858                       branch_regs[i].wasdirty&=~(1<<hr);
8859                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
8860                       branch_regs[i].regmap[hr]=f_regmap[hr];
8861                       branch_regs[i].dirty&=~(1<<hr);
8862                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
8863                       branch_regs[i].wasconst&=~(1<<hr);
8864                       branch_regs[i].isconst&=~(1<<hr);
8865                       if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&(source[i]>>16)!=0x1000) {
8866                         regmap_pre[i+2][hr]=f_regmap[hr];
8867                         regs[i+2].wasdirty&=~(1<<hr);
8868                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
8869                         assert((branch_regs[i].is32&(1LL<<f_regmap[hr]))==
8870                           (regs[i+2].was32&(1LL<<f_regmap[hr])));
8871                       }
8872                     }
8873                   }
8874                   for(k=t;k<j;k++) {
8875                     // Alloc register clean at beginning of loop,
8876                     // but may dirty it in pass 6
8877                     regs[k].regmap_entry[hr]=f_regmap[hr];
8878                     regs[k].regmap[hr]=f_regmap[hr];
8879                     regs[k].dirty&=~(1<<hr);
8880                     regs[k].wasconst&=~(1<<hr);
8881                     regs[k].isconst&=~(1<<hr);
8882                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==CJUMP||itype[k]==SJUMP||itype[k]==FJUMP) {
8883                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
8884                       branch_regs[k].regmap[hr]=f_regmap[hr];
8885                       branch_regs[k].dirty&=~(1<<hr);
8886                       branch_regs[k].wasconst&=~(1<<hr);
8887                       branch_regs[k].isconst&=~(1<<hr);
8888                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP&&(source[k]>>16)!=0x1000) {
8889                         regmap_pre[k+2][hr]=f_regmap[hr];
8890                         regs[k+2].wasdirty&=~(1<<hr);
8891                         assert((branch_regs[k].is32&(1LL<<f_regmap[hr]))==
8892                           (regs[k+2].was32&(1LL<<f_regmap[hr])));
8893                       }
8894                     }
8895                     else
8896                     {
8897                       regmap_pre[k+1][hr]=f_regmap[hr];
8898                       regs[k+1].wasdirty&=~(1<<hr);
8899                     }
8900                   }
8901                   if(regs[j].regmap[hr]==f_regmap[hr])
8902                     regs[j].regmap_entry[hr]=f_regmap[hr];
8903                   break;
8904                 }
8905                 if(j==i) break;
8906                 if(regs[j].regmap[hr]>=0)
8907                   break;
8908                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
8909                   //printf("no-match due to different register\n");
8910                   break;
8911                 }
8912                 if((regs[j+1].is32&(1LL<<f_regmap[hr]))!=(regs[j].is32&(1LL<<f_regmap[hr]))) {
8913                   //printf("32/64 mismatch %x %d\n",start+j*4,hr);
8914                   break;
8915                 }
8916                 if(itype[j]==UJUMP||itype[j]==RJUMP||(source[j]>>16)==0x1000)
8917                 {
8918                   // Stop on unconditional branch
8919                   break;
8920                 }
8921                 if(itype[j]==CJUMP||itype[j]==SJUMP||itype[j]==FJUMP)
8922                 {
8923                   if(ooo[j]) {
8924                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
8925                       break;
8926                   }else{
8927                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
8928                       break;
8929                   }
8930                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
8931                     //printf("no-match due to different register (branch)\n");
8932                     break;
8933                   }
8934                 }
8935                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
8936                   //printf("No free regs for store %x\n",start+j*4);
8937                   break;
8938                 }
8939                 if(f_regmap[hr]>=64) {
8940                   if(regs[j].is32&(1LL<<(f_regmap[hr]&63))) {
8941                     break;
8942                   }
8943                   else
8944                   {
8945                     if(get_reg(regs[j].regmap,f_regmap[hr]&63)<0) {
8946                       break;
8947                     }
8948                   }
8949                 }
8950               }
8951             }
8952           }
8953         }
8954       }
8955     }else{
8956       // Non-branch or undetermined branch target
8957       for(hr=0;hr<HOST_REGS;hr++)
8958       {
8959         if(hr!=EXCLUDE_REG) {
8960           if(regs[i].regmap[hr]>64) {
8961             if(!((regs[i].dirty>>hr)&1))
8962               f_regmap[hr]=regs[i].regmap[hr];
8963           }
8964           else if(regs[i].regmap[hr]>=0) {
8965             if(f_regmap[hr]!=regs[i].regmap[hr]) {
8966               // dealloc old register
8967               int n;
8968               for(n=0;n<HOST_REGS;n++)
8969               {
8970                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
8971               }
8972               // and alloc new one
8973               f_regmap[hr]=regs[i].regmap[hr];
8974             }
8975           }
8976         }
8977       }
8978       // Try to restore cycle count at branch targets
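           // Scan forward from the branch target to the next instruction that
           // already has something in HOST_CCREG; if that something is CCREG,
           // keep CCREG allocated over the whole range (and extend backwards
           // from the target below) so the cycle count is not spilled and
           // reloaded in between.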
8979       if(bt[i]) {
8980         for(j=i;j<slen-1;j++) {
8981           if(regs[j].regmap[HOST_CCREG]!=-1) break;
8982           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
8983             //printf("no free regs for store %x\n",start+j*4);
8984             break;
8985           }
8986         }
8987         if(regs[j].regmap[HOST_CCREG]==CCREG) {
8988           int k=i;
8989           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
8990           while(k<j) {
8991             regs[k].regmap_entry[HOST_CCREG]=CCREG;
8992             regs[k].regmap[HOST_CCREG]=CCREG;
8993             regmap_pre[k+1][HOST_CCREG]=CCREG;
8994             regs[k+1].wasdirty|=1<<HOST_CCREG;
8995             regs[k].dirty|=1<<HOST_CCREG;
8996             regs[k].wasconst&=~(1<<HOST_CCREG);
8997             regs[k].isconst&=~(1<<HOST_CCREG);
8998             k++;
8999           }
9000           regs[j].regmap_entry[HOST_CCREG]=CCREG;
9001         }
9002         // Work backwards from the branch target
9003         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
9004         {
9005           //printf("Extend backwards\n");
9006           int k;
9007           k=i;
9008           while(regs[k-1].regmap[HOST_CCREG]==-1) {
9009             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
9010               //printf("no free regs for store %x\n",start+(k-1)*4);
9011               break;
9012             }
9013             k--;
9014           }
9015           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
9016             //printf("Extend CC, %x ->\n",start+k*4);
9017             while(k<=i) {
9018               regs[k].regmap_entry[HOST_CCREG]=CCREG;
9019               regs[k].regmap[HOST_CCREG]=CCREG;
9020               regmap_pre[k+1][HOST_CCREG]=CCREG;
9021               regs[k+1].wasdirty|=1<<HOST_CCREG;
9022               regs[k].dirty|=1<<HOST_CCREG;
9023               regs[k].wasconst&=~(1<<HOST_CCREG);
9024               regs[k].isconst&=~(1<<HOST_CCREG);
9025               k++;
9026             }
9027           }
9028           else {
9029             //printf("Fail Extend CC, %x ->\n",start+k*4);
9030           }
9031         }
9032       }
9033       if(itype[i]!=STORE&&itype[i]!=STORELR&&itype[i]!=C1LS&&itype[i]!=SHIFT&&
9034          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
9035          itype[i]!=IMM16&&itype[i]!=LOAD&&itype[i]!=COP1&&itype[i]!=FLOAT&&
9036          itype[i]!=FCONV&&itype[i]!=FCOMP)
9037       {
9038         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
9039       }
9040     }
9041   }
9042
9043   // This allocates registers (if possible) one instruction prior
9044   // to use, which can avoid a load-use penalty on certain CPUs.
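       // Concretely: if instruction i+1 reads a MIPS register that it has
       // mapped to host register hr, and hr is still free at instruction i,
       // the mapping (regmap, regmap_pre, regmap_entry, constmap and the dirty
       // bits) is copied back to i so the value is already loaded by the time
       // i+1 needs it.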
9045   for(i=0;i<slen-1;i++)
9046   {
9047     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP))
9048     {
9049       if(!bt[i+1])
9050       {
9051         if(itype[i]==ALU||itype[i]==MOV||itype[i]==LOAD||itype[i]==SHIFTIMM||itype[i]==IMM16
9052            ||((itype[i]==COP1||itype[i]==COP2)&&opcode2[i]<3))
9053         {
9054           if(rs1[i+1]) {
9055             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]))>=0)
9056             {
9057               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9058               {
9059                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9060                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9061                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9062                 regs[i].isconst&=~(1<<hr);
9063                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9064                 constmap[i][hr]=constmap[i+1][hr];
9065                 regs[i+1].wasdirty&=~(1<<hr);
9066                 regs[i].dirty&=~(1<<hr);
9067               }
9068             }
9069           }
9070           if(rs2[i+1]) {
9071             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]))>=0)
9072             {
9073               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9074               {
9075                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
9076                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
9077                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
9078                 regs[i].isconst&=~(1<<hr);
9079                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9080                 constmap[i][hr]=constmap[i+1][hr];
9081                 regs[i+1].wasdirty&=~(1<<hr);
9082                 regs[i].dirty&=~(1<<hr);
9083               }
9084             }
9085           }
9086           // Preload target address for load instruction (non-constant)
9087           if(itype[i+1]==LOAD&&rs1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9088             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9089             {
9090               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9091               {
9092                 regs[i].regmap[hr]=rs1[i+1];
9093                 regmap_pre[i+1][hr]=rs1[i+1];
9094                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9095                 regs[i].isconst&=~(1<<hr);
9096                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9097                 constmap[i][hr]=constmap[i+1][hr];
9098                 regs[i+1].wasdirty&=~(1<<hr);
9099                 regs[i].dirty&=~(1<<hr);
9100               }
9101             }
9102           }
9103           // Load source into target register
9104           if(lt1[i+1]&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9105             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
9106             {
9107               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9108               {
9109                 regs[i].regmap[hr]=rs1[i+1];
9110                 regmap_pre[i+1][hr]=rs1[i+1];
9111                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9112                 regs[i].isconst&=~(1<<hr);
9113                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9114                 constmap[i][hr]=constmap[i+1][hr];
9115                 regs[i+1].wasdirty&=~(1<<hr);
9116                 regs[i].dirty&=~(1<<hr);
9117               }
9118             }
9119           }
9120           // Address for store instruction (non-constant)
9121           if(itype[i+1]==STORE||itype[i+1]==STORELR
9122              ||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SB/SH/SW/SD/SWC1/SDC1/SWC2/SDC2
9123             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9124               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
9125               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9126               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isconst&=~(1<<hr);}
9127               assert(hr>=0);
9128               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9129               {
9130                 regs[i].regmap[hr]=rs1[i+1];
9131                 regmap_pre[i+1][hr]=rs1[i+1];
9132                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9133                 regs[i].isconst&=~(1<<hr);
9134                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9135                 constmap[i][hr]=constmap[i+1][hr];
9136                 regs[i+1].wasdirty&=~(1<<hr);
9137                 regs[i].dirty&=~(1<<hr);
9138               }
9139             }
9140           }
9141           if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) { // LWC1/LDC1, LWC2/LDC2
9142             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
9143               int nr;
9144               hr=get_reg(regs[i+1].regmap,FTEMP);
9145               assert(hr>=0);
9146               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
9147               {
9148                 regs[i].regmap[hr]=rs1[i+1];
9149                 regmap_pre[i+1][hr]=rs1[i+1];
9150                 regs[i+1].regmap_entry[hr]=rs1[i+1];
9151                 regs[i].isconst&=~(1<<hr);
9152                 regs[i].isconst|=regs[i+1].isconst&(1<<hr);
9153                 constmap[i][hr]=constmap[i+1][hr];
9154                 regs[i+1].wasdirty&=~(1<<hr);
9155                 regs[i].dirty&=~(1<<hr);
9156               }
9157               else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
9158               {
9159                 // move it to another register
9160                 regs[i+1].regmap[hr]=-1;
9161                 regmap_pre[i+2][hr]=-1;
9162                 regs[i+1].regmap[nr]=FTEMP;
9163                 regmap_pre[i+2][nr]=FTEMP;
9164                 regs[i].regmap[nr]=rs1[i+1];
9165                 regmap_pre[i+1][nr]=rs1[i+1];
9166                 regs[i+1].regmap_entry[nr]=rs1[i+1];
9167                 regs[i].isconst&=~(1<<nr);
9168                 regs[i+1].isconst&=~(1<<nr);
9169                 regs[i].dirty&=~(1<<nr);
9170                 regs[i+1].wasdirty&=~(1<<nr);
9171                 regs[i+1].dirty&=~(1<<nr);
9172                 regs[i+2].wasdirty&=~(1<<nr);
9173               }
9174             }
9175           }
9176           if(itype[i+1]==LOAD||itype[i+1]==LOADLR||itype[i+1]==STORE||itype[i+1]==STORELR/*||itype[i+1]==C1LS||itype[i+1]==C2LS*/) {
9177             if(itype[i+1]==LOAD)
9178               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
9179             if(itype[i+1]==LOADLR||(opcode[i+1]&0x3b)==0x31||(opcode[i+1]&0x3b)==0x32) // LWC1/LDC1, LWC2/LDC2
9180               hr=get_reg(regs[i+1].regmap,FTEMP);
9181             if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a) { // SWC1/SDC1/SWC2/SDC2
9182               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
9183               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
9184             }
9185             if(hr>=0&&regs[i].regmap[hr]<0) {
9186               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
9187               if(rs>=0&&((regs[i+1].wasconst>>rs)&1)) {
9188                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
9189                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
9190                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
9191                 regs[i].isconst&=~(1<<hr);
9192                 regs[i+1].wasdirty&=~(1<<hr);
9193                 regs[i].dirty&=~(1<<hr);
9194               }
9195             }
9196           }
9197         }
9198       }
9199     }
9200   }
9201
9202   /* Pass 6 - Optimize clean/dirty state */
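  // Work out where dirty host registers actually have to be written back to
  // the MIPS register file, so that unnecessary writebacks can be omitted.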
9203   clean_registers(0,slen-1,1);
9204
9205   /* Pass 7 - Identify 32-bit registers */
9206   for (i=slen-1;i>=0;i--)
9207   {
9208     if(itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9209     {
9210       // Conditional branch
9211       if((source[i]>>16)!=0x1000&&i<slen-2) {
9212         // Mark the instruction after the delay slot as a branch target,
9213         // since execution may resume there after returning from an interrupt
9214         bt[i+2]=1;
9215       }
9216     }
9217   }
9218
9219   if(itype[slen-1]==SPAN) {
9220     bt[slen-1]=1; // Mark as a branch target so the instruction can be restarted after an exception
9221   }
9222
9223 #ifdef DISASM
9224   /* Debug/disassembly */
9225   for(i=0;i<slen;i++)
9226   {
9227     printf("U:");
9228     int r;
9229     for(r=1;r<=CCREG;r++) {
9230       if((unneeded_reg[i]>>r)&1) {
9231         if(r==HIREG) printf(" HI");
9232         else if(r==LOREG) printf(" LO");
9233         else printf(" r%d",r);
9234       }
9235     }
9236     printf("\n");
9237     #if defined(__i386__) || defined(__x86_64__)
9238     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
9239     #endif
9240     #ifdef __arm__
9241     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
9242     #endif
9243     printf("needs: ");
9244     if(needed_reg[i]&1) printf("eax ");
9245     if((needed_reg[i]>>1)&1) printf("ecx ");
9246     if((needed_reg[i]>>2)&1) printf("edx ");
9247     if((needed_reg[i]>>3)&1) printf("ebx ");
9248     if((needed_reg[i]>>5)&1) printf("ebp ");
9249     if((needed_reg[i]>>6)&1) printf("esi ");
9250     if((needed_reg[i]>>7)&1) printf("edi ");
9251     printf("\n");
9252     #if defined(__i386__) || defined(__x86_64__)
9253     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
9254     printf("dirty: ");
9255     if(regs[i].wasdirty&1) printf("eax ");
9256     if((regs[i].wasdirty>>1)&1) printf("ecx ");
9257     if((regs[i].wasdirty>>2)&1) printf("edx ");
9258     if((regs[i].wasdirty>>3)&1) printf("ebx ");
9259     if((regs[i].wasdirty>>5)&1) printf("ebp ");
9260     if((regs[i].wasdirty>>6)&1) printf("esi ");
9261     if((regs[i].wasdirty>>7)&1) printf("edi ");
9262     #endif
9263     #ifdef __arm__
9264     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
9265     printf("dirty: ");
9266     if(regs[i].wasdirty&1) printf("r0 ");
9267     if((regs[i].wasdirty>>1)&1) printf("r1 ");
9268     if((regs[i].wasdirty>>2)&1) printf("r2 ");
9269     if((regs[i].wasdirty>>3)&1) printf("r3 ");
9270     if((regs[i].wasdirty>>4)&1) printf("r4 ");
9271     if((regs[i].wasdirty>>5)&1) printf("r5 ");
9272     if((regs[i].wasdirty>>6)&1) printf("r6 ");
9273     if((regs[i].wasdirty>>7)&1) printf("r7 ");
9274     if((regs[i].wasdirty>>8)&1) printf("r8 ");
9275     if((regs[i].wasdirty>>9)&1) printf("r9 ");
9276     if((regs[i].wasdirty>>10)&1) printf("r10 ");
9277     if((regs[i].wasdirty>>12)&1) printf("r12 ");
9278     #endif
9279     printf("\n");
9280     disassemble_inst(i);
9281     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
9282     #if defined(__i386__) || defined(__x86_64__)
9283     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
9284     if(regs[i].dirty&1) printf("eax ");
9285     if((regs[i].dirty>>1)&1) printf("ecx ");
9286     if((regs[i].dirty>>2)&1) printf("edx ");
9287     if((regs[i].dirty>>3)&1) printf("ebx ");
9288     if((regs[i].dirty>>5)&1) printf("ebp ");
9289     if((regs[i].dirty>>6)&1) printf("esi ");
9290     if((regs[i].dirty>>7)&1) printf("edi ");
9291     #endif
9292     #ifdef __arm__
9293     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
9294     if(regs[i].dirty&1) printf("r0 ");
9295     if((regs[i].dirty>>1)&1) printf("r1 ");
9296     if((regs[i].dirty>>2)&1) printf("r2 ");
9297     if((regs[i].dirty>>3)&1) printf("r3 ");
9298     if((regs[i].dirty>>4)&1) printf("r4 ");
9299     if((regs[i].dirty>>5)&1) printf("r5 ");
9300     if((regs[i].dirty>>6)&1) printf("r6 ");
9301     if((regs[i].dirty>>7)&1) printf("r7 ");
9302     if((regs[i].dirty>>8)&1) printf("r8 ");
9303     if((regs[i].dirty>>9)&1) printf("r9 ");
9304     if((regs[i].dirty>>10)&1) printf("r10 ");
9305     if((regs[i].dirty>>12)&1) printf("r12 ");
9306     #endif
9307     printf("\n");
9308     if(regs[i].isconst) {
9309       printf("constants: ");
9310       #if defined(__i386__) || defined(__x86_64__)
9311       if(regs[i].isconst&1) printf("eax=%x ",(u_int)constmap[i][0]);
9312       if((regs[i].isconst>>1)&1) printf("ecx=%x ",(u_int)constmap[i][1]);
9313       if((regs[i].isconst>>2)&1) printf("edx=%x ",(u_int)constmap[i][2]);
9314       if((regs[i].isconst>>3)&1) printf("ebx=%x ",(u_int)constmap[i][3]);
9315       if((regs[i].isconst>>5)&1) printf("ebp=%x ",(u_int)constmap[i][5]);
9316       if((regs[i].isconst>>6)&1) printf("esi=%x ",(u_int)constmap[i][6]);
9317       if((regs[i].isconst>>7)&1) printf("edi=%x ",(u_int)constmap[i][7]);
9318       #endif
9319       #ifdef __arm__
9320       int r;
9321       for (r = 0; r < ARRAY_SIZE(constmap[i]); r++)
9322         if ((regs[i].isconst >> r) & 1)
9323           printf(" r%d=%x", r, (u_int)constmap[i][r]);
9324       #endif
9325       printf("\n");
9326     }
9327     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP) {
9328       #if defined(__i386__) || defined(__x86_64__)
9329       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
9330       if(branch_regs[i].dirty&1) printf("eax ");
9331       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
9332       if((branch_regs[i].dirty>>2)&1) printf("edx ");
9333       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
9334       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
9335       if((branch_regs[i].dirty>>6)&1) printf("esi ");
9336       if((branch_regs[i].dirty>>7)&1) printf("edi ");
9337       #endif
9338       #ifdef __arm__
9339       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
9340       if(branch_regs[i].dirty&1) printf("r0 ");
9341       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
9342       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
9343       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
9344       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
9345       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
9346       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
9347       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
9348       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
9349       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
9350       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
9351       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
9352       #endif
9353     }
9354   }
9355 #endif // DISASM
9356
9357   /* Pass 8 - Assembly */
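  // Emit native code for each instruction: write back or discard stale host
  // registers, load the registers the instruction needs, then dispatch to the
  // per-type assembler below.  Branch assemblers also emit their delay slot,
  // so ds=1 makes this loop skip the following instruction.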
9358   linkcount=0;stubcount=0;
9359   ds=0;is_delayslot=0;
9360   cop1_usable=0;
9361   uint64_t is32_pre=0;
9362   u_int dirty_pre=0;
9363   void *beginning=start_block();
9364   if((u_int)addr&1) {
9365     ds=1;
9366     pagespan_ds();
9367   }
9368   void *instr_addr0_override = NULL;
9369
9370   if (start == 0x80030000) {
9371     // Nasty hack for the fastbios feature:
9372     // override the block's entry point with this code
9373     instr_addr0_override = out;
9374     emit_movimm(start,0);
9375     // abuse the io address variable as a flag indicating that we
9376     // have already returned here once
9377     emit_readword(&address,1);
9378     emit_writeword(0,&pcaddr);
9379     emit_writeword(0,&address);
9380     emit_cmp(0,1);
9381     emit_jne(new_dyna_leave);
9382   }
9383   for(i=0;i<slen;i++)
9384   {
9385     //if(ds) printf("ds: ");
9386     disassemble_inst(i);
9387     if(ds) {
9388       ds=0; // Skip delay slot
9389       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
9390       instr_addr[i] = NULL;
9391     } else {
9392       speculate_register_values(i);
9393       #ifndef DESTRUCTIVE_WRITEBACK
9394       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
9395       {
9396         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,is32_pre,
9397               unneeded_reg[i],unneeded_reg_upper[i]);
9398       }
9399       if((itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)&&!likely[i]) {
9400         is32_pre=branch_regs[i].is32;
9401         dirty_pre=branch_regs[i].dirty;
9402       }else{
9403         is32_pre=regs[i].is32;
9404         dirty_pre=regs[i].dirty;
9405       }
9406       #endif
9407       // write back
9408       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000))
9409       {
9410         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32,
9411                       unneeded_reg[i],unneeded_reg_upper[i]);
9412         loop_preload(regmap_pre[i],regs[i].regmap_entry);
9413       }
9414       // branch target entry point
9415       instr_addr[i] = out;
9416       assem_debug("<->\n");
9417       drc_dbg_emit_do_cmp(i);
9418
9419       // load regs
9420       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
9421         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty,regs[i].was32);
9422       load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i],rs2[i]);
9423       address_generation(i,&regs[i],regs[i].regmap_entry);
9424       load_consts(regmap_pre[i],regs[i].regmap,regs[i].was32,i);
9425       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP||itype[i]==FJUMP)
9426       {
9427         // Load the delay slot registers if necessary
9428         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&(rs1[i+1]!=rt1[i]||rt1[i]==0))
9429           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
9430         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&(rs2[i+1]!=rt1[i]||rt1[i]==0))
9431           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
9432         if(itype[i+1]==STORE||itype[i+1]==STORELR||(opcode[i+1]&0x3b)==0x39||(opcode[i+1]&0x3b)==0x3a)
9433           load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
9434       }
9435       else if(i+1<slen)
9436       {
9437         // Preload registers for following instruction
9438         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i])
9439           if(rs1[i+1]!=rt1[i]&&rs1[i+1]!=rt2[i])
9440             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs1[i+1],rs1[i+1]);
9441         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i])
9442           if(rs2[i+1]!=rt1[i]&&rs2[i+1]!=rt2[i])
9443             load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,rs2[i+1],rs2[i+1]);
9444       }
9445       // TODO: if(is_ooo(i)) address_generation(i+1);
9446       if(itype[i]==CJUMP||itype[i]==FJUMP)
9447         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,CCREG,CCREG);
9448       if(itype[i]==STORE||itype[i]==STORELR||(opcode[i]&0x3b)==0x39||(opcode[i]&0x3b)==0x3a)
9449         load_regs(regs[i].regmap_entry,regs[i].regmap,regs[i].was32,INVCP,INVCP);
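      // At a branch target we can no longer assume the COP1-usable check has
      // already been emitted on this path, so force it to be redone.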
9450       if(bt[i]) cop1_usable=0;
9451       // assemble
9452       switch(itype[i]) {
9453         case ALU:
9454           alu_assemble(i,&regs[i]);break;
9455         case IMM16:
9456           imm16_assemble(i,&regs[i]);break;
9457         case SHIFT:
9458           shift_assemble(i,&regs[i]);break;
9459         case SHIFTIMM:
9460           shiftimm_assemble(i,&regs[i]);break;
9461         case LOAD:
9462           load_assemble(i,&regs[i]);break;
9463         case LOADLR:
9464           loadlr_assemble(i,&regs[i]);break;
9465         case STORE:
9466           store_assemble(i,&regs[i]);break;
9467         case STORELR:
9468           storelr_assemble(i,&regs[i]);break;
9469         case COP0:
9470           cop0_assemble(i,&regs[i]);break;
9471         case COP1:
9472           cop1_assemble(i,&regs[i]);break;
9473         case C1LS:
9474           c1ls_assemble(i,&regs[i]);break;
9475         case COP2:
9476           cop2_assemble(i,&regs[i]);break;
9477         case C2LS:
9478           c2ls_assemble(i,&regs[i]);break;
9479         case C2OP:
9480           c2op_assemble(i,&regs[i]);break;
9481         case FCONV:
9482           fconv_assemble(i,&regs[i]);break;
9483         case FLOAT:
9484           float_assemble(i,&regs[i]);break;
9485         case FCOMP:
9486           fcomp_assemble(i,&regs[i]);break;
9487         case MULTDIV:
9488           multdiv_assemble(i,&regs[i]);break;
9489         case MOV:
9490           mov_assemble(i,&regs[i]);break;
9491         case SYSCALL:
9492           syscall_assemble(i,&regs[i]);break;
9493         case HLECALL:
9494           hlecall_assemble(i,&regs[i]);break;
9495         case INTCALL:
9496           intcall_assemble(i,&regs[i]);break;
9497         case UJUMP:
9498           ujump_assemble(i,&regs[i]);ds=1;break;
9499         case RJUMP:
9500           rjump_assemble(i,&regs[i]);ds=1;break;
9501         case CJUMP:
9502           cjump_assemble(i,&regs[i]);ds=1;break;
9503         case SJUMP:
9504           sjump_assemble(i,&regs[i]);ds=1;break;
9505         case FJUMP:
9506           fjump_assemble(i,&regs[i]);ds=1;break;
9507         case SPAN:
9508           pagespan_assemble(i,&regs[i]);break;
9509       }
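      // After an unconditional branch the literal pool can be emitted inline;
      // elsewhere it has to be jumped over if it is close to overflowing.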
9510       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
9511         literal_pool(1024);
9512       else
9513         literal_pool_jumpover(256);
9514     }
9515   }
9516   //assert(itype[i-2]==UJUMP||itype[i-2]==RJUMP||(source[i-2]>>16)==0x1000);
9517   // If the block did not end with an unconditional branch,
9518   // add a jump to the next instruction.
9519   if(i>1) {
9520     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&(source[i-2]>>16)!=0x1000&&itype[i-1]!=SPAN) {
9521       assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
9522       assert(i==slen);
9523       if(itype[i-2]!=CJUMP&&itype[i-2]!=SJUMP&&itype[i-2]!=FJUMP) {
9524         store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
9525         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
9526           emit_loadreg(CCREG,HOST_CCREG);
9527         emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
9528       }
9529       else if(!likely[i-2])
9530       {
9531         store_regs_bt(branch_regs[i-2].regmap,branch_regs[i-2].is32,branch_regs[i-2].dirty,start+i*4);
9532         assert(branch_regs[i-2].regmap[HOST_CCREG]==CCREG);
9533       }
9534       else
9535       {
9536         store_regs_bt(regs[i-2].regmap,regs[i-2].is32,regs[i-2].dirty,start+i*4);
9537         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
9538       }
9539       add_to_linker(out,start+i*4,0);
9540       emit_jmp(0);
9541     }
9542   }
9543   else
9544   {
9545     assert(i>0);
9546     assert(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP&&itype[i-1]!=FJUMP);
9547     store_regs_bt(regs[i-1].regmap,regs[i-1].is32,regs[i-1].dirty,start+i*4);
9548     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
9549       emit_loadreg(CCREG,HOST_CCREG);
9550     emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i-1]+1),HOST_CCREG);
9551     add_to_linker(out,start+i*4,0);
9552     emit_jmp(0);
9553   }
9554
9555   // TODO: delay slot stubs?
9556   // Stubs
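  // Emit the out-of-line slow paths recorded during assembly: memory accesses
  // that miss the fast path, unaligned stores, cycle-count checks, code
  // invalidation checks and COP1-unusable handling.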
9557   for(i=0;i<stubcount;i++)
9558   {
9559     switch(stubs[i].type)
9560     {
9561       case LOADB_STUB:
9562       case LOADH_STUB:
9563       case LOADW_STUB:
9564       case LOADD_STUB:
9565       case LOADBU_STUB:
9566       case LOADHU_STUB:
9567         do_readstub(i);break;
9568       case STOREB_STUB:
9569       case STOREH_STUB:
9570       case STOREW_STUB:
9571       case STORED_STUB:
9572         do_writestub(i);break;
9573       case CC_STUB:
9574         do_ccstub(i);break;
9575       case INVCODE_STUB:
9576         do_invstub(i);break;
9577       case FP_STUB:
9578         do_cop1stub(i);break;
9579       case STORELR_STUB:
9580         do_unalignedwritestub(i);break;
9581     }
9582   }
9583
9584   if (instr_addr0_override)
9585     instr_addr[0] = instr_addr0_override;
9586
9587   /* Pass 9 - Linker */
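  // Patch the branches recorded by add_to_linker(): targets inside this block
  // are patched directly to the emitted code, while external targets go
  // through an extjump stub that is resolved via check_addr()/add_link().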
9588   for(i=0;i<linkcount;i++)
9589   {
9590     assem_debug("%p -> %8x\n",link_addr[i].addr,link_addr[i].target);
9591     literal_pool(64);
9592     if (!link_addr[i].ext)
9593     {
9594       void *stub = out;
9595       void *addr = check_addr(link_addr[i].target);
9596       emit_extjump(link_addr[i].addr, link_addr[i].target);
9597       if (addr) {
9598         set_jump_target(link_addr[i].addr, addr);
9599         add_link(link_addr[i].target,stub);
9600       }
9601       else
9602         set_jump_target(link_addr[i].addr, stub);
9603     }
9604     else
9605     {
9606       // Internal branch
9607       int target=(link_addr[i].target-start)>>2;
9608       assert(target>=0&&target<slen);
9609       assert(instr_addr[target]);
9610       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
9611       //set_jump_target_fillslot(link_addr[i].addr,instr_addr[target],link_addr[i].ext>>1);
9612       //#else
9613       set_jump_target(link_addr[i].addr, instr_addr[target]);
9614       //#endif
9615     }
9616   }
9617   // External Branch Targets (jump_in)
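  // Register each branch-target entry point of this block: add a dirty-check
  // stub to jump_dirty, the real entry point to jump_in, and update any
  // matching hash table slots so lookups find the new code.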
9618   if(copy+slen*4>(void *)shadow+sizeof(shadow)) copy=shadow;
9619   for(i=0;i<slen;i++)
9620   {
9621     if(bt[i]||i==0)
9622     {
9623       if(instr_addr[i]) // TODO - delay slots (=null)
9624       {
9625         u_int vaddr=start+i*4;
9626         u_int page=get_page(vaddr);
9627         u_int vpage=get_vpage(vaddr);
9628         literal_pool(256);
9629         {
9630           assem_debug("%p (%d) <- %8x\n",instr_addr[i],i,start+i*4);
9631           assem_debug("jump_in: %x\n",start+i*4);
9632           ll_add(jump_dirty+vpage,vaddr,out);
9633           void *entry_point = do_dirty_stub(i);
9634           ll_add_flags(jump_in+page,vaddr,state_rflags,entry_point);
9635           // If there was an existing entry in the hash table,
9636           // replace it with the new address.
9637           // Don't add new entries.  We'll insert the
9638           // ones that actually get used in check_addr().
9639           struct ht_entry *ht_bin = hash_table_get(vaddr);
9640           if (ht_bin->vaddr[0] == vaddr)
9641             ht_bin->tcaddr[0] = entry_point;
9642           if (ht_bin->vaddr[1] == vaddr)
9643             ht_bin->tcaddr[1] = entry_point;
9644         }
9645       }
9646     }
9647   }
9648   // Write out the literal pool if necessary
9649   literal_pool(0);
9650   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
9651   // Align code
9652   if(((u_int)out)&7) emit_addnop(13);
9653   #endif
9654   assert((u_int)out-(u_int)beginning<MAX_OUTPUT_BLOCK_SIZE);
9655   //printf("shadow buffer: %p-%p\n",copy,(u_char *)copy+slen*4);
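  // Keep a copy of the source MIPS code in the shadow buffer so the dirty
  // stubs can later verify that it has not changed before reusing this block.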
9656   memcpy(copy,source,slen*4);
9657   copy+=slen*4;
9658
9659   end_block(beginning);
9660
9661   // If we're within MAX_OUTPUT_BLOCK_SIZE (256K) of the end of the buffer,
9662   // wrap around to the beginning so the next block cannot overrun it.
9663   if (out > translation_cache+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE)
9664     out = translation_cache;
9665
9666   // Trap writes to any of the pages we compiled
9667   for(i=start>>12;i<=(start+slen*4)>>12;i++) {
9668     invalid_code[i]=0;
9669   }
9670   inv_code_start=inv_code_end=~0;
9671
9672   // For PCSX we also need to mark all RAM mirrors (KUSEG/KSEG0/KSEG1)
9673   if(get_page(start)<(RAM_SIZE>>12))
9674     for(i=start>>12;i<=(start+slen*4)>>12;i++)
9675       invalid_code[((u_int)0x00000000>>12)|(i&0x1ff)]=
9676       invalid_code[((u_int)0x80000000>>12)|(i&0x1ff)]=
9677       invalid_code[((u_int)0xa0000000>>12)|(i&0x1ff)]=0;
9678
9679   /* Pass 10 - Free memory by expiring oldest blocks */
9680
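  // The translation cache is used as a ring buffer split into 8 regions; as
  // the output pointer advances, old blocks ahead of it have their jump_in,
  // jump_dirty and jump_out entries and hash table slots removed so the
  // space can be safely overwritten.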
9681   int end=(((out-translation_cache)>>(TARGET_SIZE_2-16))+16384)&65535;
9682   while(expirep!=end)
9683   {
9684     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
9685     uintptr_t base=(uintptr_t)translation_cache+((expirep>>13)<<shift); // Base address of this block
9686     inv_debug("EXP: Phase %d\n",expirep);
9687     switch((expirep>>11)&3)
9688     {
9689       case 0:
9690         // Clear jump_in and jump_dirty
9691         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
9692         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
9693         ll_remove_matching_addrs(jump_in+2048+(expirep&2047),base,shift);
9694         ll_remove_matching_addrs(jump_dirty+2048+(expirep&2047),base,shift);
9695         break;
9696       case 1:
9697         // Clear pointers
9698         ll_kill_pointers(jump_out[expirep&2047],base,shift);
9699         ll_kill_pointers(jump_out[(expirep&2047)+2048],base,shift);
9700         break;
9701       case 2:
9702         // Clear hash table
9703         for(i=0;i<32;i++) {
9704           struct ht_entry *ht_bin = &hash_table[((expirep&2047)<<5)+i];
9705           if (((uintptr_t)ht_bin->tcaddr[1]>>shift) == (base>>shift) ||
9706              (((uintptr_t)ht_bin->tcaddr[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
9707             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[1],ht_bin->tcaddr[1]);
9708             ht_bin->vaddr[1] = -1;
9709             ht_bin->tcaddr[1] = NULL;
9710           }
9711           if (((uintptr_t)ht_bin->tcaddr[0]>>shift) == (base>>shift) ||
9712              (((uintptr_t)ht_bin->tcaddr[0]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
9713             inv_debug("EXP: Remove hash %x -> %p\n",ht_bin->vaddr[0],ht_bin->tcaddr[0]);
9714             ht_bin->vaddr[0] = ht_bin->vaddr[1];
9715             ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
9716             ht_bin->vaddr[1] = -1;
9717             ht_bin->tcaddr[1] = NULL;
9718           }
9719         }
9720         break;
9721       case 3:
9722         // Clear jump_out
9723         #ifdef __arm__
9724         if((expirep&2047)==0)
9725           do_clear_cache();
9726         #endif
9727         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
9728         ll_remove_matching_addrs(jump_out+2048+(expirep&2047),base,shift);
9729         break;
9730     }
9731     expirep=(expirep+1)&65535;
9732   }
9733   return 0;
9734 }
9735
9736 // vim:shiftwidth=2:expandtab