drc: rework for 64bit, part 3
libpcsxcore/new_dynarec/new_dynarec.c (pcsx_rearmed.git)
1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus - new_dynarec.c                                           *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20
21 #include <stdlib.h>
22 #include <stdint.h> // for uint64_t
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/mman.h>
26 #ifdef __MACH__
27 #include <libkern/OSCacheControl.h>
28 #endif
29 #ifdef _3DS
30 #include <3ds_utils.h>
31 #endif
32 #ifdef VITA
33 #include <psp2/kernel/sysmem.h>
34 static int sceBlock;
35 #endif
36
37 #include "new_dynarec_config.h"
38 #include "../psxhle.h" //emulator interface
39 #include "emu_if.h" //emulator interface
40
41 #ifndef ARRAY_SIZE
42 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
43 #endif
44
45 //#define DISASM
46 //#define assem_debug printf
47 //#define inv_debug printf
48 #define assem_debug(...)
49 #define inv_debug(...)
50
51 #ifdef __i386__
52 #include "assem_x86.h"
53 #endif
54 #ifdef __x86_64__
55 #include "assem_x64.h"
56 #endif
57 #ifdef __arm__
58 #include "assem_arm.h"
59 #endif
60
61 #define MAXBLOCK 4096
62 #define MAX_OUTPUT_BLOCK_SIZE 262144
63
64 // stubs
65 enum stub_type {
66   CC_STUB = 1,
67   FP_STUB = 2,
68   LOADB_STUB = 3,
69   LOADH_STUB = 4,
70   LOADW_STUB = 5,
71   LOADD_STUB = 6,
72   LOADBU_STUB = 7,
73   LOADHU_STUB = 8,
74   STOREB_STUB = 9,
75   STOREH_STUB = 10,
76   STOREW_STUB = 11,
77   STORED_STUB = 12,
78   STORELR_STUB = 13,
79   INVCODE_STUB = 14,
80 };
81
82 struct regstat
83 {
84   signed char regmap_entry[HOST_REGS];
85   signed char regmap[HOST_REGS];
86   uint64_t was32;
87   uint64_t is32;
88   uint64_t wasdirty;
89   uint64_t dirty;
90   uint64_t u;
91   uint64_t uu;
92   u_int wasconst;
93   u_int isconst;
94   u_int loadedconst;             // host regs that have constants loaded
95   u_int waswritten;              // MIPS regs that were used as store base before
96 };
97
98 // note: asm depends on this layout
99 struct ll_entry
100 {
101   u_int vaddr;
102   u_int reg_sv_flags;
103   void *addr;
104   struct ll_entry *next;
105 };
106
107 struct ht_entry
108 {
109   u_int vaddr[2];
110   void *tcaddr[2];
111 };
112
113 struct code_stub
114 {
115   enum stub_type type;
116   void *addr;
117   void *retaddr;
118   u_int a;
119   uintptr_t b;
120   uintptr_t c;
121   u_int d;
122   u_int e;
123 };
124
125 struct link_entry
126 {
127   void *addr;
128   u_int target;
129   u_int ext;
130 };
131
132   // used by asm:
133   u_char *out;
134   struct ht_entry hash_table[65536]  __attribute__((aligned(16)));
135   struct ll_entry *jump_in[4096] __attribute__((aligned(16)));
136   struct ll_entry *jump_dirty[4096];
137
138   static struct ll_entry *jump_out[4096];
139   static u_int start;
140   static u_int *source;
141   static char insn[MAXBLOCK][10];
142   static u_char itype[MAXBLOCK];
143   static u_char opcode[MAXBLOCK];
144   static u_char opcode2[MAXBLOCK];
145   static u_char bt[MAXBLOCK];
146   static u_char rs1[MAXBLOCK];
147   static u_char rs2[MAXBLOCK];
148   static u_char rt1[MAXBLOCK];
149   static u_char rt2[MAXBLOCK];
150   static u_char us1[MAXBLOCK];
151   static u_char us2[MAXBLOCK];
152   static u_char dep1[MAXBLOCK];
153   static u_char dep2[MAXBLOCK];
154   static u_char lt1[MAXBLOCK];
155   static uint64_t gte_rs[MAXBLOCK]; // gte: 32 data and 32 ctl regs
156   static uint64_t gte_rt[MAXBLOCK];
157   static uint64_t gte_unneeded[MAXBLOCK];
158   static u_int smrv[32]; // speculated MIPS register values
159   static u_int smrv_strong; // mask of regs that are likely to have correct values
160   static u_int smrv_weak; // same, but somewhat less likely
161   static u_int smrv_strong_next; // same, but after current insn executes
162   static u_int smrv_weak_next;
163   static int imm[MAXBLOCK];
164   static u_int ba[MAXBLOCK];
165   static char likely[MAXBLOCK];
166   static char is_ds[MAXBLOCK];
167   static char ooo[MAXBLOCK];
168   static uint64_t unneeded_reg[MAXBLOCK];
169   static uint64_t unneeded_reg_upper[MAXBLOCK];
170   static uint64_t branch_unneeded_reg[MAXBLOCK];
171   static uint64_t branch_unneeded_reg_upper[MAXBLOCK];
172   static signed char regmap_pre[MAXBLOCK][HOST_REGS];
173   static uint64_t current_constmap[HOST_REGS];
174   static uint64_t constmap[MAXBLOCK][HOST_REGS];
175   static struct regstat regs[MAXBLOCK];
176   static struct regstat branch_regs[MAXBLOCK];
177   static signed char minimum_free_regs[MAXBLOCK];
178   static u_int needed_reg[MAXBLOCK];
179   static u_int wont_dirty[MAXBLOCK];
180   static u_int will_dirty[MAXBLOCK];
181   static int ccadj[MAXBLOCK];
182   static int slen;
183   static void *instr_addr[MAXBLOCK];
184   static struct link_entry link_addr[MAXBLOCK];
185   static int linkcount;
186   static struct code_stub stubs[MAXBLOCK*3];
187   static int stubcount;
188   static u_int literals[1024][2];
189   static int literalcount;
190   static int is_delayslot;
191   static int cop1_usable;
192   static char shadow[1048576]  __attribute__((aligned(16)));
193   static void *copy;
194   static int expirep;
195   static u_int stop_after_jal;
196 #ifndef RAM_FIXED
197   static u_int ram_offset;
198 #else
199   static const u_int ram_offset=0;
200 #endif
201
202   int new_dynarec_hacks;
203   int new_dynarec_did_compile;
204   extern u_char restore_candidate[512];
205   extern int cycle_count;
206
207   /* registers that may be allocated */
208   /* 1-31 gpr */
209 #define HIREG 32 // hi
210 #define LOREG 33 // lo
211 #define FSREG 34 // FPU status (FCSR)
212 #define CSREG 35 // Coprocessor status
213 #define CCREG 36 // Cycle count
214 #define INVCP 37 // Pointer to invalid_code
215 //#define MMREG 38 // Pointer to memory_map
216 #define ROREG 39 // ram offset (if rdram!=0x80000000)
217 #define TEMPREG 40
218 #define FTEMP 40 // FPU temporary register
219 #define PTEMP 41 // Prefetch temporary register
220 //#define TLREG 42 // TLB mapping offset
221 #define RHASH 43 // Return address hash
222 #define RHTBL 44 // Return address hash table address
223 #define RTEMP 45 // JR/JALR address register
224 #define MAXREG 45
225 #define AGEN1 46 // Address generation temporary register
226 //#define AGEN2 47 // Address generation temporary register
227 //#define MGEN1 48 // Maptable address generation temporary register
228 //#define MGEN2 49 // Maptable address generation temporary register
229 #define BTREG 50 // Branch target temporary register
230
231   /* instruction types */
232 #define NOP 0     // No operation
233 #define LOAD 1    // Load
234 #define STORE 2   // Store
235 #define LOADLR 3  // Unaligned load
236 #define STORELR 4 // Unaligned store
237 #define MOV 5     // Move
238 #define ALU 6     // Arithmetic/logic
239 #define MULTDIV 7 // Multiply/divide
240 #define SHIFT 8   // Shift by register
241 #define SHIFTIMM 9// Shift by immediate
242 #define IMM16 10  // 16-bit immediate
243 #define RJUMP 11  // Unconditional jump to register
244 #define UJUMP 12  // Unconditional jump
245 #define CJUMP 13  // Conditional branch (BEQ/BNE/BGTZ/BLEZ)
246 #define SJUMP 14  // Conditional branch (regimm format)
247 #define COP0 15   // Coprocessor 0
248 #define COP1 16   // Coprocessor 1
249 #define C1LS 17   // Coprocessor 1 load/store
250 #define FJUMP 18  // Conditional branch (floating point)
251 #define FLOAT 19  // Floating point unit
252 #define FCONV 20  // Convert integer to float
253 #define FCOMP 21  // Floating point compare (sets FSREG)
254 #define SYSCALL 22// SYSCALL
255 #define OTHER 23  // Other
256 #define SPAN 24   // Branch/delay slot spans 2 pages
257 #define NI 25     // Not implemented
258 #define HLECALL 26// PCSX fake opcodes for HLE
259 #define COP2 27   // Coprocessor 2 move
260 #define C2LS 28   // Coprocessor 2 load/store
261 #define C2OP 29   // Coprocessor 2 operation
262 #define INTCALL 30// Call interpreter to handle rare corner cases
263
264   /* branch codes */
265 #define TAKEN 1
266 #define NOTTAKEN 2
267 #define NULLDS 3
268
269 // asm linkage
270 int new_recompile_block(int addr);
271 void *get_addr_ht(u_int vaddr);
272 void invalidate_block(u_int block);
273 void invalidate_addr(u_int addr);
274 void remove_hash(int vaddr);
275 void dyna_linker();
276 void dyna_linker_ds();
277 void verify_code();
278 void verify_code_vm();
279 void verify_code_ds();
280 void cc_interrupt();
281 void fp_exception();
282 void fp_exception_ds();
283 void jump_syscall_hle();
284 void jump_hlecall();
285 void jump_intcall();
286 void new_dyna_leave();
287
288 // Needed by assembler
289 static void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32);
290 static void wb_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty);
291 static void wb_needed_dirtys(signed char i_regmap[],uint64_t i_is32,uint64_t i_dirty,int addr);
292 static void load_all_regs(signed char i_regmap[]);
293 static void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
294 static void load_regs_entry(int t);
295 static void load_all_consts(signed char regmap[],int is32,u_int dirty,int i);
296
297 static int verify_dirty(u_int *ptr);
298 static int get_final_value(int hr, int i, int *value);
299 static void add_stub(enum stub_type type, void *addr, void *retaddr,
300   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e);
301 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
302   int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist);
303 static void add_to_linker(void *addr, u_int target, int ext);
304
305 static void mprotect_w_x(void *start, void *end, int is_x)
306 {
307 #ifdef NO_WRITE_EXEC
308   #if defined(VITA)
309   // *Open* enables write on all memory that was
310   // allocated by sceKernelAllocMemBlockForVM()?
311   if (is_x)
312     sceKernelCloseVMDomain();
313   else
314     sceKernelOpenVMDomain();
315   #else
316   u_long mstart = (u_long)start & ~4095ul;
317   u_long mend = (u_long)end;
318   if (mprotect((void *)mstart, mend - mstart,
319                PROT_READ | (is_x ? PROT_EXEC : PROT_WRITE)) != 0)
320     SysPrintf("mprotect(%c) failed: %s\n", is_x ? 'x' : 'w', strerror(errno));
321   #endif
322 #endif
323 }
324
325 static void start_tcache_write(void *start, void *end)
326 {
327   mprotect_w_x(start, end, 0);
328 }
329
330 static void end_tcache_write(void *start, void *end)
331 {
332 #ifdef __arm__
333   size_t len = (char *)end - (char *)start;
334   #if   defined(__BLACKBERRY_QNX__)
335   msync(start, len, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
336   #elif defined(__MACH__)
337   sys_cache_control(kCacheFunctionPrepareForExecution, start, len);
338   #elif defined(VITA)
339   sceKernelSyncVMDomain(sceBlock, start, len);
340   #elif defined(_3DS)
341   ctr_flush_invalidate_cache();
342   #else
343   __clear_cache(start, end);
344   #endif
345   (void)len;
346 #endif
347
348   mprotect_w_x(start, end, 1);
349 }
350
351 static void *start_block(void)
352 {
353   u_char *end = out + MAX_OUTPUT_BLOCK_SIZE;
354   if (end > translation_cache + (1<<TARGET_SIZE_2))
355     end = translation_cache + (1<<TARGET_SIZE_2);
356   start_tcache_write(out, end);
357   return out;
358 }
359
360 static void end_block(void *start)
361 {
362   end_tcache_write(start, out);
363 }
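/* Usage sketch (illustrative, not part of the original source): a compile
 * pass brackets its writes to the translation cache with these helpers:
 *
 *   void *beg = start_block();  // make [out, out+MAX_OUTPUT_BLOCK_SIZE) writable
 *   ...emit code, advancing 'out'...
 *   end_block(beg);             // sync the icache, restore execute permission
 *
 * On builds without NO_WRITE_EXEC the mprotect calls are no-ops and only
 * the ARM icache maintenance in end_tcache_write() remains. */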
364
365 //#define DEBUG_CYCLE_COUNT 1
366
367 #define NO_CYCLE_PENALTY_THR 12
368
369 int cycle_multiplier; // 100 for 1.0
370
371 static int CLOCK_ADJUST(int x)
372 {
373   int s=(x>>31)|1;
374   return (x * cycle_multiplier + s * 50) / 100;
375 }
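/* Worked example (illustrative): CLOCK_ADJUST scales a cycle count by
 * cycle_multiplier/100, rounding half away from zero.  s is +1 for
 * non-negative x and -1 for negative x, so the +/-50 bias rounds the
 * truncating division to the nearest integer:
 *
 *   cycle_multiplier = 150, x =  3  ->  ( 450 + 50) / 100 =  5   (true 4.5)
 *   cycle_multiplier = 150, x = -3  ->  (-450 - 50) / 100 = -5
 *   cycle_multiplier = 100          ->  identity                          */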
376
377 static u_int get_page(u_int vaddr)
378 {
379   u_int page=vaddr&~0xe0000000;
380   if (page < 0x1000000)
381     page &= ~0x0e00000; // RAM mirrors
382   page>>=12;
383   if(page>2048) page=2048+(page&2047);
384   return page;
385 }
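/* Examples (illustrative): the top three address bits (the KUSEG/KSEG0/KSEG1
 * aliases) are stripped, the 2MB RAM mirrors within the first 16MB are
 * folded together, and everything above the 2048 RAM pages is hashed into
 * pages 2048..4095:
 *
 *   get_page(0x80000000) == 0     // RAM, start
 *   get_page(0x00200000) == 0     // same RAM through a mirror
 *   get_page(0xbfc00000) == 3072  // BIOS: 2048 + (0x1fc00 & 2047)        */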
386
387 // no virtual mem in PCSX
388 static u_int get_vpage(u_int vaddr)
389 {
390   return get_page(vaddr);
391 }
392
393 static struct ht_entry *hash_table_get(u_int vaddr)
394 {
395   return &hash_table[((vaddr>>16)^vaddr)&0xFFFF];
396 }
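/* Example (illustrative): the bin index xors the upper half of the address
 * into the lower half, so vaddr 0x80030000 maps to bin
 * (0x8003 ^ 0x0000) & 0xffff == 0x8003 of the 65536-entry table. */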
397
398 static void hash_table_add(struct ht_entry *ht_bin, u_int vaddr, void *tcaddr)
399 {
400   ht_bin->vaddr[1] = ht_bin->vaddr[0];
401   ht_bin->tcaddr[1] = ht_bin->tcaddr[0];
402   ht_bin->vaddr[0] = vaddr;
403   ht_bin->tcaddr[0] = tcaddr;
404 }
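/* Each bin holds two entries and behaves like a tiny 2-way FIFO: the new
 * mapping takes slot 0, the previous slot 0 is demoted to slot 1, and the
 * old slot 1 entry is evicted. */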
405
406 // some of ari64's messy code; seems to rely on unsigned 32-bit overflow
407 static int doesnt_expire_soon(void *tcaddr)
408 {
409   u_int diff = (u_int)((u_char *)tcaddr - out) << (32-TARGET_SIZE_2);
410   return diff > (u_int)(0x60000000 + (MAX_OUTPUT_BLOCK_SIZE << (32-TARGET_SIZE_2)));
411 }
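/* Illustrative reading: the translation cache is a circular buffer of
 * (1<<TARGET_SIZE_2) bytes (TARGET_SIZE_2 is target-specific), so the byte
 * distance from 'out' is shifted up until the cache size spans the full
 * 32-bit range and the subtraction wraps modulo the cache size.  A block is
 * considered safe if it sits more than 3/8 of the cache (0x60000000 scaled
 * back down), plus one maximum block, ahead of the write pointer -- i.e.
 * the expiry sweep will not reach it soon. */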
412
413 // Get address from virtual address
414 // This is called from the recompiled JR/JALR instructions
415 void *get_addr(u_int vaddr)
416 {
417   u_int page=get_page(vaddr);
418   u_int vpage=get_vpage(vaddr);
419   struct ll_entry *head;
420   //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
421   head=jump_in[page];
422   while(head!=NULL) {
423     if(head->vaddr==vaddr) {
424   //printf("TRACE: count=%d next=%d (get_addr match %x: %p)\n",Count,next_interupt,vaddr,head->addr);
425       hash_table_add(hash_table_get(vaddr), vaddr, head->addr);
426       return head->addr;
427     }
428     head=head->next;
429   }
430   head=jump_dirty[vpage];
431   while(head!=NULL) {
432     if(head->vaddr==vaddr) {
433       //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %p)\n",Count,next_interupt,vaddr,head->addr);
434       // Don't restore blocks which are about to expire from the cache
435       if (doesnt_expire_soon(head->addr))
436       if (verify_dirty(head->addr)) {
437         //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,invalid_code[vaddr>>12]);
438         invalid_code[vaddr>>12]=0;
439         inv_code_start=inv_code_end=~0;
440         if(vpage<2048) {
441           restore_candidate[vpage>>3]|=1<<(vpage&7);
442         }
443         else restore_candidate[page>>3]|=1<<(page&7);
444         struct ht_entry *ht_bin = hash_table_get(vaddr);
445         if (ht_bin->vaddr[0] == vaddr)
446           ht_bin->tcaddr[0] = head->addr; // Replace existing entry
447         else
448           hash_table_add(ht_bin, vaddr, head->addr);
449
450         return head->addr;
451       }
452     }
453     head=head->next;
454   }
455   //printf("TRACE: count=%d next=%d (get_addr no-match %x)\n",Count,next_interupt,vaddr);
456   int r=new_recompile_block(vaddr);
457   if(r==0) return get_addr(vaddr);
458   // Execute in unmapped page, generate pagefault exception
459   Status|=2;
460   Cause=(vaddr<<31)|0x8;
461   EPC=(vaddr&1)?vaddr-5:vaddr;
462   BadVAddr=(vaddr&~1);
463   Context=(Context&0xFF80000F)|((BadVAddr>>9)&0x007FFFF0);
464   EntryHi=BadVAddr&0xFFFFE000;
465   return get_addr_ht(0x80000000);
466 }
467 // Look up address in hash table first
468 void *get_addr_ht(u_int vaddr)
469 {
470   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
471   const struct ht_entry *ht_bin = hash_table_get(vaddr);
472   if (ht_bin->vaddr[0] == vaddr) return ht_bin->tcaddr[0];
473   if (ht_bin->vaddr[1] == vaddr) return ht_bin->tcaddr[1];
474   return get_addr(vaddr);
475 }
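/* Lookup order (summary): get_addr_ht() tries the two hash-table slots
 * first; on a miss, get_addr() walks jump_in[page] for a clean block, then
 * jump_dirty[vpage] for a block that verify_dirty() can revalidate, and
 * only then falls back to new_recompile_block(). */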
476
477 void clear_all_regs(signed char regmap[])
478 {
479   int hr;
480   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
481 }
482
483 signed char get_reg(signed char regmap[],int r)
484 {
485   int hr;
486   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
487   return -1;
488 }
489
490 // Find a register that is available for two consecutive cycles
491 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
492 {
493   int hr;
494   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
495   return -1;
496 }
497
498 int count_free_regs(signed char regmap[])
499 {
500   int count=0;
501   int hr;
502   for(hr=0;hr<HOST_REGS;hr++)
503   {
504     if(hr!=EXCLUDE_REG) {
505       if(regmap[hr]<0) count++;
506     }
507   }
508   return count;
509 }
510
511 void dirty_reg(struct regstat *cur,signed char reg)
512 {
513   int hr;
514   if(!reg) return;
515   for (hr=0;hr<HOST_REGS;hr++) {
516     if((cur->regmap[hr]&63)==reg) {
517       cur->dirty|=1<<hr;
518     }
519   }
520 }
521
522 // If we dirty the lower half of a 64 bit register which is now being
523 // sign-extended, we need to dump the upper half.
524 // Note: Do this only after completion of the instruction, because
525 // some instructions may need to read the full 64-bit value even if
526 // overwriting it (eg SLTI, DSRA32).
527 static void flush_dirty_uppers(struct regstat *cur)
528 {
529   int hr,reg;
530   for (hr=0;hr<HOST_REGS;hr++) {
531     if((cur->dirty>>hr)&1) {
532       reg=cur->regmap[hr];
533       if(reg>=64)
534         if((cur->is32>>(reg&63))&1) cur->regmap[hr]=-1;
535     }
536   }
537 }
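/* Register-map encoding, as this function relies on it: a regmap[] entry r
 * in 0..63 holds the low 32 bits of emulated register r, while r|64 holds
 * its upper half; an is32 bit means the register is a sign-extended 32-bit
 * value.  So a dirty host reg with reg>=64 whose base register just became
 * 32-bit holds a stale upper half, and is dropped (regmap[hr] = -1) rather
 * than written back. */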
538
539 void set_const(struct regstat *cur,signed char reg,uint64_t value)
540 {
541   int hr;
542   if(!reg) return;
543   for (hr=0;hr<HOST_REGS;hr++) {
544     if(cur->regmap[hr]==reg) {
545       cur->isconst|=1<<hr;
546       current_constmap[hr]=value;
547     }
548     else if((cur->regmap[hr]^64)==reg) {
549       cur->isconst|=1<<hr;
550       current_constmap[hr]=value>>32;
551     }
552   }
553 }
554
555 void clear_const(struct regstat *cur,signed char reg)
556 {
557   int hr;
558   if(!reg) return;
559   for (hr=0;hr<HOST_REGS;hr++) {
560     if((cur->regmap[hr]&63)==reg) {
561       cur->isconst&=~(1<<hr);
562     }
563   }
564 }
565
566 int is_const(struct regstat *cur,signed char reg)
567 {
568   int hr;
569   if(reg<0) return 0;
570   if(!reg) return 1;
571   for (hr=0;hr<HOST_REGS;hr++) {
572     if((cur->regmap[hr]&63)==reg) {
573       return (cur->isconst>>hr)&1;
574     }
575   }
576   return 0;
577 }
578 uint64_t get_const(struct regstat *cur,signed char reg)
579 {
580   int hr;
581   if(!reg) return 0;
582   for (hr=0;hr<HOST_REGS;hr++) {
583     if(cur->regmap[hr]==reg) {
584       return current_constmap[hr];
585     }
586   }
587   SysPrintf("Unknown constant in r%d\n",reg);
588   exit(1);
589 }
590
591 // Least soon needed registers
592 // Look at the next ten instructions and see which registers
593 // will be used.  Try not to reallocate these.
594 void lsn(u_char hsn[], int i, int *preferred_reg)
595 {
596   int j;
597   int b=-1;
598   for(j=0;j<9;j++)
599   {
600     if(i+j>=slen) {
601       j=slen-i-1;
602       break;
603     }
604     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
605     {
606       // Don't go past an unconditional jump
607       j++;
608       break;
609     }
610   }
611   for(;j>=0;j--)
612   {
613     if(rs1[i+j]) hsn[rs1[i+j]]=j;
614     if(rs2[i+j]) hsn[rs2[i+j]]=j;
615     if(rt1[i+j]) hsn[rt1[i+j]]=j;
616     if(rt2[i+j]) hsn[rt2[i+j]]=j;
617     if(itype[i+j]==STORE || itype[i+j]==STORELR) {
618       // Stores can allocate zero
619       hsn[rs1[i+j]]=j;
620       hsn[rs2[i+j]]=j;
621     }
622     // On some architectures stores need invc_ptr
623     #if defined(HOST_IMM8)
624     if(itype[i+j]==STORE || itype[i+j]==STORELR || (opcode[i+j]&0x3b)==0x39 || (opcode[i+j]&0x3b)==0x3a) {
625       hsn[INVCP]=j;
626     }
627     #endif
628     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
629     {
630       hsn[CCREG]=j;
631       b=j;
632     }
633   }
634   if(b>=0)
635   {
636     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
637     {
638       // Follow first branch
639       int t=(ba[i+b]-start)>>2;
640       j=7-b;if(t+j>=slen) j=slen-t-1;
641       for(;j>=0;j--)
642       {
643         if(rs1[t+j]) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
644         if(rs2[t+j]) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
645         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
646         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
647       }
648     }
649     // TODO: preferred register based on backward branch
650   }
651   // Delay slot should preferably not overwrite branch conditions or cycle count
652   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)) {
653     if(rs1[i-1]) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
654     if(rs2[i-1]) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
655     hsn[CCREG]=1;
656     // ...or hash tables
657     hsn[RHASH]=1;
658     hsn[RHTBL]=1;
659   }
660   // Coprocessor load/store needs FTEMP, even if not declared
661   if(itype[i]==C1LS||itype[i]==C2LS) {
662     hsn[FTEMP]=0;
663   }
664   // Load L/R also uses FTEMP as a temporary register
665   if(itype[i]==LOADLR) {
666     hsn[FTEMP]=0;
667   }
668   // Also SWL/SWR/SDL/SDR
669   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) {
670     hsn[FTEMP]=0;
671   }
672   // Don't remove the miniht registers
673   if(itype[i]==UJUMP||itype[i]==RJUMP)
674   {
675     hsn[RHASH]=0;
676     hsn[RHTBL]=0;
677   }
678 }
679
680 // We only want to allocate registers if we're going to use them again soon
681 int needed_again(int r, int i)
682 {
683   int j;
684   int b=-1;
685   int rn=10;
686
687   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP||(source[i-1]>>16)==0x1000))
688   {
689     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
690       return 0; // Don't need any registers if exiting the block
691   }
692   for(j=0;j<9;j++)
693   {
694     if(i+j>=slen) {
695       j=slen-i-1;
696       break;
697     }
698     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
699     {
700       // Don't go past an unconditional jump
701       j++;
702       break;
703     }
704     if(itype[i+j]==SYSCALL||itype[i+j]==HLECALL||itype[i+j]==INTCALL||((source[i+j]&0xfc00003f)==0x0d))
705     {
706       break;
707     }
708   }
709   for(;j>=1;j--)
710   {
711     if(rs1[i+j]==r) rn=j;
712     if(rs2[i+j]==r) rn=j;
713     if((unneeded_reg[i+j]>>r)&1) rn=10;
714     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP||itype[i+j]==FJUMP))
715     {
716       b=j;
717     }
718   }
719   /*
720   if(b>=0)
721   {
722     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
723     {
724       // Follow first branch
725       int o=rn;
726       int t=(ba[i+b]-start)>>2;
727       j=7-b;if(t+j>=slen) j=slen-t-1;
728       for(;j>=0;j--)
729       {
730         if(!((unneeded_reg[t+j]>>r)&1)) {
731           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
732           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
733         }
734         else rn=o;
735       }
736     }
737   }*/
738   if(rn<10) return 1;
739   (void)b;
740   return 0;
741 }
742
743 // Try to match register allocations at the end of a loop with those
744 // at the beginning
745 int loop_reg(int i, int r, int hr)
746 {
747   int j,k;
748   for(j=0;j<9;j++)
749   {
750     if(i+j>=slen) {
751       j=slen-i-1;
752       break;
753     }
754     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP||(source[i+j]>>16)==0x1000)
755     {
756       // Don't go past an unconditional jump
757       j++;
758       break;
759     }
760   }
761   k=0;
762   if(i>0){
763     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP||itype[i-1]==FJUMP)
764       k--;
765   }
766   for(;k<j;k++)
767   {
768     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
769     if(r>64&&((unneeded_reg_upper[i+k]>>r)&1)) return hr;
770     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP||itype[i+k]==FJUMP))
771     {
772       if(ba[i+k]>=start && ba[i+k]<(start+i*4))
773       {
774         int t=(ba[i+k]-start)>>2;
775         int reg=get_reg(regs[t].regmap_entry,r);
776         if(reg>=0) return reg;
777         //reg=get_reg(regs[t+1].regmap_entry,r);
778         //if(reg>=0) return reg;
779       }
780     }
781   }
782   return hr;
783 }
784
785
786 // Allocate every register, preserving source/target regs
787 void alloc_all(struct regstat *cur,int i)
788 {
789   int hr;
790
791   for(hr=0;hr<HOST_REGS;hr++) {
792     if(hr!=EXCLUDE_REG) {
793       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&
794          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
795       {
796         cur->regmap[hr]=-1;
797         cur->dirty&=~(1<<hr);
798       }
799       // Don't need zeros
800       if((cur->regmap[hr]&63)==0)
801       {
802         cur->regmap[hr]=-1;
803         cur->dirty&=~(1<<hr);
804       }
805     }
806   }
807 }
808
809 #ifdef __i386__
810 #include "assem_x86.c"
811 #endif
812 #ifdef __x86_64__
813 #include "assem_x64.c"
814 #endif
815 #ifdef __arm__
816 #include "assem_arm.c"
817 #endif
818
819 // Add virtual address mapping to linked list
820 void ll_add(struct ll_entry **head,int vaddr,void *addr)
821 {
822   struct ll_entry *new_entry;
823   new_entry=malloc(sizeof(struct ll_entry));
824   assert(new_entry!=NULL);
825   new_entry->vaddr=vaddr;
826   new_entry->reg_sv_flags=0;
827   new_entry->addr=addr;
828   new_entry->next=*head;
829   *head=new_entry;
830 }
831
832 void ll_add_flags(struct ll_entry **head,int vaddr,u_int reg_sv_flags,void *addr)
833 {
834   ll_add(head,vaddr,addr);
835   (*head)->reg_sv_flags=reg_sv_flags;
836 }
837
838 // Check if an address is already compiled
839 // but don't return addresses which are about to expire from the cache
840 void *check_addr(u_int vaddr)
841 {
842   struct ht_entry *ht_bin = hash_table_get(vaddr);
843   size_t i;
844   for (i = 0; i < ARRAY_SIZE(ht_bin->vaddr); i++) {
845     if (ht_bin->vaddr[i] == vaddr)
846       if (doesnt_expire_soon((u_char *)ht_bin->tcaddr[i] - MAX_OUTPUT_BLOCK_SIZE))
847         if (isclean(ht_bin->tcaddr[i]))
848           return ht_bin->tcaddr[i];
849   }
850   u_int page=get_page(vaddr);
851   struct ll_entry *head;
852   head=jump_in[page];
853   while (head != NULL) {
854     if (head->vaddr == vaddr) {
855       if (doesnt_expire_soon(head->addr)) {
856         // Update existing entry with current address
857         if (ht_bin->vaddr[0] == vaddr) {
858           ht_bin->tcaddr[0] = head->addr;
859           return head->addr;
860         }
861         if (ht_bin->vaddr[1] == vaddr) {
862           ht_bin->tcaddr[1] = head->addr;
863           return head->addr;
864         }
865         // Insert into hash table with low priority.
866         // Don't evict existing entries, as they are probably
867         // addresses that are being accessed frequently.
868         if (ht_bin->vaddr[0] == -1) {
869           ht_bin->vaddr[0] = vaddr;
870           ht_bin->tcaddr[0] = head->addr;
871         }
872         else if (ht_bin->vaddr[1] == -1) {
873           ht_bin->vaddr[1] = vaddr;
874           ht_bin->tcaddr[1] = head->addr;
875         }
876         return head->addr;
877       }
878     }
879     head=head->next;
880   }
881   return 0;
882 }
883
884 void remove_hash(int vaddr)
885 {
886   //printf("remove hash: %x\n",vaddr);
887   struct ht_entry *ht_bin = hash_table_get(vaddr);
888   if (ht_bin->vaddr[1] == vaddr) {
889     ht_bin->vaddr[1] = -1;
890     ht_bin->tcaddr[1] = NULL;
891   }
892   if (ht_bin->vaddr[0] == vaddr) {
893     ht_bin->vaddr[0] = ht_bin->vaddr[1];
894     ht_bin->tcaddr[0] = ht_bin->tcaddr[1];
895     ht_bin->vaddr[1] = -1;
896     ht_bin->tcaddr[1] = NULL;
897   }
898 }
899
900 void ll_remove_matching_addrs(struct ll_entry **head,uintptr_t addr,int shift)
901 {
902   struct ll_entry *next;
903   while(*head) {
904     if(((uintptr_t)((*head)->addr)>>shift)==(addr>>shift) ||
905        ((uintptr_t)((*head)->addr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
906     {
907       inv_debug("EXP: Remove pointer to %p (%x)\n",(*head)->addr,(*head)->vaddr);
908       remove_hash((*head)->vaddr);
909       next=(*head)->next;
910       free(*head);
911       *head=next;
912     }
913     else
914     {
915       head=&((*head)->next);
916     }
917   }
918 }
919
920 // Remove all entries from linked list
921 void ll_clear(struct ll_entry **head)
922 {
923   struct ll_entry *cur;
924   struct ll_entry *next;
925   if((cur=*head)) {
926     *head=0;
927     while(cur) {
928       next=cur->next;
929       free(cur);
930       cur=next;
931     }
932   }
933 }
934
935 // Dereference the pointers and remove if it matches
936 static void ll_kill_pointers(struct ll_entry *head,uintptr_t addr,int shift)
937 {
938   while(head) {
939     uintptr_t ptr = (uintptr_t)get_pointer(head->addr);
940     inv_debug("EXP: Lookup pointer to %lx at %p (%x)\n",(long)ptr,head->addr,head->vaddr);
941     if(((ptr>>shift)==(addr>>shift)) ||
942        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
943     {
944       inv_debug("EXP: Kill pointer at %p (%x)\n",head->addr,head->vaddr);
945       void *host_addr=find_extjump_insn(head->addr);
946       #ifdef __arm__
947         mark_clear_cache(host_addr);
948       #endif
949       set_jump_target(host_addr, head->addr);
950     }
951     head=head->next;
952   }
953 }
954
955 // This is called when we write to a compiled block (see do_invstub)
956 void invalidate_page(u_int page)
957 {
958   struct ll_entry *head;
959   struct ll_entry *next;
960   head=jump_in[page];
961   jump_in[page]=0;
962   while(head!=NULL) {
963     inv_debug("INVALIDATE: %x\n",head->vaddr);
964     remove_hash(head->vaddr);
965     next=head->next;
966     free(head);
967     head=next;
968   }
969   head=jump_out[page];
970   jump_out[page]=0;
971   while(head!=NULL) {
972     inv_debug("INVALIDATE: kill pointer to %x (%p)\n",head->vaddr,head->addr);
973     void *host_addr=find_extjump_insn(head->addr);
974     #ifdef __arm__
975       mark_clear_cache(host_addr);
976     #endif
977     set_jump_target(host_addr, head->addr);
978     next=head->next;
979     free(head);
980     head=next;
981   }
982 }
983
984 static void invalidate_block_range(u_int block, u_int first, u_int last)
985 {
986   u_int page=get_page(block<<12);
987   //printf("first=%d last=%d\n",first,last);
988   invalidate_page(page);
989   assert(first+5>page); // NB: this assumes MAXBLOCK<=4096 (4 pages)
990   assert(last<page+5);
991   // Invalidate the adjacent pages if a block crosses a 4K boundary
992   while(first<page) {
993     invalidate_page(first);
994     first++;
995   }
996   for(first=page+1;first<last;first++) {
997     invalidate_page(first);
998   }
999   #ifdef __arm__
1000     do_clear_cache();
1001   #endif
1002
1003   // Don't trap writes
1004   invalid_code[block]=1;
1005
1006   #ifdef USE_MINI_HT
1007   memset(mini_ht,-1,sizeof(mini_ht));
1008   #endif
1009 }
1010
1011 void invalidate_block(u_int block)
1012 {
1013   u_int page=get_page(block<<12);
1014   u_int vpage=get_vpage(block<<12);
1015   inv_debug("INVALIDATE: %x (%d)\n",block<<12,page);
1016   //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
1017   u_int first,last;
1018   first=last=page;
1019   struct ll_entry *head;
1020   head=jump_dirty[vpage];
1021   //printf("page=%d vpage=%d\n",page,vpage);
1022   while(head!=NULL) {
1023     u_int start,end;
1024     if(vpage>2047||(head->vaddr>>12)==block) { // Ignore vaddr hash collision
1025       get_bounds(head->addr,&start,&end);
1026       //printf("start: %x end: %x\n",start,end);
1027       if(page<2048&&start>=(u_int)rdram&&end<(u_int)rdram+RAM_SIZE) {
1028         if(((start-(u_int)rdram)>>12)<=page&&((end-1-(u_int)rdram)>>12)>=page) {
1029           if((((start-(u_int)rdram)>>12)&2047)<first) first=((start-(u_int)rdram)>>12)&2047;
1030           if((((end-1-(u_int)rdram)>>12)&2047)>last) last=((end-1-(u_int)rdram)>>12)&2047;
1031         }
1032       }
1033     }
1034     head=head->next;
1035   }
1036   invalidate_block_range(block,first,last);
1037 }
1038
1039 void invalidate_addr(u_int addr)
1040 {
1041   //static int rhits;
1042   // this check is done by the caller
1043   //if (inv_code_start<=addr&&addr<=inv_code_end) { rhits++; return; }
1044   u_int page=get_vpage(addr);
1045   if(page<2048) { // RAM
1046     struct ll_entry *head;
1047     u_int addr_min=~0, addr_max=0;
1048     u_int mask=RAM_SIZE-1;
1049     u_int addr_main=0x80000000|(addr&mask);
1050     int pg1;
1051     inv_code_start=addr_main&~0xfff;
1052     inv_code_end=addr_main|0xfff;
1053     pg1=page;
1054     if (pg1>0) {
1055       // must check previous page too because of spans..
1056       pg1--;
1057       inv_code_start-=0x1000;
1058     }
1059     for(;pg1<=page;pg1++) {
1060       for(head=jump_dirty[pg1];head!=NULL;head=head->next) {
1061         u_int start,end;
1062         get_bounds(head->addr,&start,&end);
1063         if(ram_offset) {
1064           start-=ram_offset;
1065           end-=ram_offset;
1066         }
1067         if(start<=addr_main&&addr_main<end) {
1068           if(start<addr_min) addr_min=start;
1069           if(end>addr_max) addr_max=end;
1070         }
1071         else if(addr_main<start) {
1072           if(start<inv_code_end)
1073             inv_code_end=start-1;
1074         }
1075         else {
1076           if(end>inv_code_start)
1077             inv_code_start=end;
1078         }
1079       }
1080     }
1081     if (addr_min!=~0) {
1082       inv_debug("INV ADDR: %08x hit %08x-%08x\n", addr, addr_min, addr_max);
1083       inv_code_start=inv_code_end=~0;
1084       invalidate_block_range(addr>>12,(addr_min&mask)>>12,(addr_max&mask)>>12);
1085       return;
1086     }
1087     else {
1088       inv_code_start=(addr&~mask)|(inv_code_start&mask);
1089       inv_code_end=(addr&~mask)|(inv_code_end&mask);
1090       inv_debug("INV ADDR: %08x miss, inv %08x-%08x, sk %d\n", addr, inv_code_start, inv_code_end, 0);
1091       return;
1092     }
1093   }
1094   invalidate_block(addr>>12);
1095 }
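/* inv_code_start/inv_code_end cache the last address range found to contain
 * no compiled code, so the memory-write fast path in the caller can skip
 * repeated lookups; any hit resets the range to empty (~0) before the
 * actual invalidation. */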
1096
1097 // This is called when loading a save state.
1098 // Anything could have changed, so invalidate everything.
1099 void invalidate_all_pages()
1100 {
1101   u_int page;
1102   for(page=0;page<4096;page++)
1103     invalidate_page(page);
1104   for(page=0;page<1048576;page++)
1105     if(!invalid_code[page]) {
1106       restore_candidate[(page&2047)>>3]|=1<<(page&7);
1107       restore_candidate[((page&2047)>>3)+256]|=1<<(page&7);
1108     }
1109   #ifdef USE_MINI_HT
1110   memset(mini_ht,-1,sizeof(mini_ht));
1111   #endif
1112 }
1113
1114 // Add an entry to jump_out after making a link
1115 void add_link(u_int vaddr,void *src)
1116 {
1117   u_int page=get_page(vaddr);
1118   inv_debug("add_link: %p -> %x (%d)\n",src,vaddr,page);
1119   int *ptr=(int *)(src+4);
1120   assert((*ptr&0x0fff0000)==0x059f0000);
1121   (void)ptr;
1122   ll_add(jump_out+page,vaddr,src);
1123   //void *ptr=get_pointer(src);
1124   //inv_debug("add_link: Pointer is to %p\n",ptr);
1125 }
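/* The assert above checks (on ARM) that the word at src+4 is an
 * 'ldr rX, [pc, #imm]' literal-pool load -- the expected shape of an
 * external-jump stub that ll_kill_pointers()/invalidate_page() can later
 * re-point with set_jump_target(). */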
1126
1127 // If a code block was found to be unmodified (bit was set in
1128 // restore_candidate) and it remains unmodified (bit is clear
1129 // in invalid_code) then move the entries for that 4K page from
1130 // the dirty list to the clean list.
1131 void clean_blocks(u_int page)
1132 {
1133   struct ll_entry *head;
1134   inv_debug("INV: clean_blocks page=%d\n",page);
1135   head=jump_dirty[page];
1136   while(head!=NULL) {
1137     if(!invalid_code[head->vaddr>>12]) {
1138       // Don't restore blocks which are about to expire from the cache
1139       if (doesnt_expire_soon(head->addr)) {
1140         u_int start,end;
1141         if(verify_dirty(head->addr)) {
1142           //printf("Possibly Restore %x (%p)\n",head->vaddr, head->addr);
1143           u_int i;
1144           u_int inv=0;
1145           get_bounds(head->addr,&start,&end);
1146           if(start-(u_int)rdram<RAM_SIZE) {
1147             for(i=(start-(u_int)rdram+0x80000000)>>12;i<=(end-1-(u_int)rdram+0x80000000)>>12;i++) {
1148               inv|=invalid_code[i];
1149             }
1150           }
1151           else if((signed int)head->vaddr>=(signed int)0x80000000+RAM_SIZE) {
1152             inv=1;
1153           }
1154           if(!inv) {
1155             void *clean_addr = get_clean_addr(head->addr);
1156             if (doesnt_expire_soon(clean_addr)) {
1157               u_int ppage=page;
1158               inv_debug("INV: Restored %x (%p/%p)\n",head->vaddr, head->addr, clean_addr);
1159               //printf("page=%x, addr=%x\n",page,head->vaddr);
1160               //assert(head->vaddr>>12==(page|0x80000));
1161               ll_add_flags(jump_in+ppage,head->vaddr,head->reg_sv_flags,clean_addr);
1162               struct ht_entry *ht_bin = hash_table_get(head->vaddr);
1163               if (ht_bin->vaddr[0] == head->vaddr)
1164                 ht_bin->tcaddr[0] = clean_addr; // Replace existing entry
1165               if (ht_bin->vaddr[1] == head->vaddr)
1166                 ht_bin->tcaddr[1] = clean_addr; // Replace existing entry
1167             }
1168           }
1169         }
1170       }
1171     }
1172     head=head->next;
1173   }
1174 }
1175
1176
1177 void mov_alloc(struct regstat *current,int i)
1178 {
1179   // Note: Don't need to actually alloc the source registers
1180   if((~current->is32>>rs1[i])&1) {
1181     //alloc_reg64(current,i,rs1[i]);
1182     alloc_reg64(current,i,rt1[i]);
1183     current->is32&=~(1LL<<rt1[i]);
1184   } else {
1185     //alloc_reg(current,i,rs1[i]);
1186     alloc_reg(current,i,rt1[i]);
1187     current->is32|=(1LL<<rt1[i]);
1188   }
1189   clear_const(current,rs1[i]);
1190   clear_const(current,rt1[i]);
1191   dirty_reg(current,rt1[i]);
1192 }
1193
1194 void shiftimm_alloc(struct regstat *current,int i)
1195 {
1196   if(opcode2[i]<=0x3) // SLL/SRL/SRA
1197   {
1198     if(rt1[i]) {
1199       if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1200       else lt1[i]=rs1[i];
1201       alloc_reg(current,i,rt1[i]);
1202       current->is32|=1LL<<rt1[i];
1203       dirty_reg(current,rt1[i]);
1204       if(is_const(current,rs1[i])) {
1205         int v=get_const(current,rs1[i]);
1206         if(opcode2[i]==0x00) set_const(current,rt1[i],v<<imm[i]);
1207         if(opcode2[i]==0x02) set_const(current,rt1[i],(u_int)v>>imm[i]);
1208         if(opcode2[i]==0x03) set_const(current,rt1[i],v>>imm[i]);
1209       }
1210       else clear_const(current,rt1[i]);
1211     }
1212   }
1213   else
1214   {
1215     clear_const(current,rs1[i]);
1216     clear_const(current,rt1[i]);
1217   }
1218
1219   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
1220   {
1221     if(rt1[i]) {
1222       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1223       alloc_reg64(current,i,rt1[i]);
1224       current->is32&=~(1LL<<rt1[i]);
1225       dirty_reg(current,rt1[i]);
1226     }
1227   }
1228   if(opcode2[i]==0x3c) // DSLL32
1229   {
1230     if(rt1[i]) {
1231       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1232       alloc_reg64(current,i,rt1[i]);
1233       current->is32&=~(1LL<<rt1[i]);
1234       dirty_reg(current,rt1[i]);
1235     }
1236   }
1237   if(opcode2[i]==0x3e) // DSRL32
1238   {
1239     if(rt1[i]) {
1240       alloc_reg64(current,i,rs1[i]);
1241       if(imm[i]==32) {
1242         alloc_reg64(current,i,rt1[i]);
1243         current->is32&=~(1LL<<rt1[i]);
1244       } else {
1245         alloc_reg(current,i,rt1[i]);
1246         current->is32|=1LL<<rt1[i];
1247       }
1248       dirty_reg(current,rt1[i]);
1249     }
1250   }
1251   if(opcode2[i]==0x3f) // DSRA32
1252   {
1253     if(rt1[i]) {
1254       alloc_reg64(current,i,rs1[i]);
1255       alloc_reg(current,i,rt1[i]);
1256       current->is32|=1LL<<rt1[i];
1257       dirty_reg(current,rt1[i]);
1258     }
1259   }
1260 }
1261
1262 void shift_alloc(struct regstat *current,int i)
1263 {
1264   if(rt1[i]) {
1265     if(opcode2[i]<=0x07) // SLLV/SRLV/SRAV
1266     {
1267       if(rs1[i]) alloc_reg(current,i,rs1[i]);
1268       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1269       alloc_reg(current,i,rt1[i]);
1270       if(rt1[i]==rs2[i]) {
1271         alloc_reg_temp(current,i,-1);
1272         minimum_free_regs[i]=1;
1273       }
1274       current->is32|=1LL<<rt1[i];
1275     } else { // DSLLV/DSRLV/DSRAV
1276       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1277       if(rs2[i]) alloc_reg(current,i,rs2[i]);
1278       alloc_reg64(current,i,rt1[i]);
1279       current->is32&=~(1LL<<rt1[i]);
1280       if(opcode2[i]==0x16||opcode2[i]==0x17) // DSRLV and DSRAV need a temporary register
1281       {
1282         alloc_reg_temp(current,i,-1);
1283         minimum_free_regs[i]=1;
1284       }
1285     }
1286     clear_const(current,rs1[i]);
1287     clear_const(current,rs2[i]);
1288     clear_const(current,rt1[i]);
1289     dirty_reg(current,rt1[i]);
1290   }
1291 }
1292
1293 void alu_alloc(struct regstat *current,int i)
1294 {
1295   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1296     if(rt1[i]) {
1297       if(rs1[i]&&rs2[i]) {
1298         alloc_reg(current,i,rs1[i]);
1299         alloc_reg(current,i,rs2[i]);
1300       }
1301       else {
1302         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1303         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1304       }
1305       alloc_reg(current,i,rt1[i]);
1306     }
1307     current->is32|=1LL<<rt1[i];
1308   }
1309   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
1310     if(rt1[i]) {
1311       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1312       {
1313         alloc_reg64(current,i,rs1[i]);
1314         alloc_reg64(current,i,rs2[i]);
1315         alloc_reg(current,i,rt1[i]);
1316       } else {
1317         alloc_reg(current,i,rs1[i]);
1318         alloc_reg(current,i,rs2[i]);
1319         alloc_reg(current,i,rt1[i]);
1320       }
1321     }
1322     current->is32|=1LL<<rt1[i];
1323   }
1324   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
1325     if(rt1[i]) {
1326       if(rs1[i]&&rs2[i]) {
1327         alloc_reg(current,i,rs1[i]);
1328         alloc_reg(current,i,rs2[i]);
1329       }
1330       else
1331       {
1332         if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1333         if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1334       }
1335       alloc_reg(current,i,rt1[i]);
1336       if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1337       {
1338         if(!((current->uu>>rt1[i])&1)) {
1339           alloc_reg64(current,i,rt1[i]);
1340         }
1341         if(get_reg(current->regmap,rt1[i]|64)>=0) {
1342           if(rs1[i]&&rs2[i]) {
1343             alloc_reg64(current,i,rs1[i]);
1344             alloc_reg64(current,i,rs2[i]);
1345           }
1346           else
1347           {
1348             // Is it really worth it to keep 64-bit values in registers?
1349             #ifdef NATIVE_64BIT
1350             if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1351             if(rs2[i]&&needed_again(rs2[i],i)) alloc_reg64(current,i,rs2[i]);
1352             #endif
1353           }
1354         }
1355         current->is32&=~(1LL<<rt1[i]);
1356       } else {
1357         current->is32|=1LL<<rt1[i];
1358       }
1359     }
1360   }
1361   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1362     if(rt1[i]) {
1363       if(rs1[i]&&rs2[i]) {
1364         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1365           alloc_reg64(current,i,rs1[i]);
1366           alloc_reg64(current,i,rs2[i]);
1367           alloc_reg64(current,i,rt1[i]);
1368         } else {
1369           alloc_reg(current,i,rs1[i]);
1370           alloc_reg(current,i,rs2[i]);
1371           alloc_reg(current,i,rt1[i]);
1372         }
1373       }
1374       else {
1375         alloc_reg(current,i,rt1[i]);
1376         if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1377           // DADD used as move, or zeroing
1378           // If we have a 64-bit source, then make the target 64 bits too
1379           if(rs1[i]&&!((current->is32>>rs1[i])&1)) {
1380             if(get_reg(current->regmap,rs1[i])>=0) alloc_reg64(current,i,rs1[i]);
1381             alloc_reg64(current,i,rt1[i]);
1382           } else if(rs2[i]&&!((current->is32>>rs2[i])&1)) {
1383             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1384             alloc_reg64(current,i,rt1[i]);
1385           }
1386           if(opcode2[i]>=0x2e&&rs2[i]) {
1387             // DSUB used as negation - 64-bit result
1388             // If we have a 32-bit register, extend it to 64 bits
1389             if(get_reg(current->regmap,rs2[i])>=0) alloc_reg64(current,i,rs2[i]);
1390             alloc_reg64(current,i,rt1[i]);
1391           }
1392         }
1393       }
1394       if(rs1[i]&&rs2[i]) {
1395         current->is32&=~(1LL<<rt1[i]);
1396       } else if(rs1[i]) {
1397         current->is32&=~(1LL<<rt1[i]);
1398         if((current->is32>>rs1[i])&1)
1399           current->is32|=1LL<<rt1[i];
1400       } else if(rs2[i]) {
1401         current->is32&=~(1LL<<rt1[i]);
1402         if((current->is32>>rs2[i])&1)
1403           current->is32|=1LL<<rt1[i];
1404       } else {
1405         current->is32|=1LL<<rt1[i];
1406       }
1407     }
1408   }
1409   clear_const(current,rs1[i]);
1410   clear_const(current,rs2[i]);
1411   clear_const(current,rt1[i]);
1412   dirty_reg(current,rt1[i]);
1413 }
1414
1415 void imm16_alloc(struct regstat *current,int i)
1416 {
1417   if(rs1[i]&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1418   else lt1[i]=rs1[i];
1419   if(rt1[i]) alloc_reg(current,i,rt1[i]);
1420   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
1421     current->is32&=~(1LL<<rt1[i]);
1422     if(!((current->uu>>rt1[i])&1)||get_reg(current->regmap,rt1[i]|64)>=0) {
1423       // TODO: Could preserve the 32-bit flag if the immediate is zero
1424       alloc_reg64(current,i,rt1[i]);
1425       alloc_reg64(current,i,rs1[i]);
1426     }
1427     clear_const(current,rs1[i]);
1428     clear_const(current,rt1[i]);
1429   }
1430   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
1431     if((~current->is32>>rs1[i])&1) alloc_reg64(current,i,rs1[i]);
1432     current->is32|=1LL<<rt1[i];
1433     clear_const(current,rs1[i]);
1434     clear_const(current,rt1[i]);
1435   }
1436   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
1437     if(((~current->is32>>rs1[i])&1)&&opcode[i]>0x0c) {
1438       if(rs1[i]!=rt1[i]) {
1439         if(needed_again(rs1[i],i)) alloc_reg64(current,i,rs1[i]);
1440         alloc_reg64(current,i,rt1[i]);
1441         current->is32&=~(1LL<<rt1[i]);
1442       }
1443     }
1444     else current->is32|=1LL<<rt1[i]; // ANDI clears upper bits
1445     if(is_const(current,rs1[i])) {
1446       int v=get_const(current,rs1[i]);
1447       if(opcode[i]==0x0c) set_const(current,rt1[i],v&imm[i]);
1448       if(opcode[i]==0x0d) set_const(current,rt1[i],v|imm[i]);
1449       if(opcode[i]==0x0e) set_const(current,rt1[i],v^imm[i]);
1450     }
1451     else clear_const(current,rt1[i]);
1452   }
1453   else if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
1454     if(is_const(current,rs1[i])) {
1455       int v=get_const(current,rs1[i]);
1456       set_const(current,rt1[i],v+imm[i]);
1457     }
1458     else clear_const(current,rt1[i]);
1459     current->is32|=1LL<<rt1[i];
1460   }
1461   else {
1462     set_const(current,rt1[i],((long long)((short)imm[i]))<<16); // LUI
1463     current->is32|=1LL<<rt1[i];
1464   }
1465   dirty_reg(current,rt1[i]);
1466 }
1467
1468 void load_alloc(struct regstat *current,int i)
1469 {
1470   clear_const(current,rt1[i]);
1471   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1472   if(!rs1[i]) current->u&=~1LL; // Allow allocating r0 if it's the source register
1473   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1474   if(rt1[i]&&!((current->u>>rt1[i])&1)) {
1475     alloc_reg(current,i,rt1[i]);
1476     assert(get_reg(current->regmap,rt1[i])>=0);
1477     if(opcode[i]==0x27||opcode[i]==0x37) // LWU/LD
1478     {
1479       current->is32&=~(1LL<<rt1[i]);
1480       alloc_reg64(current,i,rt1[i]);
1481     }
1482     else if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1483     {
1484       current->is32&=~(1LL<<rt1[i]);
1485       alloc_reg64(current,i,rt1[i]);
1486       alloc_all(current,i);
1487       alloc_reg64(current,i,FTEMP);
1488       minimum_free_regs[i]=HOST_REGS;
1489     }
1490     else current->is32|=1LL<<rt1[i];
1491     dirty_reg(current,rt1[i]);
1492     // LWL/LWR need a temporary register for the old value
1493     if(opcode[i]==0x22||opcode[i]==0x26)
1494     {
1495       alloc_reg(current,i,FTEMP);
1496       alloc_reg_temp(current,i,-1);
1497       minimum_free_regs[i]=1;
1498     }
1499   }
1500   else
1501   {
1502     // Load to r0 or unneeded register (dummy load)
1503     // but we still need a register to calculate the address
1504     if(opcode[i]==0x22||opcode[i]==0x26)
1505     {
1506       alloc_reg(current,i,FTEMP); // LWL/LWR need another temporary
1507     }
1508     alloc_reg_temp(current,i,-1);
1509     minimum_free_regs[i]=1;
1510     if(opcode[i]==0x1A||opcode[i]==0x1B) // LDL/LDR
1511     {
1512       alloc_all(current,i);
1513       alloc_reg64(current,i,FTEMP);
1514       minimum_free_regs[i]=HOST_REGS;
1515     }
1516   }
1517 }
1518
1519 void store_alloc(struct regstat *current,int i)
1520 {
1521   clear_const(current,rs2[i]);
1522   if(!(rs2[i])) current->u&=~1LL; // Allow allocating r0 if necessary
1523   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1524   alloc_reg(current,i,rs2[i]);
1525   if(opcode[i]==0x2c||opcode[i]==0x2d||opcode[i]==0x3f) { // 64-bit SDL/SDR/SD
1526     alloc_reg64(current,i,rs2[i]);
1527     if(rs2[i]) alloc_reg(current,i,FTEMP);
1528   }
1529   #if defined(HOST_IMM8)
1530   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1531   else alloc_reg(current,i,INVCP);
1532   #endif
1533   if(opcode[i]==0x2a||opcode[i]==0x2e||opcode[i]==0x2c||opcode[i]==0x2d) { // SWL/SWR/SDL/SDR
1534     alloc_reg(current,i,FTEMP);
1535   }
1536   // We need a temporary register for address generation
1537   alloc_reg_temp(current,i,-1);
1538   minimum_free_regs[i]=1;
1539 }
1540
1541 void c1ls_alloc(struct regstat *current,int i)
1542 {
1543   //clear_const(current,rs1[i]); // FIXME
1544   clear_const(current,rt1[i]);
1545   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1546   alloc_reg(current,i,CSREG); // Status
1547   alloc_reg(current,i,FTEMP);
1548   if(opcode[i]==0x35||opcode[i]==0x3d) { // 64-bit LDC1/SDC1
1549     alloc_reg64(current,i,FTEMP);
1550   }
1551   #if defined(HOST_IMM8)
1552   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1553   else if((opcode[i]&0x3b)==0x39) // SWC1/SDC1
1554     alloc_reg(current,i,INVCP);
1555   #endif
1556   // We need a temporary register for address generation
1557   alloc_reg_temp(current,i,-1);
1558 }
1559
1560 void c2ls_alloc(struct regstat *current,int i)
1561 {
1562   clear_const(current,rt1[i]);
1563   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1564   alloc_reg(current,i,FTEMP);
1565   #if defined(HOST_IMM8)
1566   // On CPUs without 32-bit immediates we need a pointer to invalid_code
1567   if((opcode[i]&0x3b)==0x3a) // SWC2/SDC2
1568     alloc_reg(current,i,INVCP);
1569   #endif
1570   // We need a temporary register for address generation
1571   alloc_reg_temp(current,i,-1);
1572   minimum_free_regs[i]=1;
1573 }
1574
1575 #ifndef multdiv_alloc
1576 void multdiv_alloc(struct regstat *current,int i)
1577 {
1578   //  case 0x18: MULT
1579   //  case 0x19: MULTU
1580   //  case 0x1A: DIV
1581   //  case 0x1B: DIVU
1582   //  case 0x1C: DMULT
1583   //  case 0x1D: DMULTU
1584   //  case 0x1E: DDIV
1585   //  case 0x1F: DDIVU
1586   clear_const(current,rs1[i]);
1587   clear_const(current,rs2[i]);
1588   if(rs1[i]&&rs2[i])
1589   {
1590     if((opcode2[i]&4)==0) // 32-bit
1591     {
1592       current->u&=~(1LL<<HIREG);
1593       current->u&=~(1LL<<LOREG);
1594       alloc_reg(current,i,HIREG);
1595       alloc_reg(current,i,LOREG);
1596       alloc_reg(current,i,rs1[i]);
1597       alloc_reg(current,i,rs2[i]);
1598       current->is32|=1LL<<HIREG;
1599       current->is32|=1LL<<LOREG;
1600       dirty_reg(current,HIREG);
1601       dirty_reg(current,LOREG);
1602     }
1603     else // 64-bit
1604     {
1605       current->u&=~(1LL<<HIREG);
1606       current->u&=~(1LL<<LOREG);
1607       current->uu&=~(1LL<<HIREG);
1608       current->uu&=~(1LL<<LOREG);
1609       alloc_reg64(current,i,HIREG);
1610       //if(HOST_REGS>10) alloc_reg64(current,i,LOREG);
1611       alloc_reg64(current,i,rs1[i]);
1612       alloc_reg64(current,i,rs2[i]);
1613       alloc_all(current,i);
1614       current->is32&=~(1LL<<HIREG);
1615       current->is32&=~(1LL<<LOREG);
1616       dirty_reg(current,HIREG);
1617       dirty_reg(current,LOREG);
1618       minimum_free_regs[i]=HOST_REGS;
1619     }
1620   }
1621   else
1622   {
1623     // Multiply by zero is zero.
1624     // MIPS does not have a divide by zero exception.
1625     // The result is undefined, so we return zero.
1626     alloc_reg(current,i,HIREG);
1627     alloc_reg(current,i,LOREG);
1628     current->is32|=1LL<<HIREG;
1629     current->is32|=1LL<<LOREG;
1630     dirty_reg(current,HIREG);
1631     dirty_reg(current,LOREG);
1632   }
1633 }
1634 #endif
1635
1636 void cop0_alloc(struct regstat *current,int i)
1637 {
1638   if(opcode2[i]==0) // MFC0
1639   {
1640     if(rt1[i]) {
1641       clear_const(current,rt1[i]);
1642       alloc_all(current,i);
1643       alloc_reg(current,i,rt1[i]);
1644       current->is32|=1LL<<rt1[i];
1645       dirty_reg(current,rt1[i]);
1646     }
1647   }
1648   else if(opcode2[i]==4) // MTC0
1649   {
1650     if(rs1[i]){
1651       clear_const(current,rs1[i]);
1652       alloc_reg(current,i,rs1[i]);
1653       alloc_all(current,i);
1654     }
1655     else {
1656       alloc_all(current,i); // FIXME: Keep r0
1657       current->u&=~1LL;
1658       alloc_reg(current,i,0);
1659     }
1660   }
1661   else
1662   {
1663     // TLBR/TLBWI/TLBWR/TLBP/ERET
1664     assert(opcode2[i]==0x10);
1665     alloc_all(current,i);
1666   }
1667   minimum_free_regs[i]=HOST_REGS;
1668 }
1669
1670 void cop1_alloc(struct regstat *current,int i)
1671 {
1672   alloc_reg(current,i,CSREG); // Load status
1673   if(opcode2[i]<3) // MFC1/DMFC1/CFC1
1674   {
1675     if(rt1[i]){
1676       clear_const(current,rt1[i]);
1677       if(opcode2[i]==1) {
1678         alloc_reg64(current,i,rt1[i]); // DMFC1
1679         current->is32&=~(1LL<<rt1[i]);
1680       }else{
1681         alloc_reg(current,i,rt1[i]); // MFC1/CFC1
1682         current->is32|=1LL<<rt1[i];
1683       }
1684       dirty_reg(current,rt1[i]);
1685     }
1686     alloc_reg_temp(current,i,-1);
1687   }
1688   else if(opcode2[i]>3) // MTC1/DMTC1/CTC1
1689   {
1690     if(rs1[i]){
1691       clear_const(current,rs1[i]);
1692       if(opcode2[i]==5)
1693         alloc_reg64(current,i,rs1[i]); // DMTC1
1694       else
1695         alloc_reg(current,i,rs1[i]); // MTC1/CTC1
1696       alloc_reg_temp(current,i,-1);
1697     }
1698     else {
1699       current->u&=~1LL;
1700       alloc_reg(current,i,0);
1701       alloc_reg_temp(current,i,-1);
1702     }
1703   }
1704   minimum_free_regs[i]=1;
1705 }
1706 void fconv_alloc(struct regstat *current,int i)
1707 {
1708   alloc_reg(current,i,CSREG); // Load status
1709   alloc_reg_temp(current,i,-1);
1710   minimum_free_regs[i]=1;
1711 }
1712 void float_alloc(struct regstat *current,int i)
1713 {
1714   alloc_reg(current,i,CSREG); // Load status
1715   alloc_reg_temp(current,i,-1);
1716   minimum_free_regs[i]=1;
1717 }
1718 void c2op_alloc(struct regstat *current,int i)
1719 {
1720   alloc_reg_temp(current,i,-1);
1721 }
1722 void fcomp_alloc(struct regstat *current,int i)
1723 {
1724   alloc_reg(current,i,CSREG); // Load status
1725   alloc_reg(current,i,FSREG); // Load flags
1726   dirty_reg(current,FSREG); // Flag will be modified
1727   alloc_reg_temp(current,i,-1);
1728   minimum_free_regs[i]=1;
1729 }
1730
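// SYSCALL exits the block, so the cycle count must be live and every
// dirty register written back: alloc_cc() plus alloc_all().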
1731 void syscall_alloc(struct regstat *current,int i)
1732 {
1733   alloc_cc(current,i);
1734   dirty_reg(current,CCREG);
1735   alloc_all(current,i);
1736   minimum_free_regs[i]=HOST_REGS;
1737   current->isconst=0;
1738 }
1739
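// Allocate registers for the instruction in a branch delay slot: a plain
// dispatch on itype.  A branch inside the delay slot is unsupported and
// disables speculative precompilation (stop_after_jal) instead.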
1740 void delayslot_alloc(struct regstat *current,int i)
1741 {
1742   switch(itype[i]) {
1743     case UJUMP:
1744     case CJUMP:
1745     case SJUMP:
1746     case RJUMP:
1747     case FJUMP:
1748     case SYSCALL:
1749     case HLECALL:
1750     case SPAN:
1751       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1752       SysPrintf("Disabled speculative precompilation\n");
1753       stop_after_jal=1;
1754       break;
1755     case IMM16:
1756       imm16_alloc(current,i);
1757       break;
1758     case LOAD:
1759     case LOADLR:
1760       load_alloc(current,i);
1761       break;
1762     case STORE:
1763     case STORELR:
1764       store_alloc(current,i);
1765       break;
1766     case ALU:
1767       alu_alloc(current,i);
1768       break;
1769     case SHIFT:
1770       shift_alloc(current,i);
1771       break;
1772     case MULTDIV:
1773       multdiv_alloc(current,i);
1774       break;
1775     case SHIFTIMM:
1776       shiftimm_alloc(current,i);
1777       break;
1778     case MOV:
1779       mov_alloc(current,i);
1780       break;
1781     case COP0:
1782       cop0_alloc(current,i);
1783       break;
1784     case COP1:
1785     case COP2:
1786       cop1_alloc(current,i);
1787       break;
1788     case C1LS:
1789       c1ls_alloc(current,i);
1790       break;
1791     case C2LS:
1792       c2ls_alloc(current,i);
1793       break;
1794     case FCONV:
1795       fconv_alloc(current,i);
1796       break;
1797     case FLOAT:
1798       float_alloc(current,i);
1799       break;
1800     case FCOMP:
1801       fcomp_alloc(current,i);
1802       break;
1803     case C2OP:
1804       c2op_alloc(current,i);
1805       break;
1806   }
1807 }
1808
1809 // Special case where a branch and delay slot span two pages in virtual memory
1810 static void pagespan_alloc(struct regstat *current,int i)
1811 {
1812   current->isconst=0;
1813   current->wasconst=0;
1814   regs[i].wasconst=0;
1815   minimum_free_regs[i]=HOST_REGS;
1816   alloc_all(current,i);
1817   alloc_cc(current,i);
1818   dirty_reg(current,CCREG);
1819   if(opcode[i]==3) // JAL
1820   {
1821     alloc_reg(current,i,31);
1822     dirty_reg(current,31);
1823   }
1824   if(opcode[i]==0&&(opcode2[i]&0x3E)==8) // JR/JALR
1825   {
1826     alloc_reg(current,i,rs1[i]);
1827     if (rt1[i]!=0) {
1828       alloc_reg(current,i,rt1[i]);
1829       dirty_reg(current,rt1[i]);
1830     }
1831   }
1832   if((opcode[i]&0x2E)==4) // BEQ/BNE/BEQL/BNEL
1833   {
1834     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1835     if(rs2[i]) alloc_reg(current,i,rs2[i]);
1836     if(!((current->is32>>rs1[i])&(current->is32>>rs2[i])&1))
1837     {
1838       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1839       if(rs2[i]) alloc_reg64(current,i,rs2[i]);
1840     }
1841   }
1842   else
1843   if((opcode[i]&0x2E)==6) // BLEZ/BGTZ/BLEZL/BGTZL
1844   {
1845     if(rs1[i]) alloc_reg(current,i,rs1[i]);
1846     if(!((current->is32>>rs1[i])&1))
1847     {
1848       if(rs1[i]) alloc_reg64(current,i,rs1[i]);
1849     }
1850   }
1851   else
1852   if(opcode[i]==0x11) // BC1
1853   {
1854     alloc_reg(current,i,FSREG);
1855     alloc_reg(current,i,CSREG);
1856   }
1857   //else ...
1858 }
1859
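// Slow paths are emitted out of line as "stubs" after the main block;
// add_stub() just records the parameters needed to generate one later,
// and add_stub_r() is the wrapper the memory access assemblers below use
// (address reg + regstat + cycle adjustment + live register list).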
1860 static void add_stub(enum stub_type type, void *addr, void *retaddr,
1861   u_int a, uintptr_t b, uintptr_t c, u_int d, u_int e)
1862 {
1863   assert(a < ARRAY_SIZE(stubs));
1864   stubs[stubcount].type = type;
1865   stubs[stubcount].addr = addr;
1866   stubs[stubcount].retaddr = retaddr;
1867   stubs[stubcount].a = a;
1868   stubs[stubcount].b = b;
1869   stubs[stubcount].c = c;
1870   stubs[stubcount].d = d;
1871   stubs[stubcount].e = e;
1872   stubcount++;
1873 }
1874
1875 static void add_stub_r(enum stub_type type, void *addr, void *retaddr,
1876   int i, int addr_reg, struct regstat *i_regs, int ccadj, u_int reglist)
1877 {
1878   add_stub(type, addr, retaddr, i, addr_reg, (uintptr_t)i_regs, ccadj, reglist);
1879 }
1880
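// Regmap convention used here and below: entry r maps the low 32 bits of
// MIPS register r, entry r|64 its upper half.  (regmap[hr]&63)==r thus
// matches either half, and regmap[hr]<64 picks which word to store.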
1881 // Write out a single register
1882 void wb_register(signed char r,signed char regmap[],uint64_t dirty,uint64_t is32)
1883 {
1884   int hr;
1885   for(hr=0;hr<HOST_REGS;hr++) {
1886     if(hr!=EXCLUDE_REG) {
1887       if((regmap[hr]&63)==r) {
1888         if((dirty>>hr)&1) {
1889           if(regmap[hr]<64) {
1890             emit_storereg(r,hr);
1891           }else{
1892             emit_storereg(r|64,hr);
1893           }
1894         }
1895       }
1896     }
1897   }
1898 }
1899
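// Debug/trace helpers: mchecksum() hashes all of RDRAM with a
// rotate-and-xor, rchecksum() xors the register file, and rlist() dumps
// the 64-bit GPRs.  Intended for trace output only.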
1900 int mchecksum()
1901 {
1902   int i;
1903   int sum=0;
1904   for(i=0;i<2097152;i++) {
1905     unsigned int temp=sum;
1906     sum<<=1;
1907     sum|=(~temp)>>31;
1908     sum^=((u_int *)rdram)[i];
1909   }
1910   return sum;
1911 }
1912 int rchecksum()
1913 {
1914   int i;
1915   int sum=0;
1916   for(i=0;i<64;i++)
1917     sum^=((u_int *)reg)[i];
1918   return sum;
1919 }
1920 void rlist()
1921 {
1922   int i;
1923   printf("TRACE: ");
1924   for(i=0;i<32;i++)
1925     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1926   printf("\n");
1927 }
1928
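// alu_assemble(): R-type ALU ops.  The 32-bit forms are direct; the
// 64-bit DADD/DSUB forms keep each MIPS register split across two host
// registers and chain the carry/borrow from the low word into the high
// word.  A plain-C sketch of what the emit_adds+emit_adc pair implements
// (reference only, never compiled, not part of the emitter):
#if 0
static uint64_t dadd_ref(uint64_t a, uint64_t b)
{
  uint32_t lo = (uint32_t)a + (uint32_t)b;          // emit_adds: sets carry
  uint32_t hi = (uint32_t)(a>>32) + (uint32_t)(b>>32)
              + (lo < (uint32_t)a);                 // emit_adc: adds it back
  return (uint64_t)hi<<32 | lo;
}
#endif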
1929 void alu_assemble(int i,struct regstat *i_regs)
1930 {
1931   if(opcode2[i]>=0x20&&opcode2[i]<=0x23) { // ADD/ADDU/SUB/SUBU
1932     if(rt1[i]) {
1933       signed char s1,s2,t;
1934       t=get_reg(i_regs->regmap,rt1[i]);
1935       if(t>=0) {
1936         s1=get_reg(i_regs->regmap,rs1[i]);
1937         s2=get_reg(i_regs->regmap,rs2[i]);
1938         if(rs1[i]&&rs2[i]) {
1939           assert(s1>=0);
1940           assert(s2>=0);
1941           if(opcode2[i]&2) emit_sub(s1,s2,t);
1942           else emit_add(s1,s2,t);
1943         }
1944         else if(rs1[i]) {
1945           if(s1>=0) emit_mov(s1,t);
1946           else emit_loadreg(rs1[i],t);
1947         }
1948         else if(rs2[i]) {
1949           if(s2>=0) {
1950             if(opcode2[i]&2) emit_neg(s2,t);
1951             else emit_mov(s2,t);
1952           }
1953           else {
1954             emit_loadreg(rs2[i],t);
1955             if(opcode2[i]&2) emit_neg(t,t);
1956           }
1957         }
1958         else emit_zeroreg(t);
1959       }
1960     }
1961   }
1962   if(opcode2[i]>=0x2c&&opcode2[i]<=0x2f) { // DADD/DADDU/DSUB/DSUBU
1963     if(rt1[i]) {
1964       signed char s1l,s2l,s1h,s2h,tl,th;
1965       tl=get_reg(i_regs->regmap,rt1[i]);
1966       th=get_reg(i_regs->regmap,rt1[i]|64);
1967       if(tl>=0) {
1968         s1l=get_reg(i_regs->regmap,rs1[i]);
1969         s2l=get_reg(i_regs->regmap,rs2[i]);
1970         s1h=get_reg(i_regs->regmap,rs1[i]|64);
1971         s2h=get_reg(i_regs->regmap,rs2[i]|64);
1972         if(rs1[i]&&rs2[i]) {
1973           assert(s1l>=0);
1974           assert(s2l>=0);
1975           if(opcode2[i]&2) emit_subs(s1l,s2l,tl);
1976           else emit_adds(s1l,s2l,tl);
1977           if(th>=0) {
1978             #ifdef INVERTED_CARRY
1979             if(opcode2[i]&2) {if(s1h!=th) emit_mov(s1h,th);emit_sbb(th,s2h);}
1980             #else
1981             if(opcode2[i]&2) emit_sbc(s1h,s2h,th);
1982             #endif
1983             else emit_adc(s1h,s2h,th); // add-with-carry: high word must consume the carry from emit_adds
1984           }
1985         }
1986         else if(rs1[i]) {
1987           if(s1l>=0) emit_mov(s1l,tl);
1988           else emit_loadreg(rs1[i],tl);
1989           if(th>=0) {
1990             if(s1h>=0) emit_mov(s1h,th);
1991             else emit_loadreg(rs1[i]|64,th);
1992           }
1993         }
1994         else if(rs2[i]) {
1995           if(s2l>=0) {
1996             if(opcode2[i]&2) emit_negs(s2l,tl);
1997             else emit_mov(s2l,tl);
1998           }
1999           else {
2000             emit_loadreg(rs2[i],tl);
2001             if(opcode2[i]&2) emit_negs(tl,tl);
2002           }
2003           if(th>=0) {
2004             #ifdef INVERTED_CARRY
2005             if(s2h>=0) emit_mov(s2h,th);
2006             else emit_loadreg(rs2[i]|64,th);
2007             if(opcode2[i]&2) {
2008               emit_adcimm(-1,th); // x86 has inverted carry flag
2009               emit_not(th,th);
2010             }
2011             #else
2012             if(opcode2[i]&2) {
2013               if(s2h>=0) emit_rscimm(s2h,0,th);
2014               else {
2015                 emit_loadreg(rs2[i]|64,th);
2016                 emit_rscimm(th,0,th);
2017               }
2018             }else{
2019               if(s2h>=0) emit_mov(s2h,th);
2020               else emit_loadreg(rs2[i]|64,th);
2021             }
2022             #endif
2023           }
2024         }
2025         else {
2026           emit_zeroreg(tl);
2027           if(th>=0) emit_zeroreg(th);
2028         }
2029       }
2030     }
2031   }
2032   if(opcode2[i]==0x2a||opcode2[i]==0x2b) { // SLT/SLTU
2033     if(rt1[i]) {
2034       signed char s1l,s1h,s2l,s2h,t;
2035       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1))
2036       {
2037         t=get_reg(i_regs->regmap,rt1[i]);
2038         //assert(t>=0);
2039         if(t>=0) {
2040           s1l=get_reg(i_regs->regmap,rs1[i]);
2041           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2042           s2l=get_reg(i_regs->regmap,rs2[i]);
2043           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2044           if(rs2[i]==0) // rx<r0
2045           {
2046             assert(s1h>=0);
2047             if(opcode2[i]==0x2a) // SLT
2048               emit_shrimm(s1h,31,t);
2049             else // SLTU (unsigned can not be less than zero)
2050               emit_zeroreg(t);
2051           }
2052           else if(rs1[i]==0) // r0<rx
2053           {
2054             assert(s2h>=0);
2055             if(opcode2[i]==0x2a) // SLT
2056               emit_set_gz64_32(s2h,s2l,t);
2057             else // SLTU (set if not zero)
2058               emit_set_nz64_32(s2h,s2l,t);
2059           }
2060           else {
2061             assert(s1l>=0);assert(s1h>=0);
2062             assert(s2l>=0);assert(s2h>=0);
2063             if(opcode2[i]==0x2a) // SLT
2064               emit_set_if_less64_32(s1h,s1l,s2h,s2l,t);
2065             else // SLTU
2066               emit_set_if_carry64_32(s1h,s1l,s2h,s2l,t);
2067           }
2068         }
2069       } else {
2070         t=get_reg(i_regs->regmap,rt1[i]);
2071         //assert(t>=0);
2072         if(t>=0) {
2073           s1l=get_reg(i_regs->regmap,rs1[i]);
2074           s2l=get_reg(i_regs->regmap,rs2[i]);
2075           if(rs2[i]==0) // rx<r0
2076           {
2077             assert(s1l>=0);
2078             if(opcode2[i]==0x2a) // SLT
2079               emit_shrimm(s1l,31,t);
2080             else // SLTU (unsigned can not be less than zero)
2081               emit_zeroreg(t);
2082           }
2083           else if(rs1[i]==0) // r0<rx
2084           {
2085             assert(s2l>=0);
2086             if(opcode2[i]==0x2a) // SLT
2087               emit_set_gz32(s2l,t);
2088             else // SLTU (set if not zero)
2089               emit_set_nz32(s2l,t);
2090           }
2091           else{
2092             assert(s1l>=0);assert(s2l>=0);
2093             if(opcode2[i]==0x2a) // SLT
2094               emit_set_if_less32(s1l,s2l,t);
2095             else // SLTU
2096               emit_set_if_carry32(s1l,s2l,t);
2097           }
2098         }
2099       }
2100     }
2101   }
2102   if(opcode2[i]>=0x24&&opcode2[i]<=0x27) { // AND/OR/XOR/NOR
2103     if(rt1[i]) {
2104       signed char s1l,s1h,s2l,s2h,th,tl;
2105       tl=get_reg(i_regs->regmap,rt1[i]);
2106       th=get_reg(i_regs->regmap,rt1[i]|64);
2107       if(!((i_regs->was32>>rs1[i])&(i_regs->was32>>rs2[i])&1)&&th>=0)
2108       {
2109         assert(tl>=0);
2110         if(tl>=0) {
2111           s1l=get_reg(i_regs->regmap,rs1[i]);
2112           s1h=get_reg(i_regs->regmap,rs1[i]|64);
2113           s2l=get_reg(i_regs->regmap,rs2[i]);
2114           s2h=get_reg(i_regs->regmap,rs2[i]|64);
2115           if(rs1[i]&&rs2[i]) {
2116             assert(s1l>=0);assert(s1h>=0);
2117             assert(s2l>=0);assert(s2h>=0);
2118             if(opcode2[i]==0x24) { // AND
2119               emit_and(s1l,s2l,tl);
2120               emit_and(s1h,s2h,th);
2121             } else
2122             if(opcode2[i]==0x25) { // OR
2123               emit_or(s1l,s2l,tl);
2124               emit_or(s1h,s2h,th);
2125             } else
2126             if(opcode2[i]==0x26) { // XOR
2127               emit_xor(s1l,s2l,tl);
2128               emit_xor(s1h,s2h,th);
2129             } else
2130             if(opcode2[i]==0x27) { // NOR
2131               emit_or(s1l,s2l,tl);
2132               emit_or(s1h,s2h,th);
2133               emit_not(tl,tl);
2134               emit_not(th,th);
2135             }
2136           }
2137           else
2138           {
2139             if(opcode2[i]==0x24) { // AND
2140               emit_zeroreg(tl);
2141               emit_zeroreg(th);
2142             } else
2143             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2144               if(rs1[i]){
2145                 if(s1l>=0) emit_mov(s1l,tl);
2146                 else emit_loadreg(rs1[i],tl);
2147                 if(s1h>=0) emit_mov(s1h,th);
2148                 else emit_loadreg(rs1[i]|64,th);
2149               }
2150               else
2151               if(rs2[i]){
2152                 if(s2l>=0) emit_mov(s2l,tl);
2153                 else emit_loadreg(rs2[i],tl);
2154                 if(s2h>=0) emit_mov(s2h,th);
2155                 else emit_loadreg(rs2[i]|64,th);
2156               }
2157               else{
2158                 emit_zeroreg(tl);
2159                 emit_zeroreg(th);
2160               }
2161             } else
2162             if(opcode2[i]==0x27) { // NOR
2163               if(rs1[i]){
2164                 if(s1l>=0) emit_not(s1l,tl);
2165                 else{
2166                   emit_loadreg(rs1[i],tl);
2167                   emit_not(tl,tl);
2168                 }
2169                 if(s1h>=0) emit_not(s1h,th);
2170                 else{
2171                   emit_loadreg(rs1[i]|64,th);
2172                   emit_not(th,th);
2173                 }
2174               }
2175               else
2176               if(rs2[i]){
2177                 if(s2l>=0) emit_not(s2l,tl);
2178                 else{
2179                   emit_loadreg(rs2[i],tl);
2180                   emit_not(tl,tl);
2181                 }
2182                 if(s2h>=0) emit_not(s2h,th);
2183                 else{
2184                   emit_loadreg(rs2[i]|64,th);
2185                   emit_not(th,th);
2186                 }
2187               }
2188               else {
2189                 emit_movimm(-1,tl);
2190                 emit_movimm(-1,th);
2191               }
2192             }
2193           }
2194         }
2195       }
2196       else
2197       {
2198         // 32 bit
2199         if(tl>=0) {
2200           s1l=get_reg(i_regs->regmap,rs1[i]);
2201           s2l=get_reg(i_regs->regmap,rs2[i]);
2202           if(rs1[i]&&rs2[i]) {
2203             assert(s1l>=0);
2204             assert(s2l>=0);
2205             if(opcode2[i]==0x24) { // AND
2206               emit_and(s1l,s2l,tl);
2207             } else
2208             if(opcode2[i]==0x25) { // OR
2209               emit_or(s1l,s2l,tl);
2210             } else
2211             if(opcode2[i]==0x26) { // XOR
2212               emit_xor(s1l,s2l,tl);
2213             } else
2214             if(opcode2[i]==0x27) { // NOR
2215               emit_or(s1l,s2l,tl);
2216               emit_not(tl,tl);
2217             }
2218           }
2219           else
2220           {
2221             if(opcode2[i]==0x24) { // AND
2222               emit_zeroreg(tl);
2223             } else
2224             if(opcode2[i]==0x25||opcode2[i]==0x26) { // OR/XOR
2225               if(rs1[i]){
2226                 if(s1l>=0) emit_mov(s1l,tl);
2227                 else emit_loadreg(rs1[i],tl); // CHECK: regmap_entry?
2228               }
2229               else
2230               if(rs2[i]){
2231                 if(s2l>=0) emit_mov(s2l,tl);
2232                 else emit_loadreg(rs2[i],tl); // CHECK: regmap_entry?
2233               }
2234               else emit_zeroreg(tl);
2235             } else
2236             if(opcode2[i]==0x27) { // NOR
2237               if(rs1[i]){
2238                 if(s1l>=0) emit_not(s1l,tl);
2239                 else {
2240                   emit_loadreg(rs1[i],tl);
2241                   emit_not(tl,tl);
2242                 }
2243               }
2244               else
2245               if(rs2[i]){
2246                 if(s2l>=0) emit_not(s2l,tl);
2247                 else {
2248                   emit_loadreg(rs2[i],tl);
2249                   emit_not(tl,tl);
2250                 }
2251               }
2252               else emit_movimm(-1,tl);
2253             }
2254           }
2255         }
2256       }
2257     }
2258   }
2259 }
2260
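// imm16_assemble(): I-type immediate ops.  When the source register was
// constant at compile time (wasconst), the operation is folded and a
// single emit_movimm() of the result is emitted.  Reference of the folds
// used below (illustrative only, never compiled; ANDI/ORI/XORI immediates
// are zero-extended per the MIPS ISA, assumed already applied to imm[]):
#if 0
static uint32_t fold_andi(uint32_t k, uint32_t zimm) { return k & zimm; }
static uint32_t fold_ori (uint32_t k, uint32_t zimm) { return k | zimm; }
static uint32_t fold_xori(uint32_t k, uint32_t zimm) { return k ^ zimm; }
#endif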
2261 void imm16_assemble(int i,struct regstat *i_regs)
2262 {
2263   if (opcode[i]==0x0f) { // LUI
2264     if(rt1[i]) {
2265       signed char t;
2266       t=get_reg(i_regs->regmap,rt1[i]);
2267       //assert(t>=0);
2268       if(t>=0) {
2269         if(!((i_regs->isconst>>t)&1))
2270           emit_movimm(imm[i]<<16,t);
2271       }
2272     }
2273   }
2274   if(opcode[i]==0x08||opcode[i]==0x09) { // ADDI/ADDIU
2275     if(rt1[i]) {
2276       signed char s,t;
2277       t=get_reg(i_regs->regmap,rt1[i]);
2278       s=get_reg(i_regs->regmap,rs1[i]);
2279       if(rs1[i]) {
2280         //assert(t>=0);
2281         //assert(s>=0);
2282         if(t>=0) {
2283           if(!((i_regs->isconst>>t)&1)) {
2284             if(s<0) {
2285               if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2286               emit_addimm(t,imm[i],t);
2287             }else{
2288               if(!((i_regs->wasconst>>s)&1))
2289                 emit_addimm(s,imm[i],t);
2290               else
2291                 emit_movimm(constmap[i][s]+imm[i],t);
2292             }
2293           }
2294         }
2295       } else {
2296         if(t>=0) {
2297           if(!((i_regs->isconst>>t)&1))
2298             emit_movimm(imm[i],t);
2299         }
2300       }
2301     }
2302   }
2303   if(opcode[i]==0x18||opcode[i]==0x19) { // DADDI/DADDIU
2304     if(rt1[i]) {
2305       signed char sh,sl,th,tl;
2306       th=get_reg(i_regs->regmap,rt1[i]|64);
2307       tl=get_reg(i_regs->regmap,rt1[i]);
2308       sh=get_reg(i_regs->regmap,rs1[i]|64);
2309       sl=get_reg(i_regs->regmap,rs1[i]);
2310       if(tl>=0) {
2311         if(rs1[i]) {
2312           assert(sh>=0);
2313           assert(sl>=0);
2314           if(th>=0) {
2315             emit_addimm64_32(sh,sl,imm[i],th,tl);
2316           }
2317           else {
2318             emit_addimm(sl,imm[i],tl);
2319           }
2320         } else {
2321           emit_movimm(imm[i],tl);
2322           if(th>=0) emit_movimm(((signed int)imm[i])>>31,th);
2323         }
2324       }
2325     }
2326   }
2327   else if(opcode[i]==0x0a||opcode[i]==0x0b) { // SLTI/SLTIU
2328     if(rt1[i]) {
2329       //assert(rs1[i]!=0); // r0 might be valid, but it's probably a bug
2330       signed char sh,sl,t;
2331       t=get_reg(i_regs->regmap,rt1[i]);
2332       sh=get_reg(i_regs->regmap,rs1[i]|64);
2333       sl=get_reg(i_regs->regmap,rs1[i]);
2334       //assert(t>=0);
2335       if(t>=0) {
2336         if(rs1[i]>0) {
2337           if(sh<0) assert((i_regs->was32>>rs1[i])&1);
2338           if(sh<0||((i_regs->was32>>rs1[i])&1)) {
2339             if(opcode[i]==0x0a) { // SLTI
2340               if(sl<0) {
2341                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2342                 emit_slti32(t,imm[i],t);
2343               }else{
2344                 emit_slti32(sl,imm[i],t);
2345               }
2346             }
2347             else { // SLTIU
2348               if(sl<0) {
2349                 if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2350                 emit_sltiu32(t,imm[i],t);
2351               }else{
2352                 emit_sltiu32(sl,imm[i],t);
2353               }
2354             }
2355           }else{ // 64-bit
2356             assert(sl>=0);
2357             if(opcode[i]==0x0a) // SLTI
2358               emit_slti64_32(sh,sl,imm[i],t);
2359             else // SLTIU
2360               emit_sltiu64_32(sh,sl,imm[i],t);
2361           }
2362         }else{
2363           // SLTI(U) with r0 is just stupid,
2364           // nonetheless examples can be found
2365           if(opcode[i]==0x0a) // SLTI
2366             if(0<imm[i]) emit_movimm(1,t);
2367             else emit_zeroreg(t);
2368           else // SLTIU
2369           {
2370             if(imm[i]) emit_movimm(1,t);
2371             else emit_zeroreg(t);
2372           }
2373         }
2374       }
2375     }
2376   }
2377   else if(opcode[i]>=0x0c&&opcode[i]<=0x0e) { // ANDI/ORI/XORI
2378     if(rt1[i]) {
2379       signed char sh,sl,th,tl;
2380       th=get_reg(i_regs->regmap,rt1[i]|64);
2381       tl=get_reg(i_regs->regmap,rt1[i]);
2382       sh=get_reg(i_regs->regmap,rs1[i]|64);
2383       sl=get_reg(i_regs->regmap,rs1[i]);
2384       if(tl>=0 && !((i_regs->isconst>>tl)&1)) {
2385         if(opcode[i]==0x0c) //ANDI
2386         {
2387           if(rs1[i]) {
2388             if(sl<0) {
2389               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2390               emit_andimm(tl,imm[i],tl);
2391             }else{
2392               if(!((i_regs->wasconst>>sl)&1))
2393                 emit_andimm(sl,imm[i],tl);
2394               else
2395                 emit_movimm(constmap[i][sl]&imm[i],tl);
2396             }
2397           }
2398           else
2399             emit_zeroreg(tl);
2400           if(th>=0) emit_zeroreg(th);
2401         }
2402         else
2403         {
2404           if(rs1[i]) {
2405             if(sl<0) {
2406               if(i_regs->regmap_entry[tl]!=rs1[i]) emit_loadreg(rs1[i],tl);
2407             }
2408             if(th>=0) {
2409               if(sh<0) {
2410                 emit_loadreg(rs1[i]|64,th);
2411               }else{
2412                 emit_mov(sh,th);
2413               }
2414             }
2415             if(opcode[i]==0x0d) { // ORI
2416               if(sl<0) {
2417                 emit_orimm(tl,imm[i],tl);
2418               }else{
2419                 if(!((i_regs->wasconst>>sl)&1))
2420                   emit_orimm(sl,imm[i],tl);
2421                 else
2422                   emit_movimm(constmap[i][sl]|imm[i],tl);
2423               }
2424             }
2425             if(opcode[i]==0x0e) { // XORI
2426               if(sl<0) {
2427                 emit_xorimm(tl,imm[i],tl);
2428               }else{
2429                 if(!((i_regs->wasconst>>sl)&1))
2430                   emit_xorimm(sl,imm[i],tl);
2431                 else
2432                   emit_movimm(constmap[i][sl]^imm[i],tl);
2433               }
2434             }
2435           }
2436           else {
2437             emit_movimm(imm[i],tl);
2438             if(th>=0) emit_zeroreg(th);
2439           }
2440         }
2441       }
2442     }
2443   }
2444 }
2445
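// shiftimm_assemble(): constant shifts.  For the DSxx32 doubleword forms
// imm[] evidently already carries the +32 bias (note the imm[i]>32 tests
// below), so DSLL32 moves the low word into the high host register,
// zeroes the low one, and only shifts further when the amount exceeds 32.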
2446 void shiftimm_assemble(int i,struct regstat *i_regs)
2447 {
2448   if(opcode2[i]<=0x3) // SLL/SRL/SRA
2449   {
2450     if(rt1[i]) {
2451       signed char s,t;
2452       t=get_reg(i_regs->regmap,rt1[i]);
2453       s=get_reg(i_regs->regmap,rs1[i]);
2454       //assert(t>=0);
2455       if(t>=0&&!((i_regs->isconst>>t)&1)){
2456         if(rs1[i]==0)
2457         {
2458           emit_zeroreg(t);
2459         }
2460         else
2461         {
2462           if(s<0&&i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2463           if(imm[i]) {
2464             if(opcode2[i]==0) // SLL
2465             {
2466               emit_shlimm(s<0?t:s,imm[i],t);
2467             }
2468             if(opcode2[i]==2) // SRL
2469             {
2470               emit_shrimm(s<0?t:s,imm[i],t);
2471             }
2472             if(opcode2[i]==3) // SRA
2473             {
2474               emit_sarimm(s<0?t:s,imm[i],t);
2475             }
2476           }else{
2477             // Shift by zero
2478             if(s>=0 && s!=t) emit_mov(s,t);
2479           }
2480         }
2481       }
2482       //emit_storereg(rt1[i],t); //DEBUG
2483     }
2484   }
2485   if(opcode2[i]>=0x38&&opcode2[i]<=0x3b) // DSLL/DSRL/DSRA
2486   {
2487     if(rt1[i]) {
2488       signed char sh,sl,th,tl;
2489       th=get_reg(i_regs->regmap,rt1[i]|64);
2490       tl=get_reg(i_regs->regmap,rt1[i]);
2491       sh=get_reg(i_regs->regmap,rs1[i]|64);
2492       sl=get_reg(i_regs->regmap,rs1[i]);
2493       if(tl>=0) {
2494         if(rs1[i]==0)
2495         {
2496           emit_zeroreg(tl);
2497           if(th>=0) emit_zeroreg(th);
2498         }
2499         else
2500         {
2501           assert(sl>=0);
2502           assert(sh>=0);
2503           if(imm[i]) {
2504             if(opcode2[i]==0x38) // DSLL
2505             {
2506               if(th>=0) emit_shldimm(sh,sl,imm[i],th);
2507               emit_shlimm(sl,imm[i],tl);
2508             }
2509             if(opcode2[i]==0x3a) // DSRL
2510             {
2511               emit_shrdimm(sl,sh,imm[i],tl);
2512               if(th>=0) emit_shrimm(sh,imm[i],th);
2513             }
2514             if(opcode2[i]==0x3b) // DSRA
2515             {
2516               emit_shrdimm(sl,sh,imm[i],tl);
2517               if(th>=0) emit_sarimm(sh,imm[i],th);
2518             }
2519           }else{
2520             // Shift by zero
2521             if(sl!=tl) emit_mov(sl,tl);
2522             if(th>=0&&sh!=th) emit_mov(sh,th);
2523           }
2524         }
2525       }
2526     }
2527   }
2528   if(opcode2[i]==0x3c) // DSLL32
2529   {
2530     if(rt1[i]) {
2531       signed char sl,tl,th;
2532       tl=get_reg(i_regs->regmap,rt1[i]);
2533       th=get_reg(i_regs->regmap,rt1[i]|64);
2534       sl=get_reg(i_regs->regmap,rs1[i]);
2535       if(th>=0||tl>=0){
2536         assert(tl>=0);
2537         assert(th>=0);
2538         assert(sl>=0);
2539         emit_mov(sl,th);
2540         emit_zeroreg(tl);
2541         if(imm[i]>32)
2542         {
2543           emit_shlimm(th,imm[i]&31,th);
2544         }
2545       }
2546     }
2547   }
2548   if(opcode2[i]==0x3e) // DSRL32
2549   {
2550     if(rt1[i]) {
2551       signed char sh,tl,th;
2552       tl=get_reg(i_regs->regmap,rt1[i]);
2553       th=get_reg(i_regs->regmap,rt1[i]|64);
2554       sh=get_reg(i_regs->regmap,rs1[i]|64);
2555       if(tl>=0){
2556         assert(sh>=0);
2557         emit_mov(sh,tl);
2558         if(th>=0) emit_zeroreg(th);
2559         if(imm[i]>32)
2560         {
2561           emit_shrimm(tl,imm[i]&31,tl);
2562         }
2563       }
2564     }
2565   }
2566   if(opcode2[i]==0x3f) // DSRA32
2567   {
2568     if(rt1[i]) {
2569       signed char sh,tl;
2570       tl=get_reg(i_regs->regmap,rt1[i]);
2571       sh=get_reg(i_regs->regmap,rs1[i]|64);
2572       if(tl>=0){
2573         assert(sh>=0);
2574         emit_mov(sh,tl);
2575         if(imm[i]>32)
2576         {
2577           emit_sarimm(tl,imm[i]&31,tl);
2578         }
2579       }
2580     }
2581   }
2582 }
2583
2584 #ifndef shift_assemble
2585 void shift_assemble(int i,struct regstat *i_regs)
2586 {
2587   printf("Need shift_assemble for this architecture.\n");
2588   exit(1);
2589 }
2590 #endif
2591
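// load_assemble(): LB/LH/LW/LBU/LHU/LWU/LD.  Non-constant addresses go
// through emit_fastpath_cmp_jump(), which emits the RAM range check and
// returns the patch point (jaddr) for the out-of-line LOADx_STUB slow
// path.  Constant addresses resolve the check at compile time: memtarget
// selects inline RAM access, anything else goes to inline_readstub().
// Under BIG_ENDIAN_MIPS the low address bits are flipped (addr^3 for
// bytes, addr^2 for halfwords) to map guest byte lanes onto host words.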
2592 void load_assemble(int i,struct regstat *i_regs)
2593 {
2594   int s,th,tl,addr,map=-1;
2595   int offset;
2596   void *jaddr=0;
2597   int memtarget=0,c=0;
2598   int fastload_reg_override=0;
2599   u_int hr,reglist=0;
2600   th=get_reg(i_regs->regmap,rt1[i]|64);
2601   tl=get_reg(i_regs->regmap,rt1[i]);
2602   s=get_reg(i_regs->regmap,rs1[i]);
2603   offset=imm[i];
2604   for(hr=0;hr<HOST_REGS;hr++) {
2605     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2606   }
2607   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2608   if(s>=0) {
2609     c=(i_regs->wasconst>>s)&1;
2610     if (c) {
2611       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2612     }
2613   }
2614   //printf("load_assemble: c=%d\n",c);
2615   //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
2616   // FIXME: Even if the load is a NOP, we should check for pagefaults...
2617   if((tl<0&&(!c||(((u_int)constmap[i][s]+offset)>>16)==0x1f80))
2618     ||rt1[i]==0) {
2619       // could be FIFO, must perform the read
2620       // ||dummy read
2621       assem_debug("(forced read)\n");
2622       tl=get_reg(i_regs->regmap,-1);
2623       assert(tl>=0);
2624   }
2625   if(offset||s<0||c) addr=tl;
2626   else addr=s;
2627   //if(tl<0) tl=get_reg(i_regs->regmap,-1);
2628  if(tl>=0) {
2629   //printf("load_assemble: c=%d\n",c);
2630   //if(c) printf("load_assemble: const=%lx\n",(long)constmap[i][s]+offset);
2631   assert(tl>=0); // Even if the load is a NOP, we must check for pagefaults and I/O
2632   reglist&=~(1<<tl);
2633   if(th>=0) reglist&=~(1<<th);
2634   if(!c) {
2635     #ifdef RAM_OFFSET
2636     map=get_reg(i_regs->regmap,ROREG);
2637     if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
2638     #endif
2639     #ifdef R29_HACK
2640     // Strmnnrmn's speed hack
2641     if(rs1[i]!=29||start<0x80001000||start>=0x80000000+RAM_SIZE)
2642     #endif
2643     {
2644       jaddr=emit_fastpath_cmp_jump(i,addr,&fastload_reg_override);
2645     }
2646   }
2647   else if(ram_offset&&memtarget) {
2648     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2649     fastload_reg_override=HOST_TEMPREG;
2650   }
2651   int dummy=(rt1[i]==0)||(tl!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to r0 and unneeded reg
2652   if (opcode[i]==0x20) { // LB
2653     if(!c||memtarget) {
2654       if(!dummy) {
2655         #ifdef HOST_IMM_ADDR32
2656         if(c)
2657           emit_movsbl_tlb((constmap[i][s]+offset)^3,map,tl);
2658         else
2659         #endif
2660         {
2661           //emit_xorimm(addr,3,tl);
2662           //emit_movsbl_indexed(rdram-0x80000000,tl,tl);
2663           int x=0,a=tl;
2664 #ifdef BIG_ENDIAN_MIPS
2665           if(!c) emit_xorimm(addr,3,tl);
2666           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2667 #else
2668           if(!c) a=addr;
2669 #endif
2670           if(fastload_reg_override) a=fastload_reg_override;
2671
2672           emit_movsbl_indexed_tlb(x,a,map,tl);
2673         }
2674       }
2675       if(jaddr)
2676         add_stub_r(LOADB_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2677     }
2678     else
2679       inline_readstub(LOADB_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2680   }
2681   if (opcode[i]==0x21) { // LH
2682     if(!c||memtarget) {
2683       if(!dummy) {
2684         #ifdef HOST_IMM_ADDR32
2685         if(c)
2686           emit_movswl_tlb((constmap[i][s]+offset)^2,map,tl);
2687         else
2688         #endif
2689         {
2690           int x=0,a=tl;
2691 #ifdef BIG_ENDIAN_MIPS
2692           if(!c) emit_xorimm(addr,2,tl);
2693           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2694 #else
2695           if(!c) a=addr;
2696 #endif
2697           if(fastload_reg_override) a=fastload_reg_override;
2698           //#ifdef
2699           //emit_movswl_indexed_tlb(x,tl,map,tl);
2700           //else
2701           if(map>=0) {
2702             emit_movswl_indexed(x,a,tl);
2703           }else{
2704             #if 1 //def RAM_OFFSET
2705             emit_movswl_indexed(x,a,tl);
2706             #else
2707             emit_movswl_indexed(rdram-0x80000000+x,a,tl);
2708             #endif
2709           }
2710         }
2711       }
2712       if(jaddr)
2713         add_stub_r(LOADH_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2714     }
2715     else
2716       inline_readstub(LOADH_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2717   }
2718   if (opcode[i]==0x23) { // LW
2719     if(!c||memtarget) {
2720       if(!dummy) {
2721         int a=addr;
2722         if(fastload_reg_override) a=fastload_reg_override;
2723         //emit_readword_indexed(rdram-0x80000000,addr,tl);
2724         #ifdef HOST_IMM_ADDR32
2725         if(c)
2726           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2727         else
2728         #endif
2729         emit_readword_indexed_tlb(0,a,map,tl);
2730       }
2731       if(jaddr)
2732         add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2733     }
2734     else
2735       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2736   }
2737   if (opcode[i]==0x24) { // LBU
2738     if(!c||memtarget) {
2739       if(!dummy) {
2740         #ifdef HOST_IMM_ADDR32
2741         if(c)
2742           emit_movzbl_tlb((constmap[i][s]+offset)^3,map,tl);
2743         else
2744         #endif
2745         {
2746           //emit_xorimm(addr,3,tl);
2747           //emit_movzbl_indexed(rdram-0x80000000,tl,tl);
2748           int x=0,a=tl;
2749 #ifdef BIG_ENDIAN_MIPS
2750           if(!c) emit_xorimm(addr,3,tl);
2751           else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2752 #else
2753           if(!c) a=addr;
2754 #endif
2755           if(fastload_reg_override) a=fastload_reg_override;
2756
2757           emit_movzbl_indexed_tlb(x,a,map,tl);
2758         }
2759       }
2760       if(jaddr)
2761         add_stub_r(LOADBU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2762     }
2763     else
2764       inline_readstub(LOADBU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2765   }
2766   if (opcode[i]==0x25) { // LHU
2767     if(!c||memtarget) {
2768       if(!dummy) {
2769         #ifdef HOST_IMM_ADDR32
2770         if(c)
2771           emit_movzwl_tlb((constmap[i][s]+offset)^2,map,tl);
2772         else
2773         #endif
2774         {
2775           int x=0,a=tl;
2776 #ifdef BIG_ENDIAN_MIPS
2777           if(!c) emit_xorimm(addr,2,tl);
2778           else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2779 #else
2780           if(!c) a=addr;
2781 #endif
2782           if(fastload_reg_override) a=fastload_reg_override;
2783           //#ifdef
2784           //emit_movzwl_indexed_tlb(x,tl,map,tl);
2785           //#else
2786           if(map>=0) {
2787             emit_movzwl_indexed(x,a,tl);
2788           }else{
2789             #if 1 //def RAM_OFFSET
2790             emit_movzwl_indexed(x,a,tl);
2791             #else
2792             emit_movzwl_indexed(rdram-0x80000000+x,a,tl);
2793             #endif
2794           }
2795         }
2796       }
2797       if(jaddr)
2798         add_stub_r(LOADHU_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2799     }
2800     else
2801       inline_readstub(LOADHU_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2802   }
2803   if (opcode[i]==0x27) { // LWU
2804     assert(th>=0);
2805     if(!c||memtarget) {
2806       if(!dummy) {
2807         int a=addr;
2808         if(fastload_reg_override) a=fastload_reg_override;
2809         //emit_readword_indexed(rdram-0x80000000,addr,tl);
2810         #ifdef HOST_IMM_ADDR32
2811         if(c)
2812           emit_readword_tlb(constmap[i][s]+offset,map,tl);
2813         else
2814         #endif
2815         emit_readword_indexed_tlb(0,a,map,tl);
2816       }
2817       if(jaddr)
2818         add_stub_r(LOADW_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2819     }
2820     else {
2821       inline_readstub(LOADW_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2822     }
2823     emit_zeroreg(th);
2824   }
2825   if (opcode[i]==0x37) { // LD
2826     if(!c||memtarget) {
2827       if(!dummy) {
2828         int a=addr;
2829         if(fastload_reg_override) a=fastload_reg_override;
2830         //if(th>=0) emit_readword_indexed(rdram-0x80000000,addr,th);
2831         //emit_readword_indexed(rdram-0x7FFFFFFC,addr,tl);
2832         #ifdef HOST_IMM_ADDR32
2833         if(c)
2834           emit_readdword_tlb(constmap[i][s]+offset,map,th,tl);
2835         else
2836         #endif
2837         emit_readdword_indexed_tlb(0,a,map,th,tl);
2838       }
2839       if(jaddr)
2840         add_stub_r(LOADD_STUB,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2841     }
2842     else
2843       inline_readstub(LOADD_STUB,i,constmap[i][s]+offset,i_regs->regmap,rt1[i],ccadj[i],reglist);
2844   }
2845  }
2846 }
2847
2848 #ifndef loadlr_assemble
2849 void loadlr_assemble(int i,struct regstat *i_regs)
2850 {
2851   printf("Need loadlr_assemble for this architecture.\n");
2852   exit(1);
2853 }
2854 #endif
2855
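// store_assemble(): SB/SH/SW/SD, with the same fast path/stub split as
// load_assemble().  After the write, unless NDHACK_NO_SMC_CHECK is set,
// invalid_code[] is probed by page (the addr>>12 of the *sr12 compares)
// and an INVCODE_STUB call invalidates any compiled block covering the
// written address (self-modifying code detection).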
2856 void store_assemble(int i,struct regstat *i_regs)
2857 {
2858   int s,th,tl,map=-1;
2859   int addr,temp;
2860   int offset;
2861   void *jaddr=0;
2862   enum stub_type type;
2863   int memtarget=0,c=0;
2864   int agr=AGEN1+(i&1);
2865   int faststore_reg_override=0;
2866   u_int hr,reglist=0;
2867   th=get_reg(i_regs->regmap,rs2[i]|64);
2868   tl=get_reg(i_regs->regmap,rs2[i]);
2869   s=get_reg(i_regs->regmap,rs1[i]);
2870   temp=get_reg(i_regs->regmap,agr);
2871   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2872   offset=imm[i];
2873   if(s>=0) {
2874     c=(i_regs->wasconst>>s)&1;
2875     if(c) {
2876       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
2877     }
2878   }
2879   assert(tl>=0);
2880   assert(temp>=0);
2881   for(hr=0;hr<HOST_REGS;hr++) {
2882     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2883   }
2884   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2885   if(offset||s<0||c) addr=temp;
2886   else addr=s;
2887   if(!c) {
2888     jaddr=emit_fastpath_cmp_jump(i,addr,&faststore_reg_override);
2889   }
2890   else if(ram_offset&&memtarget) {
2891     emit_addimm(addr,ram_offset,HOST_TEMPREG);
2892     faststore_reg_override=HOST_TEMPREG;
2893   }
2894
2895   if (opcode[i]==0x28) { // SB
2896     if(!c||memtarget) {
2897       int x=0,a=temp;
2898 #ifdef BIG_ENDIAN_MIPS
2899       if(!c) emit_xorimm(addr,3,temp);
2900       else x=((constmap[i][s]+offset)^3)-(constmap[i][s]+offset);
2901 #else
2902       if(!c) a=addr;
2903 #endif
2904       if(faststore_reg_override) a=faststore_reg_override;
2905       //emit_writebyte_indexed(tl,rdram-0x80000000,temp);
2906       emit_writebyte_indexed_tlb(tl,x,a,map,a);
2907     }
2908     type=STOREB_STUB;
2909   }
2910   if (opcode[i]==0x29) { // SH
2911     if(!c||memtarget) {
2912       int x=0,a=temp;
2913 #ifdef BIG_ENDIAN_MIPS
2914       if(!c) emit_xorimm(addr,2,temp);
2915       else x=((constmap[i][s]+offset)^2)-(constmap[i][s]+offset);
2916 #else
2917       if(!c) a=addr;
2918 #endif
2919       if(faststore_reg_override) a=faststore_reg_override;
2920       //#ifdef
2921       //emit_writehword_indexed_tlb(tl,x,temp,map,temp);
2922       //#else
2923       if(map>=0) {
2924         emit_writehword_indexed(tl,x,a);
2925       }else
2926         //emit_writehword_indexed(tl,rdram-0x80000000+x,a);
2927         emit_writehword_indexed(tl,x,a);
2928     }
2929     type=STOREH_STUB;
2930   }
2931   if (opcode[i]==0x2B) { // SW
2932     if(!c||memtarget) {
2933       int a=addr;
2934       if(faststore_reg_override) a=faststore_reg_override;
2935       //emit_writeword_indexed(tl,rdram-0x80000000,addr);
2936       emit_writeword_indexed_tlb(tl,0,a,map,temp);
2937     }
2938     type=STOREW_STUB;
2939   }
2940   if (opcode[i]==0x3F) { // SD
2941     if(!c||memtarget) {
2942       int a=addr;
2943       if(faststore_reg_override) a=faststore_reg_override;
2944       if(rs2[i]) {
2945         assert(th>=0);
2946         //emit_writeword_indexed(th,rdram-0x80000000,addr);
2947         //emit_writeword_indexed(tl,rdram-0x7FFFFFFC,addr);
2948         emit_writedword_indexed_tlb(th,tl,0,a,map,temp);
2949       }else{
2950         // Store zero
2951         //emit_writeword_indexed(tl,rdram-0x80000000,temp);
2952         //emit_writeword_indexed(tl,rdram-0x7FFFFFFC,temp);
2953         emit_writedword_indexed_tlb(tl,tl,0,a,map,temp);
2954       }
2955     }
2956     type=STORED_STUB;
2957   }
2958   if(jaddr) {
2959     // PCSX store handlers don't check invcode again
2960     reglist|=1<<addr;
2961     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2962     jaddr=0;
2963   }
2964   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
2965     if(!c||memtarget) {
2966       #ifdef DESTRUCTIVE_SHIFT
2967       // The x86 shift operation is 'destructive'; it overwrites the
2968       // source register, so we need to make a copy first and use that.
2969       addr=temp;
2970       #endif
2971       #if defined(HOST_IMM8)
2972       int ir=get_reg(i_regs->regmap,INVCP);
2973       assert(ir>=0);
2974       emit_cmpmem_indexedsr12_reg(ir,addr,1);
2975       #else
2976       emit_cmpmem_indexedsr12_imm(invalid_code,addr,1);
2977       #endif
2978       #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
2979       emit_callne(invalidate_addr_reg[addr]);
2980       #else
2981       void *jaddr2 = out;
2982       emit_jne(0);
2983       add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),addr,0,0,0);
2984       #endif
2985     }
2986   }
2987   u_int addr_val=constmap[i][s]+offset;
2988   if(jaddr) {
2989     add_stub_r(type,jaddr,out,i,addr,i_regs,ccadj[i],reglist);
2990   } else if(c&&!memtarget) {
2991     inline_writestub(type,i,addr_val,i_regs->regmap,rs2[i],ccadj[i],reglist);
2992   }
2993   // Basic detection of stores that modify the current block;
2994   // not looking back, as that code should be in the MIPS i-cache already.
2995   if(c&&start+i*4<addr_val&&addr_val<start+slen*4) {
2996     SysPrintf("write to %08x hits block %08x, pc=%08x\n",addr_val,start,start+i*4);
2997     assert(i_regs->regmap==regs[i].regmap); // not delay slot
2998     if(i_regs->regmap==regs[i].regmap) {
2999       load_all_consts(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty,i);
3000       wb_dirtys(regs[i].regmap_entry,regs[i].was32,regs[i].wasdirty);
3001       emit_movimm(start+i*4+4,0);
3002       emit_writeword(0,&pcaddr);
3003       emit_jmp(do_interrupt);
3004     }
3005   }
3006 }
3007
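// storelr_assemble(): unaligned SWL/SWR/SDL/SDR.  The emitted code tests
// the two low address bits (testimm 2, then 1) and branches to one of
// four alignment cases; each case rotates the source so the bytes the
// MIPS op would touch land at the right offsets, then stores them with
// byte/halfword/word writes.  SDL/SDR additionally build the second word
// in temp2 and store it after the common join point.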
3008 void storelr_assemble(int i,struct regstat *i_regs)
3009 {
3010   int s,th,tl;
3011   int temp;
3012   int temp2=-1;
3013   int offset;
3014   void *jaddr=0;
3015   void *case1, *case2, *case3;
3016   void *done0, *done1, *done2;
3017   int memtarget=0,c=0;
3018   int agr=AGEN1+(i&1);
3019   u_int hr,reglist=0;
3020   th=get_reg(i_regs->regmap,rs2[i]|64);
3021   tl=get_reg(i_regs->regmap,rs2[i]);
3022   s=get_reg(i_regs->regmap,rs1[i]);
3023   temp=get_reg(i_regs->regmap,agr);
3024   if(temp<0) temp=get_reg(i_regs->regmap,-1);
3025   offset=imm[i];
3026   if(s>=0) {
3027     c=(i_regs->isconst>>s)&1;
3028     if(c) {
3029       memtarget=((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE;
3030     }
3031   }
3032   assert(tl>=0);
3033   for(hr=0;hr<HOST_REGS;hr++) {
3034     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3035   }
3036   assert(temp>=0);
3037   if(!c) {
3038     emit_cmpimm(s<0||offset?temp:s,RAM_SIZE);
3039     if(!offset&&s!=temp) emit_mov(s,temp);
3040     jaddr=out;
3041     emit_jno(0);
3042   }
3043   else
3044   {
3045     if(!memtarget||!rs1[i]) {
3046       jaddr=out;
3047       emit_jmp(0);
3048     }
3049   }
3050   #ifdef RAM_OFFSET
3051   int map=get_reg(i_regs->regmap,ROREG);
3052   if(map<0) emit_loadreg(ROREG,map=HOST_TEMPREG);
3053   #else
3054   if((u_int)rdram!=0x80000000)
3055     emit_addimm_no_flags((u_int)rdram-(u_int)0x80000000,temp);
3056   #endif
3057
3058   if (opcode[i]==0x2C||opcode[i]==0x2D) { // SDL/SDR
3059     temp2=get_reg(i_regs->regmap,FTEMP);
3060     if(!rs2[i]) temp2=th=tl;
3061   }
3062
3063 #ifndef BIG_ENDIAN_MIPS
3064     emit_xorimm(temp,3,temp);
3065 #endif
3066   emit_testimm(temp,2);
3067   case2=out;
3068   emit_jne(0);
3069   emit_testimm(temp,1);
3070   case1=out;
3071   emit_jne(0);
3072   // 0
3073   if (opcode[i]==0x2A) { // SWL
3074     emit_writeword_indexed(tl,0,temp);
3075   }
3076   if (opcode[i]==0x2E) { // SWR
3077     emit_writebyte_indexed(tl,3,temp);
3078   }
3079   if (opcode[i]==0x2C) { // SDL
3080     emit_writeword_indexed(th,0,temp);
3081     if(rs2[i]) emit_mov(tl,temp2);
3082   }
3083   if (opcode[i]==0x2D) { // SDR
3084     emit_writebyte_indexed(tl,3,temp);
3085     if(rs2[i]) emit_shldimm(th,tl,24,temp2);
3086   }
3087   done0=out;
3088   emit_jmp(0);
3089   // 1
3090   set_jump_target(case1, out);
3091   if (opcode[i]==0x2A) { // SWL
3092     // Write 3 msb into three least significant bytes
3093     if(rs2[i]) emit_rorimm(tl,8,tl);
3094     emit_writehword_indexed(tl,-1,temp);
3095     if(rs2[i]) emit_rorimm(tl,16,tl);
3096     emit_writebyte_indexed(tl,1,temp);
3097     if(rs2[i]) emit_rorimm(tl,8,tl);
3098   }
3099   if (opcode[i]==0x2E) { // SWR
3100     // Write two lsb into two most significant bytes
3101     emit_writehword_indexed(tl,1,temp);
3102   }
3103   if (opcode[i]==0x2C) { // SDL
3104     if(rs2[i]) emit_shrdimm(tl,th,8,temp2);
3105     // Write 3 msb into three least significant bytes
3106     if(rs2[i]) emit_rorimm(th,8,th);
3107     emit_writehword_indexed(th,-1,temp);
3108     if(rs2[i]) emit_rorimm(th,16,th);
3109     emit_writebyte_indexed(th,1,temp);
3110     if(rs2[i]) emit_rorimm(th,8,th);
3111   }
3112   if (opcode[i]==0x2D) { // SDR
3113     if(rs2[i]) emit_shldimm(th,tl,16,temp2);
3114     // Write two lsb into two most significant bytes
3115     emit_writehword_indexed(tl,1,temp);
3116   }
3117   done1=out;
3118   emit_jmp(0);
3119   // 2
3120   set_jump_target(case2, out);
3121   emit_testimm(temp,1);
3122   case3=out;
3123   emit_jne(0);
3124   if (opcode[i]==0x2A) { // SWL
3125     // Write two msb into two least significant bytes
3126     if(rs2[i]) emit_rorimm(tl,16,tl);
3127     emit_writehword_indexed(tl,-2,temp);
3128     if(rs2[i]) emit_rorimm(tl,16,tl);
3129   }
3130   if (opcode[i]==0x2E) { // SWR
3131     // Write 3 lsb into three most significant bytes
3132     emit_writebyte_indexed(tl,-1,temp);
3133     if(rs2[i]) emit_rorimm(tl,8,tl);
3134     emit_writehword_indexed(tl,0,temp);
3135     if(rs2[i]) emit_rorimm(tl,24,tl);
3136   }
3137   if (opcode[i]==0x2C) { // SDL
3138     if(rs2[i]) emit_shrdimm(tl,th,16,temp2);
3139     // Write two msb into two least significant bytes
3140     if(rs2[i]) emit_rorimm(th,16,th);
3141     emit_writehword_indexed(th,-2,temp);
3142     if(rs2[i]) emit_rorimm(th,16,th);
3143   }
3144   if (opcode[i]==0x2D) { // SDR
3145     if(rs2[i]) emit_shldimm(th,tl,8,temp2);
3146     // Write 3 lsb into three most significant bytes
3147     emit_writebyte_indexed(tl,-1,temp);
3148     if(rs2[i]) emit_rorimm(tl,8,tl);
3149     emit_writehword_indexed(tl,0,temp);
3150     if(rs2[i]) emit_rorimm(tl,24,tl);
3151   }
3152   done2=out;
3153   emit_jmp(0);
3154   // 3
3155   set_jump_target(case3, out);
3156   if (opcode[i]==0x2A) { // SWL
3157     // Write msb into least significant byte
3158     if(rs2[i]) emit_rorimm(tl,24,tl);
3159     emit_writebyte_indexed(tl,-3,temp);
3160     if(rs2[i]) emit_rorimm(tl,8,tl);
3161   }
3162   if (opcode[i]==0x2E) { // SWR
3163     // Write entire word
3164     emit_writeword_indexed(tl,-3,temp);
3165   }
3166   if (opcode[i]==0x2C) { // SDL
3167     if(rs2[i]) emit_shrdimm(tl,th,24,temp2);
3168     // Write msb into least significant byte
3169     if(rs2[i]) emit_rorimm(th,24,th);
3170     emit_writebyte_indexed(th,-3,temp);
3171     if(rs2[i]) emit_rorimm(th,8,th);
3172   }
3173   if (opcode[i]==0x2D) { // SDR
3174     if(rs2[i]) emit_mov(th,temp2);
3175     // Write entire word
3176     emit_writeword_indexed(tl,-3,temp);
3177   }
3178   set_jump_target(done0, out);
3179   set_jump_target(done1, out);
3180   set_jump_target(done2, out);
3181   if (opcode[i]==0x2C) { // SDL
3182     emit_testimm(temp,4);
3183     done0=out;
3184     emit_jne(0);
3185     emit_andimm(temp,~3,temp);
3186     emit_writeword_indexed(temp2,4,temp);
3187     set_jump_target(done0, out);
3188   }
3189   if (opcode[i]==0x2D) { // SDR
3190     emit_testimm(temp,4);
3191     done0=out;
3192     emit_jeq(0);
3193     emit_andimm(temp,~3,temp);
3194     emit_writeword_indexed(temp2,-4,temp);
3195     set_jump_target(done0, out);
3196   }
3197   if(!c||!memtarget)
3198     add_stub_r(STORELR_STUB,jaddr,out,i,temp,i_regs,ccadj[i],reglist);
3199   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3200     #ifdef RAM_OFFSET
3201     int map=get_reg(i_regs->regmap,ROREG);
3202     if(map<0) map=HOST_TEMPREG;
3203     gen_orig_addr_w(temp,map);
3204     #else
3205     emit_addimm_no_flags((u_int)0x80000000-(u_int)rdram,temp);
3206     #endif
3207     #if defined(HOST_IMM8)
3208     int ir=get_reg(i_regs->regmap,INVCP);
3209     assert(ir>=0);
3210     emit_cmpmem_indexedsr12_reg(ir,temp,1);
3211     #else
3212     emit_cmpmem_indexedsr12_imm(invalid_code,temp,1);
3213     #endif
3214     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3215     emit_callne(invalidate_addr_reg[temp]);
3216     #else
3217     void *jaddr2 = out;
3218     emit_jne(0);
3219     add_stub(INVCODE_STUB,jaddr2,out,reglist|(1<<HOST_CCREG),temp,0,0,0);
3220     #endif
3221   }
3222 }
3223
3224 void c1ls_assemble(int i,struct regstat *i_regs)
3225 {
3226   cop1_unusable(i, i_regs);
3227 }
3228
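// c2ls_assemble(): LWC2/SWC2, the GTE load/store pair.  The value always
// passes through the FTEMP host register: SWC2 fetches the GTE data
// register first (cop2_get_dreg), LWC2 stores the loaded word back into
// it afterwards (cop2_put_dreg).  SWC2 also runs the same SMC check as
// ordinary stores.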
3229 void c2ls_assemble(int i,struct regstat *i_regs)
3230 {
3231   int s,tl;
3232   int ar;
3233   int offset;
3234   int memtarget=0,c=0;
3235   void *jaddr2=NULL;
3236   enum stub_type type;
3237   int agr=AGEN1+(i&1);
3238   int fastio_reg_override=0;
3239   u_int hr,reglist=0;
3240   u_int copr=(source[i]>>16)&0x1f;
3241   s=get_reg(i_regs->regmap,rs1[i]);
3242   tl=get_reg(i_regs->regmap,FTEMP);
3243   offset=imm[i];
3244   assert(rs1[i]>0);
3245   assert(tl>=0);
3246
3247   for(hr=0;hr<HOST_REGS;hr++) {
3248     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3249   }
3250   if(i_regs->regmap[HOST_CCREG]==CCREG)
3251     reglist&=~(1<<HOST_CCREG);
3252
3253   // get the address
3254   if (opcode[i]==0x3a) { // SWC2
3255     ar=get_reg(i_regs->regmap,agr);
3256     if(ar<0) ar=get_reg(i_regs->regmap,-1);
3257     reglist|=1<<ar;
3258   } else { // LWC2
3259     ar=tl;
3260   }
3261   if(s>=0) c=(i_regs->wasconst>>s)&1;
3262   memtarget=c&&(((signed int)(constmap[i][s]+offset))<(signed int)0x80000000+RAM_SIZE);
3263   if (!offset&&!c&&s>=0) ar=s;
3264   assert(ar>=0);
3265
3266   if (opcode[i]==0x3a) { // SWC2
3267     cop2_get_dreg(copr,tl,HOST_TEMPREG);
3268     type=STOREW_STUB;
3269   }
3270   else
3271     type=LOADW_STUB;
3272
3273   if(c&&!memtarget) {
3274     jaddr2=out;
3275     emit_jmp(0); // inline_readstub/inline_writestub?
3276   }
3277   else {
3278     if(!c) {
3279       jaddr2=emit_fastpath_cmp_jump(i,ar,&fastio_reg_override);
3280     }
3281     else if(ram_offset&&memtarget) {
3282       emit_addimm(ar,ram_offset,HOST_TEMPREG);
3283       fastio_reg_override=HOST_TEMPREG;
3284     }
3285     if (opcode[i]==0x32) { // LWC2
3286       int a=ar; // hoisted: a declaration can't be the body of the #ifdef'd else
3287       if(fastio_reg_override) a=fastio_reg_override;
3288       #ifdef HOST_IMM_ADDR32
3289       if(c) emit_readword_tlb(constmap[i][s]+offset,-1,tl);
3290       else
3291       #endif
3292       emit_readword_indexed(0,a,tl);
3293     }
3294     if (opcode[i]==0x3a) { // SWC2
3295       #ifdef DESTRUCTIVE_SHIFT
3296       if(!offset&&!c&&s>=0) emit_mov(s,ar);
3297       #endif
3298       int a=ar;
3299       if(fastio_reg_override) a=fastio_reg_override;
3300       emit_writeword_indexed(tl,0,a);
3301     }
3302   }
3303   if(jaddr2)
3304     add_stub_r(type,jaddr2,out,i,ar,i_regs,ccadj[i],reglist);
3305   if(opcode[i]==0x3a) // SWC2
3306   if(!(i_regs->waswritten&(1<<rs1[i]))&&!(new_dynarec_hacks&NDHACK_NO_SMC_CHECK)) {
3307 #if defined(HOST_IMM8)
3308     int ir=get_reg(i_regs->regmap,INVCP);
3309     assert(ir>=0);
3310     emit_cmpmem_indexedsr12_reg(ir,ar,1);
3311 #else
3312     emit_cmpmem_indexedsr12_imm(invalid_code,ar,1);
3313 #endif
3314     #if defined(HAVE_CONDITIONAL_CALL) && !defined(DESTRUCTIVE_SHIFT)
3315     emit_callne(invalidate_addr_reg[ar]);
3316     #else
3317     void *jaddr3 = out;
3318     emit_jne(0);
3319     add_stub(INVCODE_STUB,jaddr3,out,reglist|(1<<HOST_CCREG),ar,0,0,0);
3320     #endif
3321   }
3322   if (opcode[i]==0x32) { // LWC2
3323     cop2_put_dreg(copr,tl,HOST_TEMPREG);
3324   }
3325 }
3326
3327 #ifndef multdiv_assemble
3328 void multdiv_assemble(int i,struct regstat *i_regs)
3329 {
3330   printf("Need multdiv_assemble for this architecture.\n");
3331   exit(1);
3332 }
3333 #endif
3334
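// mov_assemble(): MFHI/MFLO/MTHI/MTLO, lowered as a 64-bit register move
// (low half always, high half when the target has one allocated).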
3335 void mov_assemble(int i,struct regstat *i_regs)
3336 {
3337   //if(opcode2[i]==0x10||opcode2[i]==0x12) { // MFHI/MFLO
3338   //if(opcode2[i]==0x11||opcode2[i]==0x13) { // MTHI/MTLO
3339   if(rt1[i]) {
3340     signed char sh,sl,th,tl;
3341     th=get_reg(i_regs->regmap,rt1[i]|64);
3342     tl=get_reg(i_regs->regmap,rt1[i]);
3343     //assert(tl>=0);
3344     if(tl>=0) {
3345       sh=get_reg(i_regs->regmap,rs1[i]|64);
3346       sl=get_reg(i_regs->regmap,rs1[i]);
3347       if(sl>=0) emit_mov(sl,tl);
3348       else emit_loadreg(rs1[i],tl);
3349       if(th>=0) {
3350         if(sh>=0) emit_mov(sh,th);
3351         else emit_loadreg(rs1[i]|64,th);
3352       }
3353     }
3354   }
3355 }
3356
3357 #ifndef fconv_assemble
3358 void fconv_assemble(int i,struct regstat *i_regs)
3359 {
3360   printf("Need fconv_assemble for this architecture.\n");
3361   exit(1);
3362 }
3363 #endif
3364
3365 #if 0
3366 void float_assemble(int i,struct regstat *i_regs)
3367 {
3368   printf("Need float_assemble for this architecture.\n");
3369   exit(1);
3370 }
3371 #endif
3372
3373 void syscall_assemble(int i,struct regstat *i_regs)
3374 {
3375   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3376   assert(ccreg==HOST_CCREG);
3377   assert(!is_delayslot);
3378   (void)ccreg;
3379   emit_movimm(start+i*4,EAX); // Get PC
3380   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // CHECK: is this right?  There should probably be an extra cycle...
3381   emit_jmp(jump_syscall_hle); // XXX
3382 }
3383
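// hlecall_assemble(): dispatch into the HLE BIOS.  The handler index is
// the instruction's 26-bit target field; out-of-range values are routed
// to psxNULL instead of indexing past the end of psxHLEt[].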
3384 void hlecall_assemble(int i,struct regstat *i_regs)
3385 {
3386   extern void psxNULL();
3387   signed char ccreg=get_reg(i_regs->regmap,CCREG);
3388   assert(ccreg==HOST_CCREG);
3389   assert(!is_delayslot);
3390   (void)ccreg;
3391   emit_movimm(start+i*4+4,0); // Get PC
3392   uint32_t hleCode = source[i] & 0x03ffffff;
3393   if (hleCode >= ARRAY_SIZE(psxHLEt))
3394     emit_movimm((uintptr_t)psxNULL,1);
3395   else
3396     emit_movimm((uintptr_t)psxHLEt[hleCode],1);
3397   emit_addimm(HOST_CCREG,CLOCK_ADJUST(ccadj[i]),HOST_CCREG); // XXX
3398   emit_jmp(jump_hlecall);
3399 }
3400
3401 void intcall_assemble(int i,struct regstat *i_regs)
3402 {
3403   signed char ccreg=get_reg(i_regs->regmap,CCREG);