1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 * Mupen64plus/PCSX - assem_arm64.c *
3 * Copyright (C) 2009-2011 Ari64 *
4 * Copyright (C) 2009-2018 Gillou68310 *
5 * Copyright (C) 2021 notaz *
7 * This program is free software; you can redistribute it and/or modify *
8 * it under the terms of the GNU General Public License as published by *
9 * the Free Software Foundation; either version 2 of the License, or *
10 * (at your option) any later version. *
12 * This program is distributed in the hope that it will be useful, *
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 * GNU General Public License for more details. *
17 * You should have received a copy of the GNU General Public License *
18 * along with this program; if not, write to the *
19 * Free Software Foundation, Inc., *
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
21 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
24 #include "arm_features.h"
26 void do_memhandler_pre();
27 void do_memhandler_post();
30 static void set_jump_target(void *addr, void *target)
32 u_int *ptr = NDRC_WRITE_OFFSET(addr);
33 intptr_t offset = (u_char *)target - (u_char *)addr;
35 if ((*ptr&0xFC000000) == 0x14000000) { // b
36 assert(offset>=-134217728LL&&offset<134217728LL);
37 *ptr=(*ptr&0xFC000000)|((offset>>2)&0x3ffffff);
39 else if ((*ptr&0xff000000) == 0x54000000 // b.cond
40 || (*ptr&0x7e000000) == 0x34000000) { // cbz/cbnz
41 // Conditional branch are limited to +/- 1MB
42 // block max size is 256k so branching beyond the +/- 1MB limit
43 // should only happen when jumping to an already compiled block (see add_jump_out)
44 // a workaround would be to do a trampoline jump via a stub at the end of the block
45 assert(-1048576 <= offset && offset < 1048576);
46 *ptr=(*ptr&0xFF00001F)|(((offset>>2)&0x7ffff)<<5);
48 else if((*ptr&0x9f000000)==0x10000000) { // adr
49 // generated by do_miniht_insert
50 assert(offset>=-1048576LL&&offset<1048576LL);
51 *ptr=(*ptr&0x9F00001F)|(offset&0x3)<<29|((offset>>2)&0x7ffff)<<5;
54 abort(); // should not happen
// From a pointer to an external jump stub (which was produced by
// emit_extjump2), find where the jumping insn is.
// The third word of the stub must be an adr; the address it computes is
// returned (rounded towards the adr to a whole instruction).
static void *find_extjump_insn(void *stub)
{
  int *ptr = (int *)stub + 2;
  uint32_t insn = (uint32_t)*ptr;
  int32_t immhi;
  int offset;

  assert((insn & 0x9f000000) == 0x10000000); // adr
  // Decode with unsigned arithmetic: shifting a signed value left into or
  // past the sign bit is undefined behavior.
  immhi = (int32_t)((insn >> 5) & 0x7ffff);   // bits 23..5
  if (immhi & 0x40000)
    immhi -= 0x80000;                         // sign-extend 19 bits
  offset = immhi * 4 + (int)((insn >> 29) & 0x3); // immlo: bits 30..29
  return ptr + offset / 4;
}

// Find where an external branch is linked to, using the address of its stub:
// get the address that the stub loads (dyna_linker arg1),
// treat it as a pointer to a branch insn,
// return the address that branch jumps to.
static void *get_pointer(void *stub)
{
  int *i_ptr = find_extjump_insn(stub);
  uint32_t insn = (uint32_t)*i_ptr;

  if ((insn & 0xfc000000) == 0x14000000) { // b
    int32_t imm26 = (int32_t)(insn & 0x03ffffff);
    if (imm26 & 0x02000000)
      imm26 -= 0x04000000;                  // sign-extend 26 bits
    return i_ptr + imm26;
  }
  if ((insn & 0xff000000) == 0x54000000     // b.cond
      || (insn & 0x7e000000) == 0x34000000) { // cbz/cbnz
    int32_t imm19 = (int32_t)((insn >> 5) & 0x7ffff);
    if (imm19 & 0x40000)
      imm19 -= 0x80000;                     // sign-extend 19 bits
    return i_ptr + imm19;
  }
  assert(0); // not a branch we know how to decode
  return NULL;
}
85 // Allocate a specific ARM register.
86 static void alloc_arm_reg(struct regstat *cur,int i,signed char reg,int hr)
91 // see if it's already allocated (and dealloc it)
92 for(n=0;n<HOST_REGS;n++)
94 if(n!=EXCLUDE_REG&&cur->regmap[n]==reg) {
95 dirty=(cur->dirty>>n)&1;
101 cur->dirty&=~(1<<hr);
102 cur->dirty|=dirty<<hr;
103 cur->isconst&=~(1<<hr);
106 // Alloc cycle count into dedicated register
107 static void alloc_cc(struct regstat *cur,int i)
109 alloc_arm_reg(cur,i,CCREG,HOST_CCREG);
117 static unused const char *regname[32] = {
118 "w0", "w1", "w2", "w3", "w4", "w5", "w6", "w7",
119 "w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15",
120 "ip0", "ip1", "w18", "w19", "w20", "w21", "w22", "w23",
121 "w24", "w25", "w26", "w27", "w28", "wfp", "wlr", "wsp"
124 static unused const char *regname64[32] = {
125 "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
126 "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
127 "ip0", "ip1", "x18", "x19", "x20", "x21", "x22", "x23",
128 "x24", "x25", "x26", "x27", "x28", "fp", "lr", "sp"
132 COND_EQ, COND_NE, COND_CS, COND_CC, COND_MI, COND_PL, COND_VS, COND_VC,
133 COND_HI, COND_LS, COND_GE, COND_LT, COND_GT, COND_LE, COND_AW, COND_NV
136 static unused const char *condname[16] = {
137 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
138 "hi", "ls", "ge", "lt", "gt", "le", "aw", "nv"
141 static void output_w32(u_int word)
143 *((u_int *)NDRC_WRITE_OFFSET(out)) = word;
/*
 * Instruction field packers.  Each helper places its arguments at the bit
 * positions given in its return expression and asserts that every argument
 * fits the width of its field (5 bits for register numbers).
 */

// Rn at bit 5, Rd at bit 0.
static u_int rn_rd(u_int rn, u_int rd)
{
  assert(rn < 32);
  assert(rd < 32);
  return (rn << 5) | rd;
}

// Rm at bit 16, Rn at bit 5, Rd at bit 0.
static u_int rm_rn_rd(u_int rm, u_int rn, u_int rd)
{
  assert(rm < 32);
  assert(rn < 32);
  assert(rd < 32);
  return (rm << 16) | (rn << 5) | rd;
}

// As rm_rn_rd, plus Ra at bit 10.
static u_int rm_ra_rn_rd(u_int rm, u_int ra, u_int rn, u_int rd)
{
  assert(ra < 32);
  return rm_rn_rd(rm, rn, rd) | (ra << 10);
}

// imm7 at bit 15, Rt2 at bit 10, Rn at bit 5, Rt at bit 0.
static u_int imm7_rt2_rn_rt(u_int imm7, u_int rt2, u_int rn, u_int rt)
{
  assert(imm7 < 0x80);
  assert(rt2 < 32);
  assert(rn < 32);
  assert(rt < 32);
  return (imm7 << 15) | (rt2 << 10) | (rn << 5) | rt;
}

// As rm_rn_rd, plus imm6 (shift amount) at bit 10.
static u_int rm_imm6_rn_rd(u_int rm, u_int imm6, u_int rn, u_int rd)
{
  assert(imm6 < 0x40);
  return rm_rn_rd(rm, rn, rd) | (imm6 << 10);
}

// imm16 at bit 5, Rd at bit 0.
static u_int imm16_rd(u_int imm16, u_int rd)
{
  assert(imm16 < 0x10000);
  assert(rd < 32);
  return (imm16 << 5) | rd;
}

// imm12 at bit 10, Rn at bit 5, Rd at bit 0.
static u_int imm12_rn_rd(u_int imm12, u_int rn, u_int rd)
{
  assert(imm12 < 0x1000);
  assert(rn < 32);
  assert(rd < 32);
  return (imm12 << 10) | (rn << 5) | rd;
}

// imm9 at bit 12, Rn at bit 5, Rt at bit 0.
static u_int imm9_rn_rt(u_int imm9, u_int rn, u_int rd)
{
  assert(imm9 < 0x200);
  assert(rn < 32);
  assert(rd < 32);
  return (imm9 << 12) | (rn << 5) | rd;
}

// imm19 at bit 5, Rt at bit 0.
static u_int imm19_rt(u_int imm19, u_int rt)
{
  assert(imm19 < 0x80000);
  assert(rt < 32);
  return (imm19 << 5) | rt;
}

// N at bit 22, immr at bit 16, imms at bit 10, Rn at bit 5, Rd at bit 0.
static u_int n_immr_imms_rn_rd(u_int n, u_int immr, u_int imms, u_int rn, u_int rd)
{
  assert(n < 2);
  assert(immr < 0x40);
  assert(imms < 0x40);
  assert(rn < 32);
  assert(rd < 32);
  return (n << 22) | (immr << 16) | (imms << 10) | (rn << 5) | rd;
}
223 static u_int genjmp(const u_char *addr)
225 intptr_t offset = addr - out;
226 if ((uintptr_t)addr < 3) return 0; // a branch that will be patched later
227 if (offset < -134217728 || offset > 134217727) {
228 SysPrintf("%s: out of range: %p %lx\n", __func__, addr, offset);
232 return ((u_int)offset >> 2) & 0x03ffffff;
235 static u_int genjmpcc(const u_char *addr)
237 intptr_t offset = addr - out;
238 if ((uintptr_t)addr < 3) return 0;
239 if (offset < -1048576 || offset > 1048572) {
240 SysPrintf("%s: out of range: %p %lx\n", __func__, addr, offset);
244 return ((u_int)offset >> 2) & 0x7ffff;
// True if 'value' is a non-empty run of ones starting at bit 0
// (i.e. of the form 2^k - 1).
static uint32_t is_mask(u_int value)
{
  return value && ((value + 1) & value) == 0;
}

// This function returns true if the argument contains a
// non-empty sequence of ones (possibly rotated) with the remainder zero.
static uint32_t is_rotated_mask(u_int value)
{
  if (value == 0 || value == ~0u)
    return 0;
  // a single run of ones that does not wrap around bit 31
  if (is_mask((value - 1) | value))
    return 1;
  // a run that wraps: then the zeros form a non-wrapping run
  return is_mask((~value - 1) | ~value);
}

// Compute the immr/imms fields (N=0, 32-bit element) of a logical
// immediate for 'value'.  'value' must satisfy is_rotated_mask(); anything
// else trips an assert.
static void gen_logical_imm(u_int value, u_int *immr, u_int *imms)
{
  int lzeros, tzeros, ones;
  assert(value != 0);
  if (is_mask((value - 1) | value)) {
    // non-wrapping run: 'ones' set bits rotated left by tzeros
    lzeros = __builtin_clz(value);
    tzeros = __builtin_ctz(value);
    ones = 32 - lzeros - tzeros;
    *immr = (32 - tzeros) & 31;
    *imms = ones - 1;
    return;
  }
  value = ~value;
  if (is_mask((value - 1) | value)) {
    // wrapping run: work from the run of zeros instead
    lzeros = __builtin_clz(value);
    tzeros = __builtin_ctz(value);
    ones = 32 - lzeros - tzeros;
    *immr = lzeros;
    *imms = 31 - ones;
    return;
  }
  assert(0);
}
287 static void emit_mov(u_int rs, u_int rt)
289 assem_debug("mov %s,%s\n", regname[rt], regname[rs]);
290 output_w32(0x2a000000 | rm_rn_rd(rs, WZR, rt));
293 static void emit_mov64(u_int rs, u_int rt)
295 assem_debug("mov %s,%s\n", regname64[rt], regname64[rs]);
296 output_w32(0xaa000000 | rm_rn_rd(rs, WZR, rt));
299 static void emit_add(u_int rs1, u_int rs2, u_int rt)
301 assem_debug("add %s,%s,%s\n", regname[rt], regname[rs1], regname[rs2]);
302 output_w32(0x0b000000 | rm_rn_rd(rs2, rs1, rt));
305 static void emit_add64(u_int rs1, u_int rs2, u_int rt)
307 assem_debug("add %s,%s,%s\n", regname64[rt], regname64[rs1], regname64[rs2]);
308 output_w32(0x8b000000 | rm_rn_rd(rs2, rs1, rt));
311 static void emit_adds64(u_int rs1, u_int rs2, u_int rt)
313 assem_debug("adds %s,%s,%s\n",regname64[rt],regname64[rs1],regname64[rs2]);
314 output_w32(0xab000000 | rm_rn_rd(rs2, rs1, rt));
316 #define emit_adds_ptr emit_adds64
318 static void emit_neg(u_int rs, u_int rt)
320 assem_debug("neg %s,%s\n",regname[rt],regname[rs]);
321 output_w32(0x4b000000 | rm_rn_rd(rs, WZR, rt));
324 static void emit_sub(u_int rs1, u_int rs2, u_int rt)
326 assem_debug("sub %s,%s,%s\n", regname[rt], regname[rs1], regname[rs2]);
327 output_w32(0x4b000000 | rm_imm6_rn_rd(rs2, 0, rs1, rt));
330 static void emit_sub_asrimm(u_int rs1, u_int rs2, u_int shift, u_int rt)
332 assem_debug("sub %s,%s,%s,asr #%u\n",regname[rt],regname[rs1],regname[rs2],shift);
333 output_w32(0x4b800000 | rm_imm6_rn_rd(rs2, shift, rs1, rt));
336 static void emit_movz(u_int imm, u_int rt)
338 assem_debug("movz %s,#%#x\n", regname[rt], imm);
339 output_w32(0x52800000 | imm16_rd(imm, rt));
342 static void emit_movz_lsl16(u_int imm, u_int rt)
344 assem_debug("movz %s,#%#x,lsl #16\n", regname[rt], imm);
345 output_w32(0x52a00000 | imm16_rd(imm, rt));
348 static void emit_movn(u_int imm, u_int rt)
350 assem_debug("movn %s,#%#x\n", regname[rt], imm);
351 output_w32(0x12800000 | imm16_rd(imm, rt));
354 static void emit_movn_lsl16(u_int imm,u_int rt)
356 assem_debug("movn %s,#%#x,lsl #16\n", regname[rt], imm);
357 output_w32(0x12a00000 | imm16_rd(imm, rt));
360 static void emit_movk(u_int imm,u_int rt)
362 assem_debug("movk %s,#%#x\n", regname[rt], imm);
363 output_w32(0x72800000 | imm16_rd(imm, rt));
366 static void emit_movk_lsl16(u_int imm,u_int rt)
369 assem_debug("movk %s,#%#x,lsl #16\n", regname[rt], imm);
370 output_w32(0x72a00000 | imm16_rd(imm, rt));
373 static void emit_zeroreg(u_int rt)
378 static void emit_movimm(u_int imm, u_int rt)
382 else if ((~imm) < 65536)
384 else if ((imm&0xffff) == 0)
385 emit_movz_lsl16(imm >> 16, rt);
386 else if (((~imm)&0xffff) == 0)
387 emit_movn_lsl16(~imm >> 16, rt);
388 else if (is_rotated_mask(imm)) {
390 gen_logical_imm(imm, &immr, &imms);
391 assem_debug("orr %s,wzr,#%#x\n", regname[rt], imm);
392 output_w32(0x32000000 | n_immr_imms_rn_rd(0, immr, imms, WZR, rt));
395 emit_movz(imm & 0xffff, rt);
396 emit_movk_lsl16(imm >> 16, rt);
400 static void emit_movimm64(uint64_t imm, u_int rt)
402 u_int shift, op, imm16, insns = 0;
403 for (shift = 0; shift < 4; shift++) {
404 imm16 = (imm >> shift * 16) & 0xffff;
407 op = insns ? 0xf2800000 : 0xd2800000;
408 assem_debug("mov%c %s,#%#x", insns ? 'k' : 'z', regname64[rt], imm16);
410 assem_debug(",lsl #%u", shift * 16);
412 output_w32(op | (shift << 21) | imm16_rd(imm16, rt));
416 assem_debug("movz %s,#0\n", regname64[rt]);
417 output_w32(0xd2800000 | imm16_rd(0, rt));
421 static void emit_readword(void *addr, u_int rt)
423 uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local;
424 if (!(offset & 3) && offset <= 16380) {
425 assem_debug("ldr %s,[x%d+%#lx]\n", regname[rt], FP, offset);
426 output_w32(0xb9400000 | imm12_rn_rd(offset >> 2, FP, rt));
432 static void emit_readdword(void *addr, u_int rt)
434 uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local;
435 if (!(offset & 7) && offset <= 32760) {
436 assem_debug("ldr %s,[x%d+%#lx]\n", regname64[rt], FP, offset);
437 output_w32(0xf9400000 | imm12_rn_rd(offset >> 3, FP, rt));
442 #define emit_readptr emit_readdword
444 static void emit_readshword(void *addr, u_int rt)
446 uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local;
447 if (!(offset & 1) && offset <= 8190) {
448 assem_debug("ldrsh %s,[x%d+%#lx]\n", regname[rt], FP, offset);
449 output_w32(0x79c00000 | imm12_rn_rd(offset >> 1, FP, rt));
455 static void emit_loadreg(u_int r, u_int hr)
463 //case HIREG: addr = &hi; break;
464 //case LOREG: addr = &lo; break;
465 case CCREG: addr = &cycle_count; break;
466 case CSREG: addr = &psxRegs.CP0.n.SR; break;
467 case INVCP: addr = &invc_ptr; is64 = 1; break;
468 case ROREG: addr = &ram_offset; is64 = 1; break;
471 addr = &psxRegs.GPR.r[r];
475 emit_readdword(addr, hr);
477 emit_readword(addr, hr);
481 static void emit_writeword(u_int rt, void *addr)
483 uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local;
484 if (!(offset & 3) && offset <= 16380) {
485 assem_debug("str %s,[x%d+%#lx]\n", regname[rt], FP, offset);
486 output_w32(0xb9000000 | imm12_rn_rd(offset >> 2, FP, rt));
492 static void emit_writedword(u_int rt, void *addr)
494 uintptr_t offset = (u_char *)addr - (u_char *)&dynarec_local;
495 if (!(offset & 7) && offset <= 32760) {
496 assem_debug("str %s,[x%d+%#lx]\n", regname64[rt], FP, offset);
497 output_w32(0xf9000000 | imm12_rn_rd(offset >> 3, FP, rt));
503 static void emit_storereg(u_int r, u_int hr)
506 void *addr = &psxRegs.GPR.r[r];
508 //case HIREG: addr = &hi; break;
509 //case LOREG: addr = &lo; break;
510 case CCREG: addr = &cycle_count; break;
511 default: assert(r < 34); break;
513 emit_writeword(hr, addr);
516 static void emit_test(u_int rs, u_int rt)
518 assem_debug("tst %s,%s\n", regname[rs], regname[rt]);
519 output_w32(0x6a000000 | rm_rn_rd(rt, rs, WZR));
522 static void emit_testimm(u_int rs, u_int imm)
525 assem_debug("tst %s,#%#x\n", regname[rs], imm);
526 assert(is_rotated_mask(imm)); // good enough for PCSX
527 gen_logical_imm(imm, &immr, &imms);
528 output_w32(0x72000000 | n_immr_imms_rn_rd(0, immr, imms, rs, WZR));
531 static void emit_not(u_int rs,u_int rt)
533 assem_debug("mvn %s,%s\n",regname[rt],regname[rs]);
534 output_w32(0x2a200000 | rm_rn_rd(rs, WZR, rt));
537 static void emit_and(u_int rs1,u_int rs2,u_int rt)
539 assem_debug("and %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
540 output_w32(0x0a000000 | rm_rn_rd(rs2, rs1, rt));
543 static void emit_or(u_int rs1,u_int rs2,u_int rt)
545 assem_debug("orr %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
546 output_w32(0x2a000000 | rm_rn_rd(rs2, rs1, rt));
549 static void emit_bic(u_int rs1,u_int rs2,u_int rt)
551 assem_debug("bic %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
552 output_w32(0x0a200000 | rm_rn_rd(rs2, rs1, rt));
555 static void emit_orrshl_imm(u_int rs,u_int imm,u_int rt)
557 assem_debug("orr %s,%s,%s,lsl #%d\n",regname[rt],regname[rt],regname[rs],imm);
558 output_w32(0x2a000000 | rm_imm6_rn_rd(rs, imm, rt, rt));
561 static void emit_orrshr_imm(u_int rs,u_int imm,u_int rt)
563 assem_debug("orr %s,%s,%s,lsr #%d\n",regname[rt],regname[rt],regname[rs],imm);
564 output_w32(0x2a400000 | rm_imm6_rn_rd(rs, imm, rt, rt));
567 static void emit_bicsar_imm(u_int rs,u_int imm,u_int rt)
569 assem_debug("bic %s,%s,%s,asr #%d\n",regname[rt],regname[rt],regname[rs],imm);
570 output_w32(0x0aa00000 | rm_imm6_rn_rd(rs, imm, rt, rt));
573 static void emit_xor(u_int rs1,u_int rs2,u_int rt)
575 assem_debug("eor %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
576 output_w32(0x4a000000 | rm_rn_rd(rs2, rs1, rt));
579 static void emit_xorsar_imm(u_int rs1, u_int rs2, u_int imm, u_int rt)
581 assem_debug("eor %s,%s,%s,asr #%d\n",regname[rt],regname[rs1],regname[rs2],imm);
582 output_w32(0x4a800000 | rm_imm6_rn_rd(rs2, imm, rs1, rt));
585 static void emit_addimm_s(u_int s, u_int is64, u_int rs, uintptr_t imm, u_int rt)
587 unused const char *st = s ? "s" : "";
588 s = s ? 0x20000000 : 0;
589 is64 = is64 ? 0x80000000 : 0;
591 assem_debug("add%s %s,%s,%#lx\n", st, regname[rt], regname[rs], imm);
592 output_w32(0x11000000 | is64 | s | imm12_rn_rd(imm, rs, rt));
594 else if (-imm < 4096) {
595 assem_debug("sub%s %s,%s,%#lx\n", st, regname[rt], regname[rs], -imm);
596 output_w32(0x51000000 | is64 | s | imm12_rn_rd(-imm, rs, rt));
598 else if (imm < 16777216) {
599 assem_debug("add %s,%s,#%#lx\n",regname[rt],regname[rt],imm&0xfff000);
600 output_w32(0x11400000 | is64 | imm12_rn_rd(imm >> 12, rs, rt));
601 if ((imm & 0xfff) || s) {
602 assem_debug("add%s %s,%s,#%#lx\n",st,regname[rt],regname[rs],imm&0xfff);
603 output_w32(0x11000000 | is64 | s | imm12_rn_rd(imm & 0xfff, rt, rt));
606 else if (-imm < 16777216) {
607 assem_debug("sub %s,%s,#%#lx\n",regname[rt],regname[rt],-imm&0xfff000);
608 output_w32(0x51400000 | is64 | imm12_rn_rd(-imm >> 12, rs, rt));
609 if ((imm & 0xfff) || s) {
610 assem_debug("sub%s %s,%s,#%#lx\n",st,regname[rt],regname[rs],-imm&0xfff);
611 output_w32(0x51000000 | is64 | s | imm12_rn_rd(-imm & 0xfff, rt, rt));
618 static void emit_addimm(u_int rs, uintptr_t imm, u_int rt)
624 emit_addimm_s(0, 0, rs, imm, rt);
627 static void emit_addimm64(u_int rs, uintptr_t imm, u_int rt)
629 emit_addimm_s(0, 1, rs, imm, rt);
632 static void emit_addimm_ptr(u_int rs, uintptr_t imm, u_int rt)
634 emit_addimm64(rs, imm, rt);
637 static void emit_addimm_and_set_flags(int imm, u_int rt)
639 emit_addimm_s(1, 0, rt, imm, rt);
642 static void emit_logicop_imm(u_int op, u_int rs, u_int imm, u_int rt)
644 const char *names[] = { "and", "orr", "eor", "ands" };
645 const char *name = names[op];
648 if (is_rotated_mask(imm)) {
649 gen_logical_imm(imm, &immr, &imms);
650 assem_debug("%s %s,%s,#%#x\n", name, regname[rt], regname[rs], imm);
651 output_w32(op | 0x12000000 | n_immr_imms_rn_rd(0, immr, imms, rs, rt));
654 if (rs == HOST_TEMPREG || rt != HOST_TEMPREG)
655 host_tempreg_acquire();
656 emit_movimm(imm, HOST_TEMPREG);
657 assem_debug("%s %s,%s,%s\n", name, regname[rt], regname[rs], regname[HOST_TEMPREG]);
658 output_w32(op | 0x0a000000 | rm_rn_rd(HOST_TEMPREG, rs, rt));
659 if (rs == HOST_TEMPREG || rt != HOST_TEMPREG)
660 host_tempreg_release();
665 static void emit_andimm(u_int rs, u_int imm, u_int rt)
670 emit_logicop_imm(0, rs, imm, rt);
673 static void emit_orimm(u_int rs, u_int imm, u_int rt)
680 emit_logicop_imm(1, rs, imm, rt);
683 static void emit_xorimm(u_int rs, u_int imm, u_int rt)
690 emit_logicop_imm(2, rs, imm, rt);
693 static void emit_sbfm(u_int rs,u_int imm,u_int rt)
695 assem_debug("sbfm %s,%s,#0,#%d\n",regname[rt],regname[rs],imm);
696 output_w32(0x13000000 | n_immr_imms_rn_rd(0, 0, imm, rs, rt));
699 static void emit_ubfm(u_int rs,u_int imm,u_int rt)
701 assem_debug("ubfm %s,%s,#0,#%d\n",regname[rt],regname[rs],imm);
702 output_w32(0x53000000 | n_immr_imms_rn_rd(0, 0, imm, rs, rt));
705 static void emit_shlimm(u_int rs,u_int imm,u_int rt)
707 assem_debug("lsl %s,%s,#%d\n",regname[rt],regname[rs],imm);
708 output_w32(0x53000000 | n_immr_imms_rn_rd(0, (31-imm)+1, 31-imm, rs, rt));
711 static void emit_shrimm(u_int rs,u_int imm,u_int rt)
713 assem_debug("lsr %s,%s,#%d\n",regname[rt],regname[rs],imm);
714 output_w32(0x53000000 | n_immr_imms_rn_rd(0, imm, 31, rs, rt));
717 static void emit_shrimm64(u_int rs,u_int imm,u_int rt)
719 assem_debug("lsr %s,%s,#%d\n",regname[rt],regname[rs],imm);
720 output_w32(0xd3400000 | n_immr_imms_rn_rd(0, imm, 63, rs, rt));
723 static void emit_sarimm(u_int rs,u_int imm,u_int rt)
725 assem_debug("asr %s,%s,#%d\n",regname[rt],regname[rs],imm);
726 output_w32(0x13000000 | n_immr_imms_rn_rd(0, imm, 31, rs, rt));
729 static void emit_rorimm(u_int rs,u_int imm,u_int rt)
731 assem_debug("ror %s,%s,#%d\n",regname[rt],regname[rs],imm);
732 output_w32(0x13800000 | rm_imm6_rn_rd(rs, imm, rs, rt));
735 static void emit_signextend16(u_int rs, u_int rt)
737 assem_debug("sxth %s,%s\n", regname[rt], regname[rs]);
738 output_w32(0x13000000 | n_immr_imms_rn_rd(0, 0, 15, rs, rt));
741 static void emit_shl(u_int rs,u_int rshift,u_int rt)
743 assem_debug("lsl %s,%s,%s\n",regname[rt],regname[rs],regname[rshift]);
744 output_w32(0x1ac02000 | rm_rn_rd(rshift, rs, rt));
747 static void emit_shr(u_int rs,u_int rshift,u_int rt)
749 assem_debug("lsr %s,%s,%s\n",regname[rt],regname[rs],regname[rshift]);
750 output_w32(0x1ac02400 | rm_rn_rd(rshift, rs, rt));
753 static void emit_sar(u_int rs,u_int rshift,u_int rt)
755 assem_debug("asr %s,%s,%s\n",regname[rt],regname[rs],regname[rshift]);
756 output_w32(0x1ac02800 | rm_rn_rd(rshift, rs, rt));
759 static void emit_cmpimm(u_int rs, u_int imm)
762 assem_debug("cmp %s,%#x\n", regname[rs], imm);
763 output_w32(0x71000000 | imm12_rn_rd(imm, rs, WZR));
765 else if (-imm < 4096) {
766 assem_debug("cmn %s,%#x\n", regname[rs], imm);
767 output_w32(0x31000000 | imm12_rn_rd(-imm, rs, WZR));
769 else if (imm < 16777216 && !(imm & 0xfff)) {
770 assem_debug("cmp %s,#%#x\n", regname[rs], imm);
771 output_w32(0x71400000 | imm12_rn_rd(imm >> 12, rs, WZR));
774 host_tempreg_acquire();
775 emit_movimm(imm, HOST_TEMPREG);
776 assem_debug("cmp %s,%s\n", regname[rs], regname[HOST_TEMPREG]);
777 output_w32(0x6b000000 | rm_rn_rd(HOST_TEMPREG, rs, WZR));
778 host_tempreg_release();
782 static void emit_cmov_imm(u_int cond0, u_int cond1, u_int imm, u_int rt)
784 assert(imm == 0 || imm == 1);
785 assert(cond0 < 0x10);
786 assert(cond1 < 0x10);
788 assem_debug("csinc %s,%s,%s,%s\n",regname[rt],regname[rt],regname[WZR],condname[cond1]);
789 output_w32(0x1a800400 | (cond1 << 12) | rm_rn_rd(WZR, rt, rt));
791 assem_debug("csel %s,%s,%s,%s\n",regname[rt],regname[WZR],regname[rt],condname[cond0]);
792 output_w32(0x1a800000 | (cond0 << 12) | rm_rn_rd(rt, WZR, rt));
796 static void emit_cmovne_imm(u_int imm,u_int rt)
798 emit_cmov_imm(COND_NE, COND_EQ, imm, rt);
801 static void emit_cmovl_imm(u_int imm,u_int rt)
803 emit_cmov_imm(COND_LT, COND_GE, imm, rt);
806 static void emit_cmovb_imm(int imm,u_int rt)
808 emit_cmov_imm(COND_CC, COND_CS, imm, rt);
811 static void emit_cmoveq_reg(u_int rs,u_int rt)
813 assem_debug("csel %s,%s,%s,eq\n",regname[rt],regname[rs],regname[rt]);
814 output_w32(0x1a800000 | (COND_EQ << 12) | rm_rn_rd(rt, rs, rt));
817 static void emit_cmovne_reg(u_int rs,u_int rt)
819 assem_debug("csel %s,%s,%s,ne\n",regname[rt],regname[rs],regname[rt]);
820 output_w32(0x1a800000 | (COND_NE << 12) | rm_rn_rd(rt, rs, rt));
823 static void emit_cmovl_reg(u_int rs,u_int rt)
825 assem_debug("csel %s,%s,%s,lt\n",regname[rt],regname[rs],regname[rt]);
826 output_w32(0x1a800000 | (COND_LT << 12) | rm_rn_rd(rt, rs, rt));
829 static void emit_cmovb_reg(u_int rs,u_int rt)
831 assem_debug("csel %s,%s,%s,cc\n",regname[rt],regname[rs],regname[rt]);
832 output_w32(0x1a800000 | (COND_CC << 12) | rm_rn_rd(rt, rs, rt));
835 static void emit_cmovs_reg(u_int rs,u_int rt)
837 assem_debug("csel %s,%s,%s,mi\n",regname[rt],regname[rs],regname[rt]);
838 output_w32(0x1a800000 | (COND_MI << 12) | rm_rn_rd(rt, rs, rt));
841 static void emit_csinvle_reg(u_int rs1,u_int rs2,u_int rt)
843 assem_debug("csinv %s,%s,%s,le\n",regname[rt],regname[rs1],regname[rs2]);
844 output_w32(0x5a800000 | (COND_LE << 12) | rm_rn_rd(rs2, rs1, rt));
847 static void emit_slti32(u_int rs,int imm,u_int rt)
849 if(rs!=rt) emit_zeroreg(rt);
851 if(rs==rt) emit_movimm(0,rt);
852 emit_cmovl_imm(1,rt);
855 static void emit_sltiu32(u_int rs,int imm,u_int rt)
857 if(rs!=rt) emit_zeroreg(rt);
859 if(rs==rt) emit_movimm(0,rt);
860 emit_cmovb_imm(1,rt);
863 static void emit_cmp(u_int rs,u_int rt)
865 assem_debug("cmp %s,%s\n",regname[rs],regname[rt]);
866 output_w32(0x6b000000 | rm_rn_rd(rt, rs, WZR));
869 static void emit_cmpcs(u_int rs,u_int rt)
871 assem_debug("ccmp %s,%s,#0,cs\n",regname[rs],regname[rt]);
872 output_w32(0x7a400000 | (COND_CS << 12) | rm_rn_rd(rt, rs, 0));
875 static void emit_set_gz32(u_int rs, u_int rt)
877 //assem_debug("set_gz32\n");
880 emit_cmovl_imm(0,rt);
883 static void emit_set_nz32(u_int rs, u_int rt)
885 //assem_debug("set_nz32\n");
886 if(rs!=rt) emit_mov(rs,rt);
888 emit_cmovne_imm(1,rt);
891 static void emit_set_if_less32(u_int rs1, u_int rs2, u_int rt)
893 //assem_debug("set if less (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]);
894 if(rs1!=rt&&rs2!=rt) emit_zeroreg(rt);
896 if(rs1==rt||rs2==rt) emit_movimm(0,rt);
897 emit_cmovl_imm(1,rt);
900 static void emit_set_if_carry32(u_int rs1, u_int rs2, u_int rt)
902 //assem_debug("set if carry (%%%s,%%%s),%%%s\n",regname[rs1],regname[rs2],regname[rt]);
903 if(rs1!=rt&&rs2!=rt) emit_zeroreg(rt);
905 if(rs1==rt||rs2==rt) emit_movimm(0,rt);
906 emit_cmovb_imm(1,rt);
909 static int can_jump_or_call(const void *a)
911 intptr_t diff = (u_char *)a - out;
912 return (-134217728 <= diff && diff <= 134217727);
915 static void emit_call(const void *a)
917 intptr_t diff = (u_char *)a - out;
918 assem_debug("bl %p (%p+%lx)%s\n", a, out, diff, func_name(a));
920 if (-134217728 <= diff && diff <= 134217727)
921 output_w32(0x94000000 | ((diff >> 2) & 0x03ffffff));
926 static void emit_jmp(const void *a)
928 assem_debug("b %p (%p+%lx)%s\n", a, out, (u_char *)a - out, func_name(a));
929 u_int offset = genjmp(a);
930 output_w32(0x14000000 | offset);
933 static void emit_jne(const void *a)
935 assem_debug("bne %p\n", a);
936 u_int offset = genjmpcc(a);
937 output_w32(0x54000000 | (offset << 5) | COND_NE);
940 static void emit_jeq(const void *a)
942 assem_debug("beq %p\n", a);
943 u_int offset = genjmpcc(a);
944 output_w32(0x54000000 | (offset << 5) | COND_EQ);
947 static void emit_js(const void *a)
949 assem_debug("bmi %p\n", a);
950 u_int offset = genjmpcc(a);
951 output_w32(0x54000000 | (offset << 5) | COND_MI);
954 static void emit_jns(const void *a)
956 assem_debug("bpl %p\n", a);
957 u_int offset = genjmpcc(a);
958 output_w32(0x54000000 | (offset << 5) | COND_PL);
961 static void emit_jl(const void *a)
963 assem_debug("blt %p\n", a);
964 u_int offset = genjmpcc(a);
965 output_w32(0x54000000 | (offset << 5) | COND_LT);
968 static void emit_jge(const void *a)
970 assem_debug("bge %p\n", a);
971 u_int offset = genjmpcc(a);
972 output_w32(0x54000000 | (offset << 5) | COND_GE);
975 static void emit_jno(const void *a)
977 assem_debug("bvc %p\n", a);
978 u_int offset = genjmpcc(a);
979 output_w32(0x54000000 | (offset << 5) | COND_VC);
982 static void emit_jc(const void *a)
984 assem_debug("bcs %p\n", a);
985 u_int offset = genjmpcc(a);
986 output_w32(0x54000000 | (offset << 5) | COND_CS);
989 static void emit_cb(u_int isnz, u_int is64, const void *a, u_int r)
991 assem_debug("cb%sz %s,%p\n", isnz?"n":"", is64?regname64[r]:regname[r], a);
992 u_int offset = genjmpcc(a);
993 is64 = is64 ? 0x80000000 : 0;
994 isnz = isnz ? 0x01000000 : 0;
995 output_w32(0x34000000 | is64 | isnz | imm19_rt(offset, r));
998 static void *emit_cbz(u_int r, const void *a)
1001 emit_cb(0, 0, a, r);
1005 static void emit_jmpreg(u_int r)
1007 assem_debug("br %s\n", regname64[r]);
1008 output_w32(0xd61f0000 | rm_rn_rd(0, r, 0));
1011 static void emit_retreg(u_int r)
1013 assem_debug("ret %s\n", r == LR ? "" : regname64[r]);
1014 output_w32(0xd65f0000 | rm_rn_rd(0, r, 0));
1017 static void emit_ret(void)
1022 static void emit_adr(void *addr, u_int rt)
1024 intptr_t offset = (u_char *)addr - out;
1025 assert(-1048576 <= offset && offset < 1048576);
1027 assem_debug("adr x%d,#%#lx\n", rt, offset);
1028 output_w32(0x10000000 | ((offset&0x3) << 29) | (((offset>>2)&0x7ffff) << 5) | rt);
1031 static void emit_adrp(void *addr, u_int rt)
1033 intptr_t offset = ((intptr_t)addr & ~0xfffl) - ((intptr_t)out & ~0xfffl);
1034 assert(-4294967296l <= offset && offset < 4294967296l);
1037 assem_debug("adrp %s,#%#lx(000)\n",regname64[rt],offset);
1038 output_w32(0x90000000 | ((offset&0x3)<<29) | (((offset>>2)&0x7ffff)<<5) | rt);
1041 static void emit_readword_indexed(int offset, u_int rs, u_int rt)
1043 assem_debug("ldur %s,[%s+%#x]\n",regname[rt],regname64[rs],offset);
1044 assert(-256 <= offset && offset < 256);
1045 output_w32(0xb8400000 | imm9_rn_rt(offset&0x1ff, rs, rt));
1048 static void emit_strb_dualindexed(u_int rs1, u_int rs2, u_int rt)
1050 assem_debug("strb %s, [%s,%s]\n",regname[rt],regname64[rs1],regname[rs2]);
1051 output_w32(0x38204800 | rm_rn_rd(rs2, rs1, rt));
1054 static void emit_strh_dualindexed(u_int rs1, u_int rs2, u_int rt)
1056 assem_debug("strh %s, [%s,%s]\n",regname[rt],regname64[rs1],regname[rs2]);
1057 output_w32(0x78204800 | rm_rn_rd(rs2, rs1, rt));
1060 static void emit_str_dualindexed(u_int rs1, u_int rs2, u_int rt)
1062 assem_debug("str %s, [%s,%s]\n",regname[rt],regname64[rs1],regname[rs2]);
1063 output_w32(0xb8204800 | rm_rn_rd(rs2, rs1, rt));
1066 static void emit_readdword_dualindexedx8(u_int rs1, u_int rs2, u_int rt)
1068 assem_debug("ldr %s, [%s,%s, uxtw #3]\n",regname64[rt],regname64[rs1],regname[rs2]);
1069 output_w32(0xf8605800 | rm_rn_rd(rs2, rs1, rt));
1071 #define emit_readptr_dualindexedx_ptrlen emit_readdword_dualindexedx8
1073 static void emit_ldrb_dualindexed(u_int rs1, u_int rs2, u_int rt)
1075 assem_debug("ldrb %s, [%s,%s]\n",regname[rt],regname64[rs1],regname[rs2]);
1076 output_w32(0x38604800 | rm_rn_rd(rs2, rs1, rt));
1079 static void emit_ldrsb_dualindexed(u_int rs1, u_int rs2, u_int rt)
1081 assem_debug("ldrsb %s, [%s,%s]\n",regname[rt],regname64[rs1],regname[rs2]);
1082 output_w32(0x38a04800 | rm_rn_rd(rs2, rs1, rt));
1085 static void emit_ldrh_dualindexed(u_int rs1, u_int rs2, u_int rt)
1087 assem_debug("ldrh %s, [%s,%s, uxtw]\n",regname[rt],regname64[rs1],regname[rs2]);
1088 output_w32(0x78604800 | rm_rn_rd(rs2, rs1, rt));
1091 static void emit_ldrsh_dualindexed(u_int rs1, u_int rs2, u_int rt)
1093 assem_debug("ldrsh %s, [%s,%s, uxtw]\n",regname[rt],regname64[rs1],regname[rs2]);
1094 output_w32(0x78a04800 | rm_rn_rd(rs2, rs1, rt));
1097 static void emit_ldr_dualindexed(u_int rs1, u_int rs2, u_int rt)
1099 assem_debug("ldr %s, [%s,%s, uxtw]\n",regname[rt],regname64[rs1],regname[rs2]);
1100 output_w32(0xb8604800 | rm_rn_rd(rs2, rs1, rt));
1103 static void emit_movsbl_indexed(int offset, u_int rs, u_int rt)
1105 assem_debug("ldursb %s,[%s+%#x]\n",regname[rt],regname64[rs],offset);
1106 assert(-256 <= offset && offset < 256);
1107 output_w32(0x38c00000 | imm9_rn_rt(offset&0x1ff, rs, rt));
1110 static void emit_movswl_indexed(int offset, u_int rs, u_int rt)
1112 assem_debug("ldursh %s,[%s+%#x]\n",regname[rt],regname64[rs],offset);
1113 assert(-256 <= offset && offset < 256);
1114 output_w32(0x78c00000 | imm9_rn_rt(offset&0x1ff, rs, rt));
1117 static void emit_movzbl_indexed(int offset, u_int rs, u_int rt)
1119 assem_debug("ldurb %s,[%s+%#x]\n",regname[rt],regname64[rs],offset);
1120 assert(-256 <= offset && offset < 256);
1121 output_w32(0x38400000 | imm9_rn_rt(offset&0x1ff, rs, rt));
1124 static void emit_movzwl_indexed(int offset, u_int rs, u_int rt)
1126 assem_debug("ldurh %s,[%s+%#x]\n",regname[rt],regname64[rs],offset);
1127 assert(-256 <= offset && offset < 256);
1128 output_w32(0x78400000 | imm9_rn_rt(offset&0x1ff, rs, rt));
1131 static void emit_writeword_indexed(u_int rt, int offset, u_int rs)
1133 if (!(offset & 3) && (u_int)offset <= 16380) {
1134 assem_debug("str %s,[%s+%#x]\n", regname[rt], regname[rs], offset);
1135 output_w32(0xb9000000 | imm12_rn_rd(offset >> 2, rs, rt));
1137 else if (-256 <= offset && offset < 256) {
1138 assem_debug("stur %s,[%s+%#x]\n", regname[rt], regname[rs], offset);
1139 output_w32(0xb8000000 | imm9_rn_rt(offset & 0x1ff, rs, rt));
1145 static void emit_writehword_indexed(u_int rt, int offset, u_int rs)
1147 if (!(offset & 1) && (u_int)offset <= 8190) {
1148 assem_debug("strh %s,[%s+%#x]\n", regname[rt], regname64[rs], offset);
1149 output_w32(0x79000000 | imm12_rn_rd(offset >> 1, rs, rt));
1151 else if (-256 <= offset && offset < 256) {
1152 assem_debug("sturh %s,[%s+%#x]\n", regname[rt], regname64[rs], offset);
1153 output_w32(0x78000000 | imm9_rn_rt(offset & 0x1ff, rs, rt));
1159 static void emit_writebyte_indexed(u_int rt, int offset, u_int rs)
1161 if ((u_int)offset < 4096) {
1162 assem_debug("strb %s,[%s+%#x]\n", regname[rt], regname64[rs], offset);
1163 output_w32(0x39000000 | imm12_rn_rd(offset, rs, rt));
1165 else if (-256 <= offset && offset < 256) {
1166 assem_debug("sturb %s,[%s+%#x]\n", regname[rt], regname64[rs], offset);
1167 output_w32(0x38000000 | imm9_rn_rt(offset & 0x1ff, rs, rt));
1173 static void emit_umull(u_int rs1, u_int rs2, u_int rt)
1175 assem_debug("umull %s,%s,%s\n",regname64[rt],regname[rs1],regname[rs2]);
1176 output_w32(0x9ba00000 | rm_ra_rn_rd(rs2, WZR, rs1, rt));
1179 static void emit_smull(u_int rs1, u_int rs2, u_int rt)
1181 assem_debug("smull %s,%s,%s\n",regname64[rt],regname[rs1],regname[rs2]);
1182 output_w32(0x9b200000 | rm_ra_rn_rd(rs2, WZR, rs1, rt));
1185 static void emit_msub(u_int rs1, u_int rs2, u_int rs3, u_int rt)
1187 assem_debug("msub %s,%s,%s,%s\n",regname[rt],regname[rs1],regname[rs2],regname[rs3]);
1188 output_w32(0x1b008000 | rm_ra_rn_rd(rs2, rs3, rs1, rt));
1191 static void emit_sdiv(u_int rs1, u_int rs2, u_int rt)
1193 assem_debug("sdiv %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
1194 output_w32(0x1ac00c00 | rm_rn_rd(rs2, rs1, rt));
1197 static void emit_udiv(u_int rs1, u_int rs2, u_int rt)
1199 assem_debug("udiv %s,%s,%s\n",regname[rt],regname[rs1],regname[rs2]);
1200 output_w32(0x1ac00800 | rm_rn_rd(rs2, rs1, rt));
1203 static void emit_clz(u_int rs, u_int rt)
1205 assem_debug("clz %s,%s\n",regname[rt],regname[rs]);
1206 output_w32(0x5ac01000 | rn_rd(rs, rt));
1209 // special case for checking invalid_code
1210 static void emit_ldrb_indexedsr12_reg(u_int rbase, u_int r, u_int rt)
1212 emit_shrimm(r, 12, rt);
1213 assem_debug("ldrb %s,[%s,%s,uxtw]\n",regname[rt],regname64[rbase],regname[rt]);
1214 output_w32(0x38604800 | rm_rn_rd(rt, rbase, rt));
1217 // special for loadlr_assemble, rs2 is destroyed
1218 static void emit_bic_lsl(u_int rs1,u_int rs2,u_int shift,u_int rt)
1220 emit_shl(rs2, shift, rs2);
1221 emit_bic(rs1, rs2, rt);
1224 static void emit_bic_lsr(u_int rs1,u_int rs2,u_int shift,u_int rt)
1226 emit_shr(rs2, shift, rs2);
1227 emit_bic(rs1, rs2, rt);
1230 static void emit_ldst(int is_st, int is64, u_int rt, u_int rn, u_int ofs)
1232 u_int op = 0xb9000000;
1233 unused const char *ldst = is_st ? "st" : "ld";
1234 unused char rp = is64 ? 'x' : 'w';
1235 assem_debug("%sr %c%d,[x%d,#%#x]\n", ldst, rp, rt, rn, ofs);
1236 is64 = is64 ? 1 : 0;
1237 assert((ofs & ((1 << (2+is64)) - 1)) == 0);
1238 ofs = (ofs >> (2+is64));
1239 if (!is_st) op |= 0x00400000;
1240 if (is64) op |= 0x40000000;
1241 output_w32(op | imm12_rn_rd(ofs, rn, rt));
1244 static void emit_ldstp(int is_st, int is64, u_int rt1, u_int rt2, u_int rn, int ofs)
1246 u_int op = 0x29000000;
1247 unused const char *ldst = is_st ? "st" : "ld";
1248 unused char rp = is64 ? 'x' : 'w';
1249 assem_debug("%sp %c%d,%c%d,[x%d,#%#x]\n", ldst, rp, rt1, rp, rt2, rn, ofs);
1250 is64 = is64 ? 1 : 0;
1251 assert((ofs & ((1 << (2+is64)) - 1)) == 0);
1252 ofs = (ofs >> (2+is64));
1253 assert(-64 <= ofs && ofs <= 63);
1255 if (!is_st) op |= 0x00400000;
1256 if (is64) op |= 0x80000000;
1257 output_w32(op | imm7_rt2_rn_rt(ofs, rt2, rn, rt1));
// Emit code that stores (is_store != 0) or reloads (is_store == 0) every host
// register whose bit is set in reglist, using 64-bit accesses relative to SP
// starting at SSP_CALLEE_REGS.
// NOTE(review): the declarations and the pairing logic between the visible
// lines are not shown here; the comments below cover only what is visible.
1260 static void save_load_regs_all(int is_store, u_int reglist)
// walk the mask from bit 0 upwards; stops as soon as no set bits remain
1264 for (r = 0; reglist; r++, reglist >>= 1) {
// two collected registers are transferred with a single ldp/stp
1268 emit_ldstp(is_store, 1, pair[0], pair[1], SP, SSP_CALLEE_REGS + ofs);
// a single register is transferred with a plain ldr/str
1274 emit_ldst(is_store, 1, pair[0], SP, SSP_CALLEE_REGS + ofs);
// the save area used must not run past SSP_CALLER_REGS
1277 assert(ofs <= SSP_CALLER_REGS);
1280 // Save registers before function call
1281 static void save_regs(u_int reglist)
1283 reglist &= CALLER_SAVE_REGS; // only save the caller-save registers
1284 save_load_regs_all(1, reglist);
1287 // Restore registers after function call
1288 static void restore_regs(u_int reglist)
1290 reglist &= CALLER_SAVE_REGS;
1291 save_load_regs_all(0, reglist);
1294 /* Stubs/epilogue */
// NOTE(review): the body of literal_pool is not visible in this listing, so
// its behaviour for the given n cannot be described from here.
1296 static void literal_pool(int n)
// NOTE(review): the body of literal_pool_jumpover is not visible in this
// listing, so its behaviour for the given n cannot be described from here.
1301 static void literal_pool_jumpover(int n)
// Emit the stub used for a jump that leaves the current block:
//   addr    location of the branch instruction inside the block
//   target  32-bit target address handed to dyna_linker
// The exact instruction sequence matters because other code decodes it:
1305 // parsed by get_pointer, find_extjump_insn
1306 static void emit_extjump(u_char *addr, u_int target)
// top byte of the little-endian instruction word identifies the branch kind
1308 assert(((addr[3]&0xfc)==0x14) || ((addr[3]&0xff)==0x54)); // b or b.cond
// build the full 32-bit target in register 0: low half, then high half
1310 emit_movz(target & 0xffff, 0);
1311 emit_movk_lsl16(target >> 16, 0);
// NOTE(review): the statement these two comments describe is not visible here
1313 // addr is in the current recompiled block (max 256k)
1314 // offset shouldn't exceed +/-1MB
1316 emit_far_jump(dyna_linker);
// Sanity check on a stub located at src: the first instruction word must be
// a movz into register 0 (immediate bits are masked out of the comparison).
// NOTE(review): the definition of ptr is not visible here; presumably it is
// src viewed as an array of instruction words - confirm.
1319 static void check_extjump2(void *src)
1322 assert((ptr[0] & 0xffe0001f) == 0x52800000); // movz r0, #val
1326 // put rt_val into rt, potentially making use of rs with value rs_val
// Strategies are tried in order; the first that applies wins:
//   1. add/sub of the difference when it fits one immediate form
//   2. a dedicated case for rt_val being the bitwise complement of rs_val
//      (its emitted instruction is not visible in this listing)
//   3. xor with the differing bits when they form an encodable mask
//   4. a plain immediate load of rt_val that ignores rs
1327 static void emit_movimm_from(u_int rs_val, u_int rs, u_int rt_val, u_int rt)
1329 int diff = rt_val - rs_val;
// |diff| < 4096, or |diff| < 16M with the low 12 bits clear
1330 if ((-4096 < diff && diff < 4096)
1331 || (-16777216 < diff && diff < 16777216 && !(diff & 0xfff)))
1332 emit_addimm(rs, diff, rt);
1333 else if (rt_val == ~rs_val)
1335 else if (is_rotated_mask(rs_val ^ rt_val))
1336 emit_xorimm(rs, rs_val ^ rt_val, rt);
// fallback: load the constant from scratch
1338 emit_movimm(rt_val, rt);
1341 // return 1 if the above function can do its job cheaply
// The visible tests mirror the add-immediate and xor-mask cases of
// emit_movimm_from.
// NOTE(review): the definition of diff and one operand of the || chain are
// not visible in this listing.
1342 static int is_similar_value(u_int v1, u_int v2)
1345 return (-4096 < diff && diff < 4096)
1346 || (-16777216 < diff && diff < 16777216 && !(diff & 0xfff))
1348 || is_rotated_mask(v1 ^ v2);
// 64-bit capable variant of emit_movimm_from: put rt_val (pointer sized)
// into rt, optionally deriving it from rs which currently holds rs_val.
1351 static void emit_movimm_from64(u_int rs_val, u_int rs, uintptr_t rt_val, u_int rt)
// a value that fits in 32 bits can go through the 32-bit helper
1353 if (rt_val < 0x100000000ull) {
1354 emit_movimm_from(rs_val, rs, rt_val, rt);
// NOTE(review): the control flow between the two cases is not visible here
1357 // just move the whole thing. At least on Linux all addresses
1358 // seem to be 48bit, so 3 insns - not great not terrible
1359 emit_movimm64(rt_val, rt);
// Emit 64-bit moves that place host register a0 in register 0 and host
// register a1 in register 1 without overwriting a source before it is read.
// NOTE(review): a0/a1 are u_int, so the visible ">=0" tests are always true.
// NOTE(review): the first condition and one statement of the second branch
// are not visible in this listing.
1363 static void pass_args64(u_int a0, u_int a1)
// route a0 through register 2 as scratch so the a1->1 move cannot clobber it
1367 emit_mov64(a0,2); emit_mov64(a1,1); emit_mov64(2,0);
// a1 currently lives in register 0 and a0 somewhere else
1369 else if(a0!=0&&a1==0) {
1371 if (a0>=0) emit_mov64(a0,0);
// remaining case: skip moves whose source already is the destination
1374 if(a0>=0&&a0!=0) emit_mov64(a0,0);
1375 if(a1>=0&&a1!=1) emit_mov64(a1,1);
// Emit the width adjustment that goes with a load/store stub type, reading
// rs and writing rt.
// NOTE(review): only some of the switch cases are visible in this listing.
1379 static void loadstore_extend(enum stub_type type, u_int rs, u_int rt)
// signed byte load: signed bitfield op with 7 as the top bit
1382 case LOADB_STUB: emit_sbfm(rs, 7, rt); break;
// byte store: unsigned bitfield op with 7 as the top bit
1384 case STOREB_STUB: emit_ubfm(rs, 7, rt); break;
// signed halfword load: signed bitfield op with 15 as the top bit
1385 case LOADH_STUB: emit_sbfm(rs, 15, rt); break;
// halfword store: unsigned bitfield op with 15 as the top bit
1387 case STOREH_STUB: emit_ubfm(rs, 15, rt); break;
// word store: nothing to extend, a move is only needed when registers differ
1389 case STOREW_STUB: if (rs != rt) emit_mov(rs, rt); break;
1394 #include "pcsxmem.h"
1395 //#include "pcsxmem_inline.c"
// Emit the out-of-line path for read stub number n.
// The branch recorded in stubs[n].addr is patched to land at the current
// output position. The emitted code looks the address up through mem_rtab
// and performs the load directly, or else calls one of the
// jump_handler_read* routines; both paths end by branching to
// stubs[n].retaddr.
// NOTE(review): many short lines (declarations, braces, else/case labels and
// a few statements) are not visible in this listing; the comments below
// describe only what the visible lines establish.
1397 static void do_readstub(int n)
1399 assem_debug("do_readstub %x\n",start+stubs[n].a*4);
1400 set_jump_target(stubs[n].addr, out);
1401 enum stub_type type = stubs[n].type;
// stub record fields: .b = address register, .c = register state, .e = live regs
1403 int rs = stubs[n].b;
1404 const struct regstat *i_regs = (void *)stubs[n].c;
1405 u_int reglist = stubs[n].e;
1406 const signed char *i_regmap = i_regs->regmap;
// coprocessor loads and LOADLR take the value in FTEMP's host register,
// other loads in the host register mapped to the instruction's rt1
1408 if(dops[i].itype==C1LS||dops[i].itype==C2LS||dops[i].itype==LOADLR) {
1409 rt=get_reg(i_regmap,FTEMP);
1411 rt=get_reg(i_regmap,dops[i].rt1);
1414 int r,temp=-1,temp2=HOST_TEMPREG,regs_saved=0;
1415 void *restore_jump = NULL, *handler_jump = NULL;
// look for a host register below HOST_CCREG that is not in reglist
1417 for (r = 0; r < HOST_CCREG; r++) {
1418 if (r != EXCLUDE_REG && ((1 << r) & reglist) == 0) {
1423 if(rt>=0&&dops[i].rt1!=0)
1430 if((regs_saved||(reglist&2)==0)&&temp!=1&&rs!=1)
// temp = mem_rtab; temp2 = table entry selected by rs>>12, then doubled by
// a flag-setting add
1432 emit_readdword(&mem_rtab,temp);
1433 emit_shrimm(rs,12,temp2);
1434 emit_readdword_dualindexedx8(temp,temp2,temp2);
1435 emit_adds64(temp2,temp2,temp2);
// direct access: load from [temp2 + rs] with the width/sign of the stub type;
// only done when the loaded value is actually wanted
1438 if(dops[i].itype==C1LS||dops[i].itype==C2LS||(rt>=0&&dops[i].rt1!=0)) {
1440 case LOADB_STUB: emit_ldrsb_dualindexed(temp2,rs,rt); break;
1441 case LOADBU_STUB: emit_ldrb_dualindexed(temp2,rs,rt); break;
1442 case LOADH_STUB: emit_ldrsh_dualindexed(temp2,rs,rt); break;
1443 case LOADHU_STUB: emit_ldrh_dualindexed(temp2,rs,rt); break;
1444 case LOADW_STUB: emit_ldr_dualindexed(temp2,rs,rt); break;
// emitted with a placeholder target (0)
1450 emit_jmp(0); // jump to reg restore
1453 emit_jmp(stubs[n].retaddr); // return address
// handler path starts here
1454 set_jump_target(handler_jump, out);
// pick the handler by access width
1459 if(type==LOADB_STUB||type==LOADBU_STUB)
1460 handler=jump_handler_read8;
1461 if(type==LOADH_STUB||type==LOADHU_STUB)
1462 handler=jump_handler_read16;
1463 if(type==LOADW_STUB)
1464 handler=jump_handler_read32;
// call setup: register 0 = address (rs), register 1 = temp2,
// register 2 = cycle count plus stubs[n].d
1466 pass_args64(rs,temp2);
1467 int cc=get_reg(i_regmap,CCREG);
1469 emit_loadreg(CCREG,2);
1470 emit_addimm(cc<0?2:cc,(int)stubs[n].d,2);
1471 emit_far_call(handler);
1472 // (no cycle reload after read)
// the value comes back in register 0; extend it into rt per stub type
1473 if(dops[i].itype==C1LS||dops[i].itype==C2LS||(rt>=0&&dops[i].rt1!=0)) {
1474 loadstore_extend(type,0,rt);
1477 set_jump_target(restore_jump, out);
1478 restore_regs(reglist);
1479 emit_jmp(stubs[n].retaddr);
// Emit an inline read for instruction i whose address `addr` is known while
// compiling.
//   type     access width / signedness (LOADx_STUB)
//   regmap   guest-to-host register mapping at this instruction
//   target   guest register looked up for both rs and rt
//   adj      cycle adjustment added to the count passed in register 2
//   reglist  registers handed to restore_regs() after the handler call
// When get_direct_memhandler() returns NULL the load is emitted directly from
// host_addr; otherwise a memory handler is called.
// NOTE(review): many short lines (declarations, braces, returns, else
// branches) are not visible in this listing.
1482 static void inline_readstub(enum stub_type type, int i, u_int addr,
1483 const signed char regmap[], int target, int adj, u_int reglist)
1485 int rs=get_reg(regmap,target);
1486 int rt=get_reg(regmap,target);
// no host register mapped to target: fall back to the temp register
1487 if(rs<0) rs=get_reg_temp(regmap);
1490 uintptr_t host_addr = 0;
1492 int cc=get_reg(regmap,CCREG);
1493 //if(pcsx_direct_read(type,addr,adj,cc,target?rs:-1,rt))
1495 handler = get_direct_memhandler(mem_rtab, addr, type, &host_addr);
// NULL handler: the address is directly accessible at host_addr
1496 if (handler == NULL) {
1497 if(rt<0||dops[i].rt1==0)
// rebase rs from the guest address to the host address when they differ
1499 if (addr != host_addr)
1500 emit_movimm_from64(addr, rs, host_addr, rs);
1502 case LOADB_STUB: emit_movsbl_indexed(0,rs,rt); break;
1503 case LOADBU_STUB: emit_movzbl_indexed(0,rs,rt); break;
1504 case LOADH_STUB: emit_movswl_indexed(0,rs,rt); break;
1505 case LOADHU_STUB: emit_movzwl_indexed(0,rs,rt); break;
1506 case LOADW_STUB: emit_readword_indexed(0,rs,rt); break;
1511 is_dynamic = pcsxmem_is_handler_dynamic(addr);
// generic handlers selected by access width
// NOTE(review): the condition guarding these assignments is not visible here
1513 if(type==LOADB_STUB||type==LOADBU_STUB)
1514 handler=jump_handler_read8;
1515 if(type==LOADH_STUB||type==LOADHU_STUB)
1516 handler=jump_handler_read16;
1517 if(type==LOADW_STUB)
1518 handler=jump_handler_read32;
1521 // call a memhandler
1522 if(rt>=0&&dops[i].rt1!=0)
// register 0 = guest address, register 2 = cycle count plus adj
1526 emit_movimm(addr,0);
1530 emit_loadreg(CCREG,2);
1531 emit_addimm(cc<0?2:cc,adj,2);
// register 1 = the mem_rtab entry for this page shifted left by one, built
// with adrp+add when within +/-4GB of the output position, else as a full
// 64-bit immediate
1533 uintptr_t l1 = ((uintptr_t *)mem_rtab)[addr>>12] << 1;
1534 intptr_t offset = (l1 & ~0xfffl) - ((intptr_t)out & ~0xfffl);
1535 if (-4294967296l <= offset && offset < 4294967296l) {
1536 emit_adrp((void *)l1, 1);
1537 emit_addimm64(1, l1 & 0xfff, 1);
1540 emit_movimm64(l1, 1);
1543 emit_far_call(do_memhandler_pre);
1545 emit_far_call(handler);
1547 // (no cycle reload after read)
// the value comes back in register 0; extend it into rt per stub type
1548 if(rt>=0&&dops[i].rt1!=0)
1549 loadstore_extend(type, 0, rt);
1550 restore_regs(reglist);
// Emit the out-of-line path for write stub number n.
// The branch recorded in stubs[n].addr is patched to land at the current
// output position. The emitted code looks the address up through mem_wtab
// and performs the store directly, or else calls one of the
// jump_handler_write* routines; both paths end by branching to
// stubs[n].retaddr.
// NOTE(review): many short lines (declarations, braces, else/case labels and
// a few statements) are not visible in this listing; the comments below
// describe only what the visible lines establish.
1553 static void do_writestub(int n)
1555 assem_debug("do_writestub %x\n",start+stubs[n].a*4);
1556 set_jump_target(stubs[n].addr, out);
1557 enum stub_type type=stubs[n].type;
1560 struct regstat *i_regs=(struct regstat *)stubs[n].c;
1561 u_int reglist=stubs[n].e;
1562 signed char *i_regmap=i_regs->regmap;
// coprocessor stores take the value from FTEMP's host register,
// other stores from the host register mapped to the instruction's rs2
1564 if(dops[i].itype==C1LS||dops[i].itype==C2LS) {
1565 rt=get_reg(i_regmap,r=FTEMP);
1567 rt=get_reg(i_regmap,r=dops[i].rs2);
1571 int rtmp,temp=-1,temp2,regs_saved=0;
1572 void *restore_jump = NULL, *handler_jump = NULL;
// reglist2 additionally counts the address and data registers as in use
1573 int reglist2=reglist|(1<<rs)|(1<<rt);
// look for a host register below HOST_CCREG that is not in reglist
1574 for (rtmp = 0; rtmp < HOST_CCREG; rtmp++) {
1575 if (rtmp != EXCLUDE_REG && ((1 << rtmp) & reglist) == 0) {
// otherwise consider registers 0..3 that hold neither the address nor the data
1583 for(rtmp=0;rtmp<=3;rtmp++)
1584 if(rtmp!=rs&&rtmp!=rt)
1587 if((regs_saved||(reglist2&8)==0)&&temp!=3&&rs!=3&&rt!=3)
1590 host_tempreg_acquire();
// temp = mem_wtab; temp2 = table entry selected by rs>>12, then doubled by
// a flag-setting add
1593 emit_readdword(&mem_wtab,temp);
1594 emit_shrimm(rs,12,temp2);
1595 emit_readdword_dualindexedx8(temp,temp2,temp2);
1596 emit_adds64(temp2,temp2,temp2);
// direct access: store rt to [temp2 + rs] with the width of the stub type
1600 case STOREB_STUB: emit_strb_dualindexed(temp2,rs,rt); break;
1601 case STOREH_STUB: emit_strh_dualindexed(temp2,rs,rt); break;
1602 case STOREW_STUB: emit_str_dualindexed(temp2,rs,rt); break;
// emitted with a placeholder target (0)
1607 emit_jmp(0); // jump to reg restore
1610 emit_jmp(stubs[n].retaddr); // return address (invcode check)
// handler path starts here
1611 set_jump_target(handler_jump, out);
1617 case STOREB_STUB: handler=jump_handler_write8; break;
1618 case STOREH_STUB: handler=jump_handler_write16; break;
1619 case STOREW_STUB: handler=jump_handler_write32; break;
// the table entry is passed in register 3
1625 emit_mov64(temp2,3);
1626 host_tempreg_release();
// register 2 = cycle count plus stubs[n].d
1628 int cc=get_reg(i_regmap,CCREG);
1630 emit_loadreg(CCREG,2);
1631 emit_addimm(cc<0?2:cc,(int)stubs[n].d,2);
1632 // returns new cycle_count
1633 emit_far_call(handler);
// undo the adjustment on the count returned in register 0
1634 emit_addimm(0,-(int)stubs[n].d,cc<0?2:cc);
1636 emit_storereg(CCREG,2);
1638 set_jump_target(restore_jump, out);
1639 restore_regs(reglist);
1640 emit_jmp(stubs[n].retaddr);
// Emit an inline write for instruction i whose address `addr` is known while
// compiling.
//   type     access width (STOREx_STUB)
//   regmap   guest-to-host register mapping at this instruction
//   target   guest register holding the value to store
//   adj      cycle adjustment applied around the handler call
//   reglist  registers handed to restore_regs() after the handler call
// When get_direct_memhandler() returns NULL the store is emitted directly to
// host_addr; otherwise a memory handler is called.
// NOTE(review): several short lines (declarations, braces, returns, register
// save) are not visible in this listing.
1643 static void inline_writestub(enum stub_type type, int i, u_int addr,
1644 const signed char regmap[], int target, int adj, u_int reglist)
// rs: temp register used for the address; rt: register with the data
1646 int rs = get_reg_temp(regmap);
1647 int rt = get_reg(regmap,target);
1650 uintptr_t host_addr = 0;
1651 void *handler = get_direct_memhandler(mem_wtab, addr, type, &host_addr);
// NULL handler: the address is directly accessible at host_addr
1652 if (handler == NULL) {
// rebase rs from the guest address to the host address when they differ
1653 if (addr != host_addr)
1654 emit_movimm_from64(addr, rs, host_addr, rs);
1656 case STOREB_STUB: emit_writebyte_indexed(rt, 0, rs); break;
1657 case STOREH_STUB: emit_writehword_indexed(rt, 0, rs); break;
1658 case STOREW_STUB: emit_writeword_indexed(rt, 0, rs); break;
1664 // call a memhandler
1666 emit_writeword(rs, &address); // some handlers still need it
// register 0 = value to store, narrowed to the access width
1667 loadstore_extend(type, rt, 0);
// register 2 = cycle count plus adj; cc_use names the register holding the
// count (register 2 when it had to be loaded)
1669 cc = cc_use = get_reg(regmap, CCREG);
1671 emit_loadreg(CCREG, (cc_use = 2));
1672 emit_addimm(cc_use, adj, 2);
1674 emit_far_call(do_memhandler_pre);
1675 emit_far_call(handler);
1676 emit_far_call(do_memhandler_post);
// undo the adjustment on the value in register 0 and put it back in cc_use
1677 emit_addimm(0, -adj, cc_use);
1679 emit_storereg(CCREG, cc_use);
1680 restore_regs(reglist);
// Emit the code that runs before a cop2 operation handler is called: store
// the registers in reglist, run the cop2 stall check for instruction i, and
// point register 0 at the cop2 data registers.
// NOTE(review): the lines around the pcnt_gte_start call are not visible in
// this listing; it may be conditionally compiled - confirm.
1685 static void c2op_prologue(u_int op, int i, const struct regstat *i_regs, u_int reglist)
// reglist is used as given here (no CALLER_SAVE_REGS masking at this level)
1687 save_load_regs_all(1, reglist);
1688 cop2_do_stall_check(op, i, i_regs, 0);
1691 emit_far_call(pcnt_gte_start);
1693 // pointer to cop2 regs
// register 0 = FP + (offset of psxRegs.CP2D.r[0] from dynarec_local)
1694 emit_addimm64(FP, (u_char *)&psxRegs.CP2D.r[0] - (u_char *)&dynarec_local, 0);
// Emit the code that runs after a cop2 operation handler returns: reload the
// registers in reglist that c2op_prologue stored.
// NOTE(review): the lines around the pcnt_gte_end call are not visible in
// this listing; it may be conditionally compiled - confirm.
1697 static void c2op_epilogue(u_int op,u_int reglist)
1701 emit_far_call(pcnt_gte_end);
1703 save_load_regs_all(0, reglist);
// Assemble the cop2 operation at instruction i: when a handler exists for
// the operation, emit prologue, a call to the handler and epilogue.
// NOTE(review): several short lines (braces, one statement after the
// HACK_ENABLED test, possibly more) are not visible in this listing.
1706 static void c2op_assemble(int i, const struct regstat *i_regs)
// operation number = low 6 bits of the instruction word
1708 u_int c2op=source[i]&0x3f;
1709 u_int hr,reglist_full=0,reglist;
1710 int need_flags,need_ir;
// collect every host register that currently has a mapping
1711 for(hr=0;hr<HOST_REGS;hr++) {
1712 if(i_regs->regmap[hr]>=0) reglist_full|=1<<hr;
// only caller-saved registers need to survive the handler call
1714 reglist=reglist_full&CALLER_SAVE_REGS;
1716 if (gte_handlers[c2op]!=NULL) {
// flags are needed unless bit 63 of the unneeded mask is set
1717 need_flags=!(gte_unneeded[i+1]>>63); // +1 because of how liveness detection works
// need_ir is set unless all of bits 9..11 (mask 0xe00) are marked unneeded
1718 need_ir=(gte_unneeded[i+1]&0xe00)!=0xe00;
1719 assem_debug("gte op %08x, unneeded %016lx, need_flags %d, need_ir %d\n",
1720 source[i],gte_unneeded[i+1],need_flags,need_ir);
1721 if(HACK_ENABLED(NDHACK_GTE_NO_FLAGS))
1723 //int shift = (source[i] >> 19) & 1;
1724 //int lm = (source[i] >> 10) & 1;
1728 c2op_prologue(c2op, i, i_regs, reglist);
// the handler finds the instruction word in psxRegs.code
1729 emit_movimm(source[i],1); // opcode
1730 emit_writeword(1,&psxRegs.code);
// the _nf handler table is used when the flags are not needed
1731 emit_far_call(need_flags?gte_handlers[c2op]:gte_handlers_nf[c2op]);
1734 c2op_epilogue(c2op,reglist);
// Emit the value fix-up described by the two commented-out C lines below,
// taking the value from sl and leaving the result in temp.
// The statement order matters: the test sets the flags, the following AND is
// emitted through the plain emit_andimm helper, and the conditional move at
// the end consumes the flags produced by the test.
// NOTE(review): the unconditional assert(0) makes every call abort after
// emitting; per its comment this path still needs testing.
1738 static void c2op_ctc2_31_assemble(signed char sl, signed char temp)
1740 //value = value & 0x7ffff000;
1741 //if (value & 0x7f87e000) value |= 0x80000000;
// the 0x7f87e000 test is done with two masks:
// 0x7fffe000 & 0xff87ffff == 0x7f87e000
1742 emit_andimm(sl, 0x7fffe000, temp);
1743 emit_testimm(temp, 0xff87ffff);
// temp = value & 0x7ffff000
1744 emit_andimm(sl, 0x7ffff000, temp);
1745 host_tempreg_acquire();
// HOST_TEMPREG = temp | 0x80000000, moved into temp when the test was nonzero
1746 emit_orimm(temp, 0x80000000, HOST_TEMPREG);
1747 emit_cmovne_reg(HOST_TEMPREG, temp);
1748 host_tempreg_release();
1749 assert(0); // testing needed
1752 static void do_mfc2_31_one(u_int copr,signed char temp)
1754 emit_readshword(®_cop2d[copr],temp);
1755 emit_bicsar_imm(temp,31,temp);
1756 emit_cmpimm(temp,0xf80);
1757 emit_csinvle_reg(temp,WZR,temp); // if (temp > 0xf80) temp = ~0;
1758 emit_andimm(temp,0xf80,temp);
// Emit code that rebuilds cop2 data register 29 from data registers 9, 10
// and 11 and leaves the result in tl.
// do_mfc2_31_one() produces each source masked to 0xf80 (bits 7..11); the
// three fields are then placed at bits 0..4 (>>7), bits 5..9 (>>2) and
// bits 10..14 (<<3), and the combined value is written to reg_cop2d[29].
// The store operand is &reg_cop2d[29]; "&reg" must stay spelled out and not
// be turned into the registered-trademark character.
// NOTE(review): the condition guarding the temp register acquisition is not
// visible in this listing; the release below is keyed on temp == HOST_TEMPREG.
1761 static void c2op_mfc2_29_assemble(signed char tl, signed char temp)
1764 host_tempreg_acquire();
1765 temp = HOST_TEMPREG;
1767 do_mfc2_31_one(9,temp);
1768 emit_shrimm(temp,7,tl);
1769 do_mfc2_31_one(10,temp);
1770 emit_orrshr_imm(temp,2,tl);
1771 do_mfc2_31_one(11,temp);
1772 emit_orrshl_imm(temp,3,tl);
1773 emit_writeword(tl,&reg_cop2d[29]);
1775 if (temp == HOST_TEMPREG)
1776 host_tempreg_release();
// Assemble the multiply/divide instruction at index i (opcode2 0x18 MULT,
// 0x1A DIV; the unsigned variants take the corresponding else branches).
// Results go to the host registers mapped to HIREG and LOREG.
// NOTE(review): many short lines (braces, case labels, else keywords and a
// few statements) are not visible in this listing; the comments below
// describe only what the visible lines establish.
1779 static void multdiv_assemble_arm64(int i, const struct regstat *i_regs)
// general case: neither source is guest register 0
1785 if(dops[i].rs1&&dops[i].rs2)
1787 switch(dops[i].opcode2)
1792 signed char m1=get_reg(i_regs->regmap,dops[i].rs1);
1793 signed char m2=get_reg(i_regs->regmap,dops[i].rs2);
1794 signed char hi=get_reg(i_regs->regmap,HIREG);
1795 signed char lo=get_reg(i_regs->regmap,LOREG);
// the 64-bit product is formed in hi's register
1801 if(dops[i].opcode2==0x18) // MULT
1802 emit_smull(m1,m2,hi);
1804 emit_umull(m1,m2,hi);
// keep only the upper 32 bits of the product in hi
1807 emit_shrimm64(hi,32,hi);
1813 signed char numerator=get_reg(i_regs->regmap,dops[i].rs1);
1814 signed char denominator=get_reg(i_regs->regmap,dops[i].rs2);
1815 signed char quotient=get_reg(i_regs->regmap,LOREG);
1816 signed char remainder=get_reg(i_regs->regmap,HIREG);
1817 assert(numerator>=0);
1818 assert(denominator>=0);
1819 assert(quotient>=0);
1820 assert(remainder>=0);
1822 if (dops[i].opcode2 == 0x1A) // DIV
1823 emit_sdiv(numerator,denominator,quotient);
1825 emit_udiv(numerator,denominator,quotient);
// remainder = numerator - quotient * denominator
1826 emit_msub(quotient,denominator,numerator,remainder);
1828 // div 0 quotient (remainder is already correct)
1829 host_tempreg_acquire();
// NOTE(review): the first argument 0 of emit_sub_asrimm - confirm whether it
// denotes host register 0 or a zero operand
1830 if (dops[i].opcode2 == 0x1A) // DIV
1831 emit_sub_asrimm(0,numerator,31,HOST_TEMPREG);
1833 emit_movimm(~0,HOST_TEMPREG);
// substitute the prepared value when the denominator is zero
1834 emit_test(denominator,denominator);
1835 emit_cmoveq_reg(HOST_TEMPREG,quotient);
1836 host_tempreg_release();
// remaining case: at least one source is guest register 0
1845 signed char hr=get_reg(i_regs->regmap,HIREG);
1846 signed char lr=get_reg(i_regs->regmap,LOREG);
1847 if ((dops[i].opcode2==0x1A || dops[i].opcode2==0x1B) && dops[i].rs2==0) // div 0
1850 signed char numerator = get_reg(i_regs->regmap, dops[i].rs1);
1851 assert(numerator >= 0);
// HI receives the numerator unchanged
1853 emit_mov(numerator,hr);
1855 if (dops[i].opcode2 == 0x1A) // DIV
1856 emit_sub_asrimm(0,numerator,31,lr);
// HI = 0 and LO = all ones, each only when the register is mapped
1862 if (hr >= 0) emit_zeroreg(hr);
1863 if (lr >= 0) emit_movimm(~0,lr);
1868 // Multiply by zero is zero.
1869 if (hr >= 0) emit_zeroreg(hr);
1870 if (lr >= 0) emit_zeroreg(lr);
1874 #define multdiv_assemble multdiv_assemble_arm64
// Emit an indirect jump to the guest address held in host register rs; the
// visible part is a call to ndrc_get_addr_ht.
// NOTE(review): the lines before and after the call (argument setup and the
// jump itself, presumably) are not visible in this listing.
1876 static void do_jump_vaddr(u_int rs)
1880 emit_far_call(ndrc_get_addr_ht);
1884 static void do_preload_rhash(u_int r) {
1885 // Don't need this for ARM. On x86, this puts the value 0xf8 into the
1886 // register. On ARM the hash can be done with a single instruction (below)
1889 static void do_preload_rhtbl(u_int ht) {
1890 emit_addimm64(FP, (u_char *)&mini_ht - (u_char *)&dynarec_local, ht);
1893 static void do_rhash(u_int rs,u_int rh) {
1894 emit_andimm(rs, 0xf8, rh);
1897 static void do_miniht_load(int ht, u_int rh) {
1898 emit_add64(ht, rh, ht);
1899 emit_ldst(0, 0, rh, ht, 0);
// Emit the tail of the mini hash table lookup. The visible part patches the
// branch recorded in jaddr to land here and loads the 64-bit value at
// [ht + 8] into ht.
// NOTE(review): the lines that set up jaddr and the lines following the load
// are not visible in this listing.
1902 static void do_miniht_jump(u_int rs, u_int rh, u_int ht) {
1908 set_jump_target(jaddr, out);
1909 assem_debug("ldr %s,[%s,#8]\n",regname64[ht], regname64[ht]);
// 64-bit ldr with scaled immediate: byte offset 8 is encoded as 8 >> 3
1910 output_w32(0xf9400000 | imm12_rn_rd(8 >> 3, ht, ht));
// Emit code that records return_address in its mini hash table slot
// (index (return_address & 0xFF) >> 3): the 32-bit address goes to element
// [0] and the 64-bit value in temp to element [1].
// NOTE(review): the statement that sets temp is not visible in this listing.
1914 // parsed by set_jump_target?
1915 static void do_miniht_insert(u_int return_address,u_int rt,int temp) {
// rt = return_address, built from its high half then its low half
1916 emit_movz_lsl16((return_address>>16)&0xffff,rt);
1917 emit_movk(return_address&0xffff,rt);
// register the current output position with the linker for return_address
1918 add_to_linker(out,return_address,1);
1920 emit_writedword(temp,&mini_ht[(return_address&0xFF)>>3][1]);
1921 emit_writeword(rt,&mini_ht[(return_address&0xFF)>>3][0]);
// Make newly written code in [start, end) visible to instruction fetch:
// data cache maintenance by line, barrier, instruction cache invalidation by
// line, barrier, then an instruction synchronization barrier.
// The line sizes come from CTR_EL0 and the smallest values seen so far are
// kept in function-static variables.
1924 static unused void clear_cache_arm64(char *start, char *end)
1926 // Don't rely on GCC's __clear_cache implementation, as it caches
1927 // icache/dcache cache line sizes, that can vary between cores on
1928 // big.LITTLE architectures.
1929 uint64_t addr, ctr_el0;
// start at 0xffff so that the first measured size always becomes the minimum
1930 static size_t icache_line_size = 0xffff, dcache_line_size = 0xffff;
1931 size_t isize, dsize;
1933 __asm__ volatile("mrs %0, ctr_el0" : "=r"(ctr_el0));
// bits [3:0] and [19:16] hold log2 of the line size in 4-byte words
1934 isize = 4 << ((ctr_el0 >> 0) & 0xf);
1935 dsize = 4 << ((ctr_el0 >> 16) & 0xf);
1937 // use the global minimum cache line size
1938 icache_line_size = isize = icache_line_size < isize ? icache_line_size : isize;
1939 dcache_line_size = dsize = dcache_line_size < dsize ? dcache_line_size : dsize;
1941 /* If CTR_EL0.IDC is enabled, Data cache clean to the Point of Unification is
1942 not required for instruction to data coherence. */
1943 if ((ctr_el0 & (1 << 28)) == 0x0) {
// round start down to a line boundary, then step one line at a time
1944 addr = (uint64_t)start & ~(uint64_t)(dsize - 1);
1945 for (; addr < (uint64_t)end; addr += dsize)
1946 // use "civac" instead of "cvau", as this is the suggested workaround for
1947 // Cortex-A53 errata 819472, 826319, 827319 and 824069.
1948 __asm__ volatile("dc civac, %0" : : "r"(addr) : "memory");
1950 __asm__ volatile("dsb ish" : : : "memory");
1952 /* If CTR_EL0.DIC is enabled, Instruction cache cleaning to the Point of
1953 Unification is not required for instruction to data coherence. */
1954 if ((ctr_el0 & (1 << 29)) == 0x0) {
1955 addr = (uint64_t)start & ~(uint64_t)(isize - 1);
1956 for (; addr < (uint64_t)end; addr += isize)
1957 __asm__ volatile("ic ivau, %0" : : "r"(addr) : "memory");
1959 __asm__ volatile("dsb ish" : : : "memory");
1962 __asm__ volatile("isb" : : : "memory");
1965 // CPU-architecture-specific initialization
// Fills every entry of ndrc->tramp.ops with the two-instruction sequence
// "ldr x17, <literal>; br x17".
// NOTE(review): the declaration of the loop index i is not visible in this
// listing.
1966 static void arch_init(void)
// byte distance from the ops array to the f array; the same pc-relative
// literal offset is used in every entry, so presumably entry i loads f[i] -
// confirm against the tramp layout
1968 uintptr_t diff = (u_char *)&ndrc->tramp.f - (u_char *)&ndrc->tramp.ops;
// write through the writable alias of the code area
1969 struct tramp_insns *ops = NDRC_WRITE_OFFSET(ndrc->tramp.ops);
// the literal offset is encoded in words, so it must be a multiple of 4
1971 assert(!(diff & 3));
1972 start_tcache_write(ops, (u_char *)ops + sizeof(ndrc->tramp.ops));
1973 for (i = 0; i < ARRAY_SIZE(ndrc->tramp.ops); i++) {
1974 ops[i].ldr = 0x58000000 | imm19_rt(diff >> 2, 17); // ldr x17, [=val]
1975 ops[i].br = 0xd61f0000 | rm_rn_rd(0, 17, 0); // br x17
1977 end_tcache_write(ops, (u_char *)ops + sizeof(ndrc->tramp.ops));
1980 // vim:shiftwidth=2:expandtab