/* Bra.c -- Branch converters for RISC code
2024-01-20 : Igor Pavlov : Public domain */
#include "Precomp.h"

#include "Bra.h"
#include "RotateDefs.h"
#include "CpuArch.h"
#if defined(MY_CPU_SIZEOF_POINTER) \
    && ( MY_CPU_SIZEOF_POINTER == 4 \
      || MY_CPU_SIZEOF_POINTER == 8)
  #define BR_CONV_USE_OPT_PC_PTR
#endif
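/* Background note: the optimized PC mode relies on (UInt32)(SizeT)p being a
   plain truncation of a flat address, which holds for the 4-byte and 8-byte
   pointer models checked above; other pointer models fall back to the
   (lim - p) based variant below. */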
#ifdef BR_CONV_USE_OPT_PC_PTR
  #define BR_PC_INIT  pc -= (UInt32)(SizeT)p;
  #define BR_PC_GET   (pc + (UInt32)(SizeT)p)
#else
  #define BR_PC_INIT  pc += (UInt32)size;
  #define BR_PC_GET   (pc - (UInt32)(SizeT)(lim - p))
  // #define BR_PC_GET (pc + (UInt32)(SizeT)(p - data))
#endif
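/* Worked identity for the two BR_PC_GET variants (illustrative):
   let start = (UInt32)(SizeT)p at entry and off = bytes processed so far.
     optimized: BR_PC_INIT gives pc' = pc - start,
                BR_PC_GET  = pc' + (start + off) = pc + off
     fallback:  BR_PC_INIT gives pc' = pc + size,
                BR_PC_GET  = pc' - (size - off)  = pc + off
   so both variants yield the virtual address of the current position. */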
#define BR_CONVERT_VAL(v, c) if (encoding) v += c; else v -= c;
// #define BR_CONVERT_VAL(v, c) if (!encoding) c = (UInt32)0 - c; v += c;
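/* BR_CONVERT_VAL turns a relative branch target into an absolute one on
   encoding (v += c) and back on decoding (v -= c). Example, assuming a
   field value 0x000234 at a position where c == 0x1000:
     encode: 0x000234 + 0x1000 = 0x001234
     decode: 0x001234 - 0x1000 = 0x000234  (round-trip is exact mod 2^32) */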
#define Z7_BRANCH_CONV(name) z7_ ## name
#define Z7_BRANCH_FUNC_MAIN(name) \
Byte *Z7_BRANCH_CONV(name)(Byte *p, SizeT size, UInt32 pc, int encoding)

#define Z7_BRANCH_FUNC_IMP(name, m, encoding) \
Byte *m(name)(Byte *data, SizeT size, UInt32 pc) \
  { return Z7_BRANCH_CONV(name)(data, size, pc, encoding); }
#ifdef Z7_EXTRACT_ONLY
#define Z7_BRANCH_FUNCS_IMP(name) \
  Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC_2, 0)
#else
#define Z7_BRANCH_FUNCS_IMP(name) \
  Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC_2, 0) \
  Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_ENC_2, 1)
#endif
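/* Z7_BRANCH_FUNCS_IMP(name) emits the public wrappers over the shared
   z7_<name> worker: only the decoder in Z7_EXTRACT_ONLY builds, otherwise
   decoder and encoder (the Z7_BRANCH_CONV_DEC_2 / Z7_BRANCH_CONV_ENC_2
   name macros come from Bra.h). Each wrapper converts whole instructions
   of [data, data + size) in place, with (pc) the virtual address of
   data[0], and returns a pointer to the unprocessed tail. */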
#if defined(__clang__)
  #define BR_EXTERNAL_FOR
  #define BR_NEXT_ITERATION  continue;
#else
  #define BR_EXTERNAL_FOR    for (;;)
  #define BR_NEXT_ITERATION  break;
#endif
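/* Loop-shape helper: with clang the scanner runs as one flat loop
   (BR_NEXT_ITERATION == continue); other compilers get an outer for (;;),
   so BR_NEXT_ITERATION can break out of the inner loop and re-enter it,
   which tends to produce better code there. */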
#if defined(__clang__) && (__clang_major__ >= 8) \
  || defined(__GNUC__) && (__GNUC__ >= 1000) \
    // GCC is not good for __builtin_expect() here
    /* || defined(_MSC_VER) && (_MSC_VER >= 1920) */
  // #define Z7_unlikely [[unlikely]]
  // #define Z7_LIKELY(x) (__builtin_expect((x), 1))
  #define Z7_UNLIKELY(x) (__builtin_expect((x), 0))
  // #define Z7_likely [[likely]]
#else
  // #define Z7_LIKELY(x) (x)
  #define Z7_UNLIKELY(x) (x)
#endif
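/* BranchConv_ARM64 handles two instruction forms (reference notes):
     BL  : bits 31..26 = 100101 (the 0x94000000 family), imm26 = signed
           word offset; converted with the byte address >> 2.
     ADRP: the 0x90000000 family, immlo at bits 30..29, immhi at
           bits 23..5; the target is a signed 4 KiB page offset,
           converted with the byte address >> 12. The flag/mask
           arithmetic accepts only offsets within +-(1 << 17) pages
           (+-512 MiB); out-of-range ADRPs are left untouched. */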
Z7_BRANCH_FUNC_MAIN(BranchConv_ARM64)
  const UInt32 flag = (UInt32)1 << (24 - 4);
  const UInt32 mask = ((UInt32)1 << 24) - (flag << 1);
  // if (size == 0) return p;
  pc -= 4;  // because (p) will point to next instruction
  // Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
      if Z7_UNLIKELY(p == lim)
      if Z7_UNLIKELY(((v - 0x94000000) & 0xfc000000) == 0)
        UInt32 c = BR_PC_GET >> 2;
      // v = rotlFixed(v, 8); v += (flag << 8) - 0x90; if Z7_UNLIKELY((v & ((mask << 8) + 0x9f)) == 0)
      v -= 0x90000000; if Z7_UNLIKELY((v & 0x9f000000) == 0)
        // v = rotrFixed(v, 8);
        v += flag; if Z7_UNLIKELY(v & mask) continue;
        z = (v & 0xffffffe0) | (v >> 26);
        c = (BR_PC_GET >> (12 - 3)) & ~(UInt32)7;
        v |= 0x00ffffe0 & ((z & (((flag << 1) - 1))) - flag);

Z7_BRANCH_FUNCS_IMP(BranchConv_ARM64)
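/* BranchConv_ARM: an ARM BL instruction is 0xEB in the most significant
   byte with a signed 24-bit word offset below it, so the scanner only
   checks p[-1] == 0xeb at word-aligned positions (the check is unrolled
   twice per iteration) and converts the offset with the byte address >> 2. */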
Z7_BRANCH_FUNC_MAIN(BranchConv_ARM)
  /* in ARM: the branch offset is relative to the +2 instructions from the
     current instruction; (p) will point to the next instruction */
      if Z7_UNLIKELY(p >= lim) { return p; }  p += 4;  if Z7_UNLIKELY(p[-1] == 0xeb) break;
      if Z7_UNLIKELY(p >= lim) { return p; }  p += 4;  if Z7_UNLIKELY(p[-1] == 0xeb) break;
      UInt32 v = GetUi32a(p - 4);
      UInt32 c = BR_PC_GET >> 2;

Z7_BRANCH_FUNCS_IMP(BranchConv_ARM)
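/* BranchConv_PPC: the I-form "bl" is 0x48000001 under mask 0xfc000003
   (primary opcode 18, AA = 0, LK = 1) with a signed 24-bit word offset in
   bits 25..2. The word is loaded in native order and matched against
   compile-time converted constants, so the common non-matching path needs
   no byte swap; the offset is converted with the plain byte address, since
   adding a multiple of 4 cannot change the low AA/LK bits. */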
Z7_BRANCH_FUNC_MAIN(BranchConv_PPC)
  pc -= 4;  // because (p) will point to next instruction
      if Z7_UNLIKELY(p == lim)
      v = *(UInt32 *)(void *)p;
      // if ((v & 0xfc000003) == 0x48000001) break;
      // if ((p[-4] & 0xFC) == 0x48 && (p[-1] & 3) == 1) break;
          ((v - Z7_CONV_BE_TO_NATIVE_CONST32(0x48000001))
            & Z7_CONV_BE_TO_NATIVE_CONST32(0xfc000003)) == 0) break;
      v = Z7_CONV_NATIVE_TO_BE_32(v);
      UInt32 c = BR_PC_GET;

Z7_BRANCH_FUNCS_IMP(BranchConv_PPC)
#ifdef Z7_CPU_FAST_ROTATE_SUPPORTED
  #define BR_SPARC_USE_ROTATE
#endif
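/* BranchConv_SPARC: a CALL instruction is 01 in bits 31..30 followed by a
   signed 30-bit word offset. Only calls whose high offset bits are all
   zeros or all ones (the 0x40 00 xx xx / 0x7f fx xx xx patterns) are
   converted; the rotate variant folds the <<2 / >>2 offset shifts into
   rotations when the cpu supports fast rotates. */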
Z7_BRANCH_FUNC_MAIN(BranchConv_SPARC)
  const UInt32 flag = (UInt32)1 << 22;
  pc -= 4;  // because (p) will point to next instruction
      if Z7_UNLIKELY(p == lim)
      /* // the code without GetBe32a():
      { const UInt32 v = GetUi16a(p) & 0xc0ff; p += 4; if (v == 0x40 || v == 0xc07f) break; }
      */
      #ifdef BR_SPARC_USE_ROTATE
        v += (flag << 2) - 1;
        if Z7_UNLIKELY((v & (3 - (flag << 3))) == 0)
      #else
        v += (UInt32)5 << 29;
        v ^= (UInt32)7 << 29;
        if Z7_UNLIKELY((v & (0 - (flag << 1))) == 0)
      #endif
      // UInt32 v = GetBe32a(p - 4);
      #ifndef BR_SPARC_USE_ROTATE
      UInt32 c = BR_PC_GET;
      v &= (flag << 3) - 1;
      #ifdef BR_SPARC_USE_ROTATE
        v -= (flag << 2) - 1;
      v |= (UInt32)1 << 30;

Z7_BRANCH_FUNCS_IMP(BranchConv_SPARC)
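/* BranchConv_ARMT: a 32-bit Thumb BL is two 16-bit halfwords,
   0xF000 | imm_hi(11 bits) followed by 0xF800 | imm_lo(11 bits), encoding
   a signed 22-bit halfword offset. The scanner keys on the 0xF0/0xF8 top
   bytes (the (b & (b' ^ 8)) >= 0xf8 test below matches exactly that pair),
   rebuilds the offset, converts it with the byte address >> 1, and stores
   both halfwords back. */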
Z7_BRANCH_FUNC_MAIN(BranchConv_ARMT)
  // if (size == 0) return p;
  if (size <= 2) return p;
  /* in ARM: the branch offset is relative to the +2 instructions from the
     current instruction; (p) will point to the +2 instructions from the
     current instruction */
  // if (encoding) pc -= 0xf800 << 1; else pc += 0xf800 << 1;
  // #define ARMT_TAIL_PROC { goto armt_tail; }
  #define ARMT_TAIL_PROC { return p; }
    /* in MSVC 32-bit x86 compilers:
       UInt32 version : it loads value from memory with movzx
       Byte   version : it loads value to an 8-bit register (AL/CL)
       movzx version is slightly faster in some cpus */
      // optimized version to reduce one (p >= lim) check:
      // unsigned a1 = p[1]; b1 = p[3]; p += 2; if Z7_LIKELY((b1 & (a1 ^ 8)) < 0xf8)
      unsigned b3; // Byte / UInt32
      /* (Byte)(b3) normalization can use low-byte computations in MSVC.
         It gives smaller code, and no loss of speed in some compilers/cpus.
         But new MSVC 32-bit x86 compilers use a slower load from memory
         to a low-byte register in that case.
         So we use full 32-bit computations for faster code. */
      // if (p >= lim) { ARMT_TAIL_PROC } b3 = b1 + 8; b1 = p[3]; p += 2; if ((b3 & b1) >= 0xf8) break;
      if Z7_UNLIKELY(p >= lim) { ARMT_TAIL_PROC }  b3 = p[3];  p += 2;  if Z7_UNLIKELY((b3 & (b1 ^ 8)) >= 0xf8) break;
      if Z7_UNLIKELY(p >= lim) { ARMT_TAIL_PROC }  b1 = p[3];  p += 2;  if Z7_UNLIKELY((b1 & (b3 ^ 8)) >= 0xf8) break;
    /* we can adjust pc for (0xf800) to get rid of the (& 0x7FF) operation.
       But gcc/clang for arm64 can use the bfi instruction for the full code here */
          ((UInt32)GetUi16a(p - 2) << 11) |
          ((UInt32)GetUi16a(p) & 0x7FF);
          ((UInt32)p[1 - 2] << 19)
        + (((UInt32)p[1] & 0x7) << 8)
        + (((UInt32)p[-2] << 11))
      UInt32 c = BR_PC_GET >> 1;
      SetUi16a(p - 4, (UInt16)(((v >> 11) & 0x7ff) | 0xf000))
      SetUi16a(p - 2, (UInt16)(v | 0xf800))
      p[-4] = (Byte)(v >> 11);
      p[-3] = (Byte)(0xf0 | ((v >> 19) & 0x7));
      p[-1] = (Byte)(0xf8 | (v >> 8));
  // if ((Byte)((lim[1] & 0xf8)) != 0xf0) { lim += 2; } return lim;
  // return (Byte *)(lim + ((Byte)((lim[1] ^ 0xf0) & 0xf8) == 0 ? 0 : 2));
  // return (Byte *)(lim + (((lim[1] ^ ~0xfu) & ~7u) == 0 ? 0 : 2));
  // return (Byte *)(lim + 2 - (((((unsigned)lim[1] ^ 8) + 8) >> 7) & 2));

Z7_BRANCH_FUNCS_IMP(BranchConv_ARMT)
// #define BR_IA64_NO_INLINE
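/* BranchConv_IA64: IA-64 code is organized in 16-byte bundles: a 5-bit
   template plus three 41-bit slots. The constant 0x334b0000, shifted
   right according to the bundle's template bits, packs a small
   per-template table of which slots may hold a branch. For a matching
   slot the 21-bit IP-relative offset (counted in 16-byte bundles) is
   extracted, converted using the instruction address in 16-byte units,
   and written back into the slot. */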
Z7_BRANCH_FUNC_MAIN(BranchConv_IA64)
      if Z7_UNLIKELY(p == lim)
      m = (unsigned)((UInt32)0x334b0000 >> (*p & 0x1e));
      p += (ptrdiff_t)m * 5 - 20;  // negative value is expected here
      #if defined(MY_CPU_X86_OR_AMD64)
      // we use a 32-bit load here to reduce code size on x86:
      UInt32 z = GetUi32(p + 1) >> m;
      if (((t >> m) & (0x70 << 1)) == 0
          && ((z - (0x5000000 << 1)) & (0xf000000 << 1)) == 0)
        UInt32 v = (UInt32)((0x8fffff << 1) | 1) & z;
        #ifdef BR_IA64_NO_INLINE
        v |= (v & ((UInt32)1 << (23 + 1))) >> 3;
        v &= (0x1fffff << 1) | 1;
        // pc &= ~(0xc00000 << 1); // we just need to clear at least 2 bits
        pc &= (0x1fffff << 1) | 1;
        // pc |= 0xc00000 << 1; // we need to set at least 2 bits
        pc |= ~(UInt32)((0x1fffff << 1) | 1);
        v &= ~(UInt32)(0x600000 << 1);
        v += (0x700000 << 1);
        v &= (0x8fffff << 1) | 1;
        SetUi32(p + 1 - 5, z)
    while (m &= 3);  // while (m < 4);

Z7_BRANCH_FUNCS_IMP(BranchConv_IA64)
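/* RISC-V converter: unlike the filters above, encode and decode are
   separate functions built on BR_CONVERT_VAL_ENC / BR_CONVERT_VAL_DEC.
   Two shapes are processed: direct JAL (opcode 0x6f) jumps, and
   AUIPC (opcode 0x17) pairs, where a 20-bit upper part and a 12-bit low
   part of an address are split across two instructions; a matched pair is
   rewritten into a packed intermediate form so both halves convert
   consistently. The scan step is 2 bytes, because compressed (RVC) code
   only guarantees 16-bit instruction alignment. */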
#define BR_CONVERT_VAL_ENC(v)  v += BR_PC_GET;
#define BR_CONVERT_VAL_DEC(v)  v -= BR_PC_GET;
#if 1 && defined(MY_CPU_LE_UNALIGN)
  #define RISCV_USE_UNALIGNED_LOAD
#endif

#ifdef RISCV_USE_UNALIGNED_LOAD
  #define RISCV_GET_UI32(p)     GetUi32(p)
  #define RISCV_SET_UI32(p, v)  { SetUi32(p, v) }
#else
  #define RISCV_GET_UI32(p) \
    ((UInt32)GetUi16a(p) + \
     ((UInt32)GetUi16a((p) + 2) << 16))
  #define RISCV_SET_UI32(p, v) { \
    SetUi16a(p, (UInt16)(v)) \
    SetUi16a((p) + 2, (UInt16)(v >> 16)) }
#endif
#if 1 && defined(MY_CPU_LE)
  #define RISCV_USE_16BIT_LOAD
#endif

#ifdef RISCV_USE_16BIT_LOAD
  #define RISCV_LOAD_VAL(p)  GetUi16a(p)
#else
  #define RISCV_LOAD_VAL(p)  (*(p))
#endif
#define RISCV_INSTR_SIZE  2
#define RISCV_STEP_1      (4 + RISCV_INSTR_SIZE)
#define RISCV_STEP_2      4
#define RISCV_REG_VAL     (2 << 7)
#define RISCV_CMD_VAL     3
#if 1
// for code size optimization:
#define RISCV_DELTA_7F  0x7f
#else
#define RISCV_DELTA_7F  0
#endif
#define RISCV_CHECK_1(v, b) \
  (((((b) - RISCV_CMD_VAL) ^ ((v) << 8)) & (0xf8000 + RISCV_CMD_VAL)) == 0)
#define RISCV_CHECK_2(v, r) \
  ((((v) - ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL | 8)) \

// this branch gives larger code, because
// compilers generate larger code for big constants.
#define RISCV_CHECK_2(v, r) \
  ((((v) - ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL)) \
    & ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL)) \
#define RISCV_SCAN_LOOP \
  size &= ~(SizeT)(RISCV_INSTR_SIZE - 1); \
  if (size <= 6) return p; \
  /* Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE */ \
    if Z7_UNLIKELY(p >= lim) { return p; } \
    a = (RISCV_LOAD_VAL(p) ^ 0x10u) + 1; \
    if ((a & 0x77) == 0) break; \
    a = (RISCV_LOAD_VAL(p + RISCV_INSTR_SIZE) ^ 0x10u) + 1; \
    p += RISCV_INSTR_SIZE * 2; \
    if ((a & 0x77) == 0) \
      p -= RISCV_INSTR_SIZE; \
      if Z7_UNLIKELY(p >= lim) { return p; }
// (xx6f ^ 10) + 1 = xx7f + 1 = xx80        : JAL
// (xxef ^ 10) + 1 = xxff + 1 = xx00 + 100  : JAL
// (xx17 ^ 10) + 1 = xx07 + 1 = xx08        : AUIPC
// (xx97 ^ 10) + 1 = xx87 + 1 = xx88        : AUIPC
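// J-type immediate layout (RISC-V spec) that the encoder below unscrambles:
//   instruction bits :  31     30..21    20     19..12
//   offset bits      :  [20]   [10:1]    [11]   [19:12]
// e.g. ((a & 1u << 31) >> 11) moves offset bit 20 from instruction bit 31
// down to bit 20 of the linear value v.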
Byte * Z7_BRANCH_CONV_ENC(RISCV)(Byte *p, SizeT size, UInt32 pc)
    a = RISCV_GET_UI32(p);
    #ifndef RISCV_USE_16BIT_LOAD
    v += (UInt32)p[1] << 8;
    if ((v & 8) == 0)  // JAL
      if ((v - (0x100 /* - RISCV_DELTA_7F */)) & 0xd80)
        p += RISCV_INSTR_SIZE;
      v = ((a & 1u << 31) >> 11)
        | ((a & 0x3ff << 21) >> 20)
        | ((a & 1 << 20) >> 9)
      BR_CONVERT_VAL_ENC(v)
      // v: bits [1 : 20] contain offset bits
      #if 0 && defined(RISCV_USE_UNALIGNED_LOAD)
      a |= ((UInt32)(v << 23))
        | ((UInt32)(v << 7) & ((UInt32)0xff << 16))
        | ((UInt32)(v >> 5) & ((UInt32)0xf0 << 8));
      SetUi16a(p, (UInt16)(((v >> 5) & 0xf000) | (a & 0xfff)))
      p[1] = (Byte)(((v >> 13) & 0xf0) | ((a >> 8) & 0xf));
      #if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE)
      SetUi16a(p + 2, (UInt16)v)
      p[2] = (Byte)(v >> 9);
      p[3] = (Byte)(v >> 1);
    if (v & 0xe80)  // (not x0) and (not x2)
      const UInt32 b = RISCV_GET_UI32(p + 4);
      if (RISCV_CHECK_1(v, b))
        const UInt32 temp = (b << 12) | (0x17 + RISCV_REG_VAL);
        RISCV_SET_UI32(p, temp)
        #if 0
        const int t = -1 >> 1;
        a += (b >> 20) - ((b >> 19) & 0x1000);  // arithmetic right shift emulation
        #else
        a += (UInt32)((Int32)b >> 20);  // arithmetic right shift (sign-extension)
        #endif
        BR_CONVERT_VAL_ENC(a)
        #if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE)
        RISCV_SET_UI32(p + 4, a)
      if (RISCV_CHECK_2(v, r))
        v = RISCV_GET_UI32(p + 4);
        r = (r << 7) + 0x17 + (v & 0xfffff000);
        a = (a >> 12) | (v << 20);
        RISCV_SET_UI32(p + 4, a)
Byte * Z7_BRANCH_CONV_DEC(RISCV)(Byte *p, SizeT size, UInt32 pc)
  #ifdef RISCV_USE_16BIT_LOAD
    a += (UInt32)p[1] << 8;
    a -= 0x100 - RISCV_DELTA_7F;
    p += RISCV_INSTR_SIZE;
      const UInt32 a_old = (a + (0xef - RISCV_DELTA_7F)) & 0xfff;
      v =   (UInt32)(a >> 23) & ((UInt32)0xff << 1)
          | (UInt32)(a >> 7) & ((UInt32)0xff << 9)
      #elif 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE)
      v = Z7_BSWAP32(v) >> 15
      v =   (UInt32)p[3] << 1
          | (UInt32)((a & 0xf000) << 5);
      BR_CONVERT_VAL_DEC(v)
        | (v << 11 & 1u << 31)
        | (v << 20 & 0x3ff << 21)
      #if 1 && defined(RISCV_USE_UNALIGNED_LOAD)
      a |= (UInt32)GetUi16a(p + 2) << 16;
    if ((v & 0xe80) == 0)  // x0/x2
      const UInt32 r = a >> 27;
      if (RISCV_CHECK_2(v, r))
        #if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE)
        b = RISCV_GET_UI32(p + 4);
        BR_CONVERT_VAL_DEC(b)
        a += (b + 0x800) & 0xfffff000;
        RISCV_SET_UI32(p + 4, v)
      const UInt32 b = RISCV_GET_UI32(p + 4);
      if (!RISCV_CHECK_1(v, b))
      v = (a & 0xfffff000) | (b >> 20);
      a = (b << 12) | (0x17 + RISCV_REG_VAL);
      RISCV_SET_UI32(p + 4, v)
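/* The decoder mirrors the encoder: it re-detects the same JAL and packed
   AUIPC-pair shapes (RISCV_CHECK_1 / RISCV_CHECK_2), applies
   BR_CONVERT_VAL_DEC instead of BR_CONVERT_VAL_ENC, and unpacks the pair
   back into the original instruction bytes, so decoding an encoded stream
   restores it exactly. */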