/* CpuArch.h -- CPU specific code
2024-05-13 : Igor Pavlov : Public domain */

#ifndef ZIP7_INC_CPU_ARCH_H
#define ZIP7_INC_CPU_ARCH_H

#include "7zTypes.h"

EXTERN_C_BEGIN

/*
MY_CPU_LE means that CPU is LITTLE ENDIAN.
MY_CPU_BE means that CPU is BIG ENDIAN.
If MY_CPU_LE and MY_CPU_BE are not defined, the ENDIANNESS of the platform is unknown.

MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned memory accesses.

MY_CPU_64BIT means that the processor can work with 64-bit registers.
  MY_CPU_64BIT can be used to select a fast code branch.
  MY_CPU_64BIT doesn't mean that (sizeof(void *) == 8).
*/
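
/*
Usage sketch (illustrative, not part of the original header): these macros
are intended for compile-time branching, e.g.

  #ifdef MY_CPU_64BIT
    // fast branch: process 8 bytes per step in a UInt64 register
  #else
    // portable branch: process 4 bytes per step
  #endif
*/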

#if !defined(_M_ARM64EC)
#if  defined(_M_X64) \
  || defined(_M_AMD64) \
  || defined(__x86_64__) \
  || defined(__AMD64__) \
  || defined(__amd64__)
  #define MY_CPU_AMD64
  #ifdef __ILP32__
    #define MY_CPU_NAME "x32"
    #define MY_CPU_SIZEOF_POINTER 4
  #else
    #define MY_CPU_NAME "x64"
    #define MY_CPU_SIZEOF_POINTER 8
  #endif
  #define MY_CPU_64BIT
#endif
#endif

#if  defined(_M_IX86) \
  || defined(__i386__)
  #define MY_CPU_X86
  #define MY_CPU_NAME "x86"
  /* #define MY_CPU_32BIT */
  #define MY_CPU_SIZEOF_POINTER 4
#endif

#if  defined(_M_ARM64) \
  || defined(_M_ARM64EC) \
  || defined(__AARCH64EL__) \
  || defined(__AARCH64EB__) \
  || defined(__aarch64__)
  #define MY_CPU_ARM64

  #if   defined(__ILP32__) \
     || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
    #define MY_CPU_NAME "arm64-32"
    #define MY_CPU_SIZEOF_POINTER 4
  #elif defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16)
    #define MY_CPU_NAME "arm64-128"
    #define MY_CPU_SIZEOF_POINTER 16
  #else
    #if defined(_M_ARM64EC)
      #define MY_CPU_NAME "arm64ec"
    #else
      #define MY_CPU_NAME "arm64"
    #endif
    #define MY_CPU_SIZEOF_POINTER 8
  #endif
  #define MY_CPU_64BIT
#endif

#if  defined(_M_ARM) \
  || defined(_M_ARM_NT) \
  || defined(_M_ARMT) \
  || defined(__arm__) \
  || defined(__thumb__) \
  || defined(__ARMEL__) \
  || defined(__ARMEB__) \
  || defined(__THUMBEL__) \
  || defined(__THUMBEB__)
  #define MY_CPU_ARM

  #if defined(__thumb__) || defined(__THUMBEL__) || defined(_M_ARMT)
    #define MY_CPU_NAME "armt"
  #else
    #define MY_CPU_NAME "arm"
  #endif
  /* #define MY_CPU_32BIT */
  #define MY_CPU_SIZEOF_POINTER 4
#endif

#if  defined(_M_IA64) \
  || defined(__ia64__)
  #define MY_CPU_IA64
  #define MY_CPU_NAME "ia64"
  #define MY_CPU_64BIT
#endif

#if  defined(__mips64) \
  || defined(__mips64__) \
  || (defined(__mips) && (__mips == 64 || __mips == 4 || __mips == 3))
  #define MY_CPU_NAME "mips64"
  #define MY_CPU_64BIT
#elif defined(__mips__)
  #define MY_CPU_NAME "mips"
  /* #define MY_CPU_32BIT */
#endif

#if  defined(__ppc64__) \
  || defined(__powerpc64__) \
  || defined(__ppc__) \
  || defined(__powerpc__) \
  || defined(__PPC__) \
  || defined(_POWER)

#define MY_CPU_PPC_OR_PPC64

#if  defined(__ppc64__) \
  || defined(__powerpc64__) \
  || defined(_LP64) \
  || defined(__64BIT__)
  #ifdef __ILP32__
    #define MY_CPU_NAME "ppc64-32"
    #define MY_CPU_SIZEOF_POINTER 4
  #else
    #define MY_CPU_NAME "ppc64"
    #define MY_CPU_SIZEOF_POINTER 8
  #endif
  #define MY_CPU_64BIT
#else
  #define MY_CPU_NAME "ppc"
  #define MY_CPU_SIZEOF_POINTER 4
  /* #define MY_CPU_32BIT */
#endif
#endif

#if  defined(__sparc__) \
  || defined(__sparc)
  #define MY_CPU_SPARC
  #if  defined(__LP64__) \
    || defined(_LP64) \
    || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8)
    #define MY_CPU_NAME "sparcv9"
    #define MY_CPU_SIZEOF_POINTER 8
    #define MY_CPU_64BIT
  #elif defined(__sparc_v9__) \
     || defined(__sparcv9)
    #define MY_CPU_64BIT
    #if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
      #define MY_CPU_NAME "sparcv9-32"
    #else
      #define MY_CPU_NAME "sparcv9m"
    #endif
  #elif defined(__sparc_v8__) \
     || defined(__sparcv8)
    #define MY_CPU_NAME "sparcv8"
    #define MY_CPU_SIZEOF_POINTER 4
  #else
    #define MY_CPU_NAME "sparc"
  #endif
#endif

#if  defined(__riscv) \
  || defined(__riscv__)
  #define MY_CPU_RISCV
  #if __riscv_xlen == 32
    #define MY_CPU_NAME "riscv32"
  #elif __riscv_xlen == 64
    #define MY_CPU_NAME "riscv64"
  #else
    #define MY_CPU_NAME "riscv"
  #endif
#endif

#if defined(__loongarch__)
  #define MY_CPU_LOONGARCH
  #if defined(__loongarch64) || defined(__loongarch_grlen) && (__loongarch_grlen == 64)
    #define MY_CPU_64BIT
  #endif
  #if defined(__loongarch64)
    #define MY_CPU_NAME "loongarch64"
    #define MY_CPU_LOONGARCH64
  #else
    #define MY_CPU_NAME "loongarch"
  #endif
#endif

// #undef MY_CPU_NAME
// #undef MY_CPU_SIZEOF_POINTER
// #define __SIZEOF_POINTER__ 4
#if defined(__e2k__)
  #define MY_CPU_E2K
  #if defined(__ILP32__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
    #define MY_CPU_NAME "e2k-32"
    #define MY_CPU_SIZEOF_POINTER 4
  #else
    #define MY_CPU_NAME "e2k"
    #if defined(__LP64__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8)
      #define MY_CPU_SIZEOF_POINTER 8
    #endif
  #endif
  #define MY_CPU_64BIT
#endif

#if defined(MY_CPU_X86) || defined(MY_CPU_AMD64)
#define MY_CPU_X86_OR_AMD64
#endif

#if defined(MY_CPU_ARM) || defined(MY_CPU_ARM64)
#define MY_CPU_ARM_OR_ARM64
#endif

#ifdef _WIN32

  #ifdef MY_CPU_ARM
  #define MY_CPU_ARM_LE
  #endif

  #ifdef MY_CPU_ARM64
  #define MY_CPU_ARM64_LE
  #endif

  #ifdef _M_IA64
  #define MY_CPU_IA64_LE
  #endif

#endif

#if defined(MY_CPU_X86_OR_AMD64) \
    || defined(MY_CPU_ARM_LE) \
    || defined(MY_CPU_ARM64_LE) \
    || defined(MY_CPU_IA64_LE) \
    || defined(_LITTLE_ENDIAN) \
    || defined(__LITTLE_ENDIAN__) \
    || defined(__ARMEL__) \
    || defined(__THUMBEL__) \
    || defined(__AARCH64EL__) \
    || defined(__MIPSEL__) \
    || defined(__MIPSEL) \
    || defined(_MIPSEL) \
    || defined(__BFIN__) \
    || (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
  #define MY_CPU_LE
#endif

#if defined(__BIG_ENDIAN__) \
    || defined(__ARMEB__) \
    || defined(__THUMBEB__) \
    || defined(__AARCH64EB__) \
    || defined(__MIPSEB__) \
    || defined(__MIPSEB) \
    || defined(_MIPSEB) \
    || defined(__m68k__) \
    || defined(__s390__) \
    || defined(__s390x__) \
    || defined(__zarch__) \
    || (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__))
  #define MY_CPU_BE
#endif

#if defined(MY_CPU_LE) && defined(MY_CPU_BE)
  #error Stop_Compiling_Bad_Endian
#endif

#if !defined(MY_CPU_LE) && !defined(MY_CPU_BE)
  #error Stop_Compiling_CPU_ENDIAN_must_be_detected_at_compile_time
#endif

#if defined(MY_CPU_32BIT) && defined(MY_CPU_64BIT)
  #error Stop_Compiling_Bad_32_64_BIT
#endif

#ifdef __SIZEOF_POINTER__
  #ifdef MY_CPU_SIZEOF_POINTER
    #if MY_CPU_SIZEOF_POINTER != __SIZEOF_POINTER__
      #error Stop_Compiling_Bad_MY_CPU_PTR_SIZE
    #endif
  #else
    #define MY_CPU_SIZEOF_POINTER  __SIZEOF_POINTER__
  #endif
#endif

#if defined(MY_CPU_SIZEOF_POINTER) && (MY_CPU_SIZEOF_POINTER == 4)
#if defined(_LP64)
      #error Stop_Compiling_Bad_MY_CPU_PTR_SIZE
#endif
#endif

#ifdef _MSC_VER
  #if _MSC_VER >= 1300
    #define MY_CPU_pragma_pack_push_1   __pragma(pack(push, 1))
    #define MY_CPU_pragma_pop           __pragma(pack(pop))
  #else
    #define MY_CPU_pragma_pack_push_1
    #define MY_CPU_pragma_pop
  #endif
#else
  #ifdef __xlC__
    #define MY_CPU_pragma_pack_push_1   _Pragma("pack(1)")
    #define MY_CPU_pragma_pop           _Pragma("pack()")
  #else
    #define MY_CPU_pragma_pack_push_1   _Pragma("pack(push, 1)")
    #define MY_CPU_pragma_pop           _Pragma("pack(pop)")
  #endif
#endif
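
/*
Usage sketch (illustrative; the structure name is hypothetical): the pack
macros bracket declarations that must contain no padding:

  MY_CPU_pragma_pack_push_1
  struct CHypotheticalRecord { Byte flags; UInt32 size; };
  MY_CPU_pragma_pop
*/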

#ifndef MY_CPU_NAME
  // #define MY_CPU_IS_UNKNOWN
  #ifdef MY_CPU_LE
    #define MY_CPU_NAME "LE"
  #elif defined(MY_CPU_BE)
    #define MY_CPU_NAME "BE"
  #else
    #define MY_CPU_NAME ""
  #endif
#endif

#ifdef __has_builtin
  #define Z7_has_builtin(x) __has_builtin(x)
#else
  #define Z7_has_builtin(x) 0
#endif

#define Z7_BSWAP32_CONST(v) \
  ( (((UInt32)(v) << 24)                   ) \
  | (((UInt32)(v) <<  8) & (UInt32)0xff0000) \
  | (((UInt32)(v) >>  8) & (UInt32)0xff00  ) \
  | (((UInt32)(v) >> 24)                   ))
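
/*
Unlike the compiler intrinsics selected below, Z7_BSWAP32_CONST is built only
from shifts and masks, so it can appear in constant expressions.
Illustrative example (the constant value is arbitrary):

  static const UInt32 k = Z7_BSWAP32_CONST(0x12345678);  // k == 0x78563412
*/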

#if defined(_MSC_VER) && (_MSC_VER >= 1300)

#include <stdlib.h> /* for _byteswap_* intrinsic declarations */

/* Note: these macros use the bswap instruction (introduced with the 80486), which is unsupported on the 80386 */

#pragma intrinsic(_byteswap_ushort)
#pragma intrinsic(_byteswap_ulong)
#pragma intrinsic(_byteswap_uint64)

#define Z7_BSWAP16(v) _byteswap_ushort(v)
#define Z7_BSWAP32(v) _byteswap_ulong (v)
#define Z7_BSWAP64(v) _byteswap_uint64(v)
#define Z7_CPU_FAST_BSWAP_SUPPORTED

/* GCC can generate slow code that calls a function for __builtin_bswap32() for:
    - GCC for RISC-V, if the Zbb extension is not used.
    - GCC for SPARC.
   The code from CLANG for SPARC also is not the fastest.
   So we don't define Z7_CPU_FAST_BSWAP_SUPPORTED in those cases.
*/
#elif (!defined(MY_CPU_RISCV) || defined(__riscv_zbb)) \
   && !defined(MY_CPU_SPARC) \
   && ( \
        (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \
     || (defined(__clang__) && Z7_has_builtin(__builtin_bswap16)) \
      )

#define Z7_BSWAP16(v) __builtin_bswap16(v)
#define Z7_BSWAP32(v) __builtin_bswap32(v)
#define Z7_BSWAP64(v) __builtin_bswap64(v)
#define Z7_CPU_FAST_BSWAP_SUPPORTED

#else

#define Z7_BSWAP16(v) ((UInt16) \
  ( ((UInt32)(v) << 8) \
  | ((UInt32)(v) >> 8) \
  ))

#define Z7_BSWAP32(v) Z7_BSWAP32_CONST(v)

#define Z7_BSWAP64(v) \
  ( ( ( (UInt64)(v)                           ) << 8 * 7 ) \
  | ( ( (UInt64)(v) & ((UInt32)0xff << 8 * 1) ) << 8 * 5 ) \
  | ( ( (UInt64)(v) & ((UInt32)0xff << 8 * 2) ) << 8 * 3 ) \
  | ( ( (UInt64)(v) & ((UInt32)0xff << 8 * 3) ) << 8 * 1 ) \
  | ( ( (UInt64)(v) >> 8 * 1 ) & ((UInt32)0xff << 8 * 3) ) \
  | ( ( (UInt64)(v) >> 8 * 3 ) & ((UInt32)0xff << 8 * 2) ) \
  | ( ( (UInt64)(v) >> 8 * 5 ) & ((UInt32)0xff << 8 * 1) ) \
  | ( ( (UInt64)(v) >> 8 * 7 ) ) \
  )

#endif
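
/*
Sanity check (illustrative): whichever branch above was selected,
  Z7_BSWAP16(0x1234)     == 0x3412
  Z7_BSWAP32(0x12345678) == 0x78563412
*/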

#if defined(MY_CPU_X86_OR_AMD64) \
    || defined(MY_CPU_ARM64) \
    || defined(MY_CPU_RISCV) && defined(__riscv_misaligned_fast) \
    || defined(MY_CPU_E2K) && defined(__iset__) && (__iset__ >= 6)
  #define MY_CPU_LE_UNALIGN
  #define MY_CPU_LE_UNALIGN_64
#elif defined(__ARM_FEATURE_UNALIGNED)
/* === ALIGNMENT on 32-bit arm and LDRD/STRD/LDM/STM instructions.
   Description of problems:

   problem-1 : 32-bit ARM architecture:
     multi-access (pair of 32-bit accesses) instructions (LDRD/STRD/LDM/STM)
     require 32-bit (WORD) alignment (by 32-bit ARM architecture).
     So there is an "Alignment fault exception" if data is not aligned for 32-bit.

   problem-2 : 32-bit kernels and arm64 kernels:
     32-bit linux kernels provide a fixup for the "Alignment fault exception"
     of these "paired" instructions.
     So unaligned paired-access instructions work via the exception handler
     in the kernel in 32-bit linux.

     But some arm64 kernels do not handle these faults in 32-bit programs,
     so we get an unhandled exception for such instructions.
     Probably some newer arm64 kernels have fixed it, and unaligned
     paired-access instructions work in those kernels?

   problem-3 : compiler for 32-bit arm:
     Compilers use LDRD/STRD/LDM/STM for UInt64 accesses
     and for other cases where two 32-bit accesses are fused
     into one multi-access instruction.
     So UInt64 variables must be aligned for 32-bit, and each
     32-bit access must be aligned for 32-bit, if we want to
     avoid an "Alignment fault" exception (handled or unhandled).

   problem-4 : performance:
     Even if unaligned access is handled by the kernel, it will be slow.
     So if we allow unaligned access, we get fast unaligned
     single-access and slow unaligned paired-access.

   We don't allow unaligned access on 32-bit arm, because the compiler
   generates paired-access instructions that require 32-bit alignment,
   and some arm64 kernels have no handler for these instructions.
   Also unaligned paired-access instructions will be slow, if the kernel handles them.
*/
// it must be disabled:
// #define MY_CPU_LE_UNALIGN

#endif

#ifdef MY_CPU_LE_UNALIGN

#define GetUi16(p) (*(const UInt16 *)(const void *)(p))
#define GetUi32(p) (*(const UInt32 *)(const void *)(p))
#ifdef MY_CPU_LE_UNALIGN_64
#define GetUi64(p) (*(const UInt64 *)(const void *)(p))
#define SetUi64(p, v) { *(UInt64 *)(void *)(p) = (v); }
#endif

#define SetUi16(p, v) { *(UInt16 *)(void *)(p) = (v); }
#define SetUi32(p, v) { *(UInt32 *)(void *)(p) = (v); }

#else

#define GetUi16(p) ( (UInt16) ( \
             ((const Byte *)(p))[0] | \
    ((UInt16)((const Byte *)(p))[1] << 8) ))

#define GetUi32(p) ( \
             ((const Byte *)(p))[0]        | \
    ((UInt32)((const Byte *)(p))[1] <<  8) | \
    ((UInt32)((const Byte *)(p))[2] << 16) | \
    ((UInt32)((const Byte *)(p))[3] << 24))

#define SetUi16(p, v) { Byte *_ppp_ = (Byte *)(p); UInt32 _vvv_ = (v); \
    _ppp_[0] = (Byte)_vvv_; \
    _ppp_[1] = (Byte)(_vvv_ >> 8); }

#define SetUi32(p, v) { Byte *_ppp_ = (Byte *)(p); UInt32 _vvv_ = (v); \
    _ppp_[0] = (Byte)_vvv_; \
    _ppp_[1] = (Byte)(_vvv_ >> 8); \
    _ppp_[2] = (Byte)(_vvv_ >> 16); \
    _ppp_[3] = (Byte)(_vvv_ >> 24); }

#endif


#ifndef GetUi64
#define GetUi64(p) (GetUi32(p) | ((UInt64)GetUi32(((const Byte *)(p)) + 4) << 32))
#endif

#ifndef SetUi64
#define SetUi64(p, v) { Byte *_ppp2_ = (Byte *)(p); UInt64 _vvv2_ = (v); \
    SetUi32(_ppp2_    , (UInt32)_vvv2_) \
    SetUi32(_ppp2_ + 4, (UInt32)(_vvv2_ >> 32)) }
#endif
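
/*
Usage sketch (illustrative; buf is a hypothetical Byte pointer): GetUi32 and
SetUi32 access 32-bit little-endian fields at byte granularity where needed,
so the pointer may be unaligned on every target; the macros above pick the
fastest safe implementation:

  UInt32 size = GetUi32(buf + 4);
  SetUi32(buf + 4, size + 1);
*/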

#if defined(MY_CPU_LE_UNALIGN) && defined(Z7_CPU_FAST_BSWAP_SUPPORTED)

#define GetBe32(p) Z7_BSWAP32 (*(const UInt32 *)(const void *)(p))
#define SetBe32(p, v) { (*(UInt32 *)(void *)(p)) = Z7_BSWAP32(v); }

#if defined(MY_CPU_LE_UNALIGN_64)
#define GetBe64(p) Z7_BSWAP64 (*(const UInt64 *)(const void *)(p))
#endif

#else

#define GetBe32(p) ( \
    ((UInt32)((const Byte *)(p))[0] << 24) | \
    ((UInt32)((const Byte *)(p))[1] << 16) | \
    ((UInt32)((const Byte *)(p))[2] <<  8) | \
             ((const Byte *)(p))[3] )

#define SetBe32(p, v) { Byte *_ppp_ = (Byte *)(p); UInt32 _vvv_ = (v); \
    _ppp_[0] = (Byte)(_vvv_ >> 24); \
    _ppp_[1] = (Byte)(_vvv_ >> 16); \
    _ppp_[2] = (Byte)(_vvv_ >> 8); \
    _ppp_[3] = (Byte)_vvv_; }

#endif


#ifndef GetBe64
#define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4))
#endif

#ifndef GetBe16
#define GetBe16(p) ( (UInt16) ( \
    ((UInt16)((const Byte *)(p))[0] << 8) | \
             ((const Byte *)(p))[1] ))
#endif
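
/*
Usage sketch (illustrative; hdr is a hypothetical const Byte pointer): the Be
variants read big-endian data regardless of host endianness, e.g. fields of a
big-endian file header:

  UInt32 len = GetBe32(hdr);
  UInt64 off = GetBe64(hdr + 4);
*/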

#if defined(MY_CPU_BE)
#define Z7_CONV_BE_TO_NATIVE_CONST32(v)  (v)
#define Z7_CONV_LE_TO_NATIVE_CONST32(v)  Z7_BSWAP32_CONST(v)
#define Z7_CONV_NATIVE_TO_BE_32(v)       (v)
#elif defined(MY_CPU_LE)
#define Z7_CONV_BE_TO_NATIVE_CONST32(v)  Z7_BSWAP32_CONST(v)
#define Z7_CONV_LE_TO_NATIVE_CONST32(v)  (v)
#define Z7_CONV_NATIVE_TO_BE_32(v)       Z7_BSWAP32(v)
#else
#error Stop_Compiling_Unknown_Endian_CONV
#endif
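
/*
Usage sketch (illustrative; the signature value is arbitrary): the _CONST32
converters expand to constant expressions, so they fit in initializers and
case labels, e.g. for a format signature stored big-endian on disk:

  #define kSignature Z7_CONV_BE_TO_NATIVE_CONST32(0x377A0000)
*/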

#if defined(MY_CPU_BE)

#define GetBe64a(p)     (*(const UInt64 *)(const void *)(p))
#define GetBe32a(p)     (*(const UInt32 *)(const void *)(p))
#define GetBe16a(p)     (*(const UInt16 *)(const void *)(p))
#define SetBe32a(p, v)  { *(UInt32 *)(void *)(p) = (v); }
#define SetBe16a(p, v)  { *(UInt16 *)(void *)(p) = (v); }

#define GetUi32a(p)     GetUi32(p)
#define GetUi16a(p)     GetUi16(p)
#define SetUi32a(p, v)  SetUi32(p, v)
#define SetUi16a(p, v)  SetUi16(p, v)

#elif defined(MY_CPU_LE)

#define GetUi32a(p)     (*(const UInt32 *)(const void *)(p))
#define GetUi16a(p)     (*(const UInt16 *)(const void *)(p))
#define SetUi32a(p, v)  { *(UInt32 *)(void *)(p) = (v); }
#define SetUi16a(p, v)  { *(UInt16 *)(void *)(p) = (v); }

#define GetBe64a(p)     GetBe64(p)
#define GetBe32a(p)     GetBe32(p)
#define GetBe16a(p)     GetBe16(p)
#define SetBe32a(p, v)  SetBe32(p, v)
#define SetBe16a(p, v)  SetBe16(p, v)

#else
#error Stop_Compiling_Unknown_Endian_CPU_a
#endif
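
/*
Note (illustrative): the "a" suffix means the pointer is expected to be
naturally aligned for the access size; where the byte order already matches
the host, the macro compiles to a plain load or store:

  UInt32 x = GetUi32a(p32);  // p32 must be 4-byte aligned
*/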

#if defined(MY_CPU_X86_OR_AMD64) \
    || defined(MY_CPU_ARM_OR_ARM64) \
    || defined(MY_CPU_PPC_OR_PPC64)
  #define Z7_CPU_FAST_ROTATE_SUPPORTED
#endif

#ifdef MY_CPU_X86_OR_AMD64

void Z7_FASTCALL z7_x86_cpuid(UInt32 a[4], UInt32 function);
UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void);
#if defined(MY_CPU_AMD64)
#define Z7_IF_X86_CPUID_SUPPORTED
#else
#define Z7_IF_X86_CPUID_SUPPORTED if (z7_x86_cpuid_GetMaxFunc())
#endif
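
/*
Usage sketch (illustrative): Z7_IF_X86_CPUID_SUPPORTED expands to nothing on
x64 (where cpuid always exists) and to a runtime check on x86:

  UInt32 a[4];
  Z7_IF_X86_CPUID_SUPPORTED
  {
    z7_x86_cpuid(a, 1);  // a[0..3] receive EAX, EBX, ECX, EDX for leaf 1
  }
*/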

BoolInt CPU_IsSupported_AES(void);
BoolInt CPU_IsSupported_AVX(void);
BoolInt CPU_IsSupported_AVX2(void);
// BoolInt CPU_IsSupported_AVX512F_AVX512VL(void);
BoolInt CPU_IsSupported_VAES_AVX2(void);
BoolInt CPU_IsSupported_CMOV(void);
BoolInt CPU_IsSupported_SSE(void);
BoolInt CPU_IsSupported_SSE2(void);
BoolInt CPU_IsSupported_SSSE3(void);
BoolInt CPU_IsSupported_SSE41(void);
BoolInt CPU_IsSupported_SHA(void);
BoolInt CPU_IsSupported_PageGB(void);
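
/*
Usage sketch (illustrative; the *_HW / *_SW names and g_AesFunc are
hypothetical): the typical pattern is one-time runtime dispatch on a
detected feature:

  if (CPU_IsSupported_AES())
    g_AesFunc = AesEncrypt_HW;  // hardware-accelerated path
  else
    g_AesFunc = AesEncrypt_SW;  // portable fallback
*/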

#elif defined(MY_CPU_ARM_OR_ARM64)

BoolInt CPU_IsSupported_CRC32(void);
BoolInt CPU_IsSupported_NEON(void);

#if defined(_WIN32)
BoolInt CPU_IsSupported_CRYPTO(void);
#define CPU_IsSupported_SHA1  CPU_IsSupported_CRYPTO
#define CPU_IsSupported_SHA2  CPU_IsSupported_CRYPTO
#define CPU_IsSupported_AES   CPU_IsSupported_CRYPTO
#else
BoolInt CPU_IsSupported_SHA1(void);
BoolInt CPU_IsSupported_SHA2(void);
BoolInt CPU_IsSupported_AES(void);
#endif

#endif

#if defined(__APPLE__)
int z7_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize);
int z7_sysctlbyname_Get_UInt32(const char *name, UInt32 *val);
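
/*
Usage sketch (assumption: like sysctlbyname(), these wrappers return 0 on
success; the sysctl name is just an example):

  UInt32 val = 0;
  if (z7_sysctlbyname_Get_UInt32("hw.optional.neon", &val) == 0 && val == 1)
  {
    // feature reported by the OS
  }
*/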