f535537f |
1 | /* CpuArch.c -- CPU specific code |
2 | 2024-03-02 : Igor Pavlov : Public domain */ |
3 | |
4 | #include "Precomp.h" |
5 | |
6 | // #include <stdio.h> |
7 | |
8 | #include "CpuArch.h" |
9 | |
10 | #ifdef MY_CPU_X86_OR_AMD64 |
11 | |
12 | #undef NEED_CHECK_FOR_CPUID |
13 | #if !defined(MY_CPU_AMD64) |
14 | #define NEED_CHECK_FOR_CPUID |
15 | #endif |
16 | |
17 | /* |
18 | cpuid instruction supports (subFunction) parameter in ECX, |
19 | that is used only with some specific (function) parameter values. |
20 | But we always use only (subFunction==0). |
21 | */ |
22 | /* |
23 | __cpuid(): MSVC and GCC/CLANG use same function/macro name |
24 | but parameters are different. |
25 | We use MSVC __cpuid() parameters style for our z7_x86_cpuid() function. |
26 | */ |
27 | |
28 | #if defined(__GNUC__) /* && (__GNUC__ >= 10) */ \ |
29 | || defined(__clang__) /* && (__clang_major__ >= 10) */ |
30 | |
31 | /* there was some CLANG/GCC compilers that have issues with |
32 | rbx(ebx) handling in asm blocks in -fPIC mode (__PIC__ is defined). |
33 | compiler's <cpuid.h> contains the macro __cpuid() that is similar to our code. |
34 | The history of __cpuid() changes in CLANG/GCC: |
35 | GCC: |
36 | 2007: it preserved ebx for (__PIC__ && __i386__) |
37 | 2013: it preserved rbx and ebx for __PIC__ |
2014: it doesn't preserve rbx and ebx anymore
39 | we suppose that (__GNUC__ >= 5) fixed that __PIC__ ebx/rbx problem. |
40 | CLANG: |
41 | 2014+: it preserves rbx, but only for 64-bit code. No __PIC__ check. |
42 | Why CLANG cares about 64-bit mode only, and doesn't care about ebx (in 32-bit)? |
43 | Do we need __PIC__ test for CLANG or we must care about rbx even if |
44 | __PIC__ is not defined? |
45 | */ |
46 | |
// ASM_LN joins the lines of a multi-line asm template.
#define ASM_LN "\n"

#if defined(MY_CPU_AMD64) && defined(__PIC__) \
&& ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))

// 64-bit PIC with old GCC / CLANG: rbx may be reserved by the compiler
// (see the history comment above), so we must not clobber it directly.
// Save rbx into a scratch register, run cpuid, then swap rbx back.
#define x86_cpuid_MACRO(p, func) { \
__asm__ __volatile__ ( \
ASM_LN "mov %%rbx, %q1" \
ASM_LN "cpuid" \
ASM_LN "xchg %%rbx, %q1" \
: "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }

/* "=&r" selects free register. It can select even rbx, if that register is free.
"=&D" for (RDI) also works, but the code can be larger with "=&D"
"2"(0) means (subFunction = 0),
2 is (zero-based) index in the output constraint list "=c" (ECX). */

#elif defined(MY_CPU_X86) && defined(__PIC__) \
&& ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))

// 32-bit PIC with old GCC / CLANG: same workaround as above, but for ebx.
#define x86_cpuid_MACRO(p, func) { \
__asm__ __volatile__ ( \
ASM_LN "mov %%ebx, %k1" \
ASM_LN "cpuid" \
ASM_LN "xchg %%ebx, %k1" \
: "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }

#else

// Default case: the compiler may treat (e/r)bx as a normal cpuid output ("=b").
#define x86_cpuid_MACRO(p, func) { \
__asm__ __volatile__ ( \
ASM_LN "cpuid" \
: "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }

#endif
82 | |
83 | |
// Executes the cpuid instruction for (func) with subFunction == 0,
// storing EAX, EBX, ECX, EDX into p[0..3] (MSVC __cpuid() parameter style).
void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
x86_cpuid_MACRO(p, func)
}
88 | |
89 | |
// Returns the maximum supported cpuid function number (EAX of cpuid(0)),
// or 0 if the cpuid instruction is not supported.
Z7_NO_INLINE
UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
#if defined(NEED_CHECK_FOR_CPUID)
// Bit 21 of EFLAGS (ID): cpuid is supported if software can toggle this bit.
// (note: "EFALGS" is a historical typo for EFLAGS, kept as-is)
#define EFALGS_CPUID_BIT 21
UInt32 a;
__asm__ __volatile__ (
ASM_LN "pushf" // keep a copy of the original flags on the stack
ASM_LN "pushf"
ASM_LN "pop %0" // %0 = current EFLAGS
// ASM_LN "movl %0, %1"
// ASM_LN "xorl $0x200000, %0"
ASM_LN "btc %1, %0" // toggle the ID bit in the copy
ASM_LN "push %0"
ASM_LN "popf" // try to load the toggled flags
ASM_LN "pushf"
ASM_LN "pop %0" // %0 = flags after the toggle attempt
ASM_LN "xorl (%%esp), %0" // diff against the saved original flags

ASM_LN "popf" // restore the original flags
ASM_LN
: "=&r" (a) // "=a"
: "i" (EFALGS_CPUID_BIT)
);
// if the ID bit did not change, cpuid is not available
if ((a & (1 << EFALGS_CPUID_BIT)) == 0)
return 0;
#endif
{
UInt32 p[4];
x86_cpuid_MACRO(p, 0)
return p[0];
}
}
123 | |
124 | #undef ASM_LN |
125 | |
126 | #elif !defined(_MSC_VER) |
127 | |
128 | /* |
129 | // for gcc/clang and other: we can try to use __cpuid macro: |
130 | #include <cpuid.h> |
131 | void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) |
132 | { |
133 | __cpuid(func, p[0], p[1], p[2], p[3]); |
134 | } |
135 | UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) |
136 | { |
137 | return (UInt32)__get_cpuid_max(0, NULL); |
138 | } |
139 | */ |
140 | // for unsupported cpuid: |
141 | void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) |
142 | { |
143 | UNUSED_VAR(func) |
144 | p[0] = p[1] = p[2] = p[3] = 0; |
145 | } |
// Fallback: cpuid is treated as unsupported, so the maximum function number is 0.
UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
return 0;
}
150 | |
151 | #else // _MSC_VER |
152 | |
153 | #if !defined(MY_CPU_AMD64) |
154 | |
// MSVC 32-bit: naked fastcall implementation.
// Returns the maximum supported cpuid function number (EAX of cpuid(0)),
// or 0 (via the unchanged EAX check result) if cpuid is unsupported.
UInt32 __declspec(naked) Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
#if defined(NEED_CHECK_FOR_CPUID)
// Bit 21 of EFLAGS (ID): cpuid is supported if software can toggle this bit.
#define EFALGS_CPUID_BIT 21
__asm pushfd // keep a copy of the original flags on the stack
__asm pushfd
/*
__asm pop eax
// __asm mov edx, eax
__asm btc eax, EFALGS_CPUID_BIT
__asm push eax
*/
__asm btc dword ptr [esp], EFALGS_CPUID_BIT // toggle the ID bit in the copy
__asm popfd // try to load the toggled flags
__asm pushfd
__asm pop eax // eax = flags after the toggle attempt
// __asm xor eax, edx
__asm xor eax, [esp] // diff against the saved original flags
// __asm push edx
__asm popfd // restore the original flags
__asm and eax, (1 shl EFALGS_CPUID_BIT)
__asm jz end_func // ID bit did not change: no cpuid; eax is already 0
#endif
__asm push ebx // ebx must be preserved by this calling convention
__asm xor eax, eax // func
__asm xor ecx, ecx // subFunction (optional) for (func == 0)
__asm cpuid
__asm pop ebx
#if defined(NEED_CHECK_FOR_CPUID)
end_func:
#endif
__asm ret 0
}
188 | |
// MSVC 32-bit: naked fastcall implementation of cpuid.
// Fastcall passes (p) in ECX and (func) in EDX; subFunction is fixed to 0.
void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
UNUSED_VAR(p) // consumed via ECX below
UNUSED_VAR(func) // consumed via EDX below
__asm push ebx
__asm push edi
__asm mov edi, ecx // p
__asm mov eax, edx // func
__asm xor ecx, ecx // subfunction (optional) for (func == 0)
__asm cpuid
__asm mov [edi ], eax
__asm mov [edi + 4], ebx
__asm mov [edi + 8], ecx
__asm mov [edi + 12], edx
__asm pop edi
__asm pop ebx
__asm ret 0
}
207 | |
208 | #else // MY_CPU_AMD64 |
209 | |
210 | #if _MSC_VER >= 1600 |
211 | #include <intrin.h> |
212 | #define MY_cpuidex __cpuidex |
213 | #else |
214 | /* |
215 | __cpuid (func == (0 or 7)) requires subfunction number in ECX. |
216 | MSDN: The __cpuid intrinsic clears the ECX register before calling the cpuid instruction. |
217 | __cpuid() in new MSVC clears ECX. |
218 | __cpuid() in old MSVC (14.00) x64 doesn't clear ECX |
219 | We still can use __cpuid for low (func) values that don't require ECX, |
220 | but __cpuid() in old MSVC will be incorrect for some func values: (func == 7). |
221 | So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction, |
222 | where ECX value is first parameter for FASTCALL / NO_INLINE func, |
223 | So the caller of MY_cpuidex_HACK() sets ECX as subFunction, and |
224 | old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value. |
225 | |
226 | DON'T remove Z7_NO_INLINE and Z7_FASTCALL for MY_cpuidex_HACK(): !!! |
227 | */ |
// Hack for old MSVC x64 (see the comment above): Z7_FASTCALL places
// (subFunction) in ECX, and Z7_NO_INLINE keeps that register state intact
// up to __cpuid()'s cpuid instruction, since old MSVC __cpuid() doesn't clear ECX.
static
Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int32 *CPUInfo)
{
UNUSED_VAR(subFunction) // the value is actually consumed via the ECX register
__cpuid(CPUInfo, func);
}
234 | #define MY_cpuidex(info, func, func2) MY_cpuidex_HACK(func2, func, info) |
235 | #pragma message("======== MY_cpuidex_HACK WAS USED ========") |
236 | #endif // _MSC_VER >= 1600 |
237 | |
#if !defined(MY_CPU_AMD64)
/* inlining for __cpuid() in MSVC x86 (32-bit) produces big ineffective code,
so we disable inlining here */
Z7_NO_INLINE
#endif
// Executes cpuid for (func) with subFunction == 0; stores EAX..EDX into p[0..3].
void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
MY_cpuidex((Int32 *)p, (Int32)func, 0);
}
247 | |
248 | Z7_NO_INLINE |
249 | UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) |
250 | { |
251 | Int32 a[4]; |
252 | MY_cpuidex(a, 0, 0); |
253 | return a[0]; |
254 | } |
255 | |
256 | #endif // MY_CPU_AMD64 |
257 | #endif // _MSC_VER |
258 | |
259 | #if defined(NEED_CHECK_FOR_CPUID) |
260 | #define CHECK_CPUID_IS_SUPPORTED { if (z7_x86_cpuid_GetMaxFunc() == 0) return 0; } |
261 | #else |
262 | #define CHECK_CPUID_IS_SUPPORTED |
263 | #endif |
264 | #undef NEED_CHECK_FOR_CPUID |
265 | |
266 | |
// Runs cpuid function 1 (processor info and feature bits) into p[0..3].
// Returns False (via CHECK_CPUID_IS_SUPPORTED) if cpuid itself is unavailable.
static
BoolInt x86cpuid_Func_1(UInt32 *p)
{
CHECK_CPUID_IS_SUPPORTED
z7_x86_cpuid(p, 1);
return True;
}
274 | |
275 | /* |
276 | static const UInt32 kVendors[][1] = |
277 | { |
278 | { 0x756E6547 }, // , 0x49656E69, 0x6C65746E }, |
279 | { 0x68747541 }, // , 0x69746E65, 0x444D4163 }, |
280 | { 0x746E6543 } // , 0x48727561, 0x736C7561 } |
281 | }; |
282 | */ |
283 | |
284 | /* |
285 | typedef struct |
286 | { |
287 | UInt32 maxFunc; |
288 | UInt32 vendor[3]; |
289 | UInt32 ver; |
290 | UInt32 b; |
291 | UInt32 c; |
292 | UInt32 d; |
293 | } Cx86cpuid; |
294 | |
295 | enum |
296 | { |
297 | CPU_FIRM_INTEL, |
298 | CPU_FIRM_AMD, |
299 | CPU_FIRM_VIA |
300 | }; |
301 | int x86cpuid_GetFirm(const Cx86cpuid *p); |
302 | #define x86cpuid_ver_GetFamily(ver) (((ver >> 16) & 0xff0) | ((ver >> 8) & 0xf)) |
303 | #define x86cpuid_ver_GetModel(ver) (((ver >> 12) & 0xf0) | ((ver >> 4) & 0xf)) |
304 | #define x86cpuid_ver_GetStepping(ver) (ver & 0xf) |
305 | |
306 | int x86cpuid_GetFirm(const Cx86cpuid *p) |
307 | { |
308 | unsigned i; |
309 | for (i = 0; i < sizeof(kVendors) / sizeof(kVendors[0]); i++) |
310 | { |
311 | const UInt32 *v = kVendors[i]; |
312 | if (v[0] == p->vendor[0] |
313 | // && v[1] == p->vendor[1] |
314 | // && v[2] == p->vendor[2] |
315 | ) |
316 | return (int)i; |
317 | } |
318 | return -1; |
319 | } |
320 | |
321 | BoolInt CPU_Is_InOrder() |
322 | { |
323 | Cx86cpuid p; |
324 | UInt32 family, model; |
325 | if (!x86cpuid_CheckAndRead(&p)) |
326 | return True; |
327 | |
328 | family = x86cpuid_ver_GetFamily(p.ver); |
329 | model = x86cpuid_ver_GetModel(p.ver); |
330 | |
331 | switch (x86cpuid_GetFirm(&p)) |
332 | { |
333 | case CPU_FIRM_INTEL: return (family < 6 || (family == 6 && ( |
334 | // In-Order Atom CPU |
335 | model == 0x1C // 45 nm, N4xx, D4xx, N5xx, D5xx, 230, 330 |
336 | || model == 0x26 // 45 nm, Z6xx |
337 | || model == 0x27 // 32 nm, Z2460 |
338 | || model == 0x35 // 32 nm, Z2760 |
339 | || model == 0x36 // 32 nm, N2xxx, D2xxx |
340 | ))); |
341 | case CPU_FIRM_AMD: return (family < 5 || (family == 5 && (model < 6 || model == 0xA))); |
342 | case CPU_FIRM_VIA: return (family < 6 || (family == 6 && model < 0xF)); |
343 | } |
344 | return False; // v23 : unknown processors are not In-Order |
345 | } |
346 | */ |
347 | |
348 | #ifdef _WIN32 |
349 | #include "7zWindows.h" |
350 | #endif |
351 | |
352 | #if !defined(MY_CPU_AMD64) && defined(_WIN32) |
353 | |
354 | /* for legacy SSE ia32: there is no user-space cpu instruction to check |
355 | that OS supports SSE register storing/restoring on context switches. |
356 | So we need some OS-specific function to check that it's safe to use SSE registers. |
357 | */ |
358 | |
// Returns True if the running Windows version is assumed to save/restore
// SSE registers on context switches (any version since Windows 2000).
Z7_FORCE_INLINE
static BoolInt CPU_Sys_Is_SSE_Supported(void)
{
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4996) // `GetVersion': was declared deprecated
#endif
/* low byte is major version of Windows
We suppose that any Windows version since
Windows2000 (major == 5) supports SSE registers */
return (Byte)GetVersion() >= 5;
// the pragmas below are processed at compile time, so placement after return is fine
#if defined(_MSC_VER)
#pragma warning(pop)
#endif
}
374 | #define CHECK_SYS_SSE_SUPPORT if (!CPU_Sys_Is_SSE_Supported()) return False; |
375 | #else |
376 | #define CHECK_SYS_SSE_SUPPORT |
377 | #endif |
378 | |
379 | |
380 | #if !defined(MY_CPU_AMD64) |
381 | |
382 | BoolInt CPU_IsSupported_CMOV(void) |
383 | { |
384 | UInt32 a[4]; |
385 | if (!x86cpuid_Func_1(&a[0])) |
386 | return 0; |
387 | return (BoolInt)(a[3] >> 15) & 1; |
388 | } |
389 | |
390 | BoolInt CPU_IsSupported_SSE(void) |
391 | { |
392 | UInt32 a[4]; |
393 | CHECK_SYS_SSE_SUPPORT |
394 | if (!x86cpuid_Func_1(&a[0])) |
395 | return 0; |
396 | return (BoolInt)(a[3] >> 25) & 1; |
397 | } |
398 | |
399 | BoolInt CPU_IsSupported_SSE2(void) |
400 | { |
401 | UInt32 a[4]; |
402 | CHECK_SYS_SSE_SUPPORT |
403 | if (!x86cpuid_Func_1(&a[0])) |
404 | return 0; |
405 | return (BoolInt)(a[3] >> 26) & 1; |
406 | } |
407 | |
408 | #endif |
409 | |
410 | |
411 | static UInt32 x86cpuid_Func_1_ECX(void) |
412 | { |
413 | UInt32 a[4]; |
414 | CHECK_SYS_SSE_SUPPORT |
415 | if (!x86cpuid_Func_1(&a[0])) |
416 | return 0; |
417 | return a[2]; |
418 | } |
419 | |
420 | BoolInt CPU_IsSupported_AES(void) |
421 | { |
422 | return (BoolInt)(x86cpuid_Func_1_ECX() >> 25) & 1; |
423 | } |
424 | |
425 | BoolInt CPU_IsSupported_SSSE3(void) |
426 | { |
427 | return (BoolInt)(x86cpuid_Func_1_ECX() >> 9) & 1; |
428 | } |
429 | |
430 | BoolInt CPU_IsSupported_SSE41(void) |
431 | { |
432 | return (BoolInt)(x86cpuid_Func_1_ECX() >> 19) & 1; |
433 | } |
434 | |
435 | BoolInt CPU_IsSupported_SHA(void) |
436 | { |
437 | CHECK_SYS_SSE_SUPPORT |
438 | |
439 | if (z7_x86_cpuid_GetMaxFunc() < 7) |
440 | return False; |
441 | { |
442 | UInt32 d[4]; |
443 | z7_x86_cpuid(d, 7); |
444 | return (BoolInt)(d[1] >> 29) & 1; |
445 | } |
446 | } |
447 | |
448 | /* |
449 | MSVC: _xgetbv() intrinsic is available since VS2010SP1. |
450 | MSVC also defines (_XCR_XFEATURE_ENABLED_MASK) macro in |
451 | <immintrin.h> that we can use or check. |
452 | For any 32-bit x86 we can use asm code in MSVC, |
453 | but MSVC asm code is huge after compilation. |
454 | So _xgetbv() is better |
455 | |
456 | ICC: _xgetbv() intrinsic is available (in what version of ICC?) |
ICC defines (__GNUC__) and it supports gnu assembler
458 | also ICC supports MASM style code with -use-msasm switch. |
459 | but ICC doesn't support __attribute__((__target__)) |
460 | |
461 | GCC/CLANG 9: |
462 | _xgetbv() is macro that works via __builtin_ia32_xgetbv() |
463 | and we need __attribute__((__target__("xsave")). |
464 | But with __target__("xsave") the function will be not |
465 | inlined to function that has no __target__("xsave") attribute. |
466 | If we want _xgetbv() call inlining, then we should use asm version |
467 | instead of calling _xgetbv(). |
Note: the intrinsic was broken before GCC 8.2:
469 | https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85684 |
470 | */ |
471 | |
472 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1100) \ |
473 | || defined(_MSC_VER) && (_MSC_VER >= 1600) && (_MSC_FULL_VER >= 160040219) \ |
474 | || defined(__GNUC__) && (__GNUC__ >= 9) \ |
475 | || defined(__clang__) && (__clang_major__ >= 9) |
476 | // we define ATTRIB_XGETBV, if we want to use predefined _xgetbv() from compiler |
477 | #if defined(__INTEL_COMPILER) |
478 | #define ATTRIB_XGETBV |
479 | #elif defined(__GNUC__) || defined(__clang__) |
480 | // we don't define ATTRIB_XGETBV here, because asm version is better for inlining. |
481 | // #define ATTRIB_XGETBV __attribute__((__target__("xsave"))) |
482 | #else |
483 | #define ATTRIB_XGETBV |
484 | #endif |
485 | #endif |
486 | |
487 | #if defined(ATTRIB_XGETBV) |
488 | #include <immintrin.h> |
489 | #endif |
490 | |
491 | |
492 | // XFEATURE_ENABLED_MASK/XCR0 |
493 | #define MY_XCR_XFEATURE_ENABLED_MASK 0 |
494 | |
#if defined(ATTRIB_XGETBV)
ATTRIB_XGETBV
#endif
// Reads an extended control register: returns (EDX:EAX) = XCR[num].
// For (num == 0 / XCR0) that is the OS-enabled XSAVE state-component bitmask.
// NOTE(review): callers here check OSXSAVE (CPUID.1:ECX bit 27) before calling;
// xgetbv is expected to be unavailable otherwise.
static UInt64 x86_xgetbv_0(UInt32 num)
{
#if defined(ATTRIB_XGETBV)
{
// use the compiler-provided intrinsic
return
#if (defined(_MSC_VER))
_xgetbv(num);
#else
__builtin_ia32_xgetbv(
#if !defined(__clang__)
(int)
#endif
num);
#endif
}

#elif defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_CC)

UInt32 a, d;
#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
__asm__
(
"xgetbv"
: "=a"(a), "=d"(d) : "c"(num) : "cc"
);
#else // is old gcc
// old assemblers don't know the xgetbv mnemonic: emit its raw opcode bytes
__asm__
(
".byte 0x0f, 0x01, 0xd0" "\n\t"
: "=a"(a), "=d"(d) : "c"(num) : "cc"
);
#endif
return ((UInt64)d << 32) | a;
// return a;

#elif defined(_MSC_VER) && !defined(MY_CPU_AMD64)

// old MSVC 32-bit: no intrinsic; emit the raw xgetbv opcode bytes via inline asm
UInt32 a, d;
__asm {
push eax
push edx
push ecx
mov ecx, num;
// xor ecx, ecx // = MY_XCR_XFEATURE_ENABLED_MASK
_emit 0x0f
_emit 0x01
_emit 0xd0
mov a, eax
mov d, edx
pop ecx
pop edx
pop eax
}
return ((UInt64)d << 32) | a;
// return a;

#else // it's unknown compiler
// #error "Need xgetbv function"
UNUSED_VAR(num)
// for MSVC-X64 we could call external function from external file.
/* Actually we had checked OSXSAVE/AVX in cpuid before.
So it's expected that OS supports at least AVX and below. */
// if (num != MY_XCR_XFEATURE_ENABLED_MASK) return 0; // if not XCR0
return
// (1 << 0) | // x87
(1 << 1) // SSE
| (1 << 2); // AVX

#endif
}
568 | |
569 | #ifdef _WIN32 |
570 | /* |
571 | Windows versions do not know about new ISA extensions that |
572 | can be introduced. But we still can use new extensions, |
573 | even if Windows doesn't report about supporting them, |
574 | But we can use new extensions, only if Windows knows about new ISA extension |
575 | that changes the number or size of registers: SSE, AVX/XSAVE, AVX512 |
576 | So it's enough to check |
577 | MY_PF_AVX_INSTRUCTIONS_AVAILABLE |
578 | instead of |
579 | MY_PF_AVX2_INSTRUCTIONS_AVAILABLE |
580 | */ |
581 | #define MY_PF_XSAVE_ENABLED 17 |
582 | // #define MY_PF_SSSE3_INSTRUCTIONS_AVAILABLE 36 |
583 | // #define MY_PF_SSE4_1_INSTRUCTIONS_AVAILABLE 37 |
584 | // #define MY_PF_SSE4_2_INSTRUCTIONS_AVAILABLE 38 |
585 | // #define MY_PF_AVX_INSTRUCTIONS_AVAILABLE 39 |
586 | // #define MY_PF_AVX2_INSTRUCTIONS_AVAILABLE 40 |
587 | // #define MY_PF_AVX512F_INSTRUCTIONS_AVAILABLE 41 |
588 | #endif |
589 | |
// Returns True if AVX instructions are supported by the CPU AND the OS
// has enabled saving/restoring of the AVX (and SSE) register state.
BoolInt CPU_IsSupported_AVX(void)
{
#ifdef _WIN32
if (!IsProcessorFeaturePresent(MY_PF_XSAVE_ENABLED))
return False;
/* PF_AVX_INSTRUCTIONS_AVAILABLE probably is supported starting from
some latest Win10 revisions. But we need AVX in older Windows also.
So we don't use the following check: */
/*
if (!IsProcessorFeaturePresent(MY_PF_AVX_INSTRUCTIONS_AVAILABLE))
return False;
*/
#endif

/*
OS must use the special XSAVE/XRSTOR instructions to save
AVX registers when it is required for context switching.
At OS startup:
OS sets CR4.OSXSAVE flag to signal the processor that OS supports the XSAVE extensions.
Also OS sets bitmask in XCR0 register that defines what
registers will be processed by XSAVE instruction:
XCR0.X87[bit 0] - x87 registers and state
XCR0.SSE[bit 1] - SSE registers and state
XCR0.AVX[bit 2] - AVX registers and state
CR4.OSXSAVE is reflected to CPUID.1:ECX.OSXSAVE[bit 27].
So we can read that bit in user-space.
XCR0 is available for reading in user-space by the XGETBV instruction.
*/
{
const UInt32 c = x86cpuid_Func_1_ECX();
if (0 == (1
& (c >> 28) // AVX instructions are supported by hardware
& (c >> 27))) // OSXSAVE bit: XSAVE and related instructions are enabled by OS.
return False;
}

/* also we can check
CPUID.1:ECX.XSAVE [bit 26] : that shows that
XSAVE, XRSTOR, XSETBV, XGETBV instructions are supported by hardware.
But that check is redundant, because if OSXSAVE bit is set, then XSAVE is also set */

/* If the OS has enabled the XSAVE extension instructions (OSXSAVE == 1),
in most cases we expect that the OS also supports storing/restoring
for AVX and SSE states at least.
But to be sure of that we call the user-space instruction
XGETBV(0) to get the XCR0 value that contains the bitmask defining
what exact states (registers) the OS has enabled for storing/restoring.
*/

{
const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK);
// printf("\n=== XGetBV=%d\n", bm);
return 1
& (BoolInt)(bm >> 1) // SSE state is supported (set by OS) for storing/restoring
& (BoolInt)(bm >> 2); // AVX state is supported (set by OS) for storing/restoring
}
// since Win7SP1: we can use GetEnabledXStateFeatures();
}
648 | |
649 | |
650 | BoolInt CPU_IsSupported_AVX2(void) |
651 | { |
652 | if (!CPU_IsSupported_AVX()) |
653 | return False; |
654 | if (z7_x86_cpuid_GetMaxFunc() < 7) |
655 | return False; |
656 | { |
657 | UInt32 d[4]; |
658 | z7_x86_cpuid(d, 7); |
659 | // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); |
660 | return 1 |
661 | & (BoolInt)(d[1] >> 5); // avx2 |
662 | } |
663 | } |
664 | |
665 | /* |
666 | // fix it: |
667 | BoolInt CPU_IsSupported_AVX512F_AVX512VL(void) |
668 | { |
669 | if (!CPU_IsSupported_AVX()) |
670 | return False; |
671 | if (z7_x86_cpuid_GetMaxFunc() < 7) |
672 | return False; |
673 | { |
674 | UInt32 d[4]; |
675 | z7_x86_cpuid(d, 7); |
676 | // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); |
677 | return 1 |
678 | & (BoolInt)(d[1] >> 16) // avx512-f |
679 | & (BoolInt)(d[1] >> 31); // avx512-Vl |
680 | } |
681 | } |
682 | */ |
683 | |
684 | BoolInt CPU_IsSupported_VAES_AVX2(void) |
685 | { |
686 | if (!CPU_IsSupported_AVX()) |
687 | return False; |
688 | if (z7_x86_cpuid_GetMaxFunc() < 7) |
689 | return False; |
690 | { |
691 | UInt32 d[4]; |
692 | z7_x86_cpuid(d, 7); |
693 | // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); |
694 | return 1 |
695 | & (BoolInt)(d[1] >> 5) // avx2 |
696 | // & (d[1] >> 31) // avx512vl |
697 | & (BoolInt)(d[2] >> 9); // vaes // VEX-256/EVEX |
698 | } |
699 | } |
700 | |
701 | BoolInt CPU_IsSupported_PageGB(void) |
702 | { |
703 | CHECK_CPUID_IS_SUPPORTED |
704 | { |
705 | UInt32 d[4]; |
706 | z7_x86_cpuid(d, 0x80000000); |
707 | if (d[0] < 0x80000001) |
708 | return False; |
709 | z7_x86_cpuid(d, 0x80000001); |
710 | return (BoolInt)(d[3] >> 26) & 1; |
711 | } |
712 | } |
713 | |
714 | |
715 | #elif defined(MY_CPU_ARM_OR_ARM64) |
716 | |
717 | #ifdef _WIN32 |
718 | |
719 | #include "7zWindows.h" |
720 | |
// Windows on ARM/ARM64: query ISA extensions via the OS processor-feature API.
BoolInt CPU_IsSupported_CRC32(void) { return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
BoolInt CPU_IsSupported_CRYPTO(void) { return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
BoolInt CPU_IsSupported_NEON(void) { return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
724 | |
725 | #else |
726 | |
727 | #if defined(__APPLE__) |
728 | |
729 | /* |
730 | #include <stdio.h> |
731 | #include <string.h> |
732 | static void Print_sysctlbyname(const char *name) |
733 | { |
734 | size_t bufSize = 256; |
735 | char buf[256]; |
736 | int res = sysctlbyname(name, &buf, &bufSize, NULL, 0); |
737 | { |
738 | int i; |
739 | printf("\nres = %d : %s : '%s' : bufSize = %d, numeric", res, name, buf, (unsigned)bufSize); |
740 | for (i = 0; i < 20; i++) |
741 | printf(" %2x", (unsigned)(Byte)buf[i]); |
742 | |
743 | } |
744 | } |
745 | */ |
746 | /* |
747 | Print_sysctlbyname("hw.pagesize"); |
748 | Print_sysctlbyname("machdep.cpu.brand_string"); |
749 | */ |
750 | |
751 | static BoolInt z7_sysctlbyname_Get_BoolInt(const char *name) |
752 | { |
753 | UInt32 val = 0; |
754 | if (z7_sysctlbyname_Get_UInt32(name, &val) == 0 && val == 1) |
755 | return 1; |
756 | return 0; |
757 | } |
758 | |
// Apple: query ARMv8 CRC32 extension via sysctl.
BoolInt CPU_IsSupported_CRC32(void)
{
return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_crc32");
}

// Apple: query NEON/ASIMD availability via sysctl.
BoolInt CPU_IsSupported_NEON(void)
{
return z7_sysctlbyname_Get_BoolInt("hw.optional.neon");
}

// Crypto extensions are reported as always available on Apple arm64 here,
// and never on other Apple CPU targets.
#ifdef MY_CPU_ARM64
#define APPLE_CRYPTO_SUPPORT_VAL 1
#else
#define APPLE_CRYPTO_SUPPORT_VAL 0
#endif

BoolInt CPU_IsSupported_SHA1(void) { return APPLE_CRYPTO_SUPPORT_VAL; }
BoolInt CPU_IsSupported_SHA2(void) { return APPLE_CRYPTO_SUPPORT_VAL; }
BoolInt CPU_IsSupported_AES (void) { return APPLE_CRYPTO_SUPPORT_VAL; }
778 | |
779 | |
780 | #else // __APPLE__ |
781 | |
782 | #if defined(__GLIBC__) && (__GLIBC__ * 100 + __GLIBC_MINOR__ >= 216) |
783 | #define Z7_GETAUXV_AVAILABLE |
784 | #else |
785 | // #pragma message("=== is not NEW GLIBC === ") |
786 | #if defined __has_include |
787 | #if __has_include (<sys/auxv.h>) |
788 | // #pragma message("=== sys/auxv.h is avail=== ") |
789 | #define Z7_GETAUXV_AVAILABLE |
790 | #endif |
791 | #endif |
792 | #endif |
793 | |
794 | #ifdef Z7_GETAUXV_AVAILABLE |
795 | // #pragma message("=== Z7_GETAUXV_AVAILABLE === ") |
796 | #include <sys/auxv.h> |
797 | #define USE_HWCAP |
798 | #endif |
799 | |
800 | #ifdef USE_HWCAP |
801 | |
802 | #if defined(__FreeBSD__) |
// FreeBSD has no getauxval(); emulate it with elf_aux_info().
// Returns 0 when the requested aux vector entry is unavailable.
static unsigned long MY_getauxval(int aux)
{
unsigned long val = 0;
return (elf_aux_info(aux, &val, sizeof(val)) == 0) ? val : 0;
}
810 | #else |
811 | #define MY_getauxval getauxval |
812 | #if defined __has_include |
813 | #if __has_include (<asm/hwcap.h>) |
814 | #include <asm/hwcap.h> |
815 | #endif |
816 | #endif |
817 | #endif |
818 | |
// Defines CPU_IsSupported_<name1>() that tests the HWCAP_<name2> bit of AT_HWCAP.
#define MY_HWCAP_CHECK_FUNC_2(name1, name2) \
BoolInt CPU_IsSupported_ ## name1(void) { return (MY_getauxval(AT_HWCAP) & (HWCAP_ ## name2)); }

#ifdef MY_CPU_ARM64
#define MY_HWCAP_CHECK_FUNC(name) \
MY_HWCAP_CHECK_FUNC_2(name, name)
#if 1 || defined(__ARM_NEON)
// NEON is reported unconditionally on arm64 builds (no hwcap query needed).
BoolInt CPU_IsSupported_NEON(void) { return True; }
#else
MY_HWCAP_CHECK_FUNC_2(NEON, ASIMD)
#endif
// MY_HWCAP_CHECK_FUNC (ASIMD)
#elif defined(MY_CPU_ARM)
// 32-bit arm: these feature bits are reported in AT_HWCAP2 instead.
#define MY_HWCAP_CHECK_FUNC(name) \
BoolInt CPU_IsSupported_ ## name(void) { return (MY_getauxval(AT_HWCAP2) & (HWCAP2_ ## name)); }
MY_HWCAP_CHECK_FUNC_2(NEON, NEON)
#endif

#else // USE_HWCAP

// no getauxval() available: report optional features as unsupported.
#define MY_HWCAP_CHECK_FUNC(name) \
BoolInt CPU_IsSupported_ ## name(void) { return 0; }

#if defined(__SWITCH__) || defined(__vita__)
BoolInt CPU_IsSupported_NEON(void) { return 1; }
#else
MY_HWCAP_CHECK_FUNC(NEON)
#endif

#endif // USE_HWCAP

MY_HWCAP_CHECK_FUNC (CRC32)
MY_HWCAP_CHECK_FUNC (SHA1)
MY_HWCAP_CHECK_FUNC (SHA2)
MY_HWCAP_CHECK_FUNC (AES)
854 | |
855 | #endif // __APPLE__ |
856 | #endif // _WIN32 |
857 | |
858 | #endif // MY_CPU_ARM_OR_ARM64 |
859 | |
860 | |
861 | |
862 | #ifdef __APPLE__ |
863 | |
864 | #include <sys/sysctl.h> |
865 | |
// Thin wrapper over sysctlbyname(); forwards its result code unchanged.
// On success, *bufSize is updated to the number of bytes written to buf.
int z7_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize)
{
return sysctlbyname(name, buf, bufSize, NULL, 0);
}
870 | |
871 | int z7_sysctlbyname_Get_UInt32(const char *name, UInt32 *val) |
872 | { |
873 | size_t bufSize = sizeof(*val); |
874 | const int res = z7_sysctlbyname_Get(name, val, &bufSize); |
875 | if (res == 0 && bufSize != sizeof(*val)) |
876 | return EFAULT; |
877 | return res; |
878 | } |
879 | |
880 | #endif |