git subrepo pull (merge) --force deps/libchdr
[pcsx_rearmed.git] / deps / libchdr / deps / zstd-1.5.6 / lib / decompress / huf_decompress_amd64.S
CommitLineData
648db22b 1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
9 */
10
11#include "../common/portability_macros.h"
12
f535537f 13#if defined(__ELF__) && defined(__GNUC__)
648db22b 14/* Stack marking
15 * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
16 */
648db22b 17.section .note.GNU-stack,"",%progbits
f535537f 18
19#if defined(__aarch64__)
20/* Mark that this assembly supports BTI & PAC, because it is empty for aarch64.
21 * See: https://github.com/facebook/zstd/issues/3841
22 * See: https://gcc.godbolt.org/z/sqr5T4ffK
23 * See: https://lore.kernel.org/linux-arm-kernel/20200429211641.9279-8-broonie@kernel.org/
24 * See: https://reviews.llvm.org/D62609
25 */
26.pushsection .note.gnu.property, "a"
27.p2align 3
28.long 4 /* size of the name - "GNU\0" */
29.long 0x10 /* size of descriptor */
30.long 0x5 /* NT_GNU_PROPERTY_TYPE_0 */
31.asciz "GNU"
32.long 0xc0000000 /* pr_type - GNU_PROPERTY_AARCH64_FEATURE_1_AND */
33.long 4 /* pr_datasz - 4 bytes */
34.long 3 /* pr_data - GNU_PROPERTY_AARCH64_FEATURE_1_BTI | GNU_PROPERTY_AARCH64_FEATURE_1_PAC */
35.p2align 3 /* pr_padding - bring everything to 8 byte alignment */
36.popsection
37#endif
38
648db22b 39#endif
40
41#if ZSTD_ENABLE_ASM_X86_64_BMI2
42
43/* Calling convention:
44 *
45 * %rdi contains the first argument: HUF_DecompressAsmArgs*.
46 * %rbp isn't maintained (no frame pointer).
47 * %rsp contains the stack pointer that grows down.
48 * No red-zone is assumed, only addresses >= %rsp are used.
49 * All register contents are preserved.
50 *
51 * TODO: Support Windows calling convention.
52 */
53
54ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
55ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
56ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
57ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
58.global HUF_decompress4X1_usingDTable_internal_fast_asm_loop
59.global HUF_decompress4X2_usingDTable_internal_fast_asm_loop
60.global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop
61.global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
62.text
63
64/* Sets up register mappings for clarity.
65 * op[], bits[], dtable & ip[0] each get their own register.
66 * ip[1,2,3] & olimit alias var[].
67 * %rax is a scratch register.
68 */
69
70#define op0 rsi
71#define op1 rbx
72#define op2 rcx
73#define op3 rdi
74
75#define ip0 r8
76#define ip1 r9
77#define ip2 r10
78#define ip3 r11
79
80#define bits0 rbp
81#define bits1 rdx
82#define bits2 r12
83#define bits3 r13
84#define dtable r14
85#define olimit r15
86
87/* var[] aliases ip[1,2,3] & olimit
88 * ip[1,2,3] are saved every iteration.
89 * olimit is only used in compute_olimit.
90 */
91#define var0 r15
92#define var1 r9
93#define var2 r10
94#define var3 r11
95
96/* 32-bit var registers */
97#define vard0 r15d
98#define vard1 r9d
99#define vard2 r10d
100#define vard3 r11d
101
102/* Calls X(N) for each stream 0, 1, 2, 3. */
103#define FOR_EACH_STREAM(X) \
104 X(0); \
105 X(1); \
106 X(2); \
107 X(3)
108
109/* Calls X(N, idx) for each stream 0, 1, 2, 3. */
110#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
111 X(0, idx); \
112 X(1, idx); \
113 X(2, idx); \
114 X(3, idx)
115
116/* Define both _HUF_* & HUF_* symbols because MacOS
117 * C symbols are prefixed with '_' & Linux symbols aren't.
118 */
119_HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
120HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
121 ZSTD_CET_ENDBRANCH
122 /* Save all registers - even if they are callee saved for simplicity. */
123 push %rax
124 push %rbx
125 push %rcx
126 push %rdx
127 push %rbp
128 push %rsi
129 push %rdi
130 push %r8
131 push %r9
132 push %r10
133 push %r11
134 push %r12
135 push %r13
136 push %r14
137 push %r15
138
139 /* Read HUF_DecompressAsmArgs* args from %rax */
140 movq %rdi, %rax
141 movq 0(%rax), %ip0
142 movq 8(%rax), %ip1
143 movq 16(%rax), %ip2
144 movq 24(%rax), %ip3
145 movq 32(%rax), %op0
146 movq 40(%rax), %op1
147 movq 48(%rax), %op2
148 movq 56(%rax), %op3
149 movq 64(%rax), %bits0
150 movq 72(%rax), %bits1
151 movq 80(%rax), %bits2
152 movq 88(%rax), %bits3
153 movq 96(%rax), %dtable
154 push %rax /* argument */
f535537f 155 push 104(%rax) /* ilowest */
648db22b 156 push 112(%rax) /* oend */
157 push %olimit /* olimit space */
158
159 subq $24, %rsp
160
161.L_4X1_compute_olimit:
162 /* Computes how many iterations we can do safely
163 * %r15, %rax may be clobbered
164 * rbx, rdx must be saved
165 * op3 & ip0 mustn't be clobbered
166 */
167 movq %rbx, 0(%rsp)
168 movq %rdx, 8(%rsp)
169
170 movq 32(%rsp), %rax /* rax = oend */
171 subq %op3, %rax /* rax = oend - op3 */
172
173 /* r15 = (oend - op3) / 5 */
174 movabsq $-3689348814741910323, %rdx
175 mulq %rdx
176 movq %rdx, %r15
177 shrq $2, %r15
178
179 movq %ip0, %rax /* rax = ip0 */
f535537f 180 movq 40(%rsp), %rdx /* rdx = ilowest */
181 subq %rdx, %rax /* rax = ip0 - ilowest */
182 movq %rax, %rbx /* rbx = ip0 - ilowest */
648db22b 183
f535537f 184 /* rdx = (ip0 - ilowest) / 7 */
648db22b 185 movabsq $2635249153387078803, %rdx
186 mulq %rdx
187 subq %rdx, %rbx
188 shrq %rbx
189 addq %rbx, %rdx
190 shrq $2, %rdx
191
192 /* r15 = min(%rdx, %r15) */
193 cmpq %rdx, %r15
194 cmova %rdx, %r15
195
196 /* r15 = r15 * 5 */
197 leaq (%r15, %r15, 4), %r15
198
199 /* olimit = op3 + r15 */
200 addq %op3, %olimit
201
202 movq 8(%rsp), %rdx
203 movq 0(%rsp), %rbx
204
205 /* If (op3 + 20 > olimit) */
206 movq %op3, %rax /* rax = op3 */
f535537f 207 cmpq %rax, %olimit /* op3 == olimit */
208 je .L_4X1_exit
648db22b 209
210 /* If (ip1 < ip0) go to exit */
211 cmpq %ip0, %ip1
212 jb .L_4X1_exit
213
214 /* If (ip2 < ip1) go to exit */
215 cmpq %ip1, %ip2
216 jb .L_4X1_exit
217
218 /* If (ip3 < ip2) go to exit */
219 cmpq %ip2, %ip3
220 jb .L_4X1_exit
221
222/* Reads top 11 bits from bits[n]
223 * Loads dt[bits[n]] into var[n]
224 */
225#define GET_NEXT_DELT(n) \
226 movq $53, %var##n; \
227 shrxq %var##n, %bits##n, %var##n; \
228 movzwl (%dtable,%var##n,2),%vard##n
229
230/* var[n] must contain the DTable entry computed with GET_NEXT_DELT
231 * Moves var[n] to %rax
232 * bits[n] <<= var[n] & 63
233 * op[n][idx] = %rax >> 8
234 * %ah is a way to access bits [8, 16) of %rax
235 */
236#define DECODE_FROM_DELT(n, idx) \
237 movq %var##n, %rax; \
238 shlxq %var##n, %bits##n, %bits##n; \
239 movb %ah, idx(%op##n)
240
241/* Assumes GET_NEXT_DELT has been called.
242 * Calls DECODE_FROM_DELT then GET_NEXT_DELT
243 */
244#define DECODE_AND_GET_NEXT(n, idx) \
245 DECODE_FROM_DELT(n, idx); \
246 GET_NEXT_DELT(n) \
247
248/* // ctz & nbBytes is stored in bits[n]
249 * // nbBits is stored in %rax
250 * ctz = CTZ[bits[n]]
251 * nbBits = ctz & 7
252 * nbBytes = ctz >> 3
253 * op[n] += 5
254 * ip[n] -= nbBytes
255 * // Note: x86-64 is little-endian ==> no bswap
256 * bits[n] = MEM_readST(ip[n]) | 1
257 * bits[n] <<= nbBits
258 */
259#define RELOAD_BITS(n) \
260 bsfq %bits##n, %bits##n; \
261 movq %bits##n, %rax; \
262 andq $7, %rax; \
263 shrq $3, %bits##n; \
264 leaq 5(%op##n), %op##n; \
265 subq %bits##n, %ip##n; \
266 movq (%ip##n), %bits##n; \
267 orq $1, %bits##n; \
268 shlx %rax, %bits##n, %bits##n
269
270 /* Store clobbered variables on the stack */
271 movq %olimit, 24(%rsp)
272 movq %ip1, 0(%rsp)
273 movq %ip2, 8(%rsp)
274 movq %ip3, 16(%rsp)
275
276 /* Call GET_NEXT_DELT for each stream */
277 FOR_EACH_STREAM(GET_NEXT_DELT)
278
279 .p2align 6
280
281.L_4X1_loop_body:
282 /* Decode 5 symbols in each of the 4 streams (20 total)
283 * Must have called GET_NEXT_DELT for each stream
284 */
285 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
286 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
287 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
288 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
289 FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
290
291 /* Load ip[1,2,3] from stack (var[] aliases them)
292 * ip[] is needed for RELOAD_BITS
293 * Each will be stored back to the stack after RELOAD
294 */
295 movq 0(%rsp), %ip1
296 movq 8(%rsp), %ip2
297 movq 16(%rsp), %ip3
298
299 /* Reload each stream & fetch the next table entry
300 * to prepare for the next iteration
301 */
302 RELOAD_BITS(0)
303 GET_NEXT_DELT(0)
304
305 RELOAD_BITS(1)
306 movq %ip1, 0(%rsp)
307 GET_NEXT_DELT(1)
308
309 RELOAD_BITS(2)
310 movq %ip2, 8(%rsp)
311 GET_NEXT_DELT(2)
312
313 RELOAD_BITS(3)
314 movq %ip3, 16(%rsp)
315 GET_NEXT_DELT(3)
316
317 /* If op3 < olimit: continue the loop */
318 cmp %op3, 24(%rsp)
319 ja .L_4X1_loop_body
320
321 /* Reload ip[1,2,3] from stack */
322 movq 0(%rsp), %ip1
323 movq 8(%rsp), %ip2
324 movq 16(%rsp), %ip3
325
326 /* Re-compute olimit */
327 jmp .L_4X1_compute_olimit
328
329#undef GET_NEXT_DELT
330#undef DECODE_FROM_DELT
331#undef DECODE
332#undef RELOAD_BITS
333.L_4X1_exit:
334 addq $24, %rsp
335
336 /* Restore stack (oend & olimit) */
337 pop %rax /* olimit */
338 pop %rax /* oend */
f535537f 339 pop %rax /* ilowest */
648db22b 340 pop %rax /* arg */
341
342 /* Save ip / op / bits */
343 movq %ip0, 0(%rax)
344 movq %ip1, 8(%rax)
345 movq %ip2, 16(%rax)
346 movq %ip3, 24(%rax)
347 movq %op0, 32(%rax)
348 movq %op1, 40(%rax)
349 movq %op2, 48(%rax)
350 movq %op3, 56(%rax)
351 movq %bits0, 64(%rax)
352 movq %bits1, 72(%rax)
353 movq %bits2, 80(%rax)
354 movq %bits3, 88(%rax)
355
356 /* Restore registers */
357 pop %r15
358 pop %r14
359 pop %r13
360 pop %r12
361 pop %r11
362 pop %r10
363 pop %r9
364 pop %r8
365 pop %rdi
366 pop %rsi
367 pop %rbp
368 pop %rdx
369 pop %rcx
370 pop %rbx
371 pop %rax
372 ret
373
374_HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
375HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
376 ZSTD_CET_ENDBRANCH
377 /* Save all registers - even if they are callee saved for simplicity. */
378 push %rax
379 push %rbx
380 push %rcx
381 push %rdx
382 push %rbp
383 push %rsi
384 push %rdi
385 push %r8
386 push %r9
387 push %r10
388 push %r11
389 push %r12
390 push %r13
391 push %r14
392 push %r15
393
394 movq %rdi, %rax
395 movq 0(%rax), %ip0
396 movq 8(%rax), %ip1
397 movq 16(%rax), %ip2
398 movq 24(%rax), %ip3
399 movq 32(%rax), %op0
400 movq 40(%rax), %op1
401 movq 48(%rax), %op2
402 movq 56(%rax), %op3
403 movq 64(%rax), %bits0
404 movq 72(%rax), %bits1
405 movq 80(%rax), %bits2
406 movq 88(%rax), %bits3
407 movq 96(%rax), %dtable
408 push %rax /* argument */
409 push %rax /* olimit */
f535537f 410 push 104(%rax) /* ilowest */
648db22b 411
412 movq 112(%rax), %rax
413 push %rax /* oend3 */
414
415 movq %op3, %rax
416 push %rax /* oend2 */
417
418 movq %op2, %rax
419 push %rax /* oend1 */
420
421 movq %op1, %rax
422 push %rax /* oend0 */
423
424 /* Scratch space */
425 subq $8, %rsp
426
427.L_4X2_compute_olimit:
428 /* Computes how many iterations we can do safely
429 * %r15, %rax may be clobbered
430 * rdx must be saved
431 * op[1,2,3,4] & ip0 mustn't be clobbered
432 */
433 movq %rdx, 0(%rsp)
434
435 /* We can consume up to 7 input bytes each iteration. */
436 movq %ip0, %rax /* rax = ip0 */
f535537f 437 movq 40(%rsp), %rdx /* rdx = ilowest */
438 subq %rdx, %rax /* rax = ip0 - ilowest */
439 movq %rax, %r15 /* r15 = ip0 - ilowest */
648db22b 440
441 /* rdx = rax / 7 */
442 movabsq $2635249153387078803, %rdx
443 mulq %rdx
444 subq %rdx, %r15
445 shrq %r15
446 addq %r15, %rdx
447 shrq $2, %rdx
448
f535537f 449 /* r15 = (ip0 - ilowest) / 7 */
648db22b 450 movq %rdx, %r15
451
452 /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
453 movq 8(%rsp), %rax /* rax = oend0 */
454 subq %op0, %rax /* rax = oend0 - op0 */
455 movq 16(%rsp), %rdx /* rdx = oend1 */
456 subq %op1, %rdx /* rdx = oend1 - op1 */
457
458 cmpq %rax, %rdx
459 cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
460
461 movq 24(%rsp), %rax /* rax = oend2 */
462 subq %op2, %rax /* rax = oend2 - op2 */
463
464 cmpq %rax, %rdx
465 cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
466
467 movq 32(%rsp), %rax /* rax = oend3 */
468 subq %op3, %rax /* rax = oend3 - op3 */
469
470 cmpq %rax, %rdx
471 cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
472
473 movabsq $-3689348814741910323, %rax
474 mulq %rdx
475 shrq $3, %rdx /* rdx = rdx / 10 */
476
477 /* r15 = min(%rdx, %r15) */
478 cmpq %rdx, %r15
479 cmova %rdx, %r15
480
481 /* olimit = op3 + 5 * r15 */
482 movq %r15, %rax
483 leaq (%op3, %rax, 4), %olimit
484 addq %rax, %olimit
485
486 movq 0(%rsp), %rdx
487
488 /* If (op3 + 10 > olimit) */
489 movq %op3, %rax /* rax = op3 */
f535537f 490 cmpq %rax, %olimit /* op3 == olimit */
491 je .L_4X2_exit
648db22b 492
493 /* If (ip1 < ip0) go to exit */
494 cmpq %ip0, %ip1
495 jb .L_4X2_exit
496
497 /* If (ip2 < ip1) go to exit */
498 cmpq %ip1, %ip2
499 jb .L_4X2_exit
500
501 /* If (ip3 < ip2) go to exit */
502 cmpq %ip2, %ip3
503 jb .L_4X2_exit
504
505#define DECODE(n, idx) \
506 movq %bits##n, %rax; \
507 shrq $53, %rax; \
508 movzwl 0(%dtable,%rax,4),%r8d; \
509 movzbl 2(%dtable,%rax,4),%r15d; \
510 movzbl 3(%dtable,%rax,4),%eax; \
511 movw %r8w, (%op##n); \
512 shlxq %r15, %bits##n, %bits##n; \
513 addq %rax, %op##n
514
515#define RELOAD_BITS(n) \
516 bsfq %bits##n, %bits##n; \
517 movq %bits##n, %rax; \
518 shrq $3, %bits##n; \
519 andq $7, %rax; \
520 subq %bits##n, %ip##n; \
521 movq (%ip##n), %bits##n; \
522 orq $1, %bits##n; \
523 shlxq %rax, %bits##n, %bits##n
524
525
526 movq %olimit, 48(%rsp)
527
528 .p2align 6
529
530.L_4X2_loop_body:
531 /* We clobber r8, so store it on the stack */
532 movq %r8, 0(%rsp)
533
534 /* Decode 5 symbols from each of the 4 streams (20 symbols total). */
535 FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
536 FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
537 FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
538 FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
539 FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
540
541 /* Reload r8 */
542 movq 0(%rsp), %r8
543
544 FOR_EACH_STREAM(RELOAD_BITS)
545
546 cmp %op3, 48(%rsp)
547 ja .L_4X2_loop_body
548 jmp .L_4X2_compute_olimit
549
550#undef DECODE
551#undef RELOAD_BITS
552.L_4X2_exit:
553 addq $8, %rsp
554 /* Restore stack (oend & olimit) */
555 pop %rax /* oend0 */
556 pop %rax /* oend1 */
557 pop %rax /* oend2 */
558 pop %rax /* oend3 */
f535537f 559 pop %rax /* ilowest */
648db22b 560 pop %rax /* olimit */
561 pop %rax /* arg */
562
563 /* Save ip / op / bits */
564 movq %ip0, 0(%rax)
565 movq %ip1, 8(%rax)
566 movq %ip2, 16(%rax)
567 movq %ip3, 24(%rax)
568 movq %op0, 32(%rax)
569 movq %op1, 40(%rax)
570 movq %op2, 48(%rax)
571 movq %op3, 56(%rax)
572 movq %bits0, 64(%rax)
573 movq %bits1, 72(%rax)
574 movq %bits2, 80(%rax)
575 movq %bits3, 88(%rax)
576
577 /* Restore registers */
578 pop %r15
579 pop %r14
580 pop %r13
581 pop %r12
582 pop %r11
583 pop %r10
584 pop %r9
585 pop %r8
586 pop %rdi
587 pop %rsi
588 pop %rbp
589 pop %rdx
590 pop %rcx
591 pop %rbx
592 pop %rax
593 ret
594
595#endif