2 * Copyright (c) Meta Platforms, Inc. and affiliates.
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
11 #include "../common/portability_macros.h"
14 * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
16 #if defined(__ELF__) && defined(__GNUC__)
17 .section .note.GNU-stack,"",%progbits
20 #if ZSTD_ENABLE_ASM_X86_64_BMI2
22 /* Calling convention:
24 * %rdi contains the first argument: HUF_DecompressAsmArgs*.
25 * %rbp isn't maintained (no frame pointer).
26 * %rsp contains the stack pointer that grows down.
27 * No red-zone is assumed, only addresses >= %rsp are used.
28 * All register contents are preserved.
30 * TODO: Support Windows calling convention.
33 ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
34 ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
35 ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
36 ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
37 .global HUF_decompress4X1_usingDTable_internal_fast_asm_loop
38 .global HUF_decompress4X2_usingDTable_internal_fast_asm_loop
39 .global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop
40 .global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
43 /* Sets up register mappings for clarity.
44 * op[], bits[], dtable & ip[0] each get their own register.
45 * ip[1,2,3] & olimit alias var[].
46 * %rax is a scratch register.
66 /* var[] aliases ip[1,2,3] & olimit
67 * ip[1,2,3] are saved every iteration.
68 * olimit is only used in compute_olimit.
75 /* 32-bit var registers */
81 /* Calls X(N) for each stream 0, 1, 2, 3. */
82 #define FOR_EACH_STREAM(X) \
88 /* Calls X(N, idx) for each stream 0, 1, 2, 3. */
89 #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
95 /* Define both _HUF_* & HUF_* symbols because MacOS
96 * C symbols are prefixed with '_' & Linux symbols aren't.
98 _HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
99 HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
101 /* Save all registers - even if they are callee saved for simplicity. */
118 /* Read HUF_DecompressAsmArgs* args from %rax */
128 movq 64(%rax), %bits0
129 movq 72(%rax), %bits1
130 movq 80(%rax), %bits2
131 movq 88(%rax), %bits3
132 movq 96(%rax), %dtable
133 push %rax /* argument */
134 push 104(%rax) /* ilimit */
135 push 112(%rax) /* oend */
136 push %olimit /* olimit space */
140 .L_4X1_compute_olimit:
141 /* Computes how many iterations we can do safely
142 * %r15, %rax may be clobbered
143 * rbx, rdx must be saved
144 * op3 & ip0 mustn't be clobbered
149 movq 32(%rsp), %rax /* rax = oend */
150 subq %op3, %rax /* rax = oend - op3 */
152 /* r15 = (oend - op3) / 5 */
153 movabsq $-3689348814741910323, %rdx
158 movq %ip0, %rax /* rax = ip0 */
159 movq 40(%rsp), %rdx /* rdx = ilimit */
160 subq %rdx, %rax /* rax = ip0 - ilimit */
161 movq %rax, %rbx /* rbx = ip0 - ilimit */
163 /* rdx = (ip0 - ilimit) / 7 */
164 movabsq $2635249153387078803, %rdx
171 /* r15 = min(%rdx, %r15) */
176 leaq (%r15, %r15, 4), %r15
178 /* olimit = op3 + r15 */
184 /* If (op3 + 20 > olimit) */
185 movq %op3, %rax /* rax = op3 */
186 addq $20, %rax /* rax = op3 + 20 */
187 cmpq %rax, %olimit /* op3 + 20 > olimit */
190 /* If (ip1 < ip0) go to exit */
194 /* If (ip2 < ip1) go to exit */
198 /* If (ip3 < ip2) go to exit */
202 /* Reads top 11 bits from bits[n]
203 * Loads dt[bits[n]] into var[n]
205 #define GET_NEXT_DELT(n) \
207 shrxq %var##n, %bits##n, %var##n; \
208 movzwl (%dtable,%var##n,2),%vard##n
210 /* var[n] must contain the DTable entry computed with GET_NEXT_DELT
211 * Moves var[n] to %rax
212 * bits[n] <<= var[n] & 63
213 * op[n][idx] = %rax >> 8
214 * %ah is a way to access bits [8, 16) of %rax
216 #define DECODE_FROM_DELT(n, idx) \
217 movq %var##n, %rax; \
218 shlxq %var##n, %bits##n, %bits##n; \
219 movb %ah, idx(%op##n)
221 /* Assumes GET_NEXT_DELT has been called.
222 * Calls DECODE_FROM_DELT then GET_NEXT_DELT
224 #define DECODE_AND_GET_NEXT(n, idx) \
225 DECODE_FROM_DELT(n, idx); \
228 /* // ctz & nbBytes is stored in bits[n]
229 * // nbBits is stored in %rax
235 * // Note: x86-64 is little-endian ==> no bswap
236 * bits[n] = MEM_readST(ip[n]) | 1
239 #define RELOAD_BITS(n) \
240 bsfq %bits##n, %bits##n; \
241 movq %bits##n, %rax; \
244 leaq 5(%op##n), %op##n; \
245 subq %bits##n, %ip##n; \
246 movq (%ip##n), %bits##n; \
248 shlx %rax, %bits##n, %bits##n
250 /* Store clobbered variables on the stack */
251 movq %olimit, 24(%rsp)
256 /* Call GET_NEXT_DELT for each stream */
257 FOR_EACH_STREAM(GET_NEXT_DELT)
262 /* Decode 5 symbols in each of the 4 streams (20 total)
263 * Must have called GET_NEXT_DELT for each stream
265 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
266 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
267 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
268 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
269 FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
271 /* Load ip[1,2,3] from stack (var[] aliases them)
272 * ip[] is needed for RELOAD_BITS
273 * Each will be stored back to the stack after RELOAD
279 /* Reload each stream & fetch the next table entry
280 * to prepare for the next iteration
297 /* If op3 < olimit: continue the loop */
301 /* Reload ip[1,2,3] from stack */
306 /* Re-compute olimit */
307 jmp .L_4X1_compute_olimit
310 #undef DECODE_FROM_DELT
316 /* Restore stack (oend & olimit) */
317 pop %rax /* olimit */
319 pop %rax /* ilimit */
322 /* Save ip / op / bits */
331 movq %bits0, 64(%rax)
332 movq %bits1, 72(%rax)
333 movq %bits2, 80(%rax)
334 movq %bits3, 88(%rax)
336 /* Restore registers */
354 _HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
355 HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
357 /* Save all registers - even if they are callee saved for simplicity. */
383 movq 64(%rax), %bits0
384 movq 72(%rax), %bits1
385 movq 80(%rax), %bits2
386 movq 88(%rax), %bits3
387 movq 96(%rax), %dtable
388 push %rax /* argument */
389 push %rax /* olimit */
390 push 104(%rax) /* ilimit */
393 push %rax /* oend3 */
396 push %rax /* oend2 */
399 push %rax /* oend1 */
402 push %rax /* oend0 */
407 .L_4X2_compute_olimit:
408 /* Computes how many iterations we can do safely
409 * %r15, %rax may be clobbered
411 * op[1,2,3,4] & ip0 mustn't be clobbered
415 /* We can consume up to 7 input bytes each iteration. */
416 movq %ip0, %rax /* rax = ip0 */
417 movq 40(%rsp), %rdx /* rdx = ilimit */
418 subq %rdx, %rax /* rax = ip0 - ilimit */
419 movq %rax, %r15 /* r15 = ip0 - ilimit */
422 movabsq $2635249153387078803, %rdx
429 /* r15 = (ip0 - ilimit) / 7 */
432 /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
433 movq 8(%rsp), %rax /* rax = oend0 */
434 subq %op0, %rax /* rax = oend0 - op0 */
435 movq 16(%rsp), %rdx /* rdx = oend1 */
436 subq %op1, %rdx /* rdx = oend1 - op1 */
439 cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
441 movq 24(%rsp), %rax /* rax = oend2 */
442 subq %op2, %rax /* rax = oend2 - op2 */
445 cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
447 movq 32(%rsp), %rax /* rax = oend3 */
448 subq %op3, %rax /* rax = oend3 - op3 */
451 cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
453 movabsq $-3689348814741910323, %rax
455 shrq $3, %rdx /* rdx = rdx / 10 */
457 /* r15 = min(%rdx, %r15) */
461 /* olimit = op3 + 5 * r15 */
463 leaq (%op3, %rax, 4), %olimit
468 /* If (op3 + 10 > olimit) */
469 movq %op3, %rax /* rax = op3 */
470 addq $10, %rax /* rax = op3 + 10 */
471 cmpq %rax, %olimit /* op3 + 10 > olimit */
474 /* If (ip1 < ip0) go to exit */
478 /* If (ip2 < ip1) go to exit */
482 /* If (ip3 < ip2) go to exit */
486 #define DECODE(n, idx) \
487 movq %bits##n, %rax; \
489 movzwl 0(%dtable,%rax,4),%r8d; \
490 movzbl 2(%dtable,%rax,4),%r15d; \
491 movzbl 3(%dtable,%rax,4),%eax; \
492 movw %r8w, (%op##n); \
493 shlxq %r15, %bits##n, %bits##n; \
496 #define RELOAD_BITS(n) \
497 bsfq %bits##n, %bits##n; \
498 movq %bits##n, %rax; \
501 subq %bits##n, %ip##n; \
502 movq (%ip##n), %bits##n; \
504 shlxq %rax, %bits##n, %bits##n
507 movq %olimit, 48(%rsp)
512 /* We clobber r8, so store it on the stack */
515 /* Decode 5 symbols from each of the 4 streams (20 symbols total). */
516 FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
517 FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
518 FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
519 FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
520 FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
525 FOR_EACH_STREAM(RELOAD_BITS)
529 jmp .L_4X2_compute_olimit
535 /* Restore stack (oend & olimit) */
540 pop %rax /* ilimit */
541 pop %rax /* olimit */
544 /* Save ip / op / bits */
553 movq %bits0, 64(%rax)
554 movq %bits1, 72(%rax)
555 movq %bits2, 80(%rax)
556 movq %bits3, 88(%rax)
558 /* Restore registers */