| 1 | /* |
| 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
| 3 | * All rights reserved. |
| 4 | * |
| 5 | * This source code is licensed under both the BSD-style license (found in the |
| 6 | * LICENSE file in the root directory of this source tree) and the GPLv2 (found |
| 7 | * in the COPYING file in the root directory of this source tree). |
| 8 | * You may select, at your option, one of the above-listed licenses. |
| 9 | */ |
| 10 | |
| 11 | #include "../common/portability_macros.h" |
| 12 | |
| 13 | /* Stack marking |
| 14 | * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart |
| 15 | */ |
| 16 | #if defined(__ELF__) && defined(__GNUC__) |
| 17 | .section .note.GNU-stack,"",%progbits |
| 18 | #endif |
| 19 | |
| 20 | #if ZSTD_ENABLE_ASM_X86_64_BMI2 |
| 21 | |
| 22 | /* Calling convention: |
| 23 | * |
| 24 | * %rdi contains the first argument: HUF_DecompressAsmArgs*. |
| 25 | * %rbp isn't maintained (no frame pointer). |
| 26 | * %rsp contains the stack pointer that grows down. |
| 27 | * No red-zone is assumed, only addresses >= %rsp are used. |
| 28 | * All register contents are preserved. |
| 29 | * |
| 30 | * TODO: Support Windows calling convention. |
| 31 | */ |
| 32 | |
| 33 | ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop) |
| 34 | ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop) |
| 35 | ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop) |
| 36 | ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop) |
| 37 | .global HUF_decompress4X1_usingDTable_internal_fast_asm_loop |
| 38 | .global HUF_decompress4X2_usingDTable_internal_fast_asm_loop |
| 39 | .global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop |
| 40 | .global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop |
| 41 | .text |
| 42 | |
| 43 | /* Sets up register mappings for clarity. |
| 44 | * op[], bits[], dtable & ip[0] each get their own register. |
| 45 | * ip[1,2,3] & olimit alias var[]. |
| 46 | * %rax is a scratch register. |
| 47 | */ |
| 48 | |
| 49 | #define op0 rsi |
| 50 | #define op1 rbx |
| 51 | #define op2 rcx |
| 52 | #define op3 rdi |
| 53 | |
| 54 | #define ip0 r8 |
| 55 | #define ip1 r9 |
| 56 | #define ip2 r10 |
| 57 | #define ip3 r11 |
| 58 | |
| 59 | #define bits0 rbp |
| 60 | #define bits1 rdx |
| 61 | #define bits2 r12 |
| 62 | #define bits3 r13 |
| 63 | #define dtable r14 |
| 64 | #define olimit r15 |
| 65 | |
| 66 | /* var[] aliases ip[1,2,3] & olimit |
| 67 | * ip[1,2,3] are saved every iteration. |
| 68 | * olimit is only used in compute_olimit. |
| 69 | */ |
| 70 | #define var0 r15 |
| 71 | #define var1 r9 |
| 72 | #define var2 r10 |
| 73 | #define var3 r11 |
| 74 | |
| 75 | /* 32-bit var registers */ |
| 76 | #define vard0 r15d |
| 77 | #define vard1 r9d |
| 78 | #define vard2 r10d |
| 79 | #define vard3 r11d |
| 80 | |
| 81 | /* Calls X(N) for each stream 0, 1, 2, 3. */ |
| 82 | #define FOR_EACH_STREAM(X) \ |
| 83 | X(0); \ |
| 84 | X(1); \ |
| 85 | X(2); \ |
| 86 | X(3) |
| 87 | |
| 88 | /* Calls X(N, idx) for each stream 0, 1, 2, 3. */ |
| 89 | #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \ |
| 90 | X(0, idx); \ |
| 91 | X(1, idx); \ |
| 92 | X(2, idx); \ |
| 93 | X(3, idx) |
| 94 | |
| 95 | /* Define both _HUF_* & HUF_* symbols because MacOS |
| 96 | * C symbols are prefixed with '_' & Linux symbols aren't. |
| 97 | */ |
| 98 | _HUF_decompress4X1_usingDTable_internal_fast_asm_loop: |
| 99 | HUF_decompress4X1_usingDTable_internal_fast_asm_loop: |
| 100 | ZSTD_CET_ENDBRANCH |
| 101 | /* Save all registers - even if they are callee saved for simplicity. */ |
| 102 | push %rax |
| 103 | push %rbx |
| 104 | push %rcx |
| 105 | push %rdx |
| 106 | push %rbp |
| 107 | push %rsi |
| 108 | push %rdi |
| 109 | push %r8 |
| 110 | push %r9 |
| 111 | push %r10 |
| 112 | push %r11 |
| 113 | push %r12 |
| 114 | push %r13 |
| 115 | push %r14 |
| 116 | push %r15 |
| 117 | |
| 118 | /* Read HUF_DecompressAsmArgs* args from %rax */ |
| 119 | movq %rdi, %rax |
| 120 | movq 0(%rax), %ip0 |
| 121 | movq 8(%rax), %ip1 |
| 122 | movq 16(%rax), %ip2 |
| 123 | movq 24(%rax), %ip3 |
| 124 | movq 32(%rax), %op0 |
| 125 | movq 40(%rax), %op1 |
| 126 | movq 48(%rax), %op2 |
| 127 | movq 56(%rax), %op3 |
| 128 | movq 64(%rax), %bits0 |
| 129 | movq 72(%rax), %bits1 |
| 130 | movq 80(%rax), %bits2 |
| 131 | movq 88(%rax), %bits3 |
| 132 | movq 96(%rax), %dtable |
| 133 | push %rax /* argument */ |
| 134 | push 104(%rax) /* ilimit */ |
| 135 | push 112(%rax) /* oend */ |
| 136 | push %olimit /* olimit space */ |
| 137 | |
| 138 | subq $24, %rsp |
| 139 | |
| 140 | .L_4X1_compute_olimit: |
| 141 | /* Computes how many iterations we can do safely |
| 142 | * %r15, %rax may be clobbered |
| 143 | * rbx, rdx must be saved |
| 144 | * op3 & ip0 mustn't be clobbered |
| 145 | */ |
| 146 | movq %rbx, 0(%rsp) |
| 147 | movq %rdx, 8(%rsp) |
| 148 | |
| 149 | movq 32(%rsp), %rax /* rax = oend */ |
| 150 | subq %op3, %rax /* rax = oend - op3 */ |
| 151 | |
| 152 | /* r15 = (oend - op3) / 5 */ |
| 153 | movabsq $-3689348814741910323, %rdx |
| 154 | mulq %rdx |
| 155 | movq %rdx, %r15 |
| 156 | shrq $2, %r15 |
| 157 | |
| 158 | movq %ip0, %rax /* rax = ip0 */ |
| 159 | movq 40(%rsp), %rdx /* rdx = ilimit */ |
| 160 | subq %rdx, %rax /* rax = ip0 - ilimit */ |
| 161 | movq %rax, %rbx /* rbx = ip0 - ilimit */ |
| 162 | |
| 163 | /* rdx = (ip0 - ilimit) / 7 */ |
| 164 | movabsq $2635249153387078803, %rdx |
| 165 | mulq %rdx |
| 166 | subq %rdx, %rbx |
| 167 | shrq %rbx |
| 168 | addq %rbx, %rdx |
| 169 | shrq $2, %rdx |
| 170 | |
| 171 | /* r15 = min(%rdx, %r15) */ |
| 172 | cmpq %rdx, %r15 |
| 173 | cmova %rdx, %r15 |
| 174 | |
| 175 | /* r15 = r15 * 5 */ |
| 176 | leaq (%r15, %r15, 4), %r15 |
| 177 | |
| 178 | /* olimit = op3 + r15 */ |
| 179 | addq %op3, %olimit |
| 180 | |
| 181 | movq 8(%rsp), %rdx |
| 182 | movq 0(%rsp), %rbx |
| 183 | |
| 184 | /* If (op3 + 20 > olimit) */ |
| 185 | movq %op3, %rax /* rax = op3 */ |
| 186 | addq $20, %rax /* rax = op3 + 20 */ |
| 187 | cmpq %rax, %olimit /* op3 + 20 > olimit */ |
| 188 | jb .L_4X1_exit |
| 189 | |
| 190 | /* If (ip1 < ip0) go to exit */ |
| 191 | cmpq %ip0, %ip1 |
| 192 | jb .L_4X1_exit |
| 193 | |
| 194 | /* If (ip2 < ip1) go to exit */ |
| 195 | cmpq %ip1, %ip2 |
| 196 | jb .L_4X1_exit |
| 197 | |
| 198 | /* If (ip3 < ip2) go to exit */ |
| 199 | cmpq %ip2, %ip3 |
| 200 | jb .L_4X1_exit |
| 201 | |
| 202 | /* Reads top 11 bits from bits[n] |
| 203 | * Loads dt[bits[n]] into var[n] |
| 204 | */ |
| 205 | #define GET_NEXT_DELT(n) \ |
| 206 | movq $53, %var##n; \ |
| 207 | shrxq %var##n, %bits##n, %var##n; \ |
| 208 | movzwl (%dtable,%var##n,2),%vard##n |
| 209 | |
| 210 | /* var[n] must contain the DTable entry computed with GET_NEXT_DELT |
| 211 | * Moves var[n] to %rax |
| 212 | * bits[n] <<= var[n] & 63 |
| 213 | * op[n][idx] = %rax >> 8 |
| 214 | * %ah is a way to access bits [8, 16) of %rax |
| 215 | */ |
| 216 | #define DECODE_FROM_DELT(n, idx) \ |
| 217 | movq %var##n, %rax; \ |
| 218 | shlxq %var##n, %bits##n, %bits##n; \ |
| 219 | movb %ah, idx(%op##n) |
| 220 | |
| 221 | /* Assumes GET_NEXT_DELT has been called. |
| 222 | * Calls DECODE_FROM_DELT then GET_NEXT_DELT |
| 223 | */ |
| 224 | #define DECODE_AND_GET_NEXT(n, idx) \ |
| 225 | DECODE_FROM_DELT(n, idx); \ |
| 226 | GET_NEXT_DELT(n) \ |
| 227 | |
| 228 | /* // ctz & nbBytes is stored in bits[n] |
| 229 | * // nbBits is stored in %rax |
| 230 | * ctz = CTZ[bits[n]] |
| 231 | * nbBits = ctz & 7 |
| 232 | * nbBytes = ctz >> 3 |
| 233 | * op[n] += 5 |
| 234 | * ip[n] -= nbBytes |
| 235 | * // Note: x86-64 is little-endian ==> no bswap |
| 236 | * bits[n] = MEM_readST(ip[n]) | 1 |
| 237 | * bits[n] <<= nbBits |
| 238 | */ |
| 239 | #define RELOAD_BITS(n) \ |
| 240 | bsfq %bits##n, %bits##n; \ |
| 241 | movq %bits##n, %rax; \ |
| 242 | andq $7, %rax; \ |
| 243 | shrq $3, %bits##n; \ |
| 244 | leaq 5(%op##n), %op##n; \ |
| 245 | subq %bits##n, %ip##n; \ |
| 246 | movq (%ip##n), %bits##n; \ |
| 247 | orq $1, %bits##n; \ |
| 248 | shlx %rax, %bits##n, %bits##n |
| 249 | |
| 250 | /* Store clobbered variables on the stack */ |
| 251 | movq %olimit, 24(%rsp) |
| 252 | movq %ip1, 0(%rsp) |
| 253 | movq %ip2, 8(%rsp) |
| 254 | movq %ip3, 16(%rsp) |
| 255 | |
| 256 | /* Call GET_NEXT_DELT for each stream */ |
| 257 | FOR_EACH_STREAM(GET_NEXT_DELT) |
| 258 | |
| 259 | .p2align 6 |
| 260 | |
| 261 | .L_4X1_loop_body: |
| 262 | /* Decode 5 symbols in each of the 4 streams (20 total) |
| 263 | * Must have called GET_NEXT_DELT for each stream |
| 264 | */ |
| 265 | FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0) |
| 266 | FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1) |
| 267 | FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2) |
| 268 | FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3) |
| 269 | FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4) |
| 270 | |
| 271 | /* Load ip[1,2,3] from stack (var[] aliases them) |
| 272 | * ip[] is needed for RELOAD_BITS |
| 273 | * Each will be stored back to the stack after RELOAD |
| 274 | */ |
| 275 | movq 0(%rsp), %ip1 |
| 276 | movq 8(%rsp), %ip2 |
| 277 | movq 16(%rsp), %ip3 |
| 278 | |
| 279 | /* Reload each stream & fetch the next table entry |
| 280 | * to prepare for the next iteration |
| 281 | */ |
| 282 | RELOAD_BITS(0) |
| 283 | GET_NEXT_DELT(0) |
| 284 | |
| 285 | RELOAD_BITS(1) |
| 286 | movq %ip1, 0(%rsp) |
| 287 | GET_NEXT_DELT(1) |
| 288 | |
| 289 | RELOAD_BITS(2) |
| 290 | movq %ip2, 8(%rsp) |
| 291 | GET_NEXT_DELT(2) |
| 292 | |
| 293 | RELOAD_BITS(3) |
| 294 | movq %ip3, 16(%rsp) |
| 295 | GET_NEXT_DELT(3) |
| 296 | |
| 297 | /* If op3 < olimit: continue the loop */ |
| 298 | cmp %op3, 24(%rsp) |
| 299 | ja .L_4X1_loop_body |
| 300 | |
| 301 | /* Reload ip[1,2,3] from stack */ |
| 302 | movq 0(%rsp), %ip1 |
| 303 | movq 8(%rsp), %ip2 |
| 304 | movq 16(%rsp), %ip3 |
| 305 | |
| 306 | /* Re-compute olimit */ |
| 307 | jmp .L_4X1_compute_olimit |
| 308 | |
| 309 | #undef GET_NEXT_DELT |
| 310 | #undef DECODE_FROM_DELT |
| 311 | #undef DECODE |
| 312 | #undef RELOAD_BITS |
| 313 | .L_4X1_exit: |
| 314 | addq $24, %rsp |
| 315 | |
| 316 | /* Restore stack (oend & olimit) */ |
| 317 | pop %rax /* olimit */ |
| 318 | pop %rax /* oend */ |
| 319 | pop %rax /* ilimit */ |
| 320 | pop %rax /* arg */ |
| 321 | |
| 322 | /* Save ip / op / bits */ |
| 323 | movq %ip0, 0(%rax) |
| 324 | movq %ip1, 8(%rax) |
| 325 | movq %ip2, 16(%rax) |
| 326 | movq %ip3, 24(%rax) |
| 327 | movq %op0, 32(%rax) |
| 328 | movq %op1, 40(%rax) |
| 329 | movq %op2, 48(%rax) |
| 330 | movq %op3, 56(%rax) |
| 331 | movq %bits0, 64(%rax) |
| 332 | movq %bits1, 72(%rax) |
| 333 | movq %bits2, 80(%rax) |
| 334 | movq %bits3, 88(%rax) |
| 335 | |
| 336 | /* Restore registers */ |
| 337 | pop %r15 |
| 338 | pop %r14 |
| 339 | pop %r13 |
| 340 | pop %r12 |
| 341 | pop %r11 |
| 342 | pop %r10 |
| 343 | pop %r9 |
| 344 | pop %r8 |
| 345 | pop %rdi |
| 346 | pop %rsi |
| 347 | pop %rbp |
| 348 | pop %rdx |
| 349 | pop %rcx |
| 350 | pop %rbx |
| 351 | pop %rax |
| 352 | ret |
| 353 | |
| 354 | _HUF_decompress4X2_usingDTable_internal_fast_asm_loop: |
| 355 | HUF_decompress4X2_usingDTable_internal_fast_asm_loop: |
| 356 | ZSTD_CET_ENDBRANCH |
| 357 | /* Save all registers - even if they are callee saved for simplicity. */ |
| 358 | push %rax |
| 359 | push %rbx |
| 360 | push %rcx |
| 361 | push %rdx |
| 362 | push %rbp |
| 363 | push %rsi |
| 364 | push %rdi |
| 365 | push %r8 |
| 366 | push %r9 |
| 367 | push %r10 |
| 368 | push %r11 |
| 369 | push %r12 |
| 370 | push %r13 |
| 371 | push %r14 |
| 372 | push %r15 |
| 373 | |
| 374 | movq %rdi, %rax |
| 375 | movq 0(%rax), %ip0 |
| 376 | movq 8(%rax), %ip1 |
| 377 | movq 16(%rax), %ip2 |
| 378 | movq 24(%rax), %ip3 |
| 379 | movq 32(%rax), %op0 |
| 380 | movq 40(%rax), %op1 |
| 381 | movq 48(%rax), %op2 |
| 382 | movq 56(%rax), %op3 |
| 383 | movq 64(%rax), %bits0 |
| 384 | movq 72(%rax), %bits1 |
| 385 | movq 80(%rax), %bits2 |
| 386 | movq 88(%rax), %bits3 |
| 387 | movq 96(%rax), %dtable |
| 388 | push %rax /* argument */ |
| 389 | push %rax /* olimit */ |
| 390 | push 104(%rax) /* ilimit */ |
| 391 | |
| 392 | movq 112(%rax), %rax |
| 393 | push %rax /* oend3 */ |
| 394 | |
| 395 | movq %op3, %rax |
| 396 | push %rax /* oend2 */ |
| 397 | |
| 398 | movq %op2, %rax |
| 399 | push %rax /* oend1 */ |
| 400 | |
| 401 | movq %op1, %rax |
| 402 | push %rax /* oend0 */ |
| 403 | |
| 404 | /* Scratch space */ |
| 405 | subq $8, %rsp |
| 406 | |
| 407 | .L_4X2_compute_olimit: |
| 408 | /* Computes how many iterations we can do safely |
| 409 | * %r15, %rax may be clobbered |
| 410 | * rdx must be saved |
| 411 | * op[1,2,3,4] & ip0 mustn't be clobbered |
| 412 | */ |
| 413 | movq %rdx, 0(%rsp) |
| 414 | |
| 415 | /* We can consume up to 7 input bytes each iteration. */ |
| 416 | movq %ip0, %rax /* rax = ip0 */ |
| 417 | movq 40(%rsp), %rdx /* rdx = ilimit */ |
| 418 | subq %rdx, %rax /* rax = ip0 - ilimit */ |
| 419 | movq %rax, %r15 /* r15 = ip0 - ilimit */ |
| 420 | |
| 421 | /* rdx = rax / 7 */ |
| 422 | movabsq $2635249153387078803, %rdx |
| 423 | mulq %rdx |
| 424 | subq %rdx, %r15 |
| 425 | shrq %r15 |
| 426 | addq %r15, %rdx |
| 427 | shrq $2, %rdx |
| 428 | |
| 429 | /* r15 = (ip0 - ilimit) / 7 */ |
| 430 | movq %rdx, %r15 |
| 431 | |
| 432 | /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */ |
| 433 | movq 8(%rsp), %rax /* rax = oend0 */ |
| 434 | subq %op0, %rax /* rax = oend0 - op0 */ |
| 435 | movq 16(%rsp), %rdx /* rdx = oend1 */ |
| 436 | subq %op1, %rdx /* rdx = oend1 - op1 */ |
| 437 | |
| 438 | cmpq %rax, %rdx |
| 439 | cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ |
| 440 | |
| 441 | movq 24(%rsp), %rax /* rax = oend2 */ |
| 442 | subq %op2, %rax /* rax = oend2 - op2 */ |
| 443 | |
| 444 | cmpq %rax, %rdx |
| 445 | cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ |
| 446 | |
| 447 | movq 32(%rsp), %rax /* rax = oend3 */ |
| 448 | subq %op3, %rax /* rax = oend3 - op3 */ |
| 449 | |
| 450 | cmpq %rax, %rdx |
| 451 | cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ |
| 452 | |
| 453 | movabsq $-3689348814741910323, %rax |
| 454 | mulq %rdx |
| 455 | shrq $3, %rdx /* rdx = rdx / 10 */ |
| 456 | |
| 457 | /* r15 = min(%rdx, %r15) */ |
| 458 | cmpq %rdx, %r15 |
| 459 | cmova %rdx, %r15 |
| 460 | |
| 461 | /* olimit = op3 + 5 * r15 */ |
| 462 | movq %r15, %rax |
| 463 | leaq (%op3, %rax, 4), %olimit |
| 464 | addq %rax, %olimit |
| 465 | |
| 466 | movq 0(%rsp), %rdx |
| 467 | |
| 468 | /* If (op3 + 10 > olimit) */ |
| 469 | movq %op3, %rax /* rax = op3 */ |
| 470 | addq $10, %rax /* rax = op3 + 10 */ |
| 471 | cmpq %rax, %olimit /* op3 + 10 > olimit */ |
| 472 | jb .L_4X2_exit |
| 473 | |
| 474 | /* If (ip1 < ip0) go to exit */ |
| 475 | cmpq %ip0, %ip1 |
| 476 | jb .L_4X2_exit |
| 477 | |
| 478 | /* If (ip2 < ip1) go to exit */ |
| 479 | cmpq %ip1, %ip2 |
| 480 | jb .L_4X2_exit |
| 481 | |
| 482 | /* If (ip3 < ip2) go to exit */ |
| 483 | cmpq %ip2, %ip3 |
| 484 | jb .L_4X2_exit |
| 485 | |
| 486 | #define DECODE(n, idx) \ |
| 487 | movq %bits##n, %rax; \ |
| 488 | shrq $53, %rax; \ |
| 489 | movzwl 0(%dtable,%rax,4),%r8d; \ |
| 490 | movzbl 2(%dtable,%rax,4),%r15d; \ |
| 491 | movzbl 3(%dtable,%rax,4),%eax; \ |
| 492 | movw %r8w, (%op##n); \ |
| 493 | shlxq %r15, %bits##n, %bits##n; \ |
| 494 | addq %rax, %op##n |
| 495 | |
| 496 | #define RELOAD_BITS(n) \ |
| 497 | bsfq %bits##n, %bits##n; \ |
| 498 | movq %bits##n, %rax; \ |
| 499 | shrq $3, %bits##n; \ |
| 500 | andq $7, %rax; \ |
| 501 | subq %bits##n, %ip##n; \ |
| 502 | movq (%ip##n), %bits##n; \ |
| 503 | orq $1, %bits##n; \ |
| 504 | shlxq %rax, %bits##n, %bits##n |
| 505 | |
| 506 | |
| 507 | movq %olimit, 48(%rsp) |
| 508 | |
| 509 | .p2align 6 |
| 510 | |
| 511 | .L_4X2_loop_body: |
| 512 | /* We clobber r8, so store it on the stack */ |
| 513 | movq %r8, 0(%rsp) |
| 514 | |
| 515 | /* Decode 5 symbols from each of the 4 streams (20 symbols total). */ |
| 516 | FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) |
| 517 | FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) |
| 518 | FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) |
| 519 | FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) |
| 520 | FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) |
| 521 | |
| 522 | /* Reload r8 */ |
| 523 | movq 0(%rsp), %r8 |
| 524 | |
| 525 | FOR_EACH_STREAM(RELOAD_BITS) |
| 526 | |
| 527 | cmp %op3, 48(%rsp) |
| 528 | ja .L_4X2_loop_body |
| 529 | jmp .L_4X2_compute_olimit |
| 530 | |
| 531 | #undef DECODE |
| 532 | #undef RELOAD_BITS |
| 533 | .L_4X2_exit: |
| 534 | addq $8, %rsp |
| 535 | /* Restore stack (oend & olimit) */ |
| 536 | pop %rax /* oend0 */ |
| 537 | pop %rax /* oend1 */ |
| 538 | pop %rax /* oend2 */ |
| 539 | pop %rax /* oend3 */ |
| 540 | pop %rax /* ilimit */ |
| 541 | pop %rax /* olimit */ |
| 542 | pop %rax /* arg */ |
| 543 | |
| 544 | /* Save ip / op / bits */ |
| 545 | movq %ip0, 0(%rax) |
| 546 | movq %ip1, 8(%rax) |
| 547 | movq %ip2, 16(%rax) |
| 548 | movq %ip3, 24(%rax) |
| 549 | movq %op0, 32(%rax) |
| 550 | movq %op1, 40(%rax) |
| 551 | movq %op2, 48(%rax) |
| 552 | movq %op3, 56(%rax) |
| 553 | movq %bits0, 64(%rax) |
| 554 | movq %bits1, 72(%rax) |
| 555 | movq %bits2, 80(%rax) |
| 556 | movq %bits3, 88(%rax) |
| 557 | |
| 558 | /* Restore registers */ |
| 559 | pop %r15 |
| 560 | pop %r14 |
| 561 | pop %r13 |
| 562 | pop %r12 |
| 563 | pop %r11 |
| 564 | pop %r10 |
| 565 | pop %r9 |
| 566 | pop %r8 |
| 567 | pop %rdi |
| 568 | pop %rsi |
| 569 | pop %rbp |
| 570 | pop %rdx |
| 571 | pop %rcx |
| 572 | pop %rbx |
| 573 | pop %rax |
| 574 | ret |
| 575 | |
| 576 | #endif |