648db22b |
1 | /* |
2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | * All rights reserved. |
4 | * |
5 | * This source code is licensed under both the BSD-style license (found in the |
6 | * LICENSE file in the root directory of this source tree) and the GPLv2 (found |
7 | * in the COPYING file in the root directory of this source tree). |
8 | * You may select, at your option, one of the above-listed licenses. |
9 | */ |
10 | |
11 | #include "../common/portability_macros.h" |
12 | |
f535537f |
13 | #if defined(__ELF__) && defined(__GNUC__) |
648db22b |
14 | /* Stack marking: the empty .note.GNU-stack section below declares that this object does not need an executable stack. |
15 | * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart |
16 | */ |
648db22b |
17 | .section .note.GNU-stack,"",%progbits |
f535537f |
18 | |
19 | #if defined(__aarch64__) |
20 | /* Mark that this assembly supports BTI & PAC, because it is empty for aarch64. |
21 | * See: https://github.com/facebook/zstd/issues/3841 |
22 | * See: https://gcc.godbolt.org/z/sqr5T4ffK |
23 | * See: https://lore.kernel.org/linux-arm-kernel/20200429211641.9279-8-broonie@kernel.org/ |
24 | * See: https://reviews.llvm.org/D62609 |
25 | */ |
26 | .pushsection .note.gnu.property, "a" |
27 | .p2align 3 |
28 | .long 4 /* size of the name - "GNU\0" */ |
29 | .long 0x10 /* size of descriptor - 16 bytes: pr_type, pr_datasz, pr_data and pr_padding */ |
30 | .long 0x5 /* NT_GNU_PROPERTY_TYPE_0 */ |
31 | .asciz "GNU" |
32 | .long 0xc0000000 /* pr_type - GNU_PROPERTY_AARCH64_FEATURE_1_AND */ |
33 | .long 4 /* pr_datasz - 4 bytes */ |
34 | .long 3 /* pr_data - GNU_PROPERTY_AARCH64_FEATURE_1_BTI | GNU_PROPERTY_AARCH64_FEATURE_1_PAC */ |
35 | .p2align 3 /* pr_padding - bring everything to 8 byte alignment */ |
36 | .popsection |
37 | #endif /* defined(__aarch64__) */ |
38 | |
648db22b |
39 | #endif /* defined(__ELF__) && defined(__GNUC__) */ |
40 | |
41 | #if ZSTD_ENABLE_ASM_X86_64_BMI2 |
42 | |
43 | /* Calling convention: |
44 | * |
45 | * %rdi contains the first argument: HUF_DecompressAsmArgs*. |
46 | * %rbp isn't maintained as a frame pointer (it is used as bits0). |
47 | * %rsp contains the stack pointer that grows down. |
48 | * No red-zone is assumed, only addresses >= %rsp are used. |
49 | * All register contents are preserved (every register used is pushed on entry and popped before ret). |
50 | * |
51 | * TODO: Support Windows calling convention. |
52 | */ |
53 | |
54 | ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop) |
55 | ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop) |
56 | ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop) |
57 | ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop) |
58 | .global HUF_decompress4X1_usingDTable_internal_fast_asm_loop |
59 | .global HUF_decompress4X2_usingDTable_internal_fast_asm_loop |
60 | .global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop |
61 | .global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop |
62 | .text |
63 | |
64 | /* Sets up register mappings for clarity. |
65 | * op[], bits[], dtable & ip[0] each get their own register. |
66 | * ip[1,2,3] & olimit share their registers with var[]. |
67 | * %rax is a scratch register. |
68 | */ |
69 | |
70 | #define op0 rsi |
71 | #define op1 rbx |
72 | #define op2 rcx |
73 | #define op3 rdi |
74 | |
75 | #define ip0 r8 |
76 | #define ip1 r9 |
77 | #define ip2 r10 |
78 | #define ip3 r11 |
79 | |
80 | #define bits0 rbp |
81 | #define bits1 rdx |
82 | #define bits2 r12 |
83 | #define bits3 r13 |
84 | #define dtable r14 |
85 | #define olimit r15 |
86 | |
87 | /* var[] uses the same registers as olimit (var0) & ip[1,2,3] (var1,2,3). |
88 | * ip[1,2,3] are therefore kept on the stack while var[] is live in the 4X1 loop. |
89 | * olimit is only held in a register by compute_olimit, the loops compare against its stack copy. |
90 | */ |
91 | #define var0 r15 |
92 | #define var1 r9 |
93 | #define var2 r10 |
94 | #define var3 r11 |
95 | |
96 | /* 32-bit views of the var registers (used as movzwl destinations) */ |
97 | #define vard0 r15d |
98 | #define vard1 r9d |
99 | #define vard2 r10d |
100 | #define vard3 r11d |
101 | |
102 | /* Expands X(N) once for each stream N = 0, 1, 2, 3. */ |
103 | #define FOR_EACH_STREAM(X) \ |
104 | X(0); \ |
105 | X(1); \ |
106 | X(2); \ |
107 | X(3) |
108 | |
109 | /* Expands X(N, idx) once for each stream N = 0, 1, 2, 3, passing the same idx every time. */ |
110 | #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \ |
111 | X(0, idx); \ |
112 | X(1, idx); \ |
113 | X(2, idx); \ |
114 | X(3, idx) |
115 | |
116 | /* Define both _HUF_* & HUF_* symbols because MacOS |
117 | * C symbols are prefixed with '_' & Linux symbols aren't. |
118 | */ |
119 | _HUF_decompress4X1_usingDTable_internal_fast_asm_loop: |
120 | HUF_decompress4X1_usingDTable_internal_fast_asm_loop: |
121 | ZSTD_CET_ENDBRANCH |
122 | /* Entry: %rdi = HUF_DecompressAsmArgs*. Save every register used below (not only the callee-saved ones) so that all register contents are preserved. */ |
123 | push %rax |
124 | push %rbx |
125 | push %rcx |
126 | push %rdx |
127 | push %rbp |
128 | push %rsi |
129 | push %rdi |
130 | push %r8 |
131 | push %r9 |
132 | push %r10 |
133 | push %r11 |
134 | push %r12 |
135 | push %r13 |
136 | push %r14 |
137 | push %r15 |
138 | |
139 | /* Copy the HUF_DecompressAsmArgs* to %rax (%rdi is about to become op3), then load ip[], op[], bits[] & dtable from it */ |
140 | movq %rdi, %rax |
141 | movq 0(%rax), %ip0 |
142 | movq 8(%rax), %ip1 |
143 | movq 16(%rax), %ip2 |
144 | movq 24(%rax), %ip3 |
145 | movq 32(%rax), %op0 |
146 | movq 40(%rax), %op1 |
147 | movq 48(%rax), %op2 |
148 | movq 56(%rax), %op3 |
149 | movq 64(%rax), %bits0 |
150 | movq 72(%rax), %bits1 |
151 | movq 80(%rax), %bits2 |
152 | movq 88(%rax), %bits3 |
153 | movq 96(%rax), %dtable |
154 | push %rax /* argument */ |
f535537f |
155 | push 104(%rax) /* ilowest */ |
648db22b |
156 | push 112(%rax) /* oend */ |
157 | push %olimit /* olimit space (the pushed value is a placeholder, the slot is written before the loop) */ |
158 | |
159 | subq $24, %rsp /* scratch space. Stack is now: 0-16(%rsp) scratch, 24 olimit, 32 oend, 40 ilowest, 48 argument */ |
160 | |
161 | .L_4X1_compute_olimit: |
162 | /* Computes how many iterations we can do safely |
163 | * %r15, %rax may be clobbered |
164 | * rbx, rdx (op1 & bits1) are used as temporaries, so they must be saved |
165 | * op3 & ip0 mustn't be clobbered |
166 | */ |
167 | movq %rbx, 0(%rsp) |
168 | movq %rdx, 8(%rsp) |
169 | |
170 | movq 32(%rsp), %rax /* rax = oend */ |
171 | subq %op3, %rax /* rax = oend - op3 */ |
172 | |
173 | /* r15 = (oend - op3) / 5, computed as the high half of a multiply by ceil(2^66 / 5), shifted right by 2 */ |
174 | movabsq $-3689348814741910323, %rdx |
175 | mulq %rdx |
176 | movq %rdx, %r15 |
177 | shrq $2, %r15 |
178 | |
179 | movq %ip0, %rax /* rax = ip0 */ |
f535537f |
180 | movq 40(%rsp), %rdx /* rdx = ilowest */ |
181 | subq %rdx, %rax /* rax = ip0 - ilowest */ |
182 | movq %rax, %rbx /* rbx = ip0 - ilowest */ |
648db22b |
183 | |
f535537f |
184 | /* rdx = (ip0 - ilowest) / 7, using the multiply-high, subtract, halve, add, shift sequence for division by a constant */ |
648db22b |
185 | movabsq $2635249153387078803, %rdx |
186 | mulq %rdx |
187 | subq %rdx, %rbx |
188 | shrq %rbx |
189 | addq %rbx, %rdx |
190 | shrq $2, %rdx |
191 | |
192 | /* r15 = min(%rdx, %r15) */ |
193 | cmpq %rdx, %r15 |
194 | cmova %rdx, %r15 |
195 | |
196 | /* r15 = r15 * 5 */ |
197 | leaq (%r15, %r15, 4), %r15 |
198 | |
199 | /* olimit = op3 + r15 */ |
200 | addq %op3, %olimit |
201 | |
202 | movq 8(%rsp), %rdx /* restore rdx (bits1) */ |
203 | movq 0(%rsp), %rbx /* restore rbx (op1) */ |
204 | |
205 | /* If (op3 == olimit) no further iteration is possible: go to exit */ |
206 | movq %op3, %rax /* rax = op3 */ |
f535537f |
207 | cmpq %rax, %olimit /* op3 == olimit */ |
208 | je .L_4X1_exit |
648db22b |
209 | |
210 | /* If (ip1 < ip0) go to exit */ |
211 | cmpq %ip0, %ip1 |
212 | jb .L_4X1_exit |
213 | |
214 | /* If (ip2 < ip1) go to exit */ |
215 | cmpq %ip1, %ip2 |
216 | jb .L_4X1_exit |
217 | |
218 | /* If (ip3 < ip2) go to exit */ |
219 | cmpq %ip2, %ip3 |
220 | jb .L_4X1_exit |
221 | |
222 | /* Reads top 11 bits from bits[n] |
223 | * Loads the 16-bit entry dt[bits[n] >> 53] into var[n] (zero-extended) |
224 | */ |
225 | #define GET_NEXT_DELT(n) \ |
226 | movq $53, %var##n; \ |
227 | shrxq %var##n, %bits##n, %var##n; \ |
228 | movzwl (%dtable,%var##n,2),%vard##n |
229 | |
230 | /* var[n] must contain the DTable entry computed with GET_NEXT_DELT |
231 | * Moves var[n] to %rax |
232 | * bits[n] <<= var[n] & 63 |
233 | * op[n][idx] = %rax >> 8 |
234 | * %ah is a way to access bits [8, 16) of %rax |
235 | */ |
236 | #define DECODE_FROM_DELT(n, idx) \ |
237 | movq %var##n, %rax; \ |
238 | shlxq %var##n, %bits##n, %bits##n; \ |
239 | movb %ah, idx(%op##n) |
240 | |
241 | /* Assumes GET_NEXT_DELT has been called. |
242 | * Calls DECODE_FROM_DELT then GET_NEXT_DELT |
243 | */ |
244 | #define DECODE_AND_GET_NEXT(n, idx) \ |
245 | DECODE_FROM_DELT(n, idx); \ |
246 | GET_NEXT_DELT(n) \ |
247 | |
248 | /* // ctz & nbBytes is stored in bits[n] |
249 | * // nbBits is stored in %rax |
250 | * ctz = CTZ[bits[n]] |
251 | * nbBits = ctz & 7 |
252 | * nbBytes = ctz >> 3 |
253 | * op[n] += 5 |
254 | * ip[n] -= nbBytes |
255 | * // Note: x86-64 is little-endian ==> no bswap |
256 | * bits[n] = MEM_readST(ip[n]) | 1 |
257 | * bits[n] <<= nbBits |
258 | */ |
259 | #define RELOAD_BITS(n) \ |
260 | bsfq %bits##n, %bits##n; \ |
261 | movq %bits##n, %rax; \ |
262 | andq $7, %rax; \ |
263 | shrq $3, %bits##n; \ |
264 | leaq 5(%op##n), %op##n; \ |
265 | subq %bits##n, %ip##n; \ |
266 | movq (%ip##n), %bits##n; \ |
267 | orq $1, %bits##n; \ |
268 | shlx %rax, %bits##n, %bits##n |
269 | |
270 | /* Store olimit & ip[1,2,3] on the stack: their registers are reused as var[] inside the loop */ |
271 | movq %olimit, 24(%rsp) |
272 | movq %ip1, 0(%rsp) |
273 | movq %ip2, 8(%rsp) |
274 | movq %ip3, 16(%rsp) |
275 | |
276 | /* Call GET_NEXT_DELT for each stream */ |
277 | FOR_EACH_STREAM(GET_NEXT_DELT) |
278 | |
279 | .p2align 6 /* align the loop head to a 64-byte boundary */ |
280 | |
281 | .L_4X1_loop_body: |
282 | /* Decode 5 symbols in each of the 4 streams (20 total) |
283 | * Must have called GET_NEXT_DELT for each stream |
284 | */ |
285 | FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0) |
286 | FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1) |
287 | FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2) |
288 | FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3) |
289 | FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4) |
290 | |
291 | /* Load ip[1,2,3] from stack (var[] aliases them) |
292 | * ip[] is needed for RELOAD_BITS |
293 | * Each will be stored back to the stack after RELOAD |
294 | */ |
295 | movq 0(%rsp), %ip1 |
296 | movq 8(%rsp), %ip2 |
297 | movq 16(%rsp), %ip3 |
298 | |
299 | /* Reload each stream & fetch the next table entry |
300 | * to prepare for the next iteration |
301 | */ |
302 | RELOAD_BITS(0) |
303 | GET_NEXT_DELT(0) |
304 | |
305 | RELOAD_BITS(1) |
306 | movq %ip1, 0(%rsp) |
307 | GET_NEXT_DELT(1) |
308 | |
309 | RELOAD_BITS(2) |
310 | movq %ip2, 8(%rsp) |
311 | GET_NEXT_DELT(2) |
312 | |
313 | RELOAD_BITS(3) |
314 | movq %ip3, 16(%rsp) |
315 | GET_NEXT_DELT(3) |
316 | |
317 | /* If op3 < olimit: continue the loop */ |
318 | cmp %op3, 24(%rsp) /* 24(%rsp) = olimit */ |
319 | ja .L_4X1_loop_body |
320 | |
321 | /* Reload ip[1,2,3] from stack */ |
322 | movq 0(%rsp), %ip1 |
323 | movq 8(%rsp), %ip2 |
324 | movq 16(%rsp), %ip3 |
325 | |
326 | /* Re-compute olimit */ |
327 | jmp .L_4X1_compute_olimit |
328 | |
329 | #undef GET_NEXT_DELT |
330 | #undef DECODE_FROM_DELT |
331 | #undef DECODE /* NOTE(review): DECODE is never defined in this function, while DECODE_AND_GET_NEXT is left defined - confirm which was intended */ |
332 | #undef RELOAD_BITS |
333 | .L_4X1_exit: |
334 | addq $24, %rsp /* drop the scratch space */ |
335 | |
336 | /* Restore stack (olimit, oend, ilowest & arg), leaving the args pointer in %rax */ |
337 | pop %rax /* olimit */ |
338 | pop %rax /* oend */ |
f535537f |
339 | pop %rax /* ilowest */ |
648db22b |
340 | pop %rax /* arg */ |
341 | |
342 | /* Save ip / op / bits */ |
343 | movq %ip0, 0(%rax) |
344 | movq %ip1, 8(%rax) |
345 | movq %ip2, 16(%rax) |
346 | movq %ip3, 24(%rax) |
347 | movq %op0, 32(%rax) |
348 | movq %op1, 40(%rax) |
349 | movq %op2, 48(%rax) |
350 | movq %op3, 56(%rax) |
351 | movq %bits0, 64(%rax) |
352 | movq %bits1, 72(%rax) |
353 | movq %bits2, 80(%rax) |
354 | movq %bits3, 88(%rax) |
355 | |
356 | /* Restore registers (reverse order of the pushes at entry) */ |
357 | pop %r15 |
358 | pop %r14 |
359 | pop %r13 |
360 | pop %r12 |
361 | pop %r11 |
362 | pop %r10 |
363 | pop %r9 |
364 | pop %r8 |
365 | pop %rdi |
366 | pop %rsi |
367 | pop %rbp |
368 | pop %rdx |
369 | pop %rcx |
370 | pop %rbx |
371 | pop %rax |
372 | ret |
373 | |
374 | _HUF_decompress4X2_usingDTable_internal_fast_asm_loop: |
375 | HUF_decompress4X2_usingDTable_internal_fast_asm_loop: |
376 | ZSTD_CET_ENDBRANCH |
377 | /* Entry: %rdi = HUF_DecompressAsmArgs*. Save every register used below (not only the callee-saved ones) so that all register contents are preserved. */ |
378 | push %rax |
379 | push %rbx |
380 | push %rcx |
381 | push %rdx |
382 | push %rbp |
383 | push %rsi |
384 | push %rdi |
385 | push %r8 |
386 | push %r9 |
387 | push %r10 |
388 | push %r11 |
389 | push %r12 |
390 | push %r13 |
391 | push %r14 |
392 | push %r15 |
393 | |
394 | movq %rdi, %rax /* rax = HUF_DecompressAsmArgs* (%rdi is about to become op3) */ |
395 | movq 0(%rax), %ip0 |
396 | movq 8(%rax), %ip1 |
397 | movq 16(%rax), %ip2 |
398 | movq 24(%rax), %ip3 |
399 | movq 32(%rax), %op0 |
400 | movq 40(%rax), %op1 |
401 | movq 48(%rax), %op2 |
402 | movq 56(%rax), %op3 |
403 | movq 64(%rax), %bits0 |
404 | movq 72(%rax), %bits1 |
405 | movq 80(%rax), %bits2 |
406 | movq 88(%rax), %bits3 |
407 | movq 96(%rax), %dtable |
408 | push %rax /* argument */ |
409 | push %rax /* olimit space (the pushed value is a placeholder, the slot is written before the loop) */ |
f535537f |
410 | push 104(%rax) /* ilowest */ |
648db22b |
411 | |
412 | movq 112(%rax), %rax |
413 | push %rax /* oend3 = oend from the args */ |
414 | |
415 | movq %op3, %rax |
416 | push %rax /* oend2 = initial op3 */ |
417 | |
418 | movq %op2, %rax |
419 | push %rax /* oend1 = initial op2 */ |
420 | |
421 | movq %op1, %rax |
422 | push %rax /* oend0 = initial op1 */ |
423 | |
424 | /* Scratch space. Stack is now: 0(%rsp) scratch, 8-32 oend[0-3], 40 ilowest, 48 olimit, 56 argument */ |
425 | subq $8, %rsp |
426 | |
427 | .L_4X2_compute_olimit: |
428 | /* Computes how many iterations we can do safely |
429 | * %r15, %rax may be clobbered |
430 | * rdx (bits1) is used as a temporary, so it must be saved |
431 | * op[0,1,2,3] & ip0 mustn't be clobbered |
432 | */ |
433 | movq %rdx, 0(%rsp) |
434 | |
435 | /* We can consume up to 7 input bytes each iteration. */ |
436 | movq %ip0, %rax /* rax = ip0 */ |
f535537f |
437 | movq 40(%rsp), %rdx /* rdx = ilowest */ |
438 | subq %rdx, %rax /* rax = ip0 - ilowest */ |
439 | movq %rax, %r15 /* r15 = ip0 - ilowest */ |
648db22b |
440 | |
441 | /* rdx = rax / 7, using the multiply-high, subtract, halve, add, shift sequence for division by a constant */ |
442 | movabsq $2635249153387078803, %rdx |
443 | mulq %rdx |
444 | subq %rdx, %r15 |
445 | shrq %r15 |
446 | addq %r15, %rdx |
447 | shrq $2, %rdx |
448 | |
f535537f |
449 | /* r15 = (ip0 - ilowest) / 7 */ |
648db22b |
450 | movq %rdx, %r15 |
451 | |
452 | /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */ |
453 | movq 8(%rsp), %rax /* rax = oend0 */ |
454 | subq %op0, %rax /* rax = oend0 - op0 */ |
455 | movq 16(%rsp), %rdx /* rdx = oend1 */ |
456 | subq %op1, %rdx /* rdx = oend1 - op1 */ |
457 | |
458 | cmpq %rax, %rdx |
459 | cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ |
460 | |
461 | movq 24(%rsp), %rax /* rax = oend2 */ |
462 | subq %op2, %rax /* rax = oend2 - op2 */ |
463 | |
464 | cmpq %rax, %rdx |
465 | cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ |
466 | |
467 | movq 32(%rsp), %rax /* rax = oend3 */ |
468 | subq %op3, %rax /* rax = oend3 - op3 */ |
469 | |
470 | cmpq %rax, %rdx |
471 | cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ |
472 | |
473 | movabsq $-3689348814741910323, %rax |
474 | mulq %rdx |
475 | shrq $3, %rdx /* rdx = rdx / 10 (high half of the multiply by ceil(2^66 / 5), shifted right by 3) */ |
476 | |
477 | /* r15 = min(%rdx, %r15) */ |
478 | cmpq %rdx, %r15 |
479 | cmova %rdx, %r15 |
480 | |
481 | /* olimit = op3 + 5 * r15 */ |
482 | movq %r15, %rax |
483 | leaq (%op3, %rax, 4), %olimit |
484 | addq %rax, %olimit |
485 | |
486 | movq 0(%rsp), %rdx /* restore rdx (bits1) */ |
487 | |
488 | /* If (op3 == olimit) no further iteration is possible: go to exit */ |
489 | movq %op3, %rax /* rax = op3 */ |
f535537f |
490 | cmpq %rax, %olimit /* op3 == olimit */ |
491 | je .L_4X2_exit |
648db22b |
492 | |
493 | /* If (ip1 < ip0) go to exit */ |
494 | cmpq %ip0, %ip1 |
495 | jb .L_4X2_exit |
496 | |
497 | /* If (ip2 < ip1) go to exit */ |
498 | cmpq %ip1, %ip2 |
499 | jb .L_4X2_exit |
500 | |
501 | /* If (ip3 < ip2) go to exit */ |
502 | cmpq %ip2, %ip3 |
503 | jb .L_4X2_exit |
504 | |
505 | #define DECODE(n, idx) \ |
506 | movq %bits##n, %rax; \ |
507 | shrq $53, %rax; \ |
508 | movzwl 0(%dtable,%rax,4),%r8d; \ |
509 | movzbl 2(%dtable,%rax,4),%r15d; \ |
510 | movzbl 3(%dtable,%rax,4),%eax; \ |
511 | movw %r8w, (%op##n); \ |
512 | shlxq %r15, %bits##n, %bits##n; \ |
513 | addq %rax, %op##n |
514 | |
515 | #define RELOAD_BITS(n) \ |
516 | bsfq %bits##n, %bits##n; \ |
517 | movq %bits##n, %rax; \ |
518 | shrq $3, %bits##n; \ |
519 | andq $7, %rax; \ |
520 | subq %bits##n, %ip##n; \ |
521 | movq (%ip##n), %bits##n; \ |
522 | orq $1, %bits##n; \ |
523 | shlxq %rax, %bits##n, %bits##n |
524 | |
525 | |
526 | movq %olimit, 48(%rsp) /* DECODE clobbers r15, so olimit lives in its stack slot during the loop */ |
527 | |
528 | .p2align 6 /* align the loop head to a 64-byte boundary */ |
529 | |
530 | .L_4X2_loop_body: |
531 | /* DECODE clobbers r8 (ip0), so store it on the stack */ |
532 | movq %r8, 0(%rsp) |
533 | |
534 | /* Decode 5 table entries from each of the 4 streams (20 total). Each DECODE stores 2 bytes at op[n], then advances op[n] by the length byte of the entry. The idx argument is unused by DECODE. */ |
535 | FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) |
536 | FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) |
537 | FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) |
538 | FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) |
539 | FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) |
540 | |
541 | /* Reload r8 (ip0) */ |
542 | movq 0(%rsp), %r8 |
543 | |
544 | FOR_EACH_STREAM(RELOAD_BITS) |
545 | |
546 | cmp %op3, 48(%rsp) /* 48(%rsp) = olimit, keep looping while op3 < olimit */ |
547 | ja .L_4X2_loop_body |
548 | jmp .L_4X2_compute_olimit |
549 | |
550 | #undef DECODE |
551 | #undef RELOAD_BITS |
552 | .L_4X2_exit: |
553 | addq $8, %rsp /* drop the scratch space */ |
554 | /* Restore stack (oend[0-3], ilowest, olimit & arg), leaving the args pointer in %rax */ |
555 | pop %rax /* oend0 */ |
556 | pop %rax /* oend1 */ |
557 | pop %rax /* oend2 */ |
558 | pop %rax /* oend3 */ |
f535537f |
559 | pop %rax /* ilowest */ |
648db22b |
560 | pop %rax /* olimit */ |
561 | pop %rax /* arg */ |
562 | |
563 | /* Save ip / op / bits */ |
564 | movq %ip0, 0(%rax) |
565 | movq %ip1, 8(%rax) |
566 | movq %ip2, 16(%rax) |
567 | movq %ip3, 24(%rax) |
568 | movq %op0, 32(%rax) |
569 | movq %op1, 40(%rax) |
570 | movq %op2, 48(%rax) |
571 | movq %op3, 56(%rax) |
572 | movq %bits0, 64(%rax) |
573 | movq %bits1, 72(%rax) |
574 | movq %bits2, 80(%rax) |
575 | movq %bits3, 88(%rax) |
576 | |
577 | /* Restore registers (reverse order of the pushes at entry) */ |
578 | pop %r15 |
579 | pop %r14 |
580 | pop %r13 |
581 | pop %r12 |
582 | pop %r11 |
583 | pop %r10 |
584 | pop %r9 |
585 | pop %r8 |
586 | pop %rdi |
587 | pop %rsi |
588 | pop %rbp |
589 | pop %rdx |
590 | pop %rcx |
591 | pop %rbx |
592 | pop %rax |
593 | ret |
594 | |
595 | #endif |