; gitweb blame-view page residue, kept verbatim as comments:
;   make: don't enable neon for all ARMs
;   [pcsx_rearmed.git] / deps / libchdr / deps / lzma-22.01 / src / Asm / x86 / AesOpt.asm
;   columns: Commit / Line / Data (first commit id shown: 9e052883)
;
; AesOpt.asm -- AES optimized code for x86 AES hardware instructions
; 2021-12-25 : Igor Pavlov : Public domain

include 7zAsm.asm

; use_vaes_256 enables the bodies of the 256-bit procedures
; (AesCbc_Decode_HW_256 / AesCtr_Code_HW_256). It is defined when assembling
; with ASMC (__ASMC__), or with any assembler for which "ifdef ymm0" succeeds.
ifdef __ASMC__
  use_vaes_256 equ 1
else
ifdef ymm0
  use_vaes_256 equ 1
endif
endif


; Build-time diagnostics: print which code paths and ABI were selected.
ifdef use_vaes_256
  ECHO "++ VAES 256"
else
  ECHO "-- NO VAES 256"
endif

ifdef x64
  ECHO "x86-64"
else
  ECHO "x86"
if (IS_CDECL gt 0)
  ECHO "ABI : CDECL"
else
  ECHO "ABI : no CDECL : FASTCALL"
endif
endif

if (IS_LINUX gt 0)
  ECHO "ABI : LINUX"
else
  ECHO "ABI : WINDOWS"
endif

MY_ASM_START

; 32-bit builds select the .686 instruction set and enable the XMM registers.
ifndef x64
  .686
  .xmm
endif

; Segment alignment hook. The ALIGN(64) variant is disabled, so MY_ALIGN
; (and therefore SEG_ALIGN) is currently empty.
; MY_ALIGN EQU ALIGN(64)
MY_ALIGN EQU

SEG_ALIGN EQU MY_ALIGN

; MY_SEG_PROC name, numParams
;   Opens procedure <name> by forwarding both arguments to MY_PROC.
;   The per-procedure segment declaration is commented out, so no segment
;   is opened here.
MY_SEG_PROC macro name:req, numParams:req
  ; seg_name equ @CatStr(_TEXT$, name)
  ; seg_name SEGMENT SEG_ALIGN 'CODE'
  MY_PROC name, numParams
endm

; MY_SEG_ENDP
;   Counterpart of MY_SEG_PROC. Its only statement (closing the segment) is
;   commented out, so the macro currently expands to nothing.
MY_SEG_ENDP macro
  ; seg_name ENDS
endm


; Used only to size the stack buffers (AVX_STACK_SUB) of the 256-bit procedures.
NUM_AES_KEYS_MAX equ 15

; the number of push operators in function PROLOG
if (IS_LINUX eq 0) or (IS_X64 eq 0)
num_regs_push       equ 2
; (1 + num_regs_push) register-sized slots separate r4 from the stack
; parameters once the prolog pushes are done (see the "ret-ip" note below).
stack_param_offset  equ (REG_SIZE * (1 + num_regs_push))
endif

; num_param: operand from which MY_PROLOG loads the third argument into rN.
ifdef x64
  num_param equ REG_ABI_PARAM_2
else
  if (IS_CDECL gt 0)
    ; size_t     size
    ; void *     data
    ; UInt32 *   aes
    ; ret-ip <- (r4)
    aes_OFFS   equ (stack_param_offset)
    data_OFFS  equ (REG_SIZE + aes_OFFS)
    size_OFFS  equ (REG_SIZE + data_OFFS)
    num_param  equ [r4 + size_OFFS]
  else
    num_param  equ [r4 + stack_param_offset]
  endif
endif

; Register roles shared by all procedures in this file.
keys    equ REG_PARAM_0  ; r1  (first argument: the "aes" pointer)
rD      equ REG_PARAM_1  ; r2  (second argument: the data pointer)
rN      equ r0           ; loaded from num_param by MY_PROLOG

koffs_x equ x7           ; running byte offset into the round keys
koffs_r equ r7

ksize_x equ x6           ; [keys + 16] * 32, set by MY_PROLOG
ksize_r equ r6

keys2   equ r3           ; base of the broadcast key table (256-bit procedures)

; NOTE: state and key are the same register (xmm0); code uses one or the other.
state   equ xmm0
key     equ xmm0
key_ymm equ ymm0
key_ymm_n equ 0

; Number of registers processed per iteration of the multi-block loops.
ifdef x64
  ways = 11
else
  ways = 4
endif

; The multi-block registers are xmm/ymm[ways_start_reg .. ways_start_reg + ways - 1];
; iv is the register right after them.
ways_start_reg equ 1

iv      equ @CatStr(xmm, %(ways_start_reg + ways))
iv_ymm  equ @CatStr(ymm, %(ways_start_reg + ways))


; WOP op, op2
;   Expands "op xmm<ways_start_reg + i>, op2" for i = 0 .. ways - 1.
;   op may be an instruction or a two-argument macro. The assembly-time
;   variable i is updated for each expansion, so op2 (or the macro body of
;   op) may reference i, e.g. "WOP movdqa, [rD + i * 16]".
WOP macro op, op2
  i = 0
  rept ways
    op @CatStr(xmm, %(ways_start_reg + i)), op2
    i = i + 1
  endm
endm


ifndef ABI_LINUX
ifdef x64

; we use 32 bytes of home space in stack in WIN64-x64
NUM_HOME_MM_REGS   equ (32 / 16)
; we preserve xmm registers starting from xmm6 in WIN64-x64
MM_START_SAVE_REG  equ 6

; SAVE_XMM num_used_mm_regs
;   Saves xmm6 .. xmm(num_used_mm_regs - 1).
;   The first NUM_HOME_MM_REGS registers go to [r4 + stack_param_offset + ...]
;   (the home space); for the remaining ones r4 is lowered by stack_offset
;   and they are stored at [r4 + 0 ...].
;   Side effect: sets the assembly-time variables num_save_mm_regs,
;   num_save_mm_regs2 and stack_offset, which RESTORE_XMM reads back.
SAVE_XMM macro num_used_mm_regs:req
  num_save_mm_regs = num_used_mm_regs - MM_START_SAVE_REG
  if num_save_mm_regs GT 0
    num_save_mm_regs2 = num_save_mm_regs - NUM_HOME_MM_REGS
    ; RSP is (16*x + 8) after entering the function in WIN64-x64
    stack_offset = 16 * num_save_mm_regs2 + (stack_param_offset mod 16)

    i = 0
    rept num_save_mm_regs

      ; the home space is full: reserve room for the remaining registers
      if i eq NUM_HOME_MM_REGS
        sub     r4, stack_offset
      endif

      if i lt NUM_HOME_MM_REGS
        movdqa  [r4 + stack_param_offset + i * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
      else
        movdqa  [r4 + (i - NUM_HOME_MM_REGS) * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
      endif

      i = i + 1
    endm
  endif
endm

; RESTORE_XMM num_used_mm_regs
;   Undoes the most recent SAVE_XMM expansion.
;   NOTE: the argument is required but never read; the macro works only from
;   num_save_mm_regs / num_save_mm_regs2 / stack_offset as left by SAVE_XMM.
RESTORE_XMM macro num_used_mm_regs:req
  if num_save_mm_regs GT 0
    i = 0
    if num_save_mm_regs2 GT 0
      ; registers that were stored below the home space
      rept num_save_mm_regs2
        movdqa  @CatStr(xmm, %(MM_START_SAVE_REG + NUM_HOME_MM_REGS + i)), [r4 + i * 16]
        i = i + 1
      endm
        add     r4, stack_offset
    endif

    ; registers that were stored in the home space
    num_low_regs = num_save_mm_regs - i
    i = 0
    rept num_low_regs
        movdqa  @CatStr(xmm, %(MM_START_SAVE_REG + i)), [r4 + stack_param_offset + i * 16]
        i = i + 1
    endm
  endif
endm

endif ; x64
endif ; ABI_LINUX


; MY_PROLOG num_used_mm_regs
;   Common entry sequence of every procedure in this file:
;     1. pushes r6 / r7 unless building for x64 Linux,
;     2. loads rN from num_param,
;     3. x86 cdecl: loads rD and keys from the stack;
;        x64 Linux: expands MY_ABI_LINUX_TO_WIN_2,
;     4. x64 non-Linux: saves xmm registers through SAVE_XMM,
;     5. sets ksize_x = [keys + 16] * 32.
;   num_used_mm_regs is forwarded to SAVE_XMM.
MY_PROLOG macro num_used_mm_regs:req
        ; num_regs_push: must be equal to the number of push operators
        ; push    r3
        ; push    r5
    if (IS_LINUX eq 0) or (IS_X64 eq 0)
        push    r6
        push    r7
    endif

        mov     rN, num_param  ; don't move it; num_param can use stack pointer (r4)

    if (IS_X64 eq 0)
      if (IS_CDECL gt 0)
        mov     rD,   [r4 + data_OFFS]
        mov     keys, [r4 + aes_OFFS]
      endif
    elseif (IS_LINUX gt 0)
        MY_ABI_LINUX_TO_WIN_2
    endif


    ifndef ABI_LINUX
    ifdef x64
        SAVE_XMM num_used_mm_regs
    endif
    endif

        mov     ksize_x, [keys + 16]
        shl     ksize_x, 5
endm


; MY_EPILOG
;   Common exit sequence, mirror image of MY_PROLOG: restores the xmm
;   registers (x64 non-Linux), pops r7 / r6 (unless x64 Linux) and closes the
;   procedure with MY_ENDP.
;   RESTORE_XMM ignores its argument, so the value passed here has no effect.
MY_EPILOG macro
    ifndef ABI_LINUX
    ifdef x64
        RESTORE_XMM num_save_mm_regs
    endif
    endif

    if (IS_LINUX eq 0) or (IS_X64 eq 0)
        pop     r7
        pop     r6
    endif
        ; pop     r5
        ; pop     r3
    MY_ENDP
endm


; OP_KEY op, offs
;   Single-block step: applies op to state with the memory operand
;   [keys + offs].
OP_KEY macro op:req, offs:req
        op      state, [keys + offs]
endm


; WOP_KEY op, offs
;   Multi-block step: loads [keys + offs] into key (xmm0) once, then applies
;   op with that key to every register of the ways group.
WOP_KEY macro op:req, offs:req
        movdqa  key, [keys + offs]
        WOP     op, key
endm


; ---------- AES-CBC Decode ----------


; XOR_WITH_DATA reg, _ppp_
;   reg ^= 16 bytes at [rD + i * 16]. Written to be driven by WOP, which
;   supplies i; the second argument is ignored.
XOR_WITH_DATA macro reg, _ppp_
        pxor    reg, [rD + i * 16]
endm

; WRITE_TO_DATA reg, _ppp_
;   Stores reg to [rD + i * 16]. Written to be driven by WOP, which supplies
;   i; the second argument is ignored.
WRITE_TO_DATA macro reg, _ppp_
        movdqa  [rD + i * 16], reg
endm


; Register assignment for the CBC decode procedures: key0 and key_last
; occupy the two registers that follow iv.
; NOTE: key0 / key0_ymm are given a different register in the AES-CTR section.

; state0    equ  @CatStr(xmm, %(ways_start_reg))

key0            equ @CatStr(xmm, %(ways_start_reg + ways + 1))
key0_ymm        equ @CatStr(ymm, %(ways_start_reg + ways + 1))

key_last        equ @CatStr(xmm, %(ways_start_reg + ways + 2))
key_last_ymm    equ @CatStr(ymm, %(ways_start_reg + ways + 2))
key_last_ymm_n  equ (ways_start_reg + ways + 2)

; Register count handed to MY_PROLOG by the CBC decode procedures.
NUM_CBC_REGS    equ (ways_start_reg + ways + 3)


; AesCbc_Decode_HW (3 parameters: aes, data, size)
;   AES-CBC decryption with the aesdec / aesdeclast instructions.
;   In:  keys = aes  : [keys] holds the IV, [keys + 16] the value MY_PROLOG
;                      turns into ksize_x, round keys start at [keys + 32]
;        rD   = data : processed in place, 16 bytes per block (movdqa is
;                      used, so 16-byte alignment is required)
;        rN   = size : number of 16-byte blocks
;   Out: the updated IV is written back to the start of the aes buffer.
;   The global labels (::) are entry points used by AesCbc_Decode_HW_256.
MY_SEG_PROC AesCbc_Decode_HW, 3

    AesCbc_Decode_HW_start::
        MY_PROLOG NUM_CBC_REGS

    AesCbc_Decode_HW_start_2::
        movdqa  iv, [keys]
        add     keys, 32

        movdqa  key0, [keys + 1 * ksize_r]      ; key applied first (pxor)
        movdqa  key_last, [keys]                ; key applied last (aesdeclast)
        sub     ksize_x, 16

        jmp     check2

    ; ---- main loop: "ways" blocks per iteration ----
    align 16
    nextBlocks2:
        WOP     movdqa, [rD + i * 16]
        mov     koffs_x, ksize_x
        ; WOP_KEY pxor, ksize_r + 16
        WOP     pxor, key0
    ; align 16
    @@:
        WOP_KEY aesdec, 1 * koffs_r
        sub     koffs_r, 16
        jnz     @B
        ; WOP_KEY aesdeclast, 0
        WOP     aesdeclast, key_last

        ; CBC chaining: first block ^= iv, block i ^= ciphertext block i - 1,
        ; which is still unmodified in memory at this point
        pxor    @CatStr(xmm, %(ways_start_reg)), iv
    i = 1
    rept ways - 1
        pxor    @CatStr(xmm, %(ways_start_reg + i)), [rD + i * 16 - 16]
        i = i + 1
    endm
        movdqa  iv, [rD + ways * 16 - 16]       ; last ciphertext block = next iv
        WOP     WRITE_TO_DATA

        add     rD, ways * 16
    AesCbc_Decode_HW_start_3::
    check2:
        sub     rN, ways
        jnc     nextBlocks2
        add     rN, ways                        ; rN = blocks left (< ways)

        sub     ksize_x, 16

        jmp     check

    ; ---- tail loop: one block per iteration, two aesdec per inner pass ----
    nextBlock:
        movdqa  state, [rD]
        mov     koffs_x, ksize_x
        ; OP_KEY  pxor, 1 * ksize_r + 32
        pxor    state, key0
        ; movdqa  state0, [rD]
        ; movdqa  state, key0
        ; pxor    state, state0
    @@:
        OP_KEY  aesdec, 1 * koffs_r + 16
        OP_KEY  aesdec, 1 * koffs_r
        sub     koffs_r, 32
        jnz     @B
        OP_KEY  aesdec, 16
        ; OP_KEY  aesdeclast, 0
        aesdeclast state, key_last

        pxor    state, iv
        movdqa  iv, [rD]
        ; movdqa  iv, state0
        movdqa  [rD], state

        add     rD, 16
    check:
        sub     rN, 1
        jnc     nextBlock

        movdqa  [keys - 32], iv                 ; keys was advanced by 32 above
MY_EPILOG




; ---------- AVX ----------


; AVX__WOP_n op
;   Invokes "op <n>" with the register NUMBER n = ways_start_reg + i for
;   i = 0 .. ways - 1. Used with the hand-encoded VAES macros, which take
;   register numbers rather than register names.
AVX__WOP_n macro op
    i = 0
    rept ways
        op (ways_start_reg + i)
        i = i + 1
    endm
endm

; AVX__WOP op
;   Invokes "op ymm<ways_start_reg + i>" for i = 0 .. ways - 1.
AVX__WOP macro op
    i = 0
    rept ways
        op @CatStr(ymm, %(ways_start_reg + i))
        i = i + 1
    endm
endm


; AVX__WOP_KEY op, offs
;   Loads the 32-byte key at [keys2 + offs] into key_ymm (ymm0), then applies
;   op to every register number of the ways group.
AVX__WOP_KEY macro op:req, offs:req
        vmovdqa  key_ymm, ymmword ptr [keys2 + offs]
        AVX__WOP_n op
endm


; reg = key0_ymm ^ 32 data bytes at [rD + 32 * i]
AVX__CBC_START macro reg
        ; vpxor reg, key_ymm, ymmword ptr [rD + 32 * i]
        vpxor   reg, key0_ymm, ymmword ptr [rD + 32 * i]
endm

; CBC chaining for register i: the first register is xored with iv_ymm,
; every other one with the 32 bytes that start 16 bytes before its own data.
AVX__CBC_END macro reg
    if i eq 0
        vpxor   reg, reg, iv_ymm
    else
        vpxor   reg, reg, ymmword ptr [rD + i * 32 - 16]
    endif
endm


; Stores reg to [rD + 32 * i] (unaligned store).
AVX__WRITE_TO_DATA macro reg
        vmovdqu ymmword ptr [rD + 32 * i], reg
endm

; reg ^= 32 data bytes at [rD + 32 * i]
AVX__XOR_WITH_DATA macro reg
        vpxor   reg, reg, ymmword ptr [rD + 32 * i]
endm

; Advances the counter pair (iv_ymm += one_ymm), then reg = iv_ymm ^ key0_ymm.
AVX__CTR_START macro reg
        vpaddq  iv_ymm, iv_ymm, one_ymm
        ; vpxor reg, iv_ymm, key_ymm
        vpxor   reg, iv_ymm, key0_ymm
endm


; MY_VAES_INSTR_2 cmd, dest, a1, a2
;   Emits one 5-byte instruction with db, so the file does not depend on the
;   assembler knowing the VAES mnemonics. dest, a1 and a2 are register
;   NUMBERS (0 .. 15), cmd is the opcode byte.
;     byte 0: 0C4h
;     byte 1: 2 + 40h, plus 20h when a2 < 8 and 80h when dest < 8
;     byte 2: 5 + 8 * (inverted low 4 bits of a1)
;     byte 3: cmd
;     byte 4: 0C0h + 8 * (dest and 7) + (a2 and 7)
;   NOTE(review): this matches the 3-byte VEX layout (inverted R/B bits,
;   inverted vvvv, register-direct ModRM) - confirm against the Intel SDM.
MY_VAES_INSTR_2 macro cmd, dest, a1, a2
  db 0c4H
  db 2 + 040H + 020h * (1 - (a2) / 8) + 080h * (1 - (dest) / 8)
  db 5 + 8 * ((not (a1)) and 15)
  db cmd
  db 0c0H + 8 * ((dest) and 7) + ((a2) and 7)
endm

; Two-operand form: dest is also passed as the a1 operand.
MY_VAES_INSTR macro cmd, dest, a
  MY_VAES_INSTR_2  cmd, dest, dest, a
endm

; One wrapper per operation; they differ only in the opcode byte.
MY_vaesenc macro dest, a
  MY_VAES_INSTR  0dcH, dest, a
endm
MY_vaesenclast macro dest, a
  MY_VAES_INSTR  0ddH, dest, a
endm
MY_vaesdec macro dest, a
  MY_VAES_INSTR  0deH, dest, a
endm
MY_vaesdeclast macro dest, a
  MY_VAES_INSTR  0dfH, dest, a
endm


; Per-register wrappers for AVX__WOP_n; reg is a register NUMBER.
; All of them use key register number key_ymm_n (ymm0), except the
; "_key_last" variant, which uses key_last_ymm_n.

AVX__VAES_DEC macro reg
        MY_vaesdec reg, key_ymm_n
endm

AVX__VAES_DEC_LAST_key_last macro reg
        ; MY_vaesdeclast reg, key_ymm_n
        MY_vaesdeclast reg, key_last_ymm_n
endm

AVX__VAES_ENC macro reg
        MY_vaesenc reg, key_ymm_n
endm

AVX__VAES_ENC_LAST macro reg
        MY_vaesenclast reg, key_ymm_n
endm

; Copies the 128-bit src into the high half of dest; the low half is kept.
AVX__vinserti128_TO_HIGH macro dest, src
        vinserti128  dest, dest, src, 1
endm


; AesCbc_Decode_HW_256 (3 parameters: aes, data, size)
;   AES-CBC decryption, two blocks per ymm register (ways * 2 blocks per
;   iteration). Same arguments as AesCbc_Decode_HW.
;   - Without use_vaes_256 the procedure is a single jump to
;     AesCbc_Decode_HW_start.
;   - With fewer than ways * 2 blocks it jumps to AesCbc_Decode_HW_start_2
;     right after the shared prolog.
;   - Otherwise every round key between the first and last applied ones is
;     broadcast to 32 bytes in a stack table (keys2), the wide loop runs, and
;     the remaining blocks are finished by jumping to
;     AesCbc_Decode_HW_start_3, which also performs the epilog.
MY_PROC AesCbc_Decode_HW_256, 3
  ifdef use_vaes_256
        MY_PROLOG NUM_CBC_REGS

        cmp     rN, ways * 2
        jb      AesCbc_Decode_HW_start_2

        vmovdqa iv, xmmword ptr [keys]
        add     keys, 32

        vbroadcasti128  key0_ymm, xmmword ptr [keys + 1 * ksize_r]
        vbroadcasti128  key_last_ymm, xmmword ptr [keys]
        sub     ksize_x, 16
        mov     koffs_x, ksize_x
        add     ksize_x, ksize_x                ; table entries are 32 bytes

        ; room for the broadcast key table
        AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 2) * 32)
        push    keys2
        sub     r4, AVX_STACK_SUB
        ; sub     r4, 32
        ; sub     r4, ksize_r
        ; lea     keys2, [r4 + 32]
        mov     keys2, r4
        and     keys2, -32                      ; 32-byte align (rounds down;
                                                ; the table starts at keys2 + 32)
    broad:
        vbroadcasti128  key_ymm, xmmword ptr [keys + 1 * koffs_r]
        vmovdqa         ymmword ptr [keys2 + koffs_r * 2], key_ymm
        sub     koffs_r, 16
        ; jnc     broad
        jnz     broad

        sub     rN, ways * 2

    align 16
    avx_cbcdec_nextBlock2:
        mov     koffs_x, ksize_x
        ; AVX__WOP_KEY    AVX__CBC_START, 1 * koffs_r + 32
        AVX__WOP    AVX__CBC_START
    @@:
        AVX__WOP_KEY    AVX__VAES_DEC, 1 * koffs_r
        sub     koffs_r, 32
        jnz     @B
        ; AVX__WOP_KEY    AVX__VAES_DEC_LAST, 0
        AVX__WOP_n   AVX__VAES_DEC_LAST_key_last

        ; iv_ymm = { iv, first ciphertext block }: chaining values for the
        ; two blocks held in the first register
        AVX__vinserti128_TO_HIGH  iv_ymm, xmmword ptr [rD]
        AVX__WOP    AVX__CBC_END

        vmovdqa iv, xmmword ptr [rD + ways * 32 - 16]   ; next iv
        AVX__WOP    AVX__WRITE_TO_DATA

        add     rD, ways * 32
        sub     rN, ways * 2
        jnc     avx_cbcdec_nextBlock2
        add     rN, ways * 2

        shr     ksize_x, 1                      ; undo the doubling above

        ; lea     r4, [r4 + 1 * ksize_r + 32]
        add     r4, AVX_STACK_SUB
        pop     keys2

        vzeroupper
        jmp     AesCbc_Decode_HW_start_3
  else
        jmp     AesCbc_Decode_HW_start
  endif
MY_ENDP
MY_SEG_ENDP




; ---------- AES-CBC Encode ----------

; e0 receives each input block.
e0  equ  xmm1

; The first CENC_NUM_REG_KEYS round keys are kept in registers
; xmm[CENC_START_KEY ..]; the remaining keys are read from memory each block.
CENC_START_KEY     equ 2
CENC_NUM_REG_KEYS  equ (3 * 2)
; last_key equ @CatStr(xmm, %(CENC_START_KEY + CENC_NUM_REG_KEYS))

; AesCbc_Encode_HW (3 parameters: aes, data, size)
;   AES-CBC encryption, one block at a time.
;   In:  keys = aes  : [keys] holds the IV / previous ciphertext block,
;                      round keys start at [keys + 32]
;        rD   = data : processed in place, 16 bytes per block (movdqa)
;        rN   = size : number of 16-byte blocks
;   Out: the last ciphertext block is written back to the start of the aes
;        buffer.
MY_SEG_PROC AesCbc_Encode_HW, 3
        MY_PROLOG (CENC_START_KEY + CENC_NUM_REG_KEYS + 0)

        movdqa  state, [keys]
        add     keys, 32

    i = 0
    rept CENC_NUM_REG_KEYS
        movdqa  @CatStr(xmm, %(CENC_START_KEY + i)), [keys + i * 16]
        i = i + 1
    endm

        ; keys -> key applied last; ksize_r -> negative offset of the first
        ; key that is not held in a register
        add     keys, ksize_r
        neg     ksize_r
        add     ksize_r, (16 * CENC_NUM_REG_KEYS)
        ; movdqa  last_key, [keys]
        jmp     check_e

    align 16
    nextBlock_e:
        movdqa  e0, [rD]
        mov     koffs_r, ksize_r
        pxor    e0, @CatStr(xmm, %(CENC_START_KEY))
        pxor    state, e0                       ; state = prev ^ input ^ first key

    i = 1
    rept (CENC_NUM_REG_KEYS - 1)
        aesenc  state, @CatStr(xmm, %(CENC_START_KEY + i))
        i = i + 1
    endm

    @@:
        OP_KEY  aesenc, 1 * koffs_r
        OP_KEY  aesenc, 1 * koffs_r + 16
        add     koffs_r, 32
        jnz     @B
        OP_KEY  aesenclast, 0
        ; aesenclast state, last_key

        movdqa  [rD], state
        add     rD, 16
    check_e:
        sub     rN, 1
        jnc     nextBlock_e

        ; movdqa  [keys - 32], state
        ; the expression below undoes the keys / ksize_r adjustments made above
        movdqa  [keys + 1 * ksize_r - (16 * CENC_NUM_REG_KEYS) - 32], state
MY_EPILOG
MY_SEG_ENDP



; ---------- AES-CTR ----------

ifdef x64
        ; ways = 11
endif


; Register assignment for the CTR procedures. key0 / key0_ymm are REDEFINED
; here: they move one register up compared with the CBC section, and "one"
; takes the register right after iv.
one             equ @CatStr(xmm, %(ways_start_reg + ways + 1))
one_ymm         equ @CatStr(ymm, %(ways_start_reg + ways + 1))
key0            equ @CatStr(xmm, %(ways_start_reg + ways + 2))
key0_ymm        equ @CatStr(ymm, %(ways_start_reg + ways + 2))
; Register count handed to MY_PROLOG by the CTR procedures.
NUM_CTR_REGS    equ (ways_start_reg + ways + 3)

; INIT_CTR reg, _ppp_
;   Advances the counter (iv += one, 64-bit lanes) and copies it to reg.
;   Written to be driven by WOP; the second argument is ignored.
INIT_CTR macro reg, _ppp_
        paddq   iv, one
        movdqa  reg, iv
endm


; AesCtr_Code_HW (3 parameters: aes, data, size)
;   AES-CTR: encrypts successive counter values and xors the result into the
;   data.
;   In:  keys = aes  : [keys] holds the counter block, round keys start at
;                      [keys + 32]
;        rD   = data : processed in place, 16 bytes per block (movdqa)
;        rN   = size : number of 16-byte blocks
;   Out: the advanced counter is written back to the start of the aes buffer.
;   The counter is incremented BEFORE each block is generated.
;   The global labels (::) are entry points used by AesCtr_Code_HW_256.
MY_SEG_PROC AesCtr_Code_HW, 3
    Ctr_start::
        MY_PROLOG NUM_CTR_REGS

    Ctr_start_2::
        movdqa  iv, [keys]
        add     keys, 32
        movdqa  key0, [keys]

        ; keys -> key applied last; ksize_r -> negative offset of the key
        ; that follows key0
        add     keys, ksize_r
        neg     ksize_r
        add     ksize_r, 16

    Ctr_start_3::
        mov     koffs_x, 1
        movd    one, koffs_x
        jmp     check2_c

    ; ---- main loop: "ways" blocks per iteration ----
    align 16
    nextBlocks2_c:
        WOP     INIT_CTR, 0
        mov     koffs_r, ksize_r
        ; WOP_KEY pxor, 1 * koffs_r -16
        WOP     pxor, key0
    @@:
        WOP_KEY aesenc, 1 * koffs_r
        add     koffs_r, 16
        jnz     @B
        WOP_KEY aesenclast, 0

        WOP     XOR_WITH_DATA
        WOP     WRITE_TO_DATA
        add     rD, ways * 16
    check2_c:
        sub     rN, ways
        jnc     nextBlocks2_c
        add     rN, ways                        ; rN = blocks left (< ways)

        ; re-base keys / ksize_r for the tail loop, which applies two keys per
        ; pass and the final two keys at fixed offsets 0 and 16
        sub     keys, 16
        add     ksize_r, 16

        jmp     check_c

    ; ---- tail loop: one block per iteration ----
    ; align 16
    nextBlock_c:
        paddq   iv, one
        ; movdqa  state, [keys + 1 * koffs_r - 16]
        movdqa  state, key0
        mov     koffs_r, ksize_r
        pxor    state, iv

    @@:
        OP_KEY  aesenc, 1 * koffs_r
        OP_KEY  aesenc, 1 * koffs_r + 16
        add     koffs_r, 32
        jnz     @B
        OP_KEY  aesenc, 0
        OP_KEY  aesenclast, 16

        pxor    state, [rD]
        movdqa  [rD], state
        add     rD, 16
    check_c:
        sub     rN, 1
        jnc     nextBlock_c

        ; movdqa  [keys - 32], iv
        ; the expression below undoes the keys / ksize_r adjustments made above
        movdqa  [keys + 1 * ksize_r - 16 - 32], iv
MY_EPILOG


; AesCtr_Code_HW_256 (3 parameters: aes, data, size)
;   AES-CTR, two counter blocks per ymm register (ways * 2 blocks per
;   iteration). Same arguments as AesCtr_Code_HW.
;   - Without use_vaes_256 the procedure is a single jump to Ctr_start.
;   - With fewer than ways * 2 blocks it jumps to Ctr_start_2 right after the
;     shared prolog.
;   - Otherwise the round keys after key0 are broadcast to 32 bytes in a
;     stack table (keys2), the wide loop runs, and the remaining blocks are
;     finished by jumping to Ctr_start_3, which also performs the epilog.
MY_PROC AesCtr_Code_HW_256, 3
  ifdef use_vaes_256
        MY_PROLOG NUM_CTR_REGS

        cmp     rN, ways * 2
        jb      Ctr_start_2

        vbroadcasti128  iv_ymm, xmmword ptr [keys]
        add     keys, 32
        vbroadcasti128  key0_ymm, xmmword ptr [keys]
        ; iv_ymm = { ctr - 1, ctr }, one_ymm = { 2, 2 }: each vpaddq in
        ; AVX__CTR_START then yields the next two counter values
        mov     koffs_x, 1
        vmovd   one, koffs_x
        vpsubq  iv_ymm, iv_ymm, one_ymm
        vpaddq  one, one, one
        AVX__vinserti128_TO_HIGH  one_ymm, one

        add     keys, ksize_r
        sub     ksize_x, 16
        neg     ksize_r
        mov     koffs_r, ksize_r
        add     ksize_r, ksize_r                ; table entries are 32 bytes

        ; room for the broadcast key table; keys2 points at its LAST entry
        AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 1) * 32)
        push    keys2
        lea     keys2, [r4 - 32]
        sub     r4, AVX_STACK_SUB
        and     keys2, -32                      ; 32-byte align
        vbroadcasti128  key_ymm, xmmword ptr [keys]
        vmovdqa         ymmword ptr [keys2], key_ymm
     @@:
        vbroadcasti128  key_ymm, xmmword ptr [keys + 1 * koffs_r]
        vmovdqa         ymmword ptr [keys2 + koffs_r * 2], key_ymm
        add     koffs_r, 16
        jnz     @B

        sub     rN, ways * 2

    align 16
    avx_ctr_nextBlock2:
        mov     koffs_r, ksize_r
        AVX__WOP    AVX__CTR_START
        ; AVX__WOP_KEY    AVX__CTR_START, 1 * koffs_r - 32
    @@:
        AVX__WOP_KEY    AVX__VAES_ENC, 1 * koffs_r
        add     koffs_r, 32
        jnz     @B
        AVX__WOP_KEY    AVX__VAES_ENC_LAST, 0

        AVX__WOP    AVX__XOR_WITH_DATA
        AVX__WOP    AVX__WRITE_TO_DATA

        add     rD, ways * 32
        sub     rN, ways * 2
        jnc     avx_ctr_nextBlock2
        add     rN, ways * 2

        vextracti128    iv, iv_ymm, 1           ; high half = latest counter
        sar     ksize_r, 1                      ; undo the doubling above

        add     r4, AVX_STACK_SUB
        pop     keys2

        vzeroupper
        jmp     Ctr_start_3
  else
        jmp     Ctr_start
  endif
MY_ENDP
MY_SEG_ENDP

end