; AesOpt.asm -- AES optimized code for x86 AES hardware instructions
; 2021-12-25 : Igor Pavlov : Public domain

include 7zAsm.asm

ifdef __ASMC__
  use_vaes_256 equ 1
else
ifdef ymm0
  use_vaes_256 equ 1
endif
endif
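
; Note: the 256-bit VAES paths are assembled only when the assembler is known
; to support them: ASMC predefines __ASMC__, and AVX-aware assembler versions
; predefine the ymm registers, so "ifdef ymm0" serves as a capability probe.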


ifdef use_vaes_256
  ECHO "++ VAES 256"
else
  ECHO "-- NO VAES 256"
endif

ifdef x64
  ECHO "x86-64"
else
  ECHO "x86"
  if (IS_CDECL gt 0)
    ECHO "ABI : CDECL"
  else
    ECHO "ABI : no CDECL : FASTCALL"
  endif
endif

if (IS_LINUX gt 0)
  ECHO "ABI : LINUX"
else
  ECHO "ABI : WINDOWS"
endif

MY_ASM_START

ifndef x64
.686
.xmm
endif


; MY_ALIGN EQU ALIGN(64)
MY_ALIGN EQU

SEG_ALIGN EQU MY_ALIGN

MY_SEG_PROC macro name:req, numParams:req
    ; seg_name equ @CatStr(_TEXT$, name)
    ; seg_name SEGMENT SEG_ALIGN 'CODE'
    MY_PROC name, numParams
endm

MY_SEG_ENDP macro
    ; seg_name ENDS
endm


NUM_AES_KEYS_MAX equ 15

; the number of push operations in the function prologue
if (IS_LINUX eq 0) or (IS_X64 eq 0)
num_regs_push       equ 2
stack_param_offset  equ (REG_SIZE * (1 + num_regs_push))
endif

ifdef x64
num_param equ REG_ABI_PARAM_2
else
  if (IS_CDECL gt 0)
    ; size_t size
    ; void * data
    ; UInt32 * aes
    ; ret-ip <- (r4)
    aes_OFFS   equ (stack_param_offset)
    data_OFFS  equ (REG_SIZE + aes_OFFS)
    size_OFFS  equ (REG_SIZE + data_OFFS)
    num_param  equ [r4 + size_OFFS]
  else
    num_param  equ [r4 + stack_param_offset]
  endif
endif

keys  equ REG_PARAM_0  ; r1
rD    equ REG_PARAM_1  ; r2
rN    equ r0

koffs_x equ x7
koffs_r equ r7

ksize_x equ x6
ksize_r equ r6

keys2 equ r3

state     equ xmm0
key       equ xmm0
key_ymm   equ ymm0
key_ymm_n equ 0

ifdef x64
  ways = 11
else
  ways = 4
endif

ways_start_reg equ 1

iv      equ @CatStr(xmm, %(ways_start_reg + ways))
iv_ymm  equ @CatStr(ymm, %(ways_start_reg + ways))
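
; Register budget: xmm(ways_start_reg) .. xmm(ways_start_reg + ways - 1) hold
; the "ways" data blocks processed per iteration, xmm0 is shared scratch for
; the current round key (or the single-block state), and iv sits in the first
; register after the data range.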


WOP macro op, op2
    i = 0
    rept ways
        op  @CatStr(xmm, %(ways_start_reg + i)), op2
        i = i + 1
    endm
endm
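
; WOP unrolls "op" across all data registers; for example, with ways = 4 and
; ways_start_reg = 1, "WOP pxor, key0" expands to:
;     pxor xmm1, key0
;     pxor xmm2, key0
;     pxor xmm3, key0
;     pxor xmm4, key0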


ifndef ABI_LINUX
ifdef x64

; we use the 32 bytes of home space on the stack in WIN64-x64
NUM_HOME_MM_REGS  equ (32 / 16)
; we preserve xmm registers starting from xmm6 in WIN64-x64
MM_START_SAVE_REG equ 6

SAVE_XMM macro num_used_mm_regs:req
    num_save_mm_regs = num_used_mm_regs - MM_START_SAVE_REG
    if num_save_mm_regs GT 0
        num_save_mm_regs2 = num_save_mm_regs - NUM_HOME_MM_REGS
        ; RSP is (16*x + 8) after entering the function in WIN64-x64
        stack_offset = 16 * num_save_mm_regs2 + (stack_param_offset mod 16)

        i = 0
        rept num_save_mm_regs

            if i eq NUM_HOME_MM_REGS
                sub     r4, stack_offset
            endif

            if i lt NUM_HOME_MM_REGS
                movdqa  [r4 + stack_param_offset + i * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
            else
                movdqa  [r4 + (i - NUM_HOME_MM_REGS) * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
            endif

            i = i + 1
        endm
    endif
endm

RESTORE_XMM macro num_used_mm_regs:req
    if num_save_mm_regs GT 0
        i = 0
        if num_save_mm_regs2 GT 0
            rept num_save_mm_regs2
                movdqa  @CatStr(xmm, %(MM_START_SAVE_REG + NUM_HOME_MM_REGS + i)), [r4 + i * 16]
                i = i + 1
            endm
            add     r4, stack_offset
        endif

        num_low_regs = num_save_mm_regs - i
        i = 0
        rept num_low_regs
            movdqa  @CatStr(xmm, %(MM_START_SAVE_REG + i)), [r4 + stack_param_offset + i * 16]
            i = i + 1
        endm
    endif
endm

endif ; x64
endif ; ABI_LINUX
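
; The Win64 ABI treats xmm6-xmm15 as callee-saved. SAVE_XMM spills the first
; NUM_HOME_MM_REGS of them into the caller-provided 32-byte home space and
; allocates stack only for the rest; adding (stack_param_offset mod 16) to
; stack_offset re-aligns RSP to 16 bytes (RSP is 16*x + 8 at entry), so the
; aligned movdqa stores are safe.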


MY_PROLOG macro num_used_mm_regs:req
    ; num_regs_push: must be equal to the number of push operations below
    ; push    r3
    ; push    r5
    if (IS_LINUX eq 0) or (IS_X64 eq 0)
        push    r6
        push    r7
    endif

        mov     rN, num_param   ; don't move this instruction: num_param can use the stack pointer (r4)

    if (IS_X64 eq 0)
    if (IS_CDECL gt 0)
        mov     rD, [r4 + data_OFFS]
        mov     keys, [r4 + aes_OFFS]
    endif
    elseif (IS_LINUX gt 0)
        MY_ABI_LINUX_TO_WIN_2
    endif


    ifndef ABI_LINUX
    ifdef x64
        SAVE_XMM num_used_mm_regs
    endif
    endif

        mov     ksize_x, [keys + 16]
        shl     ksize_x, 5
endm
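
; Key-schedule note: the dword at [keys + 16] appears to hold half the AES
; round count (numRounds2), so "shl ksize_x, 5" gives numRounds2 * 32 =
; numRounds * 16, the byte size of the expanded round keys that start at
; [keys + 32], after the 16-byte IV and a 16-byte header.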


MY_EPILOG macro
    ifndef ABI_LINUX
    ifdef x64
        RESTORE_XMM num_save_mm_regs
    endif
    endif

    if (IS_LINUX eq 0) or (IS_X64 eq 0)
        pop     r7
        pop     r6
    endif
    ; pop     r5
    ; pop     r3
    MY_ENDP
endm


OP_KEY macro op:req, offs:req
    op      state, [keys + offs]
endm


WOP_KEY macro op:req, offs:req
    movdqa  key, [keys + offs]
    WOP     op, key
endm


; ---------- AES-CBC Decode ----------


XOR_WITH_DATA macro reg, _ppp_
    pxor    reg, [rD + i * 16]
endm

WRITE_TO_DATA macro reg, _ppp_
    movdqa  [rD + i * 16], reg
endm


; state0 equ @CatStr(xmm, %(ways_start_reg))

key0      equ @CatStr(xmm, %(ways_start_reg + ways + 1))
key0_ymm  equ @CatStr(ymm, %(ways_start_reg + ways + 1))

key_last        equ @CatStr(xmm, %(ways_start_reg + ways + 2))
key_last_ymm    equ @CatStr(ymm, %(ways_start_reg + ways + 2))
key_last_ymm_n  equ (ways_start_reg + ways + 2)

NUM_CBC_REGS equ (ways_start_reg + ways + 3)
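
; The decode loops below cache the two schedule endpoints in registers:
; key0 (the highest-address entry, XORed in first) and key_last (the entry
; at [keys], used by aesdeclast); the middle keys are read from memory while
; koffs_r walks down through the schedule.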


MY_SEG_PROC AesCbc_Decode_HW, 3

AesCbc_Decode_HW_start::
        MY_PROLOG NUM_CBC_REGS

AesCbc_Decode_HW_start_2::
        movdqa  iv, [keys]
        add     keys, 32

        movdqa  key0, [keys + 1 * ksize_r]
        movdqa  key_last, [keys]
        sub     ksize_x, 16

        jmp     check2
align 16
nextBlocks2:
        WOP     movdqa, [rD + i * 16]
        mov     koffs_x, ksize_x
        ; WOP_KEY pxor, ksize_r + 16
        WOP     pxor, key0
        ; align 16
@@:
        WOP_KEY aesdec, 1 * koffs_r
        sub     koffs_r, 16
        jnz     @B
        ; WOP_KEY aesdeclast, 0
        WOP     aesdeclast, key_last

        pxor    @CatStr(xmm, %(ways_start_reg)), iv
        i = 1
        rept ways - 1
                pxor    @CatStr(xmm, %(ways_start_reg + i)), [rD + i * 16 - 16]
                i = i + 1
        endm
        movdqa  iv, [rD + ways * 16 - 16]
        WOP     WRITE_TO_DATA

        add     rD, ways * 16
AesCbc_Decode_HW_start_3::
check2:
        sub     rN, ways
        jnc     nextBlocks2
        add     rN, ways

        sub     ksize_x, 16

        jmp     check
nextBlock:
        movdqa  state, [rD]
        mov     koffs_x, ksize_x
        ; OP_KEY pxor, 1 * ksize_r + 32
        pxor    state, key0
        ; movdqa state0, [rD]
        ; movdqa state, key0
        ; pxor   state, state0
@@:
        OP_KEY  aesdec, 1 * koffs_r + 16
        OP_KEY  aesdec, 1 * koffs_r
        sub     koffs_r, 32
        jnz     @B
        OP_KEY  aesdec, 16
        ; OP_KEY aesdeclast, 0
        aesdeclast state, key_last

        pxor    state, iv
        movdqa  iv, [rD]
        ; movdqa iv, state0
        movdqa  [rD], state

        add     rD, 16
check:
        sub     rN, 1
        jnc     nextBlock

        movdqa  [keys - 32], iv
        MY_EPILOG




; ---------- AVX ----------


AVX__WOP_n macro op
    i = 0
    rept ways
        op  (ways_start_reg + i)
        i = i + 1
    endm
endm

AVX__WOP macro op
    i = 0
    rept ways
        op  @CatStr(ymm, %(ways_start_reg + i))
        i = i + 1
    endm
endm


AVX__WOP_KEY macro op:req, offs:req
    vmovdqa key_ymm, ymmword ptr [keys2 + offs]
    AVX__WOP_n op
endm


AVX__CBC_START macro reg
    ; vpxor   reg, key_ymm, ymmword ptr [rD + 32 * i]
    vpxor   reg, key0_ymm, ymmword ptr [rD + 32 * i]
endm

AVX__CBC_END macro reg
    if i eq 0
        vpxor   reg, reg, iv_ymm
    else
        vpxor   reg, reg, ymmword ptr [rD + i * 32 - 16]
    endif
endm


AVX__WRITE_TO_DATA macro reg
    vmovdqu ymmword ptr [rD + 32 * i], reg
endm

AVX__XOR_WITH_DATA macro reg
    vpxor   reg, reg, ymmword ptr [rD + 32 * i]
endm

AVX__CTR_START macro reg
    vpaddq  iv_ymm, iv_ymm, one_ymm
    ; vpxor   reg, iv_ymm, key_ymm
    vpxor   reg, iv_ymm, key0_ymm
endm


MY_VAES_INSTR_2 macro cmd, dest, a1, a2
    db  0c4H
    db  2 + 040H + 020h * (1 - (a2) / 8) + 080h * (1 - (dest) / 8)
    db  5 + 8 * ((not (a1)) and 15)
    db  cmd
    db  0c0H + 8 * ((dest) and 7) + ((a2) and 7)
endm

MY_VAES_INSTR macro cmd, dest, a
    MY_VAES_INSTR_2 cmd, dest, dest, a
endm

MY_vaesenc macro dest, a
    MY_VAES_INSTR 0dcH, dest, a
endm
MY_vaesenclast macro dest, a
    MY_VAES_INSTR 0ddH, dest, a
endm
MY_vaesdec macro dest, a
    MY_VAES_INSTR 0deH, dest, a
endm
MY_vaesdeclast macro dest, a
    MY_VAES_INSTR 0dfH, dest, a
endm
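
; These macros hand-assemble the 256-bit VAES instructions as raw bytes, so
; the file still builds with assemblers that lack the vaes* ymm mnemonics.
; Byte layout (3-byte VEX encoding):
;   0C4h   - VEX escape byte
;   byte 1 - map select 0F38h (mmmmm = 2), X = 1, plus the inverted R and B
;            register-extension bits derived from dest and a2
;   byte 2 - W = 0, vvvv = inverted a1, L = 1 (256-bit), pp = 66h prefix
;   cmd    - the AES opcode (DCh..DFh for enc/enclast/dec/declast)
;   last   - ModRM byte, register-direct form: mod = 11b, reg = dest, r/m = a2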


AVX__VAES_DEC macro reg
    MY_vaesdec reg, key_ymm_n
endm

AVX__VAES_DEC_LAST_key_last macro reg
    ; MY_vaesdeclast reg, key_ymm_n
    MY_vaesdeclast reg, key_last_ymm_n
endm

AVX__VAES_ENC macro reg
    MY_vaesenc reg, key_ymm_n
endm

AVX__VAES_ENC_LAST macro reg
    MY_vaesenclast reg, key_ymm_n
endm

AVX__vinserti128_TO_HIGH macro dest, src
    vinserti128 dest, dest, src, 1
endm


MY_PROC AesCbc_Decode_HW_256, 3
ifdef use_vaes_256
        MY_PROLOG NUM_CBC_REGS

        cmp     rN, ways * 2
        jb      AesCbc_Decode_HW_start_2

        vmovdqa iv, xmmword ptr [keys]
        add     keys, 32

        vbroadcasti128  key0_ymm, xmmword ptr [keys + 1 * ksize_r]
        vbroadcasti128  key_last_ymm, xmmword ptr [keys]
        sub     ksize_x, 16
        mov     koffs_x, ksize_x
        add     ksize_x, ksize_x

        AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 2) * 32)
        push    keys2
        sub     r4, AVX_STACK_SUB
        ; sub     r4, 32
        ; sub     r4, ksize_r
        ; lea     keys2, [r4 + 32]
        mov     keys2, r4
        and     keys2, -32
broad:
        vbroadcasti128  key_ymm, xmmword ptr [keys + 1 * koffs_r]
        vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm
        sub     koffs_r, 16
        ; jnc     broad
        jnz     broad
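
        ; The "broad" loop above builds a 256-bit copy of the key schedule in
        ; an aligned scratch area on the stack: each 16-byte round key is
        ; broadcast into both lanes of a ymm and stored at [keys2 + koffs * 2],
        ; matching the doubled offsets (ksize_x was doubled) used below.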

        sub     rN, ways * 2

align 16
avx_cbcdec_nextBlock2:
        mov     koffs_x, ksize_x
        ; AVX__WOP_KEY AVX__CBC_START, 1 * koffs_r + 32
        AVX__WOP AVX__CBC_START
@@:
        AVX__WOP_KEY AVX__VAES_DEC, 1 * koffs_r
        sub     koffs_r, 32
        jnz     @B
        ; AVX__WOP_KEY AVX__VAES_DEC_LAST, 0
        AVX__WOP_n AVX__VAES_DEC_LAST_key_last

        AVX__vinserti128_TO_HIGH iv_ymm, xmmword ptr [rD]
        AVX__WOP AVX__CBC_END

        vmovdqa iv, xmmword ptr [rD + ways * 32 - 16]
        AVX__WOP AVX__WRITE_TO_DATA

        add     rD, ways * 32
        sub     rN, ways * 2
        jnc     avx_cbcdec_nextBlock2
        add     rN, ways * 2

        shr     ksize_x, 1

        ; lea     r4, [r4 + 1 * ksize_r + 32]
        add     r4, AVX_STACK_SUB
        pop     keys2

        vzeroupper
        jmp     AesCbc_Decode_HW_start_3
else
        jmp     AesCbc_Decode_HW_start
endif
MY_ENDP
MY_SEG_ENDP




; ---------- AES-CBC Encode ----------

e0 equ xmm1

CENC_START_KEY    equ 2
CENC_NUM_REG_KEYS equ (3 * 2)
; last_key equ @CatStr(xmm, %(CENC_START_KEY + CENC_NUM_REG_KEYS))
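
; CBC encode is inherently serial (each block depends on the previous
; ciphertext), so instead of multi-way unrolling, the first CENC_NUM_REG_KEYS
; round keys are kept in xmm registers to shorten the per-block dependency
; chain; the remaining keys are streamed from memory.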

MY_SEG_PROC AesCbc_Encode_HW, 3
        MY_PROLOG (CENC_START_KEY + CENC_NUM_REG_KEYS + 0)

        movdqa  state, [keys]
        add     keys, 32

        i = 0
        rept CENC_NUM_REG_KEYS
                movdqa  @CatStr(xmm, %(CENC_START_KEY + i)), [keys + i * 16]
                i = i + 1
        endm

        add     keys, ksize_r
        neg     ksize_r
        add     ksize_r, (16 * CENC_NUM_REG_KEYS)
        ; movdqa  last_key, [keys]
        jmp     check_e

align 16
nextBlock_e:
        movdqa  e0, [rD]
        mov     koffs_r, ksize_r
        pxor    e0, @CatStr(xmm, %(CENC_START_KEY))
        pxor    state, e0

        i = 1
        rept (CENC_NUM_REG_KEYS - 1)
                aesenc  state, @CatStr(xmm, %(CENC_START_KEY + i))
                i = i + 1
        endm

@@:
        OP_KEY  aesenc, 1 * koffs_r
        OP_KEY  aesenc, 1 * koffs_r + 16
        add     koffs_r, 32
        jnz     @B
        OP_KEY  aesenclast, 0
        ; aesenclast state, last_key

        movdqa  [rD], state
        add     rD, 16
check_e:
        sub     rN, 1
        jnc     nextBlock_e

        ; movdqa  [keys - 32], state
        movdqa  [keys + 1 * ksize_r - (16 * CENC_NUM_REG_KEYS) - 32], state
        MY_EPILOG
MY_SEG_ENDP



; ---------- AES-CTR ----------

ifdef x64
        ; ways = 11
endif


one       equ @CatStr(xmm, %(ways_start_reg + ways + 1))
one_ymm   equ @CatStr(ymm, %(ways_start_reg + ways + 1))
key0      equ @CatStr(xmm, %(ways_start_reg + ways + 2))
key0_ymm  equ @CatStr(ymm, %(ways_start_reg + ways + 2))
NUM_CTR_REGS equ (ways_start_reg + ways + 3)

INIT_CTR macro reg, _ppp_
    paddq   iv, one
    movdqa  reg, iv
endm
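
; CTR note: the counter is advanced with paddq against a "one" whose high
; qword is zero, so only the low 64 bits of iv increment (no carry into the
; high qword), and each encrypted counter block is XORed with the data in
; place.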


MY_SEG_PROC AesCtr_Code_HW, 3
Ctr_start::
        MY_PROLOG NUM_CTR_REGS

Ctr_start_2::
        movdqa  iv, [keys]
        add     keys, 32
        movdqa  key0, [keys]

        add     keys, ksize_r
        neg     ksize_r
        add     ksize_r, 16

Ctr_start_3::
        mov     koffs_x, 1
        movd    one, koffs_x
        jmp     check2_c

align 16
nextBlocks2_c:
        WOP     INIT_CTR, 0
        mov     koffs_r, ksize_r
        ; WOP_KEY pxor, 1 * koffs_r - 16
        WOP     pxor, key0
@@:
        WOP_KEY aesenc, 1 * koffs_r
        add     koffs_r, 16
        jnz     @B
        WOP_KEY aesenclast, 0

        WOP     XOR_WITH_DATA
        WOP     WRITE_TO_DATA
        add     rD, ways * 16
check2_c:
        sub     rN, ways
        jnc     nextBlocks2_c
        add     rN, ways

        sub     keys, 16
        add     ksize_r, 16

        jmp     check_c

; align 16
nextBlock_c:
        paddq   iv, one
        ; movdqa  state, [keys + 1 * koffs_r - 16]
        movdqa  state, key0
        mov     koffs_r, ksize_r
        pxor    state, iv

@@:
        OP_KEY  aesenc, 1 * koffs_r
        OP_KEY  aesenc, 1 * koffs_r + 16
        add     koffs_r, 32
        jnz     @B
        OP_KEY  aesenc, 0
        OP_KEY  aesenclast, 16

        pxor    state, [rD]
        movdqa  [rD], state
        add     rD, 16
check_c:
        sub     rN, 1
        jnc     nextBlock_c

        ; movdqa  [keys - 32], iv
        movdqa  [keys + 1 * ksize_r - 16 - 32], iv
        MY_EPILOG


MY_PROC AesCtr_Code_HW_256, 3
ifdef use_vaes_256
        MY_PROLOG NUM_CTR_REGS

        cmp     rN, ways * 2
        jb      Ctr_start_2

        vbroadcasti128  iv_ymm, xmmword ptr [keys]
        add     keys, 32
        vbroadcasti128  key0_ymm, xmmword ptr [keys]
        mov     koffs_x, 1
        vmovd   one, koffs_x
        vpsubq  iv_ymm, iv_ymm, one_ymm
        vpaddq  one, one, one
        AVX__vinserti128_TO_HIGH one_ymm, one
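
        ; 256-bit counter setup: both lanes start as the IV, the low lane is
        ; then decremented by 1, and "one" is doubled and broadcast to both
        ; lanes; every AVX__CTR_START then adds 2 to each lane, so one ymm
        ; always carries two consecutive counter blocks (n low, n+1 high).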

        add     keys, ksize_r
        sub     ksize_x, 16
        neg     ksize_r
        mov     koffs_r, ksize_r
        add     ksize_r, ksize_r

        AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 1) * 32)
        push    keys2
        lea     keys2, [r4 - 32]
        sub     r4, AVX_STACK_SUB
        and     keys2, -32
        vbroadcasti128  key_ymm, xmmword ptr [keys]
        vmovdqa ymmword ptr [keys2], key_ymm
@@:
        vbroadcasti128  key_ymm, xmmword ptr [keys + 1 * koffs_r]
        vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm
        add     koffs_r, 16
        jnz     @B

        sub     rN, ways * 2

align 16
avx_ctr_nextBlock2:
        mov     koffs_r, ksize_r
        AVX__WOP AVX__CTR_START
        ; AVX__WOP_KEY AVX__CTR_START, 1 * koffs_r - 32
@@:
        AVX__WOP_KEY AVX__VAES_ENC, 1 * koffs_r
        add     koffs_r, 32
        jnz     @B
        AVX__WOP_KEY AVX__VAES_ENC_LAST, 0

        AVX__WOP AVX__XOR_WITH_DATA
        AVX__WOP AVX__WRITE_TO_DATA

        add     rD, ways * 32
        sub     rN, ways * 2
        jnc     avx_ctr_nextBlock2
        add     rN, ways * 2

        vextracti128    iv, iv_ymm, 1
        sar     ksize_r, 1

        add     r4, AVX_STACK_SUB
        pop     keys2

        vzeroupper
        jmp     Ctr_start_3
else
        jmp     Ctr_start
endif
MY_ENDP
MY_SEG_ENDP

end