1 ; AesOpt.asm -- AES optimized code for x86 AES hardware instructions
\r
2 ; 2021-12-25 : Igor Pavlov : Public domain
\r
18 ECHO "-- NO VAES 256"
\r
28 ECHO "ABI : no CDECL : FASTCALL"
\r
35 ECHO "ABI : WINDOWS"
\r
46 ; MY_ALIGN EQU ALIGN(64)
\r
49 SEG_ALIGN EQU MY_ALIGN
\r
51 MY_SEG_PROC macro name:req, numParams:req
\r
52 ; seg_name equ @CatStr(_TEXT$, name)
\r
53 ; seg_name SEGMENT SEG_ALIGN 'CODE'
\r
54 MY_PROC name, numParams
\r
; Upper bound on the number of 16-byte round keys in the expanded key
; schedule (AES-256 uses 14 rounds + 1 initial key = 15) — used below to
; size the on-stack area for broadcast 32-byte key copies (AVX_STACK_SUB).
62 NUM_AES_KEYS_MAX equ 15
\r
64 ; the number of push operators in function PROLOG
\r
65 if (IS_LINUX eq 0) or (IS_X64 eq 0)
\r
67 stack_param_offset equ (REG_SIZE * (1 + num_regs_push))
\r
71 num_param equ REG_ABI_PARAM_2
\r
78 aes_OFFS equ (stack_param_offset)
\r
79 data_OFFS equ (REG_SIZE + aes_OFFS)
\r
80 size_OFFS equ (REG_SIZE + data_OFFS)
\r
81 num_param equ [r4 + size_OFFS]
\r
83 num_param equ [r4 + stack_param_offset]
\r
87 keys equ REG_PARAM_0 ; r1
\r
88 rD equ REG_PARAM_1 ; r2
\r
; Index of the first xmm/ymm register holding parallel block state:
; way i lives in xmm(ways_start_reg + i); iv/key registers are assigned
; just above the last way (see the @CatStr equates that follow).
110 ways_start_reg equ 1
\r
112 iv equ @CatStr(xmm, %(ways_start_reg + ways))
\r
113 iv_ymm equ @CatStr(ymm, %(ways_start_reg + ways))
\r
119 op @CatStr(xmm, %(ways_start_reg + i)), op2
\r
128 ; we use 32 bytes of home space in stack in WIN64-x64
\r
129 NUM_HOME_MM_REGS equ (32 / 16)
\r
130 ; we preserve xmm registers starting from xmm6 in WIN64-x64
\r
131 MM_START_SAVE_REG equ 6
\r
133 SAVE_XMM macro num_used_mm_regs:req
\r
134 num_save_mm_regs = num_used_mm_regs - MM_START_SAVE_REG
\r
135 if num_save_mm_regs GT 0
\r
136 num_save_mm_regs2 = num_save_mm_regs - NUM_HOME_MM_REGS
\r
137 ; RSP is (16*x + 8) after entering the function in WIN64-x64
\r
138 stack_offset = 16 * num_save_mm_regs2 + (stack_param_offset mod 16)
\r
141 rept num_save_mm_regs
\r
143 if i eq NUM_HOME_MM_REGS
\r
144 sub r4, stack_offset
\r
147 if i lt NUM_HOME_MM_REGS
\r
148 movdqa [r4 + stack_param_offset + i * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
\r
150 movdqa [r4 + (i - NUM_HOME_MM_REGS) * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
\r
158 RESTORE_XMM macro num_used_mm_regs:req
\r
159 if num_save_mm_regs GT 0
\r
161 if num_save_mm_regs2 GT 0
\r
162 rept num_save_mm_regs2
\r
163 movdqa @CatStr(xmm, %(MM_START_SAVE_REG + NUM_HOME_MM_REGS + i)), [r4 + i * 16]
\r
166 add r4, stack_offset
\r
169 num_low_regs = num_save_mm_regs - i
\r
172 movdqa @CatStr(xmm, %(MM_START_SAVE_REG + i)), [r4 + stack_param_offset + i * 16]
\r
182 MY_PROLOG macro num_used_mm_regs:req
\r
183 ; num_regs_push: must be equal to the number of push operators
\r
186 if (IS_LINUX eq 0) or (IS_X64 eq 0)
\r
191 mov rN, num_param ; don't move it; num_param can use stack pointer (r4)
\r
195 mov rD, [r4 + data_OFFS]
\r
196 mov keys, [r4 + aes_OFFS]
\r
198 elseif (IS_LINUX gt 0)
\r
199 MY_ABI_LINUX_TO_WIN_2
\r
205 SAVE_XMM num_used_mm_regs
\r
209 mov ksize_x, [keys + 16]
\r
217 RESTORE_XMM num_save_mm_regs
\r
221 if (IS_LINUX eq 0) or (IS_X64 eq 0)
\r
231 OP_KEY macro op:req, offs:req
\r
232 op state, [keys + offs]
\r
236 WOP_KEY macro op:req, offs:req
\r
237 movdqa key, [keys + offs]
\r
242 ; ---------- AES-CBC Decode ----------
\r
245 XOR_WITH_DATA macro reg, _ppp_
\r
246 pxor reg, [rD + i * 16]
\r
249 WRITE_TO_DATA macro reg, _ppp_
\r
250 movdqa [rD + i * 16], reg
\r
254 ; state0 equ @CatStr(xmm, %(ways_start_reg))
\r
256 key0 equ @CatStr(xmm, %(ways_start_reg + ways + 1))
\r
257 key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1))
\r
259 key_last equ @CatStr(xmm, %(ways_start_reg + ways + 2))
\r
260 key_last_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2))
\r
261 key_last_ymm_n equ (ways_start_reg + ways + 2)
\r
; Total vector registers used by the CBC-decode path:
; `ways` state regs plus iv (+0), key0 (+1) and key_last (+2) above them.
; Passed to MY_PROLOG so WIN64 callee-saved xmm6+ get preserved as needed.
263 NUM_CBC_REGS equ (ways_start_reg + ways + 3)
\r
266 MY_SEG_PROC AesCbc_Decode_HW, 3
\r
268 AesCbc_Decode_HW_start::
\r
269 MY_PROLOG NUM_CBC_REGS
\r
271 AesCbc_Decode_HW_start_2::
\r
275 movdqa key0, [keys + 1 * ksize_r]
\r
276 movdqa key_last, [keys]
\r
282 WOP movdqa, [rD + i * 16]
\r
283 mov koffs_x, ksize_x
\r
284 ; WOP_KEY pxor, ksize_r + 16
\r
288 WOP_KEY aesdec, 1 * koffs_r
\r
291 ; WOP_KEY aesdeclast, 0
\r
292 WOP aesdeclast, key_last
\r
294 pxor @CatStr(xmm, %(ways_start_reg)), iv
\r
297 pxor @CatStr(xmm, %(ways_start_reg + i)), [rD + i * 16 - 16]
\r
300 movdqa iv, [rD + ways * 16 - 16]
\r
304 AesCbc_Decode_HW_start_3::
\r
315 mov koffs_x, ksize_x
\r
316 ; OP_KEY pxor, 1 * ksize_r + 32
\r
318 ; movdqa state0, [rD]
\r
319 ; movdqa state, key0
\r
320 ; pxor state, state0
\r
322 OP_KEY aesdec, 1 * koffs_r + 16
\r
323 OP_KEY aesdec, 1 * koffs_r
\r
327 ; OP_KEY aesdeclast, 0
\r
328 aesdeclast state, key_last
\r
332 ; movdqa iv, state0
\r
340 movdqa [keys - 32], iv
\r
346 ; ---------- AVX ----------
\r
349 AVX__WOP_n macro op
\r
352 op (ways_start_reg + i)
\r
360 op @CatStr(ymm, %(ways_start_reg + i))
\r
366 AVX__WOP_KEY macro op:req, offs:req
\r
367 vmovdqa key_ymm, ymmword ptr [keys2 + offs]
\r
372 AVX__CBC_START macro reg
\r
373 ; vpxor reg, key_ymm, ymmword ptr [rD + 32 * i]
\r
374 vpxor reg, key0_ymm, ymmword ptr [rD + 32 * i]
\r
377 AVX__CBC_END macro reg
\r
379 vpxor reg, reg, iv_ymm
\r
381 vpxor reg, reg, ymmword ptr [rD + i * 32 - 16]
\r
386 AVX__WRITE_TO_DATA macro reg
\r
387 vmovdqu ymmword ptr [rD + 32 * i], reg
\r
390 AVX__XOR_WITH_DATA macro reg
\r
391 vpxor reg, reg, ymmword ptr [rD + 32 * i]
\r
394 AVX__CTR_START macro reg
\r
395 vpaddq iv_ymm, iv_ymm, one_ymm
\r
396 ; vpxor reg, iv_ymm, key_ymm
\r
397 vpxor reg, iv_ymm, key0_ymm
\r
401 MY_VAES_INSTR_2 macro cmd, dest, a1, a2
\r
403 db 2 + 040H + 020h * (1 - (a2) / 8) + 080h * (1 - (dest) / 8)
\r
404 db 5 + 8 * ((not (a1)) and 15)
\r
406 db 0c0H + 8 * ((dest) and 7) + ((a2) and 7)
\r
409 MY_VAES_INSTR macro cmd, dest, a
\r
410 MY_VAES_INSTR_2 cmd, dest, dest, a
\r
413 MY_vaesenc macro dest, a
\r
414 MY_VAES_INSTR 0dcH, dest, a
\r
416 MY_vaesenclast macro dest, a
\r
417 MY_VAES_INSTR 0ddH, dest, a
\r
419 MY_vaesdec macro dest, a
\r
420 MY_VAES_INSTR 0deH, dest, a
\r
422 MY_vaesdeclast macro dest, a
\r
423 MY_VAES_INSTR 0dfH, dest, a
\r
427 AVX__VAES_DEC macro reg
\r
428 MY_vaesdec reg, key_ymm_n
\r
431 AVX__VAES_DEC_LAST_key_last macro reg
\r
432 ; MY_vaesdeclast reg, key_ymm_n
\r
433 MY_vaesdeclast reg, key_last_ymm_n
\r
436 AVX__VAES_ENC macro reg
\r
437 MY_vaesenc reg, key_ymm_n
\r
440 AVX__VAES_ENC_LAST macro reg
\r
441 MY_vaesenclast reg, key_ymm_n
\r
444 AVX__vinserti128_TO_HIGH macro dest, src
\r
445 vinserti128 dest, dest, src, 1
\r
449 MY_PROC AesCbc_Decode_HW_256, 3
\r
451 MY_PROLOG NUM_CBC_REGS
\r
454 jb AesCbc_Decode_HW_start_2
\r
456 vmovdqa iv, xmmword ptr [keys]
\r
459 vbroadcasti128 key0_ymm, xmmword ptr [keys + 1 * ksize_r]
\r
460 vbroadcasti128 key_last_ymm, xmmword ptr [keys]
\r
462 mov koffs_x, ksize_x
\r
463 add ksize_x, ksize_x
\r
465 AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 2) * 32)
\r
467 sub r4, AVX_STACK_SUB
\r
470 ; lea keys2, [r4 + 32]
\r
474 vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r]
\r
475 vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm
\r
483 avx_cbcdec_nextBlock2:
\r
484 mov koffs_x, ksize_x
\r
485 ; AVX__WOP_KEY AVX__CBC_START, 1 * koffs_r + 32
\r
486 AVX__WOP AVX__CBC_START
\r
488 AVX__WOP_KEY AVX__VAES_DEC, 1 * koffs_r
\r
491 ; AVX__WOP_KEY AVX__VAES_DEC_LAST, 0
\r
492 AVX__WOP_n AVX__VAES_DEC_LAST_key_last
\r
494 AVX__vinserti128_TO_HIGH iv_ymm, xmmword ptr [rD]
\r
495 AVX__WOP AVX__CBC_END
\r
497 vmovdqa iv, xmmword ptr [rD + ways * 32 - 16]
\r
498 AVX__WOP AVX__WRITE_TO_DATA
\r
502 jnc avx_cbcdec_nextBlock2
\r
507 ; lea r4, [r4 + 1 * ksize_r + 32]
\r
508 add r4, AVX_STACK_SUB
\r
512 jmp AesCbc_Decode_HW_start_3
\r
514 jmp AesCbc_Decode_HW_start
\r
522 ; ---------- AES-CBC Encode ----------
\r
; First xmm register index used to cache leading round keys for CBC encode;
; CENC_NUM_REG_KEYS keys are preloaded into xmm(CENC_START_KEY + i) below.
526 CENC_START_KEY equ 2
\r
527 CENC_NUM_REG_KEYS equ (3 * 2)
\r
528 ; last_key equ @CatStr(xmm, %(CENC_START_KEY + CENC_NUM_REG_KEYS))
\r
530 MY_SEG_PROC AesCbc_Encode_HW, 3
\r
531 MY_PROLOG (CENC_START_KEY + CENC_NUM_REG_KEYS + 0)
\r
533 movdqa state, [keys]
\r
537 rept CENC_NUM_REG_KEYS
\r
538 movdqa @CatStr(xmm, %(CENC_START_KEY + i)), [keys + i * 16]
\r
544 add ksize_r, (16 * CENC_NUM_REG_KEYS)
\r
545 ; movdqa last_key, [keys]
\r
551 mov koffs_r, ksize_r
\r
552 pxor e0, @CatStr(xmm, %(CENC_START_KEY))
\r
556 rept (CENC_NUM_REG_KEYS - 1)
\r
557 aesenc state, @CatStr(xmm, %(CENC_START_KEY + i))
\r
562 OP_KEY aesenc, 1 * koffs_r
\r
563 OP_KEY aesenc, 1 * koffs_r + 16
\r
566 OP_KEY aesenclast, 0
\r
567 ; aesenclast state, last_key
\r
575 ; movdqa [keys - 32], state
\r
576 movdqa [keys + 1 * ksize_r - (16 * CENC_NUM_REG_KEYS) - 32], state
\r
582 ; ---------- AES-CTR ----------
\r
589 one equ @CatStr(xmm, %(ways_start_reg + ways + 1))
\r
590 one_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1))
\r
591 key0 equ @CatStr(xmm, %(ways_start_reg + ways + 2))
\r
592 key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2))
\r
; Total vector registers used by the CTR path:
; `ways` state regs plus iv (+0), one (+1) and key0 (+2) above them
; (see the one/one_ymm and key0/key0_ymm equates just above).
593 NUM_CTR_REGS equ (ways_start_reg + ways + 3)
\r
595 INIT_CTR macro reg, _ppp_
\r
601 MY_SEG_PROC AesCtr_Code_HW, 3
\r
603 MY_PROLOG NUM_CTR_REGS
\r
608 movdqa key0, [keys]
\r
622 mov koffs_r, ksize_r
\r
623 ; WOP_KEY pxor, 1 * koffs_r -16
\r
626 WOP_KEY aesenc, 1 * koffs_r
\r
629 WOP_KEY aesenclast, 0
\r
647 ; movdqa state, [keys + 1 * koffs_r - 16]
\r
649 mov koffs_r, ksize_r
\r
653 OP_KEY aesenc, 1 * koffs_r
\r
654 OP_KEY aesenc, 1 * koffs_r + 16
\r
658 OP_KEY aesenclast, 16
\r
667 ; movdqa [keys - 32], iv
\r
668 movdqa [keys + 1 * ksize_r - 16 - 32], iv
\r
672 MY_PROC AesCtr_Code_HW_256, 3
\r
674 MY_PROLOG NUM_CTR_REGS
\r
679 vbroadcasti128 iv_ymm, xmmword ptr [keys]
\r
681 vbroadcasti128 key0_ymm, xmmword ptr [keys]
\r
684 vpsubq iv_ymm, iv_ymm, one_ymm
\r
685 vpaddq one, one, one
\r
686 AVX__vinserti128_TO_HIGH one_ymm, one
\r
691 mov koffs_r, ksize_r
\r
692 add ksize_r, ksize_r
\r
694 AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 1) * 32)
\r
696 lea keys2, [r4 - 32]
\r
697 sub r4, AVX_STACK_SUB
\r
699 vbroadcasti128 key_ymm, xmmword ptr [keys]
\r
700 vmovdqa ymmword ptr [keys2], key_ymm
\r
702 vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r]
\r
703 vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm
\r
710 avx_ctr_nextBlock2:
\r
711 mov koffs_r, ksize_r
\r
712 AVX__WOP AVX__CTR_START
\r
713 ; AVX__WOP_KEY AVX__CTR_START, 1 * koffs_r - 32
\r
715 AVX__WOP_KEY AVX__VAES_ENC, 1 * koffs_r
\r
718 AVX__WOP_KEY AVX__VAES_ENC_LAST, 0
\r
720 AVX__WOP AVX__XOR_WITH_DATA
\r
721 AVX__WOP AVX__WRITE_TO_DATA
\r
725 jnc avx_ctr_nextBlock2
\r
728 vextracti128 iv, iv_ymm, 1
\r
731 add r4, AVX_STACK_SUB
\r