--- /dev/null
+; AesOpt.asm -- AES optimized code for x86 AES hardware instructions\r
+; 2021-12-25 : Igor Pavlov : Public domain\r
+\r
+include 7zAsm.asm\r
+\r
+; Build-time feature detection: enable the 256-bit VAES code paths when the\r
+; assembler supports ymm registers (Asmc always; MASM only if ymm0 is defined).\r
+ifdef __ASMC__\r
+ use_vaes_256 equ 1\r
+else\r
+ifdef ymm0\r
+ use_vaes_256 equ 1\r
+endif\r
+endif\r
+\r
+\r
+; Assembly-time diagnostics: report the selected instruction set and ABI.\r
+ifdef use_vaes_256\r
+ ECHO "++ VAES 256"\r
+else\r
+ ECHO "-- NO VAES 256"\r
+endif\r
+\r
+ifdef x64\r
+ ECHO "x86-64"\r
+else\r
+ ECHO "x86"\r
+if (IS_CDECL gt 0)\r
+ ECHO "ABI : CDECL"\r
+else\r
+ ECHO "ABI : no CDECL : FASTCALL"\r
+endif\r
+endif\r
+\r
+if (IS_LINUX gt 0)\r
+ ECHO "ABI : LINUX"\r
+else\r
+ ECHO "ABI : WINDOWS"\r
+endif\r
+\r
+MY_ASM_START\r
+\r
+; 32-bit builds need explicit CPU/SSE directives to accept AES-NI operands.\r
+ifndef x64\r
+ .686\r
+ .xmm\r
+endif\r
+\r
+\r
+; Code alignment is intentionally disabled (MY_ALIGN expands to nothing).\r
+; MY_ALIGN EQU ALIGN(64)\r
+MY_ALIGN EQU\r
+\r
+SEG_ALIGN EQU MY_ALIGN\r
+\r
+; MY_SEG_PROC / MY_SEG_ENDP - wrappers for procedure definition.\r
+; Placing each proc in its own named segment is currently disabled\r
+; (commented out), so these reduce to plain MY_PROC / nothing.\r
+MY_SEG_PROC macro name:req, numParams:req\r
+ ; seg_name equ @CatStr(_TEXT$, name)\r
+ ; seg_name SEGMENT SEG_ALIGN 'CODE'\r
+ MY_PROC name, numParams\r
+endm\r
+\r
+MY_SEG_ENDP macro\r
+ ; seg_name ENDS\r
+endm\r
+\r
+\r
+; Max round keys for AES-256 key schedule (14 rounds + 1).\r
+NUM_AES_KEYS_MAX equ 15\r
+\r
+; the number of push operators in function PROLOG\r
+; stack_param_offset = return address + the pushed registers, i.e. the\r
+; distance from r4 (stack pointer) to the first stack parameter.\r
+if (IS_LINUX eq 0) or (IS_X64 eq 0)\r
+num_regs_push equ 2\r
+stack_param_offset equ (REG_SIZE * (1 + num_regs_push))\r
+endif\r
+\r
+; num_param - the third argument (block count) for each ABI:\r
+; x64: register REG_ABI_PARAM_2; x86: a stack slot relative to r4.\r
+ifdef x64\r
+ num_param equ REG_ABI_PARAM_2\r
+else\r
+ if (IS_CDECL gt 0)\r
+ ; size_t size\r
+ ; void * data\r
+ ; UInt32 * aes\r
+ ; ret-ip <- (r4)\r
+ aes_OFFS equ (stack_param_offset)\r
+ data_OFFS equ (REG_SIZE + aes_OFFS)\r
+ size_OFFS equ (REG_SIZE + data_OFFS)\r
+ num_param equ [r4 + size_OFFS]\r
+ else\r
+ num_param equ [r4 + stack_param_offset]\r
+ endif\r
+endif\r
+\r
+; Register roles used by all procs below:\r
+keys equ REG_PARAM_0 ; r1\r
+rD equ REG_PARAM_1 ; r2\r
+rN equ r0\r
+\r
+; koffs - current offset into the key schedule inside the round loop\r
+koffs_x equ x7\r
+koffs_r equ r7\r
+\r
+; ksize - size (bytes) derived from the key-schedule round count\r
+ksize_x equ x6\r
+ksize_r equ r6\r
+\r
+; keys2 - pointer to the 32-aligned broadcast key table in the VAES paths\r
+keys2 equ r3\r
+\r
+state equ xmm0\r
+key equ xmm0\r
+key_ymm equ ymm0\r
+key_ymm_n equ 0\r
+\r
+; ways - number of blocks processed in parallel per iteration;\r
+; x64 has 16 xmm registers available, x86 only 8.\r
+ifdef x64\r
+ ways = 11\r
+else\r
+ ways = 4\r
+endif\r
+\r
+; Parallel states live in xmm(ways_start_reg) .. xmm(ways_start_reg+ways-1).\r
+ways_start_reg equ 1\r
+\r
+iv equ @CatStr(xmm, %(ways_start_reg + ways))\r
+iv_ymm equ @CatStr(ymm, %(ways_start_reg + ways))\r
+\r
+\r
+; WOP - expand "op xmmK, op2" for each of the `ways` state registers.\r
+; NOTE: sets the assembly-time variable i (0..ways-1), which op/op2 may\r
+; reference (see XOR_WITH_DATA / WRITE_TO_DATA below).\r
+WOP macro op, op2\r
+ i = 0\r
+ rept ways\r
+ op @CatStr(xmm, %(ways_start_reg + i)), op2\r
+ i = i + 1\r
+ endm\r
+endm\r
+\r
+\r
+; WIN64-only: xmm6+ are callee-saved, so any proc that uses more than 6\r
+; xmm registers must preserve the rest. The first NUM_HOME_MM_REGS spills\r
+; go into the caller-provided 32-byte home space; the remainder go into\r
+; freshly allocated, 16-aligned stack space of size stack_offset.\r
+ifndef ABI_LINUX\r
+ifdef x64\r
+\r
+; we use 32 bytes of home space in stack in WIN64-x64\r
+NUM_HOME_MM_REGS equ (32 / 16)\r
+; we preserve xmm registers starting from xmm6 in WIN64-x64\r
+MM_START_SAVE_REG equ 6\r
+\r
+; SAVE_XMM - save xmm6..xmm(num_used_mm_regs-1).\r
+; Defines the assembly-time globals num_save_mm_regs, num_save_mm_regs2\r
+; and stack_offset, which RESTORE_XMM relies on.\r
+SAVE_XMM macro num_used_mm_regs:req\r
+ num_save_mm_regs = num_used_mm_regs - MM_START_SAVE_REG\r
+ if num_save_mm_regs GT 0\r
+ num_save_mm_regs2 = num_save_mm_regs - NUM_HOME_MM_REGS\r
+ ; RSP is (16*x + 8) after entering the function in WIN64-x64\r
+ ; (stack_param_offset mod 16) pads the allocation to keep r4 16-aligned\r
+ stack_offset = 16 * num_save_mm_regs2 + (stack_param_offset mod 16)\r
+ \r
+ i = 0\r
+ rept num_save_mm_regs\r
+ \r
+ if i eq NUM_HOME_MM_REGS\r
+ sub r4, stack_offset\r
+ endif\r
+ \r
+ if i lt NUM_HOME_MM_REGS\r
+ movdqa [r4 + stack_param_offset + i * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))\r
+ else\r
+ movdqa [r4 + (i - NUM_HOME_MM_REGS) * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))\r
+ endif\r
+ \r
+ i = i + 1\r
+ endm\r
+ endif\r
+endm\r
+\r
+; RESTORE_XMM - exact mirror of SAVE_XMM. The parameter is unused; the\r
+; macro reads the globals left behind by the matching SAVE_XMM expansion.\r
+RESTORE_XMM macro num_used_mm_regs:req\r
+ if num_save_mm_regs GT 0\r
+ i = 0\r
+ if num_save_mm_regs2 GT 0\r
+ rept num_save_mm_regs2\r
+ movdqa @CatStr(xmm, %(MM_START_SAVE_REG + NUM_HOME_MM_REGS + i)), [r4 + i * 16]\r
+ i = i + 1\r
+ endm\r
+ add r4, stack_offset\r
+ endif\r
+\r
+ num_low_regs = num_save_mm_regs - i\r
+ i = 0\r
+ rept num_low_regs\r
+ movdqa @CatStr(xmm, %(MM_START_SAVE_REG + i)), [r4 + stack_param_offset + i * 16]\r
+ i = i + 1\r
+ endm\r
+ endif\r
+endm\r
+\r
+endif ; x64\r
+endif ; ABI_LINUX\r
+\r
+\r
+; MY_PROLOG - common entry code for all AES procs in this file:\r
+; * push the non-volatile registers used as ksize/koffs (r6/r7); Linux-x64\r
+; is exempt because ksize/koffs map to volatile registers there,\r
+; * load rN (block count) from num_param,\r
+; * normalize the argument registers / load stack args per ABI,\r
+; * preserve xmm6+ on WIN64,\r
+; * compute ksize_x = 32 * aes[4] (key-schedule size factor; the UInt32 at\r
+; keys+16 is presumably the round count - see the callers' layout).\r
+MY_PROLOG macro num_used_mm_regs:req\r
+ ; num_regs_push: must be equal to the number of push operators\r
+ ; push r3\r
+ ; push r5\r
+ if (IS_LINUX eq 0) or (IS_X64 eq 0)\r
+ push r6\r
+ push r7\r
+ endif\r
+\r
+ mov rN, num_param ; don't move it; num_param can use stack pointer (r4)\r
+\r
+ if (IS_X64 eq 0)\r
+ if (IS_CDECL gt 0)\r
+ mov rD, [r4 + data_OFFS]\r
+ mov keys, [r4 + aes_OFFS]\r
+ endif\r
+ elseif (IS_LINUX gt 0)\r
+ MY_ABI_LINUX_TO_WIN_2\r
+ endif\r
+\r
+\r
+ ifndef ABI_LINUX\r
+ ifdef x64\r
+ SAVE_XMM num_used_mm_regs\r
+ endif\r
+ endif\r
+ \r
+ mov ksize_x, [keys + 16]\r
+ shl ksize_x, 5\r
+endm\r
+\r
+\r
+; MY_EPILOG - mirror of MY_PROLOG: restore xmm6+ on WIN64, pop r7/r6 in\r
+; reverse push order, and close the proc via MY_ENDP.\r
+MY_EPILOG macro\r
+ ifndef ABI_LINUX\r
+ ifdef x64\r
+ RESTORE_XMM num_save_mm_regs\r
+ endif\r
+ endif\r
+ \r
+ if (IS_LINUX eq 0) or (IS_X64 eq 0)\r
+ pop r7\r
+ pop r6\r
+ endif\r
+ ; pop r5\r
+ ; pop r3\r
+ MY_ENDP\r
+endm\r
+\r
+\r
+; OP_KEY - apply an AES op to the single `state` register with the round\r
+; key stored at [keys + offs] (memory operand form).\r
+OP_KEY macro op:req, offs:req\r
+ op state, [keys + offs]\r
+endm\r
+\r
+ \r
+; WOP_KEY - load one round key into `key` (xmm0) and apply the op to all\r
+; `ways` state registers.\r
+WOP_KEY macro op:req, offs:req\r
+ movdqa key, [keys + offs]\r
+ WOP op, key\r
+endm\r
+\r
+\r
+; ---------- AES-CBC Decode ----------\r
+\r
+\r
+; Per-way memory ops; `i` is the way index set by WOP, `_ppp_` is the\r
+; unused second WOP argument.\r
+XOR_WITH_DATA macro reg, _ppp_\r
+ pxor reg, [rD + i * 16]\r
+endm\r
+\r
+WRITE_TO_DATA macro reg, _ppp_\r
+ movdqa [rD + i * 16], reg\r
+endm\r
+\r
+\r
+; CBC-decode register assignments: first/last round keys are kept in\r
+; registers above the `ways` state registers and the iv register.\r
+; state0 equ @CatStr(xmm, %(ways_start_reg))\r
+\r
+key0 equ @CatStr(xmm, %(ways_start_reg + ways + 1))\r
+key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1))\r
+\r
+key_last equ @CatStr(xmm, %(ways_start_reg + ways + 2))\r
+key_last_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2))\r
+key_last_ymm_n equ (ways_start_reg + ways + 2)\r
+\r
+; Total xmm registers used by the CBC-decode procs (for SAVE_XMM).\r
+NUM_CBC_REGS equ (ways_start_reg + ways + 3)\r
+\r
+\r
+; AesCbc_Decode_HW(UInt32 *aes, Byte *data, size_t numBlocks)\r
+; In-place AES-CBC decryption with AES-NI.\r
+; aes[0..3] holds the IV; the inverse key schedule starts at aes+32 bytes.\r
+; Main loop decrypts `ways` blocks in parallel; the tail loop handles the\r
+; remaining blocks one at a time. The updated IV is stored back to aes.\r
+MY_SEG_PROC AesCbc_Decode_HW, 3\r
+\r
+ AesCbc_Decode_HW_start::\r
+ MY_PROLOG NUM_CBC_REGS\r
+ \r
+ AesCbc_Decode_HW_start_2::\r
+ movdqa iv, [keys]\r
+ add keys, 32\r
+\r
+ ; cache first and last round keys in registers\r
+ movdqa key0, [keys + 1 * ksize_r]\r
+ movdqa key_last, [keys]\r
+ sub ksize_x, 16\r
+\r
+ jmp check2\r
+ align 16\r
+ nextBlocks2:\r
+ ; load `ways` ciphertext blocks, whiten with key0, then run the\r
+ ; aesdec rounds walking koffs down the key schedule\r
+ WOP movdqa, [rD + i * 16]\r
+ mov koffs_x, ksize_x\r
+ ; WOP_KEY pxor, ksize_r + 16\r
+ WOP pxor, key0\r
+ ; align 16\r
+ @@:\r
+ WOP_KEY aesdec, 1 * koffs_r\r
+ sub koffs_r, 16\r
+ jnz @B\r
+ ; WOP_KEY aesdeclast, 0\r
+ WOP aesdeclast, key_last\r
+ \r
+ ; CBC chaining: block 0 xors with iv, block k xors with the\r
+ ; previous ciphertext still present in memory at rD + (k-1)*16\r
+ pxor @CatStr(xmm, %(ways_start_reg)), iv\r
+ i = 1\r
+ rept ways - 1\r
+ pxor @CatStr(xmm, %(ways_start_reg + i)), [rD + i * 16 - 16]\r
+ i = i + 1\r
+ endm\r
+ movdqa iv, [rD + ways * 16 - 16]\r
+ WOP WRITE_TO_DATA\r
+\r
+ add rD, ways * 16\r
+ AesCbc_Decode_HW_start_3::\r
+ check2:\r
+ sub rN, ways\r
+ jnc nextBlocks2\r
+ add rN, ways\r
+\r
+ sub ksize_x, 16\r
+\r
+ ; single-block tail loop (rounds unrolled by 2 per iteration)\r
+ jmp check\r
+ nextBlock:\r
+ movdqa state, [rD]\r
+ mov koffs_x, ksize_x\r
+ ; OP_KEY pxor, 1 * ksize_r + 32\r
+ pxor state, key0\r
+ ; movdqa state0, [rD]\r
+ ; movdqa state, key0\r
+ ; pxor state, state0\r
+ @@:\r
+ OP_KEY aesdec, 1 * koffs_r + 16\r
+ OP_KEY aesdec, 1 * koffs_r\r
+ sub koffs_r, 32\r
+ jnz @B\r
+ OP_KEY aesdec, 16\r
+ ; OP_KEY aesdeclast, 0\r
+ aesdeclast state, key_last\r
+ \r
+ pxor state, iv\r
+ movdqa iv, [rD]\r
+ ; movdqa iv, state0\r
+ movdqa [rD], state\r
+ \r
+ add rD, 16\r
+ check:\r
+ sub rN, 1\r
+ jnc nextBlock\r
+\r
+ ; store the updated IV back into the aes context (keys was advanced by 32)\r
+ movdqa [keys - 32], iv\r
+MY_EPILOG\r
+\r
+\r
+\r
+\r
+; ---------- AVX ----------\r
+\r
+\r
+; AVX__WOP_n - expand "op regnum" for each way (numeric register index,\r
+; used with the hand-encoded VAES emitters below).\r
+AVX__WOP_n macro op\r
+ i = 0\r
+ rept ways\r
+ op (ways_start_reg + i)\r
+ i = i + 1\r
+ endm\r
+endm\r
+\r
+; AVX__WOP - expand "op ymmK" for each way (symbolic ymm register).\r
+AVX__WOP macro op\r
+ i = 0\r
+ rept ways\r
+ op @CatStr(ymm, %(ways_start_reg + i))\r
+ i = i + 1\r
+ endm\r
+endm\r
+\r
+\r
+; AVX__WOP_KEY - load a broadcast round key from the keys2 table into\r
+; key_ymm, then apply op to all ways.\r
+AVX__WOP_KEY macro op:req, offs:req\r
+ vmovdqa key_ymm, ymmword ptr [keys2 + offs]\r
+ AVX__WOP_n op\r
+endm\r
+\r
+\r
+; Load 2 ciphertext blocks per way and whiten with the broadcast key0.\r
+AVX__CBC_START macro reg\r
+ ; vpxor reg, key_ymm, ymmword ptr [rD + 32 * i]\r
+ vpxor reg, key0_ymm, ymmword ptr [rD + 32 * i]\r
+endm\r
+\r
+; CBC chaining for ymm pairs: way 0 xors with iv_ymm (iv + block 0);\r
+; later ways xor with the two preceding ciphertext blocks in memory\r
+; (the -16 shifts the window back by one 16-byte block).\r
+AVX__CBC_END macro reg\r
+ if i eq 0\r
+ vpxor reg, reg, iv_ymm\r
+ else\r
+ vpxor reg, reg, ymmword ptr [rD + i * 32 - 16]\r
+ endif\r
+endm\r
+\r
+\r
+AVX__WRITE_TO_DATA macro reg\r
+ vmovdqu ymmword ptr [rD + 32 * i], reg\r
+endm\r
+\r
+AVX__XOR_WITH_DATA macro reg\r
+ vpxor reg, reg, ymmword ptr [rD + 32 * i]\r
+endm\r
+\r
+; CTR: advance both 64-bit counters in iv_ymm, then whiten with key0.\r
+AVX__CTR_START macro reg\r
+ vpaddq iv_ymm, iv_ymm, one_ymm\r
+ ; vpxor reg, iv_ymm, key_ymm\r
+ vpxor reg, iv_ymm, key0_ymm\r
+endm\r
+\r
+\r
+; Hand-assembled VAES instructions (VEX.256.66.0F38 <op> ymm,ymm,ymm),\r
+; emitted as raw bytes - presumably because the assembler in use may not\r
+; accept the VAES ymm mnemonics. Byte layout:\r
+; 0C4h - 3-byte VEX prefix\r
+; byte2 - inverted R/B register-extension bits + map select 2 (0F38)\r
+; byte3 - W=0, vvvv = ~a1 (second source), L=1 (256-bit), pp=01 (66h)\r
+; cmd - opcode (DC/DD/DE/DF = aesenc/enclast/aesdec/declast)\r
+; byte5 - ModRM: mod=11, reg=dest, rm=a2\r
+MY_VAES_INSTR_2 macro cmd, dest, a1, a2\r
+ db 0c4H\r
+ db 2 + 040H + 020h * (1 - (a2) / 8) + 080h * (1 - (dest) / 8)\r
+ db 5 + 8 * ((not (a1)) and 15)\r
+ db cmd\r
+ db 0c0H + 8 * ((dest) and 7) + ((a2) and 7)\r
+endm\r
+\r
+; Two-operand form: dest = op(dest, a).\r
+MY_VAES_INSTR macro cmd, dest, a\r
+ MY_VAES_INSTR_2 cmd, dest, dest, a\r
+endm\r
+\r
+MY_vaesenc macro dest, a\r
+ MY_VAES_INSTR 0dcH, dest, a\r
+endm\r
+MY_vaesenclast macro dest, a\r
+ MY_VAES_INSTR 0ddH, dest, a\r
+endm\r
+MY_vaesdec macro dest, a\r
+ MY_VAES_INSTR 0deH, dest, a\r
+endm\r
+MY_vaesdeclast macro dest, a\r
+ MY_VAES_INSTR 0dfH, dest, a\r
+endm\r
+\r
+\r
+; Per-way VAES round helpers (take a numeric ymm index from AVX__WOP_n).\r
+; key_ymm_n (= ymm0) holds the current broadcast round key.\r
+AVX__VAES_DEC macro reg\r
+ MY_vaesdec reg, key_ymm_n\r
+endm\r
+\r
+; Final decrypt round uses the cached last round key (key_last_ymm).\r
+AVX__VAES_DEC_LAST_key_last macro reg\r
+ ; MY_vaesdeclast reg, key_ymm_n\r
+ MY_vaesdeclast reg, key_last_ymm_n\r
+endm\r
+\r
+AVX__VAES_ENC macro reg\r
+ MY_vaesenc reg, key_ymm_n\r
+endm\r
+\r
+AVX__VAES_ENC_LAST macro reg\r
+ MY_vaesenclast reg, key_ymm_n\r
+endm\r
+\r
+; Place a 128-bit value into the upper lane of a ymm register.\r
+AVX__vinserti128_TO_HIGH macro dest, src\r
+ vinserti128 dest, dest, src, 1\r
+endm\r
+\r
+\r
+; AesCbc_Decode_HW_256 - VAES-256 variant of CBC decode: each ymm holds\r
+; two blocks, so ways*2 blocks are decrypted per iteration. Builds a\r
+; table of broadcast (duplicated-per-lane) round keys on a 32-aligned\r
+; stack area addressed by keys2, then delegates the tail (< ways*2\r
+; blocks) to the SSE path. Without use_vaes_256 it is a plain alias\r
+; for AesCbc_Decode_HW.\r
+MY_PROC AesCbc_Decode_HW_256, 3\r
+ ifdef use_vaes_256\r
+ MY_PROLOG NUM_CBC_REGS\r
+ \r
+ cmp rN, ways * 2\r
+ jb AesCbc_Decode_HW_start_2\r
+\r
+ vmovdqa iv, xmmword ptr [keys]\r
+ add keys, 32\r
+\r
+ ; broadcast first/last round keys into both 128-bit lanes\r
+ vbroadcasti128 key0_ymm, xmmword ptr [keys + 1 * ksize_r]\r
+ vbroadcasti128 key_last_ymm, xmmword ptr [keys]\r
+ sub ksize_x, 16\r
+ mov koffs_x, ksize_x\r
+ add ksize_x, ksize_x\r
+ \r
+ ; build the broadcast key table: 32 bytes per key at keys2 + 2*offset\r
+ AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 2) * 32)\r
+ push keys2\r
+ sub r4, AVX_STACK_SUB\r
+ ; sub r4, 32\r
+ ; sub r4, ksize_r\r
+ ; lea keys2, [r4 + 32]\r
+ mov keys2, r4\r
+ and keys2, -32\r
+ broad:\r
+ vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r]\r
+ vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm\r
+ sub koffs_r, 16\r
+ ; jnc broad\r
+ jnz broad\r
+\r
+ sub rN, ways * 2\r
+\r
+ align 16\r
+ avx_cbcdec_nextBlock2:\r
+ mov koffs_x, ksize_x\r
+ ; AVX__WOP_KEY AVX__CBC_START, 1 * koffs_r + 32\r
+ AVX__WOP AVX__CBC_START\r
+ @@:\r
+ AVX__WOP_KEY AVX__VAES_DEC, 1 * koffs_r\r
+ sub koffs_r, 32\r
+ jnz @B\r
+ ; AVX__WOP_KEY AVX__VAES_DEC_LAST, 0\r
+ AVX__WOP_n AVX__VAES_DEC_LAST_key_last\r
+\r
+ ; iv_ymm = (iv, first ciphertext block) for the lane-wise CBC xor\r
+ AVX__vinserti128_TO_HIGH iv_ymm, xmmword ptr [rD]\r
+ AVX__WOP AVX__CBC_END\r
+\r
+ vmovdqa iv, xmmword ptr [rD + ways * 32 - 16]\r
+ AVX__WOP AVX__WRITE_TO_DATA\r
+ \r
+ add rD, ways * 32\r
+ sub rN, ways * 2\r
+ jnc avx_cbcdec_nextBlock2\r
+ add rN, ways * 2\r
+\r
+ ; undo the doubling of ksize, release the key table, and finish the\r
+ ; remaining blocks in the SSE tail loop\r
+ shr ksize_x, 1\r
+ \r
+ ; lea r4, [r4 + 1 * ksize_r + 32]\r
+ add r4, AVX_STACK_SUB\r
+ pop keys2\r
+\r
+ vzeroupper\r
+ jmp AesCbc_Decode_HW_start_3\r
+ else\r
+ jmp AesCbc_Decode_HW_start\r
+ endif\r
+MY_ENDP\r
+MY_SEG_ENDP\r
+\r
+\r
+\r
+ \r
+; ---------- AES-CBC Encode ----------\r
+\r
+; CBC encryption is inherently sequential (each block depends on the\r
+; previous ciphertext), so this proc processes one block at a time and\r
+; instead caches the first CENC_NUM_REG_KEYS round keys in registers.\r
+\r
+e0 equ xmm1\r
+\r
+CENC_START_KEY equ 2\r
+CENC_NUM_REG_KEYS equ (3 * 2)\r
+; last_key equ @CatStr(xmm, %(CENC_START_KEY + CENC_NUM_REG_KEYS))\r
+\r
+MY_SEG_PROC AesCbc_Encode_HW, 3\r
+ MY_PROLOG (CENC_START_KEY + CENC_NUM_REG_KEYS + 0)\r
+\r
+ ; state = IV from aes[0..3]; round keys start at keys+32\r
+ movdqa state, [keys]\r
+ add keys, 32\r
+ \r
+ ; preload round keys 0..CENC_NUM_REG_KEYS-1 into xmm registers\r
+ i = 0\r
+ rept CENC_NUM_REG_KEYS\r
+ movdqa @CatStr(xmm, %(CENC_START_KEY + i)), [keys + i * 16]\r
+ i = i + 1\r
+ endm\r
+ \r
+ ; point keys past the schedule; ksize_r becomes the (negative)\r
+ ; offset of the first key still read from memory\r
+ add keys, ksize_r\r
+ neg ksize_r\r
+ add ksize_r, (16 * CENC_NUM_REG_KEYS)\r
+ ; movdqa last_key, [keys]\r
+ jmp check_e\r
+\r
+ align 16\r
+ nextBlock_e:\r
+ ; CBC: state = aesenc-chain(plaintext xor prev-ciphertext/IV)\r
+ movdqa e0, [rD]\r
+ mov koffs_r, ksize_r\r
+ pxor e0, @CatStr(xmm, %(CENC_START_KEY))\r
+ pxor state, e0\r
+ \r
+ ; rounds with register-cached keys\r
+ i = 1\r
+ rept (CENC_NUM_REG_KEYS - 1)\r
+ aesenc state, @CatStr(xmm, %(CENC_START_KEY + i))\r
+ i = i + 1\r
+ endm\r
+\r
+ ; remaining rounds with in-memory keys (koffs counts up to 0)\r
+ @@:\r
+ OP_KEY aesenc, 1 * koffs_r\r
+ OP_KEY aesenc, 1 * koffs_r + 16\r
+ add koffs_r, 32\r
+ jnz @B\r
+ OP_KEY aesenclast, 0\r
+ ; aesenclast state, last_key\r
+ \r
+ movdqa [rD], state\r
+ add rD, 16\r
+ check_e:\r
+ sub rN, 1\r
+ jnc nextBlock_e\r
+\r
+ ; store the final ciphertext back as the new IV in the aes context\r
+ ; movdqa [keys - 32], state\r
+ movdqa [keys + 1 * ksize_r - (16 * CENC_NUM_REG_KEYS) - 32], state\r
+MY_EPILOG\r
+MY_SEG_ENDP\r
+\r
+\r
+ \r
+; ---------- AES-CTR ----------\r
+\r
+ifdef x64\r
+ ; ways = 11\r
+endif\r
+\r
+ \r
+; `one` holds the counter increment; key0 caches the first round key.\r
+one equ @CatStr(xmm, %(ways_start_reg + ways + 1))\r
+one_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1))\r
+key0 equ @CatStr(xmm, %(ways_start_reg + ways + 2))\r
+key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2))\r
+NUM_CTR_REGS equ (ways_start_reg + ways + 3)\r
+\r
+; INIT_CTR - advance the 64-bit counter in iv and copy it into a way\r
+; register (counter arithmetic is a 64-bit paddq on the low qword).\r
+INIT_CTR macro reg, _ppp_\r
+ paddq iv, one\r
+ movdqa reg, iv\r
+endm\r
+\r
+\r
+; AesCtr_Code_HW(UInt32 *aes, Byte *data, size_t numBlocks)\r
+; In-place AES-CTR en/decryption (same operation both directions).\r
+; aes[0..3] is the counter; the key schedule starts at aes+32 bytes.\r
+; Main loop handles `ways` counters in parallel, tail loop one at a time;\r
+; the advanced counter is stored back into the context at the end.\r
+MY_SEG_PROC AesCtr_Code_HW, 3\r
+ Ctr_start::\r
+ MY_PROLOG NUM_CTR_REGS\r
+\r
+ Ctr_start_2::\r
+ movdqa iv, [keys]\r
+ add keys, 32\r
+ movdqa key0, [keys]\r
+\r
+ ; point keys past the schedule; ksize_r becomes the negative offset\r
+ ; of the second round key, so koffs counts up toward 0\r
+ add keys, ksize_r\r
+ neg ksize_r\r
+ add ksize_r, 16\r
+ \r
+ Ctr_start_3::\r
+ ; one = 64-bit constant 1 (counter increment)\r
+ mov koffs_x, 1\r
+ movd one, koffs_x\r
+ jmp check2_c\r
+\r
+ align 16\r
+ nextBlocks2_c:\r
+ ; materialize `ways` consecutive counters, whiten with key0,\r
+ ; then run the aesenc rounds\r
+ WOP INIT_CTR, 0\r
+ mov koffs_r, ksize_r\r
+ ; WOP_KEY pxor, 1 * koffs_r -16\r
+ WOP pxor, key0\r
+ @@:\r
+ WOP_KEY aesenc, 1 * koffs_r\r
+ add koffs_r, 16\r
+ jnz @B\r
+ WOP_KEY aesenclast, 0\r
+ \r
+ ; xor keystream into the data in place\r
+ WOP XOR_WITH_DATA\r
+ WOP WRITE_TO_DATA\r
+ add rD, ways * 16\r
+ check2_c:\r
+ sub rN, ways\r
+ jnc nextBlocks2_c\r
+ add rN, ways\r
+\r
+ ; rebase keys/ksize for the single-block tail loop below\r
+ sub keys, 16\r
+ add ksize_r, 16\r
+ \r
+ jmp check_c\r
+\r
+ ; align 16\r
+ nextBlock_c:\r
+ paddq iv, one\r
+ ; movdqa state, [keys + 1 * koffs_r - 16]\r
+ movdqa state, key0\r
+ mov koffs_r, ksize_r\r
+ pxor state, iv\r
+ \r
+ @@:\r
+ OP_KEY aesenc, 1 * koffs_r\r
+ OP_KEY aesenc, 1 * koffs_r + 16\r
+ add koffs_r, 32\r
+ jnz @B\r
+ OP_KEY aesenc, 0\r
+ OP_KEY aesenclast, 16\r
+ \r
+ pxor state, [rD]\r
+ movdqa [rD], state\r
+ add rD, 16\r
+ check_c:\r
+ sub rN, 1\r
+ jnc nextBlock_c\r
+\r
+ ; store the advanced counter back into the aes context\r
+ ; movdqa [keys - 32], iv\r
+ movdqa [keys + 1 * ksize_r - 16 - 32], iv\r
+MY_EPILOG\r
+\r
+\r
+; AesCtr_Code_HW_256 - VAES-256 variant of CTR: each ymm lane holds its\r
+; own counter (increment `one_ymm` = 2 per lane), so ways*2 blocks are\r
+; processed per iteration. Builds a broadcast key table on the stack\r
+; (as in AesCbc_Decode_HW_256) and delegates the tail to the SSE path.\r
+; Without use_vaes_256 it is a plain alias for AesCtr_Code_HW.\r
+MY_PROC AesCtr_Code_HW_256, 3\r
+ ifdef use_vaes_256\r
+ MY_PROLOG NUM_CTR_REGS\r
+\r
+ cmp rN, ways * 2\r
+ jb Ctr_start_2\r
+\r
+ vbroadcasti128 iv_ymm, xmmword ptr [keys]\r
+ add keys, 32\r
+ vbroadcasti128 key0_ymm, xmmword ptr [keys]\r
+ ; lanes = (ctr-1, ctr): low lane pre-decremented, step becomes 2\r
+ mov koffs_x, 1\r
+ vmovd one, koffs_x\r
+ vpsubq iv_ymm, iv_ymm, one_ymm\r
+ vpaddq one, one, one\r
+ AVX__vinserti128_TO_HIGH one_ymm, one\r
+ \r
+ add keys, ksize_r\r
+ sub ksize_x, 16\r
+ neg ksize_r\r
+ mov koffs_r, ksize_r\r
+ add ksize_r, ksize_r\r
+\r
+ ; build the 32-aligned broadcast key table addressed by keys2\r
+ AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 1) * 32)\r
+ push keys2\r
+ lea keys2, [r4 - 32]\r
+ sub r4, AVX_STACK_SUB\r
+ and keys2, -32\r
+ vbroadcasti128 key_ymm, xmmword ptr [keys]\r
+ vmovdqa ymmword ptr [keys2], key_ymm\r
+ @@:\r
+ vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r]\r
+ vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm\r
+ add koffs_r, 16\r
+ jnz @B\r
+\r
+ sub rN, ways * 2\r
+ \r
+ align 16\r
+ avx_ctr_nextBlock2:\r
+ mov koffs_r, ksize_r\r
+ AVX__WOP AVX__CTR_START\r
+ ; AVX__WOP_KEY AVX__CTR_START, 1 * koffs_r - 32\r
+ @@:\r
+ AVX__WOP_KEY AVX__VAES_ENC, 1 * koffs_r\r
+ add koffs_r, 32\r
+ jnz @B\r
+ AVX__WOP_KEY AVX__VAES_ENC_LAST, 0\r
+ \r
+ AVX__WOP AVX__XOR_WITH_DATA\r
+ AVX__WOP AVX__WRITE_TO_DATA\r
+ \r
+ add rD, ways * 32\r
+ sub rN, ways * 2\r
+ jnc avx_ctr_nextBlock2\r
+ add rN, ways * 2\r
+ \r
+ ; recover the scalar counter (upper lane holds the latest value),\r
+ ; restore ksize, release the key table, and run the SSE tail\r
+ vextracti128 iv, iv_ymm, 1\r
+ sar ksize_r, 1\r
+ \r
+ add r4, AVX_STACK_SUB\r
+ pop keys2\r
+ \r
+ vzeroupper\r
+ jmp Ctr_start_3\r
+ else\r
+ jmp Ctr_start\r
+ endif\r
+MY_ENDP\r
+MY_SEG_ENDP\r
+\r
+end\r