X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=deps%2Flibchdr%2Fdeps%2Flzma-22.01%2Fsrc%2FAsm%2Fx86%2FAesOpt.asm;fp=deps%2Flibchdr%2Fdeps%2Flzma-22.01%2Fsrc%2FAsm%2Fx86%2FAesOpt.asm;h=1b7b5d4500db0f730e398d8294ffd93783b2e573;hb=9e052883388b2b607a488e48b3cb6db52a0997dd;hp=0000000000000000000000000000000000000000;hpb=7e49a25d527a1d79569f39a6f2d8d28a0d6cebd5;p=pcsx_rearmed.git diff --git a/deps/libchdr/deps/lzma-22.01/src/Asm/x86/AesOpt.asm b/deps/libchdr/deps/lzma-22.01/src/Asm/x86/AesOpt.asm new file mode 100644 index 00000000..1b7b5d45 --- /dev/null +++ b/deps/libchdr/deps/lzma-22.01/src/Asm/x86/AesOpt.asm @@ -0,0 +1,742 @@ +; AesOpt.asm -- AES optimized code for x86 AES hardware instructions +; 2021-12-25 : Igor Pavlov : Public domain + +include 7zAsm.asm + +ifdef __ASMC__ + use_vaes_256 equ 1 +else +ifdef ymm0 + use_vaes_256 equ 1 +endif +endif + + +ifdef use_vaes_256 + ECHO "++ VAES 256" +else + ECHO "-- NO VAES 256" +endif + +ifdef x64 + ECHO "x86-64" +else + ECHO "x86" +if (IS_CDECL gt 0) + ECHO "ABI : CDECL" +else + ECHO "ABI : no CDECL : FASTCALL" +endif +endif + +if (IS_LINUX gt 0) + ECHO "ABI : LINUX" +else + ECHO "ABI : WINDOWS" +endif + +MY_ASM_START + +ifndef x64 + .686 + .xmm +endif + + +; MY_ALIGN EQU ALIGN(64) +MY_ALIGN EQU + +SEG_ALIGN EQU MY_ALIGN + +MY_SEG_PROC macro name:req, numParams:req + ; seg_name equ @CatStr(_TEXT$, name) + ; seg_name SEGMENT SEG_ALIGN 'CODE' + MY_PROC name, numParams +endm + +MY_SEG_ENDP macro + ; seg_name ENDS +endm + + +NUM_AES_KEYS_MAX equ 15 + +; the number of push operators in function PROLOG +if (IS_LINUX eq 0) or (IS_X64 eq 0) +num_regs_push equ 2 +stack_param_offset equ (REG_SIZE * (1 + num_regs_push)) +endif + +ifdef x64 + num_param equ REG_ABI_PARAM_2 +else + if (IS_CDECL gt 0) + ; size_t size + ; void * data + ; UInt32 * aes + ; ret-ip <- (r4) + aes_OFFS equ (stack_param_offset) + data_OFFS equ (REG_SIZE + aes_OFFS) + size_OFFS equ (REG_SIZE + data_OFFS) + num_param equ [r4 + size_OFFS] + else + num_param equ [r4 + stack_param_offset] + endif +endif + +keys equ REG_PARAM_0 ; r1 +rD equ REG_PARAM_1 ; r2 +rN equ r0 + +koffs_x equ x7 +koffs_r equ r7 + +ksize_x equ x6 +ksize_r equ r6 + +keys2 equ r3 + +state equ xmm0 +key equ xmm0 +key_ymm equ ymm0 +key_ymm_n equ 0 + +ifdef x64 + ways = 11 +else + ways = 4 +endif + +ways_start_reg equ 1 + +iv equ @CatStr(xmm, %(ways_start_reg + ways)) +iv_ymm equ @CatStr(ymm, %(ways_start_reg + ways)) + + +WOP macro op, op2 + i = 0 + rept ways + op @CatStr(xmm, %(ways_start_reg + i)), op2 + i = i + 1 + endm +endm + + +ifndef ABI_LINUX +ifdef x64 + +; we use 32 bytes of home space in stack in WIN64-x64 +NUM_HOME_MM_REGS equ (32 / 16) +; we preserve xmm registers starting from xmm6 in WIN64-x64 +MM_START_SAVE_REG equ 6 + +SAVE_XMM macro num_used_mm_regs:req + num_save_mm_regs = num_used_mm_regs - MM_START_SAVE_REG + if num_save_mm_regs GT 0 + num_save_mm_regs2 = num_save_mm_regs - NUM_HOME_MM_REGS + ; RSP is (16*x + 8) after entering the function in WIN64-x64 + stack_offset = 16 * num_save_mm_regs2 + (stack_param_offset mod 16) + + i = 0 + rept num_save_mm_regs + + if i eq NUM_HOME_MM_REGS + sub r4, stack_offset + endif + + if i lt NUM_HOME_MM_REGS + movdqa [r4 + stack_param_offset + i * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i)) + else + movdqa [r4 + (i - NUM_HOME_MM_REGS) * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i)) + endif + + i = i + 1 + endm + endif +endm + +RESTORE_XMM macro num_used_mm_regs:req + if num_save_mm_regs GT 0 + i = 0 + if num_save_mm_regs2 GT 0 + rept 
num_save_mm_regs2 + movdqa @CatStr(xmm, %(MM_START_SAVE_REG + NUM_HOME_MM_REGS + i)), [r4 + i * 16] + i = i + 1 + endm + add r4, stack_offset + endif + + num_low_regs = num_save_mm_regs - i + i = 0 + rept num_low_regs + movdqa @CatStr(xmm, %(MM_START_SAVE_REG + i)), [r4 + stack_param_offset + i * 16] + i = i + 1 + endm + endif +endm + +endif ; x64 +endif ; ABI_LINUX + + +MY_PROLOG macro num_used_mm_regs:req + ; num_regs_push: must be equal to the number of push operators + ; push r3 + ; push r5 + if (IS_LINUX eq 0) or (IS_X64 eq 0) + push r6 + push r7 + endif + + mov rN, num_param ; don't move it; num_param can use stack pointer (r4) + + if (IS_X64 eq 0) + if (IS_CDECL gt 0) + mov rD, [r4 + data_OFFS] + mov keys, [r4 + aes_OFFS] + endif + elseif (IS_LINUX gt 0) + MY_ABI_LINUX_TO_WIN_2 + endif + + + ifndef ABI_LINUX + ifdef x64 + SAVE_XMM num_used_mm_regs + endif + endif + + mov ksize_x, [keys + 16] + shl ksize_x, 5 +endm + + +MY_EPILOG macro + ifndef ABI_LINUX + ifdef x64 + RESTORE_XMM num_save_mm_regs + endif + endif + + if (IS_LINUX eq 0) or (IS_X64 eq 0) + pop r7 + pop r6 + endif + ; pop r5 + ; pop r3 + MY_ENDP +endm + + +OP_KEY macro op:req, offs:req + op state, [keys + offs] +endm + + +WOP_KEY macro op:req, offs:req + movdqa key, [keys + offs] + WOP op, key +endm + + +; ---------- AES-CBC Decode ---------- + + +XOR_WITH_DATA macro reg, _ppp_ + pxor reg, [rD + i * 16] +endm + +WRITE_TO_DATA macro reg, _ppp_ + movdqa [rD + i * 16], reg +endm + + +; state0 equ @CatStr(xmm, %(ways_start_reg)) + +key0 equ @CatStr(xmm, %(ways_start_reg + ways + 1)) +key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1)) + +key_last equ @CatStr(xmm, %(ways_start_reg + ways + 2)) +key_last_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2)) +key_last_ymm_n equ (ways_start_reg + ways + 2) + +NUM_CBC_REGS equ (ways_start_reg + ways + 3) + + +MY_SEG_PROC AesCbc_Decode_HW, 3 + + AesCbc_Decode_HW_start:: + MY_PROLOG NUM_CBC_REGS + + AesCbc_Decode_HW_start_2:: + movdqa iv, [keys] + add keys, 32 + + movdqa key0, [keys + 1 * ksize_r] + movdqa key_last, [keys] + sub ksize_x, 16 + + jmp check2 + align 16 + nextBlocks2: + WOP movdqa, [rD + i * 16] + mov koffs_x, ksize_x + ; WOP_KEY pxor, ksize_r + 16 + WOP pxor, key0 + ; align 16 + @@: + WOP_KEY aesdec, 1 * koffs_r + sub koffs_r, 16 + jnz @B + ; WOP_KEY aesdeclast, 0 + WOP aesdeclast, key_last + + pxor @CatStr(xmm, %(ways_start_reg)), iv + i = 1 + rept ways - 1 + pxor @CatStr(xmm, %(ways_start_reg + i)), [rD + i * 16 - 16] + i = i + 1 + endm + movdqa iv, [rD + ways * 16 - 16] + WOP WRITE_TO_DATA + + add rD, ways * 16 + AesCbc_Decode_HW_start_3:: + check2: + sub rN, ways + jnc nextBlocks2 + add rN, ways + + sub ksize_x, 16 + + jmp check + nextBlock: + movdqa state, [rD] + mov koffs_x, ksize_x + ; OP_KEY pxor, 1 * ksize_r + 32 + pxor state, key0 + ; movdqa state0, [rD] + ; movdqa state, key0 + ; pxor state, state0 + @@: + OP_KEY aesdec, 1 * koffs_r + 16 + OP_KEY aesdec, 1 * koffs_r + sub koffs_r, 32 + jnz @B + OP_KEY aesdec, 16 + ; OP_KEY aesdeclast, 0 + aesdeclast state, key_last + + pxor state, iv + movdqa iv, [rD] + ; movdqa iv, state0 + movdqa [rD], state + + add rD, 16 + check: + sub rN, 1 + jnc nextBlock + + movdqa [keys - 32], iv +MY_EPILOG + + + + +; ---------- AVX ---------- + + +AVX__WOP_n macro op + i = 0 + rept ways + op (ways_start_reg + i) + i = i + 1 + endm +endm + +AVX__WOP macro op + i = 0 + rept ways + op @CatStr(ymm, %(ways_start_reg + i)) + i = i + 1 + endm +endm + + +AVX__WOP_KEY macro op:req, offs:req + vmovdqa key_ymm, ymmword ptr [keys2 + offs] + 
AVX__WOP_n op +endm + + +AVX__CBC_START macro reg + ; vpxor reg, key_ymm, ymmword ptr [rD + 32 * i] + vpxor reg, key0_ymm, ymmword ptr [rD + 32 * i] +endm + +AVX__CBC_END macro reg + if i eq 0 + vpxor reg, reg, iv_ymm + else + vpxor reg, reg, ymmword ptr [rD + i * 32 - 16] + endif +endm + + +AVX__WRITE_TO_DATA macro reg + vmovdqu ymmword ptr [rD + 32 * i], reg +endm + +AVX__XOR_WITH_DATA macro reg + vpxor reg, reg, ymmword ptr [rD + 32 * i] +endm + +AVX__CTR_START macro reg + vpaddq iv_ymm, iv_ymm, one_ymm + ; vpxor reg, iv_ymm, key_ymm + vpxor reg, iv_ymm, key0_ymm +endm + + +MY_VAES_INSTR_2 macro cmd, dest, a1, a2 + db 0c4H + db 2 + 040H + 020h * (1 - (a2) / 8) + 080h * (1 - (dest) / 8) + db 5 + 8 * ((not (a1)) and 15) + db cmd + db 0c0H + 8 * ((dest) and 7) + ((a2) and 7) +endm + +MY_VAES_INSTR macro cmd, dest, a + MY_VAES_INSTR_2 cmd, dest, dest, a +endm + +MY_vaesenc macro dest, a + MY_VAES_INSTR 0dcH, dest, a +endm +MY_vaesenclast macro dest, a + MY_VAES_INSTR 0ddH, dest, a +endm +MY_vaesdec macro dest, a + MY_VAES_INSTR 0deH, dest, a +endm +MY_vaesdeclast macro dest, a + MY_VAES_INSTR 0dfH, dest, a +endm + + +AVX__VAES_DEC macro reg + MY_vaesdec reg, key_ymm_n +endm + +AVX__VAES_DEC_LAST_key_last macro reg + ; MY_vaesdeclast reg, key_ymm_n + MY_vaesdeclast reg, key_last_ymm_n +endm + +AVX__VAES_ENC macro reg + MY_vaesenc reg, key_ymm_n +endm + +AVX__VAES_ENC_LAST macro reg + MY_vaesenclast reg, key_ymm_n +endm + +AVX__vinserti128_TO_HIGH macro dest, src + vinserti128 dest, dest, src, 1 +endm + + +MY_PROC AesCbc_Decode_HW_256, 3 + ifdef use_vaes_256 + MY_PROLOG NUM_CBC_REGS + + cmp rN, ways * 2 + jb AesCbc_Decode_HW_start_2 + + vmovdqa iv, xmmword ptr [keys] + add keys, 32 + + vbroadcasti128 key0_ymm, xmmword ptr [keys + 1 * ksize_r] + vbroadcasti128 key_last_ymm, xmmword ptr [keys] + sub ksize_x, 16 + mov koffs_x, ksize_x + add ksize_x, ksize_x + + AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 2) * 32) + push keys2 + sub r4, AVX_STACK_SUB + ; sub r4, 32 + ; sub r4, ksize_r + ; lea keys2, [r4 + 32] + mov keys2, r4 + and keys2, -32 + broad: + vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r] + vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm + sub koffs_r, 16 + ; jnc broad + jnz broad + + sub rN, ways * 2 + + align 16 + avx_cbcdec_nextBlock2: + mov koffs_x, ksize_x + ; AVX__WOP_KEY AVX__CBC_START, 1 * koffs_r + 32 + AVX__WOP AVX__CBC_START + @@: + AVX__WOP_KEY AVX__VAES_DEC, 1 * koffs_r + sub koffs_r, 32 + jnz @B + ; AVX__WOP_KEY AVX__VAES_DEC_LAST, 0 + AVX__WOP_n AVX__VAES_DEC_LAST_key_last + + AVX__vinserti128_TO_HIGH iv_ymm, xmmword ptr [rD] + AVX__WOP AVX__CBC_END + + vmovdqa iv, xmmword ptr [rD + ways * 32 - 16] + AVX__WOP AVX__WRITE_TO_DATA + + add rD, ways * 32 + sub rN, ways * 2 + jnc avx_cbcdec_nextBlock2 + add rN, ways * 2 + + shr ksize_x, 1 + + ; lea r4, [r4 + 1 * ksize_r + 32] + add r4, AVX_STACK_SUB + pop keys2 + + vzeroupper + jmp AesCbc_Decode_HW_start_3 + else + jmp AesCbc_Decode_HW_start + endif +MY_ENDP +MY_SEG_ENDP + + + + +; ---------- AES-CBC Encode ---------- + +e0 equ xmm1 + +CENC_START_KEY equ 2 +CENC_NUM_REG_KEYS equ (3 * 2) +; last_key equ @CatStr(xmm, %(CENC_START_KEY + CENC_NUM_REG_KEYS)) + +MY_SEG_PROC AesCbc_Encode_HW, 3 + MY_PROLOG (CENC_START_KEY + CENC_NUM_REG_KEYS + 0) + + movdqa state, [keys] + add keys, 32 + + i = 0 + rept CENC_NUM_REG_KEYS + movdqa @CatStr(xmm, %(CENC_START_KEY + i)), [keys + i * 16] + i = i + 1 + endm + + add keys, ksize_r + neg ksize_r + add ksize_r, (16 * CENC_NUM_REG_KEYS) + ; movdqa last_key, [keys] + jmp check_e + + align 
16 + nextBlock_e: + movdqa e0, [rD] + mov koffs_r, ksize_r + pxor e0, @CatStr(xmm, %(CENC_START_KEY)) + pxor state, e0 + + i = 1 + rept (CENC_NUM_REG_KEYS - 1) + aesenc state, @CatStr(xmm, %(CENC_START_KEY + i)) + i = i + 1 + endm + + @@: + OP_KEY aesenc, 1 * koffs_r + OP_KEY aesenc, 1 * koffs_r + 16 + add koffs_r, 32 + jnz @B + OP_KEY aesenclast, 0 + ; aesenclast state, last_key + + movdqa [rD], state + add rD, 16 + check_e: + sub rN, 1 + jnc nextBlock_e + + ; movdqa [keys - 32], state + movdqa [keys + 1 * ksize_r - (16 * CENC_NUM_REG_KEYS) - 32], state +MY_EPILOG +MY_SEG_ENDP + + + +; ---------- AES-CTR ---------- + +ifdef x64 + ; ways = 11 +endif + + +one equ @CatStr(xmm, %(ways_start_reg + ways + 1)) +one_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1)) +key0 equ @CatStr(xmm, %(ways_start_reg + ways + 2)) +key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2)) +NUM_CTR_REGS equ (ways_start_reg + ways + 3) + +INIT_CTR macro reg, _ppp_ + paddq iv, one + movdqa reg, iv +endm + + +MY_SEG_PROC AesCtr_Code_HW, 3 + Ctr_start:: + MY_PROLOG NUM_CTR_REGS + + Ctr_start_2:: + movdqa iv, [keys] + add keys, 32 + movdqa key0, [keys] + + add keys, ksize_r + neg ksize_r + add ksize_r, 16 + + Ctr_start_3:: + mov koffs_x, 1 + movd one, koffs_x + jmp check2_c + + align 16 + nextBlocks2_c: + WOP INIT_CTR, 0 + mov koffs_r, ksize_r + ; WOP_KEY pxor, 1 * koffs_r -16 + WOP pxor, key0 + @@: + WOP_KEY aesenc, 1 * koffs_r + add koffs_r, 16 + jnz @B + WOP_KEY aesenclast, 0 + + WOP XOR_WITH_DATA + WOP WRITE_TO_DATA + add rD, ways * 16 + check2_c: + sub rN, ways + jnc nextBlocks2_c + add rN, ways + + sub keys, 16 + add ksize_r, 16 + + jmp check_c + + ; align 16 + nextBlock_c: + paddq iv, one + ; movdqa state, [keys + 1 * koffs_r - 16] + movdqa state, key0 + mov koffs_r, ksize_r + pxor state, iv + + @@: + OP_KEY aesenc, 1 * koffs_r + OP_KEY aesenc, 1 * koffs_r + 16 + add koffs_r, 32 + jnz @B + OP_KEY aesenc, 0 + OP_KEY aesenclast, 16 + + pxor state, [rD] + movdqa [rD], state + add rD, 16 + check_c: + sub rN, 1 + jnc nextBlock_c + + ; movdqa [keys - 32], iv + movdqa [keys + 1 * ksize_r - 16 - 32], iv +MY_EPILOG + + +MY_PROC AesCtr_Code_HW_256, 3 + ifdef use_vaes_256 + MY_PROLOG NUM_CTR_REGS + + cmp rN, ways * 2 + jb Ctr_start_2 + + vbroadcasti128 iv_ymm, xmmword ptr [keys] + add keys, 32 + vbroadcasti128 key0_ymm, xmmword ptr [keys] + mov koffs_x, 1 + vmovd one, koffs_x + vpsubq iv_ymm, iv_ymm, one_ymm + vpaddq one, one, one + AVX__vinserti128_TO_HIGH one_ymm, one + + add keys, ksize_r + sub ksize_x, 16 + neg ksize_r + mov koffs_r, ksize_r + add ksize_r, ksize_r + + AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 1) * 32) + push keys2 + lea keys2, [r4 - 32] + sub r4, AVX_STACK_SUB + and keys2, -32 + vbroadcasti128 key_ymm, xmmword ptr [keys] + vmovdqa ymmword ptr [keys2], key_ymm + @@: + vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r] + vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm + add koffs_r, 16 + jnz @B + + sub rN, ways * 2 + + align 16 + avx_ctr_nextBlock2: + mov koffs_r, ksize_r + AVX__WOP AVX__CTR_START + ; AVX__WOP_KEY AVX__CTR_START, 1 * koffs_r - 32 + @@: + AVX__WOP_KEY AVX__VAES_ENC, 1 * koffs_r + add koffs_r, 32 + jnz @B + AVX__WOP_KEY AVX__VAES_ENC_LAST, 0 + + AVX__WOP AVX__XOR_WITH_DATA + AVX__WOP AVX__WRITE_TO_DATA + + add rD, ways * 32 + sub rN, ways * 2 + jnc avx_ctr_nextBlock2 + add rN, ways * 2 + + vextracti128 iv, iv_ymm, 1 + sar ksize_r, 1 + + add r4, AVX_STACK_SUB + pop keys2 + + vzeroupper + jmp Ctr_start_3 + else + jmp Ctr_start + endif +MY_ENDP +MY_SEG_ENDP + +end
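
Note (not part of the diff above): the file added here is taken verbatim from the LZMA SDK and exposes AesCbc_Encode_HW, AesCbc_Decode_HW, AesCtr_Code_HW and their VAES-256 variants for use from C. The sketch below shows how such an entry point is typically driven; it is a minimal sketch under assumptions taken from the SDK's Aes.h/Aes.c (prototype names, the round-key buffer layout, and omission of the MY_FAST_CALL calling-convention macro, which is only safe on the x86-64 ABI), none of which are introduced by this change. The buffer layout it relies on is the one the asm prolog reads: a 16-byte IV at offset 0, the round-count word at offset 16, and the expanded round keys from offset 32 onward.

/*
 * Minimal usage sketch (assumptions, see note above).
 */
#include <stddef.h>
#include <stdint.h>

typedef uint32_t UInt32;
typedef unsigned char Byte;

/* ivAes buffer: 1 block IV + 1 block (count word + padding) + up to 15 round keys. */
#define AES_NUM_IVMRK_WORDS ((1 + 1 + 15) * 4)

/* Implemented in Aes.c of the SDK (assumed prototypes). */
extern void AesCbc_Init(UInt32 *p, const Byte *iv);   /* copies iv into p[0..3] */
extern void Aes_SetKey_Dec(UInt32 *aes, const Byte *key, unsigned keySize);

/* Implemented by AesOpt.asm above. data must be 16-byte aligned, numBlocks
   counts 16-byte AES blocks, and the updated IV is written back into the
   ivAes buffer so consecutive calls can be chained. */
extern void AesCbc_Decode_HW(UInt32 *ivAes, Byte *data, size_t numBlocks);

static void cbc_decrypt_hw(const Byte key[32], const Byte iv[16],
                           Byte *data /* 16-byte aligned */, size_t numBlocks)
{
    /* A few words of slack so the IV/key block can be 16-byte aligned by hand,
       since the asm accesses it with movdqa/vmovdqa. */
    UInt32 storage[AES_NUM_IVMRK_WORDS + 4];
    UInt32 *aes = (UInt32 *)(((uintptr_t)storage + 15) & ~(uintptr_t)15);

    Aes_SetKey_Dec(aes + 4, key, 32); /* count word + key schedule start at word 4 */
    AesCbc_Init(aes, iv);             /* IV occupies words 0..3 */
    AesCbc_Decode_HW(aes, data, numBlocks);
}

On 32-bit x86 builds the real declarations in Aes.h carry the SDK's fastcall/cdecl macro (matching the IS_CDECL branch in the asm prolog), so the plain extern declarations above apply as written only to x86-64.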