update libchdr
[pcsx_rearmed.git] / deps / libchdr / deps / lzma-22.01 / src / Asm / x86 / Sha256Opt.asm
CommitLineData
9e052883 1; Sha256Opt.asm -- SHA-256 optimized code for SHA-256 x86 hardware instructions\r
2; 2022-04-17 : Igor Pavlov : Public domain\r
3\r
4include 7zAsm.asm\r
5\r
6MY_ASM_START\r
7\r
8; .data\r
9; public K\r
10\r
11; We could use the external SHA256_K_ARRAY defined in Sha256.c,\r
12; but then we would have to guarantee that SHA256_K_ARRAY is 16-byte aligned.\r
13\r
14COMMENT @\r
15ifdef x64\r
16K_CONST equ SHA256_K_ARRAY\r
17else\r
18K_CONST equ _SHA256_K_ARRAY\r
19endif\r
20EXTRN K_CONST:xmmword\r
21@\r
22\r
; Constant data segment: the byte-shuffle mask used by LOAD_MASK / LOAD_W and
; the 64-dword K_CONST table that RND4 adds to the message words.
23CONST SEGMENT\r
24\r
; 16-byte alignment is needed because LOAD_MASK reads the mask with movdqa.
25align 16\r
; pshufb control bytes 3,2,1,0 per 4-byte group: reverses the byte order of
; each dword of the register it is applied to (see LOAD_W).
26Reverse_Endian_Mask db 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12\r
27\r
28; COMMENT @\r
; 16-byte alignment is needed because RND4 reads the table with movdqa.
; The table holds 64 dwords, four per source line; RND4 k loads the four
; dwords at byte offset k * 16. The values match the SHA-256 round constants
; K[0..63] listed in FIPS 180-4.
29align 16\r
30K_CONST \\r
31DD 0428a2f98H, 071374491H, 0b5c0fbcfH, 0e9b5dba5H\r
32DD 03956c25bH, 059f111f1H, 0923f82a4H, 0ab1c5ed5H\r
33DD 0d807aa98H, 012835b01H, 0243185beH, 0550c7dc3H\r
34DD 072be5d74H, 080deb1feH, 09bdc06a7H, 0c19bf174H\r
35DD 0e49b69c1H, 0efbe4786H, 00fc19dc6H, 0240ca1ccH\r
36DD 02de92c6fH, 04a7484aaH, 05cb0a9dcH, 076f988daH\r
37DD 0983e5152H, 0a831c66dH, 0b00327c8H, 0bf597fc7H\r
38DD 0c6e00bf3H, 0d5a79147H, 006ca6351H, 014292967H\r
39DD 027b70a85H, 02e1b2138H, 04d2c6dfcH, 053380d13H\r
40DD 0650a7354H, 0766a0abbH, 081c2c92eH, 092722c85H\r
41DD 0a2bfe8a1H, 0a81a664bH, 0c24b8b70H, 0c76c51a3H\r
42DD 0d192e819H, 0d6990624H, 0f40e3585H, 0106aa070H\r
43DD 019a4c116H, 01e376c08H, 02748774cH, 034b0bcb5H\r
44DD 0391c0cb3H, 04ed8aa4aH, 05b9cca4fH, 0682e6ff3H\r
45DD 0748f82eeH, 078a5636fH, 084c87814H, 08cc70208H\r
46DD 090befffaH, 0a4506cebH, 0bef9a3f7H, 0c67178f2H\r
47; @\r
48\r
49CONST ENDS\r
50\r
51; _TEXT$SHA256OPT SEGMENT 'CODE'\r
52\r
; Assembler mode and register-role equates shared by the macros and the
; procedure in this file.
; NOTE(review): r0, r3, REG_ABI_PARAM_0..2 and IS_LINUX are not defined in
; this file; presumably they come from the included 7zAsm.asm - confirm.
53ifndef x64\r
; 32-bit build only: enable the .686 instruction set and XMM instructions.
54 .686\r
55 .xmm\r
56endif\r
57 \r
58; jwasm-based assemblers for linux and linker from new versions of binutils\r
59; can generate incorrect code for load [ARRAY + offset] instructions.\r
60; 22.00: we load K_CONST offset to (rTable) register to avoid jwasm+binutils problem \r
; rTable holds the address of K_CONST (loaded with lea in the procedure) and
; is the base register of every table load in RND4.
61 rTable equ r0\r
62 ; rTable equ K_CONST\r
63 \r
64ifdef x64\r
; x64: the block count arrives in the third ABI parameter register.
65 rNum equ REG_ABI_PARAM_2\r
66 if (IS_LINUX eq 0)\r
; Two 16-byte slots; MY_PROLOG stores xmm8 and xmm9 in them.
; LOCAL_SIZE is left undefined for x64 Linux, where MY_PROLOG does not use it.
67 LOCAL_SIZE equ (16 * 2)\r
68 endif\r
69else\r
; x86: the block count is loaded from the stack into r3 by MY_PROLOG.
70 rNum equ r3\r
; One 16-byte slot; it backs state0_save ([r4]) on x86.
71 LOCAL_SIZE equ (16 * 1)\r
72endif\r
73\r
; Pointer to the 32-byte state (first parameter) and to the input blocks
; (second parameter).
74rState equ REG_ABI_PARAM_0\r
75rData equ REG_ABI_PARAM_1\r
76\r
77\r
78\r
79\r
80\r
81\r
; MY_SHA_INSTR cmd, a1, a2
; Emits one instruction as four raw bytes instead of a mnemonic:
;   0Fh, 38h, the opcode byte cmd, and the byte 0C0h + a1 * 8 + a2.
; a1 and a2 are register numbers, not register names: a1 lands in bits 3..5
; and a2 in bits 0..2 of the last byte. No prefix byte is emitted, so only
; register numbers 0..7 can be expressed.
82MY_SHA_INSTR macro cmd, a1, a2\r
83 db 0fH, 038H, cmd, (0c0H + a1 * 8 + a2)\r
84endm\r
85\r
; Opcode byte that follows 0Fh 38h for each SHA-256 instruction; passed as
; the cmd argument of MY_SHA_INSTR by the three wrapper macros below.
86cmd_sha256rnds2 equ 0cbH\r
87cmd_sha256msg1 equ 0ccH\r
88cmd_sha256msg2 equ 0cdH\r
89\r
; MY_sha256rnds2 a1, a2
; Emits sha256rnds2 with XMM register numbers a1 (bits 3..5) and a2 (bits 0..2).
; NOTE: per the Intel SDM the instruction also reads XMM0 implicitly; RND4
; puts the K + W sum in msg (xmm0) before each use.
90MY_sha256rnds2 macro a1, a2\r
91 MY_SHA_INSTR cmd_sha256rnds2, a1, a2\r
92endm\r
93\r
; MY_sha256msg1 a1, a2
; Emits sha256msg1 with XMM register numbers a1 (bits 3..5) and a2 (bits 0..2).
94MY_sha256msg1 macro a1, a2\r
95 MY_SHA_INSTR cmd_sha256msg1, a1, a2\r
96endm\r
97\r
; MY_sha256msg2 a1, a2
; Emits sha256msg2 with XMM register numbers a1 (bits 3..5) and a2 (bits 0..2).
98MY_sha256msg2 macro a1, a2\r
99 MY_SHA_INSTR cmd_sha256msg2, a1, a2\r
100endm\r
101\r
; MY_PROLOG: entry sequence of Sha256_UpdateBlocks_HW (takes no arguments).
; Saves the registers the routine overwrites and, on x86, loads the stack
; parameters and reserves an aligned local slot. MY_EPILOG undoes it.
; NOTE(review): r3/r4/r5, REG_SIZE, IS_LINUX and IS_CDECL are not defined in
; this file (presumably 7zAsm.asm); r4 is used here the way a stack pointer
; would be - confirm.
102MY_PROLOG macro\r
103 ifdef x64\r
104 if (IS_LINUX eq 0)\r
; x64, non-Linux: store xmm6/xmm7 at [r4 + 8] and [r4 + 24] (above the
; current r4), lower r4 by LOCAL_SIZE + 8, then store xmm8/xmm9 in the two
; 16-byte slots just reserved. Nothing is saved on x64 Linux.
; movdqa needs 16-byte-aligned addresses, so this assumes r4 + 8 is 16-byte
; aligned on entry - TODO confirm against the calling convention.
; NOTE(review): no unwind directives are visible in this file; confirm
; whether MY_PROC supplies them for this r4 adjustment.
105 movdqa [r4 + 8], xmm6\r
106 movdqa [r4 + 8 + 16], xmm7\r
107 sub r4, LOCAL_SIZE + 8\r
108 movdqa [r4 ], xmm8\r
109 movdqa [r4 + 16], xmm9\r
110 endif\r
111 else ; x86\r
; x86: save r3 (used as rNum) and r5, then keep the pre-alignment r4 in r5
; so MY_EPILOG can restore it.
112 push r3\r
113 push r5\r
114 mov r5, r4\r
; PARAM_OFFSET skips the two pushed registers plus one more REG_SIZE slot
; (presumably the return address).
115 NUM_PUSH_REGS equ 2\r
116 PARAM_OFFSET equ (REG_SIZE * (1 + NUM_PUSH_REGS))\r
117 if (IS_CDECL gt 0)\r
; cdecl: all three parameters are read from the stack.
118 mov rState, [r4 + PARAM_OFFSET]\r
119 mov rData, [r4 + PARAM_OFFSET + REG_SIZE * 1]\r
120 mov rNum, [r4 + PARAM_OFFSET + REG_SIZE * 2]\r
121 else ; fastcall\r
; fastcall: only rNum is read from the stack; rState and rData are not
; loaded here (presumably they are already in their registers).
122 mov rNum, [r4 + PARAM_OFFSET]\r
123 endif\r
; Round r4 down to a 16-byte boundary and reserve LOCAL_SIZE bytes, so that
; state0_save ([r4]) can be accessed with movdqa.
124 and r4, -16\r
125 sub r4, LOCAL_SIZE\r
126 endif\r
127endm\r
128\r
; MY_EPILOG: exit sequence of Sha256_UpdateBlocks_HW; mirrors MY_PROLOG in
; reverse order and then invokes MY_ENDP.
; NOTE(review): MY_ENDP is defined outside this file; presumably it emits
; the return and closes the procedure - confirm.
129MY_EPILOG macro\r
130 ifdef x64\r
131 if (IS_LINUX eq 0)\r
; x64, non-Linux: reload xmm8/xmm9 from the local slots, release
; LOCAL_SIZE + 8 bytes, then reload xmm6/xmm7 from [r4 + 8] and [r4 + 24],
; the same addresses MY_PROLOG stored them to.
132 movdqa xmm8, [r4]\r
133 movdqa xmm9, [r4 + 16]\r
134 add r4, LOCAL_SIZE + 8\r
135 movdqa xmm6, [r4 + 8]\r
136 movdqa xmm7, [r4 + 8 + 16]\r
137 endif\r
138 else ; x86\r
; x86: r5 still holds the r4 value from before the alignment in MY_PROLOG;
; restoring it drops the local slot, then the pushed registers are popped in
; reverse order.
139 mov r4, r5\r
140 pop r5\r
141 pop r3\r
142 endif\r
143 MY_ENDP\r
144endm\r
145\r
146\r
; XMM register roles for the round loop.
; msg and tmp are both xmm0: msg carries the K + W sum consumed by
; sha256rnds2, tmp is scratch for REVERSE_STATE and for the palignr step in
; RND4. They must never be live at the same time.
147msg equ xmm0\r
148tmp equ xmm0\r
; Register NUMBERS (not names) for the two state halves, as needed by the
; byte-emitting MY_sha256* macros; state0/state1 below are the matching names.
149state0_N equ 2\r
150state1_N equ 3\r
; First of the four message-word registers: LOAD_W k fills xmm(w_regs + k),
; so the message words occupy xmm4..xmm7.
151w_regs equ 4\r
152\r
153\r
; Copy of state1 taken at the top of each block and added back at the end.
154state1_save equ xmm1\r
155state0 equ @CatStr(xmm, %state0_N)\r
156state1 equ @CatStr(xmm, %state1_N)\r
157\r
158\r
159ifdef x64\r
; x64: the state0 copy and the shuffle mask each get their own register.
160 state0_save equ xmm8\r
161 mask2 equ xmm9\r
162else\r
; x86: the state0 copy lives in the aligned stack slot reserved by
; MY_PROLOG, and the mask shares xmm0 with msg/tmp, so it has to be reloaded
; for every block.
163 state0_save equ [r4]\r
164 mask2 equ xmm0\r
165endif\r
166\r
; LOAD_MASK: loads the 16-byte Reverse_Endian_Mask into mask2 with an
; aligned load (the mask is declared after "align 16" in the CONST segment).
167LOAD_MASK macro\r
168 movdqa mask2, XMMWORD PTR Reverse_Endian_Mask\r
169endm\r
170\r
; LOAD_W k (k is required)
; Loads 16 bytes from rData + 16 * k into xmm(w_regs + k) with an unaligned
; load, then applies pshufb with mask2, which reverses the byte order of each
; dword. mask2 must already hold Reverse_Endian_Mask (see LOAD_MASK).
171LOAD_W macro k:req\r
172 movdqu @CatStr(xmm, %(w_regs + k)), [rData + (16 * (k))]\r
173 pshufb @CatStr(xmm, %(w_regs + k)), mask2\r
174endm\r
175\r
176\r
; Scheduling distances used by RND4: the sha256msg1 step of iteration k
; targets W register (k + pre1) mod 4, and the palignr/paddd/sha256msg2 step
; targets W register (k + pre2) mod 4.
; Constraints on the two values, as stated by the original author:
177; pre1 <= 4 && pre2 >= 1 && pre1 > pre2 && (pre1 - pre2) <= 1\r
178pre1 equ 3\r
179pre2 equ 2\r
180 \r
181\r
182\r
; RND4 k   (k = 0..15, assembly-time constant)
; One step of the unrolled round loop: two sha256rnds2 instructions (two
; rounds each per the Intel SDM) using table entries 4k..4k+3, interleaved
; with message-schedule updates for W registers used by later steps.
; Clobbers msg/tmp (xmm0); updates state0, state1 and the W registers.
183RND4 macro k\r
; msg = K_CONST[4k .. 4k+3] + W register (k mod 4).
184 movdqa msg, xmmword ptr [rTable + (k) * 16]\r
185 paddd msg, @CatStr(xmm, %(w_regs + ((k + 0) mod 4)))\r
186 MY_sha256rnds2 state0_N, state1_N\r
; Shuffle 0Eh copies dwords 2 and 3 of msg into dwords 0 and 1, giving the
; second sha256rnds2 the upper half of the K + W sum.
187 pshufd msg, msg, 0eH\r
188 \r
; With pre1 = 3 this is active for k = 1..12.
189 if (k GE (4 - pre1)) AND (k LT (16 - pre1))\r
190 ; w4[0] = msg1(w4[-4], w4[-3])\r
191 MY_sha256msg1 (w_regs + ((k + pre1) mod 4)), (w_regs + ((k + pre1 - 3) mod 4))\r
192 endif\r
193 \r
; Second pair of rounds, with the roles of the two state registers swapped.
194 MY_sha256rnds2 state1_N, state0_N\r
195\r
; With pre2 = 2 this is active for k = 2..13. msg is no longer needed
; after the second sha256rnds2, so xmm0 is reused as tmp here.
196 if (k GE (4 - pre2)) AND (k LT (16 - pre2))\r
; tmp = the 16 bytes starting 4 bytes into the pair (W[k+pre2-2], W[k+pre2-1]),
; which is then added to the target W register before sha256msg2.
197 movdqa tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 1) mod 4)))\r
198 palignr tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 2) mod 4))), 4\r
199 paddd @CatStr(xmm, %(w_regs + ((k + pre2) mod 4))), tmp\r
200 ; w4[0] = msg2(w4[0], w4[-1])\r
201 MY_sha256msg2 %(w_regs + ((k + pre2) mod 4)), %(w_regs + ((k + pre2 - 1) mod 4))\r
202 endif\r
203endm\r
204\r
205\r
206\r
207\r
208\r
; REVERSE_STATE: regroups the eight state words held in state0/state1.
; The procedure invokes it once right after loading the state from [rState]
; and once right before storing it back. The lane comments below are the
; original author's notation for the register contents at each step.
; Clobbers tmp (xmm0).
209REVERSE_STATE macro\r
210 ; state0 ; dcba\r
211 ; state1 ; hgfe\r
212 pshufd tmp, state0, 01bH ; abcd\r
213 pshufd state0, state1, 01bH ; efgh\r
214 movdqa state1, state0 ; efgh\r
215 punpcklqdq state0, tmp ; cdgh\r
216 punpckhqdq state1, tmp ; abef\r
217endm\r
218\r
219\r
;-----------------------------------------------------------------------
; Sha256_UpdateBlocks_HW  (declared through MY_PROC with 3 arguments)
;
; Processes rNum consecutive 64-byte blocks starting at rData and updates
; the 32-byte state at rState in place, using the SHA instructions emitted
; by the MY_sha256* macros.
;
; In:    rState = pointer to the 32-byte state; read and written with
;                 movdqu, so no particular alignment is required
;        rData  = pointer to the input blocks; read with movdqu
;        rNum   = number of 64-byte blocks; 0 returns without reading or
;                 writing the state
; Uses:  rTable, xmm0..xmm7, plus xmm8/xmm9 on x64 or one 16-byte stack
;        slot on x86. MY_PROLOG / MY_EPILOG perform the save and restore.
; NOTE(review): presumably called from C as
;        void Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data,
;                                    size_t numBlocks)
;        - confirm against the declaration in Sha256.c.
;-----------------------------------------------------------------------
MY_PROC Sha256_UpdateBlocks_HW, 3
    MY_PROLOG

        ; Early out for an empty input. "test r, r" sets ZF exactly as
        ; "cmp r, 0" does, with a shorter encoding.
        test    rNum, rNum
        jz      end_c

        ; The table address is only needed once there is a block to hash.
        ; It is kept in a register so that RND4 can address the constants
        ; as [rTable + k * 16].
        lea     rTable, [K_CONST]

        movdqu  state0, [rState]        ; dcba
        movdqu  state1, [rState + 16]   ; hgfe

        REVERSE_STATE

        ; x64: mask2 has a dedicated register, so it is loaded once.
        ; This must follow REVERSE_STATE only on x86, where mask2 is xmm0.
        ifdef x64
        LOAD_MASK
        endif

    align 16
    nextBlock:
        ; Keep the incoming state; it is added back after the 64 rounds.
        movdqa  state0_save, state0
        movdqa  state1_save, state1

        ; x86: mask2 shares xmm0 with msg/tmp, which RND4 overwrites, so
        ; the mask is reloaded for every block.
        ifndef x64
        LOAD_MASK
        endif

        ; Load the 64-byte block into xmm4..xmm7, reversing the bytes of
        ; each dword.
        LOAD_W 0
        LOAD_W 1
        LOAD_W 2
        LOAD_W 3

        ; 16 unrolled RND4 steps; k is an assembly-time counter.
        k = 0
        rept 16
          RND4 k
          k = k + 1
        endm

        paddd   state0, state0_save
        paddd   state1, state1_save

        add     rData, 64               ; advance to the next block
        sub     rNum, 1
        jnz     nextBlock

        REVERSE_STATE

        movdqu  [rState], state0
        movdqu  [rState + 16], state1

  end_c:
MY_EPILOG
272\r
273; _TEXT$SHA256OPT ENDS\r
274\r
275end\r