; Sha256Opt.asm -- SHA-256 optimized code using x86 SHA hardware instructions
; 2022-04-17 : Igor Pavlov : Public domain

include 7zAsm.asm

MY_ASM_START

; .data
; public K

; we could use the external SHA256_K_ARRAY defined in Sha256.c,
; but we must guarantee that SHA256_K_ARRAY is aligned to 16 bytes

COMMENT @
ifdef x64
K_CONST equ SHA256_K_ARRAY
else
K_CONST equ _SHA256_K_ARRAY
endif
EXTRN K_CONST:xmmword
@

CONST SEGMENT

align 16
Reverse_Endian_Mask db 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
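; pshufb control mask: reverses the byte order inside each 32-bit lane,
; converting the little-endian dwords loaded from the input block into
; the big-endian words that SHA-256 operates on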

; COMMENT @
align 16
K_CONST \
DD 0428a2f98H, 071374491H, 0b5c0fbcfH, 0e9b5dba5H
DD 03956c25bH, 059f111f1H, 0923f82a4H, 0ab1c5ed5H
DD 0d807aa98H, 012835b01H, 0243185beH, 0550c7dc3H
DD 072be5d74H, 080deb1feH, 09bdc06a7H, 0c19bf174H
DD 0e49b69c1H, 0efbe4786H, 00fc19dc6H, 0240ca1ccH
DD 02de92c6fH, 04a7484aaH, 05cb0a9dcH, 076f988daH
DD 0983e5152H, 0a831c66dH, 0b00327c8H, 0bf597fc7H
DD 0c6e00bf3H, 0d5a79147H, 006ca6351H, 014292967H
DD 027b70a85H, 02e1b2138H, 04d2c6dfcH, 053380d13H
DD 0650a7354H, 0766a0abbH, 081c2c92eH, 092722c85H
DD 0a2bfe8a1H, 0a81a664bH, 0c24b8b70H, 0c76c51a3H
DD 0d192e819H, 0d6990624H, 0f40e3585H, 0106aa070H
DD 019a4c116H, 01e376c08H, 02748774cH, 034b0bcb5H
DD 0391c0cb3H, 04ed8aa4aH, 05b9cca4fH, 0682e6ff3H
DD 0748f82eeH, 078a5636fH, 084c87814H, 08cc70208H
DD 090befffaH, 0a4506cebH, 0bef9a3f7H, 0c67178f2H
; @
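
; K_CONST is the table of 64 SHA-256 round constants K[0..63] from
; FIPS 180-4 (the first 32 bits of the fractional parts of the cube
; roots of the first 64 primes); it is 16-byte aligned so that RND4
; can read four constants at a time with movdqa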

CONST ENDS

; _TEXT$SHA256OPT SEGMENT 'CODE'

ifndef x64
.686
.xmm
endif

; jwasm-based assemblers for Linux and the linker from new versions of binutils
; can generate incorrect code for "load [ARRAY + offset]" instructions.
; 22.00: we load the offset of K_CONST into the (rTable) register to avoid the jwasm+binutils problem
rTable equ r0
; rTable equ K_CONST

ifdef x64
rNum        equ REG_ABI_PARAM_2
  if (IS_LINUX eq 0)
LOCAL_SIZE  equ (16 * 2)
  endif
else
rNum        equ r3
LOCAL_SIZE  equ (16 * 1)
endif

rState equ REG_ABI_PARAM_0
rData  equ REG_ABI_PARAM_1


MY_SHA_INSTR macro cmd, a1, a2
        db 0fH, 038H, cmd, (0c0H + a1 * 8 + a2)
endm

cmd_sha256rnds2 equ 0cbH
cmd_sha256msg1  equ 0ccH
cmd_sha256msg2  equ 0cdH

MY_sha256rnds2 macro a1, a2
        MY_SHA_INSTR cmd_sha256rnds2, a1, a2
endm

MY_sha256msg1 macro a1, a2
        MY_SHA_INSTR cmd_sha256msg1, a1, a2
endm

MY_sha256msg2 macro a1, a2
        MY_SHA_INSTR cmd_sha256msg2, a1, a2
endm
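
; SHA-NI is unknown to older assemblers, so these instructions are emitted
; as raw bytes: 0F 38 <opcode> followed by a register-register ModRM byte
; (0c0H + dest * 8 + src). For example,
;   MY_sha256rnds2 2, 3
; emits 0F 38 CB D3, i.e. "sha256rnds2 xmm2, xmm3"; sha256rnds2 also reads
; xmm0 as an implicit third operand that supplies the two W+K round inputs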

MY_PROLOG macro
  ifdef x64
    if (IS_LINUX eq 0)
        movdqa  [r4 + 8], xmm6
        movdqa  [r4 + 8 + 16], xmm7
        sub     r4, LOCAL_SIZE + 8
        movdqa  [r4], xmm8
        movdqa  [r4 + 16], xmm9
    endif
  else ; x86
        push    r3
        push    r5
        mov     r5, r4
NUM_PUSH_REGS   equ 2
PARAM_OFFSET    equ (REG_SIZE * (1 + NUM_PUSH_REGS))
    if (IS_CDECL gt 0)
        mov     rState, [r4 + PARAM_OFFSET]
        mov     rData,  [r4 + PARAM_OFFSET + REG_SIZE * 1]
        mov     rNum,   [r4 + PARAM_OFFSET + REG_SIZE * 2]
    else ; fastcall
        mov     rNum,   [r4 + PARAM_OFFSET]
    endif
        and     r4, -16
        sub     r4, LOCAL_SIZE
  endif
endm
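
; Win64 ABI: xmm6-xmm15 are callee-saved, so xmm6/xmm7 go to the caller's
; 32-byte home area at [rsp + 8] (16-byte aligned on entry) and xmm8/xmm9
; to freshly reserved stack space; on x64 Linux all xmm registers are
; volatile, so nothing is saved. The x86 path saves r3/r5, loads the
; stack-passed parameters (all three for cdecl, only the third for
; fastcall), and aligns r4 to 16 bytes so that movdqa can address the
; state0_save slot at [r4]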

MY_EPILOG macro
  ifdef x64
    if (IS_LINUX eq 0)
        movdqa  xmm8, [r4]
        movdqa  xmm9, [r4 + 16]
        add     r4, LOCAL_SIZE + 8
        movdqa  xmm6, [r4 + 8]
        movdqa  xmm7, [r4 + 8 + 16]
    endif
  else ; x86
        mov     r4, r5
        pop     r5
        pop     r3
  endif
        MY_ENDP
endm


msg      equ xmm0
tmp      equ xmm0
state0_N equ 2
state1_N equ 3
w_regs   equ 4


state1_save equ xmm1
state0      equ @CatStr(xmm, %state0_N)
state1      equ @CatStr(xmm, %state1_N)


ifdef x64
state0_save equ xmm8
mask2       equ xmm9
else
state0_save equ [r4]
mask2       equ xmm0
endif
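
; x86 has only xmm0-xmm7, so state0_save is spilled to the stack slot
; reserved by MY_PROLOG and mask2 has to share xmm0 with msg/tmp; this is
; why the x86 build reloads the mask inside the block loop (see LOAD_MASK
; below), while x64 keeps it in xmm9 for the whole call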

LOAD_MASK macro
        movdqa  mask2, XMMWORD PTR Reverse_Endian_Mask
endm

LOAD_W macro k:req
        movdqu  @CatStr(xmm, %(w_regs + k)), [rData + (16 * (k))]
        pshufb  @CatStr(xmm, %(w_regs + k)), mask2
endm
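
; LOAD_W k loads the message words W[4*k .. 4*k+3] (16 bytes) with an
; unaligned load and byte-swaps each dword into the big-endian order
; required by SHA-256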

; pre1 <= 4 && pre2 >= 1 && pre1 > pre2 && (pre1 - pre2) <= 1
pre1 equ 3
pre2 equ 2
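
; pre1/pre2 pipeline the message schedule: within RND4 for quad k,
; sha256msg1 prepares the W values pre1 quads ahead and the
; palignr/paddd/sha256msg2 sequence completes them pre2 quads ahead,
; so the schedule work can overlap the sha256rnds2 rounds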


RND4 macro k
        movdqa  msg, xmmword ptr [rTable + (k) * 16]
        paddd   msg, @CatStr(xmm, %(w_regs + ((k + 0) mod 4)))
        MY_sha256rnds2 state0_N, state1_N
        pshufd  msg, msg, 0eH

    if (k GE (4 - pre1)) AND (k LT (16 - pre1))
        ; w4[0] = msg1(w4[-4], w4[-3])
        MY_sha256msg1 (w_regs + ((k + pre1) mod 4)), (w_regs + ((k + pre1 - 3) mod 4))
    endif

        MY_sha256rnds2 state1_N, state0_N

    if (k GE (4 - pre2)) AND (k LT (16 - pre2))
        movdqa  tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 1) mod 4)))
        palignr tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 2) mod 4))), 4
        paddd   @CatStr(xmm, %(w_regs + ((k + pre2) mod 4))), tmp
        ; w4[0] = msg2(w4[0], w4[-1])
        MY_sha256msg2 %(w_regs + ((k + pre2) mod 4)), %(w_regs + ((k + pre2 - 1) mod 4))
    endif
endm
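
; RND4 k performs SHA-256 rounds 4*k .. 4*k+3 of the current block:
; it adds K[4*k .. 4*k+3] to the scheduled message words, runs
; sha256rnds2 twice (pshufd 0eH moves the upper two W+K values into the
; low qword for the second pair of rounds), and interleaves the
; message-schedule updates for later quads (see pre1/pre2 above)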


REVERSE_STATE macro
        ; state0 ; dcba
        ; state1 ; hgfe
        pshufd  tmp, state0, 01bH       ; abcd
        pshufd  state0, state1, 01bH    ; efgh
        movdqa  state1, state0          ; efgh
        punpcklqdq state0, tmp          ; cdgh
        punpckhqdq state1, tmp          ; abef
endm
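
; sha256rnds2 works on the state packed as {abef}/{cdgh} rather than the
; natural {abcd}/{efgh} order kept in memory; REVERSE_STATE converts
; dcba/hgfe into cdgh/abef and, being its own inverse, is applied once
; before the block loop and once again after it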
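; Sha256_UpdateBlocks_HW(state, data, numBlocks)
;   rState: pointer to the 8 x 32-bit state words a..h
;   rData : input data, 64 bytes per block
;   rNum  : number of 64-byte blocks to process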
MY_PROC Sha256_UpdateBlocks_HW, 3
        MY_PROLOG

        lea     rTable, [K_CONST]

        cmp     rNum, 0
        je      end_c

        movdqu  state0, [rState]        ; dcba
        movdqu  state1, [rState + 16]   ; hgfe

        REVERSE_STATE

  ifdef x64
        LOAD_MASK
  endif

align 16
nextBlock:
        movdqa  state0_save, state0
        movdqa  state1_save, state1

  ifndef x64
        LOAD_MASK
  endif

        LOAD_W 0
        LOAD_W 1
        LOAD_W 2
        LOAD_W 3

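; 64 rounds: rept expands RND4 for k = 0..15 (4 rounds per expansion)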
k = 0
rept 16
        RND4 k
k = k + 1
endm

        paddd   state0, state0_save
        paddd   state1, state1_save

        add     rData, 64
        sub     rNum, 1
        jnz     nextBlock

        REVERSE_STATE

        movdqu  [rState], state0
        movdqu  [rState + 16], state1

end_c:
        MY_EPILOG

; _TEXT$SHA256OPT ENDS

end