1 ; Sha256Opt.asm -- SHA-256 optimized code for SHA-256 x86 hardware instructions
\r
2 ; 2022-04-17 : Igor Pavlov : Public domain
\r
11 ; we can use external SHA256_K_ARRAY defined in Sha256.c
\r
12 ; but we must guarantee that SHA256_K_ARRAY is aligned on a 16-byte boundary
\r
; K_CONST names the round-constant table. The two equates below map it to an external
; symbol, with and without a leading underscore, and EXTRN declares it as xmmword data.
; NOTE(review): the conditional directives that choose between the two equates are not
; visible in this excerpt, and the file also carries its own DD table of constants -
; confirm whether this external-symbol path is actually enabled.
16 K_CONST equ SHA256_K_ARRAY
\r
18 K_CONST equ _SHA256_K_ARRAY
\r
20 EXTRN K_CONST:xmmword
\r
; pshufb control mask: every group of four byte indices is listed in reverse order,
; so shuffling with it byte-swaps each 32-bit lane of an xmm register.
; NOTE(review): the mask is read with movdqa, which needs a 16-byte aligned address;
; the alignment directive is not visible in this excerpt - confirm it is present.
26 Reverse_Endian_Mask db 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
\r
; 64 dwords (16 rows of 4): the SHA-256 round constants K[0..63], one row per
; group of four rounds.
; NOTE(review): the label line of this table is not visible in this excerpt; the round
; code reads 16 bytes per step from [rTable + k * 16], presumably from this table.
31 DD 0428a2f98H, 071374491H, 0b5c0fbcfH, 0e9b5dba5H
\r
32 DD 03956c25bH, 059f111f1H, 0923f82a4H, 0ab1c5ed5H
\r
33 DD 0d807aa98H, 012835b01H, 0243185beH, 0550c7dc3H
\r
34 DD 072be5d74H, 080deb1feH, 09bdc06a7H, 0c19bf174H
\r
35 DD 0e49b69c1H, 0efbe4786H, 00fc19dc6H, 0240ca1ccH
\r
36 DD 02de92c6fH, 04a7484aaH, 05cb0a9dcH, 076f988daH
\r
37 DD 0983e5152H, 0a831c66dH, 0b00327c8H, 0bf597fc7H
\r
38 DD 0c6e00bf3H, 0d5a79147H, 006ca6351H, 014292967H
\r
39 DD 027b70a85H, 02e1b2138H, 04d2c6dfcH, 053380d13H
\r
40 DD 0650a7354H, 0766a0abbH, 081c2c92eH, 092722c85H
\r
41 DD 0a2bfe8a1H, 0a81a664bH, 0c24b8b70H, 0c76c51a3H
\r
42 DD 0d192e819H, 0d6990624H, 0f40e3585H, 0106aa070H
\r
43 DD 019a4c116H, 01e376c08H, 02748774cH, 034b0bcb5H
\r
44 DD 0391c0cb3H, 04ed8aa4aH, 05b9cca4fH, 0682e6ff3H
\r
45 DD 0748f82eeH, 078a5636fH, 084c87814H, 08cc70208H
\r
46 DD 090befffaH, 0a4506cebH, 0bef9a3f7H, 0c67178f2H
\r
51 ; _TEXT$SHA256OPT SEGMENT 'CODE'
\r
58 ; jwasm-based assemblers for linux and linker from new versions of binutils
\r
59 ; can generate incorrect code for load [ARRAY + offset] instructions.
\r
60 ; 22.00: we load the address of K_CONST into the (rTable) register to avoid the jwasm+binutils problem
\r
62 ; rTable equ K_CONST
\r
; Register and stack-frame aliases used by the routine.
; REG_ABI_PARAM_0..2 are defined outside this file excerpt; presumably they are the
; registers carrying the 1st..3rd arguments under the target ABI - confirm.
;   rState - base address of the 32-byte hash state (read and written via [rState])
;   rData  - base address of the input data (read via [rData + 16 * k])
;   rNum   - third argument; its use is not visible in this excerpt
;            (presumably the number of 64-byte blocks - confirm)
; LOCAL_SIZE is the number of bytes of stack scratch: two 16-byte slots or one.
; NOTE(review): the conditionals selecting between the two LOCAL_SIZE definitions
; are not visible in this excerpt.
65 rNum equ REG_ABI_PARAM_2
\r
67 LOCAL_SIZE equ (16 * 2)
\r
71 LOCAL_SIZE equ (16 * 1)
\r
74 rState equ REG_ABI_PARAM_0
\r
75 rData equ REG_ABI_PARAM_1
\r
; Hand-assembled encodings of the SHA-256 instructions, emitted as raw bytes
; (presumably so that assemblers without the SHA mnemonics can still build this file).
;
; MY_SHA_INSTR cmd, a1, a2
;   cmd    - third opcode byte, emitted after the fixed 0fH, 038H bytes
;   a1, a2 - xmm register NUMBERS (not register names). They form the last byte as
;            0c0H + a1 * 8 + a2, i.e. a register-direct ModRM byte with a1 in the
;            reg field (destination) and a2 in the r/m field (source).
;   Exactly four bytes are emitted and no REX prefix, so a1 and a2 must be in 0..7.
82 MY_SHA_INSTR macro cmd, a1, a2
\r
83 db 0fH, 038H, cmd, (0c0H + a1 * 8 + a2)
\r
; Third opcode byte for each instruction (0F 38 CB / CC / CD).
86 cmd_sha256rnds2 equ 0cbH
\r
87 cmd_sha256msg1 equ 0ccH
\r
88 cmd_sha256msg2 equ 0cdH
\r
; Thin wrappers: one macro per instruction, each taking (dest number, source number).
90 MY_sha256rnds2 macro a1, a2
\r
91 MY_SHA_INSTR cmd_sha256rnds2, a1, a2
\r
94 MY_sha256msg1 macro a1, a2
\r
95 MY_SHA_INSTR cmd_sha256msg1, a1, a2
\r
98 MY_sha256msg2 macro a1, a2
\r
99 MY_SHA_INSTR cmd_sha256msg2, a1, a2
\r
; Entry/exit register handling.
; NOTE(review): the enclosing macro headers and the conditional directives that
; separate the 64-bit and 32-bit paths are not visible in this excerpt.
;
; Save path: xmm6 and xmm7 are stored at [r4 + 8] and [r4 + 24] BEFORE r4 is lowered,
; then LOCAL_SIZE + 8 bytes are reserved and xmm9 is stored in the new area.
; Presumably r4 is the stack pointer alias and this is the Microsoft x64 path, where
; xmm6-xmm15 are callee-saved and the first two slots lie in the caller's shadow
; space - confirm. movdqa requires every one of these addresses to be 16-byte aligned.
105 movdqa [r4 + 8], xmm6
\r
106 movdqa [r4 + 8 + 16], xmm7
\r
107 sub r4, LOCAL_SIZE + 8
\r
109 movdqa [r4 + 16], xmm9
\r
; Stack-parameter path: PARAM_OFFSET skips (1 + NUM_PUSH_REGS) register-sized slots
; above r4 - presumably the return address plus two pushed registers (the pushes are
; not visible in this excerpt).
115 NUM_PUSH_REGS equ 2
\r
116 PARAM_OFFSET equ (REG_SIZE * (1 + NUM_PUSH_REGS))
\r
; Variant that reads all three arguments from consecutive stack slots.
118 mov rState, [r4 + PARAM_OFFSET]
\r
119 mov rData, [r4 + PARAM_OFFSET + REG_SIZE * 1]
\r
120 mov rNum, [r4 + PARAM_OFFSET + REG_SIZE * 2]
\r
; Variant that reads only rNum from the first stack slot (presumably the first two
; arguments already arrive in registers under that calling convention - confirm).
122 mov rNum, [r4 + PARAM_OFFSET]
\r
; Restore path: mirrors the save path in reverse order - xmm9 first, then the
; reserved area is released, then xmm6 and xmm7 are reloaded from their slots.
133 movdqa xmm9, [r4 + 16]
\r
134 add r4, LOCAL_SIZE + 8
\r
135 movdqa xmm6, [r4 + 8]
\r
136 movdqa xmm7, [r4 + 8 + 16]
\r
; Aliases for the hash-state registers.
; state1_save is a copy of state1 held in xmm1.
154 state1_save equ xmm1
\r
; state0 / state1 are built by appending the numeric value of state0_N / state1_N
; (defined outside this excerpt) to "xmm". The raw-opcode macros take those same
; numbers directly, so the register name and its number cannot drift apart.
155 state0 equ @CatStr(xmm, %state0_N)
\r
156 state1 equ @CatStr(xmm, %state1_N)
\r
; state0_save lives either in xmm8 or in the 16-byte stack slot at [r4].
; NOTE(review): the conditional choosing between them is not visible here;
; presumably xmm8 on 64-bit builds and the stack slot on 32-bit builds - confirm.
160 state0_save equ xmm8
\r
163 state0_save equ [r4]
\r
; Load the byte-swap shuffle mask into mask2 (aligned load).
168 movdqa mask2, XMMWORD PTR Reverse_Endian_Mask
\r
; Load 16 input bytes at offset 16 * k (unaligned load) into register xmm(w_regs + k),
; then byte-swap each of its four dwords with the mask.
; NOTE(review): w_regs and k are defined outside this excerpt; the enclosing macro
; headers are not visible.
172 movdqu @CatStr(xmm, %(w_regs + k)), [rData + (16 * (k))]
\r
173 pshufb @CatStr(xmm, %(w_regs + k)), mask2
\r
; Body of one four-round step, parameterised by the step index k.
; NOTE(review): the macro header, the definitions of pre1 / pre2 / msg / tmp / w_regs
; and the closing endif / endm lines are not visible in this excerpt.
; Constraint on the two look-ahead distances used by the message schedule:
177 ; pre1 <= 4 && pre2 >= 1 && pre1 > pre2 && (pre1 - pre2) <= 1
\r
; msg = four round constants for this step + the four message words held in
; register xmm(w_regs + (k mod 4)).
184 movdqa msg, xmmword ptr [rTable + (k) * 16]
\r
185 paddd msg, @CatStr(xmm, %(w_regs + ((k + 0) mod 4)))
\r
; First two rounds: destination state0, source state1. sha256rnds2 takes its round
; inputs implicitly from xmm0, so msg is presumably an alias of xmm0 - confirm.
186 MY_sha256rnds2 state0_N, state1_N
\r
; Move the upper two dwords of msg into the lower two for the next two rounds.
187 pshufd msg, msg, 0eH
\r
; Message schedule, part 1: only emitted for k in [4 - pre1, 16 - pre1).
; The arguments are passed unevaluated but parenthesised, so "a1 * 8" inside
; MY_SHA_INSTR still applies to the whole expression.
189 if (k GE (4 - pre1)) AND (k LT (16 - pre1))
\r
190 ; w4[0] = msg1(w4[-4], w4[-3])
\r
191 MY_sha256msg1 (w_regs + ((k + pre1) mod 4)), (w_regs + ((k + pre1 - 3) mod 4))
\r
; Second two rounds: operands swapped, destination state1, source state0.
194 MY_sha256rnds2 state1_N, state0_N
\r
; Message schedule, part 2: only emitted for k in [4 - pre2, 16 - pre2).
; tmp = the 16 bytes starting 4 bytes into the pair (w[-1] : w[-2]); it is added to
; the target register before sha256msg2 finishes the four new message words.
196 if (k GE (4 - pre2)) AND (k LT (16 - pre2))
\r
197 movdqa tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 1) mod 4)))
\r
198 palignr tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 2) mod 4))), 4
\r
199 paddd @CatStr(xmm, %(w_regs + ((k + pre2) mod 4))), tmp
\r
200 ; w4[0] = msg2(w4[0], w4[-1])
\r
201 MY_sha256msg2 %(w_regs + ((k + pre2) mod 4)), %(w_regs + ((k + pre2 - 1) mod 4))
\r
; REVERSE_STATE: rearranges the eight state words held in state0 / state1.
; With state0 = dcba and state1 = hgfe on entry (the order left by the unaligned
; loads from [rState]), the result is state0 = cdgh and state1 = abef, as tracked by
; the per-line lane comments below. tmp is used as scratch and is overwritten.
; NOTE(review): presumably this converts between the in-memory word order and the
; operand layout expected by sha256rnds2 - confirm. The closing endm is not visible
; in this excerpt.
209 REVERSE_STATE macro
\r
212 pshufd tmp, state0, 01bH ; abcd
\r
213 pshufd state0, state1, 01bH ; efgh
\r
214 movdqa state1, state0 ; efgh
\r
215 punpcklqdq state0, tmp ; cdgh
\r
216 punpckhqdq state1, tmp ; abef
\r
; Sha256_UpdateBlocks_HW: updates the 32-byte hash state at [rState] in place.
; MY_PROC is defined outside this file excerpt; the trailing 3 is presumably the
; number of parameters (rState, rData, rNum) - confirm.
; NOTE(review): the prolog/epilog invocations, the per-block loop, the round sequence
; and the procedure end are not visible in this excerpt.
220 MY_PROC Sha256_UpdateBlocks_HW, 3
\r
; Take the table address into a register once, so the round code can address it
; as [rTable + k * 16].
223 lea rTable, [K_CONST]
\r
; Load the current state with unaligned loads (no alignment assumed for rState).
228 movdqu state0, [rState] ; dcba
\r
229 movdqu state1, [rState + 16] ; hgfe
\r
; Keep a copy of the state taken before the rounds.
; When state0_save is the stack slot [r4], movdqa requires r4 to be 16-byte aligned.
239 movdqa state0_save, state0
\r
240 movdqa state1_save, state1
\r
; Feed-forward: add the saved copy back into the state after the rounds.
258 paddd state0, state0_save
\r
259 paddd state1, state1_save
\r
; Write the updated state back with unaligned stores.
267 movdqu [rState], state0
\r
268 movdqu [rState + 16], state1
\r
273 ; _TEXT$SHA256OPT ENDS
\r