| 1 | ; Sha256Opt.asm -- SHA-256 optimized code for SHA-256 x86 hardware instructions\r |
| 2 | ; 2022-04-17 : Igor Pavlov : Public domain\r |
| 3 | \r |
| 4 | include 7zAsm.asm\r |
| 5 | \r |
| 6 | MY_ASM_START\r |
| 7 | \r |
| 8 | ; .data\r |
| 9 | ; public K\r |
| 10 | \r |
; we can use the external SHA256_K_ARRAY defined in Sha256.c,
; but we must guarantee that SHA256_K_ARRAY is aligned to 16 bytes
| 13 | \r |
| 14 | COMMENT @\r |
| 15 | ifdef x64\r |
| 16 | K_CONST equ SHA256_K_ARRAY\r |
| 17 | else\r |
| 18 | K_CONST equ _SHA256_K_ARRAY\r |
| 19 | endif\r |
| 20 | EXTRN K_CONST:xmmword\r |
| 21 | @\r |
| 22 | \r |
| 23 | CONST SEGMENT\r |
| 24 | \r |
| 25 | align 16\r |
| 26 | Reverse_Endian_Mask db 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12\r |
| 27 | \r |
| 28 | ; COMMENT @\r |
| 29 | align 16\r |
| 30 | K_CONST \\r |
| 31 | DD 0428a2f98H, 071374491H, 0b5c0fbcfH, 0e9b5dba5H\r |
| 32 | DD 03956c25bH, 059f111f1H, 0923f82a4H, 0ab1c5ed5H\r |
| 33 | DD 0d807aa98H, 012835b01H, 0243185beH, 0550c7dc3H\r |
| 34 | DD 072be5d74H, 080deb1feH, 09bdc06a7H, 0c19bf174H\r |
| 35 | DD 0e49b69c1H, 0efbe4786H, 00fc19dc6H, 0240ca1ccH\r |
| 36 | DD 02de92c6fH, 04a7484aaH, 05cb0a9dcH, 076f988daH\r |
| 37 | DD 0983e5152H, 0a831c66dH, 0b00327c8H, 0bf597fc7H\r |
| 38 | DD 0c6e00bf3H, 0d5a79147H, 006ca6351H, 014292967H\r |
| 39 | DD 027b70a85H, 02e1b2138H, 04d2c6dfcH, 053380d13H\r |
| 40 | DD 0650a7354H, 0766a0abbH, 081c2c92eH, 092722c85H\r |
| 41 | DD 0a2bfe8a1H, 0a81a664bH, 0c24b8b70H, 0c76c51a3H\r |
| 42 | DD 0d192e819H, 0d6990624H, 0f40e3585H, 0106aa070H\r |
| 43 | DD 019a4c116H, 01e376c08H, 02748774cH, 034b0bcb5H\r |
| 44 | DD 0391c0cb3H, 04ed8aa4aH, 05b9cca4fH, 0682e6ff3H\r |
| 45 | DD 0748f82eeH, 078a5636fH, 084c87814H, 08cc70208H\r |
| 46 | DD 090befffaH, 0a4506cebH, 0bef9a3f7H, 0c67178f2H\r |
| 47 | ; @\r |
| 48 | \r |
| 49 | CONST ENDS\r |
| 50 | \r |
| 51 | ; _TEXT$SHA256OPT SEGMENT 'CODE'\r |
| 52 | \r |
| 53 | ifndef x64\r |
| 54 | .686\r |
| 55 | .xmm\r |
| 56 | endif\r |
| 57 | \r |
| 58 | ; jwasm-based assemblers for linux and linker from new versions of binutils\r |
| 59 | ; can generate incorrect code for load [ARRAY + offset] instructions.\r |
| 60 | ; 22.00: we load K_CONST offset to (rTable) register to avoid jwasm+binutils problem \r |
| 61 | rTable equ r0\r |
| 62 | ; rTable equ K_CONST\r |
| 63 | \r |
| 64 | ifdef x64\r |
| 65 | rNum equ REG_ABI_PARAM_2\r |
| 66 | if (IS_LINUX eq 0)\r |
| 67 | LOCAL_SIZE equ (16 * 2)\r |
| 68 | endif\r |
| 69 | else\r |
| 70 | rNum equ r3\r |
| 71 | LOCAL_SIZE equ (16 * 1)\r |
| 72 | endif\r |
| 73 | \r |
| 74 | rState equ REG_ABI_PARAM_0\r |
| 75 | rData equ REG_ABI_PARAM_1\r |
| 76 | \r |
| 77 | \r |
| 78 | \r |
| 79 | \r |
| 80 | \r |
| 81 | \r |
| 82 | MY_SHA_INSTR macro cmd, a1, a2\r |
| 83 | db 0fH, 038H, cmd, (0c0H + a1 * 8 + a2)\r |
| 84 | endm\r |
| 85 | \r |
| 86 | cmd_sha256rnds2 equ 0cbH\r |
| 87 | cmd_sha256msg1 equ 0ccH\r |
| 88 | cmd_sha256msg2 equ 0cdH\r |
| 89 | \r |
| 90 | MY_sha256rnds2 macro a1, a2\r |
| 91 | MY_SHA_INSTR cmd_sha256rnds2, a1, a2\r |
| 92 | endm\r |
| 93 | \r |
| 94 | MY_sha256msg1 macro a1, a2\r |
| 95 | MY_SHA_INSTR cmd_sha256msg1, a1, a2\r |
| 96 | endm\r |
| 97 | \r |
| 98 | MY_sha256msg2 macro a1, a2\r |
| 99 | MY_SHA_INSTR cmd_sha256msg2, a1, a2\r |
| 100 | endm\r |
| 101 | \r |
| 102 | MY_PROLOG macro\r |
| 103 | ifdef x64\r |
| 104 | if (IS_LINUX eq 0)\r |
| 105 | movdqa [r4 + 8], xmm6\r |
| 106 | movdqa [r4 + 8 + 16], xmm7\r |
| 107 | sub r4, LOCAL_SIZE + 8\r |
| 108 | movdqa [r4 ], xmm8\r |
| 109 | movdqa [r4 + 16], xmm9\r |
| 110 | endif\r |
| 111 | else ; x86\r |
| 112 | push r3\r |
| 113 | push r5\r |
| 114 | mov r5, r4\r |
| 115 | NUM_PUSH_REGS equ 2\r |
| 116 | PARAM_OFFSET equ (REG_SIZE * (1 + NUM_PUSH_REGS))\r |
| 117 | if (IS_CDECL gt 0)\r |
| 118 | mov rState, [r4 + PARAM_OFFSET]\r |
| 119 | mov rData, [r4 + PARAM_OFFSET + REG_SIZE * 1]\r |
| 120 | mov rNum, [r4 + PARAM_OFFSET + REG_SIZE * 2]\r |
| 121 | else ; fastcall\r |
| 122 | mov rNum, [r4 + PARAM_OFFSET]\r |
| 123 | endif\r |
| 124 | and r4, -16\r |
| 125 | sub r4, LOCAL_SIZE\r |
| 126 | endif\r |
| 127 | endm\r |
| 128 | \r |
| 129 | MY_EPILOG macro\r |
| 130 | ifdef x64\r |
| 131 | if (IS_LINUX eq 0)\r |
| 132 | movdqa xmm8, [r4]\r |
| 133 | movdqa xmm9, [r4 + 16]\r |
| 134 | add r4, LOCAL_SIZE + 8\r |
| 135 | movdqa xmm6, [r4 + 8]\r |
| 136 | movdqa xmm7, [r4 + 8 + 16]\r |
| 137 | endif\r |
| 138 | else ; x86\r |
| 139 | mov r4, r5\r |
| 140 | pop r5\r |
| 141 | pop r3\r |
| 142 | endif\r |
| 143 | MY_ENDP\r |
| 144 | endm\r |
| 145 | \r |
| 146 | \r |
| 147 | msg equ xmm0\r |
| 148 | tmp equ xmm0\r |
| 149 | state0_N equ 2\r |
| 150 | state1_N equ 3\r |
| 151 | w_regs equ 4\r |
| 152 | \r |
| 153 | \r |
| 154 | state1_save equ xmm1\r |
| 155 | state0 equ @CatStr(xmm, %state0_N)\r |
| 156 | state1 equ @CatStr(xmm, %state1_N)\r |
| 157 | \r |
| 158 | \r |
| 159 | ifdef x64\r |
| 160 | state0_save equ xmm8\r |
| 161 | mask2 equ xmm9\r |
| 162 | else\r |
| 163 | state0_save equ [r4]\r |
| 164 | mask2 equ xmm0\r |
| 165 | endif\r |
| 166 | \r |
| 167 | LOAD_MASK macro\r |
| 168 | movdqa mask2, XMMWORD PTR Reverse_Endian_Mask\r |
| 169 | endm\r |
| 170 | \r |
| 171 | LOAD_W macro k:req\r |
| 172 | movdqu @CatStr(xmm, %(w_regs + k)), [rData + (16 * (k))]\r |
| 173 | pshufb @CatStr(xmm, %(w_regs + k)), mask2\r |
| 174 | endm\r |
| 175 | \r |
| 176 | \r |
| 177 | ; pre1 <= 4 && pre2 >= 1 && pre1 > pre2 && (pre1 - pre2) <= 1\r |
| 178 | pre1 equ 3\r |
| 179 | pre2 equ 2\r |
| 180 | \r |
| 181 | \r |
| 182 | \r |
| 183 | RND4 macro k\r |
| 184 | movdqa msg, xmmword ptr [rTable + (k) * 16]\r |
| 185 | paddd msg, @CatStr(xmm, %(w_regs + ((k + 0) mod 4)))\r |
| 186 | MY_sha256rnds2 state0_N, state1_N\r |
| 187 | pshufd msg, msg, 0eH\r |
| 188 | \r |
| 189 | if (k GE (4 - pre1)) AND (k LT (16 - pre1))\r |
| 190 | ; w4[0] = msg1(w4[-4], w4[-3])\r |
| 191 | MY_sha256msg1 (w_regs + ((k + pre1) mod 4)), (w_regs + ((k + pre1 - 3) mod 4))\r |
| 192 | endif\r |
| 193 | \r |
| 194 | MY_sha256rnds2 state1_N, state0_N\r |
| 195 | \r |
| 196 | if (k GE (4 - pre2)) AND (k LT (16 - pre2))\r |
| 197 | movdqa tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 1) mod 4)))\r |
| 198 | palignr tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 2) mod 4))), 4\r |
| 199 | paddd @CatStr(xmm, %(w_regs + ((k + pre2) mod 4))), tmp\r |
| 200 | ; w4[0] = msg2(w4[0], w4[-1])\r |
| 201 | MY_sha256msg2 %(w_regs + ((k + pre2) mod 4)), %(w_regs + ((k + pre2 - 1) mod 4))\r |
| 202 | endif\r |
| 203 | endm\r |
| 204 | \r |
| 205 | \r |
| 206 | \r |
| 207 | \r |
| 208 | \r |
| 209 | REVERSE_STATE macro\r |
| 210 | ; state0 ; dcba\r |
| 211 | ; state1 ; hgfe\r |
| 212 | pshufd tmp, state0, 01bH ; abcd\r |
| 213 | pshufd state0, state1, 01bH ; efgh\r |
| 214 | movdqa state1, state0 ; efgh\r |
| 215 | punpcklqdq state0, tmp ; cdgh\r |
| 216 | punpckhqdq state1, tmp ; abef\r |
| 217 | endm\r |
| 218 | \r |
| 219 | \r |
| 220 | MY_PROC Sha256_UpdateBlocks_HW, 3\r |
| 221 | MY_PROLOG\r |
| 222 | \r |
| 223 | lea rTable, [K_CONST]\r |
| 224 | \r |
| 225 | cmp rNum, 0\r |
| 226 | je end_c\r |
| 227 | \r |
| 228 | movdqu state0, [rState] ; dcba\r |
| 229 | movdqu state1, [rState + 16] ; hgfe\r |
| 230 | \r |
| 231 | REVERSE_STATE\r |
| 232 | \r |
| 233 | ifdef x64\r |
| 234 | LOAD_MASK\r |
| 235 | endif\r |
| 236 | \r |
| 237 | align 16\r |
| 238 | nextBlock:\r |
| 239 | movdqa state0_save, state0\r |
| 240 | movdqa state1_save, state1\r |
| 241 | \r |
| 242 | ifndef x64\r |
| 243 | LOAD_MASK\r |
| 244 | endif\r |
| 245 | \r |
| 246 | LOAD_W 0\r |
| 247 | LOAD_W 1\r |
| 248 | LOAD_W 2\r |
| 249 | LOAD_W 3\r |
| 250 | \r |
| 251 | \r |
| 252 | k = 0\r |
| 253 | rept 16\r |
| 254 | RND4 k\r |
| 255 | k = k + 1\r |
| 256 | endm\r |
| 257 | \r |
| 258 | paddd state0, state0_save\r |
| 259 | paddd state1, state1_save\r |
| 260 | \r |
| 261 | add rData, 64\r |
| 262 | sub rNum, 1\r |
| 263 | jnz nextBlock\r |
| 264 | \r |
| 265 | REVERSE_STATE\r |
| 266 | \r |
| 267 | movdqu [rState], state0\r |
| 268 | movdqu [rState + 16], state1\r |
| 269 | \r |
| 270 | end_c:\r |
| 271 | MY_EPILOG\r |
| 272 | \r |
| 273 | ; _TEXT$SHA256OPT ENDS\r |
| 274 | \r |
| 275 | end\r |