| 1 | ; LzFindOpt.asm -- ASM version of GetMatchesSpecN_2() function\r |
| 2 | ; 2021-07-21: Igor Pavlov : Public domain\r |
| 3 | ;\r |
| 4 | \r |
| 5 | ifndef x64\r |
| 6 | ; x64=1\r |
| 7 | ; .err <x64_IS_REQUIRED>\r |
| 8 | endif\r |
| 9 | \r |
| 10 | include 7zAsm.asm\r |
| 11 | \r |
| 12 | MY_ASM_START\r |
| 13 | \r |
| 14 | _TEXT$LZFINDOPT SEGMENT ALIGN(64) 'CODE'\r |
| 15 | \r |
| 16 | MY_ALIGN macro num:req\r |
| 17 | align num\r |
| 18 | endm\r |
| 19 | \r |
| 20 | MY_ALIGN_32 macro\r |
| 21 | MY_ALIGN 32\r |
| 22 | endm\r |
| 23 | \r |
| 24 | MY_ALIGN_64 macro\r |
| 25 | MY_ALIGN 64\r |
| 26 | endm\r |
| 27 | \r |
| 28 | \r |
| 29 | t0_L equ x0_L\r |
| 30 | t0_x equ x0\r |
| 31 | t0 equ r0\r |
| 32 | t1_x equ x3\r |
| 33 | t1 equ r3\r |
| 34 | \r |
| 35 | cp_x equ t1_x\r |
| 36 | cp_r equ t1\r |
| 37 | m equ x5\r |
| 38 | m_r equ r5\r |
| 39 | len_x equ x6\r |
| 40 | len equ r6\r |
| 41 | diff_x equ x7\r |
| 42 | diff equ r7\r |
| 43 | len0 equ r10\r |
| 44 | len1_x equ x11\r |
| 45 | len1 equ r11\r |
| 46 | maxLen_x equ x12\r |
| 47 | maxLen equ r12\r |
| 48 | d equ r13\r |
| 49 | ptr0 equ r14\r |
| 50 | ptr1 equ r15\r |
| 51 | \r |
| 52 | d_lim equ m_r\r |
| 53 | cycSize equ len_x\r |
| 54 | hash_lim equ len0\r |
| 55 | delta1_x equ len1_x\r |
| 56 | delta1_r equ len1\r |
| 57 | delta_x equ maxLen_x\r |
| 58 | delta_r equ maxLen\r |
| 59 | hash equ ptr0\r |
| 60 | src equ ptr1\r |
| 61 | \r |
| 62 | \r |
| 63 | \r |
| 64 | if (IS_LINUX gt 0)\r |
| 65 | \r |
| 66 | ; r1 r2 r8 r9 : win32\r |
| 67 | ; r7 r6 r2 r1 r8 r9 : linux\r |
| 68 | \r |
| 69 | lenLimit equ r8\r |
| 70 | lenLimit_x equ x8\r |
| 71 | ; pos_r equ r2\r |
| 72 | pos equ x2\r |
| 73 | cur equ r1\r |
| 74 | son equ r9\r |
| 75 | \r |
| 76 | else\r |
| 77 | \r |
| 78 | lenLimit equ REG_ABI_PARAM_2\r |
| 79 | lenLimit_x equ REG_ABI_PARAM_2_x\r |
| 80 | pos equ REG_ABI_PARAM_1_x\r |
| 81 | cur equ REG_ABI_PARAM_0\r |
| 82 | son equ REG_ABI_PARAM_3\r |
| 83 | \r |
| 84 | endif\r |
| 85 | \r |
| 86 | \r |
| 87 | if (IS_LINUX gt 0)\r |
| 88 | maxLen_OFFS equ (REG_SIZE * (6 + 1))\r |
| 89 | else\r |
| 90 | cutValue_OFFS equ (REG_SIZE * (8 + 1 + 4))\r |
| 91 | d_OFFS equ (REG_SIZE + cutValue_OFFS)\r |
| 92 | maxLen_OFFS equ (REG_SIZE + d_OFFS)\r |
| 93 | endif\r |
| 94 | hash_OFFS equ (REG_SIZE + maxLen_OFFS)\r |
| 95 | limit_OFFS equ (REG_SIZE + hash_OFFS)\r |
| 96 | size_OFFS equ (REG_SIZE + limit_OFFS)\r |
| 97 | cycPos_OFFS equ (REG_SIZE + size_OFFS)\r |
| 98 | cycSize_OFFS equ (REG_SIZE + cycPos_OFFS)\r |
| 99 | posRes_OFFS equ (REG_SIZE + cycSize_OFFS)\r |
| 100 | \r |
| 101 | if (IS_LINUX gt 0)\r |
| 102 | else\r |
| 103 | cutValue_PAR equ [r0 + cutValue_OFFS]\r |
| 104 | d_PAR equ [r0 + d_OFFS]\r |
| 105 | endif\r |
| 106 | maxLen_PAR equ [r0 + maxLen_OFFS]\r |
| 107 | hash_PAR equ [r0 + hash_OFFS]\r |
| 108 | limit_PAR equ [r0 + limit_OFFS]\r |
| 109 | size_PAR equ [r0 + size_OFFS]\r |
| 110 | cycPos_PAR equ [r0 + cycPos_OFFS]\r |
| 111 | cycSize_PAR equ [r0 + cycSize_OFFS]\r |
| 112 | posRes_PAR equ [r0 + posRes_OFFS]\r |
| 113 | \r |
| 114 | \r |
| 115 | cutValue_VAR equ DWORD PTR [r4 + 8 * 0]\r |
| 116 | cutValueCur_VAR equ DWORD PTR [r4 + 8 * 0 + 4]\r |
| 117 | cycPos_VAR equ DWORD PTR [r4 + 8 * 1 + 0]\r |
| 118 | cycSize_VAR equ DWORD PTR [r4 + 8 * 1 + 4]\r |
| 119 | hash_VAR equ QWORD PTR [r4 + 8 * 2]\r |
| 120 | limit_VAR equ QWORD PTR [r4 + 8 * 3]\r |
| 121 | size_VAR equ QWORD PTR [r4 + 8 * 4]\r |
| 122 | distances equ QWORD PTR [r4 + 8 * 5]\r |
| 123 | maxLen_VAR equ QWORD PTR [r4 + 8 * 6]\r |
| 124 | \r |
| 125 | Old_RSP equ QWORD PTR [r4 + 8 * 7]\r |
| 126 | LOCAL_SIZE equ 8 * 8\r |
| 127 | \r |
| 128 | COPY_VAR_32 macro dest_var, src_var\r |
| 129 | mov x3, src_var\r |
| 130 | mov dest_var, x3\r |
| 131 | endm\r |
| 132 | \r |
| 133 | COPY_VAR_64 macro dest_var, src_var\r |
| 134 | mov r3, src_var\r |
| 135 | mov dest_var, r3\r |
| 136 | endm\r |
| 137 | \r |
| 138 | \r |
| 139 | ; MY_ALIGN_64\r |
| 140 | MY_PROC GetMatchesSpecN_2, 13\r |
| 141 | MY_PUSH_PRESERVED_ABI_REGS\r |
| 142 | mov r0, RSP\r |
| 143 | lea r3, [r0 - LOCAL_SIZE]\r |
| 144 | and r3, -64\r |
| 145 | mov RSP, r3\r |
| 146 | mov Old_RSP, r0\r |
| 147 | \r |
| 148 | if (IS_LINUX gt 0)\r |
| 149 | mov d, REG_ABI_PARAM_5 ; r13 = r9\r |
| 150 | mov cutValue_VAR, REG_ABI_PARAM_4_x ; = r8\r |
| 151 | mov son, REG_ABI_PARAM_3 ; r9 = r1\r |
| 152 | mov r8, REG_ABI_PARAM_2 ; r8 = r2\r |
| 153 | mov pos, REG_ABI_PARAM_1_x ; r2 = x6\r |
| 154 | mov r1, REG_ABI_PARAM_0 ; r1 = r7\r |
| 155 | else\r |
| 156 | COPY_VAR_32 cutValue_VAR, cutValue_PAR\r |
| 157 | mov d, d_PAR\r |
| 158 | endif\r |
| 159 | \r |
| 160 | COPY_VAR_64 limit_VAR, limit_PAR\r |
| 161 | \r |
| 162 | mov hash_lim, size_PAR\r |
| 163 | mov size_VAR, hash_lim\r |
| 164 | \r |
| 165 | mov cp_x, cycPos_PAR\r |
| 166 | mov hash, hash_PAR\r |
| 167 | \r |
| 168 | mov cycSize, cycSize_PAR\r |
| 169 | mov cycSize_VAR, cycSize\r |
| 170 | \r |
| 171 | ; we want cur in (rcx). So we change the cur and lenLimit variables\r |
| 172 | sub lenLimit, cur\r |
| 173 | neg lenLimit_x\r |
| 174 | inc lenLimit_x\r |
| 175 | \r |
| 176 | mov t0_x, maxLen_PAR\r |
| 177 | sub t0, lenLimit\r |
| 178 | mov maxLen_VAR, t0\r |
| 179 | \r |
| 180 | jmp main_loop\r |
| 181 | \r |
| 182 | MY_ALIGN_64\r |
| 183 | fill_empty:\r |
| 184 | ; ptr0 = *ptr1 = kEmptyHashValue;\r |
| 185 | mov QWORD PTR [ptr1], 0\r |
| 186 | inc pos\r |
| 187 | inc cp_x\r |
| 188 | mov DWORD PTR [d - 4], 0\r |
| 189 | cmp d, limit_VAR\r |
| 190 | jae fin\r |
| 191 | cmp hash, hash_lim\r |
| 192 | je fin\r |
| 193 | \r |
| 194 | ; MY_ALIGN_64\r |
| 195 | main_loop:\r |
| 196 | ; UInt32 delta = *hash++;\r |
| 197 | mov diff_x, [hash] ; delta\r |
| 198 | add hash, 4\r |
| 199 | ; mov cycPos_VAR, cp_x\r |
| 200 | \r |
| 201 | inc cur\r |
| 202 | add d, 4\r |
| 203 | mov m, pos\r |
| 204 | sub m, diff_x; ; matchPos\r |
| 205 | \r |
| 206 | ; CLzRef *ptr1 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2;\r |
| 207 | lea ptr1, [son + 8 * cp_r]\r |
| 208 | ; mov cycSize, cycSize_VAR\r |
| 209 | cmp pos, cycSize\r |
| 210 | jb directMode ; if (pos < cycSize_VAR)\r |
| 211 | \r |
| 212 | ; CYC MODE\r |
| 213 | \r |
| 214 | cmp diff_x, cycSize\r |
| 215 | jae fill_empty ; if (delta >= cycSize_VAR)\r |
| 216 | \r |
| 217 | xor t0_x, t0_x\r |
| 218 | mov cycPos_VAR, cp_x\r |
| 219 | sub cp_x, diff_x\r |
| 220 | ; jae prepare_for_tree_loop\r |
| 221 | ; add cp_x, cycSize\r |
| 222 | cmovb t0_x, cycSize\r |
| 223 | add cp_x, t0_x ; cp_x += (cycPos < delta ? cycSize : 0)\r |
| 224 | jmp prepare_for_tree_loop\r |
| 225 | \r |
| 226 | \r |
| 227 | directMode:\r |
| 228 | cmp diff_x, pos\r |
| 229 | je fill_empty ; if (delta == pos)\r |
| 230 | jae fin_error ; if (delta >= pos)\r |
| 231 | \r |
| 232 | mov cycPos_VAR, cp_x\r |
| 233 | mov cp_x, m\r |
| 234 | \r |
| 235 | prepare_for_tree_loop:\r |
| 236 | mov len0, lenLimit\r |
| 237 | mov hash_VAR, hash\r |
| 238 | ; CLzRef *ptr0 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2 + 1;\r |
| 239 | lea ptr0, [ptr1 + 4]\r |
| 240 | ; UInt32 *_distances = ++d;\r |
| 241 | mov distances, d\r |
| 242 | \r |
| 243 | neg len0\r |
| 244 | mov len1, len0\r |
| 245 | \r |
| 246 | mov t0_x, cutValue_VAR\r |
| 247 | mov maxLen, maxLen_VAR\r |
| 248 | mov cutValueCur_VAR, t0_x\r |
| 249 | \r |
| 250 | MY_ALIGN_32\r |
| 251 | tree_loop:\r |
| 252 | neg diff\r |
| 253 | mov len, len0\r |
| 254 | cmp len1, len0\r |
| 255 | cmovb len, len1 ; len = (len1 < len0 ? len1 : len0);\r |
| 256 | add diff, cur\r |
| 257 | \r |
| 258 | mov t0_x, [son + cp_r * 8] ; prefetch\r |
| 259 | movzx t0_x, BYTE PTR [diff + 1 * len]\r |
| 260 | lea cp_r, [son + cp_r * 8]\r |
| 261 | cmp [cur + 1 * len], t0_L\r |
| 262 | je matched_1\r |
| 263 | \r |
| 264 | jb left_0\r |
| 265 | \r |
| 266 | mov [ptr1], m\r |
| 267 | mov m, [cp_r + 4]\r |
| 268 | lea ptr1, [cp_r + 4]\r |
| 269 | sub diff, cur ; FIX32\r |
| 270 | jmp next_node\r |
| 271 | \r |
| 272 | MY_ALIGN_32\r |
| 273 | left_0:\r |
| 274 | mov [ptr0], m\r |
| 275 | mov m, [cp_r]\r |
| 276 | mov ptr0, cp_r\r |
| 277 | sub diff, cur ; FIX32\r |
| 278 | ; jmp next_node\r |
| 279 | \r |
| 280 | ; ------------ NEXT NODE ------------\r |
| 281 | ; MY_ALIGN_32\r |
| 282 | next_node:\r |
| 283 | mov cycSize, cycSize_VAR\r |
| 284 | dec cutValueCur_VAR\r |
| 285 | je finish_tree\r |
| 286 | \r |
| 287 | add diff_x, pos ; prev_match = pos + diff\r |
| 288 | cmp m, diff_x\r |
| 289 | jae fin_error ; if (new_match >= prev_match)\r |
| 290 | \r |
| 291 | mov diff_x, pos\r |
| 292 | sub diff_x, m ; delta = pos - new_match\r |
| 293 | cmp pos, cycSize\r |
| 294 | jae cyc_mode_2 ; if (pos >= cycSize)\r |
| 295 | \r |
| 296 | mov cp_x, m\r |
| 297 | test m, m\r |
| 298 | jne tree_loop ; if (m != 0)\r |
| 299 | \r |
| 300 | finish_tree:\r |
| 301 | ; ptr0 = *ptr1 = kEmptyHashValue;\r |
| 302 | mov DWORD PTR [ptr0], 0\r |
| 303 | mov DWORD PTR [ptr1], 0\r |
| 304 | \r |
| 305 | inc pos\r |
| 306 | \r |
| 307 | ; _distances[-1] = (UInt32)(d - _distances);\r |
| 308 | mov t0, distances\r |
| 309 | mov t1, d\r |
| 310 | sub t1, t0\r |
| 311 | shr t1_x, 2\r |
| 312 | mov [t0 - 4], t1_x\r |
| 313 | \r |
| 314 | cmp d, limit_VAR\r |
| 315 | jae fin ; if (d >= limit)\r |
| 316 | \r |
| 317 | mov cp_x, cycPos_VAR\r |
| 318 | mov hash, hash_VAR\r |
| 319 | mov hash_lim, size_VAR\r |
| 320 | inc cp_x\r |
| 321 | cmp hash, hash_lim\r |
| 322 | jne main_loop ; if (hash != size)\r |
| 323 | jmp fin\r |
| 324 | \r |
| 325 | \r |
| 326 | MY_ALIGN_32\r |
| 327 | cyc_mode_2:\r |
| 328 | cmp diff_x, cycSize\r |
| 329 | jae finish_tree ; if (delta >= cycSize)\r |
| 330 | \r |
| 331 | mov cp_x, cycPos_VAR\r |
| 332 | xor t0_x, t0_x\r |
| 333 | sub cp_x, diff_x ; cp_x = cycPos - delta\r |
| 334 | cmovb t0_x, cycSize\r |
| 335 | add cp_x, t0_x ; cp_x += (cycPos < delta ? cycSize : 0)\r |
| 336 | jmp tree_loop\r |
| 337 | \r |
| 338 | \r |
| 339 | MY_ALIGN_32\r |
| 340 | matched_1:\r |
| 341 | \r |
| 342 | inc len\r |
| 343 | ; cmp len_x, lenLimit_x\r |
| 344 | je short lenLimit_reach\r |
| 345 | movzx t0_x, BYTE PTR [diff + 1 * len]\r |
| 346 | cmp [cur + 1 * len], t0_L\r |
| 347 | jne mismatch\r |
| 348 | \r |
| 349 | \r |
| 350 | MY_ALIGN_32\r |
| 351 | match_loop:\r |
| 352 | ; while (++len != lenLimit) (len[diff] != len[0]) ;\r |
| 353 | \r |
| 354 | inc len\r |
| 355 | ; cmp len_x, lenLimit_x\r |
| 356 | je short lenLimit_reach\r |
| 357 | movzx t0_x, BYTE PTR [diff + 1 * len]\r |
| 358 | cmp BYTE PTR [cur + 1 * len], t0_L\r |
| 359 | je match_loop\r |
| 360 | \r |
| 361 | mismatch:\r |
| 362 | jb left_2\r |
| 363 | \r |
| 364 | mov [ptr1], m\r |
| 365 | mov m, [cp_r + 4]\r |
| 366 | lea ptr1, [cp_r + 4]\r |
| 367 | mov len1, len\r |
| 368 | \r |
| 369 | jmp max_update\r |
| 370 | \r |
| 371 | MY_ALIGN_32\r |
| 372 | left_2:\r |
| 373 | mov [ptr0], m\r |
| 374 | mov m, [cp_r]\r |
| 375 | mov ptr0, cp_r\r |
| 376 | mov len0, len\r |
| 377 | \r |
| 378 | max_update:\r |
| 379 | sub diff, cur ; restore diff\r |
| 380 | \r |
| 381 | cmp maxLen, len\r |
| 382 | jae next_node\r |
| 383 | \r |
| 384 | mov maxLen, len\r |
| 385 | add len, lenLimit\r |
| 386 | mov [d], len_x\r |
| 387 | mov t0_x, diff_x\r |
| 388 | not t0_x\r |
| 389 | mov [d + 4], t0_x\r |
| 390 | add d, 8\r |
| 391 | \r |
| 392 | jmp next_node\r |
| 393 | \r |
| 394 | \r |
| 395 | \r |
| 396 | MY_ALIGN_32\r |
| 397 | lenLimit_reach:\r |
| 398 | \r |
| 399 | mov delta_r, cur\r |
| 400 | sub delta_r, diff\r |
| 401 | lea delta1_r, [delta_r - 1]\r |
| 402 | \r |
| 403 | mov t0_x, [cp_r]\r |
| 404 | mov [ptr1], t0_x\r |
| 405 | mov t0_x, [cp_r + 4]\r |
| 406 | mov [ptr0], t0_x\r |
| 407 | \r |
| 408 | mov [d], lenLimit_x\r |
| 409 | mov [d + 4], delta1_x\r |
| 410 | add d, 8\r |
| 411 | \r |
| 412 | ; _distances[-1] = (UInt32)(d - _distances);\r |
| 413 | mov t0, distances\r |
| 414 | mov t1, d\r |
| 415 | sub t1, t0\r |
| 416 | shr t1_x, 2\r |
| 417 | mov [t0 - 4], t1_x\r |
| 418 | \r |
| 419 | mov hash, hash_VAR\r |
| 420 | mov hash_lim, size_VAR\r |
| 421 | \r |
| 422 | inc pos\r |
| 423 | mov cp_x, cycPos_VAR\r |
| 424 | inc cp_x\r |
| 425 | \r |
| 426 | mov d_lim, limit_VAR\r |
| 427 | mov cycSize, cycSize_VAR\r |
| 428 | ; if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit)\r |
| 429 | ; break;\r |
| 430 | cmp hash, hash_lim\r |
| 431 | je fin\r |
| 432 | cmp d, d_lim\r |
| 433 | jae fin\r |
| 434 | cmp delta_x, [hash]\r |
| 435 | jne main_loop\r |
| 436 | movzx t0_x, BYTE PTR [diff]\r |
| 437 | cmp [cur], t0_L\r |
| 438 | jne main_loop\r |
| 439 | \r |
| 440 | ; jmp main_loop ; bypass for debug\r |
| 441 | \r |
| 442 | mov cycPos_VAR, cp_x\r |
| 443 | shl len, 3 ; cycSize * 8\r |
| 444 | sub diff, cur ; restore diff\r |
| 445 | xor t0_x, t0_x\r |
| 446 | cmp cp_x, delta_x ; cmp (cycPos_VAR, delta)\r |
| 447 | lea cp_r, [son + 8 * cp_r] ; dest\r |
| 448 | lea src, [cp_r + 8 * diff]\r |
| 449 | cmovb t0, len ; t0 = (cycPos_VAR < delta ? cycSize * 8 : 0)\r |
| 450 | add src, t0\r |
| 451 | add len, son ; len = son + cycSize * 8\r |
| 452 | \r |
| 453 | \r |
| 454 | MY_ALIGN_32\r |
| 455 | long_loop:\r |
| 456 | add hash, 4\r |
| 457 | \r |
| 458 | ; *(UInt64 *)(void *)ptr = ((const UInt64 *)(const void *)ptr)[diff];\r |
| 459 | \r |
| 460 | mov t0, [src]\r |
| 461 | add src, 8\r |
| 462 | mov [cp_r], t0\r |
| 463 | add cp_r, 8\r |
| 464 | cmp src, len\r |
| 465 | cmove src, son ; if end of (son) buffer is reached, we wrap to begin\r |
| 466 | \r |
| 467 | mov DWORD PTR [d], 2\r |
| 468 | mov [d + 4], lenLimit_x\r |
| 469 | mov [d + 8], delta1_x\r |
| 470 | add d, 12\r |
| 471 | \r |
| 472 | inc cur\r |
| 473 | \r |
| 474 | cmp hash, hash_lim\r |
| 475 | je long_footer\r |
| 476 | cmp delta_x, [hash]\r |
| 477 | jne long_footer\r |
| 478 | movzx t0_x, BYTE PTR [diff + 1 * cur]\r |
| 479 | cmp [cur], t0_L\r |
| 480 | jne long_footer\r |
| 481 | cmp d, d_lim\r |
| 482 | jb long_loop\r |
| 483 | \r |
| 484 | long_footer:\r |
| 485 | sub cp_r, son\r |
| 486 | shr cp_r, 3\r |
| 487 | add pos, cp_x\r |
| 488 | sub pos, cycPos_VAR\r |
| 489 | mov cycSize, cycSize_VAR\r |
| 490 | \r |
| 491 | cmp d, d_lim\r |
| 492 | jae fin\r |
| 493 | cmp hash, hash_lim\r |
| 494 | jne main_loop\r |
| 495 | jmp fin\r |
| 496 | \r |
| 497 | \r |
| 498 | \r |
| 499 | fin_error:\r |
| 500 | xor d, d\r |
| 501 | \r |
| 502 | fin:\r |
| 503 | mov RSP, Old_RSP\r |
| 504 | mov t0, [r4 + posRes_OFFS]\r |
| 505 | mov [t0], pos\r |
| 506 | mov r0, d\r |
| 507 | \r |
| 508 | MY_POP_PRESERVED_ABI_REGS\r |
| 509 | MY_ENDP\r |
| 510 | \r |
| 511 | _TEXT$LZFINDOPT ENDS\r |
| 512 | \r |
| 513 | end\r |