9e052883 |
1 | ; LzFindOpt.asm -- ASM version of GetMatchesSpecN_2() function\r |
2 | ; 2021-07-21: Igor Pavlov : Public domain\r |
3 | ;\r |
4 | \r |
5 | ifndef x64\r |
6 | ; x64=1\r |
7 | ; .err <x64_IS_REQUIRED>\r |
8 | endif\r |
9 | \r |
10 | include 7zAsm.asm\r |
11 | \r |
12 | MY_ASM_START\r |
13 | \r |
14 | _TEXT$LZFINDOPT SEGMENT ALIGN(64) 'CODE'\r |
15 | \r |
16 | MY_ALIGN macro num:req\r |
17 | align num\r |
18 | endm\r |
19 | \r |
20 | MY_ALIGN_32 macro\r |
21 | MY_ALIGN 32\r |
22 | endm\r |
23 | \r |
24 | MY_ALIGN_64 macro\r |
25 | MY_ALIGN 64\r |
26 | endm\r |
27 | \r |
28 | \r |
29 | t0_L equ x0_L\r |
30 | t0_x equ x0\r |
31 | t0 equ r0\r |
32 | t1_x equ x3\r |
33 | t1 equ r3\r |
34 | \r |
35 | cp_x equ t1_x\r |
36 | cp_r equ t1\r |
37 | m equ x5\r |
38 | m_r equ r5\r |
39 | len_x equ x6\r |
40 | len equ r6\r |
41 | diff_x equ x7\r |
42 | diff equ r7\r |
43 | len0 equ r10\r |
44 | len1_x equ x11\r |
45 | len1 equ r11\r |
46 | maxLen_x equ x12\r |
47 | maxLen equ r12\r |
48 | d equ r13\r |
49 | ptr0 equ r14\r |
50 | ptr1 equ r15\r |
51 | \r |
52 | d_lim equ m_r\r |
53 | cycSize equ len_x\r |
54 | hash_lim equ len0\r |
55 | delta1_x equ len1_x\r |
56 | delta1_r equ len1\r |
57 | delta_x equ maxLen_x\r |
58 | delta_r equ maxLen\r |
59 | hash equ ptr0\r |
60 | src equ ptr1\r |
61 | \r |
62 | \r |
63 | \r |
64 | if (IS_LINUX gt 0)\r |
65 | \r |
66 | ; r1 r2 r8 r9 : win32\r |
67 | ; r7 r6 r2 r1 r8 r9 : linux\r |
68 | \r |
69 | lenLimit equ r8\r |
70 | lenLimit_x equ x8\r |
71 | ; pos_r equ r2\r |
72 | pos equ x2\r |
73 | cur equ r1\r |
74 | son equ r9\r |
75 | \r |
76 | else\r |
77 | \r |
78 | lenLimit equ REG_ABI_PARAM_2\r |
79 | lenLimit_x equ REG_ABI_PARAM_2_x\r |
80 | pos equ REG_ABI_PARAM_1_x\r |
81 | cur equ REG_ABI_PARAM_0\r |
82 | son equ REG_ABI_PARAM_3\r |
83 | \r |
84 | endif\r |
85 | \r |
86 | \r |
87 | if (IS_LINUX gt 0)\r |
88 | maxLen_OFFS equ (REG_SIZE * (6 + 1))\r |
89 | else\r |
90 | cutValue_OFFS equ (REG_SIZE * (8 + 1 + 4))\r |
91 | d_OFFS equ (REG_SIZE + cutValue_OFFS)\r |
92 | maxLen_OFFS equ (REG_SIZE + d_OFFS)\r |
93 | endif\r |
94 | hash_OFFS equ (REG_SIZE + maxLen_OFFS)\r |
95 | limit_OFFS equ (REG_SIZE + hash_OFFS)\r |
96 | size_OFFS equ (REG_SIZE + limit_OFFS)\r |
97 | cycPos_OFFS equ (REG_SIZE + size_OFFS)\r |
98 | cycSize_OFFS equ (REG_SIZE + cycPos_OFFS)\r |
99 | posRes_OFFS equ (REG_SIZE + cycSize_OFFS)\r |
100 | \r |
101 | if (IS_LINUX gt 0)\r |
102 | else\r |
103 | cutValue_PAR equ [r0 + cutValue_OFFS]\r |
104 | d_PAR equ [r0 + d_OFFS]\r |
105 | endif\r |
106 | maxLen_PAR equ [r0 + maxLen_OFFS]\r |
107 | hash_PAR equ [r0 + hash_OFFS]\r |
108 | limit_PAR equ [r0 + limit_OFFS]\r |
109 | size_PAR equ [r0 + size_OFFS]\r |
110 | cycPos_PAR equ [r0 + cycPos_OFFS]\r |
111 | cycSize_PAR equ [r0 + cycSize_OFFS]\r |
112 | posRes_PAR equ [r0 + posRes_OFFS]\r |
113 | \r |
114 | \r |
115 | cutValue_VAR equ DWORD PTR [r4 + 8 * 0]\r |
116 | cutValueCur_VAR equ DWORD PTR [r4 + 8 * 0 + 4]\r |
117 | cycPos_VAR equ DWORD PTR [r4 + 8 * 1 + 0]\r |
118 | cycSize_VAR equ DWORD PTR [r4 + 8 * 1 + 4]\r |
119 | hash_VAR equ QWORD PTR [r4 + 8 * 2]\r |
120 | limit_VAR equ QWORD PTR [r4 + 8 * 3]\r |
121 | size_VAR equ QWORD PTR [r4 + 8 * 4]\r |
122 | distances equ QWORD PTR [r4 + 8 * 5]\r |
123 | maxLen_VAR equ QWORD PTR [r4 + 8 * 6]\r |
124 | \r |
125 | Old_RSP equ QWORD PTR [r4 + 8 * 7]\r |
126 | LOCAL_SIZE equ 8 * 8\r |
127 | \r |
128 | COPY_VAR_32 macro dest_var, src_var\r |
129 | mov x3, src_var\r |
130 | mov dest_var, x3\r |
131 | endm\r |
132 | \r |
133 | COPY_VAR_64 macro dest_var, src_var\r |
134 | mov r3, src_var\r |
135 | mov dest_var, r3\r |
136 | endm\r |
137 | \r |
138 | \r |
139 | ; MY_ALIGN_64\r |
140 | MY_PROC GetMatchesSpecN_2, 13\r |
141 | MY_PUSH_PRESERVED_ABI_REGS\r |
142 | mov r0, RSP\r |
143 | lea r3, [r0 - LOCAL_SIZE]\r |
144 | and r3, -64\r |
145 | mov RSP, r3\r |
146 | mov Old_RSP, r0\r |
147 | \r |
148 | if (IS_LINUX gt 0)\r |
149 | mov d, REG_ABI_PARAM_5 ; r13 = r9\r |
150 | mov cutValue_VAR, REG_ABI_PARAM_4_x ; = r8\r |
151 | mov son, REG_ABI_PARAM_3 ; r9 = r1\r |
152 | mov r8, REG_ABI_PARAM_2 ; r8 = r2\r |
153 | mov pos, REG_ABI_PARAM_1_x ; r2 = x6\r |
154 | mov r1, REG_ABI_PARAM_0 ; r1 = r7\r |
155 | else\r |
156 | COPY_VAR_32 cutValue_VAR, cutValue_PAR\r |
157 | mov d, d_PAR\r |
158 | endif\r |
159 | \r |
160 | COPY_VAR_64 limit_VAR, limit_PAR\r |
161 | \r |
162 | mov hash_lim, size_PAR\r |
163 | mov size_VAR, hash_lim\r |
164 | \r |
165 | mov cp_x, cycPos_PAR\r |
166 | mov hash, hash_PAR\r |
167 | \r |
168 | mov cycSize, cycSize_PAR\r |
169 | mov cycSize_VAR, cycSize\r |
170 | \r |
171 | ; we want cur in (rcx). So we change the cur and lenLimit variables\r |
172 | sub lenLimit, cur\r |
173 | neg lenLimit_x\r |
174 | inc lenLimit_x\r |
175 | \r |
176 | mov t0_x, maxLen_PAR\r |
177 | sub t0, lenLimit\r |
178 | mov maxLen_VAR, t0\r |
179 | \r |
180 | jmp main_loop\r |
181 | \r |
182 | MY_ALIGN_64\r |
183 | fill_empty:\r |
184 | ; ptr0 = *ptr1 = kEmptyHashValue;\r |
185 | mov QWORD PTR [ptr1], 0\r |
186 | inc pos\r |
187 | inc cp_x\r |
188 | mov DWORD PTR [d - 4], 0\r |
189 | cmp d, limit_VAR\r |
190 | jae fin\r |
191 | cmp hash, hash_lim\r |
192 | je fin\r |
193 | \r |
194 | ; MY_ALIGN_64\r |
195 | main_loop:\r |
196 | ; UInt32 delta = *hash++;\r |
197 | mov diff_x, [hash] ; delta\r |
198 | add hash, 4\r |
199 | ; mov cycPos_VAR, cp_x\r |
200 | \r |
201 | inc cur\r |
202 | add d, 4\r |
203 | mov m, pos\r |
204 | sub m, diff_x; ; matchPos\r |
205 | \r |
206 | ; CLzRef *ptr1 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2;\r |
207 | lea ptr1, [son + 8 * cp_r]\r |
208 | ; mov cycSize, cycSize_VAR\r |
209 | cmp pos, cycSize\r |
210 | jb directMode ; if (pos < cycSize_VAR)\r |
211 | \r |
212 | ; CYC MODE\r |
213 | \r |
214 | cmp diff_x, cycSize\r |
215 | jae fill_empty ; if (delta >= cycSize_VAR)\r |
216 | \r |
217 | xor t0_x, t0_x\r |
218 | mov cycPos_VAR, cp_x\r |
219 | sub cp_x, diff_x\r |
220 | ; jae prepare_for_tree_loop\r |
221 | ; add cp_x, cycSize\r |
222 | cmovb t0_x, cycSize\r |
223 | add cp_x, t0_x ; cp_x += (cycPos < delta ? cycSize : 0)\r |
224 | jmp prepare_for_tree_loop\r |
225 | \r |
226 | \r |
227 | directMode:\r |
228 | cmp diff_x, pos\r |
229 | je fill_empty ; if (delta == pos)\r |
230 | jae fin_error ; if (delta >= pos)\r |
231 | \r |
232 | mov cycPos_VAR, cp_x\r |
233 | mov cp_x, m\r |
234 | \r |
235 | prepare_for_tree_loop:\r |
236 | mov len0, lenLimit\r |
237 | mov hash_VAR, hash\r |
238 | ; CLzRef *ptr0 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2 + 1;\r |
239 | lea ptr0, [ptr1 + 4]\r |
240 | ; UInt32 *_distances = ++d;\r |
241 | mov distances, d\r |
242 | \r |
243 | neg len0\r |
244 | mov len1, len0\r |
245 | \r |
246 | mov t0_x, cutValue_VAR\r |
247 | mov maxLen, maxLen_VAR\r |
248 | mov cutValueCur_VAR, t0_x\r |
249 | \r |
250 | MY_ALIGN_32\r |
251 | tree_loop:\r |
252 | neg diff\r |
253 | mov len, len0\r |
254 | cmp len1, len0\r |
255 | cmovb len, len1 ; len = (len1 < len0 ? len1 : len0);\r |
256 | add diff, cur\r |
257 | \r |
258 | mov t0_x, [son + cp_r * 8] ; prefetch\r |
259 | movzx t0_x, BYTE PTR [diff + 1 * len]\r |
260 | lea cp_r, [son + cp_r * 8]\r |
261 | cmp [cur + 1 * len], t0_L\r |
262 | je matched_1\r |
263 | \r |
264 | jb left_0\r |
265 | \r |
266 | mov [ptr1], m\r |
267 | mov m, [cp_r + 4]\r |
268 | lea ptr1, [cp_r + 4]\r |
269 | sub diff, cur ; FIX32\r |
270 | jmp next_node\r |
271 | \r |
272 | MY_ALIGN_32\r |
273 | left_0:\r |
274 | mov [ptr0], m\r |
275 | mov m, [cp_r]\r |
276 | mov ptr0, cp_r\r |
277 | sub diff, cur ; FIX32\r |
278 | ; jmp next_node\r |
279 | \r |
280 | ; ------------ NEXT NODE ------------\r |
281 | ; MY_ALIGN_32\r |
282 | next_node:\r |
283 | mov cycSize, cycSize_VAR\r |
284 | dec cutValueCur_VAR\r |
285 | je finish_tree\r |
286 | \r |
287 | add diff_x, pos ; prev_match = pos + diff\r |
288 | cmp m, diff_x\r |
289 | jae fin_error ; if (new_match >= prev_match)\r |
290 | \r |
291 | mov diff_x, pos\r |
292 | sub diff_x, m ; delta = pos - new_match\r |
293 | cmp pos, cycSize\r |
294 | jae cyc_mode_2 ; if (pos >= cycSize)\r |
295 | \r |
296 | mov cp_x, m\r |
297 | test m, m\r |
298 | jne tree_loop ; if (m != 0)\r |
299 | \r |
300 | finish_tree:\r |
301 | ; ptr0 = *ptr1 = kEmptyHashValue;\r |
302 | mov DWORD PTR [ptr0], 0\r |
303 | mov DWORD PTR [ptr1], 0\r |
304 | \r |
305 | inc pos\r |
306 | \r |
307 | ; _distances[-1] = (UInt32)(d - _distances);\r |
308 | mov t0, distances\r |
309 | mov t1, d\r |
310 | sub t1, t0\r |
311 | shr t1_x, 2\r |
312 | mov [t0 - 4], t1_x\r |
313 | \r |
314 | cmp d, limit_VAR\r |
315 | jae fin ; if (d >= limit)\r |
316 | \r |
317 | mov cp_x, cycPos_VAR\r |
318 | mov hash, hash_VAR\r |
319 | mov hash_lim, size_VAR\r |
320 | inc cp_x\r |
321 | cmp hash, hash_lim\r |
322 | jne main_loop ; if (hash != size)\r |
323 | jmp fin\r |
324 | \r |
325 | \r |
326 | MY_ALIGN_32\r |
327 | cyc_mode_2:\r |
328 | cmp diff_x, cycSize\r |
329 | jae finish_tree ; if (delta >= cycSize)\r |
330 | \r |
331 | mov cp_x, cycPos_VAR\r |
332 | xor t0_x, t0_x\r |
333 | sub cp_x, diff_x ; cp_x = cycPos - delta\r |
334 | cmovb t0_x, cycSize\r |
335 | add cp_x, t0_x ; cp_x += (cycPos < delta ? cycSize : 0)\r |
336 | jmp tree_loop\r |
337 | \r |
338 | \r |
339 | MY_ALIGN_32\r |
340 | matched_1:\r |
341 | \r |
342 | inc len\r |
343 | ; cmp len_x, lenLimit_x\r |
344 | je short lenLimit_reach\r |
345 | movzx t0_x, BYTE PTR [diff + 1 * len]\r |
346 | cmp [cur + 1 * len], t0_L\r |
347 | jne mismatch\r |
348 | \r |
349 | \r |
350 | MY_ALIGN_32\r |
351 | match_loop:\r |
352 | ; while (++len != lenLimit) (len[diff] != len[0]) ;\r |
353 | \r |
354 | inc len\r |
355 | ; cmp len_x, lenLimit_x\r |
356 | je short lenLimit_reach\r |
357 | movzx t0_x, BYTE PTR [diff + 1 * len]\r |
358 | cmp BYTE PTR [cur + 1 * len], t0_L\r |
359 | je match_loop\r |
360 | \r |
361 | mismatch:\r |
362 | jb left_2\r |
363 | \r |
364 | mov [ptr1], m\r |
365 | mov m, [cp_r + 4]\r |
366 | lea ptr1, [cp_r + 4]\r |
367 | mov len1, len\r |
368 | \r |
369 | jmp max_update\r |
370 | \r |
371 | MY_ALIGN_32\r |
372 | left_2:\r |
373 | mov [ptr0], m\r |
374 | mov m, [cp_r]\r |
375 | mov ptr0, cp_r\r |
376 | mov len0, len\r |
377 | \r |
378 | max_update:\r |
379 | sub diff, cur ; restore diff\r |
380 | \r |
381 | cmp maxLen, len\r |
382 | jae next_node\r |
383 | \r |
384 | mov maxLen, len\r |
385 | add len, lenLimit\r |
386 | mov [d], len_x\r |
387 | mov t0_x, diff_x\r |
388 | not t0_x\r |
389 | mov [d + 4], t0_x\r |
390 | add d, 8\r |
391 | \r |
392 | jmp next_node\r |
393 | \r |
394 | \r |
395 | \r |
396 | MY_ALIGN_32\r |
397 | lenLimit_reach:\r |
398 | \r |
399 | mov delta_r, cur\r |
400 | sub delta_r, diff\r |
401 | lea delta1_r, [delta_r - 1]\r |
402 | \r |
403 | mov t0_x, [cp_r]\r |
404 | mov [ptr1], t0_x\r |
405 | mov t0_x, [cp_r + 4]\r |
406 | mov [ptr0], t0_x\r |
407 | \r |
408 | mov [d], lenLimit_x\r |
409 | mov [d + 4], delta1_x\r |
410 | add d, 8\r |
411 | \r |
412 | ; _distances[-1] = (UInt32)(d - _distances);\r |
413 | mov t0, distances\r |
414 | mov t1, d\r |
415 | sub t1, t0\r |
416 | shr t1_x, 2\r |
417 | mov [t0 - 4], t1_x\r |
418 | \r |
419 | mov hash, hash_VAR\r |
420 | mov hash_lim, size_VAR\r |
421 | \r |
422 | inc pos\r |
423 | mov cp_x, cycPos_VAR\r |
424 | inc cp_x\r |
425 | \r |
426 | mov d_lim, limit_VAR\r |
427 | mov cycSize, cycSize_VAR\r |
428 | ; if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit)\r |
429 | ; break;\r |
430 | cmp hash, hash_lim\r |
431 | je fin\r |
432 | cmp d, d_lim\r |
433 | jae fin\r |
434 | cmp delta_x, [hash]\r |
435 | jne main_loop\r |
436 | movzx t0_x, BYTE PTR [diff]\r |
437 | cmp [cur], t0_L\r |
438 | jne main_loop\r |
439 | \r |
440 | ; jmp main_loop ; bypass for debug\r |
441 | \r |
442 | mov cycPos_VAR, cp_x\r |
443 | shl len, 3 ; cycSize * 8\r |
444 | sub diff, cur ; restore diff\r |
445 | xor t0_x, t0_x\r |
446 | cmp cp_x, delta_x ; cmp (cycPos_VAR, delta)\r |
447 | lea cp_r, [son + 8 * cp_r] ; dest\r |
448 | lea src, [cp_r + 8 * diff]\r |
449 | cmovb t0, len ; t0 = (cycPos_VAR < delta ? cycSize * 8 : 0)\r |
450 | add src, t0\r |
451 | add len, son ; len = son + cycSize * 8\r |
452 | \r |
453 | \r |
454 | MY_ALIGN_32\r |
455 | long_loop:\r |
456 | add hash, 4\r |
457 | \r |
458 | ; *(UInt64 *)(void *)ptr = ((const UInt64 *)(const void *)ptr)[diff];\r |
459 | \r |
460 | mov t0, [src]\r |
461 | add src, 8\r |
462 | mov [cp_r], t0\r |
463 | add cp_r, 8\r |
464 | cmp src, len\r |
465 | cmove src, son ; if end of (son) buffer is reached, we wrap to begin\r |
466 | \r |
467 | mov DWORD PTR [d], 2\r |
468 | mov [d + 4], lenLimit_x\r |
469 | mov [d + 8], delta1_x\r |
470 | add d, 12\r |
471 | \r |
472 | inc cur\r |
473 | \r |
474 | cmp hash, hash_lim\r |
475 | je long_footer\r |
476 | cmp delta_x, [hash]\r |
477 | jne long_footer\r |
478 | movzx t0_x, BYTE PTR [diff + 1 * cur]\r |
479 | cmp [cur], t0_L\r |
480 | jne long_footer\r |
481 | cmp d, d_lim\r |
482 | jb long_loop\r |
483 | \r |
484 | long_footer:\r |
485 | sub cp_r, son\r |
486 | shr cp_r, 3\r |
487 | add pos, cp_x\r |
488 | sub pos, cycPos_VAR\r |
489 | mov cycSize, cycSize_VAR\r |
490 | \r |
491 | cmp d, d_lim\r |
492 | jae fin\r |
493 | cmp hash, hash_lim\r |
494 | jne main_loop\r |
495 | jmp fin\r |
496 | \r |
497 | \r |
498 | \r |
499 | fin_error:\r |
500 | xor d, d\r |
501 | \r |
502 | fin:\r |
503 | mov RSP, Old_RSP\r |
504 | mov t0, [r4 + posRes_OFFS]\r |
505 | mov [t0], pos\r |
506 | mov r0, d\r |
507 | \r |
508 | MY_POP_PRESERVED_ABI_REGS\r |
509 | MY_ENDP\r |
510 | \r |
511 | _TEXT$LZFINDOPT ENDS\r |
512 | \r |
513 | end\r |