1 ; LzFindOpt.asm -- ASM version of GetMatchesSpecN_2() function
\r
2 ; 2021-07-21: Igor Pavlov : Public domain
\r
7 ; .err <x64_IS_REQUIRED>
\r
14 _TEXT$LZFINDOPT SEGMENT ALIGN(64) 'CODE'
\r
16 MY_ALIGN macro num:req
\r
; Symbolic names for the registers / ABI parameter slots used by the match finder.
; NOTE(review): the REG_ABI_PARAM_* names and the *_x naming are defined outside this listing.
57 delta_x equ maxLen_x ; delta_x is a second name for whatever maxLen_x names
\r
66 ; r1 r2 r8 r9 : win32
\r
67 ; r7 r6 r2 r1 r8 r9 : linux
\r
78 lenLimit equ REG_ABI_PARAM_2 ; lenLimit lives in ABI parameter slot 2
\r
79 lenLimit_x equ REG_ABI_PARAM_2_x ; "_x" form of the same slot (stored into dword-sized destinations elsewhere in this file)
\r
80 pos equ REG_ABI_PARAM_1_x ; pos uses the "_x" form of ABI parameter slot 1
\r
81 cur equ REG_ABI_PARAM_0 ; cur lives in ABI parameter slot 0
\r
82 son equ REG_ABI_PARAM_3 ; son lives in ABI parameter slot 3
\r
; Byte offsets of the memory-passed parameters, expressed in units of REG_SIZE.
; NOTE(review): maxLen_OFFS is defined twice below (original lines 88 and 92); presumably the
; two definitions sit in different conditional-assembly branches that are not visible here - confirm.
88 maxLen_OFFS equ (REG_SIZE * (6 + 1)) ; = 7 * REG_SIZE
\r
90 cutValue_OFFS equ (REG_SIZE * (8 + 1 + 4)) ; = 13 * REG_SIZE
\r
91 d_OFFS equ (REG_SIZE + cutValue_OFFS) ; one REG_SIZE above cutValue
\r
92 maxLen_OFFS equ (REG_SIZE + d_OFFS) ; one REG_SIZE above d
\r
94 hash_OFFS equ (REG_SIZE + maxLen_OFFS) ; from here on each offset is the previous one plus REG_SIZE
\r
95 limit_OFFS equ (REG_SIZE + hash_OFFS)
\r
96 size_OFFS equ (REG_SIZE + limit_OFFS)
\r
97 cycPos_OFFS equ (REG_SIZE + size_OFFS)
\r
98 cycSize_OFFS equ (REG_SIZE + cycPos_OFFS)
\r
99 posRes_OFFS equ (REG_SIZE + cycSize_OFFS) ; highest offset in this chain
\r
; Memory operands for the incoming parameters: each is [r0 + <matching *_OFFS constant>].
; NOTE(review): this relies on r0 holding the base the offsets were computed against when
; these operands are used - presumably the stack pointer value at entry; confirm.
103 cutValue_PAR equ [r0 + cutValue_OFFS]
\r
104 d_PAR equ [r0 + d_OFFS]
\r
106 maxLen_PAR equ [r0 + maxLen_OFFS]
\r
107 hash_PAR equ [r0 + hash_OFFS]
\r
108 limit_PAR equ [r0 + limit_OFFS]
\r
109 size_PAR equ [r0 + size_OFFS]
\r
110 cycPos_PAR equ [r0 + cycPos_OFFS]
\r
111 cycSize_PAR equ [r0 + cycSize_OFFS]
\r
112 posRes_PAR equ [r0 + posRes_OFFS]
\r
; Local variable area addressed from r4: eight 8-byte slots (LOCAL_SIZE = 64 bytes).
; Slots 0 and 1 each hold two dword variables; slots 2..7 each hold one qword.
115 cutValue_VAR equ DWORD PTR [r4 + 8 * 0] ; slot 0, dword at +0
\r
116 cutValueCur_VAR equ DWORD PTR [r4 + 8 * 0 + 4] ; slot 0, dword at +4
\r
117 cycPos_VAR equ DWORD PTR [r4 + 8 * 1 + 0] ; slot 1, dword at +0
\r
118 cycSize_VAR equ DWORD PTR [r4 + 8 * 1 + 4] ; slot 1, dword at +4
\r
119 hash_VAR equ QWORD PTR [r4 + 8 * 2] ; slot 2
\r
120 limit_VAR equ QWORD PTR [r4 + 8 * 3] ; slot 3
\r
121 size_VAR equ QWORD PTR [r4 + 8 * 4] ; slot 4
\r
122 distances equ QWORD PTR [r4 + 8 * 5] ; slot 5
\r
123 maxLen_VAR equ QWORD PTR [r4 + 8 * 6] ; slot 6
\r
125 Old_RSP equ QWORD PTR [r4 + 8 * 7] ; slot 7 (last slot)
\r
126 LOCAL_SIZE equ 8 * 8 ; 64 bytes: exactly the eight slots above
\r
128 COPY_VAR_32 macro dest_var, src_var
\r
133 COPY_VAR_64 macro dest_var, src_var
\r
; GetMatchesSpecN_2: assembly version of the GetMatchesSpecN_2() function (per the file header).
; NOTE(review): this listing shows only part of the procedure. Several branch targets used below
; (directMode, fill_empty, fin_error, tree_loop, main_loop, cyc_mode_2, finish_tree,
; lenLimit_reach, fin) and the end-of-procedure marker are not visible here, and some adjacent
; lines look like alternatives from conditional-assembly branches whose directives are not
; visible. Comments below describe only what the visible instructions themselves do.
140 MY_PROC GetMatchesSpecN_2, 13 ; procedure entry (MY_PROC is defined elsewhere; 13 is presumably the parameter count - confirm)
\r
141 MY_PUSH_PRESERVED_ABI_REGS ; macro defined elsewhere; paired with MY_POP_PRESERVED_ABI_REGS at the end
\r
143 lea r3, [r0 - LOCAL_SIZE] ; r3 = r0 - 64, i.e. room for the eight local slots below r0
\r
149 mov d, REG_ABI_PARAM_5 ; r13 = r9
\r
150 mov cutValue_VAR, REG_ABI_PARAM_4_x ; = r8
\r
151 mov son, REG_ABI_PARAM_3 ; r9 = r1
\r
152 mov r8, REG_ABI_PARAM_2 ; r8 = r2
\r
153 mov pos, REG_ABI_PARAM_1_x ; r2 = x6
\r
154 mov r1, REG_ABI_PARAM_0 ; r1 = r7
\r
156 COPY_VAR_32 cutValue_VAR, cutValue_PAR ; macro body not visible; by its name and operands, copies the 32-bit parameter into the local
\r
160 COPY_VAR_64 limit_VAR, limit_PAR ; macro body not visible; by its name and operands, copies the 64-bit parameter into the local
\r
162 mov hash_lim, size_PAR ; load the size parameter ...
\r
163 mov size_VAR, hash_lim ; ... and keep a copy in the local slot (size_VAR = size_PAR)
\r
165 mov cp_x, cycPos_PAR ; cp_x = cycPos parameter
\r
168 mov cycSize, cycSize_PAR ; load the cycSize parameter ...
\r
169 mov cycSize_VAR, cycSize ; ... and keep a copy in the local slot
\r
171 ; we want cur in (rcx). So we change the cur and lenLimit variables
\r
176 mov t0_x, maxLen_PAR ; t0_x = maxLen parameter
\r
184 ; ptr0 = *ptr1 = kEmptyHashValue;
\r
185 mov QWORD PTR [ptr1], 0 ; one 8-byte store of zero starting at ptr1
\r
188 mov DWORD PTR [d - 4], 0 ; zero the dword immediately below d
\r
196 ; UInt32 delta = *hash++;
\r
197 mov diff_x, [hash] ; delta
\r
199 ; mov cycPos_VAR, cp_x
\r
204 sub m, diff_x; ; matchPos
\r
206 ; CLzRef *ptr1 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2;
\r
207 lea ptr1, [son + 8 * cp_r] ; ptr1 = son + 8 * cp_r (8 bytes per index step)
\r
208 ; mov cycSize, cycSize_VAR
\r
210 jb directMode ; if (pos < cycSize_VAR)
\r
214 cmp diff_x, cycSize
\r
215 jae fill_empty ; if (delta >= cycSize_VAR)
\r
218 mov cycPos_VAR, cp_x ; save cp_x to the local slot before it is adjusted below
\r
220 ; jae prepare_for_tree_loop
\r
221 ; add cp_x, cycSize
\r
222 cmovb t0_x, cycSize ; t0_x = cycSize only when the preceding flag-setting instruction (not visible) left CF set
\r
223 add cp_x, t0_x ; cp_x += (cycPos < delta ? cycSize : 0)
\r
224 jmp prepare_for_tree_loop
\r
229 je fill_empty ; if (delta == pos)
\r
230 jae fin_error ; if (delta >= pos)
\r
232 mov cycPos_VAR, cp_x ; save cp_x to the local slot
\r
235 prepare_for_tree_loop:
\r
238 ; CLzRef *ptr0 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2 + 1;
\r
239 lea ptr0, [ptr1 + 4] ; ptr0 is the dword right after the one at ptr1
\r
240 ; UInt32 *_distances = ++d;
\r
246 mov t0_x, cutValue_VAR ; reload the cut value ...
\r
247 mov maxLen, maxLen_VAR ; maxLen = value saved in the local slot
\r
248 mov cutValueCur_VAR, t0_x ; ... and copy it into the working counter cutValueCur_VAR
\r
255 cmovb len, len1 ; len = (len1 < len0 ? len1 : len0);
\r
258 mov t0_x, [son + cp_r * 8] ; prefetch
\r
259 movzx t0_x, BYTE PTR [diff + 1 * len] ; overwrites t0_x: t0_x = zero-extended byte at diff + len
\r
260 lea cp_r, [son + cp_r * 8] ; cp_r becomes an address: son + 8 * (old cp_r)
\r
261 cmp [cur + 1 * len], t0_L ; compare the byte at cur + len with t0_L
\r
268 lea ptr1, [cp_r + 4] ; ptr1 = dword at offset 4 from cp_r
\r
269 sub diff, cur ; FIX32
\r
277 sub diff, cur ; FIX32
\r
280 ; ------------ NEXT NODE ------------
\r
283 mov cycSize, cycSize_VAR ; reload cycSize from its local slot
\r
284 dec cutValueCur_VAR ; decrement the working cut counter in memory
\r
287 add diff_x, pos ; prev_match = pos + diff
\r
289 jae fin_error ; if (new_match >= prev_match)
\r
292 sub diff_x, m ; delta = pos - new_match
\r
294 jae cyc_mode_2 ; if (pos >= cycSize)
\r
298 jne tree_loop ; if (m != 0)
\r
301 ; ptr0 = *ptr1 = kEmptyHashValue;
\r
302 mov DWORD PTR [ptr0], 0 ; zero the dword at ptr0
\r
303 mov DWORD PTR [ptr1], 0 ; zero the dword at ptr1
\r
307 ; _distances[-1] = (UInt32)(d - _distances);
\r
315 jae fin ; if (d >= limit)
\r
317 mov cp_x, cycPos_VAR ; reload cp_x from its local slot
\r
319 mov hash_lim, size_VAR ; reload hash_lim from the saved size
\r
322 jne main_loop ; if (hash != size)
\r
328 cmp diff_x, cycSize
\r
329 jae finish_tree ; if (delta >= cycSize)
\r
331 mov cp_x, cycPos_VAR ; reload cp_x from its local slot
\r
333 sub cp_x, diff_x ; cp_x = cycPos - delta
\r
334 cmovb t0_x, cycSize ; taken when the subtraction above borrowed (cycPos < delta)
\r
335 add cp_x, t0_x ; cp_x += (cycPos < delta ? cycSize : 0)
\r
343 ; cmp len_x, lenLimit_x
\r
344 je short lenLimit_reach
\r
345 movzx t0_x, BYTE PTR [diff + 1 * len] ; t0_x = zero-extended byte at diff + len
\r
346 cmp [cur + 1 * len], t0_L ; compare the byte at cur + len with t0_L
\r
352 ; while (++len != lenLimit) (len[diff] != len[0]) ;
\r
355 ; cmp len_x, lenLimit_x
\r
356 je short lenLimit_reach
\r
357 movzx t0_x, BYTE PTR [diff + 1 * len] ; t0_x = zero-extended byte at diff + len
\r
358 cmp BYTE PTR [cur + 1 * len], t0_L ; same byte comparison as above, with an explicit BYTE PTR
\r
366 lea ptr1, [cp_r + 4] ; ptr1 = dword at offset 4 from cp_r
\r
379 sub diff, cur ; restore diff
\r
401 lea delta1_r, [delta_r - 1] ; delta1 = delta - 1
\r
405 mov t0_x, [cp_r + 4] ; load the dword at offset 4 from cp_r
\r
408 mov [d], lenLimit_x ; store lenLimit at d[0] ...
\r
409 mov [d + 4], delta1_x ; ... and delta - 1 at d[1]
\r
412 ; _distances[-1] = (UInt32)(d - _distances);
\r
420 mov hash_lim, size_VAR ; reload hash_lim from the saved size
\r
423 mov cp_x, cycPos_VAR ; reload cp_x from its local slot
\r
426 mov d_lim, limit_VAR ; reload d_lim from the saved limit
\r
427 mov cycSize, cycSize_VAR ; reload cycSize from its local slot
\r
428 ; if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit)
\r
434 cmp delta_x, [hash] ; compare delta with the dword at hash
\r
436 movzx t0_x, BYTE PTR [diff] ; t0_x = zero-extended byte at diff
\r
440 ; jmp main_loop ; bypass for debug
\r
442 mov cycPos_VAR, cp_x ; save cp_x to the local slot
\r
443 shl len, 3 ; cycSize * 8
\r
444 sub diff, cur ; restore diff
\r
446 cmp cp_x, delta_x ; cmp (cycPos_VAR, delta)
\r
447 lea cp_r, [son + 8 * cp_r] ; dest
\r
448 lea src, [cp_r + 8 * diff] ; src = dest + 8 * diff
\r
449 cmovb t0, len ; t0 = (cycPos_VAR < delta ? cycSize * 8 : 0)
\r
451 add len, son ; len = son + cycSize * 8
\r
458 ; *(UInt64 *)(void *)ptr = ((const UInt64 *)(const void *)ptr)[diff];
\r
465 cmove src, son ; if end of (son) buffer is reached, we wrap to begin
\r
467 mov DWORD PTR [d], 2 ; d[0] = 2
\r
468 mov [d + 4], lenLimit_x ; d[1] = lenLimit
\r
469 mov [d + 8], delta1_x ; d[2] = delta - 1
\r
476 cmp delta_x, [hash] ; compare delta with the dword at hash
\r
478 movzx t0_x, BYTE PTR [diff + 1 * cur] ; t0_x = zero-extended byte at diff + cur
\r
488 sub pos, cycPos_VAR ; pos -= saved cycPos
\r
489 mov cycSize, cycSize_VAR ; reload cycSize from its local slot
\r
504 mov t0, [r4 + posRes_OFFS] ; NOTE(review): addressed from r4, not via the r0-based posRes_PAR alias; presumably r4 is back at its entry value here - confirm
\r
508 MY_POP_PRESERVED_ABI_REGS ; macro defined elsewhere; counterpart of MY_PUSH_PRESERVED_ABI_REGS
\r
511 _TEXT$LZFINDOPT ENDS
\r