obligatory forgotten android fixup
[pcsx_rearmed.git] / deps / libchdr / deps / lzma-24.05 / src / LzFind.c
... / ...
CommitLineData
1/* LzFind.c -- Match finder for LZ algorithms
22024-03-01 : Igor Pavlov : Public domain */
3
4#include "Precomp.h"
5
6#include <string.h>
7// #include <stdio.h>
8
9#include "CpuArch.h"
10#include "LzFind.h"
11#include "LzHash.h"
12
#define kBlockMoveAlign (1 << 7) // alignment for memmove()
#define kBlockSizeAlign (1 << 16) // alignment for block allocation
#define kBlockSizeReserveMin (1 << 24) // it's 1/256 from 4 GB dictionary

// hash/son tables store 0 as the "no position" marker; real positions start at 1
#define kEmptyHashValue 0

// (pos) value at which stored positions are normalized (reduced);
// 0 here means "at the wrap of UInt32"
#define kMaxValForNormalize ((UInt32)0)
// #define kMaxValForNormalize ((UInt32)(1 << 20) + 0xfff) // for debug

// #define kNormalizeAlign (1 << 7) // alignment for speculated accesses

// number of input bytes available after the current position: (streamPos - pos)
#define GET_AVAIL_BYTES(p) \
    Inline_MatchFinder_GetNumAvailableBytes(p)


// #define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size)
#define kFix5HashSize kFix4HashSize
30
/*
  HASH2_CALC:
  if (hv) match, then cur[0] and cur[1] also match
*/
#define HASH2_CALC hv = GetUi16(cur);

// (crc[0 ... 255] & 0xFF) provides one-to-one correspondence to [0 ... 255]

/*
  HASH3_CALC:
  if (cur[0]) and (h2) match, then cur[1] also match
  if (cur[0]) and (hv) match, then cur[1] and cur[2] also match
*/
#define HASH3_CALC { \
  UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
  h2 = temp & (kHash2Size - 1); \
  hv = (temp ^ ((UInt32)cur[2] << 8)) & p->hashMask; }

// 4-byte hash (hv); also yields the 2-byte (h2) and 3-byte (h3) sub-hashes
#define HASH4_CALC { \
  UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
  h2 = temp & (kHash2Size - 1); \
  temp ^= ((UInt32)cur[2] << 8); \
  h3 = temp & (kHash3Size - 1); \
  hv = (temp ^ (p->crc[cur[3]] << kLzHash_CrcShift_1)) & p->hashMask; }

// 5-byte hash (hv) with 2-byte (h2) and 3-byte (h3) sub-hashes
#define HASH5_CALC { \
  UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
  h2 = temp & (kHash2Size - 1); \
  temp ^= ((UInt32)cur[2] << 8); \
  h3 = temp & (kHash3Size - 1); \
  temp ^= (p->crc[cur[3]] << kLzHash_CrcShift_1); \
  /* h4 = temp & p->hash4Mask; */ /* (kHash4Size - 1); */ \
  hv = (temp ^ (p->crc[cur[4]] << kLzHash_CrcShift_2)) & p->hashMask; }

// 16-bit, 3-byte hash used by the zip-compatible match finder (Bt3Zip/Hc3Zip)
#define HASH_ZIP_CALC hv = ((cur[2] | ((UInt32)cur[0] << 8)) ^ p->crc[cur[1]]) & 0xFFFF;
66
67
68static void LzInWindow_Free(CMatchFinder *p, ISzAllocPtr alloc)
69{
70 // if (!p->directInput)
71 {
72 ISzAlloc_Free(alloc, p->bufBase);
73 p->bufBase = NULL;
74 }
75}
76
77
78static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr alloc)
79{
80 if (blockSize == 0)
81 return 0;
82 if (!p->bufBase || p->blockSize != blockSize)
83 {
84 // size_t blockSizeT;
85 LzInWindow_Free(p, alloc);
86 p->blockSize = blockSize;
87 // blockSizeT = blockSize;
88
89 // printf("\nblockSize = 0x%x\n", blockSize);
90 /*
91 #if defined _WIN64
92 // we can allocate 4GiB, but still use UInt32 for (p->blockSize)
93 // we use UInt32 type for (p->blockSize), because
94 // we don't want to wrap over 4 GiB,
95 // when we use (p->streamPos - p->pos) that is UInt32.
96 if (blockSize >= (UInt32)0 - (UInt32)kBlockSizeAlign)
97 {
98 blockSizeT = ((size_t)1 << 32);
99 printf("\nchanged to blockSizeT = 4GiB\n");
100 }
101 #endif
102 */
103
104 p->bufBase = (Byte *)ISzAlloc_Alloc(alloc, blockSize);
105 // printf("\nbufferBase = %p\n", p->bufBase);
106 // return 0; // for debug
107 }
108 return (p->bufBase != NULL);
109}
110
111static const Byte *MatchFinder_GetPointerToCurrentPos(void *p)
112{
113 return ((CMatchFinder *)p)->buffer;
114}
115
116static UInt32 MatchFinder_GetNumAvailableBytes(void *p)
117{
118 return GET_AVAIL_BYTES((CMatchFinder *)p);
119}
120
121
/* Fills the tail of the window buffer from the input stream (or accounts for
   directInput data without copying) until more than (keepSizeAfter) bytes are
   available, the stream ends, or a read error is stored in (p->result). */
Z7_NO_INLINE
static void MatchFinder_ReadBlock(CMatchFinder *p)
{
  if (p->streamEndWasReached || p->result != SZ_OK)
    return;

  /* We use (p->streamPos - p->pos) value.
     (p->streamPos < p->pos) is allowed. */

  if (p->directInput)
  {
    // no copying in direct-input mode: just advance streamPos, capped so the
    // UInt32 distance (streamPos - pos) cannot overflow
    UInt32 curSize = 0xFFFFFFFF - GET_AVAIL_BYTES(p);
    if (curSize > p->directInputRem)
      curSize = (UInt32)p->directInputRem;
    p->streamPos += curSize;
    p->directInputRem -= curSize;
    if (p->directInputRem == 0)
      p->streamEndWasReached = 1;
    return;
  }

  for (;;)
  {
    const Byte *dest = p->buffer + GET_AVAIL_BYTES(p);
    size_t size = (size_t)(p->bufBase + p->blockSize - dest);
    if (size == 0)
    {
      /* we call ReadBlock() after NeedMove() and MoveBlock().
         NeedMove() and MoveBlock() provide more than (keepSizeAfter)
         to the end of (blockSize).
         So we don't execute this branch in normal code flow.
         We can go here, if we will call ReadBlock() before NeedMove(), MoveBlock().
      */
      // p->result = SZ_ERROR_FAIL; // we can show error here
      return;
    }

    // #define kRead 3
    // if (size > kRead) size = kRead; // for debug

    /*
    // we need cast (Byte *)dest.
    #ifdef __clang__
    #pragma GCC diagnostic ignored "-Wcast-qual"
    #endif
    */
    // (dest - bufBase) round-trip avoids a (Byte *) cast that drops const
    p->result = ISeqInStream_Read(p->stream,
        p->bufBase + (dest - p->bufBase), &size);
    if (p->result != SZ_OK)
      return;
    if (size == 0)
    {
      p->streamEndWasReached = 1;
      return;
    }
    p->streamPos += (UInt32)size;
    if (GET_AVAIL_BYTES(p) > p->keepSizeAfter)
      return;
    /* here and in another (p->keepSizeAfter) checks we keep on 1 byte more than was requested by Create() function
         (GET_AVAIL_BYTES(p) >= p->keepSizeAfter) - minimal required size */
  }

  // on exit: (p->result != SZ_OK || p->streamEndWasReached || GET_AVAIL_BYTES(p) > p->keepSizeAfter)
}
186
187
188
/* Slides the window contents toward the start of the buffer, keeping
   (keepSizeBefore) history bytes before (p->buffer). The memmove source is
   rounded down to a kBlockMoveAlign boundary, so up to (kBlockMoveAlign - 1)
   extra bytes are kept to preserve the source alignment. */
Z7_NO_INLINE
void MatchFinder_MoveBlock(CMatchFinder *p)
{
  const size_t offset = (size_t)(p->buffer - p->bufBase) - p->keepSizeBefore;
  const size_t keepBefore = (offset & (kBlockMoveAlign - 1)) + p->keepSizeBefore;
  p->buffer = p->bufBase + keepBefore;
  memmove(p->bufBase,
      p->bufBase + (offset & ~((size_t)kBlockMoveAlign - 1)),
      keepBefore + (size_t)GET_AVAIL_BYTES(p));
}
199
/* We call MoveBlock() before ReadBlock().
   So MoveBlock() can be a wasteful operation, if the whole input data
   could fit in the current block even without calling MoveBlock().
   In the important case where (dataSize <= historySize),
   the condition (p->blockSize > dataSize + p->keepSizeAfter) is met.
   So there is no MoveBlock() in that case.
*/
207
208int MatchFinder_NeedMove(CMatchFinder *p)
209{
210 if (p->directInput)
211 return 0;
212 if (p->streamEndWasReached || p->result != SZ_OK)
213 return 0;
214 return ((size_t)(p->bufBase + p->blockSize - p->buffer) <= p->keepSizeAfter);
215}
216
217void MatchFinder_ReadIfRequired(CMatchFinder *p)
218{
219 if (p->keepSizeAfter >= GET_AVAIL_BYTES(p))
220 MatchFinder_ReadBlock(p);
221}
222
223
224
225static void MatchFinder_SetDefaultSettings(CMatchFinder *p)
226{
227 p->cutValue = 32;
228 p->btMode = 1;
229 p->numHashBytes = 4;
230 p->numHashBytes_Min = 2;
231 p->numHashOutBits = 0;
232 p->bigHash = 0;
233}
234
235#define kCrcPoly 0xEDB88320
236
237void MatchFinder_Construct(CMatchFinder *p)
238{
239 unsigned i;
240 p->buffer = NULL;
241 p->bufBase = NULL;
242 p->directInput = 0;
243 p->stream = NULL;
244 p->hash = NULL;
245 p->expectedDataSize = (UInt64)(Int64)-1;
246 MatchFinder_SetDefaultSettings(p);
247
248 for (i = 0; i < 256; i++)
249 {
250 UInt32 r = (UInt32)i;
251 unsigned j;
252 for (j = 0; j < 8; j++)
253 r = (r >> 1) ^ (kCrcPoly & ((UInt32)0 - (r & 1)));
254 p->crc[i] = r;
255 }
256}
257
258#undef kCrcPoly
259
260static void MatchFinder_FreeThisClassMemory(CMatchFinder *p, ISzAllocPtr alloc)
261{
262 ISzAlloc_Free(alloc, p->hash);
263 p->hash = NULL;
264}
265
/* Releases all heap memory owned by the match finder: the hash/son tables
   and the input window buffer. Safe to call repeatedly. */
void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc)
{
  MatchFinder_FreeThisClassMemory(p, alloc);
  LzInWindow_Free(p, alloc);
}
271
272static CLzRef* AllocRefs(size_t num, ISzAllocPtr alloc)
273{
274 const size_t sizeInBytes = (size_t)num * sizeof(CLzRef);
275 if (sizeInBytes / sizeof(CLzRef) != num)
276 return NULL;
277 return (CLzRef *)ISzAlloc_Alloc(alloc, sizeInBytes);
278}
279
// compile-time sanity check relating the reserve constants used by GetBlockSize()
#if (kBlockSizeReserveMin < kBlockSizeAlign * 2)
  #error Stop_Compiling_Bad_Reserve
#endif
283
284
285
/* Computes the allocation size for the input window:
   (keepSizeBefore + keepSizeAfter) plus a reserve, rounded up to
   kBlockSizeAlign. Returns 0 on 32-bit overflow or rejected settings. */
static UInt32 GetBlockSize(CMatchFinder *p, UInt32 historySize)
{
  UInt32 blockSize = (p->keepSizeBefore + p->keepSizeAfter);
  /*
  if (historySize > kMaxHistorySize)
    return 0;
  */
  // printf("\nhistorySize == 0x%x\n", historySize);

  if (p->keepSizeBefore < historySize || blockSize < p->keepSizeBefore) // if 32-bit overflow
    return 0;

  {
    const UInt32 kBlockSizeMax = (UInt32)0 - (UInt32)kBlockSizeAlign;
    const UInt32 rem = kBlockSizeMax - blockSize;
    // reserve ~1/2 of blockSize (~1/4 for blocks >= 1 GiB) of extra space
    const UInt32 reserve = (blockSize >> (blockSize < ((UInt32)1 << 30) ? 1 : 2))
        + (1 << 12) + kBlockMoveAlign + kBlockSizeAlign; // do not overflow 32-bit here
    if (blockSize >= kBlockSizeMax
        || rem < kBlockSizeReserveMin) // we reject settings that will be slow
      return 0;
    if (reserve >= rem)
      blockSize = kBlockSizeMax;
    else
    {
      blockSize += reserve;
      blockSize &= ~(UInt32)(kBlockSizeAlign - 1);
    }
  }
  // printf("\n LzFind_blockSize = %x\n", blockSize);
  // printf("\n LzFind_blockSize = %d\n", blockSize >> 20);
  return blockSize;
}
318
319
320// input is historySize
321static UInt32 MatchFinder_GetHashMask2(CMatchFinder *p, UInt32 hs)
322{
323 if (p->numHashBytes == 2)
324 return (1 << 16) - 1;
325 if (hs != 0)
326 hs--;
327 hs |= (hs >> 1);
328 hs |= (hs >> 2);
329 hs |= (hs >> 4);
330 hs |= (hs >> 8);
331 // we propagated 16 bits in (hs). Low 16 bits must be set later
332 if (hs >= (1 << 24))
333 {
334 if (p->numHashBytes == 3)
335 hs = (1 << 24) - 1;
336 /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */
337 }
338 // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
339 hs |= (1 << 16) - 1; /* don't change it! */
340 // bt5: we adjust the size with recommended minimum size
341 if (p->numHashBytes >= 5)
342 hs |= (256 << kLzHash_CrcShift_2) - 1;
343 return hs;
344}
345
346// input is historySize
347static UInt32 MatchFinder_GetHashMask(CMatchFinder *p, UInt32 hs)
348{
349 if (p->numHashBytes == 2)
350 return (1 << 16) - 1;
351 if (hs != 0)
352 hs--;
353 hs |= (hs >> 1);
354 hs |= (hs >> 2);
355 hs |= (hs >> 4);
356 hs |= (hs >> 8);
357 // we propagated 16 bits in (hs). Low 16 bits must be set later
358 hs >>= 1;
359 if (hs >= (1 << 24))
360 {
361 if (p->numHashBytes == 3)
362 hs = (1 << 24) - 1;
363 else
364 hs >>= 1;
365 /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */
366 }
367 // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
368 hs |= (1 << 16) - 1; /* don't change it! */
369 // bt5: we adjust the size with recommended minimum size
370 if (p->numHashBytes >= 5)
371 hs |= (256 << kLzHash_CrcShift_2) - 1;
372 return hs;
373}
374
375
/* Allocates the input window and the combined hash/son table for the given
   (historySize) and match parameters. Returns 1 on success; on failure all
   owned memory is freed and 0 is returned. */
int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
    UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter,
    ISzAllocPtr alloc)
{
  /* we need one additional byte in (p->keepSizeBefore),
     since we use MoveBlock() after (p->pos++) and before dictionary using */
  // keepAddBufferBefore = (UInt32)0xFFFFFFFF - (1 << 22); // for debug
  p->keepSizeBefore = historySize + keepAddBufferBefore + 1;

  keepAddBufferAfter += matchMaxLen;
  /* we need (p->keepSizeAfter >= p->numHashBytes) */
  if (keepAddBufferAfter < p->numHashBytes)
    keepAddBufferAfter = p->numHashBytes;
  // keepAddBufferAfter -= 2; // for debug
  p->keepSizeAfter = keepAddBufferAfter;

  if (p->directInput)
    p->blockSize = 0;
  if (p->directInput || LzInWindow_Create2(p, GetBlockSize(p, historySize), alloc))
  {
    size_t hashSizeSum;
    {
      UInt32 hs;      // allocated hash size - 1 (mask for the full table)
      UInt32 hsCur;   // mask actually used, possibly reduced for small data

      if (p->numHashOutBits != 0)
      {
        // explicit hash size: clamp the requested bit count to what
        // (numHashBytes) input bytes can meaningfully feed
        unsigned numBits = p->numHashOutBits;
        const unsigned nbMax =
            (p->numHashBytes == 2 ? 16 :
            (p->numHashBytes == 3 ? 24 : 32));
        if (numBits > nbMax)
          numBits = nbMax;
        if (numBits >= 32)
          hs = (UInt32)0 - 1;
        else
          hs = ((UInt32)1 << numBits) - 1;
        // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
        hs |= (1 << 16) - 1; /* don't change it! */
        if (p->numHashBytes >= 5)
          hs |= (256 << kLzHash_CrcShift_2) - 1;
        {
          const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize);
          if (hs > hs2)
            hs = hs2;
        }
        hsCur = hs;
        if (p->expectedDataSize < historySize)
        {
          // less data than history: a smaller live mask is sufficient
          const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize);
          if (hsCur > hs2)
            hsCur = hs2;
        }
      }
      else
      {
        hs = MatchFinder_GetHashMask(p, historySize);
        hsCur = hs;
        if (p->expectedDataSize < historySize)
        {
          hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize);
          if (hsCur > hs) // is it possible?
            hsCur = hs;
        }
      }

      p->hashMask = hsCur;

      hashSizeSum = hs;
      hashSizeSum++;
      if (hashSizeSum < hs) // size_t overflow check
        return 0;
      {
        // space for the small fixed hash2/hash3 tables in front of the main table
        UInt32 fixedHashSize = 0;
        if (p->numHashBytes > 2 && p->numHashBytes_Min <= 2) fixedHashSize += kHash2Size;
        if (p->numHashBytes > 3 && p->numHashBytes_Min <= 3) fixedHashSize += kHash3Size;
        // if (p->numHashBytes > 4) p->fixedHashSize += hs4; // kHash4Size;
        hashSizeSum += fixedHashSize;
        p->fixedHashSize = fixedHashSize;
      }
    }

    p->matchMaxLen = matchMaxLen;

    {
      size_t newSize;
      size_t numSons;
      const UInt32 newCyclicBufferSize = historySize + 1; // do not change it
      p->historySize = historySize;
      p->cyclicBufferSize = newCyclicBufferSize; // it must be = (historySize + 1)

      numSons = newCyclicBufferSize;
      if (p->btMode)
        numSons <<= 1; // binary tree: two links (left/right) per position
      newSize = hashSizeSum + numSons;

      if (numSons < newCyclicBufferSize || newSize < numSons) // overflow checks
        return 0;

      // aligned size is not required here, but it can be better for some loops
      #define NUM_REFS_ALIGN_MASK 0xF
      newSize = (newSize + NUM_REFS_ALIGN_MASK) & ~(size_t)NUM_REFS_ALIGN_MASK;

      // 22.02: we don't reallocate buffer, if old size is enough
      if (p->hash && p->numRefs >= newSize)
        return 1;

      MatchFinder_FreeThisClassMemory(p, alloc);
      p->numRefs = newSize;
      p->hash = AllocRefs(newSize, alloc);

      if (p->hash)
      {
        p->son = p->hash + hashSizeSum; // son table follows the hash tables
        return 1;
      }
    }
  }

  MatchFinder_Free(p, alloc);
  return 0;
}
498
499
/* Computes (p->lenLimit): how many bytes may be compared at the current
   position, and (p->posLimit): how far (pos) may advance before the next
   MatchFinder_CheckLimits() call (refill / normalization / cyclic wrap). */
static void MatchFinder_SetLimits(CMatchFinder *p)
{
  UInt32 k;
  // distance until (pos) reaches the normalization point
  UInt32 n = kMaxValForNormalize - p->pos;
  if (n == 0)
    n = (UInt32)(Int32)-1; // we allow (pos == 0) at start even with (kMaxValForNormalize == 0)

  // distance until the cyclic buffer position wraps
  k = p->cyclicBufferSize - p->cyclicBufferPos;
  if (k < n)
    n = k;

  k = GET_AVAIL_BYTES(p);
  {
    const UInt32 ksa = p->keepSizeAfter;
    UInt32 mm = p->matchMaxLen;
    if (k > ksa)
      k -= ksa; // we must limit exactly to keepSizeAfter for ReadBlock
    else if (k >= mm)
    {
      // the limitation for (p->lenLimit) update
      k -= mm; // optimization : to reduce the number of checks
      k++;
      // k = 1; // non-optimized version : for debug
    }
    else
    {
      // fewer than matchMaxLen bytes left: shrink lenLimit to what remains
      mm = k;
      if (k != 0)
        k = 1;
    }
    p->lenLimit = mm;
  }
  if (k < n)
    n = k;

  p->posLimit = p->pos + n;
}
537
538
539void MatchFinder_Init_LowHash(CMatchFinder *p)
540{
541 size_t i;
542 CLzRef *items = p->hash;
543 const size_t numItems = p->fixedHashSize;
544 for (i = 0; i < numItems; i++)
545 items[i] = kEmptyHashValue;
546}
547
548
549void MatchFinder_Init_HighHash(CMatchFinder *p)
550{
551 size_t i;
552 CLzRef *items = p->hash + p->fixedHashSize;
553 const size_t numItems = (size_t)p->hashMask + 1;
554 for (i = 0; i < numItems; i++)
555 items[i] = kEmptyHashValue;
556}
557
558
559void MatchFinder_Init_4(CMatchFinder *p)
560{
561 if (!p->directInput)
562 p->buffer = p->bufBase;
563 {
564 /* kEmptyHashValue = 0 (Zero) is used in hash tables as NO-VALUE marker.
565 the code in CMatchFinderMt expects (pos = 1) */
566 p->pos =
567 p->streamPos =
568 1; // it's smallest optimal value. do not change it
569 // 0; // for debug
570 }
571 p->result = SZ_OK;
572 p->streamEndWasReached = 0;
573}
574
575
// (CYC_TO_POS_OFFSET == 0) is expected by some optimized code
#define CYC_TO_POS_OFFSET 0
// #define CYC_TO_POS_OFFSET 1 // for debug

/* Full initialization: clears both hash regions, resets positions,
   reads the first input block, and computes the initial limits. */
void MatchFinder_Init(void *_p)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  MatchFinder_Init_HighHash(p);
  MatchFinder_Init_LowHash(p);
  MatchFinder_Init_4(p);
  // if (readData)
  MatchFinder_ReadBlock(p);

  /* if we init (cyclicBufferPos = pos), then we can use one variable
     instead of both (cyclicBufferPos) and (pos) : only before (cyclicBufferPos) wrapping */
  p->cyclicBufferPos = (p->pos - CYC_TO_POS_OFFSET); // init with relation to (pos)
  // p->cyclicBufferPos = 0; // smallest value
  // p->son[0] = p->son[1] = 0; // unused: we can init skipped record for speculated accesses.
  MatchFinder_SetLimits(p);
}
596
597
598
/* Compile-time detection of SIMD support for the saturating-subtract
   normalization kernels: SSE4.1/AVX2 on x86/x64, NEON on ARM64. */
#ifdef MY_CPU_X86_OR_AMD64
  #if defined(__clang__) && (__clang_major__ >= 4) \
    || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701)
    // || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900)

    #define USE_LZFIND_SATUR_SUB_128
    #define USE_LZFIND_SATUR_SUB_256
    #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("sse4.1")))
    #define LZFIND_ATTRIB_AVX2 __attribute__((__target__("avx2")))
  #elif defined(_MSC_VER)
    #if (_MSC_VER >= 1600)
      #define USE_LZFIND_SATUR_SUB_128
    #endif
    #if (_MSC_VER >= 1900)
      #define USE_LZFIND_SATUR_SUB_256
    #endif
  #endif

#elif defined(MY_CPU_ARM64) \
  /* || (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) */

  #if defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
    || defined(__GNUC__) && (__GNUC__ >= 6)
    #define USE_LZFIND_SATUR_SUB_128
    #ifdef MY_CPU_ARM64
    // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("")))
    #else
    #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=neon")))
    #endif

  #elif defined(_MSC_VER)
    #if (_MSC_VER >= 1910)
      #define USE_LZFIND_SATUR_SUB_128
    #endif
  #endif

  #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
    #include <arm64_neon.h>
  #else
    #include <arm_neon.h>
  #endif

#endif
642
643
#ifdef USE_LZFIND_SATUR_SUB_128

// #define Z7_SHOW_HW_STATUS

#ifdef Z7_SHOW_HW_STATUS
#include <stdio.h>
#define PRF(x) x
PRF(;)
#else
#define PRF(x)
#endif


#ifdef MY_CPU_ARM_OR_ARM64

#ifdef MY_CPU_ARM64
// #define FORCE_LZFIND_SATUR_SUB_128
#endif
typedef uint32x4_t LzFind_v128;
// NEON: max(v, s) - s == (v - s) clamped at 0 (saturating subtract)
#define SASUB_128_V(v, s) \
  vsubq_u32(vmaxq_u32(v, s), s)

#else // MY_CPU_ARM_OR_ARM64

#include <smmintrin.h> // sse4.1

typedef __m128i LzFind_v128;
// SSE 4.1: max(v, s) - s == (v - s) clamped at 0 (saturating subtract)
#define SASUB_128_V(v, s) \
  _mm_sub_epi32(_mm_max_epu32(v, s), s)

#endif // MY_CPU_ARM_OR_ARM64


// applies SASUB_128_V in place to the 4 CLzRef entries at (items + (i) * 4)
#define SASUB_128(i) \
  *( LzFind_v128 *)( void *)(items + (i) * 4) = SASUB_128_V( \
  *(const LzFind_v128 *)(const void *)(items + (i) * 4), sub2);
681
682
/* 128-bit SIMD kernel: saturating subtract of (subValue) over [items, lim).
   Processes 16 entries per iteration; the caller (MatchFinder_Normalize3)
   passes a count that is a positive multiple of 32 entries. */
Z7_NO_INLINE
static
#ifdef LZFIND_ATTRIB_SSE41
LZFIND_ATTRIB_SSE41
#endif
void
Z7_FASTCALL
LzFind_SaturSub_128(UInt32 subValue, CLzRef *items, const CLzRef *lim)
{
  // broadcast subValue into all 4 lanes
  const LzFind_v128 sub2 =
    #ifdef MY_CPU_ARM_OR_ARM64
      vdupq_n_u32(subValue);
    #else
      _mm_set_epi32((Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue);
    #endif
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SASUB_128(0) SASUB_128(1) items += 2 * 4;
    SASUB_128(0) SASUB_128(1) items += 2 * 4;
  }
  while (items != lim);
}
706
707
708
#ifdef USE_LZFIND_SATUR_SUB_256

#include <immintrin.h> // avx
/*
clang :immintrin.h uses
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__AVX2__)
#include <avx2intrin.h>
#endif
so we need <avxintrin.h> for clang-cl */

#if defined(__clang__)
#include <avxintrin.h>
#include <avx2intrin.h>
#endif

// AVX2: in-place saturating subtract of 8 CLzRef entries at (items + (i) * 8)
#define SASUB_256(i) \
    *( __m256i *)( void *)(items + (i) * 8) = \
  _mm256_sub_epi32(_mm256_max_epu32( \
    *(const __m256i *)(const void *)(items + (i) * 8), sub2), sub2);

/* 256-bit kernel: 32 entries per iteration; caller (MatchFinder_Normalize3)
   passes a count that is a positive multiple of 32 entries. */
Z7_NO_INLINE
static
#ifdef LZFIND_ATTRIB_AVX2
LZFIND_ATTRIB_AVX2
#endif
void
Z7_FASTCALL
LzFind_SaturSub_256(UInt32 subValue, CLzRef *items, const CLzRef *lim)
{
  const __m256i sub2 = _mm256_set_epi32(
      (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue,
      (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue);
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SASUB_256(0) SASUB_256(1) items += 2 * 8;
    SASUB_256(0) SASUB_256(1) items += 2 * 8;
  }
  while (items != lim);
}
#endif // USE_LZFIND_SATUR_SUB_256

#ifndef FORCE_LZFIND_SATUR_SUB_128
// runtime-selected SIMD kernel; NULL makes Normalize3 fall back to
// DEFAULT_SaturSub (assigned elsewhere — not visible in this chunk)
typedef void (Z7_FASTCALL *LZFIND_SATUR_SUB_CODE_FUNC)(
    UInt32 subValue, CLzRef *items, const CLzRef *lim);
static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub;
#endif // FORCE_LZFIND_SATUR_SUB_128

#endif // USE_LZFIND_SATUR_SUB_128
760
761
// kEmptyHashValue must be zero
// scalar saturating subtract: items[i] = max(items[i], subValue) - subValue
// #define SASUB_32(i) { UInt32 v = items[i]; UInt32 m = v - subValue; if (v < subValue) m = kEmptyHashValue; items[i] = m; }
#define SASUB_32(i) { UInt32 v = items[i]; if (v < subValue) v = subValue; items[i] = v - subValue; }

#ifdef FORCE_LZFIND_SATUR_SUB_128

#define DEFAULT_SaturSub LzFind_SaturSub_128

#else

#define DEFAULT_SaturSub LzFind_SaturSub_32

/* Scalar fallback kernel: 8 entries per iteration; caller
   (MatchFinder_Normalize3) passes a positive multiple of 32 entries. */
Z7_NO_INLINE
static
void
Z7_FASTCALL
LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim)
{
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SASUB_32(0) SASUB_32(1) items += 2;
    SASUB_32(0) SASUB_32(1) items += 2;
    SASUB_32(0) SASUB_32(1) items += 2;
    SASUB_32(0) SASUB_32(1) items += 2;
  }
  while (items != lim);
}

#endif
792
793
/* Subtracts (subValue) with saturation at zero from (numItems) table entries.
   Runs a scalar prologue up to a 128-byte aligned address, then the bulk via
   the runtime-selected SIMD kernel (or the scalar default), then a scalar
   tail for the remaining (numItems % 32) entries. */
Z7_NO_INLINE
void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems)
{
  #define LZFIND_NORM_ALIGN_BLOCK_SIZE (1 << 7)
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (LZFIND_NORM_ALIGN_BLOCK_SIZE - 1)) != 0; numItems--)
  {
    SASUB_32(0)
    items++;
  }
  {
    // bulk part: a multiple of 32 entries (128 bytes), as the kernels require
    const size_t k_Align_Mask = (LZFIND_NORM_ALIGN_BLOCK_SIZE / 4 - 1);
    CLzRef *lim = items + (numItems & ~(size_t)k_Align_Mask);
    numItems &= k_Align_Mask;
    if (items != lim)
    {
      #if defined(USE_LZFIND_SATUR_SUB_128) && !defined(FORCE_LZFIND_SATUR_SUB_128)
      if (g_LzFind_SaturSub)
        g_LzFind_SaturSub(subValue, items, lim);
      else
      #endif
        DEFAULT_SaturSub(subValue, items, lim);
    }
    items = lim;
  }
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  for (; numItems != 0; numItems--)
  {
    SASUB_32(0)
    items++;
  }
}
826
827
828
// call MatchFinder_CheckLimits() only after (p->pos++) update

/* Periodic maintenance, run when (pos) reaches (posLimit): refills the
   window when exactly keepSizeAfter bytes remain, normalizes all stored
   positions when (pos) reaches kMaxValForNormalize (UInt32 wrap), wraps the
   cyclic buffer position, and recomputes the limits. */
Z7_NO_INLINE
static void MatchFinder_CheckLimits(CMatchFinder *p)
{
  if (// !p->streamEndWasReached && p->result == SZ_OK &&
      p->keepSizeAfter == GET_AVAIL_BYTES(p))
  {
    // we try to read only in exact state (p->keepSizeAfter == GET_AVAIL_BYTES(p))
    if (MatchFinder_NeedMove(p))
      MatchFinder_MoveBlock(p);
    MatchFinder_ReadBlock(p);
  }

  if (p->pos == kMaxValForNormalize)
  if (GET_AVAIL_BYTES(p) >= p->numHashBytes) // optional optimization for last bytes of data.
    /*
       if we disable normalization for last bytes of data, and
       if (data_size == 4 GiB), we don't call wasteful normalization,
       but (pos) will be wrapped over Zero (0) in that case.
       And we cannot resume later to normal operation
    */
  {
    // MatchFinder_Normalize(p);
    /* after normalization we need (p->pos >= p->historySize + 1); */
    /* we can reduce subValue to aligned value, if want to keep alignment
       of (p->pos) and (p->buffer) for speculated accesses. */
    const UInt32 subValue = (p->pos - p->historySize - 1) /* & ~(UInt32)(kNormalizeAlign - 1) */;
    // const UInt32 subValue = (1 << 15); // for debug
    // printf("\nMatchFinder_Normalize() subValue == 0x%x\n", subValue);
    MatchFinder_REDUCE_OFFSETS(p, subValue)
    // normalize the hash tables, then the son table
    MatchFinder_Normalize3(subValue, p->hash, (size_t)p->hashMask + 1 + p->fixedHashSize);
    {
      size_t numSonRefs = p->cyclicBufferSize;
      if (p->btMode)
        numSonRefs <<= 1;
      MatchFinder_Normalize3(subValue, p->son, numSonRefs);
    }
  }

  if (p->cyclicBufferPos == p->cyclicBufferSize)
    p->cyclicBufferPos = 0;

  MatchFinder_SetLimits(p);
}
874
875
/*
 (lenLimit > maxLen)
*/
/* Hash-chain match search: walks the chain starting at (curMatch) and emits
   (len, dist - 1) pairs into (d) whenever a strictly longer match is found.
   Stops after (cutValue) steps, when the chain leaves the window, or when a
   match of length (lenLimit) is found. Returns the new end of (d). */
Z7_FORCE_INLINE
static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
    size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue,
    UInt32 *d, unsigned maxLen)
{
  /*
  son[_cyclicBufferPos] = curMatch;
  for (;;)
  {
    UInt32 delta = pos - curMatch;
    if (cutValue-- == 0 || delta >= _cyclicBufferSize)
      return d;
    {
      const Byte *pb = cur - delta;
      curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
      if (pb[maxLen] == cur[maxLen] && *pb == *cur)
      {
        UInt32 len = 0;
        while (++len != lenLimit)
          if (pb[len] != cur[len])
            break;
        if (maxLen < len)
        {
          maxLen = len;
          *d++ = len;
          *d++ = delta - 1;
          if (len == lenLimit)
            return d;
        }
      }
    }
  }
  */

  const Byte *lim = cur + lenLimit;
  son[_cyclicBufferPos] = curMatch; // record current position in the chain

  do
  {
    UInt32 delta;

    if (curMatch == 0) // kEmptyHashValue: end of chain
      break;
    // if (curMatch2 >= curMatch) return NULL;
    delta = pos - curMatch;
    if (delta >= _cyclicBufferSize) // candidate fell out of the window
      break;
    {
      ptrdiff_t diff;
      // advance to the previous position with the same hash
      curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
      diff = (ptrdiff_t)0 - (ptrdiff_t)delta;
      // quick filter: only scan if the byte at (maxLen) already matches
      if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff])
      {
        const Byte *c = cur;
        while (*c == c[diff])
        {
          if (++c == lim)
          {
            // full-length match: report and stop searching
            d[0] = (UInt32)(lim - cur);
            d[1] = delta - 1;
            return d + 2;
          }
        }
        {
          const unsigned len = (unsigned)(c - cur);
          if (maxLen < len)
          {
            maxLen = len;
            d[0] = (UInt32)len;
            d[1] = delta - 1;
            d += 2;
          }
        }
      }
    }
  }
  while (--cutValue);

  return d;
}
959
960
/* Binary-tree match search with tree update: emits (len, dist - 1) pairs of
   strictly increasing length into (d), while re-linking the (son) nodes so
   that ptr0/ptr1 become the right/left subtrees of the current position.
   Returns the new end of (d). */
Z7_FORCE_INLINE
UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
    size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue,
    UInt32 *d, UInt32 maxLen)
{
  CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;
  CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);
  unsigned len0 = 0, len1 = 0;

  UInt32 cmCheck;

  // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; }

  // oldest position still inside the window; candidates at or below it are stale
  cmCheck = (UInt32)(pos - _cyclicBufferSize);
  if ((UInt32)pos <= _cyclicBufferSize)
    cmCheck = 0;

  if (cmCheck < curMatch)
  do
  {
    const UInt32 delta = pos - curMatch;
    {
      CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
      const Byte *pb = cur - delta;
      // both subtrees share a prefix of min(len0, len1) bytes with (cur):
      // resume byte comparison there
      unsigned len = (len0 < len1 ? len0 : len1);
      const UInt32 pair0 = pair[0];
      if (pb[len] == cur[len])
      {
        if (++len != lenLimit && pb[len] == cur[len])
          while (++len != lenLimit)
            if (pb[len] != cur[len])
              break;
        if (maxLen < len)
        {
          maxLen = (UInt32)len;
          *d++ = (UInt32)len;
          *d++ = delta - 1;
          if (len == lenLimit)
          {
            // full-length match: adopt this node's children and stop
            *ptr1 = pair0;
            *ptr0 = pair[1];
            return d;
          }
        }
      }
      if (pb[len] < cur[len])
      {
        // candidate is lexicographically smaller: goes to the "smaller" subtree
        *ptr1 = curMatch;
        // const UInt32 curMatch2 = pair[1];
        // if (curMatch2 >= curMatch) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; }
        // curMatch = curMatch2;
        curMatch = pair[1];
        ptr1 = pair + 1;
        len1 = len;
      }
      else
      {
        *ptr0 = curMatch;
        curMatch = pair[0];
        ptr0 = pair;
        len0 = len;
      }
    }
  }
  while(--cutValue && cmCheck < curMatch);

  // end of search: terminate both dangling subtree links
  *ptr0 = *ptr1 = kEmptyHashValue;
  return d;
}
1030
1031
/* Same tree traversal and re-linking as GetMatchesSpec1, but without
   reporting matches: used to keep the binary tree updated when positions
   are skipped (or when a best match was already found). */
static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
    size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue)
{
  CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;
  CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);
  unsigned len0 = 0, len1 = 0;

  UInt32 cmCheck;

  // oldest position still inside the window
  cmCheck = (UInt32)(pos - _cyclicBufferSize);
  if ((UInt32)pos <= _cyclicBufferSize)
    cmCheck = 0;

  if (// curMatch >= pos || // failure
      cmCheck < curMatch)
  do
  {
    const UInt32 delta = pos - curMatch;
    {
      CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
      const Byte *pb = cur - delta;
      // resume comparison after the common prefix of both subtrees
      unsigned len = (len0 < len1 ? len0 : len1);
      if (pb[len] == cur[len])
      {
        while (++len != lenLimit)
          if (pb[len] != cur[len])
            break;
        {
          if (len == lenLimit)
          {
            // full-length match: adopt this node's children and stop
            *ptr1 = pair[0];
            *ptr0 = pair[1];
            return;
          }
        }
      }
      if (pb[len] < cur[len])
      {
        *ptr1 = curMatch;
        curMatch = pair[1];
        ptr1 = pair + 1;
        len1 = len;
      }
      else
      {
        *ptr0 = curMatch;
        curMatch = pair[0];
        ptr0 = pair;
        len0 = len;
      }
    }
  }
  while(--cutValue && cmCheck < curMatch);

  *ptr0 = *ptr1 = kEmptyHashValue;
  return;
}
1089
1090
/* Advances the finder by one byte; runs CheckLimits() when posLimit is hit. */
#define MOVE_POS \
  p->cyclicBufferPos++; \
  p->buffer++; \
  { const UInt32 pos1 = p->pos + 1; \
    p->pos = pos1; \
    if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); }

#define MOVE_POS_RET MOVE_POS return distances;

/* Advance without a sons update: only used near the end of the data. */
Z7_NO_INLINE
static void MatchFinder_MovePos(CMatchFinder *p)
{
  /* we go here at the end of stream data, when (avail < num_hash_bytes)
     We don't update sons[cyclicBufferPos << btMode].
     So (sons) record will contain junk. And we cannot resume match searching
     to normal operation, even if we will provide more input data in buffer.
     p->sons[p->cyclicBufferPos << p->btMode] = 0; // kEmptyHashValue
     if (p->btMode)
       p->sons[(p->cyclicBufferPos << p->btMode) + 1] = 0; // kEmptyHashValue
  */
  MOVE_POS
}
1113
/* Common prologue for GetMatches/Skip functions: declares (hv, cur, curMatch),
   loads (lenLimit), and bails out via (ret_op) when fewer than (minLen)
   bytes can be compared at the current position. */
#define GET_MATCHES_HEADER2(minLen, ret_op) \
  UInt32 hv; const Byte *cur; UInt32 curMatch; \
  UInt32 lenLimit = p->lenLimit; \
  if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; } \
  cur = p->buffer;

#define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances)
#define SKIP_HEADER(minLen) \
  do { GET_MATCHES_HEADER2(minLen, continue)

// argument pack shared by GetMatchesSpec1 / SkipMatchesSpec / Hc_GetMatchesSpec
#define MF_PARAMS(p) lenLimit, curMatch, p->pos, p->buffer, p->son, \
  p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue

#define SKIP_FOOTER \
  SkipMatchesSpec(MF_PARAMS(p)); \
  MOVE_POS \
  } while (--num);

#define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \
  distances = func(MF_PARAMS(p), distances, (UInt32)_maxLen_); \
  MOVE_POS_RET

// binary-tree / hash-chain epilogues: run the search, advance, return
#define GET_MATCHES_FOOTER_BT(_maxLen_) \
  GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1)

#define GET_MATCHES_FOOTER_HC(_maxLen_) \
  GET_MATCHES_FOOTER_BASE(_maxLen_, Hc_GetMatchesSpec)



/* Extends (maxLen) while bytes at distance (d2) keep matching, up to lenLimit. */
#define UPDATE_maxLen { \
    const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)d2; \
    const Byte *c = cur + maxLen; \
    const Byte *lim = cur + lenLimit; \
    for (; c != lim; c++) if (*(c + diff) != *c) break; \
    maxLen = (unsigned)(c - cur); }
1150
/* bt2: binary-tree matches from the plain 2-byte hash only. */
static UInt32* Bt2_MatchFinder_GetMatches(void *_p, UInt32 *distances)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  GET_MATCHES_HEADER(2)
  HASH2_CALC
  curMatch = p->hash[hv];
  p->hash[hv] = p->pos; // current position becomes the new head
  GET_MATCHES_FOOTER_BT(1)
}
1160
/* BT3 variant using the zip-style hash (HASH_ZIP_CALC); public entry
   (non-static), presumably for the deflate/zip encoder — confirm callers. */
UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  GET_MATCHES_HEADER(3)
  HASH_ZIP_CALC
  curMatch = p->hash[hv];
  p->hash[hv] = p->pos;
  GET_MATCHES_FOOTER_BT(2)
}
1169
1170
/* mmm = maximum usable back-distance at the current position:
   min(cyclicBufferSize, pos). Candidates at distance >= mmm are rejected
   by the (d < mmm) checks below. */
#define SET_mmm  \
    mmm = p->cyclicBufferSize; \
    if (pos < mmm) \
      mmm = pos;
1175
1176
/* BT3: binary-tree match finder with a 3-byte main hash plus a 2-byte
   sub-hash (h2) used to catch short length-2 matches cheaply. */
static UInt32* Bt3_MatchFinder_GetMatches(void *_p, UInt32 *distances)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  UInt32 mmm;
  UInt32 h2, d2, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(3)

  HASH3_CALC

  hash = p->hash;
  pos = p->pos;

  /* back-distance to the previous position with the same 2-byte hash */
  d2 = pos - hash[h2];

  curMatch = (hash + kFix3HashSize)[hv];

  /* install current position in both hash tables */
  hash[h2] = pos;
  (hash + kFix3HashSize)[hv] = pos;

  SET_mmm

  maxLen = 2;

  /* try the cheap 2-byte-hash candidate before the tree search */
  if (d2 < mmm && *(cur - d2) == *cur)
  {
    UPDATE_maxLen
    distances[0] = (UInt32)maxLen;
    distances[1] = d2 - 1;
    distances += 2;
    if (maxLen == lenLimit)
    {
      /* best possible length already found: update tree only and return */
      SkipMatchesSpec(MF_PARAMS(p));
      MOVE_POS_RET
    }
  }

  GET_MATCHES_FOOTER_BT(maxLen)
}
1217
1218
/* BT4: binary-tree match finder with a 4-byte main hash plus 2- and 3-byte
   sub-hashes (h2, h3) used to emit short matches before the tree search. */
static UInt32* Bt4_MatchFinder_GetMatches(void *_p, UInt32 *distances)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(4)

  HASH4_CALC

  hash = p->hash;
  pos = p->pos;

  /* back-distances to previous positions with equal 2- / 3-byte hashes */
  d2 = pos - hash [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  curMatch = (hash + kFix4HashSize)[hv];

  /* install current position in all three hash tables */
  hash [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  (hash + kFix4HashSize)[hv] = pos;

  SET_mmm

  maxLen = 3;

  /* single-pass "loop": always exits via break or return. Emits the
     length-2 / length-3 sub-hash matches, then extends the best one. */
  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
        /* d2 match already reaches length 3; extend it below */
        // distances[-2] = 3;
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        /* d2 stops at 2 bytes; switch to the 3-byte-hash candidate */
        d2 = d3;
        distances[1] = d3 - 1;
        distances += 2;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      d2 = d3;
      distances[1] = d3 - 1;
      distances += 2;
    }
    else
      break;

    UPDATE_maxLen
    /* overwrite the length of the last emitted record with the true length */
    distances[-2] = (UInt32)maxLen;
    if (maxLen == lenLimit)
    {
      /* cannot do better: update tree without collecting more matches */
      SkipMatchesSpec(MF_PARAMS(p));
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_BT(maxLen)
}
1286
1287
/* BT5: binary-tree match finder with a 5-byte main hash plus 2- and 3-byte
   sub-hashes. Like BT4, but the sub-hash match must also agree at byte 3
   before the byte-wise extension is attempted. */
static UInt32* Bt5_MatchFinder_GetMatches(void *_p, UInt32 *distances)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(5)

  HASH5_CALC

  hash = p->hash;
  pos = p->pos;

  /* back-distances for the 2- / 3-byte sub-hashes */
  d2 = pos - hash [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  // d4 = pos - (hash + kFix4HashSize)[h4];

  curMatch = (hash + kFix5HashSize)[hv];

  hash [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  // (hash + kFix4HashSize)[h4] = pos;
  (hash + kFix5HashSize)[hv] = pos;

  SET_mmm

  maxLen = 4;

  /* single-pass "loop": always exits via break or return */
  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
        /* d2 match reaches length 3; keep d2 */
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        distances[1] = d3 - 1;
        distances += 2;
        d2 = d3;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      distances[1] = d3 - 1;
      distances += 2;
      d2 = d3;
    }
    else
      break;

    /* record length 3 first; only extend further if byte 3 also matches */
    distances[-2] = 3;
    if (*(cur - d2 + 3) != cur[3])
      break;
    UPDATE_maxLen
    distances[-2] = (UInt32)maxLen;
    if (maxLen == lenLimit)
    {
      /* cannot do better: update tree without collecting more matches */
      SkipMatchesSpec(MF_PARAMS(p));
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_BT(maxLen)
}
1360
1361
/* HC4: hash-chain match finder with a 4-byte main hash plus 2- and 3-byte
   sub-hashes. Same short-match logic as BT4, but the footer walks a chain
   (Hc_GetMatchesSpec) instead of a binary tree. */
static UInt32* Hc4_MatchFinder_GetMatches(void *_p, UInt32 *distances)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(4)

  HASH4_CALC

  hash = p->hash;
  pos = p->pos;

  /* back-distances for the 2- / 3-byte sub-hashes */
  d2 = pos - hash [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  curMatch = (hash + kFix4HashSize)[hv];

  hash [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  (hash + kFix4HashSize)[hv] = pos;

  SET_mmm

  maxLen = 3;

  /* single-pass "loop": always exits via break or return */
  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
        /* d2 match reaches length 3; extend it below */
        // distances[-2] = 3;
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        d2 = d3;
        distances[1] = d3 - 1;
        distances += 2;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      d2 = d3;
      distances[1] = d3 - 1;
      distances += 2;
    }
    else
      break;

    UPDATE_maxLen
    distances[-2] = (UInt32)maxLen;
    if (maxLen == lenLimit)
    {
      /* best length reached: write the chain link directly, skip search */
      p->son[p->cyclicBufferPos] = curMatch;
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_HC(maxLen)
}
1429
1430
/* HC5: hash-chain match finder with a 5-byte main hash plus 2- and 3-byte
   sub-hashes. Same short-match logic as BT5; chain search in the footer. */
static UInt32 * Hc5_MatchFinder_GetMatches(void *_p, UInt32 *distances)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(5)

  HASH5_CALC

  hash = p->hash;
  pos = p->pos;

  /* back-distances for the 2- / 3-byte sub-hashes */
  d2 = pos - hash [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  // d4 = pos - (hash + kFix4HashSize)[h4];

  curMatch = (hash + kFix5HashSize)[hv];

  hash [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  // (hash + kFix4HashSize)[h4] = pos;
  (hash + kFix5HashSize)[hv] = pos;

  SET_mmm

  maxLen = 4;

  /* single-pass "loop": always exits via break or return */
  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
        /* d2 match reaches length 3; keep d2 */
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        distances[1] = d3 - 1;
        distances += 2;
        d2 = d3;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      distances[1] = d3 - 1;
      distances += 2;
      d2 = d3;
    }
    else
      break;

    /* record length 3 first; only extend further if byte 3 also matches */
    distances[-2] = 3;
    if (*(cur - d2 + 3) != cur[3])
      break;
    UPDATE_maxLen
    distances[-2] = (UInt32)maxLen;
    if (maxLen == lenLimit)
    {
      /* best length reached: write the chain link directly, skip search */
      p->son[p->cyclicBufferPos] = curMatch;
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_HC(maxLen)
}
1503
1504
/* HC3 variant using the zip-style hash (HASH_ZIP_CALC); public entry
   (non-static), presumably for the deflate/zip encoder — confirm callers. */
UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  GET_MATCHES_HEADER(3)
  HASH_ZIP_CALC
  curMatch = p->hash[hv];
  p->hash[hv] = p->pos;
  GET_MATCHES_FOOTER_HC(2)
}
1513
1514
/* BT2: skip (num) positions, updating the hash table and binary tree
   without collecting matches. */
static void Bt2_MatchFinder_Skip(void *_p, UInt32 num)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  SKIP_HEADER(2)
  {
    HASH2_CALC
    curMatch = p->hash[hv];
    p->hash[hv] = p->pos;
  }
  SKIP_FOOTER
}
1526
/* Zip-hash BT3: skip (num) positions (public entry, paired with
   Bt3Zip_MatchFinder_GetMatches). */
void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(3)
  {
    HASH_ZIP_CALC
    curMatch = p->hash[hv];
    p->hash[hv] = p->pos;
  }
  SKIP_FOOTER
}
1537
/* BT3: skip (num) positions, refreshing both the 2-byte and 3-byte
   hash tables. */
static void Bt3_MatchFinder_Skip(void *_p, UInt32 num)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  SKIP_HEADER(3)
  {
    UInt32 h2;
    UInt32 *hash;
    HASH3_CALC
    hash = p->hash;
    curMatch = (hash + kFix3HashSize)[hv];
    hash[h2] =
    (hash + kFix3HashSize)[hv] = p->pos;
  }
  SKIP_FOOTER
}
1553
/* BT4: skip (num) positions, refreshing the 2-, 3- and 4-byte
   hash tables. */
static void Bt4_MatchFinder_Skip(void *_p, UInt32 num)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  SKIP_HEADER(4)
  {
    UInt32 h2, h3;
    UInt32 *hash;
    HASH4_CALC
    hash = p->hash;
    curMatch = (hash + kFix4HashSize)[hv];
    hash [h2] =
    (hash + kFix3HashSize)[h3] =
    (hash + kFix4HashSize)[hv] = p->pos;
  }
  SKIP_FOOTER
}
1570
/* BT5: skip (num) positions, refreshing the 2-, 3- and 5-byte
   hash tables (the 4-byte table is compiled out; see kFix5HashSize). */
static void Bt5_MatchFinder_Skip(void *_p, UInt32 num)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  SKIP_HEADER(5)
  {
    UInt32 h2, h3;
    UInt32 *hash;
    HASH5_CALC
    hash = p->hash;
    curMatch = (hash + kFix5HashSize)[hv];
    hash [h2] =
    (hash + kFix3HashSize)[h3] =
    // (hash + kFix4HashSize)[h4] =
    (hash + kFix5HashSize)[hv] = p->pos;
  }
  SKIP_FOOTER
}
1588
1589
/* HC_SKIP_HEADER(minLen): batched skip loop for hash-chain mode.
   Each outer iteration processes up to (posLimit - pos) positions,
   writing chain links straight into son[] and only then updating the
   CMatchFinder fields — avoids per-position overhead. Declares
   (curMatch) and (hv) for the caller-supplied hash code; (hash), (cur)
   and (pos) are locals captured from (p). */
#define HC_SKIP_HEADER(minLen) \
    do { if (p->lenLimit < minLen) { MatchFinder_MovePos(p); num--; continue; } { \
    const Byte *cur; \
    UInt32 *hash; \
    UInt32 *son; \
    UInt32 pos = p->pos; \
    UInt32 num2 = num; \
    /* (p->pos == p->posLimit) is not allowed here !!! */ \
    { const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \
    num -= num2; \
    { const UInt32 cycPos = p->cyclicBufferPos; \
    son = p->son + cycPos; \
    p->cyclicBufferPos = cycPos + num2; } \
    cur = p->buffer; \
    hash = p->hash; \
    do { \
    UInt32 curMatch; \
    UInt32 hv;


/* HC_SKIP_FOOTER: closes the loops opened by HC_SKIP_HEADER, writes the
   updated buffer/pos back into (p), and calls MatchFinder_CheckLimits
   when posLimit is reached. */
#define HC_SKIP_FOOTER \
    cur++; pos++; *son++ = curMatch; \
    } while (--num2); \
    p->buffer = cur; \
    p->pos = pos; \
    if (pos == p->posLimit) MatchFinder_CheckLimits(p); \
    }} while(num); \
1617
1618
/* HC4: batch-skip (num) positions, refreshing the 2-, 3- and 4-byte
   hash tables via the batched HC_SKIP_* macros. */
static void Hc4_MatchFinder_Skip(void *_p, UInt32 num)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  HC_SKIP_HEADER(4)

    UInt32 h2, h3;
    HASH4_CALC
    curMatch = (hash + kFix4HashSize)[hv];
    hash [h2] =
    (hash + kFix3HashSize)[h3] =
    (hash + kFix4HashSize)[hv] = pos;

  HC_SKIP_FOOTER
}
1633
1634
/* HC5: batch-skip (num) positions, refreshing the 2-, 3- and 5-byte
   hash tables via the batched HC_SKIP_* macros. */
static void Hc5_MatchFinder_Skip(void *_p, UInt32 num)
{
  CMatchFinder *p = (CMatchFinder *)_p;
  HC_SKIP_HEADER(5)

    UInt32 h2, h3;
    HASH5_CALC
    curMatch = (hash + kFix5HashSize)[hv];
    hash [h2] =
    (hash + kFix3HashSize)[h3] =
    // (hash + kFix4HashSize)[h4] =
    (hash + kFix5HashSize)[hv] = pos;

  HC_SKIP_FOOTER
}
1650
1651
/* Zip-hash HC3: batch-skip (num) positions (public entry, paired with
   Hc3Zip_MatchFinder_GetMatches). */
void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  HC_SKIP_HEADER(3)

    HASH_ZIP_CALC
    curMatch = hash[hv];
    hash[hv] = pos;

  HC_SKIP_FOOTER
}
1662
1663
1664void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable)
1665{
1666 vTable->Init = MatchFinder_Init;
1667 vTable->GetNumAvailableBytes = MatchFinder_GetNumAvailableBytes;
1668 vTable->GetPointerToCurrentPos = MatchFinder_GetPointerToCurrentPos;
1669 if (!p->btMode)
1670 {
1671 if (p->numHashBytes <= 4)
1672 {
1673 vTable->GetMatches = Hc4_MatchFinder_GetMatches;
1674 vTable->Skip = Hc4_MatchFinder_Skip;
1675 }
1676 else
1677 {
1678 vTable->GetMatches = Hc5_MatchFinder_GetMatches;
1679 vTable->Skip = Hc5_MatchFinder_Skip;
1680 }
1681 }
1682 else if (p->numHashBytes == 2)
1683 {
1684 vTable->GetMatches = Bt2_MatchFinder_GetMatches;
1685 vTable->Skip = Bt2_MatchFinder_Skip;
1686 }
1687 else if (p->numHashBytes == 3)
1688 {
1689 vTable->GetMatches = Bt3_MatchFinder_GetMatches;
1690 vTable->Skip = Bt3_MatchFinder_Skip;
1691 }
1692 else if (p->numHashBytes == 4)
1693 {
1694 vTable->GetMatches = Bt4_MatchFinder_GetMatches;
1695 vTable->Skip = Bt4_MatchFinder_Skip;
1696 }
1697 else
1698 {
1699 vTable->GetMatches = Bt5_MatchFinder_GetMatches;
1700 vTable->Skip = Bt5_MatchFinder_Skip;
1701 }
1702}
1703
1704
1705
/* Runtime CPU-feature dispatch: pick the widest available SIMD variant of
   LzFind_SaturSub_* (NEON on ARM; SSE4.1, upgraded to AVX2, on x86) and
   publish it in g_LzFind_SaturSub. Presumably used by the position
   normalization code defined earlier in this file — confirm there.
   Compiles to a no-op unless USE_LZFIND_SATUR_SUB_128 is defined (and
   FORCE_LZFIND_SATUR_SUB_128 is not). */
void LzFindPrepare(void)
{
  #ifndef FORCE_LZFIND_SATUR_SUB_128
  #ifdef USE_LZFIND_SATUR_SUB_128
  LZFIND_SATUR_SUB_CODE_FUNC f = NULL;
  #ifdef MY_CPU_ARM_OR_ARM64
  {
    if (CPU_IsSupported_NEON())
    {
      // #pragma message ("=== LzFind NEON")
      PRF(printf("\n=== LzFind NEON\n"));
      f = LzFind_SaturSub_128;
    }
    // f = 0; // for debug
  }
  #else // MY_CPU_ARM_OR_ARM64
  if (CPU_IsSupported_SSE41())
  {
    // #pragma message ("=== LzFind SSE41")
    PRF(printf("\n=== LzFind SSE41\n"));
    f = LzFind_SaturSub_128;

  #ifdef USE_LZFIND_SATUR_SUB_256
    if (CPU_IsSupported_AVX2())
    {
      // #pragma message ("=== LzFind AVX2")
      PRF(printf("\n=== LzFind AVX2\n"));
      f = LzFind_SaturSub_256;
    }
  #endif
  }
  #endif // MY_CPU_ARM_OR_ARM64
  g_LzFind_SaturSub = f;
  #endif // USE_LZFIND_SATUR_SUB_128
  #endif // FORCE_LZFIND_SATUR_SUB_128
}
1742
1743
1744#undef MOVE_POS
1745#undef MOVE_POS_RET
1746#undef PRF