1 /* LzFind.c -- Match finder for LZ algorithms
2 2024-03-01 : Igor Pavlov : Public domain */
13 #define kBlockMoveAlign (1 << 7) // alignment for memmove()
14 #define kBlockSizeAlign (1 << 16) // alignment for block allocation
15 #define kBlockSizeReserveMin (1 << 24) // it's 1/256 from 4 GB dictionary
17 #define kEmptyHashValue 0
19 #define kMaxValForNormalize ((UInt32)0)
20 // #define kMaxValForNormalize ((UInt32)(1 << 20) + 0xfff) // for debug
22 // #define kNormalizeAlign (1 << 7) // alignment for speculated accesses
24 #define GET_AVAIL_BYTES(p) \
25 Inline_MatchFinder_GetNumAvailableBytes(p)
28 // #define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size)
29 #define kFix5HashSize kFix4HashSize
33 if (hv) match, then cur[0] and cur[1] also match
35 #define HASH2_CALC hv = GetUi16(cur);
37 // (crc[0 ... 255] & 0xFF) provides one-to-one correspondence to [0 ... 255]
41 if (cur[0]) and (h2) match, then cur[1] also match
42 if (cur[0]) and (hv) match, then cur[1] and cur[2] also match
44 #define HASH3_CALC { \
45 UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
46 h2 = temp & (kHash2Size - 1); \
47 hv = (temp ^ ((UInt32)cur[2] << 8)) & p->hashMask; }
49 #define HASH4_CALC { \
50 UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
51 h2 = temp & (kHash2Size - 1); \
52 temp ^= ((UInt32)cur[2] << 8); \
53 h3 = temp & (kHash3Size - 1); \
54 hv = (temp ^ (p->crc[cur[3]] << kLzHash_CrcShift_1)) & p->hashMask; }
56 #define HASH5_CALC { \
57 UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
58 h2 = temp & (kHash2Size - 1); \
59 temp ^= ((UInt32)cur[2] << 8); \
60 h3 = temp & (kHash3Size - 1); \
61 temp ^= (p->crc[cur[3]] << kLzHash_CrcShift_1); \
62 /* h4 = temp & p->hash4Mask; */ /* (kHash4Size - 1); */ \
63 hv = (temp ^ (p->crc[cur[4]] << kLzHash_CrcShift_2)) & p->hashMask; }
65 #define HASH_ZIP_CALC hv = ((cur[2] | ((UInt32)cur[0] << 8)) ^ p->crc[cur[1]]) & 0xFFFF;
68 static void LzInWindow_Free(CMatchFinder *p, ISzAllocPtr alloc)
70 // if (!p->directInput)
72 ISzAlloc_Free(alloc, p->bufBase);
78 static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr alloc)
82 if (!p->bufBase || p->blockSize != blockSize)
85 LzInWindow_Free(p, alloc);
86 p->blockSize = blockSize;
87 // blockSizeT = blockSize;
89 // printf("\nblockSize = 0x%x\n", blockSize);
92 // we can allocate 4GiB, but still use UInt32 for (p->blockSize)
93 // we use UInt32 type for (p->blockSize), because
94 // we don't want to wrap over 4 GiB,
95 // when we use (p->streamPos - p->pos) that is UInt32.
96 if (blockSize >= (UInt32)0 - (UInt32)kBlockSizeAlign)
98 blockSizeT = ((size_t)1 << 32);
99 printf("\nchanged to blockSizeT = 4GiB\n");
104 p->bufBase = (Byte *)ISzAlloc_Alloc(alloc, blockSize);
105 // printf("\nbufferBase = %p\n", p->bufBase);
106 // return 0; // for debug
108 return (p->bufBase != NULL);
111 static const Byte *MatchFinder_GetPointerToCurrentPos(void *p)
113 return ((CMatchFinder *)p)->buffer;
116 static UInt32 MatchFinder_GetNumAvailableBytes(void *p)
118 return GET_AVAIL_BYTES((CMatchFinder *)p);
123 static void MatchFinder_ReadBlock(CMatchFinder *p)
125 if (p->streamEndWasReached || p->result != SZ_OK)
128 /* We use (p->streamPos - p->pos) value.
129 (p->streamPos < p->pos) is allowed. */
133 UInt32 curSize = 0xFFFFFFFF - GET_AVAIL_BYTES(p);
134 if (curSize > p->directInputRem)
135 curSize = (UInt32)p->directInputRem;
136 p->streamPos += curSize;
137 p->directInputRem -= curSize;
138 if (p->directInputRem == 0)
139 p->streamEndWasReached = 1;
145 const Byte *dest = p->buffer + GET_AVAIL_BYTES(p);
146 size_t size = (size_t)(p->bufBase + p->blockSize - dest);
149 /* we call ReadBlock() after NeedMove() and MoveBlock().
150 NeedMove() and MoveBlock() provide more than (keepSizeAfter)
151 to the end of (blockSize).
152 So we don't execute this branch in normal code flow.
153 We can only get here if ReadBlock() is called before NeedMove() / MoveBlock().
155 // p->result = SZ_ERROR_FAIL; // we can show error here
160 // if (size > kRead) size = kRead; // for debug
163 // we need cast (Byte *)dest.
165 #pragma GCC diagnostic ignored "-Wcast-qual"
168 p->result = ISeqInStream_Read(p->stream,
169 p->bufBase + (dest - p->bufBase), &size);
170 if (p->result != SZ_OK)
174 p->streamEndWasReached = 1;
177 p->streamPos += (UInt32)size;
178 if (GET_AVAIL_BYTES(p) > p->keepSizeAfter)
180 /* here and in another (p->keepSizeAfter) checks we keep on 1 byte more than was requested by Create() function
181 (GET_AVAIL_BYTES(p) >= p->keepSizeAfter) - minimal required size */
184 // on exit: (p->result != SZ_OK || p->streamEndWasReached || GET_AVAIL_BYTES(p) > p->keepSizeAfter)
190 void MatchFinder_MoveBlock(CMatchFinder *p)
192 const size_t offset = (size_t)(p->buffer - p->bufBase) - p->keepSizeBefore;
193 const size_t keepBefore = (offset & (kBlockMoveAlign - 1)) + p->keepSizeBefore;
194 p->buffer = p->bufBase + keepBefore;
196 p->bufBase + (offset & ~((size_t)kBlockMoveAlign - 1)),
197 keepBefore + (size_t)GET_AVAIL_BYTES(p));
200 /* We call MoveBlock() before ReadBlock().
201 So MoveBlock() can be wasteful operation, if the whole input data
202 can fit in current block even without calling MoveBlock().
203 In the important case where (dataSize <= historySize),
204 the condition (p->blockSize > dataSize + p->keepSizeAfter) is met,
205 so there is no MoveBlock() in that case.
208 int MatchFinder_NeedMove(CMatchFinder *p)
212 if (p->streamEndWasReached || p->result != SZ_OK)
214 return ((size_t)(p->bufBase + p->blockSize - p->buffer) <= p->keepSizeAfter);
217 void MatchFinder_ReadIfRequired(CMatchFinder *p)
219 if (p->keepSizeAfter >= GET_AVAIL_BYTES(p))
220 MatchFinder_ReadBlock(p);
225 static void MatchFinder_SetDefaultSettings(CMatchFinder *p)
230 p->numHashBytes_Min = 2;
231 p->numHashOutBits = 0;
235 #define kCrcPoly 0xEDB88320
237 void MatchFinder_Construct(CMatchFinder *p)
245 p->expectedDataSize = (UInt64)(Int64)-1;
246 MatchFinder_SetDefaultSettings(p);
248 for (i = 0; i < 256; i++)
250 UInt32 r = (UInt32)i;
252 for (j = 0; j < 8; j++)
253 r = (r >> 1) ^ (kCrcPoly & ((UInt32)0 - (r & 1)));
260 static void MatchFinder_FreeThisClassMemory(CMatchFinder *p, ISzAllocPtr alloc)
262 ISzAlloc_Free(alloc, p->hash);
266 void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc)
268 MatchFinder_FreeThisClassMemory(p, alloc);
269 LzInWindow_Free(p, alloc);
272 static CLzRef* AllocRefs(size_t num, ISzAllocPtr alloc)
274 const size_t sizeInBytes = (size_t)num * sizeof(CLzRef);
275 if (sizeInBytes / sizeof(CLzRef) != num)
277 return (CLzRef *)ISzAlloc_Alloc(alloc, sizeInBytes);
280 #if (kBlockSizeReserveMin < kBlockSizeAlign * 2)
281 #error Stop_Compiling_Bad_Reserve
286 static UInt32 GetBlockSize(CMatchFinder *p, UInt32 historySize)
288 UInt32 blockSize = (p->keepSizeBefore + p->keepSizeAfter);
290 if (historySize > kMaxHistorySize)
293 // printf("\nhistorySize == 0x%x\n", historySize);
295 if (p->keepSizeBefore < historySize || blockSize < p->keepSizeBefore) // if 32-bit overflow
299 const UInt32 kBlockSizeMax = (UInt32)0 - (UInt32)kBlockSizeAlign;
300 const UInt32 rem = kBlockSizeMax - blockSize;
301 const UInt32 reserve = (blockSize >> (blockSize < ((UInt32)1 << 30) ? 1 : 2))
302 + (1 << 12) + kBlockMoveAlign + kBlockSizeAlign; // do not overflow 32-bit here
303 if (blockSize >= kBlockSizeMax
304 || rem < kBlockSizeReserveMin) // we reject settings that will be slow
307 blockSize = kBlockSizeMax;
310 blockSize += reserve;
311 blockSize &= ~(UInt32)(kBlockSizeAlign - 1);
314 // printf("\n LzFind_blockSize = %x\n", blockSize);
315 // printf("\n LzFind_blockSize = %d\n", blockSize >> 20);
320 // input is historySize
321 static UInt32 MatchFinder_GetHashMask2(CMatchFinder *p, UInt32 hs)
323 if (p->numHashBytes == 2)
324 return (1 << 16) - 1;
331 // we propagated 16 bits in (hs). Low 16 bits must be set later
334 if (p->numHashBytes == 3)
336 /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */
338 // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
339 hs |= (1 << 16) - 1; /* don't change it! */
340 // bt5: we adjust the size with recommended minimum size
341 if (p->numHashBytes >= 5)
342 hs |= (256 << kLzHash_CrcShift_2) - 1;
346 // input is historySize
347 static UInt32 MatchFinder_GetHashMask(CMatchFinder *p, UInt32 hs)
349 if (p->numHashBytes == 2)
350 return (1 << 16) - 1;
357 // we propagated 16 bits in (hs). Low 16 bits must be set later
361 if (p->numHashBytes == 3)
365 /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */
367 // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
368 hs |= (1 << 16) - 1; /* don't change it! */
369 // bt5: we adjust the size with recommended minimum size
370 if (p->numHashBytes >= 5)
371 hs |= (256 << kLzHash_CrcShift_2) - 1;
376 int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
377 UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter,
380 /* we need one additional byte in (p->keepSizeBefore),
381 since we use MoveBlock() after (p->pos++) and before dictionary using */
382 // keepAddBufferBefore = (UInt32)0xFFFFFFFF - (1 << 22); // for debug
383 p->keepSizeBefore = historySize + keepAddBufferBefore + 1;
385 keepAddBufferAfter += matchMaxLen;
386 /* we need (p->keepSizeAfter >= p->numHashBytes) */
387 if (keepAddBufferAfter < p->numHashBytes)
388 keepAddBufferAfter = p->numHashBytes;
389 // keepAddBufferAfter -= 2; // for debug
390 p->keepSizeAfter = keepAddBufferAfter;
394 if (p->directInput || LzInWindow_Create2(p, GetBlockSize(p, historySize), alloc))
401 if (p->numHashOutBits != 0)
403 unsigned numBits = p->numHashOutBits;
404 const unsigned nbMax =
405 (p->numHashBytes == 2 ? 16 :
406 (p->numHashBytes == 3 ? 24 : 32));
412 hs = ((UInt32)1 << numBits) - 1;
413 // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
414 hs |= (1 << 16) - 1; /* don't change it! */
415 if (p->numHashBytes >= 5)
416 hs |= (256 << kLzHash_CrcShift_2) - 1;
418 const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize);
423 if (p->expectedDataSize < historySize)
425 const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize);
432 hs = MatchFinder_GetHashMask(p, historySize);
434 if (p->expectedDataSize < historySize)
436 hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize);
437 if (hsCur > hs) // is it possible?
446 if (hashSizeSum < hs)
449 UInt32 fixedHashSize = 0;
450 if (p->numHashBytes > 2 && p->numHashBytes_Min <= 2) fixedHashSize += kHash2Size;
451 if (p->numHashBytes > 3 && p->numHashBytes_Min <= 3) fixedHashSize += kHash3Size;
452 // if (p->numHashBytes > 4) p->fixedHashSize += hs4; // kHash4Size;
453 hashSizeSum += fixedHashSize;
454 p->fixedHashSize = fixedHashSize;
458 p->matchMaxLen = matchMaxLen;
463 const UInt32 newCyclicBufferSize = historySize + 1; // do not change it
464 p->historySize = historySize;
465 p->cyclicBufferSize = newCyclicBufferSize; // it must be = (historySize + 1)
467 numSons = newCyclicBufferSize;
470 newSize = hashSizeSum + numSons;
472 if (numSons < newCyclicBufferSize || newSize < numSons)
475 // aligned size is not required here, but it can be better for some loops
476 #define NUM_REFS_ALIGN_MASK 0xF
477 newSize = (newSize + NUM_REFS_ALIGN_MASK) & ~(size_t)NUM_REFS_ALIGN_MASK;
479 // 22.02: we don't reallocate buffer, if old size is enough
480 if (p->hash && p->numRefs >= newSize)
483 MatchFinder_FreeThisClassMemory(p, alloc);
484 p->numRefs = newSize;
485 p->hash = AllocRefs(newSize, alloc);
489 p->son = p->hash + hashSizeSum;
495 MatchFinder_Free(p, alloc);
500 static void MatchFinder_SetLimits(CMatchFinder *p)
503 UInt32 n = kMaxValForNormalize - p->pos;
505 n = (UInt32)(Int32)-1; // we allow (pos == 0) at start even with (kMaxValForNormalize == 0)
507 k = p->cyclicBufferSize - p->cyclicBufferPos;
511 k = GET_AVAIL_BYTES(p);
513 const UInt32 ksa = p->keepSizeAfter;
514 UInt32 mm = p->matchMaxLen;
516 k -= ksa; // we must limit exactly to keepSizeAfter for ReadBlock
519 // the limitation for (p->lenLimit) update
520 k -= mm; // optimization : to reduce the number of checks
522 // k = 1; // non-optimized version : for debug
535 p->posLimit = p->pos + n;
539 void MatchFinder_Init_LowHash(CMatchFinder *p)
542 CLzRef *items = p->hash;
543 const size_t numItems = p->fixedHashSize;
544 for (i = 0; i < numItems; i++)
545 items[i] = kEmptyHashValue;
549 void MatchFinder_Init_HighHash(CMatchFinder *p)
552 CLzRef *items = p->hash + p->fixedHashSize;
553 const size_t numItems = (size_t)p->hashMask + 1;
554 for (i = 0; i < numItems; i++)
555 items[i] = kEmptyHashValue;
559 void MatchFinder_Init_4(CMatchFinder *p)
562 p->buffer = p->bufBase;
564 /* kEmptyHashValue = 0 (Zero) is used in hash tables as NO-VALUE marker.
565 the code in CMatchFinderMt expects (pos = 1) */
568 1; // it's smallest optimal value. do not change it
572 p->streamEndWasReached = 0;
576 // (CYC_TO_POS_OFFSET == 0) is expected by some optimized code
577 #define CYC_TO_POS_OFFSET 0
578 // #define CYC_TO_POS_OFFSET 1 // for debug
580 void MatchFinder_Init(void *_p)
582 CMatchFinder *p = (CMatchFinder *)_p;
583 MatchFinder_Init_HighHash(p);
584 MatchFinder_Init_LowHash(p);
585 MatchFinder_Init_4(p);
587 MatchFinder_ReadBlock(p);
589 /* if we init (cyclicBufferPos = pos), then we can use one variable
590 instead of both (cyclicBufferPos) and (pos) : only before (cyclicBufferPos) wrapping */
591 p->cyclicBufferPos = (p->pos - CYC_TO_POS_OFFSET); // init with relation to (pos)
592 // p->cyclicBufferPos = 0; // smallest value
593 // p->son[0] = p->son[1] = 0; // unused: we can init skipped record for speculated accesses.
594 MatchFinder_SetLimits(p);
599 #ifdef MY_CPU_X86_OR_AMD64
600 #if defined(__clang__) && (__clang_major__ >= 4) \
601 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701)
602 // || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900)
604 #define USE_LZFIND_SATUR_SUB_128
605 #define USE_LZFIND_SATUR_SUB_256
606 #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("sse4.1")))
607 #define LZFIND_ATTRIB_AVX2 __attribute__((__target__("avx2")))
608 #elif defined(_MSC_VER)
609 #if (_MSC_VER >= 1600)
610 #define USE_LZFIND_SATUR_SUB_128
612 #if (_MSC_VER >= 1900)
613 #define USE_LZFIND_SATUR_SUB_256
617 #elif defined(MY_CPU_ARM64) \
618 /* || (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) */
620 #if defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
621 || defined(__GNUC__) && (__GNUC__ >= 6)
622 #define USE_LZFIND_SATUR_SUB_128
624 // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("")))
626 #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=neon")))
629 #elif defined(_MSC_VER)
630 #if (_MSC_VER >= 1910)
631 #define USE_LZFIND_SATUR_SUB_128
635 #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
636 #include <arm64_neon.h>
638 #include <arm_neon.h>
644 #ifdef USE_LZFIND_SATUR_SUB_128
646 // #define Z7_SHOW_HW_STATUS
648 #ifdef Z7_SHOW_HW_STATUS
657 #ifdef MY_CPU_ARM_OR_ARM64
660 // #define FORCE_LZFIND_SATUR_SUB_128
662 typedef uint32x4_t LzFind_v128;
663 #define SASUB_128_V(v, s) \
664 vsubq_u32(vmaxq_u32(v, s), s)
666 #else // MY_CPU_ARM_OR_ARM64
668 #include <smmintrin.h> // sse4.1
670 typedef __m128i LzFind_v128;
672 #define SASUB_128_V(v, s) \
673 _mm_sub_epi32(_mm_max_epu32(v, s), s)
675 #endif // MY_CPU_ARM_OR_ARM64
678 #define SASUB_128(i) \
679 *( LzFind_v128 *)( void *)(items + (i) * 4) = SASUB_128_V( \
680 *(const LzFind_v128 *)(const void *)(items + (i) * 4), sub2);
685 #ifdef LZFIND_ATTRIB_SSE41
690 LzFind_SaturSub_128(UInt32 subValue, CLzRef *items, const CLzRef *lim)
692 const LzFind_v128 sub2 =
693 #ifdef MY_CPU_ARM_OR_ARM64
694 vdupq_n_u32(subValue);
696 _mm_set_epi32((Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue);
698 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
701 SASUB_128(0) SASUB_128(1) items += 2 * 4;
702 SASUB_128(0) SASUB_128(1) items += 2 * 4;
704 while (items != lim);
709 #ifdef USE_LZFIND_SATUR_SUB_256
711 #include <immintrin.h> // avx
713 clang :immintrin.h uses
714 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
716 #include <avx2intrin.h>
718 so we need <avxintrin.h> for clang-cl */
720 #if defined(__clang__)
721 #include <avxintrin.h>
722 #include <avx2intrin.h>
726 #define SASUB_256(i) \
727 *( __m256i *)( void *)(items + (i) * 8) = \
728 _mm256_sub_epi32(_mm256_max_epu32( \
729 *(const __m256i *)(const void *)(items + (i) * 8), sub2), sub2);
733 #ifdef LZFIND_ATTRIB_AVX2
738 LzFind_SaturSub_256(UInt32 subValue, CLzRef *items, const CLzRef *lim)
740 const __m256i sub2 = _mm256_set_epi32(
741 (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue,
742 (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue);
743 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
746 SASUB_256(0) SASUB_256(1) items += 2 * 8;
747 SASUB_256(0) SASUB_256(1) items += 2 * 8;
749 while (items != lim);
751 #endif // USE_LZFIND_SATUR_SUB_256
753 #ifndef FORCE_LZFIND_SATUR_SUB_128
754 typedef void (Z7_FASTCALL *LZFIND_SATUR_SUB_CODE_FUNC)(
755 UInt32 subValue, CLzRef *items, const CLzRef *lim);
756 static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub;
757 #endif // FORCE_LZFIND_SATUR_SUB_128
759 #endif // USE_LZFIND_SATUR_SUB_128
762 // kEmptyHashValue must be zero
763 // #define SASUB_32(i) { UInt32 v = items[i]; UInt32 m = v - subValue; if (v < subValue) m = kEmptyHashValue; items[i] = m; }
764 #define SASUB_32(i) { UInt32 v = items[i]; if (v < subValue) v = subValue; items[i] = v - subValue; }
766 #ifdef FORCE_LZFIND_SATUR_SUB_128
768 #define DEFAULT_SaturSub LzFind_SaturSub_128
772 #define DEFAULT_SaturSub LzFind_SaturSub_32
778 LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim)
780 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
783 SASUB_32(0) SASUB_32(1) items += 2;
784 SASUB_32(0) SASUB_32(1) items += 2;
785 SASUB_32(0) SASUB_32(1) items += 2;
786 SASUB_32(0) SASUB_32(1) items += 2;
788 while (items != lim);
795 void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems)
797 #define LZFIND_NORM_ALIGN_BLOCK_SIZE (1 << 7)
798 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
799 for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (LZFIND_NORM_ALIGN_BLOCK_SIZE - 1)) != 0; numItems--)
805 const size_t k_Align_Mask = (LZFIND_NORM_ALIGN_BLOCK_SIZE / 4 - 1);
806 CLzRef *lim = items + (numItems & ~(size_t)k_Align_Mask);
807 numItems &= k_Align_Mask;
810 #if defined(USE_LZFIND_SATUR_SUB_128) && !defined(FORCE_LZFIND_SATUR_SUB_128)
811 if (g_LzFind_SaturSub)
812 g_LzFind_SaturSub(subValue, items, lim);
815 DEFAULT_SaturSub(subValue, items, lim);
819 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
820 for (; numItems != 0; numItems--)
829 // call MatchFinder_CheckLimits() only after (p->pos++) update
832 static void MatchFinder_CheckLimits(CMatchFinder *p)
834 if (// !p->streamEndWasReached && p->result == SZ_OK &&
835 p->keepSizeAfter == GET_AVAIL_BYTES(p))
837 // we try to read only in exact state (p->keepSizeAfter == GET_AVAIL_BYTES(p))
838 if (MatchFinder_NeedMove(p))
839 MatchFinder_MoveBlock(p);
840 MatchFinder_ReadBlock(p);
843 if (p->pos == kMaxValForNormalize)
844 if (GET_AVAIL_BYTES(p) >= p->numHashBytes) // optional optimization for last bytes of data.
846 if we disable normalization for last bytes of data, and
847 if (data_size == 4 GiB), we don't call wasteful normalization,
848 but (pos) will be wrapped over Zero (0) in that case,
849 and we cannot resume normal operation later.
852 // MatchFinder_Normalize(p);
853 /* after normalization we need (p->pos >= p->historySize + 1); */
854 /* we can reduce subValue to aligned value, if want to keep alignment
855 of (p->pos) and (p->buffer) for speculated accesses. */
856 const UInt32 subValue = (p->pos - p->historySize - 1) /* & ~(UInt32)(kNormalizeAlign - 1) */;
857 // const UInt32 subValue = (1 << 15); // for debug
858 // printf("\nMatchFinder_Normalize() subValue == 0x%x\n", subValue);
859 MatchFinder_REDUCE_OFFSETS(p, subValue)
860 MatchFinder_Normalize3(subValue, p->hash, (size_t)p->hashMask + 1 + p->fixedHashSize);
862 size_t numSonRefs = p->cyclicBufferSize;
865 MatchFinder_Normalize3(subValue, p->son, numSonRefs);
869 if (p->cyclicBufferPos == p->cyclicBufferSize)
870 p->cyclicBufferPos = 0;
872 MatchFinder_SetLimits(p);
880 static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
881 size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue,
882 UInt32 *d, unsigned maxLen)
885 son[_cyclicBufferPos] = curMatch;
888 UInt32 delta = pos - curMatch;
889 if (cutValue-- == 0 || delta >= _cyclicBufferSize)
892 const Byte *pb = cur - delta;
893 curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
894 if (pb[maxLen] == cur[maxLen] && *pb == *cur)
897 while (++len != lenLimit)
898 if (pb[len] != cur[len])
913 const Byte *lim = cur + lenLimit;
914 son[_cyclicBufferPos] = curMatch;
922 // if (curMatch2 >= curMatch) return NULL;
923 delta = pos - curMatch;
924 if (delta >= _cyclicBufferSize)
928 curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
929 diff = (ptrdiff_t)0 - (ptrdiff_t)delta;
930 if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff])
933 while (*c == c[diff])
937 d[0] = (UInt32)(lim - cur);
943 const unsigned len = (unsigned)(c - cur);
962 UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
963 size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue,
964 UInt32 *d, UInt32 maxLen)
966 CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;
967 CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);
968 unsigned len0 = 0, len1 = 0;
972 // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; }
974 cmCheck = (UInt32)(pos - _cyclicBufferSize);
975 if ((UInt32)pos <= _cyclicBufferSize)
978 if (cmCheck < curMatch)
981 const UInt32 delta = pos - curMatch;
983 CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
984 const Byte *pb = cur - delta;
985 unsigned len = (len0 < len1 ? len0 : len1);
986 const UInt32 pair0 = pair[0];
987 if (pb[len] == cur[len])
989 if (++len != lenLimit && pb[len] == cur[len])
990 while (++len != lenLimit)
991 if (pb[len] != cur[len])
995 maxLen = (UInt32)len;
1006 if (pb[len] < cur[len])
1009 // const UInt32 curMatch2 = pair[1];
1010 // if (curMatch2 >= curMatch) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; }
1011 // curMatch = curMatch2;
1025 while(--cutValue && cmCheck < curMatch);
1027 *ptr0 = *ptr1 = kEmptyHashValue;
1032 static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
1033 size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue)
1035 CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;
1036 CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);
1037 unsigned len0 = 0, len1 = 0;
1041 cmCheck = (UInt32)(pos - _cyclicBufferSize);
1042 if ((UInt32)pos <= _cyclicBufferSize)
1045 if (// curMatch >= pos || // failure
1049 const UInt32 delta = pos - curMatch;
1051 CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
1052 const Byte *pb = cur - delta;
1053 unsigned len = (len0 < len1 ? len0 : len1);
1054 if (pb[len] == cur[len])
1056 while (++len != lenLimit)
1057 if (pb[len] != cur[len])
1060 if (len == lenLimit)
1068 if (pb[len] < cur[len])
1084 while(--cutValue && cmCheck < curMatch);
1086 *ptr0 = *ptr1 = kEmptyHashValue;
1092 p->cyclicBufferPos++; \
1094 { const UInt32 pos1 = p->pos + 1; \
1096 if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); }
1098 #define MOVE_POS_RET MOVE_POS return distances;
1101 static void MatchFinder_MovePos(CMatchFinder *p)
1103 /* we go here at the end of stream data, when (avail < num_hash_bytes)
1104 We don't update sons[cyclicBufferPos << btMode].
1105 So (sons) record will contain junk. And we cannot resume match searching
1106 to normal operation, even if we will provide more input data in buffer.
1107 p->sons[p->cyclicBufferPos << p->btMode] = 0; // kEmptyHashValue
1109 p->sons[(p->cyclicBufferPos << p->btMode) + 1] = 0; // kEmptyHashValue
1114 #define GET_MATCHES_HEADER2(minLen, ret_op) \
1115 UInt32 hv; const Byte *cur; UInt32 curMatch; \
1116 UInt32 lenLimit = p->lenLimit; \
1117 if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; } \
1120 #define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances)
1121 #define SKIP_HEADER(minLen) \
1122 do { GET_MATCHES_HEADER2(minLen, continue)
1124 #define MF_PARAMS(p) lenLimit, curMatch, p->pos, p->buffer, p->son, \
1125 p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue
1127 #define SKIP_FOOTER \
1128 SkipMatchesSpec(MF_PARAMS(p)); \
1132 #define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \
1133 distances = func(MF_PARAMS(p), distances, (UInt32)_maxLen_); \
1136 #define GET_MATCHES_FOOTER_BT(_maxLen_) \
1137 GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1)
1139 #define GET_MATCHES_FOOTER_HC(_maxLen_) \
1140 GET_MATCHES_FOOTER_BASE(_maxLen_, Hc_GetMatchesSpec)
1144 #define UPDATE_maxLen { \
1145 const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)d2; \
1146 const Byte *c = cur + maxLen; \
1147 const Byte *lim = cur + lenLimit; \
1148 for (; c != lim; c++) if (*(c + diff) != *c) break; \
1149 maxLen = (unsigned)(c - cur); }
1151 static UInt32* Bt2_MatchFinder_GetMatches(void *_p, UInt32 *distances)
1153 CMatchFinder *p = (CMatchFinder *)_p;
1154 GET_MATCHES_HEADER(2)
1156 curMatch = p->hash[hv];
1157 p->hash[hv] = p->pos;
1158 GET_MATCHES_FOOTER_BT(1)
1161 UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
1163 GET_MATCHES_HEADER(3)
1165 curMatch = p->hash[hv];
1166 p->hash[hv] = p->pos;
1167 GET_MATCHES_FOOTER_BT(2)
1172 mmm = p->cyclicBufferSize; \
1177 static UInt32* Bt3_MatchFinder_GetMatches(void *_p, UInt32 *distances)
1179 CMatchFinder *p = (CMatchFinder *)_p;
1184 GET_MATCHES_HEADER(3)
1191 d2 = pos - hash[h2];
1193 curMatch = (hash + kFix3HashSize)[hv];
1196 (hash + kFix3HashSize)[hv] = pos;
1202 if (d2 < mmm && *(cur - d2) == *cur)
1205 distances[0] = (UInt32)maxLen;
1206 distances[1] = d2 - 1;
1208 if (maxLen == lenLimit)
1210 SkipMatchesSpec(MF_PARAMS(p));
1215 GET_MATCHES_FOOTER_BT(maxLen)
1219 static UInt32* Bt4_MatchFinder_GetMatches(void *_p, UInt32 *distances)
1221 CMatchFinder *p = (CMatchFinder *)_p;
1223 UInt32 h2, h3, d2, d3, pos;
1226 GET_MATCHES_HEADER(4)
1233 d2 = pos - hash [h2];
1234 d3 = pos - (hash + kFix3HashSize)[h3];
1235 curMatch = (hash + kFix4HashSize)[hv];
1238 (hash + kFix3HashSize)[h3] = pos;
1239 (hash + kFix4HashSize)[hv] = pos;
1247 if (d2 < mmm && *(cur - d2) == *cur)
1250 distances[1] = d2 - 1;
1252 if (*(cur - d2 + 2) == cur[2])
1254 // distances[-2] = 3;
1256 else if (d3 < mmm && *(cur - d3) == *cur)
1259 distances[1] = d3 - 1;
1265 else if (d3 < mmm && *(cur - d3) == *cur)
1268 distances[1] = d3 - 1;
1275 distances[-2] = (UInt32)maxLen;
1276 if (maxLen == lenLimit)
1278 SkipMatchesSpec(MF_PARAMS(p));
1284 GET_MATCHES_FOOTER_BT(maxLen)
1288 static UInt32* Bt5_MatchFinder_GetMatches(void *_p, UInt32 *distances)
1290 CMatchFinder *p = (CMatchFinder *)_p;
1292 UInt32 h2, h3, d2, d3, pos;
1295 GET_MATCHES_HEADER(5)
1302 d2 = pos - hash [h2];
1303 d3 = pos - (hash + kFix3HashSize)[h3];
1304 // d4 = pos - (hash + kFix4HashSize)[h4];
1306 curMatch = (hash + kFix5HashSize)[hv];
1309 (hash + kFix3HashSize)[h3] = pos;
1310 // (hash + kFix4HashSize)[h4] = pos;
1311 (hash + kFix5HashSize)[hv] = pos;
1319 if (d2 < mmm && *(cur - d2) == *cur)
1322 distances[1] = d2 - 1;
1324 if (*(cur - d2 + 2) == cur[2])
1327 else if (d3 < mmm && *(cur - d3) == *cur)
1329 distances[1] = d3 - 1;
1336 else if (d3 < mmm && *(cur - d3) == *cur)
1338 distances[1] = d3 - 1;
1346 if (*(cur - d2 + 3) != cur[3])
1349 distances[-2] = (UInt32)maxLen;
1350 if (maxLen == lenLimit)
1352 SkipMatchesSpec(MF_PARAMS(p));
1358 GET_MATCHES_FOOTER_BT(maxLen)
1362 static UInt32* Hc4_MatchFinder_GetMatches(void *_p, UInt32 *distances)
1364 CMatchFinder *p = (CMatchFinder *)_p;
1366 UInt32 h2, h3, d2, d3, pos;
1369 GET_MATCHES_HEADER(4)
1376 d2 = pos - hash [h2];
1377 d3 = pos - (hash + kFix3HashSize)[h3];
1378 curMatch = (hash + kFix4HashSize)[hv];
1381 (hash + kFix3HashSize)[h3] = pos;
1382 (hash + kFix4HashSize)[hv] = pos;
1390 if (d2 < mmm && *(cur - d2) == *cur)
1393 distances[1] = d2 - 1;
1395 if (*(cur - d2 + 2) == cur[2])
1397 // distances[-2] = 3;
1399 else if (d3 < mmm && *(cur - d3) == *cur)
1402 distances[1] = d3 - 1;
1408 else if (d3 < mmm && *(cur - d3) == *cur)
1411 distances[1] = d3 - 1;
1418 distances[-2] = (UInt32)maxLen;
1419 if (maxLen == lenLimit)
1421 p->son[p->cyclicBufferPos] = curMatch;
1427 GET_MATCHES_FOOTER_HC(maxLen)
1431 static UInt32 * Hc5_MatchFinder_GetMatches(void *_p, UInt32 *distances)
1433 CMatchFinder *p = (CMatchFinder *)_p;
1435 UInt32 h2, h3, d2, d3, pos;
1438 GET_MATCHES_HEADER(5)
1445 d2 = pos - hash [h2];
1446 d3 = pos - (hash + kFix3HashSize)[h3];
1447 // d4 = pos - (hash + kFix4HashSize)[h4];
1449 curMatch = (hash + kFix5HashSize)[hv];
1452 (hash + kFix3HashSize)[h3] = pos;
1453 // (hash + kFix4HashSize)[h4] = pos;
1454 (hash + kFix5HashSize)[hv] = pos;
1462 if (d2 < mmm && *(cur - d2) == *cur)
1465 distances[1] = d2 - 1;
1467 if (*(cur - d2 + 2) == cur[2])
1470 else if (d3 < mmm && *(cur - d3) == *cur)
1472 distances[1] = d3 - 1;
1479 else if (d3 < mmm && *(cur - d3) == *cur)
1481 distances[1] = d3 - 1;
1489 if (*(cur - d2 + 3) != cur[3])
1492 distances[-2] = (UInt32)maxLen;
1493 if (maxLen == lenLimit)
1495 p->son[p->cyclicBufferPos] = curMatch;
1501 GET_MATCHES_FOOTER_HC(maxLen)
1505 UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
1507 GET_MATCHES_HEADER(3)
1509 curMatch = p->hash[hv];
1510 p->hash[hv] = p->pos;
1511 GET_MATCHES_FOOTER_HC(2)
1515 static void Bt2_MatchFinder_Skip(void *_p, UInt32 num)
1517 CMatchFinder *p = (CMatchFinder *)_p;
1521 curMatch = p->hash[hv];
1522 p->hash[hv] = p->pos;
1527 void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
1532 curMatch = p->hash[hv];
1533 p->hash[hv] = p->pos;
1538 static void Bt3_MatchFinder_Skip(void *_p, UInt32 num)
1540 CMatchFinder *p = (CMatchFinder *)_p;
1547 curMatch = (hash + kFix3HashSize)[hv];
1549 (hash + kFix3HashSize)[hv] = p->pos;
1554 static void Bt4_MatchFinder_Skip(void *_p, UInt32 num)
1556 CMatchFinder *p = (CMatchFinder *)_p;
1563 curMatch = (hash + kFix4HashSize)[hv];
1565 (hash + kFix3HashSize)[h3] =
1566 (hash + kFix4HashSize)[hv] = p->pos;
1571 static void Bt5_MatchFinder_Skip(void *_p, UInt32 num)
1573 CMatchFinder *p = (CMatchFinder *)_p;
1580 curMatch = (hash + kFix5HashSize)[hv];
1582 (hash + kFix3HashSize)[h3] =
1583 // (hash + kFix4HashSize)[h4] =
1584 (hash + kFix5HashSize)[hv] = p->pos;
1590 #define HC_SKIP_HEADER(minLen) \
1591 do { if (p->lenLimit < minLen) { MatchFinder_MovePos(p); num--; continue; } { \
1595 UInt32 pos = p->pos; \
1596 UInt32 num2 = num; \
1597 /* (p->pos == p->posLimit) is not allowed here !!! */ \
1598 { const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \
1600 { const UInt32 cycPos = p->cyclicBufferPos; \
1601 son = p->son + cycPos; \
1602 p->cyclicBufferPos = cycPos + num2; } \
1610 #define HC_SKIP_FOOTER \
1611 cur++; pos++; *son++ = curMatch; \
1615 if (pos == p->posLimit) MatchFinder_CheckLimits(p); \
1619 static void Hc4_MatchFinder_Skip(void *_p, UInt32 num)
1621 CMatchFinder *p = (CMatchFinder *)_p;
1626 curMatch = (hash + kFix4HashSize)[hv];
1628 (hash + kFix3HashSize)[h3] =
1629 (hash + kFix4HashSize)[hv] = pos;
1635 static void Hc5_MatchFinder_Skip(void *_p, UInt32 num)
1637 CMatchFinder *p = (CMatchFinder *)_p;
1642 curMatch = (hash + kFix5HashSize)[hv];
1644 (hash + kFix3HashSize)[h3] =
1645 // (hash + kFix4HashSize)[h4] =
1646 (hash + kFix5HashSize)[hv] = pos;
1652 void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
1657 curMatch = hash[hv];
1664 void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable)
1666 vTable->Init = MatchFinder_Init;
1667 vTable->GetNumAvailableBytes = MatchFinder_GetNumAvailableBytes;
1668 vTable->GetPointerToCurrentPos = MatchFinder_GetPointerToCurrentPos;
1671 if (p->numHashBytes <= 4)
1673 vTable->GetMatches = Hc4_MatchFinder_GetMatches;
1674 vTable->Skip = Hc4_MatchFinder_Skip;
1678 vTable->GetMatches = Hc5_MatchFinder_GetMatches;
1679 vTable->Skip = Hc5_MatchFinder_Skip;
1682 else if (p->numHashBytes == 2)
1684 vTable->GetMatches = Bt2_MatchFinder_GetMatches;
1685 vTable->Skip = Bt2_MatchFinder_Skip;
1687 else if (p->numHashBytes == 3)
1689 vTable->GetMatches = Bt3_MatchFinder_GetMatches;
1690 vTable->Skip = Bt3_MatchFinder_Skip;
1692 else if (p->numHashBytes == 4)
1694 vTable->GetMatches = Bt4_MatchFinder_GetMatches;
1695 vTable->Skip = Bt4_MatchFinder_Skip;
1699 vTable->GetMatches = Bt5_MatchFinder_GetMatches;
1700 vTable->Skip = Bt5_MatchFinder_Skip;
1706 void LzFindPrepare(void)
1708 #ifndef FORCE_LZFIND_SATUR_SUB_128
1709 #ifdef USE_LZFIND_SATUR_SUB_128
1710 LZFIND_SATUR_SUB_CODE_FUNC f = NULL;
1711 #ifdef MY_CPU_ARM_OR_ARM64
1713 if (CPU_IsSupported_NEON())
1715 // #pragma message ("=== LzFind NEON")
1716 PRF(printf("\n=== LzFind NEON\n"));
1717 f = LzFind_SaturSub_128;
1719 // f = 0; // for debug
1721 #else // MY_CPU_ARM_OR_ARM64
1722 if (CPU_IsSupported_SSE41())
1724 // #pragma message ("=== LzFind SSE41")
1725 PRF(printf("\n=== LzFind SSE41\n"));
1726 f = LzFind_SaturSub_128;
1728 #ifdef USE_LZFIND_SATUR_SUB_256
1729 if (CPU_IsSupported_AVX2())
1731 // #pragma message ("=== LzFind AVX2")
1732 PRF(printf("\n=== LzFind AVX2\n"));
1733 f = LzFind_SaturSub_256;
1737 #endif // MY_CPU_ARM_OR_ARM64
1738 g_LzFind_SaturSub = f;
1739 #endif // USE_LZFIND_SATUR_SUB_128
1740 #endif // FORCE_LZFIND_SATUR_SUB_128