1 /* LzFindMt.c -- multithreaded Match finder for LZ algorithms
\r
2 2021-12-21 : Igor Pavlov : Public domain */
\r
6 // #include <stdio.h>
\r
11 #include "LzFindMt.h"
\r
13 // #define LOG_ITERS
\r
15 // #define LOG_THREAD
\r
26 extern UInt64 g_NumIters_Tree;
\r
27 extern UInt64 g_NumIters_Loop;
\r
28 extern UInt64 g_NumIters_Bytes;
\r
29 #define LOG_ITER(x) x
\r
34 #define kMtHashBlockSize ((UInt32)1 << 17)
\r
35 #define kMtHashNumBlocks (1 << 1)
\r
/* Maps a monotonically increasing block index to an offset (in UInt32 units)
   within the circular hash buffer: the mask wraps the index over
   kMtHashNumBlocks blocks of kMtHashBlockSize entries each. */
37 #define GET_HASH_BLOCK_OFFSET(i) (((i) & (kMtHashNumBlocks - 1)) * kMtHashBlockSize)
\r
39 #define kMtBtBlockSize ((UInt32)1 << 16)
\r
40 #define kMtBtNumBlocks (1 << 4)
\r
/* Same circular mapping for the BT (binary tree) block buffer.
   The (size_t) cast keeps the multiplication from overflowing 32-bit
   arithmetic when the result is used as a buffer offset. */
42 #define GET_BT_BLOCK_OFFSET(i) (((i) & (kMtBtNumBlocks - 1)) * (size_t)kMtBtBlockSize)
\r
46 We use raw 8/16 bits from a[1] and a[2],
\r
47 xored with crc(a[0]) and crc(a[3]).
\r
48 We check a[0], a[3] only. We don't need to compare a[1] and a[2] in matches.
\r
49 our crc() function provides one-to-one correspondence for low 8-bit values:
\r
50 (crc[0...0xFF] & 0xFF) <-> [0...0xFF]
\r
53 #define MF(mt) ((mt)->MatchFinder)
\r
54 #define MF_CRC (p->crc)
\r
56 // #define MF(mt) (&(mt)->MatchFinder)
\r
57 // #define MF_CRC (p->MatchFinder.crc)
\r
59 #define MT_HASH2_CALC \
\r
60 h2 = (MF_CRC[cur[0]] ^ cur[1]) & (kHash2Size - 1);
\r
62 #define MT_HASH3_CALC { \
\r
63 UInt32 temp = MF_CRC[cur[0]] ^ cur[1]; \
\r
64 h2 = temp & (kHash2Size - 1); \
\r
65 h3 = (temp ^ ((UInt32)cur[2] << 8)) & (kHash3Size - 1); }
\r
68 #define MT_HASH3_CALC__NO_2 { \
\r
69 UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
\r
70 h3 = (temp ^ ((UInt32)cur[2] << 8)) & (kHash3Size - 1); }
\r
72 #define __MT_HASH4_CALC { \
\r
73 UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
\r
74 h2 = temp & (kHash2Size - 1); \
\r
75 temp ^= ((UInt32)cur[2] << 8); \
\r
76 h3 = temp & (kHash3Size - 1); \
\r
77 h4 = (temp ^ (p->crc[cur[3]] << kLzHash_CrcShift_1)) & p->hash4Mask; }
\r
78 // (kHash4Size - 1);
\r
83 static void MtSync_Construct(CMtSync *p)
\r
86 p->wasCreated = False;
\r
87 p->csWasInitialized = False;
\r
88 p->csWasEntered = False;
\r
89 Thread_Construct(&p->thread);
\r
90 Event_Construct(&p->canStart);
\r
91 Event_Construct(&p->wasStopped);
\r
92 Semaphore_Construct(&p->freeSemaphore);
\r
93 Semaphore_Construct(&p->filledSemaphore);
\r
97 #define DEBUG_BUFFER_LOCK // define it to debug lock state
\r
99 #ifdef DEBUG_BUFFER_LOCK
\r
100 #include <stdlib.h>
\r
101 #define BUFFER_MUST_BE_LOCKED(p) if (!(p)->csWasEntered) exit(1);
\r
102 #define BUFFER_MUST_BE_UNLOCKED(p) if ( (p)->csWasEntered) exit(1);
\r
104 #define BUFFER_MUST_BE_LOCKED(p)
\r
105 #define BUFFER_MUST_BE_UNLOCKED(p)
\r
108 #define LOCK_BUFFER(p) { \
\r
109 BUFFER_MUST_BE_UNLOCKED(p); \
\r
110 CriticalSection_Enter(&(p)->cs); \
\r
111 (p)->csWasEntered = True; }
\r
113 #define UNLOCK_BUFFER(p) { \
\r
114 BUFFER_MUST_BE_LOCKED(p); \
\r
115 CriticalSection_Leave(&(p)->cs); \
\r
116 (p)->csWasEntered = False; }
\r
120 static UInt32 MtSync_GetNextBlock(CMtSync *p)
\r
122 UInt32 numBlocks = 0;
\r
125 BUFFER_MUST_BE_UNLOCKED(p)
\r
126 p->numProcessedBlocks = 1;
\r
127 p->needStart = False;
\r
128 p->stopWriting = False;
\r
130 Event_Reset(&p->wasStopped);
\r
131 Event_Set(&p->canStart);
\r
136 // we free current block
\r
137 numBlocks = p->numProcessedBlocks++;
\r
138 Semaphore_Release1(&p->freeSemaphore);
\r
141 // buffer is UNLOCKED here
\r
142 Semaphore_Wait(&p->filledSemaphore);
\r
148 /* if Writing (Processing) thread was started, we must call MtSync_StopWriting() */
\r
151 static void MtSync_StopWriting(CMtSync *p)
\r
153 if (!Thread_WasCreated(&p->thread) || p->needStart)
\r
156 PRF(printf("\nMtSync_StopWriting %p\n", p));
\r
158 if (p->csWasEntered)
\r
160 /* we don't use buffer in this thread after StopWriting().
\r
161 So we UNLOCK buffer.
\r
162 And we restore default UNLOCKED state for stopped thread */
\r
166 /* We send (p->stopWriting) message and release freeSemaphore
\r
167 to free current block.
\r
168 So the thread will see (p->stopWriting) at some
\r
169 iteration after Wait(freeSemaphore).
\r
170 The thread doesn't need to fill all avail free blocks,
\r
171 so we can get fast thread stop.
\r
174 p->stopWriting = True;
\r
175 Semaphore_Release1(&p->freeSemaphore); // check semaphore count !!!
\r
177 PRF(printf("\nMtSync_StopWriting %p : Event_Wait(&p->wasStopped)\n", p));
\r
178 Event_Wait(&p->wasStopped);
\r
179 PRF(printf("\nMtSync_StopWriting %p : Event_Wait() finsihed\n", p));
\r
181 /* 21.03 : we don't restore semaphore counters here.
\r
182 We will recreate and reinit semaphores in next start */
\r
184 p->needStart = True;
\r
189 static void MtSync_Destruct(CMtSync *p)
\r
191 PRF(printf("\nMtSync_Destruct %p\n", p));
\r
193 if (Thread_WasCreated(&p->thread))
\r
195 /* we want thread to be in Stopped state before sending EXIT command.
\r
196 note: stop(btSync) will stop (htSync) also */
\r
197 MtSync_StopWriting(p);
\r
198 /* thread in Stopped state here : (p->needStart == true) */
\r
200 // if (p->needStart) // it's (true)
\r
201 Event_Set(&p->canStart); // we send EXIT command to thread
\r
202 Thread_Wait_Close(&p->thread); // we wait thread finishing
\r
205 if (p->csWasInitialized)
\r
207 CriticalSection_Delete(&p->cs);
\r
208 p->csWasInitialized = False;
\r
210 p->csWasEntered = False;
\r
212 Event_Close(&p->canStart);
\r
213 Event_Close(&p->wasStopped);
\r
214 Semaphore_Close(&p->freeSemaphore);
\r
215 Semaphore_Close(&p->filledSemaphore);
\r
217 p->wasCreated = False;
\r
221 // #define RINOK_THREAD(x) { if ((x) != 0) return SZ_ERROR_THREAD; }
\r
222 // we want to get real system error codes here instead of SZ_ERROR_THREAD
\r
223 #define RINOK_THREAD(x) RINOK(x)
\r
226 // call it before each new file (when new starting is required):
\r
228 static SRes MtSync_Init(CMtSync *p, UInt32 numBlocks)
\r
231 // BUFFER_MUST_BE_UNLOCKED(p)
\r
232 if (!p->needStart || p->csWasEntered)
\r
233 return SZ_ERROR_FAIL;
\r
234 wres = Semaphore_OptCreateInit(&p->freeSemaphore, numBlocks, numBlocks);
\r
236 wres = Semaphore_OptCreateInit(&p->filledSemaphore, 0, numBlocks);
\r
237 return MY_SRes_HRESULT_FROM_WRes(wres);
\r
241 static WRes MtSync_Create_WRes(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj)
\r
248 RINOK_THREAD(CriticalSection_Init(&p->cs));
\r
249 p->csWasInitialized = True;
\r
250 p->csWasEntered = False;
\r
252 RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->canStart));
\r
253 RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->wasStopped));
\r
255 p->needStart = True;
\r
256 p->exit = True; /* p->exit is unused before (canStart) Event.
\r
257 But in case of some unexpected code failure we will get fast exit from thread */
\r
259 // return ERROR_TOO_MANY_POSTS; // for debug
\r
260 // return EINVAL; // for debug
\r
262 if (p->affinity != 0)
\r
263 wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity);
\r
265 wres = Thread_Create(&p->thread, startAddress, obj);
\r
267 RINOK_THREAD(wres);
\r
268 p->wasCreated = True;
\r
274 static SRes MtSync_Create(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj)
\r
276 const WRes wres = MtSync_Create_WRes(p, startAddress, obj);
\r
279 MtSync_Destruct(p);
\r
280 return MY_SRes_HRESULT_FROM_WRes(wres);
\r
284 // ---------- HASH THREAD ----------
\r
/* Position threshold that triggers normalization: when a 32-bit position
   counter approaches this value, stored positions are reduced by a subValue
   (see the MatchFinder_Normalize3 call sites) to prevent UInt32 overflow. */
286 #define kMtMaxValForNormalize 0xFFFFFFFF
\r
287 // #define kMtMaxValForNormalize ((1 << 21)) // for debug
\r
288 // #define kNormalizeAlign (1 << 7) // alignment for speculated accesses
\r
290 #ifdef MY_CPU_LE_UNALIGN
\r
291 #define GetUi24hi_from32(p) ((UInt32)GetUi32(p) >> 8)
\r
293 #define GetUi24hi_from32(p) ((p)[1] ^ ((UInt32)(p)[2] << 8) ^ ((UInt32)(p)[3] << 16))
\r
296 #define GetHeads_DECL(name) \
\r
297 static void GetHeads ## name(const Byte *p, UInt32 pos, \
\r
298 UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc)
\r
300 #define GetHeads_LOOP(v) \
\r
301 for (; numHeads != 0; numHeads--) { \
\r
302 const UInt32 value = (v); \
\r
304 *heads++ = pos - hash[value]; \
\r
305 hash[value] = pos++; }
\r
307 #define DEF_GetHeads2(name, v, action) \
\r
308 GetHeads_DECL(name) { action \
\r
311 #define DEF_GetHeads(name, v) DEF_GetHeads2(name, v, ;)
\r
313 DEF_GetHeads2(2, GetUi16(p), UNUSED_VAR(hashMask); UNUSED_VAR(crc); )
\r
314 DEF_GetHeads(3, (crc[p[0]] ^ GetUi16(p + 1)) & hashMask)
\r
315 DEF_GetHeads2(3b, GetUi16(p) ^ ((UInt32)(p)[2] << 16), UNUSED_VAR(hashMask); UNUSED_VAR(crc); )
\r
316 // BT3 is not good for crc collisions for big hashMask values.
\r
321 UNUSED_VAR(hashMask);
\r
324 const Byte *pLim = p + numHeads;
\r
330 UInt32 v1 = GetUi32(p);
\r
331 UInt32 v0 = v1 & 0xFFFFFF;
\r
335 h0 = hash[v0]; hash[v0] = pos; heads[0] = pos - h0; pos++;
\r
336 h1 = hash[v1]; hash[v1] = pos; heads[1] = pos - h1; pos++;
\r
341 UInt32 v0 = GetUi16(p) ^ ((UInt32)(p)[2] << 16);
\r
342 *heads = pos - hash[v0];
\r
354 while ((hashMask & 0x80000000) == 0)
\r
359 GetHeads_LOOP((GetUi32(p) * 0xa54a1) >> sh)
\r
361 #define GetHeads4b GetHeads4
\r
364 #define USE_GetHeads_LOCAL_CRC
\r
366 #ifdef USE_GetHeads_LOCAL_CRC
\r
374 for (i = 0; i < 256; i++)
\r
377 crc0[i] = v & hashMask;
\r
378 crc1[i] = (v << kLzHash_CrcShift_1) & hashMask;
\r
379 // crc1[i] = rotlFixed(v, 8) & hashMask;
\r
382 GetHeads_LOOP(crc0[p[0]] ^ crc1[p[3]] ^ (UInt32)GetUi16(p+1))
\r
390 for (i = 0; i < 256; i++)
\r
391 crc0[i] = crc[i] & hashMask;
\r
393 GetHeads_LOOP(crc0[p[0]] ^ GetUi24hi_from32(p))
\r
403 for (i = 0; i < 256; i++)
\r
406 crc0[i] = v & hashMask;
\r
407 crc1[i] = (v << kLzHash_CrcShift_1) & hashMask;
\r
408 crc2[i] = (v << kLzHash_CrcShift_2) & hashMask;
\r
411 GetHeads_LOOP(crc0[p[0]] ^ crc1[p[3]] ^ crc2[p[4]] ^ (UInt32)GetUi16(p+1))
\r
420 for (i = 0; i < 256; i++)
\r
423 crc0[i] = v & hashMask;
\r
424 crc1[i] = (v << kLzHash_CrcShift_1) & hashMask;
\r
427 GetHeads_LOOP(crc0[p[0]] ^ crc1[p[4]] ^ GetUi24hi_from32(p))
\r
432 DEF_GetHeads(4, (crc[p[0]] ^ (crc[p[3]] << kLzHash_CrcShift_1) ^ (UInt32)GetUi16(p+1)) & hashMask)
\r
433 DEF_GetHeads(4b, (crc[p[0]] ^ GetUi24hi_from32(p)) & hashMask)
\r
434 DEF_GetHeads(5, (crc[p[0]] ^ (crc[p[3]] << kLzHash_CrcShift_1) ^ (crc[p[4]] << kLzHash_CrcShift_2) ^ (UInt32)GetUi16(p + 1)) & hashMask)
\r
435 DEF_GetHeads(5b, (crc[p[0]] ^ (crc[p[4]] << kLzHash_CrcShift_1) ^ GetUi24hi_from32(p)) & hashMask)
\r
440 static void HashThreadFunc(CMatchFinderMt *mt)
\r
442 CMtSync *p = &mt->hashSync;
\r
443 PRF(printf("\nHashThreadFunc\n"));
\r
447 UInt32 blockIndex = 0;
\r
448 PRF(printf("\nHashThreadFunc : Event_Wait(&p->canStart)\n"));
\r
449 Event_Wait(&p->canStart);
\r
450 PRF(printf("\nHashThreadFunc : Event_Wait(&p->canStart) : after \n"));
\r
453 PRF(printf("\nHashThreadFunc : exit \n"));
\r
457 MatchFinder_Init_HighHash(MF(mt));
\r
461 PRF(printf("Hash thread block = %d pos = %d\n", (unsigned)blockIndex, mt->MatchFinder->pos));
\r
464 CMatchFinder *mf = MF(mt);
\r
465 if (MatchFinder_NeedMove(mf))
\r
467 CriticalSection_Enter(&mt->btSync.cs);
\r
468 CriticalSection_Enter(&mt->hashSync.cs);
\r
470 const Byte *beforePtr = Inline_MatchFinder_GetPointerToCurrentPos(mf);
\r
472 MatchFinder_MoveBlock(mf);
\r
473 offset = beforePtr - Inline_MatchFinder_GetPointerToCurrentPos(mf);
\r
474 mt->pointerToCurPos -= offset;
\r
475 mt->buffer -= offset;
\r
477 CriticalSection_Leave(&mt->hashSync.cs);
\r
478 CriticalSection_Leave(&mt->btSync.cs);
\r
482 Semaphore_Wait(&p->freeSemaphore);
\r
484 if (p->exit) // exit is unexpected here. But we check it here for some failure case
\r
487 // for faster stop : we check (p->stopWriting) after Wait(freeSemaphore)
\r
488 if (p->stopWriting)
\r
491 MatchFinder_ReadIfRequired(mf);
\r
493 UInt32 *heads = mt->hashBuf + GET_HASH_BLOCK_OFFSET(blockIndex++);
\r
494 UInt32 num = Inline_MatchFinder_GetNumAvailableBytes(mf);
\r
498 /* heads[1] contains the number of avail bytes:
\r
499 if (avail < mf->numHashBytes) :
\r
501 it means that stream was finished
\r
502 HASH_THREAD and BT_THREAD must move position for heads[1] (avail) bytes.
\r
503 HASH_THREAD doesn't stop,
\r
504 HASH_THREAD fills only the header (2 numbers) for all next blocks:
\r
505 {2, NumHashBytes - 1}, {2,0}, {2,0}, ... , {2,0}
\r
509 HASH_THREAD and BT_THREAD must move position for (heads[0] - 2) bytes;
\r
513 if (num >= mf->numHashBytes)
\r
515 num = num - mf->numHashBytes + 1;
\r
516 if (num > kMtHashBlockSize - 2)
\r
517 num = kMtHashBlockSize - 2;
\r
519 if (mf->pos > (UInt32)kMtMaxValForNormalize - num)
\r
521 const UInt32 subValue = (mf->pos - mf->historySize - 1); // & ~(UInt32)(kNormalizeAlign - 1);
\r
522 Inline_MatchFinder_ReduceOffsets(mf, subValue);
\r
523 MatchFinder_Normalize3(subValue, mf->hash + mf->fixedHashSize, (size_t)mf->hashMask + 1);
\r
526 heads[0] = 2 + num;
\r
527 mt->GetHeadsFunc(mf->buffer, mf->pos, mf->hash + mf->fixedHashSize, mf->hashMask, heads + 2, num, mf->crc);
\r
530 mf->pos += num; // wrap over zero is allowed at the end of stream
\r
535 Semaphore_Release1(&p->filledSemaphore);
\r
536 } // for() processing end
\r
538 // p->numBlocks_Sent = blockIndex;
\r
539 Event_Set(&p->wasStopped);
\r
540 } // for() thread end
\r
546 // ---------- BT THREAD ----------
\r
548 /* we use one variable instead of two (cyclicBufferPos == pos) before CyclicBuf wrap.
\r
549 here we define fixed offset of (p->pos) from (p->cyclicBufferPos) */
\r
550 #define CYC_TO_POS_OFFSET 0
\r
551 // #define CYC_TO_POS_OFFSET 1 // for debug
\r
553 #define MFMT_GM_INLINE
\r
555 #ifdef MFMT_GM_INLINE
\r
558 we use size_t for (pos) instead of UInt32
\r
559 to eliminate "movsx" BUG in old MSVC x64 compiler.
\r
563 UInt32 * MY_FAST_CALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son,
\r
564 UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size,
\r
565 size_t _cyclicBufferPos, UInt32 _cyclicBufferSize,
\r
571 static void BtGetMatches(CMatchFinderMt *p, UInt32 *d)
\r
573 UInt32 numProcessed = 0;
\r
576 /* GetMatchesSpec() functions don't create (len = 1)
\r
577 in [len, dist] match pairs, if (p->numHashBytes >= 2)
\r
578 Also we suppose here that (matchMaxLen >= 2).
\r
579 So the following code for (reserve) is not required
\r
580 UInt32 reserve = (p->matchMaxLen * 2);
\r
581 const UInt32 kNumHashBytes_Max = 5; // BT_HASH_BYTES_MAX
\r
582 if (reserve < kNumHashBytes_Max - 1)
\r
583 reserve = kNumHashBytes_Max - 1;
\r
584 const UInt32 limit = kMtBtBlockSize - (reserve);
\r
587 const UInt32 limit = kMtBtBlockSize - (p->matchMaxLen * 2);
\r
589 d[1] = p->hashNumAvail;
\r
593 // printf("\n == 1 BtGetMatches() p->failure_BT\n");
\r
599 while (curPos < limit)
\r
601 if (p->hashBufPos == p->hashBufPosLimit)
\r
603 // MatchFinderMt_GetNextBlock_Hash(p);
\r
606 const UInt32 bi = MtSync_GetNextBlock(&p->hashSync);
\r
607 const UInt32 k = GET_HASH_BLOCK_OFFSET(bi);
\r
608 const UInt32 *h = p->hashBuf + k;
\r
610 p->hashBufPosLimit = k + h[0];
\r
611 p->hashNumAvail = avail;
\r
612 p->hashBufPos = k + 2;
\r
616 /* we must prevent UInt32 overflow for avail total value,
\r
617 if avail was increased with new hash block */
\r
618 UInt32 availSum = numProcessed + avail;
\r
619 if (availSum < numProcessed)
\r
620 availSum = (UInt32)(Int32)-1;
\r
624 if (avail >= p->numHashBytes)
\r
627 // if (p->hashBufPos != p->hashBufPosLimit) exit(1);
\r
629 /* (avail < p->numHashBytes)
\r
630 It means that stream was finished.
\r
631 And (avail) - is a number of remaining bytes,
\r
632 we fill (d) for (avail) bytes for LZ_THREAD (receiver).
\r
633 but we don't update (p->pos) and (p->cyclicBufferPos) here in BT_THREAD */
\r
635 /* here we suppose that we have space enough:
\r
636 (kMtBtBlockSize - curPos >= p->hashNumAvail) */
\r
637 p->hashNumAvail = 0;
\r
638 d[0] = curPos + avail;
\r
640 for (; avail != 0; avail--)
\r
645 UInt32 size = p->hashBufPosLimit - p->hashBufPos;
\r
646 UInt32 pos = p->pos;
\r
647 UInt32 cyclicBufferPos = p->cyclicBufferPos;
\r
648 UInt32 lenLimit = p->matchMaxLen;
\r
649 if (lenLimit >= p->hashNumAvail)
\r
650 lenLimit = p->hashNumAvail;
\r
652 UInt32 size2 = p->hashNumAvail - lenLimit + 1;
\r
655 size2 = p->cyclicBufferSize - cyclicBufferPos;
\r
660 if (pos > (UInt32)kMtMaxValForNormalize - size)
\r
662 const UInt32 subValue = (pos - p->cyclicBufferSize); // & ~(UInt32)(kNormalizeAlign - 1);
\r
665 MatchFinder_Normalize3(subValue, p->son, (size_t)p->cyclicBufferSize * 2);
\r
668 #ifndef MFMT_GM_INLINE
\r
669 while (curPos < limit && size-- != 0)
\r
671 UInt32 *startDistances = d + curPos;
\r
672 UInt32 num = (UInt32)(GetMatchesSpec1(lenLimit, pos - p->hashBuf[p->hashBufPos++],
\r
673 pos, p->buffer, p->son, cyclicBufferPos, p->cyclicBufferSize, p->cutValue,
\r
674 startDistances + 1, p->numHashBytes - 1) - startDistances);
\r
675 *startDistances = num - 1;
\r
683 UInt32 posRes = pos;
\r
684 const UInt32 *d_end;
\r
686 d_end = GetMatchesSpecN_2(
\r
687 p->buffer + lenLimit - 1,
\r
688 pos, p->buffer, p->son, p->cutValue, d + curPos,
\r
689 p->numHashBytes - 1, p->hashBuf + p->hashBufPos,
\r
690 d + limit, p->hashBuf + p->hashBufPos + size,
\r
691 cyclicBufferPos, p->cyclicBufferSize,
\r
697 // printf("\n == 2 BtGetMatches() p->failure_BT\n");
\r
698 // internal data failure
\r
699 p->failure_BT = True;
\r
705 curPos = (UInt32)(d_end - d);
\r
707 const UInt32 processed = posRes - pos;
\r
709 p->hashBufPos += processed;
\r
710 cyclicBufferPos += processed;
\r
711 p->buffer += processed;
\r
717 const UInt32 processed = pos - p->pos;
\r
718 numProcessed += processed;
\r
719 p->hashNumAvail -= processed;
\r
722 if (cyclicBufferPos == p->cyclicBufferSize)
\r
723 cyclicBufferPos = 0;
\r
724 p->cyclicBufferPos = cyclicBufferPos;
\r
732 static void BtFillBlock(CMatchFinderMt *p, UInt32 globalBlockIndex)
\r
734 CMtSync *sync = &p->hashSync;
\r
736 BUFFER_MUST_BE_UNLOCKED(sync)
\r
738 if (!sync->needStart)
\r
743 BtGetMatches(p, p->btBuf + GET_BT_BLOCK_OFFSET(globalBlockIndex));
\r
745 /* We suppose that we have called GetNextBlock() from start.
\r
746 So buffer is LOCKED */
\r
748 UNLOCK_BUFFER(sync)
\r
753 static void BtThreadFunc(CMatchFinderMt *mt)
\r
755 CMtSync *p = &mt->btSync;
\r
758 UInt32 blockIndex = 0;
\r
759 Event_Wait(&p->canStart);
\r
763 PRF(printf(" BT thread block = %d pos = %d\n", (unsigned)blockIndex, mt->pos));
\r
764 /* (p->exit == true) is possible after (p->canStart) at first loop iteration
\r
765 and is unexpected after more Wait(freeSemaphore) iterations */
\r
769 Semaphore_Wait(&p->freeSemaphore);
\r
771 // for faster stop : we check (p->stopWriting) after Wait(freeSemaphore)
\r
772 if (p->stopWriting)
\r
775 BtFillBlock(mt, blockIndex++);
\r
777 Semaphore_Release1(&p->filledSemaphore);
\r
780 // we stop HASH_THREAD here
\r
781 MtSync_StopWriting(&mt->hashSync);
\r
783 // p->numBlocks_Sent = blockIndex;
\r
784 Event_Set(&p->wasStopped);
\r
789 void MatchFinderMt_Construct(CMatchFinderMt *p)
\r
792 MtSync_Construct(&p->hashSync);
\r
793 MtSync_Construct(&p->btSync);
\r
796 static void MatchFinderMt_FreeMem(CMatchFinderMt *p, ISzAllocPtr alloc)
\r
798 ISzAlloc_Free(alloc, p->hashBuf);
\r
802 void MatchFinderMt_Destruct(CMatchFinderMt *p, ISzAllocPtr alloc)
\r
805 HASH_THREAD can use CriticalSection(s) btSync.cs and hashSync.cs.
\r
806 So we must be sure that HASH_THREAD will not use CriticalSection(s)
\r
807 after deleting CriticalSection here.
\r
809 we call ReleaseStream(p)
\r
810 that calls StopWriting(btSync)
\r
811 that calls StopWriting(hashSync), if it's required to stop HASH_THREAD.
\r
812 after StopWriting() it's safe to destruct MtSync(s) in any order */
\r
814 MatchFinderMt_ReleaseStream(p);
\r
816 MtSync_Destruct(&p->btSync);
\r
817 MtSync_Destruct(&p->hashSync);
\r
820 printf("\nTree %9d * %7d iter = %9d = sum : bytes = %9d\n",
\r
821 (UInt32)(g_NumIters_Tree / 1000),
\r
822 (UInt32)(((UInt64)g_NumIters_Loop * 1000) / (g_NumIters_Tree + 1)),
\r
823 (UInt32)(g_NumIters_Loop / 1000),
\r
824 (UInt32)(g_NumIters_Bytes / 1000)
\r
827 MatchFinderMt_FreeMem(p, alloc);
\r
831 #define kHashBufferSize (kMtHashBlockSize * kMtHashNumBlocks)
\r
832 #define kBtBufferSize (kMtBtBlockSize * kMtBtNumBlocks)
\r
835 static THREAD_FUNC_DECL HashThreadFunc2(void *p) { HashThreadFunc((CMatchFinderMt *)p); return 0; }
\r
836 static THREAD_FUNC_DECL BtThreadFunc2(void *p)
\r
838 Byte allocaDummy[0x180];
\r
840 for (i = 0; i < 16; i++)
\r
841 allocaDummy[i] = (Byte)0;
\r
842 if (allocaDummy[0] == 0)
\r
843 BtThreadFunc((CMatchFinderMt *)p);
\r
848 SRes MatchFinderMt_Create(CMatchFinderMt *p, UInt32 historySize, UInt32 keepAddBufferBefore,
\r
849 UInt32 matchMaxLen, UInt32 keepAddBufferAfter, ISzAllocPtr alloc)
\r
851 CMatchFinder *mf = MF(p);
\r
852 p->historySize = historySize;
\r
853 if (kMtBtBlockSize <= matchMaxLen * 4)
\r
854 return SZ_ERROR_PARAM;
\r
857 p->hashBuf = (UInt32 *)ISzAlloc_Alloc(alloc, ((size_t)kHashBufferSize + (size_t)kBtBufferSize) * sizeof(UInt32));
\r
859 return SZ_ERROR_MEM;
\r
860 p->btBuf = p->hashBuf + kHashBufferSize;
\r
862 keepAddBufferBefore += (kHashBufferSize + kBtBufferSize);
\r
863 keepAddBufferAfter += kMtHashBlockSize;
\r
864 if (!MatchFinder_Create(mf, historySize, keepAddBufferBefore, matchMaxLen, keepAddBufferAfter, alloc))
\r
865 return SZ_ERROR_MEM;
\r
867 RINOK(MtSync_Create(&p->hashSync, HashThreadFunc2, p));
\r
868 RINOK(MtSync_Create(&p->btSync, BtThreadFunc2, p));
\r
873 SRes MatchFinderMt_InitMt(CMatchFinderMt *p)
\r
875 RINOK(MtSync_Init(&p->hashSync, kMtHashNumBlocks));
\r
876 return MtSync_Init(&p->btSync, kMtBtNumBlocks);
\r
880 static void MatchFinderMt_Init(CMatchFinderMt *p)
\r
882 CMatchFinder *mf = MF(p);
\r
885 p->btBufPosLimit = NULL;
\r
887 p->hashBufPosLimit = 0;
\r
888 p->hashNumAvail = 0; // 21.03
\r
890 p->failure_BT = False;
\r
892 /* Init without data reading. We don't want to read data in this thread */
\r
893 MatchFinder_Init_4(mf);
\r
895 MatchFinder_Init_LowHash(mf);
\r
897 p->pointerToCurPos = Inline_MatchFinder_GetPointerToCurrentPos(mf);
\r
898 p->btNumAvailBytes = 0;
\r
899 p->failure_LZ_BT = False;
\r
900 // p->failure_LZ_LZ = False;
\r
903 1; // optimal smallest value
\r
904 // 0; // for debug: ignores match to start
\r
905 // kNormalizeAlign; // for debug
\r
907 p->hash = mf->hash;
\r
908 p->fixedHashSize = mf->fixedHashSize;
\r
909 // p->hash4Mask = mf->hash4Mask;
\r
911 // memcpy(p->crc, mf->crc, sizeof(mf->crc));
\r
914 p->matchMaxLen = mf->matchMaxLen;
\r
915 p->numHashBytes = mf->numHashBytes;
\r
917 /* (mf->pos) and (mf->streamPos) were already initialized to 1 in MatchFinder_Init_4() */
\r
918 // mf->streamPos = mf->pos = 1; // optimal smallest value
\r
919 // 0; // for debug: ignores match to start
\r
920 // kNormalizeAlign; // for debug
\r
922 /* we must init (p->pos = mf->pos) for BT, because
\r
923 BT code needs (p->pos == delta_value_for_empty_hash_record == mf->pos) */
\r
924 p->pos = mf->pos; // do not change it
\r
926 p->cyclicBufferPos = (p->pos - CYC_TO_POS_OFFSET);
\r
927 p->cyclicBufferSize = mf->cyclicBufferSize;
\r
928 p->buffer = mf->buffer;
\r
929 p->cutValue = mf->cutValue;
\r
930 // p->son[0] = p->son[1] = 0; // unused: to init skipped record for speculated accesses.
\r
934 /* ReleaseStream is required to finish multithreading */
\r
935 void MatchFinderMt_ReleaseStream(CMatchFinderMt *p)
\r
937 // Sleep(1); // for debug
\r
938 MtSync_StopWriting(&p->btSync);
\r
939 // Sleep(200); // for debug
\r
940 /* p->MatchFinder->ReleaseStream(); */
\r
945 static UInt32 MatchFinderMt_GetNextBlock_Bt(CMatchFinderMt *p)
\r
947 if (p->failure_LZ_BT)
\r
948 p->btBufPos = p->failureBuf;
\r
951 const UInt32 bi = MtSync_GetNextBlock(&p->btSync);
\r
952 const UInt32 *bt = p->btBuf + GET_BT_BLOCK_OFFSET(bi);
\r
954 const UInt32 numItems = bt[0];
\r
955 p->btBufPosLimit = bt + numItems;
\r
956 p->btNumAvailBytes = bt[1];
\r
957 p->btBufPos = bt + 2;
\r
958 if (numItems < 2 || numItems > kMtBtBlockSize)
\r
960 p->failureBuf[0] = 0;
\r
961 p->btBufPos = p->failureBuf;
\r
962 p->btBufPosLimit = p->failureBuf + 1;
\r
963 p->failure_LZ_BT = True;
\r
964 // p->btNumAvailBytes = 0;
\r
965 /* we don't want to decrease AvailBytes, that was loaded before.
\r
966 that can be unexpected for the code that has loaded another value before */
\r
970 if (p->lzPos >= (UInt32)kMtMaxValForNormalize - (UInt32)kMtBtBlockSize)
\r
972 /* we don't check (lzPos) over exact avail bytes in (btBuf).
\r
973 (fixedHashSize) is small, so normalization is fast */
\r
974 const UInt32 subValue = (p->lzPos - p->historySize - 1); // & ~(UInt32)(kNormalizeAlign - 1);
\r
975 p->lzPos -= subValue;
\r
976 MatchFinder_Normalize3(subValue, p->hash, p->fixedHashSize);
\r
979 return p->btNumAvailBytes;
\r
984 static const Byte * MatchFinderMt_GetPointerToCurrentPos(CMatchFinderMt *p)
\r
986 return p->pointerToCurPos;
\r
/* If the LZ-side read cursor has consumed the current BT block
   (btBufPos reached btBufPosLimit), fetch the next block, which
   resets btBufPos / btBufPosLimit / btNumAvailBytes. */
990 #define GET_NEXT_BLOCK_IF_REQUIRED if (p->btBufPos == p->btBufPosLimit) MatchFinderMt_GetNextBlock_Bt(p);
\r
993 static UInt32 MatchFinderMt_GetNumAvailableBytes(CMatchFinderMt *p)
\r
995 if (p->btBufPos != p->btBufPosLimit)
\r
996 return p->btNumAvailBytes;
\r
997 return MatchFinderMt_GetNextBlock_Bt(p);
\r
1001 // #define CHECK_FAILURE_LZ(_match_, _pos_) if (_match_ >= _pos_) { p->failure_LZ_LZ = True; return d; }
\r
1002 #define CHECK_FAILURE_LZ(_match_, _pos_)
\r
1004 static UInt32 * MixMatches2(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
\r
1007 UInt32 *hash = p->hash;
\r
1008 const Byte *cur = p->pointerToCurPos;
\r
1009 const UInt32 m = p->lzPos;
\r
1015 if (c2 >= matchMinPos)
\r
1017 CHECK_FAILURE_LZ(c2, m)
\r
1018 if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0])
\r
1021 *d++ = m - c2 - 1;
\r
1028 static UInt32 * MixMatches3(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
\r
1030 UInt32 h2, h3, c2, c3;
\r
1031 UInt32 *hash = p->hash;
\r
1032 const Byte *cur = p->pointerToCurPos;
\r
1033 const UInt32 m = p->lzPos;
\r
1037 c3 = (hash + kFix3HashSize)[h3];
\r
1040 (hash + kFix3HashSize)[h3] = m;
\r
1042 if (c2 >= matchMinPos)
\r
1044 CHECK_FAILURE_LZ(c2, m)
\r
1045 if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0])
\r
1047 d[1] = m - c2 - 1;
\r
1048 if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 2] == cur[2])
\r
1058 if (c3 >= matchMinPos)
\r
1060 CHECK_FAILURE_LZ(c3, m)
\r
1061 if (cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0])
\r
1064 *d++ = m - c3 - 1;
\r
1072 #define INCREASE_LZ_POS p->lzPos++; p->pointerToCurPos++;
\r
1076 UInt32* MatchFinderMt_GetMatches_Bt4(CMatchFinderMt *p, UInt32 *d)
\r
1078 const UInt32 *bt = p->btBufPos;
\r
1079 const UInt32 len = *bt++;
\r
1080 const UInt32 *btLim = bt + len;
\r
1081 UInt32 matchMinPos;
\r
1082 UInt32 avail = p->btNumAvailBytes - 1;
\r
1083 p->btBufPos = btLim;
\r
1086 p->btNumAvailBytes = avail;
\r
1088 #define BT_HASH_BYTES_MAX 5
\r
1090 matchMinPos = p->lzPos;
\r
1093 matchMinPos -= bt[1];
\r
1094 else if (avail < (BT_HASH_BYTES_MAX - 1) - 1)
\r
1101 const UInt32 hs = p->historySize;
\r
1102 if (matchMinPos > hs)
\r
1103 matchMinPos -= hs;
\r
1112 UInt32 h2, h3, c2, c3;
\r
1113 UInt32 *hash = p->hash;
\r
1114 const Byte *cur = p->pointerToCurPos;
\r
1115 UInt32 m = p->lzPos;
\r
1119 c3 = (hash + kFix3HashSize)[h3];
\r
1122 (hash + kFix3HashSize)[h3] = m;
\r
1124 if (c2 >= matchMinPos && cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0])
\r
1126 d[1] = m - c2 - 1;
\r
1127 if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 2] == cur[2])
\r
1139 if (c3 >= matchMinPos && cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0])
\r
1142 *d++ = m - c3 - 1;
\r
1151 const UInt32 v0 = bt[0];
\r
1152 const UInt32 v1 = bt[1];
\r
1158 while (bt != btLim);
\r
1166 static UInt32 *MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
\r
1168 UInt32 h2, h3, /* h4, */ c2, c3 /* , c4 */;
\r
1169 UInt32 *hash = p->hash;
\r
1170 const Byte *cur = p->pointerToCurPos;
\r
1171 const UInt32 m = p->lzPos;
\r
1175 c3 = (hash + kFix3HashSize)[h3];
\r
1176 // c4 = (hash + kFix4HashSize)[h4];
\r
1179 (hash + kFix3HashSize)[h3] = m;
\r
1180 // (hash + kFix4HashSize)[h4] = m;
\r
1185 if (c2 >= matchMinPos && cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0])
\r
1187 d[1] = m - c2 - 1;
\r
1188 if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 2] == cur[2])
\r
1190 // d[0] = (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 3] == cur[3]) ? 4 : 3;
\r
1193 if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 3] == cur[3])
\r
1202 if (c4 >= matchMinPos)
\r
1204 cur[(ptrdiff_t)c4 - (ptrdiff_t)m] == cur[0] &&
\r
1205 cur[(ptrdiff_t)c4 - (ptrdiff_t)m + 3] == cur[3]
\r
1209 *d++ = m - c4 - 1;
\r
1219 if (c3 >= matchMinPos && cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0])
\r
1221 d[1] = m - c3 - 1;
\r
1222 if (cur[(ptrdiff_t)c3 - (ptrdiff_t)m + 3] == cur[3])
\r
1232 if (c4 >= matchMinPos)
\r
1234 cur[(ptrdiff_t)c4 - (ptrdiff_t)m] == cur[0] &&
\r
1235 cur[(ptrdiff_t)c4 - (ptrdiff_t)m + 3] == cur[3]
\r
1239 *d++ = m - c4 - 1;
\r
1247 static UInt32* MatchFinderMt2_GetMatches(CMatchFinderMt *p, UInt32 *d)
\r
1249 const UInt32 *bt = p->btBufPos;
\r
1250 const UInt32 len = *bt++;
\r
1251 const UInt32 *btLim = bt + len;
\r
1252 p->btBufPos = btLim;
\r
1253 p->btNumAvailBytes--;
\r
1256 while (bt != btLim)
\r
1258 const UInt32 v0 = bt[0];
\r
1259 const UInt32 v1 = bt[1];
\r
1271 static UInt32* MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *d)
\r
1273 const UInt32 *bt = p->btBufPos;
\r
1274 UInt32 len = *bt++;
\r
1275 const UInt32 avail = p->btNumAvailBytes - 1;
\r
1276 p->btNumAvailBytes = avail;
\r
1277 p->btBufPos = bt + len;
\r
1280 #define BT_HASH_BYTES_MAX 5
\r
1281 if (avail >= (BT_HASH_BYTES_MAX - 1) - 1)
\r
1283 UInt32 m = p->lzPos;
\r
1284 if (m > p->historySize)
\r
1285 m -= p->historySize;
\r
1288 d = p->MixMatchesFunc(p, m, d);
\r
1294 first match pair from BinTree: (match_len, match_dist),
\r
1295 (match_len >= numHashBytes).
\r
1296 MixMatchesFunc() inserts only hash matches that are nearer than (match_dist)
\r
1298 d = p->MixMatchesFunc(p, p->lzPos - bt[1], d);
\r
1299 // if (d) // check for failure
\r
1302 const UInt32 v0 = bt[0];
\r
1303 const UInt32 v1 = bt[1];
\r
1315 #define SKIP_HEADER2_MT do { GET_NEXT_BLOCK_IF_REQUIRED
\r
1316 #define SKIP_HEADER_MT(n) SKIP_HEADER2_MT if (p->btNumAvailBytes-- >= (n)) { const Byte *cur = p->pointerToCurPos; UInt32 *hash = p->hash;
\r
1317 #define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += (size_t)*p->btBufPos + 1; } while (--num != 0);
\r
1319 static void MatchFinderMt0_Skip(CMatchFinderMt *p, UInt32 num)
\r
1321 SKIP_HEADER2_MT { p->btNumAvailBytes--;
\r
1325 static void MatchFinderMt2_Skip(CMatchFinderMt *p, UInt32 num)
\r
1330 hash[h2] = p->lzPos;
\r
1334 static void MatchFinderMt3_Skip(CMatchFinderMt *p, UInt32 num)
\r
1339 (hash + kFix3HashSize)[h3] =
\r
1346 // MatchFinderMt4_Skip() is similar to MatchFinderMt3_Skip().
\r
1347 // The difference is that MatchFinderMt3_Skip() updates hash for last 3 bytes of stream.
\r
1349 static void MatchFinderMt4_Skip(CMatchFinderMt *p, UInt32 num)
\r
1352 UInt32 h2, h3; // h4
\r
1355 // (hash + kFix4HashSize)[h4] =
\r
1356 (hash + kFix3HashSize)[h3] =
\r
1363 void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder2 *vTable)
\r
1365 vTable->Init = (Mf_Init_Func)MatchFinderMt_Init;
\r
1366 vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinderMt_GetNumAvailableBytes;
\r
1367 vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinderMt_GetPointerToCurrentPos;
\r
1368 vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches;
\r
1370 switch (MF(p)->numHashBytes)
\r
1373 p->GetHeadsFunc = GetHeads2;
\r
1374 p->MixMatchesFunc = (Mf_Mix_Matches)NULL;
\r
1375 vTable->Skip = (Mf_Skip_Func)MatchFinderMt0_Skip;
\r
1376 vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt2_GetMatches;
\r
1379 p->GetHeadsFunc = MF(p)->bigHash ? GetHeads3b : GetHeads3;
\r
1380 p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches2;
\r
1381 vTable->Skip = (Mf_Skip_Func)MatchFinderMt2_Skip;
\r
1384 p->GetHeadsFunc = MF(p)->bigHash ? GetHeads4b : GetHeads4;
\r
1386 // it's fast inline version of GetMatches()
\r
1387 // vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches_Bt4;
\r
1389 p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches3;
\r
1390 vTable->Skip = (Mf_Skip_Func)MatchFinderMt3_Skip;
\r
1393 p->GetHeadsFunc = MF(p)->bigHash ? GetHeads5b : GetHeads5;
\r
1394 p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches4;
\r
1396 (Mf_Skip_Func)MatchFinderMt3_Skip;
\r
1397 // (Mf_Skip_Func)MatchFinderMt4_Skip;
\r