--- /dev/null
+/* LzFindOpt.c -- multithreaded Match finder for LZ algorithms\r
+2021-07-13 : Igor Pavlov : Public domain */\r
+\r
+#include "Precomp.h"\r
+\r
+#include "CpuArch.h"\r
+#include "LzFind.h"\r
+\r
+// #include "LzFindMt.h"\r
+\r
+// #define LOG_ITERS\r
+\r
+// #define LOG_THREAD\r
+\r
+#ifdef LOG_THREAD\r
+#include <stdio.h>\r
+#define PRF(x) x\r
+#else\r
+// #define PRF(x)\r
+#endif\r
+\r
+#ifdef LOG_ITERS\r
+#include <stdio.h>\r
+UInt64 g_NumIters_Tree;\r
+UInt64 g_NumIters_Loop;\r
+UInt64 g_NumIters_Bytes;\r
+#define LOG_ITER(x) x\r
+#else\r
+#define LOG_ITER(x)\r
+#endif\r
+\r
+// ---------- BT THREAD ----------\r
+\r
+#define USE_SON_PREFETCH\r
+#define USE_LONG_MATCH_OPT\r
+\r
+#define kEmptyHashValue 0\r
+\r
+// #define CYC_TO_POS_OFFSET 0\r
+\r
+// #define CYC_TO_POS_OFFSET 1 // for debug\r
+\r
+/*\r
+MY_NO_INLINE\r
+UInt32 * MY_FAST_CALL GetMatchesSpecN_1(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son,\r
+ UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, UInt32 *posRes)\r
+{\r
+ do\r
+ {\r
+ UInt32 delta;\r
+ if (hash == size)\r
+ break;\r
+ delta = *hash++;\r
+\r
+ if (delta == 0 || delta > (UInt32)pos)\r
+ return NULL;\r
+\r
+ lenLimit++;\r
+\r
+ if (delta == (UInt32)pos)\r
+ {\r
+ CLzRef *ptr1 = son + ((size_t)pos << 1) - CYC_TO_POS_OFFSET * 2;\r
+ *d++ = 0;\r
+ ptr1[0] = kEmptyHashValue;\r
+ ptr1[1] = kEmptyHashValue;\r
+ }\r
+else\r
+{\r
+ UInt32 *_distances = ++d;\r
+\r
+ CLzRef *ptr0 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2 + 1;\r
+ CLzRef *ptr1 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2;\r
+\r
+ const Byte *len0 = cur, *len1 = cur;\r
+ UInt32 cutValue = _cutValue;\r
+ const Byte *maxLen = cur + _maxLen;\r
+\r
+ for (LOG_ITER(g_NumIters_Tree++);;)\r
+ {\r
+ LOG_ITER(g_NumIters_Loop++);\r
+ {\r
+ const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta;\r
+ CLzRef *pair = son + ((size_t)(((ptrdiff_t)pos - CYC_TO_POS_OFFSET) + diff) << 1);\r
+ const Byte *len = (len0 < len1 ? len0 : len1);\r
+\r
+ #ifdef USE_SON_PREFETCH\r
+ const UInt32 pair0 = *pair;\r
+ #endif\r
+\r
+ if (len[diff] == len[0])\r
+ {\r
+ if (++len != lenLimit && len[diff] == len[0])\r
+ while (++len != lenLimit)\r
+ {\r
+ LOG_ITER(g_NumIters_Bytes++);\r
+ if (len[diff] != len[0])\r
+ break;\r
+ }\r
+ if (maxLen < len)\r
+ {\r
+ maxLen = len;\r
+ *d++ = (UInt32)(len - cur);\r
+ *d++ = delta - 1;\r
+ \r
+ if (len == lenLimit)\r
+ {\r
+ const UInt32 pair1 = pair[1];\r
+ *ptr1 =\r
+ #ifdef USE_SON_PREFETCH\r
+ pair0;\r
+ #else\r
+ pair[0];\r
+ #endif\r
+ *ptr0 = pair1;\r
+\r
+ _distances[-1] = (UInt32)(d - _distances);\r
+\r
+ #ifdef USE_LONG_MATCH_OPT\r
+\r
+ if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit)\r
+ break;\r
+\r
+ {\r
+ for (;;)\r
+ {\r
+ hash++;\r
+ pos++;\r
+ cur++;\r
+ lenLimit++;\r
+ {\r
+ CLzRef *ptr = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2;\r
+ #if 0\r
+ *(UInt64 *)(void *)ptr = ((const UInt64 *)(const void *)ptr)[diff];\r
+ #else\r
+ const UInt32 p0 = ptr[0 + (diff * 2)];\r
+ const UInt32 p1 = ptr[1 + (diff * 2)];\r
+ ptr[0] = p0;\r
+ ptr[1] = p1;\r
+ // ptr[0] = ptr[0 + (diff * 2)];\r
+ // ptr[1] = ptr[1 + (diff * 2)];\r
+ #endif\r
+ }\r
+ // PrintSon(son + 2, pos - 1);\r
+ // printf("\npos = %x delta = %x\n", pos, delta);\r
+ len++;\r
+ *d++ = 2;\r
+ *d++ = (UInt32)(len - cur);\r
+ *d++ = delta - 1;\r
+ if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit)\r
+ break;\r
+ }\r
+ }\r
+ #endif\r
+\r
+ break;\r
+ }\r
+ }\r
+ }\r
+\r
+ {\r
+ const UInt32 curMatch = (UInt32)pos - delta; // (UInt32)(pos + diff);\r
+ if (len[diff] < len[0])\r
+ {\r
+ delta = pair[1];\r
+ if (delta >= curMatch)\r
+ return NULL;\r
+ *ptr1 = curMatch;\r
+ ptr1 = pair + 1;\r
+ len1 = len;\r
+ }\r
+ else\r
+ {\r
+ delta = *pair;\r
+ if (delta >= curMatch)\r
+ return NULL;\r
+ *ptr0 = curMatch;\r
+ ptr0 = pair;\r
+ len0 = len;\r
+ }\r
+\r
+ delta = (UInt32)pos - delta;\r
+ \r
+ if (--cutValue == 0 || delta >= pos)\r
+ {\r
+ *ptr0 = *ptr1 = kEmptyHashValue;\r
+ _distances[-1] = (UInt32)(d - _distances);\r
+ break;\r
+ }\r
+ }\r
+ }\r
+ } // for (tree iterations)\r
+}\r
+ pos++;\r
+ cur++;\r
+ }\r
+ while (d < limit);\r
+ *posRes = (UInt32)pos;\r
+ return d;\r
+}\r
+*/\r
+\r
+/* define cbs if you use 2 functions.\r
+ GetMatchesSpecN_1() : (pos < _cyclicBufferSize)\r
+ GetMatchesSpecN_2() : (pos >= _cyclicBufferSize)\r
+\r
+ do not define cbs if you use 1 function:\r
+ GetMatchesSpecN_2()\r
+*/\r
+\r
+// #define cbs _cyclicBufferSize\r
+\r
+/*\r
+ we use size_t for (pos) and (_cyclicBufferPos_ instead of UInt32\r
+ to eliminate "movsx" BUG in old MSVC x64 compiler.\r
+*/\r
+\r
+UInt32 * MY_FAST_CALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son,\r
+ UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size,\r
+ size_t _cyclicBufferPos, UInt32 _cyclicBufferSize,\r
+ UInt32 *posRes);\r
+\r
+MY_NO_INLINE\r
+UInt32 * MY_FAST_CALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son,\r
+ UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size,\r
+ size_t _cyclicBufferPos, UInt32 _cyclicBufferSize,\r
+ UInt32 *posRes)\r
+{\r
+ do // while (hash != size)\r
+ {\r
+ UInt32 delta;\r
+ \r
+ #ifndef cbs\r
+ UInt32 cbs;\r
+ #endif\r
+\r
+ if (hash == size)\r
+ break;\r
+\r
+ delta = *hash++;\r
+\r
+ if (delta == 0)\r
+ return NULL;\r
+\r
+ lenLimit++;\r
+\r
+ #ifndef cbs\r
+ cbs = _cyclicBufferSize;\r
+ if ((UInt32)pos < cbs)\r
+ {\r
+ if (delta > (UInt32)pos)\r
+ return NULL;\r
+ cbs = (UInt32)pos;\r
+ }\r
+ #endif\r
+\r
+ if (delta >= cbs)\r
+ {\r
+ CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);\r
+ *d++ = 0;\r
+ ptr1[0] = kEmptyHashValue;\r
+ ptr1[1] = kEmptyHashValue;\r
+ }\r
+else\r
+{\r
+ UInt32 *_distances = ++d;\r
+\r
+ CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;\r
+ CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);\r
+\r
+ UInt32 cutValue = _cutValue;\r
+ const Byte *len0 = cur, *len1 = cur;\r
+ const Byte *maxLen = cur + _maxLen;\r
+\r
+ // if (cutValue == 0) { *ptr0 = *ptr1 = kEmptyHashValue; } else\r
+ for (LOG_ITER(g_NumIters_Tree++);;)\r
+ {\r
+ LOG_ITER(g_NumIters_Loop++);\r
+ {\r
+ // SPEC code\r
+ CLzRef *pair = son + ((size_t)((ptrdiff_t)_cyclicBufferPos - (ptrdiff_t)delta\r
+ + (ptrdiff_t)(UInt32)(_cyclicBufferPos < delta ? cbs : 0)\r
+ ) << 1);\r
+\r
+ const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta;\r
+ const Byte *len = (len0 < len1 ? len0 : len1);\r
+\r
+ #ifdef USE_SON_PREFETCH\r
+ const UInt32 pair0 = *pair;\r
+ #endif\r
+\r
+ if (len[diff] == len[0])\r
+ {\r
+ if (++len != lenLimit && len[diff] == len[0])\r
+ while (++len != lenLimit)\r
+ {\r
+ LOG_ITER(g_NumIters_Bytes++);\r
+ if (len[diff] != len[0])\r
+ break;\r
+ }\r
+ if (maxLen < len)\r
+ {\r
+ maxLen = len;\r
+ *d++ = (UInt32)(len - cur);\r
+ *d++ = delta - 1;\r
+ \r
+ if (len == lenLimit)\r
+ {\r
+ const UInt32 pair1 = pair[1];\r
+ *ptr1 =\r
+ #ifdef USE_SON_PREFETCH\r
+ pair0;\r
+ #else\r
+ pair[0];\r
+ #endif\r
+ *ptr0 = pair1;\r
+\r
+ _distances[-1] = (UInt32)(d - _distances);\r
+\r
+ #ifdef USE_LONG_MATCH_OPT\r
+\r
+ if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit)\r
+ break;\r
+\r
+ {\r
+ for (;;)\r
+ {\r
+ *d++ = 2;\r
+ *d++ = (UInt32)(lenLimit - cur);\r
+ *d++ = delta - 1;\r
+ cur++;\r
+ lenLimit++;\r
+ // SPEC\r
+ _cyclicBufferPos++;\r
+ {\r
+ // SPEC code\r
+ CLzRef *dest = son + ((size_t)(_cyclicBufferPos) << 1);\r
+ const CLzRef *src = dest + ((diff\r
+ + (ptrdiff_t)(UInt32)((_cyclicBufferPos < delta) ? cbs : 0)) << 1);\r
+ // CLzRef *ptr = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2;\r
+ #if 0\r
+ *(UInt64 *)(void *)dest = *((const UInt64 *)(const void *)src);\r
+ #else\r
+ const UInt32 p0 = src[0];\r
+ const UInt32 p1 = src[1];\r
+ dest[0] = p0;\r
+ dest[1] = p1;\r
+ #endif\r
+ }\r
+ pos++;\r
+ hash++;\r
+ if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit)\r
+ break;\r
+ } // for() end for long matches\r
+ }\r
+ #endif\r
+\r
+ break; // break from TREE iterations\r
+ }\r
+ }\r
+ }\r
+ {\r
+ const UInt32 curMatch = (UInt32)pos - delta; // (UInt32)(pos + diff);\r
+ if (len[diff] < len[0])\r
+ {\r
+ delta = pair[1];\r
+ *ptr1 = curMatch;\r
+ ptr1 = pair + 1;\r
+ len1 = len;\r
+ if (delta >= curMatch)\r
+ return NULL;\r
+ }\r
+ else\r
+ {\r
+ delta = *pair;\r
+ *ptr0 = curMatch;\r
+ ptr0 = pair;\r
+ len0 = len;\r
+ if (delta >= curMatch)\r
+ return NULL;\r
+ }\r
+ delta = (UInt32)pos - delta;\r
+ \r
+ if (--cutValue == 0 || delta >= cbs)\r
+ {\r
+ *ptr0 = *ptr1 = kEmptyHashValue;\r
+ _distances[-1] = (UInt32)(d - _distances);\r
+ break;\r
+ }\r
+ }\r
+ }\r
+ } // for (tree iterations)\r
+}\r
+ pos++;\r
+ _cyclicBufferPos++;\r
+ cur++;\r
+ }\r
+ while (d < limit);\r
+ *posRes = (UInt32)pos;\r
+ return d;\r
+}\r
+\r
+\r
+\r
+/*\r
+typedef UInt32 uint32plus; // size_t\r
+\r
+UInt32 * MY_FAST_CALL GetMatchesSpecN_3(uint32plus lenLimit, size_t pos, const Byte *cur, CLzRef *son,\r
+ UInt32 _cutValue, UInt32 *d, uint32plus _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size,\r
+ size_t _cyclicBufferPos, UInt32 _cyclicBufferSize,\r
+ UInt32 *posRes)\r
+{\r
+ do // while (hash != size)\r
+ {\r
+ UInt32 delta;\r
+\r
+ #ifndef cbs\r
+ UInt32 cbs;\r
+ #endif\r
+\r
+ if (hash == size)\r
+ break;\r
+\r
+ delta = *hash++;\r
+\r
+ if (delta == 0)\r
+ return NULL;\r
+\r
+ #ifndef cbs\r
+ cbs = _cyclicBufferSize;\r
+ if ((UInt32)pos < cbs)\r
+ {\r
+ if (delta > (UInt32)pos)\r
+ return NULL;\r
+ cbs = (UInt32)pos;\r
+ }\r
+ #endif\r
+ \r
+ if (delta >= cbs)\r
+ {\r
+ CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);\r
+ *d++ = 0;\r
+ ptr1[0] = kEmptyHashValue;\r
+ ptr1[1] = kEmptyHashValue;\r
+ }\r
+else\r
+{\r
+ CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;\r
+ CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);\r
+ UInt32 *_distances = ++d;\r
+ uint32plus len0 = 0, len1 = 0;\r
+ UInt32 cutValue = _cutValue;\r
+ uint32plus maxLen = _maxLen;\r
+ // lenLimit++; // const Byte *lenLimit = cur + _lenLimit;\r
+\r
+ for (LOG_ITER(g_NumIters_Tree++);;)\r
+ {\r
+ LOG_ITER(g_NumIters_Loop++);\r
+ {\r
+ // const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta;\r
+ CLzRef *pair = son + ((size_t)((ptrdiff_t)_cyclicBufferPos - delta\r
+ + (ptrdiff_t)(UInt32)(_cyclicBufferPos < delta ? cbs : 0)\r
+ ) << 1);\r
+ const Byte *pb = cur - delta;\r
+ uint32plus len = (len0 < len1 ? len0 : len1);\r
+\r
+ #ifdef USE_SON_PREFETCH\r
+ const UInt32 pair0 = *pair;\r
+ #endif\r
+\r
+ if (pb[len] == cur[len])\r
+ {\r
+ if (++len != lenLimit && pb[len] == cur[len])\r
+ while (++len != lenLimit)\r
+ if (pb[len] != cur[len])\r
+ break;\r
+ if (maxLen < len)\r
+ {\r
+ maxLen = len;\r
+ *d++ = (UInt32)len;\r
+ *d++ = delta - 1;\r
+ if (len == lenLimit)\r
+ {\r
+ {\r
+ const UInt32 pair1 = pair[1];\r
+ *ptr0 = pair1;\r
+ *ptr1 =\r
+ #ifdef USE_SON_PREFETCH\r
+ pair0;\r
+ #else\r
+ pair[0];\r
+ #endif\r
+ }\r
+\r
+ _distances[-1] = (UInt32)(d - _distances);\r
+\r
+ #ifdef USE_LONG_MATCH_OPT\r
+\r
+ if (hash == size || *hash != delta || pb[lenLimit] != cur[lenLimit] || d >= limit)\r
+ break;\r
+\r
+ {\r
+ const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta;\r
+ for (;;)\r
+ {\r
+ *d++ = 2;\r
+ *d++ = (UInt32)lenLimit;\r
+ *d++ = delta - 1;\r
+ _cyclicBufferPos++;\r
+ {\r
+ CLzRef *dest = son + ((size_t)_cyclicBufferPos << 1);\r
+ const CLzRef *src = dest + ((diff +\r
+ (ptrdiff_t)(UInt32)(_cyclicBufferPos < delta ? cbs : 0)) << 1);\r
+ #if 0\r
+ *(UInt64 *)(void *)dest = *((const UInt64 *)(const void *)src);\r
+ #else\r
+ const UInt32 p0 = src[0];\r
+ const UInt32 p1 = src[1];\r
+ dest[0] = p0;\r
+ dest[1] = p1;\r
+ #endif\r
+ }\r
+ hash++;\r
+ pos++;\r
+ cur++;\r
+ pb++;\r
+ if (hash == size || *hash != delta || pb[lenLimit] != cur[lenLimit] || d >= limit)\r
+ break;\r
+ }\r
+ }\r
+ #endif\r
+\r
+ break;\r
+ }\r
+ }\r
+ }\r
+ {\r
+ const UInt32 curMatch = (UInt32)pos - delta;\r
+ if (pb[len] < cur[len])\r
+ {\r
+ delta = pair[1];\r
+ *ptr1 = curMatch;\r
+ ptr1 = pair + 1;\r
+ len1 = len;\r
+ }\r
+ else\r
+ {\r
+ delta = *pair;\r
+ *ptr0 = curMatch;\r
+ ptr0 = pair;\r
+ len0 = len;\r
+ }\r
+\r
+ {\r
+ if (delta >= curMatch)\r
+ return NULL;\r
+ delta = (UInt32)pos - delta;\r
+ if (delta >= cbs\r
+ // delta >= _cyclicBufferSize || delta >= pos\r
+ || --cutValue == 0)\r
+ {\r
+ *ptr0 = *ptr1 = kEmptyHashValue;\r
+ _distances[-1] = (UInt32)(d - _distances);\r
+ break;\r
+ }\r
+ }\r
+ }\r
+ }\r
+ } // for (tree iterations)\r
+}\r
+ pos++;\r
+ _cyclicBufferPos++;\r
+ cur++;\r
+ }\r
+ while (d < limit);\r
+ *posRes = (UInt32)pos;\r
+ return d;\r
+}\r
+*/\r