648db22b |
1 | /* |
2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | * All rights reserved. |
4 | * |
5 | * This source code is licensed under both the BSD-style license (found in the |
6 | * LICENSE file in the root directory of this source tree) and the GPLv2 (found |
7 | * in the COPYING file in the root directory of this source tree). |
8 | * You may select, at your option, one of the above-listed licenses. |
9 | */ |
10 | |
11 | /* This match finder leverages techniques used in file comparison algorithms |
12 | * to find matches between a dictionary and a source file. |
13 | * |
14 | * The original motivation for studying this approach was to try and optimize |
15 | * Zstandard for the use case of patching: the most common scenario being |
16 | * updating an existing software package with the next version. When patching, |
17 | * the difference between the old version of the package and the new version |
18 | * is generally tiny (most of the new file will be identical to |
19 | * the old one). In more technical terms, the edit distance (the minimal number |
20 | * of changes required to take one sequence of bytes to another) between the |
21 | * files would be small relative to the size of the file. |
22 | * |
23 | * Various 'diffing' algorithms utilize this notion of edit distance and |
24 | * the corresponding concept of a minimal edit script between two |
25 | * sequences to identify the regions within two files where they differ. |
26 | * The core algorithm used in this match finder is described in: |
27 | * |
28 | * "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers, |
29 | * Algorithmica Vol. 1, 1986, pp. 251-266, |
30 | * <https://doi.org/10.1007/BF01840446>. |
31 | * |
32 | * Additional algorithmic heuristics for speed improvement have also been included. |
33 | * These we inspired from implementations of various regular and binary diffing |
34 | * algorithms such as GNU diff, bsdiff, and Xdelta. |
35 | * |
36 | * Note: after some experimentation, this approach proved to not provide enough |
37 | * utility to justify the additional CPU used in finding matches. The one area |
38 | * where this approach consistently outperforms Zstandard even on level 19 is |
39 | * when compressing small files (<10 KB) using an equally small dictionary that |
40 | * is very similar to the source file. For the use case that this was intended, |
41 | * (large similar files) this approach by itself took 5-10X longer than zstd-19 and |
42 | * generally resulted in 2-3X larger files. The core advantage that zstd-19 has |
43 | * over this approach for match finding is the overlapping matches. This approach |
44 | * cannot find any. |
45 | * |
46 | * I'm leaving this in the contrib section in case this ever becomes interesting |
47 | * to explore again. |
48 | * */ |
49 | |
50 | #ifndef ZSTD_EDIST_H |
51 | #define ZSTD_EDIST_H |
52 | |
53 | /*-************************************* |
54 | * Dependencies |
55 | ***************************************/ |
56 | |
57 | #include <stddef.h> |
58 | #include "zstd_internal.h" /* ZSTD_Sequence */ |
59 | |
60 | /*! ZSTD_eDist_genSequences() : |
61 | * Will populate the provided ZSTD_Sequence buffer with sequences |
62 | * based on the optimal or near-optimal (depending on 'useHeuristics') |
63 | * edit script between 'dict' and 'src.' |
64 | * @return : the number of sequences found */ |
65 | size_t ZSTD_eDist_genSequences(ZSTD_Sequence* sequences, |
66 | const void* dict, size_t dictSize, |
67 | const void* src, size_t srcSize, |
68 | int useHeuristics); |
69 | |
70 | #endif |