648db22b |
1 | /* |
2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | * All rights reserved. |
4 | * |
5 | * This source code is licensed under both the BSD-style license (found in the |
6 | * LICENSE file in the root directory of this source tree) and the GPLv2 (found |
7 | * in the COPYING file in the root directory of this source tree). |
8 | * You may select, at your option, one of the above-listed licenses. |
9 | */ |
10 | |
11 | #ifndef ZDICT_STATIC_LINKING_ONLY |
12 | # define ZDICT_STATIC_LINKING_ONLY |
13 | #endif |
14 | |
f535537f |
15 | #include "../common/threading.h" /* ZSTD_pthread_mutex_t */ |
16 | #include "../common/mem.h" /* U32, BYTE */ |
648db22b |
17 | #include "../zdict.h" |
18 | |
19 | /** |
20 | * COVER_best_t is used for two purposes: |
21 | * 1. Synchronizing threads. |
22 | * 2. Saving the best parameters and dictionary found so far. |
23 | * |
24 | * All of the COVER_best_*() functions except COVER_best_init() are thread safe if zstd is |
25 | * compiled with multithreaded support. |
26 | */ |
27 | typedef struct COVER_best_s { |
28 | ZSTD_pthread_mutex_t mutex; /* NOTE(review): presumably guards the fields below; confirm in the implementation */ |
29 | ZSTD_pthread_cond_t cond; /* NOTE(review): presumably what COVER_best_wait() blocks on and COVER_best_finish() signals; confirm */ |
30 | size_t liveJobs; /* job counter: incremented by COVER_best_start(), decremented by COVER_best_finish() */ |
31 | void *dict; /* best dictionary saved so far */ |
32 | size_t dictSize; /* NOTE(review): presumably the size of dict in bytes; confirm */ |
33 | ZDICT_cover_params_t parameters; /* parameters saved together with the best dictionary */ |
34 | size_t compressedSize; /* NOTE(review): presumably the compressed size obtained with the best dictionary; confirm */ |
35 | } COVER_best_t; |
36 | |
37 | /** |
38 | * A segment is a range in the source together with the score of that range. |
39 | */ |
40 | typedef struct { |
41 | U32 begin; /* start of the range; NOTE(review): units and indexing base are not visible here */ |
42 | U32 end; /* end of the range; NOTE(review): inclusive vs. exclusive is not visible here */ |
43 | U32 score; /* score of the segment */ |
44 | } COVER_segment_t; |
45 | |
46 | /** |
47 | * Number of epochs and size of each epoch, as returned by COVER_computeEpochs(). |
48 | */ |
49 | typedef struct { |
50 | U32 num; /* number of epochs */ |
51 | U32 size; /* size of each epoch */ |
52 | } COVER_epoch_info_t; |
53 | |
54 | /** |
55 | * Result type of the dictionary selection function, COVER_selectDict(). |
56 | */ |
57 | typedef struct COVER_dictSelection { |
58 | BYTE* dictContent; /* NOTE(review): presumably the selected dictionary bytes, released by COVER_dictSelectionFree(); confirm */ |
59 | size_t dictSize; /* NOTE(review): presumably the size of dictContent in bytes; confirm */ |
60 | size_t totalCompressedSize; /* holds a ZSTD error in the struct returned by COVER_dictSelectionError() */ |
61 | } COVER_dictSelection_t; |
62 | |
63 | /** |
64 | * Computes the number of epochs and the size of each epoch, returned together as a COVER_epoch_info_t. |
65 | * Each epoch is guaranteed to get at least 10 * k bytes. |
66 | * |
67 | * The COVER algorithms divide the data up into epochs of equal size and |
68 | * select one segment from each epoch. |
69 | * |
70 | * @param maxDictSize The maximum allowed dictionary size. |
71 | * @param nbDmers The number of dmers we are training on. |
72 | * @param k The parameter k (segment size). |
73 | * @param passes The target number of passes over the dmer corpus. |
74 | * More passes means a better dictionary. |
75 | */ |
76 | COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers, |
77 | U32 k, U32 passes); |
78 | |
79 | /** |
80 | * Warns the user when their corpus is too small. NOTE(review): the threshold relating nbDmers to maxDictSize and the role of displayLevel are set by the implementation, which is not visible here. |
81 | */ |
82 | void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel); |
83 | |
84 | /** |
85 | * Checks the total compressed size of a dictionary. NOTE(review): which samples are measured (nbTrainSamples vs. nbSamples) and the error convention of the returned size_t are not visible here; confirm in the implementation. |
86 | */ |
87 | size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters, |
88 | const size_t *samplesSizes, const BYTE *samples, |
89 | size_t *offsets, |
90 | size_t nbTrainSamples, size_t nbSamples, |
91 | BYTE *const dict, size_t dictBufferCapacity); |
92 | |
93 | /** |
94 | * Returns the sum of the sample sizes in samplesSizes. NOTE(review): presumably summed over the first nbSamples entries; confirm. |
95 | */ |
96 | size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ; |
97 | |
98 | /** |
99 | * Initializes the `COVER_best_t` pointed to by best. Unlike the other COVER_best_*() functions, this one is not documented as thread safe. |
100 | */ |
101 | void COVER_best_init(COVER_best_t *best); |
102 | |
103 | /** |
104 | * Waits until liveJobs == 0, i.e. until every COVER_best_start() increment has been matched by a COVER_best_finish() decrement. |
105 | */ |
106 | void COVER_best_wait(COVER_best_t *best); |
107 | |
108 | /** |
109 | * Calls COVER_best_wait() and then destroys the COVER_best_t. NOTE(review): whether the saved dictionary buffer is released here is not visible in this header; confirm. |
110 | */ |
111 | void COVER_best_destroy(COVER_best_t *best); |
112 | |
113 | /** |
114 | * Called when a thread is about to be launched. |
115 | * Increments liveJobs; COVER_best_finish() performs the matching decrement. |
116 | */ |
117 | void COVER_best_start(COVER_best_t *best); |
118 | |
119 | /** |
120 | * Called when a thread finishes executing, on both error and success. |
121 | * Decrements liveJobs and signals any waiting threads if liveJobs == 0. |
122 | * If this dictionary is the best so far, saves it and its parameters. |
123 | */ |
124 | void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters, |
125 | COVER_dictSelection_t selection); |
126 | /** |
127 | * Error-checking helper for COVER_selectDict(). Checks whether the returned |
128 | * selection is an error. NOTE(review): presumably returns non-zero for an error; confirm. |
129 | */ |
130 | unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection); |
131 | |
132 | /** |
133 | * Error constructor for COVER_selectDict(). Returns a struct whose |
134 | * totalCompressedSize field is a ZSTD error. NOTE(review): presumably taken from the error argument; confirm. |
135 | */ |
136 | COVER_dictSelection_t COVER_dictSelectionError(size_t error); |
137 | |
138 | /** |
139 | * Always call after COVER_selectDict() has been called, to free the memory used by |
140 | * the newly created dictionary. |
141 | */ |
142 | void COVER_dictSelectionFree(COVER_dictSelection_t selection); |
143 | |
144 | /** |
145 | * Called to finalize the dictionary and select one based on whether or not |
146 | * the shrink-dict flag was enabled. If enabled, the dictionary used is the |
147 | * smallest dictionary within a specified regression of the compressed size |
148 | * from the largest dictionary. Check the result with COVER_dictSelectionIsError() and release it with COVER_dictSelectionFree(). |
149 | */ |
150 | COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity, |
151 | size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples, |
152 | size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize); |