648db22b |
1 | /* |
2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | * All rights reserved. |
4 | * |
5 | * This source code is licensed under both the BSD-style license (found in the |
6 | * LICENSE file in the root directory of this source tree) and the GPLv2 (found |
7 | * in the COPYING file in the root directory of this source tree). |
8 | * You may select, at your option, one of the above-listed licenses. |
9 | */ |
10 | |
11 | #if defined (__cplusplus) |
12 | extern "C" { |
13 | #endif |
14 | |
15 | #ifndef ZSTD_ZDICT_H |
16 | #define ZSTD_ZDICT_H |
17 | |
18 | /*====== Dependencies ======*/ |
19 | #include <stddef.h> /* size_t */ |
20 | |
21 | |
22 | /* ===== ZDICTLIB_API : controls library symbol visibility ===== */ |
23 | #ifndef ZDICTLIB_VISIBLE |
24 | /* Backwards compatibility with the old macro name (ZDICTLIB_VISIBILITY) */ |
25 | # ifdef ZDICTLIB_VISIBILITY |
26 | # define ZDICTLIB_VISIBLE ZDICTLIB_VISIBILITY |
27 | # elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__) |
28 | # define ZDICTLIB_VISIBLE __attribute__ ((visibility ("default"))) |
29 | # else |
30 | # define ZDICTLIB_VISIBLE |
31 | # endif |
32 | #endif /* ZDICTLIB_VISIBLE */ |
33 | |
34 | #ifndef ZDICTLIB_HIDDEN |
35 | # if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__) |
36 | # define ZDICTLIB_HIDDEN __attribute__ ((visibility ("hidden"))) |
37 | # else |
38 | # define ZDICTLIB_HIDDEN |
39 | # endif |
40 | #endif /* ZDICTLIB_HIDDEN */ |
41 | |
42 | #if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) |
43 | # define ZDICTLIB_API __declspec(dllexport) ZDICTLIB_VISIBLE |
44 | #elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) |
45 | # define ZDICTLIB_API __declspec(dllimport) ZDICTLIB_VISIBLE /* Not required, but it allows the compiler to generate better code, saving a function pointer load from the IAT and an indirect jump. */ |
46 | #else |
47 | # define ZDICTLIB_API ZDICTLIB_VISIBLE |
48 | #endif /* ZDICTLIB_API */ |
49 | |
50 | /******************************************************************************* |
51 | * Zstd dictionary builder |
52 | * |
53 | * FAQ |
54 | * === |
55 | * Why should I use a dictionary? |
56 | * ------------------------------ |
57 | * |
58 | * Zstd can use dictionaries to improve compression ratio of small data. |
59 | * Traditionally small files don't compress well because there is very little |
60 | * repetition in a single sample, since it is small. But, if you are compressing |
61 | * many similar files, like a bunch of JSON records that share the same |
62 | * structure, you can train a dictionary ahead of time on some samples of |
63 | * these files. Then, zstd can use the dictionary to find repetitions that are |
64 | * present across samples. This can vastly improve compression ratio. |
65 | * |
66 | * When is a dictionary useful? |
67 | * ---------------------------- |
68 | * |
69 | * Dictionaries are useful when compressing many small files that are similar. |
70 | * The larger a file is, the less benefit a dictionary will have. Generally, |
71 | * we don't expect dictionary compression to be effective past 100KB. And the |
72 | * smaller a file is, the more we would expect the dictionary to help. |
73 | * |
74 | * How do I use a dictionary? |
75 | * -------------------------- |
76 | * |
77 | * Simply pass the dictionary to the zstd compressor with |
78 | * `ZSTD_CCtx_loadDictionary()`. The same dictionary must then be passed to |
79 | * the decompressor, using `ZSTD_DCtx_loadDictionary()`. There are other |
80 | * more advanced functions that allow selecting some options, see zstd.h for |
81 | * complete documentation. |
82 | * |
83 | * What is a zstd dictionary? |
84 | * -------------------------- |
85 | * |
86 | * A zstd dictionary has two pieces: Its header, and its content. The header |
87 | * contains a magic number, the dictionary ID, and entropy tables. These |
88 | * entropy tables allow zstd to save on header costs in the compressed file, |
89 | * which really matters for small data. The content is just bytes, which are |
90 | * repeated content that is common across many samples. |
91 | * |
92 | * What is a raw content dictionary? |
93 | * --------------------------------- |
94 | * |
95 | * A raw content dictionary is just bytes. It doesn't have a zstd dictionary |
96 | * header, a dictionary ID, or entropy tables. Any buffer is a valid raw |
97 | * content dictionary. |
98 | * |
99 | * How do I train a dictionary? |
100 | * ---------------------------- |
101 | * |
102 | * Gather samples from your use case. These samples should be similar to each |
103 | * other. If you have several use cases, you could try to train one dictionary |
104 | * per use case. |
105 | * |
106 | * Pass those samples to `ZDICT_trainFromBuffer()` and that will train your |
107 | * dictionary. There are a few advanced versions of this function, but this |
108 | * is a great starting point. If you want to further tune your dictionary |
109 | * you could try `ZDICT_optimizeTrainFromBuffer_cover()`. If that is too slow |
110 | * you can try `ZDICT_optimizeTrainFromBuffer_fastCover()`. |
111 | * |
112 | * If the dictionary training function fails, that is likely because you |
113 | * either passed too few samples, or a dictionary would not be effective |
114 | * for your data. Look at the messages that the dictionary trainer printed: |
115 | * if they don't mention too few samples, then a dictionary would not be effective. |
116 | * |
117 | * How large should my dictionary be? |
118 | * ---------------------------------- |
119 | * |
120 | * A reasonable dictionary size, the `dictBufferCapacity`, is about 100KB. |
121 | * The zstd CLI defaults to a 110KB dictionary. You likely don't need a |
122 | * dictionary larger than that. But, most use cases can get away with a |
123 | * smaller dictionary. The advanced dictionary builders can automatically |
124 | * shrink the dictionary for you, and select the smallest size that doesn't |
125 | * hurt compression ratio too much. See the `shrinkDict` parameter. |
126 | * A smaller dictionary can save memory, and potentially speed up |
127 | * compression. |
128 | * |
129 | * How many samples should I provide to the dictionary builder? |
130 | * ------------------------------------------------------------ |
131 | * |
132 | * We generally recommend passing ~100x the size of the dictionary |
133 | * in samples. A few thousand should suffice. Having too few samples |
134 | * can hurt the dictionary's effectiveness. Having more samples will |
135 | * only improve the dictionary's effectiveness. But having too many |
136 | * samples can slow down the dictionary builder. |
137 | * |
138 | * How do I determine if a dictionary will be effective? |
139 | * ----------------------------------------------------- |
140 | * |
141 | * Simply train a dictionary and try it out. You can use zstd's built-in |
142 | * benchmarking tool to test the dictionary's effectiveness. |
143 | * |
144 | * # Benchmark levels 1-3 without a dictionary |
145 | * zstd -b1e3 -r /path/to/my/files |
146 | * # Benchmark levels 1-3 with a dictionary |
147 | * zstd -b1e3 -r /path/to/my/files -D /path/to/my/dictionary |
148 | * |
149 | * When should I retrain a dictionary? |
150 | * ----------------------------------- |
151 | * |
152 | * You should retrain a dictionary when its effectiveness drops. Dictionary |
153 | * effectiveness drops as the data you are compressing changes. Generally, we do |
154 | * expect dictionaries to "decay" over time, as your data changes, but the rate |
155 | * at which they decay depends on your use case. Internally, we regularly |
156 | * retrain dictionaries, and if the new dictionary performs significantly |
157 | * better than the old dictionary, we will ship the new dictionary. |
158 | * |
159 | * I have a raw content dictionary, how do I turn it into a zstd dictionary? |
160 | * ------------------------------------------------------------------------- |
161 | * |
162 | * If you have a raw content dictionary, e.g. by manually constructing it, or |
163 | * using a third-party dictionary builder, you can turn it into a zstd |
164 | * dictionary by using `ZDICT_finalizeDictionary()`. You'll also have to |
165 | * provide some samples of the data. It will add the zstd header to the |
166 | * raw content, which contains a dictionary ID and entropy tables, which |
167 | * will improve compression ratio, and allow zstd to write the dictionary ID |
168 | * into the frame, if you so choose. |
169 | * |
170 | * Do I have to use zstd's dictionary builder? |
171 | * ------------------------------------------- |
172 | * |
173 | * No! You can construct dictionary content however you please, it is just |
174 | * bytes. It will always be valid as a raw content dictionary. If you want |
175 | * a zstd dictionary, which can improve compression ratio, use |
176 | * `ZDICT_finalizeDictionary()`. |
177 | * |
178 | * What is the attack surface of a zstd dictionary? |
179 | * ------------------------------------------------ |
180 | * |
181 | * Zstd is heavily fuzz tested, including loading fuzzed dictionaries, so |
182 | * zstd should never crash, or access out-of-bounds memory no matter what |
183 | * the dictionary is. However, if an attacker can control the dictionary |
184 | * during decompression, they can cause zstd to generate arbitrary bytes, |
185 | * just like if they controlled the compressed data. |
186 | * |
187 | ******************************************************************************/ |
188 | |
189 | |
190 | /*! ZDICT_trainFromBuffer(): |
191 | * Train a dictionary from an array of samples. |
192 | * Redirects to ZDICT_optimizeTrainFromBuffer_fastCover(), single-threaded, with d=8, steps=4, |
193 | * f=20, and accel=1. |
194 | * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, |
195 | * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. |
196 | * The resulting dictionary will be saved into `dictBuffer`. |
197 | * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) |
198 | * or an error code, which can be tested with ZDICT_isError(). |
199 | * Note: Dictionary training will fail if there are not enough samples to construct a |
200 | * dictionary, or if most of the samples are too small (< 8 bytes being the lower limit). |
201 | * If dictionary training fails, you should use zstd without a dictionary, as the dictionary |
202 | * would've been ineffective anyway. If you believe your samples would benefit from a dictionary |
203 | * please open an issue with details, and we can look into it. |
204 | * Note: ZDICT_trainFromBuffer()'s memory usage is about 6 MB. |
205 | * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. |
206 | * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. |
207 | * In general, it's recommended to provide a few thousand samples, though this can vary a lot. |
208 | * It's recommended that the total size of all samples be about 100 times the target size of the dictionary. |
209 | */ |
210 | ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, |
211 | const void* samplesBuffer, |
212 | const size_t* samplesSizes, unsigned nbSamples); |
213 | |
214 | typedef struct { /* Common parameters: taken by ZDICT_finalizeDictionary() and embedded as `zParams` in the builder-specific parameter structs */ |
215 | int compressionLevel; /**< optimize for a specific zstd compression level; 0 means default */ |
216 | unsigned notificationLevel; /**< Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ |
217 | unsigned dictID; /**< force dictID value; 0 means auto mode (32-bit random value) |
218 | * NOTE: The zstd format reserves some dictionary IDs for future use. |
219 | * You may use them in private settings, but be warned that they |
220 | * may be used by zstd in a public dictionary registry in the future. |
221 | * These dictionary IDs are: |
222 | * - low range : <= 32767 |
223 | * - high range : >= (2^31) |
224 | */ |
225 | } ZDICT_params_t; |
226 | |
227 | /*! ZDICT_finalizeDictionary(): |
228 | * Given a custom content as a basis for dictionary, and a set of samples, |
229 | * finalize dictionary by adding headers and statistics according to the zstd |
230 | * dictionary format. |
231 | * |
232 | * Samples must be stored concatenated in a flat buffer `samplesBuffer`, |
233 | * supplied with an array of sizes `samplesSizes`, providing the size of each |
234 | * sample in order. The samples are used to construct the statistics, so they |
235 | * should be representative of what you will compress with this dictionary. |
236 | * |
237 | * The compression level can be set in `parameters`. You should pass the |
238 | * compression level you expect to use in production. The statistics for each |
239 | * compression level differ, so tuning the dictionary for the compression level |
240 | * can help quite a bit. |
241 | * |
242 | * You can set an explicit dictionary ID in `parameters`, or allow us to pick |
243 | * a random dictionary ID for you, but we can't guarantee no collisions. |
244 | * |
245 | * The dstDictBuffer and the dictContent may overlap, and the content will be |
246 | * appended to the end of the header. If the header + the content doesn't fit in |
247 | * maxDictSize the beginning of the content is truncated to make room, since it |
248 | * is presumed that the most profitable content is at the end of the dictionary, |
249 | * since that is the cheapest to reference. |
250 | * |
251 | * `maxDictSize` must be >= max(dictContentSize, ZDICT_DICTSIZE_MIN). |
252 | * |
253 | * @return: size of dictionary stored into `dstDictBuffer` (<= `maxDictSize`), |
254 | * or an error code, which can be tested by ZDICT_isError(). |
255 | * Note: ZDICT_finalizeDictionary() will push notifications into stderr if |
256 | * instructed to, using notificationLevel>0. |
257 | * NOTE: This function currently may fail in several edge cases including: |
258 | * * Not enough samples |
259 | * * Samples are incompressible |
260 | * * Samples are all exactly the same |
261 | */ |
262 | ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dstDictBuffer, size_t maxDictSize, |
263 | const void* dictContent, size_t dictContentSize, |
264 | const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, |
265 | ZDICT_params_t parameters); |
266 | |
267 | |
268 | /*====== Helper functions ======*/ |
269 | ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize); /**< extracts dictID; @return zero if error (not a valid dictionary) */ |
270 | ZDICTLIB_API size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize); /**< returns dict header size; returns a ZSTD error code on failure */ |
271 | ZDICTLIB_API unsigned ZDICT_isError(size_t errorCode); /**< tests whether a size_t result returned by a ZDICT function is an error code */ |
272 | ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode); /**< NOTE(review): presumably returns a readable name for `errorCode`; confirm against the implementation */ |
273 | |
274 | #endif /* ZSTD_ZDICT_H */ |
275 | |
276 | #if defined(ZDICT_STATIC_LINKING_ONLY) && !defined(ZSTD_ZDICT_H_STATIC) |
277 | #define ZSTD_ZDICT_H_STATIC |
278 | |
279 | /* Visibility of the experimental (static-linking-only) declarations. This can be overridden externally to hide static symbols. */ |
280 | #ifndef ZDICTLIB_STATIC_API |
281 | # if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) |
282 | # define ZDICTLIB_STATIC_API __declspec(dllexport) ZDICTLIB_VISIBLE |
283 | # elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) |
284 | # define ZDICTLIB_STATIC_API __declspec(dllimport) ZDICTLIB_VISIBLE |
285 | # else |
286 | # define ZDICTLIB_STATIC_API ZDICTLIB_VISIBLE |
287 | # endif |
288 | #endif /* ZDICTLIB_STATIC_API */ |
289 | |
290 | /* ==================================================================================== |
291 | * The definitions in this section are considered experimental. |
292 | * They should never be used with a dynamic library, as they may change in the future. |
293 | * They are provided for advanced usages. |
294 | * Use them only in association with static linking. |
295 | * ==================================================================================== */ |
296 | |
297 | #define ZDICT_DICTSIZE_MIN 256 /* minimum dictionary size; see the `maxDictSize` requirement of ZDICT_finalizeDictionary() */ |
298 | /* Deprecated: to be removed in v1.6.0 */ |
299 | #define ZDICT_CONTENTSIZE_MIN 128 |
300 | |
301 | /*! ZDICT_cover_params_t: |
302 | * k and d are the only required parameters. |
303 | * For others, value 0 means default. |
304 | */ |
305 | typedef struct { |
306 | unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */ |
307 | unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */ |
308 | unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */ |
309 | unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */ |
310 | double splitPoint; /* Fraction of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used for training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */ |
311 | unsigned shrinkDict; /* Train dictionaries to shrink in size, starting from the minimum size, and select the smallest dictionary that is at most shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */ |
312 | unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worst shrinkDictMaxRegression% worse than the max dict size dictionary. */ |
313 | ZDICT_params_t zParams; /* common parameters, see ZDICT_params_t */ |
314 | } ZDICT_cover_params_t; |
315 | |
316 | typedef struct { /* parameters of ZDICT_trainFromBuffer_fastCover() and ZDICT_optimizeTrainFromBuffer_fastCover() */ |
317 | unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */ |
318 | unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */ |
319 | unsigned f; /* log of size of frequency array : constraint: 0 < f <= 31 : 0 means default (20) */ |
320 | unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */ |
321 | unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */ |
322 | double splitPoint; /* Fraction of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used for training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (0.75), 1.0 when all samples are used for both training and testing */ |
323 | unsigned accel; /* Acceleration level: constraint: 0 < accel <= 10, higher means faster and less accurate, 0 means default (1) */ |
324 | unsigned shrinkDict; /* Train dictionaries to shrink in size, starting from the minimum size, and select the smallest dictionary that is at most shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */ |
325 | unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worst shrinkDictMaxRegression% worse than the max dict size dictionary. */ |
326 | |
327 | ZDICT_params_t zParams; /* common parameters, see ZDICT_params_t */ |
328 | } ZDICT_fastCover_params_t; |
329 | |
330 | /*! ZDICT_trainFromBuffer_cover(): |
331 | * Train a dictionary from an array of samples using the COVER algorithm. |
332 | * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, |
333 | * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. |
334 | * The resulting dictionary will be saved into `dictBuffer`. |
335 | * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) |
336 | * or an error code, which can be tested with ZDICT_isError(). |
337 | * See ZDICT_trainFromBuffer() for details on failure modes. |
338 | * Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte. |
339 | * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. |
340 | * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. |
341 | * In general, it's recommended to provide a few thousand samples, though this can vary a lot. |
342 | * It's recommended that the total size of all samples be about 100 times the target size of the dictionary. |
343 | */ |
344 | ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_cover( |
345 | void *dictBuffer, size_t dictBufferCapacity, |
346 | const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, |
347 | ZDICT_cover_params_t parameters); |
348 | |
349 | /*! ZDICT_optimizeTrainFromBuffer_cover(): |
350 | * The same requirements as for ZDICT_trainFromBuffer_cover() hold for all the parameters except `parameters`. |
351 | * This function tries many parameter combinations and picks the best parameters. |
352 | * `*parameters` is filled with the best parameters found, |
353 | * dictionary constructed with those parameters is stored in `dictBuffer`. |
354 | * |
355 | * All of the parameters d, k, steps are optional. |
356 | * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}. |
357 | * If steps is zero it defaults to its default value. |
358 | * If k is non-zero then we don't check multiple values of k, otherwise we check `steps` values in [50, 2000]. |
359 | * |
360 | * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) |
361 | * or an error code, which can be tested with ZDICT_isError(). |
362 | * On success `*parameters` contains the parameters selected. |
363 | * See ZDICT_trainFromBuffer() for details on failure modes. |
364 | * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread. |
365 | */ |
366 | ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_cover( |
367 | void* dictBuffer, size_t dictBufferCapacity, |
368 | const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, |
369 | ZDICT_cover_params_t* parameters); |
370 | |
371 | /*! ZDICT_trainFromBuffer_fastCover(): |
372 | * Train a dictionary from an array of samples using a modified version of the COVER algorithm. |
373 | * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, |
374 | * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. |
375 | * d and k are required. |
376 | * All other parameters are optional and will use default values if not provided. |
377 | * The resulting dictionary will be saved into `dictBuffer`. |
378 | * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) |
379 | * or an error code, which can be tested with ZDICT_isError(). |
380 | * See ZDICT_trainFromBuffer() for details on failure modes. |
381 | * Note: ZDICT_trainFromBuffer_fastCover() requires 6 * 2^f bytes of memory. |
382 | * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. |
383 | * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. |
384 | * In general, it's recommended to provide a few thousand samples, though this can vary a lot. |
385 | * It's recommended that the total size of all samples be about 100 times the target size of the dictionary. |
386 | */ |
387 | ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_fastCover(void *dictBuffer, |
388 | size_t dictBufferCapacity, const void *samplesBuffer, |
389 | const size_t *samplesSizes, unsigned nbSamples, |
390 | ZDICT_fastCover_params_t parameters); |
391 | |
392 | /*! ZDICT_optimizeTrainFromBuffer_fastCover(): |
393 | * The same requirements as for ZDICT_trainFromBuffer_fastCover() hold for all the parameters except `parameters`. |
394 | * This function tries many parameter combinations (specifically, k and d combinations) |
395 | * and picks the best parameters. `*parameters` is filled with the best parameters found, |
396 | * dictionary constructed with those parameters is stored in `dictBuffer`. |
397 | * All of the parameters d, k, steps, f, and accel are optional. |
398 | * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}. |
399 | * If steps is zero it defaults to its default value. |
400 | * If k is non-zero then we don't check multiple values of k, otherwise we check `steps` values in [50, 2000]. |
401 | * If f is zero, default value of 20 is used. |
402 | * If accel is zero, default value of 1 is used. |
403 | * |
404 | * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) |
405 | * or an error code, which can be tested with ZDICT_isError(). |
406 | * On success `*parameters` contains the parameters selected. |
407 | * See ZDICT_trainFromBuffer() for details on failure modes. |
408 | * Note: ZDICT_optimizeTrainFromBuffer_fastCover() requires about 6 * 2^f bytes of memory for each thread. |
409 | */ |
410 | ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer, |
411 | size_t dictBufferCapacity, const void* samplesBuffer, |
412 | const size_t* samplesSizes, unsigned nbSamples, |
413 | ZDICT_fastCover_params_t* parameters); |
414 | |
415 | typedef struct { /* parameters of ZDICT_trainFromBuffer_legacy() */ |
416 | unsigned selectivityLevel; /* 0 means default; larger => select more => larger dictionary */ |
417 | ZDICT_params_t zParams; /* common parameters, see ZDICT_params_t */ |
418 | } ZDICT_legacy_params_t; |
419 | |
420 | /*! ZDICT_trainFromBuffer_legacy(): |
421 | * Train a dictionary from an array of samples. |
422 | * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, |
423 | * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. |
424 | * The resulting dictionary will be saved into `dictBuffer`. |
425 | * `parameters` is optional and can be provided with values set to 0 to mean "default". |
426 | * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) |
427 | * or an error code, which can be tested with ZDICT_isError(). |
428 | * See ZDICT_trainFromBuffer() for details on failure modes. |
429 | * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. |
430 | * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. |
431 | * In general, it's recommended to provide a few thousand samples, though this can vary a lot. |
432 | * It's recommended that the total size of all samples be about 100 times the target size of the dictionary. |
433 | * Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0. |
434 | */ |
435 | ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_legacy( |
436 | void* dictBuffer, size_t dictBufferCapacity, |
437 | const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, |
438 | ZDICT_legacy_params_t parameters); |
439 | |
440 | |
441 | /* Deprecation warnings */ |
442 | /* It is generally possible to disable deprecation warnings from the compiler, |
443 | for example with -Wno-deprecated-declarations for gcc |
444 | or _CRT_SECURE_NO_WARNINGS in Visual. |
445 | Otherwise, it's also possible to manually define ZDICT_DISABLE_DEPRECATE_WARNINGS. */ |
446 | #ifdef ZDICT_DISABLE_DEPRECATE_WARNINGS |
447 | # define ZDICT_DEPRECATED(message) /* disable deprecation warnings */ |
448 | #else |
449 | # define ZDICT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) |
450 | # if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ |
451 | # define ZDICT_DEPRECATED(message) [[deprecated(message)]] |
452 | # elif defined(__clang__) || (ZDICT_GCC_VERSION >= 405) |
453 | # define ZDICT_DEPRECATED(message) __attribute__((deprecated(message))) |
454 | # elif (ZDICT_GCC_VERSION >= 301) /* this branch drops `message`: the attribute is used without an argument */ |
455 | # define ZDICT_DEPRECATED(message) __attribute__((deprecated)) |
456 | # elif defined(_MSC_VER) |
457 | # define ZDICT_DEPRECATED(message) __declspec(deprecated(message)) |
458 | # else |
459 | # pragma message("WARNING: You need to implement ZDICT_DEPRECATED for this compiler") |
460 | # define ZDICT_DEPRECATED(message) |
461 | # endif |
462 | #endif /* ZDICT_DISABLE_DEPRECATE_WARNINGS */ |
463 | |
464 | ZDICT_DEPRECATED("use ZDICT_finalizeDictionary() instead") |
465 | ZDICTLIB_STATIC_API |
466 | size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, |
467 | const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples); |
468 | |
469 | |
470 | #endif /* ZSTD_ZDICT_H_STATIC */ |
471 | |
472 | #if defined (__cplusplus) |
473 | } |
474 | #endif |