2 * Copyright (c) Meta Platforms, Inc. and affiliates.
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
11 #ifndef ZDICT_STATIC_LINKING_ONLY
12 # define ZDICT_STATIC_LINKING_ONLY
15 #include <stdio.h> /* fprintf */
16 #include <stdlib.h> /* malloc, free, qsort */
17 #include <string.h> /* memset */
18 #include <time.h> /* clock */
19 #include "../common/mem.h" /* read */
20 #include "../common/pool.h"
21 #include "../common/threading.h"
22 #include "../common/zstd_internal.h" /* includes zstd.h */
26 * COVER_best_t is used for two purposes:
27 * 1. Synchronizing threads.
28 * 2. Saving the best parameters and dictionary.
30 * All of the methods except COVER_best_init() are thread safe if zstd is
31 * compiled with multithreaded support.
33 typedef struct COVER_best_s {
34 ZSTD_pthread_mutex_t mutex;
35 ZSTD_pthread_cond_t cond;
39 ZDICT_cover_params_t parameters;
40 size_t compressedSize;
44 * A segment is a range in the source as well as the score of the segment.
53 * Number of epochs and size of each epoch.
61 * Struct used for the dictionary selection function.
/* Result record used by the dictionary selection function.
 * NOTE(review): only totalCompressedSize is visible in this view of the
 * header; the full definition may declare additional members - confirm. */
63 typedef struct COVER_dictSelection {
/* Total compressed size associated with the selection. The error helper
 * described in this header returns a struct whose totalCompressedSize holds a
 * ZSTD error, so this field doubles as the error channel. */
66 size_t totalCompressedSize;
67 } COVER_dictSelection_t;
70 * Computes the number of epochs and the size of each epoch.
71 * We will make sure that each epoch gets at least 10 * k bytes.
73 * The COVER algorithms divide the data up into epochs of equal size and
74 * select one segment from each epoch.
76 * @param maxDictSize The maximum allowed dictionary size.
77 * @param nbDmers The number of dmers we are training on.
78 * @param k The parameter k (segment size).
79 * @param passes The target number of passes over the dmer corpus.
80 * More passes means a better dictionary.
82 COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
86 * Warns the user when their corpus is too small.
/* Warns the user when their training corpus is too small.
 *   maxDictSize  - presumably the maximum allowed dictionary size, as described
 *                  for COVER_computeEpochs in this header; TODO confirm.
 *   nbDmers      - presumably the number of dmers being trained on; TODO confirm.
 *   displayLevel - presumably the verbosity level gating the warning output;
 *                  not described in this header, confirm in the implementation.
 * Returns nothing. */
88 void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
91 * Checks total compressed size of a dictionary
/* Checks the total compressed size of a dictionary.
 *   parameters         - cover parameters, passed by value.
 *   samplesSizes       - array of sample sizes (read-only).
 *   samples            - sample bytes (read-only); presumably all samples
 *                        stored back to back - TODO confirm.
 *   nbTrainSamples     - presumably the count of samples used for training;
 *   nbSamples            nbSamples presumably the total sample count - confirm.
 *   dict               - dictionary buffer; dictBufferCapacity is its size
 *   dictBufferCapacity   going by the name - confirm in the implementation.
 * Returns a size_t; presumably the total compressed size, or a ZSTD error
 * code on failure - the error convention is not shown in this header. */
93 size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
94 const size_t *samplesSizes, const BYTE *samples,
96 size_t nbTrainSamples, size_t nbSamples,
97 BYTE *const dict, size_t dictBufferCapacity);
100 * Returns the sum of the sample sizes.
/* Returns the sum of the sample sizes.
 *   samplesSizes - array of sample sizes (read-only).
 *   nbSamples    - presumably the number of entries of samplesSizes to add up;
 *                  TODO confirm against the implementation. */
102 size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;
105 * Initialize the `COVER_best_t`.
/* Initializes the COVER_best_t pointed to by `best`.
 * Per the COVER_best_t notes in this header, this is the one COVER_best_*
 * method that is not thread safe. */
107 void COVER_best_init(COVER_best_t *best);
110 * Wait until liveJobs == 0.
/* Waits until the liveJobs count of `best` reaches 0.
 * NOTE(review): liveJobs is not declared in the visible part of the struct;
 * presumably it counts threads that have started but not yet finished. */
112 void COVER_best_wait(COVER_best_t *best);
115 * Call COVER_best_wait() and then destroy the COVER_best_t.
/* Calls COVER_best_wait() on `best` and then destroys the COVER_best_t.
 * Presumably `best` must not be used afterwards without re-initialization;
 * confirm against the implementation. */
117 void COVER_best_destroy(COVER_best_t *best);
120 * Called when a thread is about to be launched.
121 * Increments liveJobs.
/* To be called when a thread is about to be launched: increments the
 * liveJobs count of `best`. */
123 void COVER_best_start(COVER_best_t *best);
126 * Called when a thread finishes executing, on either error or success.
127 * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
128 * If this dictionary is the best so far save it and its parameters.
/* To be called when a thread finishes executing, whether it failed or
 * succeeded. Decrements liveJobs and signals any waiting threads once
 * liveJobs == 0. If the dictionary in `selection` is the best so far, it is
 * saved together with `parameters`.
 *   best       - shared state updated by this call.
 *   parameters - the cover parameters that produced `selection`, by value.
 *   selection  - result of the dictionary selection step, by value. */
130 void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
131 COVER_dictSelection_t selection);
133 * Error function for COVER_selectDict function. Checks if the return value is an error.
/* Error check for a COVER_selectDict result.
 * Returns an unsigned flag; presumably non-zero when `selection` represents
 * an error and 0 otherwise - confirm against the implementation. */
136 unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
139 * Error function for COVER_selectDict function. Returns a struct where
140 * return.totalCompressedSize is a ZSTD error.
/* Builds the error value for COVER_selectDict: returns a struct whose
 * totalCompressedSize is a ZSTD error.
 *   error - presumably the ZSTD error code to store; TODO confirm. */
142 COVER_dictSelection_t COVER_dictSelectionError(size_t error);
145 * Always call this after COVER_selectDict() to free the memory used by the
146 * newly created dictionary.
/* Releases the memory held by a selection's newly created dictionary.
 * Should always be called once a selectDict result is no longer needed.
 * `selection` is passed by value, so the caller's copy is not modified. */
148 void COVER_dictSelectionFree(COVER_dictSelection_t selection);
151 * Called to finalize the dictionary and select one based on whether or not
152 * the shrink-dict flag was enabled. If enabled the dictionary used is the
153 * smallest dictionary within a specified regression of the compressed size
154 * from the largest dictionary.
/* Finalizes the dictionary and selects one, depending on whether the
 * shrink-dict flag is enabled. When enabled, the dictionary chosen is the
 * smallest one within a specified regression of the compressed size of the
 * largest dictionary.
 *   customDictContent  - dictionary content buffer (presumably the raw
 *                        content produced by training - confirm).
 *   dictBufferCapacity - presumably the capacity of the dictionary buffer.
 *   dictContentSize    - presumably the size of the content in
 *                        customDictContent.
 *   samplesBuffer      - sample bytes (read-only).
 *   samplesSizes       - array of sample sizes (read-only).
 *   nbFinalizeSamples  - presumably the sample count used when finalizing.
 *   nbCheckSamples     - presumably the sample count used when checking the
 *                        compressed size.
 *   nbSamples          - presumably the total sample count.
 *   params             - cover parameters, by value; presumably carries the
 *                        shrink-dict settings - confirm.
 *   offsets            - caller-provided size_t array; purpose not shown here.
 *   totalCompressedSize- presumably a starting/reference compressed size.
 * Returns a COVER_dictSelection_t; check it with COVER_dictSelectionIsError()
 * and release it with COVER_dictSelectionFree().
 * NOTE(review): parameter meanings above are inferred from names only. */
156 COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
157 size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
158 size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);