648db22b |
1 | /* |
2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | * All rights reserved. |
4 | * |
5 | * This source code is licensed under both the BSD-style license (found in the |
6 | * LICENSE file in the root directory of this source tree) and the GPLv2 (found |
7 | * in the COPYING file in the root directory of this source tree). |
8 | * You may select, at your option, one of the above-listed licenses. |
9 | */ |
10 | |
11 | #ifndef ZDICT_STATIC_LINKING_ONLY |
12 | # define ZDICT_STATIC_LINKING_ONLY |
13 | #endif |
14 | |
15 | #include <stdio.h> /* fprintf */ |
16 | #include <stdlib.h> /* malloc, free, qsort */ |
17 | #include <string.h> /* memset */ |
18 | #include <time.h> /* clock */ |
19 | #include "../common/mem.h" /* read */ |
20 | #include "../common/pool.h" |
21 | #include "../common/threading.h" |
22 | #include "../common/zstd_internal.h" /* includes zstd.h */ |
23 | #include "../zdict.h" |
24 | |
25 | /** |
26 | * COVER_best_t is used for two purposes: |
27 | * 1. Synchronizing threads. |
28 | * 2. Saving the best parameters and dictionary. |
29 | * |
30 | * All of the methods except COVER_best_init() are thread safe if zstd is |
31 | * compiled with multithreaded support. |
32 | */ |
33 | typedef struct COVER_best_s { |
34 | ZSTD_pthread_mutex_t mutex; |
35 | ZSTD_pthread_cond_t cond; |
36 | size_t liveJobs; |
37 | void *dict; |
38 | size_t dictSize; |
39 | ZDICT_cover_params_t parameters; |
40 | size_t compressedSize; |
41 | } COVER_best_t; |
42 | |
43 | /** |
44 | * A segment is a range in the source as well as the score of the segment. |
45 | */ |
46 | typedef struct { |
47 | U32 begin; |
48 | U32 end; |
49 | U32 score; |
50 | } COVER_segment_t; |
51 | |
52 | /** |
53 | *Number of epochs and size of each epoch. |
54 | */ |
55 | typedef struct { |
56 | U32 num; |
57 | U32 size; |
58 | } COVER_epoch_info_t; |
59 | |
60 | /** |
61 | * Struct used for the dictionary selection function. |
62 | */ |
63 | typedef struct COVER_dictSelection { |
64 | BYTE* dictContent; |
65 | size_t dictSize; |
66 | size_t totalCompressedSize; |
67 | } COVER_dictSelection_t; |
68 | |
69 | /** |
70 | * Computes the number of epochs and the size of each epoch. |
71 | * We will make sure that each epoch gets at least 10 * k bytes. |
72 | * |
73 | * The COVER algorithms divide the data up into epochs of equal size and |
74 | * select one segment from each epoch. |
75 | * |
76 | * @param maxDictSize The maximum allowed dictionary size. |
77 | * @param nbDmers The number of dmers we are training on. |
78 | * @param k The parameter k (segment size). |
79 | * @param passes The target number of passes over the dmer corpus. |
80 | * More passes means a better dictionary. |
81 | */ |
82 | COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers, |
83 | U32 k, U32 passes); |
84 | |
85 | /** |
86 | * Warns the user when their corpus is too small. |
87 | */ |
88 | void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel); |
89 | |
90 | /** |
91 | * Checks total compressed size of a dictionary |
92 | */ |
93 | size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters, |
94 | const size_t *samplesSizes, const BYTE *samples, |
95 | size_t *offsets, |
96 | size_t nbTrainSamples, size_t nbSamples, |
97 | BYTE *const dict, size_t dictBufferCapacity); |
98 | |
99 | /** |
100 | * Returns the sum of the sample sizes. |
101 | */ |
102 | size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ; |
103 | |
104 | /** |
105 | * Initialize the `COVER_best_t`. |
106 | */ |
107 | void COVER_best_init(COVER_best_t *best); |
108 | |
109 | /** |
110 | * Wait until liveJobs == 0. |
111 | */ |
112 | void COVER_best_wait(COVER_best_t *best); |
113 | |
114 | /** |
115 | * Call COVER_best_wait() and then destroy the COVER_best_t. |
116 | */ |
117 | void COVER_best_destroy(COVER_best_t *best); |
118 | |
119 | /** |
120 | * Called when a thread is about to be launched. |
121 | * Increments liveJobs. |
122 | */ |
123 | void COVER_best_start(COVER_best_t *best); |
124 | |
125 | /** |
126 | * Called when a thread finishes executing, both on error or success. |
127 | * Decrements liveJobs and signals any waiting threads if liveJobs == 0. |
128 | * If this dictionary is the best so far save it and its parameters. |
129 | */ |
130 | void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters, |
131 | COVER_dictSelection_t selection); |
132 | /** |
133 | * Error function for COVER_selectDict function. Checks if the return |
134 | * value is an error. |
135 | */ |
136 | unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection); |
137 | |
138 | /** |
139 | * Error function for COVER_selectDict function. Returns a struct where |
140 | * return.totalCompressedSize is a ZSTD error. |
141 | */ |
142 | COVER_dictSelection_t COVER_dictSelectionError(size_t error); |
143 | |
144 | /** |
145 | * Always call after selectDict is called to free up used memory from |
146 | * newly created dictionary. |
147 | */ |
148 | void COVER_dictSelectionFree(COVER_dictSelection_t selection); |
149 | |
150 | /** |
151 | * Called to finalize the dictionary and select one based on whether or not |
152 | * the shrink-dict flag was enabled. If enabled the dictionary used is the |
153 | * smallest dictionary within a specified regression of the compressed size |
154 | * from the largest dictionary. |
155 | */ |
156 | COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity, |
157 | size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples, |
158 | size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize); |