2 * Copyright (c) Meta Platforms, Inc. and affiliates.
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
13 /* **************************************
15 ****************************************/
17 # pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
21 /*-*************************************
23 ***************************************/
24 #include "platform.h" /* Large Files support */
25 #include "util.h" /* UTIL_getFileSize, UTIL_getTotalFileSize */
26 #include <stdlib.h> /* malloc, free */
27 #include <string.h> /* memset */
28 #include <stdio.h> /* fprintf, fopen, ftello64 */
29 #include <errno.h> /* errno */
31 #include "timefn.h" /* UTIL_time_t, UTIL_clockSpanMicro, UTIL_getTime */
32 #include "../lib/common/debug.h" /* assert */
33 #include "../lib/common/mem.h" /* read */
34 #include "../lib/zstd_errors.h"
38 /*-*************************************
40 ***************************************/
45 #define SAMPLESIZE_MAX (128 KB)
46 #define MEMMULT 11 /* rough estimation : memory cost to analyze 1 byte of sample */
47 #define COVER_MEMMULT 9 /* rough estimation : memory cost to analyze 1 byte of sample */
48 #define FASTCOVER_MEMMULT 1 /* rough estimation : memory cost to analyze 1 byte of sample */
49 static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
51 #define NOISELENGTH 32
52 #define MAX_SAMPLES_SIZE (2 GB) /* training dataset limited to 2GB */
55 /*-*************************************
57 ***************************************/
58 #define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
59 #define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
61 static const U64 g_refreshRate = SEC_TO_MICRO / 6;
62 static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
64 #define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
65 if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
66 { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
67 if (displayLevel>=4) fflush(stderr); } } }
69 /*-*************************************
71 ***************************************/
75 #define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
76 #define EXM_THROW(error, ...) \
78 DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
79 DISPLAY("Error %i : ", error); \
80 DISPLAY(__VA_ARGS__); \
86 /* ********************************************************
88 **********************************************************/
90 #define MIN(a,b) ((a) < (b) ? (a) : (b))
93 Returns the size of a file.
96 static S64 DiB_getFileSize (const char * fileName)
98 U64 const fileSize = UTIL_getFileSize(fileName);
99 return (fileSize == UTIL_FILESIZE_UNKNOWN) ? -1 : (S64)fileSize;
102 /* ********************************************************
103 * File related operations
104 **********************************************************/
105 /** DiB_loadFiles() :
106 * load samples from files listed in fileNamesTable into buffer.
107 * works even if buffer is too small to load all samples.
108 * Also provides the size of each sample into sampleSizes table
109 * which must be sized correctly, using DiB_fileStats().
110 * @return : nb of samples effectively loaded into `buffer`
111 * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
112 * sampleSizes is filled with the size of each sample.
114 static int DiB_loadFiles(
115 void* buffer, size_t* bufferSizePtr,
116 size_t* sampleSizes, int sstSize,
117 const char** fileNamesTable, int nbFiles,
118 size_t targetChunkSize, int displayLevel )
120 char* const buff = (char*)buffer;
121 size_t totalDataLoaded = 0;
122 int nbSamplesLoaded = 0;
126 assert(targetChunkSize <= SAMPLESIZE_MAX);
128 while ( nbSamplesLoaded < sstSize && fileIndex < nbFiles ) {
129 size_t fileDataLoaded;
130 S64 const fileSize = DiB_getFileSize(fileNamesTable[fileIndex]);
132 /* skip if zero-size or file error */
137 f = fopen( fileNamesTable[fileIndex], "rb");
139 EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileNamesTable[fileIndex], strerror(errno));
140 DISPLAYUPDATE(2, "Loading %s... \r", fileNamesTable[fileIndex]);
142 /* Load the first chunk of data from the file */
143 fileDataLoaded = targetChunkSize > 0 ?
144 (size_t)MIN(fileSize, (S64)targetChunkSize) :
145 (size_t)MIN(fileSize, SAMPLESIZE_MAX );
146 if (totalDataLoaded + fileDataLoaded > *bufferSizePtr)
148 if (fread( buff+totalDataLoaded, 1, fileDataLoaded, f ) != fileDataLoaded)
149 EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
150 sampleSizes[nbSamplesLoaded++] = fileDataLoaded;
151 totalDataLoaded += fileDataLoaded;
153 /* If file-chunking is enabled, load the rest of the file as more samples */
154 if (targetChunkSize > 0) {
155 while( (S64)fileDataLoaded < fileSize && nbSamplesLoaded < sstSize ) {
156 size_t const chunkSize = MIN((size_t)(fileSize-fileDataLoaded), targetChunkSize);
157 if (totalDataLoaded + chunkSize > *bufferSizePtr) /* buffer is full */
160 if (fread( buff+totalDataLoaded, 1, chunkSize, f ) != chunkSize)
161 EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
162 sampleSizes[nbSamplesLoaded++] = chunkSize;
163 totalDataLoaded += chunkSize;
164 fileDataLoaded += chunkSize;
173 DISPLAYLEVEL(2, "\r%79s\r", "");
174 DISPLAYLEVEL(4, "Loaded %d KB total training data, %d nb samples \n",
175 (int)(totalDataLoaded / (1 KB)), nbSamplesLoaded );
176 *bufferSizePtr = totalDataLoaded;
177 return nbSamplesLoaded;
180 #define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r)))
181 static U32 DiB_rand(U32* src)
183 static const U32 prime1 = 2654435761U;
184 static const U32 prime2 = 2246822519U;
188 rand32 = DiB_rotl32(rand32, 13);
194 * shuffle a table of file names in a semi-random way
195 * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
196 * it will load random elements from it, instead of just the first ones. */
197 static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
198 U32 seed = 0xFD2FB528;
202 for (i = nbFiles - 1; i > 0; --i) {
203 unsigned const j = DiB_rand(&seed) % (i + 1);
204 const char* const tmp = fileNamesTable[j];
205 fileNamesTable[j] = fileNamesTable[i];
206 fileNamesTable[i] = tmp;
211 /*-********************************************************
212 * Dictionary training functions
213 **********************************************************/
214 static size_t DiB_findMaxMem(unsigned long long requiredMem)
216 size_t const step = 8 MB;
217 void* testmem = NULL;
219 requiredMem = (((requiredMem >> 23) + 1) << 23);
221 if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
224 testmem = malloc((size_t)requiredMem);
229 return (size_t)requiredMem;
233 static void DiB_fillNoise(void* buffer, size_t length)
235 unsigned const prime1 = 2654435761U;
236 unsigned const prime2 = 2246822519U;
237 unsigned acc = prime1;
240 for (p=0; p<length; p++) {
242 ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
247 static void DiB_saveDict(const char* dictFileName,
248 const void* buff, size_t buffSize)
250 FILE* const f = fopen(dictFileName, "wb");
251 if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
253 { size_t const n = fwrite(buff, 1, buffSize, f);
254 if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) }
256 { size_t const n = (size_t)fclose(f);
257 if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
263 int oneSampleTooLarge;
266 /*! DiB_fileStats() :
267 * Given a list of files, and a chunkSize (0 == no chunk, whole files)
268 * provides the amount of data to be loaded and the resulting nb of samples.
269 * This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
271 static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t chunkSize, int displayLevel)
275 memset(&fs, 0, sizeof(fs));
277 /* We assume that if chunking is requested, the chunk size is < SAMPLESIZE_MAX */
278 assert( chunkSize <= SAMPLESIZE_MAX );
280 for (n=0; n<nbFiles; n++) {
281 S64 const fileSize = DiB_getFileSize(fileNamesTable[n]);
282 /* TODO: is there a minimum sample size? What if the file is 1-byte? */
284 DISPLAYLEVEL(3, "Sample file '%s' has zero size, skipping...\n", fileNamesTable[n]);
288 /* the case where we are breaking up files in sample chunks */
290 /* TODO: is there a minimum sample size? Can we have a 1-byte sample? */
291 fs.nbSamples += (int)((fileSize + chunkSize-1) / chunkSize);
292 fs.totalSizeToLoad += fileSize;
295 /* the case where one file is one sample */
296 if (fileSize > SAMPLESIZE_MAX) {
297 /* flag excessively large sample files */
298 fs.oneSampleTooLarge |= (fileSize > 2*SAMPLESIZE_MAX);
300 /* Limit to the first SAMPLESIZE_MAX (128kB) of the file */
301 DISPLAYLEVEL(3, "Sample file '%s' is too large, limiting to %d KB",
302 fileNamesTable[n], SAMPLESIZE_MAX / (1 KB));
305 fs.totalSizeToLoad += MIN(fileSize, SAMPLESIZE_MAX);
308 DISPLAYLEVEL(4, "Found training data %d files, %d KB, %d samples\n", nbFiles, (int)(fs.totalSizeToLoad / (1 KB)), fs.nbSamples);
312 int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
313 const char** fileNamesTable, int nbFiles, size_t chunkSize,
314 ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
315 ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
318 size_t* sampleSizes; /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
319 int nbSamplesLoaded; /* nb of samples effectively loaded in srcBuffer */
320 size_t loadedSize; /* total data loaded in srcBuffer for all samples */
321 void* srcBuffer /* contiguous buffer with training data/samples */;
322 void* const dictBuffer = malloc(maxDictSize);
325 int const displayLevel = params ? params->zParams.notificationLevel :
326 coverParams ? coverParams->zParams.notificationLevel :
327 fastCoverParams ? fastCoverParams->zParams.notificationLevel : 0;
329 /* Shuffle input files before we start assessing how much sample datA to load.
330 The purpose of the shuffle is to pick random samples when the sample
331 set is larger than what we can load in memory. */
332 DISPLAYLEVEL(3, "Shuffling input files\n");
333 DiB_shuffle(fileNamesTable, nbFiles);
335 /* Figure out how much sample data to load with how many samples */
336 fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
339 int const memMult = params ? MEMMULT :
340 coverParams ? COVER_MEMMULT:
342 size_t const maxMem = DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
343 /* Limit the size of the training data to the free memory */
344 /* Limit the size of the training data to 2GB */
345 /* TODO: there is opportunity to stop DiB_fileStats() early when the data limit is reached */
346 loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
348 DISPLAYLEVEL(2, "! Warning : setting manual memory limit for dictionary training data at %u MB \n",
349 (unsigned)(memLimit / (1 MB)));
350 loadedSize = (size_t)MIN(loadedSize, memLimit);
352 srcBuffer = malloc(loadedSize+NOISELENGTH);
353 sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
357 if ((fs.nbSamples && !sampleSizes) || (!srcBuffer) || (!dictBuffer))
358 EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
359 if (fs.oneSampleTooLarge) {
360 DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n");
361 DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n");
362 DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
364 if (fs.nbSamples < 5) {
365 DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n");
366 DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n");
367 DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
368 EXM_THROW(14, "nb of samples too low"); /* we now clearly forbid this case */
370 if (fs.totalSizeToLoad < (S64)maxDictSize * 8) {
371 DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n");
372 DISPLAYLEVEL(2, "! Samples should be about 100x larger than target dictionary size \n");
376 if ((S64)loadedSize < fs.totalSizeToLoad)
377 DISPLAYLEVEL(1, "Training samples set too large (%u MB); training on %u MB only...\n",
378 (unsigned)(fs.totalSizeToLoad / (1 MB)),
379 (unsigned)(loadedSize / (1 MB)));
381 /* Load input buffer */
382 nbSamplesLoaded = DiB_loadFiles(
383 srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable,
384 nbFiles, chunkSize, displayLevel);
386 { size_t dictSize = ZSTD_error_GENERIC;
388 DiB_fillNoise((char*)srcBuffer + loadedSize, NOISELENGTH); /* guard band, for end of buffer condition */
389 dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize,
390 srcBuffer, sampleSizes, nbSamplesLoaded,
392 } else if (coverParams) {
394 dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
395 srcBuffer, sampleSizes, nbSamplesLoaded,
397 if (!ZDICT_isError(dictSize)) {
398 unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
399 DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d,
400 coverParams->steps, splitPercentage);
403 dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
404 sampleSizes, nbSamplesLoaded, *coverParams);
406 } else if (fastCoverParams != NULL) {
408 dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
409 srcBuffer, sampleSizes, nbSamplesLoaded,
411 if (!ZDICT_isError(dictSize)) {
412 unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
413 DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastCoverParams->k,
414 fastCoverParams->d, fastCoverParams->f, fastCoverParams->steps, splitPercentage,
415 fastCoverParams->accel);
418 dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
419 sampleSizes, nbSamplesLoaded, *fastCoverParams);
422 assert(0 /* Impossible */);
424 if (ZDICT_isError(dictSize)) {
425 DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */
430 DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (unsigned)dictSize, dictFileName);
431 DiB_saveDict(dictFileName, dictBuffer, dictSize);