| 1 | /* |
| 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
| 3 | * All rights reserved. |
| 4 | * |
| 5 | * This source code is licensed under both the BSD-style license (found in the |
| 6 | * LICENSE file in the root directory of this source tree) and the GPLv2 (found |
| 7 | * in the COPYING file in the root directory of this source tree). |
| 8 | * You may select, at your option, one of the above-listed licenses. |
| 9 | */ |
| 10 | |
| 11 | |
| 12 | |
| 13 | /* ************************************** |
| 14 | * Compiler Warnings |
| 15 | ****************************************/ |
| 16 | #ifdef _MSC_VER |
| 17 | # pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ |
| 18 | #endif |
| 19 | |
| 20 | |
| 21 | /*-************************************* |
| 22 | * Includes |
| 23 | ***************************************/ |
| 24 | #include "platform.h" /* Large Files support */ |
| 25 | #include "util.h" /* UTIL_getFileSize, UTIL_getTotalFileSize */ |
| 26 | #include <stdlib.h> /* malloc, free */ |
| 27 | #include <string.h> /* memset */ |
| 28 | #include <stdio.h> /* fprintf, fopen, ftello64 */ |
| 29 | #include <errno.h> /* errno */ |
| 30 | |
| 31 | #include "timefn.h" /* UTIL_time_t, UTIL_clockSpanMicro, UTIL_getTime */ |
| 32 | #include "../lib/common/debug.h" /* assert */ |
| 33 | #include "../lib/common/mem.h" /* read */ |
| 34 | #include "../lib/zstd_errors.h" |
| 35 | #include "dibio.h" |
| 36 | |
| 37 | |
| 38 | /*-************************************* |
| 39 | * Constants |
| 40 | ***************************************/ |
| 41 | #define KB *(1 <<10) |
| 42 | #define MB *(1 <<20) |
| 43 | #define GB *(1U<<30) |
| 44 | |
| 45 | #define SAMPLESIZE_MAX (128 KB) |
| 46 | #define MEMMULT 11 /* rough estimation : memory cost to analyze 1 byte of sample */ |
| 47 | #define COVER_MEMMULT 9 /* rough estimation : memory cost to analyze 1 byte of sample */ |
| 48 | #define FASTCOVER_MEMMULT 1 /* rough estimation : memory cost to analyze 1 byte of sample */ |
| 49 | static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); |
| 50 | |
| 51 | #define NOISELENGTH 32 |
| 52 | #define MAX_SAMPLES_SIZE (2 GB) /* training dataset limited to 2GB */ |
| 53 | |
| 54 | |
| 55 | /*-************************************* |
| 56 | * Console display |
| 57 | ***************************************/ |
| 58 | #define DISPLAY(...) fprintf(stderr, __VA_ARGS__) |
| 59 | #define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } |
| 60 | |
| 61 | static const U64 g_refreshRate = SEC_TO_MICRO / 6; |
| 62 | static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; |
| 63 | |
| 64 | #define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \ |
| 65 | if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \ |
| 66 | { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ |
| 67 | if (displayLevel>=4) fflush(stderr); } } } |
| 68 | |
| 69 | /*-************************************* |
| 70 | * Exceptions |
| 71 | ***************************************/ |
| 72 | #ifndef DEBUG |
| 73 | # define DEBUG 0 |
| 74 | #endif |
| 75 | #define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); |
| 76 | #define EXM_THROW(error, ...) \ |
| 77 | { \ |
| 78 | DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ |
| 79 | DISPLAY("Error %i : ", error); \ |
| 80 | DISPLAY(__VA_ARGS__); \ |
| 81 | DISPLAY("\n"); \ |
| 82 | exit(error); \ |
| 83 | } |
| 84 | |
| 85 | |
| 86 | /* ******************************************************** |
| 87 | * Helper functions |
| 88 | **********************************************************/ |
| 89 | #undef MIN |
| 90 | #define MIN(a,b) ((a) < (b) ? (a) : (b)) |
| 91 | |
| 92 | /** |
| 93 | Returns the size of a file. |
| 94 | If error returns -1. |
| 95 | */ |
| 96 | static S64 DiB_getFileSize (const char * fileName) |
| 97 | { |
| 98 | U64 const fileSize = UTIL_getFileSize(fileName); |
| 99 | return (fileSize == UTIL_FILESIZE_UNKNOWN) ? -1 : (S64)fileSize; |
| 100 | } |
| 101 | |
| 102 | /* ******************************************************** |
| 103 | * File related operations |
| 104 | **********************************************************/ |
/** DiB_loadFiles() :
 *  load samples from files listed in fileNamesTable into buffer.
 *  works even if buffer is too small to load all samples :
 *  loading simply stops when the buffer is full.
 *  Also provides the size of each sample into sampleSizes table
 *  which must be sized correctly, using DiB_fileStats().
 *  If targetChunkSize > 0, each file is split into chunks of that size,
 *  one chunk per sample; otherwise one file is one sample,
 *  truncated to SAMPLESIZE_MAX.
 * @return : nb of samples effectively loaded into `buffer`
 * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
 *  sampleSizes is filled with the size of each sample.
 */
static int DiB_loadFiles(
    void* buffer, size_t* bufferSizePtr,
    size_t* sampleSizes, int sstSize,
    const char** fileNamesTable, int nbFiles,
    size_t targetChunkSize, int displayLevel )
{
    char* const buff = (char*)buffer;
    size_t totalDataLoaded = 0;
    int nbSamplesLoaded = 0;
    int fileIndex = 0;
    FILE * f = NULL;

    /* caller guarantees chunking never produces samples above SAMPLESIZE_MAX */
    assert(targetChunkSize <= SAMPLESIZE_MAX);

    /* stop as soon as either the sampleSizes table or the file list is exhausted */
    while ( nbSamplesLoaded < sstSize && fileIndex < nbFiles ) {
        size_t fileDataLoaded;
        S64 const fileSize = DiB_getFileSize(fileNamesTable[fileIndex]);
        if (fileSize <= 0) {
            /* skip if zero-size or file error (DiB_getFileSize() returns -1 on error) */
            ++fileIndex;
            continue;
        }

        f = fopen( fileNamesTable[fileIndex], "rb");
        if (f == NULL)
            EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileNamesTable[fileIndex], strerror(errno));
        DISPLAYUPDATE(2, "Loading %s... \r", fileNamesTable[fileIndex]);

        /* Load the first chunk of data from the file */
        fileDataLoaded = targetChunkSize > 0 ?
                            (size_t)MIN(fileSize, (S64)targetChunkSize) :
                            (size_t)MIN(fileSize, SAMPLESIZE_MAX );
        if (totalDataLoaded + fileDataLoaded > *bufferSizePtr)
            break;   /* buffer full : stop loading; `f` is closed after the loop */
        if (fread( buff+totalDataLoaded, 1, fileDataLoaded, f ) != fileDataLoaded)
            EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
        sampleSizes[nbSamplesLoaded++] = fileDataLoaded;
        totalDataLoaded += fileDataLoaded;

        /* If file-chunking is enabled, load the rest of the file as more samples */
        if (targetChunkSize > 0) {
            while( (S64)fileDataLoaded < fileSize && nbSamplesLoaded < sstSize ) {
                /* last chunk of a file may be shorter than targetChunkSize */
                size_t const chunkSize = MIN((size_t)(fileSize-fileDataLoaded), targetChunkSize);
                if (totalDataLoaded + chunkSize > *bufferSizePtr) /* buffer is full */
                    break;

                if (fread( buff+totalDataLoaded, 1, chunkSize, f ) != chunkSize)
                    EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
                sampleSizes[nbSamplesLoaded++] = chunkSize;
                totalDataLoaded += chunkSize;
                fileDataLoaded += chunkSize;
            }
        }
        fileIndex += 1;
        fclose(f); f = NULL;
    }
    /* f remains open only when the outer `break` (buffer full) was taken */
    if (f != NULL)
        fclose(f);

    DISPLAYLEVEL(2, "\r%79s\r", "");
    DISPLAYLEVEL(4, "Loaded %d KB total training data, %d nb samples \n",
        (int)(totalDataLoaded / (1 KB)), nbSamplesLoaded );
    *bufferSizePtr = totalDataLoaded;   /* report the amount of data actually loaded */
    return nbSamplesLoaded;
}
| 179 | |
| 180 | #define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r))) |
| 181 | static U32 DiB_rand(U32* src) |
| 182 | { |
| 183 | static const U32 prime1 = 2654435761U; |
| 184 | static const U32 prime2 = 2246822519U; |
| 185 | U32 rand32 = *src; |
| 186 | rand32 *= prime1; |
| 187 | rand32 ^= prime2; |
| 188 | rand32 = DiB_rotl32(rand32, 13); |
| 189 | *src = rand32; |
| 190 | return rand32 >> 5; |
| 191 | } |
| 192 | |
| 193 | /* DiB_shuffle() : |
| 194 | * shuffle a table of file names in a semi-random way |
| 195 | * It improves dictionary quality by reducing "locality" impact, so if sample set is very large, |
| 196 | * it will load random elements from it, instead of just the first ones. */ |
| 197 | static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) { |
| 198 | U32 seed = 0xFD2FB528; |
| 199 | unsigned i; |
| 200 | if (nbFiles == 0) |
| 201 | return; |
| 202 | for (i = nbFiles - 1; i > 0; --i) { |
| 203 | unsigned const j = DiB_rand(&seed) % (i + 1); |
| 204 | const char* const tmp = fileNamesTable[j]; |
| 205 | fileNamesTable[j] = fileNamesTable[i]; |
| 206 | fileNamesTable[i] = tmp; |
| 207 | } |
| 208 | } |
| 209 | |
| 210 | |
| 211 | /*-******************************************************** |
| 212 | * Dictionary training functions |
| 213 | **********************************************************/ |
| 214 | static size_t DiB_findMaxMem(unsigned long long requiredMem) |
| 215 | { |
| 216 | size_t const step = 8 MB; |
| 217 | void* testmem = NULL; |
| 218 | |
| 219 | requiredMem = (((requiredMem >> 23) + 1) << 23); |
| 220 | requiredMem += step; |
| 221 | if (requiredMem > g_maxMemory) requiredMem = g_maxMemory; |
| 222 | |
| 223 | while (!testmem) { |
| 224 | testmem = malloc((size_t)requiredMem); |
| 225 | requiredMem -= step; |
| 226 | } |
| 227 | |
| 228 | free(testmem); |
| 229 | return (size_t)requiredMem; |
| 230 | } |
| 231 | |
| 232 | |
/* DiB_fillNoise() :
 * fill `buffer` with `length` deterministic pseudo-random bytes,
 * produced by a simple multiplicative generator.
 * Used to pad the guard band past the end of the training data. */
static void DiB_fillNoise(void* buffer, size_t length)
{
    unsigned const prime1 = 2654435761U;
    unsigned const prime2 = 2246822519U;
    unsigned char* const bytes = (unsigned char*)buffer;
    unsigned acc = prime1;
    size_t pos;

    for (pos = 0; pos < length; pos++) {
        acc *= prime2;
        bytes[pos] = (unsigned char)(acc >> 21);
    }
}
| 245 | |
| 246 | |
/* DiB_saveDict() :
 * write `buffSize` bytes from `buff` into file `dictFileName`.
 * Exits the process (EXM_THROW) on open, write or close failure. */
static void DiB_saveDict(const char* dictFileName,
                         const void* buff, size_t buffSize)
{
    size_t writtenSize;
    FILE* const ofile = fopen(dictFileName, "wb");
    if (ofile == NULL)
        EXM_THROW(3, "cannot open %s ", dictFileName);

    writtenSize = fwrite(buff, 1, buffSize, ofile);
    if (writtenSize != buffSize)
        EXM_THROW(4, "%s : write error", dictFileName);

    /* fclose() flushes : a failure here means data may not have reached disk */
    if (fclose(ofile) != 0)
        EXM_THROW(5, "%s : flush error", dictFileName);
}
| 259 | |
typedef struct {
    S64 totalSizeToLoad;    /* total bytes to load as training data (per-sample cap already applied) */
    int nbSamples;          /* number of samples the data will be split into */
    int oneSampleTooLarge;  /* non-zero if at least one sample exceeds 2*SAMPLESIZE_MAX */
} fileStats;
| 265 | |
| 266 | /*! DiB_fileStats() : |
| 267 | * Given a list of files, and a chunkSize (0 == no chunk, whole files) |
| 268 | * provides the amount of data to be loaded and the resulting nb of samples. |
| 269 | * This is useful primarily for allocation purpose => sample buffer, and sample sizes table. |
| 270 | */ |
| 271 | static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t chunkSize, int displayLevel) |
| 272 | { |
| 273 | fileStats fs; |
| 274 | int n; |
| 275 | memset(&fs, 0, sizeof(fs)); |
| 276 | |
| 277 | /* We assume that if chunking is requested, the chunk size is < SAMPLESIZE_MAX */ |
| 278 | assert( chunkSize <= SAMPLESIZE_MAX ); |
| 279 | |
| 280 | for (n=0; n<nbFiles; n++) { |
| 281 | S64 const fileSize = DiB_getFileSize(fileNamesTable[n]); |
| 282 | /* TODO: is there a minimum sample size? What if the file is 1-byte? */ |
| 283 | if (fileSize == 0) { |
| 284 | DISPLAYLEVEL(3, "Sample file '%s' has zero size, skipping...\n", fileNamesTable[n]); |
| 285 | continue; |
| 286 | } |
| 287 | |
| 288 | /* the case where we are breaking up files in sample chunks */ |
| 289 | if (chunkSize > 0) { |
| 290 | /* TODO: is there a minimum sample size? Can we have a 1-byte sample? */ |
| 291 | fs.nbSamples += (int)((fileSize + chunkSize-1) / chunkSize); |
| 292 | fs.totalSizeToLoad += fileSize; |
| 293 | } |
| 294 | else { |
| 295 | /* the case where one file is one sample */ |
| 296 | if (fileSize > SAMPLESIZE_MAX) { |
| 297 | /* flag excessively large sample files */ |
| 298 | fs.oneSampleTooLarge |= (fileSize > 2*SAMPLESIZE_MAX); |
| 299 | |
| 300 | /* Limit to the first SAMPLESIZE_MAX (128kB) of the file */ |
| 301 | DISPLAYLEVEL(3, "Sample file '%s' is too large, limiting to %d KB", |
| 302 | fileNamesTable[n], SAMPLESIZE_MAX / (1 KB)); |
| 303 | } |
| 304 | fs.nbSamples += 1; |
| 305 | fs.totalSizeToLoad += MIN(fileSize, SAMPLESIZE_MAX); |
| 306 | } |
| 307 | } |
| 308 | DISPLAYLEVEL(4, "Found training data %d files, %d KB, %d samples\n", nbFiles, (int)(fs.totalSizeToLoad / (1 KB)), fs.nbSamples); |
| 309 | return fs; |
| 310 | } |
| 311 | |
/*! DiB_trainFromFiles() :
 *  Load training samples from `fileNamesTable`, train a dictionary with
 *  exactly one of the three trainers (legacy / cover / fastCover, selected
 *  by whichever params pointer is non-NULL), and save the resulting
 *  dictionary into `dictFileName`.
 * @return : 0 on success, 1 if dictionary training failed.
 *  Exits the process (EXM_THROW) on allocation failure or when the sample
 *  set contains fewer than 5 samples.
 */
int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
                       const char** fileNamesTable, int nbFiles, size_t chunkSize,
                       ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
{
    fileStats fs;
    size_t* sampleSizes;    /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
    int nbSamplesLoaded;    /* nb of samples effectively loaded in srcBuffer */
    size_t loadedSize;      /* total data loaded in srcBuffer for all samples */
    void* srcBuffer /* contiguous buffer with training data/samples */;
    void* const dictBuffer = malloc(maxDictSize);
    int result = 0;

    /* the selected trainer's params also provide the notification level */
    int const displayLevel = params ? params->zParams.notificationLevel :
                        coverParams ? coverParams->zParams.notificationLevel :
                        fastCoverParams ? fastCoverParams->zParams.notificationLevel : 0;

    /* Shuffle input files before we start assessing how much sample data to load.
       The purpose of the shuffle is to pick random samples when the sample
       set is larger than what we can load in memory. */
    DISPLAYLEVEL(3, "Shuffling input files\n");
    DiB_shuffle(fileNamesTable, nbFiles);

    /* Figure out how much sample data to load with how many samples */
    fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);

    {
        /* each trainer has a different estimated memory cost per sample byte */
        int const memMult = params ? MEMMULT :
                            coverParams ? COVER_MEMMULT:
                            FASTCOVER_MEMMULT;
        size_t const maxMem = DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
        /* Limit the size of the training data to the free memory */
        /* Limit the size of the training data to 2GB */
        /* TODO: there is opportunity to stop DiB_fileStats() early when the data limit is reached */
        loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
        if (memLimit != 0) {
            DISPLAYLEVEL(2, "! Warning : setting manual memory limit for dictionary training data at %u MB \n",
                (unsigned)(memLimit / (1 MB)));
            loadedSize = (size_t)MIN(loadedSize, memLimit);
        }
        /* +NOISELENGTH : guard band, filled with noise below when the legacy trainer is used */
        srcBuffer = malloc(loadedSize+NOISELENGTH);
        sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
    }

    /* Checks */
    if ((fs.nbSamples && !sampleSizes) || (!srcBuffer) || (!dictBuffer))
        EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
    if (fs.oneSampleTooLarge) {
        DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n");
        DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n");
        DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
    }
    if (fs.nbSamples < 5) {
        DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n");
        DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n");
        DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
    }
    if (fs.totalSizeToLoad < (S64)maxDictSize * 8) {
        DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n");
        DISPLAYLEVEL(2, "! Samples should be about 100x larger than target dictionary size \n");
    }

    /* init */
    if ((S64)loadedSize < fs.totalSizeToLoad)
        DISPLAYLEVEL(1, "Training samples set too large (%u MB); training on %u MB only...\n",
            (unsigned)(fs.totalSizeToLoad / (1 MB)),
            (unsigned)(loadedSize / (1 MB)));

    /* Load input buffer */
    nbSamplesLoaded = DiB_loadFiles(
        srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable,
        nbFiles, chunkSize, displayLevel);

    {   size_t dictSize = ZSTD_error_GENERIC;
        if (params) {
            DiB_fillNoise((char*)srcBuffer + loadedSize, NOISELENGTH);   /* guard band, for end of buffer condition */
            dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize,
                                                    srcBuffer, sampleSizes, nbSamplesLoaded,
                                                    *params);
        } else if (coverParams) {
            if (optimize) {
                /* optimize mode searches for the best (k,d) parameters, then reports them */
                dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
                                                               srcBuffer, sampleSizes, nbSamplesLoaded,
                                                               coverParams);
                if (!ZDICT_isError(dictSize)) {
                    unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
                    DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d,
                                 coverParams->steps, splitPercentage);
                }
            } else {
                dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
                                                       sampleSizes, nbSamplesLoaded, *coverParams);
            }
        } else if (fastCoverParams != NULL) {
            if (optimize) {
                dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
                                                                   srcBuffer, sampleSizes, nbSamplesLoaded,
                                                                   fastCoverParams);
                if (!ZDICT_isError(dictSize)) {
                    unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
                    DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastCoverParams->k,
                                 fastCoverParams->d, fastCoverParams->f, fastCoverParams->steps, splitPercentage,
                                 fastCoverParams->accel);
                }
            } else {
                dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
                                                           sampleSizes, nbSamplesLoaded, *fastCoverParams);
            }
        } else {
            /* caller must provide exactly one non-NULL params pointer */
            assert(0 /* Impossible */);
        }
        if (ZDICT_isError(dictSize)) {
            DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
            result = 1;
            goto _cleanup;
        }
        /* save dict */
        DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (unsigned)dictSize, dictFileName);
        DiB_saveDict(dictFileName, dictBuffer, dictSize);
    }

    /* clean up */
_cleanup:
    free(srcBuffer);
    free(sampleSizes);
    free(dictBuffer);
    return result;
}