blob: 26ebe5ca1d63ca0c99326222e905fb07cbdfd932 [file] [log] [blame]
Yann Collet32fb4072017-08-18 16:52:05 -07001/*
W. Felix Handte5d693cc2022-12-20 12:49:47 -05002 * Copyright (c) Meta Platforms, Inc. and affiliates.
Yann Collet4ded9e52016-08-30 10:04:33 -07003 * All rights reserved.
4 *
Yann Collet32fb4072017-08-18 16:52:05 -07005 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
Yann Collet3128e032017-09-08 00:09:23 -07008 * You may select, at your option, one of the above-listed licenses.
Yann Collet4ded9e52016-08-30 10:04:33 -07009 */
Yann Collet71eafdd2016-02-12 02:31:57 +010010
Yann Collet71eafdd2016-02-12 02:31:57 +010011
Yann Collet71eafdd2016-02-12 02:31:57 +010012
Przemyslaw Skibinski2f6ccee2016-12-21 13:23:34 +010013/* **************************************
14* Compiler Warnings
15****************************************/
16#ifdef _MSC_VER
Yann Collet77c137b2017-09-14 15:12:57 -070017# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
Przemyslaw Skibinski2f6ccee2016-12-21 13:23:34 +010018#endif
19
20
Yann Collet71eafdd2016-02-12 02:31:57 +010021/*-*************************************
22* Includes
23***************************************/
Przemyslaw Skibinski7a8a03c2016-12-21 15:08:44 +010024#include "platform.h" /* Large Files support */
Przemyslaw Skibinski2f6ccee2016-12-21 13:23:34 +010025#include "util.h" /* UTIL_getFileSize, UTIL_getTotalFileSize */
Yann Collet71eafdd2016-02-12 02:31:57 +010026#include <stdlib.h> /* malloc, free */
27#include <string.h> /* memset */
28#include <stdio.h> /* fprintf, fopen, ftello64 */
Yann Colleta3d03a32016-07-06 16:27:17 +020029#include <errno.h> /* errno */
Yann Collet71eafdd2016-02-12 02:31:57 +010030
Yann Collet59a71162019-04-10 12:37:03 -070031#include "timefn.h" /* UTIL_time_t, UTIL_clockSpanMicro, UTIL_getTime */
Nick Terrell246982e2022-01-20 22:41:47 -080032#include "../lib/common/debug.h" /* assert */
W. Felix Handte7dcca6b2020-05-01 16:20:40 -040033#include "../lib/common/mem.h" /* read */
Nick Terrell8d65f872022-01-30 12:16:16 -080034#include "../lib/zstd_errors.h"
inikep23a08892016-04-22 12:43:18 +020035#include "dibio.h"
Yann Collet71eafdd2016-02-12 02:31:57 +010036
37
38/*-*************************************
39* Constants
40***************************************/
/* Size suffixes : written after a literal, e.g. `128 KB` expands to `128 *(1 <<10)` */
#define KB *(1 <<10)
#define MB *(1 <<20)
#define GB *(1U<<30)

/* largest amount of data read from a single sample */
#define SAMPLESIZE_MAX (128 KB)

/* rough estimations : memory cost to analyze 1 byte of sample, per trainer */
#define MEMMULT 11
#define COVER_MEMMULT 9
#define FASTCOVER_MEMMULT 1

/* upper bound applied by DiB_findMaxMem() to any memory request */
static const size_t g_maxMemory = (sizeof(size_t) == 4) ?
                                  (2 GB - 64 MB) :
                                  ((size_t)(512 MB) << sizeof(size_t));

/* size of the guard band appended after the samples in the source buffer */
#define NOISELENGTH 32

/* training dataset limited to 2GB */
#define MAX_SAMPLES_SIZE (2 GB)
Yann Collet71eafdd2016-02-12 02:31:57 +010053
54
55/*-*************************************
56* Console display
57***************************************/
/* DISPLAY() : printf-style output onto stderr */
#define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
/* DISPLAYLEVEL() : DISPLAY() only when the `displayLevel` variable in scope is >= l.
 * Wrapped in do { } while (0) so the macro behaves as a single statement,
 * and remains safe inside an un-braced if / else. */
#define DISPLAYLEVEL(l, ...) do { if (displayLevel>=(l)) { DISPLAY(__VA_ARGS__); } } while (0)
Yann Collet71eafdd2016-02-12 02:31:57 +010060
Nick Terrell9a2f6f42017-11-29 19:11:12 -080061static const U64 g_refreshRate = SEC_TO_MICRO / 6;
62static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
Yann Colletf6ca09b2016-05-09 04:44:45 +020063
Nick Terrell9a2f6f42017-11-29 19:11:12 -080064#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
65 if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
66 { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
67 if (displayLevel>=4) fflush(stderr); } } }
Yann Collet71eafdd2016-02-12 02:31:57 +010068
69/*-*************************************
70* Exceptions
71***************************************/
#ifndef DEBUG
#  define DEBUG 0
#endif

/* DEBUGOUTPUT() : DISPLAY() active only when DEBUG is non-zero.
 * Note : the expansion already ends with a semicolon. */
#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);

/* EXM_THROW() : print "Error <error> : <message>" on stderr, then exit(error).
 * Expands to a brace-delimited block : some call sites omit the trailing semicolon. */
#define EXM_THROW(error, ...)                                             \
{                                                                         \
    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
    DISPLAY("Error %i : ", error);                                        \
    DISPLAY(__VA_ARGS__);                                                 \
    DISPLAY("\n");                                                        \
    exit(error);                                                          \
}
84
85
86/* ********************************************************
87* Helper functions
88**********************************************************/
/* MIN() : smaller of two values.
 * Caution : the selected argument is evaluated twice, avoid side effects. */
#ifdef MIN
#  undef MIN
#endif
#define MIN(a,b) ( ((a) < (b)) ? (a) : (b) )
Yann Colletbcb5f772016-07-06 15:41:03 +020091
stanjo7452598d52021-10-04 17:47:52 -070092/**
93 Returns the size of a file.
94 If error returns -1.
95*/
96static S64 DiB_getFileSize (const char * fileName)
97{
98 U64 const fileSize = UTIL_getFileSize(fileName);
99 return (fileSize == UTIL_FILESIZE_UNKNOWN) ? -1 : (S64)fileSize;
100}
Yann Collet71eafdd2016-02-12 02:31:57 +0100101
102/* ********************************************************
103* File related operations
104**********************************************************/
Yann Collet290aaa72016-05-30 21:18:52 +0200105/** DiB_loadFiles() :
Yann Colletc68d17f2017-09-15 15:31:31 -0700106 * load samples from files listed in fileNamesTable into buffer.
107 * works even if buffer is too small to load all samples.
108 * Also provides the size of each sample into sampleSizes table
109 * which must be sized correctly, using DiB_fileStats().
110 * @return : nb of samples effectively loaded into `buffer`
111 * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
112 * sampleSizes is filled with the size of each sample.
113 */
stanjo7452598d52021-10-04 17:47:52 -0700114static int DiB_loadFiles(
115 void* buffer, size_t* bufferSizePtr,
116 size_t* sampleSizes, int sstSize,
117 const char** fileNamesTable, int nbFiles,
118 size_t targetChunkSize, int displayLevel )
Yann Collet71eafdd2016-02-12 02:31:57 +0100119{
Yann Collet290aaa72016-05-30 21:18:52 +0200120 char* const buff = (char*)buffer;
stanjo7452598d52021-10-04 17:47:52 -0700121 size_t totalDataLoaded = 0;
122 int nbSamplesLoaded = 0;
123 int fileIndex = 0;
124 FILE * f = NULL;
Yann Collet71eafdd2016-02-12 02:31:57 +0100125
stanjo7452598d52021-10-04 17:47:52 -0700126 assert(targetChunkSize <= SAMPLESIZE_MAX);
127
128 while ( nbSamplesLoaded < sstSize && fileIndex < nbFiles ) {
129 size_t fileDataLoaded;
130 S64 const fileSize = DiB_getFileSize(fileNamesTable[fileIndex]);
Nick Terrellda737c72022-03-02 11:04:04 -0800131 if (fileSize <= 0) {
132 /* skip if zero-size or file error */
133 ++fileIndex;
stanjo7452598d52021-10-04 17:47:52 -0700134 continue;
Nick Terrellda737c72022-03-02 11:04:04 -0800135 }
stanjo7452598d52021-10-04 17:47:52 -0700136
137 f = fopen( fileNamesTable[fileIndex], "rb");
138 if (f == NULL)
139 EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileNamesTable[fileIndex], strerror(errno));
140 DISPLAYUPDATE(2, "Loading %s... \r", fileNamesTable[fileIndex]);
141
142 /* Load the first chunk of data from the file */
143 fileDataLoaded = targetChunkSize > 0 ?
144 (size_t)MIN(fileSize, (S64)targetChunkSize) :
145 (size_t)MIN(fileSize, SAMPLESIZE_MAX );
146 if (totalDataLoaded + fileDataLoaded > *bufferSizePtr)
147 break;
148 if (fread( buff+totalDataLoaded, 1, fileDataLoaded, f ) != fileDataLoaded)
149 EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
150 sampleSizes[nbSamplesLoaded++] = fileDataLoaded;
151 totalDataLoaded += fileDataLoaded;
152
153 /* If file-chunking is enabled, load the rest of the file as more samples */
154 if (targetChunkSize > 0) {
155 while( (S64)fileDataLoaded < fileSize && nbSamplesLoaded < sstSize ) {
156 size_t const chunkSize = MIN((size_t)(fileSize-fileDataLoaded), targetChunkSize);
157 if (totalDataLoaded + chunkSize > *bufferSizePtr) /* buffer is full */
Yann Colletc68d17f2017-09-15 15:31:31 -0700158 break;
stanjo7452598d52021-10-04 17:47:52 -0700159
160 if (fread( buff+totalDataLoaded, 1, chunkSize, f ) != chunkSize)
161 EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
162 sampleSizes[nbSamplesLoaded++] = chunkSize;
163 totalDataLoaded += chunkSize;
164 fileDataLoaded += chunkSize;
165 }
166 }
167 fileIndex += 1;
168 fclose(f); f = NULL;
Yann Collet086b9592017-09-14 16:45:10 -0700169 }
stanjo7452598d52021-10-04 17:47:52 -0700170 if (f != NULL)
171 fclose(f);
172
Nick Terrelldf8415c2016-12-31 21:08:24 -0800173 DISPLAYLEVEL(2, "\r%79s\r", "");
stanjo7452598d52021-10-04 17:47:52 -0700174 DISPLAYLEVEL(4, "Loaded %d KB total training data, %d nb samples \n",
175 (int)(totalDataLoaded / (1 KB)), nbSamplesLoaded );
176 *bufferSizePtr = totalDataLoaded;
177 return nbSamplesLoaded;
Yann Collet71eafdd2016-02-12 02:31:57 +0100178}
179
Nick Terrelldf8415c2016-12-31 21:08:24 -0800180#define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r)))
181static U32 DiB_rand(U32* src)
182{
183 static const U32 prime1 = 2654435761U;
184 static const U32 prime2 = 2246822519U;
185 U32 rand32 = *src;
186 rand32 *= prime1;
187 rand32 ^= prime2;
188 rand32 = DiB_rotl32(rand32, 13);
189 *src = rand32;
190 return rand32 >> 5;
191}
192
Yann Collet77c137b2017-09-14 15:12:57 -0700193/* DiB_shuffle() :
194 * shuffle a table of file names in a semi-random way
195 * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
196 * it will load random elements from it, instead of just the first ones. */
Nick Terrelldf8415c2016-12-31 21:08:24 -0800197static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
Yann Collet77c137b2017-09-14 15:12:57 -0700198 U32 seed = 0xFD2FB528;
199 unsigned i;
Nick Terrell246982e2022-01-20 22:41:47 -0800200 if (nbFiles == 0)
201 return;
Yann Collet77c137b2017-09-14 15:12:57 -0700202 for (i = nbFiles - 1; i > 0; --i) {
203 unsigned const j = DiB_rand(&seed) % (i + 1);
204 const char* const tmp = fileNamesTable[j];
205 fileNamesTable[j] = fileNamesTable[i];
206 fileNamesTable[i] = tmp;
207 }
Nick Terrelldf8415c2016-12-31 21:08:24 -0800208}
209
Yann Collet71eafdd2016-02-12 02:31:57 +0100210
211/*-********************************************************
212* Dictionary training functions
213**********************************************************/
214static size_t DiB_findMaxMem(unsigned long long requiredMem)
215{
Yann Collet290aaa72016-05-30 21:18:52 +0200216 size_t const step = 8 MB;
Yann Collet71eafdd2016-02-12 02:31:57 +0100217 void* testmem = NULL;
218
219 requiredMem = (((requiredMem >> 23) + 1) << 23);
Yann Colletbcb5f772016-07-06 15:41:03 +0200220 requiredMem += step;
Yann Collet77c137b2017-09-14 15:12:57 -0700221 if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
Yann Collet71eafdd2016-02-12 02:31:57 +0100222
223 while (!testmem) {
Yann Collet71eafdd2016-02-12 02:31:57 +0100224 testmem = malloc((size_t)requiredMem);
Yann Colletbcb5f772016-07-06 15:41:03 +0200225 requiredMem -= step;
Yann Collet71eafdd2016-02-12 02:31:57 +0100226 }
227
228 free(testmem);
Yann Colletbcb5f772016-07-06 15:41:03 +0200229 return (size_t)requiredMem;
Yann Collet71eafdd2016-02-12 02:31:57 +0100230}
231
232
/* DiB_fillNoise() :
 * Fill `length` bytes of `buffer` with a deterministic pseudo-random byte sequence.
 * The sequence is always the same : it starts from a fixed state,
 * which is multiplied by a constant before producing each byte. */
static void DiB_fillNoise(void* buffer, size_t length)
{
    unsigned const prime1 = 2654435761U;
    unsigned const prime2 = 2246822519U;
    unsigned char* const out = (unsigned char*)buffer;
    unsigned state = prime1;
    size_t pos = 0;

    while (pos < length) {
        state *= prime2;
        out[pos++] = (unsigned char)(state >> 21);
    }
}
245
246
/* DiB_saveDict() :
 * Write the `buffSize` bytes of `buff` into file `dictFileName` (created or truncated).
 * Any failure (open, write, close) terminates the program through EXM_THROW(),
 * with exit code 3, 4 or 5 respectively. */
static void DiB_saveDict(const char* dictFileName,
                         const void* buff, size_t buffSize)
{
    FILE* const out = fopen(dictFileName, "wb");
    if (out == NULL) {
        EXM_THROW(3, "cannot open %s ", dictFileName);
    }

    if (fwrite(buff, 1, buffSize, out) != buffSize) {
        EXM_THROW(4, "%s : write error", dictFileName);
    }

    /* fclose() flushes pending data : its result must be checked too */
    if (fclose(out) != 0) {
        EXM_THROW(5, "%s : flush error", dictFileName);
    }
}
259
Yann Collet086b9592017-09-14 16:45:10 -0700260typedef struct {
stanjo7452598d52021-10-04 17:47:52 -0700261 S64 totalSizeToLoad;
262 int nbSamples;
263 int oneSampleTooLarge;
Yann Collet086b9592017-09-14 16:45:10 -0700264} fileStats;
265
Yann Colletc68d17f2017-09-15 15:31:31 -0700266/*! DiB_fileStats() :
267 * Given a list of files, and a chunkSize (0 == no chunk, whole files)
268 * provides the amount of data to be loaded and the resulting nb of samples.
269 * This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
270 */
stanjo7452598d52021-10-04 17:47:52 -0700271static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t chunkSize, int displayLevel)
Yann Collet1496c3d2016-12-18 11:58:23 +0100272{
Yann Collet086b9592017-09-14 16:45:10 -0700273 fileStats fs;
stanjo7452598d52021-10-04 17:47:52 -0700274 int n;
Yann Collet086b9592017-09-14 16:45:10 -0700275 memset(&fs, 0, sizeof(fs));
stanjo7452598d52021-10-04 17:47:52 -0700276
Yann Colletbcfb7ad2023-01-12 19:00:27 -0800277 /* We assume that if chunking is requested, the chunk size is < SAMPLESIZE_MAX */
stanjo7452598d52021-10-04 17:47:52 -0700278 assert( chunkSize <= SAMPLESIZE_MAX );
279
Yann Collet1496c3d2016-12-18 11:58:23 +0100280 for (n=0; n<nbFiles; n++) {
stanjo7452598d52021-10-04 17:47:52 -0700281 S64 const fileSize = DiB_getFileSize(fileNamesTable[n]);
Yann Colletbcfb7ad2023-01-12 19:00:27 -0800282 /* TODO: is there a minimum sample size? What if the file is 1-byte? */
stanjo7452598d52021-10-04 17:47:52 -0700283 if (fileSize == 0) {
284 DISPLAYLEVEL(3, "Sample file '%s' has zero size, skipping...\n", fileNamesTable[n]);
285 continue;
286 }
287
288 /* the case where we are breaking up files in sample chunks */
Yann Colletbcfb7ad2023-01-12 19:00:27 -0800289 if (chunkSize > 0) {
290 /* TODO: is there a minimum sample size? Can we have a 1-byte sample? */
stanjo7452598d52021-10-04 17:47:52 -0700291 fs.nbSamples += (int)((fileSize + chunkSize-1) / chunkSize);
292 fs.totalSizeToLoad += fileSize;
293 }
294 else {
295 /* the case where one file is one sample */
296 if (fileSize > SAMPLESIZE_MAX) {
297 /* flag excessively large sample files */
298 fs.oneSampleTooLarge |= (fileSize > 2*SAMPLESIZE_MAX);
299
300 /* Limit to the first SAMPLESIZE_MAX (128kB) of the file */
301 DISPLAYLEVEL(3, "Sample file '%s' is too large, limiting to %d KB",
302 fileNamesTable[n], SAMPLESIZE_MAX / (1 KB));
303 }
304 fs.nbSamples += 1;
305 fs.totalSizeToLoad += MIN(fileSize, SAMPLESIZE_MAX);
306 }
Yann Collet1496c3d2016-12-18 11:58:23 +0100307 }
stanjo7452598d52021-10-04 17:47:52 -0700308 DISPLAYLEVEL(4, "Found training data %d files, %d KB, %d samples\n", nbFiles, (int)(fs.totalSizeToLoad / (1 KB)), fs.nbSamples);
Yann Collet086b9592017-09-14 16:45:10 -0700309 return fs;
Yann Collet1496c3d2016-12-18 11:58:23 +0100310}
311
stanjo7452598d52021-10-04 17:47:52 -0700312int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
313 const char** fileNamesTable, int nbFiles, size_t chunkSize,
Jennifer Liu9d6ed9d2018-08-23 12:06:20 -0700314 ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
Elliot Gorokhovsky71c0c072021-12-10 16:19:40 -0500315 ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
Yann Collet71eafdd2016-02-12 02:31:57 +0100316{
stanjo7452598d52021-10-04 17:47:52 -0700317 fileStats fs;
318 size_t* sampleSizes; /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
319 int nbSamplesLoaded; /* nb of samples effectively loaded in srcBuffer */
320 size_t loadedSize; /* total data loaded in srcBuffer for all samples */
321 void* srcBuffer /* contiguous buffer with training data/samples */;
Yann Collet290aaa72016-05-30 21:18:52 +0200322 void* const dictBuffer = malloc(maxDictSize);
Yann Collet71eafdd2016-02-12 02:31:57 +0100323 int result = 0;
324
stanjo7452598d52021-10-04 17:47:52 -0700325 int const displayLevel = params ? params->zParams.notificationLevel :
326 coverParams ? coverParams->zParams.notificationLevel :
327 fastCoverParams ? fastCoverParams->zParams.notificationLevel : 0;
328
329 /* Shuffle input files before we start assessing how much sample datA to load.
330 The purpose of the shuffle is to pick random samples when the sample
331 set is larger than what we can load in memory. */
332 DISPLAYLEVEL(3, "Shuffling input files\n");
333 DiB_shuffle(fileNamesTable, nbFiles);
334
335 /* Figure out how much sample data to load with how many samples */
336 fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
337
338 {
339 int const memMult = params ? MEMMULT :
340 coverParams ? COVER_MEMMULT:
341 FASTCOVER_MEMMULT;
342 size_t const maxMem = DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
343 /* Limit the size of the training data to the free memory */
344 /* Limit the size of the training data to 2GB */
Dimitris Apostolouebbd6752021-11-13 10:04:04 +0200345 /* TODO: there is opportunity to stop DiB_fileStats() early when the data limit is reached */
stanjo7452598d52021-10-04 17:47:52 -0700346 loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
Elliot Gorokhovsky71c0c072021-12-10 16:19:40 -0500347 if (memLimit != 0) {
348 DISPLAYLEVEL(2, "! Warning : setting manual memory limit for dictionary training data at %u MB \n",
349 (unsigned)(memLimit / (1 MB)));
350 loadedSize = (size_t)MIN(loadedSize, memLimit);
351 }
stanjo7452598d52021-10-04 17:47:52 -0700352 srcBuffer = malloc(loadedSize+NOISELENGTH);
353 sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
354 }
355
Yann Collet290aaa72016-05-30 21:18:52 +0200356 /* Checks */
Qiongsi Wub1bbb0e2022-07-29 15:21:59 -0400357 if ((fs.nbSamples && !sampleSizes) || (!srcBuffer) || (!dictBuffer))
Yann Collet77c137b2017-09-14 15:12:57 -0700358 EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
Yann Collet086b9592017-09-14 16:45:10 -0700359 if (fs.oneSampleTooLarge) {
360 DISPLAYLEVEL(2, "! Warning : some sample(s) are very large \n");
361 DISPLAYLEVEL(2, "! Note that dictionary is only useful for small samples. \n");
362 DISPLAYLEVEL(2, "! As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
Yann Collet1496c3d2016-12-18 11:58:23 +0100363 }
Yann Colletc68d17f2017-09-15 15:31:31 -0700364 if (fs.nbSamples < 5) {
Yann Collet49d105c2016-08-18 15:02:11 +0200365 DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n");
366 DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n");
Yann Collet17220552017-09-15 16:23:50 -0700367 DISPLAYLEVEL(2, "! Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
Yann Collet086b9592017-09-14 16:45:10 -0700368 EXM_THROW(14, "nb of samples too low"); /* we now clearly forbid this case */
369 }
stanjo7452598d52021-10-04 17:47:52 -0700370 if (fs.totalSizeToLoad < (S64)maxDictSize * 8) {
Yann Collet086b9592017-09-14 16:45:10 -0700371 DISPLAYLEVEL(2, "! Warning : data size of samples too small for target dictionary size \n");
372 DISPLAYLEVEL(2, "! Samples should be about 100x larger than target dictionary size \n");
Yann Colletdd25a272016-07-27 12:35:29 +0200373 }
Yann Collet290aaa72016-05-30 21:18:52 +0200374
Yann Collet71eafdd2016-02-12 02:31:57 +0100375 /* init */
stanjo7452598d52021-10-04 17:47:52 -0700376 if ((S64)loadedSize < fs.totalSizeToLoad)
377 DISPLAYLEVEL(1, "Training samples set too large (%u MB); training on %u MB only...\n",
378 (unsigned)(fs.totalSizeToLoad / (1 MB)),
379 (unsigned)(loadedSize / (1 MB)));
Yann Collet71eafdd2016-02-12 02:31:57 +0100380
Yann Collet71eafdd2016-02-12 02:31:57 +0100381 /* Load input buffer */
stanjo7452598d52021-10-04 17:47:52 -0700382 nbSamplesLoaded = DiB_loadFiles(
383 srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable,
384 nbFiles, chunkSize, displayLevel);
Yann Collet71eafdd2016-02-12 02:31:57 +0100385
Nick Terrell8d65f872022-01-30 12:16:16 -0800386 { size_t dictSize = ZSTD_error_GENERIC;
Nick Terrelldf8415c2016-12-31 21:08:24 -0800387 if (params) {
Yann Collet086b9592017-09-14 16:45:10 -0700388 DiB_fillNoise((char*)srcBuffer + loadedSize, NOISELENGTH); /* guard band, for end of buffer condition */
Yann Collet890d85b2021-01-06 16:19:42 -0800389 dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize,
stanjo7452598d52021-10-04 17:47:52 -0700390 srcBuffer, sampleSizes, nbSamplesLoaded,
Yann Collet890d85b2021-01-06 16:19:42 -0800391 *params);
Jennifer Liu9d6ed9d2018-08-23 12:06:20 -0700392 } else if (coverParams) {
393 if (optimize) {
394 dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
stanjo7452598d52021-10-04 17:47:52 -0700395 srcBuffer, sampleSizes, nbSamplesLoaded,
Jennifer Liu9d6ed9d2018-08-23 12:06:20 -0700396 coverParams);
397 if (!ZDICT_isError(dictSize)) {
398 unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
399 DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d,
400 coverParams->steps, splitPercentage);
401 }
402 } else {
403 dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
stanjo7452598d52021-10-04 17:47:52 -0700404 sampleSizes, nbSamplesLoaded, *coverParams);
Nick Terrelldf8415c2016-12-31 21:08:24 -0800405 }
Nick Terrell8d65f872022-01-30 12:16:16 -0800406 } else if (fastCoverParams != NULL) {
Jennifer Liu9d6ed9d2018-08-23 12:06:20 -0700407 if (optimize) {
408 dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
stanjo7452598d52021-10-04 17:47:52 -0700409 srcBuffer, sampleSizes, nbSamplesLoaded,
Jennifer Liu9d6ed9d2018-08-23 12:06:20 -0700410 fastCoverParams);
411 if (!ZDICT_isError(dictSize)) {
412 unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
413 DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastCoverParams->k,
414 fastCoverParams->d, fastCoverParams->f, fastCoverParams->steps, splitPercentage,
415 fastCoverParams->accel);
416 }
417 } else {
418 dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
stanjo7452598d52021-10-04 17:47:52 -0700419 sampleSizes, nbSamplesLoaded, *fastCoverParams);
Jennifer Liu9d6ed9d2018-08-23 12:06:20 -0700420 }
Nick Terrell8d65f872022-01-30 12:16:16 -0800421 } else {
422 assert(0 /* Impossible */);
Nick Terrelldf8415c2016-12-31 21:08:24 -0800423 }
Yann Collet290aaa72016-05-30 21:18:52 +0200424 if (ZDICT_isError(dictSize)) {
425 DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */
426 result = 1;
427 goto _cleanup;
428 }
429 /* save dict */
Yann Colletededcfc2018-12-21 16:19:44 -0800430 DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (unsigned)dictSize, dictFileName);
Yann Collet290aaa72016-05-30 21:18:52 +0200431 DiB_saveDict(dictFileName, dictBuffer, dictSize);
Yann Collet71eafdd2016-02-12 02:31:57 +0100432 }
433
Yann Collet71eafdd2016-02-12 02:31:57 +0100434 /* clean up */
435_cleanup:
436 free(srcBuffer);
Yann Colletc68d17f2017-09-15 15:31:31 -0700437 free(sampleSizes);
Yann Collet71eafdd2016-02-12 02:31:57 +0100438 free(dictBuffer);
Yann Collet71eafdd2016-02-12 02:31:57 +0100439 return result;
440}