minor comment refactor

commit: 77c137b3ae4d3c961952acf659b4fa515dffb1db [log] [tgz]
author: Yann Collet <cyan@fb.com> Thu Sep 14 15:12:57 2017 -0700
committer: Yann Collet <cyan@fb.com> Thu Sep 14 15:12:57 2017 -0700
tree: d53f7c4d491a04ec306a518904f68da6c4afe95c
parent: 335780c427e365e1c3c7e7ae96acbf79f9b36f19 [diff] [blame]
diff --git a/programs/dibio.c b/programs/dibio.c
index ffc7845..79f2729 100644
--- a/programs/dibio.c
+++ b/programs/dibio.c

@@ -14,7 +14,7 @@
 *  Compiler Warnings
 ****************************************/
 #ifdef _MSC_VER
-#  pragma warning(disable : 4127)                /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4127)    /* disable: C4127: conditional expression is constant */
 #endif
 
 
@@ -44,7 +44,7 @@
 #define SAMPLESIZE_MAX (128 KB)
 #define MEMMULT 11    /* rough estimation : memory cost to analyze 1 byte of sample */
 #define COVER_MEMMULT 9    /* rough estimation : memory cost to analyze 1 byte of sample */
-static const size_t maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
+static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
 
 #define NOISELENGTH 32
 
@@ -98,7 +98,9 @@
 *  File related operations
 **********************************************************/
 /** DiB_loadFiles() :
-*   @return : nb of files effectively loaded into `buffer` */
+ *  load files listed in fileNamesTable into buffer, even if buffer is too small.
+ * @return : nb of files effectively loaded into `buffer`
+ * *bufferSizePtr is modified, it provides the amount of data loaded within buffer */
 static unsigned DiB_loadFiles(void* buffer, size_t* bufferSizePtr,
                               size_t* fileSizes,
                               const char** fileNamesTable, unsigned nbFiles)
@@ -139,16 +141,19 @@
     return rand32 >> 5;
 }
 
+/* DiB_shuffle() :
+ * shuffle a table of file names in a pseudo-random but reproducible way (fixed seed).
+ * It improves dictionary quality by reducing "locality" impact : when the sample set is very large,
+ * elements get picked across the whole set, instead of just the first ones. Requires nbFiles >= 1. */
 static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
-  /* Initialize the pseudorandom number generator */
-  U32 seed = 0xFD2FB528;
-  unsigned i;
-  for (i = nbFiles - 1; i > 0; --i) {
-    unsigned const j = DiB_rand(&seed) % (i + 1);
-    const char* tmp = fileNamesTable[j];
-    fileNamesTable[j] = fileNamesTable[i];
-    fileNamesTable[i] = tmp;
-  }
+    U32 seed = 0xFD2FB528;
+    unsigned i;
+    for (i = nbFiles - 1; i > 0; --i) {
+        unsigned const j = DiB_rand(&seed) % (i + 1);
+        const char* const tmp = fileNamesTable[j];
+        fileNamesTable[j] = fileNamesTable[i];
+        fileNamesTable[i] = tmp;
+    }
 }
 
 
@@ -162,7 +167,7 @@
 
     requiredMem = (((requiredMem >> 23) + 1) << 23);
     requiredMem += step;
-    if (requiredMem > maxMemory) requiredMem = maxMemory;
+    if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
 
     while (!testmem) {
         testmem = malloc((size_t)requiredMem);
@@ -203,7 +208,7 @@
 
 
 static int g_tooLargeSamples = 0;
-static U64 DiB_getTotalCappedFileSize(const char** fileNamesTable, unsigned nbFiles)
+static U64 DiB_totalCappedFileSize(const char** fileNamesTable, unsigned nbFiles)
 {
     U64 total = 0;
     unsigned n;
@@ -236,7 +241,7 @@
 {
     void* const dictBuffer = malloc(maxDictSize);
     size_t* const fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
-    unsigned long long const totalSizeToLoad = DiB_getTotalCappedFileSize(fileNamesTable, nbFiles);
+    unsigned long long const totalSizeToLoad = DiB_totalCappedFileSize(fileNamesTable, nbFiles);
     size_t const memMult = params ? MEMMULT : COVER_MEMMULT;
     size_t const maxMem =  DiB_findMaxMem(totalSizeToLoad * memMult) / memMult;
     size_t benchedSize = (size_t) MIN ((unsigned long long)maxMem, totalSizeToLoad);
@@ -246,8 +251,9 @@
     /* Checks */
     if (params) g_displayLevel = params->zParams.notificationLevel;
     else if (coverParams) g_displayLevel = coverParams->zParams.notificationLevel;
-    else EXM_THROW(13, "Neither dictionary algorith selected");   /* should not happen */
-    if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
+    else EXM_THROW(13, "Neither dictionary algorithm selected");   /* should not happen */
+    if ((!fileSizes) || (!srcBuffer) || (!dictBuffer))
+        EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
     if (g_tooLargeSamples) {
         DISPLAYLEVEL(2, "!  Warning : some samples are very large \n");
         DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small files or beginning of large files. \n");
@@ -270,8 +276,7 @@
     DiB_shuffle(fileNamesTable, nbFiles);
     nbFiles = DiB_loadFiles(srcBuffer, &benchedSize, fileSizes, fileNamesTable, nbFiles);
 
-    {
-        size_t dictSize;
+    {   size_t dictSize;
         if (params) {
             DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH);   /* guard band, for end of buffer condition */
             dictSize = ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, maxDictSize,
@@ -285,9 +290,8 @@
                 DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\n", coverParams->k, coverParams->d, coverParams->steps);
             }
         } else {
-            dictSize =
-                ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
-                                            fileSizes, nbFiles, *coverParams);
+            dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
+                                                   fileSizes, nbFiles, *coverParams);
         }
         if (ZDICT_isError(dictSize)) {
             DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
commit	77c137b3ae4d3c961952acf659b4fa515dffb1db	[log] [tgz]
author	Yann Collet <cyan@fb.com>	Thu Sep 14 15:12:57 2017 -0700
committer	Yann Collet <cyan@fb.com>	Thu Sep 14 15:12:57 2017 -0700
tree	d53f7c4d491a04ec306a518904f68da6c4afe95c
parent	335780c427e365e1c3c7e7ae96acbf79f9b36f19 [diff] [blame]