Merge pull request #58 from bai/zstd-1.4.0

Update vendored zstd to 1.4.0
DataDog · Apr 17, 2019 · 809b919 · 809b919
2 parents 69c5189 + cb68b23
commit 809b919
Show file tree

Hide file tree

Showing 39 changed files with 2,313 additions and 1,446 deletions.
diff --git a/README.md b/README.md
@@ -2,8 +2,8 @@
 
 [C Zstd Homepage](https://github.com/Cyan4973/zstd)
 
-The current headers and C files are from *v1.3.8* (Commit
-[470344d](https://github.com/facebook/zstd/releases/tag/v1.3.8)).
+The current headers and C files are from *v1.4.0* (Commit
+[83b51e9](https://github.com/facebook/zstd/releases/tag/v1.4.0)).
 
 ## Usage
 

diff --git a/compiler.h b/compiler.h
@@ -40,7 +40,7 @@
 
 /**
  * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant
- * parameters. They must be inlined for the compiler to elimininate the constant
+ * parameters. They must be inlined for the compiler to eliminate the constant
  * branches.
  */
 #define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR

diff --git a/cover.c b/cover.c
@@ -391,7 +391,7 @@ static void COVER_group(COVER_ctx_t *ctx, const void *group,
  *
  *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
  *
- * Once the dmer d is in the dictionay we set F(d) = 0.
+ * Once the dmer d is in the dictionary we set F(d) = 0.
  */
 static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
                                            COVER_map_t *activeDmers, U32 begin,
@@ -435,7 +435,7 @@ static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
       U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
       activeSegment.begin += 1;
       *delDmerOcc -= 1;
-      /* If this is the last occurence of the dmer, subtract its score */
+      /* If this is the last occurrence of the dmer, subtract its score */
       if (*delDmerOcc == 0) {
         COVER_map_remove(activeDmers, delDmer);
         activeSegment.score -= freqs[delDmer];
@@ -627,6 +627,39 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
   return 1;
 }
 
+void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
+{
+  const double ratio = (double)nbDmers / maxDictSize;
+  if (ratio >= 10) {
+      return;
+  }
+  LOCALDISPLAYLEVEL(displayLevel, 1,
+                    "WARNING: The maximum dictionary size %u is too large "
+                    "compared to the source size %u! "
+                    "size(source)/size(dictionary) = %f, but it should be >= "
+                    "10! This may lead to a subpar dictionary! We recommend "
+                    "training on sources at least 10x, and up to 100x the "
+                    "size of the dictionary!\n", (U32)maxDictSize,
+                    (U32)nbDmers, ratio);
+}
+
+COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
+                                       U32 nbDmers, U32 k, U32 passes)
+{
+  const U32 minEpochSize = k * 10;
+  COVER_epoch_info_t epochs;
+  epochs.num = MAX(1, maxDictSize / k / passes);
+  epochs.size = nbDmers / epochs.num;
+  if (epochs.size >= minEpochSize) {
+      assert(epochs.size * epochs.num <= nbDmers);
+      return epochs;
+  }
+  epochs.size = MIN(minEpochSize, nbDmers);
+  epochs.num = nbDmers / epochs.size;
+  assert(epochs.size * epochs.num <= nbDmers);
+  return epochs;
+}
+
 /**
  * Given the prepared context build the dictionary.
  */
@@ -636,28 +669,34 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
                                     ZDICT_cover_params_t parameters) {
   BYTE *const dict = (BYTE *)dictBuffer;
   size_t tail = dictBufferCapacity;
-  /* Divide the data up into epochs of equal size.
-   * We will select at least one segment from each epoch.
-   */
-  const unsigned epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k / 4));
-  const unsigned epochSize = (U32)(ctx->suffixSize / epochs);
+  /* Divide the data into epochs. We will select one segment from each epoch. */
+  const COVER_epoch_info_t epochs = COVER_computeEpochs(
+      (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
+  const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
+  size_t zeroScoreRun = 0;
   size_t epoch;
   DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
-                epochs, epochSize);
+                (U32)epochs.num, (U32)epochs.size);
   /* Loop through the epochs until there are no more segments or the dictionary
    * is full.
    */
-  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
-    const U32 epochBegin = (U32)(epoch * epochSize);
-    const U32 epochEnd = epochBegin + epochSize;
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
+    const U32 epochBegin = (U32)(epoch * epochs.size);
+    const U32 epochEnd = epochBegin + epochs.size;
     size_t segmentSize;
     /* Select a segment */
     COVER_segment_t segment = COVER_selectSegment(
         ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
-    /* If the segment covers no dmers, then we are out of content */
+    /* If the segment covers no dmers, then we are out of content.
+     * There may be new content in other epochs, for continue for some time.
+     */
     if (segment.score == 0) {
-      break;
+      if (++zeroScoreRun >= maxZeroScoreRun) {
+          break;
+      }
+      continue;
     }
+    zeroScoreRun = 0;
     /* Trim the segment if necessary and if it is too small then we are done */
     segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
     if (segmentSize < parameters.d) {
@@ -706,6 +745,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
                       parameters.d, parameters.splitPoint)) {
     return ERROR(GENERIC);
   }
+  COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
   if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
     DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
     COVER_ctx_destroy(&ctx);
@@ -977,6 +1017,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
   unsigned k;
   COVER_best_t best;
   POOL_ctx *pool = NULL;
+  int warned = 0;
 
   /* Checks */
   if (splitPoint <= 0 || splitPoint > 1) {
@@ -1019,6 +1060,10 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
       POOL_free(pool);
       return ERROR(GENERIC);
     }
+    if (!warned) {
+      COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
+      warned = 1;
+    }
     /* Loop through k reusing the same context */
     for (k = kMinK; k <= kMaxK; k += kStepSize) {
       /* Prepare the arguments */

diff --git a/cover.h b/cover.h
@@ -38,6 +38,35 @@ typedef struct {
   U32 score;
 } COVER_segment_t;
 
+/**
+ *Number of epochs and size of each epoch.
+ */
+typedef struct {
+  U32 num;
+  U32 size;
+} COVER_epoch_info_t;
+
+/**
+ * Computes the number of epochs and the size of each epoch.
+ * We will make sure that each epoch gets at least 10 * k bytes.
+ *
+ * The COVER algorithms divide the data up into epochs of equal size and
+ * select one segment from each epoch.
+ *
+ * @param maxDictSize The maximum allowed dictionary size.
+ * @param nbDmers     The number of dmers we are training on.
+ * @param k           The parameter k (segment size).
+ * @param passes      The target number of passes over the dmer corpus.
+ *                    More passes means a better dictionary.
+ */
+COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
+                                       U32 k, U32 passes);
+
+/**
+ * Warns the user when their corpus is too small.
+ */
+void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
+
 /**
  *  Checks total compressed size of a dictionary
  */

diff --git a/fastcover.c b/fastcover.c
@@ -132,7 +132,7 @@ typedef struct {
  *
  *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
  *
- * Once the dmer with hash value d is in the dictionay we set F(d) = 0.
+ * Once the dmer with hash value d is in the dictionary we set F(d) = 0.
  */
 static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
                                               U32 *freqs, U32 begin, U32 end,
@@ -161,7 +161,7 @@ static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
     /* Get hash value of current dmer */
     const size_t idx = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d);
 
-    /* Add frequency of this index to score if this is the first occurence of index in active segment */
+    /* Add frequency of this index to score if this is the first occurrence of index in active segment */
     if (segmentFreqs[idx] == 0) {
       activeSegment.score += freqs[idx];
     }
@@ -386,29 +386,35 @@ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
 {
   BYTE *const dict = (BYTE *)dictBuffer;
   size_t tail = dictBufferCapacity;
-  /* Divide the data up into epochs of equal size.
-   * We will select at least one segment from each epoch.
-   */
-  const unsigned epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k));
-  const unsigned epochSize = (U32)(ctx->nbDmers / epochs);
+  /* Divide the data into epochs. We will select one segment from each epoch. */
+  const COVER_epoch_info_t epochs = COVER_computeEpochs(
+      (U32)dictBufferCapacity, (U32)ctx->nbDmers, parameters.k, 1);
+  const size_t maxZeroScoreRun = 10;
+  size_t zeroScoreRun = 0;
   size_t epoch;
   DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
-                epochs, epochSize);
+                (U32)epochs.num, (U32)epochs.size);
   /* Loop through the epochs until there are no more segments or the dictionary
    * is full.
    */
-  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
-    const U32 epochBegin = (U32)(epoch * epochSize);
-    const U32 epochEnd = epochBegin + epochSize;
+  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
+    const U32 epochBegin = (U32)(epoch * epochs.size);
+    const U32 epochEnd = epochBegin + epochs.size;
     size_t segmentSize;
     /* Select a segment */
     COVER_segment_t segment = FASTCOVER_selectSegment(
         ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);
 
-    /* If the segment covers no dmers, then we are out of content */
+    /* If the segment covers no dmers, then we are out of content.
+     * There may be new content in other epochs, for continue for some time.
+     */
     if (segment.score == 0) {
-      break;
+      if (++zeroScoreRun >= maxZeroScoreRun) {
+          break;
+      }
+      continue;
     }
+    zeroScoreRun = 0;
 
     /* Trim the segment if necessary and if it is too small then we are done */
     segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
@@ -564,6 +570,7 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
       DISPLAYLEVEL(1, "Failed to initialize context\n");
       return ERROR(GENERIC);
     }
+    COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, g_displayLevel);
     /* Build the dictionary */
     DISPLAYLEVEL(2, "Building dictionary\n");
     {
@@ -616,6 +623,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
     unsigned k;
     COVER_best_t best;
     POOL_ctx *pool = NULL;
+    int warned = 0;
     /* Checks */
     if (splitPoint <= 0 || splitPoint > 1) {
       LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
@@ -664,6 +672,10 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
         POOL_free(pool);
         return ERROR(GENERIC);
       }
+      if (!warned) {
+        COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, displayLevel);
+        warned = 1;
+      }
       /* Loop through k reusing the same context */
       for (k = kMinK; k <= kMaxK; k += kStepSize) {
         /* Prepare the arguments */

diff --git a/fse.h b/fse.h
@@ -358,7 +358,7 @@ size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size
 typedef enum {
    FSE_repeat_none,  /**< Cannot use the previous table */
    FSE_repeat_check, /**< Can use the previous table but it must be checked */
-   FSE_repeat_valid  /**< Can use the previous table and it is asumed to be valid */
+   FSE_repeat_valid  /**< Can use the previous table and it is assumed to be valid */
  } FSE_repeat;
 
 /* *****************************************

diff --git a/fse_compress.c b/fse_compress.c
@@ -129,9 +129,9 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
     {   U32 position = 0;
         U32 symbol;
         for (symbol=0; symbol<=maxSymbolValue; symbol++) {
-            int nbOccurences;
+            int nbOccurrences;
             int const freq = normalizedCounter[symbol];
-            for (nbOccurences=0; nbOccurences<freq; nbOccurences++) {
+            for (nbOccurrences=0; nbOccurrences<freq; nbOccurrences++) {
                 tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
                 position = (position + step) & tableMask;
                 while (position > highThreshold)

diff --git a/threading.c b/threading.c
@@ -14,8 +14,8 @@
  * This file will hold wrapper for systems, which do not support pthreads
  */
 
-/* create fake symbol to avoid empty trnaslation unit warning */
-int g_ZSTD_threading_useles_symbol;
+/* create fake symbol to avoid empty translation unit warning */
+int g_ZSTD_threading_useless_symbol;
 
 #if defined(ZSTD_MULTITHREAD) && defined(_WIN32)
 

diff --git a/xxhash.c b/xxhash.c
@@ -66,10 +66,10 @@
 /* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */
 
 /*!XXH_FORCE_NATIVE_FORMAT :
- * By default, xxHash library provides endian-independant Hash values, based on little-endian convention.
+ * By default, xxHash library provides endian-independent Hash values, based on little-endian convention.
  * Results are therefore identical for little-endian and big-endian CPU.
  * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
- * Should endian-independance be of no importance for your application, you may set the #define below to 1,
+ * Should endian-independence be of no importance for your application, you may set the #define below to 1,
  * to improve speed for Big-endian CPU.
  * This option has no impact on Little_Endian CPU.
  */

diff --git a/zdict.h b/zdict.h
@@ -46,7 +46,12 @@ extern "C" {
  *  The resulting dictionary will be saved into `dictBuffer`.
  * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
  *          or an error code, which can be tested with ZDICT_isError().
- *  Note: ZDICT_trainFromBuffer() requires about 9 bytes of memory for each input byte.
+ *  Note:  Dictionary training will fail if there are not enough samples to construct a
+ *         dictionary, or if most of the samples are too small (< 8 bytes being the lower limit).
+ *         If dictionary training fails, you should use zstd without a dictionary, as the dictionary
+ *         would've been ineffective anyways. If you believe your samples would benefit from a dictionary
+ *         please open an issue with details, and we can look into it.
+ *  Note: ZDICT_trainFromBuffer()'s memory usage is about 6 MB.
  *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
  *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
  *        In general, it's recommended to provide a few thousands samples, though this can vary a lot.
@@ -110,6 +115,7 @@ typedef struct {
  *  The resulting dictionary will be saved into `dictBuffer`.
  * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
  *          or an error code, which can be tested with ZDICT_isError().
+ *          See ZDICT_trainFromBuffer() for details on failure modes.
  *  Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte.
  *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
  *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
@@ -133,8 +139,9 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
  * If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [50, 2000].
  *
  * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
- *           or an error code, which can be tested with ZDICT_isError().
- *           On success `*parameters` contains the parameters selected.
+ *          or an error code, which can be tested with ZDICT_isError().
+ *          On success `*parameters` contains the parameters selected.
+ *          See ZDICT_trainFromBuffer() for details on failure modes.
  * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread.
  */
 ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
@@ -151,7 +158,8 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
  *  The resulting dictionary will be saved into `dictBuffer`.
  * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
  *          or an error code, which can be tested with ZDICT_isError().
- *  Note: ZDICT_trainFromBuffer_fastCover() requires about 1 bytes of memory for each input byte and additionally another 6 * 2^f bytes of memory .
+ *          See ZDICT_trainFromBuffer() for details on failure modes.
+ *  Note: ZDICT_trainFromBuffer_fastCover() requires 6 * 2^f bytes of memory.
  *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
  *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
  *        In general, it's recommended to provide a few thousands samples, though this can vary a lot.
@@ -175,9 +183,10 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(void *dictBuffer,
  * If accel is zero, default value of 1 is used.
  *
  * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
- *           or an error code, which can be tested with ZDICT_isError().
- *           On success `*parameters` contains the parameters selected.
- * Note: ZDICT_optimizeTrainFromBuffer_fastCover() requires about 1 byte of memory for each input byte and additionally another 6 * 2^f bytes of memory for each thread.
+ *          or an error code, which can be tested with ZDICT_isError().
+ *          On success `*parameters` contains the parameters selected.
+ *          See ZDICT_trainFromBuffer() for details on failure modes.
+ * Note: ZDICT_optimizeTrainFromBuffer_fastCover() requires about 6 * 2^f bytes of memory for each thread.
  */
 ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer,
                     size_t dictBufferCapacity, const void* samplesBuffer,
@@ -195,7 +204,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer,
  * maxDictSize must be >= dictContentSize, and must be >= ZDICT_DICTSIZE_MIN bytes.
  *
  * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`),
- *           or an error code, which can be tested by ZDICT_isError().
+ *          or an error code, which can be tested by ZDICT_isError().
  * Note: ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0.
  * Note 2: dictBuffer and dictContent can overlap
  */
@@ -219,6 +228,7 @@ typedef struct {
  * `parameters` is optional and can be provided with values set to 0 to mean "default".
  * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
  *          or an error code, which can be tested with ZDICT_isError().
+ *          See ZDICT_trainFromBuffer() for details on failure modes.
  *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
  *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
  *        In general, it's recommended to provide a few thousands samples, though this can vary a lot.