sys/contrib/zstd/tests/decodecorpus.c

   1 /*
   2  * Copyright (c) 2017-present, Yann Collet, Facebook, Inc.
   3  * All rights reserved.
   4  *
   5  * This source code is licensed under both the BSD-style license (found in the
   6  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
   7  * in the COPYING file in the root directory of this source tree).
   8  * You may select, at your option, one of the above-listed licenses.
   9  */
  10
  11 #include <limits.h>
  12 #include <math.h>
  13 #include <stddef.h>
  14 #include <stdio.h>
  15 #include <stdlib.h>
  16 #include <string.h>
  17
  18 #include "util.h"
  19 #include "zstd.h"
  20 #include "zstd_internal.h"
  21 #include "mem.h"
  22 #define ZDICT_STATIC_LINKING_ONLY
  23 #include "zdict.h"
  24
  25 // Direct access to internal compression functions is required
  26 #include "zstd_compress.c"
  27
  28 #define XXH_STATIC_LINKING_ONLY
  29 #include "xxhash.h"     /* XXH64 */
  30
  31 #ifndef MIN
  32     #define MIN(a, b) ((a) < (b) ? (a) : (b))
  33 #endif
  34
  35 #ifndef MAX_PATH
  36     #ifdef PATH_MAX
  37         #define MAX_PATH PATH_MAX
  38     #else
  39         #define MAX_PATH 256
  40     #endif
  41 #endif
  42
  43 /*-************************************
  44 *  DISPLAY Macros
  45 **************************************/
  46 #define DISPLAY(...)          fprintf(stderr, __VA_ARGS__)
  47 #define DISPLAYLEVEL(l, ...)  if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
  48 static U32 g_displayLevel = 2;
  49
  50 #define DISPLAYUPDATE(...)                                                     \
  51     do {                                                                       \
  52         if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) ||           \
  53             (g_displayLevel >= 4)) {                                           \
  54             g_displayClock = UTIL_getTime();                                   \
  55             DISPLAY(__VA_ARGS__);                                              \
  56             if (g_displayLevel >= 4) fflush(stderr);                           \
  57         }                                                                      \
  58     } while (0)
  59
  60 static const U64 g_refreshRate = SEC_TO_MICRO / 6;
  61 static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
  62
  63 #define CHECKERR(code)                                                         \
  64     do {                                                                       \
  65         if (ZSTD_isError(code)) {                                              \
  66             DISPLAY("Error occurred while generating data: %s\n",              \
  67                     ZSTD_getErrorName(code));                                  \
  68             exit(1);                                                           \
  69         }                                                                      \
  70     } while (0)
  71
  72 /*-*******************************************************
  73 *  Random function
  74 *********************************************************/
  75 static unsigned RAND(unsigned* src)
  76 {
  77 #define RAND_rotl32(x,r) ((x << r) | (x >> (32 - r)))
  78     static const U32 prime1 = 2654435761U;
  79     static const U32 prime2 = 2246822519U;
  80     U32 rand32 = *src;
  81     rand32 *= prime1;
  82     rand32 += prime2;
  83     rand32  = RAND_rotl32(rand32, 13);
  84     *src = rand32;
  85     return RAND_rotl32(rand32, 27);
  86 #undef RAND_rotl32
  87 }
  88
  89 #define DISTSIZE (8192)
  90
  91 /* Write `size` bytes into `ptr`, all of which are less than or equal to `maxSymb` */
  92 static void RAND_bufferMaxSymb(U32* seed, void* ptr, size_t size, int maxSymb)
  93 {
  94     size_t i;
  95     BYTE* op = ptr;
  96
  97     for (i = 0; i < size; i++) {
  98         op[i] = (BYTE) (RAND(seed) % (maxSymb + 1));
  99     }
 100 }
 101
 102 /* Write `size` random bytes into `ptr` */
 103 static void RAND_buffer(U32* seed, void* ptr, size_t size)
 104 {
 105     size_t i;
 106     BYTE* op = ptr;
 107
 108     for (i = 0; i + 4 <= size; i += 4) {
 109         MEM_writeLE32(op + i, RAND(seed));
 110     }
 111     for (; i < size; i++) {
 112         op[i] = RAND(seed) & 0xff;
 113     }
 114 }
 115
 116 /* Write `size` bytes into `ptr` following the distribution `dist` */
 117 static void RAND_bufferDist(U32* seed, BYTE* dist, void* ptr, size_t size)
 118 {
 119     size_t i;
 120     BYTE* op = ptr;
 121
 122     for (i = 0; i < size; i++) {
 123         op[i] = dist[RAND(seed) % DISTSIZE];
 124     }
 125 }
 126
 127 /* Generate a random distribution where the frequency of each symbol follows a
 128  * geometric distribution defined by `weight`
 129  * `dist` should have size at least `DISTSIZE` */
 130 static void RAND_genDist(U32* seed, BYTE* dist, double weight)
 131 {
 132     size_t i = 0;
 133     size_t statesLeft = DISTSIZE;
 134     BYTE symb = (BYTE) (RAND(seed) % 256);
 135     BYTE step = (BYTE) ((RAND(seed) % 256) | 1); /* force it to be odd so it's relatively prime to 256 */
 136
 137     while (i < DISTSIZE) {
 138         size_t states = ((size_t)(weight * statesLeft)) + 1;
 139         size_t j;
 140         for (j = 0; j < states && i < DISTSIZE; j++, i++) {
 141             dist[i] = symb;
 142         }
 143
 144         symb += step;
 145         statesLeft -= states;
 146     }
 147 }
 148
 149 /* Generates a random number in the range [min, max) */
 150 static inline U32 RAND_range(U32* seed, U32 min, U32 max)
 151 {
 152     return (RAND(seed) % (max-min)) + min;
 153 }
 154
 155 #define ROUND(x) ((U32)(x + 0.5))
 156
 157 /* Generates a random number in an exponential distribution with mean `mean` */
 158 static double RAND_exp(U32* seed, double mean)
 159 {
 160     double const u = RAND(seed) / (double) UINT_MAX;
 161     return log(1-u) * (-mean);
 162 }
 163
 164 /*-*******************************************************
 165 *  Constants and Structs
 166 *********************************************************/
 167 const char *BLOCK_TYPES[] = {"raw", "rle", "compressed"};
 168
 169 #define MAX_DECOMPRESSED_SIZE_LOG 20
 170 #define MAX_DECOMPRESSED_SIZE (1ULL << MAX_DECOMPRESSED_SIZE_LOG)
 171
 172 #define MAX_WINDOW_LOG 22 /* Recommended support is 8MB, so limit to 4MB + mantissa */
 173
 174 #define MIN_SEQ_LEN (3)
 175 #define MAX_NB_SEQ ((ZSTD_BLOCKSIZE_MAX + MIN_SEQ_LEN - 1) / MIN_SEQ_LEN)
 176
 177 BYTE CONTENT_BUFFER[MAX_DECOMPRESSED_SIZE];
 178 BYTE FRAME_BUFFER[MAX_DECOMPRESSED_SIZE * 2];
 179 BYTE LITERAL_BUFFER[ZSTD_BLOCKSIZE_MAX];
 180
 181 seqDef SEQUENCE_BUFFER[MAX_NB_SEQ];
 182 BYTE SEQUENCE_LITERAL_BUFFER[ZSTD_BLOCKSIZE_MAX]; /* storeSeq expects a place to copy literals to */
 183 BYTE SEQUENCE_LLCODE[ZSTD_BLOCKSIZE_MAX];
 184 BYTE SEQUENCE_MLCODE[ZSTD_BLOCKSIZE_MAX];
 185 BYTE SEQUENCE_OFCODE[ZSTD_BLOCKSIZE_MAX];
 186
 187 unsigned WKSP[1024];
 188
 189 typedef struct {
 190     size_t contentSize; /* 0 means unknown (unless contentSize == windowSize == 0) */
 191     unsigned windowSize; /* contentSize >= windowSize means single segment */
 192 } frameHeader_t;
 193
 194 /* For repeat modes */
 195 typedef struct {
 196     U32 rep[ZSTD_REP_NUM];
 197
 198     int hufInit;
 199     /* the distribution used in the previous block for repeat mode */
 200     BYTE hufDist[DISTSIZE];
 201     U32 hufTable [256]; /* HUF_CElt is an incomplete type */
 202
 203     int fseInit;
 204     FSE_CTable offcodeCTable  [FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
 205     FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
 206     FSE_CTable litlengthCTable  [FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
 207
 208     /* Symbols that were present in the previous distribution, for use with
 209      * set_repeat */
 210     BYTE litlengthSymbolSet[36];
 211     BYTE offsetSymbolSet[29];
 212     BYTE matchlengthSymbolSet[53];
 213 } cblockStats_t;
 214
 215 typedef struct {
 216     void* data;
 217     void* dataStart;
 218     void* dataEnd;
 219
 220     void* src;
 221     void* srcStart;
 222     void* srcEnd;
 223
 224     frameHeader_t header;
 225
 226     cblockStats_t stats;
 227     cblockStats_t oldStats; /* so they can be rolled back if uncompressible */
 228 } frame_t;
 229
 230 typedef struct {
 231     int useDict;
 232     U32 dictID;
 233     size_t dictContentSize;
 234     BYTE* dictContent;
 235 } dictInfo;
 236
 237 typedef enum {
 238   gt_frame = 0,  /* generate frames */
 239   gt_block,      /* generate compressed blocks without block/frame headers */
 240 } genType_e;
 241
 242 /*-*******************************************************
 243 *  Global variables (set from command line)
 244 *********************************************************/
 245 U32 g_maxDecompressedSizeLog = MAX_DECOMPRESSED_SIZE_LOG;  /* <= 20 */
 246 U32 g_maxBlockSize = ZSTD_BLOCKSIZE_MAX;                       /* <= 128 KB */
 247
 248 /*-*******************************************************
 249 *  Generator Functions
 250 *********************************************************/
 251
 252 struct {
 253     int contentSize; /* force the content size to be present */
 254 } opts; /* advanced options on generation */
 255
 256 /* Generate and write a random frame header */
 257 static void writeFrameHeader(U32* seed, frame_t* frame, dictInfo info)
 258 {
 259     BYTE* const op = frame->data;
 260     size_t pos = 0;
 261     frameHeader_t fh;
 262
 263     BYTE windowByte = 0;
 264
 265     int singleSegment = 0;
 266     int contentSizeFlag = 0;
 267     int fcsCode = 0;
 268
 269     memset(&fh, 0, sizeof(fh));
 270
 271     /* generate window size */
 272     {
 273         /* Follow window algorithm from specification */
 274         int const exponent = RAND(seed) % (MAX_WINDOW_LOG - 10);
 275         int const mantissa = RAND(seed) % 8;
 276         windowByte = (BYTE) ((exponent << 3) | mantissa);
 277         fh.windowSize = (1U << (exponent + 10));
 278         fh.windowSize += fh.windowSize / 8 * mantissa;
 279     }
 280
 281     {
 282         /* Generate random content size */
 283         size_t highBit;
 284         if (RAND(seed) & 7 && g_maxDecompressedSizeLog > 7) {
 285             /* do content of at least 128 bytes */
 286             highBit = 1ULL << RAND_range(seed, 7, g_maxDecompressedSizeLog);
 287         } else if (RAND(seed) & 3) {
 288             /* do small content */
 289             highBit = 1ULL << RAND_range(seed, 0, MIN(7, 1U << g_maxDecompressedSizeLog));
 290         } else {
 291             /* 0 size frame */
 292             highBit = 0;
 293         }
 294         fh.contentSize = highBit ? highBit + (RAND(seed) % highBit) : 0;
 295
 296         /* provide size sometimes */
 297         contentSizeFlag = opts.contentSize | (RAND(seed) & 1);
 298
 299         if (contentSizeFlag && (fh.contentSize == 0 || !(RAND(seed) & 7))) {
 300             /* do single segment sometimes */
 301             fh.windowSize = (U32) fh.contentSize;
 302             singleSegment = 1;
 303         }
 304     }
 305
 306     if (contentSizeFlag) {
 307         /* Determine how large fcs field has to be */
 308         int minFcsCode = (fh.contentSize >= 256) +
 309                                (fh.contentSize >= 65536 + 256) +
 310                                (fh.contentSize > 0xFFFFFFFFU);
 311         if (!singleSegment && !minFcsCode) {
 312             minFcsCode = 1;
 313         }
 314         fcsCode = minFcsCode + (RAND(seed) % (4 - minFcsCode));
 315         if (fcsCode == 1 && fh.contentSize < 256) fcsCode++;
 316     }
 317
 318     /* write out the header */
 319     MEM_writeLE32(op + pos, ZSTD_MAGICNUMBER);
 320     pos += 4;
 321
 322     {
 323         /*
 324          * fcsCode: 2-bit flag specifying how many bytes used to represent Frame_Content_Size (bits 7-6)
 325          * singleSegment: 1-bit flag describing if data must be regenerated within a single continuous memory segment. (bit 5)
 326          * contentChecksumFlag: 1-bit flag that is set if frame includes checksum at the end -- set to 1 below (bit 2)
 327          * dictBits: 2-bit flag describing how many bytes Dictionary_ID uses -- set to 3 (bits 1-0)
 328          * For more information: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_header
 329          */
 330         int const dictBits = info.useDict ? 3 : 0;
 331         BYTE const frameHeaderDescriptor =
 332                 (BYTE) ((fcsCode << 6) | (singleSegment << 5) | (1 << 2) | dictBits);
 333         op[pos++] = frameHeaderDescriptor;
 334     }
 335
 336     if (!singleSegment) {
 337         op[pos++] = windowByte;
 338     }
 339     if (info.useDict) {
 340         MEM_writeLE32(op + pos, (U32) info.dictID);
 341         pos += 4;
 342     }
 343     if (contentSizeFlag) {
 344         switch (fcsCode) {
 345         default: /* Impossible */
 346         case 0: op[pos++] = (BYTE) fh.contentSize; break;
 347         case 1: MEM_writeLE16(op + pos, (U16) (fh.contentSize - 256)); pos += 2; break;
 348         case 2: MEM_writeLE32(op + pos, (U32) fh.contentSize); pos += 4; break;
 349         case 3: MEM_writeLE64(op + pos, (U64) fh.contentSize); pos += 8; break;
 350         }
 351     }
 352
 353     DISPLAYLEVEL(3, " frame content size:\t%u\n", (U32)fh.contentSize);
 354     DISPLAYLEVEL(3, " frame window size:\t%u\n", fh.windowSize);
 355     DISPLAYLEVEL(3, " content size flag:\t%d\n", contentSizeFlag);
 356     DISPLAYLEVEL(3, " single segment flag:\t%d\n", singleSegment);
 357
 358     frame->data = op + pos;
 359     frame->header = fh;
 360 }
 361
 362 /* Write a literal block in either raw or RLE form, return the literals size */
 363 static size_t writeLiteralsBlockSimple(U32* seed, frame_t* frame, size_t contentSize)
 364 {
 365     BYTE* op = (BYTE*)frame->data;
 366     int const type = RAND(seed) % 2;
 367     int const sizeFormatDesc = RAND(seed) % 8;
 368     size_t litSize;
 369     size_t maxLitSize = MIN(contentSize, g_maxBlockSize);
 370
 371     if (sizeFormatDesc == 0) {
 372         /* Size_FormatDesc = ?0 */
 373         maxLitSize = MIN(maxLitSize, 31);
 374     } else if (sizeFormatDesc <= 4) {
 375         /* Size_FormatDesc = 01 */
 376         maxLitSize = MIN(maxLitSize, 4095);
 377     } else {
 378         /* Size_Format = 11 */
 379         maxLitSize = MIN(maxLitSize, 1048575);
 380     }
 381
 382     litSize = RAND(seed) % (maxLitSize + 1);
 383     if (frame->src == frame->srcStart && litSize == 0) {
 384         litSize = 1; /* no empty literals if there's nothing preceding this block */
 385     }
 386     if (litSize + 3 > contentSize) {
 387         litSize = contentSize; /* no matches shorter than 3 are allowed */
 388     }
 389     /* use smallest size format that fits */
 390     if (litSize < 32) {
 391         op[0] = (type | (0 << 2) | (litSize << 3)) & 0xff;
 392         op += 1;
 393     } else if (litSize < 4096) {
 394         op[0] = (type | (1 << 2) | (litSize << 4)) & 0xff;
 395         op[1] = (litSize >> 4) & 0xff;
 396         op += 2;
 397     } else {
 398         op[0] = (type | (3 << 2) | (litSize << 4)) & 0xff;
 399         op[1] = (litSize >> 4) & 0xff;
 400         op[2] = (litSize >> 12) & 0xff;
 401         op += 3;
 402     }
 403
 404     if (type == 0) {
 405         /* Raw literals */
 406         DISPLAYLEVEL(4, "   raw literals\n");
 407
 408         RAND_buffer(seed, LITERAL_BUFFER, litSize);
 409         memcpy(op, LITERAL_BUFFER, litSize);
 410         op += litSize;
 411     } else {
 412         /* RLE literals */
 413         BYTE const symb = (BYTE) (RAND(seed) % 256);
 414
 415         DISPLAYLEVEL(4, "   rle literals: 0x%02x\n", (U32)symb);
 416
 417         memset(LITERAL_BUFFER, symb, litSize);
 418         op[0] = symb;
 419         op++;
 420     }
 421
 422     frame->data = op;
 423
 424     return litSize;
 425 }
 426
 427 /* Generate a Huffman header for the given source */
 428 static size_t writeHufHeader(U32* seed, HUF_CElt* hufTable, void* dst, size_t dstSize,
 429                                  const void* src, size_t srcSize)
 430 {
 431     BYTE* const ostart = (BYTE*)dst;
 432     BYTE* op = ostart;
 433
 434     unsigned huffLog = 11;
 435     U32 maxSymbolValue = 255;
 436
 437     U32 count[HUF_SYMBOLVALUE_MAX+1];
 438
 439     /* Scan input and build symbol stats */
 440     {   size_t const largest = FSE_count_wksp (count, &maxSymbolValue, (const BYTE*)src, srcSize, WKSP);
 441         if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 0; }   /* single symbol, rle */
 442         if (largest <= (srcSize >> 7)+1) return 0;   /* Fast heuristic : not compressible enough */
 443     }
 444
 445     /* Build Huffman Tree */
 446     /* Max Huffman log is 11, min is highbit(maxSymbolValue)+1 */
 447     huffLog = RAND_range(seed, ZSTD_highbit32(maxSymbolValue)+1, huffLog+1);
 448     DISPLAYLEVEL(6, "     huffman log: %u\n", huffLog);
 449     {   size_t const maxBits = HUF_buildCTable_wksp (hufTable, count, maxSymbolValue, huffLog, WKSP, sizeof(WKSP));
 450         CHECKERR(maxBits);
 451         huffLog = (U32)maxBits;
 452     }
 453
 454     /* Write table description header */
 455     {   size_t const hSize = HUF_writeCTable (op, dstSize, hufTable, maxSymbolValue, huffLog);
 456         if (hSize + 12 >= srcSize) return 0;   /* not useful to try compression */
 457         op += hSize;
 458     }
 459
 460     return op - ostart;
 461 }
 462
 463 /* Write a Huffman coded literals block and return the literals size */
 464 static size_t writeLiteralsBlockCompressed(U32* seed, frame_t* frame, size_t contentSize)
 465 {
 466     BYTE* origop = (BYTE*)frame->data;
 467     BYTE* opend = (BYTE*)frame->dataEnd;
 468     BYTE* op;
 469     BYTE* const ostart = origop;
 470     int const sizeFormat = RAND(seed) % 4;
 471     size_t litSize;
 472     size_t hufHeaderSize = 0;
 473     size_t compressedSize = 0;
 474     size_t maxLitSize = MIN(contentSize-3, g_maxBlockSize);
 475
 476     symbolEncodingType_e hType;
 477
 478     if (contentSize < 64) {
 479         /* make sure we get reasonably-sized literals for compression */
 480         return ERROR(GENERIC);
 481     }
 482
 483     DISPLAYLEVEL(4, "   compressed literals\n");
 484
 485     switch (sizeFormat) {
 486     case 0: /* fall through, size is the same as case 1 */
 487     case 1:
 488         maxLitSize = MIN(maxLitSize, 1023);
 489         origop += 3;
 490         break;
 491     case 2:
 492         maxLitSize = MIN(maxLitSize, 16383);
 493         origop += 4;
 494         break;
 495     case 3:
 496         maxLitSize = MIN(maxLitSize, 262143);
 497         origop += 5;
 498         break;
 499     default:; /* impossible */
 500     }
 501
 502     do {
 503         op = origop;
 504         do {
 505             litSize = RAND(seed) % (maxLitSize + 1);
 506         } while (litSize < 32); /* avoid small literal sizes */
 507         if (litSize + 3 > contentSize) {
 508             litSize = contentSize; /* no matches shorter than 3 are allowed */
 509         }
 510
 511         /* most of the time generate a new distribution */
 512         if ((RAND(seed) & 3) || !frame->stats.hufInit) {
 513             do {
 514                 if (RAND(seed) & 3) {
 515                     /* add 10 to ensure some compressability */
 516                     double const weight = ((RAND(seed) % 90) + 10) / 100.0;
 517
 518                     DISPLAYLEVEL(5, "    distribution weight: %d%%\n",
 519                                  (int)(weight * 100));
 520
 521                     RAND_genDist(seed, frame->stats.hufDist, weight);
 522                 } else {
 523                     /* sometimes do restricted range literals to force
 524                      * non-huffman headers */
 525                     DISPLAYLEVEL(5, "    small range literals\n");
 526                     RAND_bufferMaxSymb(seed, frame->stats.hufDist, DISTSIZE,
 527                                        15);
 528                 }
 529                 RAND_bufferDist(seed, frame->stats.hufDist, LITERAL_BUFFER,
 530                                 litSize);
 531
 532                 /* generate the header from the distribution instead of the
 533                  * actual data to avoid bugs with symbols that were in the
 534                  * distribution but never showed up in the output */
 535                 hufHeaderSize = writeHufHeader(
 536                         seed, (HUF_CElt*)frame->stats.hufTable, op, opend - op,
 537                         frame->stats.hufDist, DISTSIZE);
 538                 CHECKERR(hufHeaderSize);
 539                 /* repeat until a valid header is written */
 540             } while (hufHeaderSize == 0);
 541             op += hufHeaderSize;
 542             hType = set_compressed;
 543
 544             frame->stats.hufInit = 1;
 545         } else {
 546             /* repeat the distribution/table from last time */
 547             DISPLAYLEVEL(5, "    huffman repeat stats\n");
 548             RAND_bufferDist(seed, frame->stats.hufDist, LITERAL_BUFFER,
 549                             litSize);
 550             hufHeaderSize = 0;
 551             hType = set_repeat;
 552         }
 553
 554         do {
 555             compressedSize =
 556                     sizeFormat == 0
 557                             ? HUF_compress1X_usingCTable(
 558                                       op, opend - op, LITERAL_BUFFER, litSize,
 559                                       (HUF_CElt*)frame->stats.hufTable)
 560                             : HUF_compress4X_usingCTable(
 561                                       op, opend - op, LITERAL_BUFFER, litSize,
 562                                       (HUF_CElt*)frame->stats.hufTable);
 563             CHECKERR(compressedSize);
 564             /* this only occurs when it could not compress or similar */
 565         } while (compressedSize <= 0);
 566
 567         op += compressedSize;
 568
 569         compressedSize += hufHeaderSize;
 570         DISPLAYLEVEL(5, "    regenerated size: %u\n", (U32)litSize);
 571         DISPLAYLEVEL(5, "    compressed size: %u\n", (U32)compressedSize);
 572         if (compressedSize >= litSize) {
 573             DISPLAYLEVEL(5, "     trying again\n");
 574             /* if we have to try again, reset the stats so we don't accidentally
 575              * try to repeat a distribution we just made */
 576             frame->stats = frame->oldStats;
 577         } else {
 578             break;
 579         }
 580     } while (1);
 581
 582     /* write header */
 583     switch (sizeFormat) {
 584     case 0: /* fall through, size is the same as case 1 */
 585     case 1: {
 586         U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) |
 587                            ((U32)compressedSize << 14);
 588         MEM_writeLE24(ostart, header);
 589         break;
 590     }
 591     case 2: {
 592         U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) |
 593                            ((U32)compressedSize << 18);
 594         MEM_writeLE32(ostart, header);
 595         break;
 596     }
 597     case 3: {
 598         U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) |
 599                            ((U32)compressedSize << 22);
 600         MEM_writeLE32(ostart, header);
 601         ostart[4] = (BYTE)(compressedSize >> 10);
 602         break;
 603     }
 604     default:; /* impossible */
 605     }
 606
 607     frame->data = op;
 608     return litSize;
 609 }
 610
 611 static size_t writeLiteralsBlock(U32* seed, frame_t* frame, size_t contentSize)
 612 {
 613     /* only do compressed for larger segments to avoid compressibility issues */
 614     if (RAND(seed) & 7 && contentSize >= 64) {
 615         return writeLiteralsBlockCompressed(seed, frame, contentSize);
 616     } else {
 617         return writeLiteralsBlockSimple(seed, frame, contentSize);
 618     }
 619 }
 620
 621 static inline void initSeqStore(seqStore_t *seqStore) {
 622     seqStore->sequencesStart = SEQUENCE_BUFFER;
 623     seqStore->litStart = SEQUENCE_LITERAL_BUFFER;
 624     seqStore->llCode = SEQUENCE_LLCODE;
 625     seqStore->mlCode = SEQUENCE_MLCODE;
 626     seqStore->ofCode = SEQUENCE_OFCODE;
 627
 628     ZSTD_resetSeqStore(seqStore);
 629 }
 630
 631 /* Randomly generate sequence commands */
 632 static U32 generateSequences(U32* seed, frame_t* frame, seqStore_t* seqStore,
 633                                 size_t contentSize, size_t literalsSize, dictInfo info)
 634 {
 635     /* The total length of all the matches */
 636     size_t const remainingMatch = contentSize - literalsSize;
 637     size_t excessMatch = 0;
 638     U32 numSequences = 0;
 639
 640     U32 i;
 641
 642
 643     const BYTE* literals = LITERAL_BUFFER;
 644     BYTE* srcPtr = frame->src;
 645
 646     if (literalsSize != contentSize) {
 647         /* each match must be at least MIN_SEQ_LEN, so this is the maximum
 648          * number of sequences we can have */
 649         U32 const maxSequences = (U32)remainingMatch / MIN_SEQ_LEN;
 650         numSequences = (RAND(seed) % maxSequences) + 1;
 651
 652         /* the extra match lengths we have to allocate to each sequence */
 653         excessMatch = remainingMatch - numSequences * MIN_SEQ_LEN;
 654     }
 655
 656     DISPLAYLEVEL(5, "    total match lengths: %u\n", (U32)remainingMatch);
 657     for (i = 0; i < numSequences; i++) {
 658         /* Generate match and literal lengths by exponential distribution to
 659          * ensure nice numbers */
 660         U32 matchLen =
 661                 MIN_SEQ_LEN +
 662                 ROUND(RAND_exp(seed, excessMatch / (double)(numSequences - i)));
 663         U32 literalLen =
 664                 (RAND(seed) & 7)
 665                         ? ROUND(RAND_exp(seed,
 666                                          literalsSize /
 667                                                  (double)(numSequences - i)))
 668                         : 0;
 669         /* actual offset, code to send, and point to copy up to when shifting
 670          * codes in the repeat offsets history */
 671         U32 offset, offsetCode, repIndex;
 672
 673         /* bounds checks */
 674         matchLen = (U32) MIN(matchLen, excessMatch + MIN_SEQ_LEN);
 675         literalLen = MIN(literalLen, (U32) literalsSize);
 676         if (i == 0 && srcPtr == frame->srcStart && literalLen == 0) literalLen = 1;
 677         if (i + 1 == numSequences) matchLen = MIN_SEQ_LEN + (U32) excessMatch;
 678
 679         memcpy(srcPtr, literals, literalLen);
 680         srcPtr += literalLen;
 681         do {
 682             if (RAND(seed) & 7) {
 683                 /* do a normal offset */
 684                 U32 const dataDecompressed = (U32)((BYTE*)srcPtr-(BYTE*)frame->srcStart);
 685                 offset = (RAND(seed) %
 686                           MIN(frame->header.windowSize,
 687                               (size_t)((BYTE*)srcPtr - (BYTE*)frame->srcStart))) +
 688                          1;
 689                 if (info.useDict && (RAND(seed) & 1) && i + 1 != numSequences && dataDecompressed < frame->header.windowSize) {
 690                     /* need to occasionally generate offsets that go past the start */
 691                     /* including i+1 != numSequences because the last sequences has to adhere to predetermined contentSize */
 692                     U32 lenPastStart = (RAND(seed) % info.dictContentSize) + 1;
 693                     offset = (U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart)+lenPastStart;
 694                     if (offset > frame->header.windowSize) {
 695                         if (lenPastStart < MIN_SEQ_LEN) {
 696                             /* when offset > windowSize, matchLen bound by end of dictionary (lenPastStart) */
 697                             /* this also means that lenPastStart must be greater than MIN_SEQ_LEN */
 698                             /* make sure lenPastStart does not go past dictionary start though */
 699                             lenPastStart = MIN(lenPastStart+MIN_SEQ_LEN, (U32)info.dictContentSize);
 700                             offset = (U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart) + lenPastStart;
 701                         }
 702                         {
 703                             U32 const matchLenBound = MIN(frame->header.windowSize, lenPastStart);
 704                             matchLen = MIN(matchLen, matchLenBound);
 705                         }
 706                     }
 707                 }
 708                 offsetCode = offset + ZSTD_REP_MOVE;
 709                 repIndex = 2;
 710             } else {
 711                 /* do a repeat offset */
 712                 offsetCode = RAND(seed) % 3;
 713                 if (literalLen > 0) {
 714                     offset = frame->stats.rep[offsetCode];
 715                     repIndex = offsetCode;
 716                 } else {
 717                     /* special case */
 718                     offset = offsetCode == 2 ? frame->stats.rep[0] - 1
 719                                            : frame->stats.rep[offsetCode + 1];
 720                     repIndex = MIN(2, offsetCode + 1);
 721                 }
 722             }
 723         } while (((!info.useDict) && (offset > (size_t)((BYTE*)srcPtr - (BYTE*)frame->srcStart))) || offset == 0);
 724
 725         {
 726             size_t j;
 727             BYTE* const dictEnd = info.dictContent + info.dictContentSize;
 728             for (j = 0; j < matchLen; j++) {
 729                 if ((U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart) < offset) {
 730                     /* copy from dictionary instead of literals */
 731                     size_t const dictOffset = offset - (srcPtr - (BYTE*)frame->srcStart);
 732                     *srcPtr = *(dictEnd - dictOffset);
 733                 }
 734                 else {
 735                     *srcPtr = *(srcPtr-offset);
 736                 }
 737                 srcPtr++;
 738             }
 739         }
 740
 741         {   int r;
 742             for (r = repIndex; r > 0; r--) {
 743                 frame->stats.rep[r] = frame->stats.rep[r - 1];
 744             }
 745             frame->stats.rep[0] = offset;
 746         }
 747
 748         DISPLAYLEVEL(6, "      LL: %5u OF: %5u ML: %5u", literalLen, offset, matchLen);
 749         DISPLAYLEVEL(7, " srcPos: %8u seqNb: %3u",
 750                      (U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart), i);
 751         DISPLAYLEVEL(6, "\n");
 752         if (offsetCode < 3) {
 753             DISPLAYLEVEL(7, "        repeat offset: %d\n", repIndex);
 754         }
 755         /* use libzstd sequence handling */
 756         ZSTD_storeSeq(seqStore, literalLen, literals, offsetCode,
 757                       matchLen - MINMATCH);
 758
 759         literalsSize -= literalLen;
 760         excessMatch -= (matchLen - MIN_SEQ_LEN);
 761         literals += literalLen;
 762     }
 763
 764     memcpy(srcPtr, literals, literalsSize);
 765     srcPtr += literalsSize;
 766     DISPLAYLEVEL(6, "      excess literals: %5u", (U32)literalsSize);
 767     DISPLAYLEVEL(7, " srcPos: %8u", (U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart));
 768     DISPLAYLEVEL(6, "\n");
 769
 770     return numSequences;
 771 }
 772
 773 static void initSymbolSet(const BYTE* symbols, size_t len, BYTE* set, BYTE maxSymbolValue)
 774 {
 775     size_t i;
 776
 777     memset(set, 0, (size_t)maxSymbolValue+1);
 778
 779     for (i = 0; i < len; i++) {
 780         set[symbols[i]] = 1;
 781     }
 782 }
 783
 784 static int isSymbolSubset(const BYTE* symbols, size_t len, const BYTE* set, BYTE maxSymbolValue)
 785 {
 786     size_t i;
 787
 788     for (i = 0; i < len; i++) {
 789         if (symbols[i] > maxSymbolValue || !set[symbols[i]]) {
 790             return 0;
 791         }
 792     }
 793     return 1;
 794 }
 795
/* Serializes the sequences section of a compressed block at frame->data.
 *
 * Output layout, in order: the sequence count (1 to 3 bytes), one descriptor
 * byte holding the encoding type picked for literal lengths, offsets and
 * match lengths, whatever table description each pick requires (an RLE
 * symbol or a normalized-count header), and finally the bit stream produced
 * by the FSE encoder.  The choice between repeat / default / full table is
 * randomized through `seed`.
 *
 * seed        : RNG state, advanced by every RAND() call made here
 * frame       : supplies the output window (frame->data .. frame->dataEnd)
 *               and the stats that hold the CTables and symbol sets
 * seqStorePtr : sequences to encode, plus the code tables that
 *               ZSTD_seqToCodes() fills in
 * nbSeq       : number of sequences to write
 *
 * Returns 0 on success, after advancing frame->data past the written bytes.
 * The explicit error returns below hand back an ERROR() code and leave
 * frame->data unchanged. */
static size_t writeSequences(U32* seed, frame_t* frame, seqStore_t* seqStorePtr,
                             size_t nbSeq)
{
    /* This code is mostly copied from ZSTD_compressSequences in zstd_compress.c */
    U32 count[MaxSeq+1];
    S16 norm[MaxSeq+1];
    FSE_CTable* CTable_LitLength = frame->stats.litlengthCTable;
    FSE_CTable* CTable_OffsetBits = frame->stats.offcodeCTable;
    FSE_CTable* CTable_MatchLength = frame->stats.matchlengthCTable;
    U32 LLtype, Offtype, MLtype;   /* compressed, raw or rle */
    const seqDef* const sequences = seqStorePtr->sequencesStart;
    const BYTE* const ofCodeTable = seqStorePtr->ofCode;
    const BYTE* const llCodeTable = seqStorePtr->llCode;
    const BYTE* const mlCodeTable = seqStorePtr->mlCode;
    BYTE* const oend = (BYTE*)frame->dataEnd;
    BYTE* op = (BYTE*)frame->data;
    BYTE* seqHead;
    BYTE scratchBuffer[1<<MAX(MLFSELog,LLFSELog)];

    /* literals compressing block removed so that can be done separately */

    /* Sequences Header */
    /* the count takes 1, 2 or 3 bytes depending on its magnitude */
    if ((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead */) return ERROR(dstSize_tooSmall);
    if (nbSeq < 0x7F) *op++ = (BYTE)nbSeq;
    else if (nbSeq < LONGNBSEQ) op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2;
    else op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;

    /* seqHead : flags for FSE encoding type */
    /* the byte is reserved here and filled in once all three types are known */
    seqHead = op++;

    if (nbSeq==0) {
        /* nothing to encode: only the count and the (unwritten) descriptor byte are consumed */
        frame->data = op;

        return 0;
    }

    /* convert length/distances into codes */
    ZSTD_seqToCodes(seqStorePtr);

    /* CTable for Literal Lengths */
    {   U32 max = MaxLL;
        size_t const mostFrequent = FSE_countFast_wksp(count, &max, llCodeTable, nbSeq, WKSP);
        if (mostFrequent == nbSeq) {
            /* do RLE if we have the chance */
            *op++ = llCodeTable[0];
            FSE_buildCTable_rle(CTable_LitLength, (BYTE)max);
            LLtype = set_rle;
        } else if (frame->stats.fseInit && !(RAND(seed) & 3) &&
                   isSymbolSubset(llCodeTable, nbSeq,
                                  frame->stats.litlengthSymbolSet, 35)) {
            /* maybe do repeat mode if we're allowed to */
            /* only taken when every code used here was also recorded by the previous block */
            LLtype = set_repeat;
        } else if (!(RAND(seed) & 3)) {
            /* maybe use the default distribution */
            FSE_buildCTable_wksp(CTable_LitLength, LL_defaultNorm, MaxLL, LL_defaultNormLog, scratchBuffer, sizeof(scratchBuffer));
            LLtype = set_basic;
        } else {
            /* fall back on a full table */
            size_t nbSeq_1 = nbSeq;
            const U32 tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max);
            /* the last sequence's code is counted one time less when it occurs more than once */
            if (count[llCodeTable[nbSeq-1]]>1) { count[llCodeTable[nbSeq-1]]--; nbSeq_1--; }
            FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max);
            { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog);   /* overflow protected */
              if (FSE_isError(NCountSize)) return ERROR(GENERIC);
              op += NCountSize; }
            FSE_buildCTable_wksp(CTable_LitLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer));
            LLtype = set_compressed;
    }   }

    /* CTable for Offsets */
    /* see Literal Lengths for descriptions of mode choices */
    {   U32 max = MaxOff;
        size_t const mostFrequent = FSE_countFast_wksp(count, &max, ofCodeTable, nbSeq, WKSP);
        if (mostFrequent == nbSeq) {
            *op++ = ofCodeTable[0];
            FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max);
            Offtype = set_rle;
        } else if (frame->stats.fseInit && !(RAND(seed) & 3) &&
                   isSymbolSubset(ofCodeTable, nbSeq,
                                  frame->stats.offsetSymbolSet, 28)) {
            Offtype = set_repeat;
        } else if (!(RAND(seed) & 3)) {
            FSE_buildCTable_wksp(CTable_OffsetBits, OF_defaultNorm, DefaultMaxOff, OF_defaultNormLog, scratchBuffer, sizeof(scratchBuffer));
            Offtype = set_basic;
        } else {
            size_t nbSeq_1 = nbSeq;
            const U32 tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max);
            if (count[ofCodeTable[nbSeq-1]]>1) { count[ofCodeTable[nbSeq-1]]--; nbSeq_1--; }
            FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max);
            { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog);   /* overflow protected */
              if (FSE_isError(NCountSize)) return ERROR(GENERIC);
              op += NCountSize; }
            FSE_buildCTable_wksp(CTable_OffsetBits, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer));
            Offtype = set_compressed;
    }   }

    /* CTable for MatchLengths */
    /* see Literal Lengths for descriptions of mode choices */
    {   U32 max = MaxML;
        size_t const mostFrequent = FSE_countFast_wksp(count, &max, mlCodeTable, nbSeq, WKSP);
        if (mostFrequent == nbSeq) {
            *op++ = *mlCodeTable;
            FSE_buildCTable_rle(CTable_MatchLength, (BYTE)max);
            MLtype = set_rle;
        } else if (frame->stats.fseInit && !(RAND(seed) & 3) &&
                   isSymbolSubset(mlCodeTable, nbSeq,
                                  frame->stats.matchlengthSymbolSet, 52)) {
            MLtype = set_repeat;
        } else if (!(RAND(seed) & 3)) {
            /* sometimes do default distribution */
            FSE_buildCTable_wksp(CTable_MatchLength, ML_defaultNorm, MaxML, ML_defaultNormLog, scratchBuffer, sizeof(scratchBuffer));
            MLtype = set_basic;
        } else {
            /* fall back on table */
            size_t nbSeq_1 = nbSeq;
            const U32 tableLog = FSE_optimalTableLog(MLFSELog, nbSeq, max);
            if (count[mlCodeTable[nbSeq-1]]>1) { count[mlCodeTable[nbSeq-1]]--; nbSeq_1--; }
            FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max);
            { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog);   /* overflow protected */
              if (FSE_isError(NCountSize)) return ERROR(GENERIC);
              op += NCountSize; }
            FSE_buildCTable_wksp(CTable_MatchLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer));
            MLtype = set_compressed;
    }   }
    /* remember which codes this block used; the repeat-mode checks above read
     * these sets (and fseInit) when the next block is written */
    /* NOTE(review): 35 / 28 / 52 look like the max symbol values for LL / OF / ML
     * (MaxLL, MaxOff, MaxML) -- confirm against zstd_internal.h */
    frame->stats.fseInit = 1;
    initSymbolSet(llCodeTable, nbSeq, frame->stats.litlengthSymbolSet, 35);
    initSymbolSet(ofCodeTable, nbSeq, frame->stats.offsetSymbolSet, 28);
    initSymbolSet(mlCodeTable, nbSeq, frame->stats.matchlengthSymbolSet, 52);

    DISPLAYLEVEL(5, "    LL type: %d OF type: %d ML type: %d\n", LLtype, Offtype, MLtype);

    /* descriptor byte: LL type in bits 7-6, OF type in bits 5-4, ML type in bits 3-2 */
    *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2));

    /* Encoding Sequences */
    {   BIT_CStream_t blockStream;
        FSE_CState_t  stateMatchLength;
        FSE_CState_t  stateOffsetBits;
        FSE_CState_t  stateLitLength;

        CHECK_E(BIT_initCStream(&blockStream, op, oend-op), dstSize_tooSmall); /* not enough space remaining */

        /* first symbols */
        /* the last sequence seeds the three FSE states; only its extra bits are written here */
        FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]);
        FSE_initCState2(&stateOffsetBits,  CTable_OffsetBits,  ofCodeTable[nbSeq-1]);
        FSE_initCState2(&stateLitLength,   CTable_LitLength,   llCodeTable[nbSeq-1]);
        BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]);
        if (MEM_32bits()) BIT_flushBits(&blockStream);
        BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCodeTable[nbSeq-1]]);
        if (MEM_32bits()) BIT_flushBits(&blockStream);
        BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[nbSeq-1]);
        BIT_flushBits(&blockStream);

        /* remaining sequences are encoded from second-to-last down to the first;
         * n wraps past zero to a value >= nbSeq, which ends the loop */
        {   size_t n;
            for (n=nbSeq-2 ; n<nbSeq ; n--) {      /* intentional underflow */
                BYTE const llCode = llCodeTable[n];
                BYTE const ofCode = ofCodeTable[n];
                BYTE const mlCode = mlCodeTable[n];
                U32  const llBits = LL_bits[llCode];
                U32  const ofBits = ofCode;                                     /* 32b*/  /* 64b*/
                U32  const mlBits = ML_bits[mlCode];
                                                                                /* (7)*/  /* (7)*/
                FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode);       /* 15 */  /* 15 */
                FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode);      /* 24 */  /* 24 */
                if (MEM_32bits()) BIT_flushBits(&blockStream);                  /* (7)*/
                FSE_encodeSymbol(&blockStream, &stateLitLength, llCode);        /* 16 */  /* 33 */
                if (MEM_32bits() || (ofBits+mlBits+llBits >= 64-7-(LLFSELog+MLFSELog+OffFSELog)))
                    BIT_flushBits(&blockStream);                                /* (7)*/
                BIT_addBits(&blockStream, sequences[n].litLength, llBits);
                if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream);
                BIT_addBits(&blockStream, sequences[n].matchLength, mlBits);
                if (MEM_32bits()) BIT_flushBits(&blockStream);                  /* (7)*/
                BIT_addBits(&blockStream, sequences[n].offset, ofBits);         /* 31 */
                BIT_flushBits(&blockStream);                                    /* (7)*/
        }   }

        /* final FSE states are flushed in ML, OF, LL order */
        FSE_flushCState(&blockStream, &stateMatchLength);
        FSE_flushCState(&blockStream, &stateOffsetBits);
        FSE_flushCState(&blockStream, &stateLitLength);

        {   size_t const streamSize = BIT_closeCStream(&blockStream);
            if (streamSize==0) return ERROR(dstSize_tooSmall);   /* not enough space */
            op += streamSize;
    }   }

    /* publish the new write position only once everything succeeded */
    frame->data = op;

    return 0;
}
 984
 985 static size_t writeSequencesBlock(U32* seed, frame_t* frame, size_t contentSize,
 986                                   size_t literalsSize, dictInfo info)
 987 {
 988     seqStore_t seqStore;
 989     size_t numSequences;
 990
 991
 992     initSeqStore(&seqStore);
 993
 994     /* randomly generate sequences */
 995     numSequences = generateSequences(seed, frame, &seqStore, contentSize, literalsSize, info);
 996     /* write them out to the frame data */
 997     CHECKERR(writeSequences(seed, frame, &seqStore, numSequences));
 998
 999     return numSequences;
1000 }
1001
1002 static size_t writeCompressedBlock(U32* seed, frame_t* frame, size_t contentSize, dictInfo info)
1003 {
1004     BYTE* const blockStart = (BYTE*)frame->data;
1005     size_t literalsSize;
1006     size_t nbSeq;
1007
1008     DISPLAYLEVEL(4, "  compressed block:\n");
1009
1010     literalsSize = writeLiteralsBlock(seed, frame, contentSize);
1011
1012     DISPLAYLEVEL(4, "   literals size: %u\n", (U32)literalsSize);
1013
1014     nbSeq = writeSequencesBlock(seed, frame, contentSize, literalsSize, info);
1015
1016     DISPLAYLEVEL(4, "   number of sequences: %u\n", (U32)nbSeq);
1017
1018     return (BYTE*)frame->data - blockStart;
1019 }
1020
1021 static void writeBlock(U32* seed, frame_t* frame, size_t contentSize,
1022                        int lastBlock, dictInfo info)
1023 {
1024     int const blockTypeDesc = RAND(seed) % 8;
1025     size_t blockSize;
1026     int blockType;
1027
1028     BYTE *const header = (BYTE*)frame->data;
1029     BYTE *op = header + 3;
1030
1031     DISPLAYLEVEL(4, " block:\n");
1032     DISPLAYLEVEL(4, "  block content size: %u\n", (U32)contentSize);
1033     DISPLAYLEVEL(4, "  last block: %s\n", lastBlock ? "yes" : "no");
1034
1035     if (blockTypeDesc == 0) {
1036         /* Raw data frame */
1037
1038         RAND_buffer(seed, frame->src, contentSize);
1039         memcpy(op, frame->src, contentSize);
1040
1041         op += contentSize;
1042         blockType = 0;
1043         blockSize = contentSize;
1044     } else if (blockTypeDesc == 1) {
1045         /* RLE */
1046         BYTE const symbol = RAND(seed) & 0xff;
1047
1048         op[0] = symbol;
1049         memset(frame->src, symbol, contentSize);
1050
1051         op++;
1052         blockType = 1;
1053         blockSize = contentSize;
1054     } else {
1055         /* compressed, most common */
1056         size_t compressedSize;
1057         blockType = 2;
1058
1059         frame->oldStats = frame->stats;
1060
1061         frame->data = op;
1062         compressedSize = writeCompressedBlock(seed, frame, contentSize, info);
1063         if (compressedSize >= contentSize) {   /* compressed block must be strictly smaller than uncompressed one */
1064             blockType = 0;
1065             memcpy(op, frame->src, contentSize);
1066
1067             op += contentSize;
1068             blockSize = contentSize; /* fall back on raw block if data doesn't
1069                                         compress */
1070
1071             frame->stats = frame->oldStats; /* don't update the stats */
1072         } else {
1073             op += compressedSize;
1074             blockSize = compressedSize;
1075         }
1076     }
1077     frame->src = (BYTE*)frame->src + contentSize;
1078
1079     DISPLAYLEVEL(4, "  block type: %s\n", BLOCK_TYPES[blockType]);
1080     DISPLAYLEVEL(4, "  block size field: %u\n", (U32)blockSize);
1081
1082     header[0] = (BYTE) ((lastBlock | (blockType << 1) | (blockSize << 3)) & 0xff);
1083     MEM_writeLE16(header + 1, (U16) (blockSize >> 5));
1084
1085     frame->data = op;
1086 }
1087
1088 static void writeBlocks(U32* seed, frame_t* frame, dictInfo info)
1089 {
1090     size_t contentLeft = frame->header.contentSize;
1091     size_t const maxBlockSize = MIN(g_maxBlockSize, frame->header.windowSize);
1092     while (1) {
1093         /* 1 in 4 chance of ending frame */
1094         int const lastBlock = contentLeft > maxBlockSize ? 0 : !(RAND(seed) & 3);
1095         size_t blockContentSize;
1096         if (lastBlock) {
1097             blockContentSize = contentLeft;
1098         } else {
1099             if (contentLeft > 0 && (RAND(seed) & 7)) {
1100                 /* some variable size block */
1101                 blockContentSize = RAND(seed) % (MIN(maxBlockSize, contentLeft)+1);
1102             } else if (contentLeft > maxBlockSize && (RAND(seed) & 1)) {
1103                 /* some full size block */
1104                 blockContentSize = maxBlockSize;
1105             } else {
1106                 /* some empty block */
1107                 blockContentSize = 0;
1108             }
1109         }
1110
1111         writeBlock(seed, frame, blockContentSize, lastBlock, info);
1112
1113         contentLeft -= blockContentSize;
1114         if (lastBlock) break;
1115     }
1116 }
1117
1118 static void writeChecksum(frame_t* frame)
1119 {
1120     /* write checksum so implementations can verify their output */
1121     U64 digest = XXH64(frame->srcStart, (BYTE*)frame->src-(BYTE*)frame->srcStart, 0);
1122     DISPLAYLEVEL(3, "  checksum: %08x\n", (U32)digest);
1123     MEM_writeLE32(frame->data, (U32)digest);
1124     frame->data = (BYTE*)frame->data + 4;
1125 }
1126
/* Writes `size` bytes from `buf` to the file at `path`, or to stdout when
 * `path` is NULL.
 *
 * Any failure to open, write or close the destination prints a diagnostic on
 * stderr and terminates the process with exit(1).  The file is opened in
 * binary mode and truncated.  stdout is neither flushed nor closed here. */
static void outputBuffer(const void* buf, size_t size, const char* const path)
{
    const unsigned char* const bytes = (const unsigned char*)buf;
    /* name used in diagnostics; never hand a NULL pointer to %s */
    const char* const label = path ? path : "stdout";
    FILE* const out = path ? fopen(path, "wb") : stdout;
    size_t written = 0;

    if (!out) {
        fprintf(stderr, "Failed to open file at %s: ", label);
        perror(NULL);
        exit(1);
    }

    /* fwrite may write less than requested; keep going until done or error */
    while (written < size) {
        written += fwrite(bytes + written, 1, size - written, out);
        if (ferror(out)) {
            fprintf(stderr, "Failed to write to file at %s: ", label);
            perror(NULL);
            exit(1);
        }
    }

    /* fclose flushes buffered data, so a failure here means lost output */
    if (path && fclose(out) != 0) {
        fprintf(stderr, "Failed to close file at %s: ", label);
        perror(NULL);
        exit(1);
    }
}
1159
1160 static void initFrame(frame_t* fr)
1161 {
1162     memset(fr, 0, sizeof(*fr));
1163     fr->data = fr->dataStart = FRAME_BUFFER;
1164     fr->dataEnd = FRAME_BUFFER + sizeof(FRAME_BUFFER);
1165     fr->src = fr->srcStart = CONTENT_BUFFER;
1166     fr->srcEnd = CONTENT_BUFFER + sizeof(CONTENT_BUFFER);
1167
1168     /* init repeat codes */
1169     fr->stats.rep[0] = 1;
1170     fr->stats.rep[1] = 4;
1171     fr->stats.rep[2] = 8;
1172 }
1173
1174 /**
1175  * Generated a single zstd compressed block with no block/frame header.
1176  * Returns the final seed.
1177  */
1178 static U32 generateCompressedBlock(U32 seed, frame_t* frame, dictInfo info)
1179 {
1180     size_t blockContentSize;
1181     int blockWritten = 0;
1182     BYTE* op;
1183     DISPLAYLEVEL(4, "block seed: %u\n", seed);
1184     initFrame(frame);
1185     op = (BYTE*)frame->data;
1186
1187     while (!blockWritten) {
1188         size_t cSize;
1189         /* generate window size */
1190         {   int const exponent = RAND(&seed) % (MAX_WINDOW_LOG - 10);
1191             int const mantissa = RAND(&seed) % 8;
1192             frame->header.windowSize = (1U << (exponent + 10));
1193             frame->header.windowSize += (frame->header.windowSize / 8) * mantissa;
1194         }
1195
1196         /* generate content size */
1197         {   size_t const maxBlockSize = MIN(g_maxBlockSize, frame->header.windowSize);
1198             if (RAND(&seed) & 15) {
1199                 /* some full size blocks */
1200                 blockContentSize = maxBlockSize;
1201             } else if (RAND(&seed) & 7 && g_maxBlockSize >= (1U << 7)) {
1202                 /* some small blocks <= 128 bytes*/
1203                 blockContentSize = RAND(&seed) % (1U << 7);
1204             } else {
1205                 /* some variable size blocks */
1206                 blockContentSize = RAND(&seed) % maxBlockSize;
1207             }
1208         }
1209
1210         /* try generating a compressed block */
1211         frame->oldStats = frame->stats;
1212         frame->data = op;
1213         cSize = writeCompressedBlock(&seed, frame, blockContentSize, info);
1214         if (cSize >= blockContentSize) {  /* compressed size must be strictly smaller than decompressed size : https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#blocks */
1215             /* data doesn't compress -- try again */
1216             frame->stats = frame->oldStats; /* don't update the stats */
1217             DISPLAYLEVEL(5, "   can't compress block : try again \n");
1218         } else {
1219             blockWritten = 1;
1220             DISPLAYLEVEL(4, "   block size: %u \n", (U32)cSize);
1221             frame->src = (BYTE*)frame->src + blockContentSize;
1222         }
1223     }
1224     return seed;
1225 }
1226
1227 /* Return the final seed */
1228 static U32 generateFrame(U32 seed, frame_t* fr, dictInfo info)
1229 {
1230     /* generate a complete frame */
1231     DISPLAYLEVEL(3, "frame seed: %u\n", seed);
1232     initFrame(fr);
1233
1234     writeFrameHeader(&seed, fr, info);
1235     writeBlocks(&seed, fr, info);
1236     writeChecksum(fr);
1237
1238     return seed;
1239 }
1240
1241 /*_*******************************************************
1242 *  Dictionary Helper Functions
1243 *********************************************************/
1244 /* returns 0 if successful, otherwise returns 1 upon error */
1245 static int genRandomDict(U32 dictID, U32 seed, size_t dictSize, BYTE* fullDict)
1246 {
1247     /* allocate space for samples */
1248     int ret = 0;
1249     unsigned const numSamples = 4;
1250     size_t sampleSizes[4];
1251     BYTE* const samples = malloc(5000*sizeof(BYTE));
1252     if (samples == NULL) {
1253         DISPLAY("Error: could not allocate space for samples\n");
1254         return 1;
1255     }
1256
1257     /* generate samples */
1258     {   unsigned literalValue = 1;
1259         unsigned samplesPos = 0;
1260         size_t currSize = 1;
1261         while (literalValue <= 4) {
1262             sampleSizes[literalValue - 1] = currSize;
1263             {   size_t k;
1264                 for (k = 0; k < currSize; k++) {
1265                     *(samples + (samplesPos++)) = (BYTE)literalValue;
1266             }   }
1267             literalValue++;
1268             currSize *= 16;
1269     }   }
1270
1271     {   size_t dictWriteSize = 0;
1272         ZDICT_params_t zdictParams;
1273         size_t const headerSize = MAX(dictSize/4, 256);
1274         size_t const dictContentSize = dictSize - headerSize;
1275         BYTE* const dictContent = fullDict + headerSize;
1276         if (dictContentSize < ZDICT_CONTENTSIZE_MIN || dictSize < ZDICT_DICTSIZE_MIN) {
1277             DISPLAY("Error: dictionary size is too small\n");
1278             ret = 1;
1279             goto exitGenRandomDict;
1280         }
1281
1282         /* init dictionary params */
1283         memset(&zdictParams, 0, sizeof(zdictParams));
1284         zdictParams.dictID = dictID;
1285         zdictParams.notificationLevel = 1;
1286
1287         /* fill in dictionary content */
1288         RAND_buffer(&seed, (void*)dictContent, dictContentSize);
1289
1290         /* finalize dictionary with random samples */
1291         dictWriteSize = ZDICT_finalizeDictionary(fullDict, dictSize,
1292                                     dictContent, dictContentSize,
1293                                     samples, sampleSizes, numSamples,
1294                                     zdictParams);
1295
1296         if (ZDICT_isError(dictWriteSize)) {
1297             DISPLAY("Could not finalize dictionary: %s\n", ZDICT_getErrorName(dictWriteSize));
1298             ret = 1;
1299         }
1300     }
1301
1302 exitGenRandomDict:
1303     free(samples);
1304     return ret;
1305 }
1306
1307 static dictInfo initDictInfo(int useDict, size_t dictContentSize, BYTE* dictContent, U32 dictID){
1308     /* allocate space statically */
1309     dictInfo dictOp;
1310     memset(&dictOp, 0, sizeof(dictOp));
1311     dictOp.useDict = useDict;
1312     dictOp.dictContentSize = dictContentSize;
1313     dictOp.dictContent = dictContent;
1314     dictOp.dictID = dictID;
1315     return dictOp;
1316 }
1317
1318 /*-*******************************************************
1319 *  Test Mode
1320 *********************************************************/
1321
/* Output buffer shared by the testDecode* functions below: each one
 * decompresses into it and compares the result with the generated content. */
BYTE DECOMPRESSED_BUFFER[MAX_DECOMPRESSED_SIZE];
1323
1324 static size_t testDecodeSimple(frame_t* fr)
1325 {
1326     /* test decoding the generated data with the simple API */
1327     size_t const ret = ZSTD_decompress(DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
1328                            fr->dataStart, (BYTE*)fr->data - (BYTE*)fr->dataStart);
1329
1330     if (ZSTD_isError(ret)) return ret;
1331
1332     if (memcmp(DECOMPRESSED_BUFFER, fr->srcStart,
1333                (BYTE*)fr->src - (BYTE*)fr->srcStart) != 0) {
1334         return ERROR(corruption_detected);
1335     }
1336
1337     return ret;
1338 }
1339
1340 static size_t testDecodeStreaming(frame_t* fr)
1341 {
1342     /* test decoding the generated data with the streaming API */
1343     ZSTD_DStream* zd = ZSTD_createDStream();
1344     ZSTD_inBuffer in;
1345     ZSTD_outBuffer out;
1346     size_t ret;
1347
1348     if (!zd) return ERROR(memory_allocation);
1349
1350     in.src = fr->dataStart;
1351     in.pos = 0;
1352     in.size = (BYTE*)fr->data - (BYTE*)fr->dataStart;
1353
1354     out.dst = DECOMPRESSED_BUFFER;
1355     out.pos = 0;
1356     out.size = ZSTD_DStreamOutSize();
1357
1358     ZSTD_initDStream(zd);
1359     while (1) {
1360         ret = ZSTD_decompressStream(zd, &out, &in);
1361         if (ZSTD_isError(ret)) goto cleanup; /* error */
1362         if (ret == 0) break; /* frame is done */
1363
1364         /* force decoding to be done in chunks */
1365         out.size += MIN(ZSTD_DStreamOutSize(), MAX_DECOMPRESSED_SIZE - out.size);
1366     }
1367
1368     ret = out.pos;
1369
1370     if (memcmp(out.dst, fr->srcStart, out.pos) != 0) {
1371         return ERROR(corruption_detected);
1372     }
1373
1374 cleanup:
1375     ZSTD_freeDStream(zd);
1376     return ret;
1377 }
1378
1379 static size_t testDecodeWithDict(U32 seed, genType_e genType)
1380 {
1381     /* create variables */
1382     size_t const dictSize = RAND(&seed) % (10 << 20) + ZDICT_DICTSIZE_MIN + ZDICT_CONTENTSIZE_MIN;
1383     U32 const dictID = RAND(&seed);
1384     size_t errorDetected = 0;
1385     BYTE* const fullDict = malloc(dictSize);
1386     if (fullDict == NULL) {
1387         return ERROR(GENERIC);
1388     }
1389
1390     /* generate random dictionary */
1391     if (genRandomDict(dictID, seed, dictSize, fullDict)) {  /* return 0 on success */
1392         errorDetected = ERROR(GENERIC);
1393         goto dictTestCleanup;
1394     }
1395
1396
1397     {   frame_t fr;
1398         dictInfo info;
1399         ZSTD_DCtx* const dctx = ZSTD_createDCtx();
1400         size_t ret;
1401
1402         /* get dict info */
1403         {   size_t const headerSize = MAX(dictSize/4, 256);
1404             size_t const dictContentSize = dictSize-headerSize;
1405             BYTE* const dictContent = fullDict+headerSize;
1406             info = initDictInfo(1, dictContentSize, dictContent, dictID);
1407         }
1408
1409         /* manually decompress and check difference */
1410         if (genType == gt_frame) {
1411             /* Test frame */
1412             generateFrame(seed, &fr, info);
1413             ret = ZSTD_decompress_usingDict(dctx, DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
1414                                             fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart,
1415                                             fullDict, dictSize);
1416         } else {
1417             /* Test block */
1418             generateCompressedBlock(seed, &fr, info);
1419             ret = ZSTD_decompressBegin_usingDict(dctx, fullDict, dictSize);
1420             if (ZSTD_isError(ret)) {
1421                 errorDetected = ret;
1422                 ZSTD_freeDCtx(dctx);
1423                 goto dictTestCleanup;
1424             }
1425             ret = ZSTD_decompressBlock(dctx, DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
1426                                        fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart);
1427         }
1428         ZSTD_freeDCtx(dctx);
1429
1430         if (ZSTD_isError(ret)) {
1431             errorDetected = ret;
1432             goto dictTestCleanup;
1433         }
1434
1435         if (memcmp(DECOMPRESSED_BUFFER, fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart) != 0) {
1436             errorDetected = ERROR(corruption_detected);
1437             goto dictTestCleanup;
1438         }
1439     }
1440
1441 dictTestCleanup:
1442     free(fullDict);
1443     return errorDetected;
1444 }
1445
1446 static size_t testDecodeRawBlock(frame_t* fr)
1447 {
1448     ZSTD_DCtx* dctx = ZSTD_createDCtx();
1449     size_t ret = ZSTD_decompressBegin(dctx);
1450     if (ZSTD_isError(ret)) return ret;
1451
1452     ret = ZSTD_decompressBlock(
1453             dctx,
1454             DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
1455             fr->dataStart, (BYTE*)fr->data - (BYTE*)fr->dataStart);
1456     ZSTD_freeDCtx(dctx);
1457     if (ZSTD_isError(ret)) return ret;
1458
1459     if (memcmp(DECOMPRESSED_BUFFER, fr->srcStart,
1460                (BYTE*)fr->src - (BYTE*)fr->srcStart) != 0) {
1461         return ERROR(corruption_detected);
1462     }
1463
1464     return ret;
1465 }
1466
1467 static int runBlockTest(U32* seed)
1468 {
1469     frame_t fr;
1470     U32 const seedCopy = *seed;
1471     {   dictInfo const info = initDictInfo(0, 0, NULL, 0);
1472         *seed = generateCompressedBlock(*seed, &fr, info);
1473     }
1474
1475     {   size_t const r = testDecodeRawBlock(&fr);
1476         if (ZSTD_isError(r)) {
1477             DISPLAY("Error in block mode on test seed %u: %s\n", seedCopy,
1478                     ZSTD_getErrorName(r));
1479             return 1;
1480         }
1481     }
1482
1483     {   size_t const r = testDecodeWithDict(*seed, gt_block);
1484         if (ZSTD_isError(r)) {
1485             DISPLAY("Error in block mode with dictionary on test seed %u: %s\n",
1486                     seedCopy, ZSTD_getErrorName(r));
1487             return 1;
1488         }
1489     }
1490     return 0;
1491 }
1492
1493 static int runFrameTest(U32* seed)
1494 {
1495     frame_t fr;
1496     U32 const seedCopy = *seed;
1497     {   dictInfo const info = initDictInfo(0, 0, NULL, 0);
1498         *seed = generateFrame(*seed, &fr, info);
1499     }
1500
1501     {   size_t const r = testDecodeSimple(&fr);
1502         if (ZSTD_isError(r)) {
1503             DISPLAY("Error in simple mode on test seed %u: %s\n",
1504                     seedCopy, ZSTD_getErrorName(r));
1505             return 1;
1506         }
1507     }
1508     {   size_t const r = testDecodeStreaming(&fr);
1509         if (ZSTD_isError(r)) {
1510             DISPLAY("Error in streaming mode on test seed %u: %s\n",
1511                     seedCopy, ZSTD_getErrorName(r));
1512             return 1;
1513         }
1514     }
1515     {   size_t const r = testDecodeWithDict(*seed, gt_frame);  /* avoid big dictionaries */
1516         if (ZSTD_isError(r)) {
1517             DISPLAY("Error in dictionary mode on test seed %u: %s\n",
1518                     seedCopy, ZSTD_getErrorName(r));
1519             return 1;
1520         }
1521     }
1522     return 0;
1523 }
1524
1525 static int runTestMode(U32 seed, unsigned numFiles, unsigned const testDurationS,
1526                        genType_e genType)
1527 {
1528     unsigned fnum;
1529
1530     UTIL_time_t const startClock = UTIL_getTime();
1531     U64 const maxClockSpan = testDurationS * SEC_TO_MICRO;
1532
1533     if (numFiles == 0 && !testDurationS) numFiles = 1;
1534
1535     DISPLAY("seed: %u\n", seed);
1536
1537     for (fnum = 0; fnum < numFiles || UTIL_clockSpanMicro(startClock) < maxClockSpan; fnum++) {
1538         if (fnum < numFiles)
1539             DISPLAYUPDATE("\r%u/%u        ", fnum, numFiles);
1540         else
1541             DISPLAYUPDATE("\r%u           ", fnum);
1542
1543         {   int const ret = (genType == gt_frame) ?
1544                             runFrameTest(&seed) :
1545                             runBlockTest(&seed);
1546             if (ret) return ret;
1547         }
1548     }
1549
1550     DISPLAY("\r%u tests completed: ", fnum);
1551     DISPLAY("OK\n");
1552
1553     return 0;
1554 }
1555
1556 /*-*******************************************************
1557 *  File I/O
1558 *********************************************************/
1559
1560 static int generateFile(U32 seed, const char* const path,
1561                         const char* const origPath, genType_e genType)
1562 {
1563     frame_t fr;
1564
1565     DISPLAY("seed: %u\n", seed);
1566
1567     {   dictInfo const info = initDictInfo(0, 0, NULL, 0);
1568         if (genType == gt_frame) {
1569             generateFrame(seed, &fr, info);
1570         } else {
1571             generateCompressedBlock(seed, &fr, info);
1572         }
1573     }
1574     outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, path);
1575     if (origPath) {
1576         outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, origPath);
1577     }
1578     return 0;
1579 }
1580
1581 static int generateCorpus(U32 seed, unsigned numFiles, const char* const path,
1582                           const char* const origPath, genType_e genType)
1583 {
1584     char outPath[MAX_PATH];
1585     unsigned fnum;
1586
1587     DISPLAY("seed: %u\n", seed);
1588
1589     for (fnum = 0; fnum < numFiles; fnum++) {
1590         frame_t fr;
1591
1592         DISPLAYUPDATE("\r%u/%u        ", fnum, numFiles);
1593
1594         {   dictInfo const info = initDictInfo(0, 0, NULL, 0);
1595             if (genType == gt_frame) {
1596                 seed = generateFrame(seed, &fr, info);
1597             } else {
1598                 seed = generateCompressedBlock(seed, &fr, info);
1599             }
1600         }
1601
1602         if (snprintf(outPath, MAX_PATH, "%s/z%06u.zst", path, fnum) + 1 > MAX_PATH) {
1603             DISPLAY("Error: path too long\n");
1604             return 1;
1605         }
1606         outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, outPath);
1607
1608         if (origPath) {
1609             if (snprintf(outPath, MAX_PATH, "%s/z%06u", origPath, fnum) + 1 > MAX_PATH) {
1610                 DISPLAY("Error: path too long\n");
1611                 return 1;
1612             }
1613             outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, outPath);
1614         }
1615     }
1616
1617     DISPLAY("\r%u/%u      \n", fnum, numFiles);
1618
1619     return 0;
1620 }
1621
1622 static int generateCorpusWithDict(U32 seed, unsigned numFiles, const char* const path,
1623                                   const char* const origPath, const size_t dictSize,
1624                                   genType_e genType)
1625 {
1626     char outPath[MAX_PATH];
1627     BYTE* fullDict;
1628     U32 const dictID = RAND(&seed);
1629     int errorDetected = 0;
1630
1631     if (snprintf(outPath, MAX_PATH, "%s/dictionary", path) + 1 > MAX_PATH) {
1632         DISPLAY("Error: path too long\n");
1633         return 1;
1634     }
1635
1636     /* allocate space for the dictionary */
1637     fullDict = malloc(dictSize);
1638     if (fullDict == NULL) {
1639         DISPLAY("Error: could not allocate space for full dictionary.\n");
1640         return 1;
1641     }
1642
1643     /* randomly generate the dictionary */
1644     {   int const ret = genRandomDict(dictID, seed, dictSize, fullDict);
1645         if (ret != 0) {
1646             errorDetected = ret;
1647             goto dictCleanup;
1648         }
1649     }
1650
1651     /* write out dictionary */
1652     if (numFiles != 0) {
1653         if (snprintf(outPath, MAX_PATH, "%s/dictionary", path) + 1 > MAX_PATH) {
1654             DISPLAY("Error: dictionary path too long\n");
1655             errorDetected = 1;
1656             goto dictCleanup;
1657         }
1658         outputBuffer(fullDict, dictSize, outPath);
1659     }
1660     else {
1661         outputBuffer(fullDict, dictSize, "dictionary");
1662     }
1663
1664     /* generate random compressed/decompressed files */
1665     {   unsigned fnum;
1666         for (fnum = 0; fnum < MAX(numFiles, 1); fnum++) {
1667             frame_t fr;
1668             DISPLAYUPDATE("\r%u/%u        ", fnum, numFiles);
1669             {
1670                 size_t const headerSize = MAX(dictSize/4, 256);
1671                 size_t const dictContentSize = dictSize-headerSize;
1672                 BYTE* const dictContent = fullDict+headerSize;
1673                 dictInfo const info = initDictInfo(1, dictContentSize, dictContent, dictID);
1674                 if (genType == gt_frame) {
1675                     seed = generateFrame(seed, &fr, info);
1676                 } else {
1677                     seed = generateCompressedBlock(seed, &fr, info);
1678                 }
1679             }
1680
1681             if (numFiles != 0) {
1682                 if (snprintf(outPath, MAX_PATH, "%s/z%06u.zst", path, fnum) + 1 > MAX_PATH) {
1683                     DISPLAY("Error: path too long\n");
1684                     errorDetected = 1;
1685                     goto dictCleanup;
1686                 }
1687                 outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, outPath);
1688
1689                 if (origPath) {
1690                     if (snprintf(outPath, MAX_PATH, "%s/z%06u", origPath, fnum) + 1 > MAX_PATH) {
1691                         DISPLAY("Error: path too long\n");
1692                         errorDetected = 1;
1693                         goto dictCleanup;
1694                     }
1695                     outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, outPath);
1696                 }
1697             }
1698             else {
1699                 outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, path);
1700                 if (origPath) {
1701                     outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, origPath);
1702                 }
1703             }
1704         }
1705     }
1706
1707 dictCleanup:
1708     free(fullDict);
1709     return errorDetected;
1710 }
1711
1712
1713 /*_*******************************************************
1714 *  Command line
1715 *********************************************************/
1716 static U32 makeSeed(void)
1717 {
1718     U32 t = (U32) time(NULL);
1719     return XXH32(&t, sizeof(t), 0) % 65536;
1720 }
1721
/* readInt() :
 * Reads the run of decimal digits starting at `*argument`.
 * `*argument` is advanced to the first character that is not a digit.
 * @return : the value read, 0 when there is no digit at all.
 * Note : no overflow detection, the value wraps on very long digit strings. */
static unsigned readInt(const char** argument)
{
    const char* cursor = *argument;
    unsigned value = 0;

    for ( ; (*cursor >= '0') && (*cursor <= '9'); cursor++) {
        value = (value * 10) + (unsigned)(*cursor - '0');
    }

    *argument = cursor;
    return value;
}
1732
/* usage() :
 * Prints the basic command line help through DISPLAY (stderr).
 * `programName` is shown as the name of the executable. */
static void usage(const char* programName)
{
    DISPLAY( "Usage :\n");
    DISPLAY( "      %s [args]\n", programName);
    DISPLAY( "\n");
    DISPLAY( "Arguments :\n");
    /* an output path is mandatory in generation mode : there is no stdout default */
    DISPLAY( " -p<path> : select output path (required unless -t is used)\n");
    DISPLAY( "                in multiple files mode this should be a directory\n");
    DISPLAY( " -o<path> : select path to output original file (default:no output)\n");
    DISPLAY( "                in multiple files mode this should be a directory\n");
    DISPLAY( " -s#      : select seed (default:random based on time)\n");
    DISPLAY( " -n#      : number of files to generate (default:1)\n");
    DISPLAY( " -t       : activate test mode (test files against libzstd instead of outputting them)\n");
    /* the command line parser reads seconds and accepts an 'm' / 'mn' suffix for minutes */
    DISPLAY( " -T#      : length of time to run tests for, in seconds (append 'm' for minutes)\n");
    DISPLAY( " -v       : increase verbosity level (default:0, max:7)\n");
    DISPLAY( " -h/H     : display help/long help and exit\n");
}
1750
/* advancedUsage() :
 * Prints the basic help (see usage()) followed by the description of the
 * long "--" options, all through DISPLAY. */
static void advancedUsage(const char* programName)
{
    static const char* const advancedHelp[] = {
        "\n",
        "Advanced arguments        :\n",
        " --content-size           : always include the content size in the frame header\n",
        " --use-dict=#             : include a dictionary used to decompress the corpus\n",
        " --gen-blocks             : generate raw compressed blocks without block/frame headers\n",
        " --max-block-size-log=#   : max block size log, must be in range [2, 17]\n",
        " --max-content-size-log=# : max content size log, must be <= 20\n",
        "                            (this is ignored with gen-blocks)\n"
    };
    size_t const nbLines = sizeof(advancedHelp) / sizeof(advancedHelp[0]);
    size_t lineNb;

    usage(programName);
    for (lineNb = 0; lineNb < nbLines; lineNb++) {
        DISPLAY("%s", advancedHelp[lineNb]);
    }
}
1763
/*! readU32FromChar() :
    Parses an unsigned decimal number at `*stringPtr`, optionally followed by
    a size suffix : K, KB, KiB (x1024) or M, MB, MiB (x1024x1024).
    `*stringPtr` is advanced past everything that was consumed.
    @return : the parsed value, suffix multiplier applied.
    Note : no overflow check, the result wraps if it does not fit in `unsigned` */
static unsigned readU32FromChar(const char** stringPtr)
{
    const char* cursor = *stringPtr;
    unsigned value = 0;

    /* decimal digits */
    for ( ; (*cursor >= '0') && (*cursor <= '9'); cursor++) {
        value = (value * 10) + (unsigned)(*cursor - '0');
    }

    /* optional unit : [K|M] then optional 'i' then optional 'B' */
    if ((*cursor == 'K') || (*cursor == 'M')) {
        unsigned const shift = (*cursor == 'M') ? 20 : 10;
        value <<= shift;
        cursor++;
        if (*cursor == 'i') cursor++;
        if (*cursor == 'B') cursor++;
    }

    *stringPtr = cursor;
    return value;
}
1783
/** longCommandWArg() :
 *  Tests whether the string at *stringPtr begins with longCommand.
 *  On a match, *stringPtr is moved just past longCommand and 1 is returned.
 *  Otherwise *stringPtr is left unchanged and 0 is returned.
 */
static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
{
    size_t const prefixLength = strlen(longCommand);

    if (strncmp(*stringPtr, longCommand, prefixLength) != 0) {
        return 0;
    }

    *stringPtr += prefixLength;
    return 1;
}
1796
1797 int main(int argc, char** argv)
1798 {
1799     U32 seed = 0;
1800     int seedset = 0;
1801     unsigned numFiles = 0;
1802     unsigned testDuration = 0;
1803     int testMode = 0;
1804     const char* path = NULL;
1805     const char* origPath = NULL;
1806     int useDict = 0;
1807     unsigned dictSize = (10 << 10); /* 10 kB default */
1808     genType_e genType = gt_frame;
1809
1810     int argNb;
1811
1812     /* Check command line */
1813     for (argNb=1; argNb<argc; argNb++) {
1814         const char* argument = argv[argNb];
1815         if(!argument) continue;   /* Protection if argument empty */
1816
1817         /* Handle commands. Aggregated commands are allowed */
1818         if (argument[0]=='-') {
1819             argument++;
1820             while (*argument!=0) {
1821                 switch(*argument)
1822                 {
1823                 case 'h':
1824                     usage(argv[0]);
1825                     return 0;
1826                 case 'H':
1827                     advancedUsage(argv[0]);
1828                     return 0;
1829                 case 'v':
1830                     argument++;
1831                     g_displayLevel++;
1832                     break;
1833                 case 's':
1834                     argument++;
1835                     seedset=1;
1836                     seed = readInt(&argument);
1837                     break;
1838                 case 'n':
1839                     argument++;
1840                     numFiles = readInt(&argument);
1841                     break;
1842                 case 'T':
1843                     argument++;
1844                     testDuration = readInt(&argument);
1845                     if (*argument == 'm') {
1846                         testDuration *= 60;
1847                         argument++;
1848                         if (*argument == 'n') argument++;
1849                     }
1850                     break;
1851                 case 'o':
1852                     argument++;
1853                     origPath = argument;
1854                     argument += strlen(argument);
1855                     break;
1856                 case 'p':
1857                     argument++;
1858                     path = argument;
1859                     argument += strlen(argument);
1860                     break;
1861                 case 't':
1862                     argument++;
1863                     testMode = 1;
1864                     break;
1865                 case '-':
1866                     argument++;
1867                     if (strcmp(argument, "content-size") == 0) {
1868                         opts.contentSize = 1;
1869                     } else if (longCommandWArg(&argument, "use-dict=")) {
1870                         dictSize = readU32FromChar(&argument);
1871                         useDict = 1;
1872                     } else if (strcmp(argument, "gen-blocks") == 0) {
1873                         genType = gt_block;
1874                     } else if (longCommandWArg(&argument, "max-block-size-log=")) {
1875                         U32 value = readU32FromChar(&argument);
1876                         if (value >= 2 && value <= ZSTD_BLOCKSIZE_MAX) {
1877                             g_maxBlockSize = 1U << value;
1878                         }
1879                     } else if (longCommandWArg(&argument, "max-content-size-log=")) {
1880                         U32 value = readU32FromChar(&argument);
1881                         g_maxDecompressedSizeLog =
1882                                 MIN(MAX_DECOMPRESSED_SIZE_LOG, value);
1883                     } else {
1884                         advancedUsage(argv[0]);
1885                         return 1;
1886                     }
1887                     argument += strlen(argument);
1888                     break;
1889                 default:
1890                     usage(argv[0]);
1891                     return 1;
1892     }   }   }   }   /* for (argNb=1; argNb<argc; argNb++) */
1893
1894     if (!seedset) {
1895         seed = makeSeed();
1896     }
1897
1898     if (testMode) {
1899         return runTestMode(seed, numFiles, testDuration, genType);
1900     } else {
1901         if (testDuration) {
1902             DISPLAY("Error: -T requires test mode (-t)\n\n");
1903             usage(argv[0]);
1904             return 1;
1905         }
1906     }
1907
1908     if (!path) {
1909         DISPLAY("Error: path is required in file generation mode\n");
1910         usage(argv[0]);
1911         return 1;
1912     }
1913
1914     if (numFiles == 0 && useDict == 0) {
1915         return generateFile(seed, path, origPath, genType);
1916     } else if (useDict == 0){
1917         return generateCorpus(seed, numFiles, path, origPath, genType);
1918     } else {
1919         /* should generate files with a dictionary */
1920         return generateCorpusWithDict(seed, numFiles, path, origPath, dictSize, genType);
1921     }
1922
1923 }