2 * Copyright (c) 2017-present, Yann Collet, Facebook, Inc.
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
20 #include "zstd_internal.h"
22 #define ZDICT_STATIC_LINKING_ONLY
25 /* Direct access to internal compression functions is required */
26 #include "zstd_compress.c"
28 #define XXH_STATIC_LINKING_ONLY
29 #include "xxhash.h" /* XXH64 */
32 #define MIN(a, b) ((a) < (b) ? (a) : (b))
37 #define MAX_PATH PATH_MAX
43 /*-************************************
45 **************************************/
46 #define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
47 #define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
48 static U32 g_displayLevel = 2;
50 #define DISPLAYUPDATE(...) \
52 if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || \
53 (g_displayLevel >= 4)) { \
54 g_displayClock = UTIL_getTime(); \
55 DISPLAY(__VA_ARGS__); \
56 if (g_displayLevel >= 4) fflush(stderr); \
60 static const U64 g_refreshRate = SEC_TO_MICRO / 6;
61 static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
63 #define CHECKERR(code) \
65 if (ZSTD_isError(code)) { \
66 DISPLAY("Error occurred while generating data: %s\n", \
67 ZSTD_getErrorName(code)); \
72 /*-*******************************************************
74 *********************************************************/
75 static U32 RAND(U32* src)
77 #define RAND_rotl32(x,r) ((x << r) | (x >> (32 - r)))
78 static const U32 prime1 = 2654435761U;
79 static const U32 prime2 = 2246822519U;
83 rand32 = RAND_rotl32(rand32, 13);
85 return RAND_rotl32(rand32, 27);
89 #define DISTSIZE (8192)
91 /* Write `size` bytes into `ptr`, all of which are less than or equal to `maxSymb` */
92 static void RAND_bufferMaxSymb(U32* seed, void* ptr, size_t size, int maxSymb)
97 for (i = 0; i < size; i++) {
98 op[i] = (BYTE) (RAND(seed) % (maxSymb + 1));
102 /* Write `size` random bytes into `ptr` */
103 static void RAND_buffer(U32* seed, void* ptr, size_t size)
108 for (i = 0; i + 4 <= size; i += 4) {
109 MEM_writeLE32(op + i, RAND(seed));
111 for (; i < size; i++) {
112 op[i] = RAND(seed) & 0xff;
116 /* Write `size` bytes into `ptr` following the distribution `dist` */
117 static void RAND_bufferDist(U32* seed, BYTE* dist, void* ptr, size_t size)
122 for (i = 0; i < size; i++) {
123 op[i] = dist[RAND(seed) % DISTSIZE];
127 /* Generate a random distribution where the frequency of each symbol follows a
128 * geometric distribution defined by `weight`
129 * `dist` should have size at least `DISTSIZE` */
130 static void RAND_genDist(U32* seed, BYTE* dist, double weight)
133 size_t statesLeft = DISTSIZE;
134 BYTE symb = (BYTE) (RAND(seed) % 256);
135 BYTE step = (BYTE) ((RAND(seed) % 256) | 1); /* force it to be odd so it's relatively prime to 256 */
137 while (i < DISTSIZE) {
138 size_t states = ((size_t)(weight * statesLeft)) + 1;
140 for (j = 0; j < states && i < DISTSIZE; j++, i++) {
145 statesLeft -= states;
149 /* Generates a random number in the range [min, max) */
150 static inline U32 RAND_range(U32* seed, U32 min, U32 max)
152 return (RAND(seed) % (max-min)) + min;
155 #define ROUND(x) ((U32)(x + 0.5))
157 /* Generates a random number in an exponential distribution with mean `mean` */
158 static double RAND_exp(U32* seed, double mean)
160 double const u = RAND(seed) / (double) UINT_MAX;
161 return log(1-u) * (-mean);
164 /*-*******************************************************
165 * Constants and Structs
166 *********************************************************/
167 const char *BLOCK_TYPES[] = {"raw", "rle", "compressed"};
169 #define MAX_DECOMPRESSED_SIZE_LOG 20
170 #define MAX_DECOMPRESSED_SIZE (1ULL << MAX_DECOMPRESSED_SIZE_LOG)
172 #define MAX_WINDOW_LOG 22 /* Recommended support is 8MB, so limit to 4MB + mantissa */
174 #define MIN_SEQ_LEN (3)
175 #define MAX_NB_SEQ ((ZSTD_BLOCKSIZE_MAX + MIN_SEQ_LEN - 1) / MIN_SEQ_LEN)
177 BYTE CONTENT_BUFFER[MAX_DECOMPRESSED_SIZE];
178 BYTE FRAME_BUFFER[MAX_DECOMPRESSED_SIZE * 2];
179 BYTE LITERAL_BUFFER[ZSTD_BLOCKSIZE_MAX];
181 seqDef SEQUENCE_BUFFER[MAX_NB_SEQ];
182 BYTE SEQUENCE_LITERAL_BUFFER[ZSTD_BLOCKSIZE_MAX]; /* storeSeq expects a place to copy literals to */
183 BYTE SEQUENCE_LLCODE[ZSTD_BLOCKSIZE_MAX];
184 BYTE SEQUENCE_MLCODE[ZSTD_BLOCKSIZE_MAX];
185 BYTE SEQUENCE_OFCODE[ZSTD_BLOCKSIZE_MAX];
190 size_t contentSize; /* 0 means unknown (unless contentSize == windowSize == 0) */
191 unsigned windowSize; /* contentSize >= windowSize means single segment */
194 /* For repeat modes */
196 U32 rep[ZSTD_REP_NUM];
199 /* the distribution used in the previous block for repeat mode */
200 BYTE hufDist[DISTSIZE];
201 U32 hufTable [256]; /* HUF_CElt is an incomplete type */
204 FSE_CTable offcodeCTable [FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
205 FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
206 FSE_CTable litlengthCTable [FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
208 /* Symbols that were present in the previous distribution, for use with
210 BYTE litlengthSymbolSet[36];
211 BYTE offsetSymbolSet[29];
212 BYTE matchlengthSymbolSet[53];
224 frameHeader_t header;
227 cblockStats_t oldStats; /* so they can be rolled back if uncompressible */
233 size_t dictContentSize;
238 gt_frame = 0, /* generate frames */
239 gt_block, /* generate compressed blocks without block/frame headers */
242 /*-*******************************************************
243 * Global variables (set from command line)
244 *********************************************************/
245 U32 g_maxDecompressedSizeLog = MAX_DECOMPRESSED_SIZE_LOG; /* <= 20 */
246 U32 g_maxBlockSize = ZSTD_BLOCKSIZE_MAX; /* <= 128 KB */
248 /*-*******************************************************
249 * Generator Functions
250 *********************************************************/
253 int contentSize; /* force the content size to be present */
254 } opts; /* advanced options on generation */
256 /* Generate and write a random frame header */
257 static void writeFrameHeader(U32* seed, frame_t* frame, dictInfo info)
259 BYTE* const op = frame->data;
265 int singleSegment = 0;
266 int contentSizeFlag = 0;
269 memset(&fh, 0, sizeof(fh));
271 /* generate window size */
273 /* Follow window algorithm from specification */
274 int const exponent = RAND(seed) % (MAX_WINDOW_LOG - 10);
275 int const mantissa = RAND(seed) % 8;
276 windowByte = (BYTE) ((exponent << 3) | mantissa);
277 fh.windowSize = (1U << (exponent + 10));
278 fh.windowSize += fh.windowSize / 8 * mantissa;
282 /* Generate random content size */
284 if (RAND(seed) & 7 && g_maxDecompressedSizeLog > 7) {
285 /* do content of at least 128 bytes */
286 highBit = 1ULL << RAND_range(seed, 7, g_maxDecompressedSizeLog);
287 } else if (RAND(seed) & 3) {
288 /* do small content */
289 highBit = 1ULL << RAND_range(seed, 0, MIN(7, 1U << g_maxDecompressedSizeLog));
294 fh.contentSize = highBit ? highBit + (RAND(seed) % highBit) : 0;
296 /* provide size sometimes */
297 contentSizeFlag = opts.contentSize | (RAND(seed) & 1);
299 if (contentSizeFlag && (fh.contentSize == 0 || !(RAND(seed) & 7))) {
300 /* do single segment sometimes */
301 fh.windowSize = (U32) fh.contentSize;
306 if (contentSizeFlag) {
307 /* Determine how large fcs field has to be */
308 int minFcsCode = (fh.contentSize >= 256) +
309 (fh.contentSize >= 65536 + 256) +
310 (fh.contentSize > 0xFFFFFFFFU);
311 if (!singleSegment && !minFcsCode) {
314 fcsCode = minFcsCode + (RAND(seed) % (4 - minFcsCode));
315 if (fcsCode == 1 && fh.contentSize < 256) fcsCode++;
318 /* write out the header */
319 MEM_writeLE32(op + pos, ZSTD_MAGICNUMBER);
324 * fcsCode: 2-bit flag specifying how many bytes used to represent Frame_Content_Size (bits 7-6)
325 * singleSegment: 1-bit flag describing if data must be regenerated within a single continuous memory segment. (bit 5)
326 * contentChecksumFlag: 1-bit flag that is set if frame includes checksum at the end -- set to 1 below (bit 2)
327 * dictBits: 2-bit flag describing how many bytes Dictionary_ID uses -- set to 3 (bits 1-0)
328 * For more information: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_header
330 int const dictBits = info.useDict ? 3 : 0;
331 BYTE const frameHeaderDescriptor =
332 (BYTE) ((fcsCode << 6) | (singleSegment << 5) | (1 << 2) | dictBits);
333 op[pos++] = frameHeaderDescriptor;
336 if (!singleSegment) {
337 op[pos++] = windowByte;
340 MEM_writeLE32(op + pos, (U32) info.dictID);
343 if (contentSizeFlag) {
345 default: /* Impossible */
346 case 0: op[pos++] = (BYTE) fh.contentSize; break;
347 case 1: MEM_writeLE16(op + pos, (U16) (fh.contentSize - 256)); pos += 2; break;
348 case 2: MEM_writeLE32(op + pos, (U32) fh.contentSize); pos += 4; break;
349 case 3: MEM_writeLE64(op + pos, (U64) fh.contentSize); pos += 8; break;
353 DISPLAYLEVEL(3, " frame content size:\t%u\n", (unsigned)fh.contentSize);
354 DISPLAYLEVEL(3, " frame window size:\t%u\n", fh.windowSize);
355 DISPLAYLEVEL(3, " content size flag:\t%d\n", contentSizeFlag);
356 DISPLAYLEVEL(3, " single segment flag:\t%d\n", singleSegment);
358 frame->data = op + pos;
362 /* Write a literal block in either raw or RLE form, return the literals size */
363 static size_t writeLiteralsBlockSimple(U32* seed, frame_t* frame, size_t contentSize)
365 BYTE* op = (BYTE*)frame->data;
366 int const type = RAND(seed) % 2;
367 int const sizeFormatDesc = RAND(seed) % 8;
369 size_t maxLitSize = MIN(contentSize, g_maxBlockSize);
371 if (sizeFormatDesc == 0) {
372 /* Size_FormatDesc = ?0 */
373 maxLitSize = MIN(maxLitSize, 31);
374 } else if (sizeFormatDesc <= 4) {
375 /* Size_FormatDesc = 01 */
376 maxLitSize = MIN(maxLitSize, 4095);
378 /* Size_Format = 11 */
379 maxLitSize = MIN(maxLitSize, 1048575);
382 litSize = RAND(seed) % (maxLitSize + 1);
383 if (frame->src == frame->srcStart && litSize == 0) {
384 litSize = 1; /* no empty literals if there's nothing preceding this block */
386 if (litSize + 3 > contentSize) {
387 litSize = contentSize; /* no matches shorter than 3 are allowed */
389 /* use smallest size format that fits */
391 op[0] = (type | (0 << 2) | (litSize << 3)) & 0xff;
393 } else if (litSize < 4096) {
394 op[0] = (type | (1 << 2) | (litSize << 4)) & 0xff;
395 op[1] = (litSize >> 4) & 0xff;
398 op[0] = (type | (3 << 2) | (litSize << 4)) & 0xff;
399 op[1] = (litSize >> 4) & 0xff;
400 op[2] = (litSize >> 12) & 0xff;
406 DISPLAYLEVEL(4, " raw literals\n");
408 RAND_buffer(seed, LITERAL_BUFFER, litSize);
409 memcpy(op, LITERAL_BUFFER, litSize);
413 BYTE const symb = (BYTE) (RAND(seed) % 256);
415 DISPLAYLEVEL(4, " rle literals: 0x%02x\n", (unsigned)symb);
417 memset(LITERAL_BUFFER, symb, litSize);
427 /* Generate a Huffman header for the given source */
428 static size_t writeHufHeader(U32* seed, HUF_CElt* hufTable, void* dst, size_t dstSize,
429 const void* src, size_t srcSize)
431 BYTE* const ostart = (BYTE*)dst;
434 unsigned huffLog = 11;
435 unsigned maxSymbolValue = 255;
437 unsigned count[HUF_SYMBOLVALUE_MAX+1];
439 /* Scan input and build symbol stats */
440 { size_t const largest = HIST_count_wksp (count, &maxSymbolValue, (const BYTE*)src, srcSize, WKSP, sizeof(WKSP));
441 assert(!HIST_isError(largest));
442 if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 0; } /* single symbol, rle */
443 if (largest <= (srcSize >> 7)+1) return 0; /* Fast heuristic : not compressible enough */
446 /* Build Huffman Tree */
447 /* Max Huffman log is 11, min is highbit(maxSymbolValue)+1 */
448 huffLog = RAND_range(seed, ZSTD_highbit32(maxSymbolValue)+1, huffLog+1);
449 DISPLAYLEVEL(6, " huffman log: %u\n", huffLog);
450 { size_t const maxBits = HUF_buildCTable_wksp (hufTable, count, maxSymbolValue, huffLog, WKSP, sizeof(WKSP));
452 huffLog = (U32)maxBits;
455 /* Write table description header */
456 { size_t const hSize = HUF_writeCTable (op, dstSize, hufTable, maxSymbolValue, huffLog);
457 if (hSize + 12 >= srcSize) return 0; /* not useful to try compression */
464 /* Write a Huffman coded literals block and return the literals size */
465 static size_t writeLiteralsBlockCompressed(U32* seed, frame_t* frame, size_t contentSize)
467 BYTE* origop = (BYTE*)frame->data;
468 BYTE* opend = (BYTE*)frame->dataEnd;
470 BYTE* const ostart = origop;
471 int const sizeFormat = RAND(seed) % 4;
473 size_t hufHeaderSize = 0;
474 size_t compressedSize = 0;
475 size_t maxLitSize = MIN(contentSize-3, g_maxBlockSize);
477 symbolEncodingType_e hType;
479 if (contentSize < 64) {
480 /* make sure we get reasonably-sized literals for compression */
481 return ERROR(GENERIC);
484 DISPLAYLEVEL(4, " compressed literals\n");
486 switch (sizeFormat) {
487 case 0: /* fall through, size is the same as case 1 */
489 maxLitSize = MIN(maxLitSize, 1023);
493 maxLitSize = MIN(maxLitSize, 16383);
497 maxLitSize = MIN(maxLitSize, 262143);
500 default:; /* impossible */
506 litSize = RAND(seed) % (maxLitSize + 1);
507 } while (litSize < 32); /* avoid small literal sizes */
508 if (litSize + 3 > contentSize) {
509 litSize = contentSize; /* no matches shorter than 3 are allowed */
512 /* most of the time generate a new distribution */
513 if ((RAND(seed) & 3) || !frame->stats.hufInit) {
515 if (RAND(seed) & 3) {
516 /* add 10 to ensure some compressability */
517 double const weight = ((RAND(seed) % 90) + 10) / 100.0;
519 DISPLAYLEVEL(5, " distribution weight: %d%%\n",
520 (int)(weight * 100));
522 RAND_genDist(seed, frame->stats.hufDist, weight);
524 /* sometimes do restricted range literals to force
525 * non-huffman headers */
526 DISPLAYLEVEL(5, " small range literals\n");
527 RAND_bufferMaxSymb(seed, frame->stats.hufDist, DISTSIZE,
530 RAND_bufferDist(seed, frame->stats.hufDist, LITERAL_BUFFER,
533 /* generate the header from the distribution instead of the
534 * actual data to avoid bugs with symbols that were in the
535 * distribution but never showed up in the output */
536 hufHeaderSize = writeHufHeader(
537 seed, (HUF_CElt*)frame->stats.hufTable, op, opend - op,
538 frame->stats.hufDist, DISTSIZE);
539 CHECKERR(hufHeaderSize);
540 /* repeat until a valid header is written */
541 } while (hufHeaderSize == 0);
543 hType = set_compressed;
545 frame->stats.hufInit = 1;
547 /* repeat the distribution/table from last time */
548 DISPLAYLEVEL(5, " huffman repeat stats\n");
549 RAND_bufferDist(seed, frame->stats.hufDist, LITERAL_BUFFER,
558 ? HUF_compress1X_usingCTable(
559 op, opend - op, LITERAL_BUFFER, litSize,
560 (HUF_CElt*)frame->stats.hufTable)
561 : HUF_compress4X_usingCTable(
562 op, opend - op, LITERAL_BUFFER, litSize,
563 (HUF_CElt*)frame->stats.hufTable);
564 CHECKERR(compressedSize);
565 /* this only occurs when it could not compress or similar */
566 } while (compressedSize <= 0);
568 op += compressedSize;
570 compressedSize += hufHeaderSize;
571 DISPLAYLEVEL(5, " regenerated size: %u\n", (unsigned)litSize);
572 DISPLAYLEVEL(5, " compressed size: %u\n", (unsigned)compressedSize);
573 if (compressedSize >= litSize) {
574 DISPLAYLEVEL(5, " trying again\n");
575 /* if we have to try again, reset the stats so we don't accidentally
576 * try to repeat a distribution we just made */
577 frame->stats = frame->oldStats;
584 switch (sizeFormat) {
585 case 0: /* fall through, size is the same as case 1 */
587 U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) |
588 ((U32)compressedSize << 14);
589 MEM_writeLE24(ostart, header);
593 U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) |
594 ((U32)compressedSize << 18);
595 MEM_writeLE32(ostart, header);
599 U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) |
600 ((U32)compressedSize << 22);
601 MEM_writeLE32(ostart, header);
602 ostart[4] = (BYTE)(compressedSize >> 10);
605 default:; /* impossible */
612 static size_t writeLiteralsBlock(U32* seed, frame_t* frame, size_t contentSize)
614 /* only do compressed for larger segments to avoid compressibility issues */
615 if (RAND(seed) & 7 && contentSize >= 64) {
616 return writeLiteralsBlockCompressed(seed, frame, contentSize);
618 return writeLiteralsBlockSimple(seed, frame, contentSize);
622 static inline void initSeqStore(seqStore_t *seqStore) {
623 seqStore->maxNbSeq = MAX_NB_SEQ;
624 seqStore->maxNbLit = ZSTD_BLOCKSIZE_MAX;
625 seqStore->sequencesStart = SEQUENCE_BUFFER;
626 seqStore->litStart = SEQUENCE_LITERAL_BUFFER;
627 seqStore->llCode = SEQUENCE_LLCODE;
628 seqStore->mlCode = SEQUENCE_MLCODE;
629 seqStore->ofCode = SEQUENCE_OFCODE;
631 ZSTD_resetSeqStore(seqStore);
634 /* Randomly generate sequence commands */
635 static U32 generateSequences(U32* seed, frame_t* frame, seqStore_t* seqStore,
636 size_t contentSize, size_t literalsSize, dictInfo info)
638 /* The total length of all the matches */
639 size_t const remainingMatch = contentSize - literalsSize;
640 size_t excessMatch = 0;
641 U32 numSequences = 0;
646 const BYTE* literals = LITERAL_BUFFER;
647 BYTE* srcPtr = frame->src;
649 if (literalsSize != contentSize) {
650 /* each match must be at least MIN_SEQ_LEN, so this is the maximum
651 * number of sequences we can have */
652 U32 const maxSequences = (U32)remainingMatch / MIN_SEQ_LEN;
653 numSequences = (RAND(seed) % maxSequences) + 1;
655 /* the extra match lengths we have to allocate to each sequence */
656 excessMatch = remainingMatch - numSequences * MIN_SEQ_LEN;
659 DISPLAYLEVEL(5, " total match lengths: %u\n", (unsigned)remainingMatch);
660 for (i = 0; i < numSequences; i++) {
661 /* Generate match and literal lengths by exponential distribution to
662 * ensure nice numbers */
665 ROUND(RAND_exp(seed, excessMatch / (double)(numSequences - i)));
668 ? ROUND(RAND_exp(seed,
670 (double)(numSequences - i)))
672 /* actual offset, code to send, and point to copy up to when shifting
673 * codes in the repeat offsets history */
674 U32 offset, offsetCode, repIndex;
677 matchLen = (U32) MIN(matchLen, excessMatch + MIN_SEQ_LEN);
678 literalLen = MIN(literalLen, (U32) literalsSize);
679 if (i == 0 && srcPtr == frame->srcStart && literalLen == 0) literalLen = 1;
680 if (i + 1 == numSequences) matchLen = MIN_SEQ_LEN + (U32) excessMatch;
682 memcpy(srcPtr, literals, literalLen);
683 srcPtr += literalLen;
685 if (RAND(seed) & 7) {
686 /* do a normal offset */
687 U32 const dataDecompressed = (U32)((BYTE*)srcPtr-(BYTE*)frame->srcStart);
688 offset = (RAND(seed) %
689 MIN(frame->header.windowSize,
690 (size_t)((BYTE*)srcPtr - (BYTE*)frame->srcStart))) +
692 if (info.useDict && (RAND(seed) & 1) && i + 1 != numSequences && dataDecompressed < frame->header.windowSize) {
693 /* need to occasionally generate offsets that go past the start */
694 /* including i+1 != numSequences because the last sequences has to adhere to predetermined contentSize */
695 U32 lenPastStart = (RAND(seed) % info.dictContentSize) + 1;
696 offset = (U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart)+lenPastStart;
697 if (offset > frame->header.windowSize) {
698 if (lenPastStart < MIN_SEQ_LEN) {
699 /* when offset > windowSize, matchLen bound by end of dictionary (lenPastStart) */
700 /* this also means that lenPastStart must be greater than MIN_SEQ_LEN */
701 /* make sure lenPastStart does not go past dictionary start though */
702 lenPastStart = MIN(lenPastStart+MIN_SEQ_LEN, (U32)info.dictContentSize);
703 offset = (U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart) + lenPastStart;
706 U32 const matchLenBound = MIN(frame->header.windowSize, lenPastStart);
707 matchLen = MIN(matchLen, matchLenBound);
711 offsetCode = offset + ZSTD_REP_MOVE;
714 /* do a repeat offset */
715 offsetCode = RAND(seed) % 3;
716 if (literalLen > 0) {
717 offset = frame->stats.rep[offsetCode];
718 repIndex = offsetCode;
721 offset = offsetCode == 2 ? frame->stats.rep[0] - 1
722 : frame->stats.rep[offsetCode + 1];
723 repIndex = MIN(2, offsetCode + 1);
726 } while (((!info.useDict) && (offset > (size_t)((BYTE*)srcPtr - (BYTE*)frame->srcStart))) || offset == 0);
730 BYTE* const dictEnd = info.dictContent + info.dictContentSize;
731 for (j = 0; j < matchLen; j++) {
732 if ((U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart) < offset) {
733 /* copy from dictionary instead of literals */
734 size_t const dictOffset = offset - (srcPtr - (BYTE*)frame->srcStart);
735 *srcPtr = *(dictEnd - dictOffset);
738 *srcPtr = *(srcPtr-offset);
745 for (r = repIndex; r > 0; r--) {
746 frame->stats.rep[r] = frame->stats.rep[r - 1];
748 frame->stats.rep[0] = offset;
751 DISPLAYLEVEL(6, " LL: %5u OF: %5u ML: %5u",
752 (unsigned)literalLen, (unsigned)offset, (unsigned)matchLen);
753 DISPLAYLEVEL(7, " srcPos: %8u seqNb: %3u",
754 (unsigned)((BYTE*)srcPtr - (BYTE*)frame->srcStart), (unsigned)i);
755 DISPLAYLEVEL(6, "\n");
756 if (offsetCode < 3) {
757 DISPLAYLEVEL(7, " repeat offset: %d\n", (int)repIndex);
759 /* use libzstd sequence handling */
760 ZSTD_storeSeq(seqStore, literalLen, literals, offsetCode,
761 matchLen - MINMATCH);
763 literalsSize -= literalLen;
764 excessMatch -= (matchLen - MIN_SEQ_LEN);
765 literals += literalLen;
768 memcpy(srcPtr, literals, literalsSize);
769 srcPtr += literalsSize;
770 DISPLAYLEVEL(6, " excess literals: %5u", (unsigned)literalsSize);
771 DISPLAYLEVEL(7, " srcPos: %8u", (unsigned)((BYTE*)srcPtr - (BYTE*)frame->srcStart));
772 DISPLAYLEVEL(6, "\n");
777 static void initSymbolSet(const BYTE* symbols, size_t len, BYTE* set, BYTE maxSymbolValue)
781 memset(set, 0, (size_t)maxSymbolValue+1);
783 for (i = 0; i < len; i++) {
788 static int isSymbolSubset(const BYTE* symbols, size_t len, const BYTE* set, BYTE maxSymbolValue)
792 for (i = 0; i < len; i++) {
793 if (symbols[i] > maxSymbolValue || !set[symbols[i]]) {
800 static size_t writeSequences(U32* seed, frame_t* frame, seqStore_t* seqStorePtr,
803 /* This code is mostly copied from ZSTD_compressSequences in zstd_compress.c */
804 unsigned count[MaxSeq+1];
806 FSE_CTable* CTable_LitLength = frame->stats.litlengthCTable;
807 FSE_CTable* CTable_OffsetBits = frame->stats.offcodeCTable;
808 FSE_CTable* CTable_MatchLength = frame->stats.matchlengthCTable;
809 U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */
810 const seqDef* const sequences = seqStorePtr->sequencesStart;
811 const BYTE* const ofCodeTable = seqStorePtr->ofCode;
812 const BYTE* const llCodeTable = seqStorePtr->llCode;
813 const BYTE* const mlCodeTable = seqStorePtr->mlCode;
814 BYTE* const oend = (BYTE*)frame->dataEnd;
815 BYTE* op = (BYTE*)frame->data;
817 BYTE scratchBuffer[1<<MAX(MLFSELog,LLFSELog)];
819 /* literals compressing block removed so that can be done separately */
821 /* Sequences Header */
822 if ((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead */) return ERROR(dstSize_tooSmall);
823 if (nbSeq < 0x7F) *op++ = (BYTE)nbSeq;
824 else if (nbSeq < LONGNBSEQ) op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2;
825 else op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;
832 /* seqHead : flags for FSE encoding type */
835 /* convert length/distances into codes */
836 ZSTD_seqToCodes(seqStorePtr);
838 /* CTable for Literal Lengths */
839 { unsigned max = MaxLL;
840 size_t const mostFrequent = HIST_countFast_wksp(count, &max, llCodeTable, nbSeq, WKSP, sizeof(WKSP)); /* cannot fail */
841 assert(!HIST_isError(mostFrequent));
842 if (mostFrequent == nbSeq) {
843 /* do RLE if we have the chance */
844 *op++ = llCodeTable[0];
845 FSE_buildCTable_rle(CTable_LitLength, (BYTE)max);
847 } else if (frame->stats.fseInit && !(RAND(seed) & 3) &&
848 isSymbolSubset(llCodeTable, nbSeq,
849 frame->stats.litlengthSymbolSet, 35)) {
850 /* maybe do repeat mode if we're allowed to */
852 } else if (!(RAND(seed) & 3)) {
853 /* maybe use the default distribution */
854 FSE_buildCTable_wksp(CTable_LitLength, LL_defaultNorm, MaxLL, LL_defaultNormLog, scratchBuffer, sizeof(scratchBuffer));
857 /* fall back on a full table */
858 size_t nbSeq_1 = nbSeq;
859 const U32 tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max);
860 if (count[llCodeTable[nbSeq-1]]>1) { count[llCodeTable[nbSeq-1]]--; nbSeq_1--; }
861 FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max);
862 { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */
863 if (FSE_isError(NCountSize)) return ERROR(GENERIC);
865 FSE_buildCTable_wksp(CTable_LitLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer));
866 LLtype = set_compressed;
869 /* CTable for Offsets */
870 /* see Literal Lengths for descriptions of mode choices */
871 { unsigned max = MaxOff;
872 size_t const mostFrequent = HIST_countFast_wksp(count, &max, ofCodeTable, nbSeq, WKSP, sizeof(WKSP)); /* cannot fail */
873 assert(!HIST_isError(mostFrequent));
874 if (mostFrequent == nbSeq) {
875 *op++ = ofCodeTable[0];
876 FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max);
878 } else if (frame->stats.fseInit && !(RAND(seed) & 3) &&
879 isSymbolSubset(ofCodeTable, nbSeq,
880 frame->stats.offsetSymbolSet, 28)) {
881 Offtype = set_repeat;
882 } else if (!(RAND(seed) & 3)) {
883 FSE_buildCTable_wksp(CTable_OffsetBits, OF_defaultNorm, DefaultMaxOff, OF_defaultNormLog, scratchBuffer, sizeof(scratchBuffer));
886 size_t nbSeq_1 = nbSeq;
887 const U32 tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max);
888 if (count[ofCodeTable[nbSeq-1]]>1) { count[ofCodeTable[nbSeq-1]]--; nbSeq_1--; }
889 FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max);
890 { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */
891 if (FSE_isError(NCountSize)) return ERROR(GENERIC);
893 FSE_buildCTable_wksp(CTable_OffsetBits, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer));
894 Offtype = set_compressed;
897 /* CTable for MatchLengths */
898 /* see Literal Lengths for descriptions of mode choices */
899 { unsigned max = MaxML;
900 size_t const mostFrequent = HIST_countFast_wksp(count, &max, mlCodeTable, nbSeq, WKSP, sizeof(WKSP)); /* cannot fail */
901 assert(!HIST_isError(mostFrequent));
902 if (mostFrequent == nbSeq) {
903 *op++ = *mlCodeTable;
904 FSE_buildCTable_rle(CTable_MatchLength, (BYTE)max);
906 } else if (frame->stats.fseInit && !(RAND(seed) & 3) &&
907 isSymbolSubset(mlCodeTable, nbSeq,
908 frame->stats.matchlengthSymbolSet, 52)) {
910 } else if (!(RAND(seed) & 3)) {
911 /* sometimes do default distribution */
912 FSE_buildCTable_wksp(CTable_MatchLength, ML_defaultNorm, MaxML, ML_defaultNormLog, scratchBuffer, sizeof(scratchBuffer));
915 /* fall back on table */
916 size_t nbSeq_1 = nbSeq;
917 const U32 tableLog = FSE_optimalTableLog(MLFSELog, nbSeq, max);
918 if (count[mlCodeTable[nbSeq-1]]>1) { count[mlCodeTable[nbSeq-1]]--; nbSeq_1--; }
919 FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max);
920 { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog); /* overflow protected */
921 if (FSE_isError(NCountSize)) return ERROR(GENERIC);
923 FSE_buildCTable_wksp(CTable_MatchLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer));
924 MLtype = set_compressed;
926 frame->stats.fseInit = 1;
927 initSymbolSet(llCodeTable, nbSeq, frame->stats.litlengthSymbolSet, 35);
928 initSymbolSet(ofCodeTable, nbSeq, frame->stats.offsetSymbolSet, 28);
929 initSymbolSet(mlCodeTable, nbSeq, frame->stats.matchlengthSymbolSet, 52);
931 DISPLAYLEVEL(5, " LL type: %d OF type: %d ML type: %d\n", (unsigned)LLtype, (unsigned)Offtype, (unsigned)MLtype);
933 *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2));
935 /* Encoding Sequences */
936 { BIT_CStream_t blockStream;
937 FSE_CState_t stateMatchLength;
938 FSE_CState_t stateOffsetBits;
939 FSE_CState_t stateLitLength;
941 CHECK_E(BIT_initCStream(&blockStream, op, oend-op), dstSize_tooSmall); /* not enough space remaining */
944 FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]);
945 FSE_initCState2(&stateOffsetBits, CTable_OffsetBits, ofCodeTable[nbSeq-1]);
946 FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq-1]);
947 BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]);
948 if (MEM_32bits()) BIT_flushBits(&blockStream);
949 BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCodeTable[nbSeq-1]]);
950 if (MEM_32bits()) BIT_flushBits(&blockStream);
951 BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[nbSeq-1]);
952 BIT_flushBits(&blockStream);
955 for (n=nbSeq-2 ; n<nbSeq ; n--) { /* intentional underflow */
956 BYTE const llCode = llCodeTable[n];
957 BYTE const ofCode = ofCodeTable[n];
958 BYTE const mlCode = mlCodeTable[n];
959 U32 const llBits = LL_bits[llCode];
960 U32 const ofBits = ofCode; /* 32b*/ /* 64b*/
961 U32 const mlBits = ML_bits[mlCode];
963 FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode); /* 15 */ /* 15 */
964 FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode); /* 24 */ /* 24 */
965 if (MEM_32bits()) BIT_flushBits(&blockStream); /* (7)*/
966 FSE_encodeSymbol(&blockStream, &stateLitLength, llCode); /* 16 */ /* 33 */
967 if (MEM_32bits() || (ofBits+mlBits+llBits >= 64-7-(LLFSELog+MLFSELog+OffFSELog)))
968 BIT_flushBits(&blockStream); /* (7)*/
969 BIT_addBits(&blockStream, sequences[n].litLength, llBits);
970 if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream);
971 BIT_addBits(&blockStream, sequences[n].matchLength, mlBits);
972 if (MEM_32bits()) BIT_flushBits(&blockStream); /* (7)*/
973 BIT_addBits(&blockStream, sequences[n].offset, ofBits); /* 31 */
974 BIT_flushBits(&blockStream); /* (7)*/
977 FSE_flushCState(&blockStream, &stateMatchLength);
978 FSE_flushCState(&blockStream, &stateOffsetBits);
979 FSE_flushCState(&blockStream, &stateLitLength);
981 { size_t const streamSize = BIT_closeCStream(&blockStream);
982 if (streamSize==0) return ERROR(dstSize_tooSmall); /* not enough space */
991 static size_t writeSequencesBlock(U32* seed, frame_t* frame, size_t contentSize,
992 size_t literalsSize, dictInfo info)
998 initSeqStore(&seqStore);
1000 /* randomly generate sequences */
1001 numSequences = generateSequences(seed, frame, &seqStore, contentSize, literalsSize, info);
1002 /* write them out to the frame data */
1003 CHECKERR(writeSequences(seed, frame, &seqStore, numSequences));
1005 return numSequences;
1008 static size_t writeCompressedBlock(U32* seed, frame_t* frame, size_t contentSize, dictInfo info)
1010 BYTE* const blockStart = (BYTE*)frame->data;
1011 size_t literalsSize;
1014 DISPLAYLEVEL(4, " compressed block:\n");
1016 literalsSize = writeLiteralsBlock(seed, frame, contentSize);
1018 DISPLAYLEVEL(4, " literals size: %u\n", (unsigned)literalsSize);
1020 nbSeq = writeSequencesBlock(seed, frame, contentSize, literalsSize, info);
1022 DISPLAYLEVEL(4, " number of sequences: %u\n", (unsigned)nbSeq);
1024 return (BYTE*)frame->data - blockStart;
1027 static void writeBlock(U32* seed, frame_t* frame, size_t contentSize,
1028 int lastBlock, dictInfo info)
1030 int const blockTypeDesc = RAND(seed) % 8;
1034 BYTE *const header = (BYTE*)frame->data;
1035 BYTE *op = header + 3;
1037 DISPLAYLEVEL(4, " block:\n");
1038 DISPLAYLEVEL(4, " block content size: %u\n", (unsigned)contentSize);
1039 DISPLAYLEVEL(4, " last block: %s\n", lastBlock ? "yes" : "no");
1041 if (blockTypeDesc == 0) {
1042 /* Raw data frame */
1044 RAND_buffer(seed, frame->src, contentSize);
1045 memcpy(op, frame->src, contentSize);
1049 blockSize = contentSize;
1050 } else if (blockTypeDesc == 1) {
1052 BYTE const symbol = RAND(seed) & 0xff;
1055 memset(frame->src, symbol, contentSize);
1059 blockSize = contentSize;
1061 /* compressed, most common */
1062 size_t compressedSize;
1065 frame->oldStats = frame->stats;
1068 compressedSize = writeCompressedBlock(seed, frame, contentSize, info);
1069 if (compressedSize >= contentSize) { /* compressed block must be strictly smaller than uncompressed one */
1071 memcpy(op, frame->src, contentSize);
1074 blockSize = contentSize; /* fall back on raw block if data doesn't
1077 frame->stats = frame->oldStats; /* don't update the stats */
1079 op += compressedSize;
1080 blockSize = compressedSize;
1083 frame->src = (BYTE*)frame->src + contentSize;
1085 DISPLAYLEVEL(4, " block type: %s\n", BLOCK_TYPES[blockType]);
1086 DISPLAYLEVEL(4, " block size field: %u\n", (unsigned)blockSize);
1088 header[0] = (BYTE) ((lastBlock | (blockType << 1) | (blockSize << 3)) & 0xff);
1089 MEM_writeLE16(header + 1, (U16) (blockSize >> 5));
1094 static void writeBlocks(U32* seed, frame_t* frame, dictInfo info)
1096 size_t contentLeft = frame->header.contentSize;
1097 size_t const maxBlockSize = MIN(g_maxBlockSize, frame->header.windowSize);
1099 /* 1 in 4 chance of ending frame */
1100 int const lastBlock = contentLeft > maxBlockSize ? 0 : !(RAND(seed) & 3);
1101 size_t blockContentSize;
1103 blockContentSize = contentLeft;
1105 if (contentLeft > 0 && (RAND(seed) & 7)) {
1106 /* some variable size block */
1107 blockContentSize = RAND(seed) % (MIN(maxBlockSize, contentLeft)+1);
1108 } else if (contentLeft > maxBlockSize && (RAND(seed) & 1)) {
1109 /* some full size block */
1110 blockContentSize = maxBlockSize;
1112 /* some empty block */
1113 blockContentSize = 0;
1117 writeBlock(seed, frame, blockContentSize, lastBlock, info);
1119 contentLeft -= blockContentSize;
1120 if (lastBlock) break;
1124 static void writeChecksum(frame_t* frame)
1126 /* write checksum so implementations can verify their output */
1127 U64 digest = XXH64(frame->srcStart, (BYTE*)frame->src-(BYTE*)frame->srcStart, 0);
1128 DISPLAYLEVEL(3, " checksum: %08x\n", (unsigned)digest);
1129 MEM_writeLE32(frame->data, (U32)digest);
1130 frame->data = (BYTE*)frame->data + 4;
1133 static void outputBuffer(const void* buf, size_t size, const char* const path)
1135 /* write data out to file */
1136 const BYTE* ip = (const BYTE*)buf;
1139 out = fopen(path, "wb");
1144 fprintf(stderr, "Failed to open file at %s: ", path);
1149 { size_t fsize = size;
1151 while (written < fsize) {
1152 written += fwrite(ip + written, 1, fsize - written, out);
1154 fprintf(stderr, "Failed to write to file at %s: ", path);
1166 static void initFrame(frame_t* fr)
1168 memset(fr, 0, sizeof(*fr));
1169 fr->data = fr->dataStart = FRAME_BUFFER;
1170 fr->dataEnd = FRAME_BUFFER + sizeof(FRAME_BUFFER);
1171 fr->src = fr->srcStart = CONTENT_BUFFER;
1172 fr->srcEnd = CONTENT_BUFFER + sizeof(CONTENT_BUFFER);
1174 /* init repeat codes */
1175 fr->stats.rep[0] = 1;
1176 fr->stats.rep[1] = 4;
1177 fr->stats.rep[2] = 8;
1181 * Generated a single zstd compressed block with no block/frame header.
1182 * Returns the final seed.
1184 static U32 generateCompressedBlock(U32 seed, frame_t* frame, dictInfo info)
1186 size_t blockContentSize;
1187 int blockWritten = 0;
1189 DISPLAYLEVEL(4, "block seed: %u\n", (unsigned)seed);
1191 op = (BYTE*)frame->data;
1193 while (!blockWritten) {
1195 /* generate window size */
1196 { int const exponent = RAND(&seed) % (MAX_WINDOW_LOG - 10);
1197 int const mantissa = RAND(&seed) % 8;
1198 frame->header.windowSize = (1U << (exponent + 10));
1199 frame->header.windowSize += (frame->header.windowSize / 8) * mantissa;
1202 /* generate content size */
1203 { size_t const maxBlockSize = MIN(g_maxBlockSize, frame->header.windowSize);
1204 if (RAND(&seed) & 15) {
1205 /* some full size blocks */
1206 blockContentSize = maxBlockSize;
1207 } else if (RAND(&seed) & 7 && g_maxBlockSize >= (1U << 7)) {
1208 /* some small blocks <= 128 bytes*/
1209 blockContentSize = RAND(&seed) % (1U << 7);
1211 /* some variable size blocks */
1212 blockContentSize = RAND(&seed) % maxBlockSize;
1216 /* try generating a compressed block */
1217 frame->oldStats = frame->stats;
1219 cSize = writeCompressedBlock(&seed, frame, blockContentSize, info);
1220 if (cSize >= blockContentSize) { /* compressed size must be strictly smaller than decompressed size : https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#blocks */
1221 /* data doesn't compress -- try again */
1222 frame->stats = frame->oldStats; /* don't update the stats */
1223 DISPLAYLEVEL(5, " can't compress block : try again \n");
1226 DISPLAYLEVEL(4, " block size: %u \n", (unsigned)cSize);
1227 frame->src = (BYTE*)frame->src + blockContentSize;
1233 /* Return the final seed */
1234 static U32 generateFrame(U32 seed, frame_t* fr, dictInfo info)
1236 /* generate a complete frame */
1237 DISPLAYLEVEL(3, "frame seed: %u\n", (unsigned)seed);
1240 writeFrameHeader(&seed, fr, info);
1241 writeBlocks(&seed, fr, info);
1247 /*_*******************************************************
1248 * Dictionary Helper Functions
1249 *********************************************************/
1250 /* returns 0 if successful, otherwise returns 1 upon error */
1251 static int genRandomDict(U32 dictID, U32 seed, size_t dictSize, BYTE* fullDict)
1253 /* allocate space for samples */
1255 unsigned const numSamples = 4;
1256 size_t sampleSizes[4];
1257 BYTE* const samples = malloc(5000*sizeof(BYTE));
1258 if (samples == NULL) {
1259 DISPLAY("Error: could not allocate space for samples\n");
1263 /* generate samples */
1264 { unsigned literalValue = 1;
1265 unsigned samplesPos = 0;
1266 size_t currSize = 1;
1267 while (literalValue <= 4) {
1268 sampleSizes[literalValue - 1] = currSize;
1270 for (k = 0; k < currSize; k++) {
1271 *(samples + (samplesPos++)) = (BYTE)literalValue;
1277 { size_t dictWriteSize = 0;
1278 ZDICT_params_t zdictParams;
1279 size_t const headerSize = MAX(dictSize/4, 256);
1280 size_t const dictContentSize = dictSize - headerSize;
1281 BYTE* const dictContent = fullDict + headerSize;
1282 if (dictContentSize < ZDICT_CONTENTSIZE_MIN || dictSize < ZDICT_DICTSIZE_MIN) {
1283 DISPLAY("Error: dictionary size is too small\n");
1285 goto exitGenRandomDict;
1288 /* init dictionary params */
1289 memset(&zdictParams, 0, sizeof(zdictParams));
1290 zdictParams.dictID = dictID;
1291 zdictParams.notificationLevel = 1;
1293 /* fill in dictionary content */
1294 RAND_buffer(&seed, (void*)dictContent, dictContentSize);
1296 /* finalize dictionary with random samples */
1297 dictWriteSize = ZDICT_finalizeDictionary(fullDict, dictSize,
1298 dictContent, dictContentSize,
1299 samples, sampleSizes, numSamples,
1302 if (ZDICT_isError(dictWriteSize)) {
1303 DISPLAY("Could not finalize dictionary: %s\n", ZDICT_getErrorName(dictWriteSize));
1313 static dictInfo initDictInfo(int useDict, size_t dictContentSize, BYTE* dictContent, U32 dictID){
1314 /* allocate space statically */
1316 memset(&dictOp, 0, sizeof(dictOp));
1317 dictOp.useDict = useDict;
1318 dictOp.dictContentSize = dictContentSize;
1319 dictOp.dictContent = dictContent;
1320 dictOp.dictID = dictID;
1324 /*-*******************************************************
1326 *********************************************************/
1328 BYTE DECOMPRESSED_BUFFER[MAX_DECOMPRESSED_SIZE];
1330 static size_t testDecodeSimple(frame_t* fr)
1332 /* test decoding the generated data with the simple API */
1333 size_t const ret = ZSTD_decompress(DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
1334 fr->dataStart, (BYTE*)fr->data - (BYTE*)fr->dataStart);
1336 if (ZSTD_isError(ret)) return ret;
1338 if (memcmp(DECOMPRESSED_BUFFER, fr->srcStart,
1339 (BYTE*)fr->src - (BYTE*)fr->srcStart) != 0) {
1340 return ERROR(corruption_detected);
1346 static size_t testDecodeStreaming(frame_t* fr)
1348 /* test decoding the generated data with the streaming API */
1349 ZSTD_DStream* zd = ZSTD_createDStream();
1354 if (!zd) return ERROR(memory_allocation);
1356 in.src = fr->dataStart;
1358 in.size = (BYTE*)fr->data - (BYTE*)fr->dataStart;
1360 out.dst = DECOMPRESSED_BUFFER;
1362 out.size = ZSTD_DStreamOutSize();
1364 ZSTD_initDStream(zd);
1366 ret = ZSTD_decompressStream(zd, &out, &in);
1367 if (ZSTD_isError(ret)) goto cleanup; /* error */
1368 if (ret == 0) break; /* frame is done */
1370 /* force decoding to be done in chunks */
1371 out.size += MIN(ZSTD_DStreamOutSize(), MAX_DECOMPRESSED_SIZE - out.size);
1376 if (memcmp(out.dst, fr->srcStart, out.pos) != 0) {
1377 return ERROR(corruption_detected);
1381 ZSTD_freeDStream(zd);
1385 static size_t testDecodeWithDict(U32 seed, genType_e genType)
1387 /* create variables */
1388 size_t const dictSize = RAND(&seed) % (10 << 20) + ZDICT_DICTSIZE_MIN + ZDICT_CONTENTSIZE_MIN;
1389 U32 const dictID = RAND(&seed);
1390 size_t errorDetected = 0;
1391 BYTE* const fullDict = malloc(dictSize);
1392 if (fullDict == NULL) {
1393 return ERROR(GENERIC);
1396 /* generate random dictionary */
1397 if (genRandomDict(dictID, seed, dictSize, fullDict)) { /* return 0 on success */
1398 errorDetected = ERROR(GENERIC);
1399 goto dictTestCleanup;
1405 ZSTD_DCtx* const dctx = ZSTD_createDCtx();
1409 { size_t const headerSize = MAX(dictSize/4, 256);
1410 size_t const dictContentSize = dictSize-headerSize;
1411 BYTE* const dictContent = fullDict+headerSize;
1412 info = initDictInfo(1, dictContentSize, dictContent, dictID);
1415 /* manually decompress and check difference */
1416 if (genType == gt_frame) {
1418 generateFrame(seed, &fr, info);
1419 ret = ZSTD_decompress_usingDict(dctx, DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
1420 fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart,
1421 fullDict, dictSize);
1424 generateCompressedBlock(seed, &fr, info);
1425 ret = ZSTD_decompressBegin_usingDict(dctx, fullDict, dictSize);
1426 if (ZSTD_isError(ret)) {
1427 errorDetected = ret;
1428 ZSTD_freeDCtx(dctx);
1429 goto dictTestCleanup;
1431 ret = ZSTD_decompressBlock(dctx, DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
1432 fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart);
1434 ZSTD_freeDCtx(dctx);
1436 if (ZSTD_isError(ret)) {
1437 errorDetected = ret;
1438 goto dictTestCleanup;
1441 if (memcmp(DECOMPRESSED_BUFFER, fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart) != 0) {
1442 errorDetected = ERROR(corruption_detected);
1443 goto dictTestCleanup;
1449 return errorDetected;
1452 static size_t testDecodeRawBlock(frame_t* fr)
1454 ZSTD_DCtx* dctx = ZSTD_createDCtx();
1455 size_t ret = ZSTD_decompressBegin(dctx);
1456 if (ZSTD_isError(ret)) return ret;
1458 ret = ZSTD_decompressBlock(
1460 DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
1461 fr->dataStart, (BYTE*)fr->data - (BYTE*)fr->dataStart);
1462 ZSTD_freeDCtx(dctx);
1463 if (ZSTD_isError(ret)) return ret;
1465 if (memcmp(DECOMPRESSED_BUFFER, fr->srcStart,
1466 (BYTE*)fr->src - (BYTE*)fr->srcStart) != 0) {
1467 return ERROR(corruption_detected);
1473 static int runBlockTest(U32* seed)
1476 U32 const seedCopy = *seed;
1477 { dictInfo const info = initDictInfo(0, 0, NULL, 0);
1478 *seed = generateCompressedBlock(*seed, &fr, info);
1481 { size_t const r = testDecodeRawBlock(&fr);
1482 if (ZSTD_isError(r)) {
1483 DISPLAY("Error in block mode on test seed %u: %s\n",
1484 (unsigned)seedCopy, ZSTD_getErrorName(r));
1489 { size_t const r = testDecodeWithDict(*seed, gt_block);
1490 if (ZSTD_isError(r)) {
1491 DISPLAY("Error in block mode with dictionary on test seed %u: %s\n",
1492 (unsigned)seedCopy, ZSTD_getErrorName(r));
1499 static int runFrameTest(U32* seed)
1502 U32 const seedCopy = *seed;
1503 { dictInfo const info = initDictInfo(0, 0, NULL, 0);
1504 *seed = generateFrame(*seed, &fr, info);
1507 { size_t const r = testDecodeSimple(&fr);
1508 if (ZSTD_isError(r)) {
1509 DISPLAY("Error in simple mode on test seed %u: %s\n",
1510 (unsigned)seedCopy, ZSTD_getErrorName(r));
1514 { size_t const r = testDecodeStreaming(&fr);
1515 if (ZSTD_isError(r)) {
1516 DISPLAY("Error in streaming mode on test seed %u: %s\n",
1517 (unsigned)seedCopy, ZSTD_getErrorName(r));
1521 { size_t const r = testDecodeWithDict(*seed, gt_frame); /* avoid big dictionaries */
1522 if (ZSTD_isError(r)) {
1523 DISPLAY("Error in dictionary mode on test seed %u: %s\n",
1524 (unsigned)seedCopy, ZSTD_getErrorName(r));
1531 static int runTestMode(U32 seed, unsigned numFiles, unsigned const testDurationS,
1536 UTIL_time_t const startClock = UTIL_getTime();
1537 U64 const maxClockSpan = testDurationS * SEC_TO_MICRO;
1539 if (numFiles == 0 && !testDurationS) numFiles = 1;
1541 DISPLAY("seed: %u\n", (unsigned)seed);
1543 for (fnum = 0; fnum < numFiles || UTIL_clockSpanMicro(startClock) < maxClockSpan; fnum++) {
1544 if (fnum < numFiles)
1545 DISPLAYUPDATE("\r%u/%u ", fnum, numFiles);
1547 DISPLAYUPDATE("\r%u ", fnum);
1549 { int const ret = (genType == gt_frame) ?
1550 runFrameTest(&seed) :
1551 runBlockTest(&seed);
1552 if (ret) return ret;
1556 DISPLAY("\r%u tests completed: ", fnum);
1562 /*-*******************************************************
1564 *********************************************************/
1566 static int generateFile(U32 seed, const char* const path,
1567 const char* const origPath, genType_e genType)
1571 DISPLAY("seed: %u\n", (unsigned)seed);
1573 { dictInfo const info = initDictInfo(0, 0, NULL, 0);
1574 if (genType == gt_frame) {
1575 generateFrame(seed, &fr, info);
1577 generateCompressedBlock(seed, &fr, info);
1580 outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, path);
1582 outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, origPath);
1587 static int generateCorpus(U32 seed, unsigned numFiles, const char* const path,
1588 const char* const origPath, genType_e genType)
1590 char outPath[MAX_PATH];
1593 DISPLAY("seed: %u\n", (unsigned)seed);
1595 for (fnum = 0; fnum < numFiles; fnum++) {
1598 DISPLAYUPDATE("\r%u/%u ", fnum, numFiles);
1600 { dictInfo const info = initDictInfo(0, 0, NULL, 0);
1601 if (genType == gt_frame) {
1602 seed = generateFrame(seed, &fr, info);
1604 seed = generateCompressedBlock(seed, &fr, info);
1608 if (snprintf(outPath, MAX_PATH, "%s/z%06u.zst", path, fnum) + 1 > MAX_PATH) {
1609 DISPLAY("Error: path too long\n");
1612 outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, outPath);
1615 if (snprintf(outPath, MAX_PATH, "%s/z%06u", origPath, fnum) + 1 > MAX_PATH) {
1616 DISPLAY("Error: path too long\n");
1619 outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, outPath);
1623 DISPLAY("\r%u/%u \n", fnum, numFiles);
1628 static int generateCorpusWithDict(U32 seed, unsigned numFiles, const char* const path,
1629 const char* const origPath, const size_t dictSize,
1632 char outPath[MAX_PATH];
1634 U32 const dictID = RAND(&seed);
1635 int errorDetected = 0;
1637 if (snprintf(outPath, MAX_PATH, "%s/dictionary", path) + 1 > MAX_PATH) {
1638 DISPLAY("Error: path too long\n");
1642 /* allocate space for the dictionary */
1643 fullDict = malloc(dictSize);
1644 if (fullDict == NULL) {
1645 DISPLAY("Error: could not allocate space for full dictionary.\n");
1649 /* randomly generate the dictionary */
1650 { int const ret = genRandomDict(dictID, seed, dictSize, fullDict);
1652 errorDetected = ret;
1657 /* write out dictionary */
1658 if (numFiles != 0) {
1659 if (snprintf(outPath, MAX_PATH, "%s/dictionary", path) + 1 > MAX_PATH) {
1660 DISPLAY("Error: dictionary path too long\n");
1664 outputBuffer(fullDict, dictSize, outPath);
1667 outputBuffer(fullDict, dictSize, "dictionary");
1670 /* generate random compressed/decompressed files */
1672 for (fnum = 0; fnum < MAX(numFiles, 1); fnum++) {
1674 DISPLAYUPDATE("\r%u/%u ", fnum, numFiles);
1676 size_t const headerSize = MAX(dictSize/4, 256);
1677 size_t const dictContentSize = dictSize-headerSize;
1678 BYTE* const dictContent = fullDict+headerSize;
1679 dictInfo const info = initDictInfo(1, dictContentSize, dictContent, dictID);
1680 if (genType == gt_frame) {
1681 seed = generateFrame(seed, &fr, info);
1683 seed = generateCompressedBlock(seed, &fr, info);
1687 if (numFiles != 0) {
1688 if (snprintf(outPath, MAX_PATH, "%s/z%06u.zst", path, fnum) + 1 > MAX_PATH) {
1689 DISPLAY("Error: path too long\n");
1693 outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, outPath);
1696 if (snprintf(outPath, MAX_PATH, "%s/z%06u", origPath, fnum) + 1 > MAX_PATH) {
1697 DISPLAY("Error: path too long\n");
1701 outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, outPath);
1705 outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, path);
1707 outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, origPath);
1715 return errorDetected;
1719 /*_*******************************************************
1721 *********************************************************/
1722 static U32 makeSeed(void)
1724 U32 t = (U32) time(NULL);
1725 return XXH32(&t, sizeof(t), 0) % 65536;
/* Parses a decimal unsigned integer from *argument, advancing the pointer
 * past the digits consumed. Returns 0 when no digit is present.
 * Note: result can overflow on very long digit strings. */
static unsigned readInt(const char** argument)
{
    unsigned val = 0;
    for (; **argument >= '0' && **argument <= '9'; (*argument)++) {
        val = val * 10 + (unsigned)(**argument - '0');
    }
    return val;
}
1739 static void usage(const char* programName)
1741 DISPLAY( "Usage :\n");
1742 DISPLAY( " %s [args]\n", programName);
1744 DISPLAY( "Arguments :\n");
1745 DISPLAY( " -p<path> : select output path (default:stdout)\n");
1746 DISPLAY( " in multiple files mode this should be a directory\n");
1747 DISPLAY( " -o<path> : select path to output original file (default:no output)\n");
1748 DISPLAY( " in multiple files mode this should be a directory\n");
1749 DISPLAY( " -s# : select seed (default:random based on time)\n");
1750 DISPLAY( " -n# : number of files to generate (default:1)\n");
1751 DISPLAY( " -t : activate test mode (test files against libzstd instead of outputting them)\n");
1752 DISPLAY( " -T# : length of time to run tests for\n");
1753 DISPLAY( " -v : increase verbosity level (default:0, max:7)\n");
1754 DISPLAY( " -h/H : display help/long help and exit\n");
1757 static void advancedUsage(const char* programName)
1761 DISPLAY( "Advanced arguments :\n");
1762 DISPLAY( " --content-size : always include the content size in the frame header\n");
1763 DISPLAY( " --use-dict=# : include a dictionary used to decompress the corpus\n");
1764 DISPLAY( " --gen-blocks : generate raw compressed blocks without block/frame headers\n");
1765 DISPLAY( " --max-block-size-log=# : max block size log, must be in range [2, 17]\n");
1766 DISPLAY( " --max-content-size-log=# : max content size log, must be <= 20\n");
1767 DISPLAY( " (this is ignored with gen-blocks)\n");
/*! readU32FromChar() :
    @return : unsigned integer value read from input in `char` format
              allows and interprets K, KB, KiB, M, MB and MiB suffix.
              Will also modify `*stringPtr`, advancing it to position where it stopped reading.
    Note : function result can overflow if digit string > MAX_UINT */
static unsigned readU32FromChar(const char** stringPtr)
{
    unsigned value = 0;

    /* decimal digits */
    while ((**stringPtr >= '0') && (**stringPtr <= '9')) {
        value = value * 10 + (unsigned)(**stringPtr - '0');
        (*stringPtr)++;
    }

    /* optional K/M multiplier, with optional 'i' and 'B' trailers */
    if ((**stringPtr == 'K') || (**stringPtr == 'M')) {
        value <<= 10;                           /* K: x1024 */
        if (**stringPtr == 'M') value <<= 10;   /* M: x1024 again */
        (*stringPtr)++;
        if (**stringPtr == 'i') (*stringPtr)++;
        if (**stringPtr == 'B') (*stringPtr)++;
    }
    return value;
}
/** longCommandWArg() :
 *  check if *stringPtr is the same as longCommand.
 *  If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
 *  @return 0 and doesn't modify *stringPtr otherwise.
 */
static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
{
    size_t const prefixLen = strlen(longCommand);
    if (strncmp(*stringPtr, longCommand, prefixLen) != 0) {
        return 0;
    }
    *stringPtr += prefixLen;
    return 1;
}
1803 int main(int argc, char** argv)
1807 unsigned numFiles = 0;
1808 unsigned testDuration = 0;
1810 const char* path = NULL;
1811 const char* origPath = NULL;
1813 unsigned dictSize = (10 << 10); /* 10 kB default */
1814 genType_e genType = gt_frame;
1818 /* Check command line */
1819 for (argNb=1; argNb<argc; argNb++) {
1820 const char* argument = argv[argNb];
1821 if(!argument) continue; /* Protection if argument empty */
1823 /* Handle commands. Aggregated commands are allowed */
1824 if (argument[0]=='-') {
1826 while (*argument!=0) {
1833 advancedUsage(argv[0]);
1842 seed = readInt(&argument);
1846 numFiles = readInt(&argument);
1850 testDuration = readInt(&argument);
1851 if (*argument == 'm') {
1854 if (*argument == 'n') argument++;
1859 origPath = argument;
1860 argument += strlen(argument);
1865 argument += strlen(argument);
1873 if (strcmp(argument, "content-size") == 0) {
1874 opts.contentSize = 1;
1875 } else if (longCommandWArg(&argument, "use-dict=")) {
1876 dictSize = readU32FromChar(&argument);
1878 } else if (strcmp(argument, "gen-blocks") == 0) {
1880 } else if (longCommandWArg(&argument, "max-block-size-log=")) {
1881 U32 value = readU32FromChar(&argument);
1882 if (value >= 2 && value <= ZSTD_BLOCKSIZE_MAX) {
1883 g_maxBlockSize = 1U << value;
1885 } else if (longCommandWArg(&argument, "max-content-size-log=")) {
1886 U32 value = readU32FromChar(&argument);
1887 g_maxDecompressedSizeLog =
1888 MIN(MAX_DECOMPRESSED_SIZE_LOG, value);
1890 advancedUsage(argv[0]);
1893 argument += strlen(argument);
1898 } } } } /* for (argNb=1; argNb<argc; argNb++) */
1905 return runTestMode(seed, numFiles, testDuration, genType);
1908 DISPLAY("Error: -T requires test mode (-t)\n\n");
1915 DISPLAY("Error: path is required in file generation mode\n");
1920 if (numFiles == 0 && useDict == 0) {
1921 return generateFile(seed, path, origPath, genType);
1922 } else if (useDict == 0){
1923 return generateCorpus(seed, numFiles, path, origPath, genType);
1925 /* should generate files with a dictionary */
1926 return generateCorpusWithDict(seed, numFiles, path, origPath, dictSize, genType);