| 1 | /* |
| 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
| 3 | * All rights reserved. |
| 4 | * |
| 5 | * This source code is licensed under both the BSD-style license (found in the |
| 6 | * LICENSE file in the root directory of this source tree) and the GPLv2 (found |
| 7 | * in the COPYING file in the root directory of this source tree). |
| 8 | * You may select, at your option, one of the above-listed licenses. |
| 9 | */ |
| 10 | |
| 11 | /** |
| 12 | * This fuzz target performs a zstd round-trip test by generating an arbitrary |
| 13 | * array of sequences, generating the associated source buffer, calling |
| 14 | * ZSTD_compressSequences(), and then decompresses and compares the result with |
| 15 | * the original generated source buffer. |
| 16 | */ |
| 17 | |
| 18 | #define ZSTD_STATIC_LINKING_ONLY |
| 19 | |
| 20 | #include <stddef.h> |
| 21 | #include <stdlib.h> |
| 22 | #include <stdio.h> |
| 23 | #include <string.h> |
| 24 | #include <time.h> |
| 25 | #include "fuzz_helpers.h" |
| 26 | #include "zstd_helpers.h" |
| 27 | #include "fuzz_data_producer.h" |
| 28 | #include "fuzz_third_party_seq_prod.h" |
| 29 | |
| 30 | static ZSTD_CCtx* cctx = NULL; |
| 31 | static ZSTD_DCtx* dctx = NULL; |
| 32 | static void* literalsBuffer = NULL; |
| 33 | static void* generatedSrc = NULL; |
| 34 | static ZSTD_Sequence* generatedSequences = NULL; |
| 35 | |
| 36 | static void* dictBuffer = NULL; |
| 37 | static ZSTD_CDict* cdict = NULL; |
| 38 | static ZSTD_DDict* ddict = NULL; |
| 39 | |
| 40 | #define ZSTD_FUZZ_GENERATED_SRC_MAXSIZE (1 << 20) /* Allow up to 1MB generated data */ |
| 41 | #define ZSTD_FUZZ_GENERATED_LITERALS_SIZE (1 << 20) /* Fixed size 1MB literals buffer */ |
| 42 | #define ZSTD_FUZZ_MATCHLENGTH_MAXSIZE (1 << 18) /* Allow up to 256KB matches */ |
| 43 | #define ZSTD_FUZZ_GENERATED_DICT_MAXSIZE (1 << ZSTD_WINDOWLOG_MAX_32) /* Allow up to 1 << ZSTD_WINDOWLOG_MAX_32 dictionary */ |
| 44 | #define ZSTD_FUZZ_MAX_NBSEQ (1 << 17) /* Maximum of 128K sequences */ |
| 45 | |
| 46 | /* Deterministic random number generator */ |
| 47 | #define FUZZ_RDG_rotl32(x,r) ((x << r) | (x >> (32 - r))) |
| 48 | static uint32_t FUZZ_RDG_rand(uint32_t* src) |
| 49 | { |
| 50 | static const uint32_t prime1 = 2654435761U; |
| 51 | static const uint32_t prime2 = 2246822519U; |
| 52 | uint32_t rand32 = *src; |
| 53 | rand32 *= prime1; |
| 54 | rand32 ^= prime2; |
| 55 | rand32 = FUZZ_RDG_rotl32(rand32, 13); |
| 56 | *src = rand32; |
| 57 | return rand32 >> 5; |
| 58 | } |
| 59 | |
| 60 | /* Make a pseudorandom string - this simple function exists to avoid |
| 61 | * taking a dependency on datagen.h to have RDG_genBuffer(). |
| 62 | */ |
| 63 | static char* generatePseudoRandomString(char* str, size_t size, FUZZ_dataProducer_t* producer) { |
| 64 | const char charset[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJK1234567890!@#$^&*()_"; |
| 65 | uint32_t seed = FUZZ_dataProducer_uint32(producer); |
| 66 | if (size) { |
| 67 | for (size_t n = 0; n < size; n++) { |
| 68 | int key = FUZZ_RDG_rand(&seed) % (int) (sizeof charset - 1); |
| 69 | str[n] = charset[key]; |
| 70 | } |
| 71 | } |
| 72 | return str; |
| 73 | } |
| 74 | |
| 75 | /* Returns size of source buffer */ |
| 76 | static size_t decodeSequences(void* dst, size_t nbSequences, |
| 77 | size_t literalsSize, |
| 78 | const void* dict, size_t dictSize, |
| 79 | ZSTD_sequenceFormat_e mode) |
| 80 | { |
| 81 | const uint8_t* litPtr = literalsBuffer; |
| 82 | const uint8_t* const litBegin = literalsBuffer; |
| 83 | const uint8_t* const litEnd = litBegin + literalsSize; |
| 84 | const uint8_t* dictPtr = dict; |
| 85 | uint8_t* op = dst; |
| 86 | const uint8_t* const oend = (uint8_t*)dst + ZSTD_FUZZ_GENERATED_SRC_MAXSIZE; |
| 87 | size_t generatedSrcBufferSize = 0; |
| 88 | size_t bytesWritten = 0; |
| 89 | |
| 90 | for (size_t i = 0; i < nbSequences; ++i) { |
| 91 | /* block boundary */ |
| 92 | if (generatedSequences[i].offset == 0) |
| 93 | FUZZ_ASSERT(generatedSequences[i].matchLength == 0); |
| 94 | |
| 95 | if (litPtr + generatedSequences[i].litLength > litEnd) { |
| 96 | litPtr = litBegin; |
| 97 | } |
| 98 | memcpy(op, litPtr, generatedSequences[i].litLength); |
| 99 | bytesWritten += generatedSequences[i].litLength; |
| 100 | op += generatedSequences[i].litLength; |
| 101 | litPtr += generatedSequences[i].litLength; |
| 102 | |
| 103 | /* Copy over the match */ |
| 104 | { size_t matchLength = generatedSequences[i].matchLength; |
| 105 | size_t j = 0; |
| 106 | size_t k = 0; |
| 107 | if (dictSize != 0) { |
| 108 | if (generatedSequences[i].offset > bytesWritten) { /* Offset goes into the dictionary */ |
| 109 | size_t dictOffset = generatedSequences[i].offset - bytesWritten; |
| 110 | size_t matchInDict = MIN(matchLength, dictOffset); |
| 111 | for (; k < matchInDict; ++k) { |
| 112 | op[k] = dictPtr[dictSize - dictOffset + k]; |
| 113 | } |
| 114 | matchLength -= matchInDict; |
| 115 | op += matchInDict; |
| 116 | } |
| 117 | } |
| 118 | for (; j < matchLength; ++j) { |
| 119 | op[j] = op[j - generatedSequences[i].offset]; |
| 120 | } |
| 121 | op += j; |
| 122 | FUZZ_ASSERT(generatedSequences[i].matchLength == j + k); |
| 123 | bytesWritten += generatedSequences[i].matchLength; |
| 124 | } |
| 125 | } |
| 126 | generatedSrcBufferSize = bytesWritten; |
| 127 | FUZZ_ASSERT(litPtr <= litEnd); |
| 128 | if (mode == ZSTD_sf_noBlockDelimiters) { |
| 129 | const uint32_t lastLLSize = (uint32_t)(litEnd - litPtr); |
| 130 | if (lastLLSize <= oend - op) { |
| 131 | memcpy(op, litPtr, lastLLSize); |
| 132 | generatedSrcBufferSize += lastLLSize; |
| 133 | } } |
| 134 | return generatedSrcBufferSize; |
| 135 | } |
| 136 | |
| 137 | /* Returns nb sequences generated |
| 138 | * Note : random sequences are always valid in ZSTD_sf_noBlockDelimiters mode. |
| 139 | * However, it can fail with ZSTD_sf_explicitBlockDelimiters, |
| 140 | * due to potential lack of space in |
| 141 | */ |
| 142 | static size_t generateRandomSequences(FUZZ_dataProducer_t* producer, |
| 143 | size_t literalsSizeLimit, size_t dictSize, |
| 144 | size_t windowLog, ZSTD_sequenceFormat_e mode) |
| 145 | { |
| 146 | const uint32_t repCode = 0; /* not used by sequence ingestion api */ |
| 147 | size_t windowSize = 1ULL << windowLog; |
| 148 | size_t blockSizeMax = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); |
| 149 | uint32_t matchLengthMax = ZSTD_FUZZ_MATCHLENGTH_MAXSIZE; |
| 150 | uint32_t bytesGenerated = 0; |
| 151 | uint32_t nbSeqGenerated = 0; |
| 152 | uint32_t isFirstSequence = 1; |
| 153 | uint32_t blockSize = 0; |
| 154 | |
| 155 | if (mode == ZSTD_sf_explicitBlockDelimiters) { |
| 156 | /* ensure that no sequence can be larger than one block */ |
| 157 | literalsSizeLimit = MIN(literalsSizeLimit, blockSizeMax/2); |
| 158 | matchLengthMax = MIN(matchLengthMax, blockSizeMax/2); |
| 159 | } |
| 160 | |
| 161 | while ( nbSeqGenerated < ZSTD_FUZZ_MAX_NBSEQ - 3 /* extra room for explicit delimiters */ |
| 162 | && bytesGenerated < ZSTD_FUZZ_GENERATED_SRC_MAXSIZE |
| 163 | && !FUZZ_dataProducer_empty(producer)) { |
| 164 | uint32_t matchLength; |
| 165 | uint32_t matchBound = matchLengthMax; |
| 166 | uint32_t offset; |
| 167 | uint32_t offsetBound; |
| 168 | const uint32_t minLitLength = (isFirstSequence && (dictSize == 0)); |
| 169 | const uint32_t litLength = FUZZ_dataProducer_uint32Range(producer, minLitLength, (uint32_t)literalsSizeLimit); |
| 170 | bytesGenerated += litLength; |
| 171 | if (bytesGenerated > ZSTD_FUZZ_GENERATED_SRC_MAXSIZE) { |
| 172 | break; |
| 173 | } |
| 174 | offsetBound = (bytesGenerated > windowSize) ? windowSize : bytesGenerated + (uint32_t)dictSize; |
| 175 | offset = FUZZ_dataProducer_uint32Range(producer, 1, offsetBound); |
| 176 | if (dictSize > 0 && bytesGenerated <= windowSize) { |
| 177 | /* Prevent match length from being such that it would be associated with an offset too large |
| 178 | * from the decoder's perspective. If not possible (match would be too small), |
| 179 | * then reduce the offset if necessary. |
| 180 | */ |
| 181 | const size_t bytesToReachWindowSize = windowSize - bytesGenerated; |
| 182 | if (bytesToReachWindowSize < ZSTD_MINMATCH_MIN) { |
| 183 | const uint32_t newOffsetBound = offsetBound > windowSize ? windowSize : offsetBound; |
| 184 | offset = FUZZ_dataProducer_uint32Range(producer, 1, newOffsetBound); |
| 185 | } else { |
| 186 | matchBound = MIN(matchLengthMax, (uint32_t)bytesToReachWindowSize); |
| 187 | } |
| 188 | } |
| 189 | matchLength = FUZZ_dataProducer_uint32Range(producer, ZSTD_MINMATCH_MIN, matchBound); |
| 190 | bytesGenerated += matchLength; |
| 191 | if (bytesGenerated > ZSTD_FUZZ_GENERATED_SRC_MAXSIZE) { |
| 192 | break; |
| 193 | } |
| 194 | { ZSTD_Sequence seq = {offset, litLength, matchLength, repCode}; |
| 195 | const uint32_t lastLits = FUZZ_dataProducer_uint32Range(producer, 0, litLength); |
| 196 | #define SPLITPROB 6000 |
| 197 | #define SPLITMARK 5234 |
| 198 | const int split = (FUZZ_dataProducer_uint32Range(producer, 0, SPLITPROB) == SPLITMARK); |
| 199 | if (mode == ZSTD_sf_explicitBlockDelimiters) { |
| 200 | const size_t seqSize = seq.litLength + seq.matchLength; |
| 201 | if (blockSize + seqSize > blockSizeMax) { /* reaching limit : must end block now */ |
| 202 | const ZSTD_Sequence endBlock = {0, 0, 0, 0}; |
| 203 | generatedSequences[nbSeqGenerated++] = endBlock; |
| 204 | blockSize = seqSize; |
| 205 | } |
| 206 | if (split) { |
| 207 | const ZSTD_Sequence endBlock = {0, lastLits, 0, 0}; |
| 208 | generatedSequences[nbSeqGenerated++] = endBlock; |
| 209 | assert(lastLits <= seq.litLength); |
| 210 | seq.litLength -= lastLits; |
| 211 | blockSize = seqSize - lastLits; |
| 212 | } else { |
| 213 | blockSize += seqSize; |
| 214 | } |
| 215 | } |
| 216 | generatedSequences[nbSeqGenerated++] = seq; |
| 217 | isFirstSequence = 0; |
| 218 | } |
| 219 | } |
| 220 | |
| 221 | if (mode == ZSTD_sf_explicitBlockDelimiters) { |
| 222 | /* always end sequences with a block delimiter */ |
| 223 | const ZSTD_Sequence endBlock = {0, 0, 0, 0}; |
| 224 | assert(nbSeqGenerated < ZSTD_FUZZ_MAX_NBSEQ); |
| 225 | generatedSequences[nbSeqGenerated++] = endBlock; |
| 226 | } |
| 227 | return nbSeqGenerated; |
| 228 | } |
| 229 | |
| 230 | static size_t roundTripTest(void* result, size_t resultCapacity, |
| 231 | void* compressed, size_t compressedCapacity, |
| 232 | const void* src, size_t srcSize, |
| 233 | const ZSTD_Sequence* seqs, size_t seqSize, |
| 234 | unsigned hasDict, |
| 235 | ZSTD_sequenceFormat_e mode) |
| 236 | { |
| 237 | size_t cSize; |
| 238 | size_t dSize; |
| 239 | |
| 240 | if (hasDict) { |
| 241 | FUZZ_ZASSERT(ZSTD_CCtx_refCDict(cctx, cdict)); |
| 242 | FUZZ_ZASSERT(ZSTD_DCtx_refDDict(dctx, ddict)); |
| 243 | } |
| 244 | |
| 245 | cSize = ZSTD_compressSequences(cctx, compressed, compressedCapacity, |
| 246 | seqs, seqSize, |
| 247 | src, srcSize); |
| 248 | if ( (ZSTD_getErrorCode(cSize) == ZSTD_error_dstSize_tooSmall) |
| 249 | && (mode == ZSTD_sf_explicitBlockDelimiters) ) { |
| 250 | /* Valid scenario : in explicit delimiter mode, |
| 251 | * it might be possible for the compressed size to outgrow dstCapacity. |
| 252 | * In which case, it's still a valid fuzzer scenario, |
| 253 | * but no roundtrip shall be possible */ |
| 254 | return 0; |
| 255 | } |
| 256 | /* round-trip */ |
| 257 | FUZZ_ZASSERT(cSize); |
| 258 | dSize = ZSTD_decompressDCtx(dctx, result, resultCapacity, compressed, cSize); |
| 259 | FUZZ_ZASSERT(dSize); |
| 260 | FUZZ_ASSERT_MSG(dSize == srcSize, "Incorrect regenerated size"); |
| 261 | FUZZ_ASSERT_MSG(!FUZZ_memcmp(src, result, srcSize), "Corruption!"); |
| 262 | return dSize; |
| 263 | } |
| 264 | |
| 265 | int LLVMFuzzerTestOneInput(const uint8_t* src, size_t size) |
| 266 | { |
| 267 | FUZZ_SEQ_PROD_SETUP(); |
| 268 | |
| 269 | void* rBuf; |
| 270 | size_t rBufSize; |
| 271 | void* cBuf; |
| 272 | size_t cBufSize; |
| 273 | size_t generatedSrcSize; |
| 274 | size_t nbSequences; |
| 275 | size_t dictSize = 0; |
| 276 | unsigned hasDict; |
| 277 | unsigned wLog; |
| 278 | int cLevel; |
| 279 | ZSTD_sequenceFormat_e mode; |
| 280 | |
| 281 | FUZZ_dataProducer_t* const producer = FUZZ_dataProducer_create(src, size); |
| 282 | FUZZ_ASSERT(producer); |
| 283 | |
| 284 | if (!cctx) { |
| 285 | cctx = ZSTD_createCCtx(); |
| 286 | FUZZ_ASSERT(cctx); |
| 287 | } |
| 288 | if (!dctx) { |
| 289 | dctx = ZSTD_createDCtx(); |
| 290 | FUZZ_ASSERT(dctx); |
| 291 | } |
| 292 | |
| 293 | /* Generate window log first so we don't generate offsets too large */ |
| 294 | wLog = FUZZ_dataProducer_uint32Range(producer, ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MAX); |
| 295 | cLevel = FUZZ_dataProducer_int32Range(producer, -3, 22); |
| 296 | mode = (ZSTD_sequenceFormat_e)FUZZ_dataProducer_int32Range(producer, 0, 1); |
| 297 | |
| 298 | ZSTD_CCtx_reset(cctx, ZSTD_reset_session_and_parameters); |
| 299 | ZSTD_CCtx_setParameter(cctx, ZSTD_c_nbWorkers, 0); |
| 300 | ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, cLevel); |
| 301 | ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, wLog); |
| 302 | ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, ZSTD_MINMATCH_MIN); |
| 303 | ZSTD_CCtx_setParameter(cctx, ZSTD_c_validateSequences, 1); |
| 304 | ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters, mode); |
| 305 | ZSTD_CCtx_setParameter(cctx, ZSTD_c_forceAttachDict, ZSTD_dictForceAttach); |
| 306 | |
| 307 | if (!literalsBuffer) { |
| 308 | literalsBuffer = FUZZ_malloc(ZSTD_FUZZ_GENERATED_LITERALS_SIZE); |
| 309 | FUZZ_ASSERT(literalsBuffer); |
| 310 | literalsBuffer = generatePseudoRandomString(literalsBuffer, ZSTD_FUZZ_GENERATED_LITERALS_SIZE, producer); |
| 311 | } |
| 312 | |
| 313 | if (!dictBuffer) { /* Generate global dictionary buffer */ |
| 314 | ZSTD_compressionParameters cParams; |
| 315 | |
| 316 | /* Generate a large dictionary buffer */ |
| 317 | dictBuffer = calloc(ZSTD_FUZZ_GENERATED_DICT_MAXSIZE, 1); |
| 318 | FUZZ_ASSERT(dictBuffer); |
| 319 | |
| 320 | /* Create global cdict and ddict */ |
| 321 | cParams = ZSTD_getCParams(1, ZSTD_FUZZ_GENERATED_SRC_MAXSIZE, ZSTD_FUZZ_GENERATED_DICT_MAXSIZE); |
| 322 | cParams.minMatch = ZSTD_MINMATCH_MIN; |
| 323 | cParams.hashLog = ZSTD_HASHLOG_MIN; |
| 324 | cParams.chainLog = ZSTD_CHAINLOG_MIN; |
| 325 | |
| 326 | cdict = ZSTD_createCDict_advanced(dictBuffer, ZSTD_FUZZ_GENERATED_DICT_MAXSIZE, ZSTD_dlm_byRef, ZSTD_dct_rawContent, cParams, ZSTD_defaultCMem); |
| 327 | ddict = ZSTD_createDDict_advanced(dictBuffer, ZSTD_FUZZ_GENERATED_DICT_MAXSIZE, ZSTD_dlm_byRef, ZSTD_dct_rawContent, ZSTD_defaultCMem); |
| 328 | FUZZ_ASSERT(cdict); |
| 329 | FUZZ_ASSERT(ddict); |
| 330 | } |
| 331 | |
| 332 | FUZZ_ASSERT(cdict); |
| 333 | FUZZ_ASSERT(ddict); |
| 334 | |
| 335 | hasDict = FUZZ_dataProducer_uint32Range(producer, 0, 1); |
| 336 | if (hasDict) { |
| 337 | dictSize = ZSTD_FUZZ_GENERATED_DICT_MAXSIZE; |
| 338 | } |
| 339 | |
| 340 | if (!generatedSequences) { |
| 341 | generatedSequences = FUZZ_malloc(sizeof(ZSTD_Sequence)*ZSTD_FUZZ_MAX_NBSEQ); |
| 342 | } |
| 343 | if (!generatedSrc) { |
| 344 | generatedSrc = FUZZ_malloc(ZSTD_FUZZ_GENERATED_SRC_MAXSIZE); |
| 345 | } |
| 346 | |
| 347 | nbSequences = generateRandomSequences(producer, ZSTD_FUZZ_GENERATED_LITERALS_SIZE, dictSize, wLog, mode); |
| 348 | generatedSrcSize = decodeSequences(generatedSrc, nbSequences, ZSTD_FUZZ_GENERATED_LITERALS_SIZE, dictBuffer, dictSize, mode); |
| 349 | |
| 350 | /* Note : in explicit block delimiters mode, |
| 351 | * the fuzzer might generate a lot of small blocks. |
| 352 | * In which case, the final compressed size might be > ZSTD_compressBound(). |
| 353 | * This is still a valid scenario fuzzer though, which makes it possible to check under-sized dstCapacity. |
| 354 | * The test just doesn't roundtrip. */ |
| 355 | cBufSize = ZSTD_compressBound(generatedSrcSize); |
| 356 | cBuf = FUZZ_malloc(cBufSize); |
| 357 | |
| 358 | rBufSize = generatedSrcSize; |
| 359 | rBuf = FUZZ_malloc(rBufSize); |
| 360 | |
| 361 | { const size_t result = roundTripTest(rBuf, rBufSize, |
| 362 | cBuf, cBufSize, |
| 363 | generatedSrc, generatedSrcSize, |
| 364 | generatedSequences, nbSequences, |
| 365 | hasDict, mode); |
| 366 | FUZZ_ASSERT(result <= generatedSrcSize); /* can be 0 when no round-trip */ |
| 367 | } |
| 368 | |
| 369 | free(rBuf); |
| 370 | free(cBuf); |
| 371 | FUZZ_dataProducer_free(producer); |
| 372 | #ifndef STATEFUL_FUZZING |
| 373 | ZSTD_freeCCtx(cctx); cctx = NULL; |
| 374 | ZSTD_freeDCtx(dctx); dctx = NULL; |
| 375 | free(generatedSequences); generatedSequences = NULL; |
| 376 | free(generatedSrc); generatedSrc = NULL; |
| 377 | free(literalsBuffer); literalsBuffer = NULL; |
| 378 | #endif |
| 379 | FUZZ_SEQ_PROD_TEARDOWN(); |
| 380 | return 0; |
| 381 | } |