648db22b |
1 | /* |
2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | * All rights reserved. |
4 | * |
5 | * This source code is licensed under both the BSD-style license (found in the |
6 | * LICENSE file in the root directory of this source tree) and the GPLv2 (found |
7 | * in the COPYING file in the root directory of this source tree). |
8 | * You may select, at your option, one of the above-listed licenses. |
9 | */ |
10 | |
11 | /** |
12 | * This fuzz target performs a zstd round-trip test by generating an arbitrary |
13 | * array of sequences, generating the associated source buffer, calling |
14 | * ZSTD_compressSequences(), and then decompresses and compares the result with |
15 | * the original generated source buffer. |
16 | */ |
17 | |
18 | #define ZSTD_STATIC_LINKING_ONLY |
19 | |
20 | #include <stddef.h> |
21 | #include <stdlib.h> |
22 | #include <stdio.h> |
23 | #include <string.h> |
24 | #include <time.h> |
25 | #include "fuzz_helpers.h" |
26 | #include "zstd_helpers.h" |
27 | #include "fuzz_data_producer.h" |
28 | #include "fuzz_third_party_seq_prod.h" |
29 | |
30 | static ZSTD_CCtx* cctx = NULL; |
31 | static ZSTD_DCtx* dctx = NULL; |
32 | static void* literalsBuffer = NULL; |
33 | static void* generatedSrc = NULL; |
34 | static ZSTD_Sequence* generatedSequences = NULL; |
35 | |
36 | static void* dictBuffer = NULL; |
37 | static ZSTD_CDict* cdict = NULL; |
38 | static ZSTD_DDict* ddict = NULL; |
39 | |
40 | #define ZSTD_FUZZ_GENERATED_SRC_MAXSIZE (1 << 20) /* Allow up to 1MB generated data */ |
41 | #define ZSTD_FUZZ_GENERATED_LITERALS_SIZE (1 << 20) /* Fixed size 1MB literals buffer */ |
42 | #define ZSTD_FUZZ_MATCHLENGTH_MAXSIZE (1 << 18) /* Allow up to 256KB matches */ |
43 | #define ZSTD_FUZZ_GENERATED_DICT_MAXSIZE (1 << ZSTD_WINDOWLOG_MAX_32) /* Allow up to 1 << ZSTD_WINDOWLOG_MAX_32 dictionary */ |
44 | #define ZSTD_FUZZ_MAX_NBSEQ (1 << 17) /* Maximum of 128K sequences */ |
45 | |
46 | /* Deterministic random number generator */ |
47 | #define FUZZ_RDG_rotl32(x,r) ((x << r) | (x >> (32 - r))) |
48 | static uint32_t FUZZ_RDG_rand(uint32_t* src) |
49 | { |
50 | static const uint32_t prime1 = 2654435761U; |
51 | static const uint32_t prime2 = 2246822519U; |
52 | uint32_t rand32 = *src; |
53 | rand32 *= prime1; |
54 | rand32 ^= prime2; |
55 | rand32 = FUZZ_RDG_rotl32(rand32, 13); |
56 | *src = rand32; |
57 | return rand32 >> 5; |
58 | } |
59 | |
60 | /* Make a pseudorandom string - this simple function exists to avoid |
61 | * taking a dependency on datagen.h to have RDG_genBuffer(). |
62 | */ |
63 | static char* generatePseudoRandomString(char* str, size_t size, FUZZ_dataProducer_t* producer) { |
64 | const char charset[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJK1234567890!@#$^&*()_"; |
65 | uint32_t seed = FUZZ_dataProducer_uint32(producer); |
66 | if (size) { |
67 | for (size_t n = 0; n < size; n++) { |
68 | int key = FUZZ_RDG_rand(&seed) % (int) (sizeof charset - 1); |
69 | str[n] = charset[key]; |
70 | } |
71 | } |
72 | return str; |
73 | } |
74 | |
75 | /* Returns size of source buffer */ |
76 | static size_t decodeSequences(void* dst, size_t nbSequences, |
77 | size_t literalsSize, |
78 | const void* dict, size_t dictSize, |
79 | ZSTD_sequenceFormat_e mode) |
80 | { |
81 | const uint8_t* litPtr = literalsBuffer; |
82 | const uint8_t* const litBegin = literalsBuffer; |
83 | const uint8_t* const litEnd = litBegin + literalsSize; |
84 | const uint8_t* dictPtr = dict; |
85 | uint8_t* op = dst; |
86 | const uint8_t* const oend = (uint8_t*)dst + ZSTD_FUZZ_GENERATED_SRC_MAXSIZE; |
87 | size_t generatedSrcBufferSize = 0; |
88 | size_t bytesWritten = 0; |
89 | |
90 | for (size_t i = 0; i < nbSequences; ++i) { |
91 | /* block boundary */ |
92 | if (generatedSequences[i].offset == 0) |
93 | FUZZ_ASSERT(generatedSequences[i].matchLength == 0); |
94 | |
95 | if (litPtr + generatedSequences[i].litLength > litEnd) { |
96 | litPtr = litBegin; |
97 | } |
98 | memcpy(op, litPtr, generatedSequences[i].litLength); |
99 | bytesWritten += generatedSequences[i].litLength; |
100 | op += generatedSequences[i].litLength; |
101 | litPtr += generatedSequences[i].litLength; |
102 | |
103 | /* Copy over the match */ |
104 | { size_t matchLength = generatedSequences[i].matchLength; |
105 | size_t j = 0; |
106 | size_t k = 0; |
107 | if (dictSize != 0) { |
108 | if (generatedSequences[i].offset > bytesWritten) { /* Offset goes into the dictionary */ |
109 | size_t dictOffset = generatedSequences[i].offset - bytesWritten; |
110 | size_t matchInDict = MIN(matchLength, dictOffset); |
111 | for (; k < matchInDict; ++k) { |
112 | op[k] = dictPtr[dictSize - dictOffset + k]; |
113 | } |
114 | matchLength -= matchInDict; |
115 | op += matchInDict; |
116 | } |
117 | } |
118 | for (; j < matchLength; ++j) { |
119 | op[j] = op[j - generatedSequences[i].offset]; |
120 | } |
121 | op += j; |
122 | FUZZ_ASSERT(generatedSequences[i].matchLength == j + k); |
123 | bytesWritten += generatedSequences[i].matchLength; |
124 | } |
125 | } |
126 | generatedSrcBufferSize = bytesWritten; |
127 | FUZZ_ASSERT(litPtr <= litEnd); |
128 | if (mode == ZSTD_sf_noBlockDelimiters) { |
129 | const uint32_t lastLLSize = (uint32_t)(litEnd - litPtr); |
130 | if (lastLLSize <= oend - op) { |
131 | memcpy(op, litPtr, lastLLSize); |
132 | generatedSrcBufferSize += lastLLSize; |
133 | } } |
134 | return generatedSrcBufferSize; |
135 | } |
136 | |
137 | /* Returns nb sequences generated |
138 | * Note : random sequences are always valid in ZSTD_sf_noBlockDelimiters mode. |
139 | * However, it can fail with ZSTD_sf_explicitBlockDelimiters, |
140 | * due to potential lack of space in |
141 | */ |
142 | static size_t generateRandomSequences(FUZZ_dataProducer_t* producer, |
143 | size_t literalsSizeLimit, size_t dictSize, |
144 | size_t windowLog, ZSTD_sequenceFormat_e mode) |
145 | { |
146 | const uint32_t repCode = 0; /* not used by sequence ingestion api */ |
147 | size_t windowSize = 1ULL << windowLog; |
148 | size_t blockSizeMax = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); |
149 | uint32_t matchLengthMax = ZSTD_FUZZ_MATCHLENGTH_MAXSIZE; |
150 | uint32_t bytesGenerated = 0; |
151 | uint32_t nbSeqGenerated = 0; |
152 | uint32_t isFirstSequence = 1; |
153 | uint32_t blockSize = 0; |
154 | |
155 | if (mode == ZSTD_sf_explicitBlockDelimiters) { |
156 | /* ensure that no sequence can be larger than one block */ |
157 | literalsSizeLimit = MIN(literalsSizeLimit, blockSizeMax/2); |
158 | matchLengthMax = MIN(matchLengthMax, blockSizeMax/2); |
159 | } |
160 | |
161 | while ( nbSeqGenerated < ZSTD_FUZZ_MAX_NBSEQ - 3 /* extra room for explicit delimiters */ |
162 | && bytesGenerated < ZSTD_FUZZ_GENERATED_SRC_MAXSIZE |
163 | && !FUZZ_dataProducer_empty(producer)) { |
164 | uint32_t matchLength; |
165 | uint32_t matchBound = matchLengthMax; |
166 | uint32_t offset; |
167 | uint32_t offsetBound; |
168 | const uint32_t minLitLength = (isFirstSequence && (dictSize == 0)); |
169 | const uint32_t litLength = FUZZ_dataProducer_uint32Range(producer, minLitLength, (uint32_t)literalsSizeLimit); |
170 | bytesGenerated += litLength; |
171 | if (bytesGenerated > ZSTD_FUZZ_GENERATED_SRC_MAXSIZE) { |
172 | break; |
173 | } |
174 | offsetBound = (bytesGenerated > windowSize) ? windowSize : bytesGenerated + (uint32_t)dictSize; |
175 | offset = FUZZ_dataProducer_uint32Range(producer, 1, offsetBound); |
176 | if (dictSize > 0 && bytesGenerated <= windowSize) { |
177 | /* Prevent match length from being such that it would be associated with an offset too large |
178 | * from the decoder's perspective. If not possible (match would be too small), |
179 | * then reduce the offset if necessary. |
180 | */ |
181 | const size_t bytesToReachWindowSize = windowSize - bytesGenerated; |
182 | if (bytesToReachWindowSize < ZSTD_MINMATCH_MIN) { |
183 | const uint32_t newOffsetBound = offsetBound > windowSize ? windowSize : offsetBound; |
184 | offset = FUZZ_dataProducer_uint32Range(producer, 1, newOffsetBound); |
185 | } else { |
186 | matchBound = MIN(matchLengthMax, (uint32_t)bytesToReachWindowSize); |
187 | } |
188 | } |
189 | matchLength = FUZZ_dataProducer_uint32Range(producer, ZSTD_MINMATCH_MIN, matchBound); |
190 | bytesGenerated += matchLength; |
191 | if (bytesGenerated > ZSTD_FUZZ_GENERATED_SRC_MAXSIZE) { |
192 | break; |
193 | } |
194 | { ZSTD_Sequence seq = {offset, litLength, matchLength, repCode}; |
195 | const uint32_t lastLits = FUZZ_dataProducer_uint32Range(producer, 0, litLength); |
196 | #define SPLITPROB 6000 |
197 | #define SPLITMARK 5234 |
198 | const int split = (FUZZ_dataProducer_uint32Range(producer, 0, SPLITPROB) == SPLITMARK); |
199 | if (mode == ZSTD_sf_explicitBlockDelimiters) { |
200 | const size_t seqSize = seq.litLength + seq.matchLength; |
201 | if (blockSize + seqSize > blockSizeMax) { /* reaching limit : must end block now */ |
202 | const ZSTD_Sequence endBlock = {0, 0, 0, 0}; |
203 | generatedSequences[nbSeqGenerated++] = endBlock; |
204 | blockSize = seqSize; |
205 | } |
206 | if (split) { |
207 | const ZSTD_Sequence endBlock = {0, lastLits, 0, 0}; |
208 | generatedSequences[nbSeqGenerated++] = endBlock; |
209 | assert(lastLits <= seq.litLength); |
210 | seq.litLength -= lastLits; |
211 | blockSize = seqSize - lastLits; |
212 | } else { |
213 | blockSize += seqSize; |
214 | } |
215 | } |
216 | generatedSequences[nbSeqGenerated++] = seq; |
217 | isFirstSequence = 0; |
218 | } |
219 | } |
220 | |
221 | if (mode == ZSTD_sf_explicitBlockDelimiters) { |
222 | /* always end sequences with a block delimiter */ |
223 | const ZSTD_Sequence endBlock = {0, 0, 0, 0}; |
224 | assert(nbSeqGenerated < ZSTD_FUZZ_MAX_NBSEQ); |
225 | generatedSequences[nbSeqGenerated++] = endBlock; |
226 | } |
227 | return nbSeqGenerated; |
228 | } |
229 | |
230 | static size_t roundTripTest(void* result, size_t resultCapacity, |
231 | void* compressed, size_t compressedCapacity, |
232 | const void* src, size_t srcSize, |
233 | const ZSTD_Sequence* seqs, size_t seqSize, |
234 | unsigned hasDict, |
235 | ZSTD_sequenceFormat_e mode) |
236 | { |
237 | size_t cSize; |
238 | size_t dSize; |
239 | |
240 | if (hasDict) { |
241 | FUZZ_ZASSERT(ZSTD_CCtx_refCDict(cctx, cdict)); |
242 | FUZZ_ZASSERT(ZSTD_DCtx_refDDict(dctx, ddict)); |
243 | } |
244 | |
245 | cSize = ZSTD_compressSequences(cctx, compressed, compressedCapacity, |
246 | seqs, seqSize, |
247 | src, srcSize); |
248 | if ( (ZSTD_getErrorCode(cSize) == ZSTD_error_dstSize_tooSmall) |
249 | && (mode == ZSTD_sf_explicitBlockDelimiters) ) { |
250 | /* Valid scenario : in explicit delimiter mode, |
251 | * it might be possible for the compressed size to outgrow dstCapacity. |
252 | * In which case, it's still a valid fuzzer scenario, |
253 | * but no roundtrip shall be possible */ |
254 | return 0; |
255 | } |
256 | /* round-trip */ |
257 | FUZZ_ZASSERT(cSize); |
258 | dSize = ZSTD_decompressDCtx(dctx, result, resultCapacity, compressed, cSize); |
259 | FUZZ_ZASSERT(dSize); |
260 | FUZZ_ASSERT_MSG(dSize == srcSize, "Incorrect regenerated size"); |
261 | FUZZ_ASSERT_MSG(!FUZZ_memcmp(src, result, srcSize), "Corruption!"); |
262 | return dSize; |
263 | } |
264 | |
265 | int LLVMFuzzerTestOneInput(const uint8_t* src, size_t size) |
266 | { |
267 | FUZZ_SEQ_PROD_SETUP(); |
268 | |
269 | void* rBuf; |
270 | size_t rBufSize; |
271 | void* cBuf; |
272 | size_t cBufSize; |
273 | size_t generatedSrcSize; |
274 | size_t nbSequences; |
275 | size_t dictSize = 0; |
276 | unsigned hasDict; |
277 | unsigned wLog; |
278 | int cLevel; |
279 | ZSTD_sequenceFormat_e mode; |
280 | |
281 | FUZZ_dataProducer_t* const producer = FUZZ_dataProducer_create(src, size); |
282 | FUZZ_ASSERT(producer); |
283 | |
284 | if (!cctx) { |
285 | cctx = ZSTD_createCCtx(); |
286 | FUZZ_ASSERT(cctx); |
287 | } |
288 | if (!dctx) { |
289 | dctx = ZSTD_createDCtx(); |
290 | FUZZ_ASSERT(dctx); |
291 | } |
292 | |
293 | /* Generate window log first so we don't generate offsets too large */ |
294 | wLog = FUZZ_dataProducer_uint32Range(producer, ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MAX); |
295 | cLevel = FUZZ_dataProducer_int32Range(producer, -3, 22); |
296 | mode = (ZSTD_sequenceFormat_e)FUZZ_dataProducer_int32Range(producer, 0, 1); |
297 | |
298 | ZSTD_CCtx_reset(cctx, ZSTD_reset_session_and_parameters); |
299 | ZSTD_CCtx_setParameter(cctx, ZSTD_c_nbWorkers, 0); |
300 | ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, cLevel); |
301 | ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, wLog); |
302 | ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, ZSTD_MINMATCH_MIN); |
303 | ZSTD_CCtx_setParameter(cctx, ZSTD_c_validateSequences, 1); |
304 | ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters, mode); |
305 | ZSTD_CCtx_setParameter(cctx, ZSTD_c_forceAttachDict, ZSTD_dictForceAttach); |
306 | |
307 | if (!literalsBuffer) { |
308 | literalsBuffer = FUZZ_malloc(ZSTD_FUZZ_GENERATED_LITERALS_SIZE); |
309 | FUZZ_ASSERT(literalsBuffer); |
310 | literalsBuffer = generatePseudoRandomString(literalsBuffer, ZSTD_FUZZ_GENERATED_LITERALS_SIZE, producer); |
311 | } |
312 | |
313 | if (!dictBuffer) { /* Generate global dictionary buffer */ |
314 | ZSTD_compressionParameters cParams; |
315 | |
316 | /* Generate a large dictionary buffer */ |
317 | dictBuffer = calloc(ZSTD_FUZZ_GENERATED_DICT_MAXSIZE, 1); |
318 | FUZZ_ASSERT(dictBuffer); |
319 | |
320 | /* Create global cdict and ddict */ |
321 | cParams = ZSTD_getCParams(1, ZSTD_FUZZ_GENERATED_SRC_MAXSIZE, ZSTD_FUZZ_GENERATED_DICT_MAXSIZE); |
322 | cParams.minMatch = ZSTD_MINMATCH_MIN; |
323 | cParams.hashLog = ZSTD_HASHLOG_MIN; |
324 | cParams.chainLog = ZSTD_CHAINLOG_MIN; |
325 | |
326 | cdict = ZSTD_createCDict_advanced(dictBuffer, ZSTD_FUZZ_GENERATED_DICT_MAXSIZE, ZSTD_dlm_byRef, ZSTD_dct_rawContent, cParams, ZSTD_defaultCMem); |
327 | ddict = ZSTD_createDDict_advanced(dictBuffer, ZSTD_FUZZ_GENERATED_DICT_MAXSIZE, ZSTD_dlm_byRef, ZSTD_dct_rawContent, ZSTD_defaultCMem); |
328 | FUZZ_ASSERT(cdict); |
329 | FUZZ_ASSERT(ddict); |
330 | } |
331 | |
332 | FUZZ_ASSERT(cdict); |
333 | FUZZ_ASSERT(ddict); |
334 | |
335 | hasDict = FUZZ_dataProducer_uint32Range(producer, 0, 1); |
336 | if (hasDict) { |
337 | dictSize = ZSTD_FUZZ_GENERATED_DICT_MAXSIZE; |
338 | } |
339 | |
340 | if (!generatedSequences) { |
341 | generatedSequences = FUZZ_malloc(sizeof(ZSTD_Sequence)*ZSTD_FUZZ_MAX_NBSEQ); |
342 | } |
343 | if (!generatedSrc) { |
344 | generatedSrc = FUZZ_malloc(ZSTD_FUZZ_GENERATED_SRC_MAXSIZE); |
345 | } |
346 | |
347 | nbSequences = generateRandomSequences(producer, ZSTD_FUZZ_GENERATED_LITERALS_SIZE, dictSize, wLog, mode); |
348 | generatedSrcSize = decodeSequences(generatedSrc, nbSequences, ZSTD_FUZZ_GENERATED_LITERALS_SIZE, dictBuffer, dictSize, mode); |
349 | |
350 | /* Note : in explicit block delimiters mode, |
351 | * the fuzzer might generate a lot of small blocks. |
352 | * In which case, the final compressed size might be > ZSTD_compressBound(). |
353 | * This is still a valid scenario fuzzer though, which makes it possible to check under-sized dstCapacity. |
354 | * The test just doesn't roundtrip. */ |
355 | cBufSize = ZSTD_compressBound(generatedSrcSize); |
356 | cBuf = FUZZ_malloc(cBufSize); |
357 | |
358 | rBufSize = generatedSrcSize; |
359 | rBuf = FUZZ_malloc(rBufSize); |
360 | |
361 | { const size_t result = roundTripTest(rBuf, rBufSize, |
362 | cBuf, cBufSize, |
363 | generatedSrc, generatedSrcSize, |
364 | generatedSequences, nbSequences, |
365 | hasDict, mode); |
366 | FUZZ_ASSERT(result <= generatedSrcSize); /* can be 0 when no round-trip */ |
367 | } |
368 | |
369 | free(rBuf); |
370 | free(cBuf); |
371 | FUZZ_dataProducer_free(producer); |
372 | #ifndef STATEFUL_FUZZING |
373 | ZSTD_freeCCtx(cctx); cctx = NULL; |
374 | ZSTD_freeDCtx(dctx); dctx = NULL; |
375 | free(generatedSequences); generatedSequences = NULL; |
376 | free(generatedSrc); generatedSrc = NULL; |
377 | free(literalsBuffer); literalsBuffer = NULL; |
378 | #endif |
379 | FUZZ_SEQ_PROD_TEARDOWN(); |
380 | return 0; |
381 | } |