648db22b |
1 | /* |
2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | * All rights reserved. |
4 | * |
5 | * This source code is licensed under both the BSD-style license (found in the |
6 | * LICENSE file in the root directory of this source tree) and the GPLv2 (found |
7 | * in the COPYING file in the root directory of this source tree). |
8 | * You may select, at your option, one of the above-listed licenses. |
9 | */ |
10 | |
11 | |
12 | |
13 | /*-************************************ |
14 | * Dependencies |
15 | **************************************/ |
16 | #include "datagen.h" |
17 | #include "platform.h" /* SET_BINARY_MODE */ |
18 | #include <stdlib.h> /* malloc, free */ |
19 | #include <stdio.h> /* FILE, fwrite, fprintf */ |
20 | #include <string.h> /* memcpy */ |
21 | #include "../lib/common/mem.h" /* U32 */ |
22 | |
23 | |
24 | /*-************************************ |
25 | * Macros |
26 | **************************************/ |
/* KB : postfix size multiplier, written after an integer literal : `128 KB` expands to `128 *(1 <<10)`.
 * Only meant to follow a single literal or a parenthesized expression. */
#define KB *(1 <<10)
/* MIN : smaller of two values. Each argument may be evaluated twice : do not pass expressions with side effects. */
#define MIN(a,b)  ( (a) < (b) ? (a) : (b) )

/* RDG_DEBUG : set to non-zero to let TRACE() print to stderr */
#define RDG_DEBUG 0
/* TRACE : printf-style debug output on stderr, active only when RDG_DEBUG is non-zero.
 * Wrapped in do { } while (0) so that it behaves as a single statement :
 * a bare `if` would capture the `else` of an enclosing `if (cond) TRACE(...); else ...`. */
#define TRACE(...)   do { if (RDG_DEBUG) fprintf(stderr, __VA_ARGS__ ); } while (0)
32 | |
33 | |
34 | /*-************************************ |
35 | * Local constants |
36 | **************************************/ |
/* Geometry of the literals distribution table :
 * LTLOG  : log2 of the number of table entries
 * LTSIZE : number of table entries
 * LTMASK : mask reducing a random value to a valid table index */
enum {
    LTLOG  = 13,
    LTSIZE = 1 << LTLOG,
    LTMASK = LTSIZE - 1
};
40 | |
41 | |
42 | /*-******************************************************* |
43 | * Local Functions |
44 | *********************************************************/ |
45 | #define RDG_rotl32(x,r) ((x << r) | (x >> (32 - r))) |
46 | static U32 RDG_rand(U32* src) |
47 | { |
48 | static const U32 prime1 = 2654435761U; |
49 | static const U32 prime2 = 2246822519U; |
50 | U32 rand32 = *src; |
51 | rand32 *= prime1; |
52 | rand32 ^= prime2; |
53 | rand32 = RDG_rotl32(rand32, 13); |
54 | *src = rand32; |
55 | return rand32 >> 5; |
56 | } |
57 | |
58 | typedef U32 fixedPoint_24_8; |
59 | |
60 | static void RDG_fillLiteralDistrib(BYTE* ldt, fixedPoint_24_8 ld) |
61 | { |
62 | BYTE const firstChar = (ld<=0.0) ? 0 : '('; |
63 | BYTE const lastChar = (ld<=0.0) ? 255 : '}'; |
64 | BYTE character = (ld<=0.0) ? 0 : '0'; |
65 | U32 u; |
66 | |
67 | if (ld<=0) ld = 0; |
68 | for (u=0; u<LTSIZE; ) { |
69 | U32 const weight = (((LTSIZE - u) * ld) >> 8) + 1; |
70 | U32 const end = MIN ( u + weight , LTSIZE); |
71 | while (u < end) ldt[u++] = character; |
72 | character++; |
73 | if (character > lastChar) character = firstChar; |
74 | } |
75 | } |
76 | |
77 | |
78 | static BYTE RDG_genChar(U32* seed, const BYTE* ldt) |
79 | { |
80 | U32 const id = RDG_rand(seed) & LTMASK; |
81 | return ldt[id]; /* memory-sanitizer fails here, stating "uninitialized value" when table initialized with P==0.0. Checked : table is fully initialized */ |
82 | } |
83 | |
84 | |
85 | static U32 RDG_rand15Bits (U32* seedPtr) |
86 | { |
87 | return RDG_rand(seedPtr) & 0x7FFF; |
88 | } |
89 | |
90 | static U32 RDG_randLength(U32* seedPtr) |
91 | { |
92 | if (RDG_rand(seedPtr) & 7) return (RDG_rand(seedPtr) & 0xF); /* small length */ |
93 | return (RDG_rand(seedPtr) & 0x1FF) + 0xF; |
94 | } |
95 | |
96 | static void RDG_genBlock(void* buffer, size_t buffSize, size_t prefixSize, |
97 | double matchProba, const BYTE* ldt, U32* seedPtr) |
98 | { |
99 | BYTE* const buffPtr = (BYTE*)buffer; |
100 | U32 const matchProba32 = (U32)(32768 * matchProba); |
101 | size_t pos = prefixSize; |
102 | U32 prevOffset = 1; |
103 | |
104 | /* special case : sparse content */ |
105 | while (matchProba >= 1.0) { |
106 | size_t size0 = RDG_rand(seedPtr) & 3; |
107 | size0 = (size_t)1 << (16 + size0 * 2); |
108 | size0 += RDG_rand(seedPtr) & (size0-1); /* because size0 is power of 2*/ |
109 | if (buffSize < pos + size0) { |
110 | memset(buffPtr+pos, 0, buffSize-pos); |
111 | return; |
112 | } |
113 | memset(buffPtr+pos, 0, size0); |
114 | pos += size0; |
115 | buffPtr[pos-1] = RDG_genChar(seedPtr, ldt); |
116 | continue; |
117 | } |
118 | |
119 | /* init */ |
120 | if (pos==0) buffPtr[0] = RDG_genChar(seedPtr, ldt), pos=1; |
121 | |
122 | /* Generate compressible data */ |
123 | while (pos < buffSize) { |
124 | /* Select : Literal (char) or Match (within 32K) */ |
125 | if (RDG_rand15Bits(seedPtr) < matchProba32) { |
126 | /* Copy (within 32K) */ |
127 | U32 const length = RDG_randLength(seedPtr) + 4; |
128 | U32 const d = (U32) MIN(pos + length , buffSize); |
129 | U32 const repeatOffset = (RDG_rand(seedPtr) & 15) == 2; |
130 | U32 const randOffset = RDG_rand15Bits(seedPtr) + 1; |
131 | U32 const offset = repeatOffset ? prevOffset : (U32) MIN(randOffset , pos); |
132 | size_t match = pos - offset; |
133 | while (pos < d) { buffPtr[pos++] = buffPtr[match++]; /* correctly manages overlaps */ } |
134 | prevOffset = offset; |
135 | } else { |
136 | /* Literal (noise) */ |
137 | U32 const length = RDG_randLength(seedPtr); |
138 | U32 const d = (U32) MIN(pos + length, buffSize); |
139 | while (pos < d) { buffPtr[pos++] = RDG_genChar(seedPtr, ldt); } |
140 | } } |
141 | } |
142 | |
143 | |
144 | void RDG_genBuffer(void* buffer, size_t size, double matchProba, double litProba, unsigned seed) |
145 | { |
146 | U32 seed32 = seed; |
147 | BYTE ldt[LTSIZE]; |
148 | memset(ldt, '0', sizeof(ldt)); /* yes, character '0', this is intentional */ |
149 | if (litProba<=0.0) litProba = matchProba / 4.5; |
150 | RDG_fillLiteralDistrib(ldt, (fixedPoint_24_8)(litProba * 256 + 0.001)); |
151 | RDG_genBlock(buffer, size, 0, matchProba, ldt, &seed32); |
152 | } |
153 | |
154 | |
155 | void RDG_genStdout(unsigned long long size, double matchProba, double litProba, unsigned seed) |
156 | { |
157 | U32 seed32 = seed; |
158 | size_t const stdBlockSize = 128 KB; |
159 | size_t const stdDictSize = 32 KB; |
160 | BYTE* const buff = (BYTE*)malloc(stdDictSize + stdBlockSize); |
161 | U64 total = 0; |
162 | BYTE ldt[LTSIZE]; /* literals distribution table */ |
163 | |
164 | /* init */ |
165 | if (buff==NULL) { perror("datagen"); exit(1); } |
166 | if (litProba<=0.0) litProba = matchProba / 4.5; |
167 | memset(ldt, '0', sizeof(ldt)); /* yes, character '0', this is intentional */ |
168 | RDG_fillLiteralDistrib(ldt, (fixedPoint_24_8)(litProba * 256 + 0.001)); |
169 | SET_BINARY_MODE(stdout); |
170 | |
171 | /* Generate initial dict */ |
172 | RDG_genBlock(buff, stdDictSize, 0, matchProba, ldt, &seed32); |
173 | |
174 | /* Generate compressible data */ |
175 | while (total < size) { |
176 | size_t const genBlockSize = (size_t) (MIN (stdBlockSize, size-total)); |
177 | RDG_genBlock(buff, stdDictSize+stdBlockSize, stdDictSize, matchProba, ldt, &seed32); |
178 | total += genBlockSize; |
179 | { size_t const unused = fwrite(buff, 1, genBlockSize, stdout); (void)unused; } |
180 | /* update dict */ |
181 | memcpy(buff, buff + stdBlockSize, stdDictSize); |
182 | } |
183 | |
184 | /* cleanup */ |
185 | free(buff); |
186 | } |