648db22b |
1 | /* |
2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | * All rights reserved. |
4 | * |
5 | * This source code is licensed under both the BSD-style license (found in the |
6 | * LICENSE file in the root directory of this source tree) and the GPLv2 (found |
7 | * in the COPYING file in the root directory of this source tree). |
8 | * You may select, at your option, one of the above-listed licenses. |
9 | */ |
10 | |
11 | #include "seqgen.h" |
12 | #include "mem.h" |
13 | #include <string.h> |
14 | |
15 | #define MIN(a, b) ((a) < (b) ? (a) : (b)) |
16 | |
17 | static const size_t kMatchBytes = 128; |
18 | |
19 | #define SEQ_rotl32(x,r) ((x << r) | (x >> (32 - r))) |
20 | static BYTE SEQ_randByte(unsigned* src) |
21 | { |
22 | static const U32 prime1 = 2654435761U; |
23 | static const U32 prime2 = 2246822519U; |
24 | U32 rand32 = *src; |
25 | rand32 *= prime1; |
26 | rand32 ^= prime2; |
27 | rand32 = SEQ_rotl32(rand32, 13); |
28 | *src = rand32; |
29 | return (BYTE)(rand32 >> 5); |
30 | } |
31 | |
32 | SEQ_stream SEQ_initStream(unsigned seed) |
33 | { |
34 | SEQ_stream stream; |
35 | stream.state = 0; |
36 | XXH64_reset(&stream.xxh, 0); |
37 | stream.seed = seed; |
38 | return stream; |
39 | } |
40 | |
41 | /* Generates a single guard byte, then match length + 1 of a different byte, |
42 | * then another guard byte. |
43 | */ |
44 | static size_t SEQ_gen_matchLength(SEQ_stream* stream, unsigned value, |
45 | SEQ_outBuffer* out) |
46 | { |
47 | typedef enum { |
48 | ml_first_byte = 0, |
49 | ml_match_bytes, |
50 | ml_last_byte, |
51 | } ml_state; |
52 | BYTE* const ostart = (BYTE*)out->dst; |
53 | BYTE* const oend = ostart + out->size; |
54 | BYTE* op = ostart + out->pos; |
55 | |
56 | switch ((ml_state)stream->state) { |
57 | case ml_first_byte: |
58 | /* Generate a single byte and pick a different byte for the match */ |
59 | if (op >= oend) { |
60 | stream->bytesLeft = 1; |
61 | break; |
62 | } |
63 | *op = SEQ_randByte(&stream->seed) & 0xFF; |
64 | do { |
65 | stream->saved = SEQ_randByte(&stream->seed) & 0xFF; |
66 | } while (*op == stream->saved); |
67 | ++op; |
68 | /* State transition */ |
69 | stream->state = ml_match_bytes; |
70 | stream->bytesLeft = value + 1; |
71 | /* fall-through */ |
72 | case ml_match_bytes: { |
73 | /* Copy matchLength + 1 bytes to the output buffer */ |
74 | size_t const setLength = MIN(stream->bytesLeft, (size_t)(oend - op)); |
75 | if (setLength > 0) { |
76 | memset(op, stream->saved, setLength); |
77 | op += setLength; |
78 | stream->bytesLeft -= setLength; |
79 | } |
80 | if (stream->bytesLeft > 0) |
81 | break; |
82 | /* State transition */ |
83 | stream->state = ml_last_byte; |
84 | } |
85 | /* fall-through */ |
86 | case ml_last_byte: |
87 | /* Generate a single byte and pick a different byte for the match */ |
88 | if (op >= oend) { |
89 | stream->bytesLeft = 1; |
90 | break; |
91 | } |
92 | do { |
93 | *op = SEQ_randByte(&stream->seed) & 0xFF; |
94 | } while (*op == stream->saved); |
95 | ++op; |
96 | /* State transition */ |
97 | /* fall-through */ |
98 | default: |
99 | stream->state = 0; |
100 | stream->bytesLeft = 0; |
101 | break; |
102 | } |
103 | XXH64_update(&stream->xxh, ostart + out->pos, (op - ostart) - out->pos); |
104 | out->pos = op - ostart; |
105 | return stream->bytesLeft; |
106 | } |
107 | |
108 | /* Saves the current seed then generates kMatchBytes random bytes >= 128. |
109 | * Generates literal length - kMatchBytes random bytes < 128. |
110 | * Generates another kMatchBytes using the saved seed to generate a match. |
111 | * This way the match is easy to find for the compressors. |
112 | */ |
113 | static size_t SEQ_gen_litLength(SEQ_stream* stream, unsigned value, SEQ_outBuffer* out) |
114 | { |
115 | typedef enum { |
116 | ll_start = 0, |
117 | ll_run_bytes, |
118 | ll_literals, |
119 | ll_run_match, |
120 | } ll_state; |
121 | BYTE* const ostart = (BYTE*)out->dst; |
122 | BYTE* const oend = ostart + out->size; |
123 | BYTE* op = ostart + out->pos; |
124 | |
125 | switch ((ll_state)stream->state) { |
126 | case ll_start: |
127 | stream->state = ll_run_bytes; |
128 | stream->saved = stream->seed; |
129 | stream->bytesLeft = MIN(kMatchBytes, value); |
130 | /* fall-through */ |
131 | case ll_run_bytes: |
132 | while (stream->bytesLeft > 0 && op < oend) { |
133 | *op++ = SEQ_randByte(&stream->seed) | 0x80; |
134 | --stream->bytesLeft; |
135 | } |
136 | if (stream->bytesLeft > 0) |
137 | break; |
138 | /* State transition */ |
139 | stream->state = ll_literals; |
140 | stream->bytesLeft = value - MIN(kMatchBytes, value); |
141 | /* fall-through */ |
142 | case ll_literals: |
143 | while (stream->bytesLeft > 0 && op < oend) { |
144 | *op++ = SEQ_randByte(&stream->seed) & 0x7F; |
145 | --stream->bytesLeft; |
146 | } |
147 | if (stream->bytesLeft > 0) |
148 | break; |
149 | /* State transition */ |
150 | stream->state = ll_run_match; |
151 | stream->bytesLeft = MIN(kMatchBytes, value); |
152 | /* fall-through */ |
153 | case ll_run_match: { |
154 | while (stream->bytesLeft > 0 && op < oend) { |
155 | *op++ = SEQ_randByte(&stream->saved) | 0x80; |
156 | --stream->bytesLeft; |
157 | } |
158 | if (stream->bytesLeft > 0) |
159 | break; |
160 | } |
161 | /* fall-through */ |
162 | default: |
163 | stream->state = 0; |
164 | stream->bytesLeft = 0; |
165 | break; |
166 | } |
167 | XXH64_update(&stream->xxh, ostart + out->pos, (op - ostart) - out->pos); |
168 | out->pos = op - ostart; |
169 | return stream->bytesLeft; |
170 | } |
171 | |
172 | /* Saves the current seed then generates kMatchBytes random bytes >= 128. |
173 | * Generates offset - kMatchBytes of zeros to get a large offset without |
174 | * polluting the hash tables. |
175 | * Generates another kMatchBytes using the saved seed to generate a with the |
176 | * required offset. |
177 | */ |
178 | static size_t SEQ_gen_offset(SEQ_stream* stream, unsigned value, SEQ_outBuffer* out) |
179 | { |
180 | typedef enum { |
181 | of_start = 0, |
182 | of_run_bytes, |
183 | of_offset, |
184 | of_run_match, |
185 | } of_state; |
186 | BYTE* const ostart = (BYTE*)out->dst; |
187 | BYTE* const oend = ostart + out->size; |
188 | BYTE* op = ostart + out->pos; |
189 | |
190 | switch ((of_state)stream->state) { |
191 | case of_start: |
192 | stream->state = of_run_bytes; |
193 | stream->saved = stream->seed; |
194 | stream->bytesLeft = MIN(value, kMatchBytes); |
195 | /* fall-through */ |
196 | case of_run_bytes: { |
197 | while (stream->bytesLeft > 0 && op < oend) { |
198 | *op++ = SEQ_randByte(&stream->seed) | 0x80; |
199 | --stream->bytesLeft; |
200 | } |
201 | if (stream->bytesLeft > 0) |
202 | break; |
203 | /* State transition */ |
204 | stream->state = of_offset; |
205 | stream->bytesLeft = value - MIN(value, kMatchBytes); |
206 | } |
207 | /* fall-through */ |
208 | case of_offset: { |
209 | /* Copy matchLength + 1 bytes to the output buffer */ |
210 | size_t const setLength = MIN(stream->bytesLeft, (size_t)(oend - op)); |
211 | if (setLength > 0) { |
212 | memset(op, 0, setLength); |
213 | op += setLength; |
214 | stream->bytesLeft -= setLength; |
215 | } |
216 | if (stream->bytesLeft > 0) |
217 | break; |
218 | /* State transition */ |
219 | stream->state = of_run_match; |
220 | stream->bytesLeft = MIN(value, kMatchBytes); |
221 | } |
222 | /* fall-through */ |
223 | case of_run_match: { |
224 | while (stream->bytesLeft > 0 && op < oend) { |
225 | *op++ = SEQ_randByte(&stream->saved) | 0x80; |
226 | --stream->bytesLeft; |
227 | } |
228 | if (stream->bytesLeft > 0) |
229 | break; |
230 | } |
231 | /* fall-through */ |
232 | default: |
233 | stream->state = 0; |
234 | stream->bytesLeft = 0; |
235 | break; |
236 | } |
237 | XXH64_update(&stream->xxh, ostart + out->pos, (op - ostart) - out->pos); |
238 | out->pos = op - ostart; |
239 | return stream->bytesLeft; |
240 | } |
241 | |
242 | /* Returns the number of bytes left to generate. |
243 | * Must pass the same type/value until it returns 0. |
244 | */ |
245 | size_t SEQ_gen(SEQ_stream* stream, SEQ_gen_type type, unsigned value, SEQ_outBuffer* out) |
246 | { |
247 | switch (type) { |
248 | case SEQ_gen_ml: return SEQ_gen_matchLength(stream, value, out); |
249 | case SEQ_gen_ll: return SEQ_gen_litLength(stream, value, out); |
250 | case SEQ_gen_of: return SEQ_gen_offset(stream, value, out); |
251 | case SEQ_gen_max: /* fall-through */ |
252 | default: return 0; |
253 | } |
254 | } |
255 | |
256 | /* Returns the xxhash of the data produced so far */ |
257 | XXH64_hash_t SEQ_digest(SEQ_stream const* stream) |
258 | { |
259 | return XXH64_digest(&stream->xxh); |
260 | } |