| 1 | /* |
| 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
| 3 | * All rights reserved. |
| 4 | * |
| 5 | * This source code is licensed under both the BSD-style license (found in the |
| 6 | * LICENSE file in the root directory of this source tree) and the GPLv2 (found |
| 7 | * in the COPYING file in the root directory of this source tree). |
| 8 | * You may select, at your option, one of the above-listed licenses. |
| 9 | */ |
| 10 | |
| 11 | |
| 12 | |
| 13 | /*-************************************ |
| 14 | * Dependencies |
| 15 | **************************************/ |
| 16 | #include "datagen.h" |
| 17 | #include "platform.h" /* SET_BINARY_MODE */ |
| 18 | #include <stdlib.h> /* malloc, free */ |
| 19 | #include <stdio.h> /* FILE, fwrite, fprintf */ |
| 20 | #include <string.h> /* memcpy */ |
| 21 | #include "../lib/common/mem.h" /* U32 */ |
| 22 | |
| 23 | |
| 24 | /*-************************************ |
| 25 | * Macros |
| 26 | **************************************/ |
| 27 | #define KB *(1 <<10) |
| 28 | #define MIN(a,b) ( (a) < (b) ? (a) : (b) ) |
| 29 | |
| 30 | #define RDG_DEBUG 0 |
| 31 | #define TRACE(...) if (RDG_DEBUG) fprintf(stderr, __VA_ARGS__ ) |
| 32 | |
| 33 | |
| 34 | /*-************************************ |
| 35 | * Local constants |
| 36 | **************************************/ |
| 37 | #define LTLOG 13 |
| 38 | #define LTSIZE (1<<LTLOG) |
| 39 | #define LTMASK (LTSIZE-1) |
| 40 | |
| 41 | |
| 42 | /*-******************************************************* |
| 43 | * Local Functions |
| 44 | *********************************************************/ |
| 45 | #define RDG_rotl32(x,r) ((x << r) | (x >> (32 - r))) |
| 46 | static U32 RDG_rand(U32* src) |
| 47 | { |
| 48 | static const U32 prime1 = 2654435761U; |
| 49 | static const U32 prime2 = 2246822519U; |
| 50 | U32 rand32 = *src; |
| 51 | rand32 *= prime1; |
| 52 | rand32 ^= prime2; |
| 53 | rand32 = RDG_rotl32(rand32, 13); |
| 54 | *src = rand32; |
| 55 | return rand32 >> 5; |
| 56 | } |
| 57 | |
| 58 | typedef U32 fixedPoint_24_8; |
| 59 | |
| 60 | static void RDG_fillLiteralDistrib(BYTE* ldt, fixedPoint_24_8 ld) |
| 61 | { |
| 62 | BYTE const firstChar = (ld<=0.0) ? 0 : '('; |
| 63 | BYTE const lastChar = (ld<=0.0) ? 255 : '}'; |
| 64 | BYTE character = (ld<=0.0) ? 0 : '0'; |
| 65 | U32 u; |
| 66 | |
| 67 | if (ld<=0) ld = 0; |
| 68 | for (u=0; u<LTSIZE; ) { |
| 69 | U32 const weight = (((LTSIZE - u) * ld) >> 8) + 1; |
| 70 | U32 const end = MIN ( u + weight , LTSIZE); |
| 71 | while (u < end) ldt[u++] = character; |
| 72 | character++; |
| 73 | if (character > lastChar) character = firstChar; |
| 74 | } |
| 75 | } |
| 76 | |
| 77 | |
| 78 | static BYTE RDG_genChar(U32* seed, const BYTE* ldt) |
| 79 | { |
| 80 | U32 const id = RDG_rand(seed) & LTMASK; |
| 81 | return ldt[id]; /* memory-sanitizer fails here, stating "uninitialized value" when table initialized with P==0.0. Checked : table is fully initialized */ |
| 82 | } |
| 83 | |
| 84 | |
| 85 | static U32 RDG_rand15Bits (U32* seedPtr) |
| 86 | { |
| 87 | return RDG_rand(seedPtr) & 0x7FFF; |
| 88 | } |
| 89 | |
| 90 | static U32 RDG_randLength(U32* seedPtr) |
| 91 | { |
| 92 | if (RDG_rand(seedPtr) & 7) return (RDG_rand(seedPtr) & 0xF); /* small length */ |
| 93 | return (RDG_rand(seedPtr) & 0x1FF) + 0xF; |
| 94 | } |
| 95 | |
| 96 | static void RDG_genBlock(void* buffer, size_t buffSize, size_t prefixSize, |
| 97 | double matchProba, const BYTE* ldt, U32* seedPtr) |
| 98 | { |
| 99 | BYTE* const buffPtr = (BYTE*)buffer; |
| 100 | U32 const matchProba32 = (U32)(32768 * matchProba); |
| 101 | size_t pos = prefixSize; |
| 102 | U32 prevOffset = 1; |
| 103 | |
| 104 | /* special case : sparse content */ |
| 105 | while (matchProba >= 1.0) { |
| 106 | size_t size0 = RDG_rand(seedPtr) & 3; |
| 107 | size0 = (size_t)1 << (16 + size0 * 2); |
| 108 | size0 += RDG_rand(seedPtr) & (size0-1); /* because size0 is power of 2*/ |
| 109 | if (buffSize < pos + size0) { |
| 110 | memset(buffPtr+pos, 0, buffSize-pos); |
| 111 | return; |
| 112 | } |
| 113 | memset(buffPtr+pos, 0, size0); |
| 114 | pos += size0; |
| 115 | buffPtr[pos-1] = RDG_genChar(seedPtr, ldt); |
| 116 | continue; |
| 117 | } |
| 118 | |
| 119 | /* init */ |
| 120 | if (pos==0) buffPtr[0] = RDG_genChar(seedPtr, ldt), pos=1; |
| 121 | |
| 122 | /* Generate compressible data */ |
| 123 | while (pos < buffSize) { |
| 124 | /* Select : Literal (char) or Match (within 32K) */ |
| 125 | if (RDG_rand15Bits(seedPtr) < matchProba32) { |
| 126 | /* Copy (within 32K) */ |
| 127 | U32 const length = RDG_randLength(seedPtr) + 4; |
| 128 | U32 const d = (U32) MIN(pos + length , buffSize); |
| 129 | U32 const repeatOffset = (RDG_rand(seedPtr) & 15) == 2; |
| 130 | U32 const randOffset = RDG_rand15Bits(seedPtr) + 1; |
| 131 | U32 const offset = repeatOffset ? prevOffset : (U32) MIN(randOffset , pos); |
| 132 | size_t match = pos - offset; |
| 133 | while (pos < d) { buffPtr[pos++] = buffPtr[match++]; /* correctly manages overlaps */ } |
| 134 | prevOffset = offset; |
| 135 | } else { |
| 136 | /* Literal (noise) */ |
| 137 | U32 const length = RDG_randLength(seedPtr); |
| 138 | U32 const d = (U32) MIN(pos + length, buffSize); |
| 139 | while (pos < d) { buffPtr[pos++] = RDG_genChar(seedPtr, ldt); } |
| 140 | } } |
| 141 | } |
| 142 | |
| 143 | |
| 144 | void RDG_genBuffer(void* buffer, size_t size, double matchProba, double litProba, unsigned seed) |
| 145 | { |
| 146 | U32 seed32 = seed; |
| 147 | BYTE ldt[LTSIZE]; |
| 148 | memset(ldt, '0', sizeof(ldt)); /* yes, character '0', this is intentional */ |
| 149 | if (litProba<=0.0) litProba = matchProba / 4.5; |
| 150 | RDG_fillLiteralDistrib(ldt, (fixedPoint_24_8)(litProba * 256 + 0.001)); |
| 151 | RDG_genBlock(buffer, size, 0, matchProba, ldt, &seed32); |
| 152 | } |
| 153 | |
| 154 | |
| 155 | void RDG_genStdout(unsigned long long size, double matchProba, double litProba, unsigned seed) |
| 156 | { |
| 157 | U32 seed32 = seed; |
| 158 | size_t const stdBlockSize = 128 KB; |
| 159 | size_t const stdDictSize = 32 KB; |
| 160 | BYTE* const buff = (BYTE*)malloc(stdDictSize + stdBlockSize); |
| 161 | U64 total = 0; |
| 162 | BYTE ldt[LTSIZE]; /* literals distribution table */ |
| 163 | |
| 164 | /* init */ |
| 165 | if (buff==NULL) { perror("datagen"); exit(1); } |
| 166 | if (litProba<=0.0) litProba = matchProba / 4.5; |
| 167 | memset(ldt, '0', sizeof(ldt)); /* yes, character '0', this is intentional */ |
| 168 | RDG_fillLiteralDistrib(ldt, (fixedPoint_24_8)(litProba * 256 + 0.001)); |
| 169 | SET_BINARY_MODE(stdout); |
| 170 | |
| 171 | /* Generate initial dict */ |
| 172 | RDG_genBlock(buff, stdDictSize, 0, matchProba, ldt, &seed32); |
| 173 | |
| 174 | /* Generate compressible data */ |
| 175 | while (total < size) { |
| 176 | size_t const genBlockSize = (size_t) (MIN (stdBlockSize, size-total)); |
| 177 | RDG_genBlock(buff, stdDictSize+stdBlockSize, stdDictSize, matchProba, ldt, &seed32); |
| 178 | total += genBlockSize; |
| 179 | { size_t const unused = fwrite(buff, 1, genBlockSize, stdout); (void)unused; } |
| 180 | /* update dict */ |
| 181 | memcpy(buff, buff + stdBlockSize, stdDictSize); |
| 182 | } |
| 183 | |
| 184 | /* cleanup */ |
| 185 | free(buff); |
| 186 | } |