| 1 | /* |
| 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
| 3 | * All rights reserved. |
| 4 | * |
| 5 | * This source code is licensed under both the BSD-style license (found in the |
| 6 | * LICENSE file in the root directory of this source tree) and the GPLv2 (found |
| 7 | * in the COPYING file in the root directory of this source tree). |
| 8 | * You may select, at your option, one of the above-listed licenses. |
| 9 | */ |
| 10 | |
| 11 | /* Implementation notes: |
| 12 | * |
| 13 | * This is a very simple lorem ipsum generator |
| 14 | * which features a static list of words |
| 15 | * and print them one after another randomly |
| 16 | * with a fake sentence / paragraph structure. |
| 17 | * |
| 18 | * The goal is to generate a printable text |
| 19 | * that can be used to fake a text compression scenario. |
| 20 | * The resulting compression / ratio curve of the lorem ipsum generator |
| 21 | * is more satisfying than the previous statistical generator, |
| 22 | * which was initially designed for entropy compression, |
| 23 | * and lacks a regularity more representative of text. |
| 24 | * |
| 25 | * The compression ratio achievable on the generated lorem ipsum |
| 26 | * is still a bit too good, presumably because the dictionary is a bit too |
| 27 | * small. It would be possible to create some more complex scheme, notably by |
| 28 | * enlarging the dictionary with a word generator, and adding grammatical rules |
| 29 | * (composition) and syntax rules. But that's probably overkill for the intended |
| 30 | * goal. |
| 31 | */ |
| 32 | |
| 33 | #include "lorem.h" |
| 34 | #include <assert.h> |
| 35 | #include <limits.h> /* INT_MAX */ |
| 36 | #include <string.h> /* memcpy */ |
| 37 | |
| 38 | #define WORD_MAX_SIZE 20 |
| 39 | |
| 40 | /* Define the word pool */ |
| 41 | static const char* kWords[] = { |
| 42 | "lorem", "ipsum", "dolor", "sit", "amet", |
| 43 | "consectetur", "adipiscing", "elit", "sed", "do", |
| 44 | "eiusmod", "tempor", "incididunt", "ut", "labore", |
| 45 | "et", "dolore", "magna", "aliqua", "dis", |
| 46 | "lectus", "vestibulum", "mattis", "ullamcorper", "velit", |
| 47 | "commodo", "a", "lacus", "arcu", "magnis", |
| 48 | "parturient", "montes", "nascetur", "ridiculus", "mus", |
| 49 | "mauris", "nulla", "malesuada", "pellentesque", "eget", |
| 50 | "gravida", "in", "dictum", "non", "erat", |
| 51 | "nam", "voluptat", "maecenas", "blandit", "aliquam", |
| 52 | "etiam", "enim", "lobortis", "scelerisque", "fermentum", |
| 53 | "dui", "faucibus", "ornare", "at", "elementum", |
| 54 | "eu", "facilisis", "odio", "morbi", "quis", |
| 55 | "eros", "donec", "ac", "orci", "purus", |
| 56 | "turpis", "cursus", "leo", "vel", "porta", |
| 57 | "consequat", "interdum", "varius", "vulputate", "aliquet", |
| 58 | "pharetra", "nunc", "auctor", "urna", "id", |
| 59 | "metus", "viverra", "nibh", "cras", "mi", |
| 60 | "unde", "omnis", "iste", "natus", "error", |
| 61 | "perspiciatis", "voluptatem", "accusantium", "doloremque", "laudantium", |
| 62 | "totam", "rem", "aperiam", "eaque", "ipsa", |
| 63 | "quae", "ab", "illo", "inventore", "veritatis", |
| 64 | "quasi", "architecto", "beatae", "vitae", "dicta", |
| 65 | "sunt", "explicabo", "nemo", "ipsam", "quia", |
| 66 | "voluptas", "aspernatur", "aut", "odit", "fugit", |
| 67 | "consequuntur", "magni", "dolores", "eos", "qui", |
| 68 | "ratione", "sequi", "nesciunt", "neque", "porro", |
| 69 | "quisquam", "est", "dolorem", "adipisci", "numquam", |
| 70 | "eius", "modi", "tempora", "incidunt", "magnam", |
| 71 | "quaerat", "ad", "minima", "veniam", "nostrum", |
| 72 | "ullam", "corporis", "suscipit", "laboriosam", "nisi", |
| 73 | "aliquid", "ex", "ea", "commodi", "consequatur", |
| 74 | "autem", "eum", "iure", "voluptate", "esse", |
| 75 | "quam", "nihil", "molestiae", "illum", "fugiat", |
| 76 | "quo", "pariatur", "vero", "accusamus", "iusto", |
| 77 | "dignissimos", "ducimus", "blanditiis", "praesentium", "voluptatum", |
| 78 | "deleniti", "atque", "corrupti", "quos", "quas", |
| 79 | "molestias", "excepturi", "sint", "occaecati", "cupiditate", |
| 80 | "provident", "similique", "culpa", "officia", "deserunt", |
| 81 | "mollitia", "animi", "laborum", "dolorum", "fuga", |
| 82 | "harum", "quidem", "rerum", "facilis", "expedita", |
| 83 | "distinctio", "libero", "tempore", "cum", "soluta", |
| 84 | "nobis", "eligendi", "optio", "cumque", "impedit", |
| 85 | "minus", "quod", "maxime", "placeat", "facere", |
| 86 | "possimus", "assumenda", "repellendus", "temporibus", "quibusdam", |
| 87 | "officiis", "debitis", "saepe", "eveniet", "voluptates", |
| 88 | "repudiandae", "recusandae", "itaque", "earum", "hic", |
| 89 | "tenetur", "sapiente", "delectus", "reiciendis", "cillum", |
| 90 | "maiores", "alias", "perferendis", "doloribus", "asperiores", |
| 91 | "repellat", "minim", "nostrud", "exercitation", "ullamco", |
| 92 | "laboris", "aliquip", "duis", "aute", "irure", |
| 93 | }; |
| 94 | static const unsigned kNbWords = sizeof(kWords) / sizeof(kWords[0]); |
| 95 | |
| 96 | /* simple 1-dimension distribution, based on word's length, favors small words |
| 97 | */ |
| 98 | static const int kWeights[] = { 0, 8, 6, 4, 3, 2 }; |
| 99 | static const size_t kNbWeights = sizeof(kWeights) / sizeof(kWeights[0]); |
| 100 | |
| 101 | #define DISTRIB_SIZE_MAX 650 |
| 102 | static int g_distrib[DISTRIB_SIZE_MAX] = { 0 }; |
| 103 | static unsigned g_distribCount = 0; |
| 104 | |
| 105 | static void countFreqs( |
| 106 | const char* words[], |
| 107 | size_t nbWords, |
| 108 | const int* weights, |
| 109 | size_t nbWeights) |
| 110 | { |
| 111 | unsigned total = 0; |
| 112 | size_t w; |
| 113 | for (w = 0; w < nbWords; w++) { |
| 114 | size_t len = strlen(words[w]); |
| 115 | int lmax; |
| 116 | if (len >= nbWeights) |
| 117 | len = nbWeights - 1; |
| 118 | lmax = weights[len]; |
| 119 | total += (unsigned)lmax; |
| 120 | } |
| 121 | g_distribCount = total; |
| 122 | assert(g_distribCount <= DISTRIB_SIZE_MAX); |
| 123 | } |
| 124 | |
| 125 | static void init_word_distrib( |
| 126 | const char* words[], |
| 127 | size_t nbWords, |
| 128 | const int* weights, |
| 129 | size_t nbWeights) |
| 130 | { |
| 131 | size_t w, d = 0; |
| 132 | countFreqs(words, nbWords, weights, nbWeights); |
| 133 | for (w = 0; w < nbWords; w++) { |
| 134 | size_t len = strlen(words[w]); |
| 135 | int l, lmax; |
| 136 | if (len >= nbWeights) |
| 137 | len = nbWeights - 1; |
| 138 | lmax = weights[len]; |
| 139 | for (l = 0; l < lmax; l++) { |
| 140 | g_distrib[d++] = (int)w; |
| 141 | } |
| 142 | } |
| 143 | } |
| 144 | |
| 145 | /* Note: this unit only works when invoked sequentially. |
| 146 | * No concurrent access is allowed */ |
| 147 | static char* g_ptr = NULL; |
| 148 | static size_t g_nbChars = 0; |
| 149 | static size_t g_maxChars = 10000000; |
| 150 | static unsigned g_randRoot = 0; |
| 151 | |
| 152 | #define RDG_rotl32(x, r) ((x << r) | (x >> (32 - r))) |
| 153 | static unsigned LOREM_rand(unsigned range) |
| 154 | { |
| 155 | static const unsigned prime1 = 2654435761U; |
| 156 | static const unsigned prime2 = 2246822519U; |
| 157 | unsigned rand32 = g_randRoot; |
| 158 | rand32 *= prime1; |
| 159 | rand32 ^= prime2; |
| 160 | rand32 = RDG_rotl32(rand32, 13); |
| 161 | g_randRoot = rand32; |
| 162 | return (unsigned)(((unsigned long long)rand32 * range) >> 32); |
| 163 | } |
| 164 | |
| 165 | static void writeLastCharacters(void) |
| 166 | { |
| 167 | size_t lastChars = g_maxChars - g_nbChars; |
| 168 | assert(g_maxChars >= g_nbChars); |
| 169 | if (lastChars == 0) |
| 170 | return; |
| 171 | g_ptr[g_nbChars++] = '.'; |
| 172 | if (lastChars > 2) { |
| 173 | memset(g_ptr + g_nbChars, ' ', lastChars - 2); |
| 174 | } |
| 175 | if (lastChars > 1) { |
| 176 | g_ptr[g_maxChars - 1] = '\n'; |
| 177 | } |
| 178 | g_nbChars = g_maxChars; |
| 179 | } |
| 180 | |
| 181 | static void generateWord(const char* word, const char* separator, int upCase) |
| 182 | { |
| 183 | size_t const len = strlen(word) + strlen(separator); |
| 184 | if (g_nbChars + len > g_maxChars) { |
| 185 | writeLastCharacters(); |
| 186 | return; |
| 187 | } |
| 188 | memcpy(g_ptr + g_nbChars, word, strlen(word)); |
| 189 | if (upCase) { |
| 190 | static const char toUp = 'A' - 'a'; |
| 191 | g_ptr[g_nbChars] = (char)(g_ptr[g_nbChars] + toUp); |
| 192 | } |
| 193 | g_nbChars += strlen(word); |
| 194 | memcpy(g_ptr + g_nbChars, separator, strlen(separator)); |
| 195 | g_nbChars += strlen(separator); |
| 196 | } |
| 197 | |
| 198 | static int about(unsigned target) |
| 199 | { |
| 200 | return (int)(LOREM_rand(target) + LOREM_rand(target) + 1); |
| 201 | } |
| 202 | |
| 203 | /* Function to generate a random sentence */ |
| 204 | static void generateSentence(int nbWords) |
| 205 | { |
| 206 | int commaPos = about(9); |
| 207 | int comma2 = commaPos + about(7); |
| 208 | int qmark = (LOREM_rand(11) == 7); |
| 209 | const char* endSep = qmark ? "? " : ". "; |
| 210 | int i; |
| 211 | for (i = 0; i < nbWords; i++) { |
| 212 | int const wordID = g_distrib[LOREM_rand(g_distribCount)]; |
| 213 | const char* const word = kWords[wordID]; |
| 214 | const char* sep = " "; |
| 215 | if (i == commaPos) |
| 216 | sep = ", "; |
| 217 | if (i == comma2) |
| 218 | sep = ", "; |
| 219 | if (i == nbWords - 1) |
| 220 | sep = endSep; |
| 221 | generateWord(word, sep, i == 0); |
| 222 | } |
| 223 | } |
| 224 | |
| 225 | static void generateParagraph(int nbSentences) |
| 226 | { |
| 227 | int i; |
| 228 | for (i = 0; i < nbSentences; i++) { |
| 229 | int wordsPerSentence = about(11); |
| 230 | generateSentence(wordsPerSentence); |
| 231 | } |
| 232 | if (g_nbChars < g_maxChars) { |
| 233 | g_ptr[g_nbChars++] = '\n'; |
| 234 | } |
| 235 | if (g_nbChars < g_maxChars) { |
| 236 | g_ptr[g_nbChars++] = '\n'; |
| 237 | } |
| 238 | } |
| 239 | |
| 240 | /* It's "common" for lorem ipsum generators to start with the same first |
| 241 | * pre-defined sentence */ |
| 242 | static void generateFirstSentence(void) |
| 243 | { |
| 244 | int i; |
| 245 | for (i = 0; i < 18; i++) { |
| 246 | const char* word = kWords[i]; |
| 247 | const char* separator = " "; |
| 248 | if (i == 4) |
| 249 | separator = ", "; |
| 250 | if (i == 7) |
| 251 | separator = ", "; |
| 252 | generateWord(word, separator, i == 0); |
| 253 | } |
| 254 | generateWord(kWords[18], ". ", 0); |
| 255 | } |
| 256 | |
| 257 | size_t |
| 258 | LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill) |
| 259 | { |
| 260 | g_ptr = (char*)buffer; |
| 261 | assert(size < INT_MAX); |
| 262 | g_maxChars = size; |
| 263 | g_nbChars = 0; |
| 264 | g_randRoot = seed; |
| 265 | if (g_distribCount == 0) { |
| 266 | init_word_distrib(kWords, kNbWords, kWeights, kNbWeights); |
| 267 | } |
| 268 | |
| 269 | if (first) { |
| 270 | generateFirstSentence(); |
| 271 | } |
| 272 | while (g_nbChars < g_maxChars) { |
| 273 | int sentencePerParagraph = about(7); |
| 274 | generateParagraph(sentencePerParagraph); |
| 275 | if (!fill) |
| 276 | break; /* only generate one paragraph in not-fill mode */ |
| 277 | } |
| 278 | g_ptr = NULL; |
| 279 | return g_nbChars; |
| 280 | } |
| 281 | |
| 282 | void LOREM_genBuffer(void* buffer, size_t size, unsigned seed) |
| 283 | { |
| 284 | LOREM_genBlock(buffer, size, seed, 1, 1); |
| 285 | } |