f535537f |
1 | /* |
2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | * All rights reserved. |
4 | * |
5 | * This source code is licensed under both the BSD-style license (found in the |
6 | * LICENSE file in the root directory of this source tree) and the GPLv2 (found |
7 | * in the COPYING file in the root directory of this source tree). |
8 | * You may select, at your option, one of the above-listed licenses. |
9 | */ |
10 | |
11 | /* Implementation notes: |
12 | * |
 * This is a very simple lorem ipsum generator
 * which features a static list of words
 * and prints them one after another, in random order,
 * with a fake sentence / paragraph structure.
17 | * |
18 | * The goal is to generate a printable text |
19 | * that can be used to fake a text compression scenario. |
20 | * The resulting compression / ratio curve of the lorem ipsum generator |
21 | * is more satisfying than the previous statistical generator, |
22 | * which was initially designed for entropy compression, |
23 | * and lacks a regularity more representative of text. |
24 | * |
25 | * The compression ratio achievable on the generated lorem ipsum |
26 | * is still a bit too good, presumably because the dictionary is a bit too |
27 | * small. It would be possible to create some more complex scheme, notably by |
28 | * enlarging the dictionary with a word generator, and adding grammatical rules |
29 | * (composition) and syntax rules. But that's probably overkill for the intended |
30 | * goal. |
31 | */ |
32 | |
33 | #include "lorem.h" |
34 | #include <assert.h> |
35 | #include <limits.h> /* INT_MAX */ |
36 | #include <string.h> /* memcpy */ |
37 | |
38 | #define WORD_MAX_SIZE 20 |
39 | |
/* Word pool.
 * The order of entries matters : entries [0..18] are emitted in sequence
 * by generateFirstSentence() to form the canonical opening sentence.
 * All other entries are only selected at random, through their index. */
static const char* kWords[] = {
    /* canonical opening sentence, indices 0 to 18 */
    "lorem", "ipsum", "dolor", "sit", "amet",
    "consectetur", "adipiscing", "elit",
    "sed", "do", "eiusmod", "tempor", "incididunt", "ut",
    "labore", "et", "dolore", "magna", "aliqua",
    /* remainder of the pool, indices 19 and up */
    "dis", "lectus", "vestibulum", "mattis",
    "ullamcorper", "velit", "commodo", "a",
    "lacus", "arcu", "magnis", "parturient",
    "montes", "nascetur", "ridiculus", "mus",
    "mauris", "nulla", "malesuada", "pellentesque",
    "eget", "gravida", "in", "dictum",
    "non", "erat", "nam", "voluptat",
    "maecenas", "blandit", "aliquam", "etiam",
    "enim", "lobortis", "scelerisque", "fermentum",
    "dui", "faucibus", "ornare", "at",
    "elementum", "eu", "facilisis", "odio",
    "morbi", "quis", "eros", "donec",
    "ac", "orci", "purus", "turpis",
    "cursus", "leo", "vel", "porta",
    "consequat", "interdum", "varius", "vulputate",
    "aliquet", "pharetra", "nunc", "auctor",
    "urna", "id", "metus", "viverra",
    "nibh", "cras", "mi", "unde",
    "omnis", "iste", "natus", "error",
    "perspiciatis", "voluptatem", "accusantium", "doloremque",
    "laudantium", "totam", "rem", "aperiam",
    "eaque", "ipsa", "quae", "ab",
    "illo", "inventore", "veritatis", "quasi",
    "architecto", "beatae", "vitae", "dicta",
    "sunt", "explicabo", "nemo", "ipsam",
    "quia", "voluptas", "aspernatur", "aut",
    "odit", "fugit", "consequuntur", "magni",
    "dolores", "eos", "qui", "ratione",
    "sequi", "nesciunt", "neque", "porro",
    "quisquam", "est", "dolorem", "adipisci",
    "numquam", "eius", "modi", "tempora",
    "incidunt", "magnam", "quaerat", "ad",
    "minima", "veniam", "nostrum", "ullam",
    "corporis", "suscipit", "laboriosam", "nisi",
    "aliquid", "ex", "ea", "commodi",
    "consequatur", "autem", "eum", "iure",
    "voluptate", "esse", "quam", "nihil",
    "molestiae", "illum", "fugiat", "quo",
    "pariatur", "vero", "accusamus", "iusto",
    "dignissimos", "ducimus", "blanditiis", "praesentium",
    "voluptatum", "deleniti", "atque", "corrupti",
    "quos", "quas", "molestias", "excepturi",
    "sint", "occaecati", "cupiditate", "provident",
    "similique", "culpa", "officia", "deserunt",
    "mollitia", "animi", "laborum", "dolorum",
    "fuga", "harum", "quidem", "rerum",
    "facilis", "expedita", "distinctio", "libero",
    "tempore", "cum", "soluta", "nobis",
    "eligendi", "optio", "cumque", "impedit",
    "minus", "quod", "maxime", "placeat",
    "facere", "possimus", "assumenda", "repellendus",
    "temporibus", "quibusdam", "officiis", "debitis",
    "saepe", "eveniet", "voluptates", "repudiandae",
    "recusandae", "itaque", "earum", "hic",
    "tenetur", "sapiente", "delectus", "reiciendis",
    "cillum", "maiores", "alias", "perferendis",
    "doloribus", "asperiores", "repellat", "minim",
    "nostrud", "exercitation", "ullamco", "laboris",
    "aliquip", "duis", "aute", "irure",
};
/* Number of entries in the word pool */
static const unsigned kNbWords = sizeof kWords / sizeof *kWords;
95 | |
/* Simple 1-dimension distribution : selection weight of a word,
 * indexed by the word's length in characters.
 * Words longer than the last index share the last entry.
 * Smaller words receive larger weights, hence are favored. */
static const int kWeights[] = {
    0, /* length 0 */
    8, /* length 1 */
    6, /* length 2 */
    4, /* length 3 */
    3, /* length 4 */
    2, /* length 5 and more */
};
/* Number of entries in the weight table */
static const size_t kNbWeights = sizeof kWeights / sizeof *kWeights;
100 | |
/* Capacity of the lookup table below.
 * It must be at least the sum of the weights of all words of the pool. */
#define DISTRIB_SIZE_MAX 650
/* Flattened distribution : each word index is stored as many times as the
 * word's weight, so that picking one slot uniformly at random
 * selects a word proportionally to its weight. */
static int g_distrib[DISTRIB_SIZE_MAX] = { 0 };
/* Number of valid slots in g_distrib.
 * 0 is also used by callers as "table not built yet". */
static unsigned g_distribCount = 0;

/* wordWeight():
 * Weight of a single word. `weights` is indexed by word length;
 * lengths beyond the end of the table share its last entry.
 * A negative weight counts as 0, and so does an empty weight table.
 * Shared by countFreqs() and init_word_distrib(),
 * so that both always agree on the number of slots a word occupies. */
static unsigned
wordWeight(const char* word, const int* weights, size_t nbWeights)
{
    size_t len;
    if (nbWeights == 0)
        return 0;
    len = strlen(word);
    if (len >= nbWeights)
        len = nbWeights - 1;
    return (weights[len] > 0) ? (unsigned)weights[len] : 0;
}

/* countFreqs():
 * Stores into g_distribCount the total weight of `words`,
 * i.e. the number of slots needed in g_distrib.
 * Asserts that this total fits into the table. */
static void countFreqs(
        const char* words[],
        size_t nbWords,
        const int* weights,
        size_t nbWeights)
{
    unsigned total = 0;
    size_t w;
    for (w = 0; w < nbWords; w++) {
        total += wordWeight(words[w], weights, nbWeights);
    }
    g_distribCount = total;
    assert(g_distribCount <= DISTRIB_SIZE_MAX);
}

/* init_word_distrib():
 * Fills g_distrib with the index of each word, repeated `weight` times,
 * in word order, then sets g_distribCount to the number of slots written.
 * @words, @nbWords : the word pool
 * @weights, @nbWeights : weight table, indexed by word length */
static void init_word_distrib(
        const char* words[],
        size_t nbWords,
        const int* weights,
        size_t nbWeights)
{
    size_t w, d = 0;
    countFreqs(words, nbWords, weights, nbWeights);
    for (w = 0; w < nbWords; w++) {
        unsigned const weight = wordWeight(words[w], weights, nbWeights);
        unsigned l;
        /* the capacity test only matters when assert() is compiled out :
         * it prevents writing beyond the end of g_distrib */
        for (l = 0; l < weight && d < DISTRIB_SIZE_MAX; l++) {
            g_distrib[d++] = (int)w;
        }
    }
    /* publish the number of slots really written,
     * so that readers never index an unfilled or out-of-range slot */
    g_distribCount = (unsigned)d;
}
144 | |
/* Generator state, shared by all functions of this unit.
 * Note: since the state lives in file-scope variables,
 * this unit only works when invoked sequentially.
 * No concurrent access is allowed. */
static unsigned g_randRoot = 0;        /* state of the pseudo-random generator */
static size_t g_maxChars   = 10000000; /* capacity of the output buffer */
static size_t g_nbChars    = 0;        /* nb of characters written so far */
static char* g_ptr         = NULL;     /* start of the output buffer */
151 | |
152 | #define RDG_rotl32(x, r) ((x << r) | (x >> (32 - r))) |
153 | static unsigned LOREM_rand(unsigned range) |
154 | { |
155 | static const unsigned prime1 = 2654435761U; |
156 | static const unsigned prime2 = 2246822519U; |
157 | unsigned rand32 = g_randRoot; |
158 | rand32 *= prime1; |
159 | rand32 ^= prime2; |
160 | rand32 = RDG_rotl32(rand32, 13); |
161 | g_randRoot = rand32; |
162 | return (unsigned)(((unsigned long long)rand32 * range) >> 32); |
163 | } |
164 | |
165 | static void writeLastCharacters(void) |
166 | { |
167 | size_t lastChars = g_maxChars - g_nbChars; |
168 | assert(g_maxChars >= g_nbChars); |
169 | if (lastChars == 0) |
170 | return; |
171 | g_ptr[g_nbChars++] = '.'; |
172 | if (lastChars > 2) { |
173 | memset(g_ptr + g_nbChars, ' ', lastChars - 2); |
174 | } |
175 | if (lastChars > 1) { |
176 | g_ptr[g_maxChars - 1] = '\n'; |
177 | } |
178 | g_nbChars = g_maxChars; |
179 | } |
180 | |
181 | static void generateWord(const char* word, const char* separator, int upCase) |
182 | { |
183 | size_t const len = strlen(word) + strlen(separator); |
184 | if (g_nbChars + len > g_maxChars) { |
185 | writeLastCharacters(); |
186 | return; |
187 | } |
188 | memcpy(g_ptr + g_nbChars, word, strlen(word)); |
189 | if (upCase) { |
190 | static const char toUp = 'A' - 'a'; |
191 | g_ptr[g_nbChars] = (char)(g_ptr[g_nbChars] + toUp); |
192 | } |
193 | g_nbChars += strlen(word); |
194 | memcpy(g_ptr + g_nbChars, separator, strlen(separator)); |
195 | g_nbChars += strlen(separator); |
196 | } |
197 | |
/* about():
 * @return : the sum of two successive random draws in [0, target), plus 1.
 * The result is therefore at least 1. */
static int about(unsigned target)
{
    unsigned const draw1 = LOREM_rand(target);
    unsigned const draw2 = LOREM_rand(target);
    return (int)(draw1 + draw2 + 1);
}
202 | |
203 | /* Function to generate a random sentence */ |
204 | static void generateSentence(int nbWords) |
205 | { |
206 | int commaPos = about(9); |
207 | int comma2 = commaPos + about(7); |
208 | int qmark = (LOREM_rand(11) == 7); |
209 | const char* endSep = qmark ? "? " : ". "; |
210 | int i; |
211 | for (i = 0; i < nbWords; i++) { |
212 | int const wordID = g_distrib[LOREM_rand(g_distribCount)]; |
213 | const char* const word = kWords[wordID]; |
214 | const char* sep = " "; |
215 | if (i == commaPos) |
216 | sep = ", "; |
217 | if (i == comma2) |
218 | sep = ", "; |
219 | if (i == nbWords - 1) |
220 | sep = endSep; |
221 | generateWord(word, sep, i == 0); |
222 | } |
223 | } |
224 | |
225 | static void generateParagraph(int nbSentences) |
226 | { |
227 | int i; |
228 | for (i = 0; i < nbSentences; i++) { |
229 | int wordsPerSentence = about(11); |
230 | generateSentence(wordsPerSentence); |
231 | } |
232 | if (g_nbChars < g_maxChars) { |
233 | g_ptr[g_nbChars++] = '\n'; |
234 | } |
235 | if (g_nbChars < g_maxChars) { |
236 | g_ptr[g_nbChars++] = '\n'; |
237 | } |
238 | } |
239 | |
240 | /* It's "common" for lorem ipsum generators to start with the same first |
241 | * pre-defined sentence */ |
242 | static void generateFirstSentence(void) |
243 | { |
244 | int i; |
245 | for (i = 0; i < 18; i++) { |
246 | const char* word = kWords[i]; |
247 | const char* separator = " "; |
248 | if (i == 4) |
249 | separator = ", "; |
250 | if (i == 7) |
251 | separator = ", "; |
252 | generateWord(word, separator, i == 0); |
253 | } |
254 | generateWord(kWords[18], ". ", 0); |
255 | } |
256 | |
257 | size_t |
258 | LOREM_genBlock(void* buffer, size_t size, unsigned seed, int first, int fill) |
259 | { |
260 | g_ptr = (char*)buffer; |
261 | assert(size < INT_MAX); |
262 | g_maxChars = size; |
263 | g_nbChars = 0; |
264 | g_randRoot = seed; |
265 | if (g_distribCount == 0) { |
266 | init_word_distrib(kWords, kNbWords, kWeights, kNbWeights); |
267 | } |
268 | |
269 | if (first) { |
270 | generateFirstSentence(); |
271 | } |
272 | while (g_nbChars < g_maxChars) { |
273 | int sentencePerParagraph = about(7); |
274 | generateParagraph(sentencePerParagraph); |
275 | if (!fill) |
276 | break; /* only generate one paragraph in not-fill mode */ |
277 | } |
278 | g_ptr = NULL; |
279 | return g_nbChars; |
280 | } |
281 | |
/* LOREM_genBuffer():
 * Fills `buffer` entirely (`size` bytes) with lorem ipsum text,
 * starting with the pre-defined first sentence. */
void LOREM_genBuffer(void* buffer, size_t size, unsigned seed)
{
    int const startWithFirstSentence = 1;
    int const fillWholeBuffer        = 1;
    (void)LOREM_genBlock(
            buffer, size, seed, startWithFirstSentence, fillWholeBuffer);
}