648db22b |
1 | /* |
2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | * All rights reserved. |
4 | * |
5 | * This source code is licensed under both the BSD-style license (found in the |
6 | * LICENSE file in the root directory of this source tree) and the GPLv2 (found |
7 | * in the COPYING file in the root directory of this source tree). |
8 | * You may select, at your option, one of the above-listed licenses. |
9 | */ |
10 | |
11 | #include "data.h" |
12 | |
13 | #include <assert.h> |
14 | #include <errno.h> |
15 | #include <stdio.h> |
16 | #include <string.h> |
17 | #include <stdlib.h> /* free() */ |
18 | |
19 | #include <sys/stat.h> |
20 | |
21 | #include <curl/curl.h> |
22 | |
23 | #include "mem.h" |
24 | #include "util.h" |
25 | #define XXH_STATIC_LINKING_ONLY |
26 | #include "xxhash.h" |
27 | |
28 | /** |
29 | * Data objects |
30 | */ |
31 | |
32 | #define REGRESSION_RELEASE(x) \ |
33 | "https://github.com/facebook/zstd/releases/download/regression-data/" x |
34 | |
/* The silesia corpus, extracted into a directory of files.
 * The xxhash64 is the checksum of the downloaded .zst payload
 * (hashed by curl_write before decompression). */
data_t silesia = {
    .name = "silesia",
    .type = data_type_dir,
    .data =
        {
            .url = REGRESSION_RELEASE("silesia.tar.zst"),
            .xxhash64 = 0x48a199f92f93e977LL,
        },
};
44 | |
/* The silesia corpus kept as a single tar file (same download and
 * checksum as `silesia`, but stored as one file instead of extracted). */
data_t silesia_tar = {
    .name = "silesia.tar",
    .type = data_type_file,
    .data =
        {
            .url = REGRESSION_RELEASE("silesia.tar.zst"),
            .xxhash64 = 0x48a199f92f93e977LL,
        },
};
54 | |
/* The github corpus, extracted into a directory of files, plus a
 * dictionary. Each xxhash64 is the checksum of the corresponding
 * compressed download. */
data_t github = {
    .name = "github",
    .type = data_type_dir,
    .data =
        {
            .url = REGRESSION_RELEASE("github.tar.zst"),
            .xxhash64 = 0xa9b1b44b020df292LL,
        },
    .dict =
        {
            .url = REGRESSION_RELEASE("github.dict.zst"),
            .xxhash64 = 0x1eddc6f737d3cb53LL,

        },
};
70 | |
/* The github corpus as a single tar file, plus the same dictionary as
 * `github` (same downloads and checksums). */
data_t github_tar = {
    .name = "github.tar",
    .type = data_type_file,
    .data =
        {
            .url = REGRESSION_RELEASE("github.tar.zst"),
            .xxhash64 = 0xa9b1b44b020df292LL,
        },
    .dict =
        {
            .url = REGRESSION_RELEASE("github.dict.zst"),
            .xxhash64 = 0x1eddc6f737d3cb53LL,

        },
};
86 | |
/* NULL-terminated registry of every data object; mutable so that
 * data_create_paths()/data_free_paths() can fill in the paths. */
static data_t* g_data[] = {
    &silesia,
    &silesia_tar,
    &github,
    &github_tar,
    NULL,
};

/* Public, read-only view of g_data (declared in data.h). */
data_t const* const* data = (data_t const* const*)g_data;
96 | |
97 | /** |
98 | * data helpers. |
99 | */ |
100 | |
101 | int data_has_dict(data_t const* data) { |
102 | return data->dict.url != NULL; |
103 | } |
104 | |
105 | /** |
106 | * data buffer helper functions (documented in header). |
107 | */ |
108 | |
109 | data_buffer_t data_buffer_create(size_t const capacity) { |
110 | data_buffer_t buffer = {}; |
111 | |
112 | buffer.data = (uint8_t*)malloc(capacity); |
113 | if (buffer.data == NULL) |
114 | return buffer; |
115 | buffer.capacity = capacity; |
116 | return buffer; |
117 | } |
118 | |
119 | data_buffer_t data_buffer_read(char const* filename) { |
120 | data_buffer_t buffer = {}; |
121 | |
122 | uint64_t const size = UTIL_getFileSize(filename); |
123 | if (size == UTIL_FILESIZE_UNKNOWN) { |
124 | fprintf(stderr, "unknown size for %s\n", filename); |
125 | return buffer; |
126 | } |
127 | |
128 | buffer.data = (uint8_t*)malloc(size); |
129 | if (buffer.data == NULL) { |
130 | fprintf(stderr, "malloc failed\n"); |
131 | return buffer; |
132 | } |
133 | buffer.capacity = size; |
134 | |
135 | FILE* file = fopen(filename, "rb"); |
136 | if (file == NULL) { |
137 | fprintf(stderr, "file null\n"); |
138 | goto err; |
139 | } |
140 | buffer.size = fread(buffer.data, 1, buffer.capacity, file); |
141 | fclose(file); |
142 | if (buffer.size != buffer.capacity) { |
143 | fprintf(stderr, "read %zu != %zu\n", buffer.size, buffer.capacity); |
144 | goto err; |
145 | } |
146 | |
147 | return buffer; |
148 | err: |
149 | free(buffer.data); |
150 | memset(&buffer, 0, sizeof(buffer)); |
151 | return buffer; |
152 | } |
153 | |
154 | data_buffer_t data_buffer_get_data(data_t const* data) { |
155 | data_buffer_t const kEmptyBuffer = {}; |
156 | |
157 | if (data->type != data_type_file) |
158 | return kEmptyBuffer; |
159 | |
160 | return data_buffer_read(data->data.path); |
161 | } |
162 | |
163 | data_buffer_t data_buffer_get_dict(data_t const* data) { |
164 | data_buffer_t const kEmptyBuffer = {}; |
165 | |
166 | if (!data_has_dict(data)) |
167 | return kEmptyBuffer; |
168 | |
169 | return data_buffer_read(data->dict.path); |
170 | } |
171 | |
172 | int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2) { |
173 | size_t const size = |
174 | buffer1.size < buffer2.size ? buffer1.size : buffer2.size; |
175 | int const cmp = memcmp(buffer1.data, buffer2.data, size); |
176 | if (cmp != 0) |
177 | return cmp; |
178 | if (buffer1.size < buffer2.size) |
179 | return -1; |
180 | if (buffer1.size == buffer2.size) |
181 | return 0; |
182 | assert(buffer1.size > buffer2.size); |
183 | return 1; |
184 | } |
185 | |
/* Releases the buffer's backing allocation.
 * Safe on an empty buffer: free(NULL) is a no-op. */
void data_buffer_free(data_buffer_t buffer) {
    free(buffer.data);
}
189 | |
190 | /** |
191 | * data filenames helpers. |
192 | */ |
193 | |
/* Expands the datum's path into the table of file names it contains
 * (the file itself, or the files under the directory), without
 * following symlinks. Caller frees with UTIL_freeFileNamesTable(). */
FileNamesTable* data_filenames_get(data_t const* data)
{
    char const* const path = data->data.path;
    return UTIL_createExpandedFNT(&path, 1, 0 /* followLinks */ );
}
199 | |
200 | /** |
201 | * data buffers helpers. |
202 | */ |
203 | |
204 | data_buffers_t data_buffers_get(data_t const* data) { |
205 | data_buffers_t buffers = {.size = 0}; |
206 | FileNamesTable* const filenames = data_filenames_get(data); |
207 | if (filenames == NULL) return buffers; |
208 | if (filenames->tableSize == 0) { |
209 | UTIL_freeFileNamesTable(filenames); |
210 | return buffers; |
211 | } |
212 | |
213 | data_buffer_t* buffersPtr = |
214 | (data_buffer_t*)malloc(filenames->tableSize * sizeof(*buffersPtr)); |
215 | if (buffersPtr == NULL) { |
216 | UTIL_freeFileNamesTable(filenames); |
217 | return buffers; |
218 | } |
219 | buffers.buffers = (data_buffer_t const*)buffersPtr; |
220 | buffers.size = filenames->tableSize; |
221 | |
222 | for (size_t i = 0; i < filenames->tableSize; ++i) { |
223 | buffersPtr[i] = data_buffer_read(filenames->fileNames[i]); |
224 | if (buffersPtr[i].data == NULL) { |
225 | data_buffers_t const kEmptyBuffer = {}; |
226 | data_buffers_free(buffers); |
227 | UTIL_freeFileNamesTable(filenames); |
228 | return kEmptyBuffer; |
229 | } |
230 | } |
231 | |
232 | UTIL_freeFileNamesTable(filenames); |
233 | return buffers; |
234 | } |
235 | |
236 | /** |
237 | * Frees the data buffers. |
238 | */ |
239 | void data_buffers_free(data_buffers_t buffers) { |
240 | free((data_buffer_t*)buffers.buffers); |
241 | } |
242 | |
243 | /** |
244 | * Initialization and download functions. |
245 | */ |
246 | |
247 | static char* g_data_dir = NULL; |
248 | |
/* mkdir -p: creates indir and any missing parent directories.
 * Returns 0 on success, or an errno value on failure. */
static int ensure_directory_exists(char const* indir) {
    char* const dir = strdup(indir);
    char* end = dir;
    int ret = 0;
    if (dir == NULL) {
        /* strdup() failure is out-of-memory, not an invalid argument. */
        ret = ENOMEM;
        goto out;
    }
    do {
        /* Find the next directory level. */
        for (++end; *end != '\0' && *end != '/'; ++end)
            ;
        /* End the string there, make the directory, and restore the string. */
        char const save = *end;
        *end = '\0';
        int const isdir = UTIL_isDirectory(dir);
        ret = mkdir(dir, S_IRWXU);
        *end = save;
        /* It's okay if the directory already exists. */
        if (ret == 0 || (errno == EEXIST && isdir))
            continue;
        ret = errno;
        fprintf(stderr, "mkdir() failed\n");
        goto out;
    } while (*end != '\0');

    ret = 0;
out:
    free(dir);
    return ret;
}
281 | |
/** Concatenates up to 3 strings into a freshly malloc'd, NUL-terminated
 * buffer (str3 may be NULL to concatenate only two). Returns NULL on
 * allocation failure; the caller owns and frees the result. */
static char* cat3(char const* str1, char const* str2, char const* str3) {
    size_t const len1 = strlen(str1);
    size_t const len2 = strlen(str2);
    size_t const len3 = str3 == NULL ? 0 : strlen(str3);
    size_t const total = len1 + len2 + len3;
    char* const dst = (char*)malloc(total + 1);
    if (dst == NULL)
        return NULL;
    memcpy(dst, str1, len1);
    memcpy(dst + len1, str2, len2);
    if (str3 != NULL)
        memcpy(dst + len1 + len2, str3, len3);
    dst[total] = '\0';
    assert(strlen(dst) == total);
    return dst;
}
298 | |
/** Concatenates 2 strings into a new malloc'd buffer (see cat3()).
 * Returns NULL on allocation failure; the caller frees the result. */
static char* cat2(char const* str1, char const* str2) {
    return cat3(str1, str2, NULL);
}
302 | |
303 | /** |
304 | * State needed by the curl callback. |
305 | * It takes data from curl, hashes it, and writes it to the file. |
306 | */ |
307 | typedef struct { |
308 | FILE* file; |
309 | XXH64_state_t xxhash64; |
310 | int error; |
311 | } curl_data_t; |
312 | |
313 | /** Create the curl state. */ |
314 | static curl_data_t curl_data_create( |
315 | data_resource_t const* resource, |
316 | data_type_t type) { |
317 | curl_data_t cdata = {}; |
318 | |
319 | XXH64_reset(&cdata.xxhash64, 0); |
320 | |
321 | assert(UTIL_isDirectory(g_data_dir)); |
322 | |
323 | if (type == data_type_file) { |
324 | /* Decompress the resource and store to the path. */ |
325 | char* cmd = cat3("zstd -dqfo '", resource->path, "'"); |
326 | if (cmd == NULL) { |
327 | cdata.error = ENOMEM; |
328 | return cdata; |
329 | } |
330 | cdata.file = popen(cmd, "w"); |
331 | free(cmd); |
332 | } else { |
333 | /* Decompress and extract the resource to the cache directory. */ |
334 | char* cmd = cat3("zstd -dc | tar -x -C '", g_data_dir, "'"); |
335 | if (cmd == NULL) { |
336 | cdata.error = ENOMEM; |
337 | return cdata; |
338 | } |
339 | cdata.file = popen(cmd, "w"); |
340 | free(cmd); |
341 | } |
342 | if (cdata.file == NULL) { |
343 | cdata.error = errno; |
344 | } |
345 | |
346 | return cdata; |
347 | } |
348 | |
/** Free the curl state: close the subprocess pipe and return its
 * pclose() status (non-zero if the subprocess failed).
 * Precondition: cdata.file is non-NULL — callers only reach this after
 * curl_data_create() succeeded (pclose(NULL) would be UB). */
static int curl_data_free(curl_data_t cdata) {
    return pclose(cdata.file);
}
353 | |
354 | /** curl callback. Updates the hash, and writes to the file. */ |
355 | static size_t curl_write(void* data, size_t size, size_t count, void* ptr) { |
356 | curl_data_t* cdata = (curl_data_t*)ptr; |
357 | size_t const written = fwrite(data, size, count, cdata->file); |
358 | XXH64_update(&cdata->xxhash64, data, written * size); |
359 | return written; |
360 | } |
361 | |
/** Downloads one resource (payload or dictionary): streams it through
 * the decompression subprocess, then verifies that the output exists
 * and that the download's checksum matches.
 * Returns 0 on success or an errno-style error code. */
static int curl_download_resource(
    CURL* curl,
    data_resource_t const* resource,
    data_type_t type) {
    curl_data_t cdata;
    /* Download the data. */
    /* NOTE: WRITEDATA is set while cdata is still uninitialized; curl
     * only dereferences the pointer during curl_easy_perform(), which
     * runs after curl_data_create() fills it in. */
    if (curl_easy_setopt(curl, CURLOPT_URL, resource->url) != 0)
        return EINVAL;
    if (curl_easy_setopt(curl, CURLOPT_WRITEDATA, &cdata) != 0)
        return EINVAL;
    cdata = curl_data_create(resource, type);
    if (cdata.error != 0)
        return cdata.error;
    /* Always close the pipe (reaping the subprocess) even if the
     * transfer failed; report the transfer error first. */
    int const curl_err = curl_easy_perform(curl);
    int const close_err = curl_data_free(cdata);
    if (curl_err) {
        fprintf(
            stderr,
            "downloading '%s' for '%s' failed\n",
            resource->url,
            resource->path);
        return EIO;
    }
    if (close_err) {
        fprintf(stderr, "writing data to '%s' failed\n", resource->path);
        return EIO;
    }
    /* check that the file exists. */
    if (type == data_type_file && !UTIL_isRegularFile(resource->path)) {
        fprintf(stderr, "output file '%s' does not exist\n", resource->path);
        return EIO;
    }
    if (type == data_type_dir && !UTIL_isDirectory(resource->path)) {
        fprintf(
            stderr, "output directory '%s' does not exist\n", resource->path);
        return EIO;
    }
    /* Check that the hash matches. */
    if (XXH64_digest(&cdata.xxhash64) != resource->xxhash64) {
        fprintf(
            stderr,
            "checksum does not match: 0x%llxLL != 0x%llxLL\n",
            (unsigned long long)XXH64_digest(&cdata.xxhash64),
            (unsigned long long)resource->xxhash64);
        return EINVAL;
    }

    return 0;
}
411 | |
412 | /** Download a single data object. */ |
413 | static int curl_download_datum(CURL* curl, data_t const* data) { |
414 | int ret; |
415 | ret = curl_download_resource(curl, &data->data, data->type); |
416 | if (ret != 0) |
417 | return ret; |
418 | if (data_has_dict(data)) { |
419 | ret = curl_download_resource(curl, &data->dict, data_type_file); |
420 | if (ret != 0) |
421 | return ret; |
422 | } |
423 | return ret; |
424 | } |
425 | |
426 | /** Download all the data. */ |
427 | static int curl_download_data(data_t const* const* data) { |
428 | if (curl_global_init(CURL_GLOBAL_ALL) != 0) |
429 | return EFAULT; |
430 | |
431 | curl_data_t cdata = {}; |
432 | CURL* curl = curl_easy_init(); |
433 | int err = EFAULT; |
434 | |
435 | if (curl == NULL) |
436 | return EFAULT; |
437 | |
438 | if (curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L) != 0) |
439 | goto out; |
440 | if (curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L) != 0) |
441 | goto out; |
442 | if (curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write) != 0) |
443 | goto out; |
444 | |
445 | assert(data != NULL); |
446 | for (; *data != NULL; ++data) { |
447 | if (curl_download_datum(curl, *data) != 0) |
448 | goto out; |
449 | } |
450 | |
451 | err = 0; |
452 | out: |
453 | curl_easy_cleanup(curl); |
454 | curl_global_cleanup(); |
455 | return err; |
456 | } |
457 | |
458 | /** Fill the path member variable of the data objects. */ |
459 | static int data_create_paths(data_t* const* data, char const* dir) { |
460 | size_t const dirlen = strlen(dir); |
461 | assert(data != NULL); |
462 | for (; *data != NULL; ++data) { |
463 | data_t* const datum = *data; |
464 | datum->data.path = cat3(dir, "/", datum->name); |
465 | if (datum->data.path == NULL) |
466 | return ENOMEM; |
467 | if (data_has_dict(datum)) { |
468 | datum->dict.path = cat2(datum->data.path, ".dict"); |
469 | if (datum->dict.path == NULL) |
470 | return ENOMEM; |
471 | } |
472 | } |
473 | return 0; |
474 | } |
475 | |
476 | /** Free the path member variable of the data objects. */ |
477 | static void data_free_paths(data_t* const* data) { |
478 | assert(data != NULL); |
479 | for (; *data != NULL; ++data) { |
480 | data_t* datum = *data; |
481 | free((void*)datum->data.path); |
482 | free((void*)datum->dict.path); |
483 | datum->data.path = NULL; |
484 | datum->dict.path = NULL; |
485 | } |
486 | } |
487 | |
488 | static char const kStampName[] = "STAMP"; |
489 | |
490 | static void xxh_update_le(XXH64_state_t* state, uint64_t data) { |
491 | if (!MEM_isLittleEndian()) |
492 | data = MEM_swap64(data); |
493 | XXH64_update(state, &data, sizeof(data)); |
494 | } |
495 | |
496 | /** Hash the data to create the stamp. */ |
497 | static uint64_t stamp_hash(data_t const* const* data) { |
498 | XXH64_state_t state; |
499 | |
500 | XXH64_reset(&state, 0); |
501 | assert(data != NULL); |
502 | for (; *data != NULL; ++data) { |
503 | data_t const* datum = *data; |
504 | /* We don't care about the URL that we fetch from. */ |
505 | /* The path is derived from the name. */ |
506 | XXH64_update(&state, datum->name, strlen(datum->name)); |
507 | xxh_update_le(&state, datum->data.xxhash64); |
508 | xxh_update_le(&state, datum->dict.xxhash64); |
509 | xxh_update_le(&state, datum->type); |
510 | } |
511 | return XXH64_digest(&state); |
512 | } |
513 | |
514 | /** Check if the stamp matches the stamp in the cache directory. */ |
515 | static int stamp_check(char const* dir, data_t const* const* data) { |
516 | char* stamp = cat3(dir, "/", kStampName); |
517 | uint64_t const expected = stamp_hash(data); |
518 | XXH64_canonical_t actual; |
519 | FILE* stampfile = NULL; |
520 | int matches = 0; |
521 | |
522 | if (stamp == NULL) |
523 | goto out; |
524 | if (!UTIL_isRegularFile(stamp)) { |
525 | fprintf(stderr, "stamp does not exist: recreating the data cache\n"); |
526 | goto out; |
527 | } |
528 | |
529 | stampfile = fopen(stamp, "rb"); |
530 | if (stampfile == NULL) { |
531 | fprintf(stderr, "could not open stamp: recreating the data cache\n"); |
532 | goto out; |
533 | } |
534 | |
535 | size_t b; |
536 | if ((b = fread(&actual, sizeof(actual), 1, stampfile)) != 1) { |
537 | fprintf(stderr, "invalid stamp: recreating the data cache\n"); |
538 | goto out; |
539 | } |
540 | |
541 | matches = (expected == XXH64_hashFromCanonical(&actual)); |
542 | if (matches) |
543 | fprintf(stderr, "stamp matches: reusing the cached data\n"); |
544 | else |
545 | fprintf(stderr, "stamp does not match: recreating the data cache\n"); |
546 | |
547 | out: |
548 | free(stamp); |
549 | if (stampfile != NULL) |
550 | fclose(stampfile); |
551 | return matches; |
552 | } |
553 | |
554 | /** On success write a new stamp, on failure delete the old stamp. */ |
555 | static int |
556 | stamp_write(char const* dir, data_t const* const* data, int const data_err) { |
557 | char* stamp = cat3(dir, "/", kStampName); |
558 | FILE* stampfile = NULL; |
559 | int err = EIO; |
560 | |
561 | if (stamp == NULL) |
562 | return ENOMEM; |
563 | |
564 | if (data_err != 0) { |
565 | err = data_err; |
566 | goto out; |
567 | } |
568 | XXH64_canonical_t hash; |
569 | |
570 | XXH64_canonicalFromHash(&hash, stamp_hash(data)); |
571 | |
572 | stampfile = fopen(stamp, "wb"); |
573 | if (stampfile == NULL) |
574 | goto out; |
575 | if (fwrite(&hash, sizeof(hash), 1, stampfile) != 1) |
576 | goto out; |
577 | err = 0; |
578 | fprintf(stderr, "stamped new data cache\n"); |
579 | out: |
580 | if (err != 0) |
581 | /* Ignore errors. */ |
582 | unlink(stamp); |
583 | free(stamp); |
584 | if (stampfile != NULL) |
585 | fclose(stampfile); |
586 | return err; |
587 | } |
588 | |
589 | int data_init(char const* dir) { |
590 | int err; |
591 | |
592 | if (dir == NULL) |
593 | return EINVAL; |
594 | |
595 | /* This must be first to simplify logic. */ |
596 | err = ensure_directory_exists(dir); |
597 | if (err != 0) |
598 | return err; |
599 | |
600 | /* Save the cache directory. */ |
601 | g_data_dir = strdup(dir); |
602 | if (g_data_dir == NULL) |
603 | return ENOMEM; |
604 | |
605 | err = data_create_paths(g_data, dir); |
606 | if (err != 0) |
607 | return err; |
608 | |
609 | /* If the stamp matches then we are good to go. |
610 | * This must be called before any modifications to the data cache. |
611 | * After this point, we MUST call stamp_write() to update the STAMP, |
612 | * since we've updated the data cache. |
613 | */ |
614 | if (stamp_check(dir, data)) |
615 | return 0; |
616 | |
617 | err = curl_download_data(data); |
618 | if (err != 0) |
619 | goto out; |
620 | |
621 | out: |
622 | /* This must be last, since it must know if data_init() succeeded. */ |
623 | stamp_write(dir, data, err); |
624 | return err; |
625 | } |
626 | |
627 | void data_finish(void) { |
628 | data_free_paths(g_data); |
629 | free(g_data_dir); |
630 | g_data_dir = NULL; |
631 | } |