2 * Copyright (c) Meta Platforms, Inc. and affiliates.
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
17 #include <stdlib.h> /* free() */
21 #include <curl/curl.h>
25 #define XXH_STATIC_LINKING_ONLY
/** Base URL of the GitHub release that hosts the regression data files. */
#define REGRESSION_RELEASE_URL \
    "https://github.com/facebook/zstd/releases/download/regression-data/"

/**
 * Builds the download URL of the regression data file `x`.
 * `x` must be a string literal: it is appended to the base URL by adjacent
 * string literal concatenation.
 */
#define REGRESSION_RELEASE(x) REGRESSION_RELEASE_URL x
37 .type = data_type_dir,
40 .url = REGRESSION_RELEASE("silesia.tar.zst"),
41 .xxhash64 = 0x48a199f92f93e977LL,
45 data_t silesia_tar = {
46 .name = "silesia.tar",
47 .type = data_type_file,
50 .url = REGRESSION_RELEASE("silesia.tar.zst"),
51 .xxhash64 = 0x48a199f92f93e977LL,
57 .type = data_type_dir,
60 .url = REGRESSION_RELEASE("github.tar.zst"),
61 .xxhash64 = 0xa9b1b44b020df292LL,
65 .url = REGRESSION_RELEASE("github.dict.zst"),
66 .xxhash64 = 0x1eddc6f737d3cb53LL,
73 .type = data_type_file,
76 .url = REGRESSION_RELEASE("github.tar.zst"),
77 .xxhash64 = 0xa9b1b44b020df292LL,
81 .url = REGRESSION_RELEASE("github.dict.zst"),
82 .xxhash64 = 0x1eddc6f737d3cb53LL,
87 static data_t* g_data[] = {
95 data_t const* const* data = (data_t const* const*)g_data;
101 int data_has_dict(data_t const* data) {
102 return data->dict.url != NULL;
106 * data buffer helper functions (documented in header).
109 data_buffer_t data_buffer_create(size_t const capacity) {
110 data_buffer_t buffer = {};
112 buffer.data = (uint8_t*)malloc(capacity);
113 if (buffer.data == NULL)
115 buffer.capacity = capacity;
119 data_buffer_t data_buffer_read(char const* filename) {
120 data_buffer_t buffer = {};
122 uint64_t const size = UTIL_getFileSize(filename);
123 if (size == UTIL_FILESIZE_UNKNOWN) {
124 fprintf(stderr, "unknown size for %s\n", filename);
128 buffer.data = (uint8_t*)malloc(size);
129 if (buffer.data == NULL) {
130 fprintf(stderr, "malloc failed\n");
133 buffer.capacity = size;
135 FILE* file = fopen(filename, "rb");
137 fprintf(stderr, "file null\n");
140 buffer.size = fread(buffer.data, 1, buffer.capacity, file);
142 if (buffer.size != buffer.capacity) {
143 fprintf(stderr, "read %zu != %zu\n", buffer.size, buffer.capacity);
150 memset(&buffer, 0, sizeof(buffer));
154 data_buffer_t data_buffer_get_data(data_t const* data) {
155 data_buffer_t const kEmptyBuffer = {};
157 if (data->type != data_type_file)
160 return data_buffer_read(data->data.path);
163 data_buffer_t data_buffer_get_dict(data_t const* data) {
164 data_buffer_t const kEmptyBuffer = {};
166 if (!data_has_dict(data))
169 return data_buffer_read(data->dict.path);
172 int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2) {
174 buffer1.size < buffer2.size ? buffer1.size : buffer2.size;
175 int const cmp = memcmp(buffer1.data, buffer2.data, size);
178 if (buffer1.size < buffer2.size)
180 if (buffer1.size == buffer2.size)
182 assert(buffer1.size > buffer2.size);
186 void data_buffer_free(data_buffer_t buffer) {
191 * data filenames helpers.
194 FileNamesTable* data_filenames_get(data_t const* data)
196 char const* const path = data->data.path;
197 return UTIL_createExpandedFNT(&path, 1, 0 /* followLinks */ );
201 * data buffers helpers.
204 data_buffers_t data_buffers_get(data_t const* data) {
205 data_buffers_t buffers = {.size = 0};
206 FileNamesTable* const filenames = data_filenames_get(data);
207 if (filenames == NULL) return buffers;
208 if (filenames->tableSize == 0) {
209 UTIL_freeFileNamesTable(filenames);
213 data_buffer_t* buffersPtr =
214 (data_buffer_t*)malloc(filenames->tableSize * sizeof(*buffersPtr));
215 if (buffersPtr == NULL) {
216 UTIL_freeFileNamesTable(filenames);
219 buffers.buffers = (data_buffer_t const*)buffersPtr;
220 buffers.size = filenames->tableSize;
222 for (size_t i = 0; i < filenames->tableSize; ++i) {
223 buffersPtr[i] = data_buffer_read(filenames->fileNames[i]);
224 if (buffersPtr[i].data == NULL) {
225 data_buffers_t const kEmptyBuffer = {};
226 data_buffers_free(buffers);
227 UTIL_freeFileNamesTable(filenames);
232 UTIL_freeFileNamesTable(filenames);
237 * Frees the data buffers.
239 void data_buffers_free(data_buffers_t buffers) {
240 free((data_buffer_t*)buffers.buffers);
244 * Initialization and download functions.
247 static char* g_data_dir = NULL;
/**
 * Creates `indir` and every missing parent directory, like `mkdir -p`.
 *
 * Directories are created with mode S_IRWXU. A component that already
 * exists as a directory is not an error.
 *
 * @returns 0 on success, EINVAL if `indir` is empty or cannot be duplicated,
 *          otherwise the errno of the failing mkdir().
 */
static int ensure_directory_exists(char const* indir) {
    /* The scan below starts at the second character, so an empty string
     * would be read past its terminator. */
    if (indir[0] == '\0')
        return EINVAL;

    char* const dir = strdup(indir);
    char* end = dir;
    int ret = 0;
    if (dir == NULL) {
        ret = EINVAL;
        goto out;
    }
    do {
        /* Find the next directory level. */
        for (++end; *end != '\0' && *end != '/'; ++end)
            ;
        /* End the string there, make the directory, and restore the string. */
        char const save = *end;
        *end = '\0';
        int const isdir = UTIL_isDirectory(dir);
        ret = mkdir(dir, S_IRWXU);
        /* Capture errno right away, before any later call can change it. */
        int const mkdirErrno = errno;
        *end = save;
        /* It's okay if the directory already exists. */
        if (ret == 0 || (mkdirErrno == EEXIST && isdir))
            continue;
        ret = mkdirErrno;
        fprintf(stderr, "mkdir() failed\n");
        goto out;
    } while (*end != '\0');

    ret = 0;
out:
    free(dir);
    return ret;
}
/**
 * Joins up to three strings into one freshly allocated string.
 *
 * `str1` and `str2` are required; `str3` may be NULL, in which case it
 * contributes nothing.
 *
 * @returns The concatenation, which the caller must free(), or NULL if the
 *          allocation fails.
 */
static char* cat3(char const* str1, char const* str2, char const* str3) {
    size_t const len1 = strlen(str1);
    size_t const len2 = strlen(str2);
    size_t const len3 = (str3 != NULL) ? strlen(str3) : 0;
    size_t const total = len1 + len2 + len3;
    char* const out = (char*)malloc(total + 1);

    if (out == NULL)
        return NULL;

    /* Lay the pieces out back to back, then terminate the result. */
    memcpy(out, str1, len1);
    memcpy(out + len1, str2, len2);
    if (len3 != 0)
        memcpy(out + len1 + len2, str3, len3);
    out[total] = '\0';
    assert(strlen(out) == total);
    return out;
}
/**
 * Joins two strings into one freshly allocated string.
 * Forwards to cat3() with no third string.
 */
static char* cat2(char const* str1, char const* str2) {
    char const* const noThird = NULL;
    return cat3(str1, str2, noThird);
}
304 * State needed by the curl callback.
305 * It takes data from curl, hashes it, and writes it to the file.
309 XXH64_state_t xxhash64;
313 /** Create the curl state. */
314 static curl_data_t curl_data_create(
315 data_resource_t const* resource,
317 curl_data_t cdata = {};
319 XXH64_reset(&cdata.xxhash64, 0);
321 assert(UTIL_isDirectory(g_data_dir));
323 if (type == data_type_file) {
324 /* Decompress the resource and store to the path. */
325 char* cmd = cat3("zstd -dqfo '", resource->path, "'");
327 cdata.error = ENOMEM;
330 cdata.file = popen(cmd, "w");
333 /* Decompress and extract the resource to the cache directory. */
334 char* cmd = cat3("zstd -dc | tar -x -C '", g_data_dir, "'");
336 cdata.error = ENOMEM;
339 cdata.file = popen(cmd, "w");
342 if (cdata.file == NULL) {
349 /** Free the curl state. */
350 static int curl_data_free(curl_data_t cdata) {
351 return pclose(cdata.file);
354 /** curl callback. Updates the hash, and writes to the file. */
355 static size_t curl_write(void* data, size_t size, size_t count, void* ptr) {
356 curl_data_t* cdata = (curl_data_t*)ptr;
357 size_t const written = fwrite(data, size, count, cdata->file);
358 XXH64_update(&cdata->xxhash64, data, written * size);
362 static int curl_download_resource(
364 data_resource_t const* resource,
367 /* Download the data. */
368 if (curl_easy_setopt(curl, CURLOPT_URL, resource->url) != 0)
370 if (curl_easy_setopt(curl, CURLOPT_WRITEDATA, &cdata) != 0)
372 cdata = curl_data_create(resource, type);
373 if (cdata.error != 0)
375 int const curl_err = curl_easy_perform(curl);
376 int const close_err = curl_data_free(cdata);
380 "downloading '%s' for '%s' failed\n",
386 fprintf(stderr, "writing data to '%s' failed\n", resource->path);
389 /* check that the file exists. */
390 if (type == data_type_file && !UTIL_isRegularFile(resource->path)) {
391 fprintf(stderr, "output file '%s' does not exist\n", resource->path);
394 if (type == data_type_dir && !UTIL_isDirectory(resource->path)) {
396 stderr, "output directory '%s' does not exist\n", resource->path);
399 /* Check that the hash matches. */
400 if (XXH64_digest(&cdata.xxhash64) != resource->xxhash64) {
403 "checksum does not match: 0x%llxLL != 0x%llxLL\n",
404 (unsigned long long)XXH64_digest(&cdata.xxhash64),
405 (unsigned long long)resource->xxhash64);
412 /** Download a single data object. */
413 static int curl_download_datum(CURL* curl, data_t const* data) {
415 ret = curl_download_resource(curl, &data->data, data->type);
418 if (data_has_dict(data)) {
419 ret = curl_download_resource(curl, &data->dict, data_type_file);
426 /** Download all the data. */
427 static int curl_download_data(data_t const* const* data) {
428 if (curl_global_init(CURL_GLOBAL_ALL) != 0)
431 curl_data_t cdata = {};
432 CURL* curl = curl_easy_init();
438 if (curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L) != 0)
440 if (curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L) != 0)
442 if (curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write) != 0)
445 assert(data != NULL);
446 for (; *data != NULL; ++data) {
447 if (curl_download_datum(curl, *data) != 0)
453 curl_easy_cleanup(curl);
454 curl_global_cleanup();
458 /** Fill the path member variable of the data objects. */
459 static int data_create_paths(data_t* const* data, char const* dir) {
460 size_t const dirlen = strlen(dir);
461 assert(data != NULL);
462 for (; *data != NULL; ++data) {
463 data_t* const datum = *data;
464 datum->data.path = cat3(dir, "/", datum->name);
465 if (datum->data.path == NULL)
467 if (data_has_dict(datum)) {
468 datum->dict.path = cat2(datum->data.path, ".dict");
469 if (datum->dict.path == NULL)
476 /** Free the path member variable of the data objects. */
477 static void data_free_paths(data_t* const* data) {
478 assert(data != NULL);
479 for (; *data != NULL; ++data) {
480 data_t* datum = *data;
481 free((void*)datum->data.path);
482 free((void*)datum->dict.path);
483 datum->data.path = NULL;
484 datum->dict.path = NULL;
488 static char const kStampName[] = "STAMP";
490 static void xxh_update_le(XXH64_state_t* state, uint64_t data) {
491 if (!MEM_isLittleEndian())
492 data = MEM_swap64(data);
493 XXH64_update(state, &data, sizeof(data));
496 /** Hash the data to create the stamp. */
497 static uint64_t stamp_hash(data_t const* const* data) {
500 XXH64_reset(&state, 0);
501 assert(data != NULL);
502 for (; *data != NULL; ++data) {
503 data_t const* datum = *data;
504 /* We don't care about the URL that we fetch from. */
505 /* The path is derived from the name. */
506 XXH64_update(&state, datum->name, strlen(datum->name));
507 xxh_update_le(&state, datum->data.xxhash64);
508 xxh_update_le(&state, datum->dict.xxhash64);
509 xxh_update_le(&state, datum->type);
511 return XXH64_digest(&state);
514 /** Check if the stamp matches the stamp in the cache directory. */
515 static int stamp_check(char const* dir, data_t const* const* data) {
516 char* stamp = cat3(dir, "/", kStampName);
517 uint64_t const expected = stamp_hash(data);
518 XXH64_canonical_t actual;
519 FILE* stampfile = NULL;
524 if (!UTIL_isRegularFile(stamp)) {
525 fprintf(stderr, "stamp does not exist: recreating the data cache\n");
529 stampfile = fopen(stamp, "rb");
530 if (stampfile == NULL) {
531 fprintf(stderr, "could not open stamp: recreating the data cache\n");
536 if ((b = fread(&actual, sizeof(actual), 1, stampfile)) != 1) {
537 fprintf(stderr, "invalid stamp: recreating the data cache\n");
541 matches = (expected == XXH64_hashFromCanonical(&actual));
543 fprintf(stderr, "stamp matches: reusing the cached data\n");
545 fprintf(stderr, "stamp does not match: recreating the data cache\n");
549 if (stampfile != NULL)
554 /** On success write a new stamp, on failure delete the old stamp. */
556 stamp_write(char const* dir, data_t const* const* data, int const data_err) {
557 char* stamp = cat3(dir, "/", kStampName);
558 FILE* stampfile = NULL;
568 XXH64_canonical_t hash;
570 XXH64_canonicalFromHash(&hash, stamp_hash(data));
572 stampfile = fopen(stamp, "wb");
573 if (stampfile == NULL)
575 if (fwrite(&hash, sizeof(hash), 1, stampfile) != 1)
578 fprintf(stderr, "stamped new data cache\n");
584 if (stampfile != NULL)
589 int data_init(char const* dir) {
595 /* This must be first to simplify logic. */
596 err = ensure_directory_exists(dir);
600 /* Save the cache directory. */
601 g_data_dir = strdup(dir);
602 if (g_data_dir == NULL)
605 err = data_create_paths(g_data, dir);
609 /* If the stamp matches then we are good to go.
610 * This must be called before any modifications to the data cache.
611 * After this point, we MUST call stamp_write() to update the STAMP,
612 * since we've updated the data cache.
614 if (stamp_check(dir, data))
617 err = curl_download_data(data);
622 /* This must be last, since it must know if data_init() succeeded. */
623 stamp_write(dir, data, err);
627 void data_finish(void) {
628 data_free_paths(g_data);