| 1 | /* gznorm.c -- normalize a gzip stream |
| 2 | * Copyright (C) 2018 Mark Adler |
| 3 | * For conditions of distribution and use, see copyright notice in zlib.h |
| 4 | * Version 1.0 7 Oct 2018 Mark Adler */ |
| 5 | |
| 6 | // gznorm takes a gzip stream, potentially containing multiple members, and |
| 7 | // converts it to a gzip stream with a single member. In addition the gzip |
| 8 | // header is normalized, removing the file name and time stamp, and setting the |
| 9 | // other header contents (XFL, OS) to fixed values. gznorm does not recompress |
| 10 | // the data, so it is fast, but no advantage is gained from the history that |
| 11 | // could be available across member boundaries. |
| 12 | |
| 13 | #include <stdio.h> // fread, fwrite, putc, fflush, ferror, fprintf, |
| 14 | // vsnprintf, stdout, stderr, NULL, FILE |
| 15 | #include <stdlib.h> // malloc, free |
| 16 | #include <string.h> // strerror |
| 17 | #include <errno.h> // errno |
| 18 | #include <stdarg.h> // va_list, va_start, va_end |
| 19 | #include "zlib.h" // inflateInit2, inflate, inflateReset, inflateEnd, |
| 20 | // z_stream, z_off_t, crc32_combine, Z_NULL, Z_BLOCK, |
| 21 | // Z_OK, Z_STREAM_END, Z_BUF_ERROR, Z_DATA_ERROR, |
| 22 | // Z_MEM_ERROR |
| 23 | |
| 24 | #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__) |
| 25 | # include <fcntl.h> |
| 26 | # include <io.h> |
| 27 | # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY) |
| 28 | #else |
| 29 | # define SET_BINARY_MODE(file) |
| 30 | #endif |
| 31 | |
| 32 | #define local static |
| 33 | |
| 34 | // printf to an allocated string. Return the string, or NULL if the printf or |
| 35 | // allocation fails. |
| 36 | local char *aprintf(char *fmt, ...) { |
| 37 | // Get the length of the result of the printf. |
| 38 | va_list args; |
| 39 | va_start(args, fmt); |
| 40 | int len = vsnprintf(NULL, 0, fmt, args); |
| 41 | va_end(args); |
| 42 | if (len < 0) |
| 43 | return NULL; |
| 44 | |
| 45 | // Allocate the required space and printf to it. |
| 46 | char *str = malloc(len + 1); |
| 47 | if (str == NULL) |
| 48 | return NULL; |
| 49 | va_start(args, fmt); |
| 50 | vsnprintf(str, len + 1, fmt, args); |
| 51 | va_end(args); |
| 52 | return str; |
| 53 | } |
| 54 | |
| 55 | // Return with an error, putting an allocated error message in *err. Doing an |
| 56 | // inflateEnd() on an already ended state, or one with state set to Z_NULL, is |
| 57 | // permitted. |
| 58 | #define BYE(...) \ |
| 59 | do { \ |
| 60 | inflateEnd(&strm); \ |
| 61 | *err = aprintf(__VA_ARGS__); \ |
| 62 | return 1; \ |
| 63 | } while (0) |
| 64 | |
| 65 | // Chunk size for buffered reads and for decompression. Twice this many bytes |
| 66 | // will be allocated on the stack by gzip_normalize(). Must fit in an unsigned. |
| 67 | #define CHUNK 16384 |
| 68 | |
| 69 | // Read a gzip stream from in and write an equivalent normalized gzip stream to |
| 70 | // out. If given no input, an empty gzip stream will be written. If successful, |
| 71 | // 0 is returned, and *err is set to NULL. On error, 1 is returned, where the |
| 72 | // details of the error are returned in *err, a pointer to an allocated string. |
| 73 | // |
| 74 | // The input may be a stream with multiple gzip members, which is converted to |
| 75 | // a single gzip member on the output. Each gzip member is decompressed at the |
| 76 | // level of deflate blocks. This enables clearing the last-block bit, shifting |
| 77 | // the compressed data to concatenate to the previous member's compressed data, |
| 78 | // which can end at an arbitrary bit boundary, and identifying stored blocks in |
| 79 | // order to resynchronize those to byte boundaries. The deflate compressed data |
| 80 | // is terminated with a 10-bit empty fixed block. If any members on the input |
| 81 | // end with a 10-bit empty fixed block, then that block is excised from the |
| 82 | // stream. This avoids appending empty fixed blocks for every normalization, |
| 83 | // and assures that gzip_normalize applied a second time will not change the |
| 84 | // input. The pad bits after stored block headers and after the final deflate |
| 85 | // block are all forced to zeros. |
| 86 | local int gzip_normalize(FILE *in, FILE *out, char **err) { |
| 87 | // initialize the inflate engine to process a gzip member |
| 88 | z_stream strm; |
| 89 | strm.zalloc = Z_NULL; |
| 90 | strm.zfree = Z_NULL; |
| 91 | strm.opaque = Z_NULL; |
| 92 | strm.avail_in = 0; |
| 93 | strm.next_in = Z_NULL; |
| 94 | if (inflateInit2(&strm, 15 + 16) != Z_OK) |
| 95 | BYE("out of memory"); |
| 96 | |
| 97 | // State while processing the input gzip stream. |
| 98 | enum { // BETWEEN -> HEAD -> BLOCK -> TAIL -> BETWEEN -> ... |
| 99 | BETWEEN, // between gzip members (must end in this state) |
| 100 | HEAD, // reading a gzip header |
| 101 | BLOCK, // reading deflate blocks |
| 102 | TAIL // reading a gzip trailer |
| 103 | } state = BETWEEN; // current component being processed |
| 104 | unsigned long crc = 0; // accumulated CRC of uncompressed data |
| 105 | unsigned long len = 0; // accumulated length of uncompressed data |
| 106 | unsigned long buf = 0; // deflate stream bit buffer of num bits |
| 107 | int num = 0; // number of bits in buf (at bottom) |
| 108 | |
| 109 | // Write a canonical gzip header (no mod time, file name, comment, extra |
| 110 | // block, or extra flags, and OS is marked as unknown). |
| 111 | fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out); |
| 112 | |
| 113 | // Process the gzip stream from in until reaching the end of the input, |
| 114 | // encountering invalid input, or experiencing an i/o error. |
| 115 | int more; // true if not at the end of the input |
| 116 | do { |
| 117 | // State inside this loop. |
| 118 | unsigned char *put; // next input buffer location to process |
| 119 | int prev; // number of bits from previous block in |
| 120 | // the bit buffer, or -1 if not at the |
| 121 | // start of a block |
| 122 | unsigned long long memb; // uncompressed length of member |
| 123 | size_t tail; // number of trailer bytes read (0..8) |
| 124 | unsigned long part; // accumulated trailer component |
| 125 | |
| 126 | // Get the next chunk of input from in. |
| 127 | unsigned char dat[CHUNK]; |
| 128 | strm.avail_in = fread(dat, 1, CHUNK, in); |
| 129 | if (strm.avail_in == 0) |
| 130 | break; |
| 131 | more = strm.avail_in == CHUNK; |
| 132 | strm.next_in = put = dat; |
| 133 | |
| 134 | // Run that chunk of input through the inflate engine to exhaustion. |
| 135 | do { |
| 136 | // At this point it is assured that strm.avail_in > 0. |
| 137 | |
| 138 | // Inflate until the end of a gzip component (header, deflate |
| 139 | // block, trailer) is reached, or until all of the chunk is |
| 140 | // consumed. The resulting decompressed data is discarded, though |
| 141 | // the total size of the decompressed data in each member is |
| 142 | // tracked, for the calculation of the total CRC. |
| 143 | do { |
| 144 | // inflate and handle any errors |
| 145 | unsigned char scrap[CHUNK]; |
| 146 | strm.avail_out = CHUNK; |
| 147 | strm.next_out = scrap; |
| 148 | int ret = inflate(&strm, Z_BLOCK); |
| 149 | if (ret == Z_MEM_ERROR) |
| 150 | BYE("out of memory"); |
| 151 | if (ret == Z_DATA_ERROR) |
| 152 | BYE("input invalid: %s", strm.msg); |
| 153 | if (ret != Z_OK && ret != Z_BUF_ERROR && ret != Z_STREAM_END) |
| 154 | BYE("internal error"); |
| 155 | |
| 156 | // Update the number of uncompressed bytes generated in this |
| 157 | // member. The actual count (not modulo 2^32) is required to |
| 158 | // correctly compute the total CRC. |
| 159 | unsigned got = CHUNK - strm.avail_out; |
| 160 | memb += got; |
| 161 | if (memb < got) |
| 162 | BYE("overflow error"); |
| 163 | |
| 164 | // Continue to process this chunk until it is consumed, or |
| 165 | // until the end of a component (header, deflate block, or |
| 166 | // trailer) is reached. |
| 167 | } while (strm.avail_out == 0 && (strm.data_type & 0x80) == 0); |
| 168 | |
| 169 | // Since strm.avail_in was > 0 for the inflate call, some input was |
| 170 | // just consumed. It is therefore assured that put < strm.next_in. |
| 171 | |
| 172 | // Disposition the consumed component or part of a component. |
| 173 | switch (state) { |
| 174 | case BETWEEN: |
| 175 | state = HEAD; |
| 176 | // Fall through to HEAD when some or all of the header is |
| 177 | // processed. |
| 178 | |
| 179 | case HEAD: |
| 180 | // Discard the header. |
| 181 | if (strm.data_type & 0x80) { |
| 182 | // End of header reached -- deflate blocks follow. |
| 183 | put = strm.next_in; |
| 184 | prev = num; |
| 185 | memb = 0; |
| 186 | state = BLOCK; |
| 187 | } |
| 188 | break; |
| 189 | |
| 190 | case BLOCK: |
| 191 | // Copy the deflate stream to the output, but with the |
| 192 | // last-block-bit cleared. Re-synchronize stored block |
| 193 | // headers to the output byte boundaries. The bytes at |
| 194 | // put..strm.next_in-1 is the compressed data that has been |
| 195 | // processed and is ready to be copied to the output. |
| 196 | |
| 197 | // At this point, it is assured that new compressed data is |
| 198 | // available, i.e., put < strm.next_in. If prev is -1, then |
| 199 | // that compressed data starts in the middle of a deflate |
| 200 | // block. If prev is not -1, then the bits in the bit |
| 201 | // buffer, possibly combined with the bits in *put, contain |
| 202 | // the three-bit header of the new deflate block. In that |
| 203 | // case, prev is the number of bits from the previous block |
| 204 | // that remain in the bit buffer. Since num is the number |
| 205 | // of bits in the bit buffer, we have that num - prev is |
| 206 | // the number of bits from the new block currently in the |
| 207 | // bit buffer. |
| 208 | |
| 209 | // If strm.data_type & 0xc0 is 0x80, then the last byte of |
| 210 | // the available compressed data includes the last bits of |
| 211 | // the end of a deflate block. In that case, that last byte |
| 212 | // also has strm.data_type & 0x1f bits of the next deflate |
| 213 | // block, in the range 0..7. If strm.data_type & 0xc0 is |
| 214 | // 0xc0, then the last byte of the compressed data is the |
| 215 | // end of the deflate stream, followed by strm.data_type & |
| 216 | // 0x1f pad bits, also in the range 0..7. |
| 217 | |
| 218 | // Set bits to the number of bits not yet consumed from the |
| 219 | // last byte. If we are at the end of the block, bits is |
| 220 | // either the number of bits in the last byte belonging to |
| 221 | // the next block, or the number of pad bits after the |
| 222 | // final block. In either of those cases, bits is in the |
| 223 | // range 0..7. |
| 224 | ; // (required due to C syntax oddity) |
| 225 | int bits = strm.data_type & 0x1f; |
| 226 | |
| 227 | if (prev != -1) { |
| 228 | // We are at the start of a new block. Clear the last |
| 229 | // block bit, and check for special cases. If it is a |
| 230 | // stored block, then emit the header and pad to the |
| 231 | // next byte boundary. If it is a final, empty fixed |
| 232 | // block, then excise it. |
| 233 | |
| 234 | // Some or all of the three header bits for this block |
| 235 | // may already be in the bit buffer. Load any remaining |
| 236 | // header bits into the bit buffer. |
| 237 | if (num - prev < 3) { |
| 238 | buf += (unsigned long)*put++ << num; |
| 239 | num += 8; |
| 240 | } |
| 241 | |
| 242 | // Set last to have a 1 in the position of the last |
| 243 | // block bit in the bit buffer. |
| 244 | unsigned long last = (unsigned long)1 << prev; |
| 245 | |
| 246 | if (((buf >> prev) & 7) == 3) { |
| 247 | // This is a final fixed block. Load at least ten |
| 248 | // bits from this block, including the header, into |
| 249 | // the bit buffer. We already have at least three, |
| 250 | // so at most one more byte needs to be loaded. |
| 251 | if (num - prev < 10) { |
| 252 | if (put == strm.next_in) |
| 253 | // Need to go get and process more input. |
| 254 | // We'll end up back here to finish this. |
| 255 | break; |
| 256 | buf += (unsigned long)*put++ << num; |
| 257 | num += 8; |
| 258 | } |
| 259 | if (((buf >> prev) & 0x3ff) == 3) { |
| 260 | // That final fixed block is empty. Delete it |
| 261 | // to avoid adding an empty block every time a |
| 262 | // gzip stream is normalized. |
| 263 | num = prev; |
| 264 | buf &= last - 1; // zero the pad bits |
| 265 | } |
| 266 | } |
| 267 | else if (((buf >> prev) & 6) == 0) { |
| 268 | // This is a stored block. Flush to the next |
| 269 | // byte boundary after the three-bit header. |
| 270 | num = (prev + 10) & ~7; |
| 271 | buf &= last - 1; // zero the pad bits |
| 272 | } |
| 273 | |
| 274 | // Clear the last block bit. |
| 275 | buf &= ~last; |
| 276 | |
| 277 | // Write out complete bytes in the bit buffer. |
| 278 | while (num >= 8) { |
| 279 | putc(buf, out); |
| 280 | buf >>= 8; |
| 281 | num -= 8; |
| 282 | } |
| 283 | |
| 284 | // If no more bytes left to process, then we have |
| 285 | // consumed the byte that had bits from the next block. |
| 286 | if (put == strm.next_in) |
| 287 | bits = 0; |
| 288 | } |
| 289 | |
| 290 | // We are done handling the deflate block header. Now copy |
| 291 | // all or almost all of the remaining compressed data that |
| 292 | // has been processed so far. Don't copy one byte at the |
| 293 | // end if it contains bits from the next deflate block or |
| 294 | // pad bits at the end of a deflate block. |
| 295 | |
| 296 | // mix is 1 if we are at the end of a deflate block, and if |
| 297 | // some of the bits in the last byte follow this block. mix |
| 298 | // is 0 if we are in the middle of a deflate block, if the |
| 299 | // deflate block ended on a byte boundary, or if all of the |
| 300 | // compressed data processed so far has been consumed. |
| 301 | int mix = (strm.data_type & 0x80) && bits; |
| 302 | |
| 303 | // Copy all of the processed compressed data to the output, |
| 304 | // except for the last byte if it contains bits from the |
| 305 | // next deflate block or pad bits at the end of the deflate |
| 306 | // stream. Copy the data after shifting in num bits from |
| 307 | // buf in front of it, leaving num bits from the end of the |
| 308 | // compressed data in buf when done. |
| 309 | unsigned char *end = strm.next_in - mix; |
| 310 | if (put < end) { |
| 311 | if (num) |
| 312 | // Insert num bits from buf before the data being |
| 313 | // copied. |
| 314 | do { |
| 315 | buf += (unsigned)(*put++) << num; |
| 316 | putc(buf, out); |
| 317 | buf >>= 8; |
| 318 | } while (put < end); |
| 319 | else { |
| 320 | // No shifting needed -- write directly. |
| 321 | fwrite(put, 1, end - put, out); |
| 322 | put = end; |
| 323 | } |
| 324 | } |
| 325 | |
| 326 | // Process the last processed byte if it wasn't written. |
| 327 | if (mix) { |
| 328 | // Load the last byte into the bit buffer. |
| 329 | buf += (unsigned)(*put++) << num; |
| 330 | num += 8; |
| 331 | |
| 332 | if (strm.data_type & 0x40) { |
| 333 | // We are at the end of the deflate stream and |
| 334 | // there are bits pad bits. Discard the pad bits |
| 335 | // and write a byte to the output, if available. |
| 336 | // Leave the num bits left over in buf to prepend |
| 337 | // to the next deflate stream. |
| 338 | num -= bits; |
| 339 | if (num >= 8) { |
| 340 | putc(buf, out); |
| 341 | num -= 8; |
| 342 | buf >>= 8; |
| 343 | } |
| 344 | |
| 345 | // Force the pad bits in the bit buffer to zeros. |
| 346 | buf &= ((unsigned long)1 << num) - 1; |
| 347 | |
| 348 | // Don't need to set prev here since going to TAIL. |
| 349 | } |
| 350 | else |
| 351 | // At the end of an internal deflate block. Leave |
| 352 | // the last byte in the bit buffer to examine on |
| 353 | // the next entry to BLOCK, when more bits from the |
| 354 | // next block will be available. |
| 355 | prev = num - bits; // number of bits in buffer |
| 356 | // from current block |
| 357 | } |
| 358 | |
| 359 | // Don't have a byte left over, so we are in the middle of |
| 360 | // a deflate block, or the deflate block ended on a byte |
| 361 | // boundary. Set prev appropriately for the next entry into |
| 362 | // BLOCK. |
| 363 | else if (strm.data_type & 0x80) |
| 364 | // The block ended on a byte boundary, so no header |
| 365 | // bits are in the bit buffer. |
| 366 | prev = num; |
| 367 | else |
| 368 | // In the middle of a deflate block, so no header here. |
| 369 | prev = -1; |
| 370 | |
| 371 | // Check for the end of the deflate stream. |
| 372 | if ((strm.data_type & 0xc0) == 0xc0) { |
| 373 | // That ends the deflate stream on the input side, the |
| 374 | // pad bits were discarded, and any remaining bits from |
| 375 | // the last block in the stream are saved in the bit |
| 376 | // buffer to prepend to the next stream. Process the |
| 377 | // gzip trailer next. |
| 378 | tail = 0; |
| 379 | part = 0; |
| 380 | state = TAIL; |
| 381 | } |
| 382 | break; |
| 383 | |
| 384 | case TAIL: |
| 385 | // Accumulate available trailer bytes to update the total |
| 386 | // CRC and the total uncompressed length. |
| 387 | do { |
| 388 | part = (part >> 8) + ((unsigned long)(*put++) << 24); |
| 389 | tail++; |
| 390 | if (tail == 4) { |
| 391 | // Update the total CRC. |
| 392 | z_off_t len2 = memb; |
| 393 | if (len2 < 0 || (unsigned long long)len2 != memb) |
| 394 | BYE("overflow error"); |
| 395 | crc = crc ? crc32_combine(crc, part, len2) : part; |
| 396 | part = 0; |
| 397 | } |
| 398 | else if (tail == 8) { |
| 399 | // Update the total uncompressed length. (It's ok |
| 400 | // if this sum is done modulo 2^32.) |
| 401 | len += part; |
| 402 | |
| 403 | // At the end of a member. Set up to inflate an |
| 404 | // immediately following gzip member. (If we made |
| 405 | // it this far, then the trailer was valid.) |
| 406 | if (inflateReset(&strm) != Z_OK) |
| 407 | BYE("internal error"); |
| 408 | state = BETWEEN; |
| 409 | break; |
| 410 | } |
| 411 | } while (put < strm.next_in); |
| 412 | break; |
| 413 | } |
| 414 | |
| 415 | // Process the input buffer until completely consumed. |
| 416 | } while (strm.avail_in > 0); |
| 417 | |
| 418 | // Process input until end of file, invalid input, or i/o error. |
| 419 | } while (more); |
| 420 | |
| 421 | // Done with the inflate engine. |
| 422 | inflateEnd(&strm); |
| 423 | |
| 424 | // Verify the validity of the input. |
| 425 | if (state != BETWEEN) |
| 426 | BYE("input invalid: incomplete gzip stream"); |
| 427 | |
| 428 | // Write the remaining deflate stream bits, followed by a terminating |
| 429 | // deflate fixed block. |
| 430 | buf += (unsigned long)3 << num; |
| 431 | putc(buf, out); |
| 432 | putc(buf >> 8, out); |
| 433 | if (num > 6) |
| 434 | putc(0, out); |
| 435 | |
| 436 | // Write the gzip trailer, which is the CRC and the uncompressed length |
| 437 | // modulo 2^32, both in little-endian order. |
| 438 | putc(crc, out); |
| 439 | putc(crc >> 8, out); |
| 440 | putc(crc >> 16, out); |
| 441 | putc(crc >> 24, out); |
| 442 | putc(len, out); |
| 443 | putc(len >> 8, out); |
| 444 | putc(len >> 16, out); |
| 445 | putc(len >> 24, out); |
| 446 | fflush(out); |
| 447 | |
| 448 | // Check for any i/o errors. |
| 449 | if (ferror(in) || ferror(out)) |
| 450 | BYE("i/o error: %s", strerror(errno)); |
| 451 | |
| 452 | // All good! |
| 453 | *err = NULL; |
| 454 | return 0; |
| 455 | } |
| 456 | |
| 457 | // Normalize the gzip stream on stdin, writing the result to stdout. |
| 458 | int main(void) { |
| 459 | // Avoid end-of-line conversions on evil operating systems. |
| 460 | SET_BINARY_MODE(stdin); |
| 461 | SET_BINARY_MODE(stdout); |
| 462 | |
| 463 | // Normalize from stdin to stdout, returning 1 on error, 0 if ok. |
| 464 | char *err; |
| 465 | int ret = gzip_normalize(stdin, stdout, &err); |
| 466 | if (ret) |
| 467 | fprintf(stderr, "gznorm error: %s\n", err); |
| 468 | free(err); |
| 469 | return ret; |
| 470 | } |