1
/* gzappend -- command to append to a gzip file
3
Copyright (C) 2003 Mark Adler, all rights reserved
4
version 1.1, 4 Nov 2003
6
This software is provided 'as-is', without any express or implied
7
warranty. In no event will the author be held liable for any damages
8
arising from the use of this software.
10
Permission is granted to anyone to use this software for any purpose,
11
including commercial applications, and to alter it and redistribute it
12
freely, subject to the following restrictions:
14
1. The origin of this software must not be misrepresented; you must not
15
claim that you wrote the original software. If you use this software
16
in a product, an acknowledgment in the product documentation would be
17
appreciated but is not required.
18
2. Altered source versions must be plainly marked as such, and must not be
19
misrepresented as being the original software.
20
3. This notice may not be removed or altered from any source distribution.
22
Mark Adler madler@alumni.caltech.edu
28
* 1.0 19 Oct 2003 - First version
29
* 1.1 4 Nov 2003 - Expand and clarify some comments and notes
30
* - Add version and copyright to help
31
* - Send help to stdout instead of stderr
32
* - Add some preemptive typecasts
33
* - Add L to constants in lseek() calls
34
* - Remove some debugging information in error messages
35
* - Use new data_type definition for zlib 1.2.1
36
* - Simplify and unify file operations
37
* - Finish off gzip file in gztack()
38
* - Use deflatePrime() instead of adding empty blocks
39
* - Keep gzip file clean on appended file read errors
40
* - Use in-place rotate instead of auxiliary buffer
41
* (Why you ask? Because it was fun to write!)
45
gzappend takes a gzip file and appends to it, compressing files from the
46
command line or data from stdin. The gzip file is written to directly, to
47
avoid copying that file, in case it's large. Note that this results in the
48
unfriendly behavior that if gzappend fails, the gzip file is corrupted.
50
This program was written to illustrate the use of the new Z_BLOCK option of
51
zlib 1.2.x's inflate() function. This option returns from inflate() at each
52
block boundary to facilitate locating and modifying the last block bit at
53
the start of the final deflate block. Also whether using Z_BLOCK or not,
54
another required feature of zlib 1.2.x is that inflate() now provides the
55
number of unused bits in the last input byte used. gzappend will not work
56
with versions of zlib earlier than 1.2.1.
58
gzappend first decompresses the gzip file internally, discarding all but
59
the last 32K of uncompressed data, and noting the location of the last block
60
bit and the number of unused bits in the last byte of the compressed data.
61
The gzip trailer containing the CRC-32 and length of the uncompressed data
62
is verified. This trailer will be later overwritten.
64
Then the last block bit is cleared by seeking back in the file and rewriting
65
the byte that contains it. Seeking forward, the last byte of the compressed
66
data is saved along with the number of unused bits to initialize deflate.
68
A deflate process is initialized, using the last 32K of the uncompressed
69
data from the gzip file to initialize the dictionary. If the total
70
uncompressed data was less than 32K, then all of it is used to initialize
71
the dictionary. The deflate output bit buffer is also initialized with the
72
last bits from the original deflate stream. From here on, the data to
73
append is simply compressed using deflate, and written to the gzip file.
74
When that is complete, the new CRC-32 and uncompressed length are written
75
as the trailer of the gzip file.
87
/* size in bytes of the I/O buffers allocated with malloc(CHUNK): two to the
   power LGCHUNK */
#define CHUNK (1U << LGCHUNK)
90
/* print an error message and terminate with extreme prejudice */
/* msg1 and msg2 are written to stderr back to back after the
   "gzappend error: " prefix -- no separator is inserted between them, so a
   caller that wants a space there must put it in one of the two strings */
91
local void bye(char *msg1, char *msg2)
93
fprintf(stderr, "gzappend error: %s%s\n", msg1, msg2);
97
/* return the greatest common divisor of a and b using Euclid's algorithm,
98
modified to be fast when one argument is much greater than the other, and
99
coded to avoid unnecessary swapping */
100
local unsigned gcd(unsigned a, unsigned b)
120
/* rotate list[0..len-1] left by rot positions, in place */
/* list is modified directly and nothing is returned; rot is first reduced
   modulo len, and a resulting rotation of zero leaves list untouched --
   rotations by exactly one position in either direction are done with a
   single block move, everything else by following gcd(len, rot) cycles */
121
local void rotate(unsigned char *list, unsigned len, unsigned rot)
125
unsigned char *start, *last, *to, *from;
127
/* normalize rot and handle degenerate cases */
129
if (rot >= len) rot %= len;
130
if (rot == 0) return;
132
/* pointer to last entry in list */
133
last = list + (len - 1);
135
/* do simple left shift by one */
138
/* source (list + 1) and destination (list) overlap, so this has to be
   memmove() -- memcpy() on overlapping regions is undefined behavior */
memmove(list, list + 1, len - 1);
143
/* do simple right shift by one */
144
if (rot == len - 1) {
146
memmove(list + 1, list, len - 1);
151
/* otherwise do rotate as a set of cycles in place */
152
cycles = gcd(len, rot); /* number of cycles */
154
start = from = list + cycles; /* start index is arbitrary */
155
tmp = *from; /* save entry to be overwritten */
157
to = from; /* next step in cycle */
158
from += rot; /* go right rot positions */
159
if (from > last) from -= len; /* (pointer better not wrap) */
160
if (from == start) break; /* all but one shifted */
161
*to = *from; /* shift left */
163
*to = tmp; /* complete the circle */
167
/* structure for gzip file read operations */
/* left and next move together: each byte taken with read1() decrements left
   and advances next, and readin() stores the count of freshly read bytes in
   left */
169
int fd; /* file descriptor */
170
int size; /* 1 << size is bytes in buf */
171
unsigned left; /* bytes available at next */
172
unsigned char *buf; /* buffer */
173
unsigned char *next; /* next byte in buffer */
174
char *name; /* file name for error messages */
178
/* refill the buffer of in: read up to 1 << in->size bytes from in->fd into
   in->buf and record the number of bytes obtained in in->left -- a read()
   result of -1 is fatal and reported through bye() with the file name;
   callers in this file treat a zero result as end of file */
local int readin(file *in)
182
len = read(in->fd, in->buf, 1 << in->size);
183
if (len == -1) bye("error reading ", in->name);
184
in->left = (unsigned)len;
189
/* read from file in, exit if end-of-file */
/* a zero result from readin() is reported through bye() as an unexpected end
   of the named file */
190
local int readmore(file *in)
192
if (readin(in) == 0) bye("unexpected end of ", in->name);
196
/* fetch the next byte from in: when no bytes are left in the buffer,
   readmore() is called first, then left is decremented and the byte at next
   is returned with next advanced past it -- the argument is evaluated more
   than once, so it must be free of side effects; it is parenthesized in the
   expansion so that any pointer expression can be passed safely */
#define read1(in) ((in)->left == 0 ? readmore(in) : 0, \
197
(in)->left--, *((in)->next)++)
199
/* skip over n bytes of in */
/* failures are fatal: a failed seek and running out of input are both
   reported through bye() with the file name */
200
local void skip(file *in, unsigned n)
206
/* bypass is n rounded down to a whole multiple of the buffer size
   (1 << in->size) */
bypass = n & ~((1U << in->size) - 1);
208
/* move the file position forward by bypass bytes with a relative seek */
if (lseek(in->fd, (off_t)bypass, SEEK_CUR) == -1)
209
bye("seeking ", in->name);
214
bye("unexpected end of ", in->name);
220
/* read a four-byte unsigned integer, little-endian, from in */
/* bytes are taken one at a time with read1(), each later byte being shifted
   eight bits further left before it is added into val; the two high bytes
   are widened to unsigned long before shifting by 16 and 24 */
221
unsigned long read4(file *in)
226
val += (unsigned)read1(in) << 8;
227
val += (unsigned long)read1(in) << 16;
228
val += (unsigned long)read1(in) << 24;
232
/* skip over gzip header */
/* consumes header bytes from in with read1() and skip(); any check that
   fails is fatal through bye() -- bye() prints its two strings with nothing
   between them, so every message that is followed by the file name ends in
   a space */
233
local void gzheader(file *in)
238
/* the first two bytes must be the gzip magic numbers 31 and 139 */
if (read1(in) != 31 || read1(in) != 139) bye(in->name, " not a gzip file");
239
/* compression method 8 is the only one accepted */
if (read1(in) != 8) bye("unknown compression method in ", in->name);
241
/* none of the top three flag bits may be set */
if (flags & 0xe0) bye("unknown header flags set in ", in->name);
245
n += (unsigned)(read1(in)) << 8;
248
/* flag bits 8 and 16 each announce a zero-terminated string to pass over
   (file name and comment per RFC 1952) */
if (flags & 8) while (read1(in) != 0) ;
249
if (flags & 16) while (read1(in) != 0) ;
250
/* flag bit 2 announces two more bytes to pass over (header crc per
   RFC 1952) */
if (flags & 2) skip(in, 2);
253
/* decompress gzip file "name", return strm with a deflate stream ready to
254
continue compression of the data in the gzip file, and return a file
255
descriptor pointing to where to write the compressed data -- the deflate
256
stream is initialized to compress using level "level" */
/* every failure is fatal through bye(); bye() prints its two strings with
   nothing between them, so each message that is followed by the file name
   ends in a space -- the file is opened read/write and is modified in
   place: the byte holding the last-block bit is rewritten here */
257
local int gzscan(char *name, z_stream *strm, int level)
259
int ret, lastbit, left, full;
261
unsigned long crc, tot;
262
unsigned char *window;
268
gz.fd = open(name, O_RDWR, 0);
269
if (gz.fd == -1) bye("cannot open ", name);
270
gz.buf = malloc(CHUNK);
271
if (gz.buf == NULL) bye("out of memory", "");
275
/* skip gzip header */
278
/* prepare to decompress */
279
window = malloc(DSIZE);
280
if (window == NULL) bye("out of memory", "");
281
strm->zalloc = Z_NULL;
282
strm->zfree = Z_NULL;
283
strm->opaque = Z_NULL;
284
/* negative window bits ask zlib for raw deflate data with no zlib or gzip
   wrapper -- the gzip header and trailer are handled by this file */
ret = inflateInit2(strm, -15);
285
if (ret != Z_OK) bye("out of memory", " or library mismatch");
287
/* decompress the deflate stream, saving append information */
289
/* file offset of the first not yet consumed input byte: the current file
   position less the bytes still sitting in the buffer */
lastoff = lseek(gz.fd, 0L, SEEK_CUR) - gz.left;
291
strm->avail_in = gz.left;
292
strm->next_in = gz.next;
293
crc = crc32(0L, Z_NULL, 0);
296
/* if needed, get more input */
297
if (strm->avail_in == 0) {
299
strm->avail_in = gz.left;
300
strm->next_in = gz.next;
303
/* set up output to next available section of sliding window */
304
strm->avail_out = DSIZE - have;
305
strm->next_out = window + have;
307
/* inflate and check for errors */
308
ret = inflate(strm, Z_BLOCK);
309
if (ret == Z_STREAM_ERROR) bye("internal stream error!", "");
310
if (ret == Z_MEM_ERROR) bye("out of memory", "");
311
if (ret == Z_DATA_ERROR)
312
bye("invalid compressed data--format violated in ", name);
314
/* update crc and sliding window pointer */
315
crc = crc32(crc, window + have, DSIZE - have - strm->avail_out);
317
have = DSIZE - strm->avail_out;
323
/* process end of block */
/* per the zlib manual, after inflate() with Z_BLOCK data_type carries 128
   when stopped at the end of a block, 64 while decoding the last block,
   and in its low bits the number of unused bits in the last input byte */
324
if (strm->data_type & 128) {
325
if (strm->data_type & 64)
326
left = strm->data_type & 0x1f;
328
lastbit = strm->data_type & 0x1f;
329
lastoff = lseek(gz.fd, 0L, SEEK_CUR) - strm->avail_in;
332
} while (ret != Z_STREAM_END);
334
/* hand the unconsumed input back to the file reader for the trailer */
gz.left = strm->avail_in;
335
gz.next = strm->next_in;
337
/* save the location of the end of the compressed data */
338
end = lseek(gz.fd, 0L, SEEK_CUR) - gz.left;
340
/* check gzip trailer and save total for deflate */
341
if (crc != read4(&gz))
342
bye("invalid compressed data--crc mismatch in ", name);
343
tot = strm->total_out;
344
/* the stored length is four bytes, so only the low 32 bits of the total
   are compared */
if ((tot & 0xffffffffUL) != read4(&gz))
345
bye("invalid compressed data--length mismatch in ", name);
347
/* if not at end of file, warn */
348
if (gz.left || readin(&gz))
350
"gzappend warning: junk at end of gzip file overwritten\n");
352
/* clear last block bit */
/* NOTE(review): the results of the lseek() calls from here on are not
   checked; a failed seek only shows up if the following one-byte read or
   write fails */
353
lseek(gz.fd, lastoff - (lastbit != 0), SEEK_SET);
354
if (read(gz.fd, gz.buf, 1) != 1) bye("reading after seek on ", name);
355
/* the bit was set, so flipping it with exclusive-or clears it */
*gz.buf = (unsigned char)(*gz.buf ^ (1 << ((8 - lastbit) & 7)));
356
lseek(gz.fd, -1L, SEEK_CUR);
357
if (write(gz.fd, gz.buf, 1) != 1) bye("writing after seek to ", name);
359
/* if window wrapped, build dictionary from window by rotating */
361
rotate(window, DSIZE, have);
365
/* set up deflate stream with window, crc, total_in, and leftover bits */
366
ret = deflateInit2(strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY);
367
if (ret != Z_OK) bye("out of memory", "");
368
deflateSetDictionary(strm, window, have);
370
strm->total_in = tot;
372
/* re-read the last byte of compressed data and feed its 8 - left used
   bits to deflate, which will emit them again ahead of the new data */
lseek(gz.fd, --end, SEEK_SET);
373
if (read(gz.fd, gz.buf, 1) != 1) bye("reading after seek on ", name);
374
deflatePrime(strm, 8 - left, *gz.buf);
376
/* position the file for writing, starting at that same last byte */
lseek(gz.fd, end, SEEK_SET);
378
/* clean up and return */
384
/* append file "name" to gzip file gd using deflate stream strm -- if last
385
is true, then finish off the deflate stream at the end */
386
local void gztack(char *name, int gd, z_stream *strm, int last)
390
unsigned char *in, *out;
392
/* open file to compress and append */
395
fd = open(name, O_RDONLY, 0);
397
fprintf(stderr, "gzappend warning: %s not found, skipping ...\n",
401
/* allocate buffers */
402
in = fd == -1 ? NULL : malloc(CHUNK);
404
if (out == NULL) bye("out of memory", "");
406
/* compress input file and append to gzip file */
409
len = fd == -1 ? 0 : read(fd, in, CHUNK);
412
"gzappend warning: error reading %s, skipping rest ...\n",
416
strm->avail_in = (unsigned)len;
418
if (len) strm->adler = crc32(strm->adler, in, (unsigned)len);
420
/* compress and write all available output */
422
strm->avail_out = CHUNK;
423
strm->next_out = out;
424
ret = deflate(strm, last && len == 0 ? Z_FINISH : Z_NO_FLUSH);
425
left = CHUNK - strm->avail_out;
427
len = write(gd, out + CHUNK - strm->avail_out - left, left);
428
if (len == -1) bye("writing gzip file", "");
429
left -= (unsigned)len;
431
} while (strm->avail_out == 0 && ret != Z_STREAM_END);
434
/* write trailer after last entry */
437
out[0] = (unsigned char)(strm->adler);
438
out[1] = (unsigned char)(strm->adler >> 8);
439
out[2] = (unsigned char)(strm->adler >> 16);
440
out[3] = (unsigned char)(strm->adler >> 24);
441
out[4] = (unsigned char)(strm->total_in);
442
out[5] = (unsigned char)(strm->total_in >> 8);
443
out[6] = (unsigned char)(strm->total_in >> 16);
444
out[7] = (unsigned char)(strm->total_in >> 24);
447
ret = write(gd, out + 8 - len, len);
448
if (ret == -1) bye("writing gzip file", "");
454
/* clean up and return */
456
if (in != NULL) free(in);
457
if (fd > 0) close(fd);
460
/* process the compression level option if present, scan the gzip file, and
461
append the specified files, or append the data from stdin if no other file
462
names are provided on the command line -- the gzip file must be writable
464
/* entry point: print usage when there are no arguments, accept an optional
   one-digit compression level option, scan the gzip file with gzscan(), and
   then append each remaining argument with gztack() */
int main(int argc, char **argv)
469
/* ignore command name */
472
/* provide usage if no arguments */
474
printf("gzappend 1.1 (4 Nov 2003) Copyright (C) 2003 Mark Adler\n");
476
"usage: gzappend [-level] file.gz [ addthis [ andthis ... ]]\n");
480
/* set compression level */
481
level = Z_DEFAULT_COMPRESSION;
482
if (argv[0][0] == '-') {
483
/* the option must be exactly a dash followed by one decimal digit */
if (argv[0][1] < '0' || argv[0][1] > '9' || argv[0][2] != 0)
484
bye("invalid compression level", "");
485
level = argv[0][1] - '0';
486
if (*++argv == NULL) bye("no gzip file name after options", "");
489
/* prepare to append to gzip file */
490
gd = gzscan(*argv++, &strm, level);
492
/* append files on command line, or from stdin if none */
494
gztack(NULL, gd, &strm, 1);
497
/* the last argument is flagged so that gztack() finishes the stream */
gztack(*argv, gd, &strm, argv[1] == NULL);
498
} while (*++argv != NULL);