1
/* gzjoin -- command to join gzip files into one gzip file
3
Copyright (C) 2004, 2005, 2012 Mark Adler, all rights reserved
4
version 1.2, 14 Aug 2012
6
This software is provided 'as-is', without any express or implied
7
warranty. In no event will the author be held liable for any damages
8
arising from the use of this software.
10
Permission is granted to anyone to use this software for any purpose,
11
including commercial applications, and to alter it and redistribute it
12
freely, subject to the following restrictions:
14
1. The origin of this software must not be misrepresented; you must not
15
claim that you wrote the original software. If you use this software
16
in a product, an acknowledgment in the product documentation would be
17
appreciated but is not required.
18
2. Altered source versions must be plainly marked as such, and must not be
19
misrepresented as being the original software.
20
3. This notice may not be removed or altered from any source distribution.
22
Mark Adler madler@alumni.caltech.edu
28
* 1.0 11 Dec 2004 - First version
29
* 1.1 12 Jun 2005 - Changed ssize_t to long for portability
30
* 1.2 14 Aug 2012 - Clean up for z_const usage
34
gzjoin takes one or more gzip files on the command line and writes out a
35
single gzip file that will uncompress to the concatenation of the
36
uncompressed data from the individual gzip files. gzjoin does this without
37
having to recompress any of the data and without having to calculate a new
38
crc32 for the concatenated uncompressed data. gzjoin does however have to
39
decompress all of the input data in order to find the bits in the compressed
40
data that need to be modified to concatenate the streams.
42
gzjoin does not do an integrity check on the input gzip files other than
43
checking the gzip header and decompressing the compressed data. They are
44
otherwise assumed to be complete and correct.
46
Each joint between gzip files removes at least 18 bytes of previous trailer
47
and subsequent header, and inserts an average of about three bytes into the
48
compressed data in order to connect the streams. The output gzip file
49
has a minimal ten-byte gzip header with no file name or modification time.
51
This program was written to illustrate the use of the Z_BLOCK option of
52
inflate() and the crc32_combine() function. gzjoin will not compile with
53
versions of zlib earlier than 1.2.3.
56
#include <stdio.h> /* fputs(), fprintf(), fwrite(), putc() */
57
#include <stdlib.h> /* exit(), malloc(), free() */
58
#include <fcntl.h> /* open() */
59
#include <unistd.h> /* close(), read(), lseek() */
61
/* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */
65
/* exit with an error (return a value to allow use in an expression) */
66
local int bail(char *why1, char *why2)
68
fprintf(stderr, "gzjoin error: %s%s, output incomplete\n", why1, why2);
73
/* -- simple buffered file input with access to the buffer -- */
75
#define CHUNK 32768 /* must be a power of two and fit in unsigned */
77
/* bin buffered input file type */
79
char *name; /* name of file for error messages */
80
int fd; /* file descriptor */
81
unsigned left; /* bytes remaining at next */
82
unsigned char *next; /* next byte to read */
83
unsigned char *buf; /* allocated buffer of length CHUNK */
86
/* close a buffered file and free allocated memory */
87
local void bclose(bin *in)
98
/* open a buffered file for input, return a pointer to type bin, or NULL on
   failure */
100
local bin *bopen(char *name)
104
in = malloc(sizeof(bin));
107
in->buf = malloc(CHUNK);
108
in->fd = open(name, O_RDONLY, 0);
109
if (in->buf == NULL || in->fd == -1) {
119
/* load buffer from file, return -1 on read error, 0 or 1 on success, with
120
1 indicating that end-of-file was reached */
121
local int bload(bin *in)
131
len = (long)read(in->fd, in->buf + in->left, CHUNK - in->left);
134
in->left += (unsigned)len;
135
} while (len != 0 && in->left < CHUNK);
136
return len == 0 ? 1 : 0;
139
/* get a byte from the file, bail if end of file */
140
#define bget(in) (in->left ? 0 : bload(in), \
141
in->left ? (in->left--, *(in->next)++) : \
142
bail("unexpected end of file on ", in->name))
144
/* get a four-byte little-endian unsigned integer from file */
145
local unsigned long bget4(bin *in)
150
val += (unsigned long)(bget(in)) << 8;
151
val += (unsigned long)(bget(in)) << 16;
152
val += (unsigned long)(bget(in)) << 24;
156
/* skip bytes in file */
157
local void bskip(bin *in, unsigned skip)
163
/* easy case -- skip bytes in buffer */
164
if (skip <= in->left) {
170
/* skip what's in buffer, discard buffer contents */
174
/* seek past multiples of CHUNK bytes */
178
left = skip & (CHUNK - 1);
180
/* exact number of chunks: seek all the way minus one byte to check
181
for end-of-file with a read */
182
lseek(in->fd, skip - 1, SEEK_CUR);
183
if (read(in->fd, in->buf, 1) != 1)
184
bail("unexpected end of file on ", in->name);
188
/* skip the integral chunks, update skip with remainder */
189
lseek(in->fd, skip - left, SEEK_CUR);
193
/* read more input and skip remainder */
196
bail("unexpected end of file on ", in->name);
201
/* -- end of buffered input functions -- */
203
/* skip the gzip header from file in */
204
local void gzhead(bin *in)
208
/* verify gzip magic header and compression method */
209
if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8)
210
bail(in->name, " is not a valid gzip file");
212
/* get and verify flags */
214
if ((flags & 0xe0) != 0)
215
bail("unknown reserved bits set in ", in->name);
217
/* skip modification time, extra flags, and os */
220
/* skip extra field if present */
225
len += (unsigned)(bget(in)) << 8;
229
/* skip file name if present */
231
while (bget(in) != 0)
234
/* skip comment if present */
236
while (bget(in) != 0)
239
/* skip header crc if present */
244
/* write a four-byte little-endian unsigned integer to out */
245
local void put4(unsigned long val, FILE *out)
247
putc(val & 0xff, out);
248
putc((val >> 8) & 0xff, out);
249
putc((val >> 16) & 0xff, out);
250
putc((val >> 24) & 0xff, out);
253
/* Load up zlib stream from buffered input, bail if end of file */
254
local void zpull(z_streamp strm, bin *in)
259
bail("unexpected end of file on ", in->name);
260
strm->avail_in = in->left;
261
strm->next_in = in->next;
264
/* Write header for gzip file to out and initialize trailer. */
265
local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out)
267
fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
268
*crc = crc32(0L, Z_NULL, 0);
272
/* Copy the compressed data from name, zeroing the last block bit of the last
273
block if clr is true, and adding empty blocks as needed to get to a byte
274
boundary. If clr is false, then the last block becomes the last block of
275
the output, and the gzip trailer is written. crc and tot maintain the
276
crc and length (modulo 2^32) of the output for the trailer. The resulting
277
gzip file is written to out. gzinit() must be called before the first call
278
of gzcopy() to write the gzip header and to initialize crc and tot. */
279
local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot,
282
int ret; /* return value from zlib functions */
283
int pos; /* where the "last block" bit is in byte */
284
int last; /* true if processing the last block */
285
bin *in; /* buffered input file */
286
unsigned char *start; /* start of compressed data in buffer */
287
unsigned char *junk; /* buffer for uncompressed data -- discarded */
288
z_off_t len; /* length of uncompressed data (support > 4 GB) */
289
z_stream strm; /* zlib inflate stream */
291
/* open gzip file and skip header */
294
bail("could not open ", name);
297
/* allocate buffer for uncompressed data and initialize raw inflate
   stream */
299
junk = malloc(CHUNK);
300
strm.zalloc = Z_NULL;
302
strm.opaque = Z_NULL;
304
strm.next_in = Z_NULL;
305
ret = inflateInit2(&strm, -15);
306
if (junk == NULL || ret != Z_OK)
307
bail("out of memory", "");
309
/* inflate and copy compressed data, clear last-block bit if requested */
318
/* if input used and output done, write used input and get more */
319
if (strm.avail_in == 0 && strm.avail_out != 0) {
320
fwrite(start, 1, strm.next_in - start, out);
326
/* decompress -- return early when end-of-block reached */
327
strm.avail_out = CHUNK;
328
strm.next_out = junk;
329
ret = inflate(&strm, Z_BLOCK);
332
bail("out of memory", "");
334
bail("invalid compressed data in ", in->name);
337
/* update length of uncompressed data */
338
len += CHUNK - strm.avail_out;
340
/* check for block boundary (only get this when block copied out) */
341
if (strm.data_type & 128) {
342
/* if that was the last block, then done */
346
/* number of unused bits in last byte */
347
pos = strm.data_type & 7;
349
/* find the next last-block bit */
351
/* next last-block bit is in last used byte */
353
last = strm.next_in[-1] & pos;
355
in->buf[strm.next_in - in->buf - 1] &= ~pos;
358
/* next last-block bit is in next unused byte */
359
if (strm.avail_in == 0) {
360
/* don't have that byte yet -- get it */
361
fwrite(start, 1, strm.next_in - start, out);
366
last = strm.next_in[0] & 1;
368
in->buf[strm.next_in - in->buf] &= ~1;
373
/* update buffer with unused input */
374
in->left = strm.avail_in;
375
in->next = in->buf + (strm.next_in - in->buf);
377
/* copy used input, write empty blocks to get to byte boundary */
378
pos = strm.data_type & 7;
379
fwrite(start, 1, in->next - start - 1, out);
381
if (pos == 0 || !clr)
382
/* already at byte boundary, or last file: write last byte */
385
/* append empty blocks to last byte */
386
last &= ((0x100 >> pos) - 1); /* assure unused bits are zero */
388
/* odd -- append an empty stored block */
391
putc(0, out); /* two more bits in block header */
392
fwrite("\0\0\xff\xff", 1, 4, out);
395
/* even -- append 1, 2, or 3 empty fixed blocks */
401
putc(last | 0x20, out);
404
putc(last | 0x80, out);
410
/* update crc and tot */
411
*crc = crc32_combine(*crc, bget4(in), len);
412
*tot += (unsigned long)len;
419
/* write trailer if this is the last gzip file */
426
/* join the gzip files on the command line, write result to stdout */
427
int main(int argc, char **argv)
429
unsigned long crc, tot; /* running crc and total uncompressed length */
431
/* skip command name */
435
/* show usage if no arguments */
437
fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n",
442
/* join gzip files on command line and write to stdout */
443
gzinit(&crc, &tot, stdout);
445
gzcopy(*argv++, argc, &crc, &tot, stdout);