gzip-unpack.diff

delphij, 07/26/2009 08:10 AM

Download (12.9 KB)

View differences:

usr.bin/gzip/unpack.c
1
/*-
2
 * Copyright (c) 2009 Xin LI <delphij@FreeBSD.org>
3
 * All rights reserved.
4
 *
5
 * Redistribution and use in source and binary forms, with or without
6
 * modification, are permitted provided that the following conditions
7
 * are met:
8
 * 1. Redistributions of source code must retain the above copyright
9
 *    notice, this list of conditions and the following disclaimer.
10
 * 2. Redistributions in binary form must reproduce the above copyright
11
 *    notice, this list of conditions and the following disclaimer in the
12
 *    documentation and/or other materials provided with the distribution.
13
 *
14
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24
 * SUCH DAMAGE.
25
 *
26
 * $FreeBSD$
27
 */
28

  
29
/* This file is #included by gzip.c */
30

  
31
/*
32
 * pack(1) file format:
33
 *
34
 * The first 7 bytes is the header:
35
 *	00, 01 - Signature (US, RS), we already validated it earlier.
36
 *	02..05 - Uncompressed size
37
 *	    06 - Level for the huffman tree (<=24)
38
 *
39
 * pack(1) will then store symbols (leaf) nodes count in each huffman
40
 * tree levels, each level would consume 1 byte (See [1]).
41
 *
42
 * After the symbol count table, there is the symbol table, storing
43
 * symbols represented by coresponding leaf node.  EOB is not being
44
 * explicitly transmitted (not necessary anyway) in the symbol table.
45
 *
46
 * Compressed data goes after the symbol table.
47
 *
48
 * NOTES
49
 *
50
 * [1] If we count EOB into the symbols, that would mean that we will
51
 * have at most 256 symbols in the huffman tree.  pack(1) rejects empty
52
 * file and files that just repeats one character, which means that we
53
 * will have at least 2 symbols.  Therefore, pack(1) would reduce the
54
 * last level symbol count by 2 which makes it a number in
55
 * range [0..254], so all levels' symbol count would fit into 1 byte.
56
 */
57

  
58
#define	PACK_HEADER_LENGTH	7
59
#define	HTREE_MAXLEVEL		24
60

  
61
/*
62
 * unpack descriptor
63
 *
64
 * Represent the huffman tree in a similiar way that pack(1) would
65
 * store in a packed file.  We store all symbols in a linear table,
66
 * and store pointers to each level's first symbol.  In addition to
67
 * that, maintain two counts for each level: inner nodes count and
68
 * leaf nodes count.
69
 */
70
typedef struct {
71
	int		symbol_size;	/* Size of the symbol table */
72
	int		treelevels;	/* Levels for the huffman tree */
73

  
74
	int		*symbolsin;	/* Table of leaf symbols count in
75
					   each level */
76
	int		*inodesin;	/* Table of internal nodes count in
77
					   each level */
78

  
79
	char		*symbol;	/* The symbol table */
80
	char		*symbol_eob;	/* Pointer to the EOB symbol */
81
	char		**tree;		/* Decoding huffman tree (pointers to
82
					   first symbol of each tree level */
83

  
84
	off_t		uncompressed_size; /* Uncompressed size */
85
	FILE		*fpIn;		/* Input stream */
86
	FILE		*fpOut;		/* Output stream */
87
} unpack_descriptor_t;
88

  
89
/*
90
 * Release resource allocated to an unpack descriptor.
91
 *
92
 * Caller is responsible to make sure that all of these pointers are
93
 * initialized (in our case, they all point to valid memory block).
94
 * We don't zero out pointers here because nobody else would ever
95
 * reference the memory block without scrubing them.
96
 */
97
static void
98
unpack_descriptor_fini(unpack_descriptor_t *unpackd)
99
{
100

  
101
	free(unpackd->symbolsin);
102
	free(unpackd->inodesin);
103
	free(unpackd->symbol);
104
	free(unpackd->tree);
105

  
106
	fclose(unpackd->fpIn);
107
	fclose(unpackd->fpOut);
108
}
109

  
110
/*
111
 * Recursively fill the internal node count table
112
 */
113
static void
114
unpackd_fill_inodesin(const unpack_descriptor_t *unpackd, int level)
115
{
116

  
117
	/*
118
	 * The internal nodes would be 1/2 of total internal nodes and
119
	 * leaf nodes in the next level.  For the last level there
120
	 * would be no internal node by defination.
121
	 */
122
	if (level < unpackd->treelevels) {
123
		unpackd_fill_inodesin(unpackd, level + 1);
124
		unpackd->inodesin[level] = (unpackd->inodesin[level + 1] +
125
					  unpackd->symbolsin[level + 1]) / 2;
126
	} else
127
		unpackd->inodesin[level] = 0;
128
}
129

  
130
/*
131
 * Update counter for accepted bytes
132
 */
133
static void
134
accepted_bytes(off_t *bytes_in, off_t newbytes)
135
{
136

  
137
	if (bytes_in != NULL)
138
		(*bytes_in) += newbytes;
139
}
140

  
141
/*
142
 * Read file header and construct the tree.  Also, prepare the buffered I/O
143
 * for decode rountine.
144
 *
145
 * Return value is uncompressed size.
146
 */
147
static void
148
unpack_parse_header(int in, int out, char *pre, size_t prelen, off_t *bytes_in,
149
    unpack_descriptor_t *unpackd)
150
{
151
	unsigned char hdr[PACK_HEADER_LENGTH];	/* buffer for header */
152
	ssize_t bytesread;		/* Bytes read from the file */
153
	int i, j, thisbyte;
154

  
155
	/* Prepend the header buffer if we already read some data */
156
	if (prelen != 0)
157
		memcpy(hdr, pre, prelen);
158

  
159
	/* Read in and fill the rest bytes of header */
160
	bytesread = read(in, hdr + prelen, PACK_HEADER_LENGTH - prelen);
161
	if (bytesread < 0)
162
		maybe_err("Error reading pack header");
163

  
164
	accepted_bytes(bytes_in, PACK_HEADER_LENGTH);
165

  
166
	/* Obtain uncompressed length (bytes 2,3,4,5)*/
167
	unpackd->uncompressed_size = 0;
168
	for (i = 2; i <= 5; i++) {
169
		unpackd->uncompressed_size <<= 8;
170
		unpackd->uncompressed_size |= hdr[i];
171
	}
172

  
173
	/* Get the levels of the tree */
174
	unpackd->treelevels = hdr[6];
175
	if (unpackd->treelevels > HTREE_MAXLEVEL || unpackd->treelevels < 1)
176
		maybe_errx("Huffman tree has insane levels");
177

  
178
	/* Let libc take care for buffering from now on */
179
	if ((unpackd->fpIn = fdopen(in, "r")) == NULL)
180
		maybe_err("Can not fdopen() input stream");
181
	if ((unpackd->fpOut = fdopen(out, "w")) == NULL)
182
		maybe_err("Can not fdopen() output stream");
183

  
184
	/* Allocate for the tables of bounds and the tree itself */
185
	unpackd->inodesin =
186
	    calloc(unpackd->treelevels, sizeof(*(unpackd->inodesin)));
187
	unpackd->symbolsin =
188
	    calloc(unpackd->treelevels, sizeof(*(unpackd->symbolsin)));
189
	unpackd->tree =
190
	    calloc(unpackd->treelevels, (sizeof (*(unpackd->tree))));
191
	if (unpackd->inodesin == NULL || unpackd->symbolsin == NULL ||
192
	    unpackd->tree == NULL)
193
		maybe_err("calloc");
194

  
195
	/* We count from 0 so adjust to match array upper bound */
196
	unpackd->treelevels--;
197

  
198
	/* Read the levels symbol count table and caculate total */
199
	unpackd->symbol_size = 1;		/* EOB */
200
	for (i = 0; i <= unpackd->treelevels; i++) {
201
		if ((thisbyte = fgetc(unpackd->fpIn)) == EOF)
202
			maybe_err("File appears to be truncated");
203
		unpackd->symbolsin[i] = (unsigned char)thisbyte;
204
		unpackd->symbol_size += unpackd->symbolsin[i];
205
	}
206
	accepted_bytes(bytes_in, unpackd->treelevels);
207
	if (unpackd->symbol_size > 256)
208
		maybe_errx("Bad symbol table");
209

  
210
	/* Allocate for the symbol table, point symbol_eob at the beginning */
211
	unpackd->symbol_eob = unpackd->symbol = calloc(1, unpackd->symbol_size);
212
	if (unpackd->symbol == NULL)
213
		maybe_err("calloc");
214

  
215
	/*
216
	 * Read in the symbol table, which contain [2, 256] symbols.
217
	 * In order to fit the count in one byte, pack(1) would offset
218
	 * it by reducing 2 from the actual number from the last level.
219
	 *
220
	 * We adjust the last level's symbol count by 1 here, because
221
	 * the EOB symbol is not being transmitted explicitly.  Another
222
	 * adjustment would be done later afterward.
223
	 */
224
	unpackd->symbolsin[unpackd->treelevels]++;
225
	for (i = 0; i <= unpackd->treelevels; i++) {
226
		unpackd->tree[i] = unpackd->symbol_eob;
227
		for (j = 0; j < unpackd->symbolsin[i]; j++) {
228
			if ((thisbyte = fgetc(unpackd->fpIn)) == EOF)
229
				maybe_errx("Symbol table truncated");
230
			*unpackd->symbol_eob++ = (char)thisbyte;
231
		}
232
		accepted_bytes(bytes_in, unpackd->symbolsin[i]);
233
	}
234

  
235
	/* Now, take account for the EOB symbol as well */
236
	unpackd->symbolsin[unpackd->treelevels]++;
237

  
238
	/*
239
	 * The symbolsin table has been constructed now.
240
	 * Caculate the internal nodes count table based on it.
241
	 */
242
	unpackd_fill_inodesin(unpackd, 0);
243
}
244

  
245
/*
246
 * Decode huffman stream, based on the huffman tree.
247
 */
248
static void
249
unpack_decode(const unpack_descriptor_t *unpackd, off_t *bytes_in)
250
{
251
	int thislevel, thiscode, thisbyte, inlevelindex;
252
	int i;
253
	off_t bytes_out = 0;
254
	const char *thissymbol;	/* The symbol pointer decoded from stream */
255

  
256
	/*
257
	 * Decode huffman.  Fetch every bytes from the file, get it
258
	 * into 'thiscode' bit-by-bit, then output the symbol we got
259
	 * when one has been found.
260
	 *
261
	 * Assumption: sizeof(int) > ((max tree levels + 1) / 8).
262
	 * bad things could happen if not.
263
	 */
264
	thislevel = 0;
265
	thiscode = thisbyte = 0;
266

  
267
	while ((thisbyte = fgetc(unpackd->fpIn)) != EOF) {
268
		accepted_bytes(bytes_in, 1);
269

  
270
		/*
271
		 * Split one bit from thisbyte, from highest to lowest,
272
		 * feed the bit into thiscode, until we got a symbol from
273
		 * the tree.
274
		 */
275
		for (i = 7; i >= 0; i--) {
276
			thiscode = (thiscode << 1) | ((thisbyte >> i) & 1);
277

  
278
			/* Did we got a symbol? (referencing leaf node) */
279
			if (thiscode >= unpackd->inodesin[thislevel]) {
280
				inlevelindex =
281
				    thiscode - unpackd->inodesin[thislevel];
282
				if (inlevelindex > unpackd->symbolsin[thislevel])
283
					maybe_errx("File corrupt");
284

  
285
				thissymbol =
286
				    &(unpackd->tree[thislevel][inlevelindex]);
287
				if ((thissymbol == unpackd->symbol_eob) &&
288
				    (bytes_out == unpackd->uncompressed_size))
289
					goto finished;
290

  
291
				fputc((*thissymbol), unpackd->fpOut);
292
				bytes_out++;
293

  
294
				/* Prepare for next input */
295
				thislevel = 0; thiscode = 0;
296
			} else {
297
				thislevel++;
298
				if (thislevel > unpackd->treelevels)
299
					maybe_errx("File corrupt");
300
			}
301
		}
302
	}
303

  
304
finished:
305
	if (bytes_out != unpackd->uncompressed_size)
306
		maybe_errx("Premature EOF");
307
}
308

  
309
/* Handler for pack(1)'ed file */
310
static off_t
311
unpack(int in, int out, char *pre, size_t prelen, off_t *bytes_in)
312
{
313
	unpack_descriptor_t	unpackd;
314

  
315
	unpack_parse_header(dup(in), dup(out), pre, prelen, bytes_in, &unpackd);
316
	unpack_decode(&unpackd, bytes_in);
317
	unpack_descriptor_fini(&unpackd);
318

  
319
	/* If we reached here, the unpack was successful */
320
	return (unpackd.uncompressed_size);
321
}
322

  
usr.bin/gzip/gzip.c
76 76
#ifndef NO_COMPRESS_SUPPORT
77 77
	FT_Z,
78 78
#endif
79
#ifndef NO_PACK_SUPPORT
80
	FT_PACK,
81
#endif
79 82
	FT_LAST,
80 83
	FT_UNKNOWN
81 84
};
......
92 95
#define Z_MAGIC		"\037\235"
93 96
#endif
94 97

  
98
#ifndef NO_PACK_SUPPORT
99
#define PACK_MAGIC	"\037\036"
100
#endif
101

  
95 102
#define GZ_SUFFIX	".gz"
96 103

  
97 104
#define BUFLEN		(64 * 1024)
......
165 172

  
166 173
static	void	maybe_err(const char *fmt, ...)
167 174
    __attribute__((__format__(__printf__, 1, 2)));
168
#ifndef NO_BZIP2_SUPPORT
175
#if !defined(NO_BZIP2_SUPPORT) || !defined(NO_PACK_SUPPORT)
169 176
static	void	maybe_errx(const char *fmt, ...)
170 177
    __attribute__((__format__(__printf__, 1, 2)));
171 178
#endif
......
213 220
static	off_t	zuncompress(FILE *, FILE *, char *, size_t, off_t *);
214 221
#endif
215 222

  
223
#ifndef NO_PACK_SUPPORT
224
static	off_t	unpack(int, int, char *, size_t, off_t *);
225
#endif
226

  
216 227
int main(int, char *p[]);
217 228

  
218 229
#ifdef SMALL
......
411 422
	exit(2);
412 423
}
413 424

  
414
#ifndef NO_BZIP2_SUPPORT
425
#if !defined(NO_BZIP2_SUPPORT) || !defined(NO_PACK_SUPPORT)
415 426
/* ... without an errno. */
416 427
void
417 428
maybe_errx(const char *fmt, ...)
......
1073 1084
		return FT_Z;
1074 1085
	else
1075 1086
#endif
1087
#ifndef NO_PACK_SUPPORT
1088
	if (memcmp(buf, PACK_MAGIC, 2) == 0)
1089
		return FT_PACK;
1090
	else
1091
#endif
1076 1092
		return FT_UNKNOWN;
1077 1093
}
1078 1094

  
......
1427 1443
	} else
1428 1444
#endif
1429 1445

  
1446
#ifndef NO_PACK_SUPPORT
1447
	if (method == FT_PACK) {
1448
		if (lflag) {
1449
			maybe_warnx("no -l with packed files");
1450
			goto lose;
1451
		}
1452

  
1453
		size = unpack(fd, zfd, NULL, 0, NULL);
1454
	} else
1455
#endif
1456

  
1430 1457
#ifndef SMALL
1431 1458
	if (method == FT_UNKNOWN) {
1432 1459
		if (lflag) {
......
1620 1647
		fclose(in);
1621 1648
		break;
1622 1649
#endif
1650
#ifndef NO_PACK_SUPPORT
1651
	case FT_PACK:
1652
		usize = unpack(STDIN_FILENO, STDOUT_FILENO,
1653
			       (char *)header1, sizeof header1, &gsize);
1654
		break;
1655
#endif
1623 1656
	}
1624 1657

  
1625 1658
#ifndef SMALL
......
1993 2026
#ifndef NO_COMPRESS_SUPPORT
1994 2027
#include "zuncompress.c"
1995 2028
#endif
2029
#ifndef NO_PACK_SUPPORT
2030
#include "unpack.c"
2031
#endif
1996 2032

  
1997 2033
static ssize_t
1998 2034
read_retry(int fd, void *buf, size_t sz)