1 /* vi: set sw = 4 ts = 4: */
2 /*	Small bzip2 deflate implementation, by Rob Landley (rob@landley.net).
3 
4 	Based on bzip2 decompression code by Julian R Seward (jseward@acm.org),
5 	which also acknowledges contributions by Mike Burrows, David Wheeler,
6 	Peter Fenwick, Alistair Moffat, Radford Neal, Ian H. Witten,
7 	Robert Sedgewick, and Jon L. Bentley.
8 
9 	This code is licensed under the LGPLv2:
10 		LGPL (http://www.gnu.org/copyleft/lgpl.html
11 */
12 
13 /*
14 	Size and speed optimizations by Manuel Novoa III  (mjn3@codepoet.org).
15 
16 	More efficient reading of Huffman codes, a streamlined read_bunzip()
17 	function, and various other tweaks.  In (limited) tests, approximately
18 	20% faster than bzcat on x86 and about 10% faster on arm.
19 
20 	Note that about 2/3 of the time is spent in read_unzip() reversing
21 	the Burrows-Wheeler transformation.  Much of that time is delay
22 	resulting from cache misses.
23 
24 	I would ask that anyone benefiting from this work, especially those
25 	using it in commercial products, consider making a donation to my local
26 	non-profit hospice organization in the name of the woman I loved, who
27 	passed away Feb. 12, 2003.
28 
29 		In memory of Toni W. Hagan
30 
31 		Hospice of Acadiana, Inc.
32 		2600 Johnston St., Suite 200
33 		Lafayette, LA 70503-3240
34 
35 		Phone (337) 232-1234 or 1-800-738-2226
36 		Fax   (337) 232-1297
37 
38 		http://www.hospiceacadiana.com/
39 
40 	Manuel
41  */
42 
43 /*
44 	Made it fit for running in Linux Kernel by Alain Knaff (alain@knaff.lu)
45 */
46 
47 #include "decompress.h"
48 
49 #ifndef INT_MAX
50 #define INT_MAX 0x7fffffff
51 #endif
52 
53 /* Constants for Huffman coding */
54 #define MAX_GROUPS		6
55 #define GROUP_SIZE   		50	/* 64 would have been more efficient */
56 #define MAX_HUFCODE_BITS 	20	/* Longest Huffman code allowed */
57 #define MAX_SYMBOLS 		258	/* 256 literals + RUNA + RUNB */
58 #define SYMBOL_RUNA		0
59 #define SYMBOL_RUNB		1
60 
61 /* Status return values */
62 #define RETVAL_OK			0
63 #define RETVAL_LAST_BLOCK		(-1)
64 #define RETVAL_NOT_BZIP_DATA		(-2)
65 #define RETVAL_UNEXPECTED_INPUT_EOF	(-3)
66 #define RETVAL_UNEXPECTED_OUTPUT_EOF	(-4)
67 #define RETVAL_DATA_ERROR		(-5)
68 #define RETVAL_OUT_OF_MEMORY		(-6)
69 #define RETVAL_OBSOLETE_INPUT		(-7)
70 
71 /* Other housekeeping constants */
72 #define BZIP2_IOBUF_SIZE		4096
73 
74 /* This is what we know about each Huffman coding group */
75 struct group_data {
76 	/* We have an extra slot at the end of limit[] for a sentinal value. */
77 	int limit[MAX_HUFCODE_BITS+1];
78 	int base[MAX_HUFCODE_BITS];
79 	int permute[MAX_SYMBOLS];
80 	int minLen, maxLen;
81 };
82 
83 /* Structure holding all the housekeeping data, including IO buffers and
84    memory that persists between calls to bunzip */
85 struct bunzip_data {
86 	/* State for interrupting output loop */
87 	int writeCopies, writePos, writeRunCountdown, writeCount, writeCurrent;
88 	/* I/O tracking data (file handles, buffers, positions, etc.) */
89 	int (*fill)(void*, unsigned int);
90 	int inbufCount, inbufPos /*, outbufPos*/;
91 	unsigned char *inbuf /*,*outbuf*/;
92 	unsigned int inbufBitCount, inbufBits;
93 	/* The CRC values stored in the block header and calculated from the
94 	data */
95 	unsigned int crc32Table[256], headerCRC, totalCRC, writeCRC;
96 	/* Intermediate buffer and its size (in bytes) */
97 	unsigned int *dbuf, dbufSize;
98 	/* These things are a bit too big to go on the stack */
99 	unsigned char selectors[32768];		/* nSelectors = 15 bits */
100 	struct group_data groups[MAX_GROUPS];	/* Huffman coding tables */
101 	int io_error;			/* non-zero if we have IO error */
102 };
103 
104 
105 /* Return the next nnn bits of input.  All reads from the compressed input
106    are done through this function.  All reads are big endian */
get_bits(struct bunzip_data * bd,char bits_wanted)107 static unsigned int INIT get_bits(struct bunzip_data *bd, char bits_wanted)
108 {
109 	unsigned int bits = 0;
110 
111 	/* If we need to get more data from the byte buffer, do so.
112 	   (Loop getting one byte at a time to enforce endianness and avoid
113 	   unaligned access.) */
114 	while (bd->inbufBitCount < bits_wanted) {
115 		/* If we need to read more data from file into byte buffer, do
116 		   so */
117 		if (bd->inbufPos == bd->inbufCount) {
118 			if (bd->io_error)
119 				return 0;
120 			bd->inbufCount = bd->fill(bd->inbuf, BZIP2_IOBUF_SIZE);
121 			if (bd->inbufCount <= 0) {
122 				bd->io_error = RETVAL_UNEXPECTED_INPUT_EOF;
123 				return 0;
124 			}
125 			bd->inbufPos = 0;
126 		}
127 		/* Avoid 32-bit overflow (dump bit buffer to top of output) */
128 		if (bd->inbufBitCount >= 24) {
129 			bits = bd->inbufBits&((1 << bd->inbufBitCount)-1);
130 			bits_wanted -= bd->inbufBitCount;
131 			bits <<= bits_wanted;
132 			bd->inbufBitCount = 0;
133 		}
134 		/* Grab next 8 bits of input from buffer. */
135 		bd->inbufBits = (bd->inbufBits << 8)|bd->inbuf[bd->inbufPos++];
136 		bd->inbufBitCount += 8;
137 	}
138 	/* Calculate result */
139 	bd->inbufBitCount -= bits_wanted;
140 	bits |= (bd->inbufBits >> bd->inbufBitCount)&((1 << bits_wanted)-1);
141 
142 	return bits;
143 }
144 
145 /* Unpacks the next block and sets up for the inverse burrows-wheeler step. */
146 
get_next_block(struct bunzip_data * bd)147 static int INIT get_next_block(struct bunzip_data *bd)
148 {
149 	struct group_data *hufGroup = NULL;
150 	int *base = NULL;
151 	int *limit = NULL;
152 	int dbufCount, nextSym, dbufSize, groupCount, selector,
153 		i, j, k, t, runPos, symCount, symTotal, nSelectors,
154 		byteCount[256];
155 	unsigned char uc, symToByte[256], mtfSymbol[256], *selectors;
156 	unsigned int *dbuf, origPtr;
157 
158 	dbuf = bd->dbuf;
159 	dbufSize = bd->dbufSize;
160 	selectors = bd->selectors;
161 
162 	/* Read in header signature and CRC, then validate signature.
163 	   (last block signature means CRC is for whole file, return now) */
164 	i = get_bits(bd, 24);
165 	j = get_bits(bd, 24);
166 	bd->headerCRC = get_bits(bd, 32);
167 	if ((i == 0x177245) && (j == 0x385090))
168 		return RETVAL_LAST_BLOCK;
169 	if ((i != 0x314159) || (j != 0x265359))
170 		return RETVAL_NOT_BZIP_DATA;
171 	/* We can add support for blockRandomised if anybody complains.
172 	   There was some code for this in busybox 1.0.0-pre3, but nobody ever
173 	   noticed that it didn't actually work. */
174 	if (get_bits(bd, 1))
175 		return RETVAL_OBSOLETE_INPUT;
176 	origPtr = get_bits(bd, 24);
177 	if (origPtr >= dbufSize)
178 		return RETVAL_DATA_ERROR;
179 	/* mapping table: if some byte values are never used (encoding things
180 	   like ascii text), the compression code removes the gaps to have fewer
181 	   symbols to deal with, and writes a sparse bitfield indicating which
182 	   values were present.  We make a translation table to convert the
183 	   symbols back to the corresponding bytes. */
184 	t = get_bits(bd, 16);
185 	symTotal = 0;
186 	for (i = 0; i < 16; i++) {
187 		if (t&(1 << (15-i))) {
188 			k = get_bits(bd, 16);
189 			for (j = 0; j < 16; j++)
190 				if (k&(1 << (15-j)))
191 					symToByte[symTotal++] = (16*i)+j;
192 		}
193 	}
194 	/* How many different Huffman coding groups does this block use? */
195 	groupCount = get_bits(bd, 3);
196 	if (groupCount < 2 || groupCount > MAX_GROUPS)
197 		return RETVAL_DATA_ERROR;
198 	/* nSelectors: Every GROUP_SIZE many symbols we select a new
199 	   Huffman coding group.  Read in the group selector list,
200 	   which is stored as MTF encoded bit runs.  (MTF = Move To
201 	   Front, as each value is used it's moved to the start of the
202 	   list.) */
203 	nSelectors = get_bits(bd, 15);
204 	if (!nSelectors)
205 		return RETVAL_DATA_ERROR;
206 	for (i = 0; i < groupCount; i++)
207 		mtfSymbol[i] = i;
208 	for (i = 0; i < nSelectors; i++) {
209 		/* Get next value */
210 		for (j = 0; get_bits(bd, 1); j++)
211 			if (j >= groupCount)
212 				return RETVAL_DATA_ERROR;
213 		/* Decode MTF to get the next selector */
214 		uc = mtfSymbol[j];
215 		for (; j; j--)
216 			mtfSymbol[j] = mtfSymbol[j-1];
217 		mtfSymbol[0] = selectors[i] = uc;
218 	}
219 	/* Read the Huffman coding tables for each group, which code
220 	   for symTotal literal symbols, plus two run symbols (RUNA,
221 	   RUNB) */
222 	symCount = symTotal+2;
223 	for (j = 0; j < groupCount; j++) {
224 		unsigned char length[MAX_SYMBOLS], temp[MAX_HUFCODE_BITS+1];
225 		int	minLen,	maxLen, pp;
226 		/* Read Huffman code lengths for each symbol.  They're
227 		   stored in a way similar to mtf; record a starting
228 		   value for the first symbol, and an offset from the
229 		   previous value for everys symbol after that.
230 		   (Subtracting 1 before the loop and then adding it
231 		   back at the end is an optimization that makes the
232 		   test inside the loop simpler: symbol length 0
233 		   becomes negative, so an unsigned inequality catches
234 		   it.) */
235 		t = get_bits(bd, 5)-1;
236 		for (i = 0; i < symCount; i++) {
237 			for (;;) {
238 				if (((unsigned)t) > (MAX_HUFCODE_BITS-1))
239 					return RETVAL_DATA_ERROR;
240 
241 				/* If first bit is 0, stop.  Else
242 				   second bit indicates whether to
243 				   increment or decrement the value.
244 				   Optimization: grab 2 bits and unget
245 				   the second if the first was 0. */
246 
247 				k = get_bits(bd, 2);
248 				if (k < 2) {
249 					bd->inbufBitCount++;
250 					break;
251 				}
252 				/* Add one if second bit 1, else
253 				 * subtract 1.  Avoids if/else */
254 				t += (((k+1)&2)-1);
255 			}
256 			/* Correct for the initial -1, to get the
257 			 * final symbol length */
258 			length[i] = t+1;
259 		}
260 		/* Find largest and smallest lengths in this group */
261 		minLen = maxLen = length[0];
262 
263 		for (i = 1; i < symCount; i++) {
264 			if (length[i] > maxLen)
265 				maxLen = length[i];
266 			else if (length[i] < minLen)
267 				minLen = length[i];
268 		}
269 
270 		/* Calculate permute[], base[], and limit[] tables from
271 		 * length[].
272 		 *
273 		 * permute[] is the lookup table for converting
274 		 * Huffman coded symbols into decoded symbols.  base[]
275 		 * is the amount to subtract from the value of a
276 		 * Huffman symbol of a given length when using
277 		 * permute[].
278 		 *
279 		 * limit[] indicates the largest numerical value a
280 		 * symbol with a given number of bits can have.  This
281 		 * is how the Huffman codes can vary in length: each
282 		 * code with a value > limit[length] needs another
283 		 * bit.
284 		 */
285 		hufGroup = bd->groups+j;
286 		hufGroup->minLen = minLen;
287 		hufGroup->maxLen = maxLen;
288 		/* Note that minLen can't be smaller than 1, so we
289 		   adjust the base and limit array pointers so we're
290 		   not always wasting the first entry.  We do this
291 		   again when using them (during symbol decoding).*/
292 		base = hufGroup->base-1;
293 		limit = hufGroup->limit-1;
294 		/* Calculate permute[].  Concurently, initialize
295 		 * temp[] and limit[]. */
296 		pp = 0;
297 		for (i = minLen; i <= maxLen; i++) {
298 			temp[i] = limit[i] = 0;
299 			for (t = 0; t < symCount; t++)
300 				if (length[t] == i)
301 					hufGroup->permute[pp++] = t;
302 		}
303 		/* Count symbols coded for at each bit length */
304 		for (i = 0; i < symCount; i++)
305 			temp[length[i]]++;
306 		/* Calculate limit[] (the largest symbol-coding value
307 		 *at each bit length, which is (previous limit <<
308 		 *1)+symbols at this level), and base[] (number of
309 		 *symbols to ignore at each bit length, which is limit
310 		 *minus the cumulative count of symbols coded for
311 		 *already). */
312 		pp = t = 0;
313 		for (i = minLen; i < maxLen; i++) {
314 			pp += temp[i];
315 			/* We read the largest possible symbol size
316 			   and then unget bits after determining how
317 			   many we need, and those extra bits could be
318 			   set to anything.  (They're noise from
319 			   future symbols.)  At each level we're
320 			   really only interested in the first few
321 			   bits, so here we set all the trailing
322 			   to-be-ignored bits to 1 so they don't
323 			   affect the value > limit[length]
324 			   comparison. */
325 			limit[i] = (pp << (maxLen - i)) - 1;
326 			pp <<= 1;
327 			base[i+1] = pp-(t += temp[i]);
328 		}
329 		limit[maxLen+1] = INT_MAX; /* Sentinal value for
330 					    * reading next sym. */
331 		limit[maxLen] = pp+temp[maxLen]-1;
332 		base[minLen] = 0;
333 	}
334 	/* We've finished reading and digesting the block header.  Now
335 	   read this block's Huffman coded symbols from the file and
336 	   undo the Huffman coding and run length encoding, saving the
337 	   result into dbuf[dbufCount++] = uc */
338 
339 	/* Initialize symbol occurrence counters and symbol Move To
340 	 * Front table */
341 	for (i = 0; i < 256; i++) {
342 		byteCount[i] = 0;
343 		mtfSymbol[i] = (unsigned char)i;
344 	}
345 	/* Loop through compressed symbols. */
346 	runPos = dbufCount = symCount = selector = 0;
347 	for (;;) {
348 		/* Determine which Huffman coding group to use. */
349 		if (!(symCount--)) {
350 			symCount = GROUP_SIZE-1;
351 			if (selector >= nSelectors)
352 				return RETVAL_DATA_ERROR;
353 			hufGroup = bd->groups+selectors[selector++];
354 			base = hufGroup->base-1;
355 			limit = hufGroup->limit-1;
356 		}
357 		/* Read next Huffman-coded symbol. */
358 		/* Note: It is far cheaper to read maxLen bits and
359 		   back up than it is to read minLen bits and then an
360 		   additional bit at a time, testing as we go.
361 		   Because there is a trailing last block (with file
362 		   CRC), there is no danger of the overread causing an
363 		   unexpected EOF for a valid compressed file.  As a
364 		   further optimization, we do the read inline
365 		   (falling back to a call to get_bits if the buffer
366 		   runs dry).  The following (up to got_huff_bits:) is
367 		   equivalent to j = get_bits(bd, hufGroup->maxLen);
368 		 */
369 		while (bd->inbufBitCount < hufGroup->maxLen) {
370 			if (bd->inbufPos == bd->inbufCount) {
371 				j = get_bits(bd, hufGroup->maxLen);
372 				goto got_huff_bits;
373 			}
374 			bd->inbufBits =
375 				(bd->inbufBits << 8)|bd->inbuf[bd->inbufPos++];
376 			bd->inbufBitCount += 8;
377 		};
378 		bd->inbufBitCount -= hufGroup->maxLen;
379 		j = (bd->inbufBits >> bd->inbufBitCount)&
380 			((1 << hufGroup->maxLen)-1);
381 got_huff_bits:
382 		/* Figure how how many bits are in next symbol and
383 		 * unget extras */
384 		i = hufGroup->minLen;
385 		while (j > limit[i])
386 			++i;
387 		bd->inbufBitCount += (hufGroup->maxLen - i);
388 		/* Huffman decode value to get nextSym (with bounds checking) */
389 		if ((i > hufGroup->maxLen)
390 			|| (((unsigned)(j = (j>>(hufGroup->maxLen-i))-base[i]))
391 				>= MAX_SYMBOLS))
392 			return RETVAL_DATA_ERROR;
393 		nextSym = hufGroup->permute[j];
394 		/* We have now decoded the symbol, which indicates
395 		   either a new literal byte, or a repeated run of the
396 		   most recent literal byte.  First, check if nextSym
397 		   indicates a repeated run, and if so loop collecting
398 		   how many times to repeat the last literal. */
399 		if (((unsigned)nextSym) <= SYMBOL_RUNB) { /* RUNA or RUNB */
400 			/* If this is the start of a new run, zero out
401 			 * counter */
402 			if (!runPos) {
403 				runPos = 1;
404 				t = 0;
405 			}
406 			/* Neat trick that saves 1 symbol: instead of
407 			   or-ing 0 or 1 at each bit position, add 1
408 			   or 2 instead.  For example, 1011 is 1 << 0
409 			   + 1 << 1 + 2 << 2.  1010 is 2 << 0 + 2 << 1
410 			   + 1 << 2.  You can make any bit pattern
411 			   that way using 1 less symbol than the basic
412 			   or 0/1 method (except all bits 0, which
413 			   would use no symbols, but a run of length 0
414 			   doesn't mean anything in this context).
415 			   Thus space is saved. */
416 			t += (runPos << nextSym);
417 			/* +runPos if RUNA; +2*runPos if RUNB */
418 
419 			runPos <<= 1;
420 			continue;
421 		}
422 		/* When we hit the first non-run symbol after a run,
423 		   we now know how many times to repeat the last
424 		   literal, so append that many copies to our buffer
425 		   of decoded symbols (dbuf) now.  (The last literal
426 		   used is the one at the head of the mtfSymbol
427 		   array.) */
428 		if (runPos) {
429 			runPos = 0;
430 			if (dbufCount+t >= dbufSize)
431 				return RETVAL_DATA_ERROR;
432 
433 			uc = symToByte[mtfSymbol[0]];
434 			byteCount[uc] += t;
435 			while (t--)
436 				dbuf[dbufCount++] = uc;
437 		}
438 		/* Is this the terminating symbol? */
439 		if (nextSym > symTotal)
440 			break;
441 		/* At this point, nextSym indicates a new literal
442 		   character.  Subtract one to get the position in the
443 		   MTF array at which this literal is currently to be
444 		   found.  (Note that the result can't be -1 or 0,
445 		   because 0 and 1 are RUNA and RUNB.  But another
446 		   instance of the first symbol in the mtf array,
447 		   position 0, would have been handled as part of a
448 		   run above.  Therefore 1 unused mtf position minus 2
449 		   non-literal nextSym values equals -1.) */
450 		if (dbufCount >= dbufSize)
451 			return RETVAL_DATA_ERROR;
452 		i = nextSym - 1;
453 		uc = mtfSymbol[i];
454 		/* Adjust the MTF array.  Since we typically expect to
455 		 *move only a small number of symbols, and are bound
456 		 *by 256 in any case, using memmove here would
457 		 *typically be bigger and slower due to function call
458 		 *overhead and other assorted setup costs. */
459 		do {
460 			mtfSymbol[i] = mtfSymbol[i-1];
461 		} while (--i);
462 		mtfSymbol[0] = uc;
463 		uc = symToByte[uc];
464 		/* We have our literal byte.  Save it into dbuf. */
465 		byteCount[uc]++;
466 		dbuf[dbufCount++] = (unsigned int)uc;
467 	}
468 	/* At this point, we've read all the Huffman-coded symbols
469 	   (and repeated runs) for this block from the input stream,
470 	   and decoded them into the intermediate buffer.  There are
471 	   dbufCount many decoded bytes in dbuf[].  Now undo the
472 	   Burrows-Wheeler transform on dbuf.  See
473 	   http://dogma.net/markn/articles/bwt/bwt.htm
474 	 */
475 	/* Turn byteCount into cumulative occurrence counts of 0 to n-1. */
476 	j = 0;
477 	for (i = 0; i < 256; i++) {
478 		k = j+byteCount[i];
479 		byteCount[i] = j;
480 		j = k;
481 	}
482 	/* Figure out what order dbuf would be in if we sorted it. */
483 	for (i = 0; i < dbufCount; i++) {
484 		uc = (unsigned char)(dbuf[i] & 0xff);
485 		dbuf[byteCount[uc]] |= (i << 8);
486 		byteCount[uc]++;
487 	}
488 	/* Decode first byte by hand to initialize "previous" byte.
489 	   Note that it doesn't get output, and if the first three
490 	   characters are identical it doesn't qualify as a run (hence
491 	   writeRunCountdown = 5). */
492 	if (dbufCount) {
493 		if (origPtr >= dbufCount)
494 			return RETVAL_DATA_ERROR;
495 		bd->writePos = dbuf[origPtr];
496 		bd->writeCurrent = (unsigned char)(bd->writePos&0xff);
497 		bd->writePos >>= 8;
498 		bd->writeRunCountdown = 5;
499 	}
500 	bd->writeCount = dbufCount;
501 
502 	return RETVAL_OK;
503 }
504 
505 /* Undo burrows-wheeler transform on intermediate buffer to produce output.
506    If start_bunzip was initialized with out_fd =-1, then up to len bytes of
507    data are written to outbuf.  Return value is number of bytes written or
508    error (all errors are negative numbers).  If out_fd!=-1, outbuf and len
509    are ignored, data is written to out_fd and return is RETVAL_OK or error.
510 */
511 
read_bunzip(struct bunzip_data * bd,unsigned char * outbuf,int len)512 static int INIT read_bunzip(struct bunzip_data *bd, unsigned char *outbuf, int len)
513 {
514 	const unsigned int *dbuf;
515 	int pos, xcurrent, previous, gotcount;
516 
517 	/* If last read was short due to end of file, return last block now */
518 	if (bd->writeCount < 0)
519 		return bd->writeCount;
520 
521 	gotcount = 0;
522 	dbuf = bd->dbuf;
523 	pos = bd->writePos;
524 	xcurrent = bd->writeCurrent;
525 
526 	/* We will always have pending decoded data to write into the output
527 	   buffer unless this is the very first call (in which case we haven't
528 	   Huffman-decoded a block into the intermediate buffer yet). */
529 
530 	if (bd->writeCopies) {
531 		/* Inside the loop, writeCopies means extra copies (beyond 1) */
532 		--bd->writeCopies;
533 		/* Loop outputting bytes */
534 		for (;;) {
535 			/* If the output buffer is full, snapshot
536 			 * state and return */
537 			if (gotcount >= len) {
538 				bd->writePos = pos;
539 				bd->writeCurrent = xcurrent;
540 				bd->writeCopies++;
541 				return len;
542 			}
543 			/* Write next byte into output buffer, updating CRC */
544 			outbuf[gotcount++] = xcurrent;
545 			bd->writeCRC = (((bd->writeCRC) << 8)
546 				^bd->crc32Table[((bd->writeCRC) >> 24)
547 				^xcurrent]);
548 			/* Loop now if we're outputting multiple
549 			 * copies of this byte */
550 			if (bd->writeCopies) {
551 				--bd->writeCopies;
552 				continue;
553 			}
554 decode_next_byte:
555 			if (!bd->writeCount--)
556 				break;
557 			/* Follow sequence vector to undo
558 			 * Burrows-Wheeler transform */
559 			previous = xcurrent;
560 			pos = dbuf[pos];
561 			xcurrent = pos&0xff;
562 			pos >>= 8;
563 			/* After 3 consecutive copies of the same
564 			   byte, the 4th is a repeat count.  We count
565 			   down from 4 instead *of counting up because
566 			   testing for non-zero is faster */
567 			if (--bd->writeRunCountdown) {
568 				if (xcurrent != previous)
569 					bd->writeRunCountdown = 4;
570 			} else {
571 				/* We have a repeated run, this byte
572 				 * indicates the count */
573 				bd->writeCopies = xcurrent;
574 				xcurrent = previous;
575 				bd->writeRunCountdown = 5;
576 				/* Sometimes there are just 3 bytes
577 				 * (run length 0) */
578 				if (!bd->writeCopies)
579 					goto decode_next_byte;
580 				/* Subtract the 1 copy we'd output
581 				 * anyway to get extras */
582 				--bd->writeCopies;
583 			}
584 		}
585 		/* Decompression of this block completed successfully */
586 		bd->writeCRC = ~bd->writeCRC;
587 		bd->totalCRC = ((bd->totalCRC << 1) |
588 				(bd->totalCRC >> 31)) ^ bd->writeCRC;
589 		/* If this block had a CRC error, force file level CRC error. */
590 		if (bd->writeCRC != bd->headerCRC) {
591 			bd->totalCRC = bd->headerCRC+1;
592 			return RETVAL_LAST_BLOCK;
593 		}
594 	}
595 
596 	/* Refill the intermediate buffer by Huffman-decoding next
597 	 * block of input */
598 	/* (previous is just a convenient unused temp variable here) */
599 	previous = get_next_block(bd);
600 	if (previous) {
601 		bd->writeCount = previous;
602 		return (previous != RETVAL_LAST_BLOCK) ? previous : gotcount;
603 	}
604 	bd->writeCRC = 0xffffffffUL;
605 	pos = bd->writePos;
606 	xcurrent = bd->writeCurrent;
607 	goto decode_next_byte;
608 }
609 
nofill(void * buf,unsigned int len)610 static int INIT nofill(void *buf, unsigned int len)
611 {
612 	return -1;
613 }
614 
615 /* Allocate the structure, read file header.  If in_fd ==-1, inbuf must contain
616    a complete bunzip file (len bytes long).  If in_fd!=-1, inbuf and len are
617    ignored, and data is read from file handle into temporary buffer. */
start_bunzip(struct bunzip_data ** bdp,void * inbuf,int len,int (* fill)(void *,unsigned int))618 static int INIT start_bunzip(struct bunzip_data **bdp, void *inbuf, int len,
619 			     int (*fill)(void*, unsigned int))
620 {
621 	struct bunzip_data *bd;
622 	unsigned int i, j, c;
623 	const unsigned int BZh0 =
624 		(((unsigned int)'B') << 24)+(((unsigned int)'Z') << 16)
625 		+(((unsigned int)'h') << 8)+(unsigned int)'0';
626 
627 	/* Figure out how much data to allocate */
628 	i = sizeof(struct bunzip_data);
629 
630 	/* Allocate bunzip_data.  Most fields initialize to zero. */
631 	bd = *bdp = malloc(i);
632 	if (!bd)
633 		return RETVAL_OUT_OF_MEMORY;
634 	memset(bd, 0, sizeof(struct bunzip_data));
635 	/* Setup input buffer */
636 	bd->inbuf = inbuf;
637 	bd->inbufCount = len;
638 	if (fill != NULL)
639 		bd->fill = fill;
640 	else
641 		bd->fill = nofill;
642 
643 	/* Init the CRC32 table (big endian) */
644 	for (i = 0; i < 256; i++) {
645 		c = i << 24;
646 		for (j = 8; j; j--)
647 			c = c&0x80000000 ? (c << 1)^0x04c11db7 : (c << 1);
648 		bd->crc32Table[i] = c;
649 	}
650 
651 	/* Ensure that file starts with "BZh['1'-'9']." */
652 	i = get_bits(bd, 32);
653 	if (((unsigned int)(i-BZh0-1)) >= 9)
654 		return RETVAL_NOT_BZIP_DATA;
655 
656 	/* Fourth byte (ascii '1'-'9'), indicates block size in units of 100k of
657 	   uncompressed data.  Allocate intermediate buffer for block. */
658 	bd->dbufSize = 100000*(i-BZh0);
659 
660 	bd->dbuf = large_malloc(bd->dbufSize * sizeof(int));
661 	if (!bd->dbuf)
662 		return RETVAL_OUT_OF_MEMORY;
663 	return RETVAL_OK;
664 }
665 
666 /* Example usage: decompress src_fd to dst_fd.  (Stops at end of bzip2 data,
667    not end of file.) */
bunzip2(unsigned char * buf,unsigned int len,int (* fill)(void *,unsigned int),int (* flush)(void *,unsigned int),unsigned char * outbuf,unsigned int * pos,void (* error)(const char * x))668 STATIC int INIT bunzip2(unsigned char *buf, unsigned int len,
669 			int(*fill)(void*, unsigned int),
670 			int(*flush)(void*, unsigned int),
671 			unsigned char *outbuf,
672 			unsigned int *pos,
673 			void(*error)(const char *x))
674 {
675 	struct bunzip_data *bd;
676 	int i = -1;
677 	unsigned char *inbuf;
678 
679 	if (flush)
680 		outbuf = malloc(BZIP2_IOBUF_SIZE);
681 
682 	if (!outbuf) {
683 		error("Could not allocate output buffer");
684 		return RETVAL_OUT_OF_MEMORY;
685 	}
686 	if (buf)
687 		inbuf = buf;
688 	else
689 		inbuf = malloc(BZIP2_IOBUF_SIZE);
690 	if (!inbuf) {
691 		error("Could not allocate input buffer");
692 		i = RETVAL_OUT_OF_MEMORY;
693 		goto exit_0;
694 	}
695 	i = start_bunzip(&bd, inbuf, len, fill);
696 	if (!i) {
697 		for (;;) {
698 			i = read_bunzip(bd, outbuf, BZIP2_IOBUF_SIZE);
699 			if (i <= 0)
700 				break;
701 			if (!flush)
702 				outbuf += i;
703 			else
704 				if (i != flush(outbuf, i)) {
705 					i = RETVAL_UNEXPECTED_OUTPUT_EOF;
706 					break;
707 				}
708 		}
709 	}
710 	/* Check CRC and release memory */
711 	if (i == RETVAL_LAST_BLOCK) {
712 		if (bd->headerCRC != bd->totalCRC)
713 			error("Data integrity error when decompressing.");
714 		else
715 			i = RETVAL_OK;
716 	} else if (i == RETVAL_UNEXPECTED_OUTPUT_EOF) {
717 		error("Compressed file ends unexpectedly");
718 	}
719 	if (!bd)
720 		goto exit_1;
721 	if (bd->dbuf)
722 		large_free(bd->dbuf);
723 	if (pos)
724 		*pos = bd->inbufPos;
725 	free(bd);
726 exit_1:
727 	if (!buf)
728 		free(inbuf);
729 exit_0:
730 	if (flush)
731 		free(outbuf);
732 	return i;
733 }
734