1 /* vi: set sw = 4 ts = 4: */
2 /*	Small bzip2 deflate implementation, by Rob Landley (rob@landley.net).
3 
4 	Based on bzip2 decompression code by Julian R Seward (jseward@acm.org),
5 	which also acknowledges contributions by Mike Burrows, David Wheeler,
6 	Peter Fenwick, Alistair Moffat, Radford Neal, Ian H. Witten,
7 	Robert Sedgewick, and Jon L. Bentley.
8 
9 	This code is licensed under the LGPLv2:
10 		LGPL (http://www.gnu.org/copyleft/lgpl.html
11 */
12 
13 /*
14 	Size and speed optimizations by Manuel Novoa III  (mjn3@codepoet.org).
15 
16 	More efficient reading of Huffman codes, a streamlined read_bunzip()
17 	function, and various other tweaks.  In (limited) tests, approximately
18 	20% faster than bzcat on x86 and about 10% faster on arm.
19 
20 	Note that about 2/3 of the time is spent in read_unzip() reversing
21 	the Burrows-Wheeler transformation.  Much of that time is delay
22 	resulting from cache misses.
23 
24 	I would ask that anyone benefiting from this work, especially those
25 	using it in commercial products, consider making a donation to my local
26 	non-profit hospice organization in the name of the woman I loved, who
27 	passed away Feb. 12, 2003.
28 
29 		In memory of Toni W. Hagan
30 
31 		Hospice of Acadiana, Inc.
32 		2600 Johnston St., Suite 200
33 		Lafayette, LA 70503-3240
34 
35 		Phone (337) 232-1234 or 1-800-738-2226
36 		Fax   (337) 232-1297
37 
38 		http://www.hospiceacadiana.com/
39 
40 	Manuel
41  */
42 
43 /*
44 	Made it fit for running in Linux Kernel by Alain Knaff (alain@knaff.lu)
45 */
46 
47 #include "decompress.h"
48 
49 #ifndef INT_MAX
50 #define INT_MAX 0x7fffffff
51 #endif
52 
53 /* Constants for Huffman coding */
54 #define MAX_GROUPS		6
55 #define GROUP_SIZE   		50	/* 64 would have been more efficient */
56 #define MAX_HUFCODE_BITS 	20	/* Longest Huffman code allowed */
57 #define MAX_SYMBOLS 		258	/* 256 literals + RUNA + RUNB */
58 #define SYMBOL_RUNA		0
59 #define SYMBOL_RUNB		1
60 
61 /* Status return values */
62 #define RETVAL_OK			0
63 #define RETVAL_LAST_BLOCK		(-1)
64 #define RETVAL_NOT_BZIP_DATA		(-2)
65 #define RETVAL_UNEXPECTED_INPUT_EOF	(-3)
66 #define RETVAL_UNEXPECTED_OUTPUT_EOF	(-4)
67 #define RETVAL_DATA_ERROR		(-5)
68 #define RETVAL_OUT_OF_MEMORY		(-6)
69 #define RETVAL_OBSOLETE_INPUT		(-7)
70 
71 /* Other housekeeping constants */
72 #define BZIP2_IOBUF_SIZE		4096
73 
74 /* This is what we know about each Huffman coding group */
75 struct group_data {
76 	/* We have an extra slot at the end of limit[] for a sentinel value. */
77 	int limit[MAX_HUFCODE_BITS+1];
78 	int base[MAX_HUFCODE_BITS];
79 	int permute[MAX_SYMBOLS];
80 	int minLen, maxLen;
81 };
82 
83 /* Structure holding all the housekeeping data, including IO buffers and
84    memory that persists between calls to bunzip */
85 struct bunzip_data {
86 	/* State for interrupting output loop */
87 	int writeCopies, writePos, writeRunCountdown, writeCount, writeCurrent;
88 	/* I/O tracking data (file handles, buffers, positions, etc.) */
89 	int (*fill)(void*, unsigned int);
90 	int inbufCount, inbufPos /*, outbufPos*/;
91 	unsigned char *inbuf /*,*outbuf*/;
92 	unsigned int inbufBitCount, inbufBits;
93 	/* The CRC values stored in the block header and calculated from the
94 	data */
95 	unsigned int crc32Table[256], headerCRC, totalCRC, writeCRC;
96 	/* Intermediate buffer and its size (in bytes) */
97 	unsigned int *dbuf, dbufSize;
98 	/* These things are a bit too big to go on the stack */
99 	unsigned char selectors[32768];		/* nSelectors = 15 bits */
100 	struct group_data groups[MAX_GROUPS];	/* Huffman coding tables */
101 	int io_error;			/* non-zero if we have IO error */
102 };
103 
104 
105 /* Return the next nnn bits of input.  All reads from the compressed input
106    are done through this function.  All reads are big endian */
get_bits(struct bunzip_data * bd,char bits_wanted)107 static unsigned int __init get_bits(struct bunzip_data *bd, char bits_wanted)
108 {
109 	unsigned int bits = 0;
110 
111 	/* If we need to get more data from the byte buffer, do so.
112 	   (Loop getting one byte at a time to enforce endianness and avoid
113 	   unaligned access.) */
114 	while (bd->inbufBitCount < bits_wanted) {
115 		/* If we need to read more data from file into byte buffer, do
116 		   so */
117 		if (bd->inbufPos == bd->inbufCount) {
118 			if (bd->io_error)
119 				return 0;
120 			bd->inbufCount = bd->fill(bd->inbuf, BZIP2_IOBUF_SIZE);
121 			if (bd->inbufCount <= 0) {
122 				bd->io_error = RETVAL_UNEXPECTED_INPUT_EOF;
123 				return 0;
124 			}
125 			bd->inbufPos = 0;
126 		}
127 		/* Avoid 32-bit overflow (dump bit buffer to top of output) */
128 		if (bd->inbufBitCount >= 24) {
129 			bits = bd->inbufBits&((1 << bd->inbufBitCount)-1);
130 			bits_wanted -= bd->inbufBitCount;
131 			bits <<= bits_wanted;
132 			bd->inbufBitCount = 0;
133 		}
134 		/* Grab next 8 bits of input from buffer. */
135 		bd->inbufBits = (bd->inbufBits << 8)|bd->inbuf[bd->inbufPos++];
136 		bd->inbufBitCount += 8;
137 	}
138 	/* Calculate result */
139 	bd->inbufBitCount -= bits_wanted;
140 	bits |= (bd->inbufBits >> bd->inbufBitCount)&((1 << bits_wanted)-1);
141 
142 	return bits;
143 }
144 
145 /* Unpacks the next block and sets up for the inverse burrows-wheeler step. */
146 
get_next_block(struct bunzip_data * bd)147 static int __init get_next_block(struct bunzip_data *bd)
148 {
149 	struct group_data *hufGroup = NULL;
150 	int *base = NULL;
151 	int *limit = NULL;
152 	int dbufCount, nextSym, dbufSize, groupCount, selector,
153 		i, j, k, t, runPos, symCount, symTotal, nSelectors,
154 		byteCount[256];
155 	unsigned char uc, symToByte[256], mtfSymbol[256], *selectors;
156 	unsigned int *dbuf, origPtr;
157 
158 	dbuf = bd->dbuf;
159 	dbufSize = bd->dbufSize;
160 	selectors = bd->selectors;
161 
162 	/* Read in header signature and CRC, then validate signature.
163 	   (last block signature means CRC is for whole file, return now) */
164 	i = get_bits(bd, 24);
165 	j = get_bits(bd, 24);
166 	bd->headerCRC = get_bits(bd, 32);
167 	if ((i == 0x177245) && (j == 0x385090))
168 		return RETVAL_LAST_BLOCK;
169 	if ((i != 0x314159) || (j != 0x265359))
170 		return RETVAL_NOT_BZIP_DATA;
171 	/* We can add support for blockRandomised if anybody complains.
172 	   There was some code for this in busybox 1.0.0-pre3, but nobody ever
173 	   noticed that it didn't actually work. */
174 	if (get_bits(bd, 1))
175 		return RETVAL_OBSOLETE_INPUT;
176 	origPtr = get_bits(bd, 24);
177 	if (origPtr >= dbufSize)
178 		return RETVAL_DATA_ERROR;
179 	/* mapping table: if some byte values are never used (encoding things
180 	   like ascii text), the compression code removes the gaps to have fewer
181 	   symbols to deal with, and writes a sparse bitfield indicating which
182 	   values were present.  We make a translation table to convert the
183 	   symbols back to the corresponding bytes. */
184 	t = get_bits(bd, 16);
185 	symTotal = 0;
186 	for (i = 0; i < 16; i++) {
187 		if (t&(1 << (15-i))) {
188 			k = get_bits(bd, 16);
189 			for (j = 0; j < 16; j++)
190 				if (k&(1 << (15-j)))
191 					symToByte[symTotal++] = (16*i)+j;
192 		}
193 	}
194 	/* How many different Huffman coding groups does this block use? */
195 	groupCount = get_bits(bd, 3);
196 	if (groupCount < 2 || groupCount > MAX_GROUPS)
197 		return RETVAL_DATA_ERROR;
198 	/* nSelectors: Every GROUP_SIZE many symbols we select a new
199 	   Huffman coding group.  Read in the group selector list,
200 	   which is stored as MTF encoded bit runs.  (MTF = Move To
201 	   Front, as each value is used it's moved to the start of the
202 	   list.) */
203 	nSelectors = get_bits(bd, 15);
204 	if (!nSelectors)
205 		return RETVAL_DATA_ERROR;
206 	for (i = 0; i < groupCount; i++)
207 		mtfSymbol[i] = i;
208 	for (i = 0; i < nSelectors; i++) {
209 		/* Get next value */
210 		for (j = 0; get_bits(bd, 1); j++)
211 			if (j >= groupCount)
212 				return RETVAL_DATA_ERROR;
213 		/* Decode MTF to get the next selector */
214 		uc = mtfSymbol[j];
215 		for (; j; j--)
216 			mtfSymbol[j] = mtfSymbol[j-1];
217 		mtfSymbol[0] = selectors[i] = uc;
218 	}
219 	/* Read the Huffman coding tables for each group, which code
220 	   for symTotal literal symbols, plus two run symbols (RUNA,
221 	   RUNB) */
222 	symCount = symTotal+2;
223 	for (j = 0; j < groupCount; j++) {
224 		unsigned char length[MAX_SYMBOLS];
225 		unsigned short temp[MAX_HUFCODE_BITS+1];
226 		int	minLen,	maxLen, pp;
227 		/* Read Huffman code lengths for each symbol.  They're
228 		   stored in a way similar to mtf; record a starting
229 		   value for the first symbol, and an offset from the
230 		   previous value for everys symbol after that.
231 		   (Subtracting 1 before the loop and then adding it
232 		   back at the end is an optimization that makes the
233 		   test inside the loop simpler: symbol length 0
234 		   becomes negative, so an unsigned inequality catches
235 		   it.) */
236 		t = get_bits(bd, 5)-1;
237 		/* GCC 13 has apparently improved use-before-set detection, but
238 		   it can't figure out that length[0] is always intialized by
239 		   virtue of symCount always being positive when making it here.
240 		   See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106511. */
241 		length[0] = 0;
242 		for (i = 0; i < symCount; i++) {
243 			for (;;) {
244 				if (((unsigned)t) > (MAX_HUFCODE_BITS-1))
245 					return RETVAL_DATA_ERROR;
246 
247 				/* If first bit is 0, stop.  Else
248 				   second bit indicates whether to
249 				   increment or decrement the value.
250 				   Optimization: grab 2 bits and unget
251 				   the second if the first was 0. */
252 
253 				k = get_bits(bd, 2);
254 				if (k < 2) {
255 					bd->inbufBitCount++;
256 					break;
257 				}
258 				/* Add one if second bit 1, else
259 				 * subtract 1.  Avoids if/else */
260 				t += (((k+1)&2)-1);
261 			}
262 			/* Correct for the initial -1, to get the
263 			 * final symbol length */
264 			length[i] = t+1;
265 		}
266 		/* Find largest and smallest lengths in this group */
267 		minLen = maxLen = length[0];
268 
269 		for (i = 1; i < symCount; i++) {
270 			if (length[i] > maxLen)
271 				maxLen = length[i];
272 			else if (length[i] < minLen)
273 				minLen = length[i];
274 		}
275 
276 		/* Calculate permute[], base[], and limit[] tables from
277 		 * length[].
278 		 *
279 		 * permute[] is the lookup table for converting
280 		 * Huffman coded symbols into decoded symbols.  base[]
281 		 * is the amount to subtract from the value of a
282 		 * Huffman symbol of a given length when using
283 		 * permute[].
284 		 *
285 		 * limit[] indicates the largest numerical value a
286 		 * symbol with a given number of bits can have.  This
287 		 * is how the Huffman codes can vary in length: each
288 		 * code with a value > limit[length] needs another
289 		 * bit.
290 		 */
291 		hufGroup = bd->groups+j;
292 		hufGroup->minLen = minLen;
293 		hufGroup->maxLen = maxLen;
294 		/* Note that minLen can't be smaller than 1, so we
295 		   adjust the base and limit array pointers so we're
296 		   not always wasting the first entry.  We do this
297 		   again when using them (during symbol decoding).*/
298 		base = hufGroup->base-1;
299 		limit = hufGroup->limit-1;
300 		/* Calculate permute[].  Concurently, initialize
301 		 * temp[] and limit[]. */
302 		pp = 0;
303 		for (i = minLen; i <= maxLen; i++) {
304 			temp[i] = limit[i] = 0;
305 			for (t = 0; t < symCount; t++)
306 				if (length[t] == i)
307 					hufGroup->permute[pp++] = t;
308 		}
309 		/* Count symbols coded for at each bit length */
310 		for (i = 0; i < symCount; i++)
311 			temp[length[i]]++;
312 		/* Calculate limit[] (the largest symbol-coding value
313 		 *at each bit length, which is (previous limit <<
314 		 *1)+symbols at this level), and base[] (number of
315 		 *symbols to ignore at each bit length, which is limit
316 		 *minus the cumulative count of symbols coded for
317 		 *already). */
318 		pp = t = 0;
319 		for (i = minLen; i < maxLen; i++) {
320 			pp += temp[i];
321 			/* We read the largest possible symbol size
322 			   and then unget bits after determining how
323 			   many we need, and those extra bits could be
324 			   set to anything.  (They're noise from
325 			   future symbols.)  At each level we're
326 			   really only interested in the first few
327 			   bits, so here we set all the trailing
328 			   to-be-ignored bits to 1 so they don't
329 			   affect the value > limit[length]
330 			   comparison. */
331 			limit[i] = (pp << (maxLen - i)) - 1;
332 			pp <<= 1;
333 			base[i+1] = pp-(t += temp[i]);
334 		}
335 		limit[maxLen+1] = INT_MAX; /* Sentinel value for
336 					    * reading next sym. */
337 		limit[maxLen] = pp+temp[maxLen]-1;
338 		base[minLen] = 0;
339 	}
340 	/* We've finished reading and digesting the block header.  Now
341 	   read this block's Huffman coded symbols from the file and
342 	   undo the Huffman coding and run length encoding, saving the
343 	   result into dbuf[dbufCount++] = uc */
344 
345 	/* Initialize symbol occurrence counters and symbol Move To
346 	 * Front table */
347 	for (i = 0; i < 256; i++) {
348 		byteCount[i] = 0;
349 		mtfSymbol[i] = (unsigned char)i;
350 	}
351 	/* Loop through compressed symbols. */
352 	runPos = dbufCount = symCount = selector = 0;
353 	for (;;) {
354 		/* Determine which Huffman coding group to use. */
355 		if (!(symCount--)) {
356 			symCount = GROUP_SIZE-1;
357 			if (selector >= nSelectors)
358 				return RETVAL_DATA_ERROR;
359 			hufGroup = bd->groups+selectors[selector++];
360 			base = hufGroup->base-1;
361 			limit = hufGroup->limit-1;
362 		}
363 		/* Read next Huffman-coded symbol. */
364 		/* Note: It is far cheaper to read maxLen bits and
365 		   back up than it is to read minLen bits and then an
366 		   additional bit at a time, testing as we go.
367 		   Because there is a trailing last block (with file
368 		   CRC), there is no danger of the overread causing an
369 		   unexpected EOF for a valid compressed file.  As a
370 		   further optimization, we do the read inline
371 		   (falling back to a call to get_bits if the buffer
372 		   runs dry).  The following (up to got_huff_bits:) is
373 		   equivalent to j = get_bits(bd, hufGroup->maxLen);
374 		 */
375 		while (bd->inbufBitCount < hufGroup->maxLen) {
376 			if (bd->inbufPos == bd->inbufCount) {
377 				j = get_bits(bd, hufGroup->maxLen);
378 				goto got_huff_bits;
379 			}
380 			bd->inbufBits =
381 				(bd->inbufBits << 8)|bd->inbuf[bd->inbufPos++];
382 			bd->inbufBitCount += 8;
383 		};
384 		bd->inbufBitCount -= hufGroup->maxLen;
385 		j = (bd->inbufBits >> bd->inbufBitCount)&
386 			((1 << hufGroup->maxLen)-1);
387 got_huff_bits:
388 		/* Figure how how many bits are in next symbol and
389 		 * unget extras */
390 		i = hufGroup->minLen;
391 		while (j > limit[i])
392 			++i;
393 		bd->inbufBitCount += (hufGroup->maxLen - i);
394 		/* Huffman decode value to get nextSym (with bounds checking) */
395 		if ((i > hufGroup->maxLen)
396 			|| (((unsigned)(j = (j>>(hufGroup->maxLen-i))-base[i]))
397 				>= MAX_SYMBOLS))
398 			return RETVAL_DATA_ERROR;
399 		nextSym = hufGroup->permute[j];
400 		/* We have now decoded the symbol, which indicates
401 		   either a new literal byte, or a repeated run of the
402 		   most recent literal byte.  First, check if nextSym
403 		   indicates a repeated run, and if so loop collecting
404 		   how many times to repeat the last literal. */
405 		if (((unsigned)nextSym) <= SYMBOL_RUNB) { /* RUNA or RUNB */
406 			/* If this is the start of a new run, zero out
407 			 * counter */
408 			if (!runPos) {
409 				runPos = 1;
410 				t = 0;
411 			}
412 			/* Neat trick that saves 1 symbol: instead of
413 			   or-ing 0 or 1 at each bit position, add 1
414 			   or 2 instead.  For example, 1011 is 1 << 0
415 			   + 1 << 1 + 2 << 2.  1010 is 2 << 0 + 2 << 1
416 			   + 1 << 2.  You can make any bit pattern
417 			   that way using 1 less symbol than the basic
418 			   or 0/1 method (except all bits 0, which
419 			   would use no symbols, but a run of length 0
420 			   doesn't mean anything in this context).
421 			   Thus space is saved. */
422 			t += (runPos << nextSym);
423 			/* +runPos if RUNA; +2*runPos if RUNB */
424 
425 			runPos <<= 1;
426 			continue;
427 		}
428 		/* When we hit the first non-run symbol after a run,
429 		   we now know how many times to repeat the last
430 		   literal, so append that many copies to our buffer
431 		   of decoded symbols (dbuf) now.  (The last literal
432 		   used is the one at the head of the mtfSymbol
433 		   array.) */
434 		if (runPos) {
435 			runPos = 0;
436 			if (dbufCount+t >= dbufSize)
437 				return RETVAL_DATA_ERROR;
438 
439 			uc = symToByte[mtfSymbol[0]];
440 			byteCount[uc] += t;
441 			while (t--)
442 				dbuf[dbufCount++] = uc;
443 		}
444 		/* Is this the terminating symbol? */
445 		if (nextSym > symTotal)
446 			break;
447 		/* At this point, nextSym indicates a new literal
448 		   character.  Subtract one to get the position in the
449 		   MTF array at which this literal is currently to be
450 		   found.  (Note that the result can't be -1 or 0,
451 		   because 0 and 1 are RUNA and RUNB.  But another
452 		   instance of the first symbol in the mtf array,
453 		   position 0, would have been handled as part of a
454 		   run above.  Therefore 1 unused mtf position minus 2
455 		   non-literal nextSym values equals -1.) */
456 		if (dbufCount >= dbufSize)
457 			return RETVAL_DATA_ERROR;
458 		i = nextSym - 1;
459 		uc = mtfSymbol[i];
460 		/* Adjust the MTF array.  Since we typically expect to
461 		 *move only a small number of symbols, and are bound
462 		 *by 256 in any case, using memmove here would
463 		 *typically be bigger and slower due to function call
464 		 *overhead and other assorted setup costs. */
465 		do {
466 			mtfSymbol[i] = mtfSymbol[i-1];
467 		} while (--i);
468 		mtfSymbol[0] = uc;
469 		uc = symToByte[uc];
470 		/* We have our literal byte.  Save it into dbuf. */
471 		byteCount[uc]++;
472 		dbuf[dbufCount++] = (unsigned int)uc;
473 	}
474 	/* At this point, we've read all the Huffman-coded symbols
475 	   (and repeated runs) for this block from the input stream,
476 	   and decoded them into the intermediate buffer.  There are
477 	   dbufCount many decoded bytes in dbuf[].  Now undo the
478 	   Burrows-Wheeler transform on dbuf.  See
479 	   http://dogma.net/markn/articles/bwt/bwt.htm
480 	 */
481 	/* Turn byteCount into cumulative occurrence counts of 0 to n-1. */
482 	j = 0;
483 	for (i = 0; i < 256; i++) {
484 		k = j+byteCount[i];
485 		byteCount[i] = j;
486 		j = k;
487 	}
488 	/* Figure out what order dbuf would be in if we sorted it. */
489 	for (i = 0; i < dbufCount; i++) {
490 		uc = (unsigned char)(dbuf[i] & 0xff);
491 		dbuf[byteCount[uc]] |= (i << 8);
492 		byteCount[uc]++;
493 	}
494 	/* Decode first byte by hand to initialize "previous" byte.
495 	   Note that it doesn't get output, and if the first three
496 	   characters are identical it doesn't qualify as a run (hence
497 	   writeRunCountdown = 5). */
498 	if (dbufCount) {
499 		if (origPtr >= dbufCount)
500 			return RETVAL_DATA_ERROR;
501 		bd->writePos = dbuf[origPtr];
502 		bd->writeCurrent = (unsigned char)(bd->writePos&0xff);
503 		bd->writePos >>= 8;
504 		bd->writeRunCountdown = 5;
505 	}
506 	bd->writeCount = dbufCount;
507 
508 	return RETVAL_OK;
509 }
510 
511 /* Undo burrows-wheeler transform on intermediate buffer to produce output.
512    If start_bunzip was initialized with out_fd =-1, then up to len bytes of
513    data are written to outbuf.  Return value is number of bytes written or
514    error (all errors are negative numbers).  If out_fd!=-1, outbuf and len
515    are ignored, data is written to out_fd and return is RETVAL_OK or error.
516 */
517 
read_bunzip(struct bunzip_data * bd,unsigned char * outbuf,int len)518 static int __init read_bunzip(struct bunzip_data *bd, unsigned char *outbuf, int len)
519 {
520 	const unsigned int *dbuf;
521 	int pos, xcurrent, previous, gotcount;
522 
523 	/* If last read was short due to end of file, return last block now */
524 	if (bd->writeCount < 0)
525 		return bd->writeCount;
526 
527 	gotcount = 0;
528 	dbuf = bd->dbuf;
529 	pos = bd->writePos;
530 	xcurrent = bd->writeCurrent;
531 
532 	/* We will always have pending decoded data to write into the output
533 	   buffer unless this is the very first call (in which case we haven't
534 	   Huffman-decoded a block into the intermediate buffer yet). */
535 
536 	if (bd->writeCopies) {
537 		/* Inside the loop, writeCopies means extra copies (beyond 1) */
538 		--bd->writeCopies;
539 		/* Loop outputting bytes */
540 		for (;;) {
541 			/* If the output buffer is full, snapshot
542 			 * state and return */
543 			if (gotcount >= len) {
544 				bd->writePos = pos;
545 				bd->writeCurrent = xcurrent;
546 				bd->writeCopies++;
547 				return len;
548 			}
549 			/* Write next byte into output buffer, updating CRC */
550 			outbuf[gotcount++] = xcurrent;
551 			bd->writeCRC = (((bd->writeCRC) << 8)
552 				^bd->crc32Table[((bd->writeCRC) >> 24)
553 				^xcurrent]);
554 			/* Loop now if we're outputting multiple
555 			 * copies of this byte */
556 			if (bd->writeCopies) {
557 				--bd->writeCopies;
558 				continue;
559 			}
560 decode_next_byte:
561 			if (!bd->writeCount--)
562 				break;
563 			/* Follow sequence vector to undo
564 			 * Burrows-Wheeler transform */
565 			previous = xcurrent;
566 			pos = dbuf[pos];
567 			xcurrent = pos&0xff;
568 			pos >>= 8;
569 			/* After 3 consecutive copies of the same
570 			   byte, the 4th is a repeat count.  We count
571 			   down from 4 instead *of counting up because
572 			   testing for non-zero is faster */
573 			if (--bd->writeRunCountdown) {
574 				if (xcurrent != previous)
575 					bd->writeRunCountdown = 4;
576 			} else {
577 				/* We have a repeated run, this byte
578 				 * indicates the count */
579 				bd->writeCopies = xcurrent;
580 				xcurrent = previous;
581 				bd->writeRunCountdown = 5;
582 				/* Sometimes there are just 3 bytes
583 				 * (run length 0) */
584 				if (!bd->writeCopies)
585 					goto decode_next_byte;
586 				/* Subtract the 1 copy we'd output
587 				 * anyway to get extras */
588 				--bd->writeCopies;
589 			}
590 		}
591 		/* Decompression of this block completed successfully */
592 		bd->writeCRC = ~bd->writeCRC;
593 		bd->totalCRC = ((bd->totalCRC << 1) |
594 				(bd->totalCRC >> 31)) ^ bd->writeCRC;
595 		/* If this block had a CRC error, force file level CRC error. */
596 		if (bd->writeCRC != bd->headerCRC) {
597 			bd->totalCRC = bd->headerCRC+1;
598 			return RETVAL_LAST_BLOCK;
599 		}
600 	}
601 
602 	/* Refill the intermediate buffer by Huffman-decoding next
603 	 * block of input */
604 	/* (previous is just a convenient unused temp variable here) */
605 	previous = get_next_block(bd);
606 	if (previous) {
607 		bd->writeCount = previous;
608 		return (previous != RETVAL_LAST_BLOCK) ? previous : gotcount;
609 	}
610 	bd->writeCRC = 0xffffffffUL;
611 	pos = bd->writePos;
612 	xcurrent = bd->writeCurrent;
613 	goto decode_next_byte;
614 }
615 
nofill(void * buf,unsigned int len)616 static int __init cf_check nofill(void *buf, unsigned int len)
617 {
618 	return -1;
619 }
620 
621 /* Allocate the structure, read file header.  If in_fd ==-1, inbuf must contain
622    a complete bunzip file (len bytes long).  If in_fd!=-1, inbuf and len are
623    ignored, and data is read from file handle into temporary buffer. */
start_bunzip(struct bunzip_data ** bdp,void * inbuf,int len,int (* fill)(void *,unsigned int))624 static int __init start_bunzip(struct bunzip_data **bdp, void *inbuf, int len,
625 			       int (*fill)(void*, unsigned int))
626 {
627 	struct bunzip_data *bd;
628 	unsigned int i, j, c;
629 	const unsigned int BZh0 =
630 		(((unsigned int)'B') << 24)+(((unsigned int)'Z') << 16)
631 		+(((unsigned int)'h') << 8)+(unsigned int)'0';
632 
633 	/* Figure out how much data to allocate */
634 	i = sizeof(struct bunzip_data);
635 
636 	/* Allocate bunzip_data.  Most fields initialize to zero. */
637 	bd = *bdp = malloc(i);
638 	if (!bd)
639 		return RETVAL_OUT_OF_MEMORY;
640 	memset(bd, 0, sizeof(struct bunzip_data));
641 	/* Setup input buffer */
642 	bd->inbuf = inbuf;
643 	bd->inbufCount = len;
644 	if (fill != NULL)
645 		bd->fill = fill;
646 	else
647 		bd->fill = nofill;
648 
649 	/* Init the CRC32 table (big endian) */
650 	for (i = 0; i < 256; i++) {
651 		c = i << 24;
652 		for (j = 8; j; j--)
653 			c = c&0x80000000 ? (c << 1)^0x04c11db7 : (c << 1);
654 		bd->crc32Table[i] = c;
655 	}
656 
657 	/* Ensure that file starts with "BZh['1'-'9']." */
658 	i = get_bits(bd, 32);
659 	if (((unsigned int)(i-BZh0-1)) >= 9)
660 		return RETVAL_NOT_BZIP_DATA;
661 
662 	/* Fourth byte (ascii '1'-'9'), indicates block size in units of 100k of
663 	   uncompressed data.  Allocate intermediate buffer for block. */
664 	bd->dbufSize = 100000*(i-BZh0);
665 
666 	bd->dbuf = large_malloc(bd->dbufSize * sizeof(int));
667 	if (!bd->dbuf)
668 		return RETVAL_OUT_OF_MEMORY;
669 	return RETVAL_OK;
670 }
671 
672 /* Example usage: decompress src_fd to dst_fd.  (Stops at end of bzip2 data,
673    not end of file.) */
bunzip2(unsigned char * buf,unsigned int len,int (* fill)(void *,unsigned int),int (* flush)(void *,unsigned int),unsigned char * outbuf,unsigned int * pos,void (* error)(const char * x))674 int __init bunzip2(unsigned char *buf, unsigned int len,
675 		   int(*fill)(void*, unsigned int),
676 		   int(*flush)(void*, unsigned int),
677 		   unsigned char *outbuf, unsigned int *pos,
678 		   void(*error)(const char *x))
679 {
680 	struct bunzip_data *bd;
681 	int i = -1;
682 	unsigned char *inbuf;
683 
684 	if (flush)
685 		outbuf = malloc(BZIP2_IOBUF_SIZE);
686 
687 	if (!outbuf) {
688 		error("Could not allocate output buffer");
689 		return RETVAL_OUT_OF_MEMORY;
690 	}
691 	if (buf)
692 		inbuf = buf;
693 	else
694 		inbuf = malloc(BZIP2_IOBUF_SIZE);
695 	if (!inbuf) {
696 		error("Could not allocate input buffer");
697 		i = RETVAL_OUT_OF_MEMORY;
698 		goto exit_0;
699 	}
700 	i = start_bunzip(&bd, inbuf, len, fill);
701 	if (!i) {
702 		for (;;) {
703 			i = read_bunzip(bd, outbuf, BZIP2_IOBUF_SIZE);
704 			if (i <= 0)
705 				break;
706 			if (!flush)
707 				outbuf += i;
708 			else
709 				if (i != flush(outbuf, i)) {
710 					i = RETVAL_UNEXPECTED_OUTPUT_EOF;
711 					break;
712 				}
713 		}
714 	}
715 	/* Check CRC and release memory */
716 	if (i == RETVAL_LAST_BLOCK) {
717 		if (bd->headerCRC != bd->totalCRC)
718 			error("Data integrity error when decompressing.");
719 		else
720 			i = RETVAL_OK;
721 	} else if (i == RETVAL_UNEXPECTED_OUTPUT_EOF) {
722 		error("Compressed file ends unexpectedly");
723 	}
724 	if (!bd)
725 		goto exit_1;
726 	if (bd->dbuf)
727 		large_free(bd->dbuf);
728 	if (pos)
729 		*pos = bd->inbufPos;
730 	free(bd);
731 exit_1:
732 	if (!buf)
733 		free(inbuf);
734 exit_0:
735 	if (flush)
736 		free(outbuf);
737 	return i;
738 }
739