1 /* vi: set sw = 4 ts = 4: */
2 /* Small bzip2 deflate implementation, by Rob Landley (rob@landley.net).
3
4 Based on bzip2 decompression code by Julian R Seward (jseward@acm.org),
5 which also acknowledges contributions by Mike Burrows, David Wheeler,
6 Peter Fenwick, Alistair Moffat, Radford Neal, Ian H. Witten,
7 Robert Sedgewick, and Jon L. Bentley.
8
9 This code is licensed under the LGPLv2:
10 LGPL (http://www.gnu.org/copyleft/lgpl.html
11 */
12
13 /*
14 Size and speed optimizations by Manuel Novoa III (mjn3@codepoet.org).
15
16 More efficient reading of Huffman codes, a streamlined read_bunzip()
17 function, and various other tweaks. In (limited) tests, approximately
18 20% faster than bzcat on x86 and about 10% faster on arm.
19
20 Note that about 2/3 of the time is spent in read_unzip() reversing
21 the Burrows-Wheeler transformation. Much of that time is delay
22 resulting from cache misses.
23
24 I would ask that anyone benefiting from this work, especially those
25 using it in commercial products, consider making a donation to my local
26 non-profit hospice organization in the name of the woman I loved, who
27 passed away Feb. 12, 2003.
28
29 In memory of Toni W. Hagan
30
31 Hospice of Acadiana, Inc.
32 2600 Johnston St., Suite 200
33 Lafayette, LA 70503-3240
34
35 Phone (337) 232-1234 or 1-800-738-2226
36 Fax (337) 232-1297
37
38 http://www.hospiceacadiana.com/
39
40 Manuel
41 */
42
43 /*
44 Made it fit for running in Linux Kernel by Alain Knaff (alain@knaff.lu)
45 */
46
47 #include "decompress.h"
48
49 #ifndef INT_MAX
50 #define INT_MAX 0x7fffffff
51 #endif
52
53 /* Constants for Huffman coding */
54 #define MAX_GROUPS 6
55 #define GROUP_SIZE 50 /* 64 would have been more efficient */
56 #define MAX_HUFCODE_BITS 20 /* Longest Huffman code allowed */
57 #define MAX_SYMBOLS 258 /* 256 literals + RUNA + RUNB */
58 #define SYMBOL_RUNA 0
59 #define SYMBOL_RUNB 1
60
61 /* Status return values */
62 #define RETVAL_OK 0
63 #define RETVAL_LAST_BLOCK (-1)
64 #define RETVAL_NOT_BZIP_DATA (-2)
65 #define RETVAL_UNEXPECTED_INPUT_EOF (-3)
66 #define RETVAL_UNEXPECTED_OUTPUT_EOF (-4)
67 #define RETVAL_DATA_ERROR (-5)
68 #define RETVAL_OUT_OF_MEMORY (-6)
69 #define RETVAL_OBSOLETE_INPUT (-7)
70
71 /* Other housekeeping constants */
72 #define BZIP2_IOBUF_SIZE 4096
73
74 /* This is what we know about each Huffman coding group */
75 struct group_data {
76 /* We have an extra slot at the end of limit[] for a sentinal value. */
77 int limit[MAX_HUFCODE_BITS+1];
78 int base[MAX_HUFCODE_BITS];
79 int permute[MAX_SYMBOLS];
80 int minLen, maxLen;
81 };
82
83 /* Structure holding all the housekeeping data, including IO buffers and
84 memory that persists between calls to bunzip */
85 struct bunzip_data {
86 /* State for interrupting output loop */
87 int writeCopies, writePos, writeRunCountdown, writeCount, writeCurrent;
88 /* I/O tracking data (file handles, buffers, positions, etc.) */
89 int (*fill)(void*, unsigned int);
90 int inbufCount, inbufPos /*, outbufPos*/;
91 unsigned char *inbuf /*,*outbuf*/;
92 unsigned int inbufBitCount, inbufBits;
93 /* The CRC values stored in the block header and calculated from the
94 data */
95 unsigned int crc32Table[256], headerCRC, totalCRC, writeCRC;
96 /* Intermediate buffer and its size (in bytes) */
97 unsigned int *dbuf, dbufSize;
98 /* These things are a bit too big to go on the stack */
99 unsigned char selectors[32768]; /* nSelectors = 15 bits */
100 struct group_data groups[MAX_GROUPS]; /* Huffman coding tables */
101 int io_error; /* non-zero if we have IO error */
102 };
103
104
105 /* Return the next nnn bits of input. All reads from the compressed input
106 are done through this function. All reads are big endian */
get_bits(struct bunzip_data * bd,char bits_wanted)107 static unsigned int INIT get_bits(struct bunzip_data *bd, char bits_wanted)
108 {
109 unsigned int bits = 0;
110
111 /* If we need to get more data from the byte buffer, do so.
112 (Loop getting one byte at a time to enforce endianness and avoid
113 unaligned access.) */
114 while (bd->inbufBitCount < bits_wanted) {
115 /* If we need to read more data from file into byte buffer, do
116 so */
117 if (bd->inbufPos == bd->inbufCount) {
118 if (bd->io_error)
119 return 0;
120 bd->inbufCount = bd->fill(bd->inbuf, BZIP2_IOBUF_SIZE);
121 if (bd->inbufCount <= 0) {
122 bd->io_error = RETVAL_UNEXPECTED_INPUT_EOF;
123 return 0;
124 }
125 bd->inbufPos = 0;
126 }
127 /* Avoid 32-bit overflow (dump bit buffer to top of output) */
128 if (bd->inbufBitCount >= 24) {
129 bits = bd->inbufBits&((1 << bd->inbufBitCount)-1);
130 bits_wanted -= bd->inbufBitCount;
131 bits <<= bits_wanted;
132 bd->inbufBitCount = 0;
133 }
134 /* Grab next 8 bits of input from buffer. */
135 bd->inbufBits = (bd->inbufBits << 8)|bd->inbuf[bd->inbufPos++];
136 bd->inbufBitCount += 8;
137 }
138 /* Calculate result */
139 bd->inbufBitCount -= bits_wanted;
140 bits |= (bd->inbufBits >> bd->inbufBitCount)&((1 << bits_wanted)-1);
141
142 return bits;
143 }
144
145 /* Unpacks the next block and sets up for the inverse burrows-wheeler step. */
146
get_next_block(struct bunzip_data * bd)147 static int INIT get_next_block(struct bunzip_data *bd)
148 {
149 struct group_data *hufGroup = NULL;
150 int *base = NULL;
151 int *limit = NULL;
152 int dbufCount, nextSym, dbufSize, groupCount, selector,
153 i, j, k, t, runPos, symCount, symTotal, nSelectors,
154 byteCount[256];
155 unsigned char uc, symToByte[256], mtfSymbol[256], *selectors;
156 unsigned int *dbuf, origPtr;
157
158 dbuf = bd->dbuf;
159 dbufSize = bd->dbufSize;
160 selectors = bd->selectors;
161
162 /* Read in header signature and CRC, then validate signature.
163 (last block signature means CRC is for whole file, return now) */
164 i = get_bits(bd, 24);
165 j = get_bits(bd, 24);
166 bd->headerCRC = get_bits(bd, 32);
167 if ((i == 0x177245) && (j == 0x385090))
168 return RETVAL_LAST_BLOCK;
169 if ((i != 0x314159) || (j != 0x265359))
170 return RETVAL_NOT_BZIP_DATA;
171 /* We can add support for blockRandomised if anybody complains.
172 There was some code for this in busybox 1.0.0-pre3, but nobody ever
173 noticed that it didn't actually work. */
174 if (get_bits(bd, 1))
175 return RETVAL_OBSOLETE_INPUT;
176 origPtr = get_bits(bd, 24);
177 if (origPtr >= dbufSize)
178 return RETVAL_DATA_ERROR;
179 /* mapping table: if some byte values are never used (encoding things
180 like ascii text), the compression code removes the gaps to have fewer
181 symbols to deal with, and writes a sparse bitfield indicating which
182 values were present. We make a translation table to convert the
183 symbols back to the corresponding bytes. */
184 t = get_bits(bd, 16);
185 symTotal = 0;
186 for (i = 0; i < 16; i++) {
187 if (t&(1 << (15-i))) {
188 k = get_bits(bd, 16);
189 for (j = 0; j < 16; j++)
190 if (k&(1 << (15-j)))
191 symToByte[symTotal++] = (16*i)+j;
192 }
193 }
194 /* How many different Huffman coding groups does this block use? */
195 groupCount = get_bits(bd, 3);
196 if (groupCount < 2 || groupCount > MAX_GROUPS)
197 return RETVAL_DATA_ERROR;
198 /* nSelectors: Every GROUP_SIZE many symbols we select a new
199 Huffman coding group. Read in the group selector list,
200 which is stored as MTF encoded bit runs. (MTF = Move To
201 Front, as each value is used it's moved to the start of the
202 list.) */
203 nSelectors = get_bits(bd, 15);
204 if (!nSelectors)
205 return RETVAL_DATA_ERROR;
206 for (i = 0; i < groupCount; i++)
207 mtfSymbol[i] = i;
208 for (i = 0; i < nSelectors; i++) {
209 /* Get next value */
210 for (j = 0; get_bits(bd, 1); j++)
211 if (j >= groupCount)
212 return RETVAL_DATA_ERROR;
213 /* Decode MTF to get the next selector */
214 uc = mtfSymbol[j];
215 for (; j; j--)
216 mtfSymbol[j] = mtfSymbol[j-1];
217 mtfSymbol[0] = selectors[i] = uc;
218 }
219 /* Read the Huffman coding tables for each group, which code
220 for symTotal literal symbols, plus two run symbols (RUNA,
221 RUNB) */
222 symCount = symTotal+2;
223 for (j = 0; j < groupCount; j++) {
224 unsigned char length[MAX_SYMBOLS], temp[MAX_HUFCODE_BITS+1];
225 int minLen, maxLen, pp;
226 /* Read Huffman code lengths for each symbol. They're
227 stored in a way similar to mtf; record a starting
228 value for the first symbol, and an offset from the
229 previous value for everys symbol after that.
230 (Subtracting 1 before the loop and then adding it
231 back at the end is an optimization that makes the
232 test inside the loop simpler: symbol length 0
233 becomes negative, so an unsigned inequality catches
234 it.) */
235 t = get_bits(bd, 5)-1;
236 for (i = 0; i < symCount; i++) {
237 for (;;) {
238 if (((unsigned)t) > (MAX_HUFCODE_BITS-1))
239 return RETVAL_DATA_ERROR;
240
241 /* If first bit is 0, stop. Else
242 second bit indicates whether to
243 increment or decrement the value.
244 Optimization: grab 2 bits and unget
245 the second if the first was 0. */
246
247 k = get_bits(bd, 2);
248 if (k < 2) {
249 bd->inbufBitCount++;
250 break;
251 }
252 /* Add one if second bit 1, else
253 * subtract 1. Avoids if/else */
254 t += (((k+1)&2)-1);
255 }
256 /* Correct for the initial -1, to get the
257 * final symbol length */
258 length[i] = t+1;
259 }
260 /* Find largest and smallest lengths in this group */
261 minLen = maxLen = length[0];
262
263 for (i = 1; i < symCount; i++) {
264 if (length[i] > maxLen)
265 maxLen = length[i];
266 else if (length[i] < minLen)
267 minLen = length[i];
268 }
269
270 /* Calculate permute[], base[], and limit[] tables from
271 * length[].
272 *
273 * permute[] is the lookup table for converting
274 * Huffman coded symbols into decoded symbols. base[]
275 * is the amount to subtract from the value of a
276 * Huffman symbol of a given length when using
277 * permute[].
278 *
279 * limit[] indicates the largest numerical value a
280 * symbol with a given number of bits can have. This
281 * is how the Huffman codes can vary in length: each
282 * code with a value > limit[length] needs another
283 * bit.
284 */
285 hufGroup = bd->groups+j;
286 hufGroup->minLen = minLen;
287 hufGroup->maxLen = maxLen;
288 /* Note that minLen can't be smaller than 1, so we
289 adjust the base and limit array pointers so we're
290 not always wasting the first entry. We do this
291 again when using them (during symbol decoding).*/
292 base = hufGroup->base-1;
293 limit = hufGroup->limit-1;
294 /* Calculate permute[]. Concurently, initialize
295 * temp[] and limit[]. */
296 pp = 0;
297 for (i = minLen; i <= maxLen; i++) {
298 temp[i] = limit[i] = 0;
299 for (t = 0; t < symCount; t++)
300 if (length[t] == i)
301 hufGroup->permute[pp++] = t;
302 }
303 /* Count symbols coded for at each bit length */
304 for (i = 0; i < symCount; i++)
305 temp[length[i]]++;
306 /* Calculate limit[] (the largest symbol-coding value
307 *at each bit length, which is (previous limit <<
308 *1)+symbols at this level), and base[] (number of
309 *symbols to ignore at each bit length, which is limit
310 *minus the cumulative count of symbols coded for
311 *already). */
312 pp = t = 0;
313 for (i = minLen; i < maxLen; i++) {
314 pp += temp[i];
315 /* We read the largest possible symbol size
316 and then unget bits after determining how
317 many we need, and those extra bits could be
318 set to anything. (They're noise from
319 future symbols.) At each level we're
320 really only interested in the first few
321 bits, so here we set all the trailing
322 to-be-ignored bits to 1 so they don't
323 affect the value > limit[length]
324 comparison. */
325 limit[i] = (pp << (maxLen - i)) - 1;
326 pp <<= 1;
327 base[i+1] = pp-(t += temp[i]);
328 }
329 limit[maxLen+1] = INT_MAX; /* Sentinal value for
330 * reading next sym. */
331 limit[maxLen] = pp+temp[maxLen]-1;
332 base[minLen] = 0;
333 }
334 /* We've finished reading and digesting the block header. Now
335 read this block's Huffman coded symbols from the file and
336 undo the Huffman coding and run length encoding, saving the
337 result into dbuf[dbufCount++] = uc */
338
339 /* Initialize symbol occurrence counters and symbol Move To
340 * Front table */
341 for (i = 0; i < 256; i++) {
342 byteCount[i] = 0;
343 mtfSymbol[i] = (unsigned char)i;
344 }
345 /* Loop through compressed symbols. */
346 runPos = dbufCount = symCount = selector = 0;
347 for (;;) {
348 /* Determine which Huffman coding group to use. */
349 if (!(symCount--)) {
350 symCount = GROUP_SIZE-1;
351 if (selector >= nSelectors)
352 return RETVAL_DATA_ERROR;
353 hufGroup = bd->groups+selectors[selector++];
354 base = hufGroup->base-1;
355 limit = hufGroup->limit-1;
356 }
357 /* Read next Huffman-coded symbol. */
358 /* Note: It is far cheaper to read maxLen bits and
359 back up than it is to read minLen bits and then an
360 additional bit at a time, testing as we go.
361 Because there is a trailing last block (with file
362 CRC), there is no danger of the overread causing an
363 unexpected EOF for a valid compressed file. As a
364 further optimization, we do the read inline
365 (falling back to a call to get_bits if the buffer
366 runs dry). The following (up to got_huff_bits:) is
367 equivalent to j = get_bits(bd, hufGroup->maxLen);
368 */
369 while (bd->inbufBitCount < hufGroup->maxLen) {
370 if (bd->inbufPos == bd->inbufCount) {
371 j = get_bits(bd, hufGroup->maxLen);
372 goto got_huff_bits;
373 }
374 bd->inbufBits =
375 (bd->inbufBits << 8)|bd->inbuf[bd->inbufPos++];
376 bd->inbufBitCount += 8;
377 };
378 bd->inbufBitCount -= hufGroup->maxLen;
379 j = (bd->inbufBits >> bd->inbufBitCount)&
380 ((1 << hufGroup->maxLen)-1);
381 got_huff_bits:
382 /* Figure how how many bits are in next symbol and
383 * unget extras */
384 i = hufGroup->minLen;
385 while (j > limit[i])
386 ++i;
387 bd->inbufBitCount += (hufGroup->maxLen - i);
388 /* Huffman decode value to get nextSym (with bounds checking) */
389 if ((i > hufGroup->maxLen)
390 || (((unsigned)(j = (j>>(hufGroup->maxLen-i))-base[i]))
391 >= MAX_SYMBOLS))
392 return RETVAL_DATA_ERROR;
393 nextSym = hufGroup->permute[j];
394 /* We have now decoded the symbol, which indicates
395 either a new literal byte, or a repeated run of the
396 most recent literal byte. First, check if nextSym
397 indicates a repeated run, and if so loop collecting
398 how many times to repeat the last literal. */
399 if (((unsigned)nextSym) <= SYMBOL_RUNB) { /* RUNA or RUNB */
400 /* If this is the start of a new run, zero out
401 * counter */
402 if (!runPos) {
403 runPos = 1;
404 t = 0;
405 }
406 /* Neat trick that saves 1 symbol: instead of
407 or-ing 0 or 1 at each bit position, add 1
408 or 2 instead. For example, 1011 is 1 << 0
409 + 1 << 1 + 2 << 2. 1010 is 2 << 0 + 2 << 1
410 + 1 << 2. You can make any bit pattern
411 that way using 1 less symbol than the basic
412 or 0/1 method (except all bits 0, which
413 would use no symbols, but a run of length 0
414 doesn't mean anything in this context).
415 Thus space is saved. */
416 t += (runPos << nextSym);
417 /* +runPos if RUNA; +2*runPos if RUNB */
418
419 runPos <<= 1;
420 continue;
421 }
422 /* When we hit the first non-run symbol after a run,
423 we now know how many times to repeat the last
424 literal, so append that many copies to our buffer
425 of decoded symbols (dbuf) now. (The last literal
426 used is the one at the head of the mtfSymbol
427 array.) */
428 if (runPos) {
429 runPos = 0;
430 if (dbufCount+t >= dbufSize)
431 return RETVAL_DATA_ERROR;
432
433 uc = symToByte[mtfSymbol[0]];
434 byteCount[uc] += t;
435 while (t--)
436 dbuf[dbufCount++] = uc;
437 }
438 /* Is this the terminating symbol? */
439 if (nextSym > symTotal)
440 break;
441 /* At this point, nextSym indicates a new literal
442 character. Subtract one to get the position in the
443 MTF array at which this literal is currently to be
444 found. (Note that the result can't be -1 or 0,
445 because 0 and 1 are RUNA and RUNB. But another
446 instance of the first symbol in the mtf array,
447 position 0, would have been handled as part of a
448 run above. Therefore 1 unused mtf position minus 2
449 non-literal nextSym values equals -1.) */
450 if (dbufCount >= dbufSize)
451 return RETVAL_DATA_ERROR;
452 i = nextSym - 1;
453 uc = mtfSymbol[i];
454 /* Adjust the MTF array. Since we typically expect to
455 *move only a small number of symbols, and are bound
456 *by 256 in any case, using memmove here would
457 *typically be bigger and slower due to function call
458 *overhead and other assorted setup costs. */
459 do {
460 mtfSymbol[i] = mtfSymbol[i-1];
461 } while (--i);
462 mtfSymbol[0] = uc;
463 uc = symToByte[uc];
464 /* We have our literal byte. Save it into dbuf. */
465 byteCount[uc]++;
466 dbuf[dbufCount++] = (unsigned int)uc;
467 }
468 /* At this point, we've read all the Huffman-coded symbols
469 (and repeated runs) for this block from the input stream,
470 and decoded them into the intermediate buffer. There are
471 dbufCount many decoded bytes in dbuf[]. Now undo the
472 Burrows-Wheeler transform on dbuf. See
473 http://dogma.net/markn/articles/bwt/bwt.htm
474 */
475 /* Turn byteCount into cumulative occurrence counts of 0 to n-1. */
476 j = 0;
477 for (i = 0; i < 256; i++) {
478 k = j+byteCount[i];
479 byteCount[i] = j;
480 j = k;
481 }
482 /* Figure out what order dbuf would be in if we sorted it. */
483 for (i = 0; i < dbufCount; i++) {
484 uc = (unsigned char)(dbuf[i] & 0xff);
485 dbuf[byteCount[uc]] |= (i << 8);
486 byteCount[uc]++;
487 }
488 /* Decode first byte by hand to initialize "previous" byte.
489 Note that it doesn't get output, and if the first three
490 characters are identical it doesn't qualify as a run (hence
491 writeRunCountdown = 5). */
492 if (dbufCount) {
493 if (origPtr >= dbufCount)
494 return RETVAL_DATA_ERROR;
495 bd->writePos = dbuf[origPtr];
496 bd->writeCurrent = (unsigned char)(bd->writePos&0xff);
497 bd->writePos >>= 8;
498 bd->writeRunCountdown = 5;
499 }
500 bd->writeCount = dbufCount;
501
502 return RETVAL_OK;
503 }
504
505 /* Undo burrows-wheeler transform on intermediate buffer to produce output.
506 If start_bunzip was initialized with out_fd =-1, then up to len bytes of
507 data are written to outbuf. Return value is number of bytes written or
508 error (all errors are negative numbers). If out_fd!=-1, outbuf and len
509 are ignored, data is written to out_fd and return is RETVAL_OK or error.
510 */
511
read_bunzip(struct bunzip_data * bd,unsigned char * outbuf,int len)512 static int INIT read_bunzip(struct bunzip_data *bd, unsigned char *outbuf, int len)
513 {
514 const unsigned int *dbuf;
515 int pos, xcurrent, previous, gotcount;
516
517 /* If last read was short due to end of file, return last block now */
518 if (bd->writeCount < 0)
519 return bd->writeCount;
520
521 gotcount = 0;
522 dbuf = bd->dbuf;
523 pos = bd->writePos;
524 xcurrent = bd->writeCurrent;
525
526 /* We will always have pending decoded data to write into the output
527 buffer unless this is the very first call (in which case we haven't
528 Huffman-decoded a block into the intermediate buffer yet). */
529
530 if (bd->writeCopies) {
531 /* Inside the loop, writeCopies means extra copies (beyond 1) */
532 --bd->writeCopies;
533 /* Loop outputting bytes */
534 for (;;) {
535 /* If the output buffer is full, snapshot
536 * state and return */
537 if (gotcount >= len) {
538 bd->writePos = pos;
539 bd->writeCurrent = xcurrent;
540 bd->writeCopies++;
541 return len;
542 }
543 /* Write next byte into output buffer, updating CRC */
544 outbuf[gotcount++] = xcurrent;
545 bd->writeCRC = (((bd->writeCRC) << 8)
546 ^bd->crc32Table[((bd->writeCRC) >> 24)
547 ^xcurrent]);
548 /* Loop now if we're outputting multiple
549 * copies of this byte */
550 if (bd->writeCopies) {
551 --bd->writeCopies;
552 continue;
553 }
554 decode_next_byte:
555 if (!bd->writeCount--)
556 break;
557 /* Follow sequence vector to undo
558 * Burrows-Wheeler transform */
559 previous = xcurrent;
560 pos = dbuf[pos];
561 xcurrent = pos&0xff;
562 pos >>= 8;
563 /* After 3 consecutive copies of the same
564 byte, the 4th is a repeat count. We count
565 down from 4 instead *of counting up because
566 testing for non-zero is faster */
567 if (--bd->writeRunCountdown) {
568 if (xcurrent != previous)
569 bd->writeRunCountdown = 4;
570 } else {
571 /* We have a repeated run, this byte
572 * indicates the count */
573 bd->writeCopies = xcurrent;
574 xcurrent = previous;
575 bd->writeRunCountdown = 5;
576 /* Sometimes there are just 3 bytes
577 * (run length 0) */
578 if (!bd->writeCopies)
579 goto decode_next_byte;
580 /* Subtract the 1 copy we'd output
581 * anyway to get extras */
582 --bd->writeCopies;
583 }
584 }
585 /* Decompression of this block completed successfully */
586 bd->writeCRC = ~bd->writeCRC;
587 bd->totalCRC = ((bd->totalCRC << 1) |
588 (bd->totalCRC >> 31)) ^ bd->writeCRC;
589 /* If this block had a CRC error, force file level CRC error. */
590 if (bd->writeCRC != bd->headerCRC) {
591 bd->totalCRC = bd->headerCRC+1;
592 return RETVAL_LAST_BLOCK;
593 }
594 }
595
596 /* Refill the intermediate buffer by Huffman-decoding next
597 * block of input */
598 /* (previous is just a convenient unused temp variable here) */
599 previous = get_next_block(bd);
600 if (previous) {
601 bd->writeCount = previous;
602 return (previous != RETVAL_LAST_BLOCK) ? previous : gotcount;
603 }
604 bd->writeCRC = 0xffffffffUL;
605 pos = bd->writePos;
606 xcurrent = bd->writeCurrent;
607 goto decode_next_byte;
608 }
609
nofill(void * buf,unsigned int len)610 static int INIT nofill(void *buf, unsigned int len)
611 {
612 return -1;
613 }
614
615 /* Allocate the structure, read file header. If in_fd ==-1, inbuf must contain
616 a complete bunzip file (len bytes long). If in_fd!=-1, inbuf and len are
617 ignored, and data is read from file handle into temporary buffer. */
start_bunzip(struct bunzip_data ** bdp,void * inbuf,int len,int (* fill)(void *,unsigned int))618 static int INIT start_bunzip(struct bunzip_data **bdp, void *inbuf, int len,
619 int (*fill)(void*, unsigned int))
620 {
621 struct bunzip_data *bd;
622 unsigned int i, j, c;
623 const unsigned int BZh0 =
624 (((unsigned int)'B') << 24)+(((unsigned int)'Z') << 16)
625 +(((unsigned int)'h') << 8)+(unsigned int)'0';
626
627 /* Figure out how much data to allocate */
628 i = sizeof(struct bunzip_data);
629
630 /* Allocate bunzip_data. Most fields initialize to zero. */
631 bd = *bdp = malloc(i);
632 if (!bd)
633 return RETVAL_OUT_OF_MEMORY;
634 memset(bd, 0, sizeof(struct bunzip_data));
635 /* Setup input buffer */
636 bd->inbuf = inbuf;
637 bd->inbufCount = len;
638 if (fill != NULL)
639 bd->fill = fill;
640 else
641 bd->fill = nofill;
642
643 /* Init the CRC32 table (big endian) */
644 for (i = 0; i < 256; i++) {
645 c = i << 24;
646 for (j = 8; j; j--)
647 c = c&0x80000000 ? (c << 1)^0x04c11db7 : (c << 1);
648 bd->crc32Table[i] = c;
649 }
650
651 /* Ensure that file starts with "BZh['1'-'9']." */
652 i = get_bits(bd, 32);
653 if (((unsigned int)(i-BZh0-1)) >= 9)
654 return RETVAL_NOT_BZIP_DATA;
655
656 /* Fourth byte (ascii '1'-'9'), indicates block size in units of 100k of
657 uncompressed data. Allocate intermediate buffer for block. */
658 bd->dbufSize = 100000*(i-BZh0);
659
660 bd->dbuf = large_malloc(bd->dbufSize * sizeof(int));
661 if (!bd->dbuf)
662 return RETVAL_OUT_OF_MEMORY;
663 return RETVAL_OK;
664 }
665
666 /* Example usage: decompress src_fd to dst_fd. (Stops at end of bzip2 data,
667 not end of file.) */
bunzip2(unsigned char * buf,unsigned int len,int (* fill)(void *,unsigned int),int (* flush)(void *,unsigned int),unsigned char * outbuf,unsigned int * pos,void (* error)(const char * x))668 STATIC int INIT bunzip2(unsigned char *buf, unsigned int len,
669 int(*fill)(void*, unsigned int),
670 int(*flush)(void*, unsigned int),
671 unsigned char *outbuf,
672 unsigned int *pos,
673 void(*error)(const char *x))
674 {
675 struct bunzip_data *bd;
676 int i = -1;
677 unsigned char *inbuf;
678
679 if (flush)
680 outbuf = malloc(BZIP2_IOBUF_SIZE);
681
682 if (!outbuf) {
683 error("Could not allocate output buffer");
684 return RETVAL_OUT_OF_MEMORY;
685 }
686 if (buf)
687 inbuf = buf;
688 else
689 inbuf = malloc(BZIP2_IOBUF_SIZE);
690 if (!inbuf) {
691 error("Could not allocate input buffer");
692 i = RETVAL_OUT_OF_MEMORY;
693 goto exit_0;
694 }
695 i = start_bunzip(&bd, inbuf, len, fill);
696 if (!i) {
697 for (;;) {
698 i = read_bunzip(bd, outbuf, BZIP2_IOBUF_SIZE);
699 if (i <= 0)
700 break;
701 if (!flush)
702 outbuf += i;
703 else
704 if (i != flush(outbuf, i)) {
705 i = RETVAL_UNEXPECTED_OUTPUT_EOF;
706 break;
707 }
708 }
709 }
710 /* Check CRC and release memory */
711 if (i == RETVAL_LAST_BLOCK) {
712 if (bd->headerCRC != bd->totalCRC)
713 error("Data integrity error when decompressing.");
714 else
715 i = RETVAL_OK;
716 } else if (i == RETVAL_UNEXPECTED_OUTPUT_EOF) {
717 error("Compressed file ends unexpectedly");
718 }
719 if (!bd)
720 goto exit_1;
721 if (bd->dbuf)
722 large_free(bd->dbuf);
723 if (pos)
724 *pos = bd->inbufPos;
725 free(bd);
726 exit_1:
727 if (!buf)
728 free(inbuf);
729 exit_0:
730 if (flush)
731 free(outbuf);
732 return i;
733 }
734