1 
2 /*-------------------------------------------------------------*/
3 /*--- Block sorting machinery                               ---*/
4 /*---                                           blocksort.c ---*/
5 /*-------------------------------------------------------------*/
6 
7 /*--
8   This file is a part of bzip2 and/or libbzip2, a program and
9   library for lossless, block-sorting data compression.
10 
11   Copyright (C) 1996-2002 Julian R Seward.  All rights reserved.
12 
13   Redistribution and use in source and binary forms, with or without
14   modification, are permitted provided that the following conditions
15   are met:
16 
17   1. Redistributions of source code must retain the above copyright
18      notice, this list of conditions and the following disclaimer.
19 
20   2. The origin of this software must not be misrepresented; you must
21      not claim that you wrote the original software.  If you use this
22      software in a product, an acknowledgment in the product
23      documentation would be appreciated but is not required.
24 
25   3. Altered source versions must be plainly marked as such, and must
26      not be misrepresented as being the original software.
27 
28   4. The name of the author may not be used to endorse or promote
29      products derived from this software without specific prior written
30      permission.
31 
32   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
33   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
34   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35   ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
36   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
38   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
39   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
40   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
42   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 
44   Julian Seward, Cambridge, UK.
45   jseward@acm.org
46   bzip2/libbzip2 version 1.0.6 of 6 September 2010
47   Copyright (C) 1996-2010 Julian Seward <jseward@bzip.org>
48 
49   This program is based on (at least) the work of:
50      Mike Burrows
51      David Wheeler
52      Peter Fenwick
53      Alistair Moffat
54      Radford Neal
55      Ian H. Witten
56      Robert Sedgewick
57      Jon L. Bentley
58 
59   For more information on these sources, see the manual.
60 --*/
61 
62 #include "bzlib_private.h"
63 #include <log.h>
64 
65 /*---------------------------------------------*/
66 /*--- Fallback O(N log(N)^2) sorting        ---*/
67 /*--- algorithm, for repetitive blocks      ---*/
68 /*---------------------------------------------*/
69 
70 /*---------------------------------------------*/
71 static
72 __inline__
fallbackSimpleSort(UInt32 * fmap,UInt32 * eclass,Int32 lo,Int32 hi)73 void fallbackSimpleSort ( UInt32* fmap,
74                           UInt32* eclass,
75                           Int32   lo,
76                           Int32   hi )
77 {
78    Int32 i, j, tmp;
79    UInt32 ec_tmp;
80 
81    if (lo == hi) return;
82 
83    if (hi - lo > 3) {
84       for ( i = hi-4; i >= lo; i-- ) {
85          tmp = fmap[i];
86          ec_tmp = eclass[tmp];
87          for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
88             fmap[j-4] = fmap[j];
89          fmap[j-4] = tmp;
90       }
91    }
92 
93    for ( i = hi-1; i >= lo; i-- ) {
94       tmp = fmap[i];
95       ec_tmp = eclass[tmp];
96       for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
97          fmap[j-1] = fmap[j];
98       fmap[j-1] = tmp;
99    }
100 }
101 
102 /*---------------------------------------------*/
103 #define fswap(zz1, zz2) \
104    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
105 
106 #define fvswap(zzp1, zzp2, zzn)       \
107 {                                     \
108    Int32 yyp1 = (zzp1);               \
109    Int32 yyp2 = (zzp2);               \
110    Int32 yyn  = (zzn);                \
111    while (yyn > 0) {                  \
112       fswap(fmap[yyp1], fmap[yyp2]);  \
113       yyp1++; yyp2++; yyn--;          \
114    }                                  \
115 }
116 
117 #define fmin(a,b) ((a) < (b)) ? (a) : (b)
118 
119 #define fpush(lz,hz) { stackLo[sp] = lz; \
120                        stackHi[sp] = hz; \
121                        sp++; }
122 
123 #define fpop(lz,hz) { sp--;              \
124                       lz = stackLo[sp];  \
125                       hz = stackHi[sp]; }
126 
127 #define FALLBACK_QSORT_SMALL_THRESH 10
128 #define FALLBACK_QSORT_STACK_SIZE   100
129 
130 static
fallbackQSort3(UInt32 * fmap,UInt32 * eclass,Int32 loSt,Int32 hiSt)131 void fallbackQSort3 ( UInt32* fmap,
132                       UInt32* eclass,
133                       Int32   loSt,
134                       Int32   hiSt )
135 {
136    Int32 unLo, unHi, ltLo, gtHi, n, m;
137    Int32 sp, lo, hi;
138    UInt32 med, r, r3;
139    Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
140    Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
141 
142    r = 0;
143 
144    sp = 0;
145    fpush ( loSt, hiSt );
146 
147    while (sp > 0) {
148 
149       AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 );
150 
151       fpop ( lo, hi );
152       if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
153          fallbackSimpleSort ( fmap, eclass, lo, hi );
154          continue;
155       }
156 
157       /* Random partitioning.  Median of 3 sometimes fails to
158          avoid bad cases.  Median of 9 seems to help but
159          looks rather expensive.  This too seems to work but
160          is cheaper.  Guidance for the magic constants
161          7621 and 32768 is taken from Sedgewick's algorithms
162          book, chapter 35.
163       */
164       r = ((r * 7621) + 1) % 32768;
165       r3 = r % 3;
166       if (r3 == 0) med = eclass[fmap[lo]]; else
167       if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
168                    med = eclass[fmap[hi]];
169 
170       unLo = ltLo = lo;
171       unHi = gtHi = hi;
172 
173       while (1) {
174          while (1) {
175             if (unLo > unHi) break;
176             n = (Int32)eclass[fmap[unLo]] - (Int32)med;
177             if (n == 0) {
178                fswap(fmap[unLo], fmap[ltLo]);
179                ltLo++; unLo++;
180                continue;
181             };
182             if (n > 0) break;
183             unLo++;
184          }
185          while (1) {
186             if (unLo > unHi) break;
187             n = (Int32)eclass[fmap[unHi]] - (Int32)med;
188             if (n == 0) {
189                fswap(fmap[unHi], fmap[gtHi]);
190                gtHi--; unHi--;
191                continue;
192             };
193             if (n < 0) break;
194             unHi--;
195          }
196          if (unLo > unHi) break;
197          fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
198       }
199 
200       AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
201 
202       if (gtHi < ltLo) continue;
203 
204       n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
205       m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
206 
207       n = lo + unLo - ltLo - 1;
208       m = hi - (gtHi - unHi) + 1;
209 
210       if (n - lo > hi - m) {
211          fpush ( lo, n );
212          fpush ( m, hi );
213       } else {
214          fpush ( m, hi );
215          fpush ( lo, n );
216       }
217    }
218 }
219 
220 #undef fmin
221 #undef fpush
222 #undef fpop
223 #undef fswap
224 #undef fvswap
225 #undef FALLBACK_QSORT_SMALL_THRESH
226 #undef FALLBACK_QSORT_STACK_SIZE
227 
228 /*---------------------------------------------*/
229 /* Pre:
230       nblock > 0
231       eclass exists for [0 .. nblock-1]
232       ((UChar*)eclass) [0 .. nblock-1] holds block
233       ptr exists for [0 .. nblock-1]
234 
235    Post:
236       ((UChar*)eclass) [0 .. nblock-1] holds block
237       All other areas of eclass destroyed
238       fmap [0 .. nblock-1] holds sorted order
239       bhtab [ 0 .. 2+(nblock/32) ] destroyed
240 */
241 
242 #define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
243 #define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
244 #define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
245 #define      WORD_BH(zz)  bhtab[(zz) >> 5]
246 #define UNALIGNED_BH(zz)  ((zz) & 0x01f)
247 
248 static
fallbackSort(UInt32 * fmap,UInt32 * eclass,UInt32 * bhtab,Int32 nblock,Int32 verb)249 void fallbackSort ( UInt32* fmap,
250                     UInt32* eclass,
251                     UInt32* bhtab,
252                     Int32   nblock,
253                     Int32   verb )
254 {
255    Int32 ftab[257];
256    Int32 ftabCopy[256];
257    Int32 H, i, j, k, l, r, cc, cc1;
258    Int32 nNotDone;
259    Int32 nBhtab;
260    UChar* eclass8 = (UChar*)eclass;
261 
262    /*--
263       Initial 1-char radix sort to generate
264       initial fmap and initial BH bits.
265    --*/
266    if (verb >= 4)
267       VPrintf0 ( "        bucket sorting ...\n" );
268    for (i = 0; i < 257;    i++) ftab[i] = 0;
269    for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
270    for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
271    for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
272 
273    for (i = 0; i < nblock; i++) {
274       j = eclass8[i];
275       k = ftab[j] - 1;
276       ftab[j] = k;
277       fmap[k] = i;
278    }
279 
280    nBhtab = 2 + (nblock / 32);
281    for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
282    for (i = 0; i < 256; i++) SET_BH(ftab[i]);
283 
284    /*--
285       Inductively refine the buckets.  Kind-of an
286       "exponential radix sort" (!), inspired by the
287       Manber-Myers suffix array construction algorithm.
288    --*/
289 
290    /*-- set sentinel bits for block-end detection --*/
291    for (i = 0; i < 32; i++) {
292       SET_BH(nblock + 2*i);
293       CLEAR_BH(nblock + 2*i + 1);
294    }
295 
296    /*-- the log(N) loop --*/
297    H = 1;
298    while (1) {
299 
300       if (verb >= 4)
301          VPrintf1 ( "        depth %6d has ", H );
302 
303       j = 0;
304       for (i = 0; i < nblock; i++) {
305          if (ISSET_BH(i)) j = i;
306          k = fmap[i] - H; if (k < 0) k += nblock;
307          eclass[k] = j;
308       }
309 
310       nNotDone = 0;
311       r = -1;
312       while (1) {
313 
314 	 /*-- find the next non-singleton bucket --*/
315          k = r + 1;
316          while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
317          if (ISSET_BH(k)) {
318             while (WORD_BH(k) == 0xffffffff) k += 32;
319             while (ISSET_BH(k)) k++;
320          }
321          l = k - 1;
322          if (l >= nblock) break;
323          while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
324          if (!ISSET_BH(k)) {
325             while (WORD_BH(k) == 0x00000000) k += 32;
326             while (!ISSET_BH(k)) k++;
327          }
328          r = k - 1;
329          if (r >= nblock) break;
330 
331          /*-- now [l, r] bracket current bucket --*/
332          if (r > l) {
333             nNotDone += (r - l + 1);
334             fallbackQSort3 ( fmap, eclass, l, r );
335 
336             /*-- scan bucket and generate header bits-- */
337             cc = -1;
338             for (i = l; i <= r; i++) {
339                cc1 = eclass[fmap[i]];
340                if (cc != cc1) { SET_BH(i); cc = cc1; };
341             }
342          }
343       }
344 
345       if (verb >= 4)
346          VPrintf1 ( "%6d unresolved strings\n", nNotDone );
347 
348       H *= 2;
349       if (H > nblock || nNotDone == 0) break;
350    }
351 
352    /*--
353       Reconstruct the original block in
354       eclass8 [0 .. nblock-1], since the
355       previous phase destroyed it.
356    --*/
357    if (verb >= 4)
358       VPrintf0 ( "        reconstructing block ...\n" );
359    j = 0;
360    for (i = 0; i < nblock; i++) {
361       while (ftabCopy[j] == 0) j++;
362       ftabCopy[j]--;
363       eclass8[fmap[i]] = (UChar)j;
364    }
365    AssertH ( j < 256, 1005 );
366 }
367 
368 #undef       SET_BH
369 #undef     CLEAR_BH
370 #undef     ISSET_BH
371 #undef      WORD_BH
372 #undef UNALIGNED_BH
373 
374 /*---------------------------------------------*/
375 /*--- The main, O(N^2 log(N)) sorting       ---*/
376 /*--- algorithm.  Faster for "normal"       ---*/
377 /*--- non-repetitive blocks.                ---*/
378 /*---------------------------------------------*/
379 
380 /*---------------------------------------------*/
381 static
382 __inline__
mainGtU(UInt32 i1,UInt32 i2,UChar * block,UInt16 * quadrant,UInt32 nblock,Int32 * budget)383 Bool mainGtU ( UInt32  i1,
384                UInt32  i2,
385                UChar*  block,
386                UInt16* quadrant,
387                UInt32  nblock,
388                Int32*  budget )
389 {
390    Int32  k;
391    UChar  c1, c2;
392    UInt16 s1, s2;
393 
394    AssertD ( i1 != i2, "mainGtU" );
395    /* 1 */
396    c1 = block[i1]; c2 = block[i2];
397    if (c1 != c2) return (c1 > c2);
398    i1++; i2++;
399    /* 2 */
400    c1 = block[i1]; c2 = block[i2];
401    if (c1 != c2) return (c1 > c2);
402    i1++; i2++;
403    /* 3 */
404    c1 = block[i1]; c2 = block[i2];
405    if (c1 != c2) return (c1 > c2);
406    i1++; i2++;
407    /* 4 */
408    c1 = block[i1]; c2 = block[i2];
409    if (c1 != c2) return (c1 > c2);
410    i1++; i2++;
411    /* 5 */
412    c1 = block[i1]; c2 = block[i2];
413    if (c1 != c2) return (c1 > c2);
414    i1++; i2++;
415    /* 6 */
416    c1 = block[i1]; c2 = block[i2];
417    if (c1 != c2) return (c1 > c2);
418    i1++; i2++;
419    /* 7 */
420    c1 = block[i1]; c2 = block[i2];
421    if (c1 != c2) return (c1 > c2);
422    i1++; i2++;
423    /* 8 */
424    c1 = block[i1]; c2 = block[i2];
425    if (c1 != c2) return (c1 > c2);
426    i1++; i2++;
427    /* 9 */
428    c1 = block[i1]; c2 = block[i2];
429    if (c1 != c2) return (c1 > c2);
430    i1++; i2++;
431    /* 10 */
432    c1 = block[i1]; c2 = block[i2];
433    if (c1 != c2) return (c1 > c2);
434    i1++; i2++;
435    /* 11 */
436    c1 = block[i1]; c2 = block[i2];
437    if (c1 != c2) return (c1 > c2);
438    i1++; i2++;
439    /* 12 */
440    c1 = block[i1]; c2 = block[i2];
441    if (c1 != c2) return (c1 > c2);
442    i1++; i2++;
443 
444    k = nblock + 8;
445 
446    do {
447       /* 1 */
448       c1 = block[i1]; c2 = block[i2];
449       if (c1 != c2) return (c1 > c2);
450       s1 = quadrant[i1]; s2 = quadrant[i2];
451       if (s1 != s2) return (s1 > s2);
452       i1++; i2++;
453       /* 2 */
454       c1 = block[i1]; c2 = block[i2];
455       if (c1 != c2) return (c1 > c2);
456       s1 = quadrant[i1]; s2 = quadrant[i2];
457       if (s1 != s2) return (s1 > s2);
458       i1++; i2++;
459       /* 3 */
460       c1 = block[i1]; c2 = block[i2];
461       if (c1 != c2) return (c1 > c2);
462       s1 = quadrant[i1]; s2 = quadrant[i2];
463       if (s1 != s2) return (s1 > s2);
464       i1++; i2++;
465       /* 4 */
466       c1 = block[i1]; c2 = block[i2];
467       if (c1 != c2) return (c1 > c2);
468       s1 = quadrant[i1]; s2 = quadrant[i2];
469       if (s1 != s2) return (s1 > s2);
470       i1++; i2++;
471       /* 5 */
472       c1 = block[i1]; c2 = block[i2];
473       if (c1 != c2) return (c1 > c2);
474       s1 = quadrant[i1]; s2 = quadrant[i2];
475       if (s1 != s2) return (s1 > s2);
476       i1++; i2++;
477       /* 6 */
478       c1 = block[i1]; c2 = block[i2];
479       if (c1 != c2) return (c1 > c2);
480       s1 = quadrant[i1]; s2 = quadrant[i2];
481       if (s1 != s2) return (s1 > s2);
482       i1++; i2++;
483       /* 7 */
484       c1 = block[i1]; c2 = block[i2];
485       if (c1 != c2) return (c1 > c2);
486       s1 = quadrant[i1]; s2 = quadrant[i2];
487       if (s1 != s2) return (s1 > s2);
488       i1++; i2++;
489       /* 8 */
490       c1 = block[i1]; c2 = block[i2];
491       if (c1 != c2) return (c1 > c2);
492       s1 = quadrant[i1]; s2 = quadrant[i2];
493       if (s1 != s2) return (s1 > s2);
494       i1++; i2++;
495 
496       if (i1 >= nblock) i1 -= nblock;
497       if (i2 >= nblock) i2 -= nblock;
498 
499       k -= 8;
500       (*budget)--;
501    }
502       while (k >= 0);
503 
504    return False;
505 }
506 
507 /*---------------------------------------------*/
508 /*--
509    Knuth's increments seem to work better
510    than Incerpi-Sedgewick here.  Possibly
511    because the number of elems to sort is
512    usually small, typically <= 20.
513 --*/
514 static
515 Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
516                    9841, 29524, 88573, 265720,
517                    797161, 2391484 };
518 
519 static
mainSimpleSort(UInt32 * ptr,UChar * block,UInt16 * quadrant,Int32 nblock,Int32 lo,Int32 hi,Int32 d,Int32 * budget)520 void mainSimpleSort ( UInt32* ptr,
521                       UChar*  block,
522                       UInt16* quadrant,
523                       Int32   nblock,
524                       Int32   lo,
525                       Int32   hi,
526                       Int32   d,
527                       Int32*  budget )
528 {
529    Int32 i, j, h, bigN, hp;
530    UInt32 v;
531 
532    bigN = hi - lo + 1;
533    if (bigN < 2) return;
534 
535    hp = 0;
536    while (incs[hp] < bigN) hp++;
537    hp--;
538 
539    for (; hp >= 0; hp--) {
540       h = incs[hp];
541 
542       i = lo + h;
543       while (True) {
544 
545          /*-- copy 1 --*/
546          if (i > hi) break;
547          v = ptr[i];
548          j = i;
549          while ( mainGtU (
550                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
551                  ) ) {
552             ptr[j] = ptr[j-h];
553             j = j - h;
554             if (j <= (lo + h - 1)) break;
555          }
556          ptr[j] = v;
557          i++;
558 
559          /*-- copy 2 --*/
560          if (i > hi) break;
561          v = ptr[i];
562          j = i;
563          while ( mainGtU (
564                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
565                  ) ) {
566             ptr[j] = ptr[j-h];
567             j = j - h;
568             if (j <= (lo + h - 1)) break;
569          }
570          ptr[j] = v;
571          i++;
572 
573          /*-- copy 3 --*/
574          if (i > hi) break;
575          v = ptr[i];
576          j = i;
577          while ( mainGtU (
578                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
579                  ) ) {
580             ptr[j] = ptr[j-h];
581             j = j - h;
582             if (j <= (lo + h - 1)) break;
583          }
584          ptr[j] = v;
585          i++;
586 
587          if (*budget < 0) return;
588       }
589    }
590 }
591 
592 /*---------------------------------------------*/
593 /*--
594    The following is an implementation of
595    an elegant 3-way quicksort for strings,
596    described in a paper "Fast Algorithms for
597    Sorting and Searching Strings", by Robert
598    Sedgewick and Jon L. Bentley.
599 --*/
600 
601 #define mswap(zz1, zz2) \
602    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
603 
604 #define mvswap(zzp1, zzp2, zzn)       \
605 {                                     \
606    Int32 yyp1 = (zzp1);               \
607    Int32 yyp2 = (zzp2);               \
608    Int32 yyn  = (zzn);                \
609    while (yyn > 0) {                  \
610       mswap(ptr[yyp1], ptr[yyp2]);    \
611       yyp1++; yyp2++; yyn--;          \
612    }                                  \
613 }
614 
615 static
616 __inline__
mmed3(UChar a,UChar b,UChar c)617 UChar mmed3 ( UChar a, UChar b, UChar c )
618 {
619    UChar t;
620    if (a > b) { t = a; a = b; b = t; };
621    if (b > c) {
622       b = c;
623       if (a > b) b = a;
624    }
625    return b;
626 }
627 
628 #define mmin(a,b) ((a) < (b)) ? (a) : (b)
629 
630 #define mpush(lz,hz,dz) { stackLo[sp] = lz; \
631                           stackHi[sp] = hz; \
632                           stackD [sp] = dz; \
633                           sp++; }
634 
635 #define mpop(lz,hz,dz) { sp--;             \
636                          lz = stackLo[sp]; \
637                          hz = stackHi[sp]; \
638                          dz = stackD [sp]; }
639 
640 #define mnextsize(az) (nextHi[az]-nextLo[az])
641 
642 #define mnextswap(az,bz)                                        \
643    { Int32 tz;                                                  \
644      tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
645      tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
646      tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
647 
648 #define MAIN_QSORT_SMALL_THRESH 20
649 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
650 #define MAIN_QSORT_STACK_SIZE 100
651 
652 static
mainQSort3(UInt32 * ptr,UChar * block,UInt16 * quadrant,Int32 nblock,Int32 loSt,Int32 hiSt,Int32 dSt,Int32 * budget)653 void mainQSort3 ( UInt32* ptr,
654                   UChar*  block,
655                   UInt16* quadrant,
656                   Int32   nblock,
657                   Int32   loSt,
658                   Int32   hiSt,
659                   Int32   dSt,
660                   Int32*  budget )
661 {
662    Int32 unLo, unHi, ltLo, gtHi, n, m, med;
663    Int32 sp, lo, hi, d;
664 
665    Int32 stackLo[MAIN_QSORT_STACK_SIZE];
666    Int32 stackHi[MAIN_QSORT_STACK_SIZE];
667    Int32 stackD [MAIN_QSORT_STACK_SIZE];
668 
669    Int32 nextLo[3];
670    Int32 nextHi[3];
671    Int32 nextD [3];
672 
673    sp = 0;
674    mpush ( loSt, hiSt, dSt );
675 
676    while (sp > 0) {
677 
678       AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 );
679 
680       mpop ( lo, hi, d );
681       if (hi - lo < MAIN_QSORT_SMALL_THRESH ||
682           d > MAIN_QSORT_DEPTH_THRESH) {
683          mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
684          if (*budget < 0) return;
685          continue;
686       }
687 
688       med = (Int32)
689             mmed3 ( block[ptr[ lo         ]+d],
690                     block[ptr[ hi         ]+d],
691                     block[ptr[ (lo+hi)>>1 ]+d] );
692 
693       unLo = ltLo = lo;
694       unHi = gtHi = hi;
695 
696       while (True) {
697          while (True) {
698             if (unLo > unHi) break;
699             n = ((Int32)block[ptr[unLo]+d]) - med;
700             if (n == 0) {
701                mswap(ptr[unLo], ptr[ltLo]);
702                ltLo++; unLo++; continue;
703             };
704             if (n >  0) break;
705             unLo++;
706          }
707          while (True) {
708             if (unLo > unHi) break;
709             n = ((Int32)block[ptr[unHi]+d]) - med;
710             if (n == 0) {
711                mswap(ptr[unHi], ptr[gtHi]);
712                gtHi--; unHi--; continue;
713             };
714             if (n <  0) break;
715             unHi--;
716          }
717          if (unLo > unHi) break;
718          mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
719       }
720 
721       AssertD ( unHi == unLo-1, "mainQSort3(2)" );
722 
723       if (gtHi < ltLo) {
724          mpush(lo, hi, d+1 );
725          continue;
726       }
727 
728       n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
729       m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
730 
731       n = lo + unLo - ltLo - 1;
732       m = hi - (gtHi - unHi) + 1;
733 
734       nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
735       nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
736       nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
737 
738       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
739       if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
740       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
741 
742       AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
743       AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
744 
745       mpush (nextLo[0], nextHi[0], nextD[0]);
746       mpush (nextLo[1], nextHi[1], nextD[1]);
747       mpush (nextLo[2], nextHi[2], nextD[2]);
748    }
749 }
750 
751 #undef mswap
752 #undef mvswap
753 #undef mpush
754 #undef mpop
755 #undef mmin
756 #undef mnextsize
757 #undef mnextswap
758 #undef MAIN_QSORT_SMALL_THRESH
759 #undef MAIN_QSORT_DEPTH_THRESH
760 #undef MAIN_QSORT_STACK_SIZE
761 
762 /*---------------------------------------------*/
763 /* Pre:
764       nblock > N_OVERSHOOT
765       block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
766       ((UChar*)block32) [0 .. nblock-1] holds block
767       ptr exists for [0 .. nblock-1]
768 
769    Post:
770       ((UChar*)block32) [0 .. nblock-1] holds block
771       All other areas of block32 destroyed
772       ftab [0 .. 65536 ] destroyed
773       ptr [0 .. nblock-1] holds sorted order
774       if (*budget < 0), sorting was abandoned
775 */
776 
777 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
778 #define SETMASK (1 << 21)
779 #define CLEARMASK (~(SETMASK))
780 
781 static
mainSort(UInt32 * ptr,UChar * block,UInt16 * quadrant,UInt32 * ftab,Int32 nblock,Int32 verb,Int32 * budget)782 void mainSort ( UInt32* ptr,
783                 UChar*  block,
784                 UInt16* quadrant,
785                 UInt32* ftab,
786                 Int32   nblock,
787                 Int32   verb,
788                 Int32*  budget )
789 {
790    Int32  i, j, k, ss, sb;
791    Int32  runningOrder[256];
792    Bool   bigDone[256];
793    Int32  copyStart[256];
794    Int32  copyEnd  [256];
795    UChar  c1;
796    Int32  numQSorted;
797    UInt16 s;
798    if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
799 
800    /*-- set up the 2-byte frequency table --*/
801    for (i = 65536; i >= 0; i--) ftab[i] = 0;
802 
803    j = block[0] << 8;
804    i = nblock-1;
805    for (; i >= 3; i -= 4) {
806       quadrant[i] = 0;
807       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
808       ftab[j]++;
809       quadrant[i-1] = 0;
810       j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
811       ftab[j]++;
812       quadrant[i-2] = 0;
813       j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
814       ftab[j]++;
815       quadrant[i-3] = 0;
816       j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
817       ftab[j]++;
818    }
819    for (; i >= 0; i--) {
820       quadrant[i] = 0;
821       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
822       ftab[j]++;
823    }
824 
825    /*-- (emphasises close relationship of block & quadrant) --*/
826    for (i = 0; i < BZ_N_OVERSHOOT; i++) {
827       block   [nblock+i] = block[i];
828       quadrant[nblock+i] = 0;
829    }
830 
831    if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
832 
833    /*-- Complete the initial radix sort --*/
834    for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
835 
836    s = block[0] << 8;
837    i = nblock-1;
838    for (; i >= 3; i -= 4) {
839       s = (s >> 8) | (block[i] << 8);
840       j = ftab[s] -1;
841       ftab[s] = j;
842       ptr[j] = i;
843       s = (s >> 8) | (block[i-1] << 8);
844       j = ftab[s] -1;
845       ftab[s] = j;
846       ptr[j] = i-1;
847       s = (s >> 8) | (block[i-2] << 8);
848       j = ftab[s] -1;
849       ftab[s] = j;
850       ptr[j] = i-2;
851       s = (s >> 8) | (block[i-3] << 8);
852       j = ftab[s] -1;
853       ftab[s] = j;
854       ptr[j] = i-3;
855    }
856    for (; i >= 0; i--) {
857       s = (s >> 8) | (block[i] << 8);
858       j = ftab[s] -1;
859       ftab[s] = j;
860       ptr[j] = i;
861    }
862 
863    /*--
864       Now ftab contains the first loc of every small bucket.
865       Calculate the running order, from smallest to largest
866       big bucket.
867    --*/
868    for (i = 0; i <= 255; i++) {
869       bigDone     [i] = False;
870       runningOrder[i] = i;
871    }
872 
873    {
874       Int32 vv;
875       Int32 h = 1;
876       do h = 3 * h + 1; while (h <= 256);
877       do {
878          h = h / 3;
879          for (i = h; i <= 255; i++) {
880             vv = runningOrder[i];
881             j = i;
882             while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
883                runningOrder[j] = runningOrder[j-h];
884                j = j - h;
885                if (j <= (h - 1)) goto zero;
886             }
887             zero:
888             runningOrder[j] = vv;
889          }
890       } while (h != 1);
891    }
892 
893    /*--
894       The main sorting loop.
895    --*/
896 
897    numQSorted = 0;
898 
899    for (i = 0; i <= 255; i++) {
900 
901       /*--
902          Process big buckets, starting with the least full.
903          Basically this is a 3-step process in which we call
904          mainQSort3 to sort the small buckets [ss, j], but
905          also make a big effort to avoid the calls if we can.
906       --*/
907       ss = runningOrder[i];
908 
909       /*--
910          Step 1:
911          Complete the big bucket [ss] by quicksorting
912          any unsorted small buckets [ss, j], for j != ss.
913          Hopefully previous pointer-scanning phases have already
914          completed many of the small buckets [ss, j], so
915          we don't have to sort them at all.
916       --*/
917       for (j = 0; j <= 255; j++) {
918          if (j != ss) {
919             sb = (ss << 8) + j;
920             if ( ! (ftab[sb] & SETMASK) ) {
921                Int32 lo = ftab[sb]   & CLEARMASK;
922                Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
923                if (hi > lo) {
924                   if (verb >= 4)
925                      VPrintf4 ( "        qsort [0x%x, 0x%x]   "
926                                 "done %d   this %d\n",
927                                 ss, j, numQSorted, hi - lo + 1 );
928                   mainQSort3 (
929                      ptr, block, quadrant, nblock,
930                      lo, hi, BZ_N_RADIX, budget
931                   );
932                   numQSorted += (hi - lo + 1);
933                   if (*budget < 0) return;
934                }
935             }
936             ftab[sb] |= SETMASK;
937          }
938       }
939 
940       AssertH ( !bigDone[ss], 1006 );
941 
942       /*--
943          Step 2:
944          Now scan this big bucket [ss] so as to synthesise the
945          sorted order for small buckets [t, ss] for all t,
946          including, magically, the bucket [ss,ss] too.
947          This will avoid doing Real Work in subsequent Step 1's.
948       --*/
949       {
950          for (j = 0; j <= 255; j++) {
951             copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
952             copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
953          }
954          for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
955             k = ptr[j]-1; if (k < 0) k += nblock;
956             c1 = block[k];
957             if (!bigDone[c1])
958                ptr[ copyStart[c1]++ ] = k;
959          }
960          for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
961             k = ptr[j]-1; if (k < 0) k += nblock;
962             c1 = block[k];
963             if (!bigDone[c1])
964                ptr[ copyEnd[c1]-- ] = k;
965          }
966       }
967 
968       AssertH ( (copyStart[ss]-1 == copyEnd[ss])
969                 ||
970                 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
971                    Necessity for this case is demonstrated by compressing
972                    a sequence of approximately 48.5 million of character
973                    251; 1.0.0/1.0.1 will then die here. */
974                 (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
975                 1007 )
976 
977       for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
978 
979       /*--
980          Step 3:
981          The [ss] big bucket is now done.  Record this fact,
982          and update the quadrant descriptors.  Remember to
983          update quadrants in the overshoot area too, if
984          necessary.  The "if (i < 255)" test merely skips
985          this updating for the last bucket processed, since
986          updating for the last bucket is pointless.
987 
988          The quadrant array provides a way to incrementally
989          cache sort orderings, as they appear, so as to
990          make subsequent comparisons in fullGtU() complete
991          faster.  For repetitive blocks this makes a big
992          difference (but not big enough to be able to avoid
993          the fallback sorting mechanism, exponential radix sort).
994 
995          The precise meaning is: at all times:
996 
997             for 0 <= i < nblock and 0 <= j <= nblock
998 
999             if block[i] != block[j],
1000 
1001                then the relative values of quadrant[i] and
1002                     quadrant[j] are meaningless.
1003 
1004                else {
1005                   if quadrant[i] < quadrant[j]
1006                      then the string starting at i lexicographically
1007                      precedes the string starting at j
1008 
1009                   else if quadrant[i] > quadrant[j]
1010                      then the string starting at j lexicographically
1011                      precedes the string starting at i
1012 
1013                   else
1014                      the relative ordering of the strings starting
1015                      at i and j has not yet been determined.
1016                }
1017       --*/
1018       bigDone[ss] = True;
1019 
1020       if (i < 255) {
1021          Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
1022          Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
1023          Int32 shifts   = 0;
1024 
1025          while ((bbSize >> shifts) > 65534) shifts++;
1026 
1027          for (j = bbSize-1; j >= 0; j--) {
1028             Int32 a2update     = ptr[bbStart + j];
1029             UInt16 qVal        = (UInt16)(j >> shifts);
1030             quadrant[a2update] = qVal;
1031             if (a2update < BZ_N_OVERSHOOT)
1032                quadrant[a2update + nblock] = qVal;
1033          }
1034          AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
1035       }
1036 
1037    }
1038 
1039    if (verb >= 4)
1040       VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
1041                  nblock, numQSorted, nblock - numQSorted );
1042 }
1043 
1044 #undef BIGFREQ
1045 #undef SETMASK
1046 #undef CLEARMASK
1047 
1048 /*---------------------------------------------*/
1049 /* Pre:
1050       nblock > 0
1051       arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1052       ((UChar*)arr2)  [0 .. nblock-1] holds block
1053       arr1 exists for [0 .. nblock-1]
1054 
1055    Post:
1056       ((UChar*)arr2) [0 .. nblock-1] holds block
1057       All other areas of block destroyed
1058       ftab [ 0 .. 65536 ] destroyed
1059       arr1 [0 .. nblock-1] holds sorted order
1060 */
BZ2_blockSort(EState * s)1061 void BZ2_blockSort ( EState* s )
1062 {
1063    UInt32* ptr    = s->ptr;
1064    UChar*  block  = s->block;
1065    UInt32* ftab   = s->ftab;
1066    Int32   nblock = s->nblock;
1067    Int32   verb   = s->verbosity;
1068    Int32   wfact  = s->workFactor;
1069    UInt16* quadrant;
1070    Int32   budget;
1071    Int32   budgetInit;
1072    Int32   i;
1073 
1074    if (nblock < 10000) {
1075       fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1076    } else {
1077       /* Calculate the location for quadrant, remembering to get
1078          the alignment right.  Assumes that &(block[0]) is at least
1079          2-byte aligned -- this should be ok since block is really
1080          the first section of arr2.
1081       */
1082       i = nblock+BZ_N_OVERSHOOT;
1083       if (i & 1) i++;
1084       quadrant = (UInt16*)(&(block[i]));
1085 
1086       /* (wfact-1) / 3 puts the default-factor-30
1087          transition point at very roughly the same place as
1088          with v0.1 and v0.9.0.
1089          Not that it particularly matters any more, since the
1090          resulting compressed stream is now the same regardless
1091          of whether or not we use the main sort or fallback sort.
1092       */
1093       if (wfact < 1  ) wfact = 1;
1094       if (wfact > 100) wfact = 100;
1095       budgetInit = nblock * ((wfact-1) / 3);
1096       budget = budgetInit;
1097 
1098       mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
1099       if (verb >= 3)
1100          VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
1101                     budgetInit - budget,
1102                     nblock,
1103                     (float)(budgetInit - budget) /
1104                     (float)(nblock==0 ? 1 : nblock) );
1105       if (budget < 0) {
1106          if (verb >= 2)
1107             VPrintf0 ( "    too repetitive; using fallback"
1108                        " sorting algorithm\n" );
1109          fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1110       }
1111    }
1112 
1113    s->origPtr = -1;
1114    for (i = 0; i < s->nblock; i++)
1115       if (ptr[i] == 0)
1116          { s->origPtr = i; break; };
1117 
1118    AssertH( s->origPtr != -1, 1003 );
1119 }
1120 
1121 /*-------------------------------------------------------------*/
1122 /*--- end                                       blocksort.c ---*/
1123 /*-------------------------------------------------------------*/
1124