blocksort.c - vx32 - Local 9vx git repository for patches.
 (HTM) git clone git://r-36.net/vx32
 (DIR) Log
 (DIR) Files
 (DIR) Refs
       ---
       blocksort.c (32594B)
       ---
            1 
            2 /*-------------------------------------------------------------*/
            3 /*--- Block sorting machinery                               ---*/
            4 /*---                                           blocksort.c ---*/
            5 /*-------------------------------------------------------------*/
            6 
            7 /*--
            8   This file is a part of bzip2 and/or libbzip2, a program and
            9   library for lossless, block-sorting data compression.
           10 
           11   Copyright (C) 1996-2005 Julian R Seward.  All rights reserved.
           12 
           13   Redistribution and use in source and binary forms, with or without
           14   modification, are permitted provided that the following conditions
           15   are met:
           16 
           17   1. Redistributions of source code must retain the above copyright
           18      notice, this list of conditions and the following disclaimer.
           19 
           20   2. The origin of this software must not be misrepresented; you must 
           21      not claim that you wrote the original software.  If you use this 
           22      software in a product, an acknowledgment in the product 
           23      documentation would be appreciated but is not required.
           24 
           25   3. Altered source versions must be plainly marked as such, and must
           26      not be misrepresented as being the original software.
           27 
           28   4. The name of the author may not be used to endorse or promote 
           29      products derived from this software without specific prior written 
           30      permission.
           31 
           32   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
           33   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
           34   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
           35   ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
           36   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
           37   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
           38   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
           39   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
           40   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
           41   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
           42   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
           43 
           44   Julian Seward, Cambridge, UK.
           45   jseward@bzip.org
           46   bzip2/libbzip2 version 1.0 of 21 March 2000
           47 
           48   This program is based on (at least) the work of:
           49      Mike Burrows
           50      David Wheeler
           51      Peter Fenwick
           52      Alistair Moffat
           53      Radford Neal
           54      Ian H. Witten
           55      Robert Sedgewick
           56      Jon L. Bentley
           57 
           58   For more information on these sources, see the manual.
           59 
           60   To get some idea how the block sorting algorithms in this file 
           61   work, read my paper 
           62      On the Performance of BWT Sorting Algorithms
           63   in Proceedings of the IEEE Data Compression Conference 2000,
           64   Snowbird, Utah, USA, 27-30 March 2000.  The main sort in this
           65   file implements the algorithm called  cache  in the paper.
           66 --*/
           67 
           68 
           69 #include "bzlib_private.h"
           70 
           71 /*---------------------------------------------*/
           72 /*--- Fallback O(N log(N)^2) sorting        ---*/
           73 /*--- algorithm, for repetitive blocks      ---*/
           74 /*---------------------------------------------*/
           75 
           76 /*---------------------------------------------*/
           77 static 
           78 __inline__
           79 void fallbackSimpleSort ( UInt32* fmap, 
           80                           UInt32* eclass, 
           81                           Int32   lo, 
           82                           Int32   hi )
           83 {
           84    Int32 i, j, tmp;
           85    UInt32 ec_tmp;
           86 
           87    if (lo == hi) return;
           88 
           89    if (hi - lo > 3) {
           90       for ( i = hi-4; i >= lo; i-- ) {
           91          tmp = fmap[i];
           92          ec_tmp = eclass[tmp];
           93          for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
           94             fmap[j-4] = fmap[j];
           95          fmap[j-4] = tmp;
           96       }
           97    }
           98 
           99    for ( i = hi-1; i >= lo; i-- ) {
          100       tmp = fmap[i];
          101       ec_tmp = eclass[tmp];
          102       for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
          103          fmap[j-1] = fmap[j];
          104       fmap[j-1] = tmp;
          105    }
          106 }
          107 
          108 
          109 /*---------------------------------------------*/
          110 #define fswap(zz1, zz2) \
          111    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
          112 
          113 #define fvswap(zzp1, zzp2, zzn)       \
          114 {                                     \
          115    Int32 yyp1 = (zzp1);               \
          116    Int32 yyp2 = (zzp2);               \
          117    Int32 yyn  = (zzn);                \
          118    while (yyn > 0) {                  \
          119       fswap(fmap[yyp1], fmap[yyp2]);  \
          120       yyp1++; yyp2++; yyn--;          \
          121    }                                  \
          122 }
          123 
          124 
          125 #define fmin(a,b) ((a) < (b)) ? (a) : (b)
          126 
          127 #define fpush(lz,hz) { stackLo[sp] = lz; \
          128                        stackHi[sp] = hz; \
          129                        sp++; }
          130 
          131 #define fpop(lz,hz) { sp--;              \
          132                       lz = stackLo[sp];  \
          133                       hz = stackHi[sp]; }
          134 
          135 #define FALLBACK_QSORT_SMALL_THRESH 10
          136 #define FALLBACK_QSORT_STACK_SIZE   100
          137 
          138 
          139 static
          140 void fallbackQSort3 ( UInt32* fmap, 
          141                       UInt32* eclass,
          142                       Int32   loSt, 
          143                       Int32   hiSt )
          144 {
          145    Int32 unLo, unHi, ltLo, gtHi, n, m;
          146    Int32 sp, lo, hi;
          147    UInt32 med, r, r3;
          148    Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
          149    Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
          150 
          151    r = 0;
          152 
          153    sp = 0;
          154    fpush ( loSt, hiSt );
          155 
          156    while (sp > 0) {
          157 
          158       AssertH ( sp < FALLBACK_QSORT_STACK_SIZE, 1004 );
          159 
          160       fpop ( lo, hi );
          161       if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
          162          fallbackSimpleSort ( fmap, eclass, lo, hi );
          163          continue;
          164       }
          165 
          166       /* Random partitioning.  Median of 3 sometimes fails to
          167          avoid bad cases.  Median of 9 seems to help but 
          168          looks rather expensive.  This too seems to work but
          169          is cheaper.  Guidance for the magic constants 
          170          7621 and 32768 is taken from Sedgewick's algorithms
          171          book, chapter 35.
          172       */
          173       r = ((r * 7621) + 1) % 32768;
          174       r3 = r % 3;
          175       if (r3 == 0) med = eclass[fmap[lo]]; else
          176       if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
          177                    med = eclass[fmap[hi]];
          178 
          179       unLo = ltLo = lo;
          180       unHi = gtHi = hi;
          181 
          182       while (1) {
          183          while (1) {
          184             if (unLo > unHi) break;
          185             n = (Int32)eclass[fmap[unLo]] - (Int32)med;
          186             if (n == 0) { 
          187                fswap(fmap[unLo], fmap[ltLo]); 
          188                ltLo++; unLo++; 
          189                continue; 
          190             };
          191             if (n > 0) break;
          192             unLo++;
          193          }
          194          while (1) {
          195             if (unLo > unHi) break;
          196             n = (Int32)eclass[fmap[unHi]] - (Int32)med;
          197             if (n == 0) { 
          198                fswap(fmap[unHi], fmap[gtHi]); 
          199                gtHi--; unHi--; 
          200                continue; 
          201             };
          202             if (n < 0) break;
          203             unHi--;
          204          }
          205          if (unLo > unHi) break;
          206          fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
          207       }
          208 
          209       AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
          210 
          211       if (gtHi < ltLo) continue;
          212 
          213       n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
          214       m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
          215 
          216       n = lo + unLo - ltLo - 1;
          217       m = hi - (gtHi - unHi) + 1;
          218 
          219       if (n - lo > hi - m) {
          220          fpush ( lo, n );
          221          fpush ( m, hi );
          222       } else {
          223          fpush ( m, hi );
          224          fpush ( lo, n );
          225       }
          226    }
          227 }
          228 
          229 #undef fmin
          230 #undef fpush
          231 #undef fpop
          232 #undef fswap
          233 #undef fvswap
          234 #undef FALLBACK_QSORT_SMALL_THRESH
          235 #undef FALLBACK_QSORT_STACK_SIZE
          236 
          237 
          238 /*---------------------------------------------*/
          239 /* Pre:
          240       nblock > 0
          241       eclass exists for [0 .. nblock-1]
          242       ((UChar*)eclass) [0 .. nblock-1] holds block
          243       ptr exists for [0 .. nblock-1]
          244 
          245    Post:
          246       ((UChar*)eclass) [0 .. nblock-1] holds block
          247       All other areas of eclass destroyed
          248       fmap [0 .. nblock-1] holds sorted order
          249       bhtab [ 0 .. 2+(nblock/32) ] destroyed
          250 */
          251 
          252 #define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
          253 #define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
          254 #define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
          255 #define      WORD_BH(zz)  bhtab[(zz) >> 5]
          256 #define UNALIGNED_BH(zz)  ((zz) & 0x01f)
          257 
          258 static
          259 void fallbackSort ( UInt32* fmap, 
          260                     UInt32* eclass, 
          261                     UInt32* bhtab,
          262                     Int32   nblock,
          263                     Int32   verb )
          264 {
          265    Int32 ftab[257];
          266    Int32 ftabCopy[256];
          267    Int32 H, i, j, k, l, r, cc, cc1;
          268    Int32 nNotDone;
          269    Int32 nBhtab;
          270    UChar* eclass8 = (UChar*)eclass;
          271 
          272    /*--
          273       Initial 1-char radix sort to generate
          274       initial fmap and initial BH bits.
          275    --*/
          276    if (verb >= 4)
          277       VPrintf0 ( "        bucket sorting ...\n" );
          278    for (i = 0; i < 257;    i++) ftab[i] = 0;
          279    for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
          280    for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
          281    for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
          282 
          283    for (i = 0; i < nblock; i++) {
          284       j = eclass8[i];
          285       k = ftab[j] - 1;
          286       ftab[j] = k;
          287       fmap[k] = i;
          288    }
          289 
          290    nBhtab = 2 + (nblock / 32);
          291    for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
          292    for (i = 0; i < 256; i++) SET_BH(ftab[i]);
          293 
          294    /*--
          295       Inductively refine the buckets.  Kind-of an
          296       "exponential radix sort" (!), inspired by the
          297       Manber-Myers suffix array construction algorithm.
          298    --*/
          299 
          300    /*-- set sentinel bits for block-end detection --*/
          301    for (i = 0; i < 32; i++) { 
          302       SET_BH(nblock + 2*i);
          303       CLEAR_BH(nblock + 2*i + 1);
          304    }
          305 
          306    /*-- the log(N) loop --*/
          307    H = 1;
          308    while (1) {
          309 
          310       if (verb >= 4) 
          311          VPrintf1 ( "        depth %6d has ", H );
          312 
          313       j = 0;
          314       for (i = 0; i < nblock; i++) {
          315          if (ISSET_BH(i)) j = i;
          316          k = fmap[i] - H; if (k < 0) k += nblock;
          317          eclass[k] = j;
          318       }
          319 
          320       nNotDone = 0;
          321       r = -1;
          322       while (1) {
          323 
          324          /*-- find the next non-singleton bucket --*/
          325          k = r + 1;
          326          while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
          327          if (ISSET_BH(k)) {
          328             while (WORD_BH(k) == 0xffffffff) k += 32;
          329             while (ISSET_BH(k)) k++;
          330          }
          331          l = k - 1;
          332          if (l >= nblock) break;
          333          while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
          334          if (!ISSET_BH(k)) {
          335             while (WORD_BH(k) == 0x00000000) k += 32;
          336             while (!ISSET_BH(k)) k++;
          337          }
          338          r = k - 1;
          339          if (r >= nblock) break;
          340 
          341          /*-- now [l, r] bracket current bucket --*/
          342          if (r > l) {
          343             nNotDone += (r - l + 1);
          344             fallbackQSort3 ( fmap, eclass, l, r );
          345 
          346             /*-- scan bucket and generate header bits-- */
          347             cc = -1;
          348             for (i = l; i <= r; i++) {
          349                cc1 = eclass[fmap[i]];
          350                if (cc != cc1) { SET_BH(i); cc = cc1; };
          351             }
          352          }
          353       }
          354 
          355       if (verb >= 4) 
          356          VPrintf1 ( "%6d unresolved strings\n", nNotDone );
          357 
          358       H *= 2;
          359       if (H > nblock || nNotDone == 0) break;
          360    }
          361 
          362    /*-- 
          363       Reconstruct the original block in
          364       eclass8 [0 .. nblock-1], since the
          365       previous phase destroyed it.
          366    --*/
          367    if (verb >= 4)
          368       VPrintf0 ( "        reconstructing block ...\n" );
          369    j = 0;
          370    for (i = 0; i < nblock; i++) {
          371       while (ftabCopy[j] == 0) j++;
          372       ftabCopy[j]--;
          373       eclass8[fmap[i]] = (UChar)j;
          374    }
          375    AssertH ( j < 256, 1005 );
          376 }
          377 
          378 #undef       SET_BH
          379 #undef     CLEAR_BH
          380 #undef     ISSET_BH
          381 #undef      WORD_BH
          382 #undef UNALIGNED_BH
          383 
          384 
          385 /*---------------------------------------------*/
          386 /*--- The main, O(N^2 log(N)) sorting       ---*/
          387 /*--- algorithm.  Faster for "normal"       ---*/
          388 /*--- non-repetitive blocks.                ---*/
          389 /*---------------------------------------------*/
          390 
          391 /*---------------------------------------------*/
          392 static
          393 __inline__
          394 Bool mainGtU ( UInt32  i1, 
          395                UInt32  i2,
          396                UChar*  block, 
          397                UInt16* quadrant,
          398                UInt32  nblock,
          399                Int32*  budget )
          400 {
          401    Int32  k;
          402    UChar  c1, c2;
          403    UInt16 s1, s2;
          404 
          405    AssertD ( i1 != i2, "mainGtU" );
          406    /* 1 */
          407    c1 = block[i1]; c2 = block[i2];
          408    if (c1 != c2) return (c1 > c2);
          409    i1++; i2++;
          410    /* 2 */
          411    c1 = block[i1]; c2 = block[i2];
          412    if (c1 != c2) return (c1 > c2);
          413    i1++; i2++;
          414    /* 3 */
          415    c1 = block[i1]; c2 = block[i2];
          416    if (c1 != c2) return (c1 > c2);
          417    i1++; i2++;
          418    /* 4 */
          419    c1 = block[i1]; c2 = block[i2];
          420    if (c1 != c2) return (c1 > c2);
          421    i1++; i2++;
          422    /* 5 */
          423    c1 = block[i1]; c2 = block[i2];
          424    if (c1 != c2) return (c1 > c2);
          425    i1++; i2++;
          426    /* 6 */
          427    c1 = block[i1]; c2 = block[i2];
          428    if (c1 != c2) return (c1 > c2);
          429    i1++; i2++;
          430    /* 7 */
          431    c1 = block[i1]; c2 = block[i2];
          432    if (c1 != c2) return (c1 > c2);
          433    i1++; i2++;
          434    /* 8 */
          435    c1 = block[i1]; c2 = block[i2];
          436    if (c1 != c2) return (c1 > c2);
          437    i1++; i2++;
          438    /* 9 */
          439    c1 = block[i1]; c2 = block[i2];
          440    if (c1 != c2) return (c1 > c2);
          441    i1++; i2++;
          442    /* 10 */
          443    c1 = block[i1]; c2 = block[i2];
          444    if (c1 != c2) return (c1 > c2);
          445    i1++; i2++;
          446    /* 11 */
          447    c1 = block[i1]; c2 = block[i2];
          448    if (c1 != c2) return (c1 > c2);
          449    i1++; i2++;
          450    /* 12 */
          451    c1 = block[i1]; c2 = block[i2];
          452    if (c1 != c2) return (c1 > c2);
          453    i1++; i2++;
          454 
          455    k = nblock + 8;
          456 
          457    do {
          458       /* 1 */
          459       c1 = block[i1]; c2 = block[i2];
          460       if (c1 != c2) return (c1 > c2);
          461       s1 = quadrant[i1]; s2 = quadrant[i2];
          462       if (s1 != s2) return (s1 > s2);
          463       i1++; i2++;
          464       /* 2 */
          465       c1 = block[i1]; c2 = block[i2];
          466       if (c1 != c2) return (c1 > c2);
          467       s1 = quadrant[i1]; s2 = quadrant[i2];
          468       if (s1 != s2) return (s1 > s2);
          469       i1++; i2++;
          470       /* 3 */
          471       c1 = block[i1]; c2 = block[i2];
          472       if (c1 != c2) return (c1 > c2);
          473       s1 = quadrant[i1]; s2 = quadrant[i2];
          474       if (s1 != s2) return (s1 > s2);
          475       i1++; i2++;
          476       /* 4 */
          477       c1 = block[i1]; c2 = block[i2];
          478       if (c1 != c2) return (c1 > c2);
          479       s1 = quadrant[i1]; s2 = quadrant[i2];
          480       if (s1 != s2) return (s1 > s2);
          481       i1++; i2++;
          482       /* 5 */
          483       c1 = block[i1]; c2 = block[i2];
          484       if (c1 != c2) return (c1 > c2);
          485       s1 = quadrant[i1]; s2 = quadrant[i2];
          486       if (s1 != s2) return (s1 > s2);
          487       i1++; i2++;
          488       /* 6 */
          489       c1 = block[i1]; c2 = block[i2];
          490       if (c1 != c2) return (c1 > c2);
          491       s1 = quadrant[i1]; s2 = quadrant[i2];
          492       if (s1 != s2) return (s1 > s2);
          493       i1++; i2++;
          494       /* 7 */
          495       c1 = block[i1]; c2 = block[i2];
          496       if (c1 != c2) return (c1 > c2);
          497       s1 = quadrant[i1]; s2 = quadrant[i2];
          498       if (s1 != s2) return (s1 > s2);
          499       i1++; i2++;
          500       /* 8 */
          501       c1 = block[i1]; c2 = block[i2];
          502       if (c1 != c2) return (c1 > c2);
          503       s1 = quadrant[i1]; s2 = quadrant[i2];
          504       if (s1 != s2) return (s1 > s2);
          505       i1++; i2++;
          506 
          507       if (i1 >= nblock) i1 -= nblock;
          508       if (i2 >= nblock) i2 -= nblock;
          509 
          510       k -= 8;
          511       (*budget)--;
          512    }
          513       while (k >= 0);
          514 
          515    return False;
          516 }
          517 
          518 
          519 /*---------------------------------------------*/
          520 /*--
          521    Knuth's increments seem to work better
          522    than Incerpi-Sedgewick here.  Possibly
          523    because the number of elems to sort is
          524    usually small, typically <= 20.
          525 --*/
          526 static
          527 Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
          528                    9841, 29524, 88573, 265720,
          529                    797161, 2391484 };
          530 
          531 static
          532 void mainSimpleSort ( UInt32* ptr,
          533                       UChar*  block,
          534                       UInt16* quadrant,
          535                       Int32   nblock,
          536                       Int32   lo, 
          537                       Int32   hi, 
          538                       Int32   d,
          539                       Int32*  budget )
          540 {
          541    Int32 i, j, h, bigN, hp;
          542    UInt32 v;
          543 
          544    bigN = hi - lo + 1;
          545    if (bigN < 2) return;
          546 
          547    hp = 0;
          548    while (incs[hp] < bigN) hp++;
          549    hp--;
          550 
          551    for (; hp >= 0; hp--) {
          552       h = incs[hp];
          553 
          554       i = lo + h;
          555       while (True) {
          556 
          557          /*-- copy 1 --*/
          558          if (i > hi) break;
          559          v = ptr[i];
          560          j = i;
          561          while ( mainGtU ( 
          562                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
          563                  ) ) {
          564             ptr[j] = ptr[j-h];
          565             j = j - h;
          566             if (j <= (lo + h - 1)) break;
          567          }
          568          ptr[j] = v;
          569          i++;
          570 
          571          /*-- copy 2 --*/
          572          if (i > hi) break;
          573          v = ptr[i];
          574          j = i;
          575          while ( mainGtU ( 
          576                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
          577                  ) ) {
          578             ptr[j] = ptr[j-h];
          579             j = j - h;
          580             if (j <= (lo + h - 1)) break;
          581          }
          582          ptr[j] = v;
          583          i++;
          584 
          585          /*-- copy 3 --*/
          586          if (i > hi) break;
          587          v = ptr[i];
          588          j = i;
          589          while ( mainGtU ( 
          590                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget 
          591                  ) ) {
          592             ptr[j] = ptr[j-h];
          593             j = j - h;
          594             if (j <= (lo + h - 1)) break;
          595          }
          596          ptr[j] = v;
          597          i++;
          598 
          599          if (*budget < 0) return;
          600       }
          601    }
          602 }
          603 
          604 
          605 /*---------------------------------------------*/
          606 /*--
          607    The following is an implementation of
          608    an elegant 3-way quicksort for strings,
          609    described in a paper "Fast Algorithms for
          610    Sorting and Searching Strings", by Robert
          611    Sedgewick and Jon L. Bentley.
          612 --*/
          613 
          614 #define mswap(zz1, zz2) \
          615    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
          616 
          617 #define mvswap(zzp1, zzp2, zzn)       \
          618 {                                     \
          619    Int32 yyp1 = (zzp1);               \
          620    Int32 yyp2 = (zzp2);               \
          621    Int32 yyn  = (zzn);                \
          622    while (yyn > 0) {                  \
          623       mswap(ptr[yyp1], ptr[yyp2]);    \
          624       yyp1++; yyp2++; yyn--;          \
          625    }                                  \
          626 }
          627 
          628 static 
          629 __inline__
          630 UChar mmed3 ( UChar a, UChar b, UChar c )
          631 {
          632    UChar t;
          633    if (a > b) { t = a; a = b; b = t; };
          634    if (b > c) { 
          635       b = c;
          636       if (a > b) b = a;
          637    }
          638    return b;
          639 }
          640 
          641 #define mmin(a,b) ((a) < (b)) ? (a) : (b)
          642 
          643 #define mpush(lz,hz,dz) { stackLo[sp] = lz; \
          644                           stackHi[sp] = hz; \
          645                           stackD [sp] = dz; \
          646                           sp++; }
          647 
          648 #define mpop(lz,hz,dz) { sp--;             \
          649                          lz = stackLo[sp]; \
          650                          hz = stackHi[sp]; \
          651                          dz = stackD [sp]; }
          652 
          653 
          654 #define mnextsize(az) (nextHi[az]-nextLo[az])
          655 
          656 #define mnextswap(az,bz)                                        \
          657    { Int32 tz;                                                  \
          658      tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
          659      tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
          660      tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
          661 
          662 
          663 #define MAIN_QSORT_SMALL_THRESH 20
          664 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
          665 #define MAIN_QSORT_STACK_SIZE 100
          666 
          667 static
          668 void mainQSort3 ( UInt32* ptr,
          669                   UChar*  block,
          670                   UInt16* quadrant,
          671                   Int32   nblock,
          672                   Int32   loSt, 
          673                   Int32   hiSt, 
          674                   Int32   dSt,
          675                   Int32*  budget )
          676 {
          677    Int32 unLo, unHi, ltLo, gtHi, n, m, med;
          678    Int32 sp, lo, hi, d;
          679 
          680    Int32 stackLo[MAIN_QSORT_STACK_SIZE];
          681    Int32 stackHi[MAIN_QSORT_STACK_SIZE];
          682    Int32 stackD [MAIN_QSORT_STACK_SIZE];
          683 
          684    Int32 nextLo[3];
          685    Int32 nextHi[3];
          686    Int32 nextD [3];
          687 
          688    sp = 0;
          689    mpush ( loSt, hiSt, dSt );
          690 
          691    while (sp > 0) {
          692 
          693       AssertH ( sp < MAIN_QSORT_STACK_SIZE, 1001 );
          694 
          695       mpop ( lo, hi, d );
          696       if (hi - lo < MAIN_QSORT_SMALL_THRESH || 
          697           d > MAIN_QSORT_DEPTH_THRESH) {
          698          mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
          699          if (*budget < 0) return;
          700          continue;
          701       }
          702 
          703       med = (Int32) 
          704             mmed3 ( block[ptr[ lo         ]+d],
          705                     block[ptr[ hi         ]+d],
          706                     block[ptr[ (lo+hi)>>1 ]+d] );
          707 
          708       unLo = ltLo = lo;
          709       unHi = gtHi = hi;
          710 
          711       while (True) {
          712          while (True) {
          713             if (unLo > unHi) break;
          714             n = ((Int32)block[ptr[unLo]+d]) - med;
          715             if (n == 0) { 
          716                mswap(ptr[unLo], ptr[ltLo]); 
          717                ltLo++; unLo++; continue; 
          718             };
          719             if (n >  0) break;
          720             unLo++;
          721          }
          722          while (True) {
          723             if (unLo > unHi) break;
          724             n = ((Int32)block[ptr[unHi]+d]) - med;
          725             if (n == 0) { 
          726                mswap(ptr[unHi], ptr[gtHi]); 
          727                gtHi--; unHi--; continue; 
          728             };
          729             if (n <  0) break;
          730             unHi--;
          731          }
          732          if (unLo > unHi) break;
          733          mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
          734       }
          735 
          736       AssertD ( unHi == unLo-1, "mainQSort3(2)" );
          737 
          738       if (gtHi < ltLo) {
          739          mpush(lo, hi, d+1 );
          740          continue;
          741       }
          742 
          743       n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
          744       m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
          745 
          746       n = lo + unLo - ltLo - 1;
          747       m = hi - (gtHi - unHi) + 1;
          748 
          749       nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
          750       nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
          751       nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
          752 
          753       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
          754       if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
          755       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
          756 
          757       AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
          758       AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
          759 
          760       mpush (nextLo[0], nextHi[0], nextD[0]);
          761       mpush (nextLo[1], nextHi[1], nextD[1]);
          762       mpush (nextLo[2], nextHi[2], nextD[2]);
          763    }
          764 }
          765 
          766 #undef mswap
          767 #undef mvswap
          768 #undef mpush
          769 #undef mpop
          770 #undef mmin
          771 #undef mnextsize
          772 #undef mnextswap
          773 #undef MAIN_QSORT_SMALL_THRESH
          774 #undef MAIN_QSORT_DEPTH_THRESH
          775 #undef MAIN_QSORT_STACK_SIZE
          776 
          777 
          778 /*---------------------------------------------*/
          779 /* Pre:
          780       nblock > N_OVERSHOOT
          781       block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
          782       ((UChar*)block32) [0 .. nblock-1] holds block
          783       ptr exists for [0 .. nblock-1]
          784 
          785    Post:
          786       ((UChar*)block32) [0 .. nblock-1] holds block
          787       All other areas of block32 destroyed
          788       ftab [0 .. 65536 ] destroyed
          789       ptr [0 .. nblock-1] holds sorted order
          790       if (*budget < 0), sorting was abandoned
          791 */
          792 
          793 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
          794 #define SETMASK (1 << 21)
          795 #define CLEARMASK (~(SETMASK))
          796 
          797 static
          798 void mainSort ( UInt32* ptr, 
          799                 UChar*  block,
          800                 UInt16* quadrant, 
          801                 UInt32* ftab,
          802                 Int32   nblock,
          803                 Int32   verb,
          804                 Int32*  budget )
          805 {
          806    Int32  i, j, k, ss, sb;
          807    Int32  runningOrder[256];
          808    Bool   bigDone[256];
          809    Int32  copyStart[256];
          810    Int32  copyEnd  [256];
          811    UChar  c1;
          812    Int32  numQSorted;
          813    UInt16 s;
          814    if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
          815 
          816    /*-- set up the 2-byte frequency table --*/
          817    for (i = 65536; i >= 0; i--) ftab[i] = 0;
          818 
          819    j = block[0] << 8;
          820    i = nblock-1;
          821    for (; i >= 3; i -= 4) {
          822       quadrant[i] = 0;
          823       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
          824       ftab[j]++;
          825       quadrant[i-1] = 0;
          826       j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
          827       ftab[j]++;
          828       quadrant[i-2] = 0;
          829       j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
          830       ftab[j]++;
          831       quadrant[i-3] = 0;
          832       j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
          833       ftab[j]++;
          834    }
          835    for (; i >= 0; i--) {
          836       quadrant[i] = 0;
          837       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
          838       ftab[j]++;
          839    }
          840 
          841    /*-- (emphasises close relationship of block & quadrant) --*/
          842    for (i = 0; i < BZ_N_OVERSHOOT; i++) {
          843       block   [nblock+i] = block[i];
          844       quadrant[nblock+i] = 0;
          845    }
          846 
          847    if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
          848 
          849    /*-- Complete the initial radix sort --*/
          850    for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
          851 
          852    s = block[0] << 8;
          853    i = nblock-1;
          854    for (; i >= 3; i -= 4) {
          855       s = (s >> 8) | (block[i] << 8);
          856       j = ftab[s] -1;
          857       ftab[s] = j;
          858       ptr[j] = i;
          859       s = (s >> 8) | (block[i-1] << 8);
          860       j = ftab[s] -1;
          861       ftab[s] = j;
          862       ptr[j] = i-1;
          863       s = (s >> 8) | (block[i-2] << 8);
          864       j = ftab[s] -1;
          865       ftab[s] = j;
          866       ptr[j] = i-2;
          867       s = (s >> 8) | (block[i-3] << 8);
          868       j = ftab[s] -1;
          869       ftab[s] = j;
          870       ptr[j] = i-3;
          871    }
          872    for (; i >= 0; i--) {
          873       s = (s >> 8) | (block[i] << 8);
          874       j = ftab[s] -1;
          875       ftab[s] = j;
          876       ptr[j] = i;
          877    }
          878 
          879    /*--
          880       Now ftab contains the first loc of every small bucket.
          881       Calculate the running order, from smallest to largest
          882       big bucket.
          883    --*/
          884    for (i = 0; i <= 255; i++) {
          885       bigDone     [i] = False;
          886       runningOrder[i] = i;
          887    }
          888 
          889    {
          890       Int32 vv;
          891       Int32 h = 1;
          892       do h = 3 * h + 1; while (h <= 256);
          893       do {
          894          h = h / 3;
          895          for (i = h; i <= 255; i++) {
          896             vv = runningOrder[i];
          897             j = i;
          898             while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
          899                runningOrder[j] = runningOrder[j-h];
          900                j = j - h;
          901                if (j <= (h - 1)) goto zero;
          902             }
          903             zero:
          904             runningOrder[j] = vv;
          905          }
          906       } while (h != 1);
          907    }
          908 
          909    /*--
          910       The main sorting loop.
          911    --*/
          912 
          913    numQSorted = 0;
          914 
          915    for (i = 0; i <= 255; i++) {
          916 
          917       /*--
          918          Process big buckets, starting with the least full.
          919          Basically this is a 3-step process in which we call
          920          mainQSort3 to sort the small buckets [ss, j], but
          921          also make a big effort to avoid the calls if we can.
          922       --*/
          923       ss = runningOrder[i];
          924 
          925       /*--
          926          Step 1:
          927          Complete the big bucket [ss] by quicksorting
          928          any unsorted small buckets [ss, j], for j != ss.  
          929          Hopefully previous pointer-scanning phases have already
          930          completed many of the small buckets [ss, j], so
          931          we don't have to sort them at all.
          932       --*/
          933       for (j = 0; j <= 255; j++) {
          934          if (j != ss) {
          935             sb = (ss << 8) + j;
          936             if ( ! (ftab[sb] & SETMASK) ) {
          937                Int32 lo = ftab[sb]   & CLEARMASK;
          938                Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
          939                if (hi > lo) {
          940                   if (verb >= 4)
          941                      VPrintf4 ( "        qsort [0x%x, 0x%x]   "
          942                                 "done %d   this %d\n",
          943                                 ss, j, numQSorted, hi - lo + 1 );
          944                   mainQSort3 ( 
          945                      ptr, block, quadrant, nblock, 
          946                      lo, hi, BZ_N_RADIX, budget 
          947                   );   
          948                   numQSorted += (hi - lo + 1);
          949                   if (*budget < 0) return;
          950                }
          951             }
          952             ftab[sb] |= SETMASK;
          953          }
          954       }
          955 
          956       AssertH ( !bigDone[ss], 1006 );
          957 
          958       /*--
          959          Step 2:
          960          Now scan this big bucket [ss] so as to synthesise the
          961          sorted order for small buckets [t, ss] for all t,
          962          including, magically, the bucket [ss,ss] too.
          963          This will avoid doing Real Work in subsequent Step 1's.
          964       --*/
          965       {
          966          for (j = 0; j <= 255; j++) {
          967             copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
          968             copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
          969          }
          970          for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
          971             k = ptr[j]-1; if (k < 0) k += nblock;
          972             c1 = block[k];
          973             if (!bigDone[c1])
          974                ptr[ copyStart[c1]++ ] = k;
          975          }
          976          for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
          977             k = ptr[j]-1; if (k < 0) k += nblock;
          978             c1 = block[k];
          979             if (!bigDone[c1]) 
          980                ptr[ copyEnd[c1]-- ] = k;
          981          }
          982       }
          983 
          984       AssertH ( (copyStart[ss]-1 == copyEnd[ss])
          985                 || 
          986                 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
          987                    Necessity for this case is demonstrated by compressing 
          988                    a sequence of approximately 48.5 million of character 
          989                    251; 1.0.0/1.0.1 will then die here. */
          990                 (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
          991                 1007 )
          992 
          993       for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
          994 
          995       /*--
          996          Step 3:
          997          The [ss] big bucket is now done.  Record this fact,
          998          and update the quadrant descriptors.  Remember to
          999          update quadrants in the overshoot area too, if
         1000          necessary.  The "if (i < 255)" test merely skips
         1001          this updating for the last bucket processed, since
         1002          updating for the last bucket is pointless.
         1003 
         1004          The quadrant array provides a way to incrementally
         1005          cache sort orderings, as they appear, so as to 
         1006          make subsequent comparisons in fullGtU() complete
         1007          faster.  For repetitive blocks this makes a big
         1008          difference (but not big enough to be able to avoid
         1009          the fallback sorting mechanism, exponential radix sort).
         1010 
         1011          The precise meaning is: at all times:
         1012 
         1013             for 0 <= i < nblock and 0 <= j <= nblock
         1014 
         1015             if block[i] != block[j], 
         1016 
         1017                then the relative values of quadrant[i] and 
         1018                     quadrant[j] are meaningless.
         1019 
         1020                else {
         1021                   if quadrant[i] < quadrant[j]
         1022                      then the string starting at i lexicographically
         1023                      precedes the string starting at j
         1024 
         1025                   else if quadrant[i] > quadrant[j]
         1026                      then the string starting at j lexicographically
         1027                      precedes the string starting at i
         1028 
         1029                   else
         1030                      the relative ordering of the strings starting
         1031                      at i and j has not yet been determined.
         1032                }
         1033       --*/
         1034       bigDone[ss] = True;
         1035 
         1036       if (i < 255) {
         1037          Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
         1038          Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
         1039          Int32 shifts   = 0;
         1040 
         1041          while ((bbSize >> shifts) > 65534) shifts++;
         1042 
         1043          for (j = bbSize-1; j >= 0; j--) {
         1044             Int32 a2update     = ptr[bbStart + j];
         1045             UInt16 qVal        = (UInt16)(j >> shifts);
         1046             quadrant[a2update] = qVal;
         1047             if (a2update < BZ_N_OVERSHOOT)
         1048                quadrant[a2update + nblock] = qVal;
         1049          }
         1050          AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
         1051       }
         1052 
         1053    }
         1054 
         1055    if (verb >= 4)
         1056       VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
         1057                  nblock, numQSorted, nblock - numQSorted );
         1058 }
         1059 
         1060 #undef BIGFREQ
         1061 #undef SETMASK
         1062 #undef CLEARMASK
         1063 
         1064 
         1065 /*---------------------------------------------*/
         1066 /* Pre:
         1067       nblock > 0
         1068       arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
         1069       ((UChar*)arr2)  [0 .. nblock-1] holds block
         1070       arr1 exists for [0 .. nblock-1]
         1071 
         1072    Post:
         1073       ((UChar*)arr2) [0 .. nblock-1] holds block
         1074       All other areas of block destroyed
         1075       ftab [ 0 .. 65536 ] destroyed
         1076       arr1 [0 .. nblock-1] holds sorted order
         1077 */
         1078 void BZ2_blockSort ( EState* s )
         1079 {
         1080    UInt32* ptr    = s->ptr; 
         1081    UChar*  block  = s->block;
         1082    UInt32* ftab   = s->ftab;
         1083    Int32   nblock = s->nblock;
         1084    Int32   verb   = s->verbosity;
         1085    Int32   wfact  = s->workFactor;
         1086    UInt16* quadrant;
         1087    Int32   budget;
         1088    Int32   budgetInit;
         1089    Int32   i;
         1090 
         1091    if (nblock < 10000) {
         1092       fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
         1093    } else {
         1094       /* Calculate the location for quadrant, remembering to get
         1095          the alignment right.  Assumes that &(block[0]) is at least
         1096          2-byte aligned -- this should be ok since block is really
         1097          the first section of arr2.
         1098       */
         1099       i = nblock+BZ_N_OVERSHOOT;
         1100       if (i & 1) i++;
         1101       quadrant = (UInt16*)(&(block[i]));
         1102 
         1103       /* (wfact-1) / 3 puts the default-factor-30
         1104          transition point at very roughly the same place as 
         1105          with v0.1 and v0.9.0.  
         1106          Not that it particularly matters any more, since the
         1107          resulting compressed stream is now the same regardless
         1108          of whether or not we use the main sort or fallback sort.
         1109       */
         1110       if (wfact < 1  ) wfact = 1;
         1111       if (wfact > 100) wfact = 100;
         1112       budgetInit = nblock * ((wfact-1) / 3);
         1113       budget = budgetInit;
         1114 
         1115       mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
         1116       if (verb >= 3) 
         1117          VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
         1118                     budgetInit - budget,
         1119                     nblock, 
         1120                     (float)(budgetInit - budget) /
         1121                     (float)(nblock==0 ? 1 : nblock) ); 
         1122       if (budget < 0) {
         1123          if (verb >= 2) 
         1124             VPrintf0 ( "    too repetitive; using fallback"
         1125                        " sorting algorithm\n" );
         1126          fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
         1127       }
         1128    }
         1129 
         1130    s->origPtr = -1;
         1131    for (i = 0; i < s->nblock; i++)
         1132       if (ptr[i] == 0)
         1133          { s->origPtr = i; break; };
         1134 
         1135    AssertH( s->origPtr != -1, 1003 );
         1136 }
         1137 
         1138 
         1139 /*-------------------------------------------------------------*/
         1140 /*--- end                                       blocksort.c ---*/
         1141 /*-------------------------------------------------------------*/