tdiffreg.c - plan9port - [fork] Plan 9 from user space
 (HTM) git clone git://src.adamsgaard.dk/plan9port
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       tdiffreg.c (8861B)
       ---
            1 #include <u.h>
            2 #include <libc.h>
            3 #include <bio.h>
            4 #include "diff.h"
            5 
            6 /*        diff - differential file comparison
            7 *
            8 *        Uses an algorithm due to Harold Stone, which finds
            9 *        a pair of longest identical subsequences in the two
           10 *        files.
           11 *
           12 *        The major goal is to generate the match vector J.
           13 *        J[i] is the index of the line in file1 corresponding
           14 *        to line i file0. J[i] = 0 if there is no
           15 *        such line in file1.
           16 *
           17 *        Lines are hashed so as to work in core. All potential
           18 *        matches are located by sorting the lines of each file
           19 *        on the hash (called value). In particular, this
           20 *        collects the equivalence classes in file1 together.
           21 *        Subroutine equiv replaces the value of each line in
           22 *        file0 by the index of the first element of its
           23 *        matching equivalence in (the reordered) file1.
           24 *        To save space equiv squeezes file1 into a single
           25 *        array member in which the equivalence classes
           26 *        are simply concatenated, except that their first
           27 *        members are flagged by changing sign.
           28 *
           29 *        Next the indices that point into member are unsorted into
           30 *        array class according to the original order of file0.
           31 *
           32 *        The cleverness lies in routine stone. This marches
           33 *        through the lines of file0, developing a vector klist
           34 *        of "k-candidates". At step i a k-candidate is a matched
           35 *        pair of lines x,y (x in file0 y in file1) such that
           36 *        there is a common subsequence of lenght k
           37 *        between the first i lines of file0 and the first y
           38 *        lines of file1, but there is no such subsequence for
           39 *        any smaller y. x is the earliest possible mate to y
           40 *        that occurs in such a subsequence.
           41 *
           42 *        Whenever any of the members of the equivalence class of
           43 *        lines in file1 matable to a line in file0 has serial number
           44 *        less than the y of some k-candidate, that k-candidate
           45 *        with the smallest such y is replaced. The new
           46 *        k-candidate is chained (via pred) to the current
           47 *        k-1 candidate so that the actual subsequence can
           48 *        be recovered. When a member has serial number greater
           49 *        that the y of all k-candidates, the klist is extended.
           50 *        At the end, the longest subsequence is pulled out
           51 *        and placed in the array J by unravel.
           52 *
           53 *        With J in hand, the matches there recorded are
           54 *        check'ed against reality to assure that no spurious
           55 *        matches have crept in due to hashing. If they have,
           56 *        they are broken, and "jackpot " is recorded--a harmless
           57 *        matter except that a true match for a spuriously
           58 *        mated line may now be unnecessarily reported as a change.
           59 *
           60 *        Much of the complexity of the program comes simply
           61 *        from trying to minimize core utilization and
           62 *        maximize the range of doable problems by dynamically
           63 *        allocating what is needed and reusing what is not.
           64 *        The core requirements for problems larger than somewhat
           65 *        are (in words) 2*length(file0) + length(file1) +
           66 *        3*(number of k-candidates installed),  typically about
           67 *        6n words for files of length n.
           68 */
           69 
           70 #define class diffclass
           71 
           72 /* TIDY THIS UP */
           73 struct cand {
           74         int x;
           75         int y;
           76         int pred;
           77 } cand;
           78 struct line {
           79         int serial;
           80         int value;
           81 } *file[2], line;
           82 int len[2];
           83 int binary;
           84 struct line *sfile[2];        /*shortened by pruning common prefix and suffix*/
           85 int slen[2];
           86 int pref, suff;        /*length of prefix and suffix*/
           87 int *class;        /*will be overlaid on file[0]*/
           88 int *member;        /*will be overlaid on file[1]*/
           89 int *klist;                /*will be overlaid on file[0] after class*/
           90 struct cand *clist;        /* merely a free storage pot for candidates */
           91 int clen;
           92 int *J;                /*will be overlaid on class*/
           93 long *ixold;        /*will be overlaid on klist*/
           94 long *ixnew;        /*will be overlaid on file[1]*/
           95 /* END OF SOME TIDYING */
           96 
           97 static void
           98 sort(struct line *a, int n)        /*shellsort CACM #201*/
           99 {
          100         int m;
          101         struct line *ai, *aim, *j, *k;
          102         struct line w;
          103         int i;
          104 
          105         m = 0;
          106         for (i = 1; i <= n; i *= 2)
          107                 m = 2*i - 1;
          108         for (m /= 2; m != 0; m /= 2) {
          109                 k = a+(n-m);
          110                 for (j = a+1; j <= k; j++) {
          111                         ai = j;
          112                         aim = ai+m;
          113                         do {
          114                                 if (aim->value > ai->value ||
          115                                    aim->value == ai->value &&
          116                                    aim->serial > ai->serial)
          117                                         break;
          118                                 w = *ai;
          119                                 *ai = *aim;
          120                                 *aim = w;
          121 
          122                                 aim = ai;
          123                                 ai -= m;
          124                         } while (ai > a && aim >= ai);
          125                 }
          126         }
          127 }
          128 
          129 static void
          130 unsort(struct line *f, int l, int *b)
          131 {
          132         int *a;
          133         int i;
          134 
          135         a = MALLOC(int, (l+1));
          136         for(i=1;i<=l;i++)
          137                 a[f[i].serial] = f[i].value;
          138         for(i=1;i<=l;i++)
          139                 b[i] = a[i];
          140         FREE(a);
          141 }
          142 
          143 static void
          144 prune(void)
          145 {
          146         int i,j;
          147 
          148         for(pref=0;pref<len[0]&&pref<len[1]&&
          149                 file[0][pref+1].value==file[1][pref+1].value;
          150                 pref++ ) ;
          151         for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&&
          152                 file[0][len[0]-suff].value==file[1][len[1]-suff].value;
          153                 suff++) ;
          154         for(j=0;j<2;j++) {
          155                 sfile[j] = file[j]+pref;
          156                 slen[j] = len[j]-pref-suff;
          157                 for(i=0;i<=slen[j];i++)
          158                         sfile[j][i].serial = i;
          159         }
          160 }
          161 
          162 static void
          163 equiv(struct line *a, int n, struct line *b, int m, int *c)
          164 {
          165         int i, j;
          166 
          167         i = j = 1;
          168         while(i<=n && j<=m) {
          169                 if(a[i].value < b[j].value)
          170                         a[i++].value = 0;
          171                 else if(a[i].value == b[j].value)
          172                         a[i++].value = j;
          173                 else
          174                         j++;
          175         }
          176         while(i <= n)
          177                 a[i++].value = 0;
          178         b[m+1].value = 0;
          179         j = 0;
          180         while(++j <= m) {
          181                 c[j] = -b[j].serial;
          182                 while(b[j+1].value == b[j].value) {
          183                         j++;
          184                         c[j] = b[j].serial;
          185                 }
          186         }
          187         c[j] = -1;
          188 }
          189 
          190 static int
          191 newcand(int x, int  y, int pred)
          192 {
          193         struct cand *q;
          194 
          195         clist = REALLOC(clist, struct cand, (clen+1));
          196         q = clist + clen;
          197         q->x = x;
          198         q->y = y;
          199         q->pred = pred;
          200         return clen++;
          201 }
          202 
          203 static int
          204 search(int *c, int k, int y)
          205 {
          206         int i, j, l;
          207         int t;
          208 
          209         if(clist[c[k]].y < y)        /*quick look for typical case*/
          210                 return k+1;
          211         i = 0;
          212         j = k+1;
          213         while((l=(i+j)/2) > i) {
          214                 t = clist[c[l]].y;
          215                 if(t > y)
          216                         j = l;
          217                 else if(t < y)
          218                         i = l;
          219                 else
          220                         return l;
          221         }
          222         return l+1;
          223 }
          224 
          225 static int
          226 stone(int *a, int n, int *b, int *c)
          227 {
          228         int i, k,y;
          229         int j, l;
          230         int oldc, tc;
          231         int oldl;
          232 
          233         k = 0;
          234         c[0] = newcand(0,0,0);
          235         for(i=1; i<=n; i++) {
          236                 j = a[i];
          237                 if(j==0)
          238                         continue;
          239                 y = -b[j];
          240                 oldl = 0;
          241                 oldc = c[0];
          242                 do {
          243                         if(y <= clist[oldc].y)
          244                                 continue;
          245                         l = search(c, k, y);
          246                         if(l!=oldl+1)
          247                                 oldc = c[l-1];
          248                         if(l<=k) {
          249                                 if(clist[c[l]].y <= y)
          250                                         continue;
          251                                 tc = c[l];
          252                                 c[l] = newcand(i,y,oldc);
          253                                 oldc = tc;
          254                                 oldl = l;
          255                         } else {
          256                                 c[l] = newcand(i,y,oldc);
          257                                 k++;
          258                                 break;
          259                         }
          260                 } while((y=b[++j]) > 0);
          261         }
          262         return k;
          263 }
          264 
          265 static void
          266 unravel(int p)
          267 {
          268         int i;
          269         struct cand *q;
          270 
          271         for(i=0; i<=len[0]; i++) {
          272                 if (i <= pref)
          273                         J[i] = i;
          274                 else if (i > len[0]-suff)
          275                         J[i] = i+len[1]-len[0];
          276                 else
          277                         J[i] = 0;
          278         }
          279         for(q=clist+p;q->y!=0;q=clist+q->pred)
          280                 J[q->x+pref] = q->y+pref;
          281 }
          282 
          283 static void
          284 output(void)
          285 {
          286         int m, i0, i1, j0, j1;
          287 
          288         m = len[0];
          289         J[0] = 0;
          290         J[m+1] = len[1]+1;
          291         if (mode != 'e') {
          292                 for (i0 = 1; i0 <= m; i0 = i1+1) {
          293                         while (i0 <= m && J[i0] == J[i0-1]+1)
          294                                 i0++;
          295                         j0 = J[i0-1]+1;
          296                         i1 = i0-1;
          297                         while (i1 < m && J[i1+1] == 0)
          298                                 i1++;
          299                         j1 = J[i1+1]-1;
          300                         J[i1] = j1;
          301                         change(i0, i1, j0, j1);
          302                 }
          303         }
          304         else {
          305                 for (i0 = m; i0 >= 1; i0 = i1-1) {
          306                         while (i0 >= 1 && J[i0] == J[i0+1]-1 && J[i0])
          307                                 i0--;
          308                         j0 = J[i0+1]-1;
          309                         i1 = i0+1;
          310                         while (i1 > 1 && J[i1-1] == 0)
          311                                 i1--;
          312                         j1 = J[i1-1]+1;
          313                         J[i1] = j1;
          314                         change(i1 , i0, j1, j0);
          315                 }
          316         }
          317         if (m == 0)
          318                 change(1, 0, 1, len[1]);
          319         flushchanges();
          320 }
          321 
          322 #define BUF 4096
          323 static int
          324 cmp(Biobuf* b1, Biobuf* b2)
          325 {
          326         int n;
          327         uchar buf1[BUF], buf2[BUF];
          328         int f1, f2;
          329         vlong nc = 1;
          330         uchar *b1s, *b1e, *b2s, *b2e;
          331 
          332         f1 = Bfildes(b1);
          333         f2 = Bfildes(b2);
          334         seek(f1, 0, 0);
          335         seek(f2, 0, 0);
          336         b1s = b1e = buf1;
          337         b2s = b2e = buf2;
          338         for(;;){
          339                 if(b1s >= b1e){
          340                         if(b1s >= &buf1[BUF])
          341                                 b1s = buf1;
          342                         n = read(f1, b1s,  &buf1[BUF] - b1s);
          343                         b1e = b1s + n;
          344                 }
          345                 if(b2s >= b2e){
          346                         if(b2s >= &buf2[BUF])
          347                                 b2s = buf2;
          348                         n = read(f2, b2s,  &buf2[BUF] - b2s);
          349                         b2e = b2s + n;
          350                 }
          351                 n = b2e - b2s;
          352                 if(n > b1e - b1s)
          353                         n = b1e - b1s;
          354                 if(n <= 0)
          355                         break;
          356                 if(memcmp((void *)b1s, (void *)b2s, n) != 0){
          357                         return 1;
          358                 }
          359                 nc += n;
          360                 b1s += n;
          361                 b2s += n;
          362         }
          363         if(b1e - b1s == b2e - b2s)
          364                 return 0;
          365         return 1;
          366 }
          367 
          368 void
          369 diffreg(char *f, char *t)
          370 {
          371         Biobuf *b0, *b1;
          372         int k;
          373 
          374         binary = 0;
          375         b0 = prepare(0, f);
          376         if (!b0)
          377                 return;
          378         b1 = prepare(1, t);
          379         if (!b1) {
          380                 FREE(file[0]);
          381                 Bterm(b0);
          382                 return;
          383         }
          384         if (binary){
          385                 /* could use b0 and b1 but this is simpler. */
          386                 if (cmp(b0, b1))
          387                         print("binary files %s %s differ\n", f, t);
          388                 Bterm(b0);
          389                 Bterm(b1);
          390                 return;
          391         }
          392         clen = 0;
          393         prune();
          394         sort(sfile[0], slen[0]);
          395         sort(sfile[1], slen[1]);
          396 
          397         member = (int *)file[1];
          398         equiv(sfile[0], slen[0], sfile[1], slen[1], member);
          399         member = REALLOC(member, int, slen[1]+2);
          400 
          401         class = (int *)file[0];
          402         unsort(sfile[0], slen[0], class);
          403         class = REALLOC(class, int, slen[0]+2);
          404 
          405         klist = MALLOC(int, slen[0]+2);
          406         clist = MALLOC(struct cand, 1);
          407         k = stone(class, slen[0], member, klist);
          408         FREE(member);
          409         FREE(class);
          410 
          411         J = MALLOC(int, len[0]+2);
          412         unravel(klist[k]);
          413         FREE(clist);
          414         FREE(klist);
          415 
          416         ixold = MALLOC(long, len[0]+2);
          417         ixnew = MALLOC(long, len[1]+2);
          418         Bseek(b0, 0, 0); Bseek(b1, 0, 0);
          419         check(b0, b1);
          420         output();
          421         FREE(J); FREE(ixold); FREE(ixnew);
          422         Bterm(b0); Bterm(b1);                        /* ++++ */
          423 }