tn8.c - plan9port - [fork] Plan 9 from user space
 (HTM) git clone git://src.adamsgaard.dk/plan9port
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       tn8.c (10312B)
       ---
            1 #include <u.h>
            2 #include "tdef.h"
            3 #include "fns.h"
            4 #include "ext.h"
            5 
            6 #define        HY_BIT        0200        /* stuff in here only works for 7-bit ascii */
            7                         /* this value is used (as a literal) in suftab.c */
            8                         /* to encode possible hyphenation points in suffixes. */
            9                         /* it could be changed, by widening the tables */
           10                         /* to be shorts instead of chars. */
           11 
           12 /*
           13  * troff8.c
           14  *
           15  * hyphenation
           16  */
           17 
           18 int        hexsize = 0;                /* hyphenation exception list size */
           19 char        *hbufp = NULL;                /* base of list */
           20 char        *nexth = NULL;                /* first free slot in list */
           21 Tchar        *hyend;
           22 
           23 #define THRESH 160                 /* digram goodness threshold */
           24 int        thresh = THRESH;
           25 
           26 int        texhyphen(void);
           27 static        int        alpha(Tchar);
           28 
           29 void hyphen(Tchar *wp)
           30 {
           31         int j;
           32         Tchar *i;
           33 
           34         i = wp;
           35         while (punct((*i++)))
           36                 ;
           37         if (!alpha(*--i))
           38                 return;
           39         wdstart = i++;
           40         while (alpha(*i++))
           41                 ;
           42         hyend = wdend = --i - 1;
           43         while (punct((*i++)))
           44                 ;
           45         if (*--i)
           46                 return;
           47         if (wdend - wdstart < 4)        /* 4 chars is too short to hyphenate */
           48                 return;
           49         hyp = hyptr;
           50         *hyp = 0;
           51         hyoff = 2;
           52 
           53         /* for now, try exceptions first, then tex (if hyphalg is non-zero),
           54            then suffix and digram if tex didn't hyphenate it at all.
           55         */
           56 
           57         if (!exword() && !texhyphen() && !suffix())
           58                 digram();
           59 
           60         /* this appears to sort hyphenation points into increasing order */
           61         *hyp++ = 0;
           62         if (*hyptr)
           63                 for (j = 1; j; ) {
           64                         j = 0;
           65                         for (hyp = hyptr + 1; *hyp != 0; hyp++) {
           66                                 if (*(hyp - 1) > *hyp) {
           67                                         j++;
           68                                         i = *hyp;
           69                                         *hyp = *(hyp - 1);
           70                                         *(hyp - 1) = i;
           71                                 }
           72                         }
           73                 }
           74 }
           75 
           76 static int alpha(Tchar i)        /* non-zero if really alphabetic */
           77 {
           78         if (ismot(i))
           79                 return 0;
           80         else if (cbits(i) >= ALPHABET)        /* this isn't very elegant, but there's */
           81                 return 0;                /* no good way to make sure i is in range for */
           82         else                                /* the call of isalpha */
           83                 return isalpha(cbits(i));
           84 }
           85 
           86 int
           87 punct(Tchar i)
           88 {
           89         if (!i || alpha(i))
           90                 return(0);
           91         else
           92                 return(1);
           93 }
           94 
           95 
           96 void caseha(void)        /* set hyphenation algorithm */
           97 {
           98         hyphalg = HYPHALG;
           99         if (skip())
          100                 return;
          101         noscale++;
          102         hyphalg = atoi0();
          103         noscale = 0;
          104 }
          105 
          106 
          107 void caseht(void)        /* set hyphenation threshold;  not in manual! */
          108 {
          109         thresh = THRESH;
          110         if (skip())
          111                 return;
          112         noscale++;
          113         thresh = atoi0();
          114         noscale = 0;
          115 }
          116 
          117 
          118 char *growh(char *where)
          119 {
          120         char *new;
          121 
          122         hexsize += NHEX;
          123         if ((new = grow(hbufp, hexsize, sizeof(char))) == NULL)
          124                 return NULL;
          125         if (new == hbufp) {
          126                 return where;
          127         } else {
          128                 int diff;
          129                 diff = where - hbufp;
          130                 hbufp = new;
          131                 return new + diff;
          132         }
          133 }
          134 
          135 
          136 void casehw(void)
          137 {
          138         int i, k;
          139         char *j;
          140         Tchar t;
          141 
          142         if (nexth == NULL) {
          143                 if ((nexth = hbufp = grow(hbufp, NHEX, sizeof(char))) == NULL) {
          144                         ERROR "No space for exception word list." WARN;
          145                         return;
          146                 }
          147                 hexsize = NHEX;
          148         }
          149         k = 0;
          150         while (!skip()) {
          151                 if ((j = nexth) >= hbufp + hexsize - 2)
          152                         if ((j = nexth = growh(j)) == NULL)
          153                                 goto full;
          154                 for (;;) {
          155                         if (ismot(t = getch()))
          156                                 continue;
          157                         i = cbits(t);
          158                         if (i == ' ' || i == '\n') {
          159                                 *j++ = 0;
          160                                 nexth = j;
          161                                 *j = 0;
          162                                 if (i == ' ')
          163                                         break;
          164                                 else
          165                                         return;
          166                         }
          167                         if (i == '-') {
          168                                 k = HY_BIT;
          169                                 continue;
          170                         }
          171                         *j++ = maplow(i) | k;
          172                         k = 0;
          173                         if (j >= hbufp + hexsize - 2)
          174                                 if ((j = growh(j)) == NULL)
          175                                         goto full;
          176                 }
          177         }
          178         return;
          179 full:
          180         ERROR "Cannot grow exception word list." WARN;
          181         *nexth = 0;
          182 }
          183 
          184 
          185 int exword(void)
          186 {
          187         Tchar *w;
          188         char *e, *save;
          189 
          190         e = hbufp;
          191         while (1) {
          192                 save = e;
          193                 if (e == NULL || *e == 0)
          194                         return(0);
          195                 w = wdstart;
          196                 while (*e && w <= hyend && (*e & 0177) == maplow(cbits(*w))) {
          197                         e++;
          198                         w++;
          199                 }
          200                 if (!*e) {
          201                         if (w-1 == hyend || (w == wdend && maplow(cbits(*w)) == 's')) {
          202                                 w = wdstart;
          203                                 for (e = save; *e; e++) {
          204                                         if (*e & HY_BIT)
          205                                                 *hyp++ = w;
          206                                         if (hyp > hyptr + NHYP - 1)
          207                                                 hyp = hyptr + NHYP - 1;
          208                                         w++;
          209                                 }
          210                                 return(1);
          211                         } else {
          212                                 e++;
          213                                 continue;
          214                         }
          215                 } else
          216                         while (*e++)
          217                                 ;
          218         }
          219 }
          220 
/*
 * suffix: strip known suffixes from the end of the word, recording
 * the hyphenation points encoded in the suffix table.
 *
 * suftab is indexed by the (lower-cased) letter at hyend.  Each table
 * is a run of entries; the low four bits of an entry's first byte give
 * the entry's length (0 ends the table) and the remaining bytes are
 * compared, right to left, with the letters before hyend.  HY_BIT on
 * an entry byte marks a hyphenation point; the first byte also carries
 * the flag bits 0100 and 040 tested below.
 *
 * Returns 1 if the shortened word was found by exword(), and 0 when no
 * suffix matches, when a marked point has no vowel before it, or when
 * the matched entry has the 040 flag.  hyend is moved back past each
 * stripped suffix.
 */
int
suffix(void)
{
	Tchar *w;
	char *s, *s0;
	Tchar i;
	extern char *suftab[];

again:
	i = cbits(*hyend);
	if (!alpha(i))
		return(0);
	/* fold an upper-case final letter (assumes ASCII ordering) */
	if (i < 'a')
		i -= 'A' - 'a';
	if ((s0 = suftab[i-'a']) == 0)
		return(0);
	/* find the first entry whose letters all match the word's tail */
	for (;;) {
		if ((i = *s0 & 017) == 0)
			return(0);
		s = s0 + i - 1;
		w = hyend - 1;
		while (s > s0 && w >= wdstart && (*s & 0177) == maplow(cbits(*w))) {
			s--;
			w--;
		}
		if (s == s0)
			break;
		s0 += i;
	}
	/* walk back over the matched entry, picking up marked positions */
	s = s0 + i - 1;
	w = hyend;
	/* HY_BIT on the first byte marks the position at hyend itself */
	if (*s0 & HY_BIT)
		goto mark;
	while (s > s0) {
		w--;
		if (*s-- & HY_BIT) {
mark:
			hyend = w - 1;
			/* with 0100 set the word is shortened but no point is recorded */
			if (*s0 & 0100)	/* 0100 used in suftab to encode something too */
				continue;
			if (!chkvow(w))
				return(0);
			*hyp++ = w;
		}
	}
	if (*s0 & 040)
		return(0);
	/* the remaining stem may be an exception word; if not, strip again */
	if (exword())
		return(1);
	goto again;
}
          272 
          273 int
          274 maplow(int i)
          275 {
          276         if (isupper(i))
          277                 i = tolower(i);
          278         return(i);
          279 }
          280 
          281 int
          282 vowel(int i)
          283 {
          284         switch (i) {
          285         case 'a': case 'A':
          286         case 'e': case 'E':
          287         case 'i': case 'I':
          288         case 'o': case 'O':
          289         case 'u': case 'U':
          290         case 'y': case 'Y':
          291                 return(1);
          292         default:
          293                 return(0);
          294         }
          295 }
          296 
          297 
          298 Tchar *chkvow(Tchar *w)
          299 {
          300         while (--w >= wdstart)
          301                 if (vowel(cbits(*w)))
          302                         return(w);
          303         return(0);
          304 }
          305 
          306 
          307 void digram(void)
          308 {
          309         Tchar *w;
          310         int val;
          311         Tchar *nhyend, *maxw;
          312         int maxval;
          313         extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13];
          314         maxw = 0;
          315 again:
          316         if (!(w = chkvow(hyend + 1)))
          317                 return;
          318         hyend = w;
          319         if (!(w = chkvow(hyend)))
          320                 return;
          321         nhyend = w;
          322         maxval = 0;
          323         w--;
          324         while (++w < hyend && w < wdend - 1) {
          325                 val = 1;
          326                 if (w == wdstart)
          327                         val *= dilook('a', cbits(*w), bxh);
          328                 else if (w == wdstart + 1)
          329                         val *= dilook(cbits(*(w-1)), cbits(*w), bxxh);
          330                 else
          331                         val *= dilook(cbits(*(w-1)), cbits(*w), xxh);
          332                 val *= dilook(cbits(*w), cbits(*(w+1)), xhx);
          333                 val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx);
          334                 if (val > maxval) {
          335                         maxval = val;
          336                         maxw = w + 1;
          337                 }
          338         }
          339         hyend = nhyend;
          340         if (maxval > thresh)
          341                 *hyp++ = maxw;
          342         goto again;
          343 }
          344 
          345 int
          346 dilook(int a, int b, char t[26][13])
          347 {
          348         int i, j;
          349 
          350         i = t[maplow(a)-'a'][(j = maplow(b)-'a')/2];
          351         if (!(j & 01))
          352                 i >>= 4;
          353         return(i & 017);
          354 }
          355 
          356 
          357 /* here beginneth the tex hyphenation code, as interpreted freely */
          358 /* the main difference is that there is no attempt to squeeze space */
/* as tightly as tex does. */
          360 
static int	texit(Tchar *, Tchar *);
static int	readpats(void);
static void	install(char *);
static void	fixup(void);
static int	trieindex(int, int);

/*
 * Pattern storage.  install() appends one record per pattern to pats:
 * a count byte, the pattern letters, a NUL, the digit string, a NUL.
 * The count byte is stored as ' ' at first and replaced by the record
 * length in fixup().  nextpat is the first unused byte of pats.
 * trie[] is indexed by trieindex() of a record's first two characters
 * and points at the first record filed under that index (0 if none).
 */
static char	pats[50000];	/* size ought to be computed dynamically */
static char	*nextpat = pats;
static char	*trie[27*27];	/* english-specific sizes */
          370 
          371 int texhyphen(void)
          372 {
          373         static int loaded = 0;                /* -1: couldn't find tex file */
          374 
          375         if (hyphalg == 0 || loaded == -1)        /* non-zero => tex for now */
          376                 return 0;
          377         if (loaded == 0) {
          378                 if (readpats())
          379                         loaded = 1;
          380                 else
          381                         loaded = -1;
          382         }
          383         return texit(wdstart, wdend);
          384 }
          385 
          386 static int texit(Tchar *start, Tchar *end)        /* hyphenate as in tex, return # found */
          387 {
          388         int nw, i, k, equal, cnt[500];
          389         char w[500+1], *np, *pp, *wp, *xpp, *xwp;
          390 
          391         w[0] = '.';
          392         for (nw = 1; start <= end && nw < 500-1; nw++, start++)
          393                 w[nw] = maplow(tolower(cbits(*start)));
          394         start -= (nw - 1);
          395         w[nw++] = '.';
          396         w[nw] = 0;
          397 /*
          398  * printf("try %s\n", w);
          399 */
          400         for (i = 0; i <= nw; i++)
          401                 cnt[i] = '0';
          402 
          403         for (wp = w; wp+1 < w+nw; wp++) {
          404                 for (pp = trie[trieindex(*wp, *(wp+1))]; pp < nextpat; ) {
          405                         if (pp == 0                /* no trie entry */
          406                          || *pp != *wp                /* no match on 1st letter */
          407                          || *(pp+1) != *(wp+1))        /* no match on 2nd letter */
          408                                 break;                /*   so move to next letter of word */
          409                         equal = 1;
          410                         for (xpp = pp+2, xwp = wp+2; *xpp; )
          411                                 if (*xpp++ != *xwp++) {
          412                                         equal = 0;
          413                                         break;
          414                                 }
          415                         if (equal) {
          416                                 np = xpp+1;        /* numpat */
          417                                 for (k = wp-w; *np; k++, np++)
          418                                         if (*np > cnt[k])
          419                                                 cnt[k] = *np;
          420 /*
          421  * printf("match: %s  %s\n", pp, xpp+1);
          422 */
          423                         }
          424                         pp += *(pp-1);        /* skip over pattern and numbers to next */
          425                 }
          426         }
          427 /*
          428  * for (i = 0; i < nw; i++) printf("%c", w[i]);
          429  * printf("  ");
          430  * for (i = 0; i <= nw; i++) printf("%c", cnt[i]);
          431  * printf("\n");
          432 */
          433 /*
          434  *         for (i = 1; i < nw - 1; i++) {
          435  *                 if (i > 2 && i < nw - 3 && cnt[i] % 2)
          436  *                         printf("-");
          437  *                 if (cbits(start[i-1]) != '.')
          438  *                         printf("%c", cbits(start[i-1]));
          439  *         }
          440  *         printf("\n");
          441 */
          442         for (i = 1; i < nw -1; i++)
          443                 if (i > 2 && i < nw - 3 && cnt[i] % 2)
          444                         *hyp++ = start + i - 1;
          445         return hyp - hyptr;        /* non-zero if a hyphen was found */
          446 }
          447 
          448 /*
          449         This code assumes that hyphen.tex looks like
          450                 % some comments
          451                 \patterns{ % more comments
          452                 pat5ter4ns, 1 per line, SORTED, nothing else
          453                 }
          454                 more goo
          455                 \hyphenation{ % more comments
          456                 ex-cep-tions, one per line; i ignore this part for now
          457                 }
          458 
          459         this code is NOT robust against variations.  unfortunately,
          460         it looks like every local language version of this file has
          461         a different format.  i have also made no provision for weird
          462         characters.  sigh.
          463 */
          464 
          465 static int readpats(void)
          466 {
          467         FILE *fp;
          468         char buf[200], buf1[200];
          469 
          470         if ((fp = fopen(unsharp(TEXHYPHENS), "r")) == NULL
          471          && (fp = fopen(unsharp(DWBalthyphens), "r")) == NULL) {
          472                 ERROR "warning: can't find hyphen.tex" WARN;
          473                 return 0;
          474         }
          475 
          476         while (fgets(buf, sizeof buf, fp) != NULL) {
          477                 sscanf(buf, "%s", buf1);
          478                 if (strcmp(buf1, "\\patterns{") == 0)
          479                         break;
          480         }
          481         while (fgets(buf, sizeof buf, fp) != NULL) {
          482                 if (buf[0] == '}')
          483                         break;
          484                 install(buf);
          485         }
          486         fclose(fp);
          487         fixup();
          488         return 1;
          489 }
          490 
          491 static void install(char *s)        /* map ab4c5de to: 12 abcde \0 00405 \0 */
          492 {
          493         int npat, lastpat;
          494         char num[500], *onextpat = nextpat;
          495 
          496         num[0] = '0';
          497         *nextpat++ = ' ';        /* fill in with count later */
          498         for (npat = lastpat = 0; *s != '\n' && *s != '\0'; s++) {
          499                 if (isdigit((uchar)*s)) {
          500                         num[npat] = *s;
          501                         lastpat = npat;
          502                 } else {
          503                         *nextpat++ = *s;
          504                         npat++;
          505                         num[npat] = '0';
          506                 }
          507         }
          508         *nextpat++ = 0;
          509         if (nextpat > pats + sizeof(pats)-20) {
          510                 ERROR "tex hyphenation table overflow, tail end ignored" WARN;
          511                 nextpat = onextpat;
          512         }
          513         num[lastpat+1] = 0;
          514         strcat(nextpat, num);
          515         nextpat += strlen(nextpat) + 1;
          516 }
          517 
          518 static void fixup(void)        /* build indexes of where . a b c ... start */
          519 {
          520         char *p, *lastc;
          521         int n;
          522 
          523         for (lastc = pats, p = pats+1; p < nextpat; p++)
          524                 if (*p == ' ') {
          525                         *lastc = p - lastc;
          526                         lastc = p;
          527                 }
          528         *lastc = p - lastc;
          529         for (p = pats+1; p < nextpat; ) {
          530                 n = trieindex(p[0], p[1]);
          531                 if (trie[n] == 0)
          532                         trie[n] = p;
          533                 p += p[-1];
          534         }
          535         /* printf("pats = %d\n", nextpat - pats); */
          536 }
          537 
          538 static int trieindex(int d1, int d2)
          539 {
          540         int z;
          541 
          542         z = 27 * (d1 == '.' ? 0 : d1 - 'a' + 1) + (d2 == '.' ? 0 : d2 - 'a' + 1);
          543         assert(z >= 0 && z < 27*27);
          544         return z;
          545 }