tregex.c - neatvi - [fork] simple vi-type editor with UTF-8 support
 (HTM) git clone git://src.adamsgaard.dk/neatvi
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
       ---
       tregex.c (13827B)
       ---
            1 #include <ctype.h>
            2 #include <stdio.h>
            3 #include <stdlib.h>
            4 #include <string.h>
            5 #include "regex.h"
            6 
            7 #define NGRPS                64        /* maximum number of groups */
            8 #define NREPS                128        /* maximum repetitions */
            9 #define NDEPT                256        /* re_rec() recursion depth limit */
           10 
           11 #define MAX(a, b)        ((a) < (b) ? (b) : (a))
           12 #define LEN(a)                (sizeof(a) / sizeof((a)[0]))
           13 
           14 /* regular expressions atoms */
           15 #define RA_CHR                '\0'        /* character literal */
           16 #define RA_BEG                '^'        /* string start */
           17 #define RA_END                '$'        /* string end */
           18 #define RA_ANY                '.'        /* any character */
           19 #define RA_BRK                '['        /* bracket expression */
           20 #define RA_WBEG                '<'        /* word start */
           21 #define RA_WEND                '>'        /* word end */
           22 
           23 /* regular expression node types */
           24 #define RN_ATOM                '\0'        /* regular expression */
           25 #define RN_CAT                'c'        /* concatenation */
           26 #define RN_ALT                '|'        /* alternate expressions */
           27 #define RN_GRP                '('        /* pattern group */
           28 
           29 /* regular expression program instructions */
           30 #define RI_ATOM                '\0'        /* regular expression */
           31 #define RI_FORK                'f'        /* fork the execution */
           32 #define RI_JUMP                'j'        /* jump to the given instruction */
           33 #define RI_MARK                'm'        /* mark the current position */
           34 #define RI_MATCH        'q'        /* the pattern is matched */
           35 
           36 /* regular expression atom */
           37 struct ratom {
           38         int ra;                        /* atom type (RA_*) */
           39         char *s;                /* atom argument */
           40 };
           41 
           42 /* regular expression instruction */
           43 struct rinst {
           44         struct ratom ra;        /* regular expression atom (RI_ATOM) */
           45         int ri;                        /* instruction type (RI_*) */
           46         int a1, a2;                /* destination of RI_FORK and RI_JUMP */
           47         int mark;                /* mark (RI_MARK) */
           48 };
           49 
           50 /* regular expression program */
           51 struct regex {
           52         struct rinst *p;        /* the program */
           53         int n;                        /* number of instructions */
           54         int flg;                /* regcomp() flags */
           55 };
           56 
           57 /* regular expression matching state */
           58 struct rstate {
           59         char *s;                /* the current position in the string */
           60         char *o;                /* the beginning of the string */
           61         int mark[NGRPS * 2];        /* marks for RI_MARK */
           62         int pc;                        /* program counter */
           63         int flg;                /* flags passed to regcomp() and regexec() */
           64         int dep;                /* re_rec() depth */
           65 };
           66 
           67 /* regular expression tree; used for parsing */
           68 struct rnode {
           69         struct ratom ra;        /* regular expression atom (RN_ATOM) */
           70         struct rnode *c1, *c2;        /* children */
           71         int mincnt, maxcnt;        /* number of repetitions */
           72         int grp;                /* group number */
           73         int rn;                        /* node type (RN_*) */
           74 };
           75 
           76 static struct rnode *rnode_make(int rn, struct rnode *c1, struct rnode *c2)
           77 {
           78         struct rnode *rnode = malloc(sizeof(*rnode));
           79         memset(rnode, 0, sizeof(*rnode));
           80         rnode->rn = rn;
           81         rnode->c1 = c1;
           82         rnode->c2 = c2;
           83         rnode->mincnt = 1;
           84         rnode->maxcnt = 1;
           85         return rnode;
           86 }
           87 
           88 static void rnode_free(struct rnode *rnode)
           89 {
           90         if (rnode->c1)
           91                 rnode_free(rnode->c1);
           92         if (rnode->c2)
           93                 rnode_free(rnode->c2);
           94         free(rnode->ra.s);
           95         free(rnode);
           96 }
           97 
           98 static int uc_len(char *s)
           99 {
          100         int c = (unsigned char) s[0];
          101         if (~c & 0xc0)                /* ASCII or invalid */
          102                 return c > 0;
          103         if (~c & 0x20)
          104                 return 2;
          105         if (~c & 0x10)
          106                 return 3;
          107         if (~c & 0x08)
          108                 return 4;
          109         return 1;
          110 }
          111 
          112 static int uc_dec(char *s)
          113 {
          114         int c = (unsigned char) s[0];
          115         if (~c & 0xc0)                /* ASCII or invalid */
          116                 return c;
          117         if (~c & 0x20)
          118                 return ((c & 0x1f) << 6) | (s[1] & 0x3f);
          119         if (~c & 0x10)
          120                 return ((c & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f);
          121         if (~c & 0x08)
          122                 return ((c & 0x07) << 18) | ((s[1] & 0x3f) << 12) | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
          123         return c;
          124 }
          125 
          126 static void ratom_copy(struct ratom *dst, struct ratom *src)
          127 {
          128         dst->ra = src->ra;
          129         dst->s = NULL;
          130         if (src->s) {
          131                 int len = strlen(src->s);
          132                 dst->s = malloc(len + 1);
          133                 memcpy(dst->s, src->s, len + 1);
          134         }
          135 }
          136 
          137 static int brk_len(char *s)
          138 {
          139         int n = 1;
          140         if (s[n] == '^')        /* exclusion mark */
          141                 n++;
          142         if (s[n] == ']')        /* handling []a] */
          143                 n++;
          144         while (s[n] && s[n] != ']') {
          145                 if (s[n] == '[' && (s[n + 1] == ':' || s[n + 1] == '='))
          146                         while (s[n] && s[n] != ']')
          147                                 n++;
          148                 if (s[n])
          149                         n++;
          150         }
          151         return s[n] == ']' ? n + 1 : n;
          152 }
          153 
          154 static void ratom_readbrk(struct ratom *ra, char **pat)
          155 {
          156         int len = brk_len(*pat);
          157         ra->ra = RA_BRK;
          158         ra->s = malloc(len + 1);
          159         memcpy(ra->s, *pat, len);
          160         ra->s[len] = '\0';
          161         *pat += len;
          162 }
          163 
          164 static void ratom_read(struct ratom *ra, char **pat)
          165 {
          166         int len;
          167         switch ((unsigned char) **pat) {
          168         case '.':
          169                 ra->ra = RA_ANY;
          170                 (*pat)++;
          171                 break;
          172         case '^':
          173                 ra->ra = RA_BEG;
          174                 (*pat)++;
          175                 break;
          176         case '$':
          177                 ra->ra = RA_END;
          178                 (*pat)++;
          179                 break;
          180         case '[':
          181                 ratom_readbrk(ra, pat);
          182                 break;
          183         case '\\':
          184                 if ((*pat)[1] == '<' || (*pat)[1] == '>') {
          185                         ra->ra = (*pat)[1] == '<' ? RA_WBEG : RA_WEND;
          186                         *pat += 2;
          187                         break;
          188                 }
          189                 (*pat)++;
          190         default:
          191                 ra->ra = RA_CHR;
          192                 len = uc_len(*pat);
          193                 ra->s = malloc(8);
          194                 memcpy(ra->s, *pat, len);
          195                 ra->s[len] = '\0';
          196                 *pat += len;
          197         }
          198 }
          199 
          200 static char *uc_beg(char *beg, char *s)
          201 {
          202         while (s > beg && (((unsigned char) *s) & 0xc0) == 0x80)
          203                 s--;
          204         return s;
          205 }
          206 
          207 static int isword(char *s)
          208 {
          209         int c = (unsigned char) s[0];
          210         return isalnum(c) || c == '_' || c > 127;
          211 }
          212 
          213 static char *brk_classes[][2] = {
          214         {":alnum:", "a-zA-Z0-9"},
          215         {":alpha:", "a-zA-Z"},
          216         {":blank:", " \t"},
          217         {":digit:", "0-9"},
          218         {":lower:", "a-z"},
          219         {":print:", "\x20-\x7e"},
          220         {":punct:", "][!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-"},
          221         {":space:", " \t\r\n\v\f"},
          222         {":upper:", "A-Z"},
          223         {":word:", "a-zA-Z0-9_"},
          224         {":xdigit:", "a-fA-F0-9"},
          225 };
          226 
          227 static int brk_match(char *brk, int c, int flg)
          228 {
          229         int beg, end;
          230         int i;
          231         int not = brk[0] == '^';
          232         char *p = not ? brk + 1 : brk;
          233         char *p0 = p;
          234         if (flg & REG_ICASE && c < 128 && isupper(c))
          235                 c = tolower(c);
          236         while (*p && (p == p0 || *p != ']')) {
          237                 if (p[0] == '[' && p[1] == ':') {
          238                         for (i = 0; i < LEN(brk_classes); i++) {
          239                                 char *cc = brk_classes[i][0];
          240                                 char *cp = brk_classes[i][1];
          241                                 if (!strncmp(cc, p + 1, strlen(cc)))
          242                                         if (!brk_match(cp, c, flg))
          243                                                 return not;
          244                         }
          245                         p += brk_len(p);
          246                         continue;
          247                 }
          248                 beg = uc_dec(p);
          249                 p += uc_len(p);
          250                 end = beg;
          251                 if (p[0] == '-' && p[1] && p[1] != ']') {
          252                         p++;
          253                         end = uc_dec(p);
          254                         p += uc_len(p);
          255                 }
          256                 if (flg & REG_ICASE && beg < 128 && isupper(beg))
          257                         beg = tolower(beg);
          258                 if (flg & REG_ICASE && end < 128 && isupper(end))
          259                         end = tolower(end);
          260                 if (c >= beg && c <= end)
          261                         return not;
          262         }
          263         return !not;
          264 }
          265 
          266 static int ratom_match(struct ratom *ra, struct rstate *rs)
          267 {
          268         if (ra->ra == RA_CHR) {
          269                 int c1 = uc_dec(ra->s);
          270                 int c2 = uc_dec(rs->s);
          271                 if (rs->flg & REG_ICASE && c1 < 128 && isupper(c1))
          272                         c1 = tolower(c1);
          273                 if (rs->flg & REG_ICASE && c2 < 128 && isupper(c2))
          274                         c2 = tolower(c2);
          275                 if (c1 != c2)
          276                         return 1;
          277                 rs->s += uc_len(ra->s);
          278                 return 0;
          279         }
          280         if (ra->ra == RA_ANY) {
          281                 if (!rs->s[0] || (rs->s[0] == '\n' && !(rs->flg & REG_NOTEOL)))
          282                         return 1;
          283                 rs->s += uc_len(rs->s);
          284                 return 0;
          285         }
          286         if (ra->ra == RA_BRK) {
          287                 int c = uc_dec(rs->s);
          288                 if (!c || (c == '\n' && !(rs->flg & REG_NOTEOL)))
          289                         return 1;
          290                 rs->s += uc_len(rs->s);
          291                 return brk_match(ra->s + 1, c, rs->flg);
          292         }
          293         if (ra->ra == RA_BEG && !(rs->flg & REG_NOTBOL))
          294                 return !(rs->s == rs->o || rs->s[-1] == '\n');
          295         if (ra->ra == RA_END && !(rs->flg & REG_NOTEOL))
          296                 return rs->s[0] != '\0' && rs->s[0] != '\n';
          297         if (ra->ra == RA_WBEG)
          298                 return !((rs->s == rs->o || !isword(uc_beg(rs->o, rs->s - 1))) &&
          299                         isword(rs->s));
          300         if (ra->ra == RA_WEND)
          301                 return !(rs->s != rs->o && isword(uc_beg(rs->o, rs->s - 1)) &&
          302                         (!rs->s[0] || !isword(rs->s)));
          303         return 1;
          304 }
          305 
          306 static struct rnode *rnode_parse(char **pat);
          307 
          308 static struct rnode *rnode_grp(char **pat)
          309 {
          310         struct rnode *rnode = NULL;
          311         if ((*pat)[0] != '(')
          312                 return NULL;
          313         *pat += 1;
          314         if ((*pat)[0] != ')') {
          315                 rnode = rnode_parse(pat);
          316                 if (!rnode)
          317                         return NULL;
          318         }
          319         if ((*pat)[0] != ')') {
          320                 rnode_free(rnode);
          321                 return NULL;
          322         }
          323         *pat += 1;
          324         return rnode_make(RN_GRP, rnode, NULL);
          325 }
          326 
          327 static struct rnode *rnode_atom(char **pat)
          328 {
          329         struct rnode *rnode;
          330         if (!**pat)
          331                 return NULL;
          332         if ((*pat)[0] == '|' || (*pat)[0] == ')')
          333                 return NULL;
          334         if ((*pat)[0] == '(') {
          335                 rnode = rnode_grp(pat);
          336         } else {
          337                 rnode = rnode_make(RN_ATOM, NULL, NULL);
          338                 ratom_read(&rnode->ra, pat);
          339         }
          340         if (!rnode)
          341                 return NULL;
          342         if ((*pat)[0] == '*' || (*pat)[0] == '?') {
          343                 rnode->mincnt = 0;
          344                 rnode->maxcnt = (*pat)[0] == '*' ? -1 : 1;
          345                 *pat += 1;
          346         }
          347         if ((*pat)[0] == '+') {
          348                 rnode->mincnt = 1;
          349                 rnode->maxcnt = -1;
          350                 *pat += 1;
          351         }
          352         if ((*pat)[0] == '{') {
          353                 rnode->mincnt = 0;
          354                 rnode->maxcnt = 0;
          355                 *pat += 1;
          356                 while (isdigit((unsigned char) **pat))
          357                         rnode->mincnt = rnode->mincnt * 10 + *(*pat)++ - '0';
          358                 if (**pat == ',') {
          359                         (*pat)++;
          360                         if ((*pat)[0] == '}')
          361                                 rnode->maxcnt = -1;
          362                         while (isdigit((unsigned char) **pat))
          363                                 rnode->maxcnt = rnode->maxcnt * 10 + *(*pat)++ - '0';
          364                 } else {
          365                         rnode->maxcnt = rnode->mincnt;
          366                 }
          367                 *pat += 1;
          368                 if (rnode->mincnt > NREPS || rnode->maxcnt > NREPS) {
          369                         rnode_free(rnode);
          370                         return NULL;
          371                 }
          372         }
          373         return rnode;
          374 }
          375 
          376 static struct rnode *rnode_seq(char **pat)
          377 {
          378         struct rnode *c1 = rnode_atom(pat);
          379         struct rnode *c2;
          380         if (!c1)
          381                 return NULL;
          382         c2 = rnode_seq(pat);
          383         return c2 ? rnode_make(RN_CAT, c1, c2) : c1;
          384 }
          385 
          386 static struct rnode *rnode_parse(char **pat)
          387 {
          388         struct rnode *c1 = rnode_seq(pat);
          389         struct rnode *c2;
          390         if ((*pat)[0] != '|')
          391                 return c1;
          392         *pat += 1;
          393         c2 = rnode_parse(pat);
          394         return c2 ? rnode_make(RN_ALT, c1, c2) : c1;
          395 }
          396 
          397 static int rnode_count(struct rnode *rnode)
          398 {
          399         int n = 1;
          400         if (!rnode)
          401                 return 0;
          402         if (rnode->rn == RN_CAT)
          403                 n = rnode_count(rnode->c1) + rnode_count(rnode->c2);
          404         if (rnode->rn == RN_ALT)
          405                 n = rnode_count(rnode->c1) + rnode_count(rnode->c2) + 2;
          406         if (rnode->rn == RN_GRP)
          407                 n = rnode_count(rnode->c1) + 2;
          408         if (rnode->mincnt == 0 && rnode->maxcnt == 0)
          409                 return 0;
          410         if (rnode->mincnt == 1 && rnode->maxcnt == 1)
          411                 return n;
          412         if (rnode->maxcnt < 0) {
          413                 n = (rnode->mincnt + 1) * n + 1;
          414         } else {
          415                 n = (rnode->mincnt + rnode->maxcnt) * n +
          416                         rnode->maxcnt - rnode->mincnt;
          417         }
          418         if (!rnode->mincnt)
          419                 n++;
          420         return n;
          421 }
          422 
          423 static int rnode_grpnum(struct rnode *rnode, int num)
          424 {
          425         int cur = 0;
          426         if (!rnode)
          427                 return 0;
          428         if (rnode->rn == RN_GRP)
          429                 rnode->grp = num + cur++;
          430         cur += rnode_grpnum(rnode->c1, num + cur);
          431         cur += rnode_grpnum(rnode->c2, num + cur);
          432         return cur;
          433 }
          434 
          435 static int re_insert(struct regex *p, int ri)
          436 {
          437         p->p[p->n++].ri = ri;
          438         return p->n - 1;
          439 }
          440 
          441 static void rnode_emit(struct rnode *n, struct regex *p);
          442 
          443 static void rnode_emitnorep(struct rnode *n, struct regex *p)
          444 {
          445         int fork, done, mark;
          446         if (n->rn == RN_ALT) {
          447                 fork = re_insert(p, RI_FORK);
          448                 p->p[fork].a1 = p->n;
          449                 rnode_emit(n->c1, p);
          450                 done = re_insert(p, RI_JUMP);
          451                 p->p[fork].a2 = p->n;
          452                 rnode_emit(n->c2, p);
          453                 p->p[done].a1 = p->n;
          454         }
          455         if (n->rn == RN_CAT) {
          456                 rnode_emit(n->c1, p);
          457                 rnode_emit(n->c2, p);
          458         }
          459         if (n->rn == RN_GRP) {
          460                 mark = re_insert(p, RI_MARK);
          461                 p->p[mark].mark = 2 * n->grp;
          462                 rnode_emit(n->c1, p);
          463                 mark = re_insert(p, RI_MARK);
          464                 p->p[mark].mark = 2 * n->grp + 1;
          465         }
          466         if (n->rn == RN_ATOM) {
          467                 int atom = re_insert(p, RI_ATOM);
          468                 ratom_copy(&p->p[atom].ra, &n->ra);
          469         }
          470 }
          471 
          472 static void rnode_emit(struct rnode *n, struct regex *p)
          473 {
          474         int last;
          475         int jmpend[NREPS];
          476         int jmpend_cnt = 0;
          477         int i;
          478         if (!n)
          479                 return;
          480         if (n->mincnt == 0 && n->maxcnt == 0)
          481                 return;
          482         if (n->mincnt == 1 && n->maxcnt == 1) {
          483                 rnode_emitnorep(n, p);
          484                 return;
          485         }
          486         if (n->mincnt == 0) {
          487                 int fork = re_insert(p, RI_FORK);
          488                 p->p[fork].a1 = p->n;
          489                 jmpend[jmpend_cnt++] = fork;
          490         }
          491         for (i = 0; i < MAX(1, n->mincnt); i++) {
          492                 last = p->n;
          493                 rnode_emitnorep(n, p);
          494         }
          495         if (n->maxcnt < 0) {
          496                 int fork;
          497                 fork = re_insert(p, RI_FORK);
          498                 p->p[fork].a1 = last;
          499                 p->p[fork].a2 = p->n;
          500         }
          501         for (i = MAX(1, n->mincnt); i < n->maxcnt; i++) {
          502                 int fork = re_insert(p, RI_FORK);
          503                 p->p[fork].a1 = p->n;
          504                 jmpend[jmpend_cnt++] = fork;
          505                 rnode_emitnorep(n, p);
          506         }
          507         for (i = 0; i < jmpend_cnt; i++)
          508                 p->p[jmpend[i]].a2 = p->n;
          509 }
          510 
          511 int regcomp(regex_t *preg, char *pat, int flg)
          512 {
          513         struct rnode *rnode = rnode_parse(&pat);
          514         struct regex *re;
          515         int n = rnode_count(rnode) + 3;
          516         int mark;
          517         if (!rnode)
          518                 return 1;
          519         rnode_grpnum(rnode, 1);
          520         re = malloc(sizeof(*re));
          521         memset(re, 0, sizeof(*re));
          522         re->p = malloc(n * sizeof(re->p[0]));
          523         memset(re->p, 0, n * sizeof(re->p[0]));
          524         mark = re_insert(re, RI_MARK);
          525         re->p[mark].mark = 0;
          526         rnode_emit(rnode, re);
          527         mark = re_insert(re, RI_MARK);
          528         re->p[mark].mark = 1;
          529         mark = re_insert(re, RI_MATCH);
          530         rnode_free(rnode);
          531         re->flg = flg;
          532         *preg = re;
          533         return 0;
          534 }
          535 
          536 void regfree(regex_t *preg)
          537 {
          538         struct regex *re = *preg;
          539         int i;
          540         for (i = 0; i < re->n; i++)
          541                 if (re->p[i].ri == RI_ATOM)
          542                         free(re->p[i].ra.s);
          543         free(re->p);
          544         free(re);
          545 }
          546 
          547 static int re_rec(struct regex *re, struct rstate *rs)
          548 {
          549         struct rinst *ri = NULL;
          550         if (rs->dep >= NDEPT)
          551                 return 1;
          552         rs->dep++;
          553         while (1) {
          554                 ri = &re->p[rs->pc];
          555                 if (ri->ri == RI_ATOM) {
          556                         if (ratom_match(&ri->ra, rs))
          557                                 return 1;
          558                         rs->pc++;
          559                         continue;
          560                 }
          561                 if (ri->ri == RI_MARK) {
          562                         if (ri->mark < NGRPS)
          563                                 rs->mark[ri->mark] = rs->s - rs->o;
          564                         rs->pc++;
          565                         continue;
          566                 }
          567                 if (ri->ri == RI_JUMP) {
          568                         rs->pc = ri->a1;
          569                         continue;
          570                 }
          571                 if (ri->ri == RI_FORK) {
          572                         struct rstate base = *rs;
          573                         rs->pc = ri->a1;
          574                         if (!re_rec(re, rs))
          575                                 return 0;
          576                         *rs = base;
          577                         rs->pc = ri->a2;
          578                         continue;
          579                 }
          580                 break;
          581         }
          582         rs->dep--;
          583         return ri->ri != RI_MATCH;
          584 }
          585 
          586 static int re_recmatch(struct regex *re, struct rstate *rs, int nsub, regmatch_t *psub)
          587 {
          588         int i;
          589         rs->pc = 0;
          590         for (i = 0; i < LEN(rs->mark); i++)
          591                 rs->mark[i] = -1;
          592         rs->dep = 0;
          593         if (!re_rec(re, rs)) {
          594                 for (i = 0; i < nsub; i++) {
          595                         psub[i].rm_so = i * 2 < LEN(rs->mark) ? rs->mark[i * 2] : -1;
          596                         psub[i].rm_eo = i * 2 < LEN(rs->mark) ? rs->mark[i * 2 + 1] : -1;
          597                 }
          598                 return 0;
          599         }
          600         return 1;
          601 }
          602 
          603 int regexec(regex_t *preg, char *s, int nsub, regmatch_t psub[], int flg)
          604 {
          605         struct regex *re = *preg;
          606         struct rstate rs;
          607         memset(&rs, 0, sizeof(rs));
          608         rs.flg = re->flg | flg;
          609         rs.o = s;
          610         while (*s) {
          611                 rs.s = s;
          612                 s += uc_len(s);
          613                 if (!re_recmatch(re, &rs, flg & REG_NOSUB ? 0 : nsub, psub))
          614                         return 0;
          615         }
          616         return 1;
          617 }
          618 
          619 int regerror(int errcode, regex_t *preg, char *errbuf, int errbuf_size)
          620 {
          621         return 0;
          622 }