regcomp.c - 9base - revived minimalist port of Plan 9 userland to Unix
 (HTM) git clone git://git.suckless.org/9base
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       regcomp.c (9743B)
       ---
            1 #include "lib9.h"
            2 #include "regexp9.h"
            3 #include "regcomp.h"
            4 
            5 #define        TRUE        1
            6 #define        FALSE        0
            7 
            8 /*
            9  * Parser Information
           10  */
           11 typedef
           12 struct Node
           13 {
           14         Reinst*        first;
           15         Reinst*        last;
           16 }Node;
           17 
           18 #define        NSTACK        20
           19 static        Node        andstack[NSTACK];
           20 static        Node        *andp;
           21 static        int        atorstack[NSTACK];
           22 static        int*        atorp;
           23 static        int        cursubid;                /* id of current subexpression */
           24 static        int        subidstack[NSTACK];        /* parallel to atorstack */
           25 static        int*        subidp;
           26 static        int        lastwasand;        /* Last token was operand */
           27 static        int        nbra;
           28 static        char*        exprp;                /* pointer to next character in source expression */
           29 static        int        lexdone;
           30 static        int        nclass;
           31 static        Reclass*classp;
           32 static        Reinst*        freep;
           33 static        int        errors;
           34 static        Rune        yyrune;                /* last lex'd rune */
           35 static        Reclass*yyclassp;        /* last lex'd class */
           36 
           37 /* predeclared crap */
           38 static        void        operator(int);
           39 static        void        pushand(Reinst*, Reinst*);
           40 static        void        pushator(int);
           41 static        void        evaluntil(int);
           42 static        int        bldcclass(void);
           43 
           44 static jmp_buf regkaboom;
           45 
           46 static        void
           47 rcerror(char *s)
           48 {
           49         errors++;
           50         regerror(s);
           51         longjmp(regkaboom, 1);
           52 }
           53 
           54 static        Reinst*
           55 newinst(int t)
           56 {
           57         freep->type = t;
           58         freep->u2.left = 0;
           59         freep->u1.right = 0;
           60         return freep++;
           61 }
           62 
           63 static        void
           64 operand(int t)
           65 {
           66         Reinst *i;
           67 
           68         if(lastwasand)
           69                 operator(CAT);        /* catenate is implicit */
           70         i = newinst(t);
           71 
           72         if(t == CCLASS || t == NCCLASS)
           73                 i->u1.cp = yyclassp;
           74         if(t == RUNE)
           75                 i->u1.r = yyrune;
           76 
           77         pushand(i, i);
           78         lastwasand = TRUE;
           79 }
           80 
           81 static        void
           82 operator(int t)
           83 {
           84         if(t==RBRA && --nbra<0)
           85                 rcerror("unmatched right paren");
           86         if(t==LBRA){
           87                 if(++cursubid >= NSUBEXP)
           88                         rcerror ("too many subexpressions");
           89                 nbra++;
           90                 if(lastwasand)
           91                         operator(CAT);
           92         } else
           93                 evaluntil(t);
           94         if(t != RBRA)
           95                 pushator(t);
           96         lastwasand = FALSE;
           97         if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
           98                 lastwasand = TRUE;        /* these look like operands */
           99 }
          100 
          101 static        void
          102 regerr2(char *s, int c)
          103 {
          104         char buf[100];
          105         char *cp = buf;
          106         while(*s)
          107                 *cp++ = *s++;
          108         *cp++ = c;
          109         *cp = '\0'; 
          110         rcerror(buf);
          111 }
          112 
          113 static        void
          114 cant(char *s)
          115 {
          116         char buf[100];
          117         strcpy(buf, "can't happen: ");
          118         strcat(buf, s);
          119         rcerror(buf);
          120 }
          121 
          122 static        void
          123 pushand(Reinst *f, Reinst *l)
          124 {
          125         if(andp >= &andstack[NSTACK])
          126                 cant("operand stack overflow");
          127         andp->first = f;
          128         andp->last = l;
          129         andp++;
          130 }
          131 
          132 static        void
          133 pushator(int t)
          134 {
          135         if(atorp >= &atorstack[NSTACK])
          136                 cant("operator stack overflow");
          137         *atorp++ = t;
          138         *subidp++ = cursubid;
          139 }
          140 
          141 static        Node*
          142 popand(int op)
          143 {
          144         Reinst *inst;
          145 
          146         if(andp <= &andstack[0]){
          147                 regerr2("missing operand for ", op);
          148                 inst = newinst(NOP);
          149                 pushand(inst,inst);
          150         }
          151         return --andp;
          152 }
          153 
          154 static        int
          155 popator(void)
          156 {
          157         if(atorp <= &atorstack[0])
          158                 cant("operator stack underflow");
          159         --subidp;
          160         return *--atorp;
          161 }
          162 
          163 static        void
          164 evaluntil(int pri)
          165 {
          166         Node *op1, *op2;
          167         Reinst *inst1, *inst2;
          168 
          169         while(pri==RBRA || atorp[-1]>=pri){
          170                 switch(popator()){
          171                 default:
          172                         rcerror("unknown operator in evaluntil");
          173                         break;
          174                 case LBRA:                /* must have been RBRA */
          175                         op1 = popand('(');
          176                         inst2 = newinst(RBRA);
          177                         inst2->u1.subid = *subidp;
          178                         op1->last->u2.next = inst2;
          179                         inst1 = newinst(LBRA);
          180                         inst1->u1.subid = *subidp;
          181                         inst1->u2.next = op1->first;
          182                         pushand(inst1, inst2);
          183                         return;
          184                 case OR:
          185                         op2 = popand('|');
          186                         op1 = popand('|');
          187                         inst2 = newinst(NOP);
          188                         op2->last->u2.next = inst2;
          189                         op1->last->u2.next = inst2;
          190                         inst1 = newinst(OR);
          191                         inst1->u1.right = op1->first;
          192                         inst1->u2.left = op2->first;
          193                         pushand(inst1, inst2);
          194                         break;
          195                 case CAT:
          196                         op2 = popand(0);
          197                         op1 = popand(0);
          198                         op1->last->u2.next = op2->first;
          199                         pushand(op1->first, op2->last);
          200                         break;
          201                 case STAR:
          202                         op2 = popand('*');
          203                         inst1 = newinst(OR);
          204                         op2->last->u2.next = inst1;
          205                         inst1->u1.right = op2->first;
          206                         pushand(inst1, inst1);
          207                         break;
          208                 case PLUS:
          209                         op2 = popand('+');
          210                         inst1 = newinst(OR);
          211                         op2->last->u2.next = inst1;
          212                         inst1->u1.right = op2->first;
          213                         pushand(op2->first, inst1);
          214                         break;
          215                 case QUEST:
          216                         op2 = popand('?');
          217                         inst1 = newinst(OR);
          218                         inst2 = newinst(NOP);
          219                         inst1->u2.left = inst2;
          220                         inst1->u1.right = op2->first;
          221                         op2->last->u2.next = inst2;
          222                         pushand(inst1, inst2);
          223                         break;
          224                 }
          225         }
          226 }
          227 
          228 static        Reprog*
          229 optimize(Reprog *pp)
          230 {
          231         Reinst *inst, *target;
          232         int size;
          233         Reprog *npp;
          234         Reclass *cl;
          235         int diff;
          236 
          237         /*
          238          *  get rid of NOOP chains
          239          */
          240         for(inst=pp->firstinst; inst->type!=END; inst++){
          241                 target = inst->u2.next;
          242                 while(target->type == NOP)
          243                         target = target->u2.next;
          244                 inst->u2.next = target;
          245         }
          246 
          247         /*
          248          *  The original allocation is for an area larger than
          249          *  necessary.  Reallocate to the actual space used
          250          *  and then relocate the code.
          251          */
          252         size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
          253         npp = realloc(pp, size);
          254         if(npp==0 || npp==pp)
          255                 return pp;
          256         diff = (char *)npp - (char *)pp;
          257         freep = (Reinst *)((char *)freep + diff);
          258         for(inst=npp->firstinst; inst<freep; inst++){
          259                 switch(inst->type){
          260                 case OR:
          261                 case STAR:
          262                 case PLUS:
          263                 case QUEST:
          264                         inst->u1.right = (void*)((char*)inst->u1.right + diff);
          265                         break;
          266                 case CCLASS:
          267                 case NCCLASS:
          268                         inst->u1.right = (void*)((char*)inst->u1.right + diff);
          269                         cl = inst->u1.cp;
          270                         cl->end = (void*)((char*)cl->end + diff);
          271                         break;
          272                 }
          273                 inst->u2.left = (void*)((char*)inst->u2.left + diff);
          274         }
          275         npp->startinst = (void*)((char*)npp->startinst + diff);
          276         return npp;
          277 }
          278 
          279 #ifdef        DEBUG
          280 static        void
          281 dumpstack(void){
          282         Node *stk;
          283         int *ip;
          284 
          285         print("operators\n");
          286         for(ip=atorstack; ip<atorp; ip++)
          287                 print("0%o\n", *ip);
          288         print("operands\n");
          289         for(stk=andstack; stk<andp; stk++)
          290                 print("0%o\t0%o\n", stk->first->type, stk->last->type);
          291 }
          292 
          293 static        void
          294 dump(Reprog *pp)
          295 {
          296         Reinst *l;
          297         Rune *p;
          298 
          299         l = pp->firstinst;
          300         do{
          301                 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
          302                         l->u2.left-pp->firstinst, l->u1.right-pp->firstinst);
          303                 if(l->type == RUNE)
          304                         print("\t%C\n", l->u1.r);
          305                 else if(l->type == CCLASS || l->type == NCCLASS){
          306                         print("\t[");
          307                         if(l->type == NCCLASS)
          308                                 print("^");
          309                         for(p = l->u1.cp->spans; p < l->u1.cp->end; p += 2)
          310                                 if(p[0] == p[1])
          311                                         print("%C", p[0]);
          312                                 else
          313                                         print("%C-%C", p[0], p[1]);
          314                         print("]\n");
          315                 } else
          316                         print("\n");
          317         }while(l++->type);
          318 }
          319 #endif
          320 
          321 static        Reclass*
          322 newclass(void)
          323 {
          324         if(nclass >= NCLASS)
          325                 regerr2("too many character classes; limit", NCLASS+'0');
          326         return &(classp[nclass++]);
          327 }
          328 
          329 static        int
          330 nextc(Rune *rp)
          331 {
          332         if(lexdone){
          333                 *rp = 0;
          334                 return 1;
          335         }
          336         exprp += chartorune(rp, exprp);
          337         if(*rp == '\\'){
          338                 exprp += chartorune(rp, exprp);
          339                 return 1;
          340         }
          341         if(*rp == 0)
          342                 lexdone = 1;
          343         return 0;
          344 }
          345 
          346 static        int
          347 lex(int literal, int dot_type)
          348 {
          349         int quoted;
          350 
          351         quoted = nextc(&yyrune);
          352         if(literal || quoted){
          353                 if(yyrune == 0)
          354                         return END;
          355                 return RUNE;
          356         }
          357 
          358         switch(yyrune){
          359         case 0:
          360                 return END;
          361         case '*':
          362                 return STAR;
          363         case '?':
          364                 return QUEST;
          365         case '+':
          366                 return PLUS;
          367         case '|':
          368                 return OR;
          369         case '.':
          370                 return dot_type;
          371         case '(':
          372                 return LBRA;
          373         case ')':
          374                 return RBRA;
          375         case '^':
          376                 return BOL;
          377         case '$':
          378                 return EOL;
          379         case '[':
          380                 return bldcclass();
          381         }
          382         return RUNE;
          383 }
          384 
          385 static int
          386 bldcclass(void)
          387 {
          388         int type;
          389         Rune r[NCCRUNE];
          390         Rune *p, *ep, *np;
          391         Rune rune;
          392         int quoted;
          393 
          394         /* we have already seen the '[' */
          395         type = CCLASS;
          396         yyclassp = newclass();
          397 
          398         /* look ahead for negation */
          399         /* SPECIAL CASE!!! negated classes don't match \n */
          400         ep = r;
          401         quoted = nextc(&rune);
          402         if(!quoted && rune == '^'){
          403                 type = NCCLASS;
          404                 quoted = nextc(&rune);
          405                 *ep++ = '\n';
          406                 *ep++ = '\n';
          407         }
          408 
          409         /* parse class into a set of spans */
          410         for(; ep<&r[NCCRUNE];){
          411                 if(rune == 0){
          412                         rcerror("malformed '[]'");
          413                         return 0;
          414                 }
          415                 if(!quoted && rune == ']')
          416                         break;
          417                 if(!quoted && rune == '-'){
          418                         if(ep == r){
          419                                 rcerror("malformed '[]'");
          420                                 return 0;
          421                         }
          422                         quoted = nextc(&rune);
          423                         if((!quoted && rune == ']') || rune == 0){
          424                                 rcerror("malformed '[]'");
          425                                 return 0;
          426                         }
          427                         *(ep-1) = rune;
          428                 } else {
          429                         *ep++ = rune;
          430                         *ep++ = rune;
          431                 }
          432                 quoted = nextc(&rune);
          433         }
          434 
          435         /* sort on span start */
          436         for(p = r; p < ep; p += 2){
          437                 for(np = p; np < ep; np += 2)
          438                         if(*np < *p){
          439                                 rune = np[0];
          440                                 np[0] = p[0];
          441                                 p[0] = rune;
          442                                 rune = np[1];
          443                                 np[1] = p[1];
          444                                 p[1] = rune;
          445                         }
          446         }
          447 
          448         /* merge spans */
          449         np = yyclassp->spans;
          450         p = r;
          451         if(r == ep)
          452                 yyclassp->end = np;
          453         else {
          454                 np[0] = *p++;
          455                 np[1] = *p++;
          456                 for(; p < ep; p += 2)
          457                         if(p[0] <= np[1]){
          458                                 if(p[1] > np[1])
          459                                         np[1] = p[1];
          460                         } else {
          461                                 np += 2;
          462                                 np[0] = p[0];
          463                                 np[1] = p[1];
          464                         }
          465                 yyclassp->end = np+2;
          466         }
          467 
          468         return type;
          469 }
          470 
          471 static        Reprog*
          472 regcomp1(char *s, int literal, int dot_type)
          473 {
          474         int token;
          475         Reprog *volatile pp;
          476 
          477         /* get memory for the program */
          478         pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
          479         if(pp == 0){
          480                 regerror("out of memory");
          481                 return 0;
          482         }
          483         freep = pp->firstinst;
          484         classp = pp->class;
          485         errors = 0;
          486 
          487         if(setjmp(regkaboom))
          488                 goto out;
          489 
          490         /* go compile the sucker */
          491         lexdone = 0;
          492         exprp = s;
          493         nclass = 0;
          494         nbra = 0;
          495         atorp = atorstack;
          496         andp = andstack;
          497         subidp = subidstack;
          498         lastwasand = FALSE;
          499         cursubid = 0;
          500 
          501         /* Start with a low priority operator to prime parser */
          502         pushator(START-1);
          503         while((token = lex(literal, dot_type)) != END){
          504                 if((token&0300) == OPERATOR)
          505                         operator(token);
          506                 else
          507                         operand(token);
          508         }
          509 
          510         /* Close with a low priority operator */
          511         evaluntil(START);
          512 
          513         /* Force END */
          514         operand(END);
          515         evaluntil(START);
          516 #ifdef DEBUG
          517         dumpstack();
          518 #endif
          519         if(nbra)
          520                 rcerror("unmatched left paren");
          521         --andp;        /* points to first and only operand */
          522         pp->startinst = andp->first;
          523 #ifdef DEBUG
          524         dump(pp);
          525 #endif
          526         pp = optimize(pp);
          527 #ifdef DEBUG
          528         print("start: %d\n", andp->first-pp->firstinst);
          529         dump(pp);
          530 #endif
          531 out:
          532         if(errors){
          533                 free(pp);
          534                 pp = 0;
          535         }
          536         return pp;
          537 }
          538 
          539 extern        Reprog*
          540 regcomp(char *s)
          541 {
          542         return regcomp1(s, 0, ANY);
          543 }
          544 
          545 extern        Reprog*
          546 regcomplit(char *s)
          547 {
          548         return regcomp1(s, 1, ANY);
          549 }
          550 
          551 extern        Reprog*
          552 regcompnl(char *s)
          553 {
          554         return regcomp1(s, 0, ANYNL);
          555 }