tregex: import regex.c from neatlibc - neatvi - [fork] simple vi-type editor with UTF-8 support
 (HTM) git clone git://src.adamsgaard.dk/neatvi
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
       ---
 (DIR) commit 67f3f8c955e80f17bc4ca092e2454a0104d13b18
 (DIR) parent 0225a76aa55a2d0dae9ac2a6d98f9c19de3b52c2
 (HTM) Author: Ali Gholami Rudi <ali@rudi.ir>
       Date:   Tue, 16 Jun 2015 14:51:07 +0430
       
       regex: import regex.c from neatlibc
       
       Diffstat:
         M Makefile                            |       2 +-
         A regex.c                             |     593 +++++++++++++++++++++++++++++++
         A regex.h                             |      18 ++++++++++++++++++
         M rset.c                              |       2 +-
       
       4 files changed, 613 insertions(+), 2 deletions(-)
       ---
 (DIR) diff --git a/Makefile b/Makefile
       t@@ -3,7 +3,7 @@ CFLAGS = -Wall -O2
        LDFLAGS =
        
        OBJS = vi.o ex.o lbuf.o mot.o sbuf.o ren.o dir.o syn.o reg.o led.o \
       -        uc.o term.o rset.o cmd.o conf.o
       +        uc.o term.o rset.o regex.o cmd.o conf.o
        
        all: vi
        
 (DIR) diff --git a/regex.c b/regex.c
       t@@ -0,0 +1,593 @@
       +#include <ctype.h>
       +#include <stdio.h>
       +#include <stdlib.h>
       +#include <string.h>
       +#include "regex.h"
       +
       +#define NGRPS                64
       +#define NREPS                128
       +
       +#define MAX(a, b)        ((a) < (b) ? (b) : (a))
       +#define LEN(a)                (sizeof(a) / sizeof((a)[0]))
       +
       +/* regular expressions atoms */
       +#define RA_CHR                '\0'        /* character literal */
       +#define RA_BEG                '^'        /* string start */
       +#define RA_END                '$'        /* string end */
       +#define RA_ANY                '.'        /* any character */
       +#define RA_BRK                '['        /* bracket expression */
       +#define RA_WBEG                '<'        /* word start */
       +#define RA_WEND                '>'        /* word end */
       +
       +/* regular expression node types */
       +#define RN_ATOM                '\0'        /* regular expression */
       +#define RN_CAT                'c'        /* concatenation */
       +#define RN_ALT                '|'        /* alternate expressions */
       +#define RN_GRP                '('        /* pattern group */
       +
       +/* regular expression program instructions */
       +#define RI_ATOM                '\0'        /* regular expression */
       +#define RI_FORK                'f'        /* fork the execution */
       +#define RI_JUMP                'j'        /* jump to the given instruction */
       +#define RI_MARK                'm'        /* mark the current position */
       +#define RI_MATCH        'q'        /* the pattern is matched */
       +
       +/* regular expression atom */
       +struct ratom {
       +        int ra;                        /* atom type (RA_*) */
       +        char *s;                /* atom argument */
       +};
       +
       +/* regular expression instruction */
       +struct rinst {
       +        int ri;                        /* instruction type (RI_*) */
       +        struct ratom ra;        /* regular expression atom (RI_ATOM) */
       +        int a1, a2;                /* destination of RE_FORK and RI_JUMP */
       +        int mark;                /* mark (RI_MARK) */
       +};
       +
       +/* regular expression program */
       +struct regex {
       +        struct rinst *p;        /* the program */
       +        int n;                        /* number of instructions */
       +        int grpcnt;                /* number of groups */
       +        int flg;                /* regcomp() flags */
       +};
       +
       +/* regular expression matching state */
       +struct rstate {
       +        int mark[NGRPS * 2];        /* marks for RI_MARK */
       +        int pc;                        /* program counter */
       +        char *s;                /* the current position in the string */
       +        char *o;                /* the beginning of the string */
       +        int flg;                /* flags passed to regcomp() and regexec() */
       +};
       +
       +/* regular expression tree; used for parsing */
       +struct rnode {
       +        int rn;                        /* node type (RN_*) */
       +        struct ratom ra;        /* regular expression atom (RN_ATOM) */
       +        int mincnt, maxcnt;        /* number of repetitions */
       +        struct rnode *c1, *c2;        /* children */
       +};
       +
       +static struct rnode *rnode_make(int rn, struct rnode *c1, struct rnode *c2)
       +{
       +        struct rnode *rnode = malloc(sizeof(*rnode));
       +        memset(rnode, 0, sizeof(*rnode));
       +        rnode->rn = rn;
       +        rnode->c1 = c1;
       +        rnode->c2 = c2;
       +        rnode->mincnt = 1;
       +        rnode->maxcnt = 1;
       +        return rnode;
       +}
       +
       +static void rnode_free(struct rnode *rnode)
       +{
       +        if (rnode->c1)
       +                rnode_free(rnode->c1);
       +        if (rnode->c2)
       +                rnode_free(rnode->c2);
       +        free(rnode->ra.s);
       +        free(rnode);
       +}
       +
       +static int uc_len(char *s)
       +{
       +        int c = (unsigned char) s[0];
       +        if (c > 0 && c <= 0x7f)
       +                return 1;
       +        if (c >= 0xfc)
       +                return 6;
       +        if (c >= 0xf8)
       +                return 5;
       +        if (c >= 0xf0)
       +                return 4;
       +        if (c >= 0xe0)
       +                return 3;
       +        if (c >= 0xc0)
       +                return 2;
       +        return c != 0;
       +}
       +
       +static int uc_dec(char *s)
       +{
       +        int result;
       +        int l = uc_len(s);
       +        if (l <= 1)
       +                return (unsigned char) *s;
       +        result = (0x3f >> --l) & (unsigned char) *s++;
       +        while (l--)
       +                result = (result << 6) | ((unsigned char) *s++ & 0x3f);
       +        return result;
       +}
       +
       +static void ratom_copy(struct ratom *dst, struct ratom *src)
       +{
       +        dst->ra = src->ra;
       +        dst->s = NULL;
       +        if (src->s) {
       +                int len = strlen(src->s);
       +                dst->s = malloc(len + 1);
       +                memcpy(dst->s, src->s, len + 1);
       +        }
       +}
       +
       +static int brk_len(char *s)
       +{
       +        int n = 1;
       +        if (s[n] == '^')        /* exclusion mark */
       +                n++;
       +        if (s[n] == ']')        /* handling []a] */
       +                n++;
       +        while (s[n] && s[n] != ']') {
       +                if (s[n] == '[' && (s[n + 1] == ':' || s[n + 1] == '='))
       +                        while (s[n] && s[n] != ']')
       +                                n++;
       +                if (s[n])
       +                        n++;
       +        }
       +        return s[n] == ']' ? n + 1 : n;
       +}
       +
       +static void ratom_readbrk(struct ratom *ra, char **pat)
       +{
       +        int len = brk_len(*pat);
       +        ra->ra = RA_BRK;
       +        ra->s = malloc(len + 1);
       +        memcpy(ra->s, *pat, len);
       +        ra->s[len] = '\0';
       +        *pat += len;
       +}
       +
       +static void ratom_read(struct ratom *ra, char **pat)
       +{
       +        int len;
       +        switch ((unsigned char) **pat) {
       +        case '.':
       +                ra->ra = RA_ANY;
       +                (*pat)++;
       +                break;
       +        case '^':
       +                ra->ra = RA_BEG;
       +                (*pat)++;
       +                break;
       +        case '$':
       +                ra->ra = RA_END;
       +                (*pat)++;
       +                break;
       +        case '[':
       +                ratom_readbrk(ra, pat);
       +                break;
       +        case '\\':
       +                if ((*pat)[1] == '<' || (*pat)[1] == '>') {
       +                        ra->ra = (*pat)[1] == '<' ? RA_WBEG : RA_WEND;
       +                        *pat += 2;
       +                        break;
       +                }
       +                (*pat)++;
       +        default:
       +                ra->ra = RA_CHR;
       +                len = uc_len(*pat);
       +                ra->s = malloc(8);
       +                memcpy(ra->s, *pat, len);
       +                ra->s[len] = '\0';
       +                *pat += len;
       +        }
       +}
       +
       +static char *uc_beg(char *beg, char *s)
       +{
       +        while (s > beg && (((unsigned char) *s) & 0xc0) == 0x80)
       +                s--;
       +        return s;
       +}
       +
       +static int isword(char *s)
       +{
       +        int c = (unsigned char) s[0];
       +        return isalnum(c) || c == '_' || c > 127;
       +}
       +
       +static char *brk_classes[][2] = {
       +        {":alnum:", "a-zA-Z0-9"},
       +        {":alpha:", "a-zA-Z"},
       +        {":blank:", " \t"},
       +        {":digit:", "0-9"},
       +        {":lower:", "a-z"},
       +        {":print:", "\x20-\x7e"},
       +        {":punct:", "][!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-"},
       +        {":space:", " \t\r\n\v\f"},
       +        {":upper:", "A-Z"},
       +        {":word:", "a-zA-Z0-9_"},
       +        {":xdigit:", "a-fA-F0-9"},
       +};
       +
       +static int brk_match(char *brk, int c, int flg)
       +{
       +        int beg, end;
       +        int i;
       +        int not = brk[0] == '^';
       +        char *p = not ? brk + 1 : brk;
       +        char *p0 = p;
       +        if (flg & REG_ICASE && c < 128 && isupper(c))
       +                c = tolower(c);
       +        while (*p && (p == p0 || *p != ']')) {
       +                if (p[0] == '[' && p[1] == ':') {
       +                        for (i = 0; i < LEN(brk_classes); i++) {
       +                                char *cc = brk_classes[i][0];
       +                                char *cp = brk_classes[i][1];
       +                                if (!strncmp(cc, p + 1, strlen(cc)))
       +                                        if (!brk_match(cp, c, flg))
       +                                                return not;
       +                        }
       +                        p += brk_len(p);
       +                        continue;
       +                }
       +                beg = uc_dec(p);
       +                p += uc_len(p);
       +                end = beg;
       +                if (p[0] == '-' && p[1] && p[1] != ']') {
       +                        p++;
       +                        end = uc_dec(p);
       +                        p += uc_len(p);
       +                }
       +                if (flg & REG_ICASE && beg < 128 && isupper(beg))
       +                        beg = tolower(beg);
       +                if (flg & REG_ICASE && end < 128 && isupper(end))
       +                        end = tolower(end);
       +                if (c >= beg && c <= end)
       +                        return not;
       +        }
       +        return !not;
       +}
       +
       +static int ratom_match(struct ratom *ra, struct rstate *rs)
       +{
       +        if (ra->ra == RA_CHR) {
       +                int c1 = uc_dec(ra->s);
       +                int c2 = uc_dec(rs->s);
       +                if (rs->flg & REG_ICASE && c1 < 128 && isupper(c1))
       +                        c1 = tolower(c1);
       +                if (rs->flg & REG_ICASE && c2 < 128 && isupper(c2))
       +                        c2 = tolower(c2);
       +                if (c1 != c2)
       +                        return 1;
       +                rs->s += uc_len(ra->s);
       +                return 0;
       +        }
       +        if (ra->ra == RA_ANY) {
       +                if (!rs->s[0])
       +                        return 1;
       +                rs->s += uc_len(rs->s);
       +                return 0;
       +        }
       +        if (ra->ra == RA_BRK) {
       +                int c = uc_dec(rs->s);
       +                if (!c)
       +                        return 1;
       +                rs->s += uc_len(rs->s);
       +                return brk_match(ra->s + 1, c, rs->flg);
       +        }
       +        if (ra->ra == RA_BEG && !(rs->flg & REG_NOTBOL))
       +                return rs->s != rs->o;
       +        if (ra->ra == RA_END && !(rs->flg & REG_NOTEOL))
       +                return rs->s[0] != '\0';
       +        if (ra->ra == RA_WBEG)
       +                return !((rs->s == rs->o || !isword(uc_beg(rs->o, rs->s - 1))) &&
       +                        isword(rs->s));
       +        if (ra->ra == RA_WEND)
       +                return !(rs->s != rs->o && isword(uc_beg(rs->o, rs->s - 1)) &&
       +                        (!rs->s[0] || !isword(rs->s)));
       +        return 1;
       +}
       +
       +static struct rnode *rnode_parse(char **pat);
       +
       +static struct rnode *rnode_grp(char **pat)
       +{
       +        struct rnode *rnode;
       +        if ((*pat)[0] != '(')
       +                return NULL;
       +        *pat += 1;
       +        rnode = rnode_parse(pat);
       +        if ((*pat)[0] != ')') {
       +                rnode_free(rnode);
       +                return NULL;
       +        }
       +        *pat += 1;
       +        return rnode_make(RN_GRP, rnode, NULL);
       +}
       +
       +static struct rnode *rnode_atom(char **pat)
       +{
       +        struct rnode *rnode;
       +        if (!**pat)
       +                return NULL;
       +        if ((*pat)[0] == '|' || (*pat)[0] == ')')
       +                return NULL;
       +        if ((*pat)[0] == '(') {
       +                rnode = rnode_grp(pat);
       +        } else {
       +                rnode = rnode_make(RN_ATOM, NULL, NULL);
       +                ratom_read(&rnode->ra, pat);
       +        }
       +        if ((*pat)[0] == '*' || (*pat)[0] == '?') {
       +                rnode->mincnt = 0;
       +                rnode->maxcnt = (*pat)[0] == '*' ? -1 : 1;
       +                (*pat)++;
       +        }
       +        if ((*pat)[0] == '+') {
       +                rnode->mincnt = 1;
       +                rnode->maxcnt = -1;
       +                *pat += 1;
       +        }
       +        if ((*pat)[0] == '{') {
       +                rnode->mincnt = 0;
       +                rnode->maxcnt = 0;
       +                *pat += 1;
       +                while (isdigit((unsigned char) **pat))
       +                        rnode->mincnt = rnode->mincnt * 10 + *(*pat)++ - '0';
       +                if (**pat == ',') {
       +                        (*pat)++;
       +                        if ((*pat)[0] == '}')
       +                                rnode->maxcnt = -1;
       +                        while (isdigit((unsigned char) **pat))
       +                                rnode->maxcnt = rnode->maxcnt * 10 + *(*pat)++ - '0';
       +                } else {
       +                        rnode->maxcnt = rnode->mincnt;
       +                }
       +                *pat += 1;
       +                if (rnode->mincnt > NREPS || rnode->maxcnt > NREPS) {
       +                        rnode_free(rnode);
       +                        return NULL;
       +                }
       +        }
       +        return rnode;
       +}
       +
       +static struct rnode *rnode_seq(char **pat)
       +{
       +        struct rnode *c1 = rnode_atom(pat);
       +        struct rnode *c2;
       +        if (!c1)
       +                return NULL;
       +        c2 = rnode_seq(pat);
       +        return c2 ? rnode_make(RN_CAT, c1, c2) : c1;
       +}
       +
       +static struct rnode *rnode_parse(char **pat)
       +{
       +        struct rnode *c1 = rnode_seq(pat);
       +        struct rnode *c2;
       +        if ((*pat)[0] != '|')
       +                return c1;
       +        *pat += 1;
       +        c2 = rnode_parse(pat);
       +        return c2 ? rnode_make(RN_ALT, c1, c2) : c1;
       +}
       +
       +static int rnode_count(struct rnode *rnode)
       +{
       +        int n = 1;
       +        if (!rnode)
       +                return 0;
       +        if (rnode->rn == RN_CAT)
       +                n = rnode_count(rnode->c1) + rnode_count(rnode->c2);
       +        if (rnode->rn == RN_ALT)
       +                n = rnode_count(rnode->c1) + rnode_count(rnode->c2) + 2;
       +        if (rnode->rn == RN_GRP)
       +                n = rnode_count(rnode->c1) + 2;
       +        if (rnode->mincnt == 0 && rnode->maxcnt == 0)
       +                return 0;
       +        if (rnode->mincnt == 1 && rnode->maxcnt == 1)
       +                return n;
       +        if (rnode->maxcnt < 0) {
       +                n = (rnode->mincnt + 1) * n + 1;
       +        } else {
       +                n = (rnode->mincnt + rnode->maxcnt) * n +
       +                        rnode->maxcnt - rnode->mincnt;
       +        }
       +        if (!rnode->mincnt)
       +                n++;
       +        return n;
       +}
       +
       +static int re_insert(struct regex *p, int ri)
       +{
       +        p->p[p->n++].ri = ri;
       +        return p->n - 1;
       +}
       +
       +static void rnode_emit(struct rnode *n, struct regex *p);
       +
       +static void rnode_emitnorep(struct rnode *n, struct regex *p)
       +{
       +        int fork, done, mark;
       +        if (n->rn == RN_ALT) {
       +                fork = re_insert(p, RI_FORK);
       +                p->p[fork].a1 = p->n;
       +                rnode_emit(n->c1, p);
       +                done = re_insert(p, RI_JUMP);
       +                p->p[fork].a2 = p->n;
       +                rnode_emit(n->c2, p);
       +                p->p[done].a1 = p->n;
       +        }
       +        if (n->rn == RN_CAT) {
       +                rnode_emit(n->c1, p);
       +                rnode_emit(n->c2, p);
       +        }
       +        if (n->rn == RN_GRP) {
       +                int grp = p->grpcnt++ + 1;
       +                mark = re_insert(p, RI_MARK);
       +                p->p[mark].mark = 2 * grp;
       +                rnode_emit(n->c1, p);
       +                mark = re_insert(p, RI_MARK);
       +                p->p[mark].mark = 2 * grp + 1;
       +        }
       +        if (n->rn == RN_ATOM) {
       +                int atom = re_insert(p, RI_ATOM);
       +                ratom_copy(&p->p[atom].ra, &n->ra);
       +        }
       +}
       +
       +static void rnode_emit(struct rnode *n, struct regex *p)
       +{
       +        int last;
       +        int jmpend[NREPS];
       +        int jmpend_cnt = 0;
       +        int i;
       +        if (!n)
       +                return;
       +        if (n->mincnt == 0 && n->maxcnt == 0)
       +                return;
       +        if (n->mincnt == 1 && n->maxcnt == 1) {
       +                rnode_emitnorep(n, p);
       +                return;
       +        }
       +        if (n->mincnt == 0) {
       +                int fork = re_insert(p, RI_FORK);
       +                p->p[fork].a1 = p->n;
       +                jmpend[jmpend_cnt++] = fork;
       +        }
       +        for (i = 0; i < MAX(1, n->mincnt); i++) {
       +                last = p->n;
       +                rnode_emitnorep(n, p);
       +        }
       +        if (n->maxcnt < 0) {
       +                int fork;
       +                fork = re_insert(p, RI_FORK);
       +                p->p[fork].a1 = last;
       +                p->p[fork].a2 = p->n;
       +        }
       +        for (i = MAX(1, n->mincnt); i < n->maxcnt; i++) {
       +                int fork = re_insert(p, RI_FORK);
       +                p->p[fork].a1 = p->n;
       +                jmpend[jmpend_cnt++] = fork;
       +                rnode_emitnorep(n, p);
       +        }
       +        for (i = 0; i < jmpend_cnt; i++)
       +                p->p[jmpend[i]].a2 = p->n;
       +}
       +
       +int regcomp(regex_t *preg, char *pat, int flg)
       +{
       +        struct rnode *rnode = rnode_parse(&pat);
       +        struct regex *re = malloc(sizeof(*re));
       +        int n = rnode_count(rnode) + 3;
       +        int mark;
       +        memset(re, 0, sizeof(*re));
       +        re->p = malloc(n * sizeof(re->p[0]));
       +        memset(re->p, 0, n * sizeof(re->p[0]));
       +        mark = re_insert(re, RI_MARK);
       +        re->p[mark].mark = 0;
       +        rnode_emit(rnode, re);
       +        mark = re_insert(re, RI_MARK);
       +        re->p[mark].mark = 1;
       +        mark = re_insert(re, RI_MATCH);
       +        rnode_free(rnode);
       +        re->flg = flg;
       +        *preg = re;
       +        return 0;
       +}
       +
       +void regfree(regex_t *preg)
       +{
       +        struct regex *re = *preg;
       +        int i;
       +        for (i = 0; i < re->n; i++)
       +                if (re->p[i].ri == RI_ATOM)
       +                        free(re->p[i].ra.s);
       +        free(re->p);
       +        free(re);
       +}
       +
       +static int re_rec(struct regex *re, struct rstate *rs)
       +{
       +        struct rinst *ri = &re->p[rs->pc];
       +        if (ri->ri == RI_ATOM) {
       +                if (ratom_match(&ri->ra, rs))
       +                        return 1;
       +                rs->pc++;
       +                return re_rec(re, rs);
       +        }
       +        if (ri->ri == RI_MARK) {
       +                if (ri->mark < NGRPS)
       +                        rs->mark[ri->mark] = rs->s - rs->o;
       +                rs->pc++;
       +                return re_rec(re, rs);
       +        }
       +        if (ri->ri == RI_JUMP) {
       +                rs->pc = ri->a1;
       +                return re_rec(re, rs);
       +        }
       +        if (ri->ri == RI_FORK) {
       +                struct rstate base = *rs;
       +                rs->pc = ri->a1;
       +                if (!re_rec(re, rs))
       +                        return 0;
       +                *rs = base;
       +                rs->pc = ri->a2;
       +                return re_rec(re, rs);
       +        }
       +        if (ri->ri == RI_MATCH)
       +                return 0;
       +        return 1;
       +}
       +
       +static int re_recmatch(struct regex *re, struct rstate *rs, int nsub, regmatch_t *psub)
       +{
       +        int i;
       +        rs->pc = 0;
       +        for (i = 0; i < LEN(rs->mark); i++)
       +                rs->mark[i] = -1;
       +        if (!re_rec(re, rs)) {
       +                for (i = 0; i < nsub; i++) {
       +                        psub[i].rm_so = i * 2 < LEN(rs->mark) ? rs->mark[i * 2] : -1;
       +                        psub[i].rm_eo = i * 2 < LEN(rs->mark) ? rs->mark[i * 2 + 1] : -1;
       +                }
       +                return 0;
       +        }
       +        return 1;
       +}
       +
       +int regexec(regex_t *preg, char *s, int nsub, regmatch_t psub[], int flg)
       +{
       +        struct regex *re = *preg;
       +        struct rstate rs;
       +        memset(&rs, 0, sizeof(rs));
       +        rs.flg = re->flg | flg;
       +        rs.o = s;
       +        while (*s) {
       +                rs.s = s++;
       +                if (!re_recmatch(re, &rs, flg & REG_NOSUB ? 0 : nsub, psub))
       +                        return 0;
       +        }
       +        return 1;
       +}
       +
       +int regerror(int errcode, regex_t *preg, char *errbuf, int errbuf_size)
       +{
       +        return 0;
       +}
 (DIR) diff --git a/regex.h b/regex.h
       t@@ -0,0 +1,18 @@
       +#define REG_EXTENDED                0x01
       +#define REG_NOSUB                0x02
       +#define REG_ICASE                0x04
       +#define REG_NEWLINE                0x08
       +#define REG_NOTBOL                0x10
       +#define REG_NOTEOL                0x20
       +
       +typedef struct {
       +        long rm_so;
       +        long rm_eo;
       +} regmatch_t;
       +
       +typedef struct regex *regex_t;
       +
       +int regcomp(regex_t *preg, char *regex, int cflags);
       +int regexec(regex_t *preg, char *str, int nmatch, regmatch_t pmatch[], int eflags);
       +int regerror(int errcode, regex_t *preg, char *errbuf, int errbuf_size);
       +void regfree(regex_t *preg);
 (DIR) diff --git a/rset.c b/rset.c
       t@@ -1,7 +1,7 @@
       -#include <regex.h>
        #include <stdlib.h>
        #include <stdio.h>
        #include <string.h>
       +#include "regex.h"
        #include "vi.h"
        
        /* regular expression set */