trstr: faster searching for non-regex patterns - neatvi - [fork] simple vi-type editor with UTF-8 support
 (HTM) git clone git://src.adamsgaard.dk/neatvi
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
       ---
 (DIR) commit 2c0f3f357bcd4446492330f2313464c3c6348343
 (DIR) parent 7df341b1420aeeae6a9da53a5209762eda00341d
 (HTM) Author: Ali Gholami Rudi <ali@rudi.ir>
       Date:   Sat, 20 Nov 2021 21:57:26 +0330
       
       rstr: faster searching for non-regex patterns
       
       Diffstat:
         M Makefile                            |       2 +-
         M ex.c                                |      34 +++++++++++++++----------------
         M mot.c                               |       8 ++++----
         A rstr.c                              |     115 +++++++++++++++++++++++++++++++
         M vi.h                                |       8 ++++++--
       
       5 files changed, 142 insertions(+), 25 deletions(-)
       ---
 (DIR) diff --git a/Makefile b/Makefile
       t@@ -3,7 +3,7 @@ CFLAGS = -Wall -O2
        LDFLAGS =
        
        OBJS = vi.o ex.o lbuf.o mot.o sbuf.o ren.o dir.o syn.o reg.o led.o \
       -        uc.o term.o rset.o regex.o cmd.o conf.o
       +        uc.o term.o rset.o rstr.o regex.o cmd.o conf.o
        
        all: vi
        
 (DIR) diff --git a/ex.c b/ex.c
       t@@ -167,8 +167,8 @@ static int ex_search(char **pat)
                struct sbuf *kw;
                char *b = *pat;
                char *e = b;
       -        char *pats[1];
       -        struct rset *re;
       +        char *pat_re;
       +        struct rstr *re;
                int dir, row;
                kw = sbuf_make();
                while (*++e) {
       t@@ -182,18 +182,18 @@ static int ex_search(char **pat)
                        ex_kwdset(sbuf_buf(kw), **pat == '/' ? 1 : -1);
                sbuf_free(kw);
                *pat = *e ? e + 1 : e;
       -        if (ex_kwd(&pats[0], &dir))
       +        if (ex_kwd(&pat_re, &dir))
                        return -1;
       -        re = rset_make(1, pats, xic ? RE_ICASE : 0);
       +        re = rstr_make(pat_re, xic ? RE_ICASE : 0);
                if (!re)
                        return -1;
                row = xrow + dir;
                while (row >= 0 && row < lbuf_len(xb)) {
       -                if (rset_find(re, lbuf_get(xb, row), 0, NULL, 0) >= 0)
       +                if (rstr_find(re, lbuf_get(xb, row), 0, NULL, 0) >= 0)
                                break;
                        row += dir;
                }
       -        rset_free(re);
       +        rstr_free(re);
                return row >= 0 && row < lbuf_len(xb) ? row : -1;
        }
        
       t@@ -604,10 +604,9 @@ static void replace(struct sbuf *dst, char *rep, char *ln, int *offs)
        
        static int ec_substitute(char *loc, char *cmd, char *arg)
        {
       -        struct rset *re;
       +        struct rstr *re;
                int offs[32];
                int beg, end;
       -        char *pats[1];
                char *pat = NULL, *rep = NULL;
                char *s = arg;
                int i;
       t@@ -624,15 +623,15 @@ static int ec_substitute(char *loc, char *cmd, char *arg)
                        snprintf(xrep, sizeof(xrep), "%s", rep ? rep : "");
                free(pat);
                free(rep);
       -        if (ex_kwd(&pats[0], NULL))
       +        if (ex_kwd(&pat, NULL))
                        return 1;
       -        re = rset_make(1, pats, xic ? RE_ICASE : 0);
       +        re = rstr_make(pat, xic ? RE_ICASE : 0);
                if (!re)
                        return 1;
                for (i = beg; i < end; i++) {
                        char *ln = lbuf_get(xb, i);
                        struct sbuf *r = NULL;
       -                while (rset_find(re, ln, LEN(offs) / 2, offs, 0) >= 0) {
       +                while (rstr_find(re, ln, LEN(offs) / 2, offs, 0) >= 0) {
                                if (!r)
                                        r = sbuf_make();
                                sbuf_mem(r, ln, offs[0]);
       t@@ -649,7 +648,7 @@ static int ec_substitute(char *loc, char *cmd, char *arg)
                                sbuf_free(r);
                        }
                }
       -        rset_free(re);
       +        rstr_free(re);
                return 0;
        }
        
       t@@ -716,10 +715,9 @@ static int ex_exec(char *ln);
        
        static int ec_glob(char *loc, char *cmd, char *arg)
        {
       -        struct rset *re;
       +        struct rstr *re;
                int offs[32];
                int beg, end, not;
       -        char *pats[1];
                char *pat;
                char *s = arg;
                int i;
       t@@ -732,9 +730,9 @@ static int ec_glob(char *loc, char *cmd, char *arg)
                if (pat && pat[0])
                        ex_kwdset(pat, +1);
                free(pat);
       -        if (ex_kwd(&pats[0], NULL))
       +        if (ex_kwd(&pat, NULL))
                        return 1;
       -        if (!(re = rset_make(1, pats, xic ? RE_ICASE : 0)))
       +        if (!(re = rstr_make(pat, xic ? RE_ICASE : 0)))
                        return 1;
                xgdep++;
                for (i = beg + 1; i < end; i++)
       t@@ -742,7 +740,7 @@ static int ec_glob(char *loc, char *cmd, char *arg)
                i = beg;
                while (i < lbuf_len(xb)) {
                        char *ln = lbuf_get(xb, i);
       -                if ((rset_find(re, ln, LEN(offs) / 2, offs, 0) < 0) == not) {
       +                if ((rstr_find(re, ln, LEN(offs) / 2, offs, 0) < 0) == not) {
                                xrow = i;
                                if (ex_exec(s))
                                        break;
       t@@ -754,7 +752,7 @@ static int ec_glob(char *loc, char *cmd, char *arg)
                for (i = 0; i < lbuf_len(xb); i++)
                        lbuf_globget(xb, i, xgdep);
                xgdep--;
       -        rset_free(re);
       +        rstr_free(re);
                return 0;
        }
        
 (DIR) diff --git a/mot.c b/mot.c
       t@@ -55,13 +55,13 @@ int lbuf_search(struct lbuf *lb, char *kw, int dir, int *r, int *o, int *len)
                int found = 0;
                int r0 = *r, o0 = *o;
                int i;
       -        struct rset *re = rset_make(1, &kw, xic ? RE_ICASE : 0);
       +        struct rstr *re = rstr_make(kw, xic ? RE_ICASE : 0);
                if (!re)
                        return 1;
                for (i = r0; !found && i >= 0 && i < lbuf_len(lb); i += dir) {
                        char *s = lbuf_get(lb, i);
                        int off = dir > 0 && r0 == i ? uc_chr(s, o0 + 1) - s : 0;
       -                while (rset_find(re, s + off, 1, offs,
       +                while (rstr_find(re, s + off, 1, offs,
                                        off ? RE_NOTBOL : 0) >= 0) {
                                if (dir < 0 && r0 == i &&
                                                uc_off(s, off + offs[0]) >= o0)
       t@@ -71,11 +71,11 @@ int lbuf_search(struct lbuf *lb, char *kw, int dir, int *r, int *o, int *len)
                                *r = i;
                                *len = uc_off(s + off + offs[0], offs[1] - offs[0]);
                                off += offs[1] > offs[0] ? offs[1] : offs[1] + 1;
       -                        if (dir > 0)
       +                        if (dir > 0 || !s[off])
                                        break;
                        }
                }
       -        rset_free(re);
       +        rstr_free(re);
                return !found;
        }
        
 (DIR) diff --git a/rstr.c b/rstr.c
       t@@ -0,0 +1,115 @@
       +#include <ctype.h>
       +#include <stdlib.h>
       +#include <stdio.h>
       +#include <string.h>
       +#include "vi.h"
       +
       +struct rstr {
       +        struct rset *rs;        /* the compiled regular expression */
       +        char *str;                /* simple search string */
       +        int icase;
       +        int lbeg, lend;
       +        int wbeg, wend;
       +};
       +
       +static int rstr_simple(struct rstr *rs, char *re)
       +{
       +        char *beg;
       +        char *end;
       +        rs->lbeg = re[0] == '^';
       +        if (rs->lbeg)
       +                re++;
       +        rs->wbeg = re[0] == '\\' && re[1] == '<';
       +        if (rs->wbeg)
       +                re += 2;
       +        beg = re;
       +        while (re[0] && !strchr("\\.*+?[]{}()$", (unsigned char) re[0]))
       +                re++;
       +        end = re;
       +        rs->wend = re[0] == '\\' && re[1] == '>';
       +        if (rs->wend)
       +                re += 2;
       +        rs->lend = re[0] == '$';
       +        if (rs->lend)
       +                re++;
       +        if (!re[0]) {
       +                int len = end - beg;
       +                rs->str = malloc(len + 1);
       +                memcpy(rs->str, beg, len);
       +                rs->str[len] = '\0';
       +                return 0;
       +        }
       +        return 1;
       +}
       +
       +struct rstr *rstr_make(char *re, int flg)
       +{
       +        struct rstr *rs = malloc(sizeof(*rs));
       +        memset(rs, 0, sizeof(*rs));
       +        rs->icase = flg & RE_ICASE;
       +        if (rstr_simple(rs, re))
       +                rs->rs = rset_make(1, &re, flg);
       +        if (!rs->rs && !rs->str) {
       +                free(rs);
       +                return NULL;
       +        }
       +        return rs;
       +}
       +
       +static int isword(char *s)
       +{
       +        int c = (unsigned char) s[0];
       +        return isalnum(c) || c == '_' || c > 127;
       +}
       +
       +static int match_case(char *s, char *r, int icase)
       +{
       +        for (; *r && *s; s++, r++) {
       +                if (!icase && *s != *r)
       +                        return 1;
       +                if (icase && tolower((unsigned char) *s) != tolower((unsigned char) *r))
       +                        return 1;
       +        }
       +        return 0;
       +}
       +
       +/* return zero if an occurrence is found */
       +int rstr_find(struct rstr *rs, char *s, int n, int *grps, int flg)
       +{
       +        int len;
       +        char *beg, *end;
       +        char *r;
       +        if (rs->rs)
       +                return rset_find(rs->rs, s, n, grps, flg);
       +        if ((rs->lbeg && (flg & RE_NOTBOL)) || (rs->lend && (flg & RE_NOTEOL)))
       +                return -1;
       +        len = strlen(rs->str);
       +        beg = s;
       +        end = s + strlen(s) - len - 1;
       +        if (rs->lbeg)
       +                end = beg;
       +        if (rs->lend)
       +                beg = end;
       +        for (r = beg; r <= end; r++) {
       +                if (rs->wbeg && r > s && (isword(r - 1) || !isword(r)))
       +                        continue;
       +                if (rs->wend && r[len] && (!isword(r + len - 1) || isword(r + len)))
       +                        continue;
       +                if (!match_case(r, rs->str, rs->icase)) {
       +                        if (n >= 1) {
       +                                grps[0] = r - s;
       +                                grps[1] = r - s + len;
       +                        }
       +                        return 0;
       +                }
       +        }
       +        return -1;
       +}
       +
       +void rstr_free(struct rstr *rs)
       +{
       +        if (rs->rs)
       +                rset_free(rs->rs);
       +        free(rs->str);
       +        free(rs);
       +}
 (DIR) diff --git a/vi.h b/vi.h
       t@@ -45,15 +45,19 @@ void sbuf_printf(struct sbuf *sbuf, char *s, ...);
        int sbuf_len(struct sbuf *sb);
        void sbuf_cut(struct sbuf *s, int len);
        
       -/* regular expression sets */
       +/* regular expressions */
        #define RE_ICASE                1
        #define RE_NOTBOL                2
        #define RE_NOTEOL                4
       -
       +/* regular expression sets: searching for multiple regular expressions */
        struct rset *rset_make(int n, char **pat, int flg);
        int rset_find(struct rset *re, char *s, int n, int *grps, int flg);
        void rset_free(struct rset *re);
        char *re_read(char **src);
       +/* searching for a single pattern regular expression */
       +struct rstr *rstr_make(char *re, int flg);
       +int rstr_find(struct rstr *rs, char *s, int n, int *grps, int flg);
       +void rstr_free(struct rstr *rs);
        
        /* rendering lines */
        int *ren_position(char *s);