re.c - 9base - revived minimalist port of Plan 9 userland to Unix
 (HTM) git clone git://git.suckless.org/9base
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       re.c (6951B)
       ---
            1 /****************************************************************
            2 Copyright (C) Lucent Technologies 1997
            3 All Rights Reserved
            4 
            5 Permission to use, copy, modify, and distribute this software and
            6 its documentation for any purpose and without fee is hereby
            7 granted, provided that the above copyright notice appear in all
            8 copies and that both that the copyright notice and this
            9 permission notice and warranty disclaimer appear in supporting
           10 documentation, and that the name Lucent Technologies or any of
           11 its entities not be used in advertising or publicity pertaining
           12 to distribution of the software without specific, written prior
           13 permission.
           14 
           15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
           16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
           17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
           18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
           19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
           20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
           21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
           22 THIS SOFTWARE.
           23 ****************************************************************/
           24 
           25 
           26 #define DEBUG
           27 #include <stdio.h>
           28 #include <u.h>
           29 #include <libc.h>
           30 #include <ctype.h>
           31 #include <bio.h>
           32 #include <regexp.h>
           33 #include "awk.h"
           34 #include "y.tab.h"
           35 
           36         /* This file provides the interface between the main body of
           37          * awk and the pattern matching package.  It preprocesses
           38          * patterns prior to compilation to provide awk-like semantics
           39          * to character sequences not supported by the pattern package.
           40          * The following conversions are performed:
           41          *
           42          *        "()"                ->        "[]"
           43          *        "[-"                ->        "[\-"
           44          *        "[^-"                ->        "[^\-"
           45          *        "-]"                ->        "\-]"
           46          *        "[]"                ->        "[]*"
           47          *        "\xdddd"        ->        "\z" where 'z' is the UTF sequence
           48          *                                        for the hex value
           49          *        "\ddd"                ->        "\o" where 'o' is a char octal value
           50          *        "\b"                ->        "\B"        where 'B' is backspace
           51          *        "\t"                ->        "\T"        where 'T' is tab
           52          *        "\f"                ->        "\F"        where 'F' is form feed
           53          *        "\n"                ->        "\N"        where 'N' is newline
           54          *        "\r"                ->        "\r"        where 'C' is cr
           55          */
           56 
           57 #define        MAXRE        512
           58 
           59 static char        re[MAXRE];        /* copy buffer */
           60 
           61 char        *patbeg;
           62 int        patlen;                        /* number of chars in pattern */
           63 
           64 #define        NPATS        20                /* number of slots in pattern cache */
           65 
           66 static struct pat_list                /* dynamic pattern cache */
           67 {
           68         char        *re;
           69         int        use;
           70         Reprog        *program;
           71 } pattern[NPATS];
           72 
           73 static int npats;                /* cache fill level */
           74 
           75         /* Compile a pattern */
           76 void
           77 *compre(char *pat)
           78 {
           79         int i, j, inclass;
           80         char c, *p, *s;
           81         Reprog *program;
           82 
           83         if (!compile_time) {        /* search cache for dynamic pattern */
           84                 for (i = 0; i < npats; i++)
           85                         if (!strcmp(pat, pattern[i].re)) {
           86                                 pattern[i].use++;
           87                                 return((void *) pattern[i].program);
           88                         }
           89         }
           90                 /* Preprocess Pattern for compilation */
           91         p = re;
           92         s = pat;
           93         inclass = 0;
           94         while (c = *s++) {
           95                 if (c == '\\') {
           96                         quoted(&s, &p, re+MAXRE);
           97                         continue;
           98                 }
           99                 else if (!inclass && c == '(' && *s == ')') {
          100                         if (p < re+MAXRE-2) {        /* '()' -> '[]*' */
          101                                 *p++ = '[';
          102                                 *p++ = ']';
          103                                 c = '*';
          104                                 s++;
          105                         }
          106                         else overflow();
          107                 }
          108                 else if (c == '['){                        /* '[-' -> '[\-' */
          109                         inclass = 1;
          110                         if (*s == '-') {
          111                                 if (p < re+MAXRE-2) {
          112                                         *p++ = '[';
          113                                         *p++ = '\\';
          114                                         c = *s++;
          115                                 }
          116                                 else overflow();
          117                         }                                /* '[^-' -> '[^\-'*/
          118                         else if (*s == '^' && s[1] == '-'){
          119                                 if (p < re+MAXRE-3) {
          120                                         *p++ = '[';
          121                                         *p++ = *s++;
          122                                         *p++ = '\\';
          123                                         c = *s++;
          124                                 }
          125                                 else overflow();
          126                         }
          127                         else if (*s == '['){                /* skip '[[' */
          128                                 if (p < re+MAXRE-1)
          129                                         *p++ = c;
          130                                 else overflow();
          131                                 c = *s++;
          132                         }
          133                         else if (*s == '^' && s[1] == '[') {        /* skip '[^['*/
          134                                 if (p < re+MAXRE-2) {
          135                                         *p++ = c;
          136                                         *p++ = *s++;
          137                                         c = *s++;
          138                                 }
          139                                 else overflow();
          140                         }
          141                         else if (*s == ']') {                /* '[]' -> '[]*' */
          142                                 if (p < re+MAXRE-2) {
          143                                         *p++ = c;
          144                                         *p++ = *s++;
          145                                         c = '*';
          146                                         inclass = 0;
          147                                 }
          148                                 else overflow();
          149                         }
          150                 }
          151                 else if (c == '-' && *s == ']') {        /* '-]' -> '\-]' */
          152                         if (p < re+MAXRE-1)
          153                                 *p++ = '\\';
          154                         else overflow();
          155                 }
          156                 else if (c == ']')
          157                         inclass = 0;
          158                 if (p < re+MAXRE-1)
          159                         *p++ = c;
          160                 else overflow();
          161         }
          162         *p = 0;
          163         program = regcomp(re);                /* compile pattern */
          164         if (!compile_time) {
          165                 if (npats < NPATS)        /* Room in cache */
          166                         i = npats++;
          167                 else {                        /* Throw out least used */
          168                         int use = pattern[0].use;
          169                         i = 0;
          170                         for (j = 1; j < NPATS; j++) {
          171                                 if (pattern[j].use < use) {
          172                                         use = pattern[j].use;
          173                                         i = j;
          174                                 }
          175                         }
          176                         xfree(pattern[i].program);
          177                         xfree(pattern[i].re);
          178                 }
          179                 pattern[i].re = tostring(pat);
          180                 pattern[i].program = program;
          181                 pattern[i].use = 1;
          182         }
          183         return((void *) program);
          184 }
          185 
          186         /* T/F match indication - matched string not exported */
          187 int
          188 match(void *p, char *s, char *start)
          189 {
          190         return regexec((Reprog *) p, (char *) s, 0, 0);
          191 }
          192 
          193         /* match and delimit the matched string */
          194 int
          195 pmatch(void *p, char *s, char *start)
          196 {
          197         Resub m;
          198 
          199         m.s.sp = start;
          200         m.e.ep = 0;
          201         if (regexec((Reprog *) p, (char *) s, &m, 1)) {
          202                 patbeg = m.s.sp;
          203                 patlen = m.e.ep-m.s.sp;
          204                 return 1;
          205         }
          206         patlen = -1;
          207         patbeg = start;
          208         return 0;
          209 }
          210 
          211         /* perform a non-empty match */
          212 int
          213 nematch(void *p, char *s, char *start)
          214 {
          215         if (pmatch(p, s, start) == 1 && patlen > 0)
          216                 return 1;
          217         patlen = -1;
          218         patbeg = start; 
          219         return 0;
          220 }
          221 /* in the parsing of regular expressions, metacharacters like . have */
          222 /* to be seen literally;  \056 is not a metacharacter. */
          223 
          224 int
          225 hexstr(char **pp)        /* find and eval hex string at pp, return new p */
          226 {
          227         char c;
          228         int n = 0;
          229         int i;
          230 
          231         for (i = 0, c = (*pp)[i]; i < 4 && isxdigit(c); i++, c = (*pp)[i]) {
          232                 if (isdigit(c))
          233                         n = 16 * n + c - '0';
          234                 else if ('a' <= c && c <= 'f')
          235                         n = 16 * n + c - 'a' + 10;
          236                 else if ('A' <= c && c <= 'F')
          237                         n = 16 * n + c - 'A' + 10;
          238         }
          239         *pp += i;
          240         return n;
          241 }
          242 
          243         /* look for awk-specific escape sequences */
          244 
          245 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
          246 
          247 void
          248 quoted(char **s, char **to, char *end)        /* handle escaped sequence */
          249 {
          250         char *p = *s;
          251         char *t = *to;
          252         wchar_t c;
          253 
          254         switch(c = *p++) {
          255         case 't':
          256                 c = '\t';
          257                 break;
          258         case 'n':
          259                 c = '\n';
          260                 break;
          261         case 'f':
          262                 c = '\f';
          263                 break;
          264         case 'r':
          265                 c = '\r';
          266                 break;
          267         case 'b':
          268                 c = '\b';
          269                 break;
          270         default:
          271                 if (t < end-1)                /* all else must be escaped */
          272                         *t++ = '\\';
          273                 if (c == 'x') {                /* hexadecimal goo follows */
          274                         c = hexstr(&p);
          275                         if (t < end-MB_CUR_MAX)
          276                                 t += wctomb(t, c);
          277                         else overflow();
          278                         *to = t;
          279                         *s = p;
          280                         return;
          281                 } else if (isoctdigit(c)) {        /* \d \dd \ddd */
          282                         c -= '0';
          283                         if (isoctdigit(*p)) {
          284                                 c = 8 * c + *p++ - '0';
          285                                 if (isoctdigit(*p))
          286                                         c = 8 * c + *p++ - '0';
          287                         }
          288                 }
          289                 break;
          290         }
          291         if (t < end-1)
          292                 *t++ = c;
          293         *s = p;
          294         *to = t;
          295 }
          296         /* count rune positions */
          297 int
          298 countposn(char *s, int n)
          299 {
          300         int i, j;
          301         char *end;
          302 
          303         for (i = 0, end = s+n; *s && s < end; i++){
          304                 j = mblen(s, n);
          305                 if(j <= 0)
          306                         j = 1;
          307                 s += j;
          308         }
          309         return(i);
          310 }
          311 
          312         /* pattern package error handler */
          313 
          314 void
          315 regerror(char *s)
          316 {
          317         FATAL("%s", s);
          318 }
          319 
          320 void
          321 overflow(void)
          322 {
          323         FATAL("%s", "regular expression too big");
          324 }
          325