/* Pattern Search Utility * * This utility will search for regular expressions in text streams. * First a function is called to "compile" the search pattern into a form * easier to search with. Then a function is called with the text to be * searched; it returns a pointer to the first line within the text which * matches the pattern (or 0 if none did). This second function can be * called repeatedly, with successive pointers to find all lines which * match the pattern. The second function also returns a pointer to the * memory location just past the matched string, making repeated searches * easier. * For example: * * comppat("*one*",'\n',FALSE); * sptr = srchpat(textbuf,bufsz,endptr); * * will set sptr to the first line in textbuf containing the string * "one" anywhere in it, and endptr to the point just past the matched * line; or else it will return zero if there was no matching line. * Although the line must match the pattern exactly, searching for * a pattern anywhere in the line can be done by prefixing and postfixing * the pattern with "*". Also, comppat() can be told (via the third * parameter) to ignore case when comparing letters. * * Characters in the regular expressions have the following meanings: * * - unless one of the ones below, match exactly . * ? - match any single character. * * - match zero or more of any character. * [] - match any of the characters in the set. * [-] - match any of the characters not in the set. * ::= match * - match any char in the * range - * "\" can be used to quote characters (i.e., prevent them from being * interpretted specially). * * * Initial coding 8/18/82 by Harry R. Chesley. * * * Copyright 1982 by Harry R. Chesley. * Unlimited permission granted for non-profit use only. */ #include "pat.h" /* comppat(pat,eolchar,anycase) * * Function: Compile the regular expression pattern given in pat * into a series of pattern elements; append an element which matches * the eolchar. If anycase is TRUE, ignore case on letters. * * Algorithm: Parse pat for regular expression special characters, * constructing a compiled version in cpat. The following transformations * are made to get from the original pattern to the compiled series of * elements: * - if anycase FALSE, pel_type = MONEC, pel_c = ; * if anycase TRUE, pel_type = MSET, set = [, * other case of ]. * ? - pel_type = MONEA. * * - pel_type = MMANY. * [...] - pel_type = MSET, set = indicated characters. * [-...] - pel_type = MSET, set = all but indicated characters. * * Also, eolchar is not allowed in any but the last element. * * Comments: This routine compiles into the global cpat, and also * sets cptop and eolch. Therefore, it is not reentrant, and it is not * possible to be using more than one pattern at a time. */ comppat(pat,eolchar,anycase) char *pat; char eolchar; int anycase; { int negflg; /* Used in [...] processing to remember leading -. */ char c1, c2, ct; /* Used in [...] for - processing. */ eolch = eolchar; /* Remember end-of-line char. */ cptop = cpat; /* Start with nothing. */ /* Go thru pattern 'til end of string. */ while (*pat != 0) { /* Never allow the EOL character in the search pattern. */ if (*pat == eolch) { pat++; continue; }; /* Look for special characters. */ switch (*pat++) { /* Zero or more of anything. */ case '*': cptop->pel_type = MMANY; /* Zero or more. */ break; /* One of anything. */ case '?': cptop->pel_type = MONEA; /* One of anything.*/ break; /* Character sets. */ case '[': cptop->pel_type = MSET; /* Any in set. */ clrall(cptop); /* Start with nothing. */ /* Check for "any but". */ if (*pat == '-') { pat++; negflg = TRUE; } else negflg = FALSE; /* Figure the set. */ while ((*pat != ']') && (*pat != 0)) { if (*pat == '\\') if (*(++pat) == 0) break; /* Check for -. */ if (*(pat+1) == '-') { if (*(pat+2) != 0) { if ((c1 = *pat) > (c2 = *(pat+2))) { ct = c1; c1 = c2; c2 = ct; }; do if (anycase) setnoc(cptop,c1); else setmem(cptop,c1); while (c1++ < c2); pat += 2; } else pat++; } else { if (anycase) setnoc(cptop,*pat); else setmem(cptop,*pat); }; pat++; }; if (negflg) negset(cptop); if (*pat == ']') pat++; /* Never match the EOL char. */ clrmem(cptop,eolch); break; /* Quote. */ case '\\': if (*pat != 0) pat++; /* Fall thru to match single processing. */ /* Anything else: match only it. */ default: if (anycase) { cptop->pel_type = MSET; clrall(cptop); setnoc(cptop,*(pat-1)); } else { cptop->pel_type = MONEC; cptop->pel_c = *(pat-1); }; break; }; cptop++; /* Next element. */ }; /* Last of all, match EOL. */ cptop->pel_type = MONEC; cptop->pel_c = eolchar; #ifdef DEBUG /* Print out the pattern we just compiled. */ prtpat(); #endif } /* srchpat(strng,sz,eosptr) * * Function: Using the previously compiled pattern, search the string * strng (of size sz) for a line exactly matching the pattern. Return a * pointer to that line, or 0. On a non-zero return, return in eosptr a * pointer to the next character after the matched string. * * Algorithm: Repeatedly call match() on each successive line until a * line matching the pattern is found, or we run out of data. * * Comments: See comments on comppat(). * The eosptr return value is passed from match by side-effect. */ char *srchpat(strng,sz,eosptr) register char *strng; register unsigned sz; char **eosptr; { /* While we've still got something to search thru. */ while (sz != 0) { /* If this one matches, return it. */ if (match(strng,sz,cpat)) { *eosptr = nextstr; return(strng); }; /* Otherwise, find the next line, and try it. */ while (*strng != eolch) { strng++; sz--; if (sz == 0) return(0); }; strng++; sz--; /* Skip EOL. */ }; return(0); } /* match(sptr,sz,cpptr) * * Function: Return TRUE if the string sptr (of size sz) exactly * matches the compiled search pattern cpptr. If returning TRUE, also * return the next character past the match in nextstr. * * Algorithm: Match only the first element (shortest string first), * recursively calling ourself to match the remainder of the string. * * Comments: This function is used by srchpat() only. The user should * never call it directly. * nextstr is a side-effect return, which is not generally a nice idea. * However, match() is the most crucial routine in this package with regard * to execution-time efficiency, and passing another parameter thru the whole * recursive search would severly slow things down. * The recursive depth of this routine is bounded by the maximum size * of the search pattern. I.e., with a max pattern size of 100, this routine * will never call itself more than 100 times (or is it 101 times?). */ match(sptr,sz,cpptr) register char *sptr; register unsigned sz; register struct pel *cpptr; { struct pel *cpp1; /* cpptr + 1. */ /* If there's nothing left of the string, we can't match it. */ if (sz == 0) return(FALSE); /* Calculate next cpptr for later use. */ cpp1 = cpptr+1; /* Switch on type of element. */ switch (cpptr->pel_type) { /* Match one exactly. */ case MONEC: if (*sptr != cpptr->pel_c) return(FALSE); break; /* Match any one. */ case MONEA: if (*sptr == eolch) return(FALSE); break; /* Match any in set. */ case MSET: if (! inset(cpptr,*sptr)) return(FALSE); break; /* Match zero or more. */ case MMANY: /* Try matching 0, 1, 2, ... */ do { if (match(sptr,sz,cpp1)) return(TRUE); if (*sptr++ == eolch) break; } while (--sz); return(FALSE); /* We'd better not get here! */ default: #ifdef DEBUG printf("Illegal pel_type in match().\n"); #endif exit(1); }; /* It matched so see if we're at the end or the rest matches. */ if (cpptr == cptop) { nextstr = sptr+1; /* Return next char past line. */ return(TRUE); }; /* See if the rest matches. */ if (match(sptr+1,sz-1,cpp1)) return(TRUE); /* No match. */ return(FALSE); } /* setall(sptr), clrall(sptr), negset(sptr), setmem(sptr,mem), * setnoc(sptr,mem) * * Function: Set operations: set all bits, clear all, complement all, * clear one member bit, set one member bit, and set one member bit and it's * other case equivalent if a letter. * * Algorithm: Obvious. * * Comments: One other set operation (inset) is define as a macro * for speed. */ setall(sptr) struct pel *sptr; { int i; int *iptr; for (i = SETMAX, iptr = sptr->pel_set; i-- > 0; *iptr++ = 0xFFFF); } clrall(sptr) struct pel *sptr; { int i; int *iptr; for (i = SETMAX, iptr = sptr->pel_set; i-- > 0; *iptr++ = 0); } negset(sptr) struct pel *sptr; { int i; int *iptr; for (i = SETMAX, iptr = sptr->pel_set; i-- > 0; *iptr = ~*iptr, iptr++); } clrmem(sptr,mem) struct pel *sptr; char mem; { sptr->pel_set[(mem>>4)&7] &= ~(1<<(mem&15)); } setmem(sptr,mem) struct pel *sptr; char mem; { sptr->pel_set[(mem>>4)&7] |= (1<<(mem&15)); } setnoc(sptr,mem) struct pel *sptr; char mem; { if ((mem >= 'a') && (mem <= 'z')) setmem(sptr,mem+('A'-'a')); else if ((mem >= 'A') && (mem <= 'Z')) setmem(sptr,mem+('a'-'A')); setmem(sptr,mem); } /* prtpat() * * Function: Print the current pattern is a readable form. * * Algorithm: Obvious to the casual observer. * * Comments: This is for debugging purposes only, and is only compiled * or called if DEBUG is defined. */ #ifdef DEBUG prtpat() { struct pel *cpptr; char c; for (cpptr = cpat; cpptr <= cptop; cpptr++) { printf("** Prtpat: "); switch (cpptr->pel_type) { case MMANY: printf("*"); break; case MONEA: printf("?"); break; case MONEC: if (cpptr->pel_c < ' ') printf("\\%3.3o",cpptr->pel_c); else printf("%c",cpptr->pel_c); break; case MSET: printf("["); for (c = 0; c <= 128; c++ ) if (inset(cpptr,c)) if (c < ' ') printf(" \\%3.3o",c); else printf(" %c",c); printf("]"); break; default: printf("Illegal pel_type in prtpat!\n"); break; }; printf("\n"); }; } #endif  .