From howardjp@byzantine.student.umd.edu Thu Oct 14 21:44:37 1999
Return-Path: <howardjp@byzantine.student.umd.edu>
Received: from byzantine.student.umd.edu (byzantine.student.umd.edu [129.2.244.158])
	by hub.freebsd.org (Postfix) with ESMTP id D1DCC14F41
	for <FreeBSD-gnats-submit@freebsd.org>; Thu, 14 Oct 1999 21:44:29 -0700 (PDT)
	(envelope-from howardjp@byzantine.student.umd.edu)
Received: (from howardjp@localhost)
	by byzantine.student.umd.edu (8.9.3/8.9.3) id AAA81151;
	Fri, 15 Oct 1999 00:45:46 -0400 (EDT)
	(envelope-from howardjp)
Message-Id: <199910150445.AAA81151@byzantine.student.umd.edu>
Date: Fri, 15 Oct 1999 00:45:46 -0400 (EDT)
From: howardjp@wam.umd.edu
Sender: howardjp@byzantine.student.umd.edu
Reply-To: howardjp@wam.umd.edu, umd.edu@byzantine.student.umd.edu
To: FreeBSD-gnats-submit@freebsd.org
Subject: [PATCH] Speed ups for regex!
X-Send-Pr-Version: 3.2

>Number:         14342
>Category:       bin
>Synopsis:       [PATCH] Speed ups for regex!
>Confidential:   no
>Severity:       non-critical
>Priority:       low
>Responsible:    dcs
>State:          closed
>Quarter:        
>Keywords:       
>Date-Required:  
>Class:          change-request
>Submitter-Id:   current-users
>Arrival-Date:   Thu Oct 14 21:50:00 PDT 1999
>Closed-Date:    Wed Jun 28 21:42:24 PDT 2000
>Last-Modified:  Wed Jun 28 21:45:18 PDT 2000
>Originator:     James Howard
>Release:        FreeBSD 4.0-CURRENT i386
>Organization:
University of Maryland
>Environment:

FreeBSD byzantine.student.umd.edu 4.0-CURRENT FreeBSD 4.0-CURRENT #40: Wed Oct  6 18:17:44 EDT 1999     howardjp@byzantine.student.umd.edu:/usr/src/sys/compile/BYZANTINE  i386

>Description:

Regex is horribly, horribly slow.  

There is code in lib/libc/regex/engine.c to handle searching for
a known constant string which has to be present in any matches. 
This is meant to cut down the number of strings regexec() actually
has to search.  However, it uses a simple search to find it and 
it takes forever.

I replaced it with a Boyer-Moore search.  I stole the code from
4.4BSD's egrep.  I have trimed it down as much as I can and it 
roughly halves the speed of calls to regexec on mis-matches.

Apply the patch below to lib/libc/regex to get add the new code.

>How-To-Repeat:

N/A

>Fix:

diff -ur regex.orig/engine.c regex/engine.c
--- regex.orig/engine.c	Mon Sep 13 09:53:57 1999
+++ regex/engine.c	Fri Oct 15 00:30:46 1999
@@ -144,14 +144,14 @@
 int eflags;
 {
 	register char *endp;
-	register int i;
+	register int i, j;
+	int patlen = g->mlen;
 	struct match mv;
 	register struct match *m = &mv;
-	register char *dp;
+	register char *dp, *k, *s;
 	register const sopno gf = g->firststate+1;	/* +1 for OEND */
 	register const sopno gl = g->laststate;
-	char *start;
-	char *stop;
+	char *start, *stop;
 
 	/* simplify the situation where possible */
 	if (g->cflags&REG_NOSUB)
@@ -167,14 +167,37 @@
 		return(REG_INVARG);
 
 	/* prescreening; this does wonders for this rather slow code */
-	if (g->must != NULL) {
-		for (dp = start; dp < stop; dp++)
-			if (*dp == g->must[0] && stop - dp >= g->mlen &&
-				memcmp(dp, g->must, (size_t)g->mlen) == 0)
-				break;
-		if (dp == stop)		/* we didn't find g->must */
-			return(REG_NOMATCH);
+ 	if (g->must != NULL) {
+		for (k = string + patlen - 1; k < stop;) {
+			/*
+			 * for a large class of patterns, upwards of 80% of
+			 * match time is spent on the next line.  we beat
+			 * existing microcode (vax 'matchc') this way. 
+			 */
+			while ((k += g->delta0[*(unsigned char *) k]) < stop);
+			if (k < string + LARGE)
+				 break;
+			k -= LARGE;
+			
+ 			j = patlen - 1;
+	 		s = k - 1;
+		 	while (g->cmap[*((unsigned char *)s--)] == 
+			       g->must[--j]);
+		
+			/*
+			 * delta-less shortcut for literati. short shrift for
+			 * genetic engineers? 
+			 */
+
+			if (j < 0)
+				goto bmpassed;
+			
+			k++;    /* no match; restart next char */
+		}
+		return REG_NOMATCH;
 	}
+
+bmpassed:
 
 	/* match struct setup */
 	m->g = g;
diff -ur regex.orig/regcomp.c regex/regcomp.c
--- regex.orig/regcomp.c	Mon Sep 13 09:53:57 1999
+++ regex/regcomp.c	Fri Oct 15 00:32:47 1999
@@ -245,6 +245,7 @@
 	g->categories = &g->catspace[-(CHAR_MIN)];
 	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
 	g->backrefs = 0;
+	(void) memset((void *)g->delta0, 0, 256 * sizeof(int));
 
 	/* do it */
 	EMIT(OEND, 0);
@@ -1738,6 +1739,7 @@
 	}
 	assert(cp == g->must + g->mlen);
 	*cp++ = '\0';		/* just on general principles */
+	gosper(g->must, g->delta0, g->cmap);
 }
 
 /*
@@ -1774,4 +1776,46 @@
 	if (plusnest != 0)
 		g->iflags |= BAD;
 	return(maxnest);
+}
+
+
+#define NALT	7		/* tied to scanf() size in alternate() */
+
+/*
+ * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c] 
+ */
+void
+gosper(pattern, delta0, cmap)
+	char *pattern;		/* ... HAKMEM lives ... */
+	int  *delta0;           /* Who/what is HAKMEM? */
+	char *cmap;
+{
+	char *altpat[NALT];
+	int altlen[NALT];
+	register int j, altmin;
+	unsigned char c;
+
+	/* Make one-string case look like simple alternatives case */
+	altmin = altlen[0] = strlen(pattern);
+	altpat[0] = pattern;
+	
+	/* For chars that aren't in any string, skip by string length. */
+ 	for (j = 0; j < 256; j++) { 
+ 		delta0[j] = altmin; 
+		cmap[j] = j;	/* Sneak in initialization of cmap */
+	}
+
+	/* For chars in a string, skip distance from char to end of string. */
+	/* (If char appears more than once, skip minimum distance.) */
+
+	delta0[0] = 0;
+	for (j = 0; j < altlen[0] - 1; j++) {
+		c = altpat[0][j];
+		delta0[c] = MIN(delta0[c], altlen[0] - j - 1);
+	}
+
+	/* For last char of each string, fall out of search loop. */
+	c = altpat[0][altlen[0] - 1];
+	delta0[c] = LARGE;	 
+
 }
diff -ur regex.orig/regex2.h regex/regex2.h
--- regex.orig/regex2.h	Mon Sep 13 09:53:57 1999
+++ regex/regex2.h	Fri Oct 15 00:36:14 1999
@@ -56,6 +56,8 @@
  */
 #define	MAGIC1	((('r'^0200)<<8) | 'e')
 
+#define META	"\n^$.[]()?+*|\\"	/* egrep meta-characters */
+
 /*
  * The internal representation is a *strip*, a sequence of
  * operators ending with an endmarker.  (Some terminology etc. is a
@@ -164,6 +166,8 @@
 	size_t nsub;		/* copy of re_nsub */
 	int backrefs;		/* does it use back references? */
 	sopno nplus;		/* how deep does it nest +s? */
+	int delta0[256];        /* Boyer-Moore algorithm core */
+	char cmap[256];
 	/* catspace must be last */
 	cat_t catspace[1];	/* actually [NC] */
 };
@@ -171,3 +175,14 @@
 /* misc utilities */
 #define	OUT	(CHAR_MAX+1)	/* a non-character value */
 #define ISWORD(c)       (isalnum((uch)(c)) || (c) == '_')
+
+/* stuff used by Boyer-Moore */
+
+#define	MIN(A, B)	((A) > (B) ? (B) : (A))
+
+#define BUFSIZE	65536		/* make higher for cray */
+#define PATSIZE 6000
+#define LARGE 	BUFSIZE + PATSIZE
+
+void             gosper(char *, int *, char *);
+int              chimaera(char *, char *, struct re_guts *);



>Release-Note:
>Audit-Trail:
Responsible-Changed-From-To: freebsd-bugs->dcs  
Responsible-Changed-By: cpiazza 
Responsible-Changed-When: Sun Jun 4 17:00:22 PDT 2000 
Responsible-Changed-Why:  
dcs is working on this 

http://www.freebsd.org/cgi/query-pr.cgi?pr=14342 

From: "Daniel C. Sobral" <dcs@newsguy.com>
To: freebsd-gnats-submit@FreeBSD.org, howardjp@wam.umd.edu,
	umd.edu@byzantine.student.umd.edu
Cc:  
Subject: Re: bin/14342: [PATCH] Speed ups for regex!
Date: Mon, 05 Jun 2000 09:27:49 +0900

 I can see a number of bugs with the patch. Cmap isn't properly
 initialized, there is a reference to a "string" not mentioned elsewhere,
 there is a reference to altlen[N], altpat[N] but only altlen[0] and
 altpat[0] are actually used. The code compares must to cmap at regexec()
 time, when they are both defined at regcomp() and not changed after
 that! In other words... you trimmed too much. :-)
 
 On the more problematic side, this imposes a limit on the size of both
 text and pattern, something which the present code has not.
 
 Finally, this is not, strictly speaking, the Boyer-Moore algorithm. It
 only checks the last character of the must pattern, and, obviously
 doesn't even have a provision for match jumps. And it must be pointed
 out that the Boyer-Moore algorithm is much worse than the
 straight-forward algorithm for patterns <= 3 characters in length.
 
 -- 
 Daniel C. Sobral			(8-DCS)
 
 dcs@newsguy.com
 dcs@freebsd.org
 capo@yet.another.bsdconspiracy.org
 
 	Hmmm - I have to go check this. My reality assumptions are shattered.
 
State-Changed-From-To: open->closed 
State-Changed-By: dcs 
State-Changed-When: Wed Jun 28 21:42:24 PDT 2000 
State-Changed-Why:  
A Boyer-Moore algorithm was added. Thanks. 

http://www.freebsd.org/cgi/query-pr.cgi?pr=14342 
>Unformatted:
