/* vim: set sw=8 ts=8 si : */
/* Author: Guido Socher, Copyright: GPL */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <string.h>
#include "config.h"
/* size of the buffer that holds one pattern string (struct pattern.pat) */
#define MAXPATLEN 50
/* size of the buffer that holds the document title (struct docdat.title) */
#define MAXSTRLEN 500
/* max number of patterns to search for in parallel */
#define NUMOFPAT 3
/* size of the buffer that holds all search keys together
 * (the comma separated key list as given by the user) */
#define PATARGLEN 80

/* data structures for the objects */
/* Title of the document that is currently being scanned. */
struct docdat{
	int tpos; /* number of characters stored in title so far (next write index) */
	char title[MAXSTRLEN]; /* the title text, kept '\0' terminated by add_title_buf */
};
/* One search pattern together with the state of the character by
 * character matcher (anchor_search or kmp_search) that runs on it. */
struct pattern{
	int i; /* pattern pos: index in pat of the next character to compare */
	int isvalid; /* 1 or 0, is a valid pattern (0: the pattern string was empty) */
	int skip; /* 1 or 0, anchor_search: skip to the next non word char */
	int matchcount; /*how often we matched*/
	char pat[MAXPATLEN]; /* the search pattern */
	int nexti[MAXPATLEN+2]; /* kmp_search: where to start next on a mismatch (-1: restart with the next text char) */
};

/*begin global data*/
static struct pattern *key[NUMOFPAT];  /* pattern objects of the search keys (created in main) */
static struct pattern *titlepat;  /* pattern object used to search for the doc title */
static struct docdat doc;  /* descriptive data about the current document */
/*end global data*/

/* Make the character lower case in place. Used for case insensitive
 * search with a latin ISO_8859_1 character set.
 *
 * ch points to the character to convert. ASCII 'A'-'Z' and the
 * accented upper case letters 0xC0-0xDE are moved to their lower case
 * counterpart; every other character is left alone. 0xD7 (the
 * multiplication sign) sits in the middle of the upper case range but
 * is not a letter and must not be turned into 0xF7 (division sign).
 * Always returns 0. */
int tolowercase(char *ch){
	/* work on the unsigned value so that the result does not depend
	 * on whether plain char is signed on this platform */
	unsigned char uc=(unsigned char)*ch;

	if (uc >= 'A' && uc <= 'Z'){
		*ch=(char)(uc + ('a'-'A'));
	}else if (uc >= 0xC0 && uc <= 0xDE && uc != 0xD7){
		/* handle umlaute and other accented letters */
		*ch=(char)(uc + 0x20);
	}
	return(0);
}
/****** methods that can be applied to the pattern object ******/
/* Run ACTION if CHR is an anchor (non word) character, i.e. lies in
 * the range '\t'..'@'. Expands to a complete if statement with a
 * braced body, so it is used without a trailing semicolon and ACTION
 * may consist of several statements separated by ';'. */
#define IFANCHOR(CHR,ACTION) if(CHR >= '\t' && CHR <= '@'){ACTION;} 
/* Tell whether ch is an anchor character, that is a character which
 * can not be part of a word. Everything from '\t' up to and
 * including '@' counts as anchor.
 * Returns 1 for an anchor character and 0 otherwise. */
int is_anchor(char ch){
	const int below_range = (ch < '\t');
	const int above_range = (ch > '@');

	return (below_range || above_range) ? 0 : 1;
}
/* Search for anchored words. That is a match for whole words
 * with an anchor (non word character) at beginning and end.
 *
 * txt points to the current character of the text; the caller must
 * step through the text one character at a time and call this
 * function once for every character.
 * pobj is the pattern object; its position, skip flag and matchcount
 * are updated.
 *
 * Returns 1 on the anchor character that directly follows a complete
 * occurrence of the pattern (matchcount is incremented) and 0
 * otherwise. Invalid pattern objects never match.
 * anchor_search is case sensitive. */
int anchor_search(char *txt,struct pattern *pobj){
	char c, want;
	int at_anchor;

	if (pobj->isvalid==0) return(0);
	c=*txt;
	/* same test as is_anchor() / IFANCHOR */
	at_anchor=(c >= '\t' && c <= '@');
	if (pobj->skip){
		/* we are inside a word that can not match any more, a new
		 * attempt starts after the next anchor character */
		if (at_anchor) pobj->skip=0;
		return(0);
	}
	want=pobj->pat[pobj->i];
	/* The end of the pattern must be checked before comparing:
	 * a '\0' byte in the text would otherwise "match" the
	 * terminator and push the position beyond the pattern. */
	if (want != '\0' && c == want){
		/* we have a match on this char */
		pobj->i++;
		return(0);
	}
	/* characters differ or the pattern is complete, in both
	 * cases the next attempt starts at the first pattern char */
	pobj->i=0;
	if (!at_anchor){
		/* in the middle of some other word: skip the rest of it */
		pobj->skip=1;
		return(0);
	}
	pobj->skip=0;
	if (want == '\0'){
		/* whole pattern seen and the word ends here */
		pobj->matchcount++;
		return(1);
	}
	return(0);
}
/* Substring match with the Knuth Morris Pratt algorithm. The
 * algorithm never has to go back in the text, which makes it a good
 * fit when the text can only be read character by character in one
 * direction.
 *
 * txt points to the current text character; the caller advances it
 * by one for every call. pobj carries the pattern, the precomputed
 * nexti table and the current position.
 *
 * Returns 1 (and increments matchcount) on the call that follows the
 * last character of an occurrence, 0 in every other case. The text
 * character of the call that reports the match is not examined, so
 * a second occurrence that starts right there is missed.
 * kmp_search is case sensitive. */
int kmp_search(char *txt,struct pattern *pobj){
	int pos;

	if (!pobj->isvalid) return 0;
	pos = pobj->i;
	if (pobj->pat[pos] == '\0'){
		/* the previous calls consumed the complete pattern */
		pobj->i = 0;
		pobj->matchcount++;
		return 1;
	}
	/* fall back along the nexti chain until the text character
	 * fits or nothing is left to fall back to (pos == -1) */
	for (; pos >= 0; pos = pobj->nexti[pos]){
		if (*txt == pobj->pat[pos]) break;
	}
	pobj->i = pos + 1;
	return 0;
}
/* Return the pattern string stored in pobj, or an empty string when
 * the pattern object is not valid. */
char *get_pat(struct pattern *pobj){
	return pobj->isvalid ? pobj->pat : "";
}
/* Bring the matcher state of pobj back to its start values: position
 * at the first pattern character, no skipping, no matches counted.
 * Has to be called whenever a new file is opened. The pattern
 * string itself is left untouched. */
void resetpattern(struct pattern *pobj){
	pobj->i = 0;
	pobj->matchcount = 0;
	pobj->skip = 0;
}
/* Create and initialize a new pattern object.
 *
 * patstr is the search key. Every newline, carriage return, space
 * and tab in patstr is overwritten with '\0' (patstr is modified in
 * place), so the key ends at the first white space character. Keys
 * longer than MAXPATLEN-1 characters are cut to MAXPATLEN-1.
 *
 * Returns a heap allocated pattern object. isvalid is 1 and the
 * nexti table for kmp_search is filled in if the key is not empty;
 * otherwise isvalid is 0. Returns NULL if no memory is available. */
struct pattern *newpobj(char *patstr){
	struct pattern *pobj;
	char *ch_ptr;
	size_t len;
	int i, j, M;

	/* calloc: i, skip, matchcount and isvalid start at 0 and pat
	 * is a defined (empty) string even for an invalid pattern */
	pobj=calloc(1,sizeof *pobj);
	if (pobj == NULL) return(NULL);
	for (ch_ptr=patstr;*ch_ptr;ch_ptr++){
		if (*ch_ptr=='\n'||*ch_ptr=='\r'||*ch_ptr==' '||*ch_ptr=='\t') *ch_ptr='\0';
	}
	if (*patstr == '\0'){
		/* the pattern was an empty string */
		return(pobj);
	}
	pobj->isvalid=1;
	/* bounded copy that always leaves room for the terminator */
	len=strlen(patstr);
	if (len > MAXPATLEN-1) len=MAXPATLEN-1;
	memcpy(pobj->pat,patstr,len);
	pobj->pat[len]='\0';
	M=(int)len;
	/* initialize the nexti array for kmp_search: nexti[k] is the
	 * pattern position to continue with after a mismatch at k.
	 * Entries 0..M are written, M is at most MAXPATLEN-1. */
	pobj->nexti[0]=-1;
	i=0;
	j=-1;
	while(i < M){
		while(j >= 0 && (pobj->pat[i] != pobj->pat[j])){
			j=pobj->nexti[j];
		}
		i++;
		j++;
		pobj->nexti[i]=j;
	}
	return(pobj);
}
/*********************************************************/

/* Print the usage text to stdout, followed by the version
 * information if VERINFO is defined (presumably by config.h), and
 * terminate the program with exit status 0. Does not return. */
void help()
{
	static const char usage[] =
		"webfgrep -- search html pages for keywords\n"
		"\n"
		"USAGE: webfgrep [-ahist] [-p str] [key1,key2,...] html-files\n"
		"\n"
		"OPTIONS: -h This help\n"
		"         -i Search case insensitive (works only with ISO-8859-1 char)\n"
		"         -t Text output (default is html)\n"
		"         -p Path prefix to add when displaying\n"
		"\t -s Read the keys form stdin rather than from the command line.\n"
		"\t    This is more secure and should be used by a cgi-bin\n"
		"         -a Anchor search, search whole words no substring search\n"
		"EXAMPLE:\n"
		"         webfgrep -a -p http://www.linuxfocus.org/ -- guido,File *.html\n"
		"         webfgrep -- guido,file *.html\n"
		"\n"
		"The search pattern consists of 1 to 3 keywords to search for.\n"
		"The keys are seperated by comma. A web-page that matches contains\n"
		"all the keywords. \n"
		"\n";

	/* the text contains no conversion specifications, so it can be
	 * written as it is */
	fputs(usage, stdout);
#ifdef VERINFO
	puts(VERINFO);
#endif
	exit(0);
}
/****** Document title handling **************************/
/* Append the character c to the title buffer and keep the buffer
 * '\0' terminated. Once the buffer holds MAXSTRLEN-2 characters
 * further characters are silently dropped. */
void add_title_buf(char c){
	const int at = doc.tpos;

	if (at >= MAXSTRLEN-2) return;
	doc.title[at] = c;
	doc.title[at+1] = '\0';
	doc.tpos = at + 1;
}
/* Start with an empty title. Has to be called for every new
 * document. Only the fill position is reset, the old text stays in
 * the buffer until it is overwritten. */
void init_title_buf()
{
	doc.tpos = 0;
}
/* Get the current title as one line with max one space between words.
 *
 * Line breaks and tabs are turned into spaces, leading spaces are
 * removed and every run of spaces is reduced to a single space (one
 * trailing space may remain). The clean up is done in place in the
 * title buffer.
 *
 * Returns a pointer to the title buffer, or an empty string if no
 * title character was stored for the current document. */
char *get_title(){
	char *src;
	char *dst;

	if (doc.tpos < 1) return("");
	/* make sure the buffer is terminated in any case */
	doc.title[MAXSTRLEN-2]='\0';
	doc.title[MAXSTRLEN-1]='\0';
	/* remove line break and tab from title */
	for (src=doc.title;*src;src++){
		if (*src=='\n'|| *src=='\r'|| *src=='\t') *src=' ';
	}
	/*remove leading space*/
	src=doc.title;
	while(*src == ' ') src++;
	/* Copy the rest down to the start of the buffer. The loop
	 * stops at the terminator, also for a title that consisted of
	 * spaces only; it must never step beyond it because the bytes
	 * behind it are left over from earlier documents. */
	dst=doc.title;
	for (;*src;src++){
		/*remove repeated space: only the last one of a run is kept*/
		if (*src == ' ' && *(src+1) == ' ') continue;
		*dst=*src;
		dst++;
	}
	*dst='\0';
	return(doc.title);
}
/*********************************************************/
#ifndef MAP_FAILED
#define MAP_FAILED ((void *)-1)
#endif
/* Memory map a small file (at most 1000 KB). mmap is a lot faster
 * than read/fread functions.
 *
 * fd is an open, readable file descriptor. It must refer to a
 * regular file that is not empty.
 * On success *start_addr receives the start of the mapped region and
 * *len_in_bytes its length. The mapping is private and writable:
 * changes stay in memory and are never written to the file. The
 * caller releases it with munmap().
 *
 * Returns 0 on success and 1 on error (fstat failure, not a regular
 * file, empty or too big, mmap failure). The two output arguments
 * are only written on success. */
int mmap_file(int fd, char **start_addr, int *len_in_bytes){
	struct stat statbuf;
	void *addr;

	if (fstat(fd,&statbuf) < 0) return(1);
	/* must be a regular file */
	if (!S_ISREG(statbuf.st_mode)) return(1);
	/* a mapping of length 0 is not possible, so an empty file is
	 * refused here together with the ones that are too big */
	if (statbuf.st_size <= 0 || statbuf.st_size > 1000 * 1024) return(1);
	addr=mmap(NULL,(size_t)statbuf.st_size,PROT_READ|PROT_WRITE,MAP_PRIVATE,fd,0);
	if (addr == MAP_FAILED) return(1);
	*start_addr=(char *)addr;
	/* fits into an int, the size was limited above */
	*len_in_bytes=(int)statbuf.st_size;
	return(0);
}

/*********************************************************/
/* Write s to stdout and replace the characters that have a special
 * meaning in html (& < > ") by their entities. */
static void print_html_escaped(const char *s){
	for (;*s;s++){
		switch (*s){
		case '&': fputs("&amp;",stdout); break;
		case '<': fputs("&lt;",stdout); break;
		case '>': fputs("&gt;",stdout); break;
		case '"': fputs("&quot;",stdout); break;
		default: putchar(*s); break;
		}
	}
}

/* Hand one text character to the matcher of every search key.
 * Invalid keys are ignored by the search functions themselves. */
static void feed_keys(char *txtch, int anchored){
	int i;

	for (i=0;i<NUMOFPAT;i++){
		if (anchored){
			anchor_search(txtch,key[i]);
		}else{
			kmp_search(txtch,key[i]);
		}
	}
}

/* Scan the document in buf[0..len-1]: remember its title and count
 * the matches of all search keys. With nocase set the text is
 * converted to lower case in place while it is scanned. */
static void scan_document(char *buf, int len, int anchored, int nocase){
	int titlefound=0;/*0: no title yet, 1: <title> found, 2: end of title*/
	int pos;
	char saved;
	char eof_ch='\n';

	for (pos=0;pos<len;pos++){
		char *cur=buf+pos;

		/* title handling (we remember the doc title)*/
		if (titlefound==0){
			/* title must come within the first 1000 char */
			if (pos > 1000) titlefound=2;
			saved=*cur; /* save char before tolowercase */
			tolowercase(cur);
			if (kmp_search(cur,titlepat)) titlefound=1;
			/* restore original (for the 1-st title char)*/
			*cur=saved;
		}
		if (titlefound==1){
			/* we have found a title */
			/* \n,\r and \t char are removed in get_title */
			if (*cur=='<'){
				/* end of title */
				titlefound=2;
			}else{
				add_title_buf(*cur);
			}
		}
		/*real search*/
		if (nocase) tolowercase(cur);
		feed_keys(cur,anchored);
	}
	/* Both search functions report a match on the character that
	 * follows the key. Feed one line break after the last
	 * character, otherwise a key at the very end of the file
	 * would never be counted. A line break is an anchor character
	 * and can not be part of a key. */
	feed_keys(&eof_ch,anchored);
}

/* Print "key=count" for every valid search key. For html output the
 * key is escaped because it is text that was typed in by the user. */
static void print_key_counts(int anchored, int html){
	int i;

	fputs(anchored ? "exact words: " : "substring: ",stdout);
	for (i=0;i<NUMOFPAT;i++){
		if (!key[i]->isvalid) continue;
		if (html){
			print_html_escaped(get_pat(key[i]));
		}else{
			fputs(get_pat(key[i]),stdout);
		}
		printf("=%d  ",key[i]->matchcount);
	}
}

/* Print the result entry for one matching document, either as plain
 * text or as a html paragraph with a link to the document. */
static void print_hit(const char *prefix, const char *fname, int anchored, int text){
	if (text){
		printf("File: %s%s\n ",prefix,fname);
		printf("Title: %s\n Matchcount on ",get_title());
		print_key_counts(anchored,0);
		printf("\n");
	}else{
		printf("<P><A HREF=\"%s%s\">%s%s</A><BR>",prefix,fname,prefix,fname);
		printf("Title: %s<BR>Matchcount on ",get_title());
		print_key_counts(anchored,1);
		printf("</P>\n");
	}
}

/* webfgrep: search html files for up to NUMOFPAT keywords and list
 * the files that contain all of them.
 *
 * The keys are given as one comma separated argument or, with -s, as
 * one line on stdin; the remaining arguments are the files to
 * search. See help() for the options.
 *
 * Exit status: with -t (text output) 0 if at least one document
 * matched and 1 otherwise; with html output always 0. */
int main(int argc, char *argv[])
{
	int opt_a=0;
	int opt_t=0;
	int opt_i=0;
	int opt_s=0;
	int opt; /* getopt returns an int, -1 does not fit into a char everywhere */
	int i;
	int docsize;
	int matched;
	int returnval=1;
	int patcount=0;
	char pat_arg[PATARGLEN];
	char *chptr,*comma,*prefixpath;
	char *mmapadr;
	int fd;
	/* The following things are used for getopt: */
	extern char *optarg;
	extern int optind;
	extern int opterr;

	prefixpath="";
	opterr = 0;
	while ((opt = getopt(argc, argv, "ahip:st")) != -1) {
		switch (opt) {
		case 'a':
			opt_a=1;
			break;
		case 'h':
			help(); /* help does not return */
			break;
		case 'i':
			opt_i=1;
			break;
		case 'p':
			prefixpath=optarg;
			break;
		case 's':
			opt_s=1;
			break;
		case 't':
			opt_t=1;
			break;
		case '?':
			fprintf(stderr, "webfgrep ERROR: No such option. -h for help.\n");
			exit(1);
		/*no default action for case */
		}
	}
	if ((opt_s==0 && optind >= argc -1) || (opt_s && optind == argc)){
		/* have a minimum of 1 argument or 2 with opt_s not set*/
		help();
	}
	if (opt_s){
		/* get the pattern from stdin, no input means no pattern */
		if (fgets(pat_arg,PATARGLEN,stdin) == NULL) pat_arg[0]='\0';
	}else{
		/*get the 1-3 pattern strings out of the first argument*/
		strncpy(pat_arg,argv[optind],PATARGLEN);
		optind++;
	}
	pat_arg[PATARGLEN-1]='\0';
	/* lowercase the pattern, this is to do case insensitive search */
	if (opt_i){
		for (chptr=pat_arg;*chptr;chptr++){
			tolowercase(chptr);
		}
	}
	/* split up the comma separated search keys and make pat. objects*/
	chptr=pat_arg;
	for (i=0;i<NUMOFPAT;i++){
		comma=strchr(chptr,',');
		if (comma) *comma='\0';
		key[i]=newpobj(chptr);
		if (key[i] == NULL){
			fprintf(stderr, "webfgrep ERROR: out of memory\n");
			exit(1);
		}
		if (comma){
			chptr=comma+1;
		}else{
			/*that was the last or the only search key*/
			*chptr='\0'; /*causes further newpobj() to be invalid*/
		}
	}
	/* find out how many patterns we have now. An empty key
	 * ("a,,b") leaves an invalid object between valid ones,
	 * therefore all NUMOFPAT slots are looked at everywhere and
	 * not only the first patcount. */
	for (i=0;i<NUMOFPAT;i++){
		if(key[i]->isvalid) patcount++;
	}
	if (patcount == 0){
		printf("<P>No valid search pattern.</P>\n");
		exit(0);
	}
	/* to search for the title string */
	titlepat=newpobj("<title>");
	if (titlepat == NULL){
		fprintf(stderr, "webfgrep ERROR: out of memory\n");
		exit(1);
	}
	/* now search through all files */
	for (;optind<argc;optind++){
		fd=open(argv[optind],O_RDONLY);
		if (fd == -1){
			/* only err. message if this is not a web output */
			if (opt_t) fprintf(stderr, "webfgrep ERROR: can not read %s\n",argv[optind]);
			continue;
		}
		/* init */
		init_title_buf();
		for (i=0;i<NUMOFPAT;i++){
			resetpattern(key[i]);
		}
		resetpattern(titlepat);
		/* read the file to memory */
		if (mmap_file(fd,&mmapadr,&docsize)){
			/* only err. message if this is not a web output */
			if (opt_t) fprintf(stderr,"webfgrep ERROR: file %s too big or not mem mappable.\n",argv[optind]);
			close(fd);
			continue;
		}
		scan_document(mmapadr,docsize,opt_a,opt_i);
		close(fd);
		/* unmap to release the memory */
		munmap(mmapadr,docsize);
		/* all pattern must match for the document to qualify */
		matched=0;
		for (i=0;i<NUMOFPAT;i++){
			if (key[i]->isvalid && key[i]->matchcount) matched++;
		}
		if (matched != patcount) continue;
		returnval=0;
		print_hit(prefixpath,argv[optind],opt_a,opt_t);
	}
	if (opt_t){
		return(returnval);
	}
	if (returnval!=0){
		printf("<P>No matching documents found.</P>\n");
	}
	return(0);
}
