/* vim: set sw=8 ts=8 si : 
 * Author: Guido Socher, Copyright: GPL */
/*
 * this is a generic html tag reading library
 * it reads tags of the format:
 * <word arg= ....> or
 * <a arg= ...>words...</a>
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>
#include <ctype.h>
#include <strings.h>
#include <string.h>
#include "htag.h"
#include "config.h"
/* tags longer than MAXTAGLEN are ignored*/
/*
 * Collects the characters of the tag that is currently being read.
 * Characters are appended by addbuf() and the buffer is reset by inibuf().
 */
struct buffer{
	int spos;			/* index of the next free slot in string */
	int line;			/* line number stored when a tag starts */
	char string[MAXTAGLEN+1];	/* tag text plus room for a terminating 0 */
};
static struct buffer buf;
/* most recently read character; an int so it can also hold fgetc()'s EOF */
static int ch;
/*end global data*/
/*
 * Match a string against a pattern. This is a very basic
 * (but fast) regexp matcher. The only known regexp special characters
 * are: ^=anchor at the start of txt (only as first pattern character)
 *      *=zero or more multiplier for the preceding element (can be escaped
 *        like this: \* , in c-code you must write \\* )
 *     \w=a-z or 0-9 or underscore (Note: The c-compiler expands escape
 *        sequences. You must therefore write \\w)
 * Pattern must be lower case!! The search is not case
 * sensitive. Example: pattern="src *= *" would match
 * txt="Src=ydfdy" or txt="Src        = ydfds"
 *
 * The multiplier is greedy and there is no backtracking into it, so
 * a pattern such as "a*ab" never matches.
 *
 * Return value is 0 for no match or 1 for match.
 * matchend is an out value. On a match it points to the character right
 * after the match (the character y in the above example, or the
 * terminating 0 when the match reaches the end of txt). Without a match
 * it is set to the start of txt.
 * Usage example: char *pathstartptr;
 *                char somestring[]="<a href=index.html>....";
 *                matchpat(somestring," href *= *",&pathstartptr);
 */
int matchpat(char *txt,char *pattern,char **matchend){
	char *pstart,*tstart;
	int mcount,alpha,anchor;
	char t;

	tstart=txt;
	anchor=0;
	if (*pattern=='^'){
		pattern++;
		anchor=1;
	}
	pstart=pattern;
	mcount=0; /* characters of txt consumed by the current attempt */
	alpha=0;  /* non zero while the current pattern element is \w */
	while(*txt){
		/* the <ctype.h> functions are only defined for EOF and
		 * unsigned char values, a plain char may be negative */
		t=(char)tolower((unsigned char)*txt);
		if (*pattern=='\\'){
			/* this could be an escape of the form "\*"
			 * or "\w" */
			if (*(pattern+1)=='*'){
				/* next compare must compare a literal star */
				pattern++;
				continue;
			}
			if (*(pattern+1)=='w'){
				pattern++;
				/* indicate that we search a-z0-9_ :*/
				alpha=1;
			}
		}
		if (*pattern && *(pattern+1)=='*'){
			/* element with multiplier: eat as much as possible */
			if (alpha){
				if (isalnum((unsigned char)t)||t=='_'){
					txt++;
					mcount++;
					continue;
				}
			}else if (t==*pattern){
				txt++;
				mcount++;
				continue;
			}
			/* no more match on <char>* in txt,
			 * go to next pattern pos*/
			pattern+=2;
			alpha=0;
			if (*pattern==(char)0){
				*matchend=txt;
				return(1);
			}
			continue;
		}
		/* no multiplier (the *) */
		if (alpha){
			if (isalnum((unsigned char)t)||t=='_'){
				mcount++;
				pattern++;
				alpha=0;
				goto MATCH;
			}
		}else if (t==*pattern){
			mcount++;
			pattern++;
			goto MATCH;
		}
		/* no match */
		/* if this was anchor match then fail */
		if (anchor){
			*matchend=tstart;
			return(0);
		}
		/* go back to where this attempt started; the txt++ below
		 * then restarts the search one character further on */
		txt-=mcount;
		mcount=0;
		alpha=0;
		pattern=pstart;
	MATCH:
		if (*pattern==(char)0){
			*matchend=txt+1;
			return(1);
		}
		txt++;
	}
	/* txt is used up. The rest of the pattern can still match the empty
	 * string if it only consists of elements with a multiplier
	 * ("x*", "\w*" or "\**"), so step over those. */
	while(*pattern){
		if (*pattern=='\\' && (*(pattern+1)=='*' || *(pattern+1)=='w')){
			pattern++; /* step over the escape character */
		}
		if (*(pattern+1)!='*'){
			break; /* this element needs a character */
		}
		pattern+=2;
	}
	if (*pattern==(char)0){
		*matchend=txt;
		return(1);
	}
	*matchend=tstart;
	return(0);
}
/*
 * Terminate the tag buffer and return a copy of it in which every
 * newline, carriage return and tab is replaced by a space.
 * The copy is kept in a static array: the returned pointer stays valid
 * but its content is overwritten by the next call.
 */
char *rmspacebuf(){
	static char flat[MAXTAGLEN+1];
	char *p;

	buf.string[buf.spos]=(char)0;
	strcpy(flat,buf.string);
	for(p=flat;*p;p++){
		switch(*p){
			case '\n':
			case '\r':
			case '\t':
				*p=' ';
				break;
			default:
				break;
		}
	}
	return(flat);
}
/* Reset the tag buffer to the empty string. */
void inibuf(){
	buf.string[0]=(char)0;
	buf.spos=0;
}
/* Write the collected tag buffer to stdout and empty it afterwards. */
void printbuf(){
	buf.string[buf.spos]=(char)0;
	fputs(buf.string,stdout);
	/* empty the buffer again, this is what inibuf() does */
	buf.spos=0;
	buf.string[buf.spos]=(char)0;
}
void addbuf(){ /* at ch to findtag string buffer */
	buf.string[buf.spos]=(char)ch;
	buf.spos++;
}
/* set the string buffer to a given string */
void setbufferval(char *s){
	char oldch;
	oldch=ch;
	inibuf();
	while(*s){
		ch=*s;
		addbuf();
		s++;
	}
	ch=oldch;
}
/*
 * Search a file for html tags of the form
 * <tag ...=...> or
 * <a ...=... >string</a> (if doasearch=1)
 * If such a tag is found then the function
 * int evaltagfun("tagstring",linenumber,is_anchor) is called, where
 * linenumber is the line on which the tag starts. The return value of
 * evaltagfun is not used. Newline, carriage return and tab characters
 * are handed over as spaces. Tags without a "=" and html comments
 * are skipped.
 * If doasearch==1 then anchors are read until </a> and an error is
 * printed to stdout if the buffer fills up before </a> is found or if
 * a new "<a " starts inside an anchor.
 *
 * Return value: 1 if the file can not be opened or if the file ends
 * inside a comment, 0 otherwise.
 */
int findtag(int (*evaltagfun)(char *,int,int),char *filename, int doasearch){
	FILE *fd;
	int l; /*current line number*/
	int asearch=0; /* stop searching at </a>*/
	int state; /*parse state*/

	fd=fopen(filename,"r");
	if (fd == NULL){
		fprintf(stderr, "ERROR: can not read file %s\n",filename);
		return(1);
	}
	/* init */
	l=1; /* line number count */
	inibuf();
	state=0;
	while((ch=fgetc(fd))!=EOF){
		if (buf.spos > MAXTAGLEN -5 ){
			/* tag is too long for the buffer, give up on it */
			if (asearch){
				printf("%s:%d: ERROR, anchor tag without </a>.\n",filename,buf.line);
			}
			inibuf();
			state=0;
			asearch=0;
		}
		if (ch=='\n') l++;
		if (ch=='\n'|| ch=='\r' || ch=='\t') ch=' ';
		switch (state) {
			case 0:
				/* outside of tag and we start tag here*/
				if (ch=='<') {
					inibuf(); /* zero buffer*/
					asearch=0;
					buf.line=l; /*line where tag starts*/
					addbuf();
					state=1;
				}
				break;
			case 1:
				if (ch=='!'){
					/*check for comments*/
					state=30;
				}else if (ch == 'a' || ch == 'A'){
					/*possible start of a anchor tag*/
					state=2;
				}else if (isalpha(ch)){
					/*start of a tag*/
					state=3;
				}else{
					state=0;
				}
				addbuf();
				break;
			case 2: /* inside <a... tag check if this is anchor */
				if (isspace(ch) && doasearch) asearch=1;
				state++;
				if (ch=='>') state=0;
				addbuf();
				break;
			case 3: /* inside a tag searching for "=" */
				if (ch=='='){
					/* tag must have an "=" somewhere */
					state++;
				}else if (ch=='>'){
					state=0;
				}
				addbuf();
				break;
			case 4: /* inside a tag searching for ">" */
				addbuf();
				if (ch=='>'){
					if (asearch==0){
						buf.string[buf.spos]=(char)0;
						(*evaltagfun)(buf.string,buf.line,0);
						state=0;
					}else{
						/* search end of anchor tag */
						state++;
					}
				}
				break;
			/*--------------*/
			case 5: /* search for "</a >" */
				if (ch=='<'){
					state++;
				}
				addbuf();
				break;
			case 6: /* search for "</a >" or "<a " consider
				 * the error case where a new anchor starts
				 * without termination of this one */
				if (ch=='/'){
					state++;
				}else if (ch=='a' || ch == 'A'){
					state=10;
				}else{
					state=5;
				}
				addbuf();
				break;
			case 7: /* search for "</a >" */
				if (ch=='a' || ch == 'A'){
					state++;
				}else{
					/* may be some </font> tag, go to 5 */
					state=5;
				}
				addbuf();
				break;
			case 8: /* search for "</a >" we found "</a" */
				addbuf();
				if (isspace(ch)){
					break; /* stay, ignore space */
				}else if (ch == '>'){
					/* we are done */
					buf.string[buf.spos]=(char)0;
					(*evaltagfun)(buf.string,buf.line,1);
					state=0;
				}else{
					/* perhaps some </any> tag */
					state=5;
				}
				break;
			case 10: /* error handling of anchor terminated by
				  * start of new anchor, we found "<a" */
				if (isspace(ch)){
					printf("%s:%d: ERROR, start of new anchor tag without termination of previous anchor from line %d.\n",filename,l,buf.line);
					/* we are done, cut off the "<a" that
					 * belongs to the new anchor */
					buf.string[buf.spos-2]=(char)0;
					(*evaltagfun)(buf.string,buf.line,1);
					/* now continue with next tag */
					setbufferval("<a ");
					state=3;
					asearch=1;
				}else{
					/* perhaps some <any> tag like <abbr>.
					 * The character belongs to the text
					 * of the anchor and must be kept. */
					state=5;
					addbuf();
				}
				break;
			/*--------------*/
			case 30: /*comment handling,
				  *we have found "<!", wait for "<!-" */
				if(ch=='-'){
					state++;
				}else{
					state=0;
				}
				break;
			case 31: /*comment handling,
				  *we have found "<!-" */
				if(ch=='-'){
					state++;
				}else{
					state=0;
				}
				break;
			case 32: /*comment handling,
				  *we have found "<!--", wait for
				  *comment termination with "->" */
				if(ch=='-'){
					state++;
				}else{
					/* other character */
					state=34;
				}
				break;
			case 33: /*comment end handling,
				  *we have found "--" (or "<!---" ),
				  *wait for ">" */
				if(ch=='>'){
					state=0;
				}else if (ch!='-'){
					/* other character than "-" */
					state=34;
				}
				break;
			case 34: /* a non "-" character was found */
				if(ch=='-'){
					state=32;
				}
				break;
			/*--------------*/

			default:
				fprintf(stderr,"%s:%d: Programm Error, state = %d\n",filename,l,state);
				exit(1);
		}
	}
	fclose(fd);
	if (state > 30){
		printf("%s:%d: ERROR, comment does not terminate before end of file\n",filename,buf.line);
		return(1);
	}
	return(0);
}
/*
 * Preprocess html files to straighten out html tags which span over
 * several lines. The file is copied to stdout.
 * Search a file for html tags of the form
 * <tag ...=...>
 * If such a tag is found then the function
 * int evaltagfun("tagstring",linenumber) is called with a copy of the
 * tag in which newline, carriage return and tab are replaced by spaces.
 * If that function returns non zero then this straightened copy is
 * printed to stdout and the removed newlines are added again after the
 * next newline in the text. Otherwise the tag is printed to stdout
 * un-changed.
 * A tag that is still incomplete at the end of the file is printed
 * un-changed.
 * Note: this function does not join anchor tag lines.
 *
 * Return value: 1 if the file can not be opened, 0 otherwise.
 */
int prepro(int (*evaltagfun)(char *,int),char *filename){
	FILE *fd;
	char *straight;
	int l; /*current line number*/
	int nlcnt; /*number of newline in current tag*/
	int outstandingln=0; /* number of nl to add when the next nl comes*/
	int state; /*parse state*/
	int dos=0; /*guess if this is a dos file with \r\n,
		    *set when the previous character was \r */

	fd=fopen(filename,"r");
	if (fd == NULL){
		fprintf(stderr, "ERROR: can not read file %s\n",filename);
		return(1);
	}
	/* init */
	l=1; /* line number count */
	nlcnt=0;
	inibuf();
	state=0;
	while((ch=fgetc(fd))!=EOF){
		if (buf.spos > MAXTAGLEN -5 ){
			/* too long for the buffer, print what we have */
			printbuf();
			state=0;
		}
		if (ch=='\n') {
			l++;
			nlcnt++;
		}
		switch (state) {
			case 0:
				/* outside of tag and we start tag here*/
				if (ch=='<') {
					inibuf(); /* zero buffer*/
					nlcnt=0; /* initialize nl in tag count */
					buf.line=l; /*line where tag starts*/
					addbuf();
					state=1;
				}
				break;
			case 1:
				if (isalpha(ch)){
					/*start of a tag*/
					state++;
				}else{
					/*ignore comments, space and other nonsense*/
					printbuf();
					state=0;
				}
				addbuf();
				break;
			case 2: /* inside a tag searching for "=" */
				if (ch=='='){
					/* tag must have an "=" somewhere */
					state++;
				}else if (ch=='>'){
					printbuf();
					state=0;
				}
				addbuf();
				break;
			case 3: /* inside a tag searching for ">" */
				addbuf();
				if (ch=='>'){
					straight=rmspacebuf();
					if ((*evaltagfun)(straight,buf.line)){
						/* print without included nl */
						fputs(straight,stdout);
						/*compensate the nl after the tag
						 *by adding them when the next nl
						 *comes in the text*/
						outstandingln+=nlcnt;
					}else{
						/* print un-changed */
						printbuf();
					}
					state=0;
					/* do not print again the > character */
					goto NOPRINT;
				}
				break;
			/*--------------*/

			default:
				fprintf(stderr,"%s:%d: Programm Error, state = %d\n",filename,l,state);
				exit(1);
		}
		if (state==0){
			/* not collecting a tag: copy the character */
			putc(ch,stdout);
			if (ch=='\n'){
				/* now compensate any nl that we have removed
				 * inside a tag */
				while(outstandingln){
					/* guess if this was a dos file */
					if (dos) putchar('\r');
					putchar('\n');
					outstandingln--;
				}
			}
		}
		NOPRINT:
		/* remember for the next character if this one was a \r */
		dos=0;
		if (ch=='\r'){
			dos=1;
		}
	}
	if (state!=0){
		/* the file ended inside of a tag: the characters collected
		 * so far were not printed yet, do not lose them */
		printbuf();
	}
	fclose(fd);
	return(0);
}
