/* vim: set sw=8 ts=8 si : */
/* Author: Guido Socher, Copyright: GPL */
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>
#include <ctype.h>
#include <strings.h>
#include <string.h>
#include "hash.h"
#include "htag.h"
#include "config.h"
/* File-scope state shared by the functions below.
 * MAXTAGLEN is not defined in this file (presumably it comes from one of
 * the project headers); the original note here said that tags longer than
 * this are ignored. */
/* value of the href/src/background attribute of the tag currently being
 * evaluated; written by copy_file_path(), read by print_result() */
static char pathstr[MAXTAGLEN+1];
/* file system path that is currently being assembled/flattened */
static char linkpath[MAXTAGLEN+1];
/* default for the -i option */
static char defindexfiles[]="index.html,index.htm,index.shtml,index.phtml";
/* the index file list as a string: either the -i argument or defindexfiles */
static char *indexfiles;
/* out-value of ckdeffile(): the index file that was found in a directory */
static char *fileindir;
/* indexfiles split into a NULL terminated array by string_to_list() */
static char **indexfileslist;
static int typeoflink=0; /* 0 initial val, 1 href, 2 src, 3 background, 4 name */
/* attribute names, indexed by typeoflink */
static char *typeoflinkstr[]={"","href","src","background","name"};
/* link class names, indexed by the return value of is_abs() */
static char *abstype[]={"rel-lnk","anchor-in-file","file-from-docroot","abs-ref","internal"};
static char *filename; /* the file name currently read */
/* command line flags, each set to 1 in main() when the option of the same
 * letter is given (see help() for their meaning) */
static int opt_a=0;
static int opt_d=0;
static int opt_D=0;
static int opt_i=0;
static int opt_A=0;
static int opt_W=0;
static int opt_L=0;
/*end global data*/

/* Print the usage text (and the version string when VERINFO is defined)
 * on stdout and terminate the program with exit status 0.
 * This function does not return. */
void help()
{
	/* everything up to the line that shows the default index file list */
	fputs(
		"lshtmlref -- list all relative links in html files\n"
		"\n"
		"USAGE: lshtmlref [-hadADLW] [-i list] html-files\n"
		"\n"
		"OPTIONS:-a print all rel. links independent of whether the files exist or not.\n"
		"        -d print all links in debug format with line number and filename.\n"
		"        -h this help.\n"
		"        -i list of index file names. A comma seperated list of files to use when\n"
		"           an URL points to a directory. The default is:\n",
		stdout);
	/* the only variable part of the help text */
	printf("           %s\n",defindexfiles);
	fputs(
		"        -A just list all references in the webpage.\n"
		"        -D like debug but classify the links additionally as:\n"
		"        rel-lnk,anchor-in-file,file-from-docroot,abs-ref,internal\n"
		"        -L do not list the html-files that were provided on the command line.\n"
		"        -W do not warn about broken links on stderr.\n"
		"\n"
		"This program can be used to generate tar archives from a number of html files\n"
		"and include into these archives all web pages, images, text files etc. that\n"
		"are linked to the given html pages. lshtmlref lists duplicate links only once\n"
		"and cancels \"..\" out against a previous path component.\n"
		"If a link points to a directory then index.html is selected as filename.\n"
		"Note: This program is not recursive. It does only list the links in the\n"
		"      named html-files.\n"
		"EXAMPLES:\n"
		" tar toghether webpages:\n"
		"  tar cvf web.tar `lshtmlref *.html */*.html`\n"
		" list all relative links in a web-page:\n"
		"  lshtmlref -Wa file.html\n"
		"\n",
		stdout);

#ifdef VERINFO
	puts(VERINFO);
#endif
	exit(0);
}
/*
 * Normalise a path in place:
 *  - a run of '/' is collapsed into a single '/'
 *  - "./" components are dropped
 *  - "../" is cancelled out against an earlier ordinary component
 * A ".." for which no earlier component is left is kept, and a leading
 * '/' is always preserved:
 *   /zz/../xx           becomes /xx
 *   zz/../../xx         becomes ../xx
 *   /zz/../../xx        becomes /../xx
 *   /yy/zz/vv/../../xx  becomes /yy/xx
 * Only components that are followed by a '/' are looked at, the last
 * component of the path is therefore always copied as it is.
 * At most MAX_PTH_COMP components are analysed. Components beyond that
 * limit are kept unchanged (the result is then still an equivalent path,
 * it is just not fully flattened).
 * The result is never longer than the input. Always returns 0.
 */
/* number of components that the path can consist of */
#define MAX_PTH_COMP 50
int flattenpath(char *path){
	/* plan for every analysed component:
	 *  0 ordinary component, keep      1 "..", drop
	 *  2 ".", drop                     4 cancelled by a later "..", drop
	 * -1 ".." that could not be cancelled, keep */
	int patharr[MAX_PTH_COMP];
	char *c_ptr,*lc_ptr,*dest;
	int i,ii,strpos,lastcompstart,alen,found;

	/* step 1: remove // out of the path */
	dest=path;
	c_ptr=path;
	while(*c_ptr){
		*dest=*c_ptr;
		c_ptr++;
		/* dest is not advanced for a '/' that is followed by another
		 * '/'; the next '/' simply overwrites it */
		if (*(c_ptr-1)=='/' && *c_ptr=='/') continue;
		dest++;
	}
	*dest='\0';

	/* step 2: classify every component that ends in '/'. We stop when
	 * patharr is full; without this limit a path with more than
	 * MAX_PTH_COMP components would write behind the array. */
	c_ptr=path;
	i=0;
	strpos=0;
	lastcompstart=0;
	if (*c_ptr == '/') {
		c_ptr++; /* ignore the first char. xx/yy/zz has 3 components
			  * just as /xx/yy/zz */
	}
	while(*c_ptr && i < MAX_PTH_COMP){
		if (*c_ptr == '/'){
			if (strpos - lastcompstart == 2 && *(c_ptr - 2) == '.' && *(c_ptr - 1) == '.'){
				/* we have /../ */
				patharr[i]=1;
			}else if (strpos - lastcompstart == 1 && *(c_ptr - 1) == '.'){
				/* we have /./ */
				patharr[i]=2;
			}else{
				patharr[i]=0;
			}
			lastcompstart=strpos+1;
			i++;
		}
		c_ptr++;
		strpos++;
	}
	alen=i;

	/* step 3: cancel every ".." out against the nearest earlier
	 * component that is still marked 0 */
	for(i=0;i<alen;i++){
		if (patharr[i]!=1) continue;
		found=0;
		for(ii=i-1;ii>=0;ii--){
			if (patharr[ii]==0){
				patharr[ii]=4;
				found=1;
				break;
			}
		}
		/* nothing left to cancel: this ".." must stay in the path */
		if (found==0) patharr[i]=-1;
	}

	/* step 4: copy the path over itself and leave out all components
	 * whose plan is > 0 */
	c_ptr=path;
	if (*c_ptr=='/') c_ptr++;
	lc_ptr=c_ptr;  /* lc_ptr is after the string part that is safe
			* and needs no overwriting */
	dest=c_ptr;
	i=0;
	while(*c_ptr){
		*dest=*c_ptr;
		if (*c_ptr == '/'){
			/* components that were not analysed (i >= alen) are
			 * always kept */
			if (i < alen && patharr[i]>0){
				dest=lc_ptr; /*ignore the last component */
				c_ptr++; /*copy stuff after this char */
				*dest=*c_ptr;
				if(!*c_ptr) break;
			}else{
				/*what we copied so far will never be
				 *overwritten. The first overwritable char
				 *is the next char */
				lc_ptr=dest+1;
			}
			i++;
		}
		dest++;c_ptr++;
	}
	*dest='\0';
	return(0);
}
			
/* Test whether a string is blank.
 * Returns 1 if s is "" or consists only of spaces and tabs,
 * otherwise 0. */
int is_empty(char *s){
	char *p;

	for (p=s; *p != '\0'; p++){
		/* the first char that is neither space nor tab decides */
		if (*p != ' ' && *p != '\t') return(0);
	}
	return(1);
}

/* Return the name of the directory from a full path file name.
 * The path returned does not end in "/":
 *   "a/b/c.html" gives "a/b"
 *   "c.html"     gives "."
 *   "/c.html"    gives "" (the caller appends the "/" itself)
 * filename is cut after MAXTAGLEN characters.
 * The result lives in a static buffer that is overwritten by the next
 * call. */
char *dirname(char *filename){
	static char string[MAXTAGLEN+1];
	char *chptr;

	/* snprintf always terminates the string inside the buffer, we do
	 * not depend on the buffer still being zero from an earlier call */
	snprintf(string,sizeof(string),"%s",filename);
	chptr=strrchr(string,'/');
	if (chptr){
		/* cut at the last slash */
		*chptr='\0';
	}else{
		/* no directory part at all: the current directory */
		string[0]='.';
		string[1]='\0';
	}
	return(string);
}
/*
 * Classify a link. The return value is also the index into abstype[]:
 *  0 relative link (starts with '.' or matches nothing below)
 *  1 anchor in the same file (starts with '#')
 *  2 absolute file system path (starts with '/')
 *  3 URL with scheme and "//", e.g. http:// https:// ftp://
 *  4 other scheme without "//", e.g. mailto: or javascript:
 * The two scheme checks are done with matchpat(), which is not defined
 * in this file; the patterns are presumably regex-like (TODO confirm).
 */
int is_abs(char *string){
	char *matchend; /* out-value of matchpat, not used here */

	/* the first character alone decides the simple cases */
	switch (*string){
	case '/':
		return(2);
	case '.':
		return(0);
	case '#':
		return(1);
	default:
		break;
	}
	/* the more specific pattern (with //) must be tried first */
	if (matchpat(string,"^\\w\\w*://",&matchend)) return(3);
	if (matchpat(string,"^\\w\\w*:",&matchend)) return(4);
	return(0);
}
/*
 * Copy the attribute value into the global pathstr variable.
 * pathstartptr must point to the start of the possibly quoted value.
 * A value that starts with " or ' ends at the next occurrence of the same
 * quote character, an unquoted value ends at the first space or '>'.
 * The end of the input string ends the value in both cases.
 * Examples:
 *  pathstartptr pointing to "index.html"> sets pathstr to index.html
 *  (quotes removed),
 *  pathstartptr pointing to xx.html>xxxx sets pathstr to xx.html
 * A value that does not fit into pathstr is cut at the size of pathstr
 * instead of being written behind the end of the buffer.
 */
void copy_file_path(char *pathstartptr){
	char *dest=pathstr;
	/* one byte of pathstr is reserved for the terminating '\0' */
	char *limit=pathstr+sizeof(pathstr)-1;
	char quote='\0'; /* '\0' means: value is not quoted */

	if (*pathstartptr == '"' || *pathstartptr == '\''){
		quote=*pathstartptr;
		pathstartptr++;
	}
	while(*pathstartptr && dest < limit){
		if (quote == '\0'){
			/* wait for space or '>' */
			if (*pathstartptr == ' ' || *pathstartptr == '>') break;
		}else{
			/* wait for the closing quote */
			if (*pathstartptr == quote) break;
		}
		*dest=*pathstartptr;
		dest++;
		pathstartptr++;
	}
	*dest='\0';
}
/* Convert a string list (space, tab or comma separated) into a NULL
 * terminated array of strings and return a pointer to it.
 * string="aaa bb,cc,  dd" becomes result[]={"aaa","bb","cc","dd",NULL}
 * Only the first 50 elements from string are taken and the rest is
 * ignored.
 * string itself is not modified. The array and a private copy of string
 * (into which the array elements point) are allocated with malloc; this
 * function never frees them.
 * If no memory is available an error is printed and the program exits
 * with status 1.
 */
static int is_list_sep(char c){
	/* the characters that separate two list elements */
	return(c == '\t' || c == ' ' || c == ',');
}

char **string_to_list(char *string){
	enum { MAXLISTLEN = 50 };
	char *dat;
	char **array;
	int i=0;

	dat=malloc(strlen(string)+1);
	array=malloc(sizeof(*array)*(MAXLISTLEN+1)); /* +1 for the NULL */
	if (dat == NULL || array == NULL){
		fprintf(stderr,"ERROR: out of memory\n");
		exit(1);
	}
	strcpy(dat,string);
	while(i < MAXLISTLEN){
		/* walk through space and comma */
		while(is_list_sep(*dat)) dat++;
		if (*dat == '\0') break;
		array[i]=dat;
		i++;
		/* walk through the element */
		while(*dat && !is_list_sep(*dat)) dat++;
		/* terminate the element right here. This must also happen
		 * for the last element that we accept, otherwise it would
		 * contain the whole rest of the list. */
		if (*dat){
			*dat='\0';
			dat++;
		}
	}
	array[i]=NULL;
	return(array);
}
/* Check if one of the file names in strlist can be found in the
 * directory dir. dir may or may not end in a '/'. An empty dir means
 * that the names are looked up relative to the current directory.
 * strlist is a NULL terminated array, the names are tried in order.
 * Return 1 if none of the files was found in the directory,
 * otherwise return 0 and *fileindir points to the full path (dir plus
 * file name) of the first file found.
 * fileindir is an out-value. It points into a static buffer that is
 * overwritten by the next call.
 * dir is cut after MAXTAGLEN-1 characters. A file name that does not fit
 * into the buffer behind dir is skipped.
 */
int ckdeffile(char *dir,char **strlist,char **fileindir){
	static char mydir[MAXTAGLEN +1];
	struct stat statbuf;
	char *ch_ptr;
	int i=0;
	int app;

	while(*dir && i < MAXTAGLEN-1){ /* -1 for the append of slash */
		mydir[i]=*dir;
		dir++;
		i++;
	}
	/* i > 0: never look at the byte in front of the buffer */
	if (i > 0 && mydir[i-1] != '/'){
		mydir[i]='/';
		i++;
	}
	mydir[i]=0;
	for(; *strlist; strlist++){
		app=0;
		ch_ptr=*strlist;
		/* append the string to the directory */
		while(*ch_ptr && (i+app) < MAXTAGLEN){
			mydir[i+app]=*ch_ptr;
			ch_ptr++;
			app++;
		}
		mydir[i+app]=0;
		/* the name was cut off. Do not stat it, a truncated name
		 * could match some unrelated file. */
		if (*ch_ptr) continue;
		/* check if file exists in dir */
		if (stat(mydir,&statbuf)==0){
			/* ok it is there */
			*fileindir=mydir;
			return(0);
		}
	}
	return(1);
}
/*
 * Evaluate the link that copy_file_path() left in pathstr and print the
 * result. l is the line number used in the debug and warning output,
 * wholetag is not used. Always returns 0.
 *  - with -A every link is printed as found and nothing else is done
 *  - empty links cause a warning (unless -W) and are ignored
 *  - everything that is_abs() does not classify as relative is ignored
 *  - a relative link is combined with the directory of the current file,
 *    flattened and printed once. Broken links cause a warning (unless
 *    -W) and are only printed with -a. A link to a directory is replaced
 *    by the index file found in that directory.
 * is_in_fifo()/add_to_fifo() are not defined in this file; they are
 * presumably the "already seen" set from hash.h (TODO confirm).
 */
int print_result(char *wholetag, int l){
	struct stat stbuf;
	char *src;
	int pos;
	int lnk=is_abs(pathstr);
	char *attr=typeoflinkstr[typeoflink]; /* href, src or background */

	if (opt_A){
		/* just dump the reference, -d wins over -D */
		if (opt_d){
			printf("%s:%d: %s=\"%s\"\n",filename,l,attr,pathstr);
		}else if (opt_D){
			printf("%s:%d: %s=\"%s\" :%s\n",filename,l,attr,pathstr,abstype[lnk]);
		}else{
			printf("%s=\"%s\"\n",attr,pathstr);
		}
		return(0);
	}
	if (is_empty(pathstr)){
		if (opt_W==0) fprintf(stderr,"%s:%d: Warning, empty link %s=\"%s\"\n",filename,l,attr,pathstr);
		/*ignore empty links*/
		return(0);
	}
	/* not a rel link */
	if (lnk!=0) return(0);

	/* construct the full path: directory of the current file, a slash
	 * and the link. Even a relative link may look like
	 * ../info.html#sec1 or ../qer.pl?val=1 (cgi-bin between the html
	 * pages), the path ends in front of the first '#' or '?' */
	pos=0;
	for (src=dirname(filename); *src && pos < MAXTAGLEN; src++){
		linkpath[pos++]=*src;
	}
	linkpath[pos++]='/';
	for (src=pathstr; *src && pos < MAXTAGLEN; src++){
		if (*src == '#' || *src == '?') break;
		linkpath[pos++]=*src;
	}
	linkpath[pos]='\0';
	flattenpath(linkpath);

	/*show each file name only once*/
	if (is_in_fifo(0,linkpath)) return(0);
	add_to_fifo(0,linkpath,"");

	/* now we have all the information and must list the file names: */
	if (stat(linkpath,&stbuf)!=0){
		/* broken link */
		if (opt_W==0) fprintf(stderr,"%s:%d: Warning, broken link %s=\"%s\"\n",filename,l,attr,pathstr);
		/* do not list broken links */
		if (opt_a == 0) return(0);
	}else if (S_ISDIR(stbuf.st_mode)){
		/* the link points to a directory, look for an index file */
		if (ckdeffile(linkpath,indexfileslist,&fileindir)){
			/* broken link */
			if (opt_W==0) fprintf(stderr,"%s:%d: Warning, directory %s does not contain %s\n",filename,l,linkpath,indexfiles);
			/* do not list broken links */
			if (opt_a == 0) return(0);
		}else{
			strncpy(linkpath,fileindir,MAXTAGLEN);
			linkpath[MAXTAGLEN]=0;
			/* linkpath is now a different name, it may be one
			 * that we have listed already */
			if (is_in_fifo(0,linkpath)) return(0);
			add_to_fifo(0,linkpath,"");
		}
	}

	if (opt_d){
		printf("%s:%d: %s\n",filename,l,linkpath);
	}else if (opt_D){
		printf("%s:%d: %s :%s\n",filename,l,linkpath,abstype[lnk]);
	}else{
		printf("%s\n",linkpath);
	}
	return(0);
}

/* Check for the type of tag; main() passes this function as an argument
 * to findtag.
 * The first of the attributes href, src and background that is found in
 * wholetag decides: typeoflink is set to 1, 2 or 3, the attribute value
 * is copied to pathstr and print_result() is called. typeoflink stays 0
 * and nothing is printed if none of the attributes is found.
 * is_anchor is not used. Always returns 0. */
int evaltag(char *wholetag,int linenumber,int is_anchor){
	/* position k in this table corresponds to typeoflink k+1 */
	static char *attrpat[]={" href *= *"," src *= *"," background *= *"};
	char *valstart;
	int k;

	typeoflink=0;
	for (k=0; k<3; k++){
		if (!matchpat(wholetag,attrpat[k],&valstart)) continue;
		typeoflink=k+1; /* 1 href, 2 src, 3 background, 4 name */
		copy_file_path(valstart);
		print_result(wholetag,linenumber);
		break;
	}
	return(0);
}


/* Parse the options, then list the links of every html file named on the
 * command line.
 * Exit status: 0 on success and after printing the help (-h or no file
 * given), 1 for an option that getopt rejects. */
int main(int argc, char *argv[])
{
	/* The following things are used for getopt: */
        extern char *optarg;
        extern int optind;
        extern int opterr;
	int ch,i;

	opterr = 0; /* getopt stays silent, we print our own message */
	while ((ch = getopt(argc, argv, "ahdi:ADLW")) != -1) {
		switch (ch) {
		case 'a':
			opt_a=1;
			break;
		case 'd':
			opt_d=1;
			break;
		case 'D':
			opt_D=1;
			break;
		case 'i':
			opt_i=1;
			/* optarg points into argv, which stays valid until
			 * the program ends. No private copy is needed, so
			 * there is no allocation that could fail or leak
			 * when -i is given more than once. */
			indexfiles=optarg;
			break;
		case 'A':
			opt_A=1;
			break;
		case 'L':
			opt_L=1;
			break;
		case 'W':
			opt_W=1;
			break;
		case 'h':
			help(); /*no break, help does not return */
		case '?':
			fprintf(stderr, "ERROR: No such option. -h for help.\n");
			exit(1);
		/*no default action for case */
		}
	}
	if (optind == argc){
		/* no html file given */
		help();
	}
	init_fifo_class(0);
	if (opt_i==0){
		indexfiles=defindexfiles;
	}
	indexfileslist=string_to_list(indexfiles);
	/* put all files provided on the cmd line into the fifo
	 * before the first html file is read, a link from one of the files
	 * to another one is then not listed a second time */
	i=optind;
	while(i<argc){
		filename=argv[i];
		strncpy(linkpath,filename,MAXTAGLEN);
		linkpath[MAXTAGLEN-1]='\0';
		flattenpath(linkpath);
		/*show each file name only once*/
		add_to_fifo_unless_there(0,linkpath,"");
		i++;
	}
	/* we search for:
	 * _HREF="http://www.xxx/" _SRC="xxxx" (the _ is a space)
	 * _BACKGROUND="xxxx"
	 * Note: we search for it only inside < ... > and not outside
	 * Some broken html pages use however "<" instead of &lt; therefore
	 * findtag resets the state after MAXTAGLEN characters */
	while(optind<argc){
		/* search for html tags and call the function evaltag */
		filename=argv[optind];
		strncpy(linkpath,filename,MAXTAGLEN);
		linkpath[MAXTAGLEN-1]='\0';
		flattenpath(linkpath);
		/*print also the file name itself*/
		if (opt_L==0) printf("%s\n",linkpath);
		findtag(evaltag,filename,0);
		optind++;
	}
	return(0);
}
