/*GPL*START*
 * 
 * checkwwwlinks - check connectivity of html hyperlinks on local site
 * 
 * Copyright (C) 1998 by Johannes Overmann <overmann@iname.com>
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 * *GPL*END*/  


#include <sys/stat.h>
#include <unistd.h>
#include <ctype.h>
#include <dirent.h>
#include <stdio.h>
#include <stdlib.h>
#include "tregex.h"
#include "tappconfig.h"


// config


// progress granularity: main() prints one 'o' to stderr per 'progress'
// processed regex matches; the negated value is also handed to
// TRegEx::allMatchesSubstring() (presumably as its own progress interval,
// e.g. one '.' per 'progress' links -- confirm in tregex.h)
const int progress = 100;

// heuristic escape: upper bound on the number of characters scanned for a
// single attribute value, so a missing closing quote cannot make one
// reference run on without limit
const int MAX_REF_LEN = 2048;



// end of config


// impossible name string: sentinel meaning "this reference has no #NAME part"
// (main() initializes every name with it and compares against it later)
#define NO_NAME "\t\r\n"


// history:
// start: unknown (before 04 Jan 1998)
// 1998:
// 01 Oct 21:00 v0.1.1 prepare for sunsite, renamed to 'checkwwwlinks'
// 1999:
// 09 Dec 22:19 v0.1.3 bug in tstring.h (operator != not from trelops.h) fixed (spurious name tags)



// Option table handed to TAppConfig in main().
// Entries starting with '#' look like directives for TAppConfig itself
// (usage text, trailer, parsing mode) -- see tappconfig.h for their exact
// meaning. The "name=..." entries declare the command line options that
// main() queries by name: "html-ext", "unused", "external", "hide-comments"
// and "quiet". The list is terminated by the "EOL" entry.
const char *options[] = {
   "#usage='Usage: %n [OPTIONS, FILES and DIRS]... [--] [FILES and DIRS]...\n\n"
     "this program checks the connectivity of html FILES and DIRS and reports errors and statistics to stdout\n"     
     "progress information and runtime errors are written to stderr\n'",
   "#trailer='\n%n version %v *** (C) 1997 by Johannes Overmann\ncomments, bugs and suggestions welcome: %e\n%gpl'",
   "#onlycl", // only command line options
   "#stopat--", // stop option scanning after a -- parameter
   // options

   "name=html-ext         , type=string, char=H, default='html,htm,shtml', help='comma separated list of valid html extension'",
   "name=unused           , type=switch, char=u, help='show also unused name tags and unused non-html-files'", 
   "name=external         , type=switch, char=x, help='show also references to external resources like http, ftp, news and mailto'", 
   "name=hide-comments    , type=switch, char=c, help='hide all comments on output, also suppresses statistics'",
   "name=quiet            , type=switch, char=q, help='quiet execution, suppress progress indication'",
   "EOL" // end of list
};

// progress output on stderr enabled; main() sets this to !quiet
bool verbose;
// value of the 'quiet' option
bool quiet;
// value of the 'hide-comments' option: suppresses the '#' comment lines
// written by reportStrList() and the whole output of printStatistics()
bool hide_com;

// bookkeeping for printStatistics(), filled by reportStrList():
// class tag -> human readable description
TAssocArray<TString,TString> class2err;
// class tag -> number of reported entries
TAssocArray<TString,int> class2num;
// class tags in the order in which they were reported
TArray<TString> err_class;


// Print the summary block to stdout: one line per class recorded by
// reportStrList() (tag, entry count, description) followed by the total.
// Nothing is printed when comments are hidden (hide_com).
void printStatistics() {
   if(!hide_com) {
      printf("#\n# statistics\n#\n");

      // walk the classes in reporting order and accumulate the total
      int total = 0;
      int k = 0;
      while(k < err_class.num()) {
         const int count = class2num[err_class[k]];
         total += count;
         printf("# %s:%5d events (%s)\n", *err_class[k], count,
                *class2err[err_class[k]]);
         ++k;
      }

      if(total == 0) {
         printf("# no events/errors found\n");
      } else {
         printf("# %10d events (total)\n", total);
      }
   }
}


// split par to files, dirs, error and other and add them to the lists
void splitFilesAndDirs(const TArray<TString>& par, TArray<TString>& files, 
		       TArray<TString>& dirs, TArray<TString>& other, 
		       TArray<TString>& error) {
   struct stat filestat;
   int r;
   
   for(int i=0; i<par.num(); i++) {
      r = stat(par[i], &filestat);
      if(r) {
	 error += par[i];
      } else {
	 if(S_ISREG(filestat.st_mode)) {
	    files += par[i];
	 } else if(S_ISDIR(filestat.st_mode)) {
	    dirs  += par[i];
	 } else {
	    other += par[i];
	 }
      }
   }     
}


// Scan one directory (not recursive).
//
// dir:     path of the directory to list
// returns: "dir/name" for every entry of the directory except "." and "..",
//          in the order produced by scandir() with alphasort
//
// When 'verbose' is set a "scanning ..." progress line is written to stderr
// (long paths are shortened to their last 62 characters).
// If scandir() fails, userError() is called; should that call return, an
// empty list is returned.
//
// scandir() allocates the entry array and every single entry with malloc(),
// so each entry is released here in addition to the array itself.
TArray<TString> scanDirectory(TString dir) {
   struct dirent **dirents = NULL;
   TArray<TString> ret;
   
   if(verbose) {
      if(dir.len()>65) {
	 const char *p = dir;
	 p += dir.len()-62;
	 fprintf(stderr, "scanning ...%-62.62s\r", p);
      }
      else fprintf(stderr, "scanning %-65.65s\r", *dir);
      fflush(stderr);
   }

   int n = scandir(dir, &dirents, NULL, alphasort);
   if(n == -1) {
      userError("error while scanning directory '%s'\n", *dir);
      // only reached if userError() returns: nothing was allocated,
      // so there is nothing to free
      return ret;
   }

   for(int i=0; i<n; i++) {
      if(strcmp(".", dirents[i]->d_name) && strcmp("..", dirents[i]->d_name))
	ret += dir + "/" + TString(dirents[i]->d_name);
      free(dirents[i]); // entry was malloc'ed by scandir()
   }
   free(dirents);
   return ret;
}


// Report one class of findings on stdout and record it for the statistics.
//
// list:        the findings, one output line each ("<clas>:<entry>")
// description: human readable description of the class
// clas:        short class tag used as line prefix
//
// An empty list produces no output and is not recorded. The '#' header
// is omitted when comments are hidden (hide_com).
void reportStrList(const TArray<TString>& list, const TString& description, 
		   const TString& clas) {
   const int entries = list.num();
   if(entries <= 0) return;

   // remember the class for printStatistics()
   err_class += clas;
   class2err[clas] = description;
   class2num[clas] = entries;

   if(!hide_com) {
      printf("#\n# %d %s: %s\n#\n", entries, *clas, *description);
   }

   int k = 0;
   while(k < entries) {
      printf("%s:%s\n", *clas, *list[k]);
      k++;
   }
}


// Build a "file:line:error" string.
//
// file:  name of the file the message refers to
// pos:   character offset into that file
// error: message text appended after the line number
//
// The line number is 1 plus the number of '\n' characters before 'pos'.
// The file contents, the file name and the scan position are kept in
// function-local statics, so consecutive calls for the same file with
// increasing 'pos' continue counting where the previous call stopped.
TString fileLineError(const TString& file, int pos, const TString& error) {
   static TString cached_text;
   static TString cached_name;
   static int scanned = 0;
   static int cur_line = 1;

   bool rewind = false;

   // different file than last time: (re)load it
   if(file!=cached_name) {
      cached_text.readFile(file);
      cached_name = file;
      rewind = true;
   }

   // requested position lies before the current scan position
   if(scanned > pos) rewind = true;

   if(rewind) {
      scanned = 0;
      cur_line = 1;
   }

   // advance to 'pos', counting line breaks on the way
   while(scanned < pos) {
      if(cached_text[scanned] == '\n') cur_line++;
      scanned++;
   }

   return file + ":" + TString(cur_line) + ":" + error;
}


// main
//
// Program flow:
//   1. parse the command line (TAppConfig) and fetch the option values
//   2. collect all files below the given files/directories
//   3. split them into html and non-html files by filename extension
//   4. build lookup tables from normalized path to array index
//   5. read every html file and extract its src/href/name/background
//      attribute values together with their positions
//   6. check every extracted reference against the lookup tables
//   7. optionally collect unused name tags and unused non-html files
//   8. print all lists (reportStrList) and the statistics to stdout
// Progress information goes to stderr unless 'quiet' is set.
// Returns 0 when it runs to completion.
int main(int argc, char *argv[]) {
   TArray<TString> test; // use for testing / debugging
   int i,j;
   
   // get parameter
   TAppConfig ac(options, "options", argc, argv, 0, 0, VERSION);
   hide_com= ac("hide-comments");
   quiet   = ac("quiet");
   verbose = !quiet;
   TString filter  = ac.getString("html-ext");
   bool show_unused   = ac("unused");
   bool show_external = ac("external");
   
   // get files 
   // directories are expanded level by level: each pass lists the current
   // directories, classifies the entries and leaves the newly found
   // subdirectories in 'dirs' for the next pass
   TArray<TString> files;
   TArray<TString> dirs;
   TArray<TString> non_regular;
   TArray<TString> non_existent;
   splitFilesAndDirs(ac.params(), files, dirs, non_regular, non_existent);
   TArray<TString> newfiles;
   while(dirs.isNotEmpty()) {	 
      for(i=0; i<dirs.num(); i++)
	newfiles += scanDirectory(dirs[i]);
      dirs.empty();
      splitFilesAndDirs(newfiles, files, dirs, non_regular, non_existent);
      newfiles.empty();
   }
   if(verbose) fprintf(stderr, "                                                                              \r");


   // filter html files 
   // a file counts as html when its extension equals one of the comma
   // separated entries of the 'html-ext' option
   TArray<TString> html_files;
   TArray<TString> non_html_files;   
   if(verbose) fprintf(stderr, "filtering html files (%s)     \n", *filter);
   TArray<TString> filt = split(filter, ",");
   for(i=0; i < files.num(); i++) {
      TString ext(files[i]);
      ext.extractFilenameExtension();
      for(j=0; j < filt.num(); j++)
	if(ext==filt[j]) break;
      if(j==filt.num()) non_html_files += files[i]; // no match
      else html_files += files[i]; // match
   }
   
   
   // build reverse lookup table
   // files2i is only queried with contains() below (every value is -1), so
   // it acts as the set of all known files; the other two tables map a
   // normalized path to its index in html_files / non_html_files
   if(verbose) fprintf(stderr, "building reverse lookup table                          \n");   
   TAssocArray<TString,int> files2i; 
   TAssocArray<TString,int> html_files2i; 
   TAssocArray<TString,int> non_html_files2i; 
   for(i=0; i < html_files.num(); i++) {
      html_files[i].normalizePath();
      files2i[html_files[i]] = -1;
      html_files2i[html_files[i]] = i;
      if(verbose) {
	 if((i%50)==0) fprintf(stderr, "%d/%d       \r", i, html_files.num());
      }
   }
   for(i=0; i < non_html_files.num(); i++) {
      non_html_files[i].normalizePath();
      files2i[non_html_files[i]] = -1;
      non_html_files2i[non_html_files[i]] = i;
      if(verbose) {
	 if((i%50)==0) fprintf(stderr, "%d/%d       \r", i, non_html_files.num());
      }
   }
   
   
   // build reference tables
   // per html file i:
   //   reference[i]      - attribute values of href/src/background
   //   reference_pos[i]  - offset of each value within the file
   //   reference_type[i] - lower-cased first letter of the attribute name
   //                       ('h', 's' or 'b')
   //   name_tag[i]       - value of each name attribute -> its offset
   // NOTE(review): these arrays are indexed with [i] without any visible
   // resizing -- presumably TArray grows on access; confirm in the TArray
   // implementation.
   if(verbose) fprintf(stderr, "processing %d html files...\n", html_files.num());
   TArray<TString> nul_html_files; // html files containing NUL char
   TString f;
   TArray<TArray<int> > all; // for regexp searching
   TRegEx reg("(src|href|name|background)[[:space:]]*=(.)", REG_EXTENDED|REG_ICASE);
   // index into one match record of 'all'; presumably the start offset of
   // the second capture group (the character right after '='), with
   // all[j][0] being the start of the whole match -- TODO confirm in tregex.h
   int reg_off=4;
   reg.exitOnError();   
   TArray<TArray<TString> > reference;
   TArray<TArray<int> > reference_pos;
   TArray<TArray<char> > reference_type;
   TArray<TAssocArray<TString,int> > name_tag;
   for(i=0; i < html_files.num(); i++) {
      if(verbose) {
	 fprintf(stderr, "reading %d/%d %s\r", i, html_files.num(), html_files[i].pSuf(54));
	 fflush(stderr);
      }
      if(f.readFile(html_files[i]))
	userError("error while reading file '%s'!\n", *html_files[i]);
      // files containing NUL characters are only reported, not parsed
      // (the scanning below relies on a 0 byte as end marker)
      if(f.containsNulChar())
	nul_html_files += html_files[i];
      else {
	 all.empty();
	 reg.allMatchesSubstring(f, all, 0, -progress);
	 if(verbose) putc('\r', stderr);
	 for(j=0; j < all.num(); j++) {
	    
	    // get string value
	    // p points at the value, l becomes its length; a quoted value
	    // ends at the matching quote character, an unquoted one at
	    // whitespace or '>'; both are limited to MAX_REF_LEN characters
	    // NOTE(review): isspace() is called with a plain char here and
	    // below; for bytes >= 0x80 this can be a negative value, which
	    // <ctype.h> does not allow -- consider casting to unsigned char
	    const char *p = *f+all[j][reg_off];
	    int l=0;	    	   
	    while(isspace(*p)) p++; // skip blanks	    
	    if(strchr("\"'`", *p)) { // quoted
	       char term = *(p++);  // quote
	       while((p[l]!=term) && (p[l]!=0) && (l<MAX_REF_LEN)) l++;
	    } else {
	       while((!isspace(p[l])) && (p[l]!=0) && 
		     (p[l]!='>') && (l<MAX_REF_LEN)) l++;	       
	    }

	    // save tag
	    // the first character of the match distinguishes 'name' from the
	    // other three attribute names
	    if(tolower(*(*f+all[j][0])) == 'n') { // NAME tag found
	       name_tag[i][TString(p, l)] = (p-(*f));
	    } else {                              // HREF or SRC or BACKGROUND tag found
	       reference[i]      += TString(p, l);
	       reference_pos[i]  += (p-(*f));
	       reference_type[i] += tolower(*(*f+all[j][0]));
	    }
	    	    
	    // progress
	    if(verbose) {
	       if((j%progress)==0) {
		  putc('o', stderr);
		  fflush(stderr);
	       }
	    }
	 }	 
	 if(verbose) putc('\r', stderr);
      }      
   }
   
        
   // check references
   // a name tag that is referenced gets its stored offset negated, so the
   // tags still positive afterwards are the unused ones
   if(verbose) 
     fprintf(stderr, "checking references                                                           \n");
   TArray<TString> unknown_ref;
   TArray<TString> unknown_name;
   TArray<TString> unused_name;
   TArray<TString> unused_file;
   TArray<TString> absolute_ref;
   TArray<TString> http_ref;
   TArray<TString> ftp_ref;
   TArray<TString> mailto_ref;
   TArray<TString> news_ref;
   TArray<TString> ref_syntax_error;
   TArray<TString> name_for_non_html;
   TAssocArray<TString,int> non_html_used;
   for(i=0; i < html_files.num(); i++) {
      
      // get path to current file
      TString path(html_files[i]);
      path.extractPath();
      
      for(j=0; j < reference[i].num(); j++) {
	 
	 // get ref
	 TString ref(reference[i][j]);
	 char type = reference_type[i][j];
	 int pos = reference_pos[i][j];
	 TString name(NO_NAME);
	 
	 // get ref and name
	 // only href values are split at the first '#' into REF and NAME
	 if(type=='h') {
	    int sep = ref.firstOccurence('#');
	    if(sep!=-1) { // has #NAME
	       name = ref(sep+1, TString::END);
	       ref = ref(0, sep);
	    }
	 }
	 
	 // check for external
	 // external references are never verified, only listed on request
	 // NOTE(review): only these four prefixes are recognized; any other
	 // scheme (e.g. "https:") falls through to the local file branch
	 if(ref.hasPrefix("http:")) {
	    if(show_external) 
	      http_ref += fileLineError(html_files[i], pos, ref);
	 } else if(ref.hasPrefix("ftp:")) {
	    if(show_external) 
	      ftp_ref += fileLineError(html_files[i], pos, ref);
	 } else if(ref.hasPrefix("mailto:")) {
	    if(show_external) 
	      mailto_ref += fileLineError(html_files[i], pos, ref);
	 } else if(ref.hasPrefix("news:")) {
	    if(show_external) 
	      news_ref += fileLineError(html_files[i], pos, ref);
	 } else {
	    if(ref.len()==0) {
	       // local name ref
	       // ("#NAME" pointing into the current file)
	       if(name!=NO_NAME) {
		  // check for name		  
		  if(!name_tag[i].contains(name)) {
		     // name not found
		     unknown_name += fileLineError(html_files[i], pos, 
						  reference[i][j]);
		  } else {
		     // found and used name
		     if(name_tag[i][name] > 0)
		       name_tag[i][name] = -name_tag[i][name];
		  }		  
	       } else {
		  // reference syntax error
		  // (empty reference without a name part)
		  ref_syntax_error += fileLineError(html_files[i], pos, 
						   reference[i][j]);
	       }	       
	    } else {
	       // internal resource
	       
	       // build full ref
	       // relative references are resolved against the directory of
	       // the referencing file; absolute ones are reported and then
	       // looked up unchanged
	       TString full_ref;
	       if(ref.isAbsolutePath()) {
		  // absolute ref
		  absolute_ref += fileLineError(html_files[i], pos, ref);
		  full_ref = ref;
	       } else {
		  full_ref = path + "/" + ref;
	       }
	       full_ref.normalizePath();
	       
	       
	       // check for ref
	       if(!files2i.contains(full_ref)) {
		  // ref not found
		  unknown_ref += fileLineError(html_files[i], pos, full_ref);
	       } else {
		  if(name!=NO_NAME) {
		     // check for name
		     if(html_files2i.contains(full_ref)) { 
			int k = html_files2i[full_ref];
			if(!name_tag[k].contains(name)) {
			   // name not found
			   // NOTE(review): 'name' is not read again after
			   // this call in this iteration, so the call looks
			   // like it has no effect on the output -- confirm
			   name.expandUnprintable();
			   unknown_name += fileLineError(html_files[i], pos, 
							reference[i][j]);
			} else {
			   // found and used name
			   if(name_tag[k][name] > 0)
			     name_tag[k][name] = -name_tag[k][name];
			}		  
		     } else {
			// name given for non html file
			name_for_non_html += fileLineError(html_files[i], pos, 
							   reference[i][j]);
		     }
		  }
		  
		  if(non_html_files2i.contains(full_ref)) {
		     // found and used non html file
		     non_html_used[full_ref] = 1;
		  }	       	       
	       }
	    }
	 }	    
      }
      if(verbose) {
	 fprintf(stderr, "%d/%d    \r", i+1, html_files.num());
      }
   }
   if(verbose) {
      fprintf(stderr, "                       \r");
   }
   
   
   // build unused lists
   if(show_unused) {
      if(verbose) 
	fprintf(stderr, "building unused lists                             \n");
      for(i=0; i < html_files.num(); i++) {
	 // entries whose stored offset is still positive were never
	 // referenced; presumably *p yields the stored value and p() the
	 // key (the tag name) -- confirm in the TAssocArrayIter declaration
	 for(TAssocArrayIter<TString,int> p(name_tag[i]); p; p++) {
	    if(*p>0) { 
	       unused_name += fileLineError(html_files[i], *p, p());
	    }
	 }
	 if(verbose) {
	    if((i%10)==0) fprintf(stderr, "%d/%d       \r", i, html_files.num());
	 }
      }
      for(i=0; i < non_html_files.num(); i++) {
	 if(!non_html_used.contains(non_html_files[i])) 
	   unused_file += non_html_files[i];
	 if(verbose) {
	    if((i%50)==0) fprintf(stderr, "%d/%d       \r", i, non_html_files.num());
	 }
      }      
   }
   
   // normalize filenames
   for(i=0; i < non_regular.num(); i++) 
     non_regular[i].normalizePath();
   for(i=0; i < non_existent.num(); i++) 
     non_existent[i].normalizePath();

   
   // print
   // (each call prints nothing when its list is empty)
   
   
   // report nonregular
   reportStrList(non_regular, "irregular file", "irre");
   
   // report nonexistent
   reportStrList(non_existent, "non existant file", "nexi");

   // report html files containing NUL char
   reportStrList(nul_html_files, "html files containing NUL char", "null");

   // absolute refs
   reportStrList(absolute_ref, "absolute path in reference to local file", "abso");

   // unknown refs
   reportStrList(unknown_ref, "dangling reference to local file", "dref");
   
   // unknown names
   reportStrList(unknown_name, "undefined name tag", "name");
   
   // reference syntax error
   reportStrList(ref_syntax_error, "reference syntax error", "rsyn");
   
   // name for non html
   reportStrList(name_for_non_html, "name tag for non html file", "nfnh");
   
   // unused warnings
   if(show_unused) {
      reportStrList(unused_file, "unused file", "uref");
      reportStrList(unused_name, "unused name tag", "unam");      
   }
   
   // external refs
   if(show_external) {
      reportStrList(http_ref, "http reference", "http");
      reportStrList(ftp_ref, "ftp reference", "ftpr");
      reportStrList(mailto_ref, "mailto reference", "mail");
      reportStrList(news_ref, "news reference", "news");      
   }
   
   // test / debug
   reportStrList(test, "test/debug info", "test");
   
   // end
   printStatistics();   
   return 0;
}





















