/*GPL*START*
 * regextract - extract ascii data with a regexp and print formatted
 * Copyright (C) 1998 by Johannes Overmann <overmann@iname.com>
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 * *GPL*END*/  

#include <sys/stat.h>
#include <unistd.h>
#include <dirent.h>
#include <stdio.h>
#include <stdlib.h>
#include "tappconfig.h"
#include "tregex.h"
#include "file_tools.h"


// config:

// maximum number of substrings to print:
// (main() passes exactly this many string arguments to its printf() call and
// carries an #error guard that fires when this value is changed without
// adapting that call)
#define MAX_SUB 10

// 1998:
// 16:44 04 Aug v0.0  starting regextract by cloning substool v0.9a
// 19:27 04 Aug v0.1  seems to work perfectly, tested with email names/adds and long regexp
// 22:04 05 Aug v0.2  binary file processing, bugfix
// 11:25 06 Aug v0.3  added lower/upper/capitalize
// 18:05 18 Sep v0.3.1 preparing for sunsite

// update/use VERSION in Makefile!

// Option specification table handed to TAppConfig in main().
// Entries starting with '#' carry the usage text, the trailer and two parser
// settings; the remaining entries each describe one command line option
// (name, type, short option char, help text, optional headline/param).
// The list is terminated by the sentinel entry "EOL".
const char *options[] = {
   "#usage='Usage: %n [OPTIONS, FILES, DIRS and PATTERN=FORMAT... ] [--] [FILES and DIRS]\n\n"
     "search for regexp PATTERNS in FILES (and DIRS if -r) and print parenthesized\n"
     "substrings with FORMAT (like printf, use %s, see below) to stdout\n"
     "example: \\'0[xX]([0-9a-fA-F]+)=HexValue(%s)\\n\\' will print \\'HexValue(deadbeef)\\'\n"
     "plus a newline if somewhere the string \\'0xdeadbeef\\' is found\n"
     "FORMAT: use \\'%s\\', \\'%l\\', \\'%u\\' and \\'%c\\' to print a substring \\'as is\\',\n\\'in lower case\\', \\'in upper case\\' and \\'capitalized\\'\n"
     "the conversion parameters are the same as for \\'%s\\' in \\'printf\\'\n"
     "example: the format \\'%-10.10u\\' will print exactly 10 chars left adjusted in\nupper case\n"
     "always be aware of the shell: use backslashes and quotes with care\n"
     "'",   
   "#trailer='\n%n version %v *** (C) 1997 by Johannes Overmann\ncomments, bugs and suggestions welcome: %e\n%gpl'",
   "#onlycl", // only command line options
   "#stopat--", // stop option scanning after a -- parameter
   // options
   "name=recursive        , type=switch, char=r, help=recurse directories, headline=file options:",
   "name=follow-links     , type=switch, char=l, help=follow symbolic links",
   "name=filter           , type=string, char=P, help='process only files whose filenames match the regexp PAT', param=PAT",
   "name=binary           , type=switch, char=b, help='do *not* skip binary files but replace every occurrence of a \\0 char by a \\1 char to make it possible to use the file as a valid C-string for the use with regex'",
   
   "name=ignore-case      , type=switch, char=i, help=ignore case, headline=matching options:",
   "name=match-newline    , type=switch, char=n, help=match any character operators also match newline",
   
   "name=verbose          , type=switch, char=v, help='be more verbose about (not) processing files on stderr', headline=common options:",
   "name=no-color         , type=switch, char=Q, help='do *not* colorize output with ansi sequences'",
   "EOL" // end of list
};




// ANSI escape sequences used to highlight the rules echoed on stderr:
// color_bold switches highlighting on (SGR attributes 33 and 01), color_nor
// resets all attributes. Both pointers are deliberately non-const: main()
// redirects them to "" when --no-color is given.
const char *color_bold = "\033[33;01m";
const char *color_nor  = "\033[00m";


// Classify every path in 'par': regular files are appended to 'files',
// directories to 'dirs'. Anything else (symbolic links, devices, ...) is
// dropped, with a note on stderr unless 'quiet' is set.
//   follow_links  true: examine link targets (stat), false: examine the
//                 link itself (lstat)
// A path that cannot be stat'ed is reported through userError().
void splitFilesAndDirs(const TArray<string>& par, TArray<string>& files, 
		       TArray<string>& dirs, bool follow_links, bool quiet) {
   struct stat filestat;

   for(int idx = 0; idx < par.num(); ++idx) {
      const int status = follow_links ? stat(par[idx], &filestat)
                                      : lstat(par[idx], &filestat);
      if(status != 0)
        userError("'%s': no such file or directory\n", *par[idx]);

      // regular files and directories are collected
      if(S_ISREG(filestat.st_mode)) {
         files += par[idx];
         continue;
      }
      if(S_ISDIR(filestat.st_mode)) {
         dirs += par[idx];
         continue;
      }

      // everything else is ignored, silently if requested
      if(quiet)
        continue;
      if(S_ISLNK(filestat.st_mode))
        fprintf(stderr, "ignoring symbolic link '%s'\n", *par[idx]);
      else
        fprintf(stderr, "ignoring non-regular file '%s' (mode=%#o)\n",
                *par[idx], (int)filestat.st_mode);
   }
}


// Return the entries of directory 'dir' in alphasort() order, each prefixed
// with "dir/". The entries "." and ".." are left out.
//   dir    directory to scan
//   quiet  when false, a one-line "scanning ..." progress message (terminated
//          by '\r', long paths shortened to their tail) is written to stderr
// A scandir() failure is reported through userError().
TArray<string> scanDirectory(string dir, bool quiet) {
   // NULL so that the final free() is harmless should userError() return
   struct dirent **dirents = NULL;
   TArray<string> ret;
   
   if(!quiet) {
      if(dir.len()>65) {
         // path too long for one line: show only the last 62 characters
         const char *p = dir;
         p += dir.len()-62;
         fprintf(stderr, "scanning ...%-62.62s\r", p);
      }
      else fprintf(stderr, "scanning %-65.65s\r", *dir);
      fflush(stderr);
   }

   const int count = scandir(dir, &dirents, NULL, alphasort);
   if(count == -1)
     userError("error while scanning directory '%s'\n", *dir);

   for(int i = 0; i < count; i++) {
      if(strcmp(".", dirents[i]->d_name) && strcmp("..", dirents[i]->d_name))
        ret += dir + "/" + string(dirents[i]->d_name);
      // scandir() allocates every entry separately with malloc(), so each one
      // has to be released in addition to the array itself
      free(dirents[i]);
   }
   free(dirents);
   return ret;
}


int main(int argc, char *argv[]) {
   int i,j,k;

   // get parameters
   TAppConfig ac(options, "options", argc, argv, 0, 0, VERSION);

   // setup
   bool quiet = !ac("verbose");   
   bool ignore_case = ac("ignore-case");
   bool follow_links = ac("follow-links");
   if(ac("no-color"))
     color_bold = color_nor = "";
   int prog_num = 5;
   if(quiet) prog_num = 0;
   bool skip_binary = !ac("binary");
   
   
   // get patterns and files
   TArray<string> l_rules;
   TArray<string> r_rules;
   TArray<string> par;
   TArray<TRegEx *> regex;
   INSTANCENAME(l_rules);
   INSTANCENAME(r_rules);
   INSTANCENAME(par);
   INSTANCENAME(regex);
   for(i=0; i<ac.numParam(); i++) {
      if(ac.param(i) == "--") { // files follow
	 i++;
	 break;
      }
      TArray<string> a(split(ac.param(i), "=", true));
      switch(a.num()) {
       case 1: // file
	 par += a[0];
	 break;
       case 2: // rule
	 a[0].compileCString();
	 a[1].compileCString();
	 if(a[0].len()==0) 
	   userError("can't handle empty pattern: '%s'!\n", *ac.param(i));
	 l_rules += a[0];
	 r_rules += a[1];
	 break;
       default: // error
	 userError("invalid pattern '%s'!\n(expecting format 'pattern=format', if you want a literal '=' char use '\\=', but be aware of the shell: you may want to use '\\\\=')\n", *ac.param(i));
      }
   }

   // add remaining parameters as files (only after a '--') 
   for(; i<ac.numParam(); i++) 
     par += ac.param(i);

   // check patterns
   if(l_rules.num()==0)
     userError("no pattern given! (at least one pattern is needed, try --help)\n");
   for(i=0; i<l_rules.num(); i++) {
      if(l_rules[i].containsNulChar()) {
	 l_rules[i].expandUnprintable();
	 userError("illegal null char in regexp pattern '%s'!\n", *l_rules[i]);
      }
   }

   // print patterns
   if(!quiet) {
      int r = l_rules.num();
      fprintf(stderr, "%d rule%s given:\n", r, (r==1)?"":"s");
      for(i=0; i<l_rules.num(); i++) {
	 string l(l_rules[i]);
	 string r(r_rules[i]);
	 l.expandUnprintable();
	 r.expandUnprintable();
	 fprintf(stderr, "  '%s%s%s' ==> '%s%s%s'\n", 
		color_bold, *l, color_nor,
		color_bold, *r, color_nor);
      }
      fprintf(stderr, "\n");
   }

   // compile regex
   int flags=0;
   if(ignore_case) flags |= REG_ICASE;
   flags |= REG_EXTENDED;
   if(!ac("match-newline")) flags |= REG_NEWLINE;
   for(i=0; i<l_rules.num(); i++) {
      regex[i] = new TRegEx(l_rules[i], flags);
      if(regex[i]->error()) {
	 l_rules[i].expandUnprintable();
	 userError("error compiling regex '%s':\n%s\n", *l_rules[i], *regex[i]->errorToStr());
      }
   }
   regex.fixedSize();

   // compile output format
   TArray<string> format;
   enum {UPPER=1, LOWER=2, CAPITALIZE=3};
   TArray<TArray<int> > format_info;
   for(i=0; i < r_rules.num(); ++i) {
      int seq = 0;
      format[i] = r_rules[i];
      string f = r_rules[i];
      int len = f.len();
      f += "%";
      for(j = 0; j < len; ++j) {
	 if(f[j] == '%') {
	    if(f[j+1] == '%') ++j; // skip %% char
	    else {
	       j++;
	       while(strchr("0123456789.-+# ", f[j])) ++j;
	       switch(f[j]) {
		case 's': // normal string
		  format_info[i][seq] = 0;
		  break;
		case 'l': // lower string
		  format[i][j] = 's';
		  format_info[i][seq] = LOWER;
		  break;
		case 'u': // upper string
		  format[i][j] = 's';
		  format_info[i][seq] = UPPER;
		  break;
		case 'c': // capitalize string
		  format[i][j] = 's';
		  format_info[i][seq] = CAPITALIZE;
		  break;

		default:
		  r_rules[i].expandUnprintable();
		  userError("illegal conversion character '%c' in format '%s'!\nuse 's', 'l', 'u' or 'c' to print the string 'as is', 'in lowercase',\n'in uppercase' or 'capitalized', respectively\n", f[j], *r_rules[i]);
	       }
	       seq++;
	    }
	 }
      }
   }
        
   // get files 
   TArray<string> files;
   TArray<string> dirs;
   splitFilesAndDirs(par, files, dirs, follow_links, quiet);
   if(ac("recursive")) { // recursive
      TArray<string> newfiles;
      while(dirs.isNotEmpty()) {	 
	 for(i=0; i<dirs.num(); i++) 
	   newfiles += scanDirectory(dirs[i], quiet);
	 dirs.empty();
	 splitFilesAndDirs(newfiles, files, dirs, follow_links, quiet);
	 newfiles.empty();
      }
   } else { // not recursive
      if(!quiet) {
	 for(i=0; i<dirs.num(); i++)
	   fprintf(stderr, "ignoring directory '%s'\n", *dirs[i]);
      }
   }
   
   // check for files 
   if(files.num() == 0)
     userError("no file(s) specified!\n");     
   
   // filter files
   if(ac.getString("filter")) {
      string filterstr(ac.getString("filter"));
      TRegEx filter(filterstr, REG_EXTENDED);
      if(filter.error()) {
	 filterstr.expandUnprintable();
	 userError("error compiling filter regex '%s':\n%s\n", *filterstr, *filter.errorToStr());
      }
      for(i=0; i < files.num(); i++) {
	 if(!filter.match(files[i])) files[i].empty();
      }
   }
      
   // check for files
   int numfiles = 0;
   for(i = 0; i < files.num(); ++i) if(files[i]) ++numfiles;
   if(numfiles == 0) {
      if(!quiet) {
	 fprintf(stderr, "nothing to do (no files match pattern)!\n");
      }
   }
      
   // process files
   string f;
   for(i = 0; i < files.num(); i++) {
      if(files[i]) {
	 
	 // read file
	 if(!quiet) {
	    fprintf(stderr, "processing '%s'\n", *files[i]);
	    fflush(stderr);
	 }
	 int len = flen(files[i]);
	 FILE *fp = fopen(files[i], "rb");
	 if(fp==0) {
	    userError("error while opening file '%s' for reading!\n", *files[i]);
	 }
	 f.read(fp, len);
	 fclose(fp);
	 
	 // process file
	 if(f.containsNulChar()) {
	    if(skip_binary) {
	       if(!quiet) {
		  fprintf(stderr, "skipping binary file\n");
	       }
	       continue;
	    } else {
	       if(!quiet) {
		  fprintf(stderr, "converting binary file\n");
	       }
	       f.translateChar(0, 1);
	    }
	 } 

	 // apply all patterns
	 for(int r=0; r < l_rules.num(); r++) {
	    TArray<TArray<int> > all;
	    regex[r]->allMatchesSubstring(f, all, 0, prog_num, TRegEx::P_NUMBER | TRegEx::P_STDERR);
	    if(!quiet) fprintf(stderr, "\n");
	    for(k=0; k < all.num(); k++) {
	       int num = all[k].num()/2 - 1;
	       if(num > MAX_SUB) num = MAX_SUB;
	       TArray<string> str;
	       for(j=0; j < num; j++) {
		  str[j] = f(all[k][2*j+2], all[k][2*j+2] + all[k][2*j+3]);
	       }
	       
	       // convert substring
	       for(j=0; j < num; j++) {
		  switch(format_info(r)(j)) {
		   case UPPER:
		     str[j].upper();
		     break;
		   case LOWER:
		     str[j].lower();
		     break;
		   case CAPITALIZE:
		     str[j].capitalize();
		     break;
		  }
	       }
	       
	       // print
#if (MAX_SUB != 10) 
#error fix printf arguments to fit MAX_SUB
#endif
	       printf(*format[r], *str[0], *str[1], *str[2], *str[3], *str[4], 
		      *str[5], *str[6], *str[7], *str[8], *str[9]);
	       
	    } // for k (all occurences)
	 } // for r (number of rules	 
      } // if files
   } // for files
   
   // cleanup
   for(i=0; i<l_rules.num(); i++)
     delete regex[i];
   
   // normal exit
   return 0;
}


