#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "web.h"

/* Server root directory and URL-rewriting prefix.  Both start out as the
 * compile-time defaults and are replaced by the -o and -p command-line
 * options respectively (see main).
 */
const char * pszServerRoot = SERVERROOT;
const char * pszURLPrefix = URLPREFIX;

/* Forward declarations for functions defined later in this file. */
FILE * GetPage (char * pszHost, int nPort, char * pszFile, char * pszDest,
		int bOverwrite);

int process_page (char * host, int port, char * file, char * destdir,
		  int overwrite, int max_recurs);

/* Configuration handling: readconfig() (below) looks for
 * /etc/webcrawl.conf, /usr/local/etc/webcrawl.conf and $HOME/.webcrawl,
 * and passes every file that is found to processconfig().
 */

/* Parse one already-opened configuration file.
 *
 *   fp       - stream to read the configuration from
 *   filename - name used in diagnostics; also passed on to
 *              rename_readconfig()
 *
 * The only section header recognised here is "[rename]"; its contents are
 * handed to rename_readconfig(), which is expected to leave the next
 * unprocessed line in szBuf.  Any other line is reported on stderr together
 * with its line number and the program exits with status 1.
 */
void processconfig(FILE * fp, const char * filename)
{
    char szBuf[256];
    int line;

    /* An empty file or a failed read leaves nothing to parse, and szBuf
     * would be uninitialised, so stop here. */
    if (fgets(szBuf, sizeof szBuf, fp) == NULL)
	return;
    line = 1;
    while (!feof(fp) && !ferror(fp))
    {
	/* fgets() keeps the line terminator; strip it, otherwise the
	 * section header could never compare equal. */
	szBuf[strcspn(szBuf, "\r\n")] = '\0';

	if (!strcmp(szBuf, "[rename]")) {
	    rename_readconfig(fp, filename, &line, szBuf, sizeof szBuf);
	    continue; /* next line returned in buffer */
	}
	else {
	    fprintf (stderr, "%s, line %d: I don't understand that\n",
		     filename, line);
	    exit(1);
	}
    }
}

/* Read every configuration file that exists, in this order:
 * /etc/webcrawl.conf, /usr/local/etc/webcrawl.conf, $HOME/.webcrawl.
 * Files that cannot be opened are silently skipped.  Each file that opens
 * is passed to processconfig(), which exits the program on a line it does
 * not understand.
 */
void readconfig()
{
    static const char szUserFile[] = "/.webcrawl";
    char buf[256];
    const char * pszHome;
    FILE * fp;

    if ((fp = fopen("/etc/webcrawl.conf", "r")) != NULL) {
	processconfig(fp, "/etc/webcrawl.conf");
	fclose(fp);
    }
    if ((fp = fopen("/usr/local/etc/webcrawl.conf", "r")) != NULL) {
	processconfig(fp, "/usr/local/etc/webcrawl.conf");
	fclose(fp);
    }

    pszHome = getenv("HOME");
    if (pszHome) {
	/* $HOME comes from the environment and may be arbitrarily long;
	 * sizeof szUserFile already accounts for the terminating NUL. */
	if (strlen(pszHome) + sizeof szUserFile > sizeof buf) {
	    fprintf(stderr, "$HOME is too long; "
		    "ignoring per-user configuration file\n");
	    return;
	}
	strcpy(buf, pszHome);
	strcat(buf, szUserFile);
	if ((fp = fopen(buf, "r")) != NULL) {
	    processconfig(fp, buf);
	    fclose(fp);
	}
    }
}
/* Print the command-line help text on stdout. */
static void webcrawl_usage (void)
{
    printf ("usage: webcrawl [options] <webaddress> <destination_dir>\n");
    printf ("options:\n");
    printf ("  URL selection (default=don't follow off-site links):\n");
    printf ("      -a      ask the user whether to jump to new servers\n");
    printf ("      -f str  always follow links to URLS that contain string"
                          " 'str'\n");
    printf ("      -d str  don't ever follow links containing 'str'\n");
    printf ("      -u f    log unfollowed URLs to file 'f'\n");
    printf ("      -x      don't follow any page links by default!\n");
    printf ("      -X      don't load inline images by default\n");
    printf ("  Page re-writing:\n");
    printf ("      -n      don't rewrite the pages with local URLs\n");
    printf ("      -r<x>   rewrite non relative URLs when: a - always\n");
    printf ("              l - URL is local, f (default) - target file "
	    "exists\n");
    printf ("      -k      keep existing names; disable renaming files "
	    "to sane filenames\n");
    printf ("      -q      disable insertion of process id in query "
	    "filenames\n");
    printf ("  Recursion limiting:\n");
    printf ("      -l[x] n limit depth of search to find files to n, "
	    "initially with -ll,\n");
    printf ("              after jump to remote site with -lr, "
	    "both with -l\n");
    printf ("  General options:\n");
    printf ("      -v      increase verbosity (use up to 4 times)\n");
    printf ("      -[op] d change o: server root directory, p: url "
	    "rewriting prefix to d\n");
    printf ("  HTTP options:\n");
    printf ("      -A      set the agent name\n"
	    "              default = '"
	    DEF_USER_AGENT "'\n");
    printf ("      -t n    set timeout to n seconds\n");
    printf ("      -T      use no data timeout, rather than overall "
	    "connection timeout\n");
    printf ("\nweb address should not have a leading http://, and "
	    "destination dir is taken\n");
    printf ("relative to the server root directory (" SERVERROOT ").\n");
    printf ("webcrawl version " WEBCRAWL_VERSION "\n");
}

/* Report that option pszOption lacks its value; returns the exit status. */
static int webcrawl_missing_arg (const char * pszOption)
{
    fprintf (stderr, "Option %s requires an argument\n", pszOption);
    return 1;
}

/* Program entry point.
 *
 * Sets the defaults in the global `options', parses the leading '-'
 * options, then treats the next two arguments as the web address and the
 * destination directory and starts crawling with process_page().
 *
 * Returns 1 on a usage or option error or an invalid URL, otherwise the
 * result of process_page().
 */
int main (int argc, char * * argv)
{
    char	* pszURL, * pszDestDir, szHost [128], szFile [128];
    int		port;
    int		depth;

    argc--,argv++;
    if (argc < 2)
    {
	webcrawl_usage ();
	return 1;
    }

    /* URL selection defaults */
    options.bAsk = 0;
    options.nAlwaysFollow = 0;
    options.nNeverFollow = 0;
    options.bFollowNone = 0;
    options.bImageOverride = 1;
    options.fURLLog = NULL;

    /* page rewriting defaults */
    options.bRewrite = 1;
    options.cRewriteMode = 'f';
    options.bRename = 1;
    options.bQueryAddPid = 1;

    /* recursion limits: process_page only stops at 0, so -1 = no limit */
    options.nRemote = -1;
    options.nLocal = -1;

    options.bVerbose = 0;

    /* HTTP defaults */
    options.userAgent = DEF_USER_AGENT;
    options.timeout = 0;
    options.bNoDataTO = 0;

    /* Within the loop argc counts the entries left starting at argv[0],
     * so an option may only consume argv[1] when argc >= 2. */
    while (argc > 0 && **argv=='-')
    {
	switch(argv[0][1])
	{
	    /* options to control URL selection */
	case 'a':
	    options.bAsk = 1;
	    break;
	case 'f':
	    if (argc < 2)
		return webcrawl_missing_arg (*argv);
	    if (options.nAlwaysFollow == MAXFOLLOW)
	    {
		fprintf(stderr, "Too many -f options on command line\n");
		return 1;
	    }
	    options.pszAlwaysFollow[options.nAlwaysFollow++] = *(++argv);
	    argc--;
	    break;
	case 'd':
	    if (argc < 2)
		return webcrawl_missing_arg (*argv);
	    if (options.nNeverFollow == MAXFOLLOW)
	    {
		fprintf(stderr, "Too many -d options on command line\n");
		return 1;
	    }
	    options.pszNeverFollow[options.nNeverFollow++] = *(++argv);
	    argc--;
	    break;
	case 'u':
	    if (argc < 2)
		return webcrawl_missing_arg (*argv);
	    options.fURLLog = fopen(*(++argv), "w");
	    if (!options.fURLLog)
	    {
		fprintf(stderr, "Couldn't open '%s'\n", *argv);
		return 1;
	    }
	    argc--;
	    break;
	case 'x':
	    options.bFollowNone = 1;
	    break;
	case 'X':
	    options.bImageOverride = 0;
	    break;

	    /* options to control page rewriting */
	case 'n':
	    options.bRewrite = 0;
	    break;
	case 'r':
	    if (argv[0][2] != 'a' && argv[0][2] != 'l' && argv[0][2] != 'f')
	    {
		fprintf(stderr, "unrecognised -rx mode: %c\n", argv[0][2]);
		return 1;
	    }
	    options.cRewriteMode = argv[0][2];
	    break;
	case 'k':
	    options.bRename = 0;
	    break;
	case 'q':
	    options.bQueryAddPid = 0;
	    break;

	    /* recursion limiting options */
	case 'l':
	    if (argv[0][2] != 'r' && argv[0][2] != 'l' && argv[0][2] != 0)
	    {
		printf("Unrecognised option: %s\n", *argv);
		return 1;
	    }
	    if (argc < 2)
		return webcrawl_missing_arg (*argv);
	    depth = atoi(argv[1]);
	    /* -lr sets the remote limit, -ll the local one, -l both */
	    if (argv[0][2] != 'l')
		options.nRemote = depth;
	    if (argv[0][2] != 'r')
		options.nLocal = depth;
	    argv++, argc--;
	    break;

	    /* general options */
	case 'v':
	    options.bVerbose++;
	    break;
	case 'o':
	    if (argc < 2)
		return webcrawl_missing_arg (*argv);
	    pszServerRoot = *(++argv);
	    argc--;
	    break;
	case 'p':
	    if (argc < 2)
		return webcrawl_missing_arg (*argv);
	    pszURLPrefix = *(++argv);
	    argc--;
	    break;

	    /* HTTP-related options */
	case 'A':
	    if (argc < 2)
		return webcrawl_missing_arg (*argv);
	    options.userAgent = *(++argv);
	    argc--;
	    break;
	case 't':
	    if (argc < 2)
		return webcrawl_missing_arg (*argv);
	    options.timeout = atoi(*(++argv));
	    argc--;
	    break;
	case 'T':
	    options.bNoDataTO = 1;
	    break;
	default:
	    printf("Unrecognised option: %s\n", *argv);
	    return 1;
	}
	argc--;
	argv++;
    }

    /* The options may have used up the arguments counted by the first
     * check; the web address and destination directory must both remain. */
    if (argc < 2)
    {
	webcrawl_usage ();
	return 1;
    }

    pszURL = argv [0], pszDestDir = argv [1];
    options.pszOutputDir = argv[1];
    /* NOTE(review): szHost and szFile are fixed 128-byte buffers and no
     * size is passed to SplitURL -- confirm it bounds its writes. */
    if (SplitURL (pszURL, szHost, szFile, &port) )
    {
	printf ("Invalid URL : %s\n", pszURL);
	return 1;
    }
    rename_init();
    return process_page (szHost, port, szFile, pszDestDir, 1, options.nLocal);
}

/* Fetch one page and, if it is HTML, recursively fetch what it links to.
 *
 *   host, port, file - location of the page to fetch
 *   destdir          - destination directory, passed through to GetPage()
 *                      and rewrite()
 *   overwrite        - passed through to GetPage(); recursive calls pass 0
 *   max_recurs       - remaining depth; 0 stops immediately, a negative
 *                      value is never decremented (no limit)
 *
 * Links to a different host are followed with options.nRemote as their
 * depth limit, links on the same host with the decremented max_recurs.
 * The results of the recursive calls are ignored.
 *
 * Returns 0 on success or when the depth limit was reached, 1 when the
 * page could not be fetched or its references could not be extracted.
 */
int process_page (char * host, int port, char * file, char * destdir,
		  int overwrite, int max_recurs)
{
    FILE     * fp;
    xreflist xr;
    char     newhost[128], newfile[128];
    int      newport;
    int	     i;

    if (max_recurs == 0) {
	printf("Maximum recursion level reached.\n");
	return 0;
    }
    if (max_recurs > 0) max_recurs --; /* figure to pass on ! */

    if (! (fp = GetPage(host, port, file, destdir, overwrite)))
	return 1;

    /* printf("Content-type: %s\n", lastcontenttype); */
    if (!strcmp(lastcontenttype, "text/html"))
    {
	/* rewind before scanning the page for references */
	fseek(fp, 0, SEEK_SET);
	if (getxref(fp, &xr)) {
	    fclose(fp);
	    return 1;
	}
	fclose(fp);

	for (i = 0; i < xr.nrefs; i++)
	{
	    if (relative_url(xr.refs[i], host, port, file, 
			     newhost, &newport, newfile, xr.alwaysget[i]))
	    {
		fprintf(stderr, "not following link to: %s\n",
			xr.refs[i]);
	    }
	    else
	    {
		if (strcmp(newhost, host))
		    process_page(newhost, newport, newfile, destdir, 0,
				 options.nRemote);
		else
		    process_page(newhost, newport, newfile, destdir, 0,
				 max_recurs);
	    }
	}
	if (options.bRewrite) rewrite(destdir, host, port, file, &xr);

	/* Release the reference strings only once rewrite() has been given
	 * the list; freeing them inside the loop above would hand it
	 * dangling pointers (assumes rewrite() reads xr.refs). */
	for (i = 0; i < xr.nrefs; i++)
	    free(xr.refs[i]);
    }
    else
	fclose(fp);
    return 0;
}

/* Fetch pszFile from pszHost:nPort by delegating to Download().
 *
 * All arguments are forwarded unchanged; the fifth Download() argument is
 * always 0 (its meaning is defined by Download, not visible here).
 * Returns whatever Download() returns; process_page() treats a null result
 * as failure.
 */
FILE * GetPage (char * pszHost, int nPort, char * pszFile, char * pszDest, 
	     int bOverwrite)
{
    FILE * fpPage;

    fpPage = Download (pszHost, nPort, pszFile, pszDest, 0, bOverwrite);
    return fpPage;
}





