// mhtml.cc -- the main program
//
// mhtml -- a program to mirror html pages recursively
// Copyright (C) 1996  Kevin M. Bealer
// 
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version
// 2 of the License, or (at your option) any later version.
// 
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
// 
// You should have received a copy of the GNU General Public
// License along with this program; if not, write to the Free
// Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
// USA.
// 
// You can send mail to the author at <kmb203@psu.edu> or:
// 
// Kevin M Bealer
// 94 Bowers Road
// Mertztown, PA 19539

// Types of URLs this program uses:
//
// remote - http://hostname/path/to/file/filename.extension
// clocal - /cache/dir/hostname/path/to/file/filename.extension.orig
// vlocal - /cache/dir/hostname/path/to/file/filename.extension
//
// remote is the filename used to fetch from the remote host.
// clocal is the cached copy; its contents should be identical to remote.
// vlocal is the clocal version, with paths rewritten for the cache.
// 

//#include<stl.h> -- in mhtml.h
#include<GetOpt.h>
#include<String.h>

#include "mhtml.h"
#include "basic.h"
#include "UrlConvert.h"
//#include "WebRef.h" -- in mhtml.h

// Global variables.  Aaack!  They are global because the alternative is
// to pass them along everywhere or to redefine bits of the parser so
// that it allows passing info to the parse-thing.

list<WebRef> * Urls;   // database of inlines/href's etc.; allocated in main()
String GlobalCurrPage; // html page being parsed (reset to "" in main())
String GlobalCacheDir; // cache directory (default set in main(), -c overrides)
int verbose;           // bitmask of V_TRANSFER / V_OTHER / V_DEBUG, set by -d

int main(int argc, char ** argv)
{
	String CacheExt = ".orig";
	GlobalCurrPage = "";
	GlobalCacheDir = "/var/spool/mhtml/";

	GetOpt getopt(argc, argv, "phc:l:s:d:");

	int hopcount = 0;
	int localhopcount = 0;
	verbose = 0;

	char ochar;
	
	while((ochar = getopt()) != EOF) {
		switch(ochar) {
			case 'h':
				cout << "\nmhtml version " << VERSION 
				<< ", Copyright (C) 1996 Kevin Bealer\n"
				<< "mhtml comes with ABSOLUTELY NO WARRANTY;\n"
				<< "This is free software; for more info, type mhtml -p.\n\n"
				<< "Syntax: mhtml [options] (page)\n"
				<< "------------------------------\n"
				<< "-h: this help text\n"
				<< "-p: license info\n"
				<< "-c (path): specify cache directory\n"
				<< "-l (num): number of links to follow\n"
				<< "     (default is " << hopcount << ")\n"
				<< "-s (num): number of links to follow\n"
				<< "     within same server.\n"
				<< "-d [to]*: Sets verbosity of output.\n"
				<< "   [t]: Show transfers.\n"
				<< "   [o]: Other information.\n"
				<< "   [d]: Debugging info.\n"
				<< endl;
				return(0);

			case 'p':
				ShowLicense();
				return(0);
			
			case 'c':
				GlobalCacheDir = getopt.optarg;
				break;
			
			case 'l':
				hopcount = atol(getopt.optarg);
				break;
			
			case 's':
				localhopcount = atoi(getopt.optarg);
				break;
			
			case 'd':
				if (getopt.optarg == 0) {
					cerr << "Need an argument for -d\n";
					return (1);
				}
				
				for(int i = 0; i < (int)strlen(getopt.optarg); i++)
				{ switch(getopt.optarg[i]) {
					case 't': verbose |= V_TRANSFER;
						break;
						
					case 'o': verbose |= V_OTHER;
						break;
					
					case 'd': verbose |= V_DEBUG;
						break;
					
					default: cerr << "Unknown option."
						<< getopt.optarg << endl;
						return(1);
					}
				}
				break;
				
			default:
				return(1);
		}
	}
	
	localhopcount = MAXIMUM(hopcount, localhopcount);

	if((argc - getopt.optind) != 1) {
		cerr << "mhtml needs exactly one argument." << endl;
		return(1);
	}
	
	String remotepage = argv[getopt.optind];

	remotepage = RemoteToRemote(remotepage);

	String localpage = RemoteToLocal(remotepage, GlobalCacheDir);
	if (localpage == 0) {
		cerr << "No local equivalent for " << remotepage << endl;
		return(1);
	}
	
	Urls = new list<WebRef>;

	FetchPage(remotepage, localpage + ".orig", verbose);
	Translate(localpage + ".orig", localpage, remotepage, verbose);

	// Make sure there are no identical references ...
	Urls->sort();
	Urls->unique();
	
	if(verbose & V_OTHER) {
		cout << "Selecting files to get ..." << endl;
	}

	// Tag each file (by extension) for different actions
	PickRetrieves(Urls);

	if(verbose & V_OTHER) {
		cout << "Getting files ...\n" << endl;
	}

	for(list<WebRef>::iterator i = Urls->begin(); i != Urls->end(); i++) {
		String rname = (*i).RemRef();
		String lname = RemoteToLocal((*i).RemRef(), GlobalCacheDir);
		if (lname == 0) // this reference is a non-useful
			continue; 	 // type (for example mailto:)
		int retrieve = (*i).GetRetrieve();
		int reftype = (*i).RefType();
		
		if(verbose & V_OTHER) {
			cout << "\nRemote: <" << rname << ">, type <" 
			     << reftype << ">" << endl;
		}
		if(verbose & V_TRANSFER) {
			cout << "Local : <" << lname << ">" << endl;
			cout << "Retrieve == " << retrieve << endl;
		}
		
		// Implement the policy decisions in PickRetrieves
		if(retrieve == PR_FETCH) {
			FetchPage(rname, lname, verbose);
		} else if (retrieve == PR_RECURSE) {
			RecurseMirror(rname, GlobalCacheDir, remotepage,
			              hopcount - 1, localhopcount - 1,
			              verbose);
		} // if zero nothing happens
	}

	return(0);
}

// Decide what should happen to every reference in TheList and record
// the decision on the reference itself with SetRetrieve():
//
//   inline images, backgrounds   -> PR_FETCH
//   hrefs ending jpeg/jpg/gif    -> PR_FETCH
//   hrefs ending html*           -> PR_RECURSE
//   every other href             -> PR_IGNORE
//
// A reference of any other type is reported on cerr and left as it is.
void PickRetrieves(list<WebRef> * TheList)
{
	list<WebRef>::iterator pos = TheList->begin();

	while(pos != TheList->end()) {
		WebRef & ref = *pos;
		int kind = ref.RefType();

		if(kind == WR_INLINE || kind == WR_BACKGR) {
			// Images are always wanted.
			ref.SetRetrieve(PR_FETCH);
		} else if(kind == WR_HREF) {
			// Reduce the remote name to what follows the last "/",
			// then to what follows the last "."; that should be
			// the extension.
			String ext = ref.RemRef();
			while(ext.contains("/"))
				ext = ext.after("/");
			while(ext.contains("."))
				ext = ext.after(".");
			ext.downcase();

			int action = PR_IGNORE;
			if(ext == "jpeg" || ext == "jpg" || ext == "gif")
				action = PR_FETCH;
			// A prefix match, because of .html, .html2, .html3 etc.
			if(ext.index("html") == 0)
				action = PR_RECURSE;

			ref.SetRetrieve(action);
		} else {
			// This is always a bug
			cerr << "Unknown type of reference!" << endl;
		}

		pos++;
	}
}

// Write the copyright and GPL notice (the text behind the -p option)
// to cout, ending every line with endl.
void ShowLicense()
{
	static const char * const notice[] = {
		" mhtml -- a program to mirror html pages recursively",
		" Copyright (C) 1996  Kevin M. Bealer",
		" ",
		" This program is free software; you can redistribute it and/or",
		" modify it under the terms of the GNU General Public License",
		" as published by the Free Software Foundation; either version",
		" 2 of the License, or (at your option) any later version.",
		" ",
		" This program is distributed in the hope that it will be useful,",
		" but WITHOUT ANY WARRANTY; without even the implied warranty of",
		" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the",
		" GNU General Public License for more details.",
		" ",
		" You should have received a copy of the GNU General Public",
		" License along with this program; if not, write to the Free",
		" Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,",
		" USA.",
		" ",
		" You can send mail to the author at <kmb203@psu.edu> or:",
		" ",
		" Kevin M Bealer",
		" 94 Bowers Road",
		" Mertztown, PA 19539"
	};
	const int count = sizeof(notice) / sizeof(notice[0]);

	for(int n = 0; n < count; n++)
		cout << notice[n] << endl;
}
