#!/usr/bin/perl -w
########################################################################
#
# slave - spiders a single gopher site
# Copyright (C) 2004 Timothy Jon Fraser tfraser@alum.wpi.edu
#
# $Id: slave,v 1.2 2004/06/03 15:01:46 tim Exp $
#
# This file is part of gspider.
#
# gspider is free software; you can redistribute it and/or modify it
# under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.  
# 
# gspider is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
# License for more details.  
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
# 02111-1307, USA.
#
#
# USAGE:  slave [-v] -h <host> [-p <port>] -r <remote sites filename>
#               -s <filename for statistical output>
#               -e <filename for error messages>
#
# slave spiders through all the directories on one gopher site,
# specified by <host> and <port>.  It truncates the <remote sites filename>
# and outputs a list of all unique gopher servers cited by the
# gopher site slave is spidering.  It appends one line to
# <filename for statistical output> naming the gopher site, the
# time spidering completed, and some statistics on its contents.
# It also appends some error diagnostic messages to <filename for
# error messages>.  These messages are mainly aimed at listing
# examples where the gopher URL parsing algorithm fails and might
# be improved.
#
########################################################################

use strict;
use vars qw($opt_v $opt_h $opt_p $opt_r $opt_s $opt_e);
use Getopt::Std;
use gspider;

#
# handle command line arguments
#
my $usage = "USAGE: slave [-v] -h <host> [-p <port>] -r <remote sites filename> -s <filename for statistical output> -e <filename for error messages>";

# Defaults: verbose off, the standard gopher port, and empty strings for
# the mandatory options so their absence can be detected below.
$opt_v = 0;       # verbose output off by default
$opt_h = "";      # gopher server host, no default
$opt_p = "70";    # gopher server port, defaults to 70
$opt_r = "";      # hostport of remote servers must be written to this file
$opt_s = "";      # statistical output must be written to this file
$opt_e = "";      # error diagnostic messages must be written to this file
getopts('vh:p:r:s:e:') or die $usage;

# The host and all three output filenames are mandatory.
die $usage if $opt_h eq "";
$opt_h = lc $opt_h;  # hostnames are case-insensitive; normalize to lowercase
die $usage if $opt_r eq "";
die $usage if $opt_s eq "";
die $usage if $opt_e eq "";

open(ERRORFILE, ">>$opt_e") || die "Can't open $opt_e: $!";

# Spidering state.  These file-scoped lexicals are shared with
# handlepage(), which updates them as a side effect, so they must stay
# at file scope.
my $thedirectory;        # the directory to ask for on this iteration
my %uniquelocaldirs;     # selector -> $checkme/$checked spidering status
my %uniqueremoteservers; # tracks unique remote servers
my $checkme = "checkme"; # uniquelocaldirs entry needs to be spidered
my $checked = "checked"; # uniquelocaldirs entry has already been spidered
my $selectorcount = 0;   # count of selectors parsed on local site
my $statline;            # line printed to stat file
my $retval = 0;          # value returned by this program, 0=success, -1=not

# Start by trying to retrieve the root page of the site; a site whose
# root page is unreachable is considered dead.
$thedirectory = "/\n";
if(handlepage()) {

    # No response: record the site as dead and skip the spidering pass.
    # (BUGFIX: the original assigned "$statline = $statline = ..." here,
    # a harmless but confusing duplicated assignment; it also reached the
    # cleanup code via "goto out" -- this if/else replaces both.)
    $retval = -1;
    $statline = $opt_h . ":" . $opt_p . " " . localtime() . " dead\n";

} else {

    # The root page loaded, so spider the entire site: keep sweeping the
    # hash of known directories until one full sweep discovers no page
    # that still needs checking.  handlepage() inserts new keys while we
    # iterate with each(); the insertions may make each() skip or repeat
    # entries, but the outer sweep retries skips and the $checkme test
    # makes repeats harmless.
    my $newpageflag = 1; # quit only after running through hash w/no new pages
    while($newpageflag) {
	$newpageflag = 0;               # predict no new pages
	scalar keys %uniquelocaldirs;   # reset the each iterator
	while(($thedirectory) = each %uniquelocaldirs) {
	    if($uniquelocaldirs{$thedirectory} eq $checkme) {
		if(handlepage()) {
		    next;  # failed to load this page, try the next
		}
		$uniquelocaldirs{$thedirectory} = $checked;
		$newpageflag = 1;
	    }
	}
    }

    # If we found some remote servers, write them to $opt_r; if we found
    # none, don't bother writing the file.  NOTE(review): this assumes
    # each key carries its own trailing newline (as produced by
    # parsegopherpage) -- verify, otherwise the entries run together.
    if((scalar keys %uniquelocaldirs) && (scalar keys %uniqueremoteservers) > 0) {
	open(REMOTEFILE, ">$opt_r") || die "Can't open $opt_r: $!";
	print REMOTEFILE sort keys %uniqueremoteservers;
	close REMOTEFILE;
    }

    $statline = $opt_h . ":" . $opt_p . " " . localtime() .
	" $selectorcount selectors, " . (keys %uniqueremoteservers) .
	" unique remote servers.\n";

    $retval = 0;  # success!
}

# Common cleanup: report the one-line summary and append it to the
# statistics file.
if($opt_v) {
    print $statline;
}
open(STATFILE, ">>$opt_s") || die "Can't open $opt_s: $!";
print STATFILE $statline;
close STATFILE;

close ERRORFILE;

exit $retval;
 

# handlepage()
#
# return:  value   condition
#          -----   ---------
#            0     page processed successfully
#           -1     lynx failed to retrieve the page
#
# Fetches the single page named by the file-scoped $thedirectory from
# $opt_h:$opt_p, parses it, logs parse failures to ERRORFILE, and folds
# the results into the file-scoped %uniquelocaldirs,
# %uniqueremoteservers, and $selectorcount.  Deliberately communicates
# through side effects on those shared variables because, heck, this is
# Perl.
#

sub handlepage {

    # $thedirectory carries a trailing newline; strip it before building
    # the URL handed to lynx.
    my $selector = $thedirectory;
    chop $selector;

    # Let lynx speak the gopher protocol for us; an empty result or a
    # nonzero exit status both count as a failed fetch.
    my @page = `lynx -source gopher://$opt_h:$opt_p$selector 2> /dev/null`;
    if($? || !@page) {
	print ERRORFILE "Failed to fetch " . $opt_h . ":" . $opt_p .
	    $thedirectory;
	return -1;
    }

    # Parse the page.  parsegopherpage() fills the three arrays and
    # returns the number of selectors it saw; failures are logged.
    my $hostnameport = $opt_h . ":" . $opt_p;
    my @remoteservers;  # remote sites seen on this page - may be repeats
    my @localdirs;      # local dirs seen on this page - may be repeats
    my @failures;       # failures encountered on this page
    $selectorcount += parsegopherpage($hostnameport, \@page,
				      \@remoteservers, \@localdirs,
				      \@failures);

    if($opt_v) {
	my $count = scalar @failures;
	print "Parsed $selector with $count failures.\n";
    }
    print ERRORFILE @failures;

    # Fold newly seen local directories into the site-wide hash; unseen
    # ones are flagged $checkme so a later sweep spiders them.  The hash
    # guarantees each directory is stored only once.
    my $newcount = 0;
    foreach my $localdir (@localdirs) {
	unless(exists $uniquelocaldirs{$localdir}) {
	    $uniquelocaldirs{$localdir} = $checkme;
	    $newcount++;
	}
    }
    if($opt_v) {
	my $count = scalar @localdirs;
	print "Found $count local dirs, $newcount new.\n";
    }

    # Likewise for remote servers; here the hash is purely a uniqueness
    # filter, so the stored value is arbitrary.
    $newcount = 0;
    foreach my $remoteserver (@remoteservers) {
	unless(exists $uniqueremoteservers{$remoteserver}) {
	    $uniqueremoteservers{$remoteserver} = "1";  # arbitrary value
	    $newcount++;
	}
    }
    if($opt_v) {
	my $count = scalar @remoteservers;
	print "Found $count remote servers, $newcount new.\n";
    }

    return 0;
} # sub handlepage()
