#!/usr/bin/perl -w
########################################################################
#
# master - controls a number of slaves to spider gopherspace
# Copyright (C) 2004 Timothy Jon Fraser tfraser@alum.wpi.edu
#
# $Id: master,v 1.3 2004/06/03 15:01:46 tim Exp $
#
# This file is part of gspider.
#
# gspider is free software; you can redistribute it and/or modify it
# under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.  
# 
# gspider is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
# License for more details.  
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
# 02111-1307, USA.
#
#
# USAGE:  master [-v] -s <filename for initial state and dumps>
#                     -n <max number of slaves>
#                     -o <filename for statistical output>
#                     -e <filename for error messages>
#
########################################################################

use strict;
use vars qw($opt_v $opt_s $opt_n $opt_o $opt_e);
use Getopt::Std;

#
# handle command line arguments
#
my $usage = "USAGE: master [-v] " .
            "-s <filename for initial state and dumps>" .
            "-n <max number of slaves> " .
            "-o <filename for statistical output> " .
            "-e <filename for error messages>";
$opt_v = 0;                # verbose output off by default
$opt_s = "gspider.state";  # default filename for initial state and dumps
$opt_n = 1;                # default max number of slaves
$opt_o = "gspider.out";    # our results wind up here by default
$opt_e = "gspider.log";    # error logs wind up here by default
getopts('vs:n:o:e:') || die $usage;

my $checkme  = "chme"; # %state entry needs to be assigned for checking
my $assigned = "asnd"; # %state entry has been assigned to a slave for checking
my $checked  = "chkd"; # %state entry has been checked by a slave
my %state;             # maps servers -> {$checkme,$assigned,$checked}
my %slaves;            # maps slave pids -> servers
my $dumpflag = 0;      # causes sigchld handler to dump state when 1
my $newserversprefix = "/tmp/gspiderns.";  # full name: /tmp/gspiderns.<pid>
my $numslaves = 0;     # number of slaves currently running

if($opt_v) { print "gspider master started w/pid $$ " . localtime() . "\n"; }

# read state from state file into %state
loadstate();

$SIG{CHLD} = \&handle_sigchld;
$SIG{HUP}  = \&handle_sighup;

# start one slave.  Once slave starts, only the sigchld handler can touch
# the variables we're fooling with here.
launchslave(each %state);

# sleep forever, signal handlers will do the work from now on.
while(1) { sleep; }
exit -1; # never reached.


# handle_sigchld()
#
# This subroutine handles SIGCHLD signals from terminating slaves.  It
# acts as a "monitor" - it serializes access to %state, %slaves and
# $numslaves.  It collects information from terminating slaves and
# starts new ones to replace them, for as long as there are new sites
# to spider in %state.  When we run out of sites, this subroutine
# terminates the master program.
#
# This subroutine will call statedump() if $dumpflag is set.  This
# routine is the only one that can do the dump, because we're using
# this subroutine as a monitor to ensure serialized access to the
# %state hash.
#
# This subroutine will work only on platforms where, once a process
# gets a SIGCHLD, further SIGCHLDs are deferred until handle_sigchld()
# has run to completion.
#
# This subroutine will work only on platforms where you don't have
# to reinstall your signal handlers after handling a signal.
#

sub handle_sigchld {

    # reap terminated slave
    reap_slave();

    # if the user requested a dump (via sighup), do dump now
    if($dumpflag) {
	if($opt_v) { print "dumping state at " . localtime() . "\n"; }
	$dumpflag = 0;
	statedump();
    }
    
    # start as many new slaves as possible
    start_slaves();

    # if 0 slaves are running, quit master.
    if($numslaves <= 0) {
	if($opt_v) {
	    print "gspider master completed at " . localtime() . "\n";
	}
	exit 0;  
    }

} # handle_sigchld


# reap_slave()
#
# This subroutine cleans up after a terminated slave.  It reads the
# file of (potentially) new servers left by the slave and merges it
# (skipping duplicates) with the %state hash.  Then it uses the
# %slaves hash to determine which server the slave just spidered, and
# marks that server $checked in the %state hash.
#

sub reap_slave {

    my $slavepid;        # pid of slave that just quit
    my $server;          # server that terminated slave just spidered
    my $servercount = 0; # counts possibly-not-new servers found by slave
    my $uniquecount = 0; # counts new servers found by slave

    if(-1 == ($slavepid = wait)) {
	die "wait on no children $!\n";
    }

    # slaves that found new servers will leave a $newservers file.
    # slaves that found dead sites, or found live sites with no
    # new servers will leave no file.
    my $newservers = $newserversprefix . $slavepid;
    if(-e $newservers) {
	open(NEWSERVERS, "$newservers") || die "Can't open $newservers: $!\n";
	my $nsline;
	foreach $nsline (<NEWSERVERS>) {
	    $servercount++;
	    chop $nsline;
	    if(exists $state{$nsline}) {
		next;
	    }
	    $state{$nsline} = $checkme;
	    $uniquecount++;
	}
	close NEWSERVERS;
	unlink $newservers || die "can't unlink $newservers: $!\n";
    }

    unless(exists $slaves{$slavepid}) { die "ASSERT: slave pid not found"; }
    $server = $slaves{$slavepid};
    delete $slaves{$slavepid};
    unless(exists $state{$server}) { die "ASSERT: server not found in state"; }
    unless($state{$server} eq $assigned) { die "ASSERT: unassigned server"; }
    $state{$server} = $checked;

    $numslaves--;

    if($opt_v) {
	print "slave $slavepid spidered $server, found $servercount servers" .
	      " ($uniquecount new) at " . localtime() . "\n";
    }

} # reap_slave()


# start_slaves()
#
# This subroutine starts as many slaves as possible, limited by the
# availability of unspidered sites in %states, and by the total limit
# on concurrent slaves set by $opt_s.
#

sub start_slaves {

    scalar keys %state;      # reset the each iterator
    my $targetsite;          # name of sites to spider taken from %state
    while(($numslaves <= $opt_n) && ($targetsite = each %state)) {
	if($state{$targetsite} eq $checkme) {
	    launchslave($targetsite);
	} # if site state is $checkme
    } # for all elements in %state hash at this time
    
} # start_slaves()


# handle_sighup
#
# sets $dumpflag, which will trigger a state dump as soon as the
# next slave terminates.
#

sub handle_sighup {
    if($opt_v) { print "dump requested at " . localtime() . "\n"; }
    $dumpflag = 1;
} # handle_sighup()


# loadstate()
#
# loads state from the state file named by $opt_s into %state
# may be used to restore from a checkpoint.
#

sub loadstate {

    open(STATEFILE, "$opt_s") || die "Can't open $opt_s: $!\n";
    my $line;
    foreach $line (<STATEFILE>) {
	my $server;
	my $status;
	($server, $status) = split / /, $line;
	chop $status;

	unless (($status eq $checkme) || ($status eq $checked)) {
	    die "ASSERT: bad status on loadstate\n";
	}
	$state{$server} = $status;
    }
    close STATEFILE;

} # loadstate()


# statedump()
#
# This subroutine dumps the contents of %state to the file named by
# $opt_s.  All instances of $assigned in %state are written out as
# $checkme in the file.
#
# You can use this routine to checkpoint the master's computation for
# a later restart.  Note that work done by slaves who have not yet
# terminated will be lost.
#

sub statedump {

    open(STATEFILE, ">$opt_s") || die "Can't open $opt_s: $!\n";
    scalar keys %state;      # reset the each iterator
    my $server;
    my $status;
    while(($server) = each %state) {
	$status = $state{$server};
	if($status eq $assigned) { $status = $checkme; }
	print STATEFILE "$server $status\n";
    }
    close STATEFILE;

} # statedump


# launchslave()
#
# this subroutine launches a new slave process to spider the site
# named by its first argument.
#

sub launchslave {
    my ($targetsite) = @_;   # name of site for slave to spider
    my $slavepid;            # new slave's pid

    # we're putting the following two assignments before the fork()
    # to avoid race conditions.  We're being optimistic assigning
    # these things now, but since the master dies on failure, it
    # won't matter if we've guessed wrong.
    $numslaves++;
    $state{$targetsite} = $assigned;

    if($slavepid = fork) {
	# parent.  There is a race in the following assignmentto $slaves,
	# if the slave terminates and the master catches its sigchld
	# before the assignment occurs, the master's sigchld handler
	# will expect the master has done this assignment done when,
        # in fact, it hasn't.  Haven't seen this one occur yet, probably
	# because the fork and assignment are adjacent in the master,
	# but the slave has to do considerable more processing to cause
	# trouble.
	$slaves{$slavepid} = $targetsite;
	if($opt_v) {
	    print "slave $slavepid assigned to $targetsite at " . 
		localtime() . "\n";
	}
	return;
    } elsif(defined $slavepid) {
	# child - new slave
	my ($host, $port);
	my $newservers = $newserversprefix . $$;
	($host, $port) = split /:/, $targetsite;

	exec "slave -h $host -p $port -r $newservers -s $opt_o -e $opt_e"
	    || die "slave $$ failed exec: $!\n";
	# not reached
    } else {
	die "failed to fork $!\n";
    }

} # launchslave()

    
    
