#!/usr/bin/perl -w

###########################################################################
# urlmon - URL monitor
# checks and keeps track of lists of last modification dates of URLS

# urlmon is copyright (C) 1997, 1998 Jeremy Impson <jdimpson@acm.org>

# type 'urlmon -h' for usage

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

# ver 1.0 -- here we go...
# ver 2.0 -- added a comments field to the rc file, and made the rc file
#		depend on the file name (so that multiple rc files can
#		exist for any one user), and made printing not rewrite
#		the rc file. 
#	     Fixed the derivation of who '$me' is
# ver 2.1 -- Use HEAD instead of GET for initial connections, assuming that
#		there will be a last_modified date.  If there isn't, then use
#		GET.  When using the rc file, if there is already a checksum, 
#		then immediately use GET.
#	     Allow user settable timeout length.
#	     Allow user settable proxy server.
#	     Allow fork()ing to monitor URLs in parallel (number of 
#		processes settable by user)
# ver 2.2 -- Bug fix print_urls() returning false so when '-p' given 
#		the program didn't exit.  Also, fixed uninitialized value in 
#		is_cksum when monitoring a URL not yet in the database.  
#            Added $ua->env_proxy(); line to actually use the proxy server set
#		in the proper environmental variables.
#		All these bugs were caught and diagnosed by Michael
#		Wiedmann <mw@miwie.in-berlin.de>
#	     Wrote companion script, nscape2urlmon to migrate netscape
#		bookmark file to urlmonrc file format.  Will also take the 
#		union of a bookmark file and the urlmonrc file
# 	     Modularized code that reads in urlmonrc file for ease of
#		inserting it into nscape2urlmon
#	     Allow use of username/password for www-basic-authentication
#		Thanks to Robin Houston <robin@oneworld.org> for help
# ver 2.3 -- Bug fixed that printed "Errors:" even when there were none.
#	     Added ability to write out file to current directory in the 
#		event that the URL has changed. 
#		Thanks to Eric Raymond <esr@thyrsus.com> for suggestion.
#	     Added different return codes based on what happened, useful when
#		running in 'curt' mode (-c).  The return codes are:
#		 0 for no new URLs, modified URLs, or errors
#		 1 for modified URLs detected
#		 2 for new URLs added to database 
#		 3 for modified URLs detected, new URLs added
#		 4 for errors
#		 5 for modified URLs and errors
#		 6 for new URLs and errors
#		 7 for all three
#		Thanks to Eric Raymond <esr@thyrsus.com> for suggestion.
#	     Added rudimentary file locking so urlmonrc file doesn't get
#		corrupted 
#	     getting SIGPIPE errors from some FTP servers.
#		If you get this error, let me know what kind of FTP
#		software it was connecting to.
#	     Fixed array to hash conversion in 'read_in_state'
# ver 3.0 -- Made filtering of file content possible, so that certain lines
#		can be filtered out before a checksum is computed.
#	     Unfortunately, this forces the urlmonrc file to have a new (more
#		flexible) format.
#	     'db_url' got rewritten while implementing filters, and 'new_url'
#		got rolled into it.  The result is much cleaner.
#	     -c option now prints errors to STDERR, so that scripts can capture
#		the data and do something intelligent with it.
# ver 4.0 -- Added DISPLAY variable (thanks to Bill Dyess
#		<bill@dyess.com> for the patch) to allow user to set what
#		string is output when a give URL has changed.
#	     Experimental SysV IPC support, to avoid having to use /tmp
#		files, something I thought was ugly as well as a potential
#		security problem.  Requires IPC::SysV module (and SysV IPC
#		support on your system).
#	     Fixed usage of escape and unescape when passing data from
#		children to parent.  Duh.
#	     Added debug() routine so that debugging statements can be
#		controlled more granularly.  Goodbye '$opt_d and print ...' 
#		statements.
#	     Modified print_urls ('-p') to print more of info. kept in
#		urlmonrc file
#	     urlmon can now take URLs from standard input.  Duh.  
#		As a result, the comment handling code was changed, so
#		keep an eye on it.
#	     Fixed bug in the logic that decides what filter to use (if 
#		none was provided, or the current one is inappropriate).  
#		Right now, '-s' will force timestamps into checksums, but 
#		will not force anything else (e.g. regexp) to checksums
#		(because they already are checksums, of sorts).
#	     The %urlmonfilter::filtercode array can be added to by defining a 
#		library file in the urlmonrc file with the CODE= tag.
#		Note that  the CODE tag's location in the urlmonrc file only
#		specifies the order in which one code file will be read (first
#		come, first 'require'd) but they are all read in before any
#		URLs are processed.
#	     Reworked the splitline() code to handle blank entries in the 
#		urlmonrc file (i.e. COMM=  is allowed).  Keep an eye on it!
#	     Thanks to Robert Richard George 'reptile' Wal <reptile@reptile.eu.org>,
#		some bugs in the filtercode 'regexp' have been fixed.  
#		'reptile' also added a method to view urlmon's progress using
#		'ps -aux | grep urlmon' on the command line.  Pretty cool!
#	     Thanks to Douglas N. Arnold <dna@psu.edu> for pointing out that
#		http://www.foo.bar/baz and http://www.foo.bar/baz/ might be
#		two different URLs.  (Fixed in 'sub normalize'.)
#	     Realized that 'pararegexp' filtercode is incorrect.  It
#		didn't do anything wrong, but did something silly.  Now,
#		it simply applies its FARG argument (a Perl pattern
#		match) against the entire contents of the URL, which is 
#		what it used to do.  It also used to set the $/ variable
#		to '', but that was completely irrelevant.
#	     'regexp'  and 'pararegexp' filtercode now take multiple "s///"
#		arguments, and will apply them one at a time to the 
#		content. 

# This doesn't solve the sigpipe problem.  Now, instead of dying, it 
# loops forever...
$SIG{'PIPE'} = 'IGNORE';

# who am i?
$version = "4.0b";
$me      = $0;
if($me =~ m#([^/]+)$#) {	# strip any leading path; keep just the basename
	$me = $1;
}
$0 = "$me: parent";	# set status (visible in 'ps' output)
#
#

#$opt_d = 6;

#
require LWP::UserAgent;
#use LWP::Debug qw(+ );	#DEBUG
use HTTP::Status;
use Getopt::Std;
use MD5;
use IPC::SysV  # consider conditionally 'require'ing this, instead
	qw(IPC_PRIVATE IPC_RMID IPC_NOWAIT IPC_STAT S_IRWXU S_IRWXG S_IRWXO);
use IPC::Msg;
require 'ctime.pl';
#

#
# $sysv selects the IPC mechanism for parent/child communication:
# non-zero = SysV message queues, zero = temp files under /tmp.
$sysv = 1;
$sysvmax = 4096; # bytes.   This value MUST be 32 or greater.
# This assumes that a Perl character == a byte.
# See comment in write_out_state
# What is a good value here??  If too small, children have to send
# multiple messages.  If too big, wasted memory.
#

#
# per-user state file name (e.g. ".urlmonrc"), derived from our basename
$last_modified_database = ".${me}rc";
#

# handle options
#
getopts('Cdlhrpcsof:P:t:F:'); # See 'usage()' for explanation...
$opt_h = $opt_h; # make '-w' shut up...
$opt_r = $opt_r;
$opt_P = $opt_P;
$opt_p = $opt_p;
$opt_l = $opt_l;
#

$opt_d = 0 unless $opt_d;	# debug level, off by default
$opt_F = 0 unless $opt_F;	# worker process count, fixed up below

debug("debug level: $opt_d\n");

# 
# validate numeric options; exit status 4 corresponds to the "errors" bit
do { warn "Timeout must be greater than 0\n"; exit 4; } if (defined($opt_t) and $opt_t <= 0);
do { warn "Expecting numerical argument to '-F', not \"$opt_F\"\n"; exit 4; }
  if (defined $opt_F and $opt_F !~ /^\d+$/);
$opt_F = 1 unless $opt_F;
($opt_d = 0) if $opt_c; # shut up debugs if curt output is wanted
$opt_h and usage() and exit 0;
$opt_s and debug("Forcing checksums for all URLs\n");
$opt_P and debug("Proxy server is $opt_P\n");
$opt_d and ($| = 1);	# unbuffer STDOUT while debugging
$opt_p and do { $read_only = 1; print_urls(); exit 0; };
$opt_C and $opt_r = 0; # too confusing to mix comments and removals
#

# set up error codes:
#
# The exit status is built as a bit vector: bit 2 = errors, bit 1 = new
# URLs, bit 0 = modified URLs (see the 0-7 table in the ver 2.3 notes).
$errorbit = 2;
$newbit   = 1;
$modbit   = 0;
#
$retval   = "0"; # must be a string, or weirdness ensues (vec() works on strings)
#
# now, if there is an error, do this:
#	vec($retval,$errorbit,1) = 1;
# a new URL, do this:
#	vec($retval,$newbit,1)   = 1;
# a modified URL, do this:
#	vec($retval,$modbit,1)   = 1; 
#

# deal with last_modified data (urlmonrc) file business
init_mods(); # sets %mods and %comm

#
# get the targets.  This function _must_ be called after getopts (to set
# $opt_l) and after init_mods (to set %mods)!!  Get rid of nulls, because
# we use them internally for special purposes
@targets = map { s/\0//g; $_; } targets();
debug("targets: @targets\n");
@targets or usage() and exit 0;
#


#
$opt_r and do { remove_urls(@targets);  exit 0; };
#

# set up user agent -- I must say, this is a really cool abstraction!

package MyAgent;
# this package is needed to force the useragent to use my $username, $passwd
# values.

# inherit from LWP::UserAgent
push @ISA, qw( LWP::UserAgent );

# override normal authentication, as called by 'request'
# (LWP asks this subclass for credentials instead of prompting)
sub get_basic_credentials {
	($username, $passwd);	# These are global to the MyAgent
				# namespace, and are set in main::normalize
}

package main;    

# Build the user agent, decide how many children to fork, farm the
# targets out to them, then collect each child's results (via temp files
# or the SysV message queue), rewrite the rc file, and report.
$ua = new MyAgent;
$ua->agent("$me/$version");
$ua->env_proxy();   # use proxy servers as defined in env. variables
$ua->proxy(['http', 'ftp', 'gopher'], $opt_P) if $opt_P;
$ua->timeout($opt_t) if $opt_t;
$md5 = new MD5;	# single global digest object, reused by gen_cksum()
$i = 0;

$fork_file = "/tmp/urlmon.tmp";	# per-child state file prefix (non-SysV mode)
$savedir   = "."; # current dir, for now
$wait_max  = 1; # for (poor) file locking

$numtargets  = $#targets + 1;
$numchildren = $opt_F;
if ($numtargets < $numchildren) {
	# avoid overkill, we don't need more procs than URLs
	# (Also, it is usually not a win to have one proc per URL, as
	# forking and execing takes time, so the $numchildren should be
	# much less than $numtargets.  However, let the user do what the
	# user wants, assuming it is possible.)
	$numchildren = $numtargets;
}
$targetsperchild = $numtargets / $numchildren;

debug ("targets: $numtargets; children: $numchildren\n");

if ($numchildren <= 1) {
  # why go to so much trouble??
  debug("Not forking, need only one process\n");
  monitor(@targets);
} else {
  if ($sysv) {
    # create SysV IPC message queue
    debug("Creating SysV IPC queue\n");
    $msq = new IPC::Msg(IPC_PRIVATE, S_IRWXU | S_IRWXG | S_IRWXO)
      or die "Cannot create shared memory queue $!\n";
    $msgtype = 1;	# what is this for??
  } # END if sysv

  debug("Forking\n");
  for ($cnt = 0; $cnt < $numchildren; $cnt++) {

    my($offset) = $cnt * $targetsperchild;
    #
    # perl rules!  It takes care of rounding, and it appears that in these
    # slices all of @targets will get used, regardless the ratio b/w 
    # $numtargets and $numchildren, which won't always be an integer.
    #
    # To see it, run this for a few different values of '$nc':
    # perl -e '$nc = 3; @t = qw(1 2 3 4 5 6 7 8 9 10 11); $nt = $#t + 1; $tpc = $nt / $nc; for($cnt = 0; $cnt < $nc; $cnt++) { $offset = $cnt * $tpc; @o = @t[$offset .. $offset + $tpc - 1]; print("$offset\t@o\n"); }'
    #
    my(@use) = @targets[$offset .. $offset + $targetsperchild - 1];

    if(! defined ($fork = fork)) { # fork() error
      warn "fork() broke on set $cnt (too many processes?)\nThe following URLs have NOT been checked:\n";
      foreach (@use) {
        print STDERR "$_\n";
      }
    } elsif ($fork) { # parent
      ; # fall through to the 'wait()'
    } else { # child
      debug("temporary file: $fork_file.$$\n");
      monitor(@use);
      write_out_state($$); # to file or to message queue
      debug("Child $$ exiting with $retval\n");
      exit $retval;
    }
  } # END for $cnt

  # wait
  # reap each child and merge its findings into our global hashes
  while (($pid = wait) != -1) {
    debug("Reading in child $pid\n");
    read_in_state($pid);  # from file or from message queue
  }

  if ($sysv and ref($msq) =~ /IPC/) {
    # At this point, all children are dead, so no new messages can be
    # added to queue.  Go ahead and make sure the message queue is empty.
    my $num;

    while(($num = $msq->stat()->qnum()) > 0) {
      debug(2, "There are $num messages in the SysV queue\n");
      read_in_state(0); # from message queue
      if ($num <= $msq->stat()->qnum()) {
        warn "SysV IPC receive not working! Some data lost...\n";
        last;
      }
    }
  } # END if $sysv

} # END if $numchildren <= 1


debug("\n");

# remove message queue.  An error here doesn't hurt urlmon, but indicates
# something is fubared with the system.
if ($sysv and ref($msq) =~ /IPC/) {
	debug("Removing SysV IPC message queue\n");
	$msq->remove or warn "Can't remove SysV IPC queue: $!\n";
}

# write out changes in last_modified file
write_mods_file();

# print out our findings
print_results();

debug("exiting with $retval\n");
# end of main()
exit $retval;

###############
# subroutines #
###############

sub init_mods { # sanity checks on urlmonrc and set %mods and %comm
	# Locates (or creates) the urlmonrc database file, then parses it.
	# A file whose first line is '# <version>' uses the tagged format
	# (URL= MOD= COMM= FILTER= FARGS= DISPLAY= CODE= fields); anything
	# else is assumed to be the old whitespace-separated format.
	# Fills the globals %mods, %comm, %filter, %fargs, %display, @code,
	# and may set $read_only when the file isn't writeable.

	$mods_file_home = $ENV{'HOME'};
	if($opt_f) {
		$mods_file = $opt_f;
	} else {
		$mods_file = "$mods_file_home/$last_modified_database";
	}
	$lockfile = "${mods_file}.LCK";
	debug("last_modified database is \"$mods_file\"\n");
	if( ! -r $mods_file) {
		die "Can't read \"$mods_file\"\n" if -e $mods_file;
		die "Can't create \"$mods_file\"\n" unless -w $mods_file_home;
		open(FOO,">$mods_file") or 
		  die "Couldn't create \"$mods_file\": $!\n";
		close FOO or 
		  warn "Something funky with \"$mods_file\" creation: $!\n";
	}
	if(! -w $mods_file) {
		warn "\"$mods_file\" not writeable.  Utilizing it read-only.\n";
		$read_only = 1;
	}
	# open, read, and parse the last_modified database
	open(MODS,$mods_file) or 
	  die "Can't open \"$mods_file\" for reading: $!\n";
	@mods = <MODS>;

	local $^W = 0; # shut up if @mods is empty.

	# check urlmonrc versioning info.
	if ($mods[0] =~ /^#\s*(.*)/) {
		$filevers = $1; # this needs to be defined so we know how
		debug("urlmonrc version is $filevers, using new parse method\n");
		# to write the file back out

		my (@line, @fargs);
		my ($url, $mod, $comm, $filter, $display);
		foreach (@mods) {
			s/#.*$//;
			next unless /.+/;
			@line = splitline($_);
			foreach (@line) {

				# CODE= names a filter-code library; load it
				# into the urlmonfilter namespace right away
				/CODE=(.*)/ and do {
					my $file = $1;
					unshift @INC, ".", $ENV{'HOME'};
					if (! ( -e $file or 
						($file !~ m#^/# and 
						 grep ( -e "$_/$file", @INC)))) {

						warn "Could not find $file to use!\n";
					} else {
						package urlmonfilter;
						require $file;
						package main;
					}
					shift @INC; shift @INC;
					push @code, $file;
					goto LINEEND;	# you didn't see this...
				};

				/URL=(.*)/ and do {
					$url = $1;
					next;
				};

				/MOD=(.*)/ and do {
					$mod = $1;
					next;
				};

				/COMM=(.*)/ and do {
					$comm = $1;
					next;
				};

				/FILTER=(.*)/ and do {
					$filter = $1;
					next;
				};

				/FARGS=(.*)/ and do {
					@fargs = split /[,\s]+/, $1;
					next;
				};

 				/DISPLAY=(.*)/ and do {
 					$display = $1;
 					next;
 				};
			} # END foreach line

			# set the data for this URL
			if (! defined $url) {
				warn "the line \"@line\" of \"$mods_file\" contains no URL (target) to monitor!!\n";
			} elsif (! defined $mod) {
				warn "the line \"@line\" of \"$mods_file\" contains no modification information!!\n";
			} else {
				$mods{$url}    = $mod;
				$comm{$url}    = $comm if $comm;
				$filter{$url}  = $filter if $filter;
				$display{$url} = $display?$display:$url;
				if ($filter and @fargs) {
					$fargs{$url} = [];
					@{$fargs{$url}} = @fargs;
				}
			}
		    LINEEND:
			# reset the per-line accumulators for the next entry
			$url = $mod = $comm = $filter = $display = '';
			@fargs = ();
		} # END foreach mods
	} else { # no version in urlmonrc file, use old method.
		debug("no urlmonrc version, using old parse method\n");
		foreach (@mods) {
			s/#.*$//;
			next unless /.+/;
			($url,$data,@comment) = split(/\s+/,$_);
			$mods{$url} = $data;
			$comm{$url} = (@comment ? join ' ', @comment : '');
		}
	}
	@mods = ();
	close MODS or warn "Can't close \"$mods_file\": $!\n";
} # END init_mods

sub splitline { # break one urlmonrc line into its TAG=value fields
	my ($record) = @_;
	my @fields;

	# A field is an upper-case tag, '=', then either a quoted string
	# or a run of characters with no quotes or whitespace.  The value
	# may be empty, so blank entries like 'COMM=' still come through.
	my $field_re = '[A-Z]+=(".*?"|([^"\s]*\S)*)';

	while ($record =~ /($field_re)/g) {
		push @fields, $1;
	}

	return @fields;
}


# XXX: figure out if %filter variable should be preserved in this manner. 
#      I fear so, because db_url may change it if certain conditions
#      occur.  However, I think %fargs does not, which is good, because
#      that could have gotten ugly (it is a hash of arrays).  When
#      threaded perl becomes available, we can get rid of this garbage.
#      (Will threaded perl be portable across all systems??)

# XXX: should DISPLAY values be preserved in this manner?  I suspect
#      that it isn't necessary.  It depends on what method other than
#      manually editing the urlmonrc file, if any, is employed.


sub write_out_state {	# write out to a file the contents of 
			# %errors, %new, %mods, %comm
  # Child-side half of the parent/child state hand-off.  Serializes each
  # hash as 'TAG:field||field||...' (escape() protects literal '||' in a
  # field).  In file mode the five lines go to "$fork_file.$filend"; in
  # SysV mode each line becomes one message, and lines longer than
  # $sysvmax are fragmented, each fragment bookended with ||pid|| markers
  # so read_in_state() can reassemble per-child.
  my($filend) = @_;
  my($errors, $new, $mods, $comm, $filter, $changed);

  # we only want to write out for those urls we have actually checked up
  # however, these arrays (actually %mods and %comm) are set before any
  # forks are done, and are set with _ALL_ of the urls from the urlmonrc
  # file, so we need to get rid of them.  If we didn't do this, then the 
  # urlmonrc file would reflect the state of the last child to be read in,
  # which would cause the loss of any data gathered by the other children.
  #
  foreach(keys %mods) {
    unless ($seen{$_}) { # %seen gets defined by each child proc for only those URLs the child saw
      delete $mods{$_};
      delete $comm{$_}   if defined $comm{$_};
      delete $new{$_}    if defined $new{$_};
      delete $errors{$_} if defined $errors{$_};
    }
  }

  $errors  = join '||', map { escape($_) } %errors;
  $new     = join '||', map { escape($_) } %new;
  $mods    = join '||', map { escape($_) } %mods;
  $comm    = join '||', map { escape($_) } %comm;
  $changed = join '||', map { escape($_) } @changed;

  if (! $sysv) {

    open(FILE, ">$fork_file.$filend")
      or die "Can't write \"$fork_file.$filend\": $!\n";
    print FILE "ERRORS:$errors\n";
    print FILE "NEW:$new\n";
    print FILE "MODS:$mods\n";
    print FILE "COMM:$comm\n";
    print FILE "CHANGED:$changed\n";
    close FILE;

  } else {	# ACK, this is ugly!

    my ($pid) = $$;
    $pid .= ''; # force string interpretation    
    debug(2, "length of $pid: ", length($pid), "\n");
    my ($msg, $msg2, $length);

    foreach $msg ("ERRORS:$errors", "NEW:$new", "MODS:$mods",
	"COMM:$comm", "CHANGED:$changed") {

      if (length($msg) > $sysvmax) {
        debug(2, "Message too big, breaking down\n");
        # This assumes that a perl character == a byte on all platforms,
        # in all situations. Does anyone have a portable way to make sure
        # a Perl string isn't longer than a certain size in bytes?

	# Break the string down into parts.  Send the first part bookended
        # by this processes pid.  Send the rest until we get to the end.
        # At the end, only send the pid at the beginning.

        $length = $sysvmax - 2 * length("||$pid||");
        $regexp = "." x $length;
        while (length($msg) > $sysvmax - 2 * length("||$pid||")) {
          if(length("||${pid}||||${pid}||") >= $sysvmax) {
            die "\$sysvmax size ($sysvmax) too small!!  (Should be >= 32)\n";
          }
          # break down message
          $msg  =~ /($regexp)(.*)/;
          $msg2 = "||${pid}||$1||${pid}||";
          $msg  = $2;
          debug(2, "\tSending \"$msg2\"\n");
          debug(2, "\tRemains \"$msg\"\n");
          $msq->snd($msgtype, $msg2, 0)
            or die "can't send message: $msg2\n";
        }
        $msg = "||${pid}||${msg}"; # send end of multi-part message
      } # END if size too big
      debug(2, "Sending \"$msg\"\n");
      $msq->snd($msgtype, $msg, 0)
        or die "can't send message: $msg\n";
    }

  }

} # END write_out_state

sub read_in_state { # restore all the state hashes from child files
  # Parent-side half of the state hand-off.  In file mode, reads and
  # unlinks the per-child temp file.  In SysV mode, drains messages from
  # the queue: fragments bookended with ||pid|| (see write_out_state)
  # are buffered in @buffer, indexed by the child's pid, until the final
  # fragment (leading ||pid|| only) arrives, then the whole line is
  # handed to inparse().  $loop caps the iterations as a safety net.
  my($filend) = @_;
  my(@tmp) = ();
  my($tmp);

  if (!$sysv) {

    open(FILE, "$fork_file.$filend")
      or die "Can't read \"$fork_file.$filend\": $!\n";
    foreach (<FILE>) {
      inparse($_);  
    }
    close FILE;
    unlink("$fork_file.$filend") 
      or warn "Can't unlink \"$fork_file.$filend\": $!\n";

  } else {
    my ($msg, $num);
    my $loop = 5 * $numchildren; # 5 because we read 5 messages/child

    while(($num = $msq->stat()->qnum()) > 0) {
      $msg = '';
      debug(2, "Queued messages: $num\n");
      $msq->rcv($msg, $sysvmax, 0, IPC_NOWAIT) or
        warn "can't receive a message\n";
      debug(2, "Read \"$msg\"\n");

      if ($msg =~ /^\|\|(\d+)\|\|(.*)\|\|\1\|\|$/) {
        debug(2, "\tBuffering it in $1\n");
        $buffer[$1] .= $2;
        next; # stop here and get next message
      } elsif ($msg =~ /^\|\|(\d+)\|\|(.*)$/) {
        # reconstruct it and send it on to be parsed
        $msg = $buffer[$1] . $2;
        debug(2, "\tGot all of $1: \"$msg\"\n");
        $buffer[$1] = '';
      } 

      inparse($msg);
      $loop--;
      if ($loop < 1) { return; }
    }

  } # END if !sysv
} # END read_in_state

sub inparse { # sets %errors, %new, %mods, etc, as necessary
  # Parses one "TAG:data" line of child state (from a temp file or the
  # SysV queue) and merges it into the parent's global hashes/arrays.
  # Fields in "data" are joined with '||'; any literal '||' INSIDE a
  # field was turned into a NUL by escape() on the child side.  So the
  # order matters here: split on '||' FIRST, then unescape() each field.
  # (Bug fix: the NEW/MODS/COMM/CHANGED branches used to unescape the
  # whole payload before splitting, which reintroduced '||' inside
  # fields and corrupted the split; the ERRORS branch had it right.)
  my @tmp = ();
  my $tmp = '';

  local ($_) = @_;

  # the line 'push @tmp, " " if ($#tmp & 1) == 0)' is there to make
  # sure that @tmp has an even number of elements, so it converts cleanly
  # to a hash.  The process of writing out state and then reading it back in
  # loses the last element of %errors, %mods, etc IFF that value was ''.

  /^ERRORS:(.*)/ and do {
    $tmp = $1;
    @tmp = %errors;
    push @tmp, map { unescape($_) } split(/\|\|/, $tmp);
    push @tmp, " " if (($#tmp & 1) == 0);
    %errors = @tmp;
    return;
  };

  /^NEW:(.*)/ and do { 
    $tmp = $1;	# raw payload; fields are unescaped after the split
    @tmp = %new;
    push @tmp, map { unescape($_) } split(/\|\|/, $tmp);
    push @tmp, " " if (($#tmp & 1) == 0);
    %new = @tmp;
    return;
  };

  /^MODS:(.*)/ and do { 
    $tmp = $1;	# raw payload; fields are unescaped after the split
    @tmp = %mods;
    push @tmp, map { unescape($_) } split(/\|\|/, $tmp);
    push @tmp, " "  if (($#tmp & 1) == 0);
    %mods = @tmp if @tmp;
    return;
  };

  /^COMM:(.*)/ and do { 
    $tmp = $1;	# raw payload; fields are unescaped after the split
    $tmp =~ s/\|\|$//; # is this line needed?  I forget :)
    @tmp = %comm;
    push @tmp, map { unescape($_) } split(/\|\|/, $tmp);
    push @tmp, "" if (($#tmp & 1) == 0); 
    %comm = @tmp;
    return;
  };

  /^CHANGED:(.*)/ and do {
    $tmp = $1;	# raw payload; fields are unescaped after the split
    push @changed, map { unescape($_) } split(/\|\|/, $tmp);
    return;
  };

} # END inparse

# These assume that nulls will never show up.  It this bad?  How can I do
# it better?
sub escape { # protect literal '||' so it survives the '||' field joiner
	my ($text) = @_;
	# NULs are safe stand-ins: they are stripped from all targets
	# before monitoring begins, so none can occur in real data.
	$text =~ s/\|\|/\0/g;
	return $text;
}

sub unescape { # reverse escape(): turn NULs back into literal '||'
	my ($text) = @_;
	$text =~ s/\0/\|\|/g;
	return $text;
}

sub monitor { # does the actual lookups.  Needs to be cleaned up...
  # For each target: normalize it, choose HEAD vs GET (GET whenever the
  # content itself is needed -- checksums, -o saving, or a content
  # filter), issue the request, and hand successful responses to
  # db_url().  Failures go into %errors.  Also maintains %seen (URLs this
  # process handled) and updates $0 so progress shows in 'ps'.
  my ($gad_total,$gad_counter) = ($#_ + 1,0);
  # for every html file to check...
  foreach (@_) {
    $gad_counter++;
    debug("Target is \"$_\"\n");
    $url = normalize($_); # normalize its name
    if(!defined($url)) {
      debug("URL \"$_\" not valid\n");
      $errors{$_} = "URL not valid";
      next;
    }

    # needed for preventing overlap when having more than one process running
    $seen{$url}++;

    if($opt_C) {
      $comm{$url} = shift @comm; # @comm was set by targets(), and is evenly mapped with @targets (or @_)
    }
    debug("Normalized: \"$url\"\n");

    ($new{$url} = 1) unless defined $mods{$url};
    # check to see if it is already in the list...
    debug("\"$url\" is new\n") if $new{$url};

    # generate request and store it.
    debug("Checking \"$url\"\n");
    # it would be nice to not have to know anything about the filter
    # at this point (except maybe that it exists).  We need to
    # know here in order to determine the most efficient way of
    # getting the data we need to detect changes.
    if (is_cksum($url) or $opt_s or $opt_o 
	or ($filter{$url} and $filter{$url} ne 'default' and 
			      $filter{$url} ne 'none'    and
			      $filter{$url} ne 'timestamp')) { 
    # use GET only if we need a checksum or are going to write out changed files
    # _OR_ if there is a filter but it isn't 'default' or 'none' 
      $0 = "$me: ($gad_counter/$gad_total) GET $url";
      $request{$url} = new HTTP::Request 'GET', $url;
      $reqtype{$url} = 'GET';
    } else { # use HEAD if it's new or has a timestamp
      $0 = "$me: ($gad_counter/$gad_total) HEAD $url";
      $request{$url} = new HTTP::Request 'HEAD', $url;
      $reqtype{$url} = 'HEAD';
    }

    debug("\tUsing $reqtype{$url} method\n");
    unless (defined $request{$url} and ref $request{$url}) {
      $errors{$url} = "Cannot generate request";
      debug("No request for \"$url\"\n");
      next;
    }

    # make connection with URL
    $response = $ua->request($request{$url});

    # failed request
    if ((! ref $response) or (! $response->is_success) ) {
      debug("Request for \"$url\" failed: response object is \"$response\"\n");
      $errors{$url} = "Request failed: " . status_message($response->code);
      delete($new{$url}) if defined($new{$url}); # don't include this in output
      next; # next value for $url
    }

    if (! defined($response->last_modified) and $reqtype{$url} =~ /HEAD/) {
      $0 = "$me: ($gad_counter/$gad_total) GET $url";
      # if there is no timestamp, and no content with which to 
      # generate a checksum, reconnect to get the content
      $request{$url} = new HTTP::Request 'GET', $url;
      debug("\tNo last_modified date, reconnecting using GET method\n");
      $response = $ua->request($request{$url});
      if ((! ref $response) or (! $response->is_success) ) {
        # if we made it here, then our first connection was successful, but
        # the second one was not.  Hmmm...
        debug("Subsequent request for \"$url\" failed\n");
        $errors{$url} = "Subsequent request failed: " . 
          status_message($response->code);
        delete($new{$url}) if defined($new{$url});
        next;
      }
    }

    db_url($url, $response); # deal with URL in database

  } # END foreach target url

} # END monitor


sub save { # save the contents of the file to a file in the directory $savedir
  # Called when a monitored URL changed and '-o' was given.  Returns 0 on
  # success (or when -o is off), 1 if fileOK() vetoes the filename.
  return 0 unless $opt_o; # make sure we weren't called without $opt_o being set

  my($url, $response) = @_;
  my($content) = $response->content;

  # Derive the local filename from the URL's last path component.
  # Bug fix: the old code used $1 unconditionally, which is stale (or
  # unset) when the URL ends in '/'; fall back to 'index.html' then.
  my($basename) = $url =~ m#([^/]+)$#;
  $basename = 'index.html' unless defined $basename and length $basename;
  my($filename) =  "$savedir/$basename";
  return 1 unless fileOK($filename);

  debug("Writing out \"$filename\"\n");

  open(OUT,"> $filename") or # shouldn't happen unless something wrong with fileOK
    die "Can't open \"$filename\" for writing (shouldn't happen): $!\n";

  print OUT $content;

  close OUT or 
    warn "Can't close \"$filename\" (how odd): $!\n";

  0;

} # END save

sub fileOK { # placeholder sanity check before writing a saved file
	# XXX: fix this to actually do something -- currently every
	# filename is approved unconditionally.
	my ($candidate) = @_;
	debug("checking file \"$candidate\"\n");
	return 1;
}

sub is_cksum { # true when the stored MOD entry for $url is a checksum
	my ($url) = @_;

	# Database entries beginning with 'C' are checksums; anything
	# else (including an unknown URL) is treated as a timestamp.
	my $entry = $mods{$url};
	$entry = '' unless $entry;

	return $entry =~ /^C/;
	#$filter{$url} ne 'timestamp';	# XXX: it would be nice to only
					#      use this line, but this
					#      function is sometimes
					#      called before the %filter
					#      hash is filled in
} # END is_cksum


# This prints out the results.  It also computes what the return
# value ($? in shell) should be.  This is kind of ugly, but it is also the
# best place for it, since it does all that logic anyway.
sub print_results {
  # Reports %new, @changed, and %errors.  Under -c (curt) only the bare
  # URLs are printed and errors go to STDERR; otherwise headed, tabbed
  # output with comments.  Sets the new/mod/error bits of $retval.

  # print out new URLs
  if (keys %new) {
    vec($retval,$newbit,1) = 1; 
    $opt_c or print "New Targets:\n";
    foreach(keys %new) {
      $opt_c or print "\t";
      print "$_";
      $opt_c or ($comm{$_} and print "\t$comm{$_}");
      print "\n";
    }
  }

  # print modified URLs
  if($#changed >= 0) { # there are modified targets
    vec($retval,$modbit,1) = 1;
    $opt_c or print "\nModified Targets:\n";
    foreach (@changed) {
      $opt_c or print "\t";
      print "$display{$_}";
      $opt_c or ($comm{$_} and print "\t$comm{$_}");
      print "\n";
    }
  } else {
    $opt_c or print "\nNo Targets Modified\n";
  } 
  $opt_c or print "\n";

  # print errors
  if (%errors) { # are there any errors?
    vec($retval,$errorbit,1) = 1;

    if ($opt_c) {
      foreach (keys %errors) {
	print STDERR "$_\t\t$errors{$_}\n";
      }
    } else {
      print "Errors:\n";
      foreach (keys %errors) {
        print "\t\"$_\"\t$errors{$_}\n";
      } # END foreach error
    } # END if !$opt_c
  } # END if errors occurred
} # END print_results


sub targets { # return list of urls to be monitored
  # Sources, in order: the urlmonrc database (-l), the command line
  # (url/comment pairs when -C), and finally standard input if nothing
  # else supplied any targets.  Side effects: fills %seen, and @comm
  # when -C is in effect.

  my(@argv);

  if ($opt_l) { # use the last_modified database for list of targets to check
    # Bug fix: this used to be 'if(!defined(%mods))', but defined() on a
    # hash is a fatal error in modern perls (>= 5.22) and was never the
    # right test; just ask whether the hash has any entries.
    if(!%mods) {
      if( -z $mods_file) {
        die "$me: \"$mods_file\" empty.  No URLs to monitor.\n";
      } else {
        die "Error: \"\%mods\" not defined (programming glitch, scream at author!)\n";
      }
    }
    # Yeesh, this is inefficient...
    grep ($seen{$_}++, @ARGV);
    foreach (keys %mods) {
      if (!$seen{$_}) {
        push @ARGV, $_;
	if ($opt_C) { # make comments come after url, like on command line
	  push @ARGV, (defined $comm{$_} ? $comm{$_} : '');
	  # comments on command line take precedence over ones in urlmonrc file
	}
      }
    }
  } 

  if($opt_C) { # assume command line has url/comment pairs
    @argv = ();
    while(@ARGV) {
      push(@argv, shift @ARGV);
      push(@comm, shift @ARGV);
    }
    @ARGV = @argv;
  }

  if (!@ARGV) { # XXX: should try to handle comments
    # read in standard input into the target array
    while (defined ($_ = <STDIN>)) {
      chomp;
      push @ARGV, $_;
    }
  }

  @ARGV;
} # END targets



sub usage { 
  # Print the help text and terminate via die.  The message interpolates
  # $me, $last_modified_database, and the -F default ($opt_F).
  die "Usage: $me [-Cdcso] [-P http://my.proxy.org] [-f urlmonrcfile]  [-t timeout] [-F procs] [ <URL> | -l | -h | -p | -r <URL> ]
  <URL> is one or more space-delimited URL's to monitor (or remove).
  -l use the URLs in last_modified database ($last_modified_database) as targets to check.  (Also uses URLs on command line.)
  -h print this stuff.
  -r remove the URLs from database.  Silent if URL is not in database.
  -s use only checksums to determine if URLs have been modified.
  -d run in debug mode.
  -c curt output.  Print nothing but the changed or new URLs.  Shuts off -d.  Prints errors on STDERR.
  -p print out contents of the last_modified database ($last_modified_database).
  -f <file>  use supplied file as last_modified database.
  -C assume that argument(s) will be url/comment pair(s).
  -P specifies a proxy server to use for ftp, gopher, and http connections.
  -t make 'timeout' the timeout length for all network connections. Defaults to 180 seconds.
  -F use 'procs' number of processes to monitor in parallel.  Defaults to $opt_F.
  -o save all new files to the current directoy.
  Notes: 
    For ftp and http connections, the following syntax is allowed for authenticated sessions:
      ftp://username:passwd\@host.domain.org/  
      http://username:passwd\@host.domain.org/
    The following environmental variables will be searched for proxy servers:
      http_proxy, gopher_proxy, wais_proxy, and no_proxy
";
} # END usage


sub normalize { # canonicalize a target name into a usable URL
  # Prefixes "http://" when no known scheme is present, and captures any
  # embedded username/password (http://user:pass@host/) into the MyAgent
  # package globals that get_basic_credentials() hands back to LWP.
  # Trailing slashes are deliberately left alone: http://www.foo.bar/baz
  # and http://www.foo.bar/baz/ may be two different URLs (thanks to
  # Douglas N. Arnold <dna@psu.edu> for pointing that out).
  my ($target) = @_;

  my $known_schemes = "(http|ftp|file)";

  # if there is no recognized scheme, assume http
  $target = "http://$target" unless $target =~ m#^$known_schemes+:#;

  # I could remove this if I bother to look at the new version of LWP,
  # which allows passwords in HTTP URLs, but I haven't gotten around to
  # it yet.
  if ($target =~ m#^http://(.+?):(.+?)@(.+$)#) { # embedded username/passwd
    $MyAgent::username = $1; # save for use in UserAgent::request()
    $MyAgent::passwd   = $2;
    debug(2, "Username: $MyAgent::username\nPasswd: $MyAgent::passwd\n");
  }

  return $target ? $target : undef;
} # END normalize 

sub gen_cksum { # compute a hex MD5 digest over the supplied content
	debug("\tGenerating checksum\n");

	# $md5 is a shared Digest::MD5 object owned by the main program;
	# reset it before reuse so earlier digests don't bleed in.
	$md5->reset();
	$md5->add(@_);
	my $digest = $md5->hexdigest();

	debug("\tChecksum is $digest\n");
	return $digest;
} # END gen_cksum


sub remove_urls { # removes URLs from the database
	my(@targets) = @_;

	# Normalize each target the same way URLs are stored, then drop the
	# key entirely.  Fix: the old code undef()ed the value, which left a
	# stale key in %mods that print_urls would still list; delete removes
	# the entry.  delete is silent on missing keys, preserving the
	# documented "silent if URL is not in database" behavior.
	foreach my $target (@targets) {
		my $url = normalize($target);
		delete $mods{$url};
	}

	write_mods_file();
	return 1;
}



sub write_mods_file {

	# Serialize the in-memory database (%mods plus the parallel %comm,
	# %filter, %fargs and %display hashes) back out to $mods_file, one
	# tab-separated "URL=...\tMOD=..." line per tracked URL.  Skipped
	# entirely when $read_only is set.  Takes the advisory lock around
	# the rewrite.
	# NOTE(review): bareword filehandle and 2-arg open; a 3-arg lexical
	# open would be safer against odd characters in $mods_file.
	my($foo,$i);
	return if $read_only;
	# rewrite the last_modified data file with current info

	lockfile();

	open(MODS,">$mods_file") or die "Can't write to \"$mods_file\": $!\n";
	debug(3, "\n");

	debug("writing out rc file\n\n");
	# First line records the file-format version; fall back to the
	# program's own $version for a freshly created file.
	$filevers = $version unless $filevers;
	print MODS "# $filevers\n";
	# CODE= lines are preserved verbatim from the original rc file.
	foreach $_ (@code) {
		print MODS "CODE=$_\n";
	}
	foreach $_ (keys %mods) {
		# Entries with an undef value were removed; don't write them.
		if(! defined $mods{$_}) {
			debug(3, "$_ has no modification  date/sum!\n");
			next;
		}
		debug(3, "URL=$_\tMOD=$mods{$_}");
		$foo = "URL=$_\tMOD=$mods{$_}";
		if ($comm{$_}) {
			if ($comm{$_} =~ /\s/ and $comm{$_} !~ /^".*"$/) {
			# has spaces but not surrounded by quotes
				$quote = "\"";
			} else {
				$quote = '';
			}
			debug(3, "\tCOMM=${quote}${comm{$_}}${quote}");
			$foo = "${foo}\tCOMM=${quote}${comm{$_}}${quote}";
		}

		# Optional per-URL content filter name.
		if ($filter{$_}) {
			debug(3, "\tFILTER=$filter{$_}");
			$foo = "${foo}\tFILTER=$filter{$_}";
		}

		# Optional filter arguments, stored as an array ref and
		# flattened to a comma-separated list.
		if ($fargs{$_} and (ref ($fargs{$_}) eq 'ARRAY')) {
			my $fargs = join ',', @{$fargs{$_}};
			debug(3, "\tFARGS=$fargs");
			$foo = "${foo}\tFARGS=$fargs";
		}

		# Only record DISPLAY when it actually differs from the URL.
		if (defined($display{$_}) && $display{$_} ne $_) {
			debug(3, "\tDISPLAY=$display{$_}");
			$foo = "${foo}\tDISPLAY=$display{$_}";
		}

		debug(3, "\n");
		$foo = "$foo\n";
		print MODS $foo;
	} # END foreach keys %mods

	debug(3, "\n");
	close MODS or warn "Can't close \"$mods_file\": $!\n";
	unlockfile();
} # END write_mods_file

sub unlockfile {
	# Release the advisory lock by removing the lock file.  Warn loudly
	# on failure: a stale lock file blocks every later run from writing.
	my $removed = unlink $lockfile;
	warn "WARNING: Cannot unlink \"$lockfile\".  Subsequent calls to \"$me\" will not be able to record URL modifications until it is removed!\n" unless $removed;
} # END unlockfile

sub lockfile {
	# Rudimentary advisory file locking: poll until any existing lock
	# file disappears (5-second sleeps, at most $wait_max tries, exit 4
	# on timeout), then create our own lock file.
	#
	# Fixes in this revision:
	#  - debug(print "...") printed the wait message unconditionally,
	#    ignoring the debug level; now it goes through debug() properly.
	#  - $i was a package global, clobbering the $i that db_url() uses
	#    to index @changed; it is now lexical.
	#  - lexical filehandle and 3-arg open instead of bareword LOCK.
	#  - "wierd" -> "weird" in the close warning.
	my $i = 0;
	debug("Looking for \"$lockfile\" as lock file\n");
	while ( -e $lockfile) {
		debug("\"$lockfile\" exists, waiting for its removal...\n");
		if ($i >= $wait_max) {
			warn "Waiting for \"$lockfile\" to be removed for too long, exiting...\n";
			exit 4;
		}
		sleep 5;
		$i++;
	}
	# Can you say, "race condition"?  I knew you could...
	# (The -e test and the open are not atomic; sysopen with O_EXCL
	# would close the window.)
	open(my $lock, '>', $lockfile) or do {
		warn "Can't create lock file \"$lockfile\": not writing out new modifications: $!\n";
		return;
	};
	close $lock or warn "Can't close \"$lockfile\" (weird): $!\n";
	#
} # END lockfile

sub print_urls { # dump the last_modified database to STDOUT

	# One line per tracked URL: modification time (or a placeholder for
	# checksum-only entries), the display name in quotes, the filter if
	# any, and the comment if any.  Always returns 0 so the caller can
	# exit after printing (see the ver 2.2 fix notes).
	foreach my $url (keys %mods) {
		my $stamp;
		if (is_cksum($url)) { # checksum entries carry no date
			$stamp = "(time not available)\t";
		} else {
			$stamp = ctime($mods{$url});
			chomp $stamp;
		}
		my $line = $stamp;
		$line .= "\t\"";
		$line .= $display{$url} ? $display{$url} : $url;
		$line .= "\"\t";
		$line .= $filter{$url} ? "filter: $filter{$url}\t" : "\t";
		$line .= $comm{$url} ? "$comm{$url}\n" : "\n";
		print $line;
	}
	return 0;
} # END print_urls

# logic to determine how to compute modifications and maintain database
sub db_url {
	my ($url, $response) = @_;
	my ($out);
	if ($new{$url}) {
		debug("\tNew, adding to mods file\n");
	} else {
		debug("\tFound in mods file\n");
	}

	# figure out which filter type we _REALLY_ need
	#
	# default to timestamp
	$filter{$url} = 'timestamp' if !$filter{$url} or
					$filter{$url} eq 'none' or
					$filter{$url} eq 'default'; 
	# have last_mod
	if ($response->last_modified) {
		if ($opt_s) { # force checksums anyway
			$filter{$url} = 'checksum';
		} elsif(!$filter{$url}) { # since we have last_mod data...
			$filter{$url} = 'timestamp';
		}
	} else {
		# no last_mod, or no (or incorrect) filter
		if (!$filter{$url} or $filter{$url} eq 'timestamp') {
			$filter{$url} = 'checksum' 
		} # XXX: put opt_s in if check to force standard checksum over all filter types.
	}
	#
	# if there is a filter defined (besides default) use it, regardless
	# of whether there is mod info (i.e. $response->last_modified)
	#
	debug("\tFilter type is \"$filter{$url}\"\n");
	#

	# apply the filter to see if there has been a change
	#

	if (dofilter($url, $response)) {
		debug("\t\turl has been changed\n");
		$changed[$i++] = $url unless $new{$url};
		save($url, $response) if $opt_o;
	} else {
		debug("\t\turl has not changed\n");
	}
	#

} # END db_url

sub debug {
	# Print a debugging message to STDOUT when the current debug level
	# ($opt_d) is at least the message's priority.  An optional leading
	# integer argument is the priority (default 1).  When several worker
	# processes are running (-F > 1), each line is prefixed with the pid
	# so interleaved output can be untangled.
	#
	# Fix: $prio and $string were package globals, silently clobbering
	# any same-named variables elsewhere in the program; they are now
	# lexical.
	my $prio;
	if ($_[0] =~ /^\d+$/) {
		$prio   = shift;
		$prio   += 0;
	} else {
		$prio   = 1;
	}
	return unless $opt_d >= $prio;

	my $string = join '', (($opt_F > 1 ? "$$: " : '' ), @_);

	print $string;

} # END debug

# code to compute modifications, modulo any filters

sub dofilter {
	# Apply the change-detection filter chosen for $url to $response.
	# Returns true when the URL has changed (and updates $mods{$url}
	# with the new timestamp/checksum as a side effect), false when it
	# has not.  By the time we get here $filter{$url} is never 'default'
	# or 'none' (db_url has already normalized it).
	my($url, $response) = @_;
	my($filter) = $filter{$url};
	my($chksum, $content);

	# check to see if we should use 'timestamp' filter
	# At this point, $filter will never be 'default' or 'none'
	if (!$opt_o and $filter eq 'timestamp') { 
		debug(5, "\tChecking timestamps\n" .
			"\t\tmods time: $mods{$url}\n" .	
			"\t\tresponse time: " . 
			$response->last_modified . "\n");

		# set this to 0 if is isn't defined and we are comparing it
		# to a timestamp.  If it is 0, it is probably new.
		$mods{$url} = 0 unless $mods{$url};

		if ($mods{$url} != $response->last_modified) {
			$mods{$url} = $response->last_modified;
			1;
		} else {
			0;
		}

	} elsif ($filter eq 'random') {

		# approximately 50/50 chance
		srand();
		(int(rand(1024)) & 1 ? 1 : 0);

	# } elsif ($filter eq '???') { # add other non-content-based 'filters' 
	# for determining when a URL changes here.

	} else { # all filters that work on the content of the URL file
		 # should fall through to here.  Make sure they are defined
		 # in the %urlmonfilter::filtercode array.

		debug("\tChecking checksums\n");

		# 'C' marks a checksum entry (vs. a numeric timestamp) in the
		# database; seed it so the comparison below is defined.
		$mods{$url} = 'C' unless $mods{$url};

		if (! defined $urlmonfilter::filtercode{$filter}) {
			warn "\n\nWARNING: Filter \"$filter\" for \"$url\" not defined!  Taking unfiltered checksum.\n\n";
			$filter = 'checksum';
		}

		######
		# apply the filter
		######

		# prepare any args to the filter
		# First arg is always the content; any FARGS from the rc file
		# are appended.
		my (@args) = ($response->content);
		if ($fargs{$url} and ref $fargs{$url} eq 'ARRAY') {
			debug("\tUsing \"", 
					 join (',', @{$fargs{$url}}), 
					 "\" as arguments to filter\n");
			push @args, @{$fargs{$url}};
		}

		# Dispatch through the filter table; the filter returns the
		# (possibly reduced) content to be checksummed.
		$content = 
			&{$urlmonfilter::filtercode{$filter}}(@args);

		debug(5, "\nprefilter\n\n", $response->content,
			"\n\npostfilter\n\n$content\n\n");

		# NOTE(review): the checksum is compared with a regex match
		# rather than 'eq'; fine for hex digests, but would misbehave
		# if a checksum ever contained regex metacharacters.
		$chksum = "C" . gen_cksum($content); 
		if ($mods{$url} !~ /^$chksum$/) {
			debug("\tOld was    $mods{$url}\n");

			$mods{$url} = $chksum;
			1;
		} else {
			0;
		}
	}	 
} # END dofilter

BEGIN {
package urlmonfilter;

# Dispatch table of content filters used by dofilter().  Each entry is
# a code ref taking the URL's content as its first argument (plus any
# FARGS from the rc file) and returning the content to be checksummed.
#
# XXX/security: the 'regexp' and 'pararegexp' filters string-eval
# expressions taken from the rc file -- arbitrary code execution by
# design.  Do not run urlmon against an untrusted rc file.
%filtercode = (
	# Identity filter: checksum the raw content unchanged.
	'checksum'	=>	sub { shift },

	# Always-empty content: effectively "never changes".
	'null'		=>	sub { '';},

	# Blank out lines $startline..$endline (0-based) before checksumming,
	# so changes confined to that range are ignored.
	'lines'		=>	sub { 
		my ($content, $startline, $endline) = @_;
		# Missing or inverted range: leave the content untouched.
		return $content	     unless (defined $startline and defined $endline);
		return $content      if $startline > $endline; 
		my (@content) = split(/\n/, $content);
		# NOTE(review): clamping to $#content - 1 means the final line
		# is never blanked even when $endline covers it -- looks like an
		# off-by-one; confirm intent before changing.
		$endline = ($#content - 1) if $endline   >= $#content;
		my ($i);
		
		# It might be better to 'delete' lines (and bump everything 
		# up) instead of making them empty, because when they get
		# joined back up they will be blank lines.  However, it is
		# not yet clear which behaviour is correct.  In fact, it
		# really doesn't matter, as long as we are consistent.
		for $i ($startline .. $endline) {
			$content[$i] = '';
		}
		join "\n", @content;
	}, # END of "lines"

	# Run each rc-file expression over every line of the content (the
	# expressions operate on $_), then checksum the transformed text.
	'regexp'	=>	sub {
		my ($content,@regexp) = @_;
		return $content unless @regexp;

		my ($regexp)  = join ";\n", @regexp;

		my (@content) = split(/\n/, $content);

		# The argument to this eval needs to be in a string to get the 
		# desired affects of interpolating $regexp now but @content
		# at 'eval' run time. 
		eval ("
foreach (\@content) {
	$regexp;
}
		");
		# The name for this filter 'regexp' is misleading, as the 
		# argument could be anything that can be applied to each line
		# of the URL's content (and that works on $_).

		# A broken expression falls back to the unfiltered content.
		if ($@) {
			warn $@;
			return $content;
		}

		join "\n", @content;
	}, # END of "regexp"

	# Like 'regexp', but each expression is applied to the whole
	# content at once so patterns can span multiple lines.
	'pararegexp'	=>	sub { # match multiple lines
		my ($content, @regexp) = @_;
		my ($regexp);
	
		return $content unless @regexp;

		# Build one "$content =~ ..." statement per expression, then
		# eval the lot against the full (un-split) content.
		my $foo =  '';
		foreach $regexp (@regexp) { 
			$foo = "${foo} \$content =~ $regexp;\n";
		}

		main::debug(5, $foo);

		eval $foo;
		warn $@ if $@;
	
		$content;
	}, # END of "pararegexp"
);
				

package main;
};

__END__
