########################################################################
#
# gspider.pm - gopher directory download and parsing routines
# Copyright (C) 2002, 2003, 2004 Timothy Jon Fraser tfraser@alum.wpi.edu
#
# $Id: gspider.pm,v 1.3 2004/06/03 15:01:46 tim Exp $
#
# This file is part of gspider.
#
# gspider is free software; you can redistribute it and/or modify it
# under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.  
# 
# gspider is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
# License for more details.  
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
# 02111-1307, USA.
#
########################################################################

package gspider;
use Exporter ();
@ISA         = qw(Exporter gspider);
@EXPORT      = qw(parsegopherurl parsegopherpage);
use strict;

# parsegopherurl
#
# in:     $line - a line containing a (possibly malformed) gopher
#                 URL in the format produced by LWP::UserAgent.
# out:    nothing
# return: a result hash, as described below
#
# This subroutine parses the gopher URL in $line.  Note that this
# must be in lynx -source format, not rfc1436 format.  If the
# line contains a well-formed URL, this subroutine parses it into
# components, returning them in a hash as described below.
#
# For example, given this well-formed lynx -source URL:
# <A HREF="gopher://gopher.host.com:70/1%2Fdirname"
#
# the result will be:
#
# "status"      => "success"
# "host"        => "gopher.host.com"
# "port"        => "70"
# "selector"    => "1%2Fdirname"
#
# Note that lynx -source outputs URLs on 3 lines, and this is only
# the first of the three.  However, we care only about the information
# in this first line, so we ignore the others.
#
# We also accept lines in the format:
# <A HREF="//gopher.vaboofer.com/3./links/emptyfile"
# This line is different from the one above: it does not specify a port.
# We will return "port" => "70" in this case.
#
# When given a non-well-formed URL, the result will be:
#
# "status"      => some descriptive string other than "0"
# "host"        => "error.host"
# "port"        => "1"
# "selector"    => "3/error"
#

sub parsegopherurl {
    my $line = shift(@_);  # the input line
    my $host;              # host component of $line
    my $port;              # port component of $line
    my $selector;          # selector component of $line
    my $token_index;       # index into $line used for parsing
    my %result = ( "status"      => "your error message here",
		   "host"        => "error.host",
		   "port"        => "1",
		   "selector"    => "3/error" );

    # line must begin with '<A HREF="//'
    unless($line =~ m/^<A HREF="\/\// ) {
        $result{"status"} = 'URL did not begin with <A HREF="//';
	return %result;
    }
    $line = substr $line, 11; # skip URL prefix

    # Everything from the beginning of $line to the next '/'
    # is either host or host and port.  If there is no '/',
    # then $line is malformed.
    $token_index = index $line, "\/";
    if($token_index == -1) {
        $result{"status"} = "hostport did not terminate with a \/";
	return %result;
    }
    $host = substr $line, 0, $token_index;
    $line = substr $line, $token_index; # terminating '/' is part of selector

    # now we must separate the $port from $host, if it has a $port.
    $token_index = index $host, ":";
    if($token_index == -1) {
         # $host did not contain a ":", so it has no port.  report "70".
         $port = "70";
    } else {
         # $host has a port, separate it out
         $port = substr $host, ($token_index + 1);
         $host = substr $host, 0, $token_index;
    }

    # if there are any illegal characters in $host, or if $host
    # is a zero-length string, we will skip this line.  Some
    # sites have malformed host entries:
    # oddjob.uchicago.edu has <A HREF="gopher:///r"> - BAD!
    if(length $host <= 0) {
        $result{"status"} = "zero-length hostname";
	return %result;
    }
    if($host  =~ m/[^0-9A-Za-z._-]/ ) {
        $result{"status"} = "illegal characters in hostname";
	return %result;
    }
    $host = lc $host;  # normalize to lowercase on hostnames

    # if there are any illegal characters in $port, or it
    # $port is zero-length, then $line is malformed.  Some
    # sites have malformed host entries -
    # gopher.iif.hu has a variety of bad (non-numeric) ports:
    # <A HREF="gopher://nntp%20ls%20alt.illuminati%20:gopher.iif.hu/...
    # <A HREF="gopher://We%20all%20macs.:nntp%20ls%20alt.religion...
    if(length $port <= 0) {
        $result{"status"} = "zero-length port";
	return %result;
    }
    if($port  =~ m/[^0-9]/ ) {
        $result{"status"} = "illegal characters in port";
	return %result;
    }

    # everything from the beginning of $line to the next "
    # is selector.  If there is no ", then $line is malformed.
    $token_index = index $line, "\"";
    if($token_index == -1) {
        $result{"status"} = "selector did not terminate with \"";
	return %result;
    }
    $selector = substr $line, 0, $token_index;
    $line = substr $line, ($token_index + 1);

    # form result array and return to caller
    $result{"status"} = "success";
    $result{"host"} = $host;
    $result{"port"} = $port;
    $result{"selector"} = $selector;

    return %result;

} # sub parsegopherurl()


# sub parsegopherpage()
#
# in:    $hostnameport - hostname and port of server that served thepage
#                        in the format "hostname:port".
#        @$r_thepage   - reference to an array of lines representing a
#                        gopher page.
# out:   @$r_remoteservers - reference to an array that will contain
#                        all of the remote servers in thepage.
#                        Remote servers will be identified by
#                        "hostname:port" strings.  Remote servers are
#                        servers that do not match $hostnameport.
#        @$r_localdirs - reference to an array that will contain
#                        all of the selectors for local directories in
#                        thepage.  Local directories are type 1 and 11
#                        selectors served by the server identified by
#                        $hostnameport.  They will be in the format:
#                        "type%2Fselector".
#        @$r_failures - reference to an array that will contain diagnostic
#                       error messages for each selector that failed to
#                       parse correctly.
#                       Format: "site : selector : error msg"
#
# return: number of selectors successfully parsed
#
# This subroutine parses the page described by @$r_thepage.  This
# page must be in the format produced by lynx -source not in
# rfc1436 format.
#
# note that the local dir and remote servers output may contain duplicates
# if the page parsed contained duplicates.
#

sub parsegopherpage {
    my ($hostnameport, $r_thepage, $r_remoteservers, $r_localdirs,
	$r_failures) = @_;
    my $rawline;       # unparsed line from @$r_thepage
    my $tagindex;      # index of "<A HREF" tag in $rawline
    my %fields;        # parsed fields from $rawline
    my $linehostport;  # hostname and port constructed from %fields
    my $selectorsparsed = 0;  # selectors successfully parsed

    # parse each line from @$r_thepage
    foreach $rawline (@$r_thepage) {

	# does $rawline contain an "<A HREF" tag? if so, trim off preceedig
	# text and pass the tag to parsegopherurl() for parsing.
	$tagindex = index $rawline, '<A HREF="//';
	if($tagindex == -1) {
	    next;  # no tag on this line, check next line
	}
	$rawline = substr $rawline, $tagindex;

	%fields = parsegopherurl($rawline);
	if($fields{"status"} ne "success") {
	    # tag is not parseable.  Put some diagnostic info into
	    # the failures array and move on to the next line.
	    chop $rawline;
	    $rawline = $hostnameport . " : " . $rawline . " : " .
		$fields{"status"} . "\n";
	    push @$r_failures, $rawline;
	    next;
	}
	# if we successfully parsed $rawline, increment count
	$selectorsparsed++;

	# We are interested in two kinds of lines:
	# (1) directory-type selectors on the local server $hostnameport
	# (2) any selector residing on a remote server
	$linehostport = $fields{"host"} . ":" . $fields{"port"};
	if($hostnameport eq $linehostport) {	

	    # This line is local.  We consider any selector that begins
	    # with "/1" to be a directory.  This'll catch the non-rfc1436
	    # type "11" directories, too.  Will it catch non-directories?
	    if($fields{"selector"} =~ m/^\/1/) {
		push @$r_localdirs, $fields{"selector"} . "\n";
	    } # if directory

	} else {

	    # This line references a remote gopher server.
	    push @$r_remoteservers, $linehostport . "\n";

	} # if local/remote

    } # for each line

    return $selectorsparsed;
} # sub parsegopherpage()
