#!/usr/local/ls6/perl/bin/perl
#                              -*- Mode: Perl -*- 
# 
# SFproxy -- 
# ITIID           : $ITI$ $Header $__Header$
# Author          : Ulrich Pfeifer
# Created On      : Sat Jun 11 16:19:09 1994
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Fri Apr 21 15:53:48 1995
# Language        : Perl
# Update Count    : 185
# Status          : Unknown, Use with caution!
# 
# (C) Copyright 1995, Universitt Dortmund, all rights reserved.
# 
# HISTORY
# 
# 

# remove this blockfor perl4!

BEGIN {
    require Config; import Config;
    if ($Config{'extensions'} !~ /\bGDBM_File\b/) {
        use GDBM_File;
    }
}

# $|=1;                           # make stdout unbuffered
# arguments
$waisindex = '/usr/local/ls6/wais-1.3/bin/waisindex -nopairs -nocat -export';
# where should you indexes go per default
$defaultdir = '/home/robots/wais/wais-sources';
$defaultport = 2345;

# make perl -wc happy
# The rest should work without any modification

$opt_debug = $opt_d = $opt_help = $opt_h = $opt_help = $opt_database =
    $opt_dbm = $opt_proxy = $opt_noproxy = $opt_server = $opt_s =
    $opt_add = $opt_a = $opt_remove = $opt_r = $opt_recreate = 
    $opt_hotlist = $opt_l = $opt_logfile = $opt_index = $opt_list =
    $opt_dir = $opt_register = '';

require 'newgetopt.pl';
&NGetOpt('debug', 'd', 
         'help', 'h', 
         'database=s',
         'dbm=s',
         'proxy=s', 'noproxy', 
         'server:i', 's:i',
         'add=s', 'a=s',
         'remove=s', 'r=s',
         'recreate',
         'hotlist=s',
         'register=s',
         'l=s', 'logfile=s',
         'index',
         'list',
         'dir=s',
         ) || &usage;

&usage if $opt_h || $opt_help;

$database   = $opt_database || 'proxy-html';
$urldb      = $opt_dbm || $database;
$http_proxy = $opt_proxy || $ENV{'http_proxy'};
$http_proxy = '' if $opt_noproxy;
$debug      = $opt_debug || $opt_d;
$server     = $opt_server || $opt_s;
$proxyport  = $server || $defaultport;
$logfile    = $opt_logfile || $opt_l;
$addurl     = $opt_a || $opt_add;
$remove     = $opt_r || $opt_remove;
$waisindex  = $opt_index || $waisindex;
$waisdir    = $opt_dir || $defaultdir;

if ($waisdir) {
    chdir($waisdir) || die "Could not chdir to \"$waisdir\": $!\n";
}

$sockaddr      = 'S n a4 x8';
chop($thishost = `hostname`); 
chop($pwd      = `pwd`);
$thisaddr      = (gethostbyname($thishost))[4];
$thisproc      = pack($sockaddr, 2, 0, $thisaddr);
$next          = "chatsymbol000000"; # next one
$AF_INET       = 2;
$SOCK_STREAM   = 1;

# dispatch

if ($opt_list) {
    &do_list;
} elsif ($server) {
    &do_server($proxyport);          # never returns
} elsif ($remove) {
    &do_remove($remove);
} elsif ($addurl) {
    &do_addurl($addurl);
} elsif ($opt_register) {
    &do_register($opt_register);
} elsif ($opt_hotlist) {
    &usage,die "Could not open hotlist \"$opt_hotlist\": $!\n" 
        unless -r $opt_hotlist;
    &do_hotlist($opt_hotlist);
} elsif ($opt_recreate) {
    &do_recreate;
} else {
    &do_daemon(STDIN,STDOUT);
}
    
exit (0);

sub log {
    return unless $logfile;
    open(LOG, ">>$logfile") || die "Could not open logfile \"$logfile\": $!\n";
    printf(LOG @_);
    close(LOG);
}

sub do_addurl {
    local($url) = @_;

    $url =~ s/\#.*//;
    print STDERR "do_addurl: $url\n" if $debug;

    local($them, $port, $newurl) = &whom_to_ask($url);

    *UGET = '';                  # make perl -wc happy
    *UGET = &open_port($them, $port);
    print UGET "GET $newurl HTTP/1.0\r\n\r\n";
    &process_answer(UGET,$url);
}

sub do_remove {
    local($url) = @_;

    dbmopen(%URL, $urldb, 0644) || die "Could not open dbmfile: $!\n";
    for (keys %URL) {
        print STDERR "do_remove: URL{$_}\n" if /$url/;
        delete $URL{$_} if /$url/;
    }
    dbmclose(%URL);
}

sub do_register {
    local($url) = @_;

    dbmopen(%URL, $urldb, 0644) || die "Could not open dbmfile: $!\n";
    $URL{$url} = 1;
    dbmclose(%URL);
}

sub do_recreate {
    local(%URL,%CURL,$_);

    # We slurp the dbm array in memory, since without ndbm support
    # only one dbm database can be open at one time
    dbmopen(%URL, $urldb, 0644) || die "Could not open dbmfile: $!\n";
    %CURL = %URL;
    dbmclose(%URL);
    unlink "$urldb.dir";        # remove dbm database
    unlink "$urldb.pag";
    unlink "$urldb";            # gdb database
    unlink "$database.doc";     # remove wais database
    for (keys %CURL) {
        &do_addurl($_);
    }
}

sub do_list {
    local(%URL,$_);

    dbmopen(%URL, $urldb, 0644) || die "Could not open dbmfile: $!\n";
    for (keys %URL) {
        print "$_:\t$URL{$_}\n";
    }
    dbmclose(%URL);
}

sub do_hotlist {
    local($hfile) = @_;
    local($_);

    open(HF, "<$hfile") || die "Could not open hotlist file \"$hfile\": $!\n";
    while (<HF>) {
        next unless /^http:/;
        chop;
        s/[ ?\#].*//;
        &do_addurl($_);
    }
    close(HF);
}

sub do_server {
    local($port) = @_;
               
    local($name, $aliases, $proto, $this, $sockaddr, $con, $addr, 
          @child);

    ($name, $aliases, $proto) = getprotobyname('tcp');
    if ($port !~ /^\d+$/) {
        ($name, $aliases, $port) = getservbyport($port, 'tcp');
    }

    $this = pack($sockaddr, $AF_INET, $port, "\0\0\0\0");

    select(NS); $| = 1; select(STDOUT);

    socket(SM, $AF_INET, $SOCK_STREAM, $proto) || die "socket: $!";
    bind(SM,$this) || die "bind: $!";
    listen(SM,5) || die "connect: $!";

    select(SM); $| = 1; select(STDOUT);

    $con = 0;
    print "Listening for connection 1....\n";
    for(;;) {
        ($addr = accept(NS,SM)) || die $!;

        $con++;
        if (($child[$con] = fork()) == 0) {
            if ($debug) {
                local($af,$port,$inetaddr,@inetaddr);
                ($af,$port,$inetaddr) = unpack($sockaddr,$addr);
                @inetaddr = unpack('C4',$inetaddr);
                &log("$con: $af $port @inetaddr\n");
            };
            &do_daemon(NS,NS);
        }
        close(NS);
        &log("Listening for connection %d\n",$con+1);
    }
}

sub whom_to_ask {
    local($url) = @_;
    local($host, $port, $newurl, $dummy);

    ($host, $dummy, $port, $newurl) =
        ($url =~ m@http://([^/:?]+)(:(\d+))?(/\S*)@);

    if ($http_proxy) {
        ($dummy, $host, $dummy, $port) = 
            ($http_proxy =~ m@^(http://)?([^/:?]+)(:(\d+))?(/\S*)?$@);
        $newurl = $url;
    }
    $port = 80 unless $port;
    &log("whom_to_ask: url=%s http_proxy=%s host=%s", $url, $http_proxy, $host)
        if $debug;
    print STDERR "whom_to_ask($url) -> ($host, $port, $url)\n" if $debug;
    ($host, $port, $newurl);
}
        

sub do_daemon {
    local($in,$out) = @_;
    local($them, $port, $get, $url,$newurl);
    
    &log("accept ok\n");

    $get = <$in>;               # get the first line;
    ($method,$url) = ($get =~ /^(GET|POST) (\S+)/i);
    die "do_daemon: Unknown request: $get\n" unless $url;
    ($them, $port, $newurl) = &whom_to_ask($url);
    *P = '';                    # make perl -wc happy
    *P = &open_port($them, $port);
    &log("Connected to %s:%d to get %s\n", $them, $port, $url);
    $get =~ s/$url/$newurl/;
    print P $get;

    while (<$in>) {
        print P $_;
        if (/^\r?$/) {        # end of request reached
            last if $method =~ /GET/;
            $_ = <$in>;         # POST (one line ???)
            #&log("get+ %s\n", $_);
            print P $_;
            last;
        }
    }
    
    # now we can get the answer from P and should send it to $out;
    &process_answer(P, $url, $out);
}

sub process_answer {
    local($in,$url,$out) = @_;

    local($length, $type, $body, $oldhttp, $_, $success);

    # get the answer
    $_ = <$in>; print $out $_ if $out;
    if (m:^HTTP/1.0:) {
        while (<$in>) {
            print $out $_ if $out;
            $length = $1 if (/Content-Length: (\d+)/i);
            $type   = $1 if (/Content-Type: ([\w\/]+)/i);
            $success = 1 if (/200 Document follows/i);
            last if /^\r?$/;        # end of header
        }
    } else {
        $oldhttp=1;
        $type   = "text/html" if $oldhttp && /<(HTML|TITLE)>/i;
        $body = $_;
    }

    while (<$in>) {
        $type   = "text/html" if $oldhttp && /<(HTML|TITLE)>/i;
        $body .= $_;
    }
    print $out $body if $out;

    close($in);
    close($out) if $out;
    &log("type= %s\n", $type) if $debug;
    exit if $body =~ /<TITLE>\s*(Error|Redirection)/i;
    exit if $body =~ /404\s+Not\s+Found/i;
    exit if $body =~ /403\s+Sorry/i;
    exit if $body =~ /403\s+Forbidden/i;
    exit if $body =~ /500\s+Server\s+Error/i;
    if ($type eq "text/html") {
        close(STDOUT);
        close(STDERR) unless $debug;
        close(STDIN);
        if ($debug || fork() == 0) { # child
            &index_if_necessary($url,$body);
        }
        exit;
    }
    if ($type eq "text/plain") {
        close(STDOUT);
        close(STDERR) unless $debug;
        close(STDIN);
        if ($debug || fork() == 0) { # child
            &index_if_necessary($url,$body."<TITLE></TITLE>");
        }
        exit;
    }
    # ok we'r done, exit now.
}

sub get_index_lock {
    local($me, $locked);

    $me = getlogin || (getpwuid($>))[0] || $ENV{USER};

    open(LOCK, ">$database.LOCK$$") || die "Cant't create lock file: $!\n";
    print LOCK "$me $thishost\n";
    close LOCK;

    for(;;) {
        $locked = link("$database.LOCK$$", "$database.LOCK");

        if (!$locked && -M "$database.LOCK" > 1/48 ) {
            &log("Overriding lock that was more than 1/2 hour old...\n");
            unlink "$database.LOCK";
            $locked = link("$database.LOCK$$", "$database.LOCK");
        }
        last if $locked;
        sleep(10);
    }
    $locked;
}

sub release_index_lock {
    unlink "$database.LOCK$$";
    unlink "$database.LOCK";
}


sub index_if_necessary {
    local($url,$body) = @_;
    local(%URL);

    return if $url =~ /\?/;     # don't index search results
    $url =~ s/#.*//;            # no indexing for parts of documents
    $url =~ s/\s.*//;           # make shure, that no spaces in $url

    &get_index_lock;
    dbmopen(%URL, $urldb, 0644) || die "Could not open dbmfile: $!\n";
    if ($URL{$url}) {
        &log("index_if_necessary: seen %s %d\n", $url, $URL{$url});
        $URL{$url}++;
    } else {
        &log("index_if_necessary: indexing %s\n", $url);
        unless (&index($url,$body)) {
            $URL{$url}=1;
            &log("index_if_necessary: indexing successfull\n");
        }
    }
    dbmclose(%URL);
    &release_index_lock;
}
        
sub newtitle {
    local($_) = @_;
    s/<(\w+).*<\/\1>//g;
    &log("Title: $_\n");
    s/^\s+//;
    "<title>$_</title>";
}

sub index {
    local($url,$body) = @_;
    local($tmpfile,$add, $_, $error);
    local($title);

    $tmpfile = "mist.$$";
    $body =~ s:<(/?)(title)>:<$1\L$2\E>:ig;
    $body =~ m:<title>(.*)</title>:;
    $title = &newtitle($1);
    $body =~ tr/A-Z/a-z/;
    $body =~ s:<title>(.*)</title>:$title:;
    $url =~ s:[\s\n]+::g;

    open(TMP, ">$tmpfile") || die "Could not write $tmpfile: $!\n";
    print TMP "$body\n\n";
    close(TMP);

    $add = '-a' if -e "$database.doc";
    &makeformat($database) unless -e "$database.fmt";
    if ($debug) {
        print STDERR "$waisindex $add -t URL $pwd/$tmpfile ".
            "$url -t fields -d $database $pwd/$tmpfile\n" if $debug;
    open(INDEX, "$waisindex $add -t URL $pwd/$tmpfile ".
         "$url -t fields -d $database $pwd/$tmpfile 2>&1|")
        || die "Could not start $waisindex\n";
    } else {
        open(INDEX, "$waisindex $add -t URL $pwd/$tmpfile ".
             "$url -t fields -d $database $pwd/$tmpfile >/dev/null 2>&1|")
            || die "Could not start $waisindex\n";
    }
    while (<INDEX>) {
        $error ++ if /error/i;
        print STDERR $_ if $debug;
    }
    close(INDEX);
    unlink($tmpfile);
    $error || $?;
}

sub open_port {                 ## from chat2
	local($server, $port) = @_;

	local($serveraddr,$serverproc);

	*S = ++$next;
	if ($server =~ /^(\d+)+\.(\d+)\.(\d+)\.(\d+)$/) {
		$serveraddr = pack('C4', $1, $2, $3, $4);
	} else {
		local(@x) = gethostbyname($server);
		return undef unless @x;
		$serveraddr = $x[4];
	}
	$serverproc = pack($sockaddr, 2, $port, $serveraddr);
	unless (socket(S, 2, 1, 6)) {
		# XXX hardwired $AF_SOCKET, $SOCK_STREAM, 'tcp'
		# but who the heck would change these anyway? (:-)
		($!) = ($!, close(S)); # close S while saving $!
		return undef;
	}
	unless (bind(S, $thisproc)) {
		($!) = ($!, close(S)); # close S while saving $!
		return undef;
	}
	unless (connect(S, $serverproc)) {
		($!) = ($!, close(S)); # close S while saving $!
		return undef;
	}
	select((select(S), $| = 1)[0]);
	$next; # return symbol for switcharound
}

sub makeformat {
    local($database) = @_;

    print STDERR "Creating $database.fmt!\n";
    open(FMT, ">$database.fmt");
    print FMT <<'EOFMT'
<record-end> //

<layout>
<headline> /<title>/ /<\/title>/ 
        80 /<title>./
<end>

<field> /<title>/
ti "Title" stemming TEXT BOTH
<end> /<\/title>/

<field> /<h[0-9]>/
hl "Headline" stemming TEXT BOTH
<end> /<\/h[0-9]/

<field> /<dt>/
dt "Description Term" stemming TEXT BOTH
<end> /<d[dt]>/

<field> /<address>/
ad "Description Term" stemming TEXT BOTH
<end> /<\/address>/

<field> /<b>/
stemming TEXT GLOBAL
<end> /<\/b>/

<field> /<i>/
stemming TEXT GLOBAL
<end> /<\/i>/

EOFMT
    ;
    close(FMT);
}

#<field> /^./
#stemming TEXT GLOBAL
#<end> /$/

sub usage {
    while (<DATA>) {
        print;
    }
    exit;
}
__END__

SFproxy(1)               USER COMMANDS                 SFproxy(1)

NAME
     SFproxy - a freeWAIS-sf http proxy

SYNOPSIS
     SFproxy [ options ] 

DESCRIPTION

  OPTIONS
    -d, -debug

    -h, -help

       Print this help message.

    -database <name>

       Set the name of the wais database. Defaults to 'proxy-html';

    -dbm <name>

       Set the name of the dbm database, used to store the indexed URLs.
       Defaults to the name of the wais database.

    -noproxy
    -proxy <[http://]hostname[:port]>

       Use hostname, port as a proxy to get the URLs. Port defaults to
       80.  Per default, the environement variable 'http_poxy' is used
       to to specify the proxy to use. No proxy is used if neither
       this variable is set nor the -proxy option is given.

       If -noproxy is specified SFproxy directely connects to the server.

    -server [port]
    -s [port]
       Run as server, spawning processes to hadle clients. Use port if
       given. Port defaults to 2345.

    -a <url>
    -add <url>
       Get <url> and add it to the database.

    -r <url>
    -remove <url>

       Remove the url from the dbm database. Next recreate will no
       index this <url>.

       
    -recreate
       Refetch all URLs in the dbm database and index them.

    -hotlist <filename> 

       Fetch all 'http:*' urls in the Mosaic hotlist file specified
       and add them to the database.

    -list

       List all URLs in the dbm database and exit.

    -daemon 
       Assume beeing called by inetd. Get requests from STDIN, send
       answers to STDOUT. This is the default.

    -l <filename>
    -logfile <filename>
       Write logging messages to <filename>. Default is to run silent.

    -index <progname>
       Use <progname> instead of 'waisindex' as index program.

    -dir <dirname>
       Change to directory given, before doing anything.

ENVIRONEMENT

    http_poxy
       is used to to specify the proxy to use.

AUTHOR
  Ulrich Pfeifer <pfeifer@ls6.informatik.uni-dortmund.de>
