#!/usr/local/bin/perl
#
# Acknowledgements
#
# Thanks to Guy Brooker (guy@jw.estec.esa.nl) for his AA interface,
# which was the starting point for this program.
#
# Paul Clark
# paul@cs.arizona.edu
#
# Michael Smith
# msmith@cs.arizona.edu
#
# Modifications
#
# 2/22/94	Version 1.0, shell script version	Paul Clark
# 4/21/94	Version 1.1, multiple archives support	Paul Clark
# 4/22/94	Version 1.2, perl script		Paul Clark
# 8/05/94	Version 1.3, verbosity&security		Paul Clark
#10/05/94	Version 1.4, more security, improved
#				output			Paul Clark
# 2/15/95   Version 2.0, made layout of archives distributed, 
#                        cleaned up interface, msmith

# **** **** **** ****    CONFIGURABLE VARIABLES     **** **** **** ****
# GlimpseHTTP home directory (not referenced elsewhere in this script) and
# the full path of the glimpse executable that is run for every query.
$GLIMPSEHTTP_HOME = '/usr2/local/glimpsehttp';
$GLIMPSE_LOC      = '/usr2/local/glimpse';

# Path to your scripts (URL component, used to build links to helper CGIs)
$CGIBIN = 'cgi-bin2';

# Options handed to every glimpse invocation.
# Use -j ONLY with Glimpse v3.5 or later.
$GLIMPSE_OPT = '-j -y';

# For Glimpse v3.0 or earlier, uncomment the following line instead
# $GLIMPSE_OPT = "-y";

# **** **** **** **** NO CONFIGURATION NEEDED BELOW **** **** **** ****
# URL of the per-line viewer that the "line N" links in the results point to
$FSSERV = '/' . $CGIBIN . '/mfs';

# Regular expression matched against result file names; matched lines from
# those files have their HTML tags removed before display.
# Comment out to cancel suppression.
$SUPPRESS_HTML_TAGS = '\.s?html?$';

# **** **** **** **** Done settings **** **** **** ****

# ---- Request decoding and sanity checks --------------------------------
#
# PATH_INFO has the form  /<length><indexdir><relpath>  where <length> is
# the number of characters, counted right after the digits, that make up
# the index directory; whatever follows is the subdirectory (relative to
# the archive) the search is restricted to.
$path_info = $ENV{'PATH_INFO'};
$_ = $path_info;

# might as well start the message now; every later code path (including
# the err_* pages) relies on the header and <HTML><HEAD> being open
print "Content-type: text/html\n\n";
print "<HTML>\n";
print "<HEAD>\n";

if ( m|^/([0-9]*)(.*)$| ) {
	$length = $1;
	$path = $2;
	# double quotes are dropped because the path ends up in HTML attributes
	$path =~ s|"||g;
} else {
	&err_badargs;
}

$indexdir = substr($path,0,$length);
$relpath = substr($path,$length,length($path));

# print "<br>indexdir=$indexdir<br>relpath=$relpath<br>";

# Reject an empty index directory BEFORE it is used to build file names.
&err_badargs unless $indexdir;

# Read the archive description: tab-separated, first field is the archive
# title, second its base URL.  The last non-blank line wins.
#
# Three-argument open: $indexdir comes straight from the request, and with
# the two-argument form a leading '|' or '>' would be interpreted as a pipe
# or a write mode.
open(CONF, '<', "$indexdir/archive.cfg") || &err_conf;
while (<CONF>) {
	chomp;			# keep the newline out of $urlpath when it is the last field
	next unless /\S/;	# a trailing blank line must not wipe the settings
	($title, $urlpath) = split(/\t/);
}
close(CONF);

($ENV{'HOME'} = $indexdir) || &err_badargs; # some versions of Glimpse need it

# Ensure that Glimpse is available on this machine
-x $GLIMPSE_LOC || &err_noglimpse ;

# Ensure that index is available
-r "$indexdir/.glimpse_index" || &err_noindex($indexdir) ;

#	To support an ISINDEX type search, set query string if given
#	an argument on the command line
$prefix="whole=on&case=off&query=" if ( $#ARGV >= 0 );

#	Check that a query has been made
($query = $ENV{'QUERY_STRING'}) || &err_noquery ;

#	Strip the variables out from the query string,
#	and assign them into global variables, prefixed by 'QS_'
#	(e.g. query=foo sets $QS_query).
@qvars = split( /\&/, $prefix . $query );
foreach $qvar (@qvars) {
	# Explicit list assignment: a bare split(/=/) depends on the implicit
	# split to @_ that newer perls no longer perform.  The limit of 2
	# keeps any further '=' as part of the value.
	($fname, $fvalue) = split(/=/, $qvar, 2);
	$fvalue = "" unless defined $fvalue;
	# single quotes are dropped, as they always were by this script
	$fvalue =~ s/\'//g;
	# only well-formed identifiers may become variable names
	next unless $fname =~ /^[a-z_A-Z]\w*$/;
	# Symbolic assignment instead of a string eval of request data: the
	# value is stored verbatim and never parsed as Perl source.
	${"QS_$fname"} = $fvalue;
}

# URL-decode the search pattern: '+' is a space, %XX is a byte in hex.
# Only real hex digits are decoded; malformed escapes are left alone.
$QS_query =~ s|\+| |g;
$QS_query =~ s|%([0-9A-Fa-f]{2})|sprintf("%c", hex($1))|ge;
# a NUL byte would cut the shell command short, so drop it
$QS_query =~ tr/\0//d;

# $pquery keeps the query as the user typed it (for display);
# $QS_query is made safe for use inside '...' on the shell command line.
$pquery = $QS_query;
$QS_query =~ s|\'|\'\"\'\"\'|g;

# ---- Translate form fields into glimpse command-line options ----------
# Every field is validated against a strict pattern before it is allowed
# onto the command line.

# number of errors allowed: a single digit 0-8, or "Best match" (-B)
$OPT_errors="-$QS_errors"	if $QS_errors =~ /^[0-8]$/;
$OPT_errors="-B"		if $QS_errors =~ /^Best\+match$/;
# remove the '-i' from case if the switch is on
$OPT_case="-i";
$OPT_case=""			if $QS_case =~ /^on$/;
# -w is passed unless whole=on
$OPT_whole="-w"			unless $QS_whole =~ /^on$/;
# -Y takes a number; anything that is not purely digits is ignored
$OPT_age = "-Y $QS_age" if $QS_age =~ /^[0-9]+$/;
# print "OPT_age = $OPT_age<br>\n";

# Restrict the search to the requested path: dots are escaped so they match
# literally, quotes removed so the value stays inside '...' on the shell.
$path =~ s/\./\\./g;
$path =~ s/\'//g;
$OPT_filter="-F '$path'"	if $path;

# Output limits: first run of digits in the field, else the default
if ($QS_maxlines =~ /\d+/) {
	$maxlines = $&;
} else {
	$maxlines = 20;
}
if ($QS_maxfiles =~ /\d+/) {
	$maxfiles = $&;
} else {
	$maxfiles = 100;
}

# Build the regular expression used to bold the matched words: the words
# of the query joined into an alternation.
$highlight = $QS_query;
$highlight =~ s/^\W+//;		# avoid an empty leading field in the split
$highlight = join("|",split(/\W+/,$highlight));
# check if the query contains any words; compare against the empty string
# so that the one-word query "0" is not mistaken for "no words"
&err_badquery if $highlight eq "";
$highlight = '\b('.$highlight.')\b' if $OPT_whole;

# ---- Page heading -------------------------------------------------------
# The query and the subdirectory both come from the request, so they are
# HTML-escaped before being echoed; $title and $urlpath come from the
# archive's own configuration file and are printed as they are.
$hquery = $pquery;
$hrelpath = $relpath;
foreach ($hquery, $hrelpath) {
	s/&/&amp;/g;
	s/</&lt;/g;
	s/>/&gt;/g;
	s/"/&quot;/g;
}

print "<TITLE>Result for query \"$hquery\"\n";
print "</TITLE></HEAD><BODY>\n";
print "<center>";
print "<H1>Results for query \"$hquery\"</H1>\n";
print "<h3>on archive: <a href=$urlpath/ghindex.html>$title</a></h3>\n";
if($relpath){
	# the attribute is quoted so a space in the path cannot start a new one
	print "<h4>subdirectory <a href=\"$urlpath$hrelpath\">$hrelpath</a></h4>\n";
}

print "</center><p>\n";

chdir $indexdir;

# the default is *no* jump to lines.  If lines=on, tell glimpse to get lines
if($QS_lines){
	$OPT_linenums="-n";
	print "File name (modification date), and list of matched lines (preceded by line numbers)<br>\n";
}else{
	print "File name (modification date), and list of matched lines<br>\n";
}

# ---- Start glimpse --------------------------------------------------------
# The command goes through the shell (needed for 2>&1), so every value that
# originates from the request must sit inside single quotes.  $QS_query and
# $OPT_filter are already prepared that way; $indexdir is quoted here, with
# embedded single quotes rewritten as '"'"' like the query.
($qindexdir = $indexdir) =~ s/\'/\'\"\'\"\'/g;

$cmd = "exec $GLIMPSE_LOC $GLIMPSE_OPT $OPT_linenums $OPT_age $OPT_case $OPT_whole $OPT_errors -H '$qindexdir' " .
	 "$OPT_filter '$QS_query' 2>&1 |";

# A pipe open returns the pid of the child.  Because of the leading "exec"
# the shell is replaced by glimpse, so this is glimpse's own pid; it is kept
# in $gpid for the temp-file cleanup at the end of the script
# (presumably glimpse names its temp file after its pid -- TODO confirm).
$gpid = open(GOUT, $cmd);
if (!$gpid) {
	print "<H2>Cannot execute glimpse</H2>\n";
	exit;
}

# ---- Format glimpse's output --------------------------------------------
# Each useful output line looks like  file:date:text  (or
# file:date:lineno:text  when line numbers were requested).  Lines are
# grouped per file: a heading with a link for every new file, followed by
# a <UL> with the matched lines.
$prevfile = "";		# file whose <UL> is currently open ("" = none)
$lcount = 0;		# matched lines seen so far
$fcount = 0;		# files printed so far

# HTML-escaped copy of the query as the user typed it, for the summary
$summary_query = $pquery;
foreach ($summary_query) {
	s/&/&amp;/g;
	s/</&lt;/g;
	s/>/&gt;/g;
	s/"/&quot;/g;
}

LINE: while (<GOUT>) {

	if($QS_lines){
		# look for line number, too
		( /^([^:]*):([^:]*):\s*(\d+):(.*)/ ) || next;
		$file = $1;
		$date = $2;
		$line = $3;
		$string = $4;
	}else{
		( /^([^:]*):([^:]*):(.*)/ ) || next;
		$file = $1;
		$date = $2;
		$string = $3;
	}
	# skip the file if it isn't in this index directory; \Q..\E makes
	# the directory name match literally instead of as a regex
	next unless $file =~ s|^\Q$indexdir\E||o;

	if ($file ne $prevfile) {
		$linecount = 0;
		# stop once $maxfiles files have been listed
		if ($fcount>=$maxfiles) {
			# close the list of the previous file before the notice
			print "</UL>" if ( $prevfile ne "" );
			print "<H3>Limit of $maxfiles files exceeded...</H3>\n";
			$file = "";
			$prevfile = "";
			$fcount = "at least $fcount";
			$lcount = "at least $lcount";
			last LINE;
		}
		print "</UL>" if ( $prevfile ne "" );
		$prevfile = $file ;
		print 
			"<hr><b><A HREF=\"",$urlpath,$file,
			"\">",$file,"</A></b>",
			", <font size=-1>($date)</font><br><UL>\n" ;
		$fcount++ ;
	}
	$lcount++ ;
	$linecount++;
	# show at most $maxlines lines per file; the notice is printed once,
	# on the first line past the limit, the rest is only counted
	if ($linecount>$maxlines) {
		print "<LI>Limit of $maxlines matched " .
			"lines per file exceeded...\n" if
				$linecount==$maxlines+1;
		next LINE;
	}

	if ($SUPPRESS_HTML_TAGS && $file =~ /$SUPPRESS_HTML_TAGS/o) {
		# drop HTML tags (also one cut off by the end of the line)
		$string =~ s#\</?[a-zA-Z][^>]*\>?##g;
	}
	# what is left is shown as text, so escape it
	$string =~ s/\&/\&amp;/g;
	$string =~ s/\</\&lt;/g;
	$string =~ s/\>/\&gt;/g;
	if($QS_lines){
		# BOLDING of the matched words; $OPT_case is "-i" when the
		# search itself was case-insensitive
		if ($OPT_case) {
			$string =~ s#$highlight#<B>$&</B>#gio;
		} else {
			$string =~ s#$highlight#<B>$&</B>#go;
		}
		print "<LI><A HREF=\"",$FSSERV,"/$length".$indexdir,$file,"?",$line,"#mfs\">\n" ;
		print "line ",$line,"</A>:",$string,"\n" ;
	}else{
		print "<LI>$string\n";
	}
}

# close the last file's list, if one is still open
print "</UL>\n" if $prevfile ne "" ;
print "<HR>" ;
print "<H2>Summary for query <code>\"",$summary_query,"\":</code></H2>\n" ;
print "<i><a href=http://glimpse.cs.arizona.edu/ghttp>GlimpseHTTP</a></i>\n";
print "search found ",$lcount," matches in ",$fcount," files<br>\n" ;
print "(Some matches may be to HTML tags which may not be shown.)\n";
print "</BODY>\n" ;
print "</HTML>\n";
close(GOUT);
# remove the temp file of the glimpse process, when its pid is known
unlink "/tmp/.glimpse_tmp.$gpid" if $gpid;

# Common tail of every error page: closes the <HTML> element and ends the
# script with exit status 1.  Does not return.
sub diag_exit {
	print("</HTML>\n");
	exit(1);
}
sub err_noquery {
#	The script was called without a query.
#	Provide an ISINDEX type response for browsers
#	without form support, then exit via diag_exit.
#
#	The heredoc is single-quoted (no interpolation), so '@' is written
#	without a backslash -- an escaped one would be printed literally.
	print <<'EOM' ;
<TITLE>Glimpse Gateway</TITLE></HEAD>
<BODY><H1>Glimpse Gateway</H1>
This is a gateway to Glimpse.
Type a pattern to search in your browser's search dialog.<P>

<ISINDEX>

<H2>What is Glimpse ?</H2>
<BLOCKQUOTE>
<P>
Glimpse (which stands  for  GLobal  IMPicit  SEarch)  is  an
indexing  and query system that allows you to search through
all your files very quickly.   For  example,  a  search  for
Schwarzkopf  allowing  two  misspelling errors in 5600 files
occupying 77MB took 7 seconds on a SUN  IPC.   Glimpse  supports
most of agrep's options (agrep is our powerful version
of  grep)  including  approximate  matching  (e.g.,  finding
misspelled  words),  Boolean  queries, and even some limited
forms of regular expressions.<BR>
Glimpse's running time is typically slower than systems
using inverted indexes, but its index is  an  order  of
magnitude smaller (typically 2-5% of the size of the files).
<H2>Authors of Glimpse</H2>
Udi Manber, Sun Wu, and Burra Gopal<BR>
<ADDRESS>
Department of  Computer
Science, University   of   Arizona,   Tucson,   AZ  85721.<BR>
glimpse@cs.arizona.edu
</ADDRESS>
</BLOCKQUOTE>

<HR>
<ADDRESS>
Glimpse<BR>
glimpse@cs.arizona.edu<BR>
</ADDRESS>

</BODY>
EOM
	&diag_exit;
}

sub err_noglimpse {
#
# Glimpse was not found (the configured path is not an executable file).
# Report a useful message, then exit via diag_exit.
#
# The download link uses the ftp: scheme; the target is an FTP server, and
# a file: URL does not make a browser fetch from one.
#
	print <<'EOM' ;
<TITLE>Glimpse not found</TITLE>
</HEAD>
<BODY>
<H1>Glimpse not found</H1>

This gateway relies on <CODE>Glimpse</CODE> search tool.
If it is installed, please set the correct path in the script file.
Otherwise obtain the latest version from
<A HREF="ftp://ftp.cs.arizona.edu/glimpse">ftp.cs.arizona.edu</A>
</BODY>
EOM
	&diag_exit;
}

sub err_noindex {
# Glimpse index was not found
# Give recommendations for indexing, then exit via diag_exit.
#
# Argument: the index directory that was looked at.  It originates from
# the request, so it is HTML-escaped before being echoed.
	my ($indexdir) = @_;
	$indexdir = "" unless defined $indexdir;
	$indexdir =~ s/&/&amp;/g;
	$indexdir =~ s/</&lt;/g;
	$indexdir =~ s/>/&gt;/g;
	$indexdir =~ s/"/&quot;/g;

	print "<TITLE>Glimpse Index not found</TITLE>\n";
	print "</HEAD>\n";
	print "<BODY>\n";
	print "<H1>Glimpse Index in directory '$indexdir' not found</H1>\n";
	print "Glimpse cannot proceed without index.\n";
	print "Please check if the directory being searched is indexed\n";
	print "by <code>glimpseindex</code>.\n";
	print "</BODY>\n";
	&diag_exit;
}

sub err_badargs {
# The request did not name a usable archive (bad or missing PATH_INFO).
# Prints the error page and exits via diag_exit.
	print <<'EOM';
<TITLE>Glimpse Archive not found</TITLE>
</HEAD>
<BODY>
<H1>Glimpse Archive not found</H1>
There was a problem with the arguments passed to aglimpse.
Please check your settings.
</BODY>
EOM
	&diag_exit;
}

sub err_conf {
# Glimpse archive Configuration File was not found
# Prints the error page and exits via diag_exit.
#
# Reads the global $indexdir.  Its value is taken from the request and was
# NOT confirmed to exist when this is called, so it is HTML-escaped before
# being echoed into the page.
	my $dir = defined $indexdir ? $indexdir : "";
	$dir =~ s/&/&amp;/g;
	$dir =~ s/</&lt;/g;
	$dir =~ s/>/&gt;/g;
	$dir =~ s/"/&quot;/g;

	print "<TITLE>Glimpse Archive Configuration File not found</TITLE>\n";
	print "</HEAD>\n";
	print "<BODY>\n";
	print "<H1>Glimpse Archive Configuration File not found</H1>\n";
	print "Cannot open configuration file $dir/archive.cfg\n";
	print "</BODY>\n";
	&diag_exit;
}

sub err_badquery {
# The query contains no words to search for.
# Prints the error page and exits via diag_exit.
#
# Reads the global $pquery (the decoded query as typed by the user); it is
# HTML-escaped before being echoed into the page.
	my $shown = defined $pquery ? $pquery : "";
	$shown =~ s/&/&amp;/g;
	$shown =~ s/</&lt;/g;
	$shown =~ s/>/&gt;/g;
	$shown =~ s/"/&quot;/g;

	print "<TITLE>Query is too broad</TITLE>\n";
	print "</HEAD>\n";
	print "<BODY>\n";
	print "<H1>Query is too broad</H1>\n";
	print "The query \"$shown\" doesn't contain any words and ".
		"thus will take too much time. Please refine your query.\n";
	print "</BODY>\n";
	&diag_exit;
}
