#!/usr/local/bin/perl

$VERSION  = "1.0";
$MOD_DATE = "05.Nov.94";
$AUTHOR   = "Paul Hulford";

# 05-Nov-94 PDH Extracted from hginsrht script

#
# Parse the command line switches, then deal with the two switches
# that end the run at once: -v (print version) and -h (print help)
#

require "getopts.pl";

# a bad switch gets the usage text followed by a bare die

unless( &Getopts("vhf:c:t:") )
{
	&help && die;
}

# the version banner is reported through die

if( $opt_v )
{
	die "\nhginsst v$VERSION - $AUTHOR $MOD_DATE\n\n";
}

if( $opt_h )
{
	&help && exit;
}

#
# Help options
#

# help -- print the usage summary on standard output.
#
# Takes no arguments.  The value returned is that of the print, which
# is true when the text could be written; the callers depend on this
# in "&help && die" and "&help && exit".
#
# -t and -c are shown as mandatory in the usage line because
# insert_txt refuses to run unless both are given.

sub help
{
	print <<'EOT';

Description:
         Inserts simple text files into Hyper-G

Usage:
         hginsst -h
         hginsst -v
         hginsst -f file -t title -c collection

Options:
        -h              -- help
        -v              -- version information
        -f file         -- text,rtf or htf file to process
        -t title        -- title of inserted document
        -c collection   -- collection to place document in

EOT
}

#
# Sort out arguments
#

$HOST		= "hmu1";				# passed to hginstext -o
$COLLECTION	= "$opt_c";
$TITLE 		= "$opt_t";
$INFILE 	= "$opt_f";
$TMP		= "/usr/local/Hyper-G/tmp/hginsst.$$";	# base name of work files
$MIN_LEN	= 45;					# average length that makes a paragraph

# the file name is compared with the empty string rather than tested
# for truth, so that a file called "0" is not taken for a missing -f

$INFILE ne "" || &help && die "ERROR! -f arg required\n\n";

# lines longer than this are probably delimiters so treat them as
# outliers rather than part of the average paragraph length

$OUTLIER	= 68; 

#
# print out input info (title and collection only when supplied)
#

print "\n";
print "Input File:             ", $INFILE, "\n";
print "Document Title:         ", $TITLE, "\n" if $TITLE ne "";
print "Collection:             ", $COLLECTION, "\n" if $COLLECTION ne "";
print "\n";


&insert_txt;  # process text file

print "\n";
exit;

#////////////////////////////////////////////////////////////////////

# insert_txt -- mark up the plain text file $INFILE and hand the
# result to hginstext.
#
# Takes no arguments; reads the globals $INFILE, $TITLE, $COLLECTION,
# $HOST, $TMP, $MIN_LEN and $OUTLIER.  Returns nothing useful.  Any
# failure ends the program: through die while no work file exists,
# through mydie (which removes the work files) afterwards.
#
# The text is handled one block at a time, a block being a run of
# lines closed by a whitespace-only line or by the end of the file:
#
#   average length >= $MIN_LEN   paragraph: lines are trimmed, a word
#                                hyphenated across a line end is
#                                rejoined, and <P> follows the block
#   shorter, but not empty       list or heading: every line is kept
#                                as it is and tagged <L LEFT>
#   block opens with a blank     the blank line becomes an empty line
#
# The length of a line is taken with blanks and tabs removed; the line
# terminator is still counted.  Lines longer than $OUTLIER are kept
# out of the average unless the block holds nothing else; the average
# of such a block used to be zero, so its lines were taken for blank
# lines and thrown away.

sub insert_txt
{
	local($fpos, $tot_len, $no_lines, $av_len);
	local($long_len, $long_lines);

# take a copy of the source file.  The explicit "<" keeps a name such
# as "cmd |" or ">file" from being read as a pipe or an output file

	open( SF, "<$INFILE" ) || die "ERROR! open \"$INFILE\" failed\n";
	open( TMPF, ">$TMP.1" ) || die "open \"$TMP.1\" failed\n";
	while(<SF>){print TMPF};
	close SF;

# buffered write errors only show up when the file is closed

	close( TMPF ) || &mydie("ERROR! write \"$TMP.1\" failed\n");

# check to make sure we have all the bits; compared with the empty
# string so that a title or collection of "0" is accepted

	( $COLLECTION ne "" && $TITLE ne "" )
		|| &mydie("ERROR! ",
			$COLLECTION ne "" ? "-t" : "-c",
			" argument required\n");

# now try to improve text formatting to identify paragraphs

	open( TMPF1, "<$TMP.1")
		|| &mydie("ERROR! open \"$TMP.1\" failed\n");

	open( TMPF2, ">$TMP.2")
		|| &mydie("ERROR! open \"$TMP.2\" failed\n");

	print TMPF2 "<TITLE>$TITLE\n";
	print TMPF2 "<H1>$TITLE<P>\n\n";

	while(1)
	{

# read ahead to the end of the block and measure it, then go back to
# where the block started

		$fpos = tell( TMPF1 );
		$tot_len = $no_lines = 0;
		$long_len = $long_lines = 0;

		while(<TMPF1>)
		{
			last if /^\s*$/;	# same blank test as the loops below
			tr/ \t//d;
			if( length($_) <= $OUTLIER )
			{
				$tot_len += length($_);
				$no_lines++;
			}
			else
			{
				$long_len += length($_);
				$long_lines++;
			}
		}
		seek(TMPF1, $fpos, 0);

# outliers only decide the average when there is nothing else to go by

		if( $no_lines )
		{
			$av_len = int( $tot_len / $no_lines );
		}
		elsif( $long_lines )
		{
			$av_len = int( $long_len / $long_lines );
		}
		else
		{
			$av_len = 0;
		}

# if average is long then most likely a paragraph

		if( $av_len >= $MIN_LEN )
		{
			while(<TMPF1>)
			{
				s/^\s*//;
				s/\s*$/\n/;

# test for the blank line before the hyphen is removed, otherwise a
# line such as "a-" shrinks to one character and ends the block

				last if $_ eq "\n";
				s/-\n//;       		#1.July.94 PDH
				print TMPF2 $_;
			}
			print TMPF2 "<P>\n";
			last if eof(TMPF1);
			next;
		}

# if average is short then most likely a list or heading etc

		if( $av_len )
		{
			while(<TMPF1>)
			{
				last if /^\s*$/;

# only a line terminator is removed, so a last line that has none
# keeps its final character

				s/\n$//;
				print TMPF2 $_, "<L LEFT>\n";
			}
			print TMPF2 "\n";
			last if eof(TMPF1);
			next;
		}

# if average is zero the block opens with a blank line (or the file
# has ended): skip that one line

		$_ = <TMPF1>;
		print TMPF2 "\n";
		last if eof(TMPF1);
	}

	close TMPF1;
	close( TMPF2 ) || &mydie("ERROR! write \"$TMP.2\" failed\n");

# finally insert into server.  The arguments are passed as a list so
# that no shell gets to interpret the collection name

	print "hginstext -o $HOST -f \"$COLLECTION\" $TMP.2\n";

	system("hginstext", "-o", $HOST, "-f", $COLLECTION, "$TMP.2") == 0
		|| &mydie("ERROR! insert document failed\n");

# unlink returns the number of files it removed

	unlink("$TMP.1", "$TMP.2") == 2
		|| &mydie("ERROR! could not remove \"$TMP\" files\n");
}

#////////////////////////////////////////////////////////////////////

# mydie -- remove the temporary work files, then die.
#
# Arguments: the strings that make up the error message; they are
# handed to die unchanged.  Never returns.
#
# The work files are "$TMP.1" and "$TMP.2".  They are removed with
# unlink rather than by running "/bin/rm $TMP.?", so no shell is
# started, nothing is printed when a file does not exist, and no other
# file matching the pattern can be removed by accident.

sub mydie
{
	unlink("$TMP.1", "$TMP.2");
	die @_;
}

