#!/usr/local/bin/perl

# Release identification: version number, date of the last change and
# author.  The -v option reports all three.
( $VERSION, $MOD_DATE, $AUTHOR ) = ( "2.8", "14.Aug.94", "Paul Hulford" );


# 15.Aug.94 PDH - don't count lines with >= 68 non-space chars (outliers)
# 29.Jul.94 PDH - put quotes around $COLL when inserting
# 29.Jul.94 PDH - tmp files need to be accessible cross-platform,
#                 so use Hyper-G/tmp, which is (presumably) shared
# 29.Jul.94 PDH - remove path stuff - not required, env already set
# 11.Jul.94 PDH - put quotes around $opt_t so we get the whole title
# 01.Jul.94 PDH - remove "-\cr" from paras (hyphenated words - text only)

#
# Get options and print version if required
#

# &Getopts comes from the getopts.pl library; the option letters
# followed by ':' (f, c, t) take a value.  The script reads the results
# from $opt_v, $opt_h, $opt_f, $opt_c and $opt_t.
require "getopts.pl";

# Unparseable options: show the usage text, then abort.
if( ! &Getopts("vhf:c:t:") )
{
	&help && die;
}

# -v: report the version banner through die and stop.
if( $opt_v )
{
	die "\nhginsrht v$VERSION - $AUTHOR $MOD_DATE\n\n";
}

# -h: show the usage text and stop.
if( $opt_h )
{
	&help && exit;
}

#
# Help options
#

sub help
{
	# Writes the usage summary to the currently selected output handle.
	# The print is the last statement, so the sub returns print's true
	# result and callers can keep chaining it ("&help && die").
	print <<'END_OF_HELP';

Description:
         Inserts rtf, htf and text files into Hyper-G

Usage:
         hginsrht -h
         hginsrht -v
         hginsrht -f file [-t title] [-c collection]

Options:
        -h              -- help
        -v              -- version information
        -f file         -- text,rtf or htf file to process
        -t title        -- title of inserted document
        -c collection   -- collection to place document in

END_OF_HELP
}

#
# Sort out arguments
#

# Server name passed to hginstext with -o
$HOST		= "hmu1";

# Values taken from the command line
$COLLECTION	= $opt_c;
$TITLE 		= "$opt_t";		# 11.July.94 PDH

# -f is mandatory: without it show the usage text and abort
$INFILE 	= "$opt_f";
$INFILE 	= ( &help && die "ERROR! -f arg required\n\n" ) unless $INFILE;

# Prefix for the scratch files ($TMP.1, $TMP.2); $$ is the process id
$TMP		= "/usr/local/Hyper-G/tmp/hginsrht.$$";

# Text blocks whose average line length reaches this are formatted as
# paragraphs; shorter ones are formatted as lists
$MIN_LEN	= 45;

# lines longer than this are probably delimiters so treat them as
# outliers rather than part of the average paragraph length
$OUTLIER	= 68;

# get_fname checks INFILE, strips a known suffix from it and sets
# EXTENSION; when a suffix was recognised it is put back here
&get_fname;

$ext = $EXTENSION;
if( $ext )
{
	$INFILE = $INFILE . "." . $ext;
}

#
# If we don't know the file format, have a look at the file itself
#

unless( $ext )
{
	open( SF, $INFILE ) || die "ERROR! open \"$INFILE\" failed";

	# Only the first line of the file is examined.  An RTF marker takes
	# precedence over a <TITLE> tag when both appear on that line.
	$_ = <SF>;
	if( defined $_ )
	{
		if( /{\\rtf/ )
		{
			$ext = "rtf";
		}
		elsif( /<TITLE>/i )
		{
			$ext = "htf";
		}
	}

	# default to text
	$ext = "txt" unless $ext;
	close SF;
}

$EXTENSION = $ext;

#
# print out input info
#

# Summary of what is about to be processed; title and collection are
# only shown when they were given on the command line
print "\n";
print "Input File:             $INFILE\n";
if( $opt_t )
{
	print "Document Title:         $opt_t\n";
}
if( $opt_c )
{
	print "Collection:             $opt_c\n";
}
if( $EXTENSION )
{
	print "File Type:              $ext\n";
}
else
{
	print "File Type:              unknown\n";
}
print "\n";

#
# handle input file types
#

if( $ext =~ /htf/ )		# process htf files
{
	&insert_htf;
}
if( $ext =~ /rtf/ )		# process rtf files
{
	&insert_rtf;
}
if( $ext =~ /txt/ )		# process txt files
{
	&insert_txt;
}

print "\n";

exit;

#
# We can insert htf files straight away
#

#////////////////////////////////////////////////////////////////////

sub insert_htf
{
	# Inserts an HTF file into Hyper-G as it stands.
	#
	# Reads the globals $INFILE, $HOST, $TMP, $COLLECTION and $TITLE.
	# Dies through &mydie when the collection or title is missing, when
	# hginstext fails, or when the scratch files cannot be removed.

	# get_coll_title copies the input to $TMP.1 and fills in $TITLE /
	# $COLLECTION from tags in the file when they were not given as args
	&get_coll_title;

# if not found and not passed as args complain otherwise insert

	( $COLLECTION && $TITLE )
		|| &mydie("\nERROR! ",	$COLLECTION ? "-t":"-c",
			" argument required\n\n");

	# The echoed line is only informational and keeps its old form
	print "hginstext -o $HOST -f \"$COLLECTION\" $INFILE\n";

	# List-form system: every value reaches hginstext as one argument
	# without passing through a shell.  The collection name can come from
	# the file's contents, and file names may hold spaces or shell
	# metacharacters, so neither may be pasted into a command string.
	! system("hginstext", "-o", $HOST, "-f", $COLLECTION, $INFILE)
		|| &mydie("ERROR! loading into Hyper-G failed\n");

	# Remove whichever of $TMP.1 / $TMP.2 exist; every one found must go
	my @scratch = glob("$TMP.[1-2]");
	unlink(@scratch) == @scratch
		|| &mydie("ERROR! could not remove \"$TMP\" files\n");
}


#////////////////////////////////////////////////////////////////////

sub insert_rtf
{
	# Converts an RTF file to HTF with rtf2htf and inserts the result
	# into Hyper-G.
	#
	# Reads the globals $HOST, $TMP, $COLLECTION and $TITLE.  Dies
	# through &mydie when the collection or title is missing, when either
	# external program fails, or when the scratch files cannot be removed.

	# get_coll_title writes a copy of the input, with the title and
	# collection tags taken out, to $TMP.1
	&get_coll_title;

# if coll and title not found and not passed as args complain 
# otherwise insert

	( $COLLECTION && $TITLE )
		|| &mydie("ERROR! ",$COLLECTION ? "-t":"-c"," argument required\n");

	# The echoed lines are only informational and keep their old form
	print "rtf2htf -t '$TITLE' $TMP.1 $TMP.2\n";

	# List-form system: the title is handed over as a single argument
	# with no shell in between.  The old '$TITLE' quoting broke on any
	# title containing an apostrophe, and the title can come from the
	# file's contents.
	! system("rtf2htf", "-t", $TITLE, "$TMP.1", "$TMP.2")
		|| &mydie("ERROR! rtf2htf conversion failed\n");

	print "hginstext -o $HOST -f \"$COLLECTION\" $TMP.2\n";

	! system("hginstext", "-o", $HOST, "-f", $COLLECTION, "$TMP.2")
		|| &mydie("ERROR! insert document failed\n");

	# Remove whichever of $TMP.1 / $TMP.2 exist; every one found must go
	my @scratch = glob("$TMP.[1-2]");
	unlink(@scratch) == @scratch
		|| &mydie("ERROR! could not remove \"$TMP\" files\n");
}


#////////////////////////////////////////////////////////////////////

# txt is tricky due to formatting hassles

sub insert_txt
{
	# Turns a plain text file into HTF and inserts it into Hyper-G.
	#
	# The text is handled one block at a time, a block being a run of
	# lines up to the next blank line.  The average line length of the
	# block (spaces not counted) decides the markup:
	#   average >= $MIN_LEN  -> paragraph, closed with <P>
	#   average <  $MIN_LEN  -> list, each line ended with <L LEFT>
	#   blank line           -> blank line
	#
	# Reads the globals $HOST, $TMP, $COLLECTION, $TITLE, $MIN_LEN and
	# $OUTLIER.  Dies through &mydie on any failure.

	# get_coll_title writes a copy of the input, with the title and
	# collection tags taken out, to $TMP.1
	&get_coll_title;

# check to make sure we have all the bits

	( $COLLECTION && $TITLE )
		|| &mydie("ERROR! ",$COLLECTION ? "-t":"-c"," argument required\n");

# now try to improve text formatting to include dynamic paragraph sizes

	open( my $in, '<', "$TMP.1" )
		|| &mydie("ERROR! open \"$TMP.1\" failed\n");

	open( my $out, '>', "$TMP.2" )
		|| &mydie("ERROR! open \"$TMP.2\" failed\n");

	print $out "<TITLE>$TITLE\n";
	print $out "<H1>$TITLE<P>\n\n";

	while( 1 )
	{

# read ahead and get average line length for the block, then rewind

		my $start = tell( $in );
		my( $tot_len, $no_lines )    = ( 0, 0 );	# ordinary lines
		my( $long_len, $long_lines ) = ( 0, 0 );	# outlier lines

		while( defined( my $probe = <$in> ) )
		{
			$probe =~ tr/ //d;		# lengths ignore spaces

			# Blank line ends the block.  Comparing the text instead of
			# testing for length 1 keeps a one-character final line that
			# has no newline from being mistaken for a blank.
			last if $probe eq "\n" || $probe eq "";

			if( length( $probe ) <= $OUTLIER )
			{
				$tot_len += length( $probe );
				$no_lines++;
			}
			else
			{
				$long_len += length( $probe );
				$long_lines++;
			}
		}
		seek( $in, $start, 0 );

		# Outliers are normally left out of the average.  When a block
		# holds nothing but outliers, average over those instead: an
		# average of zero would send the block down the blank-line branch
		# below, which throws each line away.
		my $av_len = 0;
		if( $no_lines )
		{
			$av_len = int( $tot_len / $no_lines );
		}
		elsif( $long_lines )
		{
			$av_len = int( $long_len / $long_lines );
		}

# if average is long then most likely a paragraph

		if( $av_len >= $MIN_LEN )
		{
			while( defined( my $line = <$in> ) )
			{
				$line =~ s/^\s*//;
				$line =~ s/\s*$/\n/;

				# Test for the blank line before joining hyphenated
				# words, so a line such as "a-" is not taken for a blank.
				last if $line eq "\n";

				$line =~ s/-\n//;       		#1.July.94 PDH
				print $out $line;
			}
			print $out "<P>\n";
			last if eof( $in );
			next;
		}

# if average is short then most likely a list

		if( $av_len )
		{
			while( defined( my $line = <$in> ) )
			{
				last if $line =~ /^\s*$/;

				# chomp, not chop: a final line without a newline must
				# keep its last character
				chomp( $line );
				print $out "$line<L LEFT>\n";
			}
			print $out "\n";
			last if eof( $in );
			next;
		}

# if average is zero have a blank line

		my $blank = <$in>;		# consume the blank line
		print $out "\n";
		last if eof( $in );
	}

	close( $in );

	# Buffered write errors only show up at close; do not insert a
	# truncated document
	close( $out )
		|| &mydie("ERROR! write \"$TMP.2\" failed\n");

# finally insert into server

	# The echoed line is only informational and keeps its old form
	print "hginstext -o $HOST -f \"$COLLECTION\" $TMP.2\n";

	# List-form system: the collection name (which can come from the
	# file's contents) reaches hginstext as one argument, with no shell
	# in between
	! system("hginstext", "-o", $HOST, "-f", $COLLECTION, "$TMP.2")
		|| &mydie("ERROR! insert document failed\n");

	# Remove whichever of $TMP.1 / $TMP.2 exist; every one found must go
	my @scratch = glob("$TMP.[1-2]");
	unlink(@scratch) == @scratch
		|| &mydie("ERROR! could not remove \"$TMP\" files\n");
}

#////////////////////////////////////////////////////////////////////

sub get_fname
{
	# Checks that the input file named in $INFILE can be read, trying
	# the suffixes .rtf, .htf and .txt (in that order) when the name as
	# given cannot.  Dies when nothing readable is found.
	#
	# On return $EXTENSION holds "rtf", "htf" or "txt" when the file name
	# ends in one of those suffixes and "" otherwise, and $INFILE has had
	# such a suffix removed.

	my $candidate = $INFILE;

	unless( -r $candidate )
	{
		my $found = "";
		foreach my $suffix ( "rtf", "htf", "txt" )
		{
			next unless -r "$candidate.$suffix";
			$found = $candidate . "." . $suffix;
			last;
		}
		die "ERROR! File not found or no read permission\n" unless $found;
		$candidate = $found;
	}

	# Reduce a copy of the name to its suffix; a name that comes through
	# unchanged has no recognised suffix
	( $EXTENSION = $candidate ) =~ s/.*\.(rtf|htf|txt)$/$1/;
	$EXTENSION = "" if $EXTENSION eq $candidate;

	$INFILE =~ s/(.*)(\.htf|\.rtf|\.txt)$/$1/;
}

#////////////////////////////////////////////////////////////////////

sub mydie
{
	# Removes the scratch files ($TMP followed by a dot and one
	# character), then dies with the message passed in @_.
	#
	# The files are unlinked directly instead of running /bin/rm through
	# a shell: no process is spawned, the path is not re-parsed by the
	# shell, and nothing is written to stderr when there is nothing to
	# remove.  Cleanup is best effort; a failed unlink does not change
	# the message.

	# With an empty prefix the pattern would be ".?", which matches
	# entries in the current directory, so skip the cleanup then.
	if( defined $TMP && length $TMP )
	{
		unlink( glob( "$TMP.?" ) );
	}
	die @_;
}

#////////////////////////////////////////////////////////////////////

sub get_coll_title
{
	# Copies $INFILE to $TMP.1 with any <title> and <collection> tags
	# taken out.  The first non-empty value of each tag is stored in
	# $TITLE / $COLLECTION unless that global already holds a value
	# (i.e. it was given on the command line).
	#
	# Dies when either file cannot be opened or the copy cannot be
	# written.

	# Three-argument open: the user-supplied name is always treated as a
	# plain file name, never as a mode or pipe specification
	open( my $src, '<', $INFILE ) || die "ERROR! open \"$INFILE\" failed\n";
	open( my $copy, '>', "$TMP.1" ) || die "open \"$TMP.1\" failed\n";

# extract title and collection from inside file if present

	while( defined( my $line = <$src> ) )
	{
		if( ! $TITLE && $line =~ /<title>/i )    # get title if required
		{
			$TITLE = &_tag_value( $line, "title" );
		}
		$line = &_strip_tag( $line, "title" );    # remove title if present

		if( ! $COLLECTION && $line =~ /<collection>/i )  #get coll if required
		{
			$COLLECTION = &_tag_value( $line, "collection" );
		}
		$line = &_strip_tag( $line, "collection" );  # remove coll

		print $copy $line;
	}
	close( $src );

	# Buffered write errors only show up at close
	close( $copy ) || die "ERROR! write \"$TMP.1\" failed\n";
}

# Returns the text following the last <TAG> on LINE, cut at the next
# "<", "\par" or "}" and trimmed of surrounding whitespace.
sub _tag_value
{
	my( $line, $tag ) = @_;

	# (.*) stops at the newline when there is one, so a tag on a final
	# line without a newline is read as well
	return "" unless $line =~ /.*<$tag>(.*)/i;

	my $value = $1;
	$value =~ s/<.*//;
	$value =~ s/\\par.*//;
	$value =~ s/\}.*//;
	# trim last, so whitespace in front of a removed "}" goes too
	$value =~ s/^\s+//;
	$value =~ s/\s+$//;
	return $value;
}

# Returns LINE with a <TAG> and its value removed.
sub _strip_tag
{
	my( $line, $tag ) = @_;

	$line =~ s/<$tag>[^<]*\\par//i;		# value ended by \par
	$line =~ s/<$tag>[^<]*</</i;		# value ended by the next tag

	# Value running to the end of the line: this is the form the
	# extraction accepts for plain text, so take it out of the copy as
	# well.  Lines where a "}" or "\" follows are left alone so RTF
	# structure is never cut.
	$line =~ s/<$tag>[^<}\\\n]*$//i;
	return $line;
}
