#!/usr/bin/perl
# html2sgml - converts html-files to linuxdoc-sgml
# v 0.3
# Crude script to convert an html-file to an sgml-file in a format
# suitable for linuxdoc-sgml. See html2sgml.1 for documentation.
# (c) Peter Antman, 1997.
# send bug reports to:
# peter.antman@abc.se
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#



# Program name and version string.
$THISPROG = "html2sgml 0.3";


$usage = "usage: html2sgml fil.html";

# The HTML file to convert is the first command line argument.
die "$usage\n" if @ARGV < 1;
$htmlfile = shift (@ARGV);

# The output file gets the base name of the input file with ".sgml" in
# place of ".html".  The base name ($filename) is kept as well; it is used
# later to look for an Applix file with the same name.
$htmlfile =~ /([\S]*?)\.html/
    or die "Could not figure out postfix for filename: $htmlfile\n";
$filename = $1;
$sgmlfile = "$filename.sgml";

# Open the input before creating the output, so that an unreadable input
# file does not leave an empty .sgml file behind.  The three-argument form
# keeps characters in the file name from being read as an open mode.
open (html, '<', $htmlfile)
    or die "Could not open $htmlfile for reading: $!\n";
open (sgml, '>', $sgmlfile)
    or die "Could not open $sgmlfile for writing: $!\n";


# Start the output document: a marker comment, the linuxdoc doctype
# declaration and the opening <article> tag, each on a line of its own.
print sgml join("\n",
    '<!--Converted to sgml with html2sgml-->',
    '<!doctype linuxdoc system>',
    '<article>',
    '');

# If an Applix Words file (.aw) with the same base name exists next to the
# HTML file, try to collect the footnote texts from it.
#
# Results, used by the later steps of the script:
#   $nrfootnotes - number of footnote references found
#   %tmpftn      - maps the footnote id used in the .aw file to its
#                  sequence number (1, 2, 3, ... in order of appearance)
#   %footnotes   - maps the sequence number to the collected footnote text,
#                  with <it>/<bf> markup around italic/bold pieces
#
# NOTE(review): the patterns below reflect the .aw layout this script was
# written against; that format is not described here, so check them against
# a real file before changing them.

if (-e "$filename.aw") {
    # Footnotes are optional, so an unreadable file is reported but is not
    # fatal: reading from the unopened handle below simply yields nothing.
    open (aw, '<', "$filename.aw")
        or warn "Could not open $filename.aw, footnotes will be left empty: $!\n";

    $nrfootnotes = 0;
    $start_note = nej;      # ja between <start_footnote ...> and <end_footnote>
    $inT = nej;             # ja while a <T "... entry continues on later lines
    while (<aw>) {

        # Find all footnote references: remember the sequence number that
        # belongs to each footnote id.
        if (/^<S_F.*?footnote\s\"(\d*)\">$/) {
            $number = $1;
            $nrfootnotes++;
            $tmpftn{$number} = "$nrfootnotes";
        }

        # Which footnote are we taking the content from?
        if (/^<start_footnote\s\"(\d*)\">$/) {
            $whichnote = $1;
            $start_note = ja;
        }

        # End of an open footnote
        if (/^<end_footnote>$/) {
            $start_note = nej;
        }

        # Grab the content
        if ($start_note eq ja) {
            if (!((/footnote_body/) or (/<T.*?position/) or (/<Symbol/))) {

                # Quotation marks in footnotes are tricky in converted
                # docs: turn every \" into ''.
                s/\\\"/''/g;

                # Special routine to handle long footnotes.  This only runs
                # when a <T "... entry was left open by an earlier line.
                if ($inT eq ja) {

                    if (/(^[\s]+.*?\\)|(^[\s]+.*?>)/) {
                        # Drop the backslashes, one leading whitespace
                        # character and the line end before joining the
                        # piece to what has been harboured so far.
                        s/\\//g;
                        s/^\s//g;
                        s/\n//g;

                        if (/.*?>/) {
                            # The entry is closed on this line: cut off the
                            # closing quote and what follows it, then add
                            # the text to the current footnote.
                            $tmpharb .= $_;
                            if ($tmpharb =~ /italic/) {
                                $tmpharb =~ s/\"\s.*?>//;
                                $footnotes{$tmpftn{$whichnote}} .= "<it>$tmpharb</it>";
                            } elsif ($tmpharb =~ /bold/) {
                                $tmpharb =~ s/\"\s.*?>//;
                                $footnotes{$tmpftn{$whichnote}} .= "<bf>$tmpharb</bf>";
                            } else {
                                $tmpharb =~ s/\".*?>$//;
                                $footnotes{$tmpftn{$whichnote}} .= "$tmpharb";
                            }

                            $inT = nej;
                        } else {
                            # Still not closed: keep harbouring.
                            $tmpharb .= $_;
                        }
                    }
                }

                # The real footnote content

                if (/(<T\s\"(.*)\"(.*)$)|(<T\s\"(.*)[^\"]\\)/) {
                    # Footnotes have three formats
                    # 1) <T "CONTENT">
                    # 2) <T "CONTENT" TAG>
                    # 3) <T "CONTENT\
                    # Hm, and
                    # 4) <T "CONTENT"\ TAG>
                    if (/<T\s\"([^\"]+)\"(.*)>$/) {
                        # Complete entry on one line.
                        $cont = $1;
                        $cont =~ s/\\//g;

                        # The words are looked for anywhere on the line;
                        # a line with both gets both wrappers.
                        if (/italic/) {
                            $cont = "<it>$cont</it>";
                        }

                        if (/bold/) {
                            $cont = "<bf>$cont</bf>";
                        }

                        $footnotes{$tmpftn{$whichnote}} .= "$cont";
                    }
                    if (/<T\s\"(.*?)\\$/) {
                        # Ok, we are in a long footnote, which is difficult.
                        # It may not even be the whole footnote, just a
                        # specially formatted piece of it.  Harbour it
                        # temporarily until the closing line is seen.
                        $cont = $1;
                        $cont =~ s/\\//g;
                        $tmpharb = $cont;
                        $inT = ja;
                    }
                }
            }
        }
    }

    close aw;
}

# Escape the collected footnote texts for SGML.  Without this, awkward
# characters could end up in the footnotes.  Only the characters the
# original author needed are covered; other 8-bit characters have to be
# added to the table by whoever needs them.
#
# The table is ordered and is applied from top to bottom: "&" has to be
# escaped before any entity is inserted, and the bare "^" is only removed
# after all "^"-codes have been handled.
my @aw_entity_map = (
    [ qr/\&/,        '&amp;'    ],
    [ qr/\^[\s]*of/, '&aring;'  ],
    [ qr/\^[\s]*pg/, '&ouml;'   ],
    [ qr/\^[\s]*oe/, '&auml;'   ],
    [ qr/\^[\s]*mf/, '&Aring;'  ],
    [ qr/\^[\s]*me/, '&Auml;'   ],
    [ qr/\^[\s]*ng/, '&Ouml;'   ],
    [ qr/\^[\s]*mj/, '&Eacute;' ],
    [ qr/\^[\s]*mi/, '&Egrave;' ],
    [ qr/\^[\s]*oj/, '&eacute;' ],
    [ qr/\^[\s]*oi/, '&egrave;' ],
    [ qr/\^[\s]*ni/, '&Oslash;' ],
    [ qr/\^[\s]*pi/, '&oslash;' ],
    [ qr/\^[\s]*nm/, '&Uuml;'   ],
    [ qr/\^[\s]*pm/, '&uuml;'   ],
    [ qr/\^[\s]*kh/, ''         ],
    [ qr/\[/,        '&lsqb;'   ],
    [ qr/\]/,        '&rsqb;'   ],
    [ qr/\^/,        ''         ],
);

foreach my $note_nr (1 .. $nrfootnotes) {
    my $text = "$footnotes{$note_nr}";
    foreach my $rule (@aw_entity_map) {
        my ($pattern, $replacement) = @$rule;
        $text =~ s/$pattern/$replacement/g;
    }
    $footnotes{$note_nr} = $text;
}


# First pass over the HTML file.  Before anything can be converted we have
# to find out a couple of things:
#   - the document title, which is written to the output as soon as it is
#     seen,
#   - how many <H1> headings there are; this decides which heading level
#     becomes <sect> ($sect),
#   - how many columns the first row of every table has.  The result is
#     kept in %table, keyed by table number, as one "l" per column.

$hone = 0;      # number of <H1> headings seen
$htwo = 0;      # not updated in this pass

$intable = nej;
$nrtable = 0;
$nrrow = 0;

while (<html>) {
    if ($_ =~/\<TITLE\>([^\<]*)/i) {
        print sgml "<title>$1\n";
    }
    if ($_ =~/\<H1.*?\>/i) {
        $hone++;
    }

    # We have to count the tables and the number of columns in them.
    # All table tags are matched case-insensitively, the same way the
    # conversion pass matches them.
    if ($_=~/<TABLE.*?>/i) {
        $intable = ja;
        $nrtable++;
    }
    if ($intable eq ja) {
        if ($_=~/<TR.*?>/i) {
            $nrrow++;
        }
        if ($nrrow == 1) {
            # Count every cell end on this line that belongs to the first
            # row; whatever follows the row's </TR> belongs to later rows.
            (my $first_row = $_) =~ s/<\/TR.*//is;
            my $cells = () = $first_row =~ /<\/TD.*?>/gi;
            $table{$nrtable} .= "l" x $cells if $cells;
        }
    }

    if ($_=~/<\/TABLE.*?>/i) {
        $intable = nej;
        $nrrow = 0;
    }
}

# With more than one <H1> the sections start at H1, otherwise at H2.
if ($hone > 1 ) {
    $sect = H1;
} else {
    $sect = H2;
}

close html;

# The conversion pass counts the tables again from the start.
$nrtable = 0;
$intable = nej;

# Write the header elements whose content cannot be worked out from the
# HTML: author, date, abstract and table of contents.  They are placeholders
# to be filled in by hand.  It would be good if the name of the writer could
# be figured out - perhaps ask for one?
foreach my $placeholder ('<author>You name', '<date>', '<abstract>', '</abstract>', '<toc>') {
    print sgml "$placeholder\n";
}

# Second pass over the HTML file: the real substitution.  Every line is
# rewritten in place in $_ and then printed to the output.  The order of
# the substitutions is in some cases really important, so keep it.
#
# Reads:  %footnotes (footnote texts by sequence number), %table (column
#         specification by table number), $sect (heading level that
#         becomes <sect>), $nrtable (table counter, continued from its
#         current value).
# Writes: the converted text and the closing </article> to the output, and
#         @pics (file names of the pictures that were referenced).
open (html, '<', $htmlfile)
    or die "Could not open $htmlfile for reading: $!\n";

$inquote = nej;     # ja while inside <BLOCKQUOTE>
$inverb = nej;      # ja while inside <PRE>
$nrfootnotes = 0;   # sequence number of the last footnote put in
while (<html>) {

    # must be first for the sake of footnotes
    s/<FONT.*?>|<\/FONT>//gi;

    # footnotes, as they are converted in Applix aw->html
    if (/<SUP>|<\/SUP>/) {
        # grab all footnotes on the current line
        while (/<SUP>[0-9]+/) {
            $nrfootnotes++;
            $putin = $footnotes{$nrfootnotes};
            s/<SUP>[0-9]+/<footnote>$putin<\/footnote>/;
        }
        # if the footnote number is not on the same line as <SUP>
        while (/[0-9]+<\/SUP>/) {
            $nrfootnotes++;
            $putin = $footnotes{$nrfootnotes};
            s/[0-9]+<\/SUP>/<footnote>$putin<\/footnote>/;
        }
    }


    # Things to remove
    s/<SUP>|<\/SUP>//gi;
    s/<HTML>|<\/HTML>//gi;
    s/<HEAD>|<\/HEAD>//gi;
    s/<TITLE>((.*<\/TITLE>)|(.*$))//gi;
    s/<\/TITLE>//gi;
    s/<HR>//gi;
    s/<U>|<\/U>//gi;
    s/<BODY.*?>|\<\/BODY>//gi;
    s/[0-9]*<\/SUP>//gi;
    s/<TT>|<\/TT>//gi;
    s/<SAMP>|<\/SAMP>//gi;

    s/<CENTER>|<\/CENTER>//gi;
    s/<ADDRESS>|<\/ADDRESS>//gi;
    s/<P\s.*?>//gi;


    # Things to substitute

    # First the section levels.
    # Empty headings have no meaning, remove them.
    s/<H1><\/H1>//gi;
    s/<H2><\/H2>//gi;
    s/<H3><\/H3>//gi;
    s/<H4><\/H4>//gi;
    s/<H5><\/H5>//gi;
    if ($sect eq H1) {
        s/<H1.*?>/<sect>/gi;
        s/<\/H1>/\n<p>/gi;
        s/<H2.*?>/<sect1>/gi;
        s/<\/H2>/\n<p>/gi;
        s/<H3.*?>/<sect2>/gi;
        s/<\/H3>/\n<p>/gi;
        s/<H4.*?>/<sect3>/gi;
        s/<\/H4>/\n<p>/gi;
        s/<H5.*?>/<sect4>/gi;
        s/<\/H5>/\n<p>/gi;
    } else {
        s/<H2.*?>/<sect>/gi;
        s/<\/H2>/\n<p>/gi;
        s/<H3.*?>/<sect1>/gi;
        s/<\/H3>/\n<p>/gi;
        s/<H4.*?>/<sect2>/gi;
        s/<\/H4>/\n<p>/gi;
        s/<H5.*?>/<sect3>/gi;
        s/<\/H5>/\n<p>/gi;
    }

    # Let's take all the one to one things
    s/<BR>//gi;
    s/<UL>/<itemize>/gi;
    s/<\/UL>/<\/itemize>/gi;
    s/<OL>/<enum>/gi;
    s/<\/OL>/<\/enum>/gi;
    s/<DIR>/<itemize>/gi;
    s/<\/DIR>/<\/itemize>/gi;

    # Does not generate good results
    s/<DL.*?>/<descrip>/gi;
    s/<\/DL>/<\/descrip>/gi;
    s/<DT>/<tag>/gi;
    s/<DD>/<\/tag>/gi;

    s/<LI>/<item>/gi;
    s/<\/LI>/<\/item>/gi;
    s/<LQ>/<lq>/gi;
    s/<\/LQ>/<\/lq>/gi;
    s/<EM>/<em>/g;
    s/<\/EM>/<\/em>/g;
    s/<CITE>/<em>/gi;
    s/<\/CITE>/<\/em>/gi;
    s/<B>/<bf>/gi;
    s/<\/B>/<\/bf>/gi;
    s/<STRONG>/<bf>/gi;
    s/<\/STRONG>/<\/bf>/gi;
    s/<I>/<it>/gi;
    s/<\/I>/<\/it>/gi;
    s/<SF>/<sf>/g;
    s/<\/SF>/<\/sf>/g;
    s/<CODE>/<tt>/gi;
    s/<\/CODE>/<\/tt>\n/gi;
    s/<PRE>/<tscreen><verb>/gi;

    # \n taken away from the end of tscreen
    s/<\/PRE>/<\/verb><\/tscreen>/gi;
    s/<MC>/<mc>/gi;
    s/<\/MC>/<\/mc>/gi;
    s/&quot;/''/gi;
    s/<DFN>/<tt>/gi;
    s/<\/DFN>/<\/tt>/gi;

    # To prevent nasty things from happening in latex
    s/``/''/g;

    s/&#60;/&lt;/g;
    s/&#62;/&gt;/g;
    s/&#38;/&amp;/gi;


    # We have to do special things inside verb and quote
    if (/\<tscreen\>\<verb>/) {
        $inverb = ja;
    }

    if (/\<\/verb\>\<\/tscreen\>/) {
        $inverb = nej;
    }

    # quote does not like empty rows
    if (/\<BLOCKQUOTE\>/i) {
        $inquote = ja;
        s/<BLOCKQUOTE>/<quote>/gi;
    }
    if (/\<\/BLOCKQUOTE\>/i) {
        $inquote = nej;
        s/<\/BLOCKQUOTE>/<\/quote>/gi;
    }


    # table - hm...probably only works for applix,
    # to work with sgml2html, change tabular to table in mapping
    if ($_=~/<TABLE.*?>/i) {
        $nrtable++;
        # Replace the tag only, up to its closing ">"; whatever follows the
        # tag on the same line is kept.
        s/<TABLE.*?>/<tabular ca=\"$table{$nrtable}\">/gi;
    }
    s/<TR.*?>//gi;
    s/<\/TD><\/TR>/<rowsep>/gi;
    s/<TD.*?>//gi;
    s/<\/TD>/<colsep>/gi;
    if ($_=~/<\/TABLE.*?>/i) {
        s/<\/TABLE>/<\/tabular>/gi;
    }


    # urls - pure url does not produce good looking formats - use htmlurl
    # for all
    s/<A\sNAME=\"(.*)\">?(.*)<\/A>/$2<label id=\"$1\">/gi;
    s/<A\sNAME=(.*?)>(.*?)<\/A>/$2<label id=\"$1\">/gi;
    s/<A\sHREF=\"#([^\"]*)\">?(.*?)<\/A>/<htmlurl url=\"$1\" name=\"$2\">/gi;
    s/<A\sHREF=\"([^\"]*)\">?(.*?)<\/A>/<htmlurl url=\"$1\" name=\"$2\">/gi;
    s/<A\sHREF=\"([^\"]*)\">?(.*?)$/<htmlurl url=\"$1\" name=\"$2\">/gi;

    # pure formatters don't use quotes
    s/<A\sHREF=#([^\"]*?)>(.*?)<\/A>/<htmlurl url=\"$1\" name=\"$2\">/gi;
    s/<A\sHREF=([^\"]*?)>(.*?)<\/A>/<htmlurl url=\"$1\" name=\"$2\">/gi;
    s/<A\sHREF=([^\"]*?)>(.*?)$/<htmlurl url=\"$1\" name=\"$2\">/gi;

    # img - makes an eps-img, will only convert gifs
    if ($_=~/\<IMG.*?SRC=([^>]*?)\.([^>]*?)>/i) {

        # Copy both captures before any further substitution: a successful
        # s/// resets $1 and $2.
        $img = $1;
        $ext = $2;
        $img =~ s/\"//g;
        # The second capture runs up to the end of the tag; keep only the
        # extension itself, not the closing quote or further attributes.
        $ext =~ s/[\"\s].*//s;
        s/<IMG.*?SRC=[^>]*?>/<figure>\n<eps file=\"$img\">\n<\/figure>/gi;
        # save the pics
        push(@pics, "$img.$ext");

    }


    # remove stale things
    s/<\/A>//gi;

    # Fix sgml-ish things away from the verb environment, should be more...
    if ($inverb eq ja) {

        if (!/<quote>|<tscreen><verb>|<footnote>|<\/footnote>/) {
            s/<.*?>|<\/.*?>//gi;
            s/&gt;/>/gi;
            s/&lt;/</gi;
            s/&lsqb;/\[/g;
            s/&rsqb;/\]/g;
            s/&amp;/&ero;/gi;

            # ISO characters have to be ISO characters in verb, so the
            # entities are turned into their ISO 8859-1 character.
            # Not all are implemented.
            # NOTE(review): the replacements were empty in the file as
            # found, which deleted the characters; presumably the literal
            # 8-bit characters were lost from the source - confirm.
            s/&Auml;/\xC4/g;
            s/&Aring;/\xC5/g;
            s/&Egrave;/\xC8/g;
            s/&Eacute;/\xC9/g;
            s/&Ouml;/\xD6/g;
            s/&auml;/\xE4/g;
            s/&aring;/\xE5/g;
            s/&egrave;/\xE8/g;
            s/&eacute;/\xE9/g;
            s/&ouml;/\xF6/g;
        }


        # Don't know if this should really be here, taken away for jargon
        s/(\n)\n/$1/g;
    }

    # special macro-conversion
    if ($inverb eq nej) {
        s/\$/&dollar;/gi;
        # Every "#" that does not belong to a numeric entity (&#...),
        # including one at the start of the line or right after another "#".
        s/(?<!&)#/&num;/g;
        s/%/&percnt;/gi;
        s/~/&tilde;/gi;
        s/\\/&bsol;/gi;
        s/\[/&lsqb;/g;
        s/\]/&rsqb;/g;
        s/\^/&circ;/g;
        s/_/&lowbar;/g;

    }

    if ($inquote eq ja) {
        s/^\n//gi;
    }

    # No /i here: a case-insensitive match would also remove the lowercase
    # <p> that was generated after the headings above.
    s/<P>|^<\/P>$//g;

    if ($inquote eq nej) {

        s/<\/P>/\n/gi;
    } else {
        s/<\/P>//gi;
    }


    if ($inquote eq ja) {
        if (/^\s*$/) {
            # Don't do anything, just remove empty lines from quotes
        } else {
            print sgml;
        }
    } else {
        print sgml;
    }

}
print sgml "</article>";
close sgml;

# Convert any gifs to ps (needs the programs giftopnm, ppmtopgm and
# pnmtops).  The output file is named after the part of the picture name
# before its first dot, with ".ps" appended.
#
# An entry in @pics can carry further text after the file name, separated
# by whitespace; every whitespace separated word is tried and the ones
# that do not name an existing file are skipped.  The names are taken
# literally: shell wildcards in them are not expanded.
foreach my $pic (map { split(' ', $_) } @pics) {
    next unless -e $pic;

    my ($name) = split(/\./, $pic);

    # Single-quote both file names for the shell, so that characters in a
    # name taken from the HTML file cannot be run as part of the command.
    (my $shell_in = $pic) =~ s/'/'\\''/g;
    (my $shell_out = "$name.ps") =~ s/'/'\\''/g;

    system("giftopnm '$shell_in' | ppmtopgm | pnmtops -noturn > '$shell_out'") == 0
        or warn "Conversion of $pic to $name.ps may have failed\n";
}

exit;





