########################################################################
# Document Type: FORM
# Author: Len Budney
#
# Description:
#   This filter reads an existing SGML document into Perl, and
#   copies it to standard out prettified.
#
# Usage:  
#   nlsgmls <file> | sgmlspl <this_filter> [spec] > <new_file>
########################################################################

use SGMLS;			# Use the SGMLS package.
use SGMLS::Output;		# Use stack-based output.
use Text::Format;		# Text formatter a la "fmt"

INITIALIZE:{
    #-----------------------------------------------------------------------
    # Before we do any parsing, we read the spec file which specifies
    # which elements are empty, and which are "inline" elements.  At
    # the moment, spec files are not implemented.
    #-----------------------------------------------------------------------

    # Determine the name of the state file, if any was supplied on the
    # command line.
    my( $filename ) = shift;

    if( $filename ){
        # Open the file, and read it.
        open( FILE, $filename ) or die "Could not open state file.";
	# UNIMPLIMENTED
        close FILE;
    }
    $fmt = new Text::Format;

    #-----------------------------------------------------------------------
    # For now, we specify some quirks of HTML right here.  Later, this
    # will be relegated to a spec file.
    #-----------------------------------------------------------------------
    foreach $element ('HR', 'BR', 'IMG', 'LINK', 'META'){

	# Mark those elements which are empty.
	$EMPTY{$element} = "1";

    }
    foreach $element ('A', 'IMG', 'CITE', 'EM', 'STRONG', 'BR', 'FONT',
		      'B', 'IT', ){

	# Mark those elements which are "inline"; i.e. those elements
	# which do not start brand-new "paragraphs" of their own.
	$INLINE{$element} = "1";

    }
    foreach $element ('PRE'){

	# Mark those elements which require "literal" processing of 
	# their input, to avoid reformatting.
	$LITERAL{$element} = "1";

    }
    foreach $element ('P', 'LI', 'H1', 'H2', 'H3', 'DD', 'DT',
		      'TITLE', 'ADDRESS', 'TD', 'TH'){

	# Mark those non-inline elements which need formatting for display.
	$LEAF{$element} = '1';

    }
    foreach $element ('HTML', 'HEAD', 'BODY', 'UL', 'OL', 'DL', 'TR',
		      'TABLE', 'CENTER', 'TBODY'){
	
	# Mark those elements which contain no CDATA.
	$NO_CDATA{$element} = '1';
    }
    foreach $element ('UL', 'OL', 'DL', 'TABLE', 'CENTER'){
	
	# Mark those elements which want a blank line to precede them.
	$LINE_BEFORE{$element} = '1';
    }
    # Start with "literal mode" turned off.
    $literal_mode = "";
    # At the start of the document, we are nested 0 levels deep.
    $nesting_level = 0;
    # At the moment, output is going straight to stdout.
    $output_saved = ();
    # Sometimes, we just want to throw output away.
    $discarding_output = 0;
    # Track the current tag we are in.
    @current_tag  = ();
    #-----------------------------------------------------------------------
}
#---------------------------------------------------------------------------

#---------------------------------------------------------------------------
#
# Document Handlers.
#
sgml('start', sub {
});
sgml('end', "\n"); # Neatness counts!


#
# Default handlers. Since this script maps HTML->HTML, the default
# behavior is to pass tags through unmodified.  We provide a function
# which performs this copy.
#
sgml('start_element', sub{ 

    # Check the name of the tag.
    my($tag)=$_[0]->name;
    unshift @current_tag, $tag;

    # Print a doctype statement for the "HTML" element.
    if( $tag eq "HTML" ){
	my $doctype = $_[0]->attribute("VERSION")->value;
	output "<!DOCTYPE HTML PUBLIC \"$doctype\">\n";
	$nesting_level--;
    }

    # Keep track of our depth in the document.
    if( $NO_CDATA{$tag} and not $EMPTY{$tag} ){
	$nesting_level++;
    }

    # Turn on "literal mode" if this is a "literal" tag.
    $literal_mode = 1 if $LITERAL{$tag};

    # Stop any formatting that is pending, if something non-leaf
    # or literal mode.
    if( ($NO_CDATA{$tag} and not $EMPTY{$tag}) or $literal_mode ){
	if( $#current_tag >= 1 and $output_saved[0] eq $current_tag[1] ){
	    &format_content($tag, &pop_output);   
	    &output("\n");
	    shift( @output_saved );
	}
    }

    # Push output onto a stack for formatting, if this
    # is not an inline tag.
    if ($LEAF{$tag} and not $literal_mode and not $EMPTY{$tag}){

	# Save output in a string for processing.
	push_output('string');
	unshift( @output_saved, $tag );

    }
    # Print some indentation, if this tag has no formattable content
    # and isn't inside a formattable tag.
    elsif ($LINE_BEFORE{$tag}) {
	output "\n  " x $nesting_level;
    }
    elsif (not $INLINE{$tag}) {
	output "  " x $nesting_level;
    }
    elsif( $EMPTY{$tag} and not grep( do{ $LEAF{$_}; }, @current_tag ) ){
	output "\n  " x ($nesting_level + 1) ;
    }

    # Produce an exact copy of the element tag, including attributes.
    &copy_element(@_);

    # Print a carriage return under the appropriate conditions.
    if( $EMPTY{$tag} and not grep( do{ $LEAF{$_}; }, @current_tag ) ){
	output "\n";
    }
    elsif( $NO_CDATA{$tag} ){
	output "\n";
    }
});
sgml('end_element', sub{ 

    # Check the name of the tag.
    my($tag)=$_[0]->name;
    shift @current_tag;

    # Stop discarding output at the right time.
    if( $discarding_output eq $tag ){
	$discarding_output = 0;
	&pop_output;
	return;
    }

    # Keep track of our depth in the document.
    $nesting_level-- if $NO_CDATA{$tag} and not $EMPTY{$tag};

    # If this is the end tag for the tag which started the
    # saving of output, then format it now.
    &format_content($tag, &pop_output." </$tag>"), 
        return if $output_saved[0] eq $tag;

    # If the current tag is not inline, and execution reaches this
    # point, then indentation is needed for neatness' sake.
    output "  " x ($nesting_level + 1) unless $INLINE{$tag} or $EMPTY{$tag};

    # Print the tag closure unless we are dealing with an empty tag.
    output "</$tag>" unless $EMPTY{$tag};

    # Print a carriage return if this is not an inline tag.
    output "\n" unless $INLINE{$tag};

    # Turn off literal mode, if it is turned on.
    $literal_mode = "" if $literal_mode and $LITERAL{$tag};
});

#
# What to do with textual data.
#
sgml('cdata',sub { 
    my $data = $_[0];
    return if $NO_CDATA{$current_tag[0]}; 

    $data =~ s/&/&amp;/g;
    $data =~ s/</&lt;/g;
    $data =~ s/>/&gt;/g;
    $data =~ s/\"/&quot;/g;
    output $data;
});

#
# Element Handlers.  These handlers correspond to the events raised
# by specific SGML elements.
#

#---------------------------------------------------------------------------
# Useful subroutines.
#---------------------------------------------------------------------------

sub copy_element {
    # Copy an element verbatim, along with all of its attributes.
    my( $element, $event ) = @_;

    my( $tag ) = $element->name;
    output "<$tag";

    foreach $attr ( $element->attribute_names ) {
	$val = $element->attribute($attr)->value;

	# Most empty attributes are not important.
	next unless $val or ($attr eq "ALT" and $tag eq "IMG");

	# Entityfy all "<" and "&" characters.
	$val =~ s/&/&amp;/g;
	$val =~ s/</&lt;/g;
	$val =~ s/>/&gt;/g;
	$val =~ s/\"/&quot;/g;
	output " $attr=\"$val\"";
    }

    output ">";
}


sub format_content {

    my($tag, $output) = @_;
    $output =~ s/\n/ /g;
    $output =~ s/\s+/ /g;
    my($margin) = $nesting_level * 2;

    # Prepare the text formatter for its new task, setting
    # indentation, etc.
    $fmt->config( {
	columns=>72, 
	leftMargin=>$margin,
	firstIndent=>0,
	} );
    output( $fmt->format( "$output" ) );
    shift @output_saved;
    return;
}
1;

