#!/usr/bin/perl -w

###################### BEGIN SETUP #########################

use strict;
use HTML::TreeBuilder;

####################### END SETUP ##########################

################### BEGIN DEFINITIONS ######################

# Set to a true value to make texify() print a trace of what it visits.
my $debug = 0;

# Fallback values, used when the HTML does not carry the matching attribute.
my $default_image_scale    = 0.5;        # <img scale="...">
my $default_table_border   = 0;          # <table border="...">
my $default_font_size      = 12;         # <html fontsize="...">, in points
my $default_document_class = "article";  # <html class="...">

# LaTeX packages named in the generated \usepackage line.
my @packages = ("fullpage", "graphicx");

# Dispatch table: lower-case HTML tag name => [ handler, argument ].
# texify() looks up each element's tag here and calls the handler as
#   handler(argument, \@content, $element)
# Tags that are not listed simply have their content converted.
my %tags = (
    # <tag>text</tag>  ->  \argument{text}
    "h1"     => [\&command_handler, "section"       ],
    "h2"     => [\&command_handler, "subsection"    ],
    "h3"     => [\&command_handler, "subsubsection" ],
    # \textbf / \textsl take an argument.  The old-style \bf and \sl are
    # declarations: "\bf{text}" switches the font for the rest of the
    # enclosing group, not just for "text".
    "h4"     => [\&command_handler, "textbf"        ],
    "h5"     => [\&command_handler, "textbf"        ],
    "h6"     => [\&command_handler, "textbf"        ],
    "b"      => [\&command_handler, "textbf"        ],
    "i"      => [\&command_handler, "textsl"        ],
    "title"  => [\&command_handler, "title"         ],

    # <tag>text</tag>  ->  \begin{argument} text \end{argument}
    "body"   => [\&environment_handler, "document"  ],
    "ol"     => [\&environment_handler, "enumerate" ],
    "ul"     => [\&environment_handler, "itemize"   ],
    "center" => [\&environment_handler, "center"    ],

    # <tag>text  ->  argument text
    "li"     => [\&single_handler, "\\item"         ],
    "br"     => [\&single_handler, "\\\\"           ],
    # LaTeX only starts a new paragraph after a blank line.
    "p"      => [\&single_handler, "\n\n"           ],
    # \hline is only valid inside tabular/array, so draw a full-width rule.
    "hr"     => [\&single_handler, "\\par\\noindent\\rule{\\linewidth}{0.4pt}\\par" ],

    # table_handler only distinguishes "table" from everything else.
    "table"  => [\&table_handler, "table"           ],
    "tr"     => [\&table_handler, "tr"              ],
    "td"     => [\&table_handler, "td"              ],

    "img"    => [\&image_handler, "includegraphics" ],

    "html"   => [\&preamble_handler,"documentclass" ]
    );

#################### END DEFINITIONS #######################

###################### BEGIN MAIN #########################

# Convert every HTML file named on the command line into a .tex file
# with the same base name.
my @filenames = get_filename_bases(@ARGV);

foreach my $filename (@filenames) {
    check_for_previous_files("$filename.tex");  # an existing .tex is moved to .tex.old
    open_files($filename);                       # opens the global IN and OUT handles

    my $tree = HTML::TreeBuilder->new;
    $tree->warn(1);
    $tree->parse_file(*IN);
    print OUT texify($tree->root());

    close_files($filename);

    # Release the parse tree explicitly so that converting many files in
    # one run does not keep every tree in memory.
    $tree->delete;

    #latex2pdf($filename);
}

####################### END MAIN ##########################

##################### BEGIN HANDLERS ######################

# Wraps the converted content of an element in a one-argument LaTeX command.
# HTML input form:   <FOO> Bar </FOO>
# Latex output form: \tex_foo{ Bar }
# <1> tex_foo, the command name stored for the tag in the %tags hash
# <2> reference to the array holding the element's content
# <3> the element itself (not used here)
# Returns the latex output followed by a newline
sub command_handler {
    my ($command, $content_ref, $html_element) = @_;
    my $argument = texify(@{$content_ref});
    return join("", "\\", $command, "{", $argument, "}", "\n");
}

# Wraps the converted content of an element in a LaTeX environment.
# HTML input form:   <FOO> Bar </FOO>
# Latex output form: \begin{tex_foo} Bar \end{tex_foo}
# <1> tex_foo, the environment name stored for the tag in the %tags hash
# <2> reference to the array holding the element's content
# <3> the element itself (not used here)
# Returns the latex output followed by a newline
sub environment_handler {
    my ($environment, $content_ref, $html_element) = @_;
    my $body = texify(@{$content_ref});
    return sprintf("\\begin{%s}\n%s\\end{%s}\n", $environment, $body, $environment);
}

# Emits a fixed piece of LaTeX in front of the converted content.
# HTML input form:   <FOO> Bar (implicit end)
# Latex output form: tex_foo Bar
# <1> tex_foo, the literal LaTeX text stored for the tag in the %tags hash
# <2> reference to the array holding the element's content
# <3> the element itself (not used here)
# Returns the latex output followed by a newline
sub single_handler {
    my ($single, $content_ref, $html_element) = @_;
    my $body = texify(@{$content_ref});
    return join("", $single, " ", $body, "\n");
}

# Handler for <TABLE>, <TR> and <TD>.
# A top-level table is handed to create_latex_table(), which walks its rows
# and cells itself and takes care of the "\\" and "&" punctuation.  A <TR>
# or <TD> that reaches this handler directly just has its content converted.
# Nested tables are reported on STDERR and produce no output.
# <1> "table" for a <TABLE>; any other value is treated as a row or cell
# <2> reference to the array holding the element's content
# <3> the element itself
# Returns the latex output (possibly the empty string)
sub table_handler {
    my ($tex, $content_ref, $html_element) = @_;

    # Row or cell: use the content reference passed in by the caller rather
    # than calling content() again, which can return undef for an empty
    # element.
    if ($tex ne "table") {
        return texify(@{ $content_ref || [] });
    }

    # A table without a parent cannot be nested, so only ask the parent
    # when there is one.
    my $parent = $html_element->parent();
    if ($parent and $parent->is_inside('table')) {
        print STDERR "Nested tables not allowed!!!\n";
        return "";
    }

    return create_latex_table($html_element);
}


# HTML input form:   <IMG src="blah">
# Latex output form: \includegraphics[scale=...]{file} inside a center environment
# The arguments are the same as for the other handlers.
# The image is passed to convert_image() first; when that does not yield a
# usable file, the value of the alt attribute is returned instead.
sub image_handler {
    my ($tex, $content_ref, $html_element) = @_;

    my $source = $html_element->attr('src')   || "";
    my $scale  = $html_element->attr('scale') || $default_image_scale;
    my $alt    = $html_element->attr('alt')   || "";

    my $image = convert_image($source, $scale);
    return $alt unless $image;    # conversion didn't work

    return join("\n",
        "\\begin{center}",
        sprintf("\\%s[scale=%s]{%s}", $tex, $scale, $image),
        "\\end{center}",
        "");
}

# Handler for <HTML>: writes the document preamble, then the converted content.
# The document class comes from the element's class attribute and the font
# size (in points) from its fontsize attribute; both fall back to the
# script defaults.  The \usepackage line lists everything in @packages.
# <1> the command name stored for the tag in the %tags hash
# <2> reference to the array holding the element's content
# <3> the element itself
# Returns the latex output
sub preamble_handler {
    my ($tex, $content_ref, $html_element) = @_;
    my $document_class = $html_element->attr('class')    || $default_document_class;
    my $font_size      = $html_element->attr('fontsize') || $default_font_size;

    my $class_line   = sprintf("\\%s[%spt]{%s}\n", $tex, $font_size, $document_class);
    my $package_line = '\usepackage{' . join(", ", @packages) . '}' . "\n";
    my $body         = texify(@{$content_ref});

    return $class_line . $package_line . $body;
}

###################### END HANDLERS #######################

####################### BEGIN SUBS ########################

# Converts a list of HTML::Element objects and plain text strings into one
# LaTeX string.
# Elements whose tag appears in %tags are passed to their handler, which
# normally calls texify again on the element's content; other elements
# just have their content converted.  Plain text is escaped so that
# characters special to LaTeX are printed literally.
# Every item is preceded by a single space.
# Returns the resulting string ("" for an empty list)
sub texify {
    my @html_elements = @_;
    print " <texify called on " . scalar(@html_elements) . " elements>: \n" if $debug;
    my $output = "";

    foreach my $html_element (@html_elements) {
        print "  <found html_element: $html_element>\n" if $debug;
        $output .= " "; # space out the elements

        unless (ref $html_element) { # just a string
            $output .= _escape_latex($html_element);
            next;
        }

        my $tag = $html_element->tag();
        my $content_ref = $html_element->content() || [];
        print "   <html_element has tag $tag>\n" if $debug;

        if (my $entry = $tags{$tag}) { # the tag has a handler, use it
            my ($handler_ref, $tex) = @{$entry};
            $output .= $handler_ref->($tex, $content_ref, $html_element);
        } else { # otherwise, just texify the contents
            $output .= texify(@{$content_ref});
        }
    }

    return $output;
}

# Escapes the characters LaTeX treats specially so that text copied from
# the HTML is printed literally instead of being read as markup.
# <1> the text
# Returns the escaped text ("" when the text is undefined)
sub _escape_latex {
    my $text = shift;
    return "" unless defined $text;

    # Kept inside the sub on purpose: the main loop calls texify before any
    # file-level initialisation placed this far down the file would have run.
    my %replacement = (
        "\\" => "\\textbackslash{}",
        "~"  => "\\textasciitilde{}",
        "^"  => "\\textasciicircum{}",
    );

    # One pass, so the backslashes added here are not escaped again.
    $text =~ s/([\\{}\$&%#_~^])/exists $replacement{$1} ? $replacement{$1} : "\\$1"/ge;

    return $text;
}

# Runs "pdflatex" on <base>.tex to produce <base>.pdf.
# An existing <base>.pdf is moved out of the way first.
# <1> The base of the filename
# Dies if pdflatex cannot be started or if its output contains error lines.
# Otherwise returns what check_for_current_files returns for the pdf.
sub latex2pdf {
    my $filename = shift;
    check_for_previous_files("$filename.pdf");

    # List-form pipe open: the filename is passed to pdflatex as a single
    # argument without going through a shell, so spaces or shell
    # metacharacters in it are harmless.
    open(my $pdflatex, '-|', 'pdflatex', '-interaction=nonstopmode', "$filename.tex")
        or die "Can't run pdflatex: $!\n";
    my $pdflatex_return = do { local $/; <$pdflatex> };
    close $pdflatex; # exit status is not used; errors are found in the output
    $pdflatex_return = "" unless defined $pdflatex_return;

    my $errors = get_tex_errors($pdflatex_return);
    if ($errors) {
        die "Unable to create the pdf due to the following errors: $errors\n";
    }

    return check_for_current_files("$filename.pdf");
}


# Extracts the error lines from the output of pdflatex.
# Error lines are the ones that begin with "!".
# <1> the text printed by pdflatex
# Returns the error lines joined by a space, or "" if there are none
sub get_tex_errors {
    my $tex_return = shift;
    return "" unless defined $tex_return;

    # Anchored at the start of a line: a "!" in the middle of a line is
    # ordinary text, not an error marker.  The newline is optional so an
    # error on an unterminated last line is still found.
    my @errors = $tex_return =~ /^(![^\n]*\n?)/mg;

    return "@errors";
}

# Opens <base>.html for reading on the global handle IN and <base>.tex for
# writing on the global handle OUT, then announces the file on STDOUT.
# <1> The base of the filename
# Dies if either file cannot be opened.
sub open_files {
    my $filename = shift;

    # Three-argument open: the name is used exactly as given, so leading or
    # trailing whitespace and characters such as ">" or "|" in it are not
    # interpreted as part of the open mode.
    open(IN, '<', "$filename.html") or die "Can't open $filename.html: $!";
    open(OUT, '>', "$filename.tex") or die "Can't open $filename.tex: $!";
    print "Processing $filename.html\n";
}

# Closes the global handles IN and OUT.
# <1> The base of the filename (only used in the error messages)
# Dies if either handle cannot be closed.
sub close_files {
    my $filename = shift;

    die "Can't close $filename.html: $!" unless close(IN);
    die "Can't close $filename.tex: $!"  unless close(OUT);
}

# Turns the filenames given on the command line into base names by
# removing a trailing ".html" (in any letter case).
# Names without that extension are returned unchanged.
# <1..n> filenames
# Returns the list of base names, in the same order
sub get_filename_bases {
    my @filenames = @_;

    # Anchored at the end of the name so that ".html" occurring earlier in
    # a path (e.g. "site.html/page.html") is left alone.
    s/\.html\z//i for @filenames;

    return @filenames;
}


# If the file already exists, moves it to the same name with ".old"
# appended and reports that on STDERR.  A failed rename is reported too,
# because the caller is about to overwrite the file.
# <1> filename
# Returns 1 if the file was renamed, 0 otherwise
sub check_for_previous_files {
    my $filename = shift;
    return 0 unless -f $filename;

    if (rename $filename, "$filename.old") {
        print STDERR "renamed $filename $filename.old\n";
        return 1;
    }

    print STDERR "Can't rename $filename to $filename.old: $!\n";
    return 0;
}

# Checks whether a file exists and reports the result on STDERR.
# <1> filename
# Returns the filename if the file exists, 0 otherwise
sub check_for_current_files {
    my $filename = shift;

    unless (-f $filename) {
        print STDERR "Failed to create $filename\n";
        return 0;
    }

    print STDERR "Successfully created $filename\n";
    return $filename;
}

# Makes an image usable by pdflatex.
#   png          - used as it is
#   gif          - converted to <base>.png with gif2png
#   jpg / jpeg   - converted to <base>.ps with jpeg2ps and then to
#                  <base>+pic.pdf with ps2pdf
# <base> is the source name up to its last ".".  Existing output files are
# moved out of the way with check_for_previous_files first.
# <1> filename of the image
# [2] scale (accepted for compatibility; not used by the conversion)
# Returns the name of the file to include if successful, 0 otherwise
sub convert_image {
    my $source = shift;
    my $scale = shift || $default_image_scale;
    $source = "" unless defined $source;

    # Split at the LAST dot of the final path component, so names such as
    # "../pics/a.png" or "my.photo.jpg" get the right extension.
    my ($base, $extension) = $source =~ m{\A(.*)\.([^./\\]+)\z}s;
    $extension = defined $extension ? lc $extension : "";

    if ($extension eq "png") {
        return $source;
    }

    if ($extension eq "gif") {
        my $pic_filename = "$base.png";
        check_for_previous_files($pic_filename);
        _run_image_tool(['gif2png', '-O', $source]);
        return check_for_current_files($pic_filename);
    }

    if ($extension =~ /\Ajpe?g\z/) {
        my $pic_filename = "$base+pic.pdf";
        my $ps_filename = "$base.ps";

        check_for_previous_files($ps_filename);
        _run_image_tool(['jpeg2ps', $source], $ps_filename);
        return 0 unless check_for_current_files($ps_filename);

        check_for_previous_files($pic_filename);
        _run_image_tool(['ps2pdf', $ps_filename, $pic_filename]);
        return check_for_current_files($pic_filename);
    }

    print STDERR "Unknown file type: $source\n";
    return 0;
}

# Runs an external conversion tool without going through a shell, so file
# names taken from the HTML cannot be interpreted as shell syntax.
# <1> reference to an array: the program name followed by its arguments
# [2] file that receives the tool's standard output; when omitted the
#     output is discarded
# Returns 1 if the tool was started (and the output file written), 0 otherwise
sub _run_image_tool {
    my ($command_ref, $capture_file) = @_;

    my $pipe;
    unless (open($pipe, '-|', @{$command_ref})) {
        print STDERR "Can't run $command_ref->[0]: $!\n";
        return 0;
    }
    binmode $pipe;
    my $data = do { local $/; <$pipe> };
    close $pipe; # success is judged by the caller from the files produced

    return 1 unless defined $capture_file;

    my $out;
    unless (open($out, '>', $capture_file)) {
        print STDERR "Can't write $capture_file: $!\n";
        return 0;
    }
    binmode $out;
    print {$out} $data if defined $data;
    unless (close $out) {
        print STDERR "Can't close $capture_file: $!\n";
        return 0;
    }

    return 1;
}

# Creates a latex table from an html table using the other table sub procedures.
# The table becomes a tabular environment inside a center environment.
# Rows that have fewer <TD> cells than the widest row are padded with empty
# cells.  When the table has a border, an \hline is written before the
# first row and after every row.
# <1> The $html_element that is a table tag.
# Returns the table in latex string form
sub create_latex_table {
    my $table = shift;
    my ($latex_table_ref, $row_number, $column_number) = create_latex_table_def($table);
    my $border = $table->attr('border') || $default_table_border;

    my $output = '\begin{center}\begin{tabular}{' . $latex_table_ref . '}' . "\n";
    $output .= "\\hline \n" if $border;

    my @rows = $table->find_by_tag_name('tr');
    for my $row_index (0 .. $#rows) {
        my @columns = $rows[$row_index]->find_by_tag_name('td');

        for my $i (0 .. $column_number - 1) { # fill in blank cells if necessary
            my $column = $columns[$i];
            # content() may be undef for an empty cell
            $output .= texify(@{ $column->content() || [] }) if $column;
            $output .= " &" if $i < $column_number - 1; # separator, except after the last cell
        }

        # End the row with "\\" unless it is the last one; with a border
        # every row needs it because an \hline follows.  The position in
        # the row list is used, not pindex(): the index inside the parent
        # is off whenever the rows have non-row siblings (a caption, for
        # instance) or live in separate row groups.
        $output .= " \\\\" if $border or $row_index < $row_number - 1;
        $output .= "\n";
        $output .= " \\hline \n" if $border;
    }

    $output .= '\end{tabular}\end{center}' . "\n";

    return $output;
}

# Builds the latex column definition of a table (i.e. "cccc" or "|l|c|r|").
# Each column uses the alignment found by create_column_alignments, or "c"
# when there is none for it.  With a border, a "|" is placed before the
# first column and after every column.
# <1> The $html_element that is a table tag.
# Returns the table definition, the number of rows and the number of columns
sub create_latex_table_def {
    my $table = shift;
    my $border = $table->attr('border') || $default_table_border;
    my ($row_number, $column_number) = find_table_lengths($table);
    my @column_alignments = create_column_alignments($table);

    my $rule = $border ? "|" : "";
    my $latex_table_def = join("",
        $rule,
        map { ($column_alignments[$_] || "c") . $rule } 0 .. $column_number - 1);

    return ($latex_table_def, $row_number, $column_number);
}

# Counts the rows of a table and finds the largest number of <TD> cells
# that any of its rows has.
# <1> the reference to the HTML::Element table.
# Returns the number of rows and the maximum number of columns
sub find_table_lengths {
    my $table = shift;
    my @rows = $table->find_by_tag_name('tr');

    my $widest = 0;
    for my $row (@rows) {
        # list assignment in scalar context gives the number of cells
        my $cell_count = () = $row->find_by_tag_name('td');
        $widest = $cell_count if $cell_count > $widest;
    }

    #        row_number     column_number
    return (scalar(@rows), $widest);
}

# Works out the alignment of each column from the <TD> cells of the
# table's first row: align="left" gives 'l', align="right" gives 'r' and
# anything else (including no align attribute) gives 'c'.  The attribute
# value is compared without regard to letter case.
# <1> the reference to the HTML::Element table.
# Returns an array of column alignments (empty if the table has no rows)
sub create_column_alignments {
    my $table = shift;
    my @column_alignments;

    # Called in scalar context, so only one row is looked at (the first
    # match, per the HTML::Element documentation).
    my $row = $table->find_by_tag_name('tr');
    if ($row) {
        my %letter_for = (left => 'l', right => 'r');
        foreach my $column ($row->find_by_tag_name('td')) {
            # HTML attribute values such as "LEFT" or "Right" are valid too.
            my $align = lc($column->attr('align') || '');
            push @column_alignments, $letter_for{$align} || 'c';
        }
    }

    return @column_alignments;
}

########################## END SUBS #############################






