#! /usr/bin/perl -w

# Example perl file - extract H1,H2 or H3 headers from HTML files
# Run via:
#   perl this-perl-script.pl [-o outputfile] input-file(s)
# E.g.
#   perl proto-getH1.pl -o headers *.html
#   perl proto-getH1.pl -o output.txt homepage.htm
#
# Russell Quong         2/19/98

require 5.003;			# need this version of Perl or newer
use English;			# use English names, not cryptic ones
use FileHandle;			# use FileHandles instead of open(),close()
use Carp;                       # get standard error / warning messages
use strict;			# force disciplined use of variables

## File-wide settings and state shared by the subroutines below.
my $author  = "Russell W. Quong";
my $version = "Version 1.0";
my $reldate = "Jan 1998";

my $lineno    = 0;              # line number within the file being read
my $OUT       = \*STDOUT;       # stream results are printed to; stdout by default
my @headerArr = ();             # every heading found so far, over all input files

  # fyi (MESSAGE);
  #   Print a non-crucial, for-your-information message followed by a
  #   newline on the currently selected output handle.
  #   Routing every such message through this one function means the
  #   chatter can be silenced or redirected by editing a single place.
sub fyi ($) {
    my $message = shift;
    print $message . "\n";
}

sub main () {
    fyi("perl script = $PROGRAM_NAME, $version, $author, $reldate.");
    handle_flags();
      # handle remaining command line args, namely the input files
    if (@ARGV == 0) {           # @ARGV used in scalar context = number of args
        handle_file('-');
    } else {
        my($i);
        foreach $i (@ARGV) {
            handle_file($i);
        }
    }
    postProcess();              # additional processing after reading input
}

  # handle_flags ();
  #   Consume the option flags at the front of @ARGV, leaving only the
  #   input file names behind.  Flags begin with a '-' (dash or minus sign).
  #
  #   The only flag understood is -o (any argument starting with "-o" is
  #   accepted, as before); the argument that follows it names the output
  #   file, which is opened for writing and installed as $OUT.  If -o is
  #   given more than once, the last one wins.  Scanning stops at the
  #   first argument that is not a -o flag.
  #
  #   Croaks if -o has no file name after it or the file cannot be opened;
  #   in both cases $OUT is left untouched.
sub handle_flags () {
      # Test and shift the head of @ARGV directly: shifting an array from
      # inside a foreach over that same array makes the loop skip elements.
    while (@ARGV && $ARGV[0] =~ /^-o/) {
        my($flag) = shift @ARGV;        # discard the -o flag itself
        if (@ARGV == 0) {
            croak "Missing output file name after $flag.  Bye-bye.";
        }
        my($oname) = shift @ARGV;       # the argument after -o

          # Passing the mode separately stops characters in the file name
          # from being interpreted as part of the open mode.
        my($fh) = FileHandle->new($oname, "w");
        if (! defined($fh)) {
            croak "Unable to open output file: $oname ($!).  Bye-bye.";
        }
        $OUT = $fh;
    }
}

  # handle_file (FILENAME);
  #   Scan one input source for headings.
  #   If FILENAME is '-', standard input is read and reported under the
  #   name "[stdin]".  Otherwise FILENAME is opened read-only, handed to
  #   read_file() and closed again.
  #   A file that cannot be opened is reported with a warning on stderr
  #   and skipped, so the remaining input files are still processed.
sub handle_file ($) {
    my($infile) = @_;
    fyi(" handle_file($infile)");
    if ($infile eq "-") {
        read_file(\*STDIN, "[stdin]");  # \*STDIN=input stream for STDIN.
        return;
    }

      # Give the mode explicitly: with only a file name, a name starting
      # with '>' or '<' or ending in '|' would be taken as an open mode or
      # a command to run rather than as a file to read.
    my($IN) = FileHandle->new($infile, "r");
    if (! defined($IN)) {
          # a real error, so it goes to stderr and not through fyi(),
          # whose stdout chatter would mix with the extracted headings
        warn "Can't open input file $infile: $!\n";
        return;
    }
    read_file($IN, $infile);    # $IN = file handle for $infile
    $IN->close();               # done, close the file.
}

  # read_file (INPUT_STREAM, filename);
  #   Pass every line of INPUT_STREAM, with its trailing newline removed,
  #   to do_line() together with its 1-based line number and the name
  #   the input should be reported under.
  #   The file-level $lineno counter restarts at zero on every call.
sub read_file ($$) {
    my($IN, $filename) = @_;
    my($text);
    $lineno = 0;                        # numbering starts afresh per file
    while ( defined($text = <$IN>) ) {
        chomp($text);                   # drop the trailing newline, if any
        do_line($text, ++$lineno, $filename);
    }
}

  # do_line (line of text data, line number, filename);
  #   Find every <Hx> .... </Hx> heading on one line of text, where
  #   Hx = H1, H2 or H3 in either case.  For each heading found, the
  #   text between the tags is
  #     - announced via fyi(),
  #     - printed to $OUT as "filename, lineno: heading",
  #     - appended to @headerArr as "heading (filename, lineno)".
  #   The opening tag may carry attributes (<H2 align=center>), and the
  #   closing tag must be of the same level as the opening one.
  #   A heading that spans several lines is not detected.
sub do_line ($$$) {
    my($line, $lineno, $filename) = @_;
    my($heading);
      # (.*?) is non-greedy so that two headings on the same line stay
      # separate; \1 forces </Hx> to match the level of the opening tag.
      # The /g flag in a while loop visits each heading on the line in turn.
    while ( $line =~ m{<(H[123])\b[^>]*>(.*?)</\1\s*>}gi ) {
        $heading = $2;          # text between the opening and closing tag
        fyi("FYI: $filename, $lineno: Found ($heading)");
        print $OUT "$filename, $lineno: $heading\n";

          # also remember the heading for the sorted summary
        push(@headerArr, "$heading ($filename, $lineno)");
    }
}
    
  # postProcess ();
  #   Write the summary section to $OUT: a banner line, every entry of
  #   @headerArr in ascending string order (one per line), and finally a
  #   "Generated ..." line carrying the current local time.
sub postProcess() {
    print $OUT "\n--- SORTED HEADERS ---\n";
    my($entry);
    foreach $entry (sort @headerArr) {  # default sort = string comparison
        print $OUT "$entry\n";
    }
    my $stamp = localtime();            # scalar context: date/time string
    print $OUT "\nGenerated $stamp.\n";
}
 # Script body: everything above only defines variables and subroutines;
 # the actual work starts with this call to main().
 #
main();
0;              # final statement; a bare constant has no effect on the exit status
