windows-xp/Source/XPSP1/NT/shell/lib/htmlcln.pl


								#!perl


								=head1 NAME


								Htmlcln - Sanitizes HTML and related files before publication


								=head1 SYNOPSIS


								perl htmlcln.pl [-t [html|js|css]] [-DVAR[=value] ...] [-v]

								-o I<outfile> I<infile>


								=head1 DESCRIPTION


								The Htmlcln program preprocesses text files such as HTML, JS, or CSS

								files and cleans them up.


								=over 8

								Comments are removed.


								Blank lines are removed.


								Sections marked "debug" are removed in the retail build.


								=back


								=head1 OPTIONS


								=over 8


								=item B<-t> [html|js|css]


								Htmlcln normally tries to guess what kind of file it is processing from

								the filename extension.  You can explicitly override the guess with the

								B<-t> command line switch.


								=item B<-D>VAR[=value] ...


								Command line definitions are supported in the same manner as the C compiler.

								The only command-line variable we pay attention to is the -DDBG flag, which

								indicates that this is a debug build.


								=item B<-o> I<outfile>


								Specifies the name of the output file.


								=item I<srcfile>


								Specifies the name of the source file.


								=back


								=cut


								use strict qw(vars refs subs);


								##############################################################################

								#

								#   Element -  A class that parses HTML elements

								#

								#   Instance variables:

								#

								#       raw         = the raw text

								#       attr        = hash of attributes

								#       tag         = name of tag, including slash

								#

								#       If the value of an attribute hash is undef, it means that the

								#       attribute is present but with no value.

								#


								package Element;


								#

								#   Constructor:  $elem = new Element("<TABLE BORDER>");

								#

								sub new {

								    my ($class, $raw) = @_;

								    my $attr = { };

								    my $self = { raw => $raw, attr => $attr };

								    my $tag;


								    if ($raw =~ s/^<([^\s>]*)//) {

								        $self->{tag} = uc $1;

								        if ($self->{tag} =~ /^[A-Z]/) {

								            $raw =~ s/>$//;

								            for (;;) {

								                if ($raw =~ s/^\s*([-A-Za-z]+)="([^"]*)"// ||

								                    $raw =~ s/^\s*([-A-Za-z]+)='([^']*)'// ||

								                    $raw =~ s/^\s*([-A-Za-z]+)=(\S*)//) {

								                    $attr->{uc $1} = $2;

								                } elsif ($raw =~ s/^\s*([A-Za-z]+)//) {

								                    $attr->{uc $1} = undef;

								                } else {

								                    last;

								                }

								            }

								        }

								    } else {

								        warn "Can't parse \"$raw\"";

								    }


								    bless $self, $class;


								}


								#

								#   Element::Tag

								#

								#   Returns the tag.

								#

								sub Tag {

								    my $self = shift;

								    $self->{tag};

								}


								#

								#   Element::Attr

								#

								#   Returns the value of the attribute.

								#

								sub Attr {

								    my ($self, $attr) = @_;

								    $self->{attr}{uc $attr};

								}


								#

								#   Element::Exists

								#

								#   Returns the presence of the attribute.

								#

								sub Exists {

								    my ($self, $attr) = @_;

								    exists $self->{attr}{uc $attr};

								}


								##############################################################################

								#

								#   Filter base class

								#

								#   Basic stuff to save people some hassle.

								#

								#   Per perl tradition, an object is a ref to an anonymous hash where the

								#   state is kept.

								#

								#   Instance variables:

								#

								#       sink        = reference to filter sink

								#


								package Filter;


								sub new {

								    my($class) = @_;

								    bless { }, $class;

								}


								sub SetSink {

								    my ($self, $sink) = @_;

								    $self->{sink} = $sink;

								}


								sub Add {

								    my $self = shift;

								    $self->{sink}->Add(@_);

								}


								sub Flush { }


								sub Close {

								    my $self = shift;

								    $self->{sink}->Close(@_);

								}


								sub SinkAdd {

								    my $self = shift;

								    $self->{sink}->Add(@_);

								}


								##############################################################################

								#

								#   TokenFilter filter package

								#

								#   Does not modify the stream, but merely chops them into tokens, as

								#   recognized by NextToken and processed by EachToken.

								#

								#   Instance data:

								#

								#       buf         = unprocessed text

								#

								package TokenFilter;

								@TokenFilter::ISA = qw(Filter);


								#

								#   Append the incoming text to the buffer, then suck out entire tokens.

								#

								sub Add {

								    my($self, $text) = @_;

								    my $tok;


								    $self->{buf} .= $text;


								    while ($self->{buf} ne '' && defined($tok = $self->NextToken))

								    {

								        $self->EachToken($tok);

								    }

								}


								sub Flush {

								    my $self = shift;

								    $self->EachToken($self->{buf});

								}


								#

								#   By default, we just sink tokens to the next layer.

								#

								sub EachToken {

								    my($self, $tok) = @_;

								    $self->SinkAdd($tok);

								}


								##############################################################################

								#

								#   LineFilter filter package

								#

								#   Tokenizer that recognizes lines.

								#

								#   Instance data:

								#

								#       buf         = unprocessed text

								#

								package LineFilter;

								@LineFilter::ISA = qw(TokenFilter);


								#

								#   Recognize lines.

								#

								sub NextToken {

								    my($self) = shift;


								    if ($self->{buf} =~ s/([^\n]*\n)//) {

								        $1;

								    } else {

								        undef;

								    }

								}


								##############################################################################

								#

								#   WhitespaceFilter filter package

								#

								#   Removes blank lines and removes leading and trailing whitespace.

								#

								#   Someday: Collapse multiple whitespace outside of quotation marks.

								#

								package WhitespaceFilter;

								@WhitespaceFilter::ISA = qw(LineFilter);


								sub EachToken {

								    my($self, $line) = @_;

								    $line =~ s/^[ \t]+//;

								    $line =~ s/[ \t]+$//;

								    $self->SinkAdd($line) unless $line =~ /^$/;

								}


								##############################################################################

								#

								#   OutFile filter package

								#

								#   Writes its output to a file.

								#

								#   Instance data:

								#

								#       fh          = name of file handle

								#

								#


								package OutFile;

								@OutFile::ISA = qw(Filter);

								no strict 'refs';           # Our filename globs aren't very strict


								#

								#   Custom method:  SetOutput.  Opens an output file.

								#


								my $seq = 0;


								sub SetOutput {

								    my($self, $file) = @_;

								    $self->{fh} = "OutFile" . $seq++;

								    open($self->{fh}, ">$file") || die "Unable to open $file for writing ($!)\n";

								}


								sub Add {

								    my $self = shift;

								    print { $self->{fh} }  @_;

								}


								sub Close {

								    my $self = shift;

								    close($self->{fh});

								}


								##############################################################################

								#

								#   DebugFilter filter package

								#

								#   Filters out ;debug and ;begin_debug blocks if building retail.

								#

								#   Instance data:

								#

								#       skip        = nonzero if we are inside an ignored ;begin_debug block

								#       buf         = unprocessed text

								#


								package DebugFilter;

								@DebugFilter::ISA = qw(LineFilter);

								no strict 'refs';           # Our filename globs aren't very strict


								#

								#   See if the line contains a debug marker.

								#   If applicable, send the line down the chain.

								#

								sub EachToken {

								    my($self, $line) = @_;


								    # ;begin_debug means start skipping if retail

								    if ($line =~ s/;begin_debug//) {

								        $self->{skip} = $::RetailVersion;

								    }


								    # If we were skipping, then ;end_debug ends skipping and we should eat it

								    if ($line =~ s/;end_debug// && $self->{skip}) {

								        $self->{skip} = 0;

								    } elsif ($line =~ s/;debug// && $::RetailVersion) {

								        # A one-shot debug line in retail - skip it

								    } elsif (!$self->{skip}) {

								        $self->SinkAdd($line);          # send it down the chain

								    }

								}


								##############################################################################

								#

								#   CPP filter package

								#

								#   The CPP filter performs the following operations:

								#

								#       Removes C and C++-style comments.

								#

								#       Filters whitespace.

								#

								#   Instance data:

								#

								#       buf         = unprocessed text

								#       wsf         = child WhitespaceFilter

								#       script      = current script sink

								#       ultSink     = the ultimate sink


								package CPP;

								@CPP::ISA = qw(TokenFilter);


								sub new {

								    my($class) = shift;

								    my $self = new Filter;

								    $self->{wsf} = new WhitespaceFilter;    # sink into a whitespace filter

								    $self->{sink} = $self->{wsf};           # initially use this script

								    bless $self, $class;

								}


								#

								#   Recognize tokens, which are lines or /* ... */ comments.

								#

								sub NextToken {

								    my($self) = shift;


								    if ($self->{buf} =~ s/^([^\/]+)//) {    # eat up to a slash

								        $1;

								    } elsif ($self->{buf} =~ s/^\/\/.*?\n//) { # eat // to end of line

								        "\n";

								    } elsif ($self->{buf} =~ s/^\/\*[^\0]*?\*\///) { # eat /* .. */

								        '';

								    } elsif ($self->{buf} =~ s/^(\/)(?=[^\/\*])//) { # eat / not followed by / or *

								        $1;

								    } else {                                    # incomplete fragment - stop

								        undef;

								    }

								}


								#

								#   SetSink

								#

								#   The sink we get is really the whitespace filter's sink, and we sink

								#   into the whitespace filter.

								#

								sub SetSink {

								    my ($self, $sink) = @_;

								    $self->{wsf}->SetSink($sink);

								}


								##############################################################################

								#

								#

								#   JS - comments are // or /* ... */, invoked via <SCRIPT>...

								#   CSS - comments are /* ... */, invoked via <STYLE TYPE="text/css">

								#

								#   They are both just CPP thingies.  Both should someday remove whitespace


								package JS;

								@JS::ISA = qw(CPP);


								package CSS;

								@CSS::ISA = qw(CPP);


								##############################################################################

								#

								#   HTML filter package

								#

								#   The HTML filter performs the following operations:

								#

								#       Send the final output through a whitespace filter.

								#

								#       Remove comments.

								#

								#   Someday it will also...

								#

								#       Recognize embedded stylesheets and scripts and generate a subfilter

								#       to handle them.

								#

								#       Compress spaces outside quotation marks.

								#

								#   Instance data:

								#

								#       buf         = unprocessed text

								#       wsf         = child WhitespaceFilter

								#       script      = current script sink

								#       endScript   = sub that recognizes end of script

								#       ultSink     = the ultimate sink


								package HTML;

								@HTML::ISA = qw(TokenFilter);


								sub new {

								    my($class) = shift;

								    my $self = new Filter;

								    $self->{wsf} = new WhitespaceFilter;

								    $self->{sink} = $self->{wsf};           # initially use this script

								    bless $self, $class;

								}


								#

								#   SetSink

								#

								#   The sink we get is really the whitespace filter's sink, and we sink

								#   into the whitespace filter.

								#

								sub SetSink {

								    my ($self, $sink) = @_;

								    $self->{ultSink} = $sink;

								    $self->{wsf}->SetSink($sink);

								}


								#

								#   NextHTMLToken

								#

								#   An HTML token is one of the following:

								#

								#   -   A hunk of boring text.

								#   -   A comment (thrown away).

								#   -   A matched <...> thingie.


								sub NextHTMLToken {

								    my($self) = shift;


								    #

								    #   Any string of non "<" counts as a boring text token.

								    #

								    #   Be careful not to mistake <!DOCTYPE...> as a comment.

								    #

								    if ($self->{buf} =~ s/^([^<]+)//) {

								        $1;

								    } elsif ($self->{buf} =~ s/^(<!--[^\0]*?-->)//) {  # Eat full comments

								        '';

								    } elsif ($self->{buf} =~ s/^(<![^-][^>]*>)//) { # <!DOCTYPE ...>

								        $1;

								    } elsif ($self->{buf} =~ s/^(<[^!][^>]*>)//) { # <something else>

								        $1;

								    } else {                                    # incomplete fragment - stop

								        undef;

								    }

								}


								#

								#   NextScriptToken

								#

								#   A script token is anything that isn't the word </SCRIPT>.

								#


								sub NextScriptToken

								{

								    my($self) = shift;

								    if ($self->{buf} =~ s,^(</SCRIPT>),,i) {

								        $1;

								    } elsif ($self->{buf} =~ s,^(.*?)</SCRIPT>,,i) {

								        $1;

								    } else {

								        my $tok = $self->{buf};

								        $self->{buf} = '';

								        $tok;

								    }

								}


								#

								#   NextToken

								#

								#   Returns either an HTML token or a script token.

								#

								sub NextToken {

								    my($self) = shift;

								    if (defined $self->{script}) {

								        $self->NextScriptToken();

								    } else {

								        $self->NextHTMLToken();

								    }

								}


								#

								#   _Redirect - Private method that redirects parsing to a script language.

								#

								#       $self->_Redirect($scr, $end);

								#

								#       $scr = script object to hook in

								#       $end = sub that recognizes the end of the script

								#

								#

								sub _Redirect {

								    my ($self, $scr, $end) = @_;

								    $self->{script} = $self->{sink} = $scr;

								    $scr->SetSink($self->{ultSink});

								    $self->{endScript} = $end;

								}


								sub EachToken {

								    my($self, $tok) = @_;


								    if ($tok =~ /^<SCRIPT/i) {

								        $self->{inScript} = 1; # BUGBUG create a script sink

								        my $elem = new Element($tok);

								        my $lang = lc $elem->Attr("LANGUAGE");

								        my $scr;

								        # No language implies JScript

								        if (!defined($lang) || $lang eq 'jscript' || $lang eq 'javascript') {

								            $scr = new CPP;

								        } else {

								            warn "Unknown script language [$lang]";

								            # Just use the whitespace filter as the unknown script filter

								            $scr = new WhitespaceFilter;

								        }

								        $self->_Redirect($scr, sub { m,^</SCRIPT>,i });


								    } elsif ($tok =~ /<STYLE/i) {

								        $self->_Redirect(new CSS, sub { m,^</STYLE>,i });


								    } elsif (defined($self->{endScript}) && &{$self->{endScript}}($tok)) {

								        delete $self->{endScript};

								        $self->{script}->Flush();

								        delete $self->{script};

								        $self->{sink} = $self->{wsf};

								    }

								    $self->SinkAdd($tok);

								}


								##############################################################################

								#

								#   Main package

								#


								package main;


								#

								#   Set up some defaults.

								#

								my $force_type = undef;                 # do not force file type

								$::RetailVersion = 1;                   # not the debugging version

								my $outfile = undef;                    # output file not known yet

								my %VAR = ();                           # No variables defined yet

								my $verbose = undef;                    # not verbose mode


								##############################################################################

								#

								#   CreateTypeFilter - Create a filter for the specified type.

								#


								my $types = {

								    html    => sub { new HTML },        # HTML

								    htm     => sub { new HTML },

								    htx     => sub { new HTML },

								    js      => sub { new JS },          # Javascript

								    jsx     => sub { new JS },

								    css     => sub { new CSS },         # Cascading style sheet

								    csx     => sub { new CSS },

								};


								sub CreateTypeFilter {

								    my $sub = $types->{lc shift};

								    &$sub;

								}


								##############################################################################

								#

								#   Command line parsing

								#


								sub Usage {

								    die "Usage: htmlcln [-t [html|js|css]] [-DVAR[=value]...] [-v] -o outfile infile\n";

								}


								#

								#   AddDefine - Handle a -D command line option.

								#

								sub AddDefine {

								    my $line = shift;

								    if ($line =~ /=/) {

								        $VAR{$`} = $';

								    } else {

								        $VAR{$line} = 1;

								    }

								}


								sub ParseCommandLine {


								    #

								    #   Scream through the command line arguments.

								    #


								    while ($#ARGV >= 0 && $ARGV[0] =~ /^-(.)(.*)/) {

								        # $1 - command

								        # $2 - optional argument


								        my($cmd, $val) = ($1, $2);


								        shift(@ARGV);


								        if ($cmd eq 't') {

								            $val = shift(@ARGV) if $val eq '';

								            $force_type = $val;

								        } elsif ($cmd eq 'D') {

								            AddDefine($val);

								        } elsif ($cmd eq 'o') {

								            $val = shift(@ARGV) if $val eq '';

								            $outfile = $val;

								        } elsif ($cmd eq 'v') {

								            $verbose = 1;

								        } else {

								            Usage();

								        }

								    }


								    #

								    #   What's left should be a filename, and there should be an output file.

								    #


								    my $infile = shift(@ARGV);

								    Usage() unless defined $infile && defined $outfile && $#ARGV == -1;


								    #

								    #   If the filetype is not being overridden, then take it from the filename.

								    #

								    if (!defined $force_type) {

								        ($force_type) = $infile =~ /\.(.*)/;

								    }


								    #

								    #   Include debug goo only if building DBG=1 and FULL_DEBUG is set in the

								    #   environment.

								    #

								    $::RetailVersion = 0 if defined($VAR{"DBG"}) && defined($ENV{"FULL_DEBUG"});


								    $infile;

								}


								##############################################################################

								#

								#   File processing

								#


								sub ProcessFile {

								    my $infile = shift;


								    #

								    #   Create the final sink.

								    #

								    my $sink = new OutFile;

								    $sink->SetOutput($outfile);


								    #

								    #   Set up the default filter based on the file type.

								    #

								    my $Type = CreateTypeFilter($force_type);

								    $Type->SetSink($sink);


								    #

								    #   Create the DebugFilter which sits at the top of the chain.

								    #

								    my $Filter = new DebugFilter;

								    $Filter->SetSink($Type);


								    #

								    #   All the plumbing is ready - start pumping data.

								    #

								    open(I, $infile) || die "Cannot open $infile for reading ($!)\n";


								    while (<I>) {

								        $Filter->Add($_);

								    }

								    $Filter->Flush();

								    $Filter->Close();

								}


								##############################################################################

								#

								#   Main program

								#


								{

								    my $infile = ParseCommandLine();

								    ProcessFile($infile);

								}