require 5; # Time-stamp: "2001-03-14 20:11:48 MST" package HTML::TreeBuilder; #TODO: maybe have it recognize higher versions of # Parser, and register the methods as subs? # Hm, but TreeBuilder wouldn't be subclassable, then. # TODO: document tweaks? # TODO: deprecate subclassing TreeBuilder? use strict; use integer; # vroom vroom! use vars qw(@ISA $VERSION $DEBUG); $VERSION = '3.11'; # TODO: thank whoever pointed out the TEXTAREA bug # TODO: make require Parser of at least version... 2.27? # The one with the stop-parse. Then kill the whole stunting thing. #--------------------------------------------------------------------------- # Make a 'DEBUG' constant... BEGIN { # We used to have things like # print $indent, "lalala" if $Debug; # But there were an awful lot of having to evaluate $Debug's value. # If we make that depend on a constant, like so: # sub DEBUG () { 1 } # or whatever value. # ... # print $indent, "lalala" if DEBUG; # Which at compile-time (thru the miracle of constant folding) turns into: # print $indent, "lalala"; # or, if DEBUG is a constant with a true value, then that print statement # is simply optimized away, and doesn't appear in the target code at all. # If you don't believe me, run: # perl -MO=Deparse,-uHTML::TreeBuilder -e 'BEGIN { \ # $HTML::TreeBuilder::DEBUG = 4} use HTML::TreeBuilder' # and see for yourself (substituting whatever value you want for $DEBUG # there). if(defined &DEBUG) { # Already been defined! Do nothing. } elsif($] < 5.00404) { # Grudgingly accomodate ancient (pre-constant) versions. eval 'sub DEBUG { $Debug } '; } elsif(!$DEBUG) { eval 'sub DEBUG () {0}'; # Make it a constant. } elsif($DEBUG =~ m<^\d+$>s) { eval 'sub DEBUG () { ' . $DEBUG . ' }'; # Make THAT a constant. } else { # WTF? warn "Non-numeric value \"$DEBUG\" in \$HTML::Element::DEBUG"; eval 'sub DEBUG () { $DEBUG }'; # I guess. } } #--------------------------------------------------------------------------- use HTML::Entities (); use HTML::Tagset 3.02 (); use HTML::Element (); use HTML::Parser (); @ISA = qw(HTML::Element HTML::Parser); # This looks schizoid, I know. # It's not that we ARE an element AND a parser. # We ARE an element, but one that knows how to handle signals # (method calls) from Parser in order to elaborate its subtree. # Legacy aliases: *HTML::TreeBuilder::isKnown = \%HTML::Tagset::isKnown; *HTML::TreeBuilder::canTighten = \%HTML::Tagset::canTighten; *HTML::TreeBuilder::isHeadElement = \%HTML::Tagset::isHeadElement; *HTML::TreeBuilder::isBodyElement = \%HTML::Tagset::isBodyElement; *HTML::TreeBuilder::isPhraseMarkup = \%HTML::Tagset::isPhraseMarkup; *HTML::TreeBuilder::isHeadOrBodyElement = \%HTML::Tagset::isHeadOrBodyElement; *HTML::TreeBuilder::isList = \%HTML::Tagset::isList; *HTML::TreeBuilder::isTableElement = \%HTML::Tagset::isTableElement; *HTML::TreeBuilder::isFormElement = \%HTML::Tagset::isFormElement; *HTML::TreeBuilder::p_closure_barriers = \@HTML::Tagset::p_closure_barriers; #========================================================================== # Two little shortcut constructors: sub new_from_file { # or from a FH my $class = shift; require Carp, Carp::croak("new_from_file takes only one argument") unless @_ == 1; require Carp, Carp::croak("new_from_file is a class method only") if ref $class; my $new = $class->new(); $new->parse_file($_[0]); return $new; } sub new_from_content { # from any number of scalars my $class = shift; require Carp, Carp::croak("new_from_content is a class method only") if ref $class; my $new = $class->new(); foreach my $whunk (@_) { $new->parse($whunk); last if $new->{'_stunted'}; # might as well check that. } $new->eof(); return $new; } #--------------------------------------------------------------------------- sub new { # constructor! my $class = shift; $class = ref($class) || $class; my $self = HTML::Element->new('html'); # Initialize HTML::Element part { # A hack for certain strange versions of Parser: my $other_self = HTML::Parser->new(); %$self = (%$self, %$other_self); # copy fields # Yes, multiple inheritance is messy. Kids, don't try this at home. bless $other_self, "HTML::TreeBuilder::_hideyhole"; # whack it out of the HTML::Parser class, to avoid the destructor } # The root of the tree is special, as it has these funny attributes, # and gets reblessed into this class. # Initialize parser settings $self->{'_implicit_tags'} = 1; $self->{'_implicit_body_p_tag'} = 0; # If true, trying to insert text, or any of %isPhraseMarkup right # under 'body' will implicate a 'p'. If false, will just go there. $self->{'_tighten'} = 1; # whether ignorable WS in this tree should be deleted $self->{'_implicit'} = 1; # to delete, once we find a real open-"html" tag $self->{'_element_class'} = 'HTML::Element'; $self->{'_ignore_unknown'} = 1; $self->{'_ignore_text'} = 0; $self->{'_warn'} = 0; $self->{'_no_space_compacting'}= 0; $self->{'_store_comments'} = 0; $self->{'_store_pis'} = 0; $self->{'_store_declarations'} = 0; $self->{'_p_strict'} = 0; # Parse attributes passed in as arguments if(@_) { my %attr = @_; for (keys %attr) { $self->{"_$_"} = $attr{$_}; } } # rebless to our class bless $self, $class; $self->{'_element_count'} = 1; # undocumented, informal, and maybe not exactly correct $self->{'_head'} = $self->insert_element('head',1); $self->{'_pos'} = undef; # pull it back up $self->{'_body'} = $self->insert_element('body',1); $self->{'_pos'} = undef; # pull it back up again return $self; } #========================================================================== sub _elem # universal accessor... { my($self, $elem, $val) = @_; my $old = $self->{$elem}; $self->{$elem} = $val if defined $val; return $old; } # accessors.... sub implicit_tags { shift->_elem('_implicit_tags', @_); } sub implicit_body_p_tag { shift->_elem('_implicit_body_p_tag', @_); } sub p_strict { shift->_elem('_p_strict', @_); } sub no_space_compacting { shift->_elem('_no_space_compacting', @_); } sub ignore_unknown { shift->_elem('_ignore_unknown', @_); } sub ignore_text { shift->_elem('_ignore_text', @_); } sub ignore_ignorable_whitespace { shift->_elem('_tighten', @_); } sub store_comments { shift->_elem('_store_comments', @_); } sub store_declarations { shift->_elem('_store_declarations', @_); } sub store_pis { shift->_elem('_store_pis', @_); } sub warn { shift->_elem('_warn', @_); } #========================================================================== sub warning { my $self = shift; CORE::warn("HTML::Parse: $_[0]\n") if $self->{'_warn'}; # should maybe say HTML::TreeBuilder instead } #========================================================================== { # To avoid having to rebuild these lists constantly... my $_Closed_by_structurals = [qw(p h1 h2 h3 h4 h5 h6 pre textarea)]; my $indent; sub start { return if $_[0]{'_stunted'}; # Accept a signal from HTML::Parser for start-tags. my($self, $tag, $attr) = @_; # Parser passes more, actually: # $self->start($tag, $attr, $attrseq, $origtext) # But we can merrily ignore $attrseq and $origtext. if($tag eq 'x-html') { print "Ignoring open-x-html tag.\n" if DEBUG; # inserted by some lame code-generators. return; # bypass tweaking. } my $ptag = ( my $pos = $self->{'_pos'} || $self )->{'_tag'}; my $already_inserted; #my($indent); if(DEBUG) { # optimization -- don't figure out indenting unless we're in debug mode my @lineage = $pos->lineage; $indent = ' ' x (1 + @lineage); print $indent, "Proposing a new \U$tag\E under ", join('/', map $_->{'_tag'}, reverse($pos, @lineage)) || 'Root', ".\n"; #} else { # $indent = ' '; } #print $indent, "POS: $pos ($ptag)\n" if DEBUG > 2; my $e = ($self->{'_element_class'} || 'HTML::Element')->new($tag, %$attr); # Make a new element object. # (Only rarely do we end up just throwing it away later in this call.) # Some prep -- custom messiness for those damned tables, and strict P's. if($self->{'_implicit_tags'}) { # wallawallawalla! unless($HTML::TreeBuilder::isTableElement{$tag}) { if ($ptag eq 'table') { print $indent, " * Phrasal \U$tag\E right under TABLE makes implicit TR and TD\n" if DEBUG > 1; $self->insert_element('tr', 1); $pos = $self->insert_element('td', 1); # yes, needs updating } elsif ($ptag eq 'tr') { print $indent, " * Phrasal \U$tag\E right under TR makes an implicit TD\n" if DEBUG > 1; $pos = $self->insert_element('td', 1); # yes, needs updating } $ptag = $pos->{'_tag'}; # yes, needs updating } # end of table-implication block. # Now maybe do a little dance to enforce P-strictness. # This seems like it should be integrated with the big # "ALL HOPE..." block, further below, but that doesn't # seem feasable. if( $self->{'_p_strict'} and $HTML::TreeBuilder::isKnown{$tag} and not $HTML::Tagset::is_Possible_Strict_P_Content{$tag} ) { my $here = $pos; my $here_tag = $ptag; while(1) { if($here_tag eq 'p') { print $indent, " * Inserting $tag closes strict P.\n" if DEBUG > 1; $self->end(\q{p}); # NB: same as \'q', but less confusing to emacs cperl-mode last; } #print("Lasting from $here_tag\n"), last if $HTML::TreeBuilder::isKnown{$here_tag} and not $HTML::Tagset::is_Possible_Strict_P_Content{$here_tag}; # Don't keep looking up the tree if we see something that can't # be strict-P content. $here_tag = ($here = $here->{'_parent'} || last)->{'_tag'}; }# end while $ptag = ($pos = $self->{'_pos'} || $self)->{'_tag'}; # better update! } # end of strict-p block. } # And now, get busy... #---------------------------------------------------------------------- if (!$self->{'_implicit_tags'}) { # bimskalabim # do nothing print $indent, " * _implicit_tags is off. doing nothing\n" if DEBUG > 1; #---------------------------------------------------------------------- } elsif ($HTML::TreeBuilder::isHeadOrBodyElement{$tag}) { if ($pos->is_inside('body')) { # all is well print $indent, " * ambilocal element \U$tag\E is fine under BODY.\n" if DEBUG > 1; } elsif ($pos->is_inside('head')) { print $indent, " * ambilocal element \U$tag\E is fine under HEAD.\n" if DEBUG > 1; } else { # In neither head nor body! mmmmm... put under head? if ($ptag eq 'html') { # expected case # TODO?? : would there ever be a case where _head would be # absent from a tree that would ever be accessed at this # point? die "Where'd my head go?" unless ref $self->{'_head'}; if ($self->{'_head'}{'_implicit'}) { print $indent, " * ambilocal element \U$tag\E makes an implicit HEAD.\n" if DEBUG > 1; # or rather, points us at it. $self->{'_pos'} = $self->{'_head'}; # to insert under... } else { $self->warning( "Ambilocal element <$tag> not under HEAD or BODY!?"); # Put it under HEAD by default, I guess $self->{'_pos'} = $self->{'_head'}; # to insert under... } } else { # Neither under head nor body, nor right under html... pass thru? $self->warning( "Ambilocal element <$tag> neither under head nor body, nor right under html!?"); } } #---------------------------------------------------------------------- } elsif ($HTML::TreeBuilder::isBodyElement{$tag}) { # Ensure that we are within
if($ptag eq 'body') { # We're good. } elsif($HTML::TreeBuilder::isBodyElement{$ptag} # glarg and not $HTML::TreeBuilder::isHeadOrBodyElement{$ptag} ) { # Special case: Save ourselves a call to is_inside further down. # If our $ptag is an isBodyElement element (but not an # isHeadOrBodyElement element), then we must be under body! print $indent, " * Inferring that $ptag is under BODY.\n", if DEBUG > 3; # I think this and the test for 'body' trap everything # bodyworthy, except the case where the parent element is # under an unknown element that's a descendant of body. } elsif ($pos->is_inside('head')) { print $indent, " * body-element \U$tag\E minimizes HEAD, makes implicit BODY.\n" if DEBUG > 1; $ptag = ( $pos = $self->{'_pos'} = $self->{'_body'} # yes, needs updating || die "Where'd my body go?" )->{'_tag'}; # yes, needs updating } elsif (! $pos->is_inside('body')) { print $indent, " * body-element \U$tag\E makes implicit BODY.\n" if DEBUG > 1; $ptag = ( $pos = $self->{'_pos'} = $self->{'_body'} # yes, needs updating || die "Where'd my body go?" )->{'_tag'}; # yes, needs updating } # else we ARE under body, so okay. # Handle implicit endings and insert based on