|
|
package ActiveState::Rx::Info;
use ActiveState::Rx;
our $VERSION = 0.10;
#============================================================================= # The following subs are the API, accessed from clients. #============================================================================= sub new { my $class = shift; my $regex = shift || ""; my $mods = shift || "";
my $o = bless { regex => $regex, mods => $mods, }, $class;
$o->{global} = 1 if ($mods =~ s/g//); $o->{cregex} = eval qq|qr{$regex}$mods|; $o->{uregex} = ActiveState::Rx::rxdump($regex,$mods); $o->{tregex} = ActiveState::Rx::translate_tree($o->{uregex}, 0); $o->_sort_ranges; $o->_count_groups; return $o; }
sub regex { my $o = shift; return $o->{regex}; }
sub modifiers { my $o = shift; return $o->{mods} }
sub groupCount { my $o = shift; return scalar keys %{$o->{groups}}; }
sub maxLevel { my $o = shift; my $nodeId = shift; return 0; }
sub match { my $o = shift; my $target = shift; return $o->_multimatch($target) if $o->{global}; return $o->_match($target); }
my %tips; sub nodeTip { my $o = shift; my $nodeID = shift;
my $regex = $o->{regex}; my $modifiers = $o->{mods}; my $uregex = $o->{uregex};
do { my $n = $uregex->{$nodeID}; my $i = $nodeID; my $h = $uregex; my $r = $regex; my $m = $modifiers; @_ = ($o, $n, $i, $h, $r, $m); # If a sub is called, it gets all these. return eval $tips{$uregex->{$nodeID}{TYPE}}; }; }
sub nodeRange { my $o = shift; my $id = shift; my $level = shift; my @ret;
return unless $id ne "";
my @offsets = @{$o->{uregex}{OFFSETS}}; my @lengths = @{$o->{uregex}{LENGTHS}}; if (defined $offsets[$id] and defined $lengths[$id]) { my $start = $offsets[$id] - 1; my $end = $start + $lengths[$id] - 1; push @ret, $start, $end; }
return wantarray ? @ret : \@ret; }
sub childNodesRange { my $o = shift; my $id = shift; my @ret;
my $node = $o->get_tnode($id);
if ($node->{CHILD}) { my @children = @{$node->{CHILD}}; # max and min are first set to an extremely large number. my $max = -1; my $min = -1; # find the span of the child nodes for my $child (@children) { my $child_id = $child->{__this__}; my @child_span = $o->nodeRange($child_id, 0); $min = $child_span[0] if $child_span[0] < $min || $min == -1; $max = $child_span[1] if $child_span[1] > $max || $max == -1; } push @ret, $min, $max; } # The children of a '(' or ')' are everything in between the # parens elsif ($node->{TYPE} eq 'OPEN') { # Find the corresponding CLOSE node my $which = $node->{ARGS}; my $close = $o->find_tnode(TYPE => 'CLOSE', ARGS => $which); my $close_id = $close->{__this__}; my (undef,$opn) = $o->nodeRange($id, 0); my ($cls,undef) = $o->nodeRange($close_id, 0); push @ret, $opn + 1, $cls - 1; } elsif ($node->{TYPE} eq 'CLOSE') { # Find the corresponding OPEN node my $which = $node->{ARGS}; my $open = $o->find_tnode(TYPE => 'OPEN', ARGS => $which); my $open_id = $open->{__this__}; my (undef,$opn) = $o->nodeRange($open_id, 0); my ($cls,undef) = $o->nodeRange($id, 0); push @ret, $opn + 1, $cls - 1; }
# The "children" of a minmod should be the next node, plus its children. elsif ($node->{TYPE} eq 'MINMOD') { my $affected = $node->{NEXT}; my ($start,undef) = $o->childNodesRange($affected); my (undef, $stop) = $o->nodeRange($affected, 0); push @ret, $start, $stop; } return wantarray ? @ret : \@ret; }
sub nodeId { my $o = shift; my $offset = shift;
if ($offset < 0 or $offset >= length $o->{regex}) { print STDERR "ActiveState::Rx::Info::nodeId($offset)\n"; print STDERR " Error: Offset out of range.\n"; return; }
my $uregex = $o->{uregex}; my @sorted_ranges = @{$o->{ranges}};
# now select the one we want: for (my $i=0; $i<@sorted_ranges; $i++) { my @q = @{$sorted_ranges[$i]}; my $start_of_range = $q[0]; my $end_of_range = $start_of_range + $q[1]; if ($offset >= $start_of_range and $offset < $end_of_range) { return $q[2] if defined $uregex->{$q[2]}; # This is an interesting case -- it means that node disappeared # at some point during optimization. The easiest way to see this # is in this expression: (ab)* # # OFFSET => NODE => TYPE # 0 => 2 => OPTIMIZED # 1 => 4 => EXACT # 2 => 4 => EXACT # 3 => node not found # 4 => 0 => CURLYM # # In this case, we can't highlight the node, find its parent, # or anything like that, since we have no idea which node it # corresponded to in the original string. print STDERR "warning -- this node has been optimized away by " . "Perl's regex engine!\n"; } } }
sub groupId { my $o = shift; my $id = shift; my $node = $o->get_tnode($id); return $node->{ARGS} if ($node->{TYPE} eq 'OPEN' or $node->{TYPE} eq 'CLOSE'); return 0; }
# matchId() has nothing to do with match(). It returns the node which # "matches" the node passed in. Currently, it only handles OPEN and # CLOSE nodes. sub matchId { my $o = shift; my $id = shift; my $m = "";
my $node = $o->{uregex}{$id}; if ($node->{TYPE} eq 'OPEN') { $m = $o->{groups}{$node->{ARGS}}{CLOSE}; } elsif ($node->{TYPE} eq 'CLOSE') { $m = $o->{groups}{$node->{ARGS}}{OPEN}; } return $m; }
sub findnode { return find_tnode(@_)->{__this__}; }
#============================================================================= # Subs below are for internal use only. #=============================================================================
sub DESTROY { my $o = shift;
}
sub _sort_ranges { my $o = shift; my @offsets = @{$o->{uregex}{OFFSETS}}; my @lengths = @{$o->{uregex}{LENGTHS}}; my @sorted_ranges; for (my $i=0; $i<@offsets; $i++) { if (defined $offsets[$i] and defined $lengths[$i]) { push @sorted_ranges, [$offsets[$i] - 1, # offset $lengths[$i], # length $i, # MJD's id ]; } } @sorted_ranges = sort { $a->[0] <=> $b->[0] } @sorted_ranges; $o->{ranges} = \@sorted_ranges; }
sub _count_groups { my $o = shift; for my $key (keys %{$o->{uregex}}) { next if substr($key,0,2) eq "__" or $key eq 'OFFSETS' or $key eq 'LENGTHS'; my $node = $o->{uregex}{$key}; next unless defined $node->{TYPE}; if ($node->{TYPE} eq 'OPEN' or $node->{TYPE} eq 'CLOSE') { $o->{groups}{$node->{ARGS}}{$node->{TYPE}} = $key; } } }
sub _match { my $o = shift; my $target = shift; my @ret; return unless $target =~ $o->{cregex}; for (my $i=0; $i<@+; $i++) { if ($+[$i] == $-[$i]) { push @ret, undef, undef } else { push @ret, $-[$i], $+[$i]-1 if $+[$i] >= 0 and $-[$i] >= 0; } } return @ret; }
# We have to cheat a little to get the offset information sub _multimatch { my $o = shift; my $target = shift;
# Capture the "raw offsets" my $start = undef; my $end = 0; my @ret; while (1) { # Get one match (and break if it fails) my (@pairs) = $o->_match($target); last unless @pairs;
# Remove the $& pair (the first pair) my @trunc = splice @pairs, 0, 2; for my $foo (@pairs) { $foo += $end if defined $foo; }
# Update the span, set up the next target. $start = $trunc[0] unless defined $start; $end += $trunc[1] + 1; my $ntarget = substr($target, $trunc[1] + 1); last if $ntarget eq $target; # prevent infinite loop $target = $ntarget;
# Add the shifted pairs to the return array push @ret, @pairs; }
# Last-minute cleanup $end--; splice @ret, 0, 0, $start, $end; return @ret; }
sub get_tnode { my $o = shift; my $id = shift; $o->{cached_tnodes}{$id} = $o->find_tnode($id) unless defined $o->{cached_tnodes}{$id}; return $o->{cached_tnodes}{$id}; }
sub find_tnode { my $o = shift; my $list = ref $_[0] eq 'ARRAY' ? shift : $o->{tregex}; my $id = shift if (@_ % 2); my %criteria = @_; $criteria{__this__} ||= $id if $id;
for my $node (@$list) { my $matched = 1; for my $key (keys %criteria) { $matched &= (defined $node->{$key} and $node->{$key} eq $criteria{$key}); } return $node if $matched; if ($node->{CHILD}) { my $n = $o->find_tnode($node->{CHILD}, %criteria); return $n if $n; } } return undef; }
sub tip_star { my ($o, $n, $i, $h, $r, $m) = @_; my ($start, $stop) = $o->childNodesRange($i); my $child = substr($h->{REGEX},$start,$stop-$start+1);
my $c = $o->get_tnode($n->{CHILD}); return "Match '$child' 0 or more times" if $c->{TYPE} eq 'EXACT'; return "Match <$child> 0 or more times"; }
sub tip_plus { my ($o, $n, $i, $h, $r, $m) = @_; my ($start, $stop) = $o->childNodesRange($i); my $child = substr($h->{REGEX},$start,$stop-$start+1); my $c = $o->get_tnode($n->{CHILD}); return "Match '$child' 1 or more times" if $c->{TYPE} eq 'EXACT'; return "Match <$child> 1 or more times"; }
sub tip_curly { my ($o, $n, $i, $h, $r, $m) = @_; my ($min, $max) = @{$n->{ARGS}}; my ($start, $stop) = $o->childNodesRange($i); my $child = substr($h->{REGEX},$start,$stop-$start+1); my $c = $o->get_tnode($n->{CHILD}); return "Match '$child' $min to $max times" if $c->{TYPE} eq 'EXACT'; return "Match <$child> $min to $max times"; }
sub tip_curlyx { my ($o, $n, $i, $h, $r, $m) = @_; my ($min, $max) = @{$n->{ARGS}}; my ($start,$stop) = $o->childNodesRange($i); my $child = substr($h->{REGEX},$start,$stop-$start+1); my $quant; if ($max == 32767 or $max == 2147483647) { $quant = "$min or more"; } else { $quant = "$min to $max"; } return "Match <$child> $quant times"; }
sub tip_anyof { my ($o, $n, $i, $h, $r, $m) = @_; my ($start,$stop) = $o->nodeRange($i,0); my $klass = substr($h->{REGEX},$start,$stop-$start+1); my $not = ""; if (substr($klass, 1, 1) eq '^') { substr($klass, 1, 1, ""); $not = " not"; } return "Match any character$not in $klass"; }
sub tip_minmod { my ($o, $n, $i, $h, $r, $m) = @_; my $affected = $n->{NEXT}; my ($start,undef) = $o->childNodesRange($affected); my (undef,$stop) = $o->nodeRange($affected,0); my $str = substr($h->{REGEX}, $start, $stop-$start+1); return "Match <$str> non-greedily"; }
BEGIN { %tips = ( END => q{"End of regular expression"}, SUCCEED => q{"Return from a subexpression"}, BOL => q{"Match the beginning of the string"}, MBOL => q{"Match the beginning of any line"}, SBOL => q{"Match the beginning of the string"}, EOS => q{"Match the end of the string"}, EOL => q{"Match the end of the string"}, MEOL => q{"Match the end of any line"}, SEOL => q{"Match the end of the line"}, BOUND => q{"Match any word boundary"}, BOUNDL => q{"Match any word boundary"}, NBOUND => q{"Match any word non-boundary"}, NBOUNDL => q{"Match any word non-boundary"}, GPOS => q{"Matches where last m//g left off"}, # [Special] alternatives REG_ANY => q{"Match any one character (except newline)"}, ANY => q{"Match any one character (except newline)"}, SANY => q{"Match any one character (including newline)"}, ANYOF => q{tip_anyof(@_)}, ALNUM => q{"Match any alphanumeric character"}, ALNUML => q{"Match any alphanumeric char in locale"}, NALNUM => q{"Match any non-alphanumeric character"}, NALNUML => q{"Match any non-alphanumeric char in locale"}, SPACE => q{"Match any whitespace character"}, SPACEL => q{"Match any whitespace char in locale"}, NSPACE => q{"Match any non-whitespace character"}, NSPACEL => q{"Match any non-whitespace char in locale"}, DIGIT => q{"Match any numeric character"}, NDIGIT => q{"Match any non-numeric character"}, # BRANCH The set of branches constituting a single choice are hooked # together with their "next" pointers, since precedence prevents # anything being concatenated to any individual branch. The # "next" pointer of the last BRANCH in a choice points to the # thing following the whole choice. This is also where the # final "next" pointer of each individual branch points; each # branch starts with the operand node of a BRANCH node. # BRANCH => q{"Match this alternative, or the next"}, # BACK Normal "next" pointers all implicitly point forward; BACK # exists to make loop structures possible. # not used BACK => q{"Match \"\", \"next\" ptr points backward"}, # Literals EXACT => q{"Match '${\\$n->{STRING}}'"}, EXACTF => q{"Match '${\\$n->{STRING}}'"}, EXACTFL => q{"Match '${\\$n->{STRING}}'"}, # Do nothing NOTHING => q{"Match empty string"}, # A variant of above which delimits a group, thus stops optimizations TAIL => q{"Match empty string"}, # STAR,PLUS '?', and complex '*' and '+', are implemented as circular # BRANCH structures using BACK. Simple cases (one character # per match) are implemented with STAR and PLUS for speed # and to minimize recursive plunges. # STAR => q{tip_star(@_)}, PLUS => q{tip_plus(@_)}, CURLY => q{tip_curly(@_)}, CURLYN => q{"Match next-after-this simple thing"}, CURLYM => q{"Match this medium-complex thing {n,m} times"}, CURLYX => q{tip_curlyx(@_)}, # This terminator creates a loop structure for CURLYX WHILEM => q{"Do curly processing and see if rest matches"}, # OPEN,CLOSE,GROUPP ...are numbered at compile time. OPEN => q{"Capture group \$${\\$n->{ARGS}}"}, CLOSE => q{"Capture group \$${\\$n->{ARGS}}"}, REF => q{"Match some already matched string"}, REFF => q{"Match some already matched string"}, REFFL => q{"Match some already matched string"}, # grouping assertions IFMATCH => q{"Succeeds if the following matches"}, UNLESSM => q{"Fails if the following matches"}, SUSPEND => q{"Independent sub-RE"}, IFTHEN => q{"Switch, should be preceeded by switcher"}, GROUPP => q{"Whether the group matched"}, # Support for long RE LONGJMP => q{"Jump far away"}, BRANCHJ => q{"BRANCH with long offset"}, # The heavy worker EVAL => q{"Execute some Perl code"}, # Modifiers MINMOD => q{tip_minmod(@_)}, LOGICAL => q{"${\\$h->{$n->{NEXT}}->{TYPE}} should set the flag only"}, # This is not used yet RENUM => q{"Group with independently numbered parens"}, # This is not really a node, but an optimized away piece of a "long" node. # To simplify debugging output, we mark it as if it were a node OPTIMIZED => q{"Placeholder for dump"}, ); }
__END__
=head1 NAME
ActiveState::Rx::Info -- An object-oriented interface to the Regular Expression debugger.
=head1 SYNOPSIS
use ActiveState::Rx::Info;
my $obj = ActiveState::Rx::Info->new('(.*)(\d+)'); print "Matched!" if ($obj->match('testing 123')); print "The number of groups in this regex is: $obj->groupCount\n"; my $nid = $obj->findnode(TYPE => 'OPEN', ARGS => 1); print "The start of group 1 is at offset: ", $obj->nodeRange($nid), "\n";
This complete program prints out:
Matched! The number of groups in this regex is: 2 The start of group 1 is at offset: 0
=head1 DESCRIPTION
ActiveState::Rx::Info is designed to provide a higher level abstraction of the regular expression debugger than does ActiveState::Rx. The modified compiler and executor are kept in ActiveState::Rx, but ActiveState::Rx::Info makes it easier to use.
=head1 API
The following sections document the methods available from ActiveState::Rx::Info.
=head2 new(regex[, modifiers])
Creates a ActiveState::Rx::Info object. 'regex' is the regular expression to generate information about, and 'modifiers' is an optional parameter containing perl modifiers g, i, s, m, o, and x.
=head2 regex()
Returns the string form of the regular expression stored in the object.
=head2 modifiers()
Returns the string form of the modifiers stored in the object.
=head2 groupCount()
Returns the number of groups found in the regex. For example,
use ActiveState::Rx::Info; my $gc = ActiveState::Rx::Info->new('(abc*)')->groupCount;
In this example, C<$gc> will be set to 1.
=head2 nodeId(offset)
Returns the 'node id' of the node found at the given offset into the regular expression string. Most API functions in ActiveState::Rx::Info operate on a node id, since that is how regular expressions are manipulated internally.
=head2 maxLevel(nodeId)
Returns the maximum 'level' of the node. Level is an abstract concept -- so abstract it hasn't even been nailed down. Yet. This function currently doesn't do anything except return 0.
=head2 match(target)
Attempts to apply the regular expression to the target string. Returns a list of offsets in the target string, designed to aid highlighting the parts of the string which corresponded to groups in the regular expression.
Here is an example:
use ActiveState::Rx::Info; my @m = ActiveState::Rx::Info->new('(.*)(\d+)')->match('testing123');
In this example, C<@m> is set to (0, 9, 0, 8, 9, 9). These numbers represent three pairs of numbers: (0, 9), (0, 8), and (9, 9). I<These> pairs represent substrings of the target string corresponding to matches. The first pair is always the substring C<$&>, or the extents of the match. The remaining pairs all refer to C<$1>, C<$2>, and so on. If global matching is turned on, then there will be I<one> C<$&> at the beginning, and one pair for each iteration of the match.
If no string was matched by the particular pair, they are both undef.
=head2 nodeTip(nodeId)
Returns a node tip corresponding to the given regular expression node. For example:
use ActiveState::Rx::Info; my $o = ActiveState::Rx::Info->new('abc*'); print $o->nodeTip($o->nodeId(0));
will print I<Match 'ab'>.
=head2 nodeRange(nodeId)
Returns the range of the node in the regular expression string. For example:
use ActiveState::Rx::Info; my $o = ActiveState::Rx::Info->new('abc*'); print join ', ', $o->nodeRange($o->nodeId(0));
will print I<0, 1>.
=head2 childNodesRange(nodeId)
Returns the range of any children of the given node. Some nodes do not have children; they will return an empty list.
=head2 groupId(nodeId)
Returns the group number that nodeId refers to. Only supported if nodeId is either an OPEN or CLOSE node.
=head2 matchId(nodeId)
Returns the nodeId of a node which "matches" the given node. Currently only implemented if nodeId refers to a OPEN or CLOSE node. If nodeId returns to an OPEN node, it returns the node id of the corresponding CLOSE, and vice versa.
=head2 findnode(criteria)
Searches the nodes in the regular expression for a matching node. Returns the node id of the matching node structure. For example:
use ActiveState::Rx::Info; my $o = ActiveState::Rx::Info->new('ab(c*)'); my $nid = $o->findnode(TYPE => OPEN, ARGS => 1);
This example set C<$nid> to the node id referring to the first OPEN node in the regular expression.
=head1 AUTHOR
Neil Watkiss <[email protected]> ActiveState Corporation
=head1 COPYRIGHT
Copyright (c) 2001, ActiveState SRL.
=cut
|