windows-server-2003/tools/perl/site/lib/activestate/rx/info.pm

package ActiveState::Rx::Info;

use ActiveState::Rx;

our $VERSION = 0.10;

#=============================================================================
# The following subs are the API, accessed from clients.
#=============================================================================
sub new {
  my $class = shift;
  my $regex = shift || "";
  my $mods = shift  || "";

  my $o = bless { regex  => $regex,
                  mods   => $mods,
                }, $class;

  $o->{global} = 1 if ($mods =~ s/g//);
  $o->{cregex} = eval qq|qr{$regex}$mods|;
  $o->{uregex} = ActiveState::Rx::rxdump($regex,$mods);
  $o->{tregex} = ActiveState::Rx::translate_tree($o->{uregex}, 0);
  $o->_sort_ranges;
  $o->_count_groups;
  return $o;
}

sub regex {
  my $o = shift;
  return $o->{regex};
}

sub modifiers {
  my $o = shift;
  return $o->{mods}
}

sub groupCount {
  my $o = shift;
  return scalar keys %{$o->{groups}};
}

sub maxLevel {
  my $o = shift;
  my $nodeId = shift;
  return 0;
}

sub match {
  my $o = shift;
  my $target = shift;
  return $o->_multimatch($target)
    if $o->{global};
  return $o->_match($target);
}

my %tips;
sub nodeTip {
  my $o = shift;
  my $nodeID = shift;

  my $regex = $o->{regex};
  my $modifiers = $o->{mods};  
  my $uregex = $o->{uregex};

  do {
    my $n = $uregex->{$nodeID};
    my $i = $nodeID;
    my $h = $uregex;
    my $r = $regex;
    my $m = $modifiers;
    @_ = ($o, $n, $i, $h, $r, $m); # If a sub is called, it gets all these.
    return eval $tips{$uregex->{$nodeID}{TYPE}};
  };
}

sub nodeRange {
  my $o = shift;
  my $id = shift;
  my $level = shift;
  my @ret;

  return unless $id ne "";

  my @offsets = @{$o->{uregex}{OFFSETS}};
  my @lengths = @{$o->{uregex}{LENGTHS}};
  
  if (defined $offsets[$id] and defined $lengths[$id]) {
    my $start = $offsets[$id] - 1;
    my $end = $start + $lengths[$id] - 1;    
    push @ret, $start, $end;
  }

  return wantarray ? @ret : \@ret;
}

sub childNodesRange {
  my $o = shift;
  my $id = shift;
  my @ret;

  my $node = $o->get_tnode($id);

  if ($node->{CHILD}) {
    my @children = @{$node->{CHILD}};
    
    # max and min are first set to an extremely large number.
    my $max = -1;
    my $min = -1;
    
    # find the span of the child nodes
    for my $child (@children) {
      my $child_id = $child->{__this__};
      my @child_span = $o->nodeRange($child_id, 0);
      $min = $child_span[0]
        if $child_span[0] < $min || $min == -1;
      $max = $child_span[1]
        if $child_span[1] > $max || $max == -1;
    }
    push @ret, $min, $max;
  }
  
  # The children of a '(' or ')' are everything in between the
  # parens 
  elsif ($node->{TYPE} eq 'OPEN') {
    # Find the corresponding CLOSE node 
    my $which = $node->{ARGS};
    my $close = $o->find_tnode(TYPE => 'CLOSE', ARGS => $which);
    my $close_id = $close->{__this__};
    my (undef,$opn) = $o->nodeRange($id, 0);
    my ($cls,undef) = $o->nodeRange($close_id, 0);
    push @ret, $opn + 1, $cls - 1;
  }
  elsif ($node->{TYPE} eq 'CLOSE') {
    # Find the corresponding OPEN node 
    my $which = $node->{ARGS};
    my $open = $o->find_tnode(TYPE => 'OPEN', ARGS => $which);
    my $open_id = $open->{__this__};
    my (undef,$opn) = $o->nodeRange($open_id, 0);
    my ($cls,undef) = $o->nodeRange($id, 0);
    push @ret, $opn + 1, $cls - 1;
  }

  # The "children" of a minmod should be the next node, plus its children.
  elsif ($node->{TYPE} eq 'MINMOD') {
    my $affected = $node->{NEXT};
    my ($start,undef) = $o->childNodesRange($affected);
    my (undef, $stop) = $o->nodeRange($affected, 0);
    push @ret, $start, $stop;
  }
  return wantarray ? @ret : \@ret;
}

sub nodeId {
  my $o = shift;
  my $offset = shift;

  if ($offset < 0 or $offset >= length $o->{regex}) {
    print STDERR "ActiveState::Rx::Info::nodeId($offset)\n";
    print STDERR "  Error: Offset out of range.\n";
    return;
  }

  my $uregex = $o->{uregex};
  my @sorted_ranges = @{$o->{ranges}};

  # now select the one we want:
  for (my $i=0; $i<@sorted_ranges; $i++) {
    my @q = @{$sorted_ranges[$i]};
    my $start_of_range = $q[0];
    my $end_of_range = $start_of_range + $q[1];
    
    if ($offset >= $start_of_range and $offset < $end_of_range) {
      return $q[2]
        if defined $uregex->{$q[2]};
      # This is an interesting case -- it means that node disappeared
      # at some point during optimization. The easiest way to see this
      # is in this expression: (ab)*
      #
      # OFFSET   =>   NODE   =>   TYPE
      # 0        =>   2      =>   OPTIMIZED
      # 1        =>   4      =>   EXACT
      # 2        =>   4      =>   EXACT
      # 3        =>   node not found
      # 4        =>   0      =>   CURLYM
      #
      # In this case, we can't highlight the node, find its parent,
      # or anything like that, since we have no idea which node it 
      # corresponded to in the original string.
      
      print STDERR "warning -- this node has been optimized away by " . 
        "Perl's regex engine!\n";
    }
  }
}

sub groupId {
  my $o = shift;
  my $id = shift;
  my $node = $o->get_tnode($id);
  return $node->{ARGS} if ($node->{TYPE} eq 'OPEN' or
                           $node->{TYPE} eq 'CLOSE');
  return 0;
}

# matchId() has nothing to do with match(). It returns the node which
# "matches" the node passed in. Currently, it only handles OPEN and
# CLOSE nodes.
sub matchId {
  my $o = shift;
  my $id = shift;
  my $m = "";

  my $node = $o->{uregex}{$id};
  if ($node->{TYPE} eq 'OPEN') {
    $m = $o->{groups}{$node->{ARGS}}{CLOSE};
  }
  elsif ($node->{TYPE} eq 'CLOSE') {
    $m = $o->{groups}{$node->{ARGS}}{OPEN};
  }
  return $m;
}

sub findnode {
  return find_tnode(@_)->{__this__};
}

#=============================================================================
# Subs below are for internal use only.
#=============================================================================

sub DESTROY {
  my $o = shift;

}

sub _sort_ranges {
  my $o = shift;
  
  my @offsets = @{$o->{uregex}{OFFSETS}};
  my @lengths = @{$o->{uregex}{LENGTHS}};
  
  my @sorted_ranges;
  for (my $i=0; $i<@offsets; $i++) {
    if (defined $offsets[$i] and defined $lengths[$i]) {
      push @sorted_ranges, [$offsets[$i] - 1,  # offset
                            $lengths[$i],      # length
                            $i,                # MJD's id
                           ];
    }
  }
  
  @sorted_ranges = sort { $a->[0] <=> $b->[0] } @sorted_ranges;
  $o->{ranges} = \@sorted_ranges;
}

sub _count_groups {
  my $o = shift;
  for my $key (keys %{$o->{uregex}}) {
    next if substr($key,0,2) eq "__" or $key eq 'OFFSETS' or $key eq 'LENGTHS';
    my $node = $o->{uregex}{$key};
    next unless defined $node->{TYPE};
    if ($node->{TYPE} eq 'OPEN' or
        $node->{TYPE} eq 'CLOSE') {
      $o->{groups}{$node->{ARGS}}{$node->{TYPE}} = $key;
    }
  }
}

sub _match {
  my $o = shift;
  my $target = shift;
  my @ret;
  return unless $target =~ $o->{cregex};
  for (my $i=0; $i<@+; $i++) {
    if ($+[$i] == $-[$i]) { push @ret, undef, undef }
    else {
      push @ret, $-[$i], $+[$i]-1 
        if $+[$i] >= 0 and $-[$i] >= 0;
    }
  }
  return @ret;
}

# We have to cheat a little to get the offset information
sub _multimatch {
  my $o = shift;
  my $target = shift;

  # Capture the "raw offsets"
  my $start = undef;
  my $end = 0;
  my @ret;
  while (1) {
    # Get one match (and break if it fails)
    my (@pairs) = $o->_match($target);
    last unless @pairs;

    # Remove the $& pair (the first pair)
    my @trunc = splice @pairs, 0, 2;
    for my $foo (@pairs) { $foo += $end if defined $foo; }

    # Update the span, set up the next target.
    $start = $trunc[0] unless defined $start;
    $end += $trunc[1] + 1;
    my $ntarget = substr($target, $trunc[1] + 1);
    last if $ntarget eq $target; # prevent infinite loop
    $target = $ntarget;

    # Add the shifted pairs to the return array
    push @ret, @pairs;
  }

  # Last-minute cleanup
  $end--;
  splice @ret, 0, 0, $start, $end;
  return @ret;
}

sub get_tnode {
  my $o = shift;
  my $id = shift;
  $o->{cached_tnodes}{$id} = $o->find_tnode($id)
    unless defined $o->{cached_tnodes}{$id};
  return $o->{cached_tnodes}{$id};
}

sub find_tnode {
  my $o = shift;
  my $list = ref $_[0] eq 'ARRAY' ? shift : $o->{tregex};
  my $id = shift if (@_ % 2);
  my %criteria = @_;
  $criteria{__this__} ||= $id if $id;

  for my $node (@$list) {
    my $matched = 1;
    for my $key (keys %criteria) {
      $matched &= (defined $node->{$key} and $node->{$key} eq $criteria{$key});
    }
    return $node if $matched;
    if ($node->{CHILD}) {
      my $n = $o->find_tnode($node->{CHILD}, %criteria);
      return $n if $n;
    }
  }
  return undef;
}

sub tip_star {
  my ($o, $n, $i, $h, $r, $m) = @_;
  my ($start, $stop) = $o->childNodesRange($i);
  my $child = substr($h->{REGEX},$start,$stop-$start+1);

  my $c = $o->get_tnode($n->{CHILD});
  return "Match '$child' 0 or more times" if $c->{TYPE} eq 'EXACT';
  return "Match <$child> 0 or more times";
}

sub tip_plus {
  my ($o, $n, $i, $h, $r, $m) = @_;
  my ($start, $stop) = $o->childNodesRange($i);
  my $child = substr($h->{REGEX},$start,$stop-$start+1);
  my $c = $o->get_tnode($n->{CHILD});
  return "Match '$child' 1 or more times" if $c->{TYPE} eq 'EXACT';
  return "Match <$child> 1 or more times";
}

sub tip_curly {
  my ($o, $n, $i, $h, $r, $m) = @_;
  my ($min, $max) = @{$n->{ARGS}};
  my ($start, $stop) = $o->childNodesRange($i);
  my $child = substr($h->{REGEX},$start,$stop-$start+1);
  my $c = $o->get_tnode($n->{CHILD});
  return "Match '$child' $min to $max times" if $c->{TYPE} eq 'EXACT';
  return "Match <$child> $min to $max times";
}

sub tip_curlyx {
  my ($o, $n, $i, $h, $r, $m) = @_;
  my ($min, $max) = @{$n->{ARGS}};
  my ($start,$stop) = $o->childNodesRange($i);
  my $child = substr($h->{REGEX},$start,$stop-$start+1);
  my $quant;
  if ($max == 32767 or
      $max == 2147483647) {
    $quant = "$min or more";
  }
  else {
    $quant = "$min to $max";
  }
  return "Match <$child> $quant times";
}

sub tip_anyof {
  my ($o, $n, $i, $h, $r, $m) = @_;
  my ($start,$stop) = $o->nodeRange($i,0);
  my $klass = substr($h->{REGEX},$start,$stop-$start+1);
  my $not = "";
  if (substr($klass, 1, 1) eq '^') {
    substr($klass, 1, 1, "");
    $not = " not";
  }
  return "Match any character$not in $klass";
}

sub tip_minmod {
  my ($o, $n, $i, $h, $r, $m) = @_;
  my $affected = $n->{NEXT};
  my ($start,undef) = $o->childNodesRange($affected);
  my (undef,$stop) = $o->nodeRange($affected,0);
  my $str = substr($h->{REGEX}, $start, $stop-$start+1);
  return "Match <$str> non-greedily";
}

BEGIN {
  %tips =
    (
     END => q{"End of regular expression"},
     SUCCEED => q{"Return from a subexpression"},
     BOL => q{"Match the beginning of the string"},
     MBOL => q{"Match the beginning of any line"},
     SBOL => q{"Match the beginning of the string"},
     EOS => q{"Match the end of the string"},
     EOL => q{"Match the end of the string"},
     MEOL => q{"Match the end of any line"},
     SEOL => q{"Match the end of the line"},
     BOUND => q{"Match any word boundary"},
     BOUNDL => q{"Match any word boundary"},
     NBOUND => q{"Match any word non-boundary"},
     NBOUNDL => q{"Match any word non-boundary"},
     GPOS => q{"Matches where last m//g left off"},
     
     # [Special] alternatives
     REG_ANY => q{"Match any one character (except newline)"},
     ANY => q{"Match any one character (except newline)"},
     SANY => q{"Match any one character (including newline)"},
     ANYOF => q{tip_anyof(@_)},
     ALNUM => q{"Match any alphanumeric character"},
     ALNUML => q{"Match any alphanumeric char in locale"},
     NALNUM => q{"Match any non-alphanumeric character"},
     NALNUML => q{"Match any non-alphanumeric char in locale"},
     SPACE => q{"Match any whitespace character"},
     SPACEL => q{"Match any whitespace char in locale"},
     NSPACE => q{"Match any non-whitespace character"},
     NSPACEL => q{"Match any non-whitespace char in locale"},
     DIGIT => q{"Match any numeric character"},
     NDIGIT => q{"Match any non-numeric character"},
     
     # BRANCH    The set of branches constituting a single choice are hooked
     #           together with their "next" pointers, since precedence prevents
     #           anything being concatenated to any individual branch.  The
     #           "next" pointer of the last BRANCH in a choice points to the
     #           thing following the whole choice.  This is also where the
     #           final "next" pointer of each individual branch points; each
     #           branch starts with the operand node of a BRANCH node.
     #
     BRANCH => q{"Match this alternative, or the next"},
     
     # BACK      Normal "next" pointers all implicitly point forward; BACK
     #           exists to make loop structures possible.
     # not used
     BACK => q{"Match \"\", \"next\" ptr points backward"},
     
     # Literals
     EXACT => q{"Match '${\\$n->{STRING}}'"},
     EXACTF => q{"Match '${\\$n->{STRING}}'"},
     EXACTFL => q{"Match '${\\$n->{STRING}}'"},
     
     # Do nothing
     NOTHING => q{"Match empty string"},
     # A variant of above which delimits a group, thus stops optimizations
     TAIL => q{"Match empty string"},
     
     # STAR,PLUS '?', and complex '*' and '+', are implemented as circular
     #           BRANCH structures using BACK.  Simple cases (one character
     #           per match) are implemented with STAR and PLUS for speed
     #           and to minimize recursive plunges.
     #
     STAR => q{tip_star(@_)},
     PLUS => q{tip_plus(@_)},
     CURLY => q{tip_curly(@_)},
     CURLYN => q{"Match next-after-this simple thing"},
     CURLYM => q{"Match this medium-complex thing {n,m} times"},
     CURLYX => q{tip_curlyx(@_)},
     
     # This terminator creates a loop structure for CURLYX
     WHILEM => q{"Do curly processing and see if rest matches"},
     
     # OPEN,CLOSE,GROUPP ...are numbered at compile time.
     OPEN => q{"Capture group \$${\\$n->{ARGS}}"},
     CLOSE => q{"Capture group \$${\\$n->{ARGS}}"},
     
     REF => q{"Match some already matched string"},
     REFF => q{"Match some already matched string"},
     REFFL => q{"Match some already matched string"},
     
     # grouping assertions
     IFMATCH => q{"Succeeds if the following matches"},
     UNLESSM => q{"Fails if the following matches"},
     SUSPEND => q{"Independent sub-RE"},
     IFTHEN => q{"Switch, should be preceeded by switcher"},
     GROUPP => q{"Whether the group matched"},
     
     # Support for long RE
     LONGJMP => q{"Jump far away"},
     BRANCHJ => q{"BRANCH with long offset"},
     
     # The heavy worker
     EVAL => q{"Execute some Perl code"},
     
     # Modifiers
     MINMOD => q{tip_minmod(@_)},
     LOGICAL => q{"${\\$h->{$n->{NEXT}}->{TYPE}} should set the flag only"},
     
     # This is not used yet
     RENUM => q{"Group with independently numbered parens"},
     
     # This is not really a node, but an optimized away piece of a "long" node.
     # To simplify debugging output, we mark it as if it were a node
     OPTIMIZED => q{"Placeholder for dump"},
    );
}

__END__

=head1 NAME

ActiveState::Rx::Info -- An object-oriented interface to the Regular Expression debugger.

=head1 SYNOPSIS

  use ActiveState::Rx::Info;

  my $obj = ActiveState::Rx::Info->new('(.*)(\d+)');
  print "Matched!" if ($obj->match('testing 123'));
  print "The number of groups in this regex is: $obj->groupCount\n";
  my $nid = $obj->findnode(TYPE => 'OPEN', ARGS => 1);
  print "The start of group 1 is at offset: ", 
    $obj->nodeRange($nid), "\n";

This complete program prints out:

  Matched!
  The number of groups in this regex is: 2
  The start of group 1 is at offset: 0

=head1 DESCRIPTION

ActiveState::Rx::Info is designed to provide a higher level
abstraction of the regular expression debugger than does
ActiveState::Rx. The modified compiler and executor are kept in
ActiveState::Rx, but ActiveState::Rx::Info makes it easier to use.

=head1 API

The following sections document the methods available from
ActiveState::Rx::Info.

=head2 new(regex[, modifiers])

Creates a ActiveState::Rx::Info object. 'regex' is the regular
expression to generate information about, and 'modifiers' is an
optional parameter containing perl modifiers g, i, s, m, o, and x.

=head2 regex()

Returns the string form of the regular expression stored in the object.

=head2 modifiers()

Returns the string form of the modifiers stored in the object.

=head2 groupCount()

Returns the number of groups found in the regex. For example,

  use ActiveState::Rx::Info;
  my $gc = ActiveState::Rx::Info->new('(abc*)')->groupCount;

In this example, C<$gc> will be set to 1. 

=head2 nodeId(offset)

Returns the 'node id' of the node found at the given offset into the
regular expression string. Most API functions in ActiveState::Rx::Info
operate on a node id, since that is how regular expressions are
manipulated internally.

=head2 maxLevel(nodeId)

Returns the maximum 'level' of the node. Level is an abstract concept
-- so abstract it hasn't even been nailed down. Yet. This function
currently doesn't do anything except return 0.

=head2 match(target)

Attempts to apply the regular expression to the target string. Returns
a list of offsets in the target string, designed to aid highlighting
the parts of the string which corresponded to groups in the regular
expression.

Here is an example:

  use ActiveState::Rx::Info;
  my @m = ActiveState::Rx::Info->new('(.*)(\d+)')->match('testing123');

In this example, C<@m> is set to (0, 9, 0, 8, 9, 9). These numbers
represent three pairs of numbers: (0, 9), (0, 8), and (9, 9). I<These>
pairs represent substrings of the target string corresponding to
matches. The first pair is always the substring C<$&>, or the extents
of the match. The remaining pairs all refer to C<$1>, C<$2>, and so
on. If global matching is turned on, then there will be I<one> C<$&>
at the beginning, and one pair for each iteration of the match.

If no string was matched by the particular pair, they are both undef.

=head2 nodeTip(nodeId)

Returns a node tip corresponding to the given regular expression
node. For example:

  use ActiveState::Rx::Info; 
  my $o = ActiveState::Rx::Info->new('abc*'); 
  print $o->nodeTip($o->nodeId(0));

will print I<Match 'ab'>. 

=head2 nodeRange(nodeId)

Returns the range of the node in the regular expression string. For example:

  use ActiveState::Rx::Info;
  my $o = ActiveState::Rx::Info->new('abc*');
  print join ', ', $o->nodeRange($o->nodeId(0));

will print I<0, 1>. 

=head2 childNodesRange(nodeId)

Returns the range of any children of the given node. Some nodes do not have
children; they will return an empty list.

=head2 groupId(nodeId)

Returns the group number that nodeId refers to. Only supported if nodeId 
is either an OPEN or CLOSE node. 

=head2 matchId(nodeId)

Returns the nodeId of a node which "matches" the given node. Currently only
implemented if nodeId refers to a OPEN or CLOSE node. If nodeId returns to 
an OPEN node, it returns the node id of the corresponding CLOSE, and vice 
versa.

=head2 findnode(criteria)

Searches the nodes in the regular expression for a matching node. Returns the
node id of the matching node structure. For example:

  use ActiveState::Rx::Info;
  my $o = ActiveState::Rx::Info->new('ab(c*)');
  my $nid = $o->findnode(TYPE => OPEN, ARGS => 1);

This example set C<$nid> to the node id referring to the first OPEN node
in the regular expression. 

=head1 AUTHOR

Neil Watkiss <[email protected]>
ActiveState Corporation

=head1 COPYRIGHT

Copyright (c) 2001, ActiveState SRL.

=cut