Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

676 lines
20 KiB

  1. package ActiveState::Rx::Info;
  2. use ActiveState::Rx;
  3. our $VERSION = 0.10;
  4. #=============================================================================
  5. # The following subs are the API, accessed from clients.
  6. #=============================================================================
  7. sub new {
  8. my $class = shift;
  9. my $regex = shift || "";
  10. my $mods = shift || "";
  11. my $o = bless { regex => $regex,
  12. mods => $mods,
  13. }, $class;
  14. $o->{global} = 1 if ($mods =~ s/g//);
  15. $o->{cregex} = eval qq|qr{$regex}$mods|;
  16. $o->{uregex} = ActiveState::Rx::rxdump($regex,$mods);
  17. $o->{tregex} = ActiveState::Rx::translate_tree($o->{uregex}, 0);
  18. $o->_sort_ranges;
  19. $o->_count_groups;
  20. return $o;
  21. }
  22. sub regex {
  23. my $o = shift;
  24. return $o->{regex};
  25. }
  26. sub modifiers {
  27. my $o = shift;
  28. return $o->{mods}
  29. }
  30. sub groupCount {
  31. my $o = shift;
  32. return scalar keys %{$o->{groups}};
  33. }
  34. sub maxLevel {
  35. my $o = shift;
  36. my $nodeId = shift;
  37. return 0;
  38. }
  39. sub match {
  40. my $o = shift;
  41. my $target = shift;
  42. return $o->_multimatch($target)
  43. if $o->{global};
  44. return $o->_match($target);
  45. }
  46. my %tips;
  47. sub nodeTip {
  48. my $o = shift;
  49. my $nodeID = shift;
  50. my $regex = $o->{regex};
  51. my $modifiers = $o->{mods};
  52. my $uregex = $o->{uregex};
  53. do {
  54. my $n = $uregex->{$nodeID};
  55. my $i = $nodeID;
  56. my $h = $uregex;
  57. my $r = $regex;
  58. my $m = $modifiers;
  59. @_ = ($o, $n, $i, $h, $r, $m); # If a sub is called, it gets all these.
  60. return eval $tips{$uregex->{$nodeID}{TYPE}};
  61. };
  62. }
  63. sub nodeRange {
  64. my $o = shift;
  65. my $id = shift;
  66. my $level = shift;
  67. my @ret;
  68. return unless $id ne "";
  69. my @offsets = @{$o->{uregex}{OFFSETS}};
  70. my @lengths = @{$o->{uregex}{LENGTHS}};
  71. if (defined $offsets[$id] and defined $lengths[$id]) {
  72. my $start = $offsets[$id] - 1;
  73. my $end = $start + $lengths[$id] - 1;
  74. push @ret, $start, $end;
  75. }
  76. return wantarray ? @ret : \@ret;
  77. }
  78. sub childNodesRange {
  79. my $o = shift;
  80. my $id = shift;
  81. my @ret;
  82. my $node = $o->get_tnode($id);
  83. if ($node->{CHILD}) {
  84. my @children = @{$node->{CHILD}};
  85. # max and min are first set to an extremely large number.
  86. my $max = -1;
  87. my $min = -1;
  88. # find the span of the child nodes
  89. for my $child (@children) {
  90. my $child_id = $child->{__this__};
  91. my @child_span = $o->nodeRange($child_id, 0);
  92. $min = $child_span[0]
  93. if $child_span[0] < $min || $min == -1;
  94. $max = $child_span[1]
  95. if $child_span[1] > $max || $max == -1;
  96. }
  97. push @ret, $min, $max;
  98. }
  99. # The children of a '(' or ')' are everything in between the
  100. # parens
  101. elsif ($node->{TYPE} eq 'OPEN') {
  102. # Find the corresponding CLOSE node
  103. my $which = $node->{ARGS};
  104. my $close = $o->find_tnode(TYPE => 'CLOSE', ARGS => $which);
  105. my $close_id = $close->{__this__};
  106. my (undef,$opn) = $o->nodeRange($id, 0);
  107. my ($cls,undef) = $o->nodeRange($close_id, 0);
  108. push @ret, $opn + 1, $cls - 1;
  109. }
  110. elsif ($node->{TYPE} eq 'CLOSE') {
  111. # Find the corresponding OPEN node
  112. my $which = $node->{ARGS};
  113. my $open = $o->find_tnode(TYPE => 'OPEN', ARGS => $which);
  114. my $open_id = $open->{__this__};
  115. my (undef,$opn) = $o->nodeRange($open_id, 0);
  116. my ($cls,undef) = $o->nodeRange($id, 0);
  117. push @ret, $opn + 1, $cls - 1;
  118. }
  119. # The "children" of a minmod should be the next node, plus its children.
  120. elsif ($node->{TYPE} eq 'MINMOD') {
  121. my $affected = $node->{NEXT};
  122. my ($start,undef) = $o->childNodesRange($affected);
  123. my (undef, $stop) = $o->nodeRange($affected, 0);
  124. push @ret, $start, $stop;
  125. }
  126. return wantarray ? @ret : \@ret;
  127. }
  128. sub nodeId {
  129. my $o = shift;
  130. my $offset = shift;
  131. if ($offset < 0 or $offset >= length $o->{regex}) {
  132. print STDERR "ActiveState::Rx::Info::nodeId($offset)\n";
  133. print STDERR " Error: Offset out of range.\n";
  134. return;
  135. }
  136. my $uregex = $o->{uregex};
  137. my @sorted_ranges = @{$o->{ranges}};
  138. # now select the one we want:
  139. for (my $i=0; $i<@sorted_ranges; $i++) {
  140. my @q = @{$sorted_ranges[$i]};
  141. my $start_of_range = $q[0];
  142. my $end_of_range = $start_of_range + $q[1];
  143. if ($offset >= $start_of_range and $offset < $end_of_range) {
  144. return $q[2]
  145. if defined $uregex->{$q[2]};
  146. # This is an interesting case -- it means that node disappeared
  147. # at some point during optimization. The easiest way to see this
  148. # is in this expression: (ab)*
  149. #
  150. # OFFSET => NODE => TYPE
  151. # 0 => 2 => OPTIMIZED
  152. # 1 => 4 => EXACT
  153. # 2 => 4 => EXACT
  154. # 3 => node not found
  155. # 4 => 0 => CURLYM
  156. #
  157. # In this case, we can't highlight the node, find its parent,
  158. # or anything like that, since we have no idea which node it
  159. # corresponded to in the original string.
  160. print STDERR "warning -- this node has been optimized away by " .
  161. "Perl's regex engine!\n";
  162. }
  163. }
  164. }
  165. sub groupId {
  166. my $o = shift;
  167. my $id = shift;
  168. my $node = $o->get_tnode($id);
  169. return $node->{ARGS} if ($node->{TYPE} eq 'OPEN' or
  170. $node->{TYPE} eq 'CLOSE');
  171. return 0;
  172. }
  173. # matchId() has nothing to do with match(). It returns the node which
  174. # "matches" the node passed in. Currently, it only handles OPEN and
  175. # CLOSE nodes.
  176. sub matchId {
  177. my $o = shift;
  178. my $id = shift;
  179. my $m = "";
  180. my $node = $o->{uregex}{$id};
  181. if ($node->{TYPE} eq 'OPEN') {
  182. $m = $o->{groups}{$node->{ARGS}}{CLOSE};
  183. }
  184. elsif ($node->{TYPE} eq 'CLOSE') {
  185. $m = $o->{groups}{$node->{ARGS}}{OPEN};
  186. }
  187. return $m;
  188. }
  189. sub findnode {
  190. return find_tnode(@_)->{__this__};
  191. }
  192. #=============================================================================
  193. # Subs below are for internal use only.
  194. #=============================================================================
  195. sub DESTROY {
  196. my $o = shift;
  197. }
  198. sub _sort_ranges {
  199. my $o = shift;
  200. my @offsets = @{$o->{uregex}{OFFSETS}};
  201. my @lengths = @{$o->{uregex}{LENGTHS}};
  202. my @sorted_ranges;
  203. for (my $i=0; $i<@offsets; $i++) {
  204. if (defined $offsets[$i] and defined $lengths[$i]) {
  205. push @sorted_ranges, [$offsets[$i] - 1, # offset
  206. $lengths[$i], # length
  207. $i, # MJD's id
  208. ];
  209. }
  210. }
  211. @sorted_ranges = sort { $a->[0] <=> $b->[0] } @sorted_ranges;
  212. $o->{ranges} = \@sorted_ranges;
  213. }
  214. sub _count_groups {
  215. my $o = shift;
  216. for my $key (keys %{$o->{uregex}}) {
  217. next if substr($key,0,2) eq "__" or $key eq 'OFFSETS' or $key eq 'LENGTHS';
  218. my $node = $o->{uregex}{$key};
  219. next unless defined $node->{TYPE};
  220. if ($node->{TYPE} eq 'OPEN' or
  221. $node->{TYPE} eq 'CLOSE') {
  222. $o->{groups}{$node->{ARGS}}{$node->{TYPE}} = $key;
  223. }
  224. }
  225. }
  226. sub _match {
  227. my $o = shift;
  228. my $target = shift;
  229. my @ret;
  230. return unless $target =~ $o->{cregex};
  231. for (my $i=0; $i<@+; $i++) {
  232. if ($+[$i] == $-[$i]) { push @ret, undef, undef }
  233. else {
  234. push @ret, $-[$i], $+[$i]-1
  235. if $+[$i] >= 0 and $-[$i] >= 0;
  236. }
  237. }
  238. return @ret;
  239. }
  240. # We have to cheat a little to get the offset information
  241. sub _multimatch {
  242. my $o = shift;
  243. my $target = shift;
  244. # Capture the "raw offsets"
  245. my $start = undef;
  246. my $end = 0;
  247. my @ret;
  248. while (1) {
  249. # Get one match (and break if it fails)
  250. my (@pairs) = $o->_match($target);
  251. last unless @pairs;
  252. # Remove the $& pair (the first pair)
  253. my @trunc = splice @pairs, 0, 2;
  254. for my $foo (@pairs) { $foo += $end if defined $foo; }
  255. # Update the span, set up the next target.
  256. $start = $trunc[0] unless defined $start;
  257. $end += $trunc[1] + 1;
  258. my $ntarget = substr($target, $trunc[1] + 1);
  259. last if $ntarget eq $target; # prevent infinite loop
  260. $target = $ntarget;
  261. # Add the shifted pairs to the return array
  262. push @ret, @pairs;
  263. }
  264. # Last-minute cleanup
  265. $end--;
  266. splice @ret, 0, 0, $start, $end;
  267. return @ret;
  268. }
  269. sub get_tnode {
  270. my $o = shift;
  271. my $id = shift;
  272. $o->{cached_tnodes}{$id} = $o->find_tnode($id)
  273. unless defined $o->{cached_tnodes}{$id};
  274. return $o->{cached_tnodes}{$id};
  275. }
  276. sub find_tnode {
  277. my $o = shift;
  278. my $list = ref $_[0] eq 'ARRAY' ? shift : $o->{tregex};
  279. my $id = shift if (@_ % 2);
  280. my %criteria = @_;
  281. $criteria{__this__} ||= $id if $id;
  282. for my $node (@$list) {
  283. my $matched = 1;
  284. for my $key (keys %criteria) {
  285. $matched &= (defined $node->{$key} and $node->{$key} eq $criteria{$key});
  286. }
  287. return $node if $matched;
  288. if ($node->{CHILD}) {
  289. my $n = $o->find_tnode($node->{CHILD}, %criteria);
  290. return $n if $n;
  291. }
  292. }
  293. return undef;
  294. }
  295. sub tip_star {
  296. my ($o, $n, $i, $h, $r, $m) = @_;
  297. my ($start, $stop) = $o->childNodesRange($i);
  298. my $child = substr($h->{REGEX},$start,$stop-$start+1);
  299. my $c = $o->get_tnode($n->{CHILD});
  300. return "Match '$child' 0 or more times" if $c->{TYPE} eq 'EXACT';
  301. return "Match <$child> 0 or more times";
  302. }
  303. sub tip_plus {
  304. my ($o, $n, $i, $h, $r, $m) = @_;
  305. my ($start, $stop) = $o->childNodesRange($i);
  306. my $child = substr($h->{REGEX},$start,$stop-$start+1);
  307. my $c = $o->get_tnode($n->{CHILD});
  308. return "Match '$child' 1 or more times" if $c->{TYPE} eq 'EXACT';
  309. return "Match <$child> 1 or more times";
  310. }
  311. sub tip_curly {
  312. my ($o, $n, $i, $h, $r, $m) = @_;
  313. my ($min, $max) = @{$n->{ARGS}};
  314. my ($start, $stop) = $o->childNodesRange($i);
  315. my $child = substr($h->{REGEX},$start,$stop-$start+1);
  316. my $c = $o->get_tnode($n->{CHILD});
  317. return "Match '$child' $min to $max times" if $c->{TYPE} eq 'EXACT';
  318. return "Match <$child> $min to $max times";
  319. }
  320. sub tip_curlyx {
  321. my ($o, $n, $i, $h, $r, $m) = @_;
  322. my ($min, $max) = @{$n->{ARGS}};
  323. my ($start,$stop) = $o->childNodesRange($i);
  324. my $child = substr($h->{REGEX},$start,$stop-$start+1);
  325. my $quant;
  326. if ($max == 32767 or
  327. $max == 2147483647) {
  328. $quant = "$min or more";
  329. }
  330. else {
  331. $quant = "$min to $max";
  332. }
  333. return "Match <$child> $quant times";
  334. }
  335. sub tip_anyof {
  336. my ($o, $n, $i, $h, $r, $m) = @_;
  337. my ($start,$stop) = $o->nodeRange($i,0);
  338. my $klass = substr($h->{REGEX},$start,$stop-$start+1);
  339. my $not = "";
  340. if (substr($klass, 1, 1) eq '^') {
  341. substr($klass, 1, 1, "");
  342. $not = " not";
  343. }
  344. return "Match any character$not in $klass";
  345. }
  346. sub tip_minmod {
  347. my ($o, $n, $i, $h, $r, $m) = @_;
  348. my $affected = $n->{NEXT};
  349. my ($start,undef) = $o->childNodesRange($affected);
  350. my (undef,$stop) = $o->nodeRange($affected,0);
  351. my $str = substr($h->{REGEX}, $start, $stop-$start+1);
  352. return "Match <$str> non-greedily";
  353. }
  354. BEGIN {
  355. %tips =
  356. (
  357. END => q{"End of regular expression"},
  358. SUCCEED => q{"Return from a subexpression"},
  359. BOL => q{"Match the beginning of the string"},
  360. MBOL => q{"Match the beginning of any line"},
  361. SBOL => q{"Match the beginning of the string"},
  362. EOS => q{"Match the end of the string"},
  363. EOL => q{"Match the end of the string"},
  364. MEOL => q{"Match the end of any line"},
  365. SEOL => q{"Match the end of the line"},
  366. BOUND => q{"Match any word boundary"},
  367. BOUNDL => q{"Match any word boundary"},
  368. NBOUND => q{"Match any word non-boundary"},
  369. NBOUNDL => q{"Match any word non-boundary"},
  370. GPOS => q{"Matches where last m//g left off"},
  371. # [Special] alternatives
  372. REG_ANY => q{"Match any one character (except newline)"},
  373. ANY => q{"Match any one character (except newline)"},
  374. SANY => q{"Match any one character (including newline)"},
  375. ANYOF => q{tip_anyof(@_)},
  376. ALNUM => q{"Match any alphanumeric character"},
  377. ALNUML => q{"Match any alphanumeric char in locale"},
  378. NALNUM => q{"Match any non-alphanumeric character"},
  379. NALNUML => q{"Match any non-alphanumeric char in locale"},
  380. SPACE => q{"Match any whitespace character"},
  381. SPACEL => q{"Match any whitespace char in locale"},
  382. NSPACE => q{"Match any non-whitespace character"},
  383. NSPACEL => q{"Match any non-whitespace char in locale"},
  384. DIGIT => q{"Match any numeric character"},
  385. NDIGIT => q{"Match any non-numeric character"},
  386. # BRANCH The set of branches constituting a single choice are hooked
  387. # together with their "next" pointers, since precedence prevents
  388. # anything being concatenated to any individual branch. The
  389. # "next" pointer of the last BRANCH in a choice points to the
  390. # thing following the whole choice. This is also where the
  391. # final "next" pointer of each individual branch points; each
  392. # branch starts with the operand node of a BRANCH node.
  393. #
  394. BRANCH => q{"Match this alternative, or the next"},
  395. # BACK Normal "next" pointers all implicitly point forward; BACK
  396. # exists to make loop structures possible.
  397. # not used
  398. BACK => q{"Match \"\", \"next\" ptr points backward"},
  399. # Literals
  400. EXACT => q{"Match '${\\$n->{STRING}}'"},
  401. EXACTF => q{"Match '${\\$n->{STRING}}'"},
  402. EXACTFL => q{"Match '${\\$n->{STRING}}'"},
  403. # Do nothing
  404. NOTHING => q{"Match empty string"},
  405. # A variant of above which delimits a group, thus stops optimizations
  406. TAIL => q{"Match empty string"},
  407. # STAR,PLUS '?', and complex '*' and '+', are implemented as circular
  408. # BRANCH structures using BACK. Simple cases (one character
  409. # per match) are implemented with STAR and PLUS for speed
  410. # and to minimize recursive plunges.
  411. #
  412. STAR => q{tip_star(@_)},
  413. PLUS => q{tip_plus(@_)},
  414. CURLY => q{tip_curly(@_)},
  415. CURLYN => q{"Match next-after-this simple thing"},
  416. CURLYM => q{"Match this medium-complex thing {n,m} times"},
  417. CURLYX => q{tip_curlyx(@_)},
  418. # This terminator creates a loop structure for CURLYX
  419. WHILEM => q{"Do curly processing and see if rest matches"},
  420. # OPEN,CLOSE,GROUPP ...are numbered at compile time.
  421. OPEN => q{"Capture group \$${\\$n->{ARGS}}"},
  422. CLOSE => q{"Capture group \$${\\$n->{ARGS}}"},
  423. REF => q{"Match some already matched string"},
  424. REFF => q{"Match some already matched string"},
  425. REFFL => q{"Match some already matched string"},
  426. # grouping assertions
  427. IFMATCH => q{"Succeeds if the following matches"},
  428. UNLESSM => q{"Fails if the following matches"},
  429. SUSPEND => q{"Independent sub-RE"},
  430. IFTHEN => q{"Switch, should be preceeded by switcher"},
  431. GROUPP => q{"Whether the group matched"},
  432. # Support for long RE
  433. LONGJMP => q{"Jump far away"},
  434. BRANCHJ => q{"BRANCH with long offset"},
  435. # The heavy worker
  436. EVAL => q{"Execute some Perl code"},
  437. # Modifiers
  438. MINMOD => q{tip_minmod(@_)},
  439. LOGICAL => q{"${\\$h->{$n->{NEXT}}->{TYPE}} should set the flag only"},
  440. # This is not used yet
  441. RENUM => q{"Group with independently numbered parens"},
  442. # This is not really a node, but an optimized away piece of a "long" node.
  443. # To simplify debugging output, we mark it as if it were a node
  444. OPTIMIZED => q{"Placeholder for dump"},
  445. );
  446. }
  447. __END__
  448. =head1 NAME
  449. ActiveState::Rx::Info -- An object-oriented interface to the Regular Expression debugger.
  450. =head1 SYNOPSIS
  451. use ActiveState::Rx::Info;
  452. my $obj = ActiveState::Rx::Info->new('(.*)(\d+)');
  453. print "Matched!" if ($obj->match('testing 123'));
  454. print "The number of groups in this regex is: $obj->groupCount\n";
  455. my $nid = $obj->findnode(TYPE => 'OPEN', ARGS => 1);
  456. print "The start of group 1 is at offset: ",
  457. $obj->nodeRange($nid), "\n";
  458. This complete program prints out:
  459. Matched!
  460. The number of groups in this regex is: 2
  461. The start of group 1 is at offset: 0
  462. =head1 DESCRIPTION
  463. ActiveState::Rx::Info is designed to provide a higher level
  464. abstraction of the regular expression debugger than does
  465. ActiveState::Rx. The modified compiler and executor are kept in
  466. ActiveState::Rx, but ActiveState::Rx::Info makes it easier to use.
  467. =head1 API
  468. The following sections document the methods available from
  469. ActiveState::Rx::Info.
  470. =head2 new(regex[, modifiers])
  471. Creates a ActiveState::Rx::Info object. 'regex' is the regular
  472. expression to generate information about, and 'modifiers' is an
  473. optional parameter containing perl modifiers g, i, s, m, o, and x.
  474. =head2 regex()
  475. Returns the string form of the regular expression stored in the object.
  476. =head2 modifiers()
  477. Returns the string form of the modifiers stored in the object.
  478. =head2 groupCount()
  479. Returns the number of groups found in the regex. For example,
  480. use ActiveState::Rx::Info;
  481. my $gc = ActiveState::Rx::Info->new('(abc*)')->groupCount;
  482. In this example, C<$gc> will be set to 1.
  483. =head2 nodeId(offset)
  484. Returns the 'node id' of the node found at the given offset into the
  485. regular expression string. Most API functions in ActiveState::Rx::Info
  486. operate on a node id, since that is how regular expressions are
  487. manipulated internally.
  488. =head2 maxLevel(nodeId)
  489. Returns the maximum 'level' of the node. Level is an abstract concept
  490. -- so abstract it hasn't even been nailed down. Yet. This function
  491. currently doesn't do anything except return 0.
  492. =head2 match(target)
  493. Attempts to apply the regular expression to the target string. Returns
  494. a list of offsets in the target string, designed to aid highlighting
  495. the parts of the string which corresponded to groups in the regular
  496. expression.
  497. Here is an example:
  498. use ActiveState::Rx::Info;
  499. my @m = ActiveState::Rx::Info->new('(.*)(\d+)')->match('testing123');
  500. In this example, C<@m> is set to (0, 9, 0, 8, 9, 9). These numbers
  501. represent three pairs of numbers: (0, 9), (0, 8), and (9, 9). I<These>
  502. pairs represent substrings of the target string corresponding to
  503. matches. The first pair is always the substring C<$&>, or the extents
  504. of the match. The remaining pairs all refer to C<$1>, C<$2>, and so
  505. on. If global matching is turned on, then there will be I<one> C<$&>
  506. at the beginning, and one pair for each iteration of the match.
  507. If no string was matched by the particular pair, they are both undef.
  508. =head2 nodeTip(nodeId)
  509. Returns a node tip corresponding to the given regular expression
  510. node. For example:
  511. use ActiveState::Rx::Info;
  512. my $o = ActiveState::Rx::Info->new('abc*');
  513. print $o->nodeTip($o->nodeId(0));
  514. will print I<Match 'ab'>.
  515. =head2 nodeRange(nodeId)
  516. Returns the range of the node in the regular expression string. For example:
  517. use ActiveState::Rx::Info;
  518. my $o = ActiveState::Rx::Info->new('abc*');
  519. print join ', ', $o->nodeRange($o->nodeId(0));
  520. will print I<0, 1>.
  521. =head2 childNodesRange(nodeId)
  522. Returns the range of any children of the given node. Some nodes do not have
  523. children; they will return an empty list.
  524. =head2 groupId(nodeId)
  525. Returns the group number that nodeId refers to. Only supported if nodeId
  526. is either an OPEN or CLOSE node.
  527. =head2 matchId(nodeId)
  528. Returns the nodeId of a node which "matches" the given node. Currently only
  529. implemented if nodeId refers to a OPEN or CLOSE node. If nodeId returns to
  530. an OPEN node, it returns the node id of the corresponding CLOSE, and vice
  531. versa.
  532. =head2 findnode(criteria)
  533. Searches the nodes in the regular expression for a matching node. Returns the
  534. node id of the matching node structure. For example:
  535. use ActiveState::Rx::Info;
  536. my $o = ActiveState::Rx::Info->new('ab(c*)');
  537. my $nid = $o->findnode(TYPE => OPEN, ARGS => 1);
  538. This example set C<$nid> to the node id referring to the first OPEN node
  539. in the regular expression.
  540. =head1 AUTHOR
  541. Neil Watkiss <[email protected]>
  542. ActiveState Corporation
  543. =head1 COPYRIGHT
  544. Copyright (c) 2001, ActiveState SRL.
  545. =cut