Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1297 lines
35 KiB

  1. //+-------------------------------------------------------------------------
  2. //
  3. // Copyright (C) 1991, Microsoft Corporation.
  4. //
  5. // File: FA.cxx
  6. //
  7. // Contents: Non-deterministic finite automata
  8. //
  9. // Classes: CNFA
  10. //
  11. // History: 01-20-92 KyleP Created
  12. // 03-11-97 arunk Modified for Kessel
  13. //--------------------------------------------------------------------------
  14. #include <fa.hxx>
  15. #include <stateset.hxx>
  16. //+-------------------------------------------------------------------------
  17. //
  18. // Member: CFA::CFA, public
  19. //
  20. // Synopsis: Copy constructor
  21. //
  22. // History: 13-Jul-95 KyleP Created
  23. //
  24. //--------------------------------------------------------------------------
  25. CFA::CFA( CFA const & src )
  26. : _cTotal( src._cTotal ),
  27. _ppState( 0 )
  28. {
  29. _ppState = new CFAState * [ _cTotal ];
  30. unsigned i = 0;
  31. for ( ; i < _cTotal; i++ )
  32. {
  33. if ( 0 == src._ppState[i] )
  34. _ppState[i] = 0;
  35. else
  36. _ppState[i] = new CFAState( *src._ppState[i] );
  37. }
  38. }
  39. //+-------------------------------------------------------------------------
  40. //
  41. // Member: CFA::~CFA, protected
  42. //
  43. // Synopsis: Frees automata.
  44. //
  45. // History: 20-Jan-92 KyleP Created
  46. //
  47. //--------------------------------------------------------------------------
  48. CFA::~CFA()
  49. {
  50. if( _ppState )
  51. {
  52. for ( UINT i = 0; i < _cTotal; i++ )
  53. {
  54. delete _ppState[i];
  55. }
  56. delete _ppState;
  57. }
  58. }
  59. //+-------------------------------------------------------------------------
  60. //
  61. // Member: CFA::Add, protected
  62. //
  63. // Synopsis: Adds new state to automata.
  64. //
  65. // Arguments: [pState] -- New state. State number is member data.
  66. //
  67. // History: 20-Jan-92 KyleP Created
  68. //
  69. //--------------------------------------------------------------------------
  70. void CFA::Add( CFAState * pState )
  71. {
  72. if ( pState->StateNumber() > _cTotal )
  73. {
  74. for( UINT newTotal = (_cTotal) ? _cTotal * 2 : 1;
  75. pState->StateNumber() > newTotal;
  76. newTotal *= 2 );
  77. CFAState ** oldState = _ppState;
  78. _ppState = new CFAState * [ newTotal ];
  79. memcpy( _ppState, oldState,
  80. _cTotal * sizeof( CFAState * ) );
  81. memset( _ppState + _cTotal,
  82. 0,
  83. (newTotal - _cTotal) * sizeof( CFAState * ) );
  84. _cTotal = newTotal;
  85. }
  86. _ppState[pState->StateNumber() - 1] = pState;
  87. }
  88. //+-------------------------------------------------------------------------
  89. //
  90. // Member: CFA::Get, protected
  91. //
  92. // Arguments: [iState] -- State to fetch.
  93. //
  94. // Returns: State [iState].
  95. //
  96. // History: 20-Jan-92 KyleP Created
  97. //
  98. //--------------------------------------------------------------------------
  99. CFAState * CFA::Get( UINT iState ){
  100. return( _ppState[ iState - 1 ] );
  101. }
  102. //+-------------------------------------------------------------------------
  103. //
  104. // Member: CNFA::CNFA, public
  105. //
  106. // Synopsis: Converts regular expression string to NFA.
  107. //
  108. // Arguments: [pwcs] -- Regular expression.
  109. // [fCaseSens] -- true if case sensitive search.
  110. //
  111. // History: 20-Jan-92 Kyleap Created
  112. //
  113. //--------------------------------------------------------------------------
  114. CNFA::CNFA( WCHAR const * pwcs, bool fCaseSens )
  115. : _iNextState( 1 ),
  116. _iStart( 0 ),
  117. _chars( fCaseSens ),
  118. _pState( 0 )
  119. {
  120. UINT iEnd;
  121. //
  122. // _pState initially contains room for 2 * #chars in regex. According
  123. // to the Dragon Book pg. 121 this is guaranteed to be sufficient space.
  124. // Of course the dragon book doesn't completely take DOS or CMS into
  125. // account. For DOS, we need to treat beginning (and end) of line as
  126. // 'characters' in the string. For CMS, I agreed to support the
  127. // {m,n} construct, which clearly violates this rule.
  128. //
  129. if ( 0 == pwcs )
  130. {
  131. throw ERROR_INVALID_PARAMETER;
  132. }
  133. _cState = wcslen( pwcs ) * 2 + 2*2; // 2*2 for beginning & end of line
  134. _pState = new CNFAState [ _cState ];
  135. for ( unsigned i = 1 ; i <= _cState; i++ )
  136. Get(i)->Init(i);
  137. FindCharClasses( pwcs );
  138. Parse( pwcs, &_iStart, &iEnd );
  139. Get( iEnd )->MakeFinal();
  140. }
  141. //+-------------------------------------------------------------------------
  142. //
  143. // Member: CNFA::CNFA, public
  144. //
  145. // Synopsis: Copy constructor
  146. //
  147. // Arguments: [src] -- Source
  148. //
  149. // History: 13-Jul-95 Kylep Created
  150. //
  151. //--------------------------------------------------------------------------
  152. CNFA::CNFA( CNFA const & src )
  153. : _iNextState( src.NumStates() ),
  154. _iStart( src._iStart ),
  155. _chars( src._chars ),
  156. _cState( src._cState ),
  157. _pState( new CNFAState [ src._cState ] )
  158. {
  159. for ( unsigned i = 0; i < _cState; i++ )
  160. _pState[i] = src._pState[i];
  161. }
  162. //+-------------------------------------------------------------------------
  163. //
  164. // Member: CNFA::~CNFA, public
  165. //
  166. // Synopsis: Free state table.
  167. //
  168. // History: 13-Oct-92 KyleP Created
  169. //
  170. //--------------------------------------------------------------------------
  171. CNFA::~CNFA()
  172. {
  173. delete [] _pState;
  174. }
  175. //+-------------------------------------------------------------------------
  176. //
  177. // Member: CNFA::EpsClosure, public
  178. //
  179. // Synopsis: Computes the epsilon closure for state [StateNum]
  180. //
  181. // Effects: States in the epsilon closure of state [StateNum]
  182. // are added to the state set [ssOut].
  183. //
  184. // Arguments: [StateNum] -- Initial state.
  185. // [ssOut] -- Output state set.
  186. //
  187. // History: 20-Jan-92 KyleP Created
  188. //
  189. //--------------------------------------------------------------------------
  190. void CNFA::EpsClosure( UINT StateNum, CStateSet & ssOut )
  191. {
  192. CStateSet ssTraversed;
  193. ssOut.Add( StateNum );
  194. bool changed = true;
  195. while ( changed )
  196. {
  197. changed = false;
  198. for ( UINT i = ssOut.Count(); i > 0; i-- )
  199. {
  200. if ( !ssTraversed.IsMember( ssOut.State( i ) ) )
  201. {
  202. ssTraversed.Add( ssOut.State( i ) );
  203. Get( ssOut.State( i ) )->Move( ssOut, symEpsilon );
  204. changed = true;
  205. }
  206. }
  207. }
  208. }
  209. //+-------------------------------------------------------------------------
  210. //
  211. // Member: CNFA::EpsClosure, public
  212. //
  213. // Synopsis: Computes the epsilon closure for state set [ssIn]
  214. //
  215. // Effects: States in the epsilon closure of [ssIn]
  216. // are added to the state set [ssOut].
  217. //
  218. // Arguments: [ssIn] -- Initial state set.
  219. // [ssOut] -- Output state set.
  220. //
  221. // History: 20-Jan-92 KyleP Created
  222. //
  223. //--------------------------------------------------------------------------
  224. void CNFA::EpsClosure( CStateSet & ssIn, CStateSet & ssOut )
  225. {
  226. for ( UINT i = ssIn.Count(); i > 0; i-- )
  227. {
  228. EpsClosure( ssIn.State( i ), ssOut );
  229. }
  230. }
  231. //+-------------------------------------------------------------------------
  232. //
  233. // Member: CNFA::IsFinal, public
  234. //
  235. // Arguments: [ss] -- State set
  236. //
  237. // Returns: true if some state in [ss] is final.
  238. //
  239. // History: 20-Jan-92 Kyleap Created
  240. //
  241. //--------------------------------------------------------------------------
  242. bool CNFA::IsFinal( CStateSet & ss )
  243. {
  244. bool fFinal = false;
  245. for ( UINT i = ss.Count(); i > 0 && !fFinal; i-- )
  246. {
  247. fFinal = (Get( ss.State( i ) )->IsFinal() != NULL);
  248. }
  249. return( fFinal );
  250. }
  251. //+-------------------------------------------------------------------------
  252. //
  253. // Member: CNFA::Move, public
  254. //
  255. // Effects: Performs a non-deterministic move from every state
  256. // in [ssIn] on [symbol]. The new state set is in
  257. // [ssOut].
  258. //
  259. // Arguments: [ssIn] -- Initial state set.
  260. // [ssOut] -- Final state set.
  261. // [symbol] -- Transition symbol.
  262. //
  263. // History: 20-Jan-92 KyleP Created
  264. //
  265. //--------------------------------------------------------------------------
  266. void CNFA::Move( CStateSet & ssIn, CStateSet & ssOut, UINT symbol )
  267. {
  268. for ( UINT i = ssIn.Count(); i > 0; i-- )
  269. {
  270. Get( ssIn.State( i ) )->Move( ssOut, symbol );
  271. }
  272. }
  273. //+-------------------------------------------------------------------------
  274. //
  275. // Member: CNFA::FindCharClasses, private
  276. //
  277. // Effects: Partitions the UniCode character space (2^16 characters)
  278. // into equivalence classes such that all characters in
  279. // a given class will have identical transitions in the NFA.
  280. //
  281. // Arguments: [wcs] -- Original regular expression string.
  282. //
  283. // History: 20-Jan-92 KyleP Created
  284. //
  285. // Notes: If case sensitivity is turned off, two ranges will be
  286. // added for characters with upper/lower case. Even though
  287. // both ranges react identically the mapping algorithm can
  288. // only deal with contiguous ranges of characters.
  289. //
  290. //--------------------------------------------------------------------------
  291. void CNFA::FindCharClasses( WCHAR const * wcs )
  292. {
  293. //
  294. // Scan the regex looking for characters with (potentially)
  295. // different transitions.
  296. //
  297. while ( *wcs )
  298. {
  299. switch ( *wcs )
  300. {
  301. case wcAnySingle:
  302. case wcAnyMultiple:
  303. case wcDOSDot:
  304. break;
  305. case wcEscape:
  306. {
  307. wcs++;
  308. switch ( *wcs )
  309. {
  310. case 0:
  311. throw ERROR_INVALID_PARAMETER;
  312. break;
  313. case wcAnySingle:
  314. case wcRepeatZero:
  315. case wcRepeatOne:
  316. case wcOr:
  317. case wcBeginParen:
  318. case wcEndParen:
  319. break;
  320. case wcBeginRepeat:
  321. for ( wcs++; *wcs; wcs++ )
  322. {
  323. if ( *wcs == wcEscape && *(wcs+1) == wcEndRepeat )
  324. {
  325. wcs++;
  326. break;
  327. }
  328. }
  329. break;
  330. case wcBeginRange:
  331. wcs++;
  332. //
  333. // Check the special cases of ^ and ]
  334. //
  335. if ( *wcs == wcInvertRange )
  336. wcs++;
  337. if ( *wcs == wcEndRange )
  338. {
  339. _chars.AddRange( *wcs, *wcs );
  340. wcs++;
  341. }
  342. for ( ; *wcs && *wcs != wcEndRange; wcs++ )
  343. {
  344. if ( *(wcs + 1) == wcRangeSep )
  345. {
  346. _chars.AddRange( *wcs, *(wcs+2) );
  347. }
  348. else
  349. {
  350. _chars.AddRange( *wcs, *wcs );
  351. }
  352. }
  353. if ( *wcs != wcEndRange )
  354. {
  355. throw ERROR_INVALID_PARAMETER;
  356. }
  357. break;
  358. default:
  359. _chars.AddRange( *wcs, *wcs );
  360. break;
  361. }
  362. break;
  363. }
  364. default:
  365. _chars.AddRange( *wcs, *wcs );
  366. break;
  367. }
  368. wcs++;
  369. }
  370. _chars.Prepare();
  371. }
  372. WCHAR * CNFA::_wcsNull = (WCHAR*)"";
  373. //+-------------------------------------------------------------------------
  374. //
  375. // Member: CNFA::Parse, private
  376. //
  377. // Synopsis: Creates a NFA from [wcs]
  378. //
  379. // Effects: Parses [wcs] until end of string or character wcHalt is
  380. // encountered. On exit, [iStart] and [iEnd] contain the
  381. // starting and ending states of the NFA, respectively.
  382. // [pwcsEnd] points to the last character of [wcs] that was
  383. // parsed.
  384. //
  385. // Arguments: [wcs] -- Regular expression.
  386. // [iStart] -- Starting state of NFA.
  387. // [iEnd] -- Ending state of NFA
  388. // [pwcsEnd] -- Last character of [wcs] that was parsed.
  389. // [wcHalt] -- Stop parsing if this character encountered.
  390. //
  391. // History: 20-Jan-92 KyleP Created
  392. // 08-Jun-98 SBens Fixed so that all top-level OR clauses
  393. // must terminate with symEndLine.
  394. //
  395. //--------------------------------------------------------------------------
  396. void CNFA::Parse( WCHAR const * wcs,
  397. UINT * iStart,
  398. UINT * iEnd,
  399. WCHAR const * * pwcsEnd,
  400. WCHAR wcHalt )
  401. {
  402. unsigned iCurrent;
  403. unsigned iNext;
  404. unsigned iLocalStart; // Used for */+/? repositioning
  405. bool fRepeat = false; // Used for +
  406. bool fTopLevel = (*iStart == 0); // true if at top level;
  407. *iEnd = 0;
  408. //
  409. // Get a starting state. *iStart == 0 implies this is the 'top-level'
  410. // parse of the regular expression (e.g. we're not parsing a
  411. // parenthesized subexpression.
  412. //
  413. if ( fTopLevel )
  414. {
  415. iCurrent = _iNextState;
  416. *iStart = _iNextState++;
  417. iLocalStart = 0;
  418. //
  419. // non-EGREP (DOS) regex match entire string.
  420. //
  421. if ( *wcs != wcAnyMultiple )
  422. {
  423. iNext = _iNextState;
  424. Get( iCurrent )->AddTransition( symBeginLine, _iNextState );
  425. _iNextState++;
  426. iCurrent = iNext;
  427. }
  428. else
  429. {
  430. //
  431. // Add a 'special' transition on the very first state to
  432. // eat up characters until we actually jump into the
  433. // regular expresion.
  434. //
  435. Get( iCurrent )->AddTransition( symAny, Get( iCurrent )->StateNumber() );
  436. }
  437. }
  438. else
  439. {
  440. iCurrent = *iStart;
  441. iLocalStart = *iStart;
  442. }
  443. unsigned iOrStart = Get( iCurrent )->StateNumber();
  444. //
  445. // wcsLocalStart tracks the piece of string to be repeated for wcZeroOrOne, etc.
  446. //
  447. WCHAR const * wcsLocalStart = wcs;
  448. //
  449. // Parse the regular expression until there is no more or a
  450. // termination character is hit.
  451. //
  452. for ( ; *wcs && *wcs != wcHalt; wcs++ )
  453. {
  454. switch ( *wcs )
  455. {
  456. case wcAnySingle:
  457. iNext = _iNextState;
  458. Get( iCurrent )->AddTransition( symAny, _iNextState );
  459. iLocalStart = Get( iCurrent )->StateNumber();
  460. wcsLocalStart = wcs;
  461. _iNextState++;
  462. iCurrent = iNext;
  463. break;
  464. case wcAnyMultiple:
  465. //
  466. // Any single
  467. //
  468. iNext = _iNextState;
  469. Get( iCurrent )->AddTransition( symAny, _iNextState );
  470. iLocalStart = Get( iCurrent )->StateNumber();
  471. wcsLocalStart = wcs;
  472. _iNextState++;
  473. iCurrent = iNext;
  474. //
  475. // Repeat zero or more
  476. //
  477. Get( iLocalStart )->AddTransition( symEpsilon,
  478. Get( iCurrent )->StateNumber() );
  479. Get( iCurrent )->AddTransition( symEpsilon, iLocalStart );
  480. break;
  481. case wcEscape:
  482. {
  483. wcs++;
  484. switch ( *wcs )
  485. {
  486. case wcBeginParen:
  487. {
  488. UINT iLocalEnd;
  489. iLocalStart = Get( iCurrent )->StateNumber();
  490. wcsLocalStart = wcs - 1;
  491. wcs++; // Eat '('.
  492. Parse( wcs, &iLocalStart, &iLocalEnd, &wcs, wcEndParen );
  493. wcs--; // Provide character for loop to eat.
  494. iCurrent = iLocalEnd;
  495. break;
  496. }
  497. case wcEndParen:
  498. //
  499. // Taken care of at outer level. Just backup so we hit the end.
  500. //
  501. wcs--;
  502. break;
  503. case wcBeginRepeat:
  504. {
  505. if ( wcHalt == wcBeginRepeat )
  506. {
  507. //
  508. // Taken care of at outer level. Just backup so we hit the end.
  509. //
  510. wcs--;
  511. }
  512. else
  513. {
  514. //
  515. // Setup: Bounds of repeated regex
  516. //
  517. WCHAR const * wcsStartRepeat = wcsLocalStart;
  518. WCHAR const * wcsEndRepeat = wcs + 1;
  519. //
  520. // Setup: Repeat parameters.
  521. //
  522. unsigned cRepeat1, cRepeat2;
  523. wcs++;
  524. ParseRepeat( wcs, cRepeat1, cRepeat2 );
  525. unsigned iLocalEnd;
  526. //
  527. // The minimum set has no epsilon transitions.
  528. //
  529. if ( cRepeat1 > 1 )
  530. {
  531. iLocalStart = Get( iCurrent )->StateNumber();
  532. iLocalEnd = iLocalStart;
  533. for ( unsigned i = 1; i < cRepeat1; i++ )
  534. {
  535. WCHAR const * wcsEnd;
  536. iLocalStart = iLocalEnd;
  537. iLocalEnd = 0; // Must be zero!
  538. Parse( wcsLocalStart, &iLocalStart, &iLocalEnd, &wcsEnd, wcBeginRepeat );
  539. if ( wcsEnd != wcsEndRepeat )
  540. {
  541. throw ERROR_INVALID_PARAMETER;
  542. }
  543. }
  544. }
  545. else
  546. iLocalEnd = Get( iCurrent )->StateNumber();
  547. if ( cRepeat1 == cRepeat2 )
  548. {
  549. }
  550. else if ( cRepeat2 == 0 )
  551. {
  552. Get( iLocalEnd )->AddTransition( symEpsilon, iLocalStart );
  553. }
  554. else if ( cRepeat2 > cRepeat1 )
  555. {
  556. for ( unsigned i = cRepeat1; i < cRepeat2; i++ )
  557. {
  558. WCHAR const * wcsEnd;
  559. iLocalStart = iLocalEnd;
  560. iLocalEnd = 0; // Must be zero!
  561. Parse( wcsLocalStart, &iLocalStart, &iLocalEnd, &wcsEnd, wcBeginRepeat );
  562. Get( iLocalStart )->AddTransition( symEpsilon, iLocalEnd );
  563. if ( wcsEnd != wcsEndRepeat )
  564. {
  565. throw ERROR_INVALID_PARAMETER;
  566. }
  567. }
  568. }
  569. else
  570. {
  571. throw ERROR_INVALID_PARAMETER;
  572. }
  573. iCurrent = iLocalEnd;
  574. iLocalStart = 0;
  575. wcsLocalStart = _wcsNull;
  576. }
  577. break;
  578. }
  579. case wcOr:
  580. // Top level 'OR' clauses must terminate with symEndLine.
  581. if ( fTopLevel )
  582. {
  583. iNext = _iNextState;
  584. Get( iCurrent )->AddTransition( symEndLine, _iNextState );
  585. _iNextState++;
  586. iCurrent = iNext;
  587. }
  588. if ( *iEnd == 0 )
  589. {
  590. //
  591. // First part of OR clause.
  592. //
  593. *iEnd = Get( iCurrent )->StateNumber();
  594. }
  595. else
  596. {
  597. //
  598. // Subsequent OR clause. Epsilon link to end
  599. //
  600. Get( iCurrent )->AddTransition( symEpsilon, *iEnd );
  601. }
  602. iCurrent = iOrStart;
  603. wcsLocalStart = _wcsNull;
  604. iLocalStart = 0;
  605. break;
  606. case wcBeginRange:
  607. {
  608. bool fReverse = false;
  609. wcsLocalStart = wcs-1;
  610. iNext = _iNextState;
  611. wcs++; // Eat '['. ']' eaten by loop.
  612. //
  613. // Check the special cases of ^ and ]
  614. //
  615. if ( *wcs == wcInvertRange )
  616. {
  617. wcs++;
  618. fReverse = true;
  619. //
  620. // Add all transitions, they will be removed later.
  621. //
  622. for ( UINT uiNext = _chars.TranslateRange( 1,
  623. (USHORT) symLastValidChar );
  624. uiNext != 0;
  625. uiNext = _chars.TranslateRange( 0, (USHORT) symLastValidChar ) )
  626. {
  627. Get( iCurrent )->AddTransition( uiNext,
  628. _iNextState );
  629. }
  630. }
  631. if ( *wcs == wcEndRange )
  632. {
  633. if ( fReverse )
  634. {
  635. Get( iCurrent )->RemoveTransition( _chars.Translate( *wcs++ ),
  636. _iNextState );
  637. }
  638. else
  639. {
  640. Get( iCurrent )->AddTransition( _chars.Translate( *wcs++ ),
  641. _iNextState );
  642. }
  643. }
  644. for ( ; *wcs && *wcs != wcEndRange; wcs++ )
  645. {
  646. if ( *(wcs + 1) == wcRangeSep )
  647. {
  648. if ( fReverse )
  649. {
  650. Get( iCurrent )->RemoveTransition(
  651. _chars.TranslateRange( *wcs, *(wcs+2) ),
  652. _iNextState );
  653. }
  654. else
  655. {
  656. Get( iCurrent )->AddTransition(
  657. _chars.TranslateRange( *wcs, *(wcs+2) ),
  658. _iNextState );
  659. }
  660. for ( UINT uiNext = _chars.TranslateRange( 0,
  661. *(wcs+2) );
  662. uiNext != 0;
  663. uiNext = _chars.TranslateRange( 0, *(wcs+2) ) )
  664. {
  665. if ( fReverse )
  666. {
  667. Get( iCurrent )->RemoveTransition( uiNext,
  668. _iNextState );
  669. }
  670. else
  671. {
  672. Get( iCurrent )->AddTransition( uiNext,
  673. _iNextState );
  674. }
  675. }
  676. wcs += 2;
  677. }
  678. else
  679. {
  680. if ( fReverse )
  681. {
  682. Get( iCurrent )->RemoveTransition(
  683. _chars.Translate( *wcs ),
  684. _iNextState );
  685. }
  686. else
  687. {
  688. Get( iCurrent )->AddTransition(
  689. _chars.Translate( *wcs ),
  690. _iNextState );
  691. }
  692. }
  693. }
  694. if ( *wcs != wcEndRange )
  695. {
  696. throw ERROR_INVALID_PARAMETER;
  697. }
  698. iLocalStart = Get( iCurrent )->StateNumber();
  699. _iNextState++;
  700. iCurrent = iNext;
  701. break;
  702. }
  703. case wcRepeatOne:
  704. if ( iLocalStart == 0 )
  705. {
  706. throw ERROR_INVALID_PARAMETER;
  707. }
  708. Get( iCurrent )->AddTransition( symEpsilon, iLocalStart );
  709. break;
  710. case wcRepeatZero:
  711. if ( iLocalStart == 0 )
  712. {
  713. throw ERROR_INVALID_PARAMETER;
  714. }
  715. Get( iLocalStart )->AddTransition( symEpsilon,
  716. Get( iCurrent )->StateNumber() );
  717. Get( iCurrent )->AddTransition( symEpsilon, iLocalStart );
  718. break;
  719. case wcRepeatZeroOrOne:
  720. {
  721. if ( iLocalStart == 0 )
  722. {
  723. throw ERROR_INVALID_PARAMETER;
  724. }
  725. Get( iLocalStart )->AddTransition( symEpsilon,
  726. Get( iCurrent )->StateNumber() );
  727. break;
  728. }
  729. default:
  730. iNext = _iNextState;
  731. Get( iCurrent )->AddTransition( _chars.Translate( *wcs ),
  732. _iNextState );
  733. iLocalStart = Get( iCurrent )->StateNumber();
  734. wcsLocalStart = wcs - 1;
  735. _iNextState++;
  736. iCurrent = iNext;
  737. break;
  738. }
  739. break; // switch for wcEscape
  740. }
  741. default:
  742. iNext = _iNextState;
  743. Get( iCurrent )->AddTransition( _chars.Translate( *wcs ),
  744. _iNextState );
  745. //
  746. // In non-EGREP (DOS) syntax dot '.' is funny. It will match
  747. // a dot, but if you're at the end of string it will also match
  748. // end. So *.txt will look for strings with zero or more
  749. // characters followed by '.txt' but *. will find any names
  750. // without an extension and with no trailing dot.
  751. //
  752. if ( *wcs == wcDOSDot )
  753. {
  754. Get( iCurrent )->AddTransition( symEndLine, _iNextState );
  755. }
  756. iLocalStart = Get( iCurrent )->StateNumber();
  757. wcsLocalStart = wcs;
  758. _iNextState++;
  759. iCurrent = iNext;
  760. break;
  761. }
  762. }
  763. //
  764. // non-EGREP (DOS) regex match entire string.
  765. //
  766. if ( wcHalt == 0 && *(wcs-1) != wcAnyMultiple )
  767. {
  768. iNext = _iNextState;
  769. Get( iCurrent )->AddTransition( symEndLine, _iNextState );
  770. iLocalStart = 0;
  771. wcsLocalStart = _wcsNull;
  772. _iNextState++;
  773. iCurrent = iNext;
  774. }
  775. //
  776. // If we haven't had an OR clause yet, then set iEnd
  777. //
  778. if ( *iEnd == 0 )
  779. {
  780. //
  781. // First part of OR clause.
  782. //
  783. *iEnd = Get( iCurrent )->StateNumber();
  784. }
  785. else
  786. {
  787. //
  788. // Subsequent OR clause. Epsilon link to end
  789. //
  790. Get( iCurrent )->AddTransition( symEpsilon, *iEnd );
  791. }
  792. if ( pwcsEnd )
  793. {
  794. *pwcsEnd = wcs + 1; // Eat halt character.
  795. }
  796. if( *wcs != wcHalt )
  797. {
  798. throw ERROR_INVALID_PARAMETER;
  799. }
  800. }
  801. void CNFA::ParseRepeat( WCHAR const * & wcs, unsigned & cRepeat1, unsigned & cRepeat2 )
  802. {
  803. cRepeat1 = 0;
  804. cRepeat2 = 0;
  805. for ( ; *wcs && isdigit(*wcs); wcs++ )
  806. {
  807. cRepeat1 *= 10;
  808. cRepeat1 += *wcs - '0';
  809. }
  810. if ( cRepeat1 == 0 || cRepeat1 > 255 )
  811. {
  812. throw ERROR_INVALID_PARAMETER;
  813. }
  814. if ( *wcs == ',' )
  815. {
  816. wcs++;
  817. if ( *wcs == wcEscape && *(wcs+1) == wcEndRepeat )
  818. {
  819. wcs++;
  820. }
  821. else
  822. {
  823. for ( ; *wcs && isdigit(*wcs); wcs++ )
  824. {
  825. cRepeat2 *= 10;
  826. cRepeat2 += *wcs - '0';
  827. }
  828. if ( cRepeat2 == 0 || cRepeat2 > 255 )
  829. {
  830. throw ERROR_INVALID_PARAMETER;
  831. }
  832. if ( *wcs != wcEscape || *(wcs+1) != wcEndRepeat )
  833. {
  834. throw ERROR_INVALID_PARAMETER;
  835. }
  836. else
  837. {
  838. wcs++;
  839. }
  840. }
  841. }
  842. else if ( *wcs == wcEscape && *(wcs+1) == wcEndRepeat )
  843. {
  844. wcs++;
  845. cRepeat2 = cRepeat1;
  846. }
  847. else
  848. {
  849. throw ERROR_INVALID_PARAMETER;
  850. }
  851. }
  852. //+-------------------------------------------------------------------------
  853. //
  854. // Member: CDFA::CDFA, public
  855. //
  856. // Synopsis: Constructs a DFA from a NFA.
  857. //
  858. // Arguments: [pwcs] -- Regular expression (passed to NFA)
  859. // [fCaseSens] -- true if case-sensitive search
  860. //
  861. // History: 20-Jan-92 KyleP Created
  862. //
  863. //--------------------------------------------------------------------------
  864. CDFA::CDFA( WCHAR const * pwcs, bool fCaseSens )
  865. : _nfa( pwcs, fCaseSens ),
  866. _xs( _nfa.NumStates() ),
  867. _cState( _nfa.NumStates() ),
  868. _pStateTrans( 0 ),
  869. _pStateFinal( 0 )
  870. {
  871. CommonCtor();
  872. }
  873. //+-------------------------------------------------------------------------
  874. //
  875. // Member: CDFA::CDFA, public
  876. //
  877. // Synopsis: Copy constructor
  878. //
  879. // Arguments: [src] -- Source DFA. Its NFA is copied; the DFA tables
  880. // are rebuilt from scratch rather than copied.
  881. //
  882. // History: 20-Jan-92 KyleP Created
  883. //
  884. //--------------------------------------------------------------------------
  885. CDFA::CDFA( CDFA const & src )
  886. : _nfa( src._nfa ),
  887. _xs( src._nfa.NumStates() ),
  888. _cState( src._nfa.NumStates() ),
  889. _pStateTrans( 0 ),
  890. _pStateFinal( 0 )
  891. {
  892. CommonCtor();
  893. }
  894. //+-------------------------------------------------------------------------
  895. //
  896. // Member: CDFA::CommonCtor, private
  897. //
  898. // Synopsis: Code common to both constructors.
  899. //
  900. // History: 13-Jul-95 KyleP Snarfed from constructor
  901. //
  902. //--------------------------------------------------------------------------
  903. void CDFA::CommonCtor()
  904. {
  905. //
  906. // Add initial state.
  907. //
  908. CStateSet ss;
  909. _nfa.EpsClosure( _nfa.StartState(), ss );
  910. _stateStart = _xs.XlatToOne( ss );
  911. //
  912. // Intialize translation table.
  913. //
  914. int cEntries = (_cState + 1) * ( _nfa.Translate().NumClasses() + 1 );
  915. _pStateTrans = new UINT [ cEntries ];
  916. _pStateFinal = new bool [ _cState + 1 ];
  917. memset( _pStateTrans, 0xFF, cEntries * sizeof(_pStateTrans[0]) );
  918. RtlZeroMemory( _pStateFinal, (_cState + 1) * sizeof(_pStateFinal[0]) );
  919. for ( int i = _cState; i >= 0; i-- )
  920. {
  921. AddTransition( i, 0, stateUndefined );
  922. }
  923. Add( _stateStart, _nfa.IsFinal( ss ) );
  924. }
  925. //+-------------------------------------------------------------------------
  926. //
  927. // Member: CDFA::~CDFA, public
  928. //
  929. // Synopsis: Clean up DFA. Free state tables.
  930. //
  931. // History: 20-Jun-92 KyleP Created
  932. //
  933. //--------------------------------------------------------------------------
  934. CDFA::~CDFA()
  935. {
  936. delete _pStateTrans;
  937. delete _pStateFinal;
  938. }
  939. //+-------------------------------------------------------------------------
  940. //
  941. // Member: CDFA::Recognize, public
  942. //
  943. // Arguments: [wcs] -- Input string.
  944. //
  945. // Returns: true if [wcs] is matched by the regular expression.
  946. //
  947. // History: 20-Jan-92 KyleP Created
  948. //
  949. //--------------------------------------------------------------------------
  950. bool CDFA::Recognize( WCHAR * wcs )
  951. {
  952. //////////
  953. // Modified from original version to handle a NULL string.
  954. //////////
  955. if (!wcs) { return false; }
  956. UINT CurrentState = _stateStart;
  957. UINT LastState = CurrentState;
  958. bool fFinal = IsFinal( CurrentState );
  959. WCHAR wcCurrent = symBeginLine;
  960. while ( !fFinal )
  961. {
  962. UINT NextState = Move( CurrentState, wcCurrent );
  963. if ( NextState == stateUncomputed )
  964. {
  965. CStateSet ssCurrent;
  966. CStateSet ssNew;
  967. CStateSet ssClosed;
  968. _xs.XlatToMany( CurrentState, ssCurrent );
  969. _nfa.Move( ssCurrent, ssNew, wcCurrent );
  970. if ( ssNew.Count() == 0 )
  971. {
  972. NextState = stateUndefined;
  973. AddTransition( CurrentState, wcCurrent, NextState );
  974. }
  975. else
  976. {
  977. _nfa.EpsClosure( ssNew, ssClosed );
  978. NextState = _xs.XlatToOne( ssClosed );
  979. if ( !IsComputed( NextState ) )
  980. {
  981. Add( NextState, _nfa.IsFinal( ssClosed ) );
  982. }
  983. AddTransition( CurrentState, wcCurrent, NextState );
  984. }
  985. }
  986. if ( NextState == stateUndefined )
  987. {
  988. return( false );
  989. }
  990. LastState = CurrentState;
  991. CurrentState = NextState;
  992. fFinal = IsFinal( CurrentState );
  993. //
  994. // If we ran out of string then just keep going, appending
  995. // end-of-string symbols. Unfortunately the string is conceptually
  996. // a set of characters followed by an arbitrary number of
  997. // end-of-string symbols. In non-EGREP the end-of-string symbol
  998. // may actually cause multiple state transitions before reaching
  999. // a final state. In non-EGREP (DOS) mode we stop only when we
  1000. // are no longer 'making progress' (moving to new states) on
  1001. // end-of-string. I haven't completely convinced myself this
  1002. // algorithm is guaranteed to terminate.
  1003. //
  1004. if ( wcCurrent == symEndLine )
  1005. {
  1006. if ( LastState == CurrentState )
  1007. break;
  1008. }
  1009. else
  1010. {
  1011. wcCurrent = *wcs++;
  1012. //
  1013. // After we've exhausted the string, append the special
  1014. // end-of-line character.
  1015. //
  1016. if ( wcCurrent == 0 )
  1017. {
  1018. wcCurrent = symEndLine;
  1019. }
  1020. else
  1021. {
  1022. wcCurrent = (WCHAR)_nfa.Translate().Translate( wcCurrent );
  1023. }
  1024. }
  1025. }
  1026. return( fFinal );
  1027. }
  1028. //+-------------------------------------------------------------------------
  1029. //
  1030. // Member: CDFA::Add, private
  1031. //
  1032. // Synopsis: Adds a new state to the DFA.
  1033. //
  1034. // Arguments: [state] -- State number
  1035. // [fFinal] -- true if state is a final state.
  1036. //
  1037. // History: 20-Jan-92 KyleP Created
  1038. //
  1039. // Notes: All transitions for the new state are initially uncomputed.
  1040. //
  1041. //--------------------------------------------------------------------------
  1042. void CDFA::Add( UINT state, bool fFinal )
  1043. {
  1044. if ( state > _cState )
  1045. {
  1046. //
  1047. // Since the number of states required will probably grow at
  1048. // a slow rate, increase the size of the array in a linear
  1049. // fashion.
  1050. UINT const DeltaState = 10;
  1051. UINT * oldStateTrans = _pStateTrans;
  1052. bool * oldStateFinal = _pStateFinal;
  1053. UINT oldcState = _cState;
  1054. UINT oldcEntries = (_cState + 1) *
  1055. ( _nfa.Translate().NumClasses() + 1 );
  1056. _cState += DeltaState;
  1057. UINT cEntries = (_cState + 1) * ( _nfa.Translate().NumClasses() + 1 );
  1058. _pStateTrans = new UINT [ cEntries ];
  1059. _pStateFinal = new bool [ _cState + 1 ];
  1060. //
  1061. // Initilize new state tables...
  1062. //
  1063. memcpy( _pStateTrans, oldStateTrans, oldcEntries * sizeof( UINT ) );
  1064. memcpy( _pStateFinal, oldStateFinal, oldcState * sizeof( bool ) );
  1065. memset( _pStateTrans + oldcEntries, 0xFF, (cEntries - oldcEntries)*sizeof(_pStateTrans[0]) );
  1066. RtlZeroMemory( _pStateFinal + oldcState, (_cState + 1 - oldcState)*sizeof(_pStateFinal[0]) );
  1067. for ( UINT i = _cState - DeltaState + 1; i <= _cState; i++ )
  1068. {
  1069. AddTransition( i, 0, stateUndefined );
  1070. }
  1071. //
  1072. // ...and destroy the old
  1073. //
  1074. delete oldStateTrans;
  1075. delete oldStateFinal;
  1076. }
  1077. //
  1078. // All states are set to stateUncomputed above, except the 'undefined' flag-state.
  1079. //
  1080. AddTransition( state, 0, stateUncomputed );
  1081. _pStateFinal[state] = fFinal;
  1082. }