Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1766 lines
50 KiB

  1. //+-------------------------------------------------------------------------
  2. //
  3. // Microsoft Windows
  4. // Copyright (C) Microsoft Corporation, 1991 - 2001.
  5. //
  6. // File: FA.cxx
  7. //
  8. // Contents: Deterministic and non-deterministic finite automata
  9. //
  10. // Classes: CFA, CNFA, CDFA
  11. //
  12. // History: 01-20-92 KyleP Created
  13. //
  14. //--------------------------------------------------------------------------
  15. #include <pch.cxx>
  16. #pragma hdrstop
  17. #pragma optimize( "", off )
  18. #include <fa.hxx>
  19. #include <strategy.hxx>
  20. #include <codepage.hxx>
  21. #include "stateset.hxx"
  22. //+-------------------------------------------------------------------------
  23. //
  24. // Member: CFA::CFA, public
  25. //
  26. // Synopsis: Copy constructor
  27. //
  28. // History: 13-Jul-95 KyleP Created
  29. //
  30. //--------------------------------------------------------------------------
  31. CFA::CFA( CFA const & src )
  32. : _cTotal( src._cTotal ),
  33. _ppState( 0 )
  34. {
  35. _ppState = new CFAState * [ _cTotal ];
  36. unsigned i = 0;
  37. TRY
  38. {
  39. for ( ; i < _cTotal; i++ )
  40. {
  41. if ( 0 == src._ppState[i] )
  42. _ppState[i] = 0;
  43. else
  44. _ppState[i] = new CFAState( *src._ppState[i] );
  45. }
  46. }
  47. CATCH( CException, e )
  48. {
  49. for ( ;i > 0; i-- )
  50. delete _ppState[i-1];
  51. delete _ppState;
  52. RETHROW();
  53. }
  54. END_CATCH
  55. }
  56. //+-------------------------------------------------------------------------
  57. //
  58. // Member: CFA::~CFA, protected
  59. //
  60. // Synopsis: Frees automata.
  61. //
  62. // History: 20-Jan-92 KyleP Created
  63. //
  64. //--------------------------------------------------------------------------
  65. CFA::~CFA()
  66. {
  67. if( _ppState )
  68. {
  69. for ( unsigned i = 0; i < _cTotal; i++ )
  70. {
  71. delete _ppState[i];
  72. }
  73. delete _ppState;
  74. }
  75. }
  76. //+-------------------------------------------------------------------------
  77. //
  78. // Member: CFA::Add, protected
  79. //
  80. // Synopsis: Adds new state to automata.
  81. //
  82. // Arguments: [pState] -- New state. State number is member data.
  83. //
  84. // History: 20-Jan-92 KyleP Created
  85. //
  86. //--------------------------------------------------------------------------
  87. void CFA::Add( CFAState * pState )
  88. {
  89. if ( pState->StateNumber() > _cTotal )
  90. {
  91. for( unsigned newTotal = (_cTotal) ? _cTotal * 2 : 1;
  92. pState->StateNumber() > newTotal;
  93. newTotal *= 2 );
  94. CFAState ** oldState = _ppState;
  95. _ppState = new CFAState * [ newTotal ];
  96. memcpy( _ppState, oldState,
  97. _cTotal * sizeof( CFAState * ) );
  98. memset( _ppState + _cTotal,
  99. 0,
  100. (newTotal - _cTotal) * sizeof( CFAState * ) );
  101. _cTotal = newTotal;
  102. }
  103. _ppState[pState->StateNumber() - 1] = pState;
  104. }
  105. //+-------------------------------------------------------------------------
  106. //
  107. // Member: CFA::Get, protected
  108. //
  109. // Arguments: [iState] -- State to fetch.
  110. //
  111. // Returns: State [iState].
  112. //
  113. // History: 20-Jan-92 KyleP Created
  114. //
  115. //--------------------------------------------------------------------------
  116. CFAState * CFA::Get( unsigned iState )
  117. {
  118. vqAssert( iState <= _cTotal );
  119. {
  120. # if (CIDBG == 1)
  121. if ( _ppState[ iState - 1 ]->StateNumber() != iState )
  122. vqDebugOut(( DEB_ERROR, "CFA::Get() -- Error\n" ));
  123. # endif // (CIDBG == 1)
  124. return( _ppState[ iState - 1 ] );
  125. }
  126. }
  127. //+-------------------------------------------------------------------------
  128. //
  129. // Member: CNFA::CNFA, public
  130. //
  131. // Synopsis: Converts regular expression string to NFA.
  132. //
  133. // Arguments: [pwcs] -- Regular expression.
  134. // [fCaseSens] -- TRUE if case sensitive search.
  135. //
  136. // History: 20-Jan-92 Kyleap Created
  137. //
  138. //--------------------------------------------------------------------------
  139. CNFA::CNFA( WCHAR const * pwcs, BOOLEAN fCaseSens )
  140. : _iNextState( 1 ),
  141. _iStart( 0 ),
  142. _chars( fCaseSens )
  143. {
  144. unsigned iEnd;
  145. //
  146. // _aState initially contains room for 2 * #chars in regex. According
  147. // to the Dragon Book pg. 121 this is guaranteed to be sufficient space.
  148. // Of course the dragon book doesn't completely take DOS or CMS into
  149. // account. For DOS, we need to treat beginning (and end) of line as
  150. // 'characters' in the string. For CMS, I agreed to support the
  151. // {m,n} construct, which clearly violates this rule.
  152. //
  153. if ( 0 == pwcs )
  154. {
  155. vqDebugOut(( DEB_ERROR, "ERROR: regex string value of 0 " ));
  156. THROW( CException( QUERY_E_INVALIDRESTRICTION ) );
  157. }
  158. unsigned cState = wcslen( pwcs ) * 2 + 2*2; // 2*2 for beginning & end of line
  159. _aState.Init( cState );
  160. for ( unsigned i = 1 ; i <= _aState.Count(); i++ )
  161. Get(i)->Init(i);
  162. FindCharClasses( pwcs );
  163. Parse( pwcs, &_iStart, &iEnd );
  164. Get( iEnd )->MakeFinal();
  165. }
  166. //+-------------------------------------------------------------------------
  167. //
  168. // Member: CNFA::CNFA, public
  169. //
  170. // Synopsis: Copy constructor
  171. //
  172. // Arguments: [src] -- Source
  173. //
  174. // History: 13-Jul-95 Kylep Created
  175. //
  176. //--------------------------------------------------------------------------
  177. CNFA::CNFA( CNFA const & src )
  178. : _iNextState( src.NumStates() ),
  179. _iStart( src._iStart ),
  180. _chars( src._chars ),
  181. _aState( src._aState.Count() )
  182. {
  183. for ( unsigned i = 0; i < _aState.Count(); i++ )
  184. _aState[i] = src._aState[i];
  185. }
  186. //+-------------------------------------------------------------------------
  187. //
  188. // Member: CNFA::~CNFA, public
  189. //
  190. // Synopsis: Free state table.
  191. //
  192. // History: 13-Oct-92 KyleP Created
  193. //
  194. //--------------------------------------------------------------------------
  195. CNFA::~CNFA()
  196. {
  197. }
  198. //+-------------------------------------------------------------------------
  199. //
  200. // Member: CNFA::EpsClosure, public
  201. //
  202. // Synopsis: Computes the epsilon closure for state [StateNum]
  203. //
  204. // Effects: States in the epsilon closure of state [StateNum]
  205. // are added to the state set [ssOut].
  206. //
  207. // Arguments: [StateNum] -- Initial state.
  208. // [ssOut] -- Output state set.
  209. //
  210. // History: 20-Jan-92 KyleP Created
  211. //
  212. //--------------------------------------------------------------------------
  213. void CNFA::EpsClosure( unsigned StateNum, CStateSet & ssOut )
  214. {
  215. CStateSet ssTraversed;
  216. ssOut.Add( StateNum );
  217. BOOLEAN changed = TRUE;
  218. while ( changed )
  219. {
  220. changed = FALSE;
  221. for ( unsigned i = ssOut.Count(); i > 0; i-- )
  222. {
  223. if ( !ssTraversed.IsMember( ssOut.State( i ) ) )
  224. {
  225. ssTraversed.Add( ssOut.State( i ) );
  226. Get( ssOut.State( i ) )->Move( ssOut, symEpsilon );
  227. changed = TRUE;
  228. }
  229. }
  230. }
  231. }
  232. //+-------------------------------------------------------------------------
  233. //
  234. // Member: CNFA::EpsClosure, public
  235. //
  236. // Synopsis: Computes the epsilon closure for state set [ssIn]
  237. //
  238. // Effects: States in the epsilon closure of [ssIn]
  239. // are added to the state set [ssOut].
  240. //
  241. // Arguments: [ssIn] -- Initial state set.
  242. // [ssOut] -- Output state set.
  243. //
  244. // History: 20-Jan-92 KyleP Created
  245. //
  246. //--------------------------------------------------------------------------
  247. void CNFA::EpsClosure( CStateSet & ssIn, CStateSet & ssOut )
  248. {
  249. for ( unsigned i = ssIn.Count(); i > 0; i-- )
  250. {
  251. EpsClosure( ssIn.State( i ), ssOut );
  252. }
  253. }
  254. //+-------------------------------------------------------------------------
  255. //
  256. // Member: CDFA::IsFinal, public
  257. //
  258. // Arguments: [ss] -- State set
  259. //
  260. // Returns: TRUE if some state in [ss] is final.
  261. //
  262. // History: 20-Jan-92 Kyleap Created
  263. //
  264. //--------------------------------------------------------------------------
  265. BOOLEAN CNFA::IsFinal( CStateSet & ss )
  266. {
  267. BOOLEAN fFinal = FALSE;
  268. for ( unsigned i = ss.Count(); i > 0 && !fFinal; i-- )
  269. {
  270. fFinal = (BYTE)(Get( ss.State( i ) )->IsFinal());
  271. }
  272. return( fFinal );
  273. }
  274. //+-------------------------------------------------------------------------
  275. //
  276. // Member: CNFA::Move, public
  277. //
  278. // Effects: Performs a non-deterministic move from every state
  279. // in [ssIn] on [symbol]. The new state set is in
  280. // [ssOut].
  281. //
  282. // Arguments: [ssIn] -- Initial state set.
  283. // [ssOut] -- Final state set.
  284. // [symbol] -- Transition symbol.
  285. //
  286. // History: 20-Jan-92 KyleP Created
  287. //
  288. //--------------------------------------------------------------------------
  289. void CNFA::Move( CStateSet & ssIn, CStateSet & ssOut, unsigned symbol )
  290. {
  291. for ( unsigned i = ssIn.Count(); i > 0; i-- )
  292. {
  293. Get( ssIn.State( i ) )->Move( ssOut, symbol );
  294. }
  295. }
  296. //+-------------------------------------------------------------------------
  297. //
  298. // Member: CNFA::FindCharClasses, private
  299. //
  300. // Effects: Partitions the UniCode character space (2^16 characters)
  301. // into equivalence classes such that all characters in
  302. // a given class will have identical transitions in the NFA.
  303. //
  304. // Arguments: [wcs] -- Original regular expression string.
  305. //
  306. // History: 20-Jan-92 KyleP Created
  307. //
  308. // Notes: If case sensitivity is turned off, two ranges will be
  309. // added for characters with upper/lower case. Even though
  310. // both ranges react identically the mapping algorithm can
  311. // only deal with contiguous ranges of characters.
  312. //
  313. //--------------------------------------------------------------------------
  314. void CNFA::FindCharClasses( WCHAR const * wcs )
  315. {
  316. //
  317. // Scan the regex looking for characters with (potentially)
  318. // different transitions.
  319. //
  320. while ( *wcs )
  321. {
  322. switch ( *wcs )
  323. {
  324. case wcAnySingle:
  325. case wcAnyMultiple:
  326. case wcDOSDot:
  327. break;
  328. case wcEscape:
  329. {
  330. wcs++;
  331. switch ( *wcs )
  332. {
  333. case 0:
  334. vqDebugOut(( DEB_WARN, "Invalid regex (%wc at end of string\n", wcEscape ));
  335. THROW( CException( QUERY_E_INVALIDRESTRICTION ) );
  336. break;
  337. case wcAnySingle:
  338. case wcRepeatZero:
  339. case wcRepeatOne:
  340. case wcOr:
  341. case wcBeginParen:
  342. case wcEndParen:
  343. break;
  344. case wcBeginRepeat:
  345. for ( wcs++; *wcs; wcs++ )
  346. {
  347. if ( *wcs == wcEscape && *(wcs+1) == wcEndRepeat )
  348. {
  349. wcs++;
  350. break;
  351. }
  352. }
  353. break;
  354. case wcBeginRange:
  355. wcs++;
  356. //
  357. // Check the special cases of ^ and ]
  358. //
  359. if ( *wcs == wcInvertRange )
  360. wcs++;
  361. if ( *wcs == wcEndRange )
  362. {
  363. _chars.AddRange( *wcs, *wcs );
  364. wcs++;
  365. }
  366. for ( ; *wcs && *wcs != wcEndRange; wcs++ )
  367. {
  368. if ( *(wcs + 1) == wcRangeSep )
  369. {
  370. _chars.AddRange( *wcs, *(wcs+2) );
  371. }
  372. else
  373. {
  374. _chars.AddRange( *wcs, *wcs );
  375. }
  376. }
  377. if ( *wcs != wcEndRange )
  378. {
  379. vqDebugOut(( DEB_WARN, "Invalid regex. Missing %wc\n", wcEndRange ));
  380. THROW( CException( QUERY_E_INVALIDRESTRICTION ) );
  381. }
  382. break;
  383. default:
  384. _chars.AddRange( *wcs, *wcs );
  385. break;
  386. }
  387. break;
  388. }
  389. default:
  390. _chars.AddRange( *wcs, *wcs );
  391. break;
  392. }
  393. wcs++;
  394. }
  395. _chars.Prepare();
  396. }
  397. WCHAR * CNFA::_wcsNull = (WCHAR*)"";
  398. //+-------------------------------------------------------------------------
  399. //
  400. // Member: CNFA::Parse, private
  401. //
  402. // Synopsis: Creates a NFA from [wcs]
  403. //
  404. // Effects: Parses [wcs] until end of string or character wcHalt is
  405. // encountered. On exit, [iStart] and [iEnd] contain the
  406. // starting and ending states of the NFA, respectively.
  407. // If [pwcsEnd] is non-null it receives a pointer one past the
  408. // position at which parsing stopped (i.e. past the halt character).
  //            Parenthesized groups and counted repeats are handled by
  //            recursive calls that pass a non-zero [wcHalt].
  409. //
  410. // Arguments: [wcs] -- Regular expression.
  411. // [iStart] -- Starting state of NFA. Zero on entry marks a top-level parse.
  412. // [iEnd] -- Ending state of NFA
  413. // [pwcsEnd] -- If non-null, set to one past the character where parsing stopped.
  414. // [wcHalt] -- Stop parsing if this character encountered.
  415. //
  //  Notes:     Throws CException( QUERY_E_INVALIDRESTRICTION ) on a
  //             malformed expression.
  //
  416. // History: 20-Jan-92 KyleP Created
  417. //
  418. //--------------------------------------------------------------------------
  419. void CNFA::Parse( WCHAR const * wcs,
  420. unsigned * iStart,
  421. unsigned * iEnd,
  422. WCHAR const * * pwcsEnd,
  423. WCHAR wcHalt )
  424. {
  425. unsigned iCurrent;
  426. unsigned iNext;
  427. unsigned iLocalStart; // Used for */+/? repositioning
  428. BOOLEAN fRepeat = FALSE; // Used for +
  429. BOOLEAN fTopLevel = (*iStart == 0); // TRUE if at top level;
  430. *iEnd = 0;
  431. //
  432. // Get a starting state. *iStart == 0 implies this is the 'top-level'
  433. // parse of the regular expression (e.g. we're not parsing a
  434. // parenthesized subexpression).
  435. //
  436. if ( fTopLevel )
  437. {
  438. iCurrent = _iNextState;
  439. *iStart = _iNextState++;
  440. iLocalStart = 0;
  441. //
  442. // non-EGREP (DOS) regex match entire string.
  443. //
  444. if ( *wcs != wcAnyMultiple )
  445. {
  446. iNext = _iNextState;
  447. Get( iCurrent )->AddTransition( symBeginLine, _iNextState );
  448. _iNextState++;
  449. iCurrent = iNext;
  450. }
  451. else
  452. {
  453. //
  454. // Add a 'special' transition on the very first state to
  455. // eat up characters until we actually jump into the
  456. // regular expression.
  457. //
  458. Get( iCurrent )->AddTransition( symAny, Get( iCurrent )->StateNumber() );
  459. }
  460. }
  461. else
  462. {
  463. iCurrent = *iStart;
  464. iLocalStart = *iStart;
  465. }
  // Every alternative of an OR clause restarts from this state.
  466. unsigned iOrStart = Get( iCurrent )->StateNumber();
  467. //
  468. // Original start of string.
  469. //
  470. WCHAR const * wcsBeginning = wcs;
  471. //
  472. // wcsLocalStart tracks the piece of string to be repeated for wcZeroOrOne, etc.
  473. //
  474. WCHAR const * wcsLocalStart = wcs;
  475. //
  476. // Parse the regular expression until there is no more or a
  477. // termination character is hit.
  478. //
  479. for ( ; *wcs && *wcs != wcHalt; wcs++ )
  480. {
  481. switch ( *wcs )
  482. {
  483. case wcAnySingle:
  484. iNext = _iNextState;
  485. Get( iCurrent )->AddTransition( symAny, _iNextState );
  486. iLocalStart = Get( iCurrent )->StateNumber();
  487. wcsLocalStart = wcs;
  488. _iNextState++;
  489. iCurrent = iNext;
  490. break;
  491. case wcAnyMultiple:
  492. //
  493. // Any single
  494. //
  495. iNext = _iNextState;
  496. Get( iCurrent )->AddTransition( symAny, _iNextState );
  497. iLocalStart = Get( iCurrent )->StateNumber();
  498. wcsLocalStart = wcs;
  499. _iNextState++;
  500. iCurrent = iNext;
  501. //
  502. // Repeat zero or more
  503. //
  504. Get( iLocalStart )->AddTransition( symEpsilon,
  505. Get( iCurrent )->StateNumber() );
  506. Get( iCurrent )->AddTransition( symEpsilon, iLocalStart );
  507. break;
  508. case wcEscape:
  509. {
  // The character after the escape selects the operator.
  510. wcs++;
  511. switch ( *wcs )
  512. {
  513. case wcBeginParen:
  514. {
  515. unsigned iLocalEnd;
  516. iLocalStart = Get( iCurrent )->StateNumber();
  517. wcsLocalStart = wcs - 1;
  518. wcs++; // Eat '('.
  519. Parse( wcs, &iLocalStart, &iLocalEnd, &wcs, wcEndParen );
  520. wcs--; // Provide character for loop to eat.
  521. iCurrent = iLocalEnd;
  522. break;
  523. }
  524. case wcEndParen:
  525. //
  526. // Taken care of at outer level. Just backup so we hit the end.
  527. //
  528. wcs--;
  529. break;
  530. case wcBeginRepeat:
  531. {
  532. if ( wcHalt == wcBeginRepeat )
  533. {
  534. //
  535. // Taken care of at outer level. Just backup so we hit the end.
  536. //
  537. wcs--;
  538. }
  539. else
  540. {
  541. //
  542. // Setup: Bounds of repeated regex
  543. //
  544. WCHAR const * wcsStartRepeat = wcsLocalStart; // NOTE(review): never read below; the loops use wcsLocalStart directly.
  545. WCHAR const * wcsEndRepeat = wcs + 1;
  546. //
  547. // Setup: Repeat parameters.
  548. //
  549. unsigned cRepeat1, cRepeat2;
  550. wcs++;
  551. ParseRepeat( wcs, cRepeat1, cRepeat2 );
  552. unsigned iLocalEnd;
  553. //
  554. // The minimum set has no epsilon transitions.
  // The repeated piece is re-parsed (cRepeat1 - 1) more times, each
  // copy starting at the end state of the previous one.
  555. //
  556. if ( cRepeat1 > 1 )
  557. {
  558. iLocalStart = Get( iCurrent )->StateNumber();
  559. iLocalEnd = iLocalStart;
  560. for ( unsigned i = 1; i < cRepeat1; i++ )
  561. {
  562. WCHAR const * wcsEnd;
  563. iLocalStart = iLocalEnd;
  564. iLocalEnd = 0; // Must be zero!
  565. Parse( wcsLocalStart, &iLocalStart, &iLocalEnd, &wcsEnd, wcBeginRepeat );
  566. if ( wcsEnd != wcsEndRepeat )
  567. {
  568. vqDebugOut(( DEB_ERROR, "Invalid regex: Nested repeats?\n" ));
  569. THROW( CException( QUERY_E_INVALIDRESTRICTION ) );
  570. }
  571. }
  572. }
  573. else
  574. iLocalEnd = Get( iCurrent )->StateNumber();
  575. if ( cRepeat1 == cRepeat2 )
  576. {
  577. vqDebugOut(( DEB_REGEX, "REPEAT: Exactly %u times\n", cRepeat1 ));
  578. }
  579. else if ( cRepeat2 == 0 )
  580. {
  581. vqDebugOut(( DEB_REGEX, "REPEAT: At least %u times\n", cRepeat1 ));
  582. Get( iLocalEnd )->AddTransition( symEpsilon, iLocalStart );
  583. }
  584. else if ( cRepeat2 > cRepeat1 )
  585. {
  // Each additional copy up to cRepeat2 gets an epsilon edge from its
  // start to its end.
  586. for ( unsigned i = cRepeat1; i < cRepeat2; i++ )
  587. {
  588. WCHAR const * wcsEnd;
  589. iLocalStart = iLocalEnd;
  590. iLocalEnd = 0; // Must be zero!
  591. Parse( wcsLocalStart, &iLocalStart, &iLocalEnd, &wcsEnd, wcBeginRepeat );
  592. Get( iLocalStart )->AddTransition( symEpsilon, iLocalEnd );
  593. if ( wcsEnd != wcsEndRepeat )
  594. {
  595. vqDebugOut(( DEB_ERROR, "Invalid regex: Nested repeats?\n" ));
  596. THROW( CException( QUERY_E_INVALIDRESTRICTION ) );
  597. }
  598. }
  599. }
  600. else
  601. {
  602. vqDebugOut(( DEB_ERROR, "Invalid regex: End repeat count %d < start %d\n",
  603. cRepeat2, cRepeat1 ));
  604. THROW( CException( QUERY_E_INVALIDRESTRICTION ) );
  605. }
  606. iCurrent = iLocalEnd;
  607. iLocalStart = 0;
  608. wcsLocalStart = _wcsNull;
  609. }
  610. break;
  611. }
  612. case wcOr:
  613. if ( *iEnd == 0 )
  614. {
  615. //
  616. // First part of OR clause.
  617. //
  618. if ( fTopLevel )
  619. {
  620. iNext = _iNextState;
  621. Get( iCurrent )->AddTransition( symEndLine, _iNextState );
  622. _iNextState++;
  623. iCurrent = iNext;
  624. }
  625. *iEnd = Get( iCurrent )->StateNumber();
  626. }
  627. else
  628. {
  629. //
  630. // Subsequent OR clause. Epsilon link to end
  631. //
  632. Get( iCurrent )->AddTransition( symEpsilon, *iEnd );
  633. }
  // Start the next alternative from the common OR start state.
  634. iCurrent = iOrStart;
  635. wcsLocalStart = _wcsNull;
  636. iLocalStart = 0;
  637. break;
  638. case wcBeginRange:
  639. {
  640. BOOLEAN fReverse = FALSE;
  641. vqDebugOut(( DEB_REGEX, "RANGE\n" ));
  642. wcsLocalStart = wcs-1;
  643. iNext = _iNextState;
  644. wcs++; // Eat '['. ']' eaten by loop.
  645. //
  646. // Check the special cases of ^ and ]
  647. //
  648. if ( *wcs == wcInvertRange )
  649. {
  650. wcs++;
  651. fReverse = TRUE;
  652. //
  653. // Add all transitions, they will be removed later.
  654. //
  655. for ( unsigned uiNext = _chars.TranslateRange( 1,
  656. wcLastValidChar );
  657. uiNext != 0;
  658. uiNext = _chars.TranslateRange( 0, wcLastValidChar ) )
  659. {
  660. Get( iCurrent )->AddTransition( uiNext,
  661. _iNextState );
  662. }
  663. }
  664. if ( *wcs == wcEndRange )
  665. {
  666. if ( fReverse )
  667. {
  668. Get( iCurrent )->RemoveTransition( _chars.Translate( *wcs++ ),
  669. _iNextState );
  670. }
  671. else
  672. {
  673. Get( iCurrent )->AddTransition( _chars.Translate( *wcs++ ),
  674. _iNextState );
  675. }
  676. }
  677. for ( ; *wcs && *wcs != wcEndRange; wcs++ )
  678. {
  679. if ( *(wcs + 1) == wcRangeSep )
  680. {
  681. vqDebugOut(( DEB_REGEX,
  682. "Range %u to %u\n", *wcs, *(wcs+2) ));
  683. for ( unsigned uiNext = _chars.TranslateRange( *wcs,
  684. *(wcs+2) );
  685. uiNext != 0;
  686. uiNext = _chars.TranslateRange( 0, *(wcs+2) ) )
  687. {
  688. if ( fReverse )
  689. {
  690. Get( iCurrent )->RemoveTransition( uiNext,
  691. _iNextState );
  692. }
  693. else
  694. {
  695. Get( iCurrent )->AddTransition( uiNext,
  696. _iNextState );
  697. }
  698. }
  699. wcs += 2; // Skip the separator and the upper bound of the range.
  700. }
  701. else
  702. {
  703. vqDebugOut(( DEB_REGEX, "Singleton = %u\n", *wcs ));
  704. if ( fReverse )
  705. {
  706. Get( iCurrent )->RemoveTransition(
  707. _chars.Translate( *wcs ),
  708. _iNextState );
  709. }
  710. else
  711. {
  712. Get( iCurrent )->AddTransition(
  713. _chars.Translate( *wcs ),
  714. _iNextState );
  715. }
  716. }
  717. }
  718. if ( *wcs != wcEndRange )
  719. {
  720. vqDebugOut(( DEB_WARN, "Invalid regex. Missing %wc\n", wcEndRange ));
  721. THROW( CException( QUERY_E_INVALIDRESTRICTION ) );
  722. }
  723. iLocalStart = Get( iCurrent )->StateNumber();
  724. _iNextState++;
  725. iCurrent = iNext;
  726. break;
  727. }
  728. case wcRepeatOne:
  729. if ( iLocalStart == 0 )
  730. {
  731. vqDebugOut(( DEB_ERROR, "Invalid regex. Nothing to repeat\n" ));
  732. THROW( CException( QUERY_E_INVALIDRESTRICTION ) );
  733. }
  734. Get( iCurrent )->AddTransition( symEpsilon, iLocalStart );
  735. iNext = _iNextState;
  736. Get( iCurrent )->AddTransition( symEpsilon, _iNextState );
  737. wcsLocalStart = wcs - 1;
  738. _iNextState++;
  739. iCurrent = iNext;
  740. break;
  741. case wcRepeatZero:
  742. if ( iLocalStart == 0 )
  743. {
  744. vqDebugOut(( DEB_ERROR, "Invalid regex. Nothing to repeat.\n" ));
  745. THROW( CException( QUERY_E_INVALIDRESTRICTION ) );
  746. }
  747. Get( iLocalStart )->AddTransition( symEpsilon,
  748. Get( iCurrent )->StateNumber() );
  749. Get( iCurrent )->AddTransition( symEpsilon, iLocalStart );
  750. iNext = _iNextState;
  751. Get( iCurrent )->AddTransition( symEpsilon, _iNextState );
  752. wcsLocalStart = wcs - 1;
  753. _iNextState++;
  754. iCurrent = iNext;
  755. break;
  756. case wcRepeatZeroOrOne:
  757. {
  758. if ( iLocalStart == 0 )
  759. {
  760. vqDebugOut(( DEB_ERROR, "Invalid regex. Nothing to repeat.\n" ));
  761. THROW( CException( QUERY_E_INVALIDRESTRICTION ) );
  762. }
  // An epsilon edge from the start of the previous element to the
  // current state lets that element be skipped.
  763. Get( iLocalStart )->AddTransition( symEpsilon,
  764. Get( iCurrent )->StateNumber() );
  765. break;
  766. }
  767. default:
  // Any other escaped character is matched literally.
  768. iNext = _iNextState;
  769. Get( iCurrent )->AddTransition( _chars.Translate( *wcs ),
  770. _iNextState );
  771. iLocalStart = Get( iCurrent )->StateNumber();
  772. wcsLocalStart = wcs - 1;
  773. _iNextState++;
  774. iCurrent = iNext;
  775. break;
  776. }
  777. break; // switch for wcEscape
  778. }
  779. default:
  780. iNext = _iNextState;
  781. Get( iCurrent )->AddTransition( _chars.Translate( *wcs ),
  782. _iNextState );
  783. //
  784. // In non-EGREP (DOS) syntax dot '.' is funny. It will match
  785. // a dot, but if you're at the end of string it will also match
  786. // end. So *.txt will look for strings with zero or more
  787. // characters followed by '.txt' but *. will find any names
  788. // without an extension and with no trailing dot.
  789. //
  790. if ( *wcs == wcDOSDot )
  791. {
  792. Get( iCurrent )->AddTransition( symEndLine, _iNextState );
  793. }
  794. iLocalStart = Get( iCurrent )->StateNumber();
  795. wcsLocalStart = wcs;
  796. _iNextState++;
  797. iCurrent = iNext;
  798. break;
  799. }
  800. }
  801. //
  802. // non-EGREP (DOS) regex match entire string.
  // The end-of-line transition is added only when wcHalt is 0 and the
  // expression does not end in an unescaped wcAnyMultiple.
  803. //
  804. if ( wcHalt == 0 &&
  805. ( ( wcsBeginning+1 <= wcs && *(wcs-1) != wcAnyMultiple ) ||
  806. ( wcsBeginning+2 <= wcs && *(wcs-2) == wcEscape ) ) )
  807. {
  808. iNext = _iNextState;
  809. Get( iCurrent )->AddTransition( symEndLine, _iNextState );
  810. iLocalStart = 0;
  811. wcsLocalStart = _wcsNull;
  812. _iNextState++;
  813. iCurrent = iNext;
  814. }
  815. //
  816. // If we haven't had an OR clause yet, then set iEnd
  817. //
  818. if ( *iEnd == 0 )
  819. {
  820. //
  821. // First part of OR clause.
  822. //
  823. *iEnd = Get( iCurrent )->StateNumber();
  824. }
  825. else
  826. {
  827. //
  828. // Subsequent OR clause. Epsilon link to end
  829. //
  830. Get( iCurrent )->AddTransition( symEpsilon, *iEnd );
  831. }
  832. if ( pwcsEnd )
  833. {
  834. *pwcsEnd = wcs + 1; // Eat halt character.
  835. }
  // When wcHalt is 0 the loop can only stop on the terminator, so this
  // check can only fail for a non-zero halt character that was never found.
  836. if( *wcs != wcHalt )
  837. {
  838. vqDebugOut(( DEB_WARN, "Invalid regex. Missing %wc\n", wcHalt ));
  839. THROW( CException( QUERY_E_INVALIDRESTRICTION ) );
  840. }
  841. }
  842. void CNFA::ParseRepeat( WCHAR const * & wcs, unsigned & cRepeat1, unsigned & cRepeat2 )
  843. {
  844. cRepeat1 = 0;
  845. cRepeat2 = 0;
  846. for ( ; *wcs && isdigit(*wcs); wcs++ )
  847. {
  848. cRepeat1 *= 10;
  849. cRepeat1 += *wcs - '0';
  850. }
  851. if ( cRepeat1 == 0 || cRepeat1 > 255 )
  852. {
  853. vqDebugOut(( DEB_ERROR, "Invalid regex: Repeat count %d out of bounds.\n", cRepeat1 ));
  854. THROW( CException( QUERY_E_INVALIDRESTRICTION ) );
  855. }
  856. if ( *wcs == ',' )
  857. {
  858. wcs++;
  859. if ( *wcs == wcEscape && *(wcs+1) == wcEndRepeat )
  860. {
  861. wcs++;
  862. }
  863. else
  864. {
  865. for ( ; *wcs && isdigit(*wcs); wcs++ )
  866. {
  867. cRepeat2 *= 10;
  868. cRepeat2 += *wcs - '0';
  869. }
  870. if ( cRepeat2 == 0 || cRepeat2 > 255 )
  871. {
  872. vqDebugOut(( DEB_ERROR, "Invalid regex: Repeat count %d too big.\n", cRepeat2 ));
  873. THROW( CException( QUERY_E_INVALIDRESTRICTION ) );
  874. }
  875. if ( *wcs != wcEscape || *(wcs+1) != wcEndRepeat )
  876. {
  877. vqDebugOut(( DEB_ERROR, "Invalid regex: No end to repeat specification.\n" ));
  878. THROW( CException( QUERY_E_INVALIDRESTRICTION ) );
  879. }
  880. else
  881. {
  882. wcs++;
  883. }
  884. }
  885. }
  886. else if ( *wcs == wcEscape && *(wcs+1) == wcEndRepeat )
  887. {
  888. wcs++;
  889. cRepeat2 = cRepeat1;
  890. }
  891. else
  892. {
  893. vqDebugOut(( DEB_ERROR, "Invalid regex: No end to repeat specification.\n" ));
  894. THROW( CException( QUERY_E_INVALIDRESTRICTION ) );
  895. }
  896. }
  897. //+-------------------------------------------------------------------------
  898. //
  899. // Member: CDFA::CDFA, public
  900. //
  901. // Synopsis: Constructs a DFA from a NFA.
  902. //
  903. // Arguments: [pwcs] -- Regular expression (passed to NFA)
  904. // [timeLimit] -- Execution time limit
  905. // [fCaseSens] -- TRUE if case-sensitive search
  906. //
  907. // History: 20-Jan-92 KyleP Created
  908. //
  909. //--------------------------------------------------------------------------
  910. CDFA::CDFA( WCHAR const * pwcs, CTimeLimit & timeLimit, BOOLEAN fCaseSens )
  911. : _nfa( pwcs, fCaseSens ),
  912. _xs( _nfa.NumStates() ),
  913. _cState( _nfa.NumStates() ),
  914. _timeLimit( timeLimit )
  915. {
  916. CommonCtor();
  917. }
  918. //+-------------------------------------------------------------------------
  919. //
  920. // Member: CDFA::CDFA, public
  921. //
  922. // Synopsis: Copy constructor
  923. //
  924. // Arguments: [pwcs] -- Regular expression (passed to NFA)
  925. // [fCaseSens] -- TRUE if case-sensitive search
  926. //
  927. // History: 20-Jan-92 KyleP Created
  928. //
  929. //--------------------------------------------------------------------------
  930. CDFA::CDFA( CDFA const & src )
  931. : _nfa( src._nfa ),
  932. _xs( src._nfa.NumStates() ),
  933. _cState( src._nfa.NumStates() ),
  934. _timeLimit( (CTimeLimit &) src._timeLimit )
  935. {
  936. CommonCtor();
  937. }
  938. //+-------------------------------------------------------------------------
  939. //
  940. // Member: CDFA::CommonCtor, private
  941. //
  942. // Synopsis: Code common to both constructors.
  943. //
  944. // History: 13-Jul-95 KyleP Snarfed from constructor
  945. //
  946. //--------------------------------------------------------------------------
  947. void CDFA::CommonCtor()
  948. {
  949. //
  950. // Add initial state.
  951. //
  952. CStateSet ss;
  953. _nfa.EpsClosure( _nfa.StartState(), ss );
  954. _stateStart = _xs.XlatToOne( ss );
  955. //
  956. // Intialize translation table.
  957. //
  958. int cEntries = (_cState + 1) * ( _nfa.Translate().NumClasses() + 1 );
  959. _xStateTrans.Init( cEntries );
  960. _xStateFinal.Init( _cState + 1 );
  961. Win4Assert( stateUncomputed == 0xFFFFFFFF );
  962. memset( _xStateTrans.GetPointer(), 0xFF, cEntries * sizeof( unsigned ) );
  963. RtlZeroMemory( _xStateFinal.GetPointer(), (_cState + 1) * sizeof( BOOLEAN ) );
  964. for ( int i = _cState; i >= 0; i-- )
  965. {
  966. AddTransition( i, 0, stateUndefined );
  967. }
  968. Add( _stateStart, _nfa.IsFinal( ss ) );
  969. # if (CIDBG == 1)
  970. vqDebugOut(( DEB_REGEX, "Character translation:\n" ));
  971. _nfa.Translate().Display();
  972. vqDebugOut(( DEB_REGEX, "NFA:\n" ));
  973. _nfa.Display();
  974. vqDebugOut(( DEB_REGEX, "DFA state %u = NFA states ", _stateStart ));
  975. ss.Display();
  976. vqDebugOut(( DEB_REGEX | DEB_NOCOMPNAME, "\n" ));
  977. vqDebugOut(( DEB_REGEX, "DFA start state = %u\n", _stateStart ));
  978. # endif // (CIDBG == 1)
  979. }
  980. //+-------------------------------------------------------------------------
  981. //
  982. // Member: CDFA::~CDFA, public
  983. //
  984. // Synopsis: Clean up DFA. Free state tables.
  985. //
  986. // History: 20-Jun-92 KyleP Created
  987. //
  988. //--------------------------------------------------------------------------
  989. CDFA::~CDFA()
  990. {
  991. }
  992. //+-------------------------------------------------------------------------
  993. //
  994. // Member: CDFA::Recognize, public
  995. //
  996. // Arguments: [wcs] -- Input string.
  997. //
  998. // Returns: TRUE if [wcs] is matched by the regular expression.
  999. //
  1000. // History: 20-Jan-92 KyleP Created
  1001. //
  1002. //--------------------------------------------------------------------------
  1003. BOOLEAN CDFA::Recognize( WCHAR const * wcs )
  1004. {
  1005. # if CIDBG == 1
  1006. ValidateStateTransitions();
  1007. # endif // CIDBG == 1
  1008. unsigned CurrentState = _stateStart;
  1009. unsigned LastState = CurrentState;
  1010. BOOLEAN fFinal = IsFinal( CurrentState );
  1011. WCHAR wcCurrent = symBeginLine;
  1012. while ( !fFinal )
  1013. {
  1014. unsigned NextState;
  1015. {
  1016. CReadAccess lock( _rwa );
  1017. //
  1018. // Casting is to guarantee this method doesn't modify anything (e.g. read lock ok).
  1019. //
  1020. #if CIDBG == 1
  1021. NextState = ((CDFA const *)this)->Move( CurrentState, wcCurrent );
  1022. #else
  1023. NextState = Move( CurrentState, wcCurrent );
  1024. #endif
  1025. }
  1026. vqDebugOut(( DEB_REGEX,
  1027. "DFA move[ %u, %u ] = %u\n",
  1028. CurrentState, wcCurrent, NextState ));
  1029. if ( stateUncomputed == NextState )
  1030. {
  1031. CWriteAccess lock( _rwa );
  1032. //
  1033. // Did someone else get here first?
  1034. //
  1035. NextState = Move( CurrentState, wcCurrent );
  1036. if ( stateUncomputed != NextState )
  1037. continue;
  1038. //
  1039. // Build the new state
  1040. //
  1041. CStateSet ssCurrent;
  1042. CStateSet ssNew;
  1043. CStateSet ssClosed;
  1044. _xs.XlatToMany( CurrentState, ssCurrent );
  1045. # if (CIDBG == 1)
  1046. vqDebugOut(( DEB_REGEX,
  1047. "DFA state %u = NFA states ", CurrentState ));
  1048. ssCurrent.Display();
  1049. if ( _nfa.IsFinal( ssCurrent ) )
  1050. {
  1051. vqDebugOut(( DEB_REGEX | DEB_NOCOMPNAME, " FINAL" ));
  1052. }
  1053. vqDebugOut(( DEB_REGEX | DEB_NOCOMPNAME, "\n" ));
  1054. # endif // (CIDBG == 1)
  1055. _nfa.Move( ssCurrent, ssNew, wcCurrent );
  1056. if ( ssNew.Count() == 0 )
  1057. {
  1058. NextState = stateUndefined;
  1059. AddTransition( CurrentState, wcCurrent, NextState );
  1060. vqDebugOut(( DEB_REGEX, "Undefined transition from %u on %u\n",
  1061. CurrentState,
  1062. wcCurrent ));
  1063. }
  1064. else
  1065. {
  1066. _nfa.EpsClosure( ssNew, ssClosed );
  1067. # if (CIDBG == 1)
  1068. vqDebugOut(( DEB_REGEX, "NFA move FROM " ));
  1069. ssCurrent.Display();
  1070. vqDebugOut(( DEB_REGEX | DEB_NOCOMPNAME,
  1071. " ON %d TO ", wcCurrent ));
  1072. ssClosed.Display();
  1073. vqDebugOut(( DEB_REGEX | DEB_NOCOMPNAME, "\n" ));
  1074. # endif // (CIDBG == 1)
  1075. NextState = _xs.XlatToOne( ssClosed );
  1076. if ( !IsComputed( NextState ) )
  1077. {
  1078. Add( NextState, _nfa.IsFinal( ssClosed ) );
  1079. }
  1080. # if (CIDBG == 1)
  1081. vqDebugOut(( DEB_REGEX,
  1082. "DFA state %u = NFA states ", NextState ));
  1083. ssClosed.Display();
  1084. vqDebugOut(( DEB_REGEX | DEB_NOCOMPNAME, "\n" ));
  1085. # endif // (CIDBG == 1)
  1086. AddTransition( CurrentState, wcCurrent, NextState );
  1087. vqDebugOut(( DEB_REGEX,
  1088. "Adding transition from %u on %u to %u\n",
  1089. CurrentState,
  1090. wcCurrent,
  1091. NextState ));
  1092. }
  1093. if ( _timeLimit.CheckExecutionTime() )
  1094. {
  1095. vqDebugOut(( DEB_WARN,
  1096. "CDFA::Recognize: aborting because execution time limit has been exceeded\n" ));
  1097. THROW( CException( QUERY_E_TIMEDOUT ) );
  1098. }
  1099. }
  1100. if ( NextState == stateUndefined )
  1101. {
  1102. return( FALSE );
  1103. }
  1104. //
  1105. // The following are to find a specific condition detected on
  1106. // JHavens' machine.
  1107. //
  1108. Win4Assert( LastState <= _cState );
  1109. Win4Assert( CurrentState <= _cState );
  1110. Win4Assert( NextState <= _cState );
  1111. LastState = CurrentState;
  1112. CurrentState = NextState;
  1113. fFinal = IsFinal( CurrentState );
  1114. //
  1115. // If we ran out of string then just keep going, appending
  1116. // end-of-string symbols. Unfortunately the string is conceptually
  1117. // a set of characters followed by an arbitrary number of
  1118. // end-of-string symbols. In non-EGREP the end-of-string symbol
  1119. // may actually cause multiple state transitions before reaching
  1120. // a final state. In non-EGREP (DOS) mode we stop only when we
  1121. // are no longer 'making progress' (moving to new states) on
  1122. // end-of-string. I haven't completely convinced myself this
  1123. // algorithm is guaranteed to terminate.
  1124. //
  1125. if ( wcCurrent == symEndLine )
  1126. {
  1127. if ( LastState == CurrentState )
  1128. break;
  1129. }
  1130. else
  1131. {
  1132. wcCurrent = *wcs++;
  1133. //
  1134. // After we've exhausted the string, append the special
  1135. // end-of-line character.
  1136. //
  1137. if ( wcCurrent == 0 )
  1138. {
  1139. wcCurrent = symEndLine;
  1140. }
  1141. else
  1142. {
  1143. vqDebugOut(( DEB_REGEX, "\"%c\" --> ", wcCurrent ));
  1144. //
  1145. // Casting is to guarantee this method doesn't modify anything (e.g. read lock ok).
  1146. //
  1147. #if CIDBG == 1
  1148. wcCurrent = (WCHAR) ((CNFA const *)&_nfa)->Translate().Translate( wcCurrent );
  1149. #else
  1150. wcCurrent = (WCHAR) _nfa.Translate().Translate( wcCurrent );
  1151. #endif
  1152. vqDebugOut(( DEB_REGEX | DEB_NOCOMPNAME, "%u\n", wcCurrent ));
  1153. }
  1154. }
  1155. }
  1156. # if CIDBG == 1
  1157. ValidateStateTransitions();
  1158. # endif // CIDBG == 1
  1159. return( fFinal );
  1160. }
  1161. //+-------------------------------------------------------------------------
  1162. //
  1163. // Member: CDFA::Add, private
  1164. //
  1165. // Synopsis: Adds a new state the the DFA.
  1166. //
  1167. // Arguments: [state] -- State number
  1168. // [fFinal] -- TRUE if state is a final state.
  1169. //
  1170. // History: 20-Jan-92 KyleP Created
  1171. //
  1172. // Notes: All transitions for the new state are initially uncomputed.
  1173. //
  1174. //--------------------------------------------------------------------------
  1175. void CDFA::Add( unsigned state, BOOLEAN fFinal )
  1176. {
  1177. if ( state > _cState )
  1178. {
  1179. vqDebugOut(( DEB_ITRACE, "Growing DFA state array.\n" ));
  1180. //
  1181. // Since the number of states required will probably grow at
  1182. // a slow rate, increase the size of the array in a linear
  1183. // fashion.
  1184. unsigned const DeltaState = 10;
  1185. XPtrST<unsigned> xOldStateTrans( _xStateTrans.Acquire() );
  1186. XPtrST<BOOLEAN> xOldStateFinal( _xStateFinal.Acquire() );
  1187. unsigned oldcState = _cState;
  1188. unsigned oldcEntries = (_cState + 1) *
  1189. ( _nfa.Translate().NumClasses() + 1 );
  1190. _cState += DeltaState;
  1191. unsigned cEntries = (_cState + 1) * ( _nfa.Translate().NumClasses() + 1 );
  1192. _xStateTrans.Init( cEntries );
  1193. _xStateFinal.Init( _cState + 1 );
  1194. //
  1195. // Initilize new state tables...
  1196. //
  1197. memcpy( _xStateTrans.GetPointer(),
  1198. xOldStateTrans.GetPointer(),
  1199. oldcEntries * sizeof( unsigned ) );
  1200. memcpy( _xStateFinal.GetPointer(),
  1201. xOldStateFinal.GetPointer(),
  1202. oldcState * sizeof( BOOLEAN ) );
  1203. Win4Assert( stateUncomputed == 0xFFFFFFFF );
  1204. memset( _xStateTrans.GetPointer() + oldcEntries, 0xFF, (cEntries - oldcEntries)*sizeof(unsigned ) );
  1205. RtlZeroMemory( _xStateFinal.GetPointer() + oldcState, (_cState + 1 - oldcState)*sizeof(BOOLEAN) );
  1206. for ( unsigned i = _cState - DeltaState + 1; i <= _cState; i++ )
  1207. {
  1208. AddTransition( i, 0, stateUndefined );
  1209. }
  1210. }
  1211. //
  1212. // All states are set to stateUncomputed above, except the 'undefined' flag-state.
  1213. //
  1214. # if CIDBG == 1
  1215. for ( int i = _nfa.Translate().NumClasses(); i > 0; i-- )
  1216. Win4Assert( Move( state, i ) == stateUncomputed );
  1217. # endif
  1218. AddTransition( state, 0, stateUncomputed );
  1219. _xStateFinal[state] = fFinal;
  1220. }
  1221. //+---------------------------------------------------------------------------
  1222. //
  1223. // Member: CRegXpr::CRegXpr, public
  1224. //
  1225. // Synopsis: Create an expression used to match <prop> with a regex.
  1226. //
  1227. // Arguments: [prel] -- Property restriction.
  1228. // [timeLimit] -- Execution time limit
  1229. //
  1230. // History: 15-Apr-92 KyleP Created.
  1231. //
  1232. //----------------------------------------------------------------------------
  1233. CRegXpr::CRegXpr( CInternalPropertyRestriction * prst, CTimeLimit& timeLimit )
  1234. : CXpr( CXpr::NTRegex ),
  1235. _pxpval( prst->Pid() ),
  1236. _xrstContentHelper( prst->AcquireContentHelper() ),
  1237. //
  1238. // Feature decision: Make all regular expressions case insensitive.
  1239. //
  1240. _dfa( prst->Value(), timeLimit, FALSE ),
  1241. _ulCodePage( LocaleToCodepage( GetSystemDefaultLCID() ))
  1242. {
  1243. //
  1244. // Existence of _prstContentHelper implies a fixed starting prefix.
  1245. //
  1246. if ( !_xrstContentHelper.IsNull() )
  1247. {
  1248. //
  1249. // Find fixed prefix, and add it as a view value
  1250. //
  1251. unsigned i = wcscspn( prst->Value().GetLPWSTR(),
  1252. awcSpecialRegex );
  1253. if ( i > 0 )
  1254. {
  1255. WCHAR wcs[50];
  1256. if ( i > sizeof(wcs)/sizeof(WCHAR) - 2 )
  1257. i = sizeof(wcs)/sizeof(WCHAR) - 2;
  1258. //
  1259. // If "foo" is the prefix, we want all values from "foo" to "fop",
  1260. // but I'm going to be lazy. If the trailing letter of the prefix is
  1261. // 0xFFFF then I just won't set bounds.
  1262. //
  1263. if ( prst->Value().GetLPWSTR()[i-1] != 0xFFFF )
  1264. {
  1265. memcpy( wcs, prst->Value().GetLPWSTR(), i*sizeof(WCHAR) );
  1266. wcs[i] = 0;
  1267. _varPrefix.SetLPWSTR( wcs );
  1268. }
  1269. }
  1270. }
  1271. }
  1272. //+---------------------------------------------------------------------------
  1273. //
  1274. // Member: CRegXpr::CRegXpr, public
  1275. //
  1276. // Synopsis: Copy constructor
  1277. //
  1278. // Arguments: [src] -- Source expression
  1279. //
  1280. // History: 13-Jul-95 KyleP Created.
  1281. //
  1282. //----------------------------------------------------------------------------
  1283. CRegXpr::CRegXpr( CRegXpr const & src )
  1284. : CXpr( CXpr::NTRegex ),
  1285. _pxpval( src._pxpval ),
  1286. _varPrefix( src._varPrefix ),
  1287. _dfa( src._dfa ),
  1288. _ulCodePage( src._ulCodePage )
  1289. {
  1290. if ( !src._xrstContentHelper.IsNull() )
  1291. _xrstContentHelper.Set( src._xrstContentHelper->Clone() );
  1292. }
  1293. //+---------------------------------------------------------------------------
  1294. //
  1295. // Member: CRegXpr::Clone, public
  1296. //
  1297. // Returns: A copy of this node.
  1298. //
  1299. // Derivation: From base class CXpr, Always override in subclasses.
  1300. //
  1301. // History: 11-Dec-91 KyleP Created.
  1302. //
  1303. //----------------------------------------------------------------------------
  1304. CXpr * CRegXpr::Clone()
  1305. {
  1306. return new CRegXpr( *this );
  1307. }
  1308. void CRegXpr::SelectIndexing( CIndexStrategy & strategy )
  1309. {
  1310. if ( _pxpval.Pid() == pidPath ||
  1311. _pxpval.Pid() == pidDirectory ||
  1312. _pxpval.Pid() == pidVirtualPath )
  1313. {
  1314. strategy.SetUnknownBounds( _pxpval.Pid() );
  1315. return;
  1316. }
  1317. if ( _varPrefix.Type() == VT_LPWSTR )
  1318. {
  1319. strategy.SetLowerBound( _pxpval.Pid(), _varPrefix );
  1320. WCHAR * wcs = (WCHAR *)_varPrefix.GetLPWSTR();
  1321. unsigned cc = wcslen( wcs );
  1322. Win4Assert( wcs[cc-1] != 0xFFFF );
  1323. wcs[cc-1] = wcs[cc-1] + 1;
  1324. strategy.SetUpperBound( _pxpval.Pid(), _varPrefix, TRUE );
  1325. }
  1326. if ( !_xrstContentHelper.IsNull() )
  1327. {
  1328. strategy.SetContentHelper( _xrstContentHelper.GetPointer() );
  1329. _xrstContentHelper.Acquire();
  1330. }
  1331. }
  1332. //+---------------------------------------------------------------------------
  1333. //
  1334. // Member: CRegXpr::IsMatch, public
  1335. //
  1336. // Arguments: [obj] -- The objects table. [obj] is already positioned
  1337. // to the record to test.
  1338. //
  1339. // Returns: TRUE if the current record satisfies the regex.
  1340. //
  1341. // History: 15-Apr-92 KyleP Created.
  1342. //
  1343. //----------------------------------------------------------------------------
  1344. BOOL CRegXpr::IsMatch( CRetriever & obj )
  1345. {
  1346. // Make this big enough for most paths
  1347. const cbGuess = ( MAX_PATH * sizeof WCHAR ) + sizeof PROPVARIANT;
  1348. XGrowable<BYTE,cbGuess> xBuffer;
  1349. PROPVARIANT * ppv = (PROPVARIANT *) xBuffer.Get();
  1350. ULONG cb = xBuffer.SizeOf();
  1351. GetValueResult rc = _pxpval.GetValue( obj, ppv, &cb );
  1352. //
  1353. // If the object is too big for the stack then allocate heap (sigh).
  1354. //
  1355. if ( rc == GVRNotEnoughSpace )
  1356. {
  1357. xBuffer.SetSize( cb );
  1358. ppv = (PROPVARIANT *) xBuffer.Get();
  1359. rc = _pxpval.GetValue( obj, ppv, &cb );
  1360. }
  1361. if ( rc != GVRSuccess )
  1362. return FALSE;
  1363. // MAX_PATH here is just a heuristic
  1364. XGrowable<WCHAR, MAX_PATH> xConvert;
  1365. //
  1366. // Cast LPSTR to LPWSTR
  1367. //
  1368. if ( ppv->vt == VT_LPSTR )
  1369. {
  1370. cb = strlen( ppv->pszVal );
  1371. ULONG cwcOut = cb + cb / 4 + 1;
  1372. xConvert.SetSize( cwcOut );
  1373. ULONG cwcActual = 0;
  1374. do
  1375. {
  1376. cwcActual = MultiByteToWideChar( _ulCodePage,
  1377. 0,
  1378. ppv->pszVal,
  1379. cb + 1,
  1380. xConvert.Get(),
  1381. cwcOut );
  1382. if ( cwcActual == 0 )
  1383. {
  1384. if ( GetLastError() == ERROR_INSUFFICIENT_BUFFER )
  1385. {
  1386. cwcOut *= 2;
  1387. xConvert.SetSize( cwcOut );
  1388. }
  1389. else
  1390. THROW( CException() );
  1391. }
  1392. } while ( 0 == cwcActual );
  1393. ppv->vt = VT_LPWSTR;
  1394. ppv->pwszVal = xConvert.Get();
  1395. }
  1396. else if ( ppv->vt == VT_LPWSTR || ppv->vt == VT_BSTR )
  1397. {
  1398. //
  1399. // Normalize to precomposed Unicode
  1400. //
  1401. ULONG cwcIn;
  1402. WCHAR *pwcIn;
  1403. if ( ppv->vt == VT_LPWSTR )
  1404. {
  1405. pwcIn = ppv->pwszVal;
  1406. cwcIn = wcslen(pwcIn) + 1;
  1407. }
  1408. else // ppv->vt == VT_BSTR
  1409. {
  1410. pwcIn = ppv->bstrVal;
  1411. cwcIn = SysStringLen( pwcIn ) + 1;
  1412. }
  1413. xConvert.SetSize( cwcIn );
  1414. ULONG cwcFolded = FoldStringW( MAP_PRECOMPOSED,
  1415. pwcIn,
  1416. cwcIn,
  1417. xConvert.Get(),
  1418. cwcIn );
  1419. if ( cwcFolded == 0 )
  1420. {
  1421. Win4Assert( GetLastError() != ERROR_INSUFFICIENT_BUFFER );
  1422. THROW( CException() );
  1423. }
  1424. ppv->vt = VT_LPWSTR;
  1425. ppv->pwszVal = xConvert.Get();
  1426. }
  1427. //
  1428. // But any other types are illegal
  1429. //
  1430. if ( ppv->vt != VT_LPWSTR )
  1431. {
  1432. vqDebugOut(( DEB_ITRACE,
  1433. "CRegXpr::IsMatch -- Type mismatch. Got 0x%x\n",
  1434. ppv->vt ));
  1435. return FALSE;
  1436. }
  1437. return _dfa.Recognize( ppv->pwszVal );
  1438. }
  1439. #if (CIDBG == 1)
  1440. //
  1441. // Debug methods
  1442. //
  1443. void CNFA::Display()
  1444. {
  1445. vqDebugOut(( DEB_REGEX, "NFA contains %d states.\n", _iNextState-1 ));
  1446. for ( unsigned i = 1; i < _iNextState; i++ )
  1447. {
  1448. Get(i)->Display();
  1449. vqDebugOut(( DEB_REGEX | DEB_NOCOMPNAME, "\n" ));
  1450. }
  1451. }
  1452. void CDFA::ValidateStateTransitions()
  1453. {
  1454. //
  1455. // Valid states are numbers < _cState, plus a few special states.
  1456. //
  1457. for ( int i = _cState * (_nfa.Translate().NumClasses() + 1);
  1458. i >= 0;
  1459. i-- )
  1460. {
  1461. if ( _xStateTrans[i] > _cState &&
  1462. _xStateTrans[i] != stateUncomputed &&
  1463. _xStateTrans[i] != stateUninitialized &&
  1464. _xStateTrans[i] != stateUndefined )
  1465. {
  1466. vqDebugOut(( DEB_ERROR, "Bogus state 0x%x in DFA. pDFA = 0x%x\n",
  1467. _xStateTrans[i], this ));
  1468. Win4Assert( !"Bogus state in DFA" );
  1469. }
  1470. }
  1471. }
  1472. #endif // (CIDBG == 1)