//+------------------------------------------------------------------------- // // Microsoft Windows // Copyright (C) Microsoft Corporation, 1991 - 2001. // // File: FA.cxx // // Contents: Non-deterministic finite automata // // Classes: CNFA // // History: 01-20-92 KyleP Created // //-------------------------------------------------------------------------- #include #pragma hdrstop #pragma optimize( "", off ) #include #include #include #include "stateset.hxx" //+------------------------------------------------------------------------- // // Member: CFA::CFA, public // // Synopsis: Copy constructor // // History: 13-Jul-95 KyleP Created // //-------------------------------------------------------------------------- CFA::CFA( CFA const & src ) : _cTotal( src._cTotal ), _ppState( 0 ) { _ppState = new CFAState * [ _cTotal ]; unsigned i = 0; TRY { for ( ; i < _cTotal; i++ ) { if ( 0 == src._ppState[i] ) _ppState[i] = 0; else _ppState[i] = new CFAState( *src._ppState[i] ); } } CATCH( CException, e ) { for ( ;i > 0; i-- ) delete _ppState[i-1]; delete _ppState; RETHROW(); } END_CATCH } //+------------------------------------------------------------------------- // // Member: CFA::~CFA, protected // // Synopsis: Frees automata. // // History: 20-Jan-92 KyleP Created // //-------------------------------------------------------------------------- CFA::~CFA() { if( _ppState ) { for ( unsigned i = 0; i < _cTotal; i++ ) { delete _ppState[i]; } delete _ppState; } } //+------------------------------------------------------------------------- // // Member: CFA::Add, protected // // Synopsis: Adds new state to automata. // // Arguments: [pState] -- New state. State number is member data. // // History: 20-Jan-92 KyleP Created // //-------------------------------------------------------------------------- void CFA::Add( CFAState * pState ) { if ( pState->StateNumber() > _cTotal ) { for( unsigned newTotal = (_cTotal) ? _cTotal * 2 : 1; pState->StateNumber() > newTotal; newTotal *= 2 ); CFAState ** oldState = _ppState; _ppState = new CFAState * [ newTotal ]; memcpy( _ppState, oldState, _cTotal * sizeof( CFAState * ) ); memset( _ppState + _cTotal, 0, (newTotal - _cTotal) * sizeof( CFAState * ) ); _cTotal = newTotal; } _ppState[pState->StateNumber() - 1] = pState; } //+------------------------------------------------------------------------- // // Member: CFA::Get, protected // // Arguments: [iState] -- State to fetch. // // Returns: State [iState]. // // History: 20-Jan-92 KyleP Created // //-------------------------------------------------------------------------- CFAState * CFA::Get( unsigned iState ) { vqAssert( iState <= _cTotal ); { # if (CIDBG == 1) if ( _ppState[ iState - 1 ]->StateNumber() != iState ) vqDebugOut(( DEB_ERROR, "CFA::Get() -- Error\n" )); # endif // (CIDBG == 1) return( _ppState[ iState - 1 ] ); } } //+------------------------------------------------------------------------- // // Member: CNFA::CNFA, public // // Synopsis: Converts regular expression string to NFA. // // Arguments: [pwcs] -- Regular expression. // [fCaseSens] -- TRUE if case sensitive search. // // History: 20-Jan-92 Kyleap Created // //-------------------------------------------------------------------------- CNFA::CNFA( WCHAR const * pwcs, BOOLEAN fCaseSens ) : _iNextState( 1 ), _iStart( 0 ), _chars( fCaseSens ) { unsigned iEnd; // // _aState initially contains room for 2 * #chars in regex. According // to the Dragon Book pg. 121 this is guaranteed to be sufficient space. // Of course the dragon book doesn't completely take DOS or CMS into // account. For DOS, we need to treat beginning (and end) of line as // 'characters' in the string. For CMS, I agreed to support the // {m,n} construct, which clearly violates this rule. // if ( 0 == pwcs ) { vqDebugOut(( DEB_ERROR, "ERROR: regex string value of 0 " )); THROW( CException( QUERY_E_INVALIDRESTRICTION ) ); } unsigned cState = wcslen( pwcs ) * 2 + 2*2; // 2*2 for beginning & end of line _aState.Init( cState ); for ( unsigned i = 1 ; i <= _aState.Count(); i++ ) Get(i)->Init(i); FindCharClasses( pwcs ); Parse( pwcs, &_iStart, &iEnd ); Get( iEnd )->MakeFinal(); } //+------------------------------------------------------------------------- // // Member: CNFA::CNFA, public // // Synopsis: Copy constructor // // Arguments: [src] -- Source // // History: 13-Jul-95 Kylep Created // //-------------------------------------------------------------------------- CNFA::CNFA( CNFA const & src ) : _iNextState( src.NumStates() ), _iStart( src._iStart ), _chars( src._chars ), _aState( src._aState.Count() ) { for ( unsigned i = 0; i < _aState.Count(); i++ ) _aState[i] = src._aState[i]; } //+------------------------------------------------------------------------- // // Member: CNFA::~CNFA, public // // Synopsis: Free state table. // // History: 13-Oct-92 KyleP Created // //-------------------------------------------------------------------------- CNFA::~CNFA() { } //+------------------------------------------------------------------------- // // Member: CNFA::EpsClosure, public // // Synopsis: Computes the epsilon closure for state [StateNum] // // Effects: States in the epsilon closure of state [StateNum] // are added to the state set [ssOut]. // // Arguments: [StateNum] -- Initial state. // [ssOut] -- Output state set. // // History: 20-Jan-92 KyleP Created // //-------------------------------------------------------------------------- void CNFA::EpsClosure( unsigned StateNum, CStateSet & ssOut ) { CStateSet ssTraversed; ssOut.Add( StateNum ); BOOLEAN changed = TRUE; while ( changed ) { changed = FALSE; for ( unsigned i = ssOut.Count(); i > 0; i-- ) { if ( !ssTraversed.IsMember( ssOut.State( i ) ) ) { ssTraversed.Add( ssOut.State( i ) ); Get( ssOut.State( i ) )->Move( ssOut, symEpsilon ); changed = TRUE; } } } } //+------------------------------------------------------------------------- // // Member: CNFA::EpsClosure, public // // Synopsis: Computes the epsilon closure for state set [ssIn] // // Effects: States in the epsilon closure of [ssIn] // are added to the state set [ssOut]. // // Arguments: [ssIn] -- Initial state set. // [ssOut] -- Output state set. // // History: 20-Jan-92 KyleP Created // //-------------------------------------------------------------------------- void CNFA::EpsClosure( CStateSet & ssIn, CStateSet & ssOut ) { for ( unsigned i = ssIn.Count(); i > 0; i-- ) { EpsClosure( ssIn.State( i ), ssOut ); } } //+------------------------------------------------------------------------- // // Member: CDFA::IsFinal, public // // Arguments: [ss] -- State set // // Returns: TRUE if some state in [ss] is final. // // History: 20-Jan-92 Kyleap Created // //-------------------------------------------------------------------------- BOOLEAN CNFA::IsFinal( CStateSet & ss ) { BOOLEAN fFinal = FALSE; for ( unsigned i = ss.Count(); i > 0 && !fFinal; i-- ) { fFinal = (BYTE)(Get( ss.State( i ) )->IsFinal()); } return( fFinal ); } //+------------------------------------------------------------------------- // // Member: CNFA::Move, public // // Effects: Performs a non-deterministic move from every state // in [ssIn] on [symbol]. The new state set is in // [ssOut]. // // Arguments: [ssIn] -- Initial state set. // [ssOut] -- Final state set. // [symbol] -- Transition symbol. // // History: 20-Jan-92 KyleP Created // //-------------------------------------------------------------------------- void CNFA::Move( CStateSet & ssIn, CStateSet & ssOut, unsigned symbol ) { for ( unsigned i = ssIn.Count(); i > 0; i-- ) { Get( ssIn.State( i ) )->Move( ssOut, symbol ); } } //+------------------------------------------------------------------------- // // Member: CNFA::FindCharClasses, private // // Effects: Partitions the UniCode character space (2^16 characters) // into equivalence classes such that all characters in // a given class will have identical transitions in the NFA. // // Arguments: [wcs] -- Original regular expression string. // // History: 20-Jan-92 KyleP Created // // Notes: If case sensitivity is turned off, two ranges will be // added for characters with upper/lower case. Even though // both ranges react identically the mapping algorithm can // only deal with contiguous ranges of characters. // //-------------------------------------------------------------------------- void CNFA::FindCharClasses( WCHAR const * wcs ) { // // Scan the regex looking for characters with (potentially) // different transitions. // while ( *wcs ) { switch ( *wcs ) { case wcAnySingle: case wcAnyMultiple: case wcDOSDot: break; case wcEscape: { wcs++; switch ( *wcs ) { case 0: vqDebugOut(( DEB_WARN, "Invalid regex (%wc at end of string\n", wcEscape )); THROW( CException( QUERY_E_INVALIDRESTRICTION ) ); break; case wcAnySingle: case wcRepeatZero: case wcRepeatOne: case wcOr: case wcBeginParen: case wcEndParen: break; case wcBeginRepeat: for ( wcs++; *wcs; wcs++ ) { if ( *wcs == wcEscape && *(wcs+1) == wcEndRepeat ) { wcs++; break; } } break; case wcBeginRange: wcs++; // // Check the special cases of ^ and ] // if ( *wcs == wcInvertRange ) wcs++; if ( *wcs == wcEndRange ) { _chars.AddRange( *wcs, *wcs ); wcs++; } for ( ; *wcs && *wcs != wcEndRange; wcs++ ) { if ( *(wcs + 1) == wcRangeSep ) { _chars.AddRange( *wcs, *(wcs+2) ); } else { _chars.AddRange( *wcs, *wcs ); } } if ( *wcs != wcEndRange ) { vqDebugOut(( DEB_WARN, "Invalid regex. Missing %wc\n", wcEndRange )); THROW( CException( QUERY_E_INVALIDRESTRICTION ) ); } break; default: _chars.AddRange( *wcs, *wcs ); break; } break; } default: _chars.AddRange( *wcs, *wcs ); break; } wcs++; } _chars.Prepare(); } WCHAR * CNFA::_wcsNull = (WCHAR*)""; //+------------------------------------------------------------------------- // // Member: CNFA::Parse, private // // Synopsis: Creates a NFA from [wcs] // // Effects: Parses [wcs] until end of string or character wcHalt is // encountered. On exit, [iStart] and [iEnd] contain the // starting and ending states of the NFA, respectively. // [pwcsEnd] points to the last character of [wcs] that was // parsed. // // Arguments: [wcs] -- Regular expression. // [iStart] -- Starting state of NFA. // [iEnd] -- Ending state of NFA // [pwcsEnd] -- Last character of [wcs] that was parsed. // [wcHalt] -- Stop parsing if this character encountered. // // History: 20-Jan-92 KyleP Created // //-------------------------------------------------------------------------- void CNFA::Parse( WCHAR const * wcs, unsigned * iStart, unsigned * iEnd, WCHAR const * * pwcsEnd, WCHAR wcHalt ) { unsigned iCurrent; unsigned iNext; unsigned iLocalStart; // Used for */+/? repositioning BOOLEAN fRepeat = FALSE; // Used for + BOOLEAN fTopLevel = (*iStart == 0); // TRUE if at top level; *iEnd = 0; // // Get a starting state. *iStart == 0 implies this is the 'top-level' // parse of the regular expression (e.g. we're not parsing a // parenthesized subexpression. // if ( fTopLevel ) { iCurrent = _iNextState; *iStart = _iNextState++; iLocalStart = 0; // // non-EGREP (DOS) regex match entire string. // if ( *wcs != wcAnyMultiple ) { iNext = _iNextState; Get( iCurrent )->AddTransition( symBeginLine, _iNextState ); _iNextState++; iCurrent = iNext; } else { // // Add a 'special' transition on the very first state to // eat up characters until we actually jump into the // regular expresion. // Get( iCurrent )->AddTransition( symAny, Get( iCurrent )->StateNumber() ); } } else { iCurrent = *iStart; iLocalStart = *iStart; } unsigned iOrStart = Get( iCurrent )->StateNumber(); // // Original start of string. // WCHAR const * wcsBeginning = wcs; // // wcsLocalStart tracks the piece of string to be repeated for wcZeroOrOne, etc. // WCHAR const * wcsLocalStart = wcs; // // Parse the regular expression until there is no more or a // termination character is hit. // for ( ; *wcs && *wcs != wcHalt; wcs++ ) { switch ( *wcs ) { case wcAnySingle: iNext = _iNextState; Get( iCurrent )->AddTransition( symAny, _iNextState ); iLocalStart = Get( iCurrent )->StateNumber(); wcsLocalStart = wcs; _iNextState++; iCurrent = iNext; break; case wcAnyMultiple: // // Any single // iNext = _iNextState; Get( iCurrent )->AddTransition( symAny, _iNextState ); iLocalStart = Get( iCurrent )->StateNumber(); wcsLocalStart = wcs; _iNextState++; iCurrent = iNext; // // Repeat zero or more // Get( iLocalStart )->AddTransition( symEpsilon, Get( iCurrent )->StateNumber() ); Get( iCurrent )->AddTransition( symEpsilon, iLocalStart ); break; case wcEscape: { wcs++; switch ( *wcs ) { case wcBeginParen: { unsigned iLocalEnd; iLocalStart = Get( iCurrent )->StateNumber(); wcsLocalStart = wcs - 1; wcs++; // Eat '('. Parse( wcs, &iLocalStart, &iLocalEnd, &wcs, wcEndParen ); wcs--; // Provide character for loop to eat. iCurrent = iLocalEnd; break; } case wcEndParen: // // Taken care of at outer level. Just backup so we hit the end. // wcs--; break; case wcBeginRepeat: { if ( wcHalt == wcBeginRepeat ) { // // Taken care of at outer level. Just backup so we hit the end. // wcs--; } else { // // Setup: Bounds of repeated regex // WCHAR const * wcsStartRepeat = wcsLocalStart; WCHAR const * wcsEndRepeat = wcs + 1; // // Setup: Repeat parameters. // unsigned cRepeat1, cRepeat2; wcs++; ParseRepeat( wcs, cRepeat1, cRepeat2 ); unsigned iLocalEnd; // // The minimum set has no epsilon transitions. // if ( cRepeat1 > 1 ) { iLocalStart = Get( iCurrent )->StateNumber(); iLocalEnd = iLocalStart; for ( unsigned i = 1; i < cRepeat1; i++ ) { WCHAR const * wcsEnd; iLocalStart = iLocalEnd; iLocalEnd = 0; // Must be zero! Parse( wcsLocalStart, &iLocalStart, &iLocalEnd, &wcsEnd, wcBeginRepeat ); if ( wcsEnd != wcsEndRepeat ) { vqDebugOut(( DEB_ERROR, "Invalid regex: Nested repeats?\n" )); THROW( CException( QUERY_E_INVALIDRESTRICTION ) ); } } } else iLocalEnd = Get( iCurrent )->StateNumber(); if ( cRepeat1 == cRepeat2 ) { vqDebugOut(( DEB_REGEX, "REPEAT: Exactly %u times\n", cRepeat1 )); } else if ( cRepeat2 == 0 ) { vqDebugOut(( DEB_REGEX, "REPEAT: At least %u times\n", cRepeat1 )); Get( iLocalEnd )->AddTransition( symEpsilon, iLocalStart ); } else if ( cRepeat2 > cRepeat1 ) { for ( unsigned i = cRepeat1; i < cRepeat2; i++ ) { WCHAR const * wcsEnd; iLocalStart = iLocalEnd; iLocalEnd = 0; // Must be zero! Parse( wcsLocalStart, &iLocalStart, &iLocalEnd, &wcsEnd, wcBeginRepeat ); Get( iLocalStart )->AddTransition( symEpsilon, iLocalEnd ); if ( wcsEnd != wcsEndRepeat ) { vqDebugOut(( DEB_ERROR, "Invalid regex: Nested repeats?\n" )); THROW( CException( QUERY_E_INVALIDRESTRICTION ) ); } } } else { vqDebugOut(( DEB_ERROR, "Invalid regex: End repeat count %d < start %d\n", cRepeat2, cRepeat1 )); THROW( CException( QUERY_E_INVALIDRESTRICTION ) ); } iCurrent = iLocalEnd; iLocalStart = 0; wcsLocalStart = _wcsNull; } break; } case wcOr: if ( *iEnd == 0 ) { // // First part of OR clause. // if ( fTopLevel ) { iNext = _iNextState; Get( iCurrent )->AddTransition( symEndLine, _iNextState ); _iNextState++; iCurrent = iNext; } *iEnd = Get( iCurrent )->StateNumber(); } else { // // Subsequent OR clause. Epsilon link to end // Get( iCurrent )->AddTransition( symEpsilon, *iEnd ); } iCurrent = iOrStart; wcsLocalStart = _wcsNull; iLocalStart = 0; break; case wcBeginRange: { BOOLEAN fReverse = FALSE; vqDebugOut(( DEB_REGEX, "RANGE\n" )); wcsLocalStart = wcs-1; iNext = _iNextState; wcs++; // Eat '['. ']' eaten by loop. // // Check the special cases of ^ and ] // if ( *wcs == wcInvertRange ) { wcs++; fReverse = TRUE; // // Add all transitions, they will be removed later. // for ( unsigned uiNext = _chars.TranslateRange( 1, wcLastValidChar ); uiNext != 0; uiNext = _chars.TranslateRange( 0, wcLastValidChar ) ) { Get( iCurrent )->AddTransition( uiNext, _iNextState ); } } if ( *wcs == wcEndRange ) { if ( fReverse ) { Get( iCurrent )->RemoveTransition( _chars.Translate( *wcs++ ), _iNextState ); } else { Get( iCurrent )->AddTransition( _chars.Translate( *wcs++ ), _iNextState ); } } for ( ; *wcs && *wcs != wcEndRange; wcs++ ) { if ( *(wcs + 1) == wcRangeSep ) { vqDebugOut(( DEB_REGEX, "Range %u to %u\n", *wcs, *(wcs+2) )); for ( unsigned uiNext = _chars.TranslateRange( *wcs, *(wcs+2) ); uiNext != 0; uiNext = _chars.TranslateRange( 0, *(wcs+2) ) ) { if ( fReverse ) { Get( iCurrent )->RemoveTransition( uiNext, _iNextState ); } else { Get( iCurrent )->AddTransition( uiNext, _iNextState ); } } wcs += 2; } else { vqDebugOut(( DEB_REGEX, "Singleton = %u\n", *wcs )); if ( fReverse ) { Get( iCurrent )->RemoveTransition( _chars.Translate( *wcs ), _iNextState ); } else { Get( iCurrent )->AddTransition( _chars.Translate( *wcs ), _iNextState ); } } } if ( *wcs != wcEndRange ) { vqDebugOut(( DEB_WARN, "Invalid regex. Missing %wc\n", wcEndRange )); THROW( CException( QUERY_E_INVALIDRESTRICTION ) ); } iLocalStart = Get( iCurrent )->StateNumber(); _iNextState++; iCurrent = iNext; break; } case wcRepeatOne: if ( iLocalStart == 0 ) { vqDebugOut(( DEB_ERROR, "Invalid regex. Nothing to repeat\n" )); THROW( CException( QUERY_E_INVALIDRESTRICTION ) ); } Get( iCurrent )->AddTransition( symEpsilon, iLocalStart ); iNext = _iNextState; Get( iCurrent )->AddTransition( symEpsilon, _iNextState ); wcsLocalStart = wcs - 1; _iNextState++; iCurrent = iNext; break; case wcRepeatZero: if ( iLocalStart == 0 ) { vqDebugOut(( DEB_ERROR, "Invalid regex. Nothing to repeat.\n" )); THROW( CException( QUERY_E_INVALIDRESTRICTION ) ); } Get( iLocalStart )->AddTransition( symEpsilon, Get( iCurrent )->StateNumber() ); Get( iCurrent )->AddTransition( symEpsilon, iLocalStart ); iNext = _iNextState; Get( iCurrent )->AddTransition( symEpsilon, _iNextState ); wcsLocalStart = wcs - 1; _iNextState++; iCurrent = iNext; break; case wcRepeatZeroOrOne: { if ( iLocalStart == 0 ) { vqDebugOut(( DEB_ERROR, "Invalid regex. Nothing to repeat.\n" )); THROW( CException( QUERY_E_INVALIDRESTRICTION ) ); } Get( iLocalStart )->AddTransition( symEpsilon, Get( iCurrent )->StateNumber() ); break; } default: iNext = _iNextState; Get( iCurrent )->AddTransition( _chars.Translate( *wcs ), _iNextState ); iLocalStart = Get( iCurrent )->StateNumber(); wcsLocalStart = wcs - 1; _iNextState++; iCurrent = iNext; break; } break; // switch for wcEscape } default: iNext = _iNextState; Get( iCurrent )->AddTransition( _chars.Translate( *wcs ), _iNextState ); // // In non-EGREP (DOS) syntax dot '.' is funny. It will match // a dot, but if you're at the end of string it will also match // end. So *.txt will look for strings with zero or more // characters followed by '.txt' but *. will find any names // without an extension and with no trailing dot. // if ( *wcs == wcDOSDot ) { Get( iCurrent )->AddTransition( symEndLine, _iNextState ); } iLocalStart = Get( iCurrent )->StateNumber(); wcsLocalStart = wcs; _iNextState++; iCurrent = iNext; break; } } // // non-EGREP (DOS) regex match entire string. // if ( wcHalt == 0 && ( ( wcsBeginning+1 <= wcs && *(wcs-1) != wcAnyMultiple ) || ( wcsBeginning+2 <= wcs && *(wcs-2) == wcEscape ) ) ) { iNext = _iNextState; Get( iCurrent )->AddTransition( symEndLine, _iNextState ); iLocalStart = 0; wcsLocalStart = _wcsNull; _iNextState++; iCurrent = iNext; } // // If we haven't had an OR clause yet, then set iEnd // if ( *iEnd == 0 ) { // // First part of OR clause. // *iEnd = Get( iCurrent )->StateNumber(); } else { // // Subsequent OR clause. Epsilon link to end // Get( iCurrent )->AddTransition( symEpsilon, *iEnd ); } if ( pwcsEnd ) { *pwcsEnd = wcs + 1; // Eat halt character. } if( *wcs != wcHalt ) { vqDebugOut(( DEB_WARN, "Invalid regex. Missing %wc\n", wcHalt )); THROW( CException( QUERY_E_INVALIDRESTRICTION ) ); } } void CNFA::ParseRepeat( WCHAR const * & wcs, unsigned & cRepeat1, unsigned & cRepeat2 ) { cRepeat1 = 0; cRepeat2 = 0; for ( ; *wcs && isdigit(*wcs); wcs++ ) { cRepeat1 *= 10; cRepeat1 += *wcs - '0'; } if ( cRepeat1 == 0 || cRepeat1 > 255 ) { vqDebugOut(( DEB_ERROR, "Invalid regex: Repeat count %d out of bounds.\n", cRepeat1 )); THROW( CException( QUERY_E_INVALIDRESTRICTION ) ); } if ( *wcs == ',' ) { wcs++; if ( *wcs == wcEscape && *(wcs+1) == wcEndRepeat ) { wcs++; } else { for ( ; *wcs && isdigit(*wcs); wcs++ ) { cRepeat2 *= 10; cRepeat2 += *wcs - '0'; } if ( cRepeat2 == 0 || cRepeat2 > 255 ) { vqDebugOut(( DEB_ERROR, "Invalid regex: Repeat count %d too big.\n", cRepeat2 )); THROW( CException( QUERY_E_INVALIDRESTRICTION ) ); } if ( *wcs != wcEscape || *(wcs+1) != wcEndRepeat ) { vqDebugOut(( DEB_ERROR, "Invalid regex: No end to repeat specification.\n" )); THROW( CException( QUERY_E_INVALIDRESTRICTION ) ); } else { wcs++; } } } else if ( *wcs == wcEscape && *(wcs+1) == wcEndRepeat ) { wcs++; cRepeat2 = cRepeat1; } else { vqDebugOut(( DEB_ERROR, "Invalid regex: No end to repeat specification.\n" )); THROW( CException( QUERY_E_INVALIDRESTRICTION ) ); } } //+------------------------------------------------------------------------- // // Member: CDFA::CDFA, public // // Synopsis: Constructs a DFA from a NFA. // // Arguments: [pwcs] -- Regular expression (passed to NFA) // [timeLimit] -- Execution time limit // [fCaseSens] -- TRUE if case-sensitive search // // History: 20-Jan-92 KyleP Created // //-------------------------------------------------------------------------- CDFA::CDFA( WCHAR const * pwcs, CTimeLimit & timeLimit, BOOLEAN fCaseSens ) : _nfa( pwcs, fCaseSens ), _xs( _nfa.NumStates() ), _cState( _nfa.NumStates() ), _timeLimit( timeLimit ) { CommonCtor(); } //+------------------------------------------------------------------------- // // Member: CDFA::CDFA, public // // Synopsis: Copy constructor // // Arguments: [pwcs] -- Regular expression (passed to NFA) // [fCaseSens] -- TRUE if case-sensitive search // // History: 20-Jan-92 KyleP Created // //-------------------------------------------------------------------------- CDFA::CDFA( CDFA const & src ) : _nfa( src._nfa ), _xs( src._nfa.NumStates() ), _cState( src._nfa.NumStates() ), _timeLimit( (CTimeLimit &) src._timeLimit ) { CommonCtor(); } //+------------------------------------------------------------------------- // // Member: CDFA::CommonCtor, private // // Synopsis: Code common to both constructors. // // History: 13-Jul-95 KyleP Snarfed from constructor // //-------------------------------------------------------------------------- void CDFA::CommonCtor() { // // Add initial state. // CStateSet ss; _nfa.EpsClosure( _nfa.StartState(), ss ); _stateStart = _xs.XlatToOne( ss ); // // Intialize translation table. // int cEntries = (_cState + 1) * ( _nfa.Translate().NumClasses() + 1 ); _xStateTrans.Init( cEntries ); _xStateFinal.Init( _cState + 1 ); Win4Assert( stateUncomputed == 0xFFFFFFFF ); memset( _xStateTrans.GetPointer(), 0xFF, cEntries * sizeof( unsigned ) ); RtlZeroMemory( _xStateFinal.GetPointer(), (_cState + 1) * sizeof( BOOLEAN ) ); for ( int i = _cState; i >= 0; i-- ) { AddTransition( i, 0, stateUndefined ); } Add( _stateStart, _nfa.IsFinal( ss ) ); # if (CIDBG == 1) vqDebugOut(( DEB_REGEX, "Character translation:\n" )); _nfa.Translate().Display(); vqDebugOut(( DEB_REGEX, "NFA:\n" )); _nfa.Display(); vqDebugOut(( DEB_REGEX, "DFA state %u = NFA states ", _stateStart )); ss.Display(); vqDebugOut(( DEB_REGEX | DEB_NOCOMPNAME, "\n" )); vqDebugOut(( DEB_REGEX, "DFA start state = %u\n", _stateStart )); # endif // (CIDBG == 1) } //+------------------------------------------------------------------------- // // Member: CDFA::~CDFA, public // // Synopsis: Clean up DFA. Free state tables. // // History: 20-Jun-92 KyleP Created // //-------------------------------------------------------------------------- CDFA::~CDFA() { } //+------------------------------------------------------------------------- // // Member: CDFA::Recognize, public // // Arguments: [wcs] -- Input string. // // Returns: TRUE if [wcs] is matched by the regular expression. // // History: 20-Jan-92 KyleP Created // //-------------------------------------------------------------------------- BOOLEAN CDFA::Recognize( WCHAR const * wcs ) { # if CIDBG == 1 ValidateStateTransitions(); # endif // CIDBG == 1 unsigned CurrentState = _stateStart; unsigned LastState = CurrentState; BOOLEAN fFinal = IsFinal( CurrentState ); WCHAR wcCurrent = symBeginLine; while ( !fFinal ) { unsigned NextState; { CReadAccess lock( _rwa ); // // Casting is to guarantee this method doesn't modify anything (e.g. read lock ok). // #if CIDBG == 1 NextState = ((CDFA const *)this)->Move( CurrentState, wcCurrent ); #else NextState = Move( CurrentState, wcCurrent ); #endif } vqDebugOut(( DEB_REGEX, "DFA move[ %u, %u ] = %u\n", CurrentState, wcCurrent, NextState )); if ( stateUncomputed == NextState ) { CWriteAccess lock( _rwa ); // // Did someone else get here first? // NextState = Move( CurrentState, wcCurrent ); if ( stateUncomputed != NextState ) continue; // // Build the new state // CStateSet ssCurrent; CStateSet ssNew; CStateSet ssClosed; _xs.XlatToMany( CurrentState, ssCurrent ); # if (CIDBG == 1) vqDebugOut(( DEB_REGEX, "DFA state %u = NFA states ", CurrentState )); ssCurrent.Display(); if ( _nfa.IsFinal( ssCurrent ) ) { vqDebugOut(( DEB_REGEX | DEB_NOCOMPNAME, " FINAL" )); } vqDebugOut(( DEB_REGEX | DEB_NOCOMPNAME, "\n" )); # endif // (CIDBG == 1) _nfa.Move( ssCurrent, ssNew, wcCurrent ); if ( ssNew.Count() == 0 ) { NextState = stateUndefined; AddTransition( CurrentState, wcCurrent, NextState ); vqDebugOut(( DEB_REGEX, "Undefined transition from %u on %u\n", CurrentState, wcCurrent )); } else { _nfa.EpsClosure( ssNew, ssClosed ); # if (CIDBG == 1) vqDebugOut(( DEB_REGEX, "NFA move FROM " )); ssCurrent.Display(); vqDebugOut(( DEB_REGEX | DEB_NOCOMPNAME, " ON %d TO ", wcCurrent )); ssClosed.Display(); vqDebugOut(( DEB_REGEX | DEB_NOCOMPNAME, "\n" )); # endif // (CIDBG == 1) NextState = _xs.XlatToOne( ssClosed ); if ( !IsComputed( NextState ) ) { Add( NextState, _nfa.IsFinal( ssClosed ) ); } # if (CIDBG == 1) vqDebugOut(( DEB_REGEX, "DFA state %u = NFA states ", NextState )); ssClosed.Display(); vqDebugOut(( DEB_REGEX | DEB_NOCOMPNAME, "\n" )); # endif // (CIDBG == 1) AddTransition( CurrentState, wcCurrent, NextState ); vqDebugOut(( DEB_REGEX, "Adding transition from %u on %u to %u\n", CurrentState, wcCurrent, NextState )); } if ( _timeLimit.CheckExecutionTime() ) { vqDebugOut(( DEB_WARN, "CDFA::Recognize: aborting because execution time limit has been exceeded\n" )); THROW( CException( QUERY_E_TIMEDOUT ) ); } } if ( NextState == stateUndefined ) { return( FALSE ); } // // The following are to find a specific condition detected on // JHavens' machine. // Win4Assert( LastState <= _cState ); Win4Assert( CurrentState <= _cState ); Win4Assert( NextState <= _cState ); LastState = CurrentState; CurrentState = NextState; fFinal = IsFinal( CurrentState ); // // If we ran out of string then just keep going, appending // end-of-string symbols. Unfortunately the string is conceptually // a set of characters followed by an arbitrary number of // end-of-string symbols. In non-EGREP the end-of-string symbol // may actually cause multiple state transitions before reaching // a final state. In non-EGREP (DOS) mode we stop only when we // are no longer 'making progress' (moving to new states) on // end-of-string. I haven't completely convinced myself this // algorithm is guaranteed to terminate. // if ( wcCurrent == symEndLine ) { if ( LastState == CurrentState ) break; } else { wcCurrent = *wcs++; // // After we've exhausted the string, append the special // end-of-line character. // if ( wcCurrent == 0 ) { wcCurrent = symEndLine; } else { vqDebugOut(( DEB_REGEX, "\"%c\" --> ", wcCurrent )); // // Casting is to guarantee this method doesn't modify anything (e.g. read lock ok). // #if CIDBG == 1 wcCurrent = (WCHAR) ((CNFA const *)&_nfa)->Translate().Translate( wcCurrent ); #else wcCurrent = (WCHAR) _nfa.Translate().Translate( wcCurrent ); #endif vqDebugOut(( DEB_REGEX | DEB_NOCOMPNAME, "%u\n", wcCurrent )); } } } # if CIDBG == 1 ValidateStateTransitions(); # endif // CIDBG == 1 return( fFinal ); } //+------------------------------------------------------------------------- // // Member: CDFA::Add, private // // Synopsis: Adds a new state the the DFA. // // Arguments: [state] -- State number // [fFinal] -- TRUE if state is a final state. // // History: 20-Jan-92 KyleP Created // // Notes: All transitions for the new state are initially uncomputed. // //-------------------------------------------------------------------------- void CDFA::Add( unsigned state, BOOLEAN fFinal ) { if ( state > _cState ) { vqDebugOut(( DEB_ITRACE, "Growing DFA state array.\n" )); // // Since the number of states required will probably grow at // a slow rate, increase the size of the array in a linear // fashion. unsigned const DeltaState = 10; XPtrST xOldStateTrans( _xStateTrans.Acquire() ); XPtrST xOldStateFinal( _xStateFinal.Acquire() ); unsigned oldcState = _cState; unsigned oldcEntries = (_cState + 1) * ( _nfa.Translate().NumClasses() + 1 ); _cState += DeltaState; unsigned cEntries = (_cState + 1) * ( _nfa.Translate().NumClasses() + 1 ); _xStateTrans.Init( cEntries ); _xStateFinal.Init( _cState + 1 ); // // Initilize new state tables... // memcpy( _xStateTrans.GetPointer(), xOldStateTrans.GetPointer(), oldcEntries * sizeof( unsigned ) ); memcpy( _xStateFinal.GetPointer(), xOldStateFinal.GetPointer(), oldcState * sizeof( BOOLEAN ) ); Win4Assert( stateUncomputed == 0xFFFFFFFF ); memset( _xStateTrans.GetPointer() + oldcEntries, 0xFF, (cEntries - oldcEntries)*sizeof(unsigned ) ); RtlZeroMemory( _xStateFinal.GetPointer() + oldcState, (_cState + 1 - oldcState)*sizeof(BOOLEAN) ); for ( unsigned i = _cState - DeltaState + 1; i <= _cState; i++ ) { AddTransition( i, 0, stateUndefined ); } } // // All states are set to stateUncomputed above, except the 'undefined' flag-state. // # if CIDBG == 1 for ( int i = _nfa.Translate().NumClasses(); i > 0; i-- ) Win4Assert( Move( state, i ) == stateUncomputed ); # endif AddTransition( state, 0, stateUncomputed ); _xStateFinal[state] = fFinal; } //+--------------------------------------------------------------------------- // // Member: CRegXpr::CRegXpr, public // // Synopsis: Create an expression used to match with a regex. // // Arguments: [prel] -- Property restriction. // [timeLimit] -- Execution time limit // // History: 15-Apr-92 KyleP Created. // //---------------------------------------------------------------------------- CRegXpr::CRegXpr( CInternalPropertyRestriction * prst, CTimeLimit& timeLimit ) : CXpr( CXpr::NTRegex ), _pxpval( prst->Pid() ), _xrstContentHelper( prst->AcquireContentHelper() ), // // Feature decision: Make all regular expressions case insensitive. // _dfa( prst->Value(), timeLimit, FALSE ), _ulCodePage( LocaleToCodepage( GetSystemDefaultLCID() )) { // // Existence of _prstContentHelper implies a fixed starting prefix. // if ( !_xrstContentHelper.IsNull() ) { // // Find fixed prefix, and add it as a view value // unsigned i = wcscspn( prst->Value().GetLPWSTR(), awcSpecialRegex ); if ( i > 0 ) { WCHAR wcs[50]; if ( i > sizeof(wcs)/sizeof(WCHAR) - 2 ) i = sizeof(wcs)/sizeof(WCHAR) - 2; // // If "foo" is the prefix, we want all values from "foo" to "fop", // but I'm going to be lazy. If the trailing letter of the prefix is // 0xFFFF then I just won't set bounds. // if ( prst->Value().GetLPWSTR()[i-1] != 0xFFFF ) { memcpy( wcs, prst->Value().GetLPWSTR(), i*sizeof(WCHAR) ); wcs[i] = 0; _varPrefix.SetLPWSTR( wcs ); } } } } //+--------------------------------------------------------------------------- // // Member: CRegXpr::CRegXpr, public // // Synopsis: Copy constructor // // Arguments: [src] -- Source expression // // History: 13-Jul-95 KyleP Created. // //---------------------------------------------------------------------------- CRegXpr::CRegXpr( CRegXpr const & src ) : CXpr( CXpr::NTRegex ), _pxpval( src._pxpval ), _varPrefix( src._varPrefix ), _dfa( src._dfa ), _ulCodePage( src._ulCodePage ) { if ( !src._xrstContentHelper.IsNull() ) _xrstContentHelper.Set( src._xrstContentHelper->Clone() ); } //+--------------------------------------------------------------------------- // // Member: CRegXpr::Clone, public // // Returns: A copy of this node. // // Derivation: From base class CXpr, Always override in subclasses. // // History: 11-Dec-91 KyleP Created. // //---------------------------------------------------------------------------- CXpr * CRegXpr::Clone() { return new CRegXpr( *this ); } void CRegXpr::SelectIndexing( CIndexStrategy & strategy ) { if ( _pxpval.Pid() == pidPath || _pxpval.Pid() == pidDirectory || _pxpval.Pid() == pidVirtualPath ) { strategy.SetUnknownBounds( _pxpval.Pid() ); return; } if ( _varPrefix.Type() == VT_LPWSTR ) { strategy.SetLowerBound( _pxpval.Pid(), _varPrefix ); WCHAR * wcs = (WCHAR *)_varPrefix.GetLPWSTR(); unsigned cc = wcslen( wcs ); Win4Assert( wcs[cc-1] != 0xFFFF ); wcs[cc-1] = wcs[cc-1] + 1; strategy.SetUpperBound( _pxpval.Pid(), _varPrefix, TRUE ); } if ( !_xrstContentHelper.IsNull() ) { strategy.SetContentHelper( _xrstContentHelper.GetPointer() ); _xrstContentHelper.Acquire(); } } //+--------------------------------------------------------------------------- // // Member: CRegXpr::IsMatch, public // // Arguments: [obj] -- The objects table. [obj] is already positioned // to the record to test. // // Returns: TRUE if the current record satisfies the regex. // // History: 15-Apr-92 KyleP Created. // //---------------------------------------------------------------------------- BOOL CRegXpr::IsMatch( CRetriever & obj ) { // Make this big enough for most paths const cbGuess = ( MAX_PATH * sizeof WCHAR ) + sizeof PROPVARIANT; XGrowable xBuffer; PROPVARIANT * ppv = (PROPVARIANT *) xBuffer.Get(); ULONG cb = xBuffer.SizeOf(); GetValueResult rc = _pxpval.GetValue( obj, ppv, &cb ); // // If the object is too big for the stack then allocate heap (sigh). // if ( rc == GVRNotEnoughSpace ) { xBuffer.SetSize( cb ); ppv = (PROPVARIANT *) xBuffer.Get(); rc = _pxpval.GetValue( obj, ppv, &cb ); } if ( rc != GVRSuccess ) return FALSE; // MAX_PATH here is just a heuristic XGrowable xConvert; // // Cast LPSTR to LPWSTR // if ( ppv->vt == VT_LPSTR ) { cb = strlen( ppv->pszVal ); ULONG cwcOut = cb + cb / 4 + 1; xConvert.SetSize( cwcOut ); ULONG cwcActual = 0; do { cwcActual = MultiByteToWideChar( _ulCodePage, 0, ppv->pszVal, cb + 1, xConvert.Get(), cwcOut ); if ( cwcActual == 0 ) { if ( GetLastError() == ERROR_INSUFFICIENT_BUFFER ) { cwcOut *= 2; xConvert.SetSize( cwcOut ); } else THROW( CException() ); } } while ( 0 == cwcActual ); ppv->vt = VT_LPWSTR; ppv->pwszVal = xConvert.Get(); } else if ( ppv->vt == VT_LPWSTR || ppv->vt == VT_BSTR ) { // // Normalize to precomposed Unicode // ULONG cwcIn; WCHAR *pwcIn; if ( ppv->vt == VT_LPWSTR ) { pwcIn = ppv->pwszVal; cwcIn = wcslen(pwcIn) + 1; } else // ppv->vt == VT_BSTR { pwcIn = ppv->bstrVal; cwcIn = SysStringLen( pwcIn ) + 1; } xConvert.SetSize( cwcIn ); ULONG cwcFolded = FoldStringW( MAP_PRECOMPOSED, pwcIn, cwcIn, xConvert.Get(), cwcIn ); if ( cwcFolded == 0 ) { Win4Assert( GetLastError() != ERROR_INSUFFICIENT_BUFFER ); THROW( CException() ); } ppv->vt = VT_LPWSTR; ppv->pwszVal = xConvert.Get(); } // // But any other types are illegal // if ( ppv->vt != VT_LPWSTR ) { vqDebugOut(( DEB_ITRACE, "CRegXpr::IsMatch -- Type mismatch. Got 0x%x\n", ppv->vt )); return FALSE; } return _dfa.Recognize( ppv->pwszVal ); } #if (CIDBG == 1) // // Debug methods // void CNFA::Display() { vqDebugOut(( DEB_REGEX, "NFA contains %d states.\n", _iNextState-1 )); for ( unsigned i = 1; i < _iNextState; i++ ) { Get(i)->Display(); vqDebugOut(( DEB_REGEX | DEB_NOCOMPNAME, "\n" )); } } void CDFA::ValidateStateTransitions() { // // Valid states are numbers < _cState, plus a few special states. // for ( int i = _cState * (_nfa.Translate().NumClasses() + 1); i >= 0; i-- ) { if ( _xStateTrans[i] > _cState && _xStateTrans[i] != stateUncomputed && _xStateTrans[i] != stateUninitialized && _xStateTrans[i] != stateUndefined ) { vqDebugOut(( DEB_ERROR, "Bogus state 0x%x in DFA. pDFA = 0x%x\n", _xStateTrans[i], this )); Win4Assert( !"Bogus state in DFA" ); } } } #endif // (CIDBG == 1)