%{
//+---------------------------------------------------------------------------
//
//  Microsoft Windows
//  Copyright (C) Microsoft Corporation, 1997 - 2000.
//
//  File:       parser.l
//
//  Contents:   Lex rules for parser
//
//  Notes:      Written for flex version 2.5.4
//
//  History:    10-01-97    emilyb      created
//
//----------------------------------------------------------------------------

class CValueParser;

#include "yybase.hxx"
#include "parser.h"
#include "parsepl.h"
#include "flexcpp.h"

// Return a bare token id from the current rule action.
#define TOKEN(tknNum) return (tknNum);

// Return a token that carries a string value, unless the matched text is
// nothing but blanks/tabs, in which case the action simply falls through.
#define STRING_VALUE(tknNum, fLong, fQuote)                         \
{                                                                   \
    if (!IsTokenEmpty())                                            \
        return CreateTknValue(yylval, tknNum, fLong, fQuote);       \
}

/*
** Make Lex read from a block of data
**    buffer is the character buffer,
**    result is a variable to store the number of chars read
**    ms is the size of the buffer
*/
#undef YY_INPUT
#define YY_INPUT(b, r, ms) (r = yybufferinput(b, ms))

DECLARE_INFOLEVEL(yacc)

//+---------------------------------------------------------------------------
//
//  Function:   YYLEXER::IsTokenEmpty
//
//  Synopsis:   Reports whether the current token (yytext) holds no content,
//              i.e. it is zero-length or made up solely of blanks and tabs.
//
//  Arguments:  None.
//
//  Returns:    TRUE if every character is L' ' or L'\t' (or there are no
//              characters at all), FALSE otherwise.
//
//  History:    08-APR-98   KrishnaN    created
//
//----------------------------------------------------------------------------

BOOL YYLEXER::IsTokenEmpty()
{
    Win4Assert(yytext);

    for (LPWSTR pwcCur = yytext; L'\0' != *pwcCur; pwcCur++)
    {
        // The first character that is neither a blank nor a tab means the
        // token has real content.
        if (L' ' != *pwcCur && L'\t' != *pwcCur)
            return FALSE;
    }

    return TRUE;
}

//+---------------------------------------------------------------------------
//
//  Function:   YYLEXER::IsNotOperator
//
//  Synopsis:   Determines if we have a not operator.
//
//  Arguments:  None.
//
//  Returns:    Boolean value.
//
//  History:    08-DEC-98   KrishnaN    created
//
//----------------------------------------------------------------------------

BOOL YYLEXER::IsNotOperator()
{
    LPWSTR pwsz = yytext;
    Win4Assert(pwsz);

    // Skip past leading blanks and tabs, counting them so that the length
    // check below is made against what is left of the token.
    int cLeading = 0;
    while (L' ' == *pwsz || L'\t' == *pwsz)
    {
        pwsz++;
        cLeading++;
    }

    // If we don't have at least four chars to consider ("not" plus one mode
    // specifier character), we don't have a not operator.
    if (yyleng < cLeading + 4)
        return FALSE;

    // Case-insensitive "not" immediately followed by @, # or $.
    if ( (pwsz[0] == L'n' || pwsz[0] == L'N') &&
         (pwsz[1] == L'o' || pwsz[1] == L'O') &&
         (pwsz[2] == L't' || pwsz[2] == L'T') &&
         (pwsz[3] == L'@' || pwsz[3] == L'#' || pwsz[3] == L'$') )
        return TRUE;

    return FALSE;
}

//+---------------------------------------------------------------------------
//
//  Function:   SkipPastEqualSign
//
//  Synopsis:   Helper for CreateTknValue. Advances past the first '=' of a
//              longhand token such as "{prop name = value}".
//
//  Arguments:  [pwsz] -- null terminated token text
//
//  Returns:    Pointer to the character following the first '='. If there
//              is no '=' (asserted against), the pointer to the terminating
//              null is returned so that callers never scan past the end of
//              the token.
//
//----------------------------------------------------------------------------

static LPWSTR SkipPastEqualSign( LPWSTR pwsz )
{
    while (L'\0' != *pwsz && L'=' != *pwsz)
        pwsz++;

    Win4Assert(L'=' == *pwsz);

    if (L'=' == *pwsz)
        pwsz++;

    return pwsz;
}

//+---------------------------------------------------------------------------
//
//  Function:   YYLEXER::CreateTknValue
//
//  Synopsis:   Allocs a WCHAR string which is passed to the YACC value stack.
//              The value is carved out of yytext: the longhand wrapper
//              ({phrase}...{/phrase}, {prop name = ...} and so on), the
//              shorthand mode character (@ # $), surrounding blanks and,
//              for most token types, surrounding double quotes are removed
//              first.
//
//  Arguments:  [ppStg]  -- set to pointer to alloc'd memory
//              [tknNum] -- token id
//              [fLong]  -- true if token is in longhand version
//              [fQuote] -- true if token is quoted
//
//  Returns:    Updated token id. _PHRASEORREGEX is resolved to _PHRASE or
//              _REGEX; all other ids are returned unchanged.
//
//  Notes:      yytext is edited in place (null terminators are written over
//              trailing blanks, closing tags and closing quotes).
//              The allocated buffer is also recorded in _allocations.
//
//  History:    10-01-97    emilyb      created
//
//----------------------------------------------------------------------------

short YYLEXER::CreateTknValue(YYSTYPE *ppStg, short tknNum, BOOL fLong, BOOL fQuote )
{
    short retTkn = tknNum;
    LPWSTR pwsz = yytext;

    if (!fQuote)
    {
        // If we see a double quote, consider the string quoted.
        while (L' ' == *pwsz)
            pwsz++;

        if (*pwsz == L'"')
        {
            // strip trailing blanks and check if we see a trailing "
            LPWSTR pLast = pwsz + wcslen(pwsz) - 1;
            while (pLast >= pwsz && L' ' == *pLast )
            {
                *pLast = L'\0';
                pLast--;
            }

            // We don't want to deal with an unpaired double quote, so the
            // closing quote must be a different character than the opening.
            if (*pLast == L'"' && pLast > pwsz )
                fQuote = TRUE;
        }
    }

    // start parsing from the beginning of the string
    pwsz = yytext;

    if (_PHRASEORREGEX == tknNum)
    {
        // A quoted string is always a phrase.
        if (fQuote)
            retTkn = _PHRASE;
        else
            retTkn = DetermineTokenType();
    }

    switch (retTkn)
    {
    case _PHRASE:
        {
            LPWSTR pLast = pwsz + wcslen(pwsz) - 1;

            // if long version, find the phrase
            if (fLong)
            {
                pwsz = pwsz + wcslen(L"{phrase}");

                // Back up from the closing '}' to the '{' of {/phrase}
                pLast = pLast - wcslen(L"{/phrase}") + 1;
                Win4Assert(*pLast == L'{');
                *pLast = L'\0';
            }

            // strip leading and trailing blanks
            while (L' ' == *pwsz)
                pwsz++;
            pLast = pwsz + wcslen(pwsz) - 1;
            while (pLast >= pwsz && L' ' == *pLast )
            {
                *pLast = L'\0';
                pLast--;
            }

            // NOTE: Don't strip double quotes here, they will be stripped later

            yaccDebugOut((DEB_ITRACE, "Phrase %ws in %ws format\n",
                          pwsz, fLong ? L"Long" : L"Short"));
        }
        break;

    case _PROPNAME:
        {
            LPWSTR pLast;

            if (fLong) // looks like: { prop name = "prop name" }
            {
                // find =
                pwsz = SkipPastEqualSign(pwsz);

                // drop the closing }
                pLast = pwsz + wcslen(pwsz) - 1;
                Win4Assert( *pLast == L'}');
                *pLast-- = L'\0';
            }
            else
            {
                // Strip @ or # or $ token
                Win4Assert(*pwsz == L'@' || *pwsz == L'#' || *pwsz == L'$');
                pwsz = pwsz + 1;
            }

            // strip leading and trailing blanks
            while (L' ' == *pwsz)
                pwsz++;
            pLast = pwsz + wcslen(pwsz) - 1;
            while (pLast >= pwsz && L' ' == *pLast )
            {
                *pLast-- = L'\0';
            }

            if (fQuote)
            {
                // Strip quotes: step over the opening one, overwrite the
                // closing one.
                pwsz++;
                *pLast = L'\0';
            }

            yaccDebugOut((DEB_ITRACE, "Propname %ws in %ws format and %ws\n",
                          pwsz,
                          fLong ? L"Long" : L"Short",
                          fQuote ? L"quoted" : L"unquoted"));
        }
        break;

    case _FREETEXT:
        {
            LPWSTR pLast;

            // if long version, find the FREETEXT
            if (fLong)
            {
                pwsz = pwsz + wcslen(L"{freetext}");
                pLast = pwsz + wcslen(pwsz) - 1;

                // Back up from the closing '}' to the '{' of {/freetext}
                pLast = pLast - wcslen(L"{/freetext}") + 1;
                Win4Assert(*pLast == L'{');
                *pLast = L'\0';
            }

            // strip leading and trailing blanks
            while (L' ' == *pwsz)
                pwsz++;
            pLast = pwsz + wcslen(pwsz) - 1;
            while (pLast >= pwsz && L' ' == *pLast )
            {
                *pLast = L'\0';
                pLast--;
            }

            if (fQuote)
            {
                Win4Assert(pLast >= pwsz+1);

                // Strip quotes
                pwsz = pwsz + 1;
                *pLast = L'\0';
            }

            yaccDebugOut((DEB_ITRACE, "Freetext %ws in %ws format\n",
                          pwsz, fLong ? L"Long" : L"Short"));
        }
        break;

    case _REGEX:
        {
            LPWSTR pLast;

            // if long version, find the regex
            if (fLong)
            {
                pwsz = pwsz + wcslen(L"{regex}");

                // From the terminator, back up to the '{' of {/regex}
                pLast = pwsz + wcslen(pwsz);
                pLast = pLast - wcslen(L"{/regex}");
                Win4Assert(*pLast == L'{');
                *pLast = L'\0';
            }

            // strip leading blanks
            while (L' ' == *pwsz)
                pwsz++;

            // If the first char is =, ignore it. We only ignore the first
            // = character. This is backward compatible with Triplish1
            if (L'=' == *pwsz)
                pwsz++;

            // strip leading and trailing blanks
            while (L' ' == *pwsz)
                pwsz++;
            pLast = pwsz + wcslen(pwsz) - 1;
            while (pLast >= pwsz && L' ' == *pLast )
            {
                *pLast = L'\0';
                pLast--;
            }

            // After we strip a leading =, we might have a quoted phrase
            // Check only if fQuote is false.
            // We don't want to deal with an unpaired double quote.
            if (!fQuote && *pwsz == L'"' && *pLast == L'"' && pLast > pwsz )
                fQuote = TRUE;

            if (fQuote)
            {
                Win4Assert(pLast >= pwsz+1);

                // Strip quotes
                pwsz = pwsz + 1;
                *pLast = L'\0';
            }

            yaccDebugOut((DEB_ITRACE, "RegEx %ws in %ws format\n",
                          pwsz, fLong ? L"Long" : L"Short"));
        }
        break;

    case _WEIGHT:
        {
            Assert (fLong);
            Assert(!fQuote);

            if (fLong) // looks like: {weight value = number }
            {
                // find =
                pwsz = SkipPastEqualSign(pwsz);

                // step past leading blanks
                while (L' ' == *pwsz)
                    pwsz++;

                // remove trailing } and blanks
                LPWSTR pLast = pwsz + wcslen(pwsz) - 1;
                Win4Assert(*pLast == L'}');
                *(pLast--) = L'\0';
                while (pLast >= pwsz && L' ' == *pLast )
                {
                    *(pLast--) = L'\0';
                }
            }
        }
        break;

    case _NEARDIST:
        {
            Assert (fLong);
            Assert(!fQuote);

            if (fLong) // looks like: dist = number
            {
                // find =
                pwsz = SkipPastEqualSign(pwsz);

                // step past leading blanks
                while (L' ' == *pwsz)
                    pwsz++;
            }

            // Both arguments are wide strings, hence %ws for each.
            yaccDebugOut((DEB_ITRACE, "NearDist string: %ws in %ws format\n",
                          pwsz, fLong ? L"Long" : L"Short"));
        }
        break;

    case _NEARUNIT:
        {
            Assert (fLong);
            Assert(!fQuote);

            if (fLong) // looks like: unit = blah
            {
                // find =
                pwsz = SkipPastEqualSign(pwsz);

                // step past leading blanks
                while (L' ' == *pwsz)
                    pwsz++;
            }

            // Both arguments are wide strings, hence %ws for each.
            yaccDebugOut((DEB_ITRACE, "NearUnit string: %ws in %ws format\n",
                          pwsz, fLong ? L"Long" : L"Short"));
        }
        break;

    case _VECTORELEMENT:
        {
            // strip leading and trailing blanks
            while (L' ' == *pwsz)
                pwsz++;

            LPWSTR pTemp = pwsz + wcslen(pwsz) - 1;

            if (fLong) // strip trailing ;
            {
                Win4Assert(L';' == *pTemp);
                *pTemp-- = L'\0';
            }

            // Bounds check first so that *pTemp is only read inside the token
            while (pTemp > pwsz && L' ' == *pTemp)
                *pTemp-- = L'\0';

            if (fQuote)
            {
                // Strip quotes
                pwsz = pwsz + 1;
                pwsz[wcslen(pwsz)-1] = L'\0';
            }

            yaccDebugOut((DEB_ITRACE, "VectorElem %ws in %ws format\n",
                          pwsz, fLong ? L"Long" : L"Short"));
        }
        break;

    case _VEMETHOD:
        {
            Assert (fLong);

            LPWSTR pTemp;

            if (fLong) // looks like: {vector rankmethod= blah}
            {
                // find =
                pwsz = SkipPastEqualSign(pwsz);

                // strip trailing }
                pTemp = pwsz + wcslen(pwsz) - 1;
                Win4Assert(L'}' == *pTemp);
                *pTemp-- = L'\0';
            }

            // strip leading and trailing blanks and quotes
            while (L' ' == *pwsz)
                pwsz++;
            pTemp = pwsz + wcslen(pwsz) - 1;

            // Bounds check first so that *pTemp is only read inside the token
            while (pTemp > pwsz && L' ' == *pTemp)
                *pTemp-- = L'\0';

            if (fQuote)
            {
                // Strip quotes
                pwsz = pwsz + 1;
                pwsz[wcslen(pwsz)-1] = L'\0';
            }

            yaccDebugOut((DEB_ITRACE, "VectorMethod %ws in %ws format\n",
                          pwsz, fLong ? L"Long" : L"Short"));
        }
        break;
    }

    // Copy what is left of the token (terminator included) into a fresh
    // buffer, record the buffer in _allocations and hand it to the parser
    // through the value stack.
    size_t cwc = wcslen(pwsz);

    XPtrST<WCHAR> xwszRet(new WCHAR[cwc + 1]);
    _allocations.Add(xwszRet.GetPointer(), _allocations.Count());
    RtlCopyMemory(xwszRet.GetPointer(), pwsz, (cwc + 1) * sizeof(WCHAR));

    (*ppStg).pwszChar = xwszRet.Acquire();

    return retTkn;
}

//+---------------------------------------------------------------------------
//
//  Function:   YYLEXER::DetermineTokenType
//
//  Synopsis:   Determines if we have a regular expression or a regular string.
//              A regular expression is a string that contains atleast one of
//              *, ?, or | characters.
//
//  Returns:    Token id: _REGEX if yytext contains any of | * ?, otherwise
//              _PHRASE
//
//  History:    Jun-05-98   KrishnaN    created
//
//----------------------------------------------------------------------------

short YYLEXER::DetermineTokenType()
{
    // One regex-defining character anywhere in the token is enough.
    for (LPWSTR pwsz = yytext; L'\0' != *pwsz; pwsz++)
    {
        if (L'|' == *pwsz || L'*' == *pwsz || L'?' == *pwsz)
            return _REGEX;
    }

    // None of the regular expression defining characters have been found
    return _PHRASE;
}

//
//
// RULES
//
// Notes: Any characters which are not matched, cause yylexer to throw.
//        We can also throw if E_OUTOFMEMORY.
//        Tokens which need to return more than 1 value (e.g. {near})
//        use start states to return each piece of the value.  The start
//        states also emit a "token end" token so that the parser can
//        check that they are syntactically complete.
//        Lex matches to the longest match in the rules.  If 2 matches
//        are the same, it matches to the 1st match.
%}

%x innear
%x shortgen
%x shortregex
%x mayberegex
%x implicitphrase
%x infreefreetext
%x invector

white           [ \t\n\f\r]+
begin_freetext  \{[fF][rR][eE][eE][tT][eE][xX][tT]\}[ ]*
end_freetext    [ ]*\{\/[fF][rR][eE][eE][tT][eE][xX][tT]\}
begin_phrase    \{[pP][hH][rR][aA][sS][eE]\}[ ]*
end_phrase      [ ]*\{\/[pP][hH][rR][aA][sS][eE]\}
prop            [pP][rR][oO][pP]
propname        {prop}[ ]+[nN][aA][mM][eE][ ]*
contains        [cC][oO][nN][tT][aA][iI][nN][sS]
and             [aA][nN][dD]
or              [oO][rR]
not             [nN][oO][tT]
near            [nN][eE][aA][rR]
vector          [vV][eE][cC][tT][oO][rR]
vecmethod       {vector}[ ]+[rR][aA][nN][kK][mM][eE][tT][hH][oO][dD][ ]*
ve              [vV][eE]
weight          [wW][eE][iI][gG][hH][tT][ ]+[vV][aA][lL][uU][eE][ ]*
coerce          [cC][oO][eE][rR][cC][eE]
generate        [gG][eE][nN][eE][rR][aA][tT][eE]
genmethod       {generate}[ ]+[mM][eE][tT][hH][oO][dD][ ]*
begin_regex     \{[rR][eE][gG][eE][xX]\}[ ]*
end_regex       [ ]*\{\/[rR][eE][gG][eE][xX]\}
dist            [dD][iI][sS][tT][ ]*
unit            [uU][nN][iI][tT][ ]*
word            [wW][oO][rR][dD]
sent            [sS][eE][nN][tT]
par             [pP][aA][rR]
chap            [cC][hH][aA][pP]

%%

{white}         { /* do nothing */ }

\(              {
        fContinueImplicitPhrase = FALSE;
        fContinueRegex = FALSE;
        fContinueMaybeRegex = FALSE;
        TOKEN (_OPEN);
    }

\)              {
        fContinueImplicitPhrase = FALSE;
        fContinueRegex = FALSE;
        fContinueMaybeRegex = FALSE;
        TOKEN (_CLOSE);
    }

%{// ************
 // PROPNAME
 // ************
%}

%{
 // If something was treated as a phrase in Tripolish 1, it should
 // be treated as such even now. That applies here. For e.g. @propname
 // caused the following text to be treated as a phrase. The same should
 // apply to {prop name = propname}
 //
%}

%{// shorthand, quoted
%}
@\"[^"]+\"      {
        // treat value as a phrase
        BEGIN implicitphrase;
        STRING_VALUE(_PROPNAME, FALSE, TRUE);
    }

%{// shorthand, not quoted
%}
@[^" <>=!&|~\^]+    {
        // treat value as a phrase
        BEGIN implicitphrase;
        STRING_VALUE(_PROPNAME, FALSE, FALSE);
    }

%{// shorthand, quoted
%}
$\"[^"]+\"      {
        // treat value as freetext
        BEGIN infreefreetext;
        STRING_VALUE(_PROPNAME, FALSE, TRUE);
    }

%{// shorthand, not quoted
%}
$[^" <>=!&|~\^]+    {
        // treat value as freetext
        BEGIN infreefreetext;
        STRING_VALUE(_PROPNAME, FALSE, FALSE);
    }

%{// longhand, quoted
%}
\{{propname}=[ ]*\"[^"]*\"[ ]*\}    {
        // treat value as a phrase
        BEGIN implicitphrase;
        STRING_VALUE(_PROPNAME, TRUE, TRUE);
    }

%{// longhand, not quoted
%}
\{{propname}=[ ]*[^"} ][^}]*\}      {
        // treat value as a phrase
        BEGIN implicitphrase;
        STRING_VALUE(_PROPNAME, TRUE, FALSE);
    }

%{// closing token
%}
\{\/{prop}\}    { TOKEN (_PROPEND); }

%{// *********
 // OPERATORS
 // *********
 //
 // An operator seen right after an implicit phrase / regex value re-enters
 // the start state recorded in the fContinue* flags, so that the text after
 // the operator is lexed the same way as the text before it.
%}

{contains}[ ]+  {
        if (fContinueImplicitPhrase)
        {
            BEGIN implicitphrase;
            fContinueImplicitPhrase = FALSE;
        }
        else if (fContinueRegex)
        {
            BEGIN shortregex;
            fContinueRegex = FALSE;
        }
        else if (fContinueMaybeRegex)
        {
            BEGIN mayberegex;
            fContinueMaybeRegex = FALSE;
        }
        TOKEN (_CONTAINS);
    }

{and}[ ]+       {
        if (fContinueImplicitPhrase)
        {
            BEGIN implicitphrase;
            fContinueImplicitPhrase = FALSE;
        }
        else if (fContinueRegex)
        {
            BEGIN shortregex;
            fContinueRegex = FALSE;
        }
        else if (fContinueMaybeRegex)
        {
            BEGIN mayberegex;
            fContinueMaybeRegex = FALSE;
        }
        TOKEN (_AND);
    }

{and}\{         {
        yyless(yyleng-1);
        if (fContinueImplicitPhrase)
        {
            BEGIN implicitphrase;
            fContinueImplicitPhrase = FALSE;
        }
        else if (fContinueRegex)
        {
            BEGIN shortregex;
            fContinueRegex = FALSE;
        }
        else if (fContinueMaybeRegex)
        {
            BEGIN mayberegex;
            fContinueMaybeRegex = FALSE;
        }
        TOKEN (_AND);
    }

{or}[ ]+        {
        if (fContinueImplicitPhrase)
        {
            yaccDebugOut(( DEB_ITRACE, "fContinueImplicitPhrase\n" ));
            BEGIN implicitphrase;
            fContinueImplicitPhrase = FALSE;
        }
        else if (fContinueRegex)
        {
            yaccDebugOut(( DEB_ITRACE, "fContinueRegex\n" ));
            BEGIN shortregex;
            fContinueRegex = FALSE;
        }
        else if (fContinueMaybeRegex)
        {
            yaccDebugOut(( DEB_ITRACE, "fContinueMaybeRegex\n" ));
            BEGIN mayberegex;
            fContinueMaybeRegex = FALSE;
        }
        yaccDebugOut(( DEB_ITRACE, "OR TOKEN found !!!\n" ));
        TOKEN (_OR);
    }

{or}\{          {
        yyless(yyleng-1);
        if (fContinueImplicitPhrase)
        {
            yaccDebugOut(( DEB_ITRACE, "OR{ fContinueImplicitPhrase\n" ));
            BEGIN implicitphrase;
            fContinueImplicitPhrase = FALSE;
        }
        else if (fContinueRegex)
        {
            yaccDebugOut(( DEB_ITRACE, "OR{ fContinueRegex\n" ));
            BEGIN shortregex;
            fContinueRegex = FALSE;
        }
        else if (fContinueMaybeRegex)
        {
            yaccDebugOut(( DEB_ITRACE, "OR{ fContinueMaybeRegex\n" ));
            BEGIN mayberegex;
            fContinueMaybeRegex = FALSE;
        }
        yaccDebugOut(( DEB_ITRACE, "OR{ TOKEN found !!!\n" ));
        TOKEN (_OR);
    }

{not}[ ]+       {
        if (fContinueImplicitPhrase)
        {
            BEGIN implicitphrase;
            fContinueImplicitPhrase = FALSE;
        }
        else if (fContinueRegex)
        {
            BEGIN shortregex;
            fContinueRegex = FALSE;
        }
        else if (fContinueMaybeRegex)
        {
            BEGIN mayberegex;
            fContinueMaybeRegex = FALSE;
        }
        TOKEN (_NOT);
    }

{not}\{         {
        yyless(yyleng-1);
        if (fContinueImplicitPhrase)
        {
            BEGIN implicitphrase;
            fContinueImplicitPhrase = FALSE;
        }
        else if (fContinueRegex)
        {
            BEGIN shortregex;
            fContinueRegex = FALSE;
        }
        else if (fContinueMaybeRegex)
        {
            BEGIN mayberegex;
            fContinueMaybeRegex = FALSE;
        }
        TOKEN (_NOT);
    }

&               {
        if (fContinueImplicitPhrase)
        {
            BEGIN implicitphrase;
            fContinueImplicitPhrase = FALSE;
        }
        else if (fContinueRegex)
        {
            BEGIN shortregex;
            fContinueRegex = FALSE;
        }
        else if (fContinueMaybeRegex)
        {
            BEGIN mayberegex;
            fContinueMaybeRegex = FALSE;
        }
        TOKEN (_AND);
    }

\|              {
        if (fContinueImplicitPhrase)
        {
            BEGIN implicitphrase;
            fContinueImplicitPhrase = FALSE;
        }
        else if (fContinueRegex)
        {
            BEGIN shortregex;
            fContinueRegex = FALSE;
        }
        else if (fContinueMaybeRegex)
        {
            BEGIN mayberegex;
            fContinueMaybeRegex = FALSE;
        }
        TOKEN (_OR);
    }

!               {
        if (fContinueImplicitPhrase)
        {
            BEGIN implicitphrase;
            fContinueImplicitPhrase = FALSE;
        }
        else if (fContinueRegex)
        {
            BEGIN shortregex;
            fContinueRegex = FALSE;
        }
        else if (fContinueMaybeRegex)
        {
            BEGIN mayberegex;
            fContinueMaybeRegex = FALSE;
        }
        TOKEN (_NOT);
    }

{near}[ ]+      {
        yaccDebugOut(( DEB_ITRACE, "near[ ]+ _NEAR token, begin implicitphrase\n" ));
        BEGIN implicitphrase;
        TOKEN (_NEAR);
    }

{near}\{        {
        yaccDebugOut(( DEB_ITRACE, "near{ _NEAR token, begin implicitphrase\n" ));
        yyless(yyleng-1);
        BEGIN implicitphrase;
        TOKEN (_NEAR);
    }

~               { BEGIN implicitphrase; TOKEN (_NEAR);}

\<              { TOKEN (_LT);}
\>              { TOKEN (_GT);}
\<\=            { TOKEN (_LTE);}
\>\=            { TOKEN (_GTE);}

\=              {
        if (fContinueMaybeRegex)
        {
            // We are not sure if we are going to find a
            // regular expression or a phrase.
            BEGIN mayberegex;
            fContinueMaybeRegex = FALSE;
        }
        TOKEN (_EQ);
    }

\!\=            { TOKEN (_NE); }
\^a             { TOKEN (_ALLOF); }
\^s             { TOKEN (_SOMEOF); }

\<[ ]*\^s       |
\^s[ ]*\<       { TOKEN (_LTSOME); }
\>[ ]*\^s       |
\^s[ ]*\>       { TOKEN (_GTSOME); }
\<\=[ ]*\^s     |
\^s[ ]*\<\=     { TOKEN (_LTESOME); }
\>\=[ ]*\^s     |
\^s[ ]*\>\=     { TOKEN (_GTESOME); }
\=[ ]*\^s       |
\^s[ ]*\=       { TOKEN (_EQSOME); }
\!\=[ ]*\^s     |
\^s[ ]*\!\=     { TOKEN (_NESOME); }
\^s[ ]*\^a      { TOKEN (_ALLOFSOME); }
\^s[ ]*\^s      { TOKEN (_SOMEOFSOME); }

\<[ ]*\^a       |
\^a[ ]*\<       { TOKEN (_LTALL); }
\>[ ]*\^a       |
\^a[ ]*\>       { TOKEN (_GTALL); }
\<\=[ ]*\^a     |
\^a[ ]*\<\=     { TOKEN (_LTEALL); }
\>\=[ ]*\^a     |
\^a[ ]*\>\=     { TOKEN (_GTEALL); }
\=[ ]*\^a       |
\^a[ ]*\=       { TOKEN (_EQALL); }
\!\=[ ]*\^a     |
\^a[ ]*\!\=     { TOKEN (_NEALL); }
\^a[ ]*\^a      { TOKEN (_ALLOFALL); }
\^a[ ]*\^s      { TOKEN (_SOMEOFALL); }

%{// *************
 // VECTOR SPACE TOKENS
 // *************
%}

\{{vecmethod}=[ ]*\"[^"]*\"[ ]*\}   { STRING_VALUE(_VEMETHOD, TRUE, TRUE); }

\{{vecmethod}=[^}]*\}               { STRING_VALUE(_VEMETHOD, TRUE, FALSE); }

\{{ve}\}        {
        // makes more sense to enter phrase mode
        // rather than freetext mode.
        fContinueImplicitPhrase = TRUE;
        BEGIN implicitphrase;
        TOKEN (_VE);
    }

\{\/{vector}\}  { TOKEN (_VECTOR_END); }

%{// *************
 // longhand NEAR
 // *************
%}

%{// must return both unit and distance, so use start state to pull them out, and
 // return _NEAR_END so parser knows we hit the closing }
%}
\{{near}[ ]     {
        yaccDebugOut(( DEB_ITRACE, "Longhand _NEAR token, begin innear\n" ));
        BEGIN innear;
    }

\{{near}\{      {
        yaccDebugOut(( DEB_ITRACE, "Longhand _NEAR{ token, begin innear\n" ));
        yyless(yyleng-1);
        BEGIN innear;
    }

%{// ************
 // WEIGHT
 // ************
%}
\{{weight}=[ ]*(0|1|0\.[0-9]*|1\.[0]*|\.[0-9]+)[ ]*\}   {
        if (fContinueImplicitPhrase)
        {
            BEGIN implicitphrase;
            fContinueImplicitPhrase = FALSE;
        }
        yaccDebugOut(( DEB_ITRACE, "_WEIGHT TOKEN FOUND!!\n" ));
        STRING_VALUE(_WEIGHT,TRUE,FALSE);
    }

\{{coerce}\}    {
        if (fContinueImplicitPhrase)
        {
            BEGIN implicitphrase;
            fContinueImplicitPhrase = FALSE;
        }
        TOKEN (_COERCE);
    }

%{// ****************
 // longhand GENERATE
 // ****************
%}
\{{genmethod}=[" ]*prefix[" ]*\}    {
        if (fContinueImplicitPhrase)
        {
            BEGIN implicitphrase;
            fContinueImplicitPhrase = FALSE;
        }
        yaccDebugOut((DEB_ITRACE, "Prefix recognized.\n"));
        TOKEN(_GENPREFIX);
    }

\{{genmethod}=[" ]*inflect[" ]*\}   {
        if (fContinueImplicitPhrase)
        {
            BEGIN implicitphrase;
            fContinueImplicitPhrase = FALSE;
        }
        yaccDebugOut((DEB_ITRACE, "Inflect recognized.\n"));
        TOKEN(_GENINFLECT);
    }

\{\/{generate}\}    { TOKEN (_GENNORMAL); }

%{// ****************
 // longhand REGEX
 // ****************
%}
{begin_regex}\"[^"]*\"{end_regex}   { STRING_VALUE(_REGEX,TRUE,TRUE);}

{begin_regex}[^{]*{end_regex}       { STRING_VALUE(_REGEX,TRUE,FALSE);}

{begin_regex}([^{]*\|[()\[{}\],*?+][^{]*)*{end_regex}   { STRING_VALUE(_REGEX,TRUE,FALSE);}

%{// ****************
 // shorthand REGEX
 // ****************
%}

%{// shorthand, quoted
%}
#\"[^"]+\"      {
        // Get into short form of reg expression
        BEGIN shortregex;
        STRING_VALUE(_PROPNAME, FALSE, TRUE);
    }

%{// shorthand, not quoted
%}
#[^" <>=!&|~\^]+    {
        // Get into short form of reg expression
        BEGIN shortregex;
        STRING_VALUE(_PROPNAME, FALSE, FALSE);
    }

%{// ***************
 // longhand PHRASE
 // ***************
%}

%{// quoted, with trailing * or **
%}
{begin_phrase}\"[^"]*\"{end_phrase}\*   {
        // trailing * has to be for inflection -
        // process it in shortgen on next pass.
        // Grab phrase now.
        yyless(yyleng-1);
        BEGIN shortgen;
        STRING_VALUE(_PHRASE,TRUE,TRUE);
    }

%{// quoted, without trailing * or **
%}
{begin_phrase}\"[^"]*\"{end_phrase}     {
        // no trailing * -- phrase only
        STRING_VALUE(_PHRASE,TRUE,TRUE);
    }

%{// unquoted, with trailing * or **
%}
{begin_phrase}[^{]*{end_phrase}\*       {
        // trailing * has to be for inflection -
        // process it in shortgen on next pass.
        // Grab phrase now.
        yyless(yyleng-1);
        BEGIN shortgen;
        STRING_VALUE(_PHRASE,TRUE,FALSE);
    }

%{// unquoted, without trailing * or **
%}
{begin_phrase}[^{]*{end_phrase}         {
        // no trailing * -- phrase only
        STRING_VALUE(_PHRASE,TRUE,FALSE);
    }

%{// *************
 // shorthand PHRASE
 // *************
%}

%{// with trailing * or **
%}
\"[^"]*\"\*     {
        // trailing * has to be for inflection -
        // process it in shortgen on next pass.
        // Grab phrase now.
        yyless(yyleng-1);
        BEGIN shortgen;
        STRING_VALUE(_PHRASE, FALSE, TRUE);
    }

%{ // without trailing * or **
%}
\"[^"]*\"       {
        // no trailing * -- phrase only
        STRING_VALUE(_PHRASE, FALSE, TRUE);
    }

%{// *****************
 // longhand FREETEXT
 // *****************
%}

%{// quoted, with trailing * or **
%}
{begin_freetext}\"[^"]*\"{end_freetext}\*   {
        // trailing * has to be for inflection -
        // process it in shortgen on next pass.
        // Grab freetext now.
        yyless(yyleng-1);
        BEGIN shortgen;
        STRING_VALUE(_FREETEXT,TRUE,TRUE);
    }

%{// quoted, without trailing * or **
%}
{begin_freetext}\"[^"]*\"{end_freetext}     {
        // no trailing * -- freetext only
        STRING_VALUE(_FREETEXT,TRUE,TRUE);
    }

%{// unquoted, with trailing * or **
%}
{begin_freetext}[^{]*{end_freetext}\*       {
        // trailing * has to be for inflection -
        // process it in shortgen on next pass.
        // Grab freetext now.
        yyless(yyleng-1);
        BEGIN shortgen;
        STRING_VALUE(_FREETEXT,TRUE,FALSE);
    }

%{// unquoted, without trailing * or **
%}
{begin_freetext}[^{]*{end_freetext}         {
        // no trailing * -- freetext only
        STRING_VALUE(_FREETEXT,TRUE,FALSE);
    }

%{// ******************
 // shorthand FREETEXT
 // ******************
%}

[^#$@~&|<>=!\^*"()\{ ][^&~|{) ]*[ ]     {
        // For backward compatibility, we want to special
        // case and recognize the "not" operator when it
        // is immediately followed by a mode specifier character
        // (@, $, #). For e.g. "not@size > 2" should be treated
        // as if we have a "not" operator followed by "@size > 2".
        // Without this special case, "not@size > 2" gets recognized
        // as free text.
        if (IsNotOperator())
        {
            yyless(3);
            BEGIN INITIAL;
            TOKEN(_NOT);
        }

        yaccDebugOut(( DEB_ITRACE, "fTreatFreetextAsPhrase is %d\n", fTreatFreetextAsPhrase ));

        if (fTreatFreetextAsPhrase)
            BEGIN implicitphrase;
        else
            BEGIN infreefreetext;

        fTreatFreetextAsPhrase = FALSE;
        yymore();
    }

[^#$@~&|<>=!\^*"()\{ ][^&~|{) ]*        {
        // IsNotOperator is used here for the same reason as the
        // use above, except that this rule covers situations where
        // we have no spaces in the query. E.g. "not@size>2".
        // This should be equivalent to
        // "not@size > 2", which in turn should be equivalent to
        // "not @size > 2"
        if (IsNotOperator())
        {
            yyless(3);
            BEGIN INITIAL;
            TOKEN(_NOT);
        }

        if (fTreatFreetextAsPhrase)
        {
            STRING_VALUE(_PHRASE,FALSE,FALSE);
        }
        else
        {
            STRING_VALUE(_FREETEXT,FALSE,FALSE);
        }
        fTreatFreetextAsPhrase = FALSE;
    }

%{// *************
 // VECTOR VALUES
 // *************
%}

%{// quoted multi-value vector - has ; separator. Singlets caught in parser
%}
\([ ]*\"[^"]*\"[ ]*;    { BEGIN invector; yyless(1);}

%{// unquoted multi-value vector - has ; separator.
// Singlets caught in parser
%}
\([^(;)]+;              { BEGIN invector; yyless(1);}

%{//
 // INNEAR: longhand NEAR processing
 //
 // Every rule below is tied to its start state with a <state> prefix.
 // The states are declared exclusive (%x), so a rule without a prefix is
 // never active inside them.
%}
<innear>{white}         {}
<innear>,               {}
<innear>dist[ ]*=[ ]*[0-9]+     { STRING_VALUE(_NEARDIST,TRUE,FALSE);}
<innear>unit[ ]*=[ ]*{word}     { STRING_VALUE(_NEARUNIT,TRUE,FALSE);}
<innear>unit[ ]*=[ ]*{sent}     { STRING_VALUE(_NEARUNIT,TRUE,FALSE);}
<innear>unit[ ]*=[ ]*{par}      { STRING_VALUE(_NEARUNIT,TRUE,FALSE);}
<innear>unit[ ]*=[ ]*{chap}     { STRING_VALUE(_NEARUNIT,TRUE,FALSE);}
<innear>\}              { BEGIN implicitphrase; TOKEN (_NEAR_END);}

%{//
 // INVECTOR: multi value vector processing
 //
%}
<invector>{white}       {}
<invector>;             {}
<invector>\"[^"]*\"     { STRING_VALUE(_VECTORELEMENT, FALSE, TRUE);}
<invector>[^ ";)][^;)]*;    { STRING_VALUE(_VECTORELEMENT, TRUE, FALSE);}
<invector>[^ ";)][^;)]*\)   {
        // Need to emit _VECTORELEMENT and _VE_END -- so backup 1
        // so we can emit _VE_END on next pass
        yyless(yyleng-1);
        STRING_VALUE(_VECTORELEMENT, FALSE, FALSE);
    }
<invector>\)            { BEGIN INITIAL; TOKEN (_VE_END); }

%{//
 // INFREEFREETEXT: shorthand FREETEXT processing
 //
 // NOTE: and, or, near need to be localized
%}
<infreefreetext>[ ]+    { yymore(); }

<infreefreetext>{and}[ ]    {
        yyless(yyleng-4);
        BEGIN INITIAL;
        STRING_VALUE(_FREETEXT,FALSE,FALSE);
    }

<infreefreetext>{and}\{     {
        yyless(yyleng-4);
        BEGIN INITIAL;
        STRING_VALUE(_FREETEXT,FALSE,FALSE);
    }

<infreefreetext>{or}[ ]     {
        yyless(yyleng-3);
        BEGIN INITIAL;
        STRING_VALUE(_FREETEXT,FALSE,FALSE);
    }

<infreefreetext>{or}\{      {
        yyless(yyleng-3);
        BEGIN INITIAL;
        STRING_VALUE(_FREETEXT,FALSE,FALSE);
    }

<infreefreetext>{near}[ ]   {
        yaccDebugOut(( DEB_ITRACE, "{infreefreetext}{near}[ ]\n" ));
        yyless(yyleng-5);
        fTreatFreetextAsPhrase = TRUE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASE,FALSE,FALSE);
    }

<infreefreetext>{near}\{    {
        yaccDebugOut(( DEB_ITRACE, "{infreefreetext}{near}{\n" ));
        yyless(yyleng-5);
        fTreatFreetextAsPhrase = TRUE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASE,FALSE,FALSE);
    }

<infreefreetext>\{{near}[ ] {
        yaccDebugOut(( DEB_ITRACE, "{infreefreetext}{{near}\n" ));
        yyless(yyleng-6);
        fTreatFreetextAsPhrase = TRUE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASE,FALSE,FALSE);
    }

<infreefreetext>\{{near}\{  {
        yaccDebugOut(( DEB_ITRACE, "{infreefreetext}{{near}{\n" ));
        yyless(yyleng-6);
        fTreatFreetextAsPhrase = TRUE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASE,FALSE,FALSE);
    }

<infreefreetext>&       {
        yyless(yyleng-1);
        BEGIN INITIAL;
        STRING_VALUE(_FREETEXT,FALSE,FALSE);
    }

<infreefreetext>\|      {
        yyless(yyleng-1);
        BEGIN INITIAL;
        STRING_VALUE(_FREETEXT,FALSE,FALSE);
    }

<infreefreetext>~       {
        yyless(yyleng-1);
        fTreatFreetextAsPhrase = TRUE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASE,FALSE,FALSE);
    }

<infreefreetext>\(      {
        yyless(yyleng-1);
        BEGIN INITIAL;
        STRING_VALUE(_FREETEXT,FALSE,FALSE);
    }

<infreefreetext>\)      {
        yyless(yyleng-1);
        BEGIN INITIAL;
        STRING_VALUE(_FREETEXT,FALSE,FALSE);
    }

<infreefreetext>\{      {
        yyless(yyleng-1);
        BEGIN INITIAL;
        STRING_VALUE(_FREETEXT,FALSE,FALSE);
    }

<infreefreetext>\"[^"]+\"   {
        BEGIN INITIAL;
        STRING_VALUE(_FREETEXT,FALSE,FALSE);
    }

<infreefreetext>[^~&|{}()" ]+[ ]    { yymore(); }

<infreefreetext>[^~&|{}()" ]+       {
        BEGIN INITIAL;
        STRING_VALUE(_FREETEXT,FALSE,FALSE);
    }

%{//
 // SHORTGEN: * or ** processing
 //
 // can only get here by backing up over *,
 // so we will always find a match
%}
<shortgen>\*\*          { BEGIN INITIAL; TOKEN(_SHGENINFLECT); }
<shortgen>\*            { BEGIN INITIAL; TOKEN(_SHGENPREFIX); }

%{//
 // SHORTREGEX: #propname processing
 //
 // can only get here when #"propname" or #propname
 // (quoted or unquoted) version is detected.
 // NOTE: and, or need to be localized
 // NOTE: It doesn't make sense to have the near operator following
 //       a regular expression. A regex is Boolean and doesn't evaluate
 //       to a position value.
 //
 //
%}
<shortregex>[ ]+        { yymore(); }

<shortregex>=           {
        // ignore equal operators...
        BEGIN shortregex;
    }

<shortregex>\"[^"]*\"   { STRING_VALUE(_REGEX, FALSE, TRUE);}

<shortregex>{and}[ ]    {
        fContinueRegex = TRUE;
        yyless(yyleng-4);
        BEGIN INITIAL;
        STRING_VALUE(_REGEX,FALSE,FALSE);
    }

<shortregex>{or}[ ]     {
        fContinueRegex = TRUE;
        yyless(yyleng-3);
        BEGIN INITIAL;
        STRING_VALUE(_REGEX,FALSE,FALSE);
    }

<shortregex>{not}[ ]    {
        yyless(yyleng-4);
        // The only valid way to get here is to
        // have had seen "and" before. Don't recognize
        // a regex. Back off and let the lexer takes its
        // normal course.
        fContinueRegex = TRUE;
        BEGIN INITIAL;
    }

<shortregex>&           {
        fContinueRegex = TRUE;
        yyless(yyleng-1);
        BEGIN INITIAL;
        STRING_VALUE(_REGEX,FALSE,FALSE);
    }

<shortregex>\|          {
        fContinueRegex = TRUE;
        yyless(yyleng-1);
        BEGIN INITIAL;
        STRING_VALUE(_REGEX,FALSE,FALSE);
    }

<shortregex>!           {
        yyless(yyleng-1);
        // The only valid way to get here is to
        // have had seen "and" before. Don't recognize
        // a phrase. Back off and let the lexer takes its
        // normal course.
        fContinueRegex = TRUE;
        BEGIN INITIAL;
    }

%{
 // When we find an operator we should treat it as one.
 // So backup and get out if you see one.
 // Normally '^' is treated as part of an operator (e.g. ^a), but it also
 // has a special meaning in regular expression syntax. So we will have to
 // let it through when it is part of a regular expression. As an alternative,
 // we can allow '^' in regular expression in a limited manner (i.e. only the use
 // in square brackets to exclude the set of chars "[^abc]" where abc are excluded).
 // This alternative will let the common case use of '^' in a regular expression
 // while allowing it to be treated as part of an operator when it doesn't
 // occur immediately after a '['.
 // We are implementing the alternative here because our regex capability
 // only allows for the "[^" construct.
%}
<shortregex>[\^<>@$#]   {
        yyless(yyleng-1);
        fContinueRegex = FALSE;
        BEGIN INITIAL;
    }

<shortregex>\(          {
        yyless(yyleng-1);
        BEGIN INITIAL;
        STRING_VALUE(_REGEX,FALSE,FALSE);
    }

<shortregex>\)          {
        yyless(yyleng-1);
        BEGIN INITIAL;
        STRING_VALUE(_REGEX,FALSE,FALSE);
    }

<shortregex>\{          {
        yyless(yyleng-1);
        BEGIN INITIAL;
        STRING_VALUE(_REGEX,FALSE,FALSE);
    }

<shortregex>(([^~&|{}()\^<>!@$#= ])*(\|[()\[{}\],*?+])*(\|\[\^)*([^~&|{}()\^<>!@$#= ])*)+[ ]    { yymore(); }

<shortregex>(([^~&|{}()\^<>!@$#= ])*(\|[()\[{}\],*?+])*(\|\[\^)*([^~&|{}()\^<>!@$#= ])*)+       {
        fContinueRegex = TRUE;
        BEGIN INITIAL;
        STRING_VALUE(_REGEX,FALSE,FALSE);
    }

%{//
 // MAYBEREGEX: entered from the = operator when fContinueMaybeRegex is set.
 // The value is emitted as _PHRASEORREGEX and CreateTknValue decides
 // whether it is a phrase or a regular expression.
%}
<mayberegex>{and}[ ]    {
        yyless(yyleng-4);
        fContinueMaybeRegex = TRUE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
    }

<mayberegex>{or}[ ]     {
        yyless(yyleng-3);
        fContinueMaybeRegex = TRUE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
    }

<mayberegex>{not}[ ]    {
        yyless(yyleng-4);
        // The only valid way to get here is to
        // have had seen "and" before. Don't recognize
        // a regex. Back off and let the lexer takes its
        // normal course.
        fContinueMaybeRegex = TRUE;
        BEGIN INITIAL;
    }

<mayberegex>&           {
        fContinueMaybeRegex = TRUE;
        yyless(yyleng-1);
        BEGIN INITIAL;
        STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
    }

<mayberegex>\|          {
        fContinueMaybeRegex = TRUE;
        yyless(yyleng-1);
        BEGIN INITIAL;
        STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
    }

<mayberegex>!           {
        yyless(yyleng-1);
        // The only valid way to get here is to
        // have had seen "and" before. Don't recognize
        // a phrase. Back off and let the lexer takes its
        // normal course.
        fContinueMaybeRegex = TRUE;
        BEGIN INITIAL;
    }

<mayberegex>\(          {
        yyless(yyleng-1);
        BEGIN INITIAL;
        STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
    }

<mayberegex>\)          {
        yyless(yyleng-1);
        BEGIN INITIAL;
        STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
    }

<mayberegex>\{          {
        yyless(yyleng-1);
        BEGIN INITIAL;
        STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
    }

<mayberegex>[ ]+        { yymore(); }

<mayberegex>\"[^"]*\"   { STRING_VALUE(_PHRASE, FALSE, TRUE);}

<mayberegex>(([^~&|{}()\^<>!@$# ])*(\|[()\[{}\],*?+])*(\|\[\^)*([^~&|{}()\^<>!@$# ])*)+[ ]      { yymore(); }

<mayberegex>(([^~&|{}()\^<>!@$# ])*(\|[()\[{}\],*?+])*(\|\[\^)*([^~&|{}()\^<>!@$# ])*)+         {
        fContinueMaybeRegex = TRUE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
    }

%{
 // When we find an operator at the start of a phrase,
 // we should treat it as one. So backup and get out if you see one.
%}
<mayberegex>[\^<>@$#]   {
        yyless(yyleng-1);
        fContinueMaybeRegex = FALSE;
        BEGIN INITIAL;
    }

%{//
 // IMPLICITPHRASE: Where phrase is implied.
 //
 // can only get here when @propname or {prop name = propname} is detected.
 // NOTE: and, or, not need to be localized when time permits.
//
// NTRAID#DB-NTBUG9-84571-2000/07/31-dlee Indexing Service tripolish2 query expressions misinterpreted as strings
//     if expression has trailing blanks, we'll emit a string value
//
// Every rule below carries the <implicitphrase> prefix. The state is
// declared exclusive (%x), so a rule without the prefix is never active
// inside it.
%}
<implicitphrase>\"[^"]*\"   {
        fContinueImplicitPhrase = FALSE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASE, FALSE, TRUE);
    }

<implicitphrase>[ ]+        { yymore(); }

<implicitphrase>{and}[ ]    {
        yyless(yyleng-4);
        fContinueImplicitPhrase = TRUE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASE,FALSE,FALSE);
    }

<implicitphrase>{or}[ ]     {
        yyless(yyleng-3);
        fContinueImplicitPhrase = TRUE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASE,FALSE,FALSE);
    }

<implicitphrase>{near}[ ]   {
        yyless(yyleng-5);
        // We want to treat the following token as a phrase
        fContinueImplicitPhrase = TRUE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASE,FALSE,FALSE);
    }

<implicitphrase>{near}\{    {
        yyless(yyleng-5);
        // We want to treat the following token as a phrase
        fContinueImplicitPhrase = TRUE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASE,FALSE,FALSE);
    }

<implicitphrase>{not}[ ]    {
        yyless(yyleng-4);
        // The only valid way to get here is to
        // have had seen "and" before. Don't recognize
        // a phrase. Back off and let the lexer takes its
        // normal course.
        fContinueImplicitPhrase = TRUE;
        BEGIN INITIAL;
    }

<implicitphrase>&           {
        yyless(yyleng-1);
        fContinueImplicitPhrase = TRUE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASE,FALSE,FALSE);
    }

<implicitphrase>~           {
        yyless(yyleng-1);
        // We want to treat the following token as a phrase
        fContinueImplicitPhrase = TRUE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASE,FALSE,FALSE);
    }

<implicitphrase>!           {
        yyless(yyleng-1);
        // The only valid way to get here is to
        // have had seen "and" before. Don't recognize
        // a phrase. Back off and let the lexer takes its
        // normal course.
        fContinueImplicitPhrase = TRUE;
        BEGIN INITIAL;
    }

<implicitphrase>\|          {
        yyless(yyleng-1);
        fContinueImplicitPhrase = TRUE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASE,FALSE,FALSE);
    }

<implicitphrase>\(          {
        yyless(yyleng-1);
        fContinueImplicitPhrase = FALSE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASE,FALSE,FALSE);
    }

<implicitphrase>\)          {
        yyless(yyleng-1);
        fContinueImplicitPhrase = FALSE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASE,FALSE,FALSE);
    }

<implicitphrase>\{          {
        yyless(yyleng-1);
        fContinueImplicitPhrase = TRUE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASE,FALSE,FALSE);
    }

<implicitphrase>{contains}[ ]   {
        yyless(yyleng-9);
        fContinueImplicitPhrase = TRUE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASE,FALSE,FALSE);
    }

%{
 // When we find an operator at the start of an implicit phrase,
 // we should treat it as one. So backup and get out if you see one.
%}
<implicitphrase>[\^<>@$#]   {
        yyless(yyleng-1);
        fContinueImplicitPhrase = FALSE;
        BEGIN INITIAL;
    }

%{
 // Triplish2 uses = to indicate that whatever appears after it may
 // be using wildcards. Implement that here.
%}
<implicitphrase>=           {
        yyless(yyleng-1);
        fContinueMaybeRegex = TRUE;
        BEGIN INITIAL;
    }

<implicitphrase>[^~&|{}()\^<>=!@$# ]+[ ]    { yymore(); }

<implicitphrase>[^~&|{}()\^<>=!@$# ]+       {
        fContinueImplicitPhrase = TRUE;
        BEGIN INITIAL;
        STRING_VALUE(_PHRASE,FALSE,FALSE);
    }