Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1578 lines
63 KiB

  1. %{
  2. //+---------------------------------------------------------------------------
  3. //
  4. // Microsoft Windows
  5. // Copyright (C) Microsoft Corporation, 1997 - 2000.
  6. //
  7. // File: parser.l
  8. //
  9. // Contents: Lex rules for parser
  10. //
  11. // Notes: Written for flex version 2.5.4
  12. //
  13. // History: 10-01-97 emilyb created
  14. //
  15. //----------------------------------------------------------------------------
  16. class CValueParser;
  17. #include "yybase.hxx"
  18. #include "parser.h"
  19. #include "parsepl.h"
  20. #include "flexcpp.h"
  // TOKEN: returns the given token id from the enclosing function. The
  // expansion already ends in a semicolon.
  21. #define TOKEN(tknNum) return (tknNum);
  // STRING_VALUE: unless IsTokenEmpty() reports the current token as empty,
  // returns the result of CreateTknValue(yylval, tknNum, fLong, fQuote).
  // When the token is empty nothing is returned and execution continues
  // with whatever follows the macro.
  22. #define STRING_VALUE(tknNum, fLong, fQuote) \
  23. { \
  24. if (!IsTokenEmpty()) \
  25. return CreateTknValue(yylval, tknNum, fLong, fQuote); \
  26. }
  27. /*
  28. ** Make Lex read from a block of data
  29. ** buffer is the character buffer,
  30. ** result is a variable to store the number of chars read
  31. ** ms is the size of the buffer
  32. */
  33. #undef YY_INPUT
  34. #define YY_INPUT(b, r, ms) (r = yybufferinput(b, ms))
  35. DECLARE_INFOLEVEL(yacc)
  36. //+---------------------------------------------------------------------------
  37. //
  38. // Function: YYLEXER::IsTokenEmpty
  39. //
  40. // Synopsis: Determines if a token is empty. An empty token only has
  41. // whitespace or has nothing in it.
  42. //
  43. // Arguments: None.
  44. //
  45. // Returns: Boolean value.
  46. //
  47. // History: 08-APR-98 KrishnaN created
  48. //
  49. //----------------------------------------------------------------------------
  50. BOOL YYLEXER::IsTokenEmpty()
  51. {
  52. LPWSTR pwsz = yytext;
  53. Win4Assert(pwsz);
  54. while (*pwsz != 0)
  55. {
  56. if (*pwsz != L' ' && *pwsz != L'\t')
  57. return FALSE;
  58. pwsz++;
  59. }
  60. return TRUE;
  61. }
  62. //+---------------------------------------------------------------------------
  63. //
  64. // Function: YYLEXER::IsNotOperator
  65. //
  66. // Synopsis: Determines if we have a not operator.
  67. //
  68. // Arguments: None.
  69. //
  70. // Returns: Boolean value.
  71. //
  72. // History: 08-DEC-98 KrishnaN created
  73. //
  74. //----------------------------------------------------------------------------
  75. BOOL YYLEXER::IsNotOperator()
  76. {
  77. LPWSTR pwsz = yytext;
  78. Win4Assert(pwsz);
  79. // skip past leading spaces
  80. int i = 0;
  81. while (*pwsz != 0 && (*pwsz == L' ' || *pwsz == L'\t'))
  82. {
  83. pwsz++;
  84. i++;
  85. }
  86. // If we don't have at least four chars to consider, we don't have a
  87. // not operator.
  88. if (yyleng < i+4)
  89. return FALSE;
  90. if ( (*pwsz == L'n' || *pwsz == L'N') &&
  91. (*(pwsz+1) == L'o' || *(pwsz+1) == L'O') &&
  92. (*(pwsz+2) == L't' || *(pwsz+2) == L'T') &&
  93. (*(pwsz+3) == L'@' || *(pwsz+3) == L'#' || *(pwsz+3) == L'$')
  94. )
  95. return TRUE;
  96. else
  97. return FALSE;
  98. }
  99. //+---------------------------------------------------------------------------
  100. //
  101. // Function: YYLEXER::CreateTknValue
  102. //
  103. // Synopsis: Allocs a WCHAR string which is passed to the YACC value stack.
  104. //
  105. // Arguments: [ppStg] -- set to pointer to alloc'd memory
  106. // [tknNum] -- token id
  107. // [fLong] -- true if token is in longhand version
  108. // [fQuote] -- true if token is quoted
  109. //
  110. // Returns: Updated token id
  111. //
  112. // History: 10-01-97 emilyb created
  113. //
  114. //----------------------------------------------------------------------------
  115. short YYLEXER::CreateTknValue(YYSTYPE *ppStg, short tknNum, BOOL fLong, BOOL fQuote )
  116. {
  117. HRESULT hr = S_OK;
  118. short retTkn = tknNum;
  119. LPWSTR pwsz = yytext;
  120. if (!fQuote)
  121. {
  122. // If we see a double quote, consider the string quoted.
  123. while (L' ' == *pwsz)
  124. pwsz++;
  125. if (*pwsz == L'"')
  126. {
  127. // strip trailing blanks and check if we see a trailing "
  128. LPWSTR pLast = pwsz + wcslen(pwsz) - 1;
  129. while (pLast >= pwsz && L' ' == *pLast )
  130. {
  131. *pLast = L'\0';
  132. pLast--;
  133. }
  134. if (*pLast == L'"' && pLast > pwsz )
  135. fQuote = TRUE;
  136. }
  137. }
  138. // start parsing from the beginning of the string
  139. pwsz = yytext;
  140. if (_PHRASEORREGEX == tknNum)
  141. {
  142. // A quoted string is always a phrase.
  143. if (fQuote)
  144. retTkn = _PHRASE;
  145. else
  146. retTkn = DetermineTokenType();
  147. }
  148. switch (retTkn)
  149. {
  150. case _PHRASE:
  151. {
  152. LPWSTR pLast;
  153. pLast = pwsz + wcslen(pwsz) - 1;
  154. // if long version, find the phrase
  155. if (fLong)
  156. {
  157. pwsz = pwsz + wcslen(L"{phrase}");
  158. pLast = pLast - wcslen(L"{/phrase}"+1);
  159. Win4Assert(*pLast == L'{');
  160. *pLast = L'\0';
  161. }
  162. // strip leading and trailing blanks
  163. while (L' ' == *pwsz)
  164. pwsz++;
  165. pLast = pwsz + wcslen(pwsz) - 1;
  166. while (pLast >= pwsz && L' ' == *pLast )
  167. {
  168. *pLast = L'\0';
  169. pLast--;
  170. }
  171. // NOTE: Don't strip double quotes here, they will be stripped later
  172. yaccDebugOut((DEB_ITRACE, "Phrase %ws in %ws format\n", pwsz, fLong ? L"Long" : L"Short"));
  173. }
  174. break;
  175. case _PROPNAME:
  176. {
  177. LPWSTR pLast;
  178. if (fLong) // looks like: { prop name = "prop name" }
  179. {
  180. // find =
  181. while (L'=' != *pwsz)
  182. pwsz++;
  183. pwsz++;
  184. pLast = pwsz + wcslen(pwsz) - 1;
  185. Win4Assert( *pLast == L'}');
  186. *pLast-- = L'\0';
  187. }
  188. else
  189. {
  190. // Strip @ or # or $ token
  191. Win4Assert(*pwsz == L'@' || *pwsz == L'#' || *pwsz == L'$');
  192. pwsz = pwsz + 1;
  193. }
  194. // strip leading and trailing blanks
  195. while (L' ' == *pwsz)
  196. pwsz++;
  197. pLast = pwsz + wcslen(pwsz) - 1;
  198. while (pLast >= pwsz && L' ' == *pLast )
  199. {
  200. *pLast--= L'\0';
  201. }
  202. if (fQuote)
  203. {
  204. pwsz++;
  205. *pLast = L'\0';
  206. }
  207. yaccDebugOut((DEB_ITRACE, "Propname %ws in %ws format and %ws\n",
  208. pwsz, fLong ? L"Long" : L"Short", fQuote ? L"quoted" : L"unquoted"));
  209. }
  210. break;
  211. case _FREETEXT:
  212. {
  213. LPWSTR pLast;
  214. // if long version, find the FREETEXT
  215. if (fLong)
  216. {
  217. pwsz = pwsz + wcslen(L"{freetext}");
  218. pLast = pwsz + wcslen(pwsz) - 1;
  219. pLast = pLast - wcslen(L"{/freetext}")+1;
  220. Win4Assert(*pLast == L'{');
  221. *pLast = L'\0';
  222. }
  223. // strip leading and trailing blanks
  224. while (L' ' == *pwsz)
  225. pwsz++;
  226. pLast = pwsz + wcslen(pwsz) - 1;
  227. while (pLast >= pwsz && L' ' == *pLast )
  228. {
  229. *pLast = L'\0';
  230. pLast--;
  231. }
  232. if (fQuote)
  233. {
  234. Win4Assert(pLast >= pwsz+1);
  235. // Strip quotes
  236. pwsz = pwsz + 1;
  237. *pLast = L'\0';
  238. }
  239. yaccDebugOut((DEB_ITRACE, "Freetext %ws in %ws format\n", pwsz, fLong ? L"Long" : L"Short"));
  240. }
  241. break;
  242. case _REGEX:
  243. {
  244. LPWSTR pLast;
  245. // if long version, find the regex
  246. if (fLong)
  247. {
  248. pwsz = pwsz + wcslen(L"{regex}");
  249. pLast = pwsz + wcslen(pwsz);
  250. pLast = pLast - wcslen(L"{/regex}");
  251. Win4Assert(*pLast == L'{');
  252. *pLast = L'\0';
  253. }
  254. // strip leading blanks
  255. while (L' ' == *pwsz)
  256. pwsz++;
  257. // If the first char is =, ignore it. We only ignore the first
  258. // = character. This is backward compatible with Triplish1
  259. if (L'=' == *pwsz)
  260. pwsz++;
  261. // strip leading and trailing blanks
  262. while (L' ' == *pwsz)
  263. pwsz++;
  264. pLast = pwsz + wcslen(pwsz) - 1;
  265. while (pLast >= pwsz && L' ' == *pLast )
  266. {
  267. *pLast = L'\0';
  268. pLast--;
  269. }
  270. // After we strip a leading =, we might have a quoted phrase
  271. // Check only if fQuote is false.
  272. // We don't want to deal with an unpaired double quote.
  273. if (!fQuote && *pwsz == L'"' && *pLast == L'"' && pLast > pwsz )
  274. fQuote = TRUE;
  275. if (fQuote)
  276. {
  277. Win4Assert(pLast >= pwsz+1);
  278. // Strip quotes
  279. pwsz = pwsz + 1;
  280. *pLast = L'\0';
  281. }
  282. yaccDebugOut((DEB_ITRACE, "RegEx %ws in %ws format\n", pwsz, fLong ? L"Long" : L"Short"));
  283. }
  284. break;
  285. case _WEIGHT:
  286. {
  287. Assert (fLong);
  288. Assert(!fQuote);
  289. if (fLong) // looks like: {weight value = number }
  290. {
  291. // find =
  292. while (L'=' != *pwsz)
  293. pwsz++;
  294. pwsz++;
  295. // step past leading blanks
  296. while (L' ' == *pwsz)
  297. pwsz++;
  298. // remove trailing } and blanks
  299. LPWSTR pLast = pwsz + wcslen(pwsz) - 1;
  300. Win4Assert(*pLast == L'}');
  301. *(pLast--) = L'\0';
  302. while (pLast >= pwsz && L' ' == *pLast )
  303. {
  304. *(pLast--) = L'\0';
  305. }
  306. }
  307. }
  308. break;
  309. case _NEARDIST:
  310. {
  311. Assert (fLong);
  312. Assert(!fQuote);
  313. if (fLong) // looks like: dist = number
  314. {
  315. // find =
  316. while (L'=' != *pwsz)
  317. pwsz++;
  318. pwsz++;
  319. // step past leading blanks
  320. while (L' ' == *pwsz)
  321. pwsz++;
  322. }
  323. yaccDebugOut((DEB_ITRACE, "NearDist string: %ws in %s format\n", pwsz, fLong ? L"Long" : L"Short"));
  324. }
  325. break;
  326. case _NEARUNIT:
  327. {
  328. Assert (fLong);
  329. Assert(!fQuote);
  330. if (fLong) // looks like: unit = blah
  331. {
  332. // find =
  333. while (L'=' != *pwsz)
  334. pwsz++;
  335. pwsz++;
  336. // step past leading blanks
  337. while (L' ' == *pwsz)
  338. pwsz++;
  339. }
  340. yaccDebugOut((DEB_ITRACE, "NearUnit string: %ws in %s format\n", pwsz, fLong ? L"Long" : L"Short"));
  341. }
  342. break;
  343. case _VECTORELEMENT:
  344. {
  345. // strip leading and trailing blanks
  346. while (L' ' == *pwsz)
  347. pwsz++;
  348. LPWSTR pTemp = pwsz + wcslen(pwsz) - 1;
  349. if (fLong) // strip trailing ;
  350. {
  351. Win4Assert(L';' == *pTemp);
  352. *pTemp--='\0';
  353. }
  354. while (L' ' == *pTemp && pTemp > pwsz)
  355. *pTemp-- = L'\0';
  356. if (fQuote)
  357. {
  358. // Strip quotes
  359. pwsz = pwsz + 1;
  360. pwsz[wcslen(pwsz)-1] = L'\0';
  361. }
  362. yaccDebugOut((DEB_ITRACE, "VectorElem %ws in %ws format\n", pwsz, fLong ? L"Long" : L"Short"));
  363. }
  364. break;
  365. case _VEMETHOD:
  366. {
  367. Assert (fLong);
  368. LPWSTR pTemp;
  369. if (fLong) // looks like: {vector rankmethod= blah}
  370. {
  371. // find =
  372. while (L'=' != *pwsz)
  373. pwsz++;
  374. pwsz++;
  375. // strip trailing }
  376. pTemp = pwsz + wcslen(pwsz) - 1;
  377. Win4Assert(L'}' == *pTemp);
  378. *pTemp-- = L'\0';
  379. }
  380. // strip leading and trailing blanks and quotes
  381. while (L' ' == *pwsz)
  382. pwsz++;
  383. pTemp = pwsz + wcslen(pwsz) - 1;
  384. while (L' ' == *pTemp && pTemp > pwsz)
  385. *pTemp-- = L'\0';
  386. if (fQuote)
  387. {
  388. // Strip quotes
  389. pwsz = pwsz + 1;
  390. pwsz[wcslen(pwsz)-1] = L'\0';
  391. }
  392. yaccDebugOut((DEB_ITRACE, "VectorMethod %ws in %ws format\n", pwsz, fLong ? L"Long" : L"Short"));
  393. }
  394. break;
  395. }
  396. int len = wcslen(pwsz);
  397. XPtrST<WCHAR> xwszRet(new WCHAR[len + 1]);
  398. _allocations.Add(xwszRet.GetPointer(), _allocations.Count());
  399. RtlCopyMemory(xwszRet.GetPointer(), pwsz, (len+1) * sizeof(WCHAR));
  400. (*ppStg).pwszChar = xwszRet.Acquire();
  401. return retTkn;
  402. }
  403. //+---------------------------------------------------------------------------
  404. //
  405. // Function: YYLEXER::DetermineTokenType
  406. //
  407. // Synopsis: Determines if we have a regular expression or a regular string.
  408. // A regular expression is a string that contains atleast one of
  409. // *, ?, or | characters.
  410. //
  411. // Returns: Token id
  412. //
  413. // History: Jun-05-98 KrishnaN created
  414. //
  415. //----------------------------------------------------------------------------
  416. short YYLEXER::DetermineTokenType()
  417. {
  418. LPWSTR pwsz = yytext;
  419. LPWSTR pLast = pwsz + wcslen(pwsz) - 1;
  420. while (pLast >= pwsz)
  421. {
  422. if (L'|' == *pwsz || L'*' == *pwsz || L'?' == *pwsz)
  423. return _REGEX;
  424. pwsz++;
  425. }
  426. // None of the regular expression defining characters have been found
  427. return _PHRASE;
  428. }
  429. //
  430. //
  431. // RULES
  432. //
  433. // Notes: Any characters which are not matched, cause yylexer to throw.
  434. // We can also throw if E_OUTOFMEMORY.
  435. // Tokens which need to return more than 1 value (e.g. {near})
  436. // use start states to return each piece of the value. The start
  437. // states also emit a "token end" token so that the parser can
  438. // check that they are syntactically complete.
  439. // Lex matches to the longest match in the rules. If 2 matches
  440. // are the same, it matches to the 1st match.
  441. %}
  442. %x innear
  443. %x shortgen
  444. %x shortregex
  445. %x mayberegex
  446. %x implicitphrase
  447. %x infreefreetext
  448. %x invector
  449. white [ \t\n\f\r]+
  450. begin_freetext \{[fF][rR][eE][eE][tT][eE][xX][tT]\}[ ]*
  451. end_freetext [ ]*\{\/[fF][rR][eE][eE][tT][eE][xX][tT]\}
  452. begin_phrase \{[pP][hH][rR][aA][sS][eE]\}[ ]*
  453. end_phrase [ ]*\{\/[pP][hH][rR][aA][sS][eE]\}
  454. prop [pP][rR][oO][pP]
  455. propname {prop}[ ]+[nN][aA][mM][eE][ ]*
  456. contains [cC][oO][nN][tT][aA][iI][nN][sS]
  457. and [aA][nN][dD]
  458. or [oO][rR]
  459. not [nN][oO][tT]
  460. near [nN][eE][aA][rR]
  461. vector [vV][eE][cC][tT][oO][rR]
  462. vecmethod {vector}[ ]+[rR][aA][nN][kK][mM][eE][tT][hH][oO][dD][ ]*
  463. ve [vV][eE]
  464. weight [wW][eE][iI][gG][hH][tT][ ]+[vV][aA][lL][uU][eE][ ]*
  465. coerce [cC][oO][eE][rR][cC][eE]
  466. generate [gG][eE][nN][eE][rR][aA][tT][eE]
  467. genmethod {generate}[ ]+[mM][eE][tT][hH][oO][dD][ ]*
  468. begin_regex \{[rR][eE][gG][eE][xX]\}[ ]*
  469. end_regex [ ]*\{\/[rR][eE][gG][eE][xX]\}
  470. dist [dD][iI][sS][tT][ ]*
  471. unit [uU][nN][iI][tT][ ]*
  472. word [wW][oO][rR][dD]
  473. sent [sS][eE][nN][tT]
  474. par [pP][aA][rR]
  475. chap [cC][hH][aA][pP]
  476. %%
  477. {white} { /* do nothing */ }
  478. \( { fContinueImplicitPhrase = FALSE;
  479. fContinueRegex = FALSE;
  480. fContinueMaybeRegex = FALSE;
  481. TOKEN (_OPEN);
  482. }
  483. \) {
  484. fContinueImplicitPhrase = FALSE;
  485. fContinueRegex = FALSE;
  486. fContinueMaybeRegex = FALSE;
  487. TOKEN (_CLOSE);
  488. }
  489. %{// ************
  490. // PROPNAME
  491. // ************ %}
  492. %{ // If something was treated as a phrase in Tripolish 1, it should
  493. // be treated as such even now. That applies here. For example, @propname
  494. // caused the following text to be treated as a phrase. The same should
  495. // apply to {prop name = propname}
  496. //
  497. %}
  498. %{// shorthand, quoted %}
  499. @\"[^"]+\" {
  500. // treat value as a phrase
  501. BEGIN implicitphrase;
  502. STRING_VALUE(_PROPNAME, FALSE, TRUE);
  503. }
  504. %{// shorthand, not quoted %}
  505. @[^" <>=!&|~\^]+ {
  506. // treat value as a phrase
  507. BEGIN implicitphrase;
  508. STRING_VALUE(_PROPNAME, FALSE, FALSE);
  509. }
  510. %{// shorthand, quoted %}
  511. $\"[^"]+\" {
  512. // treat value as freetext
  513. BEGIN infreefreetext;
  514. STRING_VALUE(_PROPNAME, FALSE, TRUE);
  515. }
  516. %{// shorthand, not quoted %}
  517. $[^" <>=!&|~\^]+ {
  518. // treat value as freetext
  519. BEGIN infreefreetext;
  520. STRING_VALUE(_PROPNAME, FALSE, FALSE);
  521. }
  522. %{// longhand, quoted %}
  523. \{{propname}=[ ]*\"[^"]*\"[ ]*\} {
  524. // treat value as a phrase
  525. BEGIN implicitphrase;
  526. STRING_VALUE(_PROPNAME, TRUE, TRUE);
  527. }
  528. %{// longhand, not quoted %}
  529. \{{propname}=[ ]*[^"} ][^}]*\} {
  530. // treat value as a phrase
  531. BEGIN implicitphrase;
  532. STRING_VALUE(_PROPNAME, TRUE, FALSE);
  533. }
  534. %{// closing token %}
  535. \{\/{prop}\} { TOKEN (_PROPEND); }
  536. %{// *********
  537. // OPERATORS
  538. // ********* %}
  539. {contains}[ ]+ { if (fContinueImplicitPhrase)
  540. {
  541. BEGIN implicitphrase;
  542. fContinueImplicitPhrase = FALSE;
  543. }
  544. else if (fContinueRegex)
  545. {
  546. BEGIN shortregex;
  547. fContinueRegex = FALSE;
  548. }
  549. else if (fContinueMaybeRegex)
  550. {
  551. BEGIN mayberegex;
  552. fContinueMaybeRegex = FALSE;
  553. }
  554. TOKEN (_CONTAINS);
  555. }
  556. {and}[ ]+ { if (fContinueImplicitPhrase)
  557. {
  558. BEGIN implicitphrase;
  559. fContinueImplicitPhrase = FALSE;
  560. }
  561. else if (fContinueRegex)
  562. {
  563. BEGIN shortregex;
  564. fContinueRegex = FALSE;
  565. }
  566. else if (fContinueMaybeRegex)
  567. {
  568. BEGIN mayberegex;
  569. fContinueMaybeRegex = FALSE;
  570. }
  571. TOKEN (_AND);
  572. }
  573. {and}\{ {
  574. yyless(yyleng-1);
  575. if (fContinueImplicitPhrase)
  576. {
  577. BEGIN implicitphrase;
  578. fContinueImplicitPhrase = FALSE;
  579. }
  580. else if (fContinueRegex)
  581. {
  582. BEGIN shortregex;
  583. fContinueRegex = FALSE;
  584. }
  585. else if (fContinueMaybeRegex)
  586. {
  587. BEGIN mayberegex;
  588. fContinueMaybeRegex = FALSE;
  589. }
  590. TOKEN (_AND);
  591. }
  592. {or}[ ]+ { if (fContinueImplicitPhrase)
  593. {
  594. yaccDebugOut(( DEB_ITRACE, "fContinueImplicitPhrase\n" ));
  595. BEGIN implicitphrase;
  596. fContinueImplicitPhrase = FALSE;
  597. }
  598. else if (fContinueRegex)
  599. {
  600. yaccDebugOut(( DEB_ITRACE, "fContinueRegex\n" ));
  601. BEGIN shortregex;
  602. fContinueRegex = FALSE;
  603. }
  604. else if (fContinueMaybeRegex)
  605. {
  606. yaccDebugOut(( DEB_ITRACE, "fContinueMaybeRegex\n" ));
  607. BEGIN mayberegex;
  608. fContinueMaybeRegex = FALSE;
  609. }
  610. yaccDebugOut(( DEB_ITRACE, "OR TOKEN found !!!\n" ));
  611. TOKEN (_OR); }
  612. {or}\{ {
  613. yyless(yyleng-1);
  614. if (fContinueImplicitPhrase)
  615. {
  616. yaccDebugOut(( DEB_ITRACE, "OR{ fContinueImplicitPhrase\n" ));
  617. BEGIN implicitphrase;
  618. fContinueImplicitPhrase = FALSE;
  619. }
  620. else if (fContinueRegex)
  621. {
  622. yaccDebugOut(( DEB_ITRACE, "OR{ fContinueRegex\n" ));
  623. BEGIN shortregex;
  624. fContinueRegex = FALSE;
  625. }
  626. else if (fContinueMaybeRegex)
  627. {
  628. yaccDebugOut(( DEB_ITRACE, "OR{ fContinueMaybeRegex\n" ));
  629. BEGIN mayberegex;
  630. fContinueMaybeRegex = FALSE;
  631. }
  632. yaccDebugOut(( DEB_ITRACE, "OR{ TOKEN found !!!\n" ));
  633. TOKEN (_OR); }
  634. {not}[ ]+ { if (fContinueImplicitPhrase)
  635. {
  636. BEGIN implicitphrase;
  637. fContinueImplicitPhrase = FALSE;
  638. }
  639. else if (fContinueRegex)
  640. {
  641. BEGIN shortregex;
  642. fContinueRegex = FALSE;
  643. }
  644. else if (fContinueMaybeRegex)
  645. {
  646. BEGIN mayberegex;
  647. fContinueMaybeRegex = FALSE;
  648. }
  649. TOKEN (_NOT);}
  650. {not}\{ {
  651. yyless(yyleng-1);
  652. if (fContinueImplicitPhrase)
  653. {
  654. BEGIN implicitphrase;
  655. fContinueImplicitPhrase = FALSE;
  656. }
  657. else if (fContinueRegex)
  658. {
  659. BEGIN shortregex;
  660. fContinueRegex = FALSE;
  661. }
  662. else if (fContinueMaybeRegex)
  663. {
  664. BEGIN mayberegex;
  665. fContinueMaybeRegex = FALSE;
  666. }
  667. TOKEN (_NOT);}
  668. & { if (fContinueImplicitPhrase)
  669. {
  670. BEGIN implicitphrase;
  671. fContinueImplicitPhrase = FALSE;
  672. }
  673. else if (fContinueRegex)
  674. {
  675. BEGIN shortregex;
  676. fContinueRegex = FALSE;
  677. }
  678. else if (fContinueMaybeRegex)
  679. {
  680. BEGIN mayberegex;
  681. fContinueMaybeRegex = FALSE;
  682. }
  683. TOKEN (_AND);}
  684. \| { if (fContinueImplicitPhrase)
  685. {
  686. BEGIN implicitphrase;
  687. fContinueImplicitPhrase = FALSE;
  688. }
  689. else if (fContinueRegex)
  690. {
  691. BEGIN shortregex;
  692. fContinueRegex = FALSE;
  693. }
  694. else if (fContinueMaybeRegex)
  695. {
  696. BEGIN mayberegex;
  697. fContinueMaybeRegex = FALSE;
  698. }
  699. TOKEN (_OR);}
  700. ! { if (fContinueImplicitPhrase)
  701. {
  702. BEGIN implicitphrase;
  703. fContinueImplicitPhrase = FALSE;
  704. }
  705. else if (fContinueRegex)
  706. {
  707. BEGIN shortregex;
  708. fContinueRegex = FALSE;
  709. }
  710. else if (fContinueMaybeRegex)
  711. {
  712. BEGIN mayberegex;
  713. fContinueMaybeRegex = FALSE;
  714. }
  715. TOKEN (_NOT);}
  716. {near}[ ]+ { yaccDebugOut(( DEB_ITRACE, "near[ ]+ _NEAR token, begin implicitphrase\n" ));
  717. BEGIN implicitphrase;
  718. TOKEN (_NEAR);}
  719. {near}\{ { yaccDebugOut(( DEB_ITRACE, "near{ _NEAR token, begin implicitphrase\n" ));
  720. yyless(yyleng-1);
  721. BEGIN implicitphrase;
  722. TOKEN (_NEAR);}
  723. ~ { BEGIN implicitphrase;
  724. TOKEN (_NEAR);}
  725. \< { TOKEN (_LT);}
  726. \> { TOKEN (_GT);}
  727. \<\= { TOKEN (_LTE);}
  728. \>\= { TOKEN (_GTE);}
  729. \= { if (fContinueMaybeRegex)
  730. {
  731. // We are not sure if we are going to find a
  732. // regular expression or a phrase.
  733. BEGIN mayberegex;
  734. fContinueMaybeRegex = FALSE;
  735. }
  736. TOKEN (_EQ);
  737. }
  738. \!\= { TOKEN (_NE); }
  739. \^a { TOKEN (_ALLOF); }
  740. \^s { TOKEN (_SOMEOF); }
  741. \<[ ]*\^s |
  742. \^s[ ]*\< { TOKEN (_LTSOME); }
  743. \>[ ]*\^s |
  744. \^s[ ]*\> { TOKEN (_GTSOME); }
  745. \<\=[ ]*\^s |
  746. \^s[ ]*\<\= { TOKEN (_LTESOME); }
  747. \>\=[ ]*\^s |
  748. \^s[ ]*\>\= { TOKEN (_GTESOME); }
  749. \=[ ]*\^s |
  750. \^s[ ]*\= { TOKEN (_EQSOME); }
  751. \!\=[ ]*\^s |
  752. \^s[ ]*\!\= { TOKEN (_NESOME); }
  753. \^s[ ]*\^a { TOKEN (_ALLOFSOME); }
  754. \^s[ ]*\^s { TOKEN (_SOMEOFSOME); }
  755. \<[ ]*\^a |
  756. \^a[ ]*\< { /* "<" with the ^a modifier on either side of it */ TOKEN (_LTALL); }
  757. \>[ ]*\^a |
  758. \^a[ ]*\> { TOKEN (_GTALL); }
  759. \<\=[ ]*\^a |
  760. \^a[ ]*\<\= { TOKEN (_LTEALL); }
  761. \>\=[ ]*\^a |
  762. \^a[ ]*\>\= { TOKEN (_GTEALL); }
  763. \=[ ]*\^a |
  764. \^a[ ]*\= { TOKEN (_EQALL); }
  765. \!\=[ ]*\^a |
  766. \^a[ ]*\!\= { TOKEN (_NEALL); }
  767. \^a[ ]*\^a { TOKEN (_ALLOFALL); }
  768. \^a[ ]*\^s { TOKEN (_SOMEOFALL); }
  769. %{// *************
  770. // VECTOR SPACE TOKENS
  771. // ************* %}
  772. \{{vecmethod}=[ ]*\"[^"]*\"[ ]*\} { STRING_VALUE(_VEMETHOD, TRUE, TRUE); }
  773. \{{vecmethod}=[^}]*\} { STRING_VALUE(_VEMETHOD, TRUE, FALSE); }
  774. \{{ve}\} {
  775. // makes more sense to enter phrase mode
  776. // rather than freetext mode.
  777. fContinueImplicitPhrase = TRUE;
  778. BEGIN implicitphrase;
  779. TOKEN (_VE);
  780. }
  781. \{\/{vector}\} { TOKEN (_VECTOR_END); }
  782. %{// *************
  783. // longhand NEAR
  784. // ************* %}
  785. %{// must return both unit and distance, so use start state to pull them out, and
  786. // return _NEAR_END so parser knows we hit the closing }
  787. %}
  788. \{{near}[ ] { yaccDebugOut(( DEB_ITRACE, "Longhand _NEAR token, begin innear\n" ));
  789. BEGIN innear; }
  790. \{{near}\{ { yaccDebugOut(( DEB_ITRACE, "Longhand _NEAR{ token, begin innear\n" ));
  791. yyless(yyleng-1);
  792. BEGIN innear; }
  793. %{// ************
  794. // WEIGHT
  795. // ************ %}
  796. \{{weight}=[ ]*(0|1|0\.[0-9]*|1\.[0]*|\.[0-9]+)[ ]*\} {
  797. if (fContinueImplicitPhrase)
  798. {
  799. BEGIN implicitphrase;
  800. fContinueImplicitPhrase = FALSE;
  801. }
  802. yaccDebugOut(( DEB_ITRACE, "_WEIGHT TOKEN FOUND!!\n" ));
  803. STRING_VALUE(_WEIGHT,TRUE,FALSE);
  804. }
  805. \{{coerce}\} {
  806. if (fContinueImplicitPhrase)
  807. {
  808. BEGIN implicitphrase;
  809. fContinueImplicitPhrase = FALSE;
  810. }
  811. TOKEN (_COERCE); }
  812. %{// ****************
  813. // longhand GENERATE
  814. // **************** %}
  815. \{{genmethod}=[" ]*prefix[" ]*\} {
  816. if (fContinueImplicitPhrase)
  817. {
  818. BEGIN implicitphrase;
  819. fContinueImplicitPhrase = FALSE;
  820. }
  821. yaccDebugOut((DEB_ITRACE, "Prefix recognized.\n"));
  822. TOKEN(_GENPREFIX);
  823. }
  824. \{{genmethod}=[" ]*inflect[" ]*\} {
  825. if (fContinueImplicitPhrase)
  826. {
  827. BEGIN implicitphrase;
  828. fContinueImplicitPhrase = FALSE;
  829. }
  830. yaccDebugOut((DEB_ITRACE, "Inflect recognized.\n"));
  831. TOKEN(_GENINFLECT);
  832. }
  833. \{\/{generate}\} { TOKEN (_GENNORMAL); }
  834. %{// ****************
  835. // longhand REGEX
  836. // **************** %}
  837. {begin_regex}\"[^"]*\"{end_regex} { STRING_VALUE(_REGEX,TRUE,TRUE);}
  838. {begin_regex}[^{]*{end_regex} { STRING_VALUE(_REGEX,TRUE,FALSE);}
  839. {begin_regex}([^{]*\|[()\[{}\],*?+][^{]*)*{end_regex} { STRING_VALUE(_REGEX,TRUE,FALSE);}
  840. %{// ****************
  841. // shorthand REGEX
  842. // **************** %}
  843. %{// shorthand, quoted %}
  844. #\"[^"]+\" {
  845. // Get into short form of reg expression
  846. BEGIN shortregex;
  847. STRING_VALUE(_PROPNAME, FALSE, TRUE);
  848. }
  849. %{// shorthand, not quoted %}
  850. #[^" <>=!&|~\^]+ {
  851. // Get into short form of reg expression
  852. BEGIN shortregex;
  853. STRING_VALUE(_PROPNAME, FALSE, FALSE);
  854. }
  855. %{// ***************
  856. // longhand PHRASE
  857. // *************** %}
  858. %{// quoted, with trailing * or ** %}
  859. {begin_phrase}\"[^"]*\"{end_phrase}\* {
  860. // trailing * has to be for inflection -
  861. // process it in shortgen on next pass.
  862. // Grab phrase now.
  863. yyless(yyleng-1);
  864. BEGIN shortgen;
  865. STRING_VALUE(_PHRASE,TRUE,TRUE);
  866. }
  867. %{// quoted, without trailing * or ** %}
  868. {begin_phrase}\"[^"]*\"{end_phrase} {
  869. // no trailing * -- phrase only
  870. STRING_VALUE(_PHRASE,TRUE,TRUE);
  871. }
  872. %{// unquoted, with trailing * or ** %}
  873. {begin_phrase}[^{]*{end_phrase}\* {
  874. // trailing * has to be for inflection -
  875. // process it in shortgen on next pass.
  876. // Grab phrase now.
  877. yyless(yyleng-1);
  878. BEGIN shortgen;
  879. STRING_VALUE(_PHRASE,TRUE,FALSE);
  880. }
  881. %{// unquoted, without trailing * or ** %}
  882. {begin_phrase}[^{]*{end_phrase} {
  883. // no trailing * -- phrase only
  884. STRING_VALUE(_PHRASE,TRUE,FALSE);
  885. }
  886. %{// *************
  887. // shorthand PHRASE
  888. // ************* %}
  889. %{// with trailing * or ** %}
  890. \"[^"]*\"\* {
  891. // trailing * has to be for inflection -
  892. // process it in shortgen on next pass.
  893. // Grab phrase now.
  894. yyless(yyleng-1);
  895. BEGIN shortgen;
  896. STRING_VALUE(_PHRASE, FALSE, TRUE);
  897. }
  898. %{ // without trailing * or ** %}
  899. \"[^"]*\" {
  900. // no trailing * -- phrase only
  901. STRING_VALUE(_PHRASE, FALSE, TRUE);
  902. }
  903. %{// *****************
  904. // longhand FREETEXT
  905. // ***************** %}
  906. %{// quoted, with trailing * or ** %}
  907. {begin_freetext}\"[^"]*\"{end_freetext}\* {
  908. // trailing * has to be for inflection -
  909. // process it in shortgen on next pass.
  910. // Grab freetext now.
  911. yyless(yyleng-1);
  912. BEGIN shortgen;
  913. STRING_VALUE(_FREETEXT,TRUE,TRUE);
  914. }
  915. %{// quoted, without trailing * or ** %}
  916. {begin_freetext}\"[^"]*\"{end_freetext} {
  917. // no trailing * -- freetext only
  918. STRING_VALUE(_FREETEXT,TRUE,TRUE);
  919. }
  920. %{// unquoted, with trailing * or ** %}
  921. {begin_freetext}[^{]*{end_freetext}\* {
  922. // trailing * has to be for inflection -
  923. // process it in shortgen on next pass.
  924. // Grab freetext now.
  925. yyless(yyleng-1);
  926. BEGIN shortgen;
  927. STRING_VALUE(_FREETEXT,TRUE,FALSE);
  928. }
  929. %{// unquoted, without trailing * or ** %}
  930. {begin_freetext}[^{]*{end_freetext} {
  931. // no trailing * -- freetext only
  932. STRING_VALUE(_FREETEXT,TRUE,FALSE);
  933. }
  934. %{// ******************
  935. // shorthand FREETEXT
  936. // ****************** %}
  937. [^#$@~&|<>=!\^*"()\{ ][^&~|{) ]*[ ] {
  938. // For backward compatibility, we want to special
  939. // case and recognize the "not" operator when it
  940. // is immediately followed by a mode specifier character
  941. // (@, $, #). For e.g. "not@size > 2" should be treated
  942. // as if we have a "not" operator followed by "@size > 2".
  943. // Without this special case, "not@size > 2" gets recognized
  944. // as free text.
  945. if (IsNotOperator())
  946. {
  947. yyless(3);
  948. BEGIN INITIAL;
  949. TOKEN(_NOT);
  950. }
  951. yaccDebugOut(( DEB_ITRACE, "fTreatFreetextAsPhrase is %d\n", fTreatFreetextAsPhrase ));
  952. if (fTreatFreetextAsPhrase)
  953. BEGIN implicitphrase;
  954. else
  955. BEGIN infreefreetext;
  956. fTreatFreetextAsPhrase = FALSE;
  957. yymore();
  958. }
  959. [^#$@~&|<>=!\^*"()\{ ][^&~|{) ]* {
  960. // Shorthand freetext that is not followed by a blank. For backward
  961. // compatibility, "not" immediately followed by a mode character
  962. // (@, #, $) is the not operator, so "not@size>2" lexes like
  963. // "not @size > 2": keep only the three characters of "not".
  964. if (IsNotOperator())
  965. {
  966. yyless(3);
  967. BEGIN INITIAL;
  968. TOKEN(_NOT);
  969. }
  970. if (fTreatFreetextAsPhrase)
  971. {
  972. // STRING_VALUE returns for every non-empty token, so the flag
  973. // has to be cleared before it rather than after it.
  974. fTreatFreetextAsPhrase = FALSE;
  975. STRING_VALUE(_PHRASE,FALSE,FALSE);
  976. }
  977. else
  978. {
  979. STRING_VALUE(_FREETEXT,FALSE,FALSE);
  980. }
  981. }
  982. %{// *************
  983. // VECTOR VALUES
  984. // ************* %}
  985. %{// quoted multi-value vector - has ; separator. Singlets caught in parser %}
  986. \([ ]*\"[^"]*\"[ ]*; { BEGIN invector; yyless(1);}
  987. %{// unquoted multi-value vector - has ; separator. Singlets caught in parser %}
  988. \([^(;)]+; { BEGIN invector; yyless(1);}
  989. %{//
  990. // INNEAR: longhand NEAR processing
  991. //
  992. %}
  993. <innear>{white} {}
  994. <innear>, {}
  995. <innear>{dist}=[ ]*[0-9]+ { /* dist = number; keyword accepted in any letter case */ STRING_VALUE(_NEARDIST,TRUE,FALSE);}
  996. <innear>{unit}=[ ]*{word} { /* unit = word|sent|par|chap; keyword accepted in any letter case */ STRING_VALUE(_NEARUNIT,TRUE,FALSE);}
  997. <innear>{unit}=[ ]*{sent} { STRING_VALUE(_NEARUNIT,TRUE,FALSE);}
  998. <innear>{unit}=[ ]*{par} { STRING_VALUE(_NEARUNIT,TRUE,FALSE);}
  999. <innear>{unit}=[ ]*{chap} { STRING_VALUE(_NEARUNIT,TRUE,FALSE);}
  1000. <innear>\} { BEGIN implicitphrase; TOKEN (_NEAR_END);}
  1001. %{//
  1002. // INVECTOR: multi value vector processing
  1003. //
  1004. %}
  1005. <invector>{white} {}
  1006. <invector>; {}
  1007. <invector>\"[^"]*\" { STRING_VALUE(_VECTORELEMENT, FALSE, TRUE);}
  1008. <invector>[^ ";)][^;)]*; { STRING_VALUE(_VECTORELEMENT, TRUE, FALSE);}
  1009. <invector>[^ ";)][^;)]*\) {
  1010. // Need to emit _VECTORELEMENT and _VE_END -- so backup 1
  1011. // so we can emit _VE_END on next pass
  1012. yyless(yyleng-1);
  1013. STRING_VALUE(_VECTORELEMENT, FALSE, FALSE);
  1014. }
  1015. <invector>\) { BEGIN INITIAL; TOKEN (_VE_END); }
  1016. %{//
  1017. // INFREEFREETEXT: shorthand FREETEXT processing
  1018. // Text accumulates via yymore until an operator is matched; the operator is returned to the input with yyless and the text before it is emitted if not blank.
  1019. // NOTE: and, or, near need to be localized %}
  1020. <infreefreetext>[ ]+ { yymore(); } // keep the blanks in yytext and append the next match
  1021. <infreefreetext>{and}[ ] {
  1022. yyless(yyleng-4); // return the last 4 characters to the input (presumably the keyword plus its blank; the and definition is not visible here)
  1023. BEGIN INITIAL; // the returned text is rescanned in INITIAL
  1024. STRING_VALUE(_FREETEXT,FALSE,FALSE); // returns _FREETEXT unless what is left in yytext is blank
  1025. }
  1026. <infreefreetext>{and}\{ {
  1027. yyless(yyleng-4); // as above, with an open brace as the delimiter
  1028. BEGIN INITIAL;
  1029. STRING_VALUE(_FREETEXT,FALSE,FALSE);
  1030. }
  1031. <infreefreetext>{or}[ ] {
  1032. yyless(yyleng-3); // return the last 3 characters (presumably the keyword plus its blank)
  1033. BEGIN INITIAL;
  1034. STRING_VALUE(_FREETEXT,FALSE,FALSE);
  1035. }
  1036. <infreefreetext>{or}\{ {
  1037. yyless(yyleng-3); // as above, with an open brace as the delimiter
  1038. BEGIN INITIAL;
  1039. STRING_VALUE(_FREETEXT,FALSE,FALSE);
  1040. }
  1041. <infreefreetext>{near}[ ] {
  1042. yaccDebugOut(( DEB_ITRACE, "{infreefreetext}{near}[ ]\n" ));
  1043. yyless(yyleng-5); // return the last 5 characters (presumably the keyword plus its blank)
  1044. fTreatFreetextAsPhrase = TRUE; // NOTE(review): consumed outside this section - presumably makes the text after near a phrase as well; confirm
  1045. BEGIN INITIAL;
  1046. STRING_VALUE(_PHRASE,FALSE,FALSE); // text before near is returned as _PHRASE rather than _FREETEXT
  1047. }
  1048. <infreefreetext>{near}\{ {
  1049. yaccDebugOut(( DEB_ITRACE, "{infreefreetext}{near}{\n" ));
  1050. yyless(yyleng-5); // as above, with an open brace as the delimiter
  1051. fTreatFreetextAsPhrase = TRUE;
  1052. BEGIN INITIAL;
  1053. STRING_VALUE(_PHRASE,FALSE,FALSE);
  1054. }
  1055. <infreefreetext>\{{near}[ ] {
  1056. yaccDebugOut(( DEB_ITRACE, "{infreefreetext}{{near}\n" ));
  1057. yyless(yyleng-6); // return 6 characters: the leading open brace, the keyword and the blank (presumed lengths)
  1058. fTreatFreetextAsPhrase = TRUE;
  1059. BEGIN INITIAL;
  1060. STRING_VALUE(_PHRASE,FALSE,FALSE);
  1061. }
  1062. <infreefreetext>\{{near}\{ {
  1063. yaccDebugOut(( DEB_ITRACE, "{infreefreetext}{{near}{\n" ));
  1064. yyless(yyleng-6); // as above, with an open brace as the trailing delimiter
  1065. fTreatFreetextAsPhrase = TRUE;
  1066. BEGIN INITIAL;
  1067. STRING_VALUE(_PHRASE,FALSE,FALSE);
  1068. }
  1069. <infreefreetext>& {
  1070. yyless(yyleng-1); // return the ampersand to the input
  1071. BEGIN INITIAL; // the operator is rescanned in INITIAL
  1072. STRING_VALUE(_FREETEXT,FALSE,FALSE); // returns the text gathered before it, unless blank
  1073. }
  1074. <infreefreetext>\| {
  1075. yyless(yyleng-1); // return the pipe to the input
  1076. BEGIN INITIAL;
  1077. STRING_VALUE(_FREETEXT,FALSE,FALSE);
  1078. }
  1079. <infreefreetext>~ {
  1080. yyless(yyleng-1); // return the tilde to the input
  1081. fTreatFreetextAsPhrase = TRUE; // same flag the near keyword rules set
  1082. BEGIN INITIAL;
  1083. STRING_VALUE(_PHRASE,FALSE,FALSE); // text before the tilde is returned as _PHRASE rather than _FREETEXT
  1084. }
  1085. <infreefreetext>\( {
  1086. yyless(yyleng-1); // return the open paren to the input
  1087. BEGIN INITIAL;
  1088. STRING_VALUE(_FREETEXT,FALSE,FALSE);
  1089. }
  1090. <infreefreetext>\) {
  1091. yyless(yyleng-1); // return the close paren to the input
  1092. BEGIN INITIAL;
  1093. STRING_VALUE(_FREETEXT,FALSE,FALSE);
  1094. }
  1095. <infreefreetext>\{ {
  1096. yyless(yyleng-1); // return the open brace to the input
  1097. BEGIN INITIAL;
  1098. STRING_VALUE(_FREETEXT,FALSE,FALSE);
  1099. }
  1100. <infreefreetext>\"[^"]+\" {
  1101. BEGIN INITIAL; // a non-empty quoted string ends the free text section
  1102. STRING_VALUE(_FREETEXT,FALSE,FALSE); // the match still contains its quotes and fQuote is passed as FALSE
  1103. }
  1104. <infreefreetext>[^~&|{}()" ]+[ ] { yymore(); }
  1105. <infreefreetext>[^~&|{}()" ]+ {
  1106. BEGIN INITIAL; // reached when the run is not followed by a blank; otherwise the longer yymore rule above wins
  1107. STRING_VALUE(_FREETEXT,FALSE,FALSE); // returns everything accumulated so far, unless blank
  1108. }
  1109. %{//
  1110. // SHORTGEN: * or ** processing
  1111. // Two stars return _SHGENINFLECT, one star returns _SHGENPREFIX; both rules switch back to INITIAL.
  1112. // can only get here by backing up over *,
  1113. // so we will always find a match %}
  1114. <shortgen>\*\* {
  1115. BEGIN INITIAL;
  1116. TOKEN(_SHGENINFLECT); // two stars; the longer match wins over the single star rule below
  1117. }
  1118. <shortgen>\* {
  1119. BEGIN INITIAL;
  1120. TOKEN(_SHGENPREFIX); // a single star
  1121. }
  1122. %{//
  1123. // SHORTREGEX: #propname processing
  1124. //
  1125. // can only get here when #"propname" or #propname
  1126. // (quoted or unquoted) version is detected.
  1127. // NOTE: and, or need to be localized
  1128. // NOTE: It doesn't make sense to have the near operator following
  1129. // a regular expression. A regex is Boolean and doesn't evaluate
  1130. // to a position value.
  1131. // Regex text accumulates via yymore. At an operator the operator is returned to the input for INITIAL,
  1132. // and for most operators the text gathered before it is returned as _REGEX if it is not blank.
  1133. %}
  1134. <shortregex>[ ]+ { yymore(); } // keep the blanks in yytext and append the next match
  1135. <shortregex>= {
  1136. // ignore equal operators...
  1137. BEGIN shortregex; // already in shortregex, so the start condition does not change
  1138. }
  1139. <shortregex>\"[^"]*\" { STRING_VALUE(_REGEX, FALSE, TRUE);}
  1140. <shortregex>{and}[ ] {
  1141. fContinueRegex = TRUE; // NOTE(review): consumed outside this section - presumably lets INITIAL resume regex scanning after the operator; confirm
  1142. yyless(yyleng-4); // return the last 4 characters to the input (presumably the keyword plus its blank)
  1143. BEGIN INITIAL;
  1144. STRING_VALUE(_REGEX,FALSE,FALSE); // returns _REGEX unless what is left in yytext is blank
  1145. }
  1146. <shortregex>{or}[ ] {
  1147. fContinueRegex = TRUE;
  1148. yyless(yyleng-3); // return the last 3 characters (presumably the keyword plus its blank)
  1149. BEGIN INITIAL;
  1150. STRING_VALUE(_REGEX,FALSE,FALSE);
  1151. }
  1152. <shortregex>{not}[ ] {
  1153. yyless(yyleng-4); // return the last 4 characters (presumably the keyword plus its blank)
  1154. // The only valid way to get here is to
  1155. // have seen "and" before. Do not recognize
  1156. // a regex. Back off and let the lexer take its
  1157. // normal course.
  1158. fContinueRegex = TRUE;
  1159. BEGIN INITIAL; // no STRING_VALUE here, so no token is returned for any text before the keyword
  1160. }
  1161. <shortregex>& {
  1162. fContinueRegex = TRUE;
  1163. yyless(yyleng-1); // return the ampersand to the input
  1164. BEGIN INITIAL;
  1165. STRING_VALUE(_REGEX,FALSE,FALSE);
  1166. }
  1167. <shortregex>\| {
  1168. fContinueRegex = TRUE;
  1169. yyless(yyleng-1); // return the pipe to the input
  1170. BEGIN INITIAL;
  1171. STRING_VALUE(_REGEX,FALSE,FALSE);
  1172. }
  1173. <shortregex>! {
  1174. yyless(yyleng-1); // return the exclamation mark to the input
  1175. // The only valid way to get here is to
  1176. // have seen "and" before. Do not recognize
  1177. // a regex. Back off and let the lexer take its
  1178. // normal course.
  1179. fContinueRegex = TRUE;
  1180. BEGIN INITIAL; // no STRING_VALUE here, so no token is returned for any text before the operator
  1181. }
  1182. %{
  1183. // When we find an operator we should treat it as one.
  1184. // So backup and get out if you see one.
  1185. // Normally '^' is treated as part of an operator (e.g. ^a), but it also
  1186. // has a special meaning in regular expression syntax. So we will have to
  1187. // let it through when it is part of a regular expression. As an alternative,
  1188. // we can allow '^' in regular expression in a limited manner (i.e. only the use
  1189. // in square brackets to exclude the set of chars "[^abc]" where abc are excluded).
  1190. // This alternative will let the common case use of '^' in a regular expression
  1191. // while allowing it to be treated as part of an operator when it doesn't
  1192. // occur immediately after a '['.
  1193. // We are implementing the alternative here because our regex capability
  1194. // only allows for the "[^" construct.
  1195. %}
  1196. <shortregex>[\^<>@$#] {
  1197. yyless(yyleng-1); // return the operator character to the input
  1198. fContinueRegex = FALSE; // unlike the other operator rules, which set it TRUE
  1199. BEGIN INITIAL; // no STRING_VALUE here, so no token is returned for any text before the operator
  1200. }
  1201. <shortregex>\( {
  1202. yyless(yyleng-1); // return the open paren to the input
  1203. BEGIN INITIAL;
  1204. STRING_VALUE(_REGEX,FALSE,FALSE); // returns the text gathered before it, unless blank; fContinueRegex is left untouched
  1205. }
  1206. <shortregex>\) {
  1207. yyless(yyleng-1); // return the close paren to the input
  1208. BEGIN INITIAL;
  1209. STRING_VALUE(_REGEX,FALSE,FALSE);
  1210. }
  1211. <shortregex>\{ {
  1212. yyless(yyleng-1); // return the open brace to the input
  1213. BEGIN INITIAL;
  1214. STRING_VALUE(_REGEX,FALSE,FALSE);
  1215. }
  1216. <shortregex>(([^~&|{}()\^<>!@$#= ])*(\|[()\[{}\],*?+])*(\|\[\^)*([^~&|{}()\^<>!@$#= ])*)+[ ] { yymore(); } // run of ordinary characters and pipe-prefixed specials followed by a blank: keep accumulating
  1217. <shortregex>(([^~&|{}()\^<>!@$#= ])*(\|[()\[{}\],*?+])*(\|\[\^)*([^~&|{}()\^<>!@$#= ])*)+ {
  1218. fContinueRegex = TRUE;
  1219. BEGIN INITIAL; // reached when the run is not followed by a blank; otherwise the longer yymore rule above wins
  1220. STRING_VALUE(_REGEX,FALSE,FALSE); // returns everything accumulated so far, unless blank
  1221. }
  1222. <mayberegex>{and}[ ] {
  1223. yyless(yyleng-4); // return the last 4 characters to the input (presumably the keyword plus its blank; the and definition is not visible here)
  1224. fContinueMaybeRegex = TRUE; // NOTE(review): consumed outside this section - presumably lets INITIAL resume mayberegex scanning after the operator; confirm
  1225. BEGIN INITIAL; // the returned text is rescanned in INITIAL
  1226. STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE); // returns _PHRASEORREGEX unless what is left in yytext is blank
  1227. }
  1228. <mayberegex>{or}[ ] {
  1229. yyless(yyleng-3); // return the last 3 characters (presumably the keyword plus its blank)
  1230. fContinueMaybeRegex = TRUE;
  1231. BEGIN INITIAL;
  1232. STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
  1233. }
  1234. <mayberegex>{not}[ ] {
  1235. yyless(yyleng-4); // return the last 4 characters (presumably the keyword plus its blank)
  1236. // The only valid way to get here is to
  1237. // have seen "and" before. Do not recognize
  1238. // a regex. Back off and let the lexer take its
  1239. // normal course.
  1240. fContinueMaybeRegex = TRUE;
  1241. BEGIN INITIAL; // no STRING_VALUE here, so no token is returned for any text before the keyword
  1242. }
  1243. <mayberegex>& {
  1244. fContinueMaybeRegex = TRUE;
  1245. yyless(yyleng-1); // return the ampersand to the input
  1246. BEGIN INITIAL;
  1247. STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
  1248. }
  1249. <mayberegex>\| {
  1250. fContinueMaybeRegex = TRUE;
  1251. yyless(yyleng-1); // return the pipe to the input
  1252. BEGIN INITIAL;
  1253. STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
  1254. }
  1255. <mayberegex>! {
  1256. yyless(yyleng-1); // return the exclamation mark to the input
  1257. // The only valid way to get here is to
  1258. // have seen "and" before. Do not recognize
  1259. // a phrase or regex. Back off and let the lexer take its
  1260. // normal course.
  1261. fContinueMaybeRegex = TRUE;
  1262. BEGIN INITIAL; // no STRING_VALUE here, so no token is returned for any text before the operator
  1263. }
  1264. <mayberegex>\( {
  1265. yyless(yyleng-1); // return the open paren to the input
  1266. BEGIN INITIAL;
  1267. STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE); // fContinueMaybeRegex is left untouched by the paren and brace rules
  1268. }
  1269. <mayberegex>\) {
  1270. yyless(yyleng-1); // return the close paren to the input
  1271. BEGIN INITIAL;
  1272. STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
  1273. }
  1274. <mayberegex>\{ {
  1275. yyless(yyleng-1); // return the open brace to the input
  1276. BEGIN INITIAL;
  1277. STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
  1278. }
  1279. <mayberegex>[ ]+ { yymore(); } // keep the blanks in yytext and append the next match
  1280. <mayberegex>\"[^"]*\" { STRING_VALUE(_PHRASE, FALSE, TRUE);}
  1281. <mayberegex>(([^~&|{}()\^<>!@$# ])*(\|[()\[{}\],*?+])*(\|\[\^)*([^~&|{}()\^<>!@$# ])*)+[ ] { yymore(); } // same shape as the shortregex run, except the equals sign is an ordinary character here; a trailing blank means keep accumulating
  1282. <mayberegex>(([^~&|{}()\^<>!@$# ])*(\|[()\[{}\],*?+])*(\|\[\^)*([^~&|{}()\^<>!@$# ])*)+ {
  1283. fContinueMaybeRegex = TRUE;
  1284. BEGIN INITIAL; // reached when the run is not followed by a blank; otherwise the longer yymore rule above wins
  1285. STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE); // returns everything accumulated so far, unless blank
  1286. }
  1287. %{
  1288. // When we find an operator at the start of a phrase,
  1289. // we should treat it as one. So backup and get out if you see one.
  1290. %}
  1291. <mayberegex>[\^<>@$#] {
  1292. yyless(yyleng-1); // return the operator character to the input
  1293. fContinueMaybeRegex = FALSE; // unlike the other operator rules, which set it TRUE
  1294. BEGIN INITIAL; // no STRING_VALUE here, so no token is returned for any text before the operator
  1295. }
  1296. %{//
  1297. // IMPLICITPHRASE: Where phrase is implied.
  1298. // Phrase text accumulates via yymore; at an operator the operator is returned to the input and the text before it is returned as _PHRASE if not blank.
  1299. // can only get here when @propname or {prop name = propname} is detected.
  1300. // NOTE: and, or, not need to be localized when time permits.
  1301. // (The innear closing-brace rule also switches to this start condition.)
  1302. // NTRAID#DB-NTBUG9-84571-2000/07/31-dlee Indexing Service tripolish2 query expressions misinterpreted as strings
  1303. // if expression has trailing blanks, we'll emit a string value
  1304. %}
  1305. <implicitphrase>\"[^"]*\" {
  1306. fContinueImplicitPhrase = FALSE; // a quoted string clears the flag, the keyword rules below set it
  1307. BEGIN INITIAL;
  1308. STRING_VALUE(_PHRASE, FALSE, TRUE); // the only rule in this section that passes fQuote TRUE
  1309. }
  1310. <implicitphrase>[ ]+ { yymore(); } // keep the blanks in yytext and append the next match
  1311. <implicitphrase>{and}[ ] {
  1312. yyless(yyleng-4); // return the last 4 characters to the input (presumably the keyword plus its blank; the and definition is not visible here)
  1313. fContinueImplicitPhrase = TRUE; // NOTE(review): consumed outside this section - presumably lets INITIAL resume implicit phrase scanning after the operator; confirm
  1314. BEGIN INITIAL; // the returned text is rescanned in INITIAL
  1315. STRING_VALUE(_PHRASE,FALSE,FALSE); // returns _PHRASE unless what is left in yytext is blank
  1316. }
  1317. <implicitphrase>{or}[ ] {
  1318. yyless(yyleng-3); // return the last 3 characters (presumably the keyword plus its blank)
  1319. fContinueImplicitPhrase = TRUE;
  1320. BEGIN INITIAL;
  1321. STRING_VALUE(_PHRASE,FALSE,FALSE);
  1322. }
  1323. <implicitphrase>{near}[ ] {
  1324. yyless(yyleng-5); // return the last 5 characters (presumably the keyword plus its blank)
  1325. // We want to treat the following token as a phrase
  1326. fContinueImplicitPhrase = TRUE;
  1327. BEGIN INITIAL;
  1328. STRING_VALUE(_PHRASE,FALSE,FALSE);
  1329. }
  1330. <implicitphrase>{near}\{ {
  1331. yyless(yyleng-5); // as above, with an open brace as the delimiter
  1332. // We want to treat the following token as a phrase
  1333. fContinueImplicitPhrase = TRUE;
  1334. BEGIN INITIAL;
  1335. STRING_VALUE(_PHRASE,FALSE,FALSE);
  1336. }
  1337. <implicitphrase>{not}[ ] {
  1338. yyless(yyleng-4); // return the last 4 characters (presumably the keyword plus its blank)
  1339. // The only valid way to get here is to
  1340. // have seen "and" before. Do not recognize
  1341. // a phrase. Back off and let the lexer take its
  1342. // normal course.
  1343. fContinueImplicitPhrase = TRUE;
  1344. BEGIN INITIAL; // no STRING_VALUE here, so no token is returned for any text before the keyword
  1345. }
  1346. <implicitphrase>& {
  1347. yyless(yyleng-1); // return the ampersand to the input
  1348. fContinueImplicitPhrase = TRUE;
  1349. BEGIN INITIAL; // the operator is rescanned in INITIAL
  1350. STRING_VALUE(_PHRASE,FALSE,FALSE); // returns the text gathered before it, unless blank
  1351. }
  1352. <implicitphrase>~ {
  1353. yyless(yyleng-1); // return the tilde to the input
  1354. // We want to treat the following token as a phrase
  1355. fContinueImplicitPhrase = TRUE;
  1356. BEGIN INITIAL;
  1357. STRING_VALUE(_PHRASE,FALSE,FALSE);
  1358. }
  1359. <implicitphrase>! {
  1360. yyless(yyleng-1); // return the exclamation mark to the input
  1361. // The only valid way to get here is to
  1362. // have seen "and" before. Do not recognize
  1363. // a phrase. Back off and let the lexer take its
  1364. // normal course.
  1365. fContinueImplicitPhrase = TRUE;
  1366. BEGIN INITIAL; // no STRING_VALUE here, so no token is returned for any text before the operator
  1367. }
  1368. <implicitphrase>\| {
  1369. yyless(yyleng-1); // return the pipe to the input
  1370. fContinueImplicitPhrase = TRUE;
  1371. BEGIN INITIAL;
  1372. STRING_VALUE(_PHRASE,FALSE,FALSE);
  1373. }
  1374. <implicitphrase>\( {
  1375. yyless(yyleng-1); // return the open paren to the input
  1376. fContinueImplicitPhrase = FALSE; // both paren rules clear the flag
  1377. BEGIN INITIAL;
  1378. STRING_VALUE(_PHRASE,FALSE,FALSE);
  1379. }
  1380. <implicitphrase>\) {
  1381. yyless(yyleng-1); // return the close paren to the input
  1382. fContinueImplicitPhrase = FALSE;
  1383. BEGIN INITIAL;
  1384. STRING_VALUE(_PHRASE,FALSE,FALSE);
  1385. }
  1386. <implicitphrase>\{ {
  1387. yyless(yyleng-1); // return the open brace to the input
  1388. fContinueImplicitPhrase = TRUE; // unlike the paren rules, the open brace keeps the flag set
  1389. BEGIN INITIAL;
  1390. STRING_VALUE(_PHRASE,FALSE,FALSE);
  1391. }
  1392. <implicitphrase>{contains}[ ] {
  1393. yyless(yyleng-9); // return the last 9 characters (presumably the keyword plus its blank; the contains definition is not visible here)
  1394. fContinueImplicitPhrase = TRUE;
  1395. BEGIN INITIAL;
  1396. STRING_VALUE(_PHRASE,FALSE,FALSE);
  1397. }
  1398. %{
  1399. // When we find an operator at the start of an implicit phrase,
  1400. // we should treat it as one. So backup and get out if you see one.
  1401. %}
  1402. <implicitphrase>[\^<>@$#] {
  1403. yyless(yyleng-1); // return the operator character to the input
  1404. fContinueImplicitPhrase = FALSE;
  1405. BEGIN INITIAL; // no STRING_VALUE here, so no token is returned for any text before the operator
  1406. }
  1407. %{
  1408. // Tripolish2 uses = to indicate that whatever appears after it may
  1409. // be using wildcards. Implement that here.
  1410. %}
  1411. <implicitphrase>= {
  1412. yyless(yyleng-1); // return the equals sign to the input
  1413. fContinueMaybeRegex = TRUE; // NOTE(review): sets the mayberegex flag, not the implicit phrase one - presumably INITIAL then hands the text after the equals sign to mayberegex; confirm
  1414. BEGIN INITIAL; // no STRING_VALUE here, so no token is returned for any text before the equals sign
  1415. }
  1416. <implicitphrase>[^~&|{}()\^<>=!@$# ]+[ ] { yymore(); } // word followed by a blank: keep accumulating
  1417. <implicitphrase>[^~&|{}()\^<>=!@$# ]+ {
  1418. fContinueImplicitPhrase = TRUE;
  1419. BEGIN INITIAL; // reached when the word is not followed by a blank; otherwise the longer yymore rule above wins
  1420. STRING_VALUE(_PHRASE,FALSE,FALSE); // returns everything accumulated so far, unless blank
  1421. }