Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1332 lines
36 KiB

  1. //+---------------------------------------------------------------------------
  2. //
  3. // Copyright (C) 1994-1998, Microsoft Corporation.
  4. //
  5. // File: SCANNER.CXX
  6. //
  7. // Contents: Implementation of CQueryScanner
  8. //
  9. // History: 22-May-92 AmyA Created.
  10. // 23-Jun-92 MikeHew Added weight token recognition.
  11. // 17-May-94 t-jeffc Added error info and reg ex support.
  12. //
  13. //----------------------------------------------------------------------------
  14. #include <pch.cxx>
  15. #pragma hdrstop
  16. //+---------------------------------------------------------------------------
  17. //
  18. // Member: CQueryScanner::CQueryScanner, public
  19. //
  20. // Synopsis: Create a scanner from a string.
  21. //
  22. // Arguments: [buffer] -- the string to be scanned.
  23. // [fLookForTextualKeywords] -- TRUE if the scanner should
  24. // look for "and/or/not/near" in
  25. // text form.
  26. // [lcid] -- language for and/or/not/near detection
  27. // [fTreatPlusAsToken] -- TRUE if the scanner should treat the
  28. // '+' character as a token (used
  29. // in GroupBy parsing)
  30. //
  31. // Notes: This string is not copied, so the scanner does not own it.
  32. // If the string is changed outside of the scanner, it will
  33. // affect the information that is returned.
  34. //
  35. // History: 30-Apr-92 AmyA Created
  36. //
  37. //----------------------------------------------------------------------------
  38. CQueryScanner::CQueryScanner(
  39. WCHAR const * buffer,
  40. BOOL fLookForTextualKeywords,
  41. LCID lcid,
  42. BOOL fTreatPlusAsToken )
  43. : _text( buffer ),
  44. _pBuf( buffer ),
  45. _pLookAhead( buffer ),
  46. _fLookForTextualKeywords( fLookForTextualKeywords ),
  47. _fTreatPlusAsToken( fTreatPlusAsToken ),
  48. _lcid( lcid )
  49. {
  50. Accept();
  51. }
  52. //+---------------------------------------------------------------------------
  53. //
  54. // Member: CQueryScanner::AcceptWord, public
  55. //
  56. // Synopsis: Consumes a single word out of a phrase
  57. //
  58. // Requires: Should be called after AcqWord
  59. //
  60. // History: 15-Sep-92 BartoszM Created
  61. //
  62. //----------------------------------------------------------------------------
  63. void CQueryScanner::AcceptWord()
  64. {
  65. _pLookAhead = _text;
  66. Accept();
  67. }
  68. //+---------------------------------------------------------------------------
  69. //
  70. // Member: CQueryScanner::AcceptColumn, public
  71. //
  72. // Synopsis: Consumes a column name out of a phrase
  73. //
  74. // Requires: Should be called after AcqColumn
  75. //
  76. // History: 15-Sep-92 BartoszM Created
  77. //
  78. //----------------------------------------------------------------------------
  79. void CQueryScanner::AcceptColumn()
  80. {
  81. AcceptWord();
  82. }
  83. struct SStringToken
  84. {
  85. WCHAR * pwcToken;
  86. unsigned cwc;
  87. Token token;
  88. };
  89. static SStringToken s_EnglishStringTokens[] =
  90. {
  91. { L"AND", (sizeof L"AND" / sizeof WCHAR) - 1, AND_TOKEN },
  92. { L"OR", (sizeof L"OR" / sizeof WCHAR) - 1, OR_TOKEN },
  93. { L"NOT", (sizeof L"NOT" / sizeof WCHAR) - 1, NOT_TOKEN },
  94. { L"NEAR", (sizeof L"NEAR" / sizeof WCHAR) - 1, PROX_TOKEN },
  95. };
  96. static SStringToken s_GermanStringTokens[] =
  97. {
  98. { L"UND", (sizeof L"UND" / sizeof WCHAR) - 1, AND_TOKEN },
  99. { L"ODER", (sizeof L"ODER" / sizeof WCHAR) - 1, OR_TOKEN },
  100. { L"NICHT", (sizeof L"NICHT" / sizeof WCHAR) - 1, NOT_TOKEN },
  101. { L"NAH", (sizeof L"NAH" / sizeof WCHAR) - 1, PROX_TOKEN },
  102. };
  103. static SStringToken s_FrenchStringTokens[] =
  104. {
  105. { L"ET", (sizeof L"ET" / sizeof WCHAR) - 1, AND_TOKEN },
  106. { L"OU", (sizeof L"OU" / sizeof WCHAR) - 1, OR_TOKEN },
  107. { L"SANS", (sizeof L"SANS" / sizeof WCHAR) - 1, NOT_TOKEN },
  108. { L"PRES", (sizeof L"PRES" / sizeof WCHAR) - 1, PROX_TOKEN },
  109. };
  110. static SStringToken s_SpanishStringTokens[] =
  111. {
  112. { L"Y", (sizeof L"Y" / sizeof WCHAR) - 1, AND_TOKEN },
  113. { L"O", (sizeof L"O" / sizeof WCHAR) - 1, OR_TOKEN },
  114. { L"NO", (sizeof L"NO" / sizeof WCHAR) - 1, NOT_TOKEN },
  115. { L"CERCA", (sizeof L"CERCA" / sizeof WCHAR) - 1, PROX_TOKEN },
  116. };
  117. static SStringToken s_DutchStringTokens[] =
  118. {
  119. { L"EN", (sizeof L"EN" / sizeof WCHAR) - 1, AND_TOKEN },
  120. { L"OF", (sizeof L"OF" / sizeof WCHAR) - 1, OR_TOKEN },
  121. { L"NIET", (sizeof L"NIET" / sizeof WCHAR) - 1, NOT_TOKEN },
  122. { L"NABIJ", (sizeof L"NABIJ" / sizeof WCHAR) - 1, PROX_TOKEN },
  123. };
  124. static WCHAR aSwedishNear[] = { L'N', 0xc4, L'R', L'A', 0 };
  125. static SStringToken s_SwedishStringTokens[] =
  126. {
  127. { L"OCH", (sizeof L"OCH" / sizeof WCHAR) - 1, AND_TOKEN },
  128. { L"ELLER", (sizeof L"ELLER" / sizeof WCHAR) - 1, OR_TOKEN },
  129. { L"INTE", (sizeof L"INTE" / sizeof WCHAR) - 1, NOT_TOKEN },
  130. { aSwedishNear, 4, PROX_TOKEN },
  131. };
  132. static SStringToken s_ItalianStringTokens[] =
  133. {
  134. { L"E", (sizeof L"E" / sizeof WCHAR) - 1, AND_TOKEN },
  135. { L"O", (sizeof L"O" / sizeof WCHAR) - 1, OR_TOKEN },
  136. { L"NO", (sizeof L"NO" / sizeof WCHAR) - 1, NOT_TOKEN },
  137. { L"VICINO", (sizeof L"VICINO" / sizeof WCHAR) - 1, PROX_TOKEN },
  138. };
  139. const unsigned cStringTokens = sizeof(s_EnglishStringTokens) /
  140. sizeof(s_EnglishStringTokens[0]);
  141. #define WORD_STR L"{}!&|~*@#()[],\t=<>\n\"^ "
  142. //+---------------------------------------------------------------------------
  143. //
  144. // Function: InternalFindStringToken
  145. //
  146. // Synopsis: Looks for a textual token in plain text.
  147. //
  148. // Arguments: [pwcIn] -- string to search
  149. // [token] -- returns the token found
  150. // [cwc] -- returns length of token found
  151. // [pTokens] -- token array to use
  152. //
  153. // Returns: Pointer to token or 0 if none was found
  154. //
  155. // History: 08-Feb-96 dlee created
  156. //
  157. //----------------------------------------------------------------------------
  158. WCHAR * InternalFindStringToken(
  159. WCHAR * pwcIn,
  160. Token & token,
  161. unsigned & cwc,
  162. SStringToken * pTokens )
  163. {
  164. // for each of and/or/not/near
  165. WCHAR *pwcOut = 0;
  166. for ( unsigned i = 0; i < cStringTokens; i++ )
  167. {
  168. WCHAR *pwcStr = wcsstr( pwcIn, pTokens[i].pwcToken );
  169. while ( pwcStr )
  170. {
  171. // found a match -- does it have white space on either side?
  172. WCHAR wcBeyond = * (pwcStr + pTokens[i].cwc);
  173. if ( ( ( 0 == wcBeyond ) ||
  174. ( wcschr( WORD_STR, wcBeyond ) ) ) &&
  175. ( ( pwcStr == pwcIn ) ||
  176. ( iswspace( * ( pwcStr - 1 ) ) ) ) )
  177. {
  178. // if the first match found or the match closest to the
  179. // beginning of the string, use it.
  180. if ( ( 0 == pwcOut ) ||
  181. ( pwcStr < pwcOut ) )
  182. {
  183. pwcOut = pwcStr;
  184. token = pTokens[i].token;
  185. cwc = pTokens[i].cwc;
  186. }
  187. break;
  188. }
  189. pwcStr = wcsstr( pwcStr + 1, pTokens[i].pwcToken );
  190. }
  191. }
  192. return pwcOut;
  193. } //InternalFindStringToken
  194. SStringToken * GetStringTokenArray(
  195. LCID lcid )
  196. {
  197. SStringToken *pTokens;
  198. switch ( PRIMARYLANGID( LANGIDFROMLCID( lcid ) ) )
  199. {
  200. case LANG_GERMAN :
  201. pTokens = s_GermanStringTokens;
  202. break;
  203. case LANG_FRENCH :
  204. pTokens = s_FrenchStringTokens;
  205. break;
  206. case LANG_SPANISH :
  207. pTokens = s_SpanishStringTokens;
  208. break;
  209. case LANG_DUTCH :
  210. pTokens = s_DutchStringTokens;
  211. break;
  212. case LANG_SWEDISH :
  213. pTokens = s_SwedishStringTokens;
  214. break;
  215. case LANG_ITALIAN :
  216. pTokens = s_ItalianStringTokens;
  217. break;
  218. case LANG_NEUTRAL :
  219. case LANG_ENGLISH :
  220. default :
  221. pTokens = s_EnglishStringTokens;
  222. break;
  223. }
  224. Win4Assert( 0 != pTokens );
  225. return pTokens;
  226. }
  227. //+---------------------------------------------------------------------------
  228. //
  229. // Function: FindStringToken
  230. //
  231. // Synopsis: Looks for a textual token in plain text. Always tries
  232. // English, tries a different language depending on _lcid.
  233. //
  234. // Arguments: [pwcIn] -- string to search
  235. // [token] -- returns the token found
  236. // [cwc] -- returns length of token found
  237. //
  238. // Returns: Pointer to token or 0 if none was found
  239. //
  240. // History: 08-Feb-96 dlee created
  241. //
  242. //----------------------------------------------------------------------------
  243. WCHAR * CQueryScanner::FindStringToken(
  244. WCHAR * pwcIn,
  245. Token & token,
  246. unsigned & cwc )
  247. {
  248. SStringToken * pTokens = GetStringTokenArray( _lcid );
  249. WCHAR * pwcToken = InternalFindStringToken( pwcIn, token, cwc, pTokens );
  250. // if the search above wasn't in English, try English too.
  251. if ( pTokens != s_EnglishStringTokens )
  252. {
  253. unsigned cwcEnglish;
  254. Token tokenEnglish;
  255. WCHAR * pwcEnglish = InternalFindStringToken( pwcIn,
  256. tokenEnglish,
  257. cwcEnglish,
  258. s_EnglishStringTokens );
  259. // If there is no language-specific match or the English match
  260. // occurs before the language-specific match, use the English
  261. // match.
  262. if ( ( 0 != pwcEnglish ) &&
  263. ( ( 0 == pwcToken ) || ( pwcEnglish < pwcToken ) ) )
  264. {
  265. pwcToken = pwcEnglish;
  266. token = tokenEnglish;
  267. cwc = cwcEnglish;
  268. }
  269. }
  270. return pwcToken;
  271. } //FindStringToken
  272. //+---------------------------------------------------------------------------
  273. //
  274. // Member: CQueryScanner::Accept, public
  275. //
  276. // Synopsis: Determines what the next token is. Will advance _pLookAhead
  277. // over the next token and white space.
  278. //
  279. // Notes: There are five different types of TEXT_TOKENS, Phrase, Path,
  280. // Number, Column and Command. Since the length of the token
  281. // depends on which token it is, _pLookAhead is forwarded to the
  282. // end of the longest, and _text is used to parse the token in the
  283. // various Acq and Get methods.
  284. //
  285. // History: 30-Apr-92 AmyA Created
  286. // 19-May-92 AmyA Added Guid hack
  287. // 23-Jun-92 MikeHew Added weight token recognition.
  288. // 26-May-94 t-jeffc Added more tokens; rearranged to
  289. // support parsing errors
  290. //
  291. //----------------------------------------------------------------------------
  292. void CQueryScanner::Accept()
  293. {
  294. EatWhiteSpace();
  295. _text = _pLookAhead;
  296. switch ( *_pLookAhead )
  297. {
  298. case '&':
  299. _pLookAhead++;
  300. _token = AND_TOKEN;
  301. break;
  302. case '*':
  303. _pLookAhead++;
  304. if ( *_pLookAhead == '*' )
  305. {
  306. _token = FUZ2_TOKEN;
  307. _pLookAhead++;
  308. }
  309. else
  310. _token = FUZZY_TOKEN;
  311. break;
  312. case '=':
  313. _pLookAhead++;
  314. _token = EQUAL_TOKEN;
  315. break;
  316. case '<':
  317. _pLookAhead++;
  318. if ( *_pLookAhead == '=' )
  319. {
  320. _token = LESS_EQUAL_TOKEN;
  321. _pLookAhead++;
  322. }
  323. else
  324. _token = LESS_TOKEN;
  325. break;
  326. case '>':
  327. _pLookAhead++;
  328. if ( *_pLookAhead == '=' )
  329. {
  330. _token = GREATER_EQUAL_TOKEN;
  331. _pLookAhead++;
  332. }
  333. else
  334. _token = GREATER_TOKEN;
  335. break;
  336. case '!':
  337. _pLookAhead++;
  338. if ( *_pLookAhead == '=' )
  339. {
  340. _token = NOT_EQUAL_TOKEN;
  341. _pLookAhead++;
  342. }
  343. else
  344. {
  345. _token = NOT_TOKEN;
  346. }
  347. break;
  348. case '|':
  349. _pLookAhead++;
  350. _token = OR_TOKEN;
  351. break;
  352. case '~':
  353. _pLookAhead++;
  354. _token = PROX_TOKEN;
  355. break;
  356. case '@':
  357. _pLookAhead++;
  358. _token = PROP_TOKEN;
  359. break;
  360. case '#':
  361. _pLookAhead++;
  362. _token = PROP_REGEX_TOKEN;
  363. break;
  364. case '(':
  365. _pLookAhead++;
  366. _token = OPEN_TOKEN;
  367. break;
  368. case ')':
  369. _pLookAhead++;
  370. _token = CLOSE_TOKEN;
  371. break;
  372. case '[':
  373. _pLookAhead++;
  374. _token = W_OPEN_TOKEN;
  375. break;
  376. case ']':
  377. _pLookAhead++;
  378. _token = W_CLOSE_TOKEN;
  379. break;
  380. case ',':
  381. _pLookAhead++;
  382. _token = COMMA_TOKEN;
  383. break;
  384. case '\0':
  385. case 0x1A: // CTRL-Z
  386. _token = EOS_TOKEN;
  387. break;
  388. case '"':
  389. _pLookAhead++;
  390. _token = QUOTES_TOKEN;
  391. break;
  392. case '$':
  393. _pLookAhead++;
  394. _token = PROP_NATLANG_TOKEN;
  395. break;
  396. case '{':
  397. _pLookAhead++;
  398. _token = C_OPEN_TOKEN;
  399. break;
  400. case '}':
  401. _pLookAhead++;
  402. _token = C_CLOSE_TOKEN;
  403. break;
  404. case '^':
  405. {
  406. WCHAR wc = *(_pLookAhead + 1);
  407. BOOL fOk = TRUE;
  408. if (L'a' == wc) // all bits
  409. _token = ALLOF_TOKEN;
  410. else if (L's' == wc) // some bits
  411. _token = SOMEOF_TOKEN;
  412. else
  413. fOk = FALSE;
  414. if (fOk)
  415. {
  416. _pLookAhead += 2;
  417. break;
  418. }
  419. }
  420. // FALL THROUGH
  421. case '+':
  422. if (*_pLookAhead == L'+' && _fTreatPlusAsToken)
  423. {
  424. _pLookAhead++;
  425. _token = PLUS_TOKEN;
  426. break;
  427. }
  428. // FALL THROUGH
  429. default:
  430. {
  431. // forwards pwcEnd over anything that could be in a phrase,
  432. // which is the most inclusive of the TEXT_TOKENs.
  433. // (except, for regex's and phrases in quotes - but they're
  434. // handled separately)
  435. WCHAR const *pwcEnd = _text + wcscspn( _text, PHRASE_STR );
  436. if ( _fLookForTextualKeywords )
  437. {
  438. unsigned cwc = (unsigned) ( pwcEnd - _text );
  439. cwc = __min( cwc, MAX_PATH * 2 );
  440. // if a textual keyword is beyond 500 chars in the string,
  441. // blow it off -- the workaround is to use the '&|~' version.
  442. WCHAR awcBuf[ 1 + MAX_PATH * 2 ];
  443. RtlCopyMemory( awcBuf, _text, cwc * sizeof WCHAR );
  444. awcBuf[ cwc ] = 0;
  445. ULONG cwcOut = LCMapString( _lcid,
  446. LCMAP_UPPERCASE,
  447. awcBuf,
  448. cwc,
  449. awcBuf,
  450. cwc );
  451. if ( cwcOut != cwc )
  452. THROW( CException() );
  453. Token token;
  454. unsigned cwcToken;
  455. WCHAR *pwcTok = FindStringToken( awcBuf, token, cwcToken );
  456. if ( 0 != pwcTok )
  457. {
  458. // a textual token exists in the string
  459. if ( pwcTok == awcBuf )
  460. {
  461. // textual token at the start of the string
  462. _token = token;
  463. _pLookAhead = _text + cwcToken;
  464. }
  465. else
  466. {
  467. // textual token in the middle of the string, stop the
  468. // current token at that point and get it next time
  469. // Accept() is called.
  470. _pLookAhead = _text + ( pwcTok - awcBuf );
  471. _token = TEXT_TOKEN;
  472. }
  473. }
  474. else
  475. {
  476. _pLookAhead = pwcEnd;
  477. _token = TEXT_TOKEN;
  478. }
  479. }
  480. else
  481. {
  482. _pLookAhead = pwcEnd;
  483. _token = TEXT_TOKEN;
  484. }
  485. break;
  486. }
  487. }
  488. }
  489. //+---------------------------------------------------------------------------
  490. //
  491. // Member: CQueryScanner::AllocReturnString, private inline
  492. //
  493. // Synopsis: Copies all of the relevant characters of the string that
  494. // _text is pointing to and returns the new string.
  495. //
  496. // History: 17 Apr 97 AlanW Created
  497. //
  498. //----------------------------------------------------------------------------
  499. inline WCHAR * CQueryScanner::AllocReturnString( int cch )
  500. {
  501. WCHAR * newBuf = new WCHAR [ cch + 1 ];
  502. RtlCopyMemory ( newBuf, _text, cch * sizeof(WCHAR));
  503. newBuf[cch] = L'\0';
  504. _text += cch;
  505. while ( iswspace(*_text) )
  506. _text++;
  507. return newBuf;
  508. }
  509. //+---------------------------------------------------------------------------
  510. //
  511. // Member: CQueryScanner::AcqPath, public
  512. //
  513. // Synopsis: Copies all of the relevant characters of the string that
  514. // _text is pointing to and returns the new string. Will
  515. // return 0 if _text is at end of whole TEXT_TOKEN.
  516. //
  517. // Notes: Since the string is copied, the caller of this function is
  518. // responsible for freeing the memory occupied by the string.
  519. // This method can be called several times before calling
  520. // Accept(), so many paths can be acquired if they exist in the
  521. // scanner.
  522. //
  523. // History: 30-Apr-92 AmyA Created
  524. //
  525. //----------------------------------------------------------------------------
  526. WCHAR * CQueryScanner::AcqPath()
  527. {
  528. if ( IsEndOfTextToken() )
  529. return 0;
  530. // how many characters follow _text that are not in CMND_STR?
  531. int count = wcscspn( _text, CMND_STR );
  532. return AllocReturnString( count );
  533. }
  534. //+---------------------------------------------------------------------------
  535. //
  536. // Member: CQueryScanner::AcqWord, public
  537. //
  538. // Synopsis: Copies the word that _text is pointing to and returns the
  539. // new string. Positions _text after the word and whitespace.
  540. // Returns 0 if at the end of a TEXT_TOKEN.
  541. //
  542. // History: 29-Jun-92 MikeHew Created.
  543. //
  544. //----------------------------------------------------------------------------
  545. WCHAR * CQueryScanner::AcqWord()
  546. {
  547. if ( IsEndOfTextToken() )
  548. return 0;
  549. WCHAR const * pEnd = _text;
  550. while ( !iswspace(*pEnd) && pEnd < _pLookAhead )
  551. pEnd++;
  552. unsigned count = CiPtrToUint( pEnd - _text );
  553. return AllocReturnString( count );
  554. }
  555. //+---------------------------------------------------------------------------
  556. //
  557. // Member: CQueryScanner::AcqColumn, public
  558. //
  559. // Synopsis: Copies a column name and returns the new string. A column
  560. // name is either a single word, or a quoted string.
  561. // Positions _text after the word and whitespace.
  562. //
  563. // Returns: WCHAR* pointer to column name. 0 if no column name found.
  564. //
  565. // History: 17 Apr 97 AlanW Created.
  566. //
  567. //----------------------------------------------------------------------------
  568. WCHAR * CQueryScanner::AcqColumn()
  569. {
  570. if ( QUOTES_TOKEN == _token)
  571. {
  572. Accept();
  573. WCHAR * pwszOut = AcqPhraseInQuotes();
  574. _text = _pLookAhead;
  575. return pwszOut;
  576. }
  577. if ( IsEndOfTextToken() )
  578. return 0;
  579. int count = wcscspn( _text, COLUMN_STR );
  580. return AllocReturnString( count );
  581. }
  582. //+---------------------------------------------------------------------------
  583. //
  584. // Member: CQueryScanner::AcqPhrase, public
  585. //
  586. // Synopsis: Copies all of the relevant characters of the string that
  587. // _text is pointing to and returns the new string.
  588. // Returns 0 if at the end of a text token.
  589. //
  590. // Notes: Since the string is copied, the caller of this function is
  591. // responsible for freeing the memory occupied by the string.
  592. // The difference between this function and AcqPath is that this
  593. // should only be called once before calling Accept().
  594. //
  595. // History: 30-Apr-92 AmyA Created
  596. // 09-May-96 DwightKr Strip trailing white space
  597. //
  598. //----------------------------------------------------------------------------
  599. WCHAR * CQueryScanner::AcqPhrase()
  600. {
  601. if( IsEndOfTextToken() )
  602. return 0;
  603. //
  604. // Strip trailing white-space from the end of the phrase. _pLookAhead
  605. // points to the first character of the NEXT phrase.
  606. //
  607. WCHAR const * pEnd = _pLookAhead - 1;
  608. while ( (pEnd > _text) && iswspace(*pEnd) )
  609. {
  610. pEnd--;
  611. }
  612. unsigned count = CiPtrToUint( pEnd - _text ) + 1;
  613. WCHAR * newBuf = new WCHAR [ count + 1 ];
  614. RtlCopyMemory( newBuf, _text, count * sizeof( WCHAR ) );
  615. newBuf[count] = 0;
  616. return newBuf;
  617. }
  618. //+---------------------------------------------------------------------------
  619. //
  620. // Member: CQueryScanner::AcqRegEx, public
  621. //
  622. // Synopsis: Copies all of the relevant characters of the string that
  623. // _text is pointing to and returns the new string. Matches
  624. // the longest string possible - the only restriction is that
  625. // the regex can not contain any of the characters in REGEX_STR
  626. // outside of <> braces (which may be nested).
  627. // Returns 0 if the regex is empty.
  628. //
  629. // Notes: Since the string is copied, the caller of this function is
  630. // responsible for freeing the memory occupied by the string.
  631. // Because some regex characters are duplicated in the query
  632. // language, _pLookAhead is ignored (and actually reset) in
  633. // this operation. Like AcqPhrase(), this should be called only
  634. // once before Accept().
  635. //
  636. // History: 10-May-94 t-jeffc Created
  637. //
  638. //----------------------------------------------------------------------------
  639. WCHAR * CQueryScanner::AcqRegEx()
  640. {
  641. WCHAR const * pEnd = _text;
  642. BOOL fDone = FALSE;
  643. BOOL fQuoted = FALSE;
  644. if ( *pEnd == L'"' )
  645. {
  646. fQuoted = TRUE;
  647. pEnd++;
  648. }
  649. // scan the string - stop at \0 or if any REGEX_STR characters are
  650. // found outside of braces
  651. //
  652. for( ;; )
  653. {
  654. switch( *pEnd )
  655. {
  656. case '\0':
  657. if ( fQuoted )
  658. THROW( CException( QPARSE_E_UNEXPECTED_EOS ) );
  659. fDone = TRUE;
  660. break;
  661. case ' ':
  662. if ( !fQuoted )
  663. fDone = TRUE;
  664. break;
  665. case ')':
  666. if ( !fQuoted )
  667. {
  668. if ( ( pEnd != _text ) &&
  669. ( '|' != (*(pEnd-1)) ) )
  670. fDone = TRUE;
  671. }
  672. break;
  673. case '"':
  674. if ( fQuoted )
  675. {
  676. pEnd++;
  677. fDone = TRUE;
  678. }
  679. break;
  680. default:
  681. break;
  682. } // switch( *pEnd )
  683. if( fDone ) break;
  684. pEnd++;
  685. }
  686. if( _text == pEnd )
  687. return 0;
  688. // set _pLookAhead
  689. _pLookAhead = pEnd;
  690. // copy the string
  691. unsigned count = CiPtrToUint( _pLookAhead - _text );
  692. if ( fQuoted )
  693. {
  694. Win4Assert( count >= 2 );
  695. count -= 2;
  696. }
  697. WCHAR * newBuf = new WCHAR[ count + 1 ];
  698. RtlCopyMemory( newBuf, _text + (fQuoted ? 1 : 0), count * sizeof( WCHAR ) );
  699. newBuf[ count ] = 0;
  700. return newBuf;
  701. }
  702. //+---------------------------------------------------------------------------
  703. //
  704. // Member: CQueryScanner::AcqPhraseInQuotes, public
  705. //
  706. // Synopsis: Copies all characters until a matching " is found, or until
  707. // the end of string. Embedded quotes are escaped with a quote:
  708. // "Bill ""the man"" Gates"
  709. //
  710. // Notes: Since the string is copied, the caller of this function is
  711. // responsible for freeing the memory occupied by the string.
  712. //
  713. // History: 18-Jan-95 SitaramR Created
  714. // 3-Jul-96 dlee added embedded quotes
  715. //
  716. //----------------------------------------------------------------------------
  717. WCHAR * CQueryScanner::AcqPhraseInQuotes()
  718. {
  719. WCHAR const * pEnd = _text;
  720. do
  721. {
  722. if ( 0 == *pEnd )
  723. break;
  724. if ( L'"' == *pEnd )
  725. {
  726. if ( L'"' == *(pEnd+1) )
  727. pEnd++;
  728. else
  729. break;
  730. }
  731. pEnd++;
  732. } while ( TRUE );
  733. unsigned count = CiPtrToUint( pEnd - _text );
  734. WCHAR * newBuf = new WCHAR [ count + 1 ];
  735. WCHAR * pwcNewBuf = newBuf;
  736. WCHAR const * pStart = _text;
  737. // copy the string, but remove the extra quote characters
  738. while ( pStart < pEnd )
  739. {
  740. *pwcNewBuf++ = *pStart++;
  741. if ( L'"' == *pStart )
  742. pStart++;
  743. }
  744. *pwcNewBuf = 0;
  745. if ( *pEnd == L'"' )
  746. _pLookAhead = pEnd + 1;
  747. else
  748. _pLookAhead = pEnd;
  749. return newBuf;
  750. }
  751. //+---------------------------------------------------------------------------
  752. //
  753. // Member: CQueryScanner::GetNumber, public
  754. //
  755. // Synopsis: If _text is at the end of the TEXT_TOKEN, returns FALSE.
  756. // If not, puts the ULONG from the scanner into number and
  757. // returns TRUE.
  758. //
  759. // Arguments: [number] -- the ULONG which will be changed and passed back
  760. // out as the ULONG from the scanner.
  761. // [fAtEnd] -- returns TRUE if at the end of the scanned string
  762. //
  763. // Notes: May be called several times in a loop before Accept() is
  764. // called.
  765. //
  766. // History: 11-May-92 AmyA Created
  767. //
  768. //----------------------------------------------------------------------------
  769. BOOL CQueryScanner::GetNumber( ULONG & number, BOOL & fAtEnd )
  770. {
  771. if ( IsEndOfTextToken() || !iswdigit(*_text) || (*_text == L'-') )
  772. return FALSE;
  773. // is this a hex number?
  774. ULONG base = 10;
  775. if (_text[0] == L'0' && (_text[1] == L'x' || _text[1] == L'X'))
  776. {
  777. _text += 2;
  778. base = 16;
  779. }
  780. const WCHAR * pwcStart = _text;
  781. number = wcstoul( _text, (WCHAR **)(&_text), base );
  782. // looks like a real number?
  783. if ( ( pwcStart == _text ) ||
  784. ( L'.' == *_text ) )
  785. return FALSE;
  786. while ( iswspace(*_text) )
  787. _text++;
  788. fAtEnd = ( 0 == *_text );
  789. return TRUE;
  790. }
  791. //+---------------------------------------------------------------------------
  792. //
  793. // Member: CQueryScanner::GetNumber, public
  794. //
  795. // Synopsis: If _text is at the end of the TEXT_TOKEN, returns FALSE.
  796. // If not, puts the LONG from the scanner into number and
  797. // returns TRUE.
  798. //
  799. // Arguments: [number] -- the LONG which will be changed and passed back
  800. // out as the LONG from the scanner.
  801. // [fAtEnd] -- returns TRUE if at the end of the scanned string
  802. //
  803. // Notes: May be called several times in a loop before Accept() is
  804. // called.
  805. //
  806. // History: 96-Jan-15 DwightKr Created
  807. //
  808. //----------------------------------------------------------------------------
  809. BOOL CQueryScanner::GetNumber( LONG & number, BOOL & fAtEnd )
  810. {
  811. WCHAR *text = (WCHAR *) _text;
  812. BOOL IsNegative = FALSE;
  813. ULONG ulMax = (ULONG) LONG_MAX;
  814. if ( L'-' == _text[0] )
  815. {
  816. IsNegative = TRUE;
  817. ulMax++; // can represent 1 more negative than positive.
  818. _text++;
  819. }
  820. ULONG ulNumber;
  821. if ( !GetNumber( ulNumber, fAtEnd ) )
  822. {
  823. _text = text;
  824. return FALSE;
  825. }
  826. // Signed number overflow/underflow
  827. if ( ulNumber > ulMax )
  828. {
  829. _text = text;
  830. return FALSE;
  831. }
  832. if ( IsNegative )
  833. {
  834. if ( ulMax == ulNumber )
  835. number = LONG_MIN;
  836. else
  837. number = - (LONG) ulNumber;
  838. }
  839. else
  840. {
  841. number = (LONG) ulNumber;
  842. }
  843. return TRUE;
  844. }
  845. //+---------------------------------------------------------------------------
  846. //
  847. // Member: CQueryScanner::GetNumber, public
  848. //
  849. // Synopsis: If _text is at the end of the TEXT_TOKEN, returns FALSE.
  850. // If not, puts the unsigned _int64 from the scanner into
  851. // number and returns TRUE.
  852. //
  853. // Arguments: [number] -- the unsigned _int64 which will be changed and
  854. // passed back out as the value from the scanner.
  855. // [fAtEnd] -- returns TRUE if at the end of the scanned string
  856. //
  857. // Notes: May be called several times in a loop before Accept() is
  858. // called.
  859. //
  860. // History: 27-Feb-96 dlee Created
  861. //
  862. //----------------------------------------------------------------------------
  863. BOOL CQueryScanner::GetNumber( unsigned _int64 & number, BOOL & fAtEnd )
  864. {
  865. if ( IsEndOfTextToken() || !iswdigit(*_text) || (*_text == L'-') )
  866. return FALSE;
  867. // is this a hex number?
  868. ULONG base = 10;
  869. if (_text[0] == L'0' && (_text[1] == L'x' || _text[1] == L'X'))
  870. {
  871. _text += 2;
  872. base = 16;
  873. }
  874. const WCHAR * pwcStart = _text;
  875. number = _wcstoui64( _text, (WCHAR **)(&_text), base );
  876. // looks like a real number?
  877. if ( ( pwcStart == _text ) ||
  878. ( L'.' == *_text ) )
  879. return FALSE;
  880. while ( iswspace(*_text) )
  881. _text++;
  882. fAtEnd = ( 0 == *_text );
  883. return TRUE;
  884. }
  885. //+---------------------------------------------------------------------------
  886. //
  887. // Member: CQueryScanner::GetNumber, public
  888. //
  889. // Synopsis: If _text is at the end of the TEXT_TOKEN, returns FALSE.
  890. // If not, puts the _int64 from the scanner into number and
  891. // returns TRUE.
  892. //
  893. // Arguments: [number] -- the _int64 which will be changed and passed back
  894. // out as the _int64 from the scanner.
  895. // [fAtEnd] -- returns TRUE if at the end of the scanned string
  896. //
  897. // Notes: May be called several times in a loop before Accept() is
  898. // called.
  899. //
  900. // History: 27-Feb-96 dlee Created
  901. //
  902. //----------------------------------------------------------------------------
  903. BOOL CQueryScanner::GetNumber( _int64 & number, BOOL & fAtEnd )
  904. {
  905. WCHAR *text = (WCHAR *) _text;
  906. BOOL IsNegative = FALSE;
  907. unsigned _int64 ullMax = (unsigned _int64) _I64_MAX;
  908. if ( L'-' == _text[0] )
  909. {
  910. IsNegative = TRUE;
  911. ullMax++; // can represent 1 more negative than positive.
  912. _text++;
  913. }
  914. unsigned _int64 ullNumber;
  915. if ( !GetNumber( ullNumber, fAtEnd ) )
  916. {
  917. _text = text;
  918. return FALSE;
  919. }
  920. // Signed number overflow/underflow
  921. if ( ullNumber > ullMax )
  922. {
  923. _text = text;
  924. return FALSE;
  925. }
  926. if ( IsNegative )
  927. {
  928. if ( ullMax == ullNumber )
  929. number = _I64_MIN;
  930. else
  931. number = -((_int64) ullNumber);
  932. }
  933. else
  934. {
  935. number = (_int64) ullNumber;
  936. }
  937. return TRUE;
  938. }
  939. //+---------------------------------------------------------------------------
  940. //
  941. // Member: CQueryScanner::GetNumber, public
  942. //
  943. // Synopsis: If _text is at the end of the TEXT_TOKEN, returns FALSE.
  944. // If not, puts the double from the scanner into number and
  945. // returns TRUE.
  946. //
  947. // Arguments: [number] -- the double which will be changed and passed back
  948. // out as the double from the scanner.
  949. //
  950. // Notes: May be called several times in a loop before Accept() is
  951. // called.
  952. //
  953. // History: 96-Jan-15 DwightKr Created
  954. //
  955. //----------------------------------------------------------------------------
  956. BOOL CQueryScanner::GetNumber( double & number )
  957. {
  958. if ( IsEndOfTextToken() || !iswdigit(*_text) )
  959. return FALSE;
  960. if ( swscanf( _text, L"%lf", &number ) != 1 )
  961. {
  962. return FALSE;
  963. }
  964. while ( iswspace(*_text) != 0 )
  965. _text++;
  966. return TRUE;
  967. }
  968. //+---------------------------------------------------------------------------
  969. //
  970. // Member: CQueryScanner::GetCommandChar, public
  971. //
  972. // Synopsis: Returns the command character pointed to by _text and advances
  973. // _text. If the command can't be uniquely determined by the
  974. // first character, each subsequent call will return the next
  975. // character in the word. After the command has been determined,
  976. // AcceptCommand() should be called and then operand parsing may begin.
  977. //
  978. // History: 14-May-92 AmyA Created
  979. // 16-May-94 t-jeffc Returns one character at a time to
  980. // support more commands
  981. //
  982. //----------------------------------------------------------------------------
  983. WCHAR CQueryScanner::GetCommandChar()
  984. {
  985. if( IsEndOfTextToken() )
  986. return 0;
  987. WCHAR chCommand = _text[0];
  988. _text++;
  989. return towlower( chCommand );
  990. }
  991. //+---------------------------------------------------------------------------
  992. //
  993. // Member: CQueryScanner::AcceptCommand, public
  994. //
  995. // Synopsis: Advances _text past any characters in the command.
  996. // Used when enough command characters have been
  997. // read to uniquely determine the command and begin parsing
  998. // the operands.
  999. //
  1000. // History: 16-May-94 t-jeffc Created
  1001. //
  1002. //----------------------------------------------------------------------------
  1003. void CQueryScanner::AcceptCommand()
  1004. {
  1005. int cChars = wcscspn( _text, CMND_STR ); // how many characters follow
  1006. // _text that are not in CMND_STR
  1007. _text += cChars;
  1008. _pLookAhead = _text;
  1009. Accept();
  1010. }
  1011. //+---------------------------------------------------------------------------
  1012. //
  1013. // Member: CQueryScanner::ResetBuffer, public
  1014. //
  1015. // Synopsis: Puts a new string into _pBuf and resets _pLookAhead
  1016. // accordingly.
  1017. //
  1018. // Arguments: [buffer] -- the new string for _pBuf
  1019. //
  1020. // History: 05-May-92 AmyA Created
  1021. //
  1022. //----------------------------------------------------------------------------
  1023. void CQueryScanner::ResetBuffer( WCHAR const * buffer )
  1024. {
  1025. _pBuf = buffer;
  1026. _pLookAhead = _pBuf;
  1027. Accept();
  1028. }
  1029. //+---------------------------------------------------------------------------
  1030. //
  1031. // Member: CQueryScanner::EatWhiteSpace, private
  1032. //
  1033. // Synopsis: Advances _pLookAhead past any white space in the string.
  1034. //
  1035. // History: 29-Apr-92 AmyA Created
  1036. //
  1037. //----------------------------------------------------------------------------
  1038. void CQueryScanner::EatWhiteSpace()
  1039. {
  1040. while ( iswspace(*_pLookAhead) != 0 )
  1041. _pLookAhead++;
  1042. }
  1043. //+---------------------------------------------------------------------------
  1044. //
  1045. // Member: CQueryScanner::IsEndOfTextToken, private
  1046. //
  1047. // Synopsis: Returns TRUE if the current token is not a TEXT_TOKEN or
  1048. // if the string starting at _text to _pLookAhead contains
  1049. // nothing but whitespace.
  1050. //
  1051. // History: 27-May-94 t-jeffc Created
  1052. //
  1053. //----------------------------------------------------------------------------
  1054. BOOL CQueryScanner::IsEndOfTextToken()
  1055. {
  1056. if( _token == TEXT_TOKEN && _text < _pLookAhead )
  1057. return FALSE;
  1058. else
  1059. return TRUE;
  1060. }
  1061. //+---------------------------------------------------------------------------
  1062. //
  1063. // Member: CQueryScanner::AcqLine, public
  1064. //
  1065. // Synopsis: Copies all of the remaining characters on the line;
  1066. // return 0 if _text is at end of whole TEXT_TOKEN.
  1067. //
  1068. // Arguments: [fParseQuotes] -- if TRUE, initial and final quotes are removed
  1069. //
  1070. // Notes: Since the string is copied, the caller of this function is
  1071. // responsible for freeing the memory occupied by the string.
  1072. // This method can be called several times before calling
  1073. // Accept(), so many paths can be acquired if they exist in the
  1074. // scanner.
  1075. //
  1076. // History: 96-Jan-03 DwightKr Created
  1077. // 96-Feb-26 DwightKr Allow lines to be quoted
  1078. //
  1079. //----------------------------------------------------------------------------
  1080. WCHAR * CQueryScanner::AcqLine( BOOL fParseQuotes )
  1081. {
  1082. if ( *_text == L'\0' )
  1083. return 0;
  1084. unsigned cwcBuffer = wcslen(_text);
  1085. //
  1086. // If there are \r, \n, or other white space at the end of the string,
  1087. // strip it off
  1088. //
  1089. while ( cwcBuffer > 0 && _text[cwcBuffer-1] <= L' ' )
  1090. cwcBuffer--;
  1091. if ( fParseQuotes )
  1092. {
  1093. //
  1094. // If there is a pair of quotes delimiting this line, strip them off
  1095. //
  1096. if ( (L'"' == _text[0]) && (cwcBuffer > 1) )
  1097. {
  1098. if ( L'"' == _text[cwcBuffer-1] )
  1099. cwcBuffer--;
  1100. _text++;
  1101. cwcBuffer--;
  1102. }
  1103. }
  1104. WCHAR *pText = new WCHAR [ cwcBuffer + 1 ];
  1105. RtlCopyMemory( pText, _text, cwcBuffer * sizeof(WCHAR) );
  1106. pText[cwcBuffer] = 0;
  1107. _pLookAhead = _text + cwcBuffer - 1;
  1108. return pText;
  1109. } //AcqLine