Source code of Windows XP (NT5)

  1. ////////////////////////////////////////////////////////////////////////////////
  2. //
  3. // Filename : Tokenizer.cpp
  4. // Purpose : Tokenizer implementation
  5. //
  6. // Project : WordBreakers
  7. // Component: English word breaker
  8. //
  9. // Author : yairh
  10. //
  11. // Log:
  12. //
  13. // Jan 06 2000 yairh creation
  14. // Apr 04 2000 dovh on behalf of dlee - Fix CTokenizer::OutputClitics
  15. // to avoid PutWord of length 0 (leads to multiple PutWord at
16. // same location (duplicate keys), and index corruption!)
  17. // Example: :...'s :...'s (. stands for junk character)
  18. // Apr 05 2000 dovh - Fixed two problematic debug / tracer buffer size
  19. // problems. (Related to Bug 15449).
  20. // May 07 2000 dovh - USE_WS_SENTINEL algorithm in BreakText
  21. // May 11 2000 dovh - Simplify VerifyMisc test.
  22. // Nov 11 2000 dovh - Special underscore treatment
  23. // Add AddBackUnderscores '_' + alphanumeric treatment.
  24. //
  25. ////////////////////////////////////////////////////////////////////////////////
  26. #include "base.h"
  27. #include "Tokenizer.h"
  28. #include "PropArray.h"
  29. #include "excption.h"
  30. #include "formats.h"
  31. DECLARE_TRIE_SENTINEL;
  32. CWbToUpper g_WbToUpper;
  33. CAutoClassPointer<CPropArray> g_pPropArray;
  34. CTokenizer::CTokenizer(
  35. TEXT_SOURCE* pTxtSource,
  36. IWordSink * pWordSink,
  37. IPhraseSink * pPhraseSink,
  38. LCID lcid,
  39. BOOL bQueryTime,
  40. ULONG ulMaxTokenSize) :
  41. m_pTxtSource(pTxtSource),
  42. m_apWordSink(pWordSink),
  43. m_apPhraseSink(pPhraseSink),
  44. m_Lcid(lcid),
  45. m_bQueryTime(bQueryTime),
  46. m_bNoMoreTxt(false),
  47. m_Token(ulMaxTokenSize),
  48. m_bWhiteSpaceGuarranteed(false)
  49. {
  50. m_ulMaxTokenSize = min(ulMaxTokenSize, TOKENIZER_MAXBUFFERLIMIT);
  51. m_apLangSupport = new CLangSupport(lcid);
  52. m_pCurToken = &m_Token;
  53. if (pTxtSource->iEnd > pTxtSource->iCur)
  54. {
  55. CalculateUpdateEndOfBuffer();
  56. }
  57. else
  58. {
  59. m_ulUpdatedEndOfBuffer = pTxtSource->iEnd;
  60. }
  61. }
  62. void CTokenizer::BreakText()
  63. {
  64. Trace(
  65. elVerbose,
  66. s_tagTokenizer,
  67. ("CTokenizer::BreakText()"));
  68. WCHAR wch;
  69. ULONGLONG ullflags(PROP_DEFAULT);
  70. //
  71. // USE_WS_SENTINEL Algorithm:
  72. //
  73. HRESULT hr = S_OK;
  74. if (m_pTxtSource->iCur >= m_ulUpdatedEndOfBuffer)
  75. {
  76. hr = FillBuffer();
  77. }
  78. while ( SUCCEEDED(hr) )
  79. {
  80. if ( m_bWhiteSpaceGuarranteed )
  81. {
  82. while (true)
  83. {
  84. wch = m_pTxtSource->awcBuffer[m_pTxtSource->iCur];
  85. ullflags = (GET_PROP(wch).m_ulFlag);
  86. if (ullflags & PROP_WS)
  87. {
  88. if (m_pCurToken->IsNotEmpty())
  89. {
  90. ProcessToken();
  91. }
  92. m_pTxtSource->iCur++;
  93. if (m_pTxtSource->iCur >= m_ulUpdatedEndOfBuffer)
  94. {
  95. hr = FillBuffer();
  96. break;
  97. }
  98. continue;
  99. }
  100. //
101. // The following lines are inline expansion of what
  102. // used to be CToken::RecordChar:
  103. //
  104. Assert(m_pCurToken->m_ulBufPos < m_ulMaxTokenSize);
  105. m_pCurToken->m_awchBuf[m_pCurToken->m_ulBufPos] = wch;
  106. m_pCurToken->m_ulBufPos++;
  107. m_pCurToken->m_State.m_Properties.m_ulFlag |= ullflags;
  108. m_pTxtSource->iCur++;
  109. } // while
  110. }
  111. else
  112. {
  113. while (true)
  114. {
  115. if (m_pTxtSource->iCur >= m_ulUpdatedEndOfBuffer)
  116. {
  117. Assert(m_pTxtSource->iCur == m_ulUpdatedEndOfBuffer);
  118. //
119. // Before we switch between buffers, if the current token is not empty we
120. // need to process it. m_ulUpdatedEndOfBuffer always points to a breaker character
121. // (usually a WS), thus no token can start in one buffer and end in the
122. // following buffer.
  123. //
  124. if (m_pCurToken->IsNotEmpty())
  125. {
  126. ProcessToken();
  127. }
  128. hr = FillBuffer();
  129. if (FAILED(hr))
  130. {
  131. break;
  132. }
  133. }
  134. wch = m_pTxtSource->awcBuffer[m_pTxtSource->iCur];
  135. ULONGLONG ullflags(GET_PROP(wch).m_ulFlag);
  136. if (ullflags & PROP_WS)
  137. {
  138. if (m_pCurToken->IsNotEmpty())
  139. {
  140. ProcessToken();
  141. }
  142. m_pTxtSource->iCur++;
  143. continue;
  144. }
  145. //
146. // the following lines are inline expansion of what used to be CToken::RecordChar.
  147. //
  148. Assert(m_pCurToken->m_ulBufPos < m_ulMaxTokenSize);
  149. m_pCurToken->m_awchBuf[m_pCurToken->m_ulBufPos] = wch;
  150. m_pCurToken->m_ulBufPos++;
  151. m_pCurToken->m_State.m_Properties.m_ulFlag |= ullflags;
  152. m_pTxtSource->iCur++;
  153. } // while
  154. } // if
  155. } // while ( !FAILED(hr) )
  156. } // CTokenizer::BreakText
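//
// The two scanning loops above differ only in whether the end-of-buffer test
// sits on the per-character path.  A minimal, self-contained sketch of the
// whitespace-sentinel idea follows (illustrative only: ScanToWhiteSpaceSentinel
// is a hypothetical helper that is not part of this tokenizer, it tests a
// literal L' ' rather than the PROP_WS property, and it assumes the caller
// guarantees a trailing whitespace sentinel).
//
static const WCHAR* ScanToWhiteSpaceSentinel(const WCHAR* pwcsBuf)
{
    const WCHAR* pwcs = pwcsBuf;
    //
    // No index-vs-length comparison per character; the guaranteed trailing
    // whitespace stops the loop by itself.
    //
    while (*pwcs != L' ')
    {
        pwcs++;
    }
    return pwcs;
}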
  157. void CTokenizer::ProcessToken()
  158. {
  159. ULONG ulOffset;
  160. if (m_pTxtSource->iCur < m_pCurToken->m_ulBufPos)
  161. {
  162. Trace(
  163. elWarning,
  164. s_tagTokenizer,
  165. ("CTokenizer::ProcessToken() wrong offset calculation"));
  166. //
  167. // BUGBUG need to understand why we got to this place.
  168. //
  169. ulOffset = m_pCurToken->m_ulBufPos + 1;
  170. }
  171. else if (m_pTxtSource->iCur == m_pCurToken->m_ulBufPos)
  172. {
  173. ulOffset = m_pCurToken->m_ulBufPos;
  174. }
  175. else
  176. {
  177. ulOffset = m_pTxtSource->iCur;
  178. }
  179. m_pCurToken->MarkEndToken(ulOffset);
  180. #ifdef DEBUG
  181. TraceToken();
  182. #endif
  183. //
  184. // simple token.
  185. //
  186. if (IS_PROP_SIMPLE(m_pCurToken->m_State.m_Properties))
  187. {
  188. OutputSimpleToken(
  189. m_pCurToken->m_State,
  190. &g_EmptyClitics);
  191. }
  192. else
  193. {
  194. ProcessTokenInternal();
  195. }
  196. if (m_pCurToken->m_fHasEos)
  197. {
  198. Trace(
  199. elVerbose,
  200. s_tagTokenizerDecision,
  201. ("EOS"));
  202. HRESULT hr;
  203. hr = m_apWordSink->PutBreak(WORDREP_BREAK_EOS);
  204. if (FAILED(hr))
  205. {
  206. THROW_HRESULT_EXCEPTION(hr);
  207. }
  208. }
  209. m_pCurToken->Clear();
  210. }
  211. void CTokenizer::ProcessTokenInternal()
  212. {
  213. do
  214. {
  215. //
  216. // url
  217. //
  218. if (HAS_PROP_SLASH(m_pCurToken->m_State.m_Properties) &&
  219. HAS_PROP_COLON(m_pCurToken->m_State.m_Properties) &&
  220. HAS_PROP_ALPHA(m_pCurToken->m_State.m_Properties))
  221. {
  222. Trace(
  223. elVerbose,
  224. s_tagTokenizerSuspect,
  225. ("%*.*S suspected to be <alpha>:// url",
  226. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  227. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  228. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  229. ));
  230. if (VerifyAlphaUrl())
  231. {
  232. break;
  233. }
  234. }
  235. if (HAS_PROP_PERIOD(m_pCurToken->m_State.m_Properties) &&
  236. HAS_PROP_W(m_pCurToken->m_State.m_Properties))
  237. {
  238. Trace(
  239. elVerbose,
  240. s_tagTokenizerSuspect,
  241. ("%*.*S suspected to be www. url",
  242. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  243. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  244. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  245. ));
  246. if (VerifyWwwUrl())
  247. {
  248. break;
  249. }
  250. }
  251. //
  252. // Acronym
  253. //
  254. if (HAS_PROP_PERIOD(m_pCurToken->m_State.m_Properties) &&
  255. HAS_PROP_UPPER_CASE(m_pCurToken->m_State.m_Properties))
  256. {
  257. if (!HAS_PROP_LOWER_CASE(m_pCurToken->m_State.m_Properties) ||
  258. HAS_PROP_APOSTROPHE(m_pCurToken->m_State.m_Properties))
  259. {
  260. Trace(
  261. elVerbose,
  262. s_tagTokenizerSuspect,
  263. ("%*.*S suspected to be an acronym",
  264. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  265. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  266. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  267. ));
  268. if (VerifyAcronym())
  269. {
  270. break;
  271. }
  272. }
  273. //
  274. // Abbreviation
  275. //
  276. Trace(
  277. elVerbose,
  278. s_tagTokenizerSuspect,
  279. ("%*.*S suspected to be an abbreviation",
  280. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  281. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  282. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  283. ));
  284. if (VerifyAbbreviation())
  285. {
  286. break;
  287. }
  288. Trace(
  289. elVerbose,
  290. s_tagTokenizerSuspect,
  291. ("%*.*S suspected to be a special abbreviation",
  292. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  293. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  294. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  295. ));
  296. if (VerifySpecialAbbreviation())
  297. {
  298. break;
  299. }
  300. }
  301. //
  302. // Hyphenation
  303. //
  304. if (HAS_PROP_DASH(m_pCurToken->m_State.m_Properties) &&
  305. HAS_PROP_ALPHA(m_pCurToken->m_State.m_Properties))
  306. {
  307. Trace(
  308. elVerbose,
  309. s_tagTokenizerSuspect,
  310. ("%*.*S suspected to have a hyphenation",
  311. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  312. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  313. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  314. ));
  315. if (VerifyHyphenation())
  316. {
  317. break;
  318. }
  319. }
  320. //
  321. // (s) parenthesis
  322. //
  323. if (HAS_PROP_LEFT_PAREN(m_pCurToken->m_State.m_Properties) &&
  324. HAS_PROP_RIGHT_PAREN(m_pCurToken->m_State.m_Properties) &&
  325. HAS_PROP_ALPHA(m_pCurToken->m_State.m_Properties))
  326. {
  327. Trace(
  328. elVerbose,
  329. s_tagTokenizerSuspect,
  330. ("%*.*S suspected to have a (s) Parenthesis",
  331. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  332. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  333. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  334. ));
  335. if (VerifyParens())
  336. {
  337. break;
  338. }
  339. }
  340. //
  341. // Currency
  342. //
  343. if (HAS_PROP_CURRENCY(m_pCurToken->m_State.m_Properties) &&
  344. HAS_PROP_NUMBER(m_pCurToken->m_State.m_Properties))
  345. {
  346. Trace(
  347. elVerbose,
  348. s_tagTokenizerSuspect,
  349. ("%*.*S suspected to be a currency",
  350. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  351. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  352. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  353. ));
  354. if (VerifyCurrency())
  355. {
  356. break;
  357. }
  358. }
  359. //
  360. // Numbers / time / dates
  361. //
  362. if (HAS_PROP_NUMBER(m_pCurToken->m_State.m_Properties))
  363. {
  364. Trace(
  365. elVerbose,
  366. s_tagTokenizerSuspect,
  367. ("%*.*S suspected to be a number or a time or a date",
  368. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  369. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  370. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  371. ));
  372. if (VerifyNumberOrTimeOrDate())
  373. {
  374. break;
  375. }
  376. }
  377. //
378. // commercial signs
  379. //
  380. if (TEST_PROP(m_pCurToken->m_State.m_Properties, PROP_COMMERSIAL_SIGN) &&
  381. HAS_PROP_ALPHA(m_pCurToken->m_State.m_Properties))
  382. {
  383. Trace(
  384. elVerbose,
  385. s_tagTokenizerSuspect,
  386. ("%*.*S suspected to have a commesial sign",
  387. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  388. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  389. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  390. ));
  391. if (VerifyCommersialSign())
  392. {
  393. break;
  394. }
  395. }
  396. //
  397. // Misc - C++, J++, A+, A- .. C#
  398. //
  399. if ( TEST_PROP(m_pCurToken->m_State.m_Properties, (PROP_MINUS|PROP_PLUS|PROP_POUND)) &&
  400. HAS_PROP_ALPHA(m_pCurToken->m_State.m_Properties) )
  401. {
  402. Trace(
  403. elVerbose,
  404. s_tagTokenizerSuspect,
  405. ("%*.*S suspected to belong to the misc list",
  406. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  407. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  408. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  409. ));
  410. if (VerifyMisc())
  411. {
  412. break;
  413. }
  414. }
  415. //
  416. // default
  417. //
  418. ProcessDefault();
  419. } while (false);
  420. }
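//
// ProcessTokenInternal relies on the do { ... } while (false) idiom: every
// successful Verify* call breaks out of the single-pass loop, so the ordered
// chain of classifiers ends without deeply nested if/else blocks or a goto.
// A minimal illustrative sketch of the idiom (ClassifyChar is hypothetical
// and unrelated to the classes above):
//
static int ClassifyChar(WCHAR wch)
{
    int iClass = 0;
    do
    {
        if (wch >= L'0' && wch <= L'9')
        {
            iClass = 1;         // digit
            break;              // skip the remaining tests
        }
        if (wch == L' ')
        {
            iClass = 2;         // whitespace
            break;
        }
        iClass = 3;             // fall-through default, like ProcessDefault()
    } while (false);            // the body executes exactly once
    return iClass;
}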
  421. #ifdef DEBUG
  422. void CTokenizer::TraceToken()
  423. {
  424. WCHAR buf[MAX_NUM_PROP+1];
  425. size_t bufLen = wcslen(TRACE_CHAR);
  426. Assert(bufLen < MAX_NUM_PROP + 1);
  427. buf[bufLen] = L'\0';
  428. for(int i=0; i<bufLen; i++)
  429. {
  430. if(TEST_PROP(m_pCurToken->m_State.m_Properties, (1<<i)))
  431. {
  432. buf[i] = TRACE_CHAR[i];
  433. }
  434. else
  435. {
  436. buf[i] = L'_';
  437. }
  438. }
  439. Trace(
  440. elVerbose,
  441. s_tagTokenizerTrace,
  442. ("[%S] - %*.*S",
  443. buf,
  444. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  445. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  446. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  447. ));
  448. }
  449. #endif // DEBUG
  450. bool CTokenizer::VerifyAlphaUrl()
  451. {
  452. //
  453. // looking for <alpha>:// pattern
  454. //
  455. CTokenState State(m_pCurToken->m_State);
  456. ULONG ul = State.m_ulStart;
  457. if (!HAS_PROP_ALPHA(GET_PROP(State.m_pwcsToken[ul])))
  458. {
  459. return false;
  460. }
  461. while (HAS_PROP_EXTENDED_ALPHA(GET_PROP(State.m_pwcsToken[ul])))
  462. {
  463. ul++;
  464. }
  465. if (!(HAS_PROP_COLON(GET_PROP(State.m_pwcsToken[ul]))))
  466. {
  467. return false;
  468. }
  469. ul++;
  470. if (!(HAS_PROP_SLASH(GET_PROP(State.m_pwcsToken[ul]))))
  471. {
  472. return false;
  473. }
  474. ul++;
  475. if (!(HAS_PROP_SLASH(GET_PROP(State.m_pwcsToken[ul]))))
  476. {
  477. return false;
  478. }
  479. {
  480. Trace(
  481. elVerbose,
  482. s_tagTokenizerDecision,
  483. ("%*.*S is an <alpha>:// url",
  484. State.m_ulEnd - State.m_ulStart,
  485. State.m_ulEnd - State.m_ulStart,
  486. State.m_pwcsToken + State.m_ulStart
  487. ));
  488. }
  489. OutputUrl(State);
  490. return true;
  491. }
  492. bool CTokenizer::VerifyWwwUrl()
  493. {
  494. CTokenState State(m_pCurToken->m_State);
  495. if (State.m_ulEnd - State.m_ulStart <= 4)
  496. {
  497. return false;
  498. }
  499. if (0 != _wcsnicmp(State.m_pwcsToken + State.m_ulStart, L"www.", 4))
  500. {
  501. return false;
  502. }
  503. Trace(
  504. elVerbose,
  505. s_tagTokenizerDecision,
  506. ("%*.*S is a www. url",
  507. State.m_ulEnd - State.m_ulStart,
  508. State.m_ulEnd - State.m_ulStart,
  509. State.m_pwcsToken + State.m_ulStart
  510. ));
  511. OutputUrl(State);
  512. return true;
  513. }
  514. bool CTokenizer::VerifyAcronym()
  515. {
  516. //
  517. // looking for I.B.M or I.B.M. or A.B.CC but not A.B.CC.
  518. //
  519. CTokenState State(m_pCurToken->m_State);
  520. CPropFlag AbbPuctTail(ACRONYM_PUNCT_TAIL);
  521. CPropFlag AbbPuctHead(ACRONYM_PUNCT_HEAD);
  522. bool fNeedToRemoveEos = true;
  523. if (TEST_PROP(State.m_Properties, (ACRONYM_PUNCT_TAIL | ACRONYM_PUNCT_HEAD)))
  524. {
  525. if (TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulEnd- 1]), ABBREVIATION_EOS))
  526. {
  527. fNeedToRemoveEos = false;
  528. }
  529. ULONG ulCharRemoved = m_pCurToken->RemoveTailPunct(AbbPuctTail, State);
  530. ulCharRemoved += m_pCurToken->RemoveHeadPunct(AbbPuctHead, State);
  531. if (ulCharRemoved)
  532. {
  533. m_pCurToken->ComputeStateProperties(State);
  534. }
  535. }
  536. const CCliticsTerm* pCliticsTerm;
  537. pCliticsTerm = VerifyClitics(State);
  538. ULONG ulEnd = State.m_ulEnd;
  539. ULONG ulCur = State.m_ulStart;
  540. if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
  541. {
  542. ulCur += pCliticsTerm->ulLen;
  543. }
  544. else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  545. {
  546. ulEnd -= pCliticsTerm->ulLen;
  547. }
  548. //
  549. // finding the last period
  550. //
  551. while ((ulEnd > ulCur) &&
  552. HAS_PROP_UPPER_CASE(GET_PROP(State.m_pwcsToken[ulEnd- 1])))
  553. {
  554. ulEnd--;
  555. }
  556. if ((ulEnd == ulCur) ||
  557. !HAS_PROP_PERIOD(GET_PROP(State.m_pwcsToken[ulEnd- 1])))
  558. {
  559. return false;
  560. }
  561. ULONG ulCounter = 0;
  562. while (ulCur < ulEnd)
  563. {
  564. if (ulCounter%2 == 0)
  565. {
  566. if (!HAS_PROP_UPPER_CASE(GET_PROP(State.m_pwcsToken[ulCur])))
  567. {
  568. return false;
  569. }
  570. }
  571. else
  572. {
  573. if (!HAS_PROP_PERIOD(GET_PROP(State.m_pwcsToken[ulCur])))
  574. {
  575. return false;
  576. }
  577. }
  578. ulCur++;
  579. ulCounter++;
  580. }
  581. Trace(
  582. elVerbose,
  583. s_tagTokenizerDecision,
  584. ("%*.*S is an acronym",
  585. State.m_ulEnd - State.m_ulStart,
  586. State.m_ulEnd - State.m_ulStart,
  587. State.m_pwcsToken + State.m_ulStart
  588. ));
  589. if (fNeedToRemoveEos && (pCliticsTerm->ulOp != TAIL_MATCH_TRUNCATE))
  590. {
  591. m_pCurToken->m_fHasEos = false;
  592. }
  593. OutputAcronym(State, pCliticsTerm);
  594. return true;
  595. }
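//
// The core of VerifyAcronym is the parity walk above: even positions must be
// upper-case letters and odd positions must be periods.  A self-contained
// sketch of that test on a NUL-terminated string (LooksLikeAcronym is
// hypothetical; it only handles ASCII upper-case letters and ignores the
// clitics, punctuation-stripping, EOS and trailing multi-letter cases such as
// A.B.CC that the real code accepts):
//
static bool LooksLikeAcronym(const WCHAR* pwcs)
{
    ULONG ul = 0;
    while (pwcs[ul] != L'\0')
    {
        if (ul % 2 == 0)
        {
            // even slot: expect an upper-case letter ("I", "B", "M")
            if (pwcs[ul] < L'A' || pwcs[ul] > L'Z')
            {
                return false;
            }
        }
        else if (pwcs[ul] != L'.')
        {
            // odd slot: expect the separating period
            return false;
        }
        ul++;
    }
    // "I.B.M" and "I.B.M." both pass; require at least one letter-period pair
    return ul >= 3;
}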
  596. bool CTokenizer::VerifyAbbreviation()
  597. {
  598. //
  599. // looking for Sr. Jr.
600. // we define an abbreviation as a two-letter pattern ending with a dot, where the first
601. // letter is a capital one
  602. //
  603. CTokenState State(m_pCurToken->m_State);
  604. CPropFlag AbbPuctTail(ABBREVIATION_PUNCT_TAIL);
  605. CPropFlag AbbPuctHead(ABBREVIATION_PUNCT_HEAD);
  606. bool fNeedToRemoveEos = true;
  607. if (TEST_PROP(State.m_Properties, (ABBREVIATION_PUNCT_TAIL | ABBREVIATION_PUNCT_HEAD)))
  608. {
  609. if (TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulEnd- 1]), ABBREVIATION_EOS))
  610. {
  611. fNeedToRemoveEos = false;
  612. }
  613. ULONG ulCharRemoved = m_pCurToken->RemoveTailPunct(AbbPuctTail, State);
  614. ulCharRemoved += m_pCurToken->RemoveHeadPunct(AbbPuctHead, State);
  615. if (ulCharRemoved)
  616. {
  617. m_pCurToken->ComputeStateProperties(State);
  618. }
  619. }
  620. if ((State.m_ulEnd - State.m_ulStart) != 3)
  621. {
  622. return false;
  623. }
  624. if (!HAS_PROP_UPPER_CASE(GET_PROP(State.m_pwcsToken[State.m_ulStart])))
  625. {
  626. return false;
  627. }
  628. if (!HAS_PROP_EXTENDED_ALPHA(GET_PROP(State.m_pwcsToken[State.m_ulStart + 1])))
  629. {
  630. return false;
  631. }
  632. if (!HAS_PROP_PERIOD(GET_PROP(State.m_pwcsToken[State.m_ulStart + 2])))
  633. {
  634. return false;
  635. }
  636. Trace(
  637. elVerbose,
  638. s_tagTokenizerDecision,
  639. ("%*.*S is an abbreviation",
  640. State.m_ulEnd - State.m_ulStart,
  641. State.m_ulEnd - State.m_ulStart,
  642. State.m_pwcsToken + State.m_ulStart
  643. ));
  644. if (fNeedToRemoveEos)
  645. {
  646. m_pCurToken->m_fHasEos = false;
  647. }
  648. OutputAbbreviation(State);
  649. return true;
  650. }
  651. bool CTokenizer::VerifySpecialAbbreviation()
  652. {
  653. CTokenState State(m_pCurToken->m_State);
  654. CPropFlag AbbPuctTail(SPECIAL_ABBREVIATION_PUNCT_TAIL);
  655. CPropFlag AbbPuctHead(SPECIAL_ABBREVIATION_PUNCT_HEAD);
  656. if (TEST_PROP(State.m_Properties, (SPECIAL_ABBREVIATION_PUNCT_TAIL | SPECIAL_ABBREVIATION_PUNCT_HEAD)))
  657. {
  658. ULONG ulCharRemoved = m_pCurToken->RemoveTailPunct(AbbPuctTail, State);
  659. ulCharRemoved += m_pCurToken->RemoveHeadPunct(AbbPuctHead, State);
  660. if (ulCharRemoved)
  661. {
  662. m_pCurToken->ComputeStateProperties(State);
  663. }
  664. if (!HAS_PROP_PERIOD(State.m_Properties))
  665. {
  666. return false;
  667. }
  668. }
  669. const CCliticsTerm* pCliticsTerm;
  670. pCliticsTerm = VerifyClitics(State);
  671. ULONG ulAddToStart = 0;
  672. ULONG ulDecFromEnd = 0;
  673. if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
  674. {
  675. ulAddToStart = pCliticsTerm->ulLen;
  676. }
  677. else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  678. {
  679. ulDecFromEnd = pCliticsTerm->ulLen;
  680. }
  681. CAbbTerm* pTerm;
  682. short sResCount = 0;
  683. DictStatus status;
  684. CSpecialAbbreviationSet* pAbbSet = m_apLangSupport->GetAbbSet();
  685. status = pAbbSet->m_trieAbb.trie_Find(
  686. State.m_pwcsToken + State.m_ulStart + ulAddToStart,
  687. TRIE_LONGEST_MATCH | TRIE_IGNORECASE,
  688. 1,
  689. &pTerm,
  690. &sResCount);
  691. if (sResCount &&
  692. (pTerm->ulAbbLen == (State.m_ulEnd - State.m_ulStart - ulAddToStart - ulDecFromEnd)))
  693. {
  694. Trace(
  695. elVerbose,
  696. s_tagTokenizerDecision,
  697. ("%*.*S is an abbreviation",
  698. State.m_ulEnd - State.m_ulStart,
  699. State.m_ulEnd - State.m_ulStart,
  700. State.m_pwcsToken + State.m_ulStart
  701. ));
  702. OutputSpecialAbbreviation(State, pTerm, pCliticsTerm);
  703. return true;
  704. }
  705. return false;
  706. }
  707. bool CTokenizer::VerifyMisc()
  708. {
  709. CTokenState State(m_pCurToken->m_State);
  710. CPropFlag MiscPuctTail(MISC_PUNCT_TAIL);
  711. CPropFlag MiscPuctHead(MISC_PUNCT_HEAD);
  712. if (TEST_PROP(State.m_Properties, (MISC_PUNCT_TAIL | MISC_PUNCT_HEAD)))
  713. {
  714. ULONG ulCharRemoved = m_pCurToken->RemoveTailPunct(MiscPuctTail, State);
  715. ulCharRemoved += m_pCurToken->RemoveHeadPunct(MiscPuctHead, State);
  716. if (ulCharRemoved)
  717. {
  718. m_pCurToken->ComputeStateProperties(State);
  719. }
  720. }
  721. const CCliticsTerm* pCliticsTerm;
  722. pCliticsTerm = VerifyClitics(State);
  723. ULONG ulAddToStart = 0;
  724. ULONG ulDecFromEnd = 0;
  725. if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
  726. {
  727. ulAddToStart = pCliticsTerm->ulLen;
  728. }
  729. else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  730. {
  731. ulDecFromEnd = pCliticsTerm->ulLen;
  732. }
  733. bool bPatternContainOnlyUpperCase = true;
  734. ULONG ulSuffixSize = 0;
  735. if (TEST_PROP(State.m_Properties, PROP_POUND))
  736. {
  737. //
  738. // look for A# C#
  739. //
  740. ULONG ulEnd = State.m_ulEnd - ulDecFromEnd;
  741. ULONG ulStart = State.m_ulStart + ulAddToStart;
  742. if (ulEnd - ulStart != 2)
  743. {
  744. return false;
  745. }
  746. if (!TEST_PROP(GET_PROP(State.m_pwcsToken[ulEnd - 1]), PROP_POUND))
  747. {
  748. return false;
  749. }
  750. if (!TEST_PROP(GET_PROP(State.m_pwcsToken[ulStart]), PROP_UPPER_CASE))
  751. {
  752. return false;
  753. }
  754. ulSuffixSize = 1;
  755. }
  756. else
  757. {
  758. //
  759. // look for C++ COM+ ...
  760. //
  761. ULONG ul = State.m_ulEnd - ulDecFromEnd - 1;
  762. while ((int)ul >= (int)(State.m_ulStart + ulAddToStart))
  763. {
  764. if (!TEST_PROP(GET_PROP(State.m_pwcsToken[ul]), PROP_PLUS | PROP_MINUS))
  765. {
  766. break;
  767. }
  768. ulSuffixSize++;
  769. ul--;
  770. }
  771. if (ulSuffixSize > 2)
  772. {
  773. return false;
  774. }
  775. while ((int)ul >= (int)(State.m_ulStart + ulAddToStart))
  776. {
  777. CPropFlag prop(GET_PROP(State.m_pwcsToken[ul]));
  778. if (!HAS_PROP_EXTENDED_ALPHA(prop))
  779. {
  780. return false;
  781. }
  782. if (!TEST_PROP(prop, PROP_UPPER_CASE))
  783. {
  784. bPatternContainOnlyUpperCase = false;
  785. }
  786. ul--;
  787. }
  788. }
  789. Trace(
  790. elVerbose,
  791. s_tagTokenizerDecision,
  792. ("%*.*S is detected",
  793. State.m_ulEnd - State.m_ulStart,
  794. State.m_ulEnd - State.m_ulStart,
  795. State.m_pwcsToken + State.m_ulStart
  796. ));
  797. OutputMisc(
  798. State,
  799. bPatternContainOnlyUpperCase,
  800. ulSuffixSize,
  801. pCliticsTerm);
  802. return true;
  803. }
  804. bool CTokenizer::VerifyHyphenation()
  805. {
  806. //
  807. // looking for data-base
  808. //
  809. CPropFlag PunctHead(HYPHENATION_PUNCT_HEAD);
  810. CPropFlag PunctTail(HYPHENATION_PUNCT_TAIL);
  811. CTokenState State(m_pCurToken->m_State);
  812. if (TEST_PROP(State.m_Properties, (HYPHENATION_PUNCT_HEAD | HYPHENATION_PUNCT_TAIL)))
  813. {
  814. ULONG ulCharRemoved;
  815. ulCharRemoved = m_pCurToken->RemoveHeadPunct(PunctHead, State);
  816. ulCharRemoved += m_pCurToken->RemoveTailPunct(PunctTail, State);
  817. if (ulCharRemoved)
  818. {
  819. m_pCurToken->ComputeStateProperties(State);
  820. }
  821. }
  822. if (!HAS_PROP_DASH(State.m_Properties))
  823. {
  824. return false;
  825. }
  826. const CCliticsTerm* pCliticsTerm;
  827. pCliticsTerm = VerifyClitics(State);
  828. ULONG ulAddToStart = 0;
  829. ULONG ulDecFromEnd = 0;
  830. if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
  831. {
  832. ulAddToStart = pCliticsTerm->ulLen;
  833. }
  834. else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  835. {
  836. ulDecFromEnd = pCliticsTerm->ulLen;
  837. }
  838. ULONG ulCur = State.m_ulStart + ulAddToStart;
  839. ULONG ulEnd = State.m_ulEnd - ulDecFromEnd;
  840. bool bReadAlpha = false;
  841. do
  842. {
  843. while (ulCur < ulEnd)
  844. {
  845. if (HAS_PROP_EXTENDED_ALPHA(GET_PROP(m_pCurToken->m_State.m_pwcsToken[ulCur])))
  846. {
  847. ulCur++;
  848. bReadAlpha = true;
  849. continue;
  850. }
  851. break;
  852. }
  853. if (!bReadAlpha)
  854. {
  855. return false;
  856. }
  857. if (ulCur < ulEnd)
  858. {
  859. if (!HAS_PROP_DASH(GET_PROP(m_pCurToken->m_State.m_pwcsToken[ulCur])))
  860. {
  861. return false;
  862. }
  863. }
  864. else
  865. {
  866. break;
  867. }
  868. ulCur++;
  869. bReadAlpha = false;
  870. }
  871. while (ulCur < ulEnd);
  872. if (!bReadAlpha)
  873. {
  874. //
875. // last characters were not alpha, e.g. free-
  876. //
  877. return false;
  878. }
  879. Trace(
  880. elVerbose,
  881. s_tagTokenizerDecision,
  882. ("%*.*S is an hyphenation",
  883. State.m_ulEnd - State.m_ulStart,
  884. State.m_ulEnd - State.m_ulStart,
  885. State.m_pwcsToken + State.m_ulStart
  886. ));
  887. OutputHyphenation(State, pCliticsTerm);
  888. return true;
  889. }
  890. bool CTokenizer::VerifyParens()
  891. {
  892. CPropFlag PunctTail(PAREN_PUNCT_TAIL);
  893. CPropFlag PunctHead(PAREN_PUNCT_HEAD);
  894. CTokenState State(m_pCurToken->m_State);
  895. if (TEST_PROP(State.m_Properties, (PAREN_PUNCT_TAIL | PAREN_PUNCT_HEAD)))
  896. {
  897. ULONG ulCharRemoved;
  898. ulCharRemoved = m_pCurToken->RemoveTailPunct(PunctTail, State);
  899. ulCharRemoved += m_pCurToken->RemoveHeadPunct(PunctHead, State);
  900. if (ulCharRemoved)
  901. {
  902. m_pCurToken->ComputeStateProperties(State);
  903. }
  904. }
  905. //
  906. // looking for (s)
  907. //
  908. if ((State.m_ulEnd - State.m_ulStart) < 4)
  909. {
  910. return false;
  911. }
  912. if (0 != wcsncmp(State.m_pwcsToken + State.m_ulEnd - 3, L"(s)", 3))
  913. {
  914. return false;
  915. }
  916. for (ULONG ul = State.m_ulStart; ul < State.m_ulEnd - 3; ul++)
  917. {
  918. if (!HAS_PROP_EXTENDED_ALPHA(GET_PROP(State.m_pwcsToken[ul])))
  919. {
  920. return false;
  921. }
  922. }
  923. Trace(
  924. elVerbose,
  925. s_tagTokenizerDecision,
  926. ("%*.*S has (s) parenthesis",
  927. State.m_ulEnd - State.m_ulStart,
  928. State.m_ulEnd - State.m_ulStart,
  929. State.m_pwcsToken + State.m_ulStart
  930. ));
  931. OutputParens(State);
  932. return true;
  933. }
  934. const CCliticsTerm* CTokenizer::VerifyClitics(CTokenState& S)
  935. {
  936. if (TEST_PROP(GET_PROP(S.m_pwcsToken[S.m_ulStart]), PROP_APOSTROPHE))
  937. {
  938. S.m_ulStart++;
  939. if ((TEST_PROP(GET_PROP(S.m_pwcsToken[S.m_ulEnd - 1]), PROP_APOSTROPHE)) &&
  940. (S.m_ulEnd > S.m_ulStart))
  941. {
  942. S.m_ulEnd--;
  943. }
  944. m_pCurToken->ComputeStateProperties(S);
  945. }
  946. if (!(HAS_PROP_APOSTROPHE(S.m_Properties)))
  947. {
  948. return &g_EmptyClitics;
  949. }
  950. CPropFlag PunctTail(CLITICS_PUNC_TAIL);
  951. CPropFlag PunctHead(CLITICS_PUNCT_HEAD);
  952. CTokenState State(S);
  953. if (TEST_PROP(State.m_Properties, (CLITICS_PUNC_TAIL | CLITICS_PUNCT_HEAD)))
  954. {
  955. ULONG ulCharRemoved;
  956. ulCharRemoved = m_pCurToken->RemoveTailPunct(PunctTail, State);
  957. ulCharRemoved += m_pCurToken->RemoveHeadPunct(PunctHead, State);
  958. if (ulCharRemoved)
  959. {
  960. m_pCurToken->ComputeStateProperties(State);
  961. }
  962. }
  963. Trace(
  964. elVerbose,
  965. s_tagTokenizerSuspect,
  966. ("%*.*S suspected to have an apostophe",
  967. State.m_ulEnd - State.m_ulStart,
  968. State.m_ulEnd - State.m_ulStart,
  969. State.m_pwcsToken + State.m_ulStart
  970. ));
  971. ULONG ulApostrophePos = -1;
  972. ULONG ulCur;
  973. for (ulCur = State.m_ulStart; ulCur < State.m_ulEnd ; ulCur++)
  974. {
  975. if (TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur]), PROP_APOSTROPHE))
  976. {
  977. if ((-1 != ulApostrophePos) || (State.m_ulStart == ulCur))
  978. {
  979. //
980. // this is not the first \', so this is not a valid clitic,
981. // or the term starts with an apostrophe
  982. //
  983. return &g_EmptyClitics;
  984. }
  985. ulApostrophePos = ulCur;
  986. //
  987. // replace the apostrophe with an ascii apostrophe.
  988. //
  989. State.m_pwcsToken[ulCur] = L'\'';
  990. continue;
  991. }
  992. }
  993. //
  994. // looking for xxxxs'
  995. //
  996. if ((ulApostrophePos == State.m_ulEnd - 1) &&
  997. (State.m_pwcsToken[ulApostrophePos - 1] == L's'))
  998. {
  999. Trace(
  1000. elVerbose,
  1001. s_tagTokenizerDecision,
  1002. ("%*.*S has a s' clitcs",
  1003. State.m_ulEnd - State.m_ulStart,
  1004. State.m_ulEnd - State.m_ulStart,
  1005. State.m_pwcsToken + State.m_ulStart
  1006. ));
  1007. S = State;
  1008. return &g_SClitics;
  1009. }
  1010. //
  1011. // looking for tail clitics like xxx's
  1012. //
  1013. DictStatus status;
  1014. CCliticsTerm* pTerm;
  1015. short sResCount = 0;
  1016. if (ulCur > State.m_ulStart)
  1017. {
  1018. status = g_pClitics->m_trieClitics.trie_Find(
  1019. State.m_pwcsToken + ulApostrophePos,
  1020. TRIE_LONGEST_MATCH | TRIE_IGNORECASE,
  1021. 1,
  1022. &pTerm,
  1023. &sResCount);
  1024. if (sResCount && pTerm->ulLen == (State.m_ulEnd - ulApostrophePos))
  1025. {
  1026. Trace(
  1027. elVerbose,
  1028. s_tagTokenizerDecision,
  1029. ("%*.*S has a %S clitcs",
  1030. State.m_ulEnd - State.m_ulStart,
  1031. State.m_ulEnd - State.m_ulStart,
  1032. State.m_pwcsToken + State.m_ulStart,
  1033. pTerm->pwcs
  1034. ));
  1035. S = State;
  1036. return pTerm;
  1037. }
  1038. }
  1039. //
  1040. // looking for head clitics like l'xxxx
  1041. //
  1042. status = g_pClitics->m_trieClitics.trie_Find(
  1043. State.m_pwcsToken + State.m_ulStart,
  1044. TRIE_LONGEST_MATCH | TRIE_IGNORECASE,
  1045. 1,
  1046. &pTerm,
  1047. &sResCount);
  1048. if (sResCount)
  1049. {
  1050. Trace(
  1051. elVerbose,
  1052. s_tagTokenizerDecision,
  1053. ("%*.*S has a %S clitcs",
  1054. State.m_ulEnd - State.m_ulStart,
  1055. State.m_ulEnd - State.m_ulStart,
  1056. State.m_pwcsToken + State.m_ulStart,
  1057. pTerm->pwcs
  1058. ));
  1059. S = State;
  1060. return pTerm;
  1061. }
  1062. return &g_EmptyClitics;
  1063. }
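//
// VerifyClitics normalizes whatever apostrophe variant it finds to the ASCII
// L'\'' and then recognizes either a possessive tail (xxx's, xxxs') or a head
// clitic such as l'xxx through the clitics trie.  A tiny sketch of the two
// tail cases on a counted string (HasPossessiveTail is hypothetical and
// covers only 's and s'):
//
static bool HasPossessiveTail(const WCHAR* pwcs, ULONG ulLen)
{
    if (ulLen < 2)
    {
        return false;
    }
    // xxx's : apostrophe in the next-to-last slot, 's' in the last slot
    if ((pwcs[ulLen - 2] == L'\'') &&
        (pwcs[ulLen - 1] == L's' || pwcs[ulLen - 1] == L'S'))
    {
        return true;
    }
    // xxxs' : 's' in the next-to-last slot, apostrophe in the last slot
    if ((pwcs[ulLen - 1] == L'\'') &&
        (pwcs[ulLen - 2] == L's' || pwcs[ulLen - 2] == L'S'))
    {
        return true;
    }
    return false;
}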
  1064. bool CTokenizer::VerifyNumberOrTimeOrDate()
  1065. {
  1066. CPropFlag PunctHead(NUM_DATE_TIME_PUNCT_HEAD);
  1067. CPropFlag PunctTail(NUM_DATE_TIME_PUNCT_TAIL);
  1068. CTokenState State(m_pCurToken->m_State);
  1069. if (TEST_PROP(State.m_Properties,
  1070. (NUM_DATE_TIME_PUNCT_HEAD | NUM_DATE_TIME_PUNCT_TAIL)))
  1071. {
  1072. ULONG ulCharRemoved;
  1073. ulCharRemoved= m_pCurToken->RemoveHeadPunct(PunctHead, State);
  1074. ulCharRemoved += m_pCurToken->RemoveTailPunct(PunctTail, State);
  1075. if (ulCharRemoved)
  1076. {
  1077. m_pCurToken->ComputeStateProperties(State);
  1078. }
  1079. }
  1080. if ((TEST_PROP(
  1081. State.m_Properties,
  1082. (GET_PROP(m_apLangSupport->GetTimeSeperator()).m_ulFlag))) ||
  1083. HAS_PROP_ALPHA(State.m_Properties))
  1084. {
  1085. //
  1086. // suspected to be time 12:33 14:22 15:22:33
  1087. // or AM/PM time format 12:22AM 13PM
  1088. //
  1089. Trace(
  1090. elVerbose,
  1091. s_tagTokenizerSuspect,
  1092. ("%*.*S suspected to be AM/PM time",
  1093. State.m_ulEnd - State.m_ulStart,
  1094. State.m_ulEnd - State.m_ulStart,
  1095. State.m_pwcsToken + State.m_ulStart
  1096. ));
  1097. if (VerifyTime(State))
  1098. {
  1099. return true;
  1100. }
  1101. }
  1102. Trace(
  1103. elVerbose,
  1104. s_tagTokenizerSuspect,
  1105. ("%*.*S suspected to be a simple number",
  1106. State.m_ulEnd - State.m_ulStart,
  1107. State.m_ulEnd - State.m_ulStart,
  1108. State.m_pwcsToken + State.m_ulStart
  1109. ));
  1110. if (VerifyNumber(State))
  1111. {
  1112. return true;
  1113. }
  1114. if (TEST_PROP(State.m_Properties, PROP_DATE_SEPERATOR))
  1115. {
  1116. //
  1117. // suspected to be a date 1999-05-04 or 1998/11/10 1999.05.04
  1118. //
  1119. Trace(
  1120. elVerbose,
  1121. s_tagTokenizerSuspect,
  1122. ("%*.*S suspected to be a date",
  1123. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  1124. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  1125. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  1126. ));
  1127. return VerifyDate(State);
  1128. }
  1129. return false;
  1130. }
  1131. bool CTokenizer::VerifyTime(CTokenState& S)
  1132. {
  1133. CTokenState State(S);
  1134. CPropFlag PunctHead(TIME_ADDITIONAL_PUNCT_HEAD);
  1135. CPropFlag PunctTail(TIME_ADDITIONAL_PUNCT_TAIL);
  1136. if (TEST_PROP(State.m_Properties,
  1137. (TIME_ADDITIONAL_PUNCT_HEAD | TIME_ADDITIONAL_PUNCT_TAIL)))
  1138. {
  1139. ULONG ulCharRemoved;
  1140. ulCharRemoved= m_pCurToken->RemoveHeadPunct(PunctHead, State);
  1141. ulCharRemoved += m_pCurToken->RemoveTailPunct(PunctTail, State);
  1142. if (ulCharRemoved)
  1143. {
  1144. m_pCurToken->ComputeStateProperties(State);
  1145. }
  1146. }
  1147. if ((State.m_ulEnd - State.m_ulStart) > MAX_TIME_FORMAT_LEN)
  1148. {
  1149. return false;
  1150. }
  1151. WCHAR pwcsBuf[MAX_TIME_FORMAT_LEN + 1];
  1152. ULONG ulCur = State.m_ulStart;
  1153. WCHAR wcSeperator = 0xFFFF;
  1154. ULONG ul = 0;
  1155. //
1156. // formatting the text to a time format
  1157. //
  1158. while (ulCur < State.m_ulEnd)
  1159. {
  1160. CPropFlag prop(GET_PROP(State.m_pwcsToken[ulCur]));
  1161. if (HAS_PROP_NUMBER(prop))
  1162. {
  1163. pwcsBuf[ul] = L'#';
  1164. }
  1165. else if (State.m_pwcsToken[ulCur] == m_apLangSupport->GetTimeSeperator())
  1166. {
  1167. if (0xFFFF == wcSeperator)
  1168. {
  1169. wcSeperator = State.m_pwcsToken[ulCur];
  1170. }
  1171. else if (wcSeperator != State.m_pwcsToken[ulCur])
  1172. {
  1173. return false;
  1174. }
  1175. pwcsBuf[ul] = L':';
  1176. }
  1177. else if (HAS_PROP_ALPHA(prop) || HAS_PROP_PERIOD(prop))
  1178. {
  1179. pwcsBuf[ul] = State.m_pwcsToken[ulCur];
  1180. }
  1181. else
  1182. {
  1183. return false;
  1184. }
  1185. ul++;
  1186. ulCur++;
  1187. }
  1188. pwcsBuf[ul] = L'\0';
  1189. CTimeTerm* pTerm;
  1190. short sResCount = 0;
  1191. DictStatus status;
  1192. status = g_pTimeFormat->m_trieTimeFormat.trie_Find(
  1193. pwcsBuf,
  1194. TRIE_LONGEST_MATCH | TRIE_IGNORECASE,
  1195. 1,
  1196. &pTerm,
  1197. &sResCount);
  1198. if (!(sResCount && (pTerm->bLen == ul)))
  1199. {
  1200. return false;
  1201. }
  1202. LONG lHour;
  1203. LONG lMin;
  1204. LONG lSec;
  1205. TimeFormat AmPm;
  1206. GetValuesFromTimeString(
  1207. pTerm,
  1208. State.m_pwcsToken + State.m_ulStart ,
  1209. &lHour,
  1210. &lMin,
  1211. &lSec,
  1212. &AmPm);
  1213. if (None == AmPm)
  1214. {
  1215. if (lHour > 24)
  1216. {
  1217. return false;
  1218. }
  1219. }
  1220. else
  1221. {
  1222. if (lHour > 12)
  1223. {
  1224. return false;
  1225. }
  1226. if (Am == AmPm)
  1227. {
  1228. if (12 == lHour)
  1229. {
  1230. lHour = 0;
  1231. }
  1232. }
  1233. else
  1234. {
  1235. if (lHour < 12)
  1236. {
  1237. lHour += 12;
  1238. }
  1239. }
  1240. }
  1241. if (lMin > 59)
  1242. {
  1243. return false;
  1244. }
  1245. if (lSec > 59)
  1246. {
  1247. return false;
  1248. }
  1249. WCHAR pwcsTime[9] = {L'\0',L'\0',L'\0',L'\0',L'\0',L'\0',L'\0',L'\0',L'\0',};
  1250. swprintf(pwcsTime, L"TT%02d%02d", lHour, lMin);
  1251. Trace(
  1252. elVerbose,
  1253. s_tagTokenizerDecision,
  1254. ("%*.*S is a time -> %S",
  1255. State.m_ulEnd - State.m_ulStart,
  1256. State.m_ulEnd - State.m_ulStart,
  1257. State.m_pwcsToken + State.m_ulStart,
  1258. pwcsTime));
  1259. OutputTime(pwcsTime, State);
  1260. return true;
  1261. }
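//
// VerifyTime first rewrites the token into a format template -- every digit
// becomes L'#', the locale time separator becomes L':', letters and periods
// are copied through -- and then looks that template up in the g_pTimeFormat
// trie.  A small standalone sketch of the template step (BuildTimeTemplate is
// hypothetical and hard-codes L':' as the separator instead of asking
// CLangSupport::GetTimeSeperator):
//
static bool BuildTimeTemplate(const WCHAR* pwcsIn, WCHAR* pwcsOut, ULONG ulOutSize)
{
    ULONG ul = 0;
    for (; pwcsIn[ul] != L'\0'; ul++)
    {
        if (ul + 1 >= ulOutSize)
        {
            return false;                   // template does not fit
        }
        if (pwcsIn[ul] >= L'0' && pwcsIn[ul] <= L'9')
        {
            pwcsOut[ul] = L'#';             // any digit
        }
        else
        {
            pwcsOut[ul] = pwcsIn[ul];       // separator, AM/PM letters, periods
        }
    }
    pwcsOut[ul] = L'\0';
    return true;                            // "12:33PM" -> "##:##PM"
}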
  1262. bool CTokenizer::VerifyDate(CTokenState& S)
  1263. {
  1264. CTokenState State(S);
  1265. CPropFlag PunctHead(DATE_ADDITIONAL_PUNCT_HEAD);
  1266. CPropFlag PunctTail(DATE_ADDITIONAL_PUNCT_TAIL);
  1267. if (TEST_PROP(State.m_Properties,
  1268. (DATE_ADDITIONAL_PUNCT_HEAD | DATE_ADDITIONAL_PUNCT_TAIL)))
  1269. {
  1270. ULONG ulCharRemoved;
  1271. ulCharRemoved= m_pCurToken->RemoveHeadPunct(PunctHead, State);
  1272. ulCharRemoved += m_pCurToken->RemoveTailPunct(PunctTail, State);
  1273. if (ulCharRemoved)
  1274. {
  1275. m_pCurToken->ComputeStateProperties(State);
  1276. }
  1277. }
  1278. WCHAR pwcsBuf[MAX_DATE_FORMAT_LEN + 1];
  1279. if (State.m_ulEnd - State.m_ulStart > MAX_DATE_FORMAT_LEN)
  1280. {
  1281. return false;
  1282. }
  1283. ULONG ulCur = State.m_ulStart;
  1284. WCHAR wcSeperator = 0xFFFF;
  1285. ULONG ul = 0;
  1286. //
  1287. // formatting the text to a date format
  1288. //
  1289. while (ulCur < State.m_ulEnd)
  1290. {
  1291. CPropFlag prop(GET_PROP(State.m_pwcsToken[ulCur]));
  1292. if (HAS_PROP_NUMBER(prop))
  1293. {
  1294. pwcsBuf[ul] = L'#';
  1295. }
  1296. else if (HAS_PROP_PERIOD(prop) ||
  1297. HAS_PROP_DASH(prop) ||
  1298. HAS_PROP_SLASH(prop))
  1299. {
  1300. if (0xFFFF == wcSeperator)
  1301. {
  1302. wcSeperator = State.m_pwcsToken[ulCur];
  1303. }
  1304. else if (wcSeperator != State.m_pwcsToken[ulCur])
  1305. {
  1306. return false;
  1307. }
  1308. pwcsBuf[ul] = L'.';
  1309. }
  1310. else
  1311. {
  1312. return false;
  1313. }
  1314. ul++;
  1315. ulCur++;
  1316. }
  1317. pwcsBuf[ul] = L'\0';
  1318. CDateTerm* pTerm;
  1319. short sResCount = 0;
  1320. DictStatus status;
  1321. status = g_pDateFormat->m_trieDateFormat.trie_Find(
  1322. pwcsBuf,
  1323. TRIE_LONGEST_MATCH | TRIE_IGNORECASE,
  1324. 1,
  1325. &pTerm,
  1326. &sResCount);
  1327. if (!(sResCount && (pTerm->bLen == ul)))
  1328. {
  1329. return false;
  1330. }
  1331. LONG lD_M1;
  1332. LONG lD_M2;
  1333. LONG lYear;
  1334. GetValuesFromDateString(
  1335. pTerm,
  1336. State.m_pwcsToken + State.m_ulStart,
  1337. &lD_M1,
  1338. &lD_M2,
  1339. &lYear);
  1340. LONG lDay;
  1341. LONG lMonth;
  1342. //
  1343. // language dependent
  1344. //
  1345. if (m_apLangSupport->IsDayMonthOrder() ||
  1346. pTerm->bType == YYMMDD_TYPE)
  1347. {
  1348. lDay = lD_M1;
  1349. lMonth = lD_M2;
  1350. }
  1351. else
  1352. {
  1353. lDay = lD_M2;
  1354. lMonth = lD_M1;
  1355. }
  1356. if (!((lDay > 0) && (lDay <= 31)))
  1357. {
  1358. return false;
  1359. }
  1360. if (!((lMonth > 0) && (lMonth <= 12)))
  1361. {
  1362. return false;
  1363. }
  1364. WCHAR pwcsDate1[11] = { L'D', L'D', L'0', L'0', L'0', L'0', L'0', L'0', L'0', L'0', L'\0'};
  1365. WCHAR pwcsDate2[11];
  1366. bool bY2K = false;
  1367. if (lYear <= 99) // Y2k bug
  1368. {
  1369. _ltow(lYear + 1900, pwcsDate1 + 2, 10);
  1370. bY2K = true;
  1371. }
  1372. else if (lYear < 1000)
  1373. {
  1374. _ltow(lYear, pwcsDate1 + 3, 10);
  1375. }
  1376. else
  1377. {
  1378. _ltow(lYear, pwcsDate1 + 2, 10);
  1379. }
  1380. if (lMonth < 10)
  1381. {
  1382. pwcsDate1[6] = L'0';
  1383. _ltow(lMonth, pwcsDate1 + 7, 10);
  1384. }
  1385. else
  1386. {
  1387. _ltow(lMonth, pwcsDate1 + 6, 10);
  1388. }
  1389. if (lDay < 10)
  1390. {
  1391. pwcsDate1[8] = L'0';
  1392. _ltow(lDay, pwcsDate1 + 9, 10);
  1393. }
  1394. else
  1395. {
  1396. _ltow(lDay, pwcsDate1 + 8, 10);
  1397. }
  1398. if (bY2K)
  1399. {
  1400. wcscpy(pwcsDate2, pwcsDate1);
  1401. pwcsDate2[2] = L'2';
  1402. pwcsDate2[3] = L'0';
  1403. }
  1404. Trace(
  1405. elVerbose,
  1406. s_tagTokenizerDecision,
  1407. ("%*.*S is a date",
  1408. State.m_ulEnd - State.m_ulStart,
  1409. State.m_ulEnd - State.m_ulStart,
  1410. State.m_pwcsToken + State.m_ulStart
  1411. ));
  1412. if (bY2K)
  1413. {
  1414. OutputDate(pwcsDate1, pwcsDate2, State);
  1415. }
  1416. else
  1417. {
  1418. OutputDate(pwcsDate1, NULL, State);
  1419. }
  1420. return true;
  1421. }
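//
// VerifyDate emits a normalized key of the form DDyyyymmdd, and a two-digit
// year is widened to 19yy with a second 20yy variant emitted alongside it, so
// both readings of the century stay searchable.  A minimal sketch of the key
// construction using plain integer arithmetic (FormatDateKey is hypothetical
// and skips the day/month range checks performed above):
//
static void FormatDateKey(LONG lYear, LONG lMonth, LONG lDay, WCHAR* pwcsKey)
{
    // pwcsKey must have room for 11 WCHARs ("DDyyyymmdd" plus the NUL)
    pwcsKey[0] = L'D';
    pwcsKey[1] = L'D';
    LONG lValue = (lYear * 10000) + (lMonth * 100) + lDay;
    for (int i = 9; i >= 2; i--)
    {
        pwcsKey[i] = (WCHAR)(L'0' + (lValue % 10));     // least significant digit first
        lValue /= 10;
    }
    pwcsKey[10] = L'\0';
}
//
// For example, a token parsed as year 04, month 5, day 4 would be indexed
// under both FormatDateKey(1904, 5, 4, ...) and FormatDateKey(2004, 5, 4, ...).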
  1422. bool CTokenizer::VerifyNumber(CTokenState& S)
  1423. {
  1424. CTokenState State(S);
  1425. WCHAR pwcsNumber[TOKENIZER_MAXBUFFERLIMIT + 10];
  1426. ULONG ulOutLen;
  1427. ULONG ulOffsetToTxt;
  1428. const CCliticsTerm* pCliticsTerm;
  1429. pCliticsTerm = VerifyClitics(State);
  1430. ULONG ulAddToStart = 0;
  1431. ULONG ulDecFromEnd = 0;
  1432. if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
  1433. {
  1434. ulAddToStart = pCliticsTerm->ulLen;
  1435. }
  1436. else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  1437. {
  1438. ulDecFromEnd = pCliticsTerm->ulLen;
  1439. }
  1440. bool fRet = CheckAndCreateNumber(
  1441. State.m_pwcsToken + State.m_ulStart + ulAddToStart,
  1442. State.m_ulEnd - State.m_ulStart - ulAddToStart - ulDecFromEnd,
  1443. pwcsNumber,
  1444. &ulOffsetToTxt,
  1445. &ulOutLen);
  1446. if (!fRet)
  1447. {
  1448. return false;
  1449. }
  1450. Trace(
  1451. elVerbose,
  1452. s_tagTokenizerDecision,
  1453. ("%*.*S is a number",
  1454. State.m_ulEnd - State.m_ulStart,
  1455. State.m_ulEnd - State.m_ulStart,
  1456. State.m_pwcsToken + State.m_ulStart
  1457. ));
  1458. OutputNumbers(State, ulOutLen, pwcsNumber + ulOffsetToTxt, pCliticsTerm);
  1459. return true;
  1460. }
  1461. bool CTokenizer::VerifyCurrency()
  1462. {
  1463. //
  1464. // format is either $12.22 or 12.22$
  1465. //
  1466. CPropFlag PunctHead(CURRENCY_PUNCT_HEAD);
  1467. CPropFlag PunctTail(CURRENCY_PUNCT_TAIL);
  1468. CTokenState State(m_pCurToken->m_State);
  1469. if (TEST_PROP(State.m_Properties,
  1470. (CURRENCY_PUNCT_HEAD | CURRENCY_PUNCT_TAIL)))
  1471. {
  1472. ULONG ulCharRemoved;
  1473. ulCharRemoved= m_pCurToken->RemoveHeadPunct(PunctHead, State);
  1474. ulCharRemoved += m_pCurToken->RemoveTailPunct(PunctTail, State);
  1475. if (ulCharRemoved)
  1476. {
  1477. m_pCurToken->ComputeStateProperties(State);
  1478. }
  1479. }
  1480. const CCliticsTerm* pCliticsTerm;
  1481. pCliticsTerm = VerifyClitics(State);
  1482. ULONG ulAddToStart = 0;
  1483. ULONG ulDecFromEnd = 0;
  1484. if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
  1485. {
  1486. ulAddToStart = pCliticsTerm->ulLen;
  1487. }
  1488. else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  1489. {
  1490. ulDecFromEnd = pCliticsTerm->ulLen;
  1491. }
  1492. WCHAR wchCurrency;
  1493. WCHAR pwcsCurrency[TOKENIZER_MAXBUFFERLIMIT + 10];
  1494. WCHAR* pwcsStr = State.m_pwcsToken + State.m_ulStart;
  1495. if (HAS_PROP_CURRENCY(GET_PROP(State.m_pwcsToken[State.m_ulStart + ulAddToStart])))
  1496. {
  1497. wchCurrency = State.m_pwcsToken[State.m_ulStart + ulAddToStart];
  1498. pwcsStr += 1;
  1499. }
  1500. else if (HAS_PROP_CURRENCY(GET_PROP(State.m_pwcsToken[State.m_ulEnd - 1 - ulDecFromEnd])))
  1501. {
  1502. wchCurrency = State.m_pwcsToken[State.m_ulEnd - 1 - ulDecFromEnd];
  1503. }
  1504. else
  1505. {
  1506. return false;
  1507. }
  1508. ULONG ulOutLen;
  1509. ULONG ulOffsetToTxt;
  1510. if (false == CheckAndCreateNumber(
  1511. pwcsStr + ulAddToStart,
  1512. State.m_ulEnd - State.m_ulStart - 1 - ulAddToStart - ulDecFromEnd,
  1513. pwcsCurrency,
  1514. &ulOffsetToTxt,
  1515. &ulOutLen))
  1516. {
  1517. return false;
  1518. }
  1519. Assert(ulOffsetToTxt + ulOutLen + 1 < m_ulMaxTokenSize + 4);
  1520. pwcsCurrency[ulOffsetToTxt + ulOutLen] = wchCurrency;
  1521. pwcsCurrency[ulOffsetToTxt + ulOutLen + 1] = L'\0';
  1522. Trace(
  1523. elVerbose,
  1524. s_tagTokenizerDecision,
  1525. ("%*.*S is a currency",
  1526. State.m_ulEnd - State.m_ulStart,
  1527. State.m_ulEnd - State.m_ulStart,
  1528. State.m_pwcsToken + State.m_ulStart
  1529. ));
  1530. OutputCurrency(ulOutLen+1, pwcsCurrency + ulOffsetToTxt , State, pCliticsTerm);
  1531. return true;
  1532. }
  1533. bool CTokenizer::VerifyCommersialSign()
  1534. {
  1535. CTokenState State(m_pCurToken->m_State);
  1536. CPropFlag CommPunctTail(COMMERSIAL_SIGN_PUNCT_TAIL);
  1537. CPropFlag CommPunctHead(COMMERSIAL_SIGN_PUNCT_HEAD);
  1538. if (TEST_PROP(State.m_Properties, (COMMERSIAL_SIGN_PUNCT_TAIL | COMMERSIAL_SIGN_PUNCT_HEAD)))
  1539. {
  1540. ULONG ulCharRemoved = m_pCurToken->RemoveTailPunct(CommPunctTail, State);
  1541. ulCharRemoved += m_pCurToken->RemoveHeadPunct(CommPunctHead, State);
  1542. if (ulCharRemoved)
  1543. {
  1544. m_pCurToken->ComputeStateProperties(State);
  1545. }
  1546. }
  1547. if (TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulEnd - 1]),
  1548. PROP_COMMERSIAL_SIGN))
  1549. {
  1550. //
1551. // the length of the token must be greater than 1 since it includes an alpha
1552. // and the commercial sign
  1553. //
  1554. Assert((State.m_ulEnd - State.m_ulStart) > 1);
  1555. OutputCommersialSignToken(State);
  1556. return true;
  1557. }
  1558. return false;
  1559. }
  1560. void CTokenizer::ProcessDefault()
  1561. {
  1562. CTokenState State(m_pCurToken->m_State);
  1563. if (TEST_PROP(State.m_Properties, PROP_DEFAULT_BREAKER))
  1564. {
  1565. if (TEST_PROP(State.m_Properties, PROP_FIRST_LEVEL_BREAKER))
  1566. {
  1567. CPropFlag prop(PROP_FIRST_LEVEL_BREAKER);
  1568. BreakCompundString(State, prop);
  1569. return;
  1570. }
  1571. if (TEST_PROP(State.m_Properties, PROP_SECOND_LEVEL_BREAKER))
  1572. {
  1573. CPropFlag prop(PROP_SECOND_LEVEL_BREAKER);
  1574. BreakCompundString(State, prop);
  1575. return;
  1576. }
  1577. }
  1578. //
  1579. // this is a simple token
  1580. //
  1581. const CCliticsTerm* pCliticsTerm;
  1582. pCliticsTerm = VerifyClitics(State);
  1583. if (pCliticsTerm == &g_EmptyClitics)
  1584. {
  1585. if (TEST_PROP(State.m_Properties, PROP_NBS))
  1586. {
  1587. CPropFlag prop(PROP_NBS);
  1588. BreakCompundString(State, prop);
  1589. return;
  1590. }
  1591. CPropFlag PunctHead(SIMPLE_PUNCT_HEAD);
  1592. CPropFlag PunctTail(SIMPLE_PUNCT_TAIL);
  1593. if (TEST_PROP(State.m_Properties,
  1594. (SIMPLE_PUNCT_HEAD | SIMPLE_PUNCT_TAIL)))
  1595. {
  1596. ULONG ulCharRemoved;
  1597. ulCharRemoved= m_pCurToken->RemoveHeadPunct(PunctHead, State);
  1598. ulCharRemoved += m_pCurToken->RemoveTailPunct(PunctTail, State);
  1599. if ( TEST_PROP(State.m_Properties, PROP_UNDERSCORE) )
  1600. {
  1601. bool hasFrontUnderscore =
  1602. (State.m_ulStart > m_pCurToken->m_State.m_ulStart) &&
  1603. TEST_PROP( GET_PROP(State.m_pwcsToken[State.m_ulStart-1]),
  1604. PROP_UNDERSCORE ) &&
  1605. TEST_PROP( GET_PROP(State.m_pwcsToken[State.m_ulStart]),
  1606. PROP_ALPHA_NUMERIC );
  1607. bool hasBackUnderscore =
  1608. (State.m_ulEnd < m_pCurToken->m_State.m_ulEnd) &&
  1609. TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulEnd]),
  1610. PROP_UNDERSCORE) &&
  1611. TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulEnd-1]),
  1612. PROP_ALPHA_NUMERIC);
  1613. //
  1614. // Note: To change the policy to "leave ALL attached underscore
1615. // sequences", simply change the condition below to:
  1616. // if ( (hasFrontUnderscore || hasBackUnderscore) )
  1617. //
  1618. if ( (hasFrontUnderscore ^ hasBackUnderscore) )
  1619. {
  1620. ulCharRemoved -=
  1621. AddBackUnderscores(
  1622. State,
  1623. hasFrontUnderscore,
  1624. hasBackUnderscore
  1625. );
  1626. }
  1627. } // if ( TEST_PROP(State.m_Properties, PROP_UNDERSCORE) )
  1628. if (ulCharRemoved)
  1629. {
  1630. m_pCurToken->ComputeStateProperties(State);
  1631. }
  1632. }
  1633. }
  1634. if (State.m_ulEnd == State.m_ulStart)
  1635. {
  1636. //
1637. // case where we removed all characters in the above statement
  1638. //
  1639. return;
  1640. }
  1641. Trace(
  1642. elVerbose,
  1643. s_tagTokenizerDecision,
  1644. ("%*.*S is a simple token",
  1645. State.m_ulEnd - State.m_ulStart,
  1646. State.m_ulEnd - State.m_ulStart,
  1647. State.m_pwcsToken + State.m_ulStart
  1648. ));
  1649. OutputSimpleToken(State, pCliticsTerm);
  1650. }
  1651. //
  1652. // CTokenizer::AddBackUnderscores:
  1653. //
1654. // Treat cases of a "simple" token with a head and/or tail underscore
1655. // sequence (a consecutive-underscore prefix or suffix); those
1656. // do not get stripped off and remain part of the token.
1657. // This routine is called after underscore removal (as a result of
1658. // Remove[Head|Tail]Punct) and adds them back in.
  1659. //
  1660. // return value: Number of underscores added back in.
  1661. //
  1662. ULONG
  1663. CTokenizer::AddBackUnderscores(
  1664. IN CTokenState& State,
  1665. IN bool hasFrontUnderscore,
  1666. IN bool hasBackUnderscore
  1667. )
  1668. {
  1669. ULONG ulCharsAdded = 0;
  1670. if ( hasFrontUnderscore )
  1671. {
  1672. // Move left over consecutive underscores
  1673. ulCharsAdded = m_pCurToken->FindLeftmostUnderscore(State);
  1674. }
  1675. if ( hasBackUnderscore )
  1676. {
  1677. // Move right over consecutive underscores
  1678. ulCharsAdded += m_pCurToken->FindRightmostUnderscore(State);
  1679. } // if ( hasFrontUnderscore )
  1680. return ulCharsAdded;
  1681. } // CTokenizer::AddBackUnderscores()
  1682. void CTokenizer::OutputUrl(CTokenState& State)
  1683. {
  1684. HRESULT hr;
  1685. ULONG ulOffsetInTxtSourceBuffer =
  1686. m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  1687. ULONG ulCur = State.m_ulStart;
  1688. ULONG ulStart = ulCur;
  1689. ULONG ulLenInTxtSourceBuffer = 0;
  1690. ULONG ulOffsetDueToAnEscapeChar;
  1691. while (ulCur < State.m_ulEnd)
  1692. {
  1693. ulLenInTxtSourceBuffer++;
  1694. ulOffsetDueToAnEscapeChar = 0;
  1695. if ((State.m_pwcsToken[ulCur] == L'%') &&
  1696. (ulCur <= State.m_ulEnd - 2))
  1697. {
  1698. //
1699. // replacing escape characters with real ones.
  1700. //
  1701. if (TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur+1]) , PROP_XDIGIT) &&
  1702. TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur+2]) , PROP_XDIGIT))
  1703. {
  1704. short sVal;
  1705. sVal = ConvertHexCharToNumber(State.m_pwcsToken[ulCur + 1]);
  1706. sVal *= 16;
  1707. sVal += ConvertHexCharToNumber(State.m_pwcsToken[ulCur + 2]);
  1708. State.m_pwcsToken[ulCur+2] = sVal;
  1709. for (ULONG ul = ulCur -1 ; ul >= ulStart; ul--)
  1710. {
  1711. State.m_pwcsToken[ul+2] = State.m_pwcsToken[ul];
  1712. }
  1713. ulCur += 2;
  1714. ulStart+=2;
  1715. ulOffsetDueToAnEscapeChar = 2;
  1716. ulLenInTxtSourceBuffer += 2;
  1717. }
  1718. else if ((ulCur <= State.m_ulEnd - 5) &&
  1719. ((State.m_pwcsToken[ulCur+1] == L'u') ||
  1720. (State.m_pwcsToken[ulCur+1] == L'U')) &&
  1721. TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur+2]) , PROP_XDIGIT) &&
  1722. TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur+3]) , PROP_XDIGIT) &&
  1723. TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur+4]) , PROP_XDIGIT) &&
  1724. TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur+5]) , PROP_XDIGIT))
  1725. {
  1726. short sVal;
  1727. sVal = ConvertHexCharToNumber(State.m_pwcsToken[ulCur + 2]);
  1728. sVal *= 0x1000;
  1729. sVal += ConvertHexCharToNumber(State.m_pwcsToken[ulCur + 3]);
  1730. sVal *= 0x100;
  1731. sVal += ConvertHexCharToNumber(State.m_pwcsToken[ulCur + 4]);
  1732. sVal *= 0x10;
  1733. sVal += ConvertHexCharToNumber(State.m_pwcsToken[ulCur + 5]);
  1734. State.m_pwcsToken[ulCur+5] = sVal;
  1735. for (ULONG ul = ulCur -1 ; ul >= ulStart; ul--)
  1736. {
  1737. State.m_pwcsToken[ul+5] = State.m_pwcsToken[ul];
  1738. }
  1739. ulCur += 5;
  1740. ulStart+=5;
  1741. ulOffsetDueToAnEscapeChar = 5;
  1742. ulLenInTxtSourceBuffer += 5;
  1743. }
  1744. }
  1745. if ( IS_BREAKER( State.m_pwcsToken[ulCur] ) )
  1746. {
  1747. if (ulCur - ulStart == 0)
  1748. {
  1749. //
  1750. // only punctuation
  1751. //
  1752. ulCur++;
  1753. ulStart = ulCur;
  1754. ulOffsetInTxtSourceBuffer += ulOffsetDueToAnEscapeChar + 1;
  1755. ulLenInTxtSourceBuffer = 0;
  1756. continue;
  1757. }
  1758. hr = m_apWordSink->PutWord(
  1759. ulCur - ulStart,
  1760. &State.m_pwcsToken[ulStart],
  1761. ulLenInTxtSourceBuffer - 1 - ulOffsetDueToAnEscapeChar,
  1762. ulOffsetInTxtSourceBuffer);
  1763. if (FAILED(hr))
  1764. {
  1765. THROW_HRESULT_EXCEPTION(hr);
  1766. }
  1767. ulStart = ulCur + 1;
  1768. ulOffsetInTxtSourceBuffer += ulLenInTxtSourceBuffer;
  1769. ulLenInTxtSourceBuffer = 0;
  1770. }
  1771. ulCur++;
  1772. }
  1773. //
  1774. // last word.
  1775. //
  1776. if (ulStart < ulCur)
  1777. {
  1778. hr = m_apWordSink->PutWord(
  1779. ulCur - ulStart,
  1780. &State.m_pwcsToken[ulStart],
  1781. ulLenInTxtSourceBuffer,
  1782. ulOffsetInTxtSourceBuffer);
  1783. if (FAILED(hr))
  1784. {
  1785. THROW_HRESULT_EXCEPTION(hr);
  1786. }
  1787. }
  1788. }
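//
// OutputUrl decodes %XX and %uXXXX escapes in place and only then splits the
// url on breaker characters, so "foo%20bar" ends up indexed as "foo" and
// "bar".  A self-contained sketch of the two-digit case (DecodePercentEscape
// is a hypothetical helper; the real code also shifts the preceding
// characters so the offsets into the text source stay correct):
//
static bool DecodePercentEscape(const WCHAR* pwcsEscape, WCHAR* pwchOut)
{
    // expects pwcsEscape to point at '%' followed by two hex digits
    if (pwcsEscape[0] != L'%')
    {
        return false;
    }
    short sVal = 0;
    for (int i = 1; i <= 2; i++)
    {
        WCHAR wch = pwcsEscape[i];
        sVal = (short)(sVal * 16);
        if (wch >= L'0' && wch <= L'9')
        {
            sVal = (short)(sVal + (wch - L'0'));
        }
        else if (wch >= L'A' && wch <= L'F')
        {
            sVal = (short)(sVal + (wch - L'A' + 10));
        }
        else if (wch >= L'a' && wch <= L'f')
        {
            sVal = (short)(sVal + (wch - L'a' + 10));
        }
        else
        {
            return false;                   // not a hex digit
        }
    }
    *pwchOut = (WCHAR)sVal;                 // "%20" -> L' '
    return true;
}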
  1789. void CTokenizer::OutputNumbers(
  1790. CTokenState& State,
  1791. ULONG ulLen,
  1792. WCHAR* pwcsNumber,
  1793. const CCliticsTerm* pCliticsTerm)
  1794. {
  1795. HRESULT hr;
  1796. //
  1797. // Input: 1.22 Output: 1.22, NN1D22
  1798. //
  1799. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  1800. if (ulLen > m_ulMaxTokenSize)
  1801. {
  1802. hr = m_apWordSink->PutWord(
  1803. State.m_ulEnd - State.m_ulStart,
  1804. &State.m_pwcsToken[State.m_ulStart],
  1805. State.m_ulEnd - State.m_ulStart,
  1806. ulOffsetInTxtSourceBuffer);
  1807. if (FAILED(hr))
  1808. {
  1809. THROW_HRESULT_EXCEPTION(hr);
  1810. }
  1811. return;
  1812. }
  1813. hr = m_apWordSink->PutAltWord(
  1814. State.m_ulEnd - State.m_ulStart,
  1815. &State.m_pwcsToken[State.m_ulStart],
  1816. State.m_ulEnd - State.m_ulStart,
  1817. ulOffsetInTxtSourceBuffer);
  1818. if (FAILED(hr))
  1819. {
  1820. THROW_HRESULT_EXCEPTION(hr);
  1821. }
  1822. if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
  1823. {
  1824. hr = m_apWordSink->PutAltWord(
  1825. State.m_ulEnd - State.m_ulStart - pCliticsTerm->ulLen,
  1826. State.m_pwcsToken + State.m_ulStart + pCliticsTerm->ulLen,
  1827. State.m_ulEnd - State.m_ulStart,
  1828. ulOffsetInTxtSourceBuffer);
  1829. if (FAILED(hr))
  1830. {
  1831. THROW_HRESULT_EXCEPTION(hr);
  1832. }
  1833. }
  1834. else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  1835. {
  1836. hr = m_apWordSink->PutAltWord(
  1837. State.m_ulEnd - State.m_ulStart - pCliticsTerm->ulLen,
  1838. State.m_pwcsToken + State.m_ulStart,
  1839. State.m_ulEnd - State.m_ulStart,
  1840. ulOffsetInTxtSourceBuffer);
  1841. if (FAILED(hr))
  1842. {
  1843. THROW_HRESULT_EXCEPTION(hr);
  1844. }
  1845. }
  1846. hr = m_apWordSink->PutWord(
  1847. ulLen,
  1848. pwcsNumber,
  1849. State.m_ulEnd - State.m_ulStart,
  1850. ulOffsetInTxtSourceBuffer);
  1851. if (FAILED(hr))
  1852. {
  1853. THROW_HRESULT_EXCEPTION(hr);
  1854. }
  1855. }
  1856. void CTokenizer::OutputParens(CTokenState& State)
  1857. {
  1858. HRESULT hr;
  1859. //
  1860. // format is xxx(s)
  1861. // Input: xxx(s) Output: xxx
  1862. //
  1863. State.m_pwcsToken[State.m_ulEnd - 3] = L'\0';
  1864. hr = m_apWordSink->PutWord(
  1865. State.m_ulEnd - 3 - State.m_ulStart,
  1866. &State.m_pwcsToken[State.m_ulStart],
  1867. State.m_ulEnd - State.m_ulStart,
  1868. m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State));
  1869. if (FAILED(hr))
  1870. {
  1871. THROW_HRESULT_EXCEPTION(hr);
  1872. }
  1873. }
  1874. void CTokenizer::OutputAcronym(CTokenState& State, const CCliticsTerm* pCliticsTerm)
  1875. {
  1876. HRESULT hr;
  1877. //
  1878. // Input: I.B.M Output: I.B.M, IBM
  1879. //
  1880. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  1881. ULONG ulAddToStart = 0;
  1882. ULONG ulDecFromEnd = 0;
  1883. if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
  1884. {
  1885. ulAddToStart = pCliticsTerm->ulLen;
  1886. }
  1887. else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  1888. {
  1889. ulDecFromEnd = pCliticsTerm->ulLen;
  1890. }
  1891. hr = m_apWordSink->PutAltWord(
  1892. State.m_ulEnd - ulDecFromEnd - (State.m_ulStart + ulAddToStart),
  1893. State.m_pwcsToken + State.m_ulStart + ulAddToStart,
  1894. State.m_ulEnd - State.m_ulStart,
  1895. ulOffsetInTxtSourceBuffer);
  1896. if (FAILED(hr))
  1897. {
  1898. THROW_HRESULT_EXCEPTION(hr);
  1899. }
  1900. ULONG ulCur = State.m_ulStart + ulAddToStart;
  1901. ULONG ulNext = ulCur;
  1902. while (ulCur < State.m_ulEnd)
  1903. {
  1904. if (!HAS_PROP_PERIOD(GET_PROP(State.m_pwcsToken[ulCur])))
  1905. {
  1906. State.m_pwcsToken[ulNext] = State.m_pwcsToken[ulCur];
  1907. ulNext++;
  1908. ulCur++;
  1909. continue;
  1910. }
  1911. ulCur++;
  1912. }
  1913. if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  1914. {
  1915. hr = m_apWordSink->PutAltWord(
  1916. ulNext - (State.m_ulStart + ulAddToStart),
  1917. State.m_pwcsToken + State.m_ulStart + ulAddToStart,
  1918. State.m_ulEnd - State.m_ulStart,
  1919. ulOffsetInTxtSourceBuffer);
  1920. if (FAILED(hr))
  1921. {
  1922. THROW_HRESULT_EXCEPTION(hr);
  1923. }
  1924. }
  1925. hr = m_apWordSink->PutWord(
  1926. ulNext - ulDecFromEnd - (State.m_ulStart + ulAddToStart),
  1927. State.m_pwcsToken + State.m_ulStart + ulAddToStart,
  1928. State.m_ulEnd - State.m_ulStart,
  1929. ulOffsetInTxtSourceBuffer);
  1930. if (FAILED(hr))
  1931. {
  1932. THROW_HRESULT_EXCEPTION(hr);
  1933. }
  1934. }
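//
// Illustration (hedged example): with no clitic attached, L"I.B.M" first goes
// out verbatim as an alternate word, the periods are then squeezed out in
// place, and the compressed form is indexed as the word:
//
//      m_apWordSink->PutAltWord(5, L"I.B.M", 5, ulOffset);
//      m_apWordSink->PutWord   (3, L"IBM",   5, ulOffset);
//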
  1935. void CTokenizer::OutputAbbreviation(CTokenState& State)
  1936. {
  1937. HRESULT hr;
  1938. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  1939. hr = m_apWordSink->PutAltWord(
  1940. State.m_ulEnd - State.m_ulStart - 1,
  1941. &State.m_pwcsToken[State.m_ulStart],
  1942. State.m_ulEnd - State.m_ulStart,
  1943. ulOffsetInTxtSourceBuffer);
  1944. if (FAILED(hr))
  1945. {
  1946. THROW_HRESULT_EXCEPTION(hr);
  1947. }
  1948. hr = m_apWordSink->PutWord(
  1949. State.m_ulEnd - State.m_ulStart,
  1950. &State.m_pwcsToken[State.m_ulStart],
  1951. State.m_ulEnd - State.m_ulStart,
  1952. ulOffsetInTxtSourceBuffer);
  1953. if (FAILED(hr))
  1954. {
  1955. THROW_HRESULT_EXCEPTION(hr);
  1956. }
  1957. }
  1958. void CTokenizer::OutputSpecialAbbreviation(
  1959. CTokenState& State,
  1960. CAbbTerm* pTerm,
  1961. const CCliticsTerm* pCliticsTerm)
  1962. {
  1963. HRESULT hr;
  1964. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  1965. WCHAR* pwcsAbb = pTerm->pwcsAbb;
  1966. ULONG ulLen = pTerm->ulAbbLen;
  1967. if (pTerm->pwcsCanonicalForm)
  1968. {
  1969. pwcsAbb = pTerm->pwcsCanonicalForm;
  1970. ulLen = pTerm->ulCanLen;
  1971. }
  1972. if (TAIL_MATCH_TRUNCATE == pCliticsTerm->ulOp)
  1973. {
  1974. WCHAR pwcs[TOKENIZER_MAXBUFFERLIMIT];
  1975. wcscpy(pwcs, pwcsAbb);
  1976. wcscpy(pwcs + ulLen, pCliticsTerm->pwcs);
  1977. hr = m_apWordSink->PutAltWord(
  1978. ulLen + pCliticsTerm->ulLen,
  1979. pwcs,
  1980. State.m_ulEnd - State.m_ulStart,
  1981. ulOffsetInTxtSourceBuffer);
  1982. if (FAILED(hr))
  1983. {
  1984. THROW_HRESULT_EXCEPTION(hr);
  1985. }
  1986. }
  1987. hr = m_apWordSink->PutWord(
  1988. ulLen,
  1989. pwcsAbb,
  1990. State.m_ulEnd - State.m_ulStart,
  1991. ulOffsetInTxtSourceBuffer);
  1992. if (FAILED(hr))
  1993. {
  1994. THROW_HRESULT_EXCEPTION(hr);
  1995. }
  1996. }
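//
// Illustration (hypothetical table entry, shown only for the shape of the
// calls): an abbreviation entry mapping L"corp." to the canonical form
// L"corporation" would emit PutWord(11, L"corporation", ...), and if a tail
// clitic such as L"'s" was stripped from the token, the alternate
// L"corporation's" is rebuilt in the local buffer and emitted first via
// PutAltWord.
//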
  1997. void CTokenizer::OutputHyphenation(CTokenState& State, const CCliticsTerm* pCliticsTerm)
  1998. {
  1999. //
2000. // Input: Data-Base   Output: Data Base, DataBase (concatenation only at query time)
  2001. //
  2002. HRESULT hr;
  2003. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  2004. ULONG ulAddToStart = 0;
  2005. ULONG ulDecFromEnd = 0;
  2006. if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
  2007. {
  2008. ulAddToStart = pCliticsTerm->ulLen;
  2009. }
  2010. else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  2011. {
  2012. ulDecFromEnd = pCliticsTerm->ulLen;
  2013. }
  2014. ULONG ulCur = State.m_ulStart + ulAddToStart;
  2015. ULONG ulStart = ulCur;
  2016. ULONG ulRelPosInTxtSrcBuff = ulOffsetInTxtSourceBuffer;
  2017. if (m_bQueryTime)
  2018. {
  2019. ULONG ulNext = ulCur;
  2020. hr = m_apWordSink->StartAltPhrase();
  2021. if (FAILED(hr))
  2022. {
  2023. THROW_HRESULT_EXCEPTION(hr);
  2024. }
  2025. ULONG ulAdd = ulAddToStart;
  2026. while (ulCur < State.m_ulEnd)
  2027. {
  2028. if ( HAS_PROP_DASH(GET_PROP(m_pCurToken->m_State.m_pwcsToken[ulCur])))
  2029. {
  2030. hr = m_apWordSink->PutWord(
  2031. ulNext - ulStart,
  2032. &State.m_pwcsToken[ulStart],
  2033. ulNext - ulStart + ulAdd,
  2034. ulRelPosInTxtSrcBuff);
  2035. if (FAILED(hr))
  2036. {
  2037. THROW_HRESULT_EXCEPTION(hr);
  2038. }
  2039. ulRelPosInTxtSrcBuff += ulNext - ulStart + 1 + ulAdd;
  2040. ulStart = ulNext;
  2041. ulCur++;
  2042. ulAdd = 0;
  2043. continue;
  2044. }
  2045. State.m_pwcsToken[ulNext] = State.m_pwcsToken[ulCur];
  2046. ulNext++;
  2047. ulCur++;
  2048. }
  2049. Assert(ulCur > ulStart);
  2050. if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  2051. {
  2052. hr = m_apWordSink->PutAltWord(
  2053. ulNext - ulStart,
  2054. &State.m_pwcsToken[ulStart],
  2055. ulNext - ulStart,
  2056. ulRelPosInTxtSrcBuff);
  2057. if (FAILED(hr))
  2058. {
  2059. THROW_HRESULT_EXCEPTION(hr);
  2060. }
  2061. }
  2062. hr = m_apWordSink->PutWord(
  2063. ulNext - ulStart - ulDecFromEnd,
  2064. &State.m_pwcsToken[ulStart],
  2065. ulNext - ulStart,
  2066. ulRelPosInTxtSrcBuff);
  2067. if (FAILED(hr))
  2068. {
  2069. THROW_HRESULT_EXCEPTION(hr);
  2070. }
  2071. hr = m_apWordSink->StartAltPhrase();
  2072. if (FAILED(hr))
  2073. {
  2074. THROW_HRESULT_EXCEPTION(hr);
  2075. }
  2076. if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  2077. {
  2078. hr = m_apWordSink->PutAltWord(
  2079. ulNext - State.m_ulStart,
  2080. &State.m_pwcsToken[State.m_ulStart],
  2081. State.m_ulEnd - State.m_ulStart - ulAddToStart,
  2082. ulOffsetInTxtSourceBuffer);
  2083. if (FAILED(hr))
  2084. {
  2085. THROW_HRESULT_EXCEPTION(hr);
  2086. }
  2087. }
  2088. hr = m_apWordSink->PutWord(
  2089. ulNext - State.m_ulStart - ulDecFromEnd - ulAddToStart,
  2090. State.m_pwcsToken + State.m_ulStart + ulAddToStart,
  2091. State.m_ulEnd - State.m_ulStart + ulAddToStart,
  2092. ulOffsetInTxtSourceBuffer);
  2093. if (FAILED(hr))
  2094. {
  2095. THROW_HRESULT_EXCEPTION(hr);
  2096. }
  2097. hr = m_apWordSink->EndAltPhrase();
  2098. if (FAILED(hr))
  2099. {
  2100. THROW_HRESULT_EXCEPTION(hr);
  2101. }
  2102. }
  2103. else
  2104. {
  2105. ULONG ulAdd = ulAddToStart;
  2106. while (ulCur < State.m_ulEnd)
  2107. {
  2108. if (HAS_PROP_DASH(GET_PROP(m_pCurToken->m_State.m_pwcsToken[ulCur])))
  2109. {
  2110. hr = m_apWordSink->PutWord(
  2111. ulCur - ulStart,
  2112. &State.m_pwcsToken[ulStart],
  2113. ulCur - ulStart + ulAdd,
  2114. ulRelPosInTxtSrcBuff);
  2115. if (FAILED(hr))
  2116. {
  2117. THROW_HRESULT_EXCEPTION(hr);
  2118. }
  2119. ulRelPosInTxtSrcBuff += ulCur - ulStart + 1 + ulAdd;
  2120. ulStart = ulCur + 1;
  2121. ulAdd = 0;
  2122. }
  2123. ulCur++;
  2124. }
  2125. Assert(ulCur > ulStart);
  2126. if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  2127. {
  2128. hr = m_apWordSink->PutAltWord(
  2129. ulCur - ulStart,
  2130. &State.m_pwcsToken[ulStart],
  2131. ulCur - ulStart,
  2132. ulRelPosInTxtSrcBuff);
  2133. if (FAILED(hr))
  2134. {
  2135. THROW_HRESULT_EXCEPTION(hr);
  2136. }
  2137. }
  2138. hr = m_apWordSink->PutWord(
  2139. ulCur - ulStart - ulDecFromEnd,
  2140. &State.m_pwcsToken[ulStart],
  2141. ulCur - ulStart,
  2142. ulRelPosInTxtSrcBuff);
  2143. if (FAILED(hr))
  2144. {
  2145. THROW_HRESULT_EXCEPTION(hr);
  2146. }
  2147. }
  2148. }
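//
// Illustration (hedged example): at query time L"Data-Base" yields two
// alternate phrasings, the hyphen-separated parts and the concatenation,
// roughly:
//
//      m_apWordSink->StartAltPhrase();
//      m_apWordSink->PutWord(4, L"Data", 4, ulOffset);
//      m_apWordSink->PutWord(4, L"Base", 4, ulOffset + 5);
//      m_apWordSink->StartAltPhrase();
//      m_apWordSink->PutWord(8, L"DataBase", 9, ulOffset);
//      m_apWordSink->EndAltPhrase();
//
// At index time only the individual parts are emitted.
//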
  2149. void CTokenizer::OutputTime(WCHAR* pwcsTime, CTokenState& State)
  2150. {
  2151. HRESULT hr;
  2152. //
  2153. // Output: TT1353
  2154. //
  2155. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  2156. hr = m_apWordSink->PutAltWord(
  2157. State.m_ulEnd - State.m_ulStart,
  2158. &State.m_pwcsToken[State.m_ulStart],
  2159. State.m_ulEnd - State.m_ulStart,
  2160. ulOffsetInTxtSourceBuffer);
  2161. if (FAILED(hr))
  2162. {
  2163. THROW_HRESULT_EXCEPTION(hr);
  2164. }
  2165. hr = m_apWordSink->PutWord(
  2166. 6,
  2167. pwcsTime,
  2168. State.m_ulEnd - State.m_ulStart,
  2169. ulOffsetInTxtSourceBuffer);
  2170. if (FAILED(hr))
  2171. {
  2172. THROW_HRESULT_EXCEPTION(hr);
  2173. }
  2174. }
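//
// Illustration (hedged example): pwcsTime is built by the caller in the
// six-character canonical form shown above (L"TT" followed by the 24-hour
// time), e.g. 1:53 PM becomes L"TT1353", while the original token text is
// kept as an alternate word.  A hypothetical way to format such a string:
//
//      WCHAR wcsTime[7];
//      swprintf(wcsTime, L"TT%02d%02d", 13, 53);   // -> L"TT1353"
//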
  2175. void CTokenizer::OutputDate(
  2176. WCHAR* pwcsDate1,
  2177. WCHAR* pwcsDate2,
  2178. CTokenState& State)
  2179. {
  2180. HRESULT hr;
  2181. //
  2182. // Output: DD19990921
  2183. //
  2184. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  2185. hr = m_apWordSink->PutAltWord(
  2186. State.m_ulEnd - State.m_ulStart,
  2187. &State.m_pwcsToken[State.m_ulStart],
  2188. State.m_ulEnd - State.m_ulStart,
  2189. ulOffsetInTxtSourceBuffer);
  2190. if (FAILED(hr))
  2191. {
  2192. THROW_HRESULT_EXCEPTION(hr);
  2193. }
  2194. if (pwcsDate2)
  2195. {
  2196. hr = m_apWordSink->PutAltWord(
  2197. 10,
  2198. pwcsDate2,
  2199. State.m_ulEnd - State.m_ulStart,
  2200. ulOffsetInTxtSourceBuffer);
  2201. if (FAILED(hr))
  2202. {
  2203. THROW_HRESULT_EXCEPTION(hr);
  2204. }
  2205. }
  2206. hr = m_apWordSink->PutWord(
  2207. 10,
  2208. pwcsDate1,
  2209. State.m_ulEnd - State.m_ulStart,
  2210. ulOffsetInTxtSourceBuffer);
  2211. if (FAILED(hr))
  2212. {
  2213. THROW_HRESULT_EXCEPTION(hr);
  2214. }
  2215. }
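//
// Illustration (hedged example): pwcsDate1 carries the ten-character
// canonical form shown above (L"DD" followed by yyyymmdd), e.g. September 21,
// 1999 -> L"DD19990921"; pwcsDate2, when supplied (presumably the other
// reading of an ambiguous day/month order), is indexed as an alternate word.
// A hypothetical way to format such a string:
//
//      WCHAR wcsDate[11];
//      swprintf(wcsDate, L"DD%04d%02d%02d", 1999, 9, 21);   // -> L"DD19990921"
//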
  2216. void CTokenizer::OutputSimpleToken(CTokenState& State, const CCliticsTerm* pTerm)
  2217. {
  2218. HRESULT hr;
  2219. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  2220. if ((TAIL_MATCH_TRUNCATE == pTerm->ulOp) ||
  2221. (HEAD_MATCH_TRUNCATE == pTerm->ulOp))
  2222. {
  2223. if (0 == ( State.m_ulEnd - State.m_ulStart - pTerm->ulLen ))
  2224. {
  2225. return;
  2226. }
  2227. hr = m_apWordSink->PutAltWord(
  2228. State.m_ulEnd - State.m_ulStart,
  2229. &State.m_pwcsToken[State.m_ulStart],
  2230. State.m_ulEnd - State.m_ulStart,
  2231. ulOffsetInTxtSourceBuffer);
  2232. if (FAILED(hr))
  2233. {
  2234. THROW_HRESULT_EXCEPTION(hr);
  2235. }
  2236. if (pTerm->ulOp == TAIL_MATCH_TRUNCATE)
  2237. {
  2238. hr = m_apWordSink->PutWord(
  2239. State.m_ulEnd - State.m_ulStart - pTerm->ulLen,
  2240. &State.m_pwcsToken[State.m_ulStart],
  2241. State.m_ulEnd - State.m_ulStart,
  2242. ulOffsetInTxtSourceBuffer);
  2243. if (FAILED(hr))
  2244. {
  2245. THROW_HRESULT_EXCEPTION(hr);
  2246. }
  2247. }
  2248. else
  2249. {
  2250. Assert(pTerm->ulOp == HEAD_MATCH_TRUNCATE);
  2251. hr = m_apWordSink->PutWord(
  2252. State.m_ulEnd - State.m_ulStart - pTerm->ulLen,
  2253. &State.m_pwcsToken[State.m_ulStart + pTerm->ulLen],
  2254. State.m_ulEnd - State.m_ulStart,
  2255. ulOffsetInTxtSourceBuffer);
  2256. if (FAILED(hr))
  2257. {
  2258. THROW_HRESULT_EXCEPTION(hr);
  2259. }
  2260. }
  2261. return;
  2262. }
  2263. hr = m_apWordSink->PutWord(
  2264. State.m_ulEnd - State.m_ulStart,
  2265. &State.m_pwcsToken[State.m_ulStart],
  2266. State.m_ulEnd - State.m_ulStart,
  2267. m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State));
  2268. if (FAILED(hr))
  2269. {
  2270. THROW_HRESULT_EXCEPTION(hr);
  2271. }
  2272. }
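//
// Illustration (hedged example): a plain token goes out unchanged.  When a
// clitic matched, say a tail clitic L"'s" on L"John's", the full surface form
// is kept as an alternate and the truncated form is indexed:
//
//      m_apWordSink->PutAltWord(6, L"John's", 6, ulOffset);
//      m_apWordSink->PutWord   (4, L"John",   6, ulOffset);
//
// The early return above covers the degenerate case where the token consists
// of nothing but the clitic itself.
//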
  2273. void CTokenizer::OutputCurrency(
  2274. ULONG ulLen,
  2275. WCHAR* pwcsCurrency,
  2276. CTokenState& State,
  2277. const CCliticsTerm* pTerm)
  2278. {
  2279. HRESULT hr;
  2280. //
  2281. // Output: CC12.22$
  2282. //
  2283. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  2284. if (ulLen > m_ulMaxTokenSize)
  2285. {
  2286. hr = m_apWordSink->PutWord(
  2287. State.m_ulEnd - State.m_ulStart,
  2288. &State.m_pwcsToken[State.m_ulStart],
  2289. State.m_ulEnd - State.m_ulStart,
  2290. ulOffsetInTxtSourceBuffer);
  2291. if (FAILED(hr))
  2292. {
  2293. THROW_HRESULT_EXCEPTION(hr);
  2294. }
  2295. return;
  2296. }
  2297. hr = m_apWordSink->PutAltWord(
  2298. State.m_ulEnd - State.m_ulStart,
  2299. &State.m_pwcsToken[State.m_ulStart],
  2300. State.m_ulEnd - State.m_ulStart,
  2301. ulOffsetInTxtSourceBuffer);
  2302. if (FAILED(hr))
  2303. {
  2304. THROW_HRESULT_EXCEPTION(hr);
  2305. }
  2306. if (pTerm->ulOp == TAIL_MATCH_TRUNCATE)
  2307. {
  2308. hr = m_apWordSink->PutAltWord(
  2309. State.m_ulEnd - State.m_ulStart - pTerm->ulLen,
  2310. &State.m_pwcsToken[State.m_ulStart],
  2311. State.m_ulEnd - State.m_ulStart,
  2312. ulOffsetInTxtSourceBuffer);
  2313. if (FAILED(hr))
  2314. {
  2315. THROW_HRESULT_EXCEPTION(hr);
  2316. }
  2317. }
  2318. else if (pTerm->ulOp == HEAD_MATCH_TRUNCATE)
  2319. {
  2320. hr = m_apWordSink->PutAltWord(
  2321. State.m_ulEnd - State.m_ulStart - pTerm->ulLen,
  2322. &State.m_pwcsToken[State.m_ulStart + pTerm->ulLen],
  2323. State.m_ulEnd - State.m_ulStart,
  2324. ulOffsetInTxtSourceBuffer);
  2325. if (FAILED(hr))
  2326. {
  2327. THROW_HRESULT_EXCEPTION(hr);
  2328. }
  2329. }
  2330. hr = m_apWordSink->PutWord(
  2331. ulLen,
  2332. pwcsCurrency,
  2333. State.m_ulEnd - State.m_ulStart,
  2334. ulOffsetInTxtSourceBuffer);
  2335. if (FAILED(hr))
  2336. {
  2337. THROW_HRESULT_EXCEPTION(hr);
  2338. }
  2339. }
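//
// Illustration (hedged example): pwcsCurrency carries the canonical L"CC..."
// form shown above; assuming a source token written as L"$12.22", the calls
// reduce to roughly:
//
//      m_apWordSink->PutAltWord(6, L"$12.22",   6, ulOffset);
//      m_apWordSink->PutWord   (8, L"CC12.22$", 6, ulOffset);
//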
  2340. void CTokenizer::OutputCommersialSignToken(
  2341. CTokenState& State)
  2342. {
  2343. HRESULT hr;
  2344. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  2345. hr = m_apWordSink->PutAltWord(
  2346. State.m_ulEnd - State.m_ulStart - 1,
  2347. State.m_pwcsToken + State.m_ulStart,
  2348. State.m_ulEnd - State.m_ulStart,
  2349. ulOffsetInTxtSourceBuffer);
  2350. if (FAILED(hr))
  2351. {
  2352. THROW_HRESULT_EXCEPTION(hr);
  2353. }
  2354. hr = m_apWordSink->PutWord(
  2355. State.m_ulEnd - State.m_ulStart,
  2356. State.m_pwcsToken + State.m_ulStart,
  2357. State.m_ulEnd - State.m_ulStart,
  2358. ulOffsetInTxtSourceBuffer);
  2359. if (FAILED(hr))
  2360. {
  2361. THROW_HRESULT_EXCEPTION(hr);
  2362. }
  2363. }
  2364. void CTokenizer::OutputMisc(
  2365. CTokenState& State,
  2366. bool bPatternContainOnlyUpperCase,
  2367. ULONG ulSuffixSize,
  2368. const CCliticsTerm* pCliticsTerm)
  2369. {
  2370. HRESULT hr;
  2371. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  2372. ULONG ulAddToStart = 0;
  2373. ULONG ulDecFromEnd = 0;
  2374. if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
  2375. {
  2376. hr = m_apWordSink->PutAltWord(
  2377. State.m_ulEnd - State.m_ulStart - pCliticsTerm->ulLen,
  2378. State.m_pwcsToken + State.m_ulStart + pCliticsTerm->ulLen,
  2379. State.m_ulEnd - State.m_ulStart,
  2380. ulOffsetInTxtSourceBuffer);
  2381. if (FAILED(hr))
  2382. {
  2383. THROW_HRESULT_EXCEPTION(hr);
  2384. }
  2385. ulAddToStart = pCliticsTerm->ulLen;
  2386. }
  2387. else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  2388. {
  2389. hr = m_apWordSink->PutAltWord(
  2390. State.m_ulEnd - State.m_ulStart - pCliticsTerm->ulLen,
  2391. State.m_pwcsToken + State.m_ulStart,
  2392. State.m_ulEnd - State.m_ulStart,
  2393. ulOffsetInTxtSourceBuffer);
  2394. if (FAILED(hr))
  2395. {
  2396. THROW_HRESULT_EXCEPTION(hr);
  2397. }
  2398. ulDecFromEnd = pCliticsTerm->ulLen;
  2399. }
  2400. if (!bPatternContainOnlyUpperCase)
  2401. {
  2402. hr = m_apWordSink->PutAltWord(
  2403. State.m_ulEnd - State.m_ulStart - ulAddToStart - ulDecFromEnd - ulSuffixSize,
  2404. State.m_pwcsToken + State.m_ulStart + ulAddToStart,
  2405. State.m_ulEnd - State.m_ulStart,
  2406. ulOffsetInTxtSourceBuffer);
  2407. if (FAILED(hr))
  2408. {
  2409. THROW_HRESULT_EXCEPTION(hr);
  2410. }
  2411. }
  2412. hr = m_apWordSink->PutWord(
  2413. State.m_ulEnd - State.m_ulStart,
  2414. &State.m_pwcsToken[State.m_ulStart],
  2415. State.m_ulEnd - State.m_ulStart,
  2416. ulOffsetInTxtSourceBuffer);
  2417. if (FAILED(hr))
  2418. {
  2419. THROW_HRESULT_EXCEPTION(hr);
  2420. }
  2421. }
  2422. #define NUMBER_NO_ERROR 0
  2423. #define NUMBER_SEPERATOR_ERROR 1
  2424. #define NUMBER_ERROR 2
  2425. bool CTokenizer::CheckAndCreateNumber(
  2426. WCHAR* pwcsStr,
  2427. ULONG ulLen,
  2428. WCHAR* pwcsOut,
2429. ULONG* pulOffsetToTxt, // the actual output does not always start at the beginning of the buffer
  2430. ULONG* pulOutLen)
  2431. {
  2432. int iRet;
  2433. iRet = CheckAndCreateNumber(
  2434. pwcsStr,
  2435. ulLen,
  2436. m_apLangSupport->GetDecimalSeperator(),
  2437. m_apLangSupport->GetThousandSeperator(),
  2438. pwcsOut,
  2439. pulOffsetToTxt,
  2440. pulOutLen);
  2441. if (NUMBER_NO_ERROR == iRet)
  2442. {
  2443. return true;
  2444. }
  2445. else if (NUMBER_ERROR == iRet)
  2446. {
  2447. return false;
  2448. }
  2449. iRet = CheckAndCreateNumber(
  2450. pwcsStr,
  2451. ulLen,
  2452. L'.', // default value
2453. 0xFFFF, // no thousand separator
  2454. pwcsOut,
  2455. pulOffsetToTxt,
  2456. pulOutLen);
  2457. if (NUMBER_NO_ERROR == iRet)
  2458. {
  2459. return true;
  2460. }
  2461. return false;
  2462. }
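//
// Illustration (hedged example): the wrapper above first tries the locale's
// own separators and, only when that fails purely because of the separators,
// retries with L'.' as the decimal point and no thousand separator.  Assuming
// a locale that uses ',' for decimals and '.' for thousands, both L"1,5" and
// L"1.5" therefore still normalize to L"NN1D5".
//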
  2463. //
  2464. // return value:
  2465. // NUMBER_NO_ERROR - success
2466. // NUMBER_SEPERATOR_ERROR - error due to separators
2467. // NUMBER_ERROR - error because the string is not a number.
  2468. //
  2469. int CTokenizer::CheckAndCreateNumber(
  2470. WCHAR* pwcsStr,
  2471. ULONG ulLen,
  2472. WCHAR wchSDecimal,
  2473. WCHAR wchSThousand,
  2474. WCHAR* pwcsOut,
2475. ULONG* pulOffsetToTxt, // the actual output does not always start at the beginning of the buffer
  2476. ULONG* pulOutLen)
  2477. {
  2478. Assert(ulLen > 0);
  2479. //
2480. // Assumes that the output buffer is big enough.
2481. // Looks for the following formats: 1111   1111.2222   1,111,111.222
  2482. //
  2483. ULONG ulCur = ulLen - 1;
  2484. ULONG ulNumCharsBeforDigitSeperator = 0;
2485. ULONG ulNextChar = ulLen - 1 + 3; // +3 is for the NN at the beginning of the formatted token,
2486. // plus an additional leading 0 in case the number starts with '.' (e.g. .50)
  2487. bool fHasFraction = false;
  2488. while ((((int)(ulCur)) >= 0) &&
  2489. HAS_PROP_NUMBER(GET_PROP(pwcsStr[ulCur])))
  2490. {
  2491. pwcsOut[ulNextChar] = pwcsStr[ulCur];
  2492. ulCur--;
  2493. ulNextChar--;
  2494. ulNumCharsBeforDigitSeperator++;
  2495. }
  2496. if (ulCur == ulLen - 1)
  2497. {
  2498. //
  2499. // did not read any digits.
  2500. //
  2501. return NUMBER_ERROR;
  2502. }
  2503. if ((((int)ulCur) >= 0) && (pwcsStr[ulCur] == wchSDecimal))
  2504. {
  2505. fHasFraction = true;
  2506. pwcsOut[ulNextChar] = L'D';
  2507. ulCur--;
  2508. ulNextChar--;
  2509. ulNumCharsBeforDigitSeperator = 0;
  2510. }
  2511. ULONG ulNumOfThousandSeperator = 0;
  2512. while (((int)ulCur) >= 0)
  2513. {
  2514. if (pwcsStr[ulCur] == wchSThousand)
  2515. {
  2516. if (3 != ulNumCharsBeforDigitSeperator)
  2517. {
  2518. return NUMBER_SEPERATOR_ERROR;
  2519. }
  2520. ulNumCharsBeforDigitSeperator = 0;
  2521. ulNumOfThousandSeperator++;
  2522. }
  2523. else if(HAS_PROP_NUMBER(GET_PROP(pwcsStr[ulCur])))
  2524. {
  2525. pwcsOut[ulNextChar] = pwcsStr[ulCur];
  2526. ulNumCharsBeforDigitSeperator++;
  2527. ulNextChar--;
  2528. }
  2529. else
  2530. {
  2531. if (TEST_PROP(
  2532. GET_PROP(pwcsStr[ulCur]), PROP_DEFAULT_BREAKER))
  2533. {
  2534. return NUMBER_SEPERATOR_ERROR;
  2535. }
  2536. return NUMBER_ERROR;
  2537. }
  2538. ulCur--;
  2539. }
  2540. *pulOutLen = ulLen;
  2541. if (L'D' == pwcsOut[ulNextChar+1])
  2542. {
  2543. Assert(ulNextChar >= 2);
  2544. //
  2545. // the number has the following format .50
  2546. //
  2547. pwcsOut[ulNextChar] = L'0';
  2548. ulNextChar--;
  2549. *pulOutLen += 1;
  2550. }
  2551. Assert(ulNextChar >= 1);
  2552. pwcsOut[ulLen + 3] = L'\0';
  2553. pwcsOut[ulNextChar] = L'N';
  2554. pwcsOut[ulNextChar - 1] = L'N';
2555. *pulOutLen = *pulOutLen + 2 - ulNumOfThousandSeperator; // don't use += because 2 - ulNextChar + 1
2556. // can be negative and since it is ULONG we
2557. // can get the wrong result.
2558. *pulOffsetToTxt = ulNextChar - 1;
  2559. if (fHasFraction)
  2560. {
  2561. while (HAS_PROP_NUMBER(GET_PROP(pwcsOut[*pulOutLen + *pulOffsetToTxt - 1])) &&
  2562. (0 == ConvertCharToDigit(pwcsOut[*pulOutLen + *pulOffsetToTxt - 1])))
  2563. {
  2564. Assert(*pulOutLen > 3);
  2565. (*pulOutLen)--;
  2566. }
  2567. if (L'D' == pwcsOut[*pulOutLen + *pulOffsetToTxt - 1])
  2568. {
  2569. (*pulOutLen)--;
  2570. }
  2571. }
  2572. return NUMBER_NO_ERROR;
  2573. }
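//
// Worked example (illustrative only): for L"1,234.50" with ',' as the
// thousand separator and '.' as the decimal separator, the right-to-left scan
// above produces
//
//      NN1234D5
//
// i.e. the thousand separator disappears, the decimal separator becomes 'D',
// the trailing fractional zero is stripped, and *pulOutLen / *pulOffsetToTxt
// describe where that 8-character result sits inside pwcsOut.
//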
  2574. void CTokenizer::GetValuesFromDateString(
  2575. CDateTerm* pFormat,
  2576. WCHAR* pwcsDate,
2577. LONG* plD_M1, // we can't tell at this stage whether this is a day or a month.
  2578. LONG* plD_M2,
  2579. LONG* plYear)
  2580. {
  2581. BYTE i;
  2582. int iBase;
  2583. *plD_M1 = 0;
  2584. for ( i = pFormat->bD_M1Len, iBase = 1; i > 0; i--, iBase *= 10)
  2585. {
  2586. *plD_M1 += ConvertCharToDigit(pwcsDate[pFormat->bD_M1Offset + i - 1]) * iBase;
  2587. }
  2588. *plD_M2 = 0;
  2589. for ( i = pFormat->bD_M2Len, iBase = 1; i > 0; i--, iBase *= 10)
  2590. {
  2591. *plD_M2 += ConvertCharToDigit(pwcsDate[pFormat->bD_M2Offset + i - 1]) * iBase;
  2592. }
  2593. *plYear = 0;
  2594. for ( i = pFormat->bYearLen, iBase = 1; i > 0; i--, iBase *= 10)
  2595. {
  2596. *plYear += ConvertCharToDigit(pwcsDate[pFormat->bYearOffset + i - 1]) * iBase;
  2597. }
  2598. }
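//
// Worked example (illustrative only): each loop above rebuilds a fixed-width
// digit field least-significant digit first.  For a 4-character year field
// containing L"1999":
//
//      9*1 + 9*10 + 9*100 + 1*1000 = 1999
//
// The same pattern is used for the hour/minute/second fields below.
//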
  2599. void CTokenizer::GetValuesFromTimeString(
  2600. CTimeTerm* pFormat,
  2601. WCHAR* pwcsTime,
  2602. LONG* plHour,
  2603. LONG* plMin,
  2604. LONG* plSec,
  2605. TimeFormat* pAmPm)
  2606. {
  2607. BYTE i;
  2608. int iBase;
  2609. *plHour = 0;
  2610. for ( i = pFormat->bHourLen, iBase = 1; i > 0; i--, iBase *= 10)
  2611. {
  2612. *plHour += ConvertCharToDigit(pwcsTime[pFormat->bHourOffset + i - 1]) * iBase;
  2613. }
  2614. *plMin = 0;
  2615. for ( i = pFormat->bMinLen, iBase = 1; i > 0; i--, iBase *= 10)
  2616. {
  2617. *plMin += ConvertCharToDigit(pwcsTime[pFormat->bMinOffset + i - 1]) * iBase;
  2618. }
  2619. *plSec = 0;
  2620. for ( i = pFormat->bSecLen, iBase = 1; i > 0; i--, iBase *= 10)
  2621. {
  2622. *plSec += ConvertCharToDigit(pwcsTime[pFormat->bSecOffset + i - 1]) * iBase;
  2623. }
  2624. *pAmPm = pFormat->AmPm;
  2625. }
  2626. void CTokenizer::BreakCompundString(CTokenState& State, CPropFlag& propBreaker)
  2627. {
  2628. //
2629. // There are still punctuation marks inside the token;
2630. // we break it up and resubmit the pieces.
  2631. //
  2632. ULONG ulStart = State.m_ulStart;
  2633. ULONG ulCur = ulStart;
  2634. while (ulCur < State.m_ulEnd)
  2635. {
  2636. if ( TEST_PROP1(GET_PROP(State.m_pwcsToken[ulCur]), propBreaker))
  2637. {
  2638. if (ulCur - ulStart == 0)
  2639. {
  2640. //
  2641. // only punctuation
  2642. //
  2643. ulCur++;
  2644. ulStart = ulCur;
  2645. continue;
  2646. }
  2647. m_pCurToken->m_State.m_ulStart = 0;
  2648. m_pCurToken->m_State.m_ulEnd = ulCur - ulStart;
  2649. m_pCurToken->m_State.m_pwcsToken = State.m_pwcsToken + ulStart;
  2650. m_pCurToken->ComputeStateProperties(m_pCurToken->m_State);
  2651. //
2652. // we just created a sub-token; need to process it
  2653. //
  2654. ProcessTokenInternal();
  2655. ulStart = ulCur + 1;
  2656. }
  2657. ulCur++;
  2658. }
  2659. if (ulStart < ulCur)
  2660. {
  2661. //
  2662. // last sub token
  2663. //
  2664. m_pCurToken->m_State.m_ulStart = 0;
  2665. m_pCurToken->m_State.m_ulEnd = ulCur - ulStart;
  2666. m_pCurToken->m_State.m_pwcsToken = State.m_pwcsToken + ulStart;
  2667. m_pCurToken->ComputeStateProperties(m_pCurToken->m_State);
  2668. //
2669. // we just created a sub-token; need to process it
  2670. //
  2671. ProcessTokenInternal();
  2672. }
  2673. return;
  2674. }
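//
// Illustration (hedged example): BreakCompundString re-runs the breaker on
// each piece of a token that still contains breaking punctuation.  Assuming
// '/' carries the breaker property, L"TCP/IP" is split into the sub-tokens
// L"TCP" and L"IP", each of which is handed to ProcessTokenInternal on its
// own; runs consisting of punctuation only are skipped.
//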