Leaked source code of Windows Server 2003


  1. ////////////////////////////////////////////////////////////////////////////////
  2. //
  3. // Filename : Tokenizer.cpp
  4. // Purpose : Tokenizer implementation
  5. //
  6. // Project : WordBreakers
  7. // Component: English word breaker
  8. //
  9. // Author : yairh
  10. //
  11. // Log:
  12. //
  13. // Jan 06 2000 yairh creation
  14. // Apr 04 2000 dovh on behalf of dlee - Fix CTokenizer::OutputClitics
  15. // to avoid PutWord of length 0 (leads to multiple PutWord at
  16. // same location (duplicate keys), and index corruption!
  17. // Example: :...'s :...'s (. stands for junk character)
  18. // Apr 05 2000 dovh - Fixed two problematic debug / tracer buffer size
  19. // problems. (Related to Bug 15449).
  20. // May 07 2000 dovh - USE_WS_SENTINEL algorithm in BreakText
  21. // May 11 2000 dovh - Simplify VerifyMisc test.
  22. // Nov 11 2000 dovh - Special underscore treatment
  23. // Add AddBackUnderscores '_' + alphanumeric treatment.
  24. //
  25. ////////////////////////////////////////////////////////////////////////////////
  26. #include "base.h"
  27. #include "Tokenizer.h"
  28. #include "PropArray.h"
  29. #include "excption.h"
  30. #include "formats.h"
  31. DECLARE_TRIE_SENTINEL;
  32. CWbToUpper g_WbToUpper;
  33. CAutoClassPointer<CPropArray> g_pPropArray;
  34. CTokenizer::CTokenizer(
  35. TEXT_SOURCE* pTxtSource,
  36. IWordSink * pWordSink,
  37. IPhraseSink * pPhraseSink,
  38. LCID lcid,
  39. BOOL bQueryTime,
  40. ULONG ulMaxTokenSize) :
  41. m_pTxtSource(pTxtSource),
  42. m_apWordSink(pWordSink),
  43. m_apPhraseSink(pPhraseSink),
  44. m_Lcid(lcid),
  45. m_bQueryTime(bQueryTime),
  46. m_bNoMoreTxt(false),
  47. m_Token(ulMaxTokenSize),
  48. m_bWhiteSpaceGuarranteed(false)
  49. {
  50. m_ulMaxTokenSize = min(ulMaxTokenSize, TOKENIZER_MAXBUFFERLIMIT);
  51. m_apLangSupport = new CLangSupport(lcid);
  52. m_pCurToken = &m_Token;
  53. if (pTxtSource->iEnd > pTxtSource->iCur)
  54. {
  55. CalculateUpdateEndOfBuffer();
  56. }
  57. else
  58. {
  59. m_ulUpdatedEndOfBuffer = pTxtSource->iEnd;
  60. }
  61. }
  62. void CTokenizer::BreakText()
  63. {
  64. Trace(
  65. elVerbose,
  66. s_tagTokenizer,
  67. ("CTokenizer::BreakText()"));
  68. WCHAR wch;
  69. ULONGLONG ullflags(PROP_DEFAULT);
  70. //
  71. // USE_WS_SENTINEL Algorithm:
  72. //
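// Fast-path note: m_bWhiteSpaceGuarranteed means a whitespace sentinel is known
// to occur no later than m_ulUpdatedEndOfBuffer, so the inner loop below re-checks
// the buffer boundary only after consuming whitespace, not on every character.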
  73. HRESULT hr = S_OK;
  74. if (m_pTxtSource->iCur >= m_ulUpdatedEndOfBuffer)
  75. {
  76. hr = FillBuffer();
  77. }
  78. while ( SUCCEEDED(hr) )
  79. {
  80. if ( m_bWhiteSpaceGuarranteed )
  81. {
  82. while (true)
  83. {
  84. wch = m_pTxtSource->awcBuffer[m_pTxtSource->iCur];
  85. ullflags = (GET_PROP(wch).m_ulFlag);
  86. if (ullflags & PROP_WS)
  87. {
  88. if (m_pCurToken->IsNotEmpty())
  89. {
  90. ProcessToken();
  91. }
  92. m_pTxtSource->iCur++;
  93. if (m_pTxtSource->iCur >= m_ulUpdatedEndOfBuffer)
  94. {
  95. hr = FillBuffer();
  96. break;
  97. }
  98. continue;
  99. }
  100. //
  101. // The following lines are inline expansion of what
  102. // used to be CToken::RecordChar:
  103. //
  104. Assert(m_pCurToken->m_ulBufPos < m_ulMaxTokenSize);
  105. m_pCurToken->m_awchBuf[m_pCurToken->m_ulBufPos] = wch;
  106. m_pCurToken->m_ulBufPos++;
  107. m_pCurToken->m_State.m_Properties.m_ulFlag |= ullflags;
  108. m_pTxtSource->iCur++;
  109. } // while
  110. }
  111. else
  112. {
  113. while (true)
  114. {
  115. if (m_pTxtSource->iCur >= m_ulUpdatedEndOfBuffer)
  116. {
  117. Assert(m_pTxtSource->iCur == m_ulUpdatedEndOfBuffer);
  118. //
  119. // before we switch between buffers, if the current token is not empty we
  120. // need to process it. m_ulUpdatedEndOfBuffer always points to a breaker character
  121. // (usually it is a WS), thus no token can start in one buffer and end in the
  122. // following buffer.
  123. //
  124. if (m_pCurToken->IsNotEmpty())
  125. {
  126. ProcessToken();
  127. }
  128. hr = FillBuffer();
  129. if (FAILED(hr))
  130. {
  131. break;
  132. }
  133. }
  134. wch = m_pTxtSource->awcBuffer[m_pTxtSource->iCur];
  135. ULONGLONG ullflags(GET_PROP(wch).m_ulFlag);
  136. if (ullflags & PROP_WS)
  137. {
  138. if (m_pCurToken->IsNotEmpty())
  139. {
  140. ProcessToken();
  141. }
  142. m_pTxtSource->iCur++;
  143. continue;
  144. }
  145. //
  146. // the following lines are inline expansion of what used to be CToken::RecordChar.
  147. //
  148. Assert(m_pCurToken->m_ulBufPos < m_ulMaxTokenSize);
  149. m_pCurToken->m_awchBuf[m_pCurToken->m_ulBufPos] = wch;
  150. m_pCurToken->m_ulBufPos++;
  151. m_pCurToken->m_State.m_Properties.m_ulFlag |= ullflags;
  152. m_pTxtSource->iCur++;
  153. } // while
  154. } // if
  155. } // while ( SUCCEEDED(hr) )
  156. } // CTokenizer::BreakText
  157. void CTokenizer::ProcessToken()
  158. {
  159. ULONG ulOffset;
  160. if (m_pTxtSource->iCur < m_pCurToken->m_ulBufPos)
  161. {
  162. Trace(
  163. elWarning,
  164. s_tagTokenizer,
  165. ("CTokenizer::ProcessToken() wrong offset calculation"));
  166. //
  167. // BUGBUG need to understand why we got to this place.
  168. //
  169. Assert(0 && "Wrong offset calculation");
  170. ulOffset = m_pCurToken->m_ulBufPos + 1;
  171. }
  172. else if (m_pTxtSource->iCur == m_pCurToken->m_ulBufPos)
  173. {
  174. ulOffset = m_pCurToken->m_ulBufPos;
  175. }
  176. else
  177. {
  178. ulOffset = m_pTxtSource->iCur;
  179. }
  180. m_pCurToken->MarkEndToken(ulOffset);
  181. #ifdef DEBUG
  182. TraceToken();
  183. #endif
  184. //
  185. // simple token.
  186. //
  187. if (IS_PROP_SIMPLE(m_pCurToken->m_State.m_Properties))
  188. {
  189. OutputSimpleToken(
  190. m_pCurToken->m_State,
  191. &g_EmptyClitics);
  192. }
  193. else
  194. {
  195. ProcessTokenInternal();
  196. }
  197. if (m_pCurToken->m_fHasEos)
  198. {
  199. Trace(
  200. elVerbose,
  201. s_tagTokenizerDecision,
  202. ("EOS"));
  203. HRESULT hr;
  204. hr = m_apWordSink->PutBreak(WORDREP_BREAK_EOS);
  205. if (FAILED(hr))
  206. {
  207. THROW_HRESULT_EXCEPTION(hr);
  208. }
  209. }
  210. m_pCurToken->Clear();
  211. }
  212. void CTokenizer::ProcessTokenInternal()
  213. {
  214. do
  215. {
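// Each Verify* call below emits output and claims the token when its pattern
// matches; 'break' then leaves this do { } while (false) block, so ProcessDefault()
// at the bottom runs only when no specialized pattern matched.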
  216. //
  217. // url
  218. //
  219. if (HAS_PROP_SLASH(m_pCurToken->m_State.m_Properties) &&
  220. HAS_PROP_COLON(m_pCurToken->m_State.m_Properties) &&
  221. HAS_PROP_ALPHA(m_pCurToken->m_State.m_Properties))
  222. {
  223. Trace(
  224. elVerbose,
  225. s_tagTokenizerSuspect,
  226. ("%*.*S suspected to be <alpha>:// url",
  227. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  228. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  229. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  230. ));
  231. if (VerifyAlphaUrl())
  232. {
  233. break;
  234. }
  235. }
  236. if (HAS_PROP_PERIOD(m_pCurToken->m_State.m_Properties) &&
  237. HAS_PROP_W(m_pCurToken->m_State.m_Properties))
  238. {
  239. Trace(
  240. elVerbose,
  241. s_tagTokenizerSuspect,
  242. ("%*.*S suspected to be www. url",
  243. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  244. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  245. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  246. ));
  247. if (VerifyWwwUrl())
  248. {
  249. break;
  250. }
  251. }
  252. //
  253. // Acronym
  254. //
  255. if (HAS_PROP_PERIOD(m_pCurToken->m_State.m_Properties) &&
  256. HAS_PROP_UPPER_CASE(m_pCurToken->m_State.m_Properties))
  257. {
  258. if (!HAS_PROP_LOWER_CASE(m_pCurToken->m_State.m_Properties) ||
  259. HAS_PROP_APOSTROPHE(m_pCurToken->m_State.m_Properties))
  260. {
  261. Trace(
  262. elVerbose,
  263. s_tagTokenizerSuspect,
  264. ("%*.*S suspected to be an acronym",
  265. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  266. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  267. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  268. ));
  269. if (VerifyAcronym())
  270. {
  271. break;
  272. }
  273. }
  274. //
  275. // Abbreviation
  276. //
  277. Trace(
  278. elVerbose,
  279. s_tagTokenizerSuspect,
  280. ("%*.*S suspected to be an abbreviation",
  281. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  282. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  283. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  284. ));
  285. if (VerifyAbbreviation())
  286. {
  287. break;
  288. }
  289. Trace(
  290. elVerbose,
  291. s_tagTokenizerSuspect,
  292. ("%*.*S suspected to be a special abbreviation",
  293. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  294. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  295. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  296. ));
  297. if (VerifySpecialAbbreviation())
  298. {
  299. break;
  300. }
  301. }
  302. //
  303. // Hyphenation
  304. //
  305. if (HAS_PROP_DASH(m_pCurToken->m_State.m_Properties) &&
  306. HAS_PROP_ALPHA(m_pCurToken->m_State.m_Properties))
  307. {
  308. Trace(
  309. elVerbose,
  310. s_tagTokenizerSuspect,
  311. ("%*.*S suspected to have a hyphenation",
  312. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  313. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  314. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  315. ));
  316. if (VerifyHyphenation())
  317. {
  318. break;
  319. }
  320. }
  321. //
  322. // (s) parenthesis
  323. //
  324. if (HAS_PROP_LEFT_PAREN(m_pCurToken->m_State.m_Properties) &&
  325. HAS_PROP_RIGHT_PAREN(m_pCurToken->m_State.m_Properties) &&
  326. HAS_PROP_ALPHA(m_pCurToken->m_State.m_Properties))
  327. {
  328. Trace(
  329. elVerbose,
  330. s_tagTokenizerSuspect,
  331. ("%*.*S suspected to have a (s) Parenthesis",
  332. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  333. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  334. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  335. ));
  336. if (VerifyParens())
  337. {
  338. break;
  339. }
  340. }
  341. //
  342. // Currency
  343. //
  344. if (HAS_PROP_CURRENCY(m_pCurToken->m_State.m_Properties) &&
  345. HAS_PROP_NUMBER(m_pCurToken->m_State.m_Properties))
  346. {
  347. Trace(
  348. elVerbose,
  349. s_tagTokenizerSuspect,
  350. ("%*.*S suspected to be a currency",
  351. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  352. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  353. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  354. ));
  355. if (VerifyCurrency())
  356. {
  357. break;
  358. }
  359. }
  360. //
  361. // Numbers / time / dates
  362. //
  363. if (HAS_PROP_NUMBER(m_pCurToken->m_State.m_Properties))
  364. {
  365. Trace(
  366. elVerbose,
  367. s_tagTokenizerSuspect,
  368. ("%*.*S suspected to be a number or a time or a date",
  369. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  370. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  371. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  372. ));
  373. if (VerifyNumberOrTimeOrDate())
  374. {
  375. break;
  376. }
  377. }
  378. //
  379. // commercial signs
  380. //
  381. if (TEST_PROP(m_pCurToken->m_State.m_Properties, PROP_COMMERSIAL_SIGN) &&
  382. HAS_PROP_ALPHA(m_pCurToken->m_State.m_Properties))
  383. {
  384. Trace(
  385. elVerbose,
  386. s_tagTokenizerSuspect,
  387. ("%*.*S suspected to have a commesial sign",
  388. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  389. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  390. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  391. ));
  392. if (VerifyCommersialSign())
  393. {
  394. break;
  395. }
  396. }
  397. //
  398. // Misc - C++, J++, A+, A- .. C#
  399. //
  400. if ( TEST_PROP(m_pCurToken->m_State.m_Properties, (PROP_MINUS|PROP_PLUS|PROP_POUND)) &&
  401. HAS_PROP_ALPHA(m_pCurToken->m_State.m_Properties) )
  402. {
  403. Trace(
  404. elVerbose,
  405. s_tagTokenizerSuspect,
  406. ("%*.*S suspected to belong to the misc list",
  407. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  408. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  409. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  410. ));
  411. if (VerifyMisc())
  412. {
  413. break;
  414. }
  415. }
  416. //
  417. // default
  418. //
  419. ProcessDefault();
  420. } while (false);
  421. }
  422. #ifdef DEBUG
  423. void CTokenizer::TraceToken()
  424. {
  425. WCHAR buf[MAX_NUM_PROP+1];
  426. size_t bufLen = wcslen(TRACE_CHAR);
  427. Assert(bufLen < MAX_NUM_PROP + 1);
  428. buf[bufLen] = L'\0';
  429. for(int i=0; i<bufLen; i++)
  430. {
  431. if(TEST_PROP(m_pCurToken->m_State.m_Properties, (1<<i)))
  432. {
  433. buf[i] = TRACE_CHAR[i];
  434. }
  435. else
  436. {
  437. buf[i] = L'_';
  438. }
  439. }
  440. Trace(
  441. elVerbose,
  442. s_tagTokenizerTrace,
  443. ("[%S] - %*.*S",
  444. buf,
  445. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  446. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  447. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  448. ));
  449. }
  450. #endif // DEBUG
  451. bool CTokenizer::VerifyAlphaUrl()
  452. {
  453. //
  454. // looking for <alpha>:// pattern
  455. //
  456. CTokenState State(m_pCurToken->m_State);
  457. ULONG ul = State.m_ulStart;
  458. if (!HAS_PROP_ALPHA(GET_PROP(State.m_pwcsToken[ul])))
  459. {
  460. return false;
  461. }
  462. while (HAS_PROP_EXTENDED_ALPHA(GET_PROP(State.m_pwcsToken[ul])))
  463. {
  464. ul++;
  465. }
  466. if (!(HAS_PROP_COLON(GET_PROP(State.m_pwcsToken[ul]))))
  467. {
  468. return false;
  469. }
  470. ul++;
  471. if (!(HAS_PROP_SLASH(GET_PROP(State.m_pwcsToken[ul]))))
  472. {
  473. return false;
  474. }
  475. ul++;
  476. if (!(HAS_PROP_SLASH(GET_PROP(State.m_pwcsToken[ul]))))
  477. {
  478. return false;
  479. }
  480. {
  481. Trace(
  482. elVerbose,
  483. s_tagTokenizerDecision,
  484. ("%*.*S is an <alpha>:// url",
  485. State.m_ulEnd - State.m_ulStart,
  486. State.m_ulEnd - State.m_ulStart,
  487. State.m_pwcsToken + State.m_ulStart
  488. ));
  489. }
  490. OutputUrl(State);
  491. return true;
  492. }
  493. bool CTokenizer::VerifyWwwUrl()
  494. {
  495. CTokenState State(m_pCurToken->m_State);
  496. if (State.m_ulEnd - State.m_ulStart <= 4)
  497. {
  498. return false;
  499. }
  500. if (0 != _wcsnicmp(State.m_pwcsToken + State.m_ulStart, L"www.", 4))
  501. {
  502. return false;
  503. }
  504. Trace(
  505. elVerbose,
  506. s_tagTokenizerDecision,
  507. ("%*.*S is a www. url",
  508. State.m_ulEnd - State.m_ulStart,
  509. State.m_ulEnd - State.m_ulStart,
  510. State.m_pwcsToken + State.m_ulStart
  511. ));
  512. OutputUrl(State);
  513. return true;
  514. }
  515. bool CTokenizer::VerifyAcronym()
  516. {
  517. //
  518. // looking for I.B.M or I.B.M. or A.B.CC but not A.B.CC.
  519. //
  520. CTokenState State(m_pCurToken->m_State);
  521. CPropFlag AbbPuctTail(ACRONYM_PUNCT_TAIL);
  522. CPropFlag AbbPuctHead(ACRONYM_PUNCT_HEAD);
  523. bool fNeedToRemoveEos = true;
  524. if (TEST_PROP(State.m_Properties, (ACRONYM_PUNCT_TAIL | ACRONYM_PUNCT_HEAD)))
  525. {
  526. if (TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulEnd- 1]), ABBREVIATION_EOS))
  527. {
  528. fNeedToRemoveEos = false;
  529. }
  530. ULONG ulCharRemoved = m_pCurToken->RemoveTailPunct(AbbPuctTail, State);
  531. ulCharRemoved += m_pCurToken->RemoveHeadPunct(AbbPuctHead, State);
  532. if (ulCharRemoved)
  533. {
  534. m_pCurToken->ComputeStateProperties(State);
  535. }
  536. }
  537. const CCliticsTerm* pCliticsTerm;
  538. pCliticsTerm = VerifyClitics(State);
  539. ULONG ulEnd = State.m_ulEnd;
  540. ULONG ulCur = State.m_ulStart;
  541. if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
  542. {
  543. ulCur += pCliticsTerm->ulLen;
  544. }
  545. else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  546. {
  547. ulEnd -= pCliticsTerm->ulLen;
  548. }
  549. //
  550. // finding the last period
  551. //
  552. while ((ulEnd > ulCur) &&
  553. HAS_PROP_UPPER_CASE(GET_PROP(State.m_pwcsToken[ulEnd- 1])))
  554. {
  555. ulEnd--;
  556. }
  557. if ((ulEnd == ulCur) ||
  558. !HAS_PROP_PERIOD(GET_PROP(State.m_pwcsToken[ulEnd- 1])))
  559. {
  560. return false;
  561. }
  562. ULONG ulCounter = 0;
  563. while (ulCur < ulEnd)
  564. {
  565. if (ulCounter%2 == 0)
  566. {
  567. if (!HAS_PROP_UPPER_CASE(GET_PROP(State.m_pwcsToken[ulCur])))
  568. {
  569. return false;
  570. }
  571. }
  572. else
  573. {
  574. if (!HAS_PROP_PERIOD(GET_PROP(State.m_pwcsToken[ulCur])))
  575. {
  576. return false;
  577. }
  578. }
  579. ulCur++;
  580. ulCounter++;
  581. }
  582. Trace(
  583. elVerbose,
  584. s_tagTokenizerDecision,
  585. ("%*.*S is an acronym",
  586. State.m_ulEnd - State.m_ulStart,
  587. State.m_ulEnd - State.m_ulStart,
  588. State.m_pwcsToken + State.m_ulStart
  589. ));
  590. if (fNeedToRemoveEos && (pCliticsTerm->ulOp != TAIL_MATCH_TRUNCATE))
  591. {
  592. m_pCurToken->m_fHasEos = false;
  593. }
  594. OutputAcronym(State, pCliticsTerm);
  595. return true;
  596. }
  597. bool CTokenizer::VerifyAbbreviation()
  598. {
  599. //
  600. // looking for Sr. Jr.
  601. // we define an abbreviation as a pattern with 2 letters ending with a dot, where
  602. // the first letter is a capital one
  603. //
  604. CTokenState State(m_pCurToken->m_State);
  605. CPropFlag AbbPuctTail(ABBREVIATION_PUNCT_TAIL);
  606. CPropFlag AbbPuctHead(ABBREVIATION_PUNCT_HEAD);
  607. bool fNeedToRemoveEos = true;
  608. if (TEST_PROP(State.m_Properties, (ABBREVIATION_PUNCT_TAIL | ABBREVIATION_PUNCT_HEAD)))
  609. {
  610. if (TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulEnd- 1]), ABBREVIATION_EOS))
  611. {
  612. fNeedToRemoveEos = false;
  613. }
  614. ULONG ulCharRemoved = m_pCurToken->RemoveTailPunct(AbbPuctTail, State);
  615. ulCharRemoved += m_pCurToken->RemoveHeadPunct(AbbPuctHead, State);
  616. if (ulCharRemoved)
  617. {
  618. m_pCurToken->ComputeStateProperties(State);
  619. }
  620. }
  621. if ((State.m_ulEnd - State.m_ulStart) != 3)
  622. {
  623. return false;
  624. }
  625. if (!HAS_PROP_UPPER_CASE(GET_PROP(State.m_pwcsToken[State.m_ulStart])))
  626. {
  627. return false;
  628. }
  629. if (!HAS_PROP_EXTENDED_ALPHA(GET_PROP(State.m_pwcsToken[State.m_ulStart + 1])))
  630. {
  631. return false;
  632. }
  633. if (!HAS_PROP_PERIOD(GET_PROP(State.m_pwcsToken[State.m_ulStart + 2])))
  634. {
  635. return false;
  636. }
  637. Trace(
  638. elVerbose,
  639. s_tagTokenizerDecision,
  640. ("%*.*S is an abbreviation",
  641. State.m_ulEnd - State.m_ulStart,
  642. State.m_ulEnd - State.m_ulStart,
  643. State.m_pwcsToken + State.m_ulStart
  644. ));
  645. if (fNeedToRemoveEos)
  646. {
  647. m_pCurToken->m_fHasEos = false;
  648. }
  649. OutputAbbreviation(State);
  650. return true;
  651. }
  652. bool CTokenizer::VerifySpecialAbbreviation()
  653. {
  654. CTokenState State(m_pCurToken->m_State);
  655. CPropFlag AbbPuctTail(SPECIAL_ABBREVIATION_PUNCT_TAIL);
  656. CPropFlag AbbPuctHead(SPECIAL_ABBREVIATION_PUNCT_HEAD);
  657. if (TEST_PROP(State.m_Properties, (SPECIAL_ABBREVIATION_PUNCT_TAIL | SPECIAL_ABBREVIATION_PUNCT_HEAD)))
  658. {
  659. ULONG ulCharRemoved = m_pCurToken->RemoveTailPunct(AbbPuctTail, State);
  660. ulCharRemoved += m_pCurToken->RemoveHeadPunct(AbbPuctHead, State);
  661. if (ulCharRemoved)
  662. {
  663. m_pCurToken->ComputeStateProperties(State);
  664. }
  665. if (!HAS_PROP_PERIOD(State.m_Properties))
  666. {
  667. return false;
  668. }
  669. }
  670. const CCliticsTerm* pCliticsTerm;
  671. pCliticsTerm = VerifyClitics(State);
  672. ULONG ulAddToStart = 0;
  673. ULONG ulDecFromEnd = 0;
  674. if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
  675. {
  676. ulAddToStart = pCliticsTerm->ulLen;
  677. }
  678. else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  679. {
  680. ulDecFromEnd = pCliticsTerm->ulLen;
  681. }
  682. CAbbTerm* pTerm;
  683. short sResCount = 0;
  684. DictStatus status;
  685. CSpecialAbbreviationSet* pAbbSet = m_apLangSupport->GetAbbSet();
  686. status = pAbbSet->m_trieAbb.trie_Find(
  687. State.m_pwcsToken + State.m_ulStart + ulAddToStart,
  688. TRIE_LONGEST_MATCH | TRIE_IGNORECASE,
  689. 1,
  690. &pTerm,
  691. &sResCount);
  692. if (sResCount &&
  693. (pTerm->ulAbbLen == (State.m_ulEnd - State.m_ulStart - ulAddToStart - ulDecFromEnd)))
  694. {
  695. Trace(
  696. elVerbose,
  697. s_tagTokenizerDecision,
  698. ("%*.*S is an abbreviation",
  699. State.m_ulEnd - State.m_ulStart,
  700. State.m_ulEnd - State.m_ulStart,
  701. State.m_pwcsToken + State.m_ulStart
  702. ));
  703. OutputSpecialAbbreviation(State, pTerm, pCliticsTerm);
  704. return true;
  705. }
  706. return false;
  707. }
  708. bool CTokenizer::VerifyMisc()
  709. {
  710. CTokenState State(m_pCurToken->m_State);
  711. CPropFlag MiscPuctTail(MISC_PUNCT_TAIL);
  712. CPropFlag MiscPuctHead(MISC_PUNCT_HEAD);
  713. if (TEST_PROP(State.m_Properties, (MISC_PUNCT_TAIL | MISC_PUNCT_HEAD)))
  714. {
  715. ULONG ulCharRemoved = m_pCurToken->RemoveTailPunct(MiscPuctTail, State);
  716. ulCharRemoved += m_pCurToken->RemoveHeadPunct(MiscPuctHead, State);
  717. if (ulCharRemoved)
  718. {
  719. m_pCurToken->ComputeStateProperties(State);
  720. }
  721. }
  722. const CCliticsTerm* pCliticsTerm;
  723. pCliticsTerm = VerifyClitics(State);
  724. ULONG ulAddToStart = 0;
  725. ULONG ulDecFromEnd = 0;
  726. if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
  727. {
  728. ulAddToStart = pCliticsTerm->ulLen;
  729. }
  730. else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  731. {
  732. ulDecFromEnd = pCliticsTerm->ulLen;
  733. }
  734. int iEnd = State.m_ulEnd - ulDecFromEnd;
  735. int iStart = State.m_ulStart + ulAddToStart;
  736. if (iEnd <= iStart)
  737. {
  738. return false;
  739. }
  740. bool bPatternContainOnlyUpperCase = true;
  741. ULONG ulSuffixSize = 0;
  742. if (TEST_PROP(State.m_Properties, PROP_POUND))
  743. {
  744. //
  745. // look for A# C#
  746. //
  747. ULONG ulEnd = State.m_ulEnd - ulDecFromEnd;
  748. ULONG ulStart = State.m_ulStart + ulAddToStart;
  749. if (ulEnd - ulStart != 2)
  750. {
  751. return false;
  752. }
  753. if (!TEST_PROP(GET_PROP(State.m_pwcsToken[ulEnd - 1]), PROP_POUND))
  754. {
  755. return false;
  756. }
  757. if (!TEST_PROP(GET_PROP(State.m_pwcsToken[ulStart]), PROP_UPPER_CASE))
  758. {
  759. return false;
  760. }
  761. ulSuffixSize = 1;
  762. }
  763. else
  764. {
  765. //
  766. // look for C++ COM+ ...
  767. //
  768. ULONG ul = State.m_ulEnd - ulDecFromEnd - 1;
  769. while ((int)ul >= (int)(State.m_ulStart + ulAddToStart))
  770. {
  771. if (!TEST_PROP(GET_PROP(State.m_pwcsToken[ul]), PROP_PLUS | PROP_MINUS))
  772. {
  773. break;
  774. }
  775. ulSuffixSize++;
  776. ul--;
  777. }
  778. if (ulSuffixSize > 2)
  779. {
  780. return false;
  781. }
  782. while ((int)ul >= (int)(State.m_ulStart + ulAddToStart))
  783. {
  784. CPropFlag prop(GET_PROP(State.m_pwcsToken[ul]));
  785. if (!HAS_PROP_EXTENDED_ALPHA(prop))
  786. {
  787. return false;
  788. }
  789. if (!TEST_PROP(prop, PROP_UPPER_CASE))
  790. {
  791. bPatternContainOnlyUpperCase = false;
  792. }
  793. ul--;
  794. }
  795. }
  796. Trace(
  797. elVerbose,
  798. s_tagTokenizerDecision,
  799. ("%*.*S is detected",
  800. State.m_ulEnd - State.m_ulStart,
  801. State.m_ulEnd - State.m_ulStart,
  802. State.m_pwcsToken + State.m_ulStart
  803. ));
  804. OutputMisc(
  805. State,
  806. bPatternContainOnlyUpperCase,
  807. ulSuffixSize,
  808. pCliticsTerm);
  809. return true;
  810. }
  811. bool CTokenizer::VerifyHyphenation()
  812. {
  813. //
  814. // looking for data-base
  815. //
  816. CPropFlag PunctHead(HYPHENATION_PUNCT_HEAD);
  817. CPropFlag PunctTail(HYPHENATION_PUNCT_TAIL);
  818. CTokenState State(m_pCurToken->m_State);
  819. if (TEST_PROP(State.m_Properties, (HYPHENATION_PUNCT_HEAD | HYPHENATION_PUNCT_TAIL)))
  820. {
  821. ULONG ulCharRemoved;
  822. ulCharRemoved = m_pCurToken->RemoveHeadPunct(PunctHead, State);
  823. ulCharRemoved += m_pCurToken->RemoveTailPunct(PunctTail, State);
  824. if (ulCharRemoved)
  825. {
  826. m_pCurToken->ComputeStateProperties(State);
  827. }
  828. }
  829. if (!HAS_PROP_DASH(State.m_Properties))
  830. {
  831. return false;
  832. }
  833. const CCliticsTerm* pCliticsTerm;
  834. pCliticsTerm = VerifyClitics(State);
  835. ULONG ulAddToStart = 0;
  836. ULONG ulDecFromEnd = 0;
  837. if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
  838. {
  839. ulAddToStart = pCliticsTerm->ulLen;
  840. }
  841. else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  842. {
  843. ulDecFromEnd = pCliticsTerm->ulLen;
  844. }
  845. ULONG ulCur = State.m_ulStart + ulAddToStart;
  846. ULONG ulEnd = State.m_ulEnd - ulDecFromEnd;
  847. bool bReadAlpha = false;
  848. do
  849. {
  850. while (ulCur < ulEnd)
  851. {
  852. if (HAS_PROP_EXTENDED_ALPHA(GET_PROP(m_pCurToken->m_State.m_pwcsToken[ulCur])))
  853. {
  854. ulCur++;
  855. bReadAlpha = true;
  856. continue;
  857. }
  858. break;
  859. }
  860. if (!bReadAlpha)
  861. {
  862. return false;
  863. }
  864. if (ulCur < ulEnd)
  865. {
  866. if (!HAS_PROP_DASH(GET_PROP(m_pCurToken->m_State.m_pwcsToken[ulCur])))
  867. {
  868. return false;
  869. }
  870. }
  871. else
  872. {
  873. break;
  874. }
  875. ulCur++;
  876. bReadAlpha = false;
  877. }
  878. while (ulCur < ulEnd);
  879. if (!bReadAlpha)
  880. {
  881. //
  882. // last characters were not alpha, e.g. free-
  883. //
  884. return false;
  885. }
  886. Trace(
  887. elVerbose,
  888. s_tagTokenizerDecision,
  889. ("%*.*S is an hyphenation",
  890. State.m_ulEnd - State.m_ulStart,
  891. State.m_ulEnd - State.m_ulStart,
  892. State.m_pwcsToken + State.m_ulStart
  893. ));
  894. OutputHyphenation(State, pCliticsTerm);
  895. return true;
  896. }
  897. bool CTokenizer::VerifyParens()
  898. {
  899. CPropFlag PunctTail(PAREN_PUNCT_TAIL);
  900. CPropFlag PunctHead(PAREN_PUNCT_HEAD);
  901. CTokenState State(m_pCurToken->m_State);
  902. if (TEST_PROP(State.m_Properties, (PAREN_PUNCT_TAIL | PAREN_PUNCT_HEAD)))
  903. {
  904. ULONG ulCharRemoved;
  905. ulCharRemoved = m_pCurToken->RemoveTailPunct(PunctTail, State);
  906. ulCharRemoved += m_pCurToken->RemoveHeadPunct(PunctHead, State);
  907. if (ulCharRemoved)
  908. {
  909. m_pCurToken->ComputeStateProperties(State);
  910. }
  911. }
  912. //
  913. // looking for (s)
  914. //
  915. if ((State.m_ulEnd - State.m_ulStart) < 4)
  916. {
  917. return false;
  918. }
  919. if (0 != wcsncmp(State.m_pwcsToken + State.m_ulEnd - 3, L"(s)", 3))
  920. {
  921. return false;
  922. }
  923. for (ULONG ul = State.m_ulStart; ul < State.m_ulEnd - 3; ul++)
  924. {
  925. if (!HAS_PROP_EXTENDED_ALPHA(GET_PROP(State.m_pwcsToken[ul])))
  926. {
  927. return false;
  928. }
  929. }
  930. Trace(
  931. elVerbose,
  932. s_tagTokenizerDecision,
  933. ("%*.*S has (s) parenthesis",
  934. State.m_ulEnd - State.m_ulStart,
  935. State.m_ulEnd - State.m_ulStart,
  936. State.m_pwcsToken + State.m_ulStart
  937. ));
  938. OutputParens(State);
  939. return true;
  940. }
  941. const CCliticsTerm* CTokenizer::VerifyClitics(CTokenState& S)
  942. {
  943. if (TEST_PROP(GET_PROP(S.m_pwcsToken[S.m_ulStart]), PROP_APOSTROPHE))
  944. {
  945. S.m_ulStart++;
  946. if ((TEST_PROP(GET_PROP(S.m_pwcsToken[S.m_ulEnd - 1]), PROP_APOSTROPHE)) &&
  947. (S.m_ulEnd > S.m_ulStart))
  948. {
  949. S.m_ulEnd--;
  950. }
  951. m_pCurToken->ComputeStateProperties(S);
  952. }
  953. if (!(HAS_PROP_APOSTROPHE(S.m_Properties)))
  954. {
  955. return &g_EmptyClitics;
  956. }
  957. CPropFlag PunctTail(CLITICS_PUNC_TAIL);
  958. CPropFlag PunctHead(CLITICS_PUNCT_HEAD);
  959. CTokenState State(S);
  960. if (TEST_PROP(State.m_Properties, (CLITICS_PUNC_TAIL | CLITICS_PUNCT_HEAD)))
  961. {
  962. ULONG ulCharRemoved;
  963. ulCharRemoved = m_pCurToken->RemoveTailPunct(PunctTail, State);
  964. ulCharRemoved += m_pCurToken->RemoveHeadPunct(PunctHead, State);
  965. if (ulCharRemoved)
  966. {
  967. m_pCurToken->ComputeStateProperties(State);
  968. }
  969. }
  970. Trace(
  971. elVerbose,
  972. s_tagTokenizerSuspect,
  973. ("%*.*S suspected to have an apostophe",
  974. State.m_ulEnd - State.m_ulStart,
  975. State.m_ulEnd - State.m_ulStart,
  976. State.m_pwcsToken + State.m_ulStart
  977. ));
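// ULONG(-1) serves as the "no apostrophe seen yet" marker and is compared
// against -1 again below.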
  978. ULONG ulApostrophePos = -1;
  979. ULONG ulCur;
  980. for (ulCur = State.m_ulStart; ulCur < State.m_ulEnd ; ulCur++)
  981. {
  982. if (TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur]), PROP_APOSTROPHE))
  983. {
  984. if ((-1 != ulApostrophePos) || (State.m_ulStart == ulCur))
  985. {
  986. //
  987. // this is not the first \' (so this is not a valid clitic),
  988. // or the term starts with an apostrophe
  989. //
  990. return &g_EmptyClitics;
  991. }
  992. ulApostrophePos = ulCur;
  993. //
  994. // replace the apostrophe with an ascii apostrophe.
  995. //
  996. State.m_pwcsToken[ulCur] = L'\'';
  997. continue;
  998. }
  999. }
  1000. //
  1001. // looking for xxxxs'
  1002. //
  1003. if ((ulApostrophePos == State.m_ulEnd - 1) &&
  1004. (State.m_pwcsToken[ulApostrophePos - 1] == L's'))
  1005. {
  1006. Trace(
  1007. elVerbose,
  1008. s_tagTokenizerDecision,
  1009. ("%*.*S has a s' clitcs",
  1010. State.m_ulEnd - State.m_ulStart,
  1011. State.m_ulEnd - State.m_ulStart,
  1012. State.m_pwcsToken + State.m_ulStart
  1013. ));
  1014. S = State;
  1015. return &g_SClitics;
  1016. }
  1017. //
  1018. // looking for tail clitics like xxx's
  1019. //
  1020. DictStatus status;
  1021. CCliticsTerm* pTerm;
  1022. short sResCount = 0;
  1023. if (ulCur > State.m_ulStart)
  1024. {
  1025. status = g_pClitics->m_trieClitics.trie_Find(
  1026. State.m_pwcsToken + ulApostrophePos,
  1027. TRIE_LONGEST_MATCH | TRIE_IGNORECASE,
  1028. 1,
  1029. &pTerm,
  1030. &sResCount);
  1031. if (sResCount && pTerm->ulLen == (State.m_ulEnd - ulApostrophePos))
  1032. {
  1033. Trace(
  1034. elVerbose,
  1035. s_tagTokenizerDecision,
  1036. ("%*.*S has a %S clitcs",
  1037. State.m_ulEnd - State.m_ulStart,
  1038. State.m_ulEnd - State.m_ulStart,
  1039. State.m_pwcsToken + State.m_ulStart,
  1040. pTerm->pwcs
  1041. ));
  1042. S = State;
  1043. return pTerm;
  1044. }
  1045. }
  1046. //
  1047. // looking for head clitics like l'xxxx
  1048. //
  1049. status = g_pClitics->m_trieClitics.trie_Find(
  1050. State.m_pwcsToken + State.m_ulStart,
  1051. TRIE_LONGEST_MATCH | TRIE_IGNORECASE,
  1052. 1,
  1053. &pTerm,
  1054. &sResCount);
  1055. if (sResCount)
  1056. {
  1057. Trace(
  1058. elVerbose,
  1059. s_tagTokenizerDecision,
  1060. ("%*.*S has a %S clitcs",
  1061. State.m_ulEnd - State.m_ulStart,
  1062. State.m_ulEnd - State.m_ulStart,
  1063. State.m_pwcsToken + State.m_ulStart,
  1064. pTerm->pwcs
  1065. ));
  1066. S = State;
  1067. return pTerm;
  1068. }
  1069. return &g_EmptyClitics;
  1070. }
  1071. bool CTokenizer::VerifyNumberOrTimeOrDate()
  1072. {
  1073. CPropFlag PunctHead(NUM_DATE_TIME_PUNCT_HEAD);
  1074. CPropFlag PunctTail(NUM_DATE_TIME_PUNCT_TAIL);
  1075. CTokenState State(m_pCurToken->m_State);
  1076. if (TEST_PROP(State.m_Properties,
  1077. (NUM_DATE_TIME_PUNCT_HEAD | NUM_DATE_TIME_PUNCT_TAIL)))
  1078. {
  1079. ULONG ulCharRemoved;
  1080. ulCharRemoved= m_pCurToken->RemoveHeadPunct(PunctHead, State);
  1081. ulCharRemoved += m_pCurToken->RemoveTailPunct(PunctTail, State);
  1082. if (ulCharRemoved)
  1083. {
  1084. m_pCurToken->ComputeStateProperties(State);
  1085. }
  1086. }
  1087. if ((TEST_PROP(
  1088. State.m_Properties,
  1089. (GET_PROP(m_apLangSupport->GetTimeSeperator()).m_ulFlag))) ||
  1090. HAS_PROP_ALPHA(State.m_Properties))
  1091. {
  1092. //
  1093. // suspected to be time 12:33 14:22 15:22:33
  1094. // or AM/PM time format 12:22AM 13PM
  1095. //
  1096. Trace(
  1097. elVerbose,
  1098. s_tagTokenizerSuspect,
  1099. ("%*.*S suspected to be AM/PM time",
  1100. State.m_ulEnd - State.m_ulStart,
  1101. State.m_ulEnd - State.m_ulStart,
  1102. State.m_pwcsToken + State.m_ulStart
  1103. ));
  1104. if (VerifyTime(State))
  1105. {
  1106. return true;
  1107. }
  1108. }
  1109. Trace(
  1110. elVerbose,
  1111. s_tagTokenizerSuspect,
  1112. ("%*.*S suspected to be a simple number",
  1113. State.m_ulEnd - State.m_ulStart,
  1114. State.m_ulEnd - State.m_ulStart,
  1115. State.m_pwcsToken + State.m_ulStart
  1116. ));
  1117. if (VerifyNumber(State))
  1118. {
  1119. return true;
  1120. }
  1121. if (TEST_PROP(State.m_Properties, PROP_DATE_SEPERATOR))
  1122. {
  1123. //
  1124. // suspected to be a date 1999-05-04 or 1998/11/10 1999.05.04
  1125. //
  1126. Trace(
  1127. elVerbose,
  1128. s_tagTokenizerSuspect,
  1129. ("%*.*S suspected to be a date",
  1130. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  1131. m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
  1132. m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
  1133. ));
  1134. return VerifyDate(State);
  1135. }
  1136. return false;
  1137. }
  1138. bool CTokenizer::VerifyTime(CTokenState& S)
  1139. {
  1140. CTokenState State(S);
  1141. CPropFlag PunctHead(TIME_ADDITIONAL_PUNCT_HEAD);
  1142. CPropFlag PunctTail(TIME_ADDITIONAL_PUNCT_TAIL);
  1143. if (TEST_PROP(State.m_Properties,
  1144. (TIME_ADDITIONAL_PUNCT_HEAD | TIME_ADDITIONAL_PUNCT_TAIL)))
  1145. {
  1146. ULONG ulCharRemoved;
  1147. ulCharRemoved= m_pCurToken->RemoveHeadPunct(PunctHead, State);
  1148. ulCharRemoved += m_pCurToken->RemoveTailPunct(PunctTail, State);
  1149. if (ulCharRemoved)
  1150. {
  1151. m_pCurToken->ComputeStateProperties(State);
  1152. }
  1153. }
  1154. if ((State.m_ulEnd - State.m_ulStart) > MAX_TIME_FORMAT_LEN)
  1155. {
  1156. return false;
  1157. }
  1158. WCHAR pwcsBuf[MAX_TIME_FORMAT_LEN + 1];
  1159. ULONG ulCur = State.m_ulStart;
  1160. WCHAR wcSeperator = 0xFFFF;
  1161. ULONG ul = 0;
  1162. //
  1163. // formatting the text to a time format
  1164. //
  1165. while (ulCur < State.m_ulEnd)
  1166. {
  1167. CPropFlag prop(GET_PROP(State.m_pwcsToken[ulCur]));
  1168. if (HAS_PROP_NUMBER(prop))
  1169. {
  1170. pwcsBuf[ul] = L'#';
  1171. }
  1172. else if (State.m_pwcsToken[ulCur] == m_apLangSupport->GetTimeSeperator())
  1173. {
  1174. if (0xFFFF == wcSeperator)
  1175. {
  1176. wcSeperator = State.m_pwcsToken[ulCur];
  1177. }
  1178. else if (wcSeperator != State.m_pwcsToken[ulCur])
  1179. {
  1180. return false;
  1181. }
  1182. pwcsBuf[ul] = L':';
  1183. }
  1184. else if (HAS_PROP_ALPHA(prop) || HAS_PROP_PERIOD(prop))
  1185. {
  1186. pwcsBuf[ul] = State.m_pwcsToken[ulCur];
  1187. }
  1188. else
  1189. {
  1190. return false;
  1191. }
  1192. ul++;
  1193. ulCur++;
  1194. }
  1195. pwcsBuf[ul] = L'\0';
  1196. CTimeTerm* pTerm;
  1197. short sResCount = 0;
  1198. DictStatus status;
  1199. status = g_pTimeFormat->m_trieTimeFormat.trie_Find(
  1200. pwcsBuf,
  1201. TRIE_LONGEST_MATCH | TRIE_IGNORECASE,
  1202. 1,
  1203. &pTerm,
  1204. &sResCount);
  1205. if (!(sResCount && (pTerm->bLen == ul)))
  1206. {
  1207. return false;
  1208. }
  1209. LONG lHour;
  1210. LONG lMin;
  1211. LONG lSec;
  1212. TimeFormat AmPm;
  1213. GetValuesFromTimeString(
  1214. pTerm,
  1215. State.m_pwcsToken + State.m_ulStart ,
  1216. &lHour,
  1217. &lMin,
  1218. &lSec,
  1219. &AmPm);
  1220. if (None == AmPm)
  1221. {
  1222. if (lHour > 24)
  1223. {
  1224. return false;
  1225. }
  1226. }
  1227. else
  1228. {
  1229. if (lHour > 12)
  1230. {
  1231. return false;
  1232. }
  1233. if (Am == AmPm)
  1234. {
  1235. if (12 == lHour)
  1236. {
  1237. lHour = 0;
  1238. }
  1239. }
  1240. else
  1241. {
  1242. if (lHour < 12)
  1243. {
  1244. lHour += 12;
  1245. }
  1246. }
  1247. }
  1248. if (lMin > 59)
  1249. {
  1250. return false;
  1251. }
  1252. if (lSec > 59)
  1253. {
  1254. return false;
  1255. }
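// Emit a normalized time key of the form TTHHMM (24-hour clock; seconds are dropped).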
  1256. WCHAR pwcsTime[9] = {L'\0',L'\0',L'\0',L'\0',L'\0',L'\0',L'\0',L'\0',L'\0'};
  1257. swprintf(pwcsTime, L"TT%02d%02d", lHour, lMin);
  1258. Trace(
  1259. elVerbose,
  1260. s_tagTokenizerDecision,
  1261. ("%*.*S is a time -> %S",
  1262. State.m_ulEnd - State.m_ulStart,
  1263. State.m_ulEnd - State.m_ulStart,
  1264. State.m_pwcsToken + State.m_ulStart,
  1265. pwcsTime));
  1266. OutputTime(pwcsTime, State);
  1267. return true;
  1268. }
  1269. bool CTokenizer::VerifyDate(CTokenState& S)
  1270. {
  1271. CTokenState State(S);
  1272. CPropFlag PunctHead(DATE_ADDITIONAL_PUNCT_HEAD);
  1273. CPropFlag PunctTail(DATE_ADDITIONAL_PUNCT_TAIL);
  1274. if (TEST_PROP(State.m_Properties,
  1275. (DATE_ADDITIONAL_PUNCT_HEAD | DATE_ADDITIONAL_PUNCT_TAIL)))
  1276. {
  1277. ULONG ulCharRemoved;
  1278. ulCharRemoved= m_pCurToken->RemoveHeadPunct(PunctHead, State);
  1279. ulCharRemoved += m_pCurToken->RemoveTailPunct(PunctTail, State);
  1280. if (ulCharRemoved)
  1281. {
  1282. m_pCurToken->ComputeStateProperties(State);
  1283. }
  1284. }
  1285. WCHAR pwcsBuf[MAX_DATE_FORMAT_LEN + 1];
  1286. if (State.m_ulEnd - State.m_ulStart > MAX_DATE_FORMAT_LEN)
  1287. {
  1288. return false;
  1289. }
  1290. ULONG ulCur = State.m_ulStart;
  1291. WCHAR wcSeperator = 0xFFFF;
  1292. ULONG ul = 0;
  1293. //
  1294. // formatting the text to a date format
  1295. //
  1296. while (ulCur < State.m_ulEnd)
  1297. {
  1298. CPropFlag prop(GET_PROP(State.m_pwcsToken[ulCur]));
  1299. if (HAS_PROP_NUMBER(prop))
  1300. {
  1301. pwcsBuf[ul] = L'#';
  1302. }
  1303. else if (HAS_PROP_PERIOD(prop) ||
  1304. HAS_PROP_DASH(prop) ||
  1305. HAS_PROP_SLASH(prop))
  1306. {
  1307. if (0xFFFF == wcSeperator)
  1308. {
  1309. wcSeperator = State.m_pwcsToken[ulCur];
  1310. }
  1311. else if (wcSeperator != State.m_pwcsToken[ulCur])
  1312. {
  1313. return false;
  1314. }
  1315. pwcsBuf[ul] = L'.';
  1316. }
  1317. else
  1318. {
  1319. return false;
  1320. }
  1321. ul++;
  1322. ulCur++;
  1323. }
  1324. pwcsBuf[ul] = L'\0';
  1325. CDateTerm* pTerm;
  1326. short sResCount = 0;
  1327. DictStatus status;
  1328. status = g_pDateFormat->m_trieDateFormat.trie_Find(
  1329. pwcsBuf,
  1330. TRIE_LONGEST_MATCH | TRIE_IGNORECASE,
  1331. 1,
  1332. &pTerm,
  1333. &sResCount);
  1334. if (!(sResCount && (pTerm->bLen == ul)))
  1335. {
  1336. return false;
  1337. }
  1338. LONG lD_M1;
  1339. LONG lD_M2;
  1340. LONG lYear;
  1341. GetValuesFromDateString(
  1342. pTerm,
  1343. State.m_pwcsToken + State.m_ulStart,
  1344. &lD_M1,
  1345. &lD_M2,
  1346. &lYear);
  1347. LONG lDay;
  1348. LONG lMonth;
  1349. //
  1350. // language dependent
  1351. //
  1352. if (m_apLangSupport->IsDayMonthOrder() ||
  1353. pTerm->bType == YYMMDD_TYPE)
  1354. {
  1355. lDay = lD_M1;
  1356. lMonth = lD_M2;
  1357. }
  1358. else
  1359. {
  1360. lDay = lD_M2;
  1361. lMonth = lD_M1;
  1362. }
  1363. if (!((lDay > 0) && (lDay <= 31)))
  1364. {
  1365. return false;
  1366. }
  1367. if (!((lMonth > 0) && (lMonth <= 12)))
  1368. {
  1369. return false;
  1370. }
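// Build a normalized date key of the form DDYYYYMMDD; for a two-digit year a
// second key with the 20xx reading is also produced (see bY2K below).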
  1371. WCHAR pwcsDate1[11] = { L'D', L'D', L'0', L'0', L'0', L'0', L'0', L'0', L'0', L'0', L'\0'};
  1372. WCHAR pwcsDate2[11];
  1373. bool bY2K = false;
  1374. if (lYear <= 99) // Y2k bug
  1375. {
  1376. _ltow(lYear + 1900, pwcsDate1 + 2, 10);
  1377. bY2K = true;
  1378. }
  1379. else if (lYear < 1000)
  1380. {
  1381. _ltow(lYear, pwcsDate1 + 3, 10);
  1382. }
  1383. else
  1384. {
  1385. _ltow(lYear, pwcsDate1 + 2, 10);
  1386. }
  1387. if (lMonth < 10)
  1388. {
  1389. pwcsDate1[6] = L'0';
  1390. _ltow(lMonth, pwcsDate1 + 7, 10);
  1391. }
  1392. else
  1393. {
  1394. _ltow(lMonth, pwcsDate1 + 6, 10);
  1395. }
  1396. if (lDay < 10)
  1397. {
  1398. pwcsDate1[8] = L'0';
  1399. _ltow(lDay, pwcsDate1 + 9, 10);
  1400. }
  1401. else
  1402. {
  1403. _ltow(lDay, pwcsDate1 + 8, 10);
  1404. }
  1405. if (bY2K)
  1406. {
  1407. wcscpy(pwcsDate2, pwcsDate1);
  1408. pwcsDate2[2] = L'2';
  1409. pwcsDate2[3] = L'0';
  1410. }
  1411. Trace(
  1412. elVerbose,
  1413. s_tagTokenizerDecision,
  1414. ("%*.*S is a date",
  1415. State.m_ulEnd - State.m_ulStart,
  1416. State.m_ulEnd - State.m_ulStart,
  1417. State.m_pwcsToken + State.m_ulStart
  1418. ));
  1419. if (bY2K)
  1420. {
  1421. OutputDate(pwcsDate1, pwcsDate2, State);
  1422. }
  1423. else
  1424. {
  1425. OutputDate(pwcsDate1, NULL, State);
  1426. }
  1427. return true;
  1428. }
  1429. bool CTokenizer::VerifyNumber(CTokenState& S)
  1430. {
  1431. CTokenState State(S);
  1432. WCHAR pwcsNumber[TOKENIZER_MAXBUFFERLIMIT + 10];
  1433. ULONG ulOutLen;
  1434. ULONG ulOffsetToTxt;
  1435. const CCliticsTerm* pCliticsTerm;
  1436. pCliticsTerm = VerifyClitics(State);
  1437. ULONG ulAddToStart = 0;
  1438. ULONG ulDecFromEnd = 0;
  1439. if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
  1440. {
  1441. ulAddToStart = pCliticsTerm->ulLen;
  1442. }
  1443. else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  1444. {
  1445. ulDecFromEnd = pCliticsTerm->ulLen;
  1446. }
  1447. bool fRet = CheckAndCreateNumber(
  1448. State.m_pwcsToken + State.m_ulStart + ulAddToStart,
  1449. State.m_ulEnd - State.m_ulStart - ulAddToStart - ulDecFromEnd,
  1450. pwcsNumber,
  1451. &ulOffsetToTxt,
  1452. &ulOutLen);
  1453. if (!fRet)
  1454. {
  1455. return false;
  1456. }
  1457. Trace(
  1458. elVerbose,
  1459. s_tagTokenizerDecision,
  1460. ("%*.*S is a number",
  1461. State.m_ulEnd - State.m_ulStart,
  1462. State.m_ulEnd - State.m_ulStart,
  1463. State.m_pwcsToken + State.m_ulStart
  1464. ));
  1465. OutputNumbers(State, ulOutLen, pwcsNumber + ulOffsetToTxt, pCliticsTerm);
  1466. return true;
  1467. }
  1468. bool CTokenizer::VerifyCurrency()
  1469. {
  1470. //
  1471. // format is either $12.22 or 12.22$
  1472. //
  1473. CPropFlag PunctHead(CURRENCY_PUNCT_HEAD);
  1474. CPropFlag PunctTail(CURRENCY_PUNCT_TAIL);
  1475. CTokenState State(m_pCurToken->m_State);
  1476. if (TEST_PROP(State.m_Properties,
  1477. (CURRENCY_PUNCT_HEAD | CURRENCY_PUNCT_TAIL)))
  1478. {
  1479. ULONG ulCharRemoved;
  1480. ulCharRemoved= m_pCurToken->RemoveHeadPunct(PunctHead, State);
  1481. ulCharRemoved += m_pCurToken->RemoveTailPunct(PunctTail, State);
  1482. if (ulCharRemoved)
  1483. {
  1484. m_pCurToken->ComputeStateProperties(State);
  1485. }
  1486. }
  1487. const CCliticsTerm* pCliticsTerm;
  1488. pCliticsTerm = VerifyClitics(State);
  1489. ULONG ulAddToStart = 0;
  1490. ULONG ulDecFromEnd = 0;
  1491. if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
  1492. {
  1493. ulAddToStart = pCliticsTerm->ulLen;
  1494. }
  1495. else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  1496. {
  1497. ulDecFromEnd = pCliticsTerm->ulLen;
  1498. }
  1499. WCHAR wchCurrency;
  1500. WCHAR pwcsCurrency[TOKENIZER_MAXBUFFERLIMIT + 10];
  1501. WCHAR* pwcsStr = State.m_pwcsToken + State.m_ulStart;
  1502. if (HAS_PROP_CURRENCY(GET_PROP(State.m_pwcsToken[State.m_ulStart + ulAddToStart])))
  1503. {
  1504. wchCurrency = State.m_pwcsToken[State.m_ulStart + ulAddToStart];
  1505. pwcsStr += 1;
  1506. }
  1507. else if (HAS_PROP_CURRENCY(GET_PROP(State.m_pwcsToken[State.m_ulEnd - 1 - ulDecFromEnd])))
  1508. {
  1509. wchCurrency = State.m_pwcsToken[State.m_ulEnd - 1 - ulDecFromEnd];
  1510. }
  1511. else
  1512. {
  1513. return false;
  1514. }
  1515. ULONG ulOutLen;
  1516. ULONG ulOffsetToTxt;
  1517. if (false == CheckAndCreateNumber(
  1518. pwcsStr + ulAddToStart,
  1519. State.m_ulEnd - State.m_ulStart - 1 - ulAddToStart - ulDecFromEnd,
  1520. pwcsCurrency,
  1521. &ulOffsetToTxt,
  1522. &ulOutLen))
  1523. {
  1524. return false;
  1525. }
  1526. Assert(ulOffsetToTxt + ulOutLen + 1 < m_ulMaxTokenSize + 4);
  1527. pwcsCurrency[ulOffsetToTxt + ulOutLen] = wchCurrency;
  1528. pwcsCurrency[ulOffsetToTxt + ulOutLen + 1] = L'\0';
  1529. Trace(
  1530. elVerbose,
  1531. s_tagTokenizerDecision,
  1532. ("%*.*S is a currency",
  1533. State.m_ulEnd - State.m_ulStart,
  1534. State.m_ulEnd - State.m_ulStart,
  1535. State.m_pwcsToken + State.m_ulStart
  1536. ));
  1537. OutputCurrency(ulOutLen+1, pwcsCurrency + ulOffsetToTxt , State, pCliticsTerm);
  1538. return true;
  1539. }
  1540. bool CTokenizer::VerifyCommersialSign()
  1541. {
  1542. CTokenState State(m_pCurToken->m_State);
  1543. CPropFlag CommPunctTail(COMMERSIAL_SIGN_PUNCT_TAIL);
  1544. CPropFlag CommPunctHead(COMMERSIAL_SIGN_PUNCT_HEAD);
  1545. if (TEST_PROP(State.m_Properties, (COMMERSIAL_SIGN_PUNCT_TAIL | COMMERSIAL_SIGN_PUNCT_HEAD)))
  1546. {
  1547. ULONG ulCharRemoved = m_pCurToken->RemoveTailPunct(CommPunctTail, State);
  1548. ulCharRemoved += m_pCurToken->RemoveHeadPunct(CommPunctHead, State);
  1549. if (ulCharRemoved)
  1550. {
  1551. m_pCurToken->ComputeStateProperties(State);
  1552. }
  1553. }
  1554. if (TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulEnd - 1]),
  1555. PROP_COMMERSIAL_SIGN))
  1556. {
  1557. //
  1558. // the length of the token must be greater than 1 since it includes an alpha
  1559. // and the commercial sign
  1560. //
  1561. Assert((State.m_ulEnd - State.m_ulStart) > 1);
  1562. OutputCommersialSignToken(State);
  1563. return true;
  1564. }
  1565. return false;
  1566. }
  1567. void CTokenizer::ProcessDefault()
  1568. {
  1569. CTokenState State(m_pCurToken->m_State);
  1570. if (TEST_PROP(State.m_Properties, PROP_DEFAULT_BREAKER))
  1571. {
  1572. if (TEST_PROP(State.m_Properties, PROP_FIRST_LEVEL_BREAKER))
  1573. {
  1574. CPropFlag prop(PROP_FIRST_LEVEL_BREAKER);
  1575. BreakCompundString(State, prop);
  1576. return;
  1577. }
  1578. if (TEST_PROP(State.m_Properties, PROP_SECOND_LEVEL_BREAKER))
  1579. {
  1580. CPropFlag prop(PROP_SECOND_LEVEL_BREAKER);
  1581. BreakCompundString(State, prop);
  1582. return;
  1583. }
  1584. }
  1585. //
  1586. // this is a simple token
  1587. //
  1588. const CCliticsTerm* pCliticsTerm;
  1589. pCliticsTerm = VerifyClitics(State);
  1590. if (pCliticsTerm == &g_EmptyClitics)
  1591. {
  1592. if (TEST_PROP(State.m_Properties, PROP_NBS))
  1593. {
  1594. CPropFlag prop(PROP_NBS);
  1595. BreakCompundString(State, prop);
  1596. return;
  1597. }
  1598. CPropFlag PunctHead(SIMPLE_PUNCT_HEAD);
  1599. CPropFlag PunctTail(SIMPLE_PUNCT_TAIL);
  1600. if (TEST_PROP(State.m_Properties,
  1601. (SIMPLE_PUNCT_HEAD | SIMPLE_PUNCT_TAIL)))
  1602. {
  1603. ULONG ulCharRemoved;
  1604. ulCharRemoved= m_pCurToken->RemoveHeadPunct(PunctHead, State);
  1605. ulCharRemoved += m_pCurToken->RemoveTailPunct(PunctTail, State);
  1606. if ( TEST_PROP(State.m_Properties, PROP_UNDERSCORE) )
  1607. {
  1608. bool hasFrontUnderscore =
  1609. (State.m_ulStart > m_pCurToken->m_State.m_ulStart) &&
  1610. TEST_PROP( GET_PROP(State.m_pwcsToken[State.m_ulStart-1]),
  1611. PROP_UNDERSCORE ) &&
  1612. TEST_PROP( GET_PROP(State.m_pwcsToken[State.m_ulStart]),
  1613. PROP_ALPHA_NUMERIC );
  1614. bool hasBackUnderscore =
  1615. (State.m_ulEnd < m_pCurToken->m_State.m_ulEnd) &&
  1616. TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulEnd]),
  1617. PROP_UNDERSCORE) &&
  1618. TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulEnd-1]),
  1619. PROP_ALPHA_NUMERIC);
  1620. //
  1621. // Note: To change the policy to "leave ALL attached underscore
  1622. // sequences", simply change the below condition to:
  1623. // if ( (hasFrontUnderscore || hasBackUnderscore) )
  1624. //
  1625. if ( (hasFrontUnderscore ^ hasBackUnderscore) )
  1626. {
  1627. ulCharRemoved -=
  1628. AddBackUnderscores(
  1629. State,
  1630. hasFrontUnderscore,
  1631. hasBackUnderscore
  1632. );
  1633. }
  1634. } // if ( TEST_PROP(State.m_Properties, PROP_UNDERSCORE) )
  1635. if (ulCharRemoved)
  1636. {
  1637. m_pCurToken->ComputeStateProperties(State);
  1638. }
  1639. }
  1640. }
  1641. if (State.m_ulEnd == State.m_ulStart)
  1642. {
  1643. //
  1644. // case where we removed all characters in the above statement
  1645. //
  1646. return;
  1647. }
  1648. Trace(
  1649. elVerbose,
  1650. s_tagTokenizerDecision,
  1651. ("%*.*S is a simple token",
  1652. State.m_ulEnd - State.m_ulStart,
  1653. State.m_ulEnd - State.m_ulStart,
  1654. State.m_pwcsToken + State.m_ulStart
  1655. ));
  1656. OutputSimpleToken(State, pCliticsTerm);
  1657. }
  1658. //
  1659. // CTokenizer::AddBackUnderscores:
  1660. //
  1661. // Treat cases of a "simple" token with head and/or tail underscore
  1662. // sequence (consecutive underscores prefix or suffix); those
  1663. // do not get stripped off and remain part of the token.
  1664. // This routine is called after underscore removal (as a result of
  1665. // Remove[Head|Tail]Punct) and adds them back in.
  1666. //
  1667. // return value: Number of underscores added back in.
  1668. //
  1669. ULONG
  1670. CTokenizer::AddBackUnderscores(
  1671. IN CTokenState& State,
  1672. IN bool hasFrontUnderscore,
  1673. IN bool hasBackUnderscore
  1674. )
  1675. {
  1676. ULONG ulCharsAdded = 0;
  1677. if ( hasFrontUnderscore )
  1678. {
  1679. // Move left over consecutive underscores
  1680. ulCharsAdded = m_pCurToken->FindLeftmostUnderscore(State);
  1681. }
  1682. if ( hasBackUnderscore )
  1683. {
  1684. // Move right over consecutive underscores
  1685. ulCharsAdded += m_pCurToken->FindRightmostUnderscore(State);
  1686. } // if ( hasFrontUnderscore )
  1687. return ulCharsAdded;
  1688. } // CTokenizer::AddBackUnderscores()
  1689. void CTokenizer::OutputUrl(CTokenState& State)
  1690. {
  1691. HRESULT hr;
  1692. ULONG ulOffsetInTxtSourceBuffer =
  1693. m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  1694. ULONG ulCur = State.m_ulStart;
  1695. ULONG ulStart = ulCur;
  1696. ULONG ulLenInTxtSourceBuffer = 0;
  1697. ULONG ulOffsetDueToAnEscapeChar;
  1698. while (ulCur < State.m_ulEnd)
  1699. {
  1700. ulLenInTxtSourceBuffer++;
  1701. ulOffsetDueToAnEscapeChar = 0;
  1702. if ((State.m_pwcsToken[ulCur] == L'%') &&
  1703. (ulCur <= State.m_ulEnd - 2))
  1704. {
  1705. //
  1706. // replacing escape characters with real ones.
  1707. //
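// %XX and %uXXXX escapes are decoded in place: the decoded character replaces
// the last hex digit and the characters before the escape are shifted right,
// advancing ulStart past the consumed bytes.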
  1708. if (TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur+1]) , PROP_XDIGIT) &&
  1709. TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur+2]) , PROP_XDIGIT))
  1710. {
  1711. short sVal;
  1712. sVal = ConvertHexCharToNumber(State.m_pwcsToken[ulCur + 1]);
  1713. sVal *= 16;
  1714. sVal += ConvertHexCharToNumber(State.m_pwcsToken[ulCur + 2]);
  1715. State.m_pwcsToken[ulCur+2] = sVal;
  1716. for (ULONG ul = ulCur -1 ; ul >= ulStart; ul--)
  1717. {
  1718. State.m_pwcsToken[ul+2] = State.m_pwcsToken[ul];
  1719. }
  1720. ulCur += 2;
  1721. ulStart+=2;
  1722. ulOffsetDueToAnEscapeChar = 2;
  1723. ulLenInTxtSourceBuffer += 2;
  1724. }
  1725. else if ((ulCur <= State.m_ulEnd - 5) &&
  1726. ((State.m_pwcsToken[ulCur+1] == L'u') ||
  1727. (State.m_pwcsToken[ulCur+1] == L'U')) &&
  1728. TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur+2]) , PROP_XDIGIT) &&
  1729. TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur+3]) , PROP_XDIGIT) &&
  1730. TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur+4]) , PROP_XDIGIT) &&
  1731. TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur+5]) , PROP_XDIGIT))
  1732. {
  1733. short sVal;
  1734. sVal = ConvertHexCharToNumber(State.m_pwcsToken[ulCur + 2]);
  1735. sVal *= 0x1000;
  1736. sVal += ConvertHexCharToNumber(State.m_pwcsToken[ulCur + 3]);
  1737. sVal *= 0x100;
  1738. sVal += ConvertHexCharToNumber(State.m_pwcsToken[ulCur + 4]);
  1739. sVal *= 0x10;
  1740. sVal += ConvertHexCharToNumber(State.m_pwcsToken[ulCur + 5]);
  1741. State.m_pwcsToken[ulCur+5] = sVal;
  1742. for (ULONG ul = ulCur -1 ; ul >= ulStart; ul--)
  1743. {
  1744. State.m_pwcsToken[ul+5] = State.m_pwcsToken[ul];
  1745. }
  1746. ulCur += 5;
  1747. ulStart+=5;
  1748. ulOffsetDueToAnEscapeChar = 5;
  1749. ulLenInTxtSourceBuffer += 5;
  1750. }
  1751. }
  1752. if ( IS_BREAKER( State.m_pwcsToken[ulCur] ) )
  1753. {
  1754. if (ulCur - ulStart == 0)
  1755. {
  1756. //
  1757. // only punctuation
  1758. //
  1759. ulCur++;
  1760. ulStart = ulCur;
  1761. ulOffsetInTxtSourceBuffer += ulOffsetDueToAnEscapeChar + 1;
  1762. ulLenInTxtSourceBuffer = 0;
  1763. continue;
  1764. }
  1765. hr = m_apWordSink->PutWord(
  1766. ulCur - ulStart,
  1767. &State.m_pwcsToken[ulStart],
  1768. ulLenInTxtSourceBuffer - 1 - ulOffsetDueToAnEscapeChar,
  1769. ulOffsetInTxtSourceBuffer);
  1770. if (FAILED(hr))
  1771. {
  1772. THROW_HRESULT_EXCEPTION(hr);
  1773. }
  1774. ulStart = ulCur + 1;
  1775. ulOffsetInTxtSourceBuffer += ulLenInTxtSourceBuffer;
  1776. ulLenInTxtSourceBuffer = 0;
  1777. }
  1778. ulCur++;
  1779. }
  1780. //
  1781. // last word.
  1782. //
  1783. if (ulStart < ulCur)
  1784. {
  1785. hr = m_apWordSink->PutWord(
  1786. ulCur - ulStart,
  1787. &State.m_pwcsToken[ulStart],
  1788. ulLenInTxtSourceBuffer,
  1789. ulOffsetInTxtSourceBuffer);
  1790. if (FAILED(hr))
  1791. {
  1792. THROW_HRESULT_EXCEPTION(hr);
  1793. }
  1794. }
  1795. }
  1796. void CTokenizer::OutputNumbers(
  1797. CTokenState& State,
  1798. ULONG ulLen,
  1799. WCHAR* pwcsNumber,
  1800. const CCliticsTerm* pCliticsTerm)
  1801. {
  1802. HRESULT hr;
  1803. //
  1804. // Input: 1.22 Output: 1.22, NN1D22
  1805. //
  1806. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  1807. if (ulLen > m_ulMaxTokenSize)
  1808. {
  1809. hr = m_apWordSink->PutWord(
  1810. State.m_ulEnd - State.m_ulStart,
  1811. &State.m_pwcsToken[State.m_ulStart],
  1812. State.m_ulEnd - State.m_ulStart,
  1813. ulOffsetInTxtSourceBuffer);
  1814. if (FAILED(hr))
  1815. {
  1816. THROW_HRESULT_EXCEPTION(hr);
  1817. }
  1818. return;
  1819. }
  1820. hr = m_apWordSink->PutAltWord(
  1821. State.m_ulEnd - State.m_ulStart,
  1822. &State.m_pwcsToken[State.m_ulStart],
  1823. State.m_ulEnd - State.m_ulStart,
  1824. ulOffsetInTxtSourceBuffer);
  1825. if (FAILED(hr))
  1826. {
  1827. THROW_HRESULT_EXCEPTION(hr);
  1828. }
  1829. if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
  1830. {
  1831. hr = m_apWordSink->PutAltWord(
  1832. State.m_ulEnd - State.m_ulStart - pCliticsTerm->ulLen,
  1833. State.m_pwcsToken + State.m_ulStart + pCliticsTerm->ulLen,
  1834. State.m_ulEnd - State.m_ulStart,
  1835. ulOffsetInTxtSourceBuffer);
  1836. if (FAILED(hr))
  1837. {
  1838. THROW_HRESULT_EXCEPTION(hr);
  1839. }
  1840. }
  1841. else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  1842. {
  1843. hr = m_apWordSink->PutAltWord(
  1844. State.m_ulEnd - State.m_ulStart - pCliticsTerm->ulLen,
  1845. State.m_pwcsToken + State.m_ulStart,
  1846. State.m_ulEnd - State.m_ulStart,
  1847. ulOffsetInTxtSourceBuffer);
  1848. if (FAILED(hr))
  1849. {
  1850. THROW_HRESULT_EXCEPTION(hr);
  1851. }
  1852. }
  1853. hr = m_apWordSink->PutWord(
  1854. ulLen,
  1855. pwcsNumber,
  1856. State.m_ulEnd - State.m_ulStart,
  1857. ulOffsetInTxtSourceBuffer);
  1858. if (FAILED(hr))
  1859. {
  1860. THROW_HRESULT_EXCEPTION(hr);
  1861. }
  1862. }
  1863. void CTokenizer::OutputParens(CTokenState& State)
  1864. {
  1865. HRESULT hr;
  1866. //
  1867. // format is xxx(s)
  1868. // Input: xxx(s) Output: xxx
  1869. //
  1870. State.m_pwcsToken[State.m_ulEnd - 3] = L'\0';
  1871. hr = m_apWordSink->PutWord(
  1872. State.m_ulEnd - 3 - State.m_ulStart,
  1873. &State.m_pwcsToken[State.m_ulStart],
  1874. State.m_ulEnd - State.m_ulStart,
  1875. m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State));
  1876. if (FAILED(hr))
  1877. {
  1878. THROW_HRESULT_EXCEPTION(hr);
  1879. }
  1880. }
  1881. void CTokenizer::OutputAcronym(CTokenState& State, const CCliticsTerm* pCliticsTerm)
  1882. {
  1883. HRESULT hr;
  1884. //
  1885. // Input: I.B.M Output: I.B.M, IBM
  1886. //
  1887. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  1888. ULONG ulAddToStart = 0;
  1889. ULONG ulDecFromEnd = 0;
  1890. if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
  1891. {
  1892. ulAddToStart = pCliticsTerm->ulLen;
  1893. }
  1894. else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  1895. {
  1896. ulDecFromEnd = pCliticsTerm->ulLen;
  1897. }
  1898. hr = m_apWordSink->PutAltWord(
  1899. State.m_ulEnd - ulDecFromEnd - (State.m_ulStart + ulAddToStart),
  1900. State.m_pwcsToken + State.m_ulStart + ulAddToStart,
  1901. State.m_ulEnd - State.m_ulStart,
  1902. ulOffsetInTxtSourceBuffer);
  1903. if (FAILED(hr))
  1904. {
  1905. THROW_HRESULT_EXCEPTION(hr);
  1906. }
  1907. ULONG ulCur = State.m_ulStart + ulAddToStart;
  1908. ULONG ulNext = ulCur;
  1909. while (ulCur < State.m_ulEnd)
  1910. {
  1911. if (!HAS_PROP_PERIOD(GET_PROP(State.m_pwcsToken[ulCur])))
  1912. {
  1913. State.m_pwcsToken[ulNext] = State.m_pwcsToken[ulCur];
  1914. ulNext++;
  1915. ulCur++;
  1916. continue;
  1917. }
  1918. ulCur++;
  1919. }
  1920. if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  1921. {
  1922. hr = m_apWordSink->PutAltWord(
  1923. ulNext - (State.m_ulStart + ulAddToStart),
  1924. State.m_pwcsToken + State.m_ulStart + ulAddToStart,
  1925. State.m_ulEnd - State.m_ulStart,
  1926. ulOffsetInTxtSourceBuffer);
  1927. if (FAILED(hr))
  1928. {
  1929. THROW_HRESULT_EXCEPTION(hr);
  1930. }
  1931. }
  1932. hr = m_apWordSink->PutWord(
  1933. ulNext - ulDecFromEnd - (State.m_ulStart + ulAddToStart),
  1934. State.m_pwcsToken + State.m_ulStart + ulAddToStart,
  1935. State.m_ulEnd - State.m_ulStart,
  1936. ulOffsetInTxtSourceBuffer);
  1937. if (FAILED(hr))
  1938. {
  1939. THROW_HRESULT_EXCEPTION(hr);
  1940. }
  1941. }
  1942. void CTokenizer::OutputAbbreviation(CTokenState& State)
  1943. {
  1944. HRESULT hr;
  1945. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  1946. hr = m_apWordSink->PutAltWord(
  1947. State.m_ulEnd - State.m_ulStart - 1,
  1948. &State.m_pwcsToken[State.m_ulStart],
  1949. State.m_ulEnd - State.m_ulStart,
  1950. ulOffsetInTxtSourceBuffer);
  1951. if (FAILED(hr))
  1952. {
  1953. THROW_HRESULT_EXCEPTION(hr);
  1954. }
  1955. hr = m_apWordSink->PutWord(
  1956. State.m_ulEnd - State.m_ulStart,
  1957. &State.m_pwcsToken[State.m_ulStart],
  1958. State.m_ulEnd - State.m_ulStart,
  1959. ulOffsetInTxtSourceBuffer);
  1960. if (FAILED(hr))
  1961. {
  1962. THROW_HRESULT_EXCEPTION(hr);
  1963. }
  1964. }
  1965. void CTokenizer::OutputSpecialAbbreviation(
  1966. CTokenState& State,
  1967. CAbbTerm* pTerm,
  1968. const CCliticsTerm* pCliticsTerm)
  1969. {
  1970. HRESULT hr;
  1971. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  1972. WCHAR* pwcsAbb = pTerm->pwcsAbb;
  1973. ULONG ulLen = pTerm->ulAbbLen;
  1974. if (pTerm->pwcsCanonicalForm)
  1975. {
  1976. pwcsAbb = pTerm->pwcsCanonicalForm;
  1977. ulLen = pTerm->ulCanLen;
  1978. }
  1979. if (TAIL_MATCH_TRUNCATE == pCliticsTerm->ulOp)
  1980. {
  1981. WCHAR pwcs[TOKENIZER_MAXBUFFERLIMIT];
  1982. int iCount;
  1983. iCount = _snwprintf(
  1984. pwcs,
  1985. TOKENIZER_MAXBUFFERLIMIT,
  1986. L"%s%s",
  1987. pwcsAbb,
  1988. pCliticsTerm->pwcs);
  1989. Assert(iCount < TOKENIZER_MAXBUFFERLIMIT);
  1990. pwcs[TOKENIZER_MAXBUFFERLIMIT - 1] = L'\0';
  1991. hr = m_apWordSink->PutAltWord(
  1992. ulLen + pCliticsTerm->ulLen,
  1993. pwcs,
  1994. State.m_ulEnd - State.m_ulStart,
  1995. ulOffsetInTxtSourceBuffer);
  1996. if (FAILED(hr))
  1997. {
  1998. THROW_HRESULT_EXCEPTION(hr);
  1999. }
  2000. }
  2001. hr = m_apWordSink->PutWord(
  2002. ulLen,
  2003. pwcsAbb,
  2004. State.m_ulEnd - State.m_ulStart,
  2005. ulOffsetInTxtSourceBuffer);
  2006. if (FAILED(hr))
  2007. {
  2008. THROW_HRESULT_EXCEPTION(hr);
  2009. }
  2010. }
  2011. void CTokenizer::OutputHyphenation(CTokenState& State, const CCliticsTerm* pCliticsTerm)
  2012. {
  2013. //
2014. // Input: Data-Base Output: Data Base, DataBase (only at query time)
  2015. //
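// Sketch of the two paths below: at query time two alternative phrases are
// generated, the hyphen-separated parts ("Data", "Base") and the
// concatenated form ("DataBase"), wrapped in StartAltPhrase/EndAltPhrase;
// at index time only the individual parts are emitted as words.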
  2016. HRESULT hr;
  2017. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  2018. ULONG ulAddToStart = 0;
  2019. ULONG ulDecFromEnd = 0;
  2020. if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
  2021. {
  2022. ulAddToStart = pCliticsTerm->ulLen;
  2023. }
  2024. else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  2025. {
  2026. ulDecFromEnd = pCliticsTerm->ulLen;
  2027. }
  2028. ULONG ulCur = State.m_ulStart + ulAddToStart;
  2029. ULONG ulStart = ulCur;
  2030. ULONG ulRelPosInTxtSrcBuff = ulOffsetInTxtSourceBuffer;
  2031. if (m_bQueryTime)
  2032. {
  2033. ULONG ulNext = ulCur;
  2034. hr = m_apWordSink->StartAltPhrase();
  2035. if (FAILED(hr))
  2036. {
  2037. THROW_HRESULT_EXCEPTION(hr);
  2038. }
  2039. ULONG ulAdd = ulAddToStart;
  2040. while (ulCur < State.m_ulEnd)
  2041. {
  2042. if ( HAS_PROP_DASH(GET_PROP(m_pCurToken->m_State.m_pwcsToken[ulCur])))
  2043. {
  2044. hr = m_apWordSink->PutWord(
  2045. ulNext - ulStart,
  2046. &State.m_pwcsToken[ulStart],
  2047. ulNext - ulStart + ulAdd,
  2048. ulRelPosInTxtSrcBuff);
  2049. if (FAILED(hr))
  2050. {
  2051. THROW_HRESULT_EXCEPTION(hr);
  2052. }
  2053. ulRelPosInTxtSrcBuff += ulNext - ulStart + 1 + ulAdd;
  2054. ulStart = ulNext;
  2055. ulCur++;
  2056. ulAdd = 0;
  2057. continue;
  2058. }
  2059. State.m_pwcsToken[ulNext] = State.m_pwcsToken[ulCur];
  2060. ulNext++;
  2061. ulCur++;
  2062. }
  2063. Assert(ulCur > ulStart);
  2064. if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  2065. {
  2066. hr = m_apWordSink->PutAltWord(
  2067. ulNext - ulStart,
  2068. &State.m_pwcsToken[ulStart],
  2069. ulNext - ulStart,
  2070. ulRelPosInTxtSrcBuff);
  2071. if (FAILED(hr))
  2072. {
  2073. THROW_HRESULT_EXCEPTION(hr);
  2074. }
  2075. }
  2076. hr = m_apWordSink->PutWord(
  2077. ulNext - ulStart - ulDecFromEnd,
  2078. &State.m_pwcsToken[ulStart],
  2079. ulNext - ulStart,
  2080. ulRelPosInTxtSrcBuff);
  2081. if (FAILED(hr))
  2082. {
  2083. THROW_HRESULT_EXCEPTION(hr);
  2084. }
  2085. hr = m_apWordSink->StartAltPhrase();
  2086. if (FAILED(hr))
  2087. {
  2088. THROW_HRESULT_EXCEPTION(hr);
  2089. }
  2090. if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  2091. {
  2092. hr = m_apWordSink->PutAltWord(
  2093. ulNext - State.m_ulStart,
  2094. &State.m_pwcsToken[State.m_ulStart],
  2095. State.m_ulEnd - State.m_ulStart - ulAddToStart,
  2096. ulOffsetInTxtSourceBuffer);
  2097. if (FAILED(hr))
  2098. {
  2099. THROW_HRESULT_EXCEPTION(hr);
  2100. }
  2101. }
  2102. hr = m_apWordSink->PutWord(
  2103. ulNext - State.m_ulStart - ulDecFromEnd - ulAddToStart,
  2104. State.m_pwcsToken + State.m_ulStart + ulAddToStart,
  2105. State.m_ulEnd - State.m_ulStart + ulAddToStart,
  2106. ulOffsetInTxtSourceBuffer);
  2107. if (FAILED(hr))
  2108. {
  2109. THROW_HRESULT_EXCEPTION(hr);
  2110. }
  2111. hr = m_apWordSink->EndAltPhrase();
  2112. if (FAILED(hr))
  2113. {
  2114. THROW_HRESULT_EXCEPTION(hr);
  2115. }
  2116. }
  2117. else
  2118. {
  2119. ULONG ulAdd = ulAddToStart;
  2120. while (ulCur < State.m_ulEnd)
  2121. {
  2122. if (HAS_PROP_DASH(GET_PROP(m_pCurToken->m_State.m_pwcsToken[ulCur])))
  2123. {
  2124. hr = m_apWordSink->PutWord(
  2125. ulCur - ulStart,
  2126. &State.m_pwcsToken[ulStart],
  2127. ulCur - ulStart + ulAdd,
  2128. ulRelPosInTxtSrcBuff);
  2129. if (FAILED(hr))
  2130. {
  2131. THROW_HRESULT_EXCEPTION(hr);
  2132. }
  2133. ulRelPosInTxtSrcBuff += ulCur - ulStart + 1 + ulAdd;
  2134. ulStart = ulCur + 1;
  2135. ulAdd = 0;
  2136. }
  2137. ulCur++;
  2138. }
  2139. Assert(ulCur > ulStart);
  2140. if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  2141. {
  2142. hr = m_apWordSink->PutAltWord(
  2143. ulCur - ulStart,
  2144. &State.m_pwcsToken[ulStart],
  2145. ulCur - ulStart,
  2146. ulRelPosInTxtSrcBuff);
  2147. if (FAILED(hr))
  2148. {
  2149. THROW_HRESULT_EXCEPTION(hr);
  2150. }
  2151. }
  2152. hr = m_apWordSink->PutWord(
  2153. ulCur - ulStart - ulDecFromEnd,
  2154. &State.m_pwcsToken[ulStart],
  2155. ulCur - ulStart,
  2156. ulRelPosInTxtSrcBuff);
  2157. if (FAILED(hr))
  2158. {
  2159. THROW_HRESULT_EXCEPTION(hr);
  2160. }
  2161. }
  2162. }
  2163. void CTokenizer::OutputTime(WCHAR* pwcsTime, CTokenState& State)
  2164. {
  2165. HRESULT hr;
  2166. //
  2167. // Output: TT1353
  2168. //
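// pwcsTime is built by the caller (not in this function); the hard-coded
// length of 6 below implies the normalized form is "TT" followed by a
// four-digit 24-hour HHMM value, e.g. 1:53 PM -> TT1353.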
  2169. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  2170. hr = m_apWordSink->PutAltWord(
  2171. State.m_ulEnd - State.m_ulStart,
  2172. &State.m_pwcsToken[State.m_ulStart],
  2173. State.m_ulEnd - State.m_ulStart,
  2174. ulOffsetInTxtSourceBuffer);
  2175. if (FAILED(hr))
  2176. {
  2177. THROW_HRESULT_EXCEPTION(hr);
  2178. }
  2179. hr = m_apWordSink->PutWord(
  2180. 6,
  2181. pwcsTime,
  2182. State.m_ulEnd - State.m_ulStart,
  2183. ulOffsetInTxtSourceBuffer);
  2184. if (FAILED(hr))
  2185. {
  2186. THROW_HRESULT_EXCEPTION(hr);
  2187. }
  2188. }
  2189. void CTokenizer::OutputDate(
  2190. WCHAR* pwcsDate1,
  2191. WCHAR* pwcsDate2,
  2192. CTokenState& State)
  2193. {
  2194. HRESULT hr;
  2195. //
  2196. // Output: DD19990921
  2197. //
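// pwcsDate1/pwcsDate2 are built by the caller; the hard-coded length of 10
// below implies the normalized form is "DD" followed by YYYYMMDD
// (e.g. DD19990921 for Sep 21 1999).  pwcsDate2, when present, is presumably
// the second reading of an ambiguous day/month order and is emitted as an
// additional alternative.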
  2198. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  2199. hr = m_apWordSink->PutAltWord(
  2200. State.m_ulEnd - State.m_ulStart,
  2201. &State.m_pwcsToken[State.m_ulStart],
  2202. State.m_ulEnd - State.m_ulStart,
  2203. ulOffsetInTxtSourceBuffer);
  2204. if (FAILED(hr))
  2205. {
  2206. THROW_HRESULT_EXCEPTION(hr);
  2207. }
  2208. if (pwcsDate2)
  2209. {
  2210. hr = m_apWordSink->PutAltWord(
  2211. 10,
  2212. pwcsDate2,
  2213. State.m_ulEnd - State.m_ulStart,
  2214. ulOffsetInTxtSourceBuffer);
  2215. if (FAILED(hr))
  2216. {
  2217. THROW_HRESULT_EXCEPTION(hr);
  2218. }
  2219. }
  2220. hr = m_apWordSink->PutWord(
  2221. 10,
  2222. pwcsDate1,
  2223. State.m_ulEnd - State.m_ulStart,
  2224. ulOffsetInTxtSourceBuffer);
  2225. if (FAILED(hr))
  2226. {
  2227. THROW_HRESULT_EXCEPTION(hr);
  2228. }
  2229. }
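//
// OutputSimpleToken: emits a plain token.  When a clitic was matched at the
// head or tail (and does not cover the whole token), the full surface form
// goes out as an alternative and the clitic-stripped form as the word;
// e.g., assuming "'s" is a registered tail clitic, "John's" would yield the
// alternative "John's" and the word "John".  Otherwise the token is emitted
// as-is.
//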
  2230. void CTokenizer::OutputSimpleToken(CTokenState& State, const CCliticsTerm* pTerm)
  2231. {
  2232. HRESULT hr;
  2233. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  2234. if (((TAIL_MATCH_TRUNCATE == pTerm->ulOp) ||
  2235. (HEAD_MATCH_TRUNCATE == pTerm->ulOp)) &&
  2236. (State.m_ulStart + pTerm->ulLen < State.m_ulEnd))
  2237. {
  2238. hr = m_apWordSink->PutAltWord(
  2239. State.m_ulEnd - State.m_ulStart,
  2240. &State.m_pwcsToken[State.m_ulStart],
  2241. State.m_ulEnd - State.m_ulStart,
  2242. ulOffsetInTxtSourceBuffer);
  2243. if (FAILED(hr))
  2244. {
  2245. THROW_HRESULT_EXCEPTION(hr);
  2246. }
  2247. if (pTerm->ulOp == TAIL_MATCH_TRUNCATE)
  2248. {
  2249. hr = m_apWordSink->PutWord(
  2250. State.m_ulEnd - State.m_ulStart - pTerm->ulLen,
  2251. &State.m_pwcsToken[State.m_ulStart],
  2252. State.m_ulEnd - State.m_ulStart,
  2253. ulOffsetInTxtSourceBuffer);
  2254. if (FAILED(hr))
  2255. {
  2256. THROW_HRESULT_EXCEPTION(hr);
  2257. }
  2258. }
  2259. else
  2260. {
  2261. Assert(pTerm->ulOp == HEAD_MATCH_TRUNCATE);
  2262. hr = m_apWordSink->PutWord(
  2263. State.m_ulEnd - State.m_ulStart - pTerm->ulLen,
  2264. &State.m_pwcsToken[State.m_ulStart + pTerm->ulLen],
  2265. State.m_ulEnd - State.m_ulStart,
  2266. ulOffsetInTxtSourceBuffer);
  2267. if (FAILED(hr))
  2268. {
  2269. THROW_HRESULT_EXCEPTION(hr);
  2270. }
  2271. }
  2272. return;
  2273. }
  2274. hr = m_apWordSink->PutWord(
  2275. State.m_ulEnd - State.m_ulStart,
  2276. &State.m_pwcsToken[State.m_ulStart],
  2277. State.m_ulEnd - State.m_ulStart,
  2278. m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State));
  2279. if (FAILED(hr))
  2280. {
  2281. THROW_HRESULT_EXCEPTION(hr);
  2282. }
  2283. }
  2284. void CTokenizer::OutputCurrency(
  2285. ULONG ulLen,
  2286. WCHAR* pwcsCurrency,
  2287. CTokenState& State,
  2288. const CCliticsTerm* pTerm)
  2289. {
  2290. HRESULT hr;
  2291. //
  2292. // Output: CC12.22$
  2293. //
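// pwcsCurrency is built by the caller; per the example above the normalized
// form carries a "CC" prefix.  The raw surface form is emitted as an
// alternative (plus a clitic-stripped alternative when applicable) and the
// normalized form as the word; if the normalized form exceeds
// m_ulMaxTokenSize only the raw form is emitted.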
  2294. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  2295. if (ulLen > m_ulMaxTokenSize)
  2296. {
  2297. hr = m_apWordSink->PutWord(
  2298. State.m_ulEnd - State.m_ulStart,
  2299. &State.m_pwcsToken[State.m_ulStart],
  2300. State.m_ulEnd - State.m_ulStart,
  2301. ulOffsetInTxtSourceBuffer);
  2302. if (FAILED(hr))
  2303. {
  2304. THROW_HRESULT_EXCEPTION(hr);
  2305. }
  2306. return;
  2307. }
  2308. hr = m_apWordSink->PutAltWord(
  2309. State.m_ulEnd - State.m_ulStart,
  2310. &State.m_pwcsToken[State.m_ulStart],
  2311. State.m_ulEnd - State.m_ulStart,
  2312. ulOffsetInTxtSourceBuffer);
  2313. if (FAILED(hr))
  2314. {
  2315. THROW_HRESULT_EXCEPTION(hr);
  2316. }
  2317. if (pTerm->ulOp == TAIL_MATCH_TRUNCATE)
  2318. {
  2319. hr = m_apWordSink->PutAltWord(
  2320. State.m_ulEnd - State.m_ulStart - pTerm->ulLen,
  2321. &State.m_pwcsToken[State.m_ulStart],
  2322. State.m_ulEnd - State.m_ulStart,
  2323. ulOffsetInTxtSourceBuffer);
  2324. if (FAILED(hr))
  2325. {
  2326. THROW_HRESULT_EXCEPTION(hr);
  2327. }
  2328. }
  2329. else if (pTerm->ulOp == HEAD_MATCH_TRUNCATE)
  2330. {
  2331. hr = m_apWordSink->PutAltWord(
  2332. State.m_ulEnd - State.m_ulStart - pTerm->ulLen,
  2333. &State.m_pwcsToken[State.m_ulStart + pTerm->ulLen],
  2334. State.m_ulEnd - State.m_ulStart,
  2335. ulOffsetInTxtSourceBuffer);
  2336. if (FAILED(hr))
  2337. {
  2338. THROW_HRESULT_EXCEPTION(hr);
  2339. }
  2340. }
  2341. hr = m_apWordSink->PutWord(
  2342. ulLen,
  2343. pwcsCurrency,
  2344. State.m_ulEnd - State.m_ulStart,
  2345. ulOffsetInTxtSourceBuffer);
  2346. if (FAILED(hr))
  2347. {
  2348. THROW_HRESULT_EXCEPTION(hr);
  2349. }
  2350. }
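//
// OutputCommersialSignToken: emits the token without its trailing character
// (presumably the commercial sign that triggered this path) as an
// alternative, and the full token as the word.
//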
  2351. void CTokenizer::OutputCommersialSignToken(
  2352. CTokenState& State)
  2353. {
  2354. HRESULT hr;
  2355. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  2356. hr = m_apWordSink->PutAltWord(
  2357. State.m_ulEnd - State.m_ulStart - 1,
  2358. State.m_pwcsToken + State.m_ulStart,
  2359. State.m_ulEnd - State.m_ulStart,
  2360. ulOffsetInTxtSourceBuffer);
  2361. if (FAILED(hr))
  2362. {
  2363. THROW_HRESULT_EXCEPTION(hr);
  2364. }
  2365. hr = m_apWordSink->PutWord(
  2366. State.m_ulEnd - State.m_ulStart,
  2367. State.m_pwcsToken + State.m_ulStart,
  2368. State.m_ulEnd - State.m_ulStart,
  2369. ulOffsetInTxtSourceBuffer);
  2370. if (FAILED(hr))
  2371. {
  2372. THROW_HRESULT_EXCEPTION(hr);
  2373. }
  2374. }
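//
// OutputMisc: emits a clitic-stripped form (when a head or tail clitic was
// matched) as an alternative, then, unless the pattern is all upper case,
// an alternative with the clitic and the trailing ulSuffixSize characters
// removed, and finally the full surface form as the word.
//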
  2375. void CTokenizer::OutputMisc(
  2376. CTokenState& State,
  2377. bool bPatternContainOnlyUpperCase,
  2378. ULONG ulSuffixSize,
  2379. const CCliticsTerm* pCliticsTerm)
  2380. {
  2381. HRESULT hr;
  2382. ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
  2383. ULONG ulAddToStart = 0;
  2384. ULONG ulDecFromEnd = 0;
  2385. if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
  2386. {
  2387. hr = m_apWordSink->PutAltWord(
  2388. State.m_ulEnd - State.m_ulStart - pCliticsTerm->ulLen,
  2389. State.m_pwcsToken + State.m_ulStart + pCliticsTerm->ulLen,
  2390. State.m_ulEnd - State.m_ulStart,
  2391. ulOffsetInTxtSourceBuffer);
  2392. if (FAILED(hr))
  2393. {
  2394. THROW_HRESULT_EXCEPTION(hr);
  2395. }
  2396. ulAddToStart = pCliticsTerm->ulLen;
  2397. }
  2398. else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
  2399. {
  2400. hr = m_apWordSink->PutAltWord(
  2401. State.m_ulEnd - State.m_ulStart - pCliticsTerm->ulLen,
  2402. State.m_pwcsToken + State.m_ulStart,
  2403. State.m_ulEnd - State.m_ulStart,
  2404. ulOffsetInTxtSourceBuffer);
  2405. if (FAILED(hr))
  2406. {
  2407. THROW_HRESULT_EXCEPTION(hr);
  2408. }
  2409. ulDecFromEnd = pCliticsTerm->ulLen;
  2410. }
  2411. if (!bPatternContainOnlyUpperCase)
  2412. {
  2413. hr = m_apWordSink->PutAltWord(
  2414. State.m_ulEnd - State.m_ulStart - ulAddToStart - ulDecFromEnd - ulSuffixSize,
  2415. State.m_pwcsToken + State.m_ulStart + ulAddToStart,
  2416. State.m_ulEnd - State.m_ulStart,
  2417. ulOffsetInTxtSourceBuffer);
  2418. if (FAILED(hr))
  2419. {
  2420. THROW_HRESULT_EXCEPTION(hr);
  2421. }
  2422. }
  2423. hr = m_apWordSink->PutWord(
  2424. State.m_ulEnd - State.m_ulStart,
  2425. &State.m_pwcsToken[State.m_ulStart],
  2426. State.m_ulEnd - State.m_ulStart,
  2427. ulOffsetInTxtSourceBuffer);
  2428. if (FAILED(hr))
  2429. {
  2430. THROW_HRESULT_EXCEPTION(hr);
  2431. }
  2432. }
  2433. #define NUMBER_NO_ERROR 0
  2434. #define NUMBER_SEPERATOR_ERROR 1
  2435. #define NUMBER_ERROR 2
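//
// Two-pass wrapper: first tries the locale-specific decimal and thousand
// separators; if that fails only because of the separators, it retries with
// '.' as the decimal separator and no thousand separator at all.
//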
  2436. bool CTokenizer::CheckAndCreateNumber(
  2437. WCHAR* pwcsStr,
  2438. ULONG ulLen,
  2439. WCHAR* pwcsOut,
2440. ULONG* pulOffsetToTxt, // the actual output does not always start at the beginning of the buffer
  2441. ULONG* pulOutLen)
  2442. {
  2443. int iRet;
  2444. iRet = CheckAndCreateNumber(
  2445. pwcsStr,
  2446. ulLen,
  2447. m_apLangSupport->GetDecimalSeperator(),
  2448. m_apLangSupport->GetThousandSeperator(),
  2449. pwcsOut,
  2450. pulOffsetToTxt,
  2451. pulOutLen);
  2452. if (NUMBER_NO_ERROR == iRet)
  2453. {
  2454. return true;
  2455. }
  2456. else if (NUMBER_ERROR == iRet)
  2457. {
  2458. return false;
  2459. }
  2460. iRet = CheckAndCreateNumber(
  2461. pwcsStr,
  2462. ulLen,
  2463. L'.', // default value
2464. 0xFFFF, // no thousand separator
  2465. pwcsOut,
  2466. pulOffsetToTxt,
  2467. pulOutLen);
  2468. if (NUMBER_NO_ERROR == iRet)
  2469. {
  2470. return true;
  2471. }
  2472. return false;
  2473. }
  2474. //
  2475. // return value:
  2476. // NUMBER_NO_ERROR - success
2477. // NUMBER_SEPERATOR_ERROR - error due to separators
  2478. // NUMBER_ERROR - error since it's not a number.
  2479. //
  2480. int CTokenizer::CheckAndCreateNumber(
  2481. WCHAR* pwcsStr,
  2482. ULONG ulLen,
  2483. WCHAR wchSDecimal,
  2484. WCHAR wchSThousand,
  2485. WCHAR* pwcsOut,
2486. ULONG* pulOffsetToTxt, // the actual output does not always start at the beginning of the buffer
  2487. ULONG* pulOutLen)
  2488. {
  2489. Assert(ulLen > 0);
  2490. //
  2491. // assumes that the out buffer is big enough.
  2492. // looking for the following formats: 1111 1111.2222 1,111,111.222
  2493. //
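// Worked example (illustrative), with '.' as decimal and ',' as thousand
// separator: "1,234.50" is scanned right to left into pwcsOut as "NN1234D5",
// i.e. the thousand separator is dropped, the decimal separator becomes 'D',
// trailing zeros of the fraction are trimmed, and "NN" is prepended;
// *pulOffsetToTxt then points at the first 'N' and *pulOutLen is 8.
// A bare fraction such as ".50" gains a leading zero: "NN0D5".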
  2494. ULONG ulCur = ulLen - 1;
  2495. ULONG ulNumCharsBeforDigitSeperator = 0;
2496. ULONG ulNextChar = ulLen - 1 + 3; // +3 is for the NN at the beginning of the formatted token plus
2497. // an additional leading 0 at the beginning in the case of .50
  2498. bool fHasFraction = false;
  2499. while ((((int)(ulCur)) >= 0) &&
  2500. HAS_PROP_NUMBER(GET_PROP(pwcsStr[ulCur])))
  2501. {
  2502. pwcsOut[ulNextChar] = pwcsStr[ulCur];
  2503. ulCur--;
  2504. ulNextChar--;
  2505. ulNumCharsBeforDigitSeperator++;
  2506. }
  2507. if (ulCur == ulLen - 1)
  2508. {
  2509. //
  2510. // did not read any digits.
  2511. //
  2512. return NUMBER_ERROR;
  2513. }
  2514. if ((((int)ulCur) >= 0) && (pwcsStr[ulCur] == wchSDecimal))
  2515. {
  2516. fHasFraction = true;
  2517. pwcsOut[ulNextChar] = L'D';
  2518. ulCur--;
  2519. ulNextChar--;
  2520. ulNumCharsBeforDigitSeperator = 0;
  2521. }
  2522. ULONG ulNumOfThousandSeperator = 0;
  2523. while (((int)ulCur) >= 0)
  2524. {
  2525. if (pwcsStr[ulCur] == wchSThousand)
  2526. {
  2527. if (3 != ulNumCharsBeforDigitSeperator)
  2528. {
  2529. return NUMBER_SEPERATOR_ERROR;
  2530. }
  2531. ulNumCharsBeforDigitSeperator = 0;
  2532. ulNumOfThousandSeperator++;
  2533. }
  2534. else if(HAS_PROP_NUMBER(GET_PROP(pwcsStr[ulCur])))
  2535. {
  2536. pwcsOut[ulNextChar] = pwcsStr[ulCur];
  2537. ulNumCharsBeforDigitSeperator++;
  2538. ulNextChar--;
  2539. }
  2540. else
  2541. {
  2542. if (TEST_PROP(
  2543. GET_PROP(pwcsStr[ulCur]), PROP_DEFAULT_BREAKER))
  2544. {
  2545. return NUMBER_SEPERATOR_ERROR;
  2546. }
  2547. return NUMBER_ERROR;
  2548. }
  2549. ulCur--;
  2550. }
  2551. *pulOutLen = ulLen;
  2552. if (L'D' == pwcsOut[ulNextChar+1])
  2553. {
  2554. Assert(ulNextChar >= 2);
  2555. //
  2556. // the number has the following format .50
  2557. //
  2558. pwcsOut[ulNextChar] = L'0';
  2559. ulNextChar--;
  2560. *pulOutLen += 1;
  2561. }
  2562. Assert(ulNextChar >= 1);
  2563. pwcsOut[ulLen + 3] = L'\0';
  2564. pwcsOut[ulNextChar] = L'N';
  2565. pwcsOut[ulNextChar - 1] = L'N';
2566. *pulOutLen = *pulOutLen + 2 - ulNumOfThousandSeperator; // don't use += because
2567. // (2 - ulNumOfThousandSeperator) can be negative and, since it is ULONG,
2568. // we would get the wrong result.
2569. *pulOffsetToTxt = ulNextChar - 1;
  2570. if (fHasFraction)
  2571. {
  2572. while (HAS_PROP_NUMBER(GET_PROP(pwcsOut[*pulOutLen + *pulOffsetToTxt - 1])) &&
  2573. (0 == ConvertCharToDigit(pwcsOut[*pulOutLen + *pulOffsetToTxt - 1])))
  2574. {
  2575. Assert(*pulOutLen > 3);
  2576. (*pulOutLen)--;
  2577. }
  2578. if (L'D' == pwcsOut[*pulOutLen + *pulOffsetToTxt - 1])
  2579. {
  2580. (*pulOutLen)--;
  2581. }
  2582. }
  2583. return NUMBER_NO_ERROR;
  2584. }
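//
// GetValuesFromDateString: converts the fixed-position digit fields described
// by the CDateTerm (offset and length of each component inside pwcsDate) into
// numeric values, accumulating each field right to left with a growing power
// of ten.  Whether D_M1/D_M2 is the day or the month is resolved later by the
// caller.
//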
  2585. void CTokenizer::GetValuesFromDateString(
  2586. CDateTerm* pFormat,
  2587. WCHAR* pwcsDate,
2588. LONG* plD_M1, // at this stage we can't tell whether this is a day or a month.
  2589. LONG* plD_M2,
  2590. LONG* plYear)
  2591. {
  2592. BYTE i;
  2593. int iBase;
  2594. *plD_M1 = 0;
  2595. for ( i = pFormat->bD_M1Len, iBase = 1; i > 0; i--, iBase *= 10)
  2596. {
  2597. *plD_M1 += ConvertCharToDigit(pwcsDate[pFormat->bD_M1Offset + i - 1]) * iBase;
  2598. }
  2599. *plD_M2 = 0;
  2600. for ( i = pFormat->bD_M2Len, iBase = 1; i > 0; i--, iBase *= 10)
  2601. {
  2602. *plD_M2 += ConvertCharToDigit(pwcsDate[pFormat->bD_M2Offset + i - 1]) * iBase;
  2603. }
  2604. *plYear = 0;
  2605. for ( i = pFormat->bYearLen, iBase = 1; i > 0; i--, iBase *= 10)
  2606. {
  2607. *plYear += ConvertCharToDigit(pwcsDate[pFormat->bYearOffset + i - 1]) * iBase;
  2608. }
  2609. }
  2610. void CTokenizer::GetValuesFromTimeString(
  2611. CTimeTerm* pFormat,
  2612. WCHAR* pwcsTime,
  2613. LONG* plHour,
  2614. LONG* plMin,
  2615. LONG* plSec,
  2616. TimeFormat* pAmPm)
  2617. {
  2618. BYTE i;
  2619. int iBase;
  2620. *plHour = 0;
  2621. for ( i = pFormat->bHourLen, iBase = 1; i > 0; i--, iBase *= 10)
  2622. {
  2623. *plHour += ConvertCharToDigit(pwcsTime[pFormat->bHourOffset + i - 1]) * iBase;
  2624. }
  2625. *plMin = 0;
  2626. for ( i = pFormat->bMinLen, iBase = 1; i > 0; i--, iBase *= 10)
  2627. {
  2628. *plMin += ConvertCharToDigit(pwcsTime[pFormat->bMinOffset + i - 1]) * iBase;
  2629. }
  2630. *plSec = 0;
  2631. for ( i = pFormat->bSecLen, iBase = 1; i > 0; i--, iBase *= 10)
  2632. {
  2633. *plSec += ConvertCharToDigit(pwcsTime[pFormat->bSecOffset + i - 1]) * iBase;
  2634. }
  2635. *pAmPm = pFormat->AmPm;
  2636. }
  2637. void CTokenizer::BreakCompundString(CTokenState& State, CPropFlag& propBreaker)
  2638. {
  2639. //
2640. // there are still punctuation marks inside the token;
2641. // we break it into sub-tokens and resubmit them.
  2642. //
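// Sketch: every run of characters between breaker-property characters is
// re-framed as a sub-token in m_pCurToken->m_State and fed back through
// ProcessTokenInternal; e.g. if propBreaker covered '/', "TCP/IP" would be
// resubmitted as the two sub-tokens "TCP" and "IP".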
  2643. ULONG ulStart = State.m_ulStart;
  2644. ULONG ulCur = ulStart;
  2645. while (ulCur < State.m_ulEnd)
  2646. {
  2647. if ( TEST_PROP1(GET_PROP(State.m_pwcsToken[ulCur]), propBreaker))
  2648. {
  2649. if (ulCur - ulStart == 0)
  2650. {
  2651. //
  2652. // only punctuation
  2653. //
  2654. ulCur++;
  2655. ulStart = ulCur;
  2656. continue;
  2657. }
  2658. m_pCurToken->m_State.m_ulStart = 0;
  2659. m_pCurToken->m_State.m_ulEnd = ulCur - ulStart;
  2660. m_pCurToken->m_State.m_pwcsToken = State.m_pwcsToken + ulStart;
  2661. m_pCurToken->ComputeStateProperties(m_pCurToken->m_State);
  2662. //
2663. // we just created a sub-token; we need to process it
  2664. //
  2665. ProcessTokenInternal();
  2666. ulStart = ulCur + 1;
  2667. }
  2668. ulCur++;
  2669. }
  2670. if (ulStart < ulCur)
  2671. {
  2672. //
  2673. // last sub token
  2674. //
  2675. m_pCurToken->m_State.m_ulStart = 0;
  2676. m_pCurToken->m_State.m_ulEnd = ulCur - ulStart;
  2677. m_pCurToken->m_State.m_pwcsToken = State.m_pwcsToken + ulStart;
  2678. m_pCurToken->ComputeStateProperties(m_pCurToken->m_State);
  2679. //
2680. // we just created a sub-token; we need to process it
  2681. //
  2682. ProcessTokenInternal();
  2683. }
  2684. return;
  2685. }