Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

893 lines
23 KiB

  1. ////////////////////////////////////////////////////////////////////////////////
  2. //
  3. // Filename : Tokenizer.h
  4. // Purpose : Tokenizer declarations
  5. //
  6. // Project : WordBreakers
  7. // Component: English word breaker
  8. //
  9. // Author : yairh
  10. //
  11. // Log:
  12. //
  13. // Jan 06 2000 yairh creation
  14. // Apr 05 2000 dovh - Fixed two problematic debug / tracer buffer size
  15. // problems. (Fix Bug 15449).
  16. // May 07 2000 dovh - USE_WS_SENTINEL algorithm in BreakText
  17. // Nov 11 2000 dovh - Special underscore treatment
  18. // Added inline support routines (FindLeftmostUnderscore etc.)
  19. //
  20. ////////////////////////////////////////////////////////////////////////////////
  21. #ifndef _TOKENIZER_H_
  22. #define _TOKENIZER_H_
  23. #include "tracer.h"
  24. #include "PropArray.h"
  25. #include "Query.h"
  26. #include "stdafx.h"
  27. #include "cierror.h"
  28. #include "LangSupport.h"
  29. #include "Formats.h"
  30. #define TOKENIZER_MAXBUFFERLIMIT 1024 // max size of a token is 1024 chars
  31. DECLARE_TAG(s_tagTokenizer, "Tokenizer");
  32. DECLARE_TAG(s_tagTokenizerOutput, "Tokenizer Output");
  33. DECLARE_TAG(s_tagTokenizerTrace, "Tokenizer Trace");
  34. DECLARE_TAG(s_tagTokenizerDecision, "Tokenizer Decision");
  35. DECLARE_TAG(s_tagTokenizerSuspect, "Tokenizer Suspect");
  36. #if defined(DEBUG)
  37. ///////////////////////////////////////////////////////////////////////////////
  38. // Class CTraceWordSink
  39. ///////////////////////////////////////////////////////////////////////////////
  40. class CTraceWordSink : public IWordSink
  41. {
  42. public:
  43. CTraceWordSink(IWordSink* p) : m_apWordSink(p)
  44. {
  45. }
  46. ULONG __stdcall AddRef()
  47. {
  48. return 1;
  49. }
  50. ULONG __stdcall Release()
  51. {
  52. return 0;
  53. }
  54. STDMETHOD(QueryInterface)(
  55. IN REFIID riid,
  56. IN void **ppvObject)
  57. {
  58. Assert(false);
  59. return E_FAIL;
  60. }
  61. STDMETHOD(PutWord)(
  62. ULONG cwc,
  63. WCHAR const* pwcInBuf,
  64. ULONG cwcSrcLen,
  65. ULONG cwcSrcPos)
  66. {
  67. Assert(cwc < TOKENIZER_MAXBUFFERLIMIT + 10);
  68. #if (defined (DEBUG) && !defined(_NO_TRACER)) || defined(USE_TRACER)
  69. if (CheckTraceRestrictions(elVerbose, s_tagTokenizerOutput))
  70. {
  71. Trace(
  72. elVerbose,
  73. s_tagTokenizerOutput,
  74. ("PutWord: %*.*S, %d, %d, %d",
  75. cwc,
  76. cwc,
  77. pwcInBuf,
  78. cwc,
  79. cwcSrcLen,
  80. cwcSrcPos));
  81. }
  82. #endif
  83. return m_apWordSink->PutWord(cwc, pwcInBuf, cwcSrcLen, cwcSrcPos);
  84. }
  85. STDMETHOD(PutAltWord)(
  86. ULONG cwc,
  87. WCHAR const* pwcInBuf,
  88. ULONG cwcSrcLen,
  89. ULONG cwcSrcPos)
  90. {
  91. Assert(cwc < TOKENIZER_MAXBUFFERLIMIT + 10);
  92. #if (defined (DEBUG) && !defined(_NO_TRACER)) || defined(USE_TRACER)
  93. if (CheckTraceRestrictions(elVerbose, s_tagTokenizerOutput))
  94. {
  95. Trace(
  96. elVerbose,
  97. s_tagTokenizerOutput,
  98. ("PutAltWord: %*.*S, %d, %d, %d",
  99. cwc,
  100. cwc,
  101. pwcInBuf,
  102. cwc,
  103. cwcSrcLen,
  104. cwcSrcPos));
  105. }
  106. #endif
  107. return m_apWordSink->PutAltWord(cwc, pwcInBuf, cwcSrcLen, cwcSrcPos);
  108. }
  109. STDMETHOD(StartAltPhrase)()
  110. {
  111. Trace(
  112. elVerbose,
  113. s_tagTokenizerOutput,
  114. ("StartAltPhrase"));
  115. return m_apWordSink->StartAltPhrase();
  116. }
  117. STDMETHOD(EndAltPhrase)()
  118. {
  119. Trace(
  120. elVerbose,
  121. s_tagTokenizerOutput,
  122. ("EndAltPhrase"));
  123. return m_apWordSink->EndAltPhrase();
  124. }
  125. STDMETHOD(PutBreak)(WORDREP_BREAK_TYPE breakType)
  126. {
  127. WCHAR* p;
  128. #if (defined (DEBUG) && !defined(_NO_TRACER)) || defined(USE_TRACER)
  129. if (CheckTraceRestrictions(elVerbose, s_tagTokenizerOutput))
  130. {
  131. switch (breakType)
  132. {
  133. case WORDREP_BREAK_EOW:
  134. p = L"WORDREP_BREAK_EOW";
  135. break;
  136. case WORDREP_BREAK_EOS:
  137. p = L"WORDREP_BREAK_EOS";
  138. break;
  139. case WORDREP_BREAK_EOP:
  140. p = L"WORDREP_BREAK_EOP";
  141. break;
  142. case WORDREP_BREAK_EOC:
  143. p = L"WORDREP_BREAK_EOC";
  144. break;
  145. default:
  146. p = L"Unknown break type";
  147. }
  148. Trace(
  149. elVerbose,
  150. s_tagTokenizerOutput,
  151. ("PutBreak %S", p));
  152. }
  153. #endif
  154. return m_apWordSink->PutBreak(breakType);
  155. }
  156. CTraceWordSink* operator ->()
  157. {
  158. return this;
  159. }
  160. private:
  161. CComPtr<IWordSink> m_apWordSink;
  162. };
  163. #endif
  164. ///////////////////////////////////////////////////////////////////////////////
  165. // Class CTokenState
  166. ///////////////////////////////////////////////////////////////////////////////
  167. class CTokenState
  168. {
  169. public:
  170. //
  171. // methods
  172. //
  173. CTokenState();
  174. CTokenState(CTokenState& s);
  175. CTokenState& operator = (CTokenState& S);
  176. void Clear(ULONG ulEnd);
  177. public:
  178. //
  179. // members
  180. //
  181. ULONG m_ulStart;
  182. ULONG m_ulEnd;
  183. CPropFlag m_Properties;
  184. WCHAR* m_pwcsToken;
  185. };
  186. inline CTokenState::CTokenState() : m_ulStart(0), m_ulEnd(0)
  187. {
  188. }
  189. inline CTokenState::CTokenState(CTokenState& s) :
  190. m_ulStart(s.m_ulStart),
  191. m_ulEnd(s.m_ulEnd),
  192. m_pwcsToken(s.m_pwcsToken),
  193. m_Properties(s.m_Properties)
  194. {
  195. }
  196. inline CTokenState& CTokenState::operator = (CTokenState& S)
  197. {
  198. m_ulStart = S.m_ulStart;
  199. m_ulEnd = S.m_ulEnd;
  200. m_Properties = S.m_Properties;
  201. m_pwcsToken = S.m_pwcsToken;
  202. return *this;
  203. }
  204. inline void CTokenState::Clear(ULONG ulEnd)
  205. {
  206. m_ulStart = 0;
  207. m_ulEnd = ulEnd;
  208. m_Properties.Clear();
  209. m_pwcsToken = NULL;
  210. }
  211. ///////////////////////////////////////////////////////////////////////////////
  212. // Class CToken
  213. ///////////////////////////////////////////////////////////////////////////////
  214. class CToken
  215. {
  216. public:
  217. //
  218. // methods
  219. //
  220. CToken(ULONG ulMaxTokenSize);
  221. bool IsNotEmpty();
  222. void Clear();
  223. bool IsFull();
  224. void MarkEndToken(ULONG ulCurPosInTxtSourceBuffer);
  225. ULONG RemoveHeadPunct(CPropFlag& PunctProperties, CTokenState& State);
  226. ULONG RemoveTailPunct(CPropFlag& PunctProperties, CTokenState& State);
  227. void ComputeStateProperties(CTokenState& State);
  228. ULONG CalculateStateOffsetInTxtSourceBuffer(CTokenState& State);
  229. ULONG FindLeftmostUnderscore(CTokenState& State);
  230. ULONG FindRightmostUnderscore(CTokenState& State);
  231. public:
  232. //
  233. // members
  234. //
  235. ULONG m_ulBufPos;
  236. bool m_fHasEos;
  237. ULONG m_ulOffsetInTxtSourceBuffer;
  238. ULONG m_ulMaxTokenSize;
  239. CTokenState m_State;
  240. WCHAR m_awchBuf[TOKENIZER_MAXBUFFERLIMIT + 1];
  241. };
  242. inline CToken::CToken(ULONG ulMaxTokenSize) :
  243. m_ulBufPos(0),
  244. m_fHasEos(false),
  245. m_ulOffsetInTxtSourceBuffer(0),
  246. m_ulMaxTokenSize(ulMaxTokenSize)
  247. {
  248. m_awchBuf[0] = L'\0';
  249. }
  250. inline bool CToken::IsNotEmpty()
  251. {
  252. return (m_ulBufPos > 0);
  253. }
  254. inline void CToken::Clear()
  255. {
  256. m_ulBufPos = 0;
  257. m_awchBuf[0] = L'\0';
  258. m_State.Clear(0);
  259. m_fHasEos = false;
  260. m_ulOffsetInTxtSourceBuffer = 0;
  261. }
  262. inline bool CToken::IsFull()
  263. {
  264. return (m_ulBufPos == m_ulMaxTokenSize);
  265. }
  266. inline void CToken::MarkEndToken(ULONG ulCurPosInTxtSourceBuffer)
  267. {
  268. Assert(m_ulBufPos < m_ulMaxTokenSize + 1);
  269. m_awchBuf[m_ulBufPos] = L'\0';
  270. m_State.m_pwcsToken = m_awchBuf;
  271. m_State.m_ulStart = 0;
  272. m_State.m_ulEnd = m_ulBufPos;
  273. if (TEST_PROP(m_State.m_Properties, PROP_EOS) &&
  274. (m_ulBufPos < m_ulMaxTokenSize))
  275. {
  276. ULONG ulCur = m_State.m_ulEnd - 1;
  277. while (TEST_PROP(GET_PROP(m_awchBuf[ulCur]), EOS_SUFFIX))
  278. {
  279. ulCur--;
  280. }
  281. if (IS_EOS(m_awchBuf[ulCur]))
  282. {
  283. m_fHasEos = true;
  284. }
  285. }
  286. //
  287. // BUGBUG need to enalble the assert
  288. //
  289. // Assert(ulCurPosInTxtSourceBuffer > m_ulBufPos);
  290. m_ulOffsetInTxtSourceBuffer = ulCurPosInTxtSourceBuffer - m_ulBufPos;
  291. }
  292. inline ULONG CToken::CalculateStateOffsetInTxtSourceBuffer(CTokenState& State)
  293. {
  294. ULONG ulOffset =
  295. m_ulOffsetInTxtSourceBuffer +
  296. (State.m_pwcsToken - m_awchBuf) +
  297. State.m_ulStart;
  298. return ulOffset;
  299. }
  300. inline ULONG CToken::RemoveHeadPunct(CPropFlag& PunctProperties, CTokenState& State)
  301. {
  302. Assert(m_State.m_ulStart <= State.m_ulStart);
  303. Assert(State.m_ulStart <= State.m_ulEnd);
  304. Assert(State.m_ulEnd <= m_State.m_ulEnd);
  305. for (ULONG ul = State.m_ulStart; ul < State.m_ulEnd; ul++)
  306. {
  307. if (!TEST_PROP1(GET_PROP(State.m_pwcsToken[ul]), PunctProperties) )
  308. {
  309. break;
  310. }
  311. }
  312. State.m_ulStart = ul;
  313. //
  314. // return num of characters removed
  315. //
  316. return ul;
  317. }
  318. inline ULONG CToken::RemoveTailPunct(CPropFlag& PunctProperties, CTokenState& State)
  319. {
  320. Assert(m_State.m_ulStart <= State.m_ulStart);
  321. Assert(State.m_ulStart <= State.m_ulEnd);
  322. Assert(State.m_ulEnd <= m_State.m_ulEnd);
  323. for (ULONG ul = State.m_ulEnd; ul > State.m_ulStart; ul--)
  324. {
  325. if (!TEST_PROP1(GET_PROP(State.m_pwcsToken[ul - 1]), PunctProperties) )
  326. {
  327. break;
  328. }
  329. }
  330. ULONG ulNumOfRemovedChars = State.m_ulEnd - ul;
  331. State.m_ulEnd = ul;
  332. return ulNumOfRemovedChars;
  333. }
  334. inline void CToken::ComputeStateProperties(CTokenState& State)
  335. {
  336. Assert(m_State.m_ulStart <= State.m_ulStart);
  337. Assert(State.m_ulStart <= State.m_ulEnd);
  338. Assert(State.m_ulEnd <= m_State.m_ulEnd);
  339. State.m_Properties.Clear();
  340. for (ULONG ul = State.m_ulStart; ul < State.m_ulEnd; ul++)
  341. {
  342. State.m_Properties |= GET_PROP(State.m_pwcsToken[ul]);
  343. }
  344. }
  345. ////////////////////////////////////////////////////////////////////////////////
  346. //
  347. // Support routines for UNDERSCORE '_' treatment.
  348. //
  349. // Current algorithm has the following behavior for tokens containing
  350. // ALPHANUMERIC characters and UNDERSCORES:
  351. //
  352. // 1. Single underscores and consecutive underscore sequence surrounded by
  353. // alphanumeric characters (IE underscores buried within words) are
  354. // treated as alphanumeric characters, and do not break words, or get
  355. // omitted. Examples: Foo_Bar => Foo_Bar, and X___Y => X___Y
  356. //
  357. // 2. An underscore / underscore sequence tacked to the right (left) end
  358. // of an alphanumeric (+ embedded underscores) token, will be part of
  359. // the token, as long as the sequence is attached only to one side of the
  360. // alphanumeric token. If there are BOTH header and trailer consecutive
  361. // underscore sequences, both header & trailer sequence will be omitted.
  362. // Examples: __Foo_Bar => __Foo_Bar , alpha_beta_ => alpha_beta_ ,
  363. // __HEADERFILE__ => __HEADERFILE__ , __MY_FILE_H__ => MY_FILE_H
  364. //
  365. // 3. Caveat: Note that other than the two rules stated above underscores are
  366. // NOT treated as ALPHANUMERIC characters. The behavior on a mixed sequence
  367. // of underscores, and other non-alphanumeric characters is undefined!
  368. //
  369. ////////////////////////////////////////////////////////////////////////////////
  370. //
  371. // Assumes: on entry State.m_ulStart is the first alphanumeric in token
  372. // returns: num of underscores scanned
  373. //
  374. inline ULONG
  375. CToken::FindLeftmostUnderscore(CTokenState& State)
  376. {
  377. Assert(m_State.m_ulStart < State.m_ulStart);
  378. Assert(State.m_ulStart <= State.m_ulEnd);
  379. Assert(State.m_ulEnd <= m_State.m_ulEnd);
  380. Assert( TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulStart-1]), PROP_UNDERSCORE) );
  381. ULONG ulNumUnderscores = 0;
  382. for (ULONG ul = State.m_ulStart;
  383. (ul > m_State.m_ulStart) &&
  384. (TEST_PROP(GET_PROP(State.m_pwcsToken[ul-1]), PROP_UNDERSCORE) );
  385. ul--)
  386. ;
  387. ulNumUnderscores = State.m_ulStart - ul;
  388. State.m_ulStart = ul;
  389. //
  390. // return num of underscores scanned
  391. //
  392. return (ulNumUnderscores);
  393. } // CToken::FindLeftmostUnderscore
  394. //
  395. // Assumes: on entry State.m_ulEnd is the last alphanumeric in token
  396. // returns: num of underscores scanned
  397. //
  398. inline ULONG
  399. CToken::FindRightmostUnderscore(CTokenState& State)
  400. {
  401. Assert(m_State.m_ulStart <= State.m_ulStart);
  402. Assert(State.m_ulStart <= State.m_ulEnd);
  403. Assert(State.m_ulEnd < m_State.m_ulEnd);
  404. Assert( TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulEnd]), PROP_UNDERSCORE) );
  405. ULONG ulNumUnderscores = 0;
  406. for (ULONG ul = State.m_ulEnd;
  407. (ul < m_State.m_ulEnd) &&
  408. (TEST_PROP(GET_PROP(State.m_pwcsToken[ul]), PROP_UNDERSCORE) );
  409. ul++)
  410. ;
  411. ulNumUnderscores = ul - State.m_ulEnd;
  412. State.m_ulEnd = ul;
  413. //
  414. // return num of underscores scanned
  415. //
  416. return (ulNumUnderscores);
  417. } // CToken::FindRightmostUnderscore
  418. ///////////////////////////////////////////////////////////////////////////////
  419. // Class CTokenizer
  420. ///////////////////////////////////////////////////////////////////////////////
  421. class CTokenizer
  422. {
  423. public:
  424. CTokenizer(
  425. TEXT_SOURCE* pTxtSource,
  426. IWordSink * pWordSink,
  427. IPhraseSink * pPhraseSink,
  428. LCID lcid,
  429. BOOL bQueryTime,
  430. ULONG ulMaxTokenSize);
  431. // destructor frees the passed buffer, if it exists
  432. virtual ~CTokenizer(void)
  433. {
  434. }
  435. void BreakText();
  436. protected:
  437. //
  438. // methods
  439. //
  440. void ProcessToken();
  441. void ProcessTokenInternal();
  442. void BreakCompundString(CTokenState& State, CPropFlag& prop);
  443. HRESULT FillBuffer();
  444. void CalculateUpdateEndOfBuffer();
  445. bool CheckAndCreateNumber(
  446. WCHAR* pwcsStr,
  447. ULONG ulLen,
  448. WCHAR* pwcsOut,
  449. ULONG* pulOffsetToTxt,
  450. ULONG* pulOutLen);
  451. int CheckAndCreateNumber(
  452. WCHAR* pwcsStr,
  453. ULONG ulLen,
  454. WCHAR wchSDecimal,
  455. WCHAR wchSThousand,
  456. WCHAR* pwcsOut,
  457. ULONG* pulOffsetToTxt,
  458. ULONG* pulOutLen);
  459. short ConvertHexCharToNumber(WCHAR wch);
  460. void GetValuesFromDateString(
  461. CDateTerm* pFormat,
  462. WCHAR* pwcsDate,
  463. LONG* plD_M1, // we can't tell in this stage whether this is a Day or a month.
  464. LONG* plD_M2,
  465. LONG* plYear);
  466. void GetValuesFromTimeString(
  467. CTimeTerm* pFormat,
  468. WCHAR* pwcsTime,
  469. LONG* plHour,
  470. LONG* plMin,
  471. LONG* plSec,
  472. TimeFormat* pAmPm);
  473. LONG ConvertCharToDigit(WCHAR wch);
  474. #ifdef DEBUG
  475. void TraceToken();
  476. #endif DEBUG
  477. bool VerifyAlphaUrl();
  478. bool VerifyWwwUrl();
  479. bool VerifyAcronym();
  480. bool VerifyAbbreviation();
  481. bool VerifySpecialAbbreviation();
  482. bool VerifyHyphenation();
  483. bool VerifyParens();
  484. const CCliticsTerm* VerifyClitics(CTokenState& State);
  485. bool VerifyNumber(CTokenState& State);
  486. bool VerifyNumberOrTimeOrDate();
  487. bool VerifyTime(CTokenState& State);
  488. bool VerifyDate(CTokenState& State);
  489. bool VerifyCurrency();
  490. bool VerifyMisc();
  491. bool VerifyCommersialSign();
  492. void ProcessDefault();
  493. ULONG
  494. AddBackUnderscores(
  495. IN CTokenState& State,
  496. IN bool hasFrontUnderscore,
  497. IN bool hasBackUnderscore
  498. );
  499. bool CheckAndRemoveOneSidedUnderscores(CTokenState& State);
  500. void OutputUrl(
  501. CTokenState& State);
  502. void OutputAcronym(
  503. CTokenState& State,
  504. const CCliticsTerm* pCliticsTerm);
  505. void OutputAbbreviation(
  506. CTokenState& State);
  507. void OutputSpecialAbbreviation(
  508. CTokenState& State,
  509. CAbbTerm* pTerm,
  510. const CCliticsTerm* pCliticsTerm);
  511. virtual void OutputHyphenation(
  512. CTokenState& State,
  513. const CCliticsTerm* pCliticsTerm);
  514. void OutputParens(
  515. CTokenState& State);
  516. void OutputNumbers(
  517. CTokenState& State,
  518. ULONG ulLen,
  519. WCHAR* pwcsNumber,
  520. const CCliticsTerm* pCliticsTerm);
  521. void OutputTime(
  522. WCHAR* pwcsTime,
  523. CTokenState& State);
  524. void OutputDate(
  525. WCHAR* pwcsDate1,
  526. WCHAR* pwcsDate2,
  527. CTokenState& State);
  528. virtual void OutputSimpleToken(
  529. CTokenState& State,
  530. const CCliticsTerm* pTerm);
  531. void OutputCurrency(
  532. ULONG ulLen,
  533. WCHAR* pwcsCurrency,
  534. CTokenState& State,
  535. const CCliticsTerm* pTerm);
  536. void OutputMisc(
  537. CTokenState& State,
  538. bool bPatternContainOnlyUpperCase,
  539. ULONG ulSuffixSize,
  540. const CCliticsTerm* pCliticsTerm);
  541. void OutputCommersialSignToken(CTokenState& State);
  542. //
  543. // members
  544. //
  545. LCID m_Lcid;
  546. CAutoClassPointer<CLangSupport> m_apLangSupport;
  547. CToken* m_pCurToken;
  548. CToken m_Token;
  549. #if defined(DEBUG)
  550. CTraceWordSink m_apWordSink;
  551. #else
  552. CComPtr<IWordSink> m_apWordSink;
  553. #endif
  554. CComPtr<IPhraseSink> m_apPhraseSink;
  555. TEXT_SOURCE* m_pTxtSource;
  556. BOOL m_bQueryTime;
  557. ULONG m_ulUpdatedEndOfBuffer;
  558. bool m_bNoMoreTxt;
  559. //
  560. // All Chunks in buffer have a white space
  561. //
  562. bool m_bWhiteSpaceGuarranteed;
  563. ULONG m_ulMaxTokenSize;
  564. };
  565. inline HRESULT CTokenizer::FillBuffer()
  566. {
  567. Trace(
  568. elVerbose,
  569. s_tagTokenizer,
  570. ("WBreakGetNextChar: Filling the buffer"));
  571. HRESULT hr;
  572. if (!m_bNoMoreTxt)
  573. {
  574. do
  575. {
  576. //
  577. // this loop usually performs only one rotations. we use it to solve the
  578. // problem when the user return 0 characters and a success return code.
  579. // the following code assumes that in case you get a success return code then
  580. // the buffer is not empty.
  581. //
  582. hr = m_pTxtSource->pfnFillTextBuffer(m_pTxtSource);
  583. } while ((m_pTxtSource->iEnd <= m_pTxtSource->iCur) && SUCCEEDED(hr));
  584. if ( FAILED(hr))
  585. {
  586. m_bNoMoreTxt = true;
  587. }
  588. }
  589. if (m_bNoMoreTxt && m_pTxtSource->iCur >= m_pTxtSource->iEnd)
  590. {
  591. //
  592. // we reached the end of the buffer.
  593. //
  594. return WBREAK_E_END_OF_TEXT;
  595. }
  596. CalculateUpdateEndOfBuffer();
  597. return S_OK;
  598. }
  599. inline void CTokenizer::CalculateUpdateEndOfBuffer()
  600. {
  601. //
  602. // m_ulUpdatedEndOfBuffer is a marker for the last character that we can read
  603. // from the current buffer before and additional call to fill buffer is needed.
  604. // we use this marker to avoid terms spitted between two consecutive buffers.
  605. // in order to achieve the above m_ulUpdatedEndOfBuffer will point to a breaker
  606. // character. (the only exception to that is when we have a very long term that does
  607. // not contains breaker characters).
  608. //
  609. //
  610. // we split the buffer into chunks of TOKENIZER_MAXBUFFERLIMIT size. in each
  611. // chunk we make sure that there is a breaker.
  612. //
  613. ULONG ulStartChunk = m_pTxtSource->iCur;
  614. ULONG ulEndChunk ;
  615. bool fLastRound = false;
  616. Assert(m_pTxtSource->iEnd > m_pTxtSource->iCur);
  617. ulEndChunk = m_pTxtSource->iCur + m_ulMaxTokenSize > (m_pTxtSource->iEnd - 1) ?
  618. (m_pTxtSource->iEnd - 1) : m_pTxtSource->iCur + m_ulMaxTokenSize;
  619. ULONG ulCur;
  620. ULONG ulBreakerMarker = 0;
  621. m_bWhiteSpaceGuarranteed = false;
  622. while(true)
  623. {
  624. ulCur = ulEndChunk;
  625. //
  626. // per each chunk we go backward and try to find a WS.
  627. //
  628. while ((ulCur > ulStartChunk) &&
  629. (!IS_WS(m_pTxtSource->awcBuffer[ulCur])))
  630. {
  631. ulCur--;
  632. }
  633. if (ulCur == ulStartChunk)
  634. {
  635. //
  636. // the last chunk that we checked did not contain any WS
  637. //
  638. if (m_ulMaxTokenSize == (ulEndChunk - ulStartChunk))
  639. {
  640. //
  641. // full buffer case. we look for a default breaker.
  642. //
  643. ulCur = ulEndChunk;
  644. while ( (ulCur > ulStartChunk) &&
  645. !IS_BREAKER( m_pTxtSource->awcBuffer[ulCur] )
  646. )
  647. {
  648. ulCur--;
  649. }
  650. //
  651. // if we found a breaker then ulBreakerMarker will set to it else
  652. // the term does not contain any breakers and we set the ulBreakerMarker
  653. // to the end of the term. this is the only case that we spilt terms.
  654. //
  655. ulBreakerMarker = ulCur > ulStartChunk ? ulCur : ulEndChunk;
  656. }
  657. else
  658. {
  659. if (ulStartChunk > m_pTxtSource->iCur)
  660. {
  661. //
  662. // case we had a previous chunk. in this case ulStartChunk points to
  663. // a breaker
  664. //
  665. //
  666. // ulStart points to the WS from the previous chunk.
  667. //
  668. ulBreakerMarker = ulStartChunk;
  669. }
  670. else
  671. {
  672. ulBreakerMarker = m_pTxtSource->iEnd;
  673. }
  674. }
  675. break;
  676. }
  677. if (fLastRound)
  678. {
  679. //
  680. // ulCur points to a WS
  681. //
  682. ulBreakerMarker = ulCur + 1;
  683. m_bWhiteSpaceGuarranteed = true;
  684. break;
  685. }
  686. //
  687. // move to the next chunk
  688. //
  689. ulStartChunk = ulCur + 1; // ulStarChunk will points to a breaker
  690. if (ulStartChunk + m_ulMaxTokenSize < (m_pTxtSource->iEnd - 1))
  691. {
  692. ulEndChunk = ulStartChunk + m_ulMaxTokenSize;
  693. }
  694. else
  695. {
  696. ulEndChunk = m_pTxtSource->iEnd - 1;
  697. fLastRound = true;
  698. }
  699. }
  700. Assert(ulBreakerMarker <= m_pTxtSource->iEnd);
  701. m_ulUpdatedEndOfBuffer = ulBreakerMarker;
  702. }
  703. inline short CTokenizer::ConvertHexCharToNumber(WCHAR wch)
  704. {
  705. //
  706. // assumes wch is a valid HEX character
  707. //
  708. Assert(wch >= L'0');
  709. if (wch <= L'9')
  710. {
  711. return (wch - L'0');
  712. }
  713. else if (wch <= L'F')
  714. {
  715. Assert(wch >= L'A');
  716. return (wch - L'A' + 10);
  717. }
  718. else if (wch <= L'f')
  719. {
  720. Assert(wch >= L'a');
  721. return (wch - L'a' + 10);
  722. }
  723. else if (wch <= 0xFF19)
  724. {
  725. Assert(wch >= 0xFF10);
  726. return (wch - 0xFF10);
  727. }
  728. else if (wch <= 0xFF26)
  729. {
  730. Assert(wch >= 0xFF21);
  731. return (wch - 0xFF21 + 10);
  732. }
  733. else
  734. {
  735. Assert((wch >= 0xFF41) && (wch <= 0xFF46));
  736. return (wch - 0xFF41 + 10);
  737. }
  738. }
  739. inline LONG CTokenizer::ConvertCharToDigit(WCHAR wch)
  740. {
  741. Assert((wch >= L'0' && wch <= L'9') || ((wch >= 0xFF10) && (wch <= 0xFF19)));
  742. if (wch <= L'9')
  743. {
  744. return (wch - L'0');
  745. }
  746. return (wch - 0xFF10); // Full width characters.
  747. }
  748. #endif _TOKENIZER_H_