Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

889 lines
24 KiB

  1. ////////////////////////////////////////////////////////////////////////////////
  2. //
  3. // Filename : Tokenizer.h
  4. // Purpose : Tokenizer declarations
  5. //
  6. // Project : WordBreakers
  7. // Component: English word breaker
  8. //
  9. // Author : yairh
  10. //
  11. // Log:
  12. //
  13. // Jan 06 2000 yairh creation
  14. // Apr 05 2000 dovh - Fixed two problematic debug / tracer buffer size
  15. // problems. (Fix Bug 15449).
  16. // May 07 2000 dovh - USE_WS_SENTINEL algorithm in BreakText
  17. // Nov 11 2000 dovh - Special underscore treatment
  18. // Added inline support routines (FindLeftmostUnderscore etc.)
  19. //
  20. ////////////////////////////////////////////////////////////////////////////////
  21. #ifndef _TOKENIZER_H_
  22. #define _TOKENIZER_H_
  23. #include "tracer.h"
  24. #include "PropArray.h"
  25. #include "Query.h"
  26. #include "stdafx.h"
  27. #include "cierror.h"
  28. #include "LangSupport.h"
  29. #include "Formats.h"
  30. #define TOKENIZER_MAXBUFFERLIMIT 1024 // max size of a token is 1024 chars
  31. DECLARE_TAG(s_tagTokenizer, "Tokenizer");
  32. DECLARE_TAG(s_tagTokenizerOutput, "Tokenizer Output");
  33. DECLARE_TAG(s_tagTokenizerTrace, "Tokenizer Trace");
  34. DECLARE_TAG(s_tagTokenizerDecision, "Tokenizer Decision");
  35. DECLARE_TAG(s_tagTokenizerSuspect, "Tokenizer Suspect");
  36. #if defined(DEBUG)
  37. ///////////////////////////////////////////////////////////////////////////////
  38. // Class CTraceWordSink
  39. ///////////////////////////////////////////////////////////////////////////////
  40. class CTraceWordSink : public IWordSink
  41. {
  42. public:
  43. CTraceWordSink(IWordSink* p) : m_apWordSink(p)
  44. {
  45. }
  46. ULONG __stdcall AddRef()
  47. {
  48. return 1;
  49. }
  50. ULONG __stdcall Release()
  51. {
  52. return 0;
  53. }
  54. STDMETHOD(QueryInterface)(
  55. IN REFIID riid,
  56. IN void **ppvObject)
  57. {
  58. Assert(false);
  59. return E_FAIL;
  60. }
  61. STDMETHOD(PutWord)(
  62. ULONG cwc,
  63. WCHAR const* pwcInBuf,
  64. ULONG cwcSrcLen,
  65. ULONG cwcSrcPos)
  66. {
  67. Assert(cwc < TOKENIZER_MAXBUFFERLIMIT + 10);
  68. #if (defined (DEBUG) && !defined(_NO_TRACER)) || defined(USE_TRACER)
  69. if (CheckTraceRestrictions(elVerbose, s_tagTokenizerOutput))
  70. {
  71. Trace(
  72. elVerbose,
  73. s_tagTokenizerOutput,
  74. ("PutWord: %*.*S, %d, %d, %d",
  75. cwc,
  76. cwc,
  77. pwcInBuf,
  78. cwc,
  79. cwcSrcLen,
  80. cwcSrcPos));
  81. }
  82. #endif
  83. return m_apWordSink->PutWord(cwc, pwcInBuf, cwcSrcLen, cwcSrcPos);
  84. }
  85. STDMETHOD(PutAltWord)(
  86. ULONG cwc,
  87. WCHAR const* pwcInBuf,
  88. ULONG cwcSrcLen,
  89. ULONG cwcSrcPos)
  90. {
  91. Assert(cwc < TOKENIZER_MAXBUFFERLIMIT + 10);
  92. #if (defined (DEBUG) && !defined(_NO_TRACER)) || defined(USE_TRACER)
  93. if (CheckTraceRestrictions(elVerbose, s_tagTokenizerOutput))
  94. {
  95. Trace(
  96. elVerbose,
  97. s_tagTokenizerOutput,
  98. ("PutAltWord: %*.*S, %d, %d, %d",
  99. cwc,
  100. cwc,
  101. pwcInBuf,
  102. cwc,
  103. cwcSrcLen,
  104. cwcSrcPos));
  105. }
  106. #endif
  107. return m_apWordSink->PutAltWord(cwc, pwcInBuf, cwcSrcLen, cwcSrcPos);
  108. }
  109. STDMETHOD(StartAltPhrase)()
  110. {
  111. Trace(
  112. elVerbose,
  113. s_tagTokenizerOutput,
  114. ("StartAltPhrase"));
  115. return m_apWordSink->StartAltPhrase();
  116. }
  117. STDMETHOD(EndAltPhrase)()
  118. {
  119. Trace(
  120. elVerbose,
  121. s_tagTokenizerOutput,
  122. ("EndAltPhrase"));
  123. return m_apWordSink->EndAltPhrase();
  124. }
  125. STDMETHOD(PutBreak)(WORDREP_BREAK_TYPE breakType)
  126. {
  127. WCHAR* p;
  128. #if (defined (DEBUG) && !defined(_NO_TRACER)) || defined(USE_TRACER)
  129. if (CheckTraceRestrictions(elVerbose, s_tagTokenizerOutput))
  130. {
  131. switch (breakType)
  132. {
  133. case WORDREP_BREAK_EOW:
  134. p = L"WORDREP_BREAK_EOW";
  135. break;
  136. case WORDREP_BREAK_EOS:
  137. p = L"WORDREP_BREAK_EOS";
  138. break;
  139. case WORDREP_BREAK_EOP:
  140. p = L"WORDREP_BREAK_EOP";
  141. break;
  142. case WORDREP_BREAK_EOC:
  143. p = L"WORDREP_BREAK_EOC";
  144. break;
  145. default:
  146. p = L"Unknown break type";
  147. }
  148. Trace(
  149. elVerbose,
  150. s_tagTokenizerOutput,
  151. ("PutBreak %S", p));
  152. }
  153. #endif
  154. return m_apWordSink->PutBreak(breakType);
  155. }
  156. CTraceWordSink* operator ->()
  157. {
  158. return this;
  159. }
  160. private:
  161. CComPtr<IWordSink> m_apWordSink;
  162. };
  163. #endif
  164. ///////////////////////////////////////////////////////////////////////////////
  165. // Class CTokenState
  166. ///////////////////////////////////////////////////////////////////////////////
  167. class CTokenState
  168. {
  169. public:
  170. //
  171. // methods
  172. //
  173. CTokenState();
  174. CTokenState(CTokenState& s);
  175. CTokenState& operator = (CTokenState& S);
  176. void Clear(ULONG ulEnd);
  177. public:
  178. //
  179. // members
  180. //
  181. ULONG m_ulStart;
  182. ULONG m_ulEnd;
  183. CPropFlag m_Properties;
  184. WCHAR* m_pwcsToken;
  185. };
  186. inline CTokenState::CTokenState() : m_ulStart(0), m_ulEnd(0)
  187. {
  188. }
  189. inline CTokenState::CTokenState(CTokenState& s) :
  190. m_ulStart(s.m_ulStart),
  191. m_ulEnd(s.m_ulEnd),
  192. m_pwcsToken(s.m_pwcsToken),
  193. m_Properties(s.m_Properties)
  194. {
  195. }
  196. inline CTokenState& CTokenState::operator = (CTokenState& S)
  197. {
  198. m_ulStart = S.m_ulStart;
  199. m_ulEnd = S.m_ulEnd;
  200. m_Properties = S.m_Properties;
  201. m_pwcsToken = S.m_pwcsToken;
  202. return *this;
  203. }
  204. inline void CTokenState::Clear(ULONG ulEnd)
  205. {
  206. m_ulStart = 0;
  207. m_ulEnd = ulEnd;
  208. m_Properties.Clear();
  209. m_pwcsToken = NULL;
  210. }
  211. ///////////////////////////////////////////////////////////////////////////////
  212. // Class CToken
  213. ///////////////////////////////////////////////////////////////////////////////
  214. class CToken
  215. {
  216. public:
  217. //
  218. // methods
  219. //
  220. CToken(ULONG ulMaxTokenSize);
  221. bool IsNotEmpty();
  222. void Clear();
  223. bool IsFull();
  224. void MarkEndToken(ULONG ulCurPosInTxtSourceBuffer);
  225. ULONG RemoveHeadPunct(CPropFlag& PunctProperties, CTokenState& State);
  226. ULONG RemoveTailPunct(CPropFlag& PunctProperties, CTokenState& State);
  227. void ComputeStateProperties(CTokenState& State);
  228. ULONG CalculateStateOffsetInTxtSourceBuffer(CTokenState& State);
  229. ULONG FindLeftmostUnderscore(CTokenState& State);
  230. ULONG FindRightmostUnderscore(CTokenState& State);
  231. public:
  232. //
  233. // members
  234. //
  235. ULONG m_ulBufPos;
  236. bool m_fHasEos;
  237. ULONG m_ulOffsetInTxtSourceBuffer;
  238. ULONG m_ulMaxTokenSize;
  239. CTokenState m_State;
  240. WCHAR m_awchBuf[TOKENIZER_MAXBUFFERLIMIT + 1];
  241. };
  242. inline CToken::CToken(ULONG ulMaxTokenSize) :
  243. m_ulBufPos(0),
  244. m_fHasEos(false),
  245. m_ulOffsetInTxtSourceBuffer(0),
  246. m_ulMaxTokenSize(ulMaxTokenSize)
  247. {
  248. m_awchBuf[0] = L'\0';
  249. }
  250. inline bool CToken::IsNotEmpty()
  251. {
  252. return (m_ulBufPos > 0);
  253. }
  254. inline void CToken::Clear()
  255. {
  256. m_ulBufPos = 0;
  257. m_awchBuf[0] = L'\0';
  258. m_State.Clear(0);
  259. m_fHasEos = false;
  260. m_ulOffsetInTxtSourceBuffer = 0;
  261. }
  262. inline bool CToken::IsFull()
  263. {
  264. return (m_ulBufPos == m_ulMaxTokenSize);
  265. }
  266. inline void CToken::MarkEndToken(ULONG ulCurPosInTxtSourceBuffer)
  267. {
  268. Assert(m_ulBufPos < m_ulMaxTokenSize + 1);
  269. m_awchBuf[m_ulBufPos] = L'\0';
  270. m_State.m_pwcsToken = m_awchBuf;
  271. m_State.m_ulStart = 0;
  272. m_State.m_ulEnd = m_ulBufPos;
  273. if (TEST_PROP(m_State.m_Properties, PROP_EOS) &&
  274. (m_ulBufPos < m_ulMaxTokenSize))
  275. {
  276. ULONG ulCur = m_State.m_ulEnd - 1;
  277. while (TEST_PROP(GET_PROP(m_awchBuf[ulCur]), EOS_SUFFIX))
  278. {
  279. ulCur--;
  280. }
  281. if (IS_EOS(m_awchBuf[ulCur]))
  282. {
  283. m_fHasEos = true;
  284. }
  285. }
  286. Assert(ulCurPosInTxtSourceBuffer >= m_ulBufPos);
  287. m_ulOffsetInTxtSourceBuffer = ulCurPosInTxtSourceBuffer - m_ulBufPos;
  288. }
  289. inline ULONG CToken::CalculateStateOffsetInTxtSourceBuffer(CTokenState& State)
  290. {
  291. ULONG ulOffset =
  292. m_ulOffsetInTxtSourceBuffer +
  293. (State.m_pwcsToken - m_awchBuf) +
  294. State.m_ulStart;
  295. return ulOffset;
  296. }
  297. inline ULONG CToken::RemoveHeadPunct(CPropFlag& PunctProperties, CTokenState& State)
  298. {
  299. Assert(m_State.m_ulStart <= State.m_ulStart);
  300. Assert(State.m_ulStart <= State.m_ulEnd);
  301. Assert(State.m_ulEnd <= m_State.m_ulEnd);
  302. for (ULONG ul = State.m_ulStart; ul < State.m_ulEnd; ul++)
  303. {
  304. if (!TEST_PROP1(GET_PROP(State.m_pwcsToken[ul]), PunctProperties) )
  305. {
  306. break;
  307. }
  308. }
  309. State.m_ulStart = ul;
  310. //
  311. // return num of characters removed
  312. //
  313. return ul;
  314. }
  315. inline ULONG CToken::RemoveTailPunct(CPropFlag& PunctProperties, CTokenState& State)
  316. {
  317. Assert(m_State.m_ulStart <= State.m_ulStart);
  318. Assert(State.m_ulStart <= State.m_ulEnd);
  319. Assert(State.m_ulEnd <= m_State.m_ulEnd);
  320. for (ULONG ul = State.m_ulEnd; ul > State.m_ulStart; ul--)
  321. {
  322. if (!TEST_PROP1(GET_PROP(State.m_pwcsToken[ul - 1]), PunctProperties) )
  323. {
  324. break;
  325. }
  326. }
  327. ULONG ulNumOfRemovedChars = State.m_ulEnd - ul;
  328. State.m_ulEnd = ul;
  329. return ulNumOfRemovedChars;
  330. }
  331. inline void CToken::ComputeStateProperties(CTokenState& State)
  332. {
  333. Assert(m_State.m_ulStart <= State.m_ulStart);
  334. Assert(State.m_ulStart <= State.m_ulEnd);
  335. Assert(State.m_ulEnd <= m_State.m_ulEnd);
  336. State.m_Properties.Clear();
  337. for (ULONG ul = State.m_ulStart; ul < State.m_ulEnd; ul++)
  338. {
  339. State.m_Properties |= GET_PROP(State.m_pwcsToken[ul]);
  340. }
  341. }
  342. ////////////////////////////////////////////////////////////////////////////////
  343. //
  344. // Support routines for UNDERSCORE '_' treatment.
  345. //
  346. // Current algorithm has the following behavior for tokens containing
  347. // ALPHANUMERIC characters and UNDERSCORES:
  348. //
  349. // 1. Single underscores and consecutive underscore sequence surrounded by
  350. // alphanumeric characters (IE underscores buried within words) are
  351. // treated as alphanumeric characters, and do not break words, or get
  352. // omitted. Examples: Foo_Bar => Foo_Bar, and X___Y => X___Y
  353. //
  354. // 2. An underscore / underscore sequence tacked to the right (left)
  355. // end of an alphanumeric (+ embedded underscores) token, will be part of
  356. // the token, as long as the sequence is attached only to one side of the
  357. // alphanumeric token. If there are BOTH header and trailer consecutive
  358. // underscore sequences, both header & trailer sequence will be omitted.
  359. // Examples: __Foo_Bar => __Foo_Bar , alpha_beta_ => alpha_beta_ ,
  360. // __HEADERFILE__ => __HEADERFILE__ , __MY_FILE_H__ => MY_FILE_H
  361. //
  362. // 3. Caveat: Note that other than the two rules stated above underscores are
  363. // NOT treated as ALPHANUMERIC characters. The behavior on a mixed sequence
  364. // of underscores, and other non-alphanumeric characters is undefined!
  365. //
  366. ////////////////////////////////////////////////////////////////////////////////
  367. //
  368. // Assumes: on entry State.m_ulStart is the first alphanumeric in token
  369. // returns: num of underscores scanned
  370. //
  371. inline ULONG
  372. CToken::FindLeftmostUnderscore(CTokenState& State)
  373. {
  374. Assert(m_State.m_ulStart < State.m_ulStart);
  375. Assert(State.m_ulStart <= State.m_ulEnd);
  376. Assert(State.m_ulEnd <= m_State.m_ulEnd);
  377. Assert( TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulStart-1]), PROP_UNDERSCORE) );
  378. ULONG ulNumUnderscores = 0;
  379. for (ULONG ul = State.m_ulStart;
  380. (ul > m_State.m_ulStart) &&
  381. (TEST_PROP(GET_PROP(State.m_pwcsToken[ul-1]), PROP_UNDERSCORE) );
  382. ul--)
  383. ;
  384. ulNumUnderscores = State.m_ulStart - ul;
  385. State.m_ulStart = ul;
  386. //
  387. // return num of underscores scanned
  388. //
  389. return (ulNumUnderscores);
  390. } // CToken::FindLeftmostUnderscore
  391. //
  392. // Assumes: on entry State.m_ulEnd is the last alphanumeric in token
  393. // returns: num of underscores scanned
  394. //
  395. inline ULONG
  396. CToken::FindRightmostUnderscore(CTokenState& State)
  397. {
  398. Assert(m_State.m_ulStart <= State.m_ulStart);
  399. Assert(State.m_ulStart <= State.m_ulEnd);
  400. Assert(State.m_ulEnd < m_State.m_ulEnd);
  401. Assert( TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulEnd]), PROP_UNDERSCORE) );
  402. ULONG ulNumUnderscores = 0;
  403. for (ULONG ul = State.m_ulEnd;
  404. (ul < m_State.m_ulEnd) &&
  405. (TEST_PROP(GET_PROP(State.m_pwcsToken[ul]), PROP_UNDERSCORE) );
  406. ul++)
  407. ;
  408. ulNumUnderscores = ul - State.m_ulEnd;
  409. State.m_ulEnd = ul;
  410. //
  411. // return num of underscores scanned
  412. //
  413. return (ulNumUnderscores);
  414. } // CToken::FindRightmostUnderscore
  415. ///////////////////////////////////////////////////////////////////////////////
  416. // Class CTokenizer
  417. ///////////////////////////////////////////////////////////////////////////////
  418. class CTokenizer
  419. {
  420. public:
  421. CTokenizer(
  422. TEXT_SOURCE* pTxtSource,
  423. IWordSink * pWordSink,
  424. IPhraseSink * pPhraseSink,
  425. LCID lcid,
  426. BOOL bQueryTime,
  427. ULONG ulMaxTokenSize);
  428. // destructor frees the passed buffer, if it exists
  429. virtual ~CTokenizer(void)
  430. {
  431. }
  432. void BreakText();
  433. protected:
  434. //
  435. // methods
  436. //
  437. void ProcessToken();
  438. void ProcessTokenInternal();
  439. void BreakCompundString(CTokenState& State, CPropFlag& prop);
  440. HRESULT FillBuffer();
  441. void CalculateUpdateEndOfBuffer();
  442. bool CheckAndCreateNumber(
  443. WCHAR* pwcsStr,
  444. ULONG ulLen,
  445. WCHAR* pwcsOut,
  446. ULONG* pulOffsetToTxt,
  447. ULONG* pulOutLen);
  448. int CheckAndCreateNumber(
  449. WCHAR* pwcsStr,
  450. ULONG ulLen,
  451. WCHAR wchSDecimal,
  452. WCHAR wchSThousand,
  453. WCHAR* pwcsOut,
  454. ULONG* pulOffsetToTxt,
  455. ULONG* pulOutLen);
  456. short ConvertHexCharToNumber(WCHAR wch);
  457. void GetValuesFromDateString(
  458. CDateTerm* pFormat,
  459. WCHAR* pwcsDate,
  460. LONG* plD_M1, // we can't tell in this stage whether this is a Day or a month.
  461. LONG* plD_M2,
  462. LONG* plYear);
  463. void GetValuesFromTimeString(
  464. CTimeTerm* pFormat,
  465. WCHAR* pwcsTime,
  466. LONG* plHour,
  467. LONG* plMin,
  468. LONG* plSec,
  469. TimeFormat* pAmPm);
  470. LONG ConvertCharToDigit(WCHAR wch);
  471. #ifdef DEBUG
  472. void TraceToken();
  473. #endif DEBUG
  474. bool VerifyAlphaUrl();
  475. bool VerifyWwwUrl();
  476. bool VerifyAcronym();
  477. bool VerifyAbbreviation();
  478. bool VerifySpecialAbbreviation();
  479. bool VerifyHyphenation();
  480. bool VerifyParens();
  481. const CCliticsTerm* VerifyClitics(CTokenState& State);
  482. bool VerifyNumber(CTokenState& State);
  483. bool VerifyNumberOrTimeOrDate();
  484. bool VerifyTime(CTokenState& State);
  485. bool VerifyDate(CTokenState& State);
  486. bool VerifyCurrency();
  487. bool VerifyMisc();
  488. bool VerifyCommersialSign();
  489. void ProcessDefault();
  490. ULONG
  491. AddBackUnderscores(
  492. IN CTokenState& State,
  493. IN bool hasFrontUnderscore,
  494. IN bool hasBackUnderscore
  495. );
  496. bool CheckAndRemoveOneSidedUnderscores(CTokenState& State);
  497. void OutputUrl(
  498. CTokenState& State);
  499. void OutputAcronym(
  500. CTokenState& State,
  501. const CCliticsTerm* pCliticsTerm);
  502. void OutputAbbreviation(
  503. CTokenState& State);
  504. void OutputSpecialAbbreviation(
  505. CTokenState& State,
  506. CAbbTerm* pTerm,
  507. const CCliticsTerm* pCliticsTerm);
  508. virtual void OutputHyphenation(
  509. CTokenState& State,
  510. const CCliticsTerm* pCliticsTerm);
  511. void OutputParens(
  512. CTokenState& State);
  513. void OutputNumbers(
  514. CTokenState& State,
  515. ULONG ulLen,
  516. WCHAR* pwcsNumber,
  517. const CCliticsTerm* pCliticsTerm);
  518. void OutputTime(
  519. WCHAR* pwcsTime,
  520. CTokenState& State);
  521. void OutputDate(
  522. WCHAR* pwcsDate1,
  523. WCHAR* pwcsDate2,
  524. CTokenState& State);
  525. virtual void OutputSimpleToken(
  526. CTokenState& State,
  527. const CCliticsTerm* pTerm);
  528. void OutputCurrency(
  529. ULONG ulLen,
  530. WCHAR* pwcsCurrency,
  531. CTokenState& State,
  532. const CCliticsTerm* pTerm);
  533. void OutputMisc(
  534. CTokenState& State,
  535. bool bPatternContainOnlyUpperCase,
  536. ULONG ulSuffixSize,
  537. const CCliticsTerm* pCliticsTerm);
  538. void OutputCommersialSignToken(CTokenState& State);
  539. //
  540. // members
  541. //
  542. LCID m_Lcid;
  543. CAutoClassPointer<CLangSupport> m_apLangSupport;
  544. CToken* m_pCurToken;
  545. CToken m_Token;
  546. #if defined(DEBUG)
  547. CTraceWordSink m_apWordSink;
  548. #else
  549. CComPtr<IWordSink> m_apWordSink;
  550. #endif
  551. CComPtr<IPhraseSink> m_apPhraseSink;
  552. TEXT_SOURCE* m_pTxtSource;
  553. BOOL m_bQueryTime;
  554. ULONG m_ulUpdatedEndOfBuffer;
  555. bool m_bNoMoreTxt;
  556. //
  557. // All Chunks in buffer have a white space
  558. //
  559. bool m_bWhiteSpaceGuarranteed;
  560. ULONG m_ulMaxTokenSize;
  561. };
  562. inline HRESULT CTokenizer::FillBuffer()
  563. {
  564. Trace(
  565. elVerbose,
  566. s_tagTokenizer,
  567. ("WBreakGetNextChar: Filling the buffer"));
  568. HRESULT hr;
  569. if (!m_bNoMoreTxt)
  570. {
  571. do
  572. {
  573. //
  574. // this loop usually performs only one rotations. we use it to solve the
  575. // problem when the user return 0 characters and a success return code.
  576. // the following code assumes that in case you get a success return code then
  577. // the buffer is not empty.
  578. //
  579. hr = m_pTxtSource->pfnFillTextBuffer(m_pTxtSource);
  580. } while ((m_pTxtSource->iEnd <= m_pTxtSource->iCur) && SUCCEEDED(hr));
  581. if ( FAILED(hr))
  582. {
  583. m_bNoMoreTxt = true;
  584. }
  585. }
  586. if (m_bNoMoreTxt && m_pTxtSource->iCur >= m_pTxtSource->iEnd)
  587. {
  588. //
  589. // we reached the end of the buffer.
  590. //
  591. return WBREAK_E_END_OF_TEXT;
  592. }
  593. CalculateUpdateEndOfBuffer();
  594. return S_OK;
  595. }
  596. inline void CTokenizer::CalculateUpdateEndOfBuffer()
  597. {
  598. //
  599. // m_ulUpdatedEndOfBuffer is a marker for the last character that we can read
  600. // from the current buffer before and additional call to fill buffer is needed.
  601. // we use this marker to avoid terms spitted between two consecutive buffers.
  602. // in order to achieve the above m_ulUpdatedEndOfBuffer will point to a breaker
  603. // character. (the only exception to that is when we have a very long term that does
  604. // not contains breaker characters).
  605. //
  606. //
  607. // we split the buffer into chunks of TOKENIZER_MAXBUFFERLIMIT size. in each
  608. // chunk we make sure that there is a breaker.
  609. //
  610. ULONG ulStartChunk = m_pTxtSource->iCur;
  611. ULONG ulEndChunk ;
  612. bool fLastRound = false;
  613. Assert(m_pTxtSource->iEnd > m_pTxtSource->iCur);
  614. ulEndChunk = m_pTxtSource->iCur + m_ulMaxTokenSize > (m_pTxtSource->iEnd - 1) ?
  615. (m_pTxtSource->iEnd - 1) : m_pTxtSource->iCur + m_ulMaxTokenSize;
  616. ULONG ulCur;
  617. ULONG ulBreakerMarker = 0;
  618. m_bWhiteSpaceGuarranteed = false;
  619. while(true)
  620. {
  621. ulCur = ulEndChunk;
  622. //
  623. // per each chunk we go backward and try to find a WS.
  624. //
  625. while ((ulCur > ulStartChunk) &&
  626. (!IS_WS(m_pTxtSource->awcBuffer[ulCur])))
  627. {
  628. ulCur--;
  629. }
  630. if (ulCur == ulStartChunk)
  631. {
  632. //
  633. // the last chunk that we checked did not contain any WS
  634. //
  635. if (m_ulMaxTokenSize == (ulEndChunk - ulStartChunk))
  636. {
  637. //
  638. // full buffer case. we look for a default breaker.
  639. //
  640. ulCur = ulEndChunk;
  641. while ( (ulCur > ulStartChunk) &&
  642. !IS_BREAKER( m_pTxtSource->awcBuffer[ulCur] )
  643. )
  644. {
  645. ulCur--;
  646. }
  647. //
  648. // if we found a breaker then ulBreakerMarker will set to it else
  649. // the term does not contain any breakers and we set the ulBreakerMarker
  650. // to the end of the term. this is the only case that we spilt terms.
  651. //
  652. ulBreakerMarker = ulCur > ulStartChunk ? ulCur : ulEndChunk;
  653. }
  654. else
  655. {
  656. if (ulStartChunk > m_pTxtSource->iCur)
  657. {
  658. //
  659. // case we had a previous chunk. in this case ulStartChunk points to
  660. // a breaker
  661. //
  662. //
  663. // ulStart points to the WS from the previous chunk.
  664. //
  665. ulBreakerMarker = ulStartChunk;
  666. }
  667. else
  668. {
  669. ulBreakerMarker = m_pTxtSource->iEnd;
  670. }
  671. }
  672. break;
  673. }
  674. if (fLastRound)
  675. {
  676. //
  677. // ulCur points to a WS
  678. //
  679. ulBreakerMarker = ulCur + 1;
  680. m_bWhiteSpaceGuarranteed = true;
  681. break;
  682. }
  683. //
  684. // move to the next chunk
  685. //
  686. ulStartChunk = ulCur + 1; // ulStarChunk will points to a breaker
  687. if (ulStartChunk + m_ulMaxTokenSize < (m_pTxtSource->iEnd - 1))
  688. {
  689. ulEndChunk = ulStartChunk + m_ulMaxTokenSize;
  690. }
  691. else
  692. {
  693. ulEndChunk = m_pTxtSource->iEnd - 1;
  694. fLastRound = true;
  695. }
  696. }
  697. Assert(ulBreakerMarker <= m_pTxtSource->iEnd);
  698. m_ulUpdatedEndOfBuffer = ulBreakerMarker;
  699. }
  700. inline short CTokenizer::ConvertHexCharToNumber(WCHAR wch)
  701. {
  702. //
  703. // assumes wch is a valid HEX character
  704. //
  705. Assert(wch >= L'0');
  706. if (wch <= L'9')
  707. {
  708. return (wch - L'0');
  709. }
  710. else if (wch <= L'F')
  711. {
  712. Assert(wch >= L'A');
  713. return (wch - L'A' + 10);
  714. }
  715. else if (wch <= L'f')
  716. {
  717. Assert(wch >= L'a');
  718. return (wch - L'a' + 10);
  719. }
  720. else if (wch <= 0xFF19)
  721. {
  722. Assert(wch >= 0xFF10);
  723. return (wch - 0xFF10);
  724. }
  725. else if (wch <= 0xFF26)
  726. {
  727. Assert(wch >= 0xFF21);
  728. return (wch - 0xFF21 + 10);
  729. }
  730. else
  731. {
  732. Assert((wch >= 0xFF41) && (wch <= 0xFF46));
  733. return (wch - 0xFF41 + 10);
  734. }
  735. }
  736. inline LONG CTokenizer::ConvertCharToDigit(WCHAR wch)
  737. {
  738. Assert((wch >= L'0' && wch <= L'9') || ((wch >= 0xFF10) && (wch <= 0xFF19)));
  739. if (wch <= L'9')
  740. {
  741. return (wch - L'0');
  742. }
  743. return (wch - 0xFF10); // Full width characters.
  744. }
  745. #endif _TOKENIZER_H_