Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

827 lines
23 KiB

  1. //+---------------------------------------------------------------------------
  2. //
  3. //
  4. // CThaiWordBreak
  5. //
  6. // History:
  7. // created 7/99 aarayas
  8. //
  9. // (c) 1999 Microsoft Corporation
  10. //----------------------------------------------------------------------------
  11. #include "cthwb.hpp"
  12. //+---------------------------------------------------------------------------
  13. //
  14. // Function: ExtractALT
  15. //
  16. // Synopsis: The functions takes a tag and return Alternate Tags.
  17. //
  18. // Arguments:
  19. //
  20. // Modifies:
  21. //
  22. // History: created 3/00 aarayas
  23. //
  24. // Notes:
  25. //
  26. //----------------------------------------------------------------------------
  27. inline BYTE ExtractALT(DWORD dwTag)
  28. {
  29. return (BYTE) ( (dwTag & iAltMask) >> iAltShift);
  30. }
  31. //+---------------------------------------------------------------------------
  32. //
  33. // Class: CThaiWordBreak
  34. //
  35. // Synopsis: constructor
  36. //
  37. // Arguments:
  38. //
  39. // Modifies:
  40. //
  41. // History: created 8/00 aarayas
  42. //
  43. // Notes:
  44. //
  45. //----------------------------------------------------------------------------
  46. CThaiWordBreak::CThaiWordBreak()
  47. {
  48. wordCount[0] = 0;
  49. }
  50. //+---------------------------------------------------------------------------
  51. //
  52. // Class: CThaiWordBreak
  53. //
  54. // Synopsis: destructor
  55. //
  56. // Arguments:
  57. //
  58. // Modifies:
  59. //
  60. // History: created 8/00 aarayas
  61. //
  62. // Notes:
  63. //
  64. //----------------------------------------------------------------------------
  65. CThaiWordBreak::~CThaiWordBreak()
  66. {
  67. wordCount[0] = 0;
  68. #if defined (_DEBUG)
  69. assert(listWordBreak.length == 0);
  70. #endif
  71. }
  72. //+---------------------------------------------------------------------------
  73. //
  74. // Class: CThaiWordBreak
  75. //
  76. // Synopsis: Initialize ThaiWordBreak.
  77. //
  78. // Arguments:
  79. //
  80. // Modifies:
  81. //
  82. // History: created 7/99 aarayas
  83. //
  84. // Notes:
  85. //
  86. //----------------------------------------------------------------------------
  87. PTEC CThaiWordBreak::Init(const WCHAR* wzFileName, const WCHAR* wzFileNameTrigram)
  88. {
  89. // Declare and Initialize local variables.
  90. PTEC retValue = m_trie.Init(wzFileName);
  91. if (retValue == ptecNoErrors)
  92. {
  93. retValue = m_trie_trigram.Init(wzFileNameTrigram);
  94. }
  95. // new memory management
  96. listWordBreak.Init(&m_trie,&m_trie_trigram);
  97. for (int i = 0; i < 10; i++)
  98. {
  99. listWordBreak.CreateWordBreak();
  100. }
  101. return retValue;
  102. }
  103. //+---------------------------------------------------------------------------
  104. //
  105. // Class: CThaiWordBreak
  106. //
  107. // Synopsis: Initialize ThaiWordBreak.
  108. //
  109. // Arguments:
  110. //
  111. // Modifies:
  112. //
  113. // History: created 7/99 aarayas
  114. //
  115. // Notes:
  116. //
  117. //----------------------------------------------------------------------------
  118. PTEC CThaiWordBreak::InitRc(LPBYTE pThaiDic, LPBYTE pThaiTrigram, BOOL fSkipHeader)
  119. {
  120. // Declare and Initialize local variables.
  121. PTEC retValue = m_trie.InitRc(pThaiDic, fSkipHeader);
  122. if (retValue == ptecNoErrors)
  123. retValue = m_trie_trigram.InitRc(pThaiTrigram, fSkipHeader);
  124. // new memory management
  125. listWordBreak.Init(&m_trie,&m_trie_trigram);
  126. for (int i = 0; i < 10; i++)
  127. {
  128. listWordBreak.CreateWordBreak();
  129. }
  130. return retValue;
  131. }
  132. //+---------------------------------------------------------------------------
  133. //
  134. // Class: CThaiWordBreak
  135. //
  136. // Synopsis: UnInitialize ThaiWordBreak.
  137. //
  138. // Arguments:
  139. //
  140. // Modifies:
  141. //
  142. // History: created 7/99 aarayas
  143. //
  144. // Notes:
  145. //
  146. //----------------------------------------------------------------------------
  147. void CThaiWordBreak::UnInit()
  148. {
  149. // new memory management
  150. listWordBreak.Flush();
  151. m_trie.UnInit();
  152. #if defined (NGRAM_ENABLE)
  153. m_trie_sentence_struct.UnInit();
  154. #endif
  155. m_trie_trigram.UnInit();
  156. }
  //+---------------------------------------------------------------------------
  //
  // Enum: merge_direction
  //
  // Synopsis: Tells the line-break code which neighbouring break a
  //           punctuation or delimiter character should be attached to.
  //
  // History: created 7/99 aarayas
  //
  // Notes: Values are produced by DetermineMergeDirection.
  //
  //----------------------------------------------------------------------------
  enum merge_direction {
      NO_MERGE                 = 0,   // stands alone as its own break
      MERGE_RIGHT              = 1,   // attach to what follows
      MERGE_LEFT               = 2,   // attach to what precedes
      MERGE_BOTH_DIRECTIONS    = 3,   // not returned by DetermineMergeDirection yet
      NOT_SURE_WHICH_DIRECTION = 4    // caller decides from context
  };
  179. merge_direction DetermineMergeDirection(WCHAR wc)
  180. {
  181. if (wc == 0x0020) // space
  182. return NO_MERGE;
  183. else if ( wc == 0x0022 || // quotation mark
  184. wc == 0x0027 ) // apostrophe
  185. return NOT_SURE_WHICH_DIRECTION;
  186. else if ( wc == 0x0028 || // left parenthesis
  187. wc == 0x003C || // less than sign
  188. wc == 0x005B || // left square bracket
  189. wc == 0x007B || // left curly bracket
  190. wc == 0x2018 || // left single quotation mark
  191. wc == 0x201C || // left double quotation mark
  192. wc == 0x201F ) // left double quotation mark reverse
  193. return MERGE_RIGHT;
  194. // TODO: need to add MERGE_BOTH_DIRECTIONS for character joiner characters.
  195. // all other character merge left.
  196. return MERGE_LEFT;
  197. }
  198. //+---------------------------------------------------------------------------
  199. //
  200. // Class: CThaiWordBreak
  201. //
  202. // Synopsis:
  203. //
  204. // Arguments:
  205. //
  206. // Modifies:
  207. //
  208. // History: created 7/99 aarayas
  209. //
  210. // Notes:
  211. //
  212. //----------------------------------------------------------------------------
  213. DWORD_PTR CThaiWordBreak::CreateWordBreaker()
  214. {
  215. CThaiBreakTree* breakTree = NULL;
  216. breakTree = new CThaiBreakTree();
  217. #if defined (NGRAM_ENABLE)
  218. if (breakTree)
  219. breakTree->Init(&m_trie, &m_trie_sentence_struct, &m_trie_trigram);
  220. #else
  221. if (breakTree)
  222. breakTree->Init(&m_trie, &m_trie_trigram);
  223. #endif
  224. return (DWORD_PTR)breakTree;
  225. }
  226. //+---------------------------------------------------------------------------
  227. //
  228. // Class: CThaiWordBreak
  229. //
  230. // Synopsis:
  231. //
  232. // Arguments:
  233. //
  234. // Modifies:
  235. //
  236. // History: created 7/99 aarayas
  237. //
  238. // Notes:
  239. //
  240. //----------------------------------------------------------------------------
  241. bool CThaiWordBreak::DeleteWordBreaker(DWORD_PTR dwBreaker)
  242. {
  243. CThaiBreakTree* breakTree = (CThaiBreakTree*) dwBreaker;
  244. if (breakTree)
  245. {
  246. delete breakTree;
  247. return true;
  248. }
  249. return false;
  250. }
  251. //+---------------------------------------------------------------------------
  252. //
  253. // Class: CThaiWordBreak
  254. //
  255. // Synopsis: This funciton segment Thai word use for Indexing.
  256. //
  257. // Arguments:
  258. // wzString - input string. (in)
  259. // iStringLen - input string length. (in)
  260. // pBreakPos - array of break position. (out)
  261. // pThwb_Struct - array structure of THWB. (out)
  262. // iBreakMax - length of pBreakPos and
  263. // pThwb_Struct. (out)
  264. //
  265. // Modifies:
  266. //
  267. // History: created 3/00 aarayas
  268. //
  269. // Notes:
  270. //
  271. //----------------------------------------------------------------------------
  272. int CThaiWordBreak::IndexWordBreak(WCHAR* wzString,unsigned int iStringLen, BYTE* pBreakPos,THWB_STRUCT* pThwb_Struct,unsigned int iBreakMax)
  273. {
  274. unsigned int iBreakIndex = 0; // Contain number of Breaks.
  275. CThaiBreakTree* breakTree = NULL;
  276. breakTree = new CThaiBreakTree();
  277. if (breakTree)
  278. {
  279. breakTree->Init(&m_trie, &m_trie_trigram);
  280. iBreakIndex = FindWordBreak((DWORD_PTR)breakTree,wzString,iStringLen,pBreakPos,iBreakMax,WB_INDEX,true,pThwb_Struct);
  281. delete breakTree;
  282. }
  283. return iBreakIndex;
  284. }
  285. //+---------------------------------------------------------------------------
  286. //
  287. // Class: CThaiWordBreak
  288. //
  289. // Synopsis:
  290. //
  291. // Arguments:
  292. //
  293. // wzWord - input string. (in)
  294. // iWordLen - input string length. (in)
  295. // Alt - find close alternate word (in)
  296. // pBreakPos - array of break position allways 5 byte. (out)
  297. //
  298. // Modifies:
  299. //
  300. // History: created 3/00 aarayas
  301. //
  302. // Notes:
  303. //
  304. //----------------------------------------------------------------------------
  305. int CThaiWordBreak::FindAltWord(WCHAR* wzWord,unsigned int iWordLen, BYTE Alt, BYTE* pBreakPos)
  306. {
  307. unsigned int iBreakIndex = 0; // Contain number of Breaks.
  308. CThaiBreakTree* breakTree = NULL;
  309. breakTree = new CThaiBreakTree();
  310. if (breakTree)
  311. {
  312. breakTree->Init(&m_trie, &m_trie_trigram);
  313. iBreakIndex = breakTree->FindAltWord(wzWord,iWordLen,Alt,pBreakPos);
  314. delete breakTree;
  315. }
  316. return iBreakIndex;
  317. }
  318. //+---------------------------------------------------------------------------
  319. //
  320. // Class: CThaiWordBreak
  321. //
  322. // Synopsis: This funciton segment Thai text segment them depending on the modes specifies.
  323. //
  324. // WB_LINEBREAK - is used when the application needs to break for line wrapping,
  325. // this mode takes into the consideration of punctuations.
  326. //
  327. // WB_NORMAL - is used when application wants determine word for searching,
  328. // autocorrect, etc.
  329. //
  330. // WB_SPELLER - not yet implemented, but same as normal with additional soundex
  331. // rules.
  332. //
  333. // Arguments:
  334. //
  335. // wzString - input string. (in)
  336. // iStringLen - input string length. (in)
  337. // pBreakPos - array of break position. (out)
  338. // iBreakMax - length of pBreakPos (out)
  339. // mode - either WB_LINEBREAK, etct (in)
  340. // fFastWordBreak - true for fast algorithm (in)
  341. //
  342. // Modifies:
  343. //
  344. // History: created 7/99 aarayas
  345. //
  346. // Notes:
  347. //
  348. //----------------------------------------------------------------------------
  349. int CThaiWordBreak::FindWordBreak(WCHAR* wzString,unsigned int iStringLen, BYTE* pBreakPos,unsigned int iBreakMax, BYTE mode, bool fFastWordBreak)
  350. {
  351. unsigned int iBreakIndex = 0; // Contain number of Breaks.
  352. CThaiBreakTree* breakTree = NULL;
  353. #if defined(OLD)
  354. breakTree = new CThaiBreakTree();
  355. #else
  356. // new memory management
  357. WordBreakElement* pWordBreakElement = NULL;
  358. pWordBreakElement = listWordBreak.GetFreeWB();
  359. breakTree = pWordBreakElement->breakTree;
  360. listWordBreak.MarkWordBreak(pWordBreakElement,false); // Mark word break as in use.
  361. #endif
  362. if (breakTree)
  363. {
  364. #if defined(OLD)
  365. breakTree->Init(&m_trie, &m_trie_trigram);
  366. assert(mode != WB_INDEX); // If this assert come up, use function IndexWordBreak
  367. iBreakIndex = FindWordBreak((DWORD_PTR)breakTree,wzString,iStringLen,pBreakPos,iBreakMax,mode,fFastWordBreak,0);
  368. delete breakTree;
  369. #else
  370. iBreakIndex = FindWordBreak((DWORD_PTR)breakTree,wzString,iStringLen,pBreakPos,iBreakMax,mode,fFastWordBreak,0);
  371. listWordBreak.MarkWordBreak(pWordBreakElement,true); // Mark word break as free.
  372. #endif
  373. }
  374. else
  375. {
  376. assert(false);
  377. }
  378. return iBreakIndex;
  379. }
  380. //+---------------------------------------------------------------------------
  381. //
  382. // Class: CThaiWordBreak
  383. //
  384. // Synopsis: This funciton segment Thai text segment them depending on the modes specifies.
  385. //
  386. // WB_LINEBREAK - is used when the application needs to break for line wrapping,
  387. // this mode takes into the consideration of punctuations.
  388. //
  389. // WB_NORMAL - is used when application wants determine word for searching,
  390. // autocorrect, etc.
  391. //
  392. // WB_SPELLER - not yet implemented, but same as normal with additional soundex
  393. // rules.
  394. //
  395. // WB_INDEX - is used when application wanted to do Thai indexing.
  396. //
  397. //
  398. // Arguments:
  399. //
  400. // wzString - input string. (in)
  401. // iStringLen - input string length. (in)
  402. // pBreakPos - array of break position. (out)
  403. // iBreakMax - length of pBreakPos (out)
  404. // must be greater than 1.
  405. // mode - either WB_LINEBREAK, etct (in)
  406. // fFastWordBreak - true for fast algorithm (in)
  407. // pThwb_Struct - array structure of THWB. (out)
  408. //
  409. // Modifies:
  410. //
  411. // History: created 11/99 aarayas
  412. //
  413. // Notes:
  414. //
  415. //----------------------------------------------------------------------------
  416. int CThaiWordBreak::FindWordBreak(DWORD_PTR dwBreaker, WCHAR* wzString,unsigned int iStringLen, BYTE* pBreakPos,unsigned int iBreakMax, BYTE mode, bool fFastWordBreak, THWB_STRUCT* pThwb_Struct)
  417. {
  418. // Declare and Initialize all local variables.
  419. WCHAR* pwszRunStart = wzString;
  420. const WCHAR* pwszMax = wzString + iStringLen;
  421. WCHAR* pwch = wzString;
  422. bool fThaiRun = true;
  423. bool fCaretBreak = false;
  424. int iRunCount = 0;
  425. unsigned int i = 0;
  426. unsigned int iBreakIndex = 0; // Contain number of Breaks.
  427. merge_direction dirPrevious = NO_MERGE;
  428. merge_direction dirCurrent = NO_MERGE;
  429. CThaiBreakTree* breakTree = (CThaiBreakTree*) dwBreaker;
  430. // check for possible invalid arguments.
  431. assert(wzString != NULL);
  432. assert(iBreakMax > 0);
  433. assert(pBreakPos != NULL);
  434. if ((wzString == NULL) || (iBreakMax == 0) || (pBreakPos == NULL))
  435. return 0;
  436. switch (mode)
  437. {
  438. case WB_LINEBREAK:
  439. case 2: // to be compatible with old api.
  440. do
  441. {
  442. while ((TWB_IsCharPunctW(*pwch) || TWB_IsCharWordDelimW(*pwch)) && iBreakIndex < iBreakMax && pwch < pwszMax)
  443. {
  444. dirCurrent = DetermineMergeDirection(*pwch);
  445. switch (dirCurrent)
  446. {
  447. case NO_MERGE:
  448. if ( pwch + 1 < pwszMax && *(pwch + 1) == THAI_Vowel_MaiYaMok && iBreakIndex > 0)
  449. {
  450. // Mai Ya Mok case only.
  451. pBreakPos[iBreakIndex - 1] += 2;
  452. dirCurrent = MERGE_LEFT;
  453. pwch++;
  454. }
  455. else
  456. pBreakPos[iBreakIndex++] = 1;
  457. break;
  458. case MERGE_RIGHT:
  459. if (dirPrevious == MERGE_RIGHT)
  460. pBreakPos[iBreakIndex - 1]++;
  461. else if (!TWB_IsCharPunctW(*(pwch + 1)))
  462. pBreakPos[iBreakIndex++] = 1;
  463. else
  464. pBreakPos[iBreakIndex++] = 1;
  465. break;
  466. case NOT_SURE_WHICH_DIRECTION:
  467. if (pwch == wzString || // if pwch is first character.
  468. TWB_IsCharWordDelimW(*(pwch - 1)) ) // if previous character is delimiter.
  469. {
  470. pBreakPos[iBreakIndex++] = 1;
  471. dirCurrent = MERGE_RIGHT;
  472. }
  473. else
  474. {
  475. pBreakPos[iBreakIndex - 1]++;
  476. dirCurrent = MERGE_LEFT;
  477. }
  478. break;
  479. case MERGE_LEFT:
  480. default:
  481. if (iBreakIndex == 0)
  482. if (pwch == wzString)
  483. pBreakPos[iBreakIndex++] = 1;
  484. else
  485. pBreakPos[iBreakIndex]++;
  486. else
  487. pBreakPos[iBreakIndex - 1]++;
  488. break;
  489. }
  490. dirPrevious = dirCurrent;
  491. pwch++;
  492. pwszRunStart = pwch;
  493. }
  494. assert(pwszRunStart == pwch);
  495. if( iBreakIndex >= iBreakMax || pwch >= pwszMax)
  496. break;
  497. // Detect if this is a Thai Run.
  498. fThaiRun = IsThaiChar(*pwch);
  499. do
  500. {
  501. pwch++;
  502. iRunCount++;
  503. } while ((IsThaiChar(*pwch)==fThaiRun &&
  504. iRunCount < (MAXBREAK - 2) &&
  505. *pwch &&
  506. !TWB_IsCharWordDelimW(*pwch) &&
  507. (pwch < pwszMax) ) ||
  508. ( ( *pwch == 0x2c || *pwch == 0x2e) && (iRunCount < (MAXBREAK - 2)) && (pwch < pwszMax) ));
  509. if (fThaiRun)
  510. {
  511. unsigned int iBreak = breakTree->TrigramBreak(pwszRunStart,pwch);
  512. for (i=0; i < iBreak && iBreakIndex <iBreakMax; i++)
  513. {
  514. // First Thai character of the run.
  515. if (dirPrevious == MERGE_RIGHT)
  516. {
  517. assert(iBreakIndex != 0);
  518. pBreakPos[iBreakIndex - 1] += breakTree->breakArray[i];
  519. }
  520. else
  521. pBreakPos[iBreakIndex++] = breakTree->breakArray[i];
  522. dirPrevious = NO_MERGE;
  523. }
  524. }
  525. else
  526. {
  527. // Not a Thai Run simply put the whole thing in the break array.
  528. assert(pwch > pwszRunStart); // pwch must be greater than pwszRunStart, since we just walk.
  529. if (dirPrevious == MERGE_RIGHT)
  530. {
  531. assert(iBreakIndex != 0);
  532. pBreakPos[iBreakIndex - 1] += (BYTE) (pwch - pwszRunStart);
  533. }
  534. else
  535. pBreakPos[iBreakIndex++] = (BYTE) (pwch - pwszRunStart);
  536. }
  537. iRunCount = 0;
  538. pwszRunStart = pwch;
  539. // Make sure we haven't pass iBreakMax define by user else return whatever we got.
  540. } while(iBreakIndex < iBreakMax && pwch < pwszMax);
  541. break;
  542. case WB_INDEX:
  543. // Make sure argument is the same.
  544. assert(pThwb_Struct != NULL);
  545. if (pThwb_Struct == NULL)
  546. return 0;
  547. do
  548. {
  549. while (TWB_IsCharWordDelimW(*pwch) && pwszMax > pwch)
  550. pwch++;
  551. if( pwszRunStart < pwch)
  552. {
  553. pBreakPos[iBreakIndex++] = (BYTE)(pwch - pwszRunStart);
  554. pwszRunStart = pwch;
  555. }
  556. if( iBreakIndex >= iBreakMax || pwch >= pwszMax)
  557. break;
  558. // Detect if this is a Thai Run.
  559. fThaiRun = IsThaiChar(*pwch); //TODO: Add comma and period to Thai range.
  560. do
  561. {
  562. pwch++;
  563. iRunCount++;
  564. } while ((IsThaiChar(*pwch)==fThaiRun &&
  565. iRunCount < (MAXBREAK - 2) &&
  566. *pwch &&
  567. !TWB_IsCharWordDelimW(*pwch) &&
  568. (pwch < pwszMax) ) ||
  569. ( ( *pwch == 0x2c || *pwch == 0x2e) && (iRunCount < (MAXBREAK - 2)) && (pwch < pwszMax) ));
  570. if (fThaiRun)
  571. {
  572. unsigned int iBreak = breakTree->TrigramBreak(pwszRunStart,pwch);
  573. for (i=0; i < iBreak && iBreakIndex <iBreakMax; i++)
  574. {
  575. pThwb_Struct[iBreakIndex].fThai = true;
  576. pThwb_Struct[iBreakIndex].alt = ExtractALT(breakTree->tagArray[i]);
  577. pBreakPos[iBreakIndex++] = breakTree->breakArray[i];
  578. }
  579. }
  580. else
  581. {
  582. // Not a Thai Run simply put the whole thing in the break array.
  583. assert(pwch > pwszRunStart); // pwch must be greater than pwszRunStart, since we just walk.
  584. pThwb_Struct[iBreakIndex].fThai = false;
  585. pThwb_Struct[iBreakIndex].alt = 0;
  586. pBreakPos[iBreakIndex++] = (BYTE)(pwch - pwszRunStart);
  587. }
  588. iRunCount = 0;
  589. pwszRunStart = pwch;
  590. // Make sure we haven't pass iBreakMax define by user else return whatever we got.
  591. } while(iBreakIndex < iBreakMax && pwch < pwszMax);
  592. break;
  593. case WB_CARETBREAK:
  594. fCaretBreak = true;
  595. case WB_NORMAL:
  596. default:
  597. do
  598. {
  599. while (TWB_IsCharWordDelimW(*pwch) && pwszMax > pwch)
  600. pwch++;
  601. if (fCaretBreak)
  602. {
  603. // 010.181686. Taking care of puntuation.
  604. while (TWB_IsCharPunctW(*pwch) && pwszMax > pwch)
  605. pwch++;
  606. }
  607. if( pwszRunStart < pwch)
  608. {
  609. if (fCaretBreak && *pwszRunStart == L' ' && iBreakIndex > 0)
  610. {
  611. // 010.182719. For the MaiYaMok case we only accept if
  612. // space follow by MaiYaMok
  613. if (*pwch == THAI_Vowel_MaiYaMok &&
  614. wzString < (pwszRunStart-1) &&
  615. IsThaiChar(*(pwszRunStart-1)) &&
  616. pwch == (pwszRunStart+1) )
  617. {
  618. pBreakPos[iBreakIndex - 1] += 2;
  619. pwch++;
  620. }
  621. else
  622. // This is a caret movement features, should merge space to
  623. // the right words.
  624. pBreakPos[iBreakIndex - 1] += (BYTE)(pwch - pwszRunStart);
  625. }
  626. else
  627. pBreakPos[iBreakIndex++] = (BYTE)(pwch - pwszRunStart);
  628. pwszRunStart = pwch;
  629. }
  630. if( iBreakIndex >= iBreakMax || pwch >= pwszMax)
  631. break;
  632. // Detect if this is a Thai Run.
  633. fThaiRun = IsThaiChar(*pwch); //TODO: Add comma and period to Thai range.
  634. if (!fCaretBreak)
  635. {
  636. do
  637. {
  638. pwch++;
  639. iRunCount++;
  640. } while ((IsThaiChar(*pwch)==fThaiRun &&
  641. iRunCount < (MAXBREAK - 2) &&
  642. *pwch &&
  643. !TWB_IsCharWordDelimW(*pwch) &&
  644. (pwch < pwszMax) ) ||
  645. ( ( *pwch == 0x2c || *pwch == 0x2e) && (iRunCount < (MAXBREAK - 2)) && (pwch < pwszMax) ));
  646. }
  647. else
  648. {
  649. do
  650. {
  651. pwch++;
  652. iRunCount++;
  653. } while ((IsThaiChar(*pwch)==fThaiRun &&
  654. iRunCount < (MAXBREAK - 2) &&
  655. *pwch &&
  656. !TWB_IsCharWordDelimW(*pwch) &&
  657. !TWB_IsCharPunctW(*pwch) &&
  658. (pwch < pwszMax) ) ||
  659. ( ( *pwch == 0x2c || *pwch == 0x2e) && (iRunCount < (MAXBREAK - 2)) && (pwch < pwszMax) ));
  660. }
  661. if (fThaiRun)
  662. {
  663. #if defined (NGRAM_ENABLE)
  664. if (!fFastWordBreak)
  665. {
  666. if (WordBreak(pwszRunStart,pwch))
  667. for (i=0; i < breakTree.maxToken && iBreakIndex <iBreakMax; i++)
  668. pBreakPos[iBreakIndex++] = breakTree->maximalMatchingBreakArray[i];
  669. }
  670. else
  671. {
  672. unsigned int iBreak = breakTree->TrigramBreak(pwszRunStart,pwch);
  673. for (i=0; i < iBreak && iBreakIndex <iBreakMax; i++)
  674. pBreakPos[iBreakIndex++] = breakTree->breakArray[i];
  675. }
  676. #else
  677. unsigned int iBreak = breakTree->TrigramBreak(pwszRunStart,pwch);
  678. for (i=0; i < iBreak && iBreakIndex <iBreakMax; i++)
  679. pBreakPos[iBreakIndex++] = breakTree->breakArray[i];
  680. #endif
  681. }
  682. else
  683. {
  684. // Not a Thai Run simply put the whole thing in the break array.
  685. assert(pwch > pwszRunStart); // pwch must be greater than pwszRunStart, since we just walk.
  686. pBreakPos[iBreakIndex++] = (BYTE)(pwch - pwszRunStart);
  687. }
  688. iRunCount = 0;
  689. pwszRunStart = pwch;
  690. // Make sure we haven't pass iBreakMax define by user else return whatever we got.
  691. } while(iBreakIndex < iBreakMax && pwch < pwszMax);
  692. break;
  693. }
  694. #if defined (_DEBUG)
  695. unsigned int iTotalChar = 0;
  696. for (i = 0; i < iBreakIndex; i++)
  697. {
  698. iTotalChar += pBreakPos[i];
  699. }
  700. if (iBreakIndex < iBreakMax)
  701. assert(iStringLen == iTotalChar);
  702. #endif
  703. return iBreakIndex;
  704. }
  //+---------------------------------------------------------------------------
  //
  // Class: CThaiWordBreak
  //
  // Synopsis: Builds the break tree for [pszBegin, pszEnd) and runs
  //           maximal matching on it. Only compiled when NGRAM_ENABLE is
  //           defined.
  //
  // Arguments:
  // pszBegin - first character of the run; must be before pszEnd. (in)
  // pszEnd   - end of the run, passed to GenerateTree. (in)
  //
  // Returns: non-zero when breakTree.maxToken is greater than 0 afterwards.
  //
  // Modifies: breakTree.
  //
  // History: created 7/99 aarayas
  //
  // Notes: NOTE(review): breakTree here is not a local or a parameter, so
  //        it is presumably a class member - confirm in cthwb.hpp.
  //
  //----------------------------------------------------------------------------
  #if defined (NGRAM_ENABLE)
  BOOL CThaiWordBreak::WordBreak(WCHAR* pszBegin, WCHAR* pszEnd)
  {
      assert(pszBegin < pszEnd); // Make sure pszEnd is at least greater pszBegin.

      breakTree.GenerateTree(pszBegin, pszEnd);
      breakTree.MaximalMatching();

      return (breakTree.maxToken > 0);
  }
  #endif
  734. //+---------------------------------------------------------------------------
  735. //
  736. // Class: CThaiWordBreak
  737. //
  738. // Synopsis:
  739. //
  740. // Arguments:
  741. //
  742. // Modifies:
  743. //
  744. // History: created 7/99 aarayas
  745. //
  746. // Notes:
  747. //
  748. //----------------------------------------------------------------------------
  749. BOOL CThaiWordBreak::Find(const WCHAR* wzString, DWORD* pdwPOS)
  750. {
  751. return m_trie.Find(wzString, pdwPOS);
  752. }