Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2169 lines
71 KiB

  1. //+---------------------------------------------------------------------------
  2. //
  3. //
  4. // CThaiBreakTree - class CThaiBreakTree
  5. //
  6. // History:
  7. // created 7/99 aarayas
  8. //
  9. // (c) 1999 Microsoft Corporation
  10. //----------------------------------------------------------------------------
  11. #include "CThaiBreakTree.hpp"
  12. //+---------------------------------------------------------------------------
  13. //
  14. // Function: ExtractPOS
  15. //
  16. // Synopsis: This function takes a tag and returns its part-of-speech field.
  17. //
  18. // Arguments:
  19. //
  20. // Modifies:
  21. //
  22. // History: created 7/99 aarayas
  23. //
  24. // Notes:
  25. //
  26. //----------------------------------------------------------------------------
  27. inline WCHAR ExtractPOS(DWORD dwTag)
  28. {
  29. return (WCHAR) ( (dwTag & iPosMask) >> iPosShift);
  30. }
  31. //+---------------------------------------------------------------------------
  32. //
  33. // Function: ExtractFrq
  34. //
  35. // Synopsis: This function takes a tag and returns the frequency field of the word.
  36. //
  37. // Arguments:
  38. //
  39. // Modifies:
  40. //
  41. // History: created 7/99 aarayas
  42. //
  43. // Notes:
  44. //
  45. //----------------------------------------------------------------------------
  46. inline BYTE ExtractFrq(DWORD dwTag)
  47. {
  48. return (BYTE) ( (dwTag & 0x300) >> iFrqShift);
  49. }
  50. //+---------------------------------------------------------------------------
  51. //
  52. // Function: DetermineFrequencyWeight
  53. //
  54. // Synopsis: This function adjusts a weight according to the frequency class of a word.
  55. //
  56. // Arguments:
  57. //
  58. // Modifies:
  59. //
  60. // History: created 7/99 aarayas
  61. //
  62. // Notes:
  63. //
  64. //----------------------------------------------------------------------------
  65. inline void DetermineFrequencyWeight(BYTE frq, unsigned int* uiWeight)
  66. {
  67. switch (frq)
  68. {
  69. case frqpenInfrequent:
  70. (*uiWeight) -= 2;
  71. break;
  72. case frqpenSomewhat:
  73. (*uiWeight)--;
  74. break;
  75. case frqpenVery:
  76. (*uiWeight) += 2;
  77. break;
  78. case frqpenNormal:
  79. default:
  80. (*uiWeight)++;
  81. break;
  82. }
  83. }
  84. //+---------------------------------------------------------------------------
  85. //
  86. // Function: DetermineFrequencyWeight
  87. //
  88. // Synopsis: This function adjusts a weight according to the frequency class of a word.
  89. //
  90. // Arguments:
  91. //
  92. // Modifies:
  93. //
  94. // History: created 7/99 aarayas
  95. //
  96. // Notes:
  97. //
  98. //----------------------------------------------------------------------------
  99. inline void DetermineFrequencyWeight(BYTE frq, DWORD* uiWeight)
  100. {
  101. switch (frq)
  102. {
  103. case frqpenInfrequent:
  104. (*uiWeight) -= 2;
  105. break;
  106. case frqpenSomewhat:
  107. (*uiWeight)--;
  108. break;
  109. case frqpenVery:
  110. (*uiWeight) += 2;
  111. break;
  112. case frqpenNormal:
  113. default:
  114. (*uiWeight)++;
  115. break;
  116. }
  117. }
  118. //+---------------------------------------------------------------------------
  119. //
  120. // Class: CThaiBreakTree
  121. //
  122. // Synopsis: Constructor - initialize local variables
  123. //
  124. // Arguments:
  125. //
  126. // Modifies:
  127. //
  128. // History: created 7/99 aarayas
  129. //
  130. // Notes:
  131. //
  132. //----------------------------------------------------------------------------
  133. CThaiBreakTree::CThaiBreakTree() : iNodeIndex(0), iNumNode(0),
  134. pszBegin(NULL), pszEnd(NULL),
  135. breakTree(NULL), breakArray(NULL),
  136. tagArray(NULL), maximalMatchingBreakArray(NULL),
  137. maximalMatchingTAGArray(NULL),
  138. POSArray(NULL), maximalMatchingPOSArray(NULL)
  139. {
  140. // Allocate memory need for CThaiBreakTree.
  141. #if defined (NGRAM_ENABLE)
  142. breakTree = new ThaiBreakNode[MAXTHAIBREAKNODE];
  143. #endif
  144. breakArray = new BYTE[MAXBREAK];
  145. tagArray = new DWORD[MAXBREAK];
  146. POSArray = new WCHAR[MAXBREAK];
  147. }
  148. //+---------------------------------------------------------------------------
  149. //
  150. // Class: CThaiBreakTree
  151. //
  152. // Synopsis: Destructor - clean up code
  153. //
  154. // Arguments:
  155. //
  156. // Modifies:
  157. //
  158. // History: created 7/99 aarayas
  159. //
  160. // Notes:
  161. //
  162. //----------------------------------------------------------------------------
  163. CThaiBreakTree::~CThaiBreakTree()
  164. {
  165. // Clean up all memory used.
  166. #if defined (NGRAM_ENABLE)
  167. if (breakTree)
  168. delete breakTree;
  169. if (maximalMatchingBreakArray)
  170. delete maximalMatchingBreakArray;
  171. if (maximalMatchingTAGArray)
  172. delete maximalMatchingTAGArray;
  173. if (maximalMatchingPOSArray)
  174. delete maximalMatchingPOSArray;
  175. #endif
  176. if (breakArray)
  177. delete breakArray;
  178. if (tagArray)
  179. delete tagArray;
  180. if (POSArray)
  181. delete POSArray;
  182. }
  183. //+---------------------------------------------------------------------------
  184. //
  185. // Class: CThaiBreakTree
  186. //
  187. // Synopsis: Associate the class to the string.
  188. //
  189. // Arguments:
  190. //
  191. // Modifies:
  192. //
  193. // History: created 7/99 aarayas
  194. //
  195. // Notes:
  196. //
  197. //----------------------------------------------------------------------------
  198. #if defined (NGRAM_ENABLE)
  199. void CThaiBreakTree::Init(CTrie* pTrie, CTrie* pSentTrie, CTrie* pTrigramTrie)
  200. #else
  201. void CThaiBreakTree::Init(CTrie* pTrie, CTrie* pTrigramTrie)
  202. #endif
  203. {
  204. assert(pTrie != NULL);
  205. thaiTrieIter.Init(pTrie);
  206. thaiTrieIter1.Init(pTrie);
  207. #if defined (NGRAM_ENABLE)
  208. assert(pSentTrie != NULL);
  209. thaiSentIter.Init(pSentTrie);
  210. #endif
  211. assert(pTrigramTrie != NULL);
  212. thaiTrigramIter.Init(pTrigramTrie);
  213. }
  214. #if defined (NGRAM_ENABLE)
  215. //+---------------------------------------------------------------------------
  216. //
  217. // Class: CThaiBreakTree
  218. //
  219. // Synopsis: reset iterator to top of the tree
  220. //
  221. // Arguments:
  222. //
  223. // Modifies:
  224. //
  225. // History: created 7/99 aarayas
  226. //
  227. // Notes:
  228. //
  229. //----------------------------------------------------------------------------
  230. inline void CThaiBreakTree::Reset()
  231. {
  232. iNodeIndex = 0;
  233. }
  234. //+---------------------------------------------------------------------------
  235. //
  236. // Class: CThaiBreakTree
  237. //
  238. // Synopsis: Move to the next break.
  239. //
  240. // Arguments:
  241. //
  242. // Modifies:
  243. //
  244. // History: created 7/99 aarayas
  245. //
  246. // Notes:
  247. //
  248. //----------------------------------------------------------------------------
  249. inline bool CThaiBreakTree::MoveNext()
  250. {
  251. iNodeIndex = breakTree[iNodeIndex].NextBreak;
  252. return (iNodeIndex != 0);
  253. }
  254. //+---------------------------------------------------------------------------
  255. //
  256. // Class: CThaiBreakTree
  257. //
  258. // Synopsis: Move down to next level.
  259. //
  260. // Arguments:
  261. //
  262. // Modifies:
  263. //
  264. // History: created 7/99 aarayas
  265. //
  266. // Notes:
  267. //
  268. //----------------------------------------------------------------------------
  269. inline bool CThaiBreakTree::MoveDown()
  270. {
  271. iNodeIndex = breakTree[iNodeIndex].Down;
  272. return (iNodeIndex != 0);
  273. }
  274. //+---------------------------------------------------------------------------
  275. //
  276. // Class: CThaiBreakTree
  277. //
  278. // Synopsis: create new node to position, and return index to the node.
  279. //
  280. // * return Unable to Create Node.
  281. //
  282. // Arguments:
  283. //
  284. // Modifies:
  285. //
  286. // History: created 7/99 aarayas
  287. //
  288. // Notes:
  289. //
  290. //----------------------------------------------------------------------------
  291. inline unsigned int CThaiBreakTree::CreateNode(int iPos, BYTE iBreakLen, DWORD dwTAG)
  292. {
  293. assert(iNumNode < MAXTHAIBREAKNODE);
  294. if (iNumNode >= MAXTHAIBREAKNODE)
  295. {
  296. return UNABLETOCREATENODE;
  297. }
  298. breakTree[iNumNode].iPos = iPos;
  299. breakTree[iNumNode].iBreakLen = iBreakLen;
  300. breakTree[iNumNode].dwTAG = dwTAG;
  301. breakTree[iNumNode].NextBreak = 0;
  302. breakTree[iNumNode].Down = 0;
  303. iNumNode++;
  304. return (iNumNode - 1);
  305. }
  306. //+---------------------------------------------------------------------------
  307. //
  308. // Class: CThaiBreakTree
  309. //
  310. // Synopsis: Generate a Tree of possible break from the given string.
  311. //
  312. // * Note - returns false if there is not enough memory to create a node.
  313. //
  314. // Arguments:
  315. //
  316. // Modifies:
  317. //
  318. // History: created 7/99 aarayas
  319. //
  320. // Notes:
  321. //
  322. //----------------------------------------------------------------------------
  // Outcome of one dictionary walk inside GenerateTree.
  enum thai_parse_state
  {
      END_SENTENCE,    // Reached the end of the sentence.
      LONGEST_MATCH,   // Longest possible match was found.
      NOMATCH_FOUND,   // No dictionary word could be found.
      ERROR_OUTMEMORY  // A node could not be created (out of memory).
  };
  // GenerateTree - build breakTree, the tree of candidate word breaks, for the
  // text in [pszBegin, pszEnd1).
  //
  //   pszBegin - start of the text. NOTE: this parameter shadows the class
  //              member of the same name; the member is not assigned here.
  //   pszEnd1  - end of the text; copied into the member pszEnd, which the
  //              GetWeight helpers read.
  //
  // The node count (iNumNode) and cursor (iNodeIndex) are reset to 0 first.
  // Returns false only when parseState ends up as ERROR_OUTMEMORY, i.e. when
  // CreateNode returned UNABLETOCREATENODE; otherwise returns true.
  329. bool CThaiBreakTree::GenerateTree(WCHAR* pszBegin, WCHAR* pszEnd1)
  330. {
  331. // Declare and initialize local variables.
  332. unsigned int iIndexBreakTree = 0;
  333. unsigned int iPrevIndexBreakTree = 0;
  334. unsigned int iParentNode = 0;
  335. WCHAR* pszBeginWord = pszBegin;
  336. WCHAR* pszIndex = pszBegin;
  337. unsigned int iNumCluster = 1;
  338. unsigned int iNumLastCluster;
  339. unsigned int iWordLen = 0;
  340. unsigned int iNodeAnalyze = 0;
  341. thai_parse_state parseState = END_SENTENCE;
  342. bool fFoundMatch = false;
  343. bool fAddToNodeAnalyze = false;
  344. bool fDoneGenerateTree = false;
  345. pszEnd = pszEnd1;
  // Debug builds start from a zero-filled node pool; release builds rely on
  // CreateNode initialising every field of each node it hands out.
  346. #if defined (_DEBUG)
  347. memset(breakTree,0,sizeof(ThaiBreakNode)*MAXTHAIBREAKNODE);
  348. #endif
  349. iNodeIndex = 0;
  350. iNumNode = 0;
  // Each pass of this loop expands one node (iNodeAnalyze): it walks the
  // dictionary trie from the text position just after that node's break and
  // records every word ending it finds as a new node.
  351. while (true)
  352. {
  353. // Reset Iterator for generating break for new word.
  354. fFoundMatch = false;
  355. thaiTrieIter.Reset();
  // Once at least one node exists, choose the next node to expand. Nodes whose
  // break already ends at pszEnd, or that are tagged TAGPOS_PURGE, are skipped;
  // running out of nodes means the tree is complete.
  356. if (iIndexBreakTree != 0)
  357. {
  358. while (true)
  359. {
  360. // If this is not the first node than set pszBeginWord after the last break.
  361. pszBeginWord = pszBegin + breakTree[iNodeAnalyze].iPos + breakTree[iNodeAnalyze].iBreakLen;
  362. fAddToNodeAnalyze = true;
  363. // Are we at the end of the sentence.
  364. if ( (pszBeginWord == pszEnd) ||
  365. (breakTree[iNodeAnalyze].dwTAG == TAGPOS_PURGE) )
  366. {
  367. iNodeAnalyze++; // Move to next node.
  368. if (iNodeAnalyze >= iNumNode)
  369. {
  370. fDoneGenerateTree = true;
  371. break;
  372. }
  373. }
  374. else
  375. break;
  376. }
  377. }
  378. pszIndex = pszBeginWord;
  379. iParentNode = iNodeAnalyze;
  380. if (fDoneGenerateTree)
  381. break;
  382. // Get next level of tree.
  // Walk the trie one cluster at a time. Every time the iterator reports
  // fWordEnd a candidate node is created: the first candidate found for the
  // analysed node is linked through that node's NextBreak, and each later
  // (longer) candidate is linked through the previous candidate's Down.
  383. while (TRUE)
  384. {
  385. iNumLastCluster = iNumCluster;
  386. iNumCluster = GetCluster(pszIndex);
  387. if (thaiTrieIter.MoveCluster(pszIndex, iNumCluster))
  388. {
  389. pszIndex += iNumCluster;
  390. if (thaiTrieIter.fWordEnd)
  391. {
  392. fFoundMatch = true;
  393. // if first node add first node
  394. if (iIndexBreakTree == 0)
  395. {
  396. CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, thaiTrieIter.dwTag);
  397. iIndexBreakTree++;
  398. }
  399. else
  400. {
  401. if (fAddToNodeAnalyze)
  402. {
  403. fAddToNodeAnalyze = false;
  404. breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, thaiTrieIter.dwTag);
  405. // Determine if an error has occur.
  406. if (breakTree[iNodeAnalyze].NextBreak == UNABLETOCREATENODE)
  407. {
  408. breakTree[iNodeAnalyze].NextBreak = 0;
  409. parseState = ERROR_OUTMEMORY;
  410. break;
  411. }
  412. iPrevIndexBreakTree = breakTree[iNodeAnalyze].NextBreak;
  413. iNodeAnalyze++;
  414. }
  415. else
  416. {
  417. breakTree[iPrevIndexBreakTree].Down = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, thaiTrieIter.dwTag);
  418. // Determine if an error has occur.
  419. if (breakTree[iPrevIndexBreakTree].Down == UNABLETOCREATENODE)
  420. {
  421. breakTree[iPrevIndexBreakTree].Down = 0;
  422. parseState = ERROR_OUTMEMORY;
  423. break;
  424. }
  425. iPrevIndexBreakTree = iIndexBreakTree;
  426. }
  427. iIndexBreakTree++;
  428. }
  429. }
  430. if (pszIndex >= pszEnd)
  431. {
  432. assert(pszIndex <= pszEnd); // assert should never come up - if it appear likely bug in GetCluster funciton.
  433. parseState = END_SENTENCE;
  434. break;
  435. }
  436. }
  437. else
  438. {
  439. if (fFoundMatch)
  440. parseState = LONGEST_MATCH;
  441. else
  442. parseState = NOMATCH_FOUND;
  443. break;
  444. }
  445. }
  // Post-process the walk according to how it ended.
  446. if (parseState == LONGEST_MATCH)
  447. {
  448. // We found a matched.
  449. assert(breakTree[iPrevIndexBreakTree].Down == 0); // at this point breakTree[iPreveIndexBreakTree].Down should equal null.(optimization note)
  450. if (breakTree[iParentNode].NextBreak != iPrevIndexBreakTree)
  451. {
  452. assert(breakTree[iPrevIndexBreakTree].dwTAG != TAGPOS_UNKNOWN); // shouldn't assert because the end node should ever be unknown.
  453. DeterminePurgeEndingSentence(pszBeginWord, breakTree[iParentNode].NextBreak);
  454. }
  455. }
  456. else if (parseState == NOMATCH_FOUND)
  457. {
  458. // Should mark node as unknown.
  // No dictionary word starts here. Short unknown text is folded into the
  // analysed node (its iBreakLen grows and its tag is re-evaluated); longer
  // unknown text becomes a node of its own tagged TAGPOS_UNKNOWN.
  459. if (fAddToNodeAnalyze)
  460. {
  461. fAddToNodeAnalyze = false;
  462. iWordLen = pszIndex - pszBeginWord;
  463. // Make sure we don't only have a cluster of text before making a node.
  464. if (iWordLen == 0)
  465. {
  466. // If we have an UNKNOWN word of one character only current node mark it as unknown.
  467. assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode
  468. breakTree[iNodeAnalyze].iBreakLen += iNumCluster;
  469. breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen);
  470. }
  471. else
  472. {
  473. if (breakTree[iNodeAnalyze].iBreakLen + iWordLen < 8)
  474. // The reason we are using 8 is because from corpora analysis
  475. // the average Thai word is about 7.732 characters.
  476. // TODO: We should add orthographic analysis here to get a better on boundary
  477. // of unknown word.
  478. {
  479. assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode
  480. breakTree[iNodeAnalyze].iBreakLen += iWordLen;
  481. breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen);
  482. }
  483. else
  484. {
  // If a dictionary word can start at the last cluster consumed, leave that
  // cluster out of the unknown node.
  485. if (GetWeight(pszIndex - iNumLastCluster))
  486. breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen - iNumLastCluster, TAGPOS_UNKNOWN);
  487. else
  488. breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen, TAGPOS_UNKNOWN);
  489. // Determine if an error has occur.
  490. if (breakTree[iNodeAnalyze].NextBreak == UNABLETOCREATENODE)
  491. {
  492. breakTree[iNodeAnalyze].NextBreak = 0;
  493. parseState = ERROR_OUTMEMORY;
  494. break;
  495. }
  496. iNodeAnalyze++;
  497. iIndexBreakTree++;
  498. }
  499. }
  500. }
  501. else
  502. {
  503. breakTree[iPrevIndexBreakTree].Down = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, TAGPOS_UNKNOWN);
  504. // Determine if an error has occur.
  505. if (breakTree[iPrevIndexBreakTree].Down == UNABLETOCREATENODE)
  506. {
  507. breakTree[iPrevIndexBreakTree].Down = 0;
  508. parseState = ERROR_OUTMEMORY;
  509. break;
  510. }
  511. iIndexBreakTree++;
  512. }
  513. }
  514. else if (parseState == END_SENTENCE)
  515. {
  516. // If we find ourself at the end of a sentence and no match.
  // The unmatched case below mirrors the NOMATCH_FOUND handling above, except
  // that iIndexBreakTree is also incremented once after the if/else.
  517. if (!fFoundMatch)
  518. {
  519. if (fAddToNodeAnalyze)
  520. {
  521. fAddToNodeAnalyze = false;
  522. iWordLen = pszIndex - pszBeginWord;
  523. // Make sure we don't only have a cluster of text before making a node.
  524. if (iWordLen == 0)
  525. {
  526. // If we have an UNKNOWN word of one character only current node mark it as unknown.
  527. assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode
  528. breakTree[iNodeAnalyze].iBreakLen += iNumCluster;
  529. breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen);
  530. }
  531. else
  532. {
  533. if (breakTree[iNodeAnalyze].iBreakLen + iWordLen < 8)
  534. // The reason we are using 8 is because from corpora analysis
  535. // the average Thai word is about 7.732 characters.
  536. // TODO: We should add orthographic analysis here to get a better on boundary
  537. // of unknown word.
  538. {
  539. assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode
  540. breakTree[iNodeAnalyze].iBreakLen += iWordLen;
  541. breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen);
  542. }
  543. else
  544. {
  545. if (GetWeight(pszIndex - iNumLastCluster))
  546. breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen - iNumLastCluster, TAGPOS_UNKNOWN);
  547. else
  548. breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen, TAGPOS_UNKNOWN);
  549. // Determine if an error has occur.
  550. if (breakTree[iNodeAnalyze].NextBreak == UNABLETOCREATENODE)
  551. {
  552. breakTree[iNodeAnalyze].NextBreak = 0;
  553. parseState = ERROR_OUTMEMORY;
  554. break;
  555. }
  556. iNodeAnalyze++;
  557. iIndexBreakTree++;
  558. }
  559. }
  560. }
  561. else
  562. {
  563. breakTree[iPrevIndexBreakTree].Down = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, TAGPOS_UNKNOWN);
  564. // Determine if an error has occur.
  565. if (breakTree[iPrevIndexBreakTree].Down == UNABLETOCREATENODE)
  566. {
  567. breakTree[iPrevIndexBreakTree].Down = 0;
  568. parseState = ERROR_OUTMEMORY;
  569. break;
  570. }
  571. }
  572. iIndexBreakTree++;
  573. }
  574. // If the beginning of node the branch isn't equal to leaf node perphase it is possible to
  575. // do some ending optimization.
  576. else if (breakTree[iParentNode].NextBreak != iPrevIndexBreakTree)
  577. {
  578. assert(breakTree[iPrevIndexBreakTree].dwTAG != TAGPOS_UNKNOWN); // shouldn't assert because the end node should ever be unknown.
  579. DeterminePurgeEndingSentence(pszBeginWord, breakTree[iParentNode].NextBreak);
  580. }
  581. }
  // The three other states are handled above, so only ERROR_OUTMEMORY (set
  // inside the trie walk) reaches this test, and it always leaves the loop.
  582. else if ( (breakTree[iNodeAnalyze].iBreakLen == 0) || (parseState == ERROR_OUTMEMORY) )
  583. break;
  584. }
  585. return (parseState != ERROR_OUTMEMORY);
  586. }
  587. //+---------------------------------------------------------------------------
  588. //
  589. // Class: CThaiBreakTree
  590. //
  591. // Synopsis: Traverse all the tree and look for the least number of token.
  592. //
  593. // Arguments:
  594. //
  595. // Modifies:
  596. //
  597. // History: created 7/99 aarayas
  598. //
  599. // Notes:
  600. //
  601. //----------------------------------------------------------------------------
  602. bool CThaiBreakTree::MaximalMatching()
  603. {
  604. // If maximal matching break array has not been allocate, than allocate it.
  605. if (!maximalMatchingBreakArray)
  606. maximalMatchingBreakArray = new BYTE[MAXBREAK];
  607. if (!maximalMatchingTAGArray)
  608. maximalMatchingTAGArray = new DWORD[MAXBREAK];
  609. if (!maximalMatchingPOSArray)
  610. maximalMatchingPOSArray = new WCHAR[MAXBREAK];
  611. maxLevel = MAXUNSIGNEDINT;
  612. maxToken = 0;
  613. iNumUnknownMaximalPOSArray = MAXBREAK;
  614. Traverse(0,0,0);
  615. return true;
  616. }
  617. //+---------------------------------------------------------------------------
  618. //
  619. // Class: CThaiBreakTree
  620. //
  621. // Synopsis: The function determines whether the node should
  622. // be tagged as unknown or purged.
  623. //
  624. // Arguments:
  625. //
  626. // Modifies:
  627. //
  628. // History: created 8/99 aarayas
  629. //
  630. // Notes:
  631. //
  632. //----------------------------------------------------------------------------
  633. inline DWORD CThaiBreakTree::DeterminePurgeOrUnknown(unsigned int iCurrentNode, unsigned int iBreakLen)
  634. {
  635. // Declare and initialize local variables.
  636. unsigned int iNode = breakTree[iCurrentNode].Down;
  637. while (iNode != 0)
  638. {
  639. if ( (breakTree[iNode].iBreakLen == iBreakLen) ||
  640. (breakTree[iNode].iBreakLen < iBreakLen) &&
  641. ( (breakTree[iNode].dwTAG != TAGPOS_UNKNOWN) ||
  642. (breakTree[iNode].dwTAG != TAGPOS_PURGE) ))
  643. {
  644. // Since we are purging this break just make sure the NextBreak is Null.
  645. assert(breakTree[iCurrentNode].NextBreak == 0);
  646. return TAGPOS_PURGE;
  647. }
  648. iNode = breakTree[iNode].Down;
  649. }
  650. return TAGPOS_UNKNOWN;
  651. }
  652. //+---------------------------------------------------------------------------
  653. //
  654. // Class: CThaiBreakTree
  655. //
  656. // Synopsis: Ending optimization - if we have found the end of a sentence,
  657. // and possible break. Purge the branch for unnecessary break.
  658. //
  659. // Arguments:
  660. //
  661. // Modifies:
  662. //
  663. // History: created 8/99 aarayas
  664. //
  665. // Notes:
  666. //
  667. //----------------------------------------------------------------------------
  668. inline void CThaiBreakTree::DeterminePurgeEndingSentence(WCHAR* pszBeginWord, unsigned int iNode)
  669. {
  670. while (breakTree[iNode].Down != 0)
  671. {
  672. // Determine if the next string has a possiblity to become a word.
  673. // TODO: We may need to change this once the GetWeight add soundex
  674. // functionality.
  675. if (GetWeight(pszBeginWord + breakTree[iNode].iBreakLen) == 0)
  676. {
  677. // Since we are purging this break just make sure the NextBreak is Null.
  678. assert(breakTree[iNode].NextBreak == 0);
  679. breakTree[iNode].dwTAG = TAGPOS_PURGE;
  680. }
  681. iNode = breakTree[iNode].Down;
  682. }
  683. }
  684. #endif
  685. //+---------------------------------------------------------------------------
  686. //
  687. // Class: CThaiBreakTree
  688. //
  689. // Synopsis:
  690. //
  691. // Arguments:
  692. //
  693. // Modifies:
  694. //
  695. // History: created 8/99 aarayas
  696. //
  697. // Notes:
  698. //
  699. //----------------------------------------------------------------------------
  700. unsigned int CThaiBreakTree::GetLongestSubstring(WCHAR* pszBegin, unsigned int iWordLen)
  701. {
  702. // Declare and initialize local variables.
  703. unsigned int iNumCluster = 1;
  704. unsigned int lastWeight = 0;
  705. unsigned int Weight = 0;
  706. bool fBeginNewWord;
  707. WCHAR* pszIndex = pszBegin;
  708. // Short circuit the length is less of string is less than 1.
  709. if ((pszEnd - pszBegin) == 1)
  710. return Weight;
  711. else if (pszEnd == pszBegin)
  712. return 1000;
  713. // Reset Iterator for generating break for new word.
  714. fBeginNewWord = true;
  715. // Get next level of tree.
  716. while (true)
  717. {
  718. iNumCluster = GetCluster(pszIndex);
  719. if (thaiTrieIter.MoveCluster(pszIndex, iNumCluster, fBeginNewWord))
  720. {
  721. fBeginNewWord = false;
  722. pszIndex += iNumCluster;
  723. if (thaiTrieIter.fWordEnd)
  724. {
  725. lastWeight = Weight;
  726. Weight = (unsigned int) (pszIndex - pszBegin);
  727. }
  728. }
  729. else
  730. {
  731. if ((Weight == iWordLen) && (lastWeight < Weight) && (lastWeight > 0))
  732. {
  733. Weight = lastWeight;
  734. }
  735. break;
  736. }
  737. }
  738. return Weight;
  739. }
  740. //+---------------------------------------------------------------------------
  741. //
  742. // Class: CThaiBreakTree
  743. //
  744. // Synopsis:
  745. //
  746. // Arguments:
  747. //
  748. // Modifies:
  749. //
  750. // History: created 8/99 aarayas
  751. //
  752. // Notes:
  753. //
  754. //----------------------------------------------------------------------------
  755. unsigned int CThaiBreakTree::GetWeight(WCHAR* pszBegin)
  756. {
  757. // Declare and initialize local variables.
  758. unsigned int iNumCluster = 1;
  759. unsigned int Weight = 0;
  760. bool fBeginNewWord;
  761. WCHAR* pszIndex = pszBegin;
  762. // Short circuit the length is less of string is less than 1.
  763. if ((pszEnd - pszBegin) == 1)
  764. return Weight;
  765. else if (pszEnd == pszBegin)
  766. return 1000;
  767. // Reset Iterator for generating break for new word.
  768. fBeginNewWord = true;
  769. // Get next level of tree.
  770. while (true)
  771. {
  772. iNumCluster = GetCluster(pszIndex);
  773. if (thaiTrieIter.MoveCluster(pszIndex, iNumCluster, fBeginNewWord))
  774. {
  775. fBeginNewWord = false;
  776. pszIndex += iNumCluster;
  777. if (thaiTrieIter.fWordEnd)
  778. Weight = (unsigned int) (pszIndex - pszBegin);
  779. }
  780. else
  781. break;
  782. }
  783. return Weight;
  784. }
  785. //+---------------------------------------------------------------------------
  786. //
  787. // Class: CThaiBreakTree
  788. //
  789. // Synopsis:
  790. //
  791. // Arguments:
  792. //
  793. // Modifies:
  794. //
  795. // History: created 8/99 aarayas
  796. //
  797. // Notes:
  798. //
  799. //----------------------------------------------------------------------------
  800. unsigned int CThaiBreakTree::GetWeight(WCHAR* pszBegin, DWORD* pdwTag)
  801. {
  802. // Declare and initialize local variables.
  803. unsigned int iNumCluster = 1;
  804. unsigned int Weight = 0;
  805. bool fBeginNewWord;
  806. WCHAR* pszIndex = pszBegin;
  807. // Short circuit the length is less of string is less than 1.
  808. if ((pszEnd - pszBegin) == 1)
  809. return Weight;
  810. else if (pszEnd == pszBegin)
  811. return 1000;
  812. // Reset Iterator for generating break for new word.
  813. fBeginNewWord = true;
  814. // Get next level of tree.
  815. while (true)
  816. {
  817. iNumCluster = GetCluster(pszIndex);
  818. if (thaiTrieIter.MoveCluster(pszIndex, iNumCluster, fBeginNewWord))
  819. {
  820. fBeginNewWord = false;
  821. pszIndex += iNumCluster;
  822. if (thaiTrieIter.fWordEnd)
  823. {
  824. Weight = (unsigned int) (pszIndex - pszBegin);
  825. *pdwTag = thaiTrieIter.dwTag;
  826. }
  827. }
  828. else
  829. break;
  830. }
  831. return Weight;
  832. }
  833. //+---------------------------------------------------------------------------
  834. //
  835. // Class: CThaiBreakTree
  836. //
  837. // Synopsis: Traverse the tree.
  838. //
  839. // Arguments:
  840. //
  841. // Modifies:
  842. //
  843. // History: created 7/99 aarayas
  844. //
  845. // Notes:
  846. //
  847. //----------------------------------------------------------------------------
  848. bool CThaiBreakTree::Traverse(unsigned int iLevel, unsigned int iCurrentNode, unsigned int iNumUnknown)
  849. {
  850. assert (iLevel < MAXBREAK);
  851. // Process node.
  852. breakArray[iLevel] = breakTree[iCurrentNode].iBreakLen;
  853. tagArray[iLevel] = breakTree[iCurrentNode].dwTAG;
  854. if (tagArray[iLevel] == TAGPOS_UNKNOWN)
  855. iNumUnknown++;
  856. // Have we found the end of the sentence.
  857. if (breakTree[iCurrentNode].NextBreak == 0)
  858. {
  859. if (breakTree[iCurrentNode].dwTAG != TAGPOS_PURGE)
  860. AddBreakToList(iLevel + 1, iNumUnknown);
  861. if (breakTree[iCurrentNode].Down != 0)
  862. {
  863. if (tagArray[iLevel] == TAGPOS_UNKNOWN)
  864. iNumUnknown--;
  865. return Traverse(iLevel,breakTree[iCurrentNode].Down, iNumUnknown);
  866. }
  867. else
  868. return true;
  869. }
  870. else
  871. Traverse(iLevel + 1, breakTree[iCurrentNode].NextBreak, iNumUnknown);
  872. if (breakTree[iCurrentNode].Down != 0)
  873. {
  874. if (tagArray[iLevel] == TAGPOS_UNKNOWN)
  875. iNumUnknown--;
  876. Traverse(iLevel,breakTree[iCurrentNode].Down, iNumUnknown);
  877. }
  878. return true;
  879. }
  880. //+---------------------------------------------------------------------------
  881. //
  882. // Class: CThaiBreakTree
  883. //
  884. // Synopsis:
  885. //
  886. // Arguments:
  887. //
  888. // Modifies:
  889. //
  890. // History: created 8/99 aarayas
  891. //
  892. // Notes:
  893. //
  894. //----------------------------------------------------------------------------
  895. unsigned int CThaiBreakTree::SoundexSearch(WCHAR* pszBegin)
  896. {
  897. // Declare and initialize local variables.
  898. unsigned int iNumCluster = 1;
  899. unsigned int iNumNextCluster = 1;
  900. unsigned int iLongestWord = 0;
  901. unsigned int iPenalty = 0;
  902. WCHAR* pszIndex = pszBegin;
  903. // Short circuit the length is less of string is less than 1.
  904. if ( (pszBegin+1) >= pszEnd )
  905. return iLongestWord;
  906. // Reset Iterator for generating break for new word.
  907. thaiTrieIter1.Reset();
  908. // Get next level of tree.
  909. while (true)
  910. {
  911. iNumCluster = GetCluster(pszIndex);
  912. // Determine iNumNextCluster let iNumNextCluster = 0, if we reached the end of string.
  913. if (pszIndex + iNumCluster >= pszEnd)
  914. iNumNextCluster = 0;
  915. else
  916. iNumNextCluster = GetCluster(pszIndex+iNumCluster);
  917. // Determine penalty
  918. switch (thaiTrieIter1.MoveSoundexByCluster(pszIndex, iNumCluster, iNumNextCluster))
  919. {
  920. case SUBSTITUTE_SOUNDLIKECHAR:
  921. iPenalty += 2;
  922. break;
  923. case SUBSTITUTE_DIACRITIC:
  924. iPenalty++;
  925. break;
  926. case UNABLE_TO_MOVE:
  927. iPenalty += 2;
  928. break;
  929. case STOP_MOVE:
  930. iPenalty += 1000;
  931. break;
  932. default:
  933. case NOSUBSTITUTE:
  934. break;
  935. }
  936. // Update Index.
  937. if (iPenalty <= 2)
  938. {
  939. pszIndex += iNumCluster;
  940. if (thaiTrieIter1.fWordEnd)
  941. iLongestWord = (unsigned int) (pszIndex - pszBegin);
  942. }
  943. else
  944. break;
  945. }
  946. return iLongestWord;
  947. }
  948. //+---------------------------------------------------------------------------
  949. //
  950. // Class: CThaiBreakTree
  951. //
  952. // Synopsis: The information used here is a reference to the orthographic
  953. // analysis work done on the Thai languages. (see paper: Natural
  954. // Language Processing in Thailand 1993 Chulalongkorn. p 361).
  955. //
  956. // Arguments: pszBoundaryChar - Contain pointer to at least two thai character
  957. // character next to each other which we will
  958. // use to calculate wheather we should or
  959. // should not merge the two word.
  960. //
  961. // iPrevWordLen -
  962. //
  963. // Modifies:
  964. //
  965. // History: created 8/99 aarayas
  966. //
  967. // Notes:
  968. //
  969. //----------------------------------------------------------------------------
  970. inline bool CThaiBreakTree::ShouldMerge(const WCHAR* pwszPrevWord, unsigned int iPrevWordLen, unsigned int iMergeWordLen, DWORD dwPrevTag)
  971. {
  972. const WCHAR* pwszBoundary = pwszPrevWord + iPrevWordLen - 1;
  973. assert(iMergeWordLen != 0);
  974. assert(iPrevWordLen != 0);
  975. // There are very few words in Thai that are 4 character or less, therefore we should
  976. // found a pair that less than 4 character we should merge.
  977. // Or if merge word length is one than also merge.
  978. // Of if last cluster of the word is a Thanthakhat(Karan) we should always merge.
  979. if (iPrevWordLen + iMergeWordLen <= 4 || iMergeWordLen == 1 ||
  980. (iMergeWordLen == 2 && *(pwszBoundary + iMergeWordLen) == THAI_Thanthakhat))
  981. return true;
  982. if (iPrevWordLen >=2)
  983. {
  984. const WCHAR* pwszPrevCharBoundary = pwszBoundary - 1;
  985. // TO IMPROVE: It better to check the last character of Previous word, it can give us a
  986. // much better guess
  987. if ((*pwszPrevCharBoundary == THAI_Vowel_Sign_Mai_HanAkat || *pwszBoundary == THAI_Vowel_Sign_Mai_HanAkat) ||
  988. (*pwszPrevCharBoundary == THAI_Tone_Mai_Tri || *pwszBoundary == THAI_Tone_Mai_Tri) ||
  989. (*pwszPrevCharBoundary == THAI_Sara_Ue || *pwszBoundary == THAI_Sara_Ue) )
  990. return true;
  991. }
  992. // If the first character of the next word is mostly likly the beginning
  993. // character and last character of the previous word is not sara-A than
  994. // we have a high probability that we found a begin of word boundary,
  995. // therefore we shouldn't merge.
  996. if ( (IsThaiMostlyBeginCharacter(pwszBoundary[1]) && *pwszBoundary != THAI_Vowel_Sara_A) )
  997. return false;
  998. // If the last character of the previous word is mostly likely an ending
  999. // character than, than there is a high probability that the found a boundary.
  1000. // There are very few words in Thai that are 4 character or less, therefore we should
  1001. // found a pair that less than 4 character we should merge.
  1002. if (IsThaiMostlyLastCharacter(*pwszBoundary))
  1003. return false;
  1004. // O10.192931 Adding Diacritic check rules. We might want to expand this to more diacritic
  1005. // for now Mai HanAkart would do. It is highly unlikely that a word contain more than 1 of Mai HanAkart diacritic.
  1006. if (IsContain(pwszPrevWord,iPrevWordLen,THAI_Vowel_Sign_Mai_HanAkat) && IsContain(pwszBoundary + 1,iMergeWordLen,THAI_Vowel_Sign_Mai_HanAkat))
  1007. return false;
  1008. if (iMergeWordLen == 3 && GetCluster(pwszBoundary + 1) == iMergeWordLen)
  1009. {
  1010. if (*(pwszBoundary + 2) == THAI_Vowel_Sara_I)
  1011. {
  1012. if (*(pwszBoundary+3) == THAI_Tone_Mai_Ek || *(pwszBoundary+3) == THAI_Tone_Mai_Tro)
  1013. return false;
  1014. }
  1015. }
  1016. // if previous tag is equal to Title Noun than the next word is highly likly to be a name.
  1017. if (ExtractPOS(dwPrevTag) == 6)
  1018. return false;
  1019. // O11.134455. For the case of trailling punctuation.
  1020. if (dwPrevTag == TAGPOS_PUNC && iMergeWordLen > 1 && iPrevWordLen > 1)
  1021. return false;
  1022. // The reason we are using 8 is because from corpora analysis
  1023. // the average Thai word is about 7.732 characters. Or, if previous word is already
  1024. // an unknown, to keep the amount of unknown low the unknown to previous words.
  1025. if ( (iPrevWordLen + iMergeWordLen < 8) || (dwPrevTag == TAGPOS_UNKNOWN) )
  1026. return true;
  1027. return false;
  1028. }
  1029. //+---------------------------------------------------------------------------
  1030. //
  1031. // Class: CThaiBreakTree
  1032. //
  1033. // Synopsis:
  1034. //
  1035. // Arguments:
  1036. //
  1037. // Modifies:
  1038. //
  1039. // History: created 7/99 aarayas
  1040. // 8/17/99 optimize some code.
  1041. //
  1042. // Notes:
  1043. //
  1044. //----------------------------------------------------------------------------
  1045. inline void CThaiBreakTree::AddBreakToList(unsigned int iNumBreak, unsigned int iNumUnknown)
  1046. {
  1047. #if defined (_DEBUG)
  1048. breakArray[iNumBreak] = 0;
  1049. #endif
  1050. if (CompareSentenceStructure(iNumBreak, iNumUnknown))
  1051. {
  1052. maxToken = maxLevel = iNumBreak; // This is ugly but it save 5 clock cycle.
  1053. memcpy(maximalMatchingBreakArray,breakArray,maxToken);
  1054. memcpy(maximalMatchingTAGArray,tagArray,sizeof(DWORD)*maxToken);
  1055. maximalMatchingBreakArray[maxToken] = 0;
  1056. maximalMatchingTAGArray[maxToken] = 0;
  1057. }
  1058. }
  1059. //+---------------------------------------------------------------------------
  1060. //
  1061. // Class: CThaiBreakTree
  1062. //
  1063. // Synopsis: The function compares sentence structure of
  1064. // maximalMatchingPOSArray with posArray.
  1065. //
  1066. // Arguments:
  1067. //
  1068. // Modifies:
  1069. //
  1070. // History: created 7/99 aarayas
  1071. //
  1072. // Notes:
  1073. //
  1074. //----------------------------------------------------------------------------
  1075. inline bool CThaiBreakTree::CompareSentenceStructure(unsigned int iNumBreak, unsigned int iNumUnknownPOSArray)
  1076. {
  1077. if ( (iNumBreak < maxLevel) && (iNumUnknownMaximalPOSArray >= iNumUnknownPOSArray) )
  1078. {
  1079. iNumUnknownMaximalPOSArray = iNumUnknownPOSArray;
  1080. return true;
  1081. }
  1082. else if (iNumBreak == maxLevel)
  1083. {
  1084. // true - maximal matching has a larger unknown.
  1085. if (iNumUnknownMaximalPOSArray > iNumUnknownPOSArray)
  1086. {
  1087. iNumUnknownMaximalPOSArray = iNumUnknownPOSArray;
  1088. return true;
  1089. }
  1090. for(unsigned int i = 0; i <= iNumBreak; i++)
  1091. {
  1092. maximalMatchingPOSArray[i] = ExtractPOS(maximalMatchingTAGArray[i]);
  1093. POSArray[i] = ExtractPOS(tagArray[i]);
  1094. }
  1095. // Determine if the sentence structure is like any one of the sentence
  1096. // sentence structure in our corpora.
  1097. if ( (IsSentenceStruct(POSArray, iNumBreak)) &&
  1098. (!IsSentenceStruct(maximalMatchingPOSArray, iNumBreak)) )
  1099. {
  1100. iNumUnknownMaximalPOSArray = iNumUnknownPOSArray;
  1101. return true;
  1102. }
  1103. else if (iNumUnknownMaximalPOSArray == iNumUnknownPOSArray)
  1104. {
  1105. // Determine the frequency of word used in the sentence.
  1106. unsigned int iFrequencyArray = 500;
  1107. unsigned int iFrequencyMaximalArray = 500;
  1108. for(unsigned int i = 0; i <= iNumBreak; i++)
  1109. {
  1110. DetermineFrequencyWeight(ExtractFrq(maximalMatchingTAGArray[i]),&iFrequencyMaximalArray);
  1111. DetermineFrequencyWeight(ExtractFrq(tagArray[i]),&iFrequencyArray);
  1112. }
  1113. return (iFrequencyArray > iFrequencyMaximalArray);
  1114. }
  1115. }
  1116. return false;
  1117. }
  1118. //+---------------------------------------------------------------------------
  1119. //
  1120. // Class: CThaiBreakTree
  1121. //
  1122. // Synopsis:
  1123. //
  1124. // Arguments:
  1125. //
  1126. // Modifies:
  1127. //
  1128. // History: created 8/99 aarayas
  1129. //
  1130. // Notes:
  1131. //
  1132. //----------------------------------------------------------------------------
  1133. bool CThaiBreakTree::IsSentenceStruct(const WCHAR* pos, unsigned int iPosLen)
  1134. {
  1135. // Declare and initialize all local variables.
  1136. unsigned int i = 0;
  1137. thaiSentIter.Reset();
  1138. if (!thaiSentIter.Down())
  1139. return FALSE;
  1140. while (TRUE)
  1141. {
  1142. thaiSentIter.GetNode();
  1143. if (thaiSentIter.pos == pos[i])
  1144. {
  1145. i++;
  1146. if (thaiSentIter.fWordEnd && i == iPosLen)
  1147. {
  1148. return TRUE;
  1149. }
  1150. else if (i == iPosLen) break;
  1151. // Move down the Trie Branch.
  1152. else if (!thaiSentIter.Down()) break;
  1153. }
  1154. // Move right of the Trie Branch
  1155. else if (!thaiSentIter.Right()) break;
  1156. }
  1157. return FALSE;
  1158. }
  1159. //+---------------------------------------------------------------------------
  1160. //
  1161. // Class: CThaiBreakTree
  1162. //
  1163. // Synopsis:
  1164. //
  1165. // Arguments:
  1166. //
  1167. // Modifies:
  1168. //
  1169. // History: created 8/99 aarayas
  1170. //
  1171. // Notes:
  1172. //
  1173. //----------------------------------------------------------------------------
  1174. float CThaiBreakTree::BigramProbablity(DWORD dwTag1,DWORD dwTag2)
  1175. {
  1176. unsigned int iWeight = 4;
  1177. // TODO : Use the distribution of word category to determine optimial search - exmaple
  1178. // NOUN VERB ADVERB CLASSIFIER CONJETURE PREP et....
  1179. // TODO : Once we got trigram use it to create bigram probability as well.
  1180. if ( (dwTag1 != TAGPOS_UNKNOWN) &&
  1181. (dwTag2 != TAGPOS_UNKNOWN) )
  1182. {
  1183. WCHAR pos1 = ExtractPOS(dwTag1);
  1184. WCHAR pos2 = ExtractPOS(dwTag2);
  1185. // case NCMN VATT
  1186. /// a common noun is often followed by attributive verb(adjective)
  1187. // Example: (In Thai) book good, people nice
  1188. if (pos1 == 5 && pos2 == 13)
  1189. iWeight += 10;
  1190. // case NTTL NPRP
  1191. // a title noun is often followed by proper noun
  1192. // Example: Dr. Athapan, Mr. Sam
  1193. else if (pos1 == 6 && pos2 == 1)
  1194. iWeight += 5;
  1195. // case JSBR (XVAM || VSTA)
  1196. // a subordinating conjunction is often followed by preverb auxillary or Active verb
  1197. // Example: (In Thai) Because of , Because see
  1198. else if (pos1 == 39 && (pos2 == 15 || pos2 == 12))
  1199. iWeight += 10;
  1200. // case ADVN NCMN
  1201. // a Adverb normal form is often followed by Common noun (Bug 55057).
  1202. // Example: (In Thai) under table.
  1203. else if (pos1 == 28 && pos2 == 5)
  1204. iWeight += 5;
  1205. // case VACT XVAE
  1206. else if (pos1 == 11 && pos2 == 18)
  1207. iWeight += 5;
  1208. // case VACT DDBQ
  1209. // Active verb follow by Definite determiner.
  1210. // Example: (In Thai) working for, singing again.
  1211. else if (pos1 == 11 && pos2 == 21)
  1212. iWeight += 10;
  1213. // case VATT VACT
  1214. // adjective are followed by verb.
  1215. // Example: (In Thai keyboard)sivd;jk
  1216. else if (pos1 == 13 && pos2 == 11)
  1217. iWeight += 2;
  1218. // case XVAE VACT
  1219. // a post verb auxilliary are often followed by an active verb.
  1220. // Example: (In Thai) come singing, go work.
  1221. else if (pos1 == 18 && pos2 == 11)
  1222. iWeight += 10;
  1223. // case CLTV NCMN
  1224. // a Collective classfier are often followed by Common Noun
  1225. // Example: (In Thai) group people, flock bird
  1226. else if (pos1 == 33 && pos2 == 5)
  1227. iWeight += 5;
  1228. // case NEG (VACT || VSTA || VATT || XVAM || XVAE)
  1229. // a negator (ie. not) is often followed by some kind of VERB.
  1230. // Example: He is not going.
  1231. else if (pos1 == 46 && (pos2 == 11 || pos2 == 12 || pos2 == 13 || pos2 == 15 || pos2 == 16))
  1232. iWeight += 8;
  1233. // case EAFF or EITT
  1234. // Ending for affirmative, and interrogative are more often ending of the pair
  1235. // Example: (In Thai) Krub, Ka,
  1236. else if (pos2 == 44 || pos2 == 45)
  1237. iWeight += 3;
  1238. // case VATT and VATT
  1239. // Attributive Verb and Attributive Verb occur when often in spoken laguages.
  1240. // Example: she is reall really cute.
  1241. else if (pos1 == 13 && pos2 == 13)
  1242. iWeight += 2;
  1243. // case NCMN and DDAC
  1244. // Common Noun and Definitive determiner classifier.
  1245. // Example: Food here (Thai)
  1246. else if (pos1 == 5 && pos2 == 20)
  1247. iWeight += 3;
  1248. // case CMTR and JCMP
  1249. // Measurement classifier and Comparative conjunction, are likly to appear in Thai.
  1250. // Example: year about (Thai) -> English about a year.
  1251. else if (pos1 == 34 && pos2 == 38)
  1252. iWeight += 5;
  1253. // case XVBB and VACT
  1254. else if (pos1 == 17 && pos2 == 11)
  1255. iWeight += 5;
  1256. // case NCMN and NCMN
  1257. // Common Noun and Common Noun
  1258. // Example: electric bulb(in thai)
  1259. else if (pos1 == 5 && pos2 == 5)
  1260. iWeight += 1;
  1261. }
  1262. DetermineFrequencyWeight(ExtractFrq(dwTag1), &iWeight);
  1263. DetermineFrequencyWeight(ExtractFrq(dwTag2), &iWeight);
  1264. return (float) iWeight;
  1265. }
  1266. //+---------------------------------------------------------------------------
  1267. //
  1268. // Class: CThaiBreakTree
  1269. //
  1270. // Synopsis:
  1271. //
  1272. // Arguments:
  1273. //
  1274. // Modifies:
  1275. //
  1276. // History: created 8/99 aarayas
  1277. //
  1278. // Notes:
  1279. //
  1280. //----------------------------------------------------------------------------
  1281. DWORD CThaiBreakTree::TrigramProbablity(DWORD dwTag1,DWORD dwTag2,DWORD dwTag3)
  1282. {
  1283. DWORD iWeight = 6;
  1284. if ( (dwTag1 != TAGPOS_UNKNOWN) &&
  1285. (dwTag2 != TAGPOS_UNKNOWN) &&
  1286. (dwTag3 != TAGPOS_UNKNOWN) )
  1287. {
  1288. WCHAR pos1 = ExtractPOS(dwTag1);
  1289. WCHAR pos2 = ExtractPOS(dwTag2);
  1290. WCHAR pos3 = ExtractPOS(dwTag3);
  1291. // optimization we if any POS is none than trigram shouldn't therefor no need to search.
  1292. if ( pos1 != 0 && pos2 != 0 && pos3 != 0)
  1293. {
  1294. WCHAR posArray[4];
  1295. posArray[0] = pos1;
  1296. posArray[1] = pos2;
  1297. posArray[2] = pos3;
  1298. posArray[3] = 0;
  1299. iWeight += thaiTrigramIter.GetProb(posArray);
  1300. }
  1301. }
  1302. DetermineFrequencyWeight(ExtractFrq(dwTag1), &iWeight);
  1303. DetermineFrequencyWeight(ExtractFrq(dwTag2), &iWeight);
  1304. DetermineFrequencyWeight(ExtractFrq(dwTag3), &iWeight);
  1305. // We reached zero probablity.
  1306. return (DWORD)iWeight;
  1307. }
  1308. //+---------------------------------------------------------------------------
  1309. //
  1310. // Class: CThaiBreakTree
  1311. //
  1312. // Synopsis:
  1313. //
  1314. // Arguments:
  1315. //
  1316. // Modifies:
  1317. //
  1318. // History: created 8/99 aarayas
  1319. //
  1320. // Notes:
  1321. //
  1322. //----------------------------------------------------------------------------
  1323. unsigned int CThaiBreakTree::TrigramBreak(WCHAR* pwchBegin, WCHAR* pwchEnd1)
  1324. {
  1325. // Declare and initialize local variables.
  1326. WCHAR* pwchBeginWord = pwchBegin;
  1327. WCHAR* pwchIndex = pwchBegin;
  1328. unsigned int iWordLen;
  1329. unsigned int iNumCluster = 1;
  1330. unsigned int iNumLastCluster;
  1331. unsigned int iBreakIndex = 0;
  1332. BYTE nextBreakArray[MAXBREAK];
  1333. DWORD nextTagArray[MAXBREAK];
  1334. unsigned int iNextBreakIndex; // index for array nextBreakArray and nextTagArray.
  1335. bool fFoundMatch;
  1336. unsigned int iWeight;
  1337. unsigned int iSumWeight;
  1338. unsigned int iPrevWeight;
  1339. unsigned int iCurrWeight;
  1340. BYTE iSoundexWordLen;
  1341. DWORD iPrevProbability;
  1342. DWORD iCurrentProbability;
  1343. DWORD dwTagTemp;
  1344. DWORD dwLastTag;
  1345. int i; // temporary int for use as need.
  1346. bool fBeginNewWord;
  1347. bool fEndWord = false;
  1348. pszEnd = pwchEnd1;
  1349. breakArray[0] = 0;
  1350. POSArray[0] = 0;
  1351. tagArray[0] = 0;
  1352. nextBreakArray[0] = 0;
  1353. nextTagArray[0] = 0;
  1354. while (true)
  1355. {
  1356. // Reset Iterator for generating break for new word.
  1357. fFoundMatch = false;
  1358. fBeginNewWord = true;
  1359. // Get begin word string for next round of word break.
  1360. pwchIndex = pwchBeginWord;
  1361. iNextBreakIndex = 0;
  1362. if (pwchIndex == pszEnd)
  1363. break;
  1364. while(true)
  1365. {
  1366. iNumLastCluster = iNumCluster;
  1367. iNumCluster = GetCluster(pwchIndex);
  1368. if (!thaiTrieIter.MoveCluster(pwchIndex, iNumCluster, fBeginNewWord))
  1369. {
  1370. if ((iNumCluster == 0) && (pwchIndex == pszEnd))
  1371. fEndWord = true;
  1372. else
  1373. break;
  1374. }
  1375. fBeginNewWord = false;
  1376. pwchIndex += iNumCluster;
  1377. if (thaiTrieIter.fWordEnd)
  1378. {
  1379. if (thaiTrieIter.m_fThaiNumber)
  1380. {
  1381. // If we have Thai number accumulate it as one break.
  1382. assert(iNumCluster == 1);
  1383. fFoundMatch = true;
  1384. nextBreakArray[0]= (BYTE)(pwchIndex - pwchBeginWord);
  1385. nextTagArray[0] = TAGPOS_NCNM;
  1386. iNextBreakIndex = 1;
  1387. }
  1388. else
  1389. {
  1390. fFoundMatch = true;
  1391. nextBreakArray[iNextBreakIndex] = (BYTE)(pwchIndex - pwchBeginWord);
  1392. nextTagArray[iNextBreakIndex] = thaiTrieIter.dwTag;
  1393. iNextBreakIndex++;
  1394. }
  1395. if (pwchIndex >= pszEnd)
  1396. {
  1397. assert(pwchIndex <= pszEnd); // assert should never come up - if it appear likely bug in GetCluster funciton.
  1398. assert(iNextBreakIndex != 0);
  1399. if ( iNumCluster == 1 &&
  1400. *(pwchIndex - 1) == L'.' &&
  1401. iBreakIndex > 0 &&
  1402. iNextBreakIndex == 1 &&
  1403. tagArray[iBreakIndex - 1] == TAGPOS_ABBR )
  1404. {
  1405. // backtrack one if we have abbrivation case.
  1406. // ex. B.K.K. (in Thai). (more info O11.145042.)
  1407. breakArray[iBreakIndex - 1] += nextBreakArray[iNextBreakIndex - 1];
  1408. return iBreakIndex;
  1409. }
  1410. breakArray[iBreakIndex] = nextBreakArray[iNextBreakIndex - 1];
  1411. tagArray[iBreakIndex] = nextTagArray[iNextBreakIndex - 1];
  1412. return (++iBreakIndex);
  1413. }
  1414. }
  1415. else if ((pwchIndex >= pszEnd && iNextBreakIndex == 0) || fEndWord)
  1416. {
  1417. assert(pwchIndex <= pszEnd); // assert should never come up - if it appear likely bug in GetCluster funciton.
  1418. iWordLen = (unsigned int) (pwchIndex - pwchBeginWord);
  1419. switch (iWordLen)
  1420. {
  1421. case 0:
  1422. if (iBreakIndex > 0)
  1423. {
  1424. // if We have a length of one character add it to previous node.
  1425. breakArray[iBreakIndex - 1] += (BYTE) iNumCluster;
  1426. tagArray[iBreakIndex - 1] = TAGPOS_UNKNOWN;
  1427. }
  1428. else
  1429. {
  1430. // if this is the first break create a new break.
  1431. breakArray[iBreakIndex] = (BYTE) iNumCluster;
  1432. tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
  1433. iBreakIndex++;
  1434. }
  1435. break;
  1436. case 1:
  1437. if (iBreakIndex > 0)
  1438. {
  1439. // if We have a length of one character add it to previous node.
  1440. breakArray[iBreakIndex - 1] += (BYTE) iWordLen;
  1441. tagArray[iBreakIndex - 1] = TAGPOS_UNKNOWN;
  1442. }
  1443. else
  1444. {
  1445. // if this is the first break create a new break.
  1446. breakArray[iBreakIndex] = (BYTE) iWordLen;
  1447. tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
  1448. iBreakIndex++;
  1449. }
  1450. break;
  1451. default:
  1452. if ( iBreakIndex > 0 &&
  1453. ShouldMerge(pwchBeginWord - breakArray[iBreakIndex - 1], breakArray[iBreakIndex - 1],
  1454. iWordLen , tagArray[iBreakIndex - 1]) )
  1455. {
  1456. breakArray[iBreakIndex - 1] += (BYTE) iWordLen;
  1457. tagArray[iBreakIndex - 1] = TAGPOS_UNKNOWN;
  1458. }
  1459. else
  1460. {
  1461. breakArray[iBreakIndex] = (BYTE) iWordLen;
  1462. tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
  1463. iBreakIndex++;
  1464. }
  1465. }
  1466. return iBreakIndex;
  1467. }
  1468. else if (pwchIndex >= pszEnd)
  1469. {
  1470. // O10.229346. If we get here we are at the end of word or end of sentence,
  1471. // We will need to decide what to depending on if we found the word or not.
  1472. break;
  1473. }
  1474. }
  1475. if (fFoundMatch) // Longest Matching.
  1476. {
  1477. // If we only found one break, than say it the maximum.
  1478. if (1 == iNextBreakIndex)
  1479. {
  1480. if ( nextBreakArray[0] == 2 &&
  1481. iNumCluster + iNumLastCluster == 2 &&
  1482. iBreakIndex > 0 &&
  1483. *(pwchBeginWord+1) == L'.' &&
  1484. tagArray[iBreakIndex - 1] == TAGPOS_ABBR )
  1485. {
  1486. // backtrack one if we have abbrivation case.
  1487. // ex. B.K.K. (in Thai). (more info O11.145042.)
  1488. breakArray[iBreakIndex - 1] += nextBreakArray[0];
  1489. pwchBeginWord += nextBreakArray[0];
  1490. }
  1491. else if ( iBreakIndex > 0 &&
  1492. IsThaiEndingSign(*pwchBeginWord) &&
  1493. iNumCluster == 1 )
  1494. {
  1495. breakArray[iBreakIndex - 1] += nextBreakArray[0];
  1496. pwchBeginWord += nextBreakArray[0];
  1497. }
  1498. else
  1499. {
  1500. breakArray[iBreakIndex] = nextBreakArray[0];
  1501. tagArray[iBreakIndex] = nextTagArray[0];
  1502. pwchBeginWord += breakArray[iBreakIndex]; // update begin word for next round.
  1503. iBreakIndex++;
  1504. }
  1505. }
  1506. else
  1507. {
  1508. bool fWeightCompare = false;
  1509. iSumWeight = 0;
  1510. iPrevWeight = 0;
  1511. iCurrWeight = 0;
  1512. iPrevProbability = 0;
  1513. iCurrentProbability = 0;
  1514. dwLastTag = TAGPOS_UNKNOWN;
  1515. tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
  1516. for (i = (iNextBreakIndex - 1); i >= 0 ; i--)
  1517. {
  1518. if ( iBreakIndex == 0)
  1519. {
  1520. iWeight = GetWeight(pwchBeginWord + nextBreakArray[i], &dwTagTemp);
  1521. if (iWeight != 0)
  1522. // Bigram Probability
  1523. iCurrentProbability = (DWORD)BigramProbablity(nextTagArray[i], dwTagTemp);
  1524. }
  1525. else
  1526. {
  1527. iWeight = GetWeight(pwchBeginWord + nextBreakArray[i], &dwTagTemp);
  1528. if (iBreakIndex == 1)
  1529. // Get Trigram Probability.
  1530. iCurrentProbability = TrigramProbablity(tagArray[iBreakIndex - 1], nextTagArray[i], dwTagTemp);
  1531. else if (iBreakIndex >= 2)
  1532. {
  1533. // Get Trigram Probability.
  1534. iCurrentProbability = TrigramProbablity(tagArray[iBreakIndex - 2], tagArray[iBreakIndex - 1], nextTagArray[i]);
  1535. if (iWeight != 0)
  1536. iCurrentProbability += (DWORD)BigramProbablity(nextTagArray[i],dwTagTemp);
  1537. }
  1538. }
  1539. fWeightCompare = false;
  1540. iCurrWeight = iWeight + nextBreakArray[i];
  1541. if (iPrevProbability == 0 && (iCurrWeight+1) == iSumWeight && iCurrentProbability > 5)
  1542. {
  1543. fWeightCompare = true;
  1544. }
  1545. else if (iCurrWeight == iSumWeight && ( Maximum(iWeight,nextBreakArray[i]) <= iPrevWeight ||
  1546. iCurrentProbability > iPrevProbability))
  1547. {
  1548. fWeightCompare = true;
  1549. }
  1550. else if ( iWeight >= iPrevWeight - 1 &&
  1551. iPrevProbability > 0 && iPrevProbability < 10 &&
  1552. iCurrentProbability > iPrevProbability * 5000 )
  1553. {
  1554. // O11.187913. We'll trust our trigram data more if the current probability is
  1555. // so much greater than previous probability.
  1556. //
  1557. // * Note: we could probably use one of GA algorithm to get better value than 5K.
  1558. fWeightCompare = true;
  1559. }
  1560. // Store the string the best maximum weight, if the pair is equal
  1561. // store the string with maxim
  1562. if ( iCurrWeight > iSumWeight ||
  1563. fWeightCompare)
  1564. // ( (iCurrWeight == iSumWeight) &&
  1565. // ( (Maximum(iWeight,nextBreakArray[i]) <= iPrevWeight) || (iCurrentProbability > iPrevProbability) ) ))
  1566. {
  1567. if (iCurrentProbability >= iPrevProbability || iSumWeight < iCurrWeight)
  1568. {
  1569. iSumWeight = Maximum(iWeight,1) + nextBreakArray[i];
  1570. iPrevWeight = Maximum(iWeight,nextBreakArray[i]);
  1571. breakArray[iBreakIndex] = nextBreakArray[i];
  1572. tagArray[iBreakIndex] = nextTagArray[i];
  1573. iPrevProbability = iCurrentProbability;
  1574. dwLastTag = dwTagTemp;
  1575. }
  1576. }
  1577. }
  1578. pwchBeginWord += breakArray[iBreakIndex]; // update begin word for next round.
  1579. iBreakIndex++;
  1580. }
  1581. }
  1582. else
  1583. {
  1584. // NOMATCH_FOUND
  1585. iWordLen = (unsigned int)(pwchIndex - pwchBeginWord);
  1586. if (iBreakIndex > 0)
  1587. {
  1588. i = iBreakIndex - 1; // set i to previous break
  1589. if (iWordLen == 0)
  1590. {
  1591. if (iNumCluster == 1 && *pwchBeginWord == L',' &&
  1592. IsThaiChar(*(pwchBeginWord-breakArray[i])) )
  1593. {
  1594. // We should not merge comma into the word, only merge comma to
  1595. // Number.
  1596. // TODO: Should add TAGPOS_PUNCT.
  1597. breakArray[iBreakIndex] = (BYTE) iNumCluster;
  1598. tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
  1599. pwchBeginWord += (BYTE) iNumCluster; // update begin word for next round.
  1600. iBreakIndex++;
  1601. }
  1602. else if (iNumCluster > 1 && *pwchBeginWord == L'.')
  1603. {
  1604. // O11.134455. This is an ellipse case we shouldn't merge this string.
  1605. breakArray[iBreakIndex] = (BYTE) iNumCluster;
  1606. tagArray[iBreakIndex] = TAGPOS_PUNC;
  1607. pwchBeginWord += (BYTE) iNumCluster; // update begin word for next round.
  1608. iBreakIndex++;
  1609. }
  1610. else if (ShouldMerge(pwchBeginWord - breakArray[i], breakArray[i], iNumCluster, tagArray[i]))
  1611. {
  1612. // If word length is null use the cluster add to previous node.
  1613. breakArray[i] += (BYTE) iNumCluster;
  1614. tagArray[i] = TAGPOS_UNKNOWN;
  1615. pwchBeginWord += iNumCluster; // update begin word for next round.
  1616. }
  1617. else
  1618. {
  1619. // Add the unknown word to list.
  1620. breakArray[iBreakIndex] = (BYTE) iNumCluster;
  1621. tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
  1622. pwchBeginWord += (BYTE) iNumCluster; // update begin word for next round.
  1623. iBreakIndex++;
  1624. }
  1625. }
  1626. else
  1627. {
  1628. // Try checking for abbrivations.
  1629. if (iWordLen == 1 && iNumCluster == 2 && pwchIndex[1] == L'.')
  1630. {
  1631. // The word is an abbrivated words.
  1632. // TODO: #1. Add TAGPOS_ABBRV.
  1633. // TODO: #2. May need to add rules code abbrivated word with 3 letters.
  1634. breakArray[iBreakIndex] = iWordLen + iNumCluster;
  1635. tagArray[iBreakIndex] = TAGPOS_ABBR;
  1636. pwchBeginWord += breakArray[iBreakIndex];
  1637. iBreakIndex++;
  1638. }
  1639. else if (iWordLen == 1 &&
  1640. tagArray[i] == TAGPOS_ABBR &&
  1641. *(pwchBeginWord+1) == L'.' &&
  1642. IsThaiConsonant(*pwchBeginWord) &&
  1643. pwchBeginWord+1 < pszEnd )
  1644. {
  1645. // O11.145042. This is the case where we are a <abbrivated><consonant><period>, the
  1646. // likely hood is the character is also an abbrivation.
  1647. breakArray[iBreakIndex - 1] += iWordLen + 1;
  1648. pwchBeginWord += iWordLen + 1;
  1649. }
  1650. // Abbreviation are usally 3 characters.
  1651. else if ( iWordLen == 2 &&
  1652. IsThaiConsonant(*(pwchBeginWord+2)) &&
  1653. *(pwchBeginWord+3) == L'.' &&
  1654. tagArray[i] != TAGPOS_UNKNOWN )
  1655. {
  1656. // O11.80619. This is the case where we are a <known word><abbrivated>
  1657. breakArray[iBreakIndex] = iWordLen + 1;
  1658. tagArray[iBreakIndex] = TAGPOS_ABBR;
  1659. pwchBeginWord += breakArray[iBreakIndex];
  1660. iBreakIndex++;
  1661. }
  1662. // Perhase Misspelled word try use sounding to spell the words.
  1663. // Try soundex two word back.
  1664. else if ( (iBreakIndex >= 2) &&
  1665. ( (iSoundexWordLen = (BYTE) SoundexSearch(pwchBeginWord - breakArray[i] - breakArray[i - 1])) > (BYTE) (breakArray[i] + breakArray[i - 1]) ) &&
  1666. GetWeight(pwchBeginWord - breakArray[i] - breakArray[i - 1] + iSoundexWordLen) )
  1667. {
  1668. // Resize the word.
  1669. pwchBeginWord = (pwchBeginWord - breakArray[i] - breakArray[i - 1]) + iSoundexWordLen; // update begin word for next round.
  1670. breakArray[i - 1] = iSoundexWordLen;
  1671. tagArray[i - 1] = thaiTrieIter.dwTag;
  1672. iBreakIndex--; // Decrement iBreakIndex.
  1673. }
  1674. // Try soundex one words back.
  1675. else if (((iSoundexWordLen = (BYTE) SoundexSearch(pwchBeginWord - breakArray[i])) > (BYTE) breakArray[i]) &&
  1676. GetWeight(pwchBeginWord - breakArray[i] + iSoundexWordLen) &&
  1677. ExtractPOS(tagArray[i]) != 6) // Make sure that previous word is not a NTTL.
  1678. {
  1679. // Resize the word
  1680. pwchBeginWord = (pwchBeginWord - breakArray[i]) + iSoundexWordLen; // update begin word for next round.
  1681. breakArray[i] = iSoundexWordLen;
  1682. tagArray[i] = thaiTrieIter.dwTag;
  1683. }
  1684. // Try soundex on this word.
  1685. else if (((iSoundexWordLen = (BYTE) SoundexSearch(pwchBeginWord)) > (BYTE) iWordLen) &&
  1686. GetWeight(pwchBeginWord + iSoundexWordLen) )
  1687. {
  1688. // Resize the word.
  1689. breakArray[iBreakIndex] = iSoundexWordLen;
  1690. tagArray[iBreakIndex] = thaiTrieIter.dwTag;
  1691. pwchBeginWord += iSoundexWordLen; // update begin word for next round.
  1692. iBreakIndex++;
  1693. }
  1694. else if ( ShouldMerge(pwchBeginWord - breakArray[i], breakArray[i], iWordLen , tagArray[i]) )
  1695. {
  1696. // Merge the words.
  1697. breakArray[i] += (BYTE) iWordLen;
  1698. tagArray[i] = TAGPOS_UNKNOWN;
  1699. pwchBeginWord += iWordLen; // update begin word for next round.
  1700. }
  1701. else
  1702. {
  1703. // Add the unknown word to list.
  1704. breakArray[iBreakIndex] = (BYTE) iWordLen;
  1705. tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
  1706. pwchBeginWord += iWordLen; // update begin word for next round.
  1707. iBreakIndex++;
  1708. }
  1709. }
  1710. }
  1711. else
  1712. {
  1713. // Add unknown word to list and mark it.
  1714. if (iWordLen == 0)
  1715. {
  1716. // If word length is null use the cluster add to previous node.
  1717. breakArray[iBreakIndex] = (BYTE) iNumCluster;
  1718. tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
  1719. pwchBeginWord += iNumCluster; // update begin word for next round.
  1720. }
  1721. else
  1722. {
  1723. // We we are here there are 2 case that can happen:
  1724. // 1. We take too little into our unknown.
  1725. // 2. We take too much into our unknown word.
  1726. // Have we taken too little check if this unknown word is an abbrivated words.
  1727. if (iWordLen == 1 && iNumCluster == 2 && pwchIndex[1] == L'.')
  1728. breakArray[iBreakIndex] = iWordLen + iNumCluster;
  1729. // Try to see if we are taking to much, see if we can get a Weight from last cluster.
  1730. else if ( (iWordLen - iNumLastCluster > 0) && GetWeight(pwchIndex - iNumLastCluster) )
  1731. {
  1732. breakArray[iBreakIndex] = iWordLen - iNumLastCluster;
  1733. if (breakArray[iBreakIndex] == 1)
  1734. {
  1735. iWeight = GetWeight(pwchIndex - iNumLastCluster);
  1736. if (iWeight > iNumLastCluster && iWeight < 40)
  1737. breakArray[iBreakIndex] += (BYTE) iWeight;
  1738. else
  1739. breakArray[iBreakIndex] += (BYTE) iNumLastCluster;
  1740. }
  1741. }
  1742. // We may have a case of iWordLen is 1 and iNumCluster, we have a case of misspelled
  1743. // an extra character is incorrectly inserted over a correct word.
  1744. else if (iWordLen == 1)
  1745. {
  1746. iWeight = GetWeight(pwchIndex - iWordLen);
  1747. if (iWeight > iNumCluster && iWeight < 40)
  1748. breakArray[iBreakIndex] = iWordLen + iWeight;
  1749. else
  1750. breakArray[iBreakIndex] = iWordLen + iNumCluster;
  1751. }
  1752. else
  1753. breakArray[iBreakIndex] = (BYTE) iWordLen;
  1754. if (iNumLastCluster + iNumCluster == iWordLen && *(pwchBeginWord+iNumLastCluster) == L'.')
  1755. {
  1756. tagArray[iBreakIndex] = TAGPOS_ABBR;
  1757. }
  1758. else
  1759. tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
  1760. pwchBeginWord += breakArray[iBreakIndex]; // update begin word for next round.
  1761. }
  1762. iBreakIndex++;
  1763. }
  1764. }
  1765. }
  1766. return iBreakIndex;
  1767. }
  1768. //+---------------------------------------------------------------------------
  1769. //
  1770. // Class: CThaiBreakTree
  1771. //
  1772. // Synopsis:
  1773. //
  1774. // Arguments:
  1775. //
  1776. // Modifies:
  1777. //
  1778. // History: created 8/99 aarayas
  1779. //
  1780. // Notes:
  1781. //
  1782. //----------------------------------------------------------------------------
  1783. int CThaiBreakTree::Soundex(WCHAR* word)
  1784. {
  1785. return thaiTrieIter.Soundex(word);
  1786. }
  1787. //+---------------------------------------------------------------------------
  1788. //
  1789. // Function: GetCluster
  1790. //
  1791. // Synopsis: The function return the next number of character which represent
  1792. // a cluster of Thai text.
  1793. //
  1794. // ie. Kor Kai, Kor Kai -> 1
  1795. // Kor Kai, Sara Um -> 2
  1796. //
// * Note: this function will return no more than 3 characters for a
//   cluster, as a longer run would represent an invalid sequence of characters.
  1799. //
  1800. // Arguments:
  1801. //
  1802. // Modifies:
  1803. //
  1804. // History: created 7/99 aarayas
  1805. //
  1806. // Notes:
  1807. //
  1808. //----------------------------------------------------------------------------
  1809. unsigned int CThaiBreakTree::GetCluster(const WCHAR* pszIndex)
  1810. {
  1811. bool fHasSaraE;
  1812. int iRetValue = 0;
  1813. bool fNeedEndingCluster = false;
  1814. if (pszIndex == pszEnd)
  1815. return 0;
  1816. while (true)
  1817. {
  1818. fHasSaraE= false;
  1819. // Take all begin cluster character.
  1820. while (IsThaiBeginClusterCharacter(*pszIndex))
  1821. {
  1822. if (*pszIndex == THAI_Vowel_Sara_E)
  1823. fHasSaraE = true;
  1824. pszIndex++;
  1825. iRetValue++;
  1826. }
  1827. if (IsThaiConsonant(*pszIndex))
  1828. {
  1829. pszIndex++;
  1830. iRetValue++;
  1831. while (IsThaiUpperAndLowerClusterCharacter(*pszIndex))
  1832. {
  1833. // Mai Han Akat is a special type of cluster that will need at lease
  1834. // one ending cluster.
  1835. if (*pszIndex == THAI_Vowel_Sign_Mai_HanAkat)
  1836. fNeedEndingCluster = true;
  1837. // In Thai it isn't possible to make a sound if we have the SaraE
  1838. // following by vowel below vowel.
  1839. else if ( fHasSaraE &&
  1840. ( (*pszIndex == THAI_Vowel_Sara_II) ||
  1841. (*pszIndex == THAI_Tone_MaiTaiKhu) ||
  1842. (*pszIndex == THAI_Vowel_Sara_I) ||
  1843. (*pszIndex == THAI_Sara_Uee) ))
  1844. fNeedEndingCluster = true;
  1845. pszIndex++;
  1846. iRetValue++;
  1847. }
  1848. while (IsThaiEndingClusterCharacter(*pszIndex))
  1849. {
  1850. pszIndex++;
  1851. iRetValue++;
  1852. fNeedEndingCluster = false;
  1853. }
  1854. /*
  1855. // Include period as part of a cluster. Bug#57106
  1856. if (*pszIndex == 0x002e)
  1857. {
  1858. pszIndex++;
  1859. iRetValue++;
  1860. fNeedEndingCluster = false;
  1861. }
  1862. */
  1863. }
  1864. if (fNeedEndingCluster)
  1865. fNeedEndingCluster = false;
  1866. else
  1867. break;
  1868. }
  1869. if (iRetValue == 0)
  1870. {
  1871. // O11.134455. Ellipse case we go to combine ellipses to one cluster.
  1872. if (*pszIndex == 0x002e)
  1873. {
  1874. while (*pszIndex == 0x002e && pszIndex <= pszEnd)
  1875. {
  1876. pszIndex++;
  1877. iRetValue++;
  1878. }
  1879. }
  1880. else
  1881. iRetValue++; // The character is probably a punctuation.
  1882. }
  1883. if (pszIndex > pszEnd)
  1884. {
  1885. // We need to do this as we have gone over end buff boundary.
  1886. iRetValue -= (int) (pszIndex - pszEnd);
  1887. pszIndex = pszEnd;
  1888. }
  1889. return iRetValue;
  1890. }
  1891. //+---------------------------------------------------------------------------
  1892. //
  1893. // Class: CThaiBreakTree
  1894. //
  1895. // Synopsis:
  1896. //
  1897. // Arguments:
  1898. //
  1899. // pwchBegin - input string. (in)
  1900. // iWordLen - input string length. (in)
  1901. // Alt - number of alternate break positions to look for. (in)
  1902. // pBreakPos - array of break positions, always 5 bytes. (out)
  1903. //
  1904. // Modifies:
  1905. //
  1906. // History: created 3/00 aarayas
  1907. //
  1908. // Notes:
  1909. //
  1910. //----------------------------------------------------------------------------
  1911. int CThaiBreakTree::FindAltWord(WCHAR* pwchBegin,unsigned int iWordLen, BYTE Alt, BYTE* pBreakPos)
  1912. {
  1913. // Declare and initialize local variables.
  1914. unsigned int iNumCluster = 1;
  1915. WCHAR* pwchBeginWord = pwchBegin;
  1916. WCHAR* pwchIndex = pwchBegin;
  1917. bool fBeginNewWord = true;
  1918. unsigned int iBreakIndex = 0;
  1919. unsigned int iBreakTemp = 0;
  1920. unsigned int iBreakTemp1 = 0;
  1921. unsigned int iBreakTemp2 = 0;
  1922. pszEnd = pwchBegin + iWordLen;
  1923. // TODO: Need to clean this code up.
  1924. switch(Alt)
  1925. {
  1926. case 3:
  1927. while (true)
  1928. {
  1929. iNumCluster = GetCluster(pwchIndex);
  1930. if (!thaiTrieIter1.MoveCluster(pwchIndex, iNumCluster, fBeginNewWord))
  1931. return iBreakIndex;
  1932. fBeginNewWord = false;
  1933. pwchIndex += iNumCluster;
  1934. if (thaiTrieIter1.fWordEnd)
  1935. {
  1936. iBreakTemp = (unsigned int)(pwchIndex - pwchBeginWord);
  1937. // reached the end of word unable to find alt word.
  1938. if (iBreakTemp >= iWordLen)
  1939. return 0;
  1940. iBreakTemp1 = GetWeight(pwchIndex);
  1941. // reached the end of word unable to find alt word.
  1942. if (iBreakTemp + iBreakTemp1 >= iWordLen)
  1943. return 0;
  1944. iBreakTemp2 = GetWeight(pwchIndex+iBreakTemp1);
  1945. if (iBreakTemp + iBreakTemp1 + iBreakTemp2 == iWordLen)
  1946. {
  1947. pBreakPos[0] = (BYTE)iBreakTemp;
  1948. pBreakPos[1] = (BYTE)iBreakTemp1;
  1949. pBreakPos[2] = (BYTE)iBreakTemp2;
  1950. return 3;
  1951. }
  1952. }
  1953. if (pwchIndex >= pszEnd)
  1954. return iBreakIndex;
  1955. }
  1956. break;
  1957. case 2:
  1958. while (true)
  1959. {
  1960. iNumCluster = GetCluster(pwchIndex);
  1961. if (!thaiTrieIter1.MoveCluster(pwchIndex, iNumCluster, fBeginNewWord))
  1962. return iBreakIndex;
  1963. fBeginNewWord = false;
  1964. pwchIndex += iNumCluster;
  1965. if (thaiTrieIter1.fWordEnd)
  1966. {
  1967. iBreakTemp = (unsigned int)(pwchIndex - pwchBeginWord);
  1968. // reached the end of word unable to find alt word.
  1969. if (iBreakTemp >= iWordLen)
  1970. return 0;
  1971. iBreakTemp1 = GetWeight(pwchIndex);
  1972. if (iBreakTemp + iBreakTemp1 == iWordLen)
  1973. {
  1974. pBreakPos[0] = (BYTE)iBreakTemp;
  1975. pBreakPos[1] = (BYTE)iBreakTemp1;
  1976. return 2;
  1977. }
  1978. }
  1979. if (pwchIndex >= pszEnd)
  1980. return iBreakIndex;
  1981. }
  1982. break;
  1983. default:
  1984. case 1:
  1985. while (iBreakIndex < Alt)
  1986. {
  1987. iNumCluster = GetCluster(pwchIndex);
  1988. if (!thaiTrieIter1.MoveCluster(pwchIndex, iNumCluster, fBeginNewWord))
  1989. return iBreakIndex;
  1990. fBeginNewWord = false;
  1991. pwchIndex += iNumCluster;
  1992. if (thaiTrieIter1.fWordEnd)
  1993. {
  1994. fBeginNewWord = true;
  1995. iBreakTemp = (unsigned int)(pwchIndex - pwchBeginWord);
  1996. // reached the end of word unable to find alt word.
  1997. if (iBreakTemp >= iWordLen)
  1998. return 0;
  1999. iBreakTemp1 = GetLongestSubstring(pwchBeginWord,iWordLen);
  2000. if (iBreakTemp1 > iBreakTemp && iBreakTemp1 < iWordLen)
  2001. pBreakPos[iBreakIndex] = (BYTE) iBreakTemp1;
  2002. else
  2003. pBreakPos[iBreakIndex] = (BYTE) iBreakTemp;
  2004. pwchBeginWord += pBreakPos[iBreakIndex];
  2005. iWordLen -= pBreakPos[iBreakIndex];
  2006. iBreakIndex++;
  2007. }
  2008. if (pwchIndex >= pszEnd)
  2009. return iBreakIndex;
  2010. }
  2011. break;
  2012. }
  2013. return iBreakIndex;
  2014. }