Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1914 lines
61 KiB

  1. //+---------------------------------------------------------------------------
  2. //
  3. //
  4. // CThaiBreakTree - class CThaiBreakTree
  5. //
  6. // History:
  7. // created 7/99 aarayas
  8. //
  9. // © 1999 Microsoft Corporation
  10. //----------------------------------------------------------------------------
  11. #include "CThaiBreakTree.hpp"
  12. //+---------------------------------------------------------------------------
  13. //
  14. // Function: ExtractPOS
  15. //
  16. // Synopsis: The functions takes a tag and return Part Of Speech Tags.
  17. //
  18. // Arguments:
  19. //
  20. // Modifies:
  21. //
  22. // History: created 7/99 aarayas
  23. //
  24. // Notes:
  25. //
  26. //----------------------------------------------------------------------------
  27. inline WCHAR ExtractPOS(DWORD dwTag)
  28. {
  29. return (WCHAR) ( (dwTag & iPosMask) >> iPosShift);
  30. }
  31. //+---------------------------------------------------------------------------
  32. //
  33. // Function: ExtractFrq
  34. //
  35. // Synopsis: The functions takes a tag and return Frquency of words.
  36. //
  37. // Arguments:
  38. //
  39. // Modifies:
  40. //
  41. // History: created 7/99 aarayas
  42. //
  43. // Notes:
  44. //
  45. //----------------------------------------------------------------------------
  46. inline BYTE ExtractFrq(DWORD dwTag)
  47. {
  48. return (BYTE) ( (dwTag & 0x300) >> iFrqShift);
  49. }
  50. //+---------------------------------------------------------------------------
  51. //
  52. // Function: DetermineFrequencyWeight
  53. //
  54. // Synopsis: The functions returns the frequency weight of a words.
  55. //
  56. // Arguments:
  57. //
  58. // Modifies:
  59. //
  60. // History: created 7/99 aarayas
  61. //
  62. // Notes:
  63. //
  64. //----------------------------------------------------------------------------
  65. inline void DetermineFrequencyWeight(BYTE frq, unsigned int* uiWeight)
  66. {
  67. switch (frq)
  68. {
  69. case frqpenInfrequent:
  70. (*uiWeight) -= 2;
  71. break;
  72. case frqpenSomewhat:
  73. (*uiWeight)--;
  74. break;
  75. case frqpenVery:
  76. (*uiWeight) += 2;
  77. break;
  78. case frqpenNormal:
  79. default:
  80. (*uiWeight)++;
  81. break;
  82. }
  83. }
  84. //+---------------------------------------------------------------------------
  85. //
  86. // Function: DetermineFrequencyWeight
  87. //
  88. // Synopsis: The functions returns the frequency weight of a words.
  89. //
  90. // Arguments:
  91. //
  92. // Modifies:
  93. //
  94. // History: created 7/99 aarayas
  95. //
  96. // Notes:
  97. //
  98. //----------------------------------------------------------------------------
  99. inline void DetermineFrequencyWeight(BYTE frq, DWORD* uiWeight)
  100. {
  101. switch (frq)
  102. {
  103. case frqpenInfrequent:
  104. (*uiWeight) -= 2;
  105. break;
  106. case frqpenSomewhat:
  107. (*uiWeight)--;
  108. break;
  109. case frqpenVery:
  110. (*uiWeight) += 2;
  111. break;
  112. case frqpenNormal:
  113. default:
  114. (*uiWeight)++;
  115. break;
  116. }
  117. }
  118. //+---------------------------------------------------------------------------
  119. //
  120. // Class: CThaiTrieIter
  121. //
  122. // Synopsis: Constructor - initialize local variables
  123. //
  124. // Arguments:
  125. //
  126. // Modifies:
  127. //
  128. // History: created 7/99 aarayas
  129. //
  130. // Notes:
  131. //
  132. //----------------------------------------------------------------------------
  133. CThaiBreakTree::CThaiBreakTree() : iNodeIndex(0), iNumNode(0),
  134. pszBegin(NULL), pszEnd(NULL),
  135. breakTree(NULL), breakArray(NULL),
  136. tagArray(NULL), maximalMatchingBreakArray(NULL),
  137. maximalMatchingTAGArray(NULL),
  138. POSArray(NULL), maximalMatchingPOSArray(NULL)
  139. {
  140. // Allocate memory need for CThaiBreakTree.
  141. #if defined (NGRAM_ENABLE)
  142. breakTree = new ThaiBreakNode[MAXTHAIBREAKNODE];
  143. #endif
  144. breakArray = new BYTE[MAXBREAK];
  145. tagArray = new DWORD[MAXBREAK];
  146. POSArray = new WCHAR[MAXBREAK];
  147. }
  148. //+---------------------------------------------------------------------------
  149. //
  150. // Class: CThaiTrieIter
  151. //
  152. // Synopsis: Destructor - clean up code
  153. //
  154. // Arguments:
  155. //
  156. // Modifies:
  157. //
  158. // History: created 7/99 aarayas
  159. //
  160. // Notes:
  161. //
  162. //----------------------------------------------------------------------------
  163. CThaiBreakTree::~CThaiBreakTree()
  164. {
  165. // Clean up all memory used.
  166. #if defined (NGRAM_ENABLE)
  167. if (breakTree)
  168. delete breakTree;
  169. if (maximalMatchingBreakArray)
  170. delete maximalMatchingBreakArray;
  171. if (maximalMatchingTAGArray)
  172. delete maximalMatchingTAGArray;
  173. if (maximalMatchingPOSArray)
  174. delete maximalMatchingPOSArray;
  175. #endif
  176. if (breakArray)
  177. delete breakArray;
  178. if (tagArray)
  179. delete tagArray;
  180. if (POSArray)
  181. delete POSArray;
  182. }
  183. //+---------------------------------------------------------------------------
  184. //
  185. // Class: CThaiBreakTree
  186. //
  187. // Synopsis: Associate the class to the string.
  188. //
  189. // Arguments:
  190. //
  191. // Modifies:
  192. //
  193. // History: created 7/99 aarayas
  194. //
  195. // Notes:
  196. //
  197. //----------------------------------------------------------------------------
  198. #if defined (NGRAM_ENABLE)
  199. void CThaiBreakTree::Init(CTrie* pTrie, CTrie* pSentTrie, CTrie* pTrigramTrie)
  200. #else
  201. void CThaiBreakTree::Init(CTrie* pTrie, CTrie* pTrigramTrie)
  202. #endif
  203. {
  204. assert(pTrie != NULL);
  205. thaiTrieIter.Init(pTrie);
  206. thaiTrieIter1.Init(pTrie);
  207. #if defined (NGRAM_ENABLE)
  208. assert(pSentTrie != NULL);
  209. thaiSentIter.Init(pSentTrie);
  210. #endif
  211. assert(pTrigramTrie != NULL);
  212. thaiTrigramIter.Init(pTrigramTrie);
  213. }
  214. #if defined (NGRAM_ENABLE)
  215. //+---------------------------------------------------------------------------
  216. //
  217. // Class: CThaiBreakTree
  218. //
  219. // Synopsis: reset iterator to top of the tree
  220. //
  221. // Arguments:
  222. //
  223. // Modifies:
  224. //
  225. // History: created 7/99 aarayas
  226. //
  227. // Notes:
  228. //
  229. //----------------------------------------------------------------------------
  230. inline void CThaiBreakTree::Reset()
  231. {
  232. iNodeIndex = 0;
  233. }
  234. //+---------------------------------------------------------------------------
  235. //
  236. // Class: CThaiBreakTree
  237. //
  238. // Synopsis: Move to the next break.
  239. //
  240. // Arguments:
  241. //
  242. // Modifies:
  243. //
  244. // History: created 7/99 aarayas
  245. //
  246. // Notes:
  247. //
  248. //----------------------------------------------------------------------------
  249. inline bool CThaiBreakTree::MoveNext()
  250. {
  251. iNodeIndex = breakTree[iNodeIndex].NextBreak;
  252. return (iNodeIndex != 0);
  253. }
  254. //+---------------------------------------------------------------------------
  255. //
  256. // Class: CThaiBreakTree
  257. //
  258. // Synopsis: Move down to next level.
  259. //
  260. // Arguments:
  261. //
  262. // Modifies:
  263. //
  264. // History: created 7/99 aarayas
  265. //
  266. // Notes:
  267. //
  268. //----------------------------------------------------------------------------
  269. inline bool CThaiBreakTree::MoveDown()
  270. {
  271. iNodeIndex = breakTree[iNodeIndex].Down;
  272. return (iNodeIndex != 0);
  273. }
  274. //+---------------------------------------------------------------------------
  275. //
  276. // Class: CThaiBreakTree
  277. //
  278. // Synopsis: create new node to position, and return index to the node.
  279. //
  280. // * return Unable to Create Node.
  281. //
  282. // Arguments:
  283. //
  284. // Modifies:
  285. //
  286. // History: created 7/99 aarayas
  287. //
  288. // Notes:
  289. //
  290. //----------------------------------------------------------------------------
  291. inline unsigned int CThaiBreakTree::CreateNode(int iPos, BYTE iBreakLen, DWORD dwTAG)
  292. {
  293. assert(iNumNode < MAXTHAIBREAKNODE);
  294. breakTree[iNumNode].iPos = iPos;
  295. breakTree[iNumNode].iBreakLen = iBreakLen;
  296. breakTree[iNumNode].dwTAG = dwTAG;
  297. breakTree[iNumNode].NextBreak = 0;
  298. breakTree[iNumNode].Down = 0;
  299. if (iNumNode >= MAXTHAIBREAKNODE)
  300. {
  301. return UNABLETOCREATENODE;
  302. }
  303. iNumNode++;
  304. return (iNumNode - 1);
  305. }
  306. //+---------------------------------------------------------------------------
  307. //
  308. // Class: CThaiBreakTree
  309. //
  310. // Synopsis: Generate a Tree of possible break from the given string.
  311. //
  312. // * Note - false if there aren't enough memory to create node.
  313. //
  314. // Arguments:
  315. //
  316. // Modifies:
  317. //
  318. // History: created 7/99 aarayas
  319. //
  320. // Notes:
  321. //
  322. //----------------------------------------------------------------------------
  323. enum thai_parse_state {
  324. END_SENTENCE, // Reached the end of sentence.
  325. LONGEST_MATCH, // Longest possible matched.
  326. NOMATCH_FOUND, // Unable to find word.
  327. ERROR_OUTMEMORY, // Out of Memory.
  328. };
  329. bool CThaiBreakTree::GenerateTree(WCHAR* pszBegin, WCHAR* pszEnd1)
  330. {
  331. // Declare and initialize local variables.
  332. unsigned int iIndexBreakTree = 0;
  333. unsigned int iPrevIndexBreakTree = 0;
  334. unsigned int iParentNode = 0;
  335. WCHAR* pszBeginWord = pszBegin;
  336. WCHAR* pszIndex = pszBegin;
  337. unsigned int iNumCluster = 1;
  338. unsigned int iNumLastCluster;
  339. unsigned int iWordLen = 0;
  340. unsigned int iNodeAnalyze = 0;
  341. thai_parse_state parseState = END_SENTENCE;
  342. bool fFoundMatch = false;
  343. bool fAddToNodeAnalyze = false;
  344. bool fDoneGenerateTree = false;
  345. pszEnd = pszEnd1;
  346. #if defined (_DEBUG)
  347. memset(breakTree,0,sizeof(ThaiBreakNode)*MAXTHAIBREAKNODE);
  348. #endif
  349. iNodeIndex = 0;
  350. iNumNode = 0;
  351. while (true)
  352. {
  353. // Reset Iterator for generating break for new word.
  354. fFoundMatch = false;
  355. thaiTrieIter.Reset();
  356. if (iIndexBreakTree != 0)
  357. {
  358. while (true)
  359. {
  360. // If this is not the first node than set pszBeginWord after the last break.
  361. pszBeginWord = pszBegin + breakTree[iNodeAnalyze].iPos + breakTree[iNodeAnalyze].iBreakLen;
  362. fAddToNodeAnalyze = true;
  363. // Are we at the end of the sentence.
  364. if ( (pszBeginWord == pszEnd) ||
  365. (breakTree[iNodeAnalyze].dwTAG == TAGPOS_PURGE) )
  366. {
  367. iNodeAnalyze++; // Move to next node.
  368. if (iNodeAnalyze >= iNumNode)
  369. {
  370. fDoneGenerateTree = true;
  371. break;
  372. }
  373. }
  374. else
  375. break;
  376. }
  377. }
  378. pszIndex = pszBeginWord;
  379. iParentNode = iNodeAnalyze;
  380. if (fDoneGenerateTree)
  381. break;
  382. // Get next level of tree.
  383. while (TRUE)
  384. {
  385. iNumLastCluster = iNumCluster;
  386. iNumCluster = GetCluster(pszIndex);
  387. if (thaiTrieIter.MoveCluster(pszIndex, iNumCluster))
  388. {
  389. pszIndex += iNumCluster;
  390. if (thaiTrieIter.fWordEnd)
  391. {
  392. fFoundMatch = true;
  393. // if first node add first node
  394. if (iIndexBreakTree == 0)
  395. {
  396. CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, thaiTrieIter.dwTag);
  397. iIndexBreakTree++;
  398. }
  399. else
  400. {
  401. if (fAddToNodeAnalyze)
  402. {
  403. fAddToNodeAnalyze = false;
  404. breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, thaiTrieIter.dwTag);
  405. // Determine if an error has occur.
  406. if (breakTree[iNodeAnalyze].NextBreak == UNABLETOCREATENODE)
  407. {
  408. breakTree[iNodeAnalyze].NextBreak = 0;
  409. parseState = ERROR_OUTMEMORY;
  410. break;
  411. }
  412. iPrevIndexBreakTree = breakTree[iNodeAnalyze].NextBreak;
  413. iNodeAnalyze++;
  414. }
  415. else
  416. {
  417. breakTree[iPrevIndexBreakTree].Down = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, thaiTrieIter.dwTag);
  418. // Determine if an error has occur.
  419. if (breakTree[iPrevIndexBreakTree].Down == UNABLETOCREATENODE)
  420. {
  421. breakTree[iPrevIndexBreakTree].Down = 0;
  422. parseState = ERROR_OUTMEMORY;
  423. break;
  424. }
  425. iPrevIndexBreakTree = iIndexBreakTree;
  426. }
  427. iIndexBreakTree++;
  428. }
  429. }
  430. if (pszIndex >= pszEnd)
  431. {
  432. assert(pszIndex <= pszEnd); // assert should never come up - if it appear likely bug in GetCluster funciton.
  433. parseState = END_SENTENCE;
  434. break;
  435. }
  436. }
  437. else
  438. {
  439. if (fFoundMatch)
  440. parseState = LONGEST_MATCH;
  441. else
  442. parseState = NOMATCH_FOUND;
  443. break;
  444. }
  445. }
  446. if (parseState == LONGEST_MATCH)
  447. {
  448. // We found a matched.
  449. assert(breakTree[iPrevIndexBreakTree].Down == 0); // at this point breakTree[iPreveIndexBreakTree].Down should equal null.(optimization note)
  450. if (breakTree[iParentNode].NextBreak != iPrevIndexBreakTree)
  451. {
  452. assert(breakTree[iPrevIndexBreakTree].dwTAG != TAGPOS_UNKNOWN); // shouldn't assert because the end node should ever be unknown.
  453. DeterminePurgeEndingSentence(pszBeginWord, breakTree[iParentNode].NextBreak);
  454. }
  455. }
  456. else if (parseState == NOMATCH_FOUND)
  457. {
  458. // Should mark node as unknown.
  459. if (fAddToNodeAnalyze)
  460. {
  461. fAddToNodeAnalyze = false;
  462. iWordLen = pszIndex - pszBeginWord;
  463. // Make sure we don't only have a cluster of text before making a node.
  464. if (iWordLen == 0)
  465. {
  466. // If we have an UNKNOWN word of one character only current node mark it as unknown.
  467. assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode
  468. breakTree[iNodeAnalyze].iBreakLen += iNumCluster;
  469. breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen);
  470. }
  471. else
  472. {
  473. if (breakTree[iNodeAnalyze].iBreakLen + iWordLen < 8)
  474. // The reason we are using 8 is because from corpora analysis
  475. // the average Thai word is about 7.732 characters.
  476. // TODO: We should add orthographic analysis here to get a better on boundary
  477. // of unknown word.
  478. {
  479. assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode
  480. breakTree[iNodeAnalyze].iBreakLen += iWordLen;
  481. breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen);
  482. }
  483. else
  484. {
  485. if (GetWeight(pszIndex - iNumLastCluster))
  486. breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen - iNumLastCluster, TAGPOS_UNKNOWN);
  487. else
  488. breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen, TAGPOS_UNKNOWN);
  489. // Determine if an error has occur.
  490. if (breakTree[iNodeAnalyze].NextBreak == UNABLETOCREATENODE)
  491. {
  492. breakTree[iNodeAnalyze].NextBreak = 0;
  493. parseState = ERROR_OUTMEMORY;
  494. break;
  495. }
  496. iNodeAnalyze++;
  497. iIndexBreakTree++;
  498. }
  499. }
  500. }
  501. else
  502. {
  503. breakTree[iPrevIndexBreakTree].Down = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, TAGPOS_UNKNOWN);
  504. // Determine if an error has occur.
  505. if (breakTree[iPrevIndexBreakTree].Down == UNABLETOCREATENODE)
  506. {
  507. breakTree[iPrevIndexBreakTree].Down = 0;
  508. parseState = ERROR_OUTMEMORY;
  509. break;
  510. }
  511. iIndexBreakTree++;
  512. }
  513. }
  514. else if (parseState == END_SENTENCE)
  515. {
  516. // If we find ourself at the end of a sentence and no match.
  517. if (!fFoundMatch)
  518. {
  519. if (fAddToNodeAnalyze)
  520. {
  521. fAddToNodeAnalyze = false;
  522. iWordLen = pszIndex - pszBeginWord;
  523. // Make sure we don't only have a cluster of text before making a node.
  524. if (iWordLen == 0)
  525. {
  526. // If we have an UNKNOWN word of one character only current node mark it as unknown.
  527. assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode
  528. breakTree[iNodeAnalyze].iBreakLen += iNumCluster;
  529. breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen);
  530. }
  531. else
  532. {
  533. if (breakTree[iNodeAnalyze].iBreakLen + iWordLen < 8)
  534. // The reason we are using 8 is because from corpora analysis
  535. // the average Thai word is about 7.732 characters.
  536. // TODO: We should add orthographic analysis here to get a better on boundary
  537. // of unknown word.
  538. {
  539. assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode
  540. breakTree[iNodeAnalyze].iBreakLen += iWordLen;
  541. breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen);
  542. }
  543. else
  544. {
  545. if (GetWeight(pszIndex - iNumLastCluster))
  546. breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen - iNumLastCluster, TAGPOS_UNKNOWN);
  547. else
  548. breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen, TAGPOS_UNKNOWN);
  549. // Determine if an error has occur.
  550. if (breakTree[iNodeAnalyze].NextBreak == UNABLETOCREATENODE)
  551. {
  552. breakTree[iNodeAnalyze].NextBreak = 0;
  553. parseState = ERROR_OUTMEMORY;
  554. break;
  555. }
  556. iNodeAnalyze++;
  557. iIndexBreakTree++;
  558. }
  559. }
  560. }
  561. else
  562. {
  563. breakTree[iPrevIndexBreakTree].Down = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, TAGPOS_UNKNOWN);
  564. // Determine if an error has occur.
  565. if (breakTree[iPrevIndexBreakTree].Down == UNABLETOCREATENODE)
  566. {
  567. breakTree[iPrevIndexBreakTree].Down = 0;
  568. parseState = ERROR_OUTMEMORY;
  569. break;
  570. }
  571. }
  572. iIndexBreakTree++;
  573. }
  574. // If the beginning of node the branch isn't equal to leaf node perphase it is possible to
  575. // do some ending optimization.
  576. else if (breakTree[iParentNode].NextBreak != iPrevIndexBreakTree)
  577. {
  578. assert(breakTree[iPrevIndexBreakTree].dwTAG != TAGPOS_UNKNOWN); // shouldn't assert because the end node should ever be unknown.
  579. DeterminePurgeEndingSentence(pszBeginWord, breakTree[iParentNode].NextBreak);
  580. }
  581. }
  582. else if ( (breakTree[iNodeAnalyze].iBreakLen == 0) || (parseState == ERROR_OUTMEMORY) )
  583. break;
  584. }
  585. return (parseState != ERROR_OUTMEMORY);
  586. }
  587. //+---------------------------------------------------------------------------
  588. //
  589. // Class: CThaiBreakTree
  590. //
  591. // Synopsis: Traverse all the tree and look for the least number of token.
  592. //
  593. // Arguments:
  594. //
  595. // Modifies:
  596. //
  597. // History: created 7/99 aarayas
  598. //
  599. // Notes:
  600. //
  601. //----------------------------------------------------------------------------
  602. bool CThaiBreakTree::MaximalMatching()
  603. {
  604. // If maximal matching break array has not been allocate, than allocate it.
  605. if (!maximalMatchingBreakArray)
  606. maximalMatchingBreakArray = new BYTE[MAXBREAK];
  607. if (!maximalMatchingTAGArray)
  608. maximalMatchingTAGArray = new DWORD[MAXBREAK];
  609. if (!maximalMatchingPOSArray)
  610. maximalMatchingPOSArray = new WCHAR[MAXBREAK];
  611. maxLevel = MAXUNSIGNEDINT;
  612. maxToken = 0;
  613. iNumUnknownMaximalPOSArray = MAXBREAK;
  614. Traverse(0,0,0);
  615. return true;
  616. }
  617. //+---------------------------------------------------------------------------
  618. //
  619. // Class: CThaiBreakTree
  620. //
  621. // Synopsis: The function determine if the node if the node should,
  622. // be tag as unknown or purge.
  623. //
  624. // Arguments:
  625. //
  626. // Modifies:
  627. //
  628. // History: created 8/99 aarayas
  629. //
  630. // Notes:
  631. //
  632. //----------------------------------------------------------------------------
  633. inline DWORD CThaiBreakTree::DeterminePurgeOrUnknown(unsigned int iCurrentNode, unsigned int iBreakLen)
  634. {
  635. // Declare and initialize local variables.
  636. unsigned int iNode = breakTree[iCurrentNode].Down;
  637. while (iNode != 0)
  638. {
  639. if ( (breakTree[iNode].iBreakLen == iBreakLen) ||
  640. (breakTree[iNode].iBreakLen < iBreakLen) &&
  641. ( (breakTree[iNode].dwTAG != TAGPOS_UNKNOWN) ||
  642. (breakTree[iNode].dwTAG != TAGPOS_PURGE) ))
  643. {
  644. // Since we are purging this break just make sure the NextBreak is Null.
  645. assert(breakTree[iCurrentNode].NextBreak == 0);
  646. return TAGPOS_PURGE;
  647. }
  648. iNode = breakTree[iNode].Down;
  649. }
  650. return TAGPOS_UNKNOWN;
  651. }
  652. //+---------------------------------------------------------------------------
  653. //
  654. // Class: CThaiBreakTree
  655. //
  656. // Synopsis: Ending optimization - if we have found the end of a sentence,
  657. // and possible break. Purge the branch for unnecessary break.
  658. //
  659. // Arguments:
  660. //
  661. // Modifies:
  662. //
  663. // History: created 8/99 aarayas
  664. //
  665. // Notes:
  666. //
  667. //----------------------------------------------------------------------------
  668. inline void CThaiBreakTree::DeterminePurgeEndingSentence(WCHAR* pszBeginWord, unsigned int iNode)
  669. {
  670. while (breakTree[iNode].Down != 0)
  671. {
  672. // Determine if the next string has a possiblity to become a word.
  673. // TODO: We may need to change this once the GetWeight add soundex
  674. // functionality.
  675. if (GetWeight(pszBeginWord + breakTree[iNode].iBreakLen) == 0)
  676. {
  677. // Since we are purging this break just make sure the NextBreak is Null.
  678. assert(breakTree[iNode].NextBreak == 0);
  679. breakTree[iNode].dwTAG = TAGPOS_PURGE;
  680. }
  681. iNode = breakTree[iNode].Down;
  682. }
  683. }
  684. #endif
  685. //+---------------------------------------------------------------------------
  686. //
  687. // Class: CThaiBreakTree
  688. //
  689. // Synopsis:
  690. //
  691. // Arguments:
  692. //
  693. // Modifies:
  694. //
  695. // History: created 8/99 aarayas
  696. //
  697. // Notes:
  698. //
  699. //----------------------------------------------------------------------------
  700. unsigned int CThaiBreakTree::GetWeight(WCHAR* pszBegin)
  701. {
  702. // Declare and initialize local variables.
  703. unsigned int iNumCluster = 1;
  704. unsigned int Weight = 0;
  705. bool fBeginNewWord;
  706. WCHAR* pszIndex = pszBegin;
  707. // Short circuit the length is less of string is less than 1.
  708. if ((pszEnd - pszBegin) == 1)
  709. return Weight;
  710. else if (pszEnd == pszBegin)
  711. return 1000;
  712. // Reset Iterator for generating break for new word.
  713. fBeginNewWord = true;
  714. // Get next level of tree.
  715. while (true)
  716. {
  717. iNumCluster = GetCluster(pszIndex);
  718. if (thaiTrieIter.MoveCluster(pszIndex, iNumCluster, fBeginNewWord))
  719. {
  720. fBeginNewWord = false;
  721. pszIndex += iNumCluster;
  722. if (thaiTrieIter.fWordEnd)
  723. Weight = (unsigned int) (pszIndex - pszBegin);
  724. }
  725. else
  726. break;
  727. }
  728. return Weight;
  729. }
  730. //+---------------------------------------------------------------------------
  731. //
  732. // Class: CThaiBreakTree
  733. //
  734. // Synopsis:
  735. //
  736. // Arguments:
  737. //
  738. // Modifies:
  739. //
  740. // History: created 8/99 aarayas
  741. //
  742. // Notes:
  743. //
  744. //----------------------------------------------------------------------------
  745. unsigned int CThaiBreakTree::GetWeight(WCHAR* pszBegin, DWORD* pdwTag)
  746. {
  747. // Declare and initialize local variables.
  748. unsigned int iNumCluster = 1;
  749. unsigned int Weight = 0;
  750. bool fBeginNewWord;
  751. WCHAR* pszIndex = pszBegin;
  752. // Short circuit the length is less of string is less than 1.
  753. if ((pszEnd - pszBegin) == 1)
  754. return Weight;
  755. else if (pszEnd == pszBegin)
  756. return 1000;
  757. // Reset Iterator for generating break for new word.
  758. fBeginNewWord = true;
  759. // Get next level of tree.
  760. while (true)
  761. {
  762. iNumCluster = GetCluster(pszIndex);
  763. if (thaiTrieIter.MoveCluster(pszIndex, iNumCluster, fBeginNewWord))
  764. {
  765. fBeginNewWord = false;
  766. pszIndex += iNumCluster;
  767. if (thaiTrieIter.fWordEnd)
  768. {
  769. Weight = (unsigned int) (pszIndex - pszBegin);
  770. *pdwTag = thaiTrieIter.dwTag;
  771. }
  772. }
  773. else
  774. break;
  775. }
  776. return Weight;
  777. }
  778. //+---------------------------------------------------------------------------
  779. //
  780. // Class: CThaiBreakTree
  781. //
  782. // Synopsis: Traverse the tree.
  783. //
  784. // Arguments:
  785. //
  786. // Modifies:
  787. //
  788. // History: created 7/99 aarayas
  789. //
  790. // Notes:
  791. //
  792. //----------------------------------------------------------------------------
  793. bool CThaiBreakTree::Traverse(unsigned int iLevel, unsigned int iCurrentNode, unsigned int iNumUnknown)
  794. {
  795. assert (iLevel < MAXBREAK);
  796. // Process node.
  797. breakArray[iLevel] = breakTree[iCurrentNode].iBreakLen;
  798. tagArray[iLevel] = breakTree[iCurrentNode].dwTAG;
  799. if (tagArray[iLevel] == TAGPOS_UNKNOWN)
  800. iNumUnknown++;
  801. // Have we found the end of the sentence.
  802. if (breakTree[iCurrentNode].NextBreak == 0)
  803. {
  804. if (breakTree[iCurrentNode].dwTAG != TAGPOS_PURGE)
  805. AddBreakToList(iLevel + 1, iNumUnknown);
  806. if (breakTree[iCurrentNode].Down != 0)
  807. {
  808. if (tagArray[iLevel] == TAGPOS_UNKNOWN)
  809. iNumUnknown--;
  810. return Traverse(iLevel,breakTree[iCurrentNode].Down, iNumUnknown);
  811. }
  812. else
  813. return true;
  814. }
  815. else
  816. Traverse(iLevel + 1, breakTree[iCurrentNode].NextBreak, iNumUnknown);
  817. if (breakTree[iCurrentNode].Down != 0)
  818. {
  819. if (tagArray[iLevel] == TAGPOS_UNKNOWN)
  820. iNumUnknown--;
  821. Traverse(iLevel,breakTree[iCurrentNode].Down, iNumUnknown);
  822. }
  823. return true;
  824. }
  825. //+---------------------------------------------------------------------------
  826. //
  827. // Class: CThaiBreakTree
  828. //
  829. // Synopsis:
  830. //
  831. // Arguments:
  832. //
  833. // Modifies:
  834. //
  835. // History: created 8/99 aarayas
  836. //
  837. // Notes:
  838. //
  839. //----------------------------------------------------------------------------
  840. unsigned int CThaiBreakTree::SoundexSearch(WCHAR* pszBegin)
  841. {
  842. // Declare and initialize local variables.
  843. unsigned int iNumCluster = 1;
  844. unsigned int iNumNextCluster = 1;
  845. unsigned int iLongestWord = 0;
  846. unsigned int iPenalty = 0;
  847. WCHAR* pszIndex = pszBegin;
  848. // Short circuit the length is less of string is less than 1.
  849. if ( (pszBegin+1) >= pszEnd )
  850. return iLongestWord;
  851. // Reset Iterator for generating break for new word.
  852. thaiTrieIter1.Reset();
  853. // Get next level of tree.
  854. while (true)
  855. {
  856. iNumCluster = GetCluster(pszIndex);
  857. // Determine iNumNextCluster let iNumNextCluster = 0, if we reached the end of string.
  858. if (pszIndex + iNumCluster >= pszEnd)
  859. iNumNextCluster = 0;
  860. else
  861. iNumNextCluster = GetCluster(pszIndex+iNumCluster);
  862. // Determine penalty
  863. switch (thaiTrieIter1.MoveSoundexByCluster(pszIndex, iNumCluster, iNumNextCluster))
  864. {
  865. case SUBSTITUTE_SOUNDLIKECHAR:
  866. iPenalty += 2;
  867. break;
  868. case SUBSTITUTE_DIACRITIC:
  869. iPenalty++;
  870. break;
  871. case UNABLE_TO_MOVE:
  872. iPenalty += 2;
  873. break;
  874. default:
  875. case NOSUBSTITUTE:
  876. break;
  877. }
  878. // Update Index.
  879. if (iPenalty <= 2)
  880. {
  881. pszIndex += iNumCluster;
  882. if (thaiTrieIter1.fWordEnd)
  883. iLongestWord = (unsigned int) (pszIndex - pszBegin);
  884. }
  885. else
  886. break;
  887. }
  888. return iLongestWord;
  889. }
  890. //+---------------------------------------------------------------------------
  891. //
  892. // Class: CThaiBreakTree
  893. //
  894. // Synopsis: The information used here is a reference to the orthographic
  895. // analysis work done on the Thai languages. (see paper: Natural
  896. // Language Processing in Thailand 1993 Chulalongkorn. p 361).
  897. //
  898. // Arguments: pszBoundaryChar - Contain pointer to at least two thai character
  899. // character next to each other which we will
  900. // use to calculate wheather we should or
  901. // should not merge the two word.
  902. //
  903. // iPrevWordLen -
  904. //
  905. // Modifies:
  906. //
  907. // History: created 8/99 aarayas
  908. //
  909. // Notes:
  910. //
  911. //----------------------------------------------------------------------------
  912. inline bool CThaiBreakTree::ShouldMerge(WCHAR* pwszPrevWord, unsigned int iPrevWordLen, unsigned int iMergeWordLen, DWORD dwPrevTag)
  913. {
  914. WCHAR* pwszBoundary = pwszPrevWord + iPrevWordLen - 1;
  915. assert(iMergeWordLen != 0);
  916. assert(iPrevWordLen != 0);
  917. // There are very few words in Thai that are 4 character or less, therefore we should
  918. // found a pair that less than 4 character we should merge.
  919. // Or if merge word length is one than also merge.
  920. // Of if last cluster of the word is a Thanthakhat(Karan) we should always merge.
  921. if (iPrevWordLen + iMergeWordLen <= 4 || iMergeWordLen == 1 ||
  922. (iMergeWordLen == 2 && *(pwszBoundary + iMergeWordLen) == THAI_Thanthakhat))
  923. return true;
  924. if (iPrevWordLen >=2)
  925. {
  926. WCHAR* pwszPrevCharBoundary = pwszBoundary - 1;
  927. // TO IMPROVE: It better to check the last character of Previous word, it can give us a
  928. // much better guess
  929. if ((*pwszPrevCharBoundary == THAI_Vowel_Sign_Mai_HanAkat || *pwszBoundary == THAI_Vowel_Sign_Mai_HanAkat) ||
  930. (*pwszPrevCharBoundary == THAI_Tone_Mai_Tri || *pwszBoundary == THAI_Tone_Mai_Tri) ||
  931. (*pwszPrevCharBoundary == THAI_Sara_Ue || *pwszBoundary == THAI_Sara_Ue) )
  932. return true;
  933. }
  934. // If the first character of the next word is mostly likly the beginning
  935. // character and last character of the previous word is not sara-A than
  936. // we have a high probability that we found a begin of word boundary,
  937. // therefore we shouldn't merge.
  938. if ( (IsThaiMostlyBeginCharacter(pwszBoundary[1]) && *pwszBoundary != THAI_Vowel_Sara_A) )
  939. return false;
  940. // If the last character of the previous word is mostly likely an ending
  941. // character than, than there is a high probability that the found a boundary.
  942. // There are very few words in Thai that are 4 character or less, therefore we should
  943. // found a pair that less than 4 character we should merge.
  944. if (IsThaiMostlyLastCharacter(*pwszBoundary))
  945. return false;
  946. // The reason we are using 8 is because from corpora analysis
  947. // the average Thai word is about 7.732 characters. Or, if previous word is already
  948. // an unknown, to keep the amount of unknown low the unknown to previous words.
  949. if ( (iPrevWordLen + iMergeWordLen < 8) || (dwPrevTag == TAGPOS_UNKNOWN) )
  950. return true;
  951. return false;
  952. }
  953. //+---------------------------------------------------------------------------
  954. //
  955. // Class: CThaiBreakTree
  956. //
  957. // Synopsis:
  958. //
  959. // Arguments:
  960. //
  961. // Modifies:
  962. //
  963. // History: created 7/99 aarayas
  964. // 8/17/99 optimize some code.
  965. //
  966. // Notes:
  967. //
  968. //----------------------------------------------------------------------------
  969. inline void CThaiBreakTree::AddBreakToList(unsigned int iNumBreak, unsigned int iNumUnknown)
  970. {
  971. #if defined (_DEBUG)
  972. breakArray[iNumBreak] = 0;
  973. #endif
  974. if (CompareSentenceStructure(iNumBreak, iNumUnknown))
  975. {
  976. maxToken = maxLevel = iNumBreak; // This is ugly but it save 5 clock cycle.
  977. memcpy(maximalMatchingBreakArray,breakArray,maxToken);
  978. memcpy(maximalMatchingTAGArray,tagArray,sizeof(unsigned int)*maxToken);
  979. maximalMatchingBreakArray[maxToken] = 0;
  980. maximalMatchingTAGArray[maxToken] = 0;
  981. }
  982. }
  983. //+---------------------------------------------------------------------------
  984. //
  985. // Class: CThaiBreakTree
  986. //
  987. // Synopsis: The function compares sentence structure of
  988. // maximalMatchingPOSArray with posArray.
  989. //
  990. // Arguments:
  991. //
  992. // Modifies:
  993. //
  994. // History: created 7/99 aarayas
  995. //
  996. // Notes:
  997. //
  998. //----------------------------------------------------------------------------
  999. inline bool CThaiBreakTree::CompareSentenceStructure(unsigned int iNumBreak, unsigned int iNumUnknownPOSArray)
  1000. {
  1001. if ( (iNumBreak < maxLevel) && (iNumUnknownMaximalPOSArray >= iNumUnknownPOSArray) )
  1002. {
  1003. iNumUnknownMaximalPOSArray = iNumUnknownPOSArray;
  1004. return true;
  1005. }
  1006. else if (iNumBreak == maxLevel)
  1007. {
  1008. // true - maximal matching has a larger unknown.
  1009. if (iNumUnknownMaximalPOSArray > iNumUnknownPOSArray)
  1010. {
  1011. iNumUnknownMaximalPOSArray = iNumUnknownPOSArray;
  1012. return true;
  1013. }
  1014. for(unsigned int i = 0; i <= iNumBreak; i++)
  1015. {
  1016. maximalMatchingPOSArray[i] = ExtractPOS(maximalMatchingTAGArray[i]);
  1017. POSArray[i] = ExtractPOS(tagArray[i]);
  1018. }
  1019. // Determine if the sentence structure is like any one of the sentence
  1020. // sentence structure in our corpora.
  1021. if ( (IsSentenceStruct(POSArray, iNumBreak)) &&
  1022. (!IsSentenceStruct(maximalMatchingPOSArray, iNumBreak)) )
  1023. {
  1024. iNumUnknownMaximalPOSArray = iNumUnknownPOSArray;
  1025. return true;
  1026. }
  1027. else if (iNumUnknownMaximalPOSArray == iNumUnknownPOSArray)
  1028. {
  1029. // Determine the frequency of word used in the sentence.
  1030. unsigned int iFrequencyArray = 500;
  1031. unsigned int iFrequencyMaximalArray = 500;
  1032. for(unsigned int i = 0; i <= iNumBreak; i++)
  1033. {
  1034. DetermineFrequencyWeight(ExtractFrq(maximalMatchingTAGArray[i]),&iFrequencyMaximalArray);
  1035. DetermineFrequencyWeight(ExtractFrq(tagArray[i]),&iFrequencyArray);
  1036. }
  1037. return (iFrequencyArray > iFrequencyMaximalArray);
  1038. }
  1039. }
  1040. return false;
  1041. }
  1042. //+---------------------------------------------------------------------------
  1043. //
  1044. // Class: CThaiBreakTree
  1045. //
  1046. // Synopsis:
  1047. //
  1048. // Arguments:
  1049. //
  1050. // Modifies:
  1051. //
  1052. // History: created 8/99 aarayas
  1053. //
  1054. // Notes:
  1055. //
  1056. //----------------------------------------------------------------------------
  1057. bool CThaiBreakTree::IsSentenceStruct(WCHAR* pos, unsigned int iPosLen)
  1058. {
  1059. // Declare and initialize all local variables.
  1060. unsigned int i = 0;
  1061. thaiSentIter.Reset();
  1062. if (!thaiSentIter.Down())
  1063. return FALSE;
  1064. while (TRUE)
  1065. {
  1066. thaiSentIter.GetNode();
  1067. if (thaiSentIter.pos == pos[i])
  1068. {
  1069. i++;
  1070. if (thaiSentIter.fWordEnd && i == iPosLen)
  1071. {
  1072. return TRUE;
  1073. }
  1074. else if (i == iPosLen) break;
  1075. // Move down the Trie Branch.
  1076. else if (!thaiSentIter.Down()) break;
  1077. }
  1078. // Move right of the Trie Branch
  1079. else if (!thaiSentIter.Right()) break;
  1080. }
  1081. return FALSE;
  1082. }
  1083. //+---------------------------------------------------------------------------
  1084. //
  1085. // Class: CThaiBreakTree
  1086. //
  1087. // Synopsis:
  1088. //
  1089. // Arguments:
  1090. //
  1091. // Modifies:
  1092. //
  1093. // History: created 8/99 aarayas
  1094. //
  1095. // Notes:
  1096. //
  1097. //----------------------------------------------------------------------------
  1098. float CThaiBreakTree::BigramProbablity(DWORD dwTag1,DWORD dwTag2)
  1099. {
  1100. unsigned int iWeight = 4;
  1101. // TODO : Use the distribution of word category to determine optimial search - exmaple
  1102. // NOUN VERB ADVERB CLASSIFIER CONJETURE PREP et....
  1103. // TODO : Once we got trigram use it to create bigram probability as well.
  1104. if ( (dwTag1 != TAGPOS_UNKNOWN) &&
  1105. (dwTag2 != TAGPOS_UNKNOWN) )
  1106. {
  1107. WCHAR pos1 = ExtractPOS(dwTag1);
  1108. WCHAR pos2 = ExtractPOS(dwTag2);
  1109. // case NCMN VATT
  1110. /// a common noun is often followed by attributive verb(adjective)
  1111. // Example: (In Thai) book good, people nice
  1112. if (pos1 == 5 && pos2 == 13)
  1113. iWeight += 10;
  1114. // case NTTL NPRP
  1115. // a title noun is often followed by proper noun
  1116. // Example: Dr. Athapan, Mr. Sam
  1117. else if (pos1 == 6 && pos2 == 1)
  1118. iWeight += 5;
  1119. // case JSBR (XVAM || VSTA)
  1120. // a subordinating conjunction is often followed by preverb auxillary or Active verb
  1121. // Example: (In Thai) Because of , Because see
  1122. else if (pos1 == 39 && (pos2 == 15 || pos2 == 12))
  1123. iWeight += 10;
  1124. // case ADVN NCMN
  1125. // a Adverb normal form is often followed by Common noun (Bug 55057).
  1126. // Example: (In Thai) under table.
  1127. else if (pos1 == 28 && pos2 == 5)
  1128. iWeight += 5;
  1129. // case VACT XVAE
  1130. else if (pos1 == 11 && pos2 == 18)
  1131. iWeight += 5;
  1132. // case VACT DDBQ
  1133. // Active verb follow by Definite determiner.
  1134. // Example: (In Thai) working for, singing again.
  1135. else if (pos1 == 11 && pos2 == 21)
  1136. iWeight += 10;
  1137. // case XVAE VACT
  1138. // a post verb auxilliary are often followed by an active verb.
  1139. // Example: (In Thai) come singing, go work.
  1140. else if (pos1 == 18 && pos2 == 11)
  1141. iWeight += 10;
  1142. // case CLTV NCMN
  1143. // a Collective classfier are often followed by Common Noun
  1144. // Example: (In Thai) group people, flock bird
  1145. else if (pos1 == 33 && pos2 == 5)
  1146. iWeight += 5;
  1147. // case NEG (VACT || VSTA || VATT || XVAM || XVAE)
  1148. // a negator (ie. not) is often followed by some kind of VERB.
  1149. // Example: He is not going.
  1150. else if (pos1 == 46 && (pos2 == 11 || pos2 == 12 || pos2 == 13 || pos2 == 15 || pos2 == 16))
  1151. iWeight += 8;
  1152. // case EAFF or EITT
  1153. // Ending for affirmative, and interrogative are more often ending of the pair
  1154. // Example: (In Thai) Krub, Ka,
  1155. else if (pos2 == 44 || pos2 == 45)
  1156. iWeight += 3;
  1157. // case VATT and VATT
  1158. // Attributive Verb and Attributive Verb occur when often in spoken laguages.
  1159. // Example: she is reall really cute.
  1160. else if (pos1 == 13 && pos2 == 13)
  1161. iWeight += 2;
  1162. // case NCMN and DDAC
  1163. // Common Noun and Definitive determiner classifier.
  1164. // Example: Food here (Thai)
  1165. else if (pos1 == 5 && pos2 == 20)
  1166. iWeight += 3;
  1167. // case CMTR and JCMP
  1168. // Measurement classifier and Comparative conjunction, are likly to appear in Thai.
  1169. // Example: year about (Thai) -> English about a year.
  1170. else if (pos1 == 34 && pos2 == 38)
  1171. iWeight += 5;
  1172. }
  1173. DetermineFrequencyWeight(ExtractFrq(dwTag1), &iWeight);
  1174. DetermineFrequencyWeight(ExtractFrq(dwTag2), &iWeight);
  1175. return (float) iWeight;
  1176. }
  1177. //+---------------------------------------------------------------------------
  1178. //
  1179. // Class: CThaiBreakTree
  1180. //
  1181. // Synopsis:
  1182. //
  1183. // Arguments:
  1184. //
  1185. // Modifies:
  1186. //
  1187. // History: created 8/99 aarayas
  1188. //
  1189. // Notes:
  1190. //
  1191. //----------------------------------------------------------------------------
  1192. DWORD CThaiBreakTree::TrigramProbablity(DWORD dwTag1,DWORD dwTag2,DWORD dwTag3)
  1193. {
  1194. DWORD iWeight = 6;
  1195. if ( (dwTag1 != TAGPOS_UNKNOWN) &&
  1196. (dwTag2 != TAGPOS_UNKNOWN) &&
  1197. (dwTag3 != TAGPOS_UNKNOWN) )
  1198. {
  1199. WCHAR pos1 = ExtractPOS(dwTag1);
  1200. WCHAR pos2 = ExtractPOS(dwTag2);
  1201. WCHAR pos3 = ExtractPOS(dwTag3);
  1202. WCHAR posArray[4];
  1203. posArray[0] = pos1;
  1204. posArray[1] = pos2;
  1205. posArray[2] = pos3;
  1206. posArray[3] = 0;
  1207. iWeight += thaiTrigramIter.GetProb(posArray);
  1208. /*
  1209. // TODO: We are hard coding this part until we get finish Trigram probablity analysis
  1210. // than we simply return stright look up.
  1211. if ( (pos1 == 18) && // Post verb auxiliary - XVAE
  1212. (pos2 == 5 ) && // Common Noun - NCMN
  1213. (pos3 == 18) ) // Post verb auxiliary - XVAE
  1214. {
  1215. return 70; // Return 70% probablity
  1216. }
  1217. else if ( (pos1 == 18) && // Post verb auxiliary - XVAE
  1218. (pos2 == 5 ) && // Common Noun - NCMN
  1219. (pos3 == 5 ) ) // Common Noun - NCMN
  1220. {
  1221. return 30; // Return 30%.
  1222. }
  1223. */
  1224. // iWeight = thaiTrigramIter.GetProb(pos1,pos2,pos3);
  1225. }
  1226. DetermineFrequencyWeight(ExtractFrq(dwTag1), &iWeight);
  1227. DetermineFrequencyWeight(ExtractFrq(dwTag2), &iWeight);
  1228. DetermineFrequencyWeight(ExtractFrq(dwTag3), &iWeight);
  1229. // We reached zero probablity.
  1230. return (DWORD)iWeight;
  1231. }
  1232. //+---------------------------------------------------------------------------
  1233. //
  1234. // Class: CThaiBreakTree
  1235. //
  1236. // Synopsis: Segments the text from pwchBegin up to pwchEnd1 into words.
  //           The length of each segment is written to breakArray and its tag
  //           to tagArray; the return value is the number of segments written.
  //
  //           Starting at each segment start, the dictionary trie
  //           (thaiTrieIter) is walked cluster by cluster and every position
  //           where a word ends is recorded as a candidate.  With one
  //           candidate it is taken as is.  With several, each candidate is
  //           scored by its length plus GetWeight() of the text that follows
  //           it, with bigram/trigram tag probabilities breaking ties.  With
  //           no candidate the text is treated as unknown: it is merged into
  //           the previous segment (ShouldMerge), re-matched with
  //           SoundexSearch, or stored as a TAGPOS_UNKNOWN segment.
  1237. //
  1238. // Arguments: pwchBegin - first character of the text to segment.
  //            pwchEnd1  - end of the text; stored in pszEnd and compared
  //                        against the running index as the stop position.
  1239. //
  1240. // Modifies: pszEnd, breakArray, tagArray, POSArray[0] and the state of
  //           thaiTrieIter.
  1241. //
  1242. // History: created 8/99 aarayas
  1243. //
  1244. // Notes: Segment lengths are stored as BYTE, so a length above 255 is
  //        truncated by the casts below.
  //        NOTE(review): iNextBreakIndex is never checked against MAXBREAK
  //        (the size of the local candidate arrays) and iBreakIndex is never
  //        checked against the capacity of breakArray/tagArray -- confirm
  //        that callers bound the input length.
  1245. //
  1246. //----------------------------------------------------------------------------
  1247. unsigned int CThaiBreakTree::TrigramBreak(WCHAR* pwchBegin, WCHAR* pwchEnd1)
  1248. {
  1249. // Declare and initialize local variables.
  1250. WCHAR* pwchBeginWord = pwchBegin;
  1251. WCHAR* pwchIndex = pwchBegin;
  1252. unsigned int iWordLen;
  1253. unsigned int iNumCluster = 1;
  1254. unsigned int iNumLastCluster;
  1255. unsigned int iBreakIndex = 0;
  1256. BYTE nextBreakArray[MAXBREAK];
  1257. DWORD nextTagArray[MAXBREAK];
  1258. unsigned int iNextBreakIndex; // index for array nextBreakArray and nextTagArray.
  1259. bool fFoundMatch;
  1260. unsigned int iWeight;
  1261. unsigned int iSumWeight;
  1262. unsigned int iPrevWeight;
  1263. BYTE iSoundexWordLen;
  1264. DWORD iPrevProbability;
  1265. DWORD iCurrentProbability;
  1266. DWORD dwTagTemp;
  // dwLastTag is assigned below but never read in this function.
  1267. DWORD dwLastTag;
  1268. int i; // temporary int for use as need.
  1269. bool fBeginNewWord;
  1270. bool fEndWord = false;
  // GetCluster() reads pszEnd, so it must be set before the first call.
  1271. pszEnd = pwchEnd1;
  1272. breakArray[0] = 0;
  1273. POSArray[0] = 0;
  1274. tagArray[0] = 0;
  1275. nextBreakArray[0] = 0;
  1276. nextTagArray[0] = 0;
  // Outer loop: each pass handles the text that starts at pwchBeginWord.
  1277. while (true)
  1278. {
  1279. // Reset Iterator for generating break for new word.
  1280. fFoundMatch = false;
  1281. fBeginNewWord = true;
  1282. // Get begin word string for next round of word break.
  1283. pwchIndex = pwchBeginWord;
  1284. iNextBreakIndex = 0;
  1285. if (pwchIndex == pszEnd)
  1286. break;
  // Inner loop: walk the trie one cluster at a time, recording in
  // nextBreakArray/nextTagArray every length at which a word ends.
  1287. while(true)
  1288. {
  1289. iNumLastCluster = iNumCluster;
  1290. iNumCluster = GetCluster(pwchIndex);
  1291. if (!thaiTrieIter.MoveCluster(pwchIndex, iNumCluster, fBeginNewWord))
  1292. {
  // The trie has no continuation.  An empty cluster exactly at the end of
  // the text is flagged with fEndWord; anything else ends the walk.
  1293. if ((iNumCluster == 0) && (pwchIndex == pszEnd))
  1294. fEndWord = true;
  1295. else
  1296. break;
  1297. }
  1298. fBeginNewWord = false;
  1299. pwchIndex += iNumCluster;
  1300. if (thaiTrieIter.fWordEnd)
  1301. {
  1302. if (thaiTrieIter.m_fThaiNumber)
  1303. {
  1304. // If we have Thai number accumulate it as one break.
  // Slot 0 is overwritten each time, so only the longest run is kept.
  1305. assert(iNumCluster == 1);
  1306. fFoundMatch = true;
  1307. nextBreakArray[0]= (BYTE)(pwchIndex - pwchBeginWord);
  1308. nextTagArray[0] = TAGPOS_NCNM;
  1309. iNextBreakIndex = 1;
  1310. }
  1311. else
  1312. {
  1313. fFoundMatch = true;
  1314. nextBreakArray[iNextBreakIndex] = (BYTE)(pwchIndex - pwchBeginWord);
  1315. nextTagArray[iNextBreakIndex] = thaiTrieIter.dwTag;
  1316. iNextBreakIndex++;
  1317. }
  // A word ends at the end of the text: take the most recently recorded
  // candidate as the final segment and return.
  1318. if (pwchIndex >= pszEnd)
  1319. {
  1320. assert(pwchIndex <= pszEnd); // assert should never come up - if it appear likely bug in GetCluster funciton.
  1321. assert(iNextBreakIndex != 0);
  1322. breakArray[iBreakIndex] = nextBreakArray[iNextBreakIndex - 1];
  1323. tagArray[iBreakIndex] = nextTagArray[iNextBreakIndex - 1];
  1324. return (++iBreakIndex);
  1325. }
  1326. }
  // End of text reached without any candidate (or fEndWord was set above):
  // the remaining text is stored as unknown and the function returns.
  1327. else if ((pwchIndex >= pszEnd && iNextBreakIndex == 0) || fEndWord)
  1328. {
  1329. assert(pwchIndex <= pszEnd); // assert should never come up - if it appear likely bug in GetCluster funciton.
  1330. iWordLen = (unsigned int) (pwchIndex - pwchBeginWord);
  1331. switch (iWordLen)
  1332. {
  1333. case 0:
  1334. if (iBreakIndex > 0)
  1335. {
  1336. // Nothing was consumed: add the current cluster to the previous node.
  1337. breakArray[iBreakIndex - 1] += (BYTE) iNumCluster;
  1338. tagArray[iBreakIndex - 1] = TAGPOS_UNKNOWN;
  1339. }
  1340. else
  1341. {
  1342. // if this is the first break create a new break.
  1343. breakArray[iBreakIndex] = (BYTE) iNumCluster;
  1344. tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
  1345. iBreakIndex++;
  1346. }
  1347. break;
  1348. case 1:
  1349. if (iBreakIndex > 0)
  1350. {
  1351. // if We have a length of one character add it to previous node.
  1352. breakArray[iBreakIndex - 1] += (BYTE) iWordLen;
  1353. tagArray[iBreakIndex - 1] = TAGPOS_UNKNOWN;
  1354. }
  1355. else
  1356. {
  1357. // if this is the first break create a new break.
  1358. breakArray[iBreakIndex] = (BYTE) iWordLen;
  1359. tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
  1360. iBreakIndex++;
  1361. }
  1362. break;
  1363. default:
  // Two or more characters: let ShouldMerge decide between extending the
  // previous segment and starting a new unknown segment.
  1364. if ( iBreakIndex > 0 &&
  1365. ShouldMerge(pwchBeginWord - breakArray[iBreakIndex - 1], breakArray[iBreakIndex - 1],
  1366. iWordLen , tagArray[iBreakIndex - 1]) )
  1367. {
  1368. breakArray[iBreakIndex - 1] += (BYTE) iWordLen;
  1369. tagArray[iBreakIndex - 1] = TAGPOS_UNKNOWN;
  1370. }
  1371. else
  1372. {
  1373. breakArray[iBreakIndex] = (BYTE) iWordLen;
  1374. tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
  1375. iBreakIndex++;
  1376. }
  1377. }
  1378. return iBreakIndex;
  1379. }
  1380. }
  1381. if (fFoundMatch) // Longest Matching.
  1382. {
  1383. // If we only found one break, than say it the maximum.
  1384. if (1 == iNextBreakIndex)
  1385. {
  1386. breakArray[iBreakIndex] = nextBreakArray[0];
  1387. tagArray[iBreakIndex] = nextTagArray[0];
  1388. }
  1389. else
  1390. {
  // Several candidates: score each one, starting with the last recorded
  // candidate and working back to the first.
  1391. iSumWeight = 0;
  1392. iPrevWeight = 0;
  1393. iPrevProbability = 0;
  1394. iCurrentProbability = 0;
  1395. dwLastTag = TAGPOS_UNKNOWN;
  1396. tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
  1397. for (i = (iNextBreakIndex - 1); i >= 0 ; i--)
  1398. {
  // iWeight is GetWeight() of the text right after the candidate and
  // dwTagTemp receives the tag found there.
  // NOTE(review): iWeight is added to lengths here and in FindAltWord, so
  // it is presumably the length of the word found -- confirm in GetWeight.
  1399. if ( iBreakIndex == 0)
  1400. {
  1401. iWeight = GetWeight(pwchBeginWord + nextBreakArray[i], &dwTagTemp);
  1402. if (iWeight != 0)
  1403. // Bigram Probability
  1404. iCurrentProbability = (DWORD)BigramProbablity(nextTagArray[i], dwTagTemp);
  1405. }
  1406. else
  1407. {
  1408. iWeight = GetWeight(pwchBeginWord + nextBreakArray[i], &dwTagTemp);
  1409. if (iBreakIndex == 1)
  1410. // Get Trigram Probability.
  1411. iCurrentProbability = TrigramProbablity(tagArray[iBreakIndex - 1], nextTagArray[i], dwTagTemp);
  1412. else if (iBreakIndex >= 2)
  1413. {
  1414. // Get Trigram Probability.
  1415. iCurrentProbability = TrigramProbablity(tagArray[iBreakIndex - 2], tagArray[iBreakIndex - 1], nextTagArray[i]);
  1416. if (iWeight != 0)
  1417. iCurrentProbability += (DWORD)BigramProbablity(nextTagArray[i],dwTagTemp);
  1418. }
  1419. }
  1420. // Keep the candidate with the largest combined weight (candidate length
  1421. // plus following weight); on a tie, let the probability decide.
  1422. if ( (iWeight + nextBreakArray[i] > iSumWeight) ||
  1423. ( (iWeight + nextBreakArray[i] == iSumWeight) &&
  1424. ( (Maximum(iWeight,nextBreakArray[i]) <= iPrevWeight) || (iCurrentProbability > iPrevProbability) ) ))
  1425. {
  1426. if (iCurrentProbability >= iPrevProbability || iSumWeight < iWeight + nextBreakArray[i])
  1427. {
  1428. iSumWeight = Maximum(iWeight,1) + nextBreakArray[i];
  1429. iPrevWeight = Maximum(iWeight,nextBreakArray[i]);
  1430. breakArray[iBreakIndex] = nextBreakArray[i];
  1431. tagArray[iBreakIndex] = nextTagArray[i];
  1432. iPrevProbability = iCurrentProbability;
  1433. dwLastTag = dwTagTemp;
  1434. }
  1435. }
  1436. }
  1437. }
  1438. pwchBeginWord += breakArray[iBreakIndex]; // update begin word for next round.
  1439. iBreakIndex++;
  1440. }
  1441. else
  1442. {
  1443. // NOMATCH_FOUND
  // iWordLen is the number of characters the trie accepted before failing;
  // iNumCluster is the length of the cluster it rejected.
  1444. iWordLen = (unsigned int)(pwchIndex - pwchBeginWord);
  1445. if (iBreakIndex > 0)
  1446. {
  1447. i = iBreakIndex - 1; // set i to previous break
  1448. if (iWordLen == 0)
  1449. {
  // The very first cluster was rejected by the trie.
  1450. if (iNumCluster == 1 && *pwchBeginWord == L',' &&
  1451. IsThaiChar(*(pwchBeginWord-breakArray[i])) )
  1452. {
  1453. // We should not merge comma into the word, only merge comma to
  1454. // Number.
  1455. // TODO: Should add TAGPOS_PUNCT.
  1456. breakArray[iBreakIndex] = (BYTE) iNumCluster;
  1457. tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
  1458. pwchBeginWord += (BYTE) iNumCluster; // update begin word for next round.
  1459. iBreakIndex++;
  1460. }
  1461. else if (ShouldMerge(pwchBeginWord - breakArray[i], breakArray[i], iNumCluster, tagArray[i]))
  1462. {
  1463. // If word length is null use the cluster add to previous node.
  1464. breakArray[i] += (BYTE) iNumCluster;
  1465. tagArray[i] = TAGPOS_UNKNOWN;
  1466. pwchBeginWord += iNumCluster; // update begin word for next round.
  1467. }
  1468. else
  1469. {
  1470. // Add the unknown word to list.
  1471. breakArray[iBreakIndex] = (BYTE) iNumCluster;
  1472. tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
  1473. pwchBeginWord += (BYTE) iNumCluster; // update begin word for next round.
  1474. iBreakIndex++;
  1475. }
  1476. }
  1477. else
  1478. {
  1479. // Perhaps a misspelled word: try to use soundex to spell the word.
  1480. if (iWordLen == 1 && iNumCluster == 2 && pwchIndex[1] == L'.')
  1481. {
  1482. // The word is an abbreviated word.
  1483. // TODO: #1. Add TAGPOS_ABBRV.
  1484. // TODO: #2. May need to add rules code abbrivated word with 3 letters.
  1485. breakArray[iBreakIndex] = iWordLen + iNumCluster;
  1486. tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
  1487. pwchBeginWord += breakArray[iBreakIndex];
  1488. iBreakIndex++;
  1489. }
  1490. // Try soundex two word back.
  // Accepted only when the soundex match is longer than the two previous
  // segments together and GetWeight() finds something right after it.
  1491. else if ( (iBreakIndex >= 2) &&
  1492. ( (iSoundexWordLen = (BYTE) SoundexSearch(pwchBeginWord - breakArray[i] - breakArray[i - 1])) > (BYTE) (breakArray[i] + breakArray[i - 1]) ) &&
  1493. GetWeight(pwchBeginWord - breakArray[i] - breakArray[i - 1] + iSoundexWordLen) )
  1494. {
  1495. // Resize the word.
  1496. pwchBeginWord = (pwchBeginWord - breakArray[i] - breakArray[i - 1]) + iSoundexWordLen; // update begin word for next round.
  1497. breakArray[i - 1] = iSoundexWordLen;
  1498. tagArray[i - 1] = thaiTrieIter.dwTag;
  1499. iBreakIndex--; // Decrement iBreakIndex.
  1500. }
  1501. // Try soundex one words back.
  1502. else if (((iSoundexWordLen = (BYTE) SoundexSearch(pwchBeginWord - breakArray[i])) > (BYTE) breakArray[i]) &&
  1503. GetWeight(pwchBeginWord - breakArray[i] + iSoundexWordLen) )
  1504. {
  1505. // Resize the word
  1506. pwchBeginWord = (pwchBeginWord - breakArray[i]) + iSoundexWordLen; // update begin word for next round.
  1507. breakArray[i] = iSoundexWordLen;
  1508. tagArray[i] = thaiTrieIter.dwTag;
  1509. }
  1510. // Try soundex on this word.
  1511. else if (((iSoundexWordLen = (BYTE) SoundexSearch(pwchBeginWord)) > (BYTE) iWordLen) &&
  1512. GetWeight(pwchBeginWord + iSoundexWordLen) )
  1513. {
  1514. // Resize the word.
  1515. breakArray[iBreakIndex] = iSoundexWordLen;
  1516. tagArray[iBreakIndex] = thaiTrieIter.dwTag;
  1517. pwchBeginWord += iSoundexWordLen; // update begin word for next round.
  1518. iBreakIndex++;
  1519. }
  1520. else if ( ShouldMerge(pwchBeginWord - breakArray[i], breakArray[i], iWordLen , tagArray[i]) )
  1521. {
  1522. // Merge the words.
  1523. breakArray[i] += (BYTE) iWordLen;
  1524. tagArray[i] = TAGPOS_UNKNOWN;
  1525. pwchBeginWord += iWordLen; // update begin word for next round.
  1526. }
  1527. else
  1528. {
  1529. // Add the unknown word to list.
  1530. breakArray[iBreakIndex] = (BYTE) iWordLen;
  1531. tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
  1532. pwchBeginWord += iWordLen; // update begin word for next round.
  1533. iBreakIndex++;
  1534. }
  1535. }
  1536. }
  1537. else
  1538. {
  1539. // Add unknown word to list and mark it.
  1540. if (iWordLen == 0)
  1541. {
  1542. // No previous break exists: the rejected cluster becomes the first break.
  1543. breakArray[iBreakIndex] = (BYTE) iNumCluster;
  1544. tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
  1545. pwchBeginWord += iNumCluster; // update begin word for next round.
  1546. }
  1547. else
  1548. {
  1549. // When we are here there are 2 cases that can happen:
  1550. // 1. We take too little into our unknown.
  1551. // 2. We take too much into our unknown word.
  1552. // Have we taken too little? Check if this unknown word is an abbreviated word.
  1553. if (iWordLen == 1 && iNumCluster == 2 && pwchIndex[1] == L'.')
  1554. breakArray[iBreakIndex] = iWordLen + iNumCluster;
  1555. // Try to see if we are taking too much, see if we can get a Weight from last cluster.
  // Both operands are unsigned, so the "> 0" test is true whenever the two
  // values differ.
  1556. else if ( (iWordLen - iNumLastCluster > 0) && GetWeight(pwchIndex - iNumLastCluster) )
  1557. breakArray[iBreakIndex] = iWordLen - iNumLastCluster;
  1558. else
  1559. breakArray[iBreakIndex] = (BYTE) iWordLen;
  1560. tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
  1561. pwchBeginWord += breakArray[iBreakIndex]; // update begin word for next round.
  1562. }
  1563. iBreakIndex++;
  1564. }
  1565. }
  1566. }
  1567. return iBreakIndex;
  1568. }
  1569. //+---------------------------------------------------------------------------
  1570. //
  1571. // Class: CThaiBreakTree
  1572. //
  1573. // Synopsis:
  1574. //
  1575. // Arguments:
  1576. //
  1577. // Modifies:
  1578. //
  1579. // History: created 8/99 aarayas
  1580. //
  1581. // Notes:
  1582. //
  1583. //----------------------------------------------------------------------------
  1584. int CThaiBreakTree::Soundex(WCHAR* word)
  1585. {
  1586. return thaiTrieIter.Soundex(word);
  1587. }
  1588. //+---------------------------------------------------------------------------
  1589. //
  1590. // Function: GetCluster
  1591. //
  1592. // Synopsis: The function return the next number of character which represent
  1593. // a cluster of Thai text.
  1594. //
  1595. // ie. Kor Kai, Kor Kai -> 1
  1596. // Kor Kai, Sara Um -> 2
  1597. //
  1598. // * Note this function will not return no more than 3 character,
  1599. // for cluster as this would represent invalid sequence of character.
  1600. //
  1601. // Arguments:
  1602. //
  1603. // Modifies:
  1604. //
  1605. // History: created 7/99 aarayas
  1606. //
  1607. // Notes:
  1608. //
  1609. //----------------------------------------------------------------------------
  1610. unsigned int CThaiBreakTree::GetCluster(WCHAR* pszIndex)
  1611. {
  1612. bool fHasSaraE;
  1613. int iRetValue = 0;
  1614. bool fNeedEndingCluster = false;
  1615. if (pszIndex == pszEnd)
  1616. return 0;
  1617. while (true)
  1618. {
  1619. fHasSaraE= false;
  1620. // Take all begin cluster character.
  1621. while (IsThaiBeginClusterCharacter(*pszIndex))
  1622. {
  1623. if (*pszIndex == THAI_Vowel_Sara_E)
  1624. fHasSaraE = true;
  1625. pszIndex++;
  1626. iRetValue++;
  1627. }
  1628. if (IsThaiConsonant(*pszIndex))
  1629. {
  1630. pszIndex++;
  1631. iRetValue++;
  1632. while (IsThaiUpperAndLowerClusterCharacter(*pszIndex))
  1633. {
  1634. // Mai Han Akat is a special type of cluster that will need at lease
  1635. // one ending cluster.
  1636. if (*pszIndex == THAI_Vowel_Sign_Mai_HanAkat)
  1637. fNeedEndingCluster = true;
  1638. // In Thai it isn't possible to make a sound if we have the SaraE
  1639. // following by vowel below vowel.
  1640. else if ( fHasSaraE &&
  1641. ( (*pszIndex == THAI_Vowel_Sara_II) ||
  1642. (*pszIndex == THAI_Tone_MaiTaiKhu) ||
  1643. (*pszIndex == THAI_Vowel_Sara_I) ||
  1644. (*pszIndex == THAI_Sara_Uee) ))
  1645. fNeedEndingCluster = true;
  1646. pszIndex++;
  1647. iRetValue++;
  1648. }
  1649. while (IsThaiEndingClusterCharacter(*pszIndex))
  1650. {
  1651. pszIndex++;
  1652. iRetValue++;
  1653. fNeedEndingCluster = false;
  1654. }
  1655. // Include period as part of a cluster. Bug#57106
  1656. if (*pszIndex == 0x002e)
  1657. {
  1658. pszIndex++;
  1659. iRetValue++;
  1660. fNeedEndingCluster = false;
  1661. }
  1662. }
  1663. if (fNeedEndingCluster)
  1664. fNeedEndingCluster = false;
  1665. else
  1666. break;
  1667. }
  1668. if (iRetValue == 0)
  1669. iRetValue++; // The character is probably a punctuation.
  1670. if (pszIndex > pszEnd)
  1671. {
  1672. // We need to do this as we have gone over end buff boundary.
  1673. iRetValue -= (int) (pszIndex - pszEnd);
  1674. pszIndex = pszEnd;
  1675. }
  1676. return iRetValue;
  1677. }
  1678. //+---------------------------------------------------------------------------
  1679. //
  1680. // Class: CThaiBreakTree
  1681. //
  1682. // Synopsis:
  1683. //
  1684. // Arguments:
  1685. //
  1686. // wzWord - input string. (in)
  1687. // iWordLen - input string length. (in)
  1688. // Alt - find close alternate word (in)
  1689. // pBreakPos - array of break position allways 5 byte. (out)
  1690. //
  1691. // Modifies:
  1692. //
  1693. // History: created 3/00 aarayas
  1694. //
  1695. // Notes:
  1696. //
  1697. //----------------------------------------------------------------------------
  1698. int CThaiBreakTree::FindAltWord(WCHAR* pwchBegin,unsigned int iWordLen, BYTE Alt, BYTE* pBreakPos)
  1699. {
  1700. // Declare and initialize local variables.
  1701. unsigned int iNumCluster = 1;
  1702. WCHAR* pwchBeginWord = pwchBegin;
  1703. WCHAR* pwchIndex = pwchBegin;
  1704. bool fBeginNewWord = true;
  1705. unsigned int iBreakIndex = 0;
  1706. unsigned int iBreakTemp = 0;
  1707. unsigned int iBreakTemp1 = 0;
  1708. unsigned int iBreakTemp2 = 0;
  1709. pszEnd = pwchBegin + iWordLen;
  1710. // TODO: Need to clean this code up.
  1711. switch(Alt)
  1712. {
  1713. case 3:
  1714. while (true)
  1715. {
  1716. iNumCluster = GetCluster(pwchIndex);
  1717. if (!thaiTrieIter1.MoveCluster(pwchIndex, iNumCluster, fBeginNewWord))
  1718. return iBreakIndex;
  1719. fBeginNewWord = false;
  1720. pwchIndex += iNumCluster;
  1721. if (thaiTrieIter1.fWordEnd)
  1722. {
  1723. iBreakTemp = (unsigned int)(pwchIndex - pwchBeginWord);
  1724. iBreakTemp1 = GetWeight(pwchIndex);
  1725. iBreakTemp2 = GetWeight(pwchIndex+iBreakTemp1);
  1726. if (iBreakTemp + iBreakTemp1 + iBreakTemp2 == iWordLen)
  1727. {
  1728. pBreakPos[0] = (BYTE)iBreakTemp;
  1729. pBreakPos[1] = (BYTE)iBreakTemp1;
  1730. pBreakPos[2] = (BYTE)iBreakTemp2;
  1731. return 3;
  1732. }
  1733. }
  1734. if (pwchIndex >= pszEnd)
  1735. return iBreakIndex;
  1736. }
  1737. break;
  1738. case 2:
  1739. while (true)
  1740. {
  1741. iNumCluster = GetCluster(pwchIndex);
  1742. if (!thaiTrieIter1.MoveCluster(pwchIndex, iNumCluster, fBeginNewWord))
  1743. return iBreakIndex;
  1744. fBeginNewWord = false;
  1745. pwchIndex += iNumCluster;
  1746. if (thaiTrieIter1.fWordEnd)
  1747. {
  1748. iBreakTemp = (unsigned int)(pwchIndex - pwchBeginWord);
  1749. iBreakTemp1 = GetWeight(pwchIndex);
  1750. if (iBreakTemp + iBreakTemp1 == iWordLen)
  1751. {
  1752. pBreakPos[0] = (BYTE)iBreakTemp;
  1753. pBreakPos[1] = (BYTE)iBreakTemp1;
  1754. return 2;
  1755. }
  1756. }
  1757. if (pwchIndex >= pszEnd)
  1758. return iBreakIndex;
  1759. }
  1760. break;
  1761. default:
  1762. case 1:
  1763. while (iBreakIndex < Alt)
  1764. {
  1765. iNumCluster = GetCluster(pwchIndex);
  1766. if (!thaiTrieIter1.MoveCluster(pwchIndex, iNumCluster, fBeginNewWord))
  1767. return iBreakIndex;
  1768. fBeginNewWord = false;
  1769. pwchIndex += iNumCluster;
  1770. if (thaiTrieIter1.fWordEnd)
  1771. {
  1772. fBeginNewWord = true;
  1773. pBreakPos[iBreakIndex] = (BYTE)(pwchIndex - pwchBeginWord);
  1774. iBreakIndex++;
  1775. pwchBeginWord = pwchIndex;
  1776. }
  1777. if (pwchIndex >= pszEnd)
  1778. return iBreakIndex;
  1779. }
  1780. break;
  1781. }
  1782. return iBreakIndex;
  1783. }