Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

117 lines
3.4 KiB

  1. //+---------------------------------------------------------------------------
  2. //
  3. //
  4. // CThaiBreakTree - class CThaiBreakTree
  5. //
  6. // History:
  7. // created 7/99 aarayas
  8. //
  9. // ©1999 Microsoft Corporation
  10. //----------------------------------------------------------------------------
  11. #ifndef _CTHAIBREAKTREE_H_
  12. #define _CTHAIBREAKTREE_H_
  13. #include <windows.h>
  14. #include <assert.h>
  15. #include "CThaiTrieIter.hpp"
  16. #include "CThaiSentTrieIter.hpp"
  17. #include "CThaiTrigramTrieIter.hpp"
  18. #include "CBreakTree.hpp"
  19. #include "lextable.hpp"
  // Upper bound on the number of ThaiBreakNode entries.
  // NOTE(review): presumably the capacity of the breakTree array -- confirm
  // against the allocation in the .cpp file.
  #define MAXTHAIBREAKNODE 255000

  // Largest value representable in a 32-bit unsigned int. The U suffix is
  // required: an unsuffixed 4294967295 does not fit in int, so its type would
  // be a wider or differently-signed type depending on the compiler, which
  // leads to sign/width mismatch warnings when it is compared with or returned
  // as an unsigned int.
  #define MAXUNSIGNEDINT 4294967295U

  // Sentinel node index meaning "no node could be created".
  // NOTE(review): presumably the failure value returned by CreateNode() --
  // confirm against the .cpp file.
  #define UNABLETOCREATENODE MAXUNSIGNEDINT

  // Forward declaration; CThaiWordBreak is named as a friend of CThaiBreakTree.
  class CThaiWordBreak;
  24. class ThaiBreakNode
  25. {
  26. public:
  27. ThaiBreakNode() {};
  28. int iPos;
  29. BYTE iBreakLen;
  30. DWORD dwTAG;
  31. unsigned int NextBreak;
  32. unsigned int Down;
  33. };
  34. class CThaiBreakTree : public CBreakTree
  35. {
  36. friend class CThaiWordBreak;
  37. public:
  38. CThaiBreakTree();
  39. ~CThaiBreakTree();
  40. #if defined (NGRAM_ENABLE)
  41. void Init(CTrie* pTrie, CTrie* pSentTrie, CTrie* pTrigramTrie);
  42. #else
  43. void Init(CTrie* pTrie, CTrie* pTrigramTrie);
  44. #endif
  45. #if defined (NGRAM_ENABLE)
  46. inline void Reset();
  47. inline bool MoveNext();
  48. inline bool MoveDown();
  49. inline unsigned int CreateNode(int iPos, BYTE iBreakLen, DWORD dwPOS);
  50. bool GenerateTree(WCHAR* pszBegin, WCHAR* pszEnd);
  51. bool MaximalMatching();
  52. #endif
  53. int Soundex(WCHAR* word);
  54. unsigned int TrigramBreak(WCHAR* pwchBegin, WCHAR* pwchEnd);
  55. int FindAltWord(WCHAR* wzWord,unsigned int iWordLen, BYTE Alt, BYTE* pBreakPos);
  56. protected:
  57. #if defined (NGRAM_ENABLE)
  58. inline DWORD DeterminePurgeOrUnknown(unsigned int iCurrentNode, unsigned int iBreakLen);
  59. inline void DeterminePurgeEndingSentence(WCHAR* pszBeginWord, unsigned int iNode);
  60. #endif
  61. inline unsigned int Maximum(unsigned int x, unsigned y) { if (x > y) return x; else return y;}
  62. unsigned int GetWeight(WCHAR* pszBegin);
  63. unsigned int GetWeight(WCHAR* pszBegin, DWORD* pdwTag);
  64. float BigramProbablity(DWORD dwTag1,DWORD dwTag2);
  65. DWORD TrigramProbablity(DWORD dwTag1,DWORD dwTag2,DWORD dwTag3);
  66. unsigned int SoundexSearch(WCHAR* pszBegin);
  67. inline bool ShouldMerge(WCHAR* pwszPrevWord, unsigned int iPrevWordLen, unsigned int iMergeWordLen, DWORD dwPrevTag);
  68. bool Traverse(unsigned int iLevel, unsigned int iCurrentNode, unsigned int iNumUnknown);
  69. unsigned int GetCluster(WCHAR* pszIndex);
  70. void MaximalMatchingAddBreakToList(unsigned int iNumBreak);
  71. inline void AddBreakToList(unsigned int iNumBreak, unsigned int iNumUnknown);
  72. inline bool CompareSentenceStructure(unsigned int iNumBreak, unsigned int iNumUnknown);
  73. bool IsSentenceStruct(WCHAR* pos, unsigned int iPosLen);
  74. ThaiBreakNode* breakTree;
  75. CThaiTrieIter thaiTrieIter;
  76. CThaiTrieIter thaiTrieIter1;
  77. CThaiSentTrieIter thaiSentIter;
  78. CThaiTrigramTrieIter thaiTrigramIter;
  79. WCHAR* pszBegin;
  80. WCHAR* pszEnd;
  81. unsigned int iNodeIndex;
  82. unsigned int iNumNode;
  83. // Array of break and part-of-speech use for Traverse the Tree.
  84. BYTE* breakArray;
  85. DWORD* tagArray;
  86. WCHAR* POSArray;
  87. unsigned int iNumUnknownMaximalPOSArray;
  88. // Array of break for use with maximal matching array;
  89. unsigned int maxToken;
  90. unsigned int maxLevel;
  91. BYTE* maximalMatchingBreakArray;
  92. DWORD* maximalMatchingTAGArray;
  93. WCHAR* maximalMatchingPOSArray;
  94. // Array of break for use with trigram array.
  95. BYTE* trigramBreakArray;
  96. };
  97. #endif