Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

118 lines
3.6 KiB

  1. //+---------------------------------------------------------------------------
  2. //
  3. //
  4. // CThaiBreakTree - class CThaiBreakTree
  5. //
  6. // History:
  7. // created 7/99 aarayas
  8. //
  9. // ©1999 Microsoft Corporation
  10. //----------------------------------------------------------------------------
  11. #ifndef _CTHAIBREAKTREE_H_
  12. #define _CTHAIBREAKTREE_H_
  13. #include <windows.h>
  14. #include <assert.h>
  15. #include "CThaiTrieIter.hpp"
  16. #include "CThaiSentTrieIter.hpp"
  17. #include "CThaiTrigramTrieIter.hpp"
  18. #include "CBreakTree.hpp"
  19. #include "lextable.hpp"
  20. #define MAXTHAIBREAKNODE 255000
  21. #define MAXUNSIGNEDINT 4294967295
  22. #define UNABLETOCREATENODE MAXUNSIGNEDINT
  23. class CThaiWordBreak;
  24. class ThaiBreakNode
  25. {
  26. public:
  27. ThaiBreakNode() {};
  28. int iPos;
  29. BYTE iBreakLen;
  30. DWORD dwTAG;
  31. unsigned int NextBreak;
  32. unsigned int Down;
  33. };
  34. class CThaiBreakTree : public CBreakTree
  35. {
  36. friend class CThaiWordBreak;
  37. public:
  38. CThaiBreakTree();
  39. ~CThaiBreakTree();
  40. #if defined (NGRAM_ENABLE)
  41. void Init(CTrie* pTrie, CTrie* pSentTrie, CTrie* pTrigramTrie);
  42. #else
  43. void Init(CTrie* pTrie, CTrie* pTrigramTrie);
  44. #endif
  45. #if defined (NGRAM_ENABLE)
  46. inline void Reset();
  47. inline bool MoveNext();
  48. inline bool MoveDown();
  49. inline unsigned int CreateNode(int iPos, BYTE iBreakLen, DWORD dwPOS);
  50. bool GenerateTree(WCHAR* pszBegin, WCHAR* pszEnd);
  51. bool MaximalMatching();
  52. #endif
  53. int Soundex(WCHAR* word);
  54. unsigned int TrigramBreak(WCHAR* pwchBegin, WCHAR* pwchEnd);
  55. int FindAltWord(WCHAR* wzWord,unsigned int iWordLen, BYTE Alt, BYTE* pBreakPos);
  56. protected:
  57. #if defined (NGRAM_ENABLE)
  58. inline DWORD DeterminePurgeOrUnknown(unsigned int iCurrentNode, unsigned int iBreakLen);
  59. inline void DeterminePurgeEndingSentence(WCHAR* pszBeginWord, unsigned int iNode);
  60. #endif
  61. inline unsigned int Maximum(unsigned int x, unsigned y) { if (x > y) return x; else return y;}
  62. unsigned int GetLongestSubstring(WCHAR* pszBegin, unsigned int iWordLen);
  63. unsigned int GetWeight(WCHAR* pszBegin);
  64. unsigned int GetWeight(WCHAR* pszBegin, DWORD* pdwTag);
  65. float BigramProbablity(DWORD dwTag1,DWORD dwTag2);
  66. DWORD TrigramProbablity(DWORD dwTag1,DWORD dwTag2,DWORD dwTag3);
  67. unsigned int SoundexSearch(WCHAR* pszBegin);
  68. inline bool ShouldMerge(const WCHAR* pwszPrevWord, unsigned int iPrevWordLen, unsigned int iMergeWordLen, DWORD dwPrevTag);
  69. bool Traverse(unsigned int iLevel, unsigned int iCurrentNode, unsigned int iNumUnknown);
  70. unsigned int GetCluster(const WCHAR* pszIndex);
  71. void MaximalMatchingAddBreakToList(unsigned int iNumBreak);
  72. inline void AddBreakToList(unsigned int iNumBreak, unsigned int iNumUnknown);
  73. inline bool CompareSentenceStructure(unsigned int iNumBreak, unsigned int iNumUnknown);
  74. bool IsSentenceStruct(const WCHAR* pos, unsigned int iPosLen);
  75. ThaiBreakNode* breakTree;
  76. CThaiTrieIter thaiTrieIter;
  77. CThaiTrieIter thaiTrieIter1;
  78. CThaiSentTrieIter thaiSentIter;
  79. CThaiTrigramTrieIter thaiTrigramIter;
  80. WCHAR* pszBegin;
  81. WCHAR* pszEnd;
  82. unsigned int iNodeIndex;
  83. unsigned int iNumNode;
  84. // Array of break and part-of-speech use for Traverse the Tree.
  85. BYTE* breakArray;
  86. DWORD* tagArray;
  87. WCHAR* POSArray;
  88. unsigned int iNumUnknownMaximalPOSArray;
  89. // Array of break for use with maximal matching array;
  90. unsigned int maxToken;
  91. unsigned int maxLevel;
  92. BYTE* maximalMatchingBreakArray;
  93. DWORD* maximalMatchingTAGArray;
  94. WCHAR* maximalMatchingPOSArray;
  95. // Array of break for use with trigram array.
  96. BYTE* trigramBreakArray;
  97. };
  98. #endif