// Leaked source code of windows server 2003
// You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
//
// 119 lines
// 3.6 KiB

//+---------------------------------------------------------------------------
//
//
// CThaiBreakTree - class CThaiBreakTree
//
// History:
// created 7/99 aarayas
//
// ©1999 Microsoft Corporation
//----------------------------------------------------------------------------
#ifndef _CTHAIBREAKTREE_H_
#define _CTHAIBREAKTREE_H_
#include <windows.h>
#include <assert.h>
#include "CThaiTrieIter.hpp"
#include "CThaiSentTrieIter.hpp"
#include "CThaiTrigramTrieIter.hpp"
#include "CBreakTree.hpp"
#include "lextable.hpp"
// NOTE(review): presumably the maximum number of ThaiBreakNode entries the
// break tree may hold -- confirm against the allocation in the .cpp.
#define MAXTHAIBREAKNODE 255000
// Largest value of a 32-bit unsigned int.  The U suffix gives the literal an
// unsigned type; without it the constant is a wider signed integer on
// compilers that follow the C++11 literal rules, which triggers
// signed/unsigned and narrowing diagnostics wherever it meets unsigned int.
#define MAXUNSIGNEDINT 4294967295U
// NOTE(review): presumably the value CreateNode returns when no node could be
// created -- confirm against the .cpp.
#define UNABLETOCREATENODE MAXUNSIGNEDINT
class CThaiWordBreak;
class ThaiBreakNode
{
public:
ThaiBreakNode() {};
int iPos;
BYTE iBreakLen;
DWORD dwTAG;
unsigned int NextBreak;
unsigned int Down;
};
// CThaiBreakTree - break tree for Thai text, derived from CBreakTree.
//
// The class keeps a ThaiBreakNode array plus several break / tag /
// part-of-speech buffers behind raw pointers, and iterators over the tries
// handed to Init().  CThaiWordBreak is a friend and may use the protected
// members directly.
//
// NOTE(review): several members are declared `inline` but are not defined in
// this class body; every translation unit that calls them must be able to see
// their definitions -- confirm where they are defined.
class CThaiBreakTree : public CBreakTree
{
    friend class CThaiWordBreak;

public:
    CThaiBreakTree();
    ~CThaiBreakTree();

    // Attach the tries used for breaking.  The sentence-structure trie is
    // only part of the signature when NGRAM_ENABLE is defined.
#if defined (NGRAM_ENABLE)
    void Init(CTrie* pTrie, CTrie* pSentTrie, CTrie* pTrigramTrie);
#else
    void Init(CTrie* pTrie, CTrie* pTrigramTrie);
#endif

    // Tree construction and navigation; only declared when NGRAM_ENABLE is
    // defined.
#if defined (NGRAM_ENABLE)
    inline void Reset();
    inline bool MoveNext();
    inline bool MoveDown();
    inline unsigned int CreateNode(int iPos, BYTE iBreakLen, DWORD dwPOS);
    bool GenerateTree(WCHAR* pszBegin, WCHAR* pszEnd);
    bool MaximalMatching();
#endif

    int Soundex(WCHAR* word);
    unsigned int TrigramBreak(WCHAR* pwchBegin, WCHAR* pwchEnd);
    int FindAltWord(WCHAR* wzWord, unsigned int iWordLen, BYTE Alt, BYTE* pBreakPos);

protected:
#if defined (NGRAM_ENABLE)
    inline DWORD DeterminePurgeOrUnknown(unsigned int iCurrentNode, unsigned int iBreakLen);
    inline void DeterminePurgeEndingSentence(WCHAR* pszBeginWord, unsigned int iNode);
#endif

    // Returns the larger of x and y (y when they are equal).
    inline unsigned int Maximum(unsigned int x, unsigned int y) { return (x > y) ? x : y; }

    unsigned int GetLongestSubstring(WCHAR* pszBegin, unsigned int iWordLen);
    unsigned int GetWeight(WCHAR* pszBegin);
    unsigned int GetWeight(WCHAR* pszBegin, DWORD* pdwTag);
    float BigramProbablity(DWORD dwTag1, DWORD dwTag2);
    DWORD TrigramProbablity(DWORD dwTag1, DWORD dwTag2, DWORD dwTag3);
    unsigned int SoundexSearch(WCHAR* pszBegin);
    inline bool ShouldMerge(const WCHAR* pwszPrevWord, unsigned int iPrevWordLen, unsigned int iMergeWordLen, DWORD dwPrevTag);
    bool Traverse(unsigned int iLevel, unsigned int iCurrentNode, unsigned int iNumUnknown);
    unsigned int GetCluster(const WCHAR* pszIndex);
    void MaximalMatchingAddBreakToList(unsigned int iNumBreak);
    inline void AddBreakToList(unsigned int iNumBreak, unsigned int iNumUnknown);
    inline bool CompareSentenceStructure(unsigned int iNumBreak, unsigned int iNumUnknown);
    bool IsSentenceStruct(const WCHAR* pos, unsigned int iPosLen);

    // Node array of the tree.
    ThaiBreakNode* breakTree;

    // Trie iterators.
    CThaiTrieIter thaiTrieIter;
    CThaiTrieIter thaiTrieIter1;
    CThaiSentTrieIter thaiSentIter;
    CThaiTrigramTrieIter thaiTrigramIter;

    // NOTE(review): presumably the bounds of the text currently being
    // broken -- confirm against the .cpp.
    WCHAR* pszBegin;
    WCHAR* pszEnd;

    unsigned int iNodeIndex;
    unsigned int iNumNode;

    // Break and part-of-speech arrays used while traversing the tree.
    BYTE* breakArray;
    DWORD* tagArray;
    WCHAR* POSArray;
    unsigned int iNumUnknownMaximalPOSArray;

    // Break arrays used by maximal matching.
    unsigned int maxToken;
    unsigned int maxLevel;
    BYTE* maximalMatchingBreakArray;
    DWORD* maximalMatchingTAGArray;
    WCHAR* maximalMatchingPOSArray;

    // Break array used by trigram breaking.
    BYTE* trigramBreakArray;

private:
    // Not copyable.  The class holds raw pointer buffers and has a
    // user-declared destructor; an implicit member-wise copy would leave two
    // objects sharing the same buffers.  Declared but intentionally never
    // defined, so an accidental copy fails at compile or link time.
    // NOTE(review): assumes the destructor releases the buffers -- confirm.
    CThaiBreakTree(const CThaiBreakTree&);
    CThaiBreakTree& operator=(const CThaiBreakTree&);
};
#endif