You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
2169 lines
71 KiB
2169 lines
71 KiB
//+---------------------------------------------------------------------------
|
|
//
|
|
//
|
|
// CThaiBreakTree - class CThaiBreakTree
|
|
//
|
|
// History:
|
|
// created 7/99 aarayas
|
|
//
|
|
// ©1999 Microsoft Corporation
|
|
//----------------------------------------------------------------------------
|
|
#include "CThaiBreakTree.hpp"
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Function: ExtractPOS
|
|
//
|
|
// Synopsis: The function takes a tag and returns its Part Of Speech field.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 7/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
// Returns the part-of-speech field of a packed word tag: the tag is masked
// with iPosMask, shifted right by iPosShift and narrowed to a WCHAR.
inline WCHAR ExtractPOS(DWORD dwTag)
{
    return static_cast<WCHAR>((dwTag & iPosMask) >> iPosShift);
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Function: ExtractFrq
|
|
//
|
|
// Synopsis: The function takes a tag and returns the frequency field of the word.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 7/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
// Returns the frequency field of a packed word tag: bits 0x300 of the tag,
// shifted right by iFrqShift and narrowed to a BYTE.
inline BYTE ExtractFrq(DWORD dwTag)
{
    return static_cast<BYTE>((dwTag & 0x300) >> iFrqShift);
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Function: DetermineFrequencyWeight
|
|
//
|
|
// Synopsis: The function adjusts *uiWeight according to the frequency class of a word.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 7/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
// Adjusts *uiWeight in place according to the frequency class frq:
//   frqpenInfrequent -> -2, frqpenSomewhat -> -1, frqpenVery -> +2,
//   frqpenNormal and any other value -> +1.
// The arithmetic is unsigned, so a subtraction below zero wraps around.
inline void DetermineFrequencyWeight(BYTE frq, unsigned int* uiWeight)
{
    unsigned int& weight = *uiWeight;

    if (frq == frqpenInfrequent)
    {
        weight -= 2;
    }
    else if (frq == frqpenSomewhat)
    {
        weight -= 1;
    }
    else if (frq == frqpenVery)
    {
        weight += 2;
    }
    else
    {
        // frqpenNormal and every unrecognised value.
        weight += 1;
    }
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Function: DetermineFrequencyWeight
|
|
//
|
|
// Synopsis: The function adjusts *uiWeight (DWORD overload) according to the frequency class of a word.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 7/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
// DWORD overload of DetermineFrequencyWeight. Adjusts *uiWeight in place:
//   frqpenInfrequent -> -2, frqpenSomewhat -> -1, frqpenVery -> +2,
//   frqpenNormal and any other value -> +1.
// The arithmetic is unsigned, so a subtraction below zero wraps around.
inline void DetermineFrequencyWeight(BYTE frq, DWORD* uiWeight)
{
    DWORD& weight = *uiWeight;

    if (frq == frqpenInfrequent)
    {
        weight -= 2;
    }
    else if (frq == frqpenSomewhat)
    {
        weight -= 1;
    }
    else if (frq == frqpenVery)
    {
        weight += 2;
    }
    else
    {
        // frqpenNormal and every unrecognised value.
        weight += 1;
    }
}
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis: Constructor - initialize local variables
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 7/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
// Constructor. Zeroes the node counters, nulls every pointer member, then
// allocates the working buffers. The node pool (breakTree) is only allocated
// when NGRAM_ENABLE is defined; the maximalMatching* arrays stay NULL here.
CThaiBreakTree::CThaiBreakTree()
    : iNodeIndex(0),
      iNumNode(0),
      pszBegin(NULL),
      pszEnd(NULL),
      breakTree(NULL),
      breakArray(NULL),
      tagArray(NULL),
      maximalMatchingBreakArray(NULL),
      maximalMatchingTAGArray(NULL),
      POSArray(NULL),
      maximalMatchingPOSArray(NULL)
{
#ifdef NGRAM_ENABLE
    // Pool of MAXTHAIBREAKNODE tree nodes.
    breakTree  = new ThaiBreakNode[MAXTHAIBREAKNODE];
#endif

    // Per-break scratch arrays, MAXBREAK entries each.
    breakArray = new BYTE[MAXBREAK];
    tagArray   = new DWORD[MAXBREAK];
    POSArray   = new WCHAR[MAXBREAK];
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis: Destructor - clean up code
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 7/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
// Destructor. Releases every buffer owned by the object.
//
// All of these buffers are created with array new (new T[n]) in the
// constructor or in MaximalMatching(), so they must be released with
// delete[]; using scalar delete on them is undefined behaviour.
// delete[] on a NULL pointer is a no-op, so no explicit NULL checks are needed.
CThaiBreakTree::~CThaiBreakTree()
{
#if defined (NGRAM_ENABLE)
    delete[] breakTree;
    delete[] maximalMatchingBreakArray;
    delete[] maximalMatchingTAGArray;
    delete[] maximalMatchingPOSArray;
#endif

    delete[] breakArray;
    delete[] tagArray;
    delete[] POSArray;
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis: Associate the class to the string.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 7/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
#if defined (NGRAM_ENABLE)
|
|
void CThaiBreakTree::Init(CTrie* pTrie, CTrie* pSentTrie, CTrie* pTrigramTrie)
|
|
#else
|
|
void CThaiBreakTree::Init(CTrie* pTrie, CTrie* pTrigramTrie)
|
|
#endif
|
|
{
|
|
assert(pTrie != NULL);
|
|
thaiTrieIter.Init(pTrie);
|
|
thaiTrieIter1.Init(pTrie);
|
|
|
|
#if defined (NGRAM_ENABLE)
|
|
assert(pSentTrie != NULL);
|
|
thaiSentIter.Init(pSentTrie);
|
|
#endif
|
|
assert(pTrigramTrie != NULL);
|
|
thaiTrigramIter.Init(pTrigramTrie);
|
|
}
|
|
|
|
#if defined (NGRAM_ENABLE)
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis: reset iterator to top of the tree
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 7/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
inline void CThaiBreakTree::Reset()
|
|
{
|
|
iNodeIndex = 0;
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis: Move to the next break.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 7/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
inline bool CThaiBreakTree::MoveNext()
|
|
{
|
|
iNodeIndex = breakTree[iNodeIndex].NextBreak;
|
|
return (iNodeIndex != 0);
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis: Move down to next level.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 7/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
inline bool CThaiBreakTree::MoveDown()
|
|
{
|
|
iNodeIndex = breakTree[iNodeIndex].Down;
|
|
return (iNodeIndex != 0);
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis: create new node to position, and return index to the node.
|
|
//
|
|
// * return Unable to Create Node.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 7/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
inline unsigned int CThaiBreakTree::CreateNode(int iPos, BYTE iBreakLen, DWORD dwTAG)
|
|
{
|
|
assert(iNumNode < MAXTHAIBREAKNODE);
|
|
|
|
if (iNumNode >= MAXTHAIBREAKNODE)
|
|
{
|
|
return UNABLETOCREATENODE;
|
|
}
|
|
breakTree[iNumNode].iPos = iPos;
|
|
breakTree[iNumNode].iBreakLen = iBreakLen;
|
|
breakTree[iNumNode].dwTAG = dwTAG;
|
|
breakTree[iNumNode].NextBreak = 0;
|
|
breakTree[iNumNode].Down = 0;
|
|
|
|
iNumNode++;
|
|
return (iNumNode - 1);
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis: Generate a Tree of possible break from the given string.
|
|
//
|
|
// * Note - false if there aren't enough memory to create node.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 7/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
// Reason the trie walk inside GenerateTree stopped.
enum thai_parse_state
{
    END_SENTENCE    = 0,    // Reached the end of sentence.
    LONGEST_MATCH   = 1,    // Longest possible matched.
    NOMATCH_FOUND   = 2,    // Unable to find word.
    ERROR_OUTMEMORY = 3     // Out of Memory.
};
|
|
|
|
// GenerateTree - fills breakTree with the candidate word breaks for the text
// in [pszBegin, pszEnd1).
//
// Nodes are appended with CreateNode(). A node's NextBreak link leads to the
// first break found after that node's text; its Down link leads to another
// (longer) break that starts at the same position.
//
// The outer loop picks the next node to extend (iNodeAnalyze), walks the word
// trie cluster by cluster from the end of that node, and creates a node at
// every position where the trie reports fWordEnd. How the walk stopped
// (parseState) decides the post-processing: purging alternatives, merging
// unmatched text into the analysed node, or creating a TAGPOS_UNKNOWN node.
//
// Returns false only when CreateNode() reported UNABLETOCREATENODE
// (parseState == ERROR_OUTMEMORY); true otherwise.
bool CThaiBreakTree::GenerateTree(WCHAR* pszBegin, WCHAR* pszEnd1)
|
|
{
|
|
// Declare and initialize local variables.
|
|
unsigned int iIndexBreakTree = 0;
|
|
unsigned int iPrevIndexBreakTree = 0;
|
|
unsigned int iParentNode = 0;
|
|
WCHAR* pszBeginWord = pszBegin;
|
|
WCHAR* pszIndex = pszBegin;
|
|
unsigned int iNumCluster = 1;
|
|
unsigned int iNumLastCluster;
|
|
unsigned int iWordLen = 0;
|
|
unsigned int iNodeAnalyze = 0;
|
|
thai_parse_state parseState = END_SENTENCE;
|
|
bool fFoundMatch = false;
|
|
bool fAddToNodeAnalyze = false;
|
|
bool fDoneGenerateTree = false;
|
|
// Keep the end of the text in the member pszEnd; GetWeight() reads it.
pszEnd = pszEnd1;
|
|
|
|
#if defined (_DEBUG)
|
|
memset(breakTree,0,sizeof(ThaiBreakNode)*MAXTHAIBREAKNODE);
|
|
#endif
|
|
// Start from an empty tree: CreateNode() appends at index iNumNode.
iNodeIndex = 0;
|
|
iNumNode = 0;
|
|
|
|
while (true)
|
|
{
|
|
// Reset Iterator for generating break for new word.
|
|
fFoundMatch = false;
|
|
thaiTrieIter.Reset();
|
|
|
|
if (iIndexBreakTree != 0)
|
|
{
|
|
// Skip nodes that already end at pszEnd or are tagged TAGPOS_PURGE;
// when no node is left to extend, the tree is complete.
while (true)
|
|
{
|
|
// If this is not the first node than set pszBeginWord after the last break.
|
|
pszBeginWord = pszBegin + breakTree[iNodeAnalyze].iPos + breakTree[iNodeAnalyze].iBreakLen;
|
|
fAddToNodeAnalyze = true;
|
|
|
|
// Are we at the end of the sentence.
|
|
if ( (pszBeginWord == pszEnd) ||
|
|
(breakTree[iNodeAnalyze].dwTAG == TAGPOS_PURGE) )
|
|
{
|
|
iNodeAnalyze++; // Move to next node.
|
|
if (iNodeAnalyze >= iNumNode)
|
|
{
|
|
fDoneGenerateTree = true;
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
break;
|
|
}
|
|
}
|
|
pszIndex = pszBeginWord;
|
|
iParentNode = iNodeAnalyze;
|
|
|
|
if (fDoneGenerateTree)
|
|
break;
|
|
|
|
// Get next level of tree.
|
|
// Walk the trie one cluster at a time starting at pszBeginWord. The first
// node created in this walk is linked through NextBreak of the analysed node
// (or is node 0 for the very first word); later ones are chained through Down.
while (TRUE)
|
|
{
|
|
iNumLastCluster = iNumCluster;
|
|
iNumCluster = GetCluster(pszIndex);
|
|
if (thaiTrieIter.MoveCluster(pszIndex, iNumCluster))
|
|
{
|
|
pszIndex += iNumCluster;
|
|
if (thaiTrieIter.fWordEnd)
|
|
{
|
|
fFoundMatch = true;
|
|
// if first node add first node
|
|
if (iIndexBreakTree == 0)
|
|
{
|
|
CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, thaiTrieIter.dwTag);
|
|
iIndexBreakTree++;
|
|
}
|
|
else
|
|
{
|
|
if (fAddToNodeAnalyze)
|
|
{
|
|
fAddToNodeAnalyze = false;
|
|
breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, thaiTrieIter.dwTag);
|
|
|
|
// Determine if an error has occur.
|
|
if (breakTree[iNodeAnalyze].NextBreak == UNABLETOCREATENODE)
|
|
{
|
|
breakTree[iNodeAnalyze].NextBreak = 0;
|
|
parseState = ERROR_OUTMEMORY;
|
|
break;
|
|
}
|
|
|
|
iPrevIndexBreakTree = breakTree[iNodeAnalyze].NextBreak;
|
|
iNodeAnalyze++;
|
|
}
|
|
else
|
|
{
|
|
breakTree[iPrevIndexBreakTree].Down = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, thaiTrieIter.dwTag);
|
|
|
|
// Determine if an error has occur.
|
|
if (breakTree[iPrevIndexBreakTree].Down == UNABLETOCREATENODE)
|
|
{
|
|
breakTree[iPrevIndexBreakTree].Down = 0;
|
|
parseState = ERROR_OUTMEMORY;
|
|
break;
|
|
}
|
|
|
|
iPrevIndexBreakTree = iIndexBreakTree;
|
|
}
|
|
iIndexBreakTree++;
|
|
}
|
|
}
|
|
|
|
if (pszIndex >= pszEnd)
|
|
{
|
|
assert(pszIndex <= pszEnd); // assert should never come up - if it appear likely bug in GetCluster funciton.
|
|
parseState = END_SENTENCE;
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (fFoundMatch)
|
|
parseState = LONGEST_MATCH;
|
|
else
|
|
parseState = NOMATCH_FOUND;
|
|
break;
|
|
|
|
}
|
|
}
|
|
|
|
// Post-process according to why the trie walk above stopped.
if (parseState == LONGEST_MATCH)
|
|
{
|
|
// We found a matched.
|
|
assert(breakTree[iPrevIndexBreakTree].Down == 0); // at this point breakTree[iPreveIndexBreakTree].Down should equal null.(optimization note)
|
|
if (breakTree[iParentNode].NextBreak != iPrevIndexBreakTree)
|
|
{
|
|
assert(breakTree[iPrevIndexBreakTree].dwTAG != TAGPOS_UNKNOWN); // shouldn't assert because the end node should ever be unknown.
|
|
DeterminePurgeEndingSentence(pszBeginWord, breakTree[iParentNode].NextBreak);
|
|
}
|
|
}
|
|
else if (parseState == NOMATCH_FOUND)
|
|
{
|
|
// Should mark node as unknown.
|
|
if (fAddToNodeAnalyze)
|
|
{
|
|
fAddToNodeAnalyze = false;
|
|
iWordLen = pszIndex - pszBeginWord;
|
|
|
|
// Make sure we don't only have a cluster of text before making a node.
|
|
if (iWordLen == 0)
|
|
{
|
|
// If we have an UNKNOWN word of one character only current node mark it as unknown.
|
|
assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode
|
|
breakTree[iNodeAnalyze].iBreakLen += iNumCluster;
|
|
breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen);
|
|
}
|
|
else
|
|
{
|
|
if (breakTree[iNodeAnalyze].iBreakLen + iWordLen < 8)
|
|
// The reason we are using 8 is because from corpora analysis
|
|
// the average Thai word is about 7.732 characters.
|
|
// TODO: We should add orthographic analysis here to get a better on boundary
|
|
// of unknown word.
|
|
{
|
|
assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode
|
|
breakTree[iNodeAnalyze].iBreakLen += iWordLen;
|
|
breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen);
|
|
}
|
|
else
|
|
{
|
|
// If GetWeight() finds a word starting at the last cluster, leave that
// cluster out of the unknown node.
if (GetWeight(pszIndex - iNumLastCluster))
|
|
breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen - iNumLastCluster, TAGPOS_UNKNOWN);
|
|
else
|
|
breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen, TAGPOS_UNKNOWN);
|
|
|
|
// Determine if an error has occur.
|
|
if (breakTree[iNodeAnalyze].NextBreak == UNABLETOCREATENODE)
|
|
{
|
|
breakTree[iNodeAnalyze].NextBreak = 0;
|
|
parseState = ERROR_OUTMEMORY;
|
|
break;
|
|
}
|
|
iNodeAnalyze++;
|
|
iIndexBreakTree++;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
breakTree[iPrevIndexBreakTree].Down = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, TAGPOS_UNKNOWN);
|
|
|
|
// Determine if an error has occur.
|
|
if (breakTree[iPrevIndexBreakTree].Down == UNABLETOCREATENODE)
|
|
{
|
|
breakTree[iPrevIndexBreakTree].Down = 0;
|
|
parseState = ERROR_OUTMEMORY;
|
|
break;
|
|
}
|
|
iIndexBreakTree++;
|
|
}
|
|
}
|
|
else if (parseState == END_SENTENCE)
|
|
{
|
|
// If we find ourself at the end of a sentence and no match.
|
|
if (!fFoundMatch)
|
|
{
|
|
if (fAddToNodeAnalyze)
|
|
{
|
|
fAddToNodeAnalyze = false;
|
|
iWordLen = pszIndex - pszBeginWord;
|
|
|
|
// Make sure we don't only have a cluster of text before making a node.
|
|
if (iWordLen == 0)
|
|
{
|
|
// If we have an UNKNOWN word of one character only current node mark it as unknown.
|
|
assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode
|
|
breakTree[iNodeAnalyze].iBreakLen += iNumCluster;
|
|
breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen);
|
|
}
|
|
else
|
|
{
|
|
if (breakTree[iNodeAnalyze].iBreakLen + iWordLen < 8)
|
|
// The reason we are using 8 is because from corpora analysis
|
|
// the average Thai word is about 7.732 characters.
|
|
// TODO: We should add orthographic analysis here to get a better on boundary
|
|
// of unknown word.
|
|
{
|
|
assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode
|
|
breakTree[iNodeAnalyze].iBreakLen += iWordLen;
|
|
breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen);
|
|
}
|
|
else
|
|
{
|
|
if (GetWeight(pszIndex - iNumLastCluster))
|
|
breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen - iNumLastCluster, TAGPOS_UNKNOWN);
|
|
else
|
|
breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen, TAGPOS_UNKNOWN);
|
|
|
|
// Determine if an error has occur.
|
|
if (breakTree[iNodeAnalyze].NextBreak == UNABLETOCREATENODE)
|
|
{
|
|
breakTree[iNodeAnalyze].NextBreak = 0;
|
|
parseState = ERROR_OUTMEMORY;
|
|
break;
|
|
}
|
|
iNodeAnalyze++;
|
|
iIndexBreakTree++;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
breakTree[iPrevIndexBreakTree].Down = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, TAGPOS_UNKNOWN);
|
|
|
|
// Determine if an error has occur.
|
|
if (breakTree[iPrevIndexBreakTree].Down == UNABLETOCREATENODE)
|
|
{
|
|
breakTree[iPrevIndexBreakTree].Down = 0;
|
|
parseState = ERROR_OUTMEMORY;
|
|
break;
|
|
}
|
|
}
|
|
// NOTE(review): unlike the NOMATCH_FOUND branch, this increment also runs
// after the node-creating path above has already incremented
// iIndexBreakTree - confirm that this is intended.
iIndexBreakTree++;
|
|
}
|
|
// If the beginning of node the branch isn't equal to leaf node perhaps it is possible to
|
|
// do some ending optimization.
|
|
else if (breakTree[iParentNode].NextBreak != iPrevIndexBreakTree)
|
|
{
|
|
assert(breakTree[iPrevIndexBreakTree].dwTAG != TAGPOS_UNKNOWN); // shouldn't assert because the end node should ever be unknown.
|
|
DeterminePurgeEndingSentence(pszBeginWord, breakTree[iParentNode].NextBreak);
|
|
}
|
|
}
|
|
// Reached when the trie walk was aborted with ERROR_OUTMEMORY.
else if ( (breakTree[iNodeAnalyze].iBreakLen == 0) || (parseState == ERROR_OUTMEMORY) )
|
|
break;
|
|
}
|
|
|
|
// Only a failed CreateNode() makes the whole operation fail.
return (parseState != ERROR_OUTMEMORY);
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis: Traverse all the tree and look for the least number of token.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 7/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
bool CThaiBreakTree::MaximalMatching()
|
|
{
|
|
// If maximal matching break array has not been allocate, than allocate it.
|
|
if (!maximalMatchingBreakArray)
|
|
maximalMatchingBreakArray = new BYTE[MAXBREAK];
|
|
if (!maximalMatchingTAGArray)
|
|
maximalMatchingTAGArray = new DWORD[MAXBREAK];
|
|
if (!maximalMatchingPOSArray)
|
|
maximalMatchingPOSArray = new WCHAR[MAXBREAK];
|
|
|
|
maxLevel = MAXUNSIGNEDINT;
|
|
maxToken = 0;
|
|
iNumUnknownMaximalPOSArray = MAXBREAK;
|
|
Traverse(0,0,0);
|
|
|
|
return true;
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis: The function determine if the node if the node should,
|
|
// be tag as unknown or purge.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 8/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
// Decides whether a node whose text did not match the dictionary should be
// tagged TAGPOS_PURGE or TAGPOS_UNKNOWN.
//   iCurrentNode - index of the node being tagged.
//   iBreakLen    - break length of that node.
// The Down chain of iCurrentNode is scanned. The node is purged when an
// alternative on that chain
//   - has exactly the same break length, or
//   - is shorter and is neither TAGPOS_UNKNOWN nor TAGPOS_PURGE.
// Otherwise TAGPOS_UNKNOWN is returned.
//
// The original tag test read "(tag != UNKNOWN) || (tag != PURGE)", which is
// true for every tag, so any shorter alternative caused a purge regardless of
// its tag. The test now uses && and is grouped explicitly.
inline DWORD CThaiBreakTree::DeterminePurgeOrUnknown(unsigned int iCurrentNode, unsigned int iBreakLen)
{
    for (unsigned int iNode = breakTree[iCurrentNode].Down;
         iNode != 0;
         iNode = breakTree[iNode].Down)
    {
        const bool fSameLength = (breakTree[iNode].iBreakLen == iBreakLen);

        const bool fShorterUsableBreak =
            (breakTree[iNode].iBreakLen < iBreakLen) &&
            (breakTree[iNode].dwTAG != TAGPOS_UNKNOWN) &&
            (breakTree[iNode].dwTAG != TAGPOS_PURGE);

        if (fSameLength || fShorterUsableBreak)
        {
            // Since we are purging this break just make sure the NextBreak is Null.
            assert(breakTree[iCurrentNode].NextBreak == 0);
            return TAGPOS_PURGE;
        }
    }

    return TAGPOS_UNKNOWN;
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis: Ending optimization - if we have found the end of a sentence,
|
|
// and possible break. Purge the branch for unnecessary break.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 8/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
// Ending optimisation. Walks the Down chain starting at iNode, stopping at
// the node whose Down link is 0 (that last node is never examined). A visited
// node is tagged TAGPOS_PURGE when GetWeight() finds no word starting right
// after its break.
//   pszBeginWord - start of the text the nodes on this chain break.
//   iNode        - first node of the chain.
inline void CThaiBreakTree::DeterminePurgeEndingSentence(WCHAR* pszBeginWord, unsigned int iNode)
{
    for ( ; breakTree[iNode].Down != 0; iNode = breakTree[iNode].Down)
    {
        // Keep the break if the text after it can still start a word.
        // TODO: We may need to change this once the GetWeight add soundex
        //       functionality.
        if (GetWeight(pszBeginWord + breakTree[iNode].iBreakLen) != 0)
            continue;

        // Since we are purging this break just make sure the NextBreak is Null.
        assert(breakTree[iNode].NextBreak == 0);
        breakTree[iNode].dwTAG = TAGPOS_PURGE;
    }
}
|
|
#endif
|
|
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis:
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 8/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
// Walks the word trie from pszBegin, cluster by cluster, and returns the
// length of the longest dictionary word that starts there.
//   pszBegin - start of the text to examine (the member pszEnd marks its end).
//   iWordLen - if the longest word has exactly this length and a shorter word
//              was also seen, the length of that shorter word is returned.
// Special cases: returns 1000 when pszBegin == pszEnd, and 0 when exactly one
// character is left.
unsigned int CThaiBreakTree::GetLongestSubstring(WCHAR* pszBegin, unsigned int iWordLen)
{
    // Short circuits for an empty or one-character remainder.
    if (pszEnd == pszBegin)
        return 1000;
    if ((pszEnd - pszBegin) == 1)
        return 0;

    unsigned int uLongest     = 0;      // longest word length seen so far
    unsigned int uPrevLongest = 0;      // word length seen just before that
    bool         fFirstStep   = true;   // tells MoveCluster to start a new word
    WCHAR*       pszCursor    = pszBegin;

    for (;;)
    {
        const unsigned int cchCluster = GetCluster(pszCursor);

        if (!thaiTrieIter.MoveCluster(pszCursor, cchCluster, fFirstStep))
            break;

        fFirstStep = false;
        pszCursor += cchCluster;

        if (thaiTrieIter.fWordEnd)
        {
            uPrevLongest = uLongest;
            uLongest     = (unsigned int) (pszCursor - pszBegin);
        }
    }

    // Fall back to the previous word when the longest one is exactly iWordLen.
    if ((uLongest == iWordLen) && (uPrevLongest < uLongest) && (uPrevLongest > 0))
        uLongest = uPrevLongest;

    return uLongest;
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis:
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 8/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
// Walks the word trie from pszBegin, cluster by cluster, and returns the
// length of the longest dictionary word that starts there (0 if none).
// The member pszEnd marks the end of the text.
// Special cases: returns 1000 when pszBegin == pszEnd, and 0 when exactly one
// character is left.
unsigned int CThaiBreakTree::GetWeight(WCHAR* pszBegin)
{
    // Short circuits for an empty or one-character remainder.
    if (pszEnd == pszBegin)
        return 1000;
    if ((pszEnd - pszBegin) == 1)
        return 0;

    unsigned int uLongest   = 0;        // longest word length seen so far
    bool         fFirstStep = true;     // tells MoveCluster to start a new word
    WCHAR*       pszCursor  = pszBegin;

    for (;;)
    {
        const unsigned int cchCluster = GetCluster(pszCursor);

        if (!thaiTrieIter.MoveCluster(pszCursor, cchCluster, fFirstStep))
            break;

        fFirstStep = false;
        pszCursor += cchCluster;

        if (thaiTrieIter.fWordEnd)
            uLongest = (unsigned int) (pszCursor - pszBegin);
    }

    return uLongest;
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis:
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 8/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
// Same walk as GetWeight(WCHAR*), but also reports the tag of the longest
// word found.
//   pszBegin - start of the text to examine (the member pszEnd marks its end).
//   pdwTag   - receives thaiTrieIter.dwTag each time a word end is reached;
//              left untouched when no word is found or a short circuit fires.
// Returns the length of the longest word (0 if none), 1000 when
// pszBegin == pszEnd, and 0 when exactly one character is left.
unsigned int CThaiBreakTree::GetWeight(WCHAR* pszBegin, DWORD* pdwTag)
{
    // Short circuits for an empty or one-character remainder.
    if (pszEnd == pszBegin)
        return 1000;
    if ((pszEnd - pszBegin) == 1)
        return 0;

    unsigned int uLongest   = 0;        // longest word length seen so far
    bool         fFirstStep = true;     // tells MoveCluster to start a new word
    WCHAR*       pszCursor  = pszBegin;

    for (;;)
    {
        const unsigned int cchCluster = GetCluster(pszCursor);

        if (!thaiTrieIter.MoveCluster(pszCursor, cchCluster, fFirstStep))
            break;

        fFirstStep = false;
        pszCursor += cchCluster;

        if (thaiTrieIter.fWordEnd)
        {
            uLongest = (unsigned int) (pszCursor - pszBegin);
            *pdwTag  = thaiTrieIter.dwTag;
        }
    }

    return uLongest;
}
|
|
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis: Traverse the tree.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 7/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
bool CThaiBreakTree::Traverse(unsigned int iLevel, unsigned int iCurrentNode, unsigned int iNumUnknown)
|
|
{
|
|
assert (iLevel < MAXBREAK);
|
|
// Process node.
|
|
breakArray[iLevel] = breakTree[iCurrentNode].iBreakLen;
|
|
tagArray[iLevel] = breakTree[iCurrentNode].dwTAG;
|
|
if (tagArray[iLevel] == TAGPOS_UNKNOWN)
|
|
iNumUnknown++;
|
|
|
|
// Have we found the end of the sentence.
|
|
if (breakTree[iCurrentNode].NextBreak == 0)
|
|
{
|
|
if (breakTree[iCurrentNode].dwTAG != TAGPOS_PURGE)
|
|
AddBreakToList(iLevel + 1, iNumUnknown);
|
|
if (breakTree[iCurrentNode].Down != 0)
|
|
{
|
|
if (tagArray[iLevel] == TAGPOS_UNKNOWN)
|
|
iNumUnknown--;
|
|
return Traverse(iLevel,breakTree[iCurrentNode].Down, iNumUnknown);
|
|
}
|
|
else
|
|
return true;
|
|
}
|
|
else
|
|
Traverse(iLevel + 1, breakTree[iCurrentNode].NextBreak, iNumUnknown);
|
|
|
|
if (breakTree[iCurrentNode].Down != 0)
|
|
{
|
|
if (tagArray[iLevel] == TAGPOS_UNKNOWN)
|
|
iNumUnknown--;
|
|
|
|
Traverse(iLevel,breakTree[iCurrentNode].Down, iNumUnknown);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis:
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 8/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
// Sound-alike dictionary lookup. Walks thaiTrieIter1 from pszBegin, cluster
// by cluster, using MoveSoundexByCluster(), and accumulates a penalty for
// every inexact step:
//   SUBSTITUTE_DIACRITIC      +1
//   SUBSTITUTE_SOUNDLIKECHAR  +2
//   UNABLE_TO_MOVE            +2
//   STOP_MOVE                 +1000
//   NOSUBSTITUTE / other       0
// The walk stops as soon as the accumulated penalty exceeds 2.
// Returns the length of the longest word end reached, or 0 when fewer than
// two characters remain before the member pszEnd.
unsigned int CThaiBreakTree::SoundexSearch(WCHAR* pszBegin)
{
    // Short circuit: less than two characters left.
    if ( (pszBegin+1) >= pszEnd )
        return 0;

    unsigned int iLongestWord = 0;
    unsigned int iPenalty     = 0;
    WCHAR*       pszCursor    = pszBegin;

    thaiTrieIter1.Reset();

    for (;;)
    {
        const unsigned int cchCluster = GetCluster(pszCursor);

        // Length of the following cluster; 0 when this one reaches the end.
        const unsigned int cchNextCluster =
            (pszCursor + cchCluster >= pszEnd) ? 0 : GetCluster(pszCursor + cchCluster);

        switch (thaiTrieIter1.MoveSoundexByCluster(pszCursor, cchCluster, cchNextCluster))
        {
        case SUBSTITUTE_DIACRITIC:
            iPenalty += 1;
            break;

        case SUBSTITUTE_SOUNDLIKECHAR:
        case UNABLE_TO_MOVE:
            iPenalty += 2;
            break;

        case STOP_MOVE:
            iPenalty += 1000;
            break;

        case NOSUBSTITUTE:
        default:
            break;
        }

        // Too many substitutions - give up.
        if (iPenalty > 2)
            break;

        pszCursor += cchCluster;
        if (thaiTrieIter1.fWordEnd)
            iLongestWord = (unsigned int) (pszCursor - pszBegin);
    }

    return iLongestWord;
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis: The information used here is a reference to the orthographic
|
|
// analysis work done on the Thai languages. (see paper: Natural
|
|
// Language Processing in Thailand 1993 Chulalongkorn. p 361).
|
|
//
|
|
// Arguments: pszBoundaryChar - Contain pointer to at least two thai character
|
|
// character next to each other which we will
|
|
// use to calculate wheather we should or
|
|
// should not merge the two word.
|
|
//
|
|
// iPrevWordLen -
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 8/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
inline bool CThaiBreakTree::ShouldMerge(const WCHAR* pwszPrevWord, unsigned int iPrevWordLen, unsigned int iMergeWordLen, DWORD dwPrevTag)
|
|
{
|
|
const WCHAR* pwszBoundary = pwszPrevWord + iPrevWordLen - 1;
|
|
|
|
assert(iMergeWordLen != 0);
|
|
assert(iPrevWordLen != 0);
|
|
|
|
// There are very few words in Thai that are 4 character or less, therefore we should
|
|
// found a pair that less than 4 character we should merge.
|
|
// Or if merge word length is one than also merge.
|
|
// Of if last cluster of the word is a Thanthakhat(Karan) we should always merge.
|
|
if (iPrevWordLen + iMergeWordLen <= 4 || iMergeWordLen == 1 ||
|
|
(iMergeWordLen == 2 && *(pwszBoundary + iMergeWordLen) == THAI_Thanthakhat))
|
|
return true;
|
|
|
|
if (iPrevWordLen >=2)
|
|
{
|
|
const WCHAR* pwszPrevCharBoundary = pwszBoundary - 1;
|
|
|
|
// TO IMPROVE: It better to check the last character of Previous word, it can give us a
|
|
// much better guess
|
|
if ((*pwszPrevCharBoundary == THAI_Vowel_Sign_Mai_HanAkat || *pwszBoundary == THAI_Vowel_Sign_Mai_HanAkat) ||
|
|
(*pwszPrevCharBoundary == THAI_Tone_Mai_Tri || *pwszBoundary == THAI_Tone_Mai_Tri) ||
|
|
(*pwszPrevCharBoundary == THAI_Sara_Ue || *pwszBoundary == THAI_Sara_Ue) )
|
|
return true;
|
|
}
|
|
|
|
// If the first character of the next word is mostly likly the beginning
|
|
// character and last character of the previous word is not sara-A than
|
|
// we have a high probability that we found a begin of word boundary,
|
|
// therefore we shouldn't merge.
|
|
if ( (IsThaiMostlyBeginCharacter(pwszBoundary[1]) && *pwszBoundary != THAI_Vowel_Sara_A) )
|
|
return false;
|
|
|
|
// If the last character of the previous word is mostly likely an ending
|
|
// character than, than there is a high probability that the found a boundary.
|
|
// There are very few words in Thai that are 4 character or less, therefore we should
|
|
// found a pair that less than 4 character we should merge.
|
|
if (IsThaiMostlyLastCharacter(*pwszBoundary))
|
|
return false;
|
|
|
|
// O10.192931 Adding Diacritic check rules. We might want to expand this to more diacritic
|
|
// for now Mai HanAkart would do. It is highly unlikely that a word contain more than 1 of Mai HanAkart diacritic.
|
|
if (IsContain(pwszPrevWord,iPrevWordLen,THAI_Vowel_Sign_Mai_HanAkat) && IsContain(pwszBoundary + 1,iMergeWordLen,THAI_Vowel_Sign_Mai_HanAkat))
|
|
return false;
|
|
|
|
if (iMergeWordLen == 3 && GetCluster(pwszBoundary + 1) == iMergeWordLen)
|
|
{
|
|
if (*(pwszBoundary + 2) == THAI_Vowel_Sara_I)
|
|
{
|
|
if (*(pwszBoundary+3) == THAI_Tone_Mai_Ek || *(pwszBoundary+3) == THAI_Tone_Mai_Tro)
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// if previous tag is equal to Title Noun than the next word is highly likly to be a name.
|
|
if (ExtractPOS(dwPrevTag) == 6)
|
|
return false;
|
|
|
|
// O11.134455. For the case of trailling punctuation.
|
|
if (dwPrevTag == TAGPOS_PUNC && iMergeWordLen > 1 && iPrevWordLen > 1)
|
|
return false;
|
|
|
|
// The reason we are using 8 is because from corpora analysis
|
|
// the average Thai word is about 7.732 characters. Or, if previous word is already
|
|
// an unknown, to keep the amount of unknown low the unknown to previous words.
|
|
if ( (iPrevWordLen + iMergeWordLen < 8) || (dwPrevTag == TAGPOS_UNKNOWN) )
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis:
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 7/99 aarayas
|
|
// 8/17/99 optimize some code.
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
inline void CThaiBreakTree::AddBreakToList(unsigned int iNumBreak, unsigned int iNumUnknown)
|
|
{
|
|
#if defined (_DEBUG)
|
|
breakArray[iNumBreak] = 0;
|
|
#endif
|
|
if (CompareSentenceStructure(iNumBreak, iNumUnknown))
|
|
{
|
|
maxToken = maxLevel = iNumBreak; // This is ugly but it save 5 clock cycle.
|
|
memcpy(maximalMatchingBreakArray,breakArray,maxToken);
|
|
memcpy(maximalMatchingTAGArray,tagArray,sizeof(DWORD)*maxToken);
|
|
maximalMatchingBreakArray[maxToken] = 0;
|
|
maximalMatchingTAGArray[maxToken] = 0;
|
|
}
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis: The function compares sentence structure of
|
|
// maximalMatchingPOSArray with posArray.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 7/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
inline bool CThaiBreakTree::CompareSentenceStructure(unsigned int iNumBreak, unsigned int iNumUnknownPOSArray)
|
|
{
|
|
if ( (iNumBreak < maxLevel) && (iNumUnknownMaximalPOSArray >= iNumUnknownPOSArray) )
|
|
{
|
|
iNumUnknownMaximalPOSArray = iNumUnknownPOSArray;
|
|
return true;
|
|
}
|
|
else if (iNumBreak == maxLevel)
|
|
{
|
|
// true - maximal matching has a larger unknown.
|
|
if (iNumUnknownMaximalPOSArray > iNumUnknownPOSArray)
|
|
{
|
|
iNumUnknownMaximalPOSArray = iNumUnknownPOSArray;
|
|
return true;
|
|
}
|
|
|
|
for(unsigned int i = 0; i <= iNumBreak; i++)
|
|
{
|
|
maximalMatchingPOSArray[i] = ExtractPOS(maximalMatchingTAGArray[i]);
|
|
POSArray[i] = ExtractPOS(tagArray[i]);
|
|
}
|
|
|
|
// Determine if the sentence structure is like any one of the sentence
|
|
// sentence structure in our corpora.
|
|
if ( (IsSentenceStruct(POSArray, iNumBreak)) &&
|
|
(!IsSentenceStruct(maximalMatchingPOSArray, iNumBreak)) )
|
|
{
|
|
iNumUnknownMaximalPOSArray = iNumUnknownPOSArray;
|
|
return true;
|
|
}
|
|
else if (iNumUnknownMaximalPOSArray == iNumUnknownPOSArray)
|
|
{
|
|
// Determine the frequency of word used in the sentence.
|
|
unsigned int iFrequencyArray = 500;
|
|
unsigned int iFrequencyMaximalArray = 500;
|
|
|
|
for(unsigned int i = 0; i <= iNumBreak; i++)
|
|
{
|
|
DetermineFrequencyWeight(ExtractFrq(maximalMatchingTAGArray[i]),&iFrequencyMaximalArray);
|
|
DetermineFrequencyWeight(ExtractFrq(tagArray[i]),&iFrequencyArray);
|
|
}
|
|
return (iFrequencyArray > iFrequencyMaximalArray);
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis:
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 8/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
bool CThaiBreakTree::IsSentenceStruct(const WCHAR* pos, unsigned int iPosLen)
|
|
{
|
|
// Declare and initialize all local variables.
|
|
unsigned int i = 0;
|
|
|
|
thaiSentIter.Reset();
|
|
|
|
if (!thaiSentIter.Down())
|
|
return FALSE;
|
|
|
|
while (TRUE)
|
|
{
|
|
thaiSentIter.GetNode();
|
|
if (thaiSentIter.pos == pos[i])
|
|
{
|
|
i++;
|
|
if (thaiSentIter.fWordEnd && i == iPosLen)
|
|
{
|
|
return TRUE;
|
|
}
|
|
else if (i == iPosLen) break;
|
|
// Move down the Trie Branch.
|
|
else if (!thaiSentIter.Down()) break;
|
|
}
|
|
// Move right of the Trie Branch
|
|
else if (!thaiSentIter.Right()) break;
|
|
}
|
|
return FALSE;
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis:
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 8/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
float CThaiBreakTree::BigramProbablity(DWORD dwTag1,DWORD dwTag2)
|
|
{
|
|
unsigned int iWeight = 4;
|
|
|
|
// TODO : Use the distribution of word category to determine optimial search - exmaple
|
|
// NOUN VERB ADVERB CLASSIFIER CONJETURE PREP et....
|
|
// TODO : Once we got trigram use it to create bigram probability as well.
|
|
if ( (dwTag1 != TAGPOS_UNKNOWN) &&
|
|
(dwTag2 != TAGPOS_UNKNOWN) )
|
|
{
|
|
WCHAR pos1 = ExtractPOS(dwTag1);
|
|
WCHAR pos2 = ExtractPOS(dwTag2);
|
|
|
|
// case NCMN VATT
|
|
/// a common noun is often followed by attributive verb(adjective)
|
|
// Example: (In Thai) book good, people nice
|
|
if (pos1 == 5 && pos2 == 13)
|
|
iWeight += 10;
|
|
// case NTTL NPRP
|
|
// a title noun is often followed by proper noun
|
|
// Example: Dr. Athapan, Mr. Sam
|
|
else if (pos1 == 6 && pos2 == 1)
|
|
iWeight += 5;
|
|
// case JSBR (XVAM || VSTA)
|
|
// a subordinating conjunction is often followed by preverb auxillary or Active verb
|
|
// Example: (In Thai) Because of , Because see
|
|
else if (pos1 == 39 && (pos2 == 15 || pos2 == 12))
|
|
iWeight += 10;
|
|
// case ADVN NCMN
|
|
// a Adverb normal form is often followed by Common noun (Bug 55057).
|
|
// Example: (In Thai) under table.
|
|
else if (pos1 == 28 && pos2 == 5)
|
|
iWeight += 5;
|
|
// case VACT XVAE
|
|
else if (pos1 == 11 && pos2 == 18)
|
|
iWeight += 5;
|
|
// case VACT DDBQ
|
|
// Active verb follow by Definite determiner.
|
|
// Example: (In Thai) working for, singing again.
|
|
else if (pos1 == 11 && pos2 == 21)
|
|
iWeight += 10;
|
|
// case VATT VACT
|
|
// adjective are followed by verb.
|
|
// Example: (In Thai keyboard)sivd;jk
|
|
else if (pos1 == 13 && pos2 == 11)
|
|
iWeight += 2;
|
|
// case XVAE VACT
|
|
// a post verb auxilliary are often followed by an active verb.
|
|
// Example: (In Thai) come singing, go work.
|
|
else if (pos1 == 18 && pos2 == 11)
|
|
iWeight += 10;
|
|
// case CLTV NCMN
|
|
// a Collective classfier are often followed by Common Noun
|
|
// Example: (In Thai) group people, flock bird
|
|
else if (pos1 == 33 && pos2 == 5)
|
|
iWeight += 5;
|
|
// case NEG (VACT || VSTA || VATT || XVAM || XVAE)
|
|
// a negator (ie. not) is often followed by some kind of VERB.
|
|
// Example: He is not going.
|
|
else if (pos1 == 46 && (pos2 == 11 || pos2 == 12 || pos2 == 13 || pos2 == 15 || pos2 == 16))
|
|
iWeight += 8;
|
|
// case EAFF or EITT
|
|
// Ending for affirmative, and interrogative are more often ending of the pair
|
|
// Example: (In Thai) Krub, Ka,
|
|
else if (pos2 == 44 || pos2 == 45)
|
|
iWeight += 3;
|
|
// case VATT and VATT
|
|
// Attributive Verb and Attributive Verb occur when often in spoken laguages.
|
|
// Example: she is reall really cute.
|
|
else if (pos1 == 13 && pos2 == 13)
|
|
iWeight += 2;
|
|
// case NCMN and DDAC
|
|
// Common Noun and Definitive determiner classifier.
|
|
// Example: Food here (Thai)
|
|
else if (pos1 == 5 && pos2 == 20)
|
|
iWeight += 3;
|
|
// case CMTR and JCMP
|
|
// Measurement classifier and Comparative conjunction, are likly to appear in Thai.
|
|
// Example: year about (Thai) -> English about a year.
|
|
else if (pos1 == 34 && pos2 == 38)
|
|
iWeight += 5;
|
|
// case XVBB and VACT
|
|
else if (pos1 == 17 && pos2 == 11)
|
|
iWeight += 5;
|
|
// case NCMN and NCMN
|
|
// Common Noun and Common Noun
|
|
// Example: electric bulb(in thai)
|
|
else if (pos1 == 5 && pos2 == 5)
|
|
iWeight += 1;
|
|
}
|
|
|
|
DetermineFrequencyWeight(ExtractFrq(dwTag1), &iWeight);
|
|
DetermineFrequencyWeight(ExtractFrq(dwTag2), &iWeight);
|
|
return (float) iWeight;
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis:
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 8/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
DWORD CThaiBreakTree::TrigramProbablity(DWORD dwTag1,DWORD dwTag2,DWORD dwTag3)
|
|
{
|
|
DWORD iWeight = 6;
|
|
|
|
if ( (dwTag1 != TAGPOS_UNKNOWN) &&
|
|
(dwTag2 != TAGPOS_UNKNOWN) &&
|
|
(dwTag3 != TAGPOS_UNKNOWN) )
|
|
{
|
|
WCHAR pos1 = ExtractPOS(dwTag1);
|
|
WCHAR pos2 = ExtractPOS(dwTag2);
|
|
WCHAR pos3 = ExtractPOS(dwTag3);
|
|
|
|
// optimization we if any POS is none than trigram shouldn't therefor no need to search.
|
|
if ( pos1 != 0 && pos2 != 0 && pos3 != 0)
|
|
{
|
|
WCHAR posArray[4];
|
|
posArray[0] = pos1;
|
|
posArray[1] = pos2;
|
|
posArray[2] = pos3;
|
|
posArray[3] = 0;
|
|
iWeight += thaiTrigramIter.GetProb(posArray);
|
|
}
|
|
}
|
|
DetermineFrequencyWeight(ExtractFrq(dwTag1), &iWeight);
|
|
DetermineFrequencyWeight(ExtractFrq(dwTag2), &iWeight);
|
|
DetermineFrequencyWeight(ExtractFrq(dwTag3), &iWeight);
|
|
|
|
// We reached zero probablity.
|
|
return (DWORD)iWeight;
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis:
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 8/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
unsigned int CThaiBreakTree::TrigramBreak(WCHAR* pwchBegin, WCHAR* pwchEnd1)
|
|
{
|
|
// Declare and initialize local variables.
|
|
WCHAR* pwchBeginWord = pwchBegin;
|
|
WCHAR* pwchIndex = pwchBegin;
|
|
unsigned int iWordLen;
|
|
unsigned int iNumCluster = 1;
|
|
unsigned int iNumLastCluster;
|
|
unsigned int iBreakIndex = 0;
|
|
BYTE nextBreakArray[MAXBREAK];
|
|
DWORD nextTagArray[MAXBREAK];
|
|
unsigned int iNextBreakIndex; // index for array nextBreakArray and nextTagArray.
|
|
bool fFoundMatch;
|
|
unsigned int iWeight;
|
|
unsigned int iSumWeight;
|
|
unsigned int iPrevWeight;
|
|
unsigned int iCurrWeight;
|
|
BYTE iSoundexWordLen;
|
|
DWORD iPrevProbability;
|
|
DWORD iCurrentProbability;
|
|
DWORD dwTagTemp;
|
|
DWORD dwLastTag;
|
|
int i; // temporary int for use as need.
|
|
bool fBeginNewWord;
|
|
bool fEndWord = false;
|
|
|
|
pszEnd = pwchEnd1;
|
|
breakArray[0] = 0;
|
|
POSArray[0] = 0;
|
|
tagArray[0] = 0;
|
|
nextBreakArray[0] = 0;
|
|
nextTagArray[0] = 0;
|
|
|
|
while (true)
|
|
{
|
|
// Reset Iterator for generating break for new word.
|
|
fFoundMatch = false;
|
|
fBeginNewWord = true;
|
|
|
|
|
|
// Get begin word string for next round of word break.
|
|
pwchIndex = pwchBeginWord;
|
|
iNextBreakIndex = 0;
|
|
|
|
if (pwchIndex == pszEnd)
|
|
break;
|
|
|
|
while(true)
|
|
{
|
|
iNumLastCluster = iNumCluster;
|
|
iNumCluster = GetCluster(pwchIndex);
|
|
if (!thaiTrieIter.MoveCluster(pwchIndex, iNumCluster, fBeginNewWord))
|
|
{
|
|
if ((iNumCluster == 0) && (pwchIndex == pszEnd))
|
|
fEndWord = true;
|
|
else
|
|
break;
|
|
}
|
|
|
|
fBeginNewWord = false;
|
|
pwchIndex += iNumCluster;
|
|
if (thaiTrieIter.fWordEnd)
|
|
{
|
|
if (thaiTrieIter.m_fThaiNumber)
|
|
{
|
|
// If we have Thai number accumulate it as one break.
|
|
assert(iNumCluster == 1);
|
|
fFoundMatch = true;
|
|
nextBreakArray[0]= (BYTE)(pwchIndex - pwchBeginWord);
|
|
nextTagArray[0] = TAGPOS_NCNM;
|
|
iNextBreakIndex = 1;
|
|
}
|
|
else
|
|
{
|
|
fFoundMatch = true;
|
|
nextBreakArray[iNextBreakIndex] = (BYTE)(pwchIndex - pwchBeginWord);
|
|
nextTagArray[iNextBreakIndex] = thaiTrieIter.dwTag;
|
|
iNextBreakIndex++;
|
|
}
|
|
if (pwchIndex >= pszEnd)
|
|
{
|
|
assert(pwchIndex <= pszEnd); // assert should never come up - if it appear likely bug in GetCluster funciton.
|
|
assert(iNextBreakIndex != 0);
|
|
|
|
if ( iNumCluster == 1 &&
|
|
*(pwchIndex - 1) == L'.' &&
|
|
iBreakIndex > 0 &&
|
|
iNextBreakIndex == 1 &&
|
|
tagArray[iBreakIndex - 1] == TAGPOS_ABBR )
|
|
{
|
|
// backtrack one if we have abbrivation case.
|
|
// ex. B.K.K. (in Thai). (more info O11.145042.)
|
|
breakArray[iBreakIndex - 1] += nextBreakArray[iNextBreakIndex - 1];
|
|
return iBreakIndex;
|
|
}
|
|
|
|
breakArray[iBreakIndex] = nextBreakArray[iNextBreakIndex - 1];
|
|
tagArray[iBreakIndex] = nextTagArray[iNextBreakIndex - 1];
|
|
return (++iBreakIndex);
|
|
}
|
|
}
|
|
else if ((pwchIndex >= pszEnd && iNextBreakIndex == 0) || fEndWord)
|
|
{
|
|
assert(pwchIndex <= pszEnd); // assert should never come up - if it appear likely bug in GetCluster funciton.
|
|
iWordLen = (unsigned int) (pwchIndex - pwchBeginWord);
|
|
switch (iWordLen)
|
|
{
|
|
case 0:
|
|
if (iBreakIndex > 0)
|
|
{
|
|
// if We have a length of one character add it to previous node.
|
|
breakArray[iBreakIndex - 1] += (BYTE) iNumCluster;
|
|
tagArray[iBreakIndex - 1] = TAGPOS_UNKNOWN;
|
|
}
|
|
else
|
|
{
|
|
// if this is the first break create a new break.
|
|
breakArray[iBreakIndex] = (BYTE) iNumCluster;
|
|
tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
|
|
iBreakIndex++;
|
|
}
|
|
break;
|
|
case 1:
|
|
if (iBreakIndex > 0)
|
|
{
|
|
// if We have a length of one character add it to previous node.
|
|
breakArray[iBreakIndex - 1] += (BYTE) iWordLen;
|
|
tagArray[iBreakIndex - 1] = TAGPOS_UNKNOWN;
|
|
}
|
|
else
|
|
{
|
|
// if this is the first break create a new break.
|
|
breakArray[iBreakIndex] = (BYTE) iWordLen;
|
|
tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
|
|
iBreakIndex++;
|
|
}
|
|
break;
|
|
default:
|
|
if ( iBreakIndex > 0 &&
|
|
ShouldMerge(pwchBeginWord - breakArray[iBreakIndex - 1], breakArray[iBreakIndex - 1],
|
|
iWordLen , tagArray[iBreakIndex - 1]) )
|
|
{
|
|
breakArray[iBreakIndex - 1] += (BYTE) iWordLen;
|
|
tagArray[iBreakIndex - 1] = TAGPOS_UNKNOWN;
|
|
}
|
|
else
|
|
{
|
|
breakArray[iBreakIndex] = (BYTE) iWordLen;
|
|
tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
|
|
iBreakIndex++;
|
|
}
|
|
}
|
|
return iBreakIndex;
|
|
}
|
|
else if (pwchIndex >= pszEnd)
|
|
{
|
|
// O10.229346. If we get here we are at the end of word or end of sentence,
|
|
// We will need to decide what to depending on if we found the word or not.
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (fFoundMatch) // Longest Matching.
|
|
{
|
|
// If we only found one break, than say it the maximum.
|
|
if (1 == iNextBreakIndex)
|
|
{
|
|
if ( nextBreakArray[0] == 2 &&
|
|
iNumCluster + iNumLastCluster == 2 &&
|
|
iBreakIndex > 0 &&
|
|
*(pwchBeginWord+1) == L'.' &&
|
|
tagArray[iBreakIndex - 1] == TAGPOS_ABBR )
|
|
{
|
|
// backtrack one if we have abbrivation case.
|
|
// ex. B.K.K. (in Thai). (more info O11.145042.)
|
|
breakArray[iBreakIndex - 1] += nextBreakArray[0];
|
|
pwchBeginWord += nextBreakArray[0];
|
|
}
|
|
else if ( iBreakIndex > 0 &&
|
|
IsThaiEndingSign(*pwchBeginWord) &&
|
|
iNumCluster == 1 )
|
|
{
|
|
breakArray[iBreakIndex - 1] += nextBreakArray[0];
|
|
pwchBeginWord += nextBreakArray[0];
|
|
|
|
}
|
|
else
|
|
{
|
|
breakArray[iBreakIndex] = nextBreakArray[0];
|
|
tagArray[iBreakIndex] = nextTagArray[0];
|
|
pwchBeginWord += breakArray[iBreakIndex]; // update begin word for next round.
|
|
iBreakIndex++;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
bool fWeightCompare = false;
|
|
|
|
iSumWeight = 0;
|
|
iPrevWeight = 0;
|
|
iCurrWeight = 0;
|
|
iPrevProbability = 0;
|
|
iCurrentProbability = 0;
|
|
dwLastTag = TAGPOS_UNKNOWN;
|
|
tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
|
|
|
|
for (i = (iNextBreakIndex - 1); i >= 0 ; i--)
|
|
{
|
|
if ( iBreakIndex == 0)
|
|
{
|
|
iWeight = GetWeight(pwchBeginWord + nextBreakArray[i], &dwTagTemp);
|
|
|
|
if (iWeight != 0)
|
|
// Bigram Probability
|
|
iCurrentProbability = (DWORD)BigramProbablity(nextTagArray[i], dwTagTemp);
|
|
}
|
|
else
|
|
{
|
|
iWeight = GetWeight(pwchBeginWord + nextBreakArray[i], &dwTagTemp);
|
|
|
|
if (iBreakIndex == 1)
|
|
// Get Trigram Probability.
|
|
iCurrentProbability = TrigramProbablity(tagArray[iBreakIndex - 1], nextTagArray[i], dwTagTemp);
|
|
else if (iBreakIndex >= 2)
|
|
{
|
|
// Get Trigram Probability.
|
|
iCurrentProbability = TrigramProbablity(tagArray[iBreakIndex - 2], tagArray[iBreakIndex - 1], nextTagArray[i]);
|
|
if (iWeight != 0)
|
|
iCurrentProbability += (DWORD)BigramProbablity(nextTagArray[i],dwTagTemp);
|
|
}
|
|
}
|
|
|
|
fWeightCompare = false;
|
|
|
|
iCurrWeight = iWeight + nextBreakArray[i];
|
|
|
|
if (iPrevProbability == 0 && (iCurrWeight+1) == iSumWeight && iCurrentProbability > 5)
|
|
{
|
|
fWeightCompare = true;
|
|
}
|
|
else if (iCurrWeight == iSumWeight && ( Maximum(iWeight,nextBreakArray[i]) <= iPrevWeight ||
|
|
iCurrentProbability > iPrevProbability))
|
|
{
|
|
fWeightCompare = true;
|
|
}
|
|
else if ( iWeight >= iPrevWeight - 1 &&
|
|
iPrevProbability > 0 && iPrevProbability < 10 &&
|
|
iCurrentProbability > iPrevProbability * 5000 )
|
|
{
|
|
// O11.187913. We'll trust our trigram data more if the current probability is
|
|
// so much greater than previous probability.
|
|
//
|
|
// * Note: we could probably use one of GA algorithm to get better value than 5K.
|
|
fWeightCompare = true;
|
|
}
|
|
|
|
// Store the string the best maximum weight, if the pair is equal
|
|
// store the string with maxim
|
|
if ( iCurrWeight > iSumWeight ||
|
|
fWeightCompare)
|
|
// ( (iCurrWeight == iSumWeight) &&
|
|
// ( (Maximum(iWeight,nextBreakArray[i]) <= iPrevWeight) || (iCurrentProbability > iPrevProbability) ) ))
|
|
{
|
|
if (iCurrentProbability >= iPrevProbability || iSumWeight < iCurrWeight)
|
|
{
|
|
iSumWeight = Maximum(iWeight,1) + nextBreakArray[i];
|
|
iPrevWeight = Maximum(iWeight,nextBreakArray[i]);
|
|
breakArray[iBreakIndex] = nextBreakArray[i];
|
|
tagArray[iBreakIndex] = nextTagArray[i];
|
|
iPrevProbability = iCurrentProbability;
|
|
dwLastTag = dwTagTemp;
|
|
}
|
|
}
|
|
}
|
|
pwchBeginWord += breakArray[iBreakIndex]; // update begin word for next round.
|
|
iBreakIndex++;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// NOMATCH_FOUND
|
|
iWordLen = (unsigned int)(pwchIndex - pwchBeginWord);
|
|
if (iBreakIndex > 0)
|
|
{
|
|
i = iBreakIndex - 1; // set i to previous break
|
|
if (iWordLen == 0)
|
|
{
|
|
if (iNumCluster == 1 && *pwchBeginWord == L',' &&
|
|
IsThaiChar(*(pwchBeginWord-breakArray[i])) )
|
|
{
|
|
// We should not merge comma into the word, only merge comma to
|
|
// Number.
|
|
// TODO: Should add TAGPOS_PUNCT.
|
|
breakArray[iBreakIndex] = (BYTE) iNumCluster;
|
|
tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
|
|
pwchBeginWord += (BYTE) iNumCluster; // update begin word for next round.
|
|
iBreakIndex++;
|
|
}
|
|
else if (iNumCluster > 1 && *pwchBeginWord == L'.')
|
|
{
|
|
// O11.134455. This is an ellipse case we shouldn't merge this string.
|
|
breakArray[iBreakIndex] = (BYTE) iNumCluster;
|
|
tagArray[iBreakIndex] = TAGPOS_PUNC;
|
|
pwchBeginWord += (BYTE) iNumCluster; // update begin word for next round.
|
|
iBreakIndex++;
|
|
}
|
|
else if (ShouldMerge(pwchBeginWord - breakArray[i], breakArray[i], iNumCluster, tagArray[i]))
|
|
{
|
|
// If word length is null use the cluster add to previous node.
|
|
breakArray[i] += (BYTE) iNumCluster;
|
|
tagArray[i] = TAGPOS_UNKNOWN;
|
|
pwchBeginWord += iNumCluster; // update begin word for next round.
|
|
}
|
|
else
|
|
{
|
|
// Add the unknown word to list.
|
|
breakArray[iBreakIndex] = (BYTE) iNumCluster;
|
|
tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
|
|
pwchBeginWord += (BYTE) iNumCluster; // update begin word for next round.
|
|
iBreakIndex++;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// Try checking for abbrivations.
|
|
if (iWordLen == 1 && iNumCluster == 2 && pwchIndex[1] == L'.')
|
|
{
|
|
// The word is an abbrivated words.
|
|
// TODO: #1. Add TAGPOS_ABBRV.
|
|
// TODO: #2. May need to add rules code abbrivated word with 3 letters.
|
|
breakArray[iBreakIndex] = iWordLen + iNumCluster;
|
|
tagArray[iBreakIndex] = TAGPOS_ABBR;
|
|
pwchBeginWord += breakArray[iBreakIndex];
|
|
iBreakIndex++;
|
|
}
|
|
else if (iWordLen == 1 &&
|
|
tagArray[i] == TAGPOS_ABBR &&
|
|
*(pwchBeginWord+1) == L'.' &&
|
|
IsThaiConsonant(*pwchBeginWord) &&
|
|
pwchBeginWord+1 < pszEnd )
|
|
{
|
|
// O11.145042. This is the case where we are a <abbrivated><consonant><period>, the
|
|
// likely hood is the character is also an abbrivation.
|
|
breakArray[iBreakIndex - 1] += iWordLen + 1;
|
|
pwchBeginWord += iWordLen + 1;
|
|
}
|
|
// Abbreviation are usally 3 characters.
|
|
else if ( iWordLen == 2 &&
|
|
IsThaiConsonant(*(pwchBeginWord+2)) &&
|
|
*(pwchBeginWord+3) == L'.' &&
|
|
tagArray[i] != TAGPOS_UNKNOWN )
|
|
{
|
|
// O11.80619. This is the case where we are a <known word><abbrivated>
|
|
breakArray[iBreakIndex] = iWordLen + 1;
|
|
tagArray[iBreakIndex] = TAGPOS_ABBR;
|
|
pwchBeginWord += breakArray[iBreakIndex];
|
|
iBreakIndex++;
|
|
}
|
|
// Perhase Misspelled word try use sounding to spell the words.
|
|
// Try soundex two word back.
|
|
else if ( (iBreakIndex >= 2) &&
|
|
( (iSoundexWordLen = (BYTE) SoundexSearch(pwchBeginWord - breakArray[i] - breakArray[i - 1])) > (BYTE) (breakArray[i] + breakArray[i - 1]) ) &&
|
|
GetWeight(pwchBeginWord - breakArray[i] - breakArray[i - 1] + iSoundexWordLen) )
|
|
{
|
|
// Resize the word.
|
|
pwchBeginWord = (pwchBeginWord - breakArray[i] - breakArray[i - 1]) + iSoundexWordLen; // update begin word for next round.
|
|
breakArray[i - 1] = iSoundexWordLen;
|
|
tagArray[i - 1] = thaiTrieIter.dwTag;
|
|
iBreakIndex--; // Decrement iBreakIndex.
|
|
}
|
|
// Try soundex one words back.
|
|
else if (((iSoundexWordLen = (BYTE) SoundexSearch(pwchBeginWord - breakArray[i])) > (BYTE) breakArray[i]) &&
|
|
GetWeight(pwchBeginWord - breakArray[i] + iSoundexWordLen) &&
|
|
ExtractPOS(tagArray[i]) != 6) // Make sure that previous word is not a NTTL.
|
|
{
|
|
// Resize the word
|
|
pwchBeginWord = (pwchBeginWord - breakArray[i]) + iSoundexWordLen; // update begin word for next round.
|
|
breakArray[i] = iSoundexWordLen;
|
|
tagArray[i] = thaiTrieIter.dwTag;
|
|
}
|
|
// Try soundex on this word.
|
|
else if (((iSoundexWordLen = (BYTE) SoundexSearch(pwchBeginWord)) > (BYTE) iWordLen) &&
|
|
GetWeight(pwchBeginWord + iSoundexWordLen) )
|
|
{
|
|
// Resize the word.
|
|
breakArray[iBreakIndex] = iSoundexWordLen;
|
|
tagArray[iBreakIndex] = thaiTrieIter.dwTag;
|
|
pwchBeginWord += iSoundexWordLen; // update begin word for next round.
|
|
iBreakIndex++;
|
|
}
|
|
else if ( ShouldMerge(pwchBeginWord - breakArray[i], breakArray[i], iWordLen , tagArray[i]) )
|
|
{
|
|
// Merge the words.
|
|
breakArray[i] += (BYTE) iWordLen;
|
|
tagArray[i] = TAGPOS_UNKNOWN;
|
|
pwchBeginWord += iWordLen; // update begin word for next round.
|
|
}
|
|
else
|
|
{
|
|
// Add the unknown word to list.
|
|
breakArray[iBreakIndex] = (BYTE) iWordLen;
|
|
tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
|
|
pwchBeginWord += iWordLen; // update begin word for next round.
|
|
iBreakIndex++;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// Add unknown word to list and mark it.
|
|
if (iWordLen == 0)
|
|
{
|
|
// If word length is null use the cluster add to previous node.
|
|
breakArray[iBreakIndex] = (BYTE) iNumCluster;
|
|
tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
|
|
pwchBeginWord += iNumCluster; // update begin word for next round.
|
|
}
|
|
else
|
|
{
|
|
// We we are here there are 2 case that can happen:
|
|
// 1. We take too little into our unknown.
|
|
// 2. We take too much into our unknown word.
|
|
|
|
// Have we taken too little check if this unknown word is an abbrivated words.
|
|
if (iWordLen == 1 && iNumCluster == 2 && pwchIndex[1] == L'.')
|
|
breakArray[iBreakIndex] = iWordLen + iNumCluster;
|
|
// Try to see if we are taking to much, see if we can get a Weight from last cluster.
|
|
else if ( (iWordLen - iNumLastCluster > 0) && GetWeight(pwchIndex - iNumLastCluster) )
|
|
{
|
|
breakArray[iBreakIndex] = iWordLen - iNumLastCluster;
|
|
if (breakArray[iBreakIndex] == 1)
|
|
{
|
|
iWeight = GetWeight(pwchIndex - iNumLastCluster);
|
|
if (iWeight > iNumLastCluster && iWeight < 40)
|
|
breakArray[iBreakIndex] += (BYTE) iWeight;
|
|
else
|
|
breakArray[iBreakIndex] += (BYTE) iNumLastCluster;
|
|
|
|
}
|
|
}
|
|
// We may have a case of iWordLen is 1 and iNumCluster, we have a case of misspelled
|
|
// an extra character is incorrectly inserted over a correct word.
|
|
else if (iWordLen == 1)
|
|
{
|
|
iWeight = GetWeight(pwchIndex - iWordLen);
|
|
if (iWeight > iNumCluster && iWeight < 40)
|
|
breakArray[iBreakIndex] = iWordLen + iWeight;
|
|
else
|
|
breakArray[iBreakIndex] = iWordLen + iNumCluster;
|
|
}
|
|
else
|
|
breakArray[iBreakIndex] = (BYTE) iWordLen;
|
|
if (iNumLastCluster + iNumCluster == iWordLen && *(pwchBeginWord+iNumLastCluster) == L'.')
|
|
{
|
|
tagArray[iBreakIndex] = TAGPOS_ABBR;
|
|
}
|
|
else
|
|
tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
|
|
|
|
pwchBeginWord += breakArray[iBreakIndex]; // update begin word for next round.
|
|
}
|
|
iBreakIndex++;
|
|
}
|
|
}
|
|
}
|
|
return iBreakIndex;
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis:
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 8/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
int CThaiBreakTree::Soundex(WCHAR* word)
|
|
{
|
|
return thaiTrieIter.Soundex(word);
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Function: GetCluster
|
|
//
|
|
// Synopsis: The function return the next number of character which represent
|
|
// a cluster of Thai text.
|
|
//
|
|
// ie. Kor Kai, Kor Kai -> 1
|
|
// Kor Kai, Sara Um -> 2
|
|
//
|
|
// * Note this function will not return no more than 3 character,
|
|
// for cluster as this would represent invalid sequence of character.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 7/99 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
unsigned int CThaiBreakTree::GetCluster(const WCHAR* pszIndex)
|
|
{
|
|
bool fHasSaraE;
|
|
int iRetValue = 0;
|
|
bool fNeedEndingCluster = false;
|
|
|
|
if (pszIndex == pszEnd)
|
|
return 0;
|
|
|
|
while (true)
|
|
{
|
|
fHasSaraE= false;
|
|
|
|
// Take all begin cluster character.
|
|
while (IsThaiBeginClusterCharacter(*pszIndex))
|
|
{
|
|
if (*pszIndex == THAI_Vowel_Sara_E)
|
|
fHasSaraE = true;
|
|
pszIndex++;
|
|
iRetValue++;
|
|
|
|
}
|
|
|
|
if (IsThaiConsonant(*pszIndex))
|
|
{
|
|
pszIndex++;
|
|
iRetValue++;
|
|
|
|
while (IsThaiUpperAndLowerClusterCharacter(*pszIndex))
|
|
{
|
|
// Mai Han Akat is a special type of cluster that will need at lease
|
|
// one ending cluster.
|
|
if (*pszIndex == THAI_Vowel_Sign_Mai_HanAkat)
|
|
fNeedEndingCluster = true;
|
|
|
|
// In Thai it isn't possible to make a sound if we have the SaraE
|
|
// following by vowel below vowel.
|
|
else if ( fHasSaraE &&
|
|
( (*pszIndex == THAI_Vowel_Sara_II) ||
|
|
(*pszIndex == THAI_Tone_MaiTaiKhu) ||
|
|
(*pszIndex == THAI_Vowel_Sara_I) ||
|
|
(*pszIndex == THAI_Sara_Uee) ))
|
|
fNeedEndingCluster = true;
|
|
pszIndex++;
|
|
iRetValue++;
|
|
}
|
|
|
|
while (IsThaiEndingClusterCharacter(*pszIndex))
|
|
{
|
|
pszIndex++;
|
|
iRetValue++;
|
|
fNeedEndingCluster = false;
|
|
}
|
|
/*
|
|
// Include period as part of a cluster. Bug#57106
|
|
if (*pszIndex == 0x002e)
|
|
{
|
|
pszIndex++;
|
|
iRetValue++;
|
|
fNeedEndingCluster = false;
|
|
}
|
|
*/
|
|
}
|
|
|
|
if (fNeedEndingCluster)
|
|
fNeedEndingCluster = false;
|
|
else
|
|
break;
|
|
}
|
|
|
|
if (iRetValue == 0)
|
|
{
|
|
// O11.134455. Ellipse case we go to combine ellipses to one cluster.
|
|
if (*pszIndex == 0x002e)
|
|
{
|
|
while (*pszIndex == 0x002e && pszIndex <= pszEnd)
|
|
{
|
|
pszIndex++;
|
|
iRetValue++;
|
|
}
|
|
}
|
|
else
|
|
iRetValue++; // The character is probably a punctuation.
|
|
}
|
|
|
|
if (pszIndex > pszEnd)
|
|
{
|
|
// We need to do this as we have gone over end buff boundary.
|
|
iRetValue -= (int) (pszIndex - pszEnd);
|
|
pszIndex = pszEnd;
|
|
}
|
|
return iRetValue;
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Class: CThaiBreakTree
|
|
//
|
|
// Synopsis:
|
|
//
|
|
// Arguments:
|
|
//
|
|
// wzWord - input string. (in)
|
|
// iWordLen - input string length. (in)
|
|
// Alt - find close alternate word (in)
|
|
// pBreakPos - array of break position allways 5 byte. (out)
|
|
//
|
|
// Modifies:
|
|
//
|
|
// History: created 3/00 aarayas
|
|
//
|
|
// Notes:
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
int CThaiBreakTree::FindAltWord(WCHAR* pwchBegin,unsigned int iWordLen, BYTE Alt, BYTE* pBreakPos)
|
|
{
|
|
// Declare and initialize local variables.
|
|
unsigned int iNumCluster = 1;
|
|
WCHAR* pwchBeginWord = pwchBegin;
|
|
WCHAR* pwchIndex = pwchBegin;
|
|
bool fBeginNewWord = true;
|
|
unsigned int iBreakIndex = 0;
|
|
unsigned int iBreakTemp = 0;
|
|
unsigned int iBreakTemp1 = 0;
|
|
unsigned int iBreakTemp2 = 0;
|
|
|
|
pszEnd = pwchBegin + iWordLen;
|
|
|
|
// TODO: Need to clean this code up.
|
|
switch(Alt)
|
|
{
|
|
case 3:
|
|
while (true)
|
|
{
|
|
iNumCluster = GetCluster(pwchIndex);
|
|
|
|
if (!thaiTrieIter1.MoveCluster(pwchIndex, iNumCluster, fBeginNewWord))
|
|
return iBreakIndex;
|
|
|
|
fBeginNewWord = false;
|
|
pwchIndex += iNumCluster;
|
|
if (thaiTrieIter1.fWordEnd)
|
|
{
|
|
iBreakTemp = (unsigned int)(pwchIndex - pwchBeginWord);
|
|
|
|
// reached the end of word unable to find alt word.
|
|
if (iBreakTemp >= iWordLen)
|
|
return 0;
|
|
|
|
iBreakTemp1 = GetWeight(pwchIndex);
|
|
|
|
// reached the end of word unable to find alt word.
|
|
if (iBreakTemp + iBreakTemp1 >= iWordLen)
|
|
return 0;
|
|
|
|
iBreakTemp2 = GetWeight(pwchIndex+iBreakTemp1);
|
|
if (iBreakTemp + iBreakTemp1 + iBreakTemp2 == iWordLen)
|
|
{
|
|
pBreakPos[0] = (BYTE)iBreakTemp;
|
|
pBreakPos[1] = (BYTE)iBreakTemp1;
|
|
pBreakPos[2] = (BYTE)iBreakTemp2;
|
|
return 3;
|
|
}
|
|
}
|
|
if (pwchIndex >= pszEnd)
|
|
return iBreakIndex;
|
|
}
|
|
break;
|
|
case 2:
|
|
while (true)
|
|
{
|
|
iNumCluster = GetCluster(pwchIndex);
|
|
|
|
if (!thaiTrieIter1.MoveCluster(pwchIndex, iNumCluster, fBeginNewWord))
|
|
return iBreakIndex;
|
|
|
|
fBeginNewWord = false;
|
|
pwchIndex += iNumCluster;
|
|
if (thaiTrieIter1.fWordEnd)
|
|
{
|
|
iBreakTemp = (unsigned int)(pwchIndex - pwchBeginWord);
|
|
|
|
// reached the end of word unable to find alt word.
|
|
if (iBreakTemp >= iWordLen)
|
|
return 0;
|
|
|
|
iBreakTemp1 = GetWeight(pwchIndex);
|
|
if (iBreakTemp + iBreakTemp1 == iWordLen)
|
|
{
|
|
pBreakPos[0] = (BYTE)iBreakTemp;
|
|
pBreakPos[1] = (BYTE)iBreakTemp1;
|
|
return 2;
|
|
}
|
|
}
|
|
if (pwchIndex >= pszEnd)
|
|
return iBreakIndex;
|
|
}
|
|
break;
|
|
default:
|
|
case 1:
|
|
while (iBreakIndex < Alt)
|
|
{
|
|
iNumCluster = GetCluster(pwchIndex);
|
|
|
|
if (!thaiTrieIter1.MoveCluster(pwchIndex, iNumCluster, fBeginNewWord))
|
|
return iBreakIndex;
|
|
|
|
fBeginNewWord = false;
|
|
pwchIndex += iNumCluster;
|
|
if (thaiTrieIter1.fWordEnd)
|
|
{
|
|
fBeginNewWord = true;
|
|
|
|
iBreakTemp = (unsigned int)(pwchIndex - pwchBeginWord);
|
|
|
|
// reached the end of word unable to find alt word.
|
|
if (iBreakTemp >= iWordLen)
|
|
return 0;
|
|
|
|
iBreakTemp1 = GetLongestSubstring(pwchBeginWord,iWordLen);
|
|
|
|
if (iBreakTemp1 > iBreakTemp && iBreakTemp1 < iWordLen)
|
|
pBreakPos[iBreakIndex] = (BYTE) iBreakTemp1;
|
|
else
|
|
pBreakPos[iBreakIndex] = (BYTE) iBreakTemp;
|
|
pwchBeginWord += pBreakPos[iBreakIndex];
|
|
iWordLen -= pBreakPos[iBreakIndex];
|
|
iBreakIndex++;
|
|
}
|
|
if (pwchIndex >= pszEnd)
|
|
return iBreakIndex;
|
|
}
|
|
break;
|
|
}
|
|
|
|
return iBreakIndex;
|
|
}
|