Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

2169 lines
71 KiB

//+---------------------------------------------------------------------------
//
//
// CThaiBreakTree - class CThaiBreakTree
//
// History:
// created 7/99 aarayas
//
// ©1999 Microsoft Corporation
//----------------------------------------------------------------------------
#include "CThaiBreakTree.hpp"
//+---------------------------------------------------------------------------
//
// Function: ExtractPOS
//
// Synopsis: The function takes a tag and returns its Part Of Speech field.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Isolates the part-of-speech field of a packed tag: the bits selected by
// iPosMask, moved down by iPosShift.
inline WCHAR ExtractPOS(DWORD dwTag)
{
    return static_cast<WCHAR>((dwTag & iPosMask) >> iPosShift);
}
//+---------------------------------------------------------------------------
//
// Function: ExtractFrq
//
// Synopsis: The function takes a tag and returns the frequency field of the word.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Isolates the frequency field of a packed tag: bits 8 and 9 (mask 0x300),
// moved down by iFrqShift.
inline BYTE ExtractFrq(DWORD dwTag)
{
    return static_cast<BYTE>((dwTag & 0x300) >> iFrqShift);
}
//+---------------------------------------------------------------------------
//
// Function: DetermineFrequencyWeight
//
// Synopsis: The function adjusts *uiWeight according to the frequency class of a word.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Adjusts *uiWeight in place according to the frequency class frq:
//   frqpenInfrequent -> -2, frqpenSomewhat -> -1, frqpenVery -> +2,
//   frqpenNormal or any other value -> +1.
// The weight is unsigned, so the subtractions wrap if the weight is below
// the amount subtracted.
inline void DetermineFrequencyWeight(BYTE frq, unsigned int* uiWeight)
{
    if (frq == frqpenInfrequent)
        *uiWeight -= 2;
    else if (frq == frqpenSomewhat)
        *uiWeight -= 1;
    else if (frq == frqpenVery)
        *uiWeight += 2;
    else
        *uiWeight += 1;     // frqpenNormal and every unrecognised value
}
//+---------------------------------------------------------------------------
//
// Function: DetermineFrequencyWeight
//
// Synopsis: The function adjusts *uiWeight (DWORD overload) according to the frequency class of a word.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// DWORD overload: adjusts *uiWeight in place according to the frequency
// class frq (infrequent -2, somewhat -1, very +2, normal/other +1).
inline void DetermineFrequencyWeight(BYTE frq, DWORD* uiWeight)
{
    switch (frq)
    {
    case frqpenInfrequent:
        *uiWeight -= 2;
        return;
    case frqpenSomewhat:
        *uiWeight -= 1;
        return;
    case frqpenVery:
        *uiWeight += 2;
        return;
    default:    // includes frqpenNormal
        *uiWeight += 1;
        return;
    }
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Constructor - initialize local variables
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Constructor: zeroes the indices, nulls every pointer member, then
// allocates the working buffers. The maximal-matching buffers are left
// NULL here.
CThaiBreakTree::CThaiBreakTree()
    : iNodeIndex(0),
      iNumNode(0),
      pszBegin(NULL),
      pszEnd(NULL),
      breakTree(NULL),
      breakArray(NULL),
      tagArray(NULL),
      maximalMatchingBreakArray(NULL),
      maximalMatchingTAGArray(NULL),
      POSArray(NULL),
      maximalMatchingPOSArray(NULL)
{
#if defined (NGRAM_ENABLE)
    // The node pool is only allocated in NGRAM builds.
    breakTree = new ThaiBreakNode[MAXTHAIBREAKNODE];
#endif

    // Per-candidate scratch arrays, MAXBREAK entries each.
    breakArray = new BYTE[MAXBREAK];
    tagArray   = new DWORD[MAXBREAK];
    POSArray   = new WCHAR[MAXBREAK];
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Destructor - clean up code
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Destructor: releases every buffer owned by the object.
//
// All of these buffers are allocated with new[] (in the constructor and in
// MaximalMatching), so they must be released with delete[]; pairing new[]
// with scalar delete is undefined behaviour. delete[] on a NULL pointer is
// a no-op, so no explicit NULL checks are needed.
CThaiBreakTree::~CThaiBreakTree()
{
#if defined (NGRAM_ENABLE)
    delete[] breakTree;
    delete[] maximalMatchingBreakArray;
    delete[] maximalMatchingTAGArray;
    delete[] maximalMatchingPOSArray;
#endif
    delete[] breakArray;
    delete[] tagArray;
    delete[] POSArray;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Associate the class to the string.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Attaches the object's trie iterators to the tries supplied by the caller.
// NGRAM builds take an additional sentence-structure trie.
#if defined (NGRAM_ENABLE)
void CThaiBreakTree::Init(CTrie* pTrie, CTrie* pSentTrie, CTrie* pTrigramTrie)
#else
void CThaiBreakTree::Init(CTrie* pTrie, CTrie* pTrigramTrie)
#endif
{
    // Both word iterators are bound to the same trie.
    assert(pTrie != NULL);
    thaiTrieIter.Init(pTrie);
    thaiTrieIter1.Init(pTrie);

#if defined (NGRAM_ENABLE)
    assert(pSentTrie != NULL);
    thaiSentIter.Init(pSentTrie);
#endif

    assert(pTrigramTrie != NULL);
    thaiTrigramIter.Init(pTrigramTrie);
}
#if defined (NGRAM_ENABLE)
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: reset iterator to top of the tree
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Puts the tree cursor back on node 0.
inline void CThaiBreakTree::Reset()
{
    iNodeIndex = 0;     // node 0 is where every traversal starts
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Move to the next break.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Moves the cursor along the NextBreak link of the current node.
// Returns false when that link is 0 (the cursor then sits on node 0).
inline bool CThaiBreakTree::MoveNext()
{
    iNodeIndex = breakTree[iNodeIndex].NextBreak;
    if (iNodeIndex == 0)
        return false;
    return true;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Move down to next level.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Moves the cursor along the Down link of the current node.
// Returns false when that link is 0 (the cursor then sits on node 0).
inline bool CThaiBreakTree::MoveDown()
{
    iNodeIndex = breakTree[iNodeIndex].Down;
    if (iNodeIndex == 0)
        return false;
    return true;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: create new node to position, and return index to the node.
//
// * return Unable to Create Node.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
inline unsigned int CThaiBreakTree::CreateNode(int iPos, BYTE iBreakLen, DWORD dwTAG)
{
assert(iNumNode < MAXTHAIBREAKNODE);
if (iNumNode >= MAXTHAIBREAKNODE)
{
return UNABLETOCREATENODE;
}
breakTree[iNumNode].iPos = iPos;
breakTree[iNumNode].iBreakLen = iBreakLen;
breakTree[iNumNode].dwTAG = dwTAG;
breakTree[iNumNode].NextBreak = 0;
breakTree[iNumNode].Down = 0;
iNumNode++;
return (iNumNode - 1);
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Generate a Tree of possible break from the given string.
//
// * Note - false if there aren't enough memory to create node.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Outcome of scanning one word position in GenerateTree.
enum thai_parse_state
{
    END_SENTENCE    = 0,    // Reached the end of sentence.
    LONGEST_MATCH   = 1,    // Longest possible matched.
    NOMATCH_FOUND   = 2,    // Unable to find word.
    ERROR_OUTMEMORY = 3     // Out of Memory.
};
// Builds breakTree for the text that starts at pszBegin and ends at pszEnd1.
//
// Resets iNodeIndex and iNumNode to 0 and stores pszEnd1 in the member
// pszEnd. The pszBegin parameter shadows the member of the same name; the
// member is not written here.
//
// Each node records a start offset (iPos), a length (iBreakLen) and a tag.
// The first dictionary match found after a node is linked through that
// node's NextBreak; further matches from the same start position are
// chained through Down.
//
// Returns false only when a node could not be created (parseState ends as
// ERROR_OUTMEMORY); true otherwise.
bool CThaiBreakTree::GenerateTree(WCHAR* pszBegin, WCHAR* pszEnd1)
{
// Declare and initialize local variables.
unsigned int iIndexBreakTree = 0;
unsigned int iPrevIndexBreakTree = 0;
unsigned int iParentNode = 0;
WCHAR* pszBeginWord = pszBegin;
WCHAR* pszIndex = pszBegin;
unsigned int iNumCluster = 1;
unsigned int iNumLastCluster;
unsigned int iWordLen = 0;
unsigned int iNodeAnalyze = 0;
thai_parse_state parseState = END_SENTENCE;
bool fFoundMatch = false;
bool fAddToNodeAnalyze = false;
bool fDoneGenerateTree = false;
pszEnd = pszEnd1;
#if defined (_DEBUG)
memset(breakTree,0,sizeof(ThaiBreakNode)*MAXTHAIBREAKNODE);
#endif
iNodeIndex = 0;
iNumNode = 0;
// Each pass of this loop expands one node (iNodeAnalyze) by scanning for
// words that start right after it.
while (true)
{
// Reset Iterator for generating break for new word.
fFoundMatch = false;
thaiTrieIter.Reset();
// Once the first node exists, skip over nodes that already end at pszEnd
// or are tagged TAGPOS_PURGE; when none are left the tree is complete.
if (iIndexBreakTree != 0)
{
while (true)
{
// If this is not the first node than set pszBeginWord after the last break.
pszBeginWord = pszBegin + breakTree[iNodeAnalyze].iPos + breakTree[iNodeAnalyze].iBreakLen;
fAddToNodeAnalyze = true;
// Are we at the end of the sentence.
if ( (pszBeginWord == pszEnd) ||
(breakTree[iNodeAnalyze].dwTAG == TAGPOS_PURGE) )
{
iNodeAnalyze++; // Move to next node.
if (iNodeAnalyze >= iNumNode)
{
fDoneGenerateTree = true;
break;
}
}
else
break;
}
}
pszIndex = pszBeginWord;
iParentNode = iNodeAnalyze;
if (fDoneGenerateTree)
break;
// Get next level of tree.
// Walk the trie one cluster at a time; every position where
// thaiTrieIter.fWordEnd is set produces a candidate node.
while (TRUE)
{
iNumLastCluster = iNumCluster;
iNumCluster = GetCluster(pszIndex);
if (thaiTrieIter.MoveCluster(pszIndex, iNumCluster))
{
pszIndex += iNumCluster;
if (thaiTrieIter.fWordEnd)
{
fFoundMatch = true;
// if first node add first node
if (iIndexBreakTree == 0)
{
// NOTE(review): the return value of CreateNode is not checked here.
CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, thaiTrieIter.dwTag);
iIndexBreakTree++;
}
else
{
if (fAddToNodeAnalyze)
{
// First match after the node being analyzed: link it through NextBreak.
fAddToNodeAnalyze = false;
breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, thaiTrieIter.dwTag);
// Determine if an error has occur.
if (breakTree[iNodeAnalyze].NextBreak == UNABLETOCREATENODE)
{
breakTree[iNodeAnalyze].NextBreak = 0;
parseState = ERROR_OUTMEMORY;
break;
}
iPrevIndexBreakTree = breakTree[iNodeAnalyze].NextBreak;
iNodeAnalyze++;
}
else
{
// Further match from the same start: chain it through Down of the
// previously created node.
breakTree[iPrevIndexBreakTree].Down = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, thaiTrieIter.dwTag);
// Determine if an error has occur.
if (breakTree[iPrevIndexBreakTree].Down == UNABLETOCREATENODE)
{
breakTree[iPrevIndexBreakTree].Down = 0;
parseState = ERROR_OUTMEMORY;
break;
}
iPrevIndexBreakTree = iIndexBreakTree;
}
iIndexBreakTree++;
}
}
if (pszIndex >= pszEnd)
{
assert(pszIndex <= pszEnd); // assert should never come up - if it appear likely bug in GetCluster funciton.
parseState = END_SENTENCE;
break;
}
}
else
{
if (fFoundMatch)
parseState = LONGEST_MATCH;
else
parseState = NOMATCH_FOUND;
break;
}
}
// Act on how the scan for this word position ended.
if (parseState == LONGEST_MATCH)
{
// We found a matched.
assert(breakTree[iPrevIndexBreakTree].Down == 0); // at this point breakTree[iPreveIndexBreakTree].Down should equal null.(optimization note)
if (breakTree[iParentNode].NextBreak != iPrevIndexBreakTree)
{
assert(breakTree[iPrevIndexBreakTree].dwTAG != TAGPOS_UNKNOWN); // shouldn't assert because the end node should ever be unknown.
DeterminePurgeEndingSentence(pszBeginWord, breakTree[iParentNode].NextBreak);
}
}
else if (parseState == NOMATCH_FOUND)
{
// Should mark node as unknown.
if (fAddToNodeAnalyze)
{
fAddToNodeAnalyze = false;
iWordLen = pszIndex - pszBeginWord;
// Make sure we don't only have a cluster of text before making a node.
if (iWordLen == 0)
{
// If we have an UNKNOWN word of one character only current node mark it as unknown.
assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode
breakTree[iNodeAnalyze].iBreakLen += iNumCluster;
breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen);
}
else
{
if (breakTree[iNodeAnalyze].iBreakLen + iWordLen < 8)
// The reason we are using 8 is because from corpora analysis
// the average Thai word is about 7.732 characters.
// TODO: We should add orthographic analysis here to get a better on boundary
// of unknown word.
{
assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode
breakTree[iNodeAnalyze].iBreakLen += iWordLen;
breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen);
}
else
{
// If a word can start at the last scanned cluster, leave that cluster out
// of the unknown node.
if (GetWeight(pszIndex - iNumLastCluster))
breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen - iNumLastCluster, TAGPOS_UNKNOWN);
else
breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen, TAGPOS_UNKNOWN);
// Determine if an error has occur.
if (breakTree[iNodeAnalyze].NextBreak == UNABLETOCREATENODE)
{
breakTree[iNodeAnalyze].NextBreak = 0;
parseState = ERROR_OUTMEMORY;
break;
}
iNodeAnalyze++;
iIndexBreakTree++;
}
}
}
else
{
breakTree[iPrevIndexBreakTree].Down = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, TAGPOS_UNKNOWN);
// Determine if an error has occur.
if (breakTree[iPrevIndexBreakTree].Down == UNABLETOCREATENODE)
{
breakTree[iPrevIndexBreakTree].Down = 0;
parseState = ERROR_OUTMEMORY;
break;
}
iIndexBreakTree++;
}
}
else if (parseState == END_SENTENCE)
{
// If we find ourself at the end of a sentence and no match.
if (!fFoundMatch)
{
if (fAddToNodeAnalyze)
{
fAddToNodeAnalyze = false;
iWordLen = pszIndex - pszBeginWord;
// Make sure we don't only have a cluster of text before making a node.
if (iWordLen == 0)
{
// If we have an UNKNOWN word of one character only current node mark it as unknown.
assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode
breakTree[iNodeAnalyze].iBreakLen += iNumCluster;
breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen);
}
else
{
if (breakTree[iNodeAnalyze].iBreakLen + iWordLen < 8)
// The reason we are using 8 is because from corpora analysis
// the average Thai word is about 7.732 characters.
// TODO: We should add orthographic analysis here to get a better on boundary
// of unknown word.
{
assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode
breakTree[iNodeAnalyze].iBreakLen += iWordLen;
breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen);
}
else
{
if (GetWeight(pszIndex - iNumLastCluster))
breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen - iNumLastCluster, TAGPOS_UNKNOWN);
else
breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen, TAGPOS_UNKNOWN);
// Determine if an error has occur.
if (breakTree[iNodeAnalyze].NextBreak == UNABLETOCREATENODE)
{
breakTree[iNodeAnalyze].NextBreak = 0;
parseState = ERROR_OUTMEMORY;
break;
}
iNodeAnalyze++;
iIndexBreakTree++;
}
}
}
else
{
breakTree[iPrevIndexBreakTree].Down = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, TAGPOS_UNKNOWN);
// Determine if an error has occur.
if (breakTree[iPrevIndexBreakTree].Down == UNABLETOCREATENODE)
{
breakTree[iPrevIndexBreakTree].Down = 0;
parseState = ERROR_OUTMEMORY;
break;
}
}
// NOTE(review): unlike the NOMATCH_FOUND branch above, this increment also
// runs when no node was created (the two "extend current node" cases) and
// runs a second time after the NextBreak path already incremented it.
// iIndexBreakTree is later used as a node index (iPrevIndexBreakTree =
// iIndexBreakTree) - confirm this asymmetry is intended.
iIndexBreakTree++;
}
// If the beginning of node the branch isn't equal to leaf node perphase it is possible to
// do some ending optimization.
else if (breakTree[iParentNode].NextBreak != iPrevIndexBreakTree)
{
assert(breakTree[iPrevIndexBreakTree].dwTAG != TAGPOS_UNKNOWN); // shouldn't assert because the end node should ever be unknown.
DeterminePurgeEndingSentence(pszBeginWord, breakTree[iParentNode].NextBreak);
}
}
// Reached only when parseState is ERROR_OUTMEMORY (set inside the cluster
// loop above); either operand being true stops tree generation.
else if ( (breakTree[iNodeAnalyze].iBreakLen == 0) || (parseState == ERROR_OUTMEMORY) )
break;
}
return (parseState != ERROR_OUTMEMORY);
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Traverse all the tree and look for the least number of token.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
bool CThaiBreakTree::MaximalMatching()
{
// If maximal matching break array has not been allocate, than allocate it.
if (!maximalMatchingBreakArray)
maximalMatchingBreakArray = new BYTE[MAXBREAK];
if (!maximalMatchingTAGArray)
maximalMatchingTAGArray = new DWORD[MAXBREAK];
if (!maximalMatchingPOSArray)
maximalMatchingPOSArray = new WCHAR[MAXBREAK];
maxLevel = MAXUNSIGNEDINT;
maxToken = 0;
iNumUnknownMaximalPOSArray = MAXBREAK;
Traverse(0,0,0);
return true;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: The function determine if the node if the node should,
// be tag as unknown or purge.
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Decides whether node iCurrentNode, whose length has just been extended to
// iBreakLen, should be tagged TAGPOS_PURGE or TAGPOS_UNKNOWN.
//
// The Down chain of the node is scanned. The node is purged when an
// alternative on that chain
//   - has exactly the same length, or
//   - is shorter and is a real word (tagged neither TAGPOS_UNKNOWN nor
//     TAGPOS_PURGE).
// Otherwise the node is tagged TAGPOS_UNKNOWN.
//
// Fix: the tag test used to read
//   (tag != TAGPOS_UNKNOWN) || (tag != TAGPOS_PURGE)
// which is true for every tag, so any shorter alternative caused a purge.
// The two inequalities are now joined with &&.
inline DWORD CThaiBreakTree::DeterminePurgeOrUnknown(unsigned int iCurrentNode, unsigned int iBreakLen)
{
    for (unsigned int iNode = breakTree[iCurrentNode].Down;
         iNode != 0;
         iNode = breakTree[iNode].Down)
    {
        const bool fSameLength = (breakTree[iNode].iBreakLen == iBreakLen);
        const bool fShorter    = (breakTree[iNode].iBreakLen < iBreakLen);
        const bool fRealWord   = (breakTree[iNode].dwTAG != TAGPOS_UNKNOWN) &&
                                 (breakTree[iNode].dwTAG != TAGPOS_PURGE);

        if (fSameLength || (fShorter && fRealWord))
        {
            // Since we are purging this break just make sure the NextBreak is Null.
            assert(breakTree[iCurrentNode].NextBreak == 0);
            return TAGPOS_PURGE;
        }
    }
    return TAGPOS_UNKNOWN;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Ending optimization - if we have found the end of a sentence,
// and possible break. Purge the branch for unnecessary break.
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Ending optimization: follows the Down chain starting at iNode and tags as
// TAGPOS_PURGE every node whose remaining text (pszBeginWord + iBreakLen)
// has a GetWeight of 0. The last node of the chain (Down == 0) is not
// examined.
inline void CThaiBreakTree::DeterminePurgeEndingSentence(WCHAR* pszBeginWord, unsigned int iNode)
{
    for ( ; breakTree[iNode].Down != 0; iNode = breakTree[iNode].Down)
    {
        // TODO: We may need to change this once the GetWeight add soundex
        //       functionality.
        WCHAR* pszRemainder = pszBeginWord + breakTree[iNode].iBreakLen;
        if (GetWeight(pszRemainder) != 0)
            continue;   // what follows can still start a word - keep the break

        // Since we are purging this break just make sure the NextBreak is Null.
        assert(breakTree[iNode].NextBreak == 0);
        breakTree[iNode].dwTAG = TAGPOS_PURGE;
    }
}
#endif
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Returns the length of the longest dictionary word that starts at pszBegin.
// If that length equals iWordLen and a shorter word end was also seen, the
// last word end seen before it is returned instead.
//
// Special cases, measured against the member pszEnd:
//   exactly one character left -> 0
//   nothing left               -> 1000
unsigned int CThaiBreakTree::GetLongestSubstring(WCHAR* pszBegin, unsigned int iWordLen)
{
    if ((pszEnd - pszBegin) == 1)
        return 0;
    if (pszEnd == pszBegin)
        return 1000;

    unsigned int uBestLen  = 0;     // most recent word end seen
    unsigned int uPrevLen  = 0;     // word end seen just before uBestLen
    unsigned int cchCluster = 1;
    bool         fRestart  = true;  // first MoveCluster call starts a new word
    WCHAR*       pszCursor = pszBegin;

    for (;;)
    {
        cchCluster = GetCluster(pszCursor);
        if (!thaiTrieIter.MoveCluster(pszCursor, cchCluster, fRestart))
            break;

        fRestart = false;
        pszCursor += cchCluster;
        if (thaiTrieIter.fWordEnd)
        {
            uPrevLen = uBestLen;
            uBestLen = (unsigned int) (pszCursor - pszBegin);
        }
    }

    // The trie walk stopped: fall back to the previous word end when the
    // longest one is exactly iWordLen.
    if ((uBestLen == iWordLen) && (uPrevLen < uBestLen) && (uPrevLen > 0))
        uBestLen = uPrevLen;

    return uBestLen;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Returns the length of the longest dictionary word that starts at pszBegin,
// or 0 when no word starts there.
//
// Special cases, measured against the member pszEnd:
//   exactly one character left -> 0
//   nothing left               -> 1000
unsigned int CThaiBreakTree::GetWeight(WCHAR* pszBegin)
{
    if ((pszEnd - pszBegin) == 1)
        return 0;
    if (pszEnd == pszBegin)
        return 1000;

    unsigned int uLongest   = 0;
    unsigned int cchCluster = 1;
    bool         fRestart   = true;     // first MoveCluster call starts a new word
    WCHAR*       pszCursor  = pszBegin;

    for (;;)
    {
        cchCluster = GetCluster(pszCursor);
        if (!thaiTrieIter.MoveCluster(pszCursor, cchCluster, fRestart))
            break;

        fRestart = false;
        pszCursor += cchCluster;
        if (thaiTrieIter.fWordEnd)
            uLongest = (unsigned int) (pszCursor - pszBegin);
    }
    return uLongest;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Same as GetWeight(pszBegin), and additionally stores the tag of the
// longest word found in *pdwTag. *pdwTag is left untouched when no word
// end is reached (including the two short-circuit cases).
//
// Special cases, measured against the member pszEnd:
//   exactly one character left -> 0
//   nothing left               -> 1000
unsigned int CThaiBreakTree::GetWeight(WCHAR* pszBegin, DWORD* pdwTag)
{
    if ((pszEnd - pszBegin) == 1)
        return 0;
    if (pszEnd == pszBegin)
        return 1000;

    unsigned int uLongest   = 0;
    unsigned int cchCluster = 1;
    bool         fRestart   = true;     // first MoveCluster call starts a new word
    WCHAR*       pszCursor  = pszBegin;

    for (;;)
    {
        cchCluster = GetCluster(pszCursor);
        if (!thaiTrieIter.MoveCluster(pszCursor, cchCluster, fRestart))
            break;

        fRestart = false;
        pszCursor += cchCluster;
        if (thaiTrieIter.fWordEnd)
        {
            uLongest = (unsigned int) (pszCursor - pszBegin);
            *pdwTag  = thaiTrieIter.dwTag;
        }
    }
    return uLongest;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Traverse the tree.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Recursive walk of the break tree.
//   iLevel       - slot of breakArray/tagArray filled by this node
//   iCurrentNode - index of the node being visited
//   iNumUnknown  - number of TAGPOS_UNKNOWN nodes on the path so far
//
// NextBreak links go one level deeper; Down links are alternatives for the
// same level. When a node has no NextBreak and is not purged, the path
// collected so far is offered to AddBreakToList.
bool CThaiBreakTree::Traverse(unsigned int iLevel, unsigned int iCurrentNode, unsigned int iNumUnknown)
{
    assert (iLevel < MAXBREAK);

    // Record this node in the current path.
    breakArray[iLevel] = breakTree[iCurrentNode].iBreakLen;
    tagArray[iLevel]   = breakTree[iCurrentNode].dwTAG;
    if (tagArray[iLevel] == TAGPOS_UNKNOWN)
        iNumUnknown++;

    const bool fEndOfPath = (breakTree[iCurrentNode].NextBreak == 0);
    if (!fEndOfPath)
        Traverse(iLevel + 1, breakTree[iCurrentNode].NextBreak, iNumUnknown);
    else if (breakTree[iCurrentNode].dwTAG != TAGPOS_PURGE)
        AddBreakToList(iLevel + 1, iNumUnknown);

    // No alternative at this level: done.
    if (breakTree[iCurrentNode].Down == 0)
        return true;

    // The alternative replaces this node in the path, so take this node's
    // contribution to the unknown count back out.
    if (tagArray[iLevel] == TAGPOS_UNKNOWN)
        iNumUnknown--;

    const bool fAlternative = Traverse(iLevel, breakTree[iCurrentNode].Down, iNumUnknown);
    return fEndOfPath ? fAlternative : true;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Sound-alike search: walks thaiTrieIter1 from pszBegin cluster by cluster,
// accumulating a penalty for every substitution reported by
// MoveSoundexByCluster. Returns the length of the longest word end reached
// while the accumulated penalty is still <= 2, or 0 if there is none.
// Returns 0 immediately when fewer than two characters remain before the
// member pszEnd.
unsigned int CThaiBreakTree::SoundexSearch(WCHAR* pszBegin)
{
    unsigned int uLongest = 0;
    if ( (pszBegin+1) >= pszEnd )
        return uLongest;

    unsigned int cchCluster     = 1;
    unsigned int cchNextCluster = 1;
    unsigned int uPenalty       = 0;
    WCHAR*       pszCursor      = pszBegin;

    thaiTrieIter1.Reset();

    for (;;)
    {
        cchCluster = GetCluster(pszCursor);

        // The look-ahead cluster length is 0 when this cluster is the last one.
        if (pszCursor + cchCluster < pszEnd)
            cchNextCluster = GetCluster(pszCursor+cchCluster);
        else
            cchNextCluster = 0;

        switch (thaiTrieIter1.MoveSoundexByCluster(pszCursor, cchCluster, cchNextCluster))
        {
        case SUBSTITUTE_SOUNDLIKECHAR:
        case UNABLE_TO_MOVE:
            uPenalty += 2;
            break;
        case SUBSTITUTE_DIACRITIC:
            uPenalty += 1;
            break;
        case STOP_MOVE:
            uPenalty += 1000;
            break;
        case NOSUBSTITUTE:
        default:
            break;
        }

        // Too many substitutions: give up on going any further.
        if (uPenalty > 2)
            break;

        pszCursor += cchCluster;
        if (thaiTrieIter1.fWordEnd)
            uLongest = (unsigned int) (pszCursor - pszBegin);
    }
    return uLongest;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: The information used here is a reference to the orthographic
// analysis work done on the Thai languages. (see paper: Natural
// Language Processing in Thailand 1993 Chulalongkorn. p 361).
//
// Arguments: pszBoundaryChar - Contain pointer to at least two thai character
// character next to each other which we will
// use to calculate wheather we should or
// should not merge the two word.
//
// iPrevWordLen -
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
inline bool CThaiBreakTree::ShouldMerge(const WCHAR* pwszPrevWord, unsigned int iPrevWordLen, unsigned int iMergeWordLen, DWORD dwPrevTag)
{
const WCHAR* pwszBoundary = pwszPrevWord + iPrevWordLen - 1;
assert(iMergeWordLen != 0);
assert(iPrevWordLen != 0);
// There are very few words in Thai that are 4 character or less, therefore we should
// found a pair that less than 4 character we should merge.
// Or if merge word length is one than also merge.
// Of if last cluster of the word is a Thanthakhat(Karan) we should always merge.
if (iPrevWordLen + iMergeWordLen <= 4 || iMergeWordLen == 1 ||
(iMergeWordLen == 2 && *(pwszBoundary + iMergeWordLen) == THAI_Thanthakhat))
return true;
if (iPrevWordLen >=2)
{
const WCHAR* pwszPrevCharBoundary = pwszBoundary - 1;
// TO IMPROVE: It better to check the last character of Previous word, it can give us a
// much better guess
if ((*pwszPrevCharBoundary == THAI_Vowel_Sign_Mai_HanAkat || *pwszBoundary == THAI_Vowel_Sign_Mai_HanAkat) ||
(*pwszPrevCharBoundary == THAI_Tone_Mai_Tri || *pwszBoundary == THAI_Tone_Mai_Tri) ||
(*pwszPrevCharBoundary == THAI_Sara_Ue || *pwszBoundary == THAI_Sara_Ue) )
return true;
}
// If the first character of the next word is mostly likly the beginning
// character and last character of the previous word is not sara-A than
// we have a high probability that we found a begin of word boundary,
// therefore we shouldn't merge.
if ( (IsThaiMostlyBeginCharacter(pwszBoundary[1]) && *pwszBoundary != THAI_Vowel_Sara_A) )
return false;
// If the last character of the previous word is mostly likely an ending
// character than, than there is a high probability that the found a boundary.
// There are very few words in Thai that are 4 character or less, therefore we should
// found a pair that less than 4 character we should merge.
if (IsThaiMostlyLastCharacter(*pwszBoundary))
return false;
// O10.192931 Adding Diacritic check rules. We might want to expand this to more diacritic
// for now Mai HanAkart would do. It is highly unlikely that a word contain more than 1 of Mai HanAkart diacritic.
if (IsContain(pwszPrevWord,iPrevWordLen,THAI_Vowel_Sign_Mai_HanAkat) && IsContain(pwszBoundary + 1,iMergeWordLen,THAI_Vowel_Sign_Mai_HanAkat))
return false;
if (iMergeWordLen == 3 && GetCluster(pwszBoundary + 1) == iMergeWordLen)
{
if (*(pwszBoundary + 2) == THAI_Vowel_Sara_I)
{
if (*(pwszBoundary+3) == THAI_Tone_Mai_Ek || *(pwszBoundary+3) == THAI_Tone_Mai_Tro)
return false;
}
}
// if previous tag is equal to Title Noun than the next word is highly likly to be a name.
if (ExtractPOS(dwPrevTag) == 6)
return false;
// O11.134455. For the case of trailling punctuation.
if (dwPrevTag == TAGPOS_PUNC && iMergeWordLen > 1 && iPrevWordLen > 1)
return false;
// The reason we are using 8 is because from corpora analysis
// the average Thai word is about 7.732 characters. Or, if previous word is already
// an unknown, to keep the amount of unknown low the unknown to previous words.
if ( (iPrevWordLen + iMergeWordLen < 8) || (dwPrevTag == TAGPOS_UNKNOWN) )
return true;
return false;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
// 8/17/99 optimize some code.
//
// Notes:
//
//----------------------------------------------------------------------------
// Offers the current path (the first iNumBreak entries of breakArray and
// tagArray) as a candidate. When CompareSentenceStructure accepts it, the
// path is copied into the maximalMatching arrays, a 0 terminator is written
// after it, and maxLevel/maxToken are set to iNumBreak.
inline void CThaiBreakTree::AddBreakToList(unsigned int iNumBreak, unsigned int iNumUnknown)
{
#if defined (_DEBUG)
    breakArray[iNumBreak] = 0;
#endif
    if (!CompareSentenceStructure(iNumBreak, iNumUnknown))
        return;

    maxLevel = iNumBreak;
    maxToken = maxLevel;

    memcpy(maximalMatchingBreakArray, breakArray, maxToken);
    maximalMatchingBreakArray[maxToken] = 0;

    memcpy(maximalMatchingTAGArray, tagArray, sizeof(DWORD)*maxToken);
    maximalMatchingTAGArray[maxToken] = 0;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: The function compares sentence structure of
// maximalMatchingPOSArray with posArray.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Decides whether the current path (first iNumBreak entries of tagArray)
// should replace the best path found so far (maximalMatchingTAGArray).
//   iNumBreak           - number of breaks in the current path
//   iNumUnknownPOSArray - number of unknown words in the current path
// Returns true when the current path wins. Whenever it wins on unknown
// count or sentence structure, iNumUnknownMaximalPOSArray is updated.
//
// Order of the tests:
//   1. fewer breaks and no more unknowns than the best path -> wins
//   2. same number of breaks:
//      a. fewer unknowns -> wins
//      b. current path is a known sentence structure and the best is not
//         -> wins
//      c. same number of unknowns -> the higher frequency weight wins
//
// Fix: both loops used to run with i <= iNumBreak, reading
// tagArray[iNumBreak] - a slot the current path never wrote. In the
// frequency loop that stale value changed the result. Only the iNumBreak
// real entries are visited now.
inline bool CThaiBreakTree::CompareSentenceStructure(unsigned int iNumBreak, unsigned int iNumUnknownPOSArray)
{
    if ( (iNumBreak < maxLevel) && (iNumUnknownMaximalPOSArray >= iNumUnknownPOSArray) )
    {
        iNumUnknownMaximalPOSArray = iNumUnknownPOSArray;
        return true;
    }
    if (iNumBreak != maxLevel)
        return false;

    // true - maximal matching has a larger unknown.
    if (iNumUnknownMaximalPOSArray > iNumUnknownPOSArray)
    {
        iNumUnknownMaximalPOSArray = iNumUnknownPOSArray;
        return true;
    }

    // Build the part-of-speech sequences of both paths.
    for (unsigned int i = 0; i < iNumBreak; i++)
    {
        maximalMatchingPOSArray[i] = ExtractPOS(maximalMatchingTAGArray[i]);
        POSArray[i] = ExtractPOS(tagArray[i]);
    }

    // Determine if the sentence structure is like any one of the sentence
    // structures in our corpora.
    if ( (IsSentenceStruct(POSArray, iNumBreak)) &&
         (!IsSentenceStruct(maximalMatchingPOSArray, iNumBreak)) )
    {
        iNumUnknownMaximalPOSArray = iNumUnknownPOSArray;
        return true;
    }

    if (iNumUnknownMaximalPOSArray != iNumUnknownPOSArray)
        return false;

    // Tie on unknown count: compare the frequency of the words used.
    // Both sums start from the same base so that only the words matter.
    unsigned int iFrequencyArray = 500;
    unsigned int iFrequencyMaximalArray = 500;
    for (unsigned int j = 0; j < iNumBreak; j++)
    {
        DetermineFrequencyWeight(ExtractFrq(maximalMatchingTAGArray[j]),&iFrequencyMaximalArray);
        DetermineFrequencyWeight(ExtractFrq(tagArray[j]),&iFrequencyArray);
    }
    return (iFrequencyArray > iFrequencyMaximalArray);
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Looks the part-of-speech sequence pos[0 .. iPosLen-1] up in the sentence
// trie (thaiSentIter). Returns true only when the whole sequence is matched
// and the node reached by its last element is marked as a word end.
bool CThaiBreakTree::IsSentenceStruct(const WCHAR* pos, unsigned int iPosLen)
{
    thaiSentIter.Reset();
    if (!thaiSentIter.Down())
        return FALSE;

    unsigned int iMatched = 0;      // number of leading elements matched so far
    for (;;)
    {
        thaiSentIter.GetNode();

        if (thaiSentIter.pos != pos[iMatched])
        {
            // Not this sibling: move right of the Trie Branch.
            if (!thaiSentIter.Right())
                break;
            continue;
        }

        iMatched++;
        if (iMatched == iPosLen)
            return thaiSentIter.fWordEnd ? TRUE : FALSE;

        // More elements to match: move down the Trie Branch.
        if (!thaiSentIter.Down())
            break;
    }
    return FALSE;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Returns a heuristic weight for a pair of adjacent word tags.
//           The weight starts at 4, gains a bonus when the two parts of
//           speech form one of the favoured pairs listed in the table
//           below, and is then adjusted by the frequency of each tag.
//
// Arguments:
//
//  dwTag1 - tag of the first word.   (in)
//  dwTag2 - tag of the second word.  (in)
//
// Returns:  the weight, converted to float.
//
// Modifies: nothing.
//
// History: created 8/99 aarayas
//
// Notes:    The pair bonus is skipped when either tag is TAGPOS_UNKNOWN;
//           the frequency adjustment is applied in every case.
//
// TODO : Use the distribution of word category to determine optimal search - example
//        NOUN VERB ADVERB CLASSIFIER CONJUNCTION PREP etc.
// TODO : Once we have trigram data use it to create bigram probability as well.
//
//----------------------------------------------------------------------------
float CThaiBreakTree::BigramProbablity(DWORD dwTag1,DWORD dwTag2)
{
    // One row per favoured pair. Rows are tried top to bottom and the first
    // matching row wins. A negative iFirst matches any first part of speech.
    struct PairBonus
    {
        int          iFirst;
        int          iSecond;
        unsigned int uiBonus;
    };

    static const PairBonus rgPairBonus[] =
    {
        // NCMN VATT: a common noun is often followed by an attributive
        // verb (adjective). Example: (in Thai) book good, people nice.
        {  5, 13, 10 },
        // NTTL NPRP: a title noun is often followed by a proper noun.
        // Example: Dr. Athapan, Mr. Sam.
        {  6,  1,  5 },
        // JSBR (XVAM || VSTA): a subordinating conjunction is often followed
        // by a preverb auxiliary or an active verb.
        // Example: (in Thai) Because of, Because see.
        { 39, 15, 10 },
        { 39, 12, 10 },
        // ADVN NCMN: a normal-form adverb is often followed by a common
        // noun (Bug 55057). Example: (in Thai) under table.
        { 28,  5,  5 },
        // VACT XVAE.
        { 11, 18,  5 },
        // VACT DDBQ: active verb followed by a definite determiner.
        // Example: (in Thai) working for, singing again.
        { 11, 21, 10 },
        // VATT VACT: adjectives are followed by verbs.
        { 13, 11,  2 },
        // XVAE VACT: a post-verb auxiliary is often followed by an active
        // verb. Example: (in Thai) come singing, go work.
        { 18, 11, 10 },
        // CLTV NCMN: a collective classifier is often followed by a common
        // noun. Example: (in Thai) group people, flock bird.
        { 33,  5,  5 },
        // NEG followed by some kind of verb. Example: He is not going.
        // NOTE(review): the original comment lists (VACT || VSTA || VATT ||
        // XVAM || XVAE), but the last value tested is 16 while XVAE is 18 in
        // the rows above - confirm which one is intended.
        { 46, 11,  8 },
        { 46, 12,  8 },
        { 46, 13,  8 },
        { 46, 15,  8 },
        { 46, 16,  8 },
        // EAFF or EITT: endings for affirmative and interrogative are more
        // often the second word of the pair. Example: (in Thai) Krub, Ka.
        { -1, 44,  3 },
        { -1, 45,  3 },
        // VATT VATT: repeated attributive verbs occur often in spoken
        // language. Example: she is really really cute.
        { 13, 13,  2 },
        // NCMN DDAC: common noun and definite determiner classifier.
        // Example: Food here (Thai).
        {  5, 20,  3 },
        // CMTR JCMP: measurement classifier and comparative conjunction.
        // Example: year about (Thai) -> English "about a year".
        { 34, 38,  5 },
        // XVBB VACT.
        { 17, 11,  5 },
        // NCMN NCMN: common noun and common noun.
        // Example: electric bulb (in Thai).
        {  5,  5,  1 }
    };

    unsigned int uiScore = 4;

    if (dwTag1 != TAGPOS_UNKNOWN && dwTag2 != TAGPOS_UNKNOWN)
    {
        const WCHAR wchFirst  = ExtractPOS(dwTag1);
        const WCHAR wchSecond = ExtractPOS(dwTag2);
        const unsigned int cRules = sizeof(rgPairBonus) / sizeof(rgPairBonus[0]);

        for (unsigned int iRule = 0; iRule < cRules; iRule++)
        {
            const PairBonus& rule = rgPairBonus[iRule];
            const bool fFirstMatches = (rule.iFirst < 0) || (rule.iFirst == wchFirst);

            if (fFirstMatches && rule.iSecond == wchSecond)
            {
                uiScore += rule.uiBonus;
                break;
            }
        }
    }

    // The frequency of each word shifts the weight regardless of the pair.
    DetermineFrequencyWeight(ExtractFrq(dwTag1), &uiScore);
    DetermineFrequencyWeight(ExtractFrq(dwTag2), &uiScore);

    return (float) uiScore;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Returns a weight for three consecutive word tags. The weight
//           starts at 6, gains the value thaiTrigramIter.GetProb reports
//           for the three parts of speech, and is then adjusted by the
//           frequency of each tag.
//
// Arguments:
//
//  dwTag1 - tag of the first word.   (in)
//  dwTag2 - tag of the second word.  (in)
//  dwTag3 - tag of the third word.   (in)
//
// Returns:  the resulting weight.
//
// Modifies: whatever thaiTrigramIter.GetProb modifies (not visible here).
//
// History: created 8/99 aarayas
//
// Notes:    The trigram lookup is skipped when any tag is TAGPOS_UNKNOWN
//           or when any extracted part of speech is 0.
//
//----------------------------------------------------------------------------
DWORD CThaiBreakTree::TrigramProbablity(DWORD dwTag1,DWORD dwTag2,DWORD dwTag3)
{
    const DWORD rgTag[3] = { dwTag1, dwTag2, dwTag3 };
    DWORD dwScore = 6;

    const bool fAllTagged = (dwTag1 != TAGPOS_UNKNOWN) &&
                            (dwTag2 != TAGPOS_UNKNOWN) &&
                            (dwTag3 != TAGPOS_UNKNOWN);
    if (fAllTagged)
    {
        // Zero-terminated part-of-speech string handed to the trigram lookup.
        WCHAR rgPos[4];
        rgPos[0] = ExtractPOS(rgTag[0]);
        rgPos[1] = ExtractPOS(rgTag[1]);
        rgPos[2] = ExtractPOS(rgTag[2]);
        rgPos[3] = 0;

        // Optimization: if any part of speech is "none" there is no trigram
        // to find, so the search is not attempted.
        const bool fAnyNone = (rgPos[0] == 0) || (rgPos[1] == 0) || (rgPos[2] == 0);
        if (!fAnyNone)
            dwScore += thaiTrigramIter.GetProb(rgPos);
    }

    // The frequency of every word shifts the weight, first to last.
    for (int iTag = 0; iTag < 3; iTag++)
        DetermineFrequencyWeight(ExtractFrq(rgTag[iTag]), &dwScore);

    return dwScore;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Breaks the text between pwchBegin and pwchEnd1 into words.
//           The length of each word is written to breakArray and its tag
//           to tagArray, one entry per word, in text order.
//
//           Each round of the outer loop walks the dictionary trie
//           (thaiTrieIter) cluster by cluster from the current word start
//           and records every position at which the trie reports a word
//           end. Then one of three things happens:
//             - exactly one candidate: it is taken (with special handling
//               for abbreviations and Thai ending signs);
//             - several candidates: they are scored, longest first, using
//               GetWeight, BigramProbablity and TrigramProbablity;
//             - no candidate: abbreviation rules, SoundexSearch and
//               ShouldMerge decide how the unknown text is stored.
//
// Arguments:
//
//  pwchBegin - first character of the text to break.            (in)
//  pwchEnd1  - position at which the text ends; copied to the
//              member pszEnd and never dereferenced as part of
//              a word by this function's own bookkeeping.        (in)
//
// Returns:  number of entries written to breakArray / tagArray.
//
// Modifies: pszEnd, breakArray, tagArray, POSArray[0], thaiTrieIter, and
//           whatever GetWeight / SoundexSearch / ShouldMerge modify.
//
// History: created 8/99 aarayas
//
// Notes:    NOTE(review): no check of iBreakIndex or iNextBreakIndex
//           against MAXBREAK is visible in this function - confirm that
//           callers bound the input length.
//
//----------------------------------------------------------------------------
unsigned int CThaiBreakTree::TrigramBreak(WCHAR* pwchBegin, WCHAR* pwchEnd1)
{
// Declare and initialize local variables.
// pwchBeginWord - start of the word currently being segmented.
// pwchIndex     - scan position of the trie walk inside that word.
// iNumCluster / iNumLastCluster - length of the cluster returned by the most
//                 recent GetCluster call, and of the one before it.
// iBreakIndex   - number of entries committed to breakArray / tagArray.
// nextBreakArray / nextTagArray - candidate word lengths (measured from
//                 pwchBeginWord) and their tags for the current round.
WCHAR* pwchBeginWord = pwchBegin;
WCHAR* pwchIndex = pwchBegin;
unsigned int iWordLen;
unsigned int iNumCluster = 1;
unsigned int iNumLastCluster;
unsigned int iBreakIndex = 0;
BYTE nextBreakArray[MAXBREAK];
DWORD nextTagArray[MAXBREAK];
unsigned int iNextBreakIndex; // index for array nextBreakArray and nextTagArray.
bool fFoundMatch;
unsigned int iWeight;
unsigned int iSumWeight;
unsigned int iPrevWeight;
unsigned int iCurrWeight;
BYTE iSoundexWordLen;
DWORD iPrevProbability;
DWORD iCurrentProbability;
DWORD dwTagTemp;
// NOTE(review): dwLastTag is assigned below but never read in this function.
DWORD dwLastTag;
int i; // temporary int for use as need.
bool fBeginNewWord;
bool fEndWord = false;
pszEnd = pwchEnd1;
breakArray[0] = 0;
POSArray[0] = 0;
tagArray[0] = 0;
nextBreakArray[0] = 0;
nextTagArray[0] = 0;
// Outer loop: one round per word start. Every path through a round either
// returns or advances pwchBeginWord.
while (true)
{
// Reset Iterator for generating break for new word.
fFoundMatch = false;
fBeginNewWord = true;
// Get begin word string for next round of word break.
pwchIndex = pwchBeginWord;
iNextBreakIndex = 0;
if (pwchIndex == pszEnd)
break;
// Inner loop: walk the trie one cluster at a time, recording every word end.
while(true)
{
iNumLastCluster = iNumCluster;
iNumCluster = GetCluster(pwchIndex);
if (!thaiTrieIter.MoveCluster(pwchIndex, iNumCluster, fBeginNewWord))
{
// The trie could not be advanced by this cluster. An empty cluster at
// pszEnd means the input ran out and is flagged through fEndWord; any
// other failure ends the walk with pwchIndex still in front of the
// cluster that failed.
if ((iNumCluster == 0) && (pwchIndex == pszEnd))
fEndWord = true;
else
break;
}
fBeginNewWord = false;
pwchIndex += iNumCluster;
if (thaiTrieIter.fWordEnd)
{
if (thaiTrieIter.m_fThaiNumber)
{
// If we have Thai number accumulate it as one break.
// Slot 0 is overwritten each time, so the candidate list holds a single
// entry that grows with the number.
assert(iNumCluster == 1);
fFoundMatch = true;
nextBreakArray[0]= (BYTE)(pwchIndex - pwchBeginWord);
nextTagArray[0] = TAGPOS_NCNM;
iNextBreakIndex = 1;
}
else
{
// Record this word end as one more candidate; later candidates are longer.
fFoundMatch = true;
nextBreakArray[iNextBreakIndex] = (BYTE)(pwchIndex - pwchBeginWord);
nextTagArray[iNextBreakIndex] = thaiTrieIter.dwTag;
iNextBreakIndex++;
}
if (pwchIndex >= pszEnd)
{
// The candidate just recorded ends at the end of the text: commit it
// (it is the last, hence longest, candidate) and finish.
assert(pwchIndex <= pszEnd); // assert should never come up - if it appear likely bug in GetCluster funciton.
assert(iNextBreakIndex != 0);
if ( iNumCluster == 1 &&
*(pwchIndex - 1) == L'.' &&
iBreakIndex > 0 &&
iNextBreakIndex == 1 &&
tagArray[iBreakIndex - 1] == TAGPOS_ABBR )
{
// backtrack one if we have abbrivation case.
// ex. B.K.K. (in Thai). (more info O11.145042.)
breakArray[iBreakIndex - 1] += nextBreakArray[iNextBreakIndex - 1];
return iBreakIndex;
}
breakArray[iBreakIndex] = nextBreakArray[iNextBreakIndex - 1];
tagArray[iBreakIndex] = nextTagArray[iNextBreakIndex - 1];
return (++iBreakIndex);
}
}
else if ((pwchIndex >= pszEnd && iNextBreakIndex == 0) || fEndWord)
{
// The text ended inside the trie walk: either no candidate was recorded
// for this word, or the input ran out (fEndWord). The remaining
// characters are stored as unknown text and the function returns.
assert(pwchIndex <= pszEnd); // assert should never come up - if it appear likely bug in GetCluster funciton.
iWordLen = (unsigned int) (pwchIndex - pwchBeginWord);
switch (iWordLen)
{
case 0:
if (iBreakIndex > 0)
{
// if We have a length of one character add it to previous node.
breakArray[iBreakIndex - 1] += (BYTE) iNumCluster;
tagArray[iBreakIndex - 1] = TAGPOS_UNKNOWN;
}
else
{
// if this is the first break create a new break.
breakArray[iBreakIndex] = (BYTE) iNumCluster;
tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
iBreakIndex++;
}
break;
case 1:
if (iBreakIndex > 0)
{
// if We have a length of one character add it to previous node.
breakArray[iBreakIndex - 1] += (BYTE) iWordLen;
tagArray[iBreakIndex - 1] = TAGPOS_UNKNOWN;
}
else
{
// if this is the first break create a new break.
breakArray[iBreakIndex] = (BYTE) iWordLen;
tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
iBreakIndex++;
}
break;
default:
// Longer leftovers are merged into the previous word only when
// ShouldMerge agrees; otherwise they become a new unknown entry.
if ( iBreakIndex > 0 &&
ShouldMerge(pwchBeginWord - breakArray[iBreakIndex - 1], breakArray[iBreakIndex - 1],
iWordLen , tagArray[iBreakIndex - 1]) )
{
breakArray[iBreakIndex - 1] += (BYTE) iWordLen;
tagArray[iBreakIndex - 1] = TAGPOS_UNKNOWN;
}
else
{
breakArray[iBreakIndex] = (BYTE) iWordLen;
tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
iBreakIndex++;
}
}
return iBreakIndex;
}
else if (pwchIndex >= pszEnd)
{
// O10.229346. If we get here we are at the end of word or end of sentence,
// We will need to decide what to depending on if we found the word or not.
break;
}
}
if (fFoundMatch) // Longest Matching.
{
// If we only found one break, than say it the maximum.
if (1 == iNextBreakIndex)
{
if ( nextBreakArray[0] == 2 &&
iNumCluster + iNumLastCluster == 2 &&
iBreakIndex > 0 &&
*(pwchBeginWord+1) == L'.' &&
tagArray[iBreakIndex - 1] == TAGPOS_ABBR )
{
// backtrack one if we have abbrivation case.
// ex. B.K.K. (in Thai). (more info O11.145042.)
breakArray[iBreakIndex - 1] += nextBreakArray[0];
pwchBeginWord += nextBreakArray[0];
}
else if ( iBreakIndex > 0 &&
IsThaiEndingSign(*pwchBeginWord) &&
iNumCluster == 1 )
{
// A Thai ending sign is attached to the previous word instead of
// becoming an entry of its own.
breakArray[iBreakIndex - 1] += nextBreakArray[0];
pwchBeginWord += nextBreakArray[0];
}
else
{
breakArray[iBreakIndex] = nextBreakArray[0];
tagArray[iBreakIndex] = nextTagArray[0];
pwchBeginWord += breakArray[iBreakIndex]; // update begin word for next round.
iBreakIndex++;
}
}
else
{
// Several candidates: score each one, from the longest (last recorded)
// down to the shortest, and keep the best in breakArray[iBreakIndex].
// iSumWeight / iPrevWeight / iPrevProbability describe the candidate
// currently kept. Because iSumWeight starts at 0 and every candidate has
// a non-zero length, the first candidate examined is always kept.
bool fWeightCompare = false;
iSumWeight = 0;
iPrevWeight = 0;
iCurrWeight = 0;
iPrevProbability = 0;
iCurrentProbability = 0;
dwLastTag = TAGPOS_UNKNOWN;
tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
for (i = (iNextBreakIndex - 1); i >= 0 ; i--)
{
// iWeight is GetWeight for the text right after the candidate, and
// dwTagTemp receives the tag GetWeight reports for it (presumably the
// length and tag of the word that can follow - confirm in GetWeight).
if ( iBreakIndex == 0)
{
iWeight = GetWeight(pwchBeginWord + nextBreakArray[i], &dwTagTemp);
if (iWeight != 0)
// Bigram Probability
iCurrentProbability = (DWORD)BigramProbablity(nextTagArray[i], dwTagTemp);
}
else
{
iWeight = GetWeight(pwchBeginWord + nextBreakArray[i], &dwTagTemp);
if (iBreakIndex == 1)
// Get Trigram Probability.
iCurrentProbability = TrigramProbablity(tagArray[iBreakIndex - 1], nextTagArray[i], dwTagTemp);
else if (iBreakIndex >= 2)
{
// Get Trigram Probability.
iCurrentProbability = TrigramProbablity(tagArray[iBreakIndex - 2], tagArray[iBreakIndex - 1], nextTagArray[i]);
if (iWeight != 0)
iCurrentProbability += (DWORD)BigramProbablity(nextTagArray[i],dwTagTemp);
}
}
// fWeightCompare lets a candidate replace the kept one even though its
// combined weight is not strictly larger; three cases are accepted below.
fWeightCompare = false;
iCurrWeight = iWeight + nextBreakArray[i];
if (iPrevProbability == 0 && (iCurrWeight+1) == iSumWeight && iCurrentProbability > 5)
{
fWeightCompare = true;
}
else if (iCurrWeight == iSumWeight && ( Maximum(iWeight,nextBreakArray[i]) <= iPrevWeight ||
iCurrentProbability > iPrevProbability))
{
fWeightCompare = true;
}
else if ( iWeight >= iPrevWeight - 1 &&
iPrevProbability > 0 && iPrevProbability < 10 &&
iCurrentProbability > iPrevProbability * 5000 )
{
// O11.187913. We'll trust our trigram data more if the current probability is
// so much greater than previous probability.
//
// * Note: we could probably use one of GA algorithm to get better value than 5K.
fWeightCompare = true;
}
// Store the string the best maximum weight, if the pair is equal
// store the string with maxim
if ( iCurrWeight > iSumWeight ||
fWeightCompare)
// ( (iCurrWeight == iSumWeight) &&
// ( (Maximum(iWeight,nextBreakArray[i]) <= iPrevWeight) || (iCurrentProbability > iPrevProbability) ) ))
{
if (iCurrentProbability >= iPrevProbability || iSumWeight < iCurrWeight)
{
iSumWeight = Maximum(iWeight,1) + nextBreakArray[i];
iPrevWeight = Maximum(iWeight,nextBreakArray[i]);
breakArray[iBreakIndex] = nextBreakArray[i];
tagArray[iBreakIndex] = nextTagArray[i];
iPrevProbability = iCurrentProbability;
dwLastTag = dwTagTemp;
}
}
}
pwchBeginWord += breakArray[iBreakIndex]; // update begin word for next round.
iBreakIndex++;
}
}
else
{
// NOMATCH_FOUND
// iWordLen is the number of characters the trie accepted before the walk
// failed; iNumCluster is the length of the cluster that failed, which
// starts at pwchIndex.
iWordLen = (unsigned int)(pwchIndex - pwchBeginWord);
if (iBreakIndex > 0)
{
i = iBreakIndex - 1; // set i to previous break
if (iWordLen == 0)
{
// The very first cluster of this word was rejected by the trie.
if (iNumCluster == 1 && *pwchBeginWord == L',' &&
IsThaiChar(*(pwchBeginWord-breakArray[i])) )
{
// We should not merge comma into the word, only merge comma to
// Number.
// TODO: Should add TAGPOS_PUNCT.
breakArray[iBreakIndex] = (BYTE) iNumCluster;
tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
pwchBeginWord += (BYTE) iNumCluster; // update begin word for next round.
iBreakIndex++;
}
else if (iNumCluster > 1 && *pwchBeginWord == L'.')
{
// O11.134455. This is an ellipse case we shouldn't merge this string.
breakArray[iBreakIndex] = (BYTE) iNumCluster;
tagArray[iBreakIndex] = TAGPOS_PUNC;
pwchBeginWord += (BYTE) iNumCluster; // update begin word for next round.
iBreakIndex++;
}
else if (ShouldMerge(pwchBeginWord - breakArray[i], breakArray[i], iNumCluster, tagArray[i]))
{
// If word length is null use the cluster add to previous node.
breakArray[i] += (BYTE) iNumCluster;
tagArray[i] = TAGPOS_UNKNOWN;
pwchBeginWord += iNumCluster; // update begin word for next round.
}
else
{
// Add the unknown word to list.
breakArray[iBreakIndex] = (BYTE) iNumCluster;
tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
pwchBeginWord += (BYTE) iNumCluster; // update begin word for next round.
iBreakIndex++;
}
}
else
{
// Some characters were accepted before the walk failed. The rules below
// are tried in order; the first one that applies wins.
// Try checking for abbrivations.
if (iWordLen == 1 && iNumCluster == 2 && pwchIndex[1] == L'.')
{
// The word is an abbrivated words.
// TODO: #1. Add TAGPOS_ABBRV.
// TODO: #2. May need to add rules code abbrivated word with 3 letters.
breakArray[iBreakIndex] = iWordLen + iNumCluster;
tagArray[iBreakIndex] = TAGPOS_ABBR;
pwchBeginWord += breakArray[iBreakIndex];
iBreakIndex++;
}
// NOTE(review): *(pwchBeginWord+1) is read before the
// pwchBeginWord+1 < pszEnd test at the end of this condition.
else if (iWordLen == 1 &&
tagArray[i] == TAGPOS_ABBR &&
*(pwchBeginWord+1) == L'.' &&
IsThaiConsonant(*pwchBeginWord) &&
pwchBeginWord+1 < pszEnd )
{
// O11.145042. This is the case where we are a <abbrivated><consonant><period>, the
// likely hood is the character is also an abbrivation.
breakArray[iBreakIndex - 1] += iWordLen + 1;
pwchBeginWord += iWordLen + 1;
}
// Abbreviation are usally 3 characters.
// NOTE(review): no bounds test against pszEnd is visible for the reads at
// pwchBeginWord+2 and pwchBeginWord+3 - confirm the buffer is terminated.
else if ( iWordLen == 2 &&
IsThaiConsonant(*(pwchBeginWord+2)) &&
*(pwchBeginWord+3) == L'.' &&
tagArray[i] != TAGPOS_UNKNOWN )
{
// O11.80619. This is the case where we are a <known word><abbrivated>
breakArray[iBreakIndex] = iWordLen + 1;
tagArray[iBreakIndex] = TAGPOS_ABBR;
pwchBeginWord += breakArray[iBreakIndex];
iBreakIndex++;
}
// Perhase Misspelled word try use sounding to spell the words.
// Try soundex two word back.
// The soundex result is accepted only when it is longer than the two
// previous words together and GetWeight is non-zero right after it.
else if ( (iBreakIndex >= 2) &&
( (iSoundexWordLen = (BYTE) SoundexSearch(pwchBeginWord - breakArray[i] - breakArray[i - 1])) > (BYTE) (breakArray[i] + breakArray[i - 1]) ) &&
GetWeight(pwchBeginWord - breakArray[i] - breakArray[i - 1] + iSoundexWordLen) )
{
// Resize the word.
// The two previous entries collapse into one entry of the soundex length.
pwchBeginWord = (pwchBeginWord - breakArray[i] - breakArray[i - 1]) + iSoundexWordLen; // update begin word for next round.
breakArray[i - 1] = iSoundexWordLen;
tagArray[i - 1] = thaiTrieIter.dwTag;
iBreakIndex--; // Decrement iBreakIndex.
}
// Try soundex one words back.
else if (((iSoundexWordLen = (BYTE) SoundexSearch(pwchBeginWord - breakArray[i])) > (BYTE) breakArray[i]) &&
GetWeight(pwchBeginWord - breakArray[i] + iSoundexWordLen) &&
ExtractPOS(tagArray[i]) != 6) // Make sure that previous word is not a NTTL.
{
// Resize the word
pwchBeginWord = (pwchBeginWord - breakArray[i]) + iSoundexWordLen; // update begin word for next round.
breakArray[i] = iSoundexWordLen;
tagArray[i] = thaiTrieIter.dwTag;
}
// Try soundex on this word.
else if (((iSoundexWordLen = (BYTE) SoundexSearch(pwchBeginWord)) > (BYTE) iWordLen) &&
GetWeight(pwchBeginWord + iSoundexWordLen) )
{
// Resize the word.
breakArray[iBreakIndex] = iSoundexWordLen;
tagArray[iBreakIndex] = thaiTrieIter.dwTag;
pwchBeginWord += iSoundexWordLen; // update begin word for next round.
iBreakIndex++;
}
else if ( ShouldMerge(pwchBeginWord - breakArray[i], breakArray[i], iWordLen , tagArray[i]) )
{
// Merge the words.
breakArray[i] += (BYTE) iWordLen;
tagArray[i] = TAGPOS_UNKNOWN;
pwchBeginWord += iWordLen; // update begin word for next round.
}
else
{
// Add the unknown word to list.
breakArray[iBreakIndex] = (BYTE) iWordLen;
tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
pwchBeginWord += iWordLen; // update begin word for next round.
iBreakIndex++;
}
}
}
else
{
// Add unknown word to list and mark it.
// This is the first entry (iBreakIndex == 0), so there is no previous
// word to merge into; iBreakIndex is incremented after this if/else.
if (iWordLen == 0)
{
// Word length is zero: the failing cluster itself becomes the first entry.
breakArray[iBreakIndex] = (BYTE) iNumCluster;
tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
pwchBeginWord += iNumCluster; // update begin word for next round.
}
else
{
// We we are here there are 2 case that can happen:
// 1. We take too little into our unknown.
// 2. We take too much into our unknown word.
// Have we taken too little check if this unknown word is an abbrivated words.
if (iWordLen == 1 && iNumCluster == 2 && pwchIndex[1] == L'.')
breakArray[iBreakIndex] = iWordLen + iNumCluster;
// Try to see if we are taking to much, see if we can get a Weight from last cluster.
// NOTE(review): both operands are unsigned, so the subtraction wraps and
// the "> 0" test is true whenever the two values differ - confirm that
// iWordLen >= iNumLastCluster always holds here.
else if ( (iWordLen - iNumLastCluster > 0) && GetWeight(pwchIndex - iNumLastCluster) )
{
breakArray[iBreakIndex] = iWordLen - iNumLastCluster;
if (breakArray[iBreakIndex] == 1)
{
iWeight = GetWeight(pwchIndex - iNumLastCluster);
if (iWeight > iNumLastCluster && iWeight < 40)
breakArray[iBreakIndex] += (BYTE) iWeight;
else
breakArray[iBreakIndex] += (BYTE) iNumLastCluster;
}
}
// We may have a case of iWordLen is 1 and iNumCluster, we have a case of misspelled
// an extra character is incorrectly inserted over a correct word.
else if (iWordLen == 1)
{
iWeight = GetWeight(pwchIndex - iWordLen);
if (iWeight > iNumCluster && iWeight < 40)
breakArray[iBreakIndex] = iWordLen + iWeight;
else
breakArray[iBreakIndex] = iWordLen + iNumCluster;
}
else
breakArray[iBreakIndex] = (BYTE) iWordLen;
// Tag the entry as an abbreviation when the last two clusters make up the
// whole accepted text and a period follows the first of them.
if (iNumLastCluster + iNumCluster == iWordLen && *(pwchBeginWord+iNumLastCluster) == L'.')
{
tagArray[iBreakIndex] = TAGPOS_ABBR;
}
else
tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
pwchBeginWord += breakArray[iBreakIndex]; // update begin word for next round.
}
iBreakIndex++;
}
}
}
return iBreakIndex;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Thin wrapper that forwards the request to thaiTrieIter.Soundex.
//
// Arguments:
//
//  word - text handed unchanged to thaiTrieIter.Soundex.  (in)
//
// Returns:  the value thaiTrieIter.Soundex returns for word, as an int.
//
// Modifies: whatever thaiTrieIter.Soundex modifies (not visible here).
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
int CThaiBreakTree::Soundex(WCHAR* word)
{
    // Pure delegation; no additional processing is done at this level.
    const int iSoundexResult = thaiTrieIter.Soundex(word);
    return iSoundexResult;
}
//+---------------------------------------------------------------------------
//
// Function: GetCluster
//
// Synopsis: The function returns the number of characters, starting at
//           pszIndex, that make up the next cluster of Thai text.
//
//           ie. Kor Kai, Kor Kai -> 1
//               Kor Kai, Sara Um -> 2
//
//           A cluster is scanned as: any number of begin-cluster
//           characters, one consonant, any number of upper/lower cluster
//           characters, then any number of ending-cluster characters.
//           When the scan leaves an ending cluster outstanding (see the
//           comments in the body) one more such group is scanned and
//           counted as part of the same cluster.
//
//           When nothing matches, a run of periods (an ellipsis) is
//           returned as one cluster; any other character counts as a
//           cluster of one.
//
// Arguments:
//
//  pszIndex - position in the text where the cluster starts.  (in)
//
// Returns:  the length of the cluster in characters; 0 when pszIndex is
//           at or beyond pszEnd. The cluster never extends past pszEnd.
//
// Modifies: nothing.
//
// History: created 7/99 aarayas
//
// Notes:    Every character is tested against pszEnd before it is read.
//           Earlier the scan dereferenced first and trimmed the count
//           afterwards, which read characters at and beyond pszEnd. The
//           returned length is the same as before for any start position
//           in front of pszEnd.
//
//----------------------------------------------------------------------------
unsigned int CThaiBreakTree::GetCluster(const WCHAR* pszIndex)
{
    // Nothing left to scan.
    if (pszIndex >= pszEnd)
        return 0;

    unsigned int iRetValue = 0;
    bool fNeedEndingCluster = false;

    while (true)
    {
        bool fHasSaraE = false;

        // Take all begin cluster characters.
        while (pszIndex < pszEnd && IsThaiBeginClusterCharacter(*pszIndex))
        {
            if (*pszIndex == THAI_Vowel_Sara_E)
                fHasSaraE = true;
            pszIndex++;
            iRetValue++;
        }

        if (pszIndex < pszEnd && IsThaiConsonant(*pszIndex))
        {
            pszIndex++;
            iRetValue++;

            while (pszIndex < pszEnd && IsThaiUpperAndLowerClusterCharacter(*pszIndex))
            {
                // Mai Han Akat is a special type of cluster that needs at
                // least one ending cluster.
                if (*pszIndex == THAI_Vowel_Sign_Mai_HanAkat)
                {
                    fNeedEndingCluster = true;
                }
                // In Thai it isn't possible to make a sound if Sara E is
                // followed by one of these vowels / tone marks, so an
                // ending cluster is required as well.
                else if ( fHasSaraE &&
                          ( (*pszIndex == THAI_Vowel_Sara_II)  ||
                            (*pszIndex == THAI_Tone_MaiTaiKhu) ||
                            (*pszIndex == THAI_Vowel_Sara_I)   ||
                            (*pszIndex == THAI_Sara_Uee) ) )
                {
                    fNeedEndingCluster = true;
                }
                pszIndex++;
                iRetValue++;
            }

            while (pszIndex < pszEnd && IsThaiEndingClusterCharacter(*pszIndex))
            {
                pszIndex++;
                iRetValue++;
                fNeedEndingCluster = false;
            }

            // A period used to be absorbed into the cluster at this point
            // (Bug#57106); that code was disabled in the original source.
        }

        // An outstanding ending cluster pulls exactly one more group into
        // this cluster; otherwise the cluster is complete.
        if (fNeedEndingCluster)
            fNeedEndingCluster = false;
        else
            break;
    }

    if (iRetValue == 0)
    {
        // pszIndex has not moved, so it is still in front of pszEnd here.
        if (*pszIndex == 0x002e)
        {
            // O11.134455. Ellipsis case: combine the periods into one cluster.
            while (pszIndex < pszEnd && *pszIndex == 0x002e)
            {
                pszIndex++;
                iRetValue++;
            }
        }
        else
        {
            iRetValue++; // The character is probably a punctuation.
        }
    }

    return iRetValue;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Looks for an alternative way of breaking one word into parts.
//
//           Alt == 2 : searches for a split into exactly two parts: a
//                      prefix that ends on a word end of the trie, followed
//                      by a remainder whose GetWeight value reaches exactly
//                      the end of the word.
//           Alt == 3 : same, with exactly three parts.
//           other    : records up to Alt consecutive parts. Each part is
//                      the first word end found in the trie, or the result
//                      of GetLongestSubstring when that is longer but still
//                      shorter than what is left of the word.
//
// Arguments:
//
//  pwchBegin - input string.                                        (in)
//  iWordLen  - input string length.                                 (in)
//  Alt       - how many parts to look for.                          (in)
//  pBreakPos - receives the length of each part. The original header
//              states the array is always 5 bytes; no size check is
//              done here, so the caller must provide at least Alt
//              entries.                                             (out)
//
// Returns:  number of entries written to pBreakPos; 0 when no alternative
//           was found.
//
// Modifies: pszEnd (set to pwchBegin + iWordLen), thaiTrieIter1, and
//           whatever GetWeight / GetLongestSubstring modify.
//
// History: created 3/00 aarayas
//
// Notes:    After a part has been recorded the scan position is moved to
//           the start of the next part. Earlier it was left at the end of
//           the first word end found, so when GetLongestSubstring supplied
//           a longer part the next trie walk restarted inside the part
//           just recorded and the following length computation
//           underflowed. Results for Alt == 1 are not affected.
//
//           NOTE(review): the trie walk uses thaiTrieIter1 rather than
//           thaiTrieIter, presumably because GetWeight repositions
//           thaiTrieIter - confirm.
//
//----------------------------------------------------------------------------
int CThaiBreakTree::FindAltWord(WCHAR* pwchBegin,unsigned int iWordLen, BYTE Alt, BYTE* pBreakPos)
{
    // Declare and initialize local variables.
    unsigned int iNumCluster = 1;
    WCHAR* pwchBeginWord = pwchBegin;   // start of the part being searched
    WCHAR* pwchIndex = pwchBegin;       // scan position of the trie walk
    bool fBeginNewWord = true;
    unsigned int iBreakIndex = 0;       // entries written to pBreakPos
    unsigned int iBreakTemp = 0;        // length of the first part
    unsigned int iBreakTemp1 = 0;       // length of the second part
    unsigned int iBreakTemp2 = 0;       // length of the third part

    pszEnd = pwchBegin + iWordLen;

    switch(Alt)
    {
    case 2:
    case 3:
        // Try every prefix that is a word end in the trie until the rest
        // of the word can be covered by exactly one (Alt == 2) or two
        // (Alt == 3) further GetWeight lengths.
        while (true)
        {
            iNumCluster = GetCluster(pwchIndex);
            if (!thaiTrieIter1.MoveCluster(pwchIndex, iNumCluster, fBeginNewWord))
                return 0;
            fBeginNewWord = false;
            pwchIndex += iNumCluster;

            if (thaiTrieIter1.fWordEnd)
            {
                iBreakTemp = (unsigned int)(pwchIndex - pwchBeginWord);
                // reached the end of word unable to find alt word.
                if (iBreakTemp >= iWordLen)
                    return 0;

                iBreakTemp1 = GetWeight(pwchIndex);

                if (Alt == 2)
                {
                    if (iBreakTemp + iBreakTemp1 == iWordLen)
                    {
                        pBreakPos[0] = (BYTE)iBreakTemp;
                        pBreakPos[1] = (BYTE)iBreakTemp1;
                        return 2;
                    }
                }
                else
                {
                    // reached the end of word unable to find alt word.
                    if (iBreakTemp + iBreakTemp1 >= iWordLen)
                        return 0;

                    iBreakTemp2 = GetWeight(pwchIndex+iBreakTemp1);
                    if (iBreakTemp + iBreakTemp1 + iBreakTemp2 == iWordLen)
                    {
                        pBreakPos[0] = (BYTE)iBreakTemp;
                        pBreakPos[1] = (BYTE)iBreakTemp1;
                        pBreakPos[2] = (BYTE)iBreakTemp2;
                        return 3;
                    }
                }
            }

            if (pwchIndex >= pszEnd)
                return 0;
        }

    default:
    case 1:
        while (iBreakIndex < Alt)
        {
            iNumCluster = GetCluster(pwchIndex);
            if (!thaiTrieIter1.MoveCluster(pwchIndex, iNumCluster, fBeginNewWord))
                return iBreakIndex;
            fBeginNewWord = false;
            pwchIndex += iNumCluster;

            if (thaiTrieIter1.fWordEnd)
            {
                fBeginNewWord = true;
                iBreakTemp = (unsigned int)(pwchIndex - pwchBeginWord);
                // reached the end of word unable to find alt word.
                if (iBreakTemp >= iWordLen)
                    return 0;

                // Prefer the longest substring when it is longer than the
                // first word end but does not swallow the rest of the word.
                iBreakTemp1 = GetLongestSubstring(pwchBeginWord,iWordLen);
                if (iBreakTemp1 > iBreakTemp && iBreakTemp1 < iWordLen)
                    pBreakPos[iBreakIndex] = (BYTE) iBreakTemp1;
                else
                    pBreakPos[iBreakIndex] = (BYTE) iBreakTemp;

                pwchBeginWord += pBreakPos[iBreakIndex];
                iWordLen -= pBreakPos[iBreakIndex];
                iBreakIndex++;

                // Continue scanning from the start of the next part; the
                // recorded part may be longer than what the trie walk
                // consumed.
                pwchIndex = pwchBeginWord;
            }

            if (pwchIndex >= pszEnd)
                return iBreakIndex;
        }
        break;
    }

    return iBreakIndex;
}