|
|
//+---------------------------------------------------------------------------
//
//
// CThaiBreakTree - class CThaiBreakTree
//
// History:
// created 7/99 aarayas
//
// (c) 1999 Microsoft Corporation
//----------------------------------------------------------------------------
#include "CThaiBreakTree.hpp"
//+---------------------------------------------------------------------------
//
// Function: ExtractPOS
//
// Synopsis: The function takes a tag and returns its Part Of Speech tag.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Returns the part-of-speech field of a word tag: the bits selected by
// iPosMask, moved down by iPosShift.
inline WCHAR ExtractPOS(DWORD dwTag)
{
    const DWORD dwPosBits = dwTag & iPosMask;
    return static_cast<WCHAR>(dwPosBits >> iPosShift);
}
//+---------------------------------------------------------------------------
//
// Function: ExtractFrq
//
// Synopsis: The function takes a tag and returns the frequency of the word.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Returns the frequency field of a word tag: bits 0x300 of dwTag, moved down
// by iFrqShift.
inline BYTE ExtractFrq(DWORD dwTag)
{
    const DWORD dwFrqBits = dwTag & 0x300;
    return static_cast<BYTE>(dwFrqBits >> iFrqShift);
}
//+---------------------------------------------------------------------------
//
// Function: DetermineFrequencyWeight
//
// Synopsis: The function adjusts a weight according to the frequency of a word.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Adjusts *uiWeight in place according to the frequency class frq:
//   frqpenInfrequent -> -2, frqpenSomewhat -> -1, frqpenVery -> +2,
//   frqpenNormal and any other value -> +1.
inline void DetermineFrequencyWeight(BYTE frq, unsigned int* uiWeight)
{
    if (frq == frqpenInfrequent)
    {
        *uiWeight -= 2;
    }
    else if (frq == frqpenSomewhat)
    {
        --(*uiWeight);
    }
    else if (frq == frqpenVery)
    {
        *uiWeight += 2;
    }
    else
    {
        // frqpenNormal and every unrecognized value.
        ++(*uiWeight);
    }
}
//+---------------------------------------------------------------------------
//
// Function: DetermineFrequencyWeight
//
// Synopsis: The function adjusts a weight according to the frequency of a word.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// DWORD overload. Adjusts *uiWeight in place according to the frequency class frq:
//   frqpenInfrequent -> -2, frqpenSomewhat -> -1, frqpenVery -> +2,
//   frqpenNormal and any other value -> +1.
inline void DetermineFrequencyWeight(BYTE frq, DWORD* uiWeight)
{
    if (frq == frqpenInfrequent)
    {
        *uiWeight -= 2;
    }
    else if (frq == frqpenSomewhat)
    {
        --(*uiWeight);
    }
    else if (frq == frqpenVery)
    {
        *uiWeight += 2;
    }
    else
    {
        // frqpenNormal and every unrecognized value.
        ++(*uiWeight);
    }
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Constructor - initialize local variables
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Constructor: zeroes the indices, nulls every pointer member, then allocates
// the working arrays. The maximalMatching* arrays stay NULL here.
CThaiBreakTree::CThaiBreakTree()
    : iNodeIndex(0),
      iNumNode(0),
      pszBegin(NULL),
      pszEnd(NULL),
      breakTree(NULL),
      breakArray(NULL),
      tagArray(NULL),
      maximalMatchingBreakArray(NULL),
      maximalMatchingTAGArray(NULL),
      POSArray(NULL),
      maximalMatchingPOSArray(NULL)
{
    // The node pool is only needed when the n-gram code is compiled in.
#if defined (NGRAM_ENABLE)
    breakTree = new ThaiBreakNode[MAXTHAIBREAKNODE];
#endif

    // Per-candidate scratch arrays, MAXBREAK entries each.
    breakArray = new BYTE[MAXBREAK];
    tagArray   = new DWORD[MAXBREAK];
    POSArray   = new WCHAR[MAXBREAK];
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Destructor - clean up code
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Destructor: releases every array owned by the object.
//
// All of these buffers are allocated with new[] (constructor and
// MaximalMatching), so they must be released with delete[]; using scalar
// delete on them is undefined behavior. delete[] on a null pointer is a
// no-op, so no NULL checks are needed.
CThaiBreakTree::~CThaiBreakTree()
{
#if defined (NGRAM_ENABLE)
    delete[] breakTree;
    delete[] maximalMatchingBreakArray;
    delete[] maximalMatchingTAGArray;
    delete[] maximalMatchingPOSArray;
#endif

    delete[] breakArray;
    delete[] tagArray;
    delete[] POSArray;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Associate the class to the string.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
#if defined (NGRAM_ENABLE)
void CThaiBreakTree::Init(CTrie* pTrie, CTrie* pSentTrie, CTrie* pTrigramTrie) #else
void CThaiBreakTree::Init(CTrie* pTrie, CTrie* pTrigramTrie) #endif
{ assert(pTrie != NULL); thaiTrieIter.Init(pTrie); thaiTrieIter1.Init(pTrie);
#if defined (NGRAM_ENABLE)
assert(pSentTrie != NULL); thaiSentIter.Init(pSentTrie); #endif
assert(pTrigramTrie != NULL); thaiTrigramIter.Init(pTrigramTrie); }
#if defined (NGRAM_ENABLE)
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: reset iterator to top of the tree
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Moves the node cursor back to node 0.
inline void CThaiBreakTree::Reset()
{
    iNodeIndex = 0;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Move to the next break.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Follows the NextBreak link of the current node.
// Returns false when that link is 0 (the cursor is then at node 0).
inline bool CThaiBreakTree::MoveNext()
{
    iNodeIndex = breakTree[iNodeIndex].NextBreak;
    if (iNodeIndex == 0)
    {
        return false;
    }
    return true;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Move down to next level.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Follows the Down link of the current node.
// Returns false when that link is 0 (the cursor is then at node 0).
inline bool CThaiBreakTree::MoveDown()
{
    iNodeIndex = breakTree[iNodeIndex].Down;
    if (iNodeIndex == 0)
    {
        return false;
    }
    return true;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: create new node to position, and return index to the node.
//
// * returns UNABLETOCREATENODE if the node cannot be created.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
inline unsigned int CThaiBreakTree::CreateNode(int iPos, BYTE iBreakLen, DWORD dwTAG) { assert(iNumNode < MAXTHAIBREAKNODE);
if (iNumNode >= MAXTHAIBREAKNODE) { return UNABLETOCREATENODE; } breakTree[iNumNode].iPos = iPos; breakTree[iNumNode].iBreakLen = iBreakLen; breakTree[iNumNode].dwTAG = dwTAG; breakTree[iNumNode].NextBreak = 0; breakTree[iNumNode].Down = 0;
iNumNode++; return (iNumNode - 1); }
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Generate a Tree of possible break from the given string.
//
// * Note - returns false if there isn't enough memory to create a node.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Outcome of one trie walk inside GenerateTree.
enum thai_parse_state
{
    END_SENTENCE    = 0,    // Reached the end of sentence.
    LONGEST_MATCH   = 1,    // Longest possible matched.
    NOMATCH_FOUND   = 2,    // Unable to find word.
    ERROR_OUTMEMORY = 3     // Out of Memory.
};
// GenerateTree: fills breakTree with candidate breaks for the text that starts
// at pszBegin and ends at pszEnd1 (pszEnd1 is stored in the member pszEnd).
//
//  - iNodeIndex and iNumNode are reset to 0 before any node is created.
//  - Each pass of the outer loop walks thaiTrieIter cluster by cluster from
//    pszBeginWord; every position where thaiTrieIter.fWordEnd is set becomes
//    a node (CreateNode).
//  - The first node created for the node being analyzed (iNodeAnalyze) is
//    linked through that node's NextBreak; further matches found in the same
//    walk are chained through Down.
//  - When the walk ends without a match, the text is either added to the
//    length of the node being analyzed (re-tagged via DeterminePurgeOrUnknown)
//    or given its own TAGPOS_UNKNOWN node.
//
// Returns false only when CreateNode returned UNABLETOCREATENODE
// (parseState == ERROR_OUTMEMORY); true otherwise.
bool CThaiBreakTree::GenerateTree(WCHAR* pszBegin, WCHAR* pszEnd1) { // Declare and initialize local variables.
unsigned int iIndexBreakTree = 0; unsigned int iPrevIndexBreakTree = 0; unsigned int iParentNode = 0; WCHAR* pszBeginWord = pszBegin; WCHAR* pszIndex = pszBegin; unsigned int iNumCluster = 1; unsigned int iNumLastCluster; unsigned int iWordLen = 0; unsigned int iNodeAnalyze = 0; thai_parse_state parseState = END_SENTENCE; bool fFoundMatch = false; bool fAddToNodeAnalyze = false; bool fDoneGenerateTree = false; pszEnd = pszEnd1;
#if defined (_DEBUG)
memset(breakTree,0,sizeof(ThaiBreakNode)*MAXTHAIBREAKNODE); #endif
iNodeIndex = 0; iNumNode = 0;
// Outer loop: one pass per start position. After the first node exists, the
// start position is the end of node iNodeAnalyze; nodes that already end at
// pszEnd or are tagged TAGPOS_PURGE are skipped, and the loop finishes once
// iNodeAnalyze has passed the last created node.
while (true) { // Reset Iterator for generating break for new word.
fFoundMatch = false; thaiTrieIter.Reset(); if (iIndexBreakTree != 0) { while (true) { // If this is not the first node than set pszBeginWord after the last break.
pszBeginWord = pszBegin + breakTree[iNodeAnalyze].iPos + breakTree[iNodeAnalyze].iBreakLen; fAddToNodeAnalyze = true;
// Are we at the end of the sentence.
if ( (pszBeginWord == pszEnd) || (breakTree[iNodeAnalyze].dwTAG == TAGPOS_PURGE) ) { iNodeAnalyze++; // Move to next node.
if (iNodeAnalyze >= iNumNode) { fDoneGenerateTree = true; break; } } else break; } } pszIndex = pszBeginWord; iParentNode = iNodeAnalyze;
if (fDoneGenerateTree) break;
// Get next level of tree.
// Inner loop: advance one cluster at a time while the trie accepts it. The
// loop is left with parseState set to END_SENTENCE (pszIndex reached pszEnd),
// LONGEST_MATCH / NOMATCH_FOUND (the trie rejected the cluster), or
// ERROR_OUTMEMORY (CreateNode failed).
while (TRUE) { iNumLastCluster = iNumCluster; iNumCluster = GetCluster(pszIndex); if (thaiTrieIter.MoveCluster(pszIndex, iNumCluster)) { pszIndex += iNumCluster; if (thaiTrieIter.fWordEnd) { fFoundMatch = true; // if first node add first node
if (iIndexBreakTree == 0) { CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, thaiTrieIter.dwTag); iIndexBreakTree++; } else { if (fAddToNodeAnalyze) { fAddToNodeAnalyze = false; breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, thaiTrieIter.dwTag);
// Determine if an error has occur.
if (breakTree[iNodeAnalyze].NextBreak == UNABLETOCREATENODE) { breakTree[iNodeAnalyze].NextBreak = 0; parseState = ERROR_OUTMEMORY; break; }
iPrevIndexBreakTree = breakTree[iNodeAnalyze].NextBreak; iNodeAnalyze++; } else { breakTree[iPrevIndexBreakTree].Down = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, thaiTrieIter.dwTag);
// Determine if an error has occur.
if (breakTree[iPrevIndexBreakTree].Down == UNABLETOCREATENODE) { breakTree[iPrevIndexBreakTree].Down = 0; parseState = ERROR_OUTMEMORY; break; }
iPrevIndexBreakTree = iIndexBreakTree; } iIndexBreakTree++; } }
if (pszIndex >= pszEnd) { assert(pszIndex <= pszEnd); // assert should never come up - if it appear likely bug in GetCluster funciton.
parseState = END_SENTENCE; break; } } else { if (fFoundMatch) parseState = LONGEST_MATCH; else parseState = NOMATCH_FOUND; break;
} }
// Post-processing, selected by how the inner loop ended.
if (parseState == LONGEST_MATCH) { // We found a matched.
assert(breakTree[iPrevIndexBreakTree].Down == 0); // at this point breakTree[iPreveIndexBreakTree].Down should equal null.(optimization note)
if (breakTree[iParentNode].NextBreak != iPrevIndexBreakTree) { assert(breakTree[iPrevIndexBreakTree].dwTAG != TAGPOS_UNKNOWN); // shouldn't assert because the end node should ever be unknown.
DeterminePurgeEndingSentence(pszBeginWord, breakTree[iParentNode].NextBreak); } } else if (parseState == NOMATCH_FOUND) { // Should mark node as unknown.
if (fAddToNodeAnalyze) { fAddToNodeAnalyze = false; iWordLen = pszIndex - pszBeginWord; // Make sure we don't only have a cluster of text before making a node.
if (iWordLen == 0) { // If we have an UNKNOWN word of one character only current node mark it as unknown.
assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode
breakTree[iNodeAnalyze].iBreakLen += iNumCluster; breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen); } else { if (breakTree[iNodeAnalyze].iBreakLen + iWordLen < 8) // The reason we are using 8 is because from corpora analysis
// the average Thai word is about 7.732 characters.
// TODO: We should add orthographic analysis here to get a better on boundary
// of unknown word.
{ assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode
breakTree[iNodeAnalyze].iBreakLen += iWordLen; breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen); } else { if (GetWeight(pszIndex - iNumLastCluster)) breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen - iNumLastCluster, TAGPOS_UNKNOWN); else breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen, TAGPOS_UNKNOWN);
// Determine if an error has occur.
if (breakTree[iNodeAnalyze].NextBreak == UNABLETOCREATENODE) { breakTree[iNodeAnalyze].NextBreak = 0; parseState = ERROR_OUTMEMORY; break; } iNodeAnalyze++; iIndexBreakTree++; } } } else { breakTree[iPrevIndexBreakTree].Down = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, TAGPOS_UNKNOWN);
// Determine if an error has occur.
if (breakTree[iPrevIndexBreakTree].Down == UNABLETOCREATENODE) { breakTree[iPrevIndexBreakTree].Down = 0; parseState = ERROR_OUTMEMORY; break; } iIndexBreakTree++; } } else if (parseState == END_SENTENCE) { // If we find ourself at the end of a sentence and no match.
if (!fFoundMatch) { if (fAddToNodeAnalyze) { fAddToNodeAnalyze = false; iWordLen = pszIndex - pszBeginWord; // Make sure we don't only have a cluster of text before making a node.
if (iWordLen == 0) { // If we have an UNKNOWN word of one character only current node mark it as unknown.
assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode
breakTree[iNodeAnalyze].iBreakLen += iNumCluster; breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen); } else { if (breakTree[iNodeAnalyze].iBreakLen + iWordLen < 8) // The reason we are using 8 is because from corpora analysis
// the average Thai word is about 7.732 characters.
// TODO: We should add orthographic analysis here to get a better on boundary
// of unknown word.
{ assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode
breakTree[iNodeAnalyze].iBreakLen += iWordLen; breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen); } else { if (GetWeight(pszIndex - iNumLastCluster)) breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen - iNumLastCluster, TAGPOS_UNKNOWN); else breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen, TAGPOS_UNKNOWN);
// Determine if an error has occur.
if (breakTree[iNodeAnalyze].NextBreak == UNABLETOCREATENODE) { breakTree[iNodeAnalyze].NextBreak = 0; parseState = ERROR_OUTMEMORY; break; } iNodeAnalyze++; iIndexBreakTree++; } } } else { breakTree[iPrevIndexBreakTree].Down = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, TAGPOS_UNKNOWN);
// Determine if an error has occur.
// NOTE(review): in this END_SENTENCE branch the iIndexBreakTree++ on the next
// line sits after the whole if/else, so it also runs when a node was only
// extended (no CreateNode) and a second time after the NextBreak node case.
// The NOMATCH_FOUND branch above increments only where a node is created.
// Confirm whether the difference is intended.
if (breakTree[iPrevIndexBreakTree].Down == UNABLETOCREATENODE) { breakTree[iPrevIndexBreakTree].Down = 0; parseState = ERROR_OUTMEMORY; break; } } iIndexBreakTree++; } // If the beginning of node the branch isn't equal to leaf node perphase it is possible to
// do some ending optimization.
else if (breakTree[iParentNode].NextBreak != iPrevIndexBreakTree) { assert(breakTree[iPrevIndexBreakTree].dwTAG != TAGPOS_UNKNOWN); // shouldn't assert because the end node should ever be unknown.
DeterminePurgeEndingSentence(pszBeginWord, breakTree[iParentNode].NextBreak); } } else if ( (breakTree[iNodeAnalyze].iBreakLen == 0) || (parseState == ERROR_OUTMEMORY) ) break; }
// The final else-if above is reached only with parseState == ERROR_OUTMEMORY
// (the other three states are handled by the earlier branches), so it always
// leaves the outer loop.
return (parseState != ERROR_OUTMEMORY); }
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Traverse all the tree and look for the least number of token.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
bool CThaiBreakTree::MaximalMatching() { // If maximal matching break array has not been allocate, than allocate it.
if (!maximalMatchingBreakArray) maximalMatchingBreakArray = new BYTE[MAXBREAK]; if (!maximalMatchingTAGArray) maximalMatchingTAGArray = new DWORD[MAXBREAK]; if (!maximalMatchingPOSArray) maximalMatchingPOSArray = new WCHAR[MAXBREAK];
maxLevel = MAXUNSIGNEDINT; maxToken = 0; iNumUnknownMaximalPOSArray = MAXBREAK; Traverse(0,0,0);
return true; }
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: The function determines whether the node should be
// tagged as unknown or purged.
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Decides the tag for node iCurrentNode once its length has become iBreakLen.
//
// Walks the Down chain of iCurrentNode. Returns TAGPOS_PURGE when the chain
// holds either
//   - a node of exactly the same length, or
//   - a shorter node that is a real word (tagged neither TAGPOS_UNKNOWN nor
//     TAGPOS_PURGE);
// otherwise returns TAGPOS_UNKNOWN.
//
// Fix: the tag test used to read "(tag != TAGPOS_UNKNOWN) || (tag != TAGPOS_PURGE)",
// which is true for every tag, so any shorter node caused a purge. The two
// inequalities are now joined with &&, and the mixed ||/&& expression is
// parenthesized explicitly.
inline DWORD CThaiBreakTree::DeterminePurgeOrUnknown(unsigned int iCurrentNode, unsigned int iBreakLen)
{
    for (unsigned int iNode = breakTree[iCurrentNode].Down; iNode != 0; iNode = breakTree[iNode].Down)
    {
        const unsigned int iAltLen = breakTree[iNode].iBreakLen;
        const DWORD dwAltTag = breakTree[iNode].dwTAG;
        const bool fAltIsRealWord = (dwAltTag != TAGPOS_UNKNOWN) && (dwAltTag != TAGPOS_PURGE);

        if ( (iAltLen == iBreakLen) || ((iAltLen < iBreakLen) && fAltIsRealWord) )
        {
            // Since we are purging this break just make sure the NextBreak is Null.
            assert(breakTree[iCurrentNode].NextBreak == 0);
            return TAGPOS_PURGE;
        }
    }
    return TAGPOS_UNKNOWN;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Ending optimization - if we have found the end of a sentence,
// and possible break. Purge the branch for unnecessary break.
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
inline void CThaiBreakTree::DeterminePurgeEndingSentence(WCHAR* pszBeginWord, unsigned int iNode) { while (breakTree[iNode].Down != 0) { // Determine if the next string has a possiblity to become a word.
// TODO: We may need to change this once the GetWeight add soundex
// functionality.
if (GetWeight(pszBeginWord + breakTree[iNode].iBreakLen) == 0) { // Since we are purging this break just make sure the NextBreak is Null.
assert(breakTree[iNode].NextBreak == 0); breakTree[iNode].dwTAG = TAGPOS_PURGE; } iNode = breakTree[iNode].Down; } } #endif
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Walks thaiTrieIter cluster by cluster from pszBegin and returns the length
// (in WCHARs from pszBegin) of the last position where fWordEnd was set.
// If that length equals iWordLen and an earlier, shorter, non-zero word end
// was also seen, the earlier length is returned instead.
// Special cases: returns 0 when exactly one character lies between pszBegin
// and pszEnd, and 1000 when pszBegin == pszEnd.
unsigned int CThaiBreakTree::GetLongestSubstring(WCHAR* pszBegin, unsigned int iWordLen)
{
    // Short circuit when one character or none is left.
    if ((pszEnd - pszBegin) == 1)
    {
        return 0;
    }
    if (pszEnd == pszBegin)
    {
        return 1000;
    }

    unsigned int cchCluster = 1;
    unsigned int uiPrevMatchLen = 0;
    unsigned int uiMatchLen = 0;
    bool fFirstCluster = true;      // Tells MoveCluster this is the start of a new word.
    WCHAR* pszCursor = pszBegin;

    for (;;)
    {
        cchCluster = GetCluster(pszCursor);
        if (!thaiTrieIter.MoveCluster(pszCursor, cchCluster, fFirstCluster))
        {
            break;
        }

        fFirstCluster = false;
        pszCursor += cchCluster;
        if (thaiTrieIter.fWordEnd)
        {
            uiPrevMatchLen = uiMatchLen;
            uiMatchLen = (unsigned int) (pszCursor - pszBegin);
        }
    }

    // Fall back to the previous word end when the longest one is the whole word.
    if ((uiMatchLen == iWordLen) && (uiPrevMatchLen < uiMatchLen) && (uiPrevMatchLen > 0))
    {
        uiMatchLen = uiPrevMatchLen;
    }
    return uiMatchLen;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Walks thaiTrieIter cluster by cluster from pszBegin and returns the length
// (in WCHARs from pszBegin) of the last position where fWordEnd was set, or
// 0 when no word end was seen.
// Special cases: returns 0 when exactly one character lies between pszBegin
// and pszEnd, and 1000 when pszBegin == pszEnd.
unsigned int CThaiBreakTree::GetWeight(WCHAR* pszBegin)
{
    // Short circuit when one character or none is left.
    if ((pszEnd - pszBegin) == 1)
    {
        return 0;
    }
    if (pszEnd == pszBegin)
    {
        return 1000;
    }

    unsigned int cchCluster = 1;
    unsigned int uiMatchLen = 0;
    bool fFirstCluster = true;      // Tells MoveCluster this is the start of a new word.
    WCHAR* pszCursor = pszBegin;

    for (;;)
    {
        cchCluster = GetCluster(pszCursor);
        if (!thaiTrieIter.MoveCluster(pszCursor, cchCluster, fFirstCluster))
        {
            break;
        }

        fFirstCluster = false;
        pszCursor += cchCluster;
        if (thaiTrieIter.fWordEnd)
        {
            uiMatchLen = (unsigned int) (pszCursor - pszBegin);
        }
    }
    return uiMatchLen;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Same walk as GetWeight(WCHAR*), but also reports the tag of the match:
// each time fWordEnd is set, *pdwTag receives thaiTrieIter.dwTag.
// *pdwTag is left untouched when no word end is seen or when one of the
// short-circuit returns is taken.
// Special cases: returns 0 when exactly one character lies between pszBegin
// and pszEnd, and 1000 when pszBegin == pszEnd.
unsigned int CThaiBreakTree::GetWeight(WCHAR* pszBegin, DWORD* pdwTag)
{
    // Short circuit when one character or none is left.
    if ((pszEnd - pszBegin) == 1)
    {
        return 0;
    }
    if (pszEnd == pszBegin)
    {
        return 1000;
    }

    unsigned int cchCluster = 1;
    unsigned int uiMatchLen = 0;
    bool fFirstCluster = true;      // Tells MoveCluster this is the start of a new word.
    WCHAR* pszCursor = pszBegin;

    for (;;)
    {
        cchCluster = GetCluster(pszCursor);
        if (!thaiTrieIter.MoveCluster(pszCursor, cchCluster, fFirstCluster))
        {
            break;
        }

        fFirstCluster = false;
        pszCursor += cchCluster;
        if (thaiTrieIter.fWordEnd)
        {
            uiMatchLen = (unsigned int) (pszCursor - pszBegin);
            *pdwTag = thaiTrieIter.dwTag;
        }
    }
    return uiMatchLen;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Traverse the tree.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
bool CThaiBreakTree::Traverse(unsigned int iLevel, unsigned int iCurrentNode, unsigned int iNumUnknown) { assert (iLevel < MAXBREAK); // Process node.
breakArray[iLevel] = breakTree[iCurrentNode].iBreakLen; tagArray[iLevel] = breakTree[iCurrentNode].dwTAG; if (tagArray[iLevel] == TAGPOS_UNKNOWN) iNumUnknown++;
// Have we found the end of the sentence.
if (breakTree[iCurrentNode].NextBreak == 0) { if (breakTree[iCurrentNode].dwTAG != TAGPOS_PURGE) AddBreakToList(iLevel + 1, iNumUnknown); if (breakTree[iCurrentNode].Down != 0) { if (tagArray[iLevel] == TAGPOS_UNKNOWN) iNumUnknown--; return Traverse(iLevel,breakTree[iCurrentNode].Down, iNumUnknown); } else return true; } else Traverse(iLevel + 1, breakTree[iCurrentNode].NextBreak, iNumUnknown);
if (breakTree[iCurrentNode].Down != 0) { if (tagArray[iLevel] == TAGPOS_UNKNOWN) iNumUnknown--;
Traverse(iLevel,breakTree[iCurrentNode].Down, iNumUnknown); }
return true; }
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Walks thaiTrieIter1 cluster by cluster from pszBegin using
// MoveSoundexByCluster, adding a penalty per step:
//   SUBSTITUTE_SOUNDLIKECHAR, UNABLE_TO_MOVE : +2
//   SUBSTITUTE_DIACRITIC                     : +1
//   STOP_MOVE                                : +1000
//   NOSUBSTITUTE and anything else           : +0
// The walk stops once the accumulated penalty exceeds 2. Returns the length
// (in WCHARs from pszBegin) of the last position where fWordEnd was set, or
// 0 if none was seen or fewer than two characters remain before pszEnd.
unsigned int CThaiBreakTree::SoundexSearch(WCHAR* pszBegin)
{
    unsigned int iLongestWord = 0;

    // Short circuit when fewer than two characters remain.
    if ( (pszBegin+1) >= pszEnd )
    {
        return iLongestWord;
    }

    unsigned int cchCluster = 1;
    unsigned int cchNextCluster = 1;
    unsigned int iPenalty = 0;
    WCHAR* pszCursor = pszBegin;

    // Reset Iterator for generating break for new word.
    thaiTrieIter1.Reset();

    while (true)
    {
        cchCluster = GetCluster(pszCursor);

        // The next cluster's length is 0 once the end of the string is reached.
        if (pszCursor + cchCluster >= pszEnd)
        {
            cchNextCluster = 0;
        }
        else
        {
            cchNextCluster = GetCluster(pszCursor+cchCluster);
        }

        // Determine penalty
        switch (thaiTrieIter1.MoveSoundexByCluster(pszCursor, cchCluster, cchNextCluster))
        {
        case SUBSTITUTE_SOUNDLIKECHAR:
        case UNABLE_TO_MOVE:
            iPenalty += 2;
            break;
        case SUBSTITUTE_DIACRITIC:
            iPenalty++;
            break;
        case STOP_MOVE:
            iPenalty += 1000;
            break;
        case NOSUBSTITUTE:
        default:
            break;
        }

        if (iPenalty > 2)
        {
            break;
        }

        // Update Index.
        pszCursor += cchCluster;
        if (thaiTrieIter1.fWordEnd)
        {
            iLongestWord = (unsigned int) (pszCursor - pszBegin);
        }
    }
    return iLongestWord;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: The information used here is a reference to the orthographic
// analysis work done on the Thai languages. (see paper: Natural
// Language Processing in Thailand 1993 Chulalongkorn. p 361).
//
// Arguments: pszBoundaryChar - Contain pointer to at least two thai character
// character next to each other which we will
// use to calculate wheather we should or
// should not merge the two word.
//
// iPrevWordLen -
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
inline bool CThaiBreakTree::ShouldMerge(const WCHAR* pwszPrevWord, unsigned int iPrevWordLen, unsigned int iMergeWordLen, DWORD dwPrevTag) { const WCHAR* pwszBoundary = pwszPrevWord + iPrevWordLen - 1;
assert(iMergeWordLen != 0); assert(iPrevWordLen != 0);
// There are very few words in Thai that are 4 character or less, therefore we should
// found a pair that less than 4 character we should merge.
// Or if merge word length is one than also merge.
// Of if last cluster of the word is a Thanthakhat(Karan) we should always merge.
if (iPrevWordLen + iMergeWordLen <= 4 || iMergeWordLen == 1 || (iMergeWordLen == 2 && *(pwszBoundary + iMergeWordLen) == THAI_Thanthakhat)) return true;
if (iPrevWordLen >=2) { const WCHAR* pwszPrevCharBoundary = pwszBoundary - 1;
// TO IMPROVE: It better to check the last character of Previous word, it can give us a
// much better guess
if ((*pwszPrevCharBoundary == THAI_Vowel_Sign_Mai_HanAkat || *pwszBoundary == THAI_Vowel_Sign_Mai_HanAkat) || (*pwszPrevCharBoundary == THAI_Tone_Mai_Tri || *pwszBoundary == THAI_Tone_Mai_Tri) || (*pwszPrevCharBoundary == THAI_Sara_Ue || *pwszBoundary == THAI_Sara_Ue) ) return true; }
// If the first character of the next word is mostly likly the beginning
// character and last character of the previous word is not sara-A than
// we have a high probability that we found a begin of word boundary,
// therefore we shouldn't merge.
if ( (IsThaiMostlyBeginCharacter(pwszBoundary[1]) && *pwszBoundary != THAI_Vowel_Sara_A) ) return false;
// If the last character of the previous word is mostly likely an ending
// character than, than there is a high probability that the found a boundary.
// There are very few words in Thai that are 4 character or less, therefore we should
// found a pair that less than 4 character we should merge.
if (IsThaiMostlyLastCharacter(*pwszBoundary)) return false;
// O10.192931 Adding Diacritic check rules. We might want to expand this to more diacritic
// for now Mai HanAkart would do. It is highly unlikely that a word contain more than 1 of Mai HanAkart diacritic.
if (IsContain(pwszPrevWord,iPrevWordLen,THAI_Vowel_Sign_Mai_HanAkat) && IsContain(pwszBoundary + 1,iMergeWordLen,THAI_Vowel_Sign_Mai_HanAkat)) return false;
if (iMergeWordLen == 3 && GetCluster(pwszBoundary + 1) == iMergeWordLen) { if (*(pwszBoundary + 2) == THAI_Vowel_Sara_I) { if (*(pwszBoundary+3) == THAI_Tone_Mai_Ek || *(pwszBoundary+3) == THAI_Tone_Mai_Tro) return false; } }
// if previous tag is equal to Title Noun than the next word is highly likly to be a name.
if (ExtractPOS(dwPrevTag) == 6) return false;
// O11.134455. For the case of trailling punctuation.
if (dwPrevTag == TAGPOS_PUNC && iMergeWordLen > 1 && iPrevWordLen > 1) return false;
// The reason we are using 8 is because from corpora analysis
// the average Thai word is about 7.732 characters. Or, if previous word is already
// an unknown, to keep the amount of unknown low the unknown to previous words.
if ( (iPrevWordLen + iMergeWordLen < 8) || (dwPrevTag == TAGPOS_UNKNOWN) ) return true;
return false; }
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
// 8/17/99 optimize some code.
//
// Notes:
//
//----------------------------------------------------------------------------
inline void CThaiBreakTree::AddBreakToList(unsigned int iNumBreak, unsigned int iNumUnknown) { #if defined (_DEBUG)
breakArray[iNumBreak] = 0; #endif
if (CompareSentenceStructure(iNumBreak, iNumUnknown)) { maxToken = maxLevel = iNumBreak; // This is ugly but it save 5 clock cycle.
memcpy(maximalMatchingBreakArray,breakArray,maxToken); memcpy(maximalMatchingTAGArray,tagArray,sizeof(DWORD)*maxToken); maximalMatchingBreakArray[maxToken] = 0; maximalMatchingTAGArray[maxToken] = 0; } }
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: The function compares sentence structure of
// maximalMatchingPOSArray with posArray.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Decides whether the candidate in tagArray (iNumBreak entries, of which
// iNumUnknownPOSArray are unknown) should replace the current best candidate
// in maximalMatchingTAGArray. Returns true if it should.
//
//  1. Fewer breaks than the best so far, and no more unknowns: accept.
//  2. Same number of breaks:
//     a. fewer unknowns: accept;
//     b. candidate matches a known sentence structure (IsSentenceStruct) and
//        the current best does not: accept;
//     c. same number of unknowns: accept only if the candidate's words have
//        the higher total frequency weight.
//  3. Anything else: reject.
//
// Whenever a candidate is accepted in cases 1, 2a or 2b,
// iNumUnknownMaximalPOSArray is updated to the candidate's unknown count.
//
// Fix: the frequency loop used to run to i <= iNumBreak and therefore also
// weighed tagArray[iNumBreak], a slot that is never written for the current
// candidate. Only the iNumBreak real entries are weighed now.
inline bool CThaiBreakTree::CompareSentenceStructure(unsigned int iNumBreak, unsigned int iNumUnknownPOSArray)
{
    if ( (iNumBreak < maxLevel) && (iNumUnknownMaximalPOSArray >= iNumUnknownPOSArray) )
    {
        iNumUnknownMaximalPOSArray = iNumUnknownPOSArray;
        return true;
    }
    else if (iNumBreak == maxLevel)
    {
        // true - maximal matching has a larger unknown.
        if (iNumUnknownMaximalPOSArray > iNumUnknownPOSArray)
        {
            iNumUnknownMaximalPOSArray = iNumUnknownPOSArray;
            return true;
        }

        // Build the part-of-speech sequences of both candidates.
        // NOTE(review): this loop also fills slot iNumBreak. IsSentenceStruct
        // only reads the first iNumBreak entries, but the bound is left as it
        // was in case other code relies on that extra slot - confirm.
        for (unsigned int i = 0; i <= iNumBreak; i++)
        {
            maximalMatchingPOSArray[i] = ExtractPOS(maximalMatchingTAGArray[i]);
            POSArray[i] = ExtractPOS(tagArray[i]);
        }

        // Determine if the sentence structure is like any one of the
        // sentence structure in our corpora.
        if ( (IsSentenceStruct(POSArray, iNumBreak)) &&
             (!IsSentenceStruct(maximalMatchingPOSArray, iNumBreak)) )
        {
            iNumUnknownMaximalPOSArray = iNumUnknownPOSArray;
            return true;
        }
        else if (iNumUnknownMaximalPOSArray == iNumUnknownPOSArray)
        {
            // Determine the frequency of word used in the sentence.
            // Both totals start at 500 so the unsigned weights stay clear of zero.
            unsigned int iFrequencyArray = 500;
            unsigned int iFrequencyMaximalArray = 500;

            for (unsigned int j = 0; j < iNumBreak; j++)
            {
                DetermineFrequencyWeight(ExtractFrq(maximalMatchingTAGArray[j]),&iFrequencyMaximalArray);
                DetermineFrequencyWeight(ExtractFrq(tagArray[j]),&iFrequencyArray);
            }
            return (iFrequencyArray > iFrequencyMaximalArray);
        }
    }
    return false;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Looks up the first iPosLen entries of pos in the trie behind thaiSentIter.
// At each trie level the iterator moves Right until a node whose pos matches
// the next entry is found, then moves Down for the following entry.
// Returns TRUE only if all iPosLen entries were matched and the node that
// matched the last entry has fWordEnd set; FALSE otherwise.
bool CThaiBreakTree::IsSentenceStruct(const WCHAR* pos, unsigned int iPosLen)
{
    unsigned int iMatched = 0;

    thaiSentIter.Reset();
    if (!thaiSentIter.Down())
    {
        return FALSE;
    }

    for (;;)
    {
        thaiSentIter.GetNode();

        if (thaiSentIter.pos != pos[iMatched])
        {
            // Move right of the Trie Branch
            if (!thaiSentIter.Right())
            {
                return FALSE;
            }
            continue;
        }

        iMatched++;
        if (iMatched == iPosLen)
        {
            // Whole sequence consumed: a match only if the trie entry ends here.
            if (thaiSentIter.fWordEnd)
            {
                return TRUE;
            }
            return FALSE;
        }

        // Move down the Trie Branch.
        if (!thaiSentIter.Down())
        {
            return FALSE;
        }
    }
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Scores how plausible it is for the word tagged dwTag2 to directly
//           follow the word tagged dwTag1.
//
//           The score starts at 4. When both tags are known, a bonus is added
//           for part-of-speech pairs from a hand-written rule list (at most
//           one rule applies to a given pair). Finally the score is adjusted
//           by the frequency class of each word (DetermineFrequencyWeight).
//
// Arguments:
//           dwTag1 - tag of the first word. (in)
//           dwTag2 - tag of the word that follows it. (in)
//
// Returns:  the score, converted to float.
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//   TODO : Use the distribution of word categories to determine the optimal
//          search order - for example NOUN VERB ADVERB CLASSIFIER CONJUNCTION
//          PREP etc.
//   TODO : Once we have trigrams, use them to create bigram probability too.
//
//----------------------------------------------------------------------------
float CThaiBreakTree::BigramProbablity(DWORD dwTag1,DWORD dwTag2)
{
    unsigned int iWeight = 4;

    if (dwTag1 != TAGPOS_UNKNOWN && dwTag2 != TAGPOS_UNKNOWN)
    {
        const WCHAR pos1 = ExtractPOS(dwTag1);
        const WCHAR pos2 = ExtractPOS(dwTag2);
        unsigned int iBonus = 0;

        if (pos2 == 44 || pos2 == 45)
        {
            // x + (EAFF | EITT): endings for affirmative and interrogative
            // are most often the second member of a pair.
            // Example: (In Thai) Krub, Ka.
            // No other rule below has 44 or 45 as its second code.
            iBonus = 3;
        }
        else
        {
            switch (pos1)
            {
            case 5:     // NCMN (common noun)
                // + VATT (13): a common noun is often followed by an
                //   attributive verb (adjective). Example: book good.
                // + DDAC (20): definite determiner classifier.
                //   Example: food here.
                // + NCMN (5): noun compound. Example: electric bulb.
                if (pos2 == 13)      iBonus = 10;
                else if (pos2 == 20) iBonus = 3;
                else if (pos2 == 5)  iBonus = 1;
                break;

            case 6:     // NTTL (title noun)
                // + NPRP (1): a title is often followed by a proper noun.
                //   Example: Dr. Athapan, Mr. Sam.
                if (pos2 == 1) iBonus = 5;
                break;

            case 11:    // VACT (active verb)
                // + XVAE (18).
                // + DDBQ (21): active verb followed by a definite
                //   determiner. Example: working for, singing again.
                if (pos2 == 18)      iBonus = 5;
                else if (pos2 == 21) iBonus = 10;
                break;

            case 13:    // VATT (attributive verb)
                // + VACT (11): adjectives are followed by a verb.
                // + VATT (13): repeated adjectives, common in spoken
                //   language. Example: she is really really cute.
                if (pos2 == 11 || pos2 == 13) iBonus = 2;
                break;

            case 17:    // XVBB
                // + VACT (11).
                if (pos2 == 11) iBonus = 5;
                break;

            case 18:    // XVAE (post-verb auxiliary)
                // + VACT (11): often followed by an active verb.
                //   Example: come singing, go work.
                if (pos2 == 11) iBonus = 10;
                break;

            case 28:    // ADVN (adverb, normal form)
                // + NCMN (5): often followed by a common noun (Bug 55057).
                //   Example: under table.
                if (pos2 == 5) iBonus = 5;
                break;

            case 33:    // CLTV (collective classifier)
                // + NCMN (5). Example: group people, flock bird.
                if (pos2 == 5) iBonus = 5;
                break;

            case 34:    // CMTR (measurement classifier)
                // + JCMP (38): comparative conjunction.
                //   Example: year about -> English "about a year".
                if (pos2 == 38) iBonus = 5;
                break;

            case 39:    // JSBR (subordinating conjunction)
                // + XVAM (15) or VSTA (12): often followed by a pre-verb
                //   auxiliary or a verb. Example: because of, because see.
                if (pos2 == 15 || pos2 == 12) iBonus = 10;
                break;

            case 46:    // NEG (negator, i.e. "not")
                // + some kind of verb: 11 (VACT), 12 (VSTA), 13 (VATT),
                //   15 (XVAM), 16 (listed as XVAE in the original comment,
                //   although XVAE is 18 in the rules above).
                //   Example: he is not going.
                if (pos2 == 11 || pos2 == 12 || pos2 == 13 ||
                    pos2 == 15 || pos2 == 16)
                    iBonus = 8;
                break;

            default:
                break;
            }
        }

        iWeight += iBonus;
    }

    // Frequency adjustment is applied even when a tag is unknown.
    DetermineFrequencyWeight(ExtractFrq(dwTag1), &iWeight);
    DetermineFrequencyWeight(ExtractFrq(dwTag2), &iWeight);

    return (float) iWeight;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Scores a sequence of three word tags.
//
//           The score starts at 6. When all three tags are known and each
//           carries a non-zero part of speech, the value returned by
//           thaiTrigramIter.GetProb() for the three codes is added. The
//           score is then adjusted by the frequency class of each word
//           (DetermineFrequencyWeight).
//
// Arguments:
//           dwTag1 - tag of the first word. (in)
//           dwTag2 - tag of the second word. (in)
//           dwTag3 - tag of the third word. (in)
//
// Returns:  the score.
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
DWORD CThaiBreakTree::TrigramProbablity(DWORD dwTag1,DWORD dwTag2,DWORD dwTag3)
{
    DWORD dwScore = 6;

    const bool fAllTagsKnown = (dwTag1 != TAGPOS_UNKNOWN) &&
                               (dwTag2 != TAGPOS_UNKNOWN) &&
                               (dwTag3 != TAGPOS_UNKNOWN);

    if (fAllTagsKnown)
    {
        // Zero-terminated key for the trigram lookup.
        WCHAR posKey[4];
        posKey[0] = ExtractPOS(dwTag1);
        posKey[1] = ExtractPOS(dwTag2);
        posKey[2] = ExtractPOS(dwTag3);
        posKey[3] = 0;

        // Optimization: if any part of speech is "none" the trigram cannot
        // exist, so the lookup is skipped.
        const bool fAllPosPresent = (posKey[0] != 0) &&
                                    (posKey[1] != 0) &&
                                    (posKey[2] != 0);
        if (fAllPosPresent)
        {
            dwScore += thaiTrigramIter.GetProb(posKey);
        }
    }

    // Frequency adjustment is applied even when a tag is unknown.
    DetermineFrequencyWeight(ExtractFrq(dwTag1), &dwScore);
    DetermineFrequencyWeight(ExtractFrq(dwTag2), &dwScore);
    DetermineFrequencyWeight(ExtractFrq(dwTag3), &dwScore);

    return (DWORD) dwScore;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Splits the text starting at pwchBegin into words and records,
//           for every word, its length in breakArray[] and its tag in
//           tagArray[].
//
//           For each word start the text is fed cluster by cluster (see
//           GetCluster) into thaiTrieIter; every position where the trie
//           reports a word end becomes a candidate break. Then:
//             * one candidate     - it is taken (or merged into the previous
//                                   word for abbreviations / ending signs).
//             * several candidates - they are compared using GetWeight() of
//                                   the text that follows plus the bigram /
//                                   trigram scores of the tags involved.
//             * no candidate      - the text is merged into the previous
//                                   word, treated as an abbreviation,
//                                   repaired through SoundexSearch(), or
//                                   stored as an unknown word.
//
// Arguments:
//           pwchBegin - first character of the text to break. (in)
//           pwchEnd1  - position at which scanning stops; it is stored in
//                       the member pszEnd. (in)
//
// Returns:  number of entries written to breakArray[] / tagArray[].
//
// Modifies: pszEnd, breakArray[], tagArray[], POSArray[0], and the state of
//           thaiTrieIter.
//
// History: created 8/99 aarayas
//
// Notes:
//   NOTE(review): no check of iBreakIndex against the capacity of
//   breakArray/tagArray, nor of iNextBreakIndex against MAXBREAK, is visible
//   in this function - confirm the callers bound the input length.
//
//----------------------------------------------------------------------------
unsigned int CThaiBreakTree::TrigramBreak(WCHAR* pwchBegin, WCHAR* pwchEnd1) { // Declare and initialize local variables.
// pwchBeginWord marks the start of the word being resolved, pwchIndex the scan position inside it.
// nextBreakArray/nextTagArray hold the candidate word lengths/tags found for the current word start.
WCHAR* pwchBeginWord = pwchBegin; WCHAR* pwchIndex = pwchBegin; unsigned int iWordLen; unsigned int iNumCluster = 1; unsigned int iNumLastCluster; unsigned int iBreakIndex = 0; BYTE nextBreakArray[MAXBREAK]; DWORD nextTagArray[MAXBREAK]; unsigned int iNextBreakIndex; // index for array nextBreakArray and nextTagArray.
bool fFoundMatch; unsigned int iWeight; unsigned int iSumWeight; unsigned int iPrevWeight; unsigned int iCurrWeight; BYTE iSoundexWordLen; DWORD iPrevProbability; DWORD iCurrentProbability; DWORD dwTagTemp; DWORD dwLastTag; int i; // temporary int for use as need.
bool fBeginNewWord; bool fEndWord = false;
pszEnd = pwchEnd1; breakArray[0] = 0; POSArray[0] = 0; tagArray[0] = 0; nextBreakArray[0] = 0; nextTagArray[0] = 0;
// Outer loop: one iteration per word start.
while (true) { // Reset Iterator for generating break for new word.
fFoundMatch = false; fBeginNewWord = true;
// Get begin word string for next round of word break.
pwchIndex = pwchBeginWord; iNextBreakIndex = 0;
if (pwchIndex == pszEnd) break;
// Inner loop: advance one cluster at a time through the trie and record every
// position where thaiTrieIter reports a word end as a candidate break.
while(true) { iNumLastCluster = iNumCluster; iNumCluster = GetCluster(pwchIndex); if (!thaiTrieIter.MoveCluster(pwchIndex, iNumCluster, fBeginNewWord)) { if ((iNumCluster == 0) && (pwchIndex == pszEnd)) fEndWord = true; else break; }
fBeginNewWord = false; pwchIndex += iNumCluster; if (thaiTrieIter.fWordEnd) { if (thaiTrieIter.m_fThaiNumber) { // If we have Thai number accumulate it as one break.
assert(iNumCluster == 1); fFoundMatch = true; nextBreakArray[0]= (BYTE)(pwchIndex - pwchBeginWord); nextTagArray[0] = TAGPOS_NCNM; iNextBreakIndex = 1; } else { fFoundMatch = true; nextBreakArray[iNextBreakIndex] = (BYTE)(pwchIndex - pwchBeginWord); nextTagArray[iNextBreakIndex] = thaiTrieIter.dwTag; iNextBreakIndex++; } if (pwchIndex >= pszEnd) { assert(pwchIndex <= pszEnd); // assert should never come up - if it appear likely bug in GetCluster funciton.
assert(iNextBreakIndex != 0);
// The text ended exactly on a word end: emit the last (longest) candidate and return.
if ( iNumCluster == 1 && *(pwchIndex - 1) == L'.' && iBreakIndex > 0 && iNextBreakIndex == 1 && tagArray[iBreakIndex - 1] == TAGPOS_ABBR ) { // backtrack one if we have abbrivation case.
// ex. B.K.K. (in Thai). (more info O11.145042.)
breakArray[iBreakIndex - 1] += nextBreakArray[iNextBreakIndex - 1]; return iBreakIndex; }
breakArray[iBreakIndex] = nextBreakArray[iNextBreakIndex - 1]; tagArray[iBreakIndex] = nextTagArray[iNextBreakIndex - 1]; return (++iBreakIndex); } } else if ((pwchIndex >= pszEnd && iNextBreakIndex == 0) || fEndWord) { assert(pwchIndex <= pszEnd); // assert should never come up - if it appear likely bug in GetCluster funciton.
// The text ended with no candidate for this word start (or fEndWord was set above):
// the remaining text is merged into the previous break or stored as an unknown word, then we return.
iWordLen = (unsigned int) (pwchIndex - pwchBeginWord); switch (iWordLen) { case 0: if (iBreakIndex > 0) { // if We have a length of one character add it to previous node.
breakArray[iBreakIndex - 1] += (BYTE) iNumCluster; tagArray[iBreakIndex - 1] = TAGPOS_UNKNOWN; } else { // if this is the first break create a new break.
breakArray[iBreakIndex] = (BYTE) iNumCluster; tagArray[iBreakIndex] = TAGPOS_UNKNOWN; iBreakIndex++; } break; case 1: if (iBreakIndex > 0) { // if We have a length of one character add it to previous node.
breakArray[iBreakIndex - 1] += (BYTE) iWordLen; tagArray[iBreakIndex - 1] = TAGPOS_UNKNOWN; } else { // if this is the first break create a new break.
breakArray[iBreakIndex] = (BYTE) iWordLen; tagArray[iBreakIndex] = TAGPOS_UNKNOWN; iBreakIndex++; } break; default: if ( iBreakIndex > 0 && ShouldMerge(pwchBeginWord - breakArray[iBreakIndex - 1], breakArray[iBreakIndex - 1], iWordLen , tagArray[iBreakIndex - 1]) ) { breakArray[iBreakIndex - 1] += (BYTE) iWordLen; tagArray[iBreakIndex - 1] = TAGPOS_UNKNOWN; } else { breakArray[iBreakIndex] = (BYTE) iWordLen; tagArray[iBreakIndex] = TAGPOS_UNKNOWN; iBreakIndex++; } } return iBreakIndex; } else if (pwchIndex >= pszEnd) { // O10.229346. If we get here we are at the end of word or end of sentence,
// We will need to decide what to depending on if we found the word or not.
break; } }
// The scan for this word start is finished: decide what to emit.
if (fFoundMatch) // Longest Matching.
{ // If we only found one break, than say it the maximum.
if (1 == iNextBreakIndex) { if ( nextBreakArray[0] == 2 && iNumCluster + iNumLastCluster == 2 && iBreakIndex > 0 && *(pwchBeginWord+1) == L'.' && tagArray[iBreakIndex - 1] == TAGPOS_ABBR ) { // backtrack one if we have abbrivation case.
// ex. B.K.K. (in Thai). (more info O11.145042.)
breakArray[iBreakIndex - 1] += nextBreakArray[0]; pwchBeginWord += nextBreakArray[0]; } else if ( iBreakIndex > 0 && IsThaiEndingSign(*pwchBeginWord) && iNumCluster == 1 ) { breakArray[iBreakIndex - 1] += nextBreakArray[0]; pwchBeginWord += nextBreakArray[0];
} else { breakArray[iBreakIndex] = nextBreakArray[0]; tagArray[iBreakIndex] = nextTagArray[0]; pwchBeginWord += breakArray[iBreakIndex]; // update begin word for next round.
iBreakIndex++; } } else { bool fWeightCompare = false;
// Several candidates: reset the "best so far" state before comparing them.
// NOTE(review): dwLastTag is assigned here and when a candidate is selected below,
// but it is not read anywhere in this function.
iSumWeight = 0; iPrevWeight = 0; iCurrWeight = 0; iPrevProbability = 0; iCurrentProbability = 0; dwLastTag = TAGPOS_UNKNOWN; tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
// Candidates are visited from the last one recorded (the longest) down to the first.
// iWeight is what GetWeight() returns for the text right after the candidate
// (presumably the length of the word found there - confirm against GetWeight).
for (i = (iNextBreakIndex - 1); i >= 0 ; i--) { if ( iBreakIndex == 0) { iWeight = GetWeight(pwchBeginWord + nextBreakArray[i], &dwTagTemp);
if (iWeight != 0) // Bigram Probability
iCurrentProbability = (DWORD)BigramProbablity(nextTagArray[i], dwTagTemp); } else { iWeight = GetWeight(pwchBeginWord + nextBreakArray[i], &dwTagTemp);
if (iBreakIndex == 1) // Get Trigram Probability.
iCurrentProbability = TrigramProbablity(tagArray[iBreakIndex - 1], nextTagArray[i], dwTagTemp); else if (iBreakIndex >= 2) { // Get Trigram Probability.
iCurrentProbability = TrigramProbablity(tagArray[iBreakIndex - 2], tagArray[iBreakIndex - 1], nextTagArray[i]); if (iWeight != 0) iCurrentProbability += (DWORD)BigramProbablity(nextTagArray[i],dwTagTemp); } }
fWeightCompare = false;
iCurrWeight = iWeight + nextBreakArray[i];
// fWeightCompare lets a candidate win even though its combined length does not exceed the best so far.
if (iPrevProbability == 0 && (iCurrWeight+1) == iSumWeight && iCurrentProbability > 5) { fWeightCompare = true; } else if (iCurrWeight == iSumWeight && ( Maximum(iWeight,nextBreakArray[i]) <= iPrevWeight || iCurrentProbability > iPrevProbability)) { fWeightCompare = true; } else if ( iWeight >= iPrevWeight - 1 && iPrevProbability > 0 && iPrevProbability < 10 && iCurrentProbability > iPrevProbability * 5000 ) { // O11.187913. We'll trust our trigram data more if the current probability is
// so much greater than previous probability.
//
// * Note: we could probably use one of GA algorithm to get better value than 5K.
fWeightCompare = true; }
// Store the string the best maximum weight, if the pair is equal
// store the string with maxim
if ( iCurrWeight > iSumWeight || fWeightCompare) // ( (iCurrWeight == iSumWeight) &&
// ( (Maximum(iWeight,nextBreakArray[i]) <= iPrevWeight) || (iCurrentProbability > iPrevProbability) ) ))
{ if (iCurrentProbability >= iPrevProbability || iSumWeight < iCurrWeight) { iSumWeight = Maximum(iWeight,1) + nextBreakArray[i]; iPrevWeight = Maximum(iWeight,nextBreakArray[i]); breakArray[iBreakIndex] = nextBreakArray[i]; tagArray[iBreakIndex] = nextTagArray[i]; iPrevProbability = iCurrentProbability; dwLastTag = dwTagTemp; } } } pwchBeginWord += breakArray[iBreakIndex]; // update begin word for next round.
iBreakIndex++; } } else { // NOMATCH_FOUND
iWordLen = (unsigned int)(pwchIndex - pwchBeginWord); if (iBreakIndex > 0) { i = iBreakIndex - 1; // set i to previous break
if (iWordLen == 0) { if (iNumCluster == 1 && *pwchBeginWord == L',' && IsThaiChar(*(pwchBeginWord-breakArray[i])) ) { // We should not merge comma into the word, only merge comma to
// Number.
// TODO: Should add TAGPOS_PUNCT.
breakArray[iBreakIndex] = (BYTE) iNumCluster; tagArray[iBreakIndex] = TAGPOS_UNKNOWN; pwchBeginWord += (BYTE) iNumCluster; // update begin word for next round.
iBreakIndex++; } else if (iNumCluster > 1 && *pwchBeginWord == L'.') { // O11.134455. This is an ellipse case we shouldn't merge this string.
breakArray[iBreakIndex] = (BYTE) iNumCluster; tagArray[iBreakIndex] = TAGPOS_PUNC; pwchBeginWord += (BYTE) iNumCluster; // update begin word for next round.
iBreakIndex++; } else if (ShouldMerge(pwchBeginWord - breakArray[i], breakArray[i], iNumCluster, tagArray[i])) { // If word length is null use the cluster add to previous node.
breakArray[i] += (BYTE) iNumCluster; tagArray[i] = TAGPOS_UNKNOWN; pwchBeginWord += iNumCluster; // update begin word for next round.
} else { // Add the unknown word to list.
breakArray[iBreakIndex] = (BYTE) iNumCluster; tagArray[iBreakIndex] = TAGPOS_UNKNOWN; pwchBeginWord += (BYTE) iNumCluster; // update begin word for next round.
iBreakIndex++; } } else { // Try checking for abbrivations.
// NOTE(review): pwchIndex[1], *(pwchBeginWord+2) and *(pwchBeginWord+3) below are read
// without a visible check against pszEnd - confirm the buffer extends past pszEnd.
if (iWordLen == 1 && iNumCluster == 2 && pwchIndex[1] == L'.') { // The word is an abbrivated words.
// TODO: #1. Add TAGPOS_ABBRV.
// TODO: #2. May need to add rules code abbrivated word with 3 letters.
breakArray[iBreakIndex] = iWordLen + iNumCluster; tagArray[iBreakIndex] = TAGPOS_ABBR; pwchBeginWord += breakArray[iBreakIndex]; iBreakIndex++; } else if (iWordLen == 1 && tagArray[i] == TAGPOS_ABBR && *(pwchBeginWord+1) == L'.' && IsThaiConsonant(*pwchBeginWord) && pwchBeginWord+1 < pszEnd ) { // O11.145042. This is the case where we are a <abbrivated><consonant><period>, the
// likely hood is the character is also an abbrivation.
breakArray[iBreakIndex - 1] += iWordLen + 1; pwchBeginWord += iWordLen + 1; } // Abbreviation are usally 3 characters.
else if ( iWordLen == 2 && IsThaiConsonant(*(pwchBeginWord+2)) && *(pwchBeginWord+3) == L'.' && tagArray[i] != TAGPOS_UNKNOWN ) { // O11.80619. This is the case where we are a <known word><abbrivated>
breakArray[iBreakIndex] = iWordLen + 1; tagArray[iBreakIndex] = TAGPOS_ABBR; pwchBeginWord += breakArray[iBreakIndex]; iBreakIndex++; } // Perhase Misspelled word try use sounding to spell the words.
// Try soundex two word back.
// The three SoundexSearch attempts below restart from two words back, one word back, and the
// current word start; each is accepted only if it covers more text than the words it replaces
// and GetWeight() is non-zero for the text that follows.
else if ( (iBreakIndex >= 2) && ( (iSoundexWordLen = (BYTE) SoundexSearch(pwchBeginWord - breakArray[i] - breakArray[i - 1])) > (BYTE) (breakArray[i] + breakArray[i - 1]) ) && GetWeight(pwchBeginWord - breakArray[i] - breakArray[i - 1] + iSoundexWordLen) ) { // Resize the word.
pwchBeginWord = (pwchBeginWord - breakArray[i] - breakArray[i - 1]) + iSoundexWordLen; // update begin word for next round.
breakArray[i - 1] = iSoundexWordLen; tagArray[i - 1] = thaiTrieIter.dwTag; iBreakIndex--; // Decrement iBreakIndex.
} // Try soundex one words back.
else if (((iSoundexWordLen = (BYTE) SoundexSearch(pwchBeginWord - breakArray[i])) > (BYTE) breakArray[i]) && GetWeight(pwchBeginWord - breakArray[i] + iSoundexWordLen) && ExtractPOS(tagArray[i]) != 6) // Make sure that previous word is not a NTTL.
{ // Resize the word
pwchBeginWord = (pwchBeginWord - breakArray[i]) + iSoundexWordLen; // update begin word for next round.
breakArray[i] = iSoundexWordLen; tagArray[i] = thaiTrieIter.dwTag; } // Try soundex on this word.
else if (((iSoundexWordLen = (BYTE) SoundexSearch(pwchBeginWord)) > (BYTE) iWordLen) && GetWeight(pwchBeginWord + iSoundexWordLen) ) { // Resize the word.
breakArray[iBreakIndex] = iSoundexWordLen; tagArray[iBreakIndex] = thaiTrieIter.dwTag; pwchBeginWord += iSoundexWordLen; // update begin word for next round.
iBreakIndex++; } else if ( ShouldMerge(pwchBeginWord - breakArray[i], breakArray[i], iWordLen , tagArray[i]) ) { // Merge the words.
breakArray[i] += (BYTE) iWordLen; tagArray[i] = TAGPOS_UNKNOWN; pwchBeginWord += iWordLen; // update begin word for next round.
} else { // Add the unknown word to list.
breakArray[iBreakIndex] = (BYTE) iWordLen; tagArray[iBreakIndex] = TAGPOS_UNKNOWN; pwchBeginWord += iWordLen; // update begin word for next round.
iBreakIndex++; } } } else { // Add unknown word to list and mark it.
if (iWordLen == 0) { // If word length is null use the cluster add to previous node.
// NOTE(review): despite the comment above there is no previous node on this path
// (iBreakIndex == 0); the cluster is stored as the first break.
breakArray[iBreakIndex] = (BYTE) iNumCluster; tagArray[iBreakIndex] = TAGPOS_UNKNOWN; pwchBeginWord += iNumCluster; // update begin word for next round.
} else { // We we are here there are 2 case that can happen:
// 1. We take too little into our unknown.
// 2. We take too much into our unknown word.
// Have we taken too little check if this unknown word is an abbrivated words.
if (iWordLen == 1 && iNumCluster == 2 && pwchIndex[1] == L'.') breakArray[iBreakIndex] = iWordLen + iNumCluster; // Try to see if we are taking to much, see if we can get a Weight from last cluster.
// NOTE(review): iWordLen and iNumLastCluster are both unsigned, so the "> 0" test below is true
// whenever the two values differ (including iNumLastCluster > iWordLen) - confirm the intent.
else if ( (iWordLen - iNumLastCluster > 0) && GetWeight(pwchIndex - iNumLastCluster) ) { breakArray[iBreakIndex] = iWordLen - iNumLastCluster; if (breakArray[iBreakIndex] == 1) { iWeight = GetWeight(pwchIndex - iNumLastCluster); if (iWeight > iNumLastCluster && iWeight < 40) breakArray[iBreakIndex] += (BYTE) iWeight; else breakArray[iBreakIndex] += (BYTE) iNumLastCluster;
} } // We may have a case of iWordLen is 1 and iNumCluster, we have a case of misspelled
// an extra character is incorrectly inserted over a correct word.
else if (iWordLen == 1) { iWeight = GetWeight(pwchIndex - iWordLen); if (iWeight > iNumCluster && iWeight < 40) breakArray[iBreakIndex] = iWordLen + iWeight; else breakArray[iBreakIndex] = iWordLen + iNumCluster; } else breakArray[iBreakIndex] = (BYTE) iWordLen; if (iNumLastCluster + iNumCluster == iWordLen && *(pwchBeginWord+iNumLastCluster) == L'.') { tagArray[iBreakIndex] = TAGPOS_ABBR; } else tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
pwchBeginWord += breakArray[iBreakIndex]; // update begin word for next round.
} iBreakIndex++; } } } return iBreakIndex; }
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Thin wrapper that forwards the request to thaiTrieIter.Soundex().
//
// Arguments:
//           word - string handed unchanged to the trie iterator. (in)
//
// Returns:  whatever thaiTrieIter.Soundex() returns for word.
//
// Modifies:
//
// History: created 8/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
int CThaiBreakTree::Soundex(WCHAR* word)
{
    // No logic of its own: the trie iterator does all the work.
    const int iResult = thaiTrieIter.Soundex(word);
    return iResult;
}
//+---------------------------------------------------------------------------
//
// Function: GetCluster
//
// Synopsis: Returns the number of characters, starting at pszIndex, that
//           form the next cluster of Thai text.
//
//           ie. Kor Kai, Kor Kai -> 1
//               Kor Kai, Sara Um -> 2
//
//           A cluster is scanned as: any begin-cluster characters, one
//           consonant, any upper/lower cluster characters, any ending
//           cluster characters. When a character is seen that requires an
//           ending cluster and none followed, the scan continues with the
//           next group. A run of periods (ellipsis) counts as one cluster;
//           any other character counts as a cluster of one.
//
// Arguments:
//           pszIndex - first character of the cluster. (in)
//
// Returns:  length of the cluster in characters; 0 when pszIndex is at (or
//           beyond) the member pszEnd. The cluster never extends past
//           pszEnd.
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//   Every read of *pszIndex is guarded by pszIndex < pszEnd. The previous
//   implementation scanned past pszEnd and trimmed the count afterwards,
//   which gave the same count but read characters outside the buffer.
//
//   NOTE(review): the original header stated that no more than 3 characters
//   are returned; no such cap is enforced by this function.
//
//----------------------------------------------------------------------------
unsigned int CThaiBreakTree::GetCluster(const WCHAR* pszIndex)
{
    // Nothing left to scan.
    if (pszIndex >= pszEnd)
        return 0;

    int  iRetValue = 0;
    bool fNeedEndingCluster = false;

    while (true)
    {
        bool fHasSaraE = false;

        // Take all begin cluster characters.
        while (pszIndex < pszEnd && IsThaiBeginClusterCharacter(*pszIndex))
        {
            if (*pszIndex == THAI_Vowel_Sara_E)
                fHasSaraE = true;
            pszIndex++;
            iRetValue++;
        }

        if (pszIndex < pszEnd && IsThaiConsonant(*pszIndex))
        {
            pszIndex++;
            iRetValue++;

            while (pszIndex < pszEnd && IsThaiUpperAndLowerClusterCharacter(*pszIndex))
            {
                // Mai Han Akat is a special type of cluster that needs at
                // least one ending cluster.
                if (*pszIndex == THAI_Vowel_Sign_Mai_HanAkat)
                {
                    fNeedEndingCluster = true;
                }
                // In Thai it isn't possible to make a sound if Sara E is
                // followed by one of these vowels / marks.
                else if (fHasSaraE &&
                         ( (*pszIndex == THAI_Vowel_Sara_II) ||
                           (*pszIndex == THAI_Tone_MaiTaiKhu) ||
                           (*pszIndex == THAI_Vowel_Sara_I)   ||
                           (*pszIndex == THAI_Sara_Uee) ))
                {
                    fNeedEndingCluster = true;
                }
                pszIndex++;
                iRetValue++;
            }

            while (pszIndex < pszEnd && IsThaiEndingClusterCharacter(*pszIndex))
            {
                pszIndex++;
                iRetValue++;
                fNeedEndingCluster = false;
            }

            // Including a trailing period in the cluster (Bug#57106) is
            // intentionally disabled.
        }

        // An unsatisfied "needs ending cluster" pulls in the next group;
        // the flag is cleared so this happens at most once per request.
        if (fNeedEndingCluster)
            fNeedEndingCluster = false;
        else
            break;
    }

    if (iRetValue == 0)
    {
        // Nothing was consumed, so pszIndex is still the (in-range) start.
        if (*pszIndex == 0x002e)
        {
            // O11.134455. Ellipsis case: combine the periods into one cluster.
            while (pszIndex < pszEnd && *pszIndex == 0x002e)
            {
                pszIndex++;
                iRetValue++;
            }
        }
        else
        {
            // The character is probably a punctuation.
            iRetValue++;
        }
    }

    return iRetValue;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiBreakTree
//
// Synopsis: Looks for an alternative way to split one word into Alt pieces.
//
//           The word is walked cluster by cluster through thaiTrieIter1.
//             Alt == 3 : at each word end, GetWeight() supplies the lengths
//                        of the second and third piece; succeeds when the
//                        three lengths add up to iWordLen.
//             Alt == 2 : same with two pieces.
//             other    : collects up to Alt pieces, preferring the result of
//                        GetLongestSubstring() when it is longer than the
//                        current match but shorter than the remaining word.
//
// Arguments:
//
//           pwchBegin - input string. (in)
//           iWordLen  - input string length. (in)
//           Alt       - number of pieces wanted. (in)
//           pBreakPos - receives the length of each piece; documented by the
//                       original author as always 5 bytes. (out)
//
// Returns:  number of entries written to pBreakPos, or 0 when no alternative
//           split was found.
//
// Modifies: pszEnd (set to pwchBegin + iWordLen), state of thaiTrieIter1.
//
// History: created 3/00 aarayas
//
// Notes:
//   TODO: Need to clean this code up.
//   NOTE(review): in the generic case nothing visible here limits the number
//   of entries written to the 5 bytes mentioned above when Alt > 5 - confirm
//   with the callers.
//
//----------------------------------------------------------------------------
int CThaiBreakTree::FindAltWord(WCHAR* pwchBegin,unsigned int iWordLen, BYTE Alt, BYTE* pBreakPos)
{
    unsigned int cchCluster   = 1;
    WCHAR*       pwchWord     = pwchBegin;     // start of the piece being built.
    WCHAR*       pwchScan     = pwchBegin;     // current scan position.
    bool         fStartOfWord = true;
    unsigned int cPieces      = 0;             // entries written to pBreakPos.
    unsigned int cchFirst     = 0;
    unsigned int cchSecond    = 0;
    unsigned int cchThird     = 0;

    pszEnd = pwchBegin + iWordLen;

    if (Alt == 3)
    {
        for (;;)
        {
            cchCluster = GetCluster(pwchScan);

            if (!thaiTrieIter1.MoveCluster(pwchScan, cchCluster, fStartOfWord))
                return cPieces;

            fStartOfWord = false;
            pwchScan += cchCluster;

            if (thaiTrieIter1.fWordEnd)
            {
                cchFirst = (unsigned int)(pwchScan - pwchWord);

                // Reached the end of the word: unable to find an alt word.
                if (cchFirst >= iWordLen)
                    return 0;

                cchSecond = GetWeight(pwchScan);

                // Reached the end of the word: unable to find an alt word.
                if (cchFirst + cchSecond >= iWordLen)
                    return 0;

                cchThird = GetWeight(pwchScan + cchSecond);

                if (cchFirst + cchSecond + cchThird == iWordLen)
                {
                    pBreakPos[0] = (BYTE)cchFirst;
                    pBreakPos[1] = (BYTE)cchSecond;
                    pBreakPos[2] = (BYTE)cchThird;
                    return 3;
                }
            }

            if (pwchScan >= pszEnd)
                return cPieces;
        }
    }
    else if (Alt == 2)
    {
        for (;;)
        {
            cchCluster = GetCluster(pwchScan);

            if (!thaiTrieIter1.MoveCluster(pwchScan, cchCluster, fStartOfWord))
                return cPieces;

            fStartOfWord = false;
            pwchScan += cchCluster;

            if (thaiTrieIter1.fWordEnd)
            {
                cchFirst = (unsigned int)(pwchScan - pwchWord);

                // Reached the end of the word: unable to find an alt word.
                if (cchFirst >= iWordLen)
                    return 0;

                cchSecond = GetWeight(pwchScan);

                if (cchFirst + cchSecond == iWordLen)
                {
                    pBreakPos[0] = (BYTE)cchFirst;
                    pBreakPos[1] = (BYTE)cchSecond;
                    return 2;
                }
            }

            if (pwchScan >= pszEnd)
                return cPieces;
        }
    }
    else
    {
        // Alt == 1 and every other value.
        while (cPieces < Alt)
        {
            cchCluster = GetCluster(pwchScan);

            if (!thaiTrieIter1.MoveCluster(pwchScan, cchCluster, fStartOfWord))
                return cPieces;

            fStartOfWord = false;
            pwchScan += cchCluster;

            if (thaiTrieIter1.fWordEnd)
            {
                fStartOfWord = true;

                cchFirst = (unsigned int)(pwchScan - pwchWord);

                // Reached the end of the word: unable to find an alt word.
                if (cchFirst >= iWordLen)
                    return 0;

                cchSecond = GetLongestSubstring(pwchWord, iWordLen);

                // Prefer the longest substring when it is a longer, but
                // still partial, match.
                const bool fUseLongest = (cchSecond > cchFirst) && (cchSecond < iWordLen);
                const BYTE bPiece      = fUseLongest ? (BYTE) cchSecond : (BYTE) cchFirst;

                pBreakPos[cPieces] = bPiece;
                pwchWord += bPiece;
                iWordLen -= bPiece;
                cPieces++;
            }

            if (pwchScan >= pszEnd)
                return cPieces;
        }
    }

    return cPieces;
}
|