//+--------------------------------------------------------------------------- // // // CThaiBreakTree - class CThaiBreakTree // // History: // created 7/99 aarayas // // ©1999 Microsoft Corporation //---------------------------------------------------------------------------- #include "CThaiBreakTree.hpp" //+--------------------------------------------------------------------------- // // Function: ExtractPOS // // Synopsis: The functions takes a tag and return Part Of Speech Tags. // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- inline WCHAR ExtractPOS(DWORD dwTag) { return (WCHAR) ( (dwTag & iPosMask) >> iPosShift); } //+--------------------------------------------------------------------------- // // Function: ExtractFrq // // Synopsis: The functions takes a tag and return Frquency of words. // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- inline BYTE ExtractFrq(DWORD dwTag) { return (BYTE) ( (dwTag & 0x300) >> iFrqShift); } //+--------------------------------------------------------------------------- // // Function: DetermineFrequencyWeight // // Synopsis: The functions returns the frequency weight of a words. // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- inline void DetermineFrequencyWeight(BYTE frq, unsigned int* uiWeight) { switch (frq) { case frqpenInfrequent: (*uiWeight) -= 2; break; case frqpenSomewhat: (*uiWeight)--; break; case frqpenVery: (*uiWeight) += 2; break; case frqpenNormal: default: (*uiWeight)++; break; } } //+--------------------------------------------------------------------------- // // Function: DetermineFrequencyWeight // // Synopsis: The functions returns the frequency weight of a words. 
// // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- inline void DetermineFrequencyWeight(BYTE frq, DWORD* uiWeight) { switch (frq) { case frqpenInfrequent: (*uiWeight) -= 2; break; case frqpenSomewhat: (*uiWeight)--; break; case frqpenVery: (*uiWeight) += 2; break; case frqpenNormal: default: (*uiWeight)++; break; } } //+--------------------------------------------------------------------------- // // Class: CThaiTrieIter // // Synopsis: Constructor - initialize local variables // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- CThaiBreakTree::CThaiBreakTree() : iNodeIndex(0), iNumNode(0), pszBegin(NULL), pszEnd(NULL), breakTree(NULL), breakArray(NULL), tagArray(NULL), maximalMatchingBreakArray(NULL), maximalMatchingTAGArray(NULL), POSArray(NULL), maximalMatchingPOSArray(NULL) { // Allocate memory need for CThaiBreakTree. #if defined (NGRAM_ENABLE) breakTree = new ThaiBreakNode[MAXTHAIBREAKNODE]; #endif breakArray = new BYTE[MAXBREAK]; tagArray = new DWORD[MAXBREAK]; POSArray = new WCHAR[MAXBREAK]; } //+--------------------------------------------------------------------------- // // Class: CThaiTrieIter // // Synopsis: Destructor - clean up code // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- CThaiBreakTree::~CThaiBreakTree() { // Clean up all memory used. 
#if defined (NGRAM_ENABLE) if (breakTree) delete breakTree; if (maximalMatchingBreakArray) delete maximalMatchingBreakArray; if (maximalMatchingTAGArray) delete maximalMatchingTAGArray; if (maximalMatchingPOSArray) delete maximalMatchingPOSArray; #endif if (breakArray) delete breakArray; if (tagArray) delete tagArray; if (POSArray) delete POSArray; } //+--------------------------------------------------------------------------- // // Class: CThaiBreakTree // // Synopsis: Associate the class to the string. // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- #if defined (NGRAM_ENABLE) void CThaiBreakTree::Init(CTrie* pTrie, CTrie* pSentTrie, CTrie* pTrigramTrie) #else void CThaiBreakTree::Init(CTrie* pTrie, CTrie* pTrigramTrie) #endif { assert(pTrie != NULL); thaiTrieIter.Init(pTrie); thaiTrieIter1.Init(pTrie); #if defined (NGRAM_ENABLE) assert(pSentTrie != NULL); thaiSentIter.Init(pSentTrie); #endif assert(pTrigramTrie != NULL); thaiTrigramIter.Init(pTrigramTrie); } #if defined (NGRAM_ENABLE) //+--------------------------------------------------------------------------- // // Class: CThaiBreakTree // // Synopsis: reset iterator to top of the tree // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- inline void CThaiBreakTree::Reset() { iNodeIndex = 0; } //+--------------------------------------------------------------------------- // // Class: CThaiBreakTree // // Synopsis: Move to the next break. 
// // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- inline bool CThaiBreakTree::MoveNext() { iNodeIndex = breakTree[iNodeIndex].NextBreak; return (iNodeIndex != 0); } //+--------------------------------------------------------------------------- // // Class: CThaiBreakTree // // Synopsis: Move down to next level. // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- inline bool CThaiBreakTree::MoveDown() { iNodeIndex = breakTree[iNodeIndex].Down; return (iNodeIndex != 0); } //+--------------------------------------------------------------------------- // // Class: CThaiBreakTree // // Synopsis: create new node to position, and return index to the node. // // * return Unable to Create Node. // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- inline unsigned int CThaiBreakTree::CreateNode(int iPos, BYTE iBreakLen, DWORD dwTAG) { assert(iNumNode < MAXTHAIBREAKNODE); if (iNumNode >= MAXTHAIBREAKNODE) { return UNABLETOCREATENODE; } breakTree[iNumNode].iPos = iPos; breakTree[iNumNode].iBreakLen = iBreakLen; breakTree[iNumNode].dwTAG = dwTAG; breakTree[iNumNode].NextBreak = 0; breakTree[iNumNode].Down = 0; iNumNode++; return (iNumNode - 1); } //+--------------------------------------------------------------------------- // // Class: CThaiBreakTree // // Synopsis: Generate a Tree of possible break from the given string. // // * Note - false if there aren't enough memory to create node. // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- enum thai_parse_state { END_SENTENCE, // Reached the end of sentence. 
LONGEST_MATCH, // Longest possible matched. NOMATCH_FOUND, // Unable to find word. ERROR_OUTMEMORY, // Out of Memory. }; bool CThaiBreakTree::GenerateTree(WCHAR* pszBegin, WCHAR* pszEnd1) { // Declare and initialize local variables. unsigned int iIndexBreakTree = 0; unsigned int iPrevIndexBreakTree = 0; unsigned int iParentNode = 0; WCHAR* pszBeginWord = pszBegin; WCHAR* pszIndex = pszBegin; unsigned int iNumCluster = 1; unsigned int iNumLastCluster; unsigned int iWordLen = 0; unsigned int iNodeAnalyze = 0; thai_parse_state parseState = END_SENTENCE; bool fFoundMatch = false; bool fAddToNodeAnalyze = false; bool fDoneGenerateTree = false; pszEnd = pszEnd1; #if defined (_DEBUG) memset(breakTree,0,sizeof(ThaiBreakNode)*MAXTHAIBREAKNODE); #endif iNodeIndex = 0; iNumNode = 0; while (true) { // Reset Iterator for generating break for new word. fFoundMatch = false; thaiTrieIter.Reset(); if (iIndexBreakTree != 0) { while (true) { // If this is not the first node than set pszBeginWord after the last break. pszBeginWord = pszBegin + breakTree[iNodeAnalyze].iPos + breakTree[iNodeAnalyze].iBreakLen; fAddToNodeAnalyze = true; // Are we at the end of the sentence. if ( (pszBeginWord == pszEnd) || (breakTree[iNodeAnalyze].dwTAG == TAGPOS_PURGE) ) { iNodeAnalyze++; // Move to next node. if (iNodeAnalyze >= iNumNode) { fDoneGenerateTree = true; break; } } else break; } } pszIndex = pszBeginWord; iParentNode = iNodeAnalyze; if (fDoneGenerateTree) break; // Get next level of tree. 
while (TRUE) { iNumLastCluster = iNumCluster; iNumCluster = GetCluster(pszIndex); if (thaiTrieIter.MoveCluster(pszIndex, iNumCluster)) { pszIndex += iNumCluster; if (thaiTrieIter.fWordEnd) { fFoundMatch = true; // if first node add first node if (iIndexBreakTree == 0) { CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, thaiTrieIter.dwTag); iIndexBreakTree++; } else { if (fAddToNodeAnalyze) { fAddToNodeAnalyze = false; breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, thaiTrieIter.dwTag); // Determine if an error has occur. if (breakTree[iNodeAnalyze].NextBreak == UNABLETOCREATENODE) { breakTree[iNodeAnalyze].NextBreak = 0; parseState = ERROR_OUTMEMORY; break; } iPrevIndexBreakTree = breakTree[iNodeAnalyze].NextBreak; iNodeAnalyze++; } else { breakTree[iPrevIndexBreakTree].Down = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, thaiTrieIter.dwTag); // Determine if an error has occur. if (breakTree[iPrevIndexBreakTree].Down == UNABLETOCREATENODE) { breakTree[iPrevIndexBreakTree].Down = 0; parseState = ERROR_OUTMEMORY; break; } iPrevIndexBreakTree = iIndexBreakTree; } iIndexBreakTree++; } } if (pszIndex >= pszEnd) { assert(pszIndex <= pszEnd); // assert should never come up - if it appear likely bug in GetCluster funciton. parseState = END_SENTENCE; break; } } else { if (fFoundMatch) parseState = LONGEST_MATCH; else parseState = NOMATCH_FOUND; break; } } if (parseState == LONGEST_MATCH) { // We found a matched. assert(breakTree[iPrevIndexBreakTree].Down == 0); // at this point breakTree[iPreveIndexBreakTree].Down should equal null.(optimization note) if (breakTree[iParentNode].NextBreak != iPrevIndexBreakTree) { assert(breakTree[iPrevIndexBreakTree].dwTAG != TAGPOS_UNKNOWN); // shouldn't assert because the end node should ever be unknown. DeterminePurgeEndingSentence(pszBeginWord, breakTree[iParentNode].NextBreak); } } else if (parseState == NOMATCH_FOUND) { // Should mark node as unknown. 
if (fAddToNodeAnalyze) { fAddToNodeAnalyze = false; iWordLen = pszIndex - pszBeginWord; // Make sure we don't only have a cluster of text before making a node. if (iWordLen == 0) { // If we have an UNKNOWN word of one character only current node mark it as unknown. assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode breakTree[iNodeAnalyze].iBreakLen += iNumCluster; breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen); } else { if (breakTree[iNodeAnalyze].iBreakLen + iWordLen < 8) // The reason we are using 8 is because from corpora analysis // the average Thai word is about 7.732 characters. // TODO: We should add orthographic analysis here to get a better on boundary // of unknown word. { assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode breakTree[iNodeAnalyze].iBreakLen += iWordLen; breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen); } else { if (GetWeight(pszIndex - iNumLastCluster)) breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen - iNumLastCluster, TAGPOS_UNKNOWN); else breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen, TAGPOS_UNKNOWN); // Determine if an error has occur. if (breakTree[iNodeAnalyze].NextBreak == UNABLETOCREATENODE) { breakTree[iNodeAnalyze].NextBreak = 0; parseState = ERROR_OUTMEMORY; break; } iNodeAnalyze++; iIndexBreakTree++; } } } else { breakTree[iPrevIndexBreakTree].Down = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, TAGPOS_UNKNOWN); // Determine if an error has occur. if (breakTree[iPrevIndexBreakTree].Down == UNABLETOCREATENODE) { breakTree[iPrevIndexBreakTree].Down = 0; parseState = ERROR_OUTMEMORY; break; } iIndexBreakTree++; } } else if (parseState == END_SENTENCE) { // If we find ourself at the end of a sentence and no match. 
if (!fFoundMatch) { if (fAddToNodeAnalyze) { fAddToNodeAnalyze = false; iWordLen = pszIndex - pszBeginWord; // Make sure we don't only have a cluster of text before making a node. if (iWordLen == 0) { // If we have an UNKNOWN word of one character only current node mark it as unknown. assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode breakTree[iNodeAnalyze].iBreakLen += iNumCluster; breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen); } else { if (breakTree[iNodeAnalyze].iBreakLen + iWordLen < 8) // The reason we are using 8 is because from corpora analysis // the average Thai word is about 7.732 characters. // TODO: We should add orthographic analysis here to get a better on boundary // of unknown word. { assert(iNodeAnalyze == iParentNode); // Since we have a no match iNodeAnalyze better equal iParentNode breakTree[iNodeAnalyze].iBreakLen += iWordLen; breakTree[iNodeAnalyze].dwTAG = DeterminePurgeOrUnknown(iNodeAnalyze,breakTree[iNodeAnalyze].iBreakLen); } else { if (GetWeight(pszIndex - iNumLastCluster)) breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen - iNumLastCluster, TAGPOS_UNKNOWN); else breakTree[iNodeAnalyze].NextBreak = CreateNode(pszBeginWord - pszBegin, iWordLen, TAGPOS_UNKNOWN); // Determine if an error has occur. if (breakTree[iNodeAnalyze].NextBreak == UNABLETOCREATENODE) { breakTree[iNodeAnalyze].NextBreak = 0; parseState = ERROR_OUTMEMORY; break; } iNodeAnalyze++; iIndexBreakTree++; } } } else { breakTree[iPrevIndexBreakTree].Down = CreateNode(pszBeginWord - pszBegin, pszIndex - pszBeginWord, TAGPOS_UNKNOWN); // Determine if an error has occur. 
if (breakTree[iPrevIndexBreakTree].Down == UNABLETOCREATENODE) { breakTree[iPrevIndexBreakTree].Down = 0; parseState = ERROR_OUTMEMORY; break; } } iIndexBreakTree++; } // If the beginning of node the branch isn't equal to leaf node perphase it is possible to // do some ending optimization. else if (breakTree[iParentNode].NextBreak != iPrevIndexBreakTree) { assert(breakTree[iPrevIndexBreakTree].dwTAG != TAGPOS_UNKNOWN); // shouldn't assert because the end node should ever be unknown. DeterminePurgeEndingSentence(pszBeginWord, breakTree[iParentNode].NextBreak); } } else if ( (breakTree[iNodeAnalyze].iBreakLen == 0) || (parseState == ERROR_OUTMEMORY) ) break; } return (parseState != ERROR_OUTMEMORY); } //+--------------------------------------------------------------------------- // // Class: CThaiBreakTree // // Synopsis: Traverse all the tree and look for the least number of token. // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- bool CThaiBreakTree::MaximalMatching() { // If maximal matching break array has not been allocate, than allocate it. if (!maximalMatchingBreakArray) maximalMatchingBreakArray = new BYTE[MAXBREAK]; if (!maximalMatchingTAGArray) maximalMatchingTAGArray = new DWORD[MAXBREAK]; if (!maximalMatchingPOSArray) maximalMatchingPOSArray = new WCHAR[MAXBREAK]; maxLevel = MAXUNSIGNEDINT; maxToken = 0; iNumUnknownMaximalPOSArray = MAXBREAK; Traverse(0,0,0); return true; } //+--------------------------------------------------------------------------- // // Class: CThaiBreakTree // // Synopsis: The function determine if the node if the node should, // be tag as unknown or purge. 
// // Arguments: // // Modifies: // // History: created 8/99 aarayas // // Notes: // //---------------------------------------------------------------------------- inline DWORD CThaiBreakTree::DeterminePurgeOrUnknown(unsigned int iCurrentNode, unsigned int iBreakLen) { // Declare and initialize local variables. unsigned int iNode = breakTree[iCurrentNode].Down; while (iNode != 0) { if ( (breakTree[iNode].iBreakLen == iBreakLen) || (breakTree[iNode].iBreakLen < iBreakLen) && ( (breakTree[iNode].dwTAG != TAGPOS_UNKNOWN) || (breakTree[iNode].dwTAG != TAGPOS_PURGE) )) { // Since we are purging this break just make sure the NextBreak is Null. assert(breakTree[iCurrentNode].NextBreak == 0); return TAGPOS_PURGE; } iNode = breakTree[iNode].Down; } return TAGPOS_UNKNOWN; } //+--------------------------------------------------------------------------- // // Class: CThaiBreakTree // // Synopsis: Ending optimization - if we have found the end of a sentence, // and possible break. Purge the branch for unnecessary break. // // Arguments: // // Modifies: // // History: created 8/99 aarayas // // Notes: // //---------------------------------------------------------------------------- inline void CThaiBreakTree::DeterminePurgeEndingSentence(WCHAR* pszBeginWord, unsigned int iNode) { while (breakTree[iNode].Down != 0) { // Determine if the next string has a possiblity to become a word. // TODO: We may need to change this once the GetWeight add soundex // functionality. if (GetWeight(pszBeginWord + breakTree[iNode].iBreakLen) == 0) { // Since we are purging this break just make sure the NextBreak is Null. 
assert(breakTree[iNode].NextBreak == 0); breakTree[iNode].dwTAG = TAGPOS_PURGE; } iNode = breakTree[iNode].Down; } } #endif //+--------------------------------------------------------------------------- // // Class: CThaiBreakTree // // Synopsis: // // Arguments: // // Modifies: // // History: created 8/99 aarayas // // Notes: // //---------------------------------------------------------------------------- unsigned int CThaiBreakTree::GetLongestSubstring(WCHAR* pszBegin, unsigned int iWordLen) { // Declare and initialize local variables. unsigned int iNumCluster = 1; unsigned int lastWeight = 0; unsigned int Weight = 0; bool fBeginNewWord; WCHAR* pszIndex = pszBegin; // Short circuit the length is less of string is less than 1. if ((pszEnd - pszBegin) == 1) return Weight; else if (pszEnd == pszBegin) return 1000; // Reset Iterator for generating break for new word. fBeginNewWord = true; // Get next level of tree. while (true) { iNumCluster = GetCluster(pszIndex); if (thaiTrieIter.MoveCluster(pszIndex, iNumCluster, fBeginNewWord)) { fBeginNewWord = false; pszIndex += iNumCluster; if (thaiTrieIter.fWordEnd) { lastWeight = Weight; Weight = (unsigned int) (pszIndex - pszBegin); } } else { if ((Weight == iWordLen) && (lastWeight < Weight) && (lastWeight > 0)) { Weight = lastWeight; } break; } } return Weight; } //+--------------------------------------------------------------------------- // // Class: CThaiBreakTree // // Synopsis: // // Arguments: // // Modifies: // // History: created 8/99 aarayas // // Notes: // //---------------------------------------------------------------------------- unsigned int CThaiBreakTree::GetWeight(WCHAR* pszBegin) { // Declare and initialize local variables. unsigned int iNumCluster = 1; unsigned int Weight = 0; bool fBeginNewWord; WCHAR* pszIndex = pszBegin; // Short circuit the length is less of string is less than 1. 
if ((pszEnd - pszBegin) == 1) return Weight; else if (pszEnd == pszBegin) return 1000; // Reset Iterator for generating break for new word. fBeginNewWord = true; // Get next level of tree. while (true) { iNumCluster = GetCluster(pszIndex); if (thaiTrieIter.MoveCluster(pszIndex, iNumCluster, fBeginNewWord)) { fBeginNewWord = false; pszIndex += iNumCluster; if (thaiTrieIter.fWordEnd) Weight = (unsigned int) (pszIndex - pszBegin); } else break; } return Weight; } //+--------------------------------------------------------------------------- // // Class: CThaiBreakTree // // Synopsis: // // Arguments: // // Modifies: // // History: created 8/99 aarayas // // Notes: // //---------------------------------------------------------------------------- unsigned int CThaiBreakTree::GetWeight(WCHAR* pszBegin, DWORD* pdwTag) { // Declare and initialize local variables. unsigned int iNumCluster = 1; unsigned int Weight = 0; bool fBeginNewWord; WCHAR* pszIndex = pszBegin; // Short circuit the length is less of string is less than 1. if ((pszEnd - pszBegin) == 1) return Weight; else if (pszEnd == pszBegin) return 1000; // Reset Iterator for generating break for new word. fBeginNewWord = true; // Get next level of tree. while (true) { iNumCluster = GetCluster(pszIndex); if (thaiTrieIter.MoveCluster(pszIndex, iNumCluster, fBeginNewWord)) { fBeginNewWord = false; pszIndex += iNumCluster; if (thaiTrieIter.fWordEnd) { Weight = (unsigned int) (pszIndex - pszBegin); *pdwTag = thaiTrieIter.dwTag; } } else break; } return Weight; } //+--------------------------------------------------------------------------- // // Class: CThaiBreakTree // // Synopsis: Traverse the tree. // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- bool CThaiBreakTree::Traverse(unsigned int iLevel, unsigned int iCurrentNode, unsigned int iNumUnknown) { assert (iLevel < MAXBREAK); // Process node. 
breakArray[iLevel] = breakTree[iCurrentNode].iBreakLen; tagArray[iLevel] = breakTree[iCurrentNode].dwTAG; if (tagArray[iLevel] == TAGPOS_UNKNOWN) iNumUnknown++; // Have we found the end of the sentence. if (breakTree[iCurrentNode].NextBreak == 0) { if (breakTree[iCurrentNode].dwTAG != TAGPOS_PURGE) AddBreakToList(iLevel + 1, iNumUnknown); if (breakTree[iCurrentNode].Down != 0) { if (tagArray[iLevel] == TAGPOS_UNKNOWN) iNumUnknown--; return Traverse(iLevel,breakTree[iCurrentNode].Down, iNumUnknown); } else return true; } else Traverse(iLevel + 1, breakTree[iCurrentNode].NextBreak, iNumUnknown); if (breakTree[iCurrentNode].Down != 0) { if (tagArray[iLevel] == TAGPOS_UNKNOWN) iNumUnknown--; Traverse(iLevel,breakTree[iCurrentNode].Down, iNumUnknown); } return true; } //+--------------------------------------------------------------------------- // // Class: CThaiBreakTree // // Synopsis: // // Arguments: // // Modifies: // // History: created 8/99 aarayas // // Notes: // //---------------------------------------------------------------------------- unsigned int CThaiBreakTree::SoundexSearch(WCHAR* pszBegin) { // Declare and initialize local variables. unsigned int iNumCluster = 1; unsigned int iNumNextCluster = 1; unsigned int iLongestWord = 0; unsigned int iPenalty = 0; WCHAR* pszIndex = pszBegin; // Short circuit the length is less of string is less than 1. if ( (pszBegin+1) >= pszEnd ) return iLongestWord; // Reset Iterator for generating break for new word. thaiTrieIter1.Reset(); // Get next level of tree. while (true) { iNumCluster = GetCluster(pszIndex); // Determine iNumNextCluster let iNumNextCluster = 0, if we reached the end of string. 
if (pszIndex + iNumCluster >= pszEnd) iNumNextCluster = 0; else iNumNextCluster = GetCluster(pszIndex+iNumCluster); // Determine penalty switch (thaiTrieIter1.MoveSoundexByCluster(pszIndex, iNumCluster, iNumNextCluster)) { case SUBSTITUTE_SOUNDLIKECHAR: iPenalty += 2; break; case SUBSTITUTE_DIACRITIC: iPenalty++; break; case UNABLE_TO_MOVE: iPenalty += 2; break; case STOP_MOVE: iPenalty += 1000; break; default: case NOSUBSTITUTE: break; } // Update Index. if (iPenalty <= 2) { pszIndex += iNumCluster; if (thaiTrieIter1.fWordEnd) iLongestWord = (unsigned int) (pszIndex - pszBegin); } else break; } return iLongestWord; } //+--------------------------------------------------------------------------- // // Class: CThaiBreakTree // // Synopsis: The information used here is a reference to the orthographic // analysis work done on the Thai languages. (see paper: Natural // Language Processing in Thailand 1993 Chulalongkorn. p 361). // // Arguments: pszBoundaryChar - Contain pointer to at least two thai character // character next to each other which we will // use to calculate wheather we should or // should not merge the two word. // // iPrevWordLen - // // Modifies: // // History: created 8/99 aarayas // // Notes: // //---------------------------------------------------------------------------- inline bool CThaiBreakTree::ShouldMerge(const WCHAR* pwszPrevWord, unsigned int iPrevWordLen, unsigned int iMergeWordLen, DWORD dwPrevTag) { const WCHAR* pwszBoundary = pwszPrevWord + iPrevWordLen - 1; assert(iMergeWordLen != 0); assert(iPrevWordLen != 0); // There are very few words in Thai that are 4 character or less, therefore we should // found a pair that less than 4 character we should merge. // Or if merge word length is one than also merge. // Of if last cluster of the word is a Thanthakhat(Karan) we should always merge. 
if (iPrevWordLen + iMergeWordLen <= 4 || iMergeWordLen == 1 || (iMergeWordLen == 2 && *(pwszBoundary + iMergeWordLen) == THAI_Thanthakhat)) return true; if (iPrevWordLen >=2) { const WCHAR* pwszPrevCharBoundary = pwszBoundary - 1; // TO IMPROVE: It better to check the last character of Previous word, it can give us a // much better guess if ((*pwszPrevCharBoundary == THAI_Vowel_Sign_Mai_HanAkat || *pwszBoundary == THAI_Vowel_Sign_Mai_HanAkat) || (*pwszPrevCharBoundary == THAI_Tone_Mai_Tri || *pwszBoundary == THAI_Tone_Mai_Tri) || (*pwszPrevCharBoundary == THAI_Sara_Ue || *pwszBoundary == THAI_Sara_Ue) ) return true; } // If the first character of the next word is mostly likly the beginning // character and last character of the previous word is not sara-A than // we have a high probability that we found a begin of word boundary, // therefore we shouldn't merge. if ( (IsThaiMostlyBeginCharacter(pwszBoundary[1]) && *pwszBoundary != THAI_Vowel_Sara_A) ) return false; // If the last character of the previous word is mostly likely an ending // character than, than there is a high probability that the found a boundary. // There are very few words in Thai that are 4 character or less, therefore we should // found a pair that less than 4 character we should merge. if (IsThaiMostlyLastCharacter(*pwszBoundary)) return false; // O10.192931 Adding Diacritic check rules. We might want to expand this to more diacritic // for now Mai HanAkart would do. It is highly unlikely that a word contain more than 1 of Mai HanAkart diacritic. 
if (IsContain(pwszPrevWord,iPrevWordLen,THAI_Vowel_Sign_Mai_HanAkat) && IsContain(pwszBoundary + 1,iMergeWordLen,THAI_Vowel_Sign_Mai_HanAkat))
    return false;

if (iMergeWordLen == 3 && GetCluster(pwszBoundary + 1) == iMergeWordLen)
{
    if (*(pwszBoundary + 2) == THAI_Vowel_Sara_I)
    {
        if (*(pwszBoundary+3) == THAI_Tone_Mai_Ek || *(pwszBoundary+3) == THAI_Tone_Mai_Tro)
            return false;
    }
}

// if previous tag is equal to Title Noun than the next word is highly likly to be a name.
if (ExtractPOS(dwPrevTag) == 6)
    return false;

// O11.134455. For the case of trailling punctuation.
if (dwPrevTag == TAGPOS_PUNC && iMergeWordLen > 1 && iPrevWordLen > 1)
    return false;

// The reason we are using 8 is because from corpora analysis
// the average Thai word is about 7.732 characters. Or, if previous word is already
// an unknown, to keep the amount of unknown low the unknown to previous words.
if ( (iPrevWordLen + iMergeWordLen < 8) || (dwPrevTag == TAGPOS_UNKNOWN) )
    return true;

return false;
}

//+---------------------------------------------------------------------------
//
//  Class:      CThaiBreakTree
//
//  Function:   AddBreakToList
//
//  Synopsis:   Offers the segmentation currently held in breakArray/tagArray
//              as a candidate.  When CompareSentenceStructure() accepts it,
//              the candidate becomes the new "maximal matching" result.
//
//  Arguments:  iNumBreak   - number of entries in breakArray/tagArray.
//              iNumUnknown - unknown count of the candidate; passed straight
//                            to CompareSentenceStructure().
//
//  Modifies:   maxToken, maxLevel, maximalMatchingBreakArray and
//              maximalMatchingTAGArray (only when the candidate is accepted).
//              In _DEBUG builds breakArray[iNumBreak] is zeroed first.
//
//  History:    created 7/99 aarayas
//              8/17/99 optimize some code.
//
//  Notes:      The first iNumBreak entries are copied and both destination
//              arrays are then zero terminated at index iNumBreak.
//
//----------------------------------------------------------------------------
inline void CThaiBreakTree::AddBreakToList(unsigned int iNumBreak, unsigned int iNumUnknown)
{
#if defined (_DEBUG)
    breakArray[iNumBreak] = 0;
#endif
    if (CompareSentenceStructure(iNumBreak, iNumUnknown))
    {
        maxToken = maxLevel = iNumBreak;                    // This is ugly but it saves 5 clock cycles.
        // breakArray holds BYTEs, so the byte count equals the element count.
        memcpy(maximalMatchingBreakArray,breakArray,maxToken);
        memcpy(maximalMatchingTAGArray,tagArray,sizeof(DWORD)*maxToken);
        // Zero terminate the copies.
        maximalMatchingBreakArray[maxToken] = 0;
        maximalMatchingTAGArray[maxToken] = 0;
    }
}

//+---------------------------------------------------------------------------
//
//  Class:      CThaiBreakTree
//
//  Function:   CompareSentenceStructure
//
//  Synopsis:   Decides whether the candidate segmentation (breakArray/tagArray)
//              should replace the stored maximal matching segmentation
//              (maximalMatchingTAGArray).
//
//  Arguments:  iNumBreak           - number of breaks in the candidate.
//              iNumUnknownPOSArray - unknown count of the candidate.
//
//  Returns:    true when the candidate wins:
//                1. it has fewer breaks than maxLevel and no more unknowns
//                   than the stored result, or
//                2. it has exactly maxLevel breaks and
//                   a. strictly fewer unknowns, or
//                   b. its POS sequence passes IsSentenceStruct() while the
//                      stored one does not, or
//                   c. the unknown counts are equal and its summed frequency
//                      weight is strictly larger.
//              false otherwise.
//
//  Modifies:   iNumUnknownMaximalPOSArray (cases 1, 2a, 2b);
//              POSArray and maximalMatchingPOSArray, indices 0..iNumBreak
//              (whenever case 2a does not return early).
//
//  History:    created 7/99 aarayas
//
//  Notes:      The loops run to i <= iNumBreak, i.e. they also touch the entry
//              one past the last break.
//
//----------------------------------------------------------------------------
inline bool CThaiBreakTree::CompareSentenceStructure(unsigned int iNumBreak, unsigned int iNumUnknownPOSArray)
{
    if ( (iNumBreak < maxLevel) && (iNumUnknownMaximalPOSArray >= iNumUnknownPOSArray) )
    {
        iNumUnknownMaximalPOSArray = iNumUnknownPOSArray;
        return true;
    }
    else if (iNumBreak == maxLevel)
    {
        // true - maximal matching has a larger unknown.
        if (iNumUnknownMaximalPOSArray > iNumUnknownPOSArray)
        {
            iNumUnknownMaximalPOSArray = iNumUnknownPOSArray;
            return true;
        }

        // Reduce both tag arrays to plain part-of-speech sequences.
        for(unsigned int i = 0; i <= iNumBreak; i++)
        {
            maximalMatchingPOSArray[i] = ExtractPOS(maximalMatchingTAGArray[i]);
            POSArray[i] = ExtractPOS(tagArray[i]);
        }

        // Determine if the sentence structure is like any one of the
        // sentence structures in our corpora.
        if ( (IsSentenceStruct(POSArray, iNumBreak)) && (!IsSentenceStruct(maximalMatchingPOSArray, iNumBreak)) )
        {
            iNumUnknownMaximalPOSArray = iNumUnknownPOSArray;
            return true;
        }
        else if (iNumUnknownMaximalPOSArray == iNumUnknownPOSArray)
        {
            // Determine the frequency of word used in the sentence.
            // Both sums start from the same base of 500 and are adjusted per
            // word by DetermineFrequencyWeight(); only their order matters.
            unsigned int iFrequencyArray = 500;
            unsigned int iFrequencyMaximalArray = 500;

            for(unsigned int i = 0; i <= iNumBreak; i++)
            {
                DetermineFrequencyWeight(ExtractFrq(maximalMatchingTAGArray[i]),&iFrequencyMaximalArray);
                DetermineFrequencyWeight(ExtractFrq(tagArray[i]),&iFrequencyArray);
            }
            return (iFrequencyArray > iFrequencyMaximalArray);
        }
    }
    return false;
}

//+---------------------------------------------------------------------------
//
//  Class:      CThaiBreakTree
//
//  Function:   IsSentenceStruct
//
//  Synopsis:   Walks the thaiSentIter trie to test whether the part-of-speech
//              sequence pos[0 .. iPosLen-1] ends on a node that is flagged
//              fWordEnd.
//
//  Arguments:  pos     - part-of-speech sequence to look up.
//              iPosLen - number of entries of pos to match.
//
//  Returns:    TRUE when the whole sequence was matched and the last matched
//              node has fWordEnd set, FALSE otherwise.
//
//  Modifies:   thaiSentIter (reset and moved).
//
//  History:    created 8/99 aarayas
//
//  Notes:      NOTE(review): pos[0] is read before iPosLen is examined, so a
//              call with iPosLen == 0 still dereferences pos - confirm that
//              callers never pass 0.
//
//----------------------------------------------------------------------------
bool CThaiBreakTree::IsSentenceStruct(const WCHAR* pos, unsigned int iPosLen)
{
    // Declare and initialize all local variables.
    unsigned int i = 0;             // number of entries of pos matched so far.

    thaiSentIter.Reset();

    if (!thaiSentIter.Down())
        return FALSE;

    while (TRUE)
    {
        thaiSentIter.GetNode();
        if (thaiSentIter.pos == pos[i])
        {
            i++;
            if (thaiSentIter.fWordEnd && i == iPosLen)
            {
                return TRUE;
            }
            // Whole sequence consumed but the node is not a sentence end.
            else if (i == iPosLen) break;
            // Move down the Trie Branch.
            else if (!thaiSentIter.Down()) break;
        }
        // Move right of the Trie Branch
        else if (!thaiSentIter.Right()) break;
    }
    return FALSE;
}

//+---------------------------------------------------------------------------
//
//  Class:      CThaiBreakTree
//
//  Function:   BigramProbablity
//
//  Synopsis:   Scores how well the word tagged dwTag2 follows the word tagged
//              dwTag1.  The score starts at 4; when neither tag is
//              TAGPOS_UNKNOWN a bonus is added for the first matching
//              part-of-speech pair in the table below; finally the frequency
//              weight of each tag is applied with DetermineFrequencyWeight().
//
//  Arguments:  dwTag1 - tag of the first word.
//              dwTag2 - tag of the second word.
//
//  Returns:    The integer score converted to float.
//
//  Modifies:   Nothing.
//
//  History:    created 8/99 aarayas
//
//  Notes:      The part-of-speech values are compared as raw numbers; the
//              names in the comments (NCMN, VATT, ...) are the only mapping
//              visible in this file.
//
//----------------------------------------------------------------------------
float CThaiBreakTree::BigramProbablity(DWORD dwTag1,DWORD dwTag2)
{
    unsigned int iWeight = 4;

    // TODO : Use the distribution of word category to determine optimal search - example
    //        NOUN VERB ADVERB CLASSIFIER CONJUNCTION PREP etc....
    // TODO : Once we got trigram use it to create bigram probability as well.
    if ( (dwTag1 != TAGPOS_UNKNOWN) && (dwTag2 != TAGPOS_UNKNOWN) )
    {
        WCHAR pos1 = ExtractPOS(dwTag1);
        WCHAR pos2 = ExtractPOS(dwTag2);

        // case NCMN VATT
        // a common noun is often followed by attributive verb(adjective)
        // Example: (In Thai) book good, people nice
        if (pos1 == 5 && pos2 == 13)
            iWeight += 10;
        // case NTTL NPRP
        // a title noun is often followed by proper noun
        // Example: Dr. Athapan, Mr. Sam
        else if (pos1 == 6 && pos2 == 1)
            iWeight += 5;
        // case JSBR (XVAM || VSTA)
        // a subordinating conjunction is often followed by preverb auxiliary or Active verb
        // Example: (In Thai) Because of , Because see
        else if (pos1 == 39 && (pos2 == 15 || pos2 == 12))
            iWeight += 10;
        // case ADVN NCMN
        // an Adverb normal form is often followed by Common noun (Bug 55057).
        // Example: (In Thai) under table.
        else if (pos1 == 28 && pos2 == 5)
            iWeight += 5;
        // case VACT XVAE
        else if (pos1 == 11 && pos2 == 18)
            iWeight += 5;
        // case VACT DDBQ
        // Active verb followed by Definite determiner.
        // Example: (In Thai) working for, singing again.
        else if (pos1 == 11 && pos2 == 21)
            iWeight += 10;
        // case VATT VACT
        // adjectives are followed by verb.
        // Example: (In Thai keyboard)sivd;jk
        else if (pos1 == 13 && pos2 == 11)
            iWeight += 2;
        // case XVAE VACT
        // a post verb auxiliary is often followed by an active verb.
        // Example: (In Thai) come singing, go work.
        else if (pos1 == 18 && pos2 == 11)
            iWeight += 10;
        // case CLTV NCMN
        // a Collective classifier is often followed by Common Noun
        // Example: (In Thai) group people, flock bird
        else if (pos1 == 33 && pos2 == 5)
            iWeight += 5;
        // case NEG (VACT || VSTA || VATT || XVAM || XVAE)
        // a negator (ie. not) is often followed by some kind of VERB.
        // Example: He is not going.
        // NOTE(review): the last alternative tests 16, while the VACT/XVAE
        // cases above compare XVAE against 18 - confirm which value is meant.
        else if (pos1 == 46 && (pos2 == 11 || pos2 == 12 || pos2 == 13 || pos2 == 15 || pos2 == 16))
            iWeight += 8;
        // case EAFF or EITT
        // Ending for affirmative, and interrogative are more often ending of the pair
        // Example: (In Thai) Krub, Ka,
        // (pos1 is not examined for this case.)
        else if (pos2 == 44 || pos2 == 45)
            iWeight += 3;
        // case VATT and VATT
        // Attributive Verb and Attributive Verb occur often in spoken languages.
        // Example: she is really really cute.
        else if (pos1 == 13 && pos2 == 13)
            iWeight += 2;
        // case NCMN and DDAC
        // Common Noun and Definitive determiner classifier.
        // Example: Food here (Thai)
        else if (pos1 == 5 && pos2 == 20)
            iWeight += 3;
        // case CMTR and JCMP
        // Measurement classifier and Comparative conjunction, are likely to appear in Thai.
        // Example: year about (Thai) -> English about a year.
        else if (pos1 == 34 && pos2 == 38)
            iWeight += 5;
        // case XVBB and VACT
        else if (pos1 == 17 && pos2 == 11)
            iWeight += 5;
        // case NCMN and NCMN
        // Common Noun and Common Noun
        // Example: electric bulb(in thai)
        else if (pos1 == 5 && pos2 == 5)
            iWeight += 1;
    }
    // The frequency adjustment is applied even when a tag is TAGPOS_UNKNOWN.
    DetermineFrequencyWeight(ExtractFrq(dwTag1), &iWeight);
    DetermineFrequencyWeight(ExtractFrq(dwTag2), &iWeight);
    return (float) iWeight;
}

//+---------------------------------------------------------------------------
//
//  Class:      CThaiBreakTree
//
//  Function:   TrigramProbablity
//
//  Synopsis:   Scores the three word sequence tagged dwTag1, dwTag2, dwTag3.
//              The score starts at 6; when none of the tags is TAGPOS_UNKNOWN
//              and all three parts of speech are non-zero, the value returned
//              by thaiTrigramIter.GetProb() for the POS triple is added;
//              finally the frequency weight of each tag is applied with
//              DetermineFrequencyWeight().
//
//  Arguments:  dwTag1, dwTag2, dwTag3 - tags of the three words, in order.
//
//  Returns:    The resulting score.
//
//  Modifies:   Whatever thaiTrigramIter.GetProb() modifies (not visible here).
//
//  History:    created 8/99 aarayas
//
//  Notes:
//
//----------------------------------------------------------------------------
DWORD CThaiBreakTree::TrigramProbablity(DWORD dwTag1,DWORD dwTag2,DWORD dwTag3)
{
    DWORD iWeight = 6;

    if ( (dwTag1 != TAGPOS_UNKNOWN) && (dwTag2 != TAGPOS_UNKNOWN) && (dwTag3 != TAGPOS_UNKNOWN) )
    {
        WCHAR pos1 = ExtractPOS(dwTag1);
        WCHAR pos2 = ExtractPOS(dwTag2);
        WCHAR pos3 = ExtractPOS(dwTag3);

        // Optimization: if any POS is none, skip the trigram search.
        if ( pos1 != 0 && pos2 != 0 && pos3 != 0)
        {
            // Zero terminated POS triple handed to the trigram iterator.
            WCHAR posArray[4];
            posArray[0] = pos1;
            posArray[1] = pos2;
            posArray[2] = pos3;
            posArray[3] = 0;
            iWeight += thaiTrigramIter.GetProb(posArray);
        }
    }

    DetermineFrequencyWeight(ExtractFrq(dwTag1), &iWeight);
    DetermineFrequencyWeight(ExtractFrq(dwTag2), &iWeight);
    DetermineFrequencyWeight(ExtractFrq(dwTag3), &iWeight);

    // We reached zero probablity.
    return (DWORD)iWeight;
}

//+---------------------------------------------------------------------------
//
//  Class:      CThaiBreakTree
//
//  Function:   TrigramBreak
//
//  Synopsis:   Breaks the text starting at pwchBegin into words.  For every
//              word its length is written to breakArray and its tag to
//              tagArray.
//
//              Each round walks thaiTrieIter cluster by cluster (GetCluster /
//              MoveCluster) from the current word start and collects every
//              dictionary word ending in nextBreakArray/nextTagArray.
//                - One candidate: it is taken, or appended to the previous
//                  break for the abbreviation / ending-sign cases.
//                - Several candidates: the one with the best combination of
//                  GetWeight() of the following text and
//                  BigramProbablity()/TrigramProbablity() is taken.
//                - No candidate: abbreviation rules, SoundexSearch() and
//                  ShouldMerge() decide whether the text is attached to
//                  earlier breaks or emitted as TAGPOS_UNKNOWN.
//
//  Arguments:  pwchBegin - first character to break.
//              pwchEnd1  - position at which scanning stops; stored in pszEnd.
//
//  Returns:    Number of entries written to breakArray/tagArray.
//
//  Modifies:   pszEnd, breakArray, tagArray, POSArray[0], thaiTrieIter.
//
//  History:    created 8/99 aarayas
//
//  Notes:      Lengths are narrowed to BYTE when stored.
//              NOTE(review): no check of iBreakIndex or iNextBreakIndex
//              against MAXBREAK is visible in this function - confirm that
//              callers bound the input length.
//
//----------------------------------------------------------------------------
unsigned int CThaiBreakTree::TrigramBreak(WCHAR* pwchBegin, WCHAR* pwchEnd1)
{
    // Declare and initialize local variables.
    WCHAR* pwchBeginWord = pwchBegin;       // start of the word being broken.
    WCHAR* pwchIndex = pwchBegin;           // scan position inside that word.
    unsigned int iWordLen;
    unsigned int iNumCluster = 1;           // size of the cluster just read.
    unsigned int iNumLastCluster;           // size of the cluster read before it.
    unsigned int iBreakIndex = 0;           // next free slot in breakArray/tagArray.
    BYTE nextBreakArray[MAXBREAK];          // candidate word lengths for this round.
    DWORD nextTagArray[MAXBREAK];           // tags of those candidates.
    unsigned int iNextBreakIndex;           // index for array nextBreakArray and nextTagArray.
    bool fFoundMatch;
    unsigned int iWeight;
    unsigned int iSumWeight;
    unsigned int iPrevWeight;
    unsigned int iCurrWeight;
    BYTE iSoundexWordLen;
    DWORD iPrevProbability;
    DWORD iCurrentProbability;
    DWORD dwTagTemp;
    DWORD dwLastTag;                        // assigned below but never read in this function.
    int i;                                  // temporary int for use as need.
    bool fBeginNewWord;
    bool fEndWord = false;

    pszEnd = pwchEnd1;

    breakArray[0] = 0;
    POSArray[0] = 0;
    tagArray[0] = 0;
    nextBreakArray[0] = 0;
    nextTagArray[0] = 0;

    while (true)
    {
        // Reset Iterator for generating break for new word.
        fFoundMatch = false;
        fBeginNewWord = true;

        // Get begin word string for next round of word break.
        pwchIndex = pwchBeginWord;
        iNextBreakIndex = 0;

        if (pwchIndex == pszEnd)
            break;

        // Walk the trie one cluster at a time, recording every position at
        // which a dictionary word ends.
        while(true)
        {
            iNumLastCluster = iNumCluster;
            iNumCluster = GetCluster(pwchIndex);

            if (!thaiTrieIter.MoveCluster(pwchIndex, iNumCluster, fBeginNewWord))
            {
                if ((iNumCluster == 0) && (pwchIndex == pszEnd))
                    fEndWord = true;
                else
                    break;
            }

            fBeginNewWord = false;
            pwchIndex += iNumCluster;

            if (thaiTrieIter.fWordEnd)
            {
                if (thaiTrieIter.m_fThaiNumber)
                {
                    // If we have Thai number accumulate it as one break.
                    // (Slot 0 is overwritten each time, so only the longest run is kept.)
                    assert(iNumCluster == 1);
                    fFoundMatch = true;
                    nextBreakArray[0]= (BYTE)(pwchIndex - pwchBeginWord);
                    nextTagArray[0] = TAGPOS_NCNM;
                    iNextBreakIndex = 1;
                }
                else
                {
                    fFoundMatch = true;
                    nextBreakArray[iNextBreakIndex] = (BYTE)(pwchIndex - pwchBeginWord);
                    nextTagArray[iNextBreakIndex] = thaiTrieIter.dwTag;
                    iNextBreakIndex++;
                }

                if (pwchIndex >= pszEnd)
                {
                    // A word ends exactly at the end of the text: emit the
                    // last (longest) candidate and finish.
                    assert(pwchIndex <= pszEnd);            // assert should never come up - if it appears, likely a bug in the GetCluster function.
                    assert(iNextBreakIndex != 0);

                    if ( iNumCluster == 1 && *(pwchIndex - 1) == L'.' && iBreakIndex > 0 && iNextBreakIndex == 1 && tagArray[iBreakIndex - 1] == TAGPOS_ABBR )
                    {
                        // backtrack one if we have abbreviation case.
                        // ex. B.K.K. (in Thai). (more info O11.145042.)
                        breakArray[iBreakIndex - 1] += nextBreakArray[iNextBreakIndex - 1];
                        return iBreakIndex;
                    }

                    breakArray[iBreakIndex] = nextBreakArray[iNextBreakIndex - 1];
                    tagArray[iBreakIndex] = nextTagArray[iNextBreakIndex - 1];
                    return (++iBreakIndex);
                }
            }
            else if ((pwchIndex >= pszEnd && iNextBreakIndex == 0) || fEndWord)
            {
                // End of text reached without any dictionary word: the
                // remainder becomes (part of) an unknown word.
                assert(pwchIndex <= pszEnd);                // assert should never come up - if it appears, likely a bug in the GetCluster function.
                iWordLen = (unsigned int) (pwchIndex - pwchBeginWord);
                switch (iWordLen)
                {
                case 0:
                    if (iBreakIndex > 0)
                    {
                        // if We have a length of one character add it to previous node.
                        breakArray[iBreakIndex - 1] += (BYTE) iNumCluster;
                        tagArray[iBreakIndex - 1] = TAGPOS_UNKNOWN;
                    }
                    else
                    {
                        // if this is the first break create a new break.
                        breakArray[iBreakIndex] = (BYTE) iNumCluster;
                        tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
                        iBreakIndex++;
                    }
                    break;
                case 1:
                    if (iBreakIndex > 0)
                    {
                        // if We have a length of one character add it to previous node.
                        breakArray[iBreakIndex - 1] += (BYTE) iWordLen;
                        tagArray[iBreakIndex - 1] = TAGPOS_UNKNOWN;
                    }
                    else
                    {
                        // if this is the first break create a new break.
                        breakArray[iBreakIndex] = (BYTE) iWordLen;
                        tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
                        iBreakIndex++;
                    }
                    break;
                default:
                    if ( iBreakIndex > 0 && ShouldMerge(pwchBeginWord - breakArray[iBreakIndex - 1], breakArray[iBreakIndex - 1], iWordLen , tagArray[iBreakIndex - 1]) )
                    {
                        breakArray[iBreakIndex - 1] += (BYTE) iWordLen;
                        tagArray[iBreakIndex - 1] = TAGPOS_UNKNOWN;
                    }
                    else
                    {
                        breakArray[iBreakIndex] = (BYTE) iWordLen;
                        tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
                        iBreakIndex++;
                    }
                }
                return iBreakIndex;
            }
            else if (pwchIndex >= pszEnd)
            {
                // O10.229346. If we get here we are at the end of word or end of sentence,
                // We will need to decide what to do depending on if we found the word or not.
                break;
            }
        }

        if (fFoundMatch)        // Longest Matching.
        {
            // If we only found one break, then say it is the maximum.
            if (1 == iNextBreakIndex)
            {
                if ( nextBreakArray[0] == 2 && iNumCluster + iNumLastCluster == 2 && iBreakIndex > 0 && *(pwchBeginWord+1) == L'.' && tagArray[iBreakIndex - 1] == TAGPOS_ABBR )
                {
                    // backtrack one if we have abbreviation case.
                    // ex. B.K.K. (in Thai). (more info O11.145042.)
                    breakArray[iBreakIndex - 1] += nextBreakArray[0];
                    pwchBeginWord += nextBreakArray[0];
                }
                else if ( iBreakIndex > 0 && IsThaiEndingSign(*pwchBeginWord) && iNumCluster == 1 )
                {
                    // The candidate starts with an ending sign: append it to the previous break.
                    breakArray[iBreakIndex - 1] += nextBreakArray[0];
                    pwchBeginWord += nextBreakArray[0];
                }
                else
                {
                    breakArray[iBreakIndex] = nextBreakArray[0];
                    tagArray[iBreakIndex] = nextTagArray[0];
                    pwchBeginWord += breakArray[iBreakIndex];           // update begin word for next round.
                    iBreakIndex++;
                }
            }
            else
            {
                // Several candidates: pick one by weight and probability.
                bool fWeightCompare = false;
                iSumWeight = 0;
                iPrevWeight = 0;
                iCurrWeight = 0;
                iPrevProbability = 0;
                iCurrentProbability = 0;
                dwLastTag = TAGPOS_UNKNOWN;
                tagArray[iBreakIndex] = TAGPOS_UNKNOWN;

                // Candidates are examined from the longest to the shortest.
                for (i = (iNextBreakIndex - 1); i >= 0 ; i--)
                {
                    if ( iBreakIndex == 0)
                    {
                        // No previous word: only a bigram with the following word is possible.
                        iWeight = GetWeight(pwchBeginWord + nextBreakArray[i], &dwTagTemp);

                        if (iWeight != 0)
                            // Bigram Probability
                            iCurrentProbability = (DWORD)BigramProbablity(nextTagArray[i], dwTagTemp);
                    }
                    else
                    {
                        iWeight = GetWeight(pwchBeginWord + nextBreakArray[i], &dwTagTemp);

                        if (iBreakIndex == 1)
                            // Get Trigram Probability.
                            iCurrentProbability = TrigramProbablity(tagArray[iBreakIndex - 1], nextTagArray[i], dwTagTemp);
                        else if (iBreakIndex >= 2)
                        {
                            // Get Trigram Probability.
                            iCurrentProbability = TrigramProbablity(tagArray[iBreakIndex - 2], tagArray[iBreakIndex - 1], nextTagArray[i]);
                            if (iWeight != 0)
                                iCurrentProbability += (DWORD)BigramProbablity(nextTagArray[i],dwTagTemp);
                        }
                    }

                    fWeightCompare = false;
                    iCurrWeight = iWeight + nextBreakArray[i];

                    if (iPrevProbability == 0 && (iCurrWeight+1) == iSumWeight && iCurrentProbability > 5)
                    {
                        fWeightCompare = true;
                    }
                    else if (iCurrWeight == iSumWeight && ( Maximum(iWeight,nextBreakArray[i]) <= iPrevWeight || iCurrentProbability > iPrevProbability))
                    {
                        fWeightCompare = true;
                    }
                    else if ( iWeight >= iPrevWeight - 1 && iPrevProbability > 0 && iPrevProbability < 10 && iCurrentProbability > iPrevProbability * 5000 )
                    {
                        // O11.187913. We'll trust our trigram data more if the current probability is
                        // so much greater than previous probability.
                        //
                        // * Note: we could probably use one of GA algorithm to get better value than 5K.
                        fWeightCompare = true;
                    }

                    // Store the string with the best maximum weight; if the pair is equal
                    // store the string with maxim
                    if ( iCurrWeight > iSumWeight || fWeightCompare)
//                        ( (iCurrWeight == iSumWeight) &&
//                          ( (Maximum(iWeight,nextBreakArray[i]) <= iPrevWeight) || (iCurrentProbability > iPrevProbability) ) ))
                    {
                        if (iCurrentProbability >= iPrevProbability || iSumWeight < iCurrWeight)
                        {
                            iSumWeight = Maximum(iWeight,1) + nextBreakArray[i];
                            iPrevWeight = Maximum(iWeight,nextBreakArray[i]);
                            breakArray[iBreakIndex] = nextBreakArray[i];
                            tagArray[iBreakIndex] = nextTagArray[i];
                            iPrevProbability = iCurrentProbability;
                            dwLastTag = dwTagTemp;
                        }
                    }
                }
                pwchBeginWord += breakArray[iBreakIndex];               // update begin word for next round.
                iBreakIndex++;
            }
        }
        else
        {
            // NOMATCH_FOUND
            iWordLen = (unsigned int)(pwchIndex - pwchBeginWord);
            if (iBreakIndex > 0)
            {
                i = iBreakIndex - 1;        // set i to previous break
                if (iWordLen == 0)
                {
                    // Not even the first cluster matched the dictionary.
                    if (iNumCluster == 1 && *pwchBeginWord == L',' && IsThaiChar(*(pwchBeginWord-breakArray[i])) )
                    {
                        // We should not merge comma into the word, only merge comma to
                        // Number.
                        // TODO: Should add TAGPOS_PUNCT.
                        breakArray[iBreakIndex] = (BYTE) iNumCluster;
                        tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
                        pwchBeginWord += (BYTE) iNumCluster;            // update begin word for next round.
                        iBreakIndex++;
                    }
                    else if (iNumCluster > 1 && *pwchBeginWord == L'.')
                    {
                        // O11.134455. This is an ellipsis case; we shouldn't merge this string.
                        breakArray[iBreakIndex] = (BYTE) iNumCluster;
                        tagArray[iBreakIndex] = TAGPOS_PUNC;
                        pwchBeginWord += (BYTE) iNumCluster;            // update begin word for next round.
                        iBreakIndex++;
                    }
                    else if (ShouldMerge(pwchBeginWord - breakArray[i], breakArray[i], iNumCluster, tagArray[i]))
                    {
                        // If word length is null use the cluster add to previous node.
                        breakArray[i] += (BYTE) iNumCluster;
                        tagArray[i] = TAGPOS_UNKNOWN;
                        pwchBeginWord += iNumCluster;                   // update begin word for next round.
                    }
                    else
                    {
                        // Add the unknown word to list.
                        breakArray[iBreakIndex] = (BYTE) iNumCluster;
                        tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
                        pwchBeginWord += (BYTE) iNumCluster;            // update begin word for next round.
                        iBreakIndex++;
                    }
                }
                else
                {
                    // Try checking for abbreviations.
                    // NOTE(review): pwchIndex[1] is read without a comparison
                    // against pszEnd - confirm the buffer extends far enough.
                    if (iWordLen == 1 && iNumCluster == 2 && pwchIndex[1] == L'.')
                    {
                        // The word is an abbreviated word.
                        // TODO: #1. Add TAGPOS_ABBRV.
                        // TODO: #2. May need to add rules code abbreviated word with 3 letters.
                        breakArray[iBreakIndex] = iWordLen + iNumCluster;
                        tagArray[iBreakIndex] = TAGPOS_ABBR;
                        pwchBeginWord += breakArray[iBreakIndex];
                        iBreakIndex++;
                    }
                    else if (iWordLen == 1 && tagArray[i] == TAGPOS_ABBR && *(pwchBeginWord+1) == L'.' && IsThaiConsonant(*pwchBeginWord) && pwchBeginWord+1 < pszEnd )
                    {
                        // O11.145042. This is the case where we are a , the
                        // likelihood is the character is also an abbreviation.
                        breakArray[iBreakIndex - 1] += iWordLen + 1;
                        pwchBeginWord += iWordLen + 1;
                    }
                    // Abbreviations are usually 3 characters.
                    else if ( iWordLen == 2 && IsThaiConsonant(*(pwchBeginWord+2)) && *(pwchBeginWord+3) == L'.' && tagArray[i] != TAGPOS_UNKNOWN )
                    {
                        // O11.80619. This is the case where we are a
                        breakArray[iBreakIndex] = iWordLen + 1;
                        tagArray[iBreakIndex] = TAGPOS_ABBR;
                        pwchBeginWord += breakArray[iBreakIndex];
                        iBreakIndex++;
                    }
                    // Perhaps a misspelled word; try to use sounding to spell the words.
                    // Try soundex two words back.
                    else if ( (iBreakIndex >= 2) && ( (iSoundexWordLen = (BYTE) SoundexSearch(pwchBeginWord - breakArray[i] - breakArray[i - 1])) > (BYTE) (breakArray[i] + breakArray[i - 1]) ) && GetWeight(pwchBeginWord - breakArray[i] - breakArray[i - 1] + iSoundexWordLen) )
                    {
                        // Resize the word.
                        // The two previous breaks are replaced by the single soundex match.
                        pwchBeginWord = (pwchBeginWord - breakArray[i] - breakArray[i - 1]) + iSoundexWordLen;      // update begin word for next round.
                        breakArray[i - 1] = iSoundexWordLen;
                        tagArray[i - 1] = thaiTrieIter.dwTag;
                        iBreakIndex--;      // Decrement iBreakIndex.
                    }
                    // Try soundex one word back.
                    else if (((iSoundexWordLen = (BYTE) SoundexSearch(pwchBeginWord - breakArray[i])) > (BYTE) breakArray[i]) && GetWeight(pwchBeginWord - breakArray[i] + iSoundexWordLen) && ExtractPOS(tagArray[i]) != 6)    // Make sure that previous word is not a NTTL.
                    {
                        // Resize the word
                        pwchBeginWord = (pwchBeginWord - breakArray[i]) + iSoundexWordLen;      // update begin word for next round.
                        breakArray[i] = iSoundexWordLen;
                        tagArray[i] = thaiTrieIter.dwTag;
                    }
                    // Try soundex on this word.
                    else if (((iSoundexWordLen = (BYTE) SoundexSearch(pwchBeginWord)) > (BYTE) iWordLen) && GetWeight(pwchBeginWord + iSoundexWordLen) )
                    {
                        // Resize the word.
                        breakArray[iBreakIndex] = iSoundexWordLen;
                        tagArray[iBreakIndex] = thaiTrieIter.dwTag;
                        pwchBeginWord += iSoundexWordLen;               // update begin word for next round.
                        iBreakIndex++;
                    }
                    else if ( ShouldMerge(pwchBeginWord - breakArray[i], breakArray[i], iWordLen , tagArray[i]) )
                    {
                        // Merge the words.
                        breakArray[i] += (BYTE) iWordLen;
                        tagArray[i] = TAGPOS_UNKNOWN;
                        pwchBeginWord += iWordLen;                      // update begin word for next round.
                    }
                    else
                    {
                        // Add the unknown word to list.
                        breakArray[iBreakIndex] = (BYTE) iWordLen;
                        tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
                        pwchBeginWord += iWordLen;                      // update begin word for next round.
                        iBreakIndex++;
                    }
                }
            }
            else
            {
                // Add unknown word to list and mark it.
                // (This is the first break, so there is nothing to merge with.)
                if (iWordLen == 0)
                {
                    // Word length is null: use the cluster as the first break.
                    breakArray[iBreakIndex] = (BYTE) iNumCluster;
                    tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
                    pwchBeginWord += iNumCluster;                       // update begin word for next round.
                }
                else
                {
                    // If we are here there are 2 cases that can happen:
                    // 1. We take too little into our unknown.
                    // 2. We take too much into our unknown word.

                    // Have we taken too little? Check if this unknown word is an abbreviated word.
                    if (iWordLen == 1 && iNumCluster == 2 && pwchIndex[1] == L'.')
                        breakArray[iBreakIndex] = iWordLen + iNumCluster;
                    // Try to see if we are taking too much, see if we can get a Weight from last cluster.
                    // NOTE(review): both operands are unsigned, so the first test is true
                    // whenever iWordLen != iNumLastCluster; "iWordLen > iNumLastCluster"
                    // was presumably intended - confirm.
                    else if ( (iWordLen - iNumLastCluster > 0) && GetWeight(pwchIndex - iNumLastCluster) )
                    {
                        breakArray[iBreakIndex] = iWordLen - iNumLastCluster;
                        if (breakArray[iBreakIndex] == 1)
                        {
                            iWeight = GetWeight(pwchIndex - iNumLastCluster);
                            if (iWeight > iNumLastCluster && iWeight < 40)
                                breakArray[iBreakIndex] += (BYTE) iWeight;
                            else
                                breakArray[iBreakIndex] += (BYTE) iNumLastCluster;
                        }
                    }
                    // We may have a case of iWordLen is 1 and iNumCluster, we have a case of misspelled
                    // an extra character is incorrectly inserted over a correct word.
                    else if (iWordLen == 1)
                    {
                        iWeight = GetWeight(pwchIndex - iWordLen);
                        if (iWeight > iNumCluster && iWeight < 40)
                            breakArray[iBreakIndex] = iWordLen + iWeight;
                        else
                            breakArray[iBreakIndex] = iWordLen + iNumCluster;
                    }
                    else
                        breakArray[iBreakIndex] = (BYTE) iWordLen;

                    // Two clusters with a period right after the first one are tagged as an abbreviation.
                    if (iNumLastCluster + iNumCluster == iWordLen && *(pwchBeginWord+iNumLastCluster) == L'.')
                    {
                        tagArray[iBreakIndex] = TAGPOS_ABBR;
                    }
                    else
                        tagArray[iBreakIndex] = TAGPOS_UNKNOWN;
                    pwchBeginWord += breakArray[iBreakIndex];           // update begin word for next round.
                }
                iBreakIndex++;
            }
        }
    }
    return iBreakIndex;
}

//+---------------------------------------------------------------------------
//
//  Class:      CThaiBreakTree
//
//  Function:   Soundex
//
//  Synopsis:   Forwards the request to thaiTrieIter.Soundex().
//
//  Arguments:  word - string handed unchanged to thaiTrieIter.Soundex().
//
//  Returns:    Whatever thaiTrieIter.Soundex() returns.
//
//  Modifies:
//
//  History:    created 8/99 aarayas
//
//  Notes:
//
//----------------------------------------------------------------------------
int CThaiBreakTree::Soundex(WCHAR* word)
{
    return thaiTrieIter.Soundex(word);
}

//+---------------------------------------------------------------------------
//
//  Function:   GetCluster
//
//  Synopsis:   The function return the next number of character which represent
//              a cluster of Thai text.
//
//              ie. Kor Kai, Kor Kai -> 1
//                  Kor Kai, Sara Um -> 2
//
//              * Note this function will not return no more than 3 character,
//                for cluster as this would represent invalid sequence of character.
// // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- unsigned int CThaiBreakTree::GetCluster(const WCHAR* pszIndex) { bool fHasSaraE; int iRetValue = 0; bool fNeedEndingCluster = false; if (pszIndex == pszEnd) return 0; while (true) { fHasSaraE= false; // Take all begin cluster character. while (IsThaiBeginClusterCharacter(*pszIndex)) { if (*pszIndex == THAI_Vowel_Sara_E) fHasSaraE = true; pszIndex++; iRetValue++; } if (IsThaiConsonant(*pszIndex)) { pszIndex++; iRetValue++; while (IsThaiUpperAndLowerClusterCharacter(*pszIndex)) { // Mai Han Akat is a special type of cluster that will need at lease // one ending cluster. if (*pszIndex == THAI_Vowel_Sign_Mai_HanAkat) fNeedEndingCluster = true; // In Thai it isn't possible to make a sound if we have the SaraE // following by vowel below vowel. else if ( fHasSaraE && ( (*pszIndex == THAI_Vowel_Sara_II) || (*pszIndex == THAI_Tone_MaiTaiKhu) || (*pszIndex == THAI_Vowel_Sara_I) || (*pszIndex == THAI_Sara_Uee) )) fNeedEndingCluster = true; pszIndex++; iRetValue++; } while (IsThaiEndingClusterCharacter(*pszIndex)) { pszIndex++; iRetValue++; fNeedEndingCluster = false; } /* // Include period as part of a cluster. Bug#57106 if (*pszIndex == 0x002e) { pszIndex++; iRetValue++; fNeedEndingCluster = false; } */ } if (fNeedEndingCluster) fNeedEndingCluster = false; else break; } if (iRetValue == 0) { // O11.134455. Ellipse case we go to combine ellipses to one cluster. if (*pszIndex == 0x002e) { while (*pszIndex == 0x002e && pszIndex <= pszEnd) { pszIndex++; iRetValue++; } } else iRetValue++; // The character is probably a punctuation. } if (pszIndex > pszEnd) { // We need to do this as we have gone over end buff boundary. 
iRetValue -= (int) (pszIndex - pszEnd); pszIndex = pszEnd; } return iRetValue; } //+--------------------------------------------------------------------------- // // Class: CThaiBreakTree // // Synopsis: // // Arguments: // // wzWord - input string. (in) // iWordLen - input string length. (in) // Alt - find close alternate word (in) // pBreakPos - array of break position allways 5 byte. (out) // // Modifies: // // History: created 3/00 aarayas // // Notes: // //---------------------------------------------------------------------------- int CThaiBreakTree::FindAltWord(WCHAR* pwchBegin,unsigned int iWordLen, BYTE Alt, BYTE* pBreakPos) { // Declare and initialize local variables. unsigned int iNumCluster = 1; WCHAR* pwchBeginWord = pwchBegin; WCHAR* pwchIndex = pwchBegin; bool fBeginNewWord = true; unsigned int iBreakIndex = 0; unsigned int iBreakTemp = 0; unsigned int iBreakTemp1 = 0; unsigned int iBreakTemp2 = 0; pszEnd = pwchBegin + iWordLen; // TODO: Need to clean this code up. switch(Alt) { case 3: while (true) { iNumCluster = GetCluster(pwchIndex); if (!thaiTrieIter1.MoveCluster(pwchIndex, iNumCluster, fBeginNewWord)) return iBreakIndex; fBeginNewWord = false; pwchIndex += iNumCluster; if (thaiTrieIter1.fWordEnd) { iBreakTemp = (unsigned int)(pwchIndex - pwchBeginWord); // reached the end of word unable to find alt word. if (iBreakTemp >= iWordLen) return 0; iBreakTemp1 = GetWeight(pwchIndex); // reached the end of word unable to find alt word. 
if (iBreakTemp + iBreakTemp1 >= iWordLen) return 0; iBreakTemp2 = GetWeight(pwchIndex+iBreakTemp1); if (iBreakTemp + iBreakTemp1 + iBreakTemp2 == iWordLen) { pBreakPos[0] = (BYTE)iBreakTemp; pBreakPos[1] = (BYTE)iBreakTemp1; pBreakPos[2] = (BYTE)iBreakTemp2; return 3; } } if (pwchIndex >= pszEnd) return iBreakIndex; } break; case 2: while (true) { iNumCluster = GetCluster(pwchIndex); if (!thaiTrieIter1.MoveCluster(pwchIndex, iNumCluster, fBeginNewWord)) return iBreakIndex; fBeginNewWord = false; pwchIndex += iNumCluster; if (thaiTrieIter1.fWordEnd) { iBreakTemp = (unsigned int)(pwchIndex - pwchBeginWord); // reached the end of word unable to find alt word. if (iBreakTemp >= iWordLen) return 0; iBreakTemp1 = GetWeight(pwchIndex); if (iBreakTemp + iBreakTemp1 == iWordLen) { pBreakPos[0] = (BYTE)iBreakTemp; pBreakPos[1] = (BYTE)iBreakTemp1; return 2; } } if (pwchIndex >= pszEnd) return iBreakIndex; } break; default: case 1: while (iBreakIndex < Alt) { iNumCluster = GetCluster(pwchIndex); if (!thaiTrieIter1.MoveCluster(pwchIndex, iNumCluster, fBeginNewWord)) return iBreakIndex; fBeginNewWord = false; pwchIndex += iNumCluster; if (thaiTrieIter1.fWordEnd) { fBeginNewWord = true; iBreakTemp = (unsigned int)(pwchIndex - pwchBeginWord); // reached the end of word unable to find alt word. if (iBreakTemp >= iWordLen) return 0; iBreakTemp1 = GetLongestSubstring(pwchBeginWord,iWordLen); if (iBreakTemp1 > iBreakTemp && iBreakTemp1 < iWordLen) pBreakPos[iBreakIndex] = (BYTE) iBreakTemp1; else pBreakPos[iBreakIndex] = (BYTE) iBreakTemp; pwchBeginWord += pBreakPos[iBreakIndex]; iWordLen -= pBreakPos[iBreakIndex]; iBreakIndex++; } if (pwchIndex >= pszEnd) return iBreakIndex; } break; } return iBreakIndex; }