//+--------------------------------------------------------------------------- // // // CThaiWordBreak // // History: // created 7/99 aarayas // // ©1999 Microsoft Corporation //---------------------------------------------------------------------------- #include "cthwb.hpp" //+--------------------------------------------------------------------------- // // Function: ExtractALT // // Synopsis: The functions takes a tag and return Alternate Tags. // // Arguments: // // Modifies: // // History: created 3/00 aarayas // // Notes: // //---------------------------------------------------------------------------- inline BYTE ExtractALT(DWORD dwTag) { return (BYTE) ( (dwTag & iAltMask) >> iAltShift); } //+--------------------------------------------------------------------------- // // Class: CThaiWordBreak // // Synopsis: constructor // // Arguments: // // Modifies: // // History: created 8/00 aarayas // // Notes: // //---------------------------------------------------------------------------- CThaiWordBreak::CThaiWordBreak() { wordCount[0] = 0; } //+--------------------------------------------------------------------------- // // Class: CThaiWordBreak // // Synopsis: destructor // // Arguments: // // Modifies: // // History: created 8/00 aarayas // // Notes: // //---------------------------------------------------------------------------- CThaiWordBreak::~CThaiWordBreak() { wordCount[0] = 0; #if defined (_DEBUG) assert(listWordBreak.length == 0); #endif } //+--------------------------------------------------------------------------- // // Class: CThaiWordBreak // // Synopsis: Initialize ThaiWordBreak. // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- PTEC CThaiWordBreak::Init(const WCHAR* wzFileName, const WCHAR* wzFileNameTrigram) { // Declare and Initialize local variables. 
PTEC retValue = m_trie.Init(wzFileName); if (retValue == ptecNoErrors) { retValue = m_trie_trigram.Init(wzFileNameTrigram); } // new memory management listWordBreak.Init(&m_trie,&m_trie_trigram); for (int i = 0; i < 10; i++) { listWordBreak.CreateWordBreak(); } return retValue; } //+--------------------------------------------------------------------------- // // Class: CThaiWordBreak // // Synopsis: Initialize ThaiWordBreak. // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- PTEC CThaiWordBreak::InitRc(LPBYTE pThaiDic, LPBYTE pThaiTrigram, BOOL fSkipHeader) { // Declare and Initialize local variables. PTEC retValue = m_trie.InitRc(pThaiDic, fSkipHeader); if (retValue == ptecNoErrors) retValue = m_trie_trigram.InitRc(pThaiTrigram, fSkipHeader); // new memory management listWordBreak.Init(&m_trie,&m_trie_trigram); for (int i = 0; i < 10; i++) { listWordBreak.CreateWordBreak(); } return retValue; } //+--------------------------------------------------------------------------- // // Class: CThaiWordBreak // // Synopsis: UnInitialize ThaiWordBreak. 
// // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- void CThaiWordBreak::UnInit() { // new memory management listWordBreak.Flush(); m_trie.UnInit(); #if defined (NGRAM_ENABLE) m_trie_sentence_struct.UnInit(); #endif m_trie_trigram.UnInit(); } //+--------------------------------------------------------------------------- // // Class: CThaiWordBreak // // Synopsis: // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- enum merge_direction { NO_MERGE, MERGE_RIGHT, MERGE_LEFT, MERGE_BOTH_DIRECTIONS, NOT_SURE_WHICH_DIRECTION }; merge_direction DetermineMergeDirection(WCHAR wc) { if (wc == 0x0020) // space return NO_MERGE; else if ( wc == 0x0022 || // quotation mark wc == 0x0027 ) // apostrophe return NOT_SURE_WHICH_DIRECTION; else if ( wc == 0x0028 || // left parenthesis wc == 0x003C || // less than sign wc == 0x005B || // left square bracket wc == 0x007B || // left curly bracket wc == 0x2018 || // left single quotation mark wc == 0x201C || // left double quotation mark wc == 0x201F ) // left double quotation mark reverse return MERGE_RIGHT; // TODO: need to add MERGE_BOTH_DIRECTIONS for character joiner characters. // all other character merge left. 
return MERGE_LEFT; } //+--------------------------------------------------------------------------- // // Class: CThaiWordBreak // // Synopsis: // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- DWORD_PTR CThaiWordBreak::CreateWordBreaker() { CThaiBreakTree* breakTree = NULL; breakTree = new CThaiBreakTree(); #if defined (NGRAM_ENABLE) if (breakTree) breakTree->Init(&m_trie, &m_trie_sentence_struct, &m_trie_trigram); #else if (breakTree) breakTree->Init(&m_trie, &m_trie_trigram); #endif return (DWORD_PTR)breakTree; } //+--------------------------------------------------------------------------- // // Class: CThaiWordBreak // // Synopsis: // // Arguments: // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- bool CThaiWordBreak::DeleteWordBreaker(DWORD_PTR dwBreaker) { CThaiBreakTree* breakTree = (CThaiBreakTree*) dwBreaker; if (breakTree) { delete breakTree; return true; } return false; } //+--------------------------------------------------------------------------- // // Class: CThaiWordBreak // // Synopsis: This funciton segment Thai word use for Indexing. // // Arguments: // wzString - input string. (in) // iStringLen - input string length. (in) // pBreakPos - array of break position. (out) // pThwb_Struct - array structure of THWB. (out) // iBreakMax - length of pBreakPos and // pThwb_Struct. (out) // // Modifies: // // History: created 3/00 aarayas // // Notes: // //---------------------------------------------------------------------------- int CThaiWordBreak::IndexWordBreak(WCHAR* wzString,unsigned int iStringLen, BYTE* pBreakPos,THWB_STRUCT* pThwb_Struct,unsigned int iBreakMax) { unsigned int iBreakIndex = 0; // Contain number of Breaks. 
CThaiBreakTree* breakTree = NULL; breakTree = new CThaiBreakTree(); if (breakTree) { breakTree->Init(&m_trie, &m_trie_trigram); iBreakIndex = FindWordBreak((DWORD_PTR)breakTree,wzString,iStringLen,pBreakPos,iBreakMax,WB_INDEX,true,pThwb_Struct); delete breakTree; } return iBreakIndex; } //+--------------------------------------------------------------------------- // // Class: CThaiWordBreak // // Synopsis: // // Arguments: // // wzWord - input string. (in) // iWordLen - input string length. (in) // Alt - find close alternate word (in) // pBreakPos - array of break position allways 5 byte. (out) // // Modifies: // // History: created 3/00 aarayas // // Notes: // //---------------------------------------------------------------------------- int CThaiWordBreak::FindAltWord(WCHAR* wzWord,unsigned int iWordLen, BYTE Alt, BYTE* pBreakPos) { unsigned int iBreakIndex = 0; // Contain number of Breaks. CThaiBreakTree* breakTree = NULL; breakTree = new CThaiBreakTree(); if (breakTree) { breakTree->Init(&m_trie, &m_trie_trigram); iBreakIndex = breakTree->FindAltWord(wzWord,iWordLen,Alt,pBreakPos); delete breakTree; } return iBreakIndex; } //+--------------------------------------------------------------------------- // // Class: CThaiWordBreak // // Synopsis: This funciton segment Thai text segment them depending on the modes specifies. // // WB_LINEBREAK - is used when the application needs to break for line wrapping, // this mode takes into the consideration of punctuations. // // WB_NORMAL - is used when application wants determine word for searching, // autocorrect, etc. // // WB_SPELLER - not yet implemented, but same as normal with additional soundex // rules. // // Arguments: // // wzString - input string. (in) // iStringLen - input string length. (in) // pBreakPos - array of break position. 
(out) // iBreakMax - length of pBreakPos (out) // mode - either WB_LINEBREAK, etct (in) // fFastWordBreak - true for fast algorithm (in) // // Modifies: // // History: created 7/99 aarayas // // Notes: // //---------------------------------------------------------------------------- int CThaiWordBreak::FindWordBreak(WCHAR* wzString,unsigned int iStringLen, BYTE* pBreakPos,unsigned int iBreakMax, BYTE mode, bool fFastWordBreak) { unsigned int iBreakIndex = 0; // Contain number of Breaks. CThaiBreakTree* breakTree = NULL; #if defined(OLD) breakTree = new CThaiBreakTree(); #else // new memory management WordBreakElement* pWordBreakElement = NULL; pWordBreakElement = listWordBreak.GetFreeWB(); breakTree = pWordBreakElement->breakTree; listWordBreak.MarkWordBreak(pWordBreakElement,false); // Mark word break as in use. #endif if (breakTree) { #if defined(OLD) breakTree->Init(&m_trie, &m_trie_trigram); assert(mode != WB_INDEX); // If this assert come up, use function IndexWordBreak iBreakIndex = FindWordBreak((DWORD_PTR)breakTree,wzString,iStringLen,pBreakPos,iBreakMax,mode,fFastWordBreak,0); delete breakTree; #else iBreakIndex = FindWordBreak((DWORD_PTR)breakTree,wzString,iStringLen,pBreakPos,iBreakMax,mode,fFastWordBreak,0); listWordBreak.MarkWordBreak(pWordBreakElement,true); // Mark word break as free. #endif } else { assert(false); } return iBreakIndex; } //+--------------------------------------------------------------------------- // // Class: CThaiWordBreak // // Synopsis: This funciton segment Thai text segment them depending on the modes specifies. // // WB_LINEBREAK - is used when the application needs to break for line wrapping, // this mode takes into the consideration of punctuations. // // WB_NORMAL - is used when application wants determine word for searching, // autocorrect, etc. // // WB_SPELLER - not yet implemented, but same as normal with additional soundex // rules. // // WB_INDEX - is used when application wanted to do Thai indexing. 
//
//
//  Arguments:
//
//      dwBreaker       - CThaiBreakTree pointer carried as a
//                        DWORD_PTR (it is cast back below).    (in)
//      wzString        - input string.                         (in)
//      iStringLen      - input string length.                  (in)
//      pBreakPos       - array of break position.              (out)
//      iBreakMax       - length of pBreakPos                   (in)
//                        must be greater than 1.
//                        (the code itself only rejects 0.)
//      mode            - either WB_LINEBREAK, etc.             (in)
//      fFastWordBreak  - true for fast algorithm; only read
//                        when NGRAM_ENABLE is defined.         (in)
//      pThwb_Struct    - array structure of THWB; must be
//                        non-NULL in WB_INDEX mode.            (out)
//
//  Returns:    number of entries written to pBreakPos; 0 when wzString or
//              pBreakPos is NULL, when iBreakMax is 0, or when mode is
//              WB_INDEX and pThwb_Struct is NULL.
//
//  Modifies:
//
//  History:    created 11/99 aarayas
//
//  Notes:      Each pBreakPos entry is a segment length in characters; the
//              _DEBUG block at the end checks that the lengths add up to
//              iStringLen when the output array did not fill up.
//
//              NOTE(review): in this copy of the file the bodies of the
//              loops that copy breakTree results into pBreakPos are
//              truncated (text is missing between "iBreakIndex" and the
//              following array name). They are reproduced as found and
//              must be restored from a pristine copy of the source.
//
//----------------------------------------------------------------------------
int CThaiWordBreak::FindWordBreak(DWORD_PTR dwBreaker, WCHAR* wzString,unsigned int iStringLen, BYTE* pBreakPos,unsigned int iBreakMax, BYTE mode, bool fFastWordBreak, THWB_STRUCT* pThwb_Struct)
{
    // Declare and Initialize all local variables.
    WCHAR* pwszRunStart = wzString;                 // start of the segment being measured
    const WCHAR* pwszMax = wzString + iStringLen;   // one past the last input character
    WCHAR* pwch = wzString;                         // scan cursor
    bool fThaiRun = true;
    bool fCaretBreak = false;                       // set only for WB_CARETBREAK
    int iRunCount = 0;                              // characters consumed in the current run
    unsigned int i = 0;
    unsigned int iBreakIndex = 0; // Contain number of Breaks.
    merge_direction dirPrevious = NO_MERGE;
    merge_direction dirCurrent = NO_MERGE;
    CThaiBreakTree* breakTree = (CThaiBreakTree*) dwBreaker;

    // check for possible invalid arguments.
    assert(wzString != NULL);
    assert(iBreakMax > 0);
    assert(pBreakPos != NULL);
    if ((wzString == NULL) || (iBreakMax == 0) || (pBreakPos == NULL))
        return 0;

    switch (mode)
    {
    case WB_LINEBREAK:
    case 2: // to be compatible with old api.
        do
        {
            // Consume punctuation / delimiters one character at a time, either
            // opening a new one-character segment or growing the previous one,
            // according to the character's merge direction.
            // NOTE(review): *pwch is evaluated before the pwch < pwszMax test,
            // so the character at pwszMax is read once the end is reached -
            // confirm the buffer is always readable there.
            while ((TWB_IsCharPunctW(*pwch) || TWB_IsCharWordDelimW(*pwch)) && iBreakIndex < iBreakMax && pwch < pwszMax)
            {
                dirCurrent = DetermineMergeDirection(*pwch);
                switch (dirCurrent)
                {
                case NO_MERGE:
                    if ( pwch + 1 < pwszMax && *(pwch + 1) == THAI_Vowel_MaiYaMok && iBreakIndex > 0)
                    {
                        // Mai Ya Mok case only.
                        // Both this character and the following Mai Ya Mok are
                        // added to the previous segment.
                        pBreakPos[iBreakIndex - 1] += 2;
                        dirCurrent = MERGE_LEFT;
                        pwch++;
                    }
                    else
                        pBreakPos[iBreakIndex++] = 1;
                    break;
                case MERGE_RIGHT:
                    // NOTE(review): the last two branches are identical, so the
                    // TWB_IsCharPunctW test has no effect on the result; it also
                    // reads *(pwch + 1) without a bounds check.
                    if (dirPrevious == MERGE_RIGHT)
                        pBreakPos[iBreakIndex - 1]++;
                    else if (!TWB_IsCharPunctW(*(pwch + 1)))
                        pBreakPos[iBreakIndex++] = 1;
                    else
                        pBreakPos[iBreakIndex++] = 1;
                    break;
                case NOT_SURE_WHICH_DIRECTION:
                    if (pwch == wzString ||                     // if pwch is first character.
                        TWB_IsCharWordDelimW(*(pwch - 1)) )     // if previous character is delimiter.
                    {
                        // Treated as an opening mark: starts its own segment.
                        pBreakPos[iBreakIndex++] = 1;
                        dirCurrent = MERGE_RIGHT;
                    }
                    else
                    {
                        // Treated as a closing mark: joins the previous segment.
                        // NOTE(review): indexes iBreakIndex - 1 without checking
                        // iBreakIndex > 0.
                        pBreakPos[iBreakIndex - 1]++;
                        dirCurrent = MERGE_LEFT;
                    }
                    break;
                case MERGE_LEFT:
                default:
                    // The inner else pairs with "if (pwch == wzString)"; the
                    // outer else pairs with "if (iBreakIndex == 0)".
                    if (iBreakIndex == 0)
                        if (pwch == wzString)
                            pBreakPos[iBreakIndex++] = 1;
                        else
                            pBreakPos[iBreakIndex]++;
                    else
                        pBreakPos[iBreakIndex - 1]++;
                    break;
                }
                dirPrevious = dirCurrent;
                pwch++;
                pwszRunStart = pwch;
            }

            assert(pwszRunStart == pwch);

            if( iBreakIndex >= iBreakMax || pwch >= pwszMax)
                break;

            // Detect if this is a Thai Run.
            fThaiRun = IsThaiChar(*pwch);

            // Extend the run while the next character has the same
            // IsThaiChar result, is non-zero and is not a word delimiter, or
            // while it is 0x2c / 0x2e (comma / period); in both cases the run
            // is capped by MAXBREAK - 2 and by pwszMax.
            // NOTE(review): *pwch is read before the pwch < pwszMax test.
            do
            {
                pwch++;
                iRunCount++;
            } while ((IsThaiChar(*pwch)==fThaiRun && iRunCount < (MAXBREAK - 2) && *pwch && !TWB_IsCharWordDelimW(*pwch) && (pwch < pwszMax) ) ||
                     ( ( *pwch == 0x2c || *pwch == 0x2e) && (iRunCount < (MAXBREAK - 2)) && (pwch < pwszMax) ));

            if (fThaiRun)
            {
                unsigned int iBreak = breakTree->TrigramBreak(pwszRunStart,pwch);
                // NOTE(review): the next line is truncated in this copy (loop
                // condition, increment and the start of the body are missing).
                for (i=0; i < iBreak && iBreakIndex breakArray[i];
                    }
                    else
                        pBreakPos[iBreakIndex++] = breakTree->breakArray[i];
                    dirPrevious = NO_MERGE;
                }
            }
            else
            {
                // Not a Thai Run simply put the whole thing in the break array.
                assert(pwch > pwszRunStart); // pwch must be greater than pwszRunStart, since we just walk.
                if (dirPrevious == MERGE_RIGHT)
                {
                    // A preceding opening mark absorbs this run.
                    assert(iBreakIndex != 0);
                    pBreakPos[iBreakIndex - 1] += (BYTE) (pwch - pwszRunStart);
                }
                else
                    pBreakPos[iBreakIndex++] = (BYTE) (pwch - pwszRunStart);
            }
            iRunCount = 0;
            pwszRunStart = pwch;

        // Make sure we haven't pass iBreakMax define by user else return whatever we got.
        } while(iBreakIndex < iBreakMax && pwch < pwszMax);
        break;

    case WB_INDEX:
        // Make sure argument is the same.
        assert(pThwb_Struct != NULL);
        if (pThwb_Struct == NULL)
            return 0;

        do
        {
            // Skip a run of word delimiters and record it as one segment.
            // NOTE(review): *pwch is read before the pwszMax > pwch test.
            while (TWB_IsCharWordDelimW(*pwch) && pwszMax > pwch)
                pwch++;

            if( pwszRunStart < pwch)
            {
                // NOTE(review): no pThwb_Struct entry is written for this
                // delimiter segment.
                pBreakPos[iBreakIndex++] = (BYTE)(pwch - pwszRunStart);
                pwszRunStart = pwch;
            }

            if( iBreakIndex >= iBreakMax || pwch >= pwszMax)
                break;

            // Detect if this is a Thai Run.
            fThaiRun = IsThaiChar(*pwch);

            //TODO: Add comma and period to Thai range.
            // Same run-extension rule as in the WB_LINEBREAK case.
            do
            {
                pwch++;
                iRunCount++;
            } while ((IsThaiChar(*pwch)==fThaiRun && iRunCount < (MAXBREAK - 2) && *pwch && !TWB_IsCharWordDelimW(*pwch) && (pwch < pwszMax) ) ||
                     ( ( *pwch == 0x2c || *pwch == 0x2e) && (iRunCount < (MAXBREAK - 2)) && (pwch < pwszMax) ));

            if (fThaiRun)
            {
                unsigned int iBreak = breakTree->TrigramBreak(pwszRunStart,pwch);
                // NOTE(review): the next line is truncated in this copy (loop
                // condition, increment and the start of the body are missing).
                for (i=0; i < iBreak && iBreakIndex tagArray[i]);
                    pBreakPos[iBreakIndex++] = breakTree->breakArray[i];
                }
            }
            else
            {
                // Not a Thai Run simply put the whole thing in the break array.
                assert(pwch > pwszRunStart); // pwch must be greater than pwszRunStart, since we just walk.
                pThwb_Struct[iBreakIndex].fThai = false;
                pThwb_Struct[iBreakIndex].alt = 0;
                pBreakPos[iBreakIndex++] = (BYTE)(pwch - pwszRunStart);
            }
            iRunCount = 0;
            pwszRunStart = pwch;

        // Make sure we haven't pass iBreakMax define by user else return whatever we got.
        } while(iBreakIndex < iBreakMax && pwch < pwszMax);
        break;

    case WB_CARETBREAK:
        fCaretBreak = true;
        // No break here: execution continues into the WB_NORMAL code with
        // fCaretBreak set.
    case WB_NORMAL:
    default:
        do
        {
            // NOTE(review): *pwch is read before the pwszMax > pwch test in
            // both skip loops below.
            while (TWB_IsCharWordDelimW(*pwch) && pwszMax > pwch)
                pwch++;

            if (fCaretBreak)
            {
                // 010.181686. Taking care of punctuation.
                while (TWB_IsCharPunctW(*pwch) && pwszMax > pwch)
                    pwch++;
            }

            if( pwszRunStart < pwch)
            {
                if (fCaretBreak && *pwszRunStart == L' ' && iBreakIndex > 0)
                {
                    // 010.182719. For the MaiYaMok case we only accept if
                    // space follow by MaiYaMok
                    if (*pwch == THAI_Vowel_MaiYaMok && wzString < (pwszRunStart-1) && IsThaiChar(*(pwszRunStart-1)) && pwch == (pwszRunStart+1) )
                    {
                        // The single space and the Mai Ya Mok both join the
                        // previous segment.
                        pBreakPos[iBreakIndex - 1] += 2;
                        pwch++;
                    }
                    else
                        // This is a caret movement features, should merge space to
                        // the right words.
                        pBreakPos[iBreakIndex - 1] += (BYTE)(pwch - pwszRunStart);
                }
                else
                    pBreakPos[iBreakIndex++] = (BYTE)(pwch - pwszRunStart);
                pwszRunStart = pwch;
            }

            if( iBreakIndex >= iBreakMax || pwch >= pwszMax)
                break;

            // Detect if this is a Thai Run.
            fThaiRun = IsThaiChar(*pwch);

            //TODO: Add comma and period to Thai range.
            if (!fCaretBreak)
            {
                // Same run-extension rule as in the WB_LINEBREAK case.
                do
                {
                    pwch++;
                    iRunCount++;
                } while ((IsThaiChar(*pwch)==fThaiRun && iRunCount < (MAXBREAK - 2) && *pwch && !TWB_IsCharWordDelimW(*pwch) && (pwch < pwszMax) ) ||
                         ( ( *pwch == 0x2c || *pwch == 0x2e) && (iRunCount < (MAXBREAK - 2)) && (pwch < pwszMax) ));
            }
            else
            {
                // Caret mode: as above, but the first alternative also stops
                // at a TWB_IsCharPunctW character.
                do
                {
                    pwch++;
                    iRunCount++;
                } while ((IsThaiChar(*pwch)==fThaiRun && iRunCount < (MAXBREAK - 2) && *pwch && !TWB_IsCharWordDelimW(*pwch) && !TWB_IsCharPunctW(*pwch) && (pwch < pwszMax) ) ||
                         ( ( *pwch == 0x2c || *pwch == 0x2e) && (iRunCount < (MAXBREAK - 2)) && (pwch < pwszMax) ));
            }

            if (fThaiRun)
            {
                // NOTE(review): each of the three for-loops below is truncated
                // in this copy (loop condition, increment and the start of
                // the assignment are missing).
#if defined (NGRAM_ENABLE)
                if (!fFastWordBreak)
                {
                    if (WordBreak(pwszRunStart,pwch))
                        for (i=0; i < breakTree.maxToken && iBreakIndex maximalMatchingBreakArray[i];
                }
                else
                {
                    unsigned int iBreak = breakTree->TrigramBreak(pwszRunStart,pwch);
                    for (i=0; i < iBreak && iBreakIndex breakArray[i];
                }
#else
                unsigned int iBreak = breakTree->TrigramBreak(pwszRunStart,pwch);
                for (i=0; i < iBreak && iBreakIndex breakArray[i];
#endif
            }
            else
            {
                // Not a Thai Run simply put the whole thing in the break array.
                assert(pwch > pwszRunStart); // pwch must be greater than pwszRunStart, since we just walk.
                pBreakPos[iBreakIndex++] = (BYTE)(pwch - pwszRunStart);
            }
            iRunCount = 0;
            pwszRunStart = pwch;

        // Make sure we haven't pass iBreakMax define by user else return whatever we got.
        } while(iBreakIndex < iBreakMax && pwch < pwszMax);
        break;
    }

#if defined (_DEBUG)
    // Debug-only consistency check: when the output array was not filled,
    // the segment lengths must add up to the input length.
    unsigned int iTotalChar = 0;
    for (i = 0; i < iBreakIndex; i++)
    {
        iTotalChar += pBreakPos[i];
    }

    if (iBreakIndex < iBreakMax)
        assert(iStringLen == iTotalChar);
#endif

    return iBreakIndex;
}

//+---------------------------------------------------------------------------
//
//  Class:      CThaiWordBreak
//
//  Synopsis:   (NGRAM_ENABLE builds only.) Generates the break tree for
//              [pszBegin, pszEnd) and runs maximal matching on it.
//
//  Arguments:
//      pszBegin    - start of the run.                         (in)
//      pszEnd      - end of the run; must be greater than
//                    pszBegin (asserted).                      (in)
//
//  Returns:    non-zero when breakTree.maxToken is greater than 0.
//
//  Modifies:
//
//  History:    created 7/99 aarayas
//
//  Notes:      NOTE(review): breakTree is not declared in this function and
//              is used as an object ("."), not a pointer - presumably a
//              class member in NGRAM_ENABLE builds; confirm.
//              fWordEnd, fCorrectPath, pszIndex and iNumCluster are
//              initialised but never read.
//
//----------------------------------------------------------------------------
#if defined (NGRAM_ENABLE)
BOOL CThaiWordBreak::WordBreak(WCHAR* pszBegin, WCHAR* pszEnd)
{
    // Declare and Initialize all local variables.
    bool fWordEnd = false;
    bool fCorrectPath = false;
    WCHAR* pszIndex = pszBegin;
    int iNumCluster = 1;

    assert(pszBegin < pszEnd); // Make sure pszEnd is at least greater pszBegin.

    breakTree.GenerateTree(pszBegin, pszEnd);
    breakTree.MaximalMatching();

    return (breakTree.maxToken > 0);
}
#endif

//+---------------------------------------------------------------------------
//
//  Class:      CThaiWordBreak
//
//  Synopsis:   Forwards the lookup to m_trie.Find.
//
//  Arguments:
//      wzString    - passed through to m_trie.Find.            (in)
//      pdwPOS      - passed through to m_trie.Find.
//
//  Returns:    whatever m_trie.Find returns.
//
//  Modifies:
//
//  History:    created 7/99 aarayas
//
//  Notes:
//
//----------------------------------------------------------------------------
BOOL CThaiWordBreak::Find(const WCHAR* wzString, DWORD* pdwPOS)
{
    return m_trie.Find(wzString, pdwPOS);
}