|
|
//+---------------------------------------------------------------------------
//
//
// CThaiWordBreak
//
// History:
// created 7/99 aarayas
//
// (c) 1999 Microsoft Corporation
//----------------------------------------------------------------------------
#include "cthwb.hpp"
//+---------------------------------------------------------------------------
//
// Function: ExtractALT
//
// Synopsis: This function takes a tag and returns its alternate (ALT) field.
//
// Arguments:
//
// Modifies:
//
// History: created 3/00 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Isolates the ALT field of a tag: masks dwTag with iAltMask, shifts the
// result right by iAltShift and narrows it to a BYTE.
inline BYTE ExtractALT(DWORD dwTag)
{
    return static_cast<BYTE>((dwTag & iAltMask) >> iAltShift);
}
//+---------------------------------------------------------------------------
//
// Class: CThaiWordBreak
//
// Synopsis: constructor
//
// Arguments:
//
// Modifies:
//
// History: created 8/00 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
CThaiWordBreak::CThaiWordBreak()
{
    // Only the first slot of wordCount is reset here.
    wordCount[0] = 0;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiWordBreak
//
// Synopsis: destructor
//
// Arguments:
//
// Modifies:
//
// History: created 8/00 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
CThaiWordBreak::~CThaiWordBreak()
{
    wordCount[0] = 0;

#if defined (_DEBUG)
    // Debug builds verify that the word-break list is already empty
    // by the time the object is destroyed.
    assert(listWordBreak.length == 0);
#endif
}
//+---------------------------------------------------------------------------
//
// Class: CThaiWordBreak
//
// Synopsis: Initialize ThaiWordBreak.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
PTEC CThaiWordBreak::Init(const WCHAR* wzFileName, const WCHAR* wzFileNameTrigram)
{
    // Load the dictionary trie first; the trigram trie is only loaded when
    // the dictionary reported ptecNoErrors.
    PTEC ptecStatus = m_trie.Init(wzFileName);
    if (ptecStatus == ptecNoErrors)
    {
        ptecStatus = m_trie_trigram.Init(wzFileNameTrigram);
    }

    // new memory management: bind the list to both tries and pre-create ten
    // word-break elements. This runs regardless of ptecStatus.
    listWordBreak.Init(&m_trie, &m_trie_trigram);
    int cToCreate = 10;
    while (cToCreate > 0)
    {
        listWordBreak.CreateWordBreak();
        --cToCreate;
    }

    return ptecStatus;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiWordBreak
//
// Synopsis: Initialize ThaiWordBreak.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
PTEC CThaiWordBreak::InitRc(LPBYTE pThaiDic, LPBYTE pThaiTrigram, BOOL fSkipHeader)
{
    // Initialize the dictionary trie from pThaiDic; the trigram trie is only
    // initialized from pThaiTrigram when the dictionary reported ptecNoErrors.
    PTEC ptecStatus = m_trie.InitRc(pThaiDic, fSkipHeader);
    if (ptecStatus == ptecNoErrors)
    {
        ptecStatus = m_trie_trigram.InitRc(pThaiTrigram, fSkipHeader);
    }

    // new memory management: bind the list to both tries and pre-create ten
    // word-break elements. This runs regardless of ptecStatus.
    listWordBreak.Init(&m_trie, &m_trie_trigram);
    int cToCreate = 10;
    while (cToCreate > 0)
    {
        listWordBreak.CreateWordBreak();
        --cToCreate;
    }

    return ptecStatus;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiWordBreak
//
// Synopsis: UnInitialize ThaiWordBreak.
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
void CThaiWordBreak::UnInit() { // new memory management
listWordBreak.Flush();
m_trie.UnInit(); #if defined (NGRAM_ENABLE)
m_trie_sentence_struct.UnInit(); #endif
m_trie_trigram.UnInit(); }
//+---------------------------------------------------------------------------
//
// Class: CThaiWordBreak
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Direction in which a punctuation / delimiter character attaches to its
// neighbouring text when computing line breaks.
enum merge_direction
{
    NO_MERGE,
    MERGE_RIGHT,
    MERGE_LEFT,
    MERGE_BOTH_DIRECTIONS,
    NOT_SURE_WHICH_DIRECTION
};

// Classifies a single character by the direction it merges in.
//   space                          -> NO_MERGE
//   quotation mark, apostrophe     -> NOT_SURE_WHICH_DIRECTION
//   opening brackets / left quotes -> MERGE_RIGHT
//   everything else                -> MERGE_LEFT
merge_direction DetermineMergeDirection(WCHAR wc)
{
    switch (wc)
    {
    case 0x0020:    // space
        return NO_MERGE;

    case 0x0022:    // quotation mark
    case 0x0027:    // apostrophe
        return NOT_SURE_WHICH_DIRECTION;

    case 0x0028:    // left parenthesis
    case 0x003C:    // less than sign
    case 0x005B:    // left square bracket
    case 0x007B:    // left curly bracket
    case 0x2018:    // left single quotation mark
    case 0x201C:    // left double quotation mark
    case 0x201F:    // left double quotation mark reverse
        return MERGE_RIGHT;

    default:
        // TODO: need to add MERGE_BOTH_DIRECTIONS for character joiner characters.
        // all other character merge left.
        return MERGE_LEFT;
    }
}
//+---------------------------------------------------------------------------
//
// Class: CThaiWordBreak
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Allocates a CThaiBreakTree, initializes it against this object's tries and
// returns it as an opaque DWORD_PTR handle (0 when allocation yields NULL).
// The handle is the type DeleteWordBreaker() and the handle-taking
// FindWordBreak() overload cast back to CThaiBreakTree*.
DWORD_PTR CThaiWordBreak::CreateWordBreaker()
{
    CThaiBreakTree* pNewTree = new CThaiBreakTree();

    if (pNewTree != NULL)
    {
#if defined (NGRAM_ENABLE)
        pNewTree->Init(&m_trie, &m_trie_sentence_struct, &m_trie_trigram);
#else
        pNewTree->Init(&m_trie, &m_trie_trigram);
#endif
    }

    return (DWORD_PTR)pNewTree;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiWordBreak
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Deletes the CThaiBreakTree behind the handle dwBreaker.
// Returns true when a tree was deleted, false when the handle was 0.
bool CThaiWordBreak::DeleteWordBreaker(DWORD_PTR dwBreaker)
{
    CThaiBreakTree* pTree = (CThaiBreakTree*) dwBreaker;

    if (pTree == NULL)
    {
        return false;
    }

    delete pTree;
    return true;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiWordBreak
//
// Synopsis: This function segments Thai words for use in indexing.
//
// Arguments:
// wzString - input string. (in)
// iStringLen - input string length. (in)
// pBreakPos - array of break position. (out)
// pThwb_Struct - array structure of THWB. (out)
// iBreakMax - length of pBreakPos and
// pThwb_Struct. (in)
//
// Modifies:
//
// History: created 3/00 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
int CThaiWordBreak::IndexWordBreak(WCHAR* wzString,unsigned int iStringLen, BYTE* pBreakPos,THWB_STRUCT* pThwb_Struct,unsigned int iBreakMax)
{
    unsigned int cBreaks = 0;       // Number of breaks found.

    // A temporary break tree is created for this call only.
    CThaiBreakTree* pTree = new CThaiBreakTree();
    if (pTree == NULL)
    {
        return cBreaks;
    }

    pTree->Init(&m_trie, &m_trie_trigram);

    // Always runs in WB_INDEX mode with the fast word-break flag set.
    cBreaks = FindWordBreak((DWORD_PTR)pTree, wzString, iStringLen, pBreakPos, iBreakMax, WB_INDEX, true, pThwb_Struct);

    delete pTree;
    return cBreaks;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiWordBreak
//
// Synopsis:
//
// Arguments:
//
// wzWord - input string. (in)
// iWordLen - input string length. (in)
// Alt - find close alternate word (in)
// pBreakPos - array of break positions, always 5 bytes. (out)
//
// Modifies:
//
// History: created 3/00 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
int CThaiWordBreak::FindAltWord(WCHAR* wzWord,unsigned int iWordLen, BYTE Alt, BYTE* pBreakPos)
{
    unsigned int cBreaks = 0;       // Number of breaks found.

    // A temporary break tree is created for this call only.
    CThaiBreakTree* pTree = new CThaiBreakTree();
    if (pTree == NULL)
    {
        return cBreaks;
    }

    pTree->Init(&m_trie, &m_trie_trigram);

    // The actual lookup is delegated to the break tree.
    cBreaks = pTree->FindAltWord(wzWord, iWordLen, Alt, pBreakPos);

    delete pTree;
    return cBreaks;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiWordBreak
//
// Synopsis: This function segments Thai text according to the specified mode.
//
// WB_LINEBREAK - is used when the application needs to break for line wrapping,
// this mode takes into the consideration of punctuations.
//
// WB_NORMAL - is used when application wants determine word for searching,
// autocorrect, etc.
//
// WB_SPELLER - not yet implemented, but same as normal with additional soundex
// rules.
//
// Arguments:
//
// wzString - input string. (in)
// iStringLen - input string length. (in)
// pBreakPos - array of break position. (out)
// iBreakMax - length of pBreakPos (in)
// mode - either WB_LINEBREAK, etc. (in)
// fFastWordBreak - true for fast algorithm (in)
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
int CThaiWordBreak::FindWordBreak(WCHAR* wzString,unsigned int iStringLen, BYTE* pBreakPos,unsigned int iBreakMax, BYTE mode, bool fFastWordBreak)
{
    unsigned int iBreakIndex = 0;           // Contain number of Breaks.
    CThaiBreakTree* breakTree = NULL;

#if defined(OLD)
    breakTree = new CThaiBreakTree();
#else
    // new memory management: borrow a break tree from the word-break list
    // instead of allocating one per call.
    WordBreakElement* pWordBreakElement = listWordBreak.GetFreeWB();
    if (pWordBreakElement == NULL)
    {
        // No element available: report zero breaks rather than dereference
        // a NULL element below.
        assert(false);
        return 0;
    }
    breakTree = pWordBreakElement->breakTree;
    listWordBreak.MarkWordBreak(pWordBreakElement,false);   // Mark word break as in use.
#endif

    if (breakTree)
    {
#if defined(OLD)
        breakTree->Init(&m_trie, &m_trie_trigram);

        assert(mode != WB_INDEX);   // If this assert come up, use function IndexWordBreak

        iBreakIndex = FindWordBreak((DWORD_PTR)breakTree,wzString,iStringLen,pBreakPos,iBreakMax,mode,fFastWordBreak,0);

        delete breakTree;
#else
        // No THWB_STRUCT array is supplied (last argument is 0), so the
        // handle-taking overload returns 0 for WB_INDEX; IndexWordBreak()
        // is the entry point for that mode.
        iBreakIndex = FindWordBreak((DWORD_PTR)breakTree,wzString,iStringLen,pBreakPos,iBreakMax,mode,fFastWordBreak,0);
        listWordBreak.MarkWordBreak(pWordBreakElement,true);    // Mark word break as free.
#endif
    }
    else
    {
        // NOTE(review): on the list-managed path the element stays marked
        // as in use when its breakTree is NULL (unchanged from the original).
        assert(false);
    }

    return iBreakIndex;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiWordBreak
//
// Synopsis: This function segments Thai text according to the specified mode.
//
// WB_LINEBREAK - is used when the application needs to break for line wrapping,
// this mode takes into the consideration of punctuations.
//
// WB_NORMAL - is used when application wants determine word for searching,
// autocorrect, etc.
//
// WB_SPELLER - not yet implemented, but same as normal with additional soundex
// rules.
//
// WB_INDEX - is used when application wanted to do Thai indexing.
//
//
// Arguments:
//
// wzString - input string. (in)
// iStringLen - input string length. (in)
// pBreakPos - array of break position. (out)
// iBreakMax - length of pBreakPos (in)
// must be greater than 0.
// mode - either WB_LINEBREAK, etc. (in)
// fFastWordBreak - true for fast algorithm (in)
// pThwb_Struct - array structure of THWB. (out)
//
// Modifies:
//
// History: created 11/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
int CThaiWordBreak::FindWordBreak(DWORD_PTR dwBreaker, WCHAR* wzString,unsigned int iStringLen, BYTE* pBreakPos,unsigned int iBreakMax, BYTE mode, bool fFastWordBreak, THWB_STRUCT* pThwb_Struct)
{
    // Walks wzString[0 .. iStringLen) and writes the length of each segment
    // into pBreakPos, never writing more than iBreakMax entries. Returns the
    // number of entries written (0 on invalid arguments).
    //
    // Every dereference of pwch is preceded by a (pwch < pwszMax) test so the
    // scan never reads past the caller's buffer, even when the buffer is not
    // NUL terminated.

    // Declare and Initialize all local variables.
    WCHAR* pwszRunStart = wzString;                     // first character of the current run
    const WCHAR* pwszMax = wzString + iStringLen;       // one past the last input character
    WCHAR* pwch = wzString;                             // scan cursor
    bool fThaiRun = true;
    bool fCaretBreak = false;
    int iRunCount = 0;                                  // characters in the current run
    unsigned int i = 0;
    unsigned int iBreakIndex = 0;                       // Contain number of Breaks.
    merge_direction dirPrevious = NO_MERGE;
    merge_direction dirCurrent = NO_MERGE;
    CThaiBreakTree* breakTree = (CThaiBreakTree*) dwBreaker;

    // check for possible invalid arguments.
    assert(wzString != NULL);
    assert(iBreakMax > 0);
    assert(pBreakPos != NULL);
    assert(breakTree != NULL);
    if ((wzString == NULL) || (iBreakMax == 0) || (pBreakPos == NULL) || (breakTree == NULL))
        return 0;

    switch (mode)
    {
    case WB_LINEBREAK:
    case 2:     // to be compatible with old api.
        do
        {
            // Consume punctuation / delimiters one character at a time and
            // attach each one to a neighbouring segment according to its
            // merge direction.
            while (pwch < pwszMax &&
                   iBreakIndex < iBreakMax &&
                   (TWB_IsCharPunctW(*pwch) || TWB_IsCharWordDelimW(*pwch)))
            {
                dirCurrent = DetermineMergeDirection(*pwch);
                switch (dirCurrent)
                {
                case NO_MERGE:
                    if (pwch + 1 < pwszMax && *(pwch + 1) == THAI_Vowel_MaiYaMok && iBreakIndex > 0)
                    {
                        // Mai Ya Mok case only: the space and the Mai Ya Mok
                        // both join the previous segment.
                        pBreakPos[iBreakIndex - 1] += 2;
                        dirCurrent = MERGE_LEFT;
                        pwch++;
                    }
                    else
                        pBreakPos[iBreakIndex++] = 1;
                    break;

                case MERGE_RIGHT:
                    // Consecutive right-merging characters share one segment;
                    // otherwise a new one-character segment is opened. (The
                    // original also peeked at *(pwch + 1) here, but both
                    // outcomes of that test were identical.)
                    if (dirPrevious == MERGE_RIGHT)
                        pBreakPos[iBreakIndex - 1]++;
                    else
                        pBreakPos[iBreakIndex++] = 1;
                    break;

                case NOT_SURE_WHICH_DIRECTION:
                    if (iBreakIndex == 0 ||                     // nothing to merge left into.
                        pwch == wzString ||                     // if pwch is first character.
                        TWB_IsCharWordDelimW(*(pwch - 1)))      // if previous character is delimiter.
                    {
                        pBreakPos[iBreakIndex++] = 1;
                        dirCurrent = MERGE_RIGHT;
                    }
                    else
                    {
                        pBreakPos[iBreakIndex - 1]++;
                        dirCurrent = MERGE_LEFT;
                    }
                    break;

                case MERGE_LEFT:
                default:
                    if (iBreakIndex == 0)
                    {
                        if (pwch == wzString)
                            pBreakPos[iBreakIndex++] = 1;
                        else
                            pBreakPos[iBreakIndex]++;
                    }
                    else
                        pBreakPos[iBreakIndex - 1]++;
                    break;
                }
                dirPrevious = dirCurrent;
                pwch++;
                pwszRunStart = pwch;
            }

            assert(pwszRunStart == pwch);

            if (iBreakIndex >= iBreakMax || pwch >= pwszMax)
                break;

            // Detect if this is a Thai Run.
            fThaiRun = IsThaiChar(*pwch);

            // Extend the run while the script stays the same (comma and
            // period never end a run), bounded by the buffer end and by
            // MAXBREAK - 2 characters.
            do
            {
                pwch++;
                iRunCount++;
            } while (pwch < pwszMax &&
                     iRunCount < (MAXBREAK - 2) &&
                     ((IsThaiChar(*pwch)==fThaiRun && *pwch && !TWB_IsCharWordDelimW(*pwch)) ||
                      *pwch == 0x2c || *pwch == 0x2e));

            if (fThaiRun)
            {
                unsigned int iBreak = breakTree->TrigramBreak(pwszRunStart,pwch);
                for (i=0; i < iBreak && iBreakIndex <iBreakMax; i++)
                {
                    // Only the first Thai segment of the run can be glued to a
                    // preceding right-merging character.
                    if (dirPrevious == MERGE_RIGHT)
                    {
                        assert(iBreakIndex != 0);
                        pBreakPos[iBreakIndex - 1] += breakTree->breakArray[i];
                    }
                    else
                        pBreakPos[iBreakIndex++] = breakTree->breakArray[i];

                    dirPrevious = NO_MERGE;
                }
            }
            else
            {
                // Not a Thai Run simply put the whole thing in the break array.
                assert(pwch > pwszRunStart);    // pwch must be greater than pwszRunStart, since we just walk.

                // NOTE(review): dirPrevious is not reset on this path, unlike
                // the Thai path above - confirm that is intended.
                if (dirPrevious == MERGE_RIGHT)
                {
                    assert(iBreakIndex != 0);
                    pBreakPos[iBreakIndex - 1] += (BYTE) (pwch - pwszRunStart);
                }
                else
                    pBreakPos[iBreakIndex++] = (BYTE) (pwch - pwszRunStart);
            }
            iRunCount = 0;
            pwszRunStart = pwch;

        // Make sure we haven't pass iBreakMax define by user else return whatever we got.
        } while (iBreakIndex < iBreakMax && pwch < pwszMax);
        break;

    case WB_INDEX:
        // Indexing mode also fills pThwb_Struct, so the array is mandatory.
        assert(pThwb_Struct != NULL);
        if (pThwb_Struct == NULL)
            return 0;

        do
        {
            // A run of word delimiters becomes one segment of its own.
            while (pwch < pwszMax && TWB_IsCharWordDelimW(*pwch))
                pwch++;

            if (pwszRunStart < pwch)
            {
                // Give the delimiter segment a defined THWB entry as well, so
                // no slot below the returned count is left unwritten.
                pThwb_Struct[iBreakIndex].fThai = false;
                pThwb_Struct[iBreakIndex].alt = 0;
                pBreakPos[iBreakIndex++] = (BYTE)(pwch - pwszRunStart);
                pwszRunStart = pwch;
            }

            if (iBreakIndex >= iBreakMax || pwch >= pwszMax)
                break;

            // Detect if this is a Thai Run.
            fThaiRun = IsThaiChar(*pwch);   //TODO: Add comma and period to Thai range.

            do
            {
                pwch++;
                iRunCount++;
            } while (pwch < pwszMax &&
                     iRunCount < (MAXBREAK - 2) &&
                     ((IsThaiChar(*pwch)==fThaiRun && *pwch && !TWB_IsCharWordDelimW(*pwch)) ||
                      *pwch == 0x2c || *pwch == 0x2e));

            if (fThaiRun)
            {
                unsigned int iBreak = breakTree->TrigramBreak(pwszRunStart,pwch);
                for (i=0; i < iBreak && iBreakIndex <iBreakMax; i++)
                {
                    pThwb_Struct[iBreakIndex].fThai = true;
                    pThwb_Struct[iBreakIndex].alt = ExtractALT(breakTree->tagArray[i]);
                    pBreakPos[iBreakIndex++] = breakTree->breakArray[i];
                }
            }
            else
            {
                // Not a Thai Run simply put the whole thing in the break array.
                assert(pwch > pwszRunStart);    // pwch must be greater than pwszRunStart, since we just walk.

                pThwb_Struct[iBreakIndex].fThai = false;
                pThwb_Struct[iBreakIndex].alt = 0;
                pBreakPos[iBreakIndex++] = (BYTE)(pwch - pwszRunStart);
            }
            iRunCount = 0;
            pwszRunStart = pwch;

        // Make sure we haven't pass iBreakMax define by user else return whatever we got.
        } while (iBreakIndex < iBreakMax && pwch < pwszMax);
        break;

    case WB_CARETBREAK:
        fCaretBreak = true;
        // Intentional fall through: caret mode shares the normal-mode loop.
    case WB_NORMAL:
    default:
        do
        {
            while (pwch < pwszMax && TWB_IsCharWordDelimW(*pwch))
                pwch++;

            if (fCaretBreak)
            {
                // 010.181686. Taking care of puntuation.
                while (pwch < pwszMax && TWB_IsCharPunctW(*pwch))
                    pwch++;
            }

            if (pwszRunStart < pwch)
            {
                if (fCaretBreak && *pwszRunStart == L' ' && iBreakIndex > 0)
                {
                    // 010.182719. For the MaiYaMok case we only accept if
                    // space follow by MaiYaMok
                    if (pwch < pwszMax &&
                        *pwch == THAI_Vowel_MaiYaMok &&
                        wzString < (pwszRunStart-1) &&
                        IsThaiChar(*(pwszRunStart-1)) &&
                        pwch == (pwszRunStart+1))
                    {
                        pBreakPos[iBreakIndex - 1] += 2;
                        pwch++;
                    }
                    else
                        // This is a caret movement features, should merge space to
                        // the right words.
                        pBreakPos[iBreakIndex - 1] += (BYTE)(pwch - pwszRunStart);
                }
                else
                    pBreakPos[iBreakIndex++] = (BYTE)(pwch - pwszRunStart);
                pwszRunStart = pwch;
            }

            if (iBreakIndex >= iBreakMax || pwch >= pwszMax)
                break;

            // Detect if this is a Thai Run.
            fThaiRun = IsThaiChar(*pwch);   //TODO: Add comma and period to Thai range.

            if (!fCaretBreak)
            {
                do
                {
                    pwch++;
                    iRunCount++;
                } while (pwch < pwszMax &&
                         iRunCount < (MAXBREAK - 2) &&
                         ((IsThaiChar(*pwch)==fThaiRun && *pwch && !TWB_IsCharWordDelimW(*pwch)) ||
                          *pwch == 0x2c || *pwch == 0x2e));
            }
            else
            {
                // Caret mode additionally ends the run at punctuation.
                do
                {
                    pwch++;
                    iRunCount++;
                } while (pwch < pwszMax &&
                         iRunCount < (MAXBREAK - 2) &&
                         ((IsThaiChar(*pwch)==fThaiRun && *pwch && !TWB_IsCharWordDelimW(*pwch) && !TWB_IsCharPunctW(*pwch)) ||
                          *pwch == 0x2c || *pwch == 0x2e));
            }

            if (fThaiRun)
            {
#if defined (NGRAM_ENABLE)
                // NOTE(review): this branch is reproduced unchanged; it applies
                // '.' to the local breakTree pointer and has not been built here.
                if (!fFastWordBreak)
                {
                    if (WordBreak(pwszRunStart,pwch))
                        for (i=0; i < breakTree.maxToken && iBreakIndex <iBreakMax; i++)
                            pBreakPos[iBreakIndex++] = breakTree->maximalMatchingBreakArray[i];
                }
                else
                {
                    unsigned int iBreak = breakTree->TrigramBreak(pwszRunStart,pwch);
                    for (i=0; i < iBreak && iBreakIndex <iBreakMax; i++)
                        pBreakPos[iBreakIndex++] = breakTree->breakArray[i];
                }
#else
                unsigned int iBreak = breakTree->TrigramBreak(pwszRunStart,pwch);
                for (i=0; i < iBreak && iBreakIndex <iBreakMax; i++)
                    pBreakPos[iBreakIndex++] = breakTree->breakArray[i];
#endif
            }
            else
            {
                // Not a Thai Run simply put the whole thing in the break array.
                assert(pwch > pwszRunStart);    // pwch must be greater than pwszRunStart, since we just walk.

                pBreakPos[iBreakIndex++] = (BYTE)(pwch - pwszRunStart);
            }
            iRunCount = 0;
            pwszRunStart = pwch;

        // Make sure we haven't pass iBreakMax define by user else return whatever we got.
        } while (iBreakIndex < iBreakMax && pwch < pwszMax);
        break;
    }

#if defined (_DEBUG)
    // When the output array was not filled completely, the segment lengths
    // must add up to the input length.
    unsigned int iTotalChar = 0;
    for (i = 0; i < iBreakIndex; i++)
    {
        iTotalChar += pBreakPos[i];
    }
    if (iBreakIndex < iBreakMax)
        assert(iStringLen == iTotalChar);
#endif

    return iBreakIndex;
}
//+---------------------------------------------------------------------------
//
// Class: CThaiWordBreak
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
#if defined (NGRAM_ENABLE)
// Builds the break tree for [pszBegin, pszEnd), runs maximal matching on it
// and reports whether at least one token was produced.
BOOL CThaiWordBreak::WordBreak(WCHAR* pszBegin, WCHAR* pszEnd)
{
    // The range must contain at least one character.
    assert(pszBegin < pszEnd);

    breakTree.GenerateTree(pszBegin, pszEnd);
    breakTree.MaximalMatching();

    return (breakTree.maxToken > 0);
}
#endif
//+---------------------------------------------------------------------------
//
// Class: CThaiWordBreak
//
// Synopsis:
//
// Arguments:
//
// Modifies:
//
// History: created 7/99 aarayas
//
// Notes:
//
//----------------------------------------------------------------------------
// Forwards the lookup of wzString to the dictionary trie (m_trie.Find),
// passing pdwPOS through unchanged, and returns that call's result.
BOOL CThaiWordBreak::Find(const WCHAR* wzString, DWORD* pdwPOS)
{
    return m_trie.Find(wzString, pdwPOS);
}
|