Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

827 lines
23 KiB

//+---------------------------------------------------------------------------
//
//
// CThaiWordBreak
//
// History:
// created 7/99 aarayas
//
// ©1999 Microsoft Corporation
//----------------------------------------------------------------------------
#include "cthwb.hpp"
//+---------------------------------------------------------------------------
//
//  Function:   ExtractALT
//
//  Synopsis:   Pulls the "alternate" field out of a packed tag value.
//
//  Arguments:  dwTag - packed tag value.                              (in)
//
//  Returns:    (dwTag & iAltMask) >> iAltShift, narrowed to a BYTE.
//
//  History:    created 3/00 aarayas
//
//  Notes:      iAltMask and iAltShift are defined outside this file.
//
//----------------------------------------------------------------------------
inline BYTE ExtractALT(DWORD dwTag)
{
    // Isolate the field with the mask, move it down to bit 0, then narrow.
    return static_cast<BYTE>((dwTag & iAltMask) >> iAltShift);
}
//+---------------------------------------------------------------------------
//
//  Class:      CThaiWordBreak
//
//  Synopsis:   Constructor. Only clears the first slot of wordCount; the
//              tries and the word-break pool are set up later by Init() or
//              InitRc().
//
//  Arguments:  none
//
//  Modifies:   wordCount[0]
//
//  History:    created 8/00 aarayas
//
//----------------------------------------------------------------------------
CThaiWordBreak::CThaiWordBreak()
{
// Start with the first wordCount entry zeroed.
wordCount[0] = 0;
}
//+---------------------------------------------------------------------------
//
//  Class:      CThaiWordBreak
//
//  Synopsis:   Destructor. Clears the first slot of wordCount and, in debug
//              builds, checks that the word-break pool is already empty.
//
//  Arguments:  none
//
//  Modifies:   wordCount[0]
//
//  History:    created 8/00 aarayas
//
//  Notes:      The destructor does not release the pool itself. UnInit() is
//              the function that calls listWordBreak.Flush(); presumably it
//              must run before destruction for the debug assert to hold -
//              confirm against callers.
//
//----------------------------------------------------------------------------
CThaiWordBreak::~CThaiWordBreak()
{
    wordCount[0] = 0;

#ifdef _DEBUG
    // Debug-only sanity check: nothing should be left in the pool by now.
    assert(listWordBreak.length == 0);
#endif
}
//+---------------------------------------------------------------------------
//
//  Class:      CThaiWordBreak
//
//  Synopsis:   Initialize ThaiWordBreak from dictionary files.
//
//  Arguments:  wzFileName        - passed to m_trie.Init().           (in)
//              wzFileNameTrigram - passed to m_trie_trigram.Init().   (in)
//
//  Returns:    The PTEC from m_trie.Init() when it is not ptecNoErrors,
//              otherwise the PTEC from m_trie_trigram.Init().
//
//  Modifies:   m_trie, m_trie_trigram, listWordBreak
//
//  History:    created 7/99 aarayas
//
//  Notes:      The word-break pool is initialised and filled with ten
//              entries regardless of whether the tries loaded successfully.
//
//----------------------------------------------------------------------------
PTEC CThaiWordBreak::Init(const WCHAR* wzFileName, const WCHAR* wzFileNameTrigram)
{
    // Load the dictionary trie first; the trigram trie is attempted only
    // when the dictionary loaded without error.
    PTEC ptecResult = m_trie.Init(wzFileName);
    if (ptecResult == ptecNoErrors)
        ptecResult = m_trie_trigram.Init(wzFileNameTrigram);

    // new memory management: bind the pool to both tries and pre-create
    // ten word-break entries.
    listWordBreak.Init(&m_trie, &m_trie_trigram);
    for (int cCreated = 0; cCreated != 10; ++cCreated)
        listWordBreak.CreateWordBreak();

    return ptecResult;
}
//+---------------------------------------------------------------------------
//
//  Class:      CThaiWordBreak
//
//  Synopsis:   Initialize ThaiWordBreak from in-memory data.
//
//  Arguments:  pThaiDic     - passed to m_trie.InitRc().              (in)
//              pThaiTrigram - passed to m_trie_trigram.InitRc().      (in)
//              fSkipHeader  - forwarded unchanged to both InitRc()
//                             calls.                                  (in)
//
//  Returns:    The PTEC from m_trie.InitRc() when it is not ptecNoErrors,
//              otherwise the PTEC from m_trie_trigram.InitRc().
//
//  Modifies:   m_trie, m_trie_trigram, listWordBreak
//
//  History:    created 7/99 aarayas
//
//  Notes:      The word-break pool is initialised and filled with ten
//              entries regardless of whether the tries loaded successfully.
//
//----------------------------------------------------------------------------
PTEC CThaiWordBreak::InitRc(LPBYTE pThaiDic, LPBYTE pThaiTrigram, BOOL fSkipHeader)
{
    // Dictionary trie first; trigram trie only if the dictionary succeeded.
    PTEC ptecResult = m_trie.InitRc(pThaiDic, fSkipHeader);
    if (ptecResult == ptecNoErrors)
    {
        ptecResult = m_trie_trigram.InitRc(pThaiTrigram, fSkipHeader);
    }

    // new memory management: bind the pool to both tries and pre-create
    // ten word-break entries.
    listWordBreak.Init(&m_trie, &m_trie_trigram);
    for (int cCreated = 0; cCreated != 10; ++cCreated)
    {
        listWordBreak.CreateWordBreak();
    }

    return ptecResult;
}
//+---------------------------------------------------------------------------
//
//  Class:      CThaiWordBreak
//
//  Synopsis:   UnInitialize ThaiWordBreak: flush the word-break pool, then
//              un-initialize each trie.
//
//  Arguments:  none
//
//  Modifies:   listWordBreak, m_trie, m_trie_trigram
//              (and m_trie_sentence_struct when NGRAM_ENABLE is defined)
//
//  History:    created 7/99 aarayas
//
//----------------------------------------------------------------------------
void CThaiWordBreak::UnInit()
{
    // new memory management: the pool is flushed before the tries it was
    // bound to are torn down.
    listWordBreak.Flush();

    m_trie.UnInit();
#ifdef NGRAM_ENABLE
    m_trie_sentence_struct.UnInit();
#endif
    m_trie_trigram.UnInit();
}
//+---------------------------------------------------------------------------
//
//  Enum:       merge_direction
//
//  Synopsis:   Says which neighbouring break a punctuation or delimiter
//              character should be attached to. Produced by
//              DetermineMergeDirection() and consumed by the WB_LINEBREAK
//              path of CThaiWordBreak::FindWordBreak().
//
//  History:    created 7/99 aarayas
//
//----------------------------------------------------------------------------
enum merge_direction {
NO_MERGE,                   // returned for a space.
MERGE_RIGHT,                // opening brackets / opening quotes: joins what follows.
MERGE_LEFT,                 // every other character: joins what precedes.
MERGE_BOTH_DIRECTIONS,      // not returned by DetermineMergeDirection() (see its TODO).
NOT_SURE_WHICH_DIRECTION    // " and ': the WB_LINEBREAK code decides from the previous character.
};
// Classifies a single character by the neighbour it should be merged with.
//   wc - character to classify.
// Returns NO_MERGE for a space, NOT_SURE_WHICH_DIRECTION for the ASCII
// quotation mark and apostrophe, MERGE_RIGHT for the opening brackets and
// opening quotation marks listed below, and MERGE_LEFT for anything else.
merge_direction DetermineMergeDirection(WCHAR wc)
{
    switch (wc)
    {
    case 0x0020:    // space
        return NO_MERGE;

    case 0x0022:    // quotation mark
    case 0x0027:    // apostrophe
        return NOT_SURE_WHICH_DIRECTION;

    case 0x0028:    // left parenthesis
    case 0x003C:    // less than sign
    case 0x005B:    // left square bracket
    case 0x007B:    // left curly bracket
    case 0x2018:    // left single quotation mark
    case 0x201C:    // left double quotation mark
    case 0x201F:    // left double quotation mark reverse
        return MERGE_RIGHT;

    default:
        break;
    }

    // TODO: need to add MERGE_BOTH_DIRECTIONS for character joiner characters.
    // all other character merge left.
    return MERGE_LEFT;
}
//+---------------------------------------------------------------------------
//
//  Class:      CThaiWordBreak
//
//  Synopsis:   Allocates a CThaiBreakTree bound to this object's tries and
//              hands it back as an opaque handle.
//
//  Arguments:  none
//
//  Returns:    The new CThaiBreakTree pointer cast to DWORD_PTR, or 0 when
//              the allocation yields NULL. The handle is the value
//              DeleteWordBreaker() and the DWORD_PTR overload of
//              FindWordBreak() cast back to CThaiBreakTree*.
//
//  History:    created 7/99 aarayas
//
//----------------------------------------------------------------------------
DWORD_PTR CThaiWordBreak::CreateWordBreaker()
{
    CThaiBreakTree* pTree = new CThaiBreakTree();

    if (pTree != NULL)
    {
#if defined (NGRAM_ENABLE)
        pTree->Init(&m_trie, &m_trie_sentence_struct, &m_trie_trigram);
#else
        pTree->Init(&m_trie, &m_trie_trigram);
#endif
    }

    return (DWORD_PTR)pTree;
}
//+---------------------------------------------------------------------------
//
//  Class:      CThaiWordBreak
//
//  Synopsis:   Destroys a break tree previously returned by
//              CreateWordBreaker().
//
//  Arguments:  dwBreaker - handle; cast back to CThaiBreakTree*.      (in)
//
//  Returns:    true if a non-zero handle was deleted, false if the handle
//              was 0.
//
//  History:    created 7/99 aarayas
//
//----------------------------------------------------------------------------
bool CThaiWordBreak::DeleteWordBreaker(DWORD_PTR dwBreaker)
{
    CThaiBreakTree* pTree = (CThaiBreakTree*) dwBreaker;

    // Nothing to release for a null handle.
    if (pTree == NULL)
        return false;

    delete pTree;
    return true;
}
//+---------------------------------------------------------------------------
//
//  Class:      CThaiWordBreak
//
//  Synopsis:   This function segments Thai text for indexing. It runs
//              FindWordBreak() in WB_INDEX mode on a temporary break tree.
//
//  Arguments:
//      wzString     - input string.                                   (in)
//      iStringLen   - input string length.                            (in)
//      pBreakPos    - array of break positions.                       (out)
//      pThwb_Struct - array structure of THWB.                        (out)
//      iBreakMax    - length of pBreakPos and pThwb_Struct.           (in)
//
//  Returns:    Number of breaks written; 0 if the temporary break tree
//              could not be allocated.
//
//  History:    created 3/00 aarayas
//
//----------------------------------------------------------------------------
int CThaiWordBreak::IndexWordBreak(WCHAR* wzString,unsigned int iStringLen, BYTE* pBreakPos,THWB_STRUCT* pThwb_Struct,unsigned int iBreakMax)
{
    // A private tree is allocated for this call and released before return.
    CThaiBreakTree* pTree = new CThaiBreakTree();
    if (pTree == NULL)
        return 0;

    pTree->Init(&m_trie, &m_trie_trigram);

    // Fast word break is always requested for indexing.
    const unsigned int cBreaks = FindWordBreak((DWORD_PTR)pTree,
                                               wzString,
                                               iStringLen,
                                               pBreakPos,
                                               iBreakMax,
                                               WB_INDEX,
                                               true,
                                               pThwb_Struct);
    delete pTree;
    return cBreaks;
}
//+---------------------------------------------------------------------------
//
//  Class:      CThaiWordBreak
//
//  Synopsis:   Forwards to CThaiBreakTree::FindAltWord() on a temporary
//              break tree bound to this object's tries.
//
//  Arguments:
//      wzWord    - input string.                                      (in)
//      iWordLen  - input string length.                               (in)
//      Alt       - find close alternate word.                         (in)
//      pBreakPos - array of break positions, always 5 bytes.          (out)
//
//  Returns:    The value returned by CThaiBreakTree::FindAltWord(); 0 if
//              the temporary break tree could not be allocated.
//
//  History:    created 3/00 aarayas
//
//----------------------------------------------------------------------------
int CThaiWordBreak::FindAltWord(WCHAR* wzWord,unsigned int iWordLen, BYTE Alt, BYTE* pBreakPos)
{
    // A private tree is allocated for this call and released before return.
    CThaiBreakTree* pTree = new CThaiBreakTree();
    if (pTree == NULL)
        return 0;

    pTree->Init(&m_trie, &m_trie_trigram);

    const unsigned int cBreaks = pTree->FindAltWord(wzWord, iWordLen, Alt, pBreakPos);

    delete pTree;
    return cBreaks;
}
//+---------------------------------------------------------------------------
//
//  Class:      CThaiWordBreak
//
//  Synopsis:   This function segments Thai text depending on the mode
//              specified. It borrows a break tree from the listWordBreak
//              pool, forwards to the DWORD_PTR overload of FindWordBreak()
//              with a NULL THWB array, and returns the tree to the pool.
//
//      WB_LINEBREAK - is used when the application needs to break for line
//                     wrapping; this mode takes punctuation into
//                     consideration.
//
//      WB_NORMAL    - is used when the application wants to determine words
//                     for searching, autocorrect, etc.
//
//      WB_SPELLER   - not yet implemented, but same as normal with
//                     additional soundex rules.
//
//  Arguments:
//      wzString       - input string.                                 (in)
//      iStringLen     - input string length.                          (in)
//      pBreakPos      - array of break positions.                     (out)
//      iBreakMax      - length of pBreakPos.                          (in)
//      mode           - either WB_LINEBREAK, etc.                     (in)
//      fFastWordBreak - true for fast algorithm.                      (in)
//
//  Returns:    Number of breaks written to pBreakPos; 0 when no break tree
//              could be obtained.
//
//  History:    created 7/99 aarayas
//
//  Notes:      WB_INDEX needs a THWB array, which this overload never
//              supplies; use IndexWordBreak() for that mode.
//
//----------------------------------------------------------------------------
int CThaiWordBreak::FindWordBreak(WCHAR* wzString,unsigned int iStringLen, BYTE* pBreakPos,unsigned int iBreakMax, BYTE mode, bool fFastWordBreak)
{
    unsigned int    iBreakIndex = 0;        // Contain number of Breaks.
    CThaiBreakTree* breakTree = NULL;

#if defined(OLD)
    breakTree = new CThaiBreakTree();
#else
    // new memory management
    WordBreakElement* pWordBreakElement = listWordBreak.GetFreeWB();
    if (pWordBreakElement == NULL)
    {
        // The pool handed back nothing; do not dereference it.
        assert(false);
        return 0;
    }

    breakTree = pWordBreakElement->breakTree;
    listWordBreak.MarkWordBreak(pWordBreakElement,false);   // Mark word break as in use.
#endif

    if (breakTree)
    {
#if defined(OLD)
        breakTree->Init(&m_trie, &m_trie_trigram);

        assert(mode != WB_INDEX);   // If this assert come up, use function IndexWordBreak

        iBreakIndex = FindWordBreak((DWORD_PTR)breakTree,wzString,iStringLen,pBreakPos,iBreakMax,mode,fFastWordBreak,0);
        delete breakTree;
#else
        iBreakIndex = FindWordBreak((DWORD_PTR)breakTree,wzString,iStringLen,pBreakPos,iBreakMax,mode,fFastWordBreak,0);
        listWordBreak.MarkWordBreak(pWordBreakElement,true);    // Mark word break as free.
#endif
    }
    else
    {
        // No usable break tree: report zero breaks.
        assert(false);
    }

    return iBreakIndex;
}
//+---------------------------------------------------------------------------
//
//  Class:      CThaiWordBreak
//
//  Synopsis:   This function segments Thai text depending on the mode
//              specified, using a caller-supplied break tree.
//
//      WB_LINEBREAK  - is used when the application needs to break for line
//                      wrapping; this mode takes punctuation into
//                      consideration. The value 2 selects the same path for
//                      compatibility with the old api.
//
//      WB_NORMAL     - is used when the application wants to determine words
//                      for searching, autocorrect, etc. Also the fallback
//                      for any unrecognised mode.
//
//      WB_CARETBREAK - same loop as WB_NORMAL, but punctuation is skipped
//                      with delimiters and a leading space run is merged
//                      into the previous break.
//
//      WB_SPELLER    - not yet implemented, but same as normal with
//                      additional soundex rules.
//
//      WB_INDEX      - is used when the application wants to do Thai
//                      indexing. Requires pThwb_Struct.
//
//  Arguments:
//      dwBreaker      - CThaiBreakTree* cast to DWORD_PTR (see
//                       CreateWordBreaker).                            (in)
//      wzString       - input string.                                 (in)
//      iStringLen     - input string length.                          (in)
//      pBreakPos      - array of break positions; each entry is the
//                       length in characters of one segment.          (out)
//      iBreakMax      - length of pBreakPos (and of pThwb_Struct in
//                       WB_INDEX mode); must be greater than 0.       (in)
//      mode           - either WB_LINEBREAK, etc.                     (in)
//      fFastWordBreak - true for fast algorithm; only read when
//                       NGRAM_ENABLE is defined.                      (in)
//      pThwb_Struct   - array structure of THWB; written only in
//                       WB_INDEX mode.                                (out)
//
//  Returns:    Number of entries written to pBreakPos. 0 when wzString,
//              pBreakPos or the break tree is NULL, when iBreakMax is 0, or
//              when mode is WB_INDEX and pThwb_Struct is NULL.
//
//  History:    created 11/99 aarayas
//
//  Notes:      No character at or beyond wzString + iStringLen is read:
//              every scanning loop tests the bound before dereferencing.
//
//----------------------------------------------------------------------------
int CThaiWordBreak::FindWordBreak(DWORD_PTR dwBreaker, WCHAR* wzString,unsigned int iStringLen, BYTE* pBreakPos,unsigned int iBreakMax, BYTE mode, bool fFastWordBreak, THWB_STRUCT* pThwb_Struct)
{
    CThaiBreakTree* breakTree = (CThaiBreakTree*) dwBreaker;

    // check for possible invalid arguments.
    assert(wzString != NULL);
    assert(iBreakMax > 0);
    assert(pBreakPos != NULL);
    assert(breakTree != NULL);

    if ((wzString == NULL) || (iBreakMax == 0) || (pBreakPos == NULL) || (breakTree == NULL))
        return 0;

    // Declare and Initialize all local variables (after validation, so no
    // pointer arithmetic is done on a NULL wzString).
    WCHAR*          pwszRunStart = wzString;                // first character not yet accounted for.
    const WCHAR*    pwszMax = wzString + iStringLen;        // one past the last valid character.
    WCHAR*          pwch = wzString;                        // scanning cursor.
    bool            fThaiRun = true;
    bool            fCaretBreak = false;
    int             iRunCount = 0;                          // length of the run being scanned.
    unsigned int    i = 0;
    unsigned int    iBreakIndex = 0;                        // Contain number of Breaks.
    merge_direction dirPrevious = NO_MERGE;
    merge_direction dirCurrent = NO_MERGE;

    switch (mode)
    {
    case WB_LINEBREAK:
    case 2:    // to be compatible with old api.
        do
        {
            // Consume punctuation and delimiters one character at a time,
            // attaching each to the break on its left or starting a new one.
            while (pwch < pwszMax &&
                   iBreakIndex < iBreakMax &&
                   (TWB_IsCharPunctW(*pwch) || TWB_IsCharWordDelimW(*pwch)))
            {
                dirCurrent = DetermineMergeDirection(*pwch);
                switch (dirCurrent)
                {
                case NO_MERGE:
                    if ( pwch + 1 < pwszMax && *(pwch + 1) == THAI_Vowel_MaiYaMok && iBreakIndex > 0)
                    {
                        // Mai Ya Mok case only: the space and the Mai Ya Mok
                        // both join the previous break.
                        pBreakPos[iBreakIndex - 1] += 2;
                        dirCurrent = MERGE_LEFT;
                        pwch++;
                    }
                    else
                        pBreakPos[iBreakIndex++] = 1;
                    break;

                case MERGE_RIGHT:
                    if (dirPrevious == MERGE_RIGHT)
                    {
                        // A MERGE_RIGHT predecessor always created or
                        // extended a break, so there is one to extend.
                        assert(iBreakIndex != 0);
                        pBreakPos[iBreakIndex - 1]++;
                    }
                    else
                        pBreakPos[iBreakIndex++] = 1;
                    break;

                case NOT_SURE_WHICH_DIRECTION:
                    if (iBreakIndex == 0 ||                         // nothing to merge into yet.
                        pwch == wzString ||                         // if pwch is first character.
                        TWB_IsCharWordDelimW(*(pwch - 1)) )         // if previous character is delimiter.
                    {
                        pBreakPos[iBreakIndex++] = 1;
                        dirCurrent = MERGE_RIGHT;
                    }
                    else
                    {
                        pBreakPos[iBreakIndex - 1]++;
                        dirCurrent = MERGE_LEFT;
                    }
                    break;

                case MERGE_LEFT:
                default:
                    if (iBreakIndex == 0)
                        // No earlier break exists to merge into; open one so
                        // the character is counted.
                        pBreakPos[iBreakIndex++] = 1;
                    else
                        pBreakPos[iBreakIndex - 1]++;
                    break;
                }
                dirPrevious = dirCurrent;
                pwch++;
                pwszRunStart = pwch;
            }

            assert(pwszRunStart == pwch);

            if( iBreakIndex >= iBreakMax || pwch >= pwszMax)
                break;

            // Detect if this is a Thai Run.
            fThaiRun = IsThaiChar(*pwch);

            // Extend the run while the script stays the same, stopping at a
            // NUL, a word delimiter, the end of input or the run-length
            // limit. Comma (0x2c) and full stop (0x2e) never end a run.
            do
            {
                pwch++;
                iRunCount++;
            } while (pwch < pwszMax &&
                     iRunCount < (MAXBREAK - 2) &&
                     ( (IsThaiChar(*pwch)==fThaiRun &&
                        *pwch &&
                        !TWB_IsCharWordDelimW(*pwch)) ||
                       *pwch == 0x2c || *pwch == 0x2e ));

            if (fThaiRun)
            {
                unsigned int iBreak = breakTree->TrigramBreak(pwszRunStart,pwch);
                for (i=0; i < iBreak && iBreakIndex <iBreakMax; i++)
                {
                    // First Thai character of the run.
                    if (dirPrevious == MERGE_RIGHT)
                    {
                        assert(iBreakIndex != 0);
                        pBreakPos[iBreakIndex - 1] += breakTree->breakArray[i];
                    }
                    else
                        pBreakPos[iBreakIndex++] = breakTree->breakArray[i];
                    dirPrevious = NO_MERGE;
                }
            }
            else
            {
                // Not a Thai Run simply put the whole thing in the break array.
                assert(pwch > pwszRunStart);            // pwch must be greater than pwszRunStart, since we just walk.
                // NOTE(review): unlike the Thai branch above, dirPrevious is
                // not reset here, so a MERGE_RIGHT state survives this run -
                // confirm whether that is intended.
                if (dirPrevious == MERGE_RIGHT)
                {
                    assert(iBreakIndex != 0);
                    pBreakPos[iBreakIndex - 1] += (BYTE) (pwch - pwszRunStart);
                }
                else
                    pBreakPos[iBreakIndex++] = (BYTE) (pwch - pwszRunStart);
            }
            iRunCount = 0;
            pwszRunStart = pwch;

        // Make sure we haven't pass iBreakMax define by user else return whatever we got.
        } while(iBreakIndex < iBreakMax && pwch < pwszMax);
        break;

    case WB_INDEX:
        // Make sure argument is the same.
        assert(pThwb_Struct != NULL);
        if (pThwb_Struct == NULL)
            return 0;

        do
        {
            // A run of word delimiters becomes one break of its own.
            while (pwch < pwszMax && TWB_IsCharWordDelimW(*pwch))
                pwch++;

            if( pwszRunStart < pwch)
            {
                // Give the delimiter break a defined THWB entry, matching
                // what the non-Thai branch below writes.
                pThwb_Struct[iBreakIndex].fThai = false;
                pThwb_Struct[iBreakIndex].alt = 0;
                // NOTE(review): delimiter runs are not capped by MAXBREAK,
                // so this BYTE cast can truncate a very long run - confirm.
                pBreakPos[iBreakIndex++] = (BYTE)(pwch - pwszRunStart);
                pwszRunStart = pwch;
            }

            if( iBreakIndex >= iBreakMax || pwch >= pwszMax)
                break;

            // Detect if this is a Thai Run.
            fThaiRun = IsThaiChar(*pwch);    //TODO: Add comma and period to Thai range.

            // Same run scan as in WB_LINEBREAK.
            do
            {
                pwch++;
                iRunCount++;
            } while (pwch < pwszMax &&
                     iRunCount < (MAXBREAK - 2) &&
                     ( (IsThaiChar(*pwch)==fThaiRun &&
                        *pwch &&
                        !TWB_IsCharWordDelimW(*pwch)) ||
                       *pwch == 0x2c || *pwch == 0x2e ));

            if (fThaiRun)
            {
                unsigned int iBreak = breakTree->TrigramBreak(pwszRunStart,pwch);
                for (i=0; i < iBreak && iBreakIndex <iBreakMax; i++)
                {
                    pThwb_Struct[iBreakIndex].fThai = true;
                    pThwb_Struct[iBreakIndex].alt = ExtractALT(breakTree->tagArray[i]);
                    pBreakPos[iBreakIndex++] = breakTree->breakArray[i];
                }
            }
            else
            {
                // Not a Thai Run simply put the whole thing in the break array.
                assert(pwch > pwszRunStart);            // pwch must be greater than pwszRunStart, since we just walk.
                pThwb_Struct[iBreakIndex].fThai = false;
                pThwb_Struct[iBreakIndex].alt = 0;
                pBreakPos[iBreakIndex++] = (BYTE)(pwch - pwszRunStart);
            }
            iRunCount = 0;
            pwszRunStart = pwch;

        // Make sure we haven't pass iBreakMax define by user else return whatever we got.
        } while(iBreakIndex < iBreakMax && pwch < pwszMax);
        break;

    case WB_CARETBREAK:
        fCaretBreak = true;
        // Intentional fall through: caret mode shares the WB_NORMAL loop.
    case WB_NORMAL:
    default:
        do
        {
            while (pwch < pwszMax && TWB_IsCharWordDelimW(*pwch))
                pwch++;

            if (fCaretBreak)
            {
                // 010.181686. Taking care of punctuation.
                while (pwch < pwszMax && TWB_IsCharPunctW(*pwch))
                    pwch++;
            }

            if( pwszRunStart < pwch)
            {
                if (fCaretBreak && *pwszRunStart == L' ' && iBreakIndex > 0)
                {
                    // 010.182719. For the MaiYaMok case we only accept if
                    // space follow by MaiYaMok
                    if (pwch < pwszMax &&
                        *pwch == THAI_Vowel_MaiYaMok &&
                        wzString < (pwszRunStart-1) &&
                        IsThaiChar(*(pwszRunStart-1)) &&
                        pwch == (pwszRunStart+1) )
                    {
                        pBreakPos[iBreakIndex - 1] += 2;
                        pwch++;
                    }
                    else
                        // This is a caret movement features, should merge space to
                        // the right words.
                        pBreakPos[iBreakIndex - 1] += (BYTE)(pwch - pwszRunStart);
                }
                else
                    // NOTE(review): skipped runs are not capped by MAXBREAK,
                    // so this BYTE cast can truncate a very long run - confirm.
                    pBreakPos[iBreakIndex++] = (BYTE)(pwch - pwszRunStart);
                pwszRunStart = pwch;
            }

            if( iBreakIndex >= iBreakMax || pwch >= pwszMax)
                break;

            // Detect if this is a Thai Run.
            fThaiRun = IsThaiChar(*pwch);    //TODO: Add comma and period to Thai range.

            if (!fCaretBreak)
            {
                // Same run scan as in WB_LINEBREAK.
                do
                {
                    pwch++;
                    iRunCount++;
                } while (pwch < pwszMax &&
                         iRunCount < (MAXBREAK - 2) &&
                         ( (IsThaiChar(*pwch)==fThaiRun &&
                            *pwch &&
                            !TWB_IsCharWordDelimW(*pwch)) ||
                           *pwch == 0x2c || *pwch == 0x2e ));
            }
            else
            {
                // Caret mode additionally ends a run at punctuation, except
                // for comma and full stop which still extend it.
                do
                {
                    pwch++;
                    iRunCount++;
                } while (pwch < pwszMax &&
                         iRunCount < (MAXBREAK - 2) &&
                         ( (IsThaiChar(*pwch)==fThaiRun &&
                            *pwch &&
                            !TWB_IsCharWordDelimW(*pwch) &&
                            !TWB_IsCharPunctW(*pwch)) ||
                           *pwch == 0x2c || *pwch == 0x2e ));
            }

            if (fThaiRun)
            {
#if defined (NGRAM_ENABLE)
                // NOTE(review): this path uses `breakTree.maxToken` although
                // breakTree is a local pointer in this function; it looks
                // like it cannot compile as written - confirm before
                // enabling NGRAM_ENABLE.
                if (!fFastWordBreak)
                {
                    if (WordBreak(pwszRunStart,pwch))
                        for (i=0; i < breakTree.maxToken && iBreakIndex <iBreakMax; i++)
                            pBreakPos[iBreakIndex++] = breakTree->maximalMatchingBreakArray[i];
                }
                else
                {
                    unsigned int iBreak = breakTree->TrigramBreak(pwszRunStart,pwch);
                    for (i=0; i < iBreak && iBreakIndex <iBreakMax; i++)
                        pBreakPos[iBreakIndex++] = breakTree->breakArray[i];
                }
#else
                unsigned int iBreak = breakTree->TrigramBreak(pwszRunStart,pwch);
                for (i=0; i < iBreak && iBreakIndex <iBreakMax; i++)
                    pBreakPos[iBreakIndex++] = breakTree->breakArray[i];
#endif
            }
            else
            {
                // Not a Thai Run simply put the whole thing in the break array.
                assert(pwch > pwszRunStart);            // pwch must be greater than pwszRunStart, since we just walk.
                pBreakPos[iBreakIndex++] = (BYTE)(pwch - pwszRunStart);
            }
            iRunCount = 0;
            pwszRunStart = pwch;

        // Make sure we haven't pass iBreakMax define by user else return whatever we got.
        } while(iBreakIndex < iBreakMax && pwch < pwszMax);
        break;
    }

#if defined (_DEBUG)
    // Debug-only consistency check: unless the output was cut short by
    // iBreakMax, the break lengths must add up to the input length.
    unsigned int iTotalChar = 0;
    for (i = 0; i < iBreakIndex; i++)
    {
        iTotalChar += pBreakPos[i];
    }
    if (iBreakIndex < iBreakMax)
        assert(iStringLen == iTotalChar);
#endif

    return iBreakIndex;
}
//+---------------------------------------------------------------------------
//
//  Class:      CThaiWordBreak
//
//  Synopsis:   Builds the break tree for [pszBegin, pszEnd) and runs
//              maximal matching on it. Compiled only when NGRAM_ENABLE is
//              defined.
//
//  Arguments:
//      pszBegin - first character of the run.                         (in)
//      pszEnd   - end of the run; must be greater than pszBegin.      (in)
//
//  Returns:    Non-zero when breakTree.maxToken is greater than 0 after
//              maximal matching, otherwise zero.
//
//  History:    created 7/99 aarayas
//
//  Notes:      `breakTree` is not declared in this function and is used
//              with member-access syntax; presumably it is a
//              CThaiWordBreak data member - confirm in cthwb.hpp.
//
//----------------------------------------------------------------------------
#if defined (NGRAM_ENABLE)
BOOL CThaiWordBreak::WordBreak(WCHAR* pszBegin, WCHAR* pszEnd)
{
    assert(pszBegin < pszEnd);            // Make sure pszEnd is at least greater pszBegin.

    breakTree.GenerateTree(pszBegin, pszEnd);
    breakTree.MaximalMatching();

    return (breakTree.maxToken > 0);
}
#endif
//+---------------------------------------------------------------------------
//
//  Class:      CThaiWordBreak
//
//  Synopsis:   Looks a string up in the dictionary trie by forwarding to
//              m_trie.Find().
//
//  Arguments:
//      wzString - string to look up; forwarded unchanged.             (in)
//      pdwPOS   - forwarded unchanged to m_trie.Find().
//
//  Returns:    Whatever m_trie.Find() returns, as a BOOL.
//
//  History:    created 7/99 aarayas
//
//----------------------------------------------------------------------------
BOOL CThaiWordBreak::Find(const WCHAR* wzString, DWORD* pdwPOS)
{
    // Pure delegation; no state of this object is touched here.
    const BOOL fFound = m_trie.Find(wzString, pdwPOS);
    return fFound;
}