Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

503 lines
13 KiB

// IWBreak.cpp
//
// CWordBreak implementation
//
// Copyright 2000 Microsoft Corp.
//
// Modification History:
// 18 APR 2000 bhshin added WordBreak destructor
// 30 MAR 2000 bhshin created
#include "StdAfx.h"
#include "KorWbrk.h"
#include "IWBreak.h"
#include "Lex.h"
#include "Token.h"
#include "Record.h"
#include "Analyze.h"
#include "IndexRec.h"
#include "unikor.h"
#include "Morpho.h"
extern CRITICAL_SECTION g_CritSect;
extern MAPFILE g_LexMap;
extern BOOL g_fLoaded;
/////////////////////////////////////////////////////////////////////////////
// CWordBreaker member functions
// CWordBreaker::Init
//
// initialize WordBreaker object & lexicon
//
// Parameters:
// fQuery -> (BOOL) query time flag
// ulMaxTokenSize -> (ULONG) maximum input token length
// *pfLicense <- (BOOL*) always return TRUE
//
// Result:
// (HRESULT)
//
// 30MAR00 bhshin began
STDMETHODIMP CWordBreaker::Init(BOOL fQuery, ULONG ulMaxTokenSize, BOOL *pfLicense)
{
    // pfLicense is an out-parameter: refuse a missing or non-writable
    // pointer before any member state is modified.
    if (pfLicense == NULL || IsBadWritePtr(pfLicense, sizeof(DWORD)))
    {
        return E_INVALIDARG;
    }

    // keep the caller's settings; WordBreak reads both members later
    m_fQuery = fQuery;
    m_ulMaxTokenSize = ulMaxTokenSize;

    // the license flag is reported as TRUE unconditionally
    *pfLicense = TRUE;

    // The lexicon map is a global; it is loaded by the first Init call
    // that finds g_fLoaded still unset.
    if (!g_fLoaded)
    {
        ATLTRACE(L"Load lexicon...\r\n");

        if (InitLexicon(&g_LexMap))
        {
            g_fLoaded = TRUE;
        }
        else
        {
            return LANGUAGE_E_DATABASE_NOT_FOUND;
        }
    }

    // give this instance's parse info its own copy of the lexicon map
    m_PI.lexicon = g_LexMap;

    WB_LOG_PRINT_HEADER(fQuery);

    return S_OK;
}
// CWordBreaker::BreakText
//
// main word breaking method
//
// Parameters:
// pTextSource -> (TEXT_SOURCE*) pointer to the structure of source text
// pWordSink -> (IWordSink*) pointer to the word sink
// pPhraseSink -> (IPhraseSink*) pointer to the phrase sink
//
// Result:
// (HRESULT)
//
// 30MAR00 bhshin began
STDMETHODIMP CWordBreaker::BreakText(TEXT_SOURCE *pTextSource, IWordSink *pWordSink, IPhraseSink *pPhraseSink)
{
    // Walks the text source token by token, handing each token to
    // WordBreak and advancing pTextSource->iCur by the length it reports.
    //
    // Returns:
    //   E_INVALIDARG  - pTextSource is NULL
    //   E_UNEXPECTED  - WordBreak failed or consumed no characters
    //   S_OK          - otherwise (including "no word sink" and "empty buffer")
    WT Type;
    int cchTextProcessed, cchProcessed, cchHanguel;
    WCHAR wchLast = L'\0';   // last character of the previous token, shared across WordBreak calls

    if (pTextSource == NULL)
        return E_INVALIDARG;

    // nothing to report to, or nothing to read: succeed without doing any work
    if (pWordSink == NULL)
        return S_OK;

    if (pTextSource->iCur == pTextSource->iEnd)
        return S_OK;

    ATLASSERT(pTextSource->iCur < pTextSource->iEnd);

    // Phase 1: consume the current buffer, then ask the source for more
    // text; repeat until the refill callback reports failure.
    do
    {
        while (pTextSource->iCur < pTextSource->iEnd)
        {
            // NOTE(review): the TRUE argument presumably tells Tokenize that
            // more text may follow, so it can answer WT_REACHEND instead of
            // cutting a token at the buffer edge - confirm against Tokenize.
            Tokenize(TRUE, pTextSource, pTextSource->iCur, &Type, &cchTextProcessed, &cchHanguel);

            if (Type == WT_REACHEND)
                break;

            cchProcessed = WordBreak(pTextSource, Type, cchTextProcessed, cchHanguel, pWordSink, pPhraseSink, &wchLast);

            // A result of 0 would leave iCur unchanged and spin this loop
            // forever, so "no progress" is rejected together with the
            // explicit -1 error.
            if (cchProcessed <= 0)
                return E_UNEXPECTED;

            pTextSource->iCur += cchProcessed;
        }

    } while (SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)));

    // Phase 2: no more text is coming; flush whatever is still in the buffer.
    while ( pTextSource->iCur < pTextSource->iEnd )
    {
        Tokenize(FALSE, pTextSource, pTextSource->iCur, &Type, &cchTextProcessed, &cchHanguel);

        cchProcessed = WordBreak(pTextSource, Type, cchTextProcessed, cchHanguel, pWordSink, pPhraseSink, &wchLast);

        // same forward-progress guard as in phase 1
        if (cchProcessed <= 0)
            return E_UNEXPECTED;

        pTextSource->iCur += cchProcessed;
    }

    return S_OK;
}
// CWordBreaker::ComposePhrase
//
// convert a noun and modifier back into a source phrase (NOT USED)
//
// Parameters:
// pwcNoun -> (const WCHAR*) input noun
// cwcNoun -> (ULONG) length of input noun
// pwcModifier -> (const WCHAR *) input modifier
// cwcModifier -> (ULONG) length of input modifier
// ulAttachmentType -> (ULONG) value about the method of composition
// pwcPhrase -> (WCHAR *) pointer to the returned buffer
// pcwcPhrase -> (ULONG *) length of returned string
//
// Result:
// (HRESULT)
//
// 30MAR00 bhshin began
STDMETHODIMP CWordBreaker::ComposePhrase(const WCHAR *pwcNoun, ULONG cwcNoun, const WCHAR *pwcModifier, ULONG cwcModifier, ULONG ulAttachmentType, WCHAR *pwcPhrase, ULONG *pcwcPhrase)
{
    // No phrase is ever composed: the arguments are not inspected and the
    // output buffer is left untouched. Only the reported error differs,
    // depending on whether this instance was initialized for query time.
    return m_fQuery ? E_NOTIMPL : WBREAK_E_QUERY_ONLY;
}
// CWordBreaker::GetLicenseToUse
//
// return license information
//
// Parameters:
// ppwcsLicense -> (const WCHAR **) output pointer to the license information
//
// Result:
// (HRESULT)
//
// 30MAR00 bhshin began
STDMETHODIMP CWordBreaker::GetLicenseToUse(const WCHAR ** ppwcsLicense)
{
    // Hands back a pointer to a static copyright string; the caller must
    // not free or modify it.
    //
    // Returns E_INVALIDARG when ppwcsLicense is NULL or not writable,
    // S_OK otherwise.
    static WCHAR const * wcsCopyright = L"Copyright Microsoft, 1991-2000";

    if (ppwcsLicense == NULL)
        return E_INVALIDARG;

    // Probe the whole slot that is about to be written. The slot holds a
    // pointer, which is wider than a DWORD on 64-bit builds, so the size is
    // taken from the pointee type instead of being hard-coded.
    if (IsBadWritePtr(ppwcsLicense, sizeof(*ppwcsLicense)))
        return E_INVALIDARG;

    *ppwcsLicense = wcsCopyright;

    return S_OK;
}
// CWordBreaker::WordBreak
//
// main hangul word breaking operator
//
// Parameters:
// pTextSource -> (TEXT_SOURCE*) pointer to the structure of source text
// Type -> (WT) word token type
// cchTextProcessed -> (int) input length to process
// cchHanguel -> (int) hangul token length (hanguel+romaji case only)
// pWordSink -> (IWordSink*) pointer to the word sink
// pPhraseSink -> (IPhraseSink*) pointer to the phrase sink
// pwchLast -> (WCHAR*) input & output last character of previous token
//
// Result:
// (int) -1 if an error occurs, otherwise the number of characters processed
//
// 30MAR00 bhshin began
int CWordBreaker::WordBreak(TEXT_SOURCE *pTextSource, WT Type,
                            int cchTextProcessed, int cchHanguel,
                            IWordSink *pWordSink, IPhraseSink *pPhraseSink,
                            WCHAR *pwchLast)
{
    // Breaks the single token that starts at pTextSource->iCur and reports
    // the resulting words to the sinks. Returns the number of characters
    // consumed (always > 0) or -1 on failure. For a romaji token the next
    // token is inspected as well, so the returned length can be larger
    // than cchTextProcessed.
    const WCHAR *pwcStem;
    int iCur;
    int cchToken, cchProcessed, cchHg;
    int cchPrefix;
    int cchMaxToken = MAX_INDEX_STRING;   // effective token length limit, see below

    ATLASSERT(cchTextProcessed > 0);

    if (cchTextProcessed <= 0)
        return -1;

    iCur = pTextSource->iCur;
    pwcStem = pTextSource->awcBuffer + iCur;
    cchProcessed = cchTextProcessed;
    cchToken = cchTextProcessed;

    // check too long token
    //
    // The limit is the smaller of the caller's maximum token size and
    // MAX_INDEX_STRING. It is computed in unsigned arithmetic first so that
    // a maximum above INT_MAX cannot turn negative when narrowed to int
    // (which would make every token look too long and emit more characters
    // than the token holds).
    if (m_ulMaxTokenSize < (ULONG)MAX_INDEX_STRING)
        cchMaxToken = (int)m_ulMaxTokenSize;

    if (cchToken > cchMaxToken)
    {
        // A limit of zero allows no word at all; returning 0 here would
        // make the caller loop without advancing, so report an error.
        if (cchMaxToken <= 0)
            return -1;

        // emit only the leading part that fits and consume just that part
        cchProcessed = cchMaxToken;

        pWordSink->PutWord(cchProcessed,
                           pwcStem,
                           cchProcessed,
                           pTextSource->iCur);

        return cchProcessed;
    }

    //=================================================
    // query & index time
    //=================================================

    if (Type == WT_PHRASE_SEP)
    {
        // phrase separator
        *pwchLast = L'\0';

        pWordSink->PutBreak(WORDREP_BREAK_EOS);
    }
    else if (Type == WT_WORD_SEP)
    {
        // only a non-whitespace separator resets the remembered last character
        if (!fIsWhiteSpace(*pwcStem))
            *pwchLast = L'\0';

        // Korean WB do not add EOW.
    }
    else if (Type == WT_ROMAJI)
    {
        // symbol, alphabet, hanja, romaji + hanguel

        // get next token
        iCur += cchToken;
        Tokenize(FALSE, pTextSource, iCur, &Type, &cchToken, &cchHg);

        // from here on: Type/cchToken/cchHg describe the FOLLOWING token,
        // cchProcessed/cchHanguel still describe the current romaji token

        if (Type == WT_ROMAJI)
        {
            if (cchHg > 0)
            {
                // romaji+(hanguel+romaji) case -> put word itself
                cchProcessed += cchToken;
                iCur += cchToken;
                cchProcessed += GetWordPhrase(FALSE, pTextSource, iCur);

                WB_LOG_START(pwcStem, cchProcessed);

                pWordSink->PutWord(cchProcessed,
                                   pwcStem,
                                   cchProcessed,
                                   pTextSource->iCur);

                WB_LOG_ADD_INDEX(pwcStem, cchProcessed, INDEX_SYMBOL);
            }
            else
            {
                WB_LOG_START(pwcStem, cchProcessed);

                // {romaji}{romaji} case : -> breaking first {romaji}
                CIndexInfo IndexInfo;

                if (!IndexInfo.Initialize(cchProcessed, pTextSource->iCur, pWordSink, pPhraseSink))
                    goto ErrorReturn;

                AnalyzeRomaji(pwcStem, cchProcessed, pTextSource->iCur, cchProcessed,
                              cchHanguel, &IndexInfo, &cchPrefix);

                if (m_fQuery)
                {
                    // the query index spans this token plus the following
                    // romaji token, although only this token is consumed
                    IndexInfo.AddIndex(pwcStem, cchProcessed+cchToken, WEIGHT_HARD_MATCH, 0, cchProcessed+cchToken-1);
                    WB_LOG_ADD_INDEX(pwcStem, cchProcessed, INDEX_QUERY);

                    if (!IndexInfo.PutQueryIndexList())
                        goto ErrorReturn;
                }
                else
                {
                    if (!IndexInfo.PutFinalIndexList(pTextSource->awcBuffer + pTextSource->iCur))
                        goto ErrorReturn;
                }
            }
        }
        else if (Type == WT_HANGUEL)
        {
            // romaji(hanguel+romaji) + hanguel case
            WCHAR wzRomaji[MAX_INDEX_STRING+1];
            int cchRomaji;

            // keep a NUL-terminated copy of the leading romaji part
            cchRomaji = (cchProcessed > MAX_INDEX_STRING) ? MAX_INDEX_STRING : cchProcessed;

            wcsncpy(wzRomaji, pwcStem, cchRomaji);
            wzRomaji[cchRomaji] = L'\0';

            WB_LOG_START(pwcStem, cchProcessed+cchToken);

            // both tokens are consumed together
            cchProcessed += cchToken;

            // start position include romaji
            CIndexInfo IndexInfo;

            if (!IndexInfo.Initialize(cchProcessed, pTextSource->iCur, pWordSink, pPhraseSink))
                goto ErrorReturn;

            if (cchHanguel > 0)
            {
                AnalyzeRomaji(pwcStem, cchRomaji, pTextSource->iCur, cchRomaji,
                              cchHanguel, &IndexInfo, &cchPrefix);
            }
            else
            {
                // pure romaji: only the URL prefix length is needed
                cchPrefix = CheckURLPrefix(pwcStem, cchProcessed-cchToken);
            }

            // analyze string starts from last hangul
            pwcStem = pTextSource->awcBuffer + iCur;

            if (cchRomaji > 0)
                IndexInfo.SetRomajiInfo(wzRomaji, cchRomaji, cchPrefix);

            // analyze string always with indexing mode on symbol processing
            if (!AnalyzeString(&m_PI, m_fQuery, pwcStem, cchToken, iCur, &IndexInfo, *pwchLast))
                goto ErrorReturn;

            if (m_fQuery)
            {
                if (cchRomaji > 0)
                    IndexInfo.SetRomajiInfo(NULL, 0, 0);

                // NOTE(review): cchProcessed already includes cchToken here,
                // so the last argument (cchProcessed+cchToken-1) lies past
                // the end of the indexed string; every other AddIndex call in
                // this file passes start+length-1. Possibly it should be
                // cchProcessed-1 - left unchanged pending confirmation.
                IndexInfo.AddIndex(pTextSource->awcBuffer + pTextSource->iCur, cchProcessed, WEIGHT_HARD_MATCH, 0, cchProcessed+cchToken-1);
                WB_LOG_ADD_INDEX(pTextSource->awcBuffer + pTextSource->iCur, cchProcessed, INDEX_QUERY);

                if (!IndexInfo.PutQueryIndexList())
                    goto ErrorReturn;
            }
            else
            {
                if (!IndexInfo.MakeSingleLengthMergedIndex())
                    goto ErrorReturn;

                if (!IndexInfo.PutFinalIndexList(pTextSource->awcBuffer + pTextSource->iCur))
                    goto ErrorReturn;
            }

            // remember the last hangul character for the next call
            *pwchLast = *(pwcStem + cchToken - 1);
        }
        else // next: WT_START, WT_PHRASE_SEP, WT_WORD_SEP, WT_REACHEND
        {
            WB_LOG_START(pwcStem, cchProcessed);

            CIndexInfo IndexInfo;

            if (!IndexInfo.Initialize(cchProcessed, pTextSource->iCur, pWordSink, pPhraseSink))
                goto ErrorReturn;

            AnalyzeRomaji(pwcStem, cchProcessed, pTextSource->iCur, cchProcessed,
                          cchHanguel, &IndexInfo, &cchPrefix);

            if (m_fQuery)
            {
                IndexInfo.AddIndex(pwcStem, cchProcessed, WEIGHT_HARD_MATCH, 0, cchProcessed-1);
                WB_LOG_ADD_INDEX(pwcStem, cchProcessed, INDEX_QUERY);

                if (!IndexInfo.PutQueryIndexList())
                    goto ErrorReturn;
            }
            else
            {
                if (!IndexInfo.PutFinalIndexList(pTextSource->awcBuffer + pTextSource->iCur))
                    goto ErrorReturn;
            }
        }
    }
    else if (Type == WT_HANGUEL)
    {
        // hangul input
        WB_LOG_START(pwcStem, cchProcessed);

        CIndexInfo IndexInfo;

        if (!IndexInfo.Initialize(cchProcessed, iCur, pWordSink, pPhraseSink))
            goto ErrorReturn;

        if (!AnalyzeString(&m_PI, m_fQuery, pwcStem, cchProcessed, iCur, &IndexInfo, *pwchLast))
            goto ErrorReturn;

        if (m_fQuery)
        {
            IndexInfo.AddIndex(pwcStem, cchProcessed, WEIGHT_HARD_MATCH, 0, cchProcessed-1);
            WB_LOG_ADD_INDEX(pwcStem, cchProcessed, INDEX_QUERY);

            if (!IndexInfo.PutQueryIndexList())
                goto ErrorReturn;
        }
        else
        {
            if (!IndexInfo.MakeSingleLengthMergedIndex())
                goto ErrorReturn;

            if (!IndexInfo.PutFinalIndexList(pwcStem))
                goto ErrorReturn;
        }

        // remember the last hangul character for the next call
        *pwchLast = *(pwcStem + cchProcessed - 1);
    }

    WB_LOG_PRINT_ALL();
    WB_LOG_END();

    return cchProcessed;

ErrorReturn:

    WB_LOG_END();

    return -1;
}
// CWordBreaker::AnalyzeRomaji
//
// helper function for romaji token wordbreaking
//
// Parameters:
// pwcStem -> (const WCHAR*) input token string
// cchStem -> (int) length of input romaji token
// iCur -> (int) source string position
// cchProcessed -> (int) input length to process
// cchHanguel -> (int) hangul token length (hanguel+romaji case only)
// pIndexInfo -> (CIndexInfo *) output index list
// pcchPrefix -> (int*) output prefix length
//
// Result:
// (void)
//
// 23NOV00 bhshin began
void CWordBreaker::AnalyzeRomaji(const WCHAR *pwcStem, int cchStem,
int iCur, int cchProcessed, int cchHanguel,
CIndexInfo *pIndexInfo, int *pcchPrefix)
{
int cchPrefix = 0;
// hanguel+romaji case
if (cchHanguel < cchProcessed)
{
// hanguel
if (cchHanguel > 0)
{
pIndexInfo->AddIndex(pwcStem, cchHanguel, WEIGHT_HARD_MATCH, 0, cchHanguel-1);
WB_LOG_ADD_INDEX(pwcStem, cchHanguel, INDEX_SYMBOL);
}
// romaji
if ((cchStem-cchHanguel) > 0)
{
pIndexInfo->AddIndex(pwcStem + cchHanguel, cchStem - cchHanguel, WEIGHT_HARD_MATCH, cchHanguel, cchStem-1);
WB_LOG_ADD_INDEX(pwcStem + cchHanguel, cchStem - cchHanguel, INDEX_SYMBOL);
}
}
if (cchHanguel == 1 || (cchStem-cchHanguel) == 1)
{
// romaji(hangul+romaji)
pIndexInfo->AddIndex(pwcStem, cchStem, WEIGHT_HARD_MATCH, 0, cchStem-1);
WB_LOG_ADD_INDEX(pwcStem, cchStem, INDEX_SYMBOL);
}
// check URL prefix
cchPrefix = CheckURLPrefix(pwcStem, cchProcessed);
if (cchPrefix > 0 && cchPrefix < cchProcessed)
{
pIndexInfo->AddIndex(pwcStem + cchPrefix, cchStem - cchPrefix, WEIGHT_HARD_MATCH, cchPrefix, cchStem-1);
WB_LOG_ADD_INDEX(pwcStem + cchPrefix, cchStem - cchPrefix, INDEX_SYMBOL);
}
*pcchPrefix = cchPrefix; // return it
}