mirror of https://github.com/tongzx/nt5src
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
503 lines
13 KiB
503 lines
13 KiB
// IWBreak.cpp
|
|
//
|
|
// CWordBreak implementation
|
|
//
|
|
// Copyright 2000 Microsoft Corp.
|
|
//
|
|
// Modification History:
|
|
// 18 APR 2000 bhshin added WordBreak destructor
|
|
// 30 MAR 2000 bhshin created
|
|
|
|
#include "StdAfx.h"
|
|
#include "KorWbrk.h"
|
|
#include "IWBreak.h"
|
|
#include "Lex.h"
|
|
#include "Token.h"
|
|
#include "Record.h"
|
|
#include "Analyze.h"
|
|
#include "IndexRec.h"
|
|
#include "unikor.h"
|
|
#include "Morpho.h"
|
|
|
|
// Process-wide state defined in another translation unit.
// g_CritSect is declared here but not referenced by the code visible in this file.
extern CRITICAL_SECTION g_CritSect;
|
|
// lexicon mapping: filled by InitLexicon() in Init() and copied into m_PI.lexicon
extern MAPFILE g_LexMap;
|
|
// set to TRUE by Init() once InitLexicon() has succeeded; later Init() calls skip the load
extern BOOL g_fLoaded;
|
|
|
|
/////////////////////////////////////////////////////////////////////////////
|
|
// CWordBreaker member functions
|
|
|
|
// CWordBreaker::Init
|
|
//
|
|
// initialize WordBreaker object & lexicon
|
|
//
|
|
// Parameters:
|
|
// fQuery -> (BOOL) query time flag
|
|
// ulMaxTokenSize -> (ULONG) maximum input token length
|
|
// *pfLicense <- (BOOL*) always return TRUE
|
|
//
|
|
// Result:
|
|
// (HRESULT)
|
|
//
|
|
// 30MAR00 bhshin began
|
|
// Records the caller's settings, reports the license flag and makes sure the
// shared lexicon has been loaded before this instance starts using it.
//
// Returns E_INVALIDARG for an unusable pfLicense pointer,
// LANGUAGE_E_DATABASE_NOT_FOUND when InitLexicon() fails, S_OK otherwise.
STDMETHODIMP CWordBreaker::Init(BOOL fQuery, ULONG ulMaxTokenSize, BOOL *pfLicense)
{
    // the license flag is a mandatory, writable out parameter
    if (pfLicense == NULL || IsBadWritePtr(pfLicense, sizeof(DWORD)))
        return E_INVALIDARG;

    // keep the initialization settings for the later BreakText calls
    m_ulMaxTokenSize = ulMaxTokenSize;
    m_fQuery = fQuery;

    *pfLicense = TRUE;

    // the lexicon is shared through g_LexMap; load it only the first time
    if (!g_fLoaded)
    {
        ATLTRACE(L"Load lexicon...\r\n");

        if (InitLexicon(&g_LexMap))
        {
            g_fLoaded = TRUE;
        }
        else
        {
            return LANGUAGE_E_DATABASE_NOT_FOUND;
        }
    }

    // hand the lexicon to this instance's parse info
    m_PI.lexicon = g_LexMap;

    WB_LOG_PRINT_HEADER(fQuery);

    return S_OK;
}
|
|
|
|
// CWordBreaker::BreakText
|
|
//
|
|
// main word breaking method
|
|
//
|
|
// Parameters:
|
|
// pTextSource -> (TEXT_SOURCE*) pointer to the structure of source text
|
|
// pWordSink -> (IWordSink*) pointer to the word sink
|
|
// pPhraseSink -> (IPhraseSink*) pointer to the phrase sink
|
|
//
|
|
// Result:
|
|
// (HRESULT)
|
|
//
|
|
// 30MAR00 bhshin began
|
|
// Walks the text source token by token, handing every token to WordBreak()
// and advancing pTextSource->iCur by the number of characters it consumed.
//
// The first phase runs while pfnFillTextBuffer() keeps succeeding: the buffer
// is tokenized with the first Tokenize() argument set to TRUE and the inner
// loop stops at WT_REACHEND so that the buffer can be refilled.
// (presumably TRUE means "more text may follow" - confirm against Tokenize)
// The second phase drains whatever is left once no more text can be fetched.
//
// Returns:
//   E_INVALIDARG  - pTextSource is NULL
//   E_UNEXPECTED  - WordBreak() failed or consumed no characters
//   S_OK          - otherwise (including a NULL pWordSink or an empty source)
STDMETHODIMP CWordBreaker::BreakText(TEXT_SOURCE *pTextSource, IWordSink *pWordSink, IPhraseSink *pPhraseSink)
{
    WT Type;
    int cchTextProcessed, cchProcessed, cchHanguel;
    WCHAR wchLast = L'\0';   // last character of the previous token, carried between WordBreak calls

    if (pTextSource == NULL)
        return E_INVALIDARG;

    // nothing to report to: treated as success, not as an error
    if (pWordSink == NULL)
        return S_OK;

    if (pTextSource->iCur == pTextSource->iEnd)
        return S_OK;

    ATLASSERT(pTextSource->iCur < pTextSource->iEnd);

    do
    {
        while (pTextSource->iCur < pTextSource->iEnd)
        {
            Tokenize(TRUE, pTextSource, pTextSource->iCur, &Type, &cchTextProcessed, &cchHanguel);

            // token runs into the end of the buffer: refill before breaking it
            if (Type == WT_REACHEND)
                break;

            cchProcessed = WordBreak(pTextSource, Type, cchTextProcessed, cchHanguel, pWordSink, pPhraseSink, &wchLast);

            // a non-positive count means either an error (-1) or no progress (0);
            // continuing with 0 would leave iCur unchanged and loop forever
            if (cchProcessed <= 0)
                return E_UNEXPECTED;

            pTextSource->iCur += cchProcessed;
        }

    } while (SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)));

    // no more text can be fetched: process the remainder of the buffer
    while (pTextSource->iCur < pTextSource->iEnd)
    {
        Tokenize(FALSE, pTextSource, pTextSource->iCur, &Type, &cchTextProcessed, &cchHanguel);

        cchProcessed = WordBreak(pTextSource, Type, cchTextProcessed, cchHanguel, pWordSink, pPhraseSink, &wchLast);

        // same guard as above: never continue without forward progress
        if (cchProcessed <= 0)
            return E_UNEXPECTED;

        pTextSource->iCur += cchProcessed;
    }

    return S_OK;
}
|
|
|
|
// CWordBreaker::ComposePhrase
|
|
//
|
|
// convert a noun and modifier back into a source phrase (NOT USED)
|
|
//
|
|
// Parameters:
|
|
// pwcNoun -> (const WCHAR*) input noun
|
|
// cwcNoun -> (ULONG) length of input noun
|
|
// pwcModifier -> (const WCHAR *) input modifier
|
|
// cwcModifier -> (ULONG) length of input modifier
|
|
// ulAttachmentType -> (ULONG) value about the method of composition
|
|
// pwcPhrase -> (WCHAR *) pointer to the returned buffer
|
|
// pcwcPhrase -> (ULONG *) length of returned string
|
|
//
|
|
// Result:
|
|
// (HRESULT)
|
|
//
|
|
// 30MAR00 bhshin began
|
|
// Phrase composition is not provided by this word breaker; none of the
// arguments are examined. The result only depends on the mode chosen in Init().
STDMETHODIMP CWordBreaker::ComposePhrase(const WCHAR *pwcNoun, ULONG cwcNoun, const WCHAR *pwcModifier, ULONG cwcModifier, ULONG ulAttachmentType, WCHAR *pwcPhrase, ULONG *pcwcPhrase)
{
    // query-time instance: not implemented; index-time instance: query only
    return m_fQuery ? E_NOTIMPL : WBREAK_E_QUERY_ONLY;
}
|
|
|
|
// CWordBreaker::GetLicenseToUse
|
|
//
|
|
// return license information
|
|
//
|
|
// Parameters:
|
|
// ppwcsLicense -> (const WCHAR **) output pointer to the license information
|
|
//
|
|
// Result:
|
|
// (HRESULT)
|
|
//
|
|
// 30MAR00 bhshin began
|
|
// Hands back a pointer to a static copyright string.
//
// Returns E_INVALIDARG when ppwcsLicense is NULL or not writable,
// S_OK after *ppwcsLicense has been set.
STDMETHODIMP CWordBreaker::GetLicenseToUse(const WCHAR ** ppwcsLicense)
{
    static WCHAR const * wcsCopyright = L"Copyright Microsoft, 1991-2000";

    if (ppwcsLicense == NULL)
        return E_INVALIDARG;

    // A pointer is written below, so the whole pointer-sized slot has to be
    // writable; sizeof(DWORD) would only cover half of it on 64-bit builds.
    if (IsBadWritePtr(ppwcsLicense, sizeof(*ppwcsLicense)))
        return E_INVALIDARG;

    *ppwcsLicense = wcsCopyright;

    return S_OK;
}
|
|
|
|
// CWordBreaker::WordBreak
|
|
//
|
|
// main hangul word breaking operator
|
|
//
|
|
// Parameters:
|
|
// pTextSource -> (TEXT_SOURCE*) pointer to the structure of source text
|
|
// Type -> (WT) word token type
|
|
// cchTextProcessed -> (int) input length to process
|
|
// cchHanguel -> (int) hangul token length (hanguel+romaji case only)
|
|
// pWordSink -> (IWordSink*) pointer to the word sink
|
|
// pPhraseSink -> (IPhraseSink*) pointer to the phrase sink
|
|
// pwchLast -> (WCHAR*) input & output last character of previous token
|
|
//
|
|
// Result:
|
|
// (int) number of characters consumed from the text source, or -1 if an error occurs
|
|
//
|
|
// 30MAR00 bhshin began
|
|
// Breaks the token that starts at pTextSource->iCur and reports the result
// through the sinks. The return value is the number of characters consumed
// (always > 0), which the caller adds to pTextSource->iCur; -1 signals failure.
//
// Handling by token type:
//   WT_PHRASE_SEP - emits an end-of-sentence break and resets *pwchLast
//   WT_WORD_SEP   - emits nothing; resets *pwchLast unless it is white space
//   WT_ROMAJI     - looks at the following token to decide whether the romaji
//                   run is indexed alone or together with what follows
//   WT_HANGUEL    - morphological analysis through AnalyzeString()
// Any other type falls through and only consumes cchTextProcessed characters.
int CWordBreaker::WordBreak(TEXT_SOURCE *pTextSource, WT Type,
                            int cchTextProcessed, int cchHanguel,
                            IWordSink *pWordSink, IPhraseSink *pPhraseSink,
                            WCHAR *pwchLast)
{
    const WCHAR *pwcStem;
    int iCur;
    int cchToken, cchProcessed, cchHg;
    int cchPrefix;
    ULONG cchLimit;

    ATLASSERT(cchTextProcessed > 0);

    if (cchTextProcessed <= 0)
        return -1;

    iCur = pTextSource->iCur;
    pwcStem = pTextSource->awcBuffer + iCur;
    cchProcessed = cchTextProcessed;
    cchToken = cchTextProcessed;

    // check too long token
    // The limit is computed and compared as unsigned: casting m_ulMaxTokenSize
    // to int turns values above INT_MAX negative, which would flag every token
    // as too long and emit MAX_INDEX_STRING characters even for shorter tokens.
    cchLimit = (m_ulMaxTokenSize < (ULONG)MAX_INDEX_STRING) ? m_ulMaxTokenSize : (ULONG)MAX_INDEX_STRING;

    if ((ULONG)cchToken > cchLimit)
    {
        // a zero limit cannot consume anything; report an error instead of
        // returning 0, which would stall the caller's loop
        if (cchLimit == 0)
            return -1;

        // emit the leading part as-is; the rest is tokenized on the next call
        cchProcessed = (int)cchLimit;

        pWordSink->PutWord(cchProcessed,
                           pwcStem,
                           cchProcessed,
                           pTextSource->iCur);

        return cchProcessed;
    }

    //=================================================
    // query & index time
    //=================================================

    if (Type == WT_PHRASE_SEP)
    {
        // phrase separator
        *pwchLast = L'\0';

        pWordSink->PutBreak(WORDREP_BREAK_EOS);
    }
    else if (Type == WT_WORD_SEP)
    {
        // white space keeps the previous token's last character
        if (!fIsWhiteSpace(*pwcStem))
            *pwchLast = L'\0';

        // Korean WB do not add EOW.
    }
    else if (Type == WT_ROMAJI)
    {
        // symbol, alphabet, hanja, romaji + hanguel

        // get next token
        iCur += cchToken;
        Tokenize(FALSE, pTextSource, iCur, &Type, &cchToken, &cchHg);

        if (Type == WT_ROMAJI)
        {
            if (cchHg > 0)
            {
                // romaji+(hanguel+romaji) case -> put word itself
                cchProcessed += cchToken;
                iCur += cchToken;
                cchProcessed += GetWordPhrase(FALSE, pTextSource, iCur);

                WB_LOG_START(pwcStem, cchProcessed);

                pWordSink->PutWord(cchProcessed,
                                   pwcStem,
                                   cchProcessed,
                                   pTextSource->iCur);

                WB_LOG_ADD_INDEX(pwcStem, cchProcessed, INDEX_SYMBOL);
            }
            else
            {
                WB_LOG_START(pwcStem, cchProcessed);

                // {romaji}{romaji} case : -> breaking first {romaji}
                CIndexInfo IndexInfo;

                if (!IndexInfo.Initialize(cchProcessed, pTextSource->iCur, pWordSink, pPhraseSink))
                    goto ErrorReturn;

                AnalyzeRomaji(pwcStem, cchProcessed, pTextSource->iCur, cchProcessed,
                              cchHanguel, &IndexInfo, &cchPrefix);

                if (m_fQuery)
                {
                    // query time: the index spans both romaji tokens, although
                    // only the first one is consumed by this call
                    IndexInfo.AddIndex(pwcStem, cchProcessed+cchToken, WEIGHT_HARD_MATCH, 0, cchProcessed+cchToken-1);
                    WB_LOG_ADD_INDEX(pwcStem, cchProcessed, INDEX_QUERY);

                    if (!IndexInfo.PutQueryIndexList())
                        goto ErrorReturn;
                }
                else
                {
                    if (!IndexInfo.PutFinalIndexList(pTextSource->awcBuffer + pTextSource->iCur))
                        goto ErrorReturn;
                }
            }
        }
        else if (Type == WT_HANGUEL)
        {
            // romaji(hanguel+romaji) + hanguel case
            WCHAR wzRomaji[MAX_INDEX_STRING+1];
            int cchRomaji;

            // keep a NUL-terminated copy of the leading romaji part
            cchRomaji = (cchProcessed > MAX_INDEX_STRING) ? MAX_INDEX_STRING : cchProcessed;

            wcsncpy(wzRomaji, pwcStem, cchRomaji);
            wzRomaji[cchRomaji] = L'\0';

            WB_LOG_START(pwcStem, cchProcessed+cchToken);

            // from here on cchProcessed covers romaji + hangul
            cchProcessed += cchToken;

            // start position includes romaji
            CIndexInfo IndexInfo;

            if (!IndexInfo.Initialize(cchProcessed, pTextSource->iCur, pWordSink, pPhraseSink))
                goto ErrorReturn;

            if (cchHanguel > 0)
            {
                AnalyzeRomaji(pwcStem, cchRomaji, pTextSource->iCur, cchRomaji,
                              cchHanguel, &IndexInfo, &cchPrefix);
            }
            else
            {
                // cchProcessed-cchToken is the length of the romaji part alone
                cchPrefix = CheckURLPrefix(pwcStem, cchProcessed-cchToken);
            }

            // analyze string starts from last hangul
            pwcStem = pTextSource->awcBuffer + iCur;

            if (cchRomaji > 0)
                IndexInfo.SetRomajiInfo(wzRomaji, cchRomaji, cchPrefix);

            // analyze string always with indexing mode on symbol processing
            if (!AnalyzeString(&m_PI, m_fQuery, pwcStem, cchToken, iCur, &IndexInfo, *pwchLast))
                goto ErrorReturn;

            if (m_fQuery)
            {
                if (cchRomaji > 0)
                    IndexInfo.SetRomajiInfo(NULL, 0, 0);

                // cchProcessed already contains cchToken at this point, so the
                // last position of the whole word is cchProcessed-1 (the same
                // length-1 convention every other AddIndex call here follows)
                IndexInfo.AddIndex(pTextSource->awcBuffer + pTextSource->iCur, cchProcessed, WEIGHT_HARD_MATCH, 0, cchProcessed-1);
                WB_LOG_ADD_INDEX(pTextSource->awcBuffer + pTextSource->iCur, cchProcessed, INDEX_QUERY);

                if (!IndexInfo.PutQueryIndexList())
                    goto ErrorReturn;
            }
            else
            {
                if (!IndexInfo.MakeSingleLengthMergedIndex())
                    goto ErrorReturn;

                if (!IndexInfo.PutFinalIndexList(pTextSource->awcBuffer + pTextSource->iCur))
                    goto ErrorReturn;
            }

            // remember the last hangul character for the next token
            *pwchLast = *(pwcStem + cchToken - 1);
        }
        else // next: WT_START, WT_PHRASE_SEP, WT_WORD_SEP, WT_REACHEND
        {
            WB_LOG_START(pwcStem, cchProcessed);

            CIndexInfo IndexInfo;

            if (!IndexInfo.Initialize(cchProcessed, pTextSource->iCur, pWordSink, pPhraseSink))
                goto ErrorReturn;

            AnalyzeRomaji(pwcStem, cchProcessed, pTextSource->iCur, cchProcessed,
                          cchHanguel, &IndexInfo, &cchPrefix);

            if (m_fQuery)
            {
                IndexInfo.AddIndex(pwcStem, cchProcessed, WEIGHT_HARD_MATCH, 0, cchProcessed-1);
                WB_LOG_ADD_INDEX(pwcStem, cchProcessed, INDEX_QUERY);

                if (!IndexInfo.PutQueryIndexList())
                    goto ErrorReturn;
            }
            else
            {
                if (!IndexInfo.PutFinalIndexList(pTextSource->awcBuffer + pTextSource->iCur))
                    goto ErrorReturn;
            }
        }
    }
    else if (Type == WT_HANGUEL)
    {
        // hangul input

        WB_LOG_START(pwcStem, cchProcessed);

        CIndexInfo IndexInfo;

        if (!IndexInfo.Initialize(cchProcessed, iCur, pWordSink, pPhraseSink))
            goto ErrorReturn;

        if (!AnalyzeString(&m_PI, m_fQuery, pwcStem, cchProcessed, iCur, &IndexInfo, *pwchLast))
            goto ErrorReturn;

        if (m_fQuery)
        {
            IndexInfo.AddIndex(pwcStem, cchProcessed, WEIGHT_HARD_MATCH, 0, cchProcessed-1);
            WB_LOG_ADD_INDEX(pwcStem, cchProcessed, INDEX_QUERY);

            if (!IndexInfo.PutQueryIndexList())
                goto ErrorReturn;
        }
        else
        {
            if (!IndexInfo.MakeSingleLengthMergedIndex())
                goto ErrorReturn;

            if (!IndexInfo.PutFinalIndexList(pwcStem))
                goto ErrorReturn;
        }

        // remember the last hangul character for the next token
        *pwchLast = *(pwcStem + cchProcessed - 1);
    }

    WB_LOG_PRINT_ALL();
    WB_LOG_END();

    return cchProcessed;

ErrorReturn:

    WB_LOG_END();

    return -1;
}
|
|
|
|
// CWordBreaker::AnalyzeRomaji
|
|
//
|
|
// helper function for romaji token wordbreaking
|
|
//
|
|
// Parameters:
|
|
// pwcStem -> (const WCHAR*) input token string
|
|
// cchStem -> (int) length of input romaji token
|
|
// iCur -> (int) source string position
|
|
// cchProcessed -> (int) input length to process
|
|
// cchHanguel -> (int) hangul token length (hanguel+romaji case only)
|
|
// pIndexInfo -> (CIndexInfo *) output index list
|
|
// pcchPrefix -> (int*) output prefix length
|
|
//
|
|
// Result:
|
|
// (void)
|
|
//
|
|
// 23NOV00 bhshin began
|
|
void CWordBreaker::AnalyzeRomaji(const WCHAR *pwcStem, int cchStem,
|
|
int iCur, int cchProcessed, int cchHanguel,
|
|
CIndexInfo *pIndexInfo, int *pcchPrefix)
|
|
{
|
|
int cchPrefix = 0;
|
|
|
|
// hanguel+romaji case
|
|
if (cchHanguel < cchProcessed)
|
|
{
|
|
// hanguel
|
|
if (cchHanguel > 0)
|
|
{
|
|
pIndexInfo->AddIndex(pwcStem, cchHanguel, WEIGHT_HARD_MATCH, 0, cchHanguel-1);
|
|
WB_LOG_ADD_INDEX(pwcStem, cchHanguel, INDEX_SYMBOL);
|
|
}
|
|
|
|
// romaji
|
|
if ((cchStem-cchHanguel) > 0)
|
|
{
|
|
pIndexInfo->AddIndex(pwcStem + cchHanguel, cchStem - cchHanguel, WEIGHT_HARD_MATCH, cchHanguel, cchStem-1);
|
|
WB_LOG_ADD_INDEX(pwcStem + cchHanguel, cchStem - cchHanguel, INDEX_SYMBOL);
|
|
}
|
|
}
|
|
|
|
if (cchHanguel == 1 || (cchStem-cchHanguel) == 1)
|
|
{
|
|
// romaji(hangul+romaji)
|
|
pIndexInfo->AddIndex(pwcStem, cchStem, WEIGHT_HARD_MATCH, 0, cchStem-1);
|
|
WB_LOG_ADD_INDEX(pwcStem, cchStem, INDEX_SYMBOL);
|
|
}
|
|
|
|
// check URL prefix
|
|
cchPrefix = CheckURLPrefix(pwcStem, cchProcessed);
|
|
if (cchPrefix > 0 && cchPrefix < cchProcessed)
|
|
{
|
|
pIndexInfo->AddIndex(pwcStem + cchPrefix, cchStem - cchPrefix, WEIGHT_HARD_MATCH, cchPrefix, cchStem-1);
|
|
WB_LOG_ADD_INDEX(pwcStem + cchPrefix, cchStem - cchPrefix, INDEX_SYMBOL);
|
|
}
|
|
|
|
*pcchPrefix = cchPrefix; // return it
|
|
}
|
|
|