mirror of https://github.com/tongzx/nt5src
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
138 lines
3.8 KiB
138 lines
3.8 KiB
#include "base.h"
|
|
#include "SpanishTokenizer.h"
|
|
#include "WbUtils.h"
|
|
|
|
// Global Spanish dictionary instance. It starts out empty and is created by
// the CSpanishTokenizer constructor the first time Get() returns NULL;
// OutputSimpleToken() calls BreakWord() on it.
CAutoClassPointer<CSpanishDict> g_apSpanishDict;
|
|
|
|
//
// Constructs a Spanish tokenizer. All arguments are forwarded unchanged to
// the CTokenizer base class.
//
// As a side effect, the first construction that finds g_apSpanishDict empty
// creates the global helpers g_apSpanishUtil and g_apSpanishDict. The
// dictionary is constructed from the path returned by
// CreateFilePath(L"SpanishDict.txt").
//
CSpanishTokenizer::CSpanishTokenizer(
    TEXT_SOURCE* pTxtSource,
    IWordSink * pWordSink,
    IPhraseSink * pPhraseSink,
    LCID lcid,
    BOOL bQueryTime,
    ULONG ulMaxTokenSize) :
    CTokenizer(pTxtSource, pWordSink, pPhraseSink, lcid, bQueryTime, ulMaxTokenSize)
{
    // Fast path: the dictionary already exists, nothing to initialize.
    if (g_apSpanishDict.Get() != NULL)
    {
        return;
    }

    // NOTE(review): CSyncMutexCatcher presumably holds m_csSpanishDictInit
    // for the rest of this scope - confirm against its declaration.
    CSyncMutexCatcher initGuard(m_csSpanishDictInit);

    // Check again now that the guard object has been constructed; the
    // dictionary may have been created in the meantime.
    if (g_apSpanishDict.Get() != NULL)
    {
        return;
    }

    CAutoArrayPointer<WCHAR> apwcsDictPath;
    apwcsDictPath = CreateFilePath(L"SpanishDict.txt");

    // The utility object is created before the dictionary.
    if (g_apSpanishUtil.Get() == NULL)
    {
        g_apSpanishUtil = new CSpanishUtil;
    }

    if (g_apSpanishDict.Get() == NULL)
    {
        g_apSpanishDict = new CSpanishDict(apwcsDictPath.Get());
    }
}
|
|
|
|
//
// Sends the token described by State to the word sink.
//
// State - supplies the token buffer (m_pwcsToken) and the [m_ulStart, m_ulEnd)
//         range of the token inside it.
// pTerm - clitics term; its ulOp selects the output mode and its ulLen is the
//         number of characters stripped for the truncate modes.
//
// Truncate modes (TAIL_MATCH_TRUNCATE / HEAD_MATCH_TRUNCATE):
//   nothing is emitted when stripping ulLen characters leaves an empty token;
//   otherwise the full token is emitted with PutAltWord and the stripped
//   token (tail or head removed) with PutWord.
//
// Any other mode:
//   tokens shorter than 32 characters are passed to g_apSpanishDict->BreakWord;
//   if it reports an alternate form, that form is emitted with PutAltWord.
//   The full token is then always emitted with PutWord.
//
// A failing HRESULT from the sink is raised via THROW_HRESULT_EXCEPTION.
//
void CSpanishTokenizer::OutputSimpleToken(
    CTokenState& State,
    const CCliticsTerm* pTerm)
{
    HRESULT hr;
    ULONG ulOffsetInTxtSourceBuffer =
        m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);

    // Length of the whole token; also used as the source-text span for every
    // sink call below.
    const ULONG ulTokenLen = State.m_ulEnd - State.m_ulStart;

    const bool bTailTruncate = (TAIL_MATCH_TRUNCATE == pTerm->ulOp);
    const bool bHeadTruncate = (HEAD_MATCH_TRUNCATE == pTerm->ulOp);

    if (bTailTruncate || bHeadTruncate)
    {
        // NOTE(review): assumes pTerm->ulLen never exceeds the token length;
        // otherwise this unsigned subtraction wraps - confirm with callers.
        const ULONG ulStemLen = ulTokenLen - pTerm->ulLen;
        if (0 == ulStemLen)
        {
            // Nothing would remain after truncation.
            return;
        }

        // The untouched token goes out as the alternate word.
        hr = m_apWordSink->PutAltWord(
                    ulTokenLen,
                    &State.m_pwcsToken[State.m_ulStart],
                    ulTokenLen,
                    ulOffsetInTxtSourceBuffer);
        if (FAILED(hr))
        {
            THROW_HRESULT_EXCEPTION(hr);
        }

        // Tail truncation keeps the beginning of the token; head truncation
        // skips the first ulLen characters.
        ULONG ulStemStart = State.m_ulStart;
        if (!bTailTruncate)
        {
            Assert(pTerm->ulOp == HEAD_MATCH_TRUNCATE);
            ulStemStart += pTerm->ulLen;
        }

        hr = m_apWordSink->PutWord(
                    ulStemLen,
                    &State.m_pwcsToken[ulStemStart],
                    ulTokenLen,
                    ulOffsetInTxtSourceBuffer);
        if (FAILED(hr))
        {
            THROW_HRESULT_EXCEPTION(hr);
        }

        return;
    }

    WCHAR pwcsAlt[32];
    ULONG ulAltLen;
    bool bHasAlt = false;

    // Only tokens shorter than the 32-character alternate buffer are looked
    // up in the dictionary.
    if (ulTokenLen < 32)
    {
        g_apSpanishDict->BreakWord(
                    ulTokenLen,
                    State.m_pwcsToken + State.m_ulStart,
                    &bHasAlt,
                    &ulAltLen,
                    pwcsAlt);
    }

    if (bHasAlt)
    {
        hr = m_apWordSink->PutAltWord(
                    ulAltLen,
                    pwcsAlt,
                    ulTokenLen,
                    ulOffsetInTxtSourceBuffer);
        if (FAILED(hr))
        {
            THROW_HRESULT_EXCEPTION(hr);
        }
    }

    // The token itself is always emitted last.
    hr = m_apWordSink->PutWord(
                ulTokenLen,
                &State.m_pwcsToken[State.m_ulStart],
                ulTokenLen,
                ulOffsetInTxtSourceBuffer);
    if (FAILED(hr))
    {
        THROW_HRESULT_EXCEPTION(hr);
    }
}
|
|
|
|
|