You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
3207 lines
86 KiB
3207 lines
86 KiB
////////////////////////////////////////////////////////////////////////////////
|
|
//
|
|
// Filename : Tokenizer.cpp
|
|
// Purpose : Tokenizer implementation
|
|
//
|
|
// Project : WordBreakers
|
|
// Component: English word breaker
|
|
//
|
|
// Author : yairh
|
|
//
|
|
// Log:
|
|
//
|
|
// Jan 06 2000 yairh creation
|
|
// Apr 04 2000 dovh on behalf of dlee - Fix CTokenizer::OutputClitics
|
|
// to avoid PutWord of length 0 (leads to multiple PutWord at
|
|
// same location (duplicate keys), and index corruption!
|
|
// Example: :...'s :...'s (. stands for junk character)
|
|
// Apr 05 2000 dovh - Fixed two problematic debug / tracer buffer size
|
|
// problems. (Related to Bug 15449).
|
|
// May 07 2000 dovh - USE_WS_SENTINEL algorithm in BreakText
|
|
// May 11 2000 dovh - Simplify VerifyMisc test.
|
|
// Nov 11 2000 dovh - Special underscore treatment
|
|
// Add AddBackUnderscores '_' + alphanumeric treatment.
|
|
//
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
#include "base.h"
|
|
#include "Tokenizer.h"
|
|
#include "PropArray.h"
|
|
#include "excption.h"
|
|
#include "formats.h"
|
|
|
|
DECLARE_TRIE_SENTINEL;
|
|
CWbToUpper g_WbToUpper;
|
|
|
|
CAutoClassPointer<CPropArray> g_pPropArray;
|
|
|
|
CTokenizer::CTokenizer(
|
|
TEXT_SOURCE* pTxtSource,
|
|
IWordSink * pWordSink,
|
|
IPhraseSink * pPhraseSink,
|
|
LCID lcid,
|
|
BOOL bQueryTime,
|
|
ULONG ulMaxTokenSize) :
|
|
m_pTxtSource(pTxtSource),
|
|
m_apWordSink(pWordSink),
|
|
m_apPhraseSink(pPhraseSink),
|
|
m_Lcid(lcid),
|
|
m_bQueryTime(bQueryTime),
|
|
m_bNoMoreTxt(false),
|
|
m_Token(ulMaxTokenSize),
|
|
m_bWhiteSpaceGuarranteed(false)
|
|
{
|
|
m_ulMaxTokenSize = min(ulMaxTokenSize, TOKENIZER_MAXBUFFERLIMIT);
|
|
|
|
m_apLangSupport = new CLangSupport(lcid);
|
|
|
|
m_pCurToken = &m_Token;
|
|
|
|
if (pTxtSource->iEnd > pTxtSource->iCur)
|
|
{
|
|
CalculateUpdateEndOfBuffer();
|
|
}
|
|
else
|
|
{
|
|
m_ulUpdatedEndOfBuffer = pTxtSource->iEnd;
|
|
}
|
|
}
|
|
|
|
|
|
void CTokenizer::BreakText()
|
|
{
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizer,
|
|
("CTokenizer::BreakText()"));
|
|
|
|
|
|
WCHAR wch;
|
|
ULONGLONG ullflags(PROP_DEFAULT);
|
|
|
|
//
|
|
// USE_WS_SENTINEL Algorithm:
|
|
//
|
|
|
|
HRESULT hr = S_OK;
|
|
|
|
if (m_pTxtSource->iCur >= m_ulUpdatedEndOfBuffer)
|
|
{
|
|
|
|
hr = FillBuffer();
|
|
|
|
}
|
|
|
|
while ( SUCCEEDED(hr) )
|
|
{
|
|
if ( m_bWhiteSpaceGuarranteed )
|
|
{
|
|
while (true)
|
|
{
|
|
wch = m_pTxtSource->awcBuffer[m_pTxtSource->iCur];
|
|
|
|
ullflags = (GET_PROP(wch).m_ulFlag);
|
|
|
|
if (ullflags & PROP_WS)
|
|
{
|
|
if (m_pCurToken->IsNotEmpty())
|
|
{
|
|
ProcessToken();
|
|
}
|
|
m_pTxtSource->iCur++;
|
|
|
|
if (m_pTxtSource->iCur >= m_ulUpdatedEndOfBuffer)
|
|
{
|
|
hr = FillBuffer();
|
|
break;
|
|
|
|
}
|
|
continue;
|
|
|
|
}
|
|
|
|
//
|
|
// The following lines are inline expenstion of what
|
|
// used to be CToken::RecordChar:
|
|
//
|
|
|
|
Assert(m_pCurToken->m_ulBufPos < m_ulMaxTokenSize);
|
|
m_pCurToken->m_awchBuf[m_pCurToken->m_ulBufPos] = wch;
|
|
m_pCurToken->m_ulBufPos++;
|
|
m_pCurToken->m_State.m_Properties.m_ulFlag |= ullflags;
|
|
m_pTxtSource->iCur++;
|
|
|
|
} // while
|
|
}
|
|
else
|
|
{
|
|
while (true)
|
|
{
|
|
if (m_pTxtSource->iCur >= m_ulUpdatedEndOfBuffer)
|
|
{
|
|
Assert(m_pTxtSource->iCur == m_ulUpdatedEndOfBuffer);
|
|
|
|
//
|
|
// before we switch between buffers if the current token is not empty we
|
|
// need to proccess it. m_ulUpdatedEndOfBuffer always points to a breaker character
|
|
// (usually it is a WS) thus no token can start at a certain buffer and end in the
|
|
// proceeding buffer.
|
|
//
|
|
|
|
if (m_pCurToken->IsNotEmpty())
|
|
{
|
|
ProcessToken();
|
|
}
|
|
|
|
hr = FillBuffer();
|
|
if (FAILED(hr))
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
|
|
wch = m_pTxtSource->awcBuffer[m_pTxtSource->iCur];
|
|
|
|
ULONGLONG ullflags(GET_PROP(wch).m_ulFlag);
|
|
|
|
if (ullflags & PROP_WS)
|
|
{
|
|
if (m_pCurToken->IsNotEmpty())
|
|
{
|
|
ProcessToken();
|
|
}
|
|
m_pTxtSource->iCur++;
|
|
continue;
|
|
}
|
|
|
|
//
|
|
// the following lines are inline expenstion of what used to be CToken::RecordChar.
|
|
//
|
|
|
|
Assert(m_pCurToken->m_ulBufPos < m_ulMaxTokenSize);
|
|
m_pCurToken->m_awchBuf[m_pCurToken->m_ulBufPos] = wch;
|
|
m_pCurToken->m_ulBufPos++;
|
|
m_pCurToken->m_State.m_Properties.m_ulFlag |= ullflags;
|
|
m_pTxtSource->iCur++;
|
|
|
|
} // while
|
|
|
|
} // if
|
|
|
|
} // while ( !FAILED(hr) )
|
|
|
|
} // CTokenizer::BreakText
|
|
|
|
void CTokenizer::ProcessToken()
|
|
{
|
|
ULONG ulOffset;
|
|
|
|
if (m_pTxtSource->iCur < m_pCurToken->m_ulBufPos)
|
|
{
|
|
Trace(
|
|
elWarning,
|
|
s_tagTokenizer,
|
|
("CTokenizer::ProcessToken() wrong offset calculation"));
|
|
|
|
//
|
|
// BUGBUG need to understand why we got to this place.
|
|
//
|
|
Assert(0 && "Wrong offset calculation");
|
|
|
|
ulOffset = m_pCurToken->m_ulBufPos + 1;
|
|
}
|
|
else if (m_pTxtSource->iCur == m_pCurToken->m_ulBufPos)
|
|
{
|
|
ulOffset = m_pCurToken->m_ulBufPos;
|
|
}
|
|
else
|
|
{
|
|
ulOffset = m_pTxtSource->iCur;
|
|
}
|
|
|
|
m_pCurToken->MarkEndToken(ulOffset);
|
|
#ifdef DEBUG
|
|
TraceToken();
|
|
#endif
|
|
|
|
//
|
|
// simple token.
|
|
//
|
|
if (IS_PROP_SIMPLE(m_pCurToken->m_State.m_Properties))
|
|
{
|
|
OutputSimpleToken(
|
|
m_pCurToken->m_State,
|
|
&g_EmptyClitics);
|
|
|
|
}
|
|
else
|
|
{
|
|
ProcessTokenInternal();
|
|
}
|
|
|
|
if (m_pCurToken->m_fHasEos)
|
|
{
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerDecision,
|
|
("EOS"));
|
|
|
|
HRESULT hr;
|
|
hr = m_apWordSink->PutBreak(WORDREP_BREAK_EOS);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
|
|
m_pCurToken->Clear();
|
|
}
|
|
|
|
void CTokenizer::ProcessTokenInternal()
|
|
{
|
|
|
|
do
|
|
{
|
|
|
|
//
|
|
// url
|
|
//
|
|
|
|
if (HAS_PROP_SLASH(m_pCurToken->m_State.m_Properties) &&
|
|
HAS_PROP_COLON(m_pCurToken->m_State.m_Properties) &&
|
|
HAS_PROP_ALPHA(m_pCurToken->m_State.m_Properties))
|
|
{
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerSuspect,
|
|
("%*.*S suspected to be <alpha>:// url",
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
|
|
));
|
|
|
|
if (VerifyAlphaUrl())
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (HAS_PROP_PERIOD(m_pCurToken->m_State.m_Properties) &&
|
|
HAS_PROP_W(m_pCurToken->m_State.m_Properties))
|
|
{
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerSuspect,
|
|
("%*.*S suspected to be www. url",
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
|
|
));
|
|
|
|
if (VerifyWwwUrl())
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
|
|
|
|
//
|
|
// Acronym
|
|
//
|
|
|
|
if (HAS_PROP_PERIOD(m_pCurToken->m_State.m_Properties) &&
|
|
HAS_PROP_UPPER_CASE(m_pCurToken->m_State.m_Properties))
|
|
{
|
|
if (!HAS_PROP_LOWER_CASE(m_pCurToken->m_State.m_Properties) ||
|
|
HAS_PROP_APOSTROPHE(m_pCurToken->m_State.m_Properties))
|
|
{
|
|
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerSuspect,
|
|
("%*.*S suspected to be an acronym",
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
|
|
));
|
|
|
|
if (VerifyAcronym())
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
|
|
//
|
|
// Abbreviation
|
|
//
|
|
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerSuspect,
|
|
("%*.*S suspected to be an abbreviation",
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
|
|
));
|
|
|
|
|
|
if (VerifyAbbreviation())
|
|
{
|
|
break;
|
|
}
|
|
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerSuspect,
|
|
("%*.*S suspected to be a special abbreviation",
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
|
|
));
|
|
|
|
if (VerifySpecialAbbreviation())
|
|
{
|
|
break;
|
|
}
|
|
|
|
}
|
|
|
|
//
|
|
// Hyphenation
|
|
//
|
|
if (HAS_PROP_DASH(m_pCurToken->m_State.m_Properties) &&
|
|
HAS_PROP_ALPHA(m_pCurToken->m_State.m_Properties))
|
|
{
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerSuspect,
|
|
("%*.*S suspected to have a hyphenation",
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
|
|
));
|
|
|
|
if (VerifyHyphenation())
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
|
|
//
|
|
// (s) parenthesis
|
|
//
|
|
|
|
if (HAS_PROP_LEFT_PAREN(m_pCurToken->m_State.m_Properties) &&
|
|
HAS_PROP_RIGHT_PAREN(m_pCurToken->m_State.m_Properties) &&
|
|
HAS_PROP_ALPHA(m_pCurToken->m_State.m_Properties))
|
|
{
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerSuspect,
|
|
("%*.*S suspected to have a (s) Parenthesis",
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
|
|
));
|
|
|
|
if (VerifyParens())
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
|
|
|
|
//
|
|
// Currency
|
|
//
|
|
if (HAS_PROP_CURRENCY(m_pCurToken->m_State.m_Properties) &&
|
|
HAS_PROP_NUMBER(m_pCurToken->m_State.m_Properties))
|
|
{
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerSuspect,
|
|
("%*.*S suspected to be a currency",
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
|
|
));
|
|
|
|
if (VerifyCurrency())
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
|
|
//
|
|
// Numbers / time / dates
|
|
//
|
|
|
|
if (HAS_PROP_NUMBER(m_pCurToken->m_State.m_Properties))
|
|
{
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerSuspect,
|
|
("%*.*S suspected to be a number or a time or a date",
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
|
|
));
|
|
|
|
if (VerifyNumberOrTimeOrDate())
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
|
|
//
|
|
// commersial signs
|
|
//
|
|
if (TEST_PROP(m_pCurToken->m_State.m_Properties, PROP_COMMERSIAL_SIGN) &&
|
|
HAS_PROP_ALPHA(m_pCurToken->m_State.m_Properties))
|
|
{
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerSuspect,
|
|
("%*.*S suspected to have a commesial sign",
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
|
|
));
|
|
|
|
if (VerifyCommersialSign())
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
|
|
//
|
|
// Misc - C++, J++, A+, A- .. C#
|
|
//
|
|
|
|
if ( TEST_PROP(m_pCurToken->m_State.m_Properties, (PROP_MINUS|PROP_PLUS|PROP_POUND)) &&
|
|
HAS_PROP_ALPHA(m_pCurToken->m_State.m_Properties) )
|
|
{
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerSuspect,
|
|
("%*.*S suspected to belong to the misc list",
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
|
|
));
|
|
|
|
|
|
if (VerifyMisc())
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
|
|
//
|
|
// default
|
|
//
|
|
|
|
ProcessDefault();
|
|
|
|
} while (false);
|
|
|
|
}
|
|
|
|
#ifdef DEBUG
|
|
void CTokenizer::TraceToken()
|
|
{
|
|
WCHAR buf[MAX_NUM_PROP+1];
|
|
|
|
size_t bufLen = wcslen(TRACE_CHAR);
|
|
Assert(bufLen < MAX_NUM_PROP + 1);
|
|
buf[bufLen] = L'\0';
|
|
|
|
for(int i=0; i<bufLen; i++)
|
|
{
|
|
if(TEST_PROP(m_pCurToken->m_State.m_Properties, (1<<i)))
|
|
{
|
|
buf[i] = TRACE_CHAR[i];
|
|
}
|
|
else
|
|
{
|
|
buf[i] = L'_';
|
|
}
|
|
}
|
|
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerTrace,
|
|
("[%S] - %*.*S",
|
|
buf,
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
|
|
));
|
|
|
|
|
|
}
|
|
#endif // DEBUG
|
|
|
|
bool CTokenizer::VerifyAlphaUrl()
|
|
{
|
|
//
|
|
// looking for <alpha>:// pattern
|
|
//
|
|
|
|
CTokenState State(m_pCurToken->m_State);
|
|
|
|
ULONG ul = State.m_ulStart;
|
|
|
|
if (!HAS_PROP_ALPHA(GET_PROP(State.m_pwcsToken[ul])))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
while (HAS_PROP_EXTENDED_ALPHA(GET_PROP(State.m_pwcsToken[ul])))
|
|
{
|
|
ul++;
|
|
}
|
|
|
|
if (!(HAS_PROP_COLON(GET_PROP(State.m_pwcsToken[ul]))))
|
|
{
|
|
return false;
|
|
}
|
|
ul++;
|
|
|
|
if (!(HAS_PROP_SLASH(GET_PROP(State.m_pwcsToken[ul]))))
|
|
{
|
|
return false;
|
|
}
|
|
ul++;
|
|
|
|
if (!(HAS_PROP_SLASH(GET_PROP(State.m_pwcsToken[ul]))))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
{
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerDecision,
|
|
("%*.*S is an <alpha>:// url",
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_pwcsToken + State.m_ulStart
|
|
));
|
|
|
|
}
|
|
|
|
OutputUrl(State);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool CTokenizer::VerifyWwwUrl()
|
|
{
|
|
CTokenState State(m_pCurToken->m_State);
|
|
|
|
if (State.m_ulEnd - State.m_ulStart <= 4)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
if (0 != _wcsnicmp(State.m_pwcsToken + State.m_ulStart, L"www.", 4))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerDecision,
|
|
("%*.*S is a www. url",
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_pwcsToken + State.m_ulStart
|
|
));
|
|
|
|
OutputUrl(State);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool CTokenizer::VerifyAcronym()
|
|
{
|
|
//
|
|
// looking for I.B.M or I.B.M. or A.B.CC but not A.B.CC.
|
|
//
|
|
|
|
CTokenState State(m_pCurToken->m_State);
|
|
|
|
CPropFlag AbbPuctTail(ACRONYM_PUNCT_TAIL);
|
|
CPropFlag AbbPuctHead(ACRONYM_PUNCT_HEAD);
|
|
bool fNeedToRemoveEos = true;
|
|
|
|
if (TEST_PROP(State.m_Properties, (ACRONYM_PUNCT_TAIL | ACRONYM_PUNCT_HEAD)))
|
|
{
|
|
if (TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulEnd- 1]), ABBREVIATION_EOS))
|
|
{
|
|
fNeedToRemoveEos = false;
|
|
}
|
|
|
|
ULONG ulCharRemoved = m_pCurToken->RemoveTailPunct(AbbPuctTail, State);
|
|
ulCharRemoved += m_pCurToken->RemoveHeadPunct(AbbPuctHead, State);
|
|
if (ulCharRemoved)
|
|
{
|
|
m_pCurToken->ComputeStateProperties(State);
|
|
}
|
|
}
|
|
|
|
const CCliticsTerm* pCliticsTerm;
|
|
pCliticsTerm = VerifyClitics(State);
|
|
|
|
ULONG ulEnd = State.m_ulEnd;
|
|
ULONG ulCur = State.m_ulStart;
|
|
|
|
if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
|
|
{
|
|
ulCur += pCliticsTerm->ulLen;
|
|
}
|
|
else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
|
|
{
|
|
ulEnd -= pCliticsTerm->ulLen;
|
|
}
|
|
|
|
//
|
|
// finding the last period
|
|
//
|
|
while ((ulEnd > ulCur) &&
|
|
HAS_PROP_UPPER_CASE(GET_PROP(State.m_pwcsToken[ulEnd- 1])))
|
|
{
|
|
ulEnd--;
|
|
}
|
|
|
|
if ((ulEnd == ulCur) ||
|
|
!HAS_PROP_PERIOD(GET_PROP(State.m_pwcsToken[ulEnd- 1])))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
ULONG ulCounter = 0;
|
|
|
|
while (ulCur < ulEnd)
|
|
{
|
|
if (ulCounter%2 == 0)
|
|
{
|
|
if (!HAS_PROP_UPPER_CASE(GET_PROP(State.m_pwcsToken[ulCur])))
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (!HAS_PROP_PERIOD(GET_PROP(State.m_pwcsToken[ulCur])))
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
ulCur++;
|
|
ulCounter++;
|
|
}
|
|
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerDecision,
|
|
("%*.*S is an acronym",
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_pwcsToken + State.m_ulStart
|
|
));
|
|
|
|
if (fNeedToRemoveEos && (pCliticsTerm->ulOp != TAIL_MATCH_TRUNCATE))
|
|
{
|
|
m_pCurToken->m_fHasEos = false;
|
|
}
|
|
OutputAcronym(State, pCliticsTerm);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool CTokenizer::VerifyAbbreviation()
|
|
{
|
|
//
|
|
// looking for Sr. Jr.
|
|
// we define abbreviation as a pattern with 2 letters ending with a dot and the first letter
|
|
// is a capital one
|
|
//
|
|
|
|
CTokenState State(m_pCurToken->m_State);
|
|
CPropFlag AbbPuctTail(ABBREVIATION_PUNCT_TAIL);
|
|
CPropFlag AbbPuctHead(ABBREVIATION_PUNCT_HEAD);
|
|
bool fNeedToRemoveEos = true;
|
|
|
|
if (TEST_PROP(State.m_Properties, (ABBREVIATION_PUNCT_TAIL | ABBREVIATION_PUNCT_HEAD)))
|
|
{
|
|
if (TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulEnd- 1]), ABBREVIATION_EOS))
|
|
{
|
|
fNeedToRemoveEos = false;
|
|
}
|
|
|
|
ULONG ulCharRemoved = m_pCurToken->RemoveTailPunct(AbbPuctTail, State);
|
|
ulCharRemoved += m_pCurToken->RemoveHeadPunct(AbbPuctHead, State);
|
|
if (ulCharRemoved)
|
|
{
|
|
m_pCurToken->ComputeStateProperties(State);
|
|
}
|
|
}
|
|
|
|
if ((State.m_ulEnd - State.m_ulStart) != 3)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
if (!HAS_PROP_UPPER_CASE(GET_PROP(State.m_pwcsToken[State.m_ulStart])))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
if (!HAS_PROP_EXTENDED_ALPHA(GET_PROP(State.m_pwcsToken[State.m_ulStart + 1])))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
if (!HAS_PROP_PERIOD(GET_PROP(State.m_pwcsToken[State.m_ulStart + 2])))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerDecision,
|
|
("%*.*S is an abbreviation",
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_pwcsToken + State.m_ulStart
|
|
));
|
|
|
|
|
|
if (fNeedToRemoveEos)
|
|
{
|
|
m_pCurToken->m_fHasEos = false;
|
|
}
|
|
|
|
OutputAbbreviation(State);
|
|
return true;
|
|
|
|
}
|
|
|
|
bool CTokenizer::VerifySpecialAbbreviation()
|
|
{
|
|
CTokenState State(m_pCurToken->m_State);
|
|
CPropFlag AbbPuctTail(SPECIAL_ABBREVIATION_PUNCT_TAIL);
|
|
CPropFlag AbbPuctHead(SPECIAL_ABBREVIATION_PUNCT_HEAD);
|
|
|
|
if (TEST_PROP(State.m_Properties, (SPECIAL_ABBREVIATION_PUNCT_TAIL | SPECIAL_ABBREVIATION_PUNCT_HEAD)))
|
|
{
|
|
ULONG ulCharRemoved = m_pCurToken->RemoveTailPunct(AbbPuctTail, State);
|
|
ulCharRemoved += m_pCurToken->RemoveHeadPunct(AbbPuctHead, State);
|
|
if (ulCharRemoved)
|
|
{
|
|
m_pCurToken->ComputeStateProperties(State);
|
|
}
|
|
|
|
if (!HAS_PROP_PERIOD(State.m_Properties))
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
|
|
const CCliticsTerm* pCliticsTerm;
|
|
pCliticsTerm = VerifyClitics(State);
|
|
|
|
ULONG ulAddToStart = 0;
|
|
ULONG ulDecFromEnd = 0;
|
|
|
|
if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
|
|
{
|
|
ulAddToStart = pCliticsTerm->ulLen;
|
|
}
|
|
else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
|
|
{
|
|
ulDecFromEnd = pCliticsTerm->ulLen;
|
|
}
|
|
|
|
|
|
CAbbTerm* pTerm;
|
|
short sResCount = 0;
|
|
DictStatus status;
|
|
|
|
CSpecialAbbreviationSet* pAbbSet = m_apLangSupport->GetAbbSet();
|
|
status = pAbbSet->m_trieAbb.trie_Find(
|
|
State.m_pwcsToken + State.m_ulStart + ulAddToStart,
|
|
TRIE_LONGEST_MATCH | TRIE_IGNORECASE,
|
|
1,
|
|
&pTerm,
|
|
&sResCount);
|
|
|
|
if (sResCount &&
|
|
(pTerm->ulAbbLen == (State.m_ulEnd - State.m_ulStart - ulAddToStart - ulDecFromEnd)))
|
|
{
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerDecision,
|
|
("%*.*S is an abbreviation",
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_pwcsToken + State.m_ulStart
|
|
));
|
|
|
|
OutputSpecialAbbreviation(State, pTerm, pCliticsTerm);
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool CTokenizer::VerifyMisc()
|
|
{
|
|
CTokenState State(m_pCurToken->m_State);
|
|
CPropFlag MiscPuctTail(MISC_PUNCT_TAIL);
|
|
CPropFlag MiscPuctHead(MISC_PUNCT_HEAD);
|
|
|
|
if (TEST_PROP(State.m_Properties, (MISC_PUNCT_TAIL | MISC_PUNCT_HEAD)))
|
|
{
|
|
ULONG ulCharRemoved = m_pCurToken->RemoveTailPunct(MiscPuctTail, State);
|
|
ulCharRemoved += m_pCurToken->RemoveHeadPunct(MiscPuctHead, State);
|
|
if (ulCharRemoved)
|
|
{
|
|
m_pCurToken->ComputeStateProperties(State);
|
|
}
|
|
}
|
|
|
|
const CCliticsTerm* pCliticsTerm;
|
|
pCliticsTerm = VerifyClitics(State);
|
|
|
|
ULONG ulAddToStart = 0;
|
|
ULONG ulDecFromEnd = 0;
|
|
|
|
if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
|
|
{
|
|
ulAddToStart = pCliticsTerm->ulLen;
|
|
}
|
|
else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
|
|
{
|
|
ulDecFromEnd = pCliticsTerm->ulLen;
|
|
}
|
|
|
|
int iEnd = State.m_ulEnd - ulDecFromEnd;
|
|
int iStart = State.m_ulStart + ulAddToStart;
|
|
if (iEnd <= iStart)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
bool bPatternContainOnlyUpperCase = true;
|
|
ULONG ulSuffixSize = 0;
|
|
|
|
if (TEST_PROP(State.m_Properties, PROP_POUND))
|
|
{
|
|
//
|
|
// look for A# C#
|
|
//
|
|
|
|
ULONG ulEnd = State.m_ulEnd - ulDecFromEnd;
|
|
ULONG ulStart = State.m_ulStart + ulAddToStart;
|
|
if (ulEnd - ulStart != 2)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
if (!TEST_PROP(GET_PROP(State.m_pwcsToken[ulEnd - 1]), PROP_POUND))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
if (!TEST_PROP(GET_PROP(State.m_pwcsToken[ulStart]), PROP_UPPER_CASE))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
ulSuffixSize = 1;
|
|
}
|
|
else
|
|
{
|
|
//
|
|
// look for C++ COM+ ...
|
|
//
|
|
|
|
ULONG ul = State.m_ulEnd - ulDecFromEnd - 1;
|
|
while ((int)ul >= (int)(State.m_ulStart + ulAddToStart))
|
|
{
|
|
if (!TEST_PROP(GET_PROP(State.m_pwcsToken[ul]), PROP_PLUS | PROP_MINUS))
|
|
{
|
|
break;
|
|
}
|
|
ulSuffixSize++;
|
|
ul--;
|
|
}
|
|
|
|
if (ulSuffixSize > 2)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
while ((int)ul >= (int)(State.m_ulStart + ulAddToStart))
|
|
{
|
|
CPropFlag prop(GET_PROP(State.m_pwcsToken[ul]));
|
|
if (!HAS_PROP_EXTENDED_ALPHA(prop))
|
|
{
|
|
return false;
|
|
}
|
|
if (!TEST_PROP(prop, PROP_UPPER_CASE))
|
|
{
|
|
bPatternContainOnlyUpperCase = false;
|
|
}
|
|
|
|
ul--;
|
|
}
|
|
}
|
|
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerDecision,
|
|
("%*.*S is detected",
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_pwcsToken + State.m_ulStart
|
|
));
|
|
|
|
OutputMisc(
|
|
State,
|
|
bPatternContainOnlyUpperCase,
|
|
ulSuffixSize,
|
|
pCliticsTerm);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
bool CTokenizer::VerifyHyphenation()
|
|
{
|
|
//
|
|
// looking for data-base
|
|
//
|
|
|
|
CPropFlag PunctHead(HYPHENATION_PUNCT_HEAD);
|
|
CPropFlag PunctTail(HYPHENATION_PUNCT_TAIL);
|
|
CTokenState State(m_pCurToken->m_State);
|
|
|
|
if (TEST_PROP(State.m_Properties, (HYPHENATION_PUNCT_HEAD | HYPHENATION_PUNCT_TAIL)))
|
|
{
|
|
ULONG ulCharRemoved;
|
|
ulCharRemoved = m_pCurToken->RemoveHeadPunct(PunctHead, State);
|
|
ulCharRemoved += m_pCurToken->RemoveTailPunct(PunctTail, State);
|
|
if (ulCharRemoved)
|
|
{
|
|
m_pCurToken->ComputeStateProperties(State);
|
|
}
|
|
}
|
|
|
|
if (!HAS_PROP_DASH(State.m_Properties))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
const CCliticsTerm* pCliticsTerm;
|
|
pCliticsTerm = VerifyClitics(State);
|
|
|
|
ULONG ulAddToStart = 0;
|
|
ULONG ulDecFromEnd = 0;
|
|
|
|
if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
|
|
{
|
|
ulAddToStart = pCliticsTerm->ulLen;
|
|
}
|
|
else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
|
|
{
|
|
ulDecFromEnd = pCliticsTerm->ulLen;
|
|
}
|
|
|
|
|
|
ULONG ulCur = State.m_ulStart + ulAddToStart;
|
|
ULONG ulEnd = State.m_ulEnd - ulDecFromEnd;
|
|
|
|
bool bReadAlpha = false;
|
|
|
|
do
|
|
{
|
|
while (ulCur < ulEnd)
|
|
{
|
|
if (HAS_PROP_EXTENDED_ALPHA(GET_PROP(m_pCurToken->m_State.m_pwcsToken[ulCur])))
|
|
{
|
|
ulCur++;
|
|
bReadAlpha = true;
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
|
|
if (!bReadAlpha)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
if (ulCur < ulEnd)
|
|
{
|
|
if (!HAS_PROP_DASH(GET_PROP(m_pCurToken->m_State.m_pwcsToken[ulCur])))
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
break;
|
|
}
|
|
|
|
ulCur++;
|
|
bReadAlpha = false;
|
|
}
|
|
while (ulCur < ulEnd);
|
|
|
|
if (!bReadAlpha)
|
|
{
|
|
//
|
|
// last characters where not alpha ex. free-
|
|
//
|
|
return false;
|
|
}
|
|
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerDecision,
|
|
("%*.*S is an hyphenation",
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_pwcsToken + State.m_ulStart
|
|
));
|
|
|
|
OutputHyphenation(State, pCliticsTerm);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool CTokenizer::VerifyParens()
|
|
{
|
|
CPropFlag PunctTail(PAREN_PUNCT_TAIL);
|
|
CPropFlag PunctHead(PAREN_PUNCT_HEAD);
|
|
|
|
CTokenState State(m_pCurToken->m_State);
|
|
|
|
if (TEST_PROP(State.m_Properties, (PAREN_PUNCT_TAIL | PAREN_PUNCT_HEAD)))
|
|
{
|
|
ULONG ulCharRemoved;
|
|
ulCharRemoved = m_pCurToken->RemoveTailPunct(PunctTail, State);
|
|
ulCharRemoved += m_pCurToken->RemoveHeadPunct(PunctHead, State);
|
|
if (ulCharRemoved)
|
|
{
|
|
m_pCurToken->ComputeStateProperties(State);
|
|
}
|
|
}
|
|
|
|
//
|
|
// looking for (s)
|
|
//
|
|
|
|
if ((State.m_ulEnd - State.m_ulStart) < 4)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
if (0 != wcsncmp(State.m_pwcsToken + State.m_ulEnd - 3, L"(s)", 3))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
for (ULONG ul = State.m_ulStart; ul < State.m_ulEnd - 3; ul++)
|
|
{
|
|
if (!HAS_PROP_EXTENDED_ALPHA(GET_PROP(State.m_pwcsToken[ul])))
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerDecision,
|
|
("%*.*S has (s) parenthesis",
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_pwcsToken + State.m_ulStart
|
|
));
|
|
|
|
OutputParens(State);
|
|
|
|
return true;
|
|
}
|
|
|
|
const CCliticsTerm* CTokenizer::VerifyClitics(CTokenState& S)
|
|
{
|
|
if (TEST_PROP(GET_PROP(S.m_pwcsToken[S.m_ulStart]), PROP_APOSTROPHE))
|
|
{
|
|
S.m_ulStart++;
|
|
|
|
if ((TEST_PROP(GET_PROP(S.m_pwcsToken[S.m_ulEnd - 1]), PROP_APOSTROPHE)) &&
|
|
(S.m_ulEnd > S.m_ulStart))
|
|
{
|
|
S.m_ulEnd--;
|
|
}
|
|
|
|
m_pCurToken->ComputeStateProperties(S);
|
|
}
|
|
|
|
if (!(HAS_PROP_APOSTROPHE(S.m_Properties)))
|
|
{
|
|
return &g_EmptyClitics;
|
|
}
|
|
|
|
CPropFlag PunctTail(CLITICS_PUNC_TAIL);
|
|
CPropFlag PunctHead(CLITICS_PUNCT_HEAD);
|
|
|
|
CTokenState State(S);
|
|
|
|
if (TEST_PROP(State.m_Properties, (CLITICS_PUNC_TAIL | CLITICS_PUNCT_HEAD)))
|
|
{
|
|
ULONG ulCharRemoved;
|
|
ulCharRemoved = m_pCurToken->RemoveTailPunct(PunctTail, State);
|
|
ulCharRemoved += m_pCurToken->RemoveHeadPunct(PunctHead, State);
|
|
if (ulCharRemoved)
|
|
{
|
|
m_pCurToken->ComputeStateProperties(State);
|
|
}
|
|
}
|
|
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerSuspect,
|
|
("%*.*S suspected to have an apostophe",
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_pwcsToken + State.m_ulStart
|
|
));
|
|
|
|
|
|
|
|
ULONG ulApostrophePos = -1;
|
|
ULONG ulCur;
|
|
for (ulCur = State.m_ulStart; ulCur < State.m_ulEnd ; ulCur++)
|
|
{
|
|
|
|
if (TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur]), PROP_APOSTROPHE))
|
|
{
|
|
if ((-1 != ulApostrophePos) || (State.m_ulStart == ulCur))
|
|
{
|
|
//
|
|
// this is not the first \' this is not a valid clitics
|
|
// or the term start with a new apostrophe
|
|
//
|
|
return &g_EmptyClitics;
|
|
}
|
|
ulApostrophePos = ulCur;
|
|
//
|
|
// replace the apostrophe with an ascii apostrophe.
|
|
//
|
|
State.m_pwcsToken[ulCur] = L'\'';
|
|
continue;
|
|
}
|
|
}
|
|
|
|
//
|
|
// looking for xxxxs'
|
|
//
|
|
if ((ulApostrophePos == State.m_ulEnd - 1) &&
|
|
(State.m_pwcsToken[ulApostrophePos - 1] == L's'))
|
|
{
|
|
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerDecision,
|
|
("%*.*S has a s' clitcs",
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_pwcsToken + State.m_ulStart
|
|
));
|
|
|
|
S = State;
|
|
return &g_SClitics;
|
|
|
|
}
|
|
|
|
//
|
|
// looking for tail clitics like xxx's
|
|
//
|
|
|
|
DictStatus status;
|
|
|
|
CCliticsTerm* pTerm;
|
|
short sResCount = 0;
|
|
|
|
if (ulCur > State.m_ulStart)
|
|
{
|
|
status = g_pClitics->m_trieClitics.trie_Find(
|
|
State.m_pwcsToken + ulApostrophePos,
|
|
TRIE_LONGEST_MATCH | TRIE_IGNORECASE,
|
|
1,
|
|
&pTerm,
|
|
&sResCount);
|
|
if (sResCount && pTerm->ulLen == (State.m_ulEnd - ulApostrophePos))
|
|
{
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerDecision,
|
|
("%*.*S has a %S clitcs",
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_pwcsToken + State.m_ulStart,
|
|
pTerm->pwcs
|
|
));
|
|
|
|
S = State;
|
|
return pTerm;
|
|
}
|
|
}
|
|
|
|
//
|
|
// looking for head clitics like l'xxxx
|
|
//
|
|
|
|
status = g_pClitics->m_trieClitics.trie_Find(
|
|
State.m_pwcsToken + State.m_ulStart,
|
|
TRIE_LONGEST_MATCH | TRIE_IGNORECASE,
|
|
1,
|
|
&pTerm,
|
|
&sResCount);
|
|
if (sResCount)
|
|
{
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerDecision,
|
|
("%*.*S has a %S clitcs",
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_pwcsToken + State.m_ulStart,
|
|
pTerm->pwcs
|
|
));
|
|
|
|
S = State;
|
|
return pTerm;
|
|
}
|
|
|
|
return &g_EmptyClitics;
|
|
}
|
|
|
|
bool CTokenizer::VerifyNumberOrTimeOrDate()
|
|
{
|
|
CPropFlag PunctHead(NUM_DATE_TIME_PUNCT_HEAD);
|
|
CPropFlag PunctTail(NUM_DATE_TIME_PUNCT_TAIL);
|
|
CTokenState State(m_pCurToken->m_State);
|
|
|
|
if (TEST_PROP(State.m_Properties,
|
|
(NUM_DATE_TIME_PUNCT_HEAD | NUM_DATE_TIME_PUNCT_TAIL)))
|
|
{
|
|
ULONG ulCharRemoved;
|
|
ulCharRemoved= m_pCurToken->RemoveHeadPunct(PunctHead, State);
|
|
ulCharRemoved += m_pCurToken->RemoveTailPunct(PunctTail, State);
|
|
if (ulCharRemoved)
|
|
{
|
|
m_pCurToken->ComputeStateProperties(State);
|
|
}
|
|
}
|
|
|
|
if ((TEST_PROP(
|
|
State.m_Properties,
|
|
(GET_PROP(m_apLangSupport->GetTimeSeperator()).m_ulFlag))) ||
|
|
HAS_PROP_ALPHA(State.m_Properties))
|
|
{
|
|
//
|
|
// suspected to be time 12:33 14:22 15:22:33
|
|
// or AM/PM time format 12:22AM 13PM
|
|
//
|
|
|
|
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerSuspect,
|
|
("%*.*S suspected to be AM/PM time",
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_pwcsToken + State.m_ulStart
|
|
));
|
|
|
|
if (VerifyTime(State))
|
|
{
|
|
return true;
|
|
}
|
|
|
|
}
|
|
|
|
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerSuspect,
|
|
("%*.*S suspected to be a simple number",
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_pwcsToken + State.m_ulStart
|
|
));
|
|
|
|
if (VerifyNumber(State))
|
|
{
|
|
return true;
|
|
}
|
|
|
|
if (TEST_PROP(State.m_Properties, PROP_DATE_SEPERATOR))
|
|
{
|
|
//
|
|
// suspected to be a date 1999-05-04 or 1998/11/10 1999.05.04
|
|
//
|
|
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerSuspect,
|
|
("%*.*S suspected to be a date",
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_ulEnd - m_pCurToken->m_State.m_ulStart,
|
|
m_pCurToken->m_State.m_pwcsToken + m_pCurToken->m_State.m_ulStart
|
|
));
|
|
|
|
return VerifyDate(State);
|
|
}
|
|
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
bool CTokenizer::VerifyTime(CTokenState& S)
|
|
{
|
|
CTokenState State(S);
|
|
CPropFlag PunctHead(TIME_ADDITIONAL_PUNCT_HEAD);
|
|
CPropFlag PunctTail(TIME_ADDITIONAL_PUNCT_TAIL);
|
|
|
|
if (TEST_PROP(State.m_Properties,
|
|
(TIME_ADDITIONAL_PUNCT_HEAD | TIME_ADDITIONAL_PUNCT_TAIL)))
|
|
{
|
|
ULONG ulCharRemoved;
|
|
ulCharRemoved= m_pCurToken->RemoveHeadPunct(PunctHead, State);
|
|
ulCharRemoved += m_pCurToken->RemoveTailPunct(PunctTail, State);
|
|
if (ulCharRemoved)
|
|
{
|
|
m_pCurToken->ComputeStateProperties(State);
|
|
}
|
|
}
|
|
|
|
if ((State.m_ulEnd - State.m_ulStart) > MAX_TIME_FORMAT_LEN)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
WCHAR pwcsBuf[MAX_TIME_FORMAT_LEN + 1];
|
|
ULONG ulCur = State.m_ulStart;
|
|
WCHAR wcSeperator = 0xFFFF;
|
|
ULONG ul = 0;
|
|
|
|
//
|
|
// formatting the text to a date format
|
|
//
|
|
|
|
while (ulCur < State.m_ulEnd)
|
|
{
|
|
CPropFlag prop(GET_PROP(State.m_pwcsToken[ulCur]));
|
|
if (HAS_PROP_NUMBER(prop))
|
|
{
|
|
pwcsBuf[ul] = L'#';
|
|
}
|
|
else if (State.m_pwcsToken[ulCur] == m_apLangSupport->GetTimeSeperator())
|
|
{
|
|
if (0xFFFF == wcSeperator)
|
|
{
|
|
wcSeperator = State.m_pwcsToken[ulCur];
|
|
}
|
|
else if (wcSeperator != State.m_pwcsToken[ulCur])
|
|
{
|
|
return false;
|
|
}
|
|
pwcsBuf[ul] = L':';
|
|
}
|
|
else if (HAS_PROP_ALPHA(prop) || HAS_PROP_PERIOD(prop))
|
|
{
|
|
pwcsBuf[ul] = State.m_pwcsToken[ulCur];
|
|
}
|
|
else
|
|
{
|
|
return false;
|
|
}
|
|
|
|
ul++;
|
|
ulCur++;
|
|
}
|
|
|
|
pwcsBuf[ul] = L'\0';
|
|
|
|
CTimeTerm* pTerm;
|
|
short sResCount = 0;
|
|
DictStatus status;
|
|
|
|
status = g_pTimeFormat->m_trieTimeFormat.trie_Find(
|
|
pwcsBuf,
|
|
TRIE_LONGEST_MATCH | TRIE_IGNORECASE,
|
|
1,
|
|
&pTerm,
|
|
&sResCount);
|
|
if (!(sResCount && (pTerm->bLen == ul)))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
LONG lHour;
|
|
LONG lMin;
|
|
LONG lSec;
|
|
TimeFormat AmPm;
|
|
|
|
GetValuesFromTimeString(
|
|
pTerm,
|
|
State.m_pwcsToken + State.m_ulStart ,
|
|
&lHour,
|
|
&lMin,
|
|
&lSec,
|
|
&AmPm);
|
|
|
|
if (None == AmPm)
|
|
{
|
|
if (lHour > 24)
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (lHour > 12)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
if (Am == AmPm)
|
|
{
|
|
if (12 == lHour)
|
|
{
|
|
lHour = 0;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (lHour < 12)
|
|
{
|
|
lHour += 12;
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
if (lMin > 59)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
if (lSec > 59)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
WCHAR pwcsTime[9] = {L'\0',L'\0',L'\0',L'\0',L'\0',L'\0',L'\0',L'\0',L'\0'};
|
|
|
|
swprintf(pwcsTime, L"TT%02d%02d", lHour, lMin);
|
|
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerDecision,
|
|
("%*.*S is a time -> %S",
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_pwcsToken + State.m_ulStart,
|
|
pwcsTime));
|
|
|
|
OutputTime(pwcsTime, State);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool CTokenizer::VerifyDate(CTokenState& S)
|
|
{
|
|
CTokenState State(S);
|
|
CPropFlag PunctHead(DATE_ADDITIONAL_PUNCT_HEAD);
|
|
CPropFlag PunctTail(DATE_ADDITIONAL_PUNCT_TAIL);
|
|
if (TEST_PROP(State.m_Properties,
|
|
(DATE_ADDITIONAL_PUNCT_HEAD | DATE_ADDITIONAL_PUNCT_TAIL)))
|
|
{
|
|
ULONG ulCharRemoved;
|
|
ulCharRemoved= m_pCurToken->RemoveHeadPunct(PunctHead, State);
|
|
ulCharRemoved += m_pCurToken->RemoveTailPunct(PunctTail, State);
|
|
if (ulCharRemoved)
|
|
{
|
|
m_pCurToken->ComputeStateProperties(State);
|
|
}
|
|
}
|
|
|
|
|
|
WCHAR pwcsBuf[MAX_DATE_FORMAT_LEN + 1];
|
|
|
|
if (State.m_ulEnd - State.m_ulStart > MAX_DATE_FORMAT_LEN)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
ULONG ulCur = State.m_ulStart;
|
|
WCHAR wcSeperator = 0xFFFF;
|
|
ULONG ul = 0;
|
|
|
|
//
|
|
// formatting the text to a date format
|
|
//
|
|
|
|
while (ulCur < State.m_ulEnd)
|
|
{
|
|
CPropFlag prop(GET_PROP(State.m_pwcsToken[ulCur]));
|
|
if (HAS_PROP_NUMBER(prop))
|
|
{
|
|
pwcsBuf[ul] = L'#';
|
|
}
|
|
else if (HAS_PROP_PERIOD(prop) ||
|
|
HAS_PROP_DASH(prop) ||
|
|
HAS_PROP_SLASH(prop))
|
|
{
|
|
if (0xFFFF == wcSeperator)
|
|
{
|
|
wcSeperator = State.m_pwcsToken[ulCur];
|
|
}
|
|
else if (wcSeperator != State.m_pwcsToken[ulCur])
|
|
{
|
|
return false;
|
|
}
|
|
pwcsBuf[ul] = L'.';
|
|
}
|
|
else
|
|
{
|
|
return false;
|
|
}
|
|
|
|
ul++;
|
|
ulCur++;
|
|
}
|
|
|
|
pwcsBuf[ul] = L'\0';
|
|
|
|
CDateTerm* pTerm;
|
|
short sResCount = 0;
|
|
DictStatus status;
|
|
|
|
status = g_pDateFormat->m_trieDateFormat.trie_Find(
|
|
pwcsBuf,
|
|
TRIE_LONGEST_MATCH | TRIE_IGNORECASE,
|
|
1,
|
|
&pTerm,
|
|
&sResCount);
|
|
if (!(sResCount && (pTerm->bLen == ul)))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
LONG lD_M1;
|
|
LONG lD_M2;
|
|
LONG lYear;
|
|
|
|
GetValuesFromDateString(
|
|
pTerm,
|
|
State.m_pwcsToken + State.m_ulStart,
|
|
&lD_M1,
|
|
&lD_M2,
|
|
&lYear);
|
|
|
|
LONG lDay;
|
|
LONG lMonth;
|
|
|
|
//
|
|
// language dependent
|
|
//
|
|
|
|
if (m_apLangSupport->IsDayMonthOrder() ||
|
|
pTerm->bType == YYMMDD_TYPE)
|
|
{
|
|
lDay = lD_M1;
|
|
lMonth = lD_M2;
|
|
}
|
|
else
|
|
{
|
|
lDay = lD_M2;
|
|
lMonth = lD_M1;
|
|
}
|
|
|
|
if (!((lDay > 0) && (lDay <= 31)))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
if (!((lMonth > 0) && (lMonth <= 12)))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
|
|
WCHAR pwcsDate1[11] = { L'D', L'D', L'0', L'0', L'0', L'0', L'0', L'0', L'0', L'0', L'\0'};
|
|
WCHAR pwcsDate2[11];
|
|
bool bY2K = false;
|
|
|
|
if (lYear <= 99) // Y2k bug
|
|
{
|
|
_ltow(lYear + 1900, pwcsDate1 + 2, 10);
|
|
bY2K = true;
|
|
}
|
|
else if (lYear < 1000)
|
|
{
|
|
_ltow(lYear, pwcsDate1 + 3, 10);
|
|
}
|
|
else
|
|
{
|
|
_ltow(lYear, pwcsDate1 + 2, 10);
|
|
}
|
|
|
|
if (lMonth < 10)
|
|
{
|
|
pwcsDate1[6] = L'0';
|
|
_ltow(lMonth, pwcsDate1 + 7, 10);
|
|
}
|
|
else
|
|
{
|
|
_ltow(lMonth, pwcsDate1 + 6, 10);
|
|
}
|
|
|
|
if (lDay < 10)
|
|
{
|
|
pwcsDate1[8] = L'0';
|
|
_ltow(lDay, pwcsDate1 + 9, 10);
|
|
}
|
|
else
|
|
{
|
|
_ltow(lDay, pwcsDate1 + 8, 10);
|
|
}
|
|
|
|
if (bY2K)
|
|
{
|
|
wcscpy(pwcsDate2, pwcsDate1);
|
|
pwcsDate2[2] = L'2';
|
|
pwcsDate2[3] = L'0';
|
|
}
|
|
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerDecision,
|
|
("%*.*S is a date",
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_pwcsToken + State.m_ulStart
|
|
));
|
|
|
|
if (bY2K)
|
|
{
|
|
OutputDate(pwcsDate1, pwcsDate2, State);
|
|
}
|
|
else
|
|
{
|
|
OutputDate(pwcsDate1, NULL, State);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool CTokenizer::VerifyNumber(CTokenState& S)
|
|
{
|
|
CTokenState State(S);
|
|
|
|
WCHAR pwcsNumber[TOKENIZER_MAXBUFFERLIMIT + 10];
|
|
|
|
ULONG ulOutLen;
|
|
ULONG ulOffsetToTxt;
|
|
|
|
const CCliticsTerm* pCliticsTerm;
|
|
pCliticsTerm = VerifyClitics(State);
|
|
|
|
ULONG ulAddToStart = 0;
|
|
ULONG ulDecFromEnd = 0;
|
|
|
|
if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
|
|
{
|
|
ulAddToStart = pCliticsTerm->ulLen;
|
|
}
|
|
else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
|
|
{
|
|
ulDecFromEnd = pCliticsTerm->ulLen;
|
|
}
|
|
|
|
bool fRet = CheckAndCreateNumber(
|
|
State.m_pwcsToken + State.m_ulStart + ulAddToStart,
|
|
State.m_ulEnd - State.m_ulStart - ulAddToStart - ulDecFromEnd,
|
|
pwcsNumber,
|
|
&ulOffsetToTxt,
|
|
&ulOutLen);
|
|
|
|
if (!fRet)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerDecision,
|
|
("%*.*S is a number",
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_pwcsToken + State.m_ulStart
|
|
));
|
|
|
|
OutputNumbers(State, ulOutLen, pwcsNumber + ulOffsetToTxt, pCliticsTerm);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool CTokenizer::VerifyCurrency()
|
|
{
|
|
//
|
|
// format is either $12.22 or 12.22$
|
|
//
|
|
CPropFlag PunctHead(CURRENCY_PUNCT_HEAD);
|
|
CPropFlag PunctTail(CURRENCY_PUNCT_TAIL);
|
|
CTokenState State(m_pCurToken->m_State);
|
|
|
|
if (TEST_PROP(State.m_Properties,
|
|
(CURRENCY_PUNCT_HEAD | CURRENCY_PUNCT_TAIL)))
|
|
{
|
|
ULONG ulCharRemoved;
|
|
ulCharRemoved= m_pCurToken->RemoveHeadPunct(PunctHead, State);
|
|
ulCharRemoved += m_pCurToken->RemoveTailPunct(PunctTail, State);
|
|
if (ulCharRemoved)
|
|
{
|
|
m_pCurToken->ComputeStateProperties(State);
|
|
}
|
|
}
|
|
|
|
const CCliticsTerm* pCliticsTerm;
|
|
pCliticsTerm = VerifyClitics(State);
|
|
|
|
ULONG ulAddToStart = 0;
|
|
ULONG ulDecFromEnd = 0;
|
|
|
|
if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
|
|
{
|
|
ulAddToStart = pCliticsTerm->ulLen;
|
|
}
|
|
else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
|
|
{
|
|
ulDecFromEnd = pCliticsTerm->ulLen;
|
|
}
|
|
|
|
|
|
WCHAR wchCurrency;
|
|
WCHAR pwcsCurrency[TOKENIZER_MAXBUFFERLIMIT + 10];
|
|
WCHAR* pwcsStr = State.m_pwcsToken + State.m_ulStart;
|
|
|
|
if (HAS_PROP_CURRENCY(GET_PROP(State.m_pwcsToken[State.m_ulStart + ulAddToStart])))
|
|
{
|
|
wchCurrency = State.m_pwcsToken[State.m_ulStart + ulAddToStart];
|
|
pwcsStr += 1;
|
|
}
|
|
else if (HAS_PROP_CURRENCY(GET_PROP(State.m_pwcsToken[State.m_ulEnd - 1 - ulDecFromEnd])))
|
|
{
|
|
wchCurrency = State.m_pwcsToken[State.m_ulEnd - 1 - ulDecFromEnd];
|
|
}
|
|
else
|
|
{
|
|
return false;
|
|
}
|
|
|
|
ULONG ulOutLen;
|
|
ULONG ulOffsetToTxt;
|
|
|
|
if (false == CheckAndCreateNumber(
|
|
pwcsStr + ulAddToStart,
|
|
State.m_ulEnd - State.m_ulStart - 1 - ulAddToStart - ulDecFromEnd,
|
|
pwcsCurrency,
|
|
&ulOffsetToTxt,
|
|
&ulOutLen))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
Assert(ulOffsetToTxt + ulOutLen + 1 < m_ulMaxTokenSize + 4);
|
|
pwcsCurrency[ulOffsetToTxt + ulOutLen] = wchCurrency;
|
|
pwcsCurrency[ulOffsetToTxt + ulOutLen + 1] = L'\0';
|
|
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerDecision,
|
|
("%*.*S is a currency",
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_pwcsToken + State.m_ulStart
|
|
));
|
|
|
|
OutputCurrency(ulOutLen+1, pwcsCurrency + ulOffsetToTxt , State, pCliticsTerm);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool CTokenizer::VerifyCommersialSign()
|
|
{
|
|
CTokenState State(m_pCurToken->m_State);
|
|
CPropFlag CommPunctTail(COMMERSIAL_SIGN_PUNCT_TAIL);
|
|
CPropFlag CommPunctHead(COMMERSIAL_SIGN_PUNCT_HEAD);
|
|
|
|
if (TEST_PROP(State.m_Properties, (COMMERSIAL_SIGN_PUNCT_TAIL | COMMERSIAL_SIGN_PUNCT_HEAD)))
|
|
{
|
|
ULONG ulCharRemoved = m_pCurToken->RemoveTailPunct(CommPunctTail, State);
|
|
ulCharRemoved += m_pCurToken->RemoveHeadPunct(CommPunctHead, State);
|
|
if (ulCharRemoved)
|
|
{
|
|
m_pCurToken->ComputeStateProperties(State);
|
|
}
|
|
}
|
|
|
|
if (TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulEnd - 1]),
|
|
PROP_COMMERSIAL_SIGN))
|
|
{
|
|
//
|
|
// the length of the token must be greater then 1 since it includes an alpha
|
|
// and the commersial sign
|
|
//
|
|
Assert((State.m_ulEnd - State.m_ulStart) > 1);
|
|
OutputCommersialSignToken(State);
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
void CTokenizer::ProcessDefault()
|
|
{
|
|
CTokenState State(m_pCurToken->m_State);
|
|
|
|
if (TEST_PROP(State.m_Properties, PROP_DEFAULT_BREAKER))
|
|
{
|
|
if (TEST_PROP(State.m_Properties, PROP_FIRST_LEVEL_BREAKER))
|
|
{
|
|
CPropFlag prop(PROP_FIRST_LEVEL_BREAKER);
|
|
|
|
BreakCompundString(State, prop);
|
|
|
|
return;
|
|
}
|
|
|
|
if (TEST_PROP(State.m_Properties, PROP_SECOND_LEVEL_BREAKER))
|
|
{
|
|
CPropFlag prop(PROP_SECOND_LEVEL_BREAKER);
|
|
|
|
BreakCompundString(State, prop);
|
|
|
|
return;
|
|
}
|
|
}
|
|
|
|
//
|
|
// this is a simple token
|
|
//
|
|
|
|
const CCliticsTerm* pCliticsTerm;
|
|
pCliticsTerm = VerifyClitics(State);
|
|
|
|
if (pCliticsTerm == &g_EmptyClitics)
|
|
{
|
|
if (TEST_PROP(State.m_Properties, PROP_NBS))
|
|
{
|
|
CPropFlag prop(PROP_NBS);
|
|
|
|
BreakCompundString(State, prop);
|
|
|
|
return;
|
|
}
|
|
|
|
CPropFlag PunctHead(SIMPLE_PUNCT_HEAD);
|
|
CPropFlag PunctTail(SIMPLE_PUNCT_TAIL);
|
|
|
|
if (TEST_PROP(State.m_Properties,
|
|
(SIMPLE_PUNCT_HEAD | SIMPLE_PUNCT_TAIL)))
|
|
{
|
|
ULONG ulCharRemoved;
|
|
ulCharRemoved= m_pCurToken->RemoveHeadPunct(PunctHead, State);
|
|
ulCharRemoved += m_pCurToken->RemoveTailPunct(PunctTail, State);
|
|
|
|
if ( TEST_PROP(State.m_Properties, PROP_UNDERSCORE) )
|
|
{
|
|
|
|
bool hasFrontUnderscore =
|
|
(State.m_ulStart > m_pCurToken->m_State.m_ulStart) &&
|
|
TEST_PROP( GET_PROP(State.m_pwcsToken[State.m_ulStart-1]),
|
|
PROP_UNDERSCORE ) &&
|
|
TEST_PROP( GET_PROP(State.m_pwcsToken[State.m_ulStart]),
|
|
PROP_ALPHA_NUMERIC );
|
|
|
|
bool hasBackUnderscore =
|
|
(State.m_ulEnd < m_pCurToken->m_State.m_ulEnd) &&
|
|
TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulEnd]),
|
|
PROP_UNDERSCORE) &&
|
|
TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulEnd-1]),
|
|
PROP_ALPHA_NUMERIC);
|
|
|
|
//
|
|
// Note: To change the policy to "leave ALL attached underscore
|
|
// seuences, simply change below condition to:
|
|
// if ( (hasFrontUnderscore || hasBackUnderscore) )
|
|
//
|
|
|
|
if ( (hasFrontUnderscore ^ hasBackUnderscore) )
|
|
{
|
|
ulCharRemoved -=
|
|
|
|
AddBackUnderscores(
|
|
State,
|
|
hasFrontUnderscore,
|
|
hasBackUnderscore
|
|
);
|
|
|
|
}
|
|
|
|
} // if ( TEST_PROP(State.m_Properties, PROP_UNDERSCORE) )
|
|
|
|
if (ulCharRemoved)
|
|
{
|
|
m_pCurToken->ComputeStateProperties(State);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (State.m_ulEnd == State.m_ulStart)
|
|
{
|
|
//
|
|
// case we remove all chracters in the above statement
|
|
//
|
|
return;
|
|
}
|
|
|
|
Trace(
|
|
elVerbose,
|
|
s_tagTokenizerDecision,
|
|
("%*.*S is a simple token",
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_pwcsToken + State.m_ulStart
|
|
));
|
|
|
|
OutputSimpleToken(State, pCliticsTerm);
|
|
}
|
|
|
|
//
|
|
// CTokenizer::AddBackUnderscores:
|
|
//
|
|
// Treat cases of a "simple" token with head and/or tail underscore
|
|
// sequence (consecutive underscores prefix or suffix); those
|
|
// do not get flipped off and remain part of the token.
|
|
// This routine is called after underscore removal, (as a result of
|
|
// Remove[Head|Tail]Punct) and adds them back in.
|
|
//
|
|
// return value: Number of underscores added back in.
|
|
//
|
|
ULONG
|
|
CTokenizer::AddBackUnderscores(
|
|
IN CTokenState& State,
|
|
IN bool hasFrontUnderscore,
|
|
IN bool hasBackUnderscore
|
|
)
|
|
{
|
|
ULONG ulCharsAdded = 0;
|
|
|
|
if ( hasFrontUnderscore )
|
|
{
|
|
// Move left over consecutive underscores
|
|
ulCharsAdded = m_pCurToken->FindLeftmostUnderscore(State);
|
|
|
|
}
|
|
|
|
if ( hasBackUnderscore )
|
|
{
|
|
|
|
// Move right over consecutive underscores
|
|
ulCharsAdded += m_pCurToken->FindRightmostUnderscore(State);
|
|
|
|
} // if ( hasFrontUnderscore )
|
|
|
|
return ulCharsAdded;
|
|
|
|
} // CTokenizer::AddBackUnderscores()
|
|
|
|
void CTokenizer::OutputUrl(CTokenState& State)
|
|
{
|
|
HRESULT hr;
|
|
|
|
ULONG ulOffsetInTxtSourceBuffer =
|
|
m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
|
|
|
|
ULONG ulCur = State.m_ulStart;
|
|
ULONG ulStart = ulCur;
|
|
ULONG ulLenInTxtSourceBuffer = 0;
|
|
ULONG ulOffsetDueToAnEscapeChar;
|
|
|
|
while (ulCur < State.m_ulEnd)
|
|
{
|
|
ulLenInTxtSourceBuffer++;
|
|
ulOffsetDueToAnEscapeChar = 0;
|
|
|
|
if ((State.m_pwcsToken[ulCur] == L'%') &&
|
|
(ulCur <= State.m_ulEnd - 2))
|
|
{
|
|
//
|
|
// replacing escape charaters with real ones.
|
|
//
|
|
if (TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur+1]) , PROP_XDIGIT) &&
|
|
TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur+2]) , PROP_XDIGIT))
|
|
{
|
|
short sVal;
|
|
sVal = ConvertHexCharToNumber(State.m_pwcsToken[ulCur + 1]);
|
|
sVal *= 16;
|
|
sVal += ConvertHexCharToNumber(State.m_pwcsToken[ulCur + 2]);
|
|
|
|
State.m_pwcsToken[ulCur+2] = sVal;
|
|
for (ULONG ul = ulCur -1 ; ul >= ulStart; ul--)
|
|
{
|
|
State.m_pwcsToken[ul+2] = State.m_pwcsToken[ul];
|
|
}
|
|
ulCur += 2;
|
|
ulStart+=2;
|
|
ulOffsetDueToAnEscapeChar = 2;
|
|
ulLenInTxtSourceBuffer += 2;
|
|
}
|
|
else if ((ulCur <= State.m_ulEnd - 5) &&
|
|
((State.m_pwcsToken[ulCur+1] == L'u') ||
|
|
(State.m_pwcsToken[ulCur+1] == L'U')) &&
|
|
TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur+2]) , PROP_XDIGIT) &&
|
|
TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur+3]) , PROP_XDIGIT) &&
|
|
TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur+4]) , PROP_XDIGIT) &&
|
|
TEST_PROP(GET_PROP(State.m_pwcsToken[ulCur+5]) , PROP_XDIGIT))
|
|
{
|
|
short sVal;
|
|
sVal = ConvertHexCharToNumber(State.m_pwcsToken[ulCur + 2]);
|
|
sVal *= 0x1000;
|
|
sVal += ConvertHexCharToNumber(State.m_pwcsToken[ulCur + 3]);
|
|
sVal *= 0x100;
|
|
sVal += ConvertHexCharToNumber(State.m_pwcsToken[ulCur + 4]);
|
|
sVal *= 0x10;
|
|
sVal += ConvertHexCharToNumber(State.m_pwcsToken[ulCur + 5]);
|
|
|
|
State.m_pwcsToken[ulCur+5] = sVal;
|
|
|
|
for (ULONG ul = ulCur -1 ; ul >= ulStart; ul--)
|
|
{
|
|
State.m_pwcsToken[ul+5] = State.m_pwcsToken[ul];
|
|
}
|
|
ulCur += 5;
|
|
ulStart+=5;
|
|
ulOffsetDueToAnEscapeChar = 5;
|
|
ulLenInTxtSourceBuffer += 5;
|
|
}
|
|
}
|
|
|
|
if ( IS_BREAKER( State.m_pwcsToken[ulCur] ) )
|
|
{
|
|
if (ulCur - ulStart == 0)
|
|
{
|
|
//
|
|
// only punctuation
|
|
//
|
|
ulCur++;
|
|
ulStart = ulCur;
|
|
ulOffsetInTxtSourceBuffer += ulOffsetDueToAnEscapeChar + 1;
|
|
ulLenInTxtSourceBuffer = 0;
|
|
continue;
|
|
}
|
|
|
|
hr = m_apWordSink->PutWord(
|
|
ulCur - ulStart,
|
|
&State.m_pwcsToken[ulStart],
|
|
ulLenInTxtSourceBuffer - 1 - ulOffsetDueToAnEscapeChar,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
ulStart = ulCur + 1;
|
|
ulOffsetInTxtSourceBuffer += ulLenInTxtSourceBuffer;
|
|
ulLenInTxtSourceBuffer = 0;
|
|
|
|
}
|
|
ulCur++;
|
|
|
|
}
|
|
|
|
//
|
|
// last word.
|
|
//
|
|
|
|
if (ulStart < ulCur)
|
|
{
|
|
hr = m_apWordSink->PutWord(
|
|
ulCur - ulStart,
|
|
&State.m_pwcsToken[ulStart],
|
|
ulLenInTxtSourceBuffer,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
void CTokenizer::OutputNumbers(
|
|
CTokenState& State,
|
|
ULONG ulLen,
|
|
WCHAR* pwcsNumber,
|
|
const CCliticsTerm* pCliticsTerm)
|
|
{
|
|
HRESULT hr;
|
|
//
|
|
// Input: 1.22 Output: 1.22, NN1D22
|
|
//
|
|
|
|
ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
|
|
|
|
if (ulLen > m_ulMaxTokenSize)
|
|
{
|
|
hr = m_apWordSink->PutWord(
|
|
State.m_ulEnd - State.m_ulStart,
|
|
&State.m_pwcsToken[State.m_ulStart],
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
return;
|
|
}
|
|
|
|
hr = m_apWordSink->PutAltWord(
|
|
State.m_ulEnd - State.m_ulStart,
|
|
&State.m_pwcsToken[State.m_ulStart],
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
|
|
{
|
|
hr = m_apWordSink->PutAltWord(
|
|
State.m_ulEnd - State.m_ulStart - pCliticsTerm->ulLen,
|
|
State.m_pwcsToken + State.m_ulStart + pCliticsTerm->ulLen,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
|
|
{
|
|
hr = m_apWordSink->PutAltWord(
|
|
State.m_ulEnd - State.m_ulStart - pCliticsTerm->ulLen,
|
|
State.m_pwcsToken + State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
|
|
hr = m_apWordSink->PutWord(
|
|
ulLen,
|
|
pwcsNumber,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
|
|
}
|
|
|
|
void CTokenizer::OutputParens(CTokenState& State)
|
|
{
|
|
HRESULT hr;
|
|
//
|
|
// format is xxx(s)
|
|
// Input: xxx(s) Output: xxx
|
|
//
|
|
|
|
State.m_pwcsToken[State.m_ulEnd - 3] = L'\0';
|
|
|
|
hr = m_apWordSink->PutWord(
|
|
State.m_ulEnd - 3 - State.m_ulStart,
|
|
&State.m_pwcsToken[State.m_ulStart],
|
|
State.m_ulEnd - State.m_ulStart,
|
|
m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State));
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
|
|
void CTokenizer::OutputAcronym(CTokenState& State, const CCliticsTerm* pCliticsTerm)
|
|
{
|
|
HRESULT hr;
|
|
//
|
|
// Input: I.B.M Output: I.B.M, IBM
|
|
//
|
|
|
|
ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
|
|
|
|
ULONG ulAddToStart = 0;
|
|
ULONG ulDecFromEnd = 0;
|
|
|
|
if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
|
|
{
|
|
ulAddToStart = pCliticsTerm->ulLen;
|
|
}
|
|
else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
|
|
{
|
|
ulDecFromEnd = pCliticsTerm->ulLen;
|
|
|
|
}
|
|
|
|
hr = m_apWordSink->PutAltWord(
|
|
State.m_ulEnd - ulDecFromEnd - (State.m_ulStart + ulAddToStart),
|
|
State.m_pwcsToken + State.m_ulStart + ulAddToStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
ULONG ulCur = State.m_ulStart + ulAddToStart;
|
|
ULONG ulNext = ulCur;
|
|
|
|
while (ulCur < State.m_ulEnd)
|
|
{
|
|
if (!HAS_PROP_PERIOD(GET_PROP(State.m_pwcsToken[ulCur])))
|
|
{
|
|
State.m_pwcsToken[ulNext] = State.m_pwcsToken[ulCur];
|
|
ulNext++;
|
|
ulCur++;
|
|
continue;
|
|
}
|
|
ulCur++;
|
|
}
|
|
|
|
if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
|
|
{
|
|
hr = m_apWordSink->PutAltWord(
|
|
ulNext - (State.m_ulStart + ulAddToStart),
|
|
State.m_pwcsToken + State.m_ulStart + ulAddToStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
|
|
|
|
hr = m_apWordSink->PutWord(
|
|
ulNext - ulDecFromEnd - (State.m_ulStart + ulAddToStart),
|
|
State.m_pwcsToken + State.m_ulStart + ulAddToStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
|
|
void CTokenizer::OutputAbbreviation(CTokenState& State)
|
|
{
|
|
HRESULT hr;
|
|
ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
|
|
|
|
hr = m_apWordSink->PutAltWord(
|
|
State.m_ulEnd - State.m_ulStart - 1,
|
|
&State.m_pwcsToken[State.m_ulStart],
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
hr = m_apWordSink->PutWord(
|
|
State.m_ulEnd - State.m_ulStart,
|
|
&State.m_pwcsToken[State.m_ulStart],
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
|
|
void CTokenizer::OutputSpecialAbbreviation(
|
|
CTokenState& State,
|
|
CAbbTerm* pTerm,
|
|
const CCliticsTerm* pCliticsTerm)
|
|
{
|
|
HRESULT hr;
|
|
ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
|
|
|
|
WCHAR* pwcsAbb = pTerm->pwcsAbb;
|
|
ULONG ulLen = pTerm->ulAbbLen;
|
|
|
|
if (pTerm->pwcsCanonicalForm)
|
|
{
|
|
pwcsAbb = pTerm->pwcsCanonicalForm;
|
|
ulLen = pTerm->ulCanLen;
|
|
}
|
|
|
|
if (TAIL_MATCH_TRUNCATE == pCliticsTerm->ulOp)
|
|
{
|
|
WCHAR pwcs[TOKENIZER_MAXBUFFERLIMIT];
|
|
|
|
int iCount;
|
|
iCount = _snwprintf(
|
|
pwcs,
|
|
TOKENIZER_MAXBUFFERLIMIT,
|
|
L"%s%s",
|
|
pwcsAbb,
|
|
pCliticsTerm->pwcs);
|
|
|
|
Assert(iCount < TOKENIZER_MAXBUFFERLIMIT);
|
|
|
|
pwcs[TOKENIZER_MAXBUFFERLIMIT - 1] = L'\0';
|
|
hr = m_apWordSink->PutAltWord(
|
|
ulLen + pCliticsTerm->ulLen,
|
|
pwcs,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
|
|
hr = m_apWordSink->PutWord(
|
|
ulLen,
|
|
pwcsAbb,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
|
|
void CTokenizer::OutputHyphenation(CTokenState& State, const CCliticsTerm* pCliticsTerm)
|
|
{
|
|
//
|
|
// Input: Data-Base Output Data Base, DataBase (only in query time)
|
|
//
|
|
HRESULT hr;
|
|
ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
|
|
|
|
ULONG ulAddToStart = 0;
|
|
ULONG ulDecFromEnd = 0;
|
|
|
|
if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
|
|
{
|
|
ulAddToStart = pCliticsTerm->ulLen;
|
|
}
|
|
else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
|
|
{
|
|
ulDecFromEnd = pCliticsTerm->ulLen;
|
|
}
|
|
|
|
ULONG ulCur = State.m_ulStart + ulAddToStart;
|
|
ULONG ulStart = ulCur;
|
|
ULONG ulRelPosInTxtSrcBuff = ulOffsetInTxtSourceBuffer;
|
|
|
|
if (m_bQueryTime)
|
|
{
|
|
ULONG ulNext = ulCur;
|
|
hr = m_apWordSink->StartAltPhrase();
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
ULONG ulAdd = ulAddToStart;
|
|
while (ulCur < State.m_ulEnd)
|
|
{
|
|
if ( HAS_PROP_DASH(GET_PROP(m_pCurToken->m_State.m_pwcsToken[ulCur])))
|
|
{
|
|
hr = m_apWordSink->PutWord(
|
|
ulNext - ulStart,
|
|
&State.m_pwcsToken[ulStart],
|
|
ulNext - ulStart + ulAdd,
|
|
ulRelPosInTxtSrcBuff);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
ulRelPosInTxtSrcBuff += ulNext - ulStart + 1 + ulAdd;
|
|
ulStart = ulNext;
|
|
ulCur++;
|
|
ulAdd = 0;
|
|
continue;
|
|
}
|
|
|
|
State.m_pwcsToken[ulNext] = State.m_pwcsToken[ulCur];
|
|
ulNext++;
|
|
ulCur++;
|
|
}
|
|
|
|
Assert(ulCur > ulStart);
|
|
|
|
if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
|
|
{
|
|
hr = m_apWordSink->PutAltWord(
|
|
ulNext - ulStart,
|
|
&State.m_pwcsToken[ulStart],
|
|
ulNext - ulStart,
|
|
ulRelPosInTxtSrcBuff);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
|
|
hr = m_apWordSink->PutWord(
|
|
ulNext - ulStart - ulDecFromEnd,
|
|
&State.m_pwcsToken[ulStart],
|
|
ulNext - ulStart,
|
|
ulRelPosInTxtSrcBuff);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
hr = m_apWordSink->StartAltPhrase();
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
|
|
{
|
|
hr = m_apWordSink->PutAltWord(
|
|
ulNext - State.m_ulStart,
|
|
&State.m_pwcsToken[State.m_ulStart],
|
|
State.m_ulEnd - State.m_ulStart - ulAddToStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
|
|
hr = m_apWordSink->PutWord(
|
|
ulNext - State.m_ulStart - ulDecFromEnd - ulAddToStart,
|
|
State.m_pwcsToken + State.m_ulStart + ulAddToStart,
|
|
State.m_ulEnd - State.m_ulStart + ulAddToStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
hr = m_apWordSink->EndAltPhrase();
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
else
|
|
{
|
|
ULONG ulAdd = ulAddToStart;
|
|
|
|
while (ulCur < State.m_ulEnd)
|
|
{
|
|
if (HAS_PROP_DASH(GET_PROP(m_pCurToken->m_State.m_pwcsToken[ulCur])))
|
|
{
|
|
hr = m_apWordSink->PutWord(
|
|
ulCur - ulStart,
|
|
&State.m_pwcsToken[ulStart],
|
|
ulCur - ulStart + ulAdd,
|
|
ulRelPosInTxtSrcBuff);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
ulRelPosInTxtSrcBuff += ulCur - ulStart + 1 + ulAdd;
|
|
ulStart = ulCur + 1;
|
|
ulAdd = 0;
|
|
}
|
|
ulCur++;
|
|
}
|
|
|
|
Assert(ulCur > ulStart);
|
|
|
|
if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
|
|
{
|
|
hr = m_apWordSink->PutAltWord(
|
|
ulCur - ulStart,
|
|
&State.m_pwcsToken[ulStart],
|
|
ulCur - ulStart,
|
|
ulRelPosInTxtSrcBuff);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
|
|
hr = m_apWordSink->PutWord(
|
|
ulCur - ulStart - ulDecFromEnd,
|
|
&State.m_pwcsToken[ulStart],
|
|
ulCur - ulStart,
|
|
ulRelPosInTxtSrcBuff);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
}
|
|
}
|
|
|
|
void CTokenizer::OutputTime(WCHAR* pwcsTime, CTokenState& State)
|
|
{
|
|
HRESULT hr;
|
|
//
|
|
// Output: TT1353
|
|
//
|
|
|
|
ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
|
|
|
|
hr = m_apWordSink->PutAltWord(
|
|
State.m_ulEnd - State.m_ulStart,
|
|
&State.m_pwcsToken[State.m_ulStart],
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
|
|
hr = m_apWordSink->PutWord(
|
|
6,
|
|
pwcsTime,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
}
|
|
|
|
void CTokenizer::OutputDate(
|
|
WCHAR* pwcsDate1,
|
|
WCHAR* pwcsDate2,
|
|
CTokenState& State)
|
|
{
|
|
HRESULT hr;
|
|
//
|
|
// Output: DD19990921
|
|
//
|
|
|
|
ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
|
|
hr = m_apWordSink->PutAltWord(
|
|
State.m_ulEnd - State.m_ulStart,
|
|
&State.m_pwcsToken[State.m_ulStart],
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
|
|
if (pwcsDate2)
|
|
{
|
|
hr = m_apWordSink->PutAltWord(
|
|
10,
|
|
pwcsDate2,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
|
|
hr = m_apWordSink->PutWord(
|
|
10,
|
|
pwcsDate1,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
|
|
void CTokenizer::OutputSimpleToken(CTokenState& State, const CCliticsTerm* pTerm)
|
|
{
|
|
HRESULT hr;
|
|
ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
|
|
|
|
if (((TAIL_MATCH_TRUNCATE == pTerm->ulOp) ||
|
|
(HEAD_MATCH_TRUNCATE == pTerm->ulOp)) &&
|
|
(State.m_ulStart + pTerm->ulLen < State.m_ulEnd))
|
|
{
|
|
hr = m_apWordSink->PutAltWord(
|
|
State.m_ulEnd - State.m_ulStart,
|
|
&State.m_pwcsToken[State.m_ulStart],
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
|
|
if (pTerm->ulOp == TAIL_MATCH_TRUNCATE)
|
|
{
|
|
hr = m_apWordSink->PutWord(
|
|
State.m_ulEnd - State.m_ulStart - pTerm->ulLen,
|
|
&State.m_pwcsToken[State.m_ulStart],
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
else
|
|
{
|
|
Assert(pTerm->ulOp == HEAD_MATCH_TRUNCATE);
|
|
hr = m_apWordSink->PutWord(
|
|
State.m_ulEnd - State.m_ulStart - pTerm->ulLen,
|
|
&State.m_pwcsToken[State.m_ulStart + pTerm->ulLen],
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
hr = m_apWordSink->PutWord(
|
|
State.m_ulEnd - State.m_ulStart,
|
|
&State.m_pwcsToken[State.m_ulStart],
|
|
State.m_ulEnd - State.m_ulStart,
|
|
m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State));
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
|
|
|
|
void CTokenizer::OutputCurrency(
|
|
ULONG ulLen,
|
|
WCHAR* pwcsCurrency,
|
|
CTokenState& State,
|
|
const CCliticsTerm* pTerm)
|
|
{
|
|
HRESULT hr;
|
|
//
|
|
// Output: CC12.22$
|
|
//
|
|
|
|
ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
|
|
|
|
if (ulLen > m_ulMaxTokenSize)
|
|
{
|
|
hr = m_apWordSink->PutWord(
|
|
State.m_ulEnd - State.m_ulStart,
|
|
&State.m_pwcsToken[State.m_ulStart],
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
return;
|
|
}
|
|
|
|
hr = m_apWordSink->PutAltWord(
|
|
State.m_ulEnd - State.m_ulStart,
|
|
&State.m_pwcsToken[State.m_ulStart],
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
|
|
if (pTerm->ulOp == TAIL_MATCH_TRUNCATE)
|
|
{
|
|
hr = m_apWordSink->PutAltWord(
|
|
State.m_ulEnd - State.m_ulStart - pTerm->ulLen,
|
|
&State.m_pwcsToken[State.m_ulStart],
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
else if (pTerm->ulOp == HEAD_MATCH_TRUNCATE)
|
|
{
|
|
hr = m_apWordSink->PutAltWord(
|
|
State.m_ulEnd - State.m_ulStart - pTerm->ulLen,
|
|
&State.m_pwcsToken[State.m_ulStart + pTerm->ulLen],
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
|
|
hr = m_apWordSink->PutWord(
|
|
ulLen,
|
|
pwcsCurrency,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
|
|
}
|
|
|
|
void CTokenizer::OutputCommersialSignToken(
|
|
CTokenState& State)
|
|
{
|
|
HRESULT hr;
|
|
ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
|
|
hr = m_apWordSink->PutAltWord(
|
|
State.m_ulEnd - State.m_ulStart - 1,
|
|
State.m_pwcsToken + State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
hr = m_apWordSink->PutWord(
|
|
State.m_ulEnd - State.m_ulStart,
|
|
State.m_pwcsToken + State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
|
|
void CTokenizer::OutputMisc(
|
|
CTokenState& State,
|
|
bool bPatternContainOnlyUpperCase,
|
|
ULONG ulSuffixSize,
|
|
const CCliticsTerm* pCliticsTerm)
|
|
{
|
|
HRESULT hr;
|
|
ULONG ulOffsetInTxtSourceBuffer = m_pCurToken->CalculateStateOffsetInTxtSourceBuffer(State);
|
|
|
|
ULONG ulAddToStart = 0;
|
|
ULONG ulDecFromEnd = 0;
|
|
|
|
|
|
if (pCliticsTerm->ulOp == HEAD_MATCH_TRUNCATE)
|
|
{
|
|
hr = m_apWordSink->PutAltWord(
|
|
State.m_ulEnd - State.m_ulStart - pCliticsTerm->ulLen,
|
|
State.m_pwcsToken + State.m_ulStart + pCliticsTerm->ulLen,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
ulAddToStart = pCliticsTerm->ulLen;
|
|
}
|
|
else if (pCliticsTerm->ulOp == TAIL_MATCH_TRUNCATE)
|
|
{
|
|
hr = m_apWordSink->PutAltWord(
|
|
State.m_ulEnd - State.m_ulStart - pCliticsTerm->ulLen,
|
|
State.m_pwcsToken + State.m_ulStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
ulDecFromEnd = pCliticsTerm->ulLen;
|
|
}
|
|
|
|
if (!bPatternContainOnlyUpperCase)
|
|
{
|
|
hr = m_apWordSink->PutAltWord(
|
|
State.m_ulEnd - State.m_ulStart - ulAddToStart - ulDecFromEnd - ulSuffixSize,
|
|
State.m_pwcsToken + State.m_ulStart + ulAddToStart,
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
|
|
hr = m_apWordSink->PutWord(
|
|
State.m_ulEnd - State.m_ulStart,
|
|
&State.m_pwcsToken[State.m_ulStart],
|
|
State.m_ulEnd - State.m_ulStart,
|
|
ulOffsetInTxtSourceBuffer);
|
|
if (FAILED(hr))
|
|
{
|
|
THROW_HRESULT_EXCEPTION(hr);
|
|
}
|
|
|
|
}
|
|
|
|
#define NUMBER_NO_ERROR 0
|
|
#define NUMBER_SEPERATOR_ERROR 1
|
|
#define NUMBER_ERROR 2
|
|
|
|
bool CTokenizer::CheckAndCreateNumber(
|
|
WCHAR* pwcsStr,
|
|
ULONG ulLen,
|
|
WCHAR* pwcsOut,
|
|
ULONG* pulOffsetToTxt, // the actual output does not always start at the beginning of buffer
|
|
ULONG* pulOutLen)
|
|
{
|
|
|
|
int iRet;
|
|
|
|
iRet = CheckAndCreateNumber(
|
|
pwcsStr,
|
|
ulLen,
|
|
m_apLangSupport->GetDecimalSeperator(),
|
|
m_apLangSupport->GetThousandSeperator(),
|
|
pwcsOut,
|
|
pulOffsetToTxt,
|
|
pulOutLen);
|
|
if (NUMBER_NO_ERROR == iRet)
|
|
{
|
|
return true;
|
|
}
|
|
else if (NUMBER_ERROR == iRet)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
iRet = CheckAndCreateNumber(
|
|
pwcsStr,
|
|
ulLen,
|
|
L'.', // default value
|
|
0xFFFF, // no thousand sperator
|
|
pwcsOut,
|
|
pulOffsetToTxt,
|
|
pulOutLen);
|
|
if (NUMBER_NO_ERROR == iRet)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
//
|
|
// return value:
|
|
// NUMBER_NO_ERROR - success
|
|
// NUMBER_SEPERATOR_ERROR - error due to sperators
|
|
// NUMBER_ERROR - error since it's not a number.
|
|
//
|
|
|
|
int CTokenizer::CheckAndCreateNumber(
|
|
WCHAR* pwcsStr,
|
|
ULONG ulLen,
|
|
WCHAR wchSDecimal,
|
|
WCHAR wchSThousand,
|
|
WCHAR* pwcsOut,
|
|
ULONG* pulOffsetToTxt, // the actual output does not always start at the beginning of buffer
|
|
ULONG* pulOutLen)
|
|
{
|
|
Assert(ulLen > 0);
|
|
//
|
|
// assumes that the out buffer is big enough.
|
|
// looking for the following formats: 1111 1111.2222 1,111,111.222
|
|
//
|
|
|
|
ULONG ulCur = ulLen - 1;
|
|
ULONG ulNumCharsBeforDigitSeperator = 0;
|
|
ULONG ulNextChar = ulLen - 1 + 3; // +3 is for the NN at the begging of the formated token +
|
|
// additional 0 in the begining in case .50
|
|
|
|
bool fHasFraction = false;
|
|
|
|
while ((((int)(ulCur)) >= 0) &&
|
|
HAS_PROP_NUMBER(GET_PROP(pwcsStr[ulCur])))
|
|
{
|
|
pwcsOut[ulNextChar] = pwcsStr[ulCur];
|
|
ulCur--;
|
|
ulNextChar--;
|
|
ulNumCharsBeforDigitSeperator++;
|
|
}
|
|
|
|
if (ulCur == ulLen - 1)
|
|
{
|
|
//
|
|
// did not read any digits.
|
|
//
|
|
return NUMBER_ERROR;
|
|
}
|
|
|
|
if ((((int)ulCur) >= 0) && (pwcsStr[ulCur] == wchSDecimal))
|
|
{
|
|
fHasFraction = true;
|
|
pwcsOut[ulNextChar] = L'D';
|
|
ulCur--;
|
|
ulNextChar--;
|
|
ulNumCharsBeforDigitSeperator = 0;
|
|
}
|
|
|
|
ULONG ulNumOfThousandSeperator = 0;
|
|
while (((int)ulCur) >= 0)
|
|
{
|
|
if (pwcsStr[ulCur] == wchSThousand)
|
|
{
|
|
if (3 != ulNumCharsBeforDigitSeperator)
|
|
{
|
|
return NUMBER_SEPERATOR_ERROR;
|
|
}
|
|
ulNumCharsBeforDigitSeperator = 0;
|
|
ulNumOfThousandSeperator++;
|
|
}
|
|
else if(HAS_PROP_NUMBER(GET_PROP(pwcsStr[ulCur])))
|
|
{
|
|
pwcsOut[ulNextChar] = pwcsStr[ulCur];
|
|
ulNumCharsBeforDigitSeperator++;
|
|
ulNextChar--;
|
|
}
|
|
else
|
|
{
|
|
if (TEST_PROP(
|
|
GET_PROP(pwcsStr[ulCur]), PROP_DEFAULT_BREAKER))
|
|
{
|
|
return NUMBER_SEPERATOR_ERROR;
|
|
}
|
|
|
|
return NUMBER_ERROR;
|
|
}
|
|
|
|
ulCur--;
|
|
}
|
|
|
|
*pulOutLen = ulLen;
|
|
|
|
if (L'D' == pwcsOut[ulNextChar+1])
|
|
{
|
|
Assert(ulNextChar >= 2);
|
|
//
|
|
// the number has the following format .50
|
|
//
|
|
pwcsOut[ulNextChar] = L'0';
|
|
ulNextChar--;
|
|
*pulOutLen += 1;
|
|
}
|
|
|
|
Assert(ulNextChar >= 1);
|
|
pwcsOut[ulLen + 3] = L'\0';
|
|
pwcsOut[ulNextChar] = L'N';
|
|
pwcsOut[ulNextChar - 1] = L'N';
|
|
|
|
*pulOutLen = *pulOutLen + 2 - ulNumOfThousandSeperator; // don't use += because 2 - ulNextChar + 1
|
|
*pulOffsetToTxt = ulNextChar - 1;
|
|
// can be negative and since it is ULONG we
|
|
// can get the wrong result.
|
|
if (fHasFraction)
|
|
{
|
|
while (HAS_PROP_NUMBER(GET_PROP(pwcsOut[*pulOutLen + *pulOffsetToTxt - 1])) &&
|
|
(0 == ConvertCharToDigit(pwcsOut[*pulOutLen + *pulOffsetToTxt - 1])))
|
|
{
|
|
Assert(*pulOutLen > 3);
|
|
(*pulOutLen)--;
|
|
}
|
|
|
|
if (L'D' == pwcsOut[*pulOutLen + *pulOffsetToTxt - 1])
|
|
{
|
|
(*pulOutLen)--;
|
|
}
|
|
}
|
|
return NUMBER_NO_ERROR;
|
|
}
|
|
|
|
|
|
|
|
void CTokenizer::GetValuesFromDateString(
|
|
CDateTerm* pFormat,
|
|
WCHAR* pwcsDate,
|
|
LONG* plD_M1, // we can't tell in this stage whether this is a Day or a month.
|
|
LONG* plD_M2,
|
|
LONG* plYear)
|
|
{
|
|
BYTE i;
|
|
int iBase;
|
|
|
|
*plD_M1 = 0;
|
|
for ( i = pFormat->bD_M1Len, iBase = 1; i > 0; i--, iBase *= 10)
|
|
{
|
|
*plD_M1 += ConvertCharToDigit(pwcsDate[pFormat->bD_M1Offset + i - 1]) * iBase;
|
|
}
|
|
|
|
*plD_M2 = 0;
|
|
for ( i = pFormat->bD_M2Len, iBase = 1; i > 0; i--, iBase *= 10)
|
|
{
|
|
*plD_M2 += ConvertCharToDigit(pwcsDate[pFormat->bD_M2Offset + i - 1]) * iBase;
|
|
}
|
|
|
|
*plYear = 0;
|
|
for ( i = pFormat->bYearLen, iBase = 1; i > 0; i--, iBase *= 10)
|
|
{
|
|
*plYear += ConvertCharToDigit(pwcsDate[pFormat->bYearOffset + i - 1]) * iBase;
|
|
}
|
|
|
|
}
|
|
|
|
void CTokenizer::GetValuesFromTimeString(
|
|
CTimeTerm* pFormat,
|
|
WCHAR* pwcsTime,
|
|
LONG* plHour,
|
|
LONG* plMin,
|
|
LONG* plSec,
|
|
TimeFormat* pAmPm)
|
|
{
|
|
BYTE i;
|
|
int iBase;
|
|
|
|
*plHour = 0;
|
|
for ( i = pFormat->bHourLen, iBase = 1; i > 0; i--, iBase *= 10)
|
|
{
|
|
*plHour += ConvertCharToDigit(pwcsTime[pFormat->bHourOffset + i - 1]) * iBase;
|
|
}
|
|
|
|
*plMin = 0;
|
|
for ( i = pFormat->bMinLen, iBase = 1; i > 0; i--, iBase *= 10)
|
|
{
|
|
*plMin += ConvertCharToDigit(pwcsTime[pFormat->bMinOffset + i - 1]) * iBase;
|
|
}
|
|
|
|
*plSec = 0;
|
|
for ( i = pFormat->bSecLen, iBase = 1; i > 0; i--, iBase *= 10)
|
|
{
|
|
*plSec += ConvertCharToDigit(pwcsTime[pFormat->bSecOffset + i - 1]) * iBase;
|
|
}
|
|
|
|
*pAmPm = pFormat->AmPm;
|
|
|
|
}
|
|
|
|
void CTokenizer::BreakCompundString(CTokenState& State, CPropFlag& propBreaker)
|
|
{
|
|
//
|
|
// still there are puctutaitons inside the token
|
|
// we break them up and resubmit them.
|
|
//
|
|
ULONG ulStart = State.m_ulStart;
|
|
ULONG ulCur = ulStart;
|
|
|
|
while (ulCur < State.m_ulEnd)
|
|
{
|
|
if ( TEST_PROP1(GET_PROP(State.m_pwcsToken[ulCur]), propBreaker))
|
|
{
|
|
if (ulCur - ulStart == 0)
|
|
{
|
|
//
|
|
// only punctuation
|
|
//
|
|
ulCur++;
|
|
ulStart = ulCur;
|
|
continue;
|
|
}
|
|
|
|
m_pCurToken->m_State.m_ulStart = 0;
|
|
m_pCurToken->m_State.m_ulEnd = ulCur - ulStart;
|
|
m_pCurToken->m_State.m_pwcsToken = State.m_pwcsToken + ulStart;
|
|
m_pCurToken->ComputeStateProperties(m_pCurToken->m_State);
|
|
//
|
|
// we just created a sub token need to procces it
|
|
//
|
|
|
|
ProcessTokenInternal();
|
|
ulStart = ulCur + 1;
|
|
|
|
}
|
|
ulCur++;
|
|
}
|
|
|
|
if (ulStart < ulCur)
|
|
{
|
|
//
|
|
// last sub token
|
|
//
|
|
m_pCurToken->m_State.m_ulStart = 0;
|
|
m_pCurToken->m_State.m_ulEnd = ulCur - ulStart;
|
|
m_pCurToken->m_State.m_pwcsToken = State.m_pwcsToken + ulStart;
|
|
m_pCurToken->ComputeStateProperties(m_pCurToken->m_State);
|
|
//
|
|
// we just created a sub token need to procces it
|
|
//
|
|
|
|
ProcessTokenInternal();
|
|
}
|
|
|
|
return;
|
|
|
|
}
|