|
|
/*************************************************************************
*  @doc SHROOM EXTERNAL API                                              *
*                                                                        *
*  STDBRKR.CPP                                                           *
*                                                                        *
*  Copyright (C) Microsoft Corporation 1997                              *
*  All Rights reserved.                                                  *
*                                                                        *
*  This file contains the implementation of CITStdBreaker methods.       *
*  CITStdBreaker is a pluggable word breaker object that can optionally  *
*  use a character class table and stop word list during its breaking    *
*  operations.  Although all the word breaking interface methods         *
*  that accept text require it to be Unicode, CITStdBreaker still only   *
*  supports MBCS internally.                                             *
*                                                                        *
**************************************************************************
*                                                                        *
*  Written By   : Bill Aloof                                             *
*  Current Owner: billa                                                  *
*                                                                        *
**************************************************************************/
#include <mvopsys.h>
// Source file name of this module, compiled in for debug builds only.
#ifdef _DEBUG
static char s_aszModule[] = __FILE__;   /* For error report */
#endif
#ifdef IA64
#include <itdfguid.h>
#endif
#include <atlinc.h> // includes for ATL.
#include <_mvutil.h>
#include <mem.h>
#include <orkin.h>
#include <mvsearch.h>
#include "common.h"
#include <iterror.h>
#include <itwbrk.h>
#include <itwbrkid.h>
#include "stdbrkr.h"
// Word callback that BreakText installs in BRK_PARMS::lpfnOutWord; it is
// defined later in this file.  lpvUser is expected to point at a WRDFNPM.
HRESULT FAR PASCAL StdBreakerWordFunc(LST lstRawWord, LST lstNormWord, DWORD dwWordOffset, LPVOID lpvUser);
//---------------------------------------------------------------------------
// Constructor and Destructor
//---------------------------------------------------------------------------
// Puts every member into its "empty" state.  The ANSI scratch buffer and
// the stemmer pointer are set here because ClearMembers() does not touch
// them.
CITStdBreaker::CITStdBreaker()
{
    m_pistem = NULL;
    m_hmemAnsi = NULL;
    m_cbBufAnsiCur = 0;

    ClearMembers();
}
// All teardown (scratch buffer, stemmer, char table, stop list) lives in
// Close(), so the destructor just delegates to it.
CITStdBreaker::~CITStdBreaker()
{
    this->Close();
}
//---------------------------------------------------------------------------
// IWordBreaker Method Implementations
//---------------------------------------------------------------------------
/********************************************************************
 * @method    STDMETHODIMP | IWordBreaker | Init |
 *      Gives the breaker object a chance to initialize itself beyond
 *      what it did during IPersistStreamInit::InitNew or ::Load.
 *
 * @parm    BOOL | fQuery | TRUE means breaker context is query processing
 * @parm    ULONG | ulMaxTokenSize | Max term length requested by caller
 * @parm    BOOL* | pfLicense | Whether the breaker is subject to a license
 *
 * @rvalue  E_POINTER | pfLicense was NULL
 *
 * @comm
 *      If neither InitNew nor Load has been called yet, InitNew is called
 *      here so that Tripoli clients can use the breaker without any code
 *      changes.  When a stemmer is attached, ulMaxTokenSize and pfLicense
 *      are forwarded to its Init method; otherwise *pfLicense is set to
 *      FALSE.  The breaker itself does not truncate terms to
 *      ulMaxTokenSize: the breaker routines have a hard-coded maximum of
 *      CB_MAX_WORD_LEN, and the word sink is supposed to be prepared to
 *      truncate anyway.
 ********************************************************************/
STDMETHODIMP CITStdBreaker::Init(BOOL fQuery, ULONG ulMaxTokenSize,
                                 BOOL *pfLicense)
{
    if (pfLicense == NULL)
        return (SetErrReturn(E_POINTER));

    HRESULT hrResult = S_OK;

    // Lazily initialize if the client skipped InitNew/Load.
    if (!m_fInitialized)
        hrResult = InitNew();

    // Let the stemmer (if any) initialize itself and report its license.
    if (SUCCEEDED(hrResult) && m_pistem != NULL)
        hrResult = m_pistem->Init(ulMaxTokenSize, pfLicense);

    if (!SUCCEEDED(hrResult))
        return (hrResult);

    // Remember the context; in query context the char table is switched
    // to recognize wildcards.
    m_fQueryContext = fQuery;
    if (m_fQueryContext)
        MVCharTableSetWildcards(m_lpctab);

    // We set *pfLicense only if the stemmer didn't.
    if (m_pistem == NULL)
        *pfLicense = FALSE;

    return (hrResult);
}
/********************************************************************
 * @method    STDMETHODIMP | IWordBreaker | BreakText |
 *      Parses text to find individual tokens, then calls methods of
 *      IWordSink with the results.
 *
 * @parm    TEXT_SOURCE | *pTextSource | Source of the Unicode text.
 * @parm    IWordSink | *pWordSink | Pointer to the word sink.
 * @parm    IPhraseSink | *pPhraseSink | Pointer to the phrase sink.
 *          (Not supported at this time.)
 *
 * @rvalue  S_OK | The operation completed successfully.
 * @rvalue  E_POINTER | The text source is null.
 * @rvalue  E_INVALIDARG | The word sink is NULL.
 * @rvalue  E_NOTOPEN | The breaker has not been initialized.
 * @rvalue  E_OUTOFMEMORY | BreakerInitiate could not create its state.
 * @rvalue  E_UNEXPECTED | Unicode-to-MBCS conversion produced no bytes,
 *          or the configured break word type is not recognized.
 *
 * @comm
 *      Each buffer offered by pTextSource is converted to MBCS, broken
 *      with the routine selected by the current break word type, and
 *      then the breaker is flushed.  This repeats until
 *      pfnFillTextBuffer stops succeeding, at which point the result of
 *      the last break operation is returned.
 ********************************************************************/
STDMETHODIMP CITStdBreaker::BreakText(TEXT_SOURCE *pTextSource,
                        IWordSink *pWordSink, IPhraseSink *pPhraseSink)
{
    if (pTextSource == NULL)
        return (SetErrReturn(E_POINTER));

    // A NULL word sink is reported differently from a NULL text source:
    // without a word sink nothing meaningful can be done, because phrase
    // breaking is not supported.
    if (pWordSink == NULL)
        return (SetErrReturn(E_INVALIDARG));

    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    HRESULT hrResult = S_OK;

    m_cs.Lock();

    LPIBI lpibiBreak = BreakerInitiate();
    if (lpibiBreak == NULL)
    {
        m_cs.Unlock();
        return (E_OUTOFMEMORY);
    }

    // Parameters for the word callback wrapper (StdBreakerWordFunc).
    WRDFNPM wordSinkParms;
    MEMSET(&wordSinkParms, NULL, sizeof(WRDFNPM));
    wordSinkParms.piwrdsnk = pWordSink;
    wordSinkParms.dwCodePageID = m_brkctl.dwCodePageID;

    // Parameters handed to the FBreakX routines.
    BRK_PARMS breakParms;
    breakParms.lpInternalBreakInfo = lpibiBreak;
    breakParms.lcbBufOffset = 0;
    breakParms.lpvUser = (LPVOID) &wordSinkParms;
    breakParms.lpfnOutWord = StdBreakerWordFunc;
    breakParms.lpStopInfoBlock = m_lpsipb;
    breakParms.lpCharTab = m_lpctab;
    if ((m_brkctl.grfBreakFlags & IITWBC_BREAK_ACCEPT_WILDCARDS) != 0)
        breakParms.fFlags = ACCEPT_WILDCARD;
    else
        breakParms.fFlags = 0;

    // One iteration per buffer supplied by the text source.
    do
    {
        // Two bytes per wide character is enough even if every
        // character converts to DBCS.
        DWORD cwchChunk = (pTextSource->iEnd - pTextSource->iCur);
        DWORD cbAnsiNeeded = sizeof(WCHAR) * cwchChunk;

        hrResult = ReallocBuffer(&m_hmemAnsi, &m_cbBufAnsiCur, cbAnsiNeeded);
        if (SUCCEEDED(hrResult))
        {
            breakParms.lpbBuf = (LPBYTE) _GLOBALLOCK(m_hmemAnsi);
            breakParms.cbBufCount =
                WideCharToMultiByte(m_brkctl.dwCodePageID, NULL,
                    (LPCWSTR) &pTextSource->awcBuffer[pTextSource->iCur],
                    cwchChunk, (char *) breakParms.lpbBuf,
                    m_cbBufAnsiCur, NULL, NULL);

            if (breakParms.cbBufCount > 0)
            {
                // StdBreakerWordFunc needs the MBCS buffer to compute an
                // accurate word offset into the Unicode buffer.
                wordSinkParms.lpbBuf = breakParms.lpbBuf;

                // Pass 0 breaks the converted text; pass 1 (only if pass
                // 0 succeeded) flushes the breaker by calling the same
                // routine with an empty buffer.
                for (int iPass = 0; iPass < 2 && SUCCEEDED(hrResult); iPass++)
                {
                    if (iPass == 1)
                    {
                        breakParms.lpbBuf = NULL;
                        breakParms.cbBufCount = 0;
                    }

                    switch (m_brkctl.dwBreakWordType)
                    {
                        case IITWBC_BREAKTYPE_TEXT:
                            hrResult = FBreakWords(&breakParms);
                            break;

                        case IITWBC_BREAKTYPE_NUMBER:
                            hrResult = FBreakNumber(&breakParms);
                            break;

                        case IITWBC_BREAKTYPE_DATE:
                            hrResult = FBreakDate(&breakParms);
                            break;

                        case IITWBC_BREAKTYPE_TIME:
                            hrResult = FBreakTime(&breakParms);
                            break;

                        case IITWBC_BREAKTYPE_EPOCH:
                            hrResult = FBreakEpoch(&breakParms);
                            break;

                        default:
                            ITASSERT(FALSE);
                            hrResult = E_UNEXPECTED;
                            break;
                    }
                }
            }
            else
                hrResult = E_UNEXPECTED;

            _GLOBALUNLOCK(m_hmemAnsi);
        }

        // Advance cur to end just in case the caller cares about this
        // being the case when we ask for more characters.
        pTextSource->iCur = pTextSource->iEnd;

    } while (SUCCEEDED(hrResult) &&
             SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)));

    // Free any buffer that the word callback wrapper may have allocated.
    if (wordSinkParms.hmemUnicode != NULL)
        _GLOBALFREE(wordSinkParms.hmemUnicode);

    BreakerFree(lpibiBreak);

    m_cs.Unlock();

    return (hrResult);
}
/********************************************************************
 * @method    STDMETHODIMP | IWordBreaker | ComposePhrase |
 *      Converts a noun and modifier back into a linguistically correct
 *      source phrase.
 *
 * @parm    WCHAR const | *pwcNoun | Pointer to the word being modified.
 * @parm    ULONG | cwcNoun | The count of characters in pwcNoun.
 * @parm    WCHAR const | *pwcModifier | Points to the word modifying pwcNoun
 * @parm    ULONG | cwcModifier | Length of pwcModifier
 * @parm    ULONG | ulAttachmentType | A wordbreaker-specific value which a
 *          wordbreaker can use to store additional information about the
 *          method of composition.
 * @parm    WCHAR | *pwcPhrase | Pointer to a buffer in which to store the
 *          composed phrase
 * @parm    ULONG | *pcwcPhrase | [in] length in characters of the
 *          pwcPhrase buffer.  [out] the actual length of the composed
 *          phrase.
 *
 * @rvalue  E_NOTIMPL | Always; this breaker does not compose phrases.
 *
 * @comm
 *      Not implemented.  None of the parameters are examined.
 ********************************************************************/
STDMETHODIMP CITStdBreaker::ComposePhrase(WCHAR const *pwcNoun,
                        ULONG cwcNoun, WCHAR const *pwcModifier,
                        ULONG cwcModifier, ULONG ulAttachmentType,
                        WCHAR *pwcPhrase, ULONG *pcwcPhrase)
{
    return (E_NOTIMPL);
}
/********************************************************************
 * @method    STDMETHODIMP | IWordBreaker | GetLicenseToUse |
 *      Returns a pointer to the license information provided by the
 *      vendor of this specific implementation of the IWordBreaker
 *      interface.
 *
 * @parm    WCHAR const | **ppwcsLicense | Pointer to the license information.
 *
 * @rvalue  E_POINTER | ppwcsLicense is null.
 * @rvalue  E_NOTIMPL | No stemmer is attached to the breaker.
 *
 * @comm
 *      The request is forwarded to the attached stemmer; the breaker has
 *      no license information of its own.
 ********************************************************************/
STDMETHODIMP CITStdBreaker::GetLicenseToUse(WCHAR const **ppwcsLicense)
{
    if (ppwcsLicense == NULL)
        return (SetErrReturn(E_POINTER));

    if (m_pistem == NULL)
        return (E_NOTIMPL);

    return (m_pistem->GetLicenseToUse(ppwcsLicense));
}
//---------------------------------------------------------------------------
// IWordBreakerConfig Method Implementations
//---------------------------------------------------------------------------
/********************************************************************
 * @method    STDMETHODIMP | IWordBreakerConfig | SetLocaleInfo |
 *      Sets locale information for the word breaker.
 *
 * @parm    DWORD | dwCodePageID | ANSI code page no. specified at build time.
 * @parm    LCID | lcid | Win32 locale identifier specified at build time.
 *
 * @rvalue  E_NOTOPEN | The breaker has not been initialized.
 * @rvalue  S_OK | The values were stored.
 *
 * @comm
 *      The values are stored as given (no validation is done here) and
 *      the object is marked dirty.
 ********************************************************************/
STDMETHODIMP CITStdBreaker::SetLocaleInfo(DWORD dwCodePageID, LCID lcid)
{
    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    m_cs.Lock();

    m_brkctl.lcid = lcid;
    m_brkctl.dwCodePageID = dwCodePageID;
    m_fDirty = TRUE;

    m_cs.Unlock();

    return (S_OK);
}
/*****************************************************************
 * @method    STDMETHODIMP | IWordBreakerConfig | GetLocaleInfo |
 *      Retrieves locale information.
 *
 * @parm    DWORD | *pdwCodePageID | Receives the ANSI code page no.
 * @parm    LCID | *plcid | Receives the Win32 locale identifier.
 *
 * @rvalue  E_POINTER | Either the code page pointer or the locale
 *          identifier pointer is null.
 * @rvalue  E_NOTOPEN | The breaker has not been initialized.
 * @rvalue  S_OK | The operation completed successfully.
 ****************************************************************/
STDMETHODIMP CITStdBreaker::GetLocaleInfo(DWORD *pdwCodePageID, LCID *plcid)
{
    if (plcid == NULL || pdwCodePageID == NULL)
        return (SetErrReturn(E_POINTER));

    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    // Read both values under the lock so they come from the same
    // SetLocaleInfo call.
    m_cs.Lock();

    *pdwCodePageID = m_brkctl.dwCodePageID;
    *plcid = m_brkctl.lcid;

    m_cs.Unlock();

    return (S_OK);
}
/*****************************************************************
 * @method    STDMETHODIMP | IWordBreakerConfig | SetBreakWordType |
 *      Sets the type of words the breaker should expect
 *      to see in all subsequent calls to IWordBreaker::BreakText.
 *
 * @parm    DWORD | dwBreakWordType | Specifies the type for break words.
 *          Can be one of IITWBC_BREAKTYPE_TEXT, IITWBC_BREAKTYPE_NUMBER,
 *          IITWBC_BREAKTYPE_DATE, IITWBC_BREAKTYPE_TIME,
 *          IITWBC_BREAKTYPE_EPOCH.
 *
 * @rvalue  E_NOTOPEN | The breaker has not been initialized.
 * @rvalue  E_INVALIDARG | Invalid break word type.
 * @rvalue  S_OK | The operation completed successfully.
 *****************************************************************/
STDMETHODIMP CITStdBreaker::SetBreakWordType(DWORD dwBreakWordType)
{
    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    // Only the five break types that BreakText knows how to dispatch
    // are accepted.
    BOOL fKnownType = (dwBreakWordType == IITWBC_BREAKTYPE_TEXT ||
                       dwBreakWordType == IITWBC_BREAKTYPE_NUMBER ||
                       dwBreakWordType == IITWBC_BREAKTYPE_DATE ||
                       dwBreakWordType == IITWBC_BREAKTYPE_TIME ||
                       dwBreakWordType == IITWBC_BREAKTYPE_EPOCH);
    if (!fKnownType)
        return (SetErrReturn(E_INVALIDARG));

    m_cs.Lock();

    m_brkctl.dwBreakWordType = dwBreakWordType;
    m_fDirty = TRUE;

    m_cs.Unlock();

    return (S_OK);
}
/*****************************************************************
 * @method    STDMETHODIMP | IWordBreakerConfig | GetBreakWordType |
 *      Retrieves the type of words the breaker expects to see in
 *      calls to IWordBreaker::BreakText.
 *
 * @parm    DWORD | *pdwBreakWordType | Receives the type for break words.
 *          Can be one of IITWBC_BREAKTYPE_TEXT (0),
 *          IITWBC_BREAKTYPE_NUMBER (1), IITWBC_BREAKTYPE_DATE (2),
 *          IITWBC_BREAKTYPE_TIME (3), IITWBC_BREAKTYPE_EPOCH (4).
 *
 * @rvalue  E_POINTER | pdwBreakWordType is null.
 * @rvalue  E_NOTOPEN | The breaker has not been initialized.
 * @rvalue  S_OK | The operation completed successfully.
 *****************************************************************/
STDMETHODIMP CITStdBreaker::GetBreakWordType(DWORD *pdwBreakWordType)
{
    if (pdwBreakWordType == NULL)
    {
        return (SetErrReturn(E_POINTER));
    }

    if (!m_fInitialized)
    {
        return (SetErrReturn(E_NOTOPEN));
    }

    *pdwBreakWordType = m_brkctl.dwBreakWordType;
    return (S_OK);
}
/*****************************************************************
 * @method    STDMETHODIMP | IWordBreakerConfig | SetControlInfo |
 *      Sets information that controls certain aspects of word breaking.
 *
 * @parm    DWORD | grfBreakFlags | Break control flags.  The only flag
 *          accepted by this implementation is
 *          IITWBC_BREAK_ACCEPT_WILDCARDS (interpret wild card characters
 *          as such); any other bit is rejected.
 * @parm    DWORD | dwReserved | Reserved for future use (not examined).
 *
 * @rvalue  E_NOTOPEN | The breaker has not been initialized.
 * @rvalue  E_INVALIDARG | Invalid control flag.
 * @rvalue  S_OK | The operation completed successfully.
 *****************************************************************/
STDMETHODIMP CITStdBreaker::SetControlInfo(DWORD grfBreakFlags,
                                           DWORD dwReserved)
{
    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    // Reject the call if any bit outside the supported set is on.
    DWORD grfFlagsSupported = IITWBC_BREAK_ACCEPT_WILDCARDS;
    if ((grfBreakFlags & ~grfFlagsSupported) != 0)
        return (SetErrReturn(E_INVALIDARG));

    m_cs.Lock();

    m_brkctl.grfBreakFlags = grfBreakFlags;
    m_fDirty = TRUE;

    m_cs.Unlock();

    return (S_OK);
}
/*****************************************************************
 * @method    STDMETHODIMP | IWordBreakerConfig | GetControlInfo |
 *      Retrieves information about word breaker control flags.
 *
 * @parm    DWORD | *pgrfBreakFlags | Receives the breaker control flags.
 * @parm    DWORD | *pdwReserved | Reserved for future use (not examined
 *          and not written).
 *
 * @rvalue  E_POINTER | pgrfBreakFlags is null.
 * @rvalue  E_NOTOPEN | The breaker has not been initialized.
 * @rvalue  S_OK | The operation completed successfully.
 *****************************************************************/
STDMETHODIMP CITStdBreaker::GetControlInfo(DWORD *pgrfBreakFlags,
                                           DWORD *pdwReserved)
{
    if (pgrfBreakFlags == NULL)
    {
        return (SetErrReturn(E_POINTER));
    }

    if (!m_fInitialized)
    {
        return (SetErrReturn(E_NOTOPEN));
    }

    *pgrfBreakFlags = m_brkctl.grfBreakFlags;
    return (S_OK);
}
/*****************************************************************
 * @method    STDMETHODIMP | IWordBreakerConfig | LoadExternalBreakerData |
 *      Loads word breaker data from an external source, such as a table
 *      containing char-by-char break information or a list of stop words.
 *
 * @parm    IStream | *pStream | Pointer to external source of data.
 * @parm    DWORD | dwExtDataType | Specifies the type of data in the stream.
 *
 * @rvalue  E_POINTER | pStream is null.
 * @rvalue  E_NOTOPEN | The breaker has not been initialized.
 * @rvalue  E_INVALIDARG | dwExtDataType is not one of the types below.
 * @rvalue  S_OK | The operation completed successfully.
 *
 * @comm
 *      Although the format of the data in the stream is entirely
 *      implementation-specific, this interface does define a couple
 *      of general types for that data which can be passed in
 *      dwExtDataType:
 *          IITWBC_EXTDATA_CHARTABLE
 *          IITWBC_EXTDATA_STOPWORDLIST
 *
 *      On success the newly loaded table/list replaces the current one
 *      and is flagged to be persisted.  On failure the current table and
 *      list are left untouched.
 *****************************************************************/
STDMETHODIMP CITStdBreaker::LoadExternalBreakerData(IStream *pStream,
                                                    DWORD dwExtDataType)
{
    HRESULT hr;
    HFPB    hfpb;
    LPCTAB  lpctab;
    LPSIPB  lpsipb;

    if (pStream == NULL)
        return (SetErrReturn(E_POINTER));

    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    m_cs.Lock();

    // NOTE(review): if FpbFromHf returns NULL, hr is whatever it stored
    // through &hr.
    if ((hfpb = FpbFromHf((HF) pStream, &hr)) != NULL)
    {
        switch (dwExtDataType)
        {
            case IITWBC_EXTDATA_CHARTABLE:
                // Load the external character table.
                lpctab = MVCharTableLoad(hfpb, NULL, &hr);
                if (SUCCEEDED(hr))
                {
                    ITASSERT(lpctab != NULL);
                    m_fDirty = TRUE;
                    m_grfPersistedItems |= ITSTDBRK_PERSISTED_CHARTABLE;

                    if (m_fQueryContext)
                        MVCharTableSetWildcards(lpctab);

                    // Dispose of any pre-existing char table.
                    MVCharTableDispose(m_lpctab);
                    m_lpctab = lpctab;
                }
                break;

            case IITWBC_EXTDATA_STOPWORDLIST:
                // We should at least have an internal default char table.
                ITASSERT(m_lpctab != NULL);

                // Init the in-memory stop word list and load the external
                // list.
                if ((lpsipb = MVStopListInitiate(ITSTDBRK_STOPHASH_SIZE,
                                                            &hr)) != NULL)
                {
                    if (SUCCEEDED(hr = MVStopListLoad(hfpb, lpsipb, NULL,
                                                FBreakWords, m_lpctab)))
                    {
                        m_fDirty = TRUE;
                        m_grfPersistedItems |=
                                        ITSTDBRK_PERSISTED_STOPWORDLIST;

                        // Dispose of any pre-existing stop word list.
                        MVStopListDispose(m_lpsipb);
                        m_lpsipb = lpsipb;
                    }
                    else
                    {
                        // The load failed: free the list we just created
                        // instead of leaking it.  The current list stays
                        // in effect.
                        MVStopListDispose(lpsipb);
                    }
                }
                break;

            default:
                hr = E_INVALIDARG;
                break;
        };

        FreeHfpb(hfpb);
    }

    m_cs.Unlock();

    return (hr);
}
/*****************************************************************
 * @method    STDMETHODIMP | IWordBreakerConfig | SetWordStemmer |
 *      Allows you to associate a stemmer with the word breaker.
 *
 * @parm    REFCLSID | rclsid | Class identifier for the stemmer.
 * @parm    IStemmer | *pStemmer | Pointer to the stemmer, or NULL to
 *          detach the current stemmer.
 *
 * @rvalue  E_NOTOPEN | The breaker has not been initialized.
 * @rvalue  S_OK | The operation completed successfully.
 *
 * @comm
 *      The breaker takes responsibility for calling
 *      IPersistStreamInit::Load/Save when it is loaded/saved if the
 *      stemmer supports that interface.
 *****************************************************************/
STDMETHODIMP CITStdBreaker::SetWordStemmer(REFCLSID rclsid,
                                           IStemmer *pStemmer)
{
    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    m_cs.Lock();

    IStemmer *pistemOld = m_pistem;

    // Take our reference on the new stemmer before dropping the old one,
    // so that passing the stemmer that is already attached can never
    // release it out from under us.
    if (pStemmer != NULL)
    {
        pStemmer->AddRef();
        ITASSERT(rclsid != GUID_NULL);
        m_clsidStemmer = rclsid;
    }
    m_pistem = pStemmer;

    if (pistemOld != NULL)
        pistemOld->Release();

    // Attaching a stemmer, or detaching one that was attached, changes
    // what Save writes (see the persisted flag below), so either case
    // dirties the object.
    if (m_pistem != NULL || pistemOld != NULL)
        m_fDirty = TRUE;

    SetGrfFlag(&m_grfPersistedItems, ITSTDBRK_PERSISTED_STEMMER,
                                                    m_pistem != NULL);

    m_cs.Unlock();

    return (S_OK);
}
/*****************************************************************
 * @method    STDMETHODIMP | IWordBreakerConfig | GetWordStemmer |
 *      Retrieves the stemmer associated with the word breaker, if any.
 *
 * @parm    IStemmer | **ppStemmer | Receives the stemmer pointer, with a
 *          reference added for the caller, or NULL if no stemmer is
 *          associated.
 *
 * @rvalue  E_POINTER | ppStemmer is NULL.
 * @rvalue  E_NOTOPEN | The breaker has not been initialized.
 * @rvalue  S_OK | A stemmer is associated and was returned.
 * @rvalue  S_FALSE | No stemmer is associated; *ppStemmer is NULL.
 *
 * @comm
 *      The breaker takes responsibility for calling
 *      IPersistStreamInit::Load/Save when it is loaded/saved if the
 *      stemmer supports that interface.
 *****************************************************************/
STDMETHODIMP CITStdBreaker::GetWordStemmer(IStemmer **ppStemmer)
{
    if (ppStemmer == NULL)
        return (SetErrReturn(E_POINTER));

    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    // SetWordStemmer replaces and releases m_pistem while holding m_cs.
    // Take the same lock so the pointer we AddRef and hand out cannot be
    // released between the read and the AddRef.
    m_cs.Lock();

    IStemmer *pistem = m_pistem;
    if (pistem != NULL)
        pistem->AddRef();

    m_cs.Unlock();

    *ppStemmer = pistem;

    return (pistem != NULL ? S_OK : S_FALSE);
}
//---------------------------------------------------------------------------
// IITStopWordList Method Implementations
//---------------------------------------------------------------------------
/*****************************************************************
 * @method    STDMETHODIMP | IITStopWordList | AddWord |
 *      Adds a word to the stop word list.
 *
 * @parm    WCHAR const | *pwcInBuf | Pointer to the input buffer.
 * @parm    ULONG | cwc | Length of word (count of wide characters).
 *
 * @rvalue  S_OK | The operation completed successfully.
 *
 * @comm
 *      Thin wrapper: the work and all error codes come from
 *      StopListOp called with fAddWord = TRUE.
 *****************************************************************/
STDMETHODIMP CITStdBreaker::AddWord(WCHAR const *pwcInBuf, ULONG cwc)
{
    HRESULT hrAdd = StopListOp(pwcInBuf, cwc, TRUE);
    return (hrAdd);
}
/*****************************************************************
 * @method    STDMETHODIMP | IITStopWordList | LookupWord |
 *      Looks up a word in the stop word list.
 *
 * @parm    WCHAR const | *pwcInBuf | Pointer to the input buffer.
 * @parm    ULONG | cwc | Length of word (count of wide characters).
 *
 * @rvalue  S_OK | The operation completed successfully.
 *
 * @comm
 *      Thin wrapper: the work and all result codes come from
 *      StopListOp called with fAddWord = FALSE.
 *****************************************************************/
STDMETHODIMP CITStdBreaker::LookupWord(WCHAR const *pwcInBuf, ULONG cwc)
{
    HRESULT hrLookup = StopListOp(pwcInBuf, cwc, FALSE);
    return (hrLookup);
}
//---------------------------------------------------------------------------
// IPersistStreamInit Method Implementations
//---------------------------------------------------------------------------
// IPersist::GetClassID - reports CLSID_ITStdBreaker as this object's
// class.  Returns E_POINTER if pclsid is NULL, S_OK otherwise.
STDMETHODIMP CITStdBreaker::GetClassID(CLSID *pclsid)
{
    if (pclsid != NULL)
    {
        *pclsid = CLSID_ITStdBreaker;
        return (S_OK);
    }

    return (SetErrReturn(E_POINTER));
}
// IPersistStreamInit::IsDirty - returns S_OK when the object has changed
// since it was last saved (m_fDirty set), S_FALSE when it has not, and
// E_NOTOPEN if the breaker was never initialized.
STDMETHODIMP CITStdBreaker::IsDirty(void)
{
    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    if (m_fDirty)
        return (S_OK);

    return (S_FALSE);
}
// IPersistStreamInit::Load - restores the breaker from pStream.
//
// Stream layout as read here: DWORD version (must equal VERSION_STDBRKR),
// DWORD persisted-item flags, then - only if the flags are non-zero - the
// BRKCTL struct, the optional char table, the optional stop word list and
// the optional stemmer (CLSID followed by the stemmer's own data if it
// implements IPersistStreamInit).
//
// Returns E_POINTER for a NULL stream, E_ALREADYOPEN if InitNew or Load
// already ran, E_BADFORMAT on a short read, E_BADVERSION on a version
// mismatch, E_UNEXPECTED if flags are set but BRKCTL is not among them,
// or whatever error the helper that failed reported.  On any failure
// Close() discards whatever was loaded.
STDMETHODIMP CITStdBreaker::Load(IStream *pStream)
{
    HRESULT hr;
    DWORD   dwVersion;
    DWORD   grfPersistedItems;
    DWORD   cbRead;

    if (pStream == NULL)
        return (SetErrReturn(E_POINTER));

    // Lock before checking m_fInitialized to make sure we don't compete
    // with a call to ::InitNew.
    m_cs.Lock();

    if (m_fInitialized)
    {
        // Release the lock before bailing out; returning with it held
        // would leave m_cs locked by this thread indefinitely.
        m_cs.Unlock();
        return (SetErrReturn(E_ALREADYOPEN));
    }

    if (SUCCEEDED(hr = pStream->Read((LPVOID) &dwVersion, sizeof(DWORD),
                                                            &cbRead)) &&
        SUCCEEDED(hr = ((cbRead == sizeof(DWORD)) ? S_OK : E_BADFORMAT)) &&
        SUCCEEDED(hr = ((dwVersion == VERSION_STDBRKR) ? S_OK :
                                                        E_BADVERSION)) &&
        SUCCEEDED(hr = pStream->Read((LPVOID) &grfPersistedItems,
                                            sizeof(DWORD), &cbRead)) &&
        SUCCEEDED(hr = ((cbRead == sizeof(DWORD)) ? S_OK : E_BADFORMAT)))
    {
        if (grfPersistedItems != 0)
        {
            HFPB hfpb = NULL;

            if ((grfPersistedItems & ITSTDBRK_PERSISTED_BRKCTL) != 0)
            {
                if (SUCCEEDED(hr = pStream->Read((LPVOID) &m_brkctl,
                                            sizeof(BRKCTL), &cbRead)))
                    hr = ((cbRead == sizeof(BRKCTL)) ? S_OK : E_BADFORMAT);
            }
            else
            {
                // We have an inconsistent persistent state.  The only way
                // we should have no BRKCTL is if we have no persistent
                // state at all (except for version number and persistent
                // flags which we've already loaded).
                ITASSERT(FALSE);
                hr = E_UNEXPECTED;
            }

            if (SUCCEEDED(hr) &&
                (hfpb = FpbFromHf((HF) pStream, &hr)) != NULL)
            {
                // Load the character table if one is there; otherwise just
                // use the internal default table.
                if ((grfPersistedItems & ITSTDBRK_PERSISTED_CHARTABLE) != 0)
                    m_lpctab = MVCharTableIndexLoad(hfpb, NULL, &hr);
                else
                    m_lpctab = MVCharTableGetDefault(&hr);
            }

            if (SUCCEEDED(hr) &&
                (grfPersistedItems & ITSTDBRK_PERSISTED_STOPWORDLIST) != 0)
            {
                // Load the stop word list.
                if ((m_lpsipb = MVStopListInitiate(ITSTDBRK_STOPHASH_SIZE,
                                                            &hr)) != NULL)
                    hr = MVStopListIndexLoad(hfpb, m_lpsipb, NULL);
            }

            if (hfpb != NULL)
                FreeHfpb(hfpb);

            if (SUCCEEDED(hr) &&
                (grfPersistedItems & ITSTDBRK_PERSISTED_STEMMER) != 0)
            {
                IPersistStreamInit *pipstmi;

                ITASSERT(m_pistem == NULL);

                // Instantiate the stemmer, and load it if it implements
                // IPersistStreamInit.  A stemmer without that interface
                // is not an error: hr keeps the CoCreateInstance result.
                if (SUCCEEDED(hr = ReadClassStm(pStream, &m_clsidStemmer)) &&
                    SUCCEEDED(hr = CoCreateInstance(m_clsidStemmer, NULL,
                                    CLSCTX_INPROC_SERVER, IID_IStemmer,
                                    (LPVOID *)&m_pistem)) &&
                    SUCCEEDED(m_pistem->QueryInterface(
                                    IID_IPersistStreamInit,
                                    (LPVOID *)&pipstmi)))
                {
                    hr = pipstmi->Load(pStream);
                    pipstmi->Release();
                }
            }
        }
        else
        {
            // If there were no persisted items (we released one beta
            // version without pluggable breakers where we had dummy
            // instance data where this was true) then we should just
            // behave like we're being created anew.
            hr = InitNew();
        }
    }

    if (SUCCEEDED(hr))
    {
        // We don't want to assign an incorrect grfPersistedItems if
        // we ended up calling InitNew.
        if (!m_fInitialized)
        {
            m_grfPersistedItems = grfPersistedItems;
            m_fInitialized = TRUE;
        }
    }
    else
    {
        // Free any persisted items which may have been loaded
        // successfully.
        Close();
    }

    m_cs.Unlock();

    return (hr);
}
// IPersistStreamInit::Save - writes the breaker to pStream.
//
// Written in order: DWORD VERSION_STDBRKR, DWORD m_grfPersistedItems,
// then - each only if its bit is set in m_grfPersistedItems - the BRKCTL
// struct, the char table, the stop word list, and the stemmer (its CLSID,
// followed by the stemmer's own data if it implements
// IPersistStreamInit).
//
// Returns E_POINTER for a NULL stream, E_NOTOPEN if the breaker is not
// initialized, E_UNEXPECTED if a flagged char table or stop list is
// missing, or the error of whichever write failed.  m_fDirty is cleared
// only when everything succeeded and fClearDirty is TRUE.
STDMETHODIMP CITStdBreaker::Save(IStream *pStream, BOOL fClearDirty)
{
    if (pStream == NULL)
        return (SetErrReturn(E_POINTER));

    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    HRESULT hrSave;
    DWORD   cbWritten;
    DWORD   dwVersion = VERSION_STDBRKR;

    m_cs.Lock();

    // Header: version, then the flags describing what follows.
    hrSave = pStream->Write((LPVOID) &dwVersion, sizeof(DWORD), &cbWritten);
    if (SUCCEEDED(hrSave))
        hrSave = pStream->Write((LPVOID) &m_grfPersistedItems,
                                            sizeof(DWORD), &cbWritten);

    if (SUCCEEDED(hrSave))
    {
        HFPB hfpbStream = NULL;

        if ((m_grfPersistedItems & ITSTDBRK_PERSISTED_BRKCTL) == 0)
        {
            // We should always be writing the BRKCTL structure.  Saving
            // continues without it, but note that Load in this file
            // returns E_UNEXPECTED for a stream whose flags are non-zero
            // and lack the BRKCTL bit.
            ITASSERT(FALSE);
        }
        else
        {
            hrSave = pStream->Write((LPVOID) &m_brkctl, sizeof(BRKCTL),
                                                            &cbWritten);
        }

        if (SUCCEEDED(hrSave))
        {
            hfpbStream = FpbFromHf((HF) pStream, &hrSave);

            if (hfpbStream != NULL &&
                (m_grfPersistedItems & ITSTDBRK_PERSISTED_CHARTABLE) != 0)
            {
                // Save char table.
                if (m_lpctab == NULL)
                {
                    ITASSERT(FALSE);
                    hrSave = E_UNEXPECTED;
                }
                else
                    hrSave = MVCharTableFileBuild(hfpbStream, m_lpctab, NULL);
            }
        }

        if (SUCCEEDED(hrSave) &&
            (m_grfPersistedItems & ITSTDBRK_PERSISTED_STOPWORDLIST) != 0)
        {
            // Save stop word list.
            if (m_lpsipb == NULL)
            {
                ITASSERT(FALSE);
                hrSave = E_UNEXPECTED;
            }
            else
                hrSave = MVStopFileBuild(hfpbStream, m_lpsipb, NULL);
        }

        if (hfpbStream != NULL)
            FreeHfpb(hfpbStream);

        if (SUCCEEDED(hrSave) &&
            (m_grfPersistedItems & ITSTDBRK_PERSISTED_STEMMER) != 0)
        {
            IPersistStreamInit *pipstmiStemmer;

            ITASSERT(m_pistem != NULL);

            // Write the stemmer's CLSID and save the stemmer if it
            // implements IPersistStreamInit.
            hrSave = WriteClassStm(pStream, m_clsidStemmer);
            if (SUCCEEDED(hrSave) &&
                SUCCEEDED(m_pistem->QueryInterface(IID_IPersistStreamInit,
                                        (LPVOID *) &pipstmiStemmer)))
            {
                hrSave = pipstmiStemmer->Save(pStream, fClearDirty);
                pipstmiStemmer->Release();
            }
        }
    }

    if (fClearDirty && SUCCEEDED(hrSave))
        m_fDirty = FALSE;

    m_cs.Unlock();

    return (hrSave);
}
// IPersistStreamInit::GetSizeMax - not supported; pcbSizeMax is never
// written and E_NOTIMPL is always returned.
STDMETHODIMP CITStdBreaker::GetSizeMax(ULARGE_INTEGER *pcbSizeMax)
{
    return (E_NOTIMPL);
}
// IPersistStreamInit::InitNew - puts the breaker into its default state:
// default BRKCTL values (see InitBrkCtl), the internal default char table
// and an empty stop word list.  On success the object is marked
// initialized and dirty.
//
// Returns E_ALREADYOPEN if InitNew or Load already ran; otherwise the
// result reported by MVCharTableGetDefault / MVStopListInitiate.  On
// failure Close() undoes any partial setup.
STDMETHODIMP CITStdBreaker::InitNew(void)
{
    HRESULT hr = S_OK;

    // Lock before checking m_fInitialized to make sure we don't compete
    // with a call to ::Load.
    m_cs.Lock();

    if (m_fInitialized)
    {
        // Release the lock before bailing out; returning with it held
        // would leave m_cs locked by this thread indefinitely.
        m_cs.Unlock();
        return (SetErrReturn(E_ALREADYOPEN));
    }

    InitBrkCtl();
    m_grfPersistedItems |= ITSTDBRK_PERSISTED_BRKCTL;

    // Get the default char table in case we're never asked to load an
    // external one.  If we do load an external one, we'll properly
    // discard this one.  We don't set the persisted flag for the
    // char table because we don't need to persist the internal default.
    m_lpctab = MVCharTableGetDefault(&hr);

    // Initialize the stop word list so that stop words can be added
    // programmatically if a client desires.
    if (SUCCEEDED(hr))
        m_lpsipb = MVStopListInitiate(ITSTDBRK_STOPHASH_SIZE, &hr);

    if (SUCCEEDED(hr))
        m_fInitialized = m_fDirty = TRUE;
    else
        Close();

    m_cs.Unlock();

    return (hr);
}
//---------------------------------------------------------------------------
// Private Method Implementations
//---------------------------------------------------------------------------
// Shared implementation of IITStopWordList::AddWord and ::LookupWord.
//
// Converts the Unicode word to MBCS using the configured code page,
// prefixes it with its byte length stored in a WORD, and passes the
// result to MVStopListAddWord (fAddWord TRUE) or MVStopListLookup
// (fAddWord FALSE).
//
// Returns E_POINTER for a NULL buffer, E_NOTOPEN if the breaker is not
// initialized, E_NOTINIT if there is no stop word list, E_UNEXPECTED if
// the conversion yields no bytes or more bytes than a WORD length prefix
// can describe; otherwise the result of the stop list call.
HRESULT CITStdBreaker::StopListOp(WCHAR const *pwcInBuf, ULONG cwc,
                                  BOOL fAddWord)
{
    HRESULT hr;
    DWORD   cbAnsi;

    // Report through SetErrReturn like every other argument check in
    // this class.
    if (pwcInBuf == NULL)
        return (SetErrReturn(E_POINTER));

    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    if (m_lpsipb == NULL)
        return (SetErrReturn(E_NOTINIT));

    m_cs.Lock();

    // Room for the worst case of two bytes per wide char, plus the
    // leading WORD that holds the length.
    cbAnsi = (sizeof(WCHAR) * cwc) + sizeof(WORD);

    if (SUCCEEDED(hr = ReallocBuffer(&m_hmemAnsi, &m_cbBufAnsiCur, cbAnsi)))
    {
        char *lpchBuf;
        int   cbWord;

        lpchBuf = (char *) _GLOBALLOCK(m_hmemAnsi);

        cbWord = WideCharToMultiByte(m_brkctl.dwCodePageID, NULL,
                        pwcInBuf, cwc, lpchBuf + sizeof(WORD),
                        cbAnsi - sizeof(WORD), NULL, NULL);

        // The length prefix is a WORD, so a longer result cannot be
        // represented; fail instead of storing a truncated length.
        if (cbWord > 0 && cbWord <= 0xFFFF)
        {
            *((WORD *) lpchBuf) = (WORD) cbWord;

            if (fAddWord)
                hr = MVStopListAddWord(m_lpsipb, (LPBYTE) lpchBuf);
            else
                hr = MVStopListLookup(m_lpsipb, (LPBYTE) lpchBuf);
        }
        else
            hr = E_UNEXPECTED;

        _GLOBALUNLOCK(m_hmemAnsi);
    }

    m_cs.Unlock();

    return (hr);
}
// Resizes *phmemBuf through ReallocBufferHmem while holding m_cs.  The
// size requested from ReallocBufferHmem is never smaller than
// cbAnsiBufInit.  Returns whatever ReallocBufferHmem returns.
// NOTE(review): callers in this file already hold m_cs when they call
// this, so m_cs is presumably re-entrant - confirm.
HRESULT CITStdBreaker::ReallocBuffer(HGLOBAL *phmemBuf, DWORD *pcbBufCur,
                                     DWORD cbBufNew)
{
    m_cs.Lock();

    HRESULT hrRealloc = ReallocBufferHmem(phmemBuf, pcbBufCur,
                                    max(cbBufNew, cbAnsiBufInit));

    m_cs.Unlock();

    return (hrRealloc);
}
// Resets the breaker's configuration and state members to their "never
// initialized" values.  Nothing is freed here, and the ANSI scratch
// buffer members and m_pistem are deliberately not touched.
void CITStdBreaker::ClearMembers(void)
{
    m_lpsipb = NULL;
    m_lpctab = NULL;
    m_clsidStemmer = GUID_NULL;
    m_grfPersistedItems = 0;

    m_fQueryContext = FALSE;
    m_fDirty = FALSE;
    m_fInitialized = FALSE;

    MEMSET(&m_brkctl, NULL, sizeof(BRKCTL));
}
// Fills m_brkctl with defaults: plain-text breaking, no control flags,
// the code page from GetACP() and the locale from GetUserDefaultLCID().
void CITStdBreaker::InitBrkCtl(void)
{
    m_brkctl.grfBreakFlags = 0;
    m_brkctl.dwBreakWordType = IITWBC_BREAKTYPE_TEXT;

    m_brkctl.dwCodePageID = GetACP();
    m_brkctl.lcid = GetUserDefaultLCID();
}
// Releases everything the breaker holds - the ANSI scratch buffer, the
// stemmer reference, the char table and the stop word list - and then
// resets the remaining members through ClearMembers().  Runs under m_cs.
void CITStdBreaker::Close(void)
{
    m_cs.Lock();

    // ANSI conversion scratch buffer.
    if (m_hmemAnsi != NULL)
    {
        _GLOBALFREE(m_hmemAnsi);
        m_cbBufAnsiCur = 0;
        m_hmemAnsi = NULL;
    }

    // Attached stemmer.
    if (m_pistem != NULL)
    {
        m_pistem->Release();
        m_pistem = NULL;
    }

    // Char table and stop word list (either may be NULL here; the
    // pointers themselves are cleared by ClearMembers below).
    MVCharTableDispose(m_lpctab);
    MVStopListDispose(m_lpsipb);

    ClearMembers();

    m_cs.Unlock();
}
//---------------------------------------------------------------------------
// Utility Functions
//---------------------------------------------------------------------------
// (6/19/97): BillA, JohnRush, and MikkyA all agreed that we would stop storing
// offset and length information in the index because the new HTML-based
// display engines don't allow our clients to find words using that information
// anyway.
//
// However, the above decision doesn't eliminate the need to accurately
// correlate offsets into the MBCS text buffer with offsets into the original
// Unicode buffer. This is needed by the query parsing code at runtime.
// The method for achieving offset correlation is simple: call
// MultiByteToWideChar on the MBCS text buffer up to dwWordOffset to get
// back the equivalent Unicode offset which we will pass to the word sink.
//
// NOTE: The above method will work as long as the breaker code is using
// the same lead byte table as the system conversion function. For now,
// our clients will be responsible for making sure the character table
// is consistent with the system's lead byte table. In the future, we
// probably should make the breaker explicitly set the lead bytes in the
// character table using the system's lead byte table.
//
// In the case of single byte characters, the offset and length information
// automatically correlates between MBCS and Unicode because it is essentially
// stated in characters, not bytes.
//
// Word callback installed by CITStdBreaker::BreakText.
//
// lstRawWord / lstNormWord each start with a WORD byte count followed by
// that many MBCS bytes; dwWordOffset is the word's byte offset into the
// MBCS buffer; lpvUser points at the WRDFNPM set up by BreakText.
//
// The normalized word is converted to Unicode and sent to the sink with
// PutAltWord, then the raw word is converted and sent with PutWord.
// Both calls pass the raw word's byte count as the source length and the
// Unicode offset computed from dwWordOffset.
//
// Returns E_POINTER if any pointer argument is NULL, E_UNEXPECTED if a
// conversion yields no characters or the sink pointer is NULL, the error
// from ReallocBufferHmem if the buffer cannot be grown, or otherwise the
// result of the last sink call made.
HRESULT FAR PASCAL StdBreakerWordFunc(LST lstRawWord, LST lstNormWord,
                                      DWORD dwWordOffset, LPVOID lpvUser)
{
    if (lstRawWord == NULL || lstNormWord == NULL || lpvUser == NULL)
        return (E_POINTER);

    WRDFNPM *pParms = (WRDFNPM *) lpvUser;
    HRESULT  hrResult;
    WCHAR   *lpwchWord;
    DWORD    cwchConverted;
    DWORD    iwchOffset = dwWordOffset;

    // Byte counts taken from the length prefixes.
    DWORD cbNorm = (DWORD)(*((WORD *)lstNormWord));
    DWORD cbRaw = (DWORD)(*((WORD *)lstRawWord));

    // ---- Normalized word ----
    // The Unicode buffer gets as many characters as there are bytes in
    // the ANSI string, since we don't know how many DBCS chars, if any,
    // it contains.
    hrResult = ReallocBufferHmem(&pParms->hmemUnicode,
                    &pParms->cbBufUnicodeCur, sizeof(WCHAR) * cbNorm);
    if (!SUCCEEDED(hrResult))
        return (hrResult);

    lpwchWord = (WCHAR *) _GLOBALLOCK(pParms->hmemUnicode);

    // Compute the Unicode offset that corresponds to the MBCS-based
    // dwWordOffset.  lpwchWord is passed only as a valid placeholder
    // (in case non-NULL is required); with a size of 0 nothing gets
    // written to it.
    iwchOffset = MultiByteToWideChar(pParms->dwCodePageID, NULL,
                    (LPCSTR) pParms->lpbBuf, dwWordOffset, lpwchWord, 0);

    // Convert the normalized word to Unicode.
    cwchConverted = MultiByteToWideChar(pParms->dwCodePageID, NULL,
                    (LPCSTR) &lstNormWord[sizeof(WORD)], cbNorm,
                    lpwchWord, cbNorm);

    if (cwchConverted > 0 && pParms->piwrdsnk != NULL)
    {
        // Send the normalized word to the word sink.
        hrResult = pParms->piwrdsnk->PutAltWord(lpwchWord, cwchConverted,
                                                cbRaw, iwchOffset);
    }
    else
        hrResult = E_UNEXPECTED;

    _GLOBALUNLOCK(pParms->hmemUnicode);

    if (!SUCCEEDED(hrResult))
        return (hrResult);

    // ---- Raw word ----
    hrResult = ReallocBufferHmem(&pParms->hmemUnicode,
                    &pParms->cbBufUnicodeCur, sizeof(WCHAR) * cbRaw);
    if (!SUCCEEDED(hrResult))
        return (hrResult);

    lpwchWord = (WCHAR *) _GLOBALLOCK(pParms->hmemUnicode);

    // Convert the raw word to Unicode.
    cwchConverted = MultiByteToWideChar(pParms->dwCodePageID, NULL,
                    (LPCSTR) &lstRawWord[sizeof(WORD)], cbRaw,
                    lpwchWord, cbRaw);

    if (cwchConverted > 0 && pParms->piwrdsnk != NULL)
    {
        // Send the raw word to the word sink.
        hrResult = pParms->piwrdsnk->PutWord(lpwchWord, cwchConverted,
                                             cbRaw, iwchOffset);
    }
    else
        hrResult = E_UNEXPECTED;

    _GLOBALUNLOCK(pParms->hmemUnicode);

    return (hrResult);
}
|