|
|
/*************************************************************************
*  @doc SHROOM EXTERNAL API                                              *
*                                                                        *
*  ENGSTEM.CPP                                                           *
*                                                                        *
*  Copyright (C) Microsoft Corporation 1997                              *
*  All Rights reserved.                                                  *
*                                                                        *
*  This file contains the implementation of CITEngStemmer methods.       *
*  CITEngStemmer is a pluggable word stemmer object.                     *
*  Although all the word breaking interface methods that accept text     *
*  require it to be Unicode, CITEngStemmer still only supports ANSI      *
*  internally.                                                           *
*                                                                        *
**************************************************************************
*                                                                        *
*  Written By   : Bill Aloof                                             *
*  Current Owner: billa                                                  *
*                                                                        *
**************************************************************************/
#include <mvopsys.h>
#ifdef _DEBUG
// Name of this source file, kept in debug builds for error reporting.
static char s_aszModule[] = __FILE__;
#endif
#include <atlinc.h> // includes for ATL.
#include <_mvutil.h>
#include <mem.h>
#include <orkin.h>
#include <mvsearch.h>
#include "common.h"
#include <iterror.h>
#include <itstem.h>
#include <itwbrkid.h>
#include "engstem.h"
//---------------------------------------------------------------------------
// Constructor and Destructor
//---------------------------------------------------------------------------
// Puts a new stemmer in its uninitialized state: the control data and
// state flags are reset through ClearMembers(), and both work buffers
// start out unallocated with a recorded size of zero.
CITEngStemmer::CITEngStemmer()
{
    ClearMembers();

    m_hmem2 = NULL;
    m_hmem1 = NULL;

    m_cbBuf2Cur = 0;
    m_cbBuf1Cur = 0;
}
// Tears the object down by delegating to Close(), which frees both work
// buffers and resets the member state.
CITEngStemmer::~CITEngStemmer()
{
    Close();
}
//---------------------------------------------------------------------------
// IStemmer Method Implementations
//---------------------------------------------------------------------------
/********************************************************************
 * @method    STDMETHODIMP | IStemmer | Init |
 *      Gives the stemmer object a chance to initialize itself beyond
 *      what it did during IPersistStreamInit::InitNew or ::Load.
 *
 * @parm ULONG | ulMaxTokenSize | Max term length requested by caller.
 *      Currently ignored (see the note in the body).
 * @parm BOOL* | pfLicense | Whether the stemmer is subject to a license.
 *      Set to FALSE on success; left untouched on failure.
 *
 * @rvalue E_POINTER | pfLicense was NULL
 * @rvalue E_FAIL | The object was already initialized with a locale
 *      whose primary language is not English
 * @rvalue S_OK | The object was already initialized with an English locale
 *
 * @comm
 *      When the object has not been initialized yet, the result of
 *      InitNew() is returned.
 ********************************************************************/
STDMETHODIMP CITEngStemmer::Init(ULONG ulMaxTokenSize, BOOL *pfLicense)
{
    if (pfLicense == NULL)
        return (SetErrReturn(E_POINTER));

    HRESULT hrResult;

    // If nobody has called IPersistStreamInit::InitNew or Load on us yet,
    // do the InitNew work ourselves so that Tripoli clients can use us
    // without any code changes on their part.  In that case the caller has
    // not had a chance to set the lcid yet, so we don't check it here and
    // still give the caller the opportunity to set it correctly later.
    // If we were already initialized, the caller could have set the lcid,
    // so it must identify English.
    if (!m_fInitialized)
    {
        hrResult = InitNew();
    }
    else if (PRIMARYLANGID(LANGIDFROMLCID(m_stemctl.lcid)) == LANG_ENGLISH)
    {
        hrResult = S_OK;
    }
    else
    {
        hrResult = E_FAIL;
    }

    if (SUCCEEDED(hrResult))
        *pfLicense = FALSE;

    // NOTE: We don't support internal truncation of terms based on
    // ulMaxTokenSize.  This is OK since the word sink is supposed to be
    // prepared to have to truncate anyway.
    return (hrResult);
}
/********************************************************************
 * @method    STDMETHODIMP | IStemmer | StemWord |
 *      Stems the input word and calls IStemSink::PutWord with the result.
 *
 * @parm WCHAR const | *pwcInBuf | Input Unicode word.
 * @parm ULONG | cwc | Count of Unicode characters in the input word.
 * @parm IStemSink | *pStemSink | Pointer to the stemmer sink.
 *
 * @rvalue E_POINTER | Either the input buffer or pStemSink is NULL.
 * @rvalue E_NOTOPEN | The object has not been initialized.
 * @rvalue E_FAIL | The configured locale's primary language is not English.
 * @rvalue E_WORDTOOLONG | cwc is larger than 0x7FFF
 * @rvalue E_UNEXPECTED | A Unicode <-> ANSI conversion produced no output.
 * @rvalue S_OK | The operation completed successfully.
 *
 * @comm
 *      Failures from ReallocBuffer, FStem and IStemSink::PutWord are
 *      returned to the caller unchanged.
 ********************************************************************/
STDMETHODIMP CITEngStemmer::StemWord(WCHAR const *pwcInBuf, ULONG cwc,
                                     IStemSink *pStemSink)
{
    HRESULT hr = S_OK;

    if (pwcInBuf == NULL || pStemSink == NULL)
        return (SetErrReturn(E_POINTER));

    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    if (PRIMARYLANGID(LANGIDFROMLCID(m_stemctl.lcid)) != LANG_ENGLISH)
        return (SetErrReturn(E_FAIL));

    if (cwc > 0x7FFF)
        return (SetErrReturn(E_WORDTOOLONG));

    m_cs.Lock();

    // We allocate enough space for a worst case Unicode ---> MBCS conversion
    // and allow an extra word for a length prefix that we will add later.
    // This is probably overly cautious because we shouldn't be seeing any
    // DBCS anyway (we're an English stemmer).
    if (SUCCEEDED(hr = ReallocBuffer(&m_hmem1, &m_cbBuf1Cur,
                                (sizeof(WCHAR) * cwc) + sizeof(WORD))))
    {
        LPBYTE  lpbRawWord;

        lpbRawWord = (LPBYTE) _GLOBALLOCK(m_hmem1);

        // REVIEW (billa): Need to make sure that the word being stemmed is in
        // lower case.

        // Convert the raw word to ANSI.  The byte count returned by the
        // conversion is stored as a WORD length prefix at the start of the
        // buffer; the converted characters follow it.
        if ((*((WORD *)lpbRawWord) = (WORD)
                WideCharToMultiByte(m_stemctl.dwCodePageID, 0, pwcInBuf, cwc,
                                    (char *)lpbRawWord + sizeof(WORD),
                                    (m_cbBuf1Cur - sizeof(WORD)),
                                    NULL, NULL)) > 0)
        {
            // We want the buffer we allocate for the stemmed word to be larger
            // than the raw word length so that we can handle the rare case
            // where the stemmed word has grown.  We can just use the raw word
            // buffer size because it included a lot of extra padding.
            if (SUCCEEDED(hr = ReallocBuffer(&m_hmem2, &m_cbBuf2Cur,
                                                            m_cbBuf1Cur)))
            {
                LPBYTE  lpbStemWord;

                lpbStemWord = (LPBYTE) _GLOBALLOCK(m_hmem2);

                if (SUCCEEDED(hr = FStem(lpbStemWord, lpbRawWord)))
                {
                    WCHAR   *lpwchStem;
                    DWORD   cwchStem;
                    DWORD   cbStemWord;

                    // The raw word is no longer needed, so buffer 1 is
                    // reused to hold the Unicode form of the stem.
                    _GLOBALUNLOCK(m_hmem1);
                    cwchStem = cbStemWord = (DWORD)(*((WORD *)lpbStemWord));
                    hr = ReallocBuffer(&m_hmem1, &m_cbBuf1Cur,
                                            sizeof (WCHAR) * cbStemWord);

                    // Relock buffer even if we've failed the realloc
                    // so that the unlock we do later is valid.  An
                    // unconditional relock is OK because ReallocBuffer
                    // won't invalidate the original m_hmem1 if it fails.
                    lpwchStem = (WCHAR *) _GLOBALLOCK(m_hmem1);

                    // Only convert when the realloc succeeded: otherwise
                    // the old buffer may be too small for the stem, and
                    // the realloc failure must reach the caller instead of
                    // being overwritten by a later result.
                    if (SUCCEEDED(hr))
                    {
                        // Convert the stem word back to Unicode so that we
                        // can call the stem sink.
                        if ((cwchStem =
                                MultiByteToWideChar(m_stemctl.dwCodePageID, 0,
                                        (LPCSTR)lpbStemWord + sizeof(WORD),
                                        cbStemWord, lpwchStem, cwchStem)) > 0)
                        {
                            // Send the stemmed word to the stem sink.
                            hr = pStemSink->PutWord(lpwchStem, cwchStem);
                        }
                        else
                            hr = E_UNEXPECTED;
                    }
                }

                _GLOBALUNLOCK(m_hmem2);
            }
        }
        else
            hr = E_UNEXPECTED;

        _GLOBALUNLOCK(m_hmem1);
    }

    m_cs.Unlock();

    return (hr);
}
/*****************************************************************
 * @method    STDMETHODIMP | IStemmer | GetLicenseToUse |
 *      Not yet implemented.
 *
 * @parm WCHAR const | **ppwcsLicense | Not used; never written to.
 *
 * @rvalue E_NOTIMPL | Always.
 ****************************************************************/
STDMETHODIMP CITEngStemmer::GetLicenseToUse(WCHAR const **ppwcsLicense)
{
    return E_NOTIMPL;
}
//---------------------------------------------------------------------------
// IStemmerConfig Method Implementations
//---------------------------------------------------------------------------
/*****************************************************************
 * @method    STDMETHODIMP | IStemmerConfig | SetLocaleInfo |
 *      Sets locale information that affects the stemming
 *      behavior of IStemmer::StemWord.
 *
 * @parm DWORD | dwCodePageID | ANSI code page no. specified at build time.
 * @parm LCID | lcid | Win32 locale identifier specified at build time.
 *
 * @rvalue S_OK | Locale described by the parameters is supported
 * @rvalue E_NOTOPEN | The object has not been initialized.
 * @rvalue E_INVALIDARG | Locale described by the parameters is not
 *      supported (its primary language is not English).
 *
 * @comm
 *      On success the object is marked dirty.
 ****************************************************************/
STDMETHODIMP CITEngStemmer::SetLocaleInfo(DWORD dwCodePageID, LCID lcid)
{
    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    // Only the primary language of the locale is examined.
    const BOOL fEnglishLocale =
                    (PRIMARYLANGID(LANGIDFROMLCID(lcid)) == LANG_ENGLISH);

    if (!fEnglishLocale)
        return (SetErrReturn(E_INVALIDARG));

    m_cs.Lock();
    {
        m_stemctl.lcid = lcid;
        m_stemctl.dwCodePageID = dwCodePageID;
        m_fDirty = TRUE;
    }
    m_cs.Unlock();

    return (S_OK);
}
/*****************************************************************
 * @method    STDMETHODIMP | IStemmerConfig | GetLocaleInfo |
 *      Gets locale information that affects the stemming
 *      behavior of IStemmer::StemWord.
 *
 * @parm DWORD | *pdwCodePageID | Receives the code page identifier.
 * @parm LCID | *plcid | Receives the Win32 locale identifier.
 *
 * @rvalue S_OK | Both values were copied to the caller.
 * @rvalue E_POINTER | pdwCodePageID or plcid is NULL.
 * @rvalue E_NOTOPEN | The object has not been initialized.
 ****************************************************************/
STDMETHODIMP CITEngStemmer::GetLocaleInfo(DWORD *pdwCodePageID, LCID *plcid)
{
    if (plcid == NULL || pdwCodePageID == NULL)
        return (SetErrReturn(E_POINTER));

    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    // Copy both fields under the lock that SetLocaleInfo writes them under.
    m_cs.Lock();
    {
        *plcid = m_stemctl.lcid;
        *pdwCodePageID = m_stemctl.dwCodePageID;
    }
    m_cs.Unlock();

    return (S_OK);
}
/*****************************************************************
 * @method    STDMETHODIMP | IStemmerConfig | SetControlInfo |
 *      Sets information that controls certain aspects of stemming.
 *
 * @parm DWORD | grfStemFlags | Flags that control stemming behavior.
 * @parm DWORD | dwReserved | Reserved for future use; currently ignored.
 *
 * @rvalue S_OK | The operation completed successfully.
 * @rvalue E_NOTOPEN | The object has not been initialized.
 * @rvalue E_INVALIDARG | grfStemFlags contains an unsupported flag.
 *
 * @comm
 *      Every flag bit is currently treated as unsupported, so only a
 *      grfStemFlags value of 0 is accepted.  On success the object is
 *      marked dirty.
 *      In the future, additional information may be passed in through
 *      dwReserved.
 ****************************************************************/
STDMETHODIMP CITEngStemmer::SetControlInfo(DWORD grfStemFlags,
                                           DWORD dwReserved)
{
    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    // Mask of flag bits this stemmer rejects: all of them.
    const DWORD grfRejected = ~(0);

    if ((grfStemFlags & grfRejected) != 0)
        return (SetErrReturn(E_INVALIDARG));

    m_cs.Lock();
    {
        m_stemctl.grfStemFlags = grfStemFlags;
        m_fDirty = TRUE;
    }
    m_cs.Unlock();

    return (S_OK);
}
/*****************************************************************
 * @method    STDMETHODIMP | IStemmerConfig | GetControlInfo |
 *      Gets information that controls stemming behavior.
 *
 * @parm DWORD | *pgrfStemFlags | Receives the flags that control
 *      stemming behavior.
 * @parm DWORD | *pdwReserved | Reserved for future use; never written to.
 *
 * @rvalue S_OK | The operation completed successfully.
 * @rvalue E_POINTER | pgrfStemFlags is NULL.
 * @rvalue E_NOTOPEN | The object has not been initialized.
 ****************************************************************/
STDMETHODIMP CITEngStemmer::GetControlInfo(DWORD *pgrfStemFlags,
                                           DWORD *pdwReserved)
{
    if (pgrfStemFlags == NULL)
    {
        return (SetErrReturn(E_POINTER));
    }

    if (!m_fInitialized)
    {
        return (SetErrReturn(E_NOTOPEN));
    }

    *pgrfStemFlags = m_stemctl.grfStemFlags;
    return (S_OK);
}
/*****************************************************************
 * @method    STDMETHODIMP | IStemmerConfig | LoadExternalStemmerData |
 *      Loads external stemmer data, such as word part lists.
 *
 * @parm IStream | *pStream | Pointer to stream object containing
 *      stemmer data.  Currently unused.
 * @parm DWORD | dwExtDataType | Data type.  Currently unused.
 *
 * @rvalue E_NOTOPEN | The object has not been initialized.
 * @rvalue E_NOTIMPL | Returned in all other cases.
 *
 * @comm
 *      Not implemented yet.
 ****************************************************************/
STDMETHODIMP CITEngStemmer::LoadExternalStemmerData(IStream *pStream,
                                                    DWORD dwExtDataType)
{
    if (m_fInitialized)
    {
        return (E_NOTIMPL);
    }

    return (SetErrReturn(E_NOTOPEN));
}
//---------------------------------------------------------------------------
// IPersistStreamInit Method Implementations
//---------------------------------------------------------------------------
// Copies CLSID_ITEngStemmer into *pclsid and returns S_OK.
// Returns E_POINTER when pclsid is NULL.
STDMETHODIMP CITEngStemmer::GetClassID(CLSID *pclsid)
{
    if (pclsid != NULL)
    {
        *pclsid = CLSID_ITEngStemmer;
        return (S_OK);
    }

    return (SetErrReturn(E_POINTER));
}
// Reports whether the object's dirty flag is set: S_OK when m_fDirty is
// set, S_FALSE when it is clear.  Returns E_NOTOPEN when the object has
// not been initialized.
STDMETHODIMP CITEngStemmer::IsDirty(void)
{
    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    if (m_fDirty)
        return (S_OK);

    return (S_FALSE);
}
// Initializes the object from a stream.
//
// The data is read in this order: a DWORD version, which must equal
// VERSION_ENGSTEMMER; a DWORD of persisted-item flags; and, when the
// ITSTDBRK_PERSISTED_STEMCTL flag is set, a STEMCTL structure.
//
// Returns:
//   E_POINTER      - pStream is NULL.
//   E_ALREADYOPEN  - the object has already been initialized.
//   E_BADFORMAT    - a read returned fewer bytes than requested.
//   E_BADVERSION   - the stored version is not VERSION_ENGSTEMMER.
//   S_OK           - the object is now initialized.
// A failure returned by IStream::Read is passed through unchanged.
// On any failure after the lock is taken, Close() resets the object.
STDMETHODIMP CITEngStemmer::Load(IStream *pStream)
{
    HRESULT hr;
    DWORD   dwVersion;
    DWORD   grfPersistedItems = 0;
    DWORD   cbRead;

    if (pStream == NULL)
        return (SetErrReturn(E_POINTER));

    // Lock before checking m_fInitialized to make sure we don't compete
    // with a call to ::InitNew.
    m_cs.Lock();

    if (m_fInitialized)
    {
        // Release the lock before bailing out; returning with it held
        // would leave m_cs locked for good.
        m_cs.Unlock();
        return (SetErrReturn(E_ALREADYOPEN));
    }

    if (SUCCEEDED(hr = pStream->Read((LPVOID) &dwVersion, sizeof(DWORD),
                                                                &cbRead)) &&
        SUCCEEDED(hr = ((cbRead == sizeof(DWORD)) ? S_OK : E_BADFORMAT)) &&
        SUCCEEDED(hr = ((dwVersion == VERSION_ENGSTEMMER) ? S_OK :
                                                            E_BADVERSION)) &&
        SUCCEEDED(hr = pStream->Read((LPVOID) &grfPersistedItems,
                                                sizeof(DWORD), &cbRead)) &&
        SUCCEEDED(hr = ((cbRead == sizeof(DWORD)) ? S_OK : E_BADFORMAT)) &&
        grfPersistedItems != 0)
    {
        if ((grfPersistedItems & ITSTDBRK_PERSISTED_STEMCTL) != 0)
        {
            if (SUCCEEDED(hr = pStream->Read((LPVOID) &m_stemctl,
                                                sizeof(STEMCTL), &cbRead)))
                hr = ((cbRead == sizeof(STEMCTL)) ? S_OK : E_BADFORMAT);
        }
        else
        {
            // It is a surprise not to find the STEMCTL structure in the
            // stream, but we can continue on because we will initialize the
            // structure with good defaults before we exit this routine.
            ITASSERT(FALSE);
        }
    }

    if (SUCCEEDED(hr))
    {
        if ((grfPersistedItems & ITSTDBRK_PERSISTED_STEMCTL) == 0)
        {
            InitStemCtl();

            // Set flag in case we're asked to save.
            grfPersistedItems |= ITSTDBRK_PERSISTED_STEMCTL;
        }

        m_grfPersistedItems = grfPersistedItems;
        m_fInitialized = TRUE;
    }
    else
    {
        // Free any persisted items which may have been loaded successfully.
        Close();
    }

    m_cs.Unlock();
    return (hr);
}
// Writes the object's state to a stream.
//
// The data is written in this order: VERSION_ENGSTEMMER as a DWORD; the
// persisted-item flags as a DWORD; and, when the
// ITSTDBRK_PERSISTED_STEMCTL flag is set, the STEMCTL structure.
// When the save succeeds and fClearDirty is set, the dirty flag is cleared.
//
// Returns:
//   E_POINTER  - pStream is NULL.
//   E_NOTOPEN  - the object has not been initialized.
// Otherwise returns the result of the last IStream::Write attempted.
STDMETHODIMP CITEngStemmer::Save(IStream *pStream, BOOL fClearDirty)
{
    HRESULT hr;
    DWORD   dwVersion;
    DWORD   cbWritten;

    if (pStream == NULL)
        return (SetErrReturn(E_POINTER));

    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    m_cs.Lock();

    // Header: version stamp followed by the persisted-item flags.
    dwVersion = VERSION_ENGSTEMMER;
    hr = pStream->Write((LPVOID) &dwVersion, sizeof(DWORD), &cbWritten);

    if (SUCCEEDED(hr))
    {
        hr = pStream->Write((LPVOID) &m_grfPersistedItems, sizeof(DWORD),
                                                                &cbWritten);
    }

    if (SUCCEEDED(hr))
    {
        if ((m_grfPersistedItems & ITSTDBRK_PERSISTED_STEMCTL) == 0)
        {
            // We should always be writing the STEMCTL structure, but if for
            // some reason the flag to write it is not set, we can still
            // continue because at load time we will tolerate the absence of
            // the struct.
            ITASSERT(FALSE);
        }
        else
        {
            hr = pStream->Write((LPVOID) &m_stemctl, sizeof(STEMCTL),
                                                                &cbWritten);
        }
    }

    if (fClearDirty && SUCCEEDED(hr))
        m_fDirty = FALSE;

    m_cs.Unlock();

    return (hr);
}
// Not implemented: always returns E_NOTIMPL and never writes to
// *pcbSizeMax.
STDMETHODIMP CITEngStemmer::GetSizeMax(ULARGE_INTEGER *pcbSizeMax)
{
    return E_NOTIMPL;
}
// Initializes the object with default control data instead of loading it
// from a stream: fills in the STEMCTL defaults through InitStemCtl(),
// records that the STEMCTL structure should be persisted, and marks the
// object initialized.
//
// Returns:
//   E_ALREADYOPEN  - the object has already been initialized.
//   S_OK           - the object is now initialized.
STDMETHODIMP CITEngStemmer::InitNew(void)
{
    // Lock before checking m_fInitialized to make sure we don't compete
    // with a call to ::Load.
    m_cs.Lock();

    if (m_fInitialized)
    {
        // Release the lock before bailing out; returning with it held
        // would leave m_cs locked for good.
        m_cs.Unlock();
        return (SetErrReturn(E_ALREADYOPEN));
    }

    InitStemCtl();
    m_grfPersistedItems |= ITSTDBRK_PERSISTED_STEMCTL;
    m_fInitialized = TRUE;

    m_cs.Unlock();
    return (S_OK);
}
//---------------------------------------------------------------------------
// Private Method Implementations
//---------------------------------------------------------------------------
// Resizes one of the work buffers through ReallocBufferHmem while holding
// m_cs.  The size requested is never smaller than cbAnsiBufInit.
//
//   phmemBuf   - handle of the buffer to resize (passed through).
//   pcbBufCur  - current size of that buffer (passed through).
//   cbBufNew   - size wanted by the caller.
//
// Returns whatever ReallocBufferHmem returns.
//
// NOTE(review): StemWord calls this while it already holds m_cs, so m_cs
// presumably allows re-entrant locking by the same thread - confirm.
HRESULT CITEngStemmer::ReallocBuffer(HGLOBAL *phmemBuf, DWORD *pcbBufCur,
                                     DWORD cbBufNew)
{
    HRESULT hrRealloc;

    m_cs.Lock();
    hrRealloc = ReallocBufferHmem(phmemBuf, pcbBufCur,
                                        max(cbBufNew, cbAnsiBufInit));
    m_cs.Unlock();

    return (hrRealloc);
}
// Resets the object's persistent state: zero-fills the STEMCTL structure,
// clears the initialized and dirty flags, and clears the persisted-item
// flags.  The work buffers (m_hmem1/m_hmem2) are not touched here.
void CITEngStemmer::ClearMembers(void)
{
    // The fill value is a byte value, not a pointer, so use 0 instead of
    // NULL.
    MEMSET(&m_stemctl, 0, sizeof(STEMCTL));
    m_fInitialized = m_fDirty = FALSE;
    m_grfPersistedItems = 0;
}
// Fills m_stemctl with defaults: the code page from GetACP(), the locale
// from GetUserDefaultLCID(), and no stemming flags.
void CITEngStemmer::InitStemCtl(void)
{
    m_stemctl.grfStemFlags = 0;
    m_stemctl.dwCodePageID = GetACP();

    // If the user default language is not English, we'll store the
    // value anyway and check it in IStemmer::Init and ::StemWord.
    m_stemctl.lcid = GetUserDefaultLCID();
}
// Returns the object to its uninitialized state: frees whichever work
// buffers are currently allocated, zeroes their recorded sizes, and then
// resets the remaining members through ClearMembers().
void CITEngStemmer::Close(void)
{
    // First work buffer.
    if (m_hmem1 != NULL)
    {
        _GLOBALFREE(m_hmem1);
        m_cbBuf1Cur = 0;
        m_hmem1 = NULL;
    }

    // Second work buffer.
    if (m_hmem2 != NULL)
    {
        _GLOBALFREE(m_hmem2);
        m_cbBuf2Cur = 0;
        m_hmem2 = NULL;
    }

    ClearMembers();
}
|