windows-xp/Source/XPSP1/NT/enduser/stuff/itircl/fts/breakers/stdbrkr.cpp


								/*************************************************************************

								*  @doc SHROOM EXTERNAL API                                              *

								*																		 *

								*  STDBRKR.CPP                                                           *

								*                                                                        *

								*  Copyright (C) Microsoft Corporation 1997                              *

								*  All Rights reserved.                                                  *

								*                                                                        *

								*  This file contains the implementation of CITStdBreaker methods.       *

								*  CITStdBreaker is a pluggable word breaker object that can optionally  *

								*  use a character class table and stop word list during its breaking	 *

								*  operations.  Although all the word breaking interface methods		 *

								*  that accept text require it to be Unicode, CITStdBreaker still only	 *

								*  supports MBCS internally.											 *

								*																	     *

								**************************************************************************

								*                                                                        *

								*  Written By   : Bill Aloof	                                         *

								*  Current Owner: billa		                                             *

								*                                                                        *

								**************************************************************************/

								#include <mvopsys.h>


								#ifdef _DEBUG

								// Debug builds only: the name of this source file.  Presumably picked up
								// by the assertion / error-reporting macros used below (ITASSERT,
								// SetErrReturn) -- NOTE(review): confirm against their definitions.
								static char s_aszModule[] = __FILE__;   /* For error report */

								#endif


								#ifdef IA64

								#include <itdfguid.h>

								#endif


								#include <atlinc.h>	    // includes for ATL.

								#include <_mvutil.h>

								#include <mem.h>

								#include <orkin.h>

								#include <mvsearch.h>

								#include "common.h"

								#include <iterror.h>

								#include <itwbrk.h>

								#include <itwbrkid.h>

								#include "stdbrkr.h"


								// Word callback that CITStdBreaker::BreakText installs in
								// BRK_PARMS::lpfnOutWord; BreakText passes a WRDFNPM block as lpvUser.
								// Only declared here -- the definition is not in this part of the file.
								HRESULT FAR PASCAL StdBreakerWordFunc(LST lstRawWord, LST lstNormWord,

																		DWORD dwWordOffset, LPVOID lpvUser);


								//---------------------------------------------------------------------------

								//						Constructor and Destructor

								//---------------------------------------------------------------------------


								// Constructor: resets the common member state through ClearMembers() and
								// then explicitly clears the stemmer pointer and the MBCS scratch buffer
								// bookkeeping used by BreakText.
								CITStdBreaker::CITStdBreaker()
								{
									// Must run first; the fields below are assigned afterwards so their
									// initial values do not depend on what ClearMembers() touches.
									ClearMembers();

									m_pistem = NULL;		// no stemmer attached yet
									m_hmemAnsi = NULL;		// no Unicode->MBCS conversion buffer yet
									m_cbBufAnsiCur = 0;		// ...and therefore a recorded size of zero
								}


								// Destructor: all teardown is delegated to Close().
								CITStdBreaker::~CITStdBreaker()
								{
									Close();
								}


								//---------------------------------------------------------------------------

								//						IWordBreaker Method Implementations

								//---------------------------------------------------------------------------


								/********************************************************************
								 * @method    STDMETHODIMP | IWordBreaker | Init |
								 *     Prepares the breaker for subsequent BreakText calls.  If neither
								 *     IPersistStreamInit::InitNew nor ::Load has been called yet, the
								 *     breaker initializes itself through InitNew first.  When a stemmer
								 *     is attached, its Init method is called with ulMaxTokenSize and
								 *     pfLicense.
								 * @parm BOOL | fQuery | TRUE means breaker context is query processing.
								 *     The value is remembered, and when TRUE MVCharTableSetWildcards is
								 *     applied to the current character table.
								 * @parm ULONG | ulMaxTokenSize | Max term length requested by caller.
								 *     Only forwarded to the stemmer; see the note in the body.
								 * @parm BOOL* | pfLicense | Whether the breaker is subject to a license.
								 *     Left to the stemmer when one is attached; otherwise set to FALSE
								 *     on success.
								 *
								 * @rvalue E_POINTER | pfLicense was NULL
								 *
								 * @comm
								 * Results of InitNew and of the stemmer's Init are returned unchanged.
								 ********************************************************************/
								STDMETHODIMP
								CITStdBreaker::Init(BOOL fQuery, ULONG ulMaxTokenSize, BOOL *pfLicense)
								{
									if (pfLicense == NULL)
										return (SetErrReturn(E_POINTER));

									// A breaker that was never given InitNew/Load initializes itself
									// here.  This allows Tripoli clients to use us without any code
									// changes on their part.
									HRESULT	hr = (m_fInitialized ? S_OK : InitNew());

									if (!SUCCEEDED(hr))
										return (hr);

									// Give an attached stemmer its chance to initialize; it also owns
									// the value written to *pfLicense in that case.
									if (m_pistem != NULL)
									{
										hr = m_pistem->Init(ulMaxTokenSize, pfLicense);

										if (!SUCCEEDED(hr))
											return (hr);
									}

									// Remember the context and turn on wildcards for query processing.
									m_fQueryContext = fQuery;

									if (m_fQueryContext)
										MVCharTableSetWildcards(m_lpctab);

									// We set *pfLicense only if the stemmer didn't.
									if (m_pistem == NULL)
										*pfLicense = FALSE;

									// NOTE: We don't support caller-specified internal truncation of terms
									// based on ulMaxTokenSize.  The breaker routines have a hard-coded
									// maximum of CB_MAX_WORD_LEN.  This is OK since the word sink is
									// supposed to be prepared to have to truncate anyway.
									return (hr);
								}


								/********************************************************************

								 * @method    STDMETHODIMP | IWordBreaker | BreakText |

								 * Parses text to find both individual tokens and noun phrases, then

								 * calls methods of IWordSink and IPhraseSink with the results.

								 *

								 * @parm TEXT_SOURCE | *pTextSource | Source of the UniCode text.

								 * @parm IWordSink | *pWordSink | Pointer to the word sink.

								 * @parm IPhraseSink | *pPhraseSink | Pointer to the phrase sink.

								 *      (Not supported at this time.)

								 *

								 * @rvalue S_OK | The operation completed successfully.

								 * @rvalue E_POINTER | The text source is null.

								 * @rvalue E_INVALIDARG | The word sink is NULL.

								 * @rvalue E_NOTOPEN |

								 * @rvalue E_OUTOFMEMORY | There was not enough memory to complete the operation.

								 *

								 * @comm

								 * The raw text in pTextSource is parsed by the word breaker until no

								 * more text is available to refill the buffer.  At this point, BreakText returns S_OK.

								 *

								 *

								 ********************************************************************/

								STDMETHODIMP

								CITStdBreaker::BreakText(TEXT_SOURCE *pTextSource, IWordSink *pWordSink,

																			IPhraseSink *pPhraseSink)

								{

									HRESULT		hr = S_OK;

									LPIBI		lpibi = NULL;


									if (pTextSource == NULL)

										return (SetErrReturn(E_POINTER));


									// We treat a NULL pWordSink different than a NULL pTextSource

									// to indicate to the caller that we can't do anything meaningful

									// without a pWordSink because we don't do phrase breaking.

									if (pWordSink == NULL)

										return (SetErrReturn(E_INVALIDARG));


									if (!m_fInitialized)

										return (SetErrReturn(E_NOTOPEN));


									m_cs.Lock();


									if ((lpibi = BreakerInitiate()) != NULL)

									{

										BRK_PARMS	bkp;

										WRDFNPM		wrdfnpm;


										// Set up word callback wrapper params.

										MEMSET(&wrdfnpm, NULL, sizeof(WRDFNPM));

										wrdfnpm.piwrdsnk = pWordSink;

										wrdfnpm.dwCodePageID = m_brkctl.dwCodePageID;


										// Set up breaker params that will get passed to FBreakX.

										bkp.lpInternalBreakInfo = lpibi;

										bkp.lcbBufOffset = 0;

										bkp.lpvUser = (LPVOID) &wrdfnpm;

										bkp.lpfnOutWord = StdBreakerWordFunc;

										bkp.lpStopInfoBlock = m_lpsipb;

										bkp.lpCharTab = m_lpctab;

										bkp.fFlags =

											((m_brkctl.grfBreakFlags & IITWBC_BREAK_ACCEPT_WILDCARDS) != 0 ?

																						ACCEPT_WILDCARD : 0);


										// Loop to break text.

										do

										{

											DWORD	cbAnsi;

											DWORD	cwch;


											// Make the ANSI buffer big enough to handle all DBCS in case

											// that's what we get when converting from Unicode.

											cbAnsi = sizeof(WCHAR) *

														(cwch = (pTextSource->iEnd - pTextSource->iCur));


											if (SUCCEEDED(hr =

													ReallocBuffer(&m_hmemAnsi, &m_cbBufAnsiCur, cbAnsi)))

											{

												bkp.lpbBuf = (LPBYTE) _GLOBALLOCK(m_hmemAnsi);


												if ((bkp.cbBufCount =

														WideCharToMultiByte(m_brkctl.dwCodePageID, NULL,

														  (LPCWSTR) &pTextSource->awcBuffer[pTextSource->iCur],

																	cwch, (char *) bkp.lpbBuf, m_cbBufAnsiCur,

																							NULL, NULL)) > 0)

												{

													// StdBreakerWordFunc needs the MBCS buffer to compute an

													// accurate word offset into the Unicode buffer.

													wrdfnpm.lpbBuf = bkp.lpbBuf;


													switch (m_brkctl.dwBreakWordType)

													{

														case IITWBC_BREAKTYPE_TEXT:

															if (SUCCEEDED(hr = FBreakWords(&bkp)))

															{

													            /* Flush the word breaker */

													            bkp.lpbBuf = NULL;

													            bkp.cbBufCount = 0;

													            hr = FBreakWords(&bkp);

															}

															break;


														case IITWBC_BREAKTYPE_NUMBER:

															if (SUCCEEDED(hr = FBreakNumber(&bkp)))

															{

													            /* Flush the word breaker */

													            bkp.lpbBuf = NULL;

													            bkp.cbBufCount = 0;

													            hr = FBreakNumber(&bkp);

															}

															break;


														case IITWBC_BREAKTYPE_DATE:

															if (SUCCEEDED(hr = FBreakDate(&bkp)))

															{

													            /* Flush the word breaker */

													            bkp.lpbBuf = NULL;

													            bkp.cbBufCount = 0;

													            hr = FBreakDate(&bkp);

															}

															break;


														case IITWBC_BREAKTYPE_TIME:

															if (SUCCEEDED(hr = FBreakTime(&bkp)))

															{

													            /* Flush the word breaker */

													            bkp.lpbBuf = NULL;

													            bkp.cbBufCount = 0;

													            hr = FBreakTime(&bkp);

															}

															break;


														case IITWBC_BREAKTYPE_EPOCH:

															if (SUCCEEDED(hr = FBreakEpoch(&bkp)))

															{

													            /* Flush the word breaker */

													            bkp.lpbBuf = NULL;

													            bkp.cbBufCount = 0;

													            hr = FBreakEpoch(&bkp);

															}

															break;


														default:

															ITASSERT(FALSE);

															hr = E_UNEXPECTED;

															break;

													};

												}

												else

													hr = E_UNEXPECTED;


												_GLOBALUNLOCK(m_hmemAnsi);

											}


											// Advance cur to end just in case the caller cares about this

											// being the case when we ask for more characters.

											pTextSource->iCur = pTextSource->iEnd;


										} while (SUCCEEDED(hr) &&

												 SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)));


										// Free any buffer that the word callback wrapper may have allocated.

										if (wrdfnpm.hmemUnicode != NULL)

											_GLOBALFREE(wrdfnpm.hmemUnicode);

									}

									else

										hr = E_OUTOFMEMORY;


									if (lpibi != NULL)

										BreakerFree(lpibi);


									m_cs.Unlock();


									return (hr);

								}


								/********************************************************************
								 * @method    STDMETHODIMP | IWordBreaker | ComposePhrase |
								 *  Converts a noun and modifier back into a linguistically correct
								 *  source phrase.  This breaker does not implement the operation.
								 *
								 * @parm WCHAR const | *pwcNoun | Pointer to the word being modified.
								 * @parm ULONG | cwcNoun | The count of characters in pwcNoun.
								 * @parm WCHAR const | *pwcModifier | Points to the word modifying pwcNoun
								 * @parm ULONG | cwcModifier | Length of pwcModifier
								 * @parm ULONG | ulAttachmentType | A wordbreaker-specific value
								 *         describing the method of composition.
								 * @parm WCHAR | *pwcPhrase | Buffer in which to store the composed phrase
								 * @parm ULONG | *pcwcPhrase | Length in characters of the pwcPhrase buffer.
								 *
								 * @rvalue E_NOTIMPL | Always.
								 *
								 * @comm
								 * Not implemented.  None of the parameters are examined or written.
								 ********************************************************************/
								STDMETHODIMP
								CITStdBreaker::ComposePhrase(WCHAR const *pwcNoun, ULONG cwcNoun,
														WCHAR const *pwcModifier, ULONG cwcModifier,
														ULONG ulAttachmentType, WCHAR *pwcPhrase,
																				ULONG *pcwcPhrase)
								{
									// Phrase composition is not provided by this breaker.
									return (E_NOTIMPL);
								}


								/********************************************************************
								 * @method    STDMETHODIMP | IWordBreaker | GetLicenseToUse |
								 * Returns a pointer to the license information.  The breaker has none
								 * of its own; the request is passed to the attached stemmer, if any.
								 *
								 * @parm WCHAR const | **ppwcsLicense | Pointer to the license information.
								 *
								 * @rvalue E_POINTER | ppwcsLicense is null.
								 * @rvalue E_NOTIMPL | No stemmer is attached.
								 *
								 * @comm
								 * With a stemmer attached, the stemmer's result is returned unchanged.
								 ********************************************************************/
								STDMETHODIMP
								CITStdBreaker::GetLicenseToUse(WCHAR const **ppwcsLicense)
								{
									if (ppwcsLicense == NULL)
										return (SetErrReturn(E_POINTER));

									// Without a stemmer there is no license text to hand out.
									if (m_pistem == NULL)
										return (E_NOTIMPL);

									return (m_pistem->GetLicenseToUse(ppwcsLicense));
								}


								//---------------------------------------------------------------------------

								//						IWordBreakerConfig Method Implementations

								//---------------------------------------------------------------------------


								/********************************************************************
								 * @method    STDMETHODIMP | IWordBreakerConfig | SetLocaleInfo|
								 * Sets locale information for the word breaker and marks the
								 * breaker dirty.
								 *
								 * @parm DWORD | dwCodePageID | ANSI code page no. specified at build time.
								 * @parm LCID | lcid | Win32 locale identifier specified at build time.
								 *
								 * @rvalue E_NOTOPEN | The breaker is not initialized.
								 * @rvalue S_OK | The values were stored.
								 *
								 * @comm
								 * The values are stored as given; no validation is performed here.
								 ********************************************************************/
								STDMETHODIMP
								CITStdBreaker::SetLocaleInfo(DWORD dwCodePageID, LCID lcid)
								{
									if (!m_fInitialized)
										return (SetErrReturn(E_NOTOPEN));

									// Update both locale fields and the dirty flag as one unit.
									m_cs.Lock();
									m_brkctl.lcid = lcid;
									m_brkctl.dwCodePageID = dwCodePageID;
									m_fDirty = TRUE;
									m_cs.Unlock();

									return (S_OK);
								}


								/*****************************************************************
								 * @method    STDMETHODIMP | IWordBreakerConfig | GetLocaleInfo|
								 * Retrieves the locale information currently held by the breaker.
								 *
								 * @parm DWORD | *pdwCodePageID | Receives the ANSI code page no.
								 * @parm LCID | *plcid | Receives the Win32 locale identifier.
								 *
								 * @rvalue E_POINTER | Either output pointer is null.
								 * @rvalue E_NOTOPEN | The breaker is not initialized.
								 * @rvalue S_OK | The operation completed successfully.
								 *
								 ****************************************************************/
								STDMETHODIMP
								CITStdBreaker::GetLocaleInfo(DWORD *pdwCodePageID, LCID *plcid)
								{
									// Argument checks come before the state check.
									if (pdwCodePageID == NULL)
										return (SetErrReturn(E_POINTER));

									if (plcid == NULL)
										return (SetErrReturn(E_POINTER));

									if (!m_fInitialized)
										return (SetErrReturn(E_NOTOPEN));

									// Read both values under the lock so they form a consistent pair.
									m_cs.Lock();
									*plcid = m_brkctl.lcid;
									*pdwCodePageID = m_brkctl.dwCodePageID;
									m_cs.Unlock();

									return (S_OK);
								}


								/*****************************************************************
								 * @method    STDMETHODIMP | IWordBreakerConfig | SetBreakWordType|
								 * Sets the type of words the breaker should expect
								 * to see in all subsequent calls to IWordBreaker::BreakText, and
								 * marks the breaker dirty.
								 *
								 * @parm DWORD | dwBreakWordType | Specifies the type for break words.
								 *  Can be one of IITWBC_BREAKTYPE_TEXT, IITWBC_BREAKTYPE_NUMBER,
								 *  IITWBC_BREAKTYPE_DATE, IITWBC_BREAKTYPE_TIME, IITWBC_BREAKTYPE_EPOCH.
								 *
								 * @rvalue E_NOTOPEN | The breaker is not initialized.
								 * @rvalue E_INVALIDARG | Invalid break word type.
								 * @rvalue S_OK | The operation completed successfully.
								 *****************************************************************/
								STDMETHODIMP
								CITStdBreaker::SetBreakWordType(DWORD dwBreakWordType)
								{
									if (!m_fInitialized)
										return (SetErrReturn(E_NOTOPEN));

									// Only the five documented break types are accepted.
									const BOOL	fKnownType =
										(dwBreakWordType == IITWBC_BREAKTYPE_TEXT ||
										 dwBreakWordType == IITWBC_BREAKTYPE_NUMBER ||
										 dwBreakWordType == IITWBC_BREAKTYPE_DATE ||
										 dwBreakWordType == IITWBC_BREAKTYPE_TIME ||
										 dwBreakWordType == IITWBC_BREAKTYPE_EPOCH);

									if (!fKnownType)
										return (SetErrReturn(E_INVALIDARG));

									m_cs.Lock();
									m_brkctl.dwBreakWordType = dwBreakWordType;
									m_fDirty = TRUE;
									m_cs.Unlock();

									return (S_OK);
								}


								/*****************************************************************
								 * @method    STDMETHODIMP | IWordBreakerConfig | GetBreakWordType|
								 * Retrieves the type of words the breaker expects to see in
								 * calls to IWordBreaker::BreakText.
								 *
								 * @parm DWORD | *pdwBreakWordType | Receives the break word type, one
								 *  of IITWBC_BREAKTYPE_TEXT, IITWBC_BREAKTYPE_NUMBER,
								 *  IITWBC_BREAKTYPE_DATE, IITWBC_BREAKTYPE_TIME, IITWBC_BREAKTYPE_EPOCH.
								 *
								 * @rvalue E_POINTER | pdwBreakWordType is null.
								 * @rvalue E_NOTOPEN | The breaker is not initialized.
								 * @rvalue S_OK | The operation completed successfully.
								 *****************************************************************/
								STDMETHODIMP
								CITStdBreaker::GetBreakWordType(DWORD *pdwBreakWordType)
								{
									if (pdwBreakWordType == NULL)
										return (SetErrReturn(E_POINTER));

									if (!m_fInitialized)
										return (SetErrReturn(E_NOTOPEN));

									// Hand back the currently configured type.
									const DWORD	dwCurrentType = m_brkctl.dwBreakWordType;

									*pdwBreakWordType = dwCurrentType;
									return (S_OK);
								}


								/*****************************************************************
								 * @method    STDMETHODIMP | IWordBreakerConfig | SetControlInfo |
								 * Sets information that controls certain aspects of word breaking
								 * and marks the breaker dirty.
								 *
								 * @parm DWORD | grfBreakFlags | Control flags.  The only flag this
								 *    breaker accepts is IITWBC_BREAK_ACCEPT_WILDCARDS, to interpret
								 *    wild card characters as such.  Any other bit, including
								 *    IITWBC_BREAK_AND_STEM, is rejected.
								 * @parm DWORD | dwReserved |Reserved for future use; ignored.
								 *
								 * @rvalue E_NOTOPEN | The breaker is not initialized.
								 * @rvalue E_INVALIDARG | An unsupported control flag was passed.
								 * @rvalue S_OK | The operation completed successfully.
								 *****************************************************************/
								STDMETHODIMP
								CITStdBreaker::SetControlInfo(DWORD grfBreakFlags, DWORD dwReserved)
								{
									// The complete set of flags this breaker understands.
									const DWORD	grfFlagsSupported = IITWBC_BREAK_ACCEPT_WILDCARDS;

									if (!m_fInitialized)
										return (SetErrReturn(E_NOTOPEN));

									// Refuse the call if any bit outside the supported set is on.
									if ((grfBreakFlags & ~grfFlagsSupported) != 0)
										return (SetErrReturn(E_INVALIDARG));

									m_cs.Lock();
									m_brkctl.grfBreakFlags = grfBreakFlags;
									m_fDirty = TRUE;
									m_cs.Unlock();

									return (S_OK);
								}


								/*****************************************************************
								 * @method    STDMETHODIMP | IWordBreakerConfig | GetControlInfo |
								 * Retrieves the word breaker control flags.
								 *
								 * @parm DWORD | *pgrfBreakFlags | Receives the breaker control flags.
								 * @parm DWORD | *pdwReserved |Reserved for future use; neither read
								 *    nor written.
								 *
								 * @rvalue E_POINTER | pgrfBreakFlags is null.
								 * @rvalue E_NOTOPEN | The breaker is not initialized.
								 * @rvalue S_OK | The operation completed successfully.
								 *****************************************************************/
								STDMETHODIMP
								CITStdBreaker::GetControlInfo(DWORD *pgrfBreakFlags, DWORD *pdwReserved)
								{
									if (pgrfBreakFlags == NULL)
										return (SetErrReturn(E_POINTER));

									if (!m_fInitialized)
										return (SetErrReturn(E_NOTOPEN));

									// Hand back the currently configured flags.
									const DWORD	grfCurrentFlags = m_brkctl.grfBreakFlags;

									*pgrfBreakFlags = grfCurrentFlags;
									return (S_OK);
								}


								/*****************************************************************
								 * @method    STDMETHODIMP | IWordBreakerConfig | LoadExternalBreakerData |
								 * Loads word breaker data from an external source, such as a table
								 * containing char-by-char break information or a list of stop words.
								 *
								 * @parm IStream | *pStream | Pointer to external source of data.
								 * @parm DWORD | dwExtDataType | Specifies the type of data in the stream.
								 *
								 * @rvalue E_POINTER | pStream is null.
								 * @rvalue E_NOTOPEN | The breaker has not been initialized.
								 * @rvalue E_INVALIDARG | dwExtDataType is not a recognized data type.
								 * @rvalue S_OK | The operation completed successfully.
								 *
								 * @comm
								 * Although the format of the data in the stream is entirely
								 * implementation-specific, this interface does define a couple
								 * of general types for that data which can be passed in
								 * dwExtDataType:
								 *		IITWBC_EXTDATA_CHARTABLE
								 *		IITWBC_EXTDATA_STOPWORDLIST
								 *
								 * The breaker's current table or list is replaced only after the
								 * new one has loaded successfully; on failure the current one is
								 * kept and nothing newly allocated here is left behind.
								 *****************************************************************/
								STDMETHODIMP
								CITStdBreaker::LoadExternalBreakerData(IStream *pStream, DWORD dwExtDataType)
								{
									HRESULT	hr;
									HFPB	hfpb;
									LPCTAB	lpctab;
									LPSIPB	lpsipb;

									if (pStream == NULL)
										return (SetErrReturn(E_POINTER));

									if (!m_fInitialized)
										return (SetErrReturn(E_NOTOPEN));

									m_cs.Lock();

									// Wrap the stream so the MV loaders can read from it.  On failure
									// FpbFromHf reports the reason through hr.
									if ((hfpb = FpbFromHf((HF) pStream, &hr)) != NULL)
									{
										switch (dwExtDataType)
										{
											case IITWBC_EXTDATA_CHARTABLE:

												// Load the external character table.
												lpctab = MVCharTableLoad(hfpb, NULL, &hr);

												if (SUCCEEDED(hr))
												{
													ITASSERT(lpctab != NULL);
													m_fDirty = TRUE;
													m_grfPersistedItems |= ITSTDBRK_PERSISTED_CHARTABLE;

													// Keep wildcard handling consistent with what
													// Init set up for the previous table.
													if (m_fQueryContext)
														MVCharTableSetWildcards(lpctab);

													// Dispose of any pre-existing char table.
													MVCharTableDispose(m_lpctab);
													m_lpctab = lpctab;
												}
												break;

											case IITWBC_EXTDATA_STOPWORDLIST:
												// We should at least have an internal default char table.
												ITASSERT(m_lpctab != NULL);

												// Init the in-memory stop word list and load the external
												// list.
												if ((lpsipb = MVStopListInitiate(ITSTDBRK_STOPHASH_SIZE,
																							&hr)) != NULL)
												{
													hr = MVStopListLoad(hfpb, lpsipb, NULL,
																			FBreakWords, m_lpctab);

													if (SUCCEEDED(hr))
													{
														m_fDirty = TRUE;
														m_grfPersistedItems |=
																	ITSTDBRK_PERSISTED_STOPWORDLIST;

														// Swap the new list in for the old one.
														MVStopListDispose(m_lpsipb);
														m_lpsipb = lpsipb;
													}
													else
													{
														// The load failed: release the list we just
														// created so it is not leaked.  The current
														// list stays in place.
														MVStopListDispose(lpsipb);
													}
												}
												break;

											default:
												hr = E_INVALIDARG;
												break;
										};

										FreeHfpb(hfpb);
									}

									m_cs.Unlock();

									return (hr);
								}


								/*****************************************************************
								 * @method    STDMETHODIMP | IWordBreakerConfig | SetWordStemmer |
								 * Allows you to associate a stemmer with the word breaker, or to
								 * remove the current association by passing a NULL stemmer.
								 *
								 * @parm REFCLSID | rclsid | Class identifier for the stemmer.
								 *    Recorded only when pStemmer is non-NULL.
								 * @parm IStemmer | *pStemmer | Pointer to the stemmer, or NULL.
								 *
								 * @rvalue E_NOTOPEN | The breaker has not been initialized.
								 * @rvalue S_OK | The operation completed successfully.
								 *
								 * @comm
								 * The 	breaker takes responsibility for calling IPersistStreamInit::Load/Save
								 * when it is loaded/saved if the stemmer supports that interface.
								 * The breaker holds its own reference on the stemmer.  The breaker
								 * is marked dirty whenever a stemmer is attached or an existing one
								 * is removed.
								 *****************************************************************/
								STDMETHODIMP
								CITStdBreaker::SetWordStemmer(REFCLSID rclsid, IStemmer *pStemmer)
								{
									if (!m_fInitialized)
										return (SetErrReturn(E_NOTOPEN));

									m_cs.Lock();

									IStemmer	*pistemOld = m_pistem;

									// Take our reference on the new stemmer BEFORE dropping the old
									// one, so that re-setting the stemmer we already hold can never
									// release it out from under us.
									if (pStemmer != NULL)
									{
										pStemmer->AddRef();

										ITASSERT(rclsid != GUID_NULL);
										m_clsidStemmer = rclsid;
									}

									m_pistem = pStemmer;

									if (pistemOld != NULL)
										pistemOld->Release();

									// Attaching a stemmer, or removing one that was attached, changes
									// what gets persisted.
									if (pStemmer != NULL || pistemOld != NULL)
										m_fDirty = TRUE;

									SetGrfFlag(&m_grfPersistedItems,
												ITSTDBRK_PERSISTED_STEMMER, m_pistem != NULL);

									m_cs.Unlock();

									return (S_OK);
								}


								/*****************************************************************
								 * @method    STDMETHODIMP | IWordBreakerConfig | GetWordStemmer |
								 * Returns the stemmer associated with the word breaker, if any.
								 *
								 * @parm IStemmer | **ppStemmer | Receives the stemmer pointer, with a
								 *    reference added for the caller, or NULL when none is attached.
								 *
								 * @rvalue E_POINTER | ppStemmer is NULL.
								 * @rvalue E_NOTOPEN | The breaker has not been initialized.
								 * @rvalue S_OK | A stemmer is attached and was returned.
								 * @rvalue S_FALSE | No stemmer is attached.
								 *
								 * @comm
								 * The 	breaker takes responsibility for calling IPersistStreamInit::Load/Save
								 * when it is loaded/saved if the stemmer supports that interface.
								 *****************************************************************/
								STDMETHODIMP
								CITStdBreaker::GetWordStemmer(IStemmer **ppStemmer)
								{
									if (ppStemmer == NULL)
										return (SetErrReturn(E_POINTER));

									if (!m_fInitialized)
										return (SetErrReturn(E_NOTOPEN));

									// Hand out the pointer; the caller gets its own reference.
									*ppStemmer = m_pistem;

									if (*ppStemmer != NULL)
										m_pistem->AddRef();

									if (m_pistem == NULL)
										return (S_FALSE);

									return (S_OK);
								}


								//---------------------------------------------------------------------------

								//						IITStopWordList Method Implementations

								//---------------------------------------------------------------------------


								/*****************************************************************
								 * @method    STDMETHODIMP | IITStopWordList | AddWord |
								 * Adds a word to the stop word list.
								 *
								 * @parm WCHAR const | *pwcInBuf | Pointer to the input buffer.
								 * @parm ULONG | cwc | Length of word (count of wide characters).
								 *
								 * @comm
								 * Thin wrapper: the work is done by StopListOp, called with its
								 * last argument TRUE, and its result is returned unchanged.
								 *****************************************************************/
								STDMETHODIMP
								CITStdBreaker::AddWord(WCHAR const *pwcInBuf, ULONG cwc)
								{
									const BOOL	fAddWord = TRUE;	// AddWord passes TRUE, LookupWord FALSE

									return (StopListOp(pwcInBuf, cwc, fAddWord));
								}


								/*****************************************************************
								 * @method    STDMETHODIMP | IITStopWordList | LookupWord |
								 * Looks up a word in the stop word list.
								 *
								 * @parm WCHAR const | *pwcInBuf | Pointer to the input buffer.
								 * @parm ULONG | cwc | Length of word (count of wide characters).
								 *
								 * @comm
								 * Thin wrapper: the work is done by StopListOp, called with its
								 * last argument FALSE, and its result is returned unchanged.
								 *****************************************************************/
								STDMETHODIMP
								CITStdBreaker::LookupWord(WCHAR const *pwcInBuf, ULONG cwc)
								{
									const BOOL	fAddWord = FALSE;	// AddWord passes TRUE, LookupWord FALSE

									return (StopListOp(pwcInBuf, cwc, fAddWord));
								}


								//---------------------------------------------------------------------------

								//						IPersistStreamInit Method Implementations

								//---------------------------------------------------------------------------


								/*****************************************************************
								 * @method    STDMETHODIMP | IPersistStreamInit | GetClassID |
								 * Reports the class identifier of this breaker.
								 *
								 * @parm CLSID | *pclsid | Receives CLSID_ITStdBreaker.
								 *
								 * @rvalue E_POINTER | pclsid is NULL.
								 * @rvalue S_OK | The operation completed successfully.
								 *****************************************************************/
								STDMETHODIMP
								CITStdBreaker::GetClassID(CLSID *pclsid)
								{
									if (pclsid != NULL)
									{
										*pclsid = CLSID_ITStdBreaker;
										return (S_OK);
									}

									return (SetErrReturn(E_POINTER));
								}


								/*****************************************************************
								 * @method    STDMETHODIMP | IPersistStreamInit | IsDirty |
								 * Reports whether the breaker's dirty flag is set.
								 *
								 * @rvalue E_NOTOPEN | The breaker has not been initialized.
								 * @rvalue S_OK | The dirty flag is set.
								 * @rvalue S_FALSE | The dirty flag is clear.
								 *****************************************************************/
								STDMETHODIMP
								CITStdBreaker::IsDirty(void)
								{
									if (!m_fInitialized)
										return (SetErrReturn(E_NOTOPEN));

									if (m_fDirty)
										return (S_OK);

									return (S_FALSE);
								}


								/*****************************************************************
								 * @method    STDMETHODIMP | IPersistStreamInit | Load |
								 * Restores the breaker's persistent state from a stream.
								 *
								 * @parm IStream | *pStream | Stream to read from.
								 *
								 * @rvalue E_POINTER | pStream is NULL.
								 * @rvalue E_ALREADYOPEN | The breaker has already been initialized.
								 * @rvalue E_BADFORMAT | A fixed-size item could not be read in full.
								 * @rvalue E_BADVERSION | The stream's version is not VERSION_STDBRKR.
								 * @rvalue E_UNEXPECTED | Items are flagged as persisted but the
								 *    BRKCTL flag is missing.
								 * @rvalue S_OK | The operation completed successfully.
								 *
								 * @comm
								 * Stream layout as read here: DWORD version, DWORD persisted-item
								 * flags, then (when any flag is set) the BRKCTL block, the optional
								 * character table, the optional stop word list, and the optional
								 * stemmer CLSID followed by the stemmer's own data.  When no flag
								 * is set the breaker is initialized through InitNew instead.  On
								 * any failure after the initialized check, Close is called to
								 * discard whatever was loaded.
								 *****************************************************************/
								STDMETHODIMP
								CITStdBreaker::Load(IStream *pStream)
								{
									HRESULT	hr;
									DWORD	dwVersion;
									DWORD	grfPersistedItems;
									DWORD	cbRead;

									if (pStream == NULL)
										return (SetErrReturn(E_POINTER));

									// Lock before checking m_fInitialized to make sure we don't compete
									// with a call to ::InitNew.
									m_cs.Lock();

									if (m_fInitialized)
									{
										// Don't leave with the critical section still held.
										m_cs.Unlock();
										return (SetErrReturn(E_ALREADYOPEN));
									}

									// Read and validate the version, then read the persisted-item flags.
									if (SUCCEEDED(hr = pStream->Read((LPVOID) &dwVersion, sizeof(DWORD),
																								&cbRead)) &&
										SUCCEEDED(hr = ((cbRead == sizeof(DWORD)) ? S_OK : E_BADFORMAT)) &&
										SUCCEEDED(hr = ((dwVersion == VERSION_STDBRKR) ? S_OK :
																							E_BADVERSION)) &&
										SUCCEEDED(hr = pStream->Read((LPVOID) &grfPersistedItems,
																					sizeof(DWORD), &cbRead)) &&
										SUCCEEDED(hr = ((cbRead == sizeof(DWORD)) ? S_OK : E_BADFORMAT)))
									{
										if (grfPersistedItems != 0)
										{
											HFPB	hfpb = NULL;

											if ((grfPersistedItems & ITSTDBRK_PERSISTED_BRKCTL) != 0)
											{
												if (SUCCEEDED(hr =
														pStream->Read((LPVOID) &m_brkctl, sizeof(BRKCTL), &cbRead)))
													hr = ((cbRead == sizeof(BRKCTL)) ? S_OK : E_BADFORMAT);
											}
											else
											{
												// We have an inconsistent persistent state.  The only way
												// we should have no BRKCTL is if we have no persistent
												// state at all (except for version number and persistent
												// flags which we've already loaded).
												ITASSERT(FALSE);
												hr = E_UNEXPECTED;
											}

											if (SUCCEEDED(hr) &&
												(hfpb = FpbFromHf((HF) pStream, &hr)) != NULL)
											{
												// Load the character table if one is there; otherwise just
												// use the internal default table.
												if ((grfPersistedItems & ITSTDBRK_PERSISTED_CHARTABLE) != 0)
													m_lpctab = MVCharTableIndexLoad(hfpb, NULL, &hr);
												else
													m_lpctab = MVCharTableGetDefault(&hr);
											}

											if (SUCCEEDED(hr) &&
												(grfPersistedItems & ITSTDBRK_PERSISTED_STOPWORDLIST) != 0)
											{
												// Load the stop word list.
												if ((m_lpsipb =	MVStopListInitiate(ITSTDBRK_STOPHASH_SIZE,
																								&hr)) != NULL)
													hr = MVStopListIndexLoad(hfpb, m_lpsipb, NULL);
											}

											if (hfpb != NULL)
												FreeHfpb(hfpb);

											if (SUCCEEDED(hr) &&
												(grfPersistedItems & ITSTDBRK_PERSISTED_STEMMER) != 0)
											{
												IPersistStreamInit	*pipstmi;

												ITASSERT(m_pistem == NULL);

												// Instantiate and load the stemmer if it
												// implements IPersistStreamInit.  A stemmer without
												// that interface is simply left as created.
												if (SUCCEEDED(hr = ReadClassStm(pStream, &m_clsidStemmer)) &&
													SUCCEEDED(hr = CoCreateInstance(m_clsidStemmer, NULL,
																					CLSCTX_INPROC_SERVER,
																		IID_IStemmer, (LPVOID *)&m_pistem)) &&
													SUCCEEDED(m_pistem->QueryInterface(IID_IPersistStreamInit,
																							(LPVOID *)&pipstmi)))
												{
													hr = pipstmi->Load(pStream);
													pipstmi->Release();
												}
											}
										}
										else
										{
											// If there were no persisted items (we released one beta
											// version without pluggable breakers where we had dummy
											// instance data where this was true) then we should just
											// behave like we're being created anew.
											hr = InitNew();
										}
									}

									if (SUCCEEDED(hr))
									{
										// We don't want to assign an incorrect grfPersistedItems if
										// we ended up calling InitNew.
										if (!m_fInitialized)
										{
											m_grfPersistedItems = grfPersistedItems;
											m_fInitialized = TRUE;
										}
									}
									else
										// Free any persisted items which may have been loaded
										// successfully.
										Close();

									m_cs.Unlock();
									return (hr);
								}


								// CITStdBreaker::Save
								//
								// Writes the breaker's persistent state to pStream in this order:
								//   1. DWORD version (VERSION_STDBRKR)
								//   2. DWORD m_grfPersistedItems
								//   3. BRKCTL struct             (if ITSTDBRK_PERSISTED_BRKCTL is set)
								//   4. character table           (if ITSTDBRK_PERSISTED_CHARTABLE is set)
								//   5. stop word list            (if ITSTDBRK_PERSISTED_STOPWORDLIST is set)
								//   6. stemmer CLSID + stemmer   (if ITSTDBRK_PERSISTED_STEMMER is set)
								//
								// Parameters:
								//   pStream     - destination stream; must not be NULL.
								//   fClearDirty - if TRUE and the save succeeds, m_fDirty is cleared.  The
								//                 flag is also forwarded to the stemmer's own Save.
								//
								// Returns:
								//   E_POINTER    if pStream is NULL.
								//   E_NOTOPEN    if the breaker has not been initialized.
								//   E_UNEXPECTED if a persisted-item flag is set but the corresponding
								//                object (char table, stop list, stemmer) is missing.
								//   Otherwise the first failing HRESULT from the stream or helper calls,
								//   or the result of the last write on success.
								STDMETHODIMP
								CITStdBreaker::Save(IStream *pStream, BOOL fClearDirty)
								{
									HRESULT	hr;
									DWORD	dwVersion;
									DWORD	cbWritten;

									if (pStream == NULL)
										return (SetErrReturn(E_POINTER));

									if (!m_fInitialized)
										return (SetErrReturn(E_NOTOPEN));

									m_cs.Lock();

									// Header: version followed by the flags describing what follows.
									dwVersion = VERSION_STDBRKR;
									if (SUCCEEDED(hr = pStream->Write((LPVOID) &dwVersion, sizeof(DWORD),
																								&cbWritten)) &&
										SUCCEEDED(hr = pStream->Write((LPVOID) &m_grfPersistedItems,
																				sizeof(DWORD), &cbWritten)))
									{
										HFPB	hfpb = NULL;

										if ((m_grfPersistedItems & ITSTDBRK_PERSISTED_BRKCTL) != 0)
											hr = pStream->Write((LPVOID) &m_brkctl, sizeof(BRKCTL), &cbWritten);
										else
										{
											// We should always be writing the BRKCTL structure.  We still
											// continue here, but note that Load only accepts a stream
											// without BRKCTL when no persisted-item flags are set at all
											// (it then falls back to InitNew); with any other flag set and
											// BRKCTL missing, Load fails with E_UNEXPECTED.
											ITASSERT(FALSE);
										}

										// The file-parameter-block wrapper for the stream is obtained
										// whenever the preceding write succeeded; it is shared by the
										// char table and stop word list writers below.
										if (SUCCEEDED(hr) &&
											(hfpb = FpbFromHf((HF) pStream, &hr)) != NULL &&
											(m_grfPersistedItems & ITSTDBRK_PERSISTED_CHARTABLE) != 0)
										{
											// Save char table.
											if (m_lpctab != NULL)
												hr = MVCharTableFileBuild(hfpb, m_lpctab, NULL);
											else
											{
												ITASSERT(FALSE);
												hr = E_UNEXPECTED;
											}
										}

										if (SUCCEEDED(hr) &&
											(m_grfPersistedItems & ITSTDBRK_PERSISTED_STOPWORDLIST) != 0)
										{
											// Save stop word list.
											if (m_lpsipb != NULL)
												hr = MVStopFileBuild(hfpb, m_lpsipb, NULL);
											else
											{
												ITASSERT(FALSE);
												hr = E_UNEXPECTED;
											}
										}

										if (hfpb != NULL)
											FreeHfpb(hfpb);

										if (SUCCEEDED(hr) &&
											(m_grfPersistedItems & ITSTDBRK_PERSISTED_STEMMER) != 0)
										{
											IPersistStreamInit	*pipstmi;

											ITASSERT(m_pistem != NULL);

											// Fail cleanly instead of dereferencing a NULL stemmer when
											// the assert is compiled out; this mirrors the handling of a
											// missing char table / stop word list above.
											if (m_pistem == NULL)
												hr = E_UNEXPECTED;

											// Write the stemmer's CLSID and save the stemmer if it
											// implements IPersistStreamInit.  A stemmer without that
											// interface is not an error: only its CLSID is persisted.
											else if (SUCCEEDED(hr = WriteClassStm(pStream, m_clsidStemmer)) &&
												SUCCEEDED(m_pistem->QueryInterface(IID_IPersistStreamInit,
																					(LPVOID *) &pipstmi)))
											{
												hr = pipstmi->Save(pStream, fClearDirty);
												pipstmi->Release();
											}
										}
									}

									if (SUCCEEDED(hr) && fClearDirty)
										m_fDirty = FALSE;

									m_cs.Unlock();

									return (hr);
								}


								// CITStdBreaker::GetSizeMax
								//
								// Size estimation for the persisted state is not supported; the
								// output parameter is never written and E_NOTIMPL is always returned.
								STDMETHODIMP
								CITStdBreaker::GetSizeMax(ULARGE_INTEGER *pcbSizeMax)
								{
									return E_NOTIMPL;
								}


								// CITStdBreaker::InitNew
								//
								// Puts a not-yet-initialized breaker into its default state: default
								// break control settings, the default character table, and an empty
								// stop word list.  On success the object is marked initialized and dirty.
								//
								// Returns:
								//   E_ALREADYOPEN if the breaker is already initialized.
								//   Otherwise the HRESULT reported by MVCharTableGetDefault or
								//   MVStopListInitiate; on failure Close() is called to discard anything
								//   that was set up.
								STDMETHODIMP
								CITStdBreaker::InitNew(void)
								{
									HRESULT	hr = S_OK;

									// Lock before checking m_fInitialized to make sure we don't compete
									// with a call to ::Load.
									m_cs.Lock();

									if (m_fInitialized)
									{
										// Release the lock taken above before bailing out; returning
										// while still holding it would leave m_cs owned by this thread.
										m_cs.Unlock();
										return (SetErrReturn(E_ALREADYOPEN));
									}

									InitBrkCtl();
									m_grfPersistedItems |= ITSTDBRK_PERSISTED_BRKCTL;

									// Get the default char table in case we're never asked to load an
									// external one.  If we do load an external one, we'll properly
									// discard this one.  We don't set the persisted flag for the
									// char table because we don't need to persist the internal default.
									m_lpctab = MVCharTableGetDefault(&hr);

									// Initialize the stop word list so that stop words can be added
									// programmatically if a client desires.
									if (SUCCEEDED(hr))
										m_lpsipb = MVStopListInitiate(ITSTDBRK_STOPHASH_SIZE, &hr);

									if (SUCCEEDED(hr))
										m_fInitialized = m_fDirty = TRUE;
									else
										Close();

									m_cs.Unlock();
									return (hr);
								}


								//---------------------------------------------------------------------------

								//						Private Method Implementations

								//---------------------------------------------------------------------------


								// CITStdBreaker::StopListOp
								//
								// Converts a Unicode word to the breaker's code page
								// (m_brkctl.dwCodePageID), stores it in the shared Ansi scratch buffer
								// as a WORD byte-count prefix followed by the converted bytes, and then
								// either adds it to or looks it up in the stop word list.
								//
								// Parameters:
								//   pwcInBuf - Unicode word; must not be NULL.
								//   cwc      - number of WCHARs in pwcInBuf.
								//   fAddWord - TRUE to add the word (MVStopListAddWord), FALSE to look
								//              it up (MVStopListLookup).
								//
								// Returns:
								//   E_POINTER    if pwcInBuf is NULL.
								//   E_NOTOPEN    if the breaker has not been initialized.
								//   E_NOTINIT    if there is no stop word list.
								//   E_UNEXPECTED if the conversion produces no bytes or more bytes than
								//                the WORD length prefix can describe.
								//   Otherwise the result of ReallocBuffer or of the stop list call.
								HRESULT
								CITStdBreaker::StopListOp(WCHAR const *pwcInBuf, ULONG cwc, BOOL fAddWord)
								{
									HRESULT	hr;
									DWORD	cbAnsi;

									if (pwcInBuf == NULL)
										return (E_POINTER);

									if (!m_fInitialized)
										return (SetErrReturn(E_NOTOPEN));

									if (m_lpsipb == NULL)
										return (SetErrReturn(E_NOTINIT));

									m_cs.Lock();

									// Room for two bytes per input character plus the length prefix.
									cbAnsi = (sizeof(WCHAR) * cwc) + sizeof(WORD);

									if (SUCCEEDED(hr =
											ReallocBuffer(&m_hmemAnsi, &m_cbBufAnsiCur, cbAnsi)))
									{
										char	*lpchBuf;
										int		cbConverted;

										lpchBuf = (char *) _GLOBALLOCK(m_hmemAnsi);

										// Convert past the space reserved for the length prefix.
										cbConverted =
											WideCharToMultiByte(m_brkctl.dwCodePageID, 0, pwcInBuf, cwc,
																lpchBuf + sizeof(WORD), cbAnsi - sizeof(WORD),
																							NULL, NULL);

										// Validate the byte count BEFORE narrowing it to the WORD prefix.
										// Casting first would silently wrap counts above 0xFFFF and the
										// stop list would then see a truncated (wrong) word.
										if (cbConverted > 0 && cbConverted <= 0xFFFF)
										{
											*((WORD *)lpchBuf) = (WORD) cbConverted;

											if (fAddWord)
												hr = MVStopListAddWord(m_lpsipb, (LPBYTE)lpchBuf);
											else
												hr = MVStopListLookup(m_lpsipb, (LPBYTE)lpchBuf);
										}
										else
											hr = E_UNEXPECTED;

										_GLOBALUNLOCK(m_hmemAnsi);
									}

									m_cs.Unlock();

									return (hr);
								}


								// CITStdBreaker::ReallocBuffer
								//
								// Thin wrapper over ReallocBufferHmem that runs under the object's
								// critical section and never requests fewer than cbAnsiBufInit bytes.
								//
								// Parameters:
								//   phmemBuf  - in/out handle of the buffer, passed through unchanged.
								//   pcbBufCur - in/out current buffer size, passed through unchanged.
								//   cbBufNew  - size the caller needs; the larger of this value and
								//               cbAnsiBufInit is what gets requested.
								//
								// Returns whatever ReallocBufferHmem returns.
								HRESULT
								CITStdBreaker::ReallocBuffer(HGLOBAL *phmemBuf, DWORD *pcbBufCur, DWORD cbBufNew)
								{
									HRESULT hrRealloc;

									m_cs.Lock();
									hrRealloc = ReallocBufferHmem(phmemBuf, pcbBufCur,
																	max(cbBufNew, cbAnsiBufInit));
									m_cs.Unlock();

									return hrRealloc;
								}


								// CITStdBreaker::ClearMembers
								//
								// Resets the break control block, state flags, persisted-item flags,
								// char table / stop list pointers and the stemmer CLSID to their empty
								// values.  Nothing is freed here; the pointers are simply overwritten.
								// m_pistem, m_hmemAnsi and m_cbBufAnsiCur are not touched by this
								// routine.
								void
								CITStdBreaker::ClearMembers(void)
								{
									// Pointers and identifiers.
									m_lpsipb = NULL;
									m_lpctab = NULL;
									m_clsidStemmer = GUID_NULL;

									// Flags.
									m_grfPersistedItems = 0;
									m_fQueryContext = FALSE;
									m_fDirty = FALSE;
									m_fInitialized = FALSE;

									// Break control settings.
									MEMSET(&m_brkctl, 0, sizeof(BRKCTL));
								}


								void

								CITStdBreaker::InitBrkCtl(void)

								{

									m_brkctl.dwCodePageID = GetACP();

									m_brkctl.lcid = GetUserDefaultLCID();

									m_brkctl.dwBreakWordType = IITWBC_BREAKTYPE_TEXT;

									m_brkctl.grfBreakFlags = 0;

								}


								// CITStdBreaker::Close
								//
								// Releases everything the breaker owns and returns it to the
								// uninitialized state, all under the object's critical section:
								// frees the Ansi scratch buffer, releases the stemmer, disposes the
								// char table and stop word list, then calls ClearMembers().
								void
								CITStdBreaker::Close(void)
								{
									m_cs.Lock();

									// Scratch buffer used for Unicode -> MBCS conversion.
									if (NULL != m_hmemAnsi)
									{
										_GLOBALFREE(m_hmemAnsi);
										m_cbBufAnsiCur = 0;
										m_hmemAnsi = NULL;
									}

									// Stemmer instance, if one is held.
									if (NULL != m_pistem)
									{
										m_pistem->Release();
										m_pistem = NULL;
									}

									// NOTE(review): both dispose routines are called unconditionally, so
									// they presumably accept NULL -- confirm against their definitions.
									MVCharTableDispose(m_lpctab);
									MVStopListDispose(m_lpsipb);

									// Overwrite the now-stale pointers and reset all flags.
									ClearMembers();

									m_cs.Unlock();
								}


								//---------------------------------------------------------------------------

								//								Utility Functions

								//---------------------------------------------------------------------------


								//	(6/19/97): BillA, JohnRush, and MikkyA all agreed that we would stop storing

								//	offset and length information in the index because the new HTML-based

								//	display engines don't allow our clients to find words using that information

								//	anyway.

								//

								//	However, the above decision doesn't eliminate the need to accurately

								//	correlate offsets into the MBCS text buffer with offsets into the original

								//	Unicode buffer.  This is needed by the query parsing code at runtime.

								//	The method for achieving offset correlation is simple: call

								//	MultiByteToWideChar on the MBCS text buffer up to dwWordOffset to get

								//	back the equivalent Unicode offset which we will pass to the word sink.

								//

								//	NOTE: The above method will work as long as the breaker code is using

								//	the same lead byte table as the system conversion function.  For now,

								//	our clients will be responsible for making sure the character table

								//	is consistent with the system's lead byte table.  In the future, we

								//	probably should make the breaker explicitly set the lead bytes in the

								//	character table using the system's lead byte table.

								//

								//	In the case of single byte characters, the offset and length information

								//	automatically correlates between MBCS and Unicode because it is essentially

								//	stated in characters, not bytes.

								//

								// StdBreakerWordFunc
								//
								// Word callback: receives a raw word and its normalized form, both as
								// MBCS strings prefixed with a WORD byte count, converts each to
								// Unicode, and forwards them to the word sink carried in lpvUser --
								// first the normalized word via PutAltWord, then the raw word via
								// PutWord.
								//
								// Parameters:
								//   lstRawWord   - length-prefixed raw word; must not be NULL.
								//   lstNormWord  - length-prefixed normalized word; must not be NULL.
								//   dwWordOffset - byte offset of the word within the MBCS text buffer
								//                  (pwrdfnpm->lpbBuf); translated to a Unicode offset.
								//   lpvUser      - WRDFNPM* holding the code page, text buffer, Unicode
								//                  scratch buffer and word sink; must not be NULL.
								//
								// Returns:
								//   E_POINTER    if any pointer argument is NULL.
								//   E_UNEXPECTED if a conversion yields no characters or there is no sink.
								//   Otherwise the failure from ReallocBufferHmem or the sink's result.
								//
								// Note: the source length handed to the sink is the raw word's MBCS
								// byte count, not a converted character count.
								HRESULT FAR PASCAL StdBreakerWordFunc(LST lstRawWord, LST lstNormWord,
																		DWORD dwWordOffset, LPVOID lpvUser)
								{
									if (lstRawWord == NULL || lstNormWord == NULL || lpvUser == NULL)
										return (E_POINTER);

									WRDFNPM	*pParams = (WRDFNPM *) lpvUser;
									HRESULT	hr;
									DWORD	iwchOffset = dwWordOffset;

									// Byte counts taken from the length prefixes.  Each Unicode buffer is
									// sized to one WCHAR per MBCS byte, since we don't know how many, if
									// any, DBCS characters the string contains.
									const DWORD	cbNorm = (DWORD)(*((WORD *)lstNormWord));
									const DWORD	cbRaw = (DWORD)(*((WORD *)lstRawWord));

									//
									// Pass 1: normalized word -> PutAltWord.
									//
									hr = ReallocBufferHmem(&pParams->hmemUnicode,
														   &pParams->cbBufUnicodeCur,
														   sizeof(WCHAR) * cbNorm);
									if (SUCCEEDED(hr))
									{
										WCHAR	*pwchOut = (WCHAR *) _GLOBALLOCK(pParams->hmemUnicode);

										// Translate the MBCS byte offset into a Unicode character
										// offset.  The output size is 0, so this only counts; pwchOut is
										// passed merely as a valid placeholder pointer.
										iwchOffset = MultiByteToWideChar(pParams->dwCodePageID, NULL,
																	(LPCSTR) pParams->lpbBuf, dwWordOffset,
																	pwchOut, 0);

										DWORD	cwchNorm = MultiByteToWideChar(pParams->dwCodePageID, NULL,
																	(LPCSTR) &lstNormWord[sizeof(WORD)],
																	cbNorm, pwchOut, cbNorm);

										if (cwchNorm > 0 && pParams->piwrdsnk != NULL)
										{
											hr = pParams->piwrdsnk->PutAltWord(pwchOut, cwchNorm, cbRaw,
																							iwchOffset);
										}
										else
										{
											hr = E_UNEXPECTED;
										}

										_GLOBALUNLOCK(pParams->hmemUnicode);
									}

									//
									// Pass 2: raw word -> PutWord.  Skipped if pass 1 failed.
									//
									if (SUCCEEDED(hr))
									{
										hr = ReallocBufferHmem(&pParams->hmemUnicode,
															   &pParams->cbBufUnicodeCur,
															   sizeof(WCHAR) * cbRaw);
									}

									if (SUCCEEDED(hr))
									{
										WCHAR	*pwchOut = (WCHAR *) _GLOBALLOCK(pParams->hmemUnicode);

										DWORD	cwchRawWord = MultiByteToWideChar(pParams->dwCodePageID, NULL,
																	(LPCSTR) &lstRawWord[sizeof(WORD)],
																	cbRaw, pwchOut, cbRaw);

										if (cwchRawWord > 0 && pParams->piwrdsnk != NULL)
										{
											hr = pParams->piwrdsnk->PutWord(pwchOut, cwchRawWord, cbRaw,
																							iwchOffset);
										}
										else
										{
											hr = E_UNEXPECTED;
										}

										_GLOBALUNLOCK(pParams->hmemUnicode);
									}

									return (hr);
								}