Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1279 lines
36 KiB

  1. /*************************************************************************
  2. * @doc SHROOM EXTERNAL API *
  3. * *
  4. * STDBRKR.CPP *
  5. * *
  6. * Copyright (C) Microsoft Corporation 1997 *
  7. * All Rights reserved. *
  8. * *
  9. * This file contains the implementation of CITStdBreaker methods. *
  10. * CITStdBreaker is a pluggable word breaker object that can optionally *
  11. * use a character class table and stop word list during its breaking *
  12. * operations. Although all the word breaking interface methods *
  13. * that accept text require it to be Unicode, CITStdBreaker still only *
  14. * supports MBCS internally. *
  15. * *
  16. **************************************************************************
  17. * *
  18. * Written By : Bill Aloof *
  19. * Current Owner: billa *
  20. * *
  21. **************************************************************************/
  22. #include <mvopsys.h>
  23. #ifdef _DEBUG
  24. static char s_aszModule[] = __FILE__; /* For error report */
  25. #endif
  26. #ifdef IA64
  27. #include <itdfguid.h>
  28. #endif
  29. #include <atlinc.h> // includes for ATL.
  30. #include <_mvutil.h>
  31. #include <mem.h>
  32. #include <orkin.h>
  33. #include <mvsearch.h>
  34. #include "common.h"
  35. #include <iterror.h>
  36. #include <itwbrk.h>
  37. #include <itwbrkid.h>
  38. #include "stdbrkr.h"
  39. HRESULT FAR PASCAL StdBreakerWordFunc(LST lstRawWord, LST lstNormWord,
  40. DWORD dwWordOffset, LPVOID lpvUser);
  41. //---------------------------------------------------------------------------
  42. // Constructor and Destructor
  43. //---------------------------------------------------------------------------
  44. CITStdBreaker::CITStdBreaker()
  45. {
  46. ClearMembers();
  47. m_hmemAnsi = NULL;
  48. m_cbBufAnsiCur = 0;
  49. m_pistem = NULL;
  50. }
  51. CITStdBreaker::~CITStdBreaker()
  52. {
  53. Close();
  54. }
  55. //---------------------------------------------------------------------------
  56. // IWordBreaker Method Implementations
  57. //---------------------------------------------------------------------------
  58. /********************************************************************
  59. * @method STDMETHODIMP | IWordBreaker | Init |
  60. * Gives the breaker object a chance to initialize itself beyond
  61. * what it did during IPersistStreamInit::InitNew or ::Load.
  62. * @parm BOOL | fQuery | TRUE means breaker context is query processing
  63. * @parm ULONG | ulMaxTokenSize | Max term length requested by caller
  64. * @parm BOOL* | pfLicense | Whether the breaker is subject to a license
  65. *
  66. * @rvalue E_POINTER | pfLicense was NULL
  67. *
  68. ********************************************************************/
  69. STDMETHODIMP
  70. CITStdBreaker::Init(BOOL fQuery, ULONG ulMaxTokenSize, BOOL *pfLicense)
  71. {
  72. HRESULT hr = S_OK;
  73. // NOTE: We don't check m_fInitialized here because we consider ourselves
  74. // adequately initialized once IPersistStreamInit::InitNew or ::Load
  75. // has been called.
  76. if (pfLicense == NULL)
  77. return (SetErrReturn(E_POINTER));
  78. // If we haven't been initialized yet (i.e. no call was made to either
  79. // IPersistStreamInit::InitNew or Load), we'll initialize ourselves now.
  80. // This allows Tripoli clients to use us without any code changes on their
  81. // part.
  82. if (!m_fInitialized)
  83. hr = InitNew();
  84. if (SUCCEEDED(hr) && m_pistem != NULL)
  85. hr = m_pistem->Init(ulMaxTokenSize, pfLicense);
  86. if (SUCCEEDED(hr))
  87. {
  88. if (m_fQueryContext = fQuery)
  89. MVCharTableSetWildcards(m_lpctab);
  90. // We set *pfLicense only if the stemmer didn't.
  91. if (m_pistem == NULL)
  92. *pfLicense = FALSE;
  93. }
  94. // NOTE: We don't support caller-specified internal truncation of terms
  95. // based on ulMaxTokenSize. The breaker routines have a hard-coded
  96. // maximum of CB_MAX_WORD_LEN. This is OK since the word sink is supposed
  97. // to be prepared to have to truncate anyway.
  98. return (hr);
  99. }
  100. /********************************************************************
  101. * @method STDMETHODIMP | IWordBreaker | BreakText |
  102. * Parses text to find both individual tokens and noun phrases, then
  103. * calls methods of IWordSink and IPhraseSink with the results.
  104. *
  105. * @parm TEXT_SOURCE | *pTextSource | Source of the UniCode text.
  106. * @parm IWordSink | *pWordSink | Pointer to the word sink.
  107. * @parm IPhraseSink | *pPhraseSink | Pointer to the phrase sink.
  108. * (Not supported at this time.)
  109. *
  110. * @rvalue S_OK | The operation completed successfully.
  111. * @rvalue E_POINTER | The text source is null.
  112. * @rvalue E_INVALIDARG | The word sink is NULL.
  113. * @rvalue E_NOTOPEN |
  114. * @rvalue E_OUTOFMEMORY | There was not enough memory to complete the operation.
  115. *
  116. * @comm
  117. * The raw text in pTextSource is parsed by the word breaker until no
  118. * more text is available to refill the buffer. At this point, BreakText returns S_OK.
  119. *
  120. *
  121. ********************************************************************/
  122. STDMETHODIMP
  123. CITStdBreaker::BreakText(TEXT_SOURCE *pTextSource, IWordSink *pWordSink,
  124. IPhraseSink *pPhraseSink)
  125. {
  126. HRESULT hr = S_OK;
  127. LPIBI lpibi = NULL;
  128. if (pTextSource == NULL)
  129. return (SetErrReturn(E_POINTER));
  130. // We treat a NULL pWordSink different than a NULL pTextSource
  131. // to indicate to the caller that we can't do anything meaningful
  132. // without a pWordSink because we don't do phrase breaking.
  133. if (pWordSink == NULL)
  134. return (SetErrReturn(E_INVALIDARG));
  135. if (!m_fInitialized)
  136. return (SetErrReturn(E_NOTOPEN));
  137. m_cs.Lock();
  138. if ((lpibi = BreakerInitiate()) != NULL)
  139. {
  140. BRK_PARMS bkp;
  141. WRDFNPM wrdfnpm;
  142. // Set up word callback wrapper params.
  143. MEMSET(&wrdfnpm, NULL, sizeof(WRDFNPM));
  144. wrdfnpm.piwrdsnk = pWordSink;
  145. wrdfnpm.dwCodePageID = m_brkctl.dwCodePageID;
  146. // Set up breaker params that will get passed to FBreakX.
  147. bkp.lpInternalBreakInfo = lpibi;
  148. bkp.lcbBufOffset = 0;
  149. bkp.lpvUser = (LPVOID) &wrdfnpm;
  150. bkp.lpfnOutWord = StdBreakerWordFunc;
  151. bkp.lpStopInfoBlock = m_lpsipb;
  152. bkp.lpCharTab = m_lpctab;
  153. bkp.fFlags =
  154. ((m_brkctl.grfBreakFlags & IITWBC_BREAK_ACCEPT_WILDCARDS) != 0 ?
  155. ACCEPT_WILDCARD : 0);
  156. // Loop to break text.
  157. do
  158. {
  159. DWORD cbAnsi;
  160. DWORD cwch;
  161. // Make the ANSI buffer big enough to handle all DBCS in case
  162. // that's what we get when converting from Unicode.
  163. cbAnsi = sizeof(WCHAR) *
  164. (cwch = (pTextSource->iEnd - pTextSource->iCur));
  165. if (SUCCEEDED(hr =
  166. ReallocBuffer(&m_hmemAnsi, &m_cbBufAnsiCur, cbAnsi)))
  167. {
  168. bkp.lpbBuf = (LPBYTE) _GLOBALLOCK(m_hmemAnsi);
  169. if ((bkp.cbBufCount =
  170. WideCharToMultiByte(m_brkctl.dwCodePageID, NULL,
  171. (LPCWSTR) &pTextSource->awcBuffer[pTextSource->iCur],
  172. cwch, (char *) bkp.lpbBuf, m_cbBufAnsiCur,
  173. NULL, NULL)) > 0)
  174. {
  175. // StdBreakerWordFunc needs the MBCS buffer to compute an
  176. // accurate word offset into the Unicode buffer.
  177. wrdfnpm.lpbBuf = bkp.lpbBuf;
  178. switch (m_brkctl.dwBreakWordType)
  179. {
  180. case IITWBC_BREAKTYPE_TEXT:
  181. if (SUCCEEDED(hr = FBreakWords(&bkp)))
  182. {
  183. /* Flush the word breaker */
  184. bkp.lpbBuf = NULL;
  185. bkp.cbBufCount = 0;
  186. hr = FBreakWords(&bkp);
  187. }
  188. break;
  189. case IITWBC_BREAKTYPE_NUMBER:
  190. if (SUCCEEDED(hr = FBreakNumber(&bkp)))
  191. {
  192. /* Flush the word breaker */
  193. bkp.lpbBuf = NULL;
  194. bkp.cbBufCount = 0;
  195. hr = FBreakNumber(&bkp);
  196. }
  197. break;
  198. case IITWBC_BREAKTYPE_DATE:
  199. if (SUCCEEDED(hr = FBreakDate(&bkp)))
  200. {
  201. /* Flush the word breaker */
  202. bkp.lpbBuf = NULL;
  203. bkp.cbBufCount = 0;
  204. hr = FBreakDate(&bkp);
  205. }
  206. break;
  207. case IITWBC_BREAKTYPE_TIME:
  208. if (SUCCEEDED(hr = FBreakTime(&bkp)))
  209. {
  210. /* Flush the word breaker */
  211. bkp.lpbBuf = NULL;
  212. bkp.cbBufCount = 0;
  213. hr = FBreakTime(&bkp);
  214. }
  215. break;
  216. case IITWBC_BREAKTYPE_EPOCH:
  217. if (SUCCEEDED(hr = FBreakEpoch(&bkp)))
  218. {
  219. /* Flush the word breaker */
  220. bkp.lpbBuf = NULL;
  221. bkp.cbBufCount = 0;
  222. hr = FBreakEpoch(&bkp);
  223. }
  224. break;
  225. default:
  226. ITASSERT(FALSE);
  227. hr = E_UNEXPECTED;
  228. break;
  229. };
  230. }
  231. else
  232. hr = E_UNEXPECTED;
  233. _GLOBALUNLOCK(m_hmemAnsi);
  234. }
  235. // Advance cur to end just in case the caller cares about this
  236. // being the case when we ask for more characters.
  237. pTextSource->iCur = pTextSource->iEnd;
  238. } while (SUCCEEDED(hr) &&
  239. SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)));
  240. // Free any buffer that the word callback wrapper may have allocated.
  241. if (wrdfnpm.hmemUnicode != NULL)
  242. _GLOBALFREE(wrdfnpm.hmemUnicode);
  243. }
  244. else
  245. hr = E_OUTOFMEMORY;
  246. if (lpibi != NULL)
  247. BreakerFree(lpibi);
  248. m_cs.Unlock();
  249. return (hr);
  250. }
  251. /********************************************************************
  252. * @method STDMETHODIMP | IWordBreaker | ComposePhrase |
  253. * Converts a noun and modifier back into a linguistically correct source phrase.
  254. *
  255. *
  256. * @parm WCHAR const | *pwcNoun | Pointer to the word being modified.
  257. * @parm ULONG | cwcNoun | The count of characters in pwcNoun.
  258. * @parm WCHAR const | *pwcModifier | Points to the word modifying pwcNoun
  259. * @parm ULONG | cwcModifier | Length of pwcModifier
  260. * @parm ULONG | ulAttachmentType | A wordbreaker-specific value which a
  261. * wordbreaker can use to store additional information about the method of composition.
  262. * @parm WCHAR | *pwcPhrase | Pointer to a buffer in which to store the composed phrase
  263. * @parm ULONG | *pcwcPhrase | [in] length in characters of the pwcPhrase buffer.
  264. * [out] the actual length of the composed phrase. If
  265. * WBREAK_E_BUFFER_TOO_SMALL is returned, then on output pcwcPhrase
  266. * contains the required length of pwcPhrase.
  267. *
  268. * @rvalue S_OK | The object was successfully created
  269. * @rvalue E_INVALIDARG | The argument was not valid
  270. * @rvalue E_NOTINIT |
  271. * @rvalue E_OUTOFMEMORY |
  272. *
  273. * @comm
  274. * Not implemented
  275. ********************************************************************/
  276. STDMETHODIMP
  277. CITStdBreaker::ComposePhrase(WCHAR const *pwcNoun, ULONG cwcNoun,
  278. WCHAR const *pwcModifier, ULONG cwcModifier,
  279. ULONG ulAttachmentType, WCHAR *pwcPhrase,
  280. ULONG *pcwcPhrase)
  281. {
  282. return (E_NOTIMPL);
  283. }
  284. /********************************************************************
  285. * @method STDMETHODIMP | IWordBreaker | GetLicenseToUse |
  286. * Returns a pointer to the license information provided by the vendor
  287. * of this specific implementation of the IWordBreaker interface.
  288. *
  289. * @parm WCHAR const | **ppwcsLicense | Pointer to the license information.
  290. *
  291. * @rvalue E_POINTER | ppwcsLicense is null.
  292. ********************************************************************/
  293. STDMETHODIMP
  294. CITStdBreaker::GetLicenseToUse(WCHAR const **ppwcsLicense)
  295. {
  296. HRESULT hr;
  297. if (ppwcsLicense == NULL)
  298. return (SetErrReturn(E_POINTER));
  299. if (m_pistem != NULL)
  300. hr = m_pistem->GetLicenseToUse(ppwcsLicense);
  301. else
  302. hr = E_NOTIMPL;
  303. return (hr);
  304. }
  305. //---------------------------------------------------------------------------
  306. // IWordBreakerConfig Method Implementations
  307. //---------------------------------------------------------------------------
  308. /********************************************************************
  309. * @method STDMETHODIMP | IWordBreakerConfig | SetLocaleInfo|
  310. * Sets locale information for the word breaker.
  311. *
  312. *
  313. * @parm DWORD | dwCodePageID | ANSI code page no. specified at build time.
  314. * @parm LCID | lcid | Win32 locale identifier specified at build time.
  315. *
  316. * @rvalue E_NOTOPEN | [?] is not initialized.
  317. * @rvalue S_OK | The locale described by the parameters is supported.
  318. *
  319. ********************************************************************/
  320. STDMETHODIMP
  321. CITStdBreaker::SetLocaleInfo(DWORD dwCodePageID, LCID lcid)
  322. {
  323. if (!m_fInitialized)
  324. return (SetErrReturn(E_NOTOPEN));
  325. m_cs.Lock();
  326. m_brkctl.dwCodePageID = dwCodePageID;
  327. m_brkctl.lcid = lcid;
  328. m_fDirty = TRUE;
  329. m_cs.Unlock();
  330. return (S_OK);
  331. }
  332. /*****************************************************************
  333. * @method STDMETHODIMP | IWordBreakerConfig | GetLocaleInfo|
  334. * Retrieves locale information.
  335. *
  336. * @parm DWORD | *pdwCodePageID | Pointer to ANSI code page no. specified at build time.
  337. * @parm LCID | *plcid | Pointer to Win32 locale identifier specified at build time.
  338. *
  339. * @rvalue E_POINTER | Either the code page pointer or the locale identifier is null.
  340. * @rvalue E_NOTOPEN | [?] is not initialized.
  341. * @rvalue S_OK | The operation completed successfully.
  342. *
  343. ****************************************************************/
  344. STDMETHODIMP
  345. CITStdBreaker::GetLocaleInfo(DWORD *pdwCodePageID, LCID *plcid)
  346. {
  347. if (pdwCodePageID == NULL || plcid == NULL)
  348. return (SetErrReturn(E_POINTER));
  349. if (!m_fInitialized)
  350. return (SetErrReturn(E_NOTOPEN));
  351. m_cs.Lock();
  352. *pdwCodePageID = m_brkctl.dwCodePageID;
  353. *plcid = m_brkctl.lcid;
  354. m_cs.Unlock();
  355. return (S_OK);
  356. }
  357. /*****************************************************************
  358. * @method STDMETHODIMP | IWordBreakerConfig | SetBreakWordType|
  359. * Sets the type of words the breaker should expect
  360. * to see in all subsequent calls to IWordBreaker::BreakText.
  361. *
  362. * @parm DWORD | dwBreakWordType | Specifies the type for break words.
  363. * Can be one of IITWBC_BREAKTYPE_TEXT, IITWBC_BREAKTYPE_NUMBER,
  364. * IITWBC_BREAKTYPE_DATE, IITWBC_BREAKTYPE_TIME, IITWBC_BREAKTYPE_EPOCH.
  365. *
  366. *
  367. * @rvalue E_INVALIDARG | Invalid break word type.
  368. * @rvalue S_OK | The operation completed successfully.
  369. *****************************************************************/
  370. STDMETHODIMP
  371. CITStdBreaker::SetBreakWordType(DWORD dwBreakWordType)
  372. {
  373. if (!m_fInitialized)
  374. return (SetErrReturn(E_NOTOPEN));
  375. switch (dwBreakWordType)
  376. {
  377. case IITWBC_BREAKTYPE_TEXT:
  378. case IITWBC_BREAKTYPE_NUMBER:
  379. case IITWBC_BREAKTYPE_DATE:
  380. case IITWBC_BREAKTYPE_TIME:
  381. case IITWBC_BREAKTYPE_EPOCH:
  382. break;
  383. default:
  384. return (SetErrReturn(E_INVALIDARG));
  385. };
  386. m_cs.Lock();
  387. m_brkctl.dwBreakWordType = dwBreakWordType;
  388. m_fDirty = TRUE;
  389. m_cs.Unlock();
  390. return (S_OK);
  391. }
  392. /*****************************************************************
  393. * @method STDMETHODIMP | IWordBreakerConfig | GetBreakWordType|
  394. * Retrieves the type of words the breaker expects to see in
  395. * calls to IWordBreaker::BreakText.
  396. *
  397. * @parm DWORD | *pdwBreakWordType | Pointer to the type for break words.
  398. * Can be one of IITWBC_BREAKTYPE_TEXT (0), IITWBC_BREAKTYPE_NUMBER (1),
  399. * IITWBC_BREAKTYPE_DATE (2), IITWBC_BREAKTYPE_TIME (3), IITWBC_BREAKTYPE_EPOCH (4).
  400. *
  401. *
  402. * @rvalue E_POINTER | Break word type is null.
  403. * @rvalue S_OK | The operation completed successfully.
  404. *****************************************************************/
  405. STDMETHODIMP
  406. CITStdBreaker::GetBreakWordType(DWORD *pdwBreakWordType)
  407. {
  408. if (pdwBreakWordType == NULL)
  409. return (SetErrReturn(E_POINTER));
  410. if (!m_fInitialized)
  411. return (SetErrReturn(E_NOTOPEN));
  412. *pdwBreakWordType = m_brkctl.dwBreakWordType;
  413. return (S_OK);
  414. }
  415. /*****************************************************************
  416. * @method STDMETHODIMP | IWordBreakerConfig | SetControlInfo |
  417. * Sets information that controls certain aspects of word breaking.
  418. *
  419. * @parm DWORD | grfBreakFlags | Can be: IITWBC_BREAK_ACCEPT_WILDCARDS
  420. * (0x00000001), to interpret wild card characters as such; and
  421. * IITWBC_BREAK_AND_STEM (0x00000002), stem words after breaking.
  422. * @parm DWORD | dwReserved |Reserved for future use.
  423. *
  424. * @rvalue E_INVALIDARG | Invalid control flag.
  425. * @rvalue S_OK | The operation completed successfully.
  426. *****************************************************************/
  427. STDMETHODIMP
  428. CITStdBreaker::SetControlInfo(DWORD grfBreakFlags, DWORD dwReserved)
  429. {
  430. DWORD grfFlagsUnsupported;
  431. if (!m_fInitialized)
  432. return (SetErrReturn(E_NOTOPEN));
  433. grfFlagsUnsupported = ~(IITWBC_BREAK_ACCEPT_WILDCARDS);
  434. if ((grfBreakFlags & grfFlagsUnsupported) != 0)
  435. return (SetErrReturn(E_INVALIDARG));
  436. m_cs.Lock();
  437. m_brkctl.grfBreakFlags = grfBreakFlags;
  438. m_fDirty = TRUE;
  439. m_cs.Unlock();
  440. return (S_OK);
  441. }
  442. /*****************************************************************
  443. * @method STDMETHODIMP | IWordBreakerConfig | GetControlInfo |
  444. * Retrieves information about word breaker control flags.
  445. *
  446. * @parm DWORD | *pgrfBreakFlags | Pointer to breaker control flags.
  447. * @parm DWORD | *pdwReserved |Reserved for future use.
  448. *
  449. * @rvalue E_POINTER | Break flags are not set (pgrfBreakFlags is null).
  450. * @rvalue S_OK | The operation completed successfully.
  451. *****************************************************************/
  452. STDMETHODIMP
  453. CITStdBreaker::GetControlInfo(DWORD *pgrfBreakFlags, DWORD *pdwReserved)
  454. {
  455. if (pgrfBreakFlags == NULL)
  456. return (SetErrReturn(E_POINTER));
  457. if (!m_fInitialized)
  458. return (SetErrReturn(E_NOTOPEN));
  459. *pgrfBreakFlags = m_brkctl.grfBreakFlags;
  460. return (S_OK);
  461. }
  462. /*****************************************************************
  463. * @method STDMETHODIMP | IWordBreakerConfig | LoadExternalBreakerData |
  464. * Loads word breaker data from an external source, such as a table
  465. * containing char-by-char break information or a list of stop words.
  466. *
  467. * @parm IStream | *pStream | Pointer to external source of data.
  468. * @parm DWORD | dwExtDataType | Specifies the type of data in the stream.
  469. *
  470. * @rvalue E_POINTER | pStream is null.
  471. * @rvalue E_NOTOPEN | The stream has not been initialized.
  472. * @rvalue S_OK | The operation completed successfully.
  473. *
  474. * @comm
  475. * Although the format of the data in the stream is entirely
  476. * implementation-specific, this interface does define a couple
  477. * of general types for that data which can be passed in
  478. * dwStreamDataType:
  479. * IITWBC_EXTDATA_CHARTABLE
  480. * IITWBC_EXTDATA_STOPWORDLIST
  481. *
  482. *****************************************************************/
  483. STDMETHODIMP
  484. CITStdBreaker::LoadExternalBreakerData(IStream *pStream, DWORD dwExtDataType)
  485. {
  486. HRESULT hr;
  487. HFPB hfpb;
  488. LPCTAB lpctab;
  489. LPSIPB lpsipb;
  490. if (pStream == NULL)
  491. return (SetErrReturn(E_POINTER));
  492. if (!m_fInitialized)
  493. return (SetErrReturn(E_NOTOPEN));
  494. m_cs.Lock();
  495. if ((hfpb = FpbFromHf((HF) pStream, &hr)) != NULL)
  496. {
  497. switch (dwExtDataType)
  498. {
  499. case IITWBC_EXTDATA_CHARTABLE:
  500. // Load the external character table.
  501. lpctab = MVCharTableLoad(hfpb, NULL, &hr);
  502. if (SUCCEEDED(hr))
  503. {
  504. ITASSERT(lpctab != NULL);
  505. m_fDirty = TRUE;
  506. m_grfPersistedItems |= ITSTDBRK_PERSISTED_CHARTABLE;
  507. if (m_fQueryContext)
  508. MVCharTableSetWildcards(lpctab);
  509. // Dispose of any pre-existing char table.
  510. MVCharTableDispose(m_lpctab);
  511. m_lpctab = lpctab;
  512. }
  513. break;
  514. case IITWBC_EXTDATA_STOPWORDLIST:
  515. // We should at least have an internal default char table.
  516. ITASSERT(m_lpctab != NULL);
  517. // Init the in-memory stop word list and load the external
  518. // list.
  519. if ((lpsipb = MVStopListInitiate(ITSTDBRK_STOPHASH_SIZE,
  520. &hr)) != NULL &&
  521. SUCCEEDED(hr = MVStopListLoad(hfpb, lpsipb, NULL,
  522. FBreakWords, m_lpctab)))
  523. {
  524. m_fDirty = TRUE;
  525. m_grfPersistedItems |= ITSTDBRK_PERSISTED_STOPWORDLIST;
  526. MVStopListDispose(m_lpsipb);
  527. m_lpsipb = lpsipb;
  528. }
  529. break;
  530. default:
  531. hr = E_INVALIDARG;
  532. break;
  533. };
  534. FreeHfpb(hfpb);
  535. }
  536. m_cs.Unlock();
  537. return (hr);
  538. }
  539. /*****************************************************************
  540. * @method STDMETHODIMP | IWordBreakerConfig | SetWordStemmer |
  541. * Allows you to associate a stemmer with the word breaker.
  542. *
  543. * @parm REFCLSID | rclsid | Class identifier for the stemmer.
  544. * @parm IStemmer | *pStemmer | Pointer to the stemmer.
  545. *
  546. * @rvalue E_NOTOPEN | [?] has not been initialized.
  547. * @rvalue S_OK | The operation completed successfully.
  548. *
  549. * @comm
  550. * The breaker takes responsibility for calling IPersistStreamInit::Load/Save
  551. * when it is loaded/saved if the stemmer supports that interface.
  552. *****************************************************************/
  553. STDMETHODIMP
  554. CITStdBreaker::SetWordStemmer(REFCLSID rclsid, IStemmer *pStemmer)
  555. {
  556. if (!m_fInitialized)
  557. return (SetErrReturn(E_NOTOPEN));
  558. m_cs.Lock();
  559. if (m_pistem != NULL)
  560. m_pistem->Release();
  561. if ((m_pistem = pStemmer) != NULL)
  562. {
  563. m_pistem->AddRef();
  564. ITASSERT(rclsid != GUID_NULL);
  565. m_clsidStemmer = rclsid;
  566. m_fDirty = TRUE;
  567. }
  568. SetGrfFlag(&m_grfPersistedItems,
  569. ITSTDBRK_PERSISTED_STEMMER, m_pistem != NULL);
  570. m_cs.Unlock();
  571. return (S_OK);
  572. }
  573. /*****************************************************************
  574. * @method STDMETHODIMP | IWordBreakerConfig | GetWordStemmer |
  575. * Indicates whether or not a stemmer is associated with the word breaker.
  576. *
  577. * @parm IStemmer | **ppStemmer | Pointer to the stemmer.
  578. *
  579. * @rvalue E_POINTER | No stemmer has been associated (ppStemmer is NULL).
  580. * @rvalue E_NOTOPEN | [?] has not been initialized.
  581. * @rvalue S_OK | The operation completed successfully.
  582. *
  583. * @comm
  584. * The breaker takes responsibility for calling IPersistStreamInit::Load/Save
  585. * when it is loaded/saved if the stemmer supports that interface.
  586. *****************************************************************/
  587. STDMETHODIMP
  588. CITStdBreaker::GetWordStemmer(IStemmer **ppStemmer)
  589. {
  590. if (ppStemmer == NULL)
  591. return (SetErrReturn(E_POINTER));
  592. if (!m_fInitialized)
  593. return (SetErrReturn(E_NOTOPEN));
  594. if ((*ppStemmer = m_pistem) != NULL)
  595. m_pistem->AddRef();
  596. return (m_pistem != NULL ? S_OK : S_FALSE);
  597. }
  598. //---------------------------------------------------------------------------
  599. // IITStopWordList Method Implementations
  600. //---------------------------------------------------------------------------
  601. /*****************************************************************
  602. * @method STDMETHODIMP | IITStopWordList | AddWord |
  603. * Adds a word to the stop word list.
  604. *
  605. * @parm WCHAR const | *pwcInBuf | Pointer to the input buffer.
  606. * @parm ULONG | cwc | Length of word (count of wide characters).
  607. *
  608. * @rvalue S_OK | The operation completed successfully.
  609. *
  610. *****************************************************************/
  611. STDMETHODIMP
  612. CITStdBreaker::AddWord(WCHAR const *pwcInBuf, ULONG cwc)
  613. {
  614. return (StopListOp(pwcInBuf, cwc, TRUE));
  615. }
  616. /*****************************************************************
  617. * @method STDMETHODIMP | IITStopWordList | LookupWord |
  618. * Looks up a word in the stop word list.
  619. *
  620. * @parm WCHAR const | *pwcInBuf | Pointer to the input buffer.
  621. * @parm ULONG | cwc | Length of word (count of wide characters).
  622. *
  623. * @rvalue S_OK | The operation completed successfully.
  624. *
  625. *****************************************************************/
  626. STDMETHODIMP
  627. CITStdBreaker::LookupWord(WCHAR const *pwcInBuf, ULONG cwc)
  628. {
  629. return (StopListOp(pwcInBuf, cwc, FALSE));
  630. }
  631. //---------------------------------------------------------------------------
  632. // IPersistStreamInit Method Implementations
  633. //---------------------------------------------------------------------------
  634. STDMETHODIMP
  635. CITStdBreaker::GetClassID(CLSID *pclsid)
  636. {
  637. if (pclsid == NULL)
  638. return (SetErrReturn(E_POINTER));
  639. *pclsid = CLSID_ITStdBreaker;
  640. return (S_OK);
  641. }
  642. STDMETHODIMP
  643. CITStdBreaker::IsDirty(void)
  644. {
  645. if (!m_fInitialized)
  646. return (SetErrReturn(E_NOTOPEN));
  647. return (m_fDirty ? S_OK : S_FALSE);
  648. }
  649. STDMETHODIMP
  650. CITStdBreaker::Load(IStream *pStream)
  651. {
  652. HRESULT hr;
  653. DWORD dwVersion;
  654. DWORD grfPersistedItems;
  655. DWORD cbRead;
  656. if (pStream == NULL)
  657. return (SetErrReturn(E_POINTER));
  658. // Lock before checking m_fInitialized to make sure we don't compete
  659. // with a call to ::InitNew.
  660. m_cs.Lock();
  661. if (m_fInitialized)
  662. return (SetErrReturn(E_ALREADYOPEN));
  663. if (SUCCEEDED(hr = pStream->Read((LPVOID) &dwVersion, sizeof(DWORD),
  664. &cbRead)) &&
  665. SUCCEEDED(hr = ((cbRead == sizeof(DWORD)) ? S_OK : E_BADFORMAT)) &&
  666. SUCCEEDED(hr = ((dwVersion == VERSION_STDBRKR) ? S_OK :
  667. E_BADVERSION)) &&
  668. SUCCEEDED(hr = pStream->Read((LPVOID) &grfPersistedItems,
  669. sizeof(DWORD), &cbRead)) &&
  670. SUCCEEDED(hr = ((cbRead == sizeof(DWORD)) ? S_OK : E_BADFORMAT)))
  671. {
  672. if (grfPersistedItems != 0)
  673. {
  674. HFPB hfpb = NULL;
  675. if ((grfPersistedItems & ITSTDBRK_PERSISTED_BRKCTL) != 0)
  676. {
  677. if (SUCCEEDED(hr =
  678. pStream->Read((LPVOID) &m_brkctl, sizeof(BRKCTL), &cbRead)))
  679. hr = ((cbRead == sizeof(BRKCTL)) ? S_OK : E_BADFORMAT);
  680. }
  681. else
  682. {
  683. // We have an inconsistent persistent state. The only way
  684. // we should have no BRKCTL is if we have no persistent
  685. // state at all (except for version number and persistent
  686. // flags which we've already loaded).
  687. ITASSERT(FALSE);
  688. hr = E_UNEXPECTED;
  689. }
  690. if (SUCCEEDED(hr) &&
  691. (hfpb = FpbFromHf((HF) pStream, &hr)) != NULL)
  692. {
  693. // Load the character table if one is there; otherwise just
  694. // use the internal default table.
  695. if ((grfPersistedItems & ITSTDBRK_PERSISTED_CHARTABLE) != 0)
  696. m_lpctab = MVCharTableIndexLoad(hfpb, NULL, &hr);
  697. else
  698. m_lpctab = MVCharTableGetDefault(&hr);
  699. }
  700. if (SUCCEEDED(hr) &&
  701. (grfPersistedItems & ITSTDBRK_PERSISTED_STOPWORDLIST) != 0)
  702. {
  703. // Load the stop word list.
  704. if ((m_lpsipb = MVStopListInitiate(ITSTDBRK_STOPHASH_SIZE,
  705. &hr)) != NULL)
  706. hr = MVStopListIndexLoad(hfpb, m_lpsipb, NULL);
  707. }
  708. if (hfpb != NULL)
  709. FreeHfpb(hfpb);
  710. if (SUCCEEDED(hr) &&
  711. (grfPersistedItems & ITSTDBRK_PERSISTED_STEMMER) != 0)
  712. {
  713. IPersistStreamInit *pipstmi;
  714. ITASSERT(m_pistem == NULL);
  715. // Instantiate and load the stemmer if it
  716. // implements IPersistStreamInit.
  717. if (SUCCEEDED(hr = ReadClassStm(pStream, &m_clsidStemmer)) &&
  718. SUCCEEDED(hr = CoCreateInstance(m_clsidStemmer, NULL,
  719. CLSCTX_INPROC_SERVER,
  720. IID_IStemmer, (LPVOID *)&m_pistem)) &&
  721. SUCCEEDED(m_pistem->QueryInterface(IID_IPersistStreamInit,
  722. (LPVOID *)&pipstmi)))
  723. {
  724. hr = pipstmi->Load(pStream);
  725. pipstmi->Release();
  726. }
  727. }
  728. }
  729. else
  730. {
  731. // If there were no persisted items (we release one beta version
  732. // without pluggable breakers where we had dummy instance data
  733. // where this was true) then we should just behave like we're being
  734. // created anew.
  735. hr = InitNew();
  736. }
  737. }
  738. if (SUCCEEDED(hr))
  739. {
  740. // We don't want to assign an incorrect grfPersistedItems if
  741. // we ended up calling InitNew.
  742. if (!m_fInitialized)
  743. {
  744. m_grfPersistedItems = grfPersistedItems;
  745. m_fInitialized = TRUE;
  746. }
  747. }
  748. else
  749. // Free any peristed items which may have been loaded successfully.
  750. Close();
  751. m_cs.Unlock();
  752. return (hr);
  753. }
  754. STDMETHODIMP
  755. CITStdBreaker::Save(IStream *pStream, BOOL fClearDirty)
  756. {
  757. HRESULT hr;
  758. DWORD dwVersion;
  759. DWORD cbWritten;
  760. if (pStream == NULL)
  761. return (SetErrReturn(E_POINTER));
  762. if (!m_fInitialized)
  763. return (SetErrReturn(E_NOTOPEN));
  764. m_cs.Lock();
  765. dwVersion = VERSION_STDBRKR;
  766. if (SUCCEEDED(hr = pStream->Write((LPVOID) &dwVersion, sizeof(DWORD),
  767. &cbWritten)) &&
  768. SUCCEEDED(hr = pStream->Write((LPVOID) &m_grfPersistedItems,
  769. sizeof(DWORD), &cbWritten)))
  770. {
  771. HFPB hfpb = NULL;
  772. if ((m_grfPersistedItems & ITSTDBRK_PERSISTED_BRKCTL) != 0)
  773. hr = pStream->Write((LPVOID) &m_brkctl, sizeof(BRKCTL), &cbWritten);
  774. else
  775. {
  776. // We should always be writing the BRKCTL structure, but if for some
  777. // reason the flag to write it is not set, we can still continue
  778. // because at load time we will tolerate the absence of the struct.
  779. ITASSERT(FALSE);
  780. }
  781. if (SUCCEEDED(hr) &&
  782. (hfpb = FpbFromHf((HF) pStream, &hr)) != NULL &&
  783. (m_grfPersistedItems & ITSTDBRK_PERSISTED_CHARTABLE) != 0)
  784. {
  785. // Save char table.
  786. if (m_lpctab != NULL)
  787. hr = MVCharTableFileBuild(hfpb, m_lpctab, NULL);
  788. else
  789. {
  790. ITASSERT(FALSE);
  791. hr = E_UNEXPECTED;
  792. }
  793. }
  794. if (SUCCEEDED(hr) &&
  795. (m_grfPersistedItems & ITSTDBRK_PERSISTED_STOPWORDLIST) != 0)
  796. {
  797. // Save stop word list.
  798. if (m_lpsipb != NULL)
  799. hr = MVStopFileBuild(hfpb, m_lpsipb, NULL);
  800. else
  801. {
  802. ITASSERT(FALSE);
  803. hr = E_UNEXPECTED;
  804. }
  805. }
  806. if (hfpb != NULL)
  807. FreeHfpb(hfpb);
  808. if (SUCCEEDED(hr) &&
  809. (m_grfPersistedItems & ITSTDBRK_PERSISTED_STEMMER) != 0)
  810. {
  811. IPersistStreamInit *pipstmi;
  812. ITASSERT(m_pistem != NULL);
  813. // Write the stemmer's CLSID and save the stemmer if it
  814. // implements IPersistStreamInit.
  815. if (SUCCEEDED(hr = WriteClassStm(pStream, m_clsidStemmer)) &&
  816. SUCCEEDED(m_pistem->QueryInterface(IID_IPersistStreamInit,
  817. (LPVOID *) &pipstmi)))
  818. {
  819. hr = pipstmi->Save(pStream, fClearDirty);
  820. pipstmi->Release();
  821. }
  822. }
  823. }
  824. if (SUCCEEDED(hr) && fClearDirty)
  825. m_fDirty = FALSE;
  826. m_cs.Unlock();
  827. return (hr);
  828. }
  829. STDMETHODIMP
  830. CITStdBreaker::GetSizeMax(ULARGE_INTEGER *pcbSizeMax)
  831. {
  832. return (E_NOTIMPL);
  833. }
  834. STDMETHODIMP
  835. CITStdBreaker::InitNew(void)
  836. {
  837. HRESULT hr = S_OK;
  838. // Lock before checking m_fInitialized to make sure we don't compete
  839. // with a call to ::Load.
  840. m_cs.Lock();
  841. if (m_fInitialized)
  842. return (SetErrReturn(E_ALREADYOPEN));
  843. InitBrkCtl();
  844. m_grfPersistedItems |= ITSTDBRK_PERSISTED_BRKCTL;
  845. // Get the default char table in case we're never asked to load an
  846. // external one. If we do load an external one, we'll properly
  847. // discard this one. We don't set the persisted flag for the
  848. // char table because we don't need to persist the internal default.
  849. m_lpctab = MVCharTableGetDefault(&hr);
  850. // Initialize the stop word list so that stop words can be added
  851. // programmatically if a client desires.
  852. if (SUCCEEDED(hr))
  853. m_lpsipb = MVStopListInitiate(ITSTDBRK_STOPHASH_SIZE, &hr);
  854. if (SUCCEEDED(hr))
  855. m_fInitialized = m_fDirty = TRUE;
  856. else
  857. Close();
  858. m_cs.Unlock();
  859. return (hr);
  860. }
  861. //---------------------------------------------------------------------------
  862. // Private Method Implementations
  863. //---------------------------------------------------------------------------
  864. HRESULT
  865. CITStdBreaker::StopListOp(WCHAR const *pwcInBuf, ULONG cwc, BOOL fAddWord)
  866. {
  867. HRESULT hr;
  868. DWORD cbAnsi;
  869. if (pwcInBuf == NULL)
  870. return (E_POINTER);
  871. if (!m_fInitialized)
  872. return (SetErrReturn(E_NOTOPEN));
  873. if (m_lpsipb == NULL)
  874. return (SetErrReturn(E_NOTINIT));
  875. m_cs.Lock();
  876. cbAnsi = (sizeof(WCHAR) * cwc) + sizeof(WORD);
  877. if (SUCCEEDED(hr =
  878. ReallocBuffer(&m_hmemAnsi, &m_cbBufAnsiCur, cbAnsi)))
  879. {
  880. char *lpchBuf;
  881. lpchBuf = (char *) _GLOBALLOCK(m_hmemAnsi);
  882. if ((*((WORD *)lpchBuf) = (WORD) (
  883. WideCharToMultiByte(m_brkctl.dwCodePageID, NULL, pwcInBuf, cwc,
  884. lpchBuf + sizeof(WORD), cbAnsi - sizeof(WORD),
  885. NULL, NULL))) > 0)
  886. {
  887. if (fAddWord)
  888. hr = MVStopListAddWord(m_lpsipb, (LPBYTE)lpchBuf);
  889. else
  890. hr = MVStopListLookup(m_lpsipb, (LPBYTE)lpchBuf);
  891. }
  892. else
  893. hr = E_UNEXPECTED;
  894. _GLOBALUNLOCK(m_hmemAnsi);
  895. }
  896. m_cs.Unlock();
  897. return (hr);
  898. }
  899. HRESULT
  900. CITStdBreaker::ReallocBuffer(HGLOBAL *phmemBuf, DWORD *pcbBufCur, DWORD cbBufNew)
  901. {
  902. HRESULT hr = S_OK;
  903. m_cs.Lock();
  904. hr = ReallocBufferHmem(phmemBuf, pcbBufCur, max(cbBufNew, cbAnsiBufInit));
  905. m_cs.Unlock();
  906. return (hr);
  907. }
  908. void
  909. CITStdBreaker::ClearMembers(void)
  910. {
  911. MEMSET(&m_brkctl, NULL, sizeof(BRKCTL));
  912. m_fInitialized = m_fDirty = m_fQueryContext = FALSE;
  913. m_grfPersistedItems = 0;
  914. m_lpctab = NULL;
  915. m_lpsipb = NULL;
  916. m_clsidStemmer = GUID_NULL;
  917. }
  918. void
  919. CITStdBreaker::InitBrkCtl(void)
  920. {
  921. m_brkctl.dwCodePageID = GetACP();
  922. m_brkctl.lcid = GetUserDefaultLCID();
  923. m_brkctl.dwBreakWordType = IITWBC_BREAKTYPE_TEXT;
  924. m_brkctl.grfBreakFlags = 0;
  925. }
  926. void
  927. CITStdBreaker::Close(void)
  928. {
  929. m_cs.Lock();
  930. if (m_hmemAnsi != NULL)
  931. {
  932. _GLOBALFREE(m_hmemAnsi);
  933. m_hmemAnsi = NULL;
  934. m_cbBufAnsiCur = 0;
  935. }
  936. if (m_pistem != NULL)
  937. {
  938. m_pistem->Release();
  939. m_pistem = NULL;
  940. }
  941. MVCharTableDispose(m_lpctab);
  942. MVStopListDispose(m_lpsipb);
  943. ClearMembers();
  944. m_cs.Unlock();
  945. }
  946. //---------------------------------------------------------------------------
  947. // Utility Functions
  948. //---------------------------------------------------------------------------
  949. // (6/19/97): BillA, JohnRush, and MikkyA all agreed that we would stop storing
  950. // offset and length information in the index because the new HTML-based
  951. // display engines don't allow our clients to find words using that information
  952. // anyway.
  953. //
  954. // However, the above decision doesn't eliminate the need to accurately
  955. // correlate offsets into the MBCS text buffer with offsets into the original
  956. // Unicode buffer. This is needed by the query parsing code at runtime.
  957. // The method for achieving offset correlation is simple: call
  958. // MultiByteToWideChar on the MBCS text buffer up to dwWordOffset to get
  959. // back the equivalent Unicode offset which we will pass to the word sink.
  960. //
  961. // NOTE: The above method will work as long as the breaker code is using
  962. // the same lead byte table as the system conversion function. For now,
  963. // our clients will be responsible for making sure the character table
  964. // is consistent with the system's lead byte table. In the future, we
  965. // probably should make the breaker explicitly set the lead bytes in the
  966. // character table using the system's lead byte table.
  967. //
  968. // In the case of single byte characters, the offset and length information
  969. // automatically correlates between MBCS and Unicode because it is essentially
  970. // stated in characters, not bytes.
  971. //
  972. HRESULT FAR PASCAL StdBreakerWordFunc(LST lstRawWord, LST lstNormWord,
  973. DWORD dwWordOffset, LPVOID lpvUser)
  974. {
  975. HRESULT hr;
  976. DWORD cbAnsi;
  977. DWORD cwch;
  978. DWORD cwchRaw;
  979. DWORD iwchWordOffset = dwWordOffset;
  980. WCHAR *lpwchBuf;
  981. WRDFNPM *pwrdfnpm;
  982. if (lstRawWord == NULL || lstNormWord == NULL || lpvUser == NULL)
  983. return (E_POINTER);
  984. pwrdfnpm = (WRDFNPM *) lpvUser;
  985. // We will set up the Unicode buffer to have as many characters as there are
  986. // bytes in the Ansi string since we don't know how much, if any, DBCS chars
  987. // there are in the Ansi string.
  988. cwch = cbAnsi = (DWORD)(*((WORD *)lstNormWord));
  989. cwchRaw = (DWORD)(*((WORD *)lstRawWord));
  990. // Set up Unicode buffer for the normalized word.
  991. if (SUCCEEDED(hr = ReallocBufferHmem(&pwrdfnpm->hmemUnicode,
  992. &pwrdfnpm->cbBufUnicodeCur,
  993. sizeof(WCHAR) * cwch)))
  994. {
  995. lpwchBuf = (WCHAR *) _GLOBALLOCK(pwrdfnpm->hmemUnicode);
  996. // Compute the Unicode offset that corresponds to the
  997. // MBCS-based dwWordOffset. We pass lpwchBuf as a valid placeholder
  998. // buffer (in case non-NULL is required), but nothing will get
  999. // written to it.
  1000. iwchWordOffset = MultiByteToWideChar(pwrdfnpm->dwCodePageID, NULL,
  1001. (LPCSTR) pwrdfnpm->lpbBuf, dwWordOffset,
  1002. lpwchBuf, 0);
  1003. // Convert the normalized word to Unicode.
  1004. if ((cwch = MultiByteToWideChar(pwrdfnpm->dwCodePageID, NULL,
  1005. (LPCSTR) &lstNormWord[sizeof(WORD)],
  1006. cbAnsi, lpwchBuf, cwch)) > 0 &&
  1007. pwrdfnpm->piwrdsnk != NULL)
  1008. {
  1009. // Send the normalized word to the word sink.
  1010. hr = pwrdfnpm->piwrdsnk->PutAltWord(lpwchBuf, cwch, cwchRaw,
  1011. iwchWordOffset);
  1012. }
  1013. else
  1014. hr = E_UNEXPECTED;
  1015. _GLOBALUNLOCK(pwrdfnpm->hmemUnicode);
  1016. }
  1017. cwch = cbAnsi = cwchRaw;
  1018. // Set up Unicode buffer for the raw word.
  1019. if (SUCCEEDED(hr) &&
  1020. SUCCEEDED(hr = ReallocBufferHmem(&pwrdfnpm->hmemUnicode,
  1021. &pwrdfnpm->cbBufUnicodeCur,
  1022. sizeof(WCHAR) * cwch)))
  1023. {
  1024. lpwchBuf = (WCHAR *) _GLOBALLOCK(pwrdfnpm->hmemUnicode);
  1025. // Convert the raw word to Unicode.
  1026. if ((cwch = MultiByteToWideChar(pwrdfnpm->dwCodePageID, NULL,
  1027. (LPCSTR) &lstRawWord[sizeof(WORD)],
  1028. cbAnsi, lpwchBuf, cwch)) > 0 &&
  1029. pwrdfnpm->piwrdsnk != NULL)
  1030. {
  1031. // Send the raw word to the word sink.
  1032. hr = pwrdfnpm->piwrdsnk->PutWord(lpwchBuf, cwch, cwchRaw,
  1033. iwchWordOffset);
  1034. }
  1035. else
  1036. hr = E_UNEXPECTED;
  1037. _GLOBALUNLOCK(pwrdfnpm->hmemUnicode);
  1038. }
  1039. return (hr);
  1040. }