Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

610 lines
16 KiB

  1. /*************************************************************************
  2. * @doc SHROOM EXTERNAL API *
  3. * *
  4. * ENGSTEM.CPP *
  5. * *
  6. * Copyright (C) Microsoft Corporation 1997 *
  7. * All Rights reserved. *
  8. * *
  9. * This file contains the implementation of CITEngStemmer methods. *
  10. * CITEngStemmer is a pluggable word stemmer object. *
  11. * Although all the word breaking interface methods that accept text *
  12. * require it to be Unicode, CITEngStemmer still only supports ANSI *
  13. * internally. *
  14. * *
  15. **************************************************************************
  16. * *
  17. * Written By : Bill Aloof *
  18. * Current Owner: billa *
  19. * *
  20. **************************************************************************/
  21. #include <mvopsys.h>
  22. #ifdef _DEBUG
  23. static char s_aszModule[] = __FILE__; /* For error report */
  24. #endif
  25. #include <atlinc.h> // includes for ATL.
  26. #include <_mvutil.h>
  27. #include <mem.h>
  28. #include <orkin.h>
  29. #include <mvsearch.h>
  30. #include "common.h"
  31. #include <iterror.h>
  32. #include <itstem.h>
  33. #include <itwbrkid.h>
  34. #include "engstem.h"
  35. //---------------------------------------------------------------------------
  36. // Constructor and Destructor
  37. //---------------------------------------------------------------------------
  38. CITEngStemmer::CITEngStemmer()
  39. {
  40. ClearMembers();
  41. m_hmem1 = m_hmem2 = NULL;
  42. m_cbBuf1Cur = m_cbBuf2Cur = 0;
  43. }
  44. CITEngStemmer::~CITEngStemmer()
  45. {
  46. Close();
  47. }
  48. //---------------------------------------------------------------------------
  49. // IStemmer Method Implementations
  50. //---------------------------------------------------------------------------
  51. /********************************************************************
  52. * @method STDMETHODIMP | IStemmer | Init |
  53. * Gives the stemmer object a chance to initialize itself beyond
  54. * what it did during IPersistStreamInit::InitNew or ::Load.
  55. * @parm ULONG | ulMaxTokenSize | Max term length requested by caller
  56. * @parm BOOL* | pfLicense | Whether the stemmer is subject to a license
  57. *
  58. * @rvalue E_POINTER | pfLicense was NULL
  59. *
  60. ********************************************************************/
  61. STDMETHODIMP
  62. CITEngStemmer::Init(ULONG ulMaxTokenSize, BOOL *pfLicense)
  63. {
  64. HRESULT hr;
  65. if (pfLicense == NULL)
  66. return (SetErrReturn(E_POINTER));
  67. // If we haven't been initialized yet (i.e. no call was made to either
  68. // IPersistStreamInit::InitNew or Load), we'll initialize ourselves now.
  69. // This allows Tripoli clients to use us without any code changes on their
  70. // part. If we have already been initialized, the caller has had a chance
  71. // to correctly set the lcid, so we check it now; otherwise, we want to
  72. // still give the caller a chance to set it correctly.
  73. if (m_fInitialized)
  74. hr = (PRIMARYLANGID(LANGIDFROMLCID(m_stemctl.lcid)) == LANG_ENGLISH ?
  75. S_OK : E_FAIL);
  76. else
  77. hr = InitNew();
  78. if (SUCCEEDED(hr))
  79. *pfLicense = FALSE;
  80. // NOTE: We don't support internal truncation of terms based on
  81. // ulMaxTokenSize. This is OK since the word sink is supposed to be
  82. // prepared to have to truncate anyway.
  83. return (hr);
  84. }
  85. /********************************************************************
  86. * @method STDMETHODIMP | IStemmer | StemWord |
  87. * stems the input word and calls the methods of IStemSink with the results.
  88. *
  89. * @parm WCHAR const | *pwcInBuf | Input Unicode word.
  90. * @parm ULONG | cwc | count of Unicode characters in the input word.
  91. * @parm IStemSink | *pStemSink | Pointer to the stemmer sink.
  92. *
  93. *
  94. *
  95. * @rvalue E_WORDTOOLONG | cwc is larger than 0x7FFF
  96. * @rvalue E_POINTER | Either the input buffer or *pStemSink is NULL.
  97. * @rvalue S_OK | The operation completed successfully.
  98. *
  99. ********************************************************************/
  100. STDMETHODIMP
  101. CITEngStemmer::StemWord(WCHAR const *pwcInBuf, ULONG cwc, IStemSink *pStemSink)
  102. {
  103. HRESULT hr = S_OK;
  104. if (pwcInBuf == NULL || pStemSink == NULL)
  105. return (SetErrReturn(E_POINTER));
  106. if (!m_fInitialized)
  107. return (SetErrReturn(E_NOTOPEN));
  108. if (PRIMARYLANGID(LANGIDFROMLCID(m_stemctl.lcid)) != LANG_ENGLISH)
  109. return (SetErrReturn(E_FAIL));
  110. if (cwc > 0x7FFF)
  111. return (SetErrReturn(E_WORDTOOLONG));
  112. m_cs.Lock();
  113. // We allocate enough space for a worst case Unicode ---> MBCS conversion
  114. // and allow an extra word for a length prefix that we will add later.
  115. // This is probably overly cautious because we shouldn't be seeing any
  116. // DBCS anyway (we're an English stemmer).
  117. if (SUCCEEDED(hr = ReallocBuffer(&m_hmem1, &m_cbBuf1Cur,
  118. (sizeof(WCHAR) * cwc) + sizeof(WORD))))
  119. {
  120. LPBYTE lpbRawWord;
  121. lpbRawWord = (LPBYTE) _GLOBALLOCK(m_hmem1);
  122. // REVIEW (billa): Need to make sure that the word being stemmed is in
  123. // lower case.
  124. // Convert the raw word to ANSI.
  125. if ((*((WORD *)lpbRawWord) =
  126. (WORD) WideCharToMultiByte(m_stemctl.dwCodePageID, NULL,
  127. pwcInBuf, cwc, (char *)lpbRawWord + sizeof(WORD),
  128. (m_cbBuf1Cur - sizeof(WORD)), NULL, NULL)) > 0)
  129. {
  130. // We want the buffer we allocate for the stemmed word to be larger
  131. // than the raw word length so that we can handle the rare case
  132. // where the stemmed word has grown. We can just use the raw word
  133. // buffer size because it included a lot of extra padding.
  134. if (SUCCEEDED(hr = ReallocBuffer(&m_hmem2, &m_cbBuf2Cur,
  135. m_cbBuf1Cur)))
  136. {
  137. LPBYTE lpbStemWord;
  138. lpbStemWord = (LPBYTE) _GLOBALLOCK(m_hmem2);
  139. if (SUCCEEDED(hr = FStem(lpbStemWord, lpbRawWord)))
  140. {
  141. WCHAR *lpwchStem;
  142. DWORD cwchStem;
  143. DWORD cbStemWord;
  144. _GLOBALUNLOCK(m_hmem1);
  145. cwchStem = cbStemWord = (DWORD)(*((WORD *)lpbStemWord));
  146. hr = ReallocBuffer(&m_hmem1, &m_cbBuf1Cur,
  147. sizeof (WCHAR) * cbStemWord);
  148. // Relock buffer even if we've failed the realloc
  149. // so that the unlock we do later is valid. An
  150. // unconditional relock is OK because ReallocBuffer
  151. // won't invalidate the original m_hmem1 if it fails.
  152. lpwchStem = (WCHAR *) _GLOBALLOCK(m_hmem1);
  153. // Convert the stem word back to Unicode so that we can
  154. // call the stem sink.
  155. if ((cwchStem =
  156. MultiByteToWideChar(m_stemctl.dwCodePageID, NULL,
  157. (LPCSTR)lpbStemWord + sizeof(WORD), cbStemWord,
  158. lpwchStem, cwchStem)) > 0)
  159. {
  160. // Send the raw word to the word sink.
  161. hr = pStemSink->PutWord(lpwchStem, cwchStem);
  162. }
  163. else
  164. hr = E_UNEXPECTED;
  165. }
  166. _GLOBALUNLOCK(m_hmem2);
  167. }
  168. }
  169. else
  170. hr = E_UNEXPECTED;
  171. _GLOBALUNLOCK(m_hmem1);
  172. }
  173. m_cs.Unlock();
  174. return (hr);
  175. }
  176. /*****************************************************************
  177. * @method STDMETHODIMP | IStemmer | GetLicenseToUse |
  178. *
  179. * Not yet implemented
  180. *
  181. ****************************************************************/
  182. STDMETHODIMP
  183. CITEngStemmer::GetLicenseToUse(WCHAR const **ppwcsLicense)
  184. {
  185. return (E_NOTIMPL);
  186. }
  187. //---------------------------------------------------------------------------
  188. // IStemmerConfig Method Implementations
  189. //---------------------------------------------------------------------------
  190. /*****************************************************************
  191. * @method STDMETHODIMP | IStemmerConfig | SetLocaleInfo |
  192. * Sets locale information that affects the stemming
  193. * behavior of IStemmer::StemWord.
  194. * @parm DWORD | dwCodePageID | ANSI code page no. specified at build time.
  195. * @parm LCID | lcid | Win32 locale identifier specified at build time.
  196. *
  197. * @rvalue S_OK | Locale described by the parameters is supported
  198. * @rvalue E_INVALIDARG | Locale described by the parameters is not supported.
  199. *
  200. *
  201. ****************************************************************/
  202. STDMETHODIMP
  203. CITEngStemmer::SetLocaleInfo(DWORD dwCodePageID, LCID lcid)
  204. {
  205. if (!m_fInitialized)
  206. return (SetErrReturn(E_NOTOPEN));
  207. if (PRIMARYLANGID(LANGIDFROMLCID(lcid)) != LANG_ENGLISH)
  208. return (SetErrReturn(E_INVALIDARG));
  209. m_cs.Lock();
  210. m_stemctl.dwCodePageID = dwCodePageID;
  211. m_stemctl.lcid = lcid;
  212. m_fDirty = TRUE;
  213. m_cs.Unlock();
  214. return (S_OK);
  215. }
  216. /*****************************************************************
  217. * @method STDMETHODIMP | IStemmerConfig | GetLocaleInfo |
  218. * Gets locale information that affects the stemming
  219. * behavior of IStemmer::StemWord.
  220. * @parm DWORD | *pdwCodePageID | Pointer to code page identifier
  221. * @parm LCID | *plcid | Pointer to Win32 locale identifier.
  222. *
  223. * @rvalue S_OK | Locale described by the parameters is supported
  224. * @rvalue E_INVALIDARG | Locale described by the parameters is not supported.
  225. *
  226. *
  227. ****************************************************************/
  228. STDMETHODIMP
  229. CITEngStemmer::GetLocaleInfo(DWORD *pdwCodePageID, LCID *plcid)
  230. {
  231. if (pdwCodePageID == NULL || plcid == NULL)
  232. return (SetErrReturn(E_POINTER));
  233. if (!m_fInitialized)
  234. return (SetErrReturn(E_NOTOPEN));
  235. m_cs.Lock();
  236. *pdwCodePageID = m_stemctl.dwCodePageID;
  237. *plcid = m_stemctl.lcid;
  238. m_cs.Unlock();
  239. return (S_OK);
  240. }
  241. /*****************************************************************
  242. * @method STDMETHODIMP | IStemmerConfig | SetControlInfo |
  243. * Sets information that controls certain aspects of stemming.
  244. *
  245. * @parm DWORD | grfStemFlags | Flags that control stemming behavior.
  246. * @parm DWORD | dwReserved | Reserved for future use.
  247. *
  248. * @rvalue S_OK | The operation completed successfully.
  249. *
  250. * @comm
  251. * In the future, additional information may be passed in through
  252. * dwReserved.
  253. ****************************************************************/
  254. STDMETHODIMP
  255. CITEngStemmer::SetControlInfo(DWORD grfStemFlags, DWORD dwReserved)
  256. {
  257. DWORD grfFlagsUnsupported;
  258. if (!m_fInitialized)
  259. return (SetErrReturn(E_NOTOPEN));
  260. grfFlagsUnsupported = ~(0);
  261. if ((grfStemFlags & grfFlagsUnsupported) != 0)
  262. return (SetErrReturn(E_INVALIDARG));
  263. m_cs.Lock();
  264. m_stemctl.grfStemFlags = grfStemFlags;
  265. m_fDirty = TRUE;
  266. m_cs.Unlock();
  267. return (S_OK);
  268. }
  269. /*****************************************************************
  270. * @method STDMETHODIMP | IStemmerConfig | GetControlInfo |
  271. * Gets information that controls stemming behavior.
  272. *
  273. * @parm DWORD | *pgrfStemFlags | Pointer to flags that control stemming behavior.
  274. * @parm DWORD | *pdwReserved | Reserved for future use.
  275. *
  276. * @rvalue S_OK | The operation completed successfully.
  277. *
  278. ****************************************************************/
  279. STDMETHODIMP
  280. CITEngStemmer::GetControlInfo(DWORD *pgrfStemFlags, DWORD *pdwReserved)
  281. {
  282. if (pgrfStemFlags == NULL)
  283. return (SetErrReturn(E_POINTER));
  284. if (!m_fInitialized)
  285. return (SetErrReturn(E_NOTOPEN));
  286. *pgrfStemFlags = m_stemctl.grfStemFlags;
  287. return (S_OK);
  288. }
  289. /*****************************************************************
  290. * @method STDMETHODIMP | IStemmerConfig | LoadExternalStemmerData |
  291. * Loads external stemmer data, such as word part lists.
  292. *
  293. * @parm IStream | *pStream | Pointer to stream object containing
  294. * stenner data.
  295. * @parm DWORD | dwExtDataType | Data type.
  296. *
  297. * @comm
  298. * Not implemented yet.
  299. ****************************************************************/
  300. STDMETHODIMP
  301. CITEngStemmer::LoadExternalStemmerData(IStream *pStream, DWORD dwExtDataType)
  302. {
  303. if (!m_fInitialized)
  304. return (SetErrReturn(E_NOTOPEN));
  305. return (E_NOTIMPL);
  306. }
  307. //---------------------------------------------------------------------------
  308. // IPersistStreamInit Method Implementations
  309. //---------------------------------------------------------------------------
  310. STDMETHODIMP
  311. CITEngStemmer::GetClassID(CLSID *pclsid)
  312. {
  313. if (pclsid == NULL)
  314. return (SetErrReturn(E_POINTER));
  315. *pclsid = CLSID_ITEngStemmer;
  316. return (S_OK);
  317. }
  318. STDMETHODIMP
  319. CITEngStemmer::IsDirty(void)
  320. {
  321. if (!m_fInitialized)
  322. return (SetErrReturn(E_NOTOPEN));
  323. return (m_fDirty ? S_OK : S_FALSE);
  324. }
  325. STDMETHODIMP
  326. CITEngStemmer::Load(IStream *pStream)
  327. {
  328. HRESULT hr;
  329. DWORD dwVersion;
  330. DWORD grfPersistedItems;
  331. DWORD cbRead;
  332. if (pStream == NULL)
  333. return (SetErrReturn(E_POINTER));
  334. // Lock before checking m_fInitialized to make sure we don't compete
  335. // with a call to ::InitNew.
  336. m_cs.Lock();
  337. if (m_fInitialized)
  338. return (SetErrReturn(E_ALREADYOPEN));
  339. if (SUCCEEDED(hr = pStream->Read((LPVOID) &dwVersion, sizeof(DWORD),
  340. &cbRead)) &&
  341. SUCCEEDED(hr = ((cbRead == sizeof(DWORD)) ? S_OK : E_BADFORMAT)) &&
  342. SUCCEEDED(hr = ((dwVersion == VERSION_ENGSTEMMER) ? S_OK :
  343. E_BADVERSION)) &&
  344. SUCCEEDED(hr = pStream->Read((LPVOID) &grfPersistedItems,
  345. sizeof(DWORD), &cbRead)) &&
  346. SUCCEEDED(hr = ((cbRead == sizeof(DWORD)) ? S_OK : E_BADFORMAT)) &&
  347. grfPersistedItems != 0)
  348. {
  349. if ((grfPersistedItems & ITSTDBRK_PERSISTED_STEMCTL) != 0)
  350. {
  351. if (SUCCEEDED(hr =
  352. pStream->Read((LPVOID) &m_stemctl, sizeof(STEMCTL), &cbRead)))
  353. hr = ((cbRead == sizeof(STEMCTL)) ? S_OK : E_BADFORMAT);
  354. }
  355. else
  356. {
  357. // It is a surprise not to find the STEMCTL structure in the stream,
  358. // but we can continue on because we will initialize the structure
  359. // with good defaults before we exit this routine.
  360. ITASSERT(FALSE);
  361. }
  362. }
  363. if (SUCCEEDED(hr))
  364. {
  365. if ((grfPersistedItems & ITSTDBRK_PERSISTED_STEMCTL) == 0)
  366. {
  367. InitStemCtl();
  368. // Set flag in case we're asked to save.
  369. grfPersistedItems |= ITSTDBRK_PERSISTED_STEMCTL;
  370. }
  371. m_grfPersistedItems = grfPersistedItems;
  372. m_fInitialized = TRUE;
  373. }
  374. else
  375. // Free any peristed items which may have been loaded successfully.
  376. Close();
  377. m_cs.Unlock();
  378. return (hr);
  379. }
  380. STDMETHODIMP
  381. CITEngStemmer::Save(IStream *pStream, BOOL fClearDirty)
  382. {
  383. HRESULT hr;
  384. DWORD dwVersion;
  385. DWORD cbWritten;
  386. if (pStream == NULL)
  387. return (SetErrReturn(E_POINTER));
  388. if (!m_fInitialized)
  389. return (SetErrReturn(E_NOTOPEN));
  390. m_cs.Lock();
  391. dwVersion = VERSION_ENGSTEMMER;
  392. if (SUCCEEDED(hr = pStream->Write((LPVOID) &dwVersion, sizeof(DWORD),
  393. &cbWritten)) &&
  394. SUCCEEDED(hr = pStream->Write((LPVOID) &m_grfPersistedItems,
  395. sizeof(DWORD), &cbWritten)))
  396. {
  397. if ((m_grfPersistedItems & ITSTDBRK_PERSISTED_STEMCTL) != 0)
  398. hr = pStream->Write((LPVOID) &m_stemctl, sizeof(STEMCTL),
  399. &cbWritten);
  400. else
  401. {
  402. // We should always be writing the STEMCTL structure, but if for
  403. // some reason the flag to write it is not set, we can still continue
  404. // because at load time we will tolerate the absence of the struct.
  405. ITASSERT(FALSE);
  406. }
  407. }
  408. if (SUCCEEDED(hr) && fClearDirty)
  409. m_fDirty = FALSE;
  410. m_cs.Unlock();
  411. return (hr);
  412. }
  413. STDMETHODIMP
  414. CITEngStemmer::GetSizeMax(ULARGE_INTEGER *pcbSizeMax)
  415. {
  416. return (E_NOTIMPL);
  417. }
  418. STDMETHODIMP
  419. CITEngStemmer::InitNew(void)
  420. {
  421. // Lock before checking m_fInitialized to make sure we don't compete
  422. // with a call to ::Load.
  423. m_cs.Lock();
  424. if (m_fInitialized)
  425. return (SetErrReturn(E_ALREADYOPEN));
  426. InitStemCtl();
  427. m_grfPersistedItems |= ITSTDBRK_PERSISTED_STEMCTL;
  428. m_fInitialized = TRUE;
  429. m_cs.Unlock();
  430. return (S_OK);
  431. }
  432. //---------------------------------------------------------------------------
  433. // Private Method Implementations
  434. //---------------------------------------------------------------------------
  435. HRESULT
  436. CITEngStemmer::ReallocBuffer(HGLOBAL *phmemBuf, DWORD *pcbBufCur, DWORD cbBufNew)
  437. {
  438. HRESULT hr = S_OK;
  439. m_cs.Lock();
  440. hr = ReallocBufferHmem(phmemBuf, pcbBufCur, max(cbBufNew, cbAnsiBufInit));
  441. m_cs.Unlock();
  442. return (hr);
  443. }
  444. void
  445. CITEngStemmer::ClearMembers(void)
  446. {
  447. MEMSET(&m_stemctl, NULL, sizeof(STEMCTL));
  448. m_fInitialized = m_fDirty = FALSE;
  449. m_grfPersistedItems = 0;
  450. }
  451. void
  452. CITEngStemmer::InitStemCtl(void)
  453. {
  454. m_stemctl.dwCodePageID = GetACP();
  455. // If the user default language is not English, we'll store the
  456. // value and check it in IStemmer::Init and ::StemWord.
  457. m_stemctl.lcid = GetUserDefaultLCID();
  458. m_stemctl.grfStemFlags = 0;
  459. }
  460. void
  461. CITEngStemmer::Close(void)
  462. {
  463. if (m_hmem1 != NULL)
  464. {
  465. _GLOBALFREE(m_hmem1);
  466. m_hmem1 = NULL;
  467. m_cbBuf1Cur = 0;
  468. }
  469. if (m_hmem2 != NULL)
  470. {
  471. _GLOBALFREE(m_hmem2);
  472. m_hmem2 = NULL;
  473. m_cbBuf2Cur = 0;
  474. }
  475. ClearMembers();
  476. }