Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

616 lines
17 KiB

  1. //+---------------------------------------------------------------------------
  2. //
  3. // Microsoft Windows
  4. // Copyright (C) Microsoft Corporation, 1991 - 2000
  5. //
  6. // File: KEYMAK.CXX
  7. //
  8. // Contents: Key maker
  9. //
  10. // Classes: CKeyMaker
  11. //
  12. // History: 31-Jan-92 BartoszM Created
  13. // 24-Apr-95 SitaramR Removed US/Fake stemmer and added
  14. // Infosoft stemmer
  15. //
  16. // Notes: The filtering pipeline is hidden in the Data Repository
  17. // object which serves as a sink for the filter.
  18. // The sink for the Data Repository is the Key Repository.
  19. // The language dependent part of the pipeline
  20. // is obtained from the Language List object and is called
  21. // Language Dependent Key Maker. It consists of:
  22. //
  23. // Word Breaker
  24. // Stemmer (optional)
  25. // Normalizer
  26. // Noise List
  27. //
  28. // Each object serves as a sink for its predecessor,
  29. // Key Repository is the final sink.
  30. //
  31. //----------------------------------------------------------------------------
  32. #include <pch.cxx>
  33. #pragma hdrstop
  34. #include <lang.hxx>
  35. #include <keymak.hxx>
  36. #include <noise.hxx>
  37. #include <norm.hxx>
  38. #include <stemsink.hxx>
  39. //+---------------------------------------------------------------------------
  40. //
  41. // Member: CKeyMaker::CKeyMaker
  42. //
  43. // Synopsis: Constructs a language-dependant key maker object
  44. //
  45. // Effects: Creates a noiselist, normalizer and borrows a wordbreaker, stemmer
  46. //
  47. // Arguments: [locale] -- language locale
  48. // [krep] -- key repository to place completed keys in
  49. // [pPhraseSink] -- sink for collecting phrases
  50. // [fQuery] -- true if this is during querying
  51. // [ulFuzzy] -- fuzzy level of query
  52. //
  53. // History: 05-June-91 t-WadeR Created.
  54. // 12-Oct-92 AmyA Added Unicode support
  55. //
  56. //----------------------------------------------------------------------------
  57. CKeyMaker::CKeyMaker( LCID locale,
  58. PROPID pid,
  59. PKeyRepository& krep,
  60. IPhraseSink *pPhraseSink,
  61. BOOL fQuery,
  62. ULONG ulFuzzy,
  63. CLangList & langList )
  64. : _pPhraseSink(pPhraseSink),
  65. _fQuery( fQuery ),
  66. _sLang( locale, pid, &langList, fQuery ? LANG_LOAD_ALL : LANG_LOAD_NO_STEMMER ),
  67. _lcid( locale ),
  68. _pid( pid )
  69. {
  70. krep.GetSourcePosBuffers (&_pcwcSrcPos, &_pcwcSrcLen );
  71. CStringTable* noiseTable;
  72. //
  73. // Don't remove noise words if we're doing prefix matching. The noise
  74. // *word* is potentially only a prefix for a non-noise word.
  75. //
  76. if (GENERATE_METHOD_PREFIX == ulFuzzy )
  77. noiseTable = 0;
  78. else
  79. noiseTable = _sLang->GetNoiseTable();
  80. if ( noiseTable != 0 )
  81. _xNoiseList.Set( new CNoiseList( *noiseTable, krep ) );
  82. else
  83. _xNoiseList.Set( new CNoiseListEmpty( krep, ulFuzzy ) );
  84. _xWordRep.Set( new CNormalizer( _xNoiseList.GetReference() ) );
  85. // Get Normalizer's buffer length
  86. _cwcMaxNormBuf = _xWordRep->GetMaxBufferLen();
  87. // get stemmer (optional)
  88. if ( ulFuzzy == GENERATE_METHOD_STEMMED )
  89. {
  90. IStemmer *pStemmer = _sLang->GetStemmer();
  91. if ( pStemmer )
  92. {
  93. BOOL fCopyright;
  94. SCODE sc = pStemmer->Init( _cwcMaxNormBuf, &fCopyright );
  95. if ( FAILED(sc) )
  96. {
  97. ciDebugOut(( DEB_ERROR, "IStemmer::Init returned 0x%x\n", sc ));
  98. THROW( CException( sc ) );
  99. }
  100. if ( fCopyright )
  101. {
  102. WCHAR const * pLicense;
  103. sc = pStemmer->GetLicenseToUse( &pLicense );
  104. if ( SUCCEEDED(sc) )
  105. {
  106. ciDebugOut(( DEB_WORDS, "%ws\n", pLicense ));
  107. }
  108. else
  109. {
  110. ciDebugOut(( DEB_ERROR, "IStemmer::GetLicenseToUse returned 0x%x\n", sc ));
  111. THROW( CException( sc ) );
  112. }
  113. }
  114. _xWordRep2.Set( _xWordRep.Acquire() );
  115. _xWordRep.Set( new CStemmerSink( pStemmer, _xWordRep2.GetReference() ) );
  116. }
  117. else
  118. {
  119. ciDebugOut(( DEB_ERROR,
  120. "Fuzzy2 query, but no stemmer available for locale 0x%x\n",
  121. locale ));
  122. }
  123. }
  124. //
  125. // Initialize word breaker
  126. //
  127. _pWBreak = _sLang->GetWordBreaker();
  128. Win4Assert( _pWBreak );
  129. BOOL fCopyright;
  130. SCODE sc = _pWBreak->Init( fQuery, _cwcMaxNormBuf, &fCopyright );
  131. if ( FAILED(sc) )
  132. {
  133. ciDebugOut(( DEB_ERROR, "IWordBreaker::Init returned 0x%x\n", sc ));
  134. THROW( CException( sc ) );
  135. }
  136. if ( fCopyright )
  137. {
  138. WCHAR const * pLicense;
  139. sc = _pWBreak->GetLicenseToUse( &pLicense );
  140. if ( SUCCEEDED(sc) )
  141. {
  142. ciDebugOut(( DEB_WORDS, "%ws\n", pLicense ));
  143. }
  144. else
  145. {
  146. ciDebugOut(( DEB_ERROR, "IWordBreaker::GetLicenseToUse returned 0x%x\n", sc ));
  147. THROW( CException( sc ) );
  148. }
  149. }
  150. } //CKeyMaker
  151. //+---------------------------------------------------------------------------
  152. //
  153. // Member: CKeyMaker::CKeyMaker
  154. //
  155. // Synopsis: Constructs key maker for noise word list initialization.
  156. //
  157. // Arguments: [pWBreak] -- word breaker
  158. // [Noise] -- noise word list
  159. //
  160. // History: 05-June-91 t-WadeR Created.
  161. // 12-Oct-92 AmyA Added Unicode support
  162. //
  163. //----------------------------------------------------------------------------
  164. CKeyMaker::CKeyMaker( IWordBreaker * pWBreak, PNoiseList & Noise )
  165. : _pWBreak( pWBreak ),
  166. _pPhraseSink(0),
  167. _fQuery(FALSE)
  168. {
  169. _xWordRep.Set( new CNormalizer( Noise ) );
  170. // Get Normalizer's buffer length
  171. _cwcMaxNormBuf = _xWordRep->GetMaxBufferLen();
  172. _pcwcSrcPos = 0; // We don't use them!
  173. _pcwcSrcLen = 0;
  174. //
  175. // Initialize word breaker
  176. //
  177. Win4Assert( _pWBreak );
  178. BOOL fCopyright;
  179. SCODE sc = _pWBreak->Init( FALSE, _cwcMaxNormBuf, &fCopyright );
  180. if ( FAILED(sc) )
  181. {
  182. ciDebugOut(( DEB_ERROR, "IWordBreaker::Init returned 0x%x\n", sc ));
  183. THROW( CException( sc ) );
  184. }
  185. if ( fCopyright )
  186. {
  187. WCHAR const * pLicense;
  188. sc = _pWBreak->GetLicenseToUse( &pLicense );
  189. if ( SUCCEEDED(sc) )
  190. {
  191. ciDebugOut(( DEB_WORDS, "%ws\n", pLicense ));
  192. }
  193. else
  194. {
  195. ciDebugOut(( DEB_ERROR, "IWordBreaker::GetLicenseToUse returned 0x%x\n", sc ));
  196. THROW( CException( sc ) );
  197. }
  198. }
  199. } //CKeyMaker
  200. //+---------------------------------------------------------------------------
  201. //
  202. // Member: CKeyMaker::~CKeyMaker
  203. //
  204. // Synopsis: destroys a key maker object
  205. //
  206. // History: 05-June-91 t-WadeR Created.
  207. //
  208. //----------------------------------------------------------------------------
  209. CKeyMaker::~CKeyMaker()
  210. {
  211. }
  212. //
  213. // The following are needed to make midl happy. There are no other interfaces
  214. // to bind to. Inheritance from IUnknown is unnecessary.
  215. //
  216. SCODE STDMETHODCALLTYPE CKeyMaker::QueryInterface(REFIID riid, void * * ppvObject)
  217. {
  218. *ppvObject = 0;
  219. return( E_NOTIMPL );
  220. }
  221. ULONG STDMETHODCALLTYPE CKeyMaker::AddRef()
  222. {
  223. return( 1 );
  224. }
  225. ULONG STDMETHODCALLTYPE CKeyMaker::Release()
  226. {
  227. return( 1 );
  228. }
  229. //+-------------------------------------------------------------------------
  230. //
  231. // Method: CKeyMaker::PutWord
  232. //
  233. // Synopsis: Store word in word repository
  234. //
  235. // Arguments: [cwc] -- Count of characters in [pwcInBuf]
  236. // [pwcInBuf] -- Word
  237. // [cwcSrcLen] -- count of characters in pTextSource buffer (see IWordBreaker::BreakText)
  238. // [cwcSrcPos] -- position of word in pTextSource buffer
  239. //
  240. // History: 19-Apr-1994 KyleP Created
  241. //
  242. //--------------------------------------------------------------------------
  243. SCODE STDMETHODCALLTYPE CKeyMaker::PutWord( ULONG cwc,
  244. WCHAR const *pwcInBuf,
  245. ULONG cwcSrcLen,
  246. ULONG cwcSrcPos )
  247. {
  248. SCODE sc = S_OK;
  249. // validate PutWord call
  250. if ( !_altWordsEnforcer.IsPutWordOk() )
  251. {
  252. Win4Assert( !"CKeyMaker::PutWord - invalid state" );
  253. ciDebugOut(( DEB_ITRACE, "PutWord: %.*ws\n", cwc, pwcInBuf ));
  254. return E_FAIL;
  255. }
  256. CTranslateSystemExceptions translate;
  257. TRY
  258. {
  259. if ( cwc > _cwcMaxNormBuf )
  260. {
  261. sc = LANGUAGE_S_LARGE_WORD;
  262. cwc = _cwcMaxNormBuf;
  263. }
  264. if ( cwc > 0 )
  265. {
  266. #if CIDBG == 1
  267. if ( ciInfoLevel & DEB_WORDS )
  268. {
  269. //
  270. // Check for 'printable' characters.
  271. //
  272. BOOL fOk = TRUE;
  273. for ( unsigned i = 0; i < cwc; i++ )
  274. {
  275. if ( pwcInBuf[i] > 0xFF )
  276. {
  277. fOk = FALSE;
  278. break;
  279. }
  280. }
  281. if ( fOk )
  282. ciDebugOut(( DEB_WORDS,
  283. "PutWord: \"%.*ws\" Occ = %d cwcSrcLen = %d, cwcSrcPos = %d\n",
  284. cwc, pwcInBuf, _xWordRep->GetOccurrence(), cwcSrcLen, cwcSrcPos ));
  285. else
  286. {
  287. ciDebugOut(( DEB_WORDS, "PutWord:" ));
  288. for ( i = 0; i < cwc; i++ )
  289. ciDebugOut(( DEB_WORDS | DEB_NOCOMPNAME, " %04X", pwcInBuf[i] ));
  290. ciDebugOut(( DEB_WORDS | DEB_NOCOMPNAME,
  291. " Occ = %d cwcSrcLen = %d, cwcSrcPos = %d\n",
  292. _xWordRep->GetOccurrence(), cwcSrcLen, cwcSrcPos ));
  293. }
  294. }
  295. #endif // CIDBG
  296. //
  297. // No internal call to PutAltWord for performance reasons.
  298. //
  299. if (0 != _pcwcSrcPos)
  300. {
  301. Win4Assert ( 0 != _pcwcSrcLen );
  302. *_pcwcSrcLen = cwcSrcLen;
  303. *_pcwcSrcPos = cwcSrcPos;
  304. }
  305. _xWordRep->ProcessWord( pwcInBuf, cwc );
  306. }
  307. }
  308. CATCH( CException, e )
  309. {
  310. sc = e.GetErrorCode();
  311. }
  312. END_CATCH;
  313. return sc;
  314. } //PutWord
  315. //+-------------------------------------------------------------------------
  316. //
  317. // Method: CKeyMaker::PutAltWord
  318. //
  319. // Synopsis: Store alternate word in word repository.
  320. //
  321. // Effects: Identical to PutWord except occurrence count is not
  322. // incremented.
  323. //
  324. // Arguments: [cwc] -- Count of characters in [pwcInBuf]
  325. // [pwcInBuf] -- Word
  326. // [cwcSrcLen] -- count of characters in pTextSource buffer (see IWordBreaker::BreakText)
  327. // [cwcSrcPos] -- position of word in pTextSource buffer
  328. //
  329. // History: 19-Apr-1994 KyleP Created
  330. //
  331. //--------------------------------------------------------------------------
  332. SCODE STDMETHODCALLTYPE CKeyMaker::PutAltWord( ULONG cwc,
  333. WCHAR const *pwcInBuf,
  334. ULONG cwcSrcLen,
  335. ULONG cwcSrcPos )
  336. {
  337. SCODE sc = S_OK;
  338. // validate PutWord call
  339. if ( !_altWordsEnforcer.IsPutAltWordOk() )
  340. {
  341. Win4Assert( !"CKeyMaker::PutAltWord - invalid state" );
  342. ciDebugOut(( DEB_ITRACE, "PutAltWord: %.*ws\n", cwc, pwcInBuf ));
  343. return E_FAIL;
  344. }
  345. CTranslateSystemExceptions translate;
  346. TRY
  347. {
  348. //
  349. // What is to be done if two large, alternate words end up with the
  350. // same (truncated) prefix after truncation ?
  351. // This is fixed in Babylon and isn't a problem here.
  352. //
  353. if ( cwc > _cwcMaxNormBuf )
  354. {
  355. sc = LANGUAGE_S_LARGE_WORD;
  356. cwc = _cwcMaxNormBuf;
  357. }
  358. if ( cwc > 0 )
  359. {
  360. ciDebugOut(( DEB_WORDS,
  361. "PutAltWord: \"%.*ws\" Occ = %d cwcSrcLen = %d, cwcSrcPos = %d\n",
  362. cwc, pwcInBuf, _xWordRep->GetOccurrence(), cwcSrcLen, cwcSrcPos ));
  363. if (0 != _pcwcSrcPos)
  364. {
  365. Win4Assert ( 0 != _pcwcSrcLen );
  366. *_pcwcSrcLen = cwcSrcLen;
  367. *_pcwcSrcPos = cwcSrcPos;
  368. }
  369. _xWordRep->ProcessAltWord( pwcInBuf, cwc );
  370. }
  371. }
  372. CATCH( CException, e )
  373. {
  374. sc = e.GetErrorCode();
  375. }
  376. END_CATCH;
  377. return sc;
  378. } //PutAltWord
  379. //+-------------------------------------------------------------------------
  380. //
  381. // Method: CKeyMaker::StartAltPhrase
  382. //
  383. // Synopsis: Pass on StartAltPhrase to word repository
  384. //
  385. // History: 24-Apr-1994 KyleP Created
  386. //
  387. //--------------------------------------------------------------------------
  388. SCODE STDMETHODCALLTYPE CKeyMaker::StartAltPhrase()
  389. {
  390. SCODE sc = S_OK;
  391. CTranslateSystemExceptions translate;
  392. TRY
  393. {
  394. if ( _fQuery )
  395. {
  396. // validate StartAltPhrase call
  397. if ( !_altWordsEnforcer.IsStartAltPhraseOk() || !_altPhrasesEnforcer.IsStartAltPhraseOk() )
  398. {
  399. Win4Assert( !"CKeyMaker::StartAltPhrase - invalid state" );
  400. THROW( CException( E_FAIL ) );
  401. }
  402. _xWordRep->StartAltPhrase();
  403. }
  404. else
  405. sc = WBREAK_E_QUERY_ONLY;
  406. }
  407. CATCH( CException, e )
  408. {
  409. sc = e.GetErrorCode();
  410. }
  411. END_CATCH;
  412. return sc;
  413. } //StartAltPhrase
  414. //+-------------------------------------------------------------------------
  415. //
  416. // Method: CKeyMaker::EndAltPhrase
  417. //
  418. // Synopsis: Pass on EndAltPhrase to word repository
  419. //
  420. // History: 24-Apr-1994 KyleP Created
  421. //
  422. //--------------------------------------------------------------------------
  423. SCODE STDMETHODCALLTYPE CKeyMaker::EndAltPhrase()
  424. {
  425. SCODE sc = S_OK;
  426. CTranslateSystemExceptions translate;
  427. TRY
  428. {
  429. if ( _fQuery )
  430. {
  431. // validate EndAltPhrase call
  432. if ( !_altWordsEnforcer.IsEndAltPhraseOk() || !_altPhrasesEnforcer.IsEndAltPhraseOk() )
  433. {
  434. Win4Assert( !"CKeyMaker::EndAltPhrase - invalid state" );
  435. THROW( CException( E_FAIL ) );
  436. }
  437. _xWordRep->EndAltPhrase();
  438. }
  439. else
  440. sc = WBREAK_E_QUERY_ONLY;
  441. }
  442. CATCH( CException, e )
  443. {
  444. sc = e.GetErrorCode();
  445. }
  446. END_CATCH;
  447. return sc;
  448. } //EndAltPhrase
  449. //+-------------------------------------------------------------------------
  450. //
  451. // Method: CKeyMaker::PutBreak
  452. //
  453. // Synopsis: Increment the occurrence count appropriately
  454. //
  455. // History: 24-Apr-1994 KyleP Created
  456. //
  457. //--------------------------------------------------------------------------
  458. SCODE STDMETHODCALLTYPE CKeyMaker::PutBreak( WORDREP_BREAK_TYPE breakType )
  459. {
  460. // We are modeling PutBreak by a skip of the appropriate number of noise words
  461. switch ( breakType )
  462. {
  463. case WORDREP_BREAK_EOW:
  464. _xWordRep->SkipNoiseWords( 1 );
  465. break;
  466. case WORDREP_BREAK_EOS:
  467. _xWordRep->SkipNoiseWords( 8 );
  468. break;
  469. case WORDREP_BREAK_EOP:
  470. _xWordRep->SkipNoiseWords( 128 );
  471. break;
  472. case WORDREP_BREAK_EOC:
  473. _xWordRep->SkipNoiseWords( 1024 );
  474. break;
  475. default:
  476. ciDebugOut(( DEB_ERROR,
  477. "CKeyMaker::PutBreak -- Bad break type %d\n",
  478. breakType ));
  479. return( E_FAIL );
  480. }
  481. return( S_OK );
  482. } //PutBreak
  483. //+-------------------------------------------------------------------------
  484. //
  485. // Method: CKeyMaker::Supports
  486. //
  487. // Synopsis: Checks if the pid/lang are supported by the language object
  488. //
  489. // Arguments: [pid] -- The property ID
  490. // [lcid] -- The locale
  491. //
  492. // Returns: TRUE if it is supported
  493. //
  494. // History: 24-Apr-1994 KyleP Created
  495. //
  496. //--------------------------------------------------------------------------
  497. BOOL CKeyMaker::Supports( PROPID pid, LCID lcid )
  498. {
  499. if ( (lcid == _lcid) && (pid == _pid) )
  500. return TRUE;
  501. else
  502. return _sLang.Supports( pid, lcid );
  503. } //Supports
  504. //+---------------------------------------------------------------------------
  505. //
  506. // Member: CKeyMaker::NormalizeWStr - Public
  507. //
  508. // Synopsis: Normalizes a UniCode string
  509. //
  510. // Arguments: [pwcInBuf] -- input buffer
  511. // [cwcInBuf] -- count of chars in pwcInBuf
  512. // [pbOutBuf] -- output buffer.
  513. // [pcbOutBuf] - pointer to output count of bytes.
  514. //
  515. // History: 10-Feb-2000 KitmanH Created
  516. //
  517. //----------------------------------------------------------------------------
  518. void CKeyMaker::NormalizeWStr( WCHAR const *pwcInBuf,
  519. ULONG cwcInBuf,
  520. BYTE *pbOutBuf,
  521. unsigned *pcbOutBuf )
  522. {
  523. _xWordRep->NormalizeWStr( pwcInBuf,
  524. cwcInBuf,
  525. pbOutBuf,
  526. pcbOutBuf );
  527. }