Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

742 lines
22 KiB

  1. //+---------------------------------------------------------------------------
  2. //
  3. // Microsoft Windows
  4. // Copyright (C) Microsoft Corporation, 1991 - 1994.
  5. //
  6. // File: DefBreak.cxx
  7. //
  8. // Contents: Text Word Breaker
  9. //
  10. // History: 08-May-91 t-WadeR Created stubs, filled in ASCII code.
  11. // 06-Jun-91 t-WadeR Changed to use input-based pipeline
  12. // 11-Apr-92 KyleP Sync to spec
  13. //
  14. //----------------------------------------------------------------------------
  15. #include <pch.cxx>
  16. #pragma hdrstop
  17. #include <DefBreak.hxx>
  18. //+---------------------------------------------------------------------------
  19. //
  20. // Member: CDefWordBreaker::CDefWordBreaker
  21. //
  22. // Synopsis: Constructor for the CDefWordBreaker class.
  23. //
  24. // History: 07-June-91 t-WadeR Created
  25. // 12-Oct-92 AmyA Added Unicode support
  26. //
  27. //----------------------------------------------------------------------------
  28. CDefWordBreaker::CDefWordBreaker()
  29. : _cRefs(1)
  30. {
  31. ciDebugOut(( DEB_ITRACE, "Creating default wordbreaker\n" ));
  32. // Look at IsWordChar. We don't want the last non-breaking
  33. // space in the chunk to be considered a word break.
  34. // It will be processed again (correctly) when we move to the next chunk.
  35. _aCharInfo3 [CDefWordBreaker::ccCompare] = C3_NONSPACING;
  36. }
  37. //+---------------------------------------------------------------------------
  38. //
  39. // Member: CWordBreaker::~CWordBreaker
  40. //
  41. // Synopsis: Destructor for the CWordBreaker class.
  42. //
  43. //----------------------------------------------------------------------------
  44. CDefWordBreaker::~CDefWordBreaker()
  45. {
  46. }
  47. //+-------------------------------------------------------------------------
  48. //
  49. // Method: CDefWordBreaker::QueryInterface
  50. //
  51. // Synopsis: Rebind to other interface
  52. //
  53. // Arguments: [riid] -- IID of new interface
  54. // [ppvObject] -- New interface * returned here
  55. //
  56. // Returns: S_OK if bind succeeded, E_NOINTERFACE if bind failed
  57. //
  58. // History: 23-Feb-1994 KyleP Created
  59. //
  60. //--------------------------------------------------------------------------
  61. SCODE STDMETHODCALLTYPE CDefWordBreaker::QueryInterface( REFIID riid,
  62. void ** ppvObject)
  63. {
  64. if ( 0 == ppvObject )
  65. return E_INVALIDARG;
  66. if ( IID_IWordBreaker == riid )
  67. *ppvObject = (IUnknown *)(IWordBreaker *)this;
  68. else if ( IID_IUnknown == riid )
  69. *ppvObject = (IUnknown *)(IPersist *)(IPersistFile *)this;
  70. else
  71. {
  72. *ppvObject = 0;
  73. return E_NOINTERFACE;
  74. }
  75. AddRef();
  76. return S_OK;
  77. }
  78. //+-------------------------------------------------------------------------
  79. //
  80. // Method: CDefWordBreaker::AddRef
  81. //
  82. // Synopsis: Increments refcount
  83. //
  84. // History: 23-Feb-1994 KyleP Created
  85. //
  86. //--------------------------------------------------------------------------
  87. ULONG STDMETHODCALLTYPE CDefWordBreaker::AddRef()
  88. {
  89. return InterlockedIncrement( &_cRefs );
  90. }
  91. //+-------------------------------------------------------------------------
  92. //
  93. // Method: CDefWordBreaker::Release
  94. //
  95. // Synopsis: Decrement refcount. Delete if necessary.
  96. //
  97. // History: 23-Feb-1994 KyleP Created
  98. //
  99. //--------------------------------------------------------------------------
  100. ULONG STDMETHODCALLTYPE CDefWordBreaker::Release()
  101. {
  102. unsigned long uTmp = InterlockedDecrement( &_cRefs );
  103. if ( 0 == uTmp )
  104. delete this;
  105. return uTmp;
  106. }
  107. //+-------------------------------------------------------------------------
  108. //
  109. // Method: CDefWordBreaker::Init
  110. //
  111. // Synopsis: Initialize word-breaker
  112. //
  113. // Arguments: [fQuery] -- TRUE if query-time
  114. // [ulMaxTokenSize] -- Maximum size token stored by caller
  115. // [pfLicense] -- Set to true if use restricted
  116. //
  117. // Returns: Status code
  118. //
  119. // History: 11-Apr-1994 KyleP Created
  120. //
  121. //--------------------------------------------------------------------------
  122. SCODE STDMETHODCALLTYPE CDefWordBreaker::Init( BOOL fQuery,
  123. ULONG ulMaxTokenSize,
  124. BOOL *pfLicense )
  125. {
  126. if ( 0 == pfLicense )
  127. return E_INVALIDARG;
  128. *pfLicense = FALSE;
  129. return S_OK;
  130. }
  131. //+-------------------------------------------------------------------------
  132. //
  133. // Method: CDefWordBreaker::IsWordChar
  134. //
  135. // Synopsis: Find whether the i'th character in the buffer _awString
  136. // is a word character (rather than word break)
  137. //
  138. // Arguments: [i] -- index into _awString
  139. //
  140. // History: 22-Jul-1994 BartoszM Created
  141. //
  142. //--------------------------------------------------------------------------
  143. inline BOOL CDefWordBreaker::IsWordChar (int i) const
  144. {
  145. if ( (_aCharInfo1[i] & (C1_ALPHA | C1_DIGIT))
  146. || (_aCharInfo3[i] & C3_NONSPACING) )
  147. {
  148. return TRUE;
  149. }
  150. WCHAR c = _pwcChunk[i];
  151. if (c == L'_')
  152. return TRUE;
  153. if (c == 0xa0) // non breaking space
  154. {
  155. // followed by a non-spacing character
  156. // (looking ahead is okay)
  157. if (_aCharInfo3[i+1] & C3_NONSPACING)
  158. return TRUE;
  159. }
  160. return FALSE;
  161. }
  162. //+---------------------------------------------------------------------------
  163. //
  164. // Member: CDefWordBreaker::ScanChunk
  165. //
  166. // Synopsis: For each character find its type
  167. //
  168. //
  169. // History: 16-Aug-94 BartoszM Created
  170. //
  171. //----------------------------------------------------------------------------
  172. BOOL CDefWordBreaker::ScanChunk ()
  173. {
  174. //
  175. // GetStringTypeW is returning error 87 (ERROR_INVALID_PARAMETER) if
  176. // we pass in a null string.
  177. //
  178. Win4Assert( (0 != _cMapped) && (0 != _pwcChunk) );
  179. if ( !GetStringTypeW( CT_CTYPE1, // POSIX character typing
  180. _pwcChunk, // Source
  181. _cMapped, // Size of source
  182. _aCharInfo1 ) ) // Character info
  183. {
  184. ciDebugOut(( DEB_ERROR, "GetStringTypeW returned %d\n",
  185. GetLastError() ));
  186. return FALSE;
  187. }
  188. if ( !GetStringTypeW( CT_CTYPE3, // Additional POSIX
  189. _pwcChunk,
  190. _cMapped, // Size of source
  191. _aCharInfo3 ) ) // Character info 3
  192. {
  193. ciDebugOut(( DEB_ERROR, "GetStringTypeW CTYPE3 returned %d\n",
  194. GetLastError() ));
  195. return FALSE;
  196. }
  197. return TRUE;
  198. }
  199. //+---------------------------------------------------------------------------
  200. //
  201. // Member: CDefWordBreaker::BreakText
  202. //
  203. // Synopsis: Break input stream into words.
  204. //
  205. // Arguments: [pTextSource] - source of input buffers
  206. // [pWordSink] - sink for words
  207. // [pPhraseSink] - sink for noun phrases
  208. //
  209. // History: 07-June-91 t-WadeR Created
  210. // 12-Oct-92 AmyA Added Unicode support
  211. // 18-Nov-92 AmyA Overloaded
  212. // 11-Apr-94 KyleP Sync with spec
  213. // 26-Aug-94 BartoszM Fixed Unicode parsing
  214. //
  215. //----------------------------------------------------------------------------
  216. SCODE STDMETHODCALLTYPE CDefWordBreaker::BreakText( TEXT_SOURCE *pTextSource,
  217. IWordSink *pWordSink,
  218. IPhraseSink *pPhraseSink )
  219. {
  220. if ( 0 == pTextSource )
  221. return E_INVALIDARG;
  222. if ( 0 == pWordSink || pTextSource->iCur == pTextSource->iEnd)
  223. return S_OK;
  224. if (pTextSource->iCur > pTextSource->iEnd)
  225. {
  226. Win4Assert ( !"BreakText called with bad TEXT_SOURCE" );
  227. return E_FAIL;
  228. }
  229. SCODE sc = S_OK;
  230. ULONG cwc, cwcProcd; // cwcProcd is # chars actually processed by Tokenize()
  231. TRY
  232. {
  233. do
  234. {
  235. //
  236. // Flag for first time thru loop below. This is to fix the case
  237. // where the length of the buffer passed in is less than
  238. // MAX_II_BUFFER_LEN. In this case iEnd-iCur is <= MAX_II_BUFFER_LEN
  239. // and we break out the inner loop and call
  240. // pfnFillTextBuffer without having processed any characters,
  241. // and so pfnFillTextBuffer returns TRUE without adding any new
  242. // characters and this results in an infinite loop.
  243. //
  244. BOOL fFirstTime = TRUE;
  245. while ( pTextSource->iCur < pTextSource->iEnd )
  246. {
  247. cwc = pTextSource->iEnd - pTextSource->iCur;
  248. //
  249. // Process in buckets of MAX_II_BUFER_LEN only
  250. //
  251. if ( cwc >= CDefWordBreaker::ccCompare )
  252. cwc = CDefWordBreaker::ccCompare;
  253. else if ( !fFirstTime )
  254. break;
  255. Tokenize( pTextSource, cwc, pWordSink, cwcProcd );
  256. Win4Assert( cwcProcd <= cwc );
  257. pTextSource->iCur += cwcProcd;
  258. fFirstTime = FALSE;
  259. }
  260. } while ( SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)) );
  261. cwc = pTextSource->iEnd - pTextSource->iCur;
  262. // we know that the remaining text should be less than ccCompare
  263. Win4Assert( cwc < CDefWordBreaker::ccCompare );
  264. if ( 0 != cwc )
  265. {
  266. Tokenize( pTextSource, cwc, pWordSink, cwcProcd );
  267. }
  268. }
  269. CATCH (CException, e)
  270. {
  271. ciDebugOut(( DEB_ITRACE,
  272. "Exception 0x%x caught when breaking text in default wordbreaker\n",
  273. e.GetErrorCode() ));
  274. sc = GetOleError( e );
  275. }
  276. END_CATCH
  277. return sc;
  278. }
  279. //+---------------------------------------------------------------------------
  280. //
  281. // Member: CDefWordBreaker::Tokenize
  282. //
  283. // Synopsis: Tokenize the input buffer into words
  284. //
  285. // Arguments: [pTextSource] -- input text source
  286. // [cwc] -- # chars to process
  287. // [pWordSink] -- sink for words
  288. // [cwcProd] -- # chars actually processed returned here
  289. //
  290. // History: 10-Aug-95 SitaramR Created
  291. //
  292. //----------------------------------------------------------------------------
  293. void CDefWordBreaker::Tokenize( TEXT_SOURCE *pTextSource,
  294. ULONG cwc,
  295. IWordSink *pWordSink,
  296. ULONG& cwcProcd )
  297. {
  298. _pwcChunk = &pTextSource->awcBuffer[pTextSource->iCur];
  299. _cMapped = cwc;
  300. if ( !ScanChunk() )
  301. THROW( CException( E_FAIL ) );
  302. BOOL fWordHasZWS = FALSE; // Does the current word have a zero-width-space ?
  303. unsigned uLenZWS; // Length of a word minus embedded zero-width-spaces
  304. //
  305. // iBeginWord is the offset into _aCharInfo of the beginning character of
  306. // a word. iCur is the first *unprocessed* character.
  307. // They are indexes into the mapped chunk.
  308. //
  309. unsigned iBeginWord = 0;
  310. unsigned iCur = 0;
  311. SCODE sc = S_OK;
  312. //
  313. // Pump words from mapped chunk to word sink
  314. //
  315. while ( iCur < _cMapped )
  316. {
  317. //
  318. // Skip whitespace, punctuation, etc.
  319. //
  320. for (; iCur < _cMapped; iCur++)
  321. if ( IsWordChar (iCur) )
  322. break;
  323. // iCur points to a word char or is equal to _cMapped
  324. iBeginWord = iCur;
  325. if (iCur < _cMapped)
  326. iCur++; // we knew it pointed at word character
  327. //
  328. // Find word break. Filter may output Unicode zero-width-space, which
  329. // should be ignored by the wordbreaker.
  330. //
  331. fWordHasZWS = FALSE;
  332. for (; iCur < _cMapped; iCur++)
  333. {
  334. if ( !IsWordChar (iCur) )
  335. {
  336. if ( _pwcChunk[iCur] == ZERO_WIDTH_SPACE )
  337. fWordHasZWS = TRUE;
  338. else
  339. break;
  340. }
  341. }
  342. if ( fWordHasZWS )
  343. {
  344. //
  345. // Copy word into _awcBufZWS after stripping zero-width-spaces
  346. //
  347. uLenZWS = 0;
  348. for ( unsigned i=iBeginWord; i<iCur; i++ )
  349. {
  350. if ( _pwcChunk[i] != ZERO_WIDTH_SPACE )
  351. _awcBufZWS[uLenZWS++] = _pwcChunk[i];
  352. }
  353. }
  354. // iCur points to a non-word char or is equal to _cMapped
  355. if ( iCur < _cMapped )
  356. {
  357. // store the word and its source position
  358. if ( fWordHasZWS )
  359. sc = pWordSink->PutWord( uLenZWS,
  360. _awcBufZWS, // stripped word
  361. iCur - iBeginWord,
  362. pTextSource->iCur + iBeginWord );
  363. else
  364. sc = pWordSink->PutWord( iCur - iBeginWord,
  365. _pwcChunk + iBeginWord, // the word
  366. iCur - iBeginWord,
  367. pTextSource->iCur + iBeginWord );
  368. if ( FAILED( sc ) )
  369. THROW( CException( sc ) );
  370. iCur++; // we knew it pointed at non-word char
  371. iBeginWord = iCur; // in case we exit the loop now
  372. }
  373. } // next word
  374. Win4Assert( iCur == _cMapped );
  375. // End of words in chunk.
  376. // iCur == _cMapped
  377. // iBeginWord points at beginning of word or == _cMapped
  378. if ( 0 == iBeginWord )
  379. {
  380. // A single word fills from beginning of this chunk
  381. // to the end. This is either a very long word or
  382. // a short word in a leftover buffer.
  383. // store the word and its source position
  384. if ( fWordHasZWS )
  385. sc = pWordSink->PutWord( uLenZWS,
  386. _awcBufZWS, // stripped word
  387. iCur,
  388. pTextSource->iCur ); // its source pos.
  389. else
  390. sc = pWordSink->PutWord( iCur,
  391. _pwcChunk, // the word
  392. iCur,
  393. pTextSource->iCur ); // its source pos.
  394. if ( FAILED( sc ) )
  395. THROW( CException( sc ) );
  396. //
  397. // Position it to not add the word twice.
  398. //
  399. iBeginWord = iCur;
  400. }
  401. //
  402. // If this is the last chunk from text source, then process the
  403. // last fragment
  404. //
  405. if ( cwc < CDefWordBreaker::ccCompare && iBeginWord != iCur )
  406. {
  407. // store the word and its source position
  408. if ( fWordHasZWS )
  409. sc = pWordSink->PutWord( uLenZWS,
  410. _awcBufZWS, // stripped word
  411. iCur - iBeginWord,
  412. pTextSource->iCur + iBeginWord );
  413. else
  414. sc = pWordSink->PutWord( iCur - iBeginWord,
  415. _pwcChunk + iBeginWord, // the word
  416. iCur - iBeginWord,
  417. pTextSource->iCur + iBeginWord );
  418. if ( FAILED( sc ) )
  419. THROW( CException( sc ) );
  420. iBeginWord = iCur;
  421. }
  422. cwcProcd = iBeginWord;
  423. }
  424. //+---------------------------------------------------------------------------
  425. //
  426. // Member: CDefWordBreaker::ComposePhrase
  427. //
  428. // Synopsis: Convert a noun and a modifier into a phrase
  429. //
  430. // Arguments: [pwcNoun] -- pointer to noun.
  431. // [cwcNoun] -- count of chars in pwcNoun
  432. // [pwcModifier] -- pointer to word modifying pwcNoun
  433. // [cwcModifier] -- count of chars in pwcModifier
  434. // [ulAttachmentType] -- relationship between pwcNoun &pwcModifier
  435. //
  436. // History: 10-Aug-95 SitaramR Created Header
  437. //
  438. //----------------------------------------------------------------------------
  439. SCODE STDMETHODCALLTYPE CDefWordBreaker::ComposePhrase( WCHAR const *pwcNoun,
  440. ULONG cwcNoun,
  441. WCHAR const *pwcModifier,
  442. ULONG cwcModifier,
  443. ULONG ulAttachmentType,
  444. WCHAR *pwcPhrase,
  445. ULONG *pcwcPhrase )
  446. {
  447. //
  448. // Never emitted phrase in the first place.
  449. //
  450. ciDebugOut(( DEB_WARN,
  451. "IWordBreaker::ComposePhrase called on default word breaker\n" ));
  452. return( E_FAIL );
  453. }
  454. //+---------------------------------------------------------------------------
  455. //
  456. // Member: CWordBreaker::GetLicenseToUse
  457. //
  458. // Synopsis: Returns a pointer to vendors license information
  459. //
  460. // Arguments: [ppwcsLicense] -- ptr to ptr to which license info is returned
  461. //
  462. // History: 10-Aug-95 SitaramR Created Header
  463. //
  464. //----------------------------------------------------------------------------
  465. SCODE STDMETHODCALLTYPE CDefWordBreaker::GetLicenseToUse( const WCHAR **ppwcsLicense )
  466. {
  467. if ( 0 == ppwcsLicense )
  468. return E_INVALIDARG;
  469. static WCHAR const * wcsCopyright = L"Copyright (c) Microsoft Corporation, 1991-1998";
  470. *ppwcsLicense = wcsCopyright;
  471. return( S_OK );
  472. }
  473. extern long gulcInstances;
  474. //+-------------------------------------------------------------------------
  475. //
  476. // Method: CDefWordBreakerCF::CDefWordBreakerCF
  477. //
  478. // Synopsis: Default Word Breaker class factory constructor
  479. //
  480. // History: 07-Feb-1995 SitaramR Created
  481. //
  482. //--------------------------------------------------------------------------
  483. CDefWordBreakerCF::CDefWordBreakerCF( )
  484. : _cRefs( 1 )
  485. {
  486. InterlockedIncrement( &gulcInstances );
  487. }
  488. //+-------------------------------------------------------------------------
  489. //
  490. // Method: CDefWordBreakerCF::~CDefWordBreakerCF
  491. //
  492. // Synopsis: Default Word Breaker class factory destructor
  493. //
  494. // History: 07-Feb-1995 SitaramR Created
  495. //
  496. //--------------------------------------------------------------------------
  497. CDefWordBreakerCF::~CDefWordBreakerCF()
  498. {
  499. InterlockedDecrement( &gulcInstances );
  500. }
  501. //+-------------------------------------------------------------------------
  502. //
  503. // Method: CDefWordBreakerCF::QueryInterface
  504. //
  505. // Synopsis: Rebind to other interface
  506. //
  507. // Arguments: [riid] -- IID of new interface
  508. // [ppvObject] -- New interface * returned here
  509. //
  510. // Returns: S_OK if bind succeeded, E_NOINTERFACE if bind failed
  511. //
  512. // History: 07-Feb-1995 SitaramR Created
  513. //
  514. //--------------------------------------------------------------------------
  515. SCODE STDMETHODCALLTYPE CDefWordBreakerCF::QueryInterface(
  516. REFIID riid,
  517. void ** ppvObject )
  518. {
  519. if ( IID_IClassFactory == riid )
  520. *ppvObject = (IUnknown *)(IClassFactory *)this;
  521. else if ( IID_IUnknown == riid )
  522. *ppvObject = (IUnknown *)this;
  523. else
  524. {
  525. *ppvObject = 0;
  526. return E_NOINTERFACE;
  527. }
  528. AddRef();
  529. return S_OK;
  530. }
  531. //+-------------------------------------------------------------------------
  532. //
  533. // Method: CDefWordBreakerCF::AddRef
  534. //
  535. // Synopsis: Increments refcount
  536. //
  537. // History: 07-Feb-1995 SitaramR Created
  538. //
  539. //--------------------------------------------------------------------------
  540. ULONG STDMETHODCALLTYPE CDefWordBreakerCF::AddRef()
  541. {
  542. return InterlockedIncrement( &_cRefs );
  543. }
  544. //+-------------------------------------------------------------------------
  545. //
  546. // Method: CDefWordBreakerCF::Release
  547. //
  548. // Synopsis: Decrement refcount. Delete if necessary.
  549. //
  550. // History: 07-Feb-1995 SitaramR Created
  551. //
  552. //--------------------------------------------------------------------------
  553. ULONG STDMETHODCALLTYPE CDefWordBreakerCF::Release()
  554. {
  555. unsigned long uTmp = InterlockedDecrement( &_cRefs );
  556. if ( 0 == uTmp )
  557. delete this;
  558. return uTmp;
  559. }
  560. //+-------------------------------------------------------------------------
  561. //
  562. // Method: CDefWordBreakerCF::CreateInstance
  563. //
  564. // Synopsis: Creates new CDefWordBreaker object
  565. //
  566. // Arguments: [pUnkOuter] -- 'Outer' IUnknown
  567. // [riid] -- Interface to bind
  568. // [ppvObject] -- Interface returned here
  569. //
  570. // History: 07-Feb-1995 SitaramR Created
  571. //
  572. //--------------------------------------------------------------------------
  573. SCODE STDMETHODCALLTYPE CDefWordBreakerCF::CreateInstance( IUnknown * pUnkOuter,
  574. REFIID riid,
  575. void * * ppvObject )
  576. {
  577. CDefWordBreaker *pIUnk = 0;
  578. SCODE sc = S_OK;
  579. TRY
  580. {
  581. pIUnk = new CDefWordBreaker();
  582. sc = pIUnk->QueryInterface( riid , ppvObject );
  583. pIUnk->Release(); // Release extra refcount from QueryInterface
  584. }
  585. CATCH(CException, e)
  586. {
  587. Win4Assert( 0 == pIUnk );
  588. switch( e.GetErrorCode() )
  589. {
  590. case E_OUTOFMEMORY:
  591. sc = (E_OUTOFMEMORY);
  592. break;
  593. default:
  594. sc = (E_UNEXPECTED);
  595. }
  596. }
  597. END_CATCH;
  598. return (sc);
  599. }
  600. //+-------------------------------------------------------------------------
  601. //
  602. // Method: CDefWordBreakerCF::LockServer
  603. //
  604. // Synopsis: Force class factory to remain loaded
  605. //
  606. // Arguments: [fLock] -- TRUE if locking, FALSE if unlocking
  607. //
  608. // Returns: S_OK
  609. //
  610. // History: 07-Feb-1995 SitaramR Created
  611. //
  612. //--------------------------------------------------------------------------
  613. SCODE STDMETHODCALLTYPE CDefWordBreakerCF::LockServer(BOOL fLock)
  614. {
  615. if(fLock)
  616. InterlockedIncrement( &gulcInstances );
  617. else
  618. InterlockedDecrement( &gulcInstances );
  619. return(S_OK);
  620. }