Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

445 lines
13 KiB

  1. //+---------------------------------------------------------------------------
  2. //
  3. // Microsoft Windows
  4. // Copyright (C) Microsoft Corporation, 1994 - 1997
  5. //
  6. // File: IWBreak.cxx
  7. //
  8. // Contents: FarEast Word Breaker glue code
  9. //
  10. // History: 01-Jul-96 PatHal Created.
  11. //
  12. //----------------------------------------------------------------------------
  13. #include "pch.cxx"
  14. #pragma hdrstop
  15. #include "iwbreak.hxx"
  16. extern long gulcInstances;
  17. //+---------------------------------------------------------------------------
  18. //
  19. // Member: CWordBreaker::CWordBreaker
  20. //
  21. // Synopsis: Constructor for the CWordBreaker class.
  22. //
  23. // Arguments: [lcid] -- locale id
  24. //
  25. //----------------------------------------------------------------------------
  26. CWordBreaker::CWordBreaker( LCID lcid )
  27. : _cRefs(1),
  28. _lcid(lcid)
  29. {
  30. InterlockedIncrement( &gulcInstances );
  31. #if defined(TH_LOG)
  32. _hLog = ThLogOpen( "log.utf");
  33. #endif
  34. }
  35. //+---------------------------------------------------------------------------
  36. //
  37. // Member: CWordBreaker::~CWordBreaker
  38. //
  39. // Synopsis: Destructor for the CWordBreaker class.
  40. //
  41. // Notes: All termination/deallocation is done by embedded smart pointers
  42. //
  43. //----------------------------------------------------------------------------
  44. CWordBreaker::~CWordBreaker()
  45. {
  46. InterlockedDecrement( &gulcInstances );
  47. #if defined(TH_LOG)
  48. ThLogClose( _hLog );
  49. #endif
  50. }
  51. //+-------------------------------------------------------------------------
  52. //
  53. // Method: CWordBreaker::QueryInterface
  54. //
  55. // Synopsis: Rebind to other interface
  56. //
  57. // Arguments: [riid] -- IID of new interface
  58. // [ppvObject] -- New interface * returned here
  59. //
  60. // Returns: S_OK if bind succeeded, E_NOINTERFACE if bind failed
  61. //
  62. //--------------------------------------------------------------------------
  63. SCODE STDMETHODCALLTYPE
  64. CWordBreaker::QueryInterface( REFIID riid, void ** ppvObject)
  65. {
  66. //
  67. // Optimize QueryInterface by only checking minimal number of bytes.
  68. //
  69. // IID_IUnknown = 00000000-0000-0000-C000-000000000046
  70. // IID_IWordBreaker = D53552C8-77E3-101A-B552-08002B33B0E6
  71. // --------
  72. // |
  73. // +--- Unique!
  74. //
  75. Assert( (IID_IUnknown.Data1 & 0x000000FF) == 0x00 );
  76. Assert( (IID_IWordBreaker.Data1 & 0x000000FF) == 0xC8 );
  77. IUnknown *pUnkTemp;
  78. SCODE sc = S_OK;
  79. switch( riid.Data1 )
  80. {
  81. case 0x00000000:
  82. if ( memcmp( &IID_IUnknown, &riid, sizeof(riid) ) == 0 )
  83. pUnkTemp = (IUnknown *)this;
  84. else
  85. sc = E_NOINTERFACE;
  86. break;
  87. case 0xD53552C8:
  88. if ( memcmp( &IID_IWordBreaker, &riid, sizeof(riid) ) == 0 )
  89. pUnkTemp = (IUnknown *)(IWordBreaker *)this;
  90. else
  91. sc = E_NOINTERFACE;
  92. break;
  93. default:
  94. pUnkTemp = 0;
  95. sc = E_NOINTERFACE;
  96. break;
  97. }
  98. if( 0 != pUnkTemp )
  99. {
  100. *ppvObject = (void * )pUnkTemp;
  101. pUnkTemp->AddRef();
  102. }
  103. else
  104. *ppvObject = 0;
  105. return(sc);
  106. }
  107. //+-------------------------------------------------------------------------
  108. //
  109. // Method: CWordBreaker::AddRef
  110. //
  111. // Synopsis: Increments refcount
  112. //
  113. //--------------------------------------------------------------------------
  114. ULONG STDMETHODCALLTYPE
  115. CWordBreaker::AddRef()
  116. {
  117. return InterlockedIncrement( &_cRefs );
  118. }
  119. //+-------------------------------------------------------------------------
  120. //
  121. // Method: CWordBreaker::Release
  122. //
  123. // Synopsis: Decrement refcount. Delete if necessary.
  124. //
  125. //--------------------------------------------------------------------------
  126. ULONG STDMETHODCALLTYPE
  127. CWordBreaker::Release()
  128. {
  129. unsigned long uTmp = InterlockedDecrement( &_cRefs );
  130. if ( 0 == uTmp )
  131. delete this;
  132. return(uTmp);
  133. }
  134. //+-------------------------------------------------------------------------
  135. //
  136. // Method: CWordBreaker::Init
  137. //
  138. // Synopsis: Initialize word-breaker
  139. //
  140. // Arguments: [fQuery] -- TRUE if query-time
  141. // [ulMaxTokenSize] -- Maximum size token stored by caller
  142. // [pfLicense] -- Set to true if use restricted
  143. //
  144. // Returns: Status code
  145. //
  146. //--------------------------------------------------------------------------
  147. SCODE STDMETHODCALLTYPE
  148. CWordBreaker::Init(
  149. BOOL fQuery,
  150. ULONG ulMaxTokenSize,
  151. BOOL *pfLicense )
  152. {
  153. if (IsBadWritePtr(pfLicense, sizeof(DWORD))) {
  154. return E_FAIL;
  155. }
  156. *pfLicense = TRUE;
  157. _fQuery = fQuery;
  158. _ulMaxTokenSize = ulMaxTokenSize;
  159. return S_OK;
  160. }
  161. //+---------------------------------------------------------------------------
  162. //
  163. // Member: CWordBreaker::ComposePhrase
  164. //
  165. // Synopsis: Convert a noun and a modifier into a phrase.
  166. //
  167. // Arguments: [pwcNoun] -- pointer to noun.
  168. // [cwcNoun] -- count of chars in pwcNoun
  169. // [pwcModifier] -- pointer to word modifying pwcNoun
  170. // [cwcModifier] -- count of chars in pwcModifier
  171. // [ulAttachmentType] -- relationship between pwcNoun &pwcModifier
  172. //
  173. //----------------------------------------------------------------------------
  174. SCODE STDMETHODCALLTYPE
  175. CWordBreaker::ComposePhrase(
  176. WCHAR const *pwcNoun,
  177. ULONG cwcNoun,
  178. WCHAR const *pwcModifier,
  179. ULONG cwcModifier,
  180. ULONG ulAttachmentType,
  181. WCHAR *pwcPhrase,
  182. ULONG *pcwcPhrase )
  183. {
  184. //
  185. // Need to code in later
  186. //
  187. if ( _fQuery )
  188. return( E_NOTIMPL );
  189. else
  190. return ( WBREAK_E_QUERY_ONLY );
  191. }
  192. //+---------------------------------------------------------------------------
  193. //
  194. // Member: CWordBreaker::GetLicenseToUse
  195. //
  196. // Synopsis: Returns a pointer to vendors license information
  197. //
  198. // Arguments: [ppwcsLicense] -- ptr to ptr to which license info is returned
  199. //
  200. //----------------------------------------------------------------------------
  201. SCODE STDMETHODCALLTYPE
  202. CWordBreaker::GetLicenseToUse(
  203. const WCHAR **ppwcsLicense )
  204. {
  205. static WCHAR const * wcsCopyright = L"Copyright Microsoft, 1991-1995";
  206. if (IsBadWritePtr(ppwcsLicense, sizeof(DWORD))) {
  207. return E_FAIL;
  208. }
  209. *ppwcsLicense = wcsCopyright;
  210. return( S_OK );
  211. }
  212. //+---------------------------------------------------------------------------
  213. //
  214. // Member: CWordBreaker::BreakText
  215. //
  216. // Synopsis: Break input stream into words.
  217. //
  218. // Arguments: [pTextSource] -- source of Unicode text
  219. // [pWordSink] -- sink for collecting words
  220. // [pPhraseSink] -- sink for collecting phrases
  221. //
  222. // Notes: Since the input buffer may be greater than MAX_BUFFER_LEN
  223. // we process the buffer in chunks of length MAX_BUFFER_LEN.
  224. //
  225. //----------------------------------------------------------------------------
  226. SCODE STDMETHODCALLTYPE
  227. CWordBreaker::BreakText(
  228. TEXT_SOURCE *pTextSource,
  229. IWordSink *pWordSink,
  230. IPhraseSink *pPhraseSink )
  231. {
  232. SCODE sc = S_OK;
  233. if ( 0 == pWordSink ) {
  234. // BUGBUG, propagate the null word sink error code
  235. return sc;
  236. }
  237. // BUGBUG, need to normalize nums within T-Hammer, pass as flag?
  238. // turn on noun phrase analysis if there is a phrase sink
  239. if ( 0 != pPhraseSink ) {
  240. // BUGBUG, do we need to pass a separate flag to T-Hammer for this?
  241. // ignore the phrase sink for now
  242. // return sc;
  243. }
  244. if ( ( 0 == pTextSource ) ||
  245. ( pTextSource->iEnd < pTextSource->iCur ) ) {
  246. return E_FAIL;
  247. }
  248. if (pTextSource->iEnd == pTextSource->iCur) {
  249. return S_OK;
  250. }
  251. CONST WCHAR *pwcInput, *pwcStem;
  252. ULONG cwc, cwcTail, iwcCurrent;
  253. DWORD i;
  254. BYTE ct;
  255. BOOL fRomanWord = FALSE;
  256. __try {
  257. cwcTail = pTextSource->iEnd - pTextSource->iCur;
  258. #if defined(TH_LOG)
  259. // DEBUG: Print out one QUERY or INDEX banner per entry to BreakText
  260. //
  261. WCHAR wszBanner[256];
  262. if (_fQuery) {
  263. wsprintfW( wszBanner, L"\r\n== QUERY ===================================================\r\n");
  264. }
  265. else {
  266. wsprintfW( wszBanner, L"\r\n== INDEX ===================================================\r\n");
  267. }
  268. ThLogWrite( _hLog, wszBanner );
  269. #endif // defined(TH_LOG)
  270. do {
  271. cwc = pTextSource->iEnd - pTextSource->iCur;
  272. // Reinit the callback data structure
  273. iwcCurrent = pTextSource->iCur;
  274. pwcStem = pwcInput = pTextSource->awcBuffer + pTextSource->iCur;
  275. #if defined(TH_LOG)
  276. // DEBUG: Print out one of these banners for every buffer of text
  277. //
  278. wsprintfW( wszBanner, L"\r\n*********************************************************\r\n");
  279. ThLogWrite( _hLog, wszBanner );
  280. wsprintfW( wszBanner, L"iCur = %d, iEnd = %d\r\n\r\n", pTextSource->iCur, pTextSource->iEnd);
  281. ThLogWrite( _hLog, wszBanner );
  282. WCHAR *pwc = (WCHAR*)LocalAlloc( LMEM_FIXED, (cwc+1) * sizeof(WCHAR));
  283. if (pwc != NULL) {
  284. memcpy( pwc, &pTextSource->awcBuffer[ pTextSource->iCur ], cwc * sizeof(WCHAR) );
  285. *(pwc+cwc)=L'\0';
  286. ThLogWrite( _hLog, pwc );
  287. LocalFree(pwc);
  288. }
  289. #endif defined(TH_LOG)
  290. for (i=0; i< cwc-1; i++, pwcInput++) {
  291. if (*(pwcInput+1) != 0) {
  292. ct = GetCharType(*pwcInput);
  293. if (ct == CH) {
  294. if (!fRomanWord) {
  295. pwcStem = pwcInput;
  296. fRomanWord = TRUE;
  297. }
  298. }
  299. else {
  300. if (fRomanWord) {
  301. DWORD cwcTemp = pwcInput - pwcStem;
  302. if (cwcTemp > 0) {
  303. (pWordSink->PutWord)(cwcTemp, pwcStem, cwcTemp,
  304. iwcCurrent + (i - cwcTemp));
  305. }
  306. fRomanWord = FALSE;
  307. }
  308. else {
  309. switch (ct) {
  310. case PS:
  311. (pWordSink->PutBreak)( WORDREP_BREAK_EOS );
  312. case WS:
  313. break;
  314. default:
  315. (pWordSink->PutWord)(2, pwcInput, 2, iwcCurrent + i);
  316. break;
  317. }
  318. }
  319. }
  320. }
  321. }
  322. pTextSource->iCur += i;
  323. cwcTail = 1;
  324. } while ( SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)) );
  325. // Don't ignore the tail HPB
  326. if (cwcTail > 1) {
  327. iwcCurrent = pTextSource->iCur;
  328. pwcInput = pTextSource->awcBuffer + pTextSource->iCur;
  329. #if defined(TH_LOG)
  330. wsprintfW( wszBanner, L"\r\n** TAIL *************************************************\r\n");
  331. ThLogWrite( _hLog, wszBanner );
  332. wsprintfW( wszBanner, L"iCur = %d, iEnd = %d, cwcTail = %d\r\n\r\n", pTextSource->iCur, pTextSource->iEnd, cwcTail);
  333. ThLogWrite( _hLog, wszBanner );
  334. WCHAR *pwc = (WCHAR*)LocalAlloc( LMEM_FIXED, (cwcTail+1) * sizeof(WCHAR));
  335. if (pwc != NULL) {
  336. memcpy( pwc, &pTextSource->awcBuffer[ pTextSource->iCur ], cwcTail * sizeof(WCHAR) );
  337. *(pwc+cwcTail)=L'\0';
  338. ThLogWrite( _hLog, pwc );
  339. LocalFree(pwc);
  340. }
  341. #endif defined(TH_LOG)
  342. for (i=0; i< cwcTail-1; i++, pwcInput++) {
  343. if (*(pwcInput+1) != 0) {
  344. ct = GetCharType(*pwcInput);
  345. if (ct == CH) {
  346. if (!fRomanWord) {
  347. pwcStem = pwcInput;
  348. fRomanWord = TRUE;
  349. }
  350. }
  351. else {
  352. if (fRomanWord) {
  353. DWORD cwcTemp = pwcInput - pwcStem;
  354. (pWordSink->PutWord)(cwcTemp, pwcStem, cwcTemp,
  355. iwcCurrent + (i - cwcTemp));
  356. fRomanWord = FALSE;
  357. }
  358. else {
  359. switch (ct) {
  360. case PS:
  361. (pWordSink->PutBreak)( WORDREP_BREAK_EOS );
  362. case WS:
  363. break;
  364. default:
  365. (pWordSink->PutWord)(2, pwcInput, 2, iwcCurrent + i);
  366. break;
  367. }
  368. }
  369. }
  370. }
  371. }
  372. }
  373. // put the last English word
  374. if (fRomanWord) {
  375. DWORD cwcTemp = pwcInput - pwcStem;
  376. if (cwcTemp > 0) {
  377. (pWordSink->PutWord)(cwcTemp, pwcStem, cwcTemp,
  378. iwcCurrent + (i - cwcTemp));
  379. }
  380. fRomanWord = FALSE;
  381. }
  382. } __except(1) {
  383. sc = E_UNEXPECTED;
  384. }
  385. return sc;
  386. }