Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

485 lines
13 KiB

  1. //+---------------------------------------------------------------------------
  2. //
  3. // Microsoft Windows
  4. // Copyright (C) Microsoft Corporation, 1994 - 1997
  5. //
  6. // File: IWBreak.cxx
  7. //
  8. // Contents: FarEast Word Breaker glue code
  9. //
  10. // History: 01-Jul-96 PatHal Created.
  11. // weibz Merged and modified to NT5
  12. //
  13. //----------------------------------------------------------------------------
  14. #include "pch.cxx"
  15. #pragma hdrstop
  16. #include "iwbreak.hxx"
  17. extern long gulcInstances;
  18. #ifdef IWBDBG
  19. void WbDbgOutputInt(WCHAR *pTitle, INT data)
  20. {
  21. WCHAR Outdbg[20];
  22. int itmp, ii;
  23. OutputDebugStringW(pTitle);
  24. for (ii=0; ii<20; ii++)
  25. Outdbg[ii] = 0x0020;
  26. ii =7;
  27. itmp = data;
  28. Outdbg[ii--] = 0x0000;
  29. while (itmp) {
  30. if ( (itmp % 16) < 10 )
  31. Outdbg[ii] = itmp % 16 + L'0';
  32. else
  33. Outdbg[ii] = itmp % 16 + L'A' - 10;
  34. ii --;
  35. itmp = itmp / 16;
  36. }
  37. OutputDebugStringW(Outdbg);
  38. }
  39. #endif
  40. //+---------------------------------------------------------------------------
  41. //
  42. // Member: CWordBreaker::CWordBreaker
  43. //
  44. // Synopsis: Constructor for the CWordBreaker class.
  45. //
  46. // Arguments: [lcid] -- locale id
  47. //
  48. //----------------------------------------------------------------------------
  49. CWordBreaker::CWordBreaker( LCID lcid )
  50. : _cRefs(1),
  51. _lcid(lcid)
  52. {
  53. InterlockedIncrement( &gulcInstances );
  54. #if defined(TH_LOG)
  55. _hLog = ThLogOpen( "log.utf");
  56. #endif
  57. }
  58. //+---------------------------------------------------------------------------
  59. //
  60. // Member: CWordBreaker::~CWordBreaker
  61. //
  62. // Synopsis: Destructor for the CWordBreaker class.
  63. //
  64. // Notes: All termination/deallocation is done by embedded smart pointers
  65. //
  66. //----------------------------------------------------------------------------
  67. CWordBreaker::~CWordBreaker()
  68. {
  69. InterlockedDecrement( &gulcInstances );
  70. #if defined(TH_LOG)
  71. ThLogClose( _hLog );
  72. #endif
  73. }
  74. //+-------------------------------------------------------------------------
  75. //
  76. // Method: CWordBreaker::QueryInterface
  77. //
  78. // Synopsis: Rebind to other interface
  79. //
  80. // Arguments: [riid] -- IID of new interface
  81. // [ppvObject] -- New interface * returned here
  82. //
  83. // Returns: S_OK if bind succeeded, E_NOINTERFACE if bind failed
  84. //
  85. //--------------------------------------------------------------------------
  86. SCODE STDMETHODCALLTYPE
  87. CWordBreaker::QueryInterface( REFIID riid, void ** ppvObject)
  88. {
  89. //
  90. // Optimize QueryInterface by only checking minimal number of bytes.
  91. //
  92. // IID_IUnknown = 00000000-0000-0000-C000-000000000046
  93. // IID_IWordBreaker = D53552C8-77E3-101A-B552-08002B33B0E6
  94. // --------
  95. // |
  96. // +--- Unique!
  97. //
  98. Assert( (IID_IUnknown.Data1 & 0x000000FF) == 0x00 );
  99. Assert( (IID_IWordBreaker.Data1 & 0x000000FF) == 0xC8 );
  100. IUnknown *pUnkTemp;
  101. SCODE sc = S_OK;
  102. switch( riid.Data1 )
  103. {
  104. case 0x00000000:
  105. if ( memcmp( &IID_IUnknown, &riid, sizeof(riid) ) == 0 )
  106. pUnkTemp = (IUnknown *)this;
  107. else
  108. sc = E_NOINTERFACE;
  109. break;
  110. case 0xD53552C8:
  111. if ( memcmp( &IID_IWordBreaker, &riid, sizeof(riid) ) == 0 )
  112. pUnkTemp = (IUnknown *)(IWordBreaker *)this;
  113. else
  114. sc = E_NOINTERFACE;
  115. break;
  116. default:
  117. pUnkTemp = 0;
  118. sc = E_NOINTERFACE;
  119. break;
  120. }
  121. if( 0 != pUnkTemp )
  122. {
  123. *ppvObject = (void * )pUnkTemp;
  124. pUnkTemp->AddRef();
  125. }
  126. else
  127. *ppvObject = 0;
  128. return(sc);
  129. }
  130. //+-------------------------------------------------------------------------
  131. //
  132. // Method: CWordBreaker::AddRef
  133. //
  134. // Synopsis: Increments refcount
  135. //
  136. //--------------------------------------------------------------------------
  137. ULONG STDMETHODCALLTYPE
  138. CWordBreaker::AddRef()
  139. {
  140. return InterlockedIncrement( &_cRefs );
  141. }
  142. //+-------------------------------------------------------------------------
  143. //
  144. // Method: CWordBreaker::Release
  145. //
  146. // Synopsis: Decrement refcount. Delete if necessary.
  147. //
  148. //--------------------------------------------------------------------------
  149. ULONG STDMETHODCALLTYPE
  150. CWordBreaker::Release()
  151. {
  152. unsigned long uTmp = InterlockedDecrement( &_cRefs );
  153. if ( 0 == uTmp )
  154. delete this;
  155. return(uTmp);
  156. }
  157. //+-------------------------------------------------------------------------
  158. //
  159. // Method: CWordBreaker::Init
  160. //
  161. // Synopsis: Initialize word-breaker
  162. //
  163. // Arguments: [fQuery] -- TRUE if query-time
  164. // [ulMaxTokenSize] -- Maximum size token stored by caller
  165. // [pfLicense] -- Set to true if use restricted
  166. //
  167. // Returns: Status code
  168. //
  169. //--------------------------------------------------------------------------
  170. SCODE STDMETHODCALLTYPE
  171. CWordBreaker::Init(
  172. BOOL fQuery,
  173. ULONG ulMaxTokenSize,
  174. BOOL *pfLicense )
  175. {
  176. if ( NULL == pfLicense )
  177. return E_INVALIDARG;
  178. if (IsBadWritePtr(pfLicense, sizeof(DWORD))) {
  179. return E_INVALIDARG;
  180. }
  181. *pfLicense = TRUE;
  182. _fQuery = fQuery;
  183. _ulMaxTokenSize = ulMaxTokenSize;
  184. return S_OK;
  185. }
  186. //+---------------------------------------------------------------------------
  187. //
  188. // Member: CWordBreaker::ComposePhrase
  189. //
  190. // Synopsis: Convert a noun and a modifier into a phrase.
  191. //
  192. // Arguments: [pwcNoun] -- pointer to noun.
  193. // [cwcNoun] -- count of chars in pwcNoun
  194. // [pwcModifier] -- pointer to word modifying pwcNoun
  195. // [cwcModifier] -- count of chars in pwcModifier
  196. // [ulAttachmentType] -- relationship between pwcNoun &pwcModifier
  197. //
  198. //----------------------------------------------------------------------------
  199. SCODE STDMETHODCALLTYPE
  200. CWordBreaker::ComposePhrase(
  201. WCHAR const *pwcNoun,
  202. ULONG cwcNoun,
  203. WCHAR const *pwcModifier,
  204. ULONG cwcModifier,
  205. ULONG ulAttachmentType,
  206. WCHAR *pwcPhrase,
  207. ULONG *pcwcPhrase )
  208. {
  209. //
  210. // Need to code in later
  211. //
  212. if ( _fQuery )
  213. return( E_NOTIMPL );
  214. else
  215. return ( WBREAK_E_QUERY_ONLY );
  216. }
  217. //+---------------------------------------------------------------------------
  218. //
  219. // Member: CWordBreaker::GetLicenseToUse
  220. //
  221. // Synopsis: Returns a pointer to vendors license information
  222. //
  223. // Arguments: [ppwcsLicense] -- ptr to ptr to which license info is returned
  224. //
  225. //----------------------------------------------------------------------------
  226. SCODE STDMETHODCALLTYPE
  227. CWordBreaker::GetLicenseToUse(
  228. const WCHAR **ppwcsLicense )
  229. {
  230. static WCHAR const * wcsCopyright = L"Copyright Microsoft, 1991-1998";
  231. if ( NULL == ppwcsLicense )
  232. return E_INVALIDARG;
  233. if (IsBadWritePtr(ppwcsLicense, sizeof(DWORD))) {
  234. return E_INVALIDARG;
  235. }
  236. *ppwcsLicense = wcsCopyright;
  237. return( S_OK );
  238. }
  239. //+---------------------------------------------------------------------------
  240. //
  241. // Member: CWordBreaker::BreakText
  242. //
  243. // Synopsis: Break input stream into words.
  244. //
  245. // Arguments: [pTextSource] -- source of Unicode text
  246. // [pWordSink] -- sink for collecting words
  247. // [pPhraseSink] -- sink for collecting phrases
  248. //
  249. // Notes: Since the input buffer may be greater than MAX_BUFFER_LEN
  250. // we process the buffer in chunks of length MAX_BUFFER_LEN.
  251. //
  252. //----------------------------------------------------------------------------
  253. SCODE STDMETHODCALLTYPE
  254. CWordBreaker::BreakText(
  255. TEXT_SOURCE *pTextSource,
  256. IWordSink *pWordSink,
  257. IPhraseSink *pPhraseSink )
  258. {
  259. SCODE sc = S_OK;
  260. if ( NULL == pWordSink ) {
  261. // BUGBUG, propagate the null word sink error code
  262. return sc;
  263. }
  264. // BUGBUG, need to normalize nums within T-Hammer, pass as flag?
  265. // turn on noun phrase analysis if there is a phrase sink
  266. if ( 0 != pPhraseSink ) {
  267. // BUGBUG, do we need to pass a separate flag to T-Hammer for this?
  268. // ignore the phrase sink for now
  269. // return sc;
  270. }
  271. if ( ( NULL == pTextSource ) ||
  272. ( pTextSource->iEnd < pTextSource->iCur ) ) {
  273. return E_INVALIDARG;
  274. }
  275. if (pTextSource->iEnd == pTextSource->iCur) {
  276. return S_OK;
  277. }
  278. CONST WCHAR *pwcInput, *pwcStem;
  279. ULONG cwc, cwcTail, iwcCurrent;
  280. DWORD i;
  281. BYTE ct;
  282. BOOL fRomanWord = FALSE;
  283. __try {
  284. cwcTail = pTextSource->iEnd - pTextSource->iCur;
  285. #ifdef IWBDBG
  286. {
  287. WCHAR tmp[2];
  288. DWORD ii;
  289. WbDbgOutputInt(L"\niCur=", pTextSource->iCur);
  290. WbDbgOutputInt(L"\niEnd=", pTextSource->iEnd);
  291. OutputDebugStringW(L"\n the Source String is:\n");
  292. for (ii=pTextSource->iCur; ii<pTextSource->iEnd; ii++) {
  293. tmp[0] = *(pTextSource->awcBuffer + ii);
  294. tmp[1] = L'\0';
  295. OutputDebugStringW(tmp);
  296. }
  297. OutputDebugStringW(L"\n");
  298. }
  299. #endif
  300. do {
  301. cwc = pTextSource->iEnd - pTextSource->iCur;
  302. // Reinit the callback data structure
  303. iwcCurrent = pTextSource->iCur;
  304. pwcStem = pwcInput = pTextSource->awcBuffer + pTextSource->iCur;
  305. for (i=0; i< cwc; i++, pwcInput++) {
  306. if (*(pwcInput) != 0) {
  307. ct = GetCharType(*pwcInput);
  308. if (ct == CH) {
  309. if (!fRomanWord) {
  310. pwcStem = pwcInput;
  311. fRomanWord = TRUE;
  312. }
  313. }
  314. else {
  315. if (fRomanWord) {
  316. DWORD cwcTemp = (DWORD)(pwcInput - pwcStem);
  317. if (cwcTemp > 0) {
  318. (pWordSink->PutWord)(cwcTemp, pwcStem, cwcTemp,
  319. iwcCurrent + (i - cwcTemp));
  320. }
  321. fRomanWord = FALSE;
  322. }
  323. // else {
  324. switch (ct) {
  325. case PS:
  326. (pWordSink->PutBreak)( WORDREP_BREAK_EOS );
  327. case WS:
  328. break;
  329. default:
  330. (pWordSink->PutWord)(1, pwcInput, 1, iwcCurrent + i);
  331. break;
  332. }
  333. // }
  334. }
  335. }
  336. }
  337. if ( !fRomanWord )
  338. pTextSource->iCur += i;
  339. else {
  340. CONST WCHAR *pStart;
  341. pStart = pTextSource->awcBuffer + pTextSource->iCur;
  342. pTextSource->iCur += (DWORD)(pwcStem - pStart);
  343. fRomanWord = FALSE;
  344. }
  345. cwcTail = pTextSource->iEnd - pTextSource->iCur;
  346. } while ( SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)) );
  347. // Don't ignore the tail HPB
  348. if (cwcTail > 0) {
  349. iwcCurrent = pTextSource->iCur;
  350. pwcInput = pTextSource->awcBuffer + pTextSource->iCur;
  351. for (i=0; i< cwcTail; i++, pwcInput++) {
  352. if (*(pwcInput) != 0) {
  353. ct = GetCharType(*pwcInput);
  354. if (ct == CH) {
  355. if (!fRomanWord) {
  356. pwcStem = pwcInput;
  357. fRomanWord = TRUE;
  358. }
  359. }
  360. else {
  361. if (fRomanWord) {
  362. DWORD cwcTemp = (DWORD)(pwcInput - pwcStem);
  363. (pWordSink->PutWord)(cwcTemp, pwcStem, cwcTemp,
  364. iwcCurrent + (i - cwcTemp));
  365. fRomanWord = FALSE;
  366. }
  367. // else {
  368. switch (ct) {
  369. case PS:
  370. (pWordSink->PutBreak)( WORDREP_BREAK_EOS );
  371. case WS:
  372. break;
  373. default:
  374. (pWordSink->PutWord)(1, pwcInput, 1, iwcCurrent + i);
  375. break;
  376. }
  377. // }
  378. }
  379. }
  380. }
  381. }
  382. // put the last English word
  383. if (fRomanWord) {
  384. DWORD cwcTemp = (DWORD)(pwcInput - pwcStem);
  385. assert( cwcTemp > 0);
  386. if ( 0 == *(pwcInput-1) ) {
  387. i--;
  388. cwcTemp--;
  389. }
  390. (pWordSink->PutWord)(cwcTemp, pwcStem, cwcTemp,
  391. iwcCurrent + (i - cwcTemp));
  392. fRomanWord = FALSE;
  393. }
  394. } __except(1) {
  395. sc = E_UNEXPECTED;
  396. }
  397. return sc;
  398. }