Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

677 lines
24 KiB

  1. //+---------------------------------------------------------------------------
  2. //
  3. // Microsoft Windows
  4. // Copyright (C) Microsoft Corporation, 1997
  5. //
  6. // File: IWBreak.cxx
  7. //
  8. // Contents: Thai Word Breaker glue code
  9. //
  10. // History: weibz, 10-Nov-1997 created
  11. //
  12. //----------------------------------------------------------------------------
  13. #include <pch.cxx>
  14. #include <filterr.h>
  15. #include "iwbreak.hxx"
  16. #include "thwbint.h"
  17. #define MAX_BREAKS 255
  18. #define WB_NORMAL 1
  19. extern long gulcInstances;
  20. //+---------------------------------------------------------------------------
  21. //
  22. // Member: CWordBreaker::CWordBreaker
  23. //
  24. // Synopsis: Constructor for the CWordBreaker class.
  25. //
  26. // Arguments: [lcid] -- locale id
  27. //
  28. //----------------------------------------------------------------------------
  29. CWordBreaker::CWordBreaker( LCID lcid )
  30. : _cRefs(1),
  31. _lcid(lcid)
  32. {
  33. InterlockedIncrement( &gulcInstances );
  34. }
  35. //+---------------------------------------------------------------------------
  36. //
  37. // Member: CWordBreaker::~CWordBreaker
  38. //
  39. // Synopsis: Destructor for the CWordBreaker class.
  40. //
  41. // Notes: All termination/deallocation is done by embedded smart pointers
  42. //
  43. //----------------------------------------------------------------------------
  44. CWordBreaker::~CWordBreaker()
  45. {
  46. InterlockedDecrement( &gulcInstances );
  47. }
  48. //+-------------------------------------------------------------------------
  49. //
  50. // Method: CWordBreaker::QueryInterface
  51. //
  52. // Synopsis: Rebind to other interface
  53. //
  54. // Arguments: [riid] -- IID of new interface
  55. // [ppvObject] -- New interface * returned here
  56. //
  57. // Returns: S_OK if bind succeeded, E_NOINTERFACE if bind failed
  58. //
  59. //--------------------------------------------------------------------------
  60. SCODE STDMETHODCALLTYPE
  61. CWordBreaker::QueryInterface( REFIID riid, void ** ppvObject)
  62. {
  63. if ( 0 == ppvObject )
  64. return E_INVALIDARG;
  65. *ppvObject = 0;
  66. if ( IID_IWordBreaker == riid )
  67. *ppvObject = (IUnknown *)(IWordBreaker *)this;
  68. else if ( IID_IUnknown == riid )
  69. *ppvObject = (IUnknown *)this;
  70. else
  71. return E_NOINTERFACE;
  72. AddRef();
  73. return S_OK;
  74. }
  75. //+-------------------------------------------------------------------------
  76. //
  77. // Method: CWordBreaker::AddRef
  78. //
  79. // Synopsis: Increments refcount
  80. //
  81. //--------------------------------------------------------------------------
  82. ULONG STDMETHODCALLTYPE
  83. CWordBreaker::AddRef()
  84. {
  85. return InterlockedIncrement( &_cRefs );
  86. }
  87. //+-------------------------------------------------------------------------
  88. //
  89. // Method: CWordBreaker::Release
  90. //
  91. // Synopsis: Decrement refcount. Delete if necessary.
  92. //
  93. //--------------------------------------------------------------------------
  94. ULONG STDMETHODCALLTYPE
  95. CWordBreaker::Release()
  96. {
  97. unsigned long uTmp = InterlockedDecrement( &_cRefs );
  98. if ( 0 == uTmp )
  99. delete this;
  100. return(uTmp);
  101. }
  102. //+-------------------------------------------------------------------------
  103. //
  104. // Method: CWordBreaker::Init
  105. //
  106. // Synopsis: Initialize word-breaker
  107. //
  108. // Arguments: [fQuery] -- TRUE if query-time
  109. // [ulMaxTokenSize] -- Maximum size token stored by caller
  110. // [pfLicense] -- Set to true if use restricted
  111. //
  112. // Returns: Status code
  113. //
  114. //--------------------------------------------------------------------------
  115. SCODE STDMETHODCALLTYPE
  116. CWordBreaker::Init(
  117. BOOL fQuery,
  118. ULONG ulMaxTokenSize,
  119. BOOL *pfLicense )
  120. {
  121. if ( NULL == pfLicense ) {
  122. return E_INVALIDARG;
  123. }
  124. if (IsBadWritePtr(pfLicense, sizeof(DWORD))) {
  125. return E_INVALIDARG;
  126. }
  127. *pfLicense = TRUE;
  128. _fQuery = fQuery;
  129. _ulMaxTokenSize = ulMaxTokenSize;
  130. return S_OK;
  131. }
  132. //+---------------------------------------------------------------------------
  133. //
  134. // Member: CWordBreaker::ComposePhrase
  135. //
  136. // Synopsis: Convert a noun and a modifier into a phrase.
  137. //
  138. // Arguments: [pwcNoun] -- pointer to noun.
  139. // [cwcNoun] -- count of chars in pwcNoun
  140. // [pwcModifier] -- pointer to word modifying pwcNoun
  141. // [cwcModifier] -- count of chars in pwcModifier
  142. // [ulAttachmentType] -- relationship between pwcNoun &pwcModifier
  143. //
  144. //----------------------------------------------------------------------------
  145. SCODE STDMETHODCALLTYPE
  146. CWordBreaker::ComposePhrase(
  147. WCHAR const *pwcNoun,
  148. ULONG cwcNoun,
  149. WCHAR const *pwcModifier,
  150. ULONG cwcModifier,
  151. ULONG ulAttachmentType,
  152. WCHAR *pwcPhrase,
  153. ULONG *pcwcPhrase )
  154. {
  155. //
  156. // Need to code in later
  157. //
  158. if ( _fQuery )
  159. return( E_NOTIMPL );
  160. else
  161. return ( WBREAK_E_QUERY_ONLY );
  162. }
  163. //+---------------------------------------------------------------------------
  164. //
  165. // Member: CWordBreaker::GetLicenseToUse
  166. //
  167. // Synopsis: Returns a pointer to vendors license information
  168. //
  169. // Arguments: [ppwcsLicense] -- ptr to ptr to which license info is returned
  170. //
  171. //----------------------------------------------------------------------------
  172. SCODE STDMETHODCALLTYPE
  173. CWordBreaker::GetLicenseToUse(
  174. const WCHAR **ppwcsLicense )
  175. {
  176. static WCHAR const * wcsCopyright = L"Copyright Microsoft, 1991-1998";
  177. if ( NULL == ppwcsLicense ) {
  178. return E_INVALIDARG;
  179. }
  180. if (IsBadWritePtr(ppwcsLicense, sizeof(DWORD))) {
  181. return E_INVALIDARG;
  182. }
  183. *ppwcsLicense = wcsCopyright;
  184. return( S_OK );
  185. }
  186. //+---------------------------------------------------------------------------
  187. //
  188. // Member: CWordBreaker::BreakText
  189. //
  190. // Synopsis: Break input stream into words.
  191. //
  192. // Arguments: [pTextSource] -- source of Unicode text
  193. // [pWordSink] -- sink for collecting words
  194. // [pPhraseSink] -- sink for collecting phrases
  195. //
  196. // History: 10-Nov-1997, WeibZ, Created.
  197. //
  198. // Notes: Since the input buffer may be greater than MAX_II_BUFFER_LEN
  199. // we process the buffer in chunks of length MAX_II_BUFFER_LEN.
  200. //
  201. //----------------------------------------------------------------------------
  202. SCODE STDMETHODCALLTYPE CWordBreaker::BreakText( TEXT_SOURCE *pTextSource,
  203. IWordSink *pWordSink,
  204. IPhraseSink *pPhraseSink )
  205. {
  206. SCODE sc = S_OK;
  207. ULONG cwc;
  208. SCRIPT_ITEM *pItems, *pItem_Next, *pItem_org;
  209. SCRIPT_ANALYSIS *psa;
  210. PCWSTR pwcInChars;
  211. INT iItems;
  212. BOOL bItemProc;
  213. PCWSTR pwcChars;
  214. INT cChars;
  215. HRESULT retUSP;
  216. BOOL fSucceeded = true;
  217. if ( NULL == pTextSource ) {
  218. return E_INVALIDARG;
  219. }
  220. if ( NULL == pWordSink )
  221. {
  222. // BUGBUG, propagate the null word sink error code
  223. return sc;
  224. }
  225. if ( 0 != pPhraseSink )
  226. {
  227. // ignore the phrase sink for now
  228. // return sc;
  229. }
  230. if (pTextSource->iEnd == pTextSource->iCur) {
  231. return S_OK;
  232. }
  233. Assert( pTextSource->iCur < pTextSource->iEnd );
  234. __try
  235. {
  236. do
  237. {
  238. if ( pTextSource->iCur >= pTextSource->iEnd )
  239. continue;
  240. cwc = pTextSource->iEnd - pTextSource->iCur;
  241. pwcInChars = pTextSource->awcBuffer + pTextSource->iCur;
  242. pItems = (SCRIPT_ITEM *)LocalAlloc(LPTR,sizeof(SCRIPT_ITEM)*(cwc+1));
  243. if ( !pItems) {
  244. return E_UNEXPECTED;
  245. }
  246. pItem_org = pItems;
  247. iItems = 0;
  248. retUSP = ScriptItemize(pwcInChars,cwc,cwc+1, NULL, NULL,
  249. pItems, &iItems);
  250. if (retUSP != S_OK) {
  251. LocalFree(pItem_org);
  252. return E_UNEXPECTED;
  253. }
  254. while ( iItems > 1 ) {
  255. pItem_Next = pItems + 1;
  256. pwcChars = pwcInChars + pItems->iCharPos;
  257. cChars = pItem_Next->iCharPos - pItems->iCharPos;
  258. sc = ProcessItem( pwcChars,
  259. cChars,
  260. pItems,
  261. FALSE, // no need to keep chars
  262. pTextSource,
  263. pWordSink,
  264. pPhraseSink);
  265. if ( ( FAILED( sc ) ) &&
  266. ( FILTER_E_NO_MORE_VALUES != sc ) &&
  267. ( FILTER_E_NO_TEXT != sc ) &&
  268. ( FILTER_E_NO_VALUES != sc ) &&
  269. ( FILTER_E_NO_MORE_TEXT != sc ) &&
  270. ( FILTER_E_END_OF_CHUNKS != sc ) &&
  271. ( FILTER_E_EMBEDDING_UNAVAILABLE != sc ) &&
  272. ( WBREAK_E_END_OF_TEXT != sc ) ) {
  273. LocalFree(pItem_org);
  274. return sc;
  275. }
  276. sc = S_OK;
  277. pItems++;
  278. iItems--;
  279. }
  280. // special handle for the last item
  281. if ( iItems == 1 ) {
  282. pwcChars = pwcInChars + pItems->iCharPos;
  283. cChars = pTextSource->iEnd - pTextSource->iCur;
  284. sc = ProcessItem(pwcChars,
  285. cChars,
  286. pItems,
  287. TRUE, // need to keep chars
  288. pTextSource,
  289. pWordSink,
  290. pPhraseSink);
  291. if ( ( FAILED( sc ) ) &&
  292. ( FILTER_E_NO_MORE_VALUES != sc ) &&
  293. ( FILTER_E_NO_TEXT != sc ) &&
  294. ( FILTER_E_NO_VALUES != sc ) &&
  295. ( FILTER_E_NO_MORE_TEXT != sc ) &&
  296. ( FILTER_E_END_OF_CHUNKS != sc ) &&
  297. ( FILTER_E_EMBEDDING_UNAVAILABLE != sc ) &&
  298. ( WBREAK_E_END_OF_TEXT != sc ) ) {
  299. LocalFree(pItem_org);
  300. return sc;
  301. }
  302. sc = S_OK;
  303. }
  304. if (pItem_org)
  305. LocalFree(pItem_org);
  306. // O11.17064. Under low memory it is possible to pfnFillTextBuffer to failed.
  307. // We will need to return the error of TextSource for loging to Sharepoint.
  308. sc = pTextSource->pfnFillTextBuffer(pTextSource);
  309. fSucceeded = SUCCEEDED(sc);
  310. if ( ( FAILED( sc ) ) &&
  311. ( FILTER_E_NO_MORE_VALUES != sc ) &&
  312. ( FILTER_E_NO_TEXT != sc ) &&
  313. ( FILTER_E_NO_VALUES != sc ) &&
  314. ( FILTER_E_NO_MORE_TEXT != sc ) &&
  315. ( FILTER_E_END_OF_CHUNKS != sc ) &&
  316. ( FILTER_E_EMBEDDING_UNAVAILABLE != sc ) &&
  317. ( WBREAK_E_END_OF_TEXT != sc ) ) {
  318. return sc;
  319. }
  320. sc = S_OK;
  321. } while (fSucceeded);
  322. if ( pTextSource->iCur < pTextSource->iEnd ) {
  323. cwc = pTextSource->iEnd - pTextSource->iCur;
  324. pwcInChars = pTextSource->awcBuffer + pTextSource->iCur;
  325. pItems = (SCRIPT_ITEM *)LocalAlloc(LPTR,sizeof(SCRIPT_ITEM)*(cwc+1));
  326. if ( !pItems ) {
  327. return E_UNEXPECTED;
  328. }
  329. pItem_org = pItems;
  330. iItems = 0;
  331. retUSP = ScriptItemize(pwcInChars,cwc,cwc+1, NULL, NULL,
  332. pItems, &iItems);
  333. if (retUSP != S_OK) {
  334. LocalFree(pItem_org);
  335. return E_UNEXPECTED;
  336. }
  337. while ( iItems > 1 ) {
  338. pItem_Next = pItems + 1;
  339. pwcChars = pwcInChars + pItems->iCharPos;
  340. cChars = pItem_Next->iCharPos - pItems->iCharPos;
  341. sc = ProcessItem(pwcChars,
  342. cChars,
  343. pItems,
  344. FALSE, // no need to keep chars
  345. pTextSource,
  346. pWordSink,
  347. pPhraseSink);
  348. if ( ( FAILED( sc ) ) &&
  349. ( FILTER_E_NO_MORE_VALUES != sc ) &&
  350. ( FILTER_E_NO_TEXT != sc ) &&
  351. ( FILTER_E_NO_VALUES != sc ) &&
  352. ( FILTER_E_NO_MORE_TEXT != sc ) &&
  353. ( FILTER_E_END_OF_CHUNKS != sc ) &&
  354. ( FILTER_E_EMBEDDING_UNAVAILABLE != sc ) &&
  355. ( WBREAK_E_END_OF_TEXT != sc ) ) {
  356. LocalFree(pItem_org);
  357. return sc;
  358. }
  359. sc = S_OK;
  360. pItems++;
  361. iItems--;
  362. }
  363. if ( iItems == 1 ) {
  364. pwcChars = pwcInChars + pItems->iCharPos;
  365. cChars = pTextSource->iEnd - pTextSource->iCur;
  366. sc = ProcessItem(pwcChars,
  367. cChars,
  368. pItems,
  369. FALSE, // no need to keep chars
  370. pTextSource,
  371. pWordSink,
  372. pPhraseSink);
  373. if ( ( FAILED( sc ) ) &&
  374. ( FILTER_E_NO_MORE_VALUES != sc ) &&
  375. ( FILTER_E_NO_TEXT != sc ) &&
  376. ( FILTER_E_NO_VALUES != sc ) &&
  377. ( FILTER_E_NO_MORE_TEXT != sc ) &&
  378. ( FILTER_E_END_OF_CHUNKS != sc ) &&
  379. ( FILTER_E_EMBEDDING_UNAVAILABLE != sc ) &&
  380. ( WBREAK_E_END_OF_TEXT != sc ) ) {
  381. LocalFree(pItem_org);
  382. return sc;
  383. }
  384. sc = S_OK;
  385. }
  386. if ( pItem_org )
  387. LocalFree(pItem_org);
  388. }
  389. } __except(1) {
  390. sc = E_UNEXPECTED;
  391. }
  392. return sc;
  393. }
  394. SCODE CWordBreaker::ProcessItem(
  395. PCWSTR pwcChars,
  396. INT cChars,
  397. SCRIPT_ITEM *pItems,
  398. BOOL fKeep,
  399. TEXT_SOURCE *pTextSource,
  400. IWordSink *pWordSink,
  401. IPhraseSink *pPhraseSink )
  402. {
  403. INT iChar,i;
  404. INT iWord, iWordStart, iWordLen;
  405. const SCRIPT_PROPERTIES **pScript_Properties;
  406. DWORD LangID;
  407. WORD iScript;
  408. HRESULT retUSP;
  409. SCODE scRetVal = S_OK;
  410. ScriptGetProperties(&pScript_Properties, NULL);
  411. iScript = pItems->a.eScript;
  412. LangID = (pScript_Properties[iScript])->langid;
  413. switch (LangID) {
  414. case LANG_THAI:
  415. {
  416. BYTE* pBreakPos;
  417. int iNumberOfBreak = 0;
  418. int i;
  419. WCHAR* pwch = (WCHAR*) pwcChars;
  420. THWB_STRUCT* pThwbStruct = NULL;
  421. pBreakPos = new BYTE[cChars];
  422. if ( pBreakPos == NULL )
  423. return FALSE;
  424. pThwbStruct = THWB_CreateThwbStruct(cChars);
  425. pBreakPos[0] = 0;
  426. iNumberOfBreak = THWB_IndexWordBreak(pwch,cChars, pBreakPos, pThwbStruct,cChars);
  427. for (i=0;i < iNumberOfBreak; i++)
  428. {
  429. // Search index alternate words.
  430. // If not query create Alternate word.
  431. if (pThwbStruct[i].alt != 0 && !_fQuery)
  432. {
  433. int iNumAltWord = 0, k;
  434. BYTE pAltBreakPos[5];
  435. WCHAR* word1 = pwch;
  436. int indexWord1 = 0;
  437. // Find Alternate words
  438. iNumAltWord = THWB_FindAltWord(word1,pBreakPos[i], pThwbStruct[i].alt, pAltBreakPos);
  439. // Put alternate words.
  440. for(k=0; k<iNumAltWord;k++)
  441. {
  442. scRetVal = pWordSink->PutAltWord(pAltBreakPos[k],&word1[indexWord1],pBreakPos[i],pTextSource->iCur);
  443. indexWord1 += pAltBreakPos[k];
  444. }
  445. }
  446. // if PutAltWord not okay return.
  447. if (scRetVal != S_OK)
  448. break;
  449. if (*pwch >= THAI_Ko_Kai && *pwch <= THAI_Vowel_MaiYaMok)
  450. scRetVal = pWordSink->PutWord(pBreakPos[i], pwch, pBreakPos[i], pTextSource->iCur);
  451. if (scRetVal != S_OK)
  452. break;
  453. pTextSource->iCur += pBreakPos[i];
  454. pwch += pBreakPos[i];
  455. }
  456. if (pBreakPos)
  457. delete pBreakPos;
  458. // Prefix bug 1055941 - clear allocated memory.
  459. THWB_DeleteThwbStruct(pThwbStruct);
  460. break;
  461. }
  462. case LANG_ENGLISH : // handle English chars
  463. {
  464. BYTE ct;
  465. BOOL fRomanWord = FALSE;
  466. CONST WCHAR *pwcInput;
  467. WT Type;
  468. Type = WT_START;
  469. pwcInput = pwcChars;
  470. iWordStart = 0;
  471. for (iChar=0; iChar< cChars; iChar++, pwcInput++)
  472. {
  473. ct = GetCharType(*pwcInput);
  474. if ( (ct != WS) && (ct != PS) )
  475. ct = CH;
  476. switch (ct) {
  477. case CH :
  478. if (!fRomanWord) {
  479. iWordStart = iChar;
  480. fRomanWord = TRUE;
  481. Type = WT_ROMAJI;
  482. }
  483. break;
  484. case WS :
  485. if (fRomanWord) {
  486. iWordLen = iChar - iWordStart;
  487. scRetVal = pWordSink->PutWord(iWordLen, pwcChars+iWordStart, iWordLen, pTextSource->iCur);
  488. pTextSource->iCur += iWordLen;
  489. fRomanWord = FALSE;
  490. }
  491. Type = WT_WORD_SEP;
  492. pTextSource->iCur++;
  493. break;
  494. case PS :
  495. if (fRomanWord) {
  496. iWordLen = iChar - iWordStart;
  497. scRetVal = pWordSink->PutWord(iWordLen, pwcChars+iWordStart, iWordLen, pTextSource->iCur);
  498. pTextSource->iCur += iWordLen;
  499. fRomanWord = FALSE;
  500. }
  501. Type = WT_PHRASE_SEP;
  502. scRetVal = pWordSink->PutBreak(WORDREP_BREAK_EOS);
  503. pTextSource->iCur++;
  504. break;
  505. }
  506. if (scRetVal != S_OK)
  507. break;
  508. }
  509. if ((Type == WT_WORD_SEP) || (Type == WT_PHRASE_SEP))
  510. break;
  511. if ( fKeep )
  512. break;
  513. if (scRetVal != S_OK)
  514. break;
  515. iWordLen =cChars - iWordStart;
  516. scRetVal = pWordSink->PutWord(iWordLen, pwcChars+iWordStart, iWordLen,pTextSource->iCur);
  517. pTextSource->iCur += iWordLen;
  518. if (scRetVal != S_OK)
  519. {
  520. break;
  521. }
  522. }
  523. break;
  524. default:
  525. pTextSource->iCur += cChars;
  526. break;
  527. }
  528. return scRetVal;
  529. }