Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

724 lines
20 KiB

  1. //+---------------------------------------------------------------------------
  2. //
  3. // Microsoft Windows
  4. // Copyright (C) Microsoft Corporation, 1997
  5. //
  6. // File: IWBreak.cxx
  7. //
  8. // Contents: Thai Word Breaker glue code
  9. //
  10. // History: weibz, 10-Nov-1997 created
  11. //
  12. //----------------------------------------------------------------------------
  13. #include <pch.cxx>
  14. #include "iwbreak.hxx"
  15. extern long gulcInstances;
  16. //+---------------------------------------------------------------------------
  17. //
  18. // Member: CWordBreaker::CWordBreaker
  19. //
  20. // Synopsis: Constructor for the CWordBreaker class.
  21. //
  22. // Arguments: [lcid] -- locale id
  23. //
  24. //----------------------------------------------------------------------------
  25. CWordBreaker::CWordBreaker( LCID lcid )
  26. : _cRefs(1),
  27. _lcid(lcid)
  28. {
  29. InterlockedIncrement( &gulcInstances );
  30. }
  31. //+---------------------------------------------------------------------------
  32. //
  33. // Member: CWordBreaker::~CWordBreaker
  34. //
  35. // Synopsis: Destructor for the CWordBreaker class.
  36. //
  37. // Notes: All termination/deallocation is done by embedded smart pointers
  38. //
  39. //----------------------------------------------------------------------------
  40. CWordBreaker::~CWordBreaker()
  41. {
  42. InterlockedDecrement( &gulcInstances );
  43. }
  44. //+-------------------------------------------------------------------------
  45. //
  46. // Method: CWordBreaker::QueryInterface
  47. //
  48. // Synopsis: Rebind to other interface
  49. //
  50. // Arguments: [riid] -- IID of new interface
  51. // [ppvObject] -- New interface * returned here
  52. //
  53. // Returns: S_OK if bind succeeded, E_NOINTERFACE if bind failed
  54. //
  55. //--------------------------------------------------------------------------
  56. SCODE STDMETHODCALLTYPE
  57. CWordBreaker::QueryInterface( REFIID riid, void ** ppvObject)
  58. {
  59. //
  60. // Optimize QueryInterface by only checking minimal number of bytes.
  61. //
  62. // IID_IUnknown = 00000000-0000-0000-C000-000000000046
  63. // IID_IWordBreaker = D53552C8-77E3-101A-B552-08002B33B0E6
  64. // --------
  65. // |
  66. // +--- Unique!
  67. //
  68. Assert( (IID_IUnknown.Data1 & 0x000000FF) == 0x00 );
  69. Assert( (IID_IWordBreaker.Data1 & 0x000000FF) == 0xC8 );
  70. IUnknown *pUnkTemp;
  71. SCODE sc = S_OK;
  72. switch( riid.Data1 )
  73. {
  74. case 0x00000000:
  75. if ( memcmp( &IID_IUnknown, &riid, sizeof(riid) ) == 0 )
  76. pUnkTemp = (IUnknown *)this;
  77. else
  78. sc = E_NOINTERFACE;
  79. break;
  80. case 0xD53552C8:
  81. if ( memcmp( &IID_IWordBreaker, &riid, sizeof(riid) ) == 0 )
  82. pUnkTemp = (IUnknown *)(IWordBreaker *)this;
  83. else
  84. sc = E_NOINTERFACE;
  85. break;
  86. default:
  87. pUnkTemp = 0;
  88. sc = E_NOINTERFACE;
  89. break;
  90. }
  91. if( 0 != pUnkTemp )
  92. {
  93. *ppvObject = (void * )pUnkTemp;
  94. pUnkTemp->AddRef();
  95. }
  96. else
  97. *ppvObject = 0;
  98. return(sc);
  99. }
  100. //+-------------------------------------------------------------------------
  101. //
  102. // Method: CWordBreaker::AddRef
  103. //
  104. // Synopsis: Increments refcount
  105. //
  106. //--------------------------------------------------------------------------
  107. ULONG STDMETHODCALLTYPE
  108. CWordBreaker::AddRef()
  109. {
  110. return InterlockedIncrement( &_cRefs );
  111. }
  112. //+-------------------------------------------------------------------------
  113. //
  114. // Method: CWordBreaker::Release
  115. //
  116. // Synopsis: Decrement refcount. Delete if necessary.
  117. //
  118. //--------------------------------------------------------------------------
  119. ULONG STDMETHODCALLTYPE
  120. CWordBreaker::Release()
  121. {
  122. unsigned long uTmp = InterlockedDecrement( &_cRefs );
  123. if ( 0 == uTmp )
  124. delete this;
  125. return(uTmp);
  126. }
  127. //+-------------------------------------------------------------------------
  128. //
  129. // Method: CWordBreaker::Init
  130. //
  131. // Synopsis: Initialize word-breaker
  132. //
  133. // Arguments: [fQuery] -- TRUE if query-time
  134. // [ulMaxTokenSize] -- Maximum size token stored by caller
  135. // [pfLicense] -- Set to true if use restricted
  136. //
  137. // Returns: Status code
  138. //
  139. //--------------------------------------------------------------------------
  140. SCODE STDMETHODCALLTYPE
  141. CWordBreaker::Init(
  142. BOOL fQuery,
  143. ULONG ulMaxTokenSize,
  144. BOOL *pfLicense )
  145. {
  146. if ( NULL == pfLicense ) {
  147. return E_INVALIDARG;
  148. }
  149. if (IsBadWritePtr(pfLicense, sizeof(DWORD))) {
  150. return E_INVALIDARG;
  151. }
  152. *pfLicense = TRUE;
  153. _fQuery = fQuery;
  154. _ulMaxTokenSize = ulMaxTokenSize;
  155. return S_OK;
  156. }
  157. //+---------------------------------------------------------------------------
  158. //
  159. // Member: CWordBreaker::ComposePhrase
  160. //
  161. // Synopsis: Convert a noun and a modifier into a phrase.
  162. //
  163. // Arguments: [pwcNoun] -- pointer to noun.
  164. // [cwcNoun] -- count of chars in pwcNoun
  165. // [pwcModifier] -- pointer to word modifying pwcNoun
  166. // [cwcModifier] -- count of chars in pwcModifier
  167. // [ulAttachmentType] -- relationship between pwcNoun &pwcModifier
  168. //
  169. //----------------------------------------------------------------------------
  170. SCODE STDMETHODCALLTYPE
  171. CWordBreaker::ComposePhrase(
  172. WCHAR const *pwcNoun,
  173. ULONG cwcNoun,
  174. WCHAR const *pwcModifier,
  175. ULONG cwcModifier,
  176. ULONG ulAttachmentType,
  177. WCHAR *pwcPhrase,
  178. ULONG *pcwcPhrase )
  179. {
  180. //
  181. // Need to code in later
  182. //
  183. if ( _fQuery )
  184. return( E_NOTIMPL );
  185. else
  186. return ( WBREAK_E_QUERY_ONLY );
  187. }
  188. //+---------------------------------------------------------------------------
  189. //
  190. // Member: CWordBreaker::GetLicenseToUse
  191. //
  192. // Synopsis: Returns a pointer to vendors license information
  193. //
  194. // Arguments: [ppwcsLicense] -- ptr to ptr to which license info is returned
  195. //
  196. //----------------------------------------------------------------------------
  197. SCODE STDMETHODCALLTYPE
  198. CWordBreaker::GetLicenseToUse(
  199. const WCHAR **ppwcsLicense )
  200. {
  201. static WCHAR const * wcsCopyright = L"Copyright Microsoft, 1991-1998";
  202. if ( NULL == ppwcsLicense ) {
  203. return E_INVALIDARG;
  204. }
  205. if (IsBadWritePtr(ppwcsLicense, sizeof(DWORD))) {
  206. return E_INVALIDARG;
  207. }
  208. *ppwcsLicense = wcsCopyright;
  209. return( S_OK );
  210. }
  211. //+---------------------------------------------------------------------------
  212. //
  213. // Member: CWordBreaker::BreakText
  214. //
  215. // Synopsis: Break input stream into words.
  216. //
  217. // Arguments: [pTextSource] -- source of Unicode text
  218. // [pWordSink] -- sink for collecting words
  219. // [pPhraseSink] -- sink for collecting phrases
  220. //
  221. // History: 10-Nov-1997, WeibZ, Created.
  222. //
  223. // Notes: Since the input buffer may be greater than MAX_II_BUFFER_LEN
  224. // we process the buffer in chunks of length MAX_II_BUFFER_LEN.
  225. //
  226. //----------------------------------------------------------------------------
  227. SCODE STDMETHODCALLTYPE CWordBreaker::BreakText( TEXT_SOURCE *pTextSource,
  228. IWordSink *pWordSink,
  229. IPhraseSink *pPhraseSink )
  230. {
  231. SCODE sc = S_OK;
  232. ULONG cwc;
  233. SCRIPT_ITEM *pItems, *pItem_Next, *pItem_org;
  234. SCRIPT_ANALYSIS *psa;
  235. PCWSTR pwcInChars;
  236. INT iItems;
  237. BOOL bItemProc;
  238. PCWSTR pwcChars;
  239. INT cChars;
  240. HRESULT retUSP;
  241. if ( NULL == pTextSource ) {
  242. return E_INVALIDARG;
  243. }
  244. if ( NULL == pWordSink )
  245. {
  246. // BUGBUG, propagate the null word sink error code
  247. return sc;
  248. }
  249. if ( 0 != pPhraseSink )
  250. {
  251. // ignore the phrase sink for now
  252. // return sc;
  253. }
  254. if (pTextSource->iEnd == pTextSource->iCur) {
  255. return S_OK;
  256. }
  257. Assert( pTextSource->iCur < pTextSource->iEnd );
  258. __try
  259. {
  260. do {
  261. if ( pTextSource->iCur >= pTextSource->iEnd )
  262. continue;
  263. cwc = pTextSource->iEnd - pTextSource->iCur;
  264. pwcInChars = pTextSource->awcBuffer + pTextSource->iCur;
  265. pItems = (SCRIPT_ITEM *)LocalAlloc(LPTR,sizeof(SCRIPT_ITEM)*(cwc+1));
  266. if ( !pItems) {
  267. return E_UNEXPECTED;
  268. }
  269. pItem_org = pItems;
  270. iItems = 0;
  271. retUSP = ScriptItemize(pwcInChars,cwc,cwc+1, NULL, NULL,
  272. pItems, &iItems);
  273. if (retUSP != S_OK) {
  274. LocalFree(pItem_org);
  275. return E_UNEXPECTED;
  276. }
  277. while ( iItems > 1 ) {
  278. pItem_Next = pItems + 1;
  279. pwcChars = pwcInChars + pItems->iCharPos;
  280. cChars = pItem_Next->iCharPos - pItems->iCharPos;
  281. bItemProc = ProcessItem(pwcChars,
  282. cChars,
  283. pItems,
  284. FALSE, // no need to keep chars
  285. pTextSource,
  286. pWordSink,
  287. pPhraseSink);
  288. if ( bItemProc == FALSE ) {
  289. LocalFree(pItem_org);
  290. return E_UNEXPECTED;
  291. }
  292. pItems++;
  293. iItems--;
  294. }
  295. // special handle for the last item
  296. if ( iItems == 1 ) {
  297. pwcChars = pwcInChars + pItems->iCharPos;
  298. cChars = pTextSource->iEnd - pTextSource->iCur;
  299. bItemProc = ProcessItem(pwcChars,
  300. cChars,
  301. pItems,
  302. TRUE, // need to keep chars
  303. pTextSource,
  304. pWordSink,
  305. pPhraseSink);
  306. if ( bItemProc == FALSE ) {
  307. LocalFree(pItem_org);
  308. return E_UNEXPECTED;
  309. }
  310. }
  311. if (pItem_org)
  312. LocalFree(pItem_org);
  313. } while ( SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)) );
  314. if ( pTextSource->iCur < pTextSource->iEnd ) {
  315. cwc = pTextSource->iEnd - pTextSource->iCur;
  316. pwcInChars = pTextSource->awcBuffer + pTextSource->iCur;
  317. pItems = (SCRIPT_ITEM *)LocalAlloc(LPTR,sizeof(SCRIPT_ITEM)*(cwc+1));
  318. if ( !pItems ) {
  319. return E_UNEXPECTED;
  320. }
  321. pItem_org = pItems;
  322. iItems = 0;
  323. retUSP = ScriptItemize(pwcInChars,cwc,cwc+1, NULL, NULL,
  324. pItems, &iItems);
  325. if (retUSP != S_OK) {
  326. LocalFree(pItem_org);
  327. return E_UNEXPECTED;
  328. }
  329. while ( iItems > 1 ) {
  330. pItem_Next = pItems + 1;
  331. pwcChars = pwcInChars + pItems->iCharPos;
  332. cChars = pItem_Next->iCharPos - pItems->iCharPos;
  333. bItemProc = ProcessItem(pwcChars,
  334. cChars,
  335. pItems,
  336. FALSE, // no need to keep chars
  337. pTextSource,
  338. pWordSink,
  339. pPhraseSink);
  340. if ( bItemProc == FALSE ) {
  341. LocalFree(pItem_org);
  342. return E_UNEXPECTED;
  343. }
  344. pItems++;
  345. iItems--;
  346. }
  347. if ( iItems == 1 ) {
  348. pwcChars = pwcInChars + pItems->iCharPos;
  349. cChars = pTextSource->iEnd - pTextSource->iCur;
  350. bItemProc = ProcessItem(pwcChars,
  351. cChars,
  352. pItems,
  353. FALSE, // no need to keep chars
  354. pTextSource,
  355. pWordSink,
  356. pPhraseSink);
  357. if ( bItemProc == FALSE ) {
  358. LocalFree(pItem_org);
  359. return E_UNEXPECTED;
  360. }
  361. }
  362. if ( pItem_org )
  363. LocalFree(pItem_org);
  364. }
  365. } __except(1) {
  366. sc = E_UNEXPECTED;
  367. }
  368. return sc;
  369. }
  370. BOOL CWordBreaker::ProcessItem(
  371. PCWSTR pwcChars,
  372. INT cChars,
  373. SCRIPT_ITEM *pItems,
  374. BOOL fKeep,
  375. TEXT_SOURCE *pTextSource,
  376. IWordSink *pWordSink,
  377. IPhraseSink *pPhraseSink )
  378. {
  379. SCRIPT_LOGATTR *psla, *psla_org, *pslatmp;
  380. INT iChar,i;
  381. INT iWord, iWordStart, iWordLen;
  382. PTHAIWORD pThaiWord, pThaiWordTmp;
  383. BOOL fLastIsWhiteSpace=FALSE;
  384. const SCRIPT_PROPERTIES **pScript_Properties;
  385. DWORD LangID;
  386. WORD iScript;
  387. HRESULT retUSP;
  388. ScriptGetProperties(&pScript_Properties, NULL);
  389. iScript = pItems->a.eScript;
  390. LangID = (pScript_Properties[iScript])->langid;
  391. switch (LangID) {
  392. case LANG_THAI: //Thai script, there are SCRIPT_THAI & SCRIPT_THAI_NUM
  393. // if (pScript_Properties[iScript].fNeedsWordBreaking == FALSE) {
  394. // // this is SCRIPT_THAI_NUM,
  395. //
  396. // for ( iChar=0; iChar < cChars; iChar++ ) {
  397. // pWordSink->PutWord(1,
  398. // pwcChars+iChar,
  399. // 1,
  400. // pTextSource->iCur);
  401. // pTextSource->iCur ++;
  402. // }
  403. //
  404. // break;
  405. //
  406. // }
  407. //
  408. // // Following code is for SCRIPT_THAI
  409. psla = (SCRIPT_LOGATTR *)LocalAlloc(LPTR,
  410. (sizeof(SCRIPT_LOGATTR)+sizeof(THAIWORD))*cChars);
  411. psla_org = psla;
  412. pThaiWord = (PTHAIWORD)&psla[cChars];
  413. if ( psla == NULL )
  414. return FALSE;
  415. retUSP = ScriptBreak(pwcChars, cChars, &(pItems->a), psla);
  416. if (retUSP != S_OK) {
  417. LocalFree(psla_org);
  418. return FALSE;
  419. }
  420. pThaiWordTmp = pThaiWord;
  421. // check if the last char is a white space.
  422. if ( psla[cChars-1].fWhiteSpace )
  423. fLastIsWhiteSpace=TRUE;
  424. iWordStart = -1;
  425. iWord=0;
  426. for (iChar=0; iChar < cChars; iChar++) {
  427. if (psla->fWordStop) {
  428. if ( iWordStart != -1 ) {
  429. iWordLen = iChar - iWordStart;
  430. if (iWordLen > 0) {
  431. pThaiWordTmp->offset = iWordStart;
  432. pThaiWordTmp->len = iWordLen;
  433. pThaiWordTmp++;
  434. iWord++;
  435. }
  436. }
  437. iWordStart = iChar;
  438. }
  439. if ( psla->fWhiteSpace ) {
  440. if ( iWordStart != -1 ) {
  441. iWordLen = iChar - iWordStart;
  442. if (iWordLen > 0 ) {
  443. pThaiWordTmp->offset = iWordStart;
  444. pThaiWordTmp->len = iWordLen;
  445. pThaiWordTmp++;
  446. iWord++;
  447. }
  448. }
  449. iWordStart = iChar;
  450. // skip all the consequent white spaces.
  451. while ( (psla->fWhiteSpace) && (iChar < cChars) ) {
  452. iWordStart = iChar;
  453. iChar ++;
  454. psla++;
  455. }
  456. if ( iChar < cChars)
  457. iWordStart = iChar;
  458. }
  459. psla++;
  460. }
  461. if ( !fLastIsWhiteSpace ) {
  462. iWordLen = cChars - iWordStart;
  463. if ( iWordLen > 0 ) {
  464. pThaiWordTmp->offset = iWordStart;
  465. pThaiWordTmp->len = iWordLen;
  466. iWord++;
  467. }
  468. }
  469. // if this is the last item, and the last char is a Thai char,
  470. // we will discard the last 4 words and integrate into the next
  471. // filled buffer and do again!!!
  472. if (fKeep && !fLastIsWhiteSpace)
  473. iWord = iWord - 4;
  474. pThaiWordTmp = pThaiWord;
  475. for (i=0; i<iWord; i++) {
  476. iWordLen = pThaiWordTmp->len;
  477. iWordStart = pThaiWordTmp->offset;
  478. pWordSink->PutWord(iWordLen,
  479. pwcChars+iWordStart,
  480. iWordLen,
  481. pTextSource->iCur+iWordStart);
  482. pThaiWordTmp++;
  483. }
  484. if ( fKeep && !fLastIsWhiteSpace)
  485. pTextSource->iCur += iWordStart + iWordLen;
  486. else
  487. pTextSource->iCur += cChars;
  488. LocalFree(psla_org);
  489. break;
  490. case LANG_ENGLISH : // handle English chars
  491. {
  492. BYTE ct;
  493. BOOL fRomanWord = FALSE;
  494. CONST WCHAR *pwcInput;
  495. WT Type;
  496. Type = WT_START;
  497. pwcInput = pwcChars;
  498. iWordStart = 0;
  499. for (iChar=0; iChar< cChars; iChar++, pwcInput++) {
  500. ct = GetCharType(*pwcInput);
  501. if ( (ct != WS) && (ct != PS) )
  502. ct = CH;
  503. switch (ct) {
  504. case CH :
  505. if (!fRomanWord) {
  506. iWordStart = iChar;
  507. fRomanWord = TRUE;
  508. Type = WT_ROMAJI;
  509. }
  510. break;
  511. case WS :
  512. if (fRomanWord) {
  513. iWordLen = iChar - iWordStart;
  514. pWordSink->PutWord(iWordLen,
  515. pwcChars+iWordStart,
  516. iWordLen,
  517. pTextSource->iCur);
  518. pTextSource->iCur += iWordLen;
  519. fRomanWord = FALSE;
  520. }
  521. Type = WT_WORD_SEP;
  522. pTextSource->iCur++;
  523. break;
  524. case PS :
  525. if (fRomanWord) {
  526. iWordLen = iChar - iWordStart;
  527. pWordSink->PutWord(iWordLen,
  528. pwcChars+iWordStart,
  529. iWordLen,
  530. pTextSource->iCur);
  531. pTextSource->iCur += iWordLen;
  532. fRomanWord = FALSE;
  533. }
  534. Type = WT_PHRASE_SEP;
  535. pWordSink->PutBreak(WORDREP_BREAK_EOS);
  536. pTextSource->iCur++;
  537. break;
  538. }
  539. }
  540. if ((Type == WT_WORD_SEP) || (Type == WT_PHRASE_SEP))
  541. break;
  542. if ( fKeep )
  543. break;
  544. iWordLen =cChars - iWordStart;
  545. pWordSink->PutWord(iWordLen,
  546. pwcChars+iWordStart,
  547. iWordLen,
  548. pTextSource->iCur);
  549. pTextSource->iCur += iWordLen;
  550. }
  551. break;
  552. default :
  553. pTextSource->iCur += cChars;
  554. break;
  555. }
  556. return TRUE;
  557. }