Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

619 lines
18 KiB

  1. //+---------------------------------------------------------------------------
  2. //
  3. // Microsoft Windows
  4. // Copyright (C) Microsoft Corporation, 1997
  5. //
  6. // File: IWBreak.cxx
  7. //
  8. // Contents: Thai Word Breaker glue code
  9. //
  10. // History: weibz, 10-Nov-1997 created
  11. //
  12. //----------------------------------------------------------------------------
  13. #include <pch.cxx>
  14. #include "iwbreak.hxx"
  15. #include "thwbint.h"
  16. #define MAX_BREAKS 255
  17. #define WB_NORMAL 1
  18. extern long gulcInstances;
  19. //+---------------------------------------------------------------------------
  20. //
  21. // Member: CWordBreaker::CWordBreaker
  22. //
  23. // Synopsis: Constructor for the CWordBreaker class.
  24. //
  25. // Arguments: [lcid] -- locale id
  26. //
  27. //----------------------------------------------------------------------------
  28. CWordBreaker::CWordBreaker( LCID lcid )
  29. : _cRefs(1),
  30. _lcid(lcid)
  31. {
  32. InterlockedIncrement( &gulcInstances );
  33. }
  34. //+---------------------------------------------------------------------------
  35. //
  36. // Member: CWordBreaker::~CWordBreaker
  37. //
  38. // Synopsis: Destructor for the CWordBreaker class.
  39. //
  40. // Notes: All termination/deallocation is done by embedded smart pointers
  41. //
  42. //----------------------------------------------------------------------------
  43. CWordBreaker::~CWordBreaker()
  44. {
  45. InterlockedDecrement( &gulcInstances );
  46. }
  47. //+-------------------------------------------------------------------------
  48. //
  49. // Method: CWordBreaker::QueryInterface
  50. //
  51. // Synopsis: Rebind to other interface
  52. //
  53. // Arguments: [riid] -- IID of new interface
  54. // [ppvObject] -- New interface * returned here
  55. //
  56. // Returns: S_OK if bind succeeded, E_NOINTERFACE if bind failed
  57. //
  58. //--------------------------------------------------------------------------
  59. SCODE STDMETHODCALLTYPE
  60. CWordBreaker::QueryInterface( REFIID riid, void ** ppvObject)
  61. {
  62. if ( 0 == ppvObject )
  63. return E_INVALIDARG;
  64. *ppvObject = 0;
  65. if ( IID_IWordBreaker == riid )
  66. *ppvObject = (IUnknown *)(IWordBreaker *)this;
  67. else if ( IID_IUnknown == riid )
  68. *ppvObject = (IUnknown *)this;
  69. else
  70. return E_NOINTERFACE;
  71. AddRef();
  72. return S_OK;
  73. }
  74. //+-------------------------------------------------------------------------
  75. //
  76. // Method: CWordBreaker::AddRef
  77. //
  78. // Synopsis: Increments refcount
  79. //
  80. //--------------------------------------------------------------------------
  81. ULONG STDMETHODCALLTYPE
  82. CWordBreaker::AddRef()
  83. {
  84. return InterlockedIncrement( &_cRefs );
  85. }
  86. //+-------------------------------------------------------------------------
  87. //
  88. // Method: CWordBreaker::Release
  89. //
  90. // Synopsis: Decrement refcount. Delete if necessary.
  91. //
  92. //--------------------------------------------------------------------------
  93. ULONG STDMETHODCALLTYPE
  94. CWordBreaker::Release()
  95. {
  96. unsigned long uTmp = InterlockedDecrement( &_cRefs );
  97. if ( 0 == uTmp )
  98. delete this;
  99. return(uTmp);
  100. }
  101. //+-------------------------------------------------------------------------
  102. //
  103. // Method: CWordBreaker::Init
  104. //
  105. // Synopsis: Initialize word-breaker
  106. //
  107. // Arguments: [fQuery] -- TRUE if query-time
  108. // [ulMaxTokenSize] -- Maximum size token stored by caller
  109. // [pfLicense] -- Set to true if use restricted
  110. //
  111. // Returns: Status code
  112. //
  113. //--------------------------------------------------------------------------
  114. SCODE STDMETHODCALLTYPE
  115. CWordBreaker::Init(
  116. BOOL fQuery,
  117. ULONG ulMaxTokenSize,
  118. BOOL *pfLicense )
  119. {
  120. if ( NULL == pfLicense ) {
  121. return E_INVALIDARG;
  122. }
  123. if (IsBadWritePtr(pfLicense, sizeof(DWORD))) {
  124. return E_INVALIDARG;
  125. }
  126. *pfLicense = TRUE;
  127. _fQuery = fQuery;
  128. _ulMaxTokenSize = ulMaxTokenSize;
  129. return S_OK;
  130. }
  131. //+---------------------------------------------------------------------------
  132. //
  133. // Member: CWordBreaker::ComposePhrase
  134. //
  135. // Synopsis: Convert a noun and a modifier into a phrase.
  136. //
  137. // Arguments: [pwcNoun] -- pointer to noun.
  138. // [cwcNoun] -- count of chars in pwcNoun
  139. // [pwcModifier] -- pointer to word modifying pwcNoun
  140. // [cwcModifier] -- count of chars in pwcModifier
  141. // [ulAttachmentType] -- relationship between pwcNoun &pwcModifier
  142. //
  143. //----------------------------------------------------------------------------
  144. SCODE STDMETHODCALLTYPE
  145. CWordBreaker::ComposePhrase(
  146. WCHAR const *pwcNoun,
  147. ULONG cwcNoun,
  148. WCHAR const *pwcModifier,
  149. ULONG cwcModifier,
  150. ULONG ulAttachmentType,
  151. WCHAR *pwcPhrase,
  152. ULONG *pcwcPhrase )
  153. {
  154. //
  155. // Need to code in later
  156. //
  157. if ( _fQuery )
  158. return( E_NOTIMPL );
  159. else
  160. return ( WBREAK_E_QUERY_ONLY );
  161. }
  162. //+---------------------------------------------------------------------------
  163. //
  164. // Member: CWordBreaker::GetLicenseToUse
  165. //
  166. // Synopsis: Returns a pointer to vendors license information
  167. //
  168. // Arguments: [ppwcsLicense] -- ptr to ptr to which license info is returned
  169. //
  170. //----------------------------------------------------------------------------
  171. SCODE STDMETHODCALLTYPE
  172. CWordBreaker::GetLicenseToUse(
  173. const WCHAR **ppwcsLicense )
  174. {
  175. static WCHAR const * wcsCopyright = L"Copyright Microsoft, 1991-1998";
  176. if ( NULL == ppwcsLicense ) {
  177. return E_INVALIDARG;
  178. }
  179. if (IsBadWritePtr(ppwcsLicense, sizeof(DWORD))) {
  180. return E_INVALIDARG;
  181. }
  182. *ppwcsLicense = wcsCopyright;
  183. return( S_OK );
  184. }
  185. //+---------------------------------------------------------------------------
  186. //
  187. // Member: CWordBreaker::BreakText
  188. //
  189. // Synopsis: Break input stream into words.
  190. //
  191. // Arguments: [pTextSource] -- source of Unicode text
  192. // [pWordSink] -- sink for collecting words
  193. // [pPhraseSink] -- sink for collecting phrases
  194. //
  195. // History: 10-Nov-1997, WeibZ, Created.
  196. //
  197. // Notes: Since the input buffer may be greater than MAX_II_BUFFER_LEN
  198. // we process the buffer in chunks of length MAX_II_BUFFER_LEN.
  199. //
  200. //----------------------------------------------------------------------------
  201. SCODE STDMETHODCALLTYPE CWordBreaker::BreakText( TEXT_SOURCE *pTextSource,
  202. IWordSink *pWordSink,
  203. IPhraseSink *pPhraseSink )
  204. {
  205. SCODE sc = S_OK;
  206. ULONG cwc;
  207. SCRIPT_ITEM *pItems, *pItem_Next, *pItem_org;
  208. SCRIPT_ANALYSIS *psa;
  209. PCWSTR pwcInChars;
  210. INT iItems;
  211. BOOL bItemProc;
  212. PCWSTR pwcChars;
  213. INT cChars;
  214. HRESULT retUSP;
  215. if ( NULL == pTextSource ) {
  216. return E_INVALIDARG;
  217. }
  218. if ( NULL == pWordSink )
  219. {
  220. // BUGBUG, propagate the null word sink error code
  221. return sc;
  222. }
  223. if ( 0 != pPhraseSink )
  224. {
  225. // ignore the phrase sink for now
  226. // return sc;
  227. }
  228. if (pTextSource->iEnd == pTextSource->iCur) {
  229. return S_OK;
  230. }
  231. Assert( pTextSource->iCur < pTextSource->iEnd );
  232. __try
  233. {
  234. do {
  235. if ( pTextSource->iCur >= pTextSource->iEnd )
  236. continue;
  237. cwc = pTextSource->iEnd - pTextSource->iCur;
  238. pwcInChars = pTextSource->awcBuffer + pTextSource->iCur;
  239. pItems = (SCRIPT_ITEM *)LocalAlloc(LPTR,sizeof(SCRIPT_ITEM)*(cwc+1));
  240. if ( !pItems) {
  241. return E_UNEXPECTED;
  242. }
  243. pItem_org = pItems;
  244. iItems = 0;
  245. retUSP = ScriptItemize(pwcInChars,cwc,cwc+1, NULL, NULL,
  246. pItems, &iItems);
  247. if (retUSP != S_OK) {
  248. LocalFree(pItem_org);
  249. return E_UNEXPECTED;
  250. }
  251. while ( iItems > 1 ) {
  252. pItem_Next = pItems + 1;
  253. pwcChars = pwcInChars + pItems->iCharPos;
  254. cChars = pItem_Next->iCharPos - pItems->iCharPos;
  255. bItemProc = ProcessItem(pwcChars,
  256. cChars,
  257. pItems,
  258. FALSE, // no need to keep chars
  259. pTextSource,
  260. pWordSink,
  261. pPhraseSink);
  262. if ( bItemProc == FALSE ) {
  263. LocalFree(pItem_org);
  264. return E_UNEXPECTED;
  265. }
  266. pItems++;
  267. iItems--;
  268. }
  269. // special handle for the last item
  270. if ( iItems == 1 ) {
  271. pwcChars = pwcInChars + pItems->iCharPos;
  272. cChars = pTextSource->iEnd - pTextSource->iCur;
  273. bItemProc = ProcessItem(pwcChars,
  274. cChars,
  275. pItems,
  276. TRUE, // need to keep chars
  277. pTextSource,
  278. pWordSink,
  279. pPhraseSink);
  280. if ( bItemProc == FALSE ) {
  281. LocalFree(pItem_org);
  282. return E_UNEXPECTED;
  283. }
  284. }
  285. if (pItem_org)
  286. LocalFree(pItem_org);
  287. } while ( SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)) );
  288. if ( pTextSource->iCur < pTextSource->iEnd ) {
  289. cwc = pTextSource->iEnd - pTextSource->iCur;
  290. pwcInChars = pTextSource->awcBuffer + pTextSource->iCur;
  291. pItems = (SCRIPT_ITEM *)LocalAlloc(LPTR,sizeof(SCRIPT_ITEM)*(cwc+1));
  292. if ( !pItems ) {
  293. return E_UNEXPECTED;
  294. }
  295. pItem_org = pItems;
  296. iItems = 0;
  297. retUSP = ScriptItemize(pwcInChars,cwc,cwc+1, NULL, NULL,
  298. pItems, &iItems);
  299. if (retUSP != S_OK) {
  300. LocalFree(pItem_org);
  301. return E_UNEXPECTED;
  302. }
  303. while ( iItems > 1 ) {
  304. pItem_Next = pItems + 1;
  305. pwcChars = pwcInChars + pItems->iCharPos;
  306. cChars = pItem_Next->iCharPos - pItems->iCharPos;
  307. bItemProc = ProcessItem(pwcChars,
  308. cChars,
  309. pItems,
  310. FALSE, // no need to keep chars
  311. pTextSource,
  312. pWordSink,
  313. pPhraseSink);
  314. if ( bItemProc == FALSE ) {
  315. LocalFree(pItem_org);
  316. return E_UNEXPECTED;
  317. }
  318. pItems++;
  319. iItems--;
  320. }
  321. if ( iItems == 1 ) {
  322. pwcChars = pwcInChars + pItems->iCharPos;
  323. cChars = pTextSource->iEnd - pTextSource->iCur;
  324. bItemProc = ProcessItem(pwcChars,
  325. cChars,
  326. pItems,
  327. FALSE, // no need to keep chars
  328. pTextSource,
  329. pWordSink,
  330. pPhraseSink);
  331. if ( bItemProc == FALSE ) {
  332. LocalFree(pItem_org);
  333. return E_UNEXPECTED;
  334. }
  335. }
  336. if ( pItem_org )
  337. LocalFree(pItem_org);
  338. }
  339. } __except(1) {
  340. sc = E_UNEXPECTED;
  341. }
  342. return sc;
  343. }
  344. BOOL CWordBreaker::ProcessItem(
  345. PCWSTR pwcChars,
  346. INT cChars,
  347. SCRIPT_ITEM *pItems,
  348. BOOL fKeep,
  349. TEXT_SOURCE *pTextSource,
  350. IWordSink *pWordSink,
  351. IPhraseSink *pPhraseSink )
  352. {
  353. // SCRIPT_LOGATTR *psla, *psla_org, *pslatmp;
  354. INT iChar,i;
  355. INT iWord, iWordStart, iWordLen;
  356. // PTHAIWORD pThaiWord, pThaiWordTmp;
  357. // BOOL fLastIsWhiteSpace=FALSE;
  358. const SCRIPT_PROPERTIES **pScript_Properties;
  359. DWORD LangID;
  360. WORD iScript;
  361. HRESULT retUSP;
  362. ScriptGetProperties(&pScript_Properties, NULL);
  363. iScript = pItems->a.eScript;
  364. LangID = (pScript_Properties[iScript])->langid;
  365. switch (LangID) {
  366. case LANG_THAI:
  367. {
  368. BYTE* pBreakPos;
  369. int iNumberOfBreak = 0;
  370. int i;
  371. WCHAR* pwch = (WCHAR*) pwcChars;
  372. THWB_STRUCT* pThwbStruct = NULL;
  373. pBreakPos = new BYTE[cChars];
  374. if ( pBreakPos == NULL )
  375. return FALSE;
  376. pThwbStruct = THWB_CreateThwbStruct(cChars);
  377. pBreakPos[0] = 0;
  378. // iNumberOfBreak = THWB_FindWordBreak(pwch, cChars, pBreakPos, cChars, WB_NORMAL);
  379. iNumberOfBreak = THWB_IndexWordBreak(pwch,cChars, pBreakPos, pThwbStruct,cChars);
  380. for (i=0;i < iNumberOfBreak; i++)
  381. {
  382. // Search index alternate words.
  383. // If not query create Alternate word.
  384. if (pThwbStruct[i].alt != 0 && !_fQuery)
  385. {
  386. int iNumAltWord = 0, k;
  387. BYTE pAltBreakPos[5];
  388. WCHAR* word1 = pwch;
  389. int indexWord1 = 0;
  390. // Find Alternate words
  391. iNumAltWord = THWB_FindAltWord(word1,pBreakPos[i], pThwbStruct[i].alt, pAltBreakPos);
  392. // Put alternate words.
  393. for(k=0; k<iNumAltWord;k++)
  394. {
  395. pWordSink->PutAltWord(pAltBreakPos[k],&word1[indexWord1],pBreakPos[i],pTextSource->iCur);
  396. indexWord1 += pAltBreakPos[k];
  397. }
  398. }
  399. if (*pwch >= THAI_Ko_Kai && *pwch <= THAI_Vowel_MaiYaMok)
  400. pWordSink->PutWord( pBreakPos[i], pwch, pBreakPos[i], pTextSource->iCur);
  401. pTextSource->iCur += pBreakPos[i];
  402. pwch += pBreakPos[i];
  403. }
  404. if (pBreakPos)
  405. delete pBreakPos;
  406. // Prefix bug 1055941 - clear allocated memory.
  407. THWB_DeleteThwbStruct(pThwbStruct);
  408. break;
  409. }
  410. case LANG_ENGLISH : // handle English chars
  411. {
  412. BYTE ct;
  413. BOOL fRomanWord = FALSE;
  414. CONST WCHAR *pwcInput;
  415. WT Type;
  416. Type = WT_START;
  417. pwcInput = pwcChars;
  418. iWordStart = 0;
  419. for (iChar=0; iChar< cChars; iChar++, pwcInput++) {
  420. ct = GetCharType(*pwcInput);
  421. if ( (ct != WS) && (ct != PS) )
  422. ct = CH;
  423. switch (ct) {
  424. case CH :
  425. if (!fRomanWord) {
  426. iWordStart = iChar;
  427. fRomanWord = TRUE;
  428. Type = WT_ROMAJI;
  429. }
  430. break;
  431. case WS :
  432. if (fRomanWord) {
  433. iWordLen = iChar - iWordStart;
  434. pWordSink->PutWord(iWordLen,
  435. pwcChars+iWordStart,
  436. iWordLen,
  437. pTextSource->iCur);
  438. pTextSource->iCur += iWordLen;
  439. fRomanWord = FALSE;
  440. }
  441. Type = WT_WORD_SEP;
  442. pTextSource->iCur++;
  443. break;
  444. case PS :
  445. if (fRomanWord) {
  446. iWordLen = iChar - iWordStart;
  447. pWordSink->PutWord(iWordLen,
  448. pwcChars+iWordStart,
  449. iWordLen,
  450. pTextSource->iCur);
  451. pTextSource->iCur += iWordLen;
  452. fRomanWord = FALSE;
  453. }
  454. Type = WT_PHRASE_SEP;
  455. pWordSink->PutBreak(WORDREP_BREAK_EOS);
  456. pTextSource->iCur++;
  457. break;
  458. }
  459. }
  460. if ((Type == WT_WORD_SEP) || (Type == WT_PHRASE_SEP))
  461. break;
  462. if ( fKeep )
  463. break;
  464. iWordLen =cChars - iWordStart;
  465. pWordSink->PutWord(iWordLen,
  466. pwcChars+iWordStart,
  467. iWordLen,
  468. pTextSource->iCur);
  469. pTextSource->iCur += iWordLen;
  470. }
  471. break;
  472. default :
  473. pTextSource->iCur += cChars;
  474. break;
  475. }
  476. return TRUE;
  477. }