Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

497 lines
15 KiB

  1. #include <windows.h>
  2. #include "IWBrKr.h"
  3. #include "DefBrKr.h"
  4. #define ZERO_WIDTH_SPACE 0x200B
  5. #define MAX_Def_WordBrKr_Prcess_Len 1000
  6. BOOL IsWinNT(void)
  7. {
  8. OSVERSIONINFOA osVersionInfo;
  9. BOOL fRet = FALSE;
  10. osVersionInfo.dwOSVersionInfoSize = sizeof(osVersionInfo);
  11. GetVersionExA(&osVersionInfo);
  12. if (osVersionInfo.dwPlatformId == VER_PLATFORM_WIN32_NT) {
  13. fRet = TRUE;
  14. }
  15. return fRet;
  16. }
  17. BOOL MyGetStringTypeEx(
  18. LCID LocalID,
  19. DWORD dwInfoType,
  20. const WCHAR *lpSrcStr, // unicode base
  21. INT cchSrc,
  22. LPWORD lpCharType)
  23. {
  24. BOOL fRet = FALSE;
  25. if (IsWinNT()) {
  26. fRet = GetStringTypeW(dwInfoType, lpSrcStr, cchSrc,lpCharType);
  27. } else {
  28. DWORD dwANSISize = 0;
  29. dwANSISize = WideCharToMultiByte(GetACP(), WC_COMPOSITECHECK, lpSrcStr, cchSrc,
  30. NULL, 0, NULL, NULL);
  31. if (dwANSISize) {
  32. LPSTR lpAnsiStr = NULL;
  33. lpAnsiStr = new CHAR[dwANSISize];
  34. if (lpAnsiStr) {
  35. dwANSISize = WideCharToMultiByte(GetACP(), WC_COMPOSITECHECK, lpSrcStr, cchSrc,
  36. lpAnsiStr, dwANSISize, NULL, NULL);
  37. fRet = GetStringTypeExA(LocalID, dwInfoType, lpAnsiStr, dwANSISize, lpCharType);
  38. if (ERROR_INVALID_PARAMETER == GetLastError() && (CT_CTYPE1 == dwInfoType || CT_CTYPE3 == dwInfoType)) {
  39. for (INT i = 0; i < cchSrc; ++i) {
  40. switch (dwInfoType) {
  41. case CT_CTYPE1:
  42. lpCharType[i] = C1_ALPHA;
  43. break;
  44. case CT_CTYPE3:
  45. lpCharType[i] = (C3_NONSPACING | C3_ALPHA);
  46. break;
  47. }
  48. }
  49. fRet = TRUE;
  50. }
  51. delete [] lpAnsiStr;
  52. lpAnsiStr = NULL;
  53. }
  54. }
  55. }
  56. return fRet;
  57. }
  58. CDefWordBreaker::CDefWordBreaker()
  59. {
  60. ccCompare = MAX_Def_WordBrKr_Prcess_Len;
  61. }
  62. //+-------------------------------------------------------------------------
  63. //
  64. // Method: CDefWordBreaker::IsWordChar
  65. //
  66. // Synopsis: Find whether the i'th character in the buffer _awString
  67. // is a word character (rather than word break)
  68. //
  69. // Arguments: [i] -- index into _awString
  70. //
  71. // History: 22-Jul-1994 BartoszM Created
  72. //
  73. //--------------------------------------------------------------------------
  74. inline BOOL CDefWordBreaker::IsWordChar(
  75. int i,
  76. PWORD _aCharInfo1,
  77. PWORD _aCharInfo3,
  78. const WCHAR* pwcChunk) const
  79. {
  80. if ( (_aCharInfo1[i] & (C1_ALPHA | C1_DIGIT))
  81. || (_aCharInfo3[i] & C3_NONSPACING) )
  82. {
  83. return TRUE;
  84. }
  85. WCHAR c = pwcChunk[i];
  86. if (c == L'_')
  87. return TRUE;
  88. if (c == 0xa0) // non breaking space
  89. {
  90. // followed by a non-spacing character
  91. // (looking ahead is okay)
  92. if (_aCharInfo3[i+1] & C3_NONSPACING)
  93. return TRUE;
  94. }
  95. return FALSE;
  96. }
  97. //+---------------------------------------------------------------------------
  98. //
  99. // Member: CDefWordBreaker::ScanChunk
  100. //
  101. // Synopsis: For each character find its type
  102. //
  103. //
  104. // History: 16-Aug-94 BartoszM Created
  105. //
  106. //----------------------------------------------------------------------------
  107. BOOL CDefWordBreaker::ScanChunk(
  108. PWORD _aCharInfo1,
  109. PWORD _aCharInfo3,
  110. const WCHAR *pwcChunk,
  111. ULONG ucwc)
  112. {
  113. BOOL fRet = FALSE;
  114. // POSIX character typing, Source, Size of source, Character info
  115. if (!MyGetStringTypeEx(GetSystemDefaultLCID(), CT_CTYPE1, pwcChunk, ucwc, _aCharInfo1)) {
  116. // Additional POSIX, Source, Size of source, Character info 3
  117. } else if (!MyGetStringTypeEx(GetSystemDefaultLCID(), CT_CTYPE3, pwcChunk, ucwc, _aCharInfo3)) { //
  118. } else {
  119. fRet = TRUE;
  120. }
  121. return fRet;
  122. }
  123. /*
  124. BOOL CDefWordBreaker::ScanChunk(
  125. PWORD _aCharInfo1,
  126. PWORD _aCharInfo3,
  127. const WCHAR *pwcChunk,
  128. ULONG ucwc)
  129. {
  130. //
  131. // GetStringTypeW is returning error 87 (ERROR_INVALID_PARAMETER) if
  132. // we pass in a null string.
  133. //
  134. // Win4Assert( (0 != _cMapped) && (0 != _pwcChunk) );
  135. if (IsWinNT())
  136. {
  137. if (!MyGetStringTypeEx(0, // Dummy
  138. CT_CTYPE1, // POSIX character typing
  139. pwcChunk, // Source
  140. ucwc, // Size of source
  141. _aCharInfo1 ) ) // Character info
  142. {
  143. return FALSE;
  144. }
  145. if ( !MyGetStringTypeEx(0, // Dummy
  146. CT_CTYPE3, // Additional POSIX
  147. pwcChunk, // Source
  148. ucwc, // Size of source
  149. _aCharInfo3 ) ) // Character info 3
  150. {
  151. return FALSE;
  152. }
  153. }
  154. else
  155. {
  156. //
  157. // BUGBUG: This is all wrong -- we don't know if this is the right
  158. // locale to use and there isn't a way to know at this point.
  159. //
  160. if (!MyGetStringTypeEx( GetSystemDefaultLCID(),
  161. CT_CTYPE1, // POSIX character typing
  162. pwcChunk, // Source
  163. ucwc, // Size of source
  164. _aCharInfo1 ) ) // Character info
  165. {
  166. // ciDebugOut(( DEB_ERROR, "GetStringTypeW returned %d\n",
  167. // GetLastError() ));
  168. // Win9x just stinks. No 2 ways about it.
  169. if ( ERROR_INVALID_PARAMETER == GetLastError() )
  170. {
  171. for ( unsigned i = 0; i < ucwc; i++ )
  172. _aCharInfo1[i] = C1_ALPHA;
  173. return TRUE;
  174. }
  175. return FALSE;
  176. }
  177. if ( !MyGetStringTypeEx(GetSystemDefaultLCID(),
  178. CT_CTYPE3, // Additional POSIX
  179. pwcChunk, // Source
  180. ucwc, // Size of source
  181. _aCharInfo3 ) ) // Character info 3
  182. {
  183. // ciDebugOut(( DEB_ERROR, "GetStringTypeW CTYPE3 returned %d\n",
  184. // GetLastError() ));
  185. // Win9x just stinks. No 2 ways about it.
  186. if ( ERROR_INVALID_PARAMETER == GetLastError() )
  187. {
  188. for ( unsigned i = 0; i < ucwc; i++ )
  189. _aCharInfo3[i] = ( C3_NONSPACING | C3_ALPHA );
  190. return TRUE;
  191. }
  192. return FALSE;
  193. }
  194. }
  195. return TRUE;
  196. } //ScanChunk
  197. */
  198. //+---------------------------------------------------------------------------
  199. //
  200. // Member: CDefWordBreaker::BreakText
  201. //
  202. // Synopsis: Break input stream into words.
  203. //
  204. // Arguments: [pTextSource] - source of input buffers
  205. // [pWordSink] - sink for words
  206. // [pPhraseSink] - sink for noun phrases
  207. //
  208. // History: 07-June-91 t-WadeR Created
  209. // 12-Oct-92 AmyA Added Unicode support
  210. // 18-Nov-92 AmyA Overloaded
  211. // 11-Apr-94 KyleP Sync with spec
  212. // 26-Aug-94 BartoszM Fixed Unicode parsing
  213. //
  214. //----------------------------------------------------------------------------
  215. SCODE CDefWordBreaker::BreakText(
  216. TEXT_SOURCE *pTextSource,
  217. IWordSink *pWordSink,
  218. IPhraseSink *pPhraseSink,
  219. DWORD dwBase)
  220. {
  221. LPWORD _aCharInfo1 = NULL;
  222. LPWORD _aCharInfo3 = NULL;
  223. if ( 0 == pTextSource )
  224. return E_INVALIDARG;
  225. if ( 0 == pWordSink || pTextSource->iCur == pTextSource->iEnd)
  226. return S_OK;
  227. if (pTextSource->iCur > pTextSource->iEnd)
  228. {
  229. // Win4Assert ( !"BreakText called with bad TEXT_SOURCE" );
  230. return E_FAIL;
  231. }
  232. SCODE sc = S_OK;
  233. ULONG cwc, cwcProcd; // cwcProcd is # chars actually processed by Tokenize()
  234. do {
  235. //
  236. // Flag for first time thru loop below. This is to fix the case
  237. // where the length of the buffer passed in is less than
  238. // MAX_II_BUFFER_LEN. In this case iEnd-iCur is <= MAX_II_BUFFER_LEN
  239. // and we break out the inner loop and call
  240. // pfnFillTextBuffer without having processed any characters,
  241. // and so pfnFillTextBuffer returns TRUE without adding any new
  242. // characters and this results in an infinite loop.
  243. BOOL fFirstTime = TRUE;
  244. while (pTextSource->iCur < pTextSource->iEnd) {
  245. cwc = pTextSource->iEnd - pTextSource->iCur;
  246. // Process in buckets of MAX_II_BUFER_LEN only
  247. if (cwc >= CDefWordBreaker::ccCompare) {
  248. cwc = CDefWordBreaker::ccCompare;
  249. } else if ( !fFirstTime) {
  250. break;
  251. } else {
  252. }
  253. if (_aCharInfo1) {
  254. delete [] _aCharInfo1;
  255. _aCharInfo1 = NULL;
  256. }
  257. if (_aCharInfo3) {
  258. delete [] _aCharInfo3;
  259. _aCharInfo3 = NULL;
  260. }
  261. _aCharInfo1 = new WORD[cwc + 1];
  262. _aCharInfo3 = new WORD[cwc + 1];
  263. if (_aCharInfo1 && _aCharInfo3) {
  264. Tokenize( pTextSource, cwc, pWordSink, cwcProcd, _aCharInfo1, _aCharInfo3, dwBase);
  265. }
  266. // Win4Assert( cwcProcd <= cwc );
  267. pTextSource->iCur += cwcProcd;
  268. fFirstTime = FALSE;
  269. }
  270. } while(SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)));
  271. cwc = pTextSource->iEnd - pTextSource->iCur;
  272. // we know that the remaining text should be less than ccCompare
  273. // Win4Assert( cwc < CDefWordBreaker::ccCompare );
  274. if (0 != cwc) {
  275. if (_aCharInfo1) {
  276. delete [] _aCharInfo1;
  277. _aCharInfo1 = NULL;
  278. }
  279. if (_aCharInfo3) {
  280. delete [] _aCharInfo3;
  281. _aCharInfo3 = NULL;
  282. }
  283. _aCharInfo1 = new WORD[cwc + 1];
  284. _aCharInfo3 = new WORD[cwc + 1];
  285. if (_aCharInfo1 && _aCharInfo1) {
  286. Tokenize(pTextSource, cwc, pWordSink, cwcProcd, _aCharInfo1, _aCharInfo3, dwBase);
  287. }
  288. }
  289. if (_aCharInfo1) {
  290. delete [] _aCharInfo1;
  291. _aCharInfo1 = NULL;
  292. }
  293. if (_aCharInfo3) {
  294. delete [] _aCharInfo3;
  295. _aCharInfo3 = NULL;
  296. }
  297. return sc;
  298. } //BreakText
  299. //+---------------------------------------------------------------------------
  300. //
  301. // Member: CDefWordBreaker::Tokenize
  302. //
  303. // Synopsis: Tokenize the input buffer into words
  304. //
  305. // Arguments: [pTextSource] -- input text source
  306. // [cwc] -- # chars to process
  307. // [pWordSink] -- sink for words
  308. // [cwcProd] -- # chars actually processed returned here
  309. //
  310. // History: 10-Aug-95 SitaramR Created
  311. //
  312. //----------------------------------------------------------------------------
  313. void CDefWordBreaker::Tokenize( TEXT_SOURCE *pTextSource,
  314. ULONG cwc,
  315. IWordSink *pWordSink,
  316. ULONG& cwcProcd,
  317. PWORD _aCharInfo1,
  318. PWORD _aCharInfo3,
  319. DWORD dwBase)
  320. {
  321. const WCHAR* pwcChunk = NULL;
  322. WCHAR _awcBufZWS[MAX_Def_WordBrKr_Prcess_Len];
  323. pwcChunk = &pTextSource->awcBuffer[pTextSource->iCur];
  324. if (!ScanChunk(_aCharInfo1, _aCharInfo3, pwcChunk, cwc)) {
  325. return;
  326. }
  327. BOOL fWordHasZWS = FALSE; // Does the current word have a zero-width-space ?
  328. unsigned uLenZWS; // Length of a word minus embedded zero-width-spaces
  329. //
  330. // iBeginWord is the offset into _aCharInfo of the beginning character of
  331. // a word. iCur is the first *unprocessed* character.
  332. // They are indexes into the mapped chunk.
  333. //
  334. unsigned iBeginWord = 0;
  335. unsigned iCur = 0;
  336. //
  337. // Pump words from mapped chunk to word sink
  338. //
  339. while (iCur < cwc)
  340. {
  341. //
  342. // Skip whitespace, punctuation, etc.
  343. //
  344. for (; iCur < cwc; iCur++)
  345. if (IsWordChar (iCur, _aCharInfo1, _aCharInfo3, pwcChunk))
  346. break;
  347. // iCur points to a word char or is equal to _cMapped
  348. iBeginWord = iCur;
  349. if (iCur < cwc)
  350. iCur++; // we knew it pointed at word character
  351. //
  352. // Find word break. Filter may output Unicode zero-width-space, which
  353. // should be ignored by the wordbreaker.
  354. //
  355. fWordHasZWS = FALSE;
  356. for (; iCur < cwc; iCur++)
  357. {
  358. if (!IsWordChar(iCur, _aCharInfo1, _aCharInfo3, pwcChunk))
  359. {
  360. if (pwcChunk[iCur] == ZERO_WIDTH_SPACE )
  361. fWordHasZWS = TRUE;
  362. else
  363. break;
  364. }
  365. }
  366. if (fWordHasZWS)
  367. {
  368. //
  369. // Copy word into _awcBufZWS after stripping zero-width-spaces
  370. //
  371. uLenZWS = 0;
  372. for ( unsigned i=iBeginWord; i<iCur; i++ )
  373. {
  374. if (pwcChunk[i] != ZERO_WIDTH_SPACE )
  375. _awcBufZWS[uLenZWS++] = pwcChunk[i];
  376. }
  377. }
  378. // iCur points to a non-word char or is equal to _cMapped
  379. if (iCur < cwc)
  380. {
  381. // store the word and its source position
  382. if ( fWordHasZWS )
  383. pWordSink->PutWord( uLenZWS, _awcBufZWS, // stripped word
  384. iCur - iBeginWord, pTextSource->iCur + iBeginWord + dwBase);
  385. else
  386. pWordSink->PutWord( iCur - iBeginWord, pwcChunk + iBeginWord, // the word
  387. iCur - iBeginWord, pTextSource->iCur + iBeginWord + dwBase);
  388. iCur++; // we knew it pointed at non-word char
  389. iBeginWord = iCur; // in case we exit the loop now
  390. }
  391. } // next word
  392. // Win4Assert( iCur == _cMapped );
  393. // End of words in chunk.
  394. // iCur == _cMapped
  395. // iBeginWord points at beginning of word or == _cMapped
  396. if ( 0 == iBeginWord )
  397. {
  398. // A single word fills from beginning of this chunk
  399. // to the end. This is either a very long word or
  400. // a short word in a leftover buffer.
  401. // store the word and its source position
  402. if ( fWordHasZWS )
  403. pWordSink->PutWord( uLenZWS, _awcBufZWS, // stripped word
  404. iCur, pTextSource->iCur + dwBase); // its source pos.
  405. else
  406. pWordSink->PutWord( iCur, pwcChunk, // the word
  407. iCur, pTextSource->iCur + dwBase); // its source pos.
  408. //
  409. // Position it to not add the word twice.
  410. //
  411. iBeginWord = iCur;
  412. }
  413. //
  414. // If this is the last chunk from text source, then process the
  415. // last fragment
  416. //
  417. if ( cwc < CDefWordBreaker::ccCompare && iBeginWord != iCur )
  418. {
  419. // store the word and its source position
  420. if ( fWordHasZWS )
  421. pWordSink->PutWord( uLenZWS, _awcBufZWS, // stripped word
  422. iCur - iBeginWord, pTextSource->iCur + iBeginWord + dwBase);
  423. else
  424. pWordSink->PutWord( iCur - iBeginWord, pwcChunk + iBeginWord, // the word
  425. iCur - iBeginWord, pTextSource->iCur + iBeginWord + dwBase);
  426. iBeginWord = iCur;
  427. }
  428. cwcProcd = iBeginWord;
  429. }