Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

499 lines
16 KiB

  1. #include <windows.h>
  2. #include "IWBrKr.h"
  3. #include "DefBrKr.h"
  4. #define ZERO_WIDTH_SPACE 0x200B
  5. #define MAX_Def_WordBrKr_Prcess_Len 1000
  6. BOOL IsWinNT(void)
  7. {
  8. OSVERSIONINFOA osVersionInfo;
  9. BOOL fRet = FALSE;
  10. osVersionInfo.dwOSVersionInfoSize = sizeof(osVersionInfo);
  11. GetVersionExA(&osVersionInfo);
  12. if (osVersionInfo.dwPlatformId == VER_PLATFORM_WIN32_NT) {
  13. fRet = TRUE;
  14. }
  15. return fRet;
  16. }
  17. BOOL MyGetStringTypeEx(
  18. LCID LocalID,
  19. DWORD dwInfoType,
  20. const WCHAR *lpSrcStr, // unicode base
  21. INT cchSrc,
  22. LPWORD lpCharType)
  23. {
  24. BOOL fRet = FALSE;
  25. if (IsWinNT()) {
  26. fRet = GetStringTypeW(dwInfoType, lpSrcStr, cchSrc,lpCharType);
  27. } else {
  28. DWORD dwANSISize = 0;
  29. dwANSISize = WideCharToMultiByte(GetACP(), WC_COMPOSITECHECK, lpSrcStr, cchSrc,
  30. NULL, 0, NULL, NULL);
  31. if (dwANSISize) {
  32. LPSTR lpAnsiStr = NULL;
  33. lpAnsiStr = new CHAR[dwANSISize];
  34. if (lpAnsiStr) {
  35. dwANSISize = WideCharToMultiByte(GetACP(), WC_COMPOSITECHECK, lpSrcStr, cchSrc,
  36. lpAnsiStr, dwANSISize, NULL, NULL);
  37. fRet = GetStringTypeExA(LocalID, dwInfoType, lpAnsiStr, dwANSISize, lpCharType);
  38. if (ERROR_INVALID_PARAMETER == GetLastError() && (CT_CTYPE1 == dwInfoType || CT_CTYPE3 == dwInfoType)) {
  39. for (INT i = 0; i < cchSrc; ++i) {
  40. switch (dwInfoType) {
  41. case CT_CTYPE1:
  42. lpCharType[i] = C1_ALPHA;
  43. break;
  44. case CT_CTYPE3:
  45. lpCharType[i] = (C3_NONSPACING | C3_ALPHA);
  46. break;
  47. }
  48. }
  49. fRet = TRUE;
  50. }
  51. delete [] lpAnsiStr;
  52. lpAnsiStr = NULL;
  53. }
  54. }
  55. }
  56. return fRet;
  57. }
  58. CDefWordBreaker::CDefWordBreaker()
  59. {
  60. ccCompare = MAX_Def_WordBrKr_Prcess_Len;
  61. }
  62. //+-------------------------------------------------------------------------
  63. //
  64. // Method: CDefWordBreaker::IsWordChar
  65. //
  66. // Synopsis: Find whether the i'th character in the buffer _awString
  67. // is a word character (rather than word break)
  68. //
  69. // Arguments: [i] -- index into _awString
  70. //
  71. // History: 22-Jul-1994 BartoszM Created
  72. //
  73. //--------------------------------------------------------------------------
  74. inline BOOL CDefWordBreaker::IsWordChar(
  75. int i,
  76. PWORD _aCharInfo1,
  77. PWORD _aCharInfo3,
  78. const WCHAR* pwcChunk) const
  79. {
  80. if ( (_aCharInfo1[i] & (C1_ALPHA | C1_DIGIT))
  81. || (_aCharInfo3[i] & C3_NONSPACING) )
  82. {
  83. return TRUE;
  84. }
  85. WCHAR c = pwcChunk[i];
  86. if (c == L'_')
  87. return TRUE;
  88. if (c == 0xa0) // non breaking space
  89. {
  90. // followed by a non-spacing character
  91. // (looking ahead is okay)
  92. if (_aCharInfo3[i+1] & C3_NONSPACING)
  93. return TRUE;
  94. }
  95. return FALSE;
  96. }
  97. //+---------------------------------------------------------------------------
  98. //
  99. // Member: CDefWordBreaker::ScanChunk
  100. //
  101. // Synopsis: For each character find its type
  102. //
  103. //
  104. // History: 16-Aug-94 BartoszM Created
  105. //
  106. //----------------------------------------------------------------------------
  107. BOOL CDefWordBreaker::ScanChunk(
  108. PWORD _aCharInfo1,
  109. PWORD _aCharInfo3,
  110. const WCHAR *pwcChunk,
  111. ULONG ucwc)
  112. {
  113. BOOL fRet = FALSE;
  114. // POSIX character typing, Source, Size of source, Character info
  115. if (!MyGetStringTypeEx(GetSystemDefaultLCID(), CT_CTYPE1, pwcChunk, ucwc, _aCharInfo1)) {
  116. // Additional POSIX, Source, Size of source, Character info 3
  117. } else if (!MyGetStringTypeEx(GetSystemDefaultLCID(), CT_CTYPE3, pwcChunk, ucwc, _aCharInfo3)) { //
  118. } else {
  119. fRet = TRUE;
  120. }
  121. return fRet;
  122. }
  123. /*
  124. BOOL CDefWordBreaker::ScanChunk(
  125. PWORD _aCharInfo1,
  126. PWORD _aCharInfo3,
  127. const WCHAR *pwcChunk,
  128. ULONG ucwc)
  129. {
  130. //
  131. // GetStringTypeW is returning error 87 (ERROR_INVALID_PARAMETER) if
  132. // we pass in a null string.
  133. //
  134. // Win4Assert( (0 != _cMapped) && (0 != _pwcChunk) );
  135. if (IsWinNT())
  136. {
  137. if (!MyGetStringTypeEx(0, // Dummy
  138. CT_CTYPE1, // POSIX character typing
  139. pwcChunk, // Source
  140. ucwc, // Size of source
  141. _aCharInfo1 ) ) // Character info
  142. {
  143. return FALSE;
  144. }
  145. if ( !MyGetStringTypeEx(0, // Dummy
  146. CT_CTYPE3, // Additional POSIX
  147. pwcChunk, // Source
  148. ucwc, // Size of source
  149. _aCharInfo3 ) ) // Character info 3
  150. {
  151. return FALSE;
  152. }
  153. }
  154. else
  155. {
  156. //
  157. // BUGBUG: This is all wrong -- we don't know if this is the right
  158. // locale to use and there isn't a way to know at this point.
  159. //
  160. if (!MyGetStringTypeEx( GetSystemDefaultLCID(),
  161. CT_CTYPE1, // POSIX character typing
  162. pwcChunk, // Source
  163. ucwc, // Size of source
  164. _aCharInfo1 ) ) // Character info
  165. {
  166. // ciDebugOut(( DEB_ERROR, "GetStringTypeW returned %d\n",
  167. // GetLastError() ));
  168. // Win9x just stinks. No 2 ways about it.
  169. if ( ERROR_INVALID_PARAMETER == GetLastError() )
  170. {
  171. for ( unsigned i = 0; i < ucwc; i++ )
  172. _aCharInfo1[i] = C1_ALPHA;
  173. return TRUE;
  174. }
  175. return FALSE;
  176. }
  177. if ( !MyGetStringTypeEx(GetSystemDefaultLCID(),
  178. CT_CTYPE3, // Additional POSIX
  179. pwcChunk, // Source
  180. ucwc, // Size of source
  181. _aCharInfo3 ) ) // Character info 3
  182. {
  183. // ciDebugOut(( DEB_ERROR, "GetStringTypeW CTYPE3 returned %d\n",
  184. // GetLastError() ));
  185. // Win9x just stinks. No 2 ways about it.
  186. if ( ERROR_INVALID_PARAMETER == GetLastError() )
  187. {
  188. for ( unsigned i = 0; i < ucwc; i++ )
  189. _aCharInfo3[i] = ( C3_NONSPACING | C3_ALPHA );
  190. return TRUE;
  191. }
  192. return FALSE;
  193. }
  194. }
  195. return TRUE;
  196. } //ScanChunk
  197. */
  198. //+---------------------------------------------------------------------------
  199. //
  200. // Member: CDefWordBreaker::BreakText
  201. //
  202. // Synopsis: Break input stream into words.
  203. //
  204. // Arguments: [pTextSource] - source of input buffers
  205. // [pWordSink] - sink for words
  206. // [pPhraseSink] - sink for noun phrases
  207. //
  208. // History: 07-June-91 t-WadeR Created
  209. // 12-Oct-92 AmyA Added Unicode support
  210. // 18-Nov-92 AmyA Overloaded
  211. // 11-Apr-94 KyleP Sync with spec
  212. // 26-Aug-94 BartoszM Fixed Unicode parsing
  213. //
  214. //----------------------------------------------------------------------------
  215. SCODE CDefWordBreaker::BreakText(
  216. TEXT_SOURCE *pTextSource,
  217. IWordSink *pWordSink,
  218. IPhraseSink *pPhraseSink,
  219. DWORD dwBase)
  220. {
  221. LPWORD _aCharInfo1 = NULL;
  222. LPWORD _aCharInfo3 = NULL;
  223. if ( 0 == pTextSource )
  224. return E_INVALIDARG;
  225. if ( 0 == pWordSink || pTextSource->iCur == pTextSource->iEnd)
  226. return S_OK;
  227. if (pTextSource->iCur > pTextSource->iEnd)
  228. {
  229. // Win4Assert ( !"BreakText called with bad TEXT_SOURCE" );
  230. return E_FAIL;
  231. }
  232. SCODE sc = S_OK;
  233. ULONG cwc, cwcProcd; // cwcProcd is # chars actually processed by Tokenize()
  234. cwc = 0;
  235. cwcProcd = 0;
  236. do {
  237. //
  238. // Flag for first time thru loop below. This is to fix the case
  239. // where the length of the buffer passed in is less than
  240. // MAX_II_BUFFER_LEN. In this case iEnd-iCur is <= MAX_II_BUFFER_LEN
  241. // and we break out the inner loop and call
  242. // pfnFillTextBuffer without having processed any characters,
  243. // and so pfnFillTextBuffer returns TRUE without adding any new
  244. // characters and this results in an infinite loop.
  245. BOOL fFirstTime = TRUE;
  246. while (pTextSource->iCur < pTextSource->iEnd) {
  247. cwc = pTextSource->iEnd - pTextSource->iCur;
  248. // Process in buckets of MAX_II_BUFER_LEN only
  249. if (cwc >= CDefWordBreaker::ccCompare) {
  250. cwc = CDefWordBreaker::ccCompare;
  251. } else if ( !fFirstTime) {
  252. break;
  253. } else {
  254. }
  255. if (_aCharInfo1) {
  256. delete [] _aCharInfo1;
  257. _aCharInfo1 = NULL;
  258. }
  259. if (_aCharInfo3) {
  260. delete [] _aCharInfo3;
  261. _aCharInfo3 = NULL;
  262. }
  263. _aCharInfo1 = new WORD[cwc + 1];
  264. _aCharInfo3 = new WORD[cwc + 1];
  265. if (_aCharInfo1 && _aCharInfo3) {
  266. Tokenize( pTextSource, cwc, pWordSink, cwcProcd, _aCharInfo1, _aCharInfo3, dwBase);
  267. }
  268. // Win4Assert( cwcProcd <= cwc );
  269. pTextSource->iCur += cwcProcd;
  270. fFirstTime = FALSE;
  271. }
  272. } while(SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)));
  273. cwc = pTextSource->iEnd - pTextSource->iCur;
  274. // we know that the remaining text should be less than ccCompare
  275. // Win4Assert( cwc < CDefWordBreaker::ccCompare );
  276. if (0 != cwc) {
  277. if (_aCharInfo1) {
  278. delete [] _aCharInfo1;
  279. _aCharInfo1 = NULL;
  280. }
  281. if (_aCharInfo3) {
  282. delete [] _aCharInfo3;
  283. _aCharInfo3 = NULL;
  284. }
  285. _aCharInfo1 = new WORD[cwc + 1];
  286. _aCharInfo3 = new WORD[cwc + 1];
  287. if (_aCharInfo1 && _aCharInfo1) {
  288. Tokenize(pTextSource, cwc, pWordSink, cwcProcd, _aCharInfo1, _aCharInfo3, dwBase);
  289. }
  290. }
  291. if (_aCharInfo1) {
  292. delete [] _aCharInfo1;
  293. _aCharInfo1 = NULL;
  294. }
  295. if (_aCharInfo3) {
  296. delete [] _aCharInfo3;
  297. _aCharInfo3 = NULL;
  298. }
  299. return sc;
  300. } //BreakText
  301. //+---------------------------------------------------------------------------
  302. //
  303. // Member: CDefWordBreaker::Tokenize
  304. //
  305. // Synopsis: Tokenize the input buffer into words
  306. //
  307. // Arguments: [pTextSource] -- input text source
  308. // [cwc] -- # chars to process
  309. // [pWordSink] -- sink for words
  310. // [cwcProd] -- # chars actually processed returned here
  311. //
  312. // History: 10-Aug-95 SitaramR Created
  313. //
  314. //----------------------------------------------------------------------------
  315. void CDefWordBreaker::Tokenize( TEXT_SOURCE *pTextSource,
  316. ULONG cwc,
  317. IWordSink *pWordSink,
  318. ULONG& cwcProcd,
  319. PWORD _aCharInfo1,
  320. PWORD _aCharInfo3,
  321. DWORD dwBase)
  322. {
  323. const WCHAR* pwcChunk = NULL;
  324. WCHAR _awcBufZWS[MAX_Def_WordBrKr_Prcess_Len];
  325. pwcChunk = &pTextSource->awcBuffer[pTextSource->iCur];
  326. if (!ScanChunk(_aCharInfo1, _aCharInfo3, pwcChunk, cwc)) {
  327. return;
  328. }
  329. BOOL fWordHasZWS = FALSE; // Does the current word have a zero-width-space ?
  330. unsigned uLenZWS; // Length of a word minus embedded zero-width-spaces
  331. //
  332. // iBeginWord is the offset into _aCharInfo of the beginning character of
  333. // a word. iCur is the first *unprocessed* character.
  334. // They are indexes into the mapped chunk.
  335. //
  336. unsigned iBeginWord = 0;
  337. unsigned iCur = 0;
  338. //
  339. // Pump words from mapped chunk to word sink
  340. //
  341. while (iCur < cwc)
  342. {
  343. //
  344. // Skip whitespace, punctuation, etc.
  345. //
  346. for (; iCur < cwc; iCur++)
  347. if (IsWordChar (iCur, _aCharInfo1, _aCharInfo3, pwcChunk))
  348. break;
  349. // iCur points to a word char or is equal to _cMapped
  350. iBeginWord = iCur;
  351. if (iCur < cwc)
  352. iCur++; // we knew it pointed at word character
  353. //
  354. // Find word break. Filter may output Unicode zero-width-space, which
  355. // should be ignored by the wordbreaker.
  356. //
  357. fWordHasZWS = FALSE;
  358. for (; iCur < cwc; iCur++)
  359. {
  360. if (!IsWordChar(iCur, _aCharInfo1, _aCharInfo3, pwcChunk))
  361. {
  362. if (pwcChunk[iCur] == ZERO_WIDTH_SPACE )
  363. fWordHasZWS = TRUE;
  364. else
  365. break;
  366. }
  367. }
  368. if (fWordHasZWS)
  369. {
  370. //
  371. // Copy word into _awcBufZWS after stripping zero-width-spaces
  372. //
  373. uLenZWS = 0;
  374. for ( unsigned i=iBeginWord; i<iCur; i++ )
  375. {
  376. if (pwcChunk[i] != ZERO_WIDTH_SPACE )
  377. _awcBufZWS[uLenZWS++] = pwcChunk[i];
  378. }
  379. }
  380. // iCur points to a non-word char or is equal to _cMapped
  381. if (iCur < cwc)
  382. {
  383. // store the word and its source position
  384. if ( fWordHasZWS )
  385. pWordSink->PutWord( uLenZWS, _awcBufZWS, // stripped word
  386. iCur - iBeginWord, pTextSource->iCur + iBeginWord + dwBase);
  387. else
  388. pWordSink->PutWord( iCur - iBeginWord, pwcChunk + iBeginWord, // the word
  389. iCur - iBeginWord, pTextSource->iCur + iBeginWord + dwBase);
  390. iCur++; // we knew it pointed at non-word char
  391. iBeginWord = iCur; // in case we exit the loop now
  392. }
  393. } // next word
  394. // Win4Assert( iCur == _cMapped );
  395. // End of words in chunk.
  396. // iCur == _cMapped
  397. // iBeginWord points at beginning of word or == _cMapped
  398. if ( 0 == iBeginWord )
  399. {
  400. // A single word fills from beginning of this chunk
  401. // to the end. This is either a very long word or
  402. // a short word in a leftover buffer.
  403. // store the word and its source position
  404. if ( fWordHasZWS )
  405. pWordSink->PutWord( uLenZWS, _awcBufZWS, // stripped word
  406. iCur, pTextSource->iCur + dwBase); // its source pos.
  407. else
  408. pWordSink->PutWord( iCur, pwcChunk, // the word
  409. iCur, pTextSource->iCur + dwBase); // its source pos.
  410. //
  411. // Position it to not add the word twice.
  412. //
  413. iBeginWord = iCur;
  414. }
  415. //
  416. // If this is the last chunk from text source, then process the
  417. // last fragment
  418. //
  419. if ( cwc < CDefWordBreaker::ccCompare && iBeginWord != iCur )
  420. {
  421. // store the word and its source position
  422. if ( fWordHasZWS )
  423. pWordSink->PutWord( uLenZWS, _awcBufZWS, // stripped word
  424. iCur - iBeginWord, pTextSource->iCur + iBeginWord + dwBase);
  425. else
  426. pWordSink->PutWord( iCur - iBeginWord, pwcChunk + iBeginWord, // the word
  427. iCur - iBeginWord, pTextSource->iCur + iBeginWord + dwBase);
  428. iBeginWord = iCur;
  429. }
  430. cwcProcd = iBeginWord;
  431. }