Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

527 lines
16 KiB

  1. /*
  2. JUNKUTIL.CPP
  3. (c) copyright 1998 Microsoft Corp
  4. Shared utility functions
  5. Created by Robert Rounthwaite (RobertRo@microsoft.com)
  6. Modified by Brian Moore (brimo@microsoft.com)
  7. */
  8. #include <pch.hxx>
  9. #include "junkutil.h"
  10. #include <msoedbg.h>
  11. #define _WIN32_OE 0x0501
  12. #include <mimeole.h>
  13. WORD WGetStringTypeEx(LPCSTR pszText)
  14. {
  15. WORD wType = 0;
  16. if (NULL == pszText)
  17. {
  18. wType = 0;
  19. goto exit;
  20. }
  21. if (IsDBCSLeadByte(*pszText))
  22. SideAssert(GetStringTypeEx(LOCALE_USER_DEFAULT, CT_CTYPE1, pszText, 2, &wType));
  23. else
  24. SideAssert(GetStringTypeEx(LOCALE_USER_DEFAULT, CT_CTYPE1, pszText, 1, &wType));
  25. exit:
  26. return wType;
  27. }
  28. BOOL FMatchToken(BOOL fStart, BOOL fEnd, LPCSTR pszPrev, DWORD * pdwFlagsPrev, LPCSTR pszWord, ULONG cchWord, DWORD * pdwFlagsWord, LPCSTR pszEnd)
  29. {
  30. BOOL fRet = FALSE;
  31. DWORD dwFlagsEnd = 0;
  32. LPCSTR pszEndWord = NULL;
  33. // this code checks to see that the spot we found is a "word" and not a subword
  34. // we want the character before and after to be word break, unless the character on that end of the
  35. // string already is not a word break (or we're at the beginning of the string, for the char before)
  36. // front checking
  37. // f1: in either case we don't have to check the front if this is the first character; otherwise,
  38. // f2: either the first character of the string is alnum and the previous character is not (and is not an "internal" character)
  39. // f3: or the first character of the string isn't alnum, the previous character either is, or is a whitespace character
  40. // rear checking
  41. // r1: either we are at the end of the string
  42. // r2: or the last character is alpha and the following character is not alpha or number (and is not an "internal" character)
  43. // r3: or the last character is not alpha or num and the following character either is, or is a whitespace character
  44. // r4: or the last character is num and the test depends on the first character:
  45. // r5: if it was alphanum, then the following character is not alpha or number (and is not an "internal" character)
  46. // r6: or it wasn't alphanum, then the following character is alpha or is a whitespace character
  47. // Whew! This mimics the criteria used by GetNextFeature() in splitting up the string. The easiest way to understand this criteria
  48. // is to examine that function
  49. if ((FALSE != fStart) || // f1
  50. ((FALSE != FDoWordMatchStart(pszWord, pdwFlagsWord, CT_START_ALPHANUM)) &&
  51. (FALSE == FDoWordMatchStart(pszPrev, pdwFlagsPrev, CT_START_ALPHANUM)) &&
  52. (FALSE == FIsInternalChar(*pszPrev))) || // f2
  53. ((FALSE == FDoWordMatchStart(pszWord, pdwFlagsWord, CT_START_ALPHANUM)) &&
  54. (FALSE != FDoWordMatchStart(pszPrev, pdwFlagsPrev, CT_START_ALPHANUMSPACE)))) // f3
  55. {
  56. // Make it a little more readable
  57. pszEndWord = pszWord + cchWord - 1;
  58. if ((FALSE != fEnd) || // r1
  59. ((FALSE != FDoWordMatchEnd(pszEndWord, pdwFlagsWord, CT_END_ALPHA)) &&
  60. (FALSE == FDoWordMatchEnd(pszEnd, &dwFlagsEnd, CT_END_ALPHANUM)) &&
  61. (FALSE == FIsInternalChar(*pszEnd))) || // r2
  62. ((FALSE == FDoWordMatchEnd(pszEndWord, pdwFlagsWord, CT_END_ALPHANUM)) &&
  63. (FALSE != FDoWordMatchEnd(pszEnd, &dwFlagsEnd, CT_END_ALPHASPACE))) || // r3
  64. ((FALSE != FDoWordMatchEnd(pszEndWord, pdwFlagsWord, CT_END_NUM)) && // r4
  65. (((FALSE != FDoWordMatchStart(pszWord, pdwFlagsWord, CT_START_ALPHANUM)) &&
  66. (FALSE == FDoWordMatchEnd(pszEnd, &dwFlagsEnd, CT_END_ALPHANUM)) &&
  67. (FALSE == FIsInternalChar(*pszEnd))) || // r5
  68. ((FALSE == FDoWordMatchStart(pszWord, pdwFlagsWord, CT_START_ALPHANUM)) &&
  69. (FALSE != FDoWordMatchEnd(pszEnd, &dwFlagsEnd, CT_END_ALPHANUMSPACE)))))) // r6
  70. {
  71. // Good match
  72. fRet = TRUE;
  73. }
  74. }
  75. return fRet;
  76. }
  77. /////////////////////////////////////////////////////////////////////////////
  78. // FWordPresent
  79. //
  80. // Determines if the given "word" is present in the Text. A word in this
  81. // case is any string of characters with a non-alpha character on either
  82. // side (or with the beginning or end of the text on either side).
  83. // Case sensitive.
  84. /////////////////////////////////////////////////////////////////////////////
  85. BOOL FWordPresent(LPSTR pszText, DWORD * pdwFlags, LPSTR pszWord, ULONG cchWord, LPSTR * ppszMatch)
  86. {
  87. BOOL fRet = FALSE;
  88. LPSTR pszLoc = NULL;
  89. DWORD dwFlagsPrev = 0;
  90. // If there's nothing to do then just exit
  91. if ((NULL == pszText) || ('\0' == pszText[0]) || (NULL == pszWord) || (NULL == pdwFlags) || (0 == cchWord))
  92. {
  93. fRet = FALSE;
  94. goto exit;
  95. }
  96. // How big is the text
  97. for (pszLoc = pszText; NULL != (pszLoc = StrStr(pszLoc, pszWord)); pszLoc = CharNext(pszLoc))
  98. {
  99. if (FALSE != FMatchToken((pszLoc == pszText), ('\0' == pszLoc[cchWord]),
  100. (pszLoc != pszText) ? CharPrev(pszText, pszLoc) : NULL,
  101. &dwFlagsPrev, pszWord, cchWord, pdwFlags, pszLoc + cchWord))
  102. {
  103. // Good match
  104. if (NULL != ppszMatch)
  105. {
  106. *ppszMatch = pszLoc;
  107. }
  108. fRet = TRUE;
  109. goto exit;
  110. }
  111. // Don't cache these flags...
  112. dwFlagsPrev = 0;
  113. }
  114. exit:
  115. return fRet;
  116. }
  117. /////////////////////////////////////////////////////////////////////////////
  118. // Special feature implementations
  119. //
  120. /////////////////////////////////////////////////////////////////////////////
  121. // This feature is 25% of first 50 words contain no lowercase letters (includes words with no letters at all)
  122. // p20_BODY_INTRO_UPPERCASE_WORDS
  123. const UINT g_cWordsMax = 50;
  124. const DOUBLE g_cNonLowerWordsThreshold = 0.25;
  125. BOOL FSpecialFeatureUpperCaseWords(LPCSTR pszText)
  126. {
  127. BOOL fRet = FALSE;
  128. UINT cWords = 0;
  129. UINT cNonLowerWords = 0;
  130. BOOL fHasLowerLetter = FALSE;
  131. LPCSTR pszPos = NULL;
  132. WORD wType = 0;
  133. if (NULL == pszText)
  134. {
  135. fRet = FALSE;
  136. goto exit;
  137. }
  138. // Skip over the leading spaces
  139. pszPos = PszSkipWhiteSpace(pszText);
  140. if ('\0' == *pszPos)
  141. {
  142. fRet = FALSE;
  143. goto exit;
  144. }
  145. while (cWords < g_cWordsMax)
  146. {
  147. // Are we at the end of a word?
  148. wType = WGetStringTypeEx(pszPos);
  149. if ((0 != (wType & C1_SPACE)) || ('\0' == *pszPos))
  150. {
  151. // We found a word
  152. cWords++;
  153. // Did we have any lower case letters in the word
  154. if (FALSE == fHasLowerLetter)
  155. {
  156. cNonLowerWords++;
  157. }
  158. else
  159. {
  160. fHasLowerLetter = FALSE;
  161. }
  162. // Skip over the trailing spaces
  163. pszPos = PszSkipWhiteSpace(pszPos);
  164. // Are we done with the string?
  165. if ('\0' == *pszPos)
  166. {
  167. break;
  168. }
  169. }
  170. else
  171. {
  172. fHasLowerLetter |= (0 != (wType & C1_LOWER));
  173. // Move to the next character
  174. pszPos = CharNext(pszPos);
  175. }
  176. }
  177. // Set the return value
  178. fRet = ((cWords > 0) && ((cNonLowerWords / (double)cWords) >= g_cNonLowerWordsThreshold));
  179. exit:
  180. return fRet;
  181. }
  182. BOOL FSpecialFeatureUpperCaseWordsStm(IStream * pIStm)
  183. {
  184. BOOL fRet = FALSE;
  185. TCHAR rgchBuff[4096 + 1];
  186. ULONG chRead = 0;
  187. LARGE_INTEGER liZero = {0};
  188. if (NULL == pIStm)
  189. {
  190. fRet = FALSE;
  191. goto exit;
  192. }
  193. // Seek to the start of the stream
  194. if (FAILED(pIStm->Seek(liZero, STREAM_SEEK_SET, NULL)))
  195. {
  196. fRet = FALSE;
  197. goto exit;
  198. }
  199. // Fill up the buffer
  200. if (FAILED(pIStm->Read(rgchBuff, 4096, &chRead)))
  201. {
  202. fRet = FALSE;
  203. goto exit;
  204. }
  205. // Make sure the buffer is zero terminated
  206. rgchBuff[chRead] = '\0';
  207. fRet = FSpecialFeatureUpperCaseWords(rgchBuff);
  208. exit:
  209. return fRet;
  210. }
  211. // This feature is: 8% of first 200 non-space and non-numeric characters aren't letters
  212. // p20_BODY_INTRO_NONALPHA
  213. const UINT g_cchTextMax = 200;
  214. const DOUBLE g_cNonSpaceNumThreshold = 0.08;
  215. BOOL FSpecialFeatureNonAlpha(LPCSTR pszText)
  216. {
  217. BOOL fRet = FALSE;
  218. UINT cchText = 0;
  219. UINT cchNonAlpha = 0;
  220. LPCSTR pszPos = NULL;
  221. WORD wType = 0;
  222. if (NULL == pszText)
  223. {
  224. fRet = FALSE;
  225. goto exit;
  226. }
  227. // Skip over the leading spaces
  228. pszPos = PszSkipWhiteSpace(pszText);
  229. for (; '\0' != *pszPos; pszPos = CharNext(pszPos))
  230. {
  231. wType = WGetStringTypeEx(pszPos);
  232. // Are we not a space or a digit?
  233. if ((0 == (wType & C1_SPACE)) && (0 == (wType & C1_DIGIT)))
  234. {
  235. cchText++;
  236. if (0 == (wType & C1_ALPHA))
  237. {
  238. cchNonAlpha++;
  239. }
  240. // Have we checked enough characters?
  241. if (cchText >= g_cchTextMax)
  242. {
  243. break;
  244. }
  245. }
  246. }
  247. // Set the return value
  248. fRet = (cchText > 0) && ((cchNonAlpha / (double)cchText) >= g_cNonSpaceNumThreshold);
  249. exit:
  250. return fRet;
  251. }
  252. BOOL FSpecialFeatureNonAlphaStm(IStream * pIStm)
  253. {
  254. BOOL fRet = FALSE;
  255. TCHAR rgchBuff[1024 + 1];
  256. ULONG chRead = 0;
  257. LARGE_INTEGER liZero = {0};
  258. if (NULL == pIStm)
  259. {
  260. fRet = FALSE;
  261. goto exit;
  262. }
  263. // Seek to the start of the stream
  264. if (FAILED(pIStm->Seek(liZero, STREAM_SEEK_SET, NULL)))
  265. {
  266. fRet = FALSE;
  267. goto exit;
  268. }
  269. // Fill up the buffer
  270. if (FAILED(pIStm->Read(rgchBuff, 1024, &chRead)))
  271. {
  272. fRet = FALSE;
  273. goto exit;
  274. }
  275. // Make sure the buffer is zero terminated
  276. rgchBuff[chRead] = '\0';
  277. fRet = FSpecialFeatureNonAlpha(rgchBuff);
  278. exit:
  279. return fRet;
  280. }
  281. // --------------------------------------------------------------------------------
  282. // FStreamStringSearch
  283. // --------------------------------------------------------------------------------
  284. #define CB_STREAMMATCH 0x00000FFF
  285. BOOL FStreamStringSearch(LPSTREAM pstm, DWORD * pdwFlagsSearch, LPSTR pszSearch, ULONG cchSearch, DWORD dwFlags)
  286. {
  287. BOOL fRet = FALSE;
  288. ULONG cbSave = 0;
  289. CHAR rgchBuff[CB_STREAMMATCH + 1];
  290. LPSTR pszRead = NULL;
  291. ULONG cbRead = 0;
  292. ULONG cbIn = 0;
  293. ULONG cchGood = NULL;
  294. CHAR chSave = '\0';
  295. LONG cbSize = 0;
  296. LPSTR pszMatch = NULL;
  297. ULONG cbWalk = 0;
  298. // Check incoming params
  299. if ((NULL == pstm) || (NULL == pszSearch) || (0 == cchSearch))
  300. {
  301. goto exit;
  302. }
  303. // We want to save off the lead char and
  304. // a possible ending lead byte...
  305. cbSave = cchSearch + 2;
  306. if (cbSave > ARRAYSIZE(rgchBuff))
  307. {
  308. // we've got a problem - this can cause a buffer overflow later on
  309. Assert(0);
  310. goto exit;
  311. }
  312. // Get the stream size
  313. if (FAILED(HrGetStreamSize(pstm, (ULONG *) &cbSize)))
  314. {
  315. goto exit;
  316. }
  317. // Reset the stream to the beginning
  318. if (FAILED(HrRewindStream(pstm)))
  319. {
  320. goto exit;
  321. }
  322. // Set up the defaults
  323. pszRead = rgchBuff;
  324. cbRead = CB_STREAMMATCH;
  325. // Search for string through the entire stream
  326. while ((cbSize > 0) && (S_OK == pstm->Read(pszRead, cbRead, &cbIn)))
  327. {
  328. // We're done if we read nothing...
  329. if (0 == cbIn)
  330. {
  331. goto exit;
  332. }
  333. // Note that we've read the bytes
  334. cbSize -= cbIn;
  335. // Zero terminate the buffer
  336. pszRead[cbIn] = '\0';
  337. // Should we convert the buffer to upper case
  338. if (0 == (dwFlags & SSF_CASESENSITIVE))
  339. {
  340. cchGood = CharUpperBuff(rgchBuff, (ULONG)(cbIn + pszRead - rgchBuff));
  341. }
  342. else
  343. {
  344. // We need to spin over the buffer figuring out if the end character is a lead
  345. // byte without a corresponding tail byte
  346. cbWalk = (ULONG) (cbIn + pszRead - rgchBuff);
  347. for (cchGood = 0; cchGood < cbWalk; cchGood++)
  348. {
  349. if (IsDBCSLeadByte(rgchBuff[cchGood]))
  350. {
  351. if ((cchGood + 1) >= cbWalk)
  352. {
  353. break;
  354. }
  355. cchGood++;
  356. }
  357. }
  358. }
  359. chSave = rgchBuff[cchGood];
  360. rgchBuff[cchGood] = '\0';
  361. // Search for string
  362. if (FALSE != FWordPresent(rgchBuff, pdwFlagsSearch, pszSearch, cchSearch, &pszMatch))
  363. {
  364. // If we aren't at the end of the stream and we can't
  365. // tell if we are at a word break
  366. if ((0 >= cbSize) || ((pszMatch + cchSearch) != (pszRead + cchGood)))
  367. {
  368. fRet = TRUE;
  369. break;
  370. }
  371. }
  372. // Are we done with the stream
  373. if (0 >= cbSize)
  374. {
  375. break;
  376. }
  377. rgchBuff[cchGood] = chSave;
  378. // Save part of the buffer
  379. // How much space do we have in the buffer
  380. cbRead = CB_STREAMMATCH - cbSave;
  381. // Save the characters
  382. MoveMemory(rgchBuff, rgchBuff + cbRead, cbSave);
  383. // Figure out the new start of the buffer
  384. pszRead = rgchBuff + cbSave;
  385. }
  386. exit:
  387. return(fRet);
  388. }
  389. HRESULT HrConvertHTMLToPlainText(IStream * pIStmHtml, IStream ** ppIStmText)
  390. {
  391. HRESULT hr = S_OK;
  392. IDataObject * pIDataObj = NULL;
  393. FORMATETC fetc = {0};
  394. STGMEDIUM stgmed = {0};
  395. // Check incoming params
  396. if ((NULL == pIStmHtml) || (NULL == ppIStmText))
  397. {
  398. hr = E_INVALIDARG;
  399. goto exit;
  400. }
  401. // Initialize the outgoing param
  402. *ppIStmText = NULL;
  403. hr = MimeEditDocumentFromStream(pIStmHtml, IID_IDataObject, (VOID **)&pIDataObj);
  404. if (FAILED(hr))
  405. {
  406. goto exit;
  407. }
  408. // Set up the format
  409. fetc.cfFormat = CF_TEXT;
  410. fetc.dwAspect = DVASPECT_CONTENT;
  411. fetc.lindex = -1;
  412. fetc.tymed = TYMED_ISTREAM;
  413. // Get the data
  414. hr = pIDataObj->GetData(&fetc, &stgmed);
  415. if (FAILED(hr))
  416. {
  417. goto exit;
  418. }
  419. if (NULL == stgmed.pstm)
  420. {
  421. hr = E_FAIL;
  422. goto exit;
  423. }
  424. // Save the item
  425. *ppIStmText = stgmed.pstm;
  426. (*ppIStmText)->AddRef();
  427. // addref the pUnk as it will be release in releasestgmed
  428. if(NULL != stgmed.pUnkForRelease)
  429. {
  430. (stgmed.pUnkForRelease)->AddRef();
  431. }
  432. hr = S_OK;
  433. exit:
  434. ReleaseStgMedium(&stgmed);
  435. ReleaseObj(pIDataObj);
  436. return hr;
  437. }