/* JUNKUTIL.CPP (c) copyright 1998 Microsoft Corp Shared utility functions Created by Robert Rounthwaite (RobertRo@microsoft.com) Modified by Brian Moore (brimo@microsoft.com) */ #include #include "junkutil.h" #include #define _WIN32_OE 0x0501 #include WORD WGetStringTypeEx(LPCSTR pszText) { WORD wType = 0; if (NULL == pszText) { wType = 0; goto exit; } if (IsDBCSLeadByte(*pszText)) SideAssert(GetStringTypeEx(LOCALE_USER_DEFAULT, CT_CTYPE1, pszText, 2, &wType)); else SideAssert(GetStringTypeEx(LOCALE_USER_DEFAULT, CT_CTYPE1, pszText, 1, &wType)); exit: return wType; } BOOL FMatchToken(BOOL fStart, BOOL fEnd, LPCSTR pszPrev, DWORD * pdwFlagsPrev, LPCSTR pszWord, ULONG cchWord, DWORD * pdwFlagsWord, LPCSTR pszEnd) { BOOL fRet = FALSE; DWORD dwFlagsEnd = 0; LPCSTR pszEndWord = NULL; // this code checks to see that the spot we found is a "word" and not a subword // we want the character before and after to be word break, unless the character on that end of the // string already is not a word break (or we're at the beginning of the string, for the char before) // front checking // f1: in either case we don't have to check the front if this is the first character; otherwise, // f2: either the first character of the string is alnum and the previous character is not (and is not an "internal" character) // f3: or the first character of the string isn't alnum, the previous character either is, or is a whitespace character // rear checking // r1: either we are at the end of the string // r2: or the last character is alpha and the following character is not alpha or number (and is not an "internal" character) // r3: or the last character is not alpha or num and the following character either is, or is a whitespace character // r4: or the last character is num and the test depends on the first character: // r5: if it was alphanum, then the following character is not alpha or number (and is not an "internal" character) // r6: or it wasn't alphanum, then the following character is alpha or is a whitespace character // Whew! This mimics the criteria used by GetNextFeature() in splitting up the string. The easiest way to understand this criteria // is to examine that function if ((FALSE != fStart) || // f1 ((FALSE != FDoWordMatchStart(pszWord, pdwFlagsWord, CT_START_ALPHANUM)) && (FALSE == FDoWordMatchStart(pszPrev, pdwFlagsPrev, CT_START_ALPHANUM)) && (FALSE == FIsInternalChar(*pszPrev))) || // f2 ((FALSE == FDoWordMatchStart(pszWord, pdwFlagsWord, CT_START_ALPHANUM)) && (FALSE != FDoWordMatchStart(pszPrev, pdwFlagsPrev, CT_START_ALPHANUMSPACE)))) // f3 { // Make it a little more readable pszEndWord = pszWord + cchWord - 1; if ((FALSE != fEnd) || // r1 ((FALSE != FDoWordMatchEnd(pszEndWord, pdwFlagsWord, CT_END_ALPHA)) && (FALSE == FDoWordMatchEnd(pszEnd, &dwFlagsEnd, CT_END_ALPHANUM)) && (FALSE == FIsInternalChar(*pszEnd))) || // r2 ((FALSE == FDoWordMatchEnd(pszEndWord, pdwFlagsWord, CT_END_ALPHANUM)) && (FALSE != FDoWordMatchEnd(pszEnd, &dwFlagsEnd, CT_END_ALPHASPACE))) || // r3 ((FALSE != FDoWordMatchEnd(pszEndWord, pdwFlagsWord, CT_END_NUM)) && // r4 (((FALSE != FDoWordMatchStart(pszWord, pdwFlagsWord, CT_START_ALPHANUM)) && (FALSE == FDoWordMatchEnd(pszEnd, &dwFlagsEnd, CT_END_ALPHANUM)) && (FALSE == FIsInternalChar(*pszEnd))) || // r5 ((FALSE == FDoWordMatchStart(pszWord, pdwFlagsWord, CT_START_ALPHANUM)) && (FALSE != FDoWordMatchEnd(pszEnd, &dwFlagsEnd, CT_END_ALPHANUMSPACE)))))) // r6 { // Good match fRet = TRUE; } } return fRet; } ///////////////////////////////////////////////////////////////////////////// // FWordPresent // // Determines if the given "word" is present in the Text. A word in this // case is any string of characters with a non-alpha character on either // side (or with the beginning or end of the text on either side). // Case sensitive. ///////////////////////////////////////////////////////////////////////////// BOOL FWordPresent(LPSTR pszText, DWORD * pdwFlags, LPSTR pszWord, ULONG cchWord, LPSTR * ppszMatch) { BOOL fRet = FALSE; LPSTR pszLoc = NULL; DWORD dwFlagsPrev = 0; // If there's nothing to do then just exit if ((NULL == pszText) || ('\0' == pszText[0]) || (NULL == pszWord) || (NULL == pdwFlags) || (0 == cchWord)) { fRet = FALSE; goto exit; } // How big is the text for (pszLoc = pszText; NULL != (pszLoc = StrStr(pszLoc, pszWord)); pszLoc = CharNext(pszLoc)) { if (FALSE != FMatchToken((pszLoc == pszText), ('\0' == pszLoc[cchWord]), (pszLoc != pszText) ? CharPrev(pszText, pszLoc) : NULL, &dwFlagsPrev, pszWord, cchWord, pdwFlags, pszLoc + cchWord)) { // Good match if (NULL != ppszMatch) { *ppszMatch = pszLoc; } fRet = TRUE; goto exit; } // Don't cache these flags... dwFlagsPrev = 0; } exit: return fRet; } ///////////////////////////////////////////////////////////////////////////// // Special feature implementations // ///////////////////////////////////////////////////////////////////////////// // This feature is 25% of first 50 words contain no lowercase letters (includes words with no letters at all) // p20_BODY_INTRO_UPPERCASE_WORDS const UINT g_cWordsMax = 50; const DOUBLE g_cNonLowerWordsThreshold = 0.25; BOOL FSpecialFeatureUpperCaseWords(LPCSTR pszText) { BOOL fRet = FALSE; UINT cWords = 0; UINT cNonLowerWords = 0; BOOL fHasLowerLetter = FALSE; LPCSTR pszPos = NULL; WORD wType = 0; if (NULL == pszText) { fRet = FALSE; goto exit; } // Skip over the leading spaces pszPos = PszSkipWhiteSpace(pszText); if ('\0' == *pszPos) { fRet = FALSE; goto exit; } while (cWords < g_cWordsMax) { // Are we at the end of a word? wType = WGetStringTypeEx(pszPos); if ((0 != (wType & C1_SPACE)) || ('\0' == *pszPos)) { // We found a word cWords++; // Did we have any lower case letters in the word if (FALSE == fHasLowerLetter) { cNonLowerWords++; } else { fHasLowerLetter = FALSE; } // Skip over the trailing spaces pszPos = PszSkipWhiteSpace(pszPos); // Are we done with the string? if ('\0' == *pszPos) { break; } } else { fHasLowerLetter |= (0 != (wType & C1_LOWER)); // Move to the next character pszPos = CharNext(pszPos); } } // Set the return value fRet = ((cWords > 0) && ((cNonLowerWords / (double)cWords) >= g_cNonLowerWordsThreshold)); exit: return fRet; } BOOL FSpecialFeatureUpperCaseWordsStm(IStream * pIStm) { BOOL fRet = FALSE; TCHAR rgchBuff[4096 + 1]; ULONG chRead = 0; LARGE_INTEGER liZero = {0}; if (NULL == pIStm) { fRet = FALSE; goto exit; } // Seek to the start of the stream if (FAILED(pIStm->Seek(liZero, STREAM_SEEK_SET, NULL))) { fRet = FALSE; goto exit; } // Fill up the buffer if (FAILED(pIStm->Read(rgchBuff, 4096, &chRead))) { fRet = FALSE; goto exit; } // Make sure the buffer is zero terminated rgchBuff[chRead] = '\0'; fRet = FSpecialFeatureUpperCaseWords(rgchBuff); exit: return fRet; } // This feature is: 8% of first 200 non-space and non-numeric characters aren't letters // p20_BODY_INTRO_NONALPHA const UINT g_cchTextMax = 200; const DOUBLE g_cNonSpaceNumThreshold = 0.08; BOOL FSpecialFeatureNonAlpha(LPCSTR pszText) { BOOL fRet = FALSE; UINT cchText = 0; UINT cchNonAlpha = 0; LPCSTR pszPos = NULL; WORD wType = 0; if (NULL == pszText) { fRet = FALSE; goto exit; } // Skip over the leading spaces pszPos = PszSkipWhiteSpace(pszText); for (; '\0' != *pszPos; pszPos = CharNext(pszPos)) { wType = WGetStringTypeEx(pszPos); // Are we not a space or a digit? if ((0 == (wType & C1_SPACE)) && (0 == (wType & C1_DIGIT))) { cchText++; if (0 == (wType & C1_ALPHA)) { cchNonAlpha++; } // Have we checked enough characters? if (cchText >= g_cchTextMax) { break; } } } // Set the return value fRet = (cchText > 0) && ((cchNonAlpha / (double)cchText) >= g_cNonSpaceNumThreshold); exit: return fRet; } BOOL FSpecialFeatureNonAlphaStm(IStream * pIStm) { BOOL fRet = FALSE; TCHAR rgchBuff[1024 + 1]; ULONG chRead = 0; LARGE_INTEGER liZero = {0}; if (NULL == pIStm) { fRet = FALSE; goto exit; } // Seek to the start of the stream if (FAILED(pIStm->Seek(liZero, STREAM_SEEK_SET, NULL))) { fRet = FALSE; goto exit; } // Fill up the buffer if (FAILED(pIStm->Read(rgchBuff, 1024, &chRead))) { fRet = FALSE; goto exit; } // Make sure the buffer is zero terminated rgchBuff[chRead] = '\0'; fRet = FSpecialFeatureNonAlpha(rgchBuff); exit: return fRet; } // -------------------------------------------------------------------------------- // FStreamStringSearch // -------------------------------------------------------------------------------- #define CB_STREAMMATCH 0x00000FFF BOOL FStreamStringSearch(LPSTREAM pstm, DWORD * pdwFlagsSearch, LPSTR pszSearch, ULONG cchSearch, DWORD dwFlags) { BOOL fRet = FALSE; ULONG cbSave = 0; CHAR rgchBuff[CB_STREAMMATCH + 1]; LPSTR pszRead = NULL; ULONG cbRead = 0; ULONG cbIn = 0; ULONG cchGood = NULL; CHAR chSave = '\0'; LONG cbSize = 0; LPSTR pszMatch = NULL; ULONG cbWalk = 0; // Check incoming params if ((NULL == pstm) || (NULL == pszSearch) || (0 == cchSearch)) { goto exit; } // We want to save off the lead char and // a possible ending lead byte... cbSave = cchSearch + 2; if (cbSave > ARRAYSIZE(rgchBuff)) { // we've got a problem - this can cause a buffer overflow later on Assert(0); goto exit; } // Get the stream size if (FAILED(HrGetStreamSize(pstm, (ULONG *) &cbSize))) { goto exit; } // Reset the stream to the beginning if (FAILED(HrRewindStream(pstm))) { goto exit; } // Set up the defaults pszRead = rgchBuff; cbRead = CB_STREAMMATCH; // Search for string through the entire stream while ((cbSize > 0) && (S_OK == pstm->Read(pszRead, cbRead, &cbIn))) { // We're done if we read nothing... if (0 == cbIn) { goto exit; } // Note that we've read the bytes cbSize -= cbIn; // Zero terminate the buffer pszRead[cbIn] = '\0'; // Should we convert the buffer to upper case if (0 == (dwFlags & SSF_CASESENSITIVE)) { cchGood = CharUpperBuff(rgchBuff, (ULONG)(cbIn + pszRead - rgchBuff)); } else { // We need to spin over the buffer figuring out if the end character is a lead // byte without a corresponding tail byte cbWalk = (ULONG) (cbIn + pszRead - rgchBuff); for (cchGood = 0; cchGood < cbWalk; cchGood++) { if (IsDBCSLeadByte(rgchBuff[cchGood])) { if ((cchGood + 1) >= cbWalk) { break; } cchGood++; } } } chSave = rgchBuff[cchGood]; rgchBuff[cchGood] = '\0'; // Search for string if (FALSE != FWordPresent(rgchBuff, pdwFlagsSearch, pszSearch, cchSearch, &pszMatch)) { // If we aren't at the end of the stream and we can't // tell if we are at a word break if ((0 >= cbSize) || ((pszMatch + cchSearch) != (pszRead + cchGood))) { fRet = TRUE; break; } } // Are we done with the stream if (0 >= cbSize) { break; } rgchBuff[cchGood] = chSave; // Save part of the buffer // How much space do we have in the buffer cbRead = CB_STREAMMATCH - cbSave; // Save the characters MoveMemory(rgchBuff, rgchBuff + cbRead, cbSave); // Figure out the new start of the buffer pszRead = rgchBuff + cbSave; } exit: return(fRet); } HRESULT HrConvertHTMLToPlainText(IStream * pIStmHtml, IStream ** ppIStmText) { HRESULT hr = S_OK; IDataObject * pIDataObj = NULL; FORMATETC fetc = {0}; STGMEDIUM stgmed = {0}; // Check incoming params if ((NULL == pIStmHtml) || (NULL == ppIStmText)) { hr = E_INVALIDARG; goto exit; } // Initialize the outgoing param *ppIStmText = NULL; hr = MimeEditDocumentFromStream(pIStmHtml, IID_IDataObject, (VOID **)&pIDataObj); if (FAILED(hr)) { goto exit; } // Set up the format fetc.cfFormat = CF_TEXT; fetc.dwAspect = DVASPECT_CONTENT; fetc.lindex = -1; fetc.tymed = TYMED_ISTREAM; // Get the data hr = pIDataObj->GetData(&fetc, &stgmed); if (FAILED(hr)) { goto exit; } if (NULL == stgmed.pstm) { hr = E_FAIL; goto exit; } // Save the item *ppIStmText = stgmed.pstm; (*ppIStmText)->AddRef(); // addref the pUnk as it will be release in releasestgmed if(NULL != stgmed.pUnkForRelease) { (stgmed.pUnkForRelease)->AddRef(); } hr = S_OK; exit: ReleaseStgMedium(&stgmed); ReleaseObj(pIDataObj); return hr; }