windows-server-2003/inetcore/outlookexpress/oejunk/junkutil.cpp


								/*


								  JUNKUTIL.CPP

								  (c) copyright 1998 Microsoft Corp


								  Shared utility functions


								  Created by Robert Rounthwaite ([email protected])


								  Modified by Brian Moore ([email protected])


								*/


								#include <pch.hxx>

								#include "junkutil.h"

								#include <msoedbg.h>

								#define _WIN32_OE 0x0501

								#include <mimeole.h>


								// Returns the CT_CTYPE1 classification bits (C1_xxx) of the character that
								// starts at pszText, as reported by GetStringTypeEx for LOCALE_USER_DEFAULT.
								// When the first byte is a DBCS lead byte, two bytes are classified as one
								// character; otherwise a single byte is classified.
								// Returns 0 when pszText is NULL. The result is also 0 if GetStringTypeEx
								// fails, since it then leaves the zero-initialized output untouched.
								WORD WGetStringTypeEx(LPCSTR pszText)
								{
								    WORD    wCharType = 0;
								    int     cbChar = 0;

								    if (NULL != pszText)
								    {
								        // A DBCS character occupies two bytes, anything else one
								        cbChar = IsDBCSLeadByte(*pszText) ? 2 : 1;

								        SideAssert(GetStringTypeEx(LOCALE_USER_DEFAULT, CT_CTYPE1, pszText, cbChar, &wCharType));
								    }

								    return wCharType;
								}


								// FMatchToken
								//
								// Decides whether a substring hit is delimited like a whole token, by testing
								// the character in front of the hit and the character behind it against the
								// rules f1-f3 and r1-r6 listed below.
								//
								//  fStart       - TRUE when the hit is at the very start of the text; the front
								//                 check is then skipped and pszPrev is never dereferenced
								//  fEnd         - TRUE when the hit runs to the end of the text; the rear check
								//                 then passes immediately and pszEnd is never dereferenced
								//  pszPrev      - character preceding the hit (only read when fStart is FALSE)
								//  pdwFlagsPrev - flags handed to FDoWordMatchStart together with pszPrev
								//  pszWord      - the word being matched; its first and last characters are examined
								//  cchWord      - length of pszWord; used to locate its last character
								//  pdwFlagsWord - flags handed to FDoWordMatchStart/FDoWordMatchEnd together with pszWord
								//  pszEnd       - character following the hit (only read when fEnd is FALSE)
								//
								// Returns TRUE when both the front and the rear criteria hold, FALSE otherwise.
								//
								// NOTE(review): the DWORD flag arguments look like caches of character-type
								// lookups that FDoWordMatchStart/FDoWordMatchEnd may update - confirm against
								// those helpers before reordering any of the sub-expressions below.
								BOOL FMatchToken(BOOL fStart, BOOL fEnd, LPCSTR pszPrev, DWORD * pdwFlagsPrev, LPCSTR pszWord, ULONG cchWord, DWORD * pdwFlagsWord, LPCSTR pszEnd)

								{

								    BOOL    fRet = FALSE;

								    DWORD   dwFlagsEnd = 0;

								    LPCSTR  pszEndWord = NULL;


								    // this code checks to see that the spot we found is a "word" and not a subword

								    // we want the character before and after to be word break, unless the character on that end of the

								    // string already is not a word break (or we're at the beginning of the string, for the char before)

								    // front checking

								    // f1: in either case we don't have to check the front if this is the first character; otherwise,

								    // f2: either the first character of the string is alnum and the previous character is not (and is not an "internal" character)

								    // f3: or the first character of the string isn't alnum, the previous character either is, or is a whitespace character

								    // rear checking

								    // r1: either we are at the end of the string

								    // r2: or the last character is alpha and the following character is not alpha or number (and is not an "internal" character)

								    // r3: or the last character is not alpha or num and the following character either is, or is a whitespace character

								    // r4: or the last character is num and the test depends on the first character:

								    // r5:      if it was alphanum, then the following character is not alpha or number (and is not an "internal" character)

								    // r6:      or it wasn't alphanum, then the following character is alpha or is a whitespace character

								    // Whew! This mimics the criteria used by GetNextFeature() in splitting up the string. The easiest way to understand this criteria

								    // is to examine that function

								    // Front check: the || short-circuit on fStart guarantees pszPrev is only touched when there is a previous character
								    if ((FALSE != fStart) ||                                                                                // f1

								            ((FALSE != FDoWordMatchStart(pszWord, pdwFlagsWord, CT_START_ALPHANUM)) &&

								                    (FALSE == FDoWordMatchStart(pszPrev, pdwFlagsPrev, CT_START_ALPHANUM)) &&

								                    (FALSE == FIsInternalChar(*pszPrev))) ||                                                // f2

								            ((FALSE == FDoWordMatchStart(pszWord, pdwFlagsWord, CT_START_ALPHANUM)) &&

								                    (FALSE != FDoWordMatchStart(pszPrev, pdwFlagsPrev, CT_START_ALPHANUMSPACE))))           // f3

								    {

								        // Make it a little more readable

								        // pszEndWord addresses the last character of the word (cchWord is assumed to be non-zero here)
								        pszEndWord = pszWord + cchWord - 1;


								        // Rear check: the || short-circuit on fEnd guarantees pszEnd is only touched when a following character exists
								        if ((FALSE != fEnd) ||                                                                              // r1

								                ((FALSE != FDoWordMatchEnd(pszEndWord, pdwFlagsWord, CT_END_ALPHA)) &&

								                        (FALSE == FDoWordMatchEnd(pszEnd, &dwFlagsEnd, CT_END_ALPHANUM)) &&

								                        (FALSE == FIsInternalChar(*pszEnd))) ||                                             // r2

								                ((FALSE == FDoWordMatchEnd(pszEndWord, pdwFlagsWord, CT_END_ALPHANUM)) &&

								                        (FALSE != FDoWordMatchEnd(pszEnd, &dwFlagsEnd, CT_END_ALPHASPACE))) ||              // r3

								                ((FALSE != FDoWordMatchEnd(pszEndWord, pdwFlagsWord, CT_END_NUM)) &&                        // r4

								                    (((FALSE != FDoWordMatchStart(pszWord, pdwFlagsWord, CT_START_ALPHANUM)) &&

								                            (FALSE == FDoWordMatchEnd(pszEnd, &dwFlagsEnd, CT_END_ALPHANUM)) &&

								                                    (FALSE == FIsInternalChar(*pszEnd))) ||                                 // r5

								                        ((FALSE == FDoWordMatchStart(pszWord, pdwFlagsWord, CT_START_ALPHANUM)) &&

								                            (FALSE != FDoWordMatchEnd(pszEnd, &dwFlagsEnd, CT_END_ALPHANUMSPACE))))))       // r6

								        {

								            // Good match

								            fRet = TRUE;

								        }

								    }


								    return fRet;

								}


								/////////////////////////////////////////////////////////////////////////////
								// FWordPresent
								//
								// Looks for an occurrence of pszWord inside pszText that stands alone as a
								// token. Each raw substring hit located with StrStr (a case sensitive
								// search) is handed to FMatchToken, which decides whether the characters
								// on either side of the hit (or the start/end of the text) delimit it.
								//
								//  pszText   - text to scan; NULL or empty yields FALSE
								//  pdwFlags  - flags for pszWord, forwarded to FMatchToken; NULL yields FALSE
								//  pszWord   - word to look for; NULL yields FALSE
								//  cchWord   - length of pszWord; 0 yields FALSE
								//  ppszMatch - optional; receives the position of the accepted hit and is
								//              left untouched when FALSE is returned
								//
								// Returns TRUE for the first hit that FMatchToken accepts, FALSE otherwise.
								/////////////////////////////////////////////////////////////////////////////
								BOOL FWordPresent(LPSTR pszText, DWORD * pdwFlags, LPSTR pszWord, ULONG cchWord, LPSTR * ppszMatch)
								{
								    LPSTR   pszHit = NULL;
								    LPSTR   pszBefore = NULL;
								    DWORD   dwFlagsBefore = 0;
								    BOOL    fAtStart = FALSE;
								    BOOL    fAtEnd = FALSE;

								    // Nothing to search, or nothing to search for
								    if ((NULL == pszText) || ('\0' == pszText[0]) || (NULL == pszWord) || (NULL == pdwFlags) || (0 == cchWord))
								    {
								        return FALSE;
								    }

								    // Visit every raw occurrence of the word until one qualifies as a token
								    pszHit = StrStr(pszText, pszWord);
								    while (NULL != pszHit)
								    {
								        fAtStart = (pszHit == pszText);
								        fAtEnd = ('\0' == pszHit[cchWord]);

								        // There is no previous character when the hit starts the text
								        pszBefore = fAtStart ? NULL : CharPrev(pszText, pszHit);

								        if (FALSE != FMatchToken(fAtStart, fAtEnd, pszBefore, &dwFlagsBefore,
								                    pszWord, cchWord, pdwFlags, pszHit + cchWord))
								        {
								            if (NULL != ppszMatch)
								            {
								                *ppszMatch = pszHit;
								            }

								            return TRUE;
								        }

								        // The previous character changes with every hit, so start its flags afresh
								        dwFlagsBefore = 0;

								        // Resume the search one character past the rejected hit
								        pszHit = StrStr(CharNext(pszHit), pszWord);
								    }

								    return FALSE;
								}


								/////////////////////////////////////////////////////////////////////////////

								// Special feature implementations

								//

								/////////////////////////////////////////////////////////////////////////////


								// p20_BODY_INTRO_UPPERCASE_WORDS
								// Special feature: fires when at least 25% of the first 50 words contain no
								// lowercase letters (words containing no letters at all are included).

								const UINT      g_cWordsMax = 50;
								const DOUBLE    g_cNonLowerWordsThreshold = 0.25;

								// Evaluates the feature on pszText. Words are runs of characters separated
								// by C1_SPACE characters; at most g_cWordsMax words are examined.
								// Returns FALSE for NULL, empty or all-whitespace text.
								BOOL FSpecialFeatureUpperCaseWords(LPCSTR pszText)
								{
								    UINT    cTotalWords = 0;
								    UINT    cNoLowerWords = 0;
								    BOOL    fSawLower = FALSE;
								    LPCSTR  pszCur = NULL;
								    WORD    wCharType = 0;

								    if (NULL == pszText)
								    {
								        return FALSE;
								    }

								    // Position on the first character of the first word
								    pszCur = PszSkipWhiteSpace(pszText);
								    if ('\0' == *pszCur)
								    {
								        return FALSE;
								    }

								    while (cTotalWords < g_cWordsMax)
								    {
								        wCharType = WGetStringTypeEx(pszCur);

								        // Still inside a word: note lowercase letters and keep walking
								        if ((0 == (wCharType & C1_SPACE)) && ('\0' != *pszCur))
								        {
								            if (0 != (wCharType & C1_LOWER))
								            {
								                fSawLower = TRUE;
								            }

								            pszCur = CharNext(pszCur);
								            continue;
								        }

								        // A space or the terminator closes the current word
								        cTotalWords++;
								        if (FALSE == fSawLower)
								        {
								            cNoLowerWords++;
								        }
								        fSawLower = FALSE;

								        // Move to the start of the next word, if there is one
								        pszCur = PszSkipWhiteSpace(pszCur);
								        if ('\0' == *pszCur)
								        {
								            break;
								        }
								    }

								    return ((cTotalWords > 0) && ((cNoLowerWords / (double)cTotalWords) >= g_cNonLowerWordsThreshold));
								}


								// Stream front end for FSpecialFeatureUpperCaseWords.
								// Seeks pIStm to its beginning, reads at most 4096 bytes and evaluates the
								// feature on that zero-terminated prefix.
								// Returns FALSE when pIStm is NULL or when the seek or read fails; otherwise
								// returns whatever FSpecialFeatureUpperCaseWords reports for the prefix.
								// The stream position is not restored afterwards.
								BOOL FSpecialFeatureUpperCaseWordsStm(IStream * pIStm)
								{
								    BOOL            fRet = FALSE;
								    // IStream::Read counts bytes and the feature function takes an LPCSTR,
								    // so the buffer has to be CHAR regardless of how TCHAR is defined.
								    CHAR            rgchBuff[4096 + 1];
								    ULONG           cbRead = 0;
								    LARGE_INTEGER   liZero = {0};

								    if (NULL == pIStm)
								    {
								        goto exit;
								    }

								    // Seek to the start of the stream
								    if (FAILED(pIStm->Seek(liZero, STREAM_SEEK_SET, NULL)))
								    {
								        goto exit;
								    }

								    // Fill up the buffer, leaving one byte for the terminator
								    if (FAILED(pIStm->Read(rgchBuff, (ULONG)(sizeof(rgchBuff) - 1), &cbRead)))
								    {
								        goto exit;
								    }

								    // Make sure the buffer is zero terminated
								    rgchBuff[cbRead] = '\0';

								    fRet = FSpecialFeatureUpperCaseWords(rgchBuff);

								exit:
								    return fRet;
								}


								// p20_BODY_INTRO_NONALPHA
								// Special feature: fires when at least 8% of the first 200 characters that
								// are neither spaces nor digits are not letters.
								const UINT      g_cchTextMax = 200;
								const DOUBLE    g_cNonSpaceNumThreshold = 0.08;

								// Evaluates the feature on pszText. Characters classified as C1_SPACE or
								// C1_DIGIT are ignored; of the remaining ones, at most g_cchTextMax are
								// examined and those lacking C1_ALPHA are counted.
								// Returns FALSE for NULL text or when no character was examined.
								BOOL FSpecialFeatureNonAlpha(LPCSTR pszText)
								{
								    UINT    cchCounted = 0;
								    UINT    cchNotLetter = 0;
								    LPCSTR  pszCur = NULL;
								    WORD    wCharType = 0;

								    if (NULL == pszText)
								    {
								        return FALSE;
								    }

								    // Start at the first non-space character
								    pszCur = PszSkipWhiteSpace(pszText);

								    while ('\0' != *pszCur)
								    {
								        wCharType = WGetStringTypeEx(pszCur);

								        // Only characters that are neither spaces nor digits take part
								        if (0 == (wCharType & (C1_SPACE | C1_DIGIT)))
								        {
								            cchCounted++;

								            if (0 == (wCharType & C1_ALPHA))
								            {
								                cchNotLetter++;
								            }

								            // Stop once the sample is large enough
								            if (cchCounted >= g_cchTextMax)
								            {
								                break;
								            }
								        }

								        pszCur = CharNext(pszCur);
								    }

								    return (cchCounted > 0) && ((cchNotLetter / (double)cchCounted) >= g_cNonSpaceNumThreshold);
								}


								// Stream front end for FSpecialFeatureNonAlpha.
								// Seeks pIStm to its beginning, reads at most 1024 bytes and evaluates the
								// feature on that zero-terminated prefix.
								// Returns FALSE when pIStm is NULL or when the seek or read fails; otherwise
								// returns whatever FSpecialFeatureNonAlpha reports for the prefix.
								// The stream position is not restored afterwards.
								BOOL FSpecialFeatureNonAlphaStm(IStream * pIStm)
								{
								    BOOL            fRet = FALSE;
								    // IStream::Read counts bytes and the feature function takes an LPCSTR,
								    // so the buffer has to be CHAR regardless of how TCHAR is defined.
								    CHAR            rgchBuff[1024 + 1];
								    ULONG           cbRead = 0;
								    LARGE_INTEGER   liZero = {0};

								    if (NULL == pIStm)
								    {
								        goto exit;
								    }

								    // Seek to the start of the stream
								    if (FAILED(pIStm->Seek(liZero, STREAM_SEEK_SET, NULL)))
								    {
								        goto exit;
								    }

								    // Fill up the buffer, leaving one byte for the terminator
								    if (FAILED(pIStm->Read(rgchBuff, (ULONG)(sizeof(rgchBuff) - 1), &cbRead)))
								    {
								        goto exit;
								    }

								    // Make sure the buffer is zero terminated
								    rgchBuff[cbRead] = '\0';

								    fRet = FSpecialFeatureNonAlpha(rgchBuff);

								exit:
								    return fRet;
								}


								// --------------------------------------------------------------------------------
								// FStreamStringSearch
								//
								// Searches a whole stream for pszSearch as a stand-alone token (the token test
								// is the one applied by FWordPresent). The stream is rewound and consumed in
								// chunks of up to CB_STREAMMATCH bytes; the last cchSearch + 2 bytes of each
								// chunk are carried over to the front of the next one so that a word straddling
								// a chunk boundary, its preceding character and a possibly split DBCS lead byte
								// are seen together.
								//
								//  pstm           - stream to search; NULL yields FALSE
								//  pdwFlagsSearch - flags for pszSearch, forwarded to FWordPresent
								//  pszSearch      - word to find; NULL yields FALSE
								//  cchSearch      - length of pszSearch; 0, or a length that leaves no room in
								//                   the buffer for new data, yields FALSE
								//  dwFlags        - unless SSF_CASESENSITIVE is set each chunk is upper-cased
								//                   with CharUpperBuff before it is searched
								//                   (NOTE(review): this presumes the caller passes pszSearch
								//                   already upper-cased in that mode - confirm at the call sites)
								//
								// Returns TRUE when the word is found, FALSE otherwise or on any stream error.
								//
								// NOTE(review): a hit that lands at offset 0 of a carried-over tail is treated
								// by FWordPresent as being at the start of the text; verify whether that can
								// produce a false positive.
								// --------------------------------------------------------------------------------
								#define CB_STREAMMATCH  0x00000FFF
								BOOL FStreamStringSearch(LPSTREAM pstm, DWORD * pdwFlagsSearch, LPSTR pszSearch, ULONG cchSearch, DWORD dwFlags)
								{
								    BOOL            fRet = FALSE;
								    ULONG           cbSave = 0;
								    CHAR            rgchBuff[CB_STREAMMATCH + 1];
								    LPSTR           pszRead = NULL;
								    ULONG           cbRead = 0;
								    ULONG           cbIn = 0;
								    ULONG           cbTotal = 0;
								    ULONG           cbKeep = 0;
								    ULONG           cchGood = 0;
								    CHAR            chSave = '\0';
								    LONG            cbSize = 0;
								    LPSTR           pszMatch = NULL;

								    // Check incoming params
								    if ((NULL == pstm) || (NULL == pszSearch) || (0 == cchSearch))
								    {
								        goto exit;
								    }

								    // The carried-over tail (cchSearch + 2 bytes) must leave room for at least
								    // one freshly read byte, otherwise the read size below would become zero or
								    // wrap around. Comparing before adding also avoids overflow of cchSearch + 2.
								    if (cchSearch >= (CB_STREAMMATCH - 2))
								    {
								        // we've got a problem - this would cause a buffer overflow later on
								        Assert(0);
								        goto exit;
								    }

								    // We want to save off the lead char and
								    // a possible ending lead byte...
								    cbSave = cchSearch + 2;

								    // Get the stream size
								    if (FAILED(HrGetStreamSize(pstm, (ULONG *) &cbSize)))
								    {
								        goto exit;
								    }

								    // Reset the stream to the beginning
								    if (FAILED(HrRewindStream(pstm)))
								    {
								        goto exit;
								    }

								    // The first chunk fills the whole buffer (minus the terminator)
								    pszRead = rgchBuff;
								    cbRead = CB_STREAMMATCH;

								    // Search for string through the entire stream
								    while (cbSize > 0)
								    {
								        // Accept any success code: a stream may report a short final read with
								        // S_FALSE, and that data still has to be searched
								        cbIn = 0;
								        if (FAILED(pstm->Read(pszRead, cbRead, &cbIn)) || (0 == cbIn))
								        {
								            break;
								        }

								        // Note that we've read the bytes
								        cbSize -= cbIn;

								        // Valid bytes in the buffer: carried-over tail plus what was just read
								        cbTotal = (ULONG)(pszRead - rgchBuff) + cbIn;

								        // Zero terminate the buffer
								        rgchBuff[cbTotal] = '\0';

								        // Should we convert the buffer to upper case
								        if (0 == (dwFlags & SSF_CASESENSITIVE))
								        {
								            cchGood = CharUpperBuff(rgchBuff, cbTotal);
								        }
								        else
								        {
								            // We need to spin over the buffer figuring out if the end character is a lead
								            // byte without a corresponding tail byte
								            for (cchGood = 0; cchGood < cbTotal; cchGood++)
								            {
								                if (IsDBCSLeadByte(rgchBuff[cchGood]))
								                {
								                    if ((cchGood + 1) >= cbTotal)
								                    {
								                        break;
								                    }

								                    cchGood++;
								                }
								            }
								        }

								        // Temporarily cut the buffer at the last complete character
								        chSave = rgchBuff[cchGood];
								        rgchBuff[cchGood] = '\0';

								        // Search for string
								        if (FALSE != FWordPresent(rgchBuff, pdwFlagsSearch, pszSearch, cchSearch, &pszMatch))
								        {
								            // Accept the hit unless more data follows and the hit runs right up to
								            // the end of the searchable data, in which case we cannot yet tell
								            // whether a word break follows. cchGood is an offset from rgchBuff.
								            if ((0 >= cbSize) || ((pszMatch + cchSearch) != (rgchBuff + cchGood)))
								            {
								                fRet = TRUE;
								                break;
								            }
								        }

								        // Are we done with the stream
								        if (0 >= cbSize)
								        {
								            break;
								        }

								        rgchBuff[cchGood] = chSave;

								        // Carry the tail of the valid data over to the front of the buffer. Using
								        // the actual fill level keeps this correct even after a short read.
								        cbKeep = (cbTotal < cbSave) ? cbTotal : cbSave;
								        MoveMemory(rgchBuff, rgchBuff + (cbTotal - cbKeep), cbKeep);

								        // New data goes right behind the saved tail, into the remaining space
								        pszRead = rgchBuff + cbKeep;
								        cbRead = CB_STREAMMATCH - cbKeep;
								    }

								exit:
								    return(fRet);
								}


								// Produces a plain-text rendering of an HTML stream.
								// The HTML is loaded through MimeEditDocumentFromStream, and the resulting
								// IDataObject is asked for CF_TEXT delivered as an IStream.
								//
								//  pIStmHtml  - HTML source stream
								//  ppIStmText - receives the text stream with a reference for the caller;
								//               set to NULL first whenever both arguments are valid
								//
								// Returns S_OK on success, E_INVALIDARG if either argument is NULL, E_FAIL if
								// the data object handed back no stream, or the failure code from
								// MimeEditDocumentFromStream / IDataObject::GetData.
								HRESULT HrConvertHTMLToPlainText(IStream * pIStmHtml, IStream ** ppIStmText)
								{
								    HRESULT         hrResult = E_INVALIDARG;
								    IDataObject *   pDataObject = NULL;
								    FORMATETC       fmtText = {0};
								    STGMEDIUM       medium = {0};

								    if ((NULL != pIStmHtml) && (NULL != ppIStmText))
								    {
								        // Initialize the outgoing param
								        *ppIStmText = NULL;

								        hrResult = MimeEditDocumentFromStream(pIStmHtml, IID_IDataObject, (VOID **)&pDataObject);
								        if (SUCCEEDED(hrResult))
								        {
								            // Ask for the content as plain text in a stream
								            fmtText.cfFormat = CF_TEXT;
								            fmtText.dwAspect = DVASPECT_CONTENT;
								            fmtText.lindex = -1;
								            fmtText.tymed = TYMED_ISTREAM;

								            hrResult = pDataObject->GetData(&fmtText, &medium);
								            if (SUCCEEDED(hrResult))
								            {
								                if (NULL == medium.pstm)
								                {
								                    hrResult = E_FAIL;
								                }
								                else
								                {
								                    // Hand the stream to the caller with its own reference
								                    *ppIStmText = medium.pstm;
								                    (*ppIStmText)->AddRef();

								                    // ReleaseStgMedium below releases pUnkForRelease, so take an
								                    // extra reference on it first
								                    if (NULL != medium.pUnkForRelease)
								                    {
								                        (medium.pUnkForRelease)->AddRef();
								                    }

								                    hrResult = S_OK;
								                }
								            }
								        }
								    }

								    // Common cleanup; both calls also run when nothing was acquired
								    ReleaseStgMedium(&medium);
								    ReleaseObj(pDataObject);
								    return hrResult;
								}