windows-server-2003/inetcore/outlookexpress/oejunk/junkutil.cpp

/*

  JUNKUTIL.CPP
  (c) copyright 1998 Microsoft Corp

  Shared utility functions

  Created by Robert Rounthwaite (RobertRo@microsoft.com)

  Modified by Brian Moore (brimo@microsoft.com)

*/

#include <pch.hxx>
#include "junkutil.h"
#include <msoedbg.h>
#define _WIN32_OE 0x0501
#include <mimeole.h>

// Returns the CT_CTYPE1 classification bits (C1_*) that GetStringTypeEx()
// reports for the character at pszText in the user's default locale.
// A NULL pointer, or a failed classification, yields 0.
WORD WGetStringTypeEx(LPCSTR pszText)
{
    WORD wCharType = 0;

    if (NULL != pszText)
    {
        // A DBCS lead byte is classified together with the byte that follows it
        const int cbChar = IsDBCSLeadByte(*pszText) ? 2 : 1;

        SideAssert(GetStringTypeEx(LOCALE_USER_DEFAULT, CT_CTYPE1, pszText, cbChar, &wCharType));
    }

    return wCharType;
}

/////////////////////////////////////////////////////////////////////////////
// FMatchToken
//
// Decides whether an occurrence of a word is delimited like a whole token
// (as opposed to being a piece of a larger one) by examining the characters
// on either side of it.
//
//  fStart       - non-FALSE when the occurrence begins the text; the front
//                 check then passes without pszPrev ever being touched
//  fEnd         - non-FALSE when the occurrence ends the text; the rear
//                 check then passes without pszEnd ever being touched
//  pszPrev      - the character just before the occurrence; it is only
//                 dereferenced when fStart is FALSE, so it may be NULL
//                 otherwise
//  pdwFlagsPrev - flags DWORD handed to FDoWordMatchStart() together with
//                 pszPrev (presumably a cache of that character's
//                 classification - confirm against FDoWordMatchStart)
//  pszWord      - the word that was found
//  cchWord      - length of the word; pszWord + cchWord - 1 is used as its
//                 last character
//  pdwFlagsWord - flags DWORD handed to the FDoWordMatch* helpers together
//                 with the word's first and last characters
//  pszEnd       - the character just after the occurrence
//
// Returns TRUE only when both the front check (f1-f3) and the rear check
// (r1-r6) described below succeed.
/////////////////////////////////////////////////////////////////////////////
BOOL FMatchToken(BOOL fStart, BOOL fEnd, LPCSTR pszPrev, DWORD * pdwFlagsPrev, LPCSTR pszWord, ULONG cchWord, DWORD * pdwFlagsWord, LPCSTR pszEnd)
{
    BOOL    fRet = FALSE;
    DWORD   dwFlagsEnd = 0;     // flags DWORD used for every helper call made on pszEnd
    LPCSTR  pszEndWord = NULL;  // last character of the word

    // this code checks to see that the spot we found is a "word" and not a subword
    // we want the character before and after to be word break, unless the character on that end of the
    // string already is not a word break (or we're at the beginning of the string, for the char before)
    // front checking
    // f1: in either case we don't have to check the front if this is the first character; otherwise,
    // f2: either the first character of the string is alnum and the previous character is not (and is not an "internal" character)
    // f3: or the first character of the string isn't alnum, the previous character either is, or is a whitespace character
    // rear checking
    // r1: either we are at the end of the string
    // r2: or the last character is alpha and the following character is not alpha or number (and is not an "internal" character)
    // r3: or the last character is not alpha or num and the following character either is, or is a whitespace character
    // r4: or the last character is num and the test depends on the first character:
    // r5:      if it was alphanum, then the following character is not alpha or number (and is not an "internal" character)
    // r6:      or it wasn't alphanum, then the following character is alpha or is a whitespace character
    // Whew! This mimics the criteria used by GetNextFeature() in splitting up the string. The easiest way to understand this criteria
    // is to examine that function
    //
    // Note that the order of the tests matters: fStart / fEnd are tested first so that
    // the short-circuit evaluation keeps pszPrev / pszEnd from being used when they
    // do not refer to a real neighbouring character.
    if ((FALSE != fStart) ||                                                                                // f1
            ((FALSE != FDoWordMatchStart(pszWord, pdwFlagsWord, CT_START_ALPHANUM)) &&
                    (FALSE == FDoWordMatchStart(pszPrev, pdwFlagsPrev, CT_START_ALPHANUM)) &&
                    (FALSE == FIsInternalChar(*pszPrev))) ||                                                // f2               
            ((FALSE == FDoWordMatchStart(pszWord, pdwFlagsWord, CT_START_ALPHANUM)) &&
                    (FALSE != FDoWordMatchStart(pszPrev, pdwFlagsPrev, CT_START_ALPHANUMSPACE))))           // f3
    {
        // Make it a little more readable
        pszEndWord = pszWord + cchWord - 1;
        
        if ((FALSE != fEnd) ||                                                                              // r1
                ((FALSE != FDoWordMatchEnd(pszEndWord, pdwFlagsWord, CT_END_ALPHA)) &&
                        (FALSE == FDoWordMatchEnd(pszEnd, &dwFlagsEnd, CT_END_ALPHANUM)) &&
                        (FALSE == FIsInternalChar(*pszEnd))) ||                                             // r2
                ((FALSE == FDoWordMatchEnd(pszEndWord, pdwFlagsWord, CT_END_ALPHANUM)) &&
                        (FALSE != FDoWordMatchEnd(pszEnd, &dwFlagsEnd, CT_END_ALPHASPACE))) ||              // r3
                ((FALSE != FDoWordMatchEnd(pszEndWord, pdwFlagsWord, CT_END_NUM)) &&                        // r4
                    (((FALSE != FDoWordMatchStart(pszWord, pdwFlagsWord, CT_START_ALPHANUM)) &&
                            (FALSE == FDoWordMatchEnd(pszEnd, &dwFlagsEnd, CT_END_ALPHANUM)) &&
                                    (FALSE == FIsInternalChar(*pszEnd))) ||                                 // r5
                        ((FALSE == FDoWordMatchStart(pszWord, pdwFlagsWord, CT_START_ALPHANUM)) &&
                            (FALSE != FDoWordMatchEnd(pszEnd, &dwFlagsEnd, CT_END_ALPHANUMSPACE))))))       // r6
        {
            // Good match
            fRet = TRUE;
        }
    }

    return fRet;
}

/////////////////////////////////////////////////////////////////////////////
// FWordPresent
//
// Looks for the first occurrence of pszWord inside pszText that
// FMatchToken() accepts as a whole token rather than part of a bigger one.
// Candidates are located with StrStr().
//
//  pszText   - text to search; NULL or empty text never matches
//  pdwFlags  - flags DWORD for the word, passed through to FMatchToken();
//              must not be NULL
//  pszWord   - the word to look for; must not be NULL
//  cchWord   - length of pszWord; must not be 0
//  ppszMatch - optional; receives the start of the accepted occurrence.
//              It is left untouched when nothing is found.
//
// Returns TRUE if an acceptable occurrence exists, FALSE otherwise.
/////////////////////////////////////////////////////////////////////////////
BOOL FWordPresent(LPSTR pszText, DWORD * pdwFlags, LPSTR pszWord, ULONG cchWord, LPSTR * ppszMatch)
{
    // Nothing to search in, or nothing to search for
    if ((NULL == pszText) || ('\0' == pszText[0]) || (NULL == pszWord) || (NULL == pdwFlags) || (0 == cchWord))
    {
        return FALSE;
    }

    // Visit each raw occurrence of the word in turn
    LPSTR pszHit = StrStr(pszText, pszWord);

    while (NULL != pszHit)
    {
        // The flags for the preceding character start out clean for every candidate
        DWORD       dwFlagsBefore = 0;
        const BOOL  fAtStart = (pszHit == pszText);
        const BOOL  fAtEnd = ('\0' == pszHit[cchWord]);
        LPCSTR      pszBefore = NULL;

        if (FALSE == fAtStart)
        {
            pszBefore = CharPrev(pszText, pszHit);
        }

        if (FALSE != FMatchToken(fAtStart, fAtEnd, pszBefore, &dwFlagsBefore,
                    pszWord, cchWord, pdwFlags, pszHit + cchWord))
        {
            // This occurrence stands on its own as a word
            if (NULL != ppszMatch)
            {
                *ppszMatch = pszHit;
            }

            return TRUE;
        }

        // Resume the search one character past the rejected occurrence
        pszHit = StrStr(CharNext(pszHit), pszWord);
    }

    return FALSE;
}

/////////////////////////////////////////////////////////////////////////////
// Special feature implementations
//
/////////////////////////////////////////////////////////////////////////////

// Feature: at least 25% of the first 50 words contain no lowercase letters
// (words that contain no letters at all count as well)
// p20_BODY_INTRO_UPPERCASE_WORDS

const UINT      g_cWordsMax = 50;
const DOUBLE    g_cNonLowerWordsThreshold = 0.25;

// Walks pszText word by word (words are separated by C1_SPACE characters),
// looking at no more than g_cWordsMax words, and reports whether the share
// of words without any C1_LOWER character reaches g_cNonLowerWordsThreshold.
// Returns FALSE for a NULL string or one that is empty once
// PszSkipWhiteSpace() has been applied.
BOOL FSpecialFeatureUpperCaseWords(LPCSTR pszText)
{
    if (NULL == pszText)
    {
        return FALSE;
    }

    // Start at the first word
    LPCSTR pszCur = PszSkipWhiteSpace(pszText);

    if ('\0' == *pszCur)
    {
        return FALSE;
    }

    UINT    cWordsSeen = 0;
    UINT    cWordsNoLower = 0;
    BOOL    fSawLower = FALSE;

    while (cWordsSeen < g_cWordsMax)
    {
        const WORD wCharType = WGetStringTypeEx(pszCur);
        const BOOL fWordBreak = (0 != (wCharType & C1_SPACE)) || ('\0' == *pszCur);

        if (FALSE == fWordBreak)
        {
            // Still inside a word: remember any lowercase letter and keep going
            if (0 != (wCharType & C1_LOWER))
            {
                fSawLower = TRUE;
            }

            pszCur = CharNext(pszCur);
            continue;
        }

        // A word just ended: tally it and start the next one with a clean slate
        cWordsSeen++;

        if (FALSE == fSawLower)
        {
            cWordsNoLower++;
        }

        fSawLower = FALSE;

        // Move on to the start of the next word, if there is one
        pszCur = PszSkipWhiteSpace(pszCur);

        if ('\0' == *pszCur)
        {
            break;
        }
    }

    return ((cWordsSeen > 0) && ((cWordsNoLower / (double)cWordsSeen) >= g_cNonLowerWordsThreshold));
}

// Stream flavour of FSpecialFeatureUpperCaseWords(): evaluates the feature
// on the leading bytes (at most 4096) of pIStm.
//
//  pIStm - stream to examine. It is seeked to its start and read from; the
//          seek position is not restored afterwards.
//
// Returns FALSE if pIStm is NULL or if the Seek/Read fails; otherwise
// whatever FSpecialFeatureUpperCaseWords() returns for the bytes read.
BOOL FSpecialFeatureUpperCaseWordsStm(IStream * pIStm)
{
    BOOL            fRet = FALSE;
    // The stream hands back raw bytes that are then treated as an ANSI
    // string, so this has to be CHAR (not TCHAR). The extra byte is for
    // the terminator.
    CHAR            rgchBuff[4096 + 1];
    const ULONG     cbMax = sizeof(rgchBuff) - 1;
    ULONG           cbRead = 0;
    LARGE_INTEGER   liZero = {0};
    
    if (NULL == pIStm)
    {
        goto exit;
    }

    // Seek to the start of the stream
    if (FAILED(pIStm->Seek(liZero, STREAM_SEEK_SET, NULL)))
    {
        goto exit;
    }

    // Fill up the buffer (a short read is fine, we use whatever we got)
    if (FAILED(pIStm->Read(rgchBuff, cbMax, &cbRead)))
    {
        goto exit;
    }

    // Never index past the buffer, whatever count the stream reported
    if (cbRead > cbMax)
    {
        cbRead = cbMax;
    }

    // Make sure the buffer is zero terminated
    rgchBuff[cbRead] = '\0';
    
    fRet = FSpecialFeatureUpperCaseWords(rgchBuff);
    
exit:
    return fRet;
}

// Feature: at least 8% of the first 200 characters that are neither spaces
// nor digits are not letters
// p20_BODY_INTRO_NONALPHA
const UINT      g_cchTextMax = 200;
const DOUBLE    g_cNonSpaceNumThreshold = 0.08;

// Examines pszText character by character, ignoring C1_SPACE and C1_DIGIT
// characters, until g_cchTextMax other characters have been counted or the
// string ends. Reports whether the share of counted characters that are not
// C1_ALPHA reaches g_cNonSpaceNumThreshold.
// Returns FALSE for a NULL string or when no character was counted.
BOOL FSpecialFeatureNonAlpha(LPCSTR pszText)
{
    if (NULL == pszText)
    {
        return FALSE;
    }

    UINT    cchCounted = 0;
    UINT    cchNotAlpha = 0;

    // Begin after any leading white space
    LPCSTR  pszCur = PszSkipWhiteSpace(pszText);

    while ('\0' != *pszCur)
    {
        const WORD wCharType = WGetStringTypeEx(pszCur);

        // Spaces and digits are left out of the statistics altogether
        if (0 == (wCharType & (C1_SPACE | C1_DIGIT)))
        {
            cchCounted++;

            if (0 == (wCharType & C1_ALPHA))
            {
                cchNotAlpha++;
            }

            // Stop once the sample is big enough
            if (cchCounted >= g_cchTextMax)
            {
                break;
            }
        }

        pszCur = CharNext(pszCur);
    }

    return (cchCounted > 0) && ((cchNotAlpha / (double)cchCounted) >= g_cNonSpaceNumThreshold);
}

// Stream flavour of FSpecialFeatureNonAlpha(): evaluates the feature on the
// leading bytes (at most 1024) of pIStm.
//
//  pIStm - stream to examine. It is seeked to its start and read from; the
//          seek position is not restored afterwards.
//
// Returns FALSE if pIStm is NULL or if the Seek/Read fails; otherwise
// whatever FSpecialFeatureNonAlpha() returns for the bytes read.
BOOL FSpecialFeatureNonAlphaStm(IStream * pIStm)
{
    BOOL            fRet = FALSE;
    // The stream hands back raw bytes that are then treated as an ANSI
    // string, so this has to be CHAR (not TCHAR). The extra byte is for
    // the terminator.
    CHAR            rgchBuff[1024 + 1];
    const ULONG     cbMax = sizeof(rgchBuff) - 1;
    ULONG           cbRead = 0;
    LARGE_INTEGER   liZero = {0};
    
    if (NULL == pIStm)
    {
        goto exit;
    }

    // Seek to the start of the stream
    if (FAILED(pIStm->Seek(liZero, STREAM_SEEK_SET, NULL)))
    {
        goto exit;
    }

    // Fill up the buffer (a short read is fine, we use whatever we got)
    if (FAILED(pIStm->Read(rgchBuff, cbMax, &cbRead)))
    {
        goto exit;
    }

    // Never index past the buffer, whatever count the stream reported
    if (cbRead > cbMax)
    {
        cbRead = cbMax;
    }

    // Make sure the buffer is zero terminated
    rgchBuff[cbRead] = '\0';
    
    fRet = FSpecialFeatureNonAlpha(rgchBuff);
    
exit:
    return fRet;
}

// --------------------------------------------------------------------------------
// FStreamStringSearch
//
// Reads pstm from its beginning one buffer-full at a time and returns TRUE as
// soon as FWordPresent() finds pszSearch, as a word, in the buffered text.
// So that a word straddling two reads is not missed, the tail of each buffer
// (the length of the word, plus the character in front of it, plus a possible
// dangling DBCS lead byte) is carried over to the front of the next one.
//
//  pstm            - stream to search. It is rewound and read from; the seek
//                    position is not restored.
//  pdwFlagsSearch  - flags DWORD for the word, passed through to FWordPresent()
//  pszSearch       - the word to look for
//  cchSearch       - length of pszSearch; must be smaller than
//                    CB_STREAMMATCH - 2 so that a carried-over tail still
//                    leaves room to read new data
//  dwFlags         - unless SSF_CASESENSITIVE is set, every buffer is run
//                    through CharUpperBuff() before it is searched. pszSearch
//                    is not touched here, so presumably the caller passes it
//                    already upper-cased in that case - TODO confirm.
//
// Returns FALSE when the word is not found, the parameters are bad, the word
// is too long for the buffer, or the stream cannot be sized/rewound/read.
// --------------------------------------------------------------------------------
#define CB_STREAMMATCH  0x00000FFF
BOOL FStreamStringSearch(LPSTREAM pstm, DWORD * pdwFlagsSearch, LPSTR pszSearch, ULONG cchSearch, DWORD dwFlags)
{
    BOOL            fRet = FALSE;
    ULONG           cbSave = 0;         // size of the tail carried from one read to the next
    CHAR            rgchBuff[CB_STREAMMATCH + 1];
    LPSTR           pszRead = NULL;     // where the next read lands inside rgchBuff
    ULONG           cbRead = 0;         // how much the next read may bring in
    ULONG           cbIn = 0;           // how much the last read brought in
    ULONG           cbTotal = 0;        // carried tail + newly read bytes
    ULONG           cbKeep = 0;         // bytes actually carried over this time
    ULONG           cchGood = 0;        // offset from rgchBuff where the searchable text ends
    CHAR            chSave = '\0';
    ULONG           cbSize = 0;         // bytes of the stream not read yet
    LPSTR           pszMatch = NULL;

    // Check incoming params
    if ((NULL == pstm) || (NULL == pszSearch) || (0 == cchSearch))
    {
        goto exit;
    }

    // We want to save off the lead char and a possible ending lead byte
    // in addition to the word itself. That tail has to fit in the buffer
    // with at least one byte to spare for new data, otherwise the reads
    // below could make no progress or run past the buffer. Comparing
    // before adding also keeps the sum from wrapping around.
    if (cchSearch >= (CB_STREAMMATCH - 2))
    {
        Assert(0);
        goto exit;
    }
    cbSave = cchSearch + 2;
    
    // Get the stream size
    if (FAILED(HrGetStreamSize(pstm, &cbSize)))
    {
        goto exit;
    }

    // Reset the stream to the beginning
    if (FAILED(HrRewindStream(pstm)))
    {
        goto exit;
    }

    // Set up the defaults
    pszRead = rgchBuff;
    cbRead = CB_STREAMMATCH;
    
    // Search for string through the entire stream
    while (cbSize > 0)
    {
        // Any success code is acceptable here: a stream is allowed to report
        // S_FALSE together with a final, shorter chunk that still needs to
        // be searched.
        cbIn = 0;
        if (FAILED(pstm->Read(pszRead, cbRead, &cbIn)))
        {
            break;
        }

        // We're done if we read nothing (or the count can't be trusted)
        if ((0 == cbIn) || (cbIn > cbRead))
        {
            break;
        }
        
        // Note that we've read the bytes
        cbSize -= (cbIn < cbSize) ? cbIn : cbSize;
        
        // Zero terminate the buffer
        cbTotal = (ULONG) (pszRead - rgchBuff) + cbIn;
        rgchBuff[cbTotal] = '\0';

        // Should we convert the buffer to upper case
        if (0 == (dwFlags & SSF_CASESENSITIVE))
        {
            cchGood = CharUpperBuff(rgchBuff, cbTotal);
        }
        else
        {
            // We need to spin over the buffer figuring out if the end character is a lead
            // byte without a corresponding tail byte
            for (cchGood = 0; cchGood < cbTotal; cchGood++)
            {
                if (IsDBCSLeadByte(rgchBuff[cchGood]))
                {
                    if ((cchGood + 1) >= cbTotal)
                    {
                        break;
                    }

                    cchGood++;
                }
            }
        }

        // Hide anything past the last complete character from the search
        chSave = rgchBuff[cchGood];
        rgchBuff[cchGood] = '\0';
        
        // Search for string
        if (FALSE != FWordPresent(rgchBuff, pdwFlagsSearch, pszSearch, cchSearch, &pszMatch))
        {
            // A match that runs right up to the end of the buffered text
            // can't be trusted while there is more stream to come: we can't
            // tell yet whether it is followed by a word break. The end of
            // the text is cchGood bytes from the start of rgchBuff (not
            // from pszRead, which sits after the carried-over tail).
            if ((0 == cbSize) || ((pszMatch + cchSearch) != (rgchBuff + cchGood)))
            {
                fRet = TRUE;
                break;
            }
        }
        
        // Are we done with the stream
        if (0 == cbSize)
        {
            break;
        }

        rgchBuff[cchGood] = chSave;
        
        // Save part of the buffer: carry the last cbSave bytes of what is
        // really in the buffer (all of it after a short read that left
        // less than that) to the front
        cbKeep = (cbTotal < cbSave) ? cbTotal : cbSave;
        MoveMemory(rgchBuff, rgchBuff + (cbTotal - cbKeep), cbKeep);

        // Figure out the new start of the buffer and the space left in it
        pszRead = rgchBuff + cbKeep;
        cbRead = CB_STREAMMATCH - cbKeep;
    }

exit:
    return(fRet);
}

// Produces a plain-text stream for an HTML stream by loading the HTML with
// MimeEditDocumentFromStream() and asking the resulting IDataObject for
// CF_TEXT on an IStream medium.
//
//  pIStmHtml  - the HTML to convert
//  ppIStmText - receives an AddRef'd stream on success, NULL otherwise
//
// Returns E_INVALIDARG for NULL parameters, E_FAIL if the data object hands
// back no stream, the failing HRESULT of MimeEditDocumentFromStream() or
// GetData(), and S_OK on success.
HRESULT HrConvertHTMLToPlainText(IStream * pIStmHtml, IStream ** ppIStmText)
{
    HRESULT         hrResult = E_INVALIDARG;
    IDataObject *   pDataObject = NULL;
    FORMATETC       format = {0};
    STGMEDIUM       medium = {0};

    if ((NULL != pIStmHtml) && (NULL != ppIStmText))
    {
        // Nothing to hand back until everything has worked
        *ppIStmText = NULL;

        hrResult = MimeEditDocumentFromStream(pIStmHtml, IID_IDataObject, (VOID **)&pDataObject);

        if (SUCCEEDED(hrResult))
        {
            // Ask for the whole content as text delivered on a stream
            format.cfFormat = CF_TEXT;
            format.dwAspect = DVASPECT_CONTENT;
            format.lindex = -1;
            format.tymed = TYMED_ISTREAM;

            hrResult = pDataObject->GetData(&format, &medium);
        }

        if (SUCCEEDED(hrResult))
        {
            if (NULL == medium.pstm)
            {
                hrResult = E_FAIL;
            }
            else
            {
                // The caller gets its own reference on the stream
                *ppIStmText = medium.pstm;
                (*ppIStmText)->AddRef();

                // ReleaseStgMedium() below releases pUnkForRelease, so take
                // an extra reference on it first
                if (NULL != medium.pUnkForRelease)
                {
                    (medium.pUnkForRelease)->AddRef();
                }

                hrResult = S_OK;
            }
        }
    }

    // Common cleanup for every path, including the early failures
    ReleaseStgMedium(&medium);
    ReleaseObj(pDataObject);
    return hrResult;
}