windows-server-2003/shell/ext/webcheck/webcrawl.cpp

// TODO: Allow trident to download frames (and process new html)
// nuke urlmon code (use trident always)

#include "private.h"
#include "shui.h"
#include "downld.h"
#include "subsmgrp.h"
#include <ocidl.h>

#include <initguid.h>

#include <mluisupp.h>

extern HICON g_webCrawlerIcon;
extern HICON g_channelIcon;
extern HICON g_desktopIcon;

void LoadDefaultIcons();

#undef TF_THISMODULE
#define TF_THISMODULE   TF_WEBCRAWL

#define _ERROR_REPROCESSING -1

// DWORD field of the m_pPages string list
const DWORD DATA_RECURSEMASK = 0x000000FF;  // Levels of recursion from this page
const DWORD DATA_DLSTARTED =   0x80000000;  // Have we started downloading
const DWORD DATA_DLFINISHED =  0x40000000;  // Have we finished this page
const DWORD DATA_DLERROR =     0x20000000;  // An error during download
const DWORD DATA_CODEBASE =    0x10000000;  // Is codebase
const DWORD DATA_LINK =        0x08000000;  // Is link from page (not dependency)

// DWORD field of m_pPendingLinks string list
const DWORD DATA_ROBOTSTXTMASK=0x00000FFF;  // index into m_pRobotsTxt list

// used internally; not actually stored in string list field
const DWORD DATA_ROBOTSTXT =   0x01000000;  // Is robots.txt

// m_pDependencyLinks uses m_pPages values

// DWORD field of m_pRobotsTxt is NULL or (CWCDwordStringList *)

// DWORD field of m_pRobotsTxt referenced string list
const DWORD DATA_ALLOW =        0x80000000;
const DWORD DATA_DISALLOW =     0x40000000;

const WCHAR c_wszRobotsMetaName[] = L"Robots\n";
const int c_iRobotsMetaNameLen = 7;        // string len without nullterm

const WCHAR c_wszRobotsNoFollow[] = L"NoFollow";
const int c_iRobotsNoFollow = 8;

const WCHAR c_wszRobotsTxtURL[] = L"/robots.txt";

const DWORD MAX_ROBOTS_SIZE = 8192;         // Max size of robots.txt file

// tokens for parsing of robots.txt
const CHAR  c_szRobots_UserAgent[] = "User-Agent:";
const CHAR  c_szRobots_OurUserAgent[] = "MSIECrawler";
const CHAR  c_szRobots_Allow[] = "Allow:";
const CHAR  c_szRobots_Disallow[] = "Disallow:";

// This GUID comes from Trident and is a hack for getting PARAM values for APPLET tags.
DEFINE_GUID(CGID_JavaParambagCompatHack, 0x3050F405, 0x98B5, 0x11CF, 0xBB, 0x82, 0x00, 0xAA, 0x00, 0xBD, 0xCE, 0x0B);

// This GUID is helpfully not defined elsewhere.
DEFINE_GUID(CLSID_JavaVM, 0x08B0E5C0, 0x4FCB, 0x11CF, 0xAA, 0xA5, 0x00, 0x40, 0x1C, 0x60, 0x85, 0x01);

// Get host channel agent's subscription item, if any.
inline HRESULT CWebCrawler::GetChannelItem(ISubscriptionItem **ppChannelItem)
{
    IServiceProvider *pSP;
    HRESULT hr = E_NOINTERFACE;

    if (ppChannelItem)
        *ppChannelItem = NULL;

    if (SUCCEEDED(m_pAgentEvents->QueryInterface(IID_IServiceProvider, (void **)&pSP)) && pSP)
    {
        ISubscriptionItem *pTempChannelItem = NULL;
        pSP->QueryService(CLSID_ChannelAgent, IID_ISubscriptionItem, (void **)&pTempChannelItem);
        pSP->Release();

        if(pTempChannelItem) 
            hr = S_OK;
            
        if(ppChannelItem)
            *ppChannelItem = pTempChannelItem;
        else
        {
            if(pTempChannelItem)
                pTempChannelItem->Release();    
        }
    }

    return hr;
}

//////////////////////////////////////////////////////////////////////////
//
// Helper functions - copied over from urlmon\download\helpers.cxx - Is there
// an equivalent routine or better place for this, webcrawl.cpp?
//
//////////////////////////////////////////////////////////////////////////

// ---------------------------------------------------------------------------
// %%Function: GetVersionFromString
//
//    converts version in text format (a,b,c,d) into two dwords (a,b), (c,d)
//    The printed version number is of format a.b.d (but, we don't care)
// ---------------------------------------------------------------------------
HRESULT
GetVersionFromString(const char *szBuf, LPDWORD pdwFileVersionMS, LPDWORD pdwFileVersionLS)
{
    const char *pch = szBuf;
    char ch;

    *pdwFileVersionMS = 0;
    *pdwFileVersionLS = 0;

    if (!pch)            // default to zero if none provided
        return S_OK;

    if (StrCmpA(pch, "-1,-1,-1,-1") == 0) {
        *pdwFileVersionMS = 0xffffffff;
        *pdwFileVersionLS = 0xffffffff;
    }

    USHORT n = 0;

    USHORT a = 0;
    USHORT b = 0;
    USHORT c = 0;
    USHORT d = 0;

    enum HAVE { HAVE_NONE, HAVE_A, HAVE_B, HAVE_C, HAVE_D } have = HAVE_NONE;


    for (ch = *pch++;;ch = *pch++) {

        if ((ch == ',') || (ch == '\0')) {

            switch (have) {

            case HAVE_NONE:
                a = n;
                have = HAVE_A;
                break;

            case HAVE_A:
                b = n;
                have = HAVE_B;
                break;

            case HAVE_B:
                c = n;
                have = HAVE_C;
                break;

            case HAVE_C:
                d = n;
                have = HAVE_D;
                break;

            case HAVE_D:
                return E_INVALIDARG; // invalid arg
            }

            if (ch == '\0') {
                // all done convert a,b,c,d into two dwords of version

                *pdwFileVersionMS = ((a << 16)|b);
                *pdwFileVersionLS = ((c << 16)|d);

                return S_OK;
            }

            n = 0; // reset

        } else if ( (ch < '0') || (ch > '9'))
            return E_INVALIDARG;    // invalid arg
        else
            n = n*10 + (ch - '0');


    } /* end forever */

    // NEVERREACHED
}

/////////////////////////////////////////////////////////////////////////////////////////
// CombineBaseAndRelativeURLs -
//         Three URLs are combined by following rules (this is used for finding the URL
//         to load Applet CABs from.)  Three inputs, the Base URL, the Code Base URL
//         and the file name URL.
//
//         If file name URL is absolute return it.
//         Otherwise if CodeBase URL is absolute combine it with filename and return.
//         Otherwise if Base URL is absolute, combine CodeBase and fileName URL, then
//            combine with Base URL and return it.
////////////////////////////////////////////////////////////////////////////////////////

HRESULT CombineBaseAndRelativeURLs(LPCWSTR szBaseURL, LPCWSTR szRelative1, LPWSTR *szRelative2)
{

    WCHAR wszTemp[INTERNET_MAX_URL_LENGTH];
    DWORD dwLen = ARRAYSIZE(wszTemp);

    ASSERT(szRelative2);                // should never happen.
    if (szRelative2 == NULL)
        return E_FAIL;

    if (IsValidURL(NULL, *szRelative2, 0) == S_OK)
        return S_OK;

    if (szRelative1 && (IsValidURL(NULL, szRelative1, 0) == S_OK))
    {

        if (SUCCEEDED(UrlCombineW((LPCWSTR)szRelative1, (LPCWSTR)*szRelative2, (LPWSTR)wszTemp, &dwLen, 0)))
        {
            BSTR bstrNew = SysAllocString(wszTemp);
            if (bstrNew)
            {
                SAFEFREEBSTR(*szRelative2);
                *szRelative2 = bstrNew;
                return S_OK;
            }
        }
    }

    if (szBaseURL && (IsValidURL(NULL, szBaseURL, 0) == S_OK))
    {
        LPWSTR szNewRel = NULL;
        WCHAR wszCombined[INTERNET_MAX_URL_LENGTH];

        if (szRelative1)
        {
            // NOTE: lstr[cpy|cat]W are macroed to work on Win95.
            DWORD dwLen2 = lstrlenW(*szRelative2);
            StrCpyNW(wszTemp, szRelative1, ARRAYSIZE(wszTemp) - 1); //paranoia
            DWORD dwTempLen = lstrlenW(wszTemp);
            if ((dwLen2 > 0) && ((*szRelative2)[dwLen2-1] == (unsigned short)L'\\') ||
                                ((*szRelative2)[dwLen2-1] == (unsigned short) L'/'))
            {
                StrNCatW(wszTemp, *szRelative2, ARRAYSIZE(wszTemp) - dwTempLen);
            }
            else
            {
                StrNCatW(wszTemp, L"/", ARRAYSIZE(wszTemp) - dwTempLen);
                StrNCatW(wszTemp, *szRelative2, ARRAYSIZE(wszTemp) - dwTempLen - 1);
            }

            szNewRel = wszTemp;
        }
        else
        {
            szNewRel = *szRelative2;
        }

        dwLen = INTERNET_MAX_URL_LENGTH;
        if (SUCCEEDED(UrlCombineW((LPCWSTR)szBaseURL, (LPCWSTR)szNewRel, (LPWSTR)wszCombined, &dwLen, 0)))
        {
            BSTR bstrNew = SysAllocString(wszCombined);
            if (bstrNew)
            {
                SAFEFREEBSTR(*szRelative2);
                *szRelative2 = bstrNew;
                return S_OK;
            }
        }
    }

    // In all likelyhood one of the URL's in bad and nothing good can be done.
    return E_FAIL;
}

//////////////////////////////////////////////////////////////////////////
//
// CWebCrawler implementation
//
//////////////////////////////////////////////////////////////////////////

//
// CWebCrawler Helpers
//

HRESULT CWebCrawler::CreateInstance(IUnknown *punkOuter, IUnknown **ppunk)
{
    HRESULT hr;

    ASSERT(NULL == punkOuter);
    ASSERT(NULL != ppunk);
    
    CWebCrawler *pwc = new CWebCrawler;

    if (NULL != pwc)
    {
        hr = pwc->Initialize();

        if (FAILED(hr))
        {
            pwc->Release();
        }
    }
    else
    {
        hr = E_OUTOFMEMORY;
    }

    if (SUCCEEDED(hr))
    {
        *ppunk = (ISubscriptionAgentControl *)pwc;
    }

    return hr;
}

HRESULT CWebCrawler::Initialize()
{
    m_hrCritDependencies = InitializeCriticalSectionAndSpinCount(&m_critDependencies, 0) ? S_OK : E_OUTOFMEMORY;

    return m_hrCritDependencies;
}

CWebCrawler::CWebCrawler()
{
    DBG("Creating CWebCrawler object");
}

CWebCrawler::~CWebCrawler()
{
    _CleanUp();

    if (SUCCEEDED(m_hrCritDependencies))
    {
        DeleteCriticalSection(&m_critDependencies);
    }
    DBG("Destroyed CWebCrawler object");
}

void CWebCrawler::CleanUp()
{
    _CleanUp();
    CDeliveryAgent::CleanUp();
}

void CWebCrawler::_CleanUp()
{
    if (m_pCurDownload)
    {
        m_pCurDownload->DoneDownloading();
        m_pCurDownload->Release();
        m_pCurDownload = NULL;
    }

    CRunDeliveryAgent::SafeRelease(m_pRunAgent);

    SAFEFREEBSTR(m_bstrHostName);
    SAFEFREEBSTR(m_bstrBaseURL);
    SAFELOCALFREE(m_pszLocalDest);
    SAFELOCALFREE(m_pBuf);

    EnterCriticalSection(&m_critDependencies);
    SAFEDELETE(m_pDependencies);
    LeaveCriticalSection(&m_critDependencies);
    if (m_pDownloadNotify)
    {
        m_pDownloadNotify->LeaveMeAlone();
        m_pDownloadNotify->Release();
        m_pDownloadNotify=NULL;
    }

    SAFEDELETE(m_pPages);
    SAFEDELETE(m_pPendingLinks);
    SAFEDELETE(m_pDependencyLinks);
    SAFERELEASE(m_pUrlIconHelper);

    FreeRobotsTxt();
    FreeCodeBaseList();
}

// Format of m_pRobotsTxt:
// Array of hostnames for which we have attempted to get Robots.txt
// DWORD for each hostname contains pointer to CDwordStringList of Robots.txt data,
//  or 0 if we couldn't find robots.txt for that host name
// Robots.txt data stored in form: url, flag = allow or disallow
void CWebCrawler::FreeRobotsTxt()
{
    if (m_pRobotsTxt)
    {
        DWORD_PTR dwPtr;
        int iLen = m_pRobotsTxt->NumStrings();
        for (int i=0; i<iLen; i++)
        {
            dwPtr = m_pRobotsTxt->GetStringData(i);
            if (dwPtr)
            {
                delete ((CWCStringList *)dwPtr);
                m_pRobotsTxt->SetStringData(i, 0);
            }
        }

        delete m_pRobotsTxt;
        m_pRobotsTxt = NULL;
    }
}

void CWebCrawler::FreeCodeBaseList()
{
    if (m_pCodeBaseList) {
        CCodeBaseHold *pcbh;
        int iLen = m_pCodeBaseList->NumStrings();
        for (int i=0; i<iLen; i++)
        {
            pcbh = (CCodeBaseHold *)m_pCodeBaseList->GetStringData(i);
            if (pcbh != NULL)
            {
                SAFEFREEBSTR(pcbh->szDistUnit);
                SAFEDELETE(pcbh);
                m_pCodeBaseList->SetStringData(i, 0);
            }
        }

        SAFEDELETE(m_pCodeBaseList);
    }
}

HRESULT CWebCrawler::StartOperation()
{
    ISubscriptionItem *pItem = m_pSubscriptionItem;

    DWORD           dwTemp;

    ASSERT(pItem);

    DBG("CWebCrawler in StartOperation");

    if (m_pCurDownload || GetBusy())
    {
        DBG_WARN("Webcrawl busy, returning failure");
        return E_FAIL;
    }

    SAFEFREEBSTR(m_bstrBaseURL);
    if (FAILED(
        ReadBSTR(pItem, c_szPropURL, &m_bstrBaseURL)) ||
        !m_bstrBaseURL ||
        !CUrlDownload::IsValidURL(m_bstrBaseURL))
    {
        DBG_WARN("Couldn't get valid URL, aborting");
        SetEndStatus(E_INVALIDARG);
        SendUpdateNone();
        return E_INVALIDARG;
    }

    if (SHRestricted2W(REST_NoSubscriptionContent, NULL, 0))
        SetAgentFlag(FLAG_CHANGESONLY);

    if (IsAgentFlagSet(FLAG_CHANGESONLY))
    {
        m_dwRecurseLevels = 0;
        m_dwRecurseFlags = WEBCRAWL_DONT_MAKE_STICKY;
        DBG("Webcrawler is in 'changes only' mode.");
    }
    else
    {
/*
        BSTR bstrLocalDest=NULL;
        SAFELOCALFREE(m_pszLocalDest);
        ReadBSTR(c_szPropCrawlLocalDest, &bstrLocalDest);
        if (bstrLocalDest && bstrLocalDest[0])
        {
            int iLen = SysStringByteLen(bstrLocalDest)+1;
            m_pszLocalDest = (LPTSTR) MemAlloc(LMEM_FIXED, iLen);
            if (m_pszLocalDest)
            {
                MyOleStrToStrN(m_pszLocalDest, iLen, bstrLocalDest);
            }
        }
        SAFEFREEBSTR(bstrLocalDest);
*/

        m_dwRecurseLevels=0;
        ReadDWORD(pItem, c_szPropCrawlLevels, &m_dwRecurseLevels);

        if (!IsAgentFlagSet(DELIVERY_AGENT_FLAG_NO_RESTRICTIONS))
        {
            // Note: MaxWebcrawlLevels is stored as N+1 because 0
            // disables the restriction
            dwTemp = SHRestricted2W(REST_MaxWebcrawlLevels, NULL, 0);
            if (dwTemp && m_dwRecurseLevels >= dwTemp)
                m_dwRecurseLevels = dwTemp - 1;
        }

        m_dwRecurseFlags=0;
        ReadDWORD(pItem, c_szPropCrawlFlags, &m_dwRecurseFlags);

        // Read max size in cache in KB
        m_dwMaxSize=0;
        ReadDWORD(pItem, c_szPropCrawlMaxSize, &m_dwMaxSize);
        if (!IsAgentFlagSet(DELIVERY_AGENT_FLAG_NO_RESTRICTIONS))
        {
            dwTemp = SHRestricted2W(REST_MaxSubscriptionSize, NULL, 0);
            if (dwTemp && (!m_dwMaxSize || m_dwMaxSize > dwTemp))
                m_dwMaxSize = dwTemp;
        }

        if (IsRecurseFlagSet(WEBCRAWL_DONT_MAKE_STICKY))
            dwTemp = 0;

        // Read old group ID
        ReadLONGLONG(pItem, c_szPropCrawlGroupID, &m_llOldCacheGroupID);

        // Read new ID if present
        m_llCacheGroupID = 0;
        ReadLONGLONG(pItem, c_szPropCrawlNewGroupID, &m_llCacheGroupID);
        if (m_llCacheGroupID)
        {
            DBG("Adding to existing cache group");
        }
    } // !ChangesOnly

    // finish initializing new operation
    m_iDownloadErrors = 0;
    m_dwCurSize = 0;
    m_lMaxNumUrls = (m_dwRecurseLevels) ? -1 : 1;
    SAFEFREEBSTR(m_bstrHostName);

    m_dwCurSize = NULL;
    m_pPages = NULL;
    m_pDependencies = NULL;

    // After calling this, we'll reenter either in "StartDownload" (connection successful)
    //  or in "AbortUpdate" with GetEndStatus() == INET_E_AGENT_CONNECTION_FAILED
    return CDeliveryAgent::StartOperation();
}

HRESULT CWebCrawler::AgentPause(DWORD dwFlags)
{
    DBG("CWebCrawler::AgentPause");

    // Abort our current url
    if (m_pRunAgent)
    {
        m_pRunAgent->AgentPause(dwFlags);
    }

    if (m_pCurDownload)
    {
        m_pCurDownload->AbortDownload();
        m_pCurDownload->DestroyBrowser();
    }

    return CDeliveryAgent::AgentPause(dwFlags);
}

HRESULT CWebCrawler::AgentResume(DWORD dwFlags)
{
    DBG("CWebCrawler::AgentResume");

    if (m_pRunAgent)
    {
        m_pRunAgent->AgentResume(dwFlags);
    }
    else
    {
        // If we just increased our cache size, reprocess same url
        if (SUBSCRIPTION_AGENT_RESUME_INCREASED_CACHE & dwFlags)
        {
            DBG("CWebCrawler reprocessing same url after cache size increase");
            OnDownloadComplete(0, _ERROR_REPROCESSING);
        }
        else
        {
            // If we're not still downloading, restart our same url
            if (0 == m_iNumPagesDownloading)
            {
                if (FAILED(ActuallyStartDownload(m_pCurDownloadStringList, m_iCurDownloadStringIndex, TRUE)))
                {
                    ASSERT_MSG(0, "CWebCrawler::AgentResume"); // this should never happen
                    SetEndStatus(E_FAIL);
                    CleanUp();
                }
            }
        }
    }

    return CDeliveryAgent::AgentResume(dwFlags);
}

// Forcibly abort current operation
HRESULT CWebCrawler::AgentAbort(DWORD dwFlags)
{
    DBG("CWebCrawler::AgentAbort");

    if (m_pCurDownload)
    {
        m_pCurDownload->DoneDownloading();
    }

    if (m_pRunAgent)
    {
        m_pRunAgent->AgentAbort(dwFlags);
    }

    return CDeliveryAgent::AgentAbort(dwFlags);
}

//---------------------------------------------------------------
//

HRESULT CWebCrawler::StartDownload()
{
    ASSERT(!m_pCurDownload);

    m_iPagesStarted = 0;
    m_iRobotsStarted = 0;
    m_iDependencyStarted = 0;
    m_iDependenciesProcessed = 0;
    m_iTotalStarted = 0;
    m_iCodeBaseStarted = 0;
    m_iNumPagesDownloading = 0;

    // Create new cache group
    if (IsAgentFlagSet(FLAG_CHANGESONLY))
    {
        m_llCacheGroupID = 0;
    }
    else
    {
        if (!m_llCacheGroupID)
        {
            m_llCacheGroupID = CreateUrlCacheGroup(
                (IsRecurseFlagSet(WEBCRAWL_DONT_MAKE_STICKY) ? 0 : CACHEGROUP_FLAG_NONPURGEABLE), 0);

            ASSERT_MSG(m_llCacheGroupID != 0, "Create cache group failed");
        }
    }

    // Create string lists
    m_pPages = new CWCDwordStringList;
    if (m_pPages)
        m_pPages->Init(m_dwRecurseLevels ? -1 : 512);
    else
        SetEndStatus(E_FAIL);

    if (m_dwRecurseLevels && !IsRecurseFlagSet(WEBCRAWL_IGNORE_ROBOTSTXT))
    {
        m_pRobotsTxt = new CWCDwordStringList;
        if (m_pRobotsTxt)
            m_pRobotsTxt->Init(512);
        else
            SetEndStatus(E_FAIL);
    }

    // FEATURE : Shouldn't allocate this memory in changes only mode
    m_pCodeBaseList = new CWCDwordStringList;
    if (m_pCodeBaseList)
        m_pCodeBaseList->Init(512);
    else
        SetEndStatus(E_FAIL);

    // Avoid duplicate processing of dependencies
    if (!IsAgentFlagSet(FLAG_CHANGESONLY))
    {
        m_pDependencies = new CWCDwordStringList;
        if (m_pDependencies)
            m_pDependencies->Init();
        else
            SetEndStatus(E_FAIL);
    }

    if (GetEndStatus() == E_FAIL)
        return E_FAIL;

    m_pCurDownload = new CUrlDownload(this, 0);
    if (!m_pCurDownload)
        return E_OUTOFMEMORY;

    // Add first URL to string list, then start it
    if ((CWCStringList::STRLST_ADDED == m_pPages->AddString(m_bstrBaseURL, m_dwRecurseLevels)) &&
        m_pPages->NumStrings() == 1)
    {
        return StartNextDownload();
    }

    SetEndStatus(E_FAIL);
    return E_FAIL;
}

// Attempts to begin the next download
HRESULT CWebCrawler::StartNextDownload()
{
    if (!m_pPages || m_iNumPagesDownloading)
        return E_FAIL;

    CWCStringList *pslUrls = NULL;
    int iIndex = 0;

    // See if we have any more URLs to download.
    // Check dependency links first
    if (m_pDependencyLinks)
    {
        ProcessDependencyLinks(&pslUrls, &iIndex);
#ifdef DEBUG
        if (pslUrls) DBG("Downloading dependency link (frame):");
#endif
    }

    if (!pslUrls)
    {
        // Check robots.txt
        if (m_pRobotsTxt && (m_iRobotsStarted < m_pRobotsTxt->NumStrings()))
        {
            pslUrls = m_pRobotsTxt;
            iIndex = m_iRobotsStarted ++;
        }
        else if (m_pPendingLinks)   // add pending links to pages list
        {
            // Pending links to process and we've retrieved all robots.txt
            // Process pending links (validate & add to download list)
            ProcessPendingLinks();
        }

        if (!pslUrls && (m_iPagesStarted < m_pPages->NumStrings()))
        {
            DWORD_PTR dwTmp;
            ASSERT(!m_pDependencyLinks);// should be downloaded already
            ASSERT(!m_pPendingLinks);   // should be validated already
            // Skip any pages we've started
            while (m_iPagesStarted < m_pPages->NumStrings())
            {
                dwTmp = m_pPages->GetStringData(m_iPagesStarted);
                if (IsFlagSet(dwTmp, DATA_DLSTARTED))
                    m_iPagesStarted++;
                else
                    break;
            }
            if (m_iPagesStarted < m_pPages->NumStrings())
            {
                pslUrls = m_pPages;
                iIndex = m_iPagesStarted ++;
            }
        }

        if (!pslUrls && (m_iCodeBaseStarted < m_pCodeBaseList->NumStrings()))
        {
            // Nothing else pull, do code bases last.

            while (m_iCodeBaseStarted < m_pCodeBaseList->NumStrings())
            {
                CCodeBaseHold *pcbh = (CCodeBaseHold *)
                                    m_pCodeBaseList->GetStringData(m_iCodeBaseStarted);
                if (IsFlagSet(pcbh->dwFlags, DATA_DLSTARTED))
                    m_iCodeBaseStarted++;
                else
                    break;
            }
            while (m_iCodeBaseStarted < m_pCodeBaseList->NumStrings())
            {
                // We have some codebases to download.
                // We return if the download is async and simply
                // start the next one if it finishes synchronously
                iIndex = m_iCodeBaseStarted;
                m_iCodeBaseStarted++; // increment so that next download is not repeated

                // Init the cur download infor for resume if paused
                m_iCurDownloadStringIndex = iIndex;
                m_pCurDownloadStringList = m_pCodeBaseList;
                
               if(ActuallyDownloadCodeBase(m_pCodeBaseList, iIndex, FALSE) == E_PENDING)
                    return S_OK; // We break out of the while and try next download in OnAgentEnd()

            }
        }
    }

    if (pslUrls)
    {
        m_iCurDownloadStringIndex = iIndex;
        m_pCurDownloadStringList = pslUrls;

        return ActuallyStartDownload(pslUrls, iIndex);
    }

    DBG("WebCrawler: StartNextDownload failing, nothing more to download.");
    return E_FAIL;
}

HRESULT CWebCrawler::ActuallyStartDownload(CWCStringList *pslUrls, int iIndex, BOOL fReStart /* = FALSE */)
{
    // We have urls to download. Do it.
    DWORD_PTR dwData;
    LPCWSTR pwszURL;
    DWORD   dwBrowseFlags;
    BDUMethod method;
    BDUOptions options;

    if(pslUrls == m_pCodeBaseList)
    {
        ASSERT(fReStart); // Should happen only with resume
        HRESULT hr = ActuallyDownloadCodeBase(m_pCodeBaseList, iIndex, fReStart);
        if(E_PENDING == hr)
            return S_OK;
        return E_FAIL; // hackhack - since we don't handle synchronous downloads well - we hang if 
                       // resumed download is synchronous
    }

    if (pslUrls != m_pRobotsTxt)
    {
        dwData = pslUrls->GetStringData(iIndex);
#ifdef DEBUG
        if (fReStart)
            if (~(dwData & DATA_DLSTARTED)) DBG_WARN("WebCrawler: Trying to restart one we haven't started yet!");
        else
            if ((dwData & DATA_DLSTARTED)) DBG_WARN("WebCrawler: Trying to download one we've already started?");
#endif
        pslUrls->SetStringData(iIndex, DATA_DLSTARTED | dwData);
    }
    else
        dwData = DATA_ROBOTSTXT;

    pwszURL = pslUrls->GetString(iIndex);

    ASSERT(iIndex < pslUrls->NumStrings());

#ifdef DEBUG
    int iMax = m_lMaxNumUrls;
    if (iMax<0)
        iMax = m_pPages->NumStrings() + ((m_pRobotsTxt) ? m_pRobotsTxt->NumStrings() : 0);
    TraceMsgA(TF_THISMODULE, "WebCrawler GET_URL (%d of %c%d) Recurse %d : %ws",
        m_iTotalStarted+1, ((m_lMaxNumUrls>0) ? ' ' : '?'), iMax,
        pslUrls->GetStringData(iIndex) & DATA_RECURSEMASK, pwszURL);
#endif

    dwBrowseFlags = DLCTL_DOWNLOADONLY |
        DLCTL_NO_FRAMEDOWNLOAD | DLCTL_NO_SCRIPTS | DLCTL_NO_JAVA |
        DLCTL_NO_RUNACTIVEXCTLS;

    if (IsRecurseFlagSet(WEBCRAWL_GET_IMAGES))      dwBrowseFlags |= DLCTL_DLIMAGES;
    if (IsRecurseFlagSet(WEBCRAWL_GET_VIDEOS))      dwBrowseFlags |= DLCTL_VIDEOS;
    if (IsRecurseFlagSet(WEBCRAWL_GET_BGSOUNDS))    dwBrowseFlags |= DLCTL_BGSOUNDS;
    if (!IsRecurseFlagSet(WEBCRAWL_GET_CONTROLS))   dwBrowseFlags |= DLCTL_NO_DLACTIVEXCTLS;
    if (IsRecurseFlagSet(WEBCRAWL_PRIV_OFFLINE_MODE))
    {
        dwBrowseFlags |= DLCTL_FORCEOFFLINE;
        dwBrowseFlags &= ~(DLCTL_DLIMAGES | DLCTL_VIDEOS | DLCTL_BGSOUNDS);
        DBG("GET is OFFLINE");
    }

    m_pCurDownload->SetDLCTL(dwBrowseFlags);

#ifdef DEBUG
    if (fReStart)
    {
        ASSERT(m_iCurDownloadStringIndex == iIndex);
        ASSERT(m_pCurDownloadStringList == pslUrls);
    }
#endif

    if (!fReStart)
    {
        // Get the info for change detection, unless we already know it's changed
        if (!IsAgentFlagSet(FLAG_CRAWLCHANGED) && !(dwData & DATA_ROBOTSTXT))
        {
            TCHAR   szUrl[INTERNET_MAX_URL_LENGTH];

            m_varChange.vt = VT_EMPTY;

            if (IsAgentFlagSet(FLAG_CHANGESONLY))
            {
                // "Changes Only" mode, we have persisted a change detection code
                ASSERT(m_iTotalStarted == 0);
                LPCWSTR pPropChange = c_szPropChangeCode;
                m_pSubscriptionItem->ReadProperties(1, &pPropChange, &m_varChange);
            }

            BOOL fMustGET = TRUE;

            MyOleStrToStrN(szUrl, INTERNET_MAX_URL_LENGTH, pwszURL);
            PreCheckUrlForChange(szUrl, &m_varChange, &fMustGET);

            if (IsAgentFlagSet(FLAG_CHANGESONLY) && !fMustGET)
                SetAgentFlag(FLAG_HEADONLY);
        }

        m_iTotalStarted ++;
    }

    if (IsPaused())
    {
        DBG("WebCrawler paused, not starting another download");
        if (m_pCurDownload)
            m_pCurDownload->DestroyBrowser(); // free browser until resumed
        return E_PENDING;
    }

    m_iNumPagesDownloading ++;

    // Send our update progress with the url we're about to download
    SendUpdateProgress(pwszURL, m_iTotalStarted, m_lMaxNumUrls, (m_dwCurSize >> 10));

    if (IsAgentFlagSet(FLAG_HEADONLY))
    {
        ASSERT(m_iTotalStarted == 1);
        method = BDU2_HEADONLY;                 // Only get HEAD info with Urlmon
    }
    else if (IsAgentFlagSet(FLAG_CHANGESONLY)   // Only want HTML, or
        || m_pszLocalDest                       // We're going to move this one file, or
        || (dwData & DATA_ROBOTSTXT))           // This is a robots.txt, so
    {
        method = BDU2_URLMON;                   // Get with Urlmon
    }
    else if (m_iTotalStarted == 1)              // First file, we need status code, so
    {
        ISubscriptionItem *pCDFItem;
        method = BDU2_SNIFF;                    // Get with Urlmon then MSHTML (if HTML)

        // Find out if we're hosted by channel agent
        if (SUCCEEDED(GetChannelItem(&pCDFItem)))
        {
            // If we're hosted by channel agent, use its original hostname
            BSTR bstrBaseUrl;
            if (SUCCEEDED(ReadBSTR(pCDFItem, c_szPropURL, &bstrBaseUrl)))
            {
                GetHostName(bstrBaseUrl, &m_bstrHostName);
                SysFreeString(bstrBaseUrl);
            }
#ifdef DEBUG
            if (m_bstrHostName)
                TraceMsg(TF_THISMODULE, "Got host name from channel agent: %ws", m_bstrHostName);
#endif
            pCDFItem->Release();

            DBG("Using 'smart' mode for first url in webcrawl; spawned from channel crawl");
            method = BDU2_SMART;                // Use 'smart' mode for first url if channel crawl
            SetAgentFlag(FLAG_HOSTED);
        }
    }
    else
        method = BDU2_SMART;                    // Get with Urlmon or MSHTML as appropriate

    if (dwData & DATA_ROBOTSTXT)
        options = BDU2_NEEDSTREAM;              // Need IStream to parse robots.txt
    else
        options = BDU2_NONE;

    options |= BDU2_DOWNLOADNOTIFY_REQUIRED;    // Always get download notify callbacks

    if (IsRecurseFlagSet(WEBCRAWL_ONLY_LINKS_TO_HTML) && (dwData & DATA_LINK))
    {
        // Don't follow any links unless they are to html pages.
        options |= BDU2_FAIL_IF_NOT_HTML;
    }

    if (FAILED(m_pCurDownload->BeginDownloadURL2(pwszURL,
            method, options, m_pszLocalDest, 
            m_dwMaxSize ? (m_dwMaxSize<<10)-m_dwCurSize : 0)))
    {
        DBG("BeginDownloadURL2 failed (ignoring & waiting for OnDownloadComplete call)");
    }

    return S_OK;
}

HRESULT CWebCrawler::ActuallyDownloadCodeBase(CWCStringList *pslUrls, int iIndex, BOOL fReStart)
{
    CCodeBaseHold *pcbh;
    LPCWSTR pwszURL;
    HRESULT hr = S_OK;

    if (pslUrls != m_pCodeBaseList)
    {
        ASSERT(0);
        DBG_WARN("WebCrawler: Wrong URLs being processed as CodeBase.");
        hr = E_FAIL;
        goto Exit;
    }

    pcbh = (CCodeBaseHold *)pslUrls->GetStringData(iIndex);

#ifdef DEBUG
    if (fReStart)
        if (~(pcbh->dwFlags & DATA_DLSTARTED)) DBG_WARN("WebCrawler: Trying to restart CodeBase D/L we haven't started yet!");
    else
        if ((pcbh->dwFlags & DATA_DLSTARTED)) DBG_WARN("WebCrawler: Trying to download CodeBase D/L we've already started?");
#endif
    pcbh->dwFlags |= DATA_DLSTARTED;

    pwszURL = pslUrls->GetString(iIndex);

    ASSERT(iIndex < pslUrls->NumStrings());

    if (!fReStart)
        m_iTotalStarted ++;

    if (IsPaused())
    {
        DBG("WebCrawler paused, not starting another download");
        if (m_pCurDownload)
            m_pCurDownload->DestroyBrowser(); // free browser until resumed
        return S_FALSE;
    }

    m_iNumPagesDownloading ++;

    // Send our update progress with the CODEBASE we're about to download
    SendUpdateProgress(pwszURL, m_iTotalStarted, m_lMaxNumUrls);

    if (m_pRunAgent)
    {
        ASSERT(0);
        DBG_WARN("WebCrawler: Attempting to download next CODEBASE when not done last one.");
        hr = E_FAIL;
        goto Exit;
    }
    else
    {
        // create subscription item for CDL agent.

        ISubscriptionItem *pItem = NULL;

        if (m_dwMaxSize && ((m_dwCurSize>>10) >= m_dwMaxSize))
        {
            // We've exceeded our maximum download KB limit and can't continue.
            DBG_WARN("WebCrawler: Exceeded Maximum KB download limit with CodeBase download.");
            SetEndStatus(hr = INET_E_AGENT_MAX_SIZE_EXCEEDED);
            goto Exit;
        }

        if (!m_pSubscriptionItem ||
            FAILED(hr = DoCloneSubscriptionItem(m_pSubscriptionItem, NULL, &pItem)))
        {
            goto Exit;
        }
        ASSERT(pItem != NULL);

        WriteOLESTR(pItem, c_szPropURL, pwszURL);
        WriteOLESTR(pItem, L"DistUnit", pcbh->szDistUnit);
        WriteDWORD(pItem, L"VersionMS", pcbh->dwVersionMS);
        WriteDWORD(pItem, L"VersionLS", pcbh->dwVersionLS);
        if (m_dwMaxSize)
            WriteDWORD(pItem, c_szPropCrawlMaxSize, m_dwMaxSize - (m_dwCurSize>>10));    // KB limit for us to pull.

        m_pRunAgent = new CRunDeliveryAgent();
        if (m_pRunAgent)
            hr = m_pRunAgent->Init((CRunDeliveryAgentSink *)this, pItem, CLSID_CDLAgent);
        pItem->Release();

        if (m_pRunAgent && SUCCEEDED(hr))
        {
            hr = m_pRunAgent->StartAgent();
            //if (hr == E_PENDING)
            //{
                //hr = S_OK;
            //}
        }
        else
        {
            hr = E_OUTOFMEMORY;
        }
    }

Exit:
    return hr;

}

HRESULT CWebCrawler::ProcessDependencyLinks(CWCStringList **ppslUrls, int *piStarted)
{
    ASSERT(ppslUrls && !*ppslUrls && piStarted);

    int iIndex;
    DWORD_PTR dwData;

    if (!m_pDependencyLinks)
        return S_FALSE;

    // See if we have any more dependency links to download
    while (m_iDependencyStarted < m_pDependencyLinks->NumStrings())
    {
        if (!m_pPages->FindString(m_pDependencyLinks->GetString(m_iDependencyStarted),
                               m_pDependencyLinks->GetStringLen(m_iDependencyStarted), &iIndex))
        {
            ASSERT(0);  // find string failed?!? We added it above!
            return E_FAIL;
        }

        ASSERT(iIndex>=0 && iIndex<m_pPages->NumStrings());

        m_iDependencyStarted ++;

        // See if we've downloaded this yet.
        dwData = m_pPages->GetStringData(iIndex);
        if (!(dwData & DATA_DLSTARTED))
        {
            // Nope. Start download.
            *ppslUrls = m_pPages;
            *piStarted = iIndex;
            return S_OK;
        }

        // We have already downloaded this page. Go to next dependency link.
    }

    // Done processing. Clear for next page.
    SAFEDELETE(m_pDependencyLinks);

    return S_FALSE;
}

HRESULT CWebCrawler::ProcessPendingLinks()
{
    int         iNumLinks, iAddCode, i, iAddIndex, iRobotsIndex;
    LPCWSTR     pwszUrl;
    BOOL        fAllow;

    if (!m_pPendingLinks)
        return S_FALSE;

    ASSERT(m_lMaxNumUrls<0);
    ASSERT(0 == (m_dwPendingRecurseLevel & ~DATA_RECURSEMASK));

    iNumLinks = m_pPendingLinks->NumStrings();

    TraceMsg(TF_THISMODULE, "Processing %d pending links from %ws",
        iNumLinks, m_pPages->GetString(m_iPagesStarted-1));

    // Add the links to our global page list
    for (i=0; i<iNumLinks; i++)
    {
        // Validate with robots.txt if appropriate
        pwszUrl = m_pPendingLinks->GetString(i);
        iRobotsIndex = (int)(m_pPendingLinks->GetStringData(i) & DATA_ROBOTSTXTMASK);
        ValidateWithRobotsTxt(pwszUrl, iRobotsIndex, &fAllow);

        if (fAllow)
        {
/*
As long as we retrieve pages in decreasing-recursion order (top to bottom), we don't
have to worry about bumping pages to a higher recurse level (except for frames).
*/
            iAddCode = m_pPages->AddString(pwszUrl,
                        DATA_LINK | m_dwPendingRecurseLevel,
                        &iAddIndex);
            if (iAddCode == CWCStringList::STRLST_FAIL)
                break;
        }
    }
    SAFEDELETE(m_pPendingLinks);

    return S_OK;
}


// Combine with our base url to get full url
// We use this for frames, but also for <Link> tags, since the processing is identical
HRESULT CWebCrawler::CheckFrame(IUnknown *punkItem, BSTR *pbstrItem, DWORD_PTR dwBaseUrl, DWORD *pdwStringData)
{
    WCHAR   wszCombined[INTERNET_MAX_URL_LENGTH];
    DWORD   dwLen = ARRAYSIZE(wszCombined);

    ASSERT(pbstrItem && *pbstrItem && punkItem && dwBaseUrl);
    if (!pbstrItem || !*pbstrItem || !punkItem || !dwBaseUrl)
        return E_FAIL;      // bogus

    if (SUCCEEDED(UrlCombineW((LPCWSTR)dwBaseUrl, *pbstrItem, wszCombined, &dwLen, 0)))
    {
        BSTR bstrNew = SysAllocString(wszCombined);

        if (bstrNew)
        {
            SysFreeString(*pbstrItem);
            *pbstrItem = bstrNew;
            return S_OK;
        }
    }

    TraceMsg(TF_WARNING, "CWebCrawler::CheckFrame failing. Not getting frame or <link> url=%ws.", *pbstrItem);
    return E_FAIL;  // Couldn't combine url; don't add
}

// See if we should follow this link. Clears pbstrItem if not.
// Accepts either pLink or pArea
HRESULT CWebCrawler::CheckLink(IUnknown *punkItem, BSTR *pbstrItem, DWORD_PTR dwThis, DWORD *pdwStringData)
{
    HRESULT         hrRet = S_OK;
    CWebCrawler    *pThis = (CWebCrawler *)dwThis;

    ASSERT(pbstrItem && *pbstrItem && punkItem && dwThis);
    if (!pbstrItem || !*pbstrItem || !punkItem || !dwThis)
        return E_FAIL;      // bogus

    // First see if it's 'valid'
    // We only add the link if it's HTTP (or https)
    // (we don't want to get mailto: links, for example)
    if (CUrlDownload::IsValidURL(*pbstrItem))
    {
        // Strip off any anchor
        CUrlDownload::StripAnchor(*pbstrItem);
    }
    else
    {
        // Skip this link
        SysFreeString(*pbstrItem);
        *pbstrItem = NULL;
        return S_FALSE;
    }

    if (pThis->IsRecurseFlagSet(WEBCRAWL_ONLY_LINKS_TO_HTML))
    {
        // See if we can tell that this is not an HTML link
        if (CUrlDownload::IsNonHtmlUrl(*pbstrItem))
        {
            // Skip this link
            SysFreeString(*pbstrItem);
            *pbstrItem = NULL;
            return S_FALSE;
        }
    }

    if (!(pThis->IsRecurseFlagSet(WEBCRAWL_LINKS_ELSEWHERE)))
    {
        BSTR bstrHost=NULL;
        IHTMLAnchorElement *pLink=NULL;
        IHTMLAreaElement *pArea=NULL;

        // Check to see if the host names match
        punkItem->QueryInterface(IID_IHTMLAnchorElement, (void **)&pLink);

        if (pLink)
        {
            pLink->get_hostname(&bstrHost);
            pLink->Release();
        }
        else
        {
            punkItem->QueryInterface(IID_IHTMLAreaElement, (void **)&pArea);

            if (pArea)
            {
                pArea->get_hostname(&bstrHost);
                pArea->Release();
            }
            else
            {
                DBG_WARN("CWebCrawler::CheckLink Unable to get Area or Anchor interface!");
                return E_FAIL;      // Bad element
            }
        }

        if (!bstrHost || !*bstrHost)
        {
            DBG_WARN("CWebCrawler::CheckLink : (pLink|pArea)->get_hostname() failed");
            hrRet = S_OK;      // always accept if get_hostname fails
        }
        else
        {
            if (pThis->m_bstrHostName && MyAsciiCmpW(bstrHost, pThis->m_bstrHostName))
            {
                // Skip url; different host name.
                SAFEFREEBSTR(*pbstrItem);
                hrRet = S_FALSE;
            }
        }

        SAFEFREEBSTR(bstrHost);
    }

    if (*pbstrItem && pdwStringData)
    {
        pThis->GetRobotsTxtIndex(*pbstrItem, TRUE, pdwStringData);
        *pdwStringData &= DATA_ROBOTSTXTMASK;
    }
    else if (pdwStringData)
        *pdwStringData = 0;

    return hrRet;
}

// S_OK    : Already retrieved this robots.txt info
// S_FALSE : Haven't yet retrieved this robots.txt info
// E_*     : Bad
HRESULT CWebCrawler::GetRobotsTxtIndex(LPCWSTR pwszUrl, BOOL fAddToList, DWORD *pdwRobotsTxtIndex)
{
    HRESULT hr=S_OK;
    int    iIndex=-1;

    if (m_pRobotsTxt)
    {
        // See which robots.txt file we should use to validate this link
        // If not yet available, add it to the list to be downloaded
        DWORD  dwBufLen = lstrlenW(pwszUrl) + ARRAYSIZE(c_wszRobotsTxtURL); //This get's us a terminating NULL
        LPWSTR pwszRobots = (LPWSTR)MemAlloc(LMEM_FIXED, dwBufLen * sizeof(WCHAR));
        int    iAddCode;

        if (pwszRobots)
        {
            // PERF: do the internetcombine in startnextdownload
            if (SUCCEEDED(UrlCombineW(pwszUrl, c_wszRobotsTxtURL, pwszRobots, &dwBufLen, 0))
                && !memcmp(pwszRobots, L"http", 4 * sizeof(WCHAR)))
            {
                if (fAddToList)
                {
                    iAddCode = m_pRobotsTxt->AddString(pwszRobots, 0, &iIndex);
                }
                else
                {
                    if (m_pRobotsTxt->FindString(pwszRobots, -1, &iIndex))
                    {
                        iAddCode = CWCStringList::STRLST_DUPLICATE;
                    }
                    else
                    {
                        iIndex=-1;
                        iAddCode = CWCStringList::STRLST_FAIL;
                    }
                }

                if (CWCStringList::STRLST_FAIL == iAddCode)
                    hr = E_FAIL;    // bad news
                else if (CWCStringList::STRLST_ADDED == iAddCode)
                    hr = S_FALSE;   // haven't gotten it yet
                else
                    hr = S_OK;      // already got it
            }
            MemFree(pwszRobots);
        }
        else
            hr = E_OUTOFMEMORY;
    }
    else
    {
        hr = E_FAIL;    // too many robots.txt files???
    }

    *pdwRobotsTxtIndex = iIndex;

    return hr;
}

// iRobotsIndex : Index into robots.txt, -1 if unavailable
HRESULT CWebCrawler::ValidateWithRobotsTxt(LPCWSTR pwszUrl, int iRobotsIndex, BOOL *pfAllow)
{
    int iNumDirectives, i;
    CWCStringList *pslThisRobotsTxt=NULL;

    *pfAllow = TRUE;

    if (!m_pRobotsTxt)
        return S_OK;

    if (iRobotsIndex == -1)
    {
        DWORD dwIndex;

        if (S_OK != GetRobotsTxtIndex(pwszUrl, FALSE, &dwIndex))
            return E_FAIL;

        iRobotsIndex = (int)dwIndex;
    }

    if ((iRobotsIndex >= 0) && iRobotsIndex<m_pRobotsTxt->NumStrings())
    {
        pslThisRobotsTxt = (CWCStringList *)(m_pRobotsTxt->GetStringData(iRobotsIndex));

        if (pslThisRobotsTxt)
        {
            iNumDirectives = pslThisRobotsTxt->NumStrings();

            for (i=0; i<iNumDirectives; i++)
            {
                // See if this url starts with the same thing as the directive
                if (!MyAsciiCmpNIW(pwszUrl, pslThisRobotsTxt->GetString(i), pslThisRobotsTxt->GetStringLen(i)))
                {
                    // hit! see if this is "allow" or "disallow"
                    if (!(pslThisRobotsTxt->GetStringData(i) & DATA_ALLOW))
                    {
                        TraceMsg(TF_THISMODULE, "ValidateWithRobotsTxt disallowing: (%ws) (%ws)",
                            pslThisRobotsTxt->GetString(i), pwszUrl);
                        *pfAllow = FALSE;
                        m_iSkippedByRobotsTxt ++;
                    }
                    break;
                }
            }
        }
        return S_OK;
    }

    return E_FAIL;
}

typedef struct
{
    LPCWSTR         pwszThisUrl;
    CWCStringList   *pslGlobal;
    BOOL            fDiskFull;
    DWORD           dwSize;
    GROUPID         llGroupID;
}
ENUMDEPENDENCIES;

// Doesn't process it if we already have it in the global dependency list
HRESULT CWebCrawler::CheckImageOrLink(IUnknown *punkItem, BSTR *pbstrItem, DWORD_PTR dwEnumDep, DWORD *pdwStringData)
{
    if (!dwEnumDep)
        return E_FAIL;

    ENUMDEPENDENCIES *pEnumDep = (ENUMDEPENDENCIES *) dwEnumDep;

    WCHAR   wszCombinedUrl[INTERNET_MAX_URL_LENGTH];
    DWORD   dwLen = ARRAYSIZE(wszCombinedUrl);

    HRESULT hr;

    if (pEnumDep->fDiskFull)
        return E_ABORT;     // Abort enumeration

    if (SUCCEEDED(UrlCombineW(pEnumDep->pwszThisUrl, *pbstrItem, wszCombinedUrl, &dwLen, 0)))
    {
        TCHAR   szCombinedUrl[INTERNET_MAX_URL_LENGTH];
        BYTE    chBuf[MY_MAX_CACHE_ENTRY_INFO];

        if (pEnumDep->pslGlobal != NULL)
        {
            int iCode = pEnumDep->pslGlobal->AddString(*pbstrItem, 0);

            if (CWCStringList::STRLST_ADDED != iCode)
            {
                // The string already existed (or Add failed). Don't process this.
                return S_OK;
            }
        }

        // Process this url.
        MyOleStrToStrN(szCombinedUrl, INTERNET_MAX_URL_LENGTH, wszCombinedUrl);

        hr = GetUrlInfoAndMakeSticky(NULL, szCombinedUrl,
                (LPINTERNET_CACHE_ENTRY_INFO)chBuf, sizeof(chBuf),
                pEnumDep->llGroupID);

        if (E_OUTOFMEMORY == hr)
        {
            pEnumDep->fDiskFull = TRUE;
            return E_ABORT;     // Skip rest of enumeration
        }

        if (SUCCEEDED(hr))
            pEnumDep->dwSize += ((LPINTERNET_CACHE_ENTRY_INFO)chBuf)->dwSizeLow;
    }

    return S_OK;
}

HRESULT CWebCrawler::MatchNames(BSTR bstrName, BOOL fPassword)
{
    static const WCHAR c_szPassword1[] = L"password";
    static const WCHAR c_szUsername1[] = L"user";
    static const WCHAR c_szUsername2[] = L"username";

    HRESULT hr = E_FAIL;
    LPCTSTR pszKey = c_szRegKeyPasswords;

    // See if the name matches our preset options.
    // Should these be localized?  I don't think so or subscribing to
    // US sites will fail in international versions of the browser.
    if (fPassword)
    {
        if (StrCmpIW(bstrName, c_szPassword1) == 0)
        {
            hr = S_OK;
        }
    }
    else
    {
        if ((StrCmpIW(bstrName, c_szUsername1) == 0) ||
            (StrCmpIW(bstrName, c_szUsername2) == 0))
        {
            hr = S_OK;
        }
        else
        {
            pszKey = c_szRegKeyUsernames;
        }
    }

    // Try the registry for custom form names if the presets didn't match.
    if (FAILED(hr))
    {
        LONG lRes;
        HKEY hKey;
        DWORD cValues;
        DWORD i;
        lRes = RegOpenKeyEx(HKEY_CURRENT_USER, pszKey, 0, KEY_READ, &hKey);
        if (ERROR_SUCCESS == lRes)
        {
            lRes = RegQueryInfoKey(hKey, NULL, NULL, NULL, NULL, NULL, NULL, &cValues, NULL, NULL, NULL, NULL);
            if (ERROR_SUCCESS == lRes)
            {
                for (i = 0; i < cValues; i++)
                {
                    TCHAR szValueName[MAX_PATH];
                    DWORD cchValueName = ARRAYSIZE(szValueName);

                    lRes = SHEnumValue(hKey, i, szValueName, &cchValueName, NULL, NULL, NULL);
                    if (ERROR_SUCCESS == lRes)
                    {
                        WCHAR wszValueName[MAX_PATH];
                        MyStrToOleStrN(wszValueName, ARRAYSIZE(wszValueName), szValueName);
                        if (StrCmpIW(bstrName, wszValueName) == 0)
                        {
                            hr = S_OK;
                            break;
                        }
                    }
                }
            }
            lRes = RegCloseKey(hKey);
            ASSERT(ERROR_SUCCESS == lRes);
        }
    }

    return hr;
}

HRESULT CWebCrawler::FindAndSubmitForm(void)
{
    // FindAndSubmitForm - If there is a user name and password in
    // the start item, this will attempt to fill in and submit
    // a form.  It should only be called on the top level page of a
    // webcrawl. We still need to check the host name in case we were
    // spawned from a channel crawl.
    //
    // return values: S_OK      successfully found and submitted a form -> restart webcrawl
    //                S_FALSE   no username, no form, or unrecognized form ->continue webcrawl
    //                E_FAIL    submit failed -> abort webcrawl
    //
    HRESULT hrReturn = S_FALSE;
    HRESULT hr = S_OK;
    BSTR bstrUsername = NULL;
    BSTR bstrPassword = NULL;
    BSTR bstrInputType= NULL;

    static const WCHAR c_szInputTextType[]=L"text";

    // If our host name doesn't match the root host name, don't return auth
    // information.
    if (m_bstrHostName)
    {
        LPWSTR pwszUrl, bstrHostName=NULL;

        m_pCurDownload->GetRealURL(&pwszUrl);   // may re-enter Trident

        if (pwszUrl)
        {
            GetHostName(pwszUrl, &bstrHostName);
            LocalFree(pwszUrl);
        }

        if (bstrHostName)
        {
            if (MyAsciiCmpW(bstrHostName, m_bstrHostName))
            {
                hr = E_FAIL;
            }
            SysFreeString(bstrHostName);
        }
    }

    if (SUCCEEDED(hr))
        hr = ReadBSTR(m_pSubscriptionItem, c_szPropCrawlUsername, &bstrUsername);

    if (SUCCEEDED(hr) && bstrUsername && bstrUsername[0])
    {
        // NOTE: We don't allow NULL passwords.
        hr = ReadPassword(m_pSubscriptionItem, &bstrPassword);
        if (SUCCEEDED(hr) && bstrPassword && bstrPassword[0])
        {
            IHTMLDocument2 *pDoc = NULL;
            hr = m_pCurDownload->GetDocument(&pDoc);
            if (SUCCEEDED(hr) && pDoc)
            {
                IHTMLElementCollection *pFormsCollection = NULL;
                hr = pDoc->get_forms(&pFormsCollection);
                if (SUCCEEDED(hr) && pFormsCollection)
                {
                    long length;
                    hr = pFormsCollection->get_length(&length);
                    TraceMsg(TF_THISMODULE, "**** FOUND USER NAME, PASSWORD, & %d FORMS ****", (int)length);
                    if (SUCCEEDED(hr) && length > 0)
                    {
                        // We only check the first form for a user name and password.
                        // Why do we pass an index to IHTMLElementCollection when
                        // the interface prototype says it takes a name?
                        IDispatch *pDispForm = NULL;
                        VARIANT vIndex, vEmpty;
                        VariantInit(&vIndex);
                        VariantInit(&vEmpty);
                        vIndex.vt = VT_I4;
                        vIndex.lVal = 0;
                        hr = pFormsCollection->item(vIndex, vEmpty, &pDispForm);
                        if (SUCCEEDED(hr) && pDispForm)
                        {
                            IHTMLFormElement *pForm = NULL;
                            hr = pDispForm->QueryInterface(IID_IHTMLFormElement, (void **)&pForm);
                            if (SUCCEEDED(hr) && pForm)
                            {
                                // Enum form elements looking for the input types we care about.
                                // Would it be faster to use tags()?
                                hr = pForm->get_length(&length);
                                if (SUCCEEDED(hr) && length >= 2)
                                {
                                    // TraceMsg(TF_THISMODULE, "**** FORM ELEMENTS (%d) ****", (int)length);
                                    BOOL fUsernameSet = FALSE;
                                    BOOL fPasswordSet = FALSE;
                                    IDispatch *pDispItem = NULL;
                                    long i;
                                    for (i = 0; i < length; i++)
                                    {
                                        vIndex.lVal = i;    // re-use vIndex above
                                        hr = pForm->item(vIndex, vEmpty, &pDispItem);
                                        if (SUCCEEDED(hr) && pDispItem)
                                        {
                                            IHTMLInputTextElement *pInput = NULL;
                                            // QI was the easiest way to tell them apart...
                                            // InputText is derived from InputPassword
                                            hr = pDispItem->QueryInterface(IID_IHTMLInputTextElement, (void **)&pInput);
                                            SAFERELEASE(pDispItem);
                                            if (SUCCEEDED(hr) && pInput)
                                            {
                                                hr = pInput->get_type(&bstrInputType);
                                                ASSERT(SUCCEEDED(hr) && bstrInputType);
                                                BSTR bstrName = NULL;
                                                if (StrCmpIW(bstrInputType, c_szInputTextType) == 0)
                                                {
                                                    // We found an INPUT element with attribute TYPE="text".
                                                    // Set it if the NAME attribute matches.
                                                    // Only setting the first matching input.
                                                    // Do we care about max length or does put_value handle it?
                                                    // TraceMsg(TF_THISMODULE, "**** FORM ELEMENT INPUT (%d) ****", (int)i);
                                                    if (!fUsernameSet)
                                                    {
                                                        hr = pInput->get_name(&bstrName);
                                                        ASSERT(SUCCEEDED(hr) && bstrName);
                                                        if (SUCCEEDED(hr) && bstrName && SUCCEEDED(MatchNames(bstrName, FALSE)))
                                                        {
                                                            hr = pInput->put_value(bstrUsername);
                                                            if (SUCCEEDED(hr))
                                                                fUsernameSet = TRUE;
                                                        }
                                                    }
                                                }
                                                else
                                                {
                                                    // We found an INPUT element with attribute TYPE="password"
                                                    // Set it if the name attribute matches.
                                                    // Only setting the first matching input.
                                                    // Do we care about max length or does put_value handle it?
                                                    // TraceMsg(TF_THISMODULE, "**** FORM ELEMENT PASSWORD (%d) ****", (int)i);
                                                    if (!fPasswordSet)
                                                    {
                                                        hr = pInput->get_name(&bstrName);
                                                        ASSERT(SUCCEEDED(hr) && bstrName);
                                                        if (SUCCEEDED(hr) && bstrName  && SUCCEEDED(MatchNames(bstrName, TRUE)))
                                                        {
                                                            hr = pInput->put_value(bstrPassword);
                                                            if (SUCCEEDED(hr))
                                                                fPasswordSet = TRUE;
                                                        }
                                                    }
                                                }
                                                SAFEFREEBSTR(bstrName);
                                                SAFERELEASE(pInput);
                                            }
                                        }
                                    }
                                    // Submit the form is everything was set.
                                    if (fUsernameSet && fPasswordSet)
                                    {
                                        ASSERT(!m_pCurDownload->GetFormSubmitted());
                                        m_pCurDownload->SetFormSubmitted(TRUE);
                                        hr = pForm->submit();
                                        if (SUCCEEDED(hr))
                                        {
                                            m_iNumPagesDownloading ++;
                                            TraceMsg(TF_THISMODULE, "**** FORM SUBMIT WORKED ****");
                                            hrReturn = S_OK;
                                        }
                                        else
                                        {
                                            TraceMsg(TF_THISMODULE, "**** FORM SUBMIT FAILED ****");
                                            hrReturn = E_FAIL;
                                        }
                                    }
                                }
                                SAFERELEASE(pForm);
                            }
                            SAFERELEASE(pDispForm);
                        }
                        // only length
                    }
                    SAFERELEASE(pFormsCollection);
                }
                SAFERELEASE(pDoc);
            }
            // free bstr below because we check for empty bstrs
        }
        SAFEFREEBSTR(bstrPassword);
    }
    SAFEFREEBSTR(bstrUsername);
    return hrReturn;
}

// Make page and dependencies sticky and get total size
HRESULT CWebCrawler::MakePageStickyAndGetSize(LPCWSTR pwszURL, DWORD *pdwSize, BOOL *pfDiskFull)
{
    ASSERT(m_pDependencies || IsRecurseFlagSet(WEBCRAWL_DONT_MAKE_STICKY));

    HRESULT hr;
    TCHAR   szThisUrl[INTERNET_MAX_URL_LENGTH]; // use ansi internally
    BYTE    chBuf[MY_MAX_CACHE_ENTRY_INFO];

    LPINTERNET_CACHE_ENTRY_INFO lpInfo = (LPINTERNET_CACHE_ENTRY_INFO) chBuf;

    DWORD   dwBufSize = sizeof(chBuf);

    *pdwSize = 0;

    // First we make our base url sticky and check it for changes

    MyOleStrToStrN(szThisUrl, INTERNET_MAX_URL_LENGTH, pwszURL);

    hr = GetUrlInfoAndMakeSticky(NULL, szThisUrl, lpInfo, dwBufSize, m_llCacheGroupID);

    if (E_OUTOFMEMORY != hr)
    {
        if (SUCCEEDED(hr))
            *pdwSize += lpInfo->dwSizeLow;

        if (!IsAgentFlagSet(FLAG_CRAWLCHANGED) && SUCCEEDED(hr))
        {
            hr = PostCheckUrlForChange(&m_varChange, lpInfo, lpInfo->LastModifiedTime);
            // If we FAILED, we mark it as changed.
            if (hr == S_OK || FAILED(hr))
            {
                SetAgentFlag(FLAG_CRAWLCHANGED);
                DBG("URL has changed; will flag webcrawl as changed");
            }

            // "Changes Only" mode, persist change detection code
            if (IsAgentFlagSet(FLAG_CHANGESONLY))
            {
                ASSERT(m_iTotalStarted == 1);
                WriteVariant(m_pSubscriptionItem, c_szPropChangeCode, &m_varChange);
                return S_OK;    // We know there are no dependencies
            }

            hr = S_OK;
        }
    }
    else
    {
        *pfDiskFull = TRUE;
    }

    // Now we make all the new dependencies we downloaded for this page sticky
    if (!*pfDiskFull && m_pDependencies)
    {
        EnterCriticalSection(&m_critDependencies);

        for (; m_iDependenciesProcessed < m_pDependencies->NumStrings(); m_iDependenciesProcessed ++)
        {
            MyOleStrToStrN(szThisUrl, INTERNET_MAX_URL_LENGTH, m_pDependencies->GetString(m_iDependenciesProcessed));

            hr = GetUrlInfoAndMakeSticky(NULL, szThisUrl, lpInfo, dwBufSize, m_llCacheGroupID);

            if (E_OUTOFMEMORY == hr)
            {
                *pfDiskFull = TRUE;
                break;
            }

            if (SUCCEEDED(hr))
                *pdwSize += lpInfo->dwSizeLow;
        }

        LeaveCriticalSection(&m_critDependencies);
    }

    if (*pfDiskFull)
    {
        DBG_WARN("Webcrawler: UrlCache full trying to make sticky");
        return E_OUTOFMEMORY;
    }

    return S_OK;
}

// true if found token & made null-term
LPSTR GetToken(LPSTR pszBuf, /*inout*/int *piBufPtr, /*out*/int *piLen)
{
static const CHAR szWhitespace[] = " \t\n\r";

    int iPtr = *piBufPtr;
    int iLen;

    while (1)
    {
        // skip leading whitespace
        iPtr += StrSpnA(pszBuf+iPtr, szWhitespace);

        if (!pszBuf[iPtr])
            return NULL;

        if (pszBuf[iPtr] == '#')
        {
            // comment; skip line
            while (pszBuf[iPtr] && pszBuf[iPtr]!='\r' && pszBuf[iPtr]!='\n') iPtr++;

            if (!pszBuf[iPtr])
                return NULL;

            continue;
        }

        // skip to next whitespace
        iLen = StrCSpnA(pszBuf+iPtr, szWhitespace);

        if (iLen == 0)
            return NULL;        // shoudln't happen

        *piBufPtr = iLen + iPtr;

        if (piLen)
            *piLen = iLen;

        if (pszBuf[iLen+iPtr])
        {
            pszBuf[iLen+iPtr] = NULL;
            ++ *piBufPtr;
        }

        break;
    }

//  TraceMsgA(TF_THISMODULE, "GetToken returning \"%s\"", (LPSTR)(pszBuf+iPtr));
    return pszBuf + iPtr;
}


// === Support functions for OnDownloadComplete

// ParseRobotsTxt gets the stream from CUrlDownload, parses it, and fills in parsed
//  info to *ppslRet
HRESULT CWebCrawler::ParseRobotsTxt(LPCWSTR pwszRobotsTxtURL, CWCStringList **ppslRet)
{
    // Given a robots.txt file (from CUrlDownload), it
    //  parses the file and fills in a string list with appropriate
    //  info.
    *ppslRet = FALSE;

    CHAR    szRobotsTxt[MAX_ROBOTS_SIZE];
    HRESULT hr=S_OK;
    LPSTR   pszToken;
    IStream *pstm=NULL;
    DWORD_PTR dwData;

    hr = m_pCurDownload->GetStream(&pstm);

    if (SUCCEEDED(hr))
    {
        STATSTG st;
        DWORD   dwSize;

        DBG("CWebCrawler parsing robots.txt file");

        pstm->Stat(&st, STATFLAG_NONAME);

        dwSize = st.cbSize.LowPart;

        if (st.cbSize.HighPart || dwSize >= MAX_ROBOTS_SIZE)
        {
            szRobotsTxt[0] = 0;
            DBG("CWebCrawler: Robots.Txt too big; ignoring");
            hr = E_FAIL;
        }
        else
        {
            hr = pstm->Read(szRobotsTxt, dwSize, NULL);
            szRobotsTxt[dwSize] = 0;
        }

        pstm->Release();
        pstm=NULL;

        if ((szRobotsTxt[0] == 0xff) && (szRobotsTxt[1] == 0xfe))
        {
            DBG_WARN("Unicode robots.txt! Ignoring ...");
            hr = E_FAIL;
        }
    }

    if (FAILED(hr))
        return hr;

    int iPtr = 0;
    WCHAR wchBuf2[256];
    WCHAR wchBuf[INTERNET_MAX_URL_LENGTH];
    DWORD dwBufSize;

    // Find the first "user-agent" which matches
    while ((pszToken = GetToken(szRobotsTxt, &iPtr, NULL)) != NULL)
    {
        if (lstrcmpiA(pszToken, c_szRobots_UserAgent))
            continue;

        pszToken = GetToken(szRobotsTxt, &iPtr, NULL);
        if (!pszToken)
            break;

        if ((*pszToken == '*') ||
            (!lstrcmpiA(pszToken, c_szRobots_OurUserAgent)))
        {
            TraceMsgA(TF_THISMODULE, "Using user agent segment: \"%s\"", pszToken);
            break;
        }
    }

    if (!pszToken)
        return E_FAIL;

    CWCStringList *psl = new CWCDwordStringList;
    if (psl)
    {
        psl->Init(2048);

        // Look for Allow: or Disallow: sections
        while ((pszToken = GetToken(szRobotsTxt, &iPtr, NULL)) != NULL)
        {
            if (!lstrcmpiA(pszToken, c_szRobots_UserAgent))
                break;  // end of our 'user-agent' section

            dwData = 0;

            if (!lstrcmpiA(pszToken, c_szRobots_Allow))     dwData = DATA_ALLOW;
            if (!lstrcmpiA(pszToken, c_szRobots_Disallow))  dwData = DATA_DISALLOW;

            if (!dwData)
                continue;   // look for next token

            pszToken = GetToken(szRobotsTxt, &iPtr, NULL);
            if (!pszToken)
                break;

            // Ensure that they don't have blank entries; we'll abort if so
            if (!lstrcmpiA(pszToken, c_szRobots_UserAgent) ||
                !lstrcmpiA(pszToken, c_szRobots_Allow) ||
                !lstrcmpiA(pszToken, c_szRobots_Disallow))
            {
                break;
            }

            // Combine this url with the base for this site.
            dwBufSize = ARRAYSIZE(wchBuf);
            if (SHAnsiToUnicode(pszToken, wchBuf2, ARRAYSIZE(wchBuf2)) &&
                SUCCEEDED(UrlCombineW(pwszRobotsTxtURL, wchBuf2, wchBuf, &dwBufSize, 0)))
            {
                TraceMsgA(TF_THISMODULE, "Robots.txt will %s urls with %s (%ws)",
                    ((dwData==DATA_ALLOW) ? c_szRobots_Allow : c_szRobots_Disallow),
                    pszToken, wchBuf);

                // if this is a duplicate url we effectively ignore this directive
                //  thanks to CWCStringList removing duplicates for us

                psl->AddString(wchBuf, dwData);
            }
        }
    }

    if (psl && (psl->NumStrings() > 0))
    {
        *ppslRet = psl;
        return S_OK;
    }

    if (psl)
        delete psl;

    return E_FAIL;
}

HRESULT CWebCrawler::GetRealUrl(int iPageIndex, LPWSTR *ppwszThisUrl)
{
    m_pCurDownload->GetRealURL(ppwszThisUrl);

    if (*ppwszThisUrl)
    {
        return S_OK;
    }

    DBG_WARN("m_pCurDownload->GetRealURL failed!!!");

    // Get url from string list
    LPCWSTR pwszUrl=NULL;

    pwszUrl = m_pPages->GetString(iPageIndex);

    if (pwszUrl)
    {
        *ppwszThisUrl = StrDupW(pwszUrl);
    }

    return (*ppwszThisUrl) ? S_OK : E_OUTOFMEMORY;
}

// Allocates BSTR for host name.
HRESULT CWebCrawler::GetHostName(LPCWSTR pwszThisUrl, BSTR *pbstrHostName)
{
    if (pwszThisUrl)
    {
        URL_COMPONENTSA comp;
        LPSTR           pszUrl;
        int             iLen;

//      InternetCrackUrlW(pszUrl, 0, 0, &comp)  // this is even slower than converting it ourselves...

        // convert to ansi
        iLen = lstrlenW(pwszThisUrl) + 1;
        pszUrl = (LPSTR)MemAlloc(LMEM_FIXED, iLen);
        if (pszUrl)
        {
            SHUnicodeToAnsi(pwszThisUrl, pszUrl, iLen);

            // crack out the host name
            ZeroMemory(&comp, sizeof(comp));
            comp.dwStructSize = sizeof(comp);
            comp.dwHostNameLength = 1;  // indicate that we want the host name

            if (InternetCrackUrlA(pszUrl, 0, 0, &comp))
            {
                *pbstrHostName = SysAllocStringLen(NULL, comp.dwHostNameLength);
                if (*pbstrHostName)
                {
                    comp.lpszHostName[comp.dwHostNameLength] = 0; // avoid debug rip
                    SHAnsiToUnicode(comp.lpszHostName, *pbstrHostName, comp.dwHostNameLength + 1);
                    ASSERT((*pbstrHostName)[comp.dwHostNameLength] == 0);
                }
            }

            MemFree((HLOCAL)pszUrl);
        }
    }

    return S_OK;
}

// Gets partly validated (CUrlDownload::IsValidUrl and hostname validation)
//  string lists and leaves in m_pPendingLinks
// Remaining validation is robots.txt if any
HRESULT CWebCrawler::GetLinksFromPage()
{
    // Get links from this page that we want to follow.
    CWCStringList *pslLinks=NULL, slMeta;

    IHTMLDocument2  *pDoc;
    BOOL            fFollowLinks = TRUE;
    int             i;

    slMeta.Init(2048);

    m_pCurDownload->GetDocument(&pDoc);
    if (pDoc)
    {
        // See if there is a META tag telling us not to follow
        CHelperOM::GetCollection(pDoc, &slMeta, CHelperOM::CTYPE_META, NULL, 0);
        for (i=0; i<slMeta.NumStrings(); i++)
        {
            if (!StrCmpNIW(slMeta.GetString(i), c_wszRobotsMetaName, c_iRobotsMetaNameLen))
            {
                LPCWSTR pwszContent = slMeta.GetString(i) + c_iRobotsMetaNameLen;
                TraceMsg(TF_THISMODULE, "Found 'robots' meta tag; content=%ws", pwszContent);

                while (pwszContent && *pwszContent)
                {
                    if (!StrCmpNIW(pwszContent, c_wszRobotsNoFollow, c_iRobotsNoFollow))
                    {
                        DBG("Not following links from this page.");
                        fFollowLinks = FALSE;
                        break;
                    }
                    pwszContent = StrChrW(pwszContent+1, L',');
                    if (pwszContent && *pwszContent)
                        pwszContent ++;
                }
                break;
            }
        }
        if (fFollowLinks)
        {
            if (m_pPendingLinks)
                pslLinks = m_pPendingLinks;
            else
            {
                pslLinks = new CWCDwordStringList;
                if (pslLinks)
                    pslLinks->Init();
                else
                    return E_OUTOFMEMORY;
            }

            CHelperOM::GetCollection(pDoc, pslLinks, CHelperOM::CTYPE_LINKS, &CheckLink, (DWORD_PTR)this);
            CHelperOM::GetCollection(pDoc, pslLinks, CHelperOM::CTYPE_MAPS, &CheckLink, (DWORD_PTR)this);
        }
        pDoc->Release();
        pDoc=NULL;
    }

    m_pPendingLinks = pslLinks;

    return S_OK;
}

// Gets 'dependency links' such as frames from a page
HRESULT CWebCrawler::GetDependencyLinksFromPage(LPCWSTR pwszThisUrl, DWORD dwRecurse)
{
    CWCStringList *psl=NULL;
    IHTMLDocument2 *pDoc;
    int i, iAdd, iIndex, iOldMax;
    DWORD_PTR dwData;

    if (m_pDependencyLinks)
        psl = m_pDependencyLinks;
    else
    {
        m_iDependencyStarted = 0;
        psl = new CWCStringList;
        if (psl)
            psl->Init(2048);
        else
            return E_OUTOFMEMORY;
    }

    iOldMax = psl->NumStrings();

    m_pCurDownload->GetDocument(&pDoc);
    if (pDoc)
    {
        // Add Frames ("Frame" and "IFrame" tags) if present
        CHelperOM::GetCollection(pDoc, psl, CHelperOM::CTYPE_FRAMES, CheckFrame, (DWORD_PTR)pwszThisUrl);
    }

    SAFERELEASE(pDoc);

    m_pDependencyLinks = psl;

    // Add the new urls to the main page list
    for (i = iOldMax; i<psl->NumStrings(); i++)
    {
        iAdd = m_pPages->AddString(m_pDependencyLinks->GetString(i),
                        dwRecurse,
                        &iIndex);

        if (m_lMaxNumUrls > 0 && iAdd==CWCStringList::STRLST_ADDED)
            m_lMaxNumUrls ++;

        if (iAdd == CWCStringList::STRLST_FAIL)
            return E_OUTOFMEMORY;

        if (iAdd == CWCStringList::STRLST_DUPLICATE)
        {
            // bump up recursion level of old page if necessary
            // See if we've downloaded this yet.
            dwData = m_pPages->GetStringData(iIndex);
            if (!(dwData & DATA_DLSTARTED))
            {
                // Haven't downloaded it yet.
                // Update the recurse levels if necessary.
                if ((dwData & DATA_RECURSEMASK) < dwRecurse)
                {
                    dwData = (dwData & ~DATA_RECURSEMASK) | dwRecurse;
                }

                // Turn off the "link" bit
                dwData &= ~DATA_LINK;

                m_pPages->SetStringData(iIndex, dwData);
            }
#ifdef DEBUG
            // Shouldn't happen; this frame already dl'd with lower recurse level
            else
                ASSERT((dwData & DATA_RECURSEMASK) >= dwRecurse);
#endif
        }
    }

    return S_OK;
}

//-------------------------------------
// OnDownloadComplete
//
// Called when a url is finished downloading, it processes the url
//  and kicks off the next download
//
HRESULT CWebCrawler::OnDownloadComplete(UINT iID, int iError)
{
    int         iPageIndex = m_iCurDownloadStringIndex;
    BOOL        fOperationComplete = FALSE;
    BOOL        fDiskFull = FALSE;
    BSTR        bstrCDFURL = NULL; //  CDF URL if there is one
    LPWSTR      pwszThisUrl=NULL;

    HRESULT     hr;

    TraceMsg(TF_THISMODULE, "WebCrawler: OnDownloadComplete(%d)", iError);
    ASSERT(m_pPages);
    ASSERT(iPageIndex < m_pCurDownloadStringList->NumStrings());

    if (_ERROR_REPROCESSING != iError)
    {
        m_iNumPagesDownloading --;
        ASSERT(m_iNumPagesDownloading == 0);
    }

    if (m_pCurDownloadStringList == m_pRobotsTxt)
    {
        CWCStringList *pslNew=NULL;

        // Process robots.txt file
        if (SUCCEEDED(ParseRobotsTxt(m_pRobotsTxt->GetString(iPageIndex), &pslNew)))
        {
            m_pRobotsTxt->SetStringData(iPageIndex, (DWORD_PTR)(pslNew));
        }
    }
    else
    {
        // Process normal file
        ASSERT(m_pCurDownloadStringList == m_pPages);

        DWORD dwData, dwRecurseLevelsFromThisPage;

        dwData = (DWORD)m_pPages->GetStringData(iPageIndex);
        dwRecurseLevelsFromThisPage = dwData & DATA_RECURSEMASK;

        dwData |= DATA_DLFINISHED;
        if (iError > 0)
            dwData |= DATA_DLERROR;

        // mark as downloaded
        m_pCurDownloadStringList->SetStringData(iPageIndex, dwData);

        // Is this the first page?
        if (m_iTotalStarted == 1)
        {
            // Check the HTTP response code
            DWORD dwResponseCode;

            hr = m_pCurDownload->GetResponseCode(&dwResponseCode);

            if (SUCCEEDED(hr))
            {
                hr = CheckResponseCode(dwResponseCode);
                if (FAILED(hr))
                    fOperationComplete = TRUE;
            }
            else
                DBG("CWebCrawler failed to GetResponseCode");

            // Get the Charset
            BSTR bstrCharSet=NULL;
            IHTMLDocument2 *pDoc=NULL;


            // -> Bharats --------
            // Find a link tag and store it away the cdf by copying it (if it points to a cdf.)
            // do url combine of this cdf 
            if (SUCCEEDED(m_pCurDownload->GetDocument(&pDoc)) && pDoc &&
                SUCCEEDED(pDoc->get_charset(&bstrCharSet)) && bstrCharSet)
            {
                WriteOLESTR(m_pSubscriptionItem, c_szPropCharSet, bstrCharSet);
                TraceMsg(TF_THISMODULE, "Charset = \"%ws\"", bstrCharSet);
                SysFreeString(bstrCharSet);        
            }
            else
                WriteEMPTY(m_pSubscriptionItem, c_szPropCharSet);

            if(pDoc)
            {
                if(FAILED(GetChannelItem(NULL)))   // A Doc exists and this download is not from a channel itself
                {
                    IHTMLLinkElement *pLink = NULL;
                    hr = SearchForElementInHead(pDoc, OLESTR("REL"), OLESTR("OFFLINE"), 
                                            IID_IHTMLLinkElement, (IUnknown **)&pLink);
                    if(S_OK == hr)
                    {
                        hr = pLink->get_href(&bstrCDFURL);
                        pLink->Release();
                    }
                }   
                pDoc->Release();
                pDoc = NULL;
            }
        }

        if ((iError != _ERROR_REPROCESSING) && (iError != BDU2_ERROR_NONE))
        {
            if (iError != BDU2_ERROR_NOT_HTML)
                m_iDownloadErrors ++;

            if (iError == BDU2_ERROR_MAXSIZE)
            {
                SetEndStatus(INET_E_AGENT_MAX_SIZE_EXCEEDED);
                fOperationComplete = TRUE;
            }
        }
        else
        {
            // Don't process this url if we already have set fOperationComplete
            if (!fOperationComplete)
            {
                // Did we get *just* the HEAD info?
                if (IsAgentFlagSet(FLAG_HEADONLY))
                {
                    SYSTEMTIME stLastModified;
                    FILETIME   ftLastModified;

                    if (SUCCEEDED(m_pCurDownload->GetLastModified(&stLastModified)) &&
                                  SystemTimeToFileTime(&stLastModified, &ftLastModified))
                    {
                        DBG("Retrieved 'HEAD' info; change detection based on Last Modified");

                        hr = PostCheckUrlForChange(&m_varChange, NULL, ftLastModified);
                        // If we FAILED, we mark it as changed.
                        if (hr == S_OK || FAILED(hr))
                        {
                            SetAgentFlag(FLAG_CRAWLCHANGED);
                            DBG("URL has changed; will flag webcrawl as changed");
                        }

                        // "Changes Only" mode, persist change detection code
                        ASSERT(IsAgentFlagSet(FLAG_CHANGESONLY));
                        ASSERT(m_iTotalStarted == 1);
                        WriteVariant(m_pSubscriptionItem, c_szPropChangeCode, &m_varChange);
                    }
                }
                else
                {
                    // Get real URL in case we were redirected
                    if (FAILED(GetRealUrl(iPageIndex, &pwszThisUrl)))
                    {
                        fOperationComplete = TRUE;        // bad
                    }
                    else
                    {
                        ASSERT(pwszThisUrl);

                        // Get host name from first page if necessary
                        if ((iPageIndex==0) &&
                            (m_dwRecurseLevels>0) &&
                            !IsRecurseFlagSet(WEBCRAWL_LINKS_ELSEWHERE) &&
                            !m_bstrHostName)
                        {
                            GetHostName(pwszThisUrl, &m_bstrHostName);
#ifdef DEBUG
                            if (m_bstrHostName)
                                TraceMsg(TF_THISMODULE, "Just got first host name: %ws", m_bstrHostName);
                            else
                                DBG_WARN("Get first host name failed!!!");
#endif
                        }

                        DWORD dwCurSize = 0, dwRepeat = 0;

                        HRESULT hr1;

                        do
                        {
                            hr1 = S_OK;

                            // Make page and dependencies sticky and get their total size
                            fDiskFull = FALSE;
                            MakePageStickyAndGetSize(pwszThisUrl, &dwCurSize, &fDiskFull);

                            if (fDiskFull && (dwRepeat < 2))
                            {
                                // If we couldn't make stuff sticky, ask host to make cache bigger
                                hr1 = m_pAgentEvents->ReportError(&m_SubscriptionCookie,
                                            INET_E_AGENT_EXCEEDING_CACHE_SIZE, NULL);

                                if (hr1 == E_PENDING)
                                {
                                    // Host is going to ask the user to increase the cache size.
                                    // Host should either abort or resume us later.
                                    SetAgentFlag(FLAG_WAITING_FOR_INCREASED_CACHE);
                                    goto done;
                                }
                                else if (hr1 == INET_S_AGENT_INCREASED_CACHE_SIZE)
                                {
                                    // Host just increased the cache size. Try it again.
                                }
                                else
                                {
                                    // Not gonna do it. Abort.
                                }
                            }
                        }
                        while ((hr1 == INET_S_AGENT_INCREASED_CACHE_SIZE) && (++dwRepeat <= 2));

                        m_dwCurSize += dwCurSize;

                        // Is there form based authentication that we need to handle
                        // on the top page of this subscription?
                        if (!fDiskFull && (0 == iPageIndex) && !m_pCurDownload->GetFormSubmitted())
                        {
                            hr = FindAndSubmitForm();
                            if (S_OK == hr)
                            {
                                // Successfully submitted form.  Bail and wait for the next OnDownloadComplete() call.
                                // FEATURE: Should we make the form URL and dependencies sticky?
                                return S_OK;
                            }
                            else if (FAILED(hr))
                            {
                                // We failed trying to submit the form.  Bail.
                                // FEATURE: Should we set a better error string?
                                SetEndStatus(E_FAIL);
                                CleanUp();
                                return S_OK;
                            }
                            // else no form - fall through
                        }

                        TraceMsg(TF_THISMODULE, "WebCrawler up to %d kb", (int)(m_dwCurSize>>10));

                        if ((m_lMaxNumUrls < 0) &&
                            !dwRecurseLevelsFromThisPage &&
                            !(dwData & DATA_CODEBASE))
                        {
                            m_lMaxNumUrls = m_pPages->NumStrings() + ((m_pRobotsTxt) ? m_pRobotsTxt->NumStrings() : 0);
                        }
                    }  // SUCCEEDED(GetRealUrl)
                }  // !FLAG_HEADONLY
            } // !fOperationComplete

            // If we're in "Changes Only" mode, we're done.
            if (IsAgentFlagSet(FLAG_CHANGESONLY))
                fOperationComplete = TRUE;

            // Check to see if we're past our max size
            if (!fOperationComplete && fDiskFull || (m_dwMaxSize && (m_dwCurSize >= (m_dwMaxSize<<10))))
            {
        #ifdef DEBUG
                if (fDiskFull)
                    DBG_WARN("Disk/cache full; aborting.");
                else
                    TraceMsg(TF_WARNING, "Past maximum size; aborting. (%d kb of %d kb)", (int)(m_dwCurSize>>10), (int)m_dwMaxSize);
        #endif
                // abort operation
                fOperationComplete = TRUE;

                if (fDiskFull)
                {
                    SetEndStatus(INET_E_AGENT_CACHE_SIZE_EXCEEDED);
                }
                else
                {
                    SetEndStatus(INET_E_AGENT_MAX_SIZE_EXCEEDED);
                }
            }

            if (!fOperationComplete)
            {
                // Get any links from page
                // Get "dependency links" from page - frames, etc.

                // we do this even if a CDF file is specified
                // Essentially, since the user has no clue about the CDF
                // file - we do not want to confuse the user
                GetDependencyLinksFromPage(pwszThisUrl, dwRecurseLevelsFromThisPage);

                if (dwRecurseLevelsFromThisPage)
                {
                    // Get links from this page that we want to follow.
                    GetLinksFromPage();

                    if (m_pPendingLinks)
                        TraceMsg(TF_THISMODULE,
                            "Total of %d unique valid links found", m_pPendingLinks->NumStrings());

                    m_dwPendingRecurseLevel = dwRecurseLevelsFromThisPage - 1;
                }

            }
        }   // !iError
    } // !robots.txt

    if(!fOperationComplete)
        StartCDFDownload(bstrCDFURL, pwszThisUrl);
        
    if(!m_fCDFDownloadInProgress)
    {
        // Don't try code downloads or any of the rest until you're done with
        // the cdf download
        // See if we have any more URLs to download.
        if (!fOperationComplete && FAILED(StartNextDownload()))
            fOperationComplete = TRUE;  // No, we're done!
    }

    CheckOperationComplete(fOperationComplete);

done:
    if (pwszThisUrl)
        MemFree(pwszThisUrl);

    SAFEFREEBSTR(bstrCDFURL);
            

    return S_OK;
}


HRESULT CWebCrawler::StartCDFDownload(WCHAR *pwszCDFURL, WCHAR *pwszBaseUrl)
{
    HRESULT hr = E_FAIL;
    m_fCDFDownloadInProgress = FALSE;
    if(pwszCDFURL)
    {
        // We have a CDF File - begin download of it
    
        if (m_pRunAgent)
        {
            ASSERT(0);
            DBG_WARN("WebCrawler: Attempting to download next CDF when nother CDF exists.");
            hr = E_FAIL;
            goto Exit;
        }
        else
        {
             // create subscription item for CDL agent.

            ISubscriptionItem *pItem = NULL;
            
            
            if (m_dwMaxSize && ((m_dwCurSize>>10) >= m_dwMaxSize))
            {
                // We've exceeded our maximum download KB limit and can't continue.
                DBG_WARN("WebCrawler: Exceeded Maximum KB download limit with CodeBase download.");
                SetEndStatus(hr = INET_E_AGENT_MAX_SIZE_EXCEEDED);
                goto Exit;
            }

            if (!m_pSubscriptionItem ||
                FAILED(hr = DoCloneSubscriptionItem(m_pSubscriptionItem, NULL, &pItem)))
            {
                goto Exit;
            }
            ASSERT(pItem != NULL);
            ASSERT(pwszCDFURL != NULL);
            WCHAR   wszCombined[INTERNET_MAX_URL_LENGTH];
            DWORD dwBufSize = ARRAYSIZE(wszCombined);
            
            if (SUCCEEDED(UrlCombineW(pwszBaseUrl, pwszCDFURL, wszCombined, &dwBufSize, 0)))
            {
            
                WriteOLESTR(pItem, c_szPropURL, wszCombined);
            
                WriteEMPTY(pItem, c_szPropCrawlGroupID); // clear the old cache group id - don't want 
                                                         // children to know of it 
                // The crawler already has a cache group id that we simply use as the new ID
                WriteLONGLONG(pItem, c_szPropCrawlNewGroupID, m_llCacheGroupID);
                WriteDWORD(pItem, c_szPropChannelFlags, CHANNEL_AGENT_PRECACHE_ALL);
                // Finally - since we know that this is for offline use, we just set the flags to precache all
            
                m_pRunAgent = new CRunDeliveryAgent();
                if (m_pRunAgent)
                    hr = m_pRunAgent->Init((CRunDeliveryAgentSink *)this, pItem, CLSID_ChannelAgent);
                pItem->Release();

                if (m_pRunAgent && SUCCEEDED(hr))
                {
                    hr = m_pRunAgent->StartAgent(); 
                    if (hr == E_PENDING)
                    {
                        hr = S_OK;
                        m_fCDFDownloadInProgress = TRUE;
                    }
                }
                else
                {
                    hr = E_OUTOFMEMORY;
                }
            }
        }
    }
Exit:
    if((S_OK != hr) && m_pRunAgent)
    {
        CRunDeliveryAgent::SafeRelease(m_pRunAgent);
    }
    return hr;

}

// CRunDeliveryAgentSink call back method to signal the end of a codebase download.

HRESULT CWebCrawler::OnAgentEnd(const SUBSCRIPTIONCOOKIE *pSubscriptionCookie, 
                               long lSizeDownloaded, HRESULT hrResult, LPCWSTR wszResult,
                               BOOL fSynchronous)
{
    ASSERT(m_pRunAgent != NULL);
    BOOL        fOperationComplete = FALSE;
    CRunDeliveryAgent::SafeRelease(m_pRunAgent);


    if(m_fCDFDownloadInProgress)
    {
        m_fCDFDownloadInProgress = FALSE; 
    }
    else
    {
        int         iPageIndex = m_iCurDownloadStringIndex;
        BOOL        fDiskFull = FALSE;
        CCodeBaseHold *pcbh = NULL;
        BOOL        fError;
        LPCWSTR     pwszThisURL=NULL;

        TraceMsg(TF_THISMODULE, "WebCrawler: OnAgentEnd of CRunDeliveryAgentSink");
        ASSERT(m_pCodeBaseList);
        ASSERT(iPageIndex < m_pCurDownloadStringList->NumStrings());
        ASSERT(m_pCurDownloadStringList == m_pCodeBaseList);

        m_iNumPagesDownloading --;
        ASSERT(m_iNumPagesDownloading == 0);

        pcbh = (CCodeBaseHold *)m_pCodeBaseList->GetStringData(iPageIndex);
        pwszThisURL = m_pCodeBaseList->GetString(iPageIndex);
        ASSERT(pwszThisURL);

        pcbh->dwFlags |= DATA_DLFINISHED;

        fError = FAILED(hrResult);
        if (fSynchronous)
        {
            fError = TRUE;
            ASSERT(FAILED(hrResult));       // we can't succeed synchronously...
        }

        //NOTE: The CDL agent will abort if it finds the file exceeds the MaxSizeKB.  In this case the file is not
        //      counted and there may be other smaller CAB's that can be downloaded, so we continue to proceed.

        if (fError)
        {
            pcbh->dwFlags |= DATA_DLERROR;
            m_iDownloadErrors ++;
            SetEndStatus(hrResult);
        }
        else
        {
            BYTE chBuf[MY_MAX_CACHE_ENTRY_INFO];
            LPINTERNET_CACHE_ENTRY_INFO lpInfo = (LPINTERNET_CACHE_ENTRY_INFO) chBuf;
            TCHAR   szUrl[INTERNET_MAX_URL_LENGTH];

            MyOleStrToStrN(szUrl, INTERNET_MAX_URL_LENGTH, pwszThisURL);

            if (FAILED(GetUrlInfoAndMakeSticky(NULL, szUrl,
                                 lpInfo, sizeof(chBuf), m_llCacheGroupID)))
            {
                //REVIEW: Do something here?  Unlikely to occur in practice.
                fOperationComplete = TRUE;
                ASSERT(0);
            }
            else
            {
                m_dwCurSize += lpInfo->dwSizeLow;
            }

            TraceMsg(TF_THISMODULE, "WebCrawler up to %d kb", (int)(m_dwCurSize>>10));

            if (m_dwMaxSize && ((m_dwCurSize>>10)>m_dwMaxSize))
            {

                // abort operation
                fOperationComplete = TRUE;
                if (fDiskFull)
                    SetEndStatus(INET_E_AGENT_CACHE_SIZE_EXCEEDED);
                else
                    SetEndStatus(INET_E_AGENT_MAX_SIZE_EXCEEDED);
            }

        } // !fError
    }
    // See if we have any more URLs to download.
    if (!fOperationComplete && FAILED(StartNextDownload()))
        fOperationComplete = TRUE;  // No, we're done!

    if(!fSynchronous)
        CheckOperationComplete(fOperationComplete);

    return S_OK;
}

//////////////////////////////////////////////////////////////////////////
//
// CheckCompleteOperation :: If parameter is TRUE, then all downloads are
//                           complete, the appropriate STATUS_CODE is set
//                           and clean up initiated.
//
//////////////////////////////////////////////////////////////////////////
void CWebCrawler::CheckOperationComplete(BOOL fOperationComplete)
{
    if (fOperationComplete)
    {
        DBG("WebCrawler complete. Shutting down.");
        if (INET_S_AGENT_BASIC_SUCCESS == GetEndStatus())
        {
            // Set end status appropriately
            if (m_iDownloadErrors)
            {
                if (m_iPagesStarted<=1)
                {
                    DBG("Webcrawl failed - first URL failed.");
                    SetEndStatus(E_INVALIDARG);
                }
                else
                {
                    DBG("Webcrawl succeeded - some URLs failed.");
                    SetEndStatus(INET_S_AGENT_PART_FAIL);
                }
            }
            else
            {
                DBG("Webcrawl succeeded");
                if (!IsAgentFlagSet(FLAG_CRAWLCHANGED))
                {
                    SetEndStatus(S_FALSE);
                    DBG("No changes were detected");
                }
                else
                {
                    DBG("Webcrawl succeeded");
                    SetEndStatus(S_OK);
                }
            }
        }

        if (m_llOldCacheGroupID)
        {
            DBG("Nuking old cache group.");
            if (!DeleteUrlCacheGroup(m_llOldCacheGroupID, 0, 0))
            {
                DBG_WARN("Failed to delete old cache group!");
            }
        }

        WriteLONGLONG(m_pSubscriptionItem, c_szPropCrawlGroupID, m_llCacheGroupID);

        m_lSizeDownloadedKB = ((m_dwCurSize+511)>>10);

        WriteDWORD(m_pSubscriptionItem, c_szPropCrawlActualSize, m_lSizeDownloadedKB);

        if (m_lMaxNumUrls >= 0)
        {
            WriteDWORD(m_pSubscriptionItem, c_szPropActualProgressMax, m_lMaxNumUrls);
        }

        // Send a robots.txt warning to the user if we ended up not downloading stuff
        //  because of the server's robots.txt file
        if (m_iSkippedByRobotsTxt != 0)
        {
            HRESULT hr = S_OK;      // Make it an "information" message
            WCHAR wszMessage[200];

            if (m_iPagesStarted==1)
            {
                hr = INET_E_AGENT_WARNING;  // Unless we're missing almost everything
            }

            if (MLLoadStringW(IDS_CRAWL_ROBOTS_TXT_WARNING, wszMessage, ARRAYSIZE(wszMessage)))
            {
                m_pAgentEvents->ReportError(&m_SubscriptionCookie, hr, wszMessage);
            }
        }

        // Will call "UpdateEnd"
        CleanUp();
    }
}

HRESULT CWebCrawler::ModifyUpdateEnd(ISubscriptionItem *pEndItem, UINT *puiRes)
{
    // Customize our end status string
    switch (GetEndStatus())
    {
        case INET_E_AGENT_MAX_SIZE_EXCEEDED :
                              *puiRes = IDS_AGNT_STATUS_SIZELIMIT; break;
        case INET_E_AGENT_CACHE_SIZE_EXCEEDED :
                              *puiRes = IDS_AGNT_STATUS_CACHELIMIT; break;
        case E_FAIL         : *puiRes = IDS_CRAWL_STATUS_NOT_OK; break;
        case S_OK           :
            if (!IsAgentFlagSet(FLAG_CHANGESONLY))
                *puiRes = IDS_CRAWL_STATUS_OK;
            else
                *puiRes = IDS_URL_STATUS_OK;
            break;
        case S_FALSE        :
            if (!IsAgentFlagSet(FLAG_CHANGESONLY))
                *puiRes = IDS_CRAWL_STATUS_UNCHANGED;
            else
                *puiRes = IDS_URL_STATUS_UNCHANGED;
            break;
        case INET_S_AGENT_PART_FAIL : *puiRes = IDS_CRAWL_STATUS_MOSTLYOK; break;
    }

    return CDeliveryAgent::ModifyUpdateEnd(pEndItem, puiRes);
}

HRESULT CWebCrawler::DownloadStart(LPCWSTR pchUrl, DWORD dwDownloadId, DWORD dwType, DWORD dwReserved)
{
    HRESULT hr = S_OK, hr2;

    // free threaded
    EnterCriticalSection(&m_critDependencies);

    if (NULL == pchUrl)
    {
        DBG_WARN("CWebCrawler::DownloadStart pchUrl=NULL");
    }
    else
    {
        // Check to see if this is already in our dependencies list and abort if so
        if (CWCStringList::STRLST_ADDED != m_pDependencies->AddString(pchUrl, 0))
        {
            hr = E_ABORT;       // Don't download this thing.
            TraceMsg(TF_THISMODULE, "Aborting mshtml url (already added): %ws", pchUrl);
        }

        if (SUCCEEDED(hr))
        {
            // Check to see if this fails the robots.txt and abort if so
            // Note, this will only work if we happen to have already gotten this robots.txt
            // Need to abort here if we haven't gotten it, then get it, then get just this dep. Yuck.
            // Also shouldn't do the check if this is the first page downloaded
            DWORD dwIndex;
            hr2 = GetRobotsTxtIndex(pchUrl, FALSE, &dwIndex);
            if (SUCCEEDED(hr2))
            {
                BOOL fAllow;
                if (SUCCEEDED(ValidateWithRobotsTxt(pchUrl, dwIndex, &fAllow)))
                {
                    if (!fAllow)
                        hr = E_ABORT;   // ooh, failed the test.
                }
            }
        }
    }

    LeaveCriticalSection(&m_critDependencies);

    return hr;
}

HRESULT CWebCrawler::DownloadComplete(DWORD dwDownloadId, HRESULT hrNotify, DWORD dwReserved)
{
    // free threaded
    // Do nothing. We may wish to post message to make sticky here. We may wish to
    //  mark as downloaded in string list here.
//  EnterCriticalSection(&m_critDependencies);
//  LeaveCriticalSection(&m_critDependencies);
    return S_OK;
}


/* 41927 (IE5 4491)
HRESULT CWebCrawler::OnGetReferer(LPCWSTR *ppwszReferer)
{
    if (m_iPagesStarted <= 1)
    {
        *ppwszReferer = NULL;
        return S_FALSE;
    }

    if (m_pCurDownloadStringList == m_pRobotsTxt)
    {
        // Referer is last page from main list to be downloaded
        *ppwszReferer = m_pPages->GetString(m_iPagesStarted-1);
        return S_OK;
    }

    if (m_pCurDownloadStringList == m_pPages)
    {
        // Referer is stored in string list data
        *ppwszReferer = m_pPages->GetString(
            ((m_pPages->GetStringData(m_iCurDownloadStringIndex) & DATA_REFERERMASK) >> DATA_REFERERSHIFT));
        return S_OK;
    }

    // We don't return a referer for code bases
    ASSERT(m_pCurDownloadStringList == m_pCodeBaseList);

    return S_FALSE;
}
*/

HRESULT CWebCrawler::OnAuthenticate(HWND *phwnd, LPWSTR *ppszUsername, LPWSTR *ppszPassword)
{
    HRESULT hr, hrRet=E_FAIL;
    ASSERT(phwnd && ppszUsername && ppszPassword);
    ASSERT((HWND)-1 == *phwnd && NULL == *ppszUsername && NULL == *ppszPassword);

    // If our host name doesn't match the root host name, don't return auth
    // information.

    LPWSTR pwszUrl, bstrHostName=NULL;

    m_pCurDownload->GetRealURL(&pwszUrl);   // may re-enter Trident

    if (pwszUrl)
    {
        GetHostName(pwszUrl, &bstrHostName);
        LocalFree(pwszUrl);
    }

    if (bstrHostName)
    {
        if (!m_bstrHostName || !MyAsciiCmpW(bstrHostName, m_bstrHostName))
        {
            // Host names match. Return auth information.
            // If we're hosted by channel agent, use its auth information
            ISubscriptionItem *pChannel=NULL;
            ISubscriptionItem *pItem=m_pSubscriptionItem;
            
            if (SUCCEEDED(GetChannelItem(&pChannel)))
            {
                pItem = pChannel;
            }
            
            hr = ReadOLESTR(pItem, c_szPropCrawlUsername, ppszUsername);
            if (SUCCEEDED(hr))
            {
                BSTR bstrPassword = NULL;
                hr = ReadPassword(pItem, &bstrPassword);
                if (SUCCEEDED(hr))
                {
                    int len = (lstrlenW(bstrPassword) + 1) * sizeof(WCHAR);
                    *ppszPassword = (LPWSTR) CoTaskMemAlloc(len);
                    if (*ppszPassword)
                    {
                        CopyMemory(*ppszPassword, bstrPassword, len);
                    }
                    SAFEFREEBSTR(bstrPassword);
                    if (*ppszPassword)
                    {
                        hrRet = S_OK;
                    }
                }
            }

            if (FAILED(hrRet))
            {
                SAFEFREEOLESTR(*ppszUsername);
                SAFEFREEOLESTR(*ppszPassword);
            }

            SAFERELEASE(pChannel);
        }

        SysFreeString(bstrHostName);
    }
    return hrRet;
}

HRESULT CWebCrawler::OnClientPull(UINT iID, LPCWSTR pwszOldURL, LPCWSTR pwszNewURL)
{
    // CUrlDownload is informing us it's about to do a client pull.

    // Let's send out a progress report for the new url
    SendUpdateProgress(pwszNewURL, m_iTotalStarted, m_lMaxNumUrls);

    // Now we need to process the current url: make it and dependencies sticky
    DWORD dwCurSize=0;
    BOOL fDiskFull=FALSE;
    MakePageStickyAndGetSize(pwszOldURL, &dwCurSize, &fDiskFull);
    m_dwCurSize += dwCurSize;
    TraceMsg(TF_THISMODULE, "WebCrawler processed page prior to client pull - now up to %d kb", (int)(m_dwCurSize>>10));

    // Tell CUrlDownload to go ahead and download the new url
    return S_OK;
}

HRESULT CWebCrawler::OnOleCommandTargetExec(const GUID *pguidCmdGroup, DWORD nCmdID,
                                DWORD nCmdexecopt, VARIANTARG *pvarargIn,
                                VARIANTARG *pvarargOut)
{
    HRESULT hr = OLECMDERR_E_NOTSUPPORTED;
    IPropertyBag2 *pPropBag = NULL;
    int i;

    //REVIEW: CLSID for this not yet defined.
    if (    pguidCmdGroup 
        && (*pguidCmdGroup == CGID_JavaParambagCompatHack) 
        && (nCmdID == 0) 
        && (nCmdexecopt == MSOCMDEXECOPT_DONTPROMPTUSER))
    {
        if (!IsRecurseFlagSet(WEBCRAWL_GET_CONTROLS))
        {
            goto Exit;
        }

        uCLSSPEC ucs;
        QUERYCONTEXT qc = { 0 };

        ucs.tyspec = TYSPEC_CLSID;
        ucs.tagged_union.clsid = CLSID_JavaVM;

        // Check to see if Java VM is installed. Don't try to get applets if not.
        if (!SUCCEEDED(FaultInIEFeature(NULL, &ucs, &qc, FIEF_FLAG_PEEK)))
        {
            goto Exit;
        }

        ULONG enIndex;
        const DWORD enMax = 7, enMin = 0;
        PROPBAG2 pb[enMax];
        VARIANT vaProps[enMax];
        HRESULT hrResult[enMax];
        enum { enCodeBase = 0, enCabBase, enCabinets, enArchive, enUsesLib, enLibrary, enUsesVer };
        LPWSTR pwszThisURL = NULL;
        int chLen;

        //REVIEW: This will need to be reviewed later when matching trident code is available
        //        and details worked out.

        if ((pvarargIn->vt != VT_UNKNOWN) || 
            (FAILED(pvarargIn->punkVal->QueryInterface(IID_IPropertyBag2, (void **)&pPropBag))))
        {
             goto Exit;
        }

        if (FAILED(GetRealUrl(m_iCurDownloadStringIndex, &pwszThisURL)))
        {
            pwszThisURL = StrDupW(L"");
        }

        // PROPBAG2 structure for data retrieval
        for (i=enMin; i<enMax; i++)
        {
            pb[i].dwType = PROPBAG2_TYPE_DATA;
            pb[i].vt = VT_BSTR;
            pb[i].cfType = NULL;                   // CLIPFORMAT
            pb[i].dwHint = 0;                      // ????
            pb[i].pstrName = NULL;
            pb[i].clsid = CLSID_NULL;              // ????
            vaProps[i].vt = VT_EMPTY;
            vaProps[i].bstrVal = NULL;
            hrResult[i] = E_FAIL;
        }

        if (((pb[enCodeBase].pstrName = SysAllocString(L"CODEBASE")) != NULL) &&
            ((pb[enCabBase].pstrName = SysAllocString(L"CABBASE")) != NULL) &&
            ((pb[enCabinets].pstrName = SysAllocString(L"CABINETS")) != NULL) &&
            ((pb[enArchive].pstrName = SysAllocString(L"ARCHIVE")) != NULL) &&
            ((pb[enUsesLib].pstrName = SysAllocString(L"USESLIBRARY")) != NULL) &&
            ((pb[enLibrary].pstrName = SysAllocString(L"USESLIBRARYCODEBASE")) != NULL) &&
            ((pb[enUsesVer].pstrName = SysAllocString(L"USESLIBRARYVERSION")) != NULL))
        {

            //Read returns E_FAIL even if it read some of the properties.
            //Since we check hrResult's below this isn't a big deal.

            hr = pPropBag->Read(enMax, &pb[0], NULL, &vaProps[0], &hrResult[0]);

            {
                BSTR bstrCodeBase = NULL;

                // check for CODEBASE
                if (SUCCEEDED(hrResult[enCodeBase]) && (vaProps[enCodeBase].vt == VT_BSTR))
                {
                    bstrCodeBase = vaProps[enCodeBase].bstrVal;
                }

                // add a trailing slash if not already present
                chLen = lstrlenW(bstrCodeBase);
                if (chLen && bstrCodeBase[chLen-1] != '/')
                {
                    LPWSTR szNewCodeBase = 0;
                    int nLen = chLen + 2;
                    szNewCodeBase = (LPWSTR) LocalAlloc(0,sizeof(WCHAR)*nLen);
                    if (szNewCodeBase)
                    {
                        StrCpyNW(szNewCodeBase, bstrCodeBase, nLen);
                        StrCatBuffW(szNewCodeBase, L"/", nLen);
                        SAFEFREEBSTR(bstrCodeBase);
                        bstrCodeBase = vaProps[enCodeBase].bstrVal = SysAllocString(szNewCodeBase);
                        LocalFree(szNewCodeBase);     
                    }
                }

                // check for CABBASE
                if (SUCCEEDED(hrResult[enCabBase]) && (vaProps[enCabBase].vt == VT_BSTR))
                {
                    BSTR szCabBase = vaProps[enCabBase].bstrVal;

                    // Add CABBASE URL to list of CABs to pull.
                    if (SUCCEEDED(CombineBaseAndRelativeURLs(pwszThisURL, bstrCodeBase, &szCabBase)))
                    {
                        m_pPages->AddString(szCabBase, 0);
                    }
                }

                // check for CABINETS
                for (enIndex = enCabinets; enIndex<(enArchive+1); enIndex++)
                {
                    if (SUCCEEDED(hrResult[enIndex]) && (vaProps[enIndex].vt == VT_BSTR))
                    {
                        BSTR szCur = vaProps[enIndex].bstrVal, szPrev = NULL;
                        while (szCur)
                        {
                            WCHAR wcCur = *szCur;

                            if ((wcCur == L'+') || (wcCur == L',') || (wcCur == L'\0'))
                            {
                                BSTR szLast = szPrev, szCabBase = NULL;
                                BOOL bLastFile = FALSE;
                                if (!szPrev)
                                {
                                    szLast = vaProps[enIndex].bstrVal;
                                }
                                szPrev = szCur; szPrev++;

                                if (*szCur == L'\0')
                                {
                                    bLastFile = TRUE;
                                }
                                *szCur = (unsigned short)L'\0';

                                // szLast points to current CabBase.
                                szCabBase = SysAllocString(szLast);
                                if (SUCCEEDED(CombineBaseAndRelativeURLs(pwszThisURL, bstrCodeBase, &szCabBase)))
                                {
                                    int iAdd=m_pPages->AddString(szCabBase, DATA_CODEBASE);
                                    if (m_lMaxNumUrls > 0 && iAdd==CWCStringList::STRLST_ADDED)
                                        m_lMaxNumUrls ++;
                                }
                                SAFEFREEBSTR(szCabBase);

                                if (bLastFile)
                                {
                                    szCur = NULL;
                                    break;
                                }
                            }
                            szCur++;
                        }  // while (szCur)
                    }  // cabinets
                }

                // check for USESLIBRARY* parameters.
                CCodeBaseHold *pcbh = NULL;
                if (SUCCEEDED(hrResult[enUsesLib]) && (vaProps[enUsesLib].vt == VT_BSTR) &&
                    SUCCEEDED(hrResult[enLibrary]) && (vaProps[enLibrary].vt == VT_BSTR))
                {
                    BSTR szThisLibCAB = NULL;
                    pcbh = new CCodeBaseHold();
                    if (pcbh)
                    {
                        pcbh->szDistUnit = SysAllocString(vaProps[enUsesLib].bstrVal);
                        pcbh->dwVersionMS = pcbh->dwVersionLS = -1;
                        pcbh->dwFlags = 0;
                        szThisLibCAB = SysAllocString(vaProps[enLibrary].bstrVal);
                        if (FAILED(CombineBaseAndRelativeURLs(pwszThisURL, bstrCodeBase, &szThisLibCAB)) ||
                            m_pCodeBaseList->AddString(szThisLibCAB, (DWORD_PTR)pcbh) != CWCStringList::STRLST_ADDED)
                        {
                            SAFEFREEBSTR(pcbh->szDistUnit);
                            SAFEDELETE(pcbh);
                        }
                        SAFEFREEBSTR(szThisLibCAB);
                    }
                }

                // Check for USESLIBRARYVERSION (optional)
                if (pcbh && SUCCEEDED(hrResult[enUsesVer]) && (vaProps[enUsesVer].vt == VT_BSTR))
                {
                    int iLen = SysStringByteLen(vaProps[enUsesVer].bstrVal)+1;
                    CHAR *szVerStr = (LPSTR)MemAlloc(LMEM_FIXED, iLen);

                    if (szVerStr)
                    {
                        SHUnicodeToAnsi(vaProps[enUsesVer].bstrVal, szVerStr, iLen);

                        if (FAILED(GetVersionFromString(szVerStr,
                                     &pcbh->dwVersionMS, &pcbh->dwVersionLS)))
                        {
                            hr = HRESULT_FROM_WIN32(GetLastError());
                            MemFree(szVerStr);
                            SAFEFREEBSTR(pcbh->szDistUnit);
                            SAFEDELETE(pcbh);
                        }
                        MemFree(szVerStr);
                    }
                }
            }
        } // Read properties

        for (i=enMin; i<enMax; i++)
        {
            SAFEFREEBSTR(pb[i].pstrName);
        }

        if (pwszThisURL)
            LocalFree(pwszThisURL);

        hr = S_OK;
    }

Exit:
    SAFERELEASE(pPropBag);
    return hr;
}

HRESULT CWebCrawler::GetDownloadNotify(IDownloadNotify **ppOut)
{
    HRESULT hr=S_OK;

    if (m_pDownloadNotify)
    {
        m_pDownloadNotify->LeaveMeAlone();
        m_pDownloadNotify->Release();
        m_pDownloadNotify=NULL;
    }

    CDownloadNotify *pdn = new CDownloadNotify(this);

    if (pdn)
    {
        hr = pdn->Initialize();
        
        if (SUCCEEDED(hr))
        {
            m_pDownloadNotify = pdn;
            *ppOut = m_pDownloadNotify;
            m_pDownloadNotify->AddRef();
        }
        else
        {
            pdn->Release();
        }
    }
    else
    {
        hr = E_OUTOFMEMORY;
        *ppOut = NULL;
    }

    return hr;
}

//---------------------------------------------------------------
// CWebCrawler::CDownloadNotify class
//---------------------------------------------------------------
CWebCrawler::CDownloadNotify::CDownloadNotify(CWebCrawler *pParent)
{
    ASSERT(pParent);

    m_cRef = 1;

    m_pParent = pParent;
    pParent->AddRef();
}

HRESULT CWebCrawler::CDownloadNotify::Initialize()
{
    m_hrCritParent = InitializeCriticalSectionAndSpinCount(&m_critParent, 0) ? S_OK : E_OUTOFMEMORY;

    return m_hrCritParent;
}

CWebCrawler::CDownloadNotify::~CDownloadNotify()
{
    DBG("Destroying CWebCrawler::CDownloadNotify");

    ASSERT(!m_pParent);
    SAFERELEASE(m_pParent);

    if (SUCCEEDED(m_hrCritParent))
    {
        DeleteCriticalSection(&m_critParent);
    }
}

void CWebCrawler::CDownloadNotify::LeaveMeAlone()
{
    if (m_pParent)
    {
        EnterCriticalSection(&m_critParent);
        SAFERELEASE(m_pParent);
        LeaveCriticalSection(&m_critParent);
    }
}

// IUnknown members
HRESULT CWebCrawler::CDownloadNotify::QueryInterface(REFIID riid, void **ppv)
{
    if ((IID_IUnknown == riid) ||
        (IID_IDownloadNotify == riid))
    {
        *ppv = (IDownloadNotify *)this;
    }
    else
    {
        *ppv = NULL;
        return E_NOINTERFACE;
    }

    ((LPUNKNOWN)*ppv)->AddRef();

    return S_OK;
}

ULONG CWebCrawler::CDownloadNotify::AddRef(void)
{
    return InterlockedIncrement(&m_cRef);
}

ULONG CWebCrawler::CDownloadNotify::Release(void)
{
    ASSERT( 0 != m_cRef );
    ULONG cRef = InterlockedDecrement(&m_cRef);
    if ( 0 == cRef )
    {
        delete this;
    }
    return cRef;
}

// IDownloadNotify
HRESULT CWebCrawler::CDownloadNotify::DownloadStart(LPCWSTR pchUrl, DWORD dwDownloadId, DWORD dwType, DWORD dwReserved)
{
    HRESULT hr = E_ABORT;   // abort it if we have nobody listening

    TraceMsg(TF_THISMODULE, "DownloadStart id=%d url=%ws", dwDownloadId, pchUrl ? pchUrl : L"(null)");

    EnterCriticalSection(&m_critParent);
    if (m_pParent)
        hr = m_pParent->DownloadStart(pchUrl, dwDownloadId, dwType, dwReserved);
    LeaveCriticalSection(&m_critParent);

    return hr;
}

HRESULT CWebCrawler::CDownloadNotify::DownloadComplete(DWORD dwDownloadId, HRESULT hrNotify, DWORD dwReserved)
{
    HRESULT hr = S_OK;

//  TraceMsg(TF_THISMODULE, "DownloadComplete id=%d hr=%x", dwDownloadId, hrNotify);

    EnterCriticalSection(&m_critParent);
    if (m_pParent)
        hr = m_pParent->DownloadComplete(dwDownloadId, hrNotify, dwReserved);
    LeaveCriticalSection(&m_critParent);

    return hr;
}

//////////////////////////////////////////////////////////////////////////
//
// Other functions
//
//////////////////////////////////////////////////////////////////////////
// Make a single absolute or relative url sticky and get size
HRESULT GetUrlInfoAndMakeSticky(
            LPCTSTR                     pszBaseUrl,
            LPCTSTR                     pszThisUrl,
            LPINTERNET_CACHE_ENTRY_INFO lpCacheEntryInfo,
            DWORD                       dwBufSize,
            GROUPID                     llCacheGroupID)
{
    DWORD   dwSize;
    TCHAR   szCombined[INTERNET_MAX_URL_LENGTH];

    ASSERT(lpCacheEntryInfo);

    // Combine urls if necessary
    if (pszBaseUrl)
    {
        dwSize = ARRAYSIZE(szCombined);
        if (SUCCEEDED(UrlCombine(pszBaseUrl, pszThisUrl,
                szCombined, &dwSize, 0)))
        {
            pszThisUrl = szCombined;
        }
        else
            DBG_WARN("UrlCombine failed!");
    }

    // Add the size of this URL
    lpCacheEntryInfo->dwStructSize = dwBufSize;
    if (!GetUrlCacheEntryInfo(pszThisUrl, lpCacheEntryInfo, &dwBufSize))
    {
#ifdef DEBUG
        if (GetLastError() == ERROR_INSUFFICIENT_BUFFER)
            DBG_WARN("Failed GetUrlCacheEntryInfo, insufficient buffer");
        else
            TraceMsgA(llCacheGroupID ? TF_WARNING : TF_THISMODULE,
                "Failed GetUrlCacheEntryInfo (not in cache) URL=%ws", pszThisUrl);
#endif
        return E_FAIL;
    }

    // Add to new group
    if (llCacheGroupID != 0)
    {
        if (!SetUrlCacheEntryGroup(pszThisUrl, INTERNET_CACHE_GROUP_ADD,
            llCacheGroupID, NULL, 0, NULL))
        {
            switch (GetLastError())
            {
                case ERROR_FILE_NOT_FOUND:  //  Huh? Must not have been able to add the index entry?
                case ERROR_DISK_FULL:
                    return E_OUTOFMEMORY;

                case ERROR_NOT_ENOUGH_QUOTA:
                    return S_OK;            //  We do our own quota handling.

                default:
                    TraceMsgA(TF_WARNING | TF_THISMODULE, "GetUrlInfoAndMakeSticky: Got unexpected error from SetUrlCacheEntryGroup() - GLE = 0x%08x", GetLastError());
                    return E_FAIL;
            }
        }
    }

    return S_OK;
}

// GenerateCode will generate a DWORD code from a file.

#define ELEMENT_PER_READ        256
#define ELEMENT_SIZE            sizeof(DWORD)

HRESULT GenerateCode(LPCTSTR lpszLocalFileName, DWORD *pdwRet)
{
    DWORD dwCode=0;
    DWORD dwData[ELEMENT_PER_READ], i, dwRead;
    HRESULT hr = S_OK;
    HANDLE  hFile;

    hFile = CreateFile(lpszLocalFileName, GENERIC_READ,
            FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING,
            0, NULL);

    if (INVALID_HANDLE_VALUE != hFile)
    {
        do
        {
            dwRead = 0;
            if (ReadFile(hFile, dwData, ELEMENT_PER_READ * ELEMENT_SIZE, &dwRead, NULL))
            {
                for (i=0; i<dwRead / ELEMENT_SIZE; i++)
                {
                    dwCode = (dwCode << 31) | (dwCode >> 1) + dwData[i];
//                  dwCode += dwData[i];
                }
            }   
        }
        while (ELEMENT_PER_READ * ELEMENT_SIZE == dwRead);

        CloseHandle(hFile);
    }
    else
    {
        hr = E_FAIL;
        TraceMsg(TF_THISMODULE|TF_WARNING,"GenerateCode: Unable to open cache file, Error=%x", GetLastError());
    }

    *pdwRet = dwCode;

    return hr;
}

// S_OK : We retrieved a good last modified or content code to use
// S_FALSE : We fell back to using the one passed into pvarChange
// E_FAIL : We failed miserably.
// E_INVALIDARG : Get a clue
// *pfGetContent : TRUE if we need a GET for PostCheckUrlForChange to work right
HRESULT PreCheckUrlForChange(LPCTSTR lpURL, VARIANT *pvarChange, BOOL *pfGetContent)
{
    BYTE    chBuf[MY_MAX_CACHE_ENTRY_INFO];

    LPINTERNET_CACHE_ENTRY_INFO lpInfo = (LPINTERNET_CACHE_ENTRY_INFO) chBuf;

    if (pvarChange->vt != VT_EMPTY && pvarChange->vt != VT_I4 && pvarChange->vt != VT_CY)
        return E_INVALIDARG;

    if (SUCCEEDED(GetUrlInfoAndMakeSticky(NULL, lpURL, lpInfo, sizeof(chBuf), 0)))
    {
        FILETIME ftOldLastModified = *((FILETIME *) &pvarChange->cyVal);

        if (lpInfo->LastModifiedTime.dwHighDateTime || lpInfo->LastModifiedTime.dwLowDateTime)
        {
            // We have a last modified time. Use it or the persisted one.

            if (pfGetContent)
                *pfGetContent = FALSE;

            if ((pvarChange->vt != VT_CY)
             || (lpInfo->LastModifiedTime.dwHighDateTime > ftOldLastModified.dwHighDateTime)
             || ((lpInfo->LastModifiedTime.dwHighDateTime == ftOldLastModified.dwHighDateTime)
                && (lpInfo->LastModifiedTime.dwLowDateTime > ftOldLastModified.dwLowDateTime)))
            {
                // Cache Last Modified is newer than saved Last Modified. Use cache's.
                pvarChange->vt = VT_CY;
                pvarChange->cyVal = *((CY *)&(lpInfo->LastModifiedTime));

                return S_OK;
            }

            ASSERT(pvarChange->vt == VT_CY);

            // Persisted Last Modified time is most recent. Use it.
            return S_OK;
        }

        DWORD dwCode;

        if (SUCCEEDED(GenerateCode(lpInfo->lpszLocalFileName, &dwCode)))
        {
            pvarChange->vt = VT_I4;
            pvarChange->lVal = (LONG) dwCode;

            if (pfGetContent)
                *pfGetContent = TRUE;

            return S_OK;
        }

        // Failed GenerateCode. Weird. Fall through.
    }

    if (pvarChange->vt != VT_EMPTY)
    {
        if (pfGetContent)
            *pfGetContent = (pvarChange->vt == VT_I4);

        return S_FALSE;
    }

    // We don't have old change detection, we don't have cache content, better GET
    if (pfGetContent)
        *pfGetContent = TRUE;

    return E_FAIL;  // Couldn't get anything. pvarChange->vt==VT_EMPTY
}

// S_FALSE : no change
// S_OK    : changed
// E_      : failure of some sort

// pvarChange from PreCheckUrlForChange. We return a new one.
// lpInfo  : must be valid if *pfGetContent was TRUE
// ftNewLastModified : must be filled in if *pfGetContent was FALSE
HRESULT PostCheckUrlForChange(VARIANT                    *pvarChange,
                              LPINTERNET_CACHE_ENTRY_INFO lpInfo,
                              FILETIME                    ftNewLastModified)
{
    HRESULT hr = S_FALSE;
    VARIANT varChangeNew;

    DWORD   dwNewCode = 0;

    if (!pvarChange || (pvarChange->vt != VT_I4 && pvarChange->vt != VT_CY && pvarChange->vt != VT_EMPTY))
        return E_INVALIDARG;

    varChangeNew.vt = VT_EMPTY;

    if (ftNewLastModified.dwHighDateTime || ftNewLastModified.dwLowDateTime)
    {
        varChangeNew.vt = VT_CY;
        varChangeNew.cyVal = *((CY *) &ftNewLastModified);
    }
    else
    {
        if (lpInfo &&
            SUCCEEDED(GenerateCode(lpInfo->lpszLocalFileName, &dwNewCode)))
        {
            varChangeNew.vt = VT_I4;
            varChangeNew.lVal = dwNewCode;
        }
    }

    if (pvarChange->vt == VT_CY)
    {
        // We have an old last modified time. Use that to determine change.
        FILETIME ftOldLastModified = *((FILETIME *) &(pvarChange->cyVal));

        if ((!ftNewLastModified.dwHighDateTime && !ftNewLastModified.dwLowDateTime)
            || (ftNewLastModified.dwHighDateTime > ftOldLastModified.dwHighDateTime)
            || ((ftNewLastModified.dwHighDateTime == ftOldLastModified.dwHighDateTime)
                && (ftNewLastModified.dwLowDateTime > ftOldLastModified.dwLowDateTime)))
        {
            // NewLastModified > OldLastModified (or we don't have a NewLastModified)
            DBG("PostCheckUrlForChange change detected via Last Modified");
            hr = S_OK;      // We have changed
        }
    }
    else if (pvarChange->vt == VT_I4)
    {
        // We have an old code. Use that to determine change.
        DWORD dwOldCode = (DWORD) (pvarChange->lVal);

        if ((dwOldCode != dwNewCode) ||
            !dwNewCode)
        {
            DBG("PostCheckUrlForChange change detected via content code");
            hr = S_OK;  // We have changed
        }
    }
    else
        hr = E_FAIL;    // No old code.

    *pvarChange = varChangeNew;

    return hr;
}

//////////////////////////////////////////////////////////////////////////
//
// CHelperOM implementation
//
//////////////////////////////////////////////////////////////////////////

CHelperOM::CHelperOM(IHTMLDocument2 *pDoc)
{
    ASSERT(pDoc);
    m_pDoc = pDoc;
    if (pDoc)
        pDoc->AddRef();
}

CHelperOM::~CHelperOM()
{
    SAFERELEASE(m_pDoc);
}

HRESULT CHelperOM::GetTagCollection(
                        IHTMLDocument2          *pDoc,
                        LPCWSTR                  wszTagName,
                        IHTMLElementCollection **ppCollection)
{
    IHTMLElementCollection *pAll=NULL;
    IDispatch              *pDisp=NULL;
    VARIANT                 TagName;
    HRESULT                 hr;

    // We have to get "all", then sub-collection
    hr = pDoc->get_all(&pAll);
    if (pAll)
    {
        TagName.vt = VT_BSTR;
        TagName.bstrVal = SysAllocString(wszTagName);
        if (NULL == TagName.bstrVal)
            hr = E_OUTOFMEMORY;
        else
        {
            hr = pAll->tags(TagName, &pDisp);
            SysFreeString(TagName.bstrVal);
        }
        pAll->Release();
    }
    if (pDisp)
    {
        hr = pDisp->QueryInterface(IID_IHTMLElementCollection,
                                        (void **)ppCollection);
        pDisp->Release();
    }
    if (FAILED(hr)) DBG("GetSubCollection failed");

    return hr;
}


// Collections we get:
//
// IHTMLWindow2->get_document
//  IHTMLDocument2  ->get_links
//  IHTMLElementCollection->item
//                          ->get_hostname
//                          ->get_href
//                  ->get_all
//                      ->tags("map")
//  IHTMLElementCollection  ->item
//                              ->get_areas
//  IHTMLElementCollection          ->item
//  IHTMLAreaElement                    ->get_href
//                  ->get_all
//                      ->tags("meta")
//  IHTMLElementCollection  ->item
//                  ->get_all
//                      ->tags("frame")
//  IHTMLElementCollection  ->item
//                  ->get_all
//                      ->tags("iframe")
//  IHTMLElementCollection  ->item

// We recurse EnumCollection to get the maps (since
//      it's a collection of collections)


// hideous hack: IHTMLElementCollection can actually be IHTMLAreasCollection
//  the interface used to be derived from the other. It still has identical
//  methods. We typecast just in case that changes. Hopefully they will fix
//  so that Areas is derived from Element again.
HRESULT CHelperOM::EnumCollection(
            IHTMLElementCollection *pCollection,
            CWCStringList          *pStringList,
            CollectionType          Type,
            PFN_CB                  pfnCB,
            DWORD_PTR               dwCBData)
{
    IHTMLAnchorElement *pLink;
    IHTMLMapElement  *pMap;
    IHTMLAreaElement *pArea;
    IHTMLMetaElement *pMeta;
    IHTMLElement     *pEle;
    IDispatch        *pDispItem = NULL;

    HRESULT hr;
    BSTR    bstrItem=NULL;
    long    l, lCount;
    VARIANT vIndex, vEmpty, vData;
    BSTR    bstrTmp1, bstrTmp2;
    DWORD   dwStringData;

    VariantInit(&vEmpty);
    VariantInit(&vIndex);
    VariantInit(&vData);

    if (Type==CTYPE_MAP)
        hr = ((IHTMLAreasCollection *)pCollection)->get_length(&lCount);
    else
        hr = pCollection->get_length(&lCount);

    if (FAILED(hr))
        lCount = 0;

#ifdef DEBUG
    LPSTR lpDSTR[]={"Links","Maps","Areas (links) In Map", "Meta", "Frames"};
    TraceMsgA(TF_THISMODULE, "CWebCrawler::GetCollection, %d %s found", lCount, lpDSTR[(int)Type]);
#endif

    for (l=0; l<lCount; l++)
    {
        vIndex.vt = VT_I4;
        vIndex.lVal = l;
        dwStringData = 0;

        if (Type==CTYPE_MAP)
            hr = ((IHTMLAreasCollection *)pCollection)->item(vIndex, vEmpty, &pDispItem);
        else
            hr = pCollection->item(vIndex, vEmpty, &pDispItem);

        if (SUCCEEDED(hr))
        {
            ASSERT(vData.vt == VT_EMPTY);
            ASSERT(!bstrItem);

            if (pDispItem)
            {
                // Get the URL from the IDispatch
                switch(Type)
                {
                    case CTYPE_LINKS:       // get href from <a>
                        hr = pDispItem->QueryInterface(IID_IHTMLAnchorElement, (void **)&pLink);

                        if (SUCCEEDED(hr) && pLink)
                        {
                            hr = pLink->get_href(&bstrItem);
                            pLink->Release();
                        }
                        break;

                    case CTYPE_MAPS:    // enumeration areas for this map
                        hr = pDispItem->QueryInterface(IID_IHTMLMapElement, (void **)&pMap);

                        if (SUCCEEDED(hr) && pMap)
                        {
                            IHTMLAreasCollection *pNewCollection=NULL;
                            // This gives us another collection. Enumerate it
                            //  for the strings.
                            hr = pMap->get_areas(&pNewCollection);
                            if (pNewCollection)
                            {
                                hr = EnumCollection((IHTMLElementCollection *)pNewCollection, pStringList, CTYPE_MAP, pfnCB, dwCBData);
                                pNewCollection->Release();
                            }
                            pMap->Release();
                        }
                        break;

                    case CTYPE_MAP:     // get href for this area
                        hr = pDispItem->QueryInterface(IID_IHTMLAreaElement, (void **)&pArea);

                        if (SUCCEEDED(hr) && pArea)
                        {
                            hr = pArea->get_href(&bstrItem);
                            pArea->Release();
                        }
                        break;

                    case CTYPE_META:    // get meta name and content as single string
                        hr = pDispItem->QueryInterface(IID_IHTMLMetaElement, (void **)&pMeta);

                        if (SUCCEEDED(hr) && pMeta)
                        {
                            pMeta->get_name(&bstrTmp1);
                            pMeta->get_content(&bstrTmp2);
                            if (bstrTmp1 && bstrTmp2 && *bstrTmp1 && *bstrTmp2)
                            {
                                int nLen = lstrlenW(bstrTmp1) + lstrlenW(bstrTmp2) + 2;
                                bstrItem = SysAllocStringLen(NULL, nLen);

                                StrCpyNW(bstrItem, bstrTmp1, nLen);
                                StrCatBuffW(bstrItem, L"\n", nLen);
                                StrCatBuffW(bstrItem, bstrTmp2, nLen);
                            }
                            SysFreeString(bstrTmp1);
                            SysFreeString(bstrTmp2);
                            pMeta->Release();
                        }
                        break;

                    case CTYPE_FRAMES:      // get "src" attribute
                        hr = pDispItem->QueryInterface(IID_IHTMLElement, (void **)&pEle);

                        if (SUCCEEDED(hr) && pEle)
                        {
                            bstrTmp1 = SysAllocString(L"SRC");

                            if (bstrTmp1)
                            {
                                hr = pEle->getAttribute(bstrTmp1, VARIANT_FALSE, &vData);
                                if (SUCCEEDED(hr) && vData.vt == VT_BSTR)
                                {
                                    bstrItem = vData.bstrVal;
                                    vData.vt = VT_EMPTY;
                                }
                                else
                                    VariantClear(&vData);

                                SysFreeString(bstrTmp1);
                            }
                            else
                            {
                                hr = E_FAIL;
                            }

                            pEle->Release();
                        }
                        break;

                    default:
                        ASSERT(0);
                        // bug in calling code
                }

                if (SUCCEEDED(hr) && bstrItem)
                {
                    // Verify we want to add this item to string list & get data
                    if (pfnCB)
                        hr = pfnCB(pDispItem, &bstrItem, dwCBData, &dwStringData);

                    if (SUCCEEDED(hr) && bstrItem && pStringList)
                        pStringList->AddString(bstrItem, dwStringData);
                }
                SAFERELEASE(pDispItem);
                SAFEFREEBSTR(bstrItem);
            }
        }
        if (E_ABORT == hr)
        {
            DBG_WARN("Aborting enumeration in CHelperOM::EnumCollection at callback's request.");
            break;
        }
    }

    return hr;
}


// Gets all urls from a collection, recursing through frames
HRESULT CHelperOM::GetCollection(
    IHTMLDocument2 *pDoc,
    CWCStringList  *pStringList,
    CollectionType  Type,
    PFN_CB          pfnCB,
    DWORD_PTR       dwCBData)
{
    HRESULT         hr;

    // Get the collection from the document
    ASSERT(pDoc);
    ASSERT(pStringList || pfnCB);

    hr = _GetCollection(pDoc, pStringList, Type, pfnCB, dwCBData);

    return hr;
}

// get all urls from a collection
HRESULT CHelperOM::_GetCollection(
    IHTMLDocument2 *pDoc,
    CWCStringList  *pStringList,
    CollectionType  Type,
    PFN_CB          pfnCB,
    DWORD_PTR       dwCBData)
{
    HRESULT         hr;
    IHTMLElementCollection *pCollection=NULL;

    // From IHTMLDocument2 we get IHTMLElementCollection, then enumerate for the urls

    // Get appropriate collection from document
    switch (Type)
    {
        case CTYPE_LINKS:
            hr = pDoc->get_links(&pCollection);
            break;
        case CTYPE_MAPS:
            hr = GetTagCollection(pDoc, L"map", &pCollection);
            break;
        case CTYPE_META:
            hr = GetTagCollection(pDoc, L"meta", &pCollection);
            break;
        case CTYPE_FRAMES:
            hr = GetTagCollection(pDoc, L"frame", &pCollection);
            break;

        default:
            hr = E_FAIL;
    }
    if (!pCollection) hr=E_NOINTERFACE;
#ifdef DEBUG
    if (FAILED(hr)) DBG_WARN("CWebCrawler::_GetCollection:  get_collection failed");
#endif

    if (SUCCEEDED(hr))
    {
        hr = EnumCollection(pCollection, pStringList, Type, pfnCB, dwCBData);

        // If we're getting frames, we need to enum "iframe" tags separately
        if (SUCCEEDED(hr) && (Type == CTYPE_FRAMES))
        {
            SAFERELEASE(pCollection);
            hr = GetTagCollection(pDoc, L"iframe", &pCollection);

            if (SUCCEEDED(hr) && pCollection)
            {
                hr = EnumCollection(pCollection, pStringList, Type, pfnCB, dwCBData);
            }
        }
    }

    if (pCollection)
        pCollection->Release();

    return hr;
}

extern HRESULT LoadWithCookie(LPCTSTR, POOEBuf, DWORD *, SUBSCRIPTIONCOOKIE *);

// IExtractIcon members
STDMETHODIMP CWebCrawler::GetIconLocation(UINT uFlags, LPTSTR szIconFile, UINT cchMax, int * piIndex, UINT * pwFlags)
{
    IUniformResourceLocator* pUrl = NULL;
    IExtractIcon* pUrlIcon = NULL;
    HRESULT hr = S_OK;
    BOOL bCalledCoInit = FALSE;

    if (!szIconFile || !piIndex || !pwFlags)
        return E_INVALIDARG;
    //zero out return values in case one of the COM calls fails...
    *szIconFile = 0;
    *piIndex = -1;

    if (!m_pBuf)    {
        m_pBuf = (POOEBuf)MemAlloc(LPTR, sizeof(OOEBuf));
        if (!m_pBuf)
            return E_OUTOFMEMORY;

        DWORD   dwSize;
        hr = LoadWithCookie(NULL, m_pBuf, &dwSize, &m_SubscriptionCookie);
        RETURN_ON_FAILURE(hr);
    }


    if (m_pBuf->bDesktop)
    {
        StrCpyN(szIconFile, TEXT(":desktop:"), cchMax);
    }
    else
    {
        if (m_pUrlIconHelper)
        {
            hr = m_pUrlIconHelper->GetIconLocation (uFlags, szIconFile, cchMax, piIndex, pwFlags);
        }
        else
        {
            hr = CoCreateInstance (CLSID_InternetShortcut, NULL, CLSCTX_INPROC_SERVER, IID_IUniformResourceLocator, (void**)&pUrl);
            if ((CO_E_NOTINITIALIZED == hr || REGDB_E_IIDNOTREG == hr) &&
                SUCCEEDED (CoInitialize(NULL)))
            {
                bCalledCoInit = TRUE;
                hr = CoCreateInstance (CLSID_InternetShortcut, NULL, CLSCTX_INPROC_SERVER, IID_IUniformResourceLocator, (void**)&pUrl);
            }

            if (SUCCEEDED (hr))
            {
                hr = pUrl->SetURL (m_pBuf->m_URL, 1);
                if (SUCCEEDED (hr))
                {
                    hr = pUrl->QueryInterface (IID_IExtractIcon, (void**)&pUrlIcon);
                    if (SUCCEEDED (hr))
                    {
                        hr = pUrlIcon->GetIconLocation (uFlags, szIconFile, cchMax, piIndex, pwFlags);

                        //pUrlIcon->Release();  //released in destructor
                        ASSERT (m_pUrlIconHelper == NULL);
                        m_pUrlIconHelper = pUrlIcon;
                    }
                }
                pUrl->Release();
            }

            //balance CoInit with CoUnit
            //(we still have a pointer to the CLSID_InternetShortcut object, m_pUrlIconHelper,
            //but since that code is in shdocvw there's no danger of it getting unloaded and
            //invalidating our pointer, sez cdturner.)
            if (bCalledCoInit)
                CoUninitialize();
        }
    }

    return hr;
}

STDMETHODIMP CWebCrawler::Extract(LPCTSTR szIconFile, UINT nIconIndex, HICON * phiconLarge, HICON * phiconSmall, UINT nIconSize)
{
    HRESULT hr = S_OK;

    if (!phiconLarge || !phiconSmall)
        return E_INVALIDARG;

    //zero out return values in case one of the COM calls fails...
    *phiconLarge = NULL;
    *phiconSmall = NULL;

    if ((NULL != m_pBuf) && (m_pBuf->bDesktop))
    {
        LoadDefaultIcons();
        *phiconLarge = *phiconSmall = g_desktopIcon;
    }
    else
    {
        if (!m_pUrlIconHelper)
            return E_FAIL;

        hr = m_pUrlIconHelper->Extract (szIconFile, nIconIndex, phiconLarge, phiconSmall, nIconSize);
    }

    return hr;
}