#ifndef _WEBCRAWL_H #define _WEBCRAWL_H #include "strlist.h" ////////////////////////////////////////////////////////////////////////// // // Webcrawler object // ////////////////////////////////////////////////////////////////////////// class CCodeBaseHold { public: LPWSTR szDistUnit; DWORD dwVersionMS; DWORD dwVersionLS; DWORD dwFlags; }; class CWebCrawler : public CDeliveryAgent, public CUrlDownloadSink, public CRunDeliveryAgentSink { protected: class CDownloadNotify; public: // internal flag used to run in offline mode enum { WEBCRAWL_PRIV_OFFLINE_MODE = 0x80000000 }; protected: // properties BSTR m_bstrBaseURL; DWORD m_dwRecurseFlags; DWORD m_dwRecurseLevels; DWORD m_dwMaxSize; LPTSTR m_pszLocalDest; // local destination (instead of cache) // other data CWCStringList *m_pPages; // always valid during update. CWCStringList *m_pRobotsTxt; // array of robots.txt arrays, may be NULL CWCStringList *m_pPendingLinks; // Links from last page to be added to m_pPages CWCStringList *m_pDependencyLinks;// Links from last page to be downloaded now CWCStringList *m_pCodeBaseList; // List of CodeBase URL's to Crawl // Dword is ptr to CCodeBaseHold CRITICAL_SECTION m_critDependencies; HRESULT m_hrCritDependencies; CWCStringList *m_pDependencies; // all dependencies downloaded int m_iDependenciesProcessed; DWORD m_dwPendingRecurseLevel; // # to recurse from pending links DWORD m_dwCurSize; // currently downloaded in BYTES GROUPID m_llCacheGroupID; GROUPID m_llOldCacheGroupID; IExtractIcon* m_pUrlIconHelper; int m_iPagesStarted; // # m_pPages started int m_iRobotsStarted; // # m_pRobotsTxt started int m_iDependencyStarted;// # m_pDependencyLinks started int m_iTotalStarted; // # any toplevel url started int m_iCodeBaseStarted; // # of codebases started BSTR m_bstrHostName; // host name from first url long m_lMaxNumUrls; // is -1 until we know total # pages int m_iDownloadErrors; // have we had any download failures? int m_iSkippedByRobotsTxt; // how many skipped by robots.txt? CUrlDownload *m_pCurDownload; // current download CDownloadNotify *m_pDownloadNotify; // to get urls downloaded on a page int m_iCurDownloadStringIndex; CWCStringList *m_pCurDownloadStringList; // can be: m_pRobotsTxt, Pages, CodeBaseList int m_iNumPagesDownloading; // 0 or 1 BOOL m_fHasInitCookie; // One time deal, don't try again. // For change detection VARIANT m_varChange; CRunDeliveryAgent *m_pRunAgent; // host CDL/Channel agent BOOL m_fCDFDownloadInProgress; // other flags enum { FLAG_CRAWLCHANGED = 0x80000000, // have we found a change in the crawl? FLAG_HEADONLY = 0x40000000, // should we only get the HEAD data? }; // private member functions BOOL IsRecurseFlagSet(DWORD dwFlag) { return dwFlag & m_dwRecurseFlags; } static HRESULT CheckLink(IUnknown *punkItem, BSTR *pbstrItem, DWORD_PTR dwThis, DWORD *pdwStringData); static HRESULT CheckFrame(IUnknown *punkItem, BSTR *pbstrItem, DWORD_PTR dwBaseUrl, DWORD *pdwStringData); static HRESULT CheckImageOrLink(IUnknown *punkItem, BSTR *pbstrItem, DWORD_PTR dwEnumDep, DWORD *pdwStringData); HRESULT MatchNames(BSTR bstrName, BOOL fPassword); HRESULT FindAndSubmitForm(void); void CheckOperationComplete(BOOL fOperationComplete); void FreeRobotsTxt(); void FreeCodeBaseList(); private: ~CWebCrawler(void); CWebCrawler(void); HRESULT Initialize(); public: static HRESULT CreateInstance(IUnknown *punkOuter, IUnknown **ppunk); // CUrlDownloadSink HRESULT OnDownloadComplete(UINT iID, int iError); HRESULT OnClientPull(UINT iID, LPCWSTR pwszOldURL, LPCWSTR pwszNewURL); HRESULT OnAuthenticate(HWND *phwnd, LPWSTR *ppszUsername, LPWSTR *ppszPassword); HRESULT OnOleCommandTargetExec(const GUID *pguidCmdGroup, DWORD nCmdID, DWORD nCmdexecopt, VARIANTARG *pvarargIn, VARIANTARG *pvarargOut); HRESULT GetDownloadNotify(IDownloadNotify **ppOut); // virtual functions overriding CDeliveryAgent HRESULT AgentPause(DWORD dwFlags); HRESULT AgentResume(DWORD dwFlags); HRESULT AgentAbort(DWORD dwFlags); STDMETHODIMP GetIconLocation(UINT, LPTSTR, UINT, int *, UINT *); STDMETHODIMP Extract(LPCTSTR, UINT, HICON *, HICON *, UINT); // CRunDeliveryAgentSink HRESULT OnAgentEnd(const SUBSCRIPTIONCOOKIE *, long, HRESULT, LPCWSTR, BOOL); protected: // CDeliveryAgent overrides HRESULT ModifyUpdateEnd(ISubscriptionItem *pEndItem, UINT *puiRes); HRESULT StartOperation(); HRESULT StartDownload(); void CleanUp(); void _CleanUp(); // members used during download HRESULT GetRealUrl(int iPageIndex, LPWSTR *ppwszThisUrl); HRESULT MakePageStickyAndGetSize(LPCWSTR pwszURL, DWORD *pdwSize, BOOL *pfDiskFull); HRESULT GetLinksFromPage(); HRESULT GetDependencyLinksFromPage(LPCWSTR pwszThisUrl, DWORD dwRecurse); HRESULT ProcessDependencyLinks(CWCStringList **ppslUrls, int *piStarted); HRESULT ProcessPendingLinks(); HRESULT ParseRobotsTxt(LPCWSTR pwszRobotsTxtURL, CWCStringList **ppslRet); HRESULT GetRobotsTxtIndex(LPCWSTR pwszUrl, BOOL fAddToList, DWORD *pdwRobotsTxtIndex); HRESULT ValidateWithRobotsTxt(LPCWSTR pwszUrl, int iRobotsIndex, BOOL *pfAllow); HRESULT StartNextDownload(); HRESULT StartCDFDownload(WCHAR *pwszCDFURL, WCHAR *pwszBaseUrl); HRESULT ActuallyStartDownload(CWCStringList *pslUrls, int iIndex, BOOL fReStart=FALSE); HRESULT ActuallyDownloadCodeBase(CWCStringList *pslUrls, int iIndex, BOOL fReStart=FALSE); static HRESULT GetHostName(LPCWSTR pwszThisUrl, BSTR *pbstrHostName); inline HRESULT GetChannelItem(ISubscriptionItem **ppChannelItem); public: // Callbacks from CDownloadNotify (free threaded) HRESULT DownloadStart(LPCWSTR pchUrl, DWORD dwDownloadId, DWORD dwType, DWORD dwReserved); HRESULT DownloadComplete(DWORD dwDownloadId, HRESULT hrNotify, DWORD dwReserved); protected: class CDownloadNotify : public IDownloadNotify { public: CDownloadNotify(CWebCrawler *pParent); ~CDownloadNotify(); HRESULT Initialize(); void LeaveMeAlone(); protected: long m_cRef; CWebCrawler *m_pParent; // we keep a reference CRITICAL_SECTION m_critParent; HRESULT m_hrCritParent; public: // IUnknown members STDMETHODIMP QueryInterface(REFIID riid, void **ppunk); STDMETHODIMP_(ULONG) AddRef(void); STDMETHODIMP_(ULONG) Release(void); // IDownloadNotify STDMETHODIMP DownloadStart(LPCWSTR pchUrl, DWORD dwDownloadId, DWORD dwType, DWORD dwReserved); STDMETHODIMP DownloadComplete(DWORD dwDownloadId, HRESULT hrNotify, DWORD dwReserved); }; }; ////////////////////////////////////////////////////////////////////////// // // CHelperOM // // Helps with MSHTML object model ////////////////////////////////////////////////////////////////////////// class CHelperOM { IHTMLDocument2 *m_pDoc; public: typedef enum { CTYPE_LINKS, // Get all links () on a page CTYPE_MAPS, // Get all maps on page CTYPE_MAP, // Get all links within a map CTYPE_META, // Get meta tags (name\ncontent) CTYPE_FRAMES, // Get all frame urls on a page } CollectionType; typedef HRESULT (*PFNHELPERCALLBACK)(IUnknown *punkItem, /*inout*/BSTR *pbstrURL, DWORD_PTR dwCBData, DWORD *pdwStringData); typedef PFNHELPERCALLBACK PFN_CB; public: CHelperOM(IHTMLDocument2 *pDoc); ~CHelperOM(); static HRESULT GetTagCollection( IHTMLDocument2 *pDoc, LPCWSTR wszTagName, IHTMLElementCollection **ppCollection); // static HRESULT WinFromDoc(IHTMLDocument2 *pDoc, IHTMLWindow2 **ppWin); static HRESULT GetCollection (IHTMLDocument2 *pDoc, CWCStringList *psl, CollectionType Type, PFN_CB pfnCB, DWORD_PTR dwData); static HRESULT EnumCollection(IHTMLElementCollection *pCollection, CWCStringList *pStringList, CollectionType Type, PFN_CB pfnCB, DWORD_PTR dwData); HRESULT GetTagCollection(LPCWSTR wszTagName, IHTMLElementCollection **ppCollection) { return GetTagCollection(m_pDoc, wszTagName, ppCollection); } HRESULT GetCollection(CWCStringList *psl, CollectionType Type, PFN_CB pfnCB, DWORD_PTR dwData) { return GetCollection(m_pDoc, psl, Type, pfnCB, dwData); } protected: static HRESULT _GetCollection(IHTMLDocument2 *pDoc, CWCStringList *psl, CollectionType Type, PFN_CB pfnCB, DWORD_PTR dwData); }; #endif _WEBCRAWL_H