// Declarations for the web crawler delivery agent (CWebCrawler) and the
// MSHTML object-model helper (CHelperOM).
// Include guard: the matching #endif is at the end of the file.
#ifndef _WEBCRAWL_H
#define _WEBCRAWL_H

// String-list support used by the declarations below (CWCStringList).
#include "strlist.h"

//////////////////////////////////////////////////////////////////////////
//
// Webcrawler object
//
//////////////////////////////////////////////////////////////////////////
class CCodeBaseHold
|
|
{
|
|
public:
|
|
LPWSTR szDistUnit;
|
|
DWORD dwVersionMS;
|
|
DWORD dwVersionLS;
|
|
DWORD dwFlags;
|
|
};
|
|
|
|
class CWebCrawler : public CDeliveryAgent,
|
|
public CUrlDownloadSink,
|
|
public CRunDeliveryAgentSink
|
|
{
|
|
protected:
|
|
class CDownloadNotify;
|
|
|
|
public:
|
|
// internal flag used to run in offline mode
|
|
enum { WEBCRAWL_PRIV_OFFLINE_MODE = 0x80000000 };
|
|
protected:
|
|
// properties
|
|
BSTR m_bstrBaseURL;
|
|
DWORD m_dwRecurseFlags;
|
|
DWORD m_dwRecurseLevels;
|
|
DWORD m_dwMaxSize;
|
|
LPTSTR m_pszLocalDest; // local destination (instead of cache)
|
|
|
|
// other data
|
|
CWCStringList *m_pPages; // always valid during update.
|
|
CWCStringList *m_pRobotsTxt; // array of robots.txt arrays, may be NULL
|
|
CWCStringList *m_pPendingLinks; // Links from last page to be added to m_pPages
|
|
CWCStringList *m_pDependencyLinks;// Links from last page to be downloaded now
|
|
CWCStringList *m_pCodeBaseList; // List of CodeBase URL's to Crawl
|
|
// Dword is ptr to CCodeBaseHold
|
|
|
|
CRITICAL_SECTION m_critDependencies;
|
|
HRESULT m_hrCritDependencies;
|
|
CWCStringList *m_pDependencies; // all dependencies downloaded
|
|
int m_iDependenciesProcessed;
|
|
|
|
DWORD m_dwPendingRecurseLevel; // # to recurse from pending links
|
|
|
|
DWORD m_dwCurSize; // currently downloaded in BYTES
|
|
|
|
GROUPID m_llCacheGroupID;
|
|
GROUPID m_llOldCacheGroupID;
|
|
|
|
IExtractIcon* m_pUrlIconHelper;
|
|
|
|
int m_iPagesStarted; // # m_pPages started
|
|
int m_iRobotsStarted; // # m_pRobotsTxt started
|
|
int m_iDependencyStarted;// # m_pDependencyLinks started
|
|
int m_iTotalStarted; // # any toplevel url started
|
|
int m_iCodeBaseStarted; // # of codebases started
|
|
|
|
BSTR m_bstrHostName; // host name from first url
|
|
|
|
long m_lMaxNumUrls; // is -1 until we know total # pages
|
|
|
|
int m_iDownloadErrors; // have we had any download failures?
|
|
int m_iSkippedByRobotsTxt; // how many skipped by robots.txt?
|
|
|
|
CUrlDownload *m_pCurDownload; // current download
|
|
CDownloadNotify *m_pDownloadNotify; // to get urls downloaded on a page
|
|
|
|
int m_iCurDownloadStringIndex;
|
|
CWCStringList *m_pCurDownloadStringList; // can be: m_pRobotsTxt, Pages, CodeBaseList
|
|
|
|
int m_iNumPagesDownloading; // 0 or 1
|
|
|
|
BOOL m_fHasInitCookie; // One time deal, don't try again.
|
|
|
|
// For change detection
|
|
VARIANT m_varChange;
|
|
|
|
CRunDeliveryAgent *m_pRunAgent; // host CDL/Channel agent
|
|
BOOL m_fCDFDownloadInProgress;
|
|
|
|
// other flags
|
|
enum {
|
|
FLAG_CRAWLCHANGED = 0x80000000, // have we found a change in the crawl?
|
|
FLAG_HEADONLY = 0x40000000, // should we only get the HEAD data?
|
|
};
|
|
|
|
// private member functions
|
|
BOOL IsRecurseFlagSet(DWORD dwFlag) { return dwFlag & m_dwRecurseFlags; }
|
|
|
|
static HRESULT CheckLink(IUnknown *punkItem, BSTR *pbstrItem, DWORD_PTR dwThis, DWORD *pdwStringData);
|
|
static HRESULT CheckFrame(IUnknown *punkItem, BSTR *pbstrItem, DWORD_PTR dwBaseUrl, DWORD *pdwStringData);
|
|
static HRESULT CheckImageOrLink(IUnknown *punkItem, BSTR *pbstrItem, DWORD_PTR dwEnumDep, DWORD *pdwStringData);
|
|
|
|
HRESULT MatchNames(BSTR bstrName, BOOL fPassword);
|
|
HRESULT FindAndSubmitForm(void);
|
|
|
|
void CheckOperationComplete(BOOL fOperationComplete);
|
|
|
|
void FreeRobotsTxt();
|
|
void FreeCodeBaseList();
|
|
|
|
private:
|
|
~CWebCrawler(void);
|
|
CWebCrawler(void);
|
|
HRESULT Initialize();
|
|
|
|
public:
|
|
static HRESULT CreateInstance(IUnknown *punkOuter, IUnknown **ppunk);
|
|
|
|
// CUrlDownloadSink
|
|
HRESULT OnDownloadComplete(UINT iID, int iError);
|
|
HRESULT OnClientPull(UINT iID, LPCWSTR pwszOldURL, LPCWSTR pwszNewURL);
|
|
HRESULT OnAuthenticate(HWND *phwnd, LPWSTR *ppszUsername, LPWSTR *ppszPassword);
|
|
HRESULT OnOleCommandTargetExec(const GUID *pguidCmdGroup, DWORD nCmdID,
|
|
DWORD nCmdexecopt, VARIANTARG *pvarargIn,
|
|
VARIANTARG *pvarargOut);
|
|
HRESULT GetDownloadNotify(IDownloadNotify **ppOut);
|
|
|
|
// virtual functions overriding CDeliveryAgent
|
|
HRESULT AgentPause(DWORD dwFlags);
|
|
HRESULT AgentResume(DWORD dwFlags);
|
|
HRESULT AgentAbort(DWORD dwFlags);
|
|
STDMETHODIMP GetIconLocation(UINT, LPTSTR, UINT, int *, UINT *);
|
|
STDMETHODIMP Extract(LPCTSTR, UINT, HICON *, HICON *, UINT);
|
|
|
|
// CRunDeliveryAgentSink
|
|
HRESULT OnAgentEnd(const SUBSCRIPTIONCOOKIE *, long, HRESULT, LPCWSTR, BOOL);
|
|
|
|
protected:
|
|
// CDeliveryAgent overrides
|
|
HRESULT ModifyUpdateEnd(ISubscriptionItem *pEndItem, UINT *puiRes);
|
|
HRESULT StartOperation();
|
|
HRESULT StartDownload();
|
|
void CleanUp();
|
|
|
|
void _CleanUp();
|
|
|
|
// members used during download
|
|
HRESULT GetRealUrl(int iPageIndex, LPWSTR *ppwszThisUrl);
|
|
HRESULT MakePageStickyAndGetSize(LPCWSTR pwszURL, DWORD *pdwSize, BOOL *pfDiskFull);
|
|
HRESULT GetLinksFromPage();
|
|
HRESULT GetDependencyLinksFromPage(LPCWSTR pwszThisUrl, DWORD dwRecurse);
|
|
HRESULT ProcessDependencyLinks(CWCStringList **ppslUrls, int *piStarted);
|
|
HRESULT ProcessPendingLinks();
|
|
HRESULT ParseRobotsTxt(LPCWSTR pwszRobotsTxtURL, CWCStringList **ppslRet);
|
|
HRESULT GetRobotsTxtIndex(LPCWSTR pwszUrl, BOOL fAddToList, DWORD *pdwRobotsTxtIndex);
|
|
HRESULT ValidateWithRobotsTxt(LPCWSTR pwszUrl, int iRobotsIndex, BOOL *pfAllow);
|
|
|
|
|
|
HRESULT StartNextDownload();
|
|
HRESULT StartCDFDownload(WCHAR *pwszCDFURL, WCHAR *pwszBaseUrl);
|
|
HRESULT ActuallyStartDownload(CWCStringList *pslUrls, int iIndex, BOOL fReStart=FALSE);
|
|
HRESULT ActuallyDownloadCodeBase(CWCStringList *pslUrls, int iIndex, BOOL fReStart=FALSE);
|
|
|
|
static HRESULT GetHostName(LPCWSTR pwszThisUrl, BSTR *pbstrHostName);
|
|
|
|
inline HRESULT GetChannelItem(ISubscriptionItem **ppChannelItem);
|
|
|
|
public:
|
|
// Callbacks from CDownloadNotify (free threaded)
|
|
HRESULT DownloadStart(LPCWSTR pchUrl, DWORD dwDownloadId, DWORD dwType, DWORD dwReserved);
|
|
HRESULT DownloadComplete(DWORD dwDownloadId, HRESULT hrNotify, DWORD dwReserved);
|
|
|
|
protected:
|
|
class CDownloadNotify : public IDownloadNotify
|
|
{
|
|
public:
|
|
CDownloadNotify(CWebCrawler *pParent);
|
|
~CDownloadNotify();
|
|
HRESULT Initialize();
|
|
|
|
void LeaveMeAlone();
|
|
|
|
protected:
|
|
long m_cRef;
|
|
CWebCrawler *m_pParent; // we keep a reference
|
|
CRITICAL_SECTION m_critParent;
|
|
HRESULT m_hrCritParent;
|
|
|
|
public:
|
|
// IUnknown members
|
|
STDMETHODIMP QueryInterface(REFIID riid, void **ppunk);
|
|
STDMETHODIMP_(ULONG) AddRef(void);
|
|
STDMETHODIMP_(ULONG) Release(void);
|
|
|
|
// IDownloadNotify
|
|
STDMETHODIMP DownloadStart(LPCWSTR pchUrl, DWORD dwDownloadId, DWORD dwType, DWORD dwReserved);
|
|
STDMETHODIMP DownloadComplete(DWORD dwDownloadId, HRESULT hrNotify, DWORD dwReserved);
|
|
};
|
|
};
|
|
|
|
//////////////////////////////////////////////////////////////////////////
//
// CHelperOM
//
// Helps with MSHTML object model
//////////////////////////////////////////////////////////////////////////

class CHelperOM
|
|
{
|
|
IHTMLDocument2 *m_pDoc;
|
|
|
|
public:
|
|
typedef enum {
|
|
CTYPE_LINKS, // Get all links (<a href>) on a page
|
|
CTYPE_MAPS, // Get all maps on page
|
|
CTYPE_MAP, // Get all links within a map
|
|
CTYPE_META, // Get meta tags (name\ncontent)
|
|
CTYPE_FRAMES, // Get all frame urls on a page
|
|
} CollectionType;
|
|
|
|
typedef HRESULT (*PFNHELPERCALLBACK)(IUnknown *punkItem, /*inout*/BSTR *pbstrURL, DWORD_PTR dwCBData, DWORD *pdwStringData);
|
|
typedef PFNHELPERCALLBACK PFN_CB;
|
|
|
|
public:
|
|
CHelperOM(IHTMLDocument2 *pDoc);
|
|
~CHelperOM();
|
|
|
|
static HRESULT GetTagCollection(
|
|
IHTMLDocument2 *pDoc,
|
|
LPCWSTR wszTagName,
|
|
IHTMLElementCollection **ppCollection);
|
|
|
|
// static HRESULT WinFromDoc(IHTMLDocument2 *pDoc, IHTMLWindow2 **ppWin);
|
|
|
|
static HRESULT GetCollection (IHTMLDocument2 *pDoc, CWCStringList *psl, CollectionType Type, PFN_CB pfnCB, DWORD_PTR dwData);
|
|
static HRESULT EnumCollection(IHTMLElementCollection *pCollection,
|
|
CWCStringList *pStringList, CollectionType Type, PFN_CB pfnCB, DWORD_PTR dwData);
|
|
|
|
HRESULT GetTagCollection(LPCWSTR wszTagName, IHTMLElementCollection **ppCollection)
|
|
{ return GetTagCollection(m_pDoc, wszTagName, ppCollection); }
|
|
HRESULT GetCollection(CWCStringList *psl, CollectionType Type, PFN_CB pfnCB, DWORD_PTR dwData)
|
|
{ return GetCollection(m_pDoc, psl, Type, pfnCB, dwData); }
|
|
|
|
protected:
|
|
static HRESULT _GetCollection(IHTMLDocument2 *pDoc, CWCStringList *psl, CollectionType Type, PFN_CB pfnCB, DWORD_PTR dwData);
|
|
};
|
|
|
|
#endif // _WEBCRAWL_H