Source code of Windows XP (NT5)
You can not select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

242 lines
9.1 KiB

  1. #ifndef _WEBCRAWL_H
  2. #define _WEBCRAWL_H
  3. #include "strlist.h"
  4. //////////////////////////////////////////////////////////////////////////
  5. //
  6. // Webcrawler object
  7. //
  8. //////////////////////////////////////////////////////////////////////////
  9. class CCodeBaseHold
  10. {
  11. public:
  12. LPWSTR szDistUnit;
  13. DWORD dwVersionMS;
  14. DWORD dwVersionLS;
  15. DWORD dwFlags;
  16. };
  17. class CWebCrawler : public CDeliveryAgent,
  18. public CUrlDownloadSink,
  19. public CRunDeliveryAgentSink
  20. {
  21. protected:
  22. class CDownloadNotify;
  23. public:
  24. // internal flag used to run in offline mode
  25. enum { WEBCRAWL_PRIV_OFFLINE_MODE = 0x80000000 };
  26. protected:
  27. // properties
  28. BSTR m_bstrBaseURL;
  29. DWORD m_dwRecurseFlags;
  30. DWORD m_dwRecurseLevels;
  31. DWORD m_dwMaxSize;
  32. LPTSTR m_pszLocalDest; // local destination (instead of cache)
  33. // other data
  34. CWCStringList *m_pPages; // always valid during update.
  35. CWCStringList *m_pRobotsTxt; // array of robots.txt arrays, may be NULL
  36. CWCStringList *m_pPendingLinks; // Links from last page to be added to m_pPages
  37. CWCStringList *m_pDependencyLinks;// Links from last page to be downloaded now
  38. CWCStringList *m_pCodeBaseList; // List of CodeBase URL's to Crawl
  39. // Dword is ptr to CCodeBaseHold
  40. CRITICAL_SECTION m_critDependencies;
  41. CWCStringList *m_pDependencies; // all dependencies downloaded
  42. int m_iDependenciesProcessed;
  43. DWORD m_dwPendingRecurseLevel; // # to recurse from pending links
  44. DWORD m_dwCurSize; // currently downloaded in BYTES
  45. GROUPID m_llCacheGroupID;
  46. GROUPID m_llOldCacheGroupID;
  47. IExtractIcon* m_pUrlIconHelper;
  48. int m_iPagesStarted; // # m_pPages started
  49. int m_iRobotsStarted; // # m_pRobotsTxt started
  50. int m_iDependencyStarted;// # m_pDependencyLinks started
  51. int m_iTotalStarted; // # any toplevel url started
  52. int m_iCodeBaseStarted; // # of codebases started
  53. BSTR m_bstrHostName; // host name from first url
  54. long m_lMaxNumUrls; // is -1 until we know total # pages
  55. int m_iDownloadErrors; // have we had any download failures?
  56. int m_iSkippedByRobotsTxt; // how many skipped by robots.txt?
  57. CUrlDownload *m_pCurDownload; // current download
  58. CDownloadNotify *m_pDownloadNotify; // to get urls downloaded on a page
  59. int m_iCurDownloadStringIndex;
  60. CWCStringList *m_pCurDownloadStringList; // can be: m_pRobotsTxt, Pages, CodeBaseList
  61. int m_iNumPagesDownloading; // 0 or 1
  62. BOOL m_fHasInitCookie; // One time deal, don't try again.
  63. // For change detection
  64. VARIANT m_varChange;
  65. CRunDeliveryAgent *m_pRunAgent; // host CDL/Channel agent
  66. BOOL m_fCDFDownloadInProgress;
  67. // other flags
  68. enum {
  69. FLAG_CRAWLCHANGED = 0x80000000, // have we found a change in the crawl?
  70. FLAG_HEADONLY = 0x40000000, // should we only get the HEAD data?
  71. };
  72. // private member functions
  73. BOOL IsRecurseFlagSet(DWORD dwFlag) { return dwFlag & m_dwRecurseFlags; }
  74. static HRESULT CheckLink(IUnknown *punkItem, BSTR *pbstrItem, DWORD_PTR dwThis, DWORD *pdwStringData);
  75. static HRESULT CheckFrame(IUnknown *punkItem, BSTR *pbstrItem, DWORD_PTR dwBaseUrl, DWORD *pdwStringData);
  76. static HRESULT CheckImageOrLink(IUnknown *punkItem, BSTR *pbstrItem, DWORD_PTR dwEnumDep, DWORD *pdwStringData);
  77. HRESULT MatchNames(BSTR bstrName, BOOL fPassword);
  78. HRESULT FindAndSubmitForm(void);
  79. void CheckOperationComplete(BOOL fOperationComplete);
  80. void FreeRobotsTxt();
  81. void FreeCodeBaseList();
  82. private:
  83. ~CWebCrawler(void);
  84. public:
  85. CWebCrawler(void);
  86. // CUrlDownloadSink
  87. HRESULT OnDownloadComplete(UINT iID, int iError);
  88. HRESULT OnClientPull(UINT iID, LPCWSTR pwszOldURL, LPCWSTR pwszNewURL);
  89. HRESULT OnAuthenticate(HWND *phwnd, LPWSTR *ppszUsername, LPWSTR *ppszPassword);
  90. HRESULT OnOleCommandTargetExec(const GUID *pguidCmdGroup, DWORD nCmdID,
  91. DWORD nCmdexecopt, VARIANTARG *pvarargIn,
  92. VARIANTARG *pvarargOut);
  93. HRESULT GetDownloadNotify(IDownloadNotify **ppOut);
  94. // virtual functions overriding CDeliveryAgent
  95. HRESULT AgentPause(DWORD dwFlags);
  96. HRESULT AgentResume(DWORD dwFlags);
  97. HRESULT AgentAbort(DWORD dwFlags);
  98. STDMETHODIMP GetIconLocation(UINT, LPTSTR, UINT, int *, UINT *);
  99. STDMETHODIMP Extract(LPCTSTR, UINT, HICON *, HICON *, UINT);
  100. // CRunDeliveryAgentSink
  101. HRESULT OnAgentEnd(const SUBSCRIPTIONCOOKIE *, long, HRESULT, LPCWSTR, BOOL);
  102. protected:
  103. // CDeliveryAgent overrides
  104. HRESULT ModifyUpdateEnd(ISubscriptionItem *pEndItem, UINT *puiRes);
  105. HRESULT StartOperation();
  106. HRESULT StartDownload();
  107. void CleanUp();
  108. void _CleanUp();
  109. // members used during download
  110. HRESULT GetRealUrl(int iPageIndex, LPWSTR *ppwszThisUrl);
  111. HRESULT MakePageStickyAndGetSize(LPCWSTR pwszURL, DWORD *pdwSize, BOOL *pfDiskFull);
  112. HRESULT GetLinksFromPage();
  113. HRESULT GetDependencyLinksFromPage(LPCWSTR pwszThisUrl, DWORD dwRecurse);
  114. HRESULT ProcessDependencyLinks(CWCStringList **ppslUrls, int *piStarted);
  115. HRESULT ProcessPendingLinks();
  116. HRESULT ParseRobotsTxt(LPCWSTR pwszRobotsTxtURL, CWCStringList **ppslRet);
  117. HRESULT GetRobotsTxtIndex(LPCWSTR pwszUrl, BOOL fAddToList, DWORD *pdwRobotsTxtIndex);
  118. HRESULT ValidateWithRobotsTxt(LPCWSTR pwszUrl, int iRobotsIndex, BOOL *pfAllow);
  119. HRESULT StartNextDownload();
  120. HRESULT StartCDFDownload(WCHAR *pwszCDFURL, WCHAR *pwszBaseUrl);
  121. HRESULT ActuallyStartDownload(CWCStringList *pslUrls, int iIndex, BOOL fReStart=FALSE);
  122. HRESULT ActuallyDownloadCodeBase(CWCStringList *pslUrls, int iIndex, BOOL fReStart=FALSE);
  123. static HRESULT GetHostName(LPCWSTR pwszThisUrl, BSTR *pbstrHostName);
  124. inline HRESULT GetChannelItem(ISubscriptionItem **ppChannelItem);
  125. public:
  126. // Callbacks from CDownloadNotify (free threaded)
  127. HRESULT DownloadStart(LPCWSTR pchUrl, DWORD dwDownloadId, DWORD dwType, DWORD dwReserved);
  128. HRESULT DownloadComplete(DWORD dwDownloadId, HRESULT hrNotify, DWORD dwReserved);
  129. protected:
  130. class CDownloadNotify : public IDownloadNotify
  131. {
  132. public:
  133. CDownloadNotify(CWebCrawler *pParent);
  134. ~CDownloadNotify();
  135. void LeaveMeAlone();
  136. protected:
  137. long m_cRef;
  138. CWebCrawler *m_pParent; // we keep a reference
  139. CRITICAL_SECTION m_critParent;
  140. public:
  141. // IUnknown members
  142. STDMETHODIMP QueryInterface(REFIID riid, void **ppunk);
  143. STDMETHODIMP_(ULONG) AddRef(void);
  144. STDMETHODIMP_(ULONG) Release(void);
  145. // IDownloadNotify
  146. STDMETHODIMP DownloadStart(LPCWSTR pchUrl, DWORD dwDownloadId, DWORD dwType, DWORD dwReserved);
  147. STDMETHODIMP DownloadComplete(DWORD dwDownloadId, HRESULT hrNotify, DWORD dwReserved);
  148. };
  149. };
  150. //////////////////////////////////////////////////////////////////////////
  151. //
  152. // CHelperOM
  153. //
  154. // Helps with MSHTML object model
  155. //////////////////////////////////////////////////////////////////////////
  156. class CHelperOM
  157. {
  158. IHTMLDocument2 *m_pDoc;
  159. public:
  160. typedef enum {
  161. CTYPE_LINKS, // Get all links (<a href>) on a page
  162. CTYPE_MAPS, // Get all maps on page
  163. CTYPE_MAP, // Get all links within a map
  164. CTYPE_META, // Get meta tags (name\ncontent)
  165. CTYPE_FRAMES, // Get all frame urls on a page
  166. } CollectionType;
  167. typedef HRESULT (*PFNHELPERCALLBACK)(IUnknown *punkItem, /*inout*/BSTR *pbstrURL, DWORD_PTR dwCBData, DWORD *pdwStringData);
  168. typedef PFNHELPERCALLBACK PFN_CB;
  169. public:
  170. CHelperOM(IHTMLDocument2 *pDoc);
  171. ~CHelperOM();
  172. static HRESULT GetTagCollection(
  173. IHTMLDocument2 *pDoc,
  174. LPCWSTR wszTagName,
  175. IHTMLElementCollection **ppCollection);
  176. // static HRESULT WinFromDoc(IHTMLDocument2 *pDoc, IHTMLWindow2 **ppWin);
  177. static HRESULT GetCollection (IHTMLDocument2 *pDoc, CWCStringList *psl, CollectionType Type, PFN_CB pfnCB, DWORD_PTR dwData);
  178. static HRESULT EnumCollection(IHTMLElementCollection *pCollection,
  179. CWCStringList *pStringList, CollectionType Type, PFN_CB pfnCB, DWORD_PTR dwData);
  180. HRESULT GetTagCollection(LPCWSTR wszTagName, IHTMLElementCollection **ppCollection)
  181. { return GetTagCollection(m_pDoc, wszTagName, ppCollection); }
  182. HRESULT GetCollection(CWCStringList *psl, CollectionType Type, PFN_CB pfnCB, DWORD_PTR dwData)
  183. { return GetCollection(m_pDoc, psl, Type, pfnCB, dwData); }
  184. protected:
  185. static HRESULT _GetCollection(IHTMLDocument2 *pDoc, CWCStringList *psl, CollectionType Type, PFN_CB pfnCB, DWORD_PTR dwData);
  186. };
  187. #endif _WEBCRAWL_H