Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

247 lines
9.6 KiB

  1. #ifndef _WEBCRAWL_H
  2. #define _WEBCRAWL_H
  3. #include "strlist.h"
  4. //////////////////////////////////////////////////////////////////////////
  5. //
  6. // Webcrawler object
  7. //
  8. //////////////////////////////////////////////////////////////////////////
  9. class CCodeBaseHold
  10. {
  11. public:
  12. LPWSTR szDistUnit;
  13. DWORD dwVersionMS;
  14. DWORD dwVersionLS;
  15. DWORD dwFlags;
  16. };
  17. class CWebCrawler : public CDeliveryAgent,
  18. public CUrlDownloadSink,
  19. public CRunDeliveryAgentSink
  20. {
  21. protected:
  22. class CDownloadNotify;
  23. public:
  24. // internal flag used to run in offline mode
  25. enum { WEBCRAWL_PRIV_OFFLINE_MODE = 0x80000000 };
  26. protected:
  27. // properties
  28. BSTR m_bstrBaseURL;
  29. DWORD m_dwRecurseFlags;
  30. DWORD m_dwRecurseLevels;
  31. DWORD m_dwMaxSize;
  32. LPTSTR m_pszLocalDest; // local destination (instead of cache)
  33. // other data
  34. CWCStringList *m_pPages; // always valid during update.
  35. CWCStringList *m_pRobotsTxt; // array of robots.txt arrays, may be NULL
  36. CWCStringList *m_pPendingLinks; // Links from last page to be added to m_pPages
  37. CWCStringList *m_pDependencyLinks;// Links from last page to be downloaded now
  38. CWCStringList *m_pCodeBaseList; // List of CodeBase URL's to Crawl
  39. // Dword is ptr to CCodeBaseHold
  40. CRITICAL_SECTION m_critDependencies;
  41. HRESULT m_hrCritDependencies;
  42. CWCStringList *m_pDependencies; // all dependencies downloaded
  43. int m_iDependenciesProcessed;
  44. DWORD m_dwPendingRecurseLevel; // # to recurse from pending links
  45. DWORD m_dwCurSize; // currently downloaded in BYTES
  46. GROUPID m_llCacheGroupID;
  47. GROUPID m_llOldCacheGroupID;
  48. IExtractIcon* m_pUrlIconHelper;
  49. int m_iPagesStarted; // # m_pPages started
  50. int m_iRobotsStarted; // # m_pRobotsTxt started
  51. int m_iDependencyStarted;// # m_pDependencyLinks started
  52. int m_iTotalStarted; // # any toplevel url started
  53. int m_iCodeBaseStarted; // # of codebases started
  54. BSTR m_bstrHostName; // host name from first url
  55. long m_lMaxNumUrls; // is -1 until we know total # pages
  56. int m_iDownloadErrors; // have we had any download failures?
  57. int m_iSkippedByRobotsTxt; // how many skipped by robots.txt?
  58. CUrlDownload *m_pCurDownload; // current download
  59. CDownloadNotify *m_pDownloadNotify; // to get urls downloaded on a page
  60. int m_iCurDownloadStringIndex;
  61. CWCStringList *m_pCurDownloadStringList; // can be: m_pRobotsTxt, Pages, CodeBaseList
  62. int m_iNumPagesDownloading; // 0 or 1
  63. BOOL m_fHasInitCookie; // One time deal, don't try again.
  64. // For change detection
  65. VARIANT m_varChange;
  66. CRunDeliveryAgent *m_pRunAgent; // host CDL/Channel agent
  67. BOOL m_fCDFDownloadInProgress;
  68. // other flags
  69. enum {
  70. FLAG_CRAWLCHANGED = 0x80000000, // have we found a change in the crawl?
  71. FLAG_HEADONLY = 0x40000000, // should we only get the HEAD data?
  72. };
  73. // private member functions
  74. BOOL IsRecurseFlagSet(DWORD dwFlag) { return dwFlag & m_dwRecurseFlags; }
  75. static HRESULT CheckLink(IUnknown *punkItem, BSTR *pbstrItem, DWORD_PTR dwThis, DWORD *pdwStringData);
  76. static HRESULT CheckFrame(IUnknown *punkItem, BSTR *pbstrItem, DWORD_PTR dwBaseUrl, DWORD *pdwStringData);
  77. static HRESULT CheckImageOrLink(IUnknown *punkItem, BSTR *pbstrItem, DWORD_PTR dwEnumDep, DWORD *pdwStringData);
  78. HRESULT MatchNames(BSTR bstrName, BOOL fPassword);
  79. HRESULT FindAndSubmitForm(void);
  80. void CheckOperationComplete(BOOL fOperationComplete);
  81. void FreeRobotsTxt();
  82. void FreeCodeBaseList();
  83. private:
  84. ~CWebCrawler(void);
  85. CWebCrawler(void);
  86. HRESULT Initialize();
  87. public:
  88. static HRESULT CreateInstance(IUnknown *punkOuter, IUnknown **ppunk);
  89. // CUrlDownloadSink
  90. HRESULT OnDownloadComplete(UINT iID, int iError);
  91. HRESULT OnClientPull(UINT iID, LPCWSTR pwszOldURL, LPCWSTR pwszNewURL);
  92. HRESULT OnAuthenticate(HWND *phwnd, LPWSTR *ppszUsername, LPWSTR *ppszPassword);
  93. HRESULT OnOleCommandTargetExec(const GUID *pguidCmdGroup, DWORD nCmdID,
  94. DWORD nCmdexecopt, VARIANTARG *pvarargIn,
  95. VARIANTARG *pvarargOut);
  96. HRESULT GetDownloadNotify(IDownloadNotify **ppOut);
  97. // virtual functions overriding CDeliveryAgent
  98. HRESULT AgentPause(DWORD dwFlags);
  99. HRESULT AgentResume(DWORD dwFlags);
  100. HRESULT AgentAbort(DWORD dwFlags);
  101. STDMETHODIMP GetIconLocation(UINT, LPTSTR, UINT, int *, UINT *);
  102. STDMETHODIMP Extract(LPCTSTR, UINT, HICON *, HICON *, UINT);
  103. // CRunDeliveryAgentSink
  104. HRESULT OnAgentEnd(const SUBSCRIPTIONCOOKIE *, long, HRESULT, LPCWSTR, BOOL);
  105. protected:
  106. // CDeliveryAgent overrides
  107. HRESULT ModifyUpdateEnd(ISubscriptionItem *pEndItem, UINT *puiRes);
  108. HRESULT StartOperation();
  109. HRESULT StartDownload();
  110. void CleanUp();
  111. void _CleanUp();
  112. // members used during download
  113. HRESULT GetRealUrl(int iPageIndex, LPWSTR *ppwszThisUrl);
  114. HRESULT MakePageStickyAndGetSize(LPCWSTR pwszURL, DWORD *pdwSize, BOOL *pfDiskFull);
  115. HRESULT GetLinksFromPage();
  116. HRESULT GetDependencyLinksFromPage(LPCWSTR pwszThisUrl, DWORD dwRecurse);
  117. HRESULT ProcessDependencyLinks(CWCStringList **ppslUrls, int *piStarted);
  118. HRESULT ProcessPendingLinks();
  119. HRESULT ParseRobotsTxt(LPCWSTR pwszRobotsTxtURL, CWCStringList **ppslRet);
  120. HRESULT GetRobotsTxtIndex(LPCWSTR pwszUrl, BOOL fAddToList, DWORD *pdwRobotsTxtIndex);
  121. HRESULT ValidateWithRobotsTxt(LPCWSTR pwszUrl, int iRobotsIndex, BOOL *pfAllow);
  122. HRESULT StartNextDownload();
  123. HRESULT StartCDFDownload(WCHAR *pwszCDFURL, WCHAR *pwszBaseUrl);
  124. HRESULT ActuallyStartDownload(CWCStringList *pslUrls, int iIndex, BOOL fReStart=FALSE);
  125. HRESULT ActuallyDownloadCodeBase(CWCStringList *pslUrls, int iIndex, BOOL fReStart=FALSE);
  126. static HRESULT GetHostName(LPCWSTR pwszThisUrl, BSTR *pbstrHostName);
  127. inline HRESULT GetChannelItem(ISubscriptionItem **ppChannelItem);
  128. public:
  129. // Callbacks from CDownloadNotify (free threaded)
  130. HRESULT DownloadStart(LPCWSTR pchUrl, DWORD dwDownloadId, DWORD dwType, DWORD dwReserved);
  131. HRESULT DownloadComplete(DWORD dwDownloadId, HRESULT hrNotify, DWORD dwReserved);
  132. protected:
  133. class CDownloadNotify : public IDownloadNotify
  134. {
  135. public:
  136. CDownloadNotify(CWebCrawler *pParent);
  137. ~CDownloadNotify();
  138. HRESULT Initialize();
  139. void LeaveMeAlone();
  140. protected:
  141. long m_cRef;
  142. CWebCrawler *m_pParent; // we keep a reference
  143. CRITICAL_SECTION m_critParent;
  144. HRESULT m_hrCritParent;
  145. public:
  146. // IUnknown members
  147. STDMETHODIMP QueryInterface(REFIID riid, void **ppunk);
  148. STDMETHODIMP_(ULONG) AddRef(void);
  149. STDMETHODIMP_(ULONG) Release(void);
  150. // IDownloadNotify
  151. STDMETHODIMP DownloadStart(LPCWSTR pchUrl, DWORD dwDownloadId, DWORD dwType, DWORD dwReserved);
  152. STDMETHODIMP DownloadComplete(DWORD dwDownloadId, HRESULT hrNotify, DWORD dwReserved);
  153. };
  154. };
  155. //////////////////////////////////////////////////////////////////////////
  156. //
  157. // CHelperOM
  158. //
  159. // Helps with MSHTML object model
  160. //////////////////////////////////////////////////////////////////////////
  161. class CHelperOM
  162. {
  163. IHTMLDocument2 *m_pDoc;
  164. public:
  165. typedef enum {
  166. CTYPE_LINKS, // Get all links (<a href>) on a page
  167. CTYPE_MAPS, // Get all maps on page
  168. CTYPE_MAP, // Get all links within a map
  169. CTYPE_META, // Get meta tags (name\ncontent)
  170. CTYPE_FRAMES, // Get all frame urls on a page
  171. } CollectionType;
  172. typedef HRESULT (*PFNHELPERCALLBACK)(IUnknown *punkItem, /*inout*/BSTR *pbstrURL, DWORD_PTR dwCBData, DWORD *pdwStringData);
  173. typedef PFNHELPERCALLBACK PFN_CB;
  174. public:
  175. CHelperOM(IHTMLDocument2 *pDoc);
  176. ~CHelperOM();
  177. static HRESULT GetTagCollection(
  178. IHTMLDocument2 *pDoc,
  179. LPCWSTR wszTagName,
  180. IHTMLElementCollection **ppCollection);
  181. // static HRESULT WinFromDoc(IHTMLDocument2 *pDoc, IHTMLWindow2 **ppWin);
  182. static HRESULT GetCollection (IHTMLDocument2 *pDoc, CWCStringList *psl, CollectionType Type, PFN_CB pfnCB, DWORD_PTR dwData);
  183. static HRESULT EnumCollection(IHTMLElementCollection *pCollection,
  184. CWCStringList *pStringList, CollectionType Type, PFN_CB pfnCB, DWORD_PTR dwData);
  185. HRESULT GetTagCollection(LPCWSTR wszTagName, IHTMLElementCollection **ppCollection)
  186. { return GetTagCollection(m_pDoc, wszTagName, ppCollection); }
  187. HRESULT GetCollection(CWCStringList *psl, CollectionType Type, PFN_CB pfnCB, DWORD_PTR dwData)
  188. { return GetCollection(m_pDoc, psl, Type, pfnCB, dwData); }
  189. protected:
  190. static HRESULT _GetCollection(IHTMLDocument2 *pDoc, CWCStringList *psl, CollectionType Type, PFN_CB pfnCB, DWORD_PTR dwData);
  191. };
  192. #endif _WEBCRAWL_H