Leaked source code of Windows Server 2003


  1. // TODO: Allow trident to download frames (and process new html)
  2. // nuke urlmon code (use trident always)
  3. #include "private.h"
  4. #include "shui.h"
  5. #include "downld.h"
  6. #include "subsmgrp.h"
  7. #include <ocidl.h>
  8. #include <initguid.h>
  9. #include <mluisupp.h>
  10. extern HICON g_webCrawlerIcon;
  11. extern HICON g_channelIcon;
  12. extern HICON g_desktopIcon;
  13. void LoadDefaultIcons();
  14. #undef TF_THISMODULE
  15. #define TF_THISMODULE TF_WEBCRAWL
  16. #define _ERROR_REPROCESSING -1
  17. // DWORD field of the m_pPages string list
  18. const DWORD DATA_RECURSEMASK = 0x000000FF; // Levels of recursion from this page
  19. const DWORD DATA_DLSTARTED = 0x80000000; // Have we started downloading
  20. const DWORD DATA_DLFINISHED = 0x40000000; // Have we finished this page
  21. const DWORD DATA_DLERROR = 0x20000000; // An error during download
  22. const DWORD DATA_CODEBASE = 0x10000000; // Is codebase
  23. const DWORD DATA_LINK = 0x08000000; // Is link from page (not dependency)
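// Example: these flags pack into the per-string DWORD alongside the recursion
// count, e.g. a page added as a followed link with three levels of recursion
// remaining, whose download has begun, carries
//   dwData = DATA_DLSTARTED | DATA_LINK | 3;   // (dwData & DATA_RECURSEMASK) == 3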
  24. // DWORD field of m_pPendingLinks string list
  25. const DWORD DATA_ROBOTSTXTMASK=0x00000FFF; // index into m_pRobotsTxt list
  26. // used internally; not actually stored in string list field
  27. const DWORD DATA_ROBOTSTXT = 0x01000000; // Is robots.txt
  28. // m_pDependencyLinks uses m_pPages values
  29. // DWORD field of m_pRobotsTxt is NULL or (CWCDwordStringList *)
  30. // DWORD field of the string list referenced by m_pRobotsTxt
  31. const DWORD DATA_ALLOW = 0x80000000;
  32. const DWORD DATA_DISALLOW = 0x40000000;
  33. const WCHAR c_wszRobotsMetaName[] = L"Robots\n";
  34. const int c_iRobotsMetaNameLen = 7; // string len without nullterm
  35. const WCHAR c_wszRobotsNoFollow[] = L"NoFollow";
  36. const int c_iRobotsNoFollow = 8;
  37. const WCHAR c_wszRobotsTxtURL[] = L"/robots.txt";
  38. const DWORD MAX_ROBOTS_SIZE = 8192; // Max size of robots.txt file
  39. // tokens for parsing of robots.txt
  40. const CHAR c_szRobots_UserAgent[] = "User-Agent:";
  41. const CHAR c_szRobots_OurUserAgent[] = "MSIECrawler";
  42. const CHAR c_szRobots_Allow[] = "Allow:";
  43. const CHAR c_szRobots_Disallow[] = "Disallow:";
  44. // This GUID comes from Trident and is a hack for getting PARAM values for APPLET tags.
  45. DEFINE_GUID(CGID_JavaParambagCompatHack, 0x3050F405, 0x98B5, 0x11CF, 0xBB, 0x82, 0x00, 0xAA, 0x00, 0xBD, 0xCE, 0x0B);
  46. // This GUID is helpfully not defined elsewhere.
  47. DEFINE_GUID(CLSID_JavaVM, 0x08B0E5C0, 0x4FCB, 0x11CF, 0xAA, 0xA5, 0x00, 0x40, 0x1C, 0x60, 0x85, 0x01);
  48. // Get host channel agent's subscription item, if any.
  49. inline HRESULT CWebCrawler::GetChannelItem(ISubscriptionItem **ppChannelItem)
  50. {
  51. IServiceProvider *pSP;
  52. HRESULT hr = E_NOINTERFACE;
  53. if (ppChannelItem)
  54. *ppChannelItem = NULL;
  55. if (SUCCEEDED(m_pAgentEvents->QueryInterface(IID_IServiceProvider, (void **)&pSP)) && pSP)
  56. {
  57. ISubscriptionItem *pTempChannelItem = NULL;
  58. pSP->QueryService(CLSID_ChannelAgent, IID_ISubscriptionItem, (void **)&pTempChannelItem);
  59. pSP->Release();
  60. if(pTempChannelItem)
  61. hr = S_OK;
  62. if(ppChannelItem)
  63. *ppChannelItem = pTempChannelItem;
  64. else
  65. {
  66. if(pTempChannelItem)
  67. pTempChannelItem->Release();
  68. }
  69. }
  70. return hr;
  71. }
  72. //////////////////////////////////////////////////////////////////////////
  73. //
  74. // Helper functions - copied over from urlmon\download\helpers.cxx - Is there
  75. // an equivalent routine or better place for this, webcrawl.cpp?
  76. //
  77. //////////////////////////////////////////////////////////////////////////
  78. // ---------------------------------------------------------------------------
  79. // %%Function: GetVersionFromString
  80. //
  81. // converts version in text format (a,b,c,d) into two dwords (a,b), (c,d)
  82. // The printed version number is of format a.b.d (but, we don't care)
  83. // ---------------------------------------------------------------------------
  84. HRESULT
  85. GetVersionFromString(const char *szBuf, LPDWORD pdwFileVersionMS, LPDWORD pdwFileVersionLS)
  86. {
  87. const char *pch = szBuf;
  88. char ch;
  89. *pdwFileVersionMS = 0;
  90. *pdwFileVersionLS = 0;
  91. if (!pch) // default to zero if none provided
  92. return S_OK;
  93. if (StrCmpA(pch, "-1,-1,-1,-1") == 0) {
  94. *pdwFileVersionMS = 0xffffffff;
  95. *pdwFileVersionLS = 0xffffffff;
  96. }
  97. USHORT n = 0;
  98. USHORT a = 0;
  99. USHORT b = 0;
  100. USHORT c = 0;
  101. USHORT d = 0;
  102. enum HAVE { HAVE_NONE, HAVE_A, HAVE_B, HAVE_C, HAVE_D } have = HAVE_NONE;
  103. for (ch = *pch++;;ch = *pch++) {
  104. if ((ch == ',') || (ch == '\0')) {
  105. switch (have) {
  106. case HAVE_NONE:
  107. a = n;
  108. have = HAVE_A;
  109. break;
  110. case HAVE_A:
  111. b = n;
  112. have = HAVE_B;
  113. break;
  114. case HAVE_B:
  115. c = n;
  116. have = HAVE_C;
  117. break;
  118. case HAVE_C:
  119. d = n;
  120. have = HAVE_D;
  121. break;
  122. case HAVE_D:
  123. return E_INVALIDARG; // invalid arg
  124. }
  125. if (ch == '\0') {
  126. // all done convert a,b,c,d into two dwords of version
  127. *pdwFileVersionMS = ((a << 16)|b);
  128. *pdwFileVersionLS = ((c << 16)|d);
  129. return S_OK;
  130. }
  131. n = 0; // reset
  132. } else if ( (ch < '0') || (ch > '9'))
  133. return E_INVALIDARG; // invalid arg
  134. else
  135. n = n*10 + (ch - '0');
  136. } /* end forever */
  137. // NEVERREACHED
  138. }
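// For instance, GetVersionFromString("1,2,3,4", &dwMS, &dwLS) yields
// dwMS == 0x00010002 and dwLS == 0x00030004 (version 1.2.3.4 packed as two
// DWORDs); a NULL input string leaves both outputs at 0 and returns S_OK.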
  139. /////////////////////////////////////////////////////////////////////////////////////////
  140. // CombineBaseAndRelativeURLs -
  141. // Three URLs are combined by the following rules (this is used for finding the URL
  142. // to load Applet CABs from.) Three inputs, the Base URL, the Code Base URL
  143. // and the file name URL.
  144. //
  145. // If file name URL is absolute return it.
  146. // Otherwise if CodeBase URL is absolute combine it with filename and return.
  147. // Otherwise if Base URL is absolute, combine CodeBase and fileName URL, then
  148. // combine with Base URL and return it.
  149. ////////////////////////////////////////////////////////////////////////////////////////
  150. HRESULT CombineBaseAndRelativeURLs(LPCWSTR szBaseURL, LPCWSTR szRelative1, LPWSTR *szRelative2)
  151. {
  152. WCHAR wszTemp[INTERNET_MAX_URL_LENGTH];
  153. DWORD dwLen = ARRAYSIZE(wszTemp);
  154. ASSERT(szRelative2); // should never happen.
  155. if (szRelative2 == NULL)
  156. return E_FAIL;
  157. if (IsValidURL(NULL, *szRelative2, 0) == S_OK)
  158. return S_OK;
  159. if (szRelative1 && (IsValidURL(NULL, szRelative1, 0) == S_OK))
  160. {
  161. if (SUCCEEDED(UrlCombineW((LPCWSTR)szRelative1, (LPCWSTR)*szRelative2, (LPWSTR)wszTemp, &dwLen, 0)))
  162. {
  163. BSTR bstrNew = SysAllocString(wszTemp);
  164. if (bstrNew)
  165. {
  166. SAFEFREEBSTR(*szRelative2);
  167. *szRelative2 = bstrNew;
  168. return S_OK;
  169. }
  170. }
  171. }
  172. if (szBaseURL && (IsValidURL(NULL, szBaseURL, 0) == S_OK))
  173. {
  174. LPWSTR szNewRel = NULL;
  175. WCHAR wszCombined[INTERNET_MAX_URL_LENGTH];
  176. if (szRelative1)
  177. {
  178. // NOTE: lstr[cpy|cat]W are macroed to work on Win95.
  179. DWORD dwLen2 = lstrlenW(*szRelative2);
  180. StrCpyNW(wszTemp, szRelative1, ARRAYSIZE(wszTemp) - 1); //paranoia
  181. DWORD dwTempLen = lstrlenW(wszTemp);
  182. if ((dwLen2 > 0) && ((*szRelative2)[dwLen2-1] == (unsigned short)L'\\') ||
  183. ((*szRelative2)[dwLen2-1] == (unsigned short) L'/'))
  184. {
  185. StrNCatW(wszTemp, *szRelative2, ARRAYSIZE(wszTemp) - dwTempLen);
  186. }
  187. else
  188. {
  189. StrNCatW(wszTemp, L"/", ARRAYSIZE(wszTemp) - dwTempLen);
  190. StrNCatW(wszTemp, *szRelative2, ARRAYSIZE(wszTemp) - dwTempLen - 1);
  191. }
  192. szNewRel = wszTemp;
  193. }
  194. else
  195. {
  196. szNewRel = *szRelative2;
  197. }
  198. dwLen = INTERNET_MAX_URL_LENGTH;
  199. if (SUCCEEDED(UrlCombineW((LPCWSTR)szBaseURL, (LPCWSTR)szNewRel, (LPWSTR)wszCombined, &dwLen, 0)))
  200. {
  201. BSTR bstrNew = SysAllocString(wszCombined);
  202. if (bstrNew)
  203. {
  204. SAFEFREEBSTR(*szRelative2);
  205. *szRelative2 = bstrNew;
  206. return S_OK;
  207. }
  208. }
  209. }
  210. // In all likelihood one of the URLs is bad and nothing good can be done.
  211. return E_FAIL;
  212. }
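// A sketch of the rules above with illustrative inputs: given
//   szBaseURL    = L"http://example.com/app/page.html"  (absolute)
//   szRelative1  = L"classes"                           (codebase, relative)
//   *szRelative2 = L"util.cab"                          (file name, relative)
// neither the file name nor the codebase is absolute, so they are joined as
// "classes/util.cab" and combined with the base, leaving roughly
// L"http://example.com/app/classes/util.cab" in *szRelative2 as a fresh BSTR.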
  213. //////////////////////////////////////////////////////////////////////////
  214. //
  215. // CWebCrawler implementation
  216. //
  217. //////////////////////////////////////////////////////////////////////////
  218. //
  219. // CWebCrawler Helpers
  220. //
  221. HRESULT CWebCrawler::CreateInstance(IUnknown *punkOuter, IUnknown **ppunk)
  222. {
  223. HRESULT hr;
  224. ASSERT(NULL == punkOuter);
  225. ASSERT(NULL != ppunk);
  226. CWebCrawler *pwc = new CWebCrawler;
  227. if (NULL != pwc)
  228. {
  229. hr = pwc->Initialize();
  230. if (FAILED(hr))
  231. {
  232. pwc->Release();
  233. }
  234. }
  235. else
  236. {
  237. hr = E_OUTOFMEMORY;
  238. }
  239. if (SUCCEEDED(hr))
  240. {
  241. *ppunk = (ISubscriptionAgentControl *)pwc;
  242. }
  243. return hr;
  244. }
  245. HRESULT CWebCrawler::Initialize()
  246. {
  247. m_hrCritDependencies = InitializeCriticalSectionAndSpinCount(&m_critDependencies, 0) ? S_OK : E_OUTOFMEMORY;
  248. return m_hrCritDependencies;
  249. }
  250. CWebCrawler::CWebCrawler()
  251. {
  252. DBG("Creating CWebCrawler object");
  253. }
  254. CWebCrawler::~CWebCrawler()
  255. {
  256. _CleanUp();
  257. if (SUCCEEDED(m_hrCritDependencies))
  258. {
  259. DeleteCriticalSection(&m_critDependencies);
  260. }
  261. DBG("Destroyed CWebCrawler object");
  262. }
  263. void CWebCrawler::CleanUp()
  264. {
  265. _CleanUp();
  266. CDeliveryAgent::CleanUp();
  267. }
  268. void CWebCrawler::_CleanUp()
  269. {
  270. if (m_pCurDownload)
  271. {
  272. m_pCurDownload->DoneDownloading();
  273. m_pCurDownload->Release();
  274. m_pCurDownload = NULL;
  275. }
  276. CRunDeliveryAgent::SafeRelease(m_pRunAgent);
  277. SAFEFREEBSTR(m_bstrHostName);
  278. SAFEFREEBSTR(m_bstrBaseURL);
  279. SAFELOCALFREE(m_pszLocalDest);
  280. SAFELOCALFREE(m_pBuf);
  281. EnterCriticalSection(&m_critDependencies);
  282. SAFEDELETE(m_pDependencies);
  283. LeaveCriticalSection(&m_critDependencies);
  284. if (m_pDownloadNotify)
  285. {
  286. m_pDownloadNotify->LeaveMeAlone();
  287. m_pDownloadNotify->Release();
  288. m_pDownloadNotify=NULL;
  289. }
  290. SAFEDELETE(m_pPages);
  291. SAFEDELETE(m_pPendingLinks);
  292. SAFEDELETE(m_pDependencyLinks);
  293. SAFERELEASE(m_pUrlIconHelper);
  294. FreeRobotsTxt();
  295. FreeCodeBaseList();
  296. }
  297. // Format of m_pRobotsTxt:
  298. // Array of hostnames for which we have attempted to get Robots.txt
  299. // DWORD for each hostname contains pointer to CDwordStringList of Robots.txt data,
  300. // or 0 if we couldn't find robots.txt for that host name
  301. // Robots.txt data stored in form: url, flag = allow or disallow
  302. void CWebCrawler::FreeRobotsTxt()
  303. {
  304. if (m_pRobotsTxt)
  305. {
  306. DWORD_PTR dwPtr;
  307. int iLen = m_pRobotsTxt->NumStrings();
  308. for (int i=0; i<iLen; i++)
  309. {
  310. dwPtr = m_pRobotsTxt->GetStringData(i);
  311. if (dwPtr)
  312. {
  313. delete ((CWCStringList *)dwPtr);
  314. m_pRobotsTxt->SetStringData(i, 0);
  315. }
  316. }
  317. delete m_pRobotsTxt;
  318. m_pRobotsTxt = NULL;
  319. }
  320. }
  321. void CWebCrawler::FreeCodeBaseList()
  322. {
  323. if (m_pCodeBaseList) {
  324. CCodeBaseHold *pcbh;
  325. int iLen = m_pCodeBaseList->NumStrings();
  326. for (int i=0; i<iLen; i++)
  327. {
  328. pcbh = (CCodeBaseHold *)m_pCodeBaseList->GetStringData(i);
  329. if (pcbh != NULL)
  330. {
  331. SAFEFREEBSTR(pcbh->szDistUnit);
  332. SAFEDELETE(pcbh);
  333. m_pCodeBaseList->SetStringData(i, 0);
  334. }
  335. }
  336. SAFEDELETE(m_pCodeBaseList);
  337. }
  338. }
  339. HRESULT CWebCrawler::StartOperation()
  340. {
  341. ISubscriptionItem *pItem = m_pSubscriptionItem;
  342. DWORD dwTemp;
  343. ASSERT(pItem);
  344. DBG("CWebCrawler in StartOperation");
  345. if (m_pCurDownload || GetBusy())
  346. {
  347. DBG_WARN("Webcrawl busy, returning failure");
  348. return E_FAIL;
  349. }
  350. SAFEFREEBSTR(m_bstrBaseURL);
  351. if (FAILED(
  352. ReadBSTR(pItem, c_szPropURL, &m_bstrBaseURL)) ||
  353. !m_bstrBaseURL ||
  354. !CUrlDownload::IsValidURL(m_bstrBaseURL))
  355. {
  356. DBG_WARN("Couldn't get valid URL, aborting");
  357. SetEndStatus(E_INVALIDARG);
  358. SendUpdateNone();
  359. return E_INVALIDARG;
  360. }
  361. if (SHRestricted2W(REST_NoSubscriptionContent, NULL, 0))
  362. SetAgentFlag(FLAG_CHANGESONLY);
  363. if (IsAgentFlagSet(FLAG_CHANGESONLY))
  364. {
  365. m_dwRecurseLevels = 0;
  366. m_dwRecurseFlags = WEBCRAWL_DONT_MAKE_STICKY;
  367. DBG("Webcrawler is in 'changes only' mode.");
  368. }
  369. else
  370. {
  371. /*
  372. BSTR bstrLocalDest=NULL;
  373. SAFELOCALFREE(m_pszLocalDest);
  374. ReadBSTR(c_szPropCrawlLocalDest, &bstrLocalDest);
  375. if (bstrLocalDest && bstrLocalDest[0])
  376. {
  377. int iLen = SysStringByteLen(bstrLocalDest)+1;
  378. m_pszLocalDest = (LPTSTR) MemAlloc(LMEM_FIXED, iLen);
  379. if (m_pszLocalDest)
  380. {
  381. MyOleStrToStrN(m_pszLocalDest, iLen, bstrLocalDest);
  382. }
  383. }
  384. SAFEFREEBSTR(bstrLocalDest);
  385. */
  386. m_dwRecurseLevels=0;
  387. ReadDWORD(pItem, c_szPropCrawlLevels, &m_dwRecurseLevels);
  388. if (!IsAgentFlagSet(DELIVERY_AGENT_FLAG_NO_RESTRICTIONS))
  389. {
  390. // Note: MaxWebcrawlLevels is stored as N+1 because 0
  391. // disables the restriction
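// (e.g. a policy value of 4 caps m_dwRecurseLevels at 3)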
  392. dwTemp = SHRestricted2W(REST_MaxWebcrawlLevels, NULL, 0);
  393. if (dwTemp && m_dwRecurseLevels >= dwTemp)
  394. m_dwRecurseLevels = dwTemp - 1;
  395. }
  396. m_dwRecurseFlags=0;
  397. ReadDWORD(pItem, c_szPropCrawlFlags, &m_dwRecurseFlags);
  398. // Read max size in cache in KB
  399. m_dwMaxSize=0;
  400. ReadDWORD(pItem, c_szPropCrawlMaxSize, &m_dwMaxSize);
  401. if (!IsAgentFlagSet(DELIVERY_AGENT_FLAG_NO_RESTRICTIONS))
  402. {
  403. dwTemp = SHRestricted2W(REST_MaxSubscriptionSize, NULL, 0);
  404. if (dwTemp && (!m_dwMaxSize || m_dwMaxSize > dwTemp))
  405. m_dwMaxSize = dwTemp;
  406. }
  407. if (IsRecurseFlagSet(WEBCRAWL_DONT_MAKE_STICKY))
  408. dwTemp = 0;
  409. // Read old group ID
  410. ReadLONGLONG(pItem, c_szPropCrawlGroupID, &m_llOldCacheGroupID);
  411. // Read new ID if present
  412. m_llCacheGroupID = 0;
  413. ReadLONGLONG(pItem, c_szPropCrawlNewGroupID, &m_llCacheGroupID);
  414. if (m_llCacheGroupID)
  415. {
  416. DBG("Adding to existing cache group");
  417. }
  418. } // !ChangesOnly
  419. // finish initializing new operation
  420. m_iDownloadErrors = 0;
  421. m_dwCurSize = 0;
  422. m_lMaxNumUrls = (m_dwRecurseLevels) ? -1 : 1;
  423. SAFEFREEBSTR(m_bstrHostName);
  424. m_dwCurSize = NULL;
  425. m_pPages = NULL;
  426. m_pDependencies = NULL;
  427. // After calling this, we'll reenter either in "StartDownload" (connection successful)
  428. // or in "AbortUpdate" with GetEndStatus() == INET_E_AGENT_CONNECTION_FAILED
  429. return CDeliveryAgent::StartOperation();
  430. }
  431. HRESULT CWebCrawler::AgentPause(DWORD dwFlags)
  432. {
  433. DBG("CWebCrawler::AgentPause");
  434. // Abort our current url
  435. if (m_pRunAgent)
  436. {
  437. m_pRunAgent->AgentPause(dwFlags);
  438. }
  439. if (m_pCurDownload)
  440. {
  441. m_pCurDownload->AbortDownload();
  442. m_pCurDownload->DestroyBrowser();
  443. }
  444. return CDeliveryAgent::AgentPause(dwFlags);
  445. }
  446. HRESULT CWebCrawler::AgentResume(DWORD dwFlags)
  447. {
  448. DBG("CWebCrawler::AgentResume");
  449. if (m_pRunAgent)
  450. {
  451. m_pRunAgent->AgentResume(dwFlags);
  452. }
  453. else
  454. {
  455. // If we just increased our cache size, reprocess same url
  456. if (SUBSCRIPTION_AGENT_RESUME_INCREASED_CACHE & dwFlags)
  457. {
  458. DBG("CWebCrawler reprocessing same url after cache size increase");
  459. OnDownloadComplete(0, _ERROR_REPROCESSING);
  460. }
  461. else
  462. {
  463. // If we're not still downloading, restart our same url
  464. if (0 == m_iNumPagesDownloading)
  465. {
  466. if (FAILED(ActuallyStartDownload(m_pCurDownloadStringList, m_iCurDownloadStringIndex, TRUE)))
  467. {
  468. ASSERT_MSG(0, "CWebCrawler::AgentResume"); // this should never happen
  469. SetEndStatus(E_FAIL);
  470. CleanUp();
  471. }
  472. }
  473. }
  474. }
  475. return CDeliveryAgent::AgentResume(dwFlags);
  476. }
  477. // Forcibly abort current operation
  478. HRESULT CWebCrawler::AgentAbort(DWORD dwFlags)
  479. {
  480. DBG("CWebCrawler::AgentAbort");
  481. if (m_pCurDownload)
  482. {
  483. m_pCurDownload->DoneDownloading();
  484. }
  485. if (m_pRunAgent)
  486. {
  487. m_pRunAgent->AgentAbort(dwFlags);
  488. }
  489. return CDeliveryAgent::AgentAbort(dwFlags);
  490. }
  491. //---------------------------------------------------------------
  492. //
  493. HRESULT CWebCrawler::StartDownload()
  494. {
  495. ASSERT(!m_pCurDownload);
  496. m_iPagesStarted = 0;
  497. m_iRobotsStarted = 0;
  498. m_iDependencyStarted = 0;
  499. m_iDependenciesProcessed = 0;
  500. m_iTotalStarted = 0;
  501. m_iCodeBaseStarted = 0;
  502. m_iNumPagesDownloading = 0;
  503. // Create new cache group
  504. if (IsAgentFlagSet(FLAG_CHANGESONLY))
  505. {
  506. m_llCacheGroupID = 0;
  507. }
  508. else
  509. {
  510. if (!m_llCacheGroupID)
  511. {
  512. m_llCacheGroupID = CreateUrlCacheGroup(
  513. (IsRecurseFlagSet(WEBCRAWL_DONT_MAKE_STICKY) ? 0 : CACHEGROUP_FLAG_NONPURGEABLE), 0);
  514. ASSERT_MSG(m_llCacheGroupID != 0, "Create cache group failed");
  515. }
  516. }
  517. // Create string lists
  518. m_pPages = new CWCDwordStringList;
  519. if (m_pPages)
  520. m_pPages->Init(m_dwRecurseLevels ? -1 : 512);
  521. else
  522. SetEndStatus(E_FAIL);
  523. if (m_dwRecurseLevels && !IsRecurseFlagSet(WEBCRAWL_IGNORE_ROBOTSTXT))
  524. {
  525. m_pRobotsTxt = new CWCDwordStringList;
  526. if (m_pRobotsTxt)
  527. m_pRobotsTxt->Init(512);
  528. else
  529. SetEndStatus(E_FAIL);
  530. }
  531. // FEATURE : Shouldn't allocate this memory in changes only mode
  532. m_pCodeBaseList = new CWCDwordStringList;
  533. if (m_pCodeBaseList)
  534. m_pCodeBaseList->Init(512);
  535. else
  536. SetEndStatus(E_FAIL);
  537. // Avoid duplicate processing of dependencies
  538. if (!IsAgentFlagSet(FLAG_CHANGESONLY))
  539. {
  540. m_pDependencies = new CWCDwordStringList;
  541. if (m_pDependencies)
  542. m_pDependencies->Init();
  543. else
  544. SetEndStatus(E_FAIL);
  545. }
  546. if (GetEndStatus() == E_FAIL)
  547. return E_FAIL;
  548. m_pCurDownload = new CUrlDownload(this, 0);
  549. if (!m_pCurDownload)
  550. return E_OUTOFMEMORY;
  551. // Add first URL to string list, then start it
  552. if ((CWCStringList::STRLST_ADDED == m_pPages->AddString(m_bstrBaseURL, m_dwRecurseLevels)) &&
  553. m_pPages->NumStrings() == 1)
  554. {
  555. return StartNextDownload();
  556. }
  557. SetEndStatus(E_FAIL);
  558. return E_FAIL;
  559. }
  560. // Attempts to begin the next download
  561. HRESULT CWebCrawler::StartNextDownload()
  562. {
  563. if (!m_pPages || m_iNumPagesDownloading)
  564. return E_FAIL;
  565. CWCStringList *pslUrls = NULL;
  566. int iIndex = 0;
  567. // See if we have any more URLs to download.
  568. // Check dependency links first
  569. if (m_pDependencyLinks)
  570. {
  571. ProcessDependencyLinks(&pslUrls, &iIndex);
  572. #ifdef DEBUG
  573. if (pslUrls) DBG("Downloading dependency link (frame):");
  574. #endif
  575. }
  576. if (!pslUrls)
  577. {
  578. // Check robots.txt
  579. if (m_pRobotsTxt && (m_iRobotsStarted < m_pRobotsTxt->NumStrings()))
  580. {
  581. pslUrls = m_pRobotsTxt;
  582. iIndex = m_iRobotsStarted ++;
  583. }
  584. else if (m_pPendingLinks) // add pending links to pages list
  585. {
  586. // Pending links to process and we've retrieved all robots.txt
  587. // Process pending links (validate & add to download list)
  588. ProcessPendingLinks();
  589. }
  590. if (!pslUrls && (m_iPagesStarted < m_pPages->NumStrings()))
  591. {
  592. DWORD_PTR dwTmp;
  593. ASSERT(!m_pDependencyLinks);// should be downloaded already
  594. ASSERT(!m_pPendingLinks); // should be validated already
  595. // Skip any pages we've started
  596. while (m_iPagesStarted < m_pPages->NumStrings())
  597. {
  598. dwTmp = m_pPages->GetStringData(m_iPagesStarted);
  599. if (IsFlagSet(dwTmp, DATA_DLSTARTED))
  600. m_iPagesStarted++;
  601. else
  602. break;
  603. }
  604. if (m_iPagesStarted < m_pPages->NumStrings())
  605. {
  606. pslUrls = m_pPages;
  607. iIndex = m_iPagesStarted ++;
  608. }
  609. }
  610. if (!pslUrls && (m_iCodeBaseStarted < m_pCodeBaseList->NumStrings()))
  611. {
  612. // Nothing else to pull, so do code bases last.
  613. while (m_iCodeBaseStarted < m_pCodeBaseList->NumStrings())
  614. {
  615. CCodeBaseHold *pcbh = (CCodeBaseHold *)
  616. m_pCodeBaseList->GetStringData(m_iCodeBaseStarted);
  617. if (IsFlagSet(pcbh->dwFlags, DATA_DLSTARTED))
  618. m_iCodeBaseStarted++;
  619. else
  620. break;
  621. }
  622. while (m_iCodeBaseStarted < m_pCodeBaseList->NumStrings())
  623. {
  624. // We have some codebases to download.
  625. // We return if the download is async and simply
  626. // start the next one if it finishes synchronously
  627. iIndex = m_iCodeBaseStarted;
  628. m_iCodeBaseStarted++; // increment so that next download is not repeated
  629. // Init the current download info for resume if paused
  630. m_iCurDownloadStringIndex = iIndex;
  631. m_pCurDownloadStringList = m_pCodeBaseList;
  632. if(ActuallyDownloadCodeBase(m_pCodeBaseList, iIndex, FALSE) == E_PENDING)
  633. return S_OK; // We break out of the while and try next download in OnAgentEnd()
  634. }
  635. }
  636. }
  637. if (pslUrls)
  638. {
  639. m_iCurDownloadStringIndex = iIndex;
  640. m_pCurDownloadStringList = pslUrls;
  641. return ActuallyStartDownload(pslUrls, iIndex);
  642. }
  643. DBG("WebCrawler: StartNextDownload failing, nothing more to download.");
  644. return E_FAIL;
  645. }
  646. HRESULT CWebCrawler::ActuallyStartDownload(CWCStringList *pslUrls, int iIndex, BOOL fReStart /* = FALSE */)
  647. {
  648. // We have urls to download. Do it.
  649. DWORD_PTR dwData;
  650. LPCWSTR pwszURL;
  651. DWORD dwBrowseFlags;
  652. BDUMethod method;
  653. BDUOptions options;
  654. if(pslUrls == m_pCodeBaseList)
  655. {
  656. ASSERT(fReStart); // Should happen only with resume
  657. HRESULT hr = ActuallyDownloadCodeBase(m_pCodeBaseList, iIndex, fReStart);
  658. if(E_PENDING == hr)
  659. return S_OK;
  660. return E_FAIL; // hackhack - since we don't handle synchronous downloads well - we hang if
  661. // resumed download is synchronous
  662. }
  663. if (pslUrls != m_pRobotsTxt)
  664. {
  665. dwData = pslUrls->GetStringData(iIndex);
  666. #ifdef DEBUG
  667. if (fReStart)
  668. if (~(dwData & DATA_DLSTARTED)) DBG_WARN("WebCrawler: Trying to restart one we haven't started yet!");
  669. else
  670. if ((dwData & DATA_DLSTARTED)) DBG_WARN("WebCrawler: Trying to download one we've already started?");
  671. #endif
  672. pslUrls->SetStringData(iIndex, DATA_DLSTARTED | dwData);
  673. }
  674. else
  675. dwData = DATA_ROBOTSTXT;
  676. pwszURL = pslUrls->GetString(iIndex);
  677. ASSERT(iIndex < pslUrls->NumStrings());
  678. #ifdef DEBUG
  679. int iMax = m_lMaxNumUrls;
  680. if (iMax<0)
  681. iMax = m_pPages->NumStrings() + ((m_pRobotsTxt) ? m_pRobotsTxt->NumStrings() : 0);
  682. TraceMsgA(TF_THISMODULE, "WebCrawler GET_URL (%d of %c%d) Recurse %d : %ws",
  683. m_iTotalStarted+1, ((m_lMaxNumUrls>0) ? ' ' : '?'), iMax,
  684. pslUrls->GetStringData(iIndex) & DATA_RECURSEMASK, pwszURL);
  685. #endif
  686. dwBrowseFlags = DLCTL_DOWNLOADONLY |
  687. DLCTL_NO_FRAMEDOWNLOAD | DLCTL_NO_SCRIPTS | DLCTL_NO_JAVA |
  688. DLCTL_NO_RUNACTIVEXCTLS;
  689. if (IsRecurseFlagSet(WEBCRAWL_GET_IMAGES)) dwBrowseFlags |= DLCTL_DLIMAGES;
  690. if (IsRecurseFlagSet(WEBCRAWL_GET_VIDEOS)) dwBrowseFlags |= DLCTL_VIDEOS;
  691. if (IsRecurseFlagSet(WEBCRAWL_GET_BGSOUNDS)) dwBrowseFlags |= DLCTL_BGSOUNDS;
  692. if (!IsRecurseFlagSet(WEBCRAWL_GET_CONTROLS)) dwBrowseFlags |= DLCTL_NO_DLACTIVEXCTLS;
  693. if (IsRecurseFlagSet(WEBCRAWL_PRIV_OFFLINE_MODE))
  694. {
  695. dwBrowseFlags |= DLCTL_FORCEOFFLINE;
  696. dwBrowseFlags &= ~(DLCTL_DLIMAGES | DLCTL_VIDEOS | DLCTL_BGSOUNDS);
  697. DBG("GET is OFFLINE");
  698. }
  699. m_pCurDownload->SetDLCTL(dwBrowseFlags);
  700. #ifdef DEBUG
  701. if (fReStart)
  702. {
  703. ASSERT(m_iCurDownloadStringIndex == iIndex);
  704. ASSERT(m_pCurDownloadStringList == pslUrls);
  705. }
  706. #endif
  707. if (!fReStart)
  708. {
  709. // Get the info for change detection, unless we already know it's changed
  710. if (!IsAgentFlagSet(FLAG_CRAWLCHANGED) && !(dwData & DATA_ROBOTSTXT))
  711. {
  712. TCHAR szUrl[INTERNET_MAX_URL_LENGTH];
  713. m_varChange.vt = VT_EMPTY;
  714. if (IsAgentFlagSet(FLAG_CHANGESONLY))
  715. {
  716. // "Changes Only" mode, we have persisted a change detection code
  717. ASSERT(m_iTotalStarted == 0);
  718. LPCWSTR pPropChange = c_szPropChangeCode;
  719. m_pSubscriptionItem->ReadProperties(1, &pPropChange, &m_varChange);
  720. }
  721. BOOL fMustGET = TRUE;
  722. MyOleStrToStrN(szUrl, INTERNET_MAX_URL_LENGTH, pwszURL);
  723. PreCheckUrlForChange(szUrl, &m_varChange, &fMustGET);
  724. if (IsAgentFlagSet(FLAG_CHANGESONLY) && !fMustGET)
  725. SetAgentFlag(FLAG_HEADONLY);
  726. }
  727. m_iTotalStarted ++;
  728. }
  729. if (IsPaused())
  730. {
  731. DBG("WebCrawler paused, not starting another download");
  732. if (m_pCurDownload)
  733. m_pCurDownload->DestroyBrowser(); // free browser until resumed
  734. return E_PENDING;
  735. }
  736. m_iNumPagesDownloading ++;
  737. // Send our update progress with the url we're about to download
  738. SendUpdateProgress(pwszURL, m_iTotalStarted, m_lMaxNumUrls, (m_dwCurSize >> 10));
  739. if (IsAgentFlagSet(FLAG_HEADONLY))
  740. {
  741. ASSERT(m_iTotalStarted == 1);
  742. method = BDU2_HEADONLY; // Only get HEAD info with Urlmon
  743. }
  744. else if (IsAgentFlagSet(FLAG_CHANGESONLY) // Only want HTML, or
  745. || m_pszLocalDest // We're going to move this one file, or
  746. || (dwData & DATA_ROBOTSTXT)) // This is a robots.txt, so
  747. {
  748. method = BDU2_URLMON; // Get with Urlmon
  749. }
  750. else if (m_iTotalStarted == 1) // First file, we need status code, so
  751. {
  752. ISubscriptionItem *pCDFItem;
  753. method = BDU2_SNIFF; // Get with Urlmon then MSHTML (if HTML)
  754. // Find out if we're hosted by channel agent
  755. if (SUCCEEDED(GetChannelItem(&pCDFItem)))
  756. {
  757. // If we're hosted by channel agent, use its original hostname
  758. BSTR bstrBaseUrl;
  759. if (SUCCEEDED(ReadBSTR(pCDFItem, c_szPropURL, &bstrBaseUrl)))
  760. {
  761. GetHostName(bstrBaseUrl, &m_bstrHostName);
  762. SysFreeString(bstrBaseUrl);
  763. }
  764. #ifdef DEBUG
  765. if (m_bstrHostName)
  766. TraceMsg(TF_THISMODULE, "Got host name from channel agent: %ws", m_bstrHostName);
  767. #endif
  768. pCDFItem->Release();
  769. DBG("Using 'smart' mode for first url in webcrawl; spawned from channel crawl");
  770. method = BDU2_SMART; // Use 'smart' mode for first url if channel crawl
  771. SetAgentFlag(FLAG_HOSTED);
  772. }
  773. }
  774. else
  775. method = BDU2_SMART; // Get with Urlmon or MSHTML as appropriate
  776. if (dwData & DATA_ROBOTSTXT)
  777. options = BDU2_NEEDSTREAM; // Need IStream to parse robots.txt
  778. else
  779. options = BDU2_NONE;
  780. options |= BDU2_DOWNLOADNOTIFY_REQUIRED; // Always get download notify callbacks
  781. if (IsRecurseFlagSet(WEBCRAWL_ONLY_LINKS_TO_HTML) && (dwData & DATA_LINK))
  782. {
  783. // Don't follow any links unless they are to html pages.
  784. options |= BDU2_FAIL_IF_NOT_HTML;
  785. }
  786. if (FAILED(m_pCurDownload->BeginDownloadURL2(pwszURL,
  787. method, options, m_pszLocalDest,
  788. m_dwMaxSize ? (m_dwMaxSize<<10)-m_dwCurSize : 0)))
  789. {
  790. DBG("BeginDownloadURL2 failed (ignoring & waiting for OnDownloadComplete call)");
  791. }
  792. return S_OK;
  793. }
  794. HRESULT CWebCrawler::ActuallyDownloadCodeBase(CWCStringList *pslUrls, int iIndex, BOOL fReStart)
  795. {
  796. CCodeBaseHold *pcbh;
  797. LPCWSTR pwszURL;
  798. HRESULT hr = S_OK;
  799. if (pslUrls != m_pCodeBaseList)
  800. {
  801. ASSERT(0);
  802. DBG_WARN("WebCrawler: Wrong URLs being processed as CodeBase.");
  803. hr = E_FAIL;
  804. goto Exit;
  805. }
  806. pcbh = (CCodeBaseHold *)pslUrls->GetStringData(iIndex);
  807. #ifdef DEBUG
  808. if (fReStart)
  809. if (~(pcbh->dwFlags & DATA_DLSTARTED)) DBG_WARN("WebCrawler: Trying to restart CodeBase D/L we haven't started yet!");
  810. else
  811. if ((pcbh->dwFlags & DATA_DLSTARTED)) DBG_WARN("WebCrawler: Trying to download CodeBase D/L we've already started?");
  812. #endif
  813. pcbh->dwFlags |= DATA_DLSTARTED;
  814. pwszURL = pslUrls->GetString(iIndex);
  815. ASSERT(iIndex < pslUrls->NumStrings());
  816. if (!fReStart)
  817. m_iTotalStarted ++;
  818. if (IsPaused())
  819. {
  820. DBG("WebCrawler paused, not starting another download");
  821. if (m_pCurDownload)
  822. m_pCurDownload->DestroyBrowser(); // free browser until resumed
  823. return S_FALSE;
  824. }
  825. m_iNumPagesDownloading ++;
  826. // Send our update progress with the CODEBASE we're about to download
  827. SendUpdateProgress(pwszURL, m_iTotalStarted, m_lMaxNumUrls);
  828. if (m_pRunAgent)
  829. {
  830. ASSERT(0);
  831. DBG_WARN("WebCrawler: Attempting to download next CODEBASE when not done last one.");
  832. hr = E_FAIL;
  833. goto Exit;
  834. }
  835. else
  836. {
  837. // create subscription item for CDL agent.
  838. ISubscriptionItem *pItem = NULL;
  839. if (m_dwMaxSize && ((m_dwCurSize>>10) >= m_dwMaxSize))
  840. {
  841. // We've exceeded our maximum download KB limit and can't continue.
  842. DBG_WARN("WebCrawler: Exceeded Maximum KB download limit with CodeBase download.");
  843. SetEndStatus(hr = INET_E_AGENT_MAX_SIZE_EXCEEDED);
  844. goto Exit;
  845. }
  846. if (!m_pSubscriptionItem ||
  847. FAILED(hr = DoCloneSubscriptionItem(m_pSubscriptionItem, NULL, &pItem)))
  848. {
  849. goto Exit;
  850. }
  851. ASSERT(pItem != NULL);
  852. WriteOLESTR(pItem, c_szPropURL, pwszURL);
  853. WriteOLESTR(pItem, L"DistUnit", pcbh->szDistUnit);
  854. WriteDWORD(pItem, L"VersionMS", pcbh->dwVersionMS);
  855. WriteDWORD(pItem, L"VersionLS", pcbh->dwVersionLS);
  856. if (m_dwMaxSize)
  857. WriteDWORD(pItem, c_szPropCrawlMaxSize, m_dwMaxSize - (m_dwCurSize>>10)); // KB limit for us to pull.
  858. m_pRunAgent = new CRunDeliveryAgent();
  859. if (m_pRunAgent)
  860. hr = m_pRunAgent->Init((CRunDeliveryAgentSink *)this, pItem, CLSID_CDLAgent);
  861. pItem->Release();
  862. if (m_pRunAgent && SUCCEEDED(hr))
  863. {
  864. hr = m_pRunAgent->StartAgent();
  865. //if (hr == E_PENDING)
  866. //{
  867. //hr = S_OK;
  868. //}
  869. }
  870. else
  871. {
  872. hr = E_OUTOFMEMORY;
  873. }
  874. }
  875. Exit:
  876. return hr;
  877. }
  878. HRESULT CWebCrawler::ProcessDependencyLinks(CWCStringList **ppslUrls, int *piStarted)
  879. {
  880. ASSERT(ppslUrls && !*ppslUrls && piStarted);
  881. int iIndex;
  882. DWORD_PTR dwData;
  883. if (!m_pDependencyLinks)
  884. return S_FALSE;
  885. // See if we have any more dependency links to download
  886. while (m_iDependencyStarted < m_pDependencyLinks->NumStrings())
  887. {
  888. if (!m_pPages->FindString(m_pDependencyLinks->GetString(m_iDependencyStarted),
  889. m_pDependencyLinks->GetStringLen(m_iDependencyStarted), &iIndex))
  890. {
  891. ASSERT(0); // find string failed?!? We added it above!
  892. return E_FAIL;
  893. }
  894. ASSERT(iIndex>=0 && iIndex<m_pPages->NumStrings());
  895. m_iDependencyStarted ++;
  896. // See if we've downloaded this yet.
  897. dwData = m_pPages->GetStringData(iIndex);
  898. if (!(dwData & DATA_DLSTARTED))
  899. {
  900. // Nope. Start download.
  901. *ppslUrls = m_pPages;
  902. *piStarted = iIndex;
  903. return S_OK;
  904. }
  905. // We have already downloaded this page. Go to next dependency link.
  906. }
  907. // Done processing. Clear for next page.
  908. SAFEDELETE(m_pDependencyLinks);
  909. return S_FALSE;
  910. }
  911. HRESULT CWebCrawler::ProcessPendingLinks()
  912. {
  913. int iNumLinks, iAddCode, i, iAddIndex, iRobotsIndex;
  914. LPCWSTR pwszUrl;
  915. BOOL fAllow;
  916. if (!m_pPendingLinks)
  917. return S_FALSE;
  918. ASSERT(m_lMaxNumUrls<0);
  919. ASSERT(0 == (m_dwPendingRecurseLevel & ~DATA_RECURSEMASK));
  920. iNumLinks = m_pPendingLinks->NumStrings();
  921. TraceMsg(TF_THISMODULE, "Processing %d pending links from %ws",
  922. iNumLinks, m_pPages->GetString(m_iPagesStarted-1));
  923. // Add the links to our global page list
  924. for (i=0; i<iNumLinks; i++)
  925. {
  926. // Validate with robots.txt if appropriate
  927. pwszUrl = m_pPendingLinks->GetString(i);
  928. iRobotsIndex = (int)(m_pPendingLinks->GetStringData(i) & DATA_ROBOTSTXTMASK);
  929. ValidateWithRobotsTxt(pwszUrl, iRobotsIndex, &fAllow);
  930. if (fAllow)
  931. {
  932. /*
  933. As long as we retrieve pages in decreasing-recursion order (top to bottom), we don't
  934. have to worry about bumping pages to a higher recurse level (except for frames).
  935. */
  936. iAddCode = m_pPages->AddString(pwszUrl,
  937. DATA_LINK | m_dwPendingRecurseLevel,
  938. &iAddIndex);
  939. if (iAddCode == CWCStringList::STRLST_FAIL)
  940. break;
  941. }
  942. }
  943. SAFEDELETE(m_pPendingLinks);
  944. return S_OK;
  945. }
  946. // Combine with our base url to get full url
  947. // We use this for frames, but also for <Link> tags, since the processing is identical
  948. HRESULT CWebCrawler::CheckFrame(IUnknown *punkItem, BSTR *pbstrItem, DWORD_PTR dwBaseUrl, DWORD *pdwStringData)
  949. {
  950. WCHAR wszCombined[INTERNET_MAX_URL_LENGTH];
  951. DWORD dwLen = ARRAYSIZE(wszCombined);
  952. ASSERT(pbstrItem && *pbstrItem && punkItem && dwBaseUrl);
  953. if (!pbstrItem || !*pbstrItem || !punkItem || !dwBaseUrl)
  954. return E_FAIL; // bogus
  955. if (SUCCEEDED(UrlCombineW((LPCWSTR)dwBaseUrl, *pbstrItem, wszCombined, &dwLen, 0)))
  956. {
  957. BSTR bstrNew = SysAllocString(wszCombined);
  958. if (bstrNew)
  959. {
  960. SysFreeString(*pbstrItem);
  961. *pbstrItem = bstrNew;
  962. return S_OK;
  963. }
  964. }
  965. TraceMsg(TF_WARNING, "CWebCrawler::CheckFrame failing. Not getting frame or <link> url=%ws.", *pbstrItem);
  966. return E_FAIL; // Couldn't combine url; don't add
  967. }
  968. // See if we should follow this link. Clears pbstrItem if not.
  969. // Accepts either pLink or pArea
  970. HRESULT CWebCrawler::CheckLink(IUnknown *punkItem, BSTR *pbstrItem, DWORD_PTR dwThis, DWORD *pdwStringData)
  971. {
  972. HRESULT hrRet = S_OK;
  973. CWebCrawler *pThis = (CWebCrawler *)dwThis;
  974. ASSERT(pbstrItem && *pbstrItem && punkItem && dwThis);
  975. if (!pbstrItem || !*pbstrItem || !punkItem || !dwThis)
  976. return E_FAIL; // bogus
  977. // First see if it's 'valid'
  978. // We only add the link if it's HTTP (or https)
  979. // (we don't want to get mailto: links, for example)
  980. if (CUrlDownload::IsValidURL(*pbstrItem))
  981. {
  982. // Strip off any anchor
  983. CUrlDownload::StripAnchor(*pbstrItem);
  984. }
  985. else
  986. {
  987. // Skip this link
  988. SysFreeString(*pbstrItem);
  989. *pbstrItem = NULL;
  990. return S_FALSE;
  991. }
  992. if (pThis->IsRecurseFlagSet(WEBCRAWL_ONLY_LINKS_TO_HTML))
  993. {
  994. // See if we can tell that this is not an HTML link
  995. if (CUrlDownload::IsNonHtmlUrl(*pbstrItem))
  996. {
  997. // Skip this link
  998. SysFreeString(*pbstrItem);
  999. *pbstrItem = NULL;
  1000. return S_FALSE;
  1001. }
  1002. }
  1003. if (!(pThis->IsRecurseFlagSet(WEBCRAWL_LINKS_ELSEWHERE)))
  1004. {
  1005. BSTR bstrHost=NULL;
  1006. IHTMLAnchorElement *pLink=NULL;
  1007. IHTMLAreaElement *pArea=NULL;
  1008. // Check to see if the host names match
  1009. punkItem->QueryInterface(IID_IHTMLAnchorElement, (void **)&pLink);
  1010. if (pLink)
  1011. {
  1012. pLink->get_hostname(&bstrHost);
  1013. pLink->Release();
  1014. }
  1015. else
  1016. {
  1017. punkItem->QueryInterface(IID_IHTMLAreaElement, (void **)&pArea);
  1018. if (pArea)
  1019. {
  1020. pArea->get_hostname(&bstrHost);
  1021. pArea->Release();
  1022. }
  1023. else
  1024. {
  1025. DBG_WARN("CWebCrawler::CheckLink Unable to get Area or Anchor interface!");
  1026. return E_FAIL; // Bad element
  1027. }
  1028. }
  1029. if (!bstrHost || !*bstrHost)
  1030. {
  1031. DBG_WARN("CWebCrawler::CheckLink : (pLink|pArea)->get_hostname() failed");
  1032. hrRet = S_OK; // always accept if get_hostname fails
  1033. }
  1034. else
  1035. {
  1036. if (pThis->m_bstrHostName && MyAsciiCmpW(bstrHost, pThis->m_bstrHostName))
  1037. {
  1038. // Skip url; different host name.
  1039. SAFEFREEBSTR(*pbstrItem);
  1040. hrRet = S_FALSE;
  1041. }
  1042. }
  1043. SAFEFREEBSTR(bstrHost);
  1044. }
  1045. if (*pbstrItem && pdwStringData)
  1046. {
  1047. pThis->GetRobotsTxtIndex(*pbstrItem, TRUE, pdwStringData);
  1048. *pdwStringData &= DATA_ROBOTSTXTMASK;
  1049. }
  1050. else if (pdwStringData)
  1051. *pdwStringData = 0;
  1052. return hrRet;
  1053. }
  1054. // S_OK : Already retrieved this robots.txt info
  1055. // S_FALSE : Haven't yet retrieved this robots.txt info
  1056. // E_* : Bad
  1057. HRESULT CWebCrawler::GetRobotsTxtIndex(LPCWSTR pwszUrl, BOOL fAddToList, DWORD *pdwRobotsTxtIndex)
  1058. {
  1059. HRESULT hr=S_OK;
  1060. int iIndex=-1;
  1061. if (m_pRobotsTxt)
  1062. {
  1063. // See which robots.txt file we should use to validate this link
  1064. // If not yet available, add it to the list to be downloaded
  1065. DWORD dwBufLen = lstrlenW(pwszUrl) + ARRAYSIZE(c_wszRobotsTxtURL); // This gets us a terminating NULL
  1066. LPWSTR pwszRobots = (LPWSTR)MemAlloc(LMEM_FIXED, dwBufLen * sizeof(WCHAR));
  1067. int iAddCode;
  1068. if (pwszRobots)
  1069. {
  1070. // PERF: do the internetcombine in startnextdownload
  1071. if (SUCCEEDED(UrlCombineW(pwszUrl, c_wszRobotsTxtURL, pwszRobots, &dwBufLen, 0))
  1072. && !memcmp(pwszRobots, L"http", 4 * sizeof(WCHAR)))
  1073. {
  1074. if (fAddToList)
  1075. {
  1076. iAddCode = m_pRobotsTxt->AddString(pwszRobots, 0, &iIndex);
  1077. }
  1078. else
  1079. {
  1080. if (m_pRobotsTxt->FindString(pwszRobots, -1, &iIndex))
  1081. {
  1082. iAddCode = CWCStringList::STRLST_DUPLICATE;
  1083. }
  1084. else
  1085. {
  1086. iIndex=-1;
  1087. iAddCode = CWCStringList::STRLST_FAIL;
  1088. }
  1089. }
  1090. if (CWCStringList::STRLST_FAIL == iAddCode)
  1091. hr = E_FAIL; // bad news
  1092. else if (CWCStringList::STRLST_ADDED == iAddCode)
  1093. hr = S_FALSE; // haven't gotten it yet
  1094. else
  1095. hr = S_OK; // already got it
  1096. }
  1097. MemFree(pwszRobots);
  1098. }
  1099. else
  1100. hr = E_OUTOFMEMORY;
  1101. }
  1102. else
  1103. {
  1104. hr = E_FAIL; // too many robots.txt files???
  1105. }
  1106. *pdwRobotsTxtIndex = iIndex;
  1107. return hr;
  1108. }
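// Callers use this in two modes: CheckLink passes fAddToList == TRUE so a newly
// seen host's robots.txt is queued for download, while ValidateWithRobotsTxt
// passes FALSE to only look up an index that should already exist.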
  1109. // iRobotsIndex : Index into the m_pRobotsTxt list, -1 if unavailable
  1110. HRESULT CWebCrawler::ValidateWithRobotsTxt(LPCWSTR pwszUrl, int iRobotsIndex, BOOL *pfAllow)
  1111. {
  1112. int iNumDirectives, i;
  1113. CWCStringList *pslThisRobotsTxt=NULL;
  1114. *pfAllow = TRUE;
  1115. if (!m_pRobotsTxt)
  1116. return S_OK;
  1117. if (iRobotsIndex == -1)
  1118. {
  1119. DWORD dwIndex;
  1120. if (S_OK != GetRobotsTxtIndex(pwszUrl, FALSE, &dwIndex))
  1121. return E_FAIL;
  1122. iRobotsIndex = (int)dwIndex;
  1123. }
  1124. if ((iRobotsIndex >= 0) && iRobotsIndex<m_pRobotsTxt->NumStrings())
  1125. {
  1126. pslThisRobotsTxt = (CWCStringList *)(m_pRobotsTxt->GetStringData(iRobotsIndex));
  1127. if (pslThisRobotsTxt)
  1128. {
  1129. iNumDirectives = pslThisRobotsTxt->NumStrings();
  1130. for (i=0; i<iNumDirectives; i++)
  1131. {
  1132. // See if this url starts with the same thing as the directive
  1133. if (!MyAsciiCmpNIW(pwszUrl, pslThisRobotsTxt->GetString(i), pslThisRobotsTxt->GetStringLen(i)))
  1134. {
  1135. // hit! see if this is "allow" or "disallow"
  1136. if (!(pslThisRobotsTxt->GetStringData(i) & DATA_ALLOW))
  1137. {
  1138. TraceMsg(TF_THISMODULE, "ValidateWithRobotsTxt disallowing: (%ws) (%ws)",
  1139. pslThisRobotsTxt->GetString(i), pwszUrl);
  1140. *pfAllow = FALSE;
  1141. m_iSkippedByRobotsTxt ++;
  1142. }
  1143. break;
  1144. }
  1145. }
  1146. }
  1147. return S_OK;
  1148. }
  1149. return E_FAIL;
  1150. }
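// Example: if the matching robots.txt list holds "http://example.com/private/"
// flagged DATA_DISALLOW, then "http://example.com/private/page.htm" prefix-matches
// that directive and is skipped (*pfAllow = FALSE); the first matching directive
// wins, whether allow or disallow.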
  1151. typedef struct
  1152. {
  1153. LPCWSTR pwszThisUrl;
  1154. CWCStringList *pslGlobal;
  1155. BOOL fDiskFull;
  1156. DWORD dwSize;
  1157. GROUPID llGroupID;
  1158. }
  1159. ENUMDEPENDENCIES;
  1160. // Doesn't process it if we already have it in the global dependency list
  1161. HRESULT CWebCrawler::CheckImageOrLink(IUnknown *punkItem, BSTR *pbstrItem, DWORD_PTR dwEnumDep, DWORD *pdwStringData)
  1162. {
  1163. if (!dwEnumDep)
  1164. return E_FAIL;
  1165. ENUMDEPENDENCIES *pEnumDep = (ENUMDEPENDENCIES *) dwEnumDep;
  1166. WCHAR wszCombinedUrl[INTERNET_MAX_URL_LENGTH];
  1167. DWORD dwLen = ARRAYSIZE(wszCombinedUrl);
  1168. HRESULT hr;
  1169. if (pEnumDep->fDiskFull)
  1170. return E_ABORT; // Abort enumeration
  1171. if (SUCCEEDED(UrlCombineW(pEnumDep->pwszThisUrl, *pbstrItem, wszCombinedUrl, &dwLen, 0)))
  1172. {
  1173. TCHAR szCombinedUrl[INTERNET_MAX_URL_LENGTH];
  1174. BYTE chBuf[MY_MAX_CACHE_ENTRY_INFO];
  1175. if (pEnumDep->pslGlobal != NULL)
  1176. {
  1177. int iCode = pEnumDep->pslGlobal->AddString(*pbstrItem, 0);
  1178. if (CWCStringList::STRLST_ADDED != iCode)
  1179. {
  1180. // The string already existed (or Add failed). Don't process this.
  1181. return S_OK;
  1182. }
  1183. }
  1184. // Process this url.
  1185. MyOleStrToStrN(szCombinedUrl, INTERNET_MAX_URL_LENGTH, wszCombinedUrl);
  1186. hr = GetUrlInfoAndMakeSticky(NULL, szCombinedUrl,
  1187. (LPINTERNET_CACHE_ENTRY_INFO)chBuf, sizeof(chBuf),
  1188. pEnumDep->llGroupID);
  1189. if (E_OUTOFMEMORY == hr)
  1190. {
  1191. pEnumDep->fDiskFull = TRUE;
  1192. return E_ABORT; // Skip rest of enumeration
  1193. }
  1194. if (SUCCEEDED(hr))
  1195. pEnumDep->dwSize += ((LPINTERNET_CACHE_ENTRY_INFO)chBuf)->dwSizeLow;
  1196. }
  1197. return S_OK;
  1198. }
  1199. HRESULT CWebCrawler::MatchNames(BSTR bstrName, BOOL fPassword)
  1200. {
  1201. static const WCHAR c_szPassword1[] = L"password";
  1202. static const WCHAR c_szUsername1[] = L"user";
  1203. static const WCHAR c_szUsername2[] = L"username";
  1204. HRESULT hr = E_FAIL;
  1205. LPCTSTR pszKey = c_szRegKeyPasswords;
  1206. // See if the name matches our preset options.
  1207. // Should these be localized? I don't think so or subscribing to
  1208. // US sites will fail in international versions of the browser.
  1209. if (fPassword)
  1210. {
  1211. if (StrCmpIW(bstrName, c_szPassword1) == 0)
  1212. {
  1213. hr = S_OK;
  1214. }
  1215. }
  1216. else
  1217. {
  1218. if ((StrCmpIW(bstrName, c_szUsername1) == 0) ||
  1219. (StrCmpIW(bstrName, c_szUsername2) == 0))
  1220. {
  1221. hr = S_OK;
  1222. }
  1223. else
  1224. {
  1225. pszKey = c_szRegKeyUsernames;
  1226. }
  1227. }
  1228. // Try the registry for custom form names if the presets didn't match.
  1229. if (FAILED(hr))
  1230. {
  1231. LONG lRes;
  1232. HKEY hKey;
  1233. DWORD cValues;
  1234. DWORD i;
  1235. lRes = RegOpenKeyEx(HKEY_CURRENT_USER, pszKey, 0, KEY_READ, &hKey);
  1236. if (ERROR_SUCCESS == lRes)
  1237. {
  1238. lRes = RegQueryInfoKey(hKey, NULL, NULL, NULL, NULL, NULL, NULL, &cValues, NULL, NULL, NULL, NULL);
  1239. if (ERROR_SUCCESS == lRes)
  1240. {
  1241. for (i = 0; i < cValues; i++)
  1242. {
  1243. TCHAR szValueName[MAX_PATH];
  1244. DWORD cchValueName = ARRAYSIZE(szValueName);
  1245. lRes = SHEnumValue(hKey, i, szValueName, &cchValueName, NULL, NULL, NULL);
  1246. if (ERROR_SUCCESS == lRes)
  1247. {
  1248. WCHAR wszValueName[MAX_PATH];
  1249. MyStrToOleStrN(wszValueName, ARRAYSIZE(wszValueName), szValueName);
  1250. if (StrCmpIW(bstrName, wszValueName) == 0)
  1251. {
  1252. hr = S_OK;
  1253. break;
  1254. }
  1255. }
  1256. }
  1257. }
  1258. lRes = RegCloseKey(hKey);
  1259. ASSERT(ERROR_SUCCESS == lRes);
  1260. }
  1261. }
  1262. return hr;
  1263. }
  1264. HRESULT CWebCrawler::FindAndSubmitForm(void)
  1265. {
  1266. // FindAndSubmitForm - If there is a user name and password in
  1267. // the start item, this will attempt to fill in and submit
  1268. // a form. It should only be called on the top level page of a
  1269. // webcrawl. We still need to check the host name in case we were
  1270. // spawned from a channel crawl.
  1271. //
  1272. // return values: S_OK successfully found and submitted a form -> restart webcrawl
  1273. // S_FALSE no username, no form, or unrecognized form ->continue webcrawl
  1274. // E_FAIL submit failed -> abort webcrawl
  1275. //
  1276. HRESULT hrReturn = S_FALSE;
  1277. HRESULT hr = S_OK;
  1278. BSTR bstrUsername = NULL;
  1279. BSTR bstrPassword = NULL;
  1280. BSTR bstrInputType= NULL;
  1281. static const WCHAR c_szInputTextType[]=L"text";
  1282. // If our host name doesn't match the root host name, don't return auth
  1283. // information.
  1284. if (m_bstrHostName)
  1285. {
  1286. LPWSTR pwszUrl, bstrHostName=NULL;
  1287. m_pCurDownload->GetRealURL(&pwszUrl); // may re-enter Trident
  1288. if (pwszUrl)
  1289. {
  1290. GetHostName(pwszUrl, &bstrHostName);
  1291. LocalFree(pwszUrl);
  1292. }
  1293. if (bstrHostName)
  1294. {
  1295. if (MyAsciiCmpW(bstrHostName, m_bstrHostName))
  1296. {
  1297. hr = E_FAIL;
  1298. }
  1299. SysFreeString(bstrHostName);
  1300. }
  1301. }
  1302. if (SUCCEEDED(hr))
  1303. hr = ReadBSTR(m_pSubscriptionItem, c_szPropCrawlUsername, &bstrUsername);
  1304. if (SUCCEEDED(hr) && bstrUsername && bstrUsername[0])
  1305. {
  1306. // NOTE: We don't allow NULL passwords.
  1307. hr = ReadPassword(m_pSubscriptionItem, &bstrPassword);
  1308. if (SUCCEEDED(hr) && bstrPassword && bstrPassword[0])
  1309. {
  1310. IHTMLDocument2 *pDoc = NULL;
  1311. hr = m_pCurDownload->GetDocument(&pDoc);
  1312. if (SUCCEEDED(hr) && pDoc)
  1313. {
  1314. IHTMLElementCollection *pFormsCollection = NULL;
  1315. hr = pDoc->get_forms(&pFormsCollection);
  1316. if (SUCCEEDED(hr) && pFormsCollection)
  1317. {
  1318. long length;
  1319. hr = pFormsCollection->get_length(&length);
  1320. TraceMsg(TF_THISMODULE, "**** FOUND USER NAME, PASSWORD, & %d FORMS ****", (int)length);
  1321. if (SUCCEEDED(hr) && length > 0)
  1322. {
  1323. // We only check the first form for a user name and password.
  1324. // Why do we pass an index to IHTMLElementCollection when
  1325. // the interface prototype says it takes a name?
  1326. IDispatch *pDispForm = NULL;
  1327. VARIANT vIndex, vEmpty;
  1328. VariantInit(&vIndex);
  1329. VariantInit(&vEmpty);
  1330. vIndex.vt = VT_I4;
  1331. vIndex.lVal = 0;
  1332. hr = pFormsCollection->item(vIndex, vEmpty, &pDispForm);
  1333. if (SUCCEEDED(hr) && pDispForm)
  1334. {
  1335. IHTMLFormElement *pForm = NULL;
  1336. hr = pDispForm->QueryInterface(IID_IHTMLFormElement, (void **)&pForm);
  1337. if (SUCCEEDED(hr) && pForm)
  1338. {
  1339. // Enum form elements looking for the input types we care about.
  1340. // Would it be faster to use tags()?
  1341. hr = pForm->get_length(&length);
  1342. if (SUCCEEDED(hr) && length >= 2)
  1343. {
  1344. // TraceMsg(TF_THISMODULE, "**** FORM ELEMENTS (%d) ****", (int)length);
  1345. BOOL fUsernameSet = FALSE;
  1346. BOOL fPasswordSet = FALSE;
  1347. IDispatch *pDispItem = NULL;
  1348. long i;
  1349. for (i = 0; i < length; i++)
  1350. {
  1351. vIndex.lVal = i; // re-use vIndex above
  1352. hr = pForm->item(vIndex, vEmpty, &pDispItem);
  1353. if (SUCCEEDED(hr) && pDispItem)
  1354. {
  1355. IHTMLInputTextElement *pInput = NULL;
  1356. // QI was the easiest way to tell them apart...
  1357. // InputText is derived from InputPassword
  1358. hr = pDispItem->QueryInterface(IID_IHTMLInputTextElement, (void **)&pInput);
  1359. SAFERELEASE(pDispItem);
  1360. if (SUCCEEDED(hr) && pInput)
  1361. {
  1362. hr = pInput->get_type(&bstrInputType);
  1363. ASSERT(SUCCEEDED(hr) && bstrInputType);
  1364. BSTR bstrName = NULL;
  1365. if (StrCmpIW(bstrInputType, c_szInputTextType) == 0)
  1366. {
  1367. // We found an INPUT element with attribute TYPE="text".
  1368. // Set it if the NAME attribute matches.
  1369. // Only setting the first matching input.
  1370. // Do we care about max length or does put_value handle it?
  1371. // TraceMsg(TF_THISMODULE, "**** FORM ELEMENT INPUT (%d) ****", (int)i);
  1372. if (!fUsernameSet)
  1373. {
  1374. hr = pInput->get_name(&bstrName);
  1375. ASSERT(SUCCEEDED(hr) && bstrName);
  1376. if (SUCCEEDED(hr) && bstrName && SUCCEEDED(MatchNames(bstrName, FALSE)))
  1377. {
  1378. hr = pInput->put_value(bstrUsername);
  1379. if (SUCCEEDED(hr))
  1380. fUsernameSet = TRUE;
  1381. }
  1382. }
  1383. }
  1384. else
  1385. {
  1386. // We found an INPUT element with attribute TYPE="password"
  1387. // Set it if the name attribute matches.
  1388. // Only setting the first matching input.
  1389. // Do we care about max length or does put_value handle it?
  1390. // TraceMsg(TF_THISMODULE, "**** FORM ELEMENT PASSWORD (%d) ****", (int)i);
  1391. if (!fPasswordSet)
  1392. {
  1393. hr = pInput->get_name(&bstrName);
  1394. ASSERT(SUCCEEDED(hr) && bstrName);
  1395. if (SUCCEEDED(hr) && bstrName && SUCCEEDED(MatchNames(bstrName, TRUE)))
  1396. {
  1397. hr = pInput->put_value(bstrPassword);
  1398. if (SUCCEEDED(hr))
  1399. fPasswordSet = TRUE;
  1400. }
  1401. }
  1402. }
  1403. SAFEFREEBSTR(bstrName);
  1404. SAFERELEASE(pInput);
  1405. }
  1406. }
  1407. }
  1408. // Submit the form if everything was set.
  1409. if (fUsernameSet && fPasswordSet)
  1410. {
  1411. ASSERT(!m_pCurDownload->GetFormSubmitted());
  1412. m_pCurDownload->SetFormSubmitted(TRUE);
  1413. hr = pForm->submit();
  1414. if (SUCCEEDED(hr))
  1415. {
  1416. m_iNumPagesDownloading ++;
  1417. TraceMsg(TF_THISMODULE, "**** FORM SUBMIT WORKED ****");
  1418. hrReturn = S_OK;
  1419. }
  1420. else
  1421. {
  1422. TraceMsg(TF_THISMODULE, "**** FORM SUBMIT FAILED ****");
  1423. hrReturn = E_FAIL;
  1424. }
  1425. }
  1426. }
  1427. SAFERELEASE(pForm);
  1428. }
  1429. SAFERELEASE(pDispForm);
  1430. }
  1431. // only length
  1432. }
  1433. SAFERELEASE(pFormsCollection);
  1434. }
  1435. SAFERELEASE(pDoc);
  1436. }
  1437. // free bstr below because we check for empty bstrs
  1438. }
  1439. SAFEFREEBSTR(bstrPassword);
  1440. }
  1441. SAFEFREEBSTR(bstrUsername);
  1442. return hrReturn;
  1443. }
  1444. // Make page and dependencies sticky and get total size
  1445. HRESULT CWebCrawler::MakePageStickyAndGetSize(LPCWSTR pwszURL, DWORD *pdwSize, BOOL *pfDiskFull)
  1446. {
  1447. ASSERT(m_pDependencies || IsRecurseFlagSet(WEBCRAWL_DONT_MAKE_STICKY));
  1448. HRESULT hr;
  1449. TCHAR szThisUrl[INTERNET_MAX_URL_LENGTH]; // use ansi internally
  1450. BYTE chBuf[MY_MAX_CACHE_ENTRY_INFO];
  1451. LPINTERNET_CACHE_ENTRY_INFO lpInfo = (LPINTERNET_CACHE_ENTRY_INFO) chBuf;
  1452. DWORD dwBufSize = sizeof(chBuf);
  1453. *pdwSize = 0;
  1454. // First we make our base url sticky and check it for changes
  1455. MyOleStrToStrN(szThisUrl, INTERNET_MAX_URL_LENGTH, pwszURL);
  1456. hr = GetUrlInfoAndMakeSticky(NULL, szThisUrl, lpInfo, dwBufSize, m_llCacheGroupID);
  1457. if (E_OUTOFMEMORY != hr)
  1458. {
  1459. if (SUCCEEDED(hr))
  1460. *pdwSize += lpInfo->dwSizeLow;
  1461. if (!IsAgentFlagSet(FLAG_CRAWLCHANGED) && SUCCEEDED(hr))
  1462. {
  1463. hr = PostCheckUrlForChange(&m_varChange, lpInfo, lpInfo->LastModifiedTime);
  1464. // If we FAILED, we mark it as changed.
  1465. if (hr == S_OK || FAILED(hr))
  1466. {
  1467. SetAgentFlag(FLAG_CRAWLCHANGED);
  1468. DBG("URL has changed; will flag webcrawl as changed");
  1469. }
  1470. // "Changes Only" mode, persist change detection code
  1471. if (IsAgentFlagSet(FLAG_CHANGESONLY))
  1472. {
  1473. ASSERT(m_iTotalStarted == 1);
  1474. WriteVariant(m_pSubscriptionItem, c_szPropChangeCode, &m_varChange);
  1475. return S_OK; // We know there are no dependencies
  1476. }
  1477. hr = S_OK;
  1478. }
  1479. }
  1480. else
  1481. {
  1482. *pfDiskFull = TRUE;
  1483. }
  1484. // Now we make all the new dependencies we downloaded for this page sticky
  1485. if (!*pfDiskFull && m_pDependencies)
  1486. {
  1487. EnterCriticalSection(&m_critDependencies);
  1488. for (; m_iDependenciesProcessed < m_pDependencies->NumStrings(); m_iDependenciesProcessed ++)
  1489. {
  1490. MyOleStrToStrN(szThisUrl, INTERNET_MAX_URL_LENGTH, m_pDependencies->GetString(m_iDependenciesProcessed));
  1491. hr = GetUrlInfoAndMakeSticky(NULL, szThisUrl, lpInfo, dwBufSize, m_llCacheGroupID);
  1492. if (E_OUTOFMEMORY == hr)
  1493. {
  1494. *pfDiskFull = TRUE;
  1495. break;
  1496. }
  1497. if (SUCCEEDED(hr))
  1498. *pdwSize += lpInfo->dwSizeLow;
  1499. }
  1500. LeaveCriticalSection(&m_critDependencies);
  1501. }
  1502. if (*pfDiskFull)
  1503. {
  1504. DBG_WARN("Webcrawler: UrlCache full trying to make sticky");
  1505. return E_OUTOFMEMORY;
  1506. }
  1507. return S_OK;
  1508. }
  1509. // Returns the next token (NUL-terminated in place), or NULL if no token found
  1510. LPSTR GetToken(LPSTR pszBuf, /*inout*/int *piBufPtr, /*out*/int *piLen)
  1511. {
  1512. static const CHAR szWhitespace[] = " \t\n\r";
  1513. int iPtr = *piBufPtr;
  1514. int iLen;
  1515. while (1)
  1516. {
  1517. // skip leading whitespace
  1518. iPtr += StrSpnA(pszBuf+iPtr, szWhitespace);
  1519. if (!pszBuf[iPtr])
  1520. return NULL;
  1521. if (pszBuf[iPtr] == '#')
  1522. {
  1523. // comment; skip line
  1524. while (pszBuf[iPtr] && pszBuf[iPtr]!='\r' && pszBuf[iPtr]!='\n') iPtr++;
  1525. if (!pszBuf[iPtr])
  1526. return NULL;
  1527. continue;
  1528. }
  1529. // skip to next whitespace
  1530. iLen = StrCSpnA(pszBuf+iPtr, szWhitespace);
  1531. if (iLen == 0)
  1532. return NULL; // shouldn't happen
  1533. *piBufPtr = iLen + iPtr;
  1534. if (piLen)
  1535. *piLen = iLen;
  1536. if (pszBuf[iLen+iPtr])
  1537. {
  1538. pszBuf[iLen+iPtr] = NULL;
  1539. ++ *piBufPtr;
  1540. }
  1541. break;
  1542. }
  1543. // TraceMsgA(TF_THISMODULE, "GetToken returning \"%s\"", (LPSTR)(pszBuf+iPtr));
  1544. return pszBuf + iPtr;
  1545. }
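// Example: successive GetToken calls over the buffer
//   "User-Agent: *\n# comment\nDisallow: /cgi-bin/\n"
// return "User-Agent:", "*", "Disallow:" and "/cgi-bin/" in turn (the comment
// line is skipped), each NUL-terminated in place, then NULL at end of buffer.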
  1546. // === Support functions for OnDownloadComplete
  1547. // ParseRobotsTxt gets the stream from CUrlDownload, parses it, and fills in parsed
  1548. // info to *ppslRet
  1549. HRESULT CWebCrawler::ParseRobotsTxt(LPCWSTR pwszRobotsTxtURL, CWCStringList **ppslRet)
  1550. {
  1551. // Given a robots.txt file (from CUrlDownload), it
  1552. // parses the file and fills in a string list with appropriate
  1553. // info.
  1554. *ppslRet = FALSE;
  1555. CHAR szRobotsTxt[MAX_ROBOTS_SIZE];
  1556. HRESULT hr=S_OK;
  1557. LPSTR pszToken;
  1558. IStream *pstm=NULL;
  1559. DWORD_PTR dwData;
  1560. hr = m_pCurDownload->GetStream(&pstm);
  1561. if (SUCCEEDED(hr))
  1562. {
  1563. STATSTG st;
  1564. DWORD dwSize;
  1565. DBG("CWebCrawler parsing robots.txt file");
  1566. pstm->Stat(&st, STATFLAG_NONAME);
  1567. dwSize = st.cbSize.LowPart;
  1568. if (st.cbSize.HighPart || dwSize >= MAX_ROBOTS_SIZE)
  1569. {
  1570. szRobotsTxt[0] = 0;
  1571. DBG("CWebCrawler: Robots.Txt too big; ignoring");
  1572. hr = E_FAIL;
  1573. }
  1574. else
  1575. {
  1576. hr = pstm->Read(szRobotsTxt, dwSize, NULL);
  1577. szRobotsTxt[dwSize] = 0;
  1578. }
  1579. pstm->Release();
  1580. pstm=NULL;
  1581. if ((szRobotsTxt[0] == 0xff) && (szRobotsTxt[1] == 0xfe))
  1582. {
  1583. DBG_WARN("Unicode robots.txt! Ignoring ...");
  1584. hr = E_FAIL;
  1585. }
  1586. }
  1587. if (FAILED(hr))
  1588. return hr;
  1589. int iPtr = 0;
  1590. WCHAR wchBuf2[256];
  1591. WCHAR wchBuf[INTERNET_MAX_URL_LENGTH];
  1592. DWORD dwBufSize;
  1593. // Find the first "user-agent" which matches
  1594. while ((pszToken = GetToken(szRobotsTxt, &iPtr, NULL)) != NULL)
  1595. {
  1596. if (lstrcmpiA(pszToken, c_szRobots_UserAgent))
  1597. continue;
  1598. pszToken = GetToken(szRobotsTxt, &iPtr, NULL);
  1599. if (!pszToken)
  1600. break;
  1601. if ((*pszToken == '*') ||
  1602. (!lstrcmpiA(pszToken, c_szRobots_OurUserAgent)))
  1603. {
  1604. TraceMsgA(TF_THISMODULE, "Using user agent segment: \"%s\"", pszToken);
  1605. break;
  1606. }
  1607. }
  1608. if (!pszToken)
  1609. return E_FAIL;
  1610. CWCStringList *psl = new CWCDwordStringList;
  1611. if (psl)
  1612. {
  1613. psl->Init(2048);
  1614. // Look for Allow: or Disallow: sections
  1615. while ((pszToken = GetToken(szRobotsTxt, &iPtr, NULL)) != NULL)
  1616. {
  1617. if (!lstrcmpiA(pszToken, c_szRobots_UserAgent))
  1618. break; // end of our 'user-agent' section
  1619. dwData = 0;
  1620. if (!lstrcmpiA(pszToken, c_szRobots_Allow)) dwData = DATA_ALLOW;
  1621. if (!lstrcmpiA(pszToken, c_szRobots_Disallow)) dwData = DATA_DISALLOW;
  1622. if (!dwData)
  1623. continue; // look for next token
  1624. pszToken = GetToken(szRobotsTxt, &iPtr, NULL);
  1625. if (!pszToken)
  1626. break;
  1627. // Ensure that they don't have blank entries; we'll abort if so
  1628. if (!lstrcmpiA(pszToken, c_szRobots_UserAgent) ||
  1629. !lstrcmpiA(pszToken, c_szRobots_Allow) ||
  1630. !lstrcmpiA(pszToken, c_szRobots_Disallow))
  1631. {
  1632. break;
  1633. }
  1634. // Combine this url with the base for this site.
  1635. dwBufSize = ARRAYSIZE(wchBuf);
  1636. if (SHAnsiToUnicode(pszToken, wchBuf2, ARRAYSIZE(wchBuf2)) &&
  1637. SUCCEEDED(UrlCombineW(pwszRobotsTxtURL, wchBuf2, wchBuf, &dwBufSize, 0)))
  1638. {
  1639. TraceMsgA(TF_THISMODULE, "Robots.txt will %s urls with %s (%ws)",
  1640. ((dwData==DATA_ALLOW) ? c_szRobots_Allow : c_szRobots_Disallow),
  1641. pszToken, wchBuf);
  1642. // if this is a duplicate url we effectively ignore this directive
  1643. // thanks to CWCStringList removing duplicates for us
  1644. psl->AddString(wchBuf, dwData);
  1645. }
  1646. }
  1647. }
  1648. if (psl && (psl->NumStrings() > 0))
  1649. {
  1650. *ppslRet = psl;
  1651. return S_OK;
  1652. }
  1653. if (psl)
  1654. delete psl;
  1655. return E_FAIL;
  1656. }
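// Illustrative sketch (assumed input; the site and paths are hypothetical):
// for a robots.txt fetched from http://example.com/robots.txt containing
//
//     User-Agent: MSIECrawler
//     Disallow: /cgi-bin/
//     Allow: /cgi-bin/public/
//
// ParseRobotsTxt returns a CWCDwordStringList holding
//     "http://example.com/cgi-bin/"         data = DATA_DISALLOW
//     "http://example.com/cgi-bin/public/"  data = DATA_ALLOW
// Each path is combined with the robots.txt url via UrlCombineW and tagged
// with the Allow/Disallow bit for the later ValidateWithRobotsTxt checks.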
  1657. HRESULT CWebCrawler::GetRealUrl(int iPageIndex, LPWSTR *ppwszThisUrl)
  1658. {
  1659. m_pCurDownload->GetRealURL(ppwszThisUrl);
  1660. if (*ppwszThisUrl)
  1661. {
  1662. return S_OK;
  1663. }
  1664. DBG_WARN("m_pCurDownload->GetRealURL failed!!!");
  1665. // Get url from string list
  1666. LPCWSTR pwszUrl=NULL;
  1667. pwszUrl = m_pPages->GetString(iPageIndex);
  1668. if (pwszUrl)
  1669. {
  1670. *ppwszThisUrl = StrDupW(pwszUrl);
  1671. }
  1672. return (*ppwszThisUrl) ? S_OK : E_OUTOFMEMORY;
  1673. }
  1674. // Allocates BSTR for host name.
  1675. HRESULT CWebCrawler::GetHostName(LPCWSTR pwszThisUrl, BSTR *pbstrHostName)
  1676. {
  1677. if (pwszThisUrl)
  1678. {
  1679. URL_COMPONENTSA comp;
  1680. LPSTR pszUrl;
  1681. int iLen;
  1682. // InternetCrackUrlW(pszUrl, 0, 0, &comp) // this is even slower than converting it ourselves...
  1683. // convert to ansi
  1684. iLen = lstrlenW(pwszThisUrl) + 1;
  1685. pszUrl = (LPSTR)MemAlloc(LMEM_FIXED, iLen);
  1686. if (pszUrl)
  1687. {
  1688. SHUnicodeToAnsi(pwszThisUrl, pszUrl, iLen);
  1689. // crack out the host name
  1690. ZeroMemory(&comp, sizeof(comp));
  1691. comp.dwStructSize = sizeof(comp);
  1692. comp.dwHostNameLength = 1; // indicate that we want the host name
  1693. if (InternetCrackUrlA(pszUrl, 0, 0, &comp))
  1694. {
  1695. *pbstrHostName = SysAllocStringLen(NULL, comp.dwHostNameLength);
  1696. if (*pbstrHostName)
  1697. {
  1698. comp.lpszHostName[comp.dwHostNameLength] = 0; // avoid debug rip
  1699. SHAnsiToUnicode(comp.lpszHostName, *pbstrHostName, comp.dwHostNameLength + 1);
  1700. ASSERT((*pbstrHostName)[comp.dwHostNameLength] == 0);
  1701. }
  1702. }
  1703. MemFree((HLOCAL)pszUrl);
  1704. }
  1705. }
  1706. return S_OK;
  1707. }
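// Illustrative sketch (assumed input): given
//     pwszThisUrl = L"http://www.example.com/sub/page.htm"
// GetHostName allocates *pbstrHostName = L"www.example.com".  The first
// page's host name is kept in m_bstrHostName and compared against later urls
// (see OnAuthenticate) so that, unless WEBCRAWL_LINKS_ELSEWHERE is set, the
// crawl stays on the starting site.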
1708. // Gets link string lists, partly validated (CUrlDownload::IsValidUrl and
1709. // hostname validation), and leaves them in m_pPendingLinks.
1710. // Remaining validation is against robots.txt, if any.
  1711. HRESULT CWebCrawler::GetLinksFromPage()
  1712. {
  1713. // Get links from this page that we want to follow.
  1714. CWCStringList *pslLinks=NULL, slMeta;
  1715. IHTMLDocument2 *pDoc;
  1716. BOOL fFollowLinks = TRUE;
  1717. int i;
  1718. slMeta.Init(2048);
  1719. m_pCurDownload->GetDocument(&pDoc);
  1720. if (pDoc)
  1721. {
  1722. // See if there is a META tag telling us not to follow
  1723. CHelperOM::GetCollection(pDoc, &slMeta, CHelperOM::CTYPE_META, NULL, 0);
  1724. for (i=0; i<slMeta.NumStrings(); i++)
  1725. {
  1726. if (!StrCmpNIW(slMeta.GetString(i), c_wszRobotsMetaName, c_iRobotsMetaNameLen))
  1727. {
  1728. LPCWSTR pwszContent = slMeta.GetString(i) + c_iRobotsMetaNameLen;
  1729. TraceMsg(TF_THISMODULE, "Found 'robots' meta tag; content=%ws", pwszContent);
  1730. while (pwszContent && *pwszContent)
  1731. {
  1732. if (!StrCmpNIW(pwszContent, c_wszRobotsNoFollow, c_iRobotsNoFollow))
  1733. {
  1734. DBG("Not following links from this page.");
  1735. fFollowLinks = FALSE;
  1736. break;
  1737. }
  1738. pwszContent = StrChrW(pwszContent+1, L',');
  1739. if (pwszContent && *pwszContent)
  1740. pwszContent ++;
  1741. }
  1742. break;
  1743. }
  1744. }
  1745. if (fFollowLinks)
  1746. {
  1747. if (m_pPendingLinks)
  1748. pslLinks = m_pPendingLinks;
  1749. else
  1750. {
  1751. pslLinks = new CWCDwordStringList;
  1752. if (pslLinks)
  1753. pslLinks->Init();
  1754. else
  1755. return E_OUTOFMEMORY;
  1756. }
  1757. CHelperOM::GetCollection(pDoc, pslLinks, CHelperOM::CTYPE_LINKS, &CheckLink, (DWORD_PTR)this);
  1758. CHelperOM::GetCollection(pDoc, pslLinks, CHelperOM::CTYPE_MAPS, &CheckLink, (DWORD_PTR)this);
  1759. }
  1760. pDoc->Release();
  1761. pDoc=NULL;
  1762. }
  1763. m_pPendingLinks = pslLinks;
  1764. return S_OK;
  1765. }
  1766. // Gets 'dependency links' such as frames from a page
  1767. HRESULT CWebCrawler::GetDependencyLinksFromPage(LPCWSTR pwszThisUrl, DWORD dwRecurse)
  1768. {
  1769. CWCStringList *psl=NULL;
  1770. IHTMLDocument2 *pDoc;
  1771. int i, iAdd, iIndex, iOldMax;
  1772. DWORD_PTR dwData;
  1773. if (m_pDependencyLinks)
  1774. psl = m_pDependencyLinks;
  1775. else
  1776. {
  1777. m_iDependencyStarted = 0;
  1778. psl = new CWCStringList;
  1779. if (psl)
  1780. psl->Init(2048);
  1781. else
  1782. return E_OUTOFMEMORY;
  1783. }
  1784. iOldMax = psl->NumStrings();
  1785. m_pCurDownload->GetDocument(&pDoc);
  1786. if (pDoc)
  1787. {
  1788. // Add Frames ("Frame" and "IFrame" tags) if present
  1789. CHelperOM::GetCollection(pDoc, psl, CHelperOM::CTYPE_FRAMES, CheckFrame, (DWORD_PTR)pwszThisUrl);
  1790. }
  1791. SAFERELEASE(pDoc);
  1792. m_pDependencyLinks = psl;
  1793. // Add the new urls to the main page list
  1794. for (i = iOldMax; i<psl->NumStrings(); i++)
  1795. {
  1796. iAdd = m_pPages->AddString(m_pDependencyLinks->GetString(i),
  1797. dwRecurse,
  1798. &iIndex);
  1799. if (m_lMaxNumUrls > 0 && iAdd==CWCStringList::STRLST_ADDED)
  1800. m_lMaxNumUrls ++;
  1801. if (iAdd == CWCStringList::STRLST_FAIL)
  1802. return E_OUTOFMEMORY;
  1803. if (iAdd == CWCStringList::STRLST_DUPLICATE)
  1804. {
  1805. // bump up recursion level of old page if necessary
  1806. // See if we've downloaded this yet.
  1807. dwData = m_pPages->GetStringData(iIndex);
  1808. if (!(dwData & DATA_DLSTARTED))
  1809. {
  1810. // Haven't downloaded it yet.
  1811. // Update the recurse levels if necessary.
  1812. if ((dwData & DATA_RECURSEMASK) < dwRecurse)
  1813. {
  1814. dwData = (dwData & ~DATA_RECURSEMASK) | dwRecurse;
  1815. }
  1816. // Turn off the "link" bit
  1817. dwData &= ~DATA_LINK;
  1818. m_pPages->SetStringData(iIndex, dwData);
  1819. }
  1820. #ifdef DEBUG
  1821. // Shouldn't happen; this frame already dl'd with lower recurse level
  1822. else
  1823. ASSERT((dwData & DATA_RECURSEMASK) >= dwRecurse);
  1824. #endif
  1825. }
  1826. }
  1827. return S_OK;
  1828. }
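// Illustrative sketch (assumed values): the low byte of a page's data DWORD
// (DATA_RECURSEMASK) holds how many more levels may be recursed from it.
// If "frame.htm" was first queued as a plain link at depth 0 and is later
// found again as a frame of a page with dwRecurse == 2, the duplicate-add
// path above raises its stored level to 2 and clears DATA_LINK, provided the
// page has not started downloading yet.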
  1829. //-------------------------------------
  1830. // OnDownloadComplete
  1831. //
1832. // Called when a url has finished downloading; processes the url
1833. // and kicks off the next download.
  1834. //
  1835. HRESULT CWebCrawler::OnDownloadComplete(UINT iID, int iError)
  1836. {
  1837. int iPageIndex = m_iCurDownloadStringIndex;
  1838. BOOL fOperationComplete = FALSE;
  1839. BOOL fDiskFull = FALSE;
  1840. BSTR bstrCDFURL = NULL; // CDF URL if there is one
  1841. LPWSTR pwszThisUrl=NULL;
  1842. HRESULT hr;
  1843. TraceMsg(TF_THISMODULE, "WebCrawler: OnDownloadComplete(%d)", iError);
  1844. ASSERT(m_pPages);
  1845. ASSERT(iPageIndex < m_pCurDownloadStringList->NumStrings());
  1846. if (_ERROR_REPROCESSING != iError)
  1847. {
  1848. m_iNumPagesDownloading --;
  1849. ASSERT(m_iNumPagesDownloading == 0);
  1850. }
  1851. if (m_pCurDownloadStringList == m_pRobotsTxt)
  1852. {
  1853. CWCStringList *pslNew=NULL;
  1854. // Process robots.txt file
  1855. if (SUCCEEDED(ParseRobotsTxt(m_pRobotsTxt->GetString(iPageIndex), &pslNew)))
  1856. {
  1857. m_pRobotsTxt->SetStringData(iPageIndex, (DWORD_PTR)(pslNew));
  1858. }
  1859. }
  1860. else
  1861. {
  1862. // Process normal file
  1863. ASSERT(m_pCurDownloadStringList == m_pPages);
  1864. DWORD dwData, dwRecurseLevelsFromThisPage;
  1865. dwData = (DWORD)m_pPages->GetStringData(iPageIndex);
  1866. dwRecurseLevelsFromThisPage = dwData & DATA_RECURSEMASK;
  1867. dwData |= DATA_DLFINISHED;
  1868. if (iError > 0)
  1869. dwData |= DATA_DLERROR;
  1870. // mark as downloaded
  1871. m_pCurDownloadStringList->SetStringData(iPageIndex, dwData);
  1872. // Is this the first page?
  1873. if (m_iTotalStarted == 1)
  1874. {
  1875. // Check the HTTP response code
  1876. DWORD dwResponseCode;
  1877. hr = m_pCurDownload->GetResponseCode(&dwResponseCode);
  1878. if (SUCCEEDED(hr))
  1879. {
  1880. hr = CheckResponseCode(dwResponseCode);
  1881. if (FAILED(hr))
  1882. fOperationComplete = TRUE;
  1883. }
  1884. else
  1885. DBG("CWebCrawler failed to GetResponseCode");
  1886. // Get the Charset
  1887. BSTR bstrCharSet=NULL;
  1888. IHTMLDocument2 *pDoc=NULL;
  1889. // -> Bharats --------
1890. // Find a LINK tag and, if it points to a CDF, save a copy of the CDF url;
1891. // it is combined with this page's base url before download (see StartCDFDownload)
  1892. if (SUCCEEDED(m_pCurDownload->GetDocument(&pDoc)) && pDoc &&
  1893. SUCCEEDED(pDoc->get_charset(&bstrCharSet)) && bstrCharSet)
  1894. {
  1895. WriteOLESTR(m_pSubscriptionItem, c_szPropCharSet, bstrCharSet);
  1896. TraceMsg(TF_THISMODULE, "Charset = \"%ws\"", bstrCharSet);
  1897. SysFreeString(bstrCharSet);
  1898. }
  1899. else
  1900. WriteEMPTY(m_pSubscriptionItem, c_szPropCharSet);
  1901. if(pDoc)
  1902. {
  1903. if(FAILED(GetChannelItem(NULL))) // A Doc exists and this download is not from a channel itself
  1904. {
  1905. IHTMLLinkElement *pLink = NULL;
  1906. hr = SearchForElementInHead(pDoc, OLESTR("REL"), OLESTR("OFFLINE"),
  1907. IID_IHTMLLinkElement, (IUnknown **)&pLink);
  1908. if(S_OK == hr)
  1909. {
  1910. hr = pLink->get_href(&bstrCDFURL);
  1911. pLink->Release();
  1912. }
  1913. }
  1914. pDoc->Release();
  1915. pDoc = NULL;
  1916. }
  1917. }
  1918. if ((iError != _ERROR_REPROCESSING) && (iError != BDU2_ERROR_NONE))
  1919. {
  1920. if (iError != BDU2_ERROR_NOT_HTML)
  1921. m_iDownloadErrors ++;
  1922. if (iError == BDU2_ERROR_MAXSIZE)
  1923. {
  1924. SetEndStatus(INET_E_AGENT_MAX_SIZE_EXCEEDED);
  1925. fOperationComplete = TRUE;
  1926. }
  1927. }
  1928. else
  1929. {
  1930. // Don't process this url if we already have set fOperationComplete
  1931. if (!fOperationComplete)
  1932. {
  1933. // Did we get *just* the HEAD info?
  1934. if (IsAgentFlagSet(FLAG_HEADONLY))
  1935. {
  1936. SYSTEMTIME stLastModified;
  1937. FILETIME ftLastModified;
  1938. if (SUCCEEDED(m_pCurDownload->GetLastModified(&stLastModified)) &&
  1939. SystemTimeToFileTime(&stLastModified, &ftLastModified))
  1940. {
  1941. DBG("Retrieved 'HEAD' info; change detection based on Last Modified");
  1942. hr = PostCheckUrlForChange(&m_varChange, NULL, ftLastModified);
  1943. // If we FAILED, we mark it as changed.
  1944. if (hr == S_OK || FAILED(hr))
  1945. {
  1946. SetAgentFlag(FLAG_CRAWLCHANGED);
  1947. DBG("URL has changed; will flag webcrawl as changed");
  1948. }
  1949. // "Changes Only" mode, persist change detection code
  1950. ASSERT(IsAgentFlagSet(FLAG_CHANGESONLY));
  1951. ASSERT(m_iTotalStarted == 1);
  1952. WriteVariant(m_pSubscriptionItem, c_szPropChangeCode, &m_varChange);
  1953. }
  1954. }
  1955. else
  1956. {
  1957. // Get real URL in case we were redirected
  1958. if (FAILED(GetRealUrl(iPageIndex, &pwszThisUrl)))
  1959. {
  1960. fOperationComplete = TRUE; // bad
  1961. }
  1962. else
  1963. {
  1964. ASSERT(pwszThisUrl);
  1965. // Get host name from first page if necessary
  1966. if ((iPageIndex==0) &&
  1967. (m_dwRecurseLevels>0) &&
  1968. !IsRecurseFlagSet(WEBCRAWL_LINKS_ELSEWHERE) &&
  1969. !m_bstrHostName)
  1970. {
  1971. GetHostName(pwszThisUrl, &m_bstrHostName);
  1972. #ifdef DEBUG
  1973. if (m_bstrHostName)
  1974. TraceMsg(TF_THISMODULE, "Just got first host name: %ws", m_bstrHostName);
  1975. else
  1976. DBG_WARN("Get first host name failed!!!");
  1977. #endif
  1978. }
  1979. DWORD dwCurSize = 0, dwRepeat = 0;
  1980. HRESULT hr1;
  1981. do
  1982. {
  1983. hr1 = S_OK;
  1984. // Make page and dependencies sticky and get their total size
  1985. fDiskFull = FALSE;
  1986. MakePageStickyAndGetSize(pwszThisUrl, &dwCurSize, &fDiskFull);
  1987. if (fDiskFull && (dwRepeat < 2))
  1988. {
  1989. // If we couldn't make stuff sticky, ask host to make cache bigger
  1990. hr1 = m_pAgentEvents->ReportError(&m_SubscriptionCookie,
  1991. INET_E_AGENT_EXCEEDING_CACHE_SIZE, NULL);
  1992. if (hr1 == E_PENDING)
  1993. {
  1994. // Host is going to ask the user to increase the cache size.
  1995. // Host should either abort or resume us later.
  1996. SetAgentFlag(FLAG_WAITING_FOR_INCREASED_CACHE);
  1997. goto done;
  1998. }
  1999. else if (hr1 == INET_S_AGENT_INCREASED_CACHE_SIZE)
  2000. {
  2001. // Host just increased the cache size. Try it again.
  2002. }
  2003. else
  2004. {
  2005. // Not gonna do it. Abort.
  2006. }
  2007. }
  2008. }
  2009. while ((hr1 == INET_S_AGENT_INCREASED_CACHE_SIZE) && (++dwRepeat <= 2));
  2010. m_dwCurSize += dwCurSize;
  2011. // Is there form based authentication that we need to handle
  2012. // on the top page of this subscription?
  2013. if (!fDiskFull && (0 == iPageIndex) && !m_pCurDownload->GetFormSubmitted())
  2014. {
  2015. hr = FindAndSubmitForm();
  2016. if (S_OK == hr)
  2017. {
  2018. // Successfully submitted form. Bail and wait for the next OnDownloadComplete() call.
  2019. // FEATURE: Should we make the form URL and dependencies sticky?
  2020. return S_OK;
  2021. }
  2022. else if (FAILED(hr))
  2023. {
  2024. // We failed trying to submit the form. Bail.
  2025. // FEATURE: Should we set a better error string?
  2026. SetEndStatus(E_FAIL);
  2027. CleanUp();
  2028. return S_OK;
  2029. }
  2030. // else no form - fall through
  2031. }
  2032. TraceMsg(TF_THISMODULE, "WebCrawler up to %d kb", (int)(m_dwCurSize>>10));
  2033. if ((m_lMaxNumUrls < 0) &&
  2034. !dwRecurseLevelsFromThisPage &&
  2035. !(dwData & DATA_CODEBASE))
  2036. {
  2037. m_lMaxNumUrls = m_pPages->NumStrings() + ((m_pRobotsTxt) ? m_pRobotsTxt->NumStrings() : 0);
  2038. }
  2039. } // SUCCEEDED(GetRealUrl)
  2040. } // !FLAG_HEADONLY
  2041. } // !fOperationComplete
  2042. // If we're in "Changes Only" mode, we're done.
  2043. if (IsAgentFlagSet(FLAG_CHANGESONLY))
  2044. fOperationComplete = TRUE;
  2045. // Check to see if we're past our max size
2046. if (!fOperationComplete && (fDiskFull || (m_dwMaxSize && (m_dwCurSize >= (m_dwMaxSize<<10)))))
  2047. {
  2048. #ifdef DEBUG
  2049. if (fDiskFull)
  2050. DBG_WARN("Disk/cache full; aborting.");
  2051. else
  2052. TraceMsg(TF_WARNING, "Past maximum size; aborting. (%d kb of %d kb)", (int)(m_dwCurSize>>10), (int)m_dwMaxSize);
  2053. #endif
  2054. // abort operation
  2055. fOperationComplete = TRUE;
  2056. if (fDiskFull)
  2057. {
  2058. SetEndStatus(INET_E_AGENT_CACHE_SIZE_EXCEEDED);
  2059. }
  2060. else
  2061. {
  2062. SetEndStatus(INET_E_AGENT_MAX_SIZE_EXCEEDED);
  2063. }
  2064. }
  2065. if (!fOperationComplete)
  2066. {
  2067. // Get any links from page
  2068. // Get "dependency links" from page - frames, etc.
  2069. // we do this even if a CDF file is specified
  2070. // Essentially, since the user has no clue about the CDF
  2071. // file - we do not want to confuse the user
  2072. GetDependencyLinksFromPage(pwszThisUrl, dwRecurseLevelsFromThisPage);
  2073. if (dwRecurseLevelsFromThisPage)
  2074. {
  2075. // Get links from this page that we want to follow.
  2076. GetLinksFromPage();
  2077. if (m_pPendingLinks)
  2078. TraceMsg(TF_THISMODULE,
  2079. "Total of %d unique valid links found", m_pPendingLinks->NumStrings());
  2080. m_dwPendingRecurseLevel = dwRecurseLevelsFromThisPage - 1;
  2081. }
  2082. }
  2083. } // !iError
  2084. } // !robots.txt
  2085. if(!fOperationComplete)
  2086. StartCDFDownload(bstrCDFURL, pwszThisUrl);
  2087. if(!m_fCDFDownloadInProgress)
  2088. {
  2089. // Don't try code downloads or any of the rest until you're done with
  2090. // the cdf download
  2091. // See if we have any more URLs to download.
  2092. if (!fOperationComplete && FAILED(StartNextDownload()))
  2093. fOperationComplete = TRUE; // No, we're done!
  2094. }
  2095. CheckOperationComplete(fOperationComplete);
  2096. done:
  2097. if (pwszThisUrl)
  2098. MemFree(pwszThisUrl);
  2099. SAFEFREEBSTR(bstrCDFURL);
  2100. return S_OK;
  2101. }
  2102. HRESULT CWebCrawler::StartCDFDownload(WCHAR *pwszCDFURL, WCHAR *pwszBaseUrl)
  2103. {
  2104. HRESULT hr = E_FAIL;
  2105. m_fCDFDownloadInProgress = FALSE;
  2106. if(pwszCDFURL)
  2107. {
  2108. // We have a CDF File - begin download of it
  2109. if (m_pRunAgent)
  2110. {
  2111. ASSERT(0);
2112. DBG_WARN("WebCrawler: Attempting to download next CDF when another CDF download exists.");
  2113. hr = E_FAIL;
  2114. goto Exit;
  2115. }
  2116. else
  2117. {
  2118. // create subscription item for CDL agent.
  2119. ISubscriptionItem *pItem = NULL;
  2120. if (m_dwMaxSize && ((m_dwCurSize>>10) >= m_dwMaxSize))
  2121. {
  2122. // We've exceeded our maximum download KB limit and can't continue.
  2123. DBG_WARN("WebCrawler: Exceeded Maximum KB download limit with CodeBase download.");
  2124. SetEndStatus(hr = INET_E_AGENT_MAX_SIZE_EXCEEDED);
  2125. goto Exit;
  2126. }
  2127. if (!m_pSubscriptionItem ||
  2128. FAILED(hr = DoCloneSubscriptionItem(m_pSubscriptionItem, NULL, &pItem)))
  2129. {
  2130. goto Exit;
  2131. }
  2132. ASSERT(pItem != NULL);
  2133. ASSERT(pwszCDFURL != NULL);
  2134. WCHAR wszCombined[INTERNET_MAX_URL_LENGTH];
  2135. DWORD dwBufSize = ARRAYSIZE(wszCombined);
  2136. if (SUCCEEDED(UrlCombineW(pwszBaseUrl, pwszCDFURL, wszCombined, &dwBufSize, 0)))
  2137. {
  2138. WriteOLESTR(pItem, c_szPropURL, wszCombined);
  2139. WriteEMPTY(pItem, c_szPropCrawlGroupID); // clear the old cache group id - don't want
  2140. // children to know of it
  2141. // The crawler already has a cache group id that we simply use as the new ID
  2142. WriteLONGLONG(pItem, c_szPropCrawlNewGroupID, m_llCacheGroupID);
  2143. WriteDWORD(pItem, c_szPropChannelFlags, CHANNEL_AGENT_PRECACHE_ALL);
  2144. // Finally - since we know that this is for offline use, we just set the flags to precache all
  2145. m_pRunAgent = new CRunDeliveryAgent();
  2146. if (m_pRunAgent)
  2147. hr = m_pRunAgent->Init((CRunDeliveryAgentSink *)this, pItem, CLSID_ChannelAgent);
  2148. pItem->Release();
  2149. if (m_pRunAgent && SUCCEEDED(hr))
  2150. {
  2151. hr = m_pRunAgent->StartAgent();
  2152. if (hr == E_PENDING)
  2153. {
  2154. hr = S_OK;
  2155. m_fCDFDownloadInProgress = TRUE;
  2156. }
  2157. }
  2158. else
  2159. {
  2160. hr = E_OUTOFMEMORY;
  2161. }
  2162. }
  2163. }
  2164. }
  2165. Exit:
  2166. if((S_OK != hr) && m_pRunAgent)
  2167. {
  2168. CRunDeliveryAgent::SafeRelease(m_pRunAgent);
  2169. }
  2170. return hr;
  2171. }
  2172. // CRunDeliveryAgentSink call back method to signal the end of a codebase download.
  2173. HRESULT CWebCrawler::OnAgentEnd(const SUBSCRIPTIONCOOKIE *pSubscriptionCookie,
  2174. long lSizeDownloaded, HRESULT hrResult, LPCWSTR wszResult,
  2175. BOOL fSynchronous)
  2176. {
  2177. ASSERT(m_pRunAgent != NULL);
  2178. BOOL fOperationComplete = FALSE;
  2179. CRunDeliveryAgent::SafeRelease(m_pRunAgent);
  2180. if(m_fCDFDownloadInProgress)
  2181. {
  2182. m_fCDFDownloadInProgress = FALSE;
  2183. }
  2184. else
  2185. {
  2186. int iPageIndex = m_iCurDownloadStringIndex;
  2187. BOOL fDiskFull = FALSE;
  2188. CCodeBaseHold *pcbh = NULL;
  2189. BOOL fError;
  2190. LPCWSTR pwszThisURL=NULL;
  2191. TraceMsg(TF_THISMODULE, "WebCrawler: OnAgentEnd of CRunDeliveryAgentSink");
  2192. ASSERT(m_pCodeBaseList);
  2193. ASSERT(iPageIndex < m_pCurDownloadStringList->NumStrings());
  2194. ASSERT(m_pCurDownloadStringList == m_pCodeBaseList);
  2195. m_iNumPagesDownloading --;
  2196. ASSERT(m_iNumPagesDownloading == 0);
  2197. pcbh = (CCodeBaseHold *)m_pCodeBaseList->GetStringData(iPageIndex);
  2198. pwszThisURL = m_pCodeBaseList->GetString(iPageIndex);
  2199. ASSERT(pwszThisURL);
  2200. pcbh->dwFlags |= DATA_DLFINISHED;
  2201. fError = FAILED(hrResult);
  2202. if (fSynchronous)
  2203. {
  2204. fError = TRUE;
  2205. ASSERT(FAILED(hrResult)); // we can't succeed synchronously...
  2206. }
2207. //NOTE: The CDL agent will abort if it finds the file exceeds MaxSizeKB. In that case the file is not
2208. // counted, and there may be other, smaller CABs that can still be downloaded, so we continue.
  2209. if (fError)
  2210. {
  2211. pcbh->dwFlags |= DATA_DLERROR;
  2212. m_iDownloadErrors ++;
  2213. SetEndStatus(hrResult);
  2214. }
  2215. else
  2216. {
  2217. BYTE chBuf[MY_MAX_CACHE_ENTRY_INFO];
  2218. LPINTERNET_CACHE_ENTRY_INFO lpInfo = (LPINTERNET_CACHE_ENTRY_INFO) chBuf;
  2219. TCHAR szUrl[INTERNET_MAX_URL_LENGTH];
  2220. MyOleStrToStrN(szUrl, INTERNET_MAX_URL_LENGTH, pwszThisURL);
  2221. if (FAILED(GetUrlInfoAndMakeSticky(NULL, szUrl,
  2222. lpInfo, sizeof(chBuf), m_llCacheGroupID)))
  2223. {
  2224. //REVIEW: Do something here? Unlikely to occur in practice.
  2225. fOperationComplete = TRUE;
  2226. ASSERT(0);
  2227. }
  2228. else
  2229. {
  2230. m_dwCurSize += lpInfo->dwSizeLow;
  2231. }
  2232. TraceMsg(TF_THISMODULE, "WebCrawler up to %d kb", (int)(m_dwCurSize>>10));
  2233. if (m_dwMaxSize && ((m_dwCurSize>>10)>m_dwMaxSize))
  2234. {
  2235. // abort operation
  2236. fOperationComplete = TRUE;
  2237. if (fDiskFull)
  2238. SetEndStatus(INET_E_AGENT_CACHE_SIZE_EXCEEDED);
  2239. else
  2240. SetEndStatus(INET_E_AGENT_MAX_SIZE_EXCEEDED);
  2241. }
  2242. } // !fError
  2243. }
  2244. // See if we have any more URLs to download.
  2245. if (!fOperationComplete && FAILED(StartNextDownload()))
  2246. fOperationComplete = TRUE; // No, we're done!
  2247. if(!fSynchronous)
  2248. CheckOperationComplete(fOperationComplete);
  2249. return S_OK;
  2250. }
  2251. //////////////////////////////////////////////////////////////////////////
  2252. //
2253. // CheckOperationComplete :: If parameter is TRUE, then all downloads are
  2254. // complete, the appropriate STATUS_CODE is set
  2255. // and clean up initiated.
  2256. //
  2257. //////////////////////////////////////////////////////////////////////////
  2258. void CWebCrawler::CheckOperationComplete(BOOL fOperationComplete)
  2259. {
  2260. if (fOperationComplete)
  2261. {
  2262. DBG("WebCrawler complete. Shutting down.");
  2263. if (INET_S_AGENT_BASIC_SUCCESS == GetEndStatus())
  2264. {
  2265. // Set end status appropriately
  2266. if (m_iDownloadErrors)
  2267. {
  2268. if (m_iPagesStarted<=1)
  2269. {
  2270. DBG("Webcrawl failed - first URL failed.");
  2271. SetEndStatus(E_INVALIDARG);
  2272. }
  2273. else
  2274. {
  2275. DBG("Webcrawl succeeded - some URLs failed.");
  2276. SetEndStatus(INET_S_AGENT_PART_FAIL);
  2277. }
  2278. }
  2279. else
  2280. {
  2281. DBG("Webcrawl succeeded");
  2282. if (!IsAgentFlagSet(FLAG_CRAWLCHANGED))
  2283. {
  2284. SetEndStatus(S_FALSE);
  2285. DBG("No changes were detected");
  2286. }
  2287. else
  2288. {
  2289. DBG("Webcrawl succeeded");
  2290. SetEndStatus(S_OK);
  2291. }
  2292. }
  2293. }
  2294. if (m_llOldCacheGroupID)
  2295. {
  2296. DBG("Nuking old cache group.");
  2297. if (!DeleteUrlCacheGroup(m_llOldCacheGroupID, 0, 0))
  2298. {
  2299. DBG_WARN("Failed to delete old cache group!");
  2300. }
  2301. }
  2302. WriteLONGLONG(m_pSubscriptionItem, c_szPropCrawlGroupID, m_llCacheGroupID);
  2303. m_lSizeDownloadedKB = ((m_dwCurSize+511)>>10);
  2304. WriteDWORD(m_pSubscriptionItem, c_szPropCrawlActualSize, m_lSizeDownloadedKB);
  2305. if (m_lMaxNumUrls >= 0)
  2306. {
  2307. WriteDWORD(m_pSubscriptionItem, c_szPropActualProgressMax, m_lMaxNumUrls);
  2308. }
  2309. // Send a robots.txt warning to the user if we ended up not downloading stuff
  2310. // because of the server's robots.txt file
  2311. if (m_iSkippedByRobotsTxt != 0)
  2312. {
  2313. HRESULT hr = S_OK; // Make it an "information" message
  2314. WCHAR wszMessage[200];
  2315. if (m_iPagesStarted==1)
  2316. {
  2317. hr = INET_E_AGENT_WARNING; // Unless we're missing almost everything
  2318. }
  2319. if (MLLoadStringW(IDS_CRAWL_ROBOTS_TXT_WARNING, wszMessage, ARRAYSIZE(wszMessage)))
  2320. {
  2321. m_pAgentEvents->ReportError(&m_SubscriptionCookie, hr, wszMessage);
  2322. }
  2323. }
  2324. // Will call "UpdateEnd"
  2325. CleanUp();
  2326. }
  2327. }
  2328. HRESULT CWebCrawler::ModifyUpdateEnd(ISubscriptionItem *pEndItem, UINT *puiRes)
  2329. {
  2330. // Customize our end status string
  2331. switch (GetEndStatus())
  2332. {
  2333. case INET_E_AGENT_MAX_SIZE_EXCEEDED :
  2334. *puiRes = IDS_AGNT_STATUS_SIZELIMIT; break;
  2335. case INET_E_AGENT_CACHE_SIZE_EXCEEDED :
  2336. *puiRes = IDS_AGNT_STATUS_CACHELIMIT; break;
  2337. case E_FAIL : *puiRes = IDS_CRAWL_STATUS_NOT_OK; break;
  2338. case S_OK :
  2339. if (!IsAgentFlagSet(FLAG_CHANGESONLY))
  2340. *puiRes = IDS_CRAWL_STATUS_OK;
  2341. else
  2342. *puiRes = IDS_URL_STATUS_OK;
  2343. break;
  2344. case S_FALSE :
  2345. if (!IsAgentFlagSet(FLAG_CHANGESONLY))
  2346. *puiRes = IDS_CRAWL_STATUS_UNCHANGED;
  2347. else
  2348. *puiRes = IDS_URL_STATUS_UNCHANGED;
  2349. break;
  2350. case INET_S_AGENT_PART_FAIL : *puiRes = IDS_CRAWL_STATUS_MOSTLYOK; break;
  2351. }
  2352. return CDeliveryAgent::ModifyUpdateEnd(pEndItem, puiRes);
  2353. }
  2354. HRESULT CWebCrawler::DownloadStart(LPCWSTR pchUrl, DWORD dwDownloadId, DWORD dwType, DWORD dwReserved)
  2355. {
  2356. HRESULT hr = S_OK, hr2;
  2357. // free threaded
  2358. EnterCriticalSection(&m_critDependencies);
  2359. if (NULL == pchUrl)
  2360. {
  2361. DBG_WARN("CWebCrawler::DownloadStart pchUrl=NULL");
  2362. }
  2363. else
  2364. {
  2365. // Check to see if this is already in our dependencies list and abort if so
  2366. if (CWCStringList::STRLST_ADDED != m_pDependencies->AddString(pchUrl, 0))
  2367. {
  2368. hr = E_ABORT; // Don't download this thing.
  2369. TraceMsg(TF_THISMODULE, "Aborting mshtml url (already added): %ws", pchUrl);
  2370. }
  2371. if (SUCCEEDED(hr))
  2372. {
  2373. // Check to see if this fails the robots.txt and abort if so
  2374. // Note, this will only work if we happen to have already gotten this robots.txt
  2375. // Need to abort here if we haven't gotten it, then get it, then get just this dep. Yuck.
  2376. // Also shouldn't do the check if this is the first page downloaded
  2377. DWORD dwIndex;
  2378. hr2 = GetRobotsTxtIndex(pchUrl, FALSE, &dwIndex);
  2379. if (SUCCEEDED(hr2))
  2380. {
  2381. BOOL fAllow;
  2382. if (SUCCEEDED(ValidateWithRobotsTxt(pchUrl, dwIndex, &fAllow)))
  2383. {
  2384. if (!fAllow)
  2385. hr = E_ABORT; // ooh, failed the test.
  2386. }
  2387. }
  2388. }
  2389. }
  2390. LeaveCriticalSection(&m_critDependencies);
  2391. return hr;
  2392. }
  2393. HRESULT CWebCrawler::DownloadComplete(DWORD dwDownloadId, HRESULT hrNotify, DWORD dwReserved)
  2394. {
  2395. // free threaded
  2396. // Do nothing. We may wish to post message to make sticky here. We may wish to
  2397. // mark as downloaded in string list here.
  2398. // EnterCriticalSection(&m_critDependencies);
  2399. // LeaveCriticalSection(&m_critDependencies);
  2400. return S_OK;
  2401. }
  2402. /* 41927 (IE5 4491)
  2403. HRESULT CWebCrawler::OnGetReferer(LPCWSTR *ppwszReferer)
  2404. {
  2405. if (m_iPagesStarted <= 1)
  2406. {
  2407. *ppwszReferer = NULL;
  2408. return S_FALSE;
  2409. }
  2410. if (m_pCurDownloadStringList == m_pRobotsTxt)
  2411. {
  2412. // Referer is last page from main list to be downloaded
  2413. *ppwszReferer = m_pPages->GetString(m_iPagesStarted-1);
  2414. return S_OK;
  2415. }
  2416. if (m_pCurDownloadStringList == m_pPages)
  2417. {
  2418. // Referer is stored in string list data
  2419. *ppwszReferer = m_pPages->GetString(
  2420. ((m_pPages->GetStringData(m_iCurDownloadStringIndex) & DATA_REFERERMASK) >> DATA_REFERERSHIFT));
  2421. return S_OK;
  2422. }
  2423. // We don't return a referer for code bases
  2424. ASSERT(m_pCurDownloadStringList == m_pCodeBaseList);
  2425. return S_FALSE;
  2426. }
  2427. */
  2428. HRESULT CWebCrawler::OnAuthenticate(HWND *phwnd, LPWSTR *ppszUsername, LPWSTR *ppszPassword)
  2429. {
  2430. HRESULT hr, hrRet=E_FAIL;
  2431. ASSERT(phwnd && ppszUsername && ppszPassword);
  2432. ASSERT((HWND)-1 == *phwnd && NULL == *ppszUsername && NULL == *ppszPassword);
  2433. // If our host name doesn't match the root host name, don't return auth
  2434. // information.
  2435. LPWSTR pwszUrl, bstrHostName=NULL;
  2436. m_pCurDownload->GetRealURL(&pwszUrl); // may re-enter Trident
  2437. if (pwszUrl)
  2438. {
  2439. GetHostName(pwszUrl, &bstrHostName);
  2440. LocalFree(pwszUrl);
  2441. }
  2442. if (bstrHostName)
  2443. {
  2444. if (!m_bstrHostName || !MyAsciiCmpW(bstrHostName, m_bstrHostName))
  2445. {
  2446. // Host names match. Return auth information.
  2447. // If we're hosted by channel agent, use its auth information
  2448. ISubscriptionItem *pChannel=NULL;
  2449. ISubscriptionItem *pItem=m_pSubscriptionItem;
  2450. if (SUCCEEDED(GetChannelItem(&pChannel)))
  2451. {
  2452. pItem = pChannel;
  2453. }
  2454. hr = ReadOLESTR(pItem, c_szPropCrawlUsername, ppszUsername);
  2455. if (SUCCEEDED(hr))
  2456. {
  2457. BSTR bstrPassword = NULL;
  2458. hr = ReadPassword(pItem, &bstrPassword);
  2459. if (SUCCEEDED(hr))
  2460. {
  2461. int len = (lstrlenW(bstrPassword) + 1) * sizeof(WCHAR);
  2462. *ppszPassword = (LPWSTR) CoTaskMemAlloc(len);
  2463. if (*ppszPassword)
  2464. {
  2465. CopyMemory(*ppszPassword, bstrPassword, len);
  2466. }
  2467. SAFEFREEBSTR(bstrPassword);
  2468. if (*ppszPassword)
  2469. {
  2470. hrRet = S_OK;
  2471. }
  2472. }
  2473. }
  2474. if (FAILED(hrRet))
  2475. {
  2476. SAFEFREEOLESTR(*ppszUsername);
  2477. SAFEFREEOLESTR(*ppszPassword);
  2478. }
  2479. SAFERELEASE(pChannel);
  2480. }
  2481. SysFreeString(bstrHostName);
  2482. }
  2483. return hrRet;
  2484. }
  2485. HRESULT CWebCrawler::OnClientPull(UINT iID, LPCWSTR pwszOldURL, LPCWSTR pwszNewURL)
  2486. {
  2487. // CUrlDownload is informing us it's about to do a client pull.
  2488. // Let's send out a progress report for the new url
  2489. SendUpdateProgress(pwszNewURL, m_iTotalStarted, m_lMaxNumUrls);
  2490. // Now we need to process the current url: make it and dependencies sticky
  2491. DWORD dwCurSize=0;
  2492. BOOL fDiskFull=FALSE;
  2493. MakePageStickyAndGetSize(pwszOldURL, &dwCurSize, &fDiskFull);
  2494. m_dwCurSize += dwCurSize;
  2495. TraceMsg(TF_THISMODULE, "WebCrawler processed page prior to client pull - now up to %d kb", (int)(m_dwCurSize>>10));
  2496. // Tell CUrlDownload to go ahead and download the new url
  2497. return S_OK;
  2498. }
  2499. HRESULT CWebCrawler::OnOleCommandTargetExec(const GUID *pguidCmdGroup, DWORD nCmdID,
  2500. DWORD nCmdexecopt, VARIANTARG *pvarargIn,
  2501. VARIANTARG *pvarargOut)
  2502. {
  2503. HRESULT hr = OLECMDERR_E_NOTSUPPORTED;
  2504. IPropertyBag2 *pPropBag = NULL;
  2505. int i;
  2506. //REVIEW: CLSID for this not yet defined.
  2507. if ( pguidCmdGroup
  2508. && (*pguidCmdGroup == CGID_JavaParambagCompatHack)
  2509. && (nCmdID == 0)
  2510. && (nCmdexecopt == MSOCMDEXECOPT_DONTPROMPTUSER))
  2511. {
  2512. if (!IsRecurseFlagSet(WEBCRAWL_GET_CONTROLS))
  2513. {
  2514. goto Exit;
  2515. }
  2516. uCLSSPEC ucs;
  2517. QUERYCONTEXT qc = { 0 };
  2518. ucs.tyspec = TYSPEC_CLSID;
  2519. ucs.tagged_union.clsid = CLSID_JavaVM;
  2520. // Check to see if Java VM is installed. Don't try to get applets if not.
  2521. if (!SUCCEEDED(FaultInIEFeature(NULL, &ucs, &qc, FIEF_FLAG_PEEK)))
  2522. {
  2523. goto Exit;
  2524. }
  2525. ULONG enIndex;
  2526. const DWORD enMax = 7, enMin = 0;
  2527. PROPBAG2 pb[enMax];
  2528. VARIANT vaProps[enMax];
  2529. HRESULT hrResult[enMax];
  2530. enum { enCodeBase = 0, enCabBase, enCabinets, enArchive, enUsesLib, enLibrary, enUsesVer };
  2531. LPWSTR pwszThisURL = NULL;
  2532. int chLen;
  2533. //REVIEW: This will need to be reviewed later when matching trident code is available
  2534. // and details worked out.
  2535. if ((pvarargIn->vt != VT_UNKNOWN) ||
  2536. (FAILED(pvarargIn->punkVal->QueryInterface(IID_IPropertyBag2, (void **)&pPropBag))))
  2537. {
  2538. goto Exit;
  2539. }
  2540. if (FAILED(GetRealUrl(m_iCurDownloadStringIndex, &pwszThisURL)))
  2541. {
  2542. pwszThisURL = StrDupW(L"");
  2543. }
  2544. // PROPBAG2 structure for data retrieval
  2545. for (i=enMin; i<enMax; i++)
  2546. {
  2547. pb[i].dwType = PROPBAG2_TYPE_DATA;
  2548. pb[i].vt = VT_BSTR;
  2549. pb[i].cfType = NULL; // CLIPFORMAT
  2550. pb[i].dwHint = 0; // ????
  2551. pb[i].pstrName = NULL;
  2552. pb[i].clsid = CLSID_NULL; // ????
  2553. vaProps[i].vt = VT_EMPTY;
  2554. vaProps[i].bstrVal = NULL;
  2555. hrResult[i] = E_FAIL;
  2556. }
  2557. if (((pb[enCodeBase].pstrName = SysAllocString(L"CODEBASE")) != NULL) &&
  2558. ((pb[enCabBase].pstrName = SysAllocString(L"CABBASE")) != NULL) &&
  2559. ((pb[enCabinets].pstrName = SysAllocString(L"CABINETS")) != NULL) &&
  2560. ((pb[enArchive].pstrName = SysAllocString(L"ARCHIVE")) != NULL) &&
  2561. ((pb[enUsesLib].pstrName = SysAllocString(L"USESLIBRARY")) != NULL) &&
  2562. ((pb[enLibrary].pstrName = SysAllocString(L"USESLIBRARYCODEBASE")) != NULL) &&
  2563. ((pb[enUsesVer].pstrName = SysAllocString(L"USESLIBRARYVERSION")) != NULL))
  2564. {
  2565. //Read returns E_FAIL even if it read some of the properties.
  2566. //Since we check hrResult's below this isn't a big deal.
  2567. hr = pPropBag->Read(enMax, &pb[0], NULL, &vaProps[0], &hrResult[0]);
  2568. {
  2569. BSTR bstrCodeBase = NULL;
  2570. // check for CODEBASE
  2571. if (SUCCEEDED(hrResult[enCodeBase]) && (vaProps[enCodeBase].vt == VT_BSTR))
  2572. {
  2573. bstrCodeBase = vaProps[enCodeBase].bstrVal;
  2574. }
  2575. // add a trailing slash if not already present
  2576. chLen = lstrlenW(bstrCodeBase);
  2577. if (chLen && bstrCodeBase[chLen-1] != '/')
  2578. {
  2579. LPWSTR szNewCodeBase = 0;
  2580. int nLen = chLen + 2;
  2581. szNewCodeBase = (LPWSTR) LocalAlloc(0,sizeof(WCHAR)*nLen);
  2582. if (szNewCodeBase)
  2583. {
  2584. StrCpyNW(szNewCodeBase, bstrCodeBase, nLen);
  2585. StrCatBuffW(szNewCodeBase, L"/", nLen);
  2586. SAFEFREEBSTR(bstrCodeBase);
  2587. bstrCodeBase = vaProps[enCodeBase].bstrVal = SysAllocString(szNewCodeBase);
  2588. LocalFree(szNewCodeBase);
  2589. }
  2590. }
  2591. // check for CABBASE
  2592. if (SUCCEEDED(hrResult[enCabBase]) && (vaProps[enCabBase].vt == VT_BSTR))
  2593. {
  2594. BSTR szCabBase = vaProps[enCabBase].bstrVal;
  2595. // Add CABBASE URL to list of CABs to pull.
  2596. if (SUCCEEDED(CombineBaseAndRelativeURLs(pwszThisURL, bstrCodeBase, &szCabBase)))
  2597. {
  2598. m_pPages->AddString(szCabBase, 0);
  2599. }
  2600. }
  2601. // check for CABINETS
  2602. for (enIndex = enCabinets; enIndex<(enArchive+1); enIndex++)
  2603. {
  2604. if (SUCCEEDED(hrResult[enIndex]) && (vaProps[enIndex].vt == VT_BSTR))
  2605. {
  2606. BSTR szCur = vaProps[enIndex].bstrVal, szPrev = NULL;
  2607. while (szCur)
  2608. {
  2609. WCHAR wcCur = *szCur;
  2610. if ((wcCur == L'+') || (wcCur == L',') || (wcCur == L'\0'))
  2611. {
  2612. BSTR szLast = szPrev, szCabBase = NULL;
  2613. BOOL bLastFile = FALSE;
  2614. if (!szPrev)
  2615. {
  2616. szLast = vaProps[enIndex].bstrVal;
  2617. }
  2618. szPrev = szCur; szPrev++;
  2619. if (*szCur == L'\0')
  2620. {
  2621. bLastFile = TRUE;
  2622. }
  2623. *szCur = (unsigned short)L'\0';
  2624. // szLast points to current CabBase.
  2625. szCabBase = SysAllocString(szLast);
  2626. if (SUCCEEDED(CombineBaseAndRelativeURLs(pwszThisURL, bstrCodeBase, &szCabBase)))
  2627. {
  2628. int iAdd=m_pPages->AddString(szCabBase, DATA_CODEBASE);
  2629. if (m_lMaxNumUrls > 0 && iAdd==CWCStringList::STRLST_ADDED)
  2630. m_lMaxNumUrls ++;
  2631. }
  2632. SAFEFREEBSTR(szCabBase);
  2633. if (bLastFile)
  2634. {
  2635. szCur = NULL;
  2636. break;
  2637. }
  2638. }
  2639. szCur++;
  2640. } // while (szCur)
  2641. } // cabinets
  2642. }
  2643. // check for USESLIBRARY* parameters.
  2644. CCodeBaseHold *pcbh = NULL;
  2645. if (SUCCEEDED(hrResult[enUsesLib]) && (vaProps[enUsesLib].vt == VT_BSTR) &&
  2646. SUCCEEDED(hrResult[enLibrary]) && (vaProps[enLibrary].vt == VT_BSTR))
  2647. {
  2648. BSTR szThisLibCAB = NULL;
  2649. pcbh = new CCodeBaseHold();
  2650. if (pcbh)
  2651. {
  2652. pcbh->szDistUnit = SysAllocString(vaProps[enUsesLib].bstrVal);
  2653. pcbh->dwVersionMS = pcbh->dwVersionLS = -1;
  2654. pcbh->dwFlags = 0;
  2655. szThisLibCAB = SysAllocString(vaProps[enLibrary].bstrVal);
  2656. if (FAILED(CombineBaseAndRelativeURLs(pwszThisURL, bstrCodeBase, &szThisLibCAB)) ||
  2657. m_pCodeBaseList->AddString(szThisLibCAB, (DWORD_PTR)pcbh) != CWCStringList::STRLST_ADDED)
  2658. {
  2659. SAFEFREEBSTR(pcbh->szDistUnit);
  2660. SAFEDELETE(pcbh);
  2661. }
  2662. SAFEFREEBSTR(szThisLibCAB);
  2663. }
  2664. }
  2665. // Check for USESLIBRARYVERSION (optional)
  2666. if (pcbh && SUCCEEDED(hrResult[enUsesVer]) && (vaProps[enUsesVer].vt == VT_BSTR))
  2667. {
  2668. int iLen = SysStringByteLen(vaProps[enUsesVer].bstrVal)+1;
  2669. CHAR *szVerStr = (LPSTR)MemAlloc(LMEM_FIXED, iLen);
  2670. if (szVerStr)
  2671. {
  2672. SHUnicodeToAnsi(vaProps[enUsesVer].bstrVal, szVerStr, iLen);
  2673. if (FAILED(GetVersionFromString(szVerStr,
  2674. &pcbh->dwVersionMS, &pcbh->dwVersionLS)))
  2675. {
  2676. hr = HRESULT_FROM_WIN32(GetLastError());
2677. // (szVerStr is freed once, below; freeing it here as well would be a double free)
  2678. SAFEFREEBSTR(pcbh->szDistUnit);
  2679. SAFEDELETE(pcbh);
  2680. }
  2681. MemFree(szVerStr);
  2682. }
  2683. }
  2684. }
  2685. } // Read properties
  2686. for (i=enMin; i<enMax; i++)
  2687. {
  2688. SAFEFREEBSTR(pb[i].pstrName);
  2689. }
  2690. if (pwszThisURL)
  2691. LocalFree(pwszThisURL);
  2692. hr = S_OK;
  2693. }
  2694. Exit:
  2695. SAFERELEASE(pPropBag);
  2696. return hr;
  2697. }
  2698. HRESULT CWebCrawler::GetDownloadNotify(IDownloadNotify **ppOut)
  2699. {
  2700. HRESULT hr=S_OK;
  2701. if (m_pDownloadNotify)
  2702. {
  2703. m_pDownloadNotify->LeaveMeAlone();
  2704. m_pDownloadNotify->Release();
  2705. m_pDownloadNotify=NULL;
  2706. }
  2707. CDownloadNotify *pdn = new CDownloadNotify(this);
  2708. if (pdn)
  2709. {
  2710. hr = pdn->Initialize();
  2711. if (SUCCEEDED(hr))
  2712. {
  2713. m_pDownloadNotify = pdn;
  2714. *ppOut = m_pDownloadNotify;
  2715. m_pDownloadNotify->AddRef();
  2716. }
  2717. else
  2718. {
  2719. pdn->Release();
  2720. }
  2721. }
  2722. else
  2723. {
  2724. hr = E_OUTOFMEMORY;
  2725. *ppOut = NULL;
  2726. }
  2727. return hr;
  2728. }
  2729. //---------------------------------------------------------------
  2730. // CWebCrawler::CDownloadNotify class
  2731. //---------------------------------------------------------------
  2732. CWebCrawler::CDownloadNotify::CDownloadNotify(CWebCrawler *pParent)
  2733. {
  2734. ASSERT(pParent);
  2735. m_cRef = 1;
  2736. m_pParent = pParent;
  2737. pParent->AddRef();
  2738. }
  2739. HRESULT CWebCrawler::CDownloadNotify::Initialize()
  2740. {
  2741. m_hrCritParent = InitializeCriticalSectionAndSpinCount(&m_critParent, 0) ? S_OK : E_OUTOFMEMORY;
  2742. return m_hrCritParent;
  2743. }
  2744. CWebCrawler::CDownloadNotify::~CDownloadNotify()
  2745. {
  2746. DBG("Destroying CWebCrawler::CDownloadNotify");
  2747. ASSERT(!m_pParent);
  2748. SAFERELEASE(m_pParent);
  2749. if (SUCCEEDED(m_hrCritParent))
  2750. {
  2751. DeleteCriticalSection(&m_critParent);
  2752. }
  2753. }
  2754. void CWebCrawler::CDownloadNotify::LeaveMeAlone()
  2755. {
  2756. if (m_pParent)
  2757. {
  2758. EnterCriticalSection(&m_critParent);
  2759. SAFERELEASE(m_pParent);
  2760. LeaveCriticalSection(&m_critParent);
  2761. }
  2762. }
  2763. // IUnknown members
  2764. HRESULT CWebCrawler::CDownloadNotify::QueryInterface(REFIID riid, void **ppv)
  2765. {
  2766. if ((IID_IUnknown == riid) ||
  2767. (IID_IDownloadNotify == riid))
  2768. {
  2769. *ppv = (IDownloadNotify *)this;
  2770. }
  2771. else
  2772. {
  2773. *ppv = NULL;
  2774. return E_NOINTERFACE;
  2775. }
  2776. ((LPUNKNOWN)*ppv)->AddRef();
  2777. return S_OK;
  2778. }
  2779. ULONG CWebCrawler::CDownloadNotify::AddRef(void)
  2780. {
  2781. return InterlockedIncrement(&m_cRef);
  2782. }
  2783. ULONG CWebCrawler::CDownloadNotify::Release(void)
  2784. {
  2785. ASSERT( 0 != m_cRef );
  2786. ULONG cRef = InterlockedDecrement(&m_cRef);
  2787. if ( 0 == cRef )
  2788. {
  2789. delete this;
  2790. }
  2791. return cRef;
  2792. }
  2793. // IDownloadNotify
  2794. HRESULT CWebCrawler::CDownloadNotify::DownloadStart(LPCWSTR pchUrl, DWORD dwDownloadId, DWORD dwType, DWORD dwReserved)
  2795. {
  2796. HRESULT hr = E_ABORT; // abort it if we have nobody listening
  2797. TraceMsg(TF_THISMODULE, "DownloadStart id=%d url=%ws", dwDownloadId, pchUrl ? pchUrl : L"(null)");
  2798. EnterCriticalSection(&m_critParent);
  2799. if (m_pParent)
  2800. hr = m_pParent->DownloadStart(pchUrl, dwDownloadId, dwType, dwReserved);
  2801. LeaveCriticalSection(&m_critParent);
  2802. return hr;
  2803. }
  2804. HRESULT CWebCrawler::CDownloadNotify::DownloadComplete(DWORD dwDownloadId, HRESULT hrNotify, DWORD dwReserved)
  2805. {
  2806. HRESULT hr = S_OK;
  2807. // TraceMsg(TF_THISMODULE, "DownloadComplete id=%d hr=%x", dwDownloadId, hrNotify);
  2808. EnterCriticalSection(&m_critParent);
  2809. if (m_pParent)
  2810. hr = m_pParent->DownloadComplete(dwDownloadId, hrNotify, dwReserved);
  2811. LeaveCriticalSection(&m_critParent);
  2812. return hr;
  2813. }
  2814. //////////////////////////////////////////////////////////////////////////
  2815. //
  2816. // Other functions
  2817. //
  2818. //////////////////////////////////////////////////////////////////////////
  2819. // Make a single absolute or relative url sticky and get size
  2820. HRESULT GetUrlInfoAndMakeSticky(
  2821. LPCTSTR pszBaseUrl,
  2822. LPCTSTR pszThisUrl,
  2823. LPINTERNET_CACHE_ENTRY_INFO lpCacheEntryInfo,
  2824. DWORD dwBufSize,
  2825. GROUPID llCacheGroupID)
  2826. {
  2827. DWORD dwSize;
  2828. TCHAR szCombined[INTERNET_MAX_URL_LENGTH];
  2829. ASSERT(lpCacheEntryInfo);
  2830. // Combine urls if necessary
  2831. if (pszBaseUrl)
  2832. {
  2833. dwSize = ARRAYSIZE(szCombined);
  2834. if (SUCCEEDED(UrlCombine(pszBaseUrl, pszThisUrl,
  2835. szCombined, &dwSize, 0)))
  2836. {
  2837. pszThisUrl = szCombined;
  2838. }
  2839. else
  2840. DBG_WARN("UrlCombine failed!");
  2841. }
  2842. // Add the size of this URL
  2843. lpCacheEntryInfo->dwStructSize = dwBufSize;
  2844. if (!GetUrlCacheEntryInfo(pszThisUrl, lpCacheEntryInfo, &dwBufSize))
  2845. {
  2846. #ifdef DEBUG
  2847. if (GetLastError() == ERROR_INSUFFICIENT_BUFFER)
  2848. DBG_WARN("Failed GetUrlCacheEntryInfo, insufficient buffer");
  2849. else
  2850. TraceMsgA(llCacheGroupID ? TF_WARNING : TF_THISMODULE,
  2851. "Failed GetUrlCacheEntryInfo (not in cache) URL=%ws", pszThisUrl);
  2852. #endif
  2853. return E_FAIL;
  2854. }
  2855. // Add to new group
  2856. if (llCacheGroupID != 0)
  2857. {
  2858. if (!SetUrlCacheEntryGroup(pszThisUrl, INTERNET_CACHE_GROUP_ADD,
  2859. llCacheGroupID, NULL, 0, NULL))
  2860. {
  2861. switch (GetLastError())
  2862. {
  2863. case ERROR_FILE_NOT_FOUND: // Huh? Must not have been able to add the index entry?
  2864. case ERROR_DISK_FULL:
  2865. return E_OUTOFMEMORY;
  2866. case ERROR_NOT_ENOUGH_QUOTA:
  2867. return S_OK; // We do our own quota handling.
  2868. default:
  2869. TraceMsgA(TF_WARNING | TF_THISMODULE, "GetUrlInfoAndMakeSticky: Got unexpected error from SetUrlCacheEntryGroup() - GLE = 0x%08x", GetLastError());
  2870. return E_FAIL;
  2871. }
  2872. }
  2873. }
  2874. return S_OK;
  2875. }
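// Illustrative sketch (hypothetical caller): the usual pattern is a fixed
// MY_MAX_CACHE_ENTRY_INFO buffer, an optional base url for relative links,
// and the crawl's cache group so the entry is pinned:
//
//     BYTE chBuf[MY_MAX_CACHE_ENTRY_INFO];
//     LPINTERNET_CACHE_ENTRY_INFO lpInfo = (LPINTERNET_CACHE_ENTRY_INFO)chBuf;
//     if (SUCCEEDED(GetUrlInfoAndMakeSticky(pszBaseUrl, pszThisUrl,
//                                           lpInfo, sizeof(chBuf), llGroupID)))
//     {
//         dwTotalSize += lpInfo->dwSizeLow;       // accumulate crawl size
//     }
//     // E_OUTOFMEMORY means the cache or disk is full; callers such as
//     // OnDownloadComplete treat that as "stop and report".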
  2876. // GenerateCode will generate a DWORD code from a file.
  2877. #define ELEMENT_PER_READ 256
  2878. #define ELEMENT_SIZE sizeof(DWORD)
  2879. HRESULT GenerateCode(LPCTSTR lpszLocalFileName, DWORD *pdwRet)
  2880. {
  2881. DWORD dwCode=0;
  2882. DWORD dwData[ELEMENT_PER_READ], i, dwRead;
  2883. HRESULT hr = S_OK;
  2884. HANDLE hFile;
  2885. hFile = CreateFile(lpszLocalFileName, GENERIC_READ,
  2886. FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING,
  2887. 0, NULL);
  2888. if (INVALID_HANDLE_VALUE != hFile)
  2889. {
  2890. do
  2891. {
  2892. dwRead = 0;
  2893. if (ReadFile(hFile, dwData, ELEMENT_PER_READ * ELEMENT_SIZE, &dwRead, NULL))
  2894. {
  2895. for (i=0; i<dwRead / ELEMENT_SIZE; i++)
  2896. {
2897. dwCode = ((dwCode << 31) | (dwCode >> 1)) + dwData[i]; // rotate right one bit, then add
  2898. // dwCode += dwData[i];
  2899. }
  2900. }
  2901. }
  2902. while (ELEMENT_PER_READ * ELEMENT_SIZE == dwRead);
  2903. CloseHandle(hFile);
  2904. }
  2905. else
  2906. {
  2907. hr = E_FAIL;
  2908. TraceMsg(TF_THISMODULE|TF_WARNING,"GenerateCode: Unable to open cache file, Error=%x", GetLastError());
  2909. }
  2910. *pdwRet = dwCode;
  2911. return hr;
  2912. }
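// Illustrative sketch (assumed file contents): the code is built by rotating
// the running DWORD right one bit and adding each DWORD read from the file:
//
//     DWORD dwCode = 0;
//     // file contents, read as DWORDs: 0x00000001, 0x00000002
//     dwCode = ((dwCode << 31) | (dwCode >> 1)) + 0x00000001;   // 0x00000001
//     dwCode = ((dwCode << 31) | (dwCode >> 1)) + 0x00000002;   // 0x80000002
//
// The result is only compared for equality (see PostCheckUrlForChange), so a
// content change is likely, though not guaranteed, to change the code.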
  2913. // S_OK : We retrieved a good last modified or content code to use
  2914. // S_FALSE : We fell back to using the one passed into pvarChange
  2915. // E_FAIL : We failed miserably.
  2916. // E_INVALIDARG : Get a clue
  2917. // *pfGetContent : TRUE if we need a GET for PostCheckUrlForChange to work right
  2918. HRESULT PreCheckUrlForChange(LPCTSTR lpURL, VARIANT *pvarChange, BOOL *pfGetContent)
  2919. {
  2920. BYTE chBuf[MY_MAX_CACHE_ENTRY_INFO];
  2921. LPINTERNET_CACHE_ENTRY_INFO lpInfo = (LPINTERNET_CACHE_ENTRY_INFO) chBuf;
  2922. if (pvarChange->vt != VT_EMPTY && pvarChange->vt != VT_I4 && pvarChange->vt != VT_CY)
  2923. return E_INVALIDARG;
  2924. if (SUCCEEDED(GetUrlInfoAndMakeSticky(NULL, lpURL, lpInfo, sizeof(chBuf), 0)))
  2925. {
  2926. FILETIME ftOldLastModified = *((FILETIME *) &pvarChange->cyVal);
  2927. if (lpInfo->LastModifiedTime.dwHighDateTime || lpInfo->LastModifiedTime.dwLowDateTime)
  2928. {
  2929. // We have a last modified time. Use it or the persisted one.
  2930. if (pfGetContent)
  2931. *pfGetContent = FALSE;
  2932. if ((pvarChange->vt != VT_CY)
  2933. || (lpInfo->LastModifiedTime.dwHighDateTime > ftOldLastModified.dwHighDateTime)
  2934. || ((lpInfo->LastModifiedTime.dwHighDateTime == ftOldLastModified.dwHighDateTime)
  2935. && (lpInfo->LastModifiedTime.dwLowDateTime > ftOldLastModified.dwLowDateTime)))
  2936. {
  2937. // Cache Last Modified is newer than saved Last Modified. Use cache's.
  2938. pvarChange->vt = VT_CY;
  2939. pvarChange->cyVal = *((CY *)&(lpInfo->LastModifiedTime));
  2940. return S_OK;
  2941. }
  2942. ASSERT(pvarChange->vt == VT_CY);
  2943. // Persisted Last Modified time is most recent. Use it.
  2944. return S_OK;
  2945. }
  2946. DWORD dwCode;
  2947. if (SUCCEEDED(GenerateCode(lpInfo->lpszLocalFileName, &dwCode)))
  2948. {
  2949. pvarChange->vt = VT_I4;
  2950. pvarChange->lVal = (LONG) dwCode;
  2951. if (pfGetContent)
  2952. *pfGetContent = TRUE;
  2953. return S_OK;
  2954. }
  2955. // Failed GenerateCode. Weird. Fall through.
  2956. }
  2957. if (pvarChange->vt != VT_EMPTY)
  2958. {
  2959. if (pfGetContent)
  2960. *pfGetContent = (pvarChange->vt == VT_I4);
  2961. return S_FALSE;
  2962. }
  2963. // We don't have old change detection, we don't have cache content, better GET
  2964. if (pfGetContent)
  2965. *pfGetContent = TRUE;
  2966. return E_FAIL; // Couldn't get anything. pvarChange->vt==VT_EMPTY
  2967. }
  2968. // S_FALSE : no change
  2969. // S_OK : changed
  2970. // E_ : failure of some sort
  2971. // pvarChange from PreCheckUrlForChange. We return a new one.
  2972. // lpInfo : must be valid if *pfGetContent was TRUE
  2973. // ftNewLastModified : must be filled in if *pfGetContent was FALSE
  2974. HRESULT PostCheckUrlForChange(VARIANT *pvarChange,
  2975. LPINTERNET_CACHE_ENTRY_INFO lpInfo,
  2976. FILETIME ftNewLastModified)
  2977. {
  2978. HRESULT hr = S_FALSE;
  2979. VARIANT varChangeNew;
  2980. DWORD dwNewCode = 0;
  2981. if (!pvarChange || (pvarChange->vt != VT_I4 && pvarChange->vt != VT_CY && pvarChange->vt != VT_EMPTY))
  2982. return E_INVALIDARG;
  2983. varChangeNew.vt = VT_EMPTY;
  2984. if (ftNewLastModified.dwHighDateTime || ftNewLastModified.dwLowDateTime)
  2985. {
  2986. varChangeNew.vt = VT_CY;
  2987. varChangeNew.cyVal = *((CY *) &ftNewLastModified);
  2988. }
  2989. else
  2990. {
  2991. if (lpInfo &&
  2992. SUCCEEDED(GenerateCode(lpInfo->lpszLocalFileName, &dwNewCode)))
  2993. {
  2994. varChangeNew.vt = VT_I4;
  2995. varChangeNew.lVal = dwNewCode;
  2996. }
  2997. }
  2998. if (pvarChange->vt == VT_CY)
  2999. {
  3000. // We have an old last modified time. Use that to determine change.
  3001. FILETIME ftOldLastModified = *((FILETIME *) &(pvarChange->cyVal));
  3002. if ((!ftNewLastModified.dwHighDateTime && !ftNewLastModified.dwLowDateTime)
  3003. || (ftNewLastModified.dwHighDateTime > ftOldLastModified.dwHighDateTime)
  3004. || ((ftNewLastModified.dwHighDateTime == ftOldLastModified.dwHighDateTime)
  3005. && (ftNewLastModified.dwLowDateTime > ftOldLastModified.dwLowDateTime)))
  3006. {
  3007. // NewLastModified > OldLastModified (or we don't have a NewLastModified)
  3008. DBG("PostCheckUrlForChange change detected via Last Modified");
  3009. hr = S_OK; // We have changed
  3010. }
  3011. }
  3012. else if (pvarChange->vt == VT_I4)
  3013. {
  3014. // We have an old code. Use that to determine change.
  3015. DWORD dwOldCode = (DWORD) (pvarChange->lVal);
  3016. if ((dwOldCode != dwNewCode) ||
  3017. !dwNewCode)
  3018. {
  3019. DBG("PostCheckUrlForChange change detected via content code");
  3020. hr = S_OK; // We have changed
  3021. }
  3022. }
  3023. else
  3024. hr = E_FAIL; // No old code.
  3025. *pvarChange = varChangeNew;
  3026. return hr;
  3027. }
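// Illustrative sketch (hypothetical caller): the Pre/Post pair brackets a
// download.  pvarChange persists across updates: VT_CY holds a last-modified
// FILETIME, VT_I4 a content code, VT_EMPTY means no history yet.
//
//     VARIANT varChange;      // read from the subscription item
//     BOOL    fGetContent;
//     PreCheckUrlForChange(pszUrl, &varChange, &fGetContent);
//     // ... download; obtain ftNewLastModified and/or cache entry info ...
//     HRESULT hr = PostCheckUrlForChange(&varChange, lpInfo, ftNewLastModified);
//     if (hr == S_OK || FAILED(hr))
//         SetAgentFlag(FLAG_CRAWLCHANGED);    // changed, or assume changed
//     // varChange now holds the new time/code and is persisted for next time.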
  3028. //////////////////////////////////////////////////////////////////////////
  3029. //
  3030. // CHelperOM implementation
  3031. //
  3032. //////////////////////////////////////////////////////////////////////////
  3033. CHelperOM::CHelperOM(IHTMLDocument2 *pDoc)
  3034. {
  3035. ASSERT(pDoc);
  3036. m_pDoc = pDoc;
  3037. if (pDoc)
  3038. pDoc->AddRef();
  3039. }
  3040. CHelperOM::~CHelperOM()
  3041. {
  3042. SAFERELEASE(m_pDoc);
  3043. }
  3044. HRESULT CHelperOM::GetTagCollection(
  3045. IHTMLDocument2 *pDoc,
  3046. LPCWSTR wszTagName,
  3047. IHTMLElementCollection **ppCollection)
  3048. {
  3049. IHTMLElementCollection *pAll=NULL;
  3050. IDispatch *pDisp=NULL;
  3051. VARIANT TagName;
  3052. HRESULT hr;
  3053. // We have to get "all", then sub-collection
  3054. hr = pDoc->get_all(&pAll);
  3055. if (pAll)
  3056. {
  3057. TagName.vt = VT_BSTR;
  3058. TagName.bstrVal = SysAllocString(wszTagName);
  3059. if (NULL == TagName.bstrVal)
  3060. hr = E_OUTOFMEMORY;
  3061. else
  3062. {
  3063. hr = pAll->tags(TagName, &pDisp);
  3064. SysFreeString(TagName.bstrVal);
  3065. }
  3066. pAll->Release();
  3067. }
  3068. if (pDisp)
  3069. {
  3070. hr = pDisp->QueryInterface(IID_IHTMLElementCollection,
  3071. (void **)ppCollection);
  3072. pDisp->Release();
  3073. }
  3074. if (FAILED(hr)) DBG("GetSubCollection failed");
  3075. return hr;
  3076. }
  3077. // Collections we get:
  3078. //
  3079. // IHTMLWindow2->get_document
  3080. // IHTMLDocument2 ->get_links
  3081. // IHTMLElementCollection->item
  3082. // ->get_hostname
  3083. // ->get_href
  3084. // ->get_all
  3085. // ->tags("map")
  3086. // IHTMLElementCollection ->item
  3087. // ->get_areas
  3088. // IHTMLElementCollection ->item
  3089. // IHTMLAreaElement ->get_href
  3090. // ->get_all
  3091. // ->tags("meta")
  3092. // IHTMLElementCollection ->item
  3093. // ->get_all
  3094. // ->tags("frame")
  3095. // IHTMLElementCollection ->item
  3096. // ->get_all
  3097. // ->tags("iframe")
  3098. // IHTMLElementCollection ->item
  3099. // We recurse EnumCollection to get the maps (since
  3100. // it's a collection of collections)
  3101. // hideous hack: IHTMLElementCollection can actually be IHTMLAreasCollection
  3102. // the interface used to be derived from the other. It still has identical
  3103. // methods. We typecast just in case that changes. Hopefully they will fix
  3104. // so that Areas is derived from Element again.
  3105. HRESULT CHelperOM::EnumCollection(
  3106. IHTMLElementCollection *pCollection,
  3107. CWCStringList *pStringList,
  3108. CollectionType Type,
  3109. PFN_CB pfnCB,
  3110. DWORD_PTR dwCBData)
  3111. {
  3112. IHTMLAnchorElement *pLink;
  3113. IHTMLMapElement *pMap;
  3114. IHTMLAreaElement *pArea;
  3115. IHTMLMetaElement *pMeta;
  3116. IHTMLElement *pEle;
  3117. IDispatch *pDispItem = NULL;
  3118. HRESULT hr;
  3119. BSTR bstrItem=NULL;
  3120. long l, lCount;
  3121. VARIANT vIndex, vEmpty, vData;
  3122. BSTR bstrTmp1, bstrTmp2;
  3123. DWORD dwStringData;
  3124. VariantInit(&vEmpty);
  3125. VariantInit(&vIndex);
  3126. VariantInit(&vData);
  3127. if (Type==CTYPE_MAP)
  3128. hr = ((IHTMLAreasCollection *)pCollection)->get_length(&lCount);
  3129. else
  3130. hr = pCollection->get_length(&lCount);
  3131. if (FAILED(hr))
  3132. lCount = 0;
  3133. #ifdef DEBUG
  3134. LPSTR lpDSTR[]={"Links","Maps","Areas (links) In Map", "Meta", "Frames"};
  3135. TraceMsgA(TF_THISMODULE, "CWebCrawler::GetCollection, %d %s found", lCount, lpDSTR[(int)Type]);
  3136. #endif
  3137. for (l=0; l<lCount; l++)
  3138. {
  3139. vIndex.vt = VT_I4;
  3140. vIndex.lVal = l;
  3141. dwStringData = 0;
  3142. if (Type==CTYPE_MAP)
  3143. hr = ((IHTMLAreasCollection *)pCollection)->item(vIndex, vEmpty, &pDispItem);
  3144. else
  3145. hr = pCollection->item(vIndex, vEmpty, &pDispItem);
  3146. if (SUCCEEDED(hr))
  3147. {
  3148. ASSERT(vData.vt == VT_EMPTY);
  3149. ASSERT(!bstrItem);
  3150. if (pDispItem)
  3151. {
  3152. // Get the URL from the IDispatch
  3153. switch(Type)
  3154. {
  3155. case CTYPE_LINKS: // get href from <a>
  3156. hr = pDispItem->QueryInterface(IID_IHTMLAnchorElement, (void **)&pLink);
  3157. if (SUCCEEDED(hr) && pLink)
  3158. {
  3159. hr = pLink->get_href(&bstrItem);
  3160. pLink->Release();
  3161. }
  3162. break;
  3163. case CTYPE_MAPS: // enumeration areas for this map
  3164. hr = pDispItem->QueryInterface(IID_IHTMLMapElement, (void **)&pMap);
  3165. if (SUCCEEDED(hr) && pMap)
  3166. {
  3167. IHTMLAreasCollection *pNewCollection=NULL;
  3168. // This gives us another collection. Enumerate it
  3169. // for the strings.
  3170. hr = pMap->get_areas(&pNewCollection);
  3171. if (pNewCollection)
  3172. {
  3173. hr = EnumCollection((IHTMLElementCollection *)pNewCollection, pStringList, CTYPE_MAP, pfnCB, dwCBData);
  3174. pNewCollection->Release();
  3175. }
  3176. pMap->Release();
  3177. }
  3178. break;
  3179. case CTYPE_MAP: // get href for this area
  3180. hr = pDispItem->QueryInterface(IID_IHTMLAreaElement, (void **)&pArea);
  3181. if (SUCCEEDED(hr) && pArea)
  3182. {
  3183. hr = pArea->get_href(&bstrItem);
  3184. pArea->Release();
  3185. }
  3186. break;
  3187. case CTYPE_META: // get meta name and content as single string
  3188. hr = pDispItem->QueryInterface(IID_IHTMLMetaElement, (void **)&pMeta);
  3189. if (SUCCEEDED(hr) && pMeta)
  3190. {
  3191. pMeta->get_name(&bstrTmp1);
  3192. pMeta->get_content(&bstrTmp2);
  3193. if (bstrTmp1 && bstrTmp2 && *bstrTmp1 && *bstrTmp2)
  3194. {
  3195. int nLen = lstrlenW(bstrTmp1) + lstrlenW(bstrTmp2) + 2;
  3196. bstrItem = SysAllocStringLen(NULL, nLen);
3197. if (bstrItem) { // SysAllocStringLen can return NULL under low memory
3198. StrCpyNW(bstrItem, bstrTmp1, nLen); StrCatBuffW(bstrItem, L"\n", nLen);
3199. StrCatBuffW(bstrItem, bstrTmp2, nLen); }
  3200. }
  3201. SysFreeString(bstrTmp1);
  3202. SysFreeString(bstrTmp2);
  3203. pMeta->Release();
  3204. }
  3205. break;
  3206. case CTYPE_FRAMES: // get "src" attribute
  3207. hr = pDispItem->QueryInterface(IID_IHTMLElement, (void **)&pEle);
  3208. if (SUCCEEDED(hr) && pEle)
  3209. {
  3210. bstrTmp1 = SysAllocString(L"SRC");
  3211. if (bstrTmp1)
  3212. {
  3213. hr = pEle->getAttribute(bstrTmp1, VARIANT_FALSE, &vData);
  3214. if (SUCCEEDED(hr) && vData.vt == VT_BSTR)
  3215. {
  3216. bstrItem = vData.bstrVal;
  3217. vData.vt = VT_EMPTY;
  3218. }
  3219. else
  3220. VariantClear(&vData);
  3221. SysFreeString(bstrTmp1);
  3222. }
  3223. else
  3224. {
  3225. hr = E_FAIL;
  3226. }
  3227. pEle->Release();
  3228. }
  3229. break;
  3230. default:
  3231. ASSERT(0);
  3232. // bug in calling code
  3233. }
  3234. if (SUCCEEDED(hr) && bstrItem)
  3235. {
  3236. // Verify we want to add this item to string list & get data
  3237. if (pfnCB)
  3238. hr = pfnCB(pDispItem, &bstrItem, dwCBData, &dwStringData);
  3239. if (SUCCEEDED(hr) && bstrItem && pStringList)
  3240. pStringList->AddString(bstrItem, dwStringData);
  3241. }
  3242. SAFERELEASE(pDispItem);
  3243. SAFEFREEBSTR(bstrItem);
  3244. }
  3245. }
  3246. if (E_ABORT == hr)
  3247. {
  3248. DBG_WARN("Aborting enumeration in CHelperOM::EnumCollection at callback's request.");
  3249. break;
  3250. }
  3251. }
  3252. return hr;
  3253. }
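// Sketch of a filter callback, for illustration only (the name and the filtering
// rule are hypothetical; only the signature is taken from the pfnCB call above):
//
//     HRESULT MyFilterCB(IDispatch *pDispItem, BSTR *pbstrItem, DWORD_PTR dwCBData, DWORD *pdwStringData)
//     {
//         // Dropping the string keeps this item out of the string list.
//         if (*pbstrItem && 0 == StrCmpNIW(*pbstrItem, L"javascript:", 11))
//             SAFEFREEBSTR(*pbstrItem);
//         return S_OK;    // returning E_ABORT would stop the whole enumeration
//     }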
  3254. // Gets all urls from a collection, recursing through frames
  3255. HRESULT CHelperOM::GetCollection(
  3256. IHTMLDocument2 *pDoc,
  3257. CWCStringList *pStringList,
  3258. CollectionType Type,
  3259. PFN_CB pfnCB,
  3260. DWORD_PTR dwCBData)
  3261. {
  3262. HRESULT hr;
  3263. // Get the collection from the document
  3264. ASSERT(pDoc);
  3265. ASSERT(pStringList || pfnCB);
  3266. hr = _GetCollection(pDoc, pStringList, Type, pfnCB, dwCBData);
  3267. return hr;
  3268. }
  3269. // get all urls from a collection
  3270. HRESULT CHelperOM::_GetCollection(
  3271. IHTMLDocument2 *pDoc,
  3272. CWCStringList *pStringList,
  3273. CollectionType Type,
  3274. PFN_CB pfnCB,
  3275. DWORD_PTR dwCBData)
  3276. {
  3277. HRESULT hr;
  3278. IHTMLElementCollection *pCollection=NULL;
  3279. // From IHTMLDocument2 we get IHTMLElementCollection, then enumerate for the urls
  3280. // Get appropriate collection from document
  3281. switch (Type)
  3282. {
  3283. case CTYPE_LINKS:
  3284. hr = pDoc->get_links(&pCollection);
  3285. break;
  3286. case CTYPE_MAPS:
  3287. hr = GetTagCollection(pDoc, L"map", &pCollection);
  3288. break;
  3289. case CTYPE_META:
  3290. hr = GetTagCollection(pDoc, L"meta", &pCollection);
  3291. break;
  3292. case CTYPE_FRAMES:
  3293. hr = GetTagCollection(pDoc, L"frame", &pCollection);
  3294. break;
  3295. default:
  3296. hr = E_FAIL;
  3297. }
  3298. if (!pCollection) hr=E_NOINTERFACE;
  3299. #ifdef DEBUG
3300. if (FAILED(hr)) DBG_WARN("CHelperOM::_GetCollection: failed to get element collection");
  3301. #endif
  3302. if (SUCCEEDED(hr))
  3303. {
  3304. hr = EnumCollection(pCollection, pStringList, Type, pfnCB, dwCBData);
  3305. // If we're getting frames, we need to enum "iframe" tags separately
  3306. if (SUCCEEDED(hr) && (Type == CTYPE_FRAMES))
  3307. {
  3308. SAFERELEASE(pCollection);
  3309. hr = GetTagCollection(pDoc, L"iframe", &pCollection);
  3310. if (SUCCEEDED(hr) && pCollection)
  3311. {
  3312. hr = EnumCollection(pCollection, pStringList, Type, pfnCB, dwCBData);
  3313. }
  3314. }
  3315. }
  3316. if (pCollection)
  3317. pCollection->Release();
  3318. return hr;
  3319. }
  3320. extern HRESULT LoadWithCookie(LPCTSTR, POOEBuf, DWORD *, SUBSCRIPTIONCOOKIE *);
  3321. // IExtractIcon members
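// GetIconLocation: Active Desktop items report the special ":desktop:" location;
// for everything else we bind a CLSID_InternetShortcut object to the subscribed
// URL and delegate to its IExtractIcon, caching that helper in m_pUrlIconHelper
// so Extract() can reuse it.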
  3322. STDMETHODIMP CWebCrawler::GetIconLocation(UINT uFlags, LPTSTR szIconFile, UINT cchMax, int * piIndex, UINT * pwFlags)
  3323. {
  3324. IUniformResourceLocator* pUrl = NULL;
  3325. IExtractIcon* pUrlIcon = NULL;
  3326. HRESULT hr = S_OK;
  3327. BOOL bCalledCoInit = FALSE;
  3328. if (!szIconFile || !piIndex || !pwFlags)
  3329. return E_INVALIDARG;
  3330. //zero out return values in case one of the COM calls fails...
  3331. *szIconFile = 0;
3332. *piIndex = -1; *pwFlags = 0;
  3333. if (!m_pBuf) {
  3334. m_pBuf = (POOEBuf)MemAlloc(LPTR, sizeof(OOEBuf));
  3335. if (!m_pBuf)
  3336. return E_OUTOFMEMORY;
  3337. DWORD dwSize;
  3338. hr = LoadWithCookie(NULL, m_pBuf, &dwSize, &m_SubscriptionCookie);
  3339. RETURN_ON_FAILURE(hr);
  3340. }
  3341. if (m_pBuf->bDesktop)
  3342. {
  3343. StrCpyN(szIconFile, TEXT(":desktop:"), cchMax);
  3344. }
  3345. else
  3346. {
  3347. if (m_pUrlIconHelper)
  3348. {
  3349. hr = m_pUrlIconHelper->GetIconLocation (uFlags, szIconFile, cchMax, piIndex, pwFlags);
  3350. }
  3351. else
  3352. {
  3353. hr = CoCreateInstance (CLSID_InternetShortcut, NULL, CLSCTX_INPROC_SERVER, IID_IUniformResourceLocator, (void**)&pUrl);
  3354. if ((CO_E_NOTINITIALIZED == hr || REGDB_E_IIDNOTREG == hr) &&
  3355. SUCCEEDED (CoInitialize(NULL)))
  3356. {
  3357. bCalledCoInit = TRUE;
  3358. hr = CoCreateInstance (CLSID_InternetShortcut, NULL, CLSCTX_INPROC_SERVER, IID_IUniformResourceLocator, (void**)&pUrl);
  3359. }
  3360. if (SUCCEEDED (hr))
  3361. {
  3362. hr = pUrl->SetURL (m_pBuf->m_URL, 1);
  3363. if (SUCCEEDED (hr))
  3364. {
  3365. hr = pUrl->QueryInterface (IID_IExtractIcon, (void**)&pUrlIcon);
  3366. if (SUCCEEDED (hr))
  3367. {
  3368. hr = pUrlIcon->GetIconLocation (uFlags, szIconFile, cchMax, piIndex, pwFlags);
  3369. //pUrlIcon->Release(); //released in destructor
  3370. ASSERT (m_pUrlIconHelper == NULL);
  3371. m_pUrlIconHelper = pUrlIcon;
  3372. }
  3373. }
  3374. pUrl->Release();
  3375. }
3376. // balance the CoInitialize above with a CoUninitialize
  3377. //(we still have a pointer to the CLSID_InternetShortcut object, m_pUrlIconHelper,
  3378. //but since that code is in shdocvw there's no danger of it getting unloaded and
  3379. //invalidating our pointer, sez cdturner.)
  3380. if (bCalledCoInit)
  3381. CoUninitialize();
  3382. }
  3383. }
  3384. return hr;
  3385. }
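// Extract: desktop items use the shared desktop icon; everything else defers to
// the IExtractIcon helper cached by GetIconLocation above.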
  3386. STDMETHODIMP CWebCrawler::Extract(LPCTSTR szIconFile, UINT nIconIndex, HICON * phiconLarge, HICON * phiconSmall, UINT nIconSize)
  3387. {
  3388. HRESULT hr = S_OK;
  3389. if (!phiconLarge || !phiconSmall)
  3390. return E_INVALIDARG;
  3391. //zero out return values in case one of the COM calls fails...
  3392. *phiconLarge = NULL;
  3393. *phiconSmall = NULL;
  3394. if ((NULL != m_pBuf) && (m_pBuf->bDesktop))
  3395. {
  3396. LoadDefaultIcons();
  3397. *phiconLarge = *phiconSmall = g_desktopIcon;
  3398. }
  3399. else
  3400. {
  3401. if (!m_pUrlIconHelper)
  3402. return E_FAIL;
  3403. hr = m_pUrlIconHelper->Extract (szIconFile, nIconIndex, phiconLarge, phiconSmall, nIconSize);
  3404. }
  3405. return hr;
  3406. }