Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

438 lines
8.2 KiB

  1. /*++
  2. Copyright (c) 1996 Microsoft Corporation
  3. Module Name :
  4. linkpars.cpp
  5. Abstract:
  6. Link parser class implementation. This class is responsible for
  7. parsing an HTML file for hyperlinks.
  8. Author:
  9. Michael Cheuk (mcheuk)
  10. Project:
  11. Link Checker
  12. Revision History:
  13. --*/
  14. #include "stdafx.h"
  15. #include "LinkPars.h"
  16. #include "link.h"
  17. #include "lcmgr.h"
  18. #ifdef _DEBUG
  19. #define new DEBUG_NEW
  20. #undef THIS_FILE
  21. static char THIS_FILE[] = __FILE__;
  22. #endif
  23. // Constants
  24. const CString strLocalHost_c(_T("localhost"));
  25. void
  26. CLinkParser::Parse(
  27. const CString& strData,
  28. const CString& strBaseURL,
  29. CLinkPtrList& rLinkPtrList
  30. )
  31. /*++
  32. Routine Description:
  33. Parse a page of html data
  34. Arguments:
  35. strData - page of html
  36. strBaseURL - base URL
  37. rLinkPtrList - reference to links list. The new links will
  38. will be added to this list.
  39. Return Value:
  40. N/A
  41. --*/
  42. {
  43. // Look for the first '<'
  44. LPCTSTR lpszOpen = _tcschr(strData, _TUCHAR('<'));
  45. while(lpszOpen != NULL)
  46. {
  47. // Look for the '>'
  48. LPCTSTR lpszClose = _tcschr(lpszOpen, _TUCHAR('>'));
  49. if(lpszClose)
  50. {
  51. // The possible tag must be longer than 7 bytes (a href=)
  52. int iCount = (int)(lpszClose - lpszOpen) - 1; // skip the '<'
  53. if( iCount > 7 )
  54. {
  55. int iIndex = lpszOpen - ((LPCTSTR)strData) + 1; // skip the '<'
  56. CString strPossibleURL(strData.Mid(iIndex, iCount));
  57. // Parse the possible tag
  58. if(ParsePossibleTag(strPossibleURL))
  59. {
  60. CString strURL;
  61. BOOL fLocalLink;
  62. // We found a valid tag. Time to create new link.
  63. if( CreateURL(strPossibleURL, strBaseURL, strURL, fLocalLink) )
  64. {
  65. rLinkPtrList.AddLink(strURL, strBaseURL, strPossibleURL, fLocalLink);
  66. }
  67. }
  68. }
  69. }
  70. // Look for the next '<'
  71. lpszOpen = _tcschr(++lpszOpen, _TUCHAR('<'));
  72. }
  73. } // CLinkParser::Parse
  74. BOOL
  75. CLinkParser::ParsePossibleTag(
  76. CString& strTag
  77. )
  78. /*++
  79. Routine Description:
  80. Parse a single "<.....>" for possible hyperlink
  81. Arguments:
  82. strTag - value inside a "<.....>" excluding '<' & '>'
  83. If this is a hyperlink tag, the hyperlink URL
  84. will be put in strTag.
  85. Return Value:
  86. BOOL - TRUE if hyperlink tag. FALSE otherwise.
  87. --*/
  88. {
  89. // Make a working copy
  90. CString strWorkCopy(strTag);
  91. // Let's work with lower case
  92. strWorkCopy.MakeLower();
  93. //
  94. // Check for,
  95. //
  96. // HyperLink:
  97. // <a href="url" ...>
  98. // <a href="url#anchor" ...>
  99. // <a href="#anchor" ...>
  100. //
  101. // CGI
  102. // <a href="url?parameters" ...>
  103. //
  104. // Style Sheet
  105. // <link rel="stylesheet" href="url" ...>
  106. //
  107. if( strWorkCopy[0] == _T('a') ||
  108. strWorkCopy.Find(_T("link")) == 0 )
  109. {
  110. return GetTagValue(strTag, CString(_T("href")));
  111. }
  112. //
  113. // Check for,
  114. //
  115. // <body background="url" ...>
  116. //
  117. // Table:
  118. // <table background="url" ...>
  119. // <th background="url" ...>
  120. // <td background="url" ...>
  121. //
  122. else if( strWorkCopy.Find(_T("body")) == 0 ||
  123. strWorkCopy.Find(_T("table")) == 0 ||
  124. strWorkCopy.Find(_T("th")) == 0 ||
  125. strWorkCopy.Find(_T("td")) == 0 )
  126. {
  127. return GetTagValue(strTag, CString(_T("background")));
  128. }
  129. //
  130. // Check for,
  131. //
  132. // Sound:
  133. // <bgsound src="url" ...>
  134. // <sound src="url" ...>
  135. //
  136. // Frame:
  137. // <frame src="url" ...>
  138. //
  139. // Netscape embeded:
  140. // <embed src="url" ...>
  141. //
  142. // JavaScript & VB Script
  143. // <script src="url" language="java or vbs" ...>
  144. //
  145. else if( strWorkCopy.Find(_T("bgsound")) == 0 ||
  146. strWorkCopy.Find(_T("sound")) == 0 ||
  147. strWorkCopy.Find(_T("frame")) == 0 ||
  148. strWorkCopy.Find(_T("embed")) == 0 ||
  149. strWorkCopy.Find(_T("script")) == 0 )
  150. {
  151. return GetTagValue(strTag, CString(_T("src")));
  152. }
  153. // Check for,
  154. //
  155. // Image:
  156. // <img src="url" ...>
  157. //
  158. // Video:
  159. // <img dynsrc="url">
  160. //
  161. // VRML:
  162. // <img vrml="url">
  163. //
  164. else if( strWorkCopy.Find(_T("img")) == 0 )
  165. {
  166. if(GetTagValue(strTag, CString(_T("src"))))
  167. {
  168. return TRUE;
  169. }
  170. if(GetTagValue(strTag, CString(_T("dynsrc"))))
  171. {
  172. return TRUE;
  173. }
  174. return GetTagValue(strTag, CString(_T("vrml")));
  175. }
  176. // Java
  177. // <applet code="name.class" codebase="url" ...>
  178. else if( strWorkCopy.Find(_T("applet")) == 0 )
  179. {
  180. return GetTagValue(strTag, CString(_T("codebase")));
  181. }
  182. // Form
  183. // <form action="url" ...>
  184. else if( strWorkCopy.Find(_T("form")) == 0 )
  185. {
  186. return GetTagValue(strTag, CString(_T("action")));
  187. }
  188. return FALSE;
  189. } // CLinkParser::ParsePossibleTag
  190. BOOL
  191. CLinkParser::GetTagValue(
  192. CString& strTag,
  193. const CString& strParam
  194. )
  195. /*++
  196. Routine Description:
  197. Get the hyperlink value from "<.....>"
  198. Arguments:
  199. strTag - value inside a "<.....>" excluding '<' & '>'
  200. If this is a hyperlink tag, the hyperlink URL
  201. will be put in strTag.
  202. strParam - parameter to look for. For example, src or href
  203. Return Value:
  204. BOOL - TRUE if hyperlink tag. FALSE otherwise.
  205. --*/
  206. {
  207. // Make a copy of original tag
  208. CString strWorkCopy(strTag);
  209. strWorkCopy.MakeLower();
  210. int iLength = strParam.GetLength();
  211. // Look for the parameter
  212. int iIndex = strWorkCopy.Find(strParam);
  213. if(iIndex == -1)
  214. {
  215. return FALSE;
  216. }
  217. // Remove the parameter from the tag
  218. CString strResult( strTag.Mid(iIndex + iLength) );
  219. // Look for '='
  220. iIndex = strResult.Find(_T("="));
  221. if(iIndex == -1)
  222. {
  223. return FALSE;
  224. }
  225. // Remove the '=' from the tag
  226. strResult = strResult.Mid(iIndex+1);
  227. // Look for the value
  228. int iStart = -1;
  229. int iEnd = -1;
  230. int fPara = FALSE; // is the tag start with "
  231. // Search for the value
  232. for(int i=0; i<strResult.GetLength(); i++)
  233. {
  234. // If we found the starting index of value, look
  235. // for the end of the value
  236. if(iStart!=-1 &&
  237. ( !fPara && strResult[i] == _TCHAR(' ') ||
  238. ( fPara && strResult[i] == _TCHAR('\"') )
  239. )
  240. )
  241. {
  242. iEnd = i;
  243. break;
  244. }
  245. // Look for the starting index of value
  246. if(iStart==-1 && strResult[i] != _TCHAR(' ') && strResult[i] != _TCHAR('\"') )
  247. {
  248. iStart = i;
  249. if(i - 1 >= 0)
  250. {
  251. fPara = (strResult[i-1] == _TCHAR('\"')); // found a "
  252. }
  253. }
  254. }
  255. // Found the starting index
  256. if(iStart != -1)
  257. {
  258. // If we didn't find the end of value, use the
  259. // last character as end
  260. if(iEnd == -1)
  261. {
  262. iEnd = strResult.GetLength();
  263. }
  264. // Copy the value to the input
  265. strTag = strResult.Mid(iStart, (iEnd - iStart));
  266. // Change '\' to '/'
  267. CLinkCheckerMgr::ChangeBackSlash(strTag);
  268. return TRUE;
  269. }
  270. return FALSE;
  271. } // CLinkParser::GetTagValue
  272. BOOL
  273. CLinkParser::CreateURL(
  274. const CString& strRelativeURL,
  275. const CString& strBaseURL,
  276. CString& strURL,
  277. BOOL& fLocalLink
  278. )
  279. /*++
  280. Routine Description:
  281. Create a URL from base URL & relative URL. It also check
  282. the result for local or remote link
  283. Arguments:
  284. strRelativeURL - relative URL
  285. strBaseURL - base URL
  286. strURL - result URL
  287. fLocalLink - will be set to TRUE if this is a local link
  288. Return Value:
  289. BOOL - TRUE if sucess. FALSE otherwise.
  290. --*/
  291. {
  292. ASSERT(CWininet::IsLoaded());
  293. // Remove the anchor from the relative URL
  294. CString strNewRelativeURL(strRelativeURL);
  295. int i = strNewRelativeURL.ReverseFind(_TCHAR('#'));
  296. if(i != -1)
  297. {
  298. strNewRelativeURL = strNewRelativeURL.Left(i);
  299. }
  300. // Combine the URLs
  301. DWORD dwLength = INTERNET_MAX_URL_LENGTH;
  302. LPTSTR lpBuffer = strURL.GetBuffer(dwLength);
  303. CWininet::InternetCombineUrlA(
  304. strBaseURL,
  305. strNewRelativeURL,
  306. lpBuffer,
  307. &dwLength,
  308. ICU_ENCODE_SPACES_ONLY);
  309. strURL.ReleaseBuffer();
  310. // Check for local or remote link
  311. URL_COMPONENTS urlcomp;
  312. memset(&urlcomp, 0, sizeof(urlcomp));
  313. urlcomp.dwStructSize = sizeof(urlcomp);
  314. urlcomp.dwHostNameLength = 1;
  315. VERIFY(CWininet::InternetCrackUrlA(strURL, strURL.GetLength(), NULL, &urlcomp));
  316. // Check for possible local link
  317. if((int)urlcomp.dwHostNameLength == m_strLocalHostName.GetLength() ||
  318. (int)urlcomp.dwHostNameLength == strLocalHost_c.GetLength()) // localhost
  319. {
  320. if( _tcsnccmp( urlcomp.lpszHostName, m_strLocalHostName, m_strLocalHostName.GetLength() ) == 0 ||
  321. _tcsnccmp( urlcomp.lpszHostName, strLocalHost_c, strLocalHost_c.GetLength() ) == 0)
  322. {
  323. fLocalLink = TRUE;
  324. // Local link
  325. if(GetLinkCheckerMgr().GetUserOptions().IsCheckLocalLinks())
  326. {
  327. return TRUE;
  328. }
  329. else
  330. {
  331. return FALSE;
  332. }
  333. }
  334. }
  335. // Remote link
  336. fLocalLink = FALSE;
  337. if(GetLinkCheckerMgr().GetUserOptions().IsCheckRemoteLinks())
  338. {
  339. return TRUE;
  340. }
  341. else
  342. {
  343. return FALSE;
  344. }
  345. } // CLinkParser::CreateURL