Copyright (c) 1996 Microsoft Corporation
Module Name :
Link parser class implementation. This class is responsible for parsing an HTML file for hyperlinks.
Michael Cheuk (mcheuk)
Link Checker
Revision History:
#include "stdafx.h"
#include "LinkPars.h"
#include "link.h"
#include "lcmgr.h"
// Debug builds only: replace 'new' with DEBUG_NEW and give this translation
// unit its own THIS_FILE string. The #endif must sit on its own line to be
// recognised as a preprocessor directive.
#ifdef _DEBUG
#define new DEBUG_NEW
#undef THIS_FILE
static char THIS_FILE[] = __FILE__;
#endif
// Constants
// Host name literal "localhost". CLinkParser::CreateURL compares the host
// part of each combined URL against it (and against m_strLocalHostName) to
// decide whether a link is local.
const CString strLocalHost_c(_T("localhost"));
void CLinkParser::Parse( const CString& strData, const CString& strBaseURL, CLinkPtrList& rLinkPtrList ) /*++
Routine Description:
Parse a page of html data
strData - page of html strBaseURL - base URL rLinkPtrList - reference to links list. The new links will will be added to this list.
Return Value:
--*/ { // Look for the first '<'
LPCTSTR lpszOpen = _tcschr(strData, _TUCHAR('<'));
while(lpszOpen != NULL) { // Look for the '>'
LPCTSTR lpszClose = _tcschr(lpszOpen, _TUCHAR('>')); if(lpszClose) { // The possible tag must be longer than 7 bytes (a href=)
int iCount = (int)(lpszClose - lpszOpen) - 1; // skip the '<'
if( iCount > 7 ) { int iIndex = lpszOpen - ((LPCTSTR)strData) + 1; // skip the '<'
CString strPossibleURL(strData.Mid(iIndex, iCount));
// Parse the possible tag
if(ParsePossibleTag(strPossibleURL)) { CString strURL; BOOL fLocalLink;
// We found a valid tag. Time to create new link.
if( CreateURL(strPossibleURL, strBaseURL, strURL, fLocalLink) ) { rLinkPtrList.AddLink(strURL, strBaseURL, strPossibleURL, fLocalLink); } } } }
// Look for the next '<'
lpszOpen = _tcschr(++lpszOpen, _TUCHAR('<')); }
} // CLinkParser::Parse
BOOL 
CLinkParser::ParsePossibleTag(
    CString& strTag
    )
/*++

Routine Description:

    Parse a single "<.....>" for possible hyperlink. The element name
    (text up to the first white space) is compared case-insensitively
    against the elements known to carry a URL, and the matching attribute
    is then extracted with GetTagValue.

Arguments:

    strTag - value inside a "<.....>" excluding '<' & '>'
             If this is a hyperlink tag, the hyperlink URL will be put 
             in strTag.

Return Value:

    BOOL - TRUE if hyperlink tag. FALSE otherwise.

--*/
{
    // Isolate the element name and work with lower case, so that
    // <A HREF=...> and <a href=...> are treated alike.
    CString strName(strTag.SpanExcluding(_T(" \t\r\n")));
    strName.MakeLower();

    if(strName.IsEmpty())
    {
        return FALSE;
    }

    // Check for,
    //      HyperLink:
    //          <a href="url" ...>
    //          <a href="url#anchor" ...>
    //          <a href="#anchor" ...>
    //      CGI
    //          <a href="url?parameters" ...>
    //      Image map
    //          <area href="url" ...>
    //      Style Sheet
    //          <link rel="stylesheet" href="url" ...>
    //
    // The name is compared exactly. Testing only the first character for
    // 'a' would also capture <applet ...> and make the codebase check
    // below unreachable. <area> is listed explicitly because the former
    // first-character test accepted it.
    if( strName == _T("a") || 
        strName == _T("area") || 
        strName == _T("link") )
    {
        return GetTagValue(strTag, CString(_T("href")));
    }

    // Check for,
    //      <body background="url" ...>
    //      Table:
    //          <table background="url" ...>
    //          <th background="url" ...>
    //          <td background="url" ...>
    if( strName == _T("body") || 
        strName == _T("table") || 
        strName == _T("th") || 
        strName == _T("td") )
    {
        return GetTagValue(strTag, CString(_T("background")));
    }

    // Check for,
    //      Sound:
    //          <bgsound src="url" ...>
    //          <sound src="url" ...>
    //      Frame:
    //          <frame src="url" ...>
    //      Netscape embedded:
    //          <embed src="url" ...>
    //      JavaScript & VB Script
    //          <script src="url" language="java or vbs" ...>
    if( strName == _T("bgsound") || 
        strName == _T("sound") || 
        strName == _T("frame") || 
        strName == _T("embed") || 
        strName == _T("script") )
    {
        return GetTagValue(strTag, CString(_T("src")));
    }

    // Check for,
    //      Image:
    //          <img src="url" ...>
    //      Video:
    //          <img dynsrc="url">
    //      VRML:
    //          <img vrml="url">
    // Only the first attribute found, in this order, is reported.
    if( strName == _T("img") )
    {
        if(GetTagValue(strTag, CString(_T("src"))))
        {
            return TRUE;
        }

        if(GetTagValue(strTag, CString(_T("dynsrc"))))
        {
            return TRUE;
        }

        return GetTagValue(strTag, CString(_T("vrml")));
    }

    // Java
    //      <applet code="name.class" codebase="url" ...>
    if( strName == _T("applet") )
    {
        return GetTagValue(strTag, CString(_T("codebase")));
    }

    // Form
    //      <form action="url" ...>
    if( strName == _T("form") )
    {
        return GetTagValue(strTag, CString(_T("action")));
    }

    return FALSE;

} // CLinkParser::ParsePossibleTag
// TRUE if ch separates tokens inside a tag (space, tab, CR, LF, FF)
static BOOL 
IsTagWhiteSpace(
    _TCHAR ch
    )
{
    return ch == _TCHAR(' ')  || ch == _TCHAR('\t') || 
           ch == _TCHAR('\r') || ch == _TCHAR('\n') || 
           ch == _TCHAR('\f');
}

// TRUE if the (lower case) character can be part of an attribute name
static BOOL 
IsAttrNameChar(
    _TCHAR ch
    )
{
    return (ch >= _TCHAR('a') && ch <= _TCHAR('z')) || 
           (ch >= _TCHAR('0') && ch <= _TCHAR('9')) || 
           ch == _TCHAR('-') || ch == _TCHAR('_') || ch == _TCHAR(':');
}

BOOL 
CLinkParser::GetTagValue(
    CString& strTag, 
    const CString& strParam
    )
/*++

Routine Description:

    Get the hyperlink value from "<.....>".

    The parameter is matched case-insensitively and only where it starts
    an attribute name (so "src" does not match inside "lowsrc") and is
    followed, after optional white space, by '='. The value may be
    enclosed in double or single quotes, in which case it runs to the
    matching quote; otherwise it runs to the next white space. A missing
    closing quote ends the value at the end of the tag. Surrounding white
    space is trimmed from the value.

Arguments:

    strTag   - value inside a "<.....>" excluding '<' & '>'
               If this is a hyperlink tag, the hyperlink URL will be put 
               in strTag. It is left untouched when FALSE is returned.
    strParam - parameter to look for, in lower case. For example, src 
               or href

Return Value:

    BOOL - TRUE if hyperlink tag. FALSE otherwise (parameter missing,
           no '=' after it, or empty value).

--*/
{
    const int iParamLength = strParam.GetLength();
    if(iParamLength == 0)
    {
        return FALSE;
    }

    // Search a lower case copy; the value itself is cut from the original
    // so its case is preserved. Both strings share the same indexes.
    CString strWorkCopy(strTag);
    strWorkCopy.MakeLower();

    const int iTagLength = strTag.GetLength();
    int iSearchFrom = 0;

    while(iSearchFrom < iTagLength)
    {
        // Look for the next occurrence of the parameter
        int iFound = strWorkCopy.Mid(iSearchFrom).Find(strParam);
        if(iFound == -1)
        {
            return FALSE;
        }

        const int iName = iSearchFrom + iFound;
        iSearchFrom = iName + 1;

        // The occurrence must start an attribute name, not continue one
        if(iName > 0 && IsAttrNameChar(strWorkCopy[iName - 1]))
        {
            continue;
        }

        // Look for '=' right after the name (white space allowed)
        int i = iName + iParamLength;
        while(i < iTagLength && IsTagWhiteSpace(strTag[i]))
        {
            i++;
        }

        if(i >= iTagLength || strTag[i] != _TCHAR('='))
        {
            continue;
        }

        // Skip the '=' and any white space in front of the value
        i++;
        while(i < iTagLength && IsTagWhiteSpace(strTag[i]))
        {
            i++;
        }

        if(i >= iTagLength)
        {
            // "param=" at the very end of the tag: no value
            return FALSE;
        }

        // Find the extent of the value
        int iStart;
        int iEnd;
        const _TCHAR chFirst = strTag[i];

        if(chFirst == _TCHAR('\"') || chFirst == _TCHAR('\''))
        {
            // Quoted: run to the matching quote
            iStart = i + 1;
            iEnd = iStart;
            while(iEnd < iTagLength && strTag[iEnd] != chFirst)
            {
                iEnd++;
            }
        }
        else
        {
            // Unquoted: run to the next white space
            iStart = i;
            iEnd = iStart;
            while(iEnd < iTagLength && !IsTagWhiteSpace(strTag[iEnd]))
            {
                iEnd++;
            }
        }

        CString strValue(strTag.Mid(iStart, iEnd - iStart));
        strValue.TrimLeft();
        strValue.TrimRight();

        if(strValue.IsEmpty())
        {
            // e.g. href="" - do not mistake the next attribute for it
            return FALSE;
        }

        // Copy the value to the input
        // NOTE(review): an earlier comment here mentioned changing '\' to
        // '/', but no such conversion is performed in this function -
        // confirm whether the URL combine step is expected to handle it.
        strTag = strValue;

        return TRUE;
    }

    return FALSE;

} // CLinkParser::GetTagValue
// TRUE if the host name cracked out of a URL (lpszHost is NOT null
// terminated; dwHostLength characters are valid) is exactly strName,
// ignoring case.
static BOOL 
IsSameHostName(
    LPCTSTR lpszHost, 
    DWORD dwHostLength, 
    const CString& strName
    )
{
    if((int)dwHostLength != strName.GetLength())
    {
        return FALSE;
    }

    if(dwHostLength == 0)
    {
        // Both are empty
        return TRUE;
    }

    if(lpszHost == NULL)
    {
        return FALSE;
    }

    return _tcsnicmp(lpszHost, strName, dwHostLength) == 0;
}

BOOL 
CLinkParser::CreateURL(
    const CString& strRelativeURL, 
    const CString& strBaseURL, 
    CString& strURL, 
    BOOL& fLocalLink
    )
/*++

Routine Description:

    Create a URL from base URL & relative URL. It also checks the result
    for local or remote link: a link is local when its host name equals
    m_strLocalHostName or "localhost" (case-insensitive, whole name).

Arguments:

    strRelativeURL - relative URL (any "#anchor" suffix is ignored)
    strBaseURL     - base URL
    strURL         - result URL
    fLocalLink     - will be set to TRUE if this is a local link,
                     FALSE otherwise

Return Value:

    BOOL - TRUE if success and the user options ask for this kind of 
           link (local / remote) to be checked. FALSE otherwise.

--*/
{
    ASSERT(CWininet::IsLoaded());

    fLocalLink = FALSE;

    // Remove the anchor from the relative URL
    CString strNewRelativeURL(strRelativeURL);
    int i = strNewRelativeURL.ReverseFind(_TCHAR('#'));
    if(i != -1)
    {
        strNewRelativeURL = strNewRelativeURL.Left(i);
    }

    // Combine the URLs
    DWORD dwLength = INTERNET_MAX_URL_LENGTH;
    LPTSTR lpBuffer = strURL.GetBuffer(dwLength);

    BOOL fCombined = CWininet::InternetCombineUrlA(
        strBaseURL, 
        strNewRelativeURL, 
        lpBuffer, 
        &dwLength, 
        ICU_ENCODE_SPACES_ONLY);

    if(!fCombined)
    {
        // Do not leave undefined buffer content in the result
        lpBuffer[0] = _TCHAR('\0');
    }

    // The buffer was written directly; ReleaseBuffer lets the CString pick
    // up its new length before any other member function is used.
    strURL.ReleaseBuffer();

    if(!fCombined)
    {
        return FALSE;
    }

    // Check for local or remote link
    URL_COMPONENTS urlcomp;
    memset(&urlcomp, 0, sizeof(urlcomp));
    urlcomp.dwStructSize = sizeof(urlcomp);

    // A non-zero length with a NULL pointer asks InternetCrackUrl to return
    // a pointer into strURL (not null terminated) plus the real length.
    urlcomp.dwHostNameLength = 1;

    BOOL fCracked = CWininet::InternetCrackUrlA(
        strURL, 
        strURL.GetLength(), 
        0, 
        &urlcomp);
    ASSERT(fCracked);

    // Check for possible local link. If the URL could not be cracked the
    // host name is not valid and the link is handled as a remote one.
    if( fCracked && 
        ( IsSameHostName(urlcomp.lpszHostName, urlcomp.dwHostNameLength, m_strLocalHostName) || 
          IsSameHostName(urlcomp.lpszHostName, urlcomp.dwHostNameLength, strLocalHost_c) ) ) // localhost
    {
        // Local link
        fLocalLink = TRUE;
        return GetLinkCheckerMgr().GetUserOptions().IsCheckLocalLinks() ? TRUE : FALSE;
    }

    // Remote link
    fLocalLink = FALSE;
    return GetLinkCheckerMgr().GetUserOptions().IsCheckRemoteLinks() ? TRUE : FALSE;

} // CLinkParser::CreateURL