/*++

   Copyright    (c)    1996    Microsoft Corporation

   Module  Name :

        linkpars.cpp

   Abstract:

        Link parser class implementation. This class is responsible for 
		parsing an html file for hyperlinks.

   Author:

        Michael Cheuk (mcheuk)

   Project:

        Link Checker

   Revision History:

--*/

#include "stdafx.h"
#include "LinkPars.h"

#include "link.h"
#include "lcmgr.h"

#ifdef _DEBUG
#define new DEBUG_NEW
#undef THIS_FILE
static char THIS_FILE[] = __FILE__;
#endif

// Constants

// Host name that CreateURL() treats as the local machine, in addition to
// m_strLocalHostName, when deciding whether a link is local or remote.
const CString strLocalHost_c(_T("localhost"));

void 
CLinkParser::Parse(
	const CString& strData, 
	const CString& strBaseURL, 
	CLinkPtrList& rLinkPtrList
	)
/*++

Routine Description:

    Parse a page of html data

Arguments:

    strData - page of html
	strBaseURL - base URL
	rLinkPtrList - reference to links list. The new links will
				   will be added to this list.

Return Value:

    N/A

--*/
{
	// Look for the first '<'
	LPCTSTR lpszOpen = _tcschr(strData, _TUCHAR('<'));

	while(lpszOpen != NULL)
	{
		// Look for the '>'
		LPCTSTR lpszClose = _tcschr(lpszOpen, _TUCHAR('>'));
		if(lpszClose)
		{
			// The possible tag must be longer than 7 bytes (a href=)
			int iCount = (int)(lpszClose - lpszOpen) - 1; // skip the '<'
			if( iCount  > 7 )
			{
				int iIndex = lpszOpen - ((LPCTSTR)strData) + 1; // skip the '<'

				CString strPossibleURL(strData.Mid(iIndex, iCount));

				// Parse the possible tag
				if(ParsePossibleTag(strPossibleURL))
				{
					CString strURL;
					BOOL fLocalLink;

					// We found a valid tag. Time to create new link.
					if( CreateURL(strPossibleURL, strBaseURL, strURL, fLocalLink) )
					{
						rLinkPtrList.AddLink(strURL, strBaseURL, strPossibleURL, fLocalLink);
					}
				}
			}
		}

		// Look for the next '<'
		lpszOpen = _tcschr(++lpszOpen, _TUCHAR('<'));
	}

} // CLinkParser::Parse


BOOL 
CLinkParser::ParsePossibleTag(
	CString& strTag
	)
/*++

Routine Description:

    Parse a single "<.....>" for possible hyperlink. The tag is
    classified by the (case-insensitive) prefix of its text, and the
    attribute that carries the link for that kind of tag is then
    extracted with GetTagValue().

Arguments:

    strTag - value inside a "<.....>" excluding '<' & '>'
			 If this is a hyperlink tag, the hyperlink URL
			 will be put in strTag.

Return Value:

    BOOL - TRUE if hyperlink tag. FALSE otherwise.

--*/
{
	// Nothing to classify; also keeps the [0] access below in range
	if(strTag.IsEmpty())
	{
		return FALSE;
	}

	// Make a working copy
	CString strWorkCopy(strTag);

	// Let's work with lower case
	strWorkCopy.MakeLower();

	//
	// Check for,
	//
	// Java
	// <applet code="name.class" codebase="url" ...>
	//
	// This test must come before the 'a' test below: "applet" starts
	// with 'a', so it would otherwise be searched for href and its
	// codebase would never be looked at.
	//
	if( strWorkCopy.Find(_T("applet")) == 0 )
	{
		return GetTagValue(strTag, CString(_T("codebase")));
	}

	//
	// Check for,
	//
	// HyperLink:
	// <a href="url" ...>
	// <a href="url#anchor" ...>
	// <a href="#anchor" ...>
	//
	// CGI
	// <a href="url?parameters" ...>
	//
	// Style Sheet
	// <link rel="stylesheet" href="url" ...>
	//
	// Note that every remaining tag starting with 'a' takes this branch.
	//
	if( strWorkCopy[0] == _T('a') ||
		strWorkCopy.Find(_T("link")) == 0 )
	{
		return GetTagValue(strTag, CString(_T("href")));
	}

	//
	// Check for,
	//
	// <body background="url" ...>
	//
	// Table:
	// <table background="url" ...>
	// <th background="url" ...>
	// <td background="url" ...>
	//
	if( strWorkCopy.Find(_T("body")) == 0 ||
		strWorkCopy.Find(_T("table")) == 0 ||
		strWorkCopy.Find(_T("th")) == 0 ||
		strWorkCopy.Find(_T("td")) == 0 )
	{
		return GetTagValue(strTag, CString(_T("background")));
	}

	//
	// Check for,
	//
	// Sound:
	// <bgsound src="url" ...>
	// <sound src="url" ...>
	//
	// Frame:
	// <frame src="url" ...>
	//
	// Netscape embedded:
	// <embed src="url" ...>
	//
	// JavaScript & VB Script
	// <script src="url" language="java or vbs" ...>
	//
	if( strWorkCopy.Find(_T("bgsound")) == 0 ||
		strWorkCopy.Find(_T("sound")) == 0 ||
		strWorkCopy.Find(_T("frame")) == 0 ||
		strWorkCopy.Find(_T("embed")) == 0 ||
		strWorkCopy.Find(_T("script")) == 0 )
	{
		return GetTagValue(strTag, CString(_T("src")));
	}

	//
	// Check for,
	//
	// Image:
	// <img src="url" ...>
	//
	// Video:
	// <img dynsrc="url">
	// 
	// VRML:
	// <img vrml="url">
	//
	// The attributes are tried in this order and the first one present
	// wins.
	//
	if( strWorkCopy.Find(_T("img")) == 0 )
	{
		if(GetTagValue(strTag, CString(_T("src"))))
		{
			return TRUE;
		}

		if(GetTagValue(strTag, CString(_T("dynsrc"))))
		{
			return TRUE;
		}

		return GetTagValue(strTag, CString(_T("vrml")));
	}

	// Form
	// <form action="url" ...>
	if( strWorkCopy.Find(_T("form")) == 0 )
	{
		return GetTagValue(strTag, CString(_T("action")));
	}

	// Not a tag that carries a link
	return FALSE;

} // CLinkParser::ParsePossibleTag


static BOOL
IsTagWhitespace(
	TCHAR ch
	)
/*++

Routine Description:

    Tell whether a character separates tokens inside a "<.....>".

Arguments:

    ch - character to test

Return Value:

    BOOL - TRUE for space, tab, carriage return, line feed and
		   form feed. FALSE otherwise.

--*/
{
	return ch == _T(' ')  || ch == _T('\t') ||
		   ch == _T('\r') || ch == _T('\n') || ch == _T('\f');

} // IsTagWhitespace


BOOL 
CLinkParser::GetTagValue(
	CString& strTag, 
	const CString& strParam
	)
/*++

Routine Description:

    Get the hyperlink value from "<.....>"

    The tag text is walked attribute by attribute (name, optional '=',
    optional value) so that strParam is only matched against a whole
    attribute name. It is never matched against part of a longer name
    (href vs. hreflang) or against text inside another attribute's
    value. Values may be enclosed in double quotes, single quotes, or be
    unquoted; an unquoted value ends at the first whitespace.

Arguments:

    strTag - value inside a "<.....>" excluding '<' & '>'
			 If this is a hyperlink tag, the hyperlink URL
			 will be put in strTag. It is left untouched when
			 FALSE is returned.

	strParam - parameter to look for. For example, src or href.
			   It is compared against the lower-cased tag text, so
			   it must be given in lower case.

Return Value:

    BOOL - TRUE if the parameter is present with a non-empty value.
		   FALSE otherwise.

--*/
{
	const int iParamLength = strParam.GetLength();
	if(iParamLength == 0)
	{
		return FALSE;
	}

	// Names are matched on a lower-case copy. The value itself is cut
	// from the original text with the same indexes, so its case is kept.
	CString strLower(strTag);
	strLower.MakeLower();

	LPCTSTR lpsz = strLower;
	const int iLength = strLower.GetLength();
	int i = 0;

	// Skip the element name (a, img, ...)
	while(i < iLength && !IsTagWhitespace(lpsz[i]))
	{
		i++;
	}

	// Each pass consumes one attribute
	while(i < iLength)
	{
		// Skip the whitespace in front of the attribute name
		while(i < iLength && IsTagWhitespace(lpsz[i]))
		{
			i++;
		}

		// Attribute name runs up to whitespace or '='
		const int iNameStart = i;
		while(i < iLength && !IsTagWhitespace(lpsz[i]) && lpsz[i] != _T('='))
		{
			i++;
		}
		const int iNameLength = i - iNameStart;

		// Whitespace is allowed on both sides of the '='
		while(i < iLength && IsTagWhitespace(lpsz[i]))
		{
			i++;
		}

		if(i >= iLength || lpsz[i] != _T('='))
		{
			// Attribute without a value; whatever we stopped at is the
			// start of the next attribute name
			continue;
		}

		i++; // skip the '='

		while(i < iLength && IsTagWhitespace(lpsz[i]))
		{
			i++;
		}

		// Locate the value
		int iStart;
		int iEnd;

		if(i < iLength && (lpsz[i] == _T('\"') || lpsz[i] == _T('\'')))
		{
			// Quoted value: everything up to the matching quote. If the
			// closing quote is missing, use the rest of the tag.
			const TCHAR chQuote = lpsz[i];

			iStart = ++i;
			while(i < iLength && lpsz[i] != chQuote)
			{
				i++;
			}
			iEnd = i;

			if(i < iLength)
			{
				i++; // skip the closing quote
			}
		}
		else
		{
			// Unquoted value: everything up to the next whitespace
			iStart = i;
			while(i < iLength && !IsTagWhitespace(lpsz[i]))
			{
				i++;
			}
			iEnd = i;
		}

		// Is this the attribute we are looking for?
		if(iNameLength != iParamLength ||
		   strLower.Mid(iNameStart, iNameLength) != strParam)
		{
			continue;
		}

		// Drop whitespace around the value (only possible when quoted)
		while(iStart < iEnd && IsTagWhitespace(lpsz[iStart]))
		{
			iStart++;
		}
		while(iEnd > iStart && IsTagWhitespace(lpsz[iEnd - 1]))
		{
			iEnd--;
		}

		if(iStart == iEnd)
		{
			// Present but empty, e.g. href=""
			return FALSE;
		}

		// Copy the value to the input
		strTag = strTag.Mid(iStart, (iEnd - iStart));

		// Change '\' to '/'
		CLinkCheckerMgr::ChangeBackSlash(strTag);

		return TRUE;
	}

	return FALSE;

} // CLinkParser::GetTagValue


static BOOL
IsSameHostName(
	LPCTSTR lpszHost,
	DWORD dwHostLength,
	const CString& strName
	)
/*++

Routine Description:

    Compare a host name that is given as pointer + length (not
    null terminated) with a host name string. Host names are
    compared without regard to case.

Arguments:

    lpszHost - first character of the host name, may be NULL
	dwHostLength - number of characters in the host name
	strName - name to compare with

Return Value:

    BOOL - TRUE if both names have the same length and the same
		   characters ignoring case. FALSE otherwise.

--*/
{
	return lpszHost != NULL &&
		   (int)dwHostLength == strName.GetLength() &&
		   _tcsnicmp(lpszHost, strName, dwHostLength) == 0;

} // IsSameHostName


BOOL 
CLinkParser::CreateURL(
	const CString& strRelativeURL,		
	const CString& strBaseURL, 
	CString& strURL, 
	BOOL& fLocalLink
	)
/*++

Routine Description:

    Create a URL from base URL & relative URL. It also checks 
	the result for local or remote link. A link is local when the
	host of the combined URL is m_strLocalHostName or "localhost".

Arguments:

	strRelativeURL - relative URL. Anything from the last '#' on
					 (the anchor) is ignored.
	strBaseURL - base URL
	strURL - result URL. Empty if the URLs cannot be combined.
	fLocalLink - will be set to TRUE if this is a local link,
				 FALSE otherwise

Return Value:

    BOOL - TRUE if the URL was created and the user options ask for
		   this kind of link (local or remote) to be checked.
		   FALSE otherwise.

--*/
{
	ASSERT(CWininet::IsLoaded());

	fLocalLink = FALSE;

	// Remove the anchor from the relative URL
	CString strNewRelativeURL(strRelativeURL);
	int iAnchor = strNewRelativeURL.ReverseFind(_TCHAR('#'));
	if(iAnchor != -1)
	{
		strNewRelativeURL = strNewRelativeURL.Left(iAnchor);
	}

	// Combine the URLs
	DWORD dwLength = INTERNET_MAX_URL_LENGTH;
	LPTSTR lpBuffer = strURL.GetBuffer(dwLength);

	BOOL fCombined = CWininet::InternetCombineUrlA(
		strBaseURL,
		strNewRelativeURL,
		lpBuffer,
		&dwLength, 
		ICU_ENCODE_SPACES_ONLY);

	if(!fCombined)
	{
		// The buffer content is undefined on failure. Do not let
		// ReleaseBuffer() measure it; hand back an empty string instead.
		strURL.ReleaseBuffer(0);
		return FALSE;
	}

	strURL.ReleaseBuffer();

	// Check for local or remote link
	URL_COMPONENTS urlcomp;

	memset(&urlcomp, 0, sizeof(urlcomp));
	urlcomp.dwStructSize = sizeof(urlcomp);

	// Non-zero length with a NULL pointer asks for the host name to be
	// returned as a pointer into strURL instead of being copied
	urlcomp.dwHostNameLength = 1;

	// The URL comes from page content, so a URL that cannot be cracked is
	// ordinary bad input. In that case there is no host to inspect and
	// the link is handled as a remote one.
	BOOL fCracked = CWininet::InternetCrackUrlA(strURL, strURL.GetLength(), 0, &urlcomp);

	// Each candidate name is checked for both length and content, so a
	// host that merely starts with the local name is not taken for it
	if(fCracked &&
	   ( IsSameHostName(urlcomp.lpszHostName, urlcomp.dwHostNameLength, m_strLocalHostName) ||
		 IsSameHostName(urlcomp.lpszHostName, urlcomp.dwHostNameLength, strLocalHost_c) ))
	{
		// Local link
		fLocalLink = TRUE;
		return GetLinkCheckerMgr().GetUserOptions().IsCheckLocalLinks() ? TRUE : FALSE;
	}

	// Remote link
	return GetLinkCheckerMgr().GetUserOptions().IsCheckRemoteLinks() ? TRUE : FALSE;

} // CLinkParser::CreateURL