|
|
/*++
Copyright (c) 1995 Microsoft Corporation
Module Name:
parseurl.cxx
Abstract:
Contains functions to parse the basic URLs - FTP, Gopher, HTTP.
An URL parser simply acts as a macro: it must break out the protocol-specific information from the URL and initiate opening the identified resource: all this can be accomplished by calling the relevant Internet protocol APIs.
Code in this module is based on RFC1738
Contents: IsValidUrl DoesSchemeRequireSlashes ParseUrl CrackUrl EncodeUrlPath (HexCharToNumber) (NumberToHexChar) DecodeUrl DecodeUrlInSitu DecodeUrlStringInSitu GetUrlAddressInfo GetUrlAddress MapUrlSchemeName MapUrlScheme MapUrlSchemeToName
Author:
Richard L Firth (rfirth) 26-Apr-1995
Environment:
Win32(s) user-mode DLL
Revision History:
26-Apr-1995 Created
--*/
#include <wininetp.h>
//
// private manifests
//
// RESERVED is an alias for SAFE: SafetyList tags reserved characters with
// their own name, but they carry exactly the same flag value as safe ones
#define RESERVED SAFE
//
// private macros
//
// (retired macro form of the hex-digit conversion; see HexCharToNumber())
//#define HEX_CHAR_TO_NUMBER(ch) \
//    ((ch <= '9') \
//        ? (ch - '0') \
//        : ((ch >= 'a') \
//            ? ((ch - 'a') + 10) \
//            : ((ch - 'A') + 10)))
//
// NUMBER_TO_HEX_CHAR - maps a value in the range 0..15 to its upper-case
// hexadecimal digit ('0'..'9', 'A'..'F'). N.B. the argument is evaluated twice
//
#define NUMBER_TO_HEX_CHAR(n) \
    ((char)(n) + (((n) <= 9) ? '0' : ('A' - 10)))
//
// IS_UNSAFE_URL_CHARACTER - non-zero if Char has to be escaped for the
// scheme(s) named by Scheme. Every character outside 0x21..0x7e is unsafe;
// inside that range the SafetyList entry decides: a character is unsafe if its
// entry has the UNSAFE bit or any of the Scheme bits set.
//
// N.B. Char is evaluated more than once - do not pass an expression with side
// effects
//
#define IS_UNSAFE_URL_CHARACTER(Char, Scheme) \
    (((UCHAR)(Char) <= 0x20) || ((UCHAR)(Char) >= 0x7f) \
    || (SafetyList[(UCHAR)(Char) - 0x21] & (UNSAFE | (Scheme))))
//
// IS_UNSAFE_URL_WIDECHARACTER - wide-character form of the unsafe-character
// test. Every WCHAR outside 0x0021..0x007e is unsafe; inside that range the
// SafetyList entry decides: a character is unsafe if its entry has the UNSAFE
// bit or any of the Scheme bits set.
//
// N.B. wChar is evaluated more than once - do not pass an expression with side
// effects
//
#define IS_UNSAFE_URL_WIDECHARACTER(wChar, Scheme) \
    (((WCHAR)(wChar) <= 0x0020) || ((WCHAR)(wChar) >= 0x007f) \
    || (SafetyList[(WCHAR)(wChar) - 0x0021] & (UNSAFE | (Scheme))))
//
// private types
//
//
// private prototypes
//
// converts one hexadecimal digit character to its numeric value
PRIVATE char HexCharToNumber( IN char ch );
// converts a number in the range 0..15 to its hexadecimal digit character
PRIVATE char NumberToHexChar( IN int Number );
//
// private data
//
//
// SafetyList - the list of characters above 0x20 and below 0x7f that are
// classified as safe, unsafe or scheme-specific. Safe characters do not need
// to be escaped for any URL scheme. Unsafe characters must be escaped for all
// URL schemes. Scheme-specific characters need only be escaped for the relevant
// scheme(s)
//
// One entry per character from 0x21 to 0x7e; entry N describes character
// (0x21 + N), which is how the IS_UNSAFE_URL_xxx macros index the table
const PRIVATE UCHAR SafetyList[] = {
//
// UNSAFE: 0x00..0x20
//
SAFE | HOSTNAME, // 0x21 (!)
UNSAFE, // 0x22 (")
UNSAFE, // 0x23 (#)
SAFE | HOSTNAME, // 0x24 ($)
UNSAFE, // 0x25 (%)
RESERVED | HOSTNAME, // 0x26 (&)
SAFE | HOSTNAME, // 0x27 (')
SAFE | HOSTNAME, // 0x28 (()
SAFE | HOSTNAME, // 0x29 ())
SAFE | HOSTNAME, // 0x2A (*)
SCHEME_GOPHER | HOSTNAME, // 0x2B (+)
SAFE | HOSTNAME, // 0x2C (,)
SAFE, // 0x2D (-)
SAFE, // 0x2E (.)
RESERVED | HOSTNAME, // 0x2F (/)
SAFE, // 0x30 (0)
SAFE, // 0x31 (1)
SAFE, // 0x32 (2)
SAFE, // 0x33 (3)
SAFE, // 0x34 (4)
SAFE, // 0x35 (5)
SAFE, // 0x36 (6)
SAFE, // 0x37 (7)
SAFE, // 0x38 (8)
SAFE, // 0x39 (9)
RESERVED | HOSTNAME, // 0x3A (:)
RESERVED | HOSTNAME, // 0x3B (;)
UNSAFE, // 0x3C (<)
RESERVED | HOSTNAME, // 0x3D (=)
UNSAFE, // 0x3E (>)
RESERVED | SCHEME_GOPHER | HOSTNAME, // 0x3F (?)
RESERVED | HOSTNAME, // 0x40 (@)
SAFE, // 0x41 (A)
SAFE, // 0x42 (B)
SAFE, // 0x43 (C)
SAFE, // 0x44 (D)
SAFE, // 0x45 (E)
SAFE, // 0x46 (F)
SAFE, // 0x47 (G)
SAFE, // 0x48 (H)
SAFE, // 0x49 (I)
SAFE, // 0x4A (J)
SAFE, // 0x4B (K)
SAFE, // 0x4C (L)
SAFE, // 0x4D (M)
SAFE, // 0x4E (N)
SAFE, // 0x4F (O)
SAFE, // 0x50 (P)
SAFE, // 0x51 (Q)
SAFE, // 0x52 (R)
SAFE, // 0x53 (S)
SAFE, // 0x54 (T)
SAFE, // 0x55 (U)
SAFE, // 0x56 (V)
SAFE, // 0x57 (W)
SAFE, // 0x58 (X)
SAFE, // 0x59 (Y)
SAFE, // 0x5A (Z)
UNSAFE, // 0x5B ([)
UNSAFE, // 0x5C (\)
UNSAFE, // 0x5D (])
UNSAFE, // 0x5E (^)
SAFE, // 0x5F (_)
UNSAFE, // 0x60 (`)
SAFE, // 0x61 (a)
SAFE, // 0x62 (b)
SAFE, // 0x63 (c)
SAFE, // 0x64 (d)
SAFE, // 0x65 (e)
SAFE, // 0x66 (f)
SAFE, // 0x67 (g)
SAFE, // 0x68 (h)
SAFE, // 0x69 (i)
SAFE, // 0x6A (j)
SAFE, // 0x6B (k)
SAFE, // 0x6C (l)
SAFE, // 0x6D (m)
SAFE, // 0x6E (n)
SAFE, // 0x6F (o)
SAFE, // 0x70 (p)
SAFE, // 0x71 (q)
SAFE, // 0x72 (r)
SAFE, // 0x73 (s)
SAFE, // 0x74 (t)
SAFE, // 0x75 (u)
SAFE, // 0x76 (v)
SAFE, // 0x77 (w)
SAFE, // 0x78 (x)
SAFE, // 0x79 (y)
SAFE, // 0x7A (z)
UNSAFE, // 0x7B ({)
UNSAFE, // 0x7C (|)
UNSAFE, // 0x7D (})
UNSAFE // 0x7E (~)
//
// UNSAFE: 0x7F..0xFF
//
};
//
// UrlSchemeList - the list of schemes that we support
//
//
// URL_SCHEME_INFO - describes one entry in the table of supported schemes
//
typedef struct {
    LPSTR SchemeName;           // scheme text, e.g. "http" (no ':' or slashes)
    DWORD SchemeLength;         // number of characters in SchemeName
    INTERNET_SCHEME SchemeType; // enumerated value for this scheme
    DWORD SchemeFlags;          // SCHEME_xxx flag(s) for this scheme
    BOOL NeedSlashes;           // TRUE if the scheme name must be followed by "://"
    DWORD OpenFlags;            // per-scheme flags (WINHTTP_FLAG_SECURE for https)
} URL_SCHEME_INFO;
//
// N.B. MapUrlScheme() and MapUrlSchemeToName() index this table directly with
// an INTERNET_SCHEME value, so the order of the entries is significant
//
const PRIVATE URL_SCHEME_INFO UrlSchemeList[] = {
    { NULL,    0, INTERNET_SCHEME_DEFAULT, 0,           FALSE, 0 },
    { "http",  4, INTERNET_SCHEME_HTTP,    SCHEME_HTTP, TRUE,  0 },
    { "https", 5, INTERNET_SCHEME_HTTPS,   SCHEME_HTTP, TRUE,  WINHTTP_FLAG_SECURE },
};

// number of entries in UrlSchemeList
#define NUMBER_OF_URL_SCHEMES ARRAY_ELEMENTS(UrlSchemeList)
//
// ScanSchemes - looks up a scheme name in UrlSchemeList
//
//  pszToCheck  - scheme name to find (need not be zero-terminated)
//  ccStr       - number of characters in pszToCheck
//  pwResult    - receives the UrlSchemeList index of the matching entry
//
// Returns TRUE and sets *pwResult when an entry with the same length and the
// same text (compared case-insensitively) is found. Returns FALSE and leaves
// *pwResult untouched otherwise
//
BOOL ScanSchemes(LPTSTR pszToCheck, DWORD ccStr, PDWORD pwResult)
{
    const URL_SCHEME_INFO * pEntry = UrlSchemeList;
    DWORD index = 0;

    while (index < NUMBER_OF_URL_SCHEMES) {

        //
        // cheap length comparison first, text comparison only on a length match
        //

        if (pEntry->SchemeLength == ccStr) {
            if (strnicmp(pEntry->SchemeName, pszToCheck, ccStr) == 0) {
                *pwResult = index;
                return TRUE;
            }
        }
        ++index;
        ++pEntry;
    }
    return FALSE;
}
//
// functions
//
BOOL IsValidUrl( IN LPCSTR lpszUrl )
/*++

Routine Description:

    Checks that every character of an URL is acceptable without escaping

Arguments:

    lpszUrl - pointer to zero-terminated URL to check

    Assumes: 1. lpszUrl is a non-NULL, non-empty string

Return Value:

    BOOL
        TRUE    - no character is unsafe for SCHEME_ANY

        FALSE   - at least one character is unsafe

--*/
{
    INET_ASSERT(lpszUrl != NULL);
    INET_ASSERT(*lpszUrl != '\0');

    for (LPCSTR pCursor = lpszUrl; *pCursor != '\0'; ++pCursor) {
        if (IS_UNSAFE_URL_CHARACTER(*pCursor, SCHEME_ANY)) {

            //
            // one bad character is enough to reject the whole URL
            //

            return FALSE;
        }
    }
    return TRUE;
}
BOOL DoesSchemeRequireSlashes( IN LPSTR lpszScheme, IN DWORD dwSchemeLength, IN BOOL bHasHostName )
/*++

Routine Description:

    Reports whether a protocol scheme must be followed by "://"

Arguments:

    lpszScheme      - pointer to the scheme name (just the name: no ':' and no
                      slashes)

    dwSchemeLength  - number of characters in lpszScheme, or 0 if lpszScheme is
                      zero-terminated and its length should be computed here

    bHasHostName    - value to return when the scheme is not in UrlSchemeList

Return Value:

    BOOL
        The NeedSlashes setting of the matching UrlSchemeList entry, or
        bHasHostName if the scheme is not in the list

--*/
{
    DWORD schemeIndex;
    DWORD nameLength;

    //
    // a zero length means "zero-terminated string": measure it
    //

    nameLength = (dwSchemeLength != 0) ? dwSchemeLength : (DWORD)strlen(lpszScheme);

    if (!ScanSchemes(lpszScheme, nameLength, &schemeIndex)) {
        return bHasHostName;
    }
    return UrlSchemeList[schemeIndex].NeedSlashes;
}
DWORD CrackUrl(
    IN OUT LPSTR lpszUrl,
    IN DWORD dwUrlLength,
    IN BOOL bEscape,
    OUT LPINTERNET_SCHEME lpSchemeType OPTIONAL,
    OUT LPSTR* lpszSchemeName OPTIONAL,
    OUT LPDWORD lpdwSchemeNameLength OPTIONAL,
    OUT LPSTR* lpszHostName OPTIONAL,
    OUT LPDWORD lpdwHostNameLength OPTIONAL,
    OUT LPINTERNET_PORT lpServerPort OPTIONAL,
    OUT LPSTR* lpszUserName OPTIONAL,
    OUT LPDWORD lpdwUserNameLength OPTIONAL,
    OUT LPSTR* lpszPassword OPTIONAL,
    OUT LPDWORD lpdwPasswordLength OPTIONAL,
    OUT LPSTR* lpszUrlPath OPTIONAL,
    OUT LPDWORD lpdwUrlPathLength OPTIONAL,
    OUT LPSTR* lpszExtraInfo OPTIONAL,
    OUT LPDWORD lpdwExtraInfoLength OPTIONAL,
    OUT LPBOOL pHavePort
    )
/*++

Routine Description:

    Cracks an URL into its constituent parts. All returned pointers point into
    the caller's lpszUrl buffer; nothing is copied or zero-terminated

    Assumes: 1. If one of the optional lpsz fields is present (e.g.
                lpszUserName) then the accompanying lpdw field must also be
                supplied

Arguments:

    lpszUrl                 - pointer to URL to crack. This buffer WILL BE
                              OVERWRITTEN if it contains escape sequences that
                              we will convert back to ANSI characters

    dwUrlLength             - if not 0, string length of lpszUrl

    bEscape                 - TRUE if the escape sequences in the part of the
                              URL after the address are to be decoded in situ

    lpSchemeType            - returned scheme type - e.g. INTERNET_SCHEME_HTTP

    lpszSchemeName          - returned scheme name

    lpdwSchemeNameLength    - length of scheme name

    lpszHostName            - returned host name

    lpdwHostNameLength      - length of host name

    lpServerPort            - returned server port if present in the URL, else
                              INTERNET_INVALID_PORT_NUMBER

    lpszUserName            - returned user name if present

    lpdwUserNameLength      - length of user name

    lpszPassword            - returned password if present

    lpdwPasswordLength      - length of password

    lpszUrlPath             - returned URL path

    lpdwUrlPathLength       - length of url-path (excludes any extra info when
                              lpszExtraInfo is supplied)

    lpszExtraInfo           - returned search string or intra-page link: the
                              text from the first '?' or '#' onwards. Left
                              untouched if there is none; check the length

    lpdwExtraInfoLength     - length of extra info, 0 if there is none

    pHavePort               - returned boolean indicating whether port was
                              specified

    N.B. When bEscape is TRUE the decode happens before the search for '?' and
    '#', so an escaped %3F or %23 is treated as the start of the extra info

Return Value:

    DWORD
        Success - ERROR_SUCCESS

        Failure - ERROR_WINHTTP_UNRECOGNIZED_SCHEME
                    no ':' found, the scheme is not in UrlSchemeList, or the
                    "://" does not match what the scheme requires

                  any error returned by GetUrlAddress() or DecodeUrlInSitu()

--*/
{
    DWORD error;
    DWORD schemeLength;
    DWORD i;
    DWORD skip;
    INTERNET_SCHEME schemeType;
    BOOL needSlashes;
    BOOL haveSlashes;

    //
    // if dwUrlLength is 0 then lpszUrl is ASCIIZ. Find its length
    //

    if (dwUrlLength == 0) {
        dwUrlLength = strlen(lpszUrl);
    }

    //
    // find the ':' that ends the scheme name. The remaining length is checked
    // BEFORE the character is read so that we never look beyond the counted
    // string. On exit dwUrlLength is the number of characters left, starting
    // at (and including) the ':'
    //

    for (schemeLength = 0; ; ++schemeLength) {
        if ((dwUrlLength == 0) || (lpszUrl[schemeLength] == '\0')) {

            //
            // no ':' in URL? Bogus
            //

            error = ERROR_WINHTTP_UNRECOGNIZED_SCHEME;
            goto quit;
        }
        if (lpszUrl[schemeLength] == ':') {
            break;
        }
        --dwUrlLength;
    }

    //
    // get the scheme information based on the protocol name
    //

    if (!ScanSchemes(lpszUrl, schemeLength, &i)) {
        error = ERROR_WINHTTP_UNRECOGNIZED_SCHEME;
        goto quit;
    }
    schemeType = UrlSchemeList[i].SchemeType;
    needSlashes = UrlSchemeList[i].NeedSlashes;

    skip = 1;       // skip ':'
    haveSlashes = FALSE;
    if ((dwUrlLength > 3) && (memcmp(&lpszUrl[schemeLength], "://", 3) == 0)) {
        skip = 3;   // skip "://"
        haveSlashes = TRUE;
    }

    //
    // If we don't have slashes, make sure we don't need them.
    // If we have slashes, make sure they are required.
    //

    if ((!haveSlashes) != (!needSlashes)) {
        error = ERROR_WINHTTP_UNRECOGNIZED_SCHEME;
        goto quit;
    }

    if (ARGUMENT_PRESENT(lpSchemeType)) {
        *lpSchemeType = schemeType;
    }
    if (ARGUMENT_PRESENT(lpszSchemeName)) {
        *lpszSchemeName = lpszUrl;
        *lpdwSchemeNameLength = schemeLength;
    }
    lpszUrl += schemeLength + skip;
    dwUrlLength -= skip;

    //
    // pull out [user[:password]@]host[:port]; on return lpszUrl/dwUrlLength
    // describe whatever follows the address
    //

    error = GetUrlAddress(&lpszUrl,
                          &dwUrlLength,
                          lpszUserName,
                          lpdwUserNameLength,
                          lpszPassword,
                          lpdwPasswordLength,
                          lpszHostName,
                          lpdwHostNameLength,
                          lpServerPort,
                          pHavePort
                          );

    if (bEscape && (error == ERROR_SUCCESS)) {
        error = DecodeUrlInSitu(lpszUrl, &dwUrlLength);
    }

    if ((error == ERROR_SUCCESS) && ARGUMENT_PRESENT(lpszExtraInfo)) {
        *lpdwExtraInfoLength = 0;
        for (i = 0; i < dwUrlLength; ++i) {
            if ((lpszUrl[i] == '?') || (lpszUrl[i] == '#')) {

                //
                // everything from here on is extra info; what precedes it is
                // the url-path
                //

                *lpszExtraInfo = &lpszUrl[i];
                *lpdwExtraInfoLength = dwUrlLength - i;
                dwUrlLength = i;
                break;
            }
        }
    }

    if ((error == ERROR_SUCCESS) && ARGUMENT_PRESENT(lpszUrlPath)) {
        *lpszUrlPath = lpszUrl;
        *lpdwUrlPathLength = dwUrlLength;
    }

quit:

    return error;
}
//
// number of bytes by which EncodeUrlPath() grows the output buffer each time
// it runs out of room
//
#define DEFAULT_REALLOC_SIZE 1024

DWORD EncodeUrlPath(
    IN DWORD Flags,
    IN DWORD SchemeFlags,
    IN LPSTR UrlPath,
    IN DWORD UrlPathLength,
    OUT LPSTR* pEncodedUrlPath,
    IN OUT LPDWORD EncodedUrlPathLength
    )
/*++

Routine Description:

    Encodes an URL-path. That is, escapes the string. Writes a new,
    zero-terminated URL-path in which all the 'unsafe' characters for this
    scheme have been converted to %XX escape sequences. The output buffer is
    grown with REALLOCATE_MEMORY() whenever it is too small

Arguments:

    Flags                   - controlling expansion. If NO_ENCODE_PATH_SEP is
                              set, '/' is copied unmodified

    SchemeFlags             - which scheme we are encoding for - SCHEME_HTTP,
                              etc.

    UrlPath                 - pointer to the unescaped, zero-terminated string

    UrlPathLength           - length of UrlPath. Not consulted: UrlPath is read
                              up to its terminating '\0'

    pEncodedUrlPath         - IN: pointer to the output buffer
                              OUT: pointer to the (possibly moved) buffer

    EncodedUrlPathLength    - IN: size of *pEncodedUrlPath
                              OUT: number of bytes written to *pEncodedUrlPath,
                              not counting the terminating '\0'. On failure it
                              holds the current size of the buffer

Return Value:

    DWORD
        Success - ERROR_SUCCESS

        Failure - ERROR_INSUFFICIENT_BUFFER
                    the output buffer could not be grown

--*/
{
    DWORD error;
    DWORD len;
    LPSTR EncodedUrlPath;
    UCHAR ch;

    len = *EncodedUrlPathLength;    // bytes still free in the output buffer
    EncodedUrlPath = *pEncodedUrlPath;

    for (;;) {

        DWORD needed;

        ch = (UCHAR)*UrlPath++;

        //
        // a source character may expand to 3 bytes (%XX) and there must always
        // be one more byte left over for the terminating '\0'; at the end of
        // the source only the terminator itself is needed
        //

        needed = (ch != '\0') ? 4 : 1;
        if (len < needed) {

            LPSTR pStr = (LPSTR)REALLOCATE_MEMORY(*pEncodedUrlPath,
                                                  *EncodedUrlPathLength + DEFAULT_REALLOC_SIZE,
                                                  LMEM_MOVEABLE
                                                  );

            if (pStr == NULL) {
                goto error;
            }

            //
            // the block may have moved: re-derive the write position from the
            // number of bytes already written
            //

            EncodedUrlPath = pStr + (*EncodedUrlPathLength - len);
            *pEncodedUrlPath = pStr;
            len += DEFAULT_REALLOC_SIZE;
            *EncodedUrlPathLength += DEFAULT_REALLOC_SIZE;
        }

        if (ch == '\0') {
            break;
        }

        //
        // check whether this character is safe. For now, we encode all unsafe
        // and scheme-specific characters the same way (i.e. irrespective of
        // scheme)
        //
        // '/' is copied unmodified when the caller asks for that
        //

        if (IS_UNSAFE_URL_CHARACTER(ch, SchemeFlags)
        && !((ch == '/') && (Flags & NO_ENCODE_PATH_SEP))) {
            *EncodedUrlPath++ = '%';
            *EncodedUrlPath++ = NUMBER_TO_HEX_CHAR((int)ch / 16);
            *EncodedUrlPath++ = NUMBER_TO_HEX_CHAR((int)ch % 16);
            len -= 3;
        } else {
            *EncodedUrlPath++ = (char)ch;
            --len;
        }
    }

    *EncodedUrlPath = '\0';
    *EncodedUrlPathLength -= len;
    error = ERROR_SUCCESS;

quit:

    return error;

error:

    error = ERROR_INSUFFICIENT_BUFFER;
    goto quit;
}
PRIVATE char HexCharToNumber( IN char ch )
/*++

Routine Description:

    Converts an ANSI character in the range '0'..'9' 'A'..'F' 'a'..'f' to its
    corresponding hexadecimal value (0..f). The character is not validated:
    callers are expected to have checked it is a hex digit

Arguments:

    ch  - character to convert

Return Value:

    char
        hexadecimal value of ch, as an 8-bit (signed) character value

--*/
{
    if (ch <= '9') {
        return (char)(ch - '0');
    }
    if (ch >= 'a') {
        return (char)((ch - 'a') + 10);
    }
    return (char)((ch - 'A') + 10);
}
PRIVATE char NumberToHexChar( IN int Number )
/*++

Routine Description:

    Converts a number in the range 0..15 to its ASCII character hex
    representation ('0'..'9', 'A'..'F')

Arguments:

    Number  - to convert

Return Value:

    char
        character in above range

--*/
{
    char base = '0';

    if (Number > 9) {

        //
        // 10..15 map onto 'A'..'F'
        //

        base = 'A';
        Number -= 10;
    }
    return (char)(base + Number);
}
DWORD DecodeUrl(
    IN LPSTR Url,
    IN DWORD UrlLength,
    OUT LPSTR DecodedString,
    IN OUT LPDWORD DecodedLength
    )
/*++

Routine Description:

    Converts an URL string with embedded escape sequences (%xx) to a counted
    string. The output is NOT zero-terminated

    It is safe to pass the same pointer for the string to convert, and the
    buffer for the converted results: both hex digits of an escape are read
    before the decoded character is written, and the output pointer never
    overtakes the input pointer

Arguments:

    Url             - pointer to URL string to convert

    UrlLength       - number of characters in Url

    DecodedString   - pointer to buffer that receives converted string

    DecodedLength   - IN: number of characters in buffer
                      OUT: number of characters converted (only updated on
                      success)

Return Value:

    DWORD
        Success - ERROR_SUCCESS

        Failure - ERROR_WINHTTP_INVALID_URL
                    a '%' is not followed, within UrlLength, by two hex digits

                  ERROR_INSUFFICIENT_BUFFER
                    DecodedString isn't large enough to hold all the converted
                    Url

--*/
{
    DWORD bufferRemaining;

    bufferRemaining = *DecodedLength;
    while ((UrlLength != 0) && (bufferRemaining != 0)) {

        char ch;

        if (*Url == '%') {

            //
            // the whole %xx sequence must lie inside the counted string: check
            // the length first so we neither read past the end of the input
            // nor wrap UrlLength below zero. The digits are passed to
            // isxdigit() as unsigned char because characters >= 0x80 are
            // negative when char is signed
            //
            // BUGBUG - would %00 ever appear in an URL?
            //

            if ((UrlLength < 3)
            || !isxdigit((unsigned char)Url[1])
            || !isxdigit((unsigned char)Url[2])) {
                return ERROR_WINHTTP_INVALID_URL;
            }
            ch = (char)((HexCharToNumber(Url[1]) << 4) | HexCharToNumber(Url[2]));
            Url += 3;
            UrlLength -= 3;
        } else {
            ch = *Url++;
            --UrlLength;
        }
        *DecodedString++ = ch;
        --bufferRemaining;
    }

    //
    // input left over means the output buffer filled up first
    //

    if (UrlLength != 0) {
        return ERROR_INSUFFICIENT_BUFFER;
    }
    *DecodedLength -= bufferRemaining;
    return ERROR_SUCCESS;
}
DWORD DecodeUrlInSitu( IN LPSTR BufferAddress, IN OUT LPDWORD BufferLength )
/*++

Routine Description:

    Decodes an URL string, if it contains escape sequences. The conversion is
    done in place: an escape sequence occupies 3 bytes and the character it
    decodes to occupies 1, so the decoded string is never longer than the
    encoded one

Arguments:

    BufferAddress   - pointer to the string to convert

    BufferLength    - IN: number of characters to convert
                      OUT: length of converted string

Return Value:

    DWORD
        Success - ERROR_SUCCESS

        Failure - ERROR_WINHTTP_INVALID_URL
                  ERROR_INSUFFICIENT_BUFFER

--*/
{
    DWORD inputLength = *BufferLength;

    if (memchr(BufferAddress, '%', inputLength) == NULL) {

        //
        // no escape character in the string: nothing to do, length unchanged
        //

        return ERROR_SUCCESS;
    }

    //
    // same buffer used as both source and destination
    //

    return DecodeUrl(BufferAddress, inputLength, BufferAddress, BufferLength);
}
DWORD DecodeUrlStringInSitu( IN LPSTR BufferAddress, IN OUT LPDWORD BufferLength )
/*++

Routine Description:

    Performs DecodeUrlInSitu() on a string and zero terminates it

    Assumes: 1. Even if no decoding is performed, the buffer has room for an
                extra '\0' character after *BufferLength characters

Arguments:

    BufferAddress   - pointer to the string to convert

    BufferLength    - IN: number of characters to convert
                      OUT: length of converted string, excluding '\0'

Return Value:

    DWORD
        Success - ERROR_SUCCESS

        Failure - ERROR_WINHTTP_INVALID_URL
                  ERROR_INSUFFICIENT_BUFFER

--*/
{
    DWORD status = DecodeUrlInSitu(BufferAddress, BufferLength);

    if (status != ERROR_SUCCESS) {

        //
        // the string is not terminated when decoding fails
        //

        return status;
    }
    BufferAddress[*BufferLength] = '\0';
    return ERROR_SUCCESS;
}
DWORD GetUrlAddressInfo(
    IN OUT LPSTR* Url,
    IN OUT LPDWORD UrlLength,
    OUT LPSTR* PartOne,
    OUT LPDWORD PartOneLength,
    OUT LPBOOL PartOneEscape,
    OUT LPSTR* PartTwo,
    OUT LPDWORD PartTwoLength,
    OUT LPBOOL PartTwoEscape
    )
/*++

Routine Description:

    Given a string of the form foo:bar, splits them into 2 counted strings
    about the ':' character. The address string may or may not contain a ':'.
    Scanning stops at the first '/', at a '\0', or when *UrlLength characters
    have been examined, whichever comes first

    This function is intended to split into substrings the host:port and
    username:password strings commonly used in Internet address specifications
    and by association, in URLs

Arguments:

    Url             - pointer to pointer to string containing URL. On output
                      this is advanced past the address parts

    UrlLength       - pointer to length of URL in Url. On output this is
                      reduced by the number of characters parsed

    PartOne         - pointer which will receive first part of address string.
                      Set to NULL if a ':' is present with nothing before it

    PartOneLength   - pointer which will receive length of first part of
                      address string

    PartOneEscape   - TRUE on output if PartOne contains escape sequences

    PartTwo         - pointer which will receive second part of address string,
                      or NULL if there is no ':'

    PartTwoLength   - pointer which will receive length of second part of
                      address string

    PartTwoEscape   - TRUE on output if PartTwo contains escape sequences

Return Value:

    DWORD
        Success - ERROR_SUCCESS

        Failure - ERROR_WINHTTP_INVALID_URL
                    more than one ':' was found. Url, UrlLength and the PartTwo
                    outputs are not updated in this case

--*/
{
    LPSTR pString;
    LPSTR pColon;
    DWORD partLength;
    LPBOOL partEscape;
    DWORD length;

    //
    // parse out <host>[:<port>] or <name>[:<password>] (i.e. <part1>[:<part2>]
    //

    pString = *Url;
    pColon = NULL;
    partLength = 0;
    *PartOne = pString;
    *PartOneLength = 0;
    *PartOneEscape = FALSE;
    *PartTwoEscape = FALSE;
    partEscape = PartOneEscape;
    length = *UrlLength;

    //
    // the remaining length is tested first so that *pString is never read once
    // the counted string has been used up
    //

    while ((length != 0) && (*pString != '/') && (*pString != '\0')) {
        if (*pString == '%') {

            //
            // if there is a % in the string then it *must* (RFC 1738) be the
            // start of an escape sequence. This function just reports the
            // address of the substrings and their lengths; calling functions
            // must handle the escape sequences (i.e. it is their responsibility
            // to decide where to put the results)
            //

            *partEscape = TRUE;
        }
        if (*pString == ':') {
            if (pColon != NULL) {

                //
                // we don't expect more than 1 ':'
                //

                return ERROR_WINHTTP_INVALID_URL;
            }
            pColon = pString;
            *PartOneLength = partLength;
            if (partLength == 0) {
                *PartOne = NULL;
            }

            //
            // from here on we are counting (and flagging escapes in) part two
            //

            partLength = 0;
            partEscape = PartTwoEscape;
        } else {
            ++partLength;
        }
        ++pString;
        --length;
    }

    //
    // we either ended on the host (or user) name or the port number (or
    // password), one of which we don't know the length of
    //

    if (pColon == NULL) {
        *PartOneLength = partLength;
        *PartTwo = NULL;
        *PartTwoLength = 0;
        *PartTwoEscape = FALSE;
    } else {
        *PartTwoLength = partLength;
        *PartTwo = pColon + 1;

        //
        // in both the <user>:<password> and <host>:<port> cases, we cannot have
        // the second part without the first, although both parts being zero
        // length is OK (host name will be sorted out elsewhere, but (for now,
        // at least) I am allowing <>:<> for username:password, since I don't
        // see it expressly disallowed in the RFC. I may be revisiting this code
        // later...)
        //
        // N.B.: ftp://ftp.microsoft.com uses http://:0/-http-gw-internal-/menu.gif,
        // so a second part without a first part is deliberately NOT rejected
        //
    }

    //
    // update the URL pointer and length remaining
    //

    *Url = pString;
    *UrlLength = length;

    return ERROR_SUCCESS;
}
DWORD GetUrlAddress(
    IN OUT LPSTR* lpszUrl,
    OUT LPDWORD lpdwUrlLength,
    OUT LPSTR* lpszUserName OPTIONAL,
    OUT LPDWORD lpdwUserNameLength OPTIONAL,
    OUT LPSTR* lpszPassword OPTIONAL,
    OUT LPDWORD lpdwPasswordLength OPTIONAL,
    OUT LPSTR* lpszHostName OPTIONAL,
    OUT LPDWORD lpdwHostNameLength OPTIONAL,
    OUT LPINTERNET_PORT lpPort OPTIONAL,
    OUT LPBOOL pHavePort
    )
/*++

Routine Description:

    This function extracts any and all parts of the address information for a
    generic URL. If any of the address parts contain escaped characters (%nn)
    then they are converted in situ

    The generic addressing format (RFC 1738) is:

        <user>:<password>@<host>:<port>

    The addressing information cannot contain a password without a user name,
    or a port without a host name
    NB: ftp://ftp.microsoft.com uses URL's that have a port without a host name!
    (e.g. http://:0/-http-gw-internal-/menu.gif), so this is not enforced

    Although only the lpszUrl and lpdwUrlLength fields are required, the
    address parts will be checked for presence and completeness

    Assumes: 1. If one of the optional lpsz fields is present (e.g.
                lpszUserName) then the accompanying lpdw field must also be
                supplied

             2. *lpszUrl is zero-terminated: its length is measured here with
                strlen(), the incoming value of *lpdwUrlLength is not used

Arguments:

    lpszUrl             - IN: pointer to the URL to parse
                          OUT: URL remaining after address information

                          N.B. The url-path is NOT canonicalized (unescaped)
                          because it may contain protocol-specific information
                          which must be parsed out by the protocol-specific
                          parser

    lpdwUrlLength       - returned length of the remainder of the URL after the
                          address information

    lpszUserName        - returned pointer to the user name
                          This parameter can be omitted by those protocol
                          parsers that do not require or expect user names in
                          the URL

    lpdwUserNameLength  - returned length of the user name part

    lpszPassword        - returned pointer to the password
                          This parameter can be omitted by those protocol
                          parsers that do not require or expect user passwords
                          in the URL

    lpdwPasswordLength  - returned length of the password

    lpszHostName        - returned pointer to the host name
                          This parameter can be omitted by those protocol
                          parsers that do not require the host name info

    lpdwHostNameLength  - returned length of the host name

    lpPort              - returned value of the port field, or
                          INTERNET_INVALID_PORT_NUMBER if the URL has no port
                          This parameter can be omitted by those protocol
                          parsers that do not require or expect user port
                          number

    pHavePort           - returned boolean indicating whether a port was
                          specified in the URL or not. This value is not
                          returned if the lpPort parameter is omitted.

Return Value:

    DWORD
        Success - ERROR_SUCCESS

        Failure - ERROR_WINHTTP_INVALID_URL
                    We could not parse some part of the address info, or the
                    port is not a decimal number in the range 0..65535

                  ERROR_INSUFFICIENT_BUFFER
                    We could not convert an escaped string

--*/
{
    LPSTR pAt;
    DWORD urlLength;
    LPSTR pUrl;
    BOOL part1Escape;
    BOOL part2Escape;
    DWORD portNumberLength;
    LPSTR pPortNumber;
    DWORD error;
    LPSTR hostName;
    DWORD hostNameLength;
    DWORD i;

    pUrl = *lpszUrl;
    urlLength = strlen(pUrl);

    //
    // check to see if there is an '@' separating user name & password. If we
    // see a '/' or get to the end of the string before we see the '@' then
    // there is no username:password part
    //

    pAt = NULL;
    for (i = 0; i < urlLength; ++i) {
        if (pUrl[i] == '/') {
            break;
        } else if (pUrl[i] == '@') {
            pAt = &pUrl[i];
            break;
        }
    }

    if (pAt != NULL) {

        DWORD addressPartLength;
        LPSTR userName;
        DWORD userNameLength;
        LPSTR password;
        DWORD passwordLength;

        addressPartLength = (DWORD) (pAt - pUrl);
        urlLength -= addressPartLength;
        error = GetUrlAddressInfo(&pUrl,
                                  &addressPartLength,
                                  &userName,
                                  &userNameLength,
                                  &part1Escape,
                                  &password,
                                  &passwordLength,
                                  &part2Escape
                                  );
        if (error != ERROR_SUCCESS) {
            return error;
        }

        //
        // ensure there is no address information unparsed before the '@'
        //

        INET_ASSERT(addressPartLength == 0);
        INET_ASSERT(pUrl == pAt);

        if (ARGUMENT_PRESENT(lpszUserName)) {

            INET_ASSERT(ARGUMENT_PRESENT(lpdwUserNameLength));

            //
            // convert the user name in situ
            //

            if (part1Escape) {

                INET_ASSERT(userName != NULL);
                INET_ASSERT(userNameLength != 0);

                error = DecodeUrlInSitu(userName, &userNameLength);
                if (error != ERROR_SUCCESS) {
                    return error;
                }
            }
            *lpszUserName = userName;
            *lpdwUserNameLength = userNameLength;
        }

        if (ARGUMENT_PRESENT(lpszPassword)) {

            INET_ASSERT(ARGUMENT_PRESENT(lpdwPasswordLength));

            //
            // convert the password in situ
            //

            if (part2Escape) {

                INET_ASSERT(userName != NULL);
                INET_ASSERT(userNameLength != 0);
                INET_ASSERT(password != NULL);
                INET_ASSERT(passwordLength != 0);

                error = DecodeUrlInSitu(password, &passwordLength);
                if (error != ERROR_SUCCESS) {
                    return error;
                }
            }
            *lpszPassword = password;
            *lpdwPasswordLength = passwordLength;
        }

        //
        // the URL pointer now points at the host:port fields (remember that
        // GetUrlAddressInfo() must have bumped pUrl up to the end of the
        // password field (if present) which ends at pAt)
        //

        ++pUrl;

        //
        // similarly, bump urlLength to account for the '@'
        //

        --urlLength;
    } else {

        //
        // no '@' therefore no username or password
        //

        if (ARGUMENT_PRESENT(lpszUserName)) {

            INET_ASSERT(ARGUMENT_PRESENT(lpdwUserNameLength));

            *lpszUserName = NULL;
            *lpdwUserNameLength = 0;
        }
        if (ARGUMENT_PRESENT(lpszPassword)) {

            INET_ASSERT(ARGUMENT_PRESENT(lpdwPasswordLength));

            *lpszPassword = NULL;
            *lpdwPasswordLength = 0;
        }
    }

    //
    // now get the host name and the optional port. On success
    // GetUrlAddressInfo() always sets the port pointer and length (NULL and 0
    // if there is no ':'), so they need no other initial value
    //

    pPortNumber = NULL;
    portNumberLength = 0;
    error = GetUrlAddressInfo(&pUrl,
                              &urlLength,
                              &hostName,
                              &hostNameLength,
                              &part1Escape,
                              &pPortNumber,
                              &portNumberLength,
                              &part2Escape
                              );
    if (error != ERROR_SUCCESS) {
        return error;
    }

    //
    // N.B. an empty host name is deliberately accepted here (see the
    // http://:0/ example above)
    //

    if (ARGUMENT_PRESENT(lpszHostName)) {

        INET_ASSERT(ARGUMENT_PRESENT(lpdwHostNameLength));

        //
        // if the host name contains escaped characters, convert them in situ
        //

        if (part1Escape) {
            error = DecodeUrlInSitu(hostName, &hostNameLength);
            if (error != ERROR_SUCCESS) {
                return error;
            }
        }
        *lpszHostName = hostName;
        *lpdwHostNameLength = hostNameLength;
    }

    //
    // if there is a port field, convert it if there are escaped characters,
    // check it for valid numeric characters, and convert it to a number
    //

    if (ARGUMENT_PRESENT(lpPort)) {
        if (portNumberLength != 0) {

            DWORD port;

            INET_ASSERT(pPortNumber != NULL);

            if (part2Escape) {
                error = DecodeUrlInSitu(pPortNumber, &portNumberLength);
                if (error != ERROR_SUCCESS) {
                    return error;
                }
            }

            //
            // ensure all characters in the port number buffer are numeric, and
            // calculate the port number at the same time. The character is
            // passed to isdigit() as unsigned char because characters >= 0x80
            // are negative when char is signed
            //

            for (i = 0, port = 0; i < portNumberLength; ++i) {
                if (!isdigit((unsigned char)*pPortNumber)) {
                    return ERROR_WINHTTP_INVALID_URL;
                }
                port = port * 10 + (int)(*pPortNumber++ - '0');

                //
                // We won't allow ports larger than 65535 ((2^16)-1)
                // We have to check this every time to make sure that someone
                // doesn't try to overflow a DWORD.
                //

                if (port > 65535) {
                    return ERROR_WINHTTP_INVALID_URL;
                }
            }
            *lpPort = (INTERNET_PORT)port;
            if (ARGUMENT_PRESENT(pHavePort)) {
                *pHavePort = TRUE;
            }
        } else {
            *lpPort = INTERNET_INVALID_PORT_NUMBER;
            if (ARGUMENT_PRESENT(pHavePort)) {
                *pHavePort = FALSE;
            }
        }
    }

    //
    // update the URL pointer and the length of the url-path
    //

    *lpszUrl = pUrl;
    *lpdwUrlLength = urlLength;

    return ERROR_SUCCESS;
}
INTERNET_SCHEME MapUrlSchemeName( IN LPSTR lpszSchemeName, IN DWORD dwSchemeNameLength )
/*++

Routine Description:

    Maps a scheme name/length to the enumerated scheme type

Arguments:

    lpszSchemeName      - pointer to name of scheme to map

    dwSchemeNameLength  - length of scheme (if -1, lpszSchemeName is ASCIZ)

Return Value:

    INTERNET_SCHEME
        The SchemeType of the matching UrlSchemeList entry, or
        INTERNET_SCHEME_UNKNOWN if the name is not in the list

--*/
{
    DWORD schemeIndex;
    DWORD nameLength = dwSchemeNameLength;

    if (nameLength == (DWORD)-1) {
        nameLength = (DWORD)lstrlen(lpszSchemeName);
    }

    if (!ScanSchemes(lpszSchemeName, nameLength, &schemeIndex)) {
        return INTERNET_SCHEME_UNKNOWN;
    }
    return UrlSchemeList[schemeIndex].SchemeType;
}
LPSTR MapUrlScheme( IN INTERNET_SCHEME Scheme, OUT LPDWORD lpdwSchemeNameLength )
/*++

Routine Description:

    Maps the enumerated scheme type to the scheme name and its length

    N.B. Scheme is used directly as an index into UrlSchemeList

Arguments:

    Scheme                  - enumerated scheme type to map

    lpdwSchemeNameLength    - pointer to returned length of scheme name; set to
                              0 if Scheme is out of range

Return Value:

    LPSTR   - pointer to scheme name or NULL

--*/
{
    if ((Scheme < INTERNET_SCHEME_FIRST) || (Scheme > INTERNET_SCHEME_LAST)) {
        *lpdwSchemeNameLength = 0;
        return NULL;
    }

    const URL_SCHEME_INFO * pInfo = &UrlSchemeList[Scheme];

    *lpdwSchemeNameLength = pInfo->SchemeLength;
    return pInfo->SchemeName;
}
LPSTR MapUrlSchemeToName( IN INTERNET_SCHEME Scheme )
/*++

Routine Description:

    Maps the enumerated scheme type to the scheme name

    N.B. Scheme is used directly as an index into UrlSchemeList

Arguments:

    Scheme  - enumerated scheme type to map

Return Value:

    LPSTR   - pointer to scheme name or NULL if Scheme is outside
              INTERNET_SCHEME_FIRST..INTERNET_SCHEME_LAST

--*/
{
    BOOL inRange = (Scheme >= INTERNET_SCHEME_FIRST) && (Scheme <= INTERNET_SCHEME_LAST);

    if (!inRange) {
        return NULL;
    }
    return UrlSchemeList[Scheme].SchemeName;
}
/*
 * ConvertUnicodeToMultiByte:
 *
 * Converts a zero-terminated Unicode string to a newly allocated, zero-
 * terminated narrow string returned in *pmp (psStr, dwSize, dwAlloc), then
 * optionally escapes it.
 *
 * dwFlags:
 *
 *   WINHTTP_FLAG_NULL_CODEPAGE -> the string is assumed to be already
 *       correctly encoded; each WCHAR is narrowed to a CHAR with no code page
 *       conversion.
 *
 *   WINHTTP_FLAG_VALID_HOSTNAME -> only for server names: the string is first
 *       checked against the HOSTNAME safety class and
 *       ERROR_WINHTTP_INVALID_URL is returned if any character fails; it is
 *       then narrowed as above. Must not be combined with
 *       WINHTTP_FLAG_DEFAULT_ESCAPE.
 *
 *   If neither of these is specified, a string with no unsafe characters is
 *   narrowed directly, otherwise it is converted with WideCharToMultiByte()
 *   using dwCodePage. (The choice of code page, e.g. falling back to UTF8, is
 *   presumably made by the caller - this function uses dwCodePage as given.)
 *
 *   WINHTTP_FLAG_DEFAULT_ESCAPE -> the narrow string is escaped (%XX) up to,
 *       but not including, the first '?'; everything from the '?' on is copied
 *       unchanged. '%' itself is only escaped if WINHTTP_FLAG_ESCAPE_PERCENT
 *       is also specified.
 *
 * Returns ERROR_SUCCESS, ERROR_WINHTTP_INVALID_URL, ERROR_NOT_ENOUGH_MEMORY or
 * the GetLastError() value from a failed WideCharToMultiByte().
 */
DWORD ConvertUnicodeToMultiByte(
    LPCWSTR lpszObjectName,
    DWORD dwCodePage,
    MEMORYPACKET* pmp,
    DWORD dwFlags)
{
    DWORD dwError = ERROR_SUCCESS;
    LPSTR pStr;
    WCHAR wc;
    LPCWSTR pwStr;
    BOOL bStrip0s = TRUE;
    DWORD dwUnicodeUrlSize;

    //determine size of string and/or safe characters
    if ((dwFlags & WINHTTP_FLAG_NULL_CODEPAGE) || (dwFlags & WINHTTP_FLAG_VALID_HOSTNAME))
    {
        if (dwFlags & WINHTTP_FLAG_VALID_HOSTNAME)
        {
            for (pwStr = lpszObjectName; (wc = *pwStr) != 0; ++pwStr)
            {
                if (IS_UNSAFE_URL_WIDECHARACTER(wc, HOSTNAME))
                {
                    dwError = ERROR_WINHTTP_INVALID_URL;
                    goto done;
                }
            }
            pmp->dwAlloc = dwUnicodeUrlSize = (DWORD)(pwStr-lpszObjectName+1);
        }
        else
        {
            pmp->dwAlloc = dwUnicodeUrlSize = lstrlenW(lpszObjectName)+1;
        }
    }
    else
    {
        DWORD dwUnsafeChars = 0;

        // optimization to check for unsafe characters, and optimize the common case.
        // calculate the length, and while parsing the string, check if there are unsafeChars
        for (pwStr = lpszObjectName; (wc = *pwStr) != 0; ++pwStr)
        {
            if (IS_UNSAFE_URL_WIDECHARACTER(wc, 0))
                ++dwUnsafeChars;
        }
        dwUnicodeUrlSize = (DWORD)(pwStr-lpszObjectName+1);

        if (dwUnsafeChars == 0)
        {
            pmp->dwAlloc = dwUnicodeUrlSize;
        }
        else
        {
            bStrip0s = FALSE;
        }
    }

    //convert to MBCS
    if (bStrip0s)
    {
        // one narrow character per wide character: just drop the high byte
        INET_ASSERT(pmp->dwAlloc);
        pmp->psStr = (LPSTR)ALLOCATE_FIXED_MEMORY(pmp->dwAlloc);

        if (!pmp->psStr)
        {
            pmp->dwAlloc = 0;
            dwError = ERROR_NOT_ENOUGH_MEMORY;
            goto done;
        }
        pmp->dwSize = pmp->dwAlloc-1;
        for (pStr = pmp->psStr; (wc = *lpszObjectName) != 0; ++lpszObjectName)
        {
            *(pStr)++ = (CHAR)wc;
        }
        *pStr = '\0';
    }
    else
    {
        // convert with WideCharToMultiByte(): first call sizes, second converts.
        // dwUnicodeUrlSize includes the terminator, so both results do too
        pmp->dwAlloc = WideCharToMultiByte(dwCodePage, 0, lpszObjectName, dwUnicodeUrlSize, NULL, 0, NULL, NULL);
        if (pmp->dwAlloc)
        {
            pmp->psStr = (LPSTR)ALLOCATE_FIXED_MEMORY(pmp->dwAlloc);

            if (!pmp->psStr)
            {
                pmp->dwAlloc = 0;
                dwError = ERROR_NOT_ENOUGH_MEMORY;
                goto done;
            }
            pmp->dwSize = WideCharToMultiByte(dwCodePage, 0, lpszObjectName, dwUnicodeUrlSize, pmp->psStr, pmp->dwAlloc, NULL, NULL);

            if (!pmp->dwSize)
            {
                dwError = GetLastError();
                goto done;
            }
            else
                pmp->dwSize -= 1;
        }
        else
        {
            dwError = GetLastError();
            goto done;
        }
    }

    //escaping
    if (dwFlags & WINHTTP_FLAG_DEFAULT_ESCAPE)
    {
        INET_ASSERT (! (dwFlags & WINHTTP_FLAG_VALID_HOSTNAME));

        static const CHAR hexArray[] = "0123456789ABCDEF";
        UCHAR ch;
        DWORD dwUnsafeChars = 0;
        DWORD dwNewAlloc;
        LPSTR pDest, pNewStr;
        LPSTR pNext;
        LPSTR pByte;
        BOOL bEscapePercent;
        BOOL bHitQuery;
        BOOL bLead;

        // Sizing pass. The escaping pass below may expand EVERY byte of a
        // multi-byte character (lead and trail), so every unsafe byte has to
        // be counted here, not just the first byte of each character -
        // otherwise the new buffer can be too small. The count is an upper
        // bound: a lead '%' is counted even if it ends up not being escaped.
        for (pStr = pmp->psStr; (ch = *pStr) != 0; pStr = pNext)
        {
            pNext = CharNextExA((WORD)dwCodePage, pStr, 0);

            // an unescaped '?' starts the query, which is never escaped
            if (!IS_UNSAFE_URL_CHARACTER(ch, SCHEME_HTTP) && (ch == '?'))
                break;

            for (pByte = pStr; pByte != pNext; ++pByte)
            {
                ch = (UCHAR)*pByte;
                if (IS_UNSAFE_URL_CHARACTER(ch, SCHEME_HTTP))
                    ++dwUnsafeChars;
            }
        }

        if (dwUnsafeChars == 0)
            goto done;

        // each escaped byte grows from 1 byte to 3
        dwNewAlloc = pmp->dwAlloc + dwUnsafeChars*2;
        pNewStr = pDest = (LPSTR)ALLOCATE_FIXED_MEMORY(dwNewAlloc);

        if (!pDest)
        {
            dwError = ERROR_NOT_ENOUGH_MEMORY;
            goto done;
        }

        bEscapePercent = (dwFlags & WINHTTP_FLAG_ESCAPE_PERCENT) ? TRUE : FALSE;
        bHitQuery = FALSE;

        // Escaping pass: walk character by character, and byte by byte within
        // each character. bLead is TRUE only for the first byte of a character;
        // '%' and '?' are only given their special meaning in that position.
        for (pStr = pmp->psStr; (ch = *pStr) != 0;)
        {
            pNext = CharNextExA((WORD)dwCodePage, pStr, 0);
            bLead = TRUE;
            do
            {
                ch = *pStr;
                if (IS_UNSAFE_URL_CHARACTER(ch, SCHEME_HTTP)
                    && (!bLead || (ch != '%') || bEscapePercent) )
                {
                    *pDest++ = '%';
                    *pDest++ = hexArray[ch>>4];
                    *pDest++ = hexArray[ch & 0x0f];
                }
                else
                {
                    *pDest++ = ch;
                    if ((ch == '?') && bLead)
                    {
                        bHitQuery = TRUE;

                        ++pStr;
                        INET_ASSERT(pStr == pNext);

                        break;
                    }
                }
                bLead = FALSE;
            } while (++pStr != pNext);

            if (bHitQuery)
                break;
        }

        // the query string (everything after the '?') is copied as is
        if (bHitQuery)
        {
            for ( ; (ch = *pStr) != 0; pStr++)
            {
                *pDest++ = ch;
            }
        }
        *pDest = '\0';

        FREE_FIXED_MEMORY(pmp->psStr);
        pmp->psStr = pNewStr;
        pmp->dwSize = (DWORD)(pDest-pNewStr);
        pmp->dwAlloc = dwNewAlloc;
    }

done:
    if (pmp->psStr)
        pmp->dwAlloc = (pmp->dwAlloc > MP_MAX_STACK_USE) ? pmp->dwAlloc : MP_MAX_STACK_USE+1;// to force FREE in ~MEMORYPACKET

    return dwError;
}
|