|
|
/*++
Copyright (c) 1995 Microsoft Corporation
Module Name:
parseurl.cxx
Abstract:
Contains functions to parse the basic URLs - FTP, Gopher, HTTP.
An URL parser simply acts as a macro: it must break out the protocol-specific information from the URL and initiate opening the identified resource: all this can be accomplished by calling the relevant Internet protocol APIs.
Code in this module is based on RFC1738
Contents: IsValidUrl DoesSchemeRequireSlashes ParseUrl CrackUrl EncodeUrlPath (HexCharToNumber) (NumberToHexChar) DecodeUrl DecodeUrlInSitu DecodeUrlStringInSitu GetUrlAddressInfo GetUrlAddress MapUrlSchemeName MapUrlScheme MapUrlSchemeToName
Author:
Richard L Firth (rfirth) 26-Apr-1995
Environment:
Win32(s) user-mode DLL
Revision History:
26-Apr-1995 Created
--*/
#include <wininetp.h>
//
// private manifests
//
// RESERVED is an alias for SAFE: table entries written as RESERVED carry
// exactly the same flag value as entries written as SAFE
#define RESERVED SAFE
//
// private macros
//
//#define HEX_CHAR_TO_NUMBER(ch) \ // ((ch <= '9') \ // ? (ch - '0') \ // : ((ch >= 'a') \ // ? ((ch - 'a') + 10) \ // : ((ch - 'A') + 10)))
#define NUMBER_TO_HEX_CHAR(n) \
(((n) <= 9) ? ((char)(n) + '0') : (((char)(n) - 10) + 'A'))
#define IS_UNSAFE_URL_CHARACTER(Char, Scheme) \
(((UCHAR)(Char) <= 0x20) || ((UCHAR)(Char) >= 0x7f) \ || (SafetyList[(Char) - 0x21] & (UNSAFE | Scheme)))
#define IS_UNSAFE_URL_WIDECHARACTER(wChar, Scheme) \
(((WCHAR)(wChar) <= 0x0020) || ((WCHAR)(wChar) >= 0x007f) \ || (SafetyList[(wChar) - 0x0021] & (UNSAFE | Scheme)))
//
// private types
//
//
// private prototypes
//
// converts one hexadecimal digit character to its numeric value
PRIVATE char HexCharToNumber( IN char ch );
// converts a number in the range 0..15 to its hexadecimal digit character
PRIVATE char NumberToHexChar( IN int Number );
//
// private data
//
//
// SafetyList - the list of characters above 0x20 and below 0x7f that are
// classified as safe, unsafe or scheme-specific. Safe characters do not need
// to be escaped for any URL scheme. Unsafe characters must be escaped for all
// URL schemes. Scheme-specific characters need only be escaped for the relevant
// scheme(s)
//
//
// One entry per character from 0x21 ('!') to 0x7E ('~'); the entry for
// character c lives at index (c - 0x21), which is how the
// IS_UNSAFE_URL_CHARACTER macros index this table
//
const PRIVATE UCHAR SafetyList[] = {
//
// UNSAFE: 0x00..0x20
//
SAFE | HOSTNAME, // 0x21 (!)
UNSAFE, // 0x22 (")
UNSAFE, // 0x23 (#)
SAFE | HOSTNAME, // 0x24 ($)
UNSAFE, // 0x25 (%)
RESERVED | HOSTNAME, // 0x26 (&)
SAFE | HOSTNAME, // 0x27 (')
SAFE | HOSTNAME, // 0x28 (()
SAFE | HOSTNAME, // 0x29 ())
SAFE | HOSTNAME, // 0x2A (*)
SCHEME_GOPHER | HOSTNAME, // 0x2B (+)
SAFE | HOSTNAME, // 0x2C (,)
SAFE, // 0x2D (-)
SAFE, // 0x2E (.)
RESERVED | HOSTNAME, // 0x2F (/)
SAFE, // 0x30 (0)
SAFE, // 0x31 (1)
SAFE, // 0x32 (2)
SAFE, // 0x33 (3)
SAFE, // 0x34 (4)
SAFE, // 0x35 (5)
SAFE, // 0x36 (6)
SAFE, // 0x37 (7)
SAFE, // 0x38 (8)
SAFE, // 0x39 (9)
RESERVED | HOSTNAME, // 0x3A (:)
RESERVED | HOSTNAME, // 0x3B (;)
UNSAFE, // 0x3C (<)
RESERVED | HOSTNAME, // 0x3D (=)
UNSAFE, // 0x3E (>)
RESERVED | SCHEME_GOPHER | HOSTNAME, // 0x3F (?)
RESERVED | HOSTNAME, // 0x40 (@)
SAFE, // 0x41 (A)
SAFE, // 0x42 (B)
SAFE, // 0x43 (C)
SAFE, // 0x44 (D)
SAFE, // 0x45 (E)
SAFE, // 0x46 (F)
SAFE, // 0x47 (G)
SAFE, // 0x48 (H)
SAFE, // 0x49 (I)
SAFE, // 0x4A (J)
SAFE, // 0x4B (K)
SAFE, // 0x4C (L)
SAFE, // 0x4D (M)
SAFE, // 0x4E (N)
SAFE, // 0x4F (O)
SAFE, // 0x50 (P)
SAFE, // 0x51 (Q)
SAFE, // 0x52 (R)
SAFE, // 0x53 (S)
SAFE, // 0x54 (T)
SAFE, // 0x55 (U)
SAFE, // 0x56 (V)
SAFE, // 0x57 (W)
SAFE, // 0x58 (X)
SAFE, // 0x59 (Y)
SAFE, // 0x5A (Z)
UNSAFE, // 0x5B ([)
UNSAFE, // 0x5C (\)
UNSAFE, // 0x5D (])
UNSAFE, // 0x5E (^)
SAFE, // 0x5F (_)
UNSAFE, // 0x60 (`)
SAFE, // 0x61 (a)
SAFE, // 0x62 (b)
SAFE, // 0x63 (c)
SAFE, // 0x64 (d)
SAFE, // 0x65 (e)
SAFE, // 0x66 (f)
SAFE, // 0x67 (g)
SAFE, // 0x68 (h)
SAFE, // 0x69 (i)
SAFE, // 0x6A (j)
SAFE, // 0x6B (k)
SAFE, // 0x6C (l)
SAFE, // 0x6D (m)
SAFE, // 0x6E (n)
SAFE, // 0x6F (o)
SAFE, // 0x70 (p)
SAFE, // 0x71 (q)
SAFE, // 0x72 (r)
SAFE, // 0x73 (s)
SAFE, // 0x74 (t)
SAFE, // 0x75 (u)
SAFE, // 0x76 (v)
SAFE, // 0x77 (w)
SAFE, // 0x78 (x)
SAFE, // 0x79 (y)
SAFE, // 0x7A (z)
UNSAFE, // 0x7B ({)
UNSAFE, // 0x7C (|)
UNSAFE, // 0x7D (})
UNSAFE // 0x7E (~)
//
// UNSAFE: 0x7F..0xFF
//
};
//
// ByteCountForLeadUtf8Byte - returns the number of bytes to step over for a
// character whose first byte is ch. The count is chosen by the number of
// leading 1 bits in ch:
//
//      0 leading ones (0xxxxxxx)  -> 1
//      1 leading one  (10xxxxxx)  -> 1
//      2..6 leading ones          -> 2..6
//      7 or 8 leading ones        -> 1 (shouldn't happen in proper UTF-8)
//
INT ByteCountForLeadUtf8Byte(char ch)
{
    // indexed by the number of leading 1 bits found in the byte
    static const int s_aiStepForLeadingOnes[] = { 1, 1, 2, 3, 4, 5, 6, 1 };

    DWORD dwLeadingOnes;
    BYTE bBit = 0x80; // binary 1000 0000

    // walk down from the top bit until a 0 bit is found or the last table
    // slot is reached
    for (dwLeadingOnes = 0;
         dwLeadingOnes < ARRAY_ELEMENTS(s_aiStepForLeadingOnes) - 1;
         ++dwLeadingOnes) {
        if (((BYTE)ch & bBit) == 0) {
            break;
        }
        bBit = (BYTE)(bBit >> 1);
    }
    return s_aiStepForLeadingOnes[dwLeadingOnes];
}
//
// Utf8StrChr - scans [pString, pEnd) one UTF-8 character at a time (the step
// comes from ByteCountForLeadUtf8Byte) and returns a pointer to the first
// character whose lead byte equals chTarget. The scan also stops at a '\0'.
// Returns NULL if nothing matched
//
LPSTR Utf8StrChr(LPSTR pString, LPSTR pEnd, char chTarget)
{
    LPSTR pScan;

    for (pScan = pString;
         (pScan < pEnd) && (*pScan != '\0');
         pScan += ByteCountForLeadUtf8Byte(*pScan)) {
        if (*pScan == chTarget) {
            return pScan;
        }
    }
    return NULL;
}
//
// Utf8StrChrEx - like Utf8StrChr, but stops at the first character whose lead
// byte equals either chTarget1 or chTarget2. Scans [pString, pEnd), also
// stopping at a '\0'. Returns NULL if neither target was found
//
LPSTR Utf8StrChrEx(LPSTR pString, LPSTR pEnd, char chTarget1, char chTarget2)
{
    LPSTR pScan;

    for (pScan = pString;
         (pScan < pEnd) && (*pScan != '\0');
         pScan += ByteCountForLeadUtf8Byte(*pScan)) {
        if ((*pScan == chTarget1) || (*pScan == chTarget2)) {
            return pScan;
        }
    }
    return NULL;
}
//
// UrlSchemeList - the list of schemes that we support
//
//
// URL_SCHEME_INFO - one row of the scheme table
//
typedef struct {
    LPSTR SchemeName;           // scheme text without ':' (NULL in the first row)
    DWORD SchemeLength;         // number of characters in SchemeName
    INTERNET_SCHEME SchemeType; // enumerated scheme value for this row
    DWORD SchemeFlags;
    BOOL NeedSlashes;           // TRUE if "://" must follow the scheme name
    DWORD OpenFlags;
} URL_SCHEME_INFO;

//
// N.B. MapUrlScheme() and MapUrlSchemeToName() index this table directly with
// an INTERNET_SCHEME value, so the row order matters
//
const PRIVATE URL_SCHEME_INFO UrlSchemeList[] = {
    { NULL,    0, INTERNET_SCHEME_DEFAULT, 0,           FALSE, 0 },
    { "http",  4, INTERNET_SCHEME_HTTP,    SCHEME_HTTP, TRUE,  0 },
    { "https", 5, INTERNET_SCHEME_HTTPS,   SCHEME_HTTP, TRUE,  WINHTTP_FLAG_SECURE },
};

#define NUMBER_OF_URL_SCHEMES ARRAY_ELEMENTS(UrlSchemeList)
//
// ScanSchemes - looks up a counted scheme name in UrlSchemeList. A row matches
// when its SchemeLength equals ccStr and its name compares equal to the first
// ccStr characters of pszToCheck, ignoring case. On a match the row index is
// stored in *pwResult and TRUE is returned; otherwise FALSE is returned and
// *pwResult is not written.
//
// NOTE(review): the first row has a NULL name and length 0, so a zero-length
// pszToCheck reaches strnicmp() with a NULL first argument and a count of 0 -
// confirm this is intended
//
BOOL ScanSchemes(LPTSTR pszToCheck, DWORD ccStr, PDWORD pwResult)
{
    DWORD dwIndex = 0;

    while (dwIndex < NUMBER_OF_URL_SCHEMES) {
        if ((UrlSchemeList[dwIndex].SchemeLength == ccStr)
            && (strnicmp(UrlSchemeList[dwIndex].SchemeName, pszToCheck, ccStr) == 0)) {
            *pwResult = dwIndex;
            return TRUE;
        }
        ++dwIndex;
    }
    return FALSE;
}
//
// functions
//
BOOL IsValidUrl( IN LPCSTR lpszUrl )
/*++
Routine Description:
Checks that every character of an URL is acceptable without escaping, i.e. that no character is classed as unsafe by IS_UNSAFE_URL_CHARACTER for SCHEME_ANY
Arguments:
lpszUrl - pointer to the zero-terminated URL to check.
Assumes: 1. lpszUrl is non-NULL, non-empty string (asserted)
Return Value:
BOOL
TRUE - no unsafe character was found
FALSE - at least one character is unsafe
--*/
{
    LPCSTR pChar;

    INET_ASSERT(lpszUrl != NULL);
    INET_ASSERT(*lpszUrl != '\0');

    for (pChar = lpszUrl; *pChar != '\0'; ++pChar) {
        if (IS_UNSAFE_URL_CHARACTER(*pChar, SCHEME_ANY)) {
            return FALSE;
        }
    }
    return TRUE;
}
BOOL IsValidHostNameW( IN LPCWSTR lpwszHostName, IN DWORD dwFlags )
/*++
Routine Description:
Determines whether a UNICODE host name is acceptable. Three checks are made in order:
1. the string converts as an AF_INET address
2. the string converts as an AF_INET6 address AND is enclosed in brackets ("[...]") AND, if scope IDs are disallowed, has a scope ID of 0
3. otherwise every character must pass the strict HOSTNAME character check
Arguments:
lpwszHostName - Pointer to hostname to check. Assumes lpwszHostName is non-NULL (asserted) and points to a non-empty UNICODE string.
dwFlags - Flags that modify validation. If IVHN_DISALLOW_IPV6_SCOPE_ID is set then an IPv6 literal address containing a non-zero scope ID does not pass check 2
Return Value:
BOOL
--*/
{
    SOCKADDR_IN6 sockAddr;
    INT cbSockAddr;
    LPCWSTR pwch;

    INET_ASSERT(lpwszHostName != NULL);

    //
    // a string that converts as an IPv4 address is accepted outright
    //
    cbSockAddr = (INT)sizeof(sockAddr);
    if (_I_WSAStringToAddressW((LPWSTR)lpwszHostName,
                               AF_INET,
                               NULL,
                               (LPSOCKADDR)&sockAddr,
                               &cbSockAddr) == 0) {
        return TRUE;
    }

    //
    // an IPv6 literal is only accepted with its surrounding brackets
    //
    cbSockAddr = sizeof(sockAddr);
    if (_I_WSAStringToAddressW((LPWSTR)lpwszHostName,
                               AF_INET6,
                               NULL,
                               (LPSOCKADDR)&sockAddr,
                               &cbSockAddr) == 0) {

        BOOL bBracketed = (*lpwszHostName == L'[')
            && (*(lpwszHostName + lstrlenW(lpwszHostName) - 1) == L']');

        if (bBracketed) {
            //
            // a scope ID is fine unless the caller disallowed it
            //
            if (((dwFlags & IVHN_DISALLOW_IPV6_SCOPE_ID) == 0)
                || (sockAddr.sin6_scope_id == 0)) {
                return TRUE;
            }
        }
    }

    //
    // not accepted as a literal address: strict bad-character checking
    //
    for (pwch = lpwszHostName; *pwch != L'\0'; ++pwch) {
        if (IS_UNSAFE_URL_WIDECHARACTER(*pwch, HOSTNAME)) {
            return FALSE;
        }
    }
    return TRUE;
}
BOOL IsValidHostNameA( IN LPCSTR lpszHostName, IN DWORD dwFlags )
/*++
Routine Description:
Determines whether an ASCII host name is acceptable. Three checks are made in order:
1. the string converts as an AF_INET address
2. the string converts as an AF_INET6 address AND is enclosed in brackets ("[...]") AND, if scope IDs are disallowed, has a scope ID of 0
3. otherwise every character must pass the strict HOSTNAME character check
Arguments:
lpszHostName - Pointer to hostname to check. Assumes lpszHostName is non-NULL (asserted) and points to a non-empty ASCII string.
dwFlags - Flags that modify validation. If IVHN_DISALLOW_IPV6_SCOPE_ID is set then an IPv6 literal address containing a non-zero scope ID does not pass check 2
Return Value:
BOOL
--*/
{
    SOCKADDR_IN6 sockAddr;
    INT cbSockAddr;
    LPCSTR pch;

    INET_ASSERT(lpszHostName != NULL);

    //
    // a string that converts as an IPv4 address is accepted outright
    //
    cbSockAddr = sizeof(sockAddr);
    if (_I_WSAStringToAddressA((LPSTR)lpszHostName,
                               AF_INET,
                               NULL,
                               (LPSOCKADDR)&sockAddr,
                               &cbSockAddr) == 0) {
        return TRUE;
    }

    //
    // an IPv6 literal is only accepted with its surrounding brackets
    //
    cbSockAddr = sizeof(sockAddr);
    if (_I_WSAStringToAddressA((LPSTR)lpszHostName,
                               AF_INET6,
                               NULL,
                               (LPSOCKADDR)&sockAddr,
                               &cbSockAddr) == 0) {

        BOOL bBracketed = (*lpszHostName == '[')
            && (*(lpszHostName + lstrlen(lpszHostName) - 1) == ']');

        if (bBracketed) {
            //
            // a scope ID is fine unless the caller disallowed it
            //
            if (((dwFlags & IVHN_DISALLOW_IPV6_SCOPE_ID) == 0)
                || (sockAddr.sin6_scope_id == 0)) {
                return TRUE;
            }
        }
    }

    //
    // not accepted as a literal address: strict bad-character checking
    //
    for (pch = lpszHostName; *pch != '\0'; ++pch) {
        if (IS_UNSAFE_URL_CHARACTER(*pch, HOSTNAME)) {
            return FALSE;
        }
    }
    return TRUE;
}
BOOL DoesSchemeRequireSlashes( IN LPSTR lpszScheme, IN DWORD dwSchemeLength, IN BOOL bHasHostName )
/*++
Routine Description:
Determines whether a protocol scheme requires slashes. Schemes found in UrlSchemeList answer with their NeedSlashes setting; for any other scheme the answer is bHasHostName
Arguments:
lpszScheme - pointer to protocol scheme in question (does not include ':' or slashes, just scheme name)
dwSchemeLength - if not 0, string length of lpszScheme; if 0, lpszScheme is ASCIIZ and its length is measured here
bHasHostName - value returned when the scheme is not in UrlSchemeList
Return Value:
BOOL
--*/
{
    DWORD dwSchemeIndex;

    //
    // a length of 0 means "zero-terminated, please measure"
    //
    if (dwSchemeLength == 0) {
        dwSchemeLength = strlen(lpszScheme);
    }

    if (!ScanSchemes(lpszScheme, dwSchemeLength, &dwSchemeIndex)) {
        return bHasHostName;
    }
    return UrlSchemeList[dwSchemeIndex].NeedSlashes;
}
DWORD CrackUrl( IN OUT LPSTR lpszUrl, IN DWORD dwUrlLength, IN BOOL bEscape, OUT LPINTERNET_SCHEME lpSchemeType OPTIONAL, OUT LPSTR* lpszSchemeName OPTIONAL, OUT LPDWORD lpdwSchemeNameLength OPTIONAL, OUT LPSTR* lpszHostName OPTIONAL, OUT LPDWORD lpdwHostNameLength OPTIONAL, IN BOOL fUnescapeHostName, OUT LPINTERNET_PORT lpServerPort OPTIONAL, OUT LPSTR* lpszUserName OPTIONAL, OUT LPDWORD lpdwUserNameLength OPTIONAL, OUT LPSTR* lpszPassword OPTIONAL, OUT LPDWORD lpdwPasswordLength OPTIONAL, OUT LPSTR* lpszUrlPath OPTIONAL, OUT LPDWORD lpdwUrlPathLength OPTIONAL, OUT LPSTR* lpszExtraInfo OPTIONAL, OUT LPDWORD lpdwExtraInfoLength OPTIONAL, OUT LPBOOL pHavePort )
/*++
Routine Description:
Cracks an URL into its constituent parts. All returned strings are pointers into lpszUrl together with a length; nothing is copied and nothing is zero-terminated here
Assumes: 1. If one of the optional lpsz fields is present (e.g. lpszUserName) then the accompanying lpdw field must also be supplied
2. bEscape is no longer used/supported and must always be FALSE
3. lpszUrl is zero-terminated: GetUrlAddress() measures the text after the scheme with strlen()
Arguments:
lpszUrl - pointer to URL to crack. This buffer WILL BE OVERWRITTEN if the host name contains escape sequences and fUnescapeHostName == TRUE
dwUrlLength - if not 0, string length of lpszUrl
bEscape - must be FALSE
lpSchemeType - returned scheme type - e.g. INTERNET_SCHEME_HTTP
lpszSchemeName - returned scheme name
lpdwSchemeNameLength - length of scheme name
lpszHostName - returned host name
lpdwHostNameLength - length of host name
fUnescapeHostName - TRUE if escape sequences in the host name are to be decoded in situ
lpServerPort - returned server port if present in the URL, else INTERNET_INVALID_PORT_NUMBER
lpszUserName - returned user name if present
lpdwUserNameLength - length of user name
lpszPassword - returned password if present
lpdwPasswordLength - length of password
lpszUrlPath - returned URL path. If lpszExtraInfo is not supplied, the extra info stays appended to the path
lpdwUrlPathLength - length of url-path
lpszExtraInfo - returned search string or intra-page link (starting at the first '?' or '#') if present
lpdwExtraInfoLength - length of extra info
pHavePort - returned boolean indicating whether port was specified
Return Value:
DWORD Success - ERROR_SUCCESS
Failure - ERROR_INVALID_PARAMETER bEscape was TRUE
ERROR_WINHTTP_UNRECOGNIZED_SCHEME no ':' found, or the scheme is not in UrlSchemeList
ERROR_WINHTTP_INVALID_URL "://" present but not required by the scheme, or required but absent
any error returned by GetUrlAddress()
--*/
{
    LPSTR pSchemeEnd;
    DWORD cchScheme;
    DWORD dwSchemeIndex;
    DWORD cchPrefix;
    BOOL fNeedSlashes;
    BOOL fHaveSlashes;
    DWORD dwStatus;

    if (bEscape) {
        INET_ASSERT(!"bEscape==TRUE no longer supported for parseurl.cxx::CrackUrl()");
        return ERROR_INVALID_PARAMETER;
    }

    //
    // a length of 0 means lpszUrl is ASCIIZ: measure it
    //
    if (dwUrlLength == 0) {
        dwUrlLength = strlen(lpszUrl);
    }

    //
    // the scheme is everything up to the first ':' (ex: "SCHEME://host/path...")
    //
    pSchemeEnd = Utf8StrChr(lpszUrl, lpszUrl + dwUrlLength, ':');
    if (pSchemeEnd == NULL) {
        return ERROR_WINHTTP_UNRECOGNIZED_SCHEME;
    }
    cchScheme = (DWORD)(pSchemeEnd - lpszUrl);

    //
    // only schemes listed in UrlSchemeList are accepted
    //
    if (!ScanSchemes(lpszUrl, cchScheme, &dwSchemeIndex)) {
        return ERROR_WINHTTP_UNRECOGNIZED_SCHEME;
    }
    fNeedSlashes = UrlSchemeList[dwSchemeIndex].NeedSlashes;

    //
    // "://" only counts when at least one more character follows it
    //
    fHaveSlashes = (dwUrlLength - cchScheme > 3)
        && (memcmp(&lpszUrl[cchScheme], "://", 3) == 0);

    //
    // slashes must be present exactly when the scheme requires them
    //
    if ((fHaveSlashes != FALSE) != (fNeedSlashes != FALSE)) {
        return ERROR_WINHTTP_INVALID_URL;
    }

    //
    // the scheme is parsed: report it
    //
    if (ARGUMENT_PRESENT(lpSchemeType)) {
        *lpSchemeType = UrlSchemeList[dwSchemeIndex].SchemeType;
    }
    if (ARGUMENT_PRESENT(lpszSchemeName)) {
        *lpszSchemeName = lpszUrl;
        *lpdwSchemeNameLength = cchScheme;
    }

    //
    // step over the scheme and its "://" or ":" and crack the address part
    //
    cchPrefix = cchScheme + (fHaveSlashes ? 3 : 1);
    lpszUrl += cchPrefix;
    dwUrlLength -= cchPrefix;

    dwStatus = GetUrlAddress(&lpszUrl,
                             &dwUrlLength,
                             lpszUserName,
                             lpdwUserNameLength,
                             lpszPassword,
                             lpdwPasswordLength,
                             lpszHostName,
                             lpdwHostNameLength,
                             fUnescapeHostName,
                             lpServerPort,
                             pHavePort
                             );
    if (dwStatus != ERROR_SUCCESS) {
        return dwStatus;
    }

    //
    // the extra info runs from the first '#' or '?' to the end of the URL. It
    // is only split off the path if the caller asked for it
    //
    if (ARGUMENT_PRESENT(lpszExtraInfo)) {
        LPSTR pUrlEnd = lpszUrl + dwUrlLength;
        LPSTR pExtra = Utf8StrChrEx(lpszUrl, pUrlEnd, '#', '?');

        if (pExtra == NULL) {
            pExtra = pUrlEnd;
        }
        *lpszExtraInfo = pExtra;
        *lpdwExtraInfoLength = (DWORD)(pUrlEnd - pExtra);
        dwUrlLength -= *lpdwExtraInfoLength;
    }

    if (ARGUMENT_PRESENT(lpszUrlPath)) {
        *lpszUrlPath = lpszUrl;
        *lpdwUrlPathLength = dwUrlLength;
    }

    return ERROR_SUCCESS;
}
#define DEFAULT_REALLOC_SIZE 1024
DWORD EncodeUrlPath( IN DWORD Flags, IN DWORD SchemeFlags, IN LPSTR UrlPath, IN DWORD UrlPathLength, OUT LPSTR* pEncodedUrlPath, IN OUT LPDWORD EncodedUrlPathLength )
/*++
Routine Description:
Encodes an URL-path. That is, escapes the string. Writes a new zero-terminated URL-path in which every character classed as unsafe for SchemeFlags has been converted to a %XX escape sequence (upper-case hex digits). The output buffer is grown with REALLOCATE_MEMORY, DEFAULT_REALLOC_SIZE bytes at a time, whenever it is too small
Arguments:
Flags - controlling expansion. If NO_ENCODE_PATH_SEP is set, '/' is copied unmodified
SchemeFlags - which scheme we are encoding for - SCHEME_HTTP, etc.
UrlPath - pointer to the unescaped, zero-terminated string
UrlPathLength - not used; the input is read up to its terminating zero
pEncodedUrlPath - IN: pointer to buffer where encoded URL will be written OUT: the same or, if the buffer had to grow, the reallocated buffer
EncodedUrlPathLength - IN: size of *pEncodedUrlPath OUT: number of bytes written to *pEncodedUrlPath, not counting the terminating zero
Return Value:
DWORD Success - ERROR_SUCCESS
Failure - ERROR_NOT_ENOUGH_MEMORY the output buffer could not be grown. *pEncodedUrlPath is still the last valid buffer
--*/
{
    DWORD dwRemaining = *EncodedUrlPathLength;
    LPSTR pOut = *pEncodedUrlPath;

    UNREFERENCED_PARAMETER(UrlPathLength);

    for (;;) {

        UCHAR ch = (UCHAR)*UrlPath++;

        //
        // Make sure there is room before writing anything. A character can
        // expand to a 3 byte escape sequence and must still leave 1 byte for
        // the terminating zero; the terminator itself needs 1 byte. (Checking
        // for only 3 bytes here allowed the terminator to be written one byte
        // past the end of the buffer)
        //
        DWORD dwNeeded = (ch != 0) ? 4 : 1;

        if (dwRemaining < dwNeeded) {

            LPSTR pGrown = (LPSTR)REALLOCATE_MEMORY(*pEncodedUrlPath, *EncodedUrlPathLength + DEFAULT_REALLOC_SIZE);

            if (pGrown == NULL) {
                return ERROR_NOT_ENOUGH_MEMORY;
            }

            //
            // the buffer may have moved: re-base the write pointer at the same
            // offset (bytes written so far) in the new buffer
            //
            pOut = pGrown + (*EncodedUrlPathLength - dwRemaining);
            *pEncodedUrlPath = pGrown;
            dwRemaining += DEFAULT_REALLOC_SIZE;
            *EncodedUrlPathLength += DEFAULT_REALLOC_SIZE;
        }

        if (ch == 0) {
            break;
        }

        //
        // check whether this character is safe. '/' is copied unmodified if
        // the caller asked for that
        //
        if (IS_UNSAFE_URL_CHARACTER(ch, SchemeFlags)
            && !((ch == '/') && (Flags & NO_ENCODE_PATH_SEP))) {
            *pOut++ = '%';
            *pOut++ = (CHAR)NUMBER_TO_HEX_CHAR((int)ch / 16);
            *pOut++ = (CHAR)NUMBER_TO_HEX_CHAR((int)ch % 16);
            dwRemaining -= 3;
        } else {
            *pOut++ = (CHAR)ch;
            --dwRemaining;
        }
    }

    *pOut = '\0';

    //
    // total size minus what is left over == bytes written (without the zero)
    //
    *EncodedUrlPathLength -= dwRemaining;
    return ERROR_SUCCESS;
}
PRIVATE char HexCharToNumber( IN char ch )
/*++
Routine Description:
Converts an ANSI character in the range '0'..'9' 'A'..'F' 'a'..'f' to its corresponding hexadecimal value (0..f). The character is not validated: callers are expected to have checked it is a hex digit
Arguments:
ch - character to convert
Return Value:
char hexadecimal value of ch, as an 8-bit (signed) character value
--*/
{
    //
    // anything up to '9' is treated as a decimal digit
    //
    if (ch <= '9') {
        return (CHAR)(ch - '0');
    }

    //
    // lower-case letters sort after upper-case letters
    //
    if (ch >= 'a') {
        return (CHAR)((ch - 'a') + 10);
    }
    return (CHAR)((ch - 'A') + 10);
}
PRIVATE char NumberToHexChar( IN int Number )
/*++
Routine Description:
Converts a number in the range 0..15 to its ASCII character hex representation ('0'..'9', 'A'..'F'). The number is not range-checked
Arguments:
Number - to convert
Return Value:
char character in above range
--*/
{
    if (Number > 9) {
        return (char)('A' + (Number - 10));
    }
    return (char)('0' + Number);
}
DWORD DecodeUrl( IN LPSTR Url, IN DWORD UrlLength, OUT LPSTR DecodedString, IN OUT LPDWORD DecodedLength )
/*++
Routine Description:
Converts an URL string with embedded escape sequences (%xx) to a counted string
It is safe to pass the same pointer for the string to convert, and the buffer for the converted results: if the current character is not escaped, it just gets overwritten, else the input pointer is moved ahead 2 characters further than the output pointer, which is benign
Arguments:
Url - pointer to URL string to convert
UrlLength - number of characters in Url
DecodedString - pointer to buffer that receives converted string
DecodedLength - IN: number of characters in buffer OUT: number of characters converted (only updated on success)
Return Value:
DWORD Success - ERROR_SUCCESS
Failure - ERROR_WINHTTP_INVALID_URL a '%' is not followed, within the UrlLength characters supplied, by two hexadecimal digits
ERROR_INSUFFICIENT_BUFFER DecodedString isn't large enough to hold all the converted Url
--*/
{
    DWORD bufferRemaining = *DecodedLength;

    while (UrlLength && bufferRemaining) {

        char ch;

        if (*Url == '%') {

            //
            // An escape sequence is '%' plus 2 hex digits and all 3 characters
            // must lie inside the counted input: otherwise we would read past
            // the end of the input and "UrlLength -= 3" would wrap around.
            // The digits are cast to unsigned char because isxdigit() is not
            // defined for negative values
            //
            // BUGBUG - would %00 ever appear in an URL?
            //
            if ((UrlLength < 3)
                || !isxdigit((unsigned char)Url[1])
                || !isxdigit((unsigned char)Url[2])) {
                return ERROR_WINHTTP_INVALID_URL;
            }

            //
            // both digits are read before anything is written, so decoding
            // in place (Url == DecodedString) stays safe
            //
            ch = (char)((HexCharToNumber(Url[1]) << 4) | HexCharToNumber(Url[2]));
            Url += 3;
            UrlLength -= 3;
        } else {
            ch = *Url++;
            --UrlLength;
        }
        *DecodedString++ = ch;
        --bufferRemaining;
    }

    //
    // input left over means the output buffer filled up first
    //
    if (UrlLength != 0) {
        return ERROR_INSUFFICIENT_BUFFER;
    }
    *DecodedLength -= bufferRemaining;
    return ERROR_SUCCESS;
}
DWORD DecodeUrlInSitu( IN LPSTR BufferAddress, IN OUT LPDWORD BufferLength )
/*++
Routine Description:
Decodes an URL string, if it contains escape sequences. The conversion is done in place: an escape sequence (3 bytes) always converts to a single character (1 byte), so the decoded string is never longer than the original. A string without any '%' is left untouched
Arguments:
BufferAddress - pointer to the string to convert
BufferLength - IN: number of characters to convert OUT: length of converted string
Return Value:
DWORD Success - ERROR_SUCCESS
Failure - ERROR_WINHTTP_INVALID_URL ERROR_INSUFFICIENT_BUFFER (as returned by DecodeUrl())
--*/
{
    DWORD cchInput = *BufferLength;

    //
    // nothing to decode unless there is at least one escape character
    //
    if (memchr(BufferAddress, '%', cchInput) == NULL) {
        return ERROR_SUCCESS;
    }
    return DecodeUrl(BufferAddress,
                     cchInput,
                     BufferAddress,
                     BufferLength
                     );
}
DWORD DecodeUrlStringInSitu( IN LPSTR BufferAddress, IN OUT LPDWORD BufferLength )
/*++
Routine Description:
Performs DecodeUrlInSitu() on a string and, if that succeeds, zero terminates the result
Assumes: 1. Even if no decoding is performed, the buffer is large enough to fit an extra '\0' character after *BufferLength characters
Arguments:
BufferAddress - pointer to the string to convert
BufferLength - IN: number of characters to convert OUT: length of converted string, excluding '\0'
Return Value:
DWORD Success - ERROR_SUCCESS
Failure - ERROR_WINHTTP_INVALID_URL ERROR_INSUFFICIENT_BUFFER (no terminator is written in this case)
--*/
{
    DWORD dwStatus = DecodeUrlInSitu(BufferAddress, BufferLength);

    if (dwStatus != ERROR_SUCCESS) {
        return dwStatus;
    }
    BufferAddress[*BufferLength] = '\0';
    return ERROR_SUCCESS;
}
DWORD GetUrlAddressInfo( IN OUT LPSTR* Url, IN OUT LPDWORD UrlLength, OUT LPSTR* PartOne, OUT LPDWORD PartOneLength, OUT LPBOOL PartOneEscape, OUT LPSTR* PartTwo, OUT LPDWORD PartTwoLength, OUT LPBOOL PartTwoEscape )
/*++
Routine Description:
Given a string of the form foo:bar, splits them into 2 counted strings about the ':' character. The address string may or may not contain a ':'. Parsing stops at the first '/', at a '\0', or when *UrlLength characters have been consumed, whichever comes first
This function is intended to split into substrings the host:port and username:password strings commonly used in Internet address specifications and by association, in URLs
Handles IPv6 literal addresses in URLs surrounded by brackets "[ ]" as per RFC 2732: in "[foo]:bar" the first part is "[foo]". The brackets ARE returned as part of the string and counted.
Characters with the high bit set are rejected
Arguments:
Url - pointer to pointer to string containing URL. On output this is advanced past the address parts
UrlLength - pointer to length of URL in Url. On output this is reduced by the number of characters parsed
PartOne - pointer which will receive first part of address string (NULL if the part before the ':' is empty)
PartOneLength - pointer which will receive length of first part of address string
PartOneEscape - TRUE on output if PartOne contains a '%' (not set for a '%' inside a bracketed IPv6 literal)
PartTwo - pointer which will receive second part of address string (NULL if there is no ':')
PartTwoLength - pointer which will receive length of second part of address string
PartTwoEscape - TRUE on output if PartTwo contains a '%'
Return Value:
DWORD Success - ERROR_SUCCESS
Failure - ERROR_WINHTTP_INVALID_URL. The output parameters may have been partially written
--*/
{
    LPSTR pString = *Url;
    LPSTR pColon = NULL;
    DWORD partLength = 0;
    LPBOOL partEscape = PartOneEscape;
    DWORD length = *UrlLength;

    //
    // parse out <host>[:<port>] or <name>[:<password>] (i.e. <part1>[:<part2>])
    //

    *PartOne = pString;
    *PartOneLength = 0;
    *PartOneEscape = FALSE;
    *PartTwoEscape = FALSE;

    if ((length != 0) && (*pString == '[')) {

        //
        // If the first part starts with a '[' then we assume it's an IPv6
        // literal address and it must be terminated with a ']'.
        //
        // Note we DO NOT output PartOneEscape == TRUE if there is a % in
        // the IPv6 literal address designating a Scope ID.
        //

        for (;;) {
            if (*pString & ~0x7F) {
                return ERROR_WINHTTP_INVALID_URL;
            }
            ++partLength;
            ++pString;
            --length;
            if (length == 0) {

                //
                // ran out of input before the closing ']'
                //

                return ERROR_WINHTTP_INVALID_URL;
            }
            if (*pString == ']') {
                ++partLength;
                break;
            }
        }
        ++pString;
        --length;

        //
        // If there's more, then there should be a colon or forward slash
        // We allow http://[addr]/...
        //          http://[addr]:port/...
        // not
        //          http://[addr]junk/...
        //

        if ((length != 0) && (*pString != ':') && (*pString != '/')) {
            return ERROR_WINHTTP_INVALID_URL;
        }
    }

    //
    // The remaining length is tested BEFORE the character is examined, so a
    // counted string that is not zero-terminated is never read past its end
    //

    while ((length != 0) && (*pString != '/') && (*pString != '\0')) {
        if (*pString == '%') {

            //
            // if there is a % in the string then it *must* (RFC 1738) be the
            // start of an escape sequence. This function just reports the
            // address of the substrings and their lengths; calling functions
            // must handle the escape sequences (i.e. it is their responsibility
            // to decide where to put the results)
            //

            *partEscape = TRUE;
        }
        if (*pString == ':') {
            if (pColon != NULL) {

                //
                // we don't expect more than 1 ':'
                //

                return ERROR_WINHTTP_INVALID_URL;
            }

            //
            // close off part one and start counting part two
            //

            pColon = pString;
            *PartOneLength = partLength;
            if (partLength == 0) {
                *PartOne = NULL;
            }
            partLength = 0;
            partEscape = PartTwoEscape;
        } else {
            ++partLength;
        }

        if (*pString & ~0x7F) {
            return ERROR_WINHTTP_INVALID_URL;
        }

        ++pString;
        --length;
    }

    //
    // we either ended on the host (or user) name or the port number (or
    // password), one of which we don't know the length of
    //

    if (pColon == NULL) {
        *PartOneLength = partLength;
        *PartTwo = NULL;
        *PartTwoLength = 0;
        *PartTwoEscape = FALSE;
    } else {
        *PartTwoLength = partLength;
        *PartTwo = pColon + 1;

        //
        // N.B. a second part without a first part (e.g. the port in
        // http://:0/-http-gw-internal-/menu.gif) is deliberately NOT rejected
        // here; callers decide whether an empty first part is acceptable
        //
    }

    //
    // update the URL pointer and length remaining
    //

    *Url = pString;
    *UrlLength = length;

    return ERROR_SUCCESS;
}
DWORD GetUrlAddress( IN OUT LPSTR* lpszUrl, OUT LPDWORD lpdwUrlLength, OUT LPSTR* lpszUserName OPTIONAL, OUT LPDWORD lpdwUserNameLength OPTIONAL, OUT LPSTR* lpszPassword OPTIONAL, OUT LPDWORD lpdwPasswordLength OPTIONAL, OUT LPSTR* lpszHostName OPTIONAL, OUT LPDWORD lpdwHostNameLength OPTIONAL, IN BOOL fUnescapeHostName, OUT LPINTERNET_PORT lpPort OPTIONAL, OUT LPBOOL pHavePort )
/*++
Routine Description:
This function extracts any and all parts of the address information for a generic URL. Only the host name is ever unescaped (in situ), and only if fUnescapeHostName is TRUE
The generic addressing format (RFC 1738) is:
<user>:<password>@<host>:<port>
The user name and password are only looked for in front of an '@' that appears before the first '/'. The host name is mandatory
Although only the lpszUrl and lpdwUrlLength fields are required, the address parts will be checked for presence and completeness
Assumes: 1. If one of the optional lpsz fields is present (e.g. lpszUserName) then the accompanying lpdw field must also be supplied
2. *lpszUrl is zero-terminated: its length is measured with strlen(); the value in *lpdwUrlLength on input is not used
Arguments:
lpszUrl - IN: pointer to the URL to parse OUT: URL remaining after address information
N.B. The url-path is NOT canonicalized (unescaped) because it may contain protocol-specific information which must be parsed out by the protocol-specific parser
lpdwUrlLength - returned length of the remainder of the URL after the address information
lpszUserName - returned pointer to the user name. Can be omitted
lpdwUserNameLength - returned length of the user name part. Can be omitted
lpszPassword - returned pointer to the password. Can be omitted
lpdwPasswordLength - returned length of the password. Can be omitted
lpszHostName - returned pointer to the host name. Can be omitted
lpdwHostNameLength - returned length of the host name. Can be omitted
fUnescapeHostName - TRUE if escape sequences in the host name are to be decoded in situ (only done if lpszHostName is supplied)
lpPort - returned value of the port field, or INTERNET_INVALID_PORT_NUMBER if the URL has no port. Can be omitted
pHavePort - returned boolean indicating whether a port was specified in the URL or not. Can be omitted
Return Value:
DWORD Success - ERROR_SUCCESS
Failure - ERROR_WINHTTP_INVALID_URL We could not parse some part of the address info: no host name, a non-digit in the port, or a port larger than 65535
any error returned by GetUrlAddressInfo() or DecodeUrlInSitu()
--*/
{
    char portScratch[INTERNET_MAX_PORT_NUMBER_LENGTH + 1];
    char * pStart;
    char * pUrlEnd;
    char * pAuthorityEnd;
    char * pAt;
    char * pUser;
    char * pPass;
    DWORD cchUrl;
    DWORD cchUser = 0;
    DWORD cchPass = 0;
    LPSTR pRemainder;
    DWORD cchRemainder;
    LPSTR pHost;
    DWORD cchHost;
    LPSTR pPort;
    DWORD cchPort;
    BOOL fHostEscaped;
    BOOL fPortEscaped;
    DWORD dwStatus;

    pStart = *lpszUrl;
    cchUrl = strlen(pStart);
    pUrlEnd = pStart + cchUrl;

    //
    // the address part ends at the first '/' or, failing that, at the end of
    // the string
    //

    pAuthorityEnd = Utf8StrChr(pStart, pUrlEnd, '/');
    if (pAuthorityEnd == NULL) {
        pAuthorityEnd = pUrlEnd;
    }

    //
    // without an '@' before that point there is no username:password part;
    // both are then reported as empty strings located at the end of the
    // address part
    //

    pUser = pAuthorityEnd;
    pPass = pAuthorityEnd;
    pRemainder = pStart;

    pAt = Utf8StrChr(pStart, pAuthorityEnd, '@');
    if (pAt != NULL) {

        char * pSeparator = Utf8StrChr(pStart, pAt, ':');

        pUser = pStart;
        if (pSeparator == NULL) {

            //
            // user name only: the (empty) password sits at the '@'
            //

            cchUser = (DWORD)(pAt - pStart);
            pPass = pAt;
        } else {

            //
            // the password starts just after the ':'
            //

            cchUser = (DWORD)(pSeparator - pStart);
            pPass = pSeparator + 1;
        }
        cchPass = (DWORD)(pAt - pPass);
        pRemainder = pAt + 1;
    }

    if (ARGUMENT_PRESENT(lpszUserName)) {
        INET_ASSERT(ARGUMENT_PRESENT(lpdwUserNameLength));
        *lpszUserName = pUser;
        *lpdwUserNameLength = cchUser;
    }

    if (ARGUMENT_PRESENT(lpszPassword)) {
        INET_ASSERT(ARGUMENT_PRESENT(lpdwPasswordLength));
        *lpszPassword = pPass;
        *lpdwPasswordLength = cchPass;
    }

    //
    // now get the host name and the optional port. pPort and cchPort are
    // replaced by GetUrlAddressInfo() with a pointer into the URL and the
    // length of the port text
    //

    cchRemainder = (DWORD)(pUrlEnd - pRemainder);
    pPort = portScratch;
    cchPort = sizeof(portScratch);

    dwStatus = GetUrlAddressInfo(&pRemainder,
                                 &cchRemainder,
                                 &pHost,
                                 &cchHost,
                                 &fHostEscaped,
                                 &pPort,
                                 &cchPort,
                                 &fPortEscaped
                                 );
    if (dwStatus != ERROR_SUCCESS) {
        return dwStatus;
    }

    //
    // the URL address information MUST contain the host name
    //

    if ((pHost == NULL) || (cchHost == 0)) {
        return ERROR_WINHTTP_INVALID_URL;
    }

    if (ARGUMENT_PRESENT(lpszHostName)) {
        INET_ASSERT(ARGUMENT_PRESENT(lpdwHostNameLength));

        //
        // if the host name contains escaped characters, convert them in situ
        //

        if (fHostEscaped && fUnescapeHostName) {
            dwStatus = DecodeUrlInSitu(pHost, &cchHost);
            if (dwStatus != ERROR_SUCCESS) {
                return dwStatus;
            }
        }
        *lpszHostName = pHost;
        *lpdwHostNameLength = cchHost;
    }

    if (cchPort == 0) {

        //
        // no port in the URL
        //

        if (ARGUMENT_PRESENT(lpPort)) {
            *lpPort = INTERNET_INVALID_PORT_NUMBER;
        }
        if (ARGUMENT_PRESENT(pHavePort)) {
            *pHavePort = FALSE;
        }
    } else {

        LPSTR pPortEnd = pPort + cchPort;
        DWORD port = 0;

        INET_ASSERT(pPort != NULL);

        //
        // fPortEscaped can be ignored: a '%' is not a digit and is rejected
        // below. Every character must be a digit; the value is accumulated at
        // the same time and checked against 65535 ((2^16)-1) on every step so
        // that a long digit string cannot overflow the DWORD
        //

        for (; pPort != pPortEnd; ++pPort) {
            if (!isdigit(*pPort)) {
                return ERROR_WINHTTP_INVALID_URL;
            }
            port = port * 10 + (int)(*pPort - '0');
            if (port > 65535) {
                return ERROR_WINHTTP_INVALID_URL;
            }
        }

        if (ARGUMENT_PRESENT(lpPort)) {
            *lpPort = (INTERNET_PORT)port;
        }
        if (ARGUMENT_PRESENT(pHavePort)) {
            *pHavePort = TRUE;
        }
    }

    //
    // update the URL pointer and the length of the url-path
    //

    *lpszUrl = pRemainder;
    *lpdwUrlLength = cchRemainder;

    return ERROR_SUCCESS;
}
INTERNET_SCHEME
MapUrlSchemeName(
    IN LPSTR lpszSchemeName,
    IN DWORD dwSchemeNameLength
    )

/*++

Routine Description:

    Looks a scheme name up via ScanSchemes() and returns the SchemeType
    recorded in the matching UrlSchemeList entry

Arguments:

    lpszSchemeName      - pointer to the scheme name to look up

    dwSchemeNameLength  - number of characters in lpszSchemeName. If this is
                          (DWORD)-1 the length is obtained with lstrlen()

Return Value:

    INTERNET_SCHEME
        SchemeType of the matching UrlSchemeList entry, or
        INTERNET_SCHEME_UNKNOWN if ScanSchemes() reports no match

--*/

{
    DWORD nameLength = dwSchemeNameLength;
    DWORD schemeIndex;

    //
    // caller asked us to measure the string
    //

    if (nameLength == (DWORD)-1) {
        nameLength = (DWORD)lstrlen(lpszSchemeName);
    }

    if (!ScanSchemes(lpszSchemeName, nameLength, &schemeIndex)) {
        return INTERNET_SCHEME_UNKNOWN;
    }
    return UrlSchemeList[schemeIndex].SchemeType;
}
LPSTR
MapUrlScheme(
    IN INTERNET_SCHEME Scheme,
    OUT LPDWORD lpdwSchemeNameLength
    )

/*++

Routine Description:

    Returns the name and name length stored in UrlSchemeList for an
    enumerated scheme value

Arguments:

    Scheme                  - enumerated scheme value; used directly as the
                              index into UrlSchemeList

    lpdwSchemeNameLength    - receives the SchemeLength of the entry, or 0 if
                              Scheme is out of range

Return Value:

    LPSTR
        SchemeName of the entry, or NULL if Scheme lies outside
        INTERNET_SCHEME_FIRST..INTERNET_SCHEME_LAST

--*/

{
    //
    // reject anything outside the known range before indexing the table
    //

    if ((Scheme < INTERNET_SCHEME_FIRST) || (Scheme > INTERNET_SCHEME_LAST)) {
        *lpdwSchemeNameLength = 0;
        return NULL;
    }

    *lpdwSchemeNameLength = UrlSchemeList[Scheme].SchemeLength;
    return UrlSchemeList[Scheme].SchemeName;
}
LPSTR
MapUrlSchemeToName(
    IN INTERNET_SCHEME Scheme
    )

/*++

Routine Description:

    Returns the name stored in UrlSchemeList for an enumerated scheme value

Arguments:

    Scheme  - enumerated scheme value; used directly as the index into
              UrlSchemeList

Return Value:

    LPSTR
        SchemeName of the entry, or NULL if Scheme lies outside
        INTERNET_SCHEME_FIRST..INTERNET_SCHEME_LAST

--*/

{
    //
    // reject anything outside the known range before indexing the table
    //

    if ((Scheme < INTERNET_SCHEME_FIRST) || (Scheme > INTERNET_SCHEME_LAST)) {
        return NULL;
    }
    return UrlSchemeList[Scheme].SchemeName;
}
//
//
// UnsafeInPathAndQueryFlags: an entry is 1 if the character is unsafe in a URL path or query.
// The question mark is treated as safe.
// This table is faster than SafetyList because it requires no subtraction and no masking,
// and only one bounds check to access it.
//
//
const PRIVATE BYTE UnsafeInPathAndQueryFlags[128] = {

    // 0x00 - 0x0f: control characters, all unsafe
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 0x10 - 0x1f: control characters, all unsafe
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 0x20 - 0x2f: unsafe are space (20), double quote (22), hash (23), percent (25)
    1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0x30 - 0x3f: digits and punctuation; unsafe are less-than (3c), greater-than (3e)
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,

    // 0x40 - 0x4f: at-sign and A..O, all safe
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0x50 - 0x5f: P..Z safe; unsafe are left bracket (5b), backslash (5c),
    //              right bracket (5d), caret (5e); underscore (5f) safe
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,

    // 0x60 - 0x6f: backquote (60) unsafe; a..o safe
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0x70 - 0x7f: p..z safe; unsafe are left brace (7b), vertical bar (7c),
    //              right brace (7d), DEL (7f); tilde (7e) safe
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1
};
//
//
// ADD_HEX_TO_STRING appends ch in "%HH" format (two uppercase hex digits) to a given string and advances the string pointer.
// For use inside ConvertUnicodeToMultiByte only: it relies on that function's hexArray digit table.
//
//
// Expands to a brace-enclosed block, so call sites do not follow it with a
// semicolon. A hex digit table named hexArray must be visible where it is used.
#define ADD_HEX_TO_STRING(pStr, ch) \
    { \
        UCHAR uchHexByte = (UCHAR)(ch); \
        *(pStr)++ = '%'; \
        *(pStr)++ = hexArray[uchHexByte >> 4]; \
        *(pStr)++ = hexArray[uchHexByte & 0x0f]; \
    }
/*
* ConvertUnicodeToMultiByte: *
dwFlags:
WINHTTP_FLAG_VALID_HOSTNAME only for server name; fast conversion is performed, no escaping WINHTTP_FLAG_NULL_CODEPAGE assumes string contains only ASCII chars, fast conversion is performed WINHTTP_FLAG_ESCAPE_PERCENT if escaping enabled, escape percent as well WINHTTP_FLAG_ESCAPE_DISABLE disable escaping (if WINHTTP_FLAG_VALID_HOSTNAME not set) WINHTTP_FLAG_ESCAPE_DISABLE_QUERY if escaping enabled escape path part, but do not escape query
*/
DWORD ConvertUnicodeToMultiByte( LPCWSTR lpszObjectName, DWORD dwCodePage, MEMORYPACKET* pmp, DWORD dwFlags) { static CHAR* hexArray = "0123456789ABCDEF";
DWORD dwError = ERROR_SUCCESS; BOOL bPureAscii = TRUE; BOOL bTreatPercentAsSafe = (dwFlags & WINHTTP_FLAG_ESCAPE_PERCENT) ? FALSE : TRUE; BOOL bNeedEscaping = (dwFlags & WINHTTP_FLAG_ESCAPE_DISABLE) ? FALSE : TRUE; BOOL bEscapeQuery = (dwFlags & WINHTTP_FLAG_ESCAPE_DISABLE_QUERY) ? FALSE : TRUE;
//determine size of string and/or safe characters
DWORD dwUnsafeChars = 0; DWORD dwUnicodeUrlSize;
if (dwFlags & WINHTTP_FLAG_VALID_HOSTNAME) { bNeedEscaping = FALSE;
if (!IsValidHostNameW(lpszObjectName, 0)) { // 0 == allow v6 literal scope ids
dwError = ERROR_WINHTTP_INVALID_URL; goto done; }
dwUnicodeUrlSize = lstrlenW(lpszObjectName)+1;
} else if ((dwFlags & WINHTTP_FLAG_NULL_CODEPAGE) && !bNeedEscaping) { //if no escaping needed there is no need to calcaulate num of unsafe char
dwUnicodeUrlSize = lstrlenW(lpszObjectName)+1; } else { // optimization to check for unsafe characters, and optimize the common case.
// calculate the length, and while parsing the string, check if there are unsafeChars
PCWSTR pwStr;
if (bTreatPercentAsSafe) for(pwStr = lpszObjectName; *pwStr; ++pwStr) { UINT16 wc = *pwStr; if (wc <= 0x7f) { if (UnsafeInPathAndQueryFlags[wc] && (wc != L'%')) ++dwUnsafeChars; } else { bPureAscii = FALSE; ++dwUnsafeChars; } } else for(pwStr = lpszObjectName; *pwStr; ++pwStr) { UINT16 wc = *pwStr; if (wc <= 0x7f) { if (UnsafeInPathAndQueryFlags[wc]) ++dwUnsafeChars; } else { bPureAscii = FALSE; ++dwUnsafeChars; } }
dwUnicodeUrlSize = (DWORD)(pwStr-lpszObjectName+1); }
//convert to MBCS
if (bPureAscii) { pmp->dwAlloc = dwUnicodeUrlSize; if (bNeedEscaping) pmp->dwAlloc += 2 * dwUnsafeChars;
pmp->psStr = (LPSTR)ALLOCATE_FIXED_MEMORY(pmp->dwAlloc);
if (!pmp->psStr) { pmp->dwAlloc = 0; dwError = ERROR_NOT_ENOUGH_MEMORY; goto done; }
PSTR pStr = pmp->psStr; if (bNeedEscaping) { UCHAR chPercent = bTreatPercentAsSafe ? (UCHAR)'%' : (UCHAR)0;
if (bEscapeQuery) for (; *lpszObjectName; ++lpszObjectName) { UCHAR ch = (UCHAR)*lpszObjectName; if (!UnsafeInPathAndQueryFlags[ch] || (ch == chPercent)) *pStr++ = ch; else { ADD_HEX_TO_STRING (pStr, ch) } } else for (; *lpszObjectName && (*lpszObjectName != L'?'); ++lpszObjectName) { UCHAR ch = (UCHAR)*lpszObjectName; if (!UnsafeInPathAndQueryFlags[ch] || ch == chPercent) *pStr++ = ch; else { ADD_HEX_TO_STRING (pStr, ch) } } }
for (; *lpszObjectName; ++lpszObjectName) *pStr++ = (CHAR)*lpszObjectName; *pStr = '\0';
pmp->dwSize = (DWORD)(pStr - pmp->psStr); } else if (dwCodePage == CP_UTF8) { //converts to UTF8 and performs escaping at same time
pmp->dwAlloc = dwUnicodeUrlSize + (bNeedEscaping ? 8 : 2) * dwUnsafeChars; //yep, some extra allocation possible
pmp->psStr = (LPSTR)ALLOCATE_FIXED_MEMORY(pmp->dwAlloc);
if (!pmp->psStr) { pmp->dwAlloc = 0; dwError = ERROR_NOT_ENOUGH_MEMORY; goto done; }
PSTR pStr = pmp->psStr;
if (bNeedEscaping) { WCHAR wcPercent = bTreatPercentAsSafe ? L'%' : (WCHAR)0; WCHAR wcQMark = bEscapeQuery ? (WCHAR)0 : L'?';
for (; *lpszObjectName && (*lpszObjectName != wcQMark); ++lpszObjectName) { UINT16 wc = *lpszObjectName; if (wc <= 0x007f) // encode to one byte
{ if (!UnsafeInPathAndQueryFlags[wc] || wc == wcPercent) *pStr++ = (CHAR)wc; else { ADD_HEX_TO_STRING (pStr, wc) } } else if (wc <= 0x07FF) //encode to two bytes
{ ADD_HEX_TO_STRING (pStr, 0xC0 | (wc >> 6)) ADD_HEX_TO_STRING (pStr, 0x80 | (wc & 0x3F)) } else //encode to three bytes
{ ADD_HEX_TO_STRING (pStr, 0xe0 | (wc >> 12)) ADD_HEX_TO_STRING (pStr, 0x80 | ((wc >> 6) & 0x3F)) ADD_HEX_TO_STRING (pStr, 0x80 | (wc & 0x3F)) } } }
for (; *lpszObjectName; ++lpszObjectName) { UINT16 wc = *lpszObjectName; if (wc <= 0x007f) // encode to one byte
{ *pStr++ = (CHAR)wc; } else if (wc <= 0x07FF) //encode to two bytes
{ *pStr++ = (CHAR)(0xC0 | (wc >> 6)); *pStr++ = (CHAR)(0x80 | (wc & 0x3F)); //*(WORD*)pStr = (WORD)0x80C0 | (wc >> 6) | ((wc & 0x3F) << 8);
//pStr += 2;
} else //encode to three bytes
{ *pStr++ = (CHAR)(0xe0 | (wc >> 12)); *pStr++ = (CHAR)(0x80 | ((wc >> 6) & 0x3F)); *pStr++ = (CHAR)(0x80 | (wc & 0x3F)); //DWORD tmp = 0x8080e0 | (wc >> 12) | ((wc << 2) & 0x3f00) | (((DWORD)wc << 16) & 0x3f0000);
//*(DWORD*)pStr = tmp;
//pStr += 3;
} }
*pStr = '\0';
pmp->dwSize = (DWORD)(pStr - pmp->psStr); } else { //last and final, so not to loose perf don't set dwCodePage to values other then CP_UTF8 :)
// convert with WideCharToMultiByte()
pmp->dwAlloc = WideCharToMultiByte(dwCodePage, 0, lpszObjectName, dwUnicodeUrlSize, NULL, 0, NULL, NULL); if (!pmp->dwAlloc) { dwError = GetLastError(); goto done; }
pmp->psStr = (LPSTR)ALLOCATE_FIXED_MEMORY(pmp->dwAlloc);
if (!pmp->psStr) { pmp->dwAlloc = 0; dwError = ERROR_NOT_ENOUGH_MEMORY; goto done; }
//find out if query is present
PCHAR pchQMInConverted = NULL; DWORD dwQuerySize; if (bNeedEscaping) { WCHAR* pQM = wcschr(lpszObjectName, L'?'); if (pQM) { DWORD dwPathSize = 0; if (pQM != lpszObjectName) { dwPathSize = WideCharToMultiByte(dwCodePage, 0, lpszObjectName, (DWORD)(pQM - lpszObjectName), pmp->psStr, pmp->dwAlloc, NULL, NULL); if (!dwPathSize) { dwError = GetLastError(); goto done; } }
dwQuerySize = WideCharToMultiByte(dwCodePage, 0, pQM, dwUnicodeUrlSize - (DWORD)(pQM - lpszObjectName), pmp->psStr + dwPathSize, pmp->dwAlloc - dwPathSize, NULL, NULL);
if (!dwQuerySize) { dwError = GetLastError(); goto done; }
--dwQuerySize;
pmp->dwSize = dwPathSize + dwQuerySize; pchQMInConverted = pmp->psStr + dwPathSize; } }
if (!pchQMInConverted) { pmp->dwSize = WideCharToMultiByte(dwCodePage, 0, lpszObjectName, dwUnicodeUrlSize, pmp->psStr, pmp->dwAlloc, NULL, NULL);
if (!pmp->dwSize) { dwError = GetLastError(); goto done; } else --(pmp->dwSize); }
if (bNeedEscaping) { //collect information about code page
DWORD dwCharSize = 1;
if (dwCodePage != CP_UTF7) { CPINFO CPInfo; if (!GetCPInfo(dwCodePage, &CPInfo)) { dwError = GetLastError(); goto done; } dwCharSize = CPInfo.MaxCharSize; }
UCHAR chPercent = bTreatPercentAsSafe ? '%' : (UCHAR)0;
if (dwCharSize == 1) { dwUnsafeChars = 0;
//calculate number of unsafe chars
PSTR pStop = pchQMInConverted ? pchQMInConverted : (pmp->psStr + pmp->dwSize);
PSTR pStr = pmp->psStr; //this loop counts unsafe chars in path, count '?' as well
for(; pStr != pStop; ++pStr) { UCHAR ch = *pStr; if ((ch > 0x7F) || (UnsafeInPathAndQueryFlags[ch] && (ch != chPercent)) || (ch == '?')) ++dwUnsafeChars; } //this loop counts unsafe chars in query, do not count '?'
for(; *pStr; ++pStr) { UCHAR ch = *pStr; if ((ch > 0x7F) || (UnsafeInPathAndQueryFlags[ch] && (ch != chPercent))) ++dwUnsafeChars; }
if (dwUnsafeChars == 0) goto done;
//make new allocation
DWORD dwNewAlloc = pmp->dwAlloc + dwUnsafeChars*2; LPSTR pDest, pNewStr; pNewStr = pDest = (LPSTR)ALLOCATE_FIXED_MEMORY(dwNewAlloc); if (!pDest) { dwError = ERROR_NOT_ENOUGH_MEMORY; goto done; }
//escaping
//escape path part
pStr = pmp->psStr; for(; pStr != pStop; ++pStr) { UCHAR ch = *pStr; if ((ch <= 0x7F) && ((!UnsafeInPathAndQueryFlags[ch] && (ch != '?')) || (ch == chPercent))) *pDest++ = ch; else { ADD_HEX_TO_STRING (pDest, ch) } } //escape query part
for(; *pStr; ++pStr) { UCHAR ch = *pStr; if ((ch <= 0x7F) && (!UnsafeInPathAndQueryFlags[ch] || (ch == chPercent))) *pDest++ = ch; else { ADD_HEX_TO_STRING (pDest, ch) } } *pDest = '\0';
FREE_FIXED_MEMORY(pmp->psStr); pmp->psStr = pNewStr; pmp->dwSize = (DWORD)(pDest-pNewStr); pmp->dwAlloc = dwNewAlloc; } else { //well, string is mbcs
dwUnsafeChars = 0;
//calculate number of unsafe chars
PSTR pStop = pchQMInConverted ? pchQMInConverted : (pmp->psStr + pmp->dwSize);
PSTR pStr = pmp->psStr;
//this loop counts unsafe chars in path, count '?' as well
while (pStr != pStop) { UCHAR ch = *pStr; if (IsDBCSLeadByteEx(dwCodePage, ch)) { //do not allow percent here
if ((ch > 0x7F) || UnsafeInPathAndQueryFlags[ch] || (ch == '?')) ++dwUnsafeChars; ++pStr; ch = *pStr; if ((ch > 0x7F) || UnsafeInPathAndQueryFlags[ch] || (ch == '?')) ++dwUnsafeChars; ++pStr; } else { if ((ch > 0x7F) || (UnsafeInPathAndQueryFlags[ch] && (ch != chPercent)) || (ch == '?')) ++dwUnsafeChars; ++pStr; } } //this loop counts unsafe chars in query, do not count '?'
while(*pStr) { UCHAR ch = *pStr; if (IsDBCSLeadByteEx(dwCodePage, ch)) { //do not allow percent here
if ((ch > 0x7F) || UnsafeInPathAndQueryFlags[ch]) ++dwUnsafeChars; ++pStr; ch = *pStr; if ((ch > 0x7F) || UnsafeInPathAndQueryFlags[ch]) ++dwUnsafeChars; ++pStr; } else { if ((ch > 0x7F) || (UnsafeInPathAndQueryFlags[ch] && (ch != chPercent))) ++dwUnsafeChars; ++pStr; } }
if (dwUnsafeChars == 0) goto done;
//make new allocation
DWORD dwNewAlloc = pmp->dwAlloc + dwUnsafeChars*2; LPSTR pDest, pNewStr; pNewStr = pDest = (LPSTR)ALLOCATE_FIXED_MEMORY(dwNewAlloc); if (!pDest) { dwError = ERROR_NOT_ENOUGH_MEMORY; goto done; }
//escaping
//escape path part
pStr = pmp->psStr; while (pStr != pStop) { UCHAR ch = *pStr; if (IsDBCSLeadByteEx(dwCodePage, ch)) { //do not allow percent here
if ((ch <= 0x7F) && !UnsafeInPathAndQueryFlags[ch] && (ch != '?')) *pDest++ = ch; else { ADD_HEX_TO_STRING (pDest, ch) } ++pStr; ch = *pStr; if ((ch <= 0x7F) && !UnsafeInPathAndQueryFlags[ch] && (ch != '?')) *pDest++ = ch; else { ADD_HEX_TO_STRING (pDest, ch) } ++pStr; } else { if ((ch <= 0x7F) && ((!UnsafeInPathAndQueryFlags[ch] && (ch != '?')) || (ch == chPercent))) *pDest++ = ch; else { ADD_HEX_TO_STRING (pDest, ch) } ++pStr; } }
//escape query part
while (*pStr) { UCHAR ch = *pStr; if (IsDBCSLeadByteEx(dwCodePage, ch)) { //do not allow percent here
if ((ch <= 0x7F) && !UnsafeInPathAndQueryFlags[ch]) *pDest++ = ch; else { ADD_HEX_TO_STRING (pDest, ch) } ++pStr; ch = *pStr; if ((ch <= 0x7F) && !UnsafeInPathAndQueryFlags[ch]) *pDest++ = ch; else { ADD_HEX_TO_STRING (pDest, ch) } ++pStr; } else { if ((ch <= 0x7F) && (!UnsafeInPathAndQueryFlags[ch] || (ch == chPercent))) *pDest++ = ch; else { ADD_HEX_TO_STRING (pDest, ch) } ++pStr; } }
*pDest = '\0';
FREE_FIXED_MEMORY(pmp->psStr); pmp->psStr = pNewStr; pmp->dwSize = (DWORD)(pDest-pNewStr); pmp->dwAlloc = dwNewAlloc; } } } done: if (pmp->psStr) pmp->dwAlloc = (pmp->dwAlloc > MP_MAX_STACK_USE) ? pmp->dwAlloc : MP_MAX_STACK_USE+1;// to force FREE in ~MEMORYPACKET
return dwError; }
|