/*++ Copyright (c) 1995 Microsoft Corporation Module Name: parseurl.cxx Abstract: Contains functions to parse the basic URLs - FTP, Gopher, HTTP. An URL parser simply acts as a macro: it must break out the protocol-specific information from the URL and initiate opening the identified resource: all this can be accomplished by calling the relevant Internet protocol APIs. Code in this module is based on RFC1738 Contents: IsValidUrl DoesSchemeRequireSlashes ParseUrl CrackUrl EncodeUrlPath (HexCharToNumber) (NumberToHexChar) DecodeUrl DecodeUrlInSitu DecodeUrlStringInSitu GetUrlAddressInfo GetUrlAddress MapUrlSchemeName MapUrlScheme MapUrlSchemeToName Author: Richard L Firth (rfirth) 26-Apr-1995 Environment: Win32(s) user-mode DLL Revision History: 26-Apr-1995 Created --*/ #include // // private manifests // #define RESERVED SAFE // // private macros // //#define HEX_CHAR_TO_NUMBER(ch) \ // ((ch <= '9') \ // ? (ch - '0') \ // : ((ch >= 'a') \ // ? ((ch - 'a') + 10) \ // : ((ch - 'A') + 10))) #define NUMBER_TO_HEX_CHAR(n) \ (((n) <= 9) ? ((char)(n) + '0') : (((char)(n) - 10) + 'A')) #define IS_UNSAFE_URL_CHARACTER(Char, Scheme) \ (((UCHAR)(Char) <= 0x20) || ((UCHAR)(Char) >= 0x7f) \ || (SafetyList[(Char) - 0x21] & (UNSAFE | Scheme))) #define IS_UNSAFE_URL_WIDECHARACTER(wChar, Scheme) \ (((WCHAR)(wChar) <= 0x0020) || ((WCHAR)(wChar) >= 0x007f) \ || (SafetyList[(wChar) - 0x0021] & (UNSAFE | Scheme))) // // private types // // // private prototypes // PRIVATE char HexCharToNumber( IN char ch ); PRIVATE char NumberToHexChar( IN int Number ); // // private data // // // SafetyList - the list of characters above 0x20 and below 0x7f that are // classified as safe, unsafe or scheme-specific. Safe characters do not need // to be escaped for any URL scheme. Unsafe characters must be escaped for all // URL schemes. Scheme-specific characters need only be escaped for the relevant // scheme(s) // const PRIVATE UCHAR SafetyList[] = { // // UNSAFE: 0x00..0x20 // SAFE | HOSTNAME, // 0x21 (!) UNSAFE, // 0x22 (") UNSAFE, // 0x23 (#) SAFE | HOSTNAME, // 0x24 ($) UNSAFE, // 0x25 (%) RESERVED | HOSTNAME, // 0x26 (&) SAFE | HOSTNAME, // 0x27 (') SAFE | HOSTNAME, // 0x28 (() SAFE | HOSTNAME, // 0x29 ()) SAFE | HOSTNAME, // 0x2A (*) SCHEME_GOPHER | HOSTNAME, // 0x2B (+) SAFE | HOSTNAME, // 0x2C (,) SAFE, // 0x2D (-) SAFE, // 0x2E (.) RESERVED | HOSTNAME, // 0x2F (/) SAFE, // 0x30 (0) SAFE, // 0x31 (1) SAFE, // 0x32 (2) SAFE, // 0x33 (3) SAFE, // 0x34 (4) SAFE, // 0x35 (5) SAFE, // 0x36 (6) SAFE, // 0x37 (7) SAFE, // 0x38 (8) SAFE, // 0x39 (9) RESERVED | HOSTNAME, // 0x3A (:) RESERVED | HOSTNAME, // 0x3B (;) UNSAFE, // 0x3C (<) RESERVED | HOSTNAME, // 0x3D (=) UNSAFE, // 0x3E (>) RESERVED | SCHEME_GOPHER | HOSTNAME, // 0x3F (?) RESERVED | HOSTNAME, // 0x40 (@) SAFE, // 0x41 (A) SAFE, // 0x42 (B) SAFE, // 0x43 (C) SAFE, // 0x44 (D) SAFE, // 0x45 (E) SAFE, // 0x46 (F) SAFE, // 0x47 (G) SAFE, // 0x48 (H) SAFE, // 0x49 (I) SAFE, // 0x4A (J) SAFE, // 0x4B (K) SAFE, // 0x4C (L) SAFE, // 0x4D (M) SAFE, // 0x4E (N) SAFE, // 0x4F (O) SAFE, // 0x50 (P) SAFE, // 0x51 (Q) SAFE, // 0x42 (R) SAFE, // 0x43 (S) SAFE, // 0x44 (T) SAFE, // 0x45 (U) SAFE, // 0x46 (V) SAFE, // 0x47 (W) SAFE, // 0x48 (X) SAFE, // 0x49 (Y) SAFE, // 0x5A (Z) UNSAFE, // 0x5B ([) UNSAFE, // 0x5C (\) UNSAFE, // 0x5D (]) UNSAFE, // 0x5E (^) SAFE, // 0x5F (_) UNSAFE, // 0x60 (`) SAFE, // 0x61 (a) SAFE, // 0x62 (b) SAFE, // 0x63 (c) SAFE, // 0x64 (d) SAFE, // 0x65 (e) SAFE, // 0x66 (f) SAFE, // 0x67 (g) SAFE, // 0x68 (h) SAFE, // 0x69 (i) SAFE, // 0x6A (j) SAFE, // 0x6B (k) SAFE, // 0x6C (l) SAFE, // 0x6D (m) SAFE, // 0x6E (n) SAFE, // 0x6F (o) SAFE, // 0x70 (p) SAFE, // 0x71 (q) SAFE, // 0x72 (r) SAFE, // 0x73 (s) SAFE, // 0x74 (t) SAFE, // 0x75 (u) SAFE, // 0x76 (v) SAFE, // 0x77 (w) SAFE, // 0x78 (x) SAFE, // 0x79 (y) SAFE, // 0x7A (z) UNSAFE, // 0x7B ({) UNSAFE, // 0x7C (|) UNSAFE, // 0x7D (}) UNSAFE // 0x7E (~) // // UNSAFE: 0x7F..0xFF // }; // // UrlSchemeList - the list of schemes that we support // typedef struct { LPSTR SchemeName; DWORD SchemeLength; INTERNET_SCHEME SchemeType; DWORD SchemeFlags; BOOL NeedSlashes; DWORD OpenFlags; } URL_SCHEME_INFO; const PRIVATE URL_SCHEME_INFO UrlSchemeList[] = { NULL, 0, INTERNET_SCHEME_DEFAULT, 0, FALSE, 0, "http", 4, INTERNET_SCHEME_HTTP, SCHEME_HTTP, TRUE, 0, "https", 5, INTERNET_SCHEME_HTTPS, SCHEME_HTTP, TRUE, WINHTTP_FLAG_SECURE, }; #define NUMBER_OF_URL_SCHEMES ARRAY_ELEMENTS(UrlSchemeList) BOOL ScanSchemes(LPTSTR pszToCheck, DWORD ccStr, PDWORD pwResult) { for (DWORD i=0; i 3) && (memcmp(&lpszUrl[schemeLength], "://", 3) == 0)) { skip = 3; // skip "://" haveSlashes = TRUE; } // // If we don't have slashes, make sure we don't need them. // If we have slashes, make sure they are required. // if ((!haveSlashes && !needSlashes) || (haveSlashes && needSlashes)) { if (ARGUMENT_PRESENT(lpSchemeType)) { *lpSchemeType = schemeType; } if (ARGUMENT_PRESENT(lpszSchemeName)) { *lpszSchemeName = lpszUrl; *lpdwSchemeNameLength = schemeLength; } lpszUrl += schemeLength + skip; dwUrlLength -= skip; if (isGeneric) { if (ARGUMENT_PRESENT(lpszUserName)) { *lpszUserName = NULL; *lpdwUserNameLength = 0; } if (ARGUMENT_PRESENT(lpszPassword)) { *lpszPassword = NULL; *lpdwPasswordLength = 0; } if (ARGUMENT_PRESENT(lpszHostName)) { *lpszHostName = NULL; *lpdwHostNameLength = 0; } if (ARGUMENT_PRESENT(lpServerPort)) { *lpServerPort = 0; } error = ERROR_SUCCESS; } else { error = GetUrlAddress(&lpszUrl, &dwUrlLength, lpszUserName, lpdwUserNameLength, lpszPassword, lpdwPasswordLength, lpszHostName, lpdwHostNameLength, lpServerPort, pHavePort ); } if (bEscape && (error == ERROR_SUCCESS)) { error = DecodeUrlInSitu(lpszUrl, &dwUrlLength); } if ((error == ERROR_SUCCESS) && ARGUMENT_PRESENT(lpszExtraInfo)) { *lpdwExtraInfoLength = 0; for (i = 0; i < (int)dwUrlLength; i++) { if (lpszUrl[i] == '?' || lpszUrl[i] == '#') { *lpszExtraInfo = &lpszUrl[i]; *lpdwExtraInfoLength = dwUrlLength - i; dwUrlLength -= *lpdwExtraInfoLength; } } } if ((error == ERROR_SUCCESS) && ARGUMENT_PRESENT(lpszUrlPath)) { *lpszUrlPath = lpszUrl; *lpdwUrlPathLength = dwUrlLength; } } else { error = ERROR_WINHTTP_UNRECOGNIZED_SCHEME; } quit: return error; } #define DEFAULT_REALLOC_SIZE 1024 DWORD EncodeUrlPath( IN DWORD Flags, IN DWORD SchemeFlags, IN LPSTR UrlPath, IN DWORD UrlPathLength, OUT LPSTR* pEncodedUrlPath, IN OUT LPDWORD EncodedUrlPathLength ) /*++ Routine Description: Encodes an URL-path. That is, escapes the string. Creates a new URL-path in which all the 'unsafe' and reserved characters for this scheme have been converted to escape sequences Arguments: Flags - controlling expansion SchemeFlags - which scheme we are encoding for - SCHEME_HTTP, etc. UrlPath - pointer to the unescaped string UrlPathLength - length of Url EncodedUrlPath - pointer to buffer where encoded URL will be written EncodedUrlPathLength - IN: size of EncodedUrlPath OUT: number of bytes written to EncodedUrlPath Return Value: DWORD Success - ERROR_SUCCESS Failure - ERROR_INSUFFICIENT_BUFFER UrlPathLength not large enough to store encoded URL path --*/ { DWORD error; DWORD len; len = *EncodedUrlPathLength; LPSTR EncodedUrlPath = *pEncodedUrlPath; UCHAR ch; while(ch = (UCHAR)*UrlPath++) { // // check whether this character is safe. For now, we encode all unsafe // and scheme-specific characters the same way (i.e. irrespective of // scheme) // // We are allowing '/' to be copied unmodified // if (len < 3) { LPSTR pStr = (LPSTR)REALLOCATE_MEMORY(*pEncodedUrlPath, *EncodedUrlPathLength+DEFAULT_REALLOC_SIZE, LMEM_MOVEABLE); if (pStr) { EncodedUrlPath = pStr+*EncodedUrlPathLength-len; *pEncodedUrlPath = pStr; len += DEFAULT_REALLOC_SIZE; *EncodedUrlPathLength += DEFAULT_REALLOC_SIZE; } else { goto error; } } if (IS_UNSAFE_URL_CHARACTER(ch, SchemeFlags) && !((ch == '/') && (Flags & NO_ENCODE_PATH_SEP))) { *EncodedUrlPath++ = '%'; //*EncodedUrlPath++ = NumberToHexChar((int)ch / 16); *EncodedUrlPath++ = NUMBER_TO_HEX_CHAR((int)ch / 16); //*EncodedUrlPath++ = NumberToHexChar((int)ch % 16); *EncodedUrlPath++ = NUMBER_TO_HEX_CHAR((int)ch % 16); len -= 2; // extra --len below } else { *EncodedUrlPath++ = (signed char)ch; } --len; } *EncodedUrlPath = '\0'; *EncodedUrlPathLength -= len; error = ERROR_SUCCESS; quit: return error; error: error = ERROR_INSUFFICIENT_BUFFER; goto quit; } PRIVATE char HexCharToNumber( IN char ch ) /*++ Routine Description: Converts an ANSI character in the range '0'..'9' 'A'..'F' 'a'..'f' to its corresponding hexadecimal value (0..f) Arguments: ch - character to convert Return Value: char hexadecimal value of ch, as an 8-bit (signed) character value --*/ { return (ch <= '9') ? (ch - '0') : ((ch >= 'a') ? ((ch - 'a') + 10) : ((ch - 'A') + 10)); } PRIVATE char NumberToHexChar( IN int Number ) /*++ Routine Description: Converts a number in the range 0..15 to its ASCII character hex representation ('0'..'F') Arguments: Number - to convert Return Value: char character in above range --*/ { return (Number <= 9) ? (char)('0' + Number) : (char)('A' + (Number - 10)); } DWORD DecodeUrl( IN LPSTR Url, IN DWORD UrlLength, OUT LPSTR DecodedString, IN OUT LPDWORD DecodedLength ) /*++ Routine Description: Converts an URL string with embedded escape sequences (%xx) to a counted string It is safe to pass the same pointer for the string to convert, and the buffer for the converted results: if the current character is not escaped, it just gets overwritten, else the input pointer is moved ahead 2 characters further than the output pointer, which is benign Arguments: Url - pointer to URL string to convert UrlLength - number of characters in UrlString DecodedString - pointer to buffer that receives converted string DecodedLength - IN: number of characters in buffer OUT: number of characters converted Return Value: DWORD Success - ERROR_SUCCESS Failure - ERROR_WINHTTP_INVALID_URL UrlString couldn't be converted ERROR_INSUFFICIENT_BUFFER ConvertedString isn't large enough to hold all the converted UrlString --*/ { DWORD bufferRemaining; bufferRemaining = *DecodedLength; while (UrlLength && bufferRemaining) { char ch; if (*Url == '%') { // // BUGBUG - would %00 ever appear in an URL? // ++Url; if (isxdigit(*Url)) { ch = HexCharToNumber(*Url++) << 4; if (isxdigit(*Url)) { ch |= HexCharToNumber(*Url++); } else { return ERROR_WINHTTP_INVALID_URL; } } else { return ERROR_WINHTTP_INVALID_URL; } UrlLength -= 3; } else { ch = *Url++; --UrlLength; } *DecodedString++ = ch; --bufferRemaining; } if (UrlLength == 0) { *DecodedLength -= bufferRemaining; return ERROR_SUCCESS; } else { return ERROR_INSUFFICIENT_BUFFER; } } DWORD DecodeUrlInSitu( IN LPSTR BufferAddress, IN OUT LPDWORD BufferLength ) /*++ Routine Description: Decodes an URL string, if it contains escape sequences. The conversion is done in place, since we know that a string containing escapes is longer than the string with escape sequences (3 bytes) converted to characters (1 byte) Arguments: BufferAddress - pointer to the string to convert BufferLength - IN: number of characters to convert OUT: length of converted string Return Value: DWORD Success - ERROR_SUCCESS Failure - ERROR_WINHTTP_INVALID_URL ERROR_INSUFFICIENT_BUFFER --*/ { DWORD stringLength; stringLength = *BufferLength; if (memchr(BufferAddress, '%', stringLength)) { return DecodeUrl(BufferAddress, stringLength, BufferAddress, BufferLength ); } else { // // no escape character in the string, just return success // return ERROR_SUCCESS; } } DWORD DecodeUrlStringInSitu( IN LPSTR BufferAddress, IN OUT LPDWORD BufferLength ) /*++ Routine Description: Performs DecodeUrlInSitu() on a string and zero terminates it Assumes: 1. Even if no decoding is performed, *BufferLength is large enough to fit an extra '\0' character Arguments: BufferAddress - pointer to the string to convert BufferLength - IN: number of characters to convert OUT: length of converted string, excluding '\0' Return Value: DWORD Success - ERROR_SUCCESS Failure - ERROR_WINHTTP_INVALID_URL ERROR_INSUFFICIENT_BUFFER --*/ { DWORD error; error = DecodeUrlInSitu(BufferAddress, BufferLength); if (error == ERROR_SUCCESS) { BufferAddress[*BufferLength] = '\0'; } return error; } DWORD GetUrlAddressInfo( IN OUT LPSTR* Url, IN OUT LPDWORD UrlLength, OUT LPSTR* PartOne, OUT LPDWORD PartOneLength, OUT LPBOOL PartOneEscape, OUT LPSTR* PartTwo, OUT LPDWORD PartTwoLength, OUT LPBOOL PartTwoEscape ) /*++ Routine Description: Given a string of the form foo:bar, splits them into 2 counted strings about the ':' character. The address string may or may not contain a ':'. This function is intended to split into substrings the host:port and username:password strings commonly used in Internet address specifications and by association, in URLs Arguments: Url - pointer to pointer to string containing URL. On output this is advanced past the address parts UrlLength - pointer to length of URL in UrlString. On output this is reduced by the number of characters parsed PartOne - pointer which will receive first part of address string PartOneLength - pointer which will receive length of first part of address string PartOneEscape - TRUE on output if PartOne contains escape sequences PartTwo - pointer which will receive second part of address string PartTwoLength - pointer which will receive length of second part of address string PartOneEscape - TRUE on output if PartTwo contains escape sequences Return Value: DWORD Success - ERROR_SUCCESS Failure - ERROR_WINHTTP_INVALID_URL --*/ { LPSTR pString; LPSTR pColon; DWORD partLength; LPBOOL partEscape; DWORD length; // // parse out [:] or [:] (i.e. [:] // pString = *Url; pColon = NULL; partLength = 0; *PartOne = pString; *PartOneLength = 0; *PartOneEscape = FALSE; *PartTwoEscape = FALSE; partEscape = PartOneEscape; length = *UrlLength; while ((*pString != '/') && (*pString != '\0') && (length != 0)) { if (*pString == '%') { // // if there is a % in the string then it *must* (RFC 1738) be the // start of an escape sequence. This function just reports the // address of the substrings and their lengths; calling functions // must handle the escape sequences (i.e. it is their responsibility // to decide where to put the results) // *partEscape = TRUE; } if (*pString == ':') { if (pColon != NULL) { // // we don't expect more than 1 ':' // return ERROR_WINHTTP_INVALID_URL; } pColon = pString; *PartOneLength = partLength; if (partLength == 0) { *PartOne = NULL; } partLength = 0; partEscape = PartTwoEscape; } else { ++partLength; } ++pString; --length; } // // we either ended on the host (or user) name or the port number (or // password), one of which we don't know the length of // if (pColon == NULL) { *PartOneLength = partLength; *PartTwo = NULL; *PartTwoLength = 0; *PartTwoEscape = FALSE; } else { *PartTwoLength = partLength; *PartTwo = pColon + 1; // // in both the : and : cases, we cannot have // the second part without the first, although both parts being zero // length is OK (host name will be sorted out elsewhere, but (for now, // at least) I am allowing <>:<> for username:password, since I don't // see it expressly disallowed in the RFC. I may be revisiting this code // later...) // // N.B.: ftp://ftp.microsoft.com uses http://:0/-http-gw-internal-/menu.gif // if ((*PartOneLength == 0) && (partLength != 0)) { // return ERROR_WINHTTP_INVALID_URL; // } } // // update the URL pointer and length remaining // *Url = pString; *UrlLength = length; return ERROR_SUCCESS; } DWORD GetUrlAddress( IN OUT LPSTR* lpszUrl, OUT LPDWORD lpdwUrlLength, OUT LPSTR* lpszUserName OPTIONAL, OUT LPDWORD lpdwUserNameLength OPTIONAL, OUT LPSTR* lpszPassword OPTIONAL, OUT LPDWORD lpdwPasswordLength OPTIONAL, OUT LPSTR* lpszHostName OPTIONAL, OUT LPDWORD lpdwHostNameLength OPTIONAL, OUT LPINTERNET_PORT lpPort OPTIONAL, OUT LPBOOL pHavePort ) /*++ Routine Description: This function extracts any and all parts of the address information for a generic URL. If any of the address parts contain escaped characters (%nn) then they are converted in situ The generic addressing format (RFC 1738) is: :@: The addressing information cannot contain a password without a user name, or a port without a host name NB: ftp://ftp.microsoft.com uses URL's that have a port without a host name! (e.g. http://:0/-http-gw-internal-/menu.gif) Although only the lpszUrl and lpdwUrlLength fields are required, the address parts will be checked for presence and completeness Assumes: 1. If one of the optional lpsz fields is present (e.g. lpszUserName) then the accompanying lpdw field must also be supplied Arguments: lpszUrl - IN: pointer to the URL to parse OUT: URL remaining after address information N.B. The url-path is NOT canonicalized (unescaped) because it may contain protocol-specific information which must be parsed out by the protocol-specific parser lpdwUrlLength - returned length of the remainder of the URL after the address information lpszUserName - returned pointer to the user name This parameter can be omitted by those protocol parsers that do not require or expect user names in the URL lpdwUserNameLength - returned length of the user name part This parameter can be omitted by those protocol parsers that do not require or expect user names in the URL lpszPassword - returned pointer to the password This parameter can be omitted by those protocol parsers that do not require or expect user passwords in the URL lpdwPasswordLength - returned length of the password This parameter can be omitted by those protocol parsers that do not require or expect user passwords in the URL lpszHostName - returned pointer to the host name This parameter can be omitted by those protocol parsers that do not require the host name info lpdwHostNameLength - returned length of the host name This parameter can be omitted by those protocol parsers that do not require the host name info lpPort - returned value of the port field This parameter can be omitted by those protocol parsers that do not require or expect user port number pHavePort - returned boolean indicating whether a port was specified in the URL or not. This value is not returned if the lpPort parameter is omitted. Return Value: DWORD Success - ERROR_SUCCESS Failure - ERROR_WINHTTP_INVALID_URL We could not parse some part of the address info, or we found address info where the protocol parser didn't expect any ERROR_INSUFFICIENT_BUFFER We could not convert an escaped string --*/ { LPSTR pAt; DWORD urlLength; LPSTR pUrl; BOOL part1Escape; BOOL part2Escape; char portNumber[INTERNET_MAX_PORT_NUMBER_LENGTH + 1]; DWORD portNumberLength; LPSTR pPortNumber; DWORD error; LPSTR hostName; DWORD hostNameLength; pUrl = *lpszUrl; urlLength = strlen(pUrl); // // check to see if there is an '@' separating user name & password. If we // see a '/' or get to the end of the string before we see the '@' then // there is no username:password part // pAt = NULL; for (DWORD i = 0; i < urlLength; ++i) { if (pUrl[i] == '/') { break; } else if (pUrl[i] == '@') { pAt = &pUrl[i]; break; } } if (pAt != NULL) { DWORD addressPartLength; LPSTR userName; DWORD userNameLength; LPSTR password; DWORD passwordLength; addressPartLength = (DWORD) (pAt - pUrl); urlLength -= addressPartLength; error = GetUrlAddressInfo(&pUrl, &addressPartLength, &userName, &userNameLength, &part1Escape, &password, &passwordLength, &part2Escape ); if (error != ERROR_SUCCESS) { return error; } // // ensure there is no address information unparsed before the '@' // INET_ASSERT(addressPartLength == 0); INET_ASSERT(pUrl == pAt); if (ARGUMENT_PRESENT(lpszUserName)) { INET_ASSERT(ARGUMENT_PRESENT(lpdwUserNameLength)); // // convert the user name in situ // if (part1Escape) { INET_ASSERT(userName != NULL); INET_ASSERT(userNameLength != 0); error = DecodeUrlInSitu(userName, &userNameLength); if (error != ERROR_SUCCESS) { return error; } } *lpszUserName = userName; *lpdwUserNameLength = userNameLength; } if (ARGUMENT_PRESENT(lpszPassword)) { // // convert the password in situ // if (part2Escape) { INET_ASSERT(userName != NULL); INET_ASSERT(userNameLength != 0); INET_ASSERT(password != NULL); INET_ASSERT(passwordLength != 0); error = DecodeUrlInSitu(password, &passwordLength); if (error != ERROR_SUCCESS) { return error; } } *lpszPassword = password; *lpdwPasswordLength = passwordLength; } // // the URL pointer now points at the host:port fields (remember that // ExtractAddressParts() must have bumped pUrl up to the end of the // password field (if present) which ends at pAt) // ++pUrl; // // similarly, bump urlLength to account for the '@' // --urlLength; } else { // // no '@' therefore no username or password // if (ARGUMENT_PRESENT(lpszUserName)) { INET_ASSERT(ARGUMENT_PRESENT(lpdwUserNameLength)); *lpszUserName = NULL; *lpdwUserNameLength = 0; } if (ARGUMENT_PRESENT(lpszPassword)) { INET_ASSERT(ARGUMENT_PRESENT(lpdwPasswordLength)); *lpszPassword = NULL; *lpdwPasswordLength = 0; } } // // now get the host name and the optional port // pPortNumber = portNumber; portNumberLength = sizeof(portNumber); error = GetUrlAddressInfo(&pUrl, &urlLength, &hostName, &hostNameLength, &part1Escape, &pPortNumber, &portNumberLength, &part2Escape ); if (error != ERROR_SUCCESS) { return error; } // // the URL address information MUST contain the host name // // if ((hostName == NULL) || (hostNameLength == 0)) { // return ERROR_WINHTTP_INVALID_URL; // } if (ARGUMENT_PRESENT(lpszHostName)) { INET_ASSERT(ARGUMENT_PRESENT(lpdwHostNameLength)); // // if the host name contains escaped characters, convert them in situ // if (part1Escape) { error = DecodeUrlInSitu(hostName, &hostNameLength); if (error != ERROR_SUCCESS) { return error; } } *lpszHostName = hostName; *lpdwHostNameLength = hostNameLength; } // // if there is a port field, convert it if there are escaped characters, // check it for valid numeric characters, and convert it to a number // if (ARGUMENT_PRESENT(lpPort)) { if (portNumberLength != 0) { DWORD i; DWORD port; INET_ASSERT(pPortNumber != NULL); if (part2Escape) { error = DecodeUrlInSitu(pPortNumber, &portNumberLength); if (error != ERROR_SUCCESS) { return error; } } // // ensure all characters in the port number buffer are numeric, and // calculate the port number at the same time // for (i = 0, port = 0; i < portNumberLength; ++i) { if (!isdigit(*pPortNumber)) { return ERROR_WINHTTP_INVALID_URL; } port = port * 10 + (int)(*pPortNumber++ - '0'); // We won't allow ports larger than 65535 ((2^16)-1) // We have to check this every time to make sure that someone // doesn't try to overflow a DWORD. if (port > 65535) { return ERROR_WINHTTP_INVALID_URL; } } *lpPort = (INTERNET_PORT)port; if (ARGUMENT_PRESENT(pHavePort)) { *pHavePort = TRUE; } } else { *lpPort = INTERNET_INVALID_PORT_NUMBER; if (ARGUMENT_PRESENT(pHavePort)) { *pHavePort = FALSE; } } } // // update the URL pointer and the length of the url-path // *lpszUrl = pUrl; *lpdwUrlLength = urlLength; return ERROR_SUCCESS; } INTERNET_SCHEME MapUrlSchemeName( IN LPSTR lpszSchemeName, IN DWORD dwSchemeNameLength ) /*++ Routine Description: Maps a scheme name/length to a scheme name type Arguments: lpszSchemeName - pointer to name of scheme to map dwSchemeNameLength - length of scheme (if -1, lpszSchemeName is ASCIZ) Return Value: INTERNET_SCHEME --*/ { if (dwSchemeNameLength == (DWORD)-1) { dwSchemeNameLength = (DWORD)lstrlen(lpszSchemeName); } DWORD i; if (ScanSchemes(lpszSchemeName, dwSchemeNameLength, &i)) { return UrlSchemeList[i].SchemeType; } return INTERNET_SCHEME_UNKNOWN; } LPSTR MapUrlScheme( IN INTERNET_SCHEME Scheme, OUT LPDWORD lpdwSchemeNameLength ) /*++ Routine Description: Maps the enumerated scheme name type to the name Arguments: Scheme - enumerated scheme type to map lpdwSchemeNameLength - pointer to returned length of scheme name Return Value: LPSTR - pointer to scheme name or NULL --*/ { if ((Scheme >= INTERNET_SCHEME_FIRST) && (Scheme <= INTERNET_SCHEME_LAST)) { *lpdwSchemeNameLength = UrlSchemeList[Scheme].SchemeLength; return UrlSchemeList[Scheme].SchemeName; } *lpdwSchemeNameLength = 0; return NULL; } LPSTR MapUrlSchemeToName( IN INTERNET_SCHEME Scheme ) /*++ Routine Description: Maps the enumerated scheme name type to the name Arguments: Scheme - enumerated scheme type to map Return Value: LPSTR - pointer to scheme name or NULL --*/ { if ((Scheme >= INTERNET_SCHEME_FIRST) && (Scheme <= INTERNET_SCHEME_LAST)) { return UrlSchemeList[Scheme].SchemeName; } return NULL; } /* * ConvertUnicodeToMultiByte: * * dwFlags: WINHTTP_FLAG_NULL_CODEPAGE-> assumes correctly encoded string packaged into UTF8, no escaping done. WINHTTP_FLAG_VALID_HOSTNAME-> only for server name only the previous flag valid for server name passed in here. if both of these are not specified, then if dwCodePage is not INVALID, it'll be used to convert unicode string to ANSI. else UTF8 will be used. if ESCAPE && ESCAPE_PERCENT is specified, the ANSI url will be escaped (incl. %) else it will be escaped w/o escaping %s. */ DWORD ConvertUnicodeToMultiByte( LPCWSTR lpszObjectName, DWORD dwCodePage, MEMORYPACKET* pmp, DWORD dwFlags) { DWORD dwError = ERROR_SUCCESS; LPSTR pStr; WCHAR wc; LPCWSTR pwStr; BOOL bStrip0s = TRUE; DWORD dwUnicodeUrlSize; //determine size of string and/or safe characters if ((dwFlags & WINHTTP_FLAG_NULL_CODEPAGE) || (dwFlags & WINHTTP_FLAG_VALID_HOSTNAME)) { if (dwFlags & WINHTTP_FLAG_VALID_HOSTNAME) { for (pwStr = lpszObjectName; wc = *pwStr; ++pwStr) { if (IS_UNSAFE_URL_WIDECHARACTER(wc, HOSTNAME)) { dwError = ERROR_WINHTTP_INVALID_URL; goto done; } } pmp->dwAlloc = dwUnicodeUrlSize = (DWORD)(pwStr-lpszObjectName+1); } else { pmp->dwAlloc = dwUnicodeUrlSize = lstrlenW(lpszObjectName)+1; } } else { DWORD dwUnsafeChars = 0; // optimization to check for unsafe characters, and optimize the common case. // calculate the length, and while parsing the string, check if there are unsafeChars for(pwStr = lpszObjectName; wc = *pwStr; ++pwStr) { if (IS_UNSAFE_URL_WIDECHARACTER(wc, 0)) ++dwUnsafeChars; } dwUnicodeUrlSize = (DWORD)(pwStr-lpszObjectName+1); if (dwUnsafeChars == 0) { pmp->dwAlloc = dwUnicodeUrlSize; } else { bStrip0s = FALSE; } } //convert to MBCS if (bStrip0s) { INET_ASSERT(pmp->dwAlloc); pmp->psStr = (LPSTR)ALLOCATE_FIXED_MEMORY(pmp->dwAlloc); if (!pmp->psStr) { pmp->dwAlloc = 0; dwError = ERROR_NOT_ENOUGH_MEMORY; goto done; } pmp->dwSize = pmp->dwAlloc-1; for (pStr = pmp->psStr; wc = *lpszObjectName; ++lpszObjectName) { *(pStr)++ = (CHAR)wc; } *pStr = '\0'; } else { // convert with WideCharToMultiByte() pmp->dwAlloc = WideCharToMultiByte(dwCodePage, 0, lpszObjectName, dwUnicodeUrlSize, NULL, 0, NULL, NULL); if (pmp->dwAlloc) { pmp->psStr = (LPSTR)ALLOCATE_FIXED_MEMORY(pmp->dwAlloc); if (!pmp->psStr) { pmp->dwAlloc = 0; dwError = ERROR_NOT_ENOUGH_MEMORY; goto done; } pmp->dwSize = WideCharToMultiByte(dwCodePage, 0, lpszObjectName, dwUnicodeUrlSize, pmp->psStr, pmp->dwAlloc, NULL, NULL); if (!pmp->dwSize) { dwError = GetLastError(); goto done; } else pmp->dwSize -= 1; } else { dwError = GetLastError(); goto done; } } //escaping if (dwFlags & WINHTTP_FLAG_DEFAULT_ESCAPE) { INET_ASSERT (! (dwFlags & WINHTTP_FLAG_VALID_HOSTNAME)); static CHAR* hexArray = "0123456789ABCDEF"; UCHAR ch; DWORD dwUnsafeChars = 0; DWORD dwNewAlloc; LPSTR pDest, pNewStr; for(pStr = pmp->psStr; ch = *pStr; pStr = CharNextExA((WORD)dwCodePage, pStr, 0)) { if (IS_UNSAFE_URL_CHARACTER(ch, SCHEME_HTTP)) ++dwUnsafeChars; else if(ch == '?') break; } if (dwUnsafeChars == 0) goto done; dwNewAlloc = pmp->dwAlloc + dwUnsafeChars*2; pNewStr = pDest = (LPSTR)ALLOCATE_FIXED_MEMORY(dwNewAlloc); if (!pDest) { dwError = ERROR_NOT_ENOUGH_MEMORY; goto done; } BOOL bEscapePercent = (dwFlags & WINHTTP_FLAG_ESCAPE_PERCENT) ? TRUE : FALSE; BOOL bHitQuery = FALSE; LPSTR pNext; BOOL bLead; for (pStr = pmp->psStr; ch = *pStr;) { pNext = CharNextExA((WORD)dwCodePage, pStr, 0); bLead = TRUE; do { ch = *pStr; if (IS_UNSAFE_URL_CHARACTER(ch, SCHEME_HTTP) && (!bLead || (ch != '%') || bEscapePercent) ) { *pDest++ = '%'; *pDest++ = hexArray[ch>>4]; *pDest++ = hexArray[ch & 0x0f]; } else { *pDest++ = ch; if ((ch == '?') && bLead) { bHitQuery = TRUE; ++pStr; INET_ASSERT(pStr == pNext); break; } } bLead = FALSE; } while (++pStr != pNext); if (bHitQuery) break; } if (bHitQuery) { for ( ; ch = *pStr; pStr++) { *pDest++ = ch; } } *pDest = '\0'; FREE_FIXED_MEMORY(pmp->psStr); pmp->psStr = pNewStr; pmp->dwSize = (DWORD)(pDest-pNewStr); pmp->dwAlloc = dwNewAlloc; } done: if (pmp->psStr) pmp->dwAlloc = (pmp->dwAlloc > MP_MAX_STACK_USE) ? pmp->dwAlloc : MP_MAX_STACK_USE+1;// to force FREE in ~MEMORYPACKET return dwError; }