/*++ Copyright (c) 1998-2002 Microsoft Corporation Module Name: C14n.c Abstract: URL canonicalization (c14n) routines Author: George V. Reilly (GeorgeRe) 22-Mar-2002 Revision History: --*/ #include #include "c14np.h" #if defined(ALLOC_PRAGMA) && defined(KERNEL_PRIV) #pragma alloc_text( PAGE, HttpInitializeDefaultUrlC14nConfig) #pragma alloc_text( PAGE, HttpInitializeDefaultUrlC14nConfigEncoding) #pragma alloc_text( PAGE, HttpUnescapePercentHexEncoding) #pragma alloc_text( PAGE, HttppPopCharHostNameUtf8) #pragma alloc_text( PAGE, HttppPopCharHostNameDbcs) #pragma alloc_text( PAGE, HttppPopCharHostNameAnsi) #pragma alloc_text( PAGE, HttpCopyHost) #pragma alloc_text( PAGE, HttppCopyHostByType) #pragma alloc_text( PAGE, HttpValidateHostname) #pragma alloc_text( PAGE, HttppPopCharAbsPathUtf8) #pragma alloc_text( PAGE, HttppPopCharAbsPathDbcs) #pragma alloc_text( PAGE, HttppPopCharAbsPathAnsi) #pragma alloc_text( PAGE, HttppPopCharQueryString) #pragma alloc_text( PAGE, HttppCopyUrlByType) #pragma alloc_text( PAGE, HttpCopyUrl) #pragma alloc_text( PAGE, HttpCleanAndCopyUrl) #pragma alloc_text( PAGE, HttppCleanAndCopyUrlByType) #pragma alloc_text( PAGE, HttpFindUrlToken) #pragma alloc_text( PAGE, HttppParseIPv6Address) #pragma alloc_text( PAGE, HttppPrintIpAddressW) #pragma alloc_text( PAGE, HttpParseUrl) #pragma alloc_text( PAGE, HttpNormalizeParsedUrl) #endif // ALLOC_PRAGMA && KERNEL_PRIV #if 0 // Non-Pageable Functions NOT PAGEABLE -- #endif // Non-Pageable Functions VOID HttpInitializeDefaultUrlC14nConfig( PURL_C14N_CONFIG pCfg ) { PAGED_CODE(); pCfg->HostnameDecodeOrder = UrlDecode_Utf8_Else_Dbcs_Else_Ansi; pCfg->AbsPathDecodeOrder = UrlDecode_Utf8; pCfg->EnableNonUtf8 = FALSE; pCfg->FavorUtf8 = FALSE; pCfg->EnableDbcs = FALSE; pCfg->PercentUAllowed = DEFAULT_C14N_PERCENT_U_ALLOWED; pCfg->AllowRestrictedChars = DEFAULT_C14N_ALLOW_RESTRICTED_CHARS; pCfg->CodePage = 0; pCfg->UrlMaxLength = DEFAULT_C14N_URL_MAX_LENGTH; pCfg->UrlSegmentMaxLength = 
DEFAULT_C14N_URL_SEGMENT_MAX_LENGTH; pCfg->UrlSegmentMaxCount = DEFAULT_C14N_URL_SEGMENT_MAX_COUNT; pCfg->MaxLabelLength = DEFAULT_C14N_MAX_LABEL_LENGTH; pCfg->MaxHostnameLength = DEFAULT_C14N_MAX_HOSTNAME_LENGTH; } // HttpInitializeDefaultUrlC14nConfig VOID HttpInitializeDefaultUrlC14nConfigEncoding( PURL_C14N_CONFIG pCfg, BOOLEAN EnableNonUtf8, BOOLEAN FavorUtf8, BOOLEAN EnableDbcs ) { PAGED_CODE(); HttpInitializeDefaultUrlC14nConfig(pCfg); pCfg->EnableNonUtf8 = EnableNonUtf8; pCfg->FavorUtf8 = FavorUtf8; pCfg->EnableDbcs = EnableDbcs; if (EnableNonUtf8) { if (FavorUtf8) { pCfg->AbsPathDecodeOrder = (EnableDbcs ? UrlDecode_Utf8_Else_Dbcs : UrlDecode_Utf8_Else_Ansi); } else { pCfg->AbsPathDecodeOrder = (EnableDbcs ? UrlDecode_Dbcs_Else_Utf8 : UrlDecode_Ansi_Else_Utf8); } } else { pCfg->AbsPathDecodeOrder = UrlDecode_Utf8; } } // HttpInitializeDefaultUrlC14nConfigEncoding /***************************************************************************++ Routine Description: Convert '%NN' or '%uNNNN' to a ULONG. Arguments: pSourceChar - Input buffer SourceLength - Length of pSourceChar, in bytes PercentUAllowed - Accept '%uNNNN' notation? pOutChar - decoded character pBytesToSkip - number of bytes consumed from pSourceChar; will be 3 for %NN and 6 for %uNNNN. 
Return Value: STATUS_SUCCESS or STATUS_OBJECT_PATH_SYNTAX_BAD --***************************************************************************/ NTSTATUS HttpUnescapePercentHexEncoding( IN PCUCHAR pSourceChar, IN ULONG SourceLength, IN BOOLEAN PercentUAllowed, OUT PULONG pOutChar, OUT PULONG pBytesToSkip ) { ULONG Result, i, NumDigits; PCUCHAR pHexDigits; PAGED_CODE(); if (SourceLength < STRLEN_LIT("%NN")) { UlTraceError(PARSER, ( "http!HttpUnescapePercentHexEncoding(%p): " "Length too short, %lu.\n", pSourceChar, SourceLength )); RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD); } else if (pSourceChar[0] != PERCENT) { UlTraceError(PARSER, ( "http!HttpUnescapePercentHexEncoding(%p): " "Starts with 0x%02lX, not '%%'.\n", pSourceChar, (ULONG) pSourceChar[0] )); RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD); } if (pSourceChar[1] != 'u' && pSourceChar[1] != 'U') { // RFC 2396 says that an "escaped octet is encoded as a character // triplet, consisting of the percent character '%' followed by // the two hexadecimal digits representing the octet code." pHexDigits = pSourceChar + STRLEN_LIT("%"); NumDigits = 2; *pBytesToSkip = STRLEN_LIT("%NN"); } else { // This is the %uNNNN notation generated by JavaScript's escape() fn if (! PercentUAllowed) { UlTraceError(PARSER, ( "http!HttpUnescapePercentHexEncoding(%p): " "%%uNNNN forbidden.\n", pSourceChar, SourceLength )); RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD); } else if (SourceLength < STRLEN_LIT("%uNNNN")) { UlTraceError(PARSER, ( "http!HttpUnescapePercentHexEncoding(%p): " "Length %lu too short for %%uNNNN.\n", pSourceChar, SourceLength )); RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD); } pHexDigits = pSourceChar + STRLEN_LIT("%u"); NumDigits = 4; *pBytesToSkip = STRLEN_LIT("%uNNNN"); } ASSERT(*pBytesToSkip <= SourceLength); Result = 0; for (i = 0; i < NumDigits; ++i) { ULONG Char = pHexDigits[i]; ULONG Digit; // // HexToChar() inlined. 
Note: in ASCII, '0' < 'A' < 'a' and there are // no gaps in ranges '0'..'9', 'A'..'F', and 'a'..'f' (unlike EBCDIC, // which has gaps between 'I'/'J', 'R'/'S', 'i'/'j', and 'r'/'s'). // C_ASSERT('0' < 'A' && 'A' < 'a'); C_ASSERT('9' - '0' == 10 - 1); C_ASSERT('F' - 'A' == 6 - 1); C_ASSERT('f' - 'a' == 6 - 1); if (! IS_HTTP_HEX(Char)) { UlTraceError(PARSER, ( "http!HttpUnescapePercentHexEncoding(%p): " "Invalid hex character[%lu], 0x%02lX.\n", pSourceChar, i, Char )); RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD); } else if ('a' <= Char) { ASSERT('a' <= Char && Char <= 'f'); Digit = Char - 'a' + 0xA; } else if ('A' <= Char) { ASSERT('A' <= Char && Char <= 'F'); Digit = Char - 'A' + 0xA; } else { ASSERT('0' <= Char && Char <= '9'); Digit = Char - '0'; } ASSERT(Digit < 0x10); Result = (Result << 4) | Digit; } *pOutChar = Result; return STATUS_SUCCESS; } // HttpUnescapePercentHexEncoding /***************************************************************************++ Routine Description: Consume 1-4 bytes from pSourceChar, treating it as raw UTF-8. This routine is only suitable for the hostname part of an HTTP URL, Arguments: pSourceChar - Input buffer SourceLength - Length of pSourceChar, in bytes pUnicodeChar - decoded character pBytesToSkip - number of characters consumed from pSourceChar Return Value: STATUS_SUCCESS or STATUS_OBJECT_PATH_SYNTAX_BAD --***************************************************************************/ NTSTATUS HttppPopCharHostNameUtf8( IN PCUCHAR pSourceChar, IN ULONG SourceLength, OUT PULONG pUnicodeChar, OUT PULONG pBytesToSkip ) { NTSTATUS Status; PAGED_CODE(); ASSERT(SourceLength > 0); Status = HttpUtf8RawBytesToUnicode( pSourceChar, SourceLength, pUnicodeChar, pBytesToSkip ); return Status; } // HttppPopCharHostNameUtf8 /***************************************************************************++ Routine Description: Consume 1-2 bytes from pSourceChar and converts it from raw DBCS to Unicode. 
This routine is only suitable for the hostname part of an HTTP URL. Arguments: pSourceChar - Input buffer SourceLength - Length of pSourceChar, in bytes pUnicodeChar - decoded character pBytesToSkip - number of characters consumed from pSourceChar Return Value: STATUS_SUCCESS or STATUS_OBJECT_PATH_SYNTAX_BAD --***************************************************************************/ NTSTATUS HttppPopCharHostNameDbcs( IN PCUCHAR pSourceChar, IN ULONG SourceLength, OUT PULONG pUnicodeChar, OUT PULONG pBytesToSkip ) { NTSTATUS Status; ULONG AnsiCharSize; WCHAR WideChar; PAGED_CODE(); ASSERT(SourceLength > 0); if (! IS_DBCS_LEAD_BYTE(pSourceChar[0])) { AnsiCharSize = 1; } else { if (SourceLength < 2) { UlTraceError(PARSER, ( "http!HttppPopCharHostNameDbcs(%p): " "ERROR: DBCS lead byte, 0x%02lX, at end of string\n", pSourceChar, *pSourceChar )); RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD); } AnsiCharSize = 2; } Status = RtlMultiByteToUnicodeN( &WideChar, sizeof(WCHAR), NULL, (PCSTR) pSourceChar, AnsiCharSize ); if (!NT_SUCCESS(Status)) { UlTraceError(PARSER, ( "http!HttppPopCharHostNameDbcs(%p): " "MultiByteToUnicode(%lu) failed, %s.\n", pSourceChar, AnsiCharSize, HttpStatusToString(Status) )); return Status; } *pUnicodeChar = WideChar; *pBytesToSkip = AnsiCharSize; return STATUS_SUCCESS; } // HttppPopCharHostNameDbcs /***************************************************************************++ Routine Description: Consume 1 bytes from pSourceChar and converts it from raw ANSI to Unicode. This routine is only suitable for the hostname part of an HTTP URL. 
Arguments: pSourceChar - Input buffer SourceLength - Length of pSourceChar, in bytes pUnicodeChar - decoded character pBytesToSkip - number of characters consumed from pSourceChar Return Value: STATUS_SUCCESS or STATUS_OBJECT_PATH_SYNTAX_BAD --***************************************************************************/ NTSTATUS HttppPopCharHostNameAnsi( IN PCUCHAR pSourceChar, IN ULONG SourceLength, OUT PULONG pUnicodeChar, OUT PULONG pBytesToSkip ) { NTSTATUS Status; #if !DBG UNREFERENCED_PARAMETER(SourceLength); #endif // !DBG PAGED_CODE(); ASSERT(SourceLength > 0); *pUnicodeChar = AnsiToUnicodeMap[pSourceChar[0]]; *pBytesToSkip = 1; Status = (0 != *pUnicodeChar) ? STATUS_SUCCESS : STATUS_OBJECT_PATH_SYNTAX_BAD; if (!NT_SUCCESS(Status)) { UlTraceError(PARSER, ( "http!HttppPopCharHostNameAnsi(%p): " "No mapping for %lu.\n", pSourceChar, *pSourceChar )); } return Status; } // HttppPopCharHostNameAnsi /***************************************************************************++ Routine Description: Common tail function called at the end of the HttppPopCharAbsPath*() functions, to minimize code replication. Arguments: pSourceChar - Input buffer SourceLength - Length of pSourceChar, in bytes UnicodeChar - decoded character BytesToSkip - number of characters consumed from pSourceChar pUnicodeChar - where to put UnicodeChar result pBytesToSkip - where to put BytesToSkip result Return Value: STATUS_SUCCESS or STATUS_OBJECT_PATH_SYNTAX_BAD --***************************************************************************/ __inline NTSTATUS HttppPopCharAbsPathCommonTail( IN PCUCHAR pSourceChar, IN ULONG SourceLength, IN ULONG UnicodeChar, IN ULONG BytesToSkip, IN BOOLEAN AllowRestrictedChars, OUT PULONG pUnicodeChar, OUT PULONG pBytesToSkip ) { #if !DBG UNREFERENCED_PARAMETER(pSourceChar); UNREFERENCED_PARAMETER(SourceLength); #endif // !DBG // // Special handling for characters in the 8-bit range. 
// May want to look at BytesToSkip to distinguish between // raw and hex-escaped/UTF-8-encoded data. // // In particular, should we allow %2F or %u002F as alternate // represenations of '/' in a URL? Why would anyone have a legitimate // need to escape a slash character? // if (UnicodeChar < 0x100) { // Transform backslashes to forward slashes if (BACK_SLASH == UnicodeChar) { UnicodeChar = FORWARD_SLASH; } else if (!AllowRestrictedChars && IS_URL_INVALID(UnicodeChar)) { UlTraceError(PARSER, ( "http!HttppPopCharAbsPathCommonTail(%p): " "Invalid character, U+%04X.\n", pSourceChar, UnicodeChar )); RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD); } // CODEWORK: should we allow hex-escaped "restricted" or "unwise" // characters at all? } ASSERT(BytesToSkip <= SourceLength); *pBytesToSkip = BytesToSkip; *pUnicodeChar = UnicodeChar; return STATUS_SUCCESS; } // HttppPopCharAbsPathCommonTail /***************************************************************************++ Routine Description: Consume 1-12 bytes from pSourceChar. Handle hex-escaped UTF-8 encoding. This routine is only suitable for the /abspath part of an HTTP URL. Arguments: pSourceChar - Input buffer SourceLength - Length of pSourceChar, in bytes pUnicodeChar - decoded character pBytesToSkip - number of characters consumed from pSourceChar Return Value: STATUS_SUCCESS or STATUS_OBJECT_PATH_SYNTAX_BAD --***************************************************************************/ NTSTATUS HttppPopCharAbsPathUtf8( IN PCUCHAR pSourceChar, IN ULONG SourceLength, IN BOOLEAN PercentUAllowed, IN BOOLEAN AllowRestrictedChars, OUT PULONG pUnicodeChar, OUT PULONG pBytesToSkip ) { NTSTATUS Status; ULONG UnicodeChar; ULONG BytesToSkip; ULONG Temp; ULONG OctetCount; UCHAR Octets[4]; UCHAR LeadByte; // // Sanity check. // PAGED_CODE(); ASSERT(SourceLength > 0); // // validate it as a valid URL character // if (! 
IS_URL_TOKEN(pSourceChar[0])) { UlTraceError(PARSER, ( "http!HttppPopCharAbsPathUtf8(%p): " "first char, 0x%02lX, isn't URL token\n", pSourceChar, (ULONG) pSourceChar[0] )); RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD); } // // need to unescape hex encoding, '%NN' or '%uNNNN'? // if (PERCENT != pSourceChar[0]) { UnicodeChar = pSourceChar[0]; BytesToSkip = 1; // // All octets with bit7 set MUST be hex-escaped. // Do NOT accept literals with hi-bit set. // if (UnicodeChar > ASCII_MAX) { UlTraceError(PARSER, ( "http!HttppPopCharAbsPathUtf8(%p): " "Invalid hi-bit literal, 0x%02lX.\n", pSourceChar, UnicodeChar )); RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD); } Status = STATUS_SUCCESS; goto unslash; } Status = HttpUnescapePercentHexEncoding( pSourceChar, SourceLength, PercentUAllowed, &UnicodeChar, &BytesToSkip ); if (! NT_SUCCESS(Status)) { UlTraceError(PARSER, ( "http!HttppPopCharAbsPathUtf8(%p): " "Invalid hex encoding.\n", pSourceChar )); return Status; } // // If we consumed '%uNNNN', don't attempt any UTF-8 decoding // if (STRLEN_LIT("%uNNNN") == BytesToSkip) goto unslash; ASSERT(STRLEN_LIT("%NN") == BytesToSkip); ASSERT(UnicodeChar <= 0xFF); Octets[0] = LeadByte = (UCHAR) UnicodeChar; OctetCount = UTF8_OCTET_COUNT(LeadByte); if (0 == OctetCount) { UlTraceError(PARSER, ( "http!HttppPopCharAbsPathUtf8(%p): " "Invalid lead byte, 0x%02lX.\n", pSourceChar, UnicodeChar )); RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD); } ASSERT(OctetCount <= sizeof(Octets) / sizeof(Octets[0])); BytesToSkip = OctetCount * STRLEN_LIT("%NN"); if (BytesToSkip > SourceLength) { UlTraceError(PARSER, ( "http!HttppPopCharAbsPathUtf8(%p): " "%lu octets is not enough for %lu-byte UTF-8 encoding.\n", pSourceChar, OctetCount, SourceLength )); RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD); } if (OctetCount == 1) { #if DBG // Singleton: no trail bytes Status = HttpUtf8RawBytesToUnicode( Octets, OctetCount, &UnicodeChar, &Temp ); ASSERT(STATUS_SUCCESS == Status); ASSERT(UnicodeChar == LeadByte); ASSERT(1 == Temp); #endif // DBG } 
else { ULONG i; // // Decode the hex-escaped trail bytes // for (i = 1; i < OctetCount; ++i) { ULONG TrailChar; UCHAR TrailByte; Status = HttpUnescapePercentHexEncoding( pSourceChar + i * STRLEN_LIT("%NN"), STRLEN_LIT("%NN"), FALSE, // do not allow %uNNNN for trail bytes &TrailChar, &Temp ); if (! NT_SUCCESS(Status)) { UlTraceError(PARSER, ( "http!HttppPopCharAbsPathUtf8(%p): " "Invalid hex-encoded trail byte[%lu].\n", pSourceChar, i )); return Status; } ASSERT(STRLEN_LIT("%NN") == Temp); ASSERT(TrailChar <= 0xFF); Octets[i] = TrailByte = (UCHAR) TrailChar; if (! IS_UTF8_TRAILBYTE(TrailByte)) { UlTraceError(PARSER, ( "http!HttppPopCharAbsPathUtf8(%p): " "Invalid trail byte[%lu], 0x%02lX.\n", pSourceChar, i, TrailChar )); RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD); } } // // Decode the raw UTF-8 bytes // Status = HttpUtf8RawBytesToUnicode( Octets, OctetCount, &UnicodeChar, &Temp ); if (! NT_SUCCESS(Status)) { UlTraceError(PARSER, ( "http!HttppPopCharAbsPathUtf8(%p): " "Invalid UTF-8 sequence.\n", pSourceChar )); RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD); } } unslash: ASSERT(NT_SUCCESS(Status)); return HttppPopCharAbsPathCommonTail( pSourceChar, SourceLength, UnicodeChar, BytesToSkip, AllowRestrictedChars, pUnicodeChar, pBytesToSkip ); } // HttppPopCharAbsPathUtf8 /***************************************************************************++ Routine Description: Consume 1-6 bytes from pSourceChar. Handle hex-escaped DBCS encoding. This routine is only suitable for the /abspath part of an HTTP URL. 
Arguments: pSourceChar - Input buffer SourceLength - Length of pSourceChar, in bytes pUnicodeChar - decoded character pBytesToSkip - number of characters consumed from pSourceChar Return Value: STATUS_SUCCESS or STATUS_OBJECT_PATH_SYNTAX_BAD --***************************************************************************/ NTSTATUS HttppPopCharAbsPathDbcs( IN PCUCHAR pSourceChar, IN ULONG SourceLength, IN BOOLEAN PercentUAllowed, IN BOOLEAN AllowRestrictedChars, OUT PULONG pUnicodeChar, OUT PULONG pBytesToSkip ) { NTSTATUS Status; ULONG UnicodeChar; WCHAR WideChar; ULONG BytesToSkip; UCHAR AnsiChar[2]; ULONG AnsiCharSize; UCHAR LeadByte; UCHAR SecondByte = 0; // // Sanity check. // PAGED_CODE(); ASSERT(SourceLength > 0); if (! IS_URL_TOKEN(pSourceChar[0])) { UlTraceError(PARSER, ( "http!HttppPopCharAbsPathDbcs(%p): " "first char, 0x%02lX, isn't URL token\n", pSourceChar, (ULONG) pSourceChar[0] )); RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD); } if (PERCENT != pSourceChar[0]) { // Note: unlike UTF-8, we allow literal bytes whose top bit is set UnicodeChar = pSourceChar[0]; BytesToSkip = 1; } else { // need to unescape hex encoding, '%NN' or '%uNNNN' Status = HttpUnescapePercentHexEncoding( pSourceChar, SourceLength, PercentUAllowed, &UnicodeChar, &BytesToSkip ); if (! NT_SUCCESS(Status)) { UlTraceError(PARSER, ( "http!HttppPopCharAbsPathDbcs(%p): " "Invalid hex encoding.\n", pSourceChar )); return Status; } // // If we consumed '%uNNNN', don't attempt DBCS-to-Unicode conversion // if (STRLEN_LIT("%uNNNN") == BytesToSkip) goto unslash; ASSERT(STRLEN_LIT("%NN") == BytesToSkip); ASSERT(UnicodeChar <= 0xFF); } LeadByte = (UCHAR) UnicodeChar; AnsiChar[0] = LeadByte; if (! IS_DBCS_LEAD_BYTE(LeadByte)) { AnsiCharSize = 1; } else { // // This is a double-byte character. 
// ASSERT(BytesToSkip <= SourceLength); if (BytesToSkip == SourceLength) { UlTraceError(PARSER, ( "http!HttppPopCharAbsPathDbcs(%p): " "ERROR: DBCS lead byte, 0x%02lX, at end of string\n", pSourceChar, UnicodeChar )); RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD); } AnsiCharSize = 2; SecondByte = pSourceChar[BytesToSkip]; if (PERCENT != SecondByte) { BytesToSkip += 1; } else { ULONG TrailChar; ULONG Temp; if (BytesToSkip + STRLEN_LIT("%NN") > SourceLength) { UlTraceError(PARSER, ( "http!HttppPopCharAbsPathDbcs(%p): " "ERROR: no space for DBCS hex-encoded suffix\n", pSourceChar )); RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD); } Status = HttpUnescapePercentHexEncoding( pSourceChar + BytesToSkip, SourceLength - BytesToSkip, FALSE, // no %uNNNN allowed here &TrailChar, &Temp ); if (! NT_SUCCESS(Status)) { UlTraceError(PARSER, ( "http!HttppPopCharAbsPathDbcs(%p): " "Invalid hex encoding of trail byte.\n", pSourceChar )); return Status; } ASSERT(STRLEN_LIT("%NN") == Temp); ASSERT(TrailChar <= 0xFF); SecondByte = (UCHAR) TrailChar; BytesToSkip += STRLEN_LIT("%NN"); } AnsiChar[1] = SecondByte; } Status = RtlMultiByteToUnicodeN( &WideChar, sizeof(WCHAR), NULL, (PCHAR) &AnsiChar[0], AnsiCharSize ); if (!NT_SUCCESS(Status)) { UlTraceError(PARSER, ( "http!HttppPopCharAbsPathDbcs(%p): " "MultiByteToUnicode(%lu) failed, %s.\n", pSourceChar, AnsiCharSize, HttpStatusToString(Status) )); return Status; } UnicodeChar = WideChar; #if DBG // // Describe conversion in debug spew. 
// if (1 == AnsiCharSize) { UlTraceVerbose(PARSER, ( "http!HttppPopCharAbsPathDbcs(%p): " "converted %02X to U+%04lX '%C'\n", pSourceChar, LeadByte, UnicodeChar, UnicodeChar )); } else { ASSERT(2 == AnsiCharSize); UlTraceVerbose(PARSER, ( "http!HttppPopCharAbsPathDbcs(%p): " "converted %02X %02X to U+%04lX '%C'\n", pSourceChar, LeadByte, SecondByte, UnicodeChar, UnicodeChar )); } #endif // DBG unslash: ASSERT(NT_SUCCESS(Status)); return HttppPopCharAbsPathCommonTail( pSourceChar, SourceLength, UnicodeChar, BytesToSkip, AllowRestrictedChars, pUnicodeChar, pBytesToSkip ); } // HttppPopCharAbsPathDbcs /***************************************************************************++ Routine Description: Consume 1-6 bytes from pSourceChar. Handle hex-escaped ANSI encoding. This routine is only suitable for the /abspath part of an HTTP URL. Arguments: pSourceChar - Input buffer SourceLength - Length of pSourceChar, in bytes pUnicodeChar - decoded character pBytesToSkip - number of characters consumed from pSourceChar Return Value: STATUS_SUCCESS or STATUS_OBJECT_PATH_SYNTAX_BAD --***************************************************************************/ NTSTATUS HttppPopCharAbsPathAnsi( IN PCUCHAR pSourceChar, IN ULONG SourceLength, IN BOOLEAN PercentUAllowed, IN BOOLEAN AllowRestrictedChars, OUT PULONG pUnicodeChar, OUT PULONG pBytesToSkip ) { NTSTATUS Status = STATUS_SUCCESS; ULONG UnicodeChar; ULONG BytesToSkip; // // Sanity check. // PAGED_CODE(); ASSERT(SourceLength > 0); // // DBCS and ANSI decoders must allow any raw byte whose top bit // is set (0x80-0xFF) // if (! 
IS_URL_TOKEN(pSourceChar[0]) && !(0x80 & pSourceChar[0])) { UlTraceError(PARSER, ( "http!HttppPopCharAbsPathAnsi(%p): " "first char, 0x%02lX, isn't URL token\n", pSourceChar, (ULONG) pSourceChar[0] )); RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD); } if (PERCENT != pSourceChar[0]) { // Note: unlike UTF-8, we allow literal bytes whose top bit is set UnicodeChar = AnsiToUnicodeMap[ pSourceChar[0] ]; BytesToSkip = 1; } else { // need to unescape hex encoding, '%NN' or '%uNNNN' Status = HttpUnescapePercentHexEncoding( pSourceChar, SourceLength, PercentUAllowed, &UnicodeChar, &BytesToSkip ); if (! NT_SUCCESS(Status)) { UlTraceError(PARSER, ( "http!HttppPopCharAbsPathAnsi(%p): " "Invalid hex encoding.\n", pSourceChar )); return Status; } // // If we consumed '%uNNNN', don't attempt Ansi-to-Unicode conversion // if (STRLEN_LIT("%uNNNN") != BytesToSkip) { ASSERT(STRLEN_LIT("%NN") == BytesToSkip); ASSERT(UnicodeChar <= 0xFF); UnicodeChar = AnsiToUnicodeMap[(UCHAR) UnicodeChar]; } } ASSERT(NT_SUCCESS(Status)); return HttppPopCharAbsPathCommonTail( pSourceChar, SourceLength, UnicodeChar, BytesToSkip, AllowRestrictedChars, pUnicodeChar, pBytesToSkip ); } // HttppPopCharAbsPathAnsi /***************************************************************************++ Routine Description: Consume 1 bytes from pSourceChar and returns it unaltered. This routine is only suitable for the ?querystring part of an HTTP URL, which we do not interpret. CODEWORK: don't 'convert' querystring to Unicode. Send it up verbatim. 
Arguments: pSourceChar - Input buffer SourceLength - Length of pSourceChar, in bytes pUnicodeChar - decoded character pBytesToSkip - number of characters consumed from pSourceChar Return Value: STATUS_SUCCESS or STATUS_OBJECT_PATH_SYNTAX_BAD --***************************************************************************/ NTSTATUS HttppPopCharQueryString( IN PCUCHAR pSourceChar, IN ULONG SourceLength, IN BOOLEAN PercentUAllowed, IN BOOLEAN AllowRestrictedChars, OUT PULONG pUnicodeChar, OUT PULONG pBytesToSkip ) { PAGED_CODE(); UNREFERENCED_PARAMETER(SourceLength); UNREFERENCED_PARAMETER(PercentUAllowed); UNREFERENCED_PARAMETER(AllowRestrictedChars); *pUnicodeChar = *pSourceChar; *pBytesToSkip = 1; return STATUS_SUCCESS; } // HttppPopCharQueryString // // a cool local helper macro // #define EMIT_CHAR(ch, pDest, BytesCopied, Status, AllowRestrictedChars) \ do \ { \ WCHAR HighSurrogate, LowSurrogate; \ \ if ((ch) > LOW_NONCHAR_BITS) \ { \ Status = HttpUcs4toUtf16((ch), \ &HighSurrogate, &LowSurrogate); \ \ if (! NT_SUCCESS(Status)) \ goto end; \ \ *pDest++ = HighSurrogate; \ *pDest++ = LowSurrogate; \ BytesCopied += 2 * sizeof(WCHAR); \ } \ else \ { \ ASSERT(ch < HIGH_SURROGATE_START \ || LOW_SURROGATE_END < ch); \ \ if ( IS_UNICODE_NONCHAR((ch)) ) \ { \ UlTraceError(PARSER, ( \ "http!HttpUcs4toUtf16(): " \ "Non-character code point, U+%04lX.\n", \ (ch) )); \ \ Status = STATUS_INVALID_PARAMETER; \ goto end; \ } \ \ *pDest++ = (WCHAR) (ch); \ BytesCopied += sizeof(WCHAR); \ } \ \ /* Can probably omit this test */ \ if (BytesCopied > UNICODE_STRING_MAX_BYTE_LEN) \ { \ Status = STATUS_DATA_OVERRUN; \ goto end; \ } \ } while (0, 0) #define EMIT_LITERAL_CHAR(ch, pDest, BytesCopied) \ do \ { \ ASSERT(IS_ASCII(ch)); \ \ *pDest++ = (WCHAR) (ch); \ BytesCopied += sizeof(WCHAR); \ } while (0, 0) #define HttppUrlEncodingToString(UrlEncoding) \ ((UrlEncoding == UrlDecode_Ansi) \ ? "Ansi" \ : (UrlEncoding == UrlDecode_Dbcs) \ ? 
"Dbcs" \ : "Utf8") /***************************************************************************++ Routine Description: Copies a hostname, converting it to Unicode Arguments: Return Value: NTSTATUS - Completion status. --***************************************************************************/ NTSTATUS HttpCopyHost( IN PURL_C14N_CONFIG pCfg, OUT PWSTR pDestination, IN PCUCHAR pSource, IN ULONG SourceLength, OUT PULONG pBytesCopied, OUT PURL_ENCODING_TYPE pHostnameEncodingType ) { NTSTATUS Status = STATUS_UNSUCCESSFUL; ULONG DecodeOrder = pCfg->HostnameDecodeOrder; PAGED_CODE(); ASSERT(NULL != pCfg); ASSERT(NULL != pDestination); ASSERT(NULL != pSource); ASSERT(NULL != pBytesCopied); ASSERT(NULL != pHostnameEncodingType); if (0 == DecodeOrder || DecodeOrder != (DecodeOrder & UrlDecode_MaxMask)) { UlTraceError(PARSER, ("http!HttpCopyHost: invalid DecodeOrder, 0x%lX\n", DecodeOrder )); RETURN(STATUS_INVALID_PARAMETER); } for ( ; 0 != DecodeOrder && !NT_SUCCESS(Status); DecodeOrder >>= UrlDecode_Shift ) { ULONG UrlEncoding = (DecodeOrder & UrlDecode_Mask); switch (UrlEncoding) { default: ASSERT(! 
"Impossible UrlDecodeOrder"); case UrlDecode_None: break; case UrlDecode_Ansi: case UrlDecode_Dbcs: case UrlDecode_Utf8: UlTraceVerbose(PARSER, ("http!HttpCopyHost(%s, Src=%p, %lu)\n", HttppUrlEncodingToString(UrlEncoding), pSource, SourceLength )); Status = HttppCopyHostByType( (URL_ENCODING_TYPE) UrlEncoding, pDestination, pSource, SourceLength, pBytesCopied ); if (NT_SUCCESS(Status)) { *pHostnameEncodingType = (URL_ENCODING_TYPE) UrlEncoding; UlTraceVerbose(PARSER, ("http!HttpCopyHost(%s): " "(%lu) '%.*s' -> (%lu) '%ls'\n", HttppUrlEncodingToString(UrlEncoding), SourceLength, SourceLength, pSource, *pBytesCopied/sizeof(WCHAR), pDestination )); } break; }; } return Status; } // HttpCopyHost /***************************************************************************++ Routine Description: Copies a hostname, converting it to Unicode CODEWORK: Handle ACE-encoded hostnames Arguments: Return Value: NTSTATUS - Completion status. --***************************************************************************/ NTSTATUS HttppCopyHostByType( IN URL_ENCODING_TYPE UrlEncoding, OUT PWSTR pDestination, IN PCUCHAR pSource, IN ULONG SourceLength, OUT PULONG pBytesCopied ) { NTSTATUS Status; PWSTR pDest; PCUCHAR pChar; ULONG BytesCopied; ULONG UnicodeChar; ULONG CharToSkip; PFN_POPCHAR_HOSTNAME pfnPopChar; if (UrlEncoding_Ansi == UrlEncoding) pfnPopChar = &HttppPopCharHostNameAnsi; else if (UrlEncoding_Dbcs == UrlEncoding) pfnPopChar = &HttppPopCharHostNameDbcs; else if (UrlEncoding_Utf8 == UrlEncoding) pfnPopChar = &HttppPopCharHostNameUtf8; else { ASSERT(! "Invalid UrlEncoding"); RETURN(STATUS_INVALID_PARAMETER); } // // Sanity check. 
// PAGED_CODE(); pDest = pDestination; BytesCopied = 0; pChar = pSource; while ((int)SourceLength > 0) { UnicodeChar = *pChar; if (IS_ASCII(UnicodeChar)) { CharToSkip = 1; } else { Status = (*pfnPopChar)( pChar, SourceLength, &UnicodeChar, &CharToSkip ); if (NT_SUCCESS(Status) == FALSE) goto end; } ASSERT(CharToSkip <= SourceLength); EMIT_CHAR( UnicodeChar, pDest, BytesCopied, Status, FALSE ); pChar += CharToSkip; SourceLength -= CharToSkip; } // // terminate the string, it hasn't been done in the loop // ASSERT((pDest-1)[0] != UNICODE_NULL); pDest[0] = UNICODE_NULL; *pBytesCopied = BytesCopied; Status = STATUS_SUCCESS; end: return Status; } // HttppCopyHostByType /*++ Routine Description: Validates that a hostname is well-formed CODEWORK: For future IDN (International Domain Names) work, we may need to handle raw UTF-8 or ACE hostnames. Note: if the validation algorithm changes here, it may be necessary to update HttpParseUrl() too. Arguments: pHostname - the hostname HostnameLength - length of hostname, in bytes HostnameType - Source of the hostname: Host header, AbsUri, or synthesized from the transport's local IP address Return Value: STATUS_SUCCESS if valid --*/ NTSTATUS HttpValidateHostname( IN PURL_C14N_CONFIG pCfg, IN PCUCHAR pHostname, IN ULONG HostnameLength, IN HOSTNAME_TYPE HostnameType, OUT PSHORT pAddressType ) { PCUCHAR pChar; PCUCHAR pLabel; PCUCHAR pEnd = pHostname + HostnameLength; PCSTR pTerminator; NTSTATUS Status; USHORT Port; struct in_addr IPv4Address; struct in6_addr IPv6Address; BOOLEAN AlphaLabel; PAGED_CODE(); ASSERT(NULL != pCfg); ASSERT(NULL != pHostname); ASSERT(NULL != pAddressType); if (0 == HostnameLength) { // RFC 2616, 14.23 "Host" says that the Host header can be empty if (Hostname_HostHeader == HostnameType) goto end; // It is an error for empty hostnames to appear elsewhere UlTraceError(PARSER, ("http!HttpValidateHostname: empty hostname\n" )); RETURN(STATUS_INVALID_PARAMETER); } // Is this an IPv6 literal address, per RFC 
2732? if ('[' == *pHostname) { // Empty brackets? if (HostnameLength < STRLEN_LIT("[0]") || ']' == pHostname[1]) { UlTraceError(PARSER, ("http!HttpValidateHostname: IPv6 address too short\n" )); RETURN(STATUS_INVALID_PARAMETER); } for (pChar = pHostname + STRLEN_LIT("["); pChar < pEnd; ++pChar) { if (']' == *pChar) break; // // Dots are allowed because the last 32 bits may be represented // in IPv4 dotted-octet notation. We do not accept Scope IDs // (indicated by '%') in hostnames. // if (IS_HTTP_HEX(*pChar) || ':' == *pChar || '.' == *pChar) continue; UlTraceError(PARSER, ("http!HttpValidateHostname: " "Invalid char in IPv6 address, 0x%02X '%c', " "after '%.*s'\n", *pChar, IS_HTTP_PRINT(*pChar) ? *pChar : '?', DIFF(pChar - pHostname), pHostname )); RETURN(STATUS_INVALID_PARAMETER); } if (pChar == pEnd) { UlTraceError(PARSER, ("http!HttpValidateHostname: No ']' for IPv6 address\n" )); RETURN(STATUS_INVALID_PARAMETER); } ASSERT(pChar < pEnd); ASSERT(']' == *pChar); // Let the RTL routine do the hard work of parsing IPv6 addrs Status = RtlIpv6StringToAddressA( (PCSTR) pHostname + STRLEN_LIT("["), &pTerminator, &IPv6Address ); if (! NT_SUCCESS(Status)) { UlTraceError(PARSER, ("http!HttpValidateHostname: " "Invalid IPv6 address, %s\n", HttpStatusToString(Status) )); RETURN(Status); } if (pTerminator != (PCSTR) pChar) { UlTraceError(PARSER, ("http!HttpValidateHostname: " "Invalid IPv6 terminator, 0x%02X '%c'\n", *pTerminator, IS_HTTP_PRINT(*pTerminator) ? *pTerminator : '?' )); RETURN(STATUS_INVALID_PARAMETER); } *pAddressType = TDI_ADDRESS_TYPE_IP6; // Skip the terminating ']' pChar += STRLEN_LIT("]"); // Any chars after the ']'? if (pChar == pEnd) { ASSERT(DIFF(pEnd - pHostname) <= pCfg->MaxHostnameLength); goto end; } ASSERT(pChar < pEnd); if (':' == *pChar) goto port; UlTraceError(PARSER, ("http!HttpValidateHostname: " "Invalid char after IPv6 ']', 0x%02X '%c'\n", *pChar, IS_HTTP_PRINT(*pChar) ? *pChar : '?' 
)); RETURN(STATUS_INVALID_PARAMETER); } // // It must be a domain name or an IPv4 literal. We'll try to treat // it as a domain name first. If it turns out to be all-numeric, // we'll try decoding it as an IPv4 literal. We'll see if the name // is well-formed, but we will not do a DNS lookup to see if it exists, // as that would be much too expensive. // AlphaLabel = FALSE; pLabel = pHostname; for (pChar = pHostname; pChar < pEnd; ++pChar) { if (':' == *pChar) { if (pChar == pHostname) { UlTraceError(PARSER, ("http!HttpValidateHostname: empty hostname\n" )); RETURN(STATUS_INVALID_PARAMETER); } // exit the loop break; } if ('.' == *pChar) { ULONG LabelLength = DIFF(pChar - pLabel); // There must be at least one char in the label if (0 == LabelLength) { UlTraceError(PARSER, ("http!HttpValidateHostname: empty label\n" )); RETURN(STATUS_INVALID_PARAMETER); } // Label can't have more than 63 chars if (LabelLength > pCfg->MaxLabelLength) { UlTraceError(PARSER, ("http!HttpValidateHostname: overlong label, %lu\n", LabelLength )); RETURN(STATUS_INVALID_PARAMETER); } // Reset for the next label pLabel = pChar + STRLEN_LIT("."); continue; } // CODEWORK: handle DBCS characters if (!IS_URL_ILLEGAL_COMPUTERNAME(*pChar)) { if (!IS_HTTP_DIGIT(*pChar)) AlphaLabel = TRUE; if (pChar > pLabel) continue; // The first char of a label cannot be a hyphen. (Underscore?) if ('-' == *pChar) { UlTraceError(PARSER, ("http!HttpValidateHostname: " "'-' at beginning of label\n" )); RETURN(STATUS_INVALID_PARAMETER); } continue; } UlTraceError(PARSER, ("http!HttpValidateHostname: " "Invalid char in hostname, 0x%02X '%c', " "after '%.*s'\n", *pChar, IS_HTTP_PRINT(*pChar) ? 
*pChar : '?', DIFF(pChar - pHostname), pHostname )); RETURN(STATUS_INVALID_PARAMETER); } // loop through hostname ASSERT(pChar == pEnd || ':' == *pChar); if (AlphaLabel) { *pAddressType = 0; } else { // Let's see if it's a valid IPv4 address Status = RtlIpv4StringToAddressA( (PCSTR) pHostname, TRUE, // strict => 4 dotted decimal octets &pTerminator, &IPv4Address ); if (!NT_SUCCESS(Status)) { UlTraceError(PARSER, ("http!HttpValidateHostname: " "Invalid IPv4 address, %s\n", HttpStatusToString(Status) )); RETURN(Status); } if (pTerminator != (PCSTR) pChar) { ASSERT(pTerminator < (PCSTR) pChar); UlTraceError(PARSER, ("http!HttpValidateHostname: " "Invalid IPv4 address after %lu chars, " "0x%02X, '%c'\n", DIFF(pTerminator - (PCSTR) pHostname), *pTerminator, IS_HTTP_PRINT(*pTerminator) ? *pTerminator : '?' )); RETURN(STATUS_INVALID_PARAMETER); } *pAddressType = TDI_ADDRESS_TYPE_IP; } port: // // Parse the port number // // Check for overlong hostnames if (DIFF(pChar - pHostname) > pCfg->MaxHostnameLength) { UlTraceError(PARSER, ("http!HttpValidateHostname: overlong hostname, %lu\n", DIFF(pChar - pHostname) )); RETURN(STATUS_INVALID_PARAMETER); } if (pChar == pEnd) goto end; ASSERT(pHostname < pChar && pChar < pEnd); ASSERT(':' == *pChar); pChar += STRLEN_LIT(":"); ASSERT(pChar <= pEnd); // RFC 2616, section 3.2.2 "http URL", says: // "If the port is empty or not given, port 80 is assumed". 
if (pChar == pEnd) { Port = 80; goto end; } Status = HttpAnsiStringToUShort( pChar, pEnd - pChar, // must occupy all remaining chars FALSE, // no leading zeros permitted 10, (PUCHAR*) &pTerminator, &Port ); if (!NT_SUCCESS(Status)) { UlTraceError(PARSER, ("http!HttpValidateHostname: " "Invalid port number, %s\n", HttpStatusToString(Status) )); RETURN(STATUS_INVALID_PARAMETER); } ASSERT(pTerminator == (PCSTR) pEnd); if (0 == Port) { UlTraceError(PARSER, ("http!HttpValidateHostname: Port must not be zero.\n" )); RETURN(STATUS_INVALID_PARAMETER); } end: RETURN(STATUS_SUCCESS); } // HttpValidateHostname /***************************************************************************++ Routine Description: Convert to unicode Arguments: Return Value: NTSTATUS - Completion status. --***************************************************************************/ NTSTATUS HttpCopyUrl( IN PURL_C14N_CONFIG pCfg, OUT PWSTR pDestination, IN PCUCHAR pSource, IN ULONG SourceLength, OUT PULONG pBytesCopied, OUT PURL_ENCODING_TYPE pUrlEncodingType ) { NTSTATUS Status = STATUS_UNSUCCESSFUL; ULONG DecodeOrder = pCfg->AbsPathDecodeOrder; PAGED_CODE(); ASSERT(NULL != pDestination); ASSERT(NULL != pSource); ASSERT(NULL != pBytesCopied); ASSERT(NULL != pUrlEncodingType); if (0 == DecodeOrder || DecodeOrder != (DecodeOrder & UrlDecode_MaxMask)) { UlTraceError(PARSER, ("http!HttpCopyUrl: invalid DecodeOrder, 0x%lX\n", DecodeOrder )); RETURN(STATUS_INVALID_PARAMETER); } for ( ; 0 != DecodeOrder && !NT_SUCCESS(Status); DecodeOrder >>= UrlDecode_Shift ) { ULONG UrlEncoding = (DecodeOrder & UrlDecode_Mask); switch (UrlEncoding) { default: ASSERT(! 
"Impossible UrlDecodeOrder"); case UrlDecode_None: break; case UrlDecode_Ansi: case UrlDecode_Dbcs: case UrlDecode_Utf8: UlTraceVerbose(PARSER, ("http!HttpCopyUrl(%s, Src=%p, %lu)\n", HttppUrlEncodingToString(UrlEncoding), pSource, SourceLength )); Status = HttppCopyUrlByType( pCfg, (URL_ENCODING_TYPE) UrlEncoding, pDestination, pSource, SourceLength, pBytesCopied ); if (NT_SUCCESS(Status)) { *pUrlEncodingType = (URL_ENCODING_TYPE) UrlEncoding; UlTraceVerbose(PARSER, ("http!HttpCopyUrl(%s): " "(%lu) '%.*s' -> (%lu) '%ls'\n", HttppUrlEncodingToString(UrlEncoding), SourceLength, SourceLength, pSource, *pBytesCopied/sizeof(WCHAR), pDestination )); } break; }; } return Status; } // HttpCopyUrl /***************************************************************************++ Routine Description: This function can be told to copy UTF-8, ANSI, or DBCS URLs. Convert to Unicode Arguments: Return Value: NTSTATUS - Completion status. --***************************************************************************/ NTSTATUS HttppCopyUrlByType( IN PURL_C14N_CONFIG pCfg, IN URL_ENCODING_TYPE UrlEncoding, OUT PWSTR pDestination, IN PCUCHAR pSource, IN ULONG SourceLength, OUT PULONG pBytesCopied ) { PWSTR pDest; PCUCHAR pChar; ULONG BytesCopied; ULONG UnicodeChar; ULONG CharToSkip; #if DBG NTSTATUS Status; PFN_POPCHAR_ABSPATH pfnPopChar; PWSTR pSegment = pDestination; ULONG SegmentCount = 0; #endif // DBG // // Sanity check. // PAGED_CODE(); #if DBG if (UrlEncoding_Ansi == UrlEncoding) pfnPopChar = &HttppPopCharAbsPathAnsi; else if (UrlEncoding_Dbcs == UrlEncoding) pfnPopChar = &HttppPopCharAbsPathDbcs; else if (UrlEncoding_Utf8 == UrlEncoding) pfnPopChar = &HttppPopCharAbsPathUtf8; else { ASSERT(! 
"Invalid UrlEncoding"); RETURN(STATUS_INVALID_PARAMETER); } #else // !DBG UNREFERENCED_PARAMETER(pCfg); UNREFERENCED_PARAMETER(UrlEncoding); #endif // DBG pDest = pDestination; BytesCopied = 0; pChar = pSource; CharToSkip = 1; while ((int)SourceLength > 0) { ULONG NextUnicodeChar = FastPopChars[*pChar]; // // Grab the next character. // // All clean chars have a non-zero entry in FastPopChars[]. // All clean chars are in the US-ASCII range, 0-127. // ASSERT(0 != NextUnicodeChar); ASSERT(IS_ASCII(NextUnicodeChar)); #if DBG Status = (*pfnPopChar)( pChar, SourceLength, pCfg->PercentUAllowed, pCfg->AllowRestrictedChars, &UnicodeChar, &CharToSkip ); ASSERT(NT_SUCCESS(Status)); ASSERT(UnicodeChar == NextUnicodeChar); ASSERT(CharToSkip == 1); #endif // !DBG UnicodeChar = (WCHAR) NextUnicodeChar; CharToSkip = 1; #if DBG // Because HttpFindUrlToken() marks as dirty any URLs that // (appear to) have too many segments or overlong segments, // we should never hit these assertions if (FORWARD_SLASH == UnicodeChar) { ULONG SegmentLength = DIFF(pDest - pSegment); // The segment length should be within bounds ASSERT(SegmentLength > 0 || pDestination == pSegment); ASSERT(SegmentLength <= pCfg->UrlSegmentMaxLength + WCSLEN_LIT(L"/")); pSegment = pDest; ++SegmentCount; // There should not be too many segments ASSERT(SegmentCount <= pCfg->UrlSegmentMaxCount); } #endif // DBG EMIT_LITERAL_CHAR(UnicodeChar, pDest, BytesCopied); pChar += CharToSkip; SourceLength -= CharToSkip; } // // terminate the string, it hasn't been done in the loop // ASSERT((pDest-1)[0] != UNICODE_NULL); pDest[0] = UNICODE_NULL; *pBytesCopied = BytesCopied; ASSERT(DIFF(pDest - pSegment) > 0); ASSERT(DIFF(pDest - pSegment) <= pCfg->UrlSegmentMaxLength + WCSLEN_LIT(L"/")); ASSERT(SegmentCount < pCfg->UrlSegmentMaxCount); return STATUS_SUCCESS; } // HttppCopyUrlByType /***************************************************************************++ Routine Description: Unescape Convert backslash to forward slash Remove 
double slashes (empty directiories names) - e.g. // or \\ Handle /./ Handle /../ Convert to unicode Arguments: Return Value: NTSTATUS - Completion status. Note: Any changes to this code may require changes for the fast path code too. The fast path is HttpCopyUrl. --***************************************************************************/ NTSTATUS HttpCleanAndCopyUrl( IN PURL_C14N_CONFIG pCfg, IN URL_PART UrlPart, OUT PWSTR pDestination, IN PCUCHAR pSource, IN ULONG SourceLength, OUT PULONG pBytesCopied, OUT PWSTR * ppQueryString OPTIONAL, OUT PURL_ENCODING_TYPE pUrlEncodingType ) { NTSTATUS Status = STATUS_UNSUCCESSFUL; ULONG DecodeOrder = pCfg->AbsPathDecodeOrder; PAGED_CODE(); ASSERT(NULL != pDestination); ASSERT(NULL != pSource); ASSERT(NULL != pBytesCopied); ASSERT(NULL != pUrlEncodingType); if (0 == DecodeOrder || DecodeOrder != (DecodeOrder & UrlDecode_MaxMask)) { UlTraceError(PARSER, ("http!HttpCleanAndCopyUrl: invalid DecodeOrder, 0x%lX\n", DecodeOrder )); RETURN(STATUS_INVALID_PARAMETER); } for ( ; 0 != DecodeOrder && !NT_SUCCESS(Status); DecodeOrder >>= UrlDecode_Shift ) { ULONG UrlEncoding = (DecodeOrder & UrlDecode_Mask); switch (UrlEncoding) { default: ASSERT(! 
"Impossible UrlDecodeOrder"); case UrlDecode_None: break; case UrlDecode_Ansi: case UrlDecode_Dbcs: case UrlDecode_Utf8: UlTraceVerbose(PARSER, ("http!HttpCleanAndCopyUrl(%s, Src=%p, %lu)\n", HttppUrlEncodingToString(UrlEncoding), pSource, SourceLength )); Status = HttppCleanAndCopyUrlByType( pCfg, (URL_ENCODING_TYPE) UrlEncoding, UrlPart, pDestination, pSource, SourceLength, pBytesCopied, ppQueryString ); if (NT_SUCCESS(Status)) { *pUrlEncodingType = (URL_ENCODING_TYPE) UrlEncoding; UlTraceVerbose(PARSER, ("http!HttpCleanAndCopyUrl(%s): " "(%lu) '%.*s' -> (%lu) '%ls'\n", HttppUrlEncodingToString(UrlEncoding), SourceLength, SourceLength, pSource, *pBytesCopied/sizeof(WCHAR), pDestination )); } break; }; } return Status; } // HttpCleanAndCopyUrl // // HttppCleanAndCopyUrlByType() uses StateFromStateAndToken[][] and // ActionFromStateAndToken[][] to handle "//", "/./", and "/../" productions. // #define TOK_STATE(state, other, dot, eos, slash) \ { \ URL_STATE_ ## other, \ URL_STATE_ ## dot, \ URL_STATE_ ## eos, \ URL_STATE_ ## slash \ } // // CanonStateFromStateAndToken[][] is used by HttpParseUrl() to reject // "//", "/./", and "/../" sequences, as these URLs are supposed to // be in canonical form already. // const URL_STATE CanonStateFromStateAndToken[URL_STATE_MAX][URL_TOKEN_MAX] = { // State \ Token: Other '.' EOS '/' TOK_STATE( START, START, START, END, SLASH), TOK_STATE( SLASH, START, SLASH_DOT, END, ERROR), TOK_STATE( SLASH_DOT, START, SLASH_DOT_DOT, END, ERROR), TOK_STATE( SLASH_DOT_DOT, START, START, ERROR, ERROR), TOK_STATE( END, END, END, END, END), TOK_STATE( ERROR, ERROR, ERROR, ERROR, ERROR) }; // // StateFromStateAndToken[][] says which new state to transition to given // the current state and the token we saw. Used by HttppCleanAndCopyUrlByType() // const URL_STATE StateFromStateAndToken[URL_STATE_MAX][URL_TOKEN_MAX] = { // State \ Token: Other '.' 
EOS '/' TOK_STATE( START, START, START, END, SLASH), TOK_STATE( SLASH, START, SLASH_DOT, END, SLASH), TOK_STATE( SLASH_DOT, START, SLASH_DOT_DOT, END, SLASH), TOK_STATE( SLASH_DOT_DOT, START, START, END, SLASH), TOK_STATE( END, END, END, END, END), TOK_STATE( ERROR, ERROR, ERROR, ERROR, ERROR) }; // // ActionFromStateAndToken[][] says what action to perform based on // the current state and the current token // #define NEW_ACTION(state, other, dot, eos, slash) \ { \ ACTION_ ## other, \ ACTION_ ## dot, \ ACTION_ ## eos, \ ACTION_ ## slash \ } const URL_ACTION ActionFromStateAndToken[URL_STATE_MAX][URL_TOKEN_MAX] = { // State \ Token: Other '.' EOS '/' NEW_ACTION(START, EMIT_CH, EMIT_CH, NOTHING, EMIT_CH), NEW_ACTION(SLASH, EMIT_CH, NOTHING, NOTHING, NOTHING), NEW_ACTION(SLASH_DOT, EMIT_DOT_CH, NOTHING, NOTHING, NOTHING), NEW_ACTION(SLASH_DOT_DOT, EMIT_DOT_DOT_CH, EMIT_DOT_DOT_CH, BACKUP, BACKUP), NEW_ACTION(END, NOTHING, NOTHING, NOTHING, NOTHING) }; #if DBG PCSTR HttppUrlActionToString( URL_ACTION Action) { switch (Action) { case ACTION_NOTHING: return "NOTHING"; case ACTION_EMIT_CH: return "EMIT_CH"; case ACTION_EMIT_DOT_CH: return "EMIT_DOT_CH"; case ACTION_EMIT_DOT_DOT_CH: return "EMIT_DOT_DOT_CH"; case ACTION_BACKUP: return "BACKUP"; case ACTION_MAX: return "MAX"; default: ASSERT(! "Invalid URL_ACTION"); return "ACTION_???"; } } // HttppUrlActionToString PCSTR HttppUrlStateToString( URL_STATE UrlState) { switch (UrlState) { case URL_STATE_START: return "START"; case URL_STATE_SLASH: return "SLASH"; case URL_STATE_SLASH_DOT: return "SLASH_DOT"; case URL_STATE_SLASH_DOT_DOT: return "SLASH_DOT_DOT"; case URL_STATE_END: return "END"; case URL_STATE_ERROR: return "ERROR"; case URL_STATE_MAX: return "MAX"; default: ASSERT(! 
"Invalid URL_STATE"); return "URL_STATE_???"; } } // HttppUrlStateToString PCSTR HttppUrlTokenToString( URL_STATE_TOKEN UrlToken) { switch (UrlToken) { case URL_TOKEN_OTHER: return "OTHER"; case URL_TOKEN_DOT: return "DOT"; case URL_TOKEN_EOS: return "EOS"; case URL_TOKEN_SLASH: return "SLASH"; case URL_TOKEN_MAX: return "MAX"; default: ASSERT(! "Invalid URL_STATE_TOKEN"); return "URL_TOKEN_???"; } } // HttppUrlTokenToString #endif // DBG PCSTR HttpSiteTypeToString( HTTP_URL_SITE_TYPE SiteType ) { switch (SiteType) { case HttpUrlSite_None: return "None"; case HttpUrlSite_Name: return "Name"; case HttpUrlSite_IP: return "IP"; case HttpUrlSite_NamePlusIP: return "Name+IP"; case HttpUrlSite_WeakWildcard: return "Weak"; case HttpUrlSite_StrongWildcard: return "Strong"; case HttpUrlSite_Max: return "Max"; default: ASSERT(! "Invalid HTTP_URL_SITE_TYPE"); return "????"; } } /***************************************************************************++ Routine Description: This function can be told to clean up UTF-8, ANSI, or DBCS URLs. Unescape Convert backslash to forward slash Remove double slashes (empty directiories names) - e.g. // or \\ Handle /./ Handle /../ Convert to unicode Arguments: Return Value: NTSTATUS - Completion status. Note: Any changes to this code may require changes for the fast path code too. The fast path is HttppCopyUrlByType. 
--***************************************************************************/

NTSTATUS
HttppCleanAndCopyUrlByType(
    IN      PURL_C14N_CONFIG    pCfg,
    IN      URL_ENCODING_TYPE   UrlEncoding,
    IN      URL_PART            UrlPart,
    OUT     PWSTR               pDestination,
    IN      PCUCHAR             pSource,
    IN      ULONG               SourceLength,
    OUT     PULONG              pBytesCopied,
    OUT     PWSTR *             ppQueryString OPTIONAL
    )
{
    NTSTATUS            Status;
    PWSTR               pDest;
    PCUCHAR             pChar;
    ULONG               CharToSkip;
    ULONG               BytesCopied;
    PWSTR               pQueryString;
    URL_STATE           UrlState = URL_STATE_START;
    URL_STATE_TOKEN     UrlToken = URL_TOKEN_OTHER;
    URL_ACTION          Action = ACTION_NOTHING;
    ULONG               UnicodeChar;
    BOOLEAN             MakeCanonical;
    PWCHAR              pFastPopChar;
    PFN_POPCHAR_ABSPATH pfnPopChar;
    PWSTR               pSegment = pDestination;    // start of current segment
    ULONG               SegmentCount = 0;
    BOOLEAN             TestSegment = FALSE;
#if DBG
    ULONG               OriginalSourceLength = SourceLength;    // for tracing
#endif

    //
    // Sanity check.
    //

    PAGED_CODE();

    ASSERT(UrlPart_AbsPath == UrlPart);

    // Pick the slow-path decoder that matches the requested encoding
    if (UrlEncoding_Ansi == UrlEncoding)
        pfnPopChar = &HttppPopCharAbsPathAnsi;
    else if (UrlEncoding_Dbcs == UrlEncoding)
        pfnPopChar = &HttppPopCharAbsPathDbcs;
    else if (UrlEncoding_Utf8 == UrlEncoding)
        pfnPopChar = &HttppPopCharAbsPathUtf8;
    else
    {
        ASSERT(! "Invalid UrlEncoding");
        RETURN(STATUS_INVALID_PARAMETER);
    }

    ASSERT(FORWARD_SLASH == *pSource);

    pDest = pDestination;
    pQueryString = NULL;
    BytesCopied = 0;

    pChar = pSource;
    CharToSkip = 0;     // nothing to skip on the first pass through the loop

    // NOTE(review): 0 is presumably URL_STATE_START, the value the
    // declaration above already assigned -- confirm against the enum
    UrlState = 0;

    MakeCanonical = (BOOLEAN) (UrlPart == UrlPart_AbsPath);

    // The table-driven fast path is only used for a UTF-8 AbsPath.
    // NOTE(review): DummyPopChars presumably holds only zeros, so every
    // lookup falls through to pfnPopChar -- confirm at its definition
    if (UrlEncoding == UrlEncoding_Utf8  &&  UrlPart != UrlPart_QueryString)
    {
        pFastPopChar = FastPopChars;
    }
    else
    {
        pFastPopChar = DummyPopChars;
    }

    while (SourceLength > 0)
    {
        //
        // advance !  it's at the top of the loop to enable ANSI_NULL to
        // come through ONCE
        //

        ASSERT(CharToSkip <= SourceLength);

        pChar += CharToSkip;
        SourceLength -= CharToSkip;

        //
        // well?  have we hit the end?
        //

        if (SourceLength == 0)
        {
            // Feed one terminating NUL through the state machine so that
            // a trailing "/." or "/.." is still handled
            UnicodeChar = UNICODE_NULL;
            CharToSkip = 1;
        }
        else
        {
            //
            // Nope.  Peek briefly to see if we hit the query string
            //

            if (UrlPart == UrlPart_AbsPath  &&  pChar[0] == QUESTION_MARK)
            {
                ASSERT(pQueryString == NULL);

                //
                // remember its location
                //

                pQueryString = pDest;

                //
                // let it fall through ONCE to the canonical
                // in order to handle a trailing "/.." like
                // "http://foobar:80/foo/bar/..?v=1&v2"
                //

                TestSegment = TRUE;
                UnicodeChar = QUESTION_MARK;
                CharToSkip = 1;

                //
                // now we are cleaning the query string
                //

                UrlPart = UrlPart_QueryString;

                UlTraceVerbose(PARSER, ("QueryString @ %p\n", pQueryString));

                //
                // cannot use fast path for PopChar anymore
                //

                pFastPopChar = DummyPopChars;
                pfnPopChar = HttppPopCharQueryString;
            }
            else
            {
                ULONG NextUnicodeChar = pFastPopChar[*pChar];

                //
                // Grab the next character. Try to be fast for the
                // normal character case. Otherwise call PopChar.
                //

                if (NextUnicodeChar == 0)
                {
                    // Slow path: decode via the encoding-specific routine
                    Status = (*pfnPopChar)(
                                    pChar,
                                    SourceLength,
                                    pCfg->PercentUAllowed,
                                    pCfg->AllowRestrictedChars,
                                    &UnicodeChar,
                                    &CharToSkip
                                    );

                    if (NT_SUCCESS(Status) == FALSE)
                        goto end;
                }
                else
                {
#if DBG
                    // Cross-check the table lookup against the decoder
                    Status = (*pfnPopChar)(
                                    pChar,
                                    SourceLength,
                                    pCfg->PercentUAllowed,
                                    pCfg->AllowRestrictedChars,
                                    &UnicodeChar,
                                    &CharToSkip
                                    );

                    ASSERT(NT_SUCCESS(Status));
                    ASSERT(UnicodeChar == NextUnicodeChar);
                    ASSERT(CharToSkip == 1);
#endif // DBG
                    UnicodeChar = (WCHAR) NextUnicodeChar;
                    CharToSkip = 1;
                }
            }
        }

        if (!MakeCanonical)
        {
            // Inside the query string: only end-of-string is special
            UrlToken = (UnicodeChar == UNICODE_NULL)
                            ? URL_TOKEN_EOS
                            : URL_TOKEN_OTHER;
            TestSegment = FALSE;
        }
        else
        {
            //
            // now use the state machine to make it canonical.
            //

            //
            // did we just hit the query string?  this will only happen once
            // that we take this branch after hitting it, as we stop
            // processing after hitting it.
            //

            if (UrlPart == UrlPart_QueryString)
            {
                //
                // treat this just like we hit a NULL, EOS.
                //

                ASSERT(QUESTION_MARK == UnicodeChar);

                UrlToken = URL_TOKEN_EOS;
                TestSegment = TRUE;
            }
            else
            {
                //
                // otherwise based the new state off of the char we
                // just popped.
                //

                switch (UnicodeChar)
                {
                case UNICODE_NULL:
                    UrlToken = URL_TOKEN_EOS;
                    TestSegment = TRUE;
                    break;

                case DOT:
                    UrlToken = URL_TOKEN_DOT;
                    TestSegment = FALSE;
                    break;

                case FORWARD_SLASH:
                    UrlToken = URL_TOKEN_SLASH;
                    TestSegment = TRUE;
                    break;

                default:
                    UrlToken = URL_TOKEN_OTHER;
                    TestSegment = FALSE;
                    break;
                }
            }
        }

        // The action is looked up from the state we are in *before*
        // the transition for this token is taken (at the loop bottom)
        Action = ActionFromStateAndToken[UrlState][UrlToken];

        IF_DEBUG2BOTH(PARSER, VERBOSE)
        {
            ULONG   i;
            UCHAR   HexBuff[5*12 + 10];
            PUCHAR  p = HexBuff;
            UCHAR   Byte;

            ASSERT(CharToSkip <= 4 * STRLEN_LIT("%NN"));

            // Generate something like
            // "[25 65 32 25 38 30 25 39 35] '%e2%80%95'"

            *p++ = '[';

            for (i = 0;  i < CharToSkip;  ++i)
            {
                const static char hexArray[] = "0123456789ABCDEF";

                Byte = pChar[i];
                *p++ = hexArray[Byte >> 4];
                *p++ = hexArray[Byte & 0xf];
                *p++ = ' ';
            }

            p[-1] = ']'; // overwrite last ' '
            *p++ = ' ';
            *p++ = '\'';

            for (i = 0;  i < CharToSkip;  ++i)
            {
                Byte = pChar[i];
                *p++ = (IS_HTTP_PRINT(Byte) ? Byte : '?');
            }

            *p++ = '\'';
            *p++ = '\0';

            ASSERT(DIFF(p - HexBuff) <= DIMENSION(HexBuff));

            UlTrace(PARSER,
                    ("http!HttppCleanAndCopyUrlByType(%s): "
                     "(%lu) %s -> U+%04lX '%c': "
                     "[%s][%s] -> %s, %s%s\n",
                     HttppUrlEncodingToString(UrlEncoding),
                     CharToSkip, HexBuff, UnicodeChar,
                     IS_ANSI(UnicodeChar) && IS_HTTP_PRINT(UnicodeChar)
                        ? (UCHAR) UnicodeChar : '?',
                     HttppUrlStateToString(UrlState),
                     HttppUrlTokenToString(UrlToken),
                     HttppUrlStateToString(
                         StateFromStateAndToken[UrlState][UrlToken]),
                     HttppUrlActionToString(Action),
                     TestSegment ? ", TestSegment" : ""
                     ));

        } // IF_DEBUG2BOTH(PARSER, VERBOSE)

        //
        // Segment length and segment count checks
        //

        if (TestSegment)
        {
            // Length, in WCHARs, of what has been emitted since pSegment
            ULONG SegmentLength = DIFF(pDest - pSegment);

            ASSERT(pSegment <= pDest);

            UlTraceVerbose(PARSER,
                           ("http!HttppCleanAndCopyUrlByType: "
                            "Segment[%lu] %p (%lu) = '%.*ls'\n",
                            SegmentCount, pSegment, SegmentLength,
                            SegmentLength, pSegment
                            ));

            // Reject if segment too long
            if (SegmentLength > pCfg->UrlSegmentMaxLength + WCSLEN_LIT(L"/"))
            {
                UlTraceError(PARSER, (
                            "http!HttppCleanAndCopyUrlByType: "
                            "Segment too long: %lu\n",
                            SegmentLength
                            ));

                RETURN(STATUS_INVALID_DEVICE_REQUEST);
            }

            pSegment = pDest;

            // Reject if too many path segments
            if (Action != ACTION_NOTHING)
            {
                if (pSegment == pDestination)
                {
                    SegmentCount = 0;
                }
                else if (++SegmentCount > pCfg->UrlSegmentMaxCount)
                {
                    UlTraceError(PARSER, (
                                "http!HttppCleanAndCopyUrlByType: "
                                "Too many segments: %lu\n",
                                SegmentCount
                                ));

                    RETURN(STATUS_INVALID_DEVICE_REQUEST);
                }
            }
        }

        //
        // Perform the action associated with the state.
        //

        // The three EMIT cases cascade on purpose: each one emits a
        // pending '.' and then drops into the next
        switch (Action)
        {
        case ACTION_EMIT_DOT_DOT_CH:

            EMIT_LITERAL_CHAR(DOT, pDest, BytesCopied);

            // fall through

        case ACTION_EMIT_DOT_CH:

            EMIT_LITERAL_CHAR(DOT, pDest, BytesCopied);

            // fall through

        case ACTION_EMIT_CH:

            EMIT_CHAR(
                UnicodeChar,
                pDest,
                BytesCopied,
                Status,
                pCfg->AllowRestrictedChars
                );

            // fall through

        case ACTION_NOTHING:
            break;

        case ACTION_BACKUP:

            //
            // pDest currently points 1 past the last '/'.  backup over it and
            // find the preceding '/', set pDest to 1 past that one.
            //

            //
            // backup to the '/'
            //

            pDest       -= 1;
            BytesCopied -= sizeof(WCHAR);

            ASSERT(pDest[0] == FORWARD_SLASH);

            //
            // are we at the start of the string?  that's bad, can't go back!
            //

            if (pDest == pDestination)
            {
                ASSERT(BytesCopied == 0);

                UlTraceError(PARSER, (
                            "http!HttppCleanAndCopyUrl: "
                            "Can't back up for \"/../\"\n"
                            ));

                Status = STATUS_OBJECT_PATH_INVALID;
                goto end;
            }

            //
            // back up over the '/'
            //

            pDest       -= 1;
            BytesCopied -= sizeof(WCHAR);

            ASSERT(pDest > pDestination);

            //
            // now find the previous slash
            //

            while (pDest > pDestination  &&  pDest[0] != FORWARD_SLASH)
            {
                pDest       -= 1;
                BytesCopied -= sizeof(WCHAR);
            }

            //
            // Adjust segment trackers downwards
            //

            pSegment = pDest;

            if (pSegment == pDestination)
                SegmentCount = 0;
            else
                --SegmentCount;

            //
            // we already have a slash, so don't have to store one.
            //

            ASSERT(pDest[0] == FORWARD_SLASH);

            //
            // simply skip it, as if we had emitted it just now
            //

            pDest       += 1;
            BytesCopied += sizeof(WCHAR);

            break;

        default:
            ASSERT(!"http!HttppCleanAndCopyUrl: "
                    "Invalid action code in state table!");
            Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
            goto end;
        }

        //
        // Just hit the query string ?
        //

        if (MakeCanonical  &&  UrlPart == UrlPart_QueryString)
        {
            //
            // Stop canonical processing
            //

            MakeCanonical = FALSE;

            //
            // Need to emit the '?', it wasn't emitted above
            //

            ASSERT(ActionFromStateAndToken[UrlState][UrlToken]
                    != ACTION_EMIT_CH);

            //
            // remember its location (in case we backed up)
            //

            pQueryString = pDest;

            EMIT_LITERAL_CHAR(QUESTION_MARK, pDest, BytesCopied);

            // reset
            UrlToken = URL_TOKEN_OTHER;
            UrlState = URL_STATE_START;
        }

        // update the URL state
        UrlState = StateFromStateAndToken[UrlState][UrlToken];

        ASSERT(URL_STATE_ERROR != UrlState);
    }

    //
    // terminate the string, it hasn't been done in the loop
    //

    ASSERT((pDest-1)[0] != UNICODE_NULL);

    pDest[0] = UNICODE_NULL;
    *pBytesCopied = BytesCopied;

    // UrlMaxLength is compared in WCHARs; BytesCopied is in bytes
    if (BytesCopied > pCfg->UrlMaxLength * sizeof(WCHAR))
    {
        UlTraceError(PARSER, (
                    "http!HttppCleanAndCopyUrlByType: "
                    "URL too long: %lu\n",
                    BytesCopied
                    ));

        RETURN(STATUS_INVALID_DEVICE_REQUEST);
    }

    // Only reported on success; NULL if the URL had no '?'
    if (ppQueryString != NULL)
    {
        *ppQueryString = pQueryString;
    }

    UlTraceVerbose(PARSER,
                    ("http!HttppCleanAndCopyUrlByType: "
                     "(%lu) '%.*s' -> (%lu) '%.*ls', %squerystring\n",
                     OriginalSourceLength,
                     OriginalSourceLength, pSource,
                     BytesCopied/sizeof(WCHAR),
                     BytesCopied/sizeof(WCHAR), pDestination,
                     pQueryString != NULL ? "" : "no "
                     ));

    Status = STATUS_SUCCESS;

end:
    return Status;

}   // HttppCleanAndCopyUrlByType



/*++

Routine Description:

    A utility routine to find a Url token. We take an input pointer, skip any
    preceding LWS, then scan the token until we find either LWS or a CRLF
    pair. We also mark the request to have a "Clean" Url

Arguments:

    pCfg            - limits used to decide cleanliness: UrlSegmentMaxLength,
                      UrlSegmentMaxCount, UrlMaxLength
    pBuffer         - Buffer to search for token.
    BufferLength    - Length of data pointed to by pBuffer.
    ppTokenStart    - Where to return the start of the token, if we locate
                      its delimiter.
    pTokenLength    - Where to return the length of the token.
    pRawUrlClean    - where to return cleanliness of URL

Return Value:

    STATUS_SUCCESS if no parsing errors in the URL.
    We also return, in *ppTokenStart, a pointer to the token we found,
    or NULL if we don't find a whitespace-delimited token.
    pRawUrlClean flag may be set.

--*/

NTSTATUS
HttpFindUrlToken(
    IN  PURL_C14N_CONFIG    pCfg,
    IN  PCUCHAR             pBuffer,
    IN  ULONG               BufferLength,
    OUT PUCHAR*             ppTokenStart,
    OUT PULONG              pTokenLength,
    OUT PBOOLEAN            pRawUrlClean
    )
{
    PCUCHAR pTokenStart;
    PCUCHAR pSegment;
    UCHAR   CurrentChar;
    UCHAR   PreviousChar;
    ULONG   SegmentCount = 0;
    ULONG   TokenLength;

    //
    // Sanity check.
    //

    PAGED_CODE();

    ASSERT(NULL != pBuffer);
    ASSERT(NULL != ppTokenStart);
    ASSERT(NULL != pTokenLength);
    ASSERT(NULL != pRawUrlClean);

    //
    // Assume Clean RawUrl
    //

    *pRawUrlClean = TRUE;
    *ppTokenStart = NULL;
    *pTokenLength = 0;

    //
    // First, skip any preceding LWS.
    //

    while (BufferLength > 0  &&  IS_HTTP_LWS(*pBuffer))
    {
        pBuffer++;
        BufferLength--;
    }

    // If we stopped because we ran out of buffer, bail.
    if (BufferLength == 0)
    {
        return STATUS_SUCCESS;
    }

    pTokenStart = pBuffer;
    PreviousChar = ANSI_NULL;

    // This will usually point to a '/', but it won't if this is an AbsURI.
// It doesn't really matter, since only a few borderline cases will // be marked as dirty that might not otherwise be. pSegment = pBuffer; // Now skip over the token, until we see either LWS or a CR or LF. while ( BufferLength != 0 ) { CurrentChar = *pBuffer; // must check for WS [ \t\r\n] first, since \t, \r, & \n are CTL chars! if ( IS_HTTP_WS_TOKEN(CurrentChar) ) { break; } if ( IS_HTTP_CTL(CurrentChar) ) { *pRawUrlClean = FALSE; *ppTokenStart = NULL; UlTraceError(PARSER, ( "http!HttpFindUrlToken: " "Found control char: %02X\n", CurrentChar )); RETURN(STATUS_INVALID_DEVICE_REQUEST); } // // URL is NOT clean if it contains any of the following patterns // // a. back slash "\" // b. dot, forward slash | forward slash, forward slash "./" | "//" // c. forward slash, dot | dot, dot "/." | ".." // d. question mark (querystring) "?" // e. percent (hex escape) "%" // f. raw bytes with high bit set, >= 0x80 // // These are conservative estimates of "Clean"; some clean URLs may not // be marked as clean. For such URLs, we'll skip the fast path but at // no loss of functionality. // if ( IS_URL_DIRTY(CurrentChar) ) { // Only do the checks if it's still clean if (*pRawUrlClean) { if (CurrentChar == FORWARD_SLASH || CurrentChar == DOT) { if (PreviousChar == FORWARD_SLASH || PreviousChar == DOT) { *pRawUrlClean = FALSE; } } else { *pRawUrlClean = FALSE; } } if (CurrentChar == FORWARD_SLASH) { ULONG SegmentLength = DIFF(pBuffer - pSegment); // If the segment contains %-hex-escaped chars, it may become // acceptably short after PopChar() processing. Let // HttppCleanAndCopyUrlByType() figure it out. if (SegmentLength > pCfg->UrlSegmentMaxLength) *pRawUrlClean = FALSE; pSegment = pBuffer; // If this is an AbsURI, instead of an AbsPath, the // segment count will be higher, because of the two slashes // before the hostname. Also, "/../", "/./", and "//" // minimization will reduce the final count of segments. // Again, let HttppCleanAndCopyUrlByType() figure it out. 
if (++SegmentCount > pCfg->UrlSegmentMaxCount) *pRawUrlClean = FALSE; } } PreviousChar = CurrentChar; pBuffer++; BufferLength--; } // See why we stopped. if (0 == BufferLength) { *pRawUrlClean = FALSE; // Ran out of buffer before end of token. return STATUS_SUCCESS; } ASSERT(IS_HTTP_WS_TOKEN(*pBuffer)); TokenLength = DIFF(pBuffer - pTokenStart); if (0 == TokenLength) { UlTraceError(PARSER, ("http!HttpFindUrlToken: Found empty token\n")); RETURN(STATUS_INVALID_DEVICE_REQUEST); } // Check the final segment if (DIFF(pBuffer - pSegment) > pCfg->UrlSegmentMaxLength) *pRawUrlClean = FALSE; if (++SegmentCount > pCfg->UrlSegmentMaxCount) *pRawUrlClean = FALSE; if (TokenLength > pCfg->UrlMaxLength) *pRawUrlClean = FALSE; // Success! Set the token length and return the start of the token. *pTokenLength = TokenLength; *ppTokenStart = (PUCHAR) pTokenStart; return STATUS_SUCCESS; } // HttpFindUrlToken /*++ Routine Description: Parse an IPv6 address from a Unicode buffer. Must be delimited by []. May contain a scope ID. Arguments: pBuffer - Buffer to parse. Must point to '['. BufferLength - Length of data pointed to by pBuffer. ScopeIdAllowed - if TRUE, an optional scope ID may be present pSockAddr6 - Where to return the parsed IPv6 address ppEnd - On success, points to character after ']' Return Value: STATUS_SUCCESS if no parsing errors in the IPv6 address. --*/ NTSTATUS HttppParseIPv6Address( IN PCWSTR pBuffer, IN ULONG BufferLength, IN BOOLEAN ScopeIdAllowed, OUT PSOCKADDR_IN6 pSockAddr6, OUT PCWSTR* ppEnd ) { NTSTATUS Status; PCWSTR pEnd = pBuffer + BufferLength; PCWSTR pChar; PWSTR pTerminator; ULONG ScopeTemp; ASSERT(NULL != pBuffer); ASSERT(0 < BufferLength); ASSERT(NULL != pSockAddr6); ASSERT(NULL != ppEnd); RtlZeroMemory(pSockAddr6, sizeof(*pSockAddr6)); *ppEnd = NULL; pSockAddr6->sin6_family = TDI_ADDRESS_TYPE_IP6; // Caller guarantees this ASSERT(L'[' == *pBuffer); // Empty brackets? 
if (BufferLength < WCSLEN_LIT(L"[0]") || L']' == pBuffer[1]) { UlTraceError(PARSER, ("http!HttppParseIPv6Address: IPv6 address too short\n" )); RETURN(STATUS_INVALID_PARAMETER); } for (pChar = pBuffer + WCSLEN_LIT(L"["); pChar < pEnd; ++pChar) { if (IS_ASCII(*pChar)) { if (L']' == *pChar || L'%' == *pChar) break; // Dots are allowed because the last 32 bits may be represented // in IPv4 dotted-octet notation if (IS_HTTP_HEX(*pChar) || L':' == *pChar || L'.' == *pChar) continue; } UlTraceError(PARSER, ("http!HttppParseIPv6Address: " "Invalid char in IPv6 address, U+%04X '%c', " "after %lu chars, '%.*ls'\n", *pChar, IS_ANSI(*pChar) && IS_HTTP_PRINT(*pChar) ? *pChar : '?', DIFF(pChar - pBuffer), DIFF(pChar - pBuffer), pBuffer )); RETURN(STATUS_INVALID_PARAMETER); } if (pChar == pEnd) { UlTraceError(PARSER, ("http!HttppParseIPv6Address: No ']' for IPv6 address\n" )); RETURN(STATUS_INVALID_PARAMETER); } ASSERT(pChar < pEnd); ASSERT(L']' == *pChar || L'%' == *pChar); // Let the RTL routine do the hard work of parsing IPv6 addrs Status = RtlIpv6StringToAddressW( pBuffer + WCSLEN_LIT(L"["), &pTerminator, &pSockAddr6->sin6_addr ); if (! NT_SUCCESS(Status)) { UlTraceError(PARSER, ("http!HttppParseIPv6Address: " "Invalid IPv6 address, %s\n", HttpStatusToString(Status) )); RETURN(Status); } if (pTerminator != pChar) { UlTraceError(PARSER, ("http!HttppParseIPv6Address: " "Invalid IPv6 terminator, U+%04X, '%c'\n", *pTerminator, IS_ANSI(*pTerminator) && IS_HTTP_PRINT(*pTerminator) ? *pTerminator : '?' )); RETURN(STATUS_INVALID_PARAMETER); } // Is a scopeid present? 
if (L'%' != *pChar) { ASSERT(L']' == *pChar); pSockAddr6->sin6_scope_id = 0; } else { PCWSTR pScopeEnd; // Skip the '%' denoting a scope ID pChar += WCSLEN_LIT(L"%"); if (!ScopeIdAllowed) { UlTraceError(PARSER, ("http!HttppParseIPv6Address: No scope ID allowed\n" )); RETURN(STATUS_INVALID_PARAMETER); } if (pChar == pEnd) { UlTraceError(PARSER, ("http!HttppParseIPv6Address: " "No IPv6 scope ID after '%%'\n" )); RETURN(STATUS_INVALID_PARAMETER); } pScopeEnd = pChar; do { if (*pScopeEnd < L'0' || *pScopeEnd > L'9') { UlTraceError(PARSER, ("http!HttppParseIPv6Address: " "Invalid digit in IPv6 scope ID, " "U+%04X, '%c'\n", *pScopeEnd, IS_ANSI(*pScopeEnd) && IS_HTTP_PRINT(*pScopeEnd) ? *pScopeEnd : '?' )); RETURN(STATUS_INVALID_PARAMETER); } } while (++pScopeEnd < pEnd && L']' != *pScopeEnd); ASSERT(pScopeEnd > pChar); if (pScopeEnd == pEnd) { UlTraceError(PARSER, ("http!HttppParseIPv6Address: " "No ']' after IPv6 scope ID\n" )); RETURN(STATUS_INVALID_PARAMETER); } ASSERT(L']' == *pScopeEnd); Status = HttpWideStringToULong( pChar, pScopeEnd - pChar, FALSE, // no leading zeros permitted 10, &pTerminator, &ScopeTemp ); if (!NT_SUCCESS(Status)) { UlTraceError(PARSER, ("http!HttppParseIPv6Address: " "Invalid scopeID, %s\n", HttpStatusToString(Status) )); RETURN(STATUS_INVALID_PARAMETER); } // Scope ID does not get swapped to Network Byte Order *(UNALIGNED64 ULONG *)&pSockAddr6->sin6_scope_id = ScopeTemp; ASSERT(pTerminator == pScopeEnd); pChar = pScopeEnd; } // '%' handling ASSERT(pChar < pEnd); ASSERT(L']' == *pChar); // Skip the terminating ']' pChar += WCSLEN_LIT(L"]"); *ppEnd = pChar; RETURN(STATUS_SUCCESS); } // HttppParseIPv6Address /*++ Routine Description: Print an IPv4 or IPv6 address as Unicode. Arguments: pSockAddr - The IP address to print pBuffer - Buffer to print to. Assumed to be large enough. 
Return Value: Number of wide chars printed (the length) --*/ ULONG HttppPrintIpAddressW( IN PSOCKADDR pSockAddr, OUT PWSTR pBuffer ) { PWSTR pResult = pBuffer; HTTP_FILL_BUFFER(pBuffer, MAX_IP_ADDR_PLUS_BRACKETS_STRING_LEN); if (TDI_ADDRESS_TYPE_IP == pSockAddr->sa_family) { PSOCKADDR_IN pAddr4 = (PSOCKADDR_IN) pSockAddr; pResult = RtlIpv4AddressToStringW(&pAddr4->sin_addr, pResult); } else if (TDI_ADDRESS_TYPE_IP6 == pSockAddr->sa_family) { PSOCKADDR_IN6 pAddr6 = (PSOCKADDR_IN6) pSockAddr; *pResult++ = L'['; pResult = RtlIpv6AddressToStringW(&pAddr6->sin6_addr, pResult); // CODEWORK: Handle scope ID *pResult++ = L']'; } else { UlTraceError(PARSER, ("http!HttppPrintIpAddressW(): invalid sa_family, %hd\n", pSockAddr->sa_family )); ASSERT(! "Invalid SockAddr Family"); } *pResult = UNICODE_NULL; return DIFF(pResult - pBuffer); } // HttppPrintIpAddressW /***************************************************************************++ Routine Description: This checks to see if the URL is well-formed. A well-formed URL has a scheme ("http" or "https"), a valid hostname (including + and * wildcards, IPv4, and IPv6 literals), a port, and a well-formed abspath. * Must check that the URL is well-formed and in canonical form; e.g., - Disallow /../ and /./ - Disallow invalid characters, including invalid Unicode surrogate pairs. The URL is already in Unicode, so it's not a question of using the IS_URL_TOKEN() macro. 
Arguments: pCfg - configuration parameters pUrl - Unicode string containing URL (not assumed to be zero-terminated) UrlLength - length of pUrl, in WCHARs TrailingSlashReqd - if TRUE, pUrl must end in '/' ForceRoutingIP - if TRUE and the hostname is an IPv4 or IPv6 literal, pParsedUrl->Normalized will be cleared, to force HttpNormalizeParsedUrl() to rewrite the URL as http://IP:port:IP/path pParsedUrl - on successful exit, the components of the URL Return Value: NTSTATUS --***************************************************************************/ NTSTATUS HttpParseUrl( IN PURL_C14N_CONFIG pCfg, IN PCWSTR pUrl, IN ULONG UrlLength, IN BOOLEAN TrailingSlashReqd, IN BOOLEAN ForceRoutingIP, OUT PHTTP_PARSED_URL pParsedUrl ) { NTSTATUS Status; ULONG PreviousChar; ULONG UnicodeChar; PCWSTR pEnd = pUrl + UrlLength; PCWSTR pHostname; PCWSTR pChar; PCWSTR pLabel; PCWSTR pSlash; PCWSTR pSegment; PWSTR pTerminator; BOOLEAN AlphaLabel; BOOLEAN TestSegment; BOOLEAN MoreChars; BOOLEAN LastCharHack; ULONG SegmentCount; URL_STATE UrlState; URL_STATE_TOKEN UrlToken; URL_ACTION Action; WCHAR IpAddr[MAX_IP_ADDR_PLUS_BRACKETS_STRING_LEN]; ULONG Length; // // Sanity check. 
// PAGED_CODE(); ASSERT(NULL != pCfg); ASSERT(NULL != pUrl); ASSERT(0 < UrlLength && UrlLength <= UNICODE_STRING_MAX_WCHAR_LEN); ASSERT(FALSE == TrailingSlashReqd || TRUE == TrailingSlashReqd); ASSERT(FALSE == ForceRoutingIP || TRUE == ForceRoutingIP); ASSERT(NULL != pParsedUrl); RtlZeroMemory(pParsedUrl, sizeof(*pParsedUrl)); pParsedUrl->Signature = HTTP_PARSED_URL_SIGNATURE; pParsedUrl->pFullUrl = (PWSTR) pUrl; pParsedUrl->UrlLength = (USHORT) UrlLength; pParsedUrl->Normalized = TRUE; pParsedUrl->TrailingSlashReqd = TrailingSlashReqd; // This is the shortest possible valid URL if (UrlLength < WCSLEN_LIT(L"http://*:1/")) { UlTraceError(PARSER, ("http!HttpParseUrl: Url too short, %lu, %.*ls\n", UrlLength, UrlLength, pUrl )); RETURN(STATUS_INVALID_PARAMETER); } // Check the scheme if (0 == wcsncmp(pUrl, L"http://", WCSLEN_LIT(L"http://"))) { pParsedUrl->Secure = FALSE; pHostname = pUrl + WCSLEN_LIT(L"http://"); } else if (0 == wcsncmp(pUrl, L"https://", WCSLEN_LIT(L"https://"))) { pParsedUrl->Secure = TRUE; pHostname = pUrl + WCSLEN_LIT(L"https://"); } else { UlTraceError(PARSER, ("http!HttpParseUrl: invalid scheme, %.*ls\n", UrlLength, pUrl )); RETURN(STATUS_INVALID_PARAMETER); } pParsedUrl->pHostname = (PWSTR) pHostname; // Is a trailing slash present, if required? if (TrailingSlashReqd && L'/' != pUrl[UrlLength - 1]) { // No, then the URL will have to be rewritten pParsedUrl->Normalized = FALSE; } // // The hostname validation code below looks a lot like that in // HttpValidateHostname(). However, it is sufficiently different // (WCHAR vs. UCHAR, Host+IP, Scope IDs, compulsory ports, etc) that // it is not easy to combine them into one routine. If the hostname // validation code is changed here, it may be necessary to change it // in HttpValidateHostname() too, or vice versa. // // Check for weak (http://*:port/) and strong (http://+:port/) wildcards if (L'*' == *pHostname || L'+' == *pHostname) { pParsedUrl->SiteType = (L'*' == *pHostname) ? 
HttpUrlSite_WeakWildcard : HttpUrlSite_StrongWildcard; pChar = pHostname + WCSLEN_LIT(L"*"); ASSERT(pChar < pEnd); // The wildcard must be followed by ":port" if (L':' == *pChar) goto port; UlTraceError(PARSER, ("http!HttpParseUrl: No port in '%c' wildcard address\n", *pHostname )); RETURN(STATUS_INVALID_PARAMETER); } // Is this an IPv6 literal address, per RFC 2732? if (L'[' == *pHostname) { pParsedUrl->SiteType = HttpUrlSite_IP; Status = HttppParseIPv6Address( pHostname, DIFF(pEnd - pHostname), TRUE, // scope ID allowed &pParsedUrl->SockAddr6, &pChar); if (!NT_SUCCESS(Status)) { UlTraceError(PARSER, ("http!HttpParseUrl: " "Invalid IPv6 address, %s\n", HttpStatusToString(Status) )); RETURN(Status); } ASSERT(TDI_ADDRESS_TYPE_IP6 == pParsedUrl->SockAddr.sa_family); ASSERT(pChar > pHostname); // There must be a port if (pChar == pEnd || L':' != *pChar) { UlTraceError(PARSER, ("http!HttpParseUrl: No port after IPv6 address\n" )); RETURN(STATUS_INVALID_PARAMETER); } // // There are so many legitimate ways to write an IPv6 literal // that we can't assume that a valid IPv6 literal is normalized. // Since we do string comparisons, we'll have to rewrite the URL // if the Normalized flag is not set. // Length = HttppPrintIpAddressW(&pParsedUrl->SockAddr, IpAddr); if (Length != DIFF_USHORT(pChar - pHostname) || 0 != _wcsnicmp(pHostname, IpAddr, Length)) { pParsedUrl->Normalized = FALSE; } goto port; } // IPv6 // // It must be a domain name or an IPv4 literal. We'll try to treat // it as a domain name first. If the labels turn out to be all-numeric, // we'll try decoding it as an IPv4 literal. // AlphaLabel = FALSE; pLabel = pHostname; for (pChar = pHostname; pChar < pEnd; ++pChar) { if (L':' == *pChar) { if (pChar == pHostname) { UlTraceError(PARSER, ("http!HttpParseUrl: empty hostname\n" )); RETURN(STATUS_INVALID_PARAMETER); } // Have we seen any non-digits? 
if (AlphaLabel) { ASSERT(0 == pParsedUrl->SockAddr.sa_family); pParsedUrl->SiteType = HttpUrlSite_Name; goto port; } pParsedUrl->SiteType = HttpUrlSite_IP; pParsedUrl->SockAddr4.sin_family = TDI_ADDRESS_TYPE_IP; ASSERT(TDI_ADDRESS_TYPE_IP == pParsedUrl->SockAddr.sa_family); // Let's see if it's a valid IPv4 address Status = RtlIpv4StringToAddressW( pHostname, TRUE, // strict => 4 dotted decimal octets &pTerminator, &pParsedUrl->SockAddr4.sin_addr ); if (!NT_SUCCESS(Status)) { UlTraceError(PARSER, ("http!HttpParseUrl: " "Invalid IPv4 address, %s\n", HttpStatusToString(Status) )); RETURN(Status); } if (pTerminator != pChar) { ASSERT(pTerminator < pChar); UlTraceError(PARSER, ("http!HttpParseUrl: " "Invalid IPv4 address after %lu chars, " "U+%04X, '%c'\n", DIFF(pTerminator - pHostname), *pTerminator, IS_ANSI(*pTerminator) && IS_HTTP_PRINT(*pTerminator) ? *pTerminator : '?' )); RETURN(STATUS_INVALID_PARAMETER); } Length = HttppPrintIpAddressW(&pParsedUrl->SockAddr, IpAddr); if (Length != DIFF_USHORT(pChar - pHostname) || 0 != _wcsnicmp(pHostname, IpAddr, Length)) { pParsedUrl->Normalized = FALSE; } goto port; } // ':' handling if (L'.' == *pChar) { ULONG LabelLength = DIFF(pChar - pLabel); // There must be at least one char in the label if (0 == LabelLength) { UlTraceError(PARSER, ("http!HttpParseUrl: empty label\n" )); RETURN(STATUS_INVALID_PARAMETER); } // Label can't have more than 63 chars if (LabelLength > pCfg->MaxLabelLength) { UlTraceError(PARSER, ("http!HttpParseUrl: overlong label, %lu\n", LabelLength )); RETURN(STATUS_INVALID_PARAMETER); } // Reset for the next label pLabel = pChar + WCSLEN_LIT(L"."); continue; } // // All chars above 0xFF are considered valid // if (!IS_ANSI(*pChar) || !IS_URL_ILLEGAL_COMPUTERNAME(*pChar)) { if (!IS_ANSI(*pChar) || !IS_HTTP_DIGIT(*pChar)) AlphaLabel = TRUE; if (pChar > pLabel) continue; // The first char of a label cannot be a hyphen. (Underscore?) 
if (L'-' == *pChar) { UlTraceError(PARSER, ("http!HttpParseUrl: '-' at beginning of label\n" )); RETURN(STATUS_INVALID_PARAMETER); } continue; } UlTraceError(PARSER, ("http!HttpParseUrl: " "Invalid char in hostname, U+%04X '%c'," " after %lu chars, '%.*s'\n", *pChar, IS_ANSI(*pChar) && IS_HTTP_PRINT(*pChar) ? *pChar : '?', DIFF(pChar - pHostname), DIFF(pChar - pHostname), pHostname )); RETURN(STATUS_INVALID_PARAMETER); } // hostname // // If we got here, we fell off the end of the buffer, // without finding a ':' for the port // ASSERT(pChar == pEnd); UlTraceError(PARSER, ("http!HttpParseUrl: No port\n")); RETURN(STATUS_INVALID_PARAMETER); port: // // Parse the port number // ASSERT(pHostname < pChar && pChar < pEnd); ASSERT(L':' == *pChar); pParsedUrl->HostnameLength = DIFF_USHORT(pChar - pHostname); // First, check for overlong hostnames if (pParsedUrl->HostnameLength > pCfg->MaxHostnameLength) { UlTraceError(PARSER, ("http!HttpParseUrl: overlong hostname, %hu\n", pParsedUrl->HostnameLength )); RETURN(STATUS_INVALID_PARAMETER); } // Skip the ':' denoting a port number pChar += WCSLEN_LIT(L":"); if (pChar == pEnd) { UlTraceError(PARSER, ("http!HttpParseUrl: No port after ':'\n" )); RETURN(STATUS_INVALID_PARAMETER); } // Search for the '/' or second ':' that terminates the port number pSlash = pChar; pParsedUrl->pPort = (PWSTR) pSlash; do { if (*pSlash < L'0' || *pSlash > L'9') { UlTraceError(PARSER, ("http!HttpParseUrl: " "Invalid digit in port, U+%04X, '%c'\n", *pSlash, IS_ANSI(*pSlash) && IS_HTTP_PRINT(*pSlash) ? *pSlash : '?' 
)); RETURN(STATUS_INVALID_PARAMETER); } } while (++pSlash < pEnd && L'/' != *pSlash && L':' != *pSlash); ASSERT(pSlash > pChar); pParsedUrl->PortLength = DIFF_USHORT(pSlash - pChar); if (pSlash == pEnd) { UlTraceError(PARSER, ("http!HttpParseUrl: No '/' (or second ':') after port\n" )); RETURN(STATUS_INVALID_PARAMETER); } ASSERT(L'/' == *pSlash || L':' == *pSlash); Status = HttpWideStringToUShort( pChar, pParsedUrl->PortLength, FALSE, // no leading zeros permitted 10, &pTerminator, &pParsedUrl->PortNumber ); if (!NT_SUCCESS(Status)) { UlTraceError(PARSER, ("http!HttpParseUrl: " "Invalid port number, %s\n", HttpStatusToString(Status) )); RETURN(STATUS_INVALID_PARAMETER); } if (0 == pParsedUrl->PortNumber) { UlTraceError(PARSER, ("http!HttpParseUrl: Port must not be zero.\n" )); RETURN(STATUS_INVALID_PARAMETER); } ASSERT(pTerminator == pSlash); pChar = pSlash; goto routing_IP; // so /W4 won't complain about an unreferenced label routing_IP: // // Is this a Host+IP site; i.e., is there a Routing IP address // after the port number? // if (L'/' == *pChar) { pParsedUrl->pRoutingIP = NULL; pParsedUrl->RoutingIPLength = 0; ASSERT(0 == pParsedUrl->RoutingAddr.sa_family); // // If the hostname is an IP literal, but there is no routing IP // (i.e., http://IP:port/path), we must rewrite the URL as // http://IP:port:IP/path; i.e., explicitly use the hostname IP // as the routing IP. 
// if (ForceRoutingIP && 0 != pParsedUrl->SockAddr.sa_family) { ASSERT(TDI_ADDRESS_TYPE_IP == pParsedUrl->SockAddr.sa_family || TDI_ADDRESS_TYPE_IP6 == pParsedUrl->SockAddr.sa_family); pParsedUrl->Normalized = FALSE; } goto parse_path; } ASSERT(L':' == *pChar); if (HttpUrlSite_WeakWildcard == pParsedUrl->SiteType || HttpUrlSite_StrongWildcard == pParsedUrl->SiteType) { UlTraceError(PARSER, ("http!HttpParseUrl: " "Can't have Routing IPs on Wildcard sites\n" )); RETURN(STATUS_INVALID_PARAMETER); } pChar += WCSLEN_LIT(L":"); if (pChar == pEnd) { UlTraceError(PARSER, ("http!HttpParseUrl: No IP address after second ':'\n" )); RETURN(STATUS_INVALID_PARAMETER); } pParsedUrl->pRoutingIP = (PWSTR) pChar; ASSERT(HttpUrlSite_NamePlusIP != pParsedUrl->SiteType); if (HttpUrlSite_Name == pParsedUrl->SiteType) { pParsedUrl->SiteType = HttpUrlSite_NamePlusIP; } // // Is the Routing IP an IPv6 literal? // if (L'[' == *pChar) { if (TDI_ADDRESS_TYPE_IP == pParsedUrl->SockAddr.sa_family) { UlTraceError(PARSER, ("http!HttpParseUrl: " "Can't have http://IPv4:port:[IPv6]\n" )); RETURN(STATUS_INVALID_PARAMETER); } ASSERT(TDI_ADDRESS_TYPE_IP6 == pParsedUrl->SockAddr.sa_family || 0 == pParsedUrl->SockAddr.sa_family); Status = HttppParseIPv6Address( pChar, DIFF(pEnd - pChar), TRUE, // scope ID allowed &pParsedUrl->RoutingAddr6, &pSlash); if (!NT_SUCCESS(Status)) { UlTraceError(PARSER, ("http!HttpParseUrl: " "Invalid Host+IPv6 address, %s\n", HttpStatusToString(Status) )); RETURN(Status); } ASSERT(TDI_ADDRESS_TYPE_IP6 == pParsedUrl->RoutingAddr.sa_family); ASSERT(pSlash > pChar); // There must be a slash if (pSlash == pEnd || L'/' != *pSlash) { UlTraceError(PARSER, ("http!HttpParseUrl: '/' expected after Host+IPv6.\n" )); RETURN(STATUS_INVALID_PARAMETER); } // CODEWORK: Should we care if RoutingAddr6 != SockAddr6? 
pParsedUrl->RoutingIPLength = DIFF_USHORT(pSlash - pChar); Length = HttppPrintIpAddressW(&pParsedUrl->RoutingAddr, IpAddr); if (Length != pParsedUrl->RoutingIPLength || 0 != _wcsnicmp(pChar, IpAddr, Length)) { pParsedUrl->Normalized = FALSE; } pChar = pSlash; goto parse_path; } // // No, then it must be an IPv4 literal // if (TDI_ADDRESS_TYPE_IP6 == pParsedUrl->SockAddr.sa_family) { UlTraceError(PARSER, ("http!HttpParseUrl: Can't have http://[IPv6]:port:IPv4\n" )); RETURN(STATUS_INVALID_PARAMETER); } ASSERT(TDI_ADDRESS_TYPE_IP == pParsedUrl->SockAddr.sa_family || 0 == pParsedUrl->SockAddr.sa_family); // Search for the terminating '/' pSlash = pChar; do { if ((L'0' <= *pSlash && *pSlash <= L'9') || L'.' == *pSlash) continue; UlTraceError(PARSER, ("http!HttpParseUrl: " "Invalid character in Host+IPv4, U+%04X, '%c'\n", *pSlash, IS_ANSI(*pSlash) && IS_HTTP_PRINT(*pSlash) ? *pSlash : '?' )); RETURN(STATUS_INVALID_PARAMETER); } while (++pSlash < pEnd && L'/' != *pSlash); ASSERT(pSlash > pChar); if (pSlash == pEnd) { UlTraceError(PARSER, ("http!HttpParseUrl: No '/' after Host+IPv4\n" )); RETURN(STATUS_INVALID_PARAMETER); } ASSERT(L'/' == *pSlash); Status = RtlIpv4StringToAddressW( pChar, TRUE, // strict => 4 dotted decimal octets &pTerminator, &pParsedUrl->RoutingAddr4.sin_addr ); if (!NT_SUCCESS(Status)) { UlTraceError(PARSER, ("http!HttpParseUrl: " "Invalid Host+IPv4 address, %s\n", HttpStatusToString(Status) )); RETURN(Status); } if (pTerminator != pSlash) { ASSERT(pTerminator < pSlash); UlTraceError(PARSER, ("http!HttpParseUrl: " "Invalid Host+IPv4 address after %lu chars, " "U+%04X, '%c'\n", DIFF(pTerminator - pChar), *pTerminator, IS_ANSI(*pTerminator) && IS_HTTP_PRINT(*pTerminator) ? *pTerminator : '?' 
)); RETURN(STATUS_INVALID_PARAMETER); } // CODEWORK: Should we care if RoutingAddr4 != SockAddr4 pParsedUrl->RoutingIPLength = DIFF_USHORT(pSlash - pChar); pParsedUrl->RoutingAddr4.sin_family = TDI_ADDRESS_TYPE_IP; Length = HttppPrintIpAddressW(&pParsedUrl->RoutingAddr, IpAddr); if (Length != pParsedUrl->RoutingIPLength || 0 != _wcsnicmp(pChar, IpAddr, Length)) { pParsedUrl->Normalized = FALSE; } pChar = pSlash; parse_path: // // Parse the abspath // ASSERT(pParsedUrl->pRoutingIP == NULL || pParsedUrl->RoutingIPLength > 0); ASSERT(pHostname < pChar && pChar < pEnd); ASSERT(L'/' == *pChar); pParsedUrl->pAbsPath = (PWSTR) pChar; pParsedUrl->AbsPathLength = DIFF_USHORT(pEnd - pChar); if (pParsedUrl->AbsPathLength > pCfg->UrlMaxLength) { UlTraceError(PARSER, ("http!HttpParseUrl: " "AbsPath is too long: %lu\n", pParsedUrl->AbsPathLength )); RETURN(STATUS_INVALID_PARAMETER); } UrlState = URL_STATE_START; UrlToken = URL_TOKEN_OTHER; Action = ACTION_NOTHING; pSegment = pChar; TestSegment = FALSE; LastCharHack = FALSE; MoreChars = TRUE; PreviousChar = UNICODE_NULL; UnicodeChar = *pChar; SegmentCount = 0; // // Loop through all the characters in pAbsPath, plus one or two // special ones at the end. 
// while (MoreChars) { switch (UnicodeChar) { case UNICODE_NULL: UrlToken = URL_TOKEN_EOS; TestSegment = TRUE; break; case DOT: UrlToken = URL_TOKEN_DOT; TestSegment = FALSE; break; case FORWARD_SLASH: UrlToken = URL_TOKEN_SLASH; TestSegment = TRUE; break; case PERCENT: // no hex escapes case STAR: // no wildcards case QUESTION_MARK: // no wildcards or querystrings case BACK_SLASH: // no C string escapes UlTraceError(PARSER, ("http!HttpParseUrl: invalid '%c' char in path\n", (UCHAR) UnicodeChar )); RETURN(STATUS_INVALID_PARAMETER); default: UrlToken = URL_TOKEN_OTHER; TestSegment = FALSE; break; } UlTraceVerbose(PARSER, ("http!HttpParseUrl: " "[%lu] U+%04lX '%c' %p: [%s][%s] -> %s, %s\n", DIFF(pChar - pParsedUrl->pAbsPath), UnicodeChar, IS_ANSI(UnicodeChar) && IS_HTTP_PRINT(UnicodeChar) ? (UCHAR) UnicodeChar : '?', pChar, HttppUrlStateToString(UrlState), HttppUrlTokenToString(UrlToken), HttppUrlStateToString( CanonStateFromStateAndToken[UrlState][UrlToken]), TestSegment ? ", TestSegment" : "" )); // // Reject control characters // if (!LastCharHack && !pCfg->AllowRestrictedChars && IS_ANSI(UnicodeChar) && IS_URL_INVALID(UnicodeChar)) { UlTraceError(PARSER, ( "http!HttpParseUrl: " "Invalid character, U+%04lX, in path.\n", UnicodeChar )); RETURN(STATUS_INVALID_PARAMETER); } // // Check that (high-surrogate, low-surrogate) come in pairs // if (HIGH_SURROGATE_START <= PreviousChar && PreviousChar <= HIGH_SURROGATE_END) { if (UnicodeChar < LOW_SURROGATE_START || UnicodeChar > LOW_SURROGATE_END) { UlTraceError(PARSER, ( "http!HttpParseUrl: " "Illegal surrogate pair, U+%04lX, U+%04lX.\n", PreviousChar, UnicodeChar )); RETURN(STATUS_INVALID_PARAMETER); } } else if (LOW_SURROGATE_START <= UnicodeChar && UnicodeChar <= LOW_SURROGATE_END) { UlTraceError(PARSER, ( "http!HttpParseUrl: " "Non-high surrogate, U+%04lX, " "before low surrogate, U+%04lX.\n", PreviousChar, UnicodeChar )); RETURN(STATUS_INVALID_PARAMETER); } if (URL_STATE_ERROR == 
CanonStateFromStateAndToken[UrlState][UrlToken]) { UlTraceError(PARSER, ( "http!HttpParseUrl: " "Error state from %s,%s in path, after U+%04lX.\n", HttppUrlStateToString(UrlState), HttppUrlTokenToString(UrlToken), UnicodeChar )); RETURN(STATUS_INVALID_PARAMETER); } UrlState = CanonStateFromStateAndToken[UrlState][UrlToken]; // // Check segment limits // if (TestSegment) { ULONG SegmentLength = DIFF(pChar - pSegment); // The CanonStateFromStateAndToken checks should prevent // empty segments, among other things ASSERT(SegmentLength > 0 || pChar == pSegment); // Reject if segment too long if (SegmentLength > pCfg->UrlSegmentMaxLength + WCSLEN_LIT(L"/")) { UlTraceError(PARSER, ( "http!HttpParseUrl(): " "Segment too long: %lu\n", SegmentLength )); RETURN(STATUS_INVALID_PARAMETER); } pSegment = pChar; // Reject if too many path segments if (++SegmentCount > pCfg->UrlSegmentMaxCount) { UlTraceError(PARSER, ( "http!HttpParseUrl(): " "Too many segments: %lu\n", SegmentCount )); RETURN(STATUS_INVALID_PARAMETER); } } // // Are there any more path characters? // PreviousChar = UnicodeChar; if (++pChar < pEnd) { UnicodeChar = *pChar; } else if (!LastCharHack) { // Want to make sure that the last segment is tested. // If there's no trailing slash, we'll enter here twice; // otherwise once if (TrailingSlashReqd && FORWARD_SLASH != PreviousChar) { // First, fake a trailing slash, if needed UnicodeChar = FORWARD_SLASH; } else { // Second, always finish up with UNICODE_NULL UnicodeChar = UNICODE_NULL; LastCharHack = TRUE; } } else { // Terminate the loop MoreChars = FALSE; } } // while (MoreChars) RETURN(STATUS_SUCCESS); } // HttpParseUrl /***************************************************************************++ Routine Description: Some URLs parsed by HttpParseUrl() will not be considered normalized if they have IP literals, Routing IPs, or no trailing slash. 
This routine will build a fully normalized URL and (possibly) free
    the old one

Arguments:

    pParsedUrl      - On entry, points to a URL parsed by HttpParseUrl();
                        On successful exit, points to a normalized URL.
                        Left untouched on failure.
    pCfg            - configuration parameters
    ForceCopy       - if TRUE, will always make a new, normalized URL
    FreeOriginalUrl - if FALSE, will never free the original URL.
                        The caller must manage the memory. The original
                        URL is only ever freed on success.
    ForceRoutingIP  - if TRUE and the hostname is an IPv4 or IPv6 literal,
                        the URL will be rewritten in the form
                        http://IP:port:IP/path
    PoolType        - PagedPool or NonPagedPool
    PoolTag         - Tag used to allocate pUrl

Return Value:

    NTSTATUS - STATUS_SUCCESS or STATUS_NO_MEMORY. If the rebuilt URL
    unexpectedly fails to re-parse (e.g., the added trailing slash pushes
    the AbsPath past UrlMaxLength), the HttpParseUrl() failure status is
    returned, the new buffer is released, and neither the original URL
    nor *pParsedUrl is modified.

--***************************************************************************/
NTSTATUS
HttpNormalizeParsedUrl(
    IN OUT PHTTP_PARSED_URL pParsedUrl,
    IN     PURL_C14N_CONFIG pCfg,
    IN     BOOLEAN          ForceCopy,
    IN     BOOLEAN          FreeOriginalUrl,
    IN     BOOLEAN          ForceRoutingIP,
    IN     POOL_TYPE        PoolType,
    IN     ULONG            PoolTag
    )
{
    // Work on a local copy; the caller's copy is only updated on success
    HTTP_PARSED_URL ParsedUrl = *pParsedUrl;
    NTSTATUS        Status    = STATUS_SUCCESS;

    // This routine lives in the PAGE section (see alloc_text above)
    PAGED_CODE();

    ASSERT(HTTP_PARSED_URL_SIGNATURE == ParsedUrl.Signature);

    if (ParsedUrl.Normalized  &&  !ForceCopy)
    {
        // nothing to do
    }
    else
    {
        PWSTR   pResult;
        WCHAR   HostAddrString[MAX_IP_ADDR_PLUS_BRACKETS_STRING_LEN];
        WCHAR   RoutingAddrString[MAX_IP_ADDR_PLUS_BRACKETS_STRING_LEN];
        ULONG   SchemeLength;
        ULONG   HostAddrLength;
        ULONG   HostnameLength;
        ULONG   RoutingAddrLength;
        ULONG   AbsPathLength;
        ULONG   Length;
        ULONG   TrailingSlashLength;
        PCWSTR  pUrl;

        pUrl = ParsedUrl.pFullUrl;

        SchemeLength = DIFF(ParsedUrl.pHostname - ParsedUrl.pFullUrl);

        // Calculate HostAddrLength and HostnameLength (mutually exclusive)
        if (0 != ParsedUrl.SockAddr.sa_family)
        {
            HostAddrLength = HttppPrintIpAddressW(
                                    &ParsedUrl.SockAddr,
                                    HostAddrString
                                    );
            HostnameLength = 0;
        }
        else
        {
            HostAddrLength    = 0;
            HostAddrString[0] = UNICODE_NULL;
            HostnameLength    = ParsedUrl.HostnameLength;
        }

        // Calculate RoutingAddrLength
        if (0 != ParsedUrl.RoutingAddr.sa_family)
        {
            RoutingAddrLength = WCSLEN_LIT(L":")
                                + HttppPrintIpAddressW(
                                        &ParsedUrl.RoutingAddr,
                                        RoutingAddrString
                                        );
        }
        else if (ForceRoutingIP  &&  0 != ParsedUrl.SockAddr.sa_family)
        {
            // We must rewrite http://IP:port/path as http://IP:port:IP/path
            RoutingAddrLength = WCSLEN_LIT(L":") + HostAddrLength;
            wcscpy(RoutingAddrString, HostAddrString);
        }
        else
        {
            RoutingAddrLength    = 0;
            RoutingAddrString[0] = UNICODE_NULL;
        }

        AbsPathLength = ParsedUrl.AbsPathLength;
        ASSERT(AbsPathLength > 0);

        if (ParsedUrl.TrailingSlashReqd
            && FORWARD_SLASH != ParsedUrl.pAbsPath[AbsPathLength-1])
        {
            TrailingSlashLength = WCSLEN_LIT(L"/");
        }
        else
        {
            TrailingSlashLength = 0;
        }

        Length = SchemeLength
                    + HostAddrLength
                    + HostnameLength
                    + WCSLEN_LIT(L":")
                    + ParsedUrl.PortLength
                    + RoutingAddrLength
                    + AbsPathLength
                    + TrailingSlashLength;

        // One extra WCHAR for the terminating UNICODE_NULL
        pResult = (PWSTR) HTTPP_ALLOC(
                                PoolType,
                                (Length + 1) * sizeof(WCHAR),
                                PoolTag
                                );

        if (NULL == pResult)
        {
            Status = STATUS_NO_MEMORY;
            // Do not destroy the old URL. Let caller handle it.
        }
        else
        {
            PWSTR pDest = pResult;

#define WCSNCPY(pSrc, Length)                                   \
    RtlCopyMemory(pDest, (pSrc), (Length) * sizeof(WCHAR));     \
    pDest += (Length)

#define WCSNCPY2(pField, Length)                                \
    WCSNCPY(ParsedUrl.pField, Length)

#define WCSNCPY_LIT(Lit)                                        \
    WCSNCPY(Lit, WCSLEN_LIT(Lit))

            // scheme, e.g. "http://"
            WCSNCPY2(pFullUrl, SchemeLength);

            // hostname or canonically printed IP literal
            if (0 != HostnameLength)
            {
                ASSERT(0 == HostAddrLength);
                WCSNCPY2(pHostname, HostnameLength);
            }
            else
            {
                ASSERT(0 != HostAddrLength);
                WCSNCPY(HostAddrString, HostAddrLength);
            }

            // ":port"
            WCSNCPY_LIT(L":");
            WCSNCPY2(pPort, ParsedUrl.PortLength);

            // optional ":RoutingIP"
            if (RoutingAddrLength > 0)
            {
                WCSNCPY_LIT(L":");
                WCSNCPY(
                    RoutingAddrString,
                    RoutingAddrLength - WCSLEN_LIT(L":")
                    );
            }

            // abspath, plus trailing slash if it was required but missing
            WCSNCPY2(pAbsPath, AbsPathLength);

            if (TrailingSlashLength > 0)
            {
                WCSNCPY_LIT(L"/");
            }

            ASSERT(DIFF(pDest - pResult) == Length);

            *pDest = UNICODE_NULL;

            // Re-parse so that all the component pointers and lengths
            // in the local copy refer to the new buffer
            Status = HttpParseUrl(
                        pCfg,
                        pResult,
                        Length,
                        ParsedUrl.TrailingSlashReqd,
                        ForceRoutingIP,
                        &ParsedUrl
                        );

            ASSERT(STATUS_SUCCESS == Status);

            if (!NT_SUCCESS(Status))
            {
                // As with the allocation failure above, do not destroy
                // the old URL or hand back a half-parsed structure.
                // Release the new buffer and let the caller handle it.
                HTTPP_FREE(pResult, PoolTag);
            }
            else
            {
                ASSERT(ParsedUrl.Normalized);

                if (FreeOriginalUrl)
                    HTTPP_FREE((PVOID) pUrl, PoolTag);

                // Write the updated local copy back to the
                // caller's HTTP_PARSED_URL
                *pParsedUrl = ParsedUrl;
            }
        }
    }

    return Status;

} // HttpNormalizeParsedUrl