Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

4688 lines
133 KiB

/*++
Copyright (c) 1998-2002 Microsoft Corporation
Module Name:
C14n.c
Abstract:
URL canonicalization (c14n) routines
Author:
George V. Reilly (GeorgeRe) 22-Mar-2002
Revision History:
--*/
#include <precomp.h>
#include "c14np.h"
#if defined(ALLOC_PRAGMA) && defined(KERNEL_PRIV)
#pragma alloc_text( PAGE, HttpInitializeDefaultUrlC14nConfig)
#pragma alloc_text( PAGE, HttpInitializeDefaultUrlC14nConfigEncoding)
#pragma alloc_text( PAGE, HttpUnescapePercentHexEncoding)
#pragma alloc_text( PAGE, HttppPopCharHostNameUtf8)
#pragma alloc_text( PAGE, HttppPopCharHostNameDbcs)
#pragma alloc_text( PAGE, HttppPopCharHostNameAnsi)
#pragma alloc_text( PAGE, HttpCopyHost)
#pragma alloc_text( PAGE, HttppCopyHostByType)
#pragma alloc_text( PAGE, HttpValidateHostname)
#pragma alloc_text( PAGE, HttppPopCharAbsPathUtf8)
#pragma alloc_text( PAGE, HttppPopCharAbsPathDbcs)
#pragma alloc_text( PAGE, HttppPopCharAbsPathAnsi)
#pragma alloc_text( PAGE, HttppPopCharQueryString)
#pragma alloc_text( PAGE, HttppCopyUrlByType)
#pragma alloc_text( PAGE, HttpCopyUrl)
#pragma alloc_text( PAGE, HttpCleanAndCopyUrl)
#pragma alloc_text( PAGE, HttppCleanAndCopyUrlByType)
#pragma alloc_text( PAGE, HttpFindUrlToken)
#pragma alloc_text( PAGE, HttppParseIPv6Address)
#pragma alloc_text( PAGE, HttppPrintIpAddressW)
#pragma alloc_text( PAGE, HttpParseUrl)
#pragma alloc_text( PAGE, HttpNormalizeParsedUrl)
#endif // ALLOC_PRAGMA && KERNEL_PRIV
#if 0 // Non-Pageable Functions
NOT PAGEABLE --
#endif // Non-Pageable Functions
/***************************************************************************++
Routine Description:
    Fill in a URL canonicalization config with the built-in defaults.
    Non-UTF-8 and DBCS handling are switched off, the abs-path decode order
    is UrlDecode_Utf8, and all limits come from the DEFAULT_C14N_* constants.
Arguments:
    pCfg - config structure to initialize; every field listed below is
           overwritten
Return Value:
    None
--***************************************************************************/
VOID
HttpInitializeDefaultUrlC14nConfig(
    PURL_C14N_CONFIG    pCfg
    )
{
    PAGED_CODE();

    // Length and count limits
    pCfg->MaxHostnameLength    = DEFAULT_C14N_MAX_HOSTNAME_LENGTH;
    pCfg->MaxLabelLength       = DEFAULT_C14N_MAX_LABEL_LENGTH;
    pCfg->UrlMaxLength         = DEFAULT_C14N_URL_MAX_LENGTH;
    pCfg->UrlSegmentMaxLength  = DEFAULT_C14N_URL_SEGMENT_MAX_LENGTH;
    pCfg->UrlSegmentMaxCount   = DEFAULT_C14N_URL_SEGMENT_MAX_COUNT;

    // Character-level policy
    pCfg->AllowRestrictedChars = DEFAULT_C14N_ALLOW_RESTRICTED_CHARS;
    pCfg->PercentUAllowed      = DEFAULT_C14N_PERCENT_U_ALLOWED;

    // Encoding selection: UTF-8 only for the abs-path unless overridden
    pCfg->CodePage             = 0;
    pCfg->EnableDbcs           = FALSE;
    pCfg->FavorUtf8            = FALSE;
    pCfg->EnableNonUtf8        = FALSE;
    pCfg->AbsPathDecodeOrder   = UrlDecode_Utf8;
    pCfg->HostnameDecodeOrder  = UrlDecode_Utf8_Else_Dbcs_Else_Ansi;

} // HttpInitializeDefaultUrlC14nConfig
/***************************************************************************++
Routine Description:
    Initialize a URL canonicalization config with the defaults, then record
    the three encoding switches and derive AbsPathDecodeOrder from them.
Arguments:
    pCfg          - config structure to initialize
    EnableNonUtf8 - if FALSE, the abs-path is decoded as UTF-8 only
    FavorUtf8     - when non-UTF-8 is enabled, try UTF-8 before the
                    ANSI/DBCS decoder instead of after it
    EnableDbcs    - when non-UTF-8 is enabled, pair UTF-8 with the DBCS
                    decoder rather than the ANSI one
Return Value:
    None
--***************************************************************************/
VOID
HttpInitializeDefaultUrlC14nConfigEncoding(
    PURL_C14N_CONFIG    pCfg,
    BOOLEAN             EnableNonUtf8,
    BOOLEAN             FavorUtf8,
    BOOLEAN             EnableDbcs
    )
{
    PAGED_CODE();

    HttpInitializeDefaultUrlC14nConfig(pCfg);

    pCfg->EnableNonUtf8 = EnableNonUtf8;
    pCfg->FavorUtf8     = FavorUtf8;
    pCfg->EnableDbcs    = EnableDbcs;

    // UTF-8 only: nothing else to choose
    if (! EnableNonUtf8)
    {
        pCfg->AbsPathDecodeOrder = UrlDecode_Utf8;
        return;
    }

    if (FavorUtf8)
    {
        if (EnableDbcs)
        {
            pCfg->AbsPathDecodeOrder = UrlDecode_Utf8_Else_Dbcs;
        }
        else
        {
            pCfg->AbsPathDecodeOrder = UrlDecode_Utf8_Else_Ansi;
        }
    }
    else
    {
        if (EnableDbcs)
        {
            pCfg->AbsPathDecodeOrder = UrlDecode_Dbcs_Else_Utf8;
        }
        else
        {
            pCfg->AbsPathDecodeOrder = UrlDecode_Ansi_Else_Utf8;
        }
    }

} // HttpInitializeDefaultUrlC14nConfigEncoding
/***************************************************************************++
Routine Description:
    Decode a single hex escape, either '%NN' or '%uNNNN', into a ULONG.
Arguments:
    pSourceChar     - Input buffer; must start with '%'
    SourceLength    - Length of pSourceChar, in bytes
    PercentUAllowed - Accept the '%uNNNN' notation?
    pOutChar        - receives the decoded value; written only on success
    pBytesToSkip    - number of bytes consumed from pSourceChar;
                      3 for %NN and 6 for %uNNNN. Note that it is stored
                      before the hex digits are validated.
Return Value:
    STATUS_SUCCESS or STATUS_OBJECT_PATH_SYNTAX_BAD
--***************************************************************************/
NTSTATUS
HttpUnescapePercentHexEncoding(
    IN  PCUCHAR pSourceChar,
    IN  ULONG   SourceLength,
    IN  BOOLEAN PercentUAllowed,
    OUT PULONG  pOutChar,
    OUT PULONG  pBytesToSkip
    )
{
    PCUCHAR pDigits;
    ULONG   DigitCount;
    ULONG   Index;
    ULONG   Value = 0;

    PAGED_CODE();

    // Shortest legal escape is "%NN"
    if (SourceLength < STRLEN_LIT("%NN"))
    {
        UlTraceError(PARSER, (
            "http!HttpUnescapePercentHexEncoding(%p): "
            "Length too short, %lu.\n",
            pSourceChar, SourceLength
            ));
        RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
    }

    if (PERCENT != pSourceChar[0])
    {
        UlTraceError(PARSER, (
            "http!HttpUnescapePercentHexEncoding(%p): "
            "Starts with 0x%02lX, not '%%'.\n",
            pSourceChar, (ULONG) pSourceChar[0]
            ));
        RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
    }

    if ('u' == pSourceChar[1] || 'U' == pSourceChar[1])
    {
        // The %uNNNN notation generated by JavaScript's escape() function
        if (! PercentUAllowed)
        {
            UlTraceError(PARSER, (
                "http!HttpUnescapePercentHexEncoding(%p): "
                "%%uNNNN forbidden.\n",
                pSourceChar, SourceLength
                ));
            RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
        }

        if (SourceLength < STRLEN_LIT("%uNNNN"))
        {
            UlTraceError(PARSER, (
                "http!HttpUnescapePercentHexEncoding(%p): "
                "Length %lu too short for %%uNNNN.\n",
                pSourceChar, SourceLength
                ));
            RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
        }

        pDigits       = pSourceChar + STRLEN_LIT("%u");
        DigitCount    = 4;
        *pBytesToSkip = STRLEN_LIT("%uNNNN");
    }
    else
    {
        // RFC 2396: an escaped octet is a character triplet, '%' followed
        // by the two hexadecimal digits representing the octet code.
        pDigits       = pSourceChar + STRLEN_LIT("%");
        DigitCount    = 2;
        *pBytesToSkip = STRLEN_LIT("%NN");
    }

    ASSERT(*pBytesToSkip <= SourceLength);

    // Accumulate the digits, most significant nibble first
    for (Index = 0;  Index < DigitCount;  ++Index)
    {
        ULONG Char = pDigits[Index];
        ULONG Nibble;

        //
        // Hex-to-value conversion, inlined. It relies on ASCII ordering
        // ('0' < 'A' < 'a') and on the digit/letter ranges being
        // contiguous, which is not true of EBCDIC.
        //
        C_ASSERT('0' < 'A'  &&  'A' < 'a');
        C_ASSERT('9' - '0' == 10 - 1);
        C_ASSERT('F' - 'A' == 6 - 1);
        C_ASSERT('f' - 'a' == 6 - 1);

        if (! IS_HTTP_HEX(Char))
        {
            UlTraceError(PARSER, (
                "http!HttpUnescapePercentHexEncoding(%p): "
                "Invalid hex character[%lu], 0x%02lX.\n",
                pSourceChar, Index, Char
                ));
            RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
        }

        if (Char >= 'a')
        {
            ASSERT('a' <= Char  &&  Char <= 'f');
            Nibble = Char - 'a' + 0xA;
        }
        else if (Char >= 'A')
        {
            ASSERT('A' <= Char  &&  Char <= 'F');
            Nibble = Char - 'A' + 0xA;
        }
        else
        {
            ASSERT('0' <= Char  &&  Char <= '9');
            Nibble = Char - '0';
        }

        ASSERT(Nibble < 0x10);

        Value = (Value << 4) | Nibble;
    }

    *pOutChar = Value;

    return STATUS_SUCCESS;

} // HttpUnescapePercentHexEncoding
/***************************************************************************++
Routine Description:
    Pop one character of raw (unescaped) UTF-8 from pSourceChar by handing
    the buffer to HttpUtf8RawBytesToUnicode.
    This routine is only suitable for the hostname part of an HTTP URL.
Arguments:
    pSourceChar  - Input buffer
    SourceLength - Length of pSourceChar, in bytes; must be non-zero
    pUnicodeChar - decoded character
    pBytesToSkip - number of bytes consumed from pSourceChar
Return Value:
    Whatever HttpUtf8RawBytesToUnicode returns
--***************************************************************************/
NTSTATUS
HttppPopCharHostNameUtf8(
    IN  PCUCHAR pSourceChar,
    IN  ULONG   SourceLength,
    OUT PULONG  pUnicodeChar,
    OUT PULONG  pBytesToSkip
    )
{
    PAGED_CODE();

    ASSERT(SourceLength > 0);

    // Pure pass-through: the UTF-8 helper does all validation and decoding
    return HttpUtf8RawBytesToUnicode(
                pSourceChar,
                SourceLength,
                pUnicodeChar,
                pBytesToSkip
                );

} // HttppPopCharHostNameUtf8
/***************************************************************************++
Routine Description:
    Consume 1-2 bytes from pSourceChar and convert them from raw DBCS
    to Unicode.
    This routine is only suitable for the hostname part of an HTTP URL.
Arguments:
    pSourceChar  - Input buffer
    SourceLength - Length of pSourceChar, in bytes; must be non-zero
    pUnicodeChar - decoded character; written only on success
    pBytesToSkip - number of bytes consumed from pSourceChar (1 or 2);
                   written only on success
Return Value:
    STATUS_SUCCESS, STATUS_OBJECT_PATH_SYNTAX_BAD if a lead byte is the
    last byte of the input, or the failure status returned by
    RtlMultiByteToUnicodeN
--***************************************************************************/
NTSTATUS
HttppPopCharHostNameDbcs(
    IN  PCUCHAR pSourceChar,
    IN  ULONG   SourceLength,
    OUT PULONG  pUnicodeChar,
    OUT PULONG  pBytesToSkip
    )
{
    NTSTATUS    Status;
    ULONG       AnsiCharSize;
    WCHAR       WideChar;

    PAGED_CODE();

    ASSERT(SourceLength > 0);

    if (! IS_DBCS_LEAD_BYTE(pSourceChar[0]))
    {
        AnsiCharSize = 1;
    }
    else
    {
        // A lead byte needs a trail byte after it
        if (SourceLength < 2)
        {
            // Cast so that the argument matches the %lX conversion, as the
            // other trace calls in this file do.
            UlTraceError(PARSER, (
                "http!HttppPopCharHostNameDbcs(%p): "
                "ERROR: DBCS lead byte, 0x%02lX, at end of string\n",
                pSourceChar, (ULONG) pSourceChar[0]
                ));
            RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
        }

        AnsiCharSize = 2;
    }

    // Convert exactly one (single- or double-byte) character
    Status = RtlMultiByteToUnicodeN(
                &WideChar,
                sizeof(WCHAR),
                NULL,
                (PCSTR) pSourceChar,
                AnsiCharSize
                );

    if (!NT_SUCCESS(Status))
    {
        UlTraceError(PARSER, (
            "http!HttppPopCharHostNameDbcs(%p): "
            "MultiByteToUnicode(%lu) failed, %s.\n",
            pSourceChar, AnsiCharSize, HttpStatusToString(Status)
            ));
        return Status;
    }

    *pUnicodeChar = WideChar;
    *pBytesToSkip = AnsiCharSize;

    return STATUS_SUCCESS;

} // HttppPopCharHostNameDbcs
/***************************************************************************++
Routine Description:
    Consume 1 byte from pSourceChar and convert it from raw ANSI to
    Unicode by looking it up in AnsiToUnicodeMap.
    This routine is only suitable for the hostname part of an HTTP URL.
Arguments:
    pSourceChar  - Input buffer
    SourceLength - Length of pSourceChar, in bytes; only checked by an
                   ASSERT
    pUnicodeChar - decoded character; set even on failure (to 0)
    pBytesToSkip - number of bytes consumed from pSourceChar; always 1
Return Value:
    STATUS_SUCCESS, or STATUS_OBJECT_PATH_SYNTAX_BAD if the byte maps to 0
--***************************************************************************/
NTSTATUS
HttppPopCharHostNameAnsi(
    IN  PCUCHAR pSourceChar,
    IN  ULONG   SourceLength,
    OUT PULONG  pUnicodeChar,
    OUT PULONG  pBytesToSkip
    )
{
    NTSTATUS Status;

#if !DBG
    UNREFERENCED_PARAMETER(SourceLength);
#endif // !DBG

    PAGED_CODE();

    ASSERT(SourceLength > 0);

    *pUnicodeChar = AnsiToUnicodeMap[pSourceChar[0]];
    *pBytesToSkip = 1;

    // A zero entry in the map is treated as "no mapping"
    Status = (0 != *pUnicodeChar)
                ? STATUS_SUCCESS
                : STATUS_OBJECT_PATH_SYNTAX_BAD;

    if (!NT_SUCCESS(Status))
    {
        // Cast so that the argument matches the %lu conversion
        UlTraceError(PARSER, (
            "http!HttppPopCharHostNameAnsi(%p): "
            "No mapping for %lu.\n",
            pSourceChar, (ULONG) pSourceChar[0]
            ));
    }

    return Status;

} // HttppPopCharHostNameAnsi
/***************************************************************************++
Routine Description:
    Shared epilogue for the HttppPopCharAbsPath*() decoders. Rewrites a
    backslash as a forward slash, optionally rejects invalid 8-bit
    characters, and stores the results.
Arguments:
    pSourceChar          - Input buffer (used for tracing only)
    SourceLength         - Length of pSourceChar, in bytes (ASSERT only)
    UnicodeChar          - decoded character
    BytesToSkip          - number of bytes consumed from pSourceChar
    AllowRestrictedChars - if TRUE, skip the IS_URL_INVALID check
    pUnicodeChar         - where to put the UnicodeChar result
    pBytesToSkip         - where to put the BytesToSkip result
Return Value:
    STATUS_SUCCESS or STATUS_OBJECT_PATH_SYNTAX_BAD
--***************************************************************************/
__inline
NTSTATUS
HttppPopCharAbsPathCommonTail(
    IN  PCUCHAR pSourceChar,
    IN  ULONG   SourceLength,
    IN  ULONG   UnicodeChar,
    IN  ULONG   BytesToSkip,
    IN  BOOLEAN AllowRestrictedChars,
    OUT PULONG  pUnicodeChar,
    OUT PULONG  pBytesToSkip
    )
{
    ULONG   ResultChar = UnicodeChar;
    BOOLEAN IsEightBit = (BOOLEAN) (UnicodeChar < 0x100);

#if !DBG
    UNREFERENCED_PARAMETER(pSourceChar);
    UNREFERENCED_PARAMETER(SourceLength);
#endif // !DBG

    //
    // Only characters in the 8-bit range get special treatment here.
    // BytesToSkip could be used to tell raw data apart from
    // hex-escaped/UTF-8-encoded data; e.g., to decide whether %2F or
    // %u002F should be accepted as alternate spellings of '/'.
    //
    if (IsEightBit  &&  BACK_SLASH == UnicodeChar)
    {
        // Backslashes become forward slashes
        ResultChar = FORWARD_SLASH;
    }
    else if (IsEightBit
             &&  !AllowRestrictedChars
             &&  IS_URL_INVALID(UnicodeChar))
    {
        UlTraceError(PARSER, (
            "http!HttppPopCharAbsPathCommonTail(%p): "
            "Invalid character, U+%04X.\n",
            pSourceChar, UnicodeChar
            ));
        RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
    }

    // CODEWORK: should hex-escaped "restricted" or "unwise" characters
    // be allowed at all?

    ASSERT(BytesToSkip <= SourceLength);

    *pBytesToSkip = BytesToSkip;
    *pUnicodeChar = ResultChar;

    return STATUS_SUCCESS;

} // HttppPopCharAbsPathCommonTail
/***************************************************************************++
Routine Description:
    Consume 1-12 bytes from pSourceChar. Handle hex-escaped UTF-8 encoding.
    A literal byte must be 7-bit ASCII; a '%NN' escape is treated as a
    UTF-8 lead byte whose trail bytes must also be '%NN' escapes; a
    '%uNNNN' escape is taken as-is without UTF-8 decoding.
    This routine is only suitable for the /abspath part of an HTTP URL.
Arguments:
    pSourceChar          - Input buffer
    SourceLength         - Length of pSourceChar, in bytes; must be non-zero
    PercentUAllowed      - Accept '%uNNNN' notation for the first escape?
    AllowRestrictedChars - passed through to HttppPopCharAbsPathCommonTail
    pUnicodeChar         - decoded character
    pBytesToSkip         - number of bytes consumed from pSourceChar
Return Value:
    STATUS_SUCCESS or STATUS_OBJECT_PATH_SYNTAX_BAD
--***************************************************************************/
NTSTATUS
HttppPopCharAbsPathUtf8(
    IN  PCUCHAR pSourceChar,
    IN  ULONG   SourceLength,
    IN  BOOLEAN PercentUAllowed,
    IN  BOOLEAN AllowRestrictedChars,
    OUT PULONG  pUnicodeChar,
    OUT PULONG  pBytesToSkip
    )
{
    NTSTATUS    Status;
    ULONG       UnicodeChar;
    ULONG       BytesToSkip;
    ULONG       Temp;
    ULONG       OctetCount;
    UCHAR       Octets[4];
    UCHAR       LeadByte;

    //
    // Sanity check.
    //

    PAGED_CODE();

    ASSERT(SourceLength > 0);

    //
    // validate it as a valid URL character
    //

    if (! IS_URL_TOKEN(pSourceChar[0]))
    {
        UlTraceError(PARSER, (
            "http!HttppPopCharAbsPathUtf8(%p): "
            "first char, 0x%02lX, isn't URL token\n",
            pSourceChar, (ULONG) pSourceChar[0]
            ));
        RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
    }

    //
    // need to unescape hex encoding, '%NN' or '%uNNNN'?
    //

    if (PERCENT != pSourceChar[0])
    {
        UnicodeChar = pSourceChar[0];
        BytesToSkip = 1;

        //
        // All octets with bit7 set MUST be hex-escaped.
        // Do NOT accept literals with hi-bit set.
        //

        if (UnicodeChar > ASCII_MAX)
        {
            UlTraceError(PARSER, (
                "http!HttppPopCharAbsPathUtf8(%p): "
                "Invalid hi-bit literal, 0x%02lX.\n",
                pSourceChar, UnicodeChar
                ));
            RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
        }

        Status = STATUS_SUCCESS;
        goto unslash;
    }

    Status = HttpUnescapePercentHexEncoding(
                    pSourceChar,
                    SourceLength,
                    PercentUAllowed,
                    &UnicodeChar,
                    &BytesToSkip
                    );

    if (! NT_SUCCESS(Status))
    {
        UlTraceError(PARSER, (
            "http!HttppPopCharAbsPathUtf8(%p): "
            "Invalid hex encoding.\n",
            pSourceChar
            ));
        return Status;
    }

    //
    // If we consumed '%uNNNN', don't attempt any UTF-8 decoding
    //

    if (STRLEN_LIT("%uNNNN") == BytesToSkip)
        goto unslash;

    ASSERT(STRLEN_LIT("%NN") == BytesToSkip);
    ASSERT(UnicodeChar <= 0xFF);

    // The decoded octet is the UTF-8 lead byte; it tells us how many
    // octets make up the whole sequence.
    Octets[0] = LeadByte = (UCHAR) UnicodeChar;

    OctetCount = UTF8_OCTET_COUNT(LeadByte);

    if (0 == OctetCount)
    {
        UlTraceError(PARSER, (
            "http!HttppPopCharAbsPathUtf8(%p): "
            "Invalid lead byte, 0x%02lX.\n",
            pSourceChar, UnicodeChar
            ));
        RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
    }

    ASSERT(OctetCount <= sizeof(Octets) / sizeof(Octets[0]));

    // Every octet of the sequence must be present as a '%NN' triplet
    BytesToSkip = OctetCount * STRLEN_LIT("%NN");

    if (BytesToSkip > SourceLength)
    {
        // Report the available length first and the required octet count
        // second, so that the values line up with the message text.
        UlTraceError(PARSER, (
            "http!HttppPopCharAbsPathUtf8(%p): "
            "%lu octets is not enough for %lu-byte UTF-8 encoding.\n",
            pSourceChar, SourceLength, OctetCount
            ));
        RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
    }

    if (OctetCount == 1)
    {
#if DBG
        // Singleton: no trail bytes. Checked builds verify that the
        // decoder agrees the lead byte stands for itself.
        Status = HttpUtf8RawBytesToUnicode(
                        Octets,
                        OctetCount,
                        &UnicodeChar,
                        &Temp
                        );

        ASSERT(STATUS_SUCCESS == Status);
        ASSERT(UnicodeChar == LeadByte);
        ASSERT(1 == Temp);
#endif // DBG
    }
    else
    {
        ULONG i;

        //
        // Decode the hex-escaped trail bytes
        //

        for (i = 1;  i < OctetCount;  ++i)
        {
            ULONG TrailChar;
            UCHAR TrailByte;

            Status = HttpUnescapePercentHexEncoding(
                            pSourceChar + i * STRLEN_LIT("%NN"),
                            STRLEN_LIT("%NN"),
                            FALSE,      // do not allow %uNNNN for trail bytes
                            &TrailChar,
                            &Temp
                            );

            if (! NT_SUCCESS(Status))
            {
                UlTraceError(PARSER, (
                    "http!HttppPopCharAbsPathUtf8(%p): "
                    "Invalid hex-encoded trail byte[%lu].\n",
                    pSourceChar, i
                    ));
                return Status;
            }

            ASSERT(STRLEN_LIT("%NN") == Temp);
            ASSERT(TrailChar <= 0xFF);

            Octets[i] = TrailByte = (UCHAR) TrailChar;

            if (! IS_UTF8_TRAILBYTE(TrailByte))
            {
                UlTraceError(PARSER, (
                    "http!HttppPopCharAbsPathUtf8(%p): "
                    "Invalid trail byte[%lu], 0x%02lX.\n",
                    pSourceChar, i, TrailChar
                    ));
                RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
            }
        }

        //
        // Decode the raw UTF-8 bytes
        //

        Status = HttpUtf8RawBytesToUnicode(
                        Octets,
                        OctetCount,
                        &UnicodeChar,
                        &Temp
                        );

        if (! NT_SUCCESS(Status))
        {
            UlTraceError(PARSER, (
                "http!HttppPopCharAbsPathUtf8(%p): "
                "Invalid UTF-8 sequence.\n",
                pSourceChar
                ));
            RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
        }
    }

unslash:

    ASSERT(NT_SUCCESS(Status));

    return HttppPopCharAbsPathCommonTail(
                pSourceChar,
                SourceLength,
                UnicodeChar,
                BytesToSkip,
                AllowRestrictedChars,
                pUnicodeChar,
                pBytesToSkip
                );

} // HttppPopCharAbsPathUtf8
/***************************************************************************++
Routine Description:
Consume 1-6 bytes from pSourceChar. Handle hex-escaped DBCS encoding.
Each of the (at most two) bytes of the DBCS character may appear either
as a literal byte or as a '%NN' escape. A leading '%uNNNN' escape is
taken as-is, without any DBCS-to-Unicode conversion.
This routine is only suitable for the /abspath part of an HTTP URL.
Arguments:
pSourceChar - Input buffer
SourceLength - Length of pSourceChar, in bytes; must be non-zero
PercentUAllowed - Accept '%uNNNN' notation for the first escape?
AllowRestrictedChars - passed through to HttppPopCharAbsPathCommonTail
pUnicodeChar - decoded character
pBytesToSkip - number of bytes consumed from pSourceChar
Return Value:
STATUS_SUCCESS, STATUS_OBJECT_PATH_SYNTAX_BAD, or the failure status
returned by HttpUnescapePercentHexEncoding / RtlMultiByteToUnicodeN
--***************************************************************************/
NTSTATUS
HttppPopCharAbsPathDbcs(
IN PCUCHAR pSourceChar,
IN ULONG SourceLength,
IN BOOLEAN PercentUAllowed,
IN BOOLEAN AllowRestrictedChars,
OUT PULONG pUnicodeChar,
OUT PULONG pBytesToSkip
)
{
NTSTATUS Status;
ULONG UnicodeChar;
WCHAR WideChar;
ULONG BytesToSkip;
UCHAR AnsiChar[2];
ULONG AnsiCharSize;
UCHAR LeadByte;
UCHAR SecondByte = 0;
//
// Sanity check.
//
PAGED_CODE();
ASSERT(SourceLength > 0);
// NOTE(review): HttppPopCharAbsPathAnsi additionally lets the first byte
// through when its top bit is set, even if IS_URL_TOKEN rejects it. This
// routine has no such escape hatch. Whether IS_URL_TOKEN already accepts
// 0x80-0xFF is not visible from this file -- confirm that raw DBCS lead
// bytes are not rejected here.
if (! IS_URL_TOKEN(pSourceChar[0]))
{
UlTraceError(PARSER, (
"http!HttppPopCharAbsPathDbcs(%p): "
"first char, 0x%02lX, isn't URL token\n",
pSourceChar, (ULONG) pSourceChar[0]
));
RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
}
if (PERCENT != pSourceChar[0])
{
// Note: unlike UTF-8, we allow literal bytes whose top bit is set
UnicodeChar = pSourceChar[0];
BytesToSkip = 1;
}
else
{
// need to unescape hex encoding, '%NN' or '%uNNNN'
Status = HttpUnescapePercentHexEncoding(
pSourceChar,
SourceLength,
PercentUAllowed,
&UnicodeChar,
&BytesToSkip
);
if (! NT_SUCCESS(Status))
{
UlTraceError(PARSER, (
"http!HttppPopCharAbsPathDbcs(%p): "
"Invalid hex encoding.\n",
pSourceChar
));
return Status;
}
//
// If we consumed '%uNNNN', don't attempt DBCS-to-Unicode conversion
//
if (STRLEN_LIT("%uNNNN") == BytesToSkip)
goto unslash;
ASSERT(STRLEN_LIT("%NN") == BytesToSkip);
ASSERT(UnicodeChar <= 0xFF);
}
// At this point UnicodeChar holds a single byte (literal or decoded
// from '%NN') and BytesToSkip is the number of source bytes it occupied.
LeadByte = (UCHAR) UnicodeChar;
AnsiChar[0] = LeadByte;
if (! IS_DBCS_LEAD_BYTE(LeadByte))
{
AnsiCharSize = 1;
}
else
{
//
// This is a double-byte character.
//
ASSERT(BytesToSkip <= SourceLength);
// A lead byte with nothing after it is malformed
if (BytesToSkip == SourceLength)
{
UlTraceError(PARSER, (
"http!HttppPopCharAbsPathDbcs(%p): "
"ERROR: DBCS lead byte, 0x%02lX, at end of string\n",
pSourceChar, UnicodeChar
));
RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
}
AnsiCharSize = 2;
// The trail byte is either the next literal byte or a '%NN' escape
SecondByte = pSourceChar[BytesToSkip];
if (PERCENT != SecondByte)
{
BytesToSkip += 1;
}
else
{
ULONG TrailChar;
ULONG Temp;
// Make sure a full '%NN' triplet fits in the remaining input
if (BytesToSkip + STRLEN_LIT("%NN") > SourceLength)
{
UlTraceError(PARSER, (
"http!HttppPopCharAbsPathDbcs(%p): "
"ERROR: no space for DBCS hex-encoded suffix\n",
pSourceChar
));
RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
}
Status = HttpUnescapePercentHexEncoding(
pSourceChar + BytesToSkip,
SourceLength - BytesToSkip,
FALSE, // no %uNNNN allowed here
&TrailChar,
&Temp
);
if (! NT_SUCCESS(Status))
{
UlTraceError(PARSER, (
"http!HttppPopCharAbsPathDbcs(%p): "
"Invalid hex encoding of trail byte.\n",
pSourceChar
));
return Status;
}
ASSERT(STRLEN_LIT("%NN") == Temp);
ASSERT(TrailChar <= 0xFF);
SecondByte = (UCHAR) TrailChar;
BytesToSkip += STRLEN_LIT("%NN");
}
AnsiChar[1] = SecondByte;
}
// Convert the collected one or two bytes into a single WCHAR
Status = RtlMultiByteToUnicodeN(
&WideChar,
sizeof(WCHAR),
NULL,
(PCHAR) &AnsiChar[0],
AnsiCharSize
);
if (!NT_SUCCESS(Status))
{
UlTraceError(PARSER, (
"http!HttppPopCharAbsPathDbcs(%p): "
"MultiByteToUnicode(%lu) failed, %s.\n",
pSourceChar, AnsiCharSize, HttpStatusToString(Status)
));
return Status;
}
UnicodeChar = WideChar;
#if DBG
//
// Describe conversion in debug spew.
//
if (1 == AnsiCharSize)
{
UlTraceVerbose(PARSER, (
"http!HttppPopCharAbsPathDbcs(%p): "
"converted %02X to U+%04lX '%C'\n",
pSourceChar,
LeadByte,
UnicodeChar,
UnicodeChar
));
}
else
{
ASSERT(2 == AnsiCharSize);
UlTraceVerbose(PARSER, (
"http!HttppPopCharAbsPathDbcs(%p): "
"converted %02X %02X to U+%04lX '%C'\n",
pSourceChar,
LeadByte,
SecondByte,
UnicodeChar,
UnicodeChar
));
}
#endif // DBG
// Reached both by falling through after conversion and by the '%uNNNN'
// shortcut above; Status has been set to a success value on either path.
unslash:
ASSERT(NT_SUCCESS(Status));
return HttppPopCharAbsPathCommonTail(
pSourceChar,
SourceLength,
UnicodeChar,
BytesToSkip,
AllowRestrictedChars,
pUnicodeChar,
pBytesToSkip
);
} // HttppPopCharAbsPathDbcs
/***************************************************************************++
Routine Description:
    Consume 1-6 bytes from pSourceChar. Handle hex-escaped ANSI encoding.
    A literal byte or a '%NN' escape is translated through
    AnsiToUnicodeMap; a '%uNNNN' escape is used as-is.
    This routine is only suitable for the /abspath part of an HTTP URL.
Arguments:
    pSourceChar          - Input buffer
    SourceLength         - Length of pSourceChar, in bytes; must be non-zero
    PercentUAllowed      - Accept '%uNNNN' notation?
    AllowRestrictedChars - passed through to HttppPopCharAbsPathCommonTail
    pUnicodeChar         - decoded character
    pBytesToSkip         - number of bytes consumed from pSourceChar
Return Value:
    STATUS_SUCCESS or STATUS_OBJECT_PATH_SYNTAX_BAD
--***************************************************************************/
NTSTATUS
HttppPopCharAbsPathAnsi(
    IN  PCUCHAR pSourceChar,
    IN  ULONG   SourceLength,
    IN  BOOLEAN PercentUAllowed,
    IN  BOOLEAN AllowRestrictedChars,
    OUT PULONG  pUnicodeChar,
    OUT PULONG  pBytesToSkip
    )
{
    NTSTATUS    Status = STATUS_SUCCESS;
    ULONG       Decoded;
    ULONG       Consumed;
    UCHAR       FirstByte;

    //
    // Sanity check.
    //

    PAGED_CODE();

    ASSERT(SourceLength > 0);

    FirstByte = pSourceChar[0];

    //
    // The first byte is acceptable if it is a URL token or if its top
    // bit is set: DBCS and ANSI decoders must allow any raw byte in the
    // range 0x80-0xFF.
    //

    if (! (IS_URL_TOKEN(FirstByte)  ||  (0x80 & FirstByte)))
    {
        UlTraceError(PARSER, (
            "http!HttppPopCharAbsPathAnsi(%p): "
            "first char, 0x%02lX, isn't URL token\n",
            pSourceChar, (ULONG) FirstByte
            ));
        RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
    }

    if (PERCENT == FirstByte)
    {
        // Hex escape: '%NN' or '%uNNNN'
        Status = HttpUnescapePercentHexEncoding(
                        pSourceChar,
                        SourceLength,
                        PercentUAllowed,
                        &Decoded,
                        &Consumed
                        );

        if (! NT_SUCCESS(Status))
        {
            UlTraceError(PARSER, (
                "http!HttppPopCharAbsPathAnsi(%p): "
                "Invalid hex encoding.\n",
                pSourceChar
                ));
            return Status;
        }

        //
        // Only a '%NN' octet goes through the ANSI table; a '%uNNNN'
        // value is already Unicode.
        //

        if (STRLEN_LIT("%uNNNN") != Consumed)
        {
            ASSERT(STRLEN_LIT("%NN") == Consumed);
            ASSERT(Decoded <= 0xFF);

            Decoded = AnsiToUnicodeMap[(UCHAR) Decoded];
        }
    }
    else
    {
        // Literal byte. Unlike UTF-8, bytes with the top bit set are allowed.
        Decoded  = AnsiToUnicodeMap[ FirstByte ];
        Consumed = 1;
    }

    ASSERT(NT_SUCCESS(Status));

    return HttppPopCharAbsPathCommonTail(
                pSourceChar,
                SourceLength,
                Decoded,
                Consumed,
                AllowRestrictedChars,
                pUnicodeChar,
                pBytesToSkip
                );

} // HttppPopCharAbsPathAnsi
/***************************************************************************++
Routine Description:
    Consume 1 byte from pSourceChar and return it unaltered.
    This routine is only suitable for the ?querystring part of an HTTP URL,
    which we do not interpret.
    CODEWORK: don't 'convert' querystring to Unicode. Send it up verbatim.
Arguments:
    pSourceChar          - Input buffer
    SourceLength         - Length of pSourceChar, in bytes (unused)
    PercentUAllowed      - unused
    AllowRestrictedChars - unused
    pUnicodeChar         - receives the first byte of pSourceChar
    pBytesToSkip         - number of bytes consumed from pSourceChar;
                           always 1
Return Value:
    STATUS_SUCCESS, always
--***************************************************************************/
NTSTATUS
HttppPopCharQueryString(
    IN  PCUCHAR pSourceChar,
    IN  ULONG   SourceLength,
    IN  BOOLEAN PercentUAllowed,
    IN  BOOLEAN AllowRestrictedChars,
    OUT PULONG  pUnicodeChar,
    OUT PULONG  pBytesToSkip
    )
{
    PAGED_CODE();

    // The signature matches the abs-path decoders; these are not needed here
    UNREFERENCED_PARAMETER(SourceLength);
    UNREFERENCED_PARAMETER(PercentUAllowed);
    UNREFERENCED_PARAMETER(AllowRestrictedChars);

    // Widen the byte; no decoding or validation of any kind
    *pUnicodeChar = pSourceChar[0];
    *pBytesToSkip = 1;

    return STATUS_SUCCESS;

} // HttppPopCharQueryString
//
// Local helper macro: append the code point 'ch' to the WCHAR buffer at
// pDest as UTF-16, advance pDest, and add the number of bytes written
// to BytesCopied.
//
// - A value above LOW_NONCHAR_BITS is converted with HttpUcs4toUtf16 and
//   written as a high/low surrogate pair.
// - Any other value is written as a single WCHAR, unless
//   IS_UNICODE_NONCHAR() rejects it.
//
// On failure, Status is set and control jumps to the label 'end', which
// the calling function must provide. pDest, BytesCopied and Status must be
// modifiable lvalues. 'ch' is evaluated more than once, so it must not
// have side effects. AllowRestrictedChars is currently unused.
//
#define EMIT_CHAR(ch, pDest, BytesCopied, Status, AllowRestrictedChars) \
    do                                                                  \
    {                                                                   \
        WCHAR HighSurrogate, LowSurrogate;                              \
                                                                        \
        if ((ch) > LOW_NONCHAR_BITS)                                    \
        {                                                               \
            Status = HttpUcs4toUtf16((ch),                              \
                                     &HighSurrogate, &LowSurrogate);    \
                                                                        \
            if (! NT_SUCCESS(Status))                                   \
                goto end;                                               \
                                                                        \
            *pDest++ = HighSurrogate;                                   \
            *pDest++ = LowSurrogate;                                    \
            BytesCopied += 2 * sizeof(WCHAR);                           \
        }                                                               \
        else                                                            \
        {                                                               \
            /* a lone surrogate must never get this far */              \
            ASSERT((ch) < HIGH_SURROGATE_START                          \
                    ||  LOW_SURROGATE_END < (ch));                      \
                                                                        \
            if ( IS_UNICODE_NONCHAR((ch)) )                             \
            {                                                           \
                UlTraceError(PARSER, (                                  \
                    "http!HttpUcs4toUtf16(): "                          \
                    "Non-character code point, U+%04lX.\n",             \
                    (ch) ));                                            \
                                                                        \
                Status = STATUS_INVALID_PARAMETER;                      \
                goto end;                                               \
            }                                                           \
                                                                        \
            *pDest++ = (WCHAR) (ch);                                    \
            BytesCopied += sizeof(WCHAR);                               \
        }                                                               \
                                                                        \
        /* Can probably omit this test */                               \
        if (BytesCopied > UNICODE_STRING_MAX_BYTE_LEN)                  \
        {                                                               \
            Status = STATUS_DATA_OVERRUN;                               \
            goto end;                                                   \
        }                                                               \
    } while (0, 0)
//
// Append the character 'ch' to the WCHAR buffer at pDest, advance pDest,
// and add sizeof(WCHAR) to BytesCopied. No validation is done beyond
// asserting IS_ASCII(ch); there is no failure path.
//
#define EMIT_LITERAL_CHAR(ch, pDest, BytesCopied) \
do \
{ \
ASSERT(IS_ASCII(ch)); \
\
*pDest++ = (WCHAR) (ch); \
BytesCopied += sizeof(WCHAR); \
} while (0, 0)
//
// Map a URL decode type to a short name for trace output: "Ansi" for
// UrlDecode_Ansi, "Dbcs" for UrlDecode_Dbcs, and "Utf8" for anything else.
// The argument is parenthesized so that an expression can be passed safely.
//
#define HttppUrlEncodingToString(UrlEncoding)   \
    (((UrlEncoding) == UrlDecode_Ansi)          \
        ? "Ansi"                                \
        : ((UrlEncoding) == UrlDecode_Dbcs)     \
            ? "Dbcs"                            \
            : "Utf8")
/***************************************************************************++
Routine Description:
    Copies a hostname, converting it to Unicode. Each decoder named in
    pCfg->HostnameDecodeOrder is tried in turn (lowest-order field first)
    until one of them succeeds.
Arguments:
    pCfg                  - canonicalization config; supplies
                            HostnameDecodeOrder
    pDestination          - receives the Unicode hostname
    pSource               - raw hostname bytes
    SourceLength          - length of pSource, in bytes
    pBytesCopied          - receives the number of bytes written to
                            pDestination (set by HttppCopyHostByType)
    pHostnameEncodingType - receives the encoding that succeeded; untouched
                            on failure
Return Value:
    NTSTATUS - Completion status. STATUS_INVALID_PARAMETER if the decode
    order is zero or has bits outside UrlDecode_MaxMask; otherwise the
    status of the last decoder tried.
--***************************************************************************/
NTSTATUS
HttpCopyHost(
    IN      PURL_C14N_CONFIG    pCfg,
    OUT     PWSTR               pDestination,
    IN      PCUCHAR             pSource,
    IN      ULONG               SourceLength,
    OUT     PULONG              pBytesCopied,
    OUT     PURL_ENCODING_TYPE  pHostnameEncodingType
    )
{
    NTSTATUS    Status = STATUS_UNSUCCESSFUL;
    ULONG       DecodeOrder;

    PAGED_CODE();

    ASSERT(NULL != pCfg);
    ASSERT(NULL != pDestination);
    ASSERT(NULL != pSource);
    ASSERT(NULL != pBytesCopied);
    ASSERT(NULL != pHostnameEncodingType);

    // Read the config only after pCfg has been checked, so that a NULL
    // pointer trips the ASSERT instead of faulting in the initializer.
    DecodeOrder = pCfg->HostnameDecodeOrder;

    if (0 == DecodeOrder  ||  DecodeOrder != (DecodeOrder & UrlDecode_MaxMask))
    {
        UlTraceError(PARSER,
                    ("http!HttpCopyHost: invalid DecodeOrder, 0x%lX\n",
                    DecodeOrder
                    ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    // Peel one decode type off the low end of DecodeOrder per iteration,
    // stopping at the first decoder that succeeds.
    for ( ;
         0 != DecodeOrder  &&  !NT_SUCCESS(Status);
         DecodeOrder >>= UrlDecode_Shift
        )
    {
        ULONG UrlEncoding = (DecodeOrder & UrlDecode_Mask);

        switch (UrlEncoding)
        {
            default:
                ASSERT(! "Impossible UrlDecodeOrder");
                // fall through

            case UrlDecode_None:
                break;

            case UrlDecode_Ansi:
            case UrlDecode_Dbcs:
            case UrlDecode_Utf8:

                UlTraceVerbose(PARSER,
                                ("http!HttpCopyHost(%s, Src=%p, %lu)\n",
                                 HttppUrlEncodingToString(UrlEncoding),
                                 pSource, SourceLength
                                ));

                Status = HttppCopyHostByType(
                            (URL_ENCODING_TYPE) UrlEncoding,
                            pDestination,
                            pSource,
                            SourceLength,
                            pBytesCopied
                            );

                if (NT_SUCCESS(Status))
                {
                    *pHostnameEncodingType = (URL_ENCODING_TYPE) UrlEncoding;

                    UlTraceVerbose(PARSER,
                                    ("http!HttpCopyHost(%s): "
                                     "(%lu) '%.*s' -> (%lu) '%ls'\n",
                                     HttppUrlEncodingToString(UrlEncoding),
                                     SourceLength, SourceLength, pSource,
                                     *pBytesCopied/sizeof(WCHAR), pDestination
                                    ));
                }

                break;
        }
    }

    return Status;

} // HttpCopyHost
/***************************************************************************++
Routine Description:
    Copies a hostname, converting it to Unicode using one specific decoder.
    ASCII bytes are copied directly; any other byte is handed to the
    pop-char routine for the requested encoding. The output is always
    NUL-terminated on success.
    CODEWORK: Handle ACE-encoded hostnames
Arguments:
    UrlEncoding  - which decoder to use: Ansi, Dbcs, or Utf8
    pDestination - receives the NUL-terminated Unicode hostname
    pSource      - raw hostname bytes
    SourceLength - length of pSource, in bytes
    pBytesCopied - receives the number of bytes written, not counting the
                   terminator; written only on success
Return Value:
    NTSTATUS - Completion status. STATUS_INVALID_PARAMETER for an unknown
    UrlEncoding; otherwise STATUS_SUCCESS or the failure reported by the
    decoder or by EMIT_CHAR.
--***************************************************************************/
NTSTATUS
HttppCopyHostByType(
    IN      URL_ENCODING_TYPE   UrlEncoding,
    OUT     PWSTR               pDestination,
    IN      PCUCHAR             pSource,
    IN      ULONG               SourceLength,
    OUT     PULONG              pBytesCopied
    )
{
    NTSTATUS                Status;
    PWSTR                   pDest;
    PCUCHAR                 pChar;
    ULONG                   BytesCopied;
    ULONG                   UnicodeChar;
    ULONG                   CharToSkip;
    PFN_POPCHAR_HOSTNAME    pfnPopChar;

    // Pick the decoder used for non-ASCII bytes
    if (UrlEncoding_Ansi == UrlEncoding)
        pfnPopChar = &HttppPopCharHostNameAnsi;
    else if (UrlEncoding_Dbcs == UrlEncoding)
        pfnPopChar = &HttppPopCharHostNameDbcs;
    else if (UrlEncoding_Utf8 == UrlEncoding)
        pfnPopChar = &HttppPopCharHostNameUtf8;
    else
    {
        ASSERT(! "Invalid UrlEncoding");
        RETURN(STATUS_INVALID_PARAMETER);
    }

    //
    // Sanity check.
    //

    PAGED_CODE();

    pDest = pDestination;
    BytesCopied = 0;

    pChar = pSource;

    // The signed comparison also ends the loop if SourceLength were ever
    // to wrap below zero.
    while ((int)SourceLength > 0)
    {
        UnicodeChar = *pChar;

        if (IS_ASCII(UnicodeChar))
        {
            CharToSkip = 1;
        }
        else
        {
            Status = (*pfnPopChar)(
                            pChar,
                            SourceLength,
                            &UnicodeChar,
                            &CharToSkip
                            );

            if (NT_SUCCESS(Status) == FALSE)
                goto end;
        }

        ASSERT(CharToSkip <= SourceLength);

        // Jumps to 'end' with Status set if the code point is rejected
        EMIT_CHAR(
            UnicodeChar,
            pDest,
            BytesCopied,
            Status,
            FALSE
            );

        pChar += CharToSkip;
        SourceLength -= CharToSkip;
    }

    //
    // terminate the string, it hasn't been done in the loop
    //

    // Only look at the previous character if one was actually written;
    // for an empty source, pDest still equals pDestination and pDest[-1]
    // would be outside the buffer.
    ASSERT(pDest == pDestination  ||  (pDest-1)[0] != UNICODE_NULL);

    pDest[0] = UNICODE_NULL;
    *pBytesCopied = BytesCopied;

    Status = STATUS_SUCCESS;

end:
    return Status;

} // HttppCopyHostByType
/*++
Routine Description:
Validates that a hostname is well-formed
CODEWORK: For future IDN (International Domain Names) work,
we may need to handle raw UTF-8 or ACE hostnames.
Note: if the validation algorithm changes here, it may be necessary
to update HttpParseUrl() too.
Arguments:
pHostname - the hostname
HostnameLength - length of hostname, in bytes
HostnameType - Source of the hostname: Host header, AbsUri, or
synthesized from the transport's local IP address
Return Value:
STATUS_SUCCESS if valid
--*/
NTSTATUS
HttpValidateHostname(
IN PURL_C14N_CONFIG pCfg,
IN PCUCHAR pHostname,
IN ULONG HostnameLength,
IN HOSTNAME_TYPE HostnameType,
OUT PSHORT pAddressType
)
{
PCUCHAR pChar;
PCUCHAR pLabel;
PCUCHAR pEnd = pHostname + HostnameLength;
PCSTR pTerminator;
NTSTATUS Status;
USHORT Port;
struct in_addr IPv4Address;
struct in6_addr IPv6Address;
BOOLEAN AlphaLabel;
PAGED_CODE();
ASSERT(NULL != pCfg);
ASSERT(NULL != pHostname);
ASSERT(NULL != pAddressType);
if (0 == HostnameLength)
{
// RFC 2616, 14.23 "Host" says that the Host header can be empty
if (Hostname_HostHeader == HostnameType)
goto end;
// It is an error for empty hostnames to appear elsewhere
UlTraceError(PARSER,
("http!HttpValidateHostname: empty hostname\n"
));
RETURN(STATUS_INVALID_PARAMETER);
}
// Is this an IPv6 literal address, per RFC 2732?
if ('[' == *pHostname)
{
// Empty brackets?
if (HostnameLength < STRLEN_LIT("[0]") || ']' == pHostname[1])
{
UlTraceError(PARSER,
("http!HttpValidateHostname: IPv6 address too short\n"
));
RETURN(STATUS_INVALID_PARAMETER);
}
for (pChar = pHostname + STRLEN_LIT("["); pChar < pEnd; ++pChar)
{
if (']' == *pChar)
break;
//
// Dots are allowed because the last 32 bits may be represented
// in IPv4 dotted-octet notation. We do not accept Scope IDs
// (indicated by '%') in hostnames.
//
if (IS_HTTP_HEX(*pChar) || ':' == *pChar || '.' == *pChar)
continue;
UlTraceError(PARSER,
("http!HttpValidateHostname: "
"Invalid char in IPv6 address, 0x%02X '%c', "
"after '%.*s'\n",
*pChar,
IS_HTTP_PRINT(*pChar) ? *pChar : '?',
DIFF(pChar - pHostname),
pHostname
));
RETURN(STATUS_INVALID_PARAMETER);
}
if (pChar == pEnd)
{
UlTraceError(PARSER,
("http!HttpValidateHostname: No ']' for IPv6 address\n"
));
RETURN(STATUS_INVALID_PARAMETER);
}
ASSERT(pChar < pEnd);
ASSERT(']' == *pChar);
// Let the RTL routine do the hard work of parsing IPv6 addrs
Status = RtlIpv6StringToAddressA(
(PCSTR) pHostname + STRLEN_LIT("["),
&pTerminator,
&IPv6Address
);
if (! NT_SUCCESS(Status))
{
UlTraceError(PARSER,
("http!HttpValidateHostname: "
"Invalid IPv6 address, %s\n",
HttpStatusToString(Status)
));
RETURN(Status);
}
if (pTerminator != (PCSTR) pChar)
{
UlTraceError(PARSER,
("http!HttpValidateHostname: "
"Invalid IPv6 terminator, 0x%02X '%c'\n",
*pTerminator,
IS_HTTP_PRINT(*pTerminator) ? *pTerminator : '?'
));
RETURN(STATUS_INVALID_PARAMETER);
}
*pAddressType = TDI_ADDRESS_TYPE_IP6;
// Skip the terminating ']'
pChar += STRLEN_LIT("]");
// Any chars after the ']'?
if (pChar == pEnd)
{
ASSERT(DIFF(pEnd - pHostname) <= pCfg->MaxHostnameLength);
goto end;
}
ASSERT(pChar < pEnd);
if (':' == *pChar)
goto port;
UlTraceError(PARSER,
("http!HttpValidateHostname: "
"Invalid char after IPv6 ']', 0x%02X '%c'\n",
*pChar,
IS_HTTP_PRINT(*pChar) ? *pChar : '?'
));
RETURN(STATUS_INVALID_PARAMETER);
}
//
// It must be a domain name or an IPv4 literal. We'll try to treat
// it as a domain name first. If it turns out to be all-numeric,
// we'll try decoding it as an IPv4 literal. We'll see if the name
// is well-formed, but we will not do a DNS lookup to see if it exists,
// as that would be much too expensive.
//
AlphaLabel = FALSE;
pLabel = pHostname;
for (pChar = pHostname; pChar < pEnd; ++pChar)
{
if (':' == *pChar)
{
if (pChar == pHostname)
{
UlTraceError(PARSER,
("http!HttpValidateHostname: empty hostname\n"
));
RETURN(STATUS_INVALID_PARAMETER);
}
// exit the loop
break;
}
if ('.' == *pChar)
{
ULONG LabelLength = DIFF(pChar - pLabel);
// There must be at least one char in the label
if (0 == LabelLength)
{
UlTraceError(PARSER,
("http!HttpValidateHostname: empty label\n"
));
RETURN(STATUS_INVALID_PARAMETER);
}
// Label can't have more than 63 chars
if (LabelLength > pCfg->MaxLabelLength)
{
UlTraceError(PARSER,
("http!HttpValidateHostname: overlong label, %lu\n",
LabelLength
));
RETURN(STATUS_INVALID_PARAMETER);
}
// Reset for the next label
pLabel = pChar + STRLEN_LIT(".");
continue;
}
// CODEWORK: handle DBCS characters
if (!IS_URL_ILLEGAL_COMPUTERNAME(*pChar))
{
if (!IS_HTTP_DIGIT(*pChar))
AlphaLabel = TRUE;
if (pChar > pLabel)
continue;
// The first char of a label cannot be a hyphen. (Underscore?)
if ('-' == *pChar)
{
UlTraceError(PARSER,
("http!HttpValidateHostname: "
"'-' at beginning of label\n"
));
RETURN(STATUS_INVALID_PARAMETER);
}
continue;
}
UlTraceError(PARSER,
("http!HttpValidateHostname: "
"Invalid char in hostname, 0x%02X '%c', "
"after '%.*s'\n",
*pChar,
IS_HTTP_PRINT(*pChar) ? *pChar : '?',
DIFF(pChar - pHostname),
pHostname
));
RETURN(STATUS_INVALID_PARAMETER);
} // loop through hostname
ASSERT(pChar == pEnd || ':' == *pChar);
if (AlphaLabel)
{
*pAddressType = 0;
}
else
{
// Let's see if it's a valid IPv4 address
Status = RtlIpv4StringToAddressA(
(PCSTR) pHostname,
TRUE, // strict => 4 dotted decimal octets
&pTerminator,
&IPv4Address
);
if (!NT_SUCCESS(Status))
{
UlTraceError(PARSER,
("http!HttpValidateHostname: "
"Invalid IPv4 address, %s\n",
HttpStatusToString(Status)
));
RETURN(Status);
}
if (pTerminator != (PCSTR) pChar)
{
ASSERT(pTerminator < (PCSTR) pChar);
UlTraceError(PARSER,
("http!HttpValidateHostname: "
"Invalid IPv4 address after %lu chars, "
"0x%02X, '%c'\n",
DIFF(pTerminator - (PCSTR) pHostname),
*pTerminator,
IS_HTTP_PRINT(*pTerminator) ? *pTerminator : '?'
));
RETURN(STATUS_INVALID_PARAMETER);
}
*pAddressType = TDI_ADDRESS_TYPE_IP;
}
port:
//
// Parse the port number
//
// Check for overlong hostnames
if (DIFF(pChar - pHostname) > pCfg->MaxHostnameLength)
{
UlTraceError(PARSER,
("http!HttpValidateHostname: overlong hostname, %lu\n",
DIFF(pChar - pHostname)
));
RETURN(STATUS_INVALID_PARAMETER);
}
if (pChar == pEnd)
goto end;
ASSERT(pHostname < pChar && pChar < pEnd);
ASSERT(':' == *pChar);
pChar += STRLEN_LIT(":");
ASSERT(pChar <= pEnd);
// RFC 2616, section 3.2.2 "http URL", says:
// "If the port is empty or not given, port 80 is assumed".
if (pChar == pEnd)
{
Port = 80;
goto end;
}
Status = HttpAnsiStringToUShort(
pChar,
pEnd - pChar, // <port> must occupy all remaining chars
FALSE, // no leading zeros permitted
10,
(PUCHAR*) &pTerminator,
&Port
);
if (!NT_SUCCESS(Status))
{
UlTraceError(PARSER,
("http!HttpValidateHostname: "
"Invalid port number, %s\n",
HttpStatusToString(Status)
));
RETURN(STATUS_INVALID_PARAMETER);
}
ASSERT(pTerminator == (PCSTR) pEnd);
if (0 == Port)
{
UlTraceError(PARSER,
("http!HttpValidateHostname: Port must not be zero.\n"
));
RETURN(STATUS_INVALID_PARAMETER);
}
end:
RETURN(STATUS_SUCCESS);
} // HttpValidateHostname
/***************************************************************************++

Routine Description:

    Fast-path conversion of a URL to Unicode.

    The encodings packed into pCfg->AbsPathDecodeOrder are tried one at a
    time, lowest-order field first, by handing the source to
    HttppCopyUrlByType(). The walk stops as soon as one attempt succeeds.

Arguments:

    pCfg                - configuration; AbsPathDecodeOrder selects which
                          encodings are attempted and in what order
    pDestination        - receives the converted URL
    pSource             - raw URL bytes
    SourceLength        - length of pSource, in bytes
    pBytesCopied        - forwarded to HttppCopyUrlByType(), which stores
                          the number of bytes it wrote
    pUrlEncodingType    - on success, the encoding that worked

Return Value:

    NTSTATUS - STATUS_INVALID_PARAMETER if AbsPathDecodeOrder is zero or has
    bits outside UrlDecode_MaxMask. Otherwise the status of the last
    conversion attempt, or STATUS_UNSUCCESSFUL if no attempt was made.

--***************************************************************************/
NTSTATUS
HttpCopyUrl(
    IN  PURL_C14N_CONFIG    pCfg,
    OUT PWSTR               pDestination,
    IN  PCUCHAR             pSource,
    IN  ULONG               SourceLength,
    OUT PULONG              pBytesCopied,
    OUT PURL_ENCODING_TYPE  pUrlEncodingType
    )
{
    NTSTATUS    Status = STATUS_UNSUCCESSFUL;
    ULONG       RemainingOrder = pCfg->AbsPathDecodeOrder;

    PAGED_CODE();

    ASSERT(NULL != pDestination);
    ASSERT(NULL != pSource);
    ASSERT(NULL != pBytesCopied);
    ASSERT(NULL != pUrlEncodingType);

    // The decode order must be non-empty and confined to the legal bits
    if (0 == RemainingOrder
        || RemainingOrder != (RemainingOrder & UrlDecode_MaxMask))
    {
        UlTraceError(PARSER,
                    ("http!HttpCopyUrl: invalid DecodeOrder, 0x%lX\n",
                     RemainingOrder
                     ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    // Peel one encoding at a time off the low end of the decode order
    // until one of them converts the URL or there is nothing left to try.
    while (0 != RemainingOrder  &&  !NT_SUCCESS(Status))
    {
        ULONG UrlEncoding = (RemainingOrder & UrlDecode_Mask);

        RemainingOrder >>= UrlDecode_Shift;

        // An empty slot: nothing to attempt
        if (UrlDecode_None == UrlEncoding)
        {
            continue;
        }

        // Anything other than the three known encodings is a logic error
        if (UrlDecode_Ansi != UrlEncoding
            &&  UrlDecode_Dbcs != UrlEncoding
            &&  UrlDecode_Utf8 != UrlEncoding)
        {
            ASSERT(! "Impossible UrlDecodeOrder");
            continue;
        }

        UlTraceVerbose(PARSER,
                        ("http!HttpCopyUrl(%s, Src=%p, %lu)\n",
                         HttppUrlEncodingToString(UrlEncoding),
                         pSource, SourceLength
                         ));

        Status = HttppCopyUrlByType(
                    pCfg,
                    (URL_ENCODING_TYPE) UrlEncoding,
                    pDestination,
                    pSource,
                    SourceLength,
                    pBytesCopied
                    );

        if (!NT_SUCCESS(Status))
        {
            continue;
        }

        // Tell the caller which encoding did the job
        *pUrlEncodingType = (URL_ENCODING_TYPE) UrlEncoding;

        UlTraceVerbose(PARSER,
                        ("http!HttpCopyUrl(%s): "
                         "(%lu) '%.*s' -> (%lu) '%ls'\n",
                         HttppUrlEncodingToString(UrlEncoding),
                         SourceLength, SourceLength, pSource,
                         *pBytesCopied/sizeof(WCHAR), pDestination
                         ));
    }

    return Status;

}   // HttpCopyUrl
/***************************************************************************++

Routine Description:

    Fast-path worker for HttpCopyUrl(): widens a "clean" URL to Unicode one
    byte at a time through the FastPopChars[] lookup table.

    Every source byte is expected to have a non-zero, US-ASCII entry in
    FastPopChars[]. Checked (DBG) builds cross-check each byte against the
    pop-char routine for the requested encoding and assert that the
    per-segment length and segment count stay within the configured limits.
    Free builds do not reference pCfg or UrlEncoding.

Arguments:

    pCfg            - configuration (consulted in DBG builds only)
    UrlEncoding     - UTF-8, ANSI, or DBCS (consulted in DBG builds only)
    pDestination    - receives the zero-terminated Unicode URL
    pSource         - raw URL bytes
    SourceLength    - length of pSource, in bytes
    pBytesCopied    - receives the running byte count maintained by
                      EMIT_LITERAL_CHAR; the terminator is written without
                      touching it

Return Value:

    NTSTATUS - STATUS_SUCCESS, or (DBG builds only) STATUS_INVALID_PARAMETER
    for an unknown UrlEncoding.

--***************************************************************************/
NTSTATUS
HttppCopyUrlByType(
    IN  PURL_C14N_CONFIG    pCfg,
    IN  URL_ENCODING_TYPE   UrlEncoding,
    OUT PWSTR               pDestination,
    IN  PCUCHAR             pSource,
    IN  ULONG               SourceLength,
    OUT PULONG              pBytesCopied
    )
{
    PWSTR               pOut = pDestination;
    PCUCHAR             pIn = pSource;
    ULONG               cbCopied = 0;
    ULONG               Ch;
    ULONG               Advance = 1;
#if DBG
    NTSTATUS            Status;
    PFN_POPCHAR_ABSPATH pfnPopChar;
    PWSTR               pSegStart = pDestination;
    ULONG               cSegments = 0;
#endif // DBG

    //
    // Sanity check.
    //

    PAGED_CODE();

#if DBG
    // Pick the slow-path decoder used to double-check the fast table
    if (UrlEncoding_Ansi == UrlEncoding)
    {
        pfnPopChar = &HttppPopCharAbsPathAnsi;
    }
    else if (UrlEncoding_Dbcs == UrlEncoding)
    {
        pfnPopChar = &HttppPopCharAbsPathDbcs;
    }
    else if (UrlEncoding_Utf8 == UrlEncoding)
    {
        pfnPopChar = &HttppPopCharAbsPathUtf8;
    }
    else
    {
        ASSERT(! "Invalid UrlEncoding");
        RETURN(STATUS_INVALID_PARAMETER);
    }
#else  // !DBG
    UNREFERENCED_PARAMETER(pCfg);
    UNREFERENCED_PARAMETER(UrlEncoding);
#endif // DBG

    for ( ;
         (int) SourceLength > 0;
         pIn += Advance, SourceLength -= Advance
        )
    {
        //
        // Look up the next character.
        //
        // Every clean char has a non-zero entry in FastPopChars[],
        // and every clean char lies in the US-ASCII range, 0-127.
        //
        ULONG FastChar = FastPopChars[*pIn];

        ASSERT(0 != FastChar);
        ASSERT(IS_ASCII(FastChar));

#if DBG
        // The slow decoder must agree with the table, one byte at a time
        Status = (*pfnPopChar)(
                        pIn,
                        SourceLength,
                        pCfg->PercentUAllowed,
                        pCfg->AllowRestrictedChars,
                        &Ch,
                        &Advance
                        );

        ASSERT(NT_SUCCESS(Status));
        ASSERT(Ch == FastChar);
        ASSERT(Advance == 1);
#endif // DBG

        Ch = (WCHAR) FastChar;
        Advance = 1;

#if DBG
        // HttpFindUrlToken() marks as dirty any URL that (appears to)
        // have too many segments or an overlong segment, so none of the
        // assertions below should ever fire on this path.
        if (FORWARD_SLASH == Ch)
        {
            ULONG SegmentLength = DIFF(pOut - pSegStart);

            // Segment length must be in range
            ASSERT(SegmentLength > 0  ||  pDestination == pSegStart);
            ASSERT(SegmentLength
                        <= pCfg->UrlSegmentMaxLength + WCSLEN_LIT(L"/"));

            pSegStart = pOut;
            ++cSegments;

            // Segment count must be in range
            ASSERT(cSegments <= pCfg->UrlSegmentMaxCount);
        }
#endif // DBG

        EMIT_LITERAL_CHAR(Ch, pOut, cbCopied);
    }

    //
    // The loop never emits a terminator; add it here.
    //

    ASSERT((pOut-1)[0] != UNICODE_NULL);

    pOut[0] = UNICODE_NULL;
    *pBytesCopied = cbCopied;

    // Checks on the final segment (these ASSERTs reference DBG-only locals)
    ASSERT(DIFF(pOut - pSegStart) > 0);
    ASSERT(DIFF(pOut - pSegStart)
                <= pCfg->UrlSegmentMaxLength + WCSLEN_LIT(L"/"));
    ASSERT(cSegments < pCfg->UrlSegmentMaxCount);

    return STATUS_SUCCESS;

}   // HttppCopyUrlByType
/***************************************************************************++

Routine Description:

    Slow-path conversion of a URL to Unicode, with canonicalization:

        Unescape
        Convert backslash to forward slash
        Remove double slashes (empty directory names) - e.g. // or \\
        Handle /./
        Handle /../
        Convert to unicode

    The encodings packed into pCfg->AbsPathDecodeOrder are tried one at a
    time, lowest-order field first, by calling HttppCleanAndCopyUrlByType().
    The walk stops as soon as one attempt succeeds.

Arguments:

    pCfg                - configuration; AbsPathDecodeOrder selects which
                          encodings are attempted and in what order
    UrlPart             - which part of the URL pSource holds
    pDestination        - receives the cleaned-up Unicode URL
    pSource             - raw URL bytes
    SourceLength        - length of pSource, in bytes
    pBytesCopied        - forwarded to the worker, which stores the number
                          of bytes written
    ppQueryString       - optional; forwarded to the worker, which stores
                          the position of the query string (if any)
    pUrlEncodingType    - on success, the encoding that worked

Return Value:

    NTSTATUS - STATUS_INVALID_PARAMETER if AbsPathDecodeOrder is zero or has
    bits outside UrlDecode_MaxMask. Otherwise the status of the last
    conversion attempt, or STATUS_UNSUCCESSFUL if no attempt was made.

Note: Any changes to this code may require changes for the fast path code too.
The fast path is HttpCopyUrl.

--***************************************************************************/
NTSTATUS
HttpCleanAndCopyUrl(
    IN  PURL_C14N_CONFIG    pCfg,
    IN  URL_PART            UrlPart,
    OUT PWSTR               pDestination,
    IN  PCUCHAR             pSource,
    IN  ULONG               SourceLength,
    OUT PULONG              pBytesCopied,
    OUT PWSTR *             ppQueryString OPTIONAL,
    OUT PURL_ENCODING_TYPE  pUrlEncodingType
    )
{
    NTSTATUS    Status = STATUS_UNSUCCESSFUL;
    ULONG       RemainingOrder = pCfg->AbsPathDecodeOrder;

    PAGED_CODE();

    ASSERT(NULL != pDestination);
    ASSERT(NULL != pSource);
    ASSERT(NULL != pBytesCopied);
    ASSERT(NULL != pUrlEncodingType);

    // The decode order must be non-empty and confined to the legal bits
    if (0 == RemainingOrder
        || RemainingOrder != (RemainingOrder & UrlDecode_MaxMask))
    {
        UlTraceError(PARSER,
                    ("http!HttpCleanAndCopyUrl: invalid DecodeOrder, 0x%lX\n",
                     RemainingOrder
                     ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    // Peel one encoding at a time off the low end of the decode order
    // until one of them converts the URL or there is nothing left to try.
    while (0 != RemainingOrder  &&  !NT_SUCCESS(Status))
    {
        ULONG UrlEncoding = (RemainingOrder & UrlDecode_Mask);

        RemainingOrder >>= UrlDecode_Shift;

        // An empty slot: nothing to attempt
        if (UrlDecode_None == UrlEncoding)
        {
            continue;
        }

        // Anything other than the three known encodings is a logic error
        if (UrlDecode_Ansi != UrlEncoding
            &&  UrlDecode_Dbcs != UrlEncoding
            &&  UrlDecode_Utf8 != UrlEncoding)
        {
            ASSERT(! "Impossible UrlDecodeOrder");
            continue;
        }

        UlTraceVerbose(PARSER,
                        ("http!HttpCleanAndCopyUrl(%s, Src=%p, %lu)\n",
                         HttppUrlEncodingToString(UrlEncoding),
                         pSource, SourceLength
                         ));

        Status = HttppCleanAndCopyUrlByType(
                    pCfg,
                    (URL_ENCODING_TYPE) UrlEncoding,
                    UrlPart,
                    pDestination,
                    pSource,
                    SourceLength,
                    pBytesCopied,
                    ppQueryString
                    );

        if (!NT_SUCCESS(Status))
        {
            continue;
        }

        // Tell the caller which encoding did the job
        *pUrlEncodingType = (URL_ENCODING_TYPE) UrlEncoding;

        UlTraceVerbose(PARSER,
                        ("http!HttpCleanAndCopyUrl(%s): "
                         "(%lu) '%.*s' -> (%lu) '%ls'\n",
                         HttppUrlEncodingToString(UrlEncoding),
                         SourceLength, SourceLength, pSource,
                         *pBytesCopied/sizeof(WCHAR), pDestination
                         ));
    }

    return Status;

}   // HttpCleanAndCopyUrl
//
// HttppCleanAndCopyUrlByType() uses StateFromStateAndToken[][] and
// ActionFromStateAndToken[][] to handle "//", "/./", and "/../" productions.
//
// TOK_STATE builds one row of a [state][token] transition table. Each of
// `other`, `dot`, `eos`, and `slash` is the suffix of the URL_STATE_xxx
// constant to move to when that token is seen, in that column order.
// The `state` argument does not appear in the expansion; it only labels
// the row for the reader.
//
#define TOK_STATE(state, other, dot, eos, slash) \
{ \
URL_STATE_ ## other, \
URL_STATE_ ## dot, \
URL_STATE_ ## eos, \
URL_STATE_ ## slash \
}
//
// CanonStateFromStateAndToken[][] is used by HttpParseUrl() to reject
// "//", "/./", and "/../" sequences, as these URLs are supposed to
// be in canonical form already.
//
// Rows are indexed by the current state, columns by the token just seen.
// It differs from StateFromStateAndToken[][] only in the entries that are
// ERROR here: a '/' seen in SLASH, SLASH_DOT, or SLASH_DOT_DOT, and EOS
// seen in SLASH_DOT_DOT.
//
const URL_STATE
CanonStateFromStateAndToken[URL_STATE_MAX][URL_TOKEN_MAX] =
{
// State \ Token: Other '.' EOS '/'
TOK_STATE( START, START, START, END, SLASH),
TOK_STATE( SLASH, START, SLASH_DOT, END, ERROR),
TOK_STATE( SLASH_DOT, START, SLASH_DOT_DOT, END, ERROR),
TOK_STATE( SLASH_DOT_DOT, START, START, ERROR, ERROR),
TOK_STATE( END, END, END, END, END),
TOK_STATE( ERROR, ERROR, ERROR, ERROR, ERROR)
};
//
// StateFromStateAndToken[][] says which new state to transition to given
// the current state and the token we saw. Used by HttppCleanAndCopyUrlByType()
//
// Rows are indexed by the current state, columns by the token just seen.
// No transition out of START, SLASH, SLASH_DOT, SLASH_DOT_DOT, or END
// leads to ERROR; END and ERROR are absorbing.
//
const URL_STATE
StateFromStateAndToken[URL_STATE_MAX][URL_TOKEN_MAX] =
{
// State \ Token: Other '.' EOS '/'
TOK_STATE( START, START, START, END, SLASH),
TOK_STATE( SLASH, START, SLASH_DOT, END, SLASH),
TOK_STATE( SLASH_DOT, START, SLASH_DOT_DOT, END, SLASH),
TOK_STATE( SLASH_DOT_DOT, START, START, END, SLASH),
TOK_STATE( END, END, END, END, END),
TOK_STATE( ERROR, ERROR, ERROR, ERROR, ERROR)
};
//
// ActionFromStateAndToken[][] says what action to perform based on
// the current state and the current token
//
// NEW_ACTION builds one row of that table. Each of `other`, `dot`, `eos`,
// and `slash` is the suffix of the ACTION_xxx constant to perform when that
// token is seen, in that column order. The `state` argument does not appear
// in the expansion; it only labels the row for the reader.
//
#define NEW_ACTION(state, other, dot, eos, slash) \
{ \
ACTION_ ## other, \
ACTION_ ## dot, \
ACTION_ ## eos, \
ACTION_ ## slash \
}
const URL_ACTION
ActionFromStateAndToken[URL_STATE_MAX][URL_TOKEN_MAX] =
{
// State \ Token: Other '.' EOS '/'
NEW_ACTION(START, EMIT_CH, EMIT_CH, NOTHING, EMIT_CH),
NEW_ACTION(SLASH, EMIT_CH, NOTHING, NOTHING, NOTHING),
NEW_ACTION(SLASH_DOT, EMIT_DOT_CH, NOTHING, NOTHING, NOTHING),
NEW_ACTION(SLASH_DOT_DOT, EMIT_DOT_DOT_CH,
EMIT_DOT_DOT_CH, BACKUP, BACKUP),
NEW_ACTION(END, NOTHING, NOTHING, NOTHING, NOTHING)
};
#if DBG
//
// Debug helper: map a URL_ACTION to a short printable name for tracing.
// Asserts and returns "ACTION_???" for a value outside the enumeration.
//
PCSTR
HttppUrlActionToString(
    URL_ACTION Action)
{
    PCSTR pName;

    switch (Action)
    {
    case ACTION_NOTHING:
        pName = "NOTHING";
        break;

    case ACTION_EMIT_CH:
        pName = "EMIT_CH";
        break;

    case ACTION_EMIT_DOT_CH:
        pName = "EMIT_DOT_CH";
        break;

    case ACTION_EMIT_DOT_DOT_CH:
        pName = "EMIT_DOT_DOT_CH";
        break;

    case ACTION_BACKUP:
        pName = "BACKUP";
        break;

    case ACTION_MAX:
        pName = "MAX";
        break;

    default:
        ASSERT(! "Invalid URL_ACTION");
        pName = "ACTION_???";
        break;
    }

    return pName;

}   // HttppUrlActionToString
//
// Debug helper: map a URL_STATE to a short printable name for tracing.
// Asserts and returns "URL_STATE_???" for a value outside the enumeration.
//
PCSTR
HttppUrlStateToString(
    URL_STATE UrlState)
{
    PCSTR pName;

    switch (UrlState)
    {
    case URL_STATE_START:
        pName = "START";
        break;

    case URL_STATE_SLASH:
        pName = "SLASH";
        break;

    case URL_STATE_SLASH_DOT:
        pName = "SLASH_DOT";
        break;

    case URL_STATE_SLASH_DOT_DOT:
        pName = "SLASH_DOT_DOT";
        break;

    case URL_STATE_END:
        pName = "END";
        break;

    case URL_STATE_ERROR:
        pName = "ERROR";
        break;

    case URL_STATE_MAX:
        pName = "MAX";
        break;

    default:
        ASSERT(! "Invalid URL_STATE");
        pName = "URL_STATE_???";
        break;
    }

    return pName;

}   // HttppUrlStateToString
//
// Debug helper: map a URL_STATE_TOKEN to a short printable name for tracing.
// Asserts and returns "URL_TOKEN_???" for a value outside the enumeration.
//
PCSTR
HttppUrlTokenToString(
    URL_STATE_TOKEN UrlToken)
{
    PCSTR pName;

    switch (UrlToken)
    {
    case URL_TOKEN_OTHER:
        pName = "OTHER";
        break;

    case URL_TOKEN_DOT:
        pName = "DOT";
        break;

    case URL_TOKEN_EOS:
        pName = "EOS";
        break;

    case URL_TOKEN_SLASH:
        pName = "SLASH";
        break;

    case URL_TOKEN_MAX:
        pName = "MAX";
        break;

    default:
        ASSERT(! "Invalid URL_STATE_TOKEN");
        pName = "URL_TOKEN_???";
        break;
    }

    return pName;

}   // HttppUrlTokenToString
#endif // DBG
//
// Map an HTTP_URL_SITE_TYPE to a short printable name.
// Asserts and returns "????" for a value outside the enumeration.
//
PCSTR
HttpSiteTypeToString(
    HTTP_URL_SITE_TYPE SiteType
    )
{
    PCSTR pName;

    switch (SiteType)
    {
    case HttpUrlSite_None:
        pName = "None";
        break;

    case HttpUrlSite_Name:
        pName = "Name";
        break;

    case HttpUrlSite_IP:
        pName = "IP";
        break;

    case HttpUrlSite_NamePlusIP:
        pName = "Name+IP";
        break;

    case HttpUrlSite_WeakWildcard:
        pName = "Weak";
        break;

    case HttpUrlSite_StrongWildcard:
        pName = "Strong";
        break;

    case HttpUrlSite_Max:
        pName = "Max";
        break;

    default:
        ASSERT(! "Invalid HTTP_URL_SITE_TYPE");
        pName = "????";
        break;
    }

    return pName;
}
/***************************************************************************++
Routine Description:
This function can be told to clean up UTF-8, ANSI, or DBCS URLs.
Unescape
Convert backslash to forward slash
Remove double slashes (empty directory names) - e.g. // or \\
Handle /./
Handle /../
Convert to unicode

The source is consumed one character at a time. While inside the abspath,
each character is classified as a token (other, '.', '/', end-of-string) and
fed through ActionFromStateAndToken[][] / StateFromStateAndToken[][], which
decide what (if anything) to emit and whether to back up over the previous
segment. The first '?' ends canonicalization: it is treated as end-of-string
for the state machine, emitted literally, and everything after it is decoded
with HttppPopCharQueryString and copied without further "/./" or "/../"
processing.

Arguments:
pCfg            - configuration. This routine reads PercentUAllowed,
                  AllowRestrictedChars, UrlSegmentMaxLength,
                  UrlSegmentMaxCount, and UrlMaxLength.
UrlEncoding     - UTF-8, ANSI, or DBCS; selects the pop-char routine used
                  for the abspath.
UrlPart         - asserted to be UrlPart_AbsPath on entry.
pDestination    - receives the zero-terminated Unicode URL. No destination
                  size is passed in, so presumably the caller sizes the
                  buffer from SourceLength - TODO confirm against callers.
pSource         - raw URL bytes; asserted to begin with '/'.
SourceLength    - length of pSource, in bytes.
pBytesCopied    - receives the running byte count maintained by the EMIT
                  macros (the terminator is written without touching it).
                  It is stored before the UrlMaxLength check, so it is also
                  set when the URL is rejected as too long. It is not
                  written on the other failure paths.
ppQueryString   - optional. On success, receives the position of the '?'
                  within pDestination, or NULL if there was no query string.

Return Value:
NTSTATUS - Completion status.
STATUS_SUCCESS                  - URL cleaned and copied
STATUS_INVALID_PARAMETER        - unknown UrlEncoding
STATUS_INVALID_DEVICE_REQUEST   - a segment is too long, there are too many
                                  segments, or the result exceeds UrlMaxLength
STATUS_OBJECT_PATH_INVALID      - "/../" would back up past the root
STATUS_OBJECT_PATH_SYNTAX_BAD   - unexpected action code in the state table
Any failure status returned by the pop-char routine is passed through.

Note: Any changes to this code may require changes for the fast path code too.
The fast path is HttppCopyUrlByType.
--***************************************************************************/
NTSTATUS
HttppCleanAndCopyUrlByType(
IN PURL_C14N_CONFIG pCfg,
IN URL_ENCODING_TYPE UrlEncoding,
IN URL_PART UrlPart,
OUT PWSTR pDestination,
IN PCUCHAR pSource,
IN ULONG SourceLength,
OUT PULONG pBytesCopied,
OUT PWSTR * ppQueryString OPTIONAL
)
{
NTSTATUS Status;
PWSTR pDest;
PCUCHAR pChar;
ULONG CharToSkip;
ULONG BytesCopied;
PWSTR pQueryString;
URL_STATE UrlState = URL_STATE_START;
URL_STATE_TOKEN UrlToken = URL_TOKEN_OTHER;
URL_ACTION Action = ACTION_NOTHING;
ULONG UnicodeChar;
BOOLEAN MakeCanonical;
PWCHAR pFastPopChar;
PFN_POPCHAR_ABSPATH pfnPopChar;
// pSegment marks where the current path segment starts in the output
PWSTR pSegment = pDestination;
ULONG SegmentCount = 0;
// TestSegment is set whenever the current token closes a segment
BOOLEAN TestSegment = FALSE;
#if DBG
ULONG OriginalSourceLength = SourceLength;
#endif
//
// Sanity check.
//
PAGED_CODE();
ASSERT(UrlPart_AbsPath == UrlPart);
// Choose the decoder that matches the requested source encoding
if (UrlEncoding_Ansi == UrlEncoding)
pfnPopChar = &HttppPopCharAbsPathAnsi;
else if (UrlEncoding_Dbcs == UrlEncoding)
pfnPopChar = &HttppPopCharAbsPathDbcs;
else if (UrlEncoding_Utf8 == UrlEncoding)
pfnPopChar = &HttppPopCharAbsPathUtf8;
else
{
ASSERT(! "Invalid UrlEncoding");
RETURN(STATUS_INVALID_PARAMETER);
}
ASSERT(FORWARD_SLASH == *pSource);
pDest = pDestination;
pQueryString = NULL;
BytesCopied = 0;
pChar = pSource;
CharToSkip = 0;
// NOTE(review): 0 is presumably URL_STATE_START, which UrlState was already
// initialized to above - confirm against the URL_STATE enum.
UrlState = 0;
MakeCanonical = (BOOLEAN) (UrlPart == UrlPart_AbsPath);
// The fast lookup table is only used for a UTF-8 abspath. DummyPopChars
// presumably has all-zero entries so that every character is routed
// through pfnPopChar - TODO confirm against its definition.
if (UrlEncoding == UrlEncoding_Utf8 && UrlPart != UrlPart_QueryString)
{
pFastPopChar = FastPopChars;
}
else
{
pFastPopChar = DummyPopChars;
}
while (SourceLength > 0)
{
//
// advance ! it's at the top of the loop to enable ANSI_NULL to
// come through ONCE
//
ASSERT(CharToSkip <= SourceLength);
pChar += CharToSkip;
SourceLength -= CharToSkip;
//
// well? have we hit the end?
//
if (SourceLength == 0)
{
// Synthesize one end-of-string character so the state machine gets to
// see EOS; the loop condition then ends the loop after this iteration.
UnicodeChar = UNICODE_NULL;
CharToSkip = 1;
}
else
{
//
// Nope. Peek briefly to see if we hit the query string
//
if (UrlPart == UrlPart_AbsPath && pChar[0] == QUESTION_MARK)
{
ASSERT(pQueryString == NULL);
//
// remember its location
//
pQueryString = pDest;
//
// let it fall through ONCE to the canonical
// in order to handle a trailing "/.." like
// "http://foobar:80/foo/bar/..?v=1&v2"
//
TestSegment = TRUE;
UnicodeChar = QUESTION_MARK;
CharToSkip = 1;
//
// now we are cleaning the query string
//
UrlPart = UrlPart_QueryString;
UlTraceVerbose(PARSER, ("QueryString @ %p\n", pQueryString));
//
// cannot use fast path for PopChar anymore
//
pFastPopChar = DummyPopChars;
pfnPopChar = HttppPopCharQueryString;
}
else
{
ULONG NextUnicodeChar = pFastPopChar[*pChar];
//
// Grab the next character. Try to be fast for the
// normal character case. Otherwise call PopChar.
//
if (NextUnicodeChar == 0)
{
// No fast-table entry: decode with the encoding-specific routine
Status = (*pfnPopChar)(
pChar,
SourceLength,
pCfg->PercentUAllowed,
pCfg->AllowRestrictedChars,
&UnicodeChar,
&CharToSkip
);
if (NT_SUCCESS(Status) == FALSE)
goto end;
}
else
{
#if DBG
// Checked builds verify that the slow decoder agrees with the fast table
Status = (*pfnPopChar)(
pChar,
SourceLength,
pCfg->PercentUAllowed,
pCfg->AllowRestrictedChars,
&UnicodeChar,
&CharToSkip
);
ASSERT(NT_SUCCESS(Status));
ASSERT(UnicodeChar == NextUnicodeChar);
ASSERT(CharToSkip == 1);
#endif // DBG
UnicodeChar = (WCHAR) NextUnicodeChar;
CharToSkip = 1;
}
}
}
if (!MakeCanonical)
{
// Past the '?': only EOS and OTHER tokens are generated, so the state
// machine stays in START (or moves to END) and every character is emitted.
UrlToken = (UnicodeChar == UNICODE_NULL)
? URL_TOKEN_EOS
: URL_TOKEN_OTHER;
TestSegment = FALSE;
}
else
{
//
// now use the state machine to make it canonical.
//
//
// did we just hit the query string? this will only happen once
// that we take this branch after hitting it, as we stop
// processing after hitting it.
//
if (UrlPart == UrlPart_QueryString)
{
//
// treat this just like we hit a NULL, EOS.
//
ASSERT(QUESTION_MARK == UnicodeChar);
UrlToken = URL_TOKEN_EOS;
TestSegment = TRUE;
}
else
{
//
// otherwise based the new state off of the char we
// just popped.
//
switch (UnicodeChar)
{
case UNICODE_NULL:
UrlToken = URL_TOKEN_EOS;
TestSegment = TRUE;
break;
case DOT:
UrlToken = URL_TOKEN_DOT;
TestSegment = FALSE;
break;
case FORWARD_SLASH:
UrlToken = URL_TOKEN_SLASH;
TestSegment = TRUE;
break;
default:
UrlToken = URL_TOKEN_OTHER;
TestSegment = FALSE;
break;
}
}
}
Action = ActionFromStateAndToken[UrlState][UrlToken];
// Verbose tracing only: dump the raw source bytes of this character in
// hex and in printable form, alongside the state transition being taken.
IF_DEBUG2BOTH(PARSER, VERBOSE)
{
ULONG i;
UCHAR HexBuff[5*12 + 10];
PUCHAR p = HexBuff;
UCHAR Byte;
ASSERT(CharToSkip <= 4 * STRLEN_LIT("%NN"));
// Generate something like
// "[25 65 32 25 38 30 25 39 35] '%e2%80%95'"
*p++ = '[';
for (i = 0; i < CharToSkip; ++i)
{
const static char hexArray[] = "0123456789ABCDEF";
Byte = pChar[i];
*p++ = hexArray[Byte >> 4];
*p++ = hexArray[Byte & 0xf];
*p++ = ' ';
}
p[-1] = ']'; // overwrite last ' '
*p++ = ' ';
*p++ = '\'';
for (i = 0; i < CharToSkip; ++i)
{
Byte = pChar[i];
*p++ = (IS_HTTP_PRINT(Byte) ? Byte : '?');
}
*p++ = '\'';
*p++ = '\0';
ASSERT(DIFF(p - HexBuff) <= DIMENSION(HexBuff));
UlTrace(PARSER,
("http!HttppCleanAndCopyUrlByType(%s): "
"(%lu) %s -> U+%04lX '%c': "
"[%s][%s] -> %s, %s%s\n",
HttppUrlEncodingToString(UrlEncoding),
CharToSkip, HexBuff,
UnicodeChar,
IS_ANSI(UnicodeChar) && IS_HTTP_PRINT(UnicodeChar)
? (UCHAR) UnicodeChar : '?',
HttppUrlStateToString(UrlState),
HttppUrlTokenToString(UrlToken),
HttppUrlStateToString(
StateFromStateAndToken[UrlState][UrlToken]),
HttppUrlActionToString(Action),
TestSegment ? ", TestSegment" : ""
));
} // IF_DEBUG2BOTH(PARSER, VERBOSE)
//
// Segment length and segment count checks
//
if (TestSegment)
{
// Length of the segment just closed, measured in the output buffer
// (i.e. after decoding), including its leading '/'
ULONG SegmentLength = DIFF(pDest - pSegment);
ASSERT(pSegment <= pDest);
UlTraceVerbose(PARSER,
("http!HttppCleanAndCopyUrlByType: "
"Segment[%lu] %p (%lu) = '%.*ls'\n",
SegmentCount, pSegment, SegmentLength,
SegmentLength, pSegment
));
// Reject if segment too long
if (SegmentLength > pCfg->UrlSegmentMaxLength + WCSLEN_LIT(L"/"))
{
UlTraceError(PARSER, (
"http!HttppCleanAndCopyUrlByType: "
"Segment too long: %lu\n",
SegmentLength
));
RETURN(STATUS_INVALID_DEVICE_REQUEST);
}
pSegment = pDest;
// Reject if too many path segments
// (tokens whose action is NOTHING are not counted as a new segment)
if (Action != ACTION_NOTHING)
{
if (pSegment == pDestination)
{
SegmentCount = 0;
}
else if (++SegmentCount > pCfg->UrlSegmentMaxCount)
{
UlTraceError(PARSER, (
"http!HttppCleanAndCopyUrlByType: "
"Too many segments: %lu\n",
SegmentCount
));
RETURN(STATUS_INVALID_DEVICE_REQUEST);
}
}
}
//
// Perform the action associated with the state.
//
switch (Action)
{
// The three EMIT cases deliberately cascade: first the dots that were
// held back while in SLASH_DOT / SLASH_DOT_DOT, then the current character.
case ACTION_EMIT_DOT_DOT_CH:
EMIT_LITERAL_CHAR(DOT, pDest, BytesCopied);
// fall through
case ACTION_EMIT_DOT_CH:
EMIT_LITERAL_CHAR(DOT, pDest, BytesCopied);
// fall through
case ACTION_EMIT_CH:
EMIT_CHAR(
UnicodeChar,
pDest,
BytesCopied,
Status,
pCfg->AllowRestrictedChars
);
// fall through
case ACTION_NOTHING:
break;
case ACTION_BACKUP:
//
// pDest currently points 1 past the last '/'. backup over it and
// find the preceding '/', set pDest to 1 past that one.
//
//
// backup to the '/'
//
pDest -= 1;
BytesCopied -= sizeof(WCHAR);
ASSERT(pDest[0] == FORWARD_SLASH);
//
// are we at the start of the string? that's bad, can't go back!
//
if (pDest == pDestination)
{
ASSERT(BytesCopied == 0);
UlTraceError(PARSER, (
"http!HttppCleanAndCopyUrl: "
"Can't back up for \"/../\"\n"
));
Status = STATUS_OBJECT_PATH_INVALID;
goto end;
}
//
// back up over the '/'
//
pDest -= 1;
BytesCopied -= sizeof(WCHAR);
ASSERT(pDest > pDestination);
//
// now find the previous slash
//
while (pDest > pDestination && pDest[0] != FORWARD_SLASH)
{
pDest -= 1;
BytesCopied -= sizeof(WCHAR);
}
//
// Adjust segment trackers downwards
//
pSegment = pDest;
if (pSegment == pDestination)
SegmentCount = 0;
else
--SegmentCount;
//
// we already have a slash, so don't have to store one.
//
ASSERT(pDest[0] == FORWARD_SLASH);
//
// simply skip it, as if we had emitted it just now
//
pDest += 1;
BytesCopied += sizeof(WCHAR);
break;
default:
ASSERT(!"http!HttppCleanAndCopyUrl: "
"Invalid action code in state table!");
Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
goto end;
}
//
// Just hit the query string ?
//
if (MakeCanonical && UrlPart == UrlPart_QueryString)
{
//
// Stop canonical processing
//
MakeCanonical = FALSE;
//
// Need to emit the '?', it wasn't emitted above
//
ASSERT(ActionFromStateAndToken[UrlState][UrlToken]
!= ACTION_EMIT_CH);
//
// remember its location (in case we backed up)
//
pQueryString = pDest;
EMIT_LITERAL_CHAR(QUESTION_MARK, pDest, BytesCopied);
// reset
// (START + OTHER maps back to START in the table lookup below, so the
// remainder of the URL is emitted character by character)
UrlToken = URL_TOKEN_OTHER;
UrlState = URL_STATE_START;
}
// update the URL state
UrlState = StateFromStateAndToken[UrlState][UrlToken];
ASSERT(URL_STATE_ERROR != UrlState);
}
//
// terminate the string, it hasn't been done in the loop
//
ASSERT((pDest-1)[0] != UNICODE_NULL);
pDest[0] = UNICODE_NULL;
*pBytesCopied = BytesCopied;
// The overall length limit is enforced on the cleaned-up result, after
// unescaping and "/./" and "/../" removal have been applied
if (BytesCopied > pCfg->UrlMaxLength * sizeof(WCHAR))
{
UlTraceError(PARSER, (
"http!HttppCleanAndCopyUrlByType: "
"URL too long: %lu\n",
BytesCopied
));
RETURN(STATUS_INVALID_DEVICE_REQUEST);
}
if (ppQueryString != NULL)
{
*ppQueryString = pQueryString;
}
UlTraceVerbose(PARSER,
("http!HttppCleanAndCopyUrlByType: "
"(%lu) '%.*s' -> (%lu) '%.*ls', %squerystring\n",
OriginalSourceLength,
OriginalSourceLength, pSource,
BytesCopied/sizeof(WCHAR),
BytesCopied/sizeof(WCHAR), pDestination,
pQueryString != NULL ? "" : "no "
));
Status = STATUS_SUCCESS;
end:
return Status;
} // HttppCleanAndCopyUrlByType
/*++

Routine Description:

    Locate the URL token in a request buffer. Leading LWS is skipped, then
    the token is scanned up to the first whitespace character
    (IS_HTTP_WS_TOKEN). While scanning, the URL is classified as "clean"
    (eligible for the fast-path copy) or not.

Arguments:

    pCfg            - configuration; supplies UrlSegmentMaxLength,
                      UrlSegmentMaxCount, and UrlMaxLength

    pBuffer         - Buffer to search for token.

    BufferLength    - Length of data pointed to by pBuffer.

    ppTokenStart    - Where to return the start of the token, if we locate
                      its delimiter. NULL otherwise.

    pTokenLength    - Where to return the length of the token (zero if no
                      complete token was found).

    pRawUrlClean    - Where to return the cleanliness of the URL

Return Value:

    STATUS_SUCCESS if no parsing errors in the URL. This includes the cases
    where the buffer holds nothing but LWS, or ends before the token's
    delimiter is seen; *ppTokenStart is NULL in both.

    STATUS_INVALID_DEVICE_REQUEST if the token contains a control character
    or is empty.

--*/
NTSTATUS
HttpFindUrlToken(
    IN  PURL_C14N_CONFIG    pCfg,
    IN  PCUCHAR             pBuffer,
    IN  ULONG               BufferLength,
    OUT PUCHAR*             ppTokenStart,
    OUT PULONG              pTokenLength,
    OUT PBOOLEAN            pRawUrlClean
    )
{
    PCUCHAR pFirst;
    PCUCHAR pSegStart;
    UCHAR   Ch;
    UCHAR   PrevCh = ANSI_NULL;
    ULONG   cSegments = 0;
    ULONG   cchToken;

    //
    // Sanity check.
    //

    PAGED_CODE();

    ASSERT(NULL != pBuffer);
    ASSERT(NULL != ppTokenStart);
    ASSERT(NULL != pTokenLength);
    ASSERT(NULL != pRawUrlClean);

    //
    // Start out optimistic: clean URL, no token found yet
    //

    *pRawUrlClean = TRUE;
    *ppTokenStart = NULL;
    *pTokenLength = 0;

    //
    // Step over any leading LWS.
    //

    for ( ;
         BufferLength > 0  &&  IS_HTTP_LWS(*pBuffer);
         ++pBuffer, --BufferLength
        )
    {
        // nothing
    }

    // Nothing but LWS in the buffer? Then there is no token to report.
    if (0 == BufferLength)
    {
        return STATUS_SUCCESS;
    }

    pFirst = pBuffer;

    // This will usually point to a '/', but it won't if this is an AbsURI.
    // It doesn't really matter, since only a few borderline cases will
    // be marked as dirty that might not otherwise be.
    pSegStart = pBuffer;

    //
    // Walk the token until whitespace (LWS, CR, or LF) is seen.
    //

    for ( ;
         0 != BufferLength;
         PrevCh = Ch, ++pBuffer, --BufferLength
        )
    {
        Ch = *pBuffer;

        // WS [ \t\r\n] must be tested before CTL, because \t, \r, and \n
        // are themselves CTL chars!
        if (IS_HTTP_WS_TOKEN(Ch))
        {
            break;
        }

        if (IS_HTTP_CTL(Ch))
        {
            *pRawUrlClean = FALSE;
            *ppTokenStart = NULL;

            UlTraceError(PARSER, (
                        "http!HttpFindUrlToken: "
                        "Found control char: %02X\n",
                        Ch
                        ));

            RETURN(STATUS_INVALID_DEVICE_REQUEST);
        }

        //
        // URL is NOT clean if it contains any of the following patterns
        //
        //  a. back slash                                   "\"
        //  b. dot, forward slash | forward slash, forward slash "./" | "//"
        //  c. forward slash, dot | dot, dot                "/." | ".."
        //  d. question mark (querystring)                  "?"
        //  e. percent (hex escape)                         "%"
        //  f. raw bytes with high bit set, >= 0x80
        //
        // These are conservative estimates of "Clean"; some clean URLs may
        // not be marked as clean. For such URLs, we'll skip the fast path
        // but at no loss of functionality.
        //

        if (!IS_URL_DIRTY(Ch))
        {
            continue;
        }

        // No point re-examining a URL already known to be dirty
        if (*pRawUrlClean)
        {
            BOOLEAN CurIsSlashOrDot
                = (BOOLEAN) (Ch == FORWARD_SLASH  ||  Ch == DOT);
            BOOLEAN PrevIsSlashOrDot
                = (BOOLEAN) (PrevCh == FORWARD_SLASH  ||  PrevCh == DOT);

            // A lone '/' or '.' is harmless; two of them in a row, or any
            // other dirty character, is not.
            if (!CurIsSlashOrDot  ||  PrevIsSlashOrDot)
            {
                *pRawUrlClean = FALSE;
            }
        }

        if (Ch == FORWARD_SLASH)
        {
            ULONG SegmentLength = DIFF(pBuffer - pSegStart);

            // If the segment contains %-hex-escaped chars, it may become
            // acceptably short after PopChar() processing. Let
            // HttppCleanAndCopyUrlByType() figure it out.
            if (SegmentLength > pCfg->UrlSegmentMaxLength)
            {
                *pRawUrlClean = FALSE;
            }

            pSegStart = pBuffer;

            // If this is an AbsURI, instead of an AbsPath, the
            // segment count will be higher, because of the two slashes
            // before the hostname. Also, "/../", "/./", and "//"
            // minimization will reduce the final count of segments.
            // Again, let HttppCleanAndCopyUrlByType() figure it out.
            if (++cSegments > pCfg->UrlSegmentMaxCount)
            {
                *pRawUrlClean = FALSE;
            }
        }
    }

    // Why did the scan stop?
    if (0 == BufferLength)
    {
        // The buffer ended before the token did.
        *pRawUrlClean = FALSE;

        return STATUS_SUCCESS;
    }

    ASSERT(IS_HTTP_WS_TOKEN(*pBuffer));

    cchToken = DIFF(pBuffer - pFirst);

    if (0 == cchToken)
    {
        UlTraceError(PARSER, ("http!HttpFindUrlToken: Found empty token\n"));

        RETURN(STATUS_INVALID_DEVICE_REQUEST);
    }

    // Apply the same limits to the final segment and to the whole token
    if (DIFF(pBuffer - pSegStart) > pCfg->UrlSegmentMaxLength)
    {
        *pRawUrlClean = FALSE;
    }

    if (++cSegments > pCfg->UrlSegmentMaxCount)
    {
        *pRawUrlClean = FALSE;
    }

    if (cchToken > pCfg->UrlMaxLength)
    {
        *pRawUrlClean = FALSE;
    }

    // Success! Report the token's length and where it starts.
    *pTokenLength = cchToken;
    *ppTokenStart = (PUCHAR) pFirst;

    return STATUS_SUCCESS;

}   // HttpFindUrlToken
/*++

Routine Description:

    Parse an IPv6 address from a Unicode buffer. Must be delimited by [].
    May contain a scope ID.

    The buffer is first pre-scanned so that only hex digits, ':' and '.'
    appear between '[' and the first ']' or '%'. The address itself is then
    parsed by RtlIpv6StringToAddressW(), which must stop exactly at that
    ']' or '%'. An optional "%<decimal scope id>" may follow the address.

Arguments:

    pBuffer         - Buffer to parse. Must point to '['.

    BufferLength    - Length of data pointed to by pBuffer, in WCHARs.

    ScopeIdAllowed  - if TRUE, an optional scope ID may be present

    pSockAddr6      - Where to return the parsed IPv6 address. It is zeroed
                      and its family set on entry, so its contents are not
                      meaningful if the routine fails.

    ppEnd           - On success, points to character after ']'.
                      NULL on failure.

Return Value:

    STATUS_SUCCESS if no parsing errors in the IPv6 address.
    STATUS_INVALID_PARAMETER for a malformed literal or scope ID; a failure
    status from RtlIpv6StringToAddressW() is returned unchanged.

--*/
NTSTATUS
HttppParseIPv6Address(
    IN  PCWSTR          pBuffer,
    IN  ULONG           BufferLength,
    IN  BOOLEAN         ScopeIdAllowed,
    OUT PSOCKADDR_IN6   pSockAddr6,
    OUT PCWSTR*         ppEnd
    )
{
    NTSTATUS    Status;
    PCWSTR      pEnd = pBuffer + BufferLength;
    PCWSTR      pChar;
    PWSTR       pTerminator;
    ULONG       ScopeTemp;

    // This routine is placed in the PAGE section by the alloc_text pragmas
    // at the top of the file, so check the IRQL like the other pageable
    // routines here do.
    PAGED_CODE();

    ASSERT(NULL != pBuffer);
    ASSERT(0 < BufferLength);
    ASSERT(NULL != pSockAddr6);
    ASSERT(NULL != ppEnd);

    RtlZeroMemory(pSockAddr6, sizeof(*pSockAddr6));
    *ppEnd = NULL;

    pSockAddr6->sin6_family = TDI_ADDRESS_TYPE_IP6;

    // Caller guarantees this
    ASSERT(L'[' == *pBuffer);

    // Empty brackets?
    // (the length test comes first so that pBuffer[1] is never read from a
    // one-character buffer)
    if (BufferLength < WCSLEN_LIT(L"[0]")  ||  L']' == pBuffer[1])
    {
        UlTraceError(PARSER,
                    ("http!HttppParseIPv6Address: IPv6 address too short\n"
                     ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    // Pre-scan: find the ']' or '%' that ends the address, rejecting any
    // character that cannot be part of an IPv6 literal along the way.
    for (pChar = pBuffer + WCSLEN_LIT(L"[");  pChar < pEnd;  ++pChar)
    {
        if (IS_ASCII(*pChar))
        {
            if (L']' == *pChar  ||  L'%' == *pChar)
                break;

            // Dots are allowed because the last 32 bits may be represented
            // in IPv4 dotted-octet notation
            if (IS_HTTP_HEX(*pChar)  ||  L':' == *pChar  ||  L'.' == *pChar)
                continue;
        }

        UlTraceError(PARSER,
                    ("http!HttppParseIPv6Address: "
                     "Invalid char in IPv6 address, U+%04X '%c', "
                     "after %lu chars, '%.*ls'\n",
                     *pChar,
                     IS_ANSI(*pChar) && IS_HTTP_PRINT(*pChar) ? *pChar : '?',
                     DIFF(pChar - pBuffer),
                     DIFF(pChar - pBuffer),
                     pBuffer
                     ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    if (pChar == pEnd)
    {
        UlTraceError(PARSER,
                    ("http!HttppParseIPv6Address: No ']' for IPv6 address\n"
                     ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    ASSERT(pChar < pEnd);
    ASSERT(L']' == *pChar  ||  L'%' == *pChar);

    // Let the RTL routine do the hard work of parsing IPv6 addrs
    Status = RtlIpv6StringToAddressW(
                    pBuffer + WCSLEN_LIT(L"["),
                    &pTerminator,
                    &pSockAddr6->sin6_addr
                    );

    if (! NT_SUCCESS(Status))
    {
        UlTraceError(PARSER,
                    ("http!HttppParseIPv6Address: "
                     "Invalid IPv6 address, %s\n",
                     HttpStatusToString(Status)
                     ));

        RETURN(Status);
    }

    // The RTL parser must have consumed everything up to the delimiter
    // located by the pre-scan
    if (pTerminator != pChar)
    {
        UlTraceError(PARSER,
                    ("http!HttppParseIPv6Address: "
                     "Invalid IPv6 terminator, U+%04X, '%c'\n",
                     *pTerminator,
                     IS_ANSI(*pTerminator) && IS_HTTP_PRINT(*pTerminator)
                        ? *pTerminator
                        : '?'
                     ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    // Is a scopeid present?
    if (L'%' != *pChar)
    {
        ASSERT(L']' == *pChar);
        pSockAddr6->sin6_scope_id = 0;
    }
    else
    {
        PCWSTR pScopeEnd;

        // Skip the '%' denoting a scope ID
        pChar += WCSLEN_LIT(L"%");

        if (!ScopeIdAllowed)
        {
            UlTraceError(PARSER,
                        ("http!HttppParseIPv6Address: No scope ID allowed\n"
                         ));

            RETURN(STATUS_INVALID_PARAMETER);
        }

        if (pChar == pEnd)
        {
            UlTraceError(PARSER,
                        ("http!HttppParseIPv6Address: "
                         "No IPv6 scope ID after '%%'\n"
                         ));

            RETURN(STATUS_INVALID_PARAMETER);
        }

        // The scope ID must be one or more decimal digits followed by ']'.
        // Using do-while means an immediate ']' ("%]") is rejected as an
        // invalid digit rather than accepted as an empty scope ID.
        pScopeEnd = pChar;

        do
        {
            if (*pScopeEnd < L'0'  ||  *pScopeEnd > L'9')
            {
                UlTraceError(PARSER,
                            ("http!HttppParseIPv6Address: "
                             "Invalid digit in IPv6 scope ID, "
                             "U+%04X, '%c'\n",
                             *pScopeEnd,
                             IS_ANSI(*pScopeEnd) && IS_HTTP_PRINT(*pScopeEnd)
                                ? *pScopeEnd
                                : '?'
                             ));

                RETURN(STATUS_INVALID_PARAMETER);
            }
        } while (++pScopeEnd < pEnd  &&  L']' != *pScopeEnd);

        ASSERT(pScopeEnd > pChar);

        if (pScopeEnd == pEnd)
        {
            UlTraceError(PARSER,
                        ("http!HttppParseIPv6Address: "
                         "No ']' after IPv6 scope ID\n"
                         ));

            RETURN(STATUS_INVALID_PARAMETER);
        }

        ASSERT(L']' == *pScopeEnd);

        Status = HttpWideStringToULong(
                        pChar,
                        pScopeEnd - pChar,
                        FALSE,      // no leading zeros permitted
                        10,
                        &pTerminator,
                        &ScopeTemp
                        );

        if (!NT_SUCCESS(Status))
        {
            UlTraceError(PARSER,
                        ("http!HttppParseIPv6Address: "
                         "Invalid scopeID, %s\n",
                         HttpStatusToString(Status)
                         ));

            RETURN(STATUS_INVALID_PARAMETER);
        }

        // Scope ID does not get swapped to Network Byte Order
        *(UNALIGNED64 ULONG *)&pSockAddr6->sin6_scope_id =
            ScopeTemp;

        ASSERT(pTerminator == pScopeEnd);

        pChar = pScopeEnd;

    } // '%' handling

    ASSERT(pChar < pEnd);
    ASSERT(L']' == *pChar);

    // Skip the terminating ']'
    pChar += WCSLEN_LIT(L"]");

    *ppEnd = pChar;

    RETURN(STATUS_SUCCESS);

}   // HttppParseIPv6Address
/*++

Routine Description:

    Print an IPv4 or IPv6 address as Unicode.

    An IPv4 address is printed through RtlIpv4AddressToStringW(). An IPv6
    address is printed through RtlIpv6AddressToStringW() and wrapped in
    '[' and ']'; the scope ID is not printed. The result is always
    zero-terminated.

Arguments:

    pSockAddr   - The IP address to print

    pBuffer     - Buffer to print to. Assumed to be large enough
                  (presumably MAX_IP_ADDR_PLUS_BRACKETS_STRING_LEN WCHARs,
                  the size handed to HTTP_FILL_BUFFER - TODO confirm).

Return Value:

    Number of wide chars printed (the length), not counting the terminator.
    Zero (and an empty string) if sa_family is neither IPv4 nor IPv6.

--*/
ULONG
HttppPrintIpAddressW(
    IN  PSOCKADDR   pSockAddr,
    OUT PWSTR       pBuffer
    )
{
    PWSTR pResult = pBuffer;

    // This routine is placed in the PAGE section by the alloc_text pragmas
    // at the top of the file, so check the IRQL like the other pageable
    // routines here do.
    PAGED_CODE();

    // Same argument sanity checks as the other routines in this file
    ASSERT(NULL != pSockAddr);
    ASSERT(NULL != pBuffer);

    HTTP_FILL_BUFFER(pBuffer, MAX_IP_ADDR_PLUS_BRACKETS_STRING_LEN);

    if (TDI_ADDRESS_TYPE_IP == pSockAddr->sa_family)
    {
        PSOCKADDR_IN pAddr4 = (PSOCKADDR_IN) pSockAddr;

        pResult = RtlIpv4AddressToStringW(&pAddr4->sin_addr, pResult);
    }
    else if (TDI_ADDRESS_TYPE_IP6 == pSockAddr->sa_family)
    {
        PSOCKADDR_IN6 pAddr6 = (PSOCKADDR_IN6) pSockAddr;

        *pResult++ = L'[';
        pResult = RtlIpv6AddressToStringW(&pAddr6->sin6_addr, pResult);
        // CODEWORK: Handle scope ID
        *pResult++ = L']';
    }
    else
    {
        UlTraceError(PARSER,
                    ("http!HttppPrintIpAddressW(): invalid sa_family, %hd\n",
                     pSockAddr->sa_family
                     ));

        ASSERT(! "Invalid SockAddr Family");
    }

    // On the invalid-family path pResult still equals pBuffer, so this
    // yields an empty string and a length of zero.
    *pResult = UNICODE_NULL;

    return DIFF(pResult - pBuffer);

}   // HttppPrintIpAddressW
/***************************************************************************++

Routine Description:

    This checks to see if the URL is well-formed, and records where each
    component of the URL lives in pParsedUrl.

    A well-formed URL has one of the forms

        scheme://host:port/abspath
        scheme://host:port:routingIP/abspath

    where
        scheme      is "http" or "https" (compared case-sensitively),
        host        is a weak ('*') or strong ('+') wildcard, a bracketed
                    IPv6 literal, an IPv4 literal, or a domain name,
        port        is a non-zero decimal number with no leading zeros,
        routingIP   is an IPv4 literal or a bracketed IPv6 literal; it is
                    not allowed on wildcard sites, and its address family
                    must match that of an IP-literal host,
        abspath     starts with '/'.

    * Must check that the URL is well-formed and in canonical form; e.g.,
        - Disallow /../ and /./
        - Disallow invalid characters, including invalid Unicode surrogate
          pairs. The URL is already in Unicode, so it's not a question of
          using the IS_URL_TOKEN() macro.
        - Disallow '%', '*', '?', and '\' anywhere in the abspath.

    A URL can be well-formed without being normalized. pParsedUrl->Normalized
    is cleared (and the URL must be rewritten by HttpNormalizeParsedUrl())
    when:
        - TrailingSlashReqd is set and the URL does not end in '/';
        - the host or routing IP literal is not spelled exactly as
          HttppPrintIpAddressW() would print it (case-insensitively);
        - ForceRoutingIP is set, the host is an IP literal, and there is
          no routing IP.

Arguments:

    pCfg                - configuration parameters (label, hostname, path,
                          and segment limits; AllowRestrictedChars)

    pUrl                - Unicode string containing URL (not assumed to be
                          zero-terminated)

    UrlLength           - length of pUrl, in WCHARs

    TrailingSlashReqd   - if TRUE, pUrl must end in '/'

    ForceRoutingIP      - if TRUE and the hostname is an IPv4 or IPv6 literal,
                          pParsedUrl->Normalized will be cleared, to force
                          HttpNormalizeParsedUrl() to rewrite the URL as
                          http://IP:port:IP/path

    pParsedUrl          - on successful exit, the components of the URL.
                          The structure is zeroed on entry and filled in as
                          parsing progresses, so on failure it is only
                          partially populated. The component pointers refer
                          into pUrl; nothing is copied.

Return Value:

    NTSTATUS - STATUS_SUCCESS if the URL is well-formed;
               STATUS_INVALID_PARAMETER for most syntax errors;
               otherwise the failure status from HttppParseIPv6Address()
               or RtlIpv4StringToAddressW().

--***************************************************************************/
NTSTATUS
HttpParseUrl(
    IN  PURL_C14N_CONFIG    pCfg,
    IN  PCWSTR              pUrl,
    IN  ULONG               UrlLength,
    IN  BOOLEAN             TrailingSlashReqd,
    IN  BOOLEAN             ForceRoutingIP,
    OUT PHTTP_PARSED_URL    pParsedUrl
    )
{
    NTSTATUS            Status;
    ULONG               PreviousChar;
    ULONG               UnicodeChar;
    PCWSTR              pEnd = pUrl + UrlLength;
    PCWSTR              pHostname;
    PCWSTR              pChar;
    PCWSTR              pLabel;
    PCWSTR              pSlash;
    PCWSTR              pSegment;
    PWSTR               pTerminator;
    BOOLEAN             AlphaLabel;
    BOOLEAN             TestSegment;
    BOOLEAN             MoreChars;
    BOOLEAN             LastCharHack;
    ULONG               SegmentCount;
    URL_STATE           UrlState;
    URL_STATE_TOKEN     UrlToken;
    URL_ACTION          Action;
    WCHAR               IpAddr[MAX_IP_ADDR_PLUS_BRACKETS_STRING_LEN];
    ULONG               Length;

    //
    // Sanity check.
    //

    PAGED_CODE();

    ASSERT(NULL != pCfg);
    ASSERT(NULL != pUrl);
    ASSERT(0 < UrlLength  &&  UrlLength <= UNICODE_STRING_MAX_WCHAR_LEN);
    ASSERT(FALSE == TrailingSlashReqd  ||  TRUE == TrailingSlashReqd);
    ASSERT(FALSE == ForceRoutingIP  ||  TRUE == ForceRoutingIP);
    ASSERT(NULL != pParsedUrl);

    // Start from a clean slate. Normalized is optimistically set and is
    // cleared below whenever a rewrite turns out to be necessary.
    RtlZeroMemory(pParsedUrl, sizeof(*pParsedUrl));

    pParsedUrl->Signature           = HTTP_PARSED_URL_SIGNATURE;
    pParsedUrl->pFullUrl            = (PWSTR) pUrl;
    pParsedUrl->UrlLength           = (USHORT) UrlLength;
    pParsedUrl->Normalized          = TRUE;
    pParsedUrl->TrailingSlashReqd   = TrailingSlashReqd;

    // This is the shortest possible valid URL
    if (UrlLength < WCSLEN_LIT(L"http://*:1/"))
    {
        UlTraceError(PARSER,
                     ("http!HttpParseUrl: Url too short, %lu, %.*ls\n",
                      UrlLength, UrlLength, pUrl
                      ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    // Check the scheme. The length check above guarantees that both
    // comparisons stay within the buffer.
    if (0 == wcsncmp(pUrl, L"http://", WCSLEN_LIT(L"http://")))
    {
        pParsedUrl->Secure = FALSE;
        pHostname = pUrl + WCSLEN_LIT(L"http://");
    }
    else if (0 == wcsncmp(pUrl, L"https://", WCSLEN_LIT(L"https://")))
    {
        pParsedUrl->Secure = TRUE;
        pHostname = pUrl + WCSLEN_LIT(L"https://");
    }
    else
    {
        UlTraceError(PARSER,
                     ("http!HttpParseUrl: invalid scheme, %.*ls\n",
                      UrlLength, pUrl
                      ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    pParsedUrl->pHostname = (PWSTR) pHostname;

    // Is a trailing slash present, if required?
    if (TrailingSlashReqd  &&  L'/' != pUrl[UrlLength - 1])
    {
        // No, then the URL will have to be rewritten
        pParsedUrl->Normalized = FALSE;
    }

    //
    // The hostname validation code below looks a lot like that in
    // HttpValidateHostname(). However, it is sufficiently different
    // (WCHAR vs. UCHAR, Host+IP, Scope IDs, compulsory ports, etc) that
    // it is not easy to combine them into one routine. If the hostname
    // validation code is changed here, it may be necessary to change it
    // in HttpValidateHostname() too, or vice versa.
    //

    // Check for weak (http://*:port/) and strong (http://+:port/) wildcards
    if (L'*' == *pHostname  ||  L'+' == *pHostname)
    {
        pParsedUrl->SiteType = (L'*' == *pHostname)
                                    ? HttpUrlSite_WeakWildcard
                                    : HttpUrlSite_StrongWildcard;

        pChar = pHostname + WCSLEN_LIT(L"*");
        ASSERT(pChar < pEnd);

        // The wildcard must be followed by ":port"
        if (L':' == *pChar)
            goto port;

        UlTraceError(PARSER,
                     ("http!HttpParseUrl: No port in '%c' wildcard address\n",
                      *pHostname
                      ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    // Is this an IPv6 literal address, per RFC 2732?
    if (L'[' == *pHostname)
    {
        pParsedUrl->SiteType = HttpUrlSite_IP;

        // On success, pChar is left just past the closing ']'
        Status = HttppParseIPv6Address(
                        pHostname,
                        DIFF(pEnd - pHostname),
                        TRUE,           // scope ID allowed
                        &pParsedUrl->SockAddr6,
                        &pChar);

        if (!NT_SUCCESS(Status))
        {
            UlTraceError(PARSER,
                         ("http!HttpParseUrl: "
                          "Invalid IPv6 address, %s\n",
                          HttpStatusToString(Status)
                          ));

            RETURN(Status);
        }

        ASSERT(TDI_ADDRESS_TYPE_IP6 == pParsedUrl->SockAddr.sa_family);
        ASSERT(pChar > pHostname);

        // There must be a port
        if (pChar == pEnd  ||  L':' != *pChar)
        {
            UlTraceError(PARSER,
                         ("http!HttpParseUrl: No port after IPv6 address\n"
                          ));

            RETURN(STATUS_INVALID_PARAMETER);
        }

        //
        // There are so many legitimate ways to write an IPv6 literal
        // that we can't assume that a valid IPv6 literal is normalized.
        // Since we do string comparisons, we'll have to rewrite the URL
        // if the Normalized flag is not set.
        //

        Length = HttppPrintIpAddressW(&pParsedUrl->SockAddr, IpAddr);

        if (Length != DIFF_USHORT(pChar - pHostname)
            ||  0 != _wcsnicmp(pHostname, IpAddr, Length))
        {
            pParsedUrl->Normalized = FALSE;
        }

        goto port;

    }   // IPv6

    //
    // It must be a domain name or an IPv4 literal. We'll try to treat
    // it as a domain name first. If the labels turn out to be all-numeric,
    // we'll try decoding it as an IPv4 literal.
    //

    AlphaLabel = FALSE;     // set as soon as any non-digit char is seen
    pLabel     = pHostname; // start of the label currently being scanned

    for (pChar = pHostname;  pChar < pEnd;  ++pChar)
    {
        if (L':' == *pChar)
        {
            if (pChar == pHostname)
            {
                UlTraceError(PARSER,
                             ("http!HttpParseUrl: empty hostname\n"
                              ));

                RETURN(STATUS_INVALID_PARAMETER);
            }

            // NOTE(review): the label that ends at this ':' is not
            // compared against pCfg->MaxLabelLength; only labels that end
            // at a '.' are length-checked. Confirm that this is intended.

            // Have we seen any non-digits?
            if (AlphaLabel)
            {
                ASSERT(0 == pParsedUrl->SockAddr.sa_family);
                pParsedUrl->SiteType = HttpUrlSite_Name;
                goto port;
            }

            // Only digits and dots: it has to be an IPv4 literal
            pParsedUrl->SiteType = HttpUrlSite_IP;
            pParsedUrl->SockAddr4.sin_family = TDI_ADDRESS_TYPE_IP;
            ASSERT(TDI_ADDRESS_TYPE_IP == pParsedUrl->SockAddr.sa_family);

            // Let's see if it's a valid IPv4 address
            Status = RtlIpv4StringToAddressW(
                        pHostname,
                        TRUE,           // strict => 4 dotted decimal octets
                        &pTerminator,
                        &pParsedUrl->SockAddr4.sin_addr
                        );

            if (!NT_SUCCESS(Status))
            {
                UlTraceError(PARSER,
                             ("http!HttpParseUrl: "
                              "Invalid IPv4 address, %s\n",
                              HttpStatusToString(Status)
                              ));

                RETURN(Status);
            }

            // The address must consume everything up to the ':'
            if (pTerminator != pChar)
            {
                ASSERT(pTerminator < pChar);

                UlTraceError(PARSER,
                             ("http!HttpParseUrl: "
                              "Invalid IPv4 address after %lu chars, "
                              "U+%04X, '%c'\n",
                              DIFF(pTerminator - pHostname),
                              *pTerminator,
                              IS_ANSI(*pTerminator) && IS_HTTP_PRINT(*pTerminator)
                                  ? *pTerminator
                                  : '?'
                              ));

                RETURN(STATUS_INVALID_PARAMETER);
            }

            // Same normalization test as for IPv6 literals
            Length = HttppPrintIpAddressW(&pParsedUrl->SockAddr, IpAddr);

            if (Length != DIFF_USHORT(pChar - pHostname)
                ||  0 != _wcsnicmp(pHostname, IpAddr, Length))
            {
                pParsedUrl->Normalized = FALSE;
            }

            goto port;

        }   // ':' handling

        if (L'.' == *pChar)
        {
            ULONG LabelLength = DIFF(pChar - pLabel);

            // There must be at least one char in the label
            if (0 == LabelLength)
            {
                UlTraceError(PARSER,
                             ("http!HttpParseUrl: empty label\n"
                              ));

                RETURN(STATUS_INVALID_PARAMETER);
            }

            // Label can't have more than 63 chars
            if (LabelLength > pCfg->MaxLabelLength)
            {
                UlTraceError(PARSER,
                             ("http!HttpParseUrl: overlong label, %lu\n",
                              LabelLength
                              ));

                RETURN(STATUS_INVALID_PARAMETER);
            }

            // Reset for the next label
            pLabel = pChar + WCSLEN_LIT(L".");

            continue;
        }

        //
        // All chars above 0xFF are considered valid
        //

        if (!IS_ANSI(*pChar)  ||  !IS_URL_ILLEGAL_COMPUTERNAME(*pChar))
        {
            if (!IS_ANSI(*pChar)  ||  !IS_HTTP_DIGIT(*pChar))
                AlphaLabel = TRUE;

            if (pChar > pLabel)
                continue;

            // The first char of a label cannot be a hyphen. (Underscore?)
            if (L'-' == *pChar)
            {
                UlTraceError(PARSER,
                             ("http!HttpParseUrl: '-' at beginning of label\n"
                              ));

                RETURN(STATUS_INVALID_PARAMETER);
            }

            continue;
        }

        // pHostname is a wide string, so it must be printed with %ls
        // (as is done for pUrl above), not %s
        UlTraceError(PARSER,
                     ("http!HttpParseUrl: "
                      "Invalid char in hostname, U+%04X '%c',"
                      " after %lu chars, '%.*ls'\n",
                      *pChar,
                      IS_ANSI(*pChar) && IS_HTTP_PRINT(*pChar) ? *pChar : '?',
                      DIFF(pChar - pHostname),
                      DIFF(pChar - pHostname),
                      pHostname
                      ));

        RETURN(STATUS_INVALID_PARAMETER);

    }   // hostname

    //
    // If we got here, we fell off the end of the buffer,
    // without finding a ':' for the port
    //

    ASSERT(pChar == pEnd);

    UlTraceError(PARSER, ("http!HttpParseUrl: No port\n"));

    RETURN(STATUS_INVALID_PARAMETER);


port:

    //
    // Parse the port number. pChar points at the ':' that ends the hostname.
    //

    ASSERT(pHostname < pChar  &&  pChar < pEnd);
    ASSERT(L':' == *pChar);

    pParsedUrl->HostnameLength = DIFF_USHORT(pChar - pHostname);

    // First, check for overlong hostnames
    if (pParsedUrl->HostnameLength > pCfg->MaxHostnameLength)
    {
        UlTraceError(PARSER,
                     ("http!HttpParseUrl: overlong hostname, %hu\n",
                      pParsedUrl->HostnameLength
                      ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    // Skip the ':' denoting a port number
    pChar += WCSLEN_LIT(L":");

    if (pChar == pEnd)
    {
        UlTraceError(PARSER,
                     ("http!HttpParseUrl: No port after ':'\n"
                      ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    // Search for the '/' or second ':' that terminates the port number
    pSlash = pChar;
    pParsedUrl->pPort = (PWSTR) pSlash;

    // do-while, so that the first char must be a digit: an empty port
    // ("host:/") is rejected here
    do
    {
        if (*pSlash < L'0'  ||  *pSlash > L'9')
        {
            UlTraceError(PARSER,
                         ("http!HttpParseUrl: "
                          "Invalid digit in port, U+%04X, '%c'\n",
                          *pSlash,
                          IS_ANSI(*pSlash) && IS_HTTP_PRINT(*pSlash)
                              ? *pSlash
                              : '?'
                          ));

            RETURN(STATUS_INVALID_PARAMETER);
        }

    } while (++pSlash < pEnd  &&  L'/' != *pSlash  &&  L':' != *pSlash);

    ASSERT(pSlash > pChar);

    pParsedUrl->PortLength = DIFF_USHORT(pSlash - pChar);

    if (pSlash == pEnd)
    {
        UlTraceError(PARSER,
                     ("http!HttpParseUrl: No '/' (or second ':') after port\n"
                      ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    ASSERT(L'/' == *pSlash  ||  L':' == *pSlash);

    Status = HttpWideStringToUShort(
                    pChar,
                    pParsedUrl->PortLength,
                    FALSE,          // no leading zeros permitted
                    10,
                    &pTerminator,
                    &pParsedUrl->PortNumber
                    );

    if (!NT_SUCCESS(Status))
    {
        UlTraceError(PARSER,
                     ("http!HttpParseUrl: "
                      "Invalid port number, %s\n",
                      HttpStatusToString(Status)
                      ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    if (0 == pParsedUrl->PortNumber)
    {
        UlTraceError(PARSER,
                     ("http!HttpParseUrl: Port must not be zero.\n"
                      ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    ASSERT(pTerminator == pSlash);

    pChar = pSlash;

    goto routing_IP;    // so /W4 won't complain about an unreferenced label


routing_IP:

    //
    // Is this a Host+IP site; i.e., is there a Routing IP address
    // after the port number?
    //

    if (L'/' == *pChar)
    {
        // No routing IP: the abspath starts right after the port
        pParsedUrl->pRoutingIP = NULL;
        pParsedUrl->RoutingIPLength = 0;
        ASSERT(0 == pParsedUrl->RoutingAddr.sa_family);

        //
        // If the hostname is an IP literal, but there is no routing IP
        // (i.e., http://IP:port/path), we must rewrite the URL as
        // http://IP:port:IP/path; i.e., explicitly use the hostname IP
        // as the routing IP.
        //

        if (ForceRoutingIP  &&  0 != pParsedUrl->SockAddr.sa_family)
        {
            ASSERT(TDI_ADDRESS_TYPE_IP  == pParsedUrl->SockAddr.sa_family
                   ||  TDI_ADDRESS_TYPE_IP6 == pParsedUrl->SockAddr.sa_family);

            pParsedUrl->Normalized = FALSE;
        }

        goto parse_path;
    }

    ASSERT(L':' == *pChar);

    if (HttpUrlSite_WeakWildcard == pParsedUrl->SiteType
        ||  HttpUrlSite_StrongWildcard == pParsedUrl->SiteType)
    {
        UlTraceError(PARSER,
                     ("http!HttpParseUrl: "
                      "Can't have Routing IPs on Wildcard sites\n"
                      ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    // Skip the second ':'
    pChar += WCSLEN_LIT(L":");

    if (pChar == pEnd)
    {
        UlTraceError(PARSER,
                     ("http!HttpParseUrl: No IP address after second ':'\n"
                      ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    pParsedUrl->pRoutingIP = (PWSTR) pChar;

    ASSERT(HttpUrlSite_NamePlusIP != pParsedUrl->SiteType);

    // A named site with a routing IP becomes Name+IP; an IP-literal site
    // keeps its SiteType
    if (HttpUrlSite_Name == pParsedUrl->SiteType)
    {
        pParsedUrl->SiteType = HttpUrlSite_NamePlusIP;
    }

    //
    // Is the Routing IP an IPv6 literal?
    //

    if (L'[' == *pChar)
    {
        // The address families of host and routing IP may not be mixed
        if (TDI_ADDRESS_TYPE_IP == pParsedUrl->SockAddr.sa_family)
        {
            UlTraceError(PARSER,
                         ("http!HttpParseUrl: "
                          "Can't have http://IPv4:port:[IPv6]\n"
                          ));

            RETURN(STATUS_INVALID_PARAMETER);
        }

        ASSERT(TDI_ADDRESS_TYPE_IP6 == pParsedUrl->SockAddr.sa_family
               ||  0 == pParsedUrl->SockAddr.sa_family);

        Status = HttppParseIPv6Address(
                        pChar,
                        DIFF(pEnd - pChar),
                        TRUE,           // scope ID allowed
                        &pParsedUrl->RoutingAddr6,
                        &pSlash);

        if (!NT_SUCCESS(Status))
        {
            UlTraceError(PARSER,
                         ("http!HttpParseUrl: "
                          "Invalid Host+IPv6 address, %s\n",
                          HttpStatusToString(Status)
                          ));

            RETURN(Status);
        }

        ASSERT(TDI_ADDRESS_TYPE_IP6 == pParsedUrl->RoutingAddr.sa_family);
        ASSERT(pSlash > pChar);

        // There must be a slash
        if (pSlash == pEnd  ||  L'/' != *pSlash)
        {
            UlTraceError(PARSER,
                         ("http!HttpParseUrl: '/' expected after Host+IPv6.\n"
                          ));

            RETURN(STATUS_INVALID_PARAMETER);
        }

        // CODEWORK: Should we care if RoutingAddr6 != SockAddr6?

        pParsedUrl->RoutingIPLength = DIFF_USHORT(pSlash - pChar);

        // Is the routing IP spelled the way we would print it?
        Length = HttppPrintIpAddressW(&pParsedUrl->RoutingAddr, IpAddr);

        if (Length != pParsedUrl->RoutingIPLength
            ||  0 != _wcsnicmp(pChar, IpAddr, Length))
        {
            pParsedUrl->Normalized = FALSE;
        }

        pChar = pSlash;

        goto parse_path;
    }

    //
    // No, then it must be an IPv4 literal
    //

    if (TDI_ADDRESS_TYPE_IP6 == pParsedUrl->SockAddr.sa_family)
    {
        UlTraceError(PARSER,
                     ("http!HttpParseUrl: Can't have http://[IPv6]:port:IPv4\n"
                      ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    ASSERT(TDI_ADDRESS_TYPE_IP == pParsedUrl->SockAddr.sa_family
           ||  0 == pParsedUrl->SockAddr.sa_family);

    // Search for the terminating '/'
    pSlash = pChar;

    // Only digits and dots may appear before the '/'
    do
    {
        if ((L'0' <= *pSlash  &&  *pSlash <= L'9')  ||  L'.' == *pSlash)
            continue;

        UlTraceError(PARSER,
                     ("http!HttpParseUrl: "
                      "Invalid character in Host+IPv4, U+%04X, '%c'\n",
                      *pSlash,
                      IS_ANSI(*pSlash) && IS_HTTP_PRINT(*pSlash)
                          ? *pSlash
                          : '?'
                      ));

        RETURN(STATUS_INVALID_PARAMETER);

    } while (++pSlash < pEnd  &&  L'/' != *pSlash);

    ASSERT(pSlash > pChar);

    if (pSlash == pEnd)
    {
        UlTraceError(PARSER,
                     ("http!HttpParseUrl: No '/' after Host+IPv4\n"
                      ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    ASSERT(L'/' == *pSlash);

    Status = RtlIpv4StringToAddressW(
                pChar,
                TRUE,           // strict => 4 dotted decimal octets
                &pTerminator,
                &pParsedUrl->RoutingAddr4.sin_addr
                );

    if (!NT_SUCCESS(Status))
    {
        UlTraceError(PARSER,
                     ("http!HttpParseUrl: "
                      "Invalid Host+IPv4 address, %s\n",
                      HttpStatusToString(Status)
                      ));

        RETURN(Status);
    }

    // The address must consume everything up to the '/'
    if (pTerminator != pSlash)
    {
        ASSERT(pTerminator < pSlash);

        UlTraceError(PARSER,
                     ("http!HttpParseUrl: "
                      "Invalid Host+IPv4 address after %lu chars, "
                      "U+%04X, '%c'\n",
                      DIFF(pTerminator - pChar),
                      *pTerminator,
                      IS_ANSI(*pTerminator) && IS_HTTP_PRINT(*pTerminator)
                          ? *pTerminator
                          : '?'
                      ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    // CODEWORK: Should we care if RoutingAddr4 != SockAddr4

    pParsedUrl->RoutingIPLength = DIFF_USHORT(pSlash - pChar);
    pParsedUrl->RoutingAddr4.sin_family = TDI_ADDRESS_TYPE_IP;

    // Is the routing IP spelled the way we would print it?
    Length = HttppPrintIpAddressW(&pParsedUrl->RoutingAddr, IpAddr);

    if (Length != pParsedUrl->RoutingIPLength
        ||  0 != _wcsnicmp(pChar, IpAddr, Length))
    {
        pParsedUrl->Normalized = FALSE;
    }

    pChar = pSlash;


parse_path:

    //
    // Parse the abspath. pChar points at its leading '/'.
    //

    ASSERT(pParsedUrl->pRoutingIP == NULL || pParsedUrl->RoutingIPLength > 0);
    ASSERT(pHostname < pChar  &&  pChar < pEnd);
    ASSERT(L'/' == *pChar);

    pParsedUrl->pAbsPath = (PWSTR) pChar;
    pParsedUrl->AbsPathLength = DIFF_USHORT(pEnd - pChar);

    if (pParsedUrl->AbsPathLength > pCfg->UrlMaxLength)
    {
        UlTraceError(PARSER,
                     ("http!HttpParseUrl: "
                      "AbsPath is too long: %lu\n",
                      pParsedUrl->AbsPathLength
                      ));

        RETURN(STATUS_INVALID_PARAMETER);
    }

    UrlState     = URL_STATE_START;
    UrlToken     = URL_TOKEN_OTHER;
    Action       = ACTION_NOTHING;
    pSegment     = pChar;           // start of the current path segment
    TestSegment  = FALSE;           // TRUE when a segment has just ended
    LastCharHack = FALSE;           // TRUE once the final fake NUL is fed
    MoreChars    = TRUE;
    PreviousChar = UNICODE_NULL;
    UnicodeChar  = *pChar;
    SegmentCount = 0;

    //
    // Loop through all the characters in pAbsPath, plus one or two
    // special ones at the end.
    //

    while (MoreChars)
    {
        // Classify the current char as a token for the state machine
        switch (UnicodeChar)
        {
        case UNICODE_NULL:
            UrlToken    = URL_TOKEN_EOS;
            TestSegment = TRUE;
            break;

        case DOT:
            UrlToken    = URL_TOKEN_DOT;
            TestSegment = FALSE;
            break;

        case FORWARD_SLASH:
            UrlToken    = URL_TOKEN_SLASH;
            TestSegment = TRUE;
            break;

        case PERCENT:       // no hex escapes
        case STAR:          // no wildcards
        case QUESTION_MARK: // no wildcards or querystrings
        case BACK_SLASH:    // no C string escapes
            UlTraceError(PARSER,
                         ("http!HttpParseUrl: invalid '%c' char in path\n",
                          (UCHAR) UnicodeChar
                          ));

            RETURN(STATUS_INVALID_PARAMETER);

        default:
            UrlToken    = URL_TOKEN_OTHER;
            TestSegment = FALSE;
            break;
        }

        UlTraceVerbose(PARSER,
                       ("http!HttpParseUrl: "
                        "[%lu] U+%04lX '%c' %p: [%s][%s] -> %s, %s\n",
                        DIFF(pChar - pParsedUrl->pAbsPath),
                        UnicodeChar,
                        IS_ANSI(UnicodeChar) && IS_HTTP_PRINT(UnicodeChar)
                            ? (UCHAR) UnicodeChar : '?',
                        pChar,
                        HttppUrlStateToString(UrlState),
                        HttppUrlTokenToString(UrlToken),
                        HttppUrlStateToString(
                            CanonStateFromStateAndToken[UrlState][UrlToken]),
                        TestSegment ? ", TestSegment" : ""
                        ));

        //
        // Reject control characters
        // (but not the fake terminating UNICODE_NULL fed in at the end)
        //

        if (!LastCharHack
            &&  !pCfg->AllowRestrictedChars
            &&  IS_ANSI(UnicodeChar)
            &&  IS_URL_INVALID(UnicodeChar))
        {
            UlTraceError(PARSER, (
                        "http!HttpParseUrl: "
                        "Invalid character, U+%04lX, in path.\n",
                        UnicodeChar
                        ));

            RETURN(STATUS_INVALID_PARAMETER);
        }

        //
        // Check that (high-surrogate, low-surrogate) come in pairs
        //

        if (HIGH_SURROGATE_START <= PreviousChar
            &&  PreviousChar <= HIGH_SURROGATE_END)
        {
            // A high surrogate must be followed by a low surrogate
            if (UnicodeChar < LOW_SURROGATE_START
                ||  UnicodeChar > LOW_SURROGATE_END)
            {
                UlTraceError(PARSER, (
                            "http!HttpParseUrl: "
                            "Illegal surrogate pair, U+%04lX, U+%04lX.\n",
                            PreviousChar, UnicodeChar
                            ));

                RETURN(STATUS_INVALID_PARAMETER);
            }
        }
        else if (LOW_SURROGATE_START <= UnicodeChar
                 &&  UnicodeChar <= LOW_SURROGATE_END)
        {
            // A low surrogate must be preceded by a high surrogate
            UlTraceError(PARSER, (
                        "http!HttpParseUrl: "
                        "Non-high surrogate, U+%04lX, "
                        "before low surrogate, U+%04lX.\n",
                        PreviousChar, UnicodeChar
                        ));

            RETURN(STATUS_INVALID_PARAMETER);
        }

        // Advance the state machine; bail out on a forbidden transition
        if (URL_STATE_ERROR == CanonStateFromStateAndToken[UrlState][UrlToken])
        {
            UlTraceError(PARSER, (
                        "http!HttpParseUrl: "
                        "Error state from %s,%s in path, after U+%04lX.\n",
                        HttppUrlStateToString(UrlState),
                        HttppUrlTokenToString(UrlToken),
                        UnicodeChar
                        ));

            RETURN(STATUS_INVALID_PARAMETER);
        }

        UrlState = CanonStateFromStateAndToken[UrlState][UrlToken];

        //
        // Check segment limits
        //

        if (TestSegment)
        {
            ULONG SegmentLength = DIFF(pChar - pSegment);

            // The CanonStateFromStateAndToken checks should prevent
            // empty segments, among other things
            ASSERT(SegmentLength > 0  ||  pChar == pSegment);

            // Reject if segment too long
            // (SegmentLength includes the segment's leading '/')
            if (SegmentLength > pCfg->UrlSegmentMaxLength + WCSLEN_LIT(L"/"))
            {
                UlTraceError(PARSER, (
                            "http!HttpParseUrl(): "
                            "Segment too long: %lu\n",
                            SegmentLength
                            ));

                RETURN(STATUS_INVALID_PARAMETER);
            }

            pSegment = pChar;

            // Reject if too many path segments
            if (++SegmentCount > pCfg->UrlSegmentMaxCount)
            {
                UlTraceError(PARSER, (
                            "http!HttpParseUrl(): "
                            "Too many segments: %lu\n",
                            SegmentCount
                            ));

                RETURN(STATUS_INVALID_PARAMETER);
            }
        }

        //
        // Are there any more path characters?
        //

        PreviousChar = UnicodeChar;

        if (++pChar < pEnd)
        {
            UnicodeChar = *pChar;
        }
        else if (!LastCharHack)
        {
            // Want to make sure that the last segment is tested.
            // If there's no trailing slash, we'll enter here twice;
            // otherwise once

            if (TrailingSlashReqd  &&  FORWARD_SLASH != PreviousChar)
            {
                // First, fake a trailing slash, if needed
                UnicodeChar = FORWARD_SLASH;
            }
            else
            {
                // Second, always finish up with UNICODE_NULL
                UnicodeChar = UNICODE_NULL;
                LastCharHack = TRUE;
            }
        }
        else
        {
            // Terminate the loop
            MoreChars = FALSE;
        }

    }   // while (MoreChars)

    RETURN(STATUS_SUCCESS);

}   // HttpParseUrl
/***************************************************************************++

Routine Description:

    Some URLs parsed by HttpParseUrl() will not be considered normalized
    if they have IP literals, Routing IPs, or no trailing slash.
    This routine will build a fully normalized URL in a newly allocated
    buffer, re-parse it, and (possibly) free the old one.

    The normalized URL is assembled as

        scheme://  host  :port  [:routingIP]  abspath  [/]

    where IP-literal hosts and routing IPs are re-printed with
    HttppPrintIpAddressW(), and a trailing '/' is appended if it is
    required but missing.

    If the URL is already normalized and ForceCopy is FALSE, nothing is
    done and STATUS_SUCCESS is returned.

Arguments:

    pParsedUrl      - On entry, points to a URL parsed by HttpParseUrl();
                      On successful exit, points to a normalized URL.
                      On failure, it is left unmodified.

    pCfg            - configuration parameters

    ForceCopy       - if TRUE, will always make a new, normalized URL

    FreeOriginalUrl - if FALSE, will never free the original URL.
                      The caller must manage the memory.
                      The original URL is only ever freed on success.

    ForceRoutingIP  - if TRUE and the hostname is an IPv4 or IPv6 literal,
                      the URL will be rewritten in the form
                      http://IP:port:IP/path

    PoolType        - PagedPool or NonPagedPool

    PoolTag         - Tag used to allocate pUrl

Return Value:

    NTSTATUS - STATUS_SUCCESS;
               STATUS_NO_MEMORY if the new URL could not be allocated;
               or the failure status from HttpParseUrl() if the rewritten
               URL is rejected. In both failure cases, the original URL
               and *pParsedUrl are untouched.

--***************************************************************************/
NTSTATUS
HttpNormalizeParsedUrl(
    IN OUT PHTTP_PARSED_URL pParsedUrl,
    IN     PURL_C14N_CONFIG pCfg,
    IN     BOOLEAN          ForceCopy,
    IN     BOOLEAN          FreeOriginalUrl,
    IN     BOOLEAN          ForceRoutingIP,
    IN     POOL_TYPE        PoolType,
    IN     ULONG            PoolTag
    )
{
    // Work on a local copy; the caller's structure is only updated
    // once the rewrite has fully succeeded
    HTTP_PARSED_URL ParsedUrl = *pParsedUrl;
    NTSTATUS        Status    = STATUS_SUCCESS;

    ASSERT(HTTP_PARSED_URL_SIGNATURE == ParsedUrl.Signature);

    if (ParsedUrl.Normalized  &&  !ForceCopy)
    {
        // nothing to do
    }
    else
    {
        PWSTR   pResult;
        WCHAR   HostAddrString[MAX_IP_ADDR_PLUS_BRACKETS_STRING_LEN];
        WCHAR   RoutingAddrString[MAX_IP_ADDR_PLUS_BRACKETS_STRING_LEN];
        ULONG   SchemeLength;
        ULONG   HostAddrLength;
        ULONG   HostnameLength;
        ULONG   RoutingAddrLength;
        ULONG   AbsPathLength;
        ULONG   Length;
        ULONG   TrailingSlashLength;
        PCWSTR  pUrl;

        // Remember the original buffer: ParsedUrl.pFullUrl is overwritten
        // by the re-parse below
        pUrl = ParsedUrl.pFullUrl;

        // Everything before the hostname; i.e., "http://" or "https://"
        SchemeLength = DIFF(ParsedUrl.pHostname - ParsedUrl.pFullUrl);

        // Calculate HostAddrLength and HostnameLength (mutually exclusive)
        if (0 != ParsedUrl.SockAddr.sa_family)
        {
            // IP-literal host: use the canonical printed form
            HostAddrLength = HttppPrintIpAddressW(
                                    &ParsedUrl.SockAddr,
                                    HostAddrString
                                    );
            HostnameLength = 0;
        }
        else
        {
            // Named or wildcard host: copied verbatim from the original
            HostAddrLength = 0;
            HostAddrString[0] = UNICODE_NULL;
            HostnameLength = ParsedUrl.HostnameLength;
        }

        // Calculate RoutingAddrLength
        // (when non-zero, it includes the ':' separator)
        if (0 != ParsedUrl.RoutingAddr.sa_family)
        {
            RoutingAddrLength = WCSLEN_LIT(L":")
                                + HttppPrintIpAddressW(
                                        &ParsedUrl.RoutingAddr,
                                        RoutingAddrString
                                        );
        }
        else if (ForceRoutingIP  &&  0 != ParsedUrl.SockAddr.sa_family)
        {
            // We must rewrite http://IP:port/path as http://IP:port:IP/path
            RoutingAddrLength = WCSLEN_LIT(L":") + HostAddrLength;
            wcscpy(RoutingAddrString, HostAddrString);
        }
        else
        {
            RoutingAddrLength = 0;
            RoutingAddrString[0] = UNICODE_NULL;
        }

        AbsPathLength = ParsedUrl.AbsPathLength;
        ASSERT(AbsPathLength > 0);

        // Do we have to append a trailing slash?
        if (ParsedUrl.TrailingSlashReqd
            &&  FORWARD_SLASH != ParsedUrl.pAbsPath[AbsPathLength-1])
        {
            TrailingSlashLength = WCSLEN_LIT(L"/");
        }
        else
        {
            TrailingSlashLength = 0;
        }

        // Total length of the normalized URL, in WCHARs,
        // not counting the terminator
        Length = SchemeLength
                 + HostAddrLength
                 + HostnameLength
                 + WCSLEN_LIT(L":") + ParsedUrl.PortLength
                 + RoutingAddrLength
                 + AbsPathLength
                 + TrailingSlashLength;

        pResult = (PWSTR) HTTPP_ALLOC(
                                PoolType,
                                (Length + 1) * sizeof(WCHAR),
                                PoolTag
                                );

        if (NULL == pResult)
        {
            Status = STATUS_NO_MEMORY;

            // Do not destroy the old URL. Let caller handle it.
        }
        else
        {
            PWSTR pDest = pResult;

            // Helpers that append Length WCHARs at pDest and advance it.
            // WCSNCPY expands to two statements, so it must only be used
            // as a full statement inside braces.

#define WCSNCPY(pSrc, Length)                                   \
    RtlCopyMemory(pDest, (pSrc), (Length) * sizeof(WCHAR));     \
    pDest += (Length)

#define WCSNCPY2(pField, Length)                                \
    WCSNCPY(ParsedUrl.pField, Length)

#define WCSNCPY_LIT(Lit)                                        \
    WCSNCPY(Lit, WCSLEN_LIT(Lit))

            // scheme://
            WCSNCPY2(pFullUrl, SchemeLength);

            // host
            if (0 != HostnameLength)
            {
                ASSERT(0 == HostAddrLength);
                WCSNCPY2(pHostname, HostnameLength);
            }
            else
            {
                ASSERT(0 != HostAddrLength);
                WCSNCPY(HostAddrString, HostAddrLength);
            }

            // :port
            WCSNCPY_LIT(L":");
            WCSNCPY2(pPort, ParsedUrl.PortLength);

            // :routingIP, if any
            if (RoutingAddrLength > 0)
            {
                WCSNCPY_LIT(L":");
                WCSNCPY(
                    RoutingAddrString,
                    RoutingAddrLength - WCSLEN_LIT(L":")
                    );
            }

            // abspath, plus the trailing slash if it was missing
            WCSNCPY2(pAbsPath, AbsPathLength);

            if (TrailingSlashLength > 0)
            {
                WCSNCPY_LIT(L"/");
            }

            ASSERT(DIFF(pDest - pResult) == Length);

            *pDest = UNICODE_NULL;

            // Re-parse, so that all the pointers and lengths in ParsedUrl
            // refer to the new buffer
            Status = HttpParseUrl(
                            pCfg,
                            pResult,
                            Length,
                            ParsedUrl.TrailingSlashReqd,
                            ForceRoutingIP,
                            &ParsedUrl
                            );

            ASSERT(STATUS_SUCCESS == Status);

            if (!NT_SUCCESS(Status))
            {
                //
                // The rewritten URL was rejected; e.g., appending the
                // trailing slash can push the abspath one WCHAR past
                // pCfg->UrlMaxLength. Behave like the allocation-failure
                // case above: release the new buffer, and do not destroy
                // the old URL or touch the caller's HTTP_PARSED_URL.
                // Let caller handle it.
                //

                HTTPP_FREE(pResult, PoolTag);
            }
            else
            {
                ASSERT(ParsedUrl.Normalized);

                if (FreeOriginalUrl)
                    HTTPP_FREE((PVOID) pUrl, PoolTag);

                // Write the updated local copy back to the caller's
                // HTTP_PARSED_URL
                *pParsedUrl = ParsedUrl;
            }
        }
    }

    return Status;

}   // HttpNormalizeParsedUrl