windows-server-2003/net/http/common/utf8.c

/*++

Copyright (c) 2002-2002 Microsoft Corporation

Module Name:

    Utf8.c

Abstract:

    UTF-8 manipulation routines

Author:

    George V. Reilly (GeorgeRe)     01-Apr-2002

Revision History:

--*/

#include "precomp.h"

#if defined(ALLOC_PRAGMA) && defined(KERNEL_PRIV)

#pragma alloc_text( INIT, HttpInitializeUtf8)
#pragma alloc_text( PAGE, HttpUnicodeToUTF8)
#pragma alloc_text( PAGE, HttpUTF8ToUnicode)
#pragma alloc_text( PAGE, HttpUcs4toUtf16)
#pragma alloc_text( PAGE, HttpUnicodeToUTF8Count)
#pragma alloc_text( PAGE, HttpUnicodeToUTF8Encode)
#pragma alloc_text( PAGE, HttpUtf8RawBytesToUnicode)

#endif // ALLOC_PRAGMA && KERNEL_PRIV

#if 0   // Non-Pageable Functions
NOT PAGEABLE -- 
#endif // Non-Pageable Functions


//
// Utf8OctetCount[b] is the total length, in octets, of the UTF-8 sequence
// introduced by lead octet b, or 0 if b cannot start a sequence (the trail
// bytes 0x80-0xBF and the prefixes 0xF8-0xFF).
//
// The table classifies by bit pattern only: 0xC0 and 0xC1 are listed as
// two-byte prefixes even though they can only introduce overlong forms.
// HttpInitializeUtf8() cross-checks the table in DBG builds.
//
DECLSPEC_ALIGN(UL_CACHE_LINE)  
const UCHAR
Utf8OctetCount[256] =
{
    // singletons: 0x00 - 0x7F
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   // 0x
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   // 1x
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   // 2x
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   // 3x
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   // 4x
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   // 5x
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   // 6x
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   // 7x

    // UTF-8 trail bytes are not valid lead byte prefixes: 0x80 - 0xBF
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   // 8x
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   // 9x
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   // Ax
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   // Bx

    // two-byte prefixes: 0xC0 - 0xDF
    2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,   // Cx
    2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,   // Dx

    // three-byte prefixes: 0xE0 - 0xEF
    3, 3, 3, 3, 3, 3, 3, 3,   3, 3, 3, 3, 3, 3, 3, 3,   // Ex

    // four-byte prefixes: 0xF0 - 0xF7
    4, 4, 4, 4, 4, 4, 4, 4,                             // F0 - F7

    // invalid prefixes: 0xF8 - 0xFF
                              0, 0, 0, 0, 0, 0, 0, 0,   // F8 - FF
};

// Upper-case hexadecimal digits, indexed by nibble value (0 - 15).
static const char hexArray[] = "0123456789ABCDEF";


VOID
HttpInitializeUtf8(
    VOID
    )
{
#if DBG
    //
    // Debug-only sanity check: every entry of Utf8OctetCount[] must agree
    // with the IS_UTF8_* lead-byte classification macros.
    //
    ULONG Octet = 0;

    while (Octet < 256)
    {
        UCHAR Actual   = UTF8_OCTET_COUNT(Octet);
        UCHAR Expected = 0;     // anything unclassified must map to 0

        if (IS_UTF8_SINGLETON(Octet))
        {
            Expected = 1;
        }
        else if (IS_UTF8_1ST_BYTE_OF_2(Octet))
        {
            Expected = 2;
        }
        else if (IS_UTF8_1ST_BYTE_OF_3(Octet))
        {
            Expected = 3;
        }
        else if (IS_UTF8_1ST_BYTE_OF_4(Octet))
        {
            Expected = 4;
        }

        ASSERT(Expected == Actual);

        ++Octet;
    }
#endif // DBG
} // HttpInitializeUtf8


//
// Some Unicode to UTF-8 conversion utilities taken and modified from
// base\win32\winnls\utf.c. Use these until the same functionality is
// exposed in the kernel.
//

/***************************************************************************++

Routine Description:

    Maps a Unicode (UTF-16) character string to its UTF-8 counterpart.

    A high surrogate immediately followed by a low surrogate is combined
    into one four-byte UTF-8 sequence. A high surrogate that is not
    followed by a low surrogate is emitted as an ordinary three-byte
    sequence, as is every other code unit above UTF8_2_MAX.

Arguments:

    lpSrcStr    - source string of UTF-16 code units

    cchSrc      - number of code units in lpSrcStr to convert

    lpDestStr   - destination buffer; never written when cchDest is 0

    cchDest     - size of lpDestStr in bytes. If 0, nothing is written and
                  the routine only counts the bytes the conversion needs.

Return Value:

    The number of UTF-8 bytes written (or, when cchDest is 0, required).

    Returns 0 if cchDest is non-zero and the destination buffer is too
    small for the whole conversion; the buffer may have been partially
    written in that case.

--***************************************************************************/

ULONG
HttpUnicodeToUTF8(
    IN  PCWSTR  lpSrcStr,
    IN  LONG    cchSrc,
    OUT LPSTR   lpDestStr,
    IN  LONG    cchDest
    )
{
    LPCWSTR     lpWC  = lpSrcStr;
    LONG        cchU8 = 0;                // # of UTF8 chars generated
    ULONG       dwSurrogateChar;
    WCHAR       wchHighSurrogate = 0;     // high surrogate seen, not yet emitted
    BOOLEAN     bHandled;

    //
    // Every "buffer too small" exit below does cchSrc++ before breaking.
    // That undoes the decrement in the loop condition, so (cchSrc >= 0)
    // after the loop means the source was not fully consumed.
    //

    while ((cchSrc--) && ((cchDest == 0) || (cchU8 < cchDest)))
    {
        bHandled = FALSE;

        //
        // Check if high surrogate is available
        //
        if ((*lpWC >= HIGH_SURROGATE_START) && (*lpWC <= HIGH_SURROGATE_END))
        {
            if (cchDest)
            {
                // Another high surrogate, then treat the 1st as normal
                // Unicode character.
                if (wchHighSurrogate)
                {
                    if ((cchU8 + 2) < cchDest)
                    {
                        lpDestStr[cchU8++] = (UCHAR) (UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate));
                        lpDestStr[cchU8++] = (UCHAR) (UTF8_TRAIL     | MIDDLE_6_BIT(wchHighSurrogate));
                        lpDestStr[cchU8++] = (UCHAR) (UTF8_TRAIL     | LOWER_6_BIT(wchHighSurrogate));
                    }
                    else
                    {
                        // not enough buffer
                        cchSrc++;
                        break;
                    }
                }
            }
            else
            {
                //
                // Counting only: charge three bytes for every high
                // surrogate now; a matching low surrogate adds one more.
                //
                cchU8 += 3;
            }
            wchHighSurrogate = *lpWC;
            bHandled = TRUE;
        }

        if (!bHandled && wchHighSurrogate)
        {
            if ((*lpWC >= LOW_SURROGATE_START) && (*lpWC <= LOW_SURROGATE_END))
            {
                 // wheee, valid surrogate pairs

                 if (cchDest)
                 {
                     if ((cchU8 + 3) < cchDest)
                     {
                         dwSurrogateChar = (((wchHighSurrogate-0xD800) << 10) + (*lpWC - 0xDC00) + 0x10000);

                         lpDestStr[cchU8++] = (UTF8_1ST_OF_4 | (UCHAR)(dwSurrogateChar >> 18));             // 3 bits from 1st byte
                         lpDestStr[cchU8++] = (UTF8_TRAIL    | (UCHAR)((dwSurrogateChar >> 12) & 0x3f));    // 6 bits from 2nd byte
                         lpDestStr[cchU8++] = (UTF8_TRAIL    | (UCHAR)((dwSurrogateChar >> 6) & 0x3f));     // 6 bits from 3rd byte
                         lpDestStr[cchU8++] = (UTF8_TRAIL    | (UCHAR)(0x3f &dwSurrogateChar));             // 6 bits from 4th byte
                     }
                     else
                     {
                        // not enough buffer
                        cchSrc++;
                        break;
                     }
                 }
                 else
                 {
                     // we already counted 3 previously (in high surrogate)
                     cchU8 += 1;
                 }

                 bHandled = TRUE;
            }
            else
            {
                 // Bad Surrogate pair : ERROR
                 // Just process wchHighSurrogate , and the code below will
                 // process the current code point
                 if (cchDest)
                 {
                     if ((cchU8 + 2) < cchDest)
                     {
                        lpDestStr[cchU8++] = (UCHAR) (UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate));
                        lpDestStr[cchU8++] = (UCHAR) (UTF8_TRAIL    | MIDDLE_6_BIT(wchHighSurrogate));
                        lpDestStr[cchU8++] = (UCHAR) (UTF8_TRAIL    | LOWER_6_BIT(wchHighSurrogate));
                     }
                     else
                     {
                        // not enough buffer
                        cchSrc++;
                        break;
                     }
                 }
            }

            wchHighSurrogate = 0;
        }

        if (!bHandled)
        {
            if (*lpWC <= UTF8_1_MAX)
            {
                //
                //  Found ASCII.
                //
                if (cchDest)
                {
                    //
                    //  The loop condition only guarantees room on entry
                    //  to this iteration. Flushing a pending unpaired
                    //  high surrogate above may have filled the buffer,
                    //  so the space must be re-checked here.
                    //
                    if (cchU8 < cchDest)
                    {
                        lpDestStr[cchU8++] = (char)*lpWC;
                    }
                    else
                    {
                        //
                        //  Error - buffer too small.
                        //
                        cchSrc++;
                        break;
                    }
                }
                else
                {
                    cchU8++;
                }
            }
            else if (*lpWC <= UTF8_2_MAX)
            {
                //
                //  Found 2 byte sequence if < 0x07ff (11 bits).
                //
                if (cchDest)
                {
                    if ((cchU8 + 1) < cchDest)
                    {
                        //
                        //  Use upper 5 bits in first byte.
                        //  Use lower 6 bits in second byte.
                        //
                        lpDestStr[cchU8++] = (UCHAR) (UTF8_1ST_OF_2 | (*lpWC >> 6));
                        lpDestStr[cchU8++] = (UCHAR) (UTF8_TRAIL    | LOWER_6_BIT(*lpWC));
                    }
                    else
                    {
                        //
                        //  Error - buffer too small.
                        //
                        cchSrc++;
                        break;
                    }
                }
                else
                {
                    cchU8 += 2;
                }
            }
            else
            {
                //
                //  Found 3 byte sequence.
                //
                if (cchDest)
                {
                    if ((cchU8 + 2) < cchDest)
                    {
                        //
                        //  Use upper  4 bits in first byte.
                        //  Use middle 6 bits in second byte.
                        //  Use lower  6 bits in third byte.
                        //
                        lpDestStr[cchU8++] = (UCHAR)(UTF8_1ST_OF_3 | HIGHER_6_BIT(*lpWC));
                        lpDestStr[cchU8++] = (UCHAR)(UTF8_TRAIL    | MIDDLE_6_BIT(*lpWC));
                        lpDestStr[cchU8++] = (UCHAR)(UTF8_TRAIL    | LOWER_6_BIT(*lpWC));
                    }
                    else
                    {
                        //
                        //  Error - buffer too small.
                        //
                        cchSrc++;
                        break;
                    }
                }
                else
                {
                    cchU8 += 3;
                }
            }
        }

        lpWC++;
    }

    //
    // If the last character was a high surrogate, then handle it as a normal
    // unicode character. (cchSrc < 0) means the whole source was consumed.
    // In counting mode its three bytes were already charged when it was seen.
    //
    if ((cchSrc < 0) && (wchHighSurrogate != 0))
    {
        if (cchDest)
        {
            if ((cchU8 + 2) < cchDest)
            {
                lpDestStr[cchU8++] = (UCHAR)(UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate));
                lpDestStr[cchU8++] = (UCHAR)(UTF8_TRAIL    | MIDDLE_6_BIT(wchHighSurrogate));
                lpDestStr[cchU8++] = (UCHAR)(UTF8_TRAIL    | LOWER_6_BIT(wchHighSurrogate));
            }
            else
            {
                // not enough buffer: flag it for the check below
                cchSrc++;
            }
        }
    }

    //
    //  Make sure the destination buffer was large enough.
    //
    if (cchDest && (cchSrc >= 0))
    {
        return 0;
    }

    //
    //  Return the number of UTF-8 characters written.
    //
    return cchU8;

} // HttpUnicodeToUTF8


/***************************************************************************++

Routine Description:

    Maps a UTF-8 character string to its wide character (UTF-16) string
    counterpart. A four-byte UTF-8 sequence is written as a high/low
    surrogate pair.

Arguments:

    lpSrcStr    - UTF-8 source bytes

    cchSrc      - number of bytes in lpSrcStr

    lpDestStr   - destination buffer; never written when *pcchDest is 0

    pcchDest    - on input, the size of lpDestStr in WCHARs, or 0 to only
                  count the WCHARs the conversion needs. On STATUS_SUCCESS
                  it receives the number of WCHARs generated. It is not
                  modified on failure.

    dwFlags     - if exactly 1, the call fails on any invalid byte: a trail
                  byte with no lead byte, a lead byte that interrupts an
                  unfinished sequence, a non-shortest form, an invalid or
                  out-of-range lead byte, or a source that ends in the
                  middle of a sequence. For any other value such bytes are
                  skipped and conversion continues.

Return Value:

    STATUS_SUCCESS           - *pcchDest holds the number of WCHARs
    STATUS_INVALID_PARAMETER - invalid input with dwFlags == 1, or the
                               conversion generated no WCHARs at all
    STATUS_BUFFER_TOO_SMALL  - lpDestStr cannot hold the whole conversion

    NOTE(review): the "no WCHARs generated" test is made before the
    buffer-size test, so a buffer too small for even the first character
    (e.g. one WCHAR for a surrogate pair) is reported as
    STATUS_INVALID_PARAMETER -- confirm callers expect this.

--***************************************************************************/
NTSTATUS
HttpUTF8ToUnicode(
    IN     LPCSTR lpSrcStr,
    IN     LONG   cchSrc,
       OUT LPWSTR lpDestStr,
    IN OUT PLONG  pcchDest,
    IN     ULONG  dwFlags
    )
{
    LONG        nTB = 0;              // # trail bytes to follow
    LONG        cchWC = 0;            // # of Unicode code points generated
    CONST BYTE* pUTF8 = (CONST BYTE*)lpSrcStr;
    LONG        dwSurrogateChar = 0;     // Full surrogate char
    BOOLEAN     bSurrogatePair = FALSE;  // Indicate we're collecting a
                                         // surrogate pair
    BOOLEAN     bCheckInvalidBytes = (BOOLEAN)(dwFlags == 1);
    BYTE        UTF8;
    LONG        cchDest = *pcchDest;

    //
    // cchSrc is decremented by the loop test, so inside the body it is the
    // number of bytes that FOLLOW the current one. A "buffer too small"
    // exit re-increments it so that (cchSrc >= 0) after the loop means the
    // source was not fully consumed.
    //
    while ((cchSrc--) && ((cchDest == 0) || (cchWC < cchDest)))
    {
        //
        //  See if there are any trail bytes.
        //
        if (BIT7(*pUTF8) == 0)
        {
            //
            //  Found ASCII. This also abandons any unfinished sequence.
            //
            if (cchDest)
            {
                lpDestStr[cchWC] = (WCHAR)*pUTF8;
            }
            nTB = bSurrogatePair = 0;
            cchWC++;
        }
        else if (BIT6(*pUTF8) == 0)
        {
            //
            //  Found a trail byte.
            //  Note : Ignore the trail byte if there was no lead byte.
            //
            if (nTB != 0)
            {
                //
                //  Decrement the trail byte counter.
                //
                nTB--;

                if (bSurrogatePair)
                {
                    //
                    //  Four-byte sequence: accumulate the code point in
                    //  dwSurrogateChar and split it once it is complete.
                    //
                    dwSurrogateChar <<= 6;
                    dwSurrogateChar |= LOWER_6_BIT(*pUTF8);

                    if (nTB == 0)
                    {
                        if (cchDest)
                        {
                            if ((cchWC + 1) < cchDest)
                            {
                                lpDestStr[cchWC]   = (WCHAR)
                                                     (((dwSurrogateChar - 0x10000) >> 10) + HIGH_SURROGATE_START);

                                lpDestStr[cchWC+1] = (WCHAR)
                                                     ((dwSurrogateChar - 0x10000) % 0x400 + LOW_SURROGATE_START);
                            }
                            else
                            {
                                // Error : Buffer too small
                                cchSrc++;
                                break;
                            }
                        }

                        cchWC += 2;
                        bSurrogatePair = FALSE;
                    }
                }
                else
                {
                    //
                    //  Make room for the trail byte and add the trail byte
                    //  value. The partial character is accumulated in place
                    //  in lpDestStr[cchWC].
                    //
                    if (cchDest)
                    {
                        lpDestStr[cchWC] <<= 6;
                        lpDestStr[cchWC] |= LOWER_6_BIT(*pUTF8);
                        
                    }

                    if (nTB == 0)
                    {
                        //
                        //  End of sequence.  Advance the output counter.
                        //
                        cchWC++;
                    }
                }
            }
            else
            {
                if (bCheckInvalidBytes) 
                {
                    RETURN(STATUS_INVALID_PARAMETER);
                }
                // error - not expecting a trail byte. That is, there is a trailing byte without leading byte.
                bSurrogatePair = FALSE;
            }
        }
        else
        {
            //
            //  Found a lead byte.
            //
            if (nTB > 0)
            {
                // error - A leading byte before the previous sequence is completed.
                if (bCheckInvalidBytes) 
                {
                    RETURN(STATUS_INVALID_PARAMETER);
                }            
                //
                //  Error - previous sequence not finished.
                //
                nTB = 0;
                bSurrogatePair = FALSE;
                // Put this character back so that we can start over another sequence.
                // (The pUTF8++ at the bottom of the loop cancels the pUTF8--.)
                cchSrc++;
                pUTF8--;
            }
            else
            {
                //
                //  Calculate the number of bytes to follow.
                //  Look for the first 0 from left to right.
                //  On exit nTB is the number of leading 1 bits and UTF8
                //  holds the lead byte shifted left by nTB.
                //
                UTF8 = *pUTF8;
                while (BIT7(UTF8) != 0)
                {
                    UTF8 <<= 1;
                    nTB++;
                }

                //
                // Check for non-shortest form.
                // Setting nTB to 0 below marks the lead byte as invalid.
                // 
                switch (nTB) {
                    case 1:
                        nTB = 0;
                        break;
                    case 2:
                        // Make sure that bit 8 ~ bit 11 is not all zero.
                        // 110XXXXx 10xxxxxx
                        if ((*pUTF8 & 0x1e) == 0)
                        {
                            nTB = 0;
                        }
                        break;
                    case 3:
                        // Look ahead to check for non-shortest form.
                        // 1110XXXX 10Xxxxxx 10xxxxxx
                        // The check is skipped when fewer than 2 bytes follow.
                        if (cchSrc >= 2)
                        {
                            if (((*pUTF8 & 0x0f) == 0) && (*(pUTF8 + 1) & 0x20) == 0)
                            {
                                nTB = 0;
                            }
                        }
                        break;
                    case 4:                    
                        //
                        // This is a surrogate unicode pair
                        // It is only treated as one when at least 3 bytes follow.
                        //
                        if (cchSrc >= 3)
                        {
                            // word = lead byte in the high 8 bits, first trail byte in the low 8 bits
                            SHORT word = (((SHORT)*pUTF8) << 8) | *(pUTF8 + 1);
                            // Look ahead to check for non-shortest form.
                            // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx                        
                            // Check for the 5 bits are not all zero.
                            // 0x0730 == 0000 0111 0011 0000
                            if ((word & 0x0730) == 0) 
                            {
                                nTB = 0;
                            } else if ((word & 0x0400) == 0x0400)
                            {
                                // The 21st bit is 1.
                                // Make sure that the resulting Unicode is within the valid surrogate range.
                                // The 4 byte code sequence can hold up to 21 bits, and the maximum valid code point range
                                // that Unicode (with surrogate) could represent are from U+000000 ~ U+10FFFF.
                                // Therefore, if the 21 bit (the most significant bit) is 1, we should verify that the 17 ~ 20
                                // bit are all zero.
                                // I.e., in 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx,
                                // XXXXX can only be 10000.

                                // 0x0330 = 0000 0011 0011 0000
                                if ((word & 0x0330) != 0) 
                                {
                                    nTB = 0;
                                }  
                            }

                            if (nTB != 0)
                            { 
                                // Seed the accumulator with the 3 payload bits of the lead byte.
                                dwSurrogateChar = UTF8 >> nTB;
                                bSurrogatePair = TRUE;
                            }
                        }                        
                        break;
                    default:                    
                        // 
                        // If the bits is greater than 4, this is an invalid
                        // UTF8 lead byte.
                        //
                        nTB = 0;
                        break;
                }

                if (nTB != 0) 
                {
                    //
                    //  Store the value from the first byte and decrement
                    //  the number of bytes to follow.
                    //
                    if (cchDest)
                    {
                        lpDestStr[cchWC] = (WCHAR)(UTF8 >> nTB);
                    }
                    nTB--;
                } else 
                {
                    if (bCheckInvalidBytes) 
                    {
                        RETURN(STATUS_INVALID_PARAMETER);
                    }                 
                }
            }
        }
        pUTF8++;
    }

    if ((bCheckInvalidBytes && nTB != 0) || (cchWC == 0)) 
    {
        // About (bCheckInvalidBytes && nTB != 0):
        // The source ended in the middle of a multi-byte sequence.
        //
        // About (cchWC == 0):
        // Because we now throw away non-shortest form, it is possible that we generate 0 chars.
        // In this case, we have to set error to ERROR_NO_UNICODE_TRANSLATION so that we conform
        // to the spec of MultiByteToWideChar.
        // (Here that error is reported as STATUS_INVALID_PARAMETER.)
        RETURN(STATUS_INVALID_PARAMETER);
    }
    //
    //  Make sure the destination buffer was large enough.
    //
    if (cchDest && (cchSrc >= 0))
    {
        RETURN(STATUS_BUFFER_TOO_SMALL);
    }


    //
    //  Return the number of Unicode characters written.
    //
    *pcchDest = cchWC;

    return STATUS_SUCCESS;

} // HttpUTF8ToUnicode


/***************************************************************************++

Routine Description:

    Convert one UCS-4 code point (32 bits) into its UTF-16 form: a single
    16-bit code unit for U+0000..U+FFFF, or a high/low surrogate pair for
    anything larger.

Arguments:

    UnicodeChar     - UCS-4 code point to convert
    pHighSurrogate  - receives the first (or only) UTF-16 code unit
    pLowSurrogate   - receives the second code unit; set to zero when
                      UnicodeChar <= 0xFFFF

    For UnicodeChar <= 0xFFFF both outputs are written even when the call
    fails. For larger values neither output is written on failure.

Return Value:

    STATUS_SUCCESS, or STATUS_INVALID_PARAMETER if UnicodeChar is a raw
    surrogate, a non-character code point, or larger than UTF8_4_MAX.

--***************************************************************************/

NTSTATUS
HttpUcs4toUtf16(
    IN  ULONG   UnicodeChar, 
    OUT PWCHAR  pHighSurrogate, 
    OUT PWCHAR  pLowSurrogate
    )
{
    ULONG Offset;

    ASSERT(NULL != pHighSurrogate);
    ASSERT(NULL != pLowSurrogate);

    //
    // Basic Multilingual Plane: one code unit, stored unconditionally,
    // then validated.
    //

    if (UnicodeChar <= 0xFFFF)
    {
        NTSTATUS BmpStatus = STATUS_SUCCESS;

        *pHighSurrogate = (WCHAR) UnicodeChar;
        *pLowSurrogate  = 0;

        if (HIGH_SURROGATE_START <= UnicodeChar
                &&  UnicodeChar <= LOW_SURROGATE_END)
        {
            UlTraceError(PARSER, (
                        "http!HttpUcs4toUtf16(): "
                        "Illegal raw surrogate character, U+%04lX.\n",
                        UnicodeChar
                        ));

            BmpStatus = STATUS_INVALID_PARAMETER;
        }

        if ( IS_UNICODE_NONCHAR(UnicodeChar) )
        {
            UlTraceError(PARSER, (
                        "http!HttpUcs4toUtf16(): "
                        "Non-character code point, U+%04lX.\n",
                        UnicodeChar
                        ));

            BmpStatus = STATUS_INVALID_PARAMETER;
        }

        return BmpStatus;
    }

    //
    // Beyond the encodable range.
    //

    if (UnicodeChar > UTF8_4_MAX)
    {
        UlTraceError(PARSER, (
                    "http!HttpUcs4toUtf16(): "
                    "Illegal large character, 0x%08lX.\n",
                    UnicodeChar
                    ));

        return STATUS_INVALID_PARAMETER;
    }

    //
    // Supplementary planes: reject non-characters, otherwise split the
    // 20-bit offset from 0x10000 into two 10-bit halves.
    //

    if ( IS_UNICODE_NONCHAR(UnicodeChar) )
    {
        UlTraceError(PARSER, (
                    "http!HttpUcs4toUtf16(): "
                    "Non-character code point, U+%04lX.\n",
                    UnicodeChar
                    ));

        return STATUS_INVALID_PARAMETER;
    }

    Offset = UnicodeChar - 0x10000;

    *pHighSurrogate = (WCHAR) ((Offset >> 10) + HIGH_SURROGATE_START);

    ASSERT(HIGH_SURROGATE_START <= *pHighSurrogate
            &&  *pHighSurrogate <= HIGH_SURROGATE_END);

    *pLowSurrogate = (WCHAR) ((Offset & 0x3FF) + LOW_SURROGATE_START);

    ASSERT(LOW_SURROGATE_START <= *pLowSurrogate
            &&  *pLowSurrogate <= LOW_SURROGATE_END);

    return STATUS_SUCCESS;

} // HttpUcs4toUtf16


/***************************************************************************++

Routine Description:

    Count number of BYTEs required for UTF-8 conversion of UNICODE string.
    Count is terminated after dwInLen characters.

    Each 16-bit code unit is sized independently, matching the output of
    HttpUnicodeToUTF8Encode:

        0x0000..0x007F : 1 byte  (never hex encoded)
        0x0080..0x07FF : 2 octets
        0x0800..0xFFFF : 3 octets

    where an octet occupies 3 bytes ("%XX") if bEncode is TRUE and
    1 byte otherwise.

Arguments:

    pwszIn  - pointer to input wide-character string

    dwInLen - number of characters in pwszIn. Callers are expected to pass
              a non-zero length (asserted); zero yields a count of 0.

    bEncode - TRUE if we are to hex encode characters >= 0x80

Return Value:

    ULONG   - number of BYTEs required for conversion

--***************************************************************************/
ULONG
HttpUnicodeToUTF8Count(
    IN LPCWSTR pwszIn,
    IN ULONG dwInLen,
    IN BOOLEAN bEncode
    )
{
    ULONG dwCount = 0;
    ULONG cbPerOctet = bEncode ? 3 : 1;     // "%XX" vs. raw byte

    ASSERT(pwszIn != NULL);
    ASSERT(dwInLen != 0);

    //
    // Pre-tested loop: with the ASSERT compiled out, a zero dwInLen must
    // not wrap around and walk off the end of the input.
    //

    for ( ;  dwInLen != 0;  --dwInLen)
    {
        ULONG wchar = *pwszIn++;

        if (wchar <= 0x007F)
        {
            // ASCII is always copied as a single raw byte
            dwCount += 1;
        }
        else if (wchar <= 0x07FF)
        {
            dwCount += 2 * cbPerOctet;
        }
        else
        {
            dwCount += 3 * cbPerOctet;
        }
    }

    return dwCount;

} // HttpUnicodeToUTF8Count


/***************************************************************************++

Routine Description:

    Maps a Unicode character string to its UTF-8 string counterpart,
    optionally hex encoding ("%XX") every octet of a non-ASCII character.

    Conversion stops at the first character that does not fit in the
    output buffer; that is reported as an error and no part of that
    character is written.

    Convert a string of UNICODE characters to UTF-8:

        0000000000000000..0000000001111111: 0xxxxxxx
        0000000010000000..0000011111111111: 110xxxxx 10xxxxxx
        0000100000000000..1111111111111111: 1110xxxx 10xxxxxx 10xxxxxx

    Each 16-bit code unit is converted on its own; surrogate pairs are not
    combined.

Arguments:

    pwszIn      - pointer to input wide-character string

    dwInLen     - number of CHARACTERS in pwszIn to convert. Every one of
                  them is converted, including a terminating NUL if the
                  caller counted it.

    pszOut      - pointer to output narrow-character buffer

    dwOutLen    - number of BYTEs in pszOut

    pdwOutLen   - optional; on success receives the actual number of BYTES
                  written to pszOut. Not written on failure.

    bEncode     - TRUE if we are to hex encode characters >= 0x80

Return Value:

    NTSTATUS
        Success - STATUS_SUCCESS

        Failure - STATUS_INSUFFICIENT_RESOURCES
                    Not enough space in pszOut to store results. The
                    characters converted before the failure remain in
                    pszOut.
    
--***************************************************************************/
NTSTATUS
HttpUnicodeToUTF8Encode(
    IN  LPCWSTR pwszIn,
    IN  ULONG   dwInLen,
    OUT PUCHAR  pszOut,
    IN  ULONG   dwOutLen,
    OUT PULONG  pdwOutLen,
    IN  BOOLEAN bEncode
    )
{
    PUCHAR pOutput = pszOut;                // start of buffer
    ULONG  cbOutput = dwOutLen;             // original capacity
    ULONG  cbPerOctet = bEncode ? 3 : 1;    // "%XX" vs. raw byte

    ASSERT(pwszIn != NULL);
    ASSERT((int)dwInLen > 0);
    ASSERT(pszOut != NULL);
    ASSERT((int)dwOutLen > 0);

    for ( ;  dwInLen != 0;  --dwInLen)
    {
        ULONG wchar = *pwszIn++;
        UCHAR Octets[3];
        ULONG cOctets;
        ULONG cbNeeded;
        ULONG i;

        //
        // ASCII is copied through as one raw byte, never hex encoded.
        //

        if (wchar <= 0x007F)
        {
            if (dwOutLen == 0)
            {
                RETURN(STATUS_INSUFFICIENT_RESOURCES);
            }

            *pszOut++ = (UCHAR)(wchar);
            --dwOutLen;
            continue;
        }

        //
        // Build the raw UTF-8 octets for this character.
        //

        if (wchar >= 0x0800)
        {
            Octets[0] = (UCHAR)(0xE0 | (wchar >> 12));
            Octets[1] = (UCHAR)(0x80 | ((wchar >> 6) & 0x003F));
            Octets[2] = (UCHAR)(0x80 | (wchar & 0x003F));
            cOctets   = 3;
        }
        else
        {
            Octets[0] = (UCHAR)(0xC0 | (wchar >> 6));
            Octets[1] = (UCHAR)(0x80 | (wchar & 0x003F));
            cOctets   = 2;
        }

        //
        // Check space for the whole character before writing any of it,
        // so a failure never leaves a partial sequence behind.
        //

        cbNeeded = cOctets * cbPerOctet;

        if (dwOutLen < cbNeeded)
        {
            RETURN(STATUS_INSUFFICIENT_RESOURCES);
        }

        dwOutLen -= cbNeeded;

        for (i = 0;  i < cOctets;  ++i)
        {
            if (bEncode)
            {
                *pszOut++ = '%';
                *pszOut++ = (UCHAR) hexArray[Octets[i] >> 4];
                *pszOut++ = (UCHAR) hexArray[Octets[i] & 0x0F];
            }
            else
            {
                *pszOut++ = Octets[i];
            }
        }
    }

    ASSERT(pszOut >= pOutput && pszOut <= pOutput + cbOutput);
    UNREFERENCED_PARAMETER(cbOutput);

    if (pdwOutLen)
        *pdwOutLen = (ULONG)(pszOut - pOutput);

    return STATUS_SUCCESS;

} // HttpUnicodeToUTF8Encode


/***************************************************************************++

Routine Description:

    Splice together the bits from a UTF-8 lead byte and 0-3 trail bytes
    into a Unicode character.

    The sequence is rejected if the lead byte is invalid, the buffer is
    too short for the sequence, a trail byte is not of the form 10xxxxxx,
    the encoding is overlong (not the shortest form), the value exceeds
    UTF8_4_MAX, or the decoded value is a surrogate or a non-character
    code point.

Arguments:

    pOctetArray     - Input buffer: Raw lead byte + raw trail bytes
    SourceLength    - Length of pOctetArray, in bytes; must be non-zero
    pUnicodeChar    - decoded character; written only on success
    pOctetsToSkip   - number of bytes consumed from pOctetArray; written
                      only on success

Return Value:

    STATUS_SUCCESS or STATUS_OBJECT_PATH_SYNTAX_BAD

--***************************************************************************/

NTSTATUS
HttpUtf8RawBytesToUnicode(
    IN  PCUCHAR pOctetArray,
    IN  ULONG   SourceLength,
    OUT PULONG  pUnicodeChar,
    OUT PULONG  pOctetsToSkip
    )
{
    ULONG i;
    ULONG UnicodeChar;
    UCHAR LeadByte;
    ULONG OctetCount;

    ASSERT(SourceLength > 0);

    //
    // Do not touch pOctetArray[0] unless the buffer actually holds a byte.
    //

    if (0 == SourceLength)
    {
        UlTraceError(PARSER, (
                    "http!HttpUtf8RawBytesToUnicode(): "
                    "Empty input buffer.\n"
                    ));

        RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
    }

    LeadByte   = pOctetArray[0];
    OctetCount = UTF8_OCTET_COUNT(LeadByte);

    if (0 == OctetCount)
    {
        UlTraceError(PARSER, (
                    "http!HttpUtf8RawBytesToUnicode(): "
                    "Invalid UTF-8 lead byte, %%%02X.\n",
                    LeadByte
                    ));

        RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
    }
    else if (OctetCount > SourceLength)
    {
        UlTraceError(PARSER, (
                    "http!HttpUtf8RawBytesToUnicode(): "
                    "UTF-8 lead byte, %%%02X, requires %lu bytes in buffer, "
                    "but only have %lu.\n",
                    LeadByte, OctetCount, SourceLength
                    ));

        RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
    }

    // Check that the trail bytes are valid: 10xxxxxx.

    for (i = 1;  i < OctetCount;  ++i)
    {
        if (! IS_UTF8_TRAILBYTE(pOctetArray[i]))
        {
            UlTraceError(PARSER, (
                    "http!HttpUtf8RawBytesToUnicode(): "
                    "Invalid trail byte[%lu], %%%02X.\n",
                    i, pOctetArray[i]
                    ));

            RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
        }
    }

    //
    // Now splice together the bits from the lead byte and the trail byte(s).
    // Each multi-byte case also rejects overlong encodings: a value that
    // would have fit in a shorter sequence.
    //

    switch (OctetCount)
    {

    case 1:
        // handle one-byte case:
        //      (0xxx xxxx)
        //          => 0xxx xxxx 

        ASSERT(IS_UTF8_SINGLETON(LeadByte));
        ASSERT(SourceLength >= 1);

        UnicodeChar = LeadByte;

        ASSERT(UnicodeChar <= UTF8_1_MAX);
        break;


    case 2:
        // handle two-byte case:
        //      (110y yyyy,  10xx xxxx)
        //          => 0000 0yyy yyxx xxxx 

        ASSERT(IS_UTF8_1ST_BYTE_OF_2(LeadByte));
        ASSERT(IS_UTF8_TRAILBYTE(pOctetArray[1]));
        ASSERT(SourceLength >= 2);

        UnicodeChar = (
                        ((pOctetArray[0] & 0x1f) << 6) |
                         (pOctetArray[1] & 0x3f)
                      );

        if (UnicodeChar <= UTF8_1_MAX)
        {
            UlTraceError(PARSER, (
                        "http!HttpUtf8RawBytesToUnicode(): "
                        "Overlong 2-byte sequence, "
                        "%%%02X %%%02X = U+%04lX.\n",
                        pOctetArray[0],
                        pOctetArray[1],
                        UnicodeChar
                        ));

            RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
        }

        ASSERT(UTF8_1_MAX < UnicodeChar  &&  UnicodeChar <= UTF8_2_MAX);
        break;


    case 3:
        // handle three-byte case:
        //      (1110 zzzz,  10yy yyyy,  10xx xxxx)
        //          => zzzz yyyy yyxx xxxx 

        ASSERT(IS_UTF8_1ST_BYTE_OF_3(LeadByte));
        ASSERT(IS_UTF8_TRAILBYTE(pOctetArray[1]));
        ASSERT(IS_UTF8_TRAILBYTE(pOctetArray[2]));
        ASSERT(SourceLength >= 3);

        UnicodeChar = (
                        ((pOctetArray[0] & 0x0f) << 12) |
                        ((pOctetArray[1] & 0x3f) <<  6) |
                         (pOctetArray[2] & 0x3f)
                      );

        if (UnicodeChar <= UTF8_2_MAX)
        {
            UlTraceError(PARSER, (
                        "http!HttpUtf8RawBytesToUnicode(): "
                        "Overlong 3-byte sequence, "
                        "%%%02X %%%02X %%%02X = U+%04lX.\n",
                        pOctetArray[0],
                        pOctetArray[1],
                        pOctetArray[2],
                        UnicodeChar
                        ));

            RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
        }

        ASSERT(UTF8_2_MAX < UnicodeChar  &&  UnicodeChar <= UTF8_3_MAX);
        break;


    case 4:
        // handle four-byte case:
        //      (1111 0uuu,  10uu zzzz,  10yy yyyy,  10xx xxxx)
        //          => 000u uuuu zzzz yyyy yyxx xxxx

        ASSERT(IS_UTF8_1ST_BYTE_OF_4(LeadByte));
        ASSERT(IS_UTF8_TRAILBYTE(pOctetArray[1]));
        ASSERT(IS_UTF8_TRAILBYTE(pOctetArray[2]));
        ASSERT(IS_UTF8_TRAILBYTE(pOctetArray[3]));
        ASSERT(SourceLength >= 4);

        UnicodeChar = (
                        ((pOctetArray[0] & 0x07) << 18) |
                        ((pOctetArray[1] & 0x3f) << 12) |
                        ((pOctetArray[2] & 0x3f) <<  6) |
                         (pOctetArray[3] & 0x3f)
                      );

        if (UnicodeChar <= UTF8_3_MAX)
        {
            UlTraceError(PARSER, (
                        "http!HttpUtf8RawBytesToUnicode(): "
                        "Overlong 4-byte sequence, "
                        "%%%02X %%%02X %%%02X %%%02X = U+%06lX.\n",
                        pOctetArray[0],
                        pOctetArray[1],
                        pOctetArray[2],
                        pOctetArray[3],
                        UnicodeChar
                        ));

            RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
        }

        // Not all values in the 21-bit range are valid
        if (UnicodeChar > UTF8_4_MAX)
        {
            UlTraceError(PARSER, (
                        "http!HttpUtf8RawBytesToUnicode(): "
                        "Overlarge 4-byte sequence, "
                        "%%%02X %%%02X %%%02X %%%02X = U+%06lX.\n",
                        pOctetArray[0],
                        pOctetArray[1],
                        pOctetArray[2],
                        pOctetArray[3],
                        UnicodeChar
                        ));

            RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
        }

        ASSERT(UTF8_3_MAX < UnicodeChar  &&  UnicodeChar <= UTF8_4_MAX);
        break;


    default:
        // Utf8OctetCount[] only holds the values 0 through 4
        ASSERT(! "Impossible OctetCount");
        UnicodeChar = 0;
        break;
    }

    //
    // Do not allow characters in the high- or low-surrogate ranges
    // to be UTF-8-encoded directly.
    //

    if (HIGH_SURROGATE_START <= UnicodeChar && UnicodeChar <= LOW_SURROGATE_END)
    {
        UlTraceError(PARSER, (
                    "http!HttpUtf8RawBytesToUnicode(): "
                    "Illegal surrogate character, U+%04lX.\n",
                    UnicodeChar
                    ));

        RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
    }


    // For security reasons we will signal an error for all noncharacter code 
    // points encountered.

    if ( IS_UNICODE_NONCHAR(UnicodeChar) )
    {
        ASSERT( (((LOW_NONCHAR_BOM & UnicodeChar) == LOW_NONCHAR_BOM) && 
         ((UnicodeChar >> 16) <= HIGH_NONCHAR_END)) ||
         ((LOW_NONCHAR_START <= UnicodeChar) && 
         (UnicodeChar <= LOW_NONCHAR_END)) );
    
        UlTraceError(PARSER, (
                    "http!HttpUtf8RawBytesToUnicode(): "
                    "Non-character code point, U+%04lX.\n",
                    UnicodeChar
                    ));

        RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
    }

    *pUnicodeChar = UnicodeChar;
    *pOctetsToSkip = OctetCount;

    return STATUS_SUCCESS;

} // HttpUtf8RawBytesToUnicode