windows-server-2003/base/win32/fusion/xmlparser/encodingstream.cxx


								/*

								 * @(#)EncodingStream.cxx 1.0 6/10/97

								 *

								* Copyright (c) 1997 - 1999 Microsoft Corporation. All rights reserved. *

								 */

								#include "stdinc.h"

								#include "core.hxx"

								#include "xmlhelper.hxx"

								#include "encodingstream.hxx"

								#pragma hdrstop


								// Lower bound, in bytes, on the buffer capacity that AppendData() requests
								// from prepareForInput(): room for 4096 WCHAR-sized units.
								const int EncodingStream::BUFFERSIZE = 4096*sizeof(WCHAR);

								//////////////////////////////////////////////////////////////////////////////////

								/**
								 * Builds an empty encoding stream on top of pStream.
								 *
								 * No buffer is allocated and no decoder is selected yet; both are set up
								 * lazily (see prepareForInput() and autoDetect()).  isInput and
								 * maxCharSize are not initialised here - they are assigned later by
								 * newEncodingStream() and autoDetect()/switchEncodingAt() respectively.
								 */
								EncodingStream::EncodingStream(IStream * pStream)
								    : stream(pStream)
								    , encoding(NULL)
								    , buf(NULL)
								    , pfnWideCharFromMultiByte(NULL)
								    , btotal(0)
								    , bnext(0)
								    , startAt(0)
								    , lastBuffer(false)
								    , bufsize(0)
								    , _fEOF(false)
								    , _fReadStream(true)
								    , _fUTF8BOM(false)
								    , _dwMode(0)
								    , codepage(CP_UNDEFINED)
								{
								    // Nothing else to do: all state is set by the initializer list.
								}


								//////////////////////////////////////////////////////////////////////////////////

								/**
								 * Factory: wraps pStream in a new EncodingStream configured for input.
								 *
								 * No bytes are read from pStream here.  The character encoding is worked
								 * out later by autoDetect(), which runs from the first Read() or
								 * AppendData() call.
								 *
								 * Returns the new object with AddRef() already called once on it, or NULL
								 * when the allocation fails.
								 */
								IStream * EncodingStream::newEncodingStream(IStream * pStream)
								{
								    EncodingStream * const wrapper = NEW (EncodingStream(pStream));

								    if (wrapper != NULL)
								    {
								        // REVIEW (carried over from the original author): should the
								        // stream cursor be rewound to a known good point here?

								        wrapper->AddRef(); // xwu@@ : check this addRef()!

								        wrapper->buf = NULL;
								        wrapper->isInput = true;
								    }

								    return wrapper;
								}

								//////////////////////////////////////////////////////////////////////////////////

								/**
								 * Frees the raw byte buffer and the Encoding object held by this stream,
								 * then clears the reference to the wrapped stream.
								 */
								EncodingStream::~EncodingStream()
								{
								    if (buf != NULL)
								    {
								        delete [] buf;
								    }
								    buf = NULL;

								    if (encoding)
								    {
								        delete encoding;
								    }
								    encoding = NULL;

								    // Original author's note: 'stream' is a smart pointer, so this
								    // assignment is what lets go of the wrapped stream.
								    stream = NULL;
								}

								//////////////////////////////////////////////////////////////////////////////////

								/**
								 * Reads raw bytes - from the wrapped stream, or from data already placed
								 * in buf by AppendData()/BufferData() - and converts them to WCHARs.
								 *
								 * pv      - caller's output buffer; receives the converted WCHARs.
								 * cb      - size of pv in BYTES.  It is divided by sizeof(WCHAR) to get the
								 *           number of WCHARs that may be written; that same number also
								 *           caps how many raw bytes are fetched and converted per call.
								 * pcbRead - optional; receives the number of BYTES written to pv.
								 *
								 * Returns:
								 *   S_OK      - characters were produced, or EOF had already been reached
								 *               (in which case *pcbRead stays 0).
								 *   E_PENDING - nothing could be produced yet (no data buffered, the
								 *               encoding is not yet determined, or the converter
								 *               produced zero characters).
								 *   S_FALSE   - no data available / encoding undetermined and lastBuffer
								 *               is set.
								 *   XML_E_INCOMPLETE_ENCODING - input ended while the converter could not
								 *               consume the remaining bytes.
								 *   Otherwise the failure code from the stream, prepareForInput(),
								 *   autoDetect() or the converter is passed through.
								 */
								HRESULT STDMETHODCALLTYPE EncodingStream::Read(void * pv, ULONG cb, ULONG * pcbRead)

								{

								    HRESULT hr;


								    // num = bytes obtained from the wrapped stream during this call.
								    ULONG num = 0;


								    if (pcbRead != NULL)

								        *pcbRead = 0;


								    if (btotal == 0 && _fEOF)          // we already hit EOF - so return right away.

								        return S_OK;


								    // Calculate how many UNICODE chars we are allowed to return,

								    // xiaoyu : which is the same as the number of BYTES read from the file

								    cb /= sizeof(WCHAR);

								    // Drop the bytes consumed by the previous call (bnext) and make sure
								    // buf can hold at least cb bytes.
								    checkhr2(prepareForInput(cb));


								    if (stream && _fReadStream)

								    {

								        // btotal = number of bytes already in start of buffer.

								        if (cb > btotal)

								        {

								            // Top the buffer up to cb bytes.
								            hr = stream->Read(buf + btotal, cb - btotal, &num);


								            // Let's show what we've seen in the debugger so that we can diagnose bad manifests

								            // more easily.  mgrier 12/28/2000


								            if (::FusionpDbgWouldPrintAtFilterLevel(FUSION_DBG_LEVEL_XMLSTREAM))

								            {

								                ::FusionpDbgPrintEx(

								                    FUSION_DBG_LEVEL_XMLSTREAM,

								                    "SXS.DLL: Read %lu bytes from XML stream; HRESULT returned = 0x%08lx\n", num, hr);


								                if (num > 0)

								                {

								                    ::FusionpDbgPrintBlob(

								                        FUSION_DBG_LEVEL_XMLSTREAM,

								                        buf + btotal,

								                        num,

								                        L"   ");

								                }

								            }


								            if ((hr == E_PENDING) && (num > 0))

								            {

								                // in which case we ignore the error, and continue on !!.

								                // BUGBUG - this may be a problem.since we are changing the

								                // return code returned from the stream.  This may mean we

								                // should not ever hand out this stream outside of MSXML.

								                hr = 0;

								            }

								            if (FAILED(hr))

								            {

								                return hr;

								            }

								            // Nothing left over from before and nothing new: remember EOF so
								            // that later calls return immediately.
								            if (btotal == 0 && num == 0)

								            {

								                _fEOF = true;

								                return hr;

								            }

								        }

								        else

								        {

								            // Enough bytes are already buffered; no stream read needed.
								            hr = S_OK;

								        }

								    }

								    // Not reading from a stream (data is supplied via AppendData) and the
								    // buffer is empty.
								    else if (btotal == 0)

								    {

								    	return (lastBuffer) ? S_FALSE : E_PENDING;

								    }


								    btotal += num;

								    // b: bytes offered to the converter; utotal: WCHAR capacity of pv.
								    // NOTE(review): both are presumably in/out parameters of the converter
								    // (bytes consumed / WCHARs produced on return), judging by how they are
								    // used below - confirm against the WideCharFromMultiByteFunc contract.
								    UINT b = btotal, utotal = cb;


								    if (b > cb)

								    {

								        // If we have more bytes in our buffer than the caller has

								        // room for, then only return the number of bytes the caller

								        // asked for -- otherwise pfnWideCharFromMultiByte will write

								        // off the end of the caller's buffer.

								        b = cb;

								    }

								    if (pfnWideCharFromMultiByte == NULL) // first read() call

								    {

								        checkhr2(autoDetect());

								        if (pfnWideCharFromMultiByte == NULL) // failed to fully determine encoding

								            return (lastBuffer) ? S_FALSE : E_PENDING;

								        // autoDetect() leaves the BOM length (if any) in bnext.  Exclude the
								        // BOM from the bytes to convert, and pre-subtract it from startAt
								        // because the next prepareForInput() adds bnext to startAt.
								        b -= bnext;

								        startAt -= bnext;

								    }

								    hr = (this->pfnWideCharFromMultiByte)(&_dwMode, codepage, buf + bnext, &b, (WCHAR *)pv, &utotal);

								    if (hr != S_OK)

								        return hr;

								    if (b == 0 && num == 0 && (stream || lastBuffer))

								    {

								        // stream says we're at the end, but pfnWideCharFromMultiByte

								        // disagrees !!

								        ::FusionpDbgPrintEx(

								            FUSION_DBG_LEVEL_ERROR,

								            "SXS.DLL: XML Parser found incomplete encoding\n");


								        return XML_E_INCOMPLETE_ENCODING;

								    }

								    // Record how far into buf the converter got; prepareForInput() discards
								    // these bytes on the next call.
								    bnext += b;

								    if (pcbRead != NULL)

								        *pcbRead = utotal*sizeof(WCHAR);

								    return (utotal == 0) ? E_PENDING : S_OK;

								}

								//////////////////////////////////////////////////////////////////////////////////

								/**
								 * Inspects the first bytes in buf to pick the input decoder.
								 *
								 *   FE FF / FF FE          -> UCS-2 (the two BOM bytes are skipped)
								 *   same BOM pair repeated -> rejected with XML_E_INVALIDENCODING
								 *                             (the Fusion parser does not support UCS-4)
								 *   EF BB BF               -> UTF-8 with BOM (the three bytes are skipped)
								 *   anything else          -> UTF-8, nothing skipped
								 *
								 * On success with enough data, encoding, codepage, maxCharSize and
								 * pfnWideCharFromMultiByte are filled in and bnext holds the BOM length.
								 *
								 * Returns S_OK both when detection completed and when more bytes are
								 * needed; in the latter case pfnWideCharFromMultiByte is left NULL, which
								 * callers use to tell the two apart.  Returns E_OUTOFMEMORY if the
								 * Encoding object cannot be created, or the error from
								 * CharEncoder::getWideCharFromMultiByteInfo().
								 */
								HRESULT EncodingStream::autoDetect()
								{
								    // wait until we have enough to be sure.
								    if (btotal < 2)
								        return S_OK;

								    unsigned int guess = (((unsigned char)buf[0]) << 8) + ((unsigned char)buf[1]);
								    HRESULT hr;

								    if (guess == 0xFEFF || guess == 0xFFFE) // BOM found
								    {
								        // wait until we have enough to be sure.
								        if (btotal < 4)
								            return S_OK;

								        unsigned int guess1 = (((unsigned char)buf[2]) << 8) + ((unsigned char)buf[3]);
								        if (guess == guess1)
								        {
								            // FUSION_XML_PARSER does not support UCS4
								            return XML_E_INVALIDENCODING;
								        }

								        if (!encoding)
								        {
								            // NOTE(review): LENGTH(wchUCS2) presumably counts the
								            // terminating NUL as well (6), whereas the UTF-8 branch below
								            // passes 5 - confirm that Encoding::newEncoding tolerates this.
								            static const WCHAR wchUCS2[] = L"UCS-2";
								            encoding = Encoding::newEncoding(wchUCS2, LENGTH(wchUCS2), (0xFFFE == guess), true);
								        }
								        bnext = 2; // skip the two BOM bytes

								        if (NULL == encoding)
								            return E_OUTOFMEMORY;
								        encoding->littleendian =  (0xFFFE == guess);
								    }
								    else
								    {
								        if (!encoding)
								        {
								            encoding = Encoding::newEncoding(L"UTF-8", 5, false, false);
								            if (NULL == encoding)
								                return E_OUTOFMEMORY;
								        }

								        // In some system, such as win2k, there is BOM 0xEF BB BF for UTF8
								        bool fHasUTF8BOM = false;
								        if (guess == 0xEFBB)
								        {
								            // Need the third byte before deciding.
								            if (btotal < 3)
								                return S_OK;

								            fHasUTF8BOM = (((unsigned char)buf[2]) == 0xBF);
								        }

								        if (fHasUTF8BOM)
								        {
								            _fUTF8BOM = true;
								            bnext = 3; // skip the three BOM bytes
								        }
								        else
								        {
								            // Only a complete EF BB BF is a BOM.  EF BB followed by any
								            // other byte is ordinary data and must not be skipped, so
								            // bnext is left untouched here.
								            encoding->byteOrderMark = false;
								        }
								    }

								    checkhr2(CharEncoder::getWideCharFromMultiByteInfo(encoding, &codepage, &pfnWideCharFromMultiByte, &maxCharSize));
								    return S_OK;
								}

								/////////////////////////////////////////////////////////////////////////////////////////

								/**
								 * Switches the character encoding of the input stream.
								 *
								 * Ownership of newEncoding always passes to this method: it is either
								 * installed as the current encoding or deleted before returning.
								 *
								 * Returns:
								 *   S_OK         - succeeded, no re-read needed (this includes the case
								 *                  where the code page is unchanged).
								 *   S_FALSE      - succeeded; the caller must re-read from newPosition.
								 *   E_INVALIDARG - newPosition lies outside the bytes consumed so far, or
								 *                  no converter is available for newEncoding.
								 *   E_FAIL       - the switch is not allowed (into or out of UCS-2, or
								 *                  away from UTF-8 when a UTF-8 BOM was seen).
								 *
								 * Notice (from the original author): this only works for input streams,
								 * and newPosition starts with 1.
								 */
								HRESULT EncodingStream::switchEncodingAt(Encoding * newEncoding, int newPosition)
								{
								    // xwu: the fusion xml parser does not use an externally supplied
								    // charset, so the encoding named in the document is always considered.

								    // Position relative to the start of the current buffer contents.
								    const int offset = newPosition - startAt;
								    if (offset < 0 || offset > (int)bnext)
								    {
								        // out of range
								        delete newEncoding;
								        return E_INVALIDARG;
								    }

								    // Look up the converter for the requested encoding.
								    UINT newcodepage;
								    UINT newCharSize;
								    WideCharFromMultiByteFunc * pfn;
								    if (CharEncoder::getWideCharFromMultiByteInfo(newEncoding, &newcodepage, &pfn, &newCharSize) != S_OK)
								    {
								        delete newEncoding;
								        return E_INVALIDARG;
								    }

								    // Same code page: nothing to change.
								    if (newcodepage == codepage)
								    {
								        delete newEncoding;
								        return S_OK;
								    }

								    // We can neither switch into nor out of UCS-2, and we cannot leave
								    // UTF-8 once a UTF-8 BOM has been seen.
								    const bool fromUcs2 = (codepage == CP_UCS_2);
								    const bool toUcs2 = (newcodepage == CP_UCS_2);
								    const bool leavesBomUtf8 = (codepage == CP_UTF_8) && (newcodepage != CP_UTF_8) && _fUTF8BOM;
								    if (fromUcs2 != toUcs2 || leavesBomUtf8)
								    {
								        delete newEncoding;
								        return E_FAIL;
								    }

								    // Ok, then, let's make the switch.
								    if (encoding != NULL)
								    {
								        delete encoding;
								    }
								    encoding = newEncoding;
								    pfnWideCharFromMultiByte = pfn;
								    codepage = newcodepage;
								    maxCharSize = newCharSize;

								    // Because the XML declaration is encoded in UTF-8, input bytes map
								    // one-to-one onto wide characters, so the character offset can be
								    // used directly as a byte offset into the buffer.
								    const bool needsReread = ((int)bnext != offset);
								    if (needsReread)
								    {
								        bnext = offset;
								    }
								    return needsReread ? S_FALSE : S_OK;
								}


								//////////////////////////////////////////////////////////////////////////////////

								// Discards the bytes already consumed by the decoder (the first bnext bytes
								// of buf), moves the unread remainder to the front of the buffer, and makes
								// sure the buffer can hold at least minlen bytes.
								//
								// minlen - required capacity in bytes.  When called from Read() this is the
								//          number of UNICODE chars requested, which that caller treats as the
								//          same number of bytes to read from the file.
								//
								// On success btotal is reduced by bnext, startAt is advanced by bnext, bnext
								// is reset to 0 and S_OK is returned.  If the buffer has to grow and the
								// allocation fails, E_OUTOFMEMORY is returned and btotal, bnext, startAt and
								// the buffer contents are all left exactly as they were.
								HRESULT EncodingStream::prepareForInput(ULONG minlen)
								{
								    Assert(btotal >= bnext);

								    if (bufsize < minlen)
								    {
								        // Allocate BEFORE touching any counters so that a failure does not
								        // leave btotal decremented while bnext still points past the same
								        // consumed bytes.
								        BYTE* newbuf = NEW (BYTE[minlen]);
								        if (newbuf == NULL) {
								            return E_OUTOFMEMORY;
								        }

								        btotal -= bnext;

								        if (buf){
								            // Carry the unread bytes over to the start of the new buffer.
								            ::memcpy(newbuf, buf+bnext, btotal);
								            delete[] buf;
								        }

								        buf = newbuf;
								        bufsize = minlen;
								    }
								    else
								    {
								        btotal -= bnext;

								        if (bnext > 0 && btotal > 0)
								        {
								            // Shift remaining bytes down to beginning of buffer.
								            ::memmove(buf, buf + bnext, btotal);
								        }
								    }

								    startAt += bnext;
								    bnext = 0;
								    return S_OK;
								}

								//////////////////////////////////////////////////////////////////////////////////

								// Appends a block of raw, still-encoded bytes to the internal buffer.
								//
								// buffer      - bytes to append; treated as a plain BYTE buffer (not WCHARs),
								//               so it is copied verbatim.  Ignored when NULL or length is 0.
								// length      - number of bytes in buffer.
								// fLastBuffer - non-zero when no further data will follow; stored in
								//               lastBuffer.
								//
								// Runs autoDetect() as long as no converter has been chosen yet.  Returns
								// S_OK on success, otherwise the code reported by prepareForInput() or
								// autoDetect().
								HRESULT EncodingStream::AppendData( const BYTE* buffer, ULONG length, BOOL fLastBuffer)
								{
								    Assert(btotal >= bnext);

								    lastBuffer = fLastBuffer ? true : false;

								    HRESULT hr;

								    // Space for the unread bytes plus the new ones (so that no data is
								    // lost), but never less than BUFFERSIZE.
								    const ULONG pending = length + (btotal - bnext);
								    const ULONG capacity = (pending < BUFFERSIZE) ? BUFFERSIZE : pending;
								    checkhr2( prepareForInput(capacity));

								    if (buffer != NULL && length > 0)
								    {
								        // Copy raw data in behind the bytes that are still unread.
								        ::memcpy(buf + btotal, buffer, length);
								        btotal += length;
								    }

								    if (NULL == pfnWideCharFromMultiByte) // encoding not determined yet
								    {
								        checkhr2(autoDetect());
								    }

								    return hr;
								}

								//////////////////////////////////////////////////////////////////////////////////

								/**
								 * Pulls data from the wrapped stream into the internal buffer in 4096-byte
								 * reads, growing the buffer as required, until a read returns no bytes or
								 * something other than S_OK.
								 *
								 * Returns S_FALSE once the end of the stream has been reached (also when
								 * it had been reached on an earlier call); otherwise the HRESULT of the
								 * last stream read, or the error from prepareForInput().
								 */
								HRESULT EncodingStream::BufferData()
								{
								    HRESULT hr = S_OK;

								    // A zero-length request only shifts the unread bytes down (so bnext=0).
								    checkhr2(prepareForInput(0));

								    // already hit the end of the stream.
								    if (_fEOF)
								    {
								        return S_FALSE;
								    }

								    const DWORD cbChunk = 4096;
								    DWORD cbGot;

								    do
								    {
								        // if we cannot fit another chunk, then re-allocate.
								        DWORD cbWanted = bufsize;
								        if (btotal + cbChunk > bufsize)
								        {
								            cbWanted = bufsize + cbChunk;
								        }
								        checkhr2( prepareForInput(cbWanted)); // make space available.

								        cbGot = 0;
								        hr = stream->Read(buf + btotal, cbChunk, &cbGot);
								        btotal += cbGot;
								    }
								    while (S_OK == hr && cbGot > 0);

								    // Stopped on an error, or on a non-S_OK result that still delivered
								    // data: hand the stream's code back unchanged.
								    if (cbGot != 0 || FAILED(hr))
								    {
								        return hr;
								    }

								    // A successful read that delivered nothing means end of stream.
								    _fEOF = true;
								    return S_FALSE;
								}