|
|
/*
* @(#)EncodingStream.cxx 1.0 6/10/97 * * Copyright (c) 1997 - 1999 Microsoft Corporation. All rights reserved. * */ #include "stdinc.h"
#include "core.hxx"
#include "xmlhelper.hxx"
#include "encodingstream.hxx"
#pragma hdrstop
const int EncodingStream::BUFFERSIZE = 4096*sizeof(WCHAR); //////////////////////////////////////////////////////////////////////////////////
EncodingStream::EncodingStream(IStream * pStream): stream(pStream), encoding(NULL), buf(NULL) { #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
// These objects are sometimes handed out to external clients.
::IncrementComponents(); #endif
pfnWideCharFromMultiByte = NULL; #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
pfnWideCharToMultiByte = NULL; #endif
btotal = bnext = startAt = 0; lastBuffer = false; bufsize = 0; _fEOF = false; _fReadStream = true; _fUTF8BOM = false; //_fTextXML = false;
//_fSetCharset = false;
_dwMode = 0; codepage = CP_UNDEFINED; } //////////////////////////////////////////////////////////////////////////////////
/**
* Builds the EncodingStream for input. * Reads the first two bytes of the InputStream * in order to make a guess * as to the character encoding of the file. */ IStream * EncodingStream::newEncodingStream(IStream * pStream) { EncodingStream * es = NEW (EncodingStream(pStream)); if (es == NULL) return NULL;
es->AddRef(); // xwu@@ : check this addRef()!
es->isInput = true; es->buf = NULL;
return es; } //////////////////////////////////////////////////////////////////////////////////
EncodingStream::~EncodingStream() { if (buf) delete [] buf; if (encoding != NULL) delete encoding;
stream = NULL; // smart pointer
} //////////////////////////////////////////////////////////////////////////////////
/**
* Reads characters from stream and encode it to Unicode */ HRESULT STDMETHODCALLTYPE EncodingStream::Read(void * pv, ULONG cb, ULONG * pcbRead) { HRESULT hr; ULONG num = 0;
if (pcbRead != NULL) *pcbRead = 0;
if (btotal == 0 && _fEOF) // we already hit EOF - so return right away.
return S_OK;
// Calculate how many UNICODE chars we are allowed to return,
// xiaoyu : which is the same as the number of BYTES read from the file
cb /= sizeof(WCHAR); checkhr2(prepareForInput(cb));
if (stream && _fReadStream) { // btotal = number of bytes already in start of buffer.
if (cb > btotal) { hr = stream->Read(buf + btotal, cb - btotal, &num);
// Let's show what we've seen in the debugger so that we can diagnose bad manifests
// more easily. mgrier 12/28/2000
if (::FusionpDbgWouldPrintAtFilterLevel(FUSION_DBG_LEVEL_XMLSTREAM)) { ::FusionpDbgPrintEx( FUSION_DBG_LEVEL_XMLSTREAM, "SXS.DLL: Read %lu bytes from XML stream; HRESULT returned = 0x%08lx\n", num, hr);
if (num > 0) { ::FusionpDbgPrintBlob( FUSION_DBG_LEVEL_XMLSTREAM, buf + btotal, num, L" "); } }
if (hr == E_PENDING && num > 0) { // in which case we ignore the error, and continue on !!.
// BUGBUG - this may be a problem.since we are changing the
// return code returned from the stream. This may mean we
// should not ever hand out this stream outside of MSXML.
hr = 0; } if (FAILED(hr)) { return hr; } if (btotal == 0 && num == 0) { _fEOF = true; return hr; } } else { hr = S_OK; } } else if (btotal == 0) { return (lastBuffer) ? S_FALSE : E_PENDING; }
btotal += num; UINT b = btotal, utotal = cb;
if (b > cb) { // If we have more bytes in our buffer than the caller has
// room for, then only return the number of bytes the caller
// asked for -- otherwise pfnWideCharFromMultiByte will write
// off the end of the caller's buffer.
b = cb; } if (pfnWideCharFromMultiByte == NULL) // first read() call
{ checkhr2(autoDetect()); if (pfnWideCharFromMultiByte == NULL) // failed to fully determine encoding
return (lastBuffer) ? S_FALSE : E_PENDING; b -= bnext; startAt -= bnext; } hr = (this->pfnWideCharFromMultiByte)(&_dwMode, codepage, buf + bnext, &b, (WCHAR *)pv, &utotal); if (hr != S_OK) return hr; if (b == 0 && num == 0 && (stream || lastBuffer)) { // stream says we're at the end, but pfnWideCharFromMultiByte
// disagrees !!
::FusionpDbgPrintEx( FUSION_DBG_LEVEL_ERROR, "SXS.DLL: XML Parser found incomplete encoding\n");
return XML_E_INCOMPLETE_ENCODING; } bnext += b; if (pcbRead != NULL) *pcbRead = utotal*sizeof(WCHAR); return (utotal == 0) ? E_PENDING : S_OK; } //////////////////////////////////////////////////////////////////////////////////
/**
* Checks the first two/four bytes of the input Stream in order to * detect UTF-16/UCS-4 or UTF-8 encoding; * otherwise assume it is UTF-8
* xiaoyu : since only UCS-2 and UTF-8 are support, we do not deal with others... */ HRESULT EncodingStream::autoDetect() { // wait until we have enough to be sure.
if (btotal < 2) return S_OK;
unsigned int guess = (((unsigned char)buf[0]) << 8) + ((unsigned char)buf[1]); HRESULT hr;
if (guess == 0xFEFF || guess == 0xFFFE) // BOM found
{ // wait until we have enough to be sure.
if (btotal < 4) return S_OK; unsigned int guess1 = (((unsigned char)buf[2]) << 8) + ((unsigned char)buf[3]); if (guess == guess1) { /*
if (!encoding) { static const WCHAR* wchUCS4 = TEXT("UCS-4"); encoding = Encoding::newEncoding(wchUCS4, 5, (0xFFFE == guess), true); } bnext = 4; */ // FUSION_XML_PARSER does not support UCS4
return XML_E_INVALIDENCODING; } else { if (!encoding) { static const WCHAR* wchUCS2 = L"UCS-2"; encoding = Encoding::newEncoding(wchUCS2, 5, (0xFFFE == guess), true); } bnext = 2; }
if (NULL == encoding) return E_OUTOFMEMORY; encoding->littleendian = (0xFFFE == guess); } else { if (!encoding) { encoding = Encoding::newEncoding(); // default encoding : UTF-8
if (NULL == encoding) return E_OUTOFMEMORY; }
// In some system, such as win2k, there is BOM 0xEF BB BF for UTF8
if (guess == 0xEFBB) { if (btotal < 3) return S_OK; if (buf[2] == 0xBF) _fUTF8BOM = true; bnext = 3; } else { encoding->byteOrderMark = false; } }
checkhr2(CharEncoder::getWideCharFromMultiByteInfo(encoding, &codepage, &pfnWideCharFromMultiByte, &maxCharSize)); return S_OK; } /////////////////////////////////////////////////////////////////////////////////////////
/**
* Switchs the character encoding of the input stream * Returns: * S_OK: succeeded, and do not need re-read * S_FALSE: succeeded, needs to re-read from <code> newPosition </code> * Otherwise: error code * Notice: * This method only works for input stream, newPosition starts with 1 */ HRESULT EncodingStream::switchEncodingAt(Encoding * newEncoding, int newPosition) { // Ignore encoding information in the document when charset information is set from outside
// xwu: fusion xml parsed does not use Charset
//if (_fSetCharset)
// return S_OK;
int l = newPosition - startAt; if (l < 0 || l > (int)bnext) { // out of range
delete newEncoding; return E_INVALIDARG; }
UINT newcodepage; UINT newCharSize; //
// get and check charset information
//
WideCharFromMultiByteFunc * pfn; HRESULT hr = CharEncoder::getWideCharFromMultiByteInfo(newEncoding, &newcodepage, &pfn, &newCharSize); if (hr != S_OK) { delete newEncoding; return E_INVALIDARG; } if (codepage == newcodepage) { delete newEncoding; return S_OK; }
// Now if we are in UCS-2/UCS-4 we cannot switch out of UCS-2/UCS-4 and if we are
// not in UCS-2/UCS-4 we cannot switch into UCS-2/UCS-4.
// Also if UTF-8 BOM is presented, we cannot switch away
if ((codepage != CP_UCS_2 && newcodepage == CP_UCS_2) || (codepage == CP_UCS_2 && newcodepage != CP_UCS_2) || /* xuw: fusion xml parser only support UTF-8 and UCS-2
(codepage != CP_UCS_4 && newcodepage == CP_UCS_4) || (codepage == CP_UCS_4 && newcodepage != CP_UCS_4) || */ (codepage == CP_UTF_8 && newcodepage != CP_UTF_8 && _fUTF8BOM)) { delete newEncoding; return E_FAIL; }
// Ok, then, let's make the switch.
delete encoding; encoding = newEncoding; maxCharSize = newCharSize; codepage = newcodepage; pfnWideCharFromMultiByte = pfn;
// Because the XML declaration is encoded in UTF-8,
// Mapping input characters to wide characters is one-to-one mapping
if ((int)bnext != l) { bnext = l; return S_FALSE; } return S_OK; }
//////////////////////////////////////////////////////////////////////////////////
// minlen is the number of UNICODE, which is the same number of byte we read from the file
HRESULT EncodingStream::prepareForInput(ULONG minlen) { Assert(btotal >= bnext); btotal -= bnext;
if (bufsize < minlen) { BYTE* newbuf = NEW (BYTE[minlen]); if (newbuf == NULL) { return E_OUTOFMEMORY; }
if (buf){ ::memcpy(newbuf, buf+bnext, btotal); delete[] buf; }
buf = newbuf; bufsize = minlen; } else if (bnext > 0 && btotal > 0) { // Shift remaining bytes down to beginning of buffer.
::memmove(buf, buf + bnext, btotal); }
startAt += bnext; bnext = 0; return S_OK; } //////////////////////////////////////////////////////////////////////////////////
// xiaoyu : here it assumes that it is a BYTE buffer, not a WCHAR byte, so it can be copied directly
HRESULT EncodingStream::AppendData( const BYTE* buffer, ULONG length, BOOL fLastBuffer) { Assert(btotal >= bnext); lastBuffer = (fLastBuffer != FALSE); HRESULT hr; ULONG minlen = length + (btotal - bnext); // make sure we don't loose any data
if (minlen < BUFFERSIZE) minlen = BUFFERSIZE; checkhr2( prepareForInput(minlen)); // guarantee enough space in the array
if (length > 0 && buffer != NULL){ // Copy raw data into new buffer.
::memcpy(buf + btotal, buffer, length); btotal += length; } if (pfnWideCharFromMultiByte == NULL) // first AppendData call
{ checkhr2(autoDetect()); }
return hr; } //////////////////////////////////////////////////////////////////////////////////
HRESULT EncodingStream::BufferData() { HRESULT hr = S_OK; checkhr2(prepareForInput(0)); // 0 is used just for shift down (so bnext=0).
if (_fEOF) // already hit the end of the stream.
return S_FALSE;
const DWORD BUFSIZE = 4096;
DWORD dwRead = 1;
while (S_OK == hr && dwRead > 0) { // if we cannot fit another buffer full, then re-allocate.
DWORD minsize = (btotal+BUFSIZE > bufsize) ? bufsize + BUFSIZE : bufsize; checkhr2( prepareForInput(minsize)); // make space available.
dwRead = 0; hr = stream->Read(buf + btotal, BUFSIZE, &dwRead); btotal += dwRead; }
if (SUCCEEDED(hr) && dwRead == 0) { _fEOF = true; hr = S_FALSE; // return S_FALSE when at eof.
} return hr; }
|