|
|
/*
* @(#)CharEncoder.cxx 1.0 6/10/97 * * Copyright (c) 1997 - 1999 Microsoft Corporation. All rights reserved. * */ #include "stdinc.h"
#include "core.hxx"
#pragma hdrstop
#include "charencoder.hxx"
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
#include <shlwapip.h> // IsCharSpace
#ifdef UNIX
#include <lendian.hpp>
#endif
#ifdef UNIX
// Not needed under UNIX
#else
#ifndef _WIN64
#include <w95wraps.h>
#endif // _WIN64
#endif /* UNIX */
#endif
//
// Delegate other charsets to mlang
//
// Table of the charsets that CharEncoder converts itself.  Each entry supplies,
// in order: codepage, charset (name), maxCharSize, pfnWideCharFromMultiByte and,
// in the old-parser build only, pfnWideCharToMultiByte.
//
// getCharsetInfo() scans this table from the last entry to the first and, in
// the old-parser build, treats indices 0..5 (the WINDOWS-125x entries)
// specially -- keep those six entries at the front of the table.
const EncodingEntry CharEncoder::charsetInfo [] =
{
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
    // Windows code pages, converted through the Win32 conversion routines.
    { CP_1250, _T("WINDOWS-1250"), 1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
    { CP_1251, _T("WINDOWS-1251"), 1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
    { CP_1252, _T("WINDOWS-1252"), 1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
    { CP_1253, _T("WINDOWS-1253"), 1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
    { CP_1254, _T("WINDOWS-1254"), 1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
    { CP_1257, _T("WINDOWS-1257"), 1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
    // UCS-4 / UCS-2 / UTF-8 aliases handled by the hand-written converters.
    { CP_UCS_4, _T("UCS-4"), 4, wideCharFromUcs4Bigendian, wideCharToUcs4Bigendian },
    { CP_UCS_2, _T("ISO-10646-UCS-2"), 2, wideCharFromUcs2Bigendian, wideCharToUcs2Bigendian },
    { CP_UCS_2, _T("UNICODE-2-0-UTF-16"), 2, wideCharFromUcs2Bigendian, wideCharToUcs2Bigendian },
    { CP_UCS_2, _T("UTF-16"), 2, wideCharFromUcs2Bigendian, wideCharToUcs2Bigendian },
    { CP_UTF_8, _T("UNICODE-1-1-UTF-8"), 3, wideCharFromUtf8, wideCharToUtf8 },
    { CP_UTF_8, _T("UNICODE-2-0-UTF-8"), 3, wideCharFromUtf8, wideCharToUtf8 },
#endif
    // Entries that are always present (decode direction only).
    { CP_UCS_2, L"UTF-16", 2, wideCharFromUcs2Bigendian },
    { CP_UCS_2, L"UCS-2", 2, wideCharFromUcs2Bigendian },
    { CP_UTF_8, L"UTF-8", 3, wideCharFromUtf8 },
};

#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
// MLang interface pointer, filled in by _EnsureMultiLanguage().
IMultiLanguage * CharEncoder::pMultiLanguage = NULL;
#endif
/**
 * Allocates an Encoding whose charset is a NUL-terminated copy of the first
 * <code> len </code> characters of <code> s </code>.
 *
 * @param s      charset name; <code> len </code> WCHARs are copied from it
 * @param len    number of WCHARs to copy (terminator not included)
 * @param endian stored in the new object's littleendian field
 * @param mark   stored in the new object's byteOrderMark field
 * @return the new Encoding, or NULL when either allocation yields NULL
 */
Encoding * Encoding::newEncoding(const WCHAR * s, ULONG len, bool endian, bool mark)
{
    Encoding * result = NEW (Encoding());
    if (result != NULL)
    {
        result->charset = NEW (WCHAR[len + 1]);
        if (result->charset != NULL)
        {
            ::memcpy(result->charset, s, sizeof(WCHAR) * len);
            result->charset[len] = 0; // guarantee NUL termination
            result->littleendian = endian;
            result->byteOrderMark = mark;
            return result;
        }

        // The name buffer could not be allocated: release the half-built object.
        delete result;
    }
    return NULL;
}
/**
 * Releases the charset name buffer, if one is attached.
 */
Encoding::~Encoding()
{
    if (charset == NULL)
    {
        return;
    }
    delete [] charset;
}
int CharEncoder::getCharsetInfo(const WCHAR * charset, CODEPAGE * pcodepage, UINT * mCharSize) { #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
CPINFO cpinfo;
#endif
for (int i = LENGTH(charsetInfo) - 1; i >= 0; i--) { //if (StrCmpI(charset, charsetInfo[i].charset) == 0)
if (::FusionpCompareStrings(charset, ::wcslen(charset), charsetInfo[i].charset, ::wcslen(charsetInfo[i].charset), true) == 0) { //
// test whether we can handle it locally or not
// BUGBUG(HACK) the index number may change if we change charsetInfo
//
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
if (i > 5 || GetCPInfo(charsetInfo[i].codepage, &cpinfo)) #endif
{ *pcodepage = charsetInfo[i].codepage; *mCharSize = charsetInfo[i].maxCharSize; return i; } #ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
else { break; } #endif
} // end of if
}// end of for
// xiaoyu: It is assumed that an error would return if neither UTF-8 nor UCS-2
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
//
// delegate to MLANG then
//
MIMECSETINFO mimeCharsetInfo; HRESULT hr;
hr = _EnsureMultiLanguage(); if (hr == S_OK) { hr = pMultiLanguage->GetCharsetInfo((WCHAR*)charset, &mimeCharsetInfo); if (hr == S_OK) { *pcodepage = mimeCharsetInfo.uiInternetEncoding; if (GetCPInfo(*pcodepage, &cpinfo)) *mCharSize = cpinfo.MaxCharSize; else // if we don't know the max size, assume a large size
*mCharSize = 4; return -1; } } #endif
return -2; }
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
// Defined elsewhere in the project.
extern HRESULT CreateMultiLanguage(IMultiLanguage ** ppUnk);

/**
 * Calls CreateMultiLanguage() on the shared pMultiLanguage pointer and
 * returns its result unchanged.
 */
HRESULT CharEncoder::_EnsureMultiLanguage()
{
    const HRESULT hrCreate = CreateMultiLanguage(&pMultiLanguage);
    return hrCreate;
}
#endif
/**
 * get information about a code page identified by <code> encoding </code>
 *
 * @param encoding                 encoding to resolve; its charset name and
 *                                 littleendian flag are read
 * @param pcodepage                receives the code page
 * @param pfnWideCharFromMultiByte receives the decoder to use; only written
 *                                 when the function returns S_OK
 * @param mCharSize                receives the maximum character size
 * @return S_OK on success, E_FAIL when the encoding is unknown or (old-parser
 *         build) MLang cannot convert it to UCS-2
 */
HRESULT CharEncoder::getWideCharFromMultiByteInfo(Encoding * encoding, CODEPAGE * pcodepage, WideCharFromMultiByteFunc ** pfnWideCharFromMultiByte, UINT * mCharSize)
{
    HRESULT hr = S_OK;

    int i = getCharsetInfo(encoding->charset, pcodepage, mCharSize);
    if (i >= 0) // in our short list
    {
        switch (*pcodepage)
        {
        case CP_UCS_2:
            // The table only lists the big-endian routine; pick by byte order.
            if (encoding->littleendian)
                *pfnWideCharFromMultiByte = wideCharFromUcs2Littleendian;
            else
                *pfnWideCharFromMultiByte = wideCharFromUcs2Bigendian;
            break;
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
        case CP_UCS_4:
            if (encoding->littleendian)
                *pfnWideCharFromMultiByte = wideCharFromUcs4Littleendian;
            else
                *pfnWideCharFromMultiByte = wideCharFromUcs4Bigendian;
            break;
#endif
        default:
            *pfnWideCharFromMultiByte = charsetInfo[i].pfnWideCharFromMultiByte;
            break;
        }
    }
    // xiaoyu : we do not deal this case
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
    else if (i == -1) // delegate to MLANG
    {
        hr = pMultiLanguage->IsConvertible(*pcodepage, CP_UCS_2);
        if (S_OK == hr)
            *pfnWideCharFromMultiByte = wideCharFromMultiByteMlang;
        else
            // Any other result (including a non-failure code such as S_FALSE)
            // leaves the decoder pointer unset, so report a real failure --
            // this matches getWideCharToMultiByteInfo().
            hr = E_FAIL;
    }
#endif
    else // invalid encoding
    {
        hr = E_FAIL;
    }
    return hr;
}
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Resolves the encoder (UNICODE -> multi-byte) for <code> encoding </code>.
 *
 * @param encoding               encoding to resolve; its charset name and
 *                               littleendian flag are read
 * @param pcodepage              receives the code page
 * @param pfnWideCharToMultiByte receives the encoder to use; only written when
 *                               the function returns S_OK
 * @param mCharSize              receives the maximum character size
 * @return S_OK on success, E_FAIL otherwise
 */
HRESULT CharEncoder::getWideCharToMultiByteInfo(Encoding * encoding, CODEPAGE * pcodepage, WideCharToMultiByteFunc ** pfnWideCharToMultiByte, UINT * mCharSize)
{
    const int index = getCharsetInfo(encoding->charset, pcodepage, mCharSize);

    if (index == -1) // delegate to MLANG
    {
        if (pMultiLanguage->IsConvertible(CP_UCS_2, *pcodepage) != S_OK)
        {
            return E_FAIL;
        }
        *pfnWideCharToMultiByte = wideCharToMultiByteMlang;
        return S_OK;
    }

    if (index < 0) // unknown encoding
    {
        return E_FAIL;
    }

    // In our short list: UCS-2 and UCS-4 are chosen by byte order, everything
    // else comes straight from the table.
    if (*pcodepage == CP_UCS_2)
    {
        if (encoding->littleendian)
            *pfnWideCharToMultiByte = wideCharToUcs2Littleendian;
        else
            *pfnWideCharToMultiByte = wideCharToUcs2Bigendian;
    }
    else if (*pcodepage == CP_UCS_4)
    {
        if (encoding->littleendian)
            *pfnWideCharToMultiByte = wideCharToUcs4Littleendian;
        else
            *pfnWideCharToMultiByte = wideCharToUcs4Bigendian;
    }
    else
    {
        *pfnWideCharToMultiByte = charsetInfo[index].pfnWideCharToMultiByte;
    }
    return S_OK;
}
#endif
/**
 * Scans bytebuffer and translates UTF8 characters into UNICODE characters
 *
 * @param pdwMode    unused
 * @param codepage   unused
 * @param bytebuffer UTF-8 input
 * @param cb         in: number of input bytes available;
 *                   out: number of input bytes consumed
 * @param buffer     output buffer for the decoded WCHARs
 * @param cch        in: capacity of <code> buffer </code> in WCHARs;
 *                   out: number of WCHARs written
 * @return S_OK.  Illegal sequences are not reported through the return value;
 *         they are stored as 0xFFFF (see below).
 *
 * A multi-byte sequence that is cut off by the end of the input is left
 * unconsumed so that the caller can supply it again once more data arrives.
 */
HRESULT CharEncoder::wideCharFromUtf8(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer, UINT * cb, WCHAR * buffer, UINT * cch)
{
    UNUSED(pdwMode);
    UNUSED(codepage);

#if 0
    // Just for the record - I tried this and measured it and it's twice as
    // slow as our hand-crafted code.

    // Back up if end of buffer is the second or third byte of a multi-byte
    // encoding since MultiByteToWideChar cannot handle this case. These second
    // and third bytes are easy to identify - they always start with the bit
    // pattern 10xxxxxx.
    UINT remaining = 0;
    UINT count;
    int endpos = (int)*cb;

    while (endpos > 0 && (bytebuffer[endpos-1] & 0xc0) == 0x80)
    {
        endpos--;
        remaining++;
    }
    if (endpos > 0)
    {
        count = MultiByteToWideChar(CP_UTF8, 0, bytebuffer, endpos, buffer, *cch);
        if (count == 0)
        {
            return HRESULT_FROM_WIN32(GetLastError());
        }
    }
#else
    UINT remaining = *cb;
    UINT count = 0;
    UINT max = *cch;
    ULONG ucs4;

    // UTF-8 multi-byte encoding. See Appendix A.2 of the Unicode book for more info.
    //
    // Unicode value      1st byte  2nd byte  3rd byte  4th byte
    // 000000000xxxxxxx   0xxxxxxx
    // 00000yyyyyxxxxxx   110yyyyy  10xxxxxx
    // zzzzyyyyyyxxxxxx   1110zzzz  10yyyyyy  10xxxxxx
    // 110110wwwwzzzzyy+  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx
    // 110111yyyyxxxxxx, where uuuuu = wwww + 1
    WCHAR c;
    bool valid = true;

    while (remaining > 0 && count < max)
    {
        // This is an optimization for straight runs of 7-bit ascii
        // inside the UTF-8 data.
        c = *bytebuffer;
        if (c & 0x80)   // check 8th-bit and get out of here
            break;      // so we can do proper UTF-8 decoding.
        *buffer++ = c;
        bytebuffer++;
        count++;
        remaining--;
    }

    while (remaining > 0 && count < max)
    {
        // Count the leading 1 bits of the lead byte: that is the length of
        // the sequence it announces (0 for plain ASCII).
        UINT bytes = 0;
        for (c = *bytebuffer; c & 0x80; c <<= 1)
            bytes++;

        if (bytes == 0)
            bytes = 1;

        // NOTE(review): a stray continuation byte (10xxxxxx) yields bytes == 1
        // and is passed through as a one-byte character below -- confirm
        // whether that leniency is intended.

        if (bytes > 6)
        {
            // 0xFE / 0xFF can never start a UTF-8 sequence.  Consume exactly
            // that one byte and flag it, so the input position and the
            // consumed-byte accounting stay in step and the decoder always
            // makes progress.
            bytebuffer++;
            remaining--;
            *buffer++ = 0xffff;
            count++;
            continue;
        }

        if (remaining < bytes)
        {
            // Incomplete sequence: leave it for the next call.
            break;
        }

        c = 0;
        switch ( bytes )
        {
            case 6: bytebuffer++;    // We do not handle ucs4 chars
            case 5: bytebuffer++;    // except those on plane 1
                    valid = false;
                    // fall through
            case 4:
                    // Do we have enough buffer?
                    if (count >= max - 1)
                        goto Cleanup;

                    // surrogate pairs
                    ucs4 = ULONG(*bytebuffer++ & 0x07) << 18;
                    if ((*bytebuffer & 0xc0) != 0x80)
                        valid = false;
                    ucs4 |= ULONG(*bytebuffer++ & 0x3f) << 12;
                    if ((*bytebuffer & 0xc0) != 0x80)
                        valid = false;
                    ucs4 |= ULONG(*bytebuffer++ & 0x3f) << 6;
                    if ((*bytebuffer & 0xc0) != 0x80)
                        valid = false;
                    ucs4 |= ULONG(*bytebuffer++ & 0x3f);

                    // A four-byte sequence must encode 0x10000..0x10ffff.
                    // Values below 0x10000 are overlong encodings; without
                    // this check the subtraction below would wrap around and
                    // produce bogus surrogates.
                    if (ucs4 < 0x10000 || ucs4 > 0x10ffff)
                        valid = false;

                    if (valid)
                    {
                        // first ucs2 char
                        *buffer++ = static_cast<WCHAR>((ucs4 - 0x10000) / 0x400 + 0xd800);
                        count++;
                        // second ucs2 char
                        c = static_cast<WCHAR>((ucs4 - 0x10000) % 0x400 + 0xdc00);
                    }
                    break;

            case 3: c  = WCHAR(*bytebuffer++ & 0x0f) << 12;    // 0x0800 - 0xffff
                    if ((*bytebuffer & 0xc0) != 0x80)
                        valid = false;
                    // fall through
            case 2: c |= WCHAR(*bytebuffer++ & 0x3f) << 6;     // 0x0080 - 0x07ff
                    if ((*bytebuffer & 0xc0) != 0x80)
                        valid = false;
                    c |= WCHAR(*bytebuffer++ & 0x3f);
                    break;

            case 1:
                    c = WCHAR(*bytebuffer++);                  // 0x0000 - 0x007f
                    break;

            default:
                    valid = false; // not a valid UTF-8 character.
                    break;
        }

        // If the multibyte sequence was illegal, store a FFFF character code.
        // The Unicode spec says this value may be used as a signal like this.
        // This will be detected later by the parser and an error generated.
        // We don't throw an exception here because the parser would not yet know
        // the line and character where the error occurred and couldn't produce a
        // detailed error message.
        if (! valid)
        {
            c = 0xffff;
            valid = true;
        }

        *buffer++ = c;
        count++;
        remaining -= bytes;
    }
#endif

Cleanup:
    // tell caller that there are bytes remaining in the buffer to
    // be processed next time around when we have more data.
    *cb -= remaining;
    *cch = count;
    return S_OK;
}
/**
 * Decodes big-endian UCS-2 from bytebuffer into UNICODE characters.
 *
 * @param pdwMode    unused
 * @param codepage   unused
 * @param bytebuffer input bytes, two per character, high byte first
 * @param cb         in: number of input bytes; out: number of bytes consumed
 * @param buffer     output buffer
 * @param cch        in: capacity of <code> buffer </code> in WCHARs;
 *                   out: number of WCHARs written
 * @return S_OK
 */
HRESULT CharEncoder::wideCharFromUcs2Bigendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer, UINT * cb, WCHAR * buffer, UINT * cch)
{
    UNUSED(codepage);
    UNUSED(pdwMode);

    // Convert as many whole characters as both buffers allow.
    const UINT available = *cb >> 1;
    const UINT units = (available > *cch) ? *cch : available;

    for (UINT k = 0; k < units; ++k)
    {
        const BYTE * pair = bytebuffer + 2 * k;
        buffer[k] = (pair[0] << 8) | pair[1];
    }

    *cch = units;
    *cb = units << 1;
    return S_OK;
}
/**
 * Decodes little-endian UCS-2 from bytebuffer into UNICODE characters.
 *
 * @param pdwMode    unused
 * @param codepage   unused
 * @param bytebuffer input bytes, two per character, low byte first
 * @param cb         in: number of input bytes; out: number of bytes consumed
 * @param buffer     output buffer
 * @param cch        in: capacity of <code> buffer </code> in WCHARs;
 *                   out: number of WCHARs written
 * @return S_OK
 */
HRESULT CharEncoder::wideCharFromUcs2Littleendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer, UINT * cb, WCHAR * buffer, UINT * cch)
{
    UNUSED(codepage);
    UNUSED(pdwMode);

    // Ucs2 is two byte unicode.
    const UINT available = *cb / 2;
    const UINT units = (available > *cch) ? *cch : available;

#ifndef UNIX
    // Optimization for windows platform where little endian maps directly to WCHAR.
    // (This increases overall parser performance by 5% for large unicode files !!)
    ::memcpy(buffer, bytebuffer, units * sizeof(WCHAR));
#else
    // Assemble each character by hand: second byte is the high byte.
    for (UINT k = 0; k < units; ++k)
    {
        const BYTE * pair = bytebuffer + 2 * k;
        buffer[k] = (pair[1] << 8) | pair[0];
    }
#endif

    *cch = units;
    *cb = units * 2;
    return S_OK;
}
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Decodes big-endian UCS-4 from bytebuffer into UNICODE characters.
 *
 * In the non-UNIX build only the low two bytes of each character are kept;
 * a character whose two high bytes are not both zero aborts the conversion
 * with XML_E_INVALID_UNICODE (in that case *cb and *cch are left untouched).
 *
 * @param pdwMode    not referenced
 * @param codepage   not referenced
 * @param bytebuffer input bytes, four per character, most significant first
 * @param cb         in: number of input bytes; out: number of bytes consumed
 * @param buffer     output buffer
 * @param cch        in: capacity of <code> buffer </code> in WCHARs;
 *                   out: number of WCHARs written
 * @return S_OK, or XML_E_INVALID_UNICODE as described above
 */
HRESULT CharEncoder::wideCharFromUcs4Bigendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer, UINT * cb, WCHAR * buffer, UINT * cch)
{
    const UINT available = *cb >> 2;
    const UINT units = (available > *cch) ? *cch : available;

    for (UINT k = 0; k < units; ++k)
    {
        const BYTE * quad = bytebuffer + 4 * k;
#ifndef UNIX
        if (quad[0] != 0 || quad[1] != 0)
        {
            return XML_E_INVALID_UNICODE;
        }
        buffer[k] = (quad[2] << 8) | quad[3];
#else
        buffer[k] = (quad[0] << 24) | (quad[1] << 16) | (quad[2] << 8) | quad[3];
#endif
    }

    *cch = units;
    *cb = units << 2;
    return S_OK;
}
#endif
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Decodes little-endian UCS-4 from bytebuffer into UNICODE characters.
 *
 * In the non-UNIX build only the low two bytes of each character are kept;
 * the character is stored first and then, if its two high bytes are not both
 * zero, the conversion aborts with XML_E_INVALID_UNICODE (in that case *cb
 * and *cch are left untouched).
 *
 * @param pdwMode    not referenced
 * @param codepage   not referenced
 * @param bytebuffer input bytes, four per character, least significant first
 * @param cb         in: number of input bytes; out: number of bytes consumed
 * @param buffer     output buffer
 * @param cch        in: capacity of <code> buffer </code> in WCHARs;
 *                   out: number of WCHARs written
 * @return S_OK, or XML_E_INVALID_UNICODE as described above
 */
HRESULT CharEncoder::wideCharFromUcs4Littleendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer, UINT * cb, WCHAR * buffer, UINT * cch)
{
    // Ucs4 uses four bytes per character.
    const UINT available = *cb >> 2;
    const UINT units = (available > *cch) ? *cch : available;

    for (UINT k = 0; k < units; ++k)
    {
        const BYTE * quad = bytebuffer + 4 * k;
#ifndef UNIX
        buffer[k] = (quad[1] << 8) | quad[0];
        if (quad[2] != 0 || quad[3] != 0)
        {
            return XML_E_INVALID_UNICODE;
        }
#else
        buffer[k] = (quad[3] << 24) | (quad[2] << 16) | (quad[1] << 8) | quad[0];
#endif
    }

    *cch = units;
    *cb = units << 2;
    return S_OK;
}
#endif
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Scans bytebuffer and translates characters of charSet identified by
 * <code> codepage </code> into UNICODE characters,
 * using Win32 function MultiByteToWideChar() for encoding
 *
 * @param pdwMode    not referenced
 * @param codepage   code page passed to MultiByteToWideChar()
 * @param bytebuffer multi-byte input
 * @param cb         number of input bytes (not modified)
 * @param buffer     output buffer
 * @param cch        in: capacity of <code> buffer </code> in WCHARs;
 *                   out: value returned by MultiByteToWideChar()
 * @return S_OK on success (including empty input), otherwise the Win32 error
 *         wrapped as a failure HRESULT
 */
HRESULT CharEncoder::wideCharFromMultiByteWin32(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer, UINT * cb, WCHAR * buffer, UINT * cch)
{
    HRESULT hr = S_OK;

    // MultiByteToWideChar() rejects a zero-length input as an invalid
    // parameter; nothing to convert is not an error for this routine.
    if (*cb == 0)
    {
        *cch = 0;
        return S_OK;
    }

    *cch = ::MultiByteToWideChar(codepage, MB_PRECOMPOSED, (char*)bytebuffer, *cb, buffer, *cch);
    if (*cch == 0)
    {
        // GetLastError() yields a Win32 error code, not an HRESULT.  Stored
        // raw it would be a positive value that passes SUCCEEDED(), so wrap it.
        hr = HRESULT_FROM_WIN32(::GetLastError());
    }
    return hr;
}
#endif
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Translates the multi-byte characters in bytebuffer into UNICODE characters
 * by forwarding every argument to MLang's ConvertStringToUnicode().
 *
 * @return S_OK when both checked calls complete
 */
HRESULT CharEncoder::wideCharFromMultiByteMlang(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer, UINT * cb, WCHAR * buffer, UINT * cch)
{
    HRESULT hr;

    // NOTE(review): checkhr2 is a project macro; it presumably stores the
    // result in 'hr' and returns it on failure -- confirm against its definition.
    checkhr2(_EnsureMultiLanguage());
    checkhr2(pMultiLanguage->ConvertStringToUnicode(pdwMode, codepage, (char*)bytebuffer, cb, buffer, cch ));

    return S_OK;
}
#endif
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Encodes UNICODE characters from buffer as big-endian UCS-2 in bytebuffer.
 *
 * @param pdwMode    not referenced
 * @param codepage   not referenced
 * @param buffer     input characters
 * @param cch        in: number of input WCHARs; out: number consumed
 * @param bytebuffer output buffer, two bytes per character, high byte first
 * @param cb         in: capacity of <code> bytebuffer </code> in bytes;
 *                   out: number of bytes written
 * @return S_OK
 */
HRESULT CharEncoder::wideCharToUcs2Bigendian(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer, UINT *cch, BYTE* bytebuffer, UINT * cb)
{
    const UINT room = (*cb) >> 1;
    const UINT units = (room > *cch) ? *cch : room;

    // BUGBUG - what do we do about Unix where WCHAR is 4 bytes ?
    // Currently we just throw away the high WORD - but I don't know how else
    // to do it, since UCS2 is 2-byte unicode by definition.
    for (UINT k = 0; k < units; ++k)
    {
        const WCHAR wc = buffer[k];
        bytebuffer[2 * k]     = wc >> 8;
        bytebuffer[2 * k + 1] = wc & 0xFF;
    }

    *cch = units;
    *cb = units << 1;
    return S_OK;
}
#endif
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Encodes UNICODE characters from buffer as little-endian UCS-2 in bytebuffer.
 *
 * @param pdwMode    not referenced
 * @param codepage   not referenced
 * @param buffer     input characters
 * @param cch        in: number of input WCHARs; out: number consumed
 * @param bytebuffer output buffer, two bytes per character, low byte first
 * @param cb         in: capacity of <code> bytebuffer </code> in bytes;
 *                   out: number of bytes written
 * @return S_OK
 */
HRESULT CharEncoder::wideCharToUcs2Littleendian(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer, UINT *cch, BYTE* bytebuffer, UINT * cb)
{
    const UINT room = (*cb) >> 1;
    const UINT units = (room > *cch) ? *cch : room;

    // BUGBUG - what do we do about Unix where WCHAR is 4 bytes ?
    // Currently we just throw away the high WORD - but I don't know how else
    // to do it, since UCS2 is 2-byte unicode by definition.
#ifndef UNIX
    // Optimization for windows platform where little endian maps directly to WCHAR.
    // (This increases overall parser performance by 5% for large unicode files !!)
    ::memcpy(bytebuffer, buffer, units * sizeof(WCHAR));
#else
    for (UINT k = 0; k < units; ++k)
    {
        const WCHAR wc = buffer[k];
        bytebuffer[2 * k]     = wc & 0xFF;
        bytebuffer[2 * k + 1] = wc >> 8;
    }
#endif

    *cch = units;
    *cb = units << 1;
    return S_OK;
}
#endif
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Encodes UNICODE characters from buffer as big-endian UCS-4 in bytebuffer.
 *
 * In the non-UNIX build the two most significant output bytes are always 0.
 *
 * @param pdwMode    not referenced
 * @param codepage   not referenced
 * @param buffer     input characters
 * @param cch        in: number of input WCHARs; out: number consumed
 * @param bytebuffer output buffer, four bytes per character
 * @param cb         in: capacity of <code> bytebuffer </code> in bytes;
 *                   out: number of bytes written
 * @return S_OK
 */
HRESULT CharEncoder::wideCharToUcs4Bigendian(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer, UINT *cch, BYTE* bytebuffer, UINT * cb)
{
    const UINT room = (*cb) >> 2;
    const UINT units = (room > *cch) ? *cch : room;

    for (UINT k = 0; k < units; ++k)
    {
        const WCHAR wc = buffer[k];
        BYTE * out = bytebuffer + 4 * k;
#ifndef UNIX
        out[0] = 0;
        out[1] = 0;
        out[2] = wc >> 8;
        out[3] = wc & 0xFF;
#else
        out[0] = wc >> 24;
        out[1] = (wc >> 16) & 0xFF;
        out[2] = (wc >> 8) & 0xFF;
        out[3] = wc & 0xFF;
#endif
    }

    *cch = units;
    *cb = units << 2;
    return S_OK;
}
#endif
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Encodes UNICODE characters from buffer as little-endian UCS-4 in bytebuffer.
 *
 * In the non-UNIX build the two most significant output bytes are always 0.
 *
 * @param pdwMode    not referenced
 * @param codepage   not referenced
 * @param buffer     input characters
 * @param cch        in: number of input WCHARs; out: number consumed
 * @param bytebuffer output buffer, four bytes per character
 * @param cb         in: capacity of <code> bytebuffer </code> in bytes;
 *                   out: number of bytes written
 * @return S_OK
 */
HRESULT CharEncoder::wideCharToUcs4Littleendian(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer, UINT *cch, BYTE* bytebuffer, UINT * cb)
{
    const UINT room = (*cb) >> 2;
    const UINT units = (room > *cch) ? *cch : room;

    for (UINT k = 0; k < units; ++k)
    {
        const WCHAR wc = buffer[k];
        BYTE * out = bytebuffer + 4 * k;
#ifndef UNIX
        out[0] = wc & 0xFF;
        out[1] = wc >> 8;
        out[2] = 0;
        out[3] = 0;
#else
        out[0] = wc & 0xFF;
        out[1] = (wc >> 8) & 0xFF;
        out[2] = (wc >> 16) & 0xFF;
        out[3] = wc >> 24;
#endif
    }

    *cch = units;
    *cb = units << 2;
    return S_OK;
}
#endif
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Scans buffer and translates Unicode characters into UTF8 characters
 *
 * A surrogate pair is written as one four-byte sequence.  A high surrogate
 * that is not followed by a low surrogate is written as a three-byte
 * sequence.  A high surrogate that is the last input character is left
 * unconsumed so that it can be paired on the next call.
 *
 * @param pdwMode    not referenced
 * @param codepage   not referenced
 * @param buffer     input characters
 * @param cch        in: number of input WCHARs; out: number consumed
 * @param bytebuffer output buffer
 * @param cb         in: capacity of <code> bytebuffer </code> in bytes;
 *                   out: number of bytes written
 * @return S_OK; conversion simply stops early when the output buffer is full
 */
HRESULT CharEncoder::wideCharToUtf8(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer, UINT *cch, BYTE* bytebuffer, UINT * cb)
{
    const UINT num = *cch;
    const UINT capacity = *cb;
    UINT count = 0;          // bytes written so far; never exceeds capacity
    DWORD dw1 = 0;           // pending high surrogate while 'surrogate' is set
    bool surrogate = false;
    UINT i;                  // input chars left; read again after the loop

    // All space checks below use (capacity - count), which cannot wrap because
    // count <= capacity.  Deriving limits as capacity-1/-2/-3 would underflow
    // for buffers smaller than three bytes and let the writes overrun them.
    for (i = num; i > 0; i--)
    {
#ifdef UNIX
        // Solaris a WCHAR is 4 bytes (DWORD)
        DWORD dw = 0;
        DWORD dwTemp[4];
        BYTE* pByte = (BYTE*)buffer;
        dwTemp[3] = (DWORD)pByte[0];
        dwTemp[2] = (DWORD)pByte[1];
        dwTemp[1] = (DWORD)pByte[2];
        dwTemp[0] = (DWORD)pByte[3];
        dw = dwTemp[0]+(dwTemp[1]<<8)+(dwTemp[2]<<16)+(dwTemp[3]<<24);
#else
        DWORD dw = *buffer;
#endif

        if (surrogate) // is it the second char of a surrogate pair?
        {
            if (dw >= 0xdc00 && dw <= 0xdfff)
            {
                // four bytes 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                if (capacity - count < 4)
                    break;
                count += 4;
                ULONG ucs4 = (dw1 - 0xd800) * 0x400 + (dw - 0xdc00) + 0x10000;
                *bytebuffer++ = (BYTE)(( ucs4 >> 18) | 0xF0);
                *bytebuffer++ = (BYTE)((( ucs4 >> 12) & 0x3F) | 0x80);
                *bytebuffer++ = (BYTE)((( ucs4 >> 6) & 0x3F) | 0x80);
                *bytebuffer++ = (BYTE)(( ucs4 & 0x3F) | 0x80);
                surrogate = false;
                buffer++;
                continue;
            }
            else // Then dw1 must be a three byte character
            {
                if (capacity - count < 3)
                    break;
                count += 3;
                *bytebuffer++ = (BYTE)(( dw1 >> 12) | 0xE0);
                *bytebuffer++ = (BYTE)((( dw1 >> 6) & 0x3F) | 0x80);
                *bytebuffer++ = (BYTE)(( dw1 & 0x3F) | 0x80);
            }
            surrogate = false;
            // fall through and encode dw itself
        }

        if (dw < 0x80) // one byte, 0xxxxxxx
        {
            if (capacity - count < 1)
                break;
            count++;
            *bytebuffer++ = (BYTE)dw;
        }
        else if (dw < 0x800) // two bytes, 110xxxxx 10xxxxxx
        {
            if (capacity - count < 2)
                break;
            count += 2;
            *bytebuffer++ = (BYTE)((dw >> 6) | 0xC0);
            *bytebuffer++ = (BYTE)((dw & 0x3F) | 0x80);
        }
        else if (dw >= 0xd800 && dw <= 0xdbff) // Assume that it is the first char of surrogate pair
        {
            if (i == 1) // last wchar in buffer
                break;
            dw1 = dw;
            surrogate = true;
        }
        else // three bytes, 1110xxxx 10xxxxxx 10xxxxxx
        {
            if (capacity - count < 3)
                break;
            count += 3;
            *bytebuffer++ = (BYTE)(( dw >> 12) | 0xE0);
            *bytebuffer++ = (BYTE)((( dw >> 6) & 0x3F) | 0x80);
            *bytebuffer++ = (BYTE)(( dw & 0x3F) | 0x80);
        }
        buffer++;
    }

    // A still-pending high surrogate was stepped over but not written, so it
    // is not counted as consumed.
    *cch = surrogate ? num - i - 1 : num - i;
    *cb = count;

    return S_OK;
}
#endif
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Scans buffer and translates Unicode characters into characters identified
 * by <code> codepage </code>, using Win32 function WideCharToMultiByte for encoding
 *
 * @param pdwMode    not referenced
 * @param codepage   code page passed to WideCharToMultiByte()
 * @param buffer     input characters
 * @param cch        number of input WCHARs (not modified)
 * @param bytebuffer output buffer
 * @param cb         in: capacity of <code> bytebuffer </code> in bytes;
 *                   out: value returned by WideCharToMultiByte()
 * @return S_OK on success (including empty input); S_FALSE when the
 *         conversion reported that a default character was substituted;
 *         otherwise the Win32 error wrapped as a failure HRESULT
 */
HRESULT CharEncoder::wideCharToMultiByteWin32(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer, UINT *cch, BYTE* bytebuffer, UINT * cb)
{
    HRESULT hr = S_OK;
    BOOL fBadChar = FALSE;

    // WideCharToMultiByte() rejects a zero-length input as an invalid
    // parameter; nothing to convert is not an error for this routine.
    if (*cch == 0)
    {
        *cb = 0;
        return S_OK;
    }

    // The second argument is the flags DWORD, so pass 0 rather than NULL.
    *cb = ::WideCharToMultiByte(codepage, 0, buffer, *cch, (char*)bytebuffer, *cb, NULL, &fBadChar);
    if (*cb == 0)
    {
        // GetLastError() yields a Win32 error code, not an HRESULT.  Stored
        // raw it would be a positive value that passes SUCCEEDED(), so wrap it.
        hr = HRESULT_FROM_WIN32(::GetLastError());
    }
    else if (fBadChar)
    {
        // BUGBUG: how do we inform the caller which character failed?
        hr = S_FALSE;
    }
    return hr;
}
#endif
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Translates the Unicode characters in buffer into characters of the charSet
 * identified by <code> codepage </code> by forwarding every argument to
 * MLang's ConvertStringFromUnicode().
 *
 * @return S_OK when both checked calls complete
 */
HRESULT CharEncoder::wideCharToMultiByteMlang(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer, UINT *cch, BYTE* bytebuffer, UINT * cb)
{
    HRESULT hr;

    // NOTE(review): checkhr2 is a project macro; it presumably stores the
    // result in 'hr' and returns it on failure -- confirm against its definition.
    checkhr2(_EnsureMultiLanguage());
    checkhr2(pMultiLanguage->ConvertStringFromUnicode(pdwMode, codepage, buffer, cch, (char*)bytebuffer, cb ));

    return S_OK;
}
#endif
|