/*
 * @(#)CharEncoder.cxx 1.0 6/10/97
 *
 * Copyright (c) 1997 - 1999 Microsoft Corporation. All rights reserved.
 *
 */
#include "stdinc.h"
#include "core.hxx"
#pragma hdrstop
#include "charencoder.hxx"
// Charsets that CharEncoder can decode with its own routines.
// Original note: "Delegate other charsets to mlang".
// NOTE(review): in the code visible in this file a charset that is not
// listed here simply fails the lookup in getCharsetInfo() -- confirm
// whether any mlang delegation still exists elsewhere.
//
// Each entry is: code page, charset name, maximum bytes per character,
// decoding routine.  For CP_UCS_2 the routine stored here is only a
// placeholder: getWideCharFromMultiByteInfo() selects the big- or
// little-endian decoder from the Encoding object instead.
const EncodingEntry CharEncoder::charsetInfo [] =
{
    { CP_UCS_2, L"UTF-16", 2, wideCharFromUcs2Bigendian },
    { CP_UCS_2, L"UCS-2",  2, wideCharFromUcs2Bigendian },
    { CP_UTF_8, L"UTF-8",  3, wideCharFromUtf8 },
};
/**
 * Creates an Encoding object holding a private, null-terminated copy of
 * the first len characters of s.
 *
 * @param s      charset name to copy (need not be null-terminated)
 * @param len    number of WCHARs to copy from s
 * @param endian stored as the littleendian flag of the new object
 * @param mark   stored as the byteOrderMark flag of the new object
 * @return the new Encoding, or NULL if either allocation fails
 */
Encoding * Encoding::newEncoding(const WCHAR * s, ULONG len, bool endian, bool mark)
{
    Encoding * result = NEW (Encoding());
    if (result == NULL)
        return NULL;

    // One extra slot for the terminator.
    WCHAR * name = NEW (WCHAR[len + 1]);
    result->charset = name;
    if (name == NULL)
    {
        delete result;
        return NULL;
    }

    ::memcpy(name, s, sizeof(WCHAR) * len);
    name[len] = 0; // guarantee NULL termination.

    result->littleendian = endian;
    result->byteOrderMark = mark;
    return result;
}
/**
 * Releases the charset name buffer owned by this Encoding, if any.
 */
Encoding::~Encoding()
{
    if (charset == NULL)
        return;

    delete [] charset;
}
/**
 * Looks up charset in the charsetInfo table, scanning from the last entry
 * to the first and comparing names with FusionpCompareStrings.
 *
 * @param charset   null-terminated charset name to look for
 * @param pcodepage receives the code page of the matching entry
 * @param mCharSize receives the maxCharSize of the matching entry
 * @return index of the matching entry, or -2 when there is no match
 *         (the output parameters are left untouched in that case)
 */
int CharEncoder::getCharsetInfo(const WCHAR * charset, CODEPAGE * pcodepage, UINT * mCharSize)
{
    int index = LENGTH(charsetInfo);
    while (--index >= 0)
    {
        if (::FusionpCompareStrings(charset, ::wcslen(charset),
                                    charsetInfo[index].charset, ::wcslen(charsetInfo[index].charset),
                                    true) != 0)
        {
            continue;   // not this entry, try the previous one
        }

        *pcodepage = charsetInfo[index].codepage;
        *mCharSize = charsetInfo[index].maxCharSize;
        return index;
    }

    return -2;
}
* get information about a code page identified by <code> encoding </code> */ HRESULT CharEncoder::getWideCharFromMultiByteInfo(Encoding * encoding, CODEPAGE * pcodepage, WideCharFromMultiByteFunc ** pfnWideCharFromMultiByte, UINT * mCharSize) { HRESULT hr = S_OK;
int i = getCharsetInfo(encoding->charset, pcodepage, mCharSize); if (i >= 0) // in our short list
{ switch (*pcodepage) { case CP_UCS_2: if (encoding->littleendian) *pfnWideCharFromMultiByte = wideCharFromUcs2Littleendian; else *pfnWideCharFromMultiByte = wideCharFromUcs2Bigendian; break; default: *pfnWideCharFromMultiByte = charsetInfo[i].pfnWideCharFromMultiByte; break; } } else // invalid encoding
{ hr = E_FAIL; } return hr; }
* Scans rawbuffer and translates UTF8 characters into UNICODE characters */ HRESULT CharEncoder::wideCharFromUtf8(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer, UINT * cb, WCHAR * buffer, UINT * cch) {
UNUSED(pdwMode); UNUSED(codepage);
UINT remaining = *cb; UINT count = 0; UINT max = *cch; ULONG ucs4;
// UTF-8 multi-byte encoding. See Appendix A.2 of the Unicode book for more info.
// Unicode value 1st byte 2nd byte 3rd byte 4th byte
// 000000000xxxxxxx 0xxxxxxx
// 00000yyyyyxxxxxx 110yyyyy 10xxxxxx
// zzzzyyyyyyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx
// 110110wwwwzzzzyy+ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
// 110111yyyyxxxxxx, where uuuuu = wwww + 1
WCHAR c; bool valid = true;
while (remaining > 0 && count < max) { // This is an optimization for straight runs of 7-bit ascii
// inside the UTF-8 data.
c = *bytebuffer; if (c & 0x80) // check 8th-bit and get out of here
break; // so we can do proper UTF-8 decoding.
*buffer++ = c; bytebuffer++; count++; remaining--; }
while (remaining > 0 && count < max) { UINT bytes = 0; for (c = *bytebuffer; c & 0x80; c <<= 1) bytes++;
if (bytes == 0) bytes = 1;
if (remaining < bytes) { break; } c = 0; switch ( bytes ) { case 6: bytebuffer++; // We do not handle ucs4 chars
case 5: bytebuffer++; // except those on plane 1
valid = false; // fall through
case 4: // Do we have enough buffer?
if (count >= max - 1) goto Cleanup;
// surrogate pairs
ucs4 = ULONG(*bytebuffer++ & 0x07) << 18; if ((*bytebuffer & 0xc0) != 0x80) valid = false; ucs4 |= ULONG(*bytebuffer++ & 0x3f) << 12; if ((*bytebuffer & 0xc0) != 0x80) valid = false; ucs4 |= ULONG(*bytebuffer++ & 0x3f) << 6; if ((*bytebuffer & 0xc0) != 0x80) valid = false; ucs4 |= ULONG(*bytebuffer++ & 0x3f);
// For non-BMP code values of ISO/IEC 10646,
// only those in plane 1 are valid xml characters
if (ucs4 > 0x10ffff) valid = false;
if (valid) { // first ucs2 char
*buffer++ = static_cast<WCHAR>((ucs4 - 0x10000) / 0x400 + 0xd800); count++; // second ucs2 char
c = static_cast<WCHAR>((ucs4 - 0x10000) % 0x400 + 0xdc00); } break;
case 3: c = WCHAR(*bytebuffer++ & 0x0f) << 12; // 0x0800 - 0xffff
if ((*bytebuffer & 0xc0) != 0x80) valid = false; // fall through
case 2: c |= WCHAR(*bytebuffer++ & 0x3f) << 6; // 0x0080 - 0x07ff
if ((*bytebuffer & 0xc0) != 0x80) valid = false; c |= WCHAR(*bytebuffer++ & 0x3f); break; case 1: c = WCHAR(*bytebuffer++); // 0x0000 - 0x007f
default: valid = false; // not a valid UTF-8 character.
break; }
// If the multibyte sequence was illegal, store a FFFF character code.
// The Unicode spec says this value may be used as a signal like this.
// This will be detected later by the parser and an error generated.
// We don't throw an exception here because the parser would not yet know
// the line and character where the error occurred and couldn't produce a
// detailed error message.
if (! valid) { c = 0xffff; valid = true; }
*buffer++ = c; count++; remaining -= bytes; }
Cleanup: // tell caller that there are bytes remaining in the buffer to
// be processed next time around when we have more data.
*cb -= remaining; *cch = count; return S_OK; }
* Scans bytebuffer and translates UCS2 big endian characters into UNICODE characters */ HRESULT CharEncoder::wideCharFromUcs2Bigendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer, UINT * cb, WCHAR * buffer, UINT * cch) { UNUSED(codepage); UNUSED(pdwMode);
UINT num = *cb >> 1; if (num > *cch) num = *cch; for (UINT i = num; i > 0; i--) { *buffer++ = ((*bytebuffer) << 8) | (*(bytebuffer + 1)); bytebuffer += 2; } *cch = num; *cb = num << 1; return S_OK; }
* Scans bytebuffer and translates UCS2 little endian characters into UNICODE characters */ HRESULT CharEncoder::wideCharFromUcs2Littleendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer, UINT * cb, WCHAR * buffer, UINT * cch) { UNUSED(codepage); UNUSED(pdwMode);
UINT num = *cb / 2; // Ucs2 is two byte unicode.
if (num > *cch) num = *cch;
// Optimization for windows platform where little endian maps directly to WCHAR.
// (This increases overall parser performance by 5% for large unicode files !!)
::memcpy(buffer, bytebuffer, num * sizeof(WCHAR));
*cch = num; *cb = num * 2; return S_OK; }