/*
 * @(#)CharEncoder.cxx 1.0 6/10/97
 *
 * Copyright (c) 1997 - 1999 Microsoft Corporation. All rights reserved.
 *
 */
#include "stdinc.h"
#include "core.hxx"
#pragma hdrstop

#include "charencoder.hxx"

//
// Delegate other charsets to mlang
//
// Table of the encodings decoded directly by this file. Each entry holds the
// code page, the charset name, the maximum character size and the decoder.
// The two CP_UCS_2 entries list the big-endian decoder; the decoder actually
// handed out is chosen in getWideCharFromMultiByteInfo() from the encoding's
// littleendian flag.
//
const EncodingEntry CharEncoder::charsetInfo [] =
{
    { CP_UCS_2, L"UTF-16", 2, wideCharFromUcs2Bigendian },
    { CP_UCS_2, L"UCS-2",  2, wideCharFromUcs2Bigendian },
    { CP_UTF_8, L"UTF-8",  3, wideCharFromUtf8 },
};

/**
 * Creates an Encoding that owns a NUL-terminated copy of the first len
 * characters of s.
 *
 * @param s      charset name to copy; need not be NUL-terminated
 * @param len    number of WCHARs to copy from s
 * @param endian stored as the littleendian flag
 * @param mark   stored as the byteOrderMark flag
 * @return the new Encoding, or NULL when either NEW yields NULL
 */
Encoding * Encoding::newEncoding(const WCHAR * s, ULONG len, bool endian, bool mark)
{
    Encoding * e = NEW (Encoding());
    if (e == NULL)
        return NULL;

    e->charset = NEW (WCHAR[len + 1]);
    if (e->charset == NULL)
    {
        delete e;
        return NULL;
    }

    ::memcpy(e->charset, s, sizeof(WCHAR) * len);
    e->charset[len] = 0; // guarantee NULL termination.
    e->littleendian = endian;
    e->byteOrderMark = mark;
    return e;
}

/**
 * Releases the charset name buffer owned by this Encoding.
 */
Encoding::~Encoding()
{
    if (charset != NULL)
    {
        delete [] charset;
    }
}

/**
 * Looks charset up in charsetInfo (searched from the last entry to the first).
 *
 * @param charset    NUL-terminated charset name to look for
 * @param pcodepage  receives the code page of the matching entry
 * @param mCharSize  receives the maxCharSize of the matching entry
 * @return index of the matching entry, or -2 when there is no match
 *         (the out parameters are left untouched in that case)
 */
int CharEncoder::getCharsetInfo(const WCHAR * charset, CODEPAGE * pcodepage, UINT * mCharSize)
{
    // The length of the name being searched for does not change per entry.
    const size_t cchCharset = ::wcslen(charset);

    for (int i = LENGTH(charsetInfo) - 1; i >= 0; i--)
    {
        // NOTE(review): the trailing 'true' is presumably the
        // case-insensitive flag of FusionpCompareStrings -- confirm.
        if (::FusionpCompareStrings(charset, cchCharset,
                                    charsetInfo[i].charset, ::wcslen(charsetInfo[i].charset),
                                    true) == 0)
        {
            *pcodepage = charsetInfo[i].codepage;
            *mCharSize = charsetInfo[i].maxCharSize;
            return i;
        } // end of if
    } // end of for

    return -2;
}

/**
 * get information about a code page identified by <code> encoding </code>
 *
 * @param encoding                  encoding whose charset name is looked up
 * @param pcodepage                 receives the code page
 * @param pfnWideCharFromMultiByte  receives the decoder; for CP_UCS_2 it is
 *                                  chosen from encoding->littleendian
 * @param mCharSize                 receives the maximum character size
 * @return S_OK when the charset is in charsetInfo, E_FAIL otherwise
 */
HRESULT CharEncoder::getWideCharFromMultiByteInfo(Encoding * encoding, CODEPAGE * pcodepage, WideCharFromMultiByteFunc ** pfnWideCharFromMultiByte, UINT * mCharSize)
{
    HRESULT hr = S_OK;

    int i = getCharsetInfo(encoding->charset, pcodepage, mCharSize);
    if (i >= 0) // in our short list
    {
        switch (*pcodepage)
        {
        case CP_UCS_2:
            if (encoding->littleendian)
                *pfnWideCharFromMultiByte = wideCharFromUcs2Littleendian;
            else
                *pfnWideCharFromMultiByte = wideCharFromUcs2Bigendian;
            break;
        default:
            *pfnWideCharFromMultiByte = charsetInfo[i].pfnWideCharFromMultiByte;
            break;
        }
    }
    else // invalid encoding
    {
        hr = E_FAIL;
    }
    return hr;
}

/**
 * Scans rawbuffer and translates UTF8 characters into UNICODE characters
 *
 * @param pdwMode    unused
 * @param codepage   unused
 * @param bytebuffer UTF-8 input
 * @param cb         in: number of input bytes available;
 *                   out: number of input bytes consumed
 * @param buffer     output buffer
 * @param cch        in: capacity of buffer in WCHARs;
 *                   out: number of WCHARs written
 * @return always S_OK; malformed input is reported in-band as 0xFFFF
 *
 * A sequence that is cut off at the end of the input is left unconsumed so
 * the caller can present it again once more data is available. A 4-byte (or
 * longer) sequence is likewise left unconsumed when fewer than two output
 * slots remain.
 */
HRESULT CharEncoder::wideCharFromUtf8(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
                                      UINT * cb, WCHAR * buffer, UINT * cch)
{
    UNUSED(pdwMode);
    UNUSED(codepage);

    UINT remaining = *cb;
    UINT count = 0;
    UINT max = *cch;
    ULONG ucs4;

    // UTF-8 multi-byte encoding.  See Appendix A.2 of the Unicode book for more info.
    //
    // Unicode value    1st byte    2nd byte    3rd byte    4th byte
    // 000000000xxxxxxx 0xxxxxxx
    // 00000yyyyyxxxxxx 110yyyyy    10xxxxxx
    // zzzzyyyyyyxxxxxx 1110zzzz    10yyyyyy    10xxxxxx
    // 110110wwwwzzzzyy+ 11110uuu   10uuzzzz    10yyyyyy    10xxxxxx
    // 110111yyyyxxxxxx, where uuuuu = wwww + 1
    WCHAR c;
    bool valid = true;

    while (remaining > 0 && count < max)
    {
        // This is an optimization for straight runs of 7-bit ascii
        // inside the UTF-8 data.
        c = *bytebuffer;
        if (c & 0x80)   // check 8th-bit and get out of here
            break;      // so we can do proper UTF-8 decoding.
        *buffer++ = c;
        bytebuffer++;
        count++;
        remaining--;
    }

    while (remaining > 0 && count < max)
    {
        // Count the leading 1 bits of the lead byte: that is the length
        // the sequence claims to have.
        UINT bytes = 0;
        for (c = *bytebuffer; c & 0x80; c <<= 1)
            bytes++;

        if (bytes == 0)
        {
            bytes = 1;      // 0xxxxxxx: plain 7-bit character
        }
        else if (bytes == 1 || bytes > 6)
        {
            // 10xxxxxx is a continuation byte with no lead byte, and
            // 0xFE / 0xFF never occur in UTF-8. Consume exactly that one
            // byte and report it as invalid, so that bytebuffer and
            // remaining always advance together.
            bytes = 1;
            valid = false;
        }

        if (remaining < bytes)
        {
            break;          // sequence is cut off; wait for more input
        }

        c = 0;
        switch ( bytes )
        {
        case 6:
            bytebuffer++;   // We do not handle ucs4 chars
            // fall through
        case 5:
            bytebuffer++;   // except those on plane 1
            valid = false;
            // fall through
        case 4:
            // Do we have enough buffer?
            if (count >= max - 1)
                goto Cleanup;

            // surrogate pairs
            ucs4 = ULONG(*bytebuffer++ & 0x07) << 18;
            if ((*bytebuffer & 0xc0) != 0x80)
                valid = false;
            ucs4 |= ULONG(*bytebuffer++ & 0x3f) << 12;
            if ((*bytebuffer & 0xc0) != 0x80)
                valid = false;
            ucs4 |= ULONG(*bytebuffer++ & 0x3f) << 6;
            if ((*bytebuffer & 0xc0) != 0x80)
                valid = false;
            ucs4 |= ULONG(*bytebuffer++ & 0x3f);

            // Only values that need a surrogate pair are acceptable here:
            // anything below 0x10000 is an overlong encoding (and would
            // underflow the subtraction below), anything above 0x10ffff
            // is outside the range surrogates can express.
            if (ucs4 < 0x10000 || ucs4 > 0x10ffff)
                valid = false;

            if (valid)
            {
                // first ucs2 char
                *buffer++ = static_cast<WCHAR>((ucs4 - 0x10000) / 0x400 + 0xd800);
                count++;
                // second ucs2 char
                c = static_cast<WCHAR>((ucs4 - 0x10000) % 0x400 + 0xdc00);
            }
            break;

        case 3:
            c  = WCHAR(*bytebuffer++ & 0x0f) << 12;    // 0x0800 - 0xffff
            if ((*bytebuffer & 0xc0) != 0x80)
                valid = false;
            // fall through
        case 2:
            c |= WCHAR(*bytebuffer++ & 0x3f) << 6;     // 0x0080 - 0x07ff
            if ((*bytebuffer & 0xc0) != 0x80)
                valid = false;
            c |= WCHAR(*bytebuffer++ & 0x3f);
            break;

        case 1:
            c = WCHAR(*bytebuffer++);                  // 0x0000 - 0x007f
            break;

        default:
            // Not reachable after the normalisation above; kept so that the
            // input pointer still moves in step with 'remaining'.
            bytebuffer += bytes;
            valid = false;  // not a valid UTF-8 character.
            break;
        }

        // If the multibyte sequence was illegal, store a FFFF character code.
        // The Unicode spec says this value may be used as a signal like this.
        // This will be detected later by the parser and an error generated.
        // We don't throw an exception here because the parser would not yet know
        // the line and character where the error occurred and couldn't produce a
        // detailed error message.
        if (! valid)
        {
            c = 0xffff;
            valid = true;
        }

        *buffer++ = c;
        count++;
        remaining -= bytes;
    }

Cleanup:
    // tell caller that there are bytes remaining in the buffer to
    // be processed next time around when we have more data.
    *cb -= remaining;
    *cch = count;
    return S_OK;
}

/**
 * Scans bytebuffer and translates UCS2 big endian characters into UNICODE characters
 *
 * @param pdwMode    unused
 * @param codepage   unused
 * @param bytebuffer big-endian UCS-2 input
 * @param cb         in: number of input bytes available;
 *                   out: number of input bytes consumed (always even)
 * @param buffer     output buffer
 * @param cch        in: capacity of buffer in WCHARs;
 *                   out: number of WCHARs written
 * @return always S_OK
 */
HRESULT CharEncoder::wideCharFromUcs2Bigendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
                                               UINT * cb, WCHAR * buffer, UINT * cch)
{
    UNUSED(codepage);
    UNUSED(pdwMode);

    // Convert as many whole characters as both the input and output allow.
    UINT num = *cb >> 1;
    if (num > *cch)
        num = *cch;

    for (UINT i = num; i > 0; i--)
    {
        // The first byte of each pair is the high-order byte.
        *buffer++ = ((*bytebuffer) << 8) | (*(bytebuffer + 1));
        bytebuffer += 2;
    }

    *cch = num;
    *cb = num << 1;
    return S_OK;
}

/**
 * Scans bytebuffer and translates UCS2 little endian characters into UNICODE characters
 *
 * @param pdwMode    unused
 * @param codepage   unused
 * @param bytebuffer little-endian UCS-2 input
 * @param cb         in: number of input bytes available;
 *                   out: number of input bytes consumed (always even)
 * @param buffer     output buffer
 * @param cch        in: capacity of buffer in WCHARs;
 *                   out: number of WCHARs written
 * @return always S_OK
 */
HRESULT CharEncoder::wideCharFromUcs2Littleendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
                                                  UINT * cb, WCHAR * buffer, UINT * cch)
{
    UNUSED(codepage);
    UNUSED(pdwMode);

    UINT num = *cb / 2; // Ucs2 is two byte unicode.
    if (num > *cch)
        num = *cch;

    // Optimization for windows platform where little endian maps directly to WCHAR.
    // (This increases overall parser performance by 5% for large unicode files !!)
    ::memcpy(buffer, bytebuffer, num * sizeof(WCHAR));

    *cch = num;
    *cb = num * 2;
    return S_OK;
}