You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
278 lines
8.4 KiB
278 lines
8.4 KiB
/*
|
|
* @(#)CharEncoder.cxx 1.0 6/10/97
|
|
*
|
|
* Copyright (c) 1997 - 1999 Microsoft Corporation. All rights reserved. *
|
|
*/
|
|
#include "stdinc.h"
|
|
#include "core.hxx"
|
|
#pragma hdrstop
|
|
|
|
#include "charencoder.hxx"
|
|
|
|
//
|
|
// Delegate other charsets to mlang
|
|
//
|
|
const EncodingEntry CharEncoder::charsetInfo [] =
|
|
{
|
|
{ CP_UCS_2, L"UTF-16", 2, wideCharFromUcs2Bigendian },
|
|
{ CP_UCS_2, L"UCS-2", 2, wideCharFromUcs2Bigendian },
|
|
{ CP_UTF_8, L"UTF-8", 3, wideCharFromUtf8 },
|
|
};
|
|
|
|
/**
 * Creates an Encoding object that owns a private, NUL-terminated copy of a
 * charset name.
 *
 * s      - charset name; only the first <code>len</code> characters are
 *          copied, so it need not be NUL-terminated.
 * len    - number of WCHARs to copy from <code>s</code>.
 * endian - stored in the new object's <code>littleendian</code> member.
 * mark   - stored in the new object's <code>byteOrderMark</code> member.
 *
 * Returns the new object, or NULL when an allocation fails or the arguments
 * cannot be copied safely (NULL <code>s</code> with a non-zero length, or a
 * length so large that the buffer size would overflow).
 */
Encoding * Encoding::newEncoding(const WCHAR * s, ULONG len, bool endian, bool mark)
{
    // Nothing to copy from.
    if (s == NULL && len != 0)
        return NULL;

    // len + 1 characters are allocated below; refuse lengths for which
    // either the element count or the byte count could wrap around.
    if (len >= static_cast<ULONG>(-1) / sizeof(WCHAR))
        return NULL;

    Encoding * e = NEW (Encoding());
    if (e == NULL)
        return NULL;

    e->charset = NEW (WCHAR[len + 1]);
    if (e->charset == NULL)
    {
        delete e;
        return NULL;
    }

    if (len != 0)
        ::memcpy(e->charset, s, sizeof(WCHAR) * len);
    e->charset[len] = 0; // guarantee NUL termination.

    e->littleendian = endian;
    e->byteOrderMark = mark;
    return e;
}
|
|
|
|
/**
 * Releases the charset name buffer, if one was ever attached.
 */
Encoding::~Encoding()
{
    if (charset == NULL)
        return;

    delete [] charset;
}
|
|
|
|
int CharEncoder::getCharsetInfo(const WCHAR * charset, CODEPAGE * pcodepage, UINT * mCharSize)
|
|
{
|
|
for (int i = LENGTH(charsetInfo) - 1; i >= 0; i--)
|
|
{
|
|
if (::FusionpCompareStrings(charset, ::wcslen(charset), charsetInfo[i].charset, ::wcslen(charsetInfo[i].charset), true) == 0)
|
|
{
|
|
*pcodepage = charsetInfo[i].codepage;
|
|
*mCharSize = charsetInfo[i].maxCharSize;
|
|
return i;
|
|
} // end of if
|
|
}// end of for
|
|
|
|
return -2;
|
|
}
|
|
|
|
/**
|
|
* get information about a code page identified by <code> encoding </code>
|
|
*/
|
|
HRESULT CharEncoder::getWideCharFromMultiByteInfo(Encoding * encoding, CODEPAGE * pcodepage, WideCharFromMultiByteFunc ** pfnWideCharFromMultiByte, UINT * mCharSize)
|
|
{
|
|
HRESULT hr = S_OK;
|
|
|
|
int i = getCharsetInfo(encoding->charset, pcodepage, mCharSize);
|
|
if (i >= 0) // in our short list
|
|
{
|
|
switch (*pcodepage)
|
|
{
|
|
case CP_UCS_2:
|
|
if (encoding->littleendian)
|
|
*pfnWideCharFromMultiByte = wideCharFromUcs2Littleendian;
|
|
else
|
|
*pfnWideCharFromMultiByte = wideCharFromUcs2Bigendian;
|
|
break;
|
|
default:
|
|
*pfnWideCharFromMultiByte = charsetInfo[i].pfnWideCharFromMultiByte;
|
|
break;
|
|
}
|
|
}
|
|
else // invalid encoding
|
|
{
|
|
hr = E_FAIL;
|
|
}
|
|
return hr;
|
|
}
|
|
|
|
|
|
/**
|
|
* Scans rawbuffer and translates UTF8 characters into UNICODE characters
|
|
*/
|
|
HRESULT CharEncoder::wideCharFromUtf8(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
|
|
UINT * cb, WCHAR * buffer, UINT * cch)
|
|
{
|
|
|
|
UNUSED(pdwMode);
|
|
UNUSED(codepage);
|
|
|
|
UINT remaining = *cb;
|
|
UINT count = 0;
|
|
UINT max = *cch;
|
|
ULONG ucs4;
|
|
|
|
// UTF-8 multi-byte encoding. See Appendix A.2 of the Unicode book for more info.
|
|
//
|
|
// Unicode value 1st byte 2nd byte 3rd byte 4th byte
|
|
// 000000000xxxxxxx 0xxxxxxx
|
|
// 00000yyyyyxxxxxx 110yyyyy 10xxxxxx
|
|
// zzzzyyyyyyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx
|
|
// 110110wwwwzzzzyy+ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
|
|
// 110111yyyyxxxxxx, where uuuuu = wwww + 1
|
|
WCHAR c;
|
|
bool valid = true;
|
|
|
|
while (remaining > 0 && count < max)
|
|
{
|
|
// This is an optimization for straight runs of 7-bit ascii
|
|
// inside the UTF-8 data.
|
|
c = *bytebuffer;
|
|
if (c & 0x80) // check 8th-bit and get out of here
|
|
break; // so we can do proper UTF-8 decoding.
|
|
*buffer++ = c;
|
|
bytebuffer++;
|
|
count++;
|
|
remaining--;
|
|
}
|
|
|
|
while (remaining > 0 && count < max)
|
|
{
|
|
UINT bytes = 0;
|
|
for (c = *bytebuffer; c & 0x80; c <<= 1)
|
|
bytes++;
|
|
|
|
if (bytes == 0)
|
|
bytes = 1;
|
|
|
|
if (remaining < bytes)
|
|
{
|
|
break;
|
|
}
|
|
|
|
c = 0;
|
|
switch ( bytes )
|
|
{
|
|
case 6: bytebuffer++; // We do not handle ucs4 chars
|
|
case 5: bytebuffer++; // except those on plane 1
|
|
valid = false;
|
|
// fall through
|
|
case 4:
|
|
// Do we have enough buffer?
|
|
if (count >= max - 1)
|
|
goto Cleanup;
|
|
|
|
// surrogate pairs
|
|
ucs4 = ULONG(*bytebuffer++ & 0x07) << 18;
|
|
if ((*bytebuffer & 0xc0) != 0x80)
|
|
valid = false;
|
|
ucs4 |= ULONG(*bytebuffer++ & 0x3f) << 12;
|
|
if ((*bytebuffer & 0xc0) != 0x80)
|
|
valid = false;
|
|
ucs4 |= ULONG(*bytebuffer++ & 0x3f) << 6;
|
|
if ((*bytebuffer & 0xc0) != 0x80)
|
|
valid = false;
|
|
ucs4 |= ULONG(*bytebuffer++ & 0x3f);
|
|
|
|
// For non-BMP code values of ISO/IEC 10646,
|
|
// only those in plane 1 are valid xml characters
|
|
if (ucs4 > 0x10ffff)
|
|
valid = false;
|
|
|
|
if (valid)
|
|
{
|
|
// first ucs2 char
|
|
*buffer++ = static_cast<WCHAR>((ucs4 - 0x10000) / 0x400 + 0xd800);
|
|
count++;
|
|
// second ucs2 char
|
|
c = static_cast<WCHAR>((ucs4 - 0x10000) % 0x400 + 0xdc00);
|
|
}
|
|
break;
|
|
|
|
case 3: c = WCHAR(*bytebuffer++ & 0x0f) << 12; // 0x0800 - 0xffff
|
|
if ((*bytebuffer & 0xc0) != 0x80)
|
|
valid = false;
|
|
// fall through
|
|
case 2: c |= WCHAR(*bytebuffer++ & 0x3f) << 6; // 0x0080 - 0x07ff
|
|
if ((*bytebuffer & 0xc0) != 0x80)
|
|
valid = false;
|
|
c |= WCHAR(*bytebuffer++ & 0x3f);
|
|
break;
|
|
|
|
case 1:
|
|
c = WCHAR(*bytebuffer++); // 0x0000 - 0x007f
|
|
break;
|
|
|
|
default:
|
|
valid = false; // not a valid UTF-8 character.
|
|
break;
|
|
}
|
|
|
|
// If the multibyte sequence was illegal, store a FFFF character code.
|
|
// The Unicode spec says this value may be used as a signal like this.
|
|
// This will be detected later by the parser and an error generated.
|
|
// We don't throw an exception here because the parser would not yet know
|
|
// the line and character where the error occurred and couldn't produce a
|
|
// detailed error message.
|
|
|
|
if (! valid)
|
|
{
|
|
c = 0xffff;
|
|
valid = true;
|
|
}
|
|
|
|
*buffer++ = c;
|
|
count++;
|
|
remaining -= bytes;
|
|
}
|
|
|
|
Cleanup:
|
|
// tell caller that there are bytes remaining in the buffer to
|
|
// be processed next time around when we have more data.
|
|
*cb -= remaining;
|
|
*cch = count;
|
|
return S_OK;
|
|
}
|
|
|
|
|
|
/**
|
|
* Scans bytebuffer and translates UCS2 big endian characters into UNICODE characters
|
|
*/
|
|
HRESULT CharEncoder::wideCharFromUcs2Bigendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
|
|
UINT * cb, WCHAR * buffer, UINT * cch)
|
|
{
|
|
UNUSED(codepage);
|
|
UNUSED(pdwMode);
|
|
|
|
UINT num = *cb >> 1;
|
|
if (num > *cch)
|
|
num = *cch;
|
|
for (UINT i = num; i > 0; i--)
|
|
{
|
|
*buffer++ = ((*bytebuffer) << 8) | (*(bytebuffer + 1));
|
|
bytebuffer += 2;
|
|
}
|
|
*cch = num;
|
|
*cb = num << 1;
|
|
return S_OK;
|
|
}
|
|
|
|
|
|
/**
|
|
* Scans bytebuffer and translates UCS2 little endian characters into UNICODE characters
|
|
*/
|
|
HRESULT CharEncoder::wideCharFromUcs2Littleendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
|
|
UINT * cb, WCHAR * buffer, UINT * cch)
|
|
{
|
|
UNUSED(codepage);
|
|
UNUSED(pdwMode);
|
|
|
|
UINT num = *cb / 2; // Ucs2 is two byte unicode.
|
|
if (num > *cch)
|
|
num = *cch;
|
|
|
|
|
|
// Optimization for windows platform where little endian maps directly to WCHAR.
|
|
// (This increases overall parser performance by 5% for large unicode files !!)
|
|
::memcpy(buffer, bytebuffer, num * sizeof(WCHAR));
|
|
|
|
*cch = num;
|
|
*cb = num * 2;
|
|
return S_OK;
|
|
}
|
|
|