Source code of Windows XP (NT5)
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
#include "precomp.h"
#include "utf8str.h"
// Releases whichever string buffer this object allocated itself.
//
// m_eAlloc records which of the two representations was created by
// EncodeUTF8() (ALLOC_UTF8) or DecodeUTF8() (ALLOC_UNICODE); the other
// pointer is not freed here.
CUTF8String::~CUTF8String()
{
    if (ALLOC_UNICODE & m_eAlloc)
    {
        // Allocated with new WCHAR[...] in DecodeUTF8(), so the array form
        // of delete is required.
        delete [] m_pwszUnicode;
    }
    else if (ALLOC_UTF8 & m_eAlloc)
    {
        // Allocated with new CHAR[...] in EncodeUTF8(), so the array form
        // of delete is required.
        delete [] m_pszUTF8;
    }
}
// Returns the wide-character form of the string.
//
// If only the UTF-8 form is present, it is decoded on first use via
// DecodeUTF8(). The result is NULL when neither form is available or when
// DecodeUTF8() could not allocate the buffer.
CUTF8String::operator LPWSTR()
{
    // Already have the wide form: nothing to do.
    if (NULL != m_pwszUnicode)
    {
        return m_pwszUnicode;
    }

    // Only decode when there is a UTF-8 source to decode from.
    if (NULL != m_pszUTF8)
    {
        DecodeUTF8();
    }

    return m_pwszUnicode;
}
// Returns the UTF-8 form of the string.
//
// If only the wide-character form is present, it is encoded on first use via
// EncodeUTF8(). The result is NULL when neither form is available or when
// EncodeUTF8() could not allocate the buffer.
CUTF8String::operator LPSTR()
{
    // Already have the UTF-8 form: nothing to do.
    if (NULL != m_pszUTF8)
    {
        return m_pszUTF8;
    }

    // Only encode when there is a wide-character source to encode from.
    if (NULL != m_pwszUnicode)
    {
        EncodeUTF8();
    }

    return m_pszUTF8;
}
// Builds the UTF-8 form of the string (m_pszUTF8) from the NUL-terminated
// wide-character form (m_pwszUnicode).
//
// Each WCHAR is encoded independently: values below 0x80 take one byte,
// values below 0x800 take two bytes, and everything else takes three bytes.
//
// On success m_hr is S_OK and m_eAlloc becomes ALLOC_UTF8 (this object owns
// the new buffer). If the buffer cannot be allocated m_hr is E_OUTOFMEMORY
// and m_pszUTF8 stays NULL.
VOID CUTF8String::EncodeUTF8()
{
    DebugEntry(CUTF8String::EncodeUTF8);

    m_hr = S_OK;
    ASSERT(NULL != m_pwszUnicode);

    // Pass 1: measure the output. Start at 1 to leave room for the NUL
    // terminator.
    int cchUTF8 = 1;
    for (LPWSTR pwszScan = m_pwszUnicode; L'\0' != *pwszScan; pwszScan++)
    {
        const WCHAR wchScan = *pwszScan;
        cchUTF8 += (wchScan < 0x80) ? 1 : ((wchScan < 0x800) ? 2 : 3);
    }

    ASSERT(NULL == m_pszUTF8);
    m_pszUTF8 = new CHAR[cchUTF8];

    if (NULL == m_pszUTF8)
    {
        m_hr = E_OUTOFMEMORY;
    }
    else
    {
        ASSERT(ALLOC_NONE == m_eAlloc);
        m_eAlloc = ALLOC_UTF8;

        // Pass 2: emit the bytes front to back.
        const BYTE cLeadTwoBytes   = 0xC0; // 110xxxxx
        const BYTE cLeadThreeBytes = 0xE0; // 1110xxxx
        const BYTE cSignal         = 0x80; // 10xxxxxx marker for trail bytes
        const BYTE cMask           = 0xBF; // clears bit 6 so trail bytes stay 10xxxxxx

        LPSTR  pszEnd  = m_pszUTF8 + cchUTF8;
        LPSTR  pszOut  = m_pszUTF8;
        LPWSTR pwszSrc = m_pwszUnicode;

        // The terminating L'\0' is encoded like any other one-byte character,
        // which fills the last slot of the buffer and ends the loop.
        while (pszOut < pszEnd)
        {
            const WCHAR wch = *pwszSrc++;

#ifdef DEBUG
            if (L'\0' == wch)
            {
                ASSERT(pszOut == pszEnd - 1);
            }
#endif // DEBUG

            if (wch < 0x80)
            {
                *pszOut++ = (CHAR) wch;
            }
            else if (wch < 0x800)
            {
                *pszOut++ = (CHAR) ((wch >> 6) | cLeadTwoBytes);
                *pszOut++ = (CHAR) ((wch | cSignal) & cMask);
            }
            else
            {
                *pszOut++ = (CHAR) ((wch >> 12) | cLeadThreeBytes);
                *pszOut++ = (CHAR) (((wch >> 6) | cSignal) & cMask);
                *pszOut++ = (CHAR) ((wch | cSignal) & cMask);
            }
        }

        m_hr = S_OK;
    }

    DebugExitVOID(CUTF8String::EncodeUTF8);
}
// Builds the wide-character form of the string (m_pwszUnicode) from the
// NUL-terminated UTF-8 form (m_pszUTF8).
//
// Results:
//   S_OK           - the whole string was decoded.
//   E_OUTOFMEMORY  - the output buffer could not be allocated; m_pwszUnicode
//                    stays NULL.
//   E_FAIL         - a malformed or truncated multi-byte sequence was found;
//                    m_pwszUnicode holds the characters decoded up to that
//                    point, NUL terminated.
// Once the buffer is allocated m_eAlloc becomes ALLOC_UNICODE (this object
// owns it), including in the E_FAIL case.
//
// Limitations kept from the original implementation:
//   - 4, 5 and 6 byte sequences (UCS-4) are not really supported: their
//     leading bytes are skipped and the last three bytes are decoded as if
//     they were a 3-byte sequence.
//   - An invalid lead byte (a stray continuation byte, 0xFE or 0xFF) is
//     replaced with L'?'.
VOID CUTF8String::DecodeUTF8()
{
    DebugEntry(CUTF8String::DecodeUTF8);

    m_hr = S_OK;
    ASSERT(NULL != m_pszUTF8);

    int cchUnicode = 1; // always include a NULL terminator
    LPSTR psz = m_pszUTF8;

    // First determine the destination size (cchUnicode)
    while ('\0' != *psz)
    {
        // The number of leading 1 bits in the first byte is the length the
        // sequence claims to have (0 means a plain 7-bit character).
        int cbChar = 0;
        BYTE bFirst = (BYTE) *psz;
        while (bFirst & 0x80)
        {
            cbChar++;
            ASSERT(cbChar < 8);
            bFirst <<= 1;
        }
        cbChar = max(1, cbChar);

        // Step over the sequence one byte at a time. A truncated sequence
        // (lead byte promising more bytes than the string contains) must not
        // carry psz past the terminator and into unrelated memory.
        while ((cbChar > 0) && ('\0' != *psz))
        {
            psz++;
            cbChar--;
        }

        cchUnicode++;
    }

    ASSERT(NULL == m_pwszUnicode);
    m_pwszUnicode = new WCHAR[cchUnicode];

    if (NULL != m_pwszUnicode)
    {
        ASSERT(ALLOC_NONE == m_eAlloc);
        m_eAlloc = ALLOC_UNICODE;

        // Start decoding here:
        LPWSTR pwszStop = m_pwszUnicode + cchUnicode;
        LPWSTR pwszDst = m_pwszUnicode;
        psz = m_pszUTF8;

        // The terminator is decoded like any other 7-bit character, which
        // fills the last slot of the buffer and ends the loop.
        while (pwszDst < pwszStop)
        {
            int cbChar = 0;
            BYTE bFirst = (BYTE) *psz;
            while (bFirst & 0x80)
            {
                cbChar++;
                ASSERT(cbChar < 8);
                bFirst <<= 1;
            }

            BOOL fValid = TRUE;
            WCHAR wch = L'\0';

            // Reject a sequence that runs into the terminator before any of
            // its bytes are consumed. The scan stops at the first NUL, so it
            // never reads beyond the end of the string, and it guarantees
            // that every read in the switch below stays inside the string.
            int ib;
            for (ib = 1; ib < cbChar; ib++)
            {
                if ('\0' == psz[ib])
                {
                    fValid = FALSE;
                    break;
                }
            }

            if (fValid)
            {
                switch (cbChar)
                {
                    case 6: psz++;  // FALLTHROUGH  // we don't handle
                    case 5: psz++;  // FALLTHROUGH  // UCS-4; skip first
                    case 4: psz++;  // FALLTHROUGH  // three bytes
                    case 3:
                        wch = WCHAR(*psz++ & 0x0f) << 12;   // 0x0800 - 0xffff
                        fValid = fValid && ((*psz & 0xc0) == 0x80);
                        // FALLTHROUGH
                    case 2:
                        wch |= WCHAR(*psz++ & 0x3f) << 6;   // 0x0080 - 0x07ff
                        fValid = fValid && ((*psz & 0xc0) == 0x80);
                        wch |= WCHAR(*psz++ & 0x3f);
                        break;

                    case 0:
                        wch = WCHAR(*psz++);                // 0x0000 - 0x007f
                        break;

                    default:
                        ERROR_OUT(("CUTF8String::DecodeUTF8 found invalid UTF-8 lead byte"));
                        wch = L'?';
                        psz += cbChar;
                        break;
                }
            }

            if (FALSE == fValid)
            {
                ERROR_OUT(("CUTF8String::DecodeUTF8 found bad UTF-8 sequence"));
                // Hand back what was decoded so far as a terminated string.
                *pwszDst = L'\0';
                m_hr = E_FAIL;
                break;
            }

#ifdef DEBUG
            cchUnicode--;
#endif // DEBUG

            *pwszDst++ = wch;
        }

        // The character count only has to balance when the whole string was
        // decoded; bailing out on a bad sequence legitimately leaves some of
        // the buffer unused.
        ASSERT((S_OK != m_hr) || (0 == cchUnicode));
    }
    else
    {
        m_hr = E_OUTOFMEMORY;
    }

    DebugExitVOID(CUTF8String::DecodeUTF8);
}
|