|
|
#include "stdafx.h"
#include "conveng.h"
#include "convdata.tbl"
// This file contains 3 parts:
// First part: basic service functions for Ansi char format conversion,
// distance/advance calculation, and a binary search algorithm copied from STL
// Second part: Unicode to Ansi
// Third part: Ansi to Unicode
// ****************************************************************************
// First part, Ansi char convert functions
//
// This part does not use any data from the .tbl file
// ****************************************************************************
// Binary search algorithm
// Copy from STL, only very little modify
template <class RandomAccessIterator, class T> RandomAccessIterator __lower_bound(RandomAccessIterator first, RandomAccessIterator last, const T& value) { INT_PTR len = last - first; INT_PTR half; RandomAccessIterator middle;
while (len > 0) { half = len / 2;
middle = first + half; if (*middle < value) { first = middle + 1; len = len - half - 1; } else { len = half; } } return first; }
template <class RandomAccessIterator, class T> RandomAccessIterator __upper_bound(RandomAccessIterator first, RandomAccessIterator last, const T& value) { DWORD len = last - first; DWORD half; RandomAccessIterator middle;
while (len > 0) { half = len / 2;
middle = first + half; if (!(value < *middle)) { first = middle + 1; len = len - half - 1; } else { len = half; } } return first; }
// Returns true when Value lies in the half-open range [Low, High),
// i.e. Low <= Value < High.  All three arguments must have the same type T.
// The return type is spelled out explicitly: a function declared without a
// return type (implicit int) is not valid standard C++.
template<class T>
inline bool ValueIn(T Value, T Low, T High)
{
    return (Value >= Low && Value < High);
}
// Returns non-zero when wchUnicode lies in the half-open range
// [cg_wchSurrogateLeadWordLow, cg_wchSurrogateLeadWordHigh).
inline BOOL IsValidSurrogateLeadWord(WCHAR wchUnicode)
{
    return (wchUnicode >= cg_wchSurrogateLeadWordLow
            && wchUnicode < cg_wchSurrogateLeadWordHigh);
}
// Returns non-zero when wchUnicode lies in the half-open range
// [cg_wchSurrogateTailWordLow, cg_wchSurrogateTailWordHigh).
inline BOOL IsValidSurrogateTailWord(WCHAR wchUnicode)
{
    return (wchUnicode >= cg_wchSurrogateTailWordLow
            && wchUnicode < cg_wchSurrogateTailWordHigh);
}
// Returns non-zero when byAnsi lies in the half-open range
// [cg_byQByteAnsiLeadByteLow, cg_byQByteAnsiLeadByteHigh).
// Callers in this file apply it to bytes 0 and 2 of a quad-byte Ansi char.
inline BOOL IsValidQByteAnsiLeadByte(BYTE byAnsi)
{
    return (byAnsi >= cg_byQByteAnsiLeadByteLow
            && byAnsi < cg_byQByteAnsiLeadByteHigh);
}
// Returns non-zero when byAnsi lies in the half-open range
// [cg_byQByteAnsiTailByteLow, cg_byQByteAnsiTailByteHigh).
// Callers in this file apply it to bytes 1 and 3 of a quad-byte Ansi char.
inline BOOL IsValidQByteAnsiTailByte(BYTE byAnsi)
{
    return (byAnsi >= cg_byQByteAnsiTailByteLow
            && byAnsi < cg_byQByteAnsiTailByteHigh);
}
// Generate a QByte Ansi char by adding an offset to a base QByte Ansi char.
// The Ansi char is held in DWORD format, in other words in the reverse
// byte order of the GB18030 standard.
//
// The four bytes of the DWORD (indexed in memory order) are treated as the
// digits of a mixed-radix number:
//   byte 0: base 0x30, radix 10
//   byte 1: base 0x81, radix 126
//   byte 2: base 0x30, radix 10
//   byte 3: base 0x81, radix 126
// nOffset is added at byte 0 and the carry is propagated upwards.
// NOTE(review): byte 0 is the lowest-addressed byte of the DWORD, so this
// presumably relies on a little-endian target - confirm.
//
// nOffset should be less than 1M (0x100000); the carry out of byte 3 is
// expected to be zero.  Both conditions are only checked by ASSERT.
DWORD QByteAnsiBaseAddOffset(
    DWORD dwBaseAnsi,   // In reverse order
    int nOffset)
{
    const int anDigitBase[4]  = { 0x30, 0x81, 0x30, 0x81 };
    const int anDigitRadix[4] = { 10, 126, 10, 126 };

    DWORD dwAnsi = dwBaseAnsi;
    PBYTE pByte = (PBYTE)&dwAnsi;

    ASSERT (nOffset < 0x100000);

    for (int nDigit = 0; nDigit < 4; nDigit++)
    {
        // Add the current digit value to the running carry, store the new
        // digit, and keep the quotient as the carry for the next digit.
        nOffset += pByte[nDigit] - anDigitBase[nDigit];
        pByte[nDigit] = (BYTE)(anDigitBase[nDigit] + nOffset % anDigitRadix[nDigit]);
        nOffset /= anDigitRadix[nDigit];
    }
    ASSERT(nOffset == 0);

    return dwAnsi;
}
// Get the "distance" (dwAnsi1 - dwAnsi2) between two QByte Ansi chars,
// counted in code positions.  Both arguments are in DWORD (reverse) order.
//
// The per-byte differences are weighted by the mixed-radix place values
// 1, 10, 10*126 (1260) and 10*126*10 (12600), bytes indexed in memory order.
// The bytes are read as signed char, exactly as the original code did; the
// byte-wise difference is the same as for unsigned bytes as long as both
// bytes at a given position are on the same side of 0x80.
int CalcuDistanceOfQByteAnsi(
    DWORD dwAnsi1,  // In reverse order
    DWORD dwAnsi2)  // In reverse order
{
    const int anPlaceValue[4] = { 1, 10, 1260, 12600 };

    const signed char* pschLeft  = (const signed char*)&dwAnsi1;
    const signed char* pschRight = (const signed char*)&dwAnsi2;

    int nTotal = 0;
    for (int nDigit = 0; nDigit < 4; nDigit++)
    {
        nTotal += (pschLeft[nDigit] - pschRight[nDigit]) * anPlaceValue[nDigit];
    }
    return nTotal;
}
// Reverse the order of the 4 bytes at pByte in place.
// Used to switch a quad-byte Ansi char between DWORD format and GB format
// (either direction - the operation is its own inverse).
void ReverseQBytesOrder(PBYTE pByte)
{
    const BYTE byOuter = pByte[0];
    const BYTE byInner = pByte[1];

    pByte[0] = pByte[3];
    pByte[1] = pByte[2];
    pByte[2] = byInner;
    pByte[3] = byOuter;
}
// ****************************************************************************
// Second part, Unicode to Ansi
// ****************************************************************************
// ------------------------------------------------
// Two helper functions for UnicodeToAnsi.
// They return the Ansi char code;
// the Ansi code is in GB standard byte order (not WORD value order)
//
// Unicode to double-byte Ansi char.
//
// wchUnicode: the Unicode char to convert.
//
// Returns the 2-byte Ansi char code packed in a WORD exactly as the two
// bytes sit in memory (GB standard byte order); 0 means failure
// (the code page 936 conversion did not yield exactly 2 bytes).
//
WORD UnicodeToDByteAnsi(WCHAR wchUnicode)
{
    // Codes changed from GBK to GB18030, i.e. not compatible between
    // CP936 and CP54936: these are taken from the override table first.
    const int ncChanged = (int)(sizeof(asAnsiCodeChanged) / sizeof(asAnsiCodeChanged[0]));
    for (int k = 0; k < ncChanged; k++)
    {
        if (wchUnicode == asAnsiCodeChanged[k].wchUnicode)
        {
            return asAnsiCodeChanged[k].wchAnsiNew;
        }
    }

    // Not in the changed-code list: the code is the same as in GBK / CP936
    // (most DByte Ansi char codes are compatible from GBK to GB18030).
    char achAnsiBuf[4];
    const int cLen = WideCharToMultiByte(936, WC_COMPOSITECHECK,
                                         &wchUnicode, 1,
                                         achAnsiBuf, sizeof(achAnsiBuf) - 1,
                                         NULL, NULL);
    if (cLen != 2)
    {
        ASSERT(cLen == 2);
        return 0;
    }

    // achAnsiBuf is a char array with no WORD alignment guarantee, so read
    // it through UNALIGNED like every other punned access in this file.
    return *(UNALIGNED WORD*)achAnsiBuf;
}
// Unicode to quad-byte Ansi char.
//
// nSection: index into adwAnsiQBytesAreaStartValue selecting the 4-byte area.
// nOffset:  position of the char inside that area.
//
// Returns the Ansi char code in GB standard byte order;
// 0 means failure (internal error: nSection out of range).
//
DWORD UnicodeToQByteAnsi(int nSection, int nOffset)
{
    const int ncSections = (int)(sizeof(adwAnsiQBytesAreaStartValue) / sizeof(DWORD));
    if (nSection < 0 || nSection >= ncSections)
    {
        ASSERT(FALSE);
        return 0;
    }

    const DWORD dwBaseAnsi = adwAnsiQBytesAreaStartValue[nSection];

    // Debug-only check that the adwAnsiQBytesAreaStartValue entry is
    // consistent with the sizes of all preceding 4-byte areas.
#ifdef _DEBUG
    {
        // Count the QByte Ansi chars in the areas before nSection
        int ncQByteAnsiNum = 0;
        for (int k = 0; k < nSection; k++)
        {
            ncQByteAnsiNum += awchAnsiDQByteBound[2*k+1] - awchAnsiDQByteBound[2*k];
        }
        ASSERT(dwBaseAnsi == QByteAnsiBaseAddOffset(cg_dwQByteAnsiStart, ncQByteAnsiNum));
    }
#endif

    DWORD dwAnsi = QByteAnsiBaseAddOffset(dwBaseAnsi, nOffset);

    // Value (DWORD) order to GB standard order
    ReverseQBytesOrder((PBYTE)(&dwAnsi));

    return dwAnsi;
}
// ---------------------------------------------------------
// Two functions that translate 2-byte Unicode (BMP)
// and 4-byte Unicode (surrogate pair) to Ansi
// 2-byte Unicode (BMP) to Ansi.
//
// wchUnicode: char to convert; must not be a surrogate lead or tail word
//             (checked by ASSERT only).
// pchAnsi:    output buffer receiving the Ansi bytes.
// dwBufSize:  size of pchAnsi in bytes.
//
// Returns the Ansi length in bytes: 1 (ASCII), 2 or 4 on success;
// 0 means failure (buffer too small, internal error, etc.).
int UnicodeToAnsi(WCHAR wchUnicode, char* pchAnsi, DWORD dwBufSize)
{
    // Classic Unicode only; surrogates are not supported by this function
    ASSERT(!IsValidSurrogateLeadWord(wchUnicode) && !IsValidSurrogateTailWord(wchUnicode));

    // ASCII, 0 - 0x7f: copied through as a single byte.
    if (wchUnicode <= 0x7f)
    {
        // Same buffer-size discipline as the 2- and 4-byte paths below.
        if (dwBufSize < 1)
        {
            return 0;
        }
        *pchAnsi = (char)wchUnicode;
        return 1;
    }

    // BMP, 4 bytes or 2 bytes.
    // Find the last boundary in awchAnsiDQByteBound that is <= wchUnicode.
    const INT_PTR ncBounds = sizeof(awchAnsiDQByteBound) / sizeof(awchAnsiDQByteBound[0]);
    const WORD* p = __lower_bound(awchAnsiDQByteBound,
                                  awchAnsiDQByteBound + ncBounds,
                                  wchUnicode);
    if (p == awchAnsiDQByteBound + ncBounds || wchUnicode < *p)
    {
        // lower_bound stopped on the first boundary > wchUnicode (or at the
        // end); the area containing wchUnicode starts one entry earlier.
        p--;
    }
    else
    {
        // Otherwise lower_bound guarantees an exact hit on a boundary.
        ASSERT(wchUnicode == *p);
    }

    const INT_PTR i = p - awchAnsiDQByteBound;
    ASSERT(i >= 0);

    if (i % 2)
    {
        // Odd index: wchUnicode is in a 2-byte area
        const WORD wAnsi = UnicodeToDByteAnsi(wchUnicode);
        if (wAnsi == 0 || dwBufSize < 2)
        {
            return 0;
        }
        *(UNALIGNED WORD*)pchAnsi = wAnsi;
        return 2;
    }

    // Even index: wchUnicode is in a 4-byte area
    const DWORD dwAnsi = UnicodeToQByteAnsi((int)i / 2, wchUnicode - awchAnsiDQByteBound[i]);
    if (dwAnsi == 0 || dwBufSize < 4)
    {
        return 0;
    }
    *(UNALIGNED DWORD*)pchAnsi = dwAnsi;
    return 4;
}
// 4-byte Unicode (surrogate pair) to Ansi.
//
// pwchUnicode: points at the lead word followed by the tail word
//              (validity is checked by ASSERT only).
// pchAnsi:     output buffer receiving 4 Ansi bytes in GB standard order.
// dwBufSize:   size of pchAnsi in bytes.
//
// Returns the Ansi length, which is 4 on success;
// returns 0 on failure (buffer smaller than 4 bytes).
int SurrogateToAnsi(PCWCH pwchUnicode, PCHAR pchAnsi, DWORD dwBufSize)
{
    ASSERT(IsValidSurrogateLeadWord(pwchUnicode[0]));
    ASSERT(IsValidSurrogateTailWord(pwchUnicode[1]));

    // dwOffset is (ISO char code - 0x10000): the lead word supplies the
    // upper bits (shifted left by 10), the tail word the lower bits.
    const int nLeadPart = (pwchUnicode[0] - cg_wchSurrogateLeadWordLow) << 10;
    const int nTailPart = pwchUnicode[1] - cg_wchSurrogateTailWordLow;
    const DWORD dwOffset = nLeadPart + nTailPart;

    if (dwBufSize < 4)
    {
        return 0;
    }

    // Build the char in DWORD (value) order, flip it to GB standard order,
    // then store it into the (possibly unaligned) output buffer.
    DWORD dwAnsi = QByteAnsiBaseAddOffset(cg_dwQByteAnsiToSurrogateStart, dwOffset);
    ReverseQBytesOrder((PBYTE)&dwAnsi);
    *(UNALIGNED DWORD*)pchAnsi = dwAnsi;

    return 4;
}
// API: high level service for Unicode to Ansi.
//
// pwchUnicodeStr:   input Unicode string.
// ncUnicodeStr:     input length, in WCHAR.
// pchAnsiStrBuf:    output buffer for the Ansi string.
// ncAnsiStrBufSize: output buffer size, in BYTE.
//
// Returns the resulting Ansi string length (in bytes);
// returns -1 on failure (buffer overflow, internal error, etc.).
//
// Notes:
// - The buffer check is conservative: conversion stops (and -1 is returned)
//   as soon as fewer than 4 free bytes remain, whatever the next char needs.
// - A surrogate lead word without a following tail word, and a tail word
//   without a preceding lead word, are invalid input and are skipped.
// - A zero WCHAR is copied as a single zero byte; it does not end the
//   conversion.
int UnicodeStrToAnsiStr(
    PCWCH pwchUnicodeStr,
    int ncUnicodeStr,       // in WCHAR
    PCHAR pchAnsiStrBuf,
    int ncAnsiStrBufSize)   // in BYTE
{
    int ncAnsiStr = 0;
    int ncAnsiCharSize;
    int i;  // Declared outside the loop: it is examined after the loop ends

    for (i = 0; i < ncUnicodeStr; i++, pwchUnicodeStr++)
    {
        if (ncAnsiStr > (ncAnsiStrBufSize-4))
        {
            // Buffer overflow
            break;
        }

        if (IsValidSurrogateLeadWord(pwchUnicodeStr[0]))
        {
            if ((i+1 < ncUnicodeStr) && (IsValidSurrogateTailWord(pwchUnicodeStr[1])))
            {
                ncAnsiCharSize = SurrogateToAnsi(pwchUnicodeStr, pchAnsiStrBuf, 4);
                ASSERT(ncAnsiCharSize == 4);
                if (ncAnsiCharSize == 0)
                {
                    ASSERT(FALSE);
                    break;
                }

                ncAnsiStr += ncAnsiCharSize;
                pchAnsiStrBuf += ncAnsiCharSize;
                // The pair consumed two WCHARs; the loop header advances
                // past the second one.
                pwchUnicodeStr++;
                i++;
            }
            else
            {
                // Invalid Unicode char (lead word without tail word), skip
            }
        }
        else if (IsValidSurrogateTailWord(pwchUnicodeStr[0]))
        {
            // Invalid Unicode char (tail word without lead word), skip.
            // UnicodeToAnsi must not be called with a surrogate word.
        }
        else if (*pwchUnicodeStr == 0)
        {
            *pchAnsiStrBuf = 0;
            pchAnsiStrBuf ++;
            ncAnsiStr ++;
        }
        else
        {
            ncAnsiCharSize = UnicodeToAnsi(*pwchUnicodeStr, pchAnsiStrBuf, 4);

            if (ncAnsiCharSize == 0)
            {
                ASSERT(FALSE);
                break;
            }

            pchAnsiStrBuf += ncAnsiCharSize;
            ncAnsiStr += ncAnsiCharSize;
        }
    }

    // The loop was left early: buffer overflow or conversion failure
    if (i < ncUnicodeStr)
    {
        return -1;
    }
    return ncAnsiStr;
}
// ****************************************************************************
// Third part, Ansi to Unicode
// ****************************************************************************
// Quad-byte Ansi char (DWORD / value order) to a single Unicode WCHAR.
//
// Returns the number of Unicode WCHARs written (always 1 on success);
// returns 0 if there is no corresponding Unicode char;
// returns -1 on failure (invalid input value, internal error, etc.).
int QByteAnsiToSingleUnicode(DWORD dwAnsi, PWCH pwchUnicode)
{
    // 0x8431a439 (cg_dwQByteAnsiToBMPLast) to 0x85308130 have no
    // corresponding Unicode.
    // 0x85308130 to 0x90308130 (cg_dwQByteAnsiToSurrogateStart) is a
    // reserved zone and has no corresponding Unicode either.
    if (dwAnsi > cg_dwQByteAnsiToBMPLast)
    {
        return 0;
    }

    // Invalid input value: below the first 4-byte area
    if (dwAnsi < adwAnsiQBytesAreaStartValue[0])
    {
        return -1;
    }

    // Find the last area start value that is <= dwAnsi.
    const INT_PTR ncAreas = sizeof(adwAnsiQBytesAreaStartValue) / sizeof(DWORD);
    const DWORD* p = __lower_bound(adwAnsiQBytesAreaStartValue,
                                   adwAnsiQBytesAreaStartValue + ncAreas,
                                   dwAnsi);
    if (p == adwAnsiQBytesAreaStartValue + ncAreas || dwAnsi < *p)
    {
        // lower_bound stopped on the first start value > dwAnsi (or at the
        // end); the containing area begins one entry earlier.
        p --;
    }
    else if (dwAnsi != *p)
    {
        // lower_bound guarantees *p >= dwAnsi, so this cannot happen
        ASSERT(FALSE);
    }

    const INT_PTR i = p - adwAnsiQBytesAreaStartValue;
    if (i < 0)
    {
        ASSERT(i >= 0);
        return -1;
    }

    // Unicode = first Unicode of the area + position of dwAnsi in the area
    *pwchUnicode = awchAnsiDQByteBound[2*i] + CalcuDistanceOfQByteAnsi(dwAnsi, *p);

    // Debug-only cross-check: compute the same Unicode value backwards from
    // the end of the area and verify that both results agree.
#ifdef _DEBUG
    {
        int nAnsiCharDistance = CalcuDistanceOfQByteAnsi(dwAnsi, *p);
        ASSERT(nAnsiCharDistance >= 0);
        WCHAR wchUnicodeDbg;
        if ((p+1) < adwAnsiQBytesAreaStartValue + sizeof(adwAnsiQBytesAreaStartValue)/sizeof(DWORD))
        {
            nAnsiCharDistance = CalcuDistanceOfQByteAnsi(dwAnsi, *(p+1));
            wchUnicodeDbg = awchAnsiDQByteBound[2*i+1] + nAnsiCharDistance;
        }
        else if ((p+1) == adwAnsiQBytesAreaStartValue + sizeof(adwAnsiQBytesAreaStartValue)/sizeof(DWORD))
        {
            nAnsiCharDistance = CalcuDistanceOfQByteAnsi(dwAnsi, 0x8431A530);
            wchUnicodeDbg = 0x10000 + nAnsiCharDistance;
        }
        else
        {
            ASSERT(FALSE);
        }
        ASSERT(nAnsiCharDistance < 0);
        ASSERT(wchUnicodeDbg == *pwchUnicode);
    }
#endif

    return 1;
}
// Quad-byte Ansi char (DWORD / value order) to a Unicode surrogate pair.
//
// Returns the number of Unicode WCHARs written (always 2 on success);
// returns 0 if there is no corresponding Unicode char, i.e. the distance
// from cg_dwQByteAnsiToSurrogateStart is 0x100000 or more.
int QByteAnsiToDoubleUnicode(DWORD dwAnsi, PWCH pwchUnicode)
{
    const int nDistance = CalcuDistanceOfQByteAnsi(dwAnsi, cg_dwQByteAnsiToSurrogateStart);
    ASSERT (nDistance >= 0);

    if (nDistance >= 0x100000)
    {
        return 0;
    }

    // Split the distance into two 10-bit halves: the upper half goes into
    // the lead word (base 0xD800), the lower half into the tail word
    // (base 0xDC00).
    pwchUnicode[0] = nDistance / 0x400 + 0xD800;
    pwchUnicode[1] = nDistance % 0x400 + 0xDC00;

    return 2;
}
// Quad-byte Ansi char (GB standard byte order) to Unicode.
//
// pbyAnsiChar: points at the 4 Ansi bytes.
// pwchUnicode: output buffer.
// dwBufLen:    output buffer length, in WCHAR.
//
// Returns the number of Unicode WCHARs written (1 or 2 on success);
// returns 0 if there is no corresponding Unicode char;
// returns -1 on failure (buffer overflow, invalid GB char code input,
// internal error, etc.).
int QByteAnsiToUnicode(
    const BYTE* pbyAnsiChar,
    PWCH pwchUnicode,
    DWORD dwBufLen)     // In WCHAR
{
    // All four bytes must be in their valid ranges
    if (!(IsValidQByteAnsiLeadByte(pbyAnsiChar[0])
          && IsValidQByteAnsiTailByte(pbyAnsiChar[1])
          && IsValidQByteAnsiLeadByte(pbyAnsiChar[2])
          && IsValidQByteAnsiTailByte(pbyAnsiChar[3])))
    {
        return -1;  // Invalid char
    }

    // GB standard order to DWORD (value) order
    DWORD dwAnsi = *(UNALIGNED DWORD*)pbyAnsiChar;
    ReverseQBytesOrder((PBYTE)(&dwAnsi));

    if (dwAnsi >= cg_dwQByteAnsiToSurrogateStart)
    {
        // Maps to a surrogate pair: needs room for 2 WCHARs
        return (dwBufLen >= 2) ? QByteAnsiToDoubleUnicode(dwAnsi, pwchUnicode) : -1;
    }

    // Maps to a single WCHAR: needs room for 1 WCHAR
    return (dwBufLen >= 1) ? QByteAnsiToSingleUnicode(dwAnsi, pwchUnicode) : -1;
}
// Double-byte Ansi char to Unicode.
//
// pbyAnsi: points at the 2 Ansi bytes (GB standard byte order).
//
// Returns the Unicode char code; 0 means failure (the code page 936
// conversion did not produce exactly one WCHAR).
WCHAR DByteAnsiToUnicode(const BYTE* pbyAnsi)
{
    const WORD wAnsi = *(UNALIGNED WORD*)pbyAnsi;

    // Codes changed from GBK to GB18030, i.e. not compatible between
    // CP936 and CP54936: these are taken from the override table first.
    const int ncChanged = (int)(sizeof(asAnsiCodeChanged) / sizeof(SAnsiCodeChanged));
    for (int k = 0; k < ncChanged; k++)
    {
        if (wAnsi == asAnsiCodeChanged[k].wchAnsiNew)
        {
            return asAnsiCodeChanged[k].wchUnicode;
        }
    }

    // Not in the changed-code list: the code is the same as in GBK / CP936
    // (most DByte Ansi char codes are compatible from GBK to GB18030).
    WCHAR wchUnicode;
    const int cLen = MultiByteToWideChar(936, MB_PRECOMPOSED,
                                         (PCCH)pbyAnsi, 2,
                                         &wchUnicode, 1);
    if (cLen != 1)
    {
        return 0;
    }
    return wchUnicode;
}
// API: high level service for Ansi to Unicode.
//
// pbyAnsiStr:     input Ansi string.
// ncAnsiStrSize:  input length, in char (bytes).
// pwchUnicodeBuf: output buffer for the Unicode string.
// ncBufLen:       output buffer length, in WCHAR.
//
// Returns the resulting Unicode string length (in WCHAR);
// returns -1 on failure (buffer overflow, etc.).
//
// Notes:
// - The buffer check is conservative: conversion stops (and -1 is returned)
//   as soon as fewer than 4 free WCHARs remain.
// - A 2- or 4-byte char without a corresponding Unicode char is written
//   as '?'.
// - A byte that does not start any recognized char is skipped.
int AnsiStrToUnicodeStr(
    const BYTE* pbyAnsiStr,
    int ncAnsiStrSize,      // In char
    PWCH pwchUnicodeBuf,
    int ncBufLen)           // In WCHAR
{
    int nCharLen;
    int ncUnicodeBuf = 0;
    int i;  // Declared outside the loop: it is examined after the loop ends

    for (i = 0; i < ncAnsiStrSize; )
    {
        if (ncUnicodeBuf > (ncBufLen-4))
        {
            // Buffer overflow
            break;
        }

        if (*pbyAnsiStr < 0x80)
        {
            // 1 byte Ansi char
            *pwchUnicodeBuf = (WCHAR)*pbyAnsiStr;
            pwchUnicodeBuf ++;
            ncUnicodeBuf ++;
            i++;
            pbyAnsiStr++;
        }
        else if ((i+1 < ncAnsiStrSize) && pbyAnsiStr[1] >= 0x40)
        {
            // 2 byte Ansi char
            *pwchUnicodeBuf = DByteAnsiToUnicode(pbyAnsiStr);
            if (*pwchUnicodeBuf == 0)
            {
                // No corresponding Unicode char
                *pwchUnicodeBuf = '?';
            }
            pwchUnicodeBuf ++;
            ncUnicodeBuf ++;
            i += 2;
            pbyAnsiStr += 2;
        }
        else if ((i+3 < ncAnsiStrSize)
                 && IsValidQByteAnsiLeadByte(pbyAnsiStr[0])
                 && IsValidQByteAnsiTailByte(pbyAnsiStr[1])
                 && IsValidQByteAnsiLeadByte(pbyAnsiStr[2])
                 && IsValidQByteAnsiTailByte(pbyAnsiStr[3]))
        {
            // 4 byte Ansi char (QByte GB char)
            nCharLen = QByteAnsiToUnicode(pbyAnsiStr, pwchUnicodeBuf, 4);
            if (nCharLen < 0)
            {
                // Invalid Ansi char input, or buffer overflow, etc.
                // Should never happen; indicates an internal error
                ASSERT(FALSE);
                break;
            }
            else if (nCharLen == 0)
            {
                // Has no corresponding Unicode char
                *pwchUnicodeBuf = '?';
                pwchUnicodeBuf ++;
                ncUnicodeBuf ++;
            }
            else
            {
                ASSERT(nCharLen <= 2);
                pwchUnicodeBuf += nCharLen;
                ncUnicodeBuf += nCharLen;
            }
            i += 4;
            pbyAnsiStr += 4;
        }
        else
        {
            // Invalid Ansi char: skip one byte
            i++;
            pbyAnsiStr++;
        }
    }

    // The loop was left early: buffer overflow or conversion failure
    if (i < ncAnsiStrSize)
    {
        return -1;
    }

    return ncUnicodeBuf;
}
// ******************************************************
// Testing program
// ******************************************************
/*
Sample mappings kept for reference (Unicode char and Ansi bytes):

"\u0080", <0x81;0x30;0x81;0x30>
"\u00A3", <0x81;0x30;0x84;0x35>
"\u00A4", <0xA1;0xE8>
"\u00A5", <0x81;0x30;0x84;0x36>
"\u00A6", <0x81;0x30;0x84;0x37>
"\u00A7", <0xA1;0xEC>
"\u00A8", <0xA1;0xA7>
"\u00A9", <0x81;0x30;0x84;0x38>
"\u00AF", <0x81;0x30;0x85;0x34>
"\u00B0", <0xA1;0xE3>
"\u00B1", <0xA1;0xC0>
"\u00B2", <0x81;0x30;0x85;0x35>

{0x20AC, 0xe3a2}, {0x01f9, 0xbfa8}, {0x303e, 0x89a9}, {0x2ff0, 0x8aa9}, {0x2ff1, 0x8ba9},

50EF 836A
50F0 836B
50F1 836C
50F2 836D
*/
#if 0
// Disabled manual smoke test (compiled out by "#if 0").
// It runs UnicodeStrToAnsiStr and AnsiStrToUnicodeStr once each over the
// sample data below.  The converted output and the return values are not
// checked; the results are meant to be inspected in a debugger.
// Always returns 0.
int test (void)
{
    // Unicode input; the trailing comments give the expected Ansi bytes
    const WCHAR awchUnicodeStr[] = {
        0x01, 0x7f, 0x80, 0x81, 0x82, 0xa2,
        0xa3, // 0x81;0x30;0x84;0x35
        0xa4, // 0xA1;0xE8
        0xa5, // 0x81;0x30;0x84;0x36
        0xa6, // 0x81;0x30;0x84;0x37
        0xaf, // 0x81;0x30;0x85;0x34
        0xb0, // 0xA1;0xE3
        0xb1, // 0xA1;0xC0
        0xb6, // 0x81;0x30;0x85;0x39
        0xb7, // 0xA1;0xA4
        // Some normal DByte Ansi chars
        0x50ef, // 0x83, 0x6A
        0x50f2, // 0x83, 0x6D
        // Some Ansi char codes changed in the new standard
        0x20ac, // 0xa2, 0xe3
        0xE76C, // not (0xa2, 0xe3), should be some QByte char
        0x2ff0, // 0xa9, 0x8A
        0x2ff1, // 0xa9, 0x8B
        0x4723, // 0xFE, 0x80
        // Ansi chars around DC00 to E000
        0xd7ff, // 0x83, 0x36, 0xC7, 0x38
        0xe76c, // 0x83, 0x36, 0xC7, 0x39
        0xE76B, // 0xA2, 0xB0
        0xffff, // 0x84, 0x31, 0xa4, 0x39,
        0x00};

    // Output buffer: 2 bytes per input byte plus 5 spare bytes
    char* pchAnsiStr = new char[sizeof(awchUnicodeStr)*2+5];
    UnicodeStrToAnsiStr(awchUnicodeStr, sizeof(awchUnicodeStr)/sizeof(WCHAR), pchAnsiStr, sizeof(awchUnicodeStr)*2+5);

    delete[] pchAnsiStr;

    // Ansi input; the trailing comments give the expected Unicode chars
    BYTE abyAnsiStr2[] = {
        0x81, 0x30, 0x81, 0x30,
        0x81, 0x30, 0x84, 0x35,
        0xA1, 0xE8,
        0x81, 0x30, 0x84, 0x36,
        0x81, 0x30, 0x84, 0x37,
        0xA1, 0xEC,
        0xA1, 0xA7,
        0x81, 0x30, 0x84, 0x38,
        0x81, 0x30, 0x85, 0x34,
        0xA1, 0xE3,
        0xA1, 0xC0,
        0x81, 0x30, 0x85, 0x35,
        // Testing D800 to DE00
        0x82, 0x35, 0x8f, 0x33, // 0x9FA6
        0x83, 0x36, 0xC7, 0x38, // 0xD7FF
        0xA2, 0xB0, // 0xE76B
        0x83, 0x36, 0xC7, 0x39, // 0xE76C
        // Testing last char in BMP
        0x84, 0x31, 0xa4, 0x39, // 0xFFFF
        // Some char codes changed in the new GB standard
        0xa2, 0xe3, // 0x20AC,
        0xa8, 0xbf, // 0x01f9,
        0xa9, 0x89, // 0x303e,
        0xa9, 0x8a, // 0x2ff0,
        0xa9, 0x8b, // 0x2ff1,
        0xFE, 0x9F, // 0x4dae
        0x83, 0x6A, // 50EF
        0x83, 0x6B, // 50F0
        0x83, 0x6C, // 50F1
        0x83, 0x6D // 50F2
    };

    // Output buffer: one WCHAR per input byte plus 3 spare WCHARs
    WCHAR* pwchUnicodeStr2 = new WCHAR[sizeof(abyAnsiStr2)+3];
    AnsiStrToUnicodeStr(abyAnsiStr2, sizeof(abyAnsiStr2), pwchUnicodeStr2, sizeof(abyAnsiStr2)+3);

    delete[] pwchUnicodeStr2;

    return 0;
}
#endif
|