#include "stdafx.h"
#include "conveng.h"
#include "convdata.tbl"

// This file contains 3 parts:
//   First part,  some basic service functions for Ansi char format convert,
//                distance/advance calculation and binary search algorithms
//                copied from STL
//   Second part, Unicode to Ansi
//   Third part,  Ansi to Unicode

// ****************************************************************************
// First part, Ansi char convert functions
//
// This part does not use any data base in the .tbl file
// ****************************************************************************

// Binary search algorithm
// Copied from STL, with only very little modification
//
// For a range [first, last) sorted in ascending order, returns the first
// position whose element is not less than value (last when every element
// is less than value).
template <class RandomAccessIterator, class T>
RandomAccessIterator __lower_bound(
    RandomAccessIterator first,
    RandomAccessIterator last,
    const T& value)
{
    INT_PTR len = last - first;
    INT_PTR half;
    RandomAccessIterator middle;

    while (len > 0) {
        half = len / 2;
        middle = first + half;
        if (*middle < value) {
            // Answer lies strictly after middle
            first = middle + 1;
            len = len - half - 1;
        } else {
            len = half;
        }
    }
    return first;
}

// For a range [first, last) sorted in ascending order, returns the first
// position whose element is greater than value (last when no element is
// greater than value).
template <class RandomAccessIterator, class T>
RandomAccessIterator __upper_bound(
    RandomAccessIterator first,
    RandomAccessIterator last,
    const T& value)
{
    // Signed, pointer-sized length: same type as __lower_bound uses, so a
    // pointer difference is never truncated or reinterpreted as unsigned.
    INT_PTR len = last - first;
    INT_PTR half;
    RandomAccessIterator middle;

    while (len > 0) {
        half = len / 2;
        middle = first + half;
        if (!(value < *middle)) {
            // middle is not greater than value, answer lies after it
            first = middle + 1;
            len = len - half - 1;
        } else {
            len = half;
        }
    }
    return first;
}

// Returns nonzero when Value lies in the half-open range [Low, High).
template <class T>
inline BOOL ValueIn(
    T Value,
    T Low,
    T High)
{
    return (Value >= Low && Value < High);
}

// Is wchUnicode inside [cg_wchSurrogateLeadWordLow, cg_wchSurrogateLeadWordHigh)?
inline BOOL IsValidSurrogateLeadWord(
    WCHAR wchUnicode)
{
    return ValueIn(wchUnicode, cg_wchSurrogateLeadWordLow, cg_wchSurrogateLeadWordHigh);
}

// Is wchUnicode inside [cg_wchSurrogateTailWordLow, cg_wchSurrogateTailWordHigh)?
inline BOOL IsValidSurrogateTailWord(
    WCHAR wchUnicode)
{
    return ValueIn(wchUnicode, cg_wchSurrogateTailWordLow, cg_wchSurrogateTailWordHigh);
}

// Is byAnsi inside [cg_byQByteAnsiLeadByteLow, cg_byQByteAnsiLeadByteHigh)?
inline BOOL IsValidQByteAnsiLeadByte(
    BYTE byAnsi)
{
    return ValueIn(byAnsi, cg_byQByteAnsiLeadByteLow, cg_byQByteAnsiLeadByteHigh);
}

// Is byAnsi inside [cg_byQByteAnsiTailByteLow, cg_byQByteAnsiTailByteHigh)?
inline BOOL IsValidQByteAnsiTailByte(
    BYTE byAnsi)
{
    return ValueIn(byAnsi, cg_byQByteAnsiTailByteLow, cg_byQByteAnsiTailByteHigh);
}

// Generate QByte
Ansi. The Ansi char is in DWORD format, // in another word, it's in reverse order of GB18030 standard DWORD QByteAnsiBaseAddOffset( DWORD dwBaseAnsi, // In reverse order int nOffset) { DWORD dwAnsi = dwBaseAnsi; PBYTE pByte = (PBYTE)&dwAnsi; // dwOffset should less than 1M ASSERT (nOffset < 0x100000); nOffset += pByte[0] - 0x30; pByte[0] = 0x30 + nOffset % 10; nOffset /= 10; nOffset += pByte[1] - 0x81; pByte[1] = 0x81 + nOffset % 126; nOffset /= 126; nOffset += pByte[2] - 0x30; pByte[2] = 0x30 + nOffset % 10; nOffset /= 10; nOffset += pByte[3] - 0x81; pByte[3] = 0x81 + nOffset % 126; nOffset /= 126; ASSERT(nOffset == 0); return dwAnsi; } // Get "distance" of 2 QByte Ansi int CalcuDistanceOfQByteAnsi( DWORD dwAnsi1, // In reverse order DWORD dwAnsi2) // In reverse order { signed char* pschAnsi1 = (signed char*)&dwAnsi1; signed char* pschAnsi2 = (signed char*)&dwAnsi2; int nDistance = 0; nDistance += (pschAnsi1[0] - pschAnsi2[0]); nDistance += (pschAnsi1[1] - pschAnsi2[1])*10; nDistance += (pschAnsi1[2] - pschAnsi2[2])*1260; nDistance += (pschAnsi1[3] - pschAnsi2[3])*12600; return nDistance; } // Reverse 4 Bytes order, from DWORD format to GB format, // or GB to DWORD void ReverseQBytesOrder( PBYTE pByte) { BYTE by; by = pByte[0]; pByte[0] = pByte[3]; pByte[3] = by; by = pByte[1]; pByte[1] = pByte[2]; pByte[2] = by; return; } // **************************************************************************** // Second part, Unicode to Ansi // **************************************************************************** // ------------------------------------------------ // Two helper function for UnicodeToAnsi // return Ansi char code // the Ansi is in GB standard order (not Word value order) // // Unicode to double bytes Ansi char // // Return Ansi char code, 0 means fail (internal error, etc.) 
// WORD UnicodeToDByteAnsi( WCHAR wchUnicode) { char achAnsiBuf[4]; WORD wAnsi = 0; int cLen = 0; // Code changed from GBK to GB18030, or code not compatible // from CP936 to CP54936 for (int i = 0; i < sizeof(asAnsiCodeChanged)/sizeof(SAnsiCodeChanged); i++) { if (wchUnicode == asAnsiCodeChanged[i].wchUnicode) { wAnsi = asAnsiCodeChanged[i].wchAnsiNew; goto Exit; } } // Not in Changed code list, that is same with GBK, or CP936 // (Most DByte Ansi char code should compatible from GBK to GB18030) cLen = WideCharToMultiByte(936, WC_COMPOSITECHECK, &wchUnicode, 1, achAnsiBuf, sizeof(achAnsiBuf)-1, NULL, NULL); if (cLen != 2) { ASSERT(cLen == 2); wAnsi = 0; } else { wAnsi = *(PWORD)achAnsiBuf; } Exit: return wAnsi; } // Unicode to quad bytes Ansi char // // Return Ansi char code // 0 means fail (interal error) // DWORD UnicodeToQByteAnsi( int nSection, int nOffset) { DWORD dwBaseAnsi; if (nSection < 0 || nSection >= sizeof(adwAnsiQBytesAreaStartValue)/sizeof(DWORD)) { ASSERT(FALSE); return 0; } dwBaseAnsi = adwAnsiQBytesAreaStartValue[nSection]; // Check adwAnsiQByteAreaStartValue array is correctly #ifdef _DEBUG int ncQByteAnsiNum = 0; for (int i = 0; i < nSection; i++) { // Calcu QByte Ansi char numbers ncQByteAnsiNum += awchAnsiDQByteBound[2*i+1] - awchAnsiDQByteBound[2*i]; } ASSERT(dwBaseAnsi == QByteAnsiBaseAddOffset(cg_dwQByteAnsiStart, ncQByteAnsiNum)); #endif DWORD dwAnsi = QByteAnsiBaseAddOffset(dwBaseAnsi, nOffset); // Value order to standard order ReverseQBytesOrder((PBYTE)(&dwAnsi)); return dwAnsi; } // --------------------------------------------------------- // Two function support 2 bytes Unicode (BMP) // and 4 bytes Unicode (Surrogate) translate to Ansi // 2 bytes Unicode (BMP) // Return Ansi str len, when success, should be 2 or 4; // return 0 means fail (internal error, etc.) 
int UnicodeToAnsi( WCHAR wchUnicode, char* pchAnsi, DWORD dwBufSize) { // Classic Unicode, not support surrogate in this function ASSERT(!IsValidSurrogateLeadWord(wchUnicode) && !IsValidSurrogateTailWord(wchUnicode)); DWORD lAnsiLen = 0; const WORD* p; INT_PTR i; // ASCII, 0 - 0x7f if (wchUnicode <= 0x7f) { *pchAnsi = (char)wchUnicode; lAnsiLen = 1; goto Exit; } // BMP, 4 byte or 2 byte p = __lower_bound(awchAnsiDQByteBound, awchAnsiDQByteBound + sizeof(awchAnsiDQByteBound)/sizeof(WCHAR), wchUnicode); if (p == awchAnsiDQByteBound + sizeof(awchAnsiDQByteBound)/sizeof(WCHAR)) { p --; } else if (wchUnicode < *p) { p --; } else if (wchUnicode == *p) { } else { ASSERT(FALSE); } i = p - awchAnsiDQByteBound; ASSERT(i >= 0); // Stop when >= *(((PWORD)asAnsi2ByteArea) + i); if (i%2) { // Odd, in 2 bytes area WORD wAnsi = UnicodeToDByteAnsi(wchUnicode); if (wAnsi && dwBufSize >= 2) { *(UNALIGNED WORD*)pchAnsi = wAnsi; lAnsiLen = 2; } else { lAnsiLen = 0; } } else { // Duel, in 4 bytes area DWORD dwAnsi = UnicodeToQByteAnsi ((int)i/2, wchUnicode - awchAnsiDQByteBound[i]); if (dwAnsi && dwBufSize >= 4) { *(UNALIGNED DWORD*)pchAnsi = dwAnsi; lAnsiLen = 4; } else { lAnsiLen = 0; } } Exit: return lAnsiLen; } // 4 bytes Unicode (Surrogate) // Return Ansi str length, when success, should be 4 // return 0 means fail (Buffer overflow) int SurrogateToAnsi( PCWCH pwchUnicode, PCHAR pchAnsi, DWORD dwBufSize) { ASSERT(IsValidSurrogateLeadWord(pwchUnicode[0])); ASSERT(IsValidSurrogateTailWord(pwchUnicode[1])); // dwOffset is ISO char code - 0x10000 DWORD dwOffset = ((pwchUnicode[0] - cg_wchSurrogateLeadWordLow)<<10) + (pwchUnicode[1] - cg_wchSurrogateTailWordLow) + 0x10000 - 0x10000; if (dwBufSize < 4) { return 0; } *(UNALIGNED DWORD*)pchAnsi = QByteAnsiBaseAddOffset (cg_dwQByteAnsiToSurrogateStart, dwOffset); ReverseQBytesOrder((PBYTE)pchAnsi); return 4; } // API: high level service for Unicode to Ansi // return result Ansi str length (in byte) // return -1 means fail (Buffer overflow, 
internal error, etc.) int UnicodeStrToAnsiStr( PCWCH pwchUnicodeStr, int ncUnicodeStr, // in WCHAR PCHAR pchAnsiStrBuf, int ncAnsiStrBufSize) // in BYTE { int ncAnsiStr = 0; int ncAnsiCharSize; for (int i = 0; i < ncUnicodeStr; i++, pwchUnicodeStr++) { if (ncAnsiStr > (ncAnsiStrBufSize-4)) { // Buffer overflow break; } if (IsValidSurrogateLeadWord(pwchUnicodeStr[0])) { if ((i+1 < ncUnicodeStr) && (IsValidSurrogateTailWord(pwchUnicodeStr[1]))) { ncAnsiCharSize = SurrogateToAnsi(pwchUnicodeStr, pchAnsiStrBuf, 4); ASSERT(ncAnsiCharSize == 4); if (ncAnsiCharSize == 0) { ASSERT(FALSE); break; } ncAnsiStr += ncAnsiCharSize; pchAnsiStrBuf += ncAnsiCharSize; pwchUnicodeStr++; i++; } else { // Invalide Uncode char, skip } } else if (*pwchUnicodeStr == 0) { *pchAnsiStrBuf = 0; pchAnsiStrBuf ++; ncAnsiStr ++; } else { ncAnsiCharSize = UnicodeToAnsi(*pwchUnicodeStr, pchAnsiStrBuf, 4); if (ncAnsiCharSize == 0) { ASSERT(FALSE); break; } pchAnsiStrBuf += ncAnsiCharSize; ncAnsiStr += ncAnsiCharSize; } } if (i < ncUnicodeStr) { return -1; } return ncAnsiStr; } // **************************************************************************** // Third part, Ansi to Unicode // **************************************************************************** // Return Unicode number (number always equal to 1 when success) // return 0 if can't find corresponding Unicode // -1 means fail (internal error, etc.) 
int QByteAnsiToSingleUnicode( DWORD dwAnsi, PWCH pwchUnicode) { const DWORD* p; INT_PTR i; // 0x8431a439(cg_dwQByteAnsiToBMPLast) to 0x85308130 haven't Unicode corresponding // 0x85308130 to 0x90308130(cg_dwQByteAnsiToSurrogateStart) are reserved zone, // haven't Unicode corresponding if (dwAnsi > cg_dwQByteAnsiToBMPLast) { return 0; } // Invalid input value if (dwAnsi < adwAnsiQBytesAreaStartValue[0]) { return -1; } p = __lower_bound(adwAnsiQBytesAreaStartValue, adwAnsiQBytesAreaStartValue + sizeof(adwAnsiQBytesAreaStartValue)/sizeof(DWORD), dwAnsi); if (p == adwAnsiQBytesAreaStartValue + sizeof(adwAnsiQBytesAreaStartValue)/sizeof(DWORD)) { p --; } else if (dwAnsi < *p) { p --; } else if (dwAnsi == *p) { } else { ASSERT(FALSE); } i = p - adwAnsiQBytesAreaStartValue; if (i < 0) { ASSERT(i >= 0); return -1; } *pwchUnicode = awchAnsiDQByteBound[2*i] + CalcuDistanceOfQByteAnsi(dwAnsi, *p); #ifdef _DEBUG { int nAnsiCharDistance = CalcuDistanceOfQByteAnsi(dwAnsi, *p); ASSERT(nAnsiCharDistance >= 0); WCHAR wchUnicodeDbg; if ((p+1) < adwAnsiQBytesAreaStartValue + sizeof(adwAnsiQBytesAreaStartValue)/sizeof(DWORD)) { nAnsiCharDistance = CalcuDistanceOfQByteAnsi(dwAnsi, *(p+1)); wchUnicodeDbg = awchAnsiDQByteBound[2*i+1] + nAnsiCharDistance; } else if ((p+1) == adwAnsiQBytesAreaStartValue + sizeof(adwAnsiQBytesAreaStartValue)/sizeof(DWORD)) { nAnsiCharDistance = CalcuDistanceOfQByteAnsi(dwAnsi, 0x8431A530); wchUnicodeDbg = 0x10000 + nAnsiCharDistance; } else { ASSERT(FALSE); } ASSERT(nAnsiCharDistance < 0); ASSERT(wchUnicodeDbg == *pwchUnicode); } #endif return 1; } // Return Unicode number (number always 2 when success) // return 0 if can't find corresponding Unicode int QByteAnsiToDoubleUnicode( DWORD dwAnsi, PWCH pwchUnicode) { int nDistance = CalcuDistanceOfQByteAnsi(dwAnsi, cg_dwQByteAnsiToSurrogateStart); ASSERT (nDistance >= 0); if (nDistance >= 0x100000) { return 0; } pwchUnicode[1] = nDistance % 0x400 + 0xDC00; pwchUnicode[0] = nDistance / 0x400 + 0xD800; return 2; 
} // Return Unicode number (1 or 2 when success) // return 0 if can't find corresponding Unicode // return -1 if fail (Buffer overflow, invalid GB char code input, // internal error, etc.) int QByteAnsiToUnicode( const BYTE* pbyAnsiChar, PWCH pwchUnicode, DWORD dwBufLen) // In WCHAR { DWORD dwAnsi; int nLen = -1; if ( IsValidQByteAnsiLeadByte(pbyAnsiChar[0]) && IsValidQByteAnsiTailByte(pbyAnsiChar[1]) && IsValidQByteAnsiLeadByte(pbyAnsiChar[2]) && IsValidQByteAnsiTailByte(pbyAnsiChar[3])) { } else { return -1; // Invalid char } dwAnsi = *(UNALIGNED DWORD*)pbyAnsiChar; ReverseQBytesOrder((PBYTE)(&dwAnsi)); if (dwAnsi >= cg_dwQByteAnsiToSurrogateStart) { if (dwBufLen >= 2) { nLen = QByteAnsiToDoubleUnicode(dwAnsi, pwchUnicode); } } else { if (dwBufLen >= 1) { nLen = QByteAnsiToSingleUnicode(dwAnsi, pwchUnicode); } } return nLen; } // Unicode to double bytes Ansi char // Return: Unicode char code, 0 means fail (internal error, etc.) WCHAR DByteAnsiToUnicode( const BYTE* pbyAnsi) { WORD wAnsi = *(UNALIGNED WORD*)pbyAnsi; int cLen = 1; WCHAR wchUnicode; // Code changed from GBK to GB18030, or code not compatible // from CP936 to CP54936 for (int i = 0; i < sizeof(asAnsiCodeChanged)/sizeof(SAnsiCodeChanged); i++) { if (wAnsi == asAnsiCodeChanged[i].wchAnsiNew) { wchUnicode = asAnsiCodeChanged[i].wchUnicode; goto Exit; } } // Not in Changed code list, that is same with GBK, or CP936 // (Most DByte Ansi char code should compatible from GBK to GB18030) cLen = MultiByteToWideChar(936, MB_PRECOMPOSED, (PCCH)pbyAnsi, 2, &wchUnicode, 1); if (cLen != 1) { wchUnicode = 0; } Exit: return wchUnicode; } // API: High level service for Ansi to Unicode // return Unicode str length (in WCHAR) // return -1 means fail (Buffer overflow, etc.) 
int AnsiStrToUnicodeStr( const BYTE* pbyAnsiStr, int ncAnsiStrSize, // In char PWCH pwchUnicodeBuf, int ncBufLen) // In WCHAR { int nCharLen; int ncUnicodeBuf = 0; for (int i = 0; i < ncAnsiStrSize; ) { if (ncUnicodeBuf > (ncBufLen-4)) { // Buffer overflow break; } // 1 byte Ansi char if (*pbyAnsiStr < 0x80) { *pwchUnicodeBuf = (WCHAR)*pbyAnsiStr; pwchUnicodeBuf ++; ncUnicodeBuf ++; i++; pbyAnsiStr++; // 2 byte Ansi char } else if ((i+1 < ncAnsiStrSize) && pbyAnsiStr[1] >= 0x40) { *pwchUnicodeBuf = DByteAnsiToUnicode(pbyAnsiStr); if (*pwchUnicodeBuf == 0) { *pwchUnicodeBuf = '?'; } pwchUnicodeBuf ++; ncUnicodeBuf ++; i += 2; pbyAnsiStr += 2; // 4 byte Ansi char } else if ((i+3 < ncAnsiStrSize) && IsValidQByteAnsiLeadByte(pbyAnsiStr[0]) && IsValidQByteAnsiTailByte(pbyAnsiStr[1]) && IsValidQByteAnsiLeadByte(pbyAnsiStr[2]) && IsValidQByteAnsiTailByte(pbyAnsiStr[3])) { // QByte GB char nCharLen = QByteAnsiToUnicode(pbyAnsiStr, pwchUnicodeBuf, 4); if (nCharLen < 0) { ASSERT(FALSE); // Invalid Ansi char input, or buffer overflow, etc. 
// Should never happen but an internal error break; } else if (nCharLen == 0) { // hasn't corresponding Unicode Char *pwchUnicodeBuf = '?'; pwchUnicodeBuf ++; ncUnicodeBuf ++; } else if (nCharLen > 0) { ASSERT(nCharLen <= 2); pwchUnicodeBuf += nCharLen; ncUnicodeBuf += nCharLen; } else { ASSERT(FALSE); } i += 4; pbyAnsiStr += 4; // Invalid Ansi char } else { // Invalid i++; pbyAnsiStr++; } } if (i < ncAnsiStrSize) { return -1; } return ncUnicodeBuf; } // ****************************************************** // Testing program // ****************************************************** /* "\u0080", <0x81;0x30;0x81;0x30> "\u00A3", <0x81;0x30;0x84;0x35> "\u00A4", <0xA1;0xE8> "\u00A5", <0x81;0x30;0x84;0x36> "\u00A6", <0x81;0x30;0x84;0x37> "\u00A7", <0xA1;0xEC> "\u00A8", <0xA1;0xA7> "\u00A9", <0x81;0x30;0x84;0x38> "\u00AF", <0x81;0x30;0x85;0x34> "\u00B0", <0xA1;0xE3> "\u00B1", <0xA1;0xC0> "\u00B2", <0x81;0x30;0x85;0x35> {0x20AC, 0xe3a2}, {0x01f9, 0xbfa8}, {0x303e, 0x89a9}, {0x2ff0, 0x8aa9}, {0x2ff1, 0x8ba9}, 50EF 836A 50F0 836B 50F1 836C 50F2 836D */ #if 0 int test (void) { const WCHAR awchUnicodeStr[] = {0x01, 0x7f, 0x80, 0x81, 0x82, 0xa2, 0xa3, // 0x81;0x30;0x84;0x35 0xa4, // 0xA1;0xE8 0xa5, // 0x81;0x30;0x84;0x36 0xa6, // 0x81;0x30;0x84;0x37 0xaf, // 0x81;0x30;0x85;0x34 0xb0, // 0xA1;0xE3 0xb1, // 0xA1;0xC0 0xb6, // 0x81;0x30;0x85;0x39 0xb7, // 0xA1;0xA4 // Some normal DByte Ansi char 0x50ef, // 0x83, 0x6A 0x50f2, // 0x83, 0x6D // Some ansi char code changed in new standard 0x20ac, // 0xa2, 0xe3 0xE76C, // not (0xa2, 0xe3), should some QByte char 0x2ff0, // 0xa9, 0x8A 0x2ff1, // 0xa9, 0x8B 0x4723, // 0xFE, 0x80 // Ansi char arround DC00 to E000 0xd7ff, // 0x83, 0x36, 0xC7, 0x38 0xe76c, // 0x83, 0x36, 0xC7, 0x39 0xE76B, // 0xA2, 0xB0 0xffff, // 0x84, 0x31, 0xa4, 0x39, 0x00}; char* pchAnsiStr = new char[sizeof(awchUnicodeStr)*2+5]; UnicodeStrToAnsiStr(awchUnicodeStr, sizeof(awchUnicodeStr)/sizeof(WCHAR), pchAnsiStr, sizeof(awchUnicodeStr)*2+5); delete[] pchAnsiStr; BYTE 
abyAnsiStr2[] = { 0x81, 0x30, 0x81, 0x30, 0x81, 0x30, 0x84, 0x35, 0xA1, 0xE8, 0x81, 0x30, 0x84, 0x36, 0x81, 0x30, 0x84, 0x37, 0xA1, 0xEC, 0xA1, 0xA7, 0x81, 0x30, 0x84, 0x38, 0x81, 0x30, 0x85, 0x34, 0xA1, 0xE3, 0xA1, 0xC0, 0x81, 0x30, 0x85, 0x35, // Testing D800 to DE00 0x82, 0x35, 0x8f, 0x33, // 0x9FA6 0x83, 0x36, 0xC7, 0x38, // 0xD7FF 0xA2, 0xB0, // 0xE76B 0x83, 0x36, 0xC7, 0x39, // 0xE76C // Testing last char in BMP 0x84, 0x31, 0xa4, 0x39, // 0xFFFF // Some char code changed in new GB standard 0xa2, 0xe3, // 0x20AC, 0xa8, 0xbf, // 0x01f9, 0xa9, 0x89, // 0x303e, 0xa9, 0x8a, // 0x2ff0, 0xa9, 0x8b, // 0x2ff1, 0xFE, 0x9F, // 0x4dae 0x83, 0x6A, // 50EF 0x83, 0x6B, // 50F0 0x83, 0x6C, // 50F1 0x83, 0x6D // 50F2 }; WCHAR* pwchUnicodeStr2 = new WCHAR[sizeof(abyAnsiStr2)+3]; AnsiStrToUnicodeStr(abyAnsiStr2, sizeof(abyAnsiStr2), pwchUnicodeStr2, sizeof(abyAnsiStr2)+3); delete[] pwchUnicodeStr2; return 0; } #endif