|
|
#include "private.h"
#include "detcbase.h"
#include "codepage.h"
#include "detcjpn.h"
#include "detckrn.h"
#include "fechrcnv.h"
#include "ichrcnv.h"
#include "cpdetect.h"
#include <tchar.h>
// Conversion-type codes. Each value appears to be the sum of the two
// bTypeUUIW endpoint bits described in the comment above aEncodingInfo
// (8 = UTF-7/UTF-8, 4 = Unicode, 2 = Windows code page, 1 = Internet
// encoding); e.g. CONV_UU = 8 + 4, CONV_WI = 2 + 1.
#define CONV_UU 12
#define CONV_UUW 10
#define CONV_UUWI 9
#define CONV_UW 6
#define CONV_UWI 5
#define CONV_WI 3
// Number of trailing positions probed when a source chunk may end in the
// middle of a multibyte character (see the back-off loops in this file).
#define MAX_CHAR_SIZE 4
// Maps code page 50000 to 1252; every other value is passed through unchanged.
#define MAPUSERDEF(x) (((x) == 50000) ? 1252 : (x))
// TRUE for CP_USER_DEFINED; otherwise the result of IsValidCodePage(x).
#define CONVERT_IS_VALIDCODEPAGE(x) (((x) == CP_USER_DEFINED) ? TRUE: IsValidCodePage(x))
// Value used in ENCODINGINFO::dwFlags (set on the US-ASCII and ISO-8859
// entries of aEncodingInfo).
#define CONV_CHK_NLS 0x00000001
// One row of the encoding table: the encoding identifier, the Windows code
// page associated with it, its bTypeUUIW type bits, a code-page validity
// state, and per-encoding option flags.
struct ENCODINGINFO { DWORD dwEncoding; DWORD dwCodePage; BYTE bTypeUUIW; CP_STATE nCP_State ; // whether this is a valid windows codepage ?
DWORD dwFlags; // gives us more flexibility to handle different encodings differently
};
// Presumably the Unicode byte-order signature; holds the value 0xFFFE.
// NOTE(review): the identifier is misspelled ("Uniocde"); it is left as-is
// because code outside this view may reference it.
static WCHAR UniocdeSignature = { 0xFFFE } ;
/*
    Encoding type bits (bTypeUUIW):
        Bit 4 (16) - Unicode <-> Internet Encoding
        Bit 3 (8)  - UTF8, UTF7
        Bit 2 (4)  - Unicode
        Bit 1 (2)  - Windows CodePage
        Bit 0 (1)  - Internet Encoding
    P.S. If bit 4 is set, the conversion goes directly between Unicode and the
    Internet Encoding, with no intermediate Windows CodePage step.
*/
// These code pages, including Unicode, need a special converter
// Table of encodings known to the converter. Columns follow ENCODINGINFO:
// { dwEncoding, dwCodePage, bTypeUUIW, nCP_State, dwFlags }.
// Every entry starts with nCP_State == INVALID_CP.
static struct ENCODINGINFO aEncodingInfo[] = {
{ CP_JPN_SJ, 932, 0x02, INVALID_CP, 0 }, // W-Japanese Shift JIS
{ CP_CHN_GB, 936, 0x02, INVALID_CP, 0 }, // W-Simplified Chinese
{ CP_KOR_5601, 949, 0x02, INVALID_CP, 0 }, // W-Korean Unified Hangul
{ CP_TWN, 950, 0x02, INVALID_CP, 0 }, // W-Traditional Chinese
{ CP_UCS_2, 0, 0x04, INVALID_CP, 0 }, // U-Unicode
{ CP_UCS_2_BE, 0, 0x04, INVALID_CP, 0 }, // U-Unicode Big Endian
{ CP_1252, 1252, 0x02, INVALID_CP, 0 }, // W-Latin 1
{ CP_20127, 1252, 0x11, INVALID_CP, CONV_CHK_NLS }, // US ASCII
{ CP_ISO_8859_1, 1252, 0x11, INVALID_CP, CONV_CHK_NLS }, // I-ISO 8859-1 Latin 1
{ CP_ISO_8859_15, 1252, 0x11, INVALID_CP, CONV_CHK_NLS }, // I-ISO 8859-15 Latin 9
{ CP_AUTO, 1252, 0x01, INVALID_CP, 0 }, // General auto detect
{ CP_ISO_2022_JP, 932, 0x01, INVALID_CP, 0 }, // I-ISO 2022-JP No Halfwidth Katakana
{ CP_ISO_2022_JP_ESC, 932, 0x01, INVALID_CP, 0 }, // I-ISO 2022-JP w/esc Halfwidth Katakana
{ CP_ISO_2022_JP_SIO, 932, 0x01, INVALID_CP, 0 }, // I-ISO 2022-JP w/sio Halfwidth Katakana
{ CP_ISO_2022_KR, 949, 0x01, INVALID_CP, 0 }, // I-ISO 2022-KR
{ CP_ISO_2022_TW, 950, 0x01, INVALID_CP, 0 }, // I-ISO 2022-TW
{ CP_ISO_2022_CH, 936, 0x01, INVALID_CP, 0 }, // I-ISO 2022-CH
{ CP_JP_AUTO, 932, 0x01, INVALID_CP, 0 }, // JP auto detect
{ CP_CHS_AUTO, 936, 0x01, INVALID_CP, 0 }, // Simplified Chinese auto detect
{ CP_KR_AUTO, 949, 0x01, INVALID_CP, 0 }, // KR auto detect
{ CP_CHT_AUTO, 950, 0x01, INVALID_CP, 0 }, // Traditional Chinese auto detect
{ CP_CYRILLIC_AUTO, 1251, 0x01, INVALID_CP, 0 }, // Cyrillic auto detect
{ CP_GREEK_AUTO, 1253, 0x01, INVALID_CP, 0 }, // Greek auto detect
{ CP_ARABIC_AUTO, 1256, 0x01, INVALID_CP, 0 }, // Arabic auto detect
{ CP_EUC_JP, 932, 0x01, INVALID_CP, 0 }, // EUC Japanese
{ CP_EUC_CH, 936, 0x01, INVALID_CP, 0 }, // EUC Chinese
{ CP_EUC_KR, 949, 0x01, INVALID_CP, 0 }, // EUC Korean
{ CP_EUC_TW, 950, 0x01, INVALID_CP, 0 }, // EUC Taiwanese
{ CP_CHN_HZ, 936, 0x01, INVALID_CP, 0 }, // Simplified Chinese HZ-GB
{ CP_UTF_7, 0, 0x08, INVALID_CP, 0 }, // U-UTF7
{ CP_UTF_8, 0, 0x08, INVALID_CP, 0 }, // U-UTF8
};
// HTML name entity table for Latin-1 Supplement - from 0x00A0-0x00FF.
//
// Entry [c - NAME_ENTITY_OFFSET] is the HTML named character reference for
// Unicode character c. The strings must be pure-ASCII "&name;" references:
// they are copied into the destination buffer in place of a character that
// the destination code page cannot represent, so storing the raw character
// here would defeat the purpose of name-entitizing.
#define NAME_ENTITY_OFFSET 0x00A0
#define NAME_ENTITY_MAX 0x00FF
#define NAME_ENTITY_ENTRY 96
static CHAR *g_lpstrNameEntity[NAME_ENTITY_ENTRY] =
{
    "&nbsp;",   // "&#160;" -- no-break space = non-breaking space, U+00A0
    "&iexcl;",  // "&#161;" -- inverted exclamation mark, U+00A1
    "&cent;",   // "&#162;" -- cent sign, U+00A2
    "&pound;",  // "&#163;" -- pound sign, U+00A3
    "&curren;", // "&#164;" -- currency sign, U+00A4
    "&yen;",    // "&#165;" -- yen sign = yuan sign, U+00A5
    "&brvbar;", // "&#166;" -- broken bar = broken vertical bar, U+00A6
    "&sect;",   // "&#167;" -- section sign, U+00A7
    "&uml;",    // "&#168;" -- diaeresis = spacing diaeresis, U+00A8
    "&copy;",   // "&#169;" -- copyright sign, U+00A9
    "&ordf;",   // "&#170;" -- feminine ordinal indicator, U+00AA
    "&laquo;",  // "&#171;" -- left-pointing double angle quotation mark, U+00AB
    "&not;",    // "&#172;" -- not sign, U+00AC
    "&shy;",    // "&#173;" -- soft hyphen = discretionary hyphen, U+00AD
    "&reg;",    // "&#174;" -- registered sign = registered trade mark sign, U+00AE
    "&macr;",   // "&#175;" -- macron = spacing macron = overline, U+00AF
    "&deg;",    // "&#176;" -- degree sign, U+00B0
    "&plusmn;", // "&#177;" -- plus-minus sign = plus-or-minus sign, U+00B1
    "&sup2;",   // "&#178;" -- superscript two = superscript digit two, U+00B2
    "&sup3;",   // "&#179;" -- superscript three = superscript digit three, U+00B3
    "&acute;",  // "&#180;" -- acute accent = spacing acute, U+00B4
    "&micro;",  // "&#181;" -- micro sign, U+00B5
    "&para;",   // "&#182;" -- pilcrow sign = paragraph sign, U+00B6
    "&middot;", // "&#183;" -- middle dot = Georgian comma, U+00B7
    "&cedil;",  // "&#184;" -- cedilla = spacing cedilla, U+00B8
    "&sup1;",   // "&#185;" -- superscript one = superscript digit one, U+00B9
    "&ordm;",   // "&#186;" -- masculine ordinal indicator, U+00BA
    "&raquo;",  // "&#187;" -- right-pointing double angle quotation mark, U+00BB
    "&frac14;", // "&#188;" -- vulgar fraction one quarter, U+00BC
    "&frac12;", // "&#189;" -- vulgar fraction one half, U+00BD
    "&frac34;", // "&#190;" -- vulgar fraction three quarters, U+00BE
    "&iquest;", // "&#191;" -- inverted question mark, U+00BF
    "&Agrave;", // "&#192;" -- latin capital letter A with grave, U+00C0
    "&Aacute;", // "&#193;" -- latin capital letter A with acute, U+00C1
    "&Acirc;",  // "&#194;" -- latin capital letter A with circumflex, U+00C2
    "&Atilde;", // "&#195;" -- latin capital letter A with tilde, U+00C3
    "&Auml;",   // "&#196;" -- latin capital letter A with diaeresis, U+00C4
    "&Aring;",  // "&#197;" -- latin capital letter A with ring above, U+00C5
    "&AElig;",  // "&#198;" -- latin capital letter AE, U+00C6
    "&Ccedil;", // "&#199;" -- latin capital letter C with cedilla, U+00C7
    "&Egrave;", // "&#200;" -- latin capital letter E with grave, U+00C8
    "&Eacute;", // "&#201;" -- latin capital letter E with acute, U+00C9
    "&Ecirc;",  // "&#202;" -- latin capital letter E with circumflex, U+00CA
    "&Euml;",   // "&#203;" -- latin capital letter E with diaeresis, U+00CB
    "&Igrave;", // "&#204;" -- latin capital letter I with grave, U+00CC
    "&Iacute;", // "&#205;" -- latin capital letter I with acute, U+00CD
    "&Icirc;",  // "&#206;" -- latin capital letter I with circumflex, U+00CE
    "&Iuml;",   // "&#207;" -- latin capital letter I with diaeresis, U+00CF
    "&ETH;",    // "&#208;" -- latin capital letter ETH, U+00D0
    "&Ntilde;", // "&#209;" -- latin capital letter N with tilde, U+00D1
    "&Ograve;", // "&#210;" -- latin capital letter O with grave, U+00D2
    "&Oacute;", // "&#211;" -- latin capital letter O with acute, U+00D3
    "&Ocirc;",  // "&#212;" -- latin capital letter O with circumflex, U+00D4
    "&Otilde;", // "&#213;" -- latin capital letter O with tilde, U+00D5
    "&Ouml;",   // "&#214;" -- latin capital letter O with diaeresis, U+00D6
    "&times;",  // "&#215;" -- multiplication sign, U+00D7
    "&Oslash;", // "&#216;" -- latin capital letter O with stroke, U+00D8
    "&Ugrave;", // "&#217;" -- latin capital letter U with grave, U+00D9
    "&Uacute;", // "&#218;" -- latin capital letter U with acute, U+00DA
    "&Ucirc;",  // "&#219;" -- latin capital letter U with circumflex, U+00DB
    "&Uuml;",   // "&#220;" -- latin capital letter U with diaeresis, U+00DC
    "&Yacute;", // "&#221;" -- latin capital letter Y with acute, U+00DD
    "&THORN;",  // "&#222;" -- latin capital letter THORN, U+00DE
    "&szlig;",  // "&#223;" -- latin small letter sharp s = ess-zed, U+00DF
    "&agrave;", // "&#224;" -- latin small letter a with grave, U+00E0
    "&aacute;", // "&#225;" -- latin small letter a with acute, U+00E1
    "&acirc;",  // "&#226;" -- latin small letter a with circumflex, U+00E2
    "&atilde;", // "&#227;" -- latin small letter a with tilde, U+00E3
    "&auml;",   // "&#228;" -- latin small letter a with diaeresis, U+00E4
    "&aring;",  // "&#229;" -- latin small letter a with ring above, U+00E5
    "&aelig;",  // "&#230;" -- latin small letter ae, U+00E6
    "&ccedil;", // "&#231;" -- latin small letter c with cedilla, U+00E7
    "&egrave;", // "&#232;" -- latin small letter e with grave, U+00E8
    "&eacute;", // "&#233;" -- latin small letter e with acute, U+00E9
    "&ecirc;",  // "&#234;" -- latin small letter e with circumflex, U+00EA
    "&euml;",   // "&#235;" -- latin small letter e with diaeresis, U+00EB
    "&igrave;", // "&#236;" -- latin small letter i with grave, U+00EC
    "&iacute;", // "&#237;" -- latin small letter i with acute, U+00ED
    "&icirc;",  // "&#238;" -- latin small letter i with circumflex, U+00EE
    "&iuml;",   // "&#239;" -- latin small letter i with diaeresis, U+00EF
    "&eth;",    // "&#240;" -- latin small letter eth, U+00F0
    "&ntilde;", // "&#241;" -- latin small letter n with tilde, U+00F1
    "&ograve;", // "&#242;" -- latin small letter o with grave, U+00F2
    "&oacute;", // "&#243;" -- latin small letter o with acute, U+00F3
    "&ocirc;",  // "&#244;" -- latin small letter o with circumflex, U+00F4
    "&otilde;", // "&#245;" -- latin small letter o with tilde, U+00F5
    "&ouml;",   // "&#246;" -- latin small letter o with diaeresis, U+00F6
    "&divide;", // "&#247;" -- division sign, U+00F7
    "&oslash;", // "&#248;" -- latin small letter o with stroke, U+00F8
    "&ugrave;", // "&#249;" -- latin small letter u with grave, U+00F9
    "&uacute;", // "&#250;" -- latin small letter u with acute, U+00FA
    "&ucirc;",  // "&#251;" -- latin small letter u with circumflex, U+00FB
    "&uuml;",   // "&#252;" -- latin small letter u with diaeresis, U+00FC
    "&yacute;", // "&#253;" -- latin small letter y with acute, U+00FD
    "&thorn;",  // "&#254;" -- latin small letter thorn, U+00FE
    "&yuml;",   // "&#255;" -- latin small letter y with diaeresis, U+00FF
};
#ifdef MORE_NAME_ENTITY // in case we decide to do more name entities later
// Additional HTML 4.0 entity table for the CP 1252 extension character set
// (CP 1252 bytes 0x80-0x9F). Each row pairs the Unicode code point with the
// pure-ASCII numeric character reference emitted for it; the HTML 4.0 named
// entity is listed in the comment for reference. CP 1252 slots with no
// Unicode mapping (0x81, 0x8D, 0x8F, 0x90, 0x9D) have no row.
#define CP1252EXT_BASE (UINT)0x0080
#define CP1252EXT_MAX (UINT)0x009F
#define NONUNI 0xFFFF
#define UNDEFCHAR "???????"
#define CP1252EXT_NCR_SIZE 7
struct NAME_ENTITY_EXT
{
    UWORD uwUniCode;            // Unicode code point
    LPCTSTR lpszNameEntity;     // replacement text written to the output
};
static struct NAME_ENTITY_EXT aNameEntityExt[] =
{
//    UniCode NCR_Enty          Name_Enty    CP1252  Comment
    { 0x20AC, "&#8364;" }, // "&euro;"   0x80  EURO SIGN
                           //            0x81  UNDEFINED
    { 0x201A, "&#8218;" }, // "&sbquo;"  0x82  SINGLE LOW-9 QUOTATION MARK
    { 0x0192, "&#402;"  }, // "&fnof;"   0x83  LATIN SMALL LETTER F WITH HOOK
    { 0x201E, "&#8222;" }, // "&bdquo;"  0x84  DOUBLE LOW-9 QUOTATION MARK
    { 0x2026, "&#8230;" }, // "&hellip;" 0x85  HORIZONTAL ELLIPSIS
    { 0x2020, "&#8224;" }, // "&dagger;" 0x86  DAGGER
    { 0x2021, "&#8225;" }, // "&Dagger;" 0x87  DOUBLE DAGGER
    { 0x02C6, "&#710;"  }, // "&circ;"   0x88  MODIFIER LETTER CIRCUMFLEX ACCENT
    { 0x2030, "&#8240;" }, // "&permil;" 0x89  PER MILLE SIGN
    { 0x0160, "&#352;"  }, // "&Scaron;" 0x8A  LATIN CAPITAL LETTER S WITH CARON
    { 0x2039, "&#8249;" }, // "&lsaquo;" 0x8B  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    { 0x0152, "&#338;"  }, // "&OElig;"  0x8C  LATIN CAPITAL LIGATURE OE
                           //            0x8D  UNDEFINED
    { 0x017D, "&#381;"  }, // (none)     0x8E  LATIN CAPITAL LETTER Z WITH CARON, ***no name entity defined in HTML 4.0***
                           //            0x8F  UNDEFINED
                           //            0x90  UNDEFINED
    { 0x2018, "&#8216;" }, // "&lsquo;"  0x91  LEFT SINGLE QUOTATION MARK
    { 0x2019, "&#8217;" }, // "&rsquo;"  0x92  RIGHT SINGLE QUOTATION MARK
    { 0x201C, "&#8220;" }, // "&ldquo;"  0x93  LEFT DOUBLE QUOTATION MARK
    { 0x201D, "&#8221;" }, // "&rdquo;"  0x94  RIGHT DOUBLE QUOTATION MARK
    { 0x2022, "&#8226;" }, // "&bull;"   0x95  BULLET
    { 0x2013, "&#8211;" }, // "&ndash;"  0x96  EN DASH
    { 0x2014, "&#8212;" }, // "&mdash;"  0x97  EM DASH
    { 0x02DC, "&#732;"  }, // "&tilde;"  0x98  SMALL TILDE (U+02DC; 0x20DC is a different character)
    { 0x2122, "&#8482;" }, // "&trade;"  0x99  TRADE MARK SIGN
    { 0x0161, "&#353;"  }, // "&scaron;" 0x9A  LATIN SMALL LETTER S WITH CARON
    { 0x203A, "&#8250;" }, // "&rsaquo;" 0x9B  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    { 0x0153, "&#339;"  }, // "&oelig;"  0x9C  LATIN SMALL LIGATURE OE
                           //            0x9D  UNDEFINED
    { 0x017E, "&#382;"  }, // (none)     0x9E  LATIN SMALL LETTER Z WITH CARON, ***no name entity defined in HTML 4.0***
    { 0x0178, "&#376;"  }, // "&Yuml;"   0x9F  LATIN CAPITAL LETTER Y WITH DIAERESIS
};
#endif
/******************************************************************************
*****************************   U T I L I T I E S   ***************************
******************************************************************************/

// Exchanges the two bytes of every complete byte pair in DataBuf, in place.
// With an odd len the final byte is left untouched; a len of zero or less
// does nothing.
void DataByteSwap(LPSTR DataBuf, int len )
{
    int pos = 0 ;

    while ( pos + 1 < len )
    {
        UCHAR first = DataBuf[pos] ;

        DataBuf[pos] = DataBuf[pos + 1] ;
        DataBuf[pos + 1] = first ;
        pos += 2 ;
    }
}
// Byte-swaps the buffer in place when the encoding is big-endian UCS-2
// (CP_UCS_2_BE). Any other encoding, a NULL buffer, or a zero length is a
// no-op.
void CheckUnicodeDataType(DWORD dwDstEncoding, LPSTR DataBuf, int len )
{
    if ( !DataBuf || !len )
        return ;

    if ( CP_UCS_2_BE == dwDstEncoding )
        DataByteSwap(DataBuf, len);
}
// For US-ASCII input (CP_20127), clears the high bit of every byte in the
// buffer that has it set, in place. Any other encoding, a NULL buffer, or a
// zero length is a no-op.
void CheckASCIIEncoding(DWORD dwSrcEncoding, LPSTR DataBuf, int len )
{
    if ( !DataBuf || !len || dwSrcEncoding != CP_20127 )
        return ;

    int idx = 0 ;
    while ( idx < len )
    {
        if ( DataBuf[idx] & 0x80 )
            DataBuf[idx] &= 0x7f ;
        idx++ ;
    }
}
/******************************************************************************
******************   C O N V E R T   I N E T   S T R I N G   ******************
******************************************************************************/

// Converts a UTF-16 buffer to the multibyte encoding dwDstEncoding.
//
//   lpSrcStr / lpnSrcSize  source buffer and its size in BYTES
//   lpDstStr / lpnDstSize  destination buffer and its size in bytes; when
//                          *lpnDstSize is 0 on entry nothing is written and
//                          the required size is computed instead. On return
//                          *lpnDstSize holds the number of bytes produced.
//   dwFlag                 MLCONVCHARF_* options. When any of NCR_ENTITIZE,
//                          NAME_ENTITIZE or NOBESTFITCHARS is set, the result
//                          is converted back to Unicode and compared with the
//                          source; characters that did not survive the round
//                          trip are replaced by a named entity, a numeric
//                          character reference, or the fallback character.
//   lpFallBack             optional single fallback character (only used with
//                          MLCONVCHARF_USEDEFCHAR and only if it converts to
//                          one byte in the destination encoding).
//
// Returns S_OK on success, S_FALSE when a default/fallback character was
// used, E_OUTOFMEMORY when a scratch allocation fails, E_FAIL otherwise.
//
// NOTE(review): when a destination buffer is supplied, the entity/NCR
// replacement text is written without checking the remaining space against
// the caller's buffer size; callers appear to be expected to size the buffer
// with a *lpnDstSize == 0 query first.
HRESULT CICharConverter::UnicodeToMultiByteEncoding(DWORD dwDstEncoding, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    int nBuffSize, i ;
    BOOL UseDefChar = FALSE ;
    LPSTR lpDefFallBack = NULL ;
    UCHAR DefaultCharBuff[3]; // possible DBCS + null
    HRESULT hr = E_FAIL;
    int _nDstSize = *lpnDstSize;    // caller's buffer size; 0 means "size query"

    // Big-endian source on the first conversion step: work on a private copy
    // so the byte swap below does not modify the caller's buffer.
    if ( _dwUnicodeEncoding == CP_UCS_2_BE && _cvt_count == 0 )
    {
        if ( _lpUnicodeStr = (LPSTR)LocalAlloc(LPTR, *lpnSrcSize ) )
        {
            MoveMemory(_lpUnicodeStr, lpSrcStr, *lpnSrcSize ) ;
            lpSrcStr = _lpUnicodeStr ;
        }
        else
        {
            hr = E_OUTOFMEMORY;
            goto EXIT;
        }
    }

    CheckUnicodeDataType(_dwUnicodeEncoding, (LPSTR) lpSrcStr, *lpnSrcSize);
    nBuffSize = *lpnSrcSize / sizeof(WCHAR);    // source length in WCHARs

    // We force to use MLang NO_BEST_FIT_CHAR check on ISCII encoding since system don't accept default chars
    if (IS_NLS_DLL_CP(dwDstEncoding) && (dwFlag & MLCONVCHARF_USEDEFCHAR))
        dwFlag |= MLCONVCHARF_NOBESTFITCHARS;

    if ( lpFallBack && ( dwFlag & MLCONVCHARF_USEDEFCHAR ))
    {
        // only take SBCS, no DBCS character
        if ( 1 == WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
                        (LPCWSTR)lpFallBack, 1,
                        (LPSTR)DefaultCharBuff, sizeof(DefaultCharBuff), NULL, NULL ))
            lpDefFallBack = (LPSTR) DefaultCharBuff;
    }

    // Straight conversion (or size query when *lpnDstSize is 0).
    if(!(*lpnDstSize = WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
                        (LPCWSTR)lpSrcStr, nBuffSize, lpDstStr, *lpnDstSize,
                        IS_NLS_DLL_CP(dwDstEncoding)? NULL:(LPCSTR)lpDefFallBack,
                        IS_NLS_DLL_CP(dwDstEncoding)? NULL:&UseDefChar)))
    {
        hr = E_FAIL;
        goto EXIT;
    }

    if ( !_cvt_count ) // save SrcSize if it is the first time conversion
        _nSrcSize = nBuffSize * sizeof(WCHAR);

    if (*lpnDstSize)
    {
        if (dwFlag & ( MLCONVCHARF_NCR_ENTITIZE | MLCONVCHARF_NAME_ENTITIZE | MLCONVCHARF_NOBESTFITCHARS ))
        {
            char *lpDstStrTmp = lpDstStr;           // write cursor in the destination
            WCHAR *lpwStrTmp = NULL;                // round-trip (converted-back) text
            WCHAR *lpwStrTmpSave = NULL;
            char *lpDstStrTmp2 = NULL;              // scratch output for size queries
            char *lpDstStrTmp2Save = NULL;
            int cCount, ConvCount = 0, nCount = 0;  // nCount = number of mismatching chars
            WCHAR *lpwSrcStrTmp = (WCHAR *)lpSrcStr;
            int *lpBCharOffset = NULL;              // run lengths between mismatches
            int *lpBCharOffsetSave = NULL;

            if (!(lpwStrTmpSave = lpwStrTmp = (WCHAR *)LocalAlloc(LPTR, *lpnSrcSize)))
            {
                hr = E_OUTOFMEMORY;
                goto ENTITIZE_DONE;
            }

            // Make sure we have real converted buffer to check BEST_FIT_CHAR and DEFAULT_CHAR
            if (!_nDstSize)
            {
                lpDstStrTmp2Save = lpDstStrTmp2 = (char *)LocalAlloc(LPTR, *lpnDstSize);
                if (lpDstStrTmp2)
                {
                    WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
                        (LPCWSTR)lpSrcStr, nBuffSize, lpDstStrTmp2, *lpnDstSize, NULL, NULL );
                }
                else
                {
                    hr = E_OUTOFMEMORY;
                    goto ENTITIZE_DONE;
                }
            }

            // Convert the result back to Unicode. The capacity argument is in
            // WCHARs and must describe lpwStrTmp, which holds *lpnSrcSize bytes,
            // i.e. nBuffSize WCHARs. (_nSrcSize is a byte count and may be left
            // over from an earlier conversion step, so it must not be used here.)
            // Only an exact nBuffSize-character round trip is accepted.
            if (nBuffSize == MultiByteToWideChar(MAPUSERDEF(dwDstEncoding), 0,
                                _nDstSize? lpDstStr : lpDstStrTmp2, *lpnDstSize,
                                lpwStrTmp, nBuffSize))
            {
                // Pre scan to get number of best fit chars.
                for (i=0; i<nBuffSize; i++)
                {
                    // make special case for the yen sign in Shift-JIS: patch the
                    // round-trip text so it is not treated as a mismatch
                    if (*lpwStrTmp++ != *lpwSrcStrTmp++)
                    {
                        if ((dwDstEncoding == CP_JPN_SJ) && (*(lpwSrcStrTmp - 1) == 0x00A5))
                            *(lpwStrTmp - 1) = 0x00A5;
                        else
                            nCount ++;
                    }
                }

                lpwSrcStrTmp -= nBuffSize;
                lpwStrTmp -= nBuffSize;

                if (nCount)
                {
                    int j = 0;

                    // Mismatches exist but the caller asked for no replacement strategy.
                    if (!(dwFlag & ( MLCONVCHARF_NCR_ENTITIZE | MLCONVCHARF_NAME_ENTITIZE | MLCONVCHARF_USEDEFCHAR)))
                    {
                        hr = E_FAIL;
                        goto ENTITIZE_DONE;
                    }

                    if (!(lpBCharOffsetSave = lpBCharOffset = (int *) LocalAlloc(LPTR, nCount*sizeof(int))))
                    {
                        hr = E_OUTOFMEMORY;
                        goto ENTITIZE_DONE;
                    }

                    // Record the offset position of each best fit char
                    // (stored as the count of matching chars preceding it).
                    for (i=0; i<nBuffSize; i++)
                    {
                        if (*lpwStrTmp++ != *lpwSrcStrTmp++)
                        {
                            *lpBCharOffset = i-j;
                            lpBCharOffset++;
                            j = i+1;
                        }
                    }

                    lpBCharOffset -= nCount;
                    lpwSrcStrTmp -= nBuffSize;
                    lpwStrTmp -= nBuffSize;

                    for (i=0; i<nCount; i++)
                    {
                        BOOL bIsSurrogatePair = FALSE;

                        // Convert the run of good characters before this mismatch.
                        if (*lpBCharOffset)
                        {
                            cCount = WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
                                        (LPCWSTR)lpwSrcStrTmp, *lpBCharOffset,
                                        lpDstStrTmp, _nDstSize? _nDstSize-ConvCount : 0, NULL, NULL );

                            ConvCount += cCount;
                            if (_nDstSize)
                            {
                                lpDstStrTmp += cCount;
                            }
                            lpwSrcStrTmp += *lpBCharOffset;
                        }

                        BOOL fConverted = FALSE;

                        // check if unconvertable character falls in NAME ENTITY area
                        if (dwFlag & MLCONVCHARF_NAME_ENTITIZE)
                        {
                            // for beta2, make the assumption that name entity implies NCR.
                            dwFlag |= MLCONVCHARF_NCR_ENTITIZE;

#ifdef MORE_NAME_ENTITY   // in case we decide do more name entity latter
                            BOOL      fDoNEnty = FALSE;
                            LPCTSTR   lpszNEnty = NULL;

                            // check if character is in the Latin-1 Supplement range
                            if ((*lpwSrcStrTmp >= NAME_ENTITY_OFFSET) && (*lpwSrcStrTmp <= NAME_ENTITY_MAX ))
                            {
                                fDoNEnty = TRUE;
                                lpszNEnty = g_lpstrNameEntity[(*lpwSrcStrTmp) - NAME_ENTITY_OFFSET];
                            }

                            // check if character is in the additional name entity table for CP 1252 extension
                            if (!fDoNEnty)
                            {
                                for (int idx = 0; idx < ARRAYSIZE(aNameEntityExt); idx++)
                                    if (*lpwSrcStrTmp == aNameEntityExt[idx].uwUniCode)
                                    {
                                        fDoNEnty = TRUE;
                                        lpszNEnty = aNameEntityExt[idx].lpszNameEntity;
                                        break;
                                    }
                            }

                            if (fDoNEnty)
                            {
                                cCount = lstrlenA(lpszNEnty);
                                if (_nDstSize)
                                {
                                    CopyMemory(lpDstStrTmp, lpszNEnty, cCount);
                                    lpDstStrTmp += cCount ;
                                }

                                ConvCount += cCount;
                                fConverted = TRUE;
                            }
#else
                            // check if character is in the Latin-1 Supplement range
                            if ((*lpwSrcStrTmp >= NAME_ENTITY_OFFSET)
                                && (*lpwSrcStrTmp < ARRAYSIZE(g_lpstrNameEntity)+NAME_ENTITY_OFFSET))
                            {
                                LPCTSTR   lpszNEnty = NULL;

                                if (!(lpszNEnty = g_lpstrNameEntity[(*lpwSrcStrTmp) - NAME_ENTITY_OFFSET]))
                                {
#ifdef DEBUG
                                    AssertMsg((BOOL)FALSE, "Name entity table broken");
#endif
                                    hr = E_FAIL;
                                    goto ENTITIZE_DONE;
                                }

                                cCount = lstrlenA(lpszNEnty);
                                if (_nDstSize)
                                {
                                    CopyMemory(lpDstStrTmp, lpszNEnty, cCount);
                                    lpDstStrTmp += cCount ;
                                }
                                ConvCount += cCount;
                                fConverted = TRUE;
                            }
#endif
                        }

                        // check if NCR requested
                        if ((!fConverted) && (dwFlag & MLCONVCHARF_NCR_ENTITIZE))
                        {
                            // A high surrogate followed by a low surrogate (and at
                            // least two mismatches left) is emitted as one NCR.
                            if ((nCount-i >= 2) &&
                                (*lpwSrcStrTmp >= 0xD800 && *lpwSrcStrTmp <= 0xDBFF) &&
                                (*(lpwSrcStrTmp+1) >= 0xDC00 && *(lpwSrcStrTmp+1) <= 0xDFFF))
                                bIsSurrogatePair = TRUE;
                            else
                                bIsSurrogatePair = FALSE;

                            if (_nDstSize)
                            {
                                lpDstStrTmp[0] = '&' ;
                                lpDstStrTmp[1] = '#' ;
                                lpDstStrTmp += 2 ;
                                // If it is a Unicode surrogates pair, we convert it to real Unicode value
                                if (bIsSurrogatePair)
                                {
                                    DWORD dwUnicode = ((*lpwSrcStrTmp - 0xD800) << 10) + *(lpwSrcStrTmp+1) - 0xDC00 + 0x10000;
                                    _ultoa( dwUnicode, (char*)lpDstStrTmp, 10);
                                }
                                else
                                    _ultoa( *lpwSrcStrTmp, (char*)lpDstStrTmp, 10);
                                cCount = lstrlenA(lpDstStrTmp);
                                lpDstStrTmp += cCount;
                                ConvCount += cCount;
                                *(lpDstStrTmp++) = ';' ;
                            }
                            else
                            {
                                // Size query: only measure the digits.
                                char szTmpString[10];
                                if (bIsSurrogatePair)
                                {
                                    DWORD dwUnicode = ((*lpwSrcStrTmp - 0xD800) << 10) + *(lpwSrcStrTmp+1) - 0xDC00 + 0x10000;
                                    _ultoa( dwUnicode, szTmpString, 10);
                                }
                                else
                                    _ultoa( *lpwSrcStrTmp, szTmpString, 10);
                                ConvCount += lstrlenA(szTmpString);
                            }

                            fConverted = TRUE;
                            ConvCount += 3;     // "&#" and ";"
                        }

                        // handle MLCONVCHARF_USEDEFCHAR here - less priority and default method
                        if (!fConverted)
                        {
                            if (_nDstSize)
                            {
                                *lpDstStrTmp = lpDefFallBack ? *lpDefFallBack : '?';
                                lpDstStrTmp++;
                            }

                            ConvCount++;
                            if (!UseDefChar)
                                UseDefChar = TRUE;
                        }

                        lpBCharOffset++;
                        lpwSrcStrTmp++;
                        // Skip next character if it is a Unicode surrogates pair
                        if (bIsSurrogatePair)
                        {
                            lpBCharOffset++;
                            lpwSrcStrTmp++;
                            i++;
                        }
                    }
                    lpBCharOffset -= nCount ;
                }

                // Convert whatever follows the last mismatch.
                int nRemain = (*lpnSrcSize - (int)((char*)lpwSrcStrTmp - (char *)lpSrcStr))/sizeof(WCHAR);

                ConvCount += WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
                                (LPCWSTR)lpwSrcStrTmp, nRemain,
                                lpDstStrTmp, _nDstSize? _nDstSize-ConvCount : 0, NULL, NULL );

                *lpnDstSize = ConvCount ;

                hr = S_OK;
            }
            else
            {
                hr = E_FAIL;
            }

ENTITIZE_DONE:
            if (lpwStrTmpSave)
                LocalFree(lpwStrTmpSave);
            if (lpDstStrTmp2Save)
                LocalFree(lpDstStrTmp2Save);
            if (lpBCharOffsetSave)
                LocalFree(lpBCharOffsetSave);
        }
        else
        {
            hr = S_OK;
        }

        if (S_OK == hr && UseDefChar)
            hr = S_FALSE;
    }
    else
    {
        hr = E_FAIL;
    }

EXIT:
    return hr;
}
// Converts UTF-7/UTF-8 text (per _dwUTFEncoding) to Unicode through
// DoConvertINetString, then applies CheckUnicodeDataType to the output for
// the configured Unicode encoding. On the first conversion step the source
// size reported back in *lpnSrcSize is remembered in _nSrcSize.
// Returns the HRESULT of DoConvertINetString.
HRESULT CICharConverter::UTF78ToUnicode(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize)
{
    const HRESULT hrConvert = DoConvertINetString(lpdwMode, TRUE, CP_UCS_2, _dwUTFEncoding,
                                    lpSrcStr, lpnSrcSize, lpDstStr, *lpnDstSize, lpnDstSize);

    if ( 0 == _cvt_count )
    {
        // save SrcSize if it is the first time conversion
        _nSrcSize = *lpnSrcSize ;
    }

    CheckUnicodeDataType(_dwUnicodeEncoding, lpDstStr, *lpnDstSize);

    return hrConvert ;
}
// Converts Unicode text to UTF-7/UTF-8 (per _dwUTFEncoding) through
// DoConvertINetString. For a big-endian source on the first conversion step
// the input is first copied into _lpUnicodeStr so the byte swap done by
// CheckUnicodeDataType does not touch the caller's buffer.
// Returns E_OUTOFMEMORY if that copy cannot be allocated, otherwise the
// HRESULT of DoConvertINetString.
HRESULT CICharConverter::UnicodeToUTF78(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize)
{
    if ( 0 == _cvt_count && CP_UCS_2_BE == _dwUnicodeEncoding )
    {
        _lpUnicodeStr = (LPSTR)LocalAlloc(LPTR, *lpnSrcSize ) ;
        if ( !_lpUnicodeStr )
            return E_OUTOFMEMORY ;

        MoveMemory(_lpUnicodeStr, lpSrcStr, *lpnSrcSize ) ;
        lpSrcStr = _lpUnicodeStr ;
    }

    CheckUnicodeDataType(_dwUnicodeEncoding, (LPSTR) lpSrcStr, *lpnSrcSize);

    const HRESULT hrConvert = DoConvertINetString(lpdwMode, FALSE, CP_UCS_2, _dwUTFEncoding,
                                    lpSrcStr, lpnSrcSize, lpDstStr, *lpnDstSize, lpnDstSize);

    if ( 0 == _cvt_count )
    {
        // save SrcSize if it is the first time conversion
        _nSrcSize = *lpnSrcSize ;
    }

    return hrConvert ;
}
// Converts Unicode text to the converter's Windows code page (_dwWinCodePage)
// by forwarding every argument to UnicodeToMultiByteEncoding and returning
// its result.
HRESULT CICharConverter::UnicodeToWindowsCodePage(LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    return UnicodeToMultiByteEncoding(_dwWinCodePage, lpSrcStr, lpnSrcSize,
                                      lpDstStr, lpnDstSize, dwFlag, lpFallBack);
}
// Converts Unicode text to the converter's Internet encoding
// (_dwInternetEncoding) by forwarding every argument to
// UnicodeToMultiByteEncoding and returning its result.
HRESULT CICharConverter::UnicodeToInternetEncoding(LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    return UnicodeToMultiByteEncoding(_dwInternetEncoding, lpSrcStr, lpnSrcSize,
                                      lpDstStr, lpnDstSize, dwFlag, lpFallBack);
}
// Converts text in the Internet encoding (_dwInternetEncoding) to Unicode.
//
//   lpSrcStr / lpnSrcSize  source bytes and their count
//   lpDstStr / lpnDstSize  destination buffer and its size in BYTES; on return
//                          *lpnDstSize holds the number of bytes produced
//                          (WCHAR count * sizeof(WCHAR)).
//
// On the first conversion step the source may end in the middle of a
// multibyte character, so up to MAX_CHAR_SIZE trailing lengths (n, n-1, ...,
// n-MAX_CHAR_SIZE+1) are probed with MB_ERR_INVALID_CHARS and the first one
// that converts cleanly is used; the number of bytes actually consumed is
// stored in _nSrcSize. If no probe succeeds the problem is not at the end of
// the data and the full length is converted.
//
// Returns S_OK on success; E_FAIL when nothing was produced, except that a
// UTF-8 source failing with ERROR_NO_UNICODE_TRANSLATION is reported as S_OK
// (invalid characters are skipped).
HRESULT CICharConverter::InternetEncodingToUnicode(LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize)
{
    int cch = 0;                // result of the last probe; 0 = no probe succeeded
    int cb = *lpnSrcSize;

    if ( !_cvt_count)
    {
        // If we have a multibyte character encoding, we are at risk of splitting
        // some characters at the read boundary.  We must make sure we have a
        // discrete number of characters first.
        UINT uMax = MAX_CHAR_SIZE ;
        cb++; // pre-increment
        do
        {
            cch = MultiByteToWideChar( MAPUSERDEF(_dwInternetEncoding),
                                        MB_ERR_INVALID_CHARS,
                                        lpSrcStr, --cb,
                                        NULL, 0 );
            --uMax;
        } while (!cch && uMax && cb);
    }

    // If the conversion problem isn't at the end of the string, restore the
    // original length. The last probe leaves cb at n-MAX_CHAR_SIZE+1 whether
    // it succeeded or not, so the probe result has to be checked as well;
    // otherwise a chunk ending in a (MAX_CHAR_SIZE-1)-byte partial character
    // would be converted in full instead of having the partial bytes deferred.
    if ( !cb || (!cch && cb == (*lpnSrcSize - MAX_CHAR_SIZE +1 )))
        cb = *lpnSrcSize ; // restore original value

    *lpnDstSize = MultiByteToWideChar( MAPUSERDEF(_dwInternetEncoding), 0,
                                        lpSrcStr, cb,
                                        (LPWSTR)lpDstStr, *lpnDstSize/sizeof(WCHAR) );
    *lpnDstSize = *lpnDstSize * sizeof(WCHAR);

    if ( !_cvt_count ) // save SrcSize if it is the first time conversion
        _nSrcSize = cb ;

    // No API call may be inserted between the conversion above and the
    // GetLastError() below; CheckUnicodeDataType only touches the buffer.
    CheckUnicodeDataType(_dwUnicodeEncoding, lpDstStr, *lpnDstSize);

    if (*lpnDstSize==0 && (cb || cb != *lpnSrcSize))
    {
        // GetLastError() for MultiByteToWideChar()
        // Skip invalid characters for UTF8 conversion
        if (CP_UTF_8 == MAPUSERDEF(_dwInternetEncoding) && ERROR_NO_UNICODE_TRANSLATION == GetLastError())
            return S_OK;
        else
            return E_FAIL ;
    }
    else
        return S_OK ;
}
// Converts text in the Windows code page (_dwWinCodePage) to Unicode.
//
//   lpSrcStr / lpnSrcSize  source bytes and their count
//   lpDstStr / lpnDstSize  destination buffer and its size in BYTES; on return
//                          *lpnDstSize holds the number of bytes produced.
//
// On the first conversion step (and with more than one source byte) the end
// of the source is checked for a split character:
//   - DBCS code pages: if strict conversion fails and the last byte is a lead
//     byte, the last byte is dropped when doing so removes exactly one
//     converted character (i.e. it really was a dangling lead byte).
//   - other code pages: up to MAX_CHAR_SIZE trailing lengths are probed with
//     MB_ERR_INVALID_CHARS and the first one that converts cleanly is used.
// The number of bytes actually consumed is stored in _nSrcSize.
//
// Returns E_FAIL when nothing was produced from a non-empty span, else S_OK.
HRESULT CICharConverter::WindowsCodePageToUnicode(LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize)
{
    int cch1, cch2;
    int cb = *lpnSrcSize;

    if ( !_cvt_count && cb > 1 )
    {
        if (IS_DBCSCODEPAGE(MAPUSERDEF(_dwWinCodePage)))
        {
            // Detect DBCS dangling character
            if (!MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage),
                                        MB_ERR_INVALID_CHARS,
                                        lpSrcStr, cb,
                                        NULL, 0 ))
            {
                if (IsDBCSLeadByteEx(MAPUSERDEF(_dwWinCodePage), lpSrcStr[cb-1]))
                {
                    cch1 = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage),
                                                0,
                                                lpSrcStr, cb,
                                                NULL, 0 );

                    cch2 = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage),
                                                0,
                                                lpSrcStr, --cb,
                                                NULL, 0 );
                    if (cch1 != cch2+1)
                    {
                        // Dangling DBCS character not found, restore cb.
                        cb++;
                    }
                }
            }
        }
        else
        {
            // If we have a multibyte character encoding, we are at risk of splitting
            // some characters at the read boundary.  We must make sure we have a
            // discrete number of characters first.
            UINT uMax = MAX_CHAR_SIZE ;
            cb++; // pre-increment
            do
            {
                cch1 = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage),
                                            MB_ERR_INVALID_CHARS,
                                            lpSrcStr, --cb,
                                            NULL, 0 );
                --uMax;
            } while (!cch1 && uMax && cb);

            // If the conversion problem isn't at the end of the string, restore
            // the original length. The last probe leaves cb at n-MAX_CHAR_SIZE+1
            // whether it succeeded or not, so the probe result is checked too;
            // a successful last probe means the trailing bytes are a partial
            // character that must be deferred, not converted.
            if ( !cb || (!cch1 && cb == (*lpnSrcSize - MAX_CHAR_SIZE +1 )))
                cb = *lpnSrcSize ; // restore original value
        }
    }

    *lpnDstSize = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage), 0,
                                        lpSrcStr, cb,
                                        (LPWSTR)lpDstStr, *lpnDstSize/sizeof(WCHAR) );
    *lpnDstSize = *lpnDstSize * sizeof(WCHAR);

    if ( !_cvt_count ) // save SrcSize if it is the first time conversion
        _nSrcSize = cb ;

    CheckUnicodeDataType(_dwUnicodeEncoding, lpDstStr, *lpnDstSize);

    // Whistler Bug#360429,
    // Web page could have a splitting DBCS character at the very end of the page,
    // To work around it, we allow one byte of dangling DBCS character.
    // (The ++cb below is only evaluated when cb is 0 and differs from *lpnSrcSize.)
    if (*lpnDstSize==0 && (cb || (cb != *lpnSrcSize && ++cb != *lpnSrcSize)))
        return E_FAIL ;
    else
        return S_OK ;
}
// Converts text from the Windows code page to the Internet encoding.
// When bit 0x10 of _dwConvertType is set the work is delegated to
// WindowsCodePageToInternetEncodingWrap (which goes through Unicode);
// otherwise DoConvertINetString performs the conversion and, on the first
// conversion step, *lpnSrcSize is remembered in _nSrcSize.
HRESULT CICharConverter::WindowsCodePageToInternetEncoding(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    // check if the conversion should go through Unicode indirectly
    if ( _dwConvertType & 0x10 )
    {
        return WindowsCodePageToInternetEncodingWrap(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
    }

    const HRESULT hrConvert = DoConvertINetString(lpdwMode, FALSE, _dwWinCodePage, _dwInternetEncoding,
                                    lpSrcStr, lpnSrcSize, lpDstStr, *lpnDstSize, lpnDstSize);

    if ( 0 == _cvt_count )
    {
        // save SrcSize if it is the first time conversion
        _nSrcSize = *lpnSrcSize ;
    }

    return hrConvert ;
}
// Converts text from the Internet encoding to the Windows code page.
// When bit 0x10 of _dwConvertType is set the work is delegated to
// InternetEncodingToWindowsCodePageWrap (which goes through Unicode);
// otherwise DoConvertINetString performs the conversion and, on the first
// conversion step, *lpnSrcSize is remembered in _nSrcSize.
HRESULT CICharConverter::InternetEncodingToWindowsCodePage(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    // check if the conversion should go through Unicode indirectly
    if ( _dwConvertType & 0x10 )
    {
        return InternetEncodingToWindowsCodePageWrap(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
    }

    const HRESULT hrConvert = DoConvertINetString(lpdwMode, TRUE, _dwWinCodePage, _dwInternetEncoding,
                                    lpSrcStr, lpnSrcSize, lpDstStr, *lpnDstSize, lpnDstSize);

    if ( 0 == _cvt_count )
    {
        // save SrcSize if it is the first time conversion
        _nSrcSize = *lpnSrcSize ;
    }

    return hrConvert ;
}
// Converts Windows-code-page text to the Internet encoding by way of an
// intermediate Unicode buffer (_lpInterm1Str).
//
// On the first conversion step, up to MAX_CHAR_SIZE trailing lengths are
// probed with MB_ERR_INVALID_CHARS so that a character split at the end of
// the source is left for the next call; the number of source bytes consumed
// is stored in _nSrcSize. The intermediate buffer is sized from the
// successful probe (or from the byte count when no probe succeeded).
//
// Returns the result of UnicodeToMultiByteEncoding (S_OK, S_FALSE or a
// failure code); E_FAIL if the intermediate buffer cannot be allocated or if
// nothing was produced from a non-empty source span.
HRESULT CICharConverter::WindowsCodePageToInternetEncodingWrap(LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    int nBuffSize = 0 ;             // WCHAR count from the last probe; 0 = none succeeded
    int cb = *lpnSrcSize;
    UINT uMax = MAX_CHAR_SIZE ;
    HRESULT hr = S_OK;

    if ( !_cvt_count )
    {
        cb++; // pre-increment
        do
        {
            nBuffSize = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage),
                                        MB_ERR_INVALID_CHARS,
                                        lpSrcStr, --cb,
                                        NULL, 0 );
            --uMax;
        } while (!nBuffSize && uMax && cb);
    }

    // If the conversion problem isn't at the end of the string, restore the
    // original length. The last probe leaves cb at n-MAX_CHAR_SIZE+1 whether
    // it succeeded or not; restoring after a SUCCESSFUL probe would convert
    // more bytes than the buffer sized from that probe can hold, so the probe
    // result is checked as well.
    if ( !nBuffSize && cb == (*lpnSrcSize - MAX_CHAR_SIZE +1 ))
        cb = *lpnSrcSize ; // restore original value

    if (!nBuffSize) // in case there are illegal characters
        nBuffSize = cb ;

    if ( _lpInterm1Str = (LPSTR) LocalAlloc(LPTR, (nBuffSize * sizeof(WCHAR))))
    {
        nBuffSize = MultiByteToWideChar(MAPUSERDEF(_dwWinCodePage), 0,
                        lpSrcStr, cb, (LPWSTR)_lpInterm1Str, nBuffSize );

        int iSrcSizeTmp = nBuffSize * sizeof(WCHAR);
        hr = UnicodeToMultiByteEncoding(MAPUSERDEF(_dwInternetEncoding), (LPCSTR)_lpInterm1Str, &iSrcSizeTmp,
                                        lpDstStr, lpnDstSize, dwFlag, lpFallBack);

        if ( !_cvt_count ) // save SrcSize if it is the first time conversion
            _nSrcSize = cb ;
    }
    else
        hr = E_FAIL;

    // A "successful" conversion that produced nothing from a non-empty source
    // is reported as a failure. (Default-character use is already reported by
    // UnicodeToMultiByteEncoding as S_FALSE and is passed through unchanged.)
    if (hr == S_OK && *lpnDstSize==0 && cb)
        hr = E_FAIL ;

    return hr;
}
// Converts Internet-encoded text to the Windows code page by way of an
// intermediate Unicode buffer (_lpInterm1Str).
//
// On the first conversion step, up to MAX_CHAR_SIZE trailing lengths are
// probed with MB_ERR_INVALID_CHARS so that a character split at the end of
// the source is left for the next call; the number of source bytes consumed
// is stored in _nSrcSize. The intermediate buffer is sized from the
// successful probe (or from the byte count when no probe succeeded).
//
// Returns the result of UnicodeToMultiByteEncoding (S_OK, S_FALSE or a
// failure code); E_FAIL if the intermediate buffer cannot be allocated or if
// nothing was produced from a non-empty source span.
HRESULT CICharConverter::InternetEncodingToWindowsCodePageWrap(LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    int nBuffSize = 0 ;             // WCHAR count from the last probe; 0 = none succeeded
    int cb = *lpnSrcSize;
    UINT uMax = MAX_CHAR_SIZE ;
    HRESULT hr = S_OK;

    if ( !_cvt_count )
    {
        cb++; // pre-increment
        do
        {
            nBuffSize = MultiByteToWideChar( MAPUSERDEF(_dwInternetEncoding),
                                        MB_ERR_INVALID_CHARS,
                                        lpSrcStr, --cb,
                                        NULL, 0 );
            --uMax;
        } while (!nBuffSize && uMax && cb);
    }

    // If the conversion problem isn't at the end of the string, restore the
    // original length. The last probe leaves cb at n-MAX_CHAR_SIZE+1 whether
    // it succeeded or not; restoring after a SUCCESSFUL probe would convert
    // more bytes than the buffer sized from that probe can hold, so the probe
    // result is checked as well.
    if ( !nBuffSize && cb == (*lpnSrcSize - MAX_CHAR_SIZE +1 ))
        cb = *lpnSrcSize ; // restore original value

    if (!nBuffSize) // in case there are illegal characters
        nBuffSize = cb ;

    if ( _lpInterm1Str = (LPSTR) LocalAlloc(LPTR,nBuffSize * sizeof (WCHAR) ))
    {
        nBuffSize = MultiByteToWideChar( MAPUSERDEF(_dwInternetEncoding), 0,
                        lpSrcStr, cb, (LPWSTR)_lpInterm1Str, nBuffSize );

        int iSrcSizeTmp = nBuffSize * sizeof(WCHAR);
        hr = UnicodeToMultiByteEncoding(MAPUSERDEF(_dwWinCodePage), (LPCSTR)_lpInterm1Str, &iSrcSizeTmp,
                                        lpDstStr, lpnDstSize, dwFlag, lpFallBack);

        if ( !_cvt_count ) // save SrcSize if it is the first time conversion
            _nSrcSize = cb ;
    }
    else
        hr = E_FAIL;

    // A "successful" conversion that produced nothing from a non-empty source
    // is reported as a failure. (Default-character use is already reported by
    // UnicodeToMultiByteEncoding as S_FALSE and is passed through unchanged.)
    if (hr == S_OK && *lpnDstSize==0 && cb)
        hr = E_FAIL ;

    return hr;
}
// Runs the conversion chain
//     Internet encoding -> Windows code page -> Unicode -> UTF-7/UTF-8
// executing only the stages selected by _dwConvertType. When a stage is not
// the last one, its output is written to an intermediate buffer
// (_lpInterm1Str / _lpInterm2Str) which then becomes the source of the next
// stage: lpSrcStr is re-pointed and *lpnSrcSize is overwritten with the
// intermediate size. _cvt_count is incremented after each of the first two
// stages.
// Returns the HRESULT of the last stage executed if it is not S_OK;
// otherwise the most recent non-S_OK result of an earlier stage (hrWarnings),
// or S_OK. A failed intermediate allocation returns E_FAIL.
HRESULT CICharConverter::ConvertIWUU(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack) { int nBuffSize = 0 ; HRESULT hr = S_OK ; HRESULT hrWarnings = S_OK ;
// InternetEncodingToWindowsCodePage
// Stage 1 runs for odd _dwConvertType values below 21. Types 5 and 9 first
// query the size (NULL destination, nBuffSize == 0) and then convert into a
// freshly allocated _lpInterm1Str; other types convert straight into lpDstStr.
if ( _dwConvertType % 2 && _dwConvertType < 21 ) /* start from Internet Encoding */ { if ( _dwConvertType == 5 || _dwConvertType == 9 ) /* use interm buffer */ { hr = InternetEncodingToWindowsCodePage(lpdwMode, lpSrcStr, lpnSrcSize, NULL, &nBuffSize, dwFlag, lpFallBack); if ( _lpInterm1Str = (LPSTR) LocalAlloc(LPTR,nBuffSize) ) { hr = InternetEncodingToWindowsCodePage(lpdwMode, lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize, dwFlag, lpFallBack); lpSrcStr = _lpInterm1Str ; *lpnSrcSize = nBuffSize ; } else goto fail ; } else hr = InternetEncodingToWindowsCodePage(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack); _cvt_count ++ ; }
if ( hr != S_OK ) hrWarnings = hr ; // WindowsCodePageToUnicode or InternetEncodingToUnicode
// Stage 2: types 21 and 25 convert the Internet encoding directly to Unicode
// (25 via _lpInterm1Str); types 4..10 convert the Windows code page to
// Unicode. For types above 8 the result goes to an intermediate buffer:
// _lpInterm2Str when an earlier stage already ran (_cvt_count != 0),
// otherwise _lpInterm1Str.
if ( _dwConvertType == 21 || _dwConvertType == 25 ) { if ( _dwConvertType == 21 ) hr = InternetEncodingToUnicode(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize); else // _dwConvertType == 25
{ hr = InternetEncodingToUnicode(lpSrcStr, lpnSrcSize, NULL, &nBuffSize); if ( _lpInterm1Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) ) { hr = InternetEncodingToUnicode(lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize); lpSrcStr = _lpInterm1Str ; *lpnSrcSize = nBuffSize ; } else goto fail ; } _cvt_count ++ ; } else if ( _dwConvertType >= 4 && _dwConvertType <= 10 ) { if ( _dwConvertType > 8 ) { nBuffSize = 0 ; hr = WindowsCodePageToUnicode(lpSrcStr, lpnSrcSize, NULL, &nBuffSize); if ( _cvt_count ) { if ( _lpInterm2Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) ) { hr = WindowsCodePageToUnicode(lpSrcStr, lpnSrcSize, _lpInterm2Str, &nBuffSize); lpSrcStr = _lpInterm2Str ; *lpnSrcSize = nBuffSize ; } else goto fail ;
} else { if ( _lpInterm1Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) ) { hr = WindowsCodePageToUnicode(lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize); lpSrcStr = _lpInterm1Str ; *lpnSrcSize = nBuffSize ; } else goto fail ; } } else hr = WindowsCodePageToUnicode(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize); _cvt_count ++ ; }
if ( hr != S_OK ) hrWarnings = hr ;
// UnicodeToUTF78
// Stage 3 runs when bit 0x08 of _dwConvertType is set.
// NOTE(review): in the UNIX branch below, `i` is read after the for loop that
// declares it, and each WCHAR is narrowed to a single byte when copied into
// pTmp -- confirm both are intended for that build.
if ( _dwConvertType & 0x08 ) #ifndef UNIX
hr = UnicodeToUTF78(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize); #else
{ /* we now hack the lpSrcStr to be the same as 2 byte Unicode so mlang
* lowlevel code can work right. */ LPWSTR lpwSrcStr = (LPWSTR)lpSrcStr; INT tmpSize = *lpnSrcSize/sizeof(WCHAR); UCHAR *pTmp = new UCHAR[(tmpSize+1)*2]; if(pTmp) { for(int i = 0; i < tmpSize; i++) { pTmp[i*2] = *lpwSrcStr++; pTmp[i*2+1] = 0x00; } pTmp[i*2] = pTmp[i*2+1] = 0x00; tmpSize *= 2; hr = UnicodeToUTF78(lpdwMode, (LPCSTR)pTmp, &tmpSize, lpDstStr, lpnDstSize); } else hr = E_FAIL; delete [] pTmp; } #endif /* UNIX */
// A failure of the last stage wins; otherwise report any earlier warning.
return ( hr == S_OK ? hrWarnings : hr ) ;
fail : return E_FAIL ; }
// Runs the conversion chain
//     UTF-7/UTF-8 -> Unicode -> Windows code page -> Internet encoding
// executing only the stages selected by _dwConvertType. When a stage is not
// the last one, its output is written to an intermediate buffer
// (_lpInterm1Str / _lpInterm2Str) which then becomes the source of the next
// stage: lpSrcStr is re-pointed and *lpnSrcSize is overwritten with the
// intermediate size. _cvt_count is incremented after each of the first two
// stages.
// Returns the HRESULT of the last stage executed if it is not S_OK;
// otherwise the most recent non-S_OK result of an earlier stage (hrWarnings),
// or S_OK. A failed intermediate allocation returns E_FAIL.
HRESULT CICharConverter::ConvertUUWI(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack) { int nBuffSize = 0 ; HRESULT hr = S_OK ; HRESULT hrWarnings = S_OK ;
// UTF78ToUnicode
// Stage 1 runs when bit 0x08 of _dwConvertType is set. Type 12 converts
// straight into lpDstStr; the other types first query the size (NULL
// destination) and then convert into a freshly allocated _lpInterm1Str.
if ( _dwConvertType & 0x08 ) { if ( _dwConvertType == 12 ) /* convert UTF78 -> Unicode only */ hr = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize); else /* use interm buffer, type = 10 or 9 */ { hr = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, NULL, &nBuffSize); if ( _lpInterm1Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) ) { hr = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize); lpSrcStr = _lpInterm1Str ; *lpnSrcSize = nBuffSize ; } else goto fail ; } _cvt_count ++ ; }
if ( hr != S_OK ) hrWarnings = hr ;
// UnicodeToWindowsCodePage or UnicodeToInternetEncoding
// Stage 2: types 21 and 25 convert Unicode directly to the Internet encoding;
// types 4..10 convert Unicode to the Windows code page. For odd types the
// result goes to an intermediate buffer: _lpInterm2Str when an earlier stage
// already ran (_cvt_count != 0), otherwise _lpInterm1Str.
if ( _dwConvertType == 21 || _dwConvertType == 25 ) { hr = UnicodeToInternetEncoding(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack); _cvt_count ++ ; } else if ( _dwConvertType >= 4 && _dwConvertType <= 10 ) { if ( _dwConvertType % 2 ) /* use interm buffer */ { nBuffSize = 0 ; hr = UnicodeToWindowsCodePage(lpSrcStr, lpnSrcSize, NULL, &nBuffSize, dwFlag, lpFallBack); if ( _cvt_count ) { if ( _lpInterm2Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) ) { hr = UnicodeToWindowsCodePage(lpSrcStr, lpnSrcSize, _lpInterm2Str, &nBuffSize, dwFlag, lpFallBack); lpSrcStr = _lpInterm2Str ; *lpnSrcSize = nBuffSize ; } else goto fail ; } else { if ( _lpInterm1Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) ) { hr = UnicodeToWindowsCodePage(lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize, dwFlag, lpFallBack); lpSrcStr = _lpInterm1Str ; *lpnSrcSize = nBuffSize ; } else goto fail ; } } else hr = UnicodeToWindowsCodePage(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack); _cvt_count ++ ; }
if ( hr != S_OK ) hrWarnings = hr ;
// WindowsCodePageToInternetEncoding
// Stage 3 runs for odd _dwConvertType values below 21 and writes to lpDstStr.
if ( _dwConvertType % 2 && _dwConvertType < 21 ) hr = WindowsCodePageToInternetEncoding(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
// A failure of the last stage wins; otherwise report any earlier warning.
return ( hr == S_OK ? hrWarnings : hr ) ;
fail : return E_FAIL ; }
#if 0
// NOTE: this whole section is compiled out.  It is kept as a reference
// implementation of a small code page validity cache.

// One cache entry: a code page and whether IsValidCodePage accepted it.
struct CODEPAGEINFO
{
    UINT     uCodePage ;
    CP_STATE nCP_State ;    // whether this is a valid windows codepage ?
};

// ValidCodepageInfo is used to cache whether a codepage is a valid codepage.
// It uses a circular-FIFO cache algorithm: cp_cache_ptr is the next slot to
// overwrite, cp_cache_count the number of slots filled so far.
#define MAX_CP_CACHE 32
static int cp_cache_count = 0 ;
static int cp_cache_ptr = 0 ;
static struct CODEPAGEINFO ValidCodepageInfo[MAX_CP_CACHE];

// Returns TRUE when uCodePage is usable: either the user-defined code page
// (50000) or a code page accepted by IsValidCodePage.  Results are cached.
BOOL CheckIsValidCodePage (UINT uCodePage)
{
    if ( uCodePage == 50000 )   // User defined
        return TRUE ;

    int i ;
    BOOL bRet ;

    for ( i = 0 ; i < cp_cache_count ; i++ )
    {
        if ( uCodePage == ValidCodepageInfo[i].uCodePage )
            return ( ValidCodepageInfo[i].nCP_State == VALID_CP ) ? TRUE : FALSE ;
    }

    // not found, call IsValidCodePage and cache the return value
    bRet = IsValidCodePage(uCodePage);

    EnterCriticalSection(&g_cs);
    ValidCodepageInfo[cp_cache_ptr].uCodePage = uCodePage ;
    ValidCodepageInfo[cp_cache_ptr].nCP_State = bRet ? VALID_CP : INVALID_CP ;
    if ( cp_cache_count < MAX_CP_CACHE )
        cp_cache_count++ ;
    // Advance the ring position.  Written as a plain addition so the
    // variable is modified only once in the expression.
    cp_cache_ptr = ( cp_cache_ptr + 1 ) % MAX_CP_CACHE ;
    LeaveCriticalSection(&g_cs);

    return bRet ;
}
#endif
/*
    Conversion Flag:

    Bit 7      - Convert Direction.

    Bit 4 (16) - Unicode <-> Internet Encoding
    Bit 3 (8)  - UTF8, UTF7
    Bit 2 (4)  - Unicode
    Bit 1 (2)  - Windows CodePage
    Bit 0 (1)  - Internet Encoding

    12, 6, 3 (19) - one step convert
    10, 5 (21)    - two steps convert
    9 (25)        - three steps convert
*/
// Looks up dwEncoding in aEncodingInfo and returns its index, or -1 when the
// encoding has no entry.
//
// The middle entry is compared first; depending on the comparison only the
// lower or the upper half of the table is scanned.  When an entry is found
// its cached state is refreshed as a side effect:
//   * if the entry is not yet marked VALID_CP and has a Windows code page,
//     nCP_State is recomputed with IsValidCodePage (50000 is accepted as the
//     user-defined code page); entries flagged CONV_CHK_NLS additionally
//     require IsValidCodePage to accept the encoding value itself;
//   * if g_bUseSysUTF8 is set and the request is for CP_UTF_8, the entry's
//     type is switched to 0x11.
int GetWindowsEncodingIndex(DWORD dwEncoding)
{
    const int cEntries = sizeof (aEncodingInfo) / sizeof(ENCODINGINFO) ;
    const int nMiddle  = cEntries / 2 ;
    int nFound = -1 ;

    if ( aEncodingInfo[nMiddle].dwEncoding == dwEncoding )
    {
        nFound = nMiddle ;
    }
    else
    {
        int nFirst ;
        int nLimit ;
        int n ;

        if ( aEncodingInfo[nMiddle].dwEncoding > dwEncoding )
        {
            nFirst = 0 ;
            nLimit = nMiddle ;
        }
        else
        {
            nFirst = nMiddle + 1 ;
            nLimit = cEntries ;
        }

        // No early exit: the last matching entry in the range wins.
        for ( n = nFirst ; n < nLimit ; n++ )
        {
            if ( aEncodingInfo[n].dwEncoding == dwEncoding )
                nFound = n ;
        }
    }

    if ( nFound < 0 )
        return nFound ;

    struct ENCODINGINFO *pEntry = &aEncodingInfo[nFound] ;

    if ( pEntry->nCP_State != VALID_CP && pEntry->dwCodePage )
    {
        // 50000 means user defined
        if ( pEntry->dwCodePage == 50000 || IsValidCodePage(pEntry->dwCodePage) )
            pEntry->nCP_State = VALID_CP ;
        else
            pEntry->nCP_State = INVALID_CP ;

        if ( (pEntry->nCP_State == VALID_CP) &&
             (pEntry->dwFlags & CONV_CHK_NLS) &&
             !IsValidCodePage(pEntry->dwEncoding) )
        {
            pEntry->nCP_State = INVALID_CP ;
        }
    }

    // Use system UTF8 conversion to work around security issues on Win2k and greater platforms.
    if ( g_bUseSysUTF8 && dwEncoding == CP_UTF_8 )
        pEntry->bTypeUUIW = 0x11 ;

    return nFound ;
}
// Configures the converter for a source -> destination encoding pair.
//
// Each encoding is classified through aEncodingInfo (type bits: 0x08 UTF,
// 0x04 Unicode, 0x02 Windows code page, 0x01 Internet encoding).  An
// encoding without a table entry is accepted only if
// CONVERT_IS_VALIDCODEPAGE approves it; it is then treated as a plain
// Windows code page, except for an unknown destination that follows an
// unknown source, which is treated as an Internet encoding (type 0x11).
//
// On success _dwConvertType holds the OR of both type masks,
// _bConvertDirt tells DoCodeConvert which chain to run, any previously
// allocated _hcins converter is deleted, and S_OK is returned.
// S_FALSE is returned when the pair cannot be converted.  Members that were
// already assigned before the failure was detected keep their new values.
//
// Note (IE bug 109708): US-ASCII is always accepted as a source encoding;
// an earlier remap of CP_20127 to CP_1252 was deliberately disabled.
HRESULT CICharConverter::ConvertSetup(DWORD * pdwSrcEncoding, DWORD dwDstEncoding)
{
    DWORD dwSrcType = 0 ;
    DWORD dwDstType = 0 ;
    int   cUnknown  = 0 ;
    int   nEntry ;

    /* check source & destination encoding type */

    // ---- source ----
    nEntry = GetWindowsEncodingIndex(*pdwSrcEncoding);
    if ( nEntry < 0 )
    {
        // assume it is a unknown Window Codepage
        if ( !CONVERT_IS_VALIDCODEPAGE(*pdwSrcEncoding) )
            return S_FALSE ;

        dwSrcType = 0x02 ;
        _dwWinCodePage = *pdwSrcEncoding ;
        cUnknown ++ ;
    }
    else
    {
        dwSrcType = (DWORD) aEncodingInfo[nEntry].bTypeUUIW ;

        if ( aEncodingInfo[nEntry].dwCodePage )
        {
            _dwWinCodePage = (DWORD) aEncodingInfo[nEntry].dwCodePage ;
            if ( aEncodingInfo[nEntry].nCP_State == INVALID_CP )
                return S_FALSE ;
        }

        if ( dwSrcType & 0x08 )
            _dwUTFEncoding = *pdwSrcEncoding ;
        if ( dwSrcType & 0x01 )
            _dwInternetEncoding = *pdwSrcEncoding ;
        if ( dwSrcType & 0x04 )
            _dwUnicodeEncoding = *pdwSrcEncoding ;
    }

    // ---- destination ----
    nEntry = GetWindowsEncodingIndex(dwDstEncoding);
    if ( nEntry < 0 )
    {
        // 1) First time unknown, assume it is a unknown Window Codepage
        //    the conversion become UTF78 <-> Unicode <-> Window Codepage
        // 2) Second time unknown, assume it is a unknown Internet Encoding
        //    the conversion become Windows Codepage <-> Unicode <-> Internet Encoding
        if ( !CONVERT_IS_VALIDCODEPAGE(dwDstEncoding) )
            return S_FALSE ;

        if ( cUnknown == 0 )
        {
            if ( _dwWinCodePage && _dwWinCodePage != dwDstEncoding )
                return S_FALSE ;

            dwDstType = 0x02 ;
            _dwWinCodePage = dwDstEncoding ;
        }
        else
        {
            dwDstType = 0x11 ;
            _dwInternetEncoding = dwDstEncoding ;
        }
    }
    else
    {
        // check if two codepages are compatible
        if ( _dwWinCodePage && aEncodingInfo[nEntry].dwCodePage &&
             _dwWinCodePage != (DWORD) aEncodingInfo[nEntry].dwCodePage )
        {
            return S_FALSE ;
        }

        dwDstType = (DWORD) aEncodingInfo[nEntry].bTypeUUIW ;

        if ( aEncodingInfo[nEntry].dwCodePage )
        {
            _dwWinCodePage = (DWORD) aEncodingInfo[nEntry].dwCodePage ;
            if ( aEncodingInfo[nEntry].nCP_State == INVALID_CP )
                return S_FALSE ;
        }

        if ( dwDstType & 0x08 )
        {
            // A second UTF encoding (UTF-7 <-> UTF-8) goes into the spare slot.
            if ( _dwUTFEncoding )
                _dwUTFEncoding2 = dwDstEncoding ;
            else
                _dwUTFEncoding = dwDstEncoding ;
        }
        if ( dwDstType & 0x01 )
            _dwInternetEncoding = dwDstEncoding ;
        if ( dwDstType & 0x04 )
            _dwUnicodeEncoding = dwDstEncoding ;
    }

    // Both sides must have been classified.
    if ( !dwSrcType || !dwDstType )
        return S_FALSE ;

    // Two different encodings of the same class cannot be converted into
    // each other, except Unicode <-> Unicode (4) and UTF <-> UTF (8).
    if ( dwSrcType == dwDstType && *pdwSrcEncoding != dwDstEncoding &&
         ( 4 != dwSrcType ) && ( 8 != dwSrcType ) )
    {
        return S_FALSE ;
    }

    _dwConvertType = dwSrcType | dwDstType ;
    _bConvertDirt  = ( dwSrcType & 0x0f ) > ( dwDstType & 0x0f ) ;

    // if code convertor has been allocated, deallocate it
    if ( _hcins )
    {
        delete _hcins ;
        _hcins = NULL ;
    }

    return S_OK ;
}
// Performs the conversion configured by ConvertSetup.
//
//   _dwConvertType == 4 : CP_UCS_2 <-> CP_UCS_2_BE, copy + byte swap.
//   _dwConvertType == 8 : UTF-7 <-> UTF-8.  When both sides are the same UTF
//                         encoding the data is copied; otherwise it is
//                         converted through Unicode in _lpInterm1Str.
//   anything else       : ConvertUUWI or ConvertIWUU, chosen by _bConvertDirt.
//
// Parameters:
//   lpdwMode    - conversion mode state, forwarded to the converters.
//   lpSrcStr / lpnSrcSize - source buffer and its size in bytes.
//   lpDstStr    - destination buffer, or NULL to query the required size.
//   lpnDstSize  - in: destination capacity; out: bytes produced / required.
//   dwFlag, lpFallBack - forwarded to ConvertUUWI / ConvertIWUU.
//
// For the two copy paths _nSrcSize receives the number of source bytes
// consumed.  Returns the HRESULT of the conversion, or E_OUTOFMEMORY when
// the UTF-7 <-> UTF-8 intermediate buffer cannot be allocated.
HRESULT CICharConverter::DoCodeConvert(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    HRESULT hr = S_OK ;

    if ( 4 == _dwConvertType )          // CP_UCS_2 <-> CP_UCS_2_BE
    {
        if ( !lpDstStr )
        {
            // Size query: output is exactly as large as the input.
            _nSrcSize = *lpnDstSize = *lpnSrcSize ;
        }
        else
        {
            int nSize = min(*lpnDstSize, *lpnSrcSize);

            _nSrcSize = *lpnSrcSize ;
            if ( nSize > 0 )
            {
                MoveMemory(lpDstStr, lpSrcStr, nSize);
                DataByteSwap(lpDstStr, nSize);
                _nSrcSize = nSize ;
                *lpnDstSize = nSize ;
            }
        }
    }
    else if ( 8 == _dwConvertType )     // UTF7 <-> UTF8
    {
        if ( _dwUTFEncoding == _dwUTFEncoding2 )
        {
            if ( !lpDstStr )
            {
                // Size query, handled the same way as the UCS-2 path above.
                // Previously this reported min(capacity, source), i.e. 0 for
                // a normal query, or copied into a NULL destination.
                _nSrcSize = *lpnDstSize = *lpnSrcSize ;
            }
            else
            {
                _nSrcSize = *lpnDstSize = min(*lpnDstSize, *lpnSrcSize);
                if ( *lpnDstSize > 0 )
                    MoveMemory(lpDstStr, lpSrcStr, *lpnDstSize);
            }
        }
        else
        {
            int nBuffSize = 0 ;

            // Always succeeds
            hr = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, NULL, &nBuffSize);

            if ( _lpInterm1Str )
                LocalFree(_lpInterm1Str);

            _lpInterm1Str = (LPSTR)LocalAlloc(LPTR, nBuffSize);
            if ( _lpInterm1Str )
            {
                DWORD dwTmpEncoding = _dwUTFEncoding ;
                int   nTmpSrcSize ;

                hr = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize);

                // Temporarily make the destination encoding the "current"
                // UTF encoding for the second leg, and keep the consumed
                // source size reported by the first leg.
                _dwUTFEncoding = _dwUTFEncoding2 ;
                nTmpSrcSize = _nSrcSize ;

                // We don't need to create another dwMode since only UTF7 conversion needs it
                hr = UnicodeToUTF78(lpdwMode, _lpInterm1Str, &nBuffSize, lpDstStr, lpnDstSize);

                _nSrcSize = nTmpSrcSize ;
                _dwUTFEncoding = dwTmpEncoding ;
            }
            else
            {
                hr = E_OUTOFMEMORY ;
            }
        }
    }
    else if ( _bConvertDirt )
    {
        hr = ConvertUUWI(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
    }
    else
    {
        hr = ConvertIWUU(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
    }

    return hr ;
}
// Releases the buffers allocated during a conversion (_lpInterm1Str,
// _lpInterm2Str, _lpUnicodeStr) and resets the per-conversion counters
// (_cvt_count, _nSrcSize).  The encoding setup is left untouched.
// Always returns TRUE.
BOOL CICharConverter::ConvertCleanUp()
{
    if ( _lpInterm1Str != NULL )
    {
        LocalFree(_lpInterm1Str);
        _lpInterm1Str = NULL ;
    }

    if ( _lpInterm2Str != NULL )
    {
        LocalFree(_lpInterm2Str);
        _lpInterm2Str = NULL ;
    }

    if ( _lpUnicodeStr != NULL )
    {
        LocalFree(_lpUnicodeStr);
        _lpUnicodeStr = NULL ;
    }

    _nSrcSize  = 0 ;
    _cvt_count = 0 ;

    return TRUE ;
}
// Default constructor: creates a converter with no buffers, no encoding
// setup and no conversion flags.  ConvertSetup must be called before the
// object can convert anything.
//
// _dwFlag, _lpFallBack and _bConvertDirt are now given defined values as
// well; they were previously left indeterminate by this constructor.
CICharConverter::CICharConverter()
{
    // buffers and helper objects
    _lpInterm1Str = NULL ;
    _lpInterm2Str = NULL ;
    _lpUnicodeStr = NULL ;
    _hcins = NULL ;
    _hcins_dst = 0 ;

    // per-conversion state
    _cvt_count = 0 ;
    _nSrcSize = 0 ;

    // encoding setup
    _dwWinCodePage = 0 ;
    _dwInternetEncoding = 0 ;
    _dwUTFEncoding = 0 ;
    _dwUTFEncoding2 = 0 ;
    _dwUnicodeEncoding = 0 ;
    _dwConvertType = 0 ;
    _bConvertDirt = FALSE ;

    // conversion options
    _dwFlag = 0 ;
    _lpFallBack = NULL ;
}
// Creates a converter with no buffers and no encoding setup, remembering the
// conversion flags and the fallback string supplied by the caller.
// ConvertSetup must be called before the object can convert anything.
//
//   dwFlag     - stored in _dwFlag.
//   lpFallBack - stored in _lpFallBack (the pointer is kept, not copied).
//
// _bConvertDirt is now given a defined value as well; it was previously
// left indeterminate until ConvertSetup succeeded.
CICharConverter::CICharConverter(DWORD dwFlag, WCHAR *lpFallBack)
{
    // buffers and helper objects
    _lpInterm1Str = NULL ;
    _lpInterm2Str = NULL ;
    _lpUnicodeStr = NULL ;
    _hcins = NULL ;
    _hcins_dst = 0 ;

    // per-conversion state
    _cvt_count = 0 ;
    _nSrcSize = 0 ;

    // encoding setup
    _dwWinCodePage = 0 ;
    _dwInternetEncoding = 0 ;
    _dwUTFEncoding = 0 ;
    _dwUTFEncoding2 = 0 ;
    _dwUnicodeEncoding = 0 ;
    _dwConvertType = 0 ;
    _bConvertDirt = FALSE ;

    // conversion options
    _dwFlag = dwFlag ;
    _lpFallBack = lpFallBack ;
}
// Destructor: frees the conversion buffers and deletes the converter object
// held in _hcins.
CICharConverter::~CICharConverter()
{
    // Frees _lpInterm1Str, _lpInterm2Str and _lpUnicodeStr (and resets the
    // per-conversion counters, which no longer matter at this point).
    ConvertCleanUp();

    if ( _hcins != NULL )
    {
        delete _hcins ;
        _hcins = NULL ;
    }
}
// Creates a converter and immediately configures it for the given
// source -> destination pair by calling ConvertSetup.  The result of
// ConvertSetup is not reported; if the setup fails, _dwConvertType stays 0
// unless an explicit ConvertSetup call succeeds later.
//
// _dwFlag, _lpFallBack and _bConvertDirt are now given defined values as
// well; they were previously left indeterminate by this constructor.
CICharConverter::CICharConverter(DWORD dwSrcEncoding, DWORD dwDstEncoding)
{
    // buffers and helper objects
    _lpInterm1Str = NULL ;
    _lpInterm2Str = NULL ;
    _lpUnicodeStr = NULL ;
    _hcins = NULL ;
    _hcins_dst = 0 ;

    // per-conversion state
    _cvt_count = 0 ;
    _nSrcSize = 0 ;

    // encoding setup
    _dwWinCodePage = 0 ;
    _dwInternetEncoding = 0 ;
    _dwUTFEncoding = 0 ;
    _dwUTFEncoding2 = 0 ;
    _dwUnicodeEncoding = 0 ;
    _dwConvertType = 0 ;
    _bConvertDirt = FALSE ;

    // conversion options
    _dwFlag = 0 ;
    _lpFallBack = NULL ;

    ConvertSetup(&dwSrcEncoding, dwDstEncoding);
}
// Reports whether a conversion from dwSrcEncoding to dwDstEncoding can be
// set up.  Returns the result of CICharConverter::ConvertSetup (S_OK when
// the pair is supported, S_FALSE otherwise), or E_OUTOFMEMORY when the
// temporary converter cannot be allocated.
HRESULT WINAPI IsConvertINetStringAvailable(DWORD dwSrcEncoding, DWORD dwDstEncoding)
{
    CICharConverter * pConverter = new CICharConverter ;

    if ( pConverter == NULL )
        return E_OUTOFMEMORY ;

    HRESULT hrSetup = pConverter->ConvertSetup(&dwSrcEncoding, dwDstEncoding);

    delete pConverter ;

    return hrSetup ;
}
#define DETECTION_BUFFER_NUM 3
// In the CP_AUTO case where the detection result is UTF7, the private converter might use the high word of *lpdwMode to store
// internal data, but we need to use it to notify Trident of the detection result. Currently we are biased towards returning the
// correct detection result.
// This is currently by design. If we get a chance to re-prototype the conversion object, we can resolve this issue.
// Converts a buffer from dwSrcEncoding to dwDstEncoding.
//
// Parameters:
//   lpdwMode      - optional conversion state.  The low word is handed to the
//                   converter; the high word carries a previously detected
//                   code page in and the newly detected code page out.
//   dwSrcEncoding - source encoding; CP_AUTO, CP_JP_AUTO, CP_KR_AUTO (and a
//                   few mislabel-prone encodings) trigger detection.
//   dwDstEncoding - destination encoding.
//   lpSrcStr      - source bytes.
//   lpnSrcSize    - in: source size in bytes, or NULL / -1 for a
//                   zero-terminated string; out: number of bytes consumed.
//   lpDstStr      - destination buffer, or NULL to query the required size.
//   lpnDstSize    - in: destination capacity; out: bytes written / required.
//   dwFlag, lpFallBack - conversion options handed to the converter.
//
// Returns S_OK immediately (with *lpnDstSize = 0) when there is nothing to
// convert, E_OUTOFMEMORY when the converter cannot be allocated, the result
// of ConvertSetup when the pair cannot be set up, and otherwise the result
// of the conversion.
//
// Changes from the previous version:
//   * the same-encoding copy path no longer dereferences lpnDstSize when it
//     is NULL (it uses the already validated nDstSize) and never copies a
//     non-positive number of bytes;
//   * the length of a zero-terminated CP_UCS_2_BE source is measured in
//     16-bit units, like CP_UCS_2;
//   * the lower-casing loop covers 'A'..'Z' (it stopped at 'W').
HRESULT WINAPI ConvertINetStringEx(LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    CICharConverter * INetConvert;
    int nSrcSize;
    int nDstSize;
    DWORD dwMode = 0 ;
    // dwDetectResult
    // CP_UNDEFINED :Fail to detect
    // 0            :Not a auto-detect scenario
    // Others       :Detected encoding
    DWORD dwDetectResult = CP_UNDEFINED;
    HRESULT hr ;

    if (lpnSrcSize)
        nSrcSize = *lpnSrcSize;
    else
        nSrcSize = -1;

    // Get length of lpSrcStr if not given, assuming lpSrcStr is a zero terminate string.
    if ( lpSrcStr && nSrcSize == -1 )
    {
        // Both UCS-2 byte orders are terminated by a 16-bit zero.
        if ( dwSrcEncoding == CP_UCS_2 || dwSrcEncoding == CP_UCS_2_BE )
            nSrcSize = (lstrlenW((WCHAR*)lpSrcStr) << 1) ;
        else
            nSrcSize = lstrlenA(lpSrcStr) ;
    }

    // If there is nothing need to be converted, we return S_OK;
    if (!nSrcSize || !lpSrcStr)
    {
        if (lpnDstSize)
            *lpnDstSize = 0;
        return S_OK;
    }

    INetConvert = new CICharConverter(dwFlag, lpFallBack) ;
    if (!INetConvert)
        return E_OUTOFMEMORY;

    // ASSERT(CP_AUTO != dwDstEncoding);

    // if null specified at dst buffer we'll get the size of required buffer.
    if (lpDstStr && lpnDstSize)
        nDstSize = *lpnDstSize;
    else
        nDstSize = 0;

    if (lpdwMode)
        dwMode = *lpdwMode ;

    // In real world, clients uses 28591 as 1252, 28599 as 1254,
    // To correctly convert those extended characters to Unicode,
    // We internally replace it with 1252
    if (dwDstEncoding == CP_UCS_2 || dwDstEncoding == CP_UCS_2_BE)
    {
        if ((dwSrcEncoding == CP_ISO_8859_1) && _IsValidCodePage(CP_1252))
            dwSrcEncoding = CP_1252;

        if ((dwSrcEncoding == CP_ISO_8859_9) && _IsValidCodePage(CP_1254))
            dwSrcEncoding = CP_1254;
    }

    if ((dwDstEncoding == CP_1252) && (dwSrcEncoding == CP_ISO_8859_1))
    {
        dwSrcEncoding = CP_1252;
    }

    if ((dwDstEncoding == CP_1254) && (dwSrcEncoding == CP_ISO_8859_9))
    {
        dwSrcEncoding = CP_1254;
    }

    //
    // Auto Detection for Japan
    // Japanese user often tag their data incorrectly, so, if MLCONVCHARF_DETECTJPN specified,
    // we'll do extra detection for Shift-Jis and EUC
    //
    if ( dwSrcEncoding == CP_JP_AUTO ||
         ((dwFlag & MLCONVCHARF_DETECTJPN) &&
          (dwSrcEncoding == CP_JPN_SJ || dwSrcEncoding == CP_EUC_JP)))
    {
        CIncdJapanese DetectJapan(dwSrcEncoding);
        UINT uiCodePage ;

        // A code page stored in the high word by an earlier call wins.
        uiCodePage = ( dwMode >> 16 ) & 0xffff ;
        if ( uiCodePage )
        {
            dwSrcEncoding = uiCodePage ;
            dwDetectResult = 0;
        }
        else
        {
            dwSrcEncoding = DetectJapan.DetectStringA(lpSrcStr, nSrcSize);
            // if dwSrcEncoding is zero means there is an ambiguity, we don't return
            // the detected codepage to caller, instead we default its codepage internally
            // to SJIS
            if (dwSrcEncoding)
            {
                dwDetectResult = dwSrcEncoding << 16 ;
            }
            else
                dwSrcEncoding = CP_JPN_SJ;
        }
    }
    // bug #43190, we auto-detect again for euc-kr page because IMN ver 1.0
    // mislabel an ISO-KR page as a ks_c_5601-1987 page. This is the only way
    // we can fix that mistake.
    else if ( dwSrcEncoding == CP_KR_AUTO || dwSrcEncoding == CP_KOR_5601 || dwSrcEncoding == CP_EUC_KR )
    {
        CIncdKorean DetectKorean;
        UINT uiCodePage ;

        uiCodePage = ( dwMode >> 16 ) & 0xffff ;
        if ( uiCodePage )
        {
            dwSrcEncoding = uiCodePage ;
            dwDetectResult = 0;
        }
        else
        {
            dwSrcEncoding = DetectKorean.DetectStringA(lpSrcStr, nSrcSize);
            if (dwSrcEncoding)
            {
                dwDetectResult = dwSrcEncoding << 16 ;
            }
            else
                dwSrcEncoding = CP_KOR_5601;
        }
    }
    else if ( dwSrcEncoding == CP_AUTO ) // General Auto Detection for all code pages
    {
        // Only the first DETECTION_MAX_LEN bytes are examined.
        int nDetectSize = DETECTION_MAX_LEN < nSrcSize ? DETECTION_MAX_LEN : nSrcSize;
        int nScores = DETECTION_BUFFER_NUM;
        DetectEncodingInfo Encoding[DETECTION_BUFFER_NUM];
        UINT uiCodePage ;

        uiCodePage = ( dwMode >> 16 ) & 0xffff ;
        if ( uiCodePage )
        {
            dwSrcEncoding = uiCodePage ;
            dwDetectResult = 0;
        }
        else
        {
            // Fall back to g_uACP unless detection yields a code page the
            // MIME database considers valid.
            dwSrcEncoding = g_uACP;
            if ( S_OK == _DetectInputCodepage(MLDETECTCP_HTML, CP_AUTO, (char *)lpSrcStr, &nDetectSize, &Encoding[0], &nScores))
            {
                MIMECPINFO cpInfo;

                // A US-ASCII result is replaced by the fallback code page.
                if (Encoding[0].nCodePage == CP_20127)
                    Encoding[0].nCodePage = dwSrcEncoding;

                if (NULL != g_pMimeDatabase)
                {
                    if (SUCCEEDED(g_pMimeDatabase->GetCodePageInfo(Encoding[0].nCodePage, 0x409, &cpInfo)) &&
                        (cpInfo.dwFlags & MIMECONTF_VALID))
                    {
                        dwSrcEncoding = Encoding[0].nCodePage;
                        dwDetectResult = dwSrcEncoding << 16 ;
                    }
                }
            }

            // If we failed in general detection and system locale is Jpn, we try harder
            // with our Japanese detection engine
            if (dwSrcEncoding == CP_JPN_SJ && dwDetectResult == CP_UNDEFINED)
            {
                CIncdJapanese DetectJapan;
                DWORD dwSrcEncodingJpn = DetectJapan.DetectStringA(lpSrcStr, nSrcSize);
                if (dwSrcEncodingJpn)
                {
                    // We only change conversion encoding without returning this result to browser
                    // if it is in the middle of detection, this is to prevent other encodings been mis-detected as Jpn encodings.
                    dwSrcEncoding = dwSrcEncodingJpn;

                    // Set search range for end tag as 10 bytes
                    if (nSrcSize >= 10)
                    {
                        char szTmpStr[11] = {0};
                        char *lpTmpStr = szTmpStr;

                        _tcsncpy(szTmpStr, (char *)&lpSrcStr[nSrcSize-10], 10);

                        // ToLower (ASCII letters only)
                        while (*lpTmpStr)
                        {
                            if (*lpTmpStr >= 'A' && *lpTmpStr <= 'Z')
                                *lpTmpStr += 0x20;
                            lpTmpStr++;
                        }

                        // If end of page, return this result
                        if (MLStrStr(szTmpStr, "</html>"))
                            dwDetectResult = dwSrcEncoding << 16 ;
                    }
                }
            }
        }
    }
    else
    {
        // Not a auto-detect scenario
        dwDetectResult = 0;
    }

    if ( S_OK == ( hr = INetConvert->ConvertSetup(&dwSrcEncoding,dwDstEncoding )))
    {
        if ( dwSrcEncoding != dwDstEncoding )
        {
            // if high word of dwMode is CP_UTF_7, it must be detection result, don't pass it to UTF7 converter
            if ( dwSrcEncoding == CP_UTF_7 && (dwMode >> 16) == CP_UTF_7)
                dwMode &= 0xFFFF;

            // ASSERT(!((IS_ENCODED_ENCODING(dwSrcEncoding) || IS_ENCODED_ENCODING(dwDstEncoding)) && (NULL == lpdwMode)));
            hr = INetConvert->DoCodeConvert(&dwMode, lpSrcStr, &nSrcSize, lpDstStr, &nDstSize, dwFlag, lpFallBack);

            // return the number of bytes processed for the source.
            if (lpnSrcSize)
                *lpnSrcSize = INetConvert->_nSrcSize ;

            INetConvert->ConvertCleanUp();
        }
        else
        {
            // Same encoding on both sides: plain copy.
            int i ;
            BOOL bLeadByte = FALSE ;

            hr = S_OK ;

            // only check for windows codepage: do not split a double-byte
            // character at the end of the buffer.
            if ( INetConvert->_dwConvertType == 02 && lpSrcStr )
            {
                for ( i = 0; i < nSrcSize; i++ )
                {
                    if (bLeadByte)
                        bLeadByte = FALSE ;
                    else if (IsDBCSLeadByteEx(dwSrcEncoding, lpSrcStr[i]))
                        bLeadByte = TRUE ;
                }
                if (bLeadByte)
                    nSrcSize-- ;
            }

            // set input size
            if (lpnSrcSize)
                *lpnSrcSize = nSrcSize ;

            // set output size and copy if we need to.
            // nDstSize still holds the caller's capacity here (0 when no
            // destination buffer or no size pointer was supplied).
            if (lpDstStr && nDstSize > 0)
            {
                int nSize = min(nDstSize, nSrcSize);
                MoveMemory(lpDstStr, lpSrcStr, nSize);
                nDstSize = nSize ;
            }
            else
                nDstSize = nSrcSize ;
        }
    }
    else
        nDstSize = 0 ;

    delete INetConvert;

    // return the number of bytes copied for the destination,
    if (lpnDstSize)
        *lpnDstSize = nDstSize;

    if (lpdwMode && lpDstStr)
    {
        if (dwDetectResult) // CP_AUTO conversion
        {
            dwMode &= 0xFFFF; // Clear HIGHWORD in case private converter set it

            // If we have detection result, return it in HIGHWORD
            // in the case of UTF7 conversion, private converter might use high word to store internal data,
            // this will conflict with our logic of returning detection result in high word, it is a design flaw,
            // currently, we ignore conversion setting and give detection result more priority
            if (dwDetectResult != CP_UNDEFINED)
                dwMode |= dwDetectResult;
        }
        *lpdwMode = dwMode ;
    }

    return hr ;
}
// This API has already been published, so it is kept for backward
// compatibility.  It does nothing and always succeeds.
HRESULT WINAPI ConvertINetReset(void)
{
    return S_OK ;
}
// Converts a multi-byte buffer in dwEncoding to Unicode (CP_UCS_2) by
// forwarding to ConvertINetStringEx.
//
// Parameters:
//   lpdwMode          - optional conversion state, forwarded.
//   dwEncoding        - source encoding.
//   lpSrcStr          - source bytes.
//   lpnMultiCharCount - in/out source size in bytes, forwarded.
//   lpDstStr          - destination buffer, or NULL to query the size.
//   lpnWideCharCount  - in: destination capacity in WCHARs; out: number of
//                       WCHARs written / required.  May be NULL, in which
//                       case a capacity of 0 is used and nothing is
//                       reported back (the previous version dereferenced it
//                       unconditionally on the way out).
//   dwFlag, lpFallBack - forwarded.
//
// Returns the HRESULT of ConvertINetStringEx (E_FAIL on the UNIX-only
// widening failures below).
HRESULT WINAPI ConvertINetMultiByteToUnicodeEx(LPDWORD lpdwMode, DWORD dwEncoding, LPCSTR lpSrcStr, LPINT lpnMultiCharCount, LPWSTR lpDstStr, LPINT lpnWideCharCount, DWORD dwFlag, WCHAR *lpFallBack)
{
    HRESULT hr ;
    int nByteCountSize = 0;

    // The generic converter works in bytes.
    if (lpnWideCharCount)
    {
        nByteCountSize = *lpnWideCharCount * sizeof(WCHAR);
    }

#ifdef UNIX
    int saved_nByteCountSize = nByteCountSize;
#endif /* UNIX */

    hr = ConvertINetStringEx(lpdwMode, dwEncoding, CP_UCS_2, lpSrcStr, lpnMultiCharCount, (LPSTR)lpDstStr, &nByteCountSize, dwFlag, lpFallBack) ;

#ifdef UNIX
    if (dwEncoding == 1200 || dwEncoding == 65000 || dwEncoding == 65001 ||
        (dwEncoding == 50001 && !_IsValidCodePage(dwEncoding)) )
    {
        /*
         * On unix we need to convert the little endian mode 2 byte unicode
         * format to unix mode 4 byte wChars.
         */
        if (lpDstStr && (saved_nByteCountSize < (nByteCountSize/2)*sizeof(WCHAR)))
            hr = E_FAIL;
        else
        {
            /*
             * Use a temporary array to do the 2byte -> 4byte conversion
             */
            LPSTR pTmp = (LPSTR) lpDstStr;
            LPWSTR pw4 = NULL;

            if (pTmp) /* allocate only if we have a lpDstStr */
                pw4 = new WCHAR[nByteCountSize/2];
            if (pw4)
            {
                int i = 0;
                LPWSTR pw4Tmp = pw4;
                for (; i < nByteCountSize/2; i++)
                    *pw4Tmp++ = (UCHAR)pTmp[i*2];
                pw4Tmp = pw4;
                for (i = 0; i < nByteCountSize/2; i++)
                    *lpDstStr++ = *pw4Tmp++;
            }
            if (!pw4 && pTmp) /* if lpDstStr and allocate fails bail out */
                hr = E_FAIL;
            delete [] pw4;
        }
        nByteCountSize *= 2; // Expand twice as we have 4 byte wchars.
    }
#endif

    // Report the result in WCHARs, but only if the caller asked for it.
    if (lpnWideCharCount)
        *lpnWideCharCount = nByteCountSize / sizeof(WCHAR);

    return hr ;
}
// Converts a Unicode (CP_UCS_2) buffer to the multi-byte encoding dwEncoding
// by forwarding to ConvertINetStringEx.
//
// Parameters:
//   lpdwMode          - optional conversion state, forwarded.
//   dwEncoding        - destination encoding.
//   lpSrcStr          - source WCHARs.
//   lpnWideCharCount  - in: source length in WCHARs, or NULL / -1 for a
//                       zero-terminated string; out: WCHARs consumed.
//   lpDstStr          - destination buffer, or NULL to query the size.
//   lpnMultiCharCount - in/out destination size in bytes, forwarded.
//   dwFlag, lpFallBack - forwarded.
//
// Returns the HRESULT of ConvertINetStringEx.
//
// ConvertINetStringEx leaves the byte count untouched when it has nothing to
// convert or cannot set up the conversion.  With a zero-terminated source
// that count is still -1, and "-1 / sizeof(WCHAR)" is an unsigned division
// that produced a huge bogus value.  A count that is not positive is now
// reported as 0 WCHARs consumed.
HRESULT WINAPI ConvertINetUnicodeToMultiByteEx(LPDWORD lpdwMode, DWORD dwEncoding, LPCWSTR lpSrcStr, LPINT lpnWideCharCount, LPSTR lpDstStr, LPINT lpnMultiCharCount, DWORD dwFlag, WCHAR *lpFallBack)
{
    HRESULT hr ;
    int nByteCountSize = -1;    // -1: let the converter measure the string

    if (lpnWideCharCount && *lpnWideCharCount != -1)
        nByteCountSize = *lpnWideCharCount * sizeof(WCHAR);

    hr = ConvertINetStringEx(lpdwMode, CP_UCS_2, dwEncoding, (LPCSTR) lpSrcStr, &nByteCountSize, lpDstStr, lpnMultiCharCount, dwFlag, lpFallBack);

#ifdef UNIX
    if (dwEncoding == 1200 || dwEncoding == 65000 || dwEncoding == 65001)
    {
        nByteCountSize *= 2; // Expand twice as we have 4 byte wchars.
    }
#endif /* UNIX */

    if (lpnWideCharCount)
    {
        if (nByteCountSize > 0)
            *lpnWideCharCount = nByteCountSize / (int)sizeof(WCHAR);
        else
            *lpnWideCharCount = 0;
    }

    return hr ;
}
// Convenience wrapper around ConvertINetStringEx that passes no conversion
// flags (0) and no fallback string (NULL).  All other arguments and the
// return value are those of ConvertINetStringEx.
HRESULT WINAPI ConvertINetString(LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize)
{
    return ConvertINetStringEx(lpdwMode,
                               dwSrcEncoding,
                               dwDstEncoding,
                               lpSrcStr,
                               lpnSrcSize,
                               lpDstStr,
                               lpnDstSize,
                               0,
                               NULL);
}
// Convenience wrapper around ConvertINetUnicodeToMultiByteEx with no
// fallback string.  The only flag it can pass is MLCONVCHARF_ENTITIZE, which
// is requested when bit 0x00008000 of *lpdwMode is set.
HRESULT WINAPI ConvertINetUnicodeToMultiByte(LPDWORD lpdwMode, DWORD dwEncoding, LPCWSTR lpSrcStr, LPINT lpnWideCharCount, LPSTR lpDstStr, LPINT lpnMultiCharCount)
{
    DWORD dwConvFlags = 0 ;

    if ( lpdwMode != NULL && ( *lpdwMode & 0x00008000 ) )
        dwConvFlags |= MLCONVCHARF_ENTITIZE ;

    return ConvertINetUnicodeToMultiByteEx(lpdwMode,
                                           dwEncoding,
                                           lpSrcStr,
                                           lpnWideCharCount,
                                           lpDstStr,
                                           lpnMultiCharCount,
                                           dwConvFlags,
                                           NULL);
}
// Convenience wrapper around ConvertINetMultiByteToUnicodeEx that passes no
// conversion flags (0) and no fallback string (NULL).
HRESULT WINAPI ConvertINetMultiByteToUnicode(LPDWORD lpdwMode, DWORD dwEncoding, LPCSTR lpSrcStr, LPINT lpnMultiCharCount, LPWSTR lpDstStr, LPINT lpnWideCharCount)
{
    return ConvertINetMultiByteToUnicodeEx(lpdwMode,
                                           dwEncoding,
                                           lpSrcStr,
                                           lpnMultiCharCount,
                                           lpDstStr,
                                           lpnWideCharCount,
                                           0,
                                           NULL);
}
#define STR_BUFFER_SIZE 2048
// Converts the contents of pstmIn from dwSrcEncoding to dwDstEncoding and
// writes the result to pstmOut, STR_BUFFER_SIZE bytes at a time.
//
// Parameters:
//   INetConvert   - converter object used for the whole stream.
//   lpdwMode      - optional conversion state; the high word carries a
//                   previously detected code page in and the detected code
//                   page out.  Not written back when the function bails out
//                   on an allocation failure.
//   dwSrcEncoding - source encoding; CP_JP_AUTO, CP_KR_AUTO / CP_KOR_5601 /
//                   CP_EUC_KR and CP_AUTO trigger detection on the first
//                   buffer of the stream (the stream is rewound afterwards).
//   dwDstEncoding - destination encoding.
//   pstmIn/pstmOut - input and output streams.
//   dwFlag, lpFallBack - forwarded to DoCodeConvert.
//
// Source bytes that could not be consumed by the last conversion are pushed
// back by seeking pstmIn backwards.
//
// Returns the last HRESULT when it is not S_OK, otherwise the most recent
// non-S_OK result recorded on the way (hrWarnings), or S_OK.
//
// Changes from the previous version:
//   * dwMode starts at 0, so a NULL lpdwMode no longer makes the detection
//     code read an uninitialized value;
//   * the byte count is cleared before every Read, so a failing Read that
//     does not report a count cannot leave a stale or uninitialized value;
//   * the aEncodingInfo entry for CP_AUTO is only updated when it exists.
HRESULT _ConvertINetStringInIStream(CICharConverter * INetConvert, LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, IStream *pstmIn, IStream *pstmOut, DWORD dwFlag, WCHAR *lpFallBack)
{
    DWORD dwMode = 0, dwModeTemp ;
    HRESULT hr = S_OK, hrWarnings = S_OK;
    LPSTR lpstrIn = NULL, lpstrOut = NULL;
    ULONG nSrcSize = 0, nSrcUsed, nSrcLeft = 0, nDstSize, _nDstSize, nOutBuffSize ;

    if (lpdwMode)
        dwMode = *lpdwMode ;

    // allocate a temp input buffer - 2K in size
    if ( (lpstrIn = (LPSTR) LocalAlloc(LPTR, STR_BUFFER_SIZE )) == NULL )
    {
        hrWarnings = E_OUTOFMEMORY ;
        goto exit;
    }

    // output buffer starts at twice the input size and grows on demand
    if ( (lpstrOut = (LPSTR) LocalAlloc(LPTR, STR_BUFFER_SIZE * 2 )) == NULL )
    {
        hrWarnings = E_OUTOFMEMORY ;
        goto exit;
    }

    nOutBuffSize = STR_BUFFER_SIZE * 2 ;
    nSrcLeft = 0 ;

    // In real world, clients uses 28591 as 1252, 28599 as 1254,
    // To correctly convert those extended characters to Unicode,
    // We internally replace it with 1252
    if (dwDstEncoding == CP_UCS_2 || dwDstEncoding == CP_UCS_2_BE)
    {
        if ((dwSrcEncoding == CP_ISO_8859_1) && _IsValidCodePage(CP_1252))
            dwSrcEncoding = CP_1252;

        if ((dwSrcEncoding == CP_ISO_8859_9) && _IsValidCodePage(CP_1254))
            dwSrcEncoding = CP_1254;
    }

    if ((dwDstEncoding == CP_1252) && (dwSrcEncoding == CP_ISO_8859_1))
    {
        dwSrcEncoding = CP_1252;
    }

    if ((dwDstEncoding == CP_1254) && (dwSrcEncoding == CP_ISO_8859_9))
    {
        dwSrcEncoding = CP_1254;
    }

    if ( dwSrcEncoding == CP_JP_AUTO ) // Auto Detection for Japan
    {
        CIncdJapanese DetectJapan;
        UINT uiCodePage ;
        LARGE_INTEGER li;

        // A code page stored in the high word by an earlier call wins.
        uiCodePage = ( dwMode >> 16 ) & 0xffff ;
        if ( uiCodePage )
            dwSrcEncoding = uiCodePage ;
        else
        {
            LISet32(li, 0);

            // Peek at the first buffer, then rewind the stream.
            nSrcSize = 0 ;
            hr = pstmIn->Read(lpstrIn, STR_BUFFER_SIZE , &nSrcSize);
            if (S_OK != hr)
                hrWarnings = hr;
            hr = pstmIn->Seek(li, STREAM_SEEK_SET, NULL);
            if (S_OK != hr)
                hrWarnings = hr;

            dwSrcEncoding = DetectJapan.DetectStringA(lpstrIn, nSrcSize);
            // if dwSrcEncoding is zero means there is an ambiguity, we don't return
            // the detected codepage to caller, instead we default its codepage internally
            // to SJIS
            if (dwSrcEncoding)
            {
                dwMode &= 0x0000ffff ;
                dwMode |= dwSrcEncoding << 16 ;
            }
            else
                dwSrcEncoding = CP_JPN_SJ;
        }
    }
    // bug #43190, we auto-detect again for euc-kr page because IMN ver 1.0
    // mislabel an ISO-KR page as a ks_c_5601-1987 page. This is the only way
    // we can fix that mistake.
    else if ( dwSrcEncoding == CP_KR_AUTO || dwSrcEncoding == CP_KOR_5601 || dwSrcEncoding == CP_EUC_KR )
    {
        CIncdKorean DetectKorean;
        UINT uiCodePage ;
        LARGE_INTEGER li;

        uiCodePage = ( dwMode >> 16 ) & 0xffff ;
        if ( uiCodePage )
            dwSrcEncoding = uiCodePage ;
        else
        {
            LISet32(li, 0);

            nSrcSize = 0 ;
            hr = pstmIn->Read(lpstrIn, STR_BUFFER_SIZE, &nSrcSize);
            if (S_OK != hr)
                hrWarnings = hr;
            hr = pstmIn->Seek(li, STREAM_SEEK_SET, NULL);
            if (S_OK != hr)
                hrWarnings = hr;

            dwSrcEncoding = DetectKorean.DetectStringA(lpstrIn, nSrcSize);
            if (dwSrcEncoding)
            {
                dwMode &= 0x0000ffff ;
                dwMode |= dwSrcEncoding << 16 ;
            }
            else
                dwSrcEncoding = CP_KOR_5601;
        }
    }
    else if ( dwSrcEncoding == CP_AUTO ) // General Auto Detection for all code pages
    {
        INT nScores = 1;
        // NOTE(review): this local deliberately keeps the name of the
        // dwSrcEncoding parameter and therefore shadows it, exactly as
        // before.  The parameter stays CP_AUTO for ConvertSetup below; the
        // detected code page reaches the converter only through the
        // aEncodingInfo entry of CP_AUTO (shared, global state).  When the
        // code page comes from the high word of dwMode the table is not
        // touched at all.  This looks unintended -- confirm before changing.
        DWORD dwSrcEncoding ;
        DetectEncodingInfo Encoding;
        UINT uiCodePage ;
        LARGE_INTEGER li;

        uiCodePage = ( dwMode >> 16 ) & 0xffff ;
        if ( uiCodePage )
            dwSrcEncoding = uiCodePage ;
        else
        {
            int nAutoIndex ;

            LISet32(li, 0);

            nSrcSize = 0 ;
            hr = pstmIn->Read(lpstrIn, STR_BUFFER_SIZE , &nSrcSize);
            if (S_OK != hr)
                hrWarnings = hr;
            hr = pstmIn->Seek(li, STREAM_SEEK_SET, NULL);
            if (S_OK != hr)
                hrWarnings = hr;

            if (DETECTION_MAX_LEN < nSrcSize)
                nSrcSize = DETECTION_MAX_LEN;

            if ( S_OK == _DetectInputCodepage(MLDETECTCP_HTML, 1252, lpstrIn, (int *)&nSrcSize, &Encoding, &nScores))
            {
                dwSrcEncoding = Encoding.nCodePage;
                dwMode &= 0x0000ffff ;
                dwMode |= dwSrcEncoding << 16 ;
            }
            else
            {
                dwSrcEncoding = CP_ACP;
            }

            // Publish the detected code page through the CP_AUTO table entry.
            nAutoIndex = GetWindowsEncodingIndex(CP_AUTO);
            if ( nAutoIndex >= 0 )
                aEncodingInfo[nAutoIndex].dwCodePage = dwSrcEncoding;
        }
    }

    if ( S_OK == ( hr = INetConvert->ConvertSetup(&dwSrcEncoding,dwDstEncoding )))
    {
        // Loop until the input stream is exhausted
        while(1)
        {
            // Read a buffer, appending to the bytes left over from the
            // previous round.
            nSrcSize = 0 ;
            hr = pstmIn->Read(&lpstrIn[nSrcLeft], STR_BUFFER_SIZE-nSrcLeft, &nSrcSize);
            if (S_OK != hr)
                hrWarnings = hr;

            // Done
            if (0 == nSrcSize)
                break;

            nSrcSize += nSrcLeft ;
            nSrcUsed = nSrcSize ;
            dwModeTemp = dwMode ;
            nDstSize = 0 ;

            // get the size of output buffer (on a copy of the mode so the
            // real conversion below starts from the same state)
            hr = INetConvert->DoCodeConvert(&dwModeTemp, (LPCSTR)lpstrIn, (LPINT)&nSrcUsed, NULL, (LPINT)&nDstSize, dwFlag, lpFallBack);
            if (S_OK != hr)
                hrWarnings = hr;

            // Reallocate output buffer if so
            if ( nDstSize > nOutBuffSize )
            {
                LPSTR psz = (LPSTR) LocalReAlloc(lpstrOut, nDstSize, LMEM_ZEROINIT|LMEM_MOVEABLE);
                if (psz == NULL)
                {
                    hrWarnings = E_OUTOFMEMORY ;
                    goto exit;
                }
                lpstrOut = psz;
                nOutBuffSize = nDstSize ;
            }
            _nDstSize = nDstSize;

            // Due to multi_stage conversion, this is the actual size is used
            nSrcUsed = INetConvert->_nSrcSize ;
            nSrcLeft = nSrcSize - nSrcUsed ;

            // do conversion
            hr = INetConvert->DoCodeConvert(&dwMode, (LPCSTR)lpstrIn, (LPINT)&nSrcUsed, lpstrOut, (LPINT)&_nDstSize, dwFlag, lpFallBack);
            if (S_OK != hr)
                hrWarnings = hr;

            // Write It
            // NOTE(review): this writes the size reported by the size query
            // (nDstSize), not the size reported by the conversion itself
            // (_nDstSize) -- confirm the two always agree.
            hr = pstmOut->Write(lpstrOut, nDstSize, &nDstSize);
            if (S_OK != hr)
                hrWarnings = hr;

            // Keep the unconsumed tail at the front of the input buffer.
            if (nSrcLeft )
                MoveMemory(lpstrIn, &lpstrIn[nSrcSize-nSrcLeft], nSrcLeft);

            INetConvert->ConvertCleanUp();
        }
    }

    // Give back to the stream whatever could not be converted.
    if (nSrcLeft )
    {
        LARGE_INTEGER li;

        LISet32(li, -(LONG)nSrcLeft );
        hr = pstmIn->Seek(li, STREAM_SEEK_CUR, NULL);
    }

    if (lpdwMode)
        *lpdwMode = dwMode ;

exit :
    if (lpstrIn)
        LocalFree(lpstrIn);
    if (lpstrOut)
        LocalFree(lpstrOut);

    // Done
    return (hr == S_OK) ? hrWarnings : hr;
}
// Public entry point for stream conversion: creates a temporary converter
// carrying dwFlag / lpFallBack, runs _ConvertINetStringInIStream with it and
// destroys it again.  Returns E_OUTOFMEMORY when the converter cannot be
// allocated, otherwise the result of _ConvertINetStringInIStream.
HRESULT WINAPI ConvertINetStringInIStream(LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, IStream *pstmIn, IStream *pstmOut, DWORD dwFlag, WCHAR *lpFallBack)
{
    CICharConverter * pConverter = new CICharConverter(dwFlag, lpFallBack) ;

    if ( pConverter == NULL )
        return E_OUTOFMEMORY ;

    HRESULT hrResult = _ConvertINetStringInIStream(pConverter,
                                                   lpdwMode,
                                                   dwSrcEncoding,
                                                   dwDstEncoding,
                                                   pstmIn,
                                                   pstmOut,
                                                   dwFlag,
                                                   lpFallBack);

    delete pConverter ;

    return hrResult ;
}
|