You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
2285 lines
83 KiB
2285 lines
83 KiB
#include "private.h"
|
|
#include "detcbase.h"
|
|
#include "codepage.h"
|
|
#include "detcjpn.h"
|
|
#include "detckrn.h"
|
|
#include "fechrcnv.h"
|
|
#include "ichrcnv.h"
|
|
#include "cpdetect.h"
|
|
#include <tchar.h>
|
|
|
|
|
|
// Conversion-type codes.  Each value is the sum of two endpoint bits
// (8 = UTF-7/8, 4 = Unicode, 2 = Windows code page, 1 = Internet encoding);
// presumably these mirror the bTypeUUIW bit layout -- confirm against users.
#define CONV_UU     (0x08 | 0x04)   // 12
#define CONV_UUW    (0x08 | 0x02)   // 10
#define CONV_UUWI   (0x08 | 0x01)   //  9
#define CONV_UW     (0x04 | 0x02)   //  6
#define CONV_UWI    (0x04 | 0x01)   //  5
#define CONV_WI     (0x02 | 0x01)   //  3

// Number of trailing bytes the boundary probes are willing to walk back over.
#define MAX_CHAR_SIZE   4

// Code page 50000 is converted as Windows-1252; every other value passes through.
#define MAPUSERDEF(x) (((x) != 50000) ? (x) : 1252)
// The user-defined code page is always accepted; everything else is checked with the system.
#define CONVERT_IS_VALIDCODEPAGE(x) (((x) != CP_USER_DEFINED) ? IsValidCodePage(x) : TRUE)
// ENCODINGINFO::dwFlags bit set on some aEncodingInfo rows.
#define CONV_CHK_NLS    0x00000001
|
|
|
|
struct ENCODINGINFO
|
|
{
|
|
DWORD dwEncoding;
|
|
DWORD dwCodePage;
|
|
BYTE bTypeUUIW;
|
|
CP_STATE nCP_State ; // whether this is a valid windows codepage ?
|
|
DWORD dwFlags; // give us more flexibilities to handle different encodings differently
|
|
};
|
|
|
|
// NOTE(review): the identifier is misspelled ("Uniocde"); it is kept as-is
// because it may be referenced elsewhere in the file.
static WCHAR UniocdeSignature = 0xFFFE;
|
|
|
|
/*
|
|
Bit 4 (16) - Unicode <-> Internet Encoding
|
|
Bit 3 (8) - UTF8, UTF7
|
|
Bit 2 (4) - Unicode
|
|
Bit 1 (2) - Windows CodePage
|
|
Bit 0 (1) - Internet Encoding
|
|
|
|
P.S. if bit 4 is set, it means it should convert between Unicode and Internet
|
|
Encoding directly, no intermediate step - Windows CodePage
|
|
*/
|
|
|
|
// these codepages including Unicode need special convertor
|
|
static struct ENCODINGINFO aEncodingInfo[] =
|
|
{
|
|
|
|
{ CP_JPN_SJ, 932, 0x02, INVALID_CP, 0 }, // W-Japanese Shift JIS
|
|
{ CP_CHN_GB, 936, 0x02, INVALID_CP, 0 }, // W-Simplified Chinese
|
|
{ CP_KOR_5601, 949, 0x02, INVALID_CP, 0 }, // W-Krean Unified Hangul
|
|
{ CP_TWN, 950, 0x02, INVALID_CP, 0 }, // W-Traditional Chinese
|
|
{ CP_UCS_2, 0, 0x04, INVALID_CP, 0 }, // U-Unicode
|
|
{ CP_UCS_2_BE, 0, 0x04, INVALID_CP, 0 }, // U-Unicode Big Endian
|
|
{ CP_1252, 1252, 0x02, INVALID_CP, 0 }, // W-Latin 1
|
|
{ CP_20127, 1252, 0x11, INVALID_CP, CONV_CHK_NLS }, // US ASCII
|
|
{ CP_ISO_8859_1, 1252, 0x11, INVALID_CP, CONV_CHK_NLS }, // I-ISO 8859-1 Latin 1
|
|
{ CP_ISO_8859_15, 1252, 0x11, INVALID_CP, CONV_CHK_NLS }, // I-ISO 8859-1 Latin 1
|
|
{ CP_AUTO, 1252, 0x01, INVALID_CP, 0 }, // General auto detect
|
|
{ CP_ISO_2022_JP, 932, 0x01, INVALID_CP, 0 }, // I-ISO 2022-JP No Halfwidth Katakana
|
|
{ CP_ISO_2022_JP_ESC, 932, 0x01, INVALID_CP, 0 }, // I-ISO 2022-JP w/esc Halfwidth Katakana
|
|
{ CP_ISO_2022_JP_SIO, 932, 0x01, INVALID_CP, 0 }, // I-ISO 2022-JP w/sio Halfwidth Katakana
|
|
{ CP_ISO_2022_KR, 949, 0x01, INVALID_CP, 0 }, // I-ISO 2022-KR
|
|
{ CP_ISO_2022_TW, 950, 0x01, INVALID_CP, 0 }, // I-ISO 2022-TW
|
|
{ CP_ISO_2022_CH, 936, 0x01, INVALID_CP, 0 }, // I-ISO 2022-CH
|
|
{ CP_JP_AUTO, 932, 0x01, INVALID_CP, 0 }, // JP auto detect
|
|
{ CP_CHS_AUTO, 936, 0x01, INVALID_CP, 0 }, // Simplified Chinese auto detect
|
|
{ CP_KR_AUTO, 949, 0x01, INVALID_CP, 0 }, // KR auto detect
|
|
{ CP_CHT_AUTO, 950, 0x01, INVALID_CP, 0 }, // Traditional Chinese auto detect
|
|
{ CP_CYRILLIC_AUTO, 1251, 0x01, INVALID_CP, 0 }, // Cyrillic auto detect
|
|
{ CP_GREEK_AUTO, 1253, 0x01, INVALID_CP, 0 }, // Greek auto detect
|
|
{ CP_ARABIC_AUTO, 1256, 0x01, INVALID_CP, 0 }, // Arabic auto detect
|
|
{ CP_EUC_JP, 932, 0x01, INVALID_CP, 0 }, // EUC Japanese
|
|
{ CP_EUC_CH, 936, 0x01, INVALID_CP, 0 }, // EUC Chinese
|
|
{ CP_EUC_KR, 949, 0x01, INVALID_CP, 0 }, // EUC Korean
|
|
{ CP_EUC_TW, 950, 0x01, INVALID_CP, 0 }, // EUC Taiwanese
|
|
{ CP_CHN_HZ, 936, 0x01, INVALID_CP, 0 }, // Simplify Chinese HZ-GB
|
|
{ CP_UTF_7, 0, 0x08, INVALID_CP, 0 }, // U-UTF7
|
|
{ CP_UTF_8, 0, 0x08, INVALID_CP, 0 }, // U-UTF8
|
|
};
|
|
|
|
|
|
// HTML name entity table for Latin-1 Supplement - from 0x00A0-0x00FF
|
|
|
|
#define NAME_ENTITY_OFFSET 0x00A0
|
|
#define NAME_ENTITY_MAX 0x00FF
|
|
#define NAME_ENTITY_ENTRY 96
|
|
|
|
static CHAR *g_lpstrNameEntity[NAME_ENTITY_ENTRY] =
|
|
{
|
|
" ", // " " -- no-break space = non-breaking space,
|
|
"¡", // "¡" -- inverted exclamation mark, U+00A1 ISOnum -->
|
|
"¢", // "¢" -- cent sign, U+00A2 ISOnum -->
|
|
"£", // "£" -- pound sign, U+00A3 ISOnum -->
|
|
"¤", // "¤" -- currency sign, U+00A4 ISOnum -->
|
|
"¥", // "¥" -- yen sign = yuan sign, U+00A5 ISOnum -->
|
|
"¦", // "¦" -- broken bar = broken vertical bar,
|
|
"§", // "§" -- section sign, U+00A7 ISOnum -->
|
|
"¨", // "¨" -- diaeresis = spacing diaeresis,
|
|
"©", // "©" -- copyright sign, U+00A9 ISOnum -->
|
|
"ª", // "ª" -- feminine ordinal indicator, U+00AA ISOnum -->
|
|
"«", // "«" -- left-pointing double angle quotation mark
|
|
"¬", // "¬" -- not sign = discretionary hyphen,
|
|
"­", // "­" -- soft hyphen = discretionary hyphen,
|
|
"®", // "®" -- registered sign = registered trade mark sign,
|
|
"¯", // "¯" -- macron = spacing macron = overline
|
|
"°", // "°" -- degree sign, U+00B0 ISOnum -->
|
|
"±", // "±" -- plus-minus sign = plus-or-minus sign,
|
|
"²", // "²" -- superscript two = superscript digit two
|
|
"³", // "³" -- superscript three = superscript digit three
|
|
"´", // "´" -- acute accent = spacing acute,
|
|
"µ", // "µ" -- micro sign, U+00B5 ISOnum -->
|
|
"¶", // "¶" -- pilcrow sign = paragraph sign,
|
|
"·", // "·" -- middle dot = Georgian comma
|
|
"¸", // "¸" -- cedilla = spacing cedilla, U+00B8 ISOdia -->
|
|
"¹", // "¹" -- superscript one = superscript digit one,
|
|
"º", // "º" -- masculine ordinal indicator,
|
|
"»", // "»" -- right-pointing double angle quotation mark
|
|
"¼", // "¼" -- vulgar fraction one quarter
|
|
"½", // "½" -- vulgar fraction one half
|
|
"¾", // "¾" -- vulgar fraction three quarters
|
|
"¿", // "¿" -- inverted question mark
|
|
"À", // "À" -- latin capital letter A with grave
|
|
"Á", // "Á" -- latin capital letter A with acute,
|
|
"Â", // "Â" -- latin capital letter A with circumflex,
|
|
"Ã", // "Ã" -- latin capital letter A with tilde,
|
|
"Ä", // "Ä" -- latin capital letter A with diaeresis,
|
|
"Å", // "Å" -- latin capital letter A with ring above
|
|
"Æ", // "Æ" -- latin capital letter AE
|
|
"Ç", // "Ç" -- latin capital letter C with cedilla,
|
|
"È", // "È" -- latin capital letter E with grave,
|
|
"É", // "É" -- latin capital letter E with acute,
|
|
"Ê", // "Ê" -- latin capital letter E with circumflex,
|
|
"Ë", // "Ë" -- latin capital letter E with diaeresis,
|
|
"Ì", // "Ì" -- latin capital letter I with grave,
|
|
"Í", // "Í" -- latin capital letter I with acute,
|
|
"Î", // "Î" -- latin capital letter I with circumflex,
|
|
"Ï", // "Ï" -- latin capital letter I with diaeresis,
|
|
"Ð", // "Ð" -- latin capital letter ETH, U+00D0 ISOlat1 -->
|
|
"Ñ", // "Ñ" -- latin capital letter N with tilde,
|
|
"Ò", // "Ò" -- latin capital letter O with grave,
|
|
"Ó", // "Ó" -- latin capital letter O with acute,
|
|
"Ô", // "Ô" -- latin capital letter O with circumflex,
|
|
"Õ", // "Õ" -- latin capital letter O with tilde,
|
|
"Ö", // "Ö" -- latin capital letter O with diaeresis,
|
|
"×", // "×" -- multiplication sign, U+00D7 ISOnum -->
|
|
"Ø", // "Ø" -- latin capital letter O with stroke
|
|
"Ù", // "Ù" -- latin capital letter U with grave,
|
|
"Ú", // "Ú" -- latin capital letter U with acute,
|
|
"Û", // "Û" -- latin capital letter U with circumflex,
|
|
"Ü", // "Ü" -- latin capital letter U with diaeresis,
|
|
"Ý", // "Ý" -- latin capital letter Y with acute,
|
|
"Þ", // "Þ" -- latin capital letter THORN,
|
|
"ß", // "ß" -- latin small letter sharp s = ess-zed,
|
|
"à", // "à" -- latin small letter a with grave
|
|
"á", // "á" -- latin small letter a with acute,
|
|
"â", // "â" -- latin small letter a with circumflex,
|
|
"ã", // "ã" -- latin small letter a with tilde,
|
|
"ä", // "ä" -- latin small letter a with diaeresis,
|
|
"å", // "å" -- latin small letter a with ring above
|
|
"æ", // "æ" -- latin small letter ae
|
|
"ç", // "ç" -- latin small letter c with cedilla,
|
|
"è", // "è" -- latin small letter e with grave,
|
|
"é", // "é" -- latin small letter e with acute,
|
|
"ê", // "ê" -- latin small letter e with circumflex,
|
|
"ë", // "ë" -- latin small letter e with diaeresis,
|
|
"ì", // "ì" -- latin small letter i with grave,
|
|
"í", // "í" -- latin small letter i with acute,
|
|
"î", // "î" -- latin small letter i with circumflex,
|
|
"ï", // "ï" -- latin small letter i with diaeresis,
|
|
"ð", // "ð" -- latin small letter eth, U+00F0 ISOlat1 -->
|
|
"ñ", // "ñ" -- latin small letter n with tilde,
|
|
"ò", // "ò" -- latin small letter o with grave,
|
|
"ó", // "ó" -- latin small letter o with acute,
|
|
"ô", // "ô" -- latin small letter o with circumflex,
|
|
"õ", // "õ" -- latin small letter o with tilde,
|
|
"ö", // "ö" -- latin small letter o with diaeresis,
|
|
"÷", // "÷" -- division sign, U+00F7 ISOnum -->
|
|
"ø", // "ø" -- latin small letter o with stroke,
|
|
"ù", // "ù" -- latin small letter u with grave,
|
|
"ú", // "ú" -- latin small letter u with acute,
|
|
"û", // "û" -- latin small letter u with circumflex,
|
|
"ü", // "ü" -- latin small letter u with diaeresis,
|
|
"ý", // "ý" -- latin small letter y with acute,
|
|
"þ", // "þ" -- latin small letter thorn with,
|
|
"ÿ", // "ÿ" -- latin small letter y with diaeresis,
|
|
};
|
|
|
|
|
|
#ifdef MORE_NAME_ENTITY // in case we decide to do more name entities later
// Additional HTML 4.0 entity table for the CP 1252 extension range (0x80-0x9F).
#define CP1252EXT_BASE      (UINT)0x0080
#define CP1252EXT_MAX       (UINT)0x009F
#define NONUNI              0xFFFF
#define UNDEFCHAR           "???????"
#define CP1252EXT_NCR_SIZE  7

// Maps a Unicode code point to the text emitted in its place.
struct NAME_ENTITY_EXT
{
    UWORD   uwUniCode;          // Unicode code point
    LPCTSTR lpszNameEntity;     // replacement text (decimal numeric character reference)
};

// Every replacement is an ASCII decimal NCR for uwUniCode, zero-padded so that
// all strings are CP1252EXT_NCR_SIZE characters long.  CP 1252 positions
// 0x81, 0x8D, 0x8F, 0x90 and 0x9D are undefined and have no row.
static struct NAME_ENTITY_EXT aNameEntityExt[] =
{
//    UniCode  NCR_Enty        Name_Enty     CP1252Ext  Comment
    { 0x20AC, "&#8364;" },  // "&euro;"      0x80  EURO SIGN
    { 0x201A, "&#8218;" },  // "&sbquo;"     0x82  SINGLE LOW-9 QUOTATION MARK
    { 0x0192, "&#0402;" },  // "&fnof;"      0x83  LATIN SMALL LETTER F WITH HOOK
    { 0x201E, "&#8222;" },  // "&bdquo;"     0x84  DOUBLE LOW-9 QUOTATION MARK
    { 0x2026, "&#8230;" },  // "&hellip;"    0x85  HORIZONTAL ELLIPSIS
    { 0x2020, "&#8224;" },  // "&dagger;"    0x86  DAGGER
    { 0x2021, "&#8225;" },  // "&Dagger;"    0x87  DOUBLE DAGGER
    { 0x02C6, "&#0710;" },  // "&circ;"      0x88  MODIFIER LETTER CIRCUMFLEX ACCENT
    { 0x2030, "&#8240;" },  // "&permil;"    0x89  PER MILLE SIGN
    { 0x0160, "&#0352;" },  // "&Scaron;"    0x8A  LATIN CAPITAL LETTER S WITH CARON
    { 0x2039, "&#8249;" },  // "&lsaquo;"    0x8B  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    { 0x0152, "&#0338;" },  // "&OElig;"     0x8C  LATIN CAPITAL LIGATURE OE
    { 0x017D, "&#0381;" },  // (none)        0x8E  LATIN CAPITAL LETTER Z WITH CARON, no name entity defined in HTML 4.0
    { 0x2018, "&#8216;" },  // "&lsquo;"     0x91  LEFT SINGLE QUOTATION MARK
    { 0x2019, "&#8217;" },  // "&rsquo;"     0x92  RIGHT SINGLE QUOTATION MARK
    { 0x201C, "&#8220;" },  // "&ldquo;"     0x93  LEFT DOUBLE QUOTATION MARK
    { 0x201D, "&#8221;" },  // "&rdquo;"     0x94  RIGHT DOUBLE QUOTATION MARK
    { 0x2022, "&#8226;" },  // "&bull;"      0x95  BULLET
    { 0x2013, "&#8211;" },  // "&ndash;"     0x96  EN DASH
    { 0x2014, "&#8212;" },  // "&mdash;"     0x97  EM DASH
    { 0x02DC, "&#0732;" },  // "&tilde;"     0x98  SMALL TILDE (U+02DC; the key used to be 0x20DC, a different character)
    { 0x2122, "&#8482;" },  // "&trade;"     0x99  TRADE MARK SIGN
    { 0x0161, "&#0353;" },  // "&scaron;"    0x9A  LATIN SMALL LETTER S WITH CARON
    { 0x203A, "&#8250;" },  // "&rsaquo;"    0x9B  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    { 0x0153, "&#0339;" },  // "&oelig;"     0x9C  LATIN SMALL LIGATURE OE
    { 0x017E, "&#0382;" },  // (none)        0x9E  LATIN SMALL LETTER Z WITH CARON, no name entity defined in HTML 4.0
    { 0x0178, "&#0376;" },  // "&Yuml;"      0x9F  LATIN CAPITAL LETTER Y WITH DIAERESIS
};
#endif
|
|
|
|
|
|
/******************************************************************************
|
|
***************************** U T I L I T I E S ***************************
|
|
******************************************************************************/
|
|
// Swaps the two bytes of every complete 16-bit unit in DataBuf, in place.
// len is the buffer length in bytes; an odd trailing byte is left untouched.
void DataByteSwap(LPSTR DataBuf, int len )
{
    for (int pos = 0; pos + 1 < len; pos += 2)
    {
        UCHAR first = DataBuf[pos];

        DataBuf[pos]     = DataBuf[pos + 1];
        DataBuf[pos + 1] = first;
    }
}
|
|
|
|
// Byte-swaps DataBuf in place when dwDstEncoding is big-endian UCS-2.
// Any other encoding, a NULL buffer, or a zero length is a no-op.
void CheckUnicodeDataType(DWORD dwDstEncoding, LPSTR DataBuf, int len )
{
    if (!DataBuf || !len)
        return;

    if (CP_UCS_2_BE == dwDstEncoding)
        DataByteSwap(DataBuf, len);
}
|
|
|
|
// For US-ASCII (CP_20127) input, clears the high bit of every byte in
// DataBuf so that only 7-bit values remain.  Other encodings, a NULL buffer,
// or a zero length leave the data untouched.
void CheckASCIIEncoding(DWORD dwSrcEncoding, LPSTR DataBuf, int len )
{
    if (!DataBuf || !len || CP_20127 != dwSrcEncoding)
        return;

    for (int idx = 0; idx < len; ++idx)
    {
        // Only touch bytes that actually carry the eighth bit.
        if (DataBuf[idx] & 0x80)
            DataBuf[idx] &= 0x7f;
    }
}
|
|
|
|
/******************************************************************************
|
|
****************** C O N V E R T I N E T S T R I N G ******************
|
|
******************************************************************************/
|
|
// Converts the UCS-2 text in lpSrcStr (*lpnSrcSize bytes) to the multibyte
// code page dwDstEncoding.
//
//   lpDstStr, lpnDstSize   Output buffer and, on entry, its size in bytes.
//                          An entry size of 0 is a size query: no output is
//                          written and *lpnDstSize receives the number of
//                          bytes the conversion would produce.  On success
//                          *lpnDstSize is the number of bytes produced.
//   dwFlag                 MLCONVCHARF_* options.  When any of NCR_ENTITIZE,
//                          NAME_ENTITIZE or NOBESTFITCHARS is set, the result
//                          is converted back to Unicode and every character
//                          that did not survive the round trip is replaced by
//                          a name entity, a decimal "&#N;" reference, or the
//                          default character.
//   lpFallBack             Optional replacement character, used only together
//                          with MLCONVCHARF_USEDEFCHAR and only if it converts
//                          to a single byte in the destination code page.
//
// Returns S_OK on success, S_FALSE when a default character had to be used,
// E_OUTOFMEMORY when a temporary buffer cannot be allocated, and E_FAIL when
// the conversion fails, a lossy character is found but no replacement policy
// was requested, or the replacement text does not fit in the caller's buffer.
HRESULT CICharConverter::UnicodeToMultiByteEncoding(DWORD dwDstEncoding, LPCSTR lpSrcStr, LPINT lpnSrcSize,
    LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    int nBuffSize, i ;
    BOOL UseDefChar = FALSE ;
    LPSTR lpDefFallBack = NULL ;
    UCHAR DefaultCharBuff[3];           // possible DBCS + null
    HRESULT hr = E_FAIL;
    int _nDstSize = *lpnDstSize;        // caller's buffer size; 0 means size query

    // Big-endian source on the first conversion step: work on a private copy
    // so the byte swap below does not modify the caller's buffer.
    if ( _dwUnicodeEncoding == CP_UCS_2_BE && _cvt_count == 0 )
    {
        _lpUnicodeStr = (LPSTR)LocalAlloc(LPTR, *lpnSrcSize );
        if ( !_lpUnicodeStr )
        {
            hr = E_OUTOFMEMORY;
            goto EXIT;
        }
        MoveMemory(_lpUnicodeStr, lpSrcStr, *lpnSrcSize ) ;
        lpSrcStr = _lpUnicodeStr ;
    }

    CheckUnicodeDataType(_dwUnicodeEncoding, (LPSTR) lpSrcStr, *lpnSrcSize);

    nBuffSize = *lpnSrcSize / sizeof(WCHAR);

    // We force to use MLang NO_BEST_FIT_CHAR check on ISCII encoding since system don't accept default chars
    if (IS_NLS_DLL_CP(dwDstEncoding) && (dwFlag & MLCONVCHARF_USEDEFCHAR))
        dwFlag |= MLCONVCHARF_NOBESTFITCHARS;

    if ( lpFallBack && ( dwFlag & MLCONVCHARF_USEDEFCHAR ))
    {
        // only take SBCS, no DBCS character
        if ( 1 == WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
                    (LPCWSTR)lpFallBack, 1,
                    (LPSTR)DefaultCharBuff, sizeof(DefaultCharBuff), NULL, NULL ))
            lpDefFallBack = (LPSTR) DefaultCharBuff;
    }

    // Plain conversion of the whole string (or size query when _nDstSize is 0).
    if(!(*lpnDstSize = WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
            (LPCWSTR)lpSrcStr, nBuffSize,
            lpDstStr, *lpnDstSize,
            IS_NLS_DLL_CP(dwDstEncoding)? NULL:(LPCSTR)lpDefFallBack,
            IS_NLS_DLL_CP(dwDstEncoding)? NULL:&UseDefChar)))
    {
        hr = E_FAIL;
        goto EXIT;
    }

    if ( !_cvt_count )  // save SrcSize if it is the first time conversion
        _nSrcSize = nBuffSize * sizeof(WCHAR);

    if (*lpnDstSize)
    {
        if (dwFlag & ( MLCONVCHARF_NCR_ENTITIZE | MLCONVCHARF_NAME_ENTITIZE | MLCONVCHARF_NOBESTFITCHARS ))
        {
            char    *lpDstStrTmp = lpDstStr;                // write cursor in the caller's buffer
            WCHAR   *lpwStrTmp = NULL;                      // round-tripped text
            WCHAR   *lpwStrTmpSave = NULL;
            char    *lpDstStrTmp2 = NULL;                   // scratch output for size queries
            char    *lpDstStrTmp2Save = NULL;
            int     cCount, ConvCount = 0, nCount = 0;      // ConvCount: bytes produced so far
            WCHAR   *lpwSrcStrTmp = (WCHAR *)lpSrcStr;      // read cursor in the source
            int     *lpBCharOffset = NULL;                  // gaps between lossy characters
            int     *lpBCharOffsetSave = NULL;

            if (!(lpwStrTmpSave = lpwStrTmp = (WCHAR *)LocalAlloc(LPTR, *lpnSrcSize)))
            {
                hr = E_OUTOFMEMORY;
                goto ENTITIZE_DONE;
            }

            // Make sure we have real converted buffer to check BEST_FIT_CHAR and DEFAULT_CHAR
            if (!_nDstSize)
            {
                lpDstStrTmp2Save = lpDstStrTmp2 = (char *)LocalAlloc(LPTR, *lpnDstSize);
                if (!lpDstStrTmp2)
                {
                    hr = E_OUTOFMEMORY;
                    goto ENTITIZE_DONE;
                }
                WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
                    (LPCWSTR)lpSrcStr, nBuffSize,
                    lpDstStrTmp2, *lpnDstSize, NULL, NULL );
            }

            // Round trip back to Unicode.  lpwStrTmp holds *lpnSrcSize bytes, i.e.
            // nBuffSize WCHARs, so that is the capacity handed to the API.  A round
            // trip that yields any other character count cannot be aligned with the
            // source and is treated as a failure.
            if (nBuffSize ==
                MultiByteToWideChar(MAPUSERDEF(dwDstEncoding), 0,
                    _nDstSize? lpDstStr : lpDstStrTmp2, *lpnDstSize, lpwStrTmp, nBuffSize))
            {
                // Pre scan to get number of best fit chars.
                for (i=0; i<nBuffSize; i++)
                {
                    if (*lpwStrTmp++ != *lpwSrcStrTmp++)
                    {
                        // Special case: the yen sign in Shift-JIS is accepted as is.
                        // Patch the round-tripped copy so the second scan skips it too.
                        if ((dwDstEncoding == CP_JPN_SJ) && (*(lpwSrcStrTmp - 1) == 0x00A5))
                            *(lpwStrTmp - 1) = 0x00A5;
                        else
                            nCount ++;
                    }
                }

                lpwSrcStrTmp -= nBuffSize;
                lpwStrTmp -= nBuffSize;

                if (nCount)
                {
                    int j = 0;

                    // Lossy characters found but the caller asked only for detection.
                    if (!(dwFlag & ( MLCONVCHARF_NCR_ENTITIZE | MLCONVCHARF_NAME_ENTITIZE | MLCONVCHARF_USEDEFCHAR)))
                    {
                        hr = E_FAIL;
                        goto ENTITIZE_DONE;
                    }

                    if (!(lpBCharOffsetSave = lpBCharOffset = (int *) LocalAlloc(LPTR, nCount*sizeof(int))))
                    {
                        hr = E_OUTOFMEMORY;
                        goto ENTITIZE_DONE;
                    }

                    // Record, for each lossy char, how many good chars precede it
                    // since the previous lossy char.
                    for (i=0; i<nBuffSize; i++)
                    {
                        if (*lpwStrTmp++ != *lpwSrcStrTmp++)
                        {
                            *lpBCharOffset = i-j;
                            lpBCharOffset++;
                            j = i+1;
                        }
                    }

                    lpBCharOffset -= nCount;
                    lpwSrcStrTmp -= nBuffSize;
                    lpwStrTmp -= nBuffSize;

                    for (i=0; i<nCount; i++)
                    {
                        BOOL bIsSurrogatePair = FALSE;
                        BOOL fConverted = FALSE;

                        // Convert the run of good characters in front of this lossy one.
                        if (*lpBCharOffset)
                        {
                            if (_nDstSize)
                            {
                                // A zero-byte window would turn the call into a size
                                // query and advance the cursor past the buffer.
                                if (_nDstSize - ConvCount <= 0)
                                {
                                    hr = E_FAIL;
                                    goto ENTITIZE_DONE;
                                }

                                cCount = WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
                                            (LPCWSTR)lpwSrcStrTmp, *lpBCharOffset,
                                            lpDstStrTmp, _nDstSize-ConvCount, NULL, NULL );
                                if (!cCount)
                                {
                                    hr = E_FAIL;
                                    goto ENTITIZE_DONE;
                                }
                                lpDstStrTmp += cCount;
                            }
                            else
                            {
                                cCount = WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
                                            (LPCWSTR)lpwSrcStrTmp, *lpBCharOffset,
                                            lpDstStrTmp, 0, NULL, NULL );
                            }

                            ConvCount += cCount;
                            lpwSrcStrTmp += *lpBCharOffset;
                        }

                        // check if unconvertable character falls in NAME ENTITY area
                        if (dwFlag & MLCONVCHARF_NAME_ENTITIZE)
                        {
                            // for beta2, make assumption that name entity implies NCR.
                            dwFlag |= MLCONVCHARF_NCR_ENTITIZE;

#ifdef MORE_NAME_ENTITY // in case we decide do more name entity later
                            BOOL fDoNEnty = FALSE;
                            LPCTSTR lpszNEnty = NULL;

                            // check if character is in the Latin-1 Supplement range
                            if ((*lpwSrcStrTmp >= NAME_ENTITY_OFFSET) && (*lpwSrcStrTmp <= NAME_ENTITY_MAX ))
                            {
                                fDoNEnty = TRUE;
                                lpszNEnty = g_lpstrNameEntity[(*lpwSrcStrTmp) - NAME_ENTITY_OFFSET];
                            }

                            // check if character is in the additional name entity table for CP 1252 extension
                            if (!fDoNEnty)
                            {
                                for (int idx = 0; idx < ARRAYSIZE(aNameEntityExt); idx++)
                                    if (*lpwSrcStrTmp == aNameEntityExt[idx].uwUniCode)
                                    {
                                        fDoNEnty = TRUE;
                                        lpszNEnty = aNameEntityExt[idx].lpszNameEntity;
                                        break;
                                    }
                            }

                            if (fDoNEnty)
                            {
                                cCount = lstrlenA(lpszNEnty);
                                if (_nDstSize)
                                {
                                    // The entity must fit in what is left of the buffer.
                                    if (cCount > _nDstSize - ConvCount)
                                    {
                                        hr = E_FAIL;
                                        goto ENTITIZE_DONE;
                                    }
                                    CopyMemory(lpDstStrTmp, lpszNEnty, cCount);
                                    lpDstStrTmp += cCount ;
                                }

                                ConvCount += cCount;
                                fConverted = TRUE;
                            }
#else
                            // check if character is in the Latin-1 Supplement range
                            if ((*lpwSrcStrTmp >= NAME_ENTITY_OFFSET)
                                && (*lpwSrcStrTmp < ARRAYSIZE(g_lpstrNameEntity)+NAME_ENTITY_OFFSET))
                            {
                                LPCSTR lpszNEnty = g_lpstrNameEntity[(*lpwSrcStrTmp) - NAME_ENTITY_OFFSET];

                                if (!lpszNEnty)
                                {
#ifdef DEBUG
                                    AssertMsg((BOOL)FALSE, "Name entity table broken");
#endif
                                    hr = E_FAIL;
                                    goto ENTITIZE_DONE;
                                }

                                cCount = lstrlenA(lpszNEnty);
                                if (_nDstSize)
                                {
                                    // The entity must fit in what is left of the buffer.
                                    if (cCount > _nDstSize - ConvCount)
                                    {
                                        hr = E_FAIL;
                                        goto ENTITIZE_DONE;
                                    }
                                    CopyMemory(lpDstStrTmp, lpszNEnty, cCount);
                                    lpDstStrTmp += cCount ;
                                }

                                ConvCount += cCount;
                                fConverted = TRUE;
                            }
#endif
                        }

                        // check if NCR requested
                        if ((!fConverted) && (dwFlag & MLCONVCHARF_NCR_ENTITIZE))
                        {
                            char  szNcr[12];        // decimal digits of the code point + null
                            DWORD dwUnicode;

                            // A high surrogate followed by a low surrogate, both lossy,
                            // is emitted as a single reference to the combined code point.
                            if ((nCount-i >= 2) &&
                                (*lpwSrcStrTmp >= 0xD800 && *lpwSrcStrTmp <= 0xDBFF) &&
                                (*(lpwSrcStrTmp+1) >= 0xDC00 && *(lpwSrcStrTmp+1) <= 0xDFFF))
                                bIsSurrogatePair = TRUE;
                            else
                                bIsSurrogatePair = FALSE;

                            if (bIsSurrogatePair)
                                dwUnicode = ((*lpwSrcStrTmp - 0xD800) << 10) + *(lpwSrcStrTmp+1) - 0xDC00 + 0x10000;
                            else
                                dwUnicode = *lpwSrcStrTmp;

                            // Format into a local buffer first so the length is known
                            // before anything is written to the caller's buffer.
                            _ultoa( dwUnicode, szNcr, 10);
                            cCount = lstrlenA(szNcr);

                            if (_nDstSize)
                            {
                                // "&#" + digits + ";" must fit in the remaining space.
                                if (cCount + 3 > _nDstSize - ConvCount)
                                {
                                    hr = E_FAIL;
                                    goto ENTITIZE_DONE;
                                }

                                lpDstStrTmp[0] = '&' ;
                                lpDstStrTmp[1] = '#' ;
                                CopyMemory(lpDstStrTmp + 2, szNcr, cCount);
                                lpDstStrTmp[2 + cCount] = ';' ;
                                lpDstStrTmp += cCount + 3;
                            }

                            ConvCount += cCount + 3;
                            fConverted = TRUE;
                        }

                        // handle MLCONVCHARF_USEDEFCHAR here - less priority and default method
                        if (!fConverted)
                        {
                            if (_nDstSize)
                            {
                                if (_nDstSize - ConvCount < 1)
                                {
                                    hr = E_FAIL;
                                    goto ENTITIZE_DONE;
                                }
                                *lpDstStrTmp = lpDefFallBack ? *lpDefFallBack : '?';
                                lpDstStrTmp++;
                            }

                            ConvCount++;
                            UseDefChar = TRUE;
                        }

                        lpBCharOffset++;
                        lpwSrcStrTmp++;
                        // Skip next character if it is a Unicode surrogates pair
                        if (bIsSurrogatePair)
                        {
                            lpBCharOffset++;
                            lpwSrcStrTmp++;
                            i++;
                        }
                    }
                    lpBCharOffset -= nCount ;
                }

                // Convert whatever follows the last lossy character.
                int nRemain = (*lpnSrcSize - (int)((char*)lpwSrcStrTmp - (char *)lpSrcStr))/sizeof(WCHAR);

                if (nRemain > 0)
                {
                    if (_nDstSize && (_nDstSize - ConvCount <= 0))
                    {
                        hr = E_FAIL;
                        goto ENTITIZE_DONE;
                    }

                    cCount = WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
                                (LPCWSTR)lpwSrcStrTmp, nRemain,
                                lpDstStrTmp, _nDstSize? _nDstSize-ConvCount : 0, NULL, NULL );

                    if (_nDstSize && !cCount)
                    {
                        hr = E_FAIL;
                        goto ENTITIZE_DONE;
                    }

                    ConvCount += cCount;
                }

                *lpnDstSize = ConvCount ;

                hr = S_OK;
            }
            else
            {
                hr = E_FAIL;
            }

ENTITIZE_DONE:
            if (lpwStrTmpSave)
                LocalFree(lpwStrTmpSave);
            if (lpDstStrTmp2Save)
                LocalFree(lpDstStrTmp2Save);
            if (lpBCharOffsetSave)
                LocalFree(lpBCharOffsetSave);
        }
        else
        {
            hr = S_OK;
        }

        if (S_OK == hr && UseDefChar)
            hr = S_FALSE;
    }
    else
    {
        hr = E_FAIL;
    }

EXIT:
    return hr;
}
|
|
|
|
// Converts UTF-7/UTF-8 input (_dwUTFEncoding) to UCS-2 through
// DoConvertINetString, then byte-swaps the output when the converter's
// Unicode encoding is big-endian.  On the first conversion step the consumed
// source size is remembered in _nSrcSize.  Returns DoConvertINetString's result.
HRESULT CICharConverter::UTF78ToUnicode(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
    LPSTR lpDstStr, LPINT lpnDstSize)
{
    const HRESULT hrConvert = DoConvertINetString(lpdwMode, TRUE, CP_UCS_2, _dwUTFEncoding,
                                    lpSrcStr, lpnSrcSize, lpDstStr, *lpnDstSize, lpnDstSize);

    if (0 == _cvt_count)
        _nSrcSize = *lpnSrcSize;

    CheckUnicodeDataType(_dwUnicodeEncoding, lpDstStr, *lpnDstSize);

    return hrConvert;
}
|
|
|
|
// Converts UCS-2 input to UTF-7/UTF-8 (_dwUTFEncoding) through
// DoConvertINetString.  Big-endian input on the first conversion step is
// copied into _lpUnicodeStr first, so the byte swap is applied to the copy
// rather than to the caller's buffer.  Returns E_OUTOFMEMORY if that copy
// cannot be allocated, otherwise DoConvertINetString's result.
HRESULT CICharConverter::UnicodeToUTF78(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
    LPSTR lpDstStr, LPINT lpnDstSize)
{
    if (CP_UCS_2_BE == _dwUnicodeEncoding && 0 == _cvt_count)
    {
        _lpUnicodeStr = (LPSTR)LocalAlloc(LPTR, *lpnSrcSize);
        if (!_lpUnicodeStr)
            return E_OUTOFMEMORY;

        MoveMemory(_lpUnicodeStr, lpSrcStr, *lpnSrcSize);
        lpSrcStr = _lpUnicodeStr;
    }

    CheckUnicodeDataType(_dwUnicodeEncoding, (LPSTR)lpSrcStr, *lpnSrcSize);

    const HRESULT hrConvert = DoConvertINetString(lpdwMode, FALSE, CP_UCS_2, _dwUTFEncoding,
                                    lpSrcStr, lpnSrcSize, lpDstStr, *lpnDstSize, lpnDstSize);

    // Remember the source size on the first conversion step only.
    if (0 == _cvt_count)
        _nSrcSize = *lpnSrcSize;

    return hrConvert;
}
|
|
|
|
// Unicode -> Windows code page: forwards to UnicodeToMultiByteEncoding with
// _dwWinCodePage as the destination and returns its result unchanged.
HRESULT CICharConverter::UnicodeToWindowsCodePage(LPCSTR lpSrcStr, LPINT lpnSrcSize,
    LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    return UnicodeToMultiByteEncoding(_dwWinCodePage, lpSrcStr, lpnSrcSize,
                                      lpDstStr, lpnDstSize, dwFlag, lpFallBack);
}
|
|
|
|
// Unicode -> Internet encoding: forwards to UnicodeToMultiByteEncoding with
// _dwInternetEncoding as the destination and returns its result unchanged.
HRESULT CICharConverter::UnicodeToInternetEncoding(LPCSTR lpSrcStr, LPINT lpnSrcSize,
    LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    return UnicodeToMultiByteEncoding(_dwInternetEncoding, lpSrcStr, lpnSrcSize,
                                      lpDstStr, lpnDstSize, dwFlag, lpFallBack);
}
|
|
|
|
// Converts text in _dwInternetEncoding to UCS-2 with MultiByteToWideChar.
//
//   lpSrcStr, lpnSrcSize   Source bytes and their count.
//   lpDstStr, lpnDstSize   Output buffer and, on entry, its size in bytes; on
//                          return *lpnDstSize is the number of bytes produced
//                          (WCHAR count * sizeof(WCHAR)).
//
// On the first conversion step the input may end in the middle of a multibyte
// character.  Up to MAX_CHAR_SIZE - 1 trailing bytes are trimmed until the
// rest validates, and the number of bytes actually consumed is stored in
// _nSrcSize.  If no trimmed length validates, the problem is not a split
// character at the end and the whole input is converted as is.
//
// Returns S_OK on success and E_FAIL when nothing could be converted, except
// that UTF-8 input rejected with ERROR_NO_UNICODE_TRANSLATION is reported as
// S_OK (invalid characters are skipped).
HRESULT CICharConverter::InternetEncodingToUnicode(LPCSTR lpSrcStr, LPINT lpnSrcSize,
    LPSTR lpDstStr, LPINT lpnDstSize)
{
    int cb = *lpnSrcSize;

    if ( !_cvt_count)
    {
        // If we have a multibyte character encoding, we are at risk of splitting
        // some characters at the read boundary.  We must make sure we have a
        // discrete number of characters first.
        UINT uMax = MAX_CHAR_SIZE ;
        int  cch  = 0;

        cb++; // pre-increment
        do
        {
            cch = MultiByteToWideChar( MAPUSERDEF(_dwInternetEncoding),
                                       MB_ERR_INVALID_CHARS,
                                       lpSrcStr, --cb,
                                       NULL, 0 );
            --uMax;
        } while (!cch && uMax && cb);

        // Decide from the probe result itself.  Comparing cb against
        // size - MAX_CHAR_SIZE + 1 cannot tell "every attempt failed" from
        // "succeeded after trimming three bytes", which land on the same cb.
        if (!cch)
            cb = *lpnSrcSize ;  // restore original value
    }

    *lpnDstSize = MultiByteToWideChar( MAPUSERDEF(_dwInternetEncoding), 0,
                                       lpSrcStr, cb,
                                       (LPWSTR)lpDstStr, *lpnDstSize/sizeof(WCHAR) );
    *lpnDstSize = *lpnDstSize * sizeof(WCHAR);

    if ( !_cvt_count )  // save SrcSize if it is the first time conversion
        _nSrcSize = cb ;

    CheckUnicodeDataType(_dwUnicodeEncoding, lpDstStr, *lpnDstSize);

    if (*lpnDstSize==0 && (cb || cb != *lpnSrcSize))
    {
        // GetLastError() for MultiByteToWideChar()
        // Skip invalid characters for UTF8 conversion
        if (CP_UTF_8 == MAPUSERDEF(_dwInternetEncoding) &&
            ERROR_NO_UNICODE_TRANSLATION == GetLastError())
            return S_OK;

        return E_FAIL ;
    }

    return S_OK ;
}
|
|
|
|
// Converts text in _dwWinCodePage to UCS-2 with MultiByteToWideChar.
//
//   lpSrcStr, lpnSrcSize   Source bytes and their count.
//   lpDstStr, lpnDstSize   Output buffer and, on entry, its size in bytes; on
//                          return *lpnDstSize is the number of bytes produced
//                          (WCHAR count * sizeof(WCHAR)).
//
// On the first conversion step (and for inputs longer than one byte) a
// character split at the end of the input is held back:
//   - DBCS code pages: a trailing lead byte is dropped when doing so removes
//     exactly one output character.
//   - other code pages: up to MAX_CHAR_SIZE - 1 trailing bytes are trimmed
//     until the rest validates; if no trimmed length validates, the whole
//     input is converted as is.
// The number of bytes actually consumed is stored in _nSrcSize.
//
// Returns S_OK on success, E_FAIL when nothing could be converted.
HRESULT CICharConverter::WindowsCodePageToUnicode(LPCSTR lpSrcStr, LPINT lpnSrcSize,
    LPSTR lpDstStr, LPINT lpnDstSize)
{
    int cb = *lpnSrcSize;

    if ( !_cvt_count && cb > 1 )
    {
        if (IS_DBCSCODEPAGE(MAPUSERDEF(_dwWinCodePage)))
        {
            // Detect DBCS dangling character
            if (!MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage),
                                      MB_ERR_INVALID_CHARS,
                                      lpSrcStr, cb,
                                      NULL, 0 ))
            {
                if (IsDBCSLeadByteEx(MAPUSERDEF(_dwWinCodePage), lpSrcStr[cb-1]))
                {
                    int cchAll = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage),
                                                      0,
                                                      lpSrcStr, cb,
                                                      NULL, 0 );

                    int cchTrimmed = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage),
                                                          0,
                                                          lpSrcStr, cb - 1,
                                                          NULL, 0 );

                    // Hold the last byte back only if it accounts for exactly one
                    // output character, i.e. it really is a dangling lead byte.
                    if (cchAll == cchTrimmed + 1)
                        --cb;
                }
            }
        }
        else
        {
            // If we have a multibyte character encoding, we are at risk of splitting
            // some characters at the read boundary.  We must make sure we have a
            // discrete number of characters first.
            UINT uMax = MAX_CHAR_SIZE ;
            int  cch  = 0;

            cb++; // pre-increment
            do
            {
                cch = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage),
                                           MB_ERR_INVALID_CHARS,
                                           lpSrcStr, --cb,
                                           NULL, 0 );
                --uMax;
            } while (!cch && uMax && cb);

            // Decide from the probe result itself.  Comparing cb against
            // size - MAX_CHAR_SIZE + 1 cannot tell "every attempt failed" from
            // "succeeded after trimming three bytes", which land on the same cb.
            if (!cch)
                cb = *lpnSrcSize ;  // restore original value
        }
    }

    *lpnDstSize = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage), 0,
                                       lpSrcStr, cb,
                                       (LPWSTR)lpDstStr, *lpnDstSize/sizeof(WCHAR) );
    *lpnDstSize = *lpnDstSize * sizeof(WCHAR);

    if ( !_cvt_count )  // save SrcSize if it is the first time conversion
        _nSrcSize = cb ;

    CheckUnicodeDataType(_dwUnicodeEncoding, lpDstStr, *lpnDstSize);

    // Whistler Bug#360429,
    // Web page could have a splitting DBCS character at the very end of the page,
    // To work around it, we allow one byte of dangling DBCS character.
    // NOTE(review): cb can only be 0 here when *lpnSrcSize is 0 as well, so the
    // second half of this condition looks unreachable -- confirm the intent.
    if (*lpnDstSize==0 && (cb || (cb != *lpnSrcSize && ++cb != *lpnSrcSize)))
        return E_FAIL ;

    return S_OK ;
}
|
|
|
|
// Converts from the Windows code page to the Internet encoding.  When bit 4
// (0x10) of _dwConvertType is set the request is handed to
// WindowsCodePageToInternetEncodingWrap, which converts by way of Unicode;
// otherwise DoConvertINetString performs the conversion and, on the first
// conversion step, the source size is remembered in _nSrcSize.
HRESULT CICharConverter::WindowsCodePageToInternetEncoding(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
    LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    if (_dwConvertType & 0x10)
        return WindowsCodePageToInternetEncodingWrap(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);

    const HRESULT hrDirect = DoConvertINetString(lpdwMode, FALSE, _dwWinCodePage, _dwInternetEncoding,
                                    lpSrcStr, lpnSrcSize, lpDstStr, *lpnDstSize, lpnDstSize);

    if (0 == _cvt_count)
        _nSrcSize = *lpnSrcSize;

    return hrDirect;
}
|
|
|
|
// Converts from the Internet encoding to the Windows code page.  When bit 4
// (0x10) of _dwConvertType is set the request is handed to
// InternetEncodingToWindowsCodePageWrap, which converts by way of Unicode;
// otherwise DoConvertINetString performs the conversion and, on the first
// conversion step, the source size is remembered in _nSrcSize.
HRESULT CICharConverter::InternetEncodingToWindowsCodePage(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
    LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    if (_dwConvertType & 0x10)
        return InternetEncodingToWindowsCodePageWrap(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);

    const HRESULT hrDirect = DoConvertINetString(lpdwMode, TRUE, _dwWinCodePage, _dwInternetEncoding,
                                    lpSrcStr, lpnSrcSize, lpDstStr, *lpnDstSize, lpnDstSize);

    if (0 == _cvt_count)
        _nSrcSize = *lpnSrcSize;

    return hrDirect;
}
|
|
|
|
// Converts from the Windows code page to the Internet encoding by way of
// Unicode: the source is expanded into the intermediate buffer _lpInterm1Str
// with MultiByteToWideChar and then handed to UnicodeToMultiByteEncoding.
//
// On the first conversion step up to MAX_CHAR_SIZE - 1 trailing bytes are
// trimmed until the rest validates, so a character split at the end of the
// input is held back; the number of bytes consumed is stored in _nSrcSize.
// If no trimmed length validates, the whole input is converted as is.
//
// Returns the result of UnicodeToMultiByteEncoding (S_OK, S_FALSE or a
// failure code), or E_FAIL when the intermediate buffer cannot be allocated
// or when no output was produced for non-empty input.
HRESULT CICharConverter::WindowsCodePageToInternetEncodingWrap(LPCSTR lpSrcStr, LPINT lpnSrcSize,
    LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    int nBuffSize = 0 ;             // WCHAR capacity of the intermediate buffer
    int cb = *lpnSrcSize;           // source bytes that will be consumed
    HRESULT hr = S_OK;

    if ( !_cvt_count )
    {
        UINT uMax = MAX_CHAR_SIZE ;

        cb++; // pre-increment
        do
        {
            nBuffSize = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage),
                                             MB_ERR_INVALID_CHARS,
                                             lpSrcStr, --cb,
                                             NULL, 0 );
            --uMax;
        } while (!nBuffSize && uMax && cb);
    }

    if (!nBuffSize)
    {
        // Probing failed (or was skipped): the problem is not a split character
        // at the end, so convert every byte.  One WCHAR per source byte is an
        // upper bound for the output.  When probing succeeded, cb and nBuffSize
        // already describe the same validated prefix and are left alone.
        cb = *lpnSrcSize ;
        nBuffSize = cb ;
    }

    _lpInterm1Str = (LPSTR) LocalAlloc(LPTR, (nBuffSize * sizeof(WCHAR)));
    if ( !_lpInterm1Str )
        return E_FAIL;

    nBuffSize = MultiByteToWideChar(MAPUSERDEF(_dwWinCodePage), 0,
                    lpSrcStr, cb, (LPWSTR)_lpInterm1Str, nBuffSize );

    int iSrcSizeTmp = nBuffSize * sizeof(WCHAR);
    hr = UnicodeToMultiByteEncoding(MAPUSERDEF(_dwInternetEncoding), (LPCSTR)_lpInterm1Str, &iSrcSizeTmp,
                                    lpDstStr, lpnDstSize, dwFlag, lpFallBack);

    if ( !_cvt_count )  // save SrcSize if it is the first time conversion
        _nSrcSize = cb ;

    // Input was supplied but nothing came out: report it as a failure.
    if (hr == S_OK && *lpnDstSize == 0 && cb)
        hr = E_FAIL ;

    return hr;
}
|
|
|
|
// Converts from the Internet encoding to the Windows code page by way of
// Unicode: the source is expanded into the intermediate buffer _lpInterm1Str
// with MultiByteToWideChar and then handed to UnicodeToMultiByteEncoding.
//
// On the first conversion step up to MAX_CHAR_SIZE - 1 trailing bytes are
// trimmed until the rest validates, so a character split at the end of the
// input is held back; the number of bytes consumed is stored in _nSrcSize.
// If no trimmed length validates, the whole input is converted as is.
//
// Returns the result of UnicodeToMultiByteEncoding (S_OK, S_FALSE or a
// failure code), or E_FAIL when the intermediate buffer cannot be allocated
// or when no output was produced for non-empty input.
HRESULT CICharConverter::InternetEncodingToWindowsCodePageWrap(LPCSTR lpSrcStr, LPINT lpnSrcSize,
    LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    int nBuffSize = 0 ;             // WCHAR capacity of the intermediate buffer
    int cb = *lpnSrcSize;           // source bytes that will be consumed
    HRESULT hr = S_OK;

    if ( !_cvt_count )
    {
        UINT uMax = MAX_CHAR_SIZE ;

        cb++; // pre-increment
        do
        {
            nBuffSize = MultiByteToWideChar( MAPUSERDEF(_dwInternetEncoding),
                                             MB_ERR_INVALID_CHARS,
                                             lpSrcStr, --cb,
                                             NULL, 0 );
            --uMax;
        } while (!nBuffSize && uMax && cb);
    }

    if (!nBuffSize)
    {
        // Probing failed (or was skipped): the problem is not a split character
        // at the end, so convert every byte.  One WCHAR per source byte is an
        // upper bound for the output.  When probing succeeded, cb and nBuffSize
        // already describe the same validated prefix and are left alone.
        cb = *lpnSrcSize ;
        nBuffSize = cb ;
    }

    _lpInterm1Str = (LPSTR) LocalAlloc(LPTR, nBuffSize * sizeof (WCHAR) );
    if ( !_lpInterm1Str )
        return E_FAIL;

    nBuffSize = MultiByteToWideChar( MAPUSERDEF(_dwInternetEncoding), 0,
                    lpSrcStr, cb, (LPWSTR)_lpInterm1Str, nBuffSize );

    int iSrcSizeTmp = nBuffSize * sizeof(WCHAR);
    hr = UnicodeToMultiByteEncoding(MAPUSERDEF(_dwWinCodePage), (LPCSTR)_lpInterm1Str, &iSrcSizeTmp,
                                    lpDstStr, lpnDstSize, dwFlag, lpFallBack);

    if ( !_cvt_count )  // save SrcSize if it is the first time conversion
        _nSrcSize = cb ;

    // Input was supplied but nothing came out: report it as a failure.
    if (hr == S_OK && *lpnDstSize == 0 && cb)
        hr = E_FAIL ;

    return hr;
}
|
|
|
|
// Runs the conversion chain in the direction
//     Internet Encoding -> Windows CodePage -> Unicode -> UTF-7/8
// executing only the stages selected by _dwConvertType.
//
// A stage that is followed by another stage is called twice: once with a
// NULL destination to obtain the required size, then again into a freshly
// allocated intermediate buffer (_lpInterm1Str or _lpInterm2Str).  After such
// a stage lpSrcStr / *lpnSrcSize are redirected to that buffer, so the
// caller's *lpnSrcSize is overwritten.
//
// Returns E_FAIL when an intermediate buffer cannot be allocated.  Otherwise
// returns the HRESULT of the last stage, or - when the last stage returned
// S_OK - the most recent non-S_OK result of an earlier stage.
HRESULT CICharConverter::ConvertIWUU(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
    LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    int nBuffSize = 0 ;
    HRESULT hr = S_OK ;
    HRESULT hrWarnings = S_OK ;

    // Stage 1: InternetEncodingToWindowsCodePage (odd convert types below 21
    // start from an Internet Encoding).
    if ( _dwConvertType % 2 && _dwConvertType < 21 )
    {
        if ( _dwConvertType == 5 || _dwConvertType == 9 )
        {
            // More stages follow: size first, then convert into _lpInterm1Str.
            hr = InternetEncodingToWindowsCodePage(lpdwMode, lpSrcStr, lpnSrcSize, NULL, &nBuffSize, dwFlag, lpFallBack);
            _lpInterm1Str = (LPSTR) LocalAlloc(LPTR, nBuffSize);
            if ( !_lpInterm1Str )
                return E_FAIL ;

            hr = InternetEncodingToWindowsCodePage(lpdwMode, lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize, dwFlag, lpFallBack);
            lpSrcStr = _lpInterm1Str ;
            *lpnSrcSize = nBuffSize ;
        }
        else
        {
            hr = InternetEncodingToWindowsCodePage(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
        }
        _cvt_count ++ ;
    }

    if ( hr != S_OK )
        hrWarnings = hr ;

    // Stage 2: InternetEncodingToUnicode (types 21 / 25) or
    // WindowsCodePageToUnicode (types 4..10).
    if ( _dwConvertType == 21 || _dwConvertType == 25 )
    {
        if ( _dwConvertType == 21 )
        {
            hr = InternetEncodingToUnicode(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize);
        }
        else // _dwConvertType == 25
        {
            hr = InternetEncodingToUnicode(lpSrcStr, lpnSrcSize, NULL, &nBuffSize);
            _lpInterm1Str = (LPSTR) LocalAlloc(LPTR, nBuffSize);
            if ( !_lpInterm1Str )
                return E_FAIL ;

            hr = InternetEncodingToUnicode(lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize);
            lpSrcStr = _lpInterm1Str ;
            *lpnSrcSize = nBuffSize ;
        }
        _cvt_count ++ ;
    }
    else if ( _dwConvertType >= 4 && _dwConvertType <= 10 )
    {
        if ( _dwConvertType > 8 )
        {
            // A UTF stage follows, so the Unicode text goes to an intermediate
            // buffer.  _lpInterm1Str is already taken when stage 1 ran.
            nBuffSize = 0 ;
            hr = WindowsCodePageToUnicode(lpSrcStr, lpnSrcSize, NULL, &nBuffSize);
            if ( _cvt_count )
            {
                _lpInterm2Str = (LPSTR) LocalAlloc(LPTR, nBuffSize);
                if ( !_lpInterm2Str )
                    return E_FAIL ;

                hr = WindowsCodePageToUnicode(lpSrcStr, lpnSrcSize, _lpInterm2Str, &nBuffSize);
                lpSrcStr = _lpInterm2Str ;
                *lpnSrcSize = nBuffSize ;
            }
            else
            {
                _lpInterm1Str = (LPSTR) LocalAlloc(LPTR, nBuffSize);
                if ( !_lpInterm1Str )
                    return E_FAIL ;

                hr = WindowsCodePageToUnicode(lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize);
                lpSrcStr = _lpInterm1Str ;
                *lpnSrcSize = nBuffSize ;
            }
        }
        else
        {
            hr = WindowsCodePageToUnicode(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize);
        }
        _cvt_count ++ ;
    }

    if ( hr != S_OK )
        hrWarnings = hr ;

    // Stage 3: UnicodeToUTF78
    if ( _dwConvertType & 0x08 )
    {
#ifndef UNIX
        hr = UnicodeToUTF78(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize);
#else
        /* we now hack the lpSrcStr to be the same as 2 byte Unicode so mlang
         * lowlevel code can work right.
         */
        LPWSTR lpwSrcStr = (LPWSTR)lpSrcStr;
        INT tmpSize = *lpnSrcSize/sizeof(WCHAR);
        UCHAR *pTmp = new UCHAR[(tmpSize+1)*2];
        if (pTmp)
        {
            // The index is declared outside the loop because it is still
            // needed afterwards to write the terminator.
            int i;
            for (i = 0; i < tmpSize; i++)
            {
                pTmp[i*2]   = (UCHAR)*lpwSrcStr++;
                pTmp[i*2+1] = 0x00;
            }
            pTmp[i*2] = pTmp[i*2+1] = 0x00;
            tmpSize *= 2;
            hr = UnicodeToUTF78(lpdwMode, (LPCSTR)pTmp, &tmpSize, lpDstStr, lpnDstSize);
        }
        else
        {
            hr = E_FAIL;
        }
        delete [] pTmp;
#endif /* UNIX */
    }

    return ( hr == S_OK ? hrWarnings : hr ) ;
}
|
|
|
|
// Runs the conversion chain in the direction
//     UTF-7/8 -> Unicode -> Windows CodePage -> Internet Encoding
// executing only the stages selected by _dwConvertType.
//
// A stage that feeds a later stage is first called with a NULL destination
// to obtain the required size and then again into a newly allocated
// intermediate buffer; lpSrcStr / *lpnSrcSize are then redirected to that
// buffer, so the caller's *lpnSrcSize is overwritten.
//
// Returns E_FAIL when an intermediate buffer cannot be allocated.  Otherwise
// returns the last stage's HRESULT, or - when that is S_OK - the most recent
// non-S_OK result of an earlier stage.
HRESULT CICharConverter::ConvertUUWI(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
    LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    int     cbInterm  = 0;
    HRESULT hrStage   = S_OK;
    HRESULT hrPending = S_OK;

    // Stage 1: UTF78ToUnicode
    if (_dwConvertType & 0x08)
    {
        if (_dwConvertType == 12)
        {
            // UTF78 -> Unicode is the whole job: write straight to the caller.
            hrStage = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize);
        }
        else
        {
            // Types 10 and 9: further stages follow, use an intermediate buffer.
            hrStage = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, NULL, &cbInterm);

            _lpInterm1Str = (LPSTR)LocalAlloc(LPTR, cbInterm);
            if (!_lpInterm1Str)
                return E_FAIL;

            hrStage = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, _lpInterm1Str, &cbInterm);
            lpSrcStr    = _lpInterm1Str;
            *lpnSrcSize = cbInterm;
        }
        _cvt_count++;
    }

    if (hrStage != S_OK)
        hrPending = hrStage;

    // Stage 2: UnicodeToInternetEncoding (types 21 / 25) or
    // UnicodeToWindowsCodePage (types 4..10).
    if (_dwConvertType == 21 || _dwConvertType == 25)
    {
        hrStage = UnicodeToInternetEncoding(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
        _cvt_count++;
    }
    else if (_dwConvertType >= 4 && _dwConvertType <= 10)
    {
        if (_dwConvertType % 2)
        {
            // An Internet Encoding stage follows: convert into an intermediate
            // buffer.  The second buffer is used when stage 1 already ran.
            cbInterm = 0;
            hrStage = UnicodeToWindowsCodePage(lpSrcStr, lpnSrcSize, NULL, &cbInterm, dwFlag, lpFallBack);

            LPSTR &rpInterm = _cvt_count ? _lpInterm2Str : _lpInterm1Str;
            rpInterm = (LPSTR)LocalAlloc(LPTR, cbInterm);
            if (!rpInterm)
                return E_FAIL;

            hrStage = UnicodeToWindowsCodePage(lpSrcStr, lpnSrcSize, rpInterm, &cbInterm, dwFlag, lpFallBack);
            lpSrcStr    = rpInterm;
            *lpnSrcSize = cbInterm;
        }
        else
        {
            hrStage = UnicodeToWindowsCodePage(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
        }
        _cvt_count++;
    }

    if (hrStage != S_OK)
        hrPending = hrStage;

    // Stage 3: WindowsCodePageToInternetEncoding
    if (_dwConvertType % 2 && _dwConvertType < 21)
        hrStage = WindowsCodePageToInternetEncoding(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);

    return (hrStage == S_OK) ? hrPending : hrStage;
}
|
|
|
|
#if 0
// Disabled code: a small cache of IsValidCodePage() results.

struct CODEPAGEINFO
{
    UINT        uCodePage ;
    CP_STATE    nCP_State ;     // whether this is a valid windows codepage ?
};

// ValidCodepageInfo is used to cache whether a codepage is a valid codepage.
// It uses a circular-FIFO cache algorithm.
#define MAX_CP_CACHE    32
static int cp_cache_count = 0 ;     // number of slots filled so far
static int cp_cache_ptr = 0 ;       // next slot to overwrite
static struct CODEPAGEINFO ValidCodepageInfo[MAX_CP_CACHE];

// Returns TRUE when uCodePage is 50000 (user defined) or IsValidCodePage()
// reports it as valid.  Results are remembered in ValidCodepageInfo; once
// all MAX_CP_CACHE slots are used the oldest entry is overwritten.
BOOL CheckIsValidCodePage (UINT uCodePage)
{
    if ( uCodePage == 50000 ) // User defined
        return TRUE ;

    int i ;
    BOOL bRet ;

    for ( i = 0 ; i < cp_cache_count ; i++ )
    {
        if ( uCodePage == ValidCodepageInfo[i].uCodePage )
        {
            if ( ValidCodepageInfo[i].nCP_State == VALID_CP )
                return TRUE ;
            else
                return FALSE ;
        }
    }

    // not found, call IsValidCodePage and cache the return value
    bRet = IsValidCodePage(uCodePage);

    EnterCriticalSection(&g_cs);
    ValidCodepageInfo[cp_cache_ptr].uCodePage = uCodePage ;
    if (bRet)
        ValidCodepageInfo[cp_cache_ptr].nCP_State = VALID_CP ;
    else
        ValidCodepageInfo[cp_cache_ptr].nCP_State = INVALID_CP ;
    if ( cp_cache_count < MAX_CP_CACHE )
        cp_cache_count++ ;
    // Advance the write slot with a single modification of cp_cache_ptr.
    cp_cache_ptr = ( cp_cache_ptr + 1 ) % MAX_CP_CACHE ;
    LeaveCriticalSection(&g_cs);

    return bRet ;
}
#endif
|
|
|
|
/*
|
|
Conversion Flag:
|
|
|
|
Bit 7 - Convert Direction.
|
|
|
|
Bit 4 (16) - Unicode <-> Internet Encoding
|
|
Bit 3 (8) - UTF8, UTF7
|
|
Bit 2 (4) - Unicode
|
|
Bit 1 (2) - Windows CodePage
|
|
Bit 0 (1) - Internet Encoding
|
|
|
|
12, 6, 3 (19) - one step convert
|
|
10, 5 (21) - two steps convert
|
|
9 (25) - three steps convert
|
|
|
|
*/
|
|
|
|
// Looks up dwEncoding in aEncodingInfo and returns its index, or -1 when the
// encoding is not listed.
//
// The middle entry is compared first and then only the lower or the upper
// half is scanned, which presumes the table is ordered by dwEncoding.
//
// Side effects on the matching entry:
//   - when the entry has a code page and is not yet marked VALID_CP, its
//     nCP_State is re-evaluated with IsValidCodePage (50000 = user defined is
//     always accepted); entries flagged CONV_CHK_NLS additionally require
//     IsValidCodePage(dwEncoding);
//   - when g_bUseSysUTF8 is set and dwEncoding is CP_UTF_8, bTypeUUIW is
//     switched to 0x11.
int GetWindowsEncodingIndex(DWORD dwEncoding)
{
    const int cEntries = sizeof (aEncodingInfo) / sizeof(ENCODINGINFO);
    const int iMid     = cEntries / 2;
    int       iFound   = -1;

    if (aEncodingInfo[iMid].dwEncoding == dwEncoding)
    {
        iFound = iMid;
    }
    else
    {
        const BOOL bLowerHalf = (aEncodingInfo[iMid].dwEncoding > dwEncoding);
        const int  iFirst     = bLowerHalf ? 0    : iMid + 1;
        const int  iLimit     = bLowerHalf ? iMid : cEntries;

        // No early exit: should an encoding be listed twice in the scanned
        // half, the last occurrence is the one reported.
        for (int i = iFirst; i < iLimit; i++)
        {
            if (aEncodingInfo[i].dwEncoding == dwEncoding)
                iFound = i;
        }
    }

    if (iFound < 0)
        return iFound;

    ENCODINGINFO &info = aEncodingInfo[iFound];

    if (info.nCP_State != VALID_CP && info.dwCodePage)
    {
        // 50000 means user defined
        if (info.dwCodePage == 50000 || IsValidCodePage(info.dwCodePage))
            info.nCP_State = VALID_CP;
        else
            info.nCP_State = INVALID_CP;

        if ((info.nCP_State == VALID_CP) &&
            (info.dwFlags & CONV_CHK_NLS) &&
            !IsValidCodePage(info.dwEncoding))
        {
            info.nCP_State = INVALID_CP;
        }
    }

    // Use system UTF8 conversion to work around security issues on Win2k and greater platforms.
    if (g_bUseSysUTF8 && dwEncoding == CP_UTF_8)
        info.bTypeUUIW = 0x11;

    return iFound;
}
|
|
|
|
// Prepares the converter for a source/destination encoding pair.
//
// Both encodings are classified through aEncodingInfo (bTypeUUIW flags); an
// encoding that is not listed is accepted only when
// CONVERT_IS_VALIDCODEPAGE() approves it.  On success the members
// _dwWinCodePage, _dwInternetEncoding, _dwUTFEncoding(2), _dwUnicodeEncoding,
// _dwConvertType and _bConvertDirt describe the conversion, and any existing
// _hcins object is deleted.
//
//   pdwSrcEncoding   in: source encoding (only read by this function)
//   dwDstEncoding    destination encoding
//
// Returns S_OK when the pair can be converted, S_FALSE otherwise.  Members
// assigned before a failure is detected keep their new values.
HRESULT CICharConverter::ConvertSetup(DWORD * pdwSrcEncoding, DWORD dwDstEncoding)
{
    DWORD SrcFlag = 0, DstFlag = 0 ;
    int index, unknown = 0 ;

    // IE bug 109708 - WEIWU 5/11/00
    // Always consider US-ASCII as a valid source encoding for conversion
    /*
    if (*pdwSrcEncoding == CP_20127 && !IsValidCodePage(CP_20127))
        *pdwSrcEncoding = CP_1252;
    */

    /* check source & destination encoding type */
    index = GetWindowsEncodingIndex(*pdwSrcEncoding);
    if ( index >= 0 )
    {
        SrcFlag = (DWORD) aEncodingInfo[index].bTypeUUIW ;
        if ( aEncodingInfo[index].dwCodePage )
        {
            _dwWinCodePage = (DWORD) aEncodingInfo[index].dwCodePage ;
            if ( aEncodingInfo[index].nCP_State == INVALID_CP )
                return S_FALSE ;
        }
        if ( SrcFlag & 0x08 )
            _dwUTFEncoding = *pdwSrcEncoding ;
        if ( SrcFlag & 0x01 )
            _dwInternetEncoding = *pdwSrcEncoding ;
        if ( SrcFlag & 0x04 )
            _dwUnicodeEncoding = *pdwSrcEncoding ;
    }
    else
    {
        // assume it is a unknown Window Codepage
        if ( !CONVERT_IS_VALIDCODEPAGE(*pdwSrcEncoding) )
            return S_FALSE ;

        SrcFlag = 0x02 ;
        _dwWinCodePage = *pdwSrcEncoding ;

        unknown ++ ;
    }

    index = GetWindowsEncodingIndex(dwDstEncoding);
    if ( index >= 0 )
    {
        // check if two codepages are compatible
        if ( _dwWinCodePage && aEncodingInfo[index].dwCodePage )
        {
            if ( _dwWinCodePage != (DWORD) aEncodingInfo[index].dwCodePage )
                return S_FALSE ;
        }

        DstFlag = (DWORD) aEncodingInfo[index].bTypeUUIW ;
        if ( aEncodingInfo[index].dwCodePage )
        {
            _dwWinCodePage = (DWORD) aEncodingInfo[index].dwCodePage ;
            if ( aEncodingInfo[index].nCP_State == INVALID_CP )
                return S_FALSE ;
        }
        if ( DstFlag & 0x08 )
        {
            // A UTF source already occupies _dwUTFEncoding; the destination
            // then goes to the second slot.
            if ( _dwUTFEncoding )
                _dwUTFEncoding2 = dwDstEncoding ;
            else
                _dwUTFEncoding = dwDstEncoding ;
        }
        if ( DstFlag & 0x01 )
            _dwInternetEncoding = dwDstEncoding ;
        if ( DstFlag & 0x04 )
            _dwUnicodeEncoding = dwDstEncoding ;
    }
    // 1) First time unknown, assume it is a unknown Window Codepage
    //    the conversion become UTF78 <-> Unicode <-> Window Codepage
    // 2) Second time unknown, assume it is a unknown Internet Encoding
    //    the conversion become Windows Codepage <-> Unicode <-> Internet Encoding
    else
    {
        if ( !CONVERT_IS_VALIDCODEPAGE(dwDstEncoding) )
            return S_FALSE ;

        if ( unknown == 0 )
        {
            if ( _dwWinCodePage )
            {
                if ( _dwWinCodePage != dwDstEncoding )
                    return S_FALSE ;
            }

            DstFlag = 0x02 ;
            _dwWinCodePage = dwDstEncoding ;
        }
        else
        {
            DstFlag = 0x11 ;
            _dwInternetEncoding = dwDstEncoding ;
        }
    }

    // Both sides must have been classified (logical OR, not bitwise).
    if ( !SrcFlag || !DstFlag )
        return S_FALSE ;

    // Two different encodings of the same class cannot be converted into each
    // other, except Unicode <-> Unicode (4) and UTF <-> UTF (8).
    if ( SrcFlag == DstFlag && *pdwSrcEncoding != dwDstEncoding && ( 4 != SrcFlag ) && ( 8 != SrcFlag ) )
        return S_FALSE ;

    _dwConvertType = SrcFlag | DstFlag ;

    // Direction is chosen by comparing the low four flag bits of both sides.
    _bConvertDirt = ( SrcFlag & 0x0f ) > ( DstFlag & 0x0f ) ;

    // if code convertor has been allocated, deallocate it
    if ( _hcins )
    {
        delete _hcins ;
        _hcins = NULL ;
    }

    return S_OK ;
}
|
|
|
|
|
|
// Performs the conversion that ConvertSetup() configured.
//
//   _dwConvertType == 4   CP_UCS_2 <-> CP_UCS_2_BE: copy and byte-swap
//   _dwConvertType == 8   UTF7 <-> UTF8: plain copy when both UTF encodings
//                         are equal, otherwise convert through Unicode in
//                         _lpInterm1Str
//   anything else         dispatched to ConvertUUWI or ConvertIWUU according
//                         to _bConvertDirt
//
// Returns E_OUTOFMEMORY when the UTF intermediate buffer cannot be
// allocated; otherwise S_OK or the HRESULT of the underlying converter.
HRESULT CICharConverter::DoCodeConvert(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
    LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    // CP_UCS_2 <-> CP_UCS_2_BE
    if ( 4 == _dwConvertType )
    {
        if ( !lpDstStr )
        {
            // Size query: the output is exactly as large as the input.
            _nSrcSize = *lpnDstSize = *lpnSrcSize ;
            return S_OK ;
        }

        const int cbCopy = min(*lpnDstSize, *lpnSrcSize);

        _nSrcSize = *lpnSrcSize ;
        if ( cbCopy > 0 )
        {
            MoveMemory(lpDstStr, lpSrcStr, cbCopy);
            DataByteSwap(lpDstStr, cbCopy);
            _nSrcSize   = cbCopy ;
            *lpnDstSize = cbCopy ;
        }
        return S_OK ;
    }

    // UTF7 <-> UTF8
    if ( 8 == _dwConvertType )
    {
        if ( _dwUTFEncoding == _dwUTFEncoding2 )
        {
            _nSrcSize = *lpnDstSize = min(*lpnDstSize, *lpnSrcSize);
            if ( *lpnDstSize > 0 )
                MoveMemory(lpDstStr, lpSrcStr, *lpnDstSize);
            return S_OK ;
        }

        HRESULT hrUtf ;
        int cbUnicode = 0 ;

        // Always succeeds
        hrUtf = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, NULL, &cbUnicode);

        if ( _lpInterm1Str )
            LocalFree(_lpInterm1Str);

        _lpInterm1Str = (LPSTR)LocalAlloc(LPTR, cbUnicode);
        if ( !_lpInterm1Str )
            return E_OUTOFMEMORY ;

        const DWORD dwSavedEncoding = _dwUTFEncoding ;

        hrUtf = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, _lpInterm1Str, &cbUnicode);

        // Switch to the destination UTF flavour for the second half; keep
        // the source byte count recorded by the first half.
        _dwUTFEncoding = _dwUTFEncoding2 ;
        const int cbSavedSrc = _nSrcSize ;

        // We don't need to create another dwMode since only UTF7 conversion needs it
        hrUtf = UnicodeToUTF78(lpdwMode, _lpInterm1Str, &cbUnicode, lpDstStr, lpnDstSize);

        _nSrcSize      = cbSavedSrc ;
        _dwUTFEncoding = dwSavedEncoding ;

        return hrUtf ;
    }

    if ( _bConvertDirt )
        return ConvertUUWI(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);

    return ConvertIWUU(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
}
|
|
|
|
// Resets the per-conversion state: clears the step counter and the consumed
// source size, and releases the three work buffers.  Always returns TRUE.
BOOL CICharConverter::ConvertCleanUp()
{
    _cvt_count = 0 ;
    _nSrcSize = 0 ;

    if ( _lpInterm1Str != NULL )
    {
        LocalFree(_lpInterm1Str);
        _lpInterm1Str = NULL ;
    }

    if ( _lpInterm2Str != NULL )
    {
        LocalFree(_lpInterm2Str);
        _lpInterm2Str = NULL ;
    }

    if ( _lpUnicodeStr != NULL )
    {
        LocalFree(_lpUnicodeStr);
        _lpUnicodeStr = NULL ;
    }

    return TRUE ;
}
|
|
|
|
// Default constructor: creates a converter with no buffers, no encodings
// selected, and no conversion flags or fallback character.
CICharConverter::CICharConverter()
{
    // work buffers and converter objects
    _lpInterm1Str = NULL ;
    _lpInterm2Str = NULL ;
    _lpUnicodeStr = NULL ;
    _hcins = NULL ;
    _hcins_dst = 0 ;

    // conversion state
    _cvt_count = 0 ;
    _nSrcSize = 0 ;

    // encodings selected later by ConvertSetup()
    _dwWinCodePage = 0;
    _dwInternetEncoding = 0;
    _dwUTFEncoding = 0;
    _dwUTFEncoding2 = 0;
    _dwUnicodeEncoding = 0;
    _dwConvertType = 0;

    // The (dwFlag, lpFallBack) constructor sets these two members; give
    // them a defined value here as well instead of leaving them uninitialized.
    _dwFlag = 0 ;
    _lpFallBack = NULL ;
}
|
|
|
|
// Creates a converter with no buffers and no encodings selected, and stores
// the caller's conversion flags and fallback pointer.
//
//   dwFlag       stored in _dwFlag
//   lpFallBack   stored in _lpFallBack (the pointer is stored, not copied)
CICharConverter::CICharConverter(DWORD dwFlag, WCHAR *lpFallBack)
{
    // caller-supplied options
    _dwFlag     = dwFlag ;
    _lpFallBack = lpFallBack ;

    // work buffers and converter objects
    _lpInterm1Str = NULL ;
    _lpInterm2Str = NULL ;
    _lpUnicodeStr = NULL ;
    _hcins        = NULL ;
    _hcins_dst    = 0 ;

    // conversion state
    _cvt_count = 0 ;
    _nSrcSize  = 0 ;

    // encodings selected later by ConvertSetup()
    _dwWinCodePage      = 0 ;
    _dwInternetEncoding = 0 ;
    _dwUTFEncoding      = 0 ;
    _dwUTFEncoding2     = 0 ;
    _dwUnicodeEncoding  = 0 ;
    _dwConvertType      = 0 ;
}
|
|
|
|
|
|
// Releases everything the converter allocated: the three work buffers and
// the _hcins converter object.
CICharConverter::~CICharConverter()
{
    // ConvertCleanUp() frees and NULLs _lpInterm1Str, _lpInterm2Str and
    // _lpUnicodeStr; it also zeroes the conversion counters, which is
    // harmless while the object is being destroyed.
    ConvertCleanUp();

    if ( _hcins != NULL )
    {
        delete _hcins ;
        _hcins = NULL ;
    }
}
|
|
|
|
// Creates a converter and immediately configures it for the given encoding
// pair through ConvertSetup().  The result of ConvertSetup() is not
// reported by the constructor.
CICharConverter::CICharConverter(DWORD dwSrcEncoding, DWORD dwDstEncoding)
{
    // work buffers and converter objects
    _lpInterm1Str = NULL ;
    _lpInterm2Str = NULL ;
    _lpUnicodeStr = NULL ;
    _hcins = NULL ;
    _hcins_dst = 0 ;

    // conversion state
    _cvt_count = 0 ;
    _nSrcSize = 0 ;

    // encodings; filled in by ConvertSetup() below
    _dwWinCodePage = 0;
    _dwInternetEncoding = 0;
    _dwUTFEncoding = 0;
    _dwUTFEncoding2 = 0;
    _dwUnicodeEncoding = 0;
    _dwConvertType = 0;

    // The (dwFlag, lpFallBack) constructor sets these two members; give
    // them a defined value here as well instead of leaving them uninitialized.
    _dwFlag = 0 ;
    _lpFallBack = NULL ;

    ConvertSetup(&dwSrcEncoding, dwDstEncoding);
}
|
|
|
|
// Reports whether a conversion from dwSrcEncoding to dwDstEncoding can be
// set up.  Returns the result of CICharConverter::ConvertSetup (S_OK when
// available, S_FALSE when not), or E_OUTOFMEMORY when the temporary
// converter cannot be allocated.
HRESULT WINAPI IsConvertINetStringAvailable(DWORD dwSrcEncoding, DWORD dwDstEncoding)
{
    CICharConverter *pConverter = new CICharConverter ;
    if ( pConverter == NULL )
        return E_OUTOFMEMORY ;

    const HRESULT hrSetup = pConverter->ConvertSetup(&dwSrcEncoding, dwDstEncoding);
    delete pConverter ;

    return hrSetup ;
}
|
|
|
|
#define DETECTION_BUFFER_NUM 3
|
|
|
|
|
|
// In the CP_AUTO case where the detection result is UTF7, the private converter might use the high word of *lpdwMode to store
// internal data, but we need that word to notify Trident of the detection result. Currently we favor returning the correct detection result.
// This is currently by design. If we get a chance to re-prototype the conversion object, we can resolve this issue.
|
|
// Converts a buffer from dwSrcEncoding to dwDstEncoding.
//
//   lpdwMode     optional in/out conversion mode.  The high word carries a
//                previously detected code page on input and, for auto-detect
//                sources, the detection result on output.  It is written
//                back only when lpDstStr is non-NULL.
//   lpSrcStr     source bytes.  NULL or an empty source returns S_OK with
//                *lpnDstSize set to 0.
//   lpnSrcSize   optional in/out source size in bytes.  NULL or -1 means the
//                source is zero terminated.  On a successful setup it
//                receives the number of source bytes processed.
//   lpDstStr     destination buffer, or NULL to query the required size.
//   lpnDstSize   optional in/out destination size in bytes.
//   dwFlag       MLCONVCHARF_* flags; passed on to the converter.
//   lpFallBack   passed on to the converter.
//
// Returns E_OUTOFMEMORY when the converter object cannot be allocated, the
// (non-S_OK) result of ConvertSetup when the encoding pair is rejected, and
// otherwise the result of the conversion.
HRESULT WINAPI ConvertINetStringEx(LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    CICharConverter * INetConvert;
    int nSrcSize;
    int nDstSize;
    DWORD dwMode = 0 ;
    // dwDetectResult
    // CP_UNDEFINED :Fail to detect
    //      0       :Not a auto-detect scenario
    // Others       :Detected encoding
    DWORD dwDetectResult = CP_UNDEFINED;
    HRESULT hr ;

    if (lpnSrcSize)
        nSrcSize = *lpnSrcSize;
    else
        nSrcSize = -1;

    // Get length of lpSrcStr if not given, assuming lpSrcStr is a zero terminated string.
    if ( lpSrcStr && nSrcSize == -1 )
    {
        // Both UCS-2 byte orders are terminated by a 16-bit zero; a byte-wise
        // strlen would stop at the first zero byte of a big-endian string.
        if ( dwSrcEncoding == CP_UCS_2 || dwSrcEncoding == CP_UCS_2_BE )
            nSrcSize = (lstrlenW((WCHAR*)lpSrcStr) << 1) ;
        else
            nSrcSize = lstrlenA(lpSrcStr) ;
    }

    // If there is nothing need to be converted, we return S_OK;
    if (!nSrcSize || !lpSrcStr)
    {
        if (lpnDstSize)
            *lpnDstSize = 0;
        return S_OK;
    }

    INetConvert = new CICharConverter(dwFlag, lpFallBack) ;

    if (!INetConvert)
        return E_OUTOFMEMORY;

    // ASSERT(CP_AUTO != dwDstEncoding);

    // if null specified at dst buffer we'll get the size of required buffer.
    if (lpDstStr && lpnDstSize)
        nDstSize = *lpnDstSize;
    else
        nDstSize = 0;

    if (lpdwMode)
        dwMode = *lpdwMode ;

    // In real world, clients uses 28591 as 1252, 28599 as 1254,
    // To correctly convert those extended characters to Unicode,
    // We internally replace it with 1252
    if (dwDstEncoding == CP_UCS_2 || dwDstEncoding == CP_UCS_2_BE)
    {
        if ((dwSrcEncoding == CP_ISO_8859_1) && _IsValidCodePage(CP_1252))
            dwSrcEncoding = CP_1252;

        if ((dwSrcEncoding == CP_ISO_8859_9) && _IsValidCodePage(CP_1254))
            dwSrcEncoding = CP_1254;
    }

    if ((dwDstEncoding == CP_1252) && (dwSrcEncoding == CP_ISO_8859_1))
    {
        dwSrcEncoding = CP_1252;
    }

    if ((dwDstEncoding == CP_1254) && (dwSrcEncoding == CP_ISO_8859_9))
    {
        dwSrcEncoding = CP_1254;
    }

    //
    // Auto Detection for Japan
    // Japanese user often tag their data incorrectly, so, if MLCONVCHARF_DETECTJPN specified,
    // we'll do extra detection for Shift-Jis and EUC
    //
    if ( dwSrcEncoding == CP_JP_AUTO ||
        ((dwFlag & MLCONVCHARF_DETECTJPN) &&
         (dwSrcEncoding == CP_JPN_SJ || dwSrcEncoding == CP_EUC_JP)))
    {
        CIncdJapanese DetectJapan(dwSrcEncoding);
        UINT uiCodePage ;

        // A code page in the high word of the mode is used as the source
        // encoding and detection is skipped.
        uiCodePage = ( dwMode >> 16 ) & 0xffff ;
        if ( uiCodePage )
        {
            dwSrcEncoding = uiCodePage ;
            dwDetectResult = 0;
        }
        else
        {
            dwSrcEncoding = DetectJapan.DetectStringA(lpSrcStr, nSrcSize);
            // if dwSrcEncoding is zero means there is an ambiguity, we don't return
            // the detected codepage to caller, instead we default its codepage internally
            // to SJIS
            if (dwSrcEncoding)
            {
                dwDetectResult = dwSrcEncoding << 16 ;
            }
            else
                dwSrcEncoding = CP_JPN_SJ;
        }
    }
    // bug #43190, we auto-detect again for euc-kr page because IMN ver 1.0
    // mislabel an ISO-KR page as a ks_c_5601-1987 page. This is the only way
    // we can fix that mistake.
    else if ( dwSrcEncoding == CP_KR_AUTO || dwSrcEncoding == CP_KOR_5601 ||
              dwSrcEncoding == CP_EUC_KR )
    {
        CIncdKorean DetectKorean;
        UINT uiCodePage ;

        uiCodePage = ( dwMode >> 16 ) & 0xffff ;
        if ( uiCodePage )
        {
            dwSrcEncoding = uiCodePage ;
            dwDetectResult = 0;
        }
        else
        {
            dwSrcEncoding = DetectKorean.DetectStringA(lpSrcStr, nSrcSize);
            if (dwSrcEncoding)
            {
                dwDetectResult = dwSrcEncoding << 16 ;
            }
            else
                dwSrcEncoding = CP_KOR_5601;
        }
    }
    else if ( dwSrcEncoding == CP_AUTO ) // General Auto Detection for all code pages
    {
        // Detection looks at no more than DETECTION_MAX_LEN bytes.
        int nDetectSize = DETECTION_MAX_LEN < nSrcSize ? DETECTION_MAX_LEN : nSrcSize;
        int nScores = DETECTION_BUFFER_NUM;
        DetectEncodingInfo Encoding[DETECTION_BUFFER_NUM];
        UINT uiCodePage ;

        uiCodePage = ( dwMode >> 16 ) & 0xffff ;
        if ( uiCodePage )
        {
            dwSrcEncoding = uiCodePage ;
            dwDetectResult = 0;
        }
        else
        {
            // Start from g_uACP; it stays in effect when detection fails.
            dwSrcEncoding = g_uACP;
            if ( S_OK == _DetectInputCodepage(MLDETECTCP_HTML, CP_AUTO, (char *)lpSrcStr, &nDetectSize, &Encoding[0], &nScores))
            {
                MIMECPINFO cpInfo;

                // A CP_20127 detection result is replaced by the default chosen above.
                if (Encoding[0].nCodePage == CP_20127)
                    Encoding[0].nCodePage = dwSrcEncoding;

                if (NULL != g_pMimeDatabase)
                {
                    if (SUCCEEDED(g_pMimeDatabase->GetCodePageInfo(Encoding[0].nCodePage, 0x409, &cpInfo)) &&
                        (cpInfo.dwFlags & MIMECONTF_VALID))
                    {
                        dwSrcEncoding = Encoding[0].nCodePage;
                        dwDetectResult = dwSrcEncoding << 16 ;
                    }
                }
            }

            // If we failed in general detection and system locale is Jpn, we try harder
            // with our Japanese detection engine
            if (dwSrcEncoding == CP_JPN_SJ && dwDetectResult == CP_UNDEFINED)
            {
                CIncdJapanese DetectJapan;
                DWORD dwSrcEncodingJpn = DetectJapan.DetectStringA(lpSrcStr, nSrcSize);
                if (dwSrcEncodingJpn)
                {
                    // We only change conversion encoding without returning this result to browser
                    // if it is in the middle of detection, this is to prevent other encodings been mis-detected as Jpn encodings.
                    dwSrcEncoding = dwSrcEncodingJpn;

                    // Set search range for end tag as 10 bytes
                    if (nSrcSize >= 10)
                    {
                        char szTmpStr[11] = {0};
                        char *lpTmpStr = szTmpStr;
                        _tcsncpy(szTmpStr, (char *)&lpSrcStr[nSrcSize-10], 10);

                        //ToLower
                        while (*lpTmpStr)
                        {
                            if (*lpTmpStr >= 'A' && *lpTmpStr <= 'Z')
                                *lpTmpStr += 0x20;
                            lpTmpStr++;
                        }

                        // If end of page, return this result
                        if (MLStrStr(szTmpStr, "</html>"))
                            dwDetectResult = dwSrcEncoding << 16 ;
                    }
                }
            }
            //aEncodingInfo[GetWindowsEncodingIndex(CP_AUTO)].dwCodePage = dwSrcEncoding;
        }
    }
    else
    {
        // Not a auto-detect scenario
        dwDetectResult = 0;
    }

    if ( S_OK == ( hr = INetConvert->ConvertSetup(&dwSrcEncoding,dwDstEncoding )))
    {
        if ( dwSrcEncoding != dwDstEncoding )
        {
            // if high word of dwMode is CP_UTF_7, it must be detection result, don't pass it to UTF7 converter
            if ( dwSrcEncoding == CP_UTF_7 && (dwMode >> 16) == CP_UTF_7)
                dwMode &= 0xFFFF;
            // ASSERT(!((IS_ENCODED_ENCODING(dwSrcEncoding) || IS_ENCODED_ENCODING(dwDstEncoding)) && (NULL == lpdwMode)));
            hr = INetConvert->DoCodeConvert(&dwMode, lpSrcStr, &nSrcSize, lpDstStr, &nDstSize, dwFlag, lpFallBack);

            // return the number of bytes processed for the source.
            if (lpnSrcSize)
                *lpnSrcSize = INetConvert->_nSrcSize ;
            INetConvert->ConvertCleanUp();
        }
        else
        {
            // Source and destination encodings are identical: plain copy.
            int nSize, i ;
            hr = S_OK ;
            BOOL bLeadByte = FALSE ;

            // only check for windows codepage
            if ( INetConvert->_dwConvertType == 02 && lpSrcStr )
            {
                // Do not split a double-byte character: drop a dangling
                // lead byte at the end of the source.
                for ( i=0; i<nSrcSize; i++)
                {
                    if (bLeadByte)
                        bLeadByte = FALSE ;
                    else if (IsDBCSLeadByteEx(dwSrcEncoding,lpSrcStr[i]))
                        bLeadByte = TRUE ;
                }
                if (bLeadByte)
                    nSrcSize-- ;
            }
            // set input size
            if (lpnSrcSize)
                *lpnSrcSize = nSrcSize ;
            // set output size and copy if we need to.
            // nDstSize holds *lpnDstSize when both lpDstStr and lpnDstSize
            // were supplied and 0 otherwise, so lpnDstSize is never
            // dereferenced while NULL.
            if (lpDstStr && nDstSize)
            {
                nSize = min(nDstSize,nSrcSize);
                MoveMemory(lpDstStr, lpSrcStr, nSize);
                nDstSize = nSize ;
            }
            else
                nDstSize = nSrcSize ;
        }
    }
    else
        nDstSize = 0 ;

    delete INetConvert;

    // return the number of bytes copied for the destination,
    if (lpnDstSize)
        *lpnDstSize = nDstSize;

    if (lpdwMode && lpDstStr)
    {
        if (dwDetectResult) // CP_AUTO conversion
        {
            dwMode &= 0xFFFF;   // Clear HIGHWORD in case private converter set it
            // If we have detection result, return it in HIGHWORD
            // in the case of UTF7 conversion, private converter might use high word to store internal data,
            // this will conflict with our logic of returning detection result in high word, it is a design flaw,
            // currently, we ignore conversion setting and give detection result more priority
            if (dwDetectResult != CP_UNDEFINED)
                dwMode |= dwDetectResult;
        }
        *lpdwMode = dwMode ;
    }

    return hr ;
}
|
|
|
|
// We already published this API, keep it for backward compatibility
|
|
// Exported no-op: does nothing and unconditionally reports success.
HRESULT WINAPI ConvertINetReset(void)
{
    const HRESULT hrAlwaysOk = S_OK ;
    return hrAlwaysOk ;
}
|
|
|
|
// Converts a multi-byte buffer in dwEncoding to Unicode (CP_UCS_2) by calling
// ConvertINetStringEx.
//
//   lpnMultiCharCount   in/out source size in bytes; passed through unchanged
//   lpDstStr            destination buffer, or NULL to query the size
//   lpnWideCharCount    optional in/out destination size in WCHARs; it is
//                       converted to a byte count for the call and back
//                       to a WCHAR count afterwards
//
// Returns the HRESULT of ConvertINetStringEx (E_FAIL can additionally be
// produced by the UNIX-only widening step).
HRESULT WINAPI ConvertINetMultiByteToUnicodeEx(LPDWORD lpdwMode, DWORD dwEncoding, LPCSTR lpSrcStr, LPINT lpnMultiCharCount, LPWSTR lpDstStr, LPINT lpnWideCharCount, DWORD dwFlag, WCHAR *lpFallBack)
{
    HRESULT hr ;
    int nByteCountSize = 0;

    if (lpnWideCharCount)
    {
        nByteCountSize = *lpnWideCharCount * sizeof(WCHAR);
    }

#ifdef UNIX
    int saved_nByteCountSize = nByteCountSize;
#endif /* UNIX */

    hr = ConvertINetStringEx(lpdwMode,dwEncoding, CP_UCS_2, lpSrcStr, lpnMultiCharCount, (LPSTR)lpDstStr, &nByteCountSize, dwFlag, lpFallBack) ;

#ifdef UNIX
    if (dwEncoding == 1200 || dwEncoding == 65000 || dwEncoding == 65001 ||
        (dwEncoding == 50001 && !_IsValidCodePage(dwEncoding)) )
    {
        /*
         * On unix we need to convert the little endian mode 2 byte unicode
         * format to unix mode 4 byte wChars.
         */
        if (lpDstStr && (saved_nByteCountSize < (nByteCountSize/2)*sizeof(WCHAR)))
            hr = E_FAIL;
        else
        {
            /*
             * Use a temporary array to do the 2byte -> 4byte conversion
             */
            LPSTR pTmp = (LPSTR) lpDstStr;
            LPWSTR pw4 = NULL;

            if (pTmp) /* allocate only if we have a lpDstStr */
                pw4 = new WCHAR[nByteCountSize/2];
            if (pw4)
            {
                int i = 0;
                LPWSTR pw4Tmp = pw4;
                for (; i < nByteCountSize/2; i++)
                    *pw4Tmp++ = (UCHAR)pTmp[i*2];
                pw4Tmp = pw4;
                for (i = 0; i < nByteCountSize/2; i++)
                    *lpDstStr++ = *pw4Tmp++;
            }
            if (!pw4 && pTmp) /* if lpDstStr and allocate fails bail out */
                hr = E_FAIL;
            delete [] pw4;
        }
        nByteCountSize *= 2; // Expand twice as we have 4 byte wchars.
    }
#endif

    // lpnWideCharCount is optional (see the check above), so it is only
    // written when supplied.
    if (lpnWideCharCount)
        *lpnWideCharCount = nByteCountSize / sizeof(WCHAR);

    return hr ;
}
|
|
|
|
|
|
// Converts a Unicode (CP_UCS_2) buffer to the multi-byte encoding dwEncoding
// by calling ConvertINetStringEx.
//
//   lpnWideCharCount    optional in/out source size in WCHARs.  NULL or -1
//                       is passed on as a byte count of -1 (zero-terminated
//                       source).  On return it receives the byte count
//                       reported by ConvertINetStringEx, converted back to
//                       WCHARs.
//   lpDstStr, lpnMultiCharCount   destination buffer and its size in bytes;
//                       passed through unchanged
//
// Returns the HRESULT of ConvertINetStringEx.
HRESULT WINAPI ConvertINetUnicodeToMultiByteEx(LPDWORD lpdwMode, DWORD dwEncoding, LPCWSTR lpSrcStr, LPINT lpnWideCharCount, LPSTR lpDstStr, LPINT lpnMultiCharCount, DWORD dwFlag, WCHAR *lpFallBack)
{
    HRESULT hr ;
    int nByteCountSize=-1;

    if (lpnWideCharCount && *lpnWideCharCount != -1)
        nByteCountSize = *lpnWideCharCount * sizeof(WCHAR);

    hr = ConvertINetStringEx(lpdwMode,CP_UCS_2, dwEncoding, (LPCSTR) lpSrcStr, &nByteCountSize, lpDstStr, lpnMultiCharCount, dwFlag, lpFallBack);

#ifdef UNIX
    if (dwEncoding == 1200 || dwEncoding == 65000 || dwEncoding == 65001) {
        nByteCountSize *= 2; // Expand twice as we have 4 byte wchars.
    }
#endif /* UNIX */

    // Divide as signed integers.  nByteCountSize can still be -1 here (the
    // callee does not update it on its early-return and failed-setup paths),
    // and dividing by the unsigned sizeof would turn that into a huge
    // positive count.
    if (lpnWideCharCount)
        *lpnWideCharCount = nByteCountSize / (int)sizeof(WCHAR);

    return hr ;
}
|
|
|
|
// Convenience form of ConvertINetStringEx: same conversion with no flags
// (dwFlag = 0) and no fallback character (lpFallBack = NULL).
HRESULT WINAPI ConvertINetString(LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize)
{
    return ConvertINetStringEx(lpdwMode, dwSrcEncoding, dwDstEncoding,
                               lpSrcStr, lpnSrcSize,
                               lpDstStr, lpnDstSize,
                               0, NULL);
}
|
|
|
|
// Convenience form of ConvertINetUnicodeToMultiByteEx with no fallback
// character.  When lpdwMode is supplied and bit 0x00008000 of *lpdwMode is
// set, the conversion is requested with MLCONVCHARF_ENTITIZE; otherwise no
// flags are passed.
HRESULT WINAPI ConvertINetUnicodeToMultiByte(LPDWORD lpdwMode, DWORD dwEncoding, LPCWSTR lpSrcStr, LPINT lpnWideCharCount, LPSTR lpDstStr, LPINT lpnMultiCharCount)
{
    DWORD dwConvFlags = 0 ;

    if ( lpdwMode != NULL && ( *lpdwMode & 0x00008000 ) )
        dwConvFlags |= MLCONVCHARF_ENTITIZE ;

    return ConvertINetUnicodeToMultiByteEx(lpdwMode, dwEncoding,
                                           lpSrcStr, lpnWideCharCount,
                                           lpDstStr, lpnMultiCharCount,
                                           dwConvFlags, NULL);
}
|
|
|
|
// Convenience form of ConvertINetMultiByteToUnicodeEx: same conversion with
// no flags (dwFlag = 0) and no fallback character (lpFallBack = NULL).
HRESULT WINAPI ConvertINetMultiByteToUnicode(LPDWORD lpdwMode, DWORD dwEncoding, LPCSTR lpSrcStr, LPINT lpnMultiCharCount, LPWSTR lpDstStr, LPINT lpnWideCharCount)
{
    return ConvertINetMultiByteToUnicodeEx(lpdwMode, dwEncoding,
                                           lpSrcStr, lpnMultiCharCount,
                                           lpDstStr, lpnWideCharCount,
                                           0, NULL);
}
|
|
|
|
#define STR_BUFFER_SIZE 2048
|
|
|
|
//
// _ConvertINetStringInIStream
//
// Reads pstmIn in STR_BUFFER_SIZE chunks, converts each chunk from
// dwSrcEncoding to dwDstEncoding with INetConvert, and writes the result
// to pstmOut.
//
// Parameters:
//   INetConvert   - converter used for ConvertSetup / DoCodeConvert / ConvertCleanUp
//   lpdwMode      - optional in/out conversion mode. The high 16 bits cache a
//                   previously auto-detected source codepage. When NULL the
//                   conversion starts from mode 0 and nothing is written back.
//   dwSrcEncoding - source encoding; CP_JP_AUTO, CP_KR_AUTO (and CP_KOR_5601 /
//                   CP_EUC_KR) and CP_AUTO trigger detection on the first
//                   buffer of the stream.
//   dwDstEncoding - destination encoding
//   pstmIn        - input stream; detection reads the first buffer and then
//                   seeks back to offset 0
//   pstmOut       - output stream
//   dwFlag        - passed through to DoCodeConvert
//   lpFallBack    - passed through to DoCodeConvert
//
// Returns:
//   If the last recorded hr is S_OK, the most recent non-S_OK result seen
//   along the way (hrWarnings, S_OK if none); otherwise that last hr.
//   E_OUTOFMEMORY if a buffer cannot be allocated. On the out-of-memory
//   paths *lpdwMode is not updated.
//
HRESULT _ConvertINetStringInIStream(CICharConverter * INetConvert, LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, IStream *pstmIn, IStream *pstmOut, DWORD dwFlag, WCHAR *lpFallBack)
{
    // dwMode starts at 0 so that a NULL lpdwMode never leads to reading an
    // uninitialized mode below.
    DWORD   dwMode = 0, dwModeTemp ;
    HRESULT hr= S_OK, hrWarnings=S_OK;
    LPSTR   lpstrIn = NULL, lpstrOut = NULL;
    // nSrcSize starts at 0 because a failing IStream::Read is not guaranteed
    // to store a byte count.
    ULONG   nSrcSize = 0, nSrcUsed, nSrcLeft = 0, nDstSize, _nDstSize, nOutBuffSize ;

    if (lpdwMode)
        dwMode = *lpdwMode ;

    // allocate a temp input buffer of STR_BUFFER_SIZE bytes
    if ( (lpstrIn = (LPSTR) LocalAlloc(LPTR, STR_BUFFER_SIZE )) == NULL )
    {
        hrWarnings = E_OUTOFMEMORY ;
        goto exit;
    }

    // the output buffer starts at twice the input size and grows on demand
    if ( (lpstrOut = (LPSTR) LocalAlloc(LPTR, STR_BUFFER_SIZE * 2 )) == NULL )
    {
        hrWarnings = E_OUTOFMEMORY ;
        goto exit;
    }

    nOutBuffSize = STR_BUFFER_SIZE * 2 ;
    nSrcLeft = 0 ;

    // In the real world, clients use 28591 as 1252 and 28599 as 1254.
    // To correctly convert those extended characters to Unicode,
    // we internally replace the ISO codepage with the Windows one.
    if (dwDstEncoding == CP_UCS_2 || dwDstEncoding == CP_UCS_2_BE)
    {
        if ((dwSrcEncoding == CP_ISO_8859_1) && _IsValidCodePage(CP_1252))
            dwSrcEncoding = CP_1252;

        if ((dwSrcEncoding == CP_ISO_8859_9) && _IsValidCodePage(CP_1254))
            dwSrcEncoding = CP_1254;
    }

    if ((dwDstEncoding == CP_1252) && (dwSrcEncoding == CP_ISO_8859_1))
    {
        dwSrcEncoding = CP_1252;
    }

    if ((dwDstEncoding == CP_1254) && (dwSrcEncoding == CP_ISO_8859_9))
    {
        dwSrcEncoding = CP_1254;
    }

    if ( dwSrcEncoding == CP_JP_AUTO ) // Auto Detection for Japan
    {
        CIncdJapanese DetectJapan;
        UINT uiCodePage ;
        LARGE_INTEGER li;

        // a codepage cached in the high word of the mode wins over detection
        uiCodePage = ( dwMode >> 16 ) & 0xffff ;
        if ( uiCodePage )
            dwSrcEncoding = uiCodePage ;
        else
        {
            LISet32(li, 0);

            // peek at the first buffer, then rewind so the conversion loop
            // sees the stream from the start
            nSrcSize = 0 ;
            hr = pstmIn->Read(lpstrIn, STR_BUFFER_SIZE , &nSrcSize);
            if (S_OK != hr)
                hrWarnings = hr;
            hr = pstmIn->Seek(li,STREAM_SEEK_SET, NULL);
            if (S_OK != hr)
                hrWarnings = hr;

            dwSrcEncoding = DetectJapan.DetectStringA(lpstrIn, nSrcSize);
            // if dwSrcEncoding is zero there is an ambiguity; we don't return
            // the detected codepage to the caller, instead we default the
            // codepage internally to SJIS
            if (dwSrcEncoding)
            {
                dwMode &= 0x0000ffff ;
                dwMode |= dwSrcEncoding << 16 ;
            }
            else
                dwSrcEncoding = CP_JPN_SJ;
        }
    }
    // bug #43190, we auto-detect again for euc-kr page because IMN ver 1.0
    // mislabels an ISO-KR page as a ks_c_5601-1987 page. This is the only way
    // we can fix that mistake.
    else if ( dwSrcEncoding == CP_KR_AUTO || dwSrcEncoding == CP_KOR_5601 ||
              dwSrcEncoding == CP_EUC_KR )
    {
        CIncdKorean DetectKorean;
        UINT uiCodePage ;
        LARGE_INTEGER li;

        uiCodePage = ( dwMode >> 16 ) & 0xffff ;
        if ( uiCodePage )
            dwSrcEncoding = uiCodePage ;
        else
        {
            LISet32(li, 0);

            nSrcSize = 0 ;
            hr = pstmIn->Read(lpstrIn, STR_BUFFER_SIZE, &nSrcSize);
            if (S_OK != hr)
                hrWarnings = hr;
            hr = pstmIn->Seek(li,STREAM_SEEK_SET, NULL);
            if (S_OK != hr)
                hrWarnings = hr;
            dwSrcEncoding = DetectKorean.DetectStringA(lpstrIn, nSrcSize);
            // zero means no detection result: fall back to CP_KOR_5601
            // without caching anything in the mode
            if (dwSrcEncoding)
            {
                dwMode &= 0x0000ffff ;
                dwMode |= dwSrcEncoding << 16 ;
            }
            else
                dwSrcEncoding = CP_KOR_5601;
        }
    }
    else if ( dwSrcEncoding == CP_AUTO ) // General Auto Detection for all code pages
    {
        INT nScores = 1;
        // NOTE(review): this local shadows the dwSrcEncoding parameter, so the
        // value handed to ConvertSetup below stays CP_AUTO. The detected page
        // is published through the CP_AUTO entry of aEncodingInfo instead.
        // When a cached codepage is found in dwMode that entry is NOT
        // refreshed here - confirm that this is intended.
        DWORD dwSrcEncoding ;
        DetectEncodingInfo Encoding;
        UINT uiCodePage ;
        LARGE_INTEGER li;

        uiCodePage = ( dwMode >> 16 ) & 0xffff ;
        if ( uiCodePage )
            dwSrcEncoding = uiCodePage ;
        else
        {
            LISet32(li, 0);

            nSrcSize = 0 ;
            hr = pstmIn->Read(lpstrIn, STR_BUFFER_SIZE , &nSrcSize);
            if (S_OK != hr)
                hrWarnings = hr;
            hr = pstmIn->Seek(li,STREAM_SEEK_SET, NULL);
            if (S_OK != hr)
                hrWarnings = hr;

            // never hand more than DETECTION_MAX_LEN bytes to the detector
            if (DETECTION_MAX_LEN < nSrcSize)
                nSrcSize = DETECTION_MAX_LEN;

            if ( S_OK == _DetectInputCodepage(MLDETECTCP_HTML, 1252, lpstrIn, (int *)&nSrcSize, &Encoding, &nScores))
            {
                dwSrcEncoding = Encoding.nCodePage;
                dwMode &= 0x0000ffff ;
                dwMode |= dwSrcEncoding << 16 ;
            }
            else
            {
                dwSrcEncoding = CP_ACP;
            }
            aEncodingInfo[GetWindowsEncodingIndex(CP_AUTO)].dwCodePage = dwSrcEncoding;
        }
    }

    if ( S_OK == ( hr = INetConvert->ConvertSetup(&dwSrcEncoding,dwDstEncoding )))
    {
        // Loop until the input stream has no more data
        while(1)
        {
            // Read a buffer, appending after any bytes left unconverted by
            // the previous pass
            nSrcSize = 0 ;
            hr = pstmIn->Read(&lpstrIn[nSrcLeft], STR_BUFFER_SIZE-nSrcLeft, &nSrcSize);
            if (S_OK != hr)
                hrWarnings = hr;

            // Done
            if (0 == nSrcSize)
                break;

            nSrcSize += nSrcLeft ;
            nSrcUsed = nSrcSize ;
            dwModeTemp = dwMode ;
            nDstSize = 0 ;

            // sizing pass: NULL output buffer and a scratch copy of the mode,
            // so the real mode is only advanced by the conversion pass below
            hr = INetConvert->DoCodeConvert(&dwModeTemp, (LPCSTR)lpstrIn, (LPINT)&nSrcUsed, NULL, (LPINT)&nDstSize, dwFlag, lpFallBack);
            if (S_OK != hr)
                hrWarnings = hr;

            // Reallocate output buffer if the converted data does not fit
            if ( nDstSize > nOutBuffSize )
            {
                LPSTR psz = (LPSTR) LocalReAlloc(lpstrOut, nDstSize, LMEM_ZEROINIT|LMEM_MOVEABLE);
                if (psz == NULL)
                {
                    // report the allocation failure even if an earlier call
                    // left a non-S_OK value in hr
                    hr = E_OUTOFMEMORY ;
                    hrWarnings = E_OUTOFMEMORY ;
                    goto exit;
                }
                lpstrOut = psz;
                nOutBuffSize = nDstSize ;
            }
            _nDstSize = nDstSize;

            // Due to multi-stage conversion, this is the size actually used
            nSrcUsed = INetConvert->_nSrcSize ;
            nSrcLeft = nSrcSize - nSrcUsed ;

            // do conversion
            hr = INetConvert->DoCodeConvert(&dwMode, (LPCSTR)lpstrIn, (LPINT)&nSrcUsed, lpstrOut, (LPINT)&_nDstSize, dwFlag, lpFallBack);
            if (S_OK != hr)
                hrWarnings = hr;

            // Write It
            // NOTE(review): the byte count written is the one from the sizing
            // pass (nDstSize), not _nDstSize from the conversion pass -
            // confirm that the two always agree.
            hr = pstmOut->Write(lpstrOut, nDstSize, &nDstSize);
            if (S_OK != hr)
                hrWarnings = hr;

            // carry the unconverted tail to the front of the input buffer
            if (nSrcLeft )
                MoveMemory(lpstrIn, &lpstrIn[nSrcSize-nSrcLeft],nSrcLeft);

            INetConvert->ConvertCleanUp();
        }
    }

    // give back to the stream whatever was read but never converted
    if (nSrcLeft )
    {
        LARGE_INTEGER li;

        LISet32(li, -(LONG)nSrcLeft );
        hr = pstmIn->Seek(li,STREAM_SEEK_CUR, NULL);
    }

    if (lpdwMode)
        *lpdwMode = dwMode ;

exit :
    if (lpstrIn)
        LocalFree(lpstrIn);
    if (lpstrOut)
        LocalFree(lpstrOut);

    // Done
    return (hr == S_OK) ? hrWarnings : hr;
}
|
|
|
|
|
|
//
// ConvertINetStringInIStream
//
// Exported entry point: creates a CICharConverter for this one call, runs
// _ConvertINetStringInIStream with it, and destroys the converter again.
// All arguments are forwarded unchanged.
//
// Returns E_OUTOFMEMORY if the converter cannot be created, otherwise the
// result of _ConvertINetStringInIStream.
//
HRESULT WINAPI ConvertINetStringInIStream(LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, IStream *pstmIn, IStream *pstmOut, DWORD dwFlag, WCHAR *lpFallBack)
{
    CICharConverter *pConverter = new CICharConverter(dwFlag, lpFallBack);

    if (NULL == pConverter)
    {
        return E_OUTOFMEMORY;
    }

    HRESULT hrResult = _ConvertINetStringInIStream(pConverter,
                                                   lpdwMode,
                                                   dwSrcEncoding,
                                                   dwDstEncoding,
                                                   pstmIn,
                                                   pstmOut,
                                                   dwFlag,
                                                   lpFallBack);

    // the converter is owned by this function; release it before returning
    delete pConverter;

    return hrResult;
}
|
|
|