Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2218 lines
78 KiB

  1. #include "private.h"
  2. #include "detcbase.h"
  3. #include "codepage.h"
  4. #include "detcjpn.h"
  5. #include "detckrn.h"
  6. #include "fechrcnv.h"
  7. #include "ichrcnv.h"
  8. #include "cpdetect.h"
// Conversion-type codes.
// NOTE(review): none of these six values is used in the visible part of this file. Each one
// equals the sum of two bTypeUUIW bit values from the bit table documented above aEncodingInfo
// (e.g. 12 = 8 + 4, 3 = 2 + 1) -- confirm the exact meaning at the use site.
#define CONV_UU 12
#define CONV_UUW 10
#define CONV_UUWI 9
#define CONV_UW 6
#define CONV_UWI 5
#define CONV_WI 3
// Number of source lengths (n, n-1, ...) the conversion routines probe when looking for a
// multibyte character that was split at the end of the input.
#define MAX_CHAR_SIZE 4
// Maps code page 50000 to 1252; every other value is passed through unchanged.
#define MAPUSERDEF(x) (((x) == 50000) ? 1252 : (x))
// CP_USER_DEFINED is always treated as valid; any other code page is checked with IsValidCodePage.
#define CONVERT_IS_VALIDCODEPAGE(x) (((x) == CP_USER_DEFINED) ? TRUE: IsValidCodePage(x))
// ENCODINGINFO::dwFlags bit; set on the US-ASCII, ISO 8859-1 and ISO 8859-15 rows of aEncodingInfo.
#define CONV_CHK_NLS 0x00000001
// One row of the aEncodingInfo table: describes an encoding that needs a special converter.
struct ENCODINGINFO
{
DWORD dwEncoding; // encoding identifier this row describes
DWORD dwCodePage; // Windows code page paired with the encoding; 0 on the Unicode/UTF rows
BYTE bTypeUUIW; // bit mask classifying the encoding (see the bit table above aEncodingInfo)
CP_STATE nCP_State ; // whether this is a valid Windows code page; every row starts as INVALID_CP
DWORD dwFlags; // per-encoding behavior flags (CONV_CHK_NLS or 0 in the table below)
};
// Unicode signature value.
// NOTE(review): the Unicode byte-order mark is U+FEFF; 0xFFFE is its byte-swapped form.
// The use site is not visible here -- confirm that the swapped value is intended.
static WCHAR UniocdeSignature = { 0xFFFE } ;
  28. /*
  29. Bit 4 (16) - Unicode <-> Internet Encoding
  30. Bit 3 (8) - UTF8, UTF7
  31. Bit 2 (4) - Unicode
  32. Bit 1 (2) - Windows CodePage
  33. Bit 0 (1) - Internet Encoding
  34. P.S. if bit 4 is set, it means it should convert between Unicode and Internet
  35. Encoding directly, no intermediate step - Windows CodePage
  36. */
// Encodings (including Unicode) that need a special converter.
// Columns: dwEncoding, dwCodePage, bTypeUUIW, nCP_State, dwFlags.
// Comment prefixes: W = Windows code page, U = Unicode, I = Internet encoding.
static struct ENCODINGINFO aEncodingInfo[] =
{
{ CP_JPN_SJ, 932, 0x02, INVALID_CP, 0 }, // W-Japanese Shift JIS
{ CP_CHN_GB, 936, 0x02, INVALID_CP, 0 }, // W-Simplified Chinese
{ CP_KOR_5601, 949, 0x02, INVALID_CP, 0 }, // W-Korean Unified Hangul
{ CP_TWN, 950, 0x02, INVALID_CP, 0 }, // W-Traditional Chinese
{ CP_UCS_2, 0, 0x04, INVALID_CP, 0 }, // U-Unicode
{ CP_UCS_2_BE, 0, 0x04, INVALID_CP, 0 }, // U-Unicode Big Endian
{ CP_1252, 1252, 0x02, INVALID_CP, 0 }, // W-Latin 1
{ CP_20127, 1252, 0x11, INVALID_CP, CONV_CHK_NLS }, // I-US ASCII
{ CP_ISO_8859_1, 1252, 0x11, INVALID_CP, CONV_CHK_NLS }, // I-ISO 8859-1 Latin 1
{ CP_ISO_8859_15, 1252, 0x11, INVALID_CP, CONV_CHK_NLS }, // I-ISO 8859-15 Latin 9
{ CP_AUTO, 1252, 0x01, INVALID_CP, 0 }, // General auto detect
{ CP_ISO_2022_JP, 932, 0x01, INVALID_CP, 0 }, // I-ISO 2022-JP No Halfwidth Katakana
{ CP_ISO_2022_JP_ESC, 932, 0x01, INVALID_CP, 0 }, // I-ISO 2022-JP w/esc Halfwidth Katakana
{ CP_ISO_2022_JP_SIO, 932, 0x01, INVALID_CP, 0 }, // I-ISO 2022-JP w/sio Halfwidth Katakana
{ CP_ISO_2022_KR, 949, 0x01, INVALID_CP, 0 }, // I-ISO 2022-KR
{ CP_ISO_2022_TW, 950, 0x01, INVALID_CP, 0 }, // I-ISO 2022-TW
{ CP_ISO_2022_CH, 936, 0x01, INVALID_CP, 0 }, // I-ISO 2022-CH
{ CP_JP_AUTO, 932, 0x01, INVALID_CP, 0 }, // JP auto detect
{ CP_CHS_AUTO, 936, 0x01, INVALID_CP, 0 }, // Simplified Chinese auto detect
{ CP_KR_AUTO, 949, 0x01, INVALID_CP, 0 }, // KR auto detect
{ CP_CHT_AUTO, 950, 0x01, INVALID_CP, 0 }, // Traditional Chinese auto detect
{ CP_CYRILLIC_AUTO, 1251, 0x01, INVALID_CP, 0 }, // Cyrillic auto detect
{ CP_GREEK_AUTO, 1253, 0x01, INVALID_CP, 0 }, // Greek auto detect
{ CP_ARABIC_AUTO, 1256, 0x01, INVALID_CP, 0 }, // Arabic auto detect
{ CP_EUC_JP, 932, 0x01, INVALID_CP, 0 }, // EUC Japanese
{ CP_EUC_CH, 936, 0x01, INVALID_CP, 0 }, // EUC Chinese
{ CP_EUC_KR, 949, 0x01, INVALID_CP, 0 }, // EUC Korean
{ CP_EUC_TW, 950, 0x01, INVALID_CP, 0 }, // EUC Taiwanese
{ CP_CHN_HZ, 936, 0x01, INVALID_CP, 0 }, // Simplified Chinese HZ-GB
{ CP_UTF_7, 0, 0x08, INVALID_CP, 0 }, // U-UTF7
{ CP_UTF_8, 0, 0x08, INVALID_CP, 0 }, // U-UTF8
};
// HTML named-entity table for the Latin-1 Supplement block, U+00A0 through U+00FF.
// Entry [ch - NAME_ENTITY_OFFSET] is the entity for character ch; each trailing comment gives
// the equivalent numeric character reference, the code point and the character name.
#define NAME_ENTITY_OFFSET 0x00A0
#define NAME_ENTITY_MAX 0x00FF
#define NAME_ENTITY_ENTRY 96
static CHAR *g_lpstrNameEntity[NAME_ENTITY_ENTRY] =
{
"&nbsp;", // "&#160;" U+00A0 no-break space
"&iexcl;", // "&#161;" U+00A1 inverted exclamation mark
"&cent;", // "&#162;" U+00A2 cent sign
"&pound;", // "&#163;" U+00A3 pound sign
"&curren;", // "&#164;" U+00A4 currency sign
"&yen;", // "&#165;" U+00A5 yen sign
"&brvbar;", // "&#166;" U+00A6 broken bar
"&sect;", // "&#167;" U+00A7 section sign
"&uml;", // "&#168;" U+00A8 diaeresis
"&copy;", // "&#169;" U+00A9 copyright sign
"&ordf;", // "&#170;" U+00AA feminine ordinal indicator
"&laquo;", // "&#171;" U+00AB left-pointing double angle quotation mark
"&not;", // "&#172;" U+00AC not sign
"&shy;", // "&#173;" U+00AD soft hyphen
"&reg;", // "&#174;" U+00AE registered sign
"&macr;", // "&#175;" U+00AF macron
"&deg;", // "&#176;" U+00B0 degree sign
"&plusmn;", // "&#177;" U+00B1 plus-minus sign
"&sup2;", // "&#178;" U+00B2 superscript two
"&sup3;", // "&#179;" U+00B3 superscript three
"&acute;", // "&#180;" U+00B4 acute accent
"&micro;", // "&#181;" U+00B5 micro sign
"&para;", // "&#182;" U+00B6 pilcrow sign
"&middot;", // "&#183;" U+00B7 middle dot
"&cedil;", // "&#184;" U+00B8 cedilla
"&sup1;", // "&#185;" U+00B9 superscript one
"&ordm;", // "&#186;" U+00BA masculine ordinal indicator
"&raquo;", // "&#187;" U+00BB right-pointing double angle quotation mark
"&frac14;", // "&#188;" U+00BC vulgar fraction one quarter
"&frac12;", // "&#189;" U+00BD vulgar fraction one half
"&frac34;", // "&#190;" U+00BE vulgar fraction three quarters
"&iquest;", // "&#191;" U+00BF inverted question mark
"&Agrave;", // "&#192;" U+00C0 latin capital letter A with grave
"&Aacute;", // "&#193;" U+00C1 latin capital letter A with acute
"&Acirc;", // "&#194;" U+00C2 latin capital letter A with circumflex
"&Atilde;", // "&#195;" U+00C3 latin capital letter A with tilde
"&Auml;", // "&#196;" U+00C4 latin capital letter A with diaeresis
"&Aring;", // "&#197;" U+00C5 latin capital letter A with ring above
"&AElig;", // "&#198;" U+00C6 latin capital letter AE
"&Ccedil;", // "&#199;" U+00C7 latin capital letter C with cedilla
"&Egrave;", // "&#200;" U+00C8 latin capital letter E with grave
"&Eacute;", // "&#201;" U+00C9 latin capital letter E with acute
"&Ecirc;", // "&#202;" U+00CA latin capital letter E with circumflex
"&Euml;", // "&#203;" U+00CB latin capital letter E with diaeresis
"&Igrave;", // "&#204;" U+00CC latin capital letter I with grave
"&Iacute;", // "&#205;" U+00CD latin capital letter I with acute
"&Icirc;", // "&#206;" U+00CE latin capital letter I with circumflex
"&Iuml;", // "&#207;" U+00CF latin capital letter I with diaeresis
"&ETH;", // "&#208;" U+00D0 latin capital letter ETH
"&Ntilde;", // "&#209;" U+00D1 latin capital letter N with tilde
"&Ograve;", // "&#210;" U+00D2 latin capital letter O with grave
"&Oacute;", // "&#211;" U+00D3 latin capital letter O with acute
"&Ocirc;", // "&#212;" U+00D4 latin capital letter O with circumflex
"&Otilde;", // "&#213;" U+00D5 latin capital letter O with tilde
"&Ouml;", // "&#214;" U+00D6 latin capital letter O with diaeresis
"&times;", // "&#215;" U+00D7 multiplication sign
"&Oslash;", // "&#216;" U+00D8 latin capital letter O with stroke
"&Ugrave;", // "&#217;" U+00D9 latin capital letter U with grave
"&Uacute;", // "&#218;" U+00DA latin capital letter U with acute
"&Ucirc;", // "&#219;" U+00DB latin capital letter U with circumflex
"&Uuml;", // "&#220;" U+00DC latin capital letter U with diaeresis
"&Yacute;", // "&#221;" U+00DD latin capital letter Y with acute
"&THORN;", // "&#222;" U+00DE latin capital letter THORN
"&szlig;", // "&#223;" U+00DF latin small letter sharp s
"&agrave;", // "&#224;" U+00E0 latin small letter a with grave
"&aacute;", // "&#225;" U+00E1 latin small letter a with acute
"&acirc;", // "&#226;" U+00E2 latin small letter a with circumflex
"&atilde;", // "&#227;" U+00E3 latin small letter a with tilde
"&auml;", // "&#228;" U+00E4 latin small letter a with diaeresis
"&aring;", // "&#229;" U+00E5 latin small letter a with ring above
"&aelig;", // "&#230;" U+00E6 latin small letter ae
"&ccedil;", // "&#231;" U+00E7 latin small letter c with cedilla
"&egrave;", // "&#232;" U+00E8 latin small letter e with grave
"&eacute;", // "&#233;" U+00E9 latin small letter e with acute
"&ecirc;", // "&#234;" U+00EA latin small letter e with circumflex
"&euml;", // "&#235;" U+00EB latin small letter e with diaeresis
"&igrave;", // "&#236;" U+00EC latin small letter i with grave
"&iacute;", // "&#237;" U+00ED latin small letter i with acute
"&icirc;", // "&#238;" U+00EE latin small letter i with circumflex
"&iuml;", // "&#239;" U+00EF latin small letter i with diaeresis
"&eth;", // "&#240;" U+00F0 latin small letter eth
"&ntilde;", // "&#241;" U+00F1 latin small letter n with tilde
"&ograve;", // "&#242;" U+00F2 latin small letter o with grave
"&oacute;", // "&#243;" U+00F3 latin small letter o with acute
"&ocirc;", // "&#244;" U+00F4 latin small letter o with circumflex
"&otilde;", // "&#245;" U+00F5 latin small letter o with tilde
"&ouml;", // "&#246;" U+00F6 latin small letter o with diaeresis
"&divide;", // "&#247;" U+00F7 division sign
"&oslash;", // "&#248;" U+00F8 latin small letter o with stroke
"&ugrave;", // "&#249;" U+00F9 latin small letter u with grave
"&uacute;", // "&#250;" U+00FA latin small letter u with acute
"&ucirc;", // "&#251;" U+00FB latin small letter u with circumflex
"&uuml;", // "&#252;" U+00FC latin small letter u with diaeresis
"&yacute;", // "&#253;" U+00FD latin small letter y with acute
"&thorn;", // "&#254;" U+00FE latin small letter thorn
"&yuml;", // "&#255;" U+00FF latin small letter y with diaeresis
};
  175. #ifdef MORE_NAME_ENTITY // in case we decide to do more name entity latter
  176. // Additional HTML 4.0 name entity table for CP 1252 extension character set
  177. #define CP1252EXT_BASE (UINT)0x0080
  178. #define CP1252EXT_MAX (UINT)0x009F
  179. #define NONUNI 0xFFFF
  180. #define UNDEFCHAR "???????"
  181. #define CP1252EXT_NCR_SIZE 7
  182. struct NAME_ENTITY_EXT
  183. {
  184. UWORD uwUniCode;
  185. LPCTSTR lpszNameEntity;
  186. };
  187. static struct NAME_ENTITY_EXT aNameEntityExt[] =
  188. {
  189. // UniCode NCR_Enty Name_Enty CP1252Ext Comment
  190. { 0x20AC, "&#8364;" }, // "&euro;" }, // &#128; #EURO SIGN
  191. // { NONUNI, UNDEFCHAR }, // "&;" }, // &#129; #UNDEFINED
  192. { 0x201A, "&#8218;" }, // "&sbquo;" }, // &#130; #SINGLE LOW-9 QUOTATION MARK
  193. { 0x0192, "&#0402;" }, // "&fnof;" }, // &#131; #LATIN SMALL LETTER F WITH HOOK
  194. { 0x201E, "&#8222;" }, // "&bdquo;" }, // &#132; #DOUBLE LOW-9 QUOTATION MARK
  195. { 0x2026, "&#8230;" }, // "&hellip;" }, // &#133; #HORIZONTAL ELLIPSIS
  196. { 0x2020, "&#8224;" }, // "&dagger;" }, // &#134; #DAGGER
  197. { 0x2021, "&#8225;" }, // "&Dagger;" }, // &#135; #DOUBLE DAGGER
  198. { 0x02C6, "&#0710;" }, // "&circ;" }, // &#136; #MODIFIER LETTER CIRCUMFLEX ACCENT
  199. { 0x2030, "&#8240;" }, // "&permil;" }, // &#137; #PER MILLE SIGN
  200. { 0x0160, "&#0352;" }, // "&Scaron;" }, // &#138; #LATIN CAPITAL LETTER S WITH CARON
  201. { 0x2039, "&#8249;" }, // "&lsaquo;" }, // &#139; #SINGLE LEFT-POINTING ANGLE QUOTATION MARK
  202. { 0x0152, "&#0338;" }, // "&OElig;" }, // &#140; #LATIN CAPITAL LIGATURE OE
  203. // { NONUNI, UNDEFCHAR }, // "&;" }, // &#141; #UNDEFINED
  204. { 0x017D, "&#0381;" }, // "&;" }, // &#142; #LATIN CAPITAL LETTER Z WITH CARON, ***no name entity defined in HTML 4.0***
  205. // { NONUNI, UNDEFCHAR }, // "&;" }, // &#143; #UNDEFINED
  206. // { NONUNI, UNDEFCHAR }, // "&;" }, // &#144; #UNDEFINED
  207. { 0x2018, "&#8216;" }, // "&lsquo;" }, // &#145; #LEFT SINGLE QUOTATION MARK
  208. { 0x2019, "&#8217;" }, // "&rsquo;" }, // &#146; #RIGHT SINGLE QUOTATION MARK
  209. { 0x201C, "&#8220;" }, // "&ldquo;" }, // &#147; #LEFT DOUBLE QUOTATION MARK
  210. { 0x201D, "&#8221;" }, // "&rdquo;" }, // &#148; #RIGHT DOUBLE QUOTATION MARK
  211. { 0x2022, "&#8226;" }, // "&bull;" }, // &#149; #BULLET
  212. { 0x2013, "&#8211;" }, // "&ndash;" }, // &#150; #EN DASH
  213. { 0x2014, "&#8212;" }, // "&mdash;" }, // &#151; #EM DASH
  214. { 0x20DC, "&#0732;" }, // "&tilde;" }, // &#152; #SMALL TILDE
  215. { 0x2122, "&#8482;" }, // "&trade;" }, // &#153; #TRADE MARK SIGN
  216. { 0x0161, "&#0353;" }, // "&scaron;" }, // &#154; #LATIN SMALL LETTER S WITH CARON
  217. { 0x203A, "&#8250;" }, // "&rsaquo;" }, // &#155; #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
  218. { 0x0153, "&#0339;" }, // "&oelig;" }, // &#156; #LATIN SMALL LIGATURE OE
  219. // { NONUNI, UNDEFCHAR }, // "&;" }, // &#157; #UNDEFINED
  220. { 0x017E, "&#0382;" }, // "&;" }, // &#158; #LATIN SMALL LETTER Z WITH CARON, ***no name entity defined in HTML 4.0***
  221. { 0x0178, "&#0376;" }, // "&Yuml;" }, // &#159; #LATIN CAPITAL LETTER Y WITH DIAERESIS
  222. };
  223. #endif
// Forward declaration; the definition is not in the visible part of this file.
// The callers below pass fInbound = TRUE when converting towards Unicode or the Windows code
// page and FALSE for the opposite direction, and pass the destination capacity both by value
// (cchDest) and through lpnSize.
HRESULT WINAPI DoConvertINetString(LPDWORD lpdwMode, BOOL fInbound, UINT uCodePage, int nCodeSet, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDestStr, int cchDest, LPINT lpnSize);
  225. /******************************************************************************
  226. ***************************** U T I L I T I E S ***************************
  227. ******************************************************************************/
  228. void DataByteSwap(LPSTR DataBuf, int len )
  229. {
  230. int i ;
  231. UCHAR tmpData ;
  232. if ( len )
  233. for ( i = 0 ; i < len-1 ; i+=2 )
  234. {
  235. tmpData = DataBuf[i] ;
  236. DataBuf[i] = DataBuf[i+1] ;
  237. DataBuf[i+1] = tmpData ;
  238. }
  239. return ;
  240. }
  241. void CheckUnicodeDataType(DWORD dwDstEncoding, LPSTR DataBuf, int len )
  242. {
  243. if ( DataBuf && len )
  244. {
  245. if ( dwDstEncoding == CP_UCS_2_BE )
  246. DataByteSwap(DataBuf,len);
  247. }
  248. return ;
  249. }
  250. /******************************************************************************
  251. ****************** C O N V E R T I N E T S T R I N G ******************
  252. ******************************************************************************/
  253. HRESULT CICharConverter::UnicodeToMultiByteEncoding(DWORD dwDstEncoding, LPCSTR lpSrcStr, LPINT lpnSrcSize,
  254. LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  255. {
  256. int nBuffSize, i ;
  257. BOOL UseDefChar = FALSE ;
  258. LPSTR lpDefFallBack = NULL ;
  259. UCHAR DefaultCharBuff[3]; // possible DBCS + null
  260. HRESULT hr = E_FAIL;
  261. int _nDstSize = *lpnDstSize;
  262. if ( _dwUnicodeEncoding == CP_UCS_2_BE && _cvt_count == 0 )
  263. {
  264. if ( _lpUnicodeStr = (LPSTR)LocalAlloc(LPTR, *lpnSrcSize ) )
  265. {
  266. MoveMemory(_lpUnicodeStr, lpSrcStr, *lpnSrcSize ) ;
  267. lpSrcStr = _lpUnicodeStr ;
  268. }
  269. else
  270. {
  271. hr = E_OUTOFMEMORY;
  272. goto EXIT;
  273. }
  274. }
  275. CheckUnicodeDataType(_dwUnicodeEncoding, (LPSTR) lpSrcStr, *lpnSrcSize);
  276. nBuffSize = *lpnSrcSize / sizeof(WCHAR);
  277. // We force to use MLang NO_BEST_FIT_CHAR check on ISCII encoding since system don't accept default chars
  278. if (IS_NLS_DLL_CP(dwDstEncoding) && (dwFlag & MLCONVCHARF_USEDEFCHAR))
  279. dwFlag |= MLCONVCHARF_NOBESTFITCHARS;
  280. if ( lpFallBack && ( dwFlag & MLCONVCHARF_USEDEFCHAR ))
  281. {
  282. // only take SBCS, no DBCS character
  283. if ( 1 == WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
  284. (LPCWSTR)lpFallBack, 1,
  285. (LPSTR)DefaultCharBuff, sizeof(DefaultCharBuff), NULL, NULL ))
  286. lpDefFallBack = (LPSTR) DefaultCharBuff;
  287. }
  288. if(!(*lpnDstSize = WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
  289. (LPCWSTR)lpSrcStr, nBuffSize,
  290. lpDstStr, *lpnDstSize, IS_NLS_DLL_CP(dwDstEncoding)? NULL:(LPCSTR)lpDefFallBack, IS_NLS_DLL_CP(dwDstEncoding)? NULL:&UseDefChar)))
  291. {
  292. hr = E_FAIL;
  293. goto EXIT;
  294. }
  295. if ( !_cvt_count ) // save SrcSize if it is the first time conversion
  296. _nSrcSize = nBuffSize * sizeof(WCHAR);
  297. if (*lpnDstSize)
  298. {
  299. if (dwFlag & ( MLCONVCHARF_NCR_ENTITIZE | MLCONVCHARF_NAME_ENTITIZE | MLCONVCHARF_NOBESTFITCHARS ))
  300. {
  301. char *lpDstStrTmp = lpDstStr;
  302. WCHAR *lpwStrTmp = NULL;
  303. WCHAR *lpwStrTmpSave = NULL;
  304. char *lpDstStrTmp2 = NULL;
  305. char *lpDstStrTmp2Save = NULL;
  306. int cCount, ConvCount = 0, nCount = 0;
  307. WCHAR *lpwSrcStrTmp = (WCHAR *)lpSrcStr;
  308. int *lpBCharOffset = NULL;
  309. int *lpBCharOffsetSave = NULL;
  310. if (!(lpwStrTmpSave = lpwStrTmp = (WCHAR *)LocalAlloc(LPTR, *lpnSrcSize)))
  311. {
  312. hr = E_OUTOFMEMORY;
  313. goto ENTITIZE_DONE;
  314. }
  315. // Make sure we have real converted buffer to check BEST_FIT_CHAR and DEFAULT_CHAR
  316. if (!_nDstSize)
  317. {
  318. lpDstStrTmp2Save = lpDstStrTmp2 = (char *)LocalAlloc(LPTR, *lpnDstSize);
  319. if (lpDstStrTmp2)
  320. {
  321. WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
  322. (LPCWSTR)lpSrcStr, nBuffSize,
  323. lpDstStrTmp2, *lpnDstSize, NULL, NULL );
  324. }
  325. else
  326. {
  327. hr = E_OUTOFMEMORY;
  328. goto ENTITIZE_DONE;
  329. }
  330. }
  331. if (nBuffSize ==
  332. MultiByteToWideChar(MAPUSERDEF(dwDstEncoding), 0, _nDstSize? lpDstStr : lpDstStrTmp2, *lpnDstSize, lpwStrTmp, _nSrcSize))
  333. {
  334. // Pre scan to get number of best fit chars.
  335. for (i=0; i<nBuffSize; i++)
  336. {
  337. // make special case for ?(yen sign) in Shift-JIS
  338. if (*lpwStrTmp++ != *lpwSrcStrTmp++)
  339. {
  340. if ((dwDstEncoding == CP_JPN_SJ) && (*(lpwSrcStrTmp - 1) == 0x00A5))
  341. *(lpwStrTmp - 1) = 0x00A5;
  342. else
  343. nCount ++;
  344. }
  345. }
  346. lpwSrcStrTmp -= nBuffSize;
  347. lpwStrTmp -= nBuffSize;
  348. if (nCount)
  349. {
  350. int j = 0;
  351. if (!(dwFlag & ( MLCONVCHARF_NCR_ENTITIZE | MLCONVCHARF_NAME_ENTITIZE | MLCONVCHARF_USEDEFCHAR)))
  352. {
  353. hr = E_FAIL;
  354. goto ENTITIZE_DONE;
  355. }
  356. if (!(lpBCharOffsetSave = lpBCharOffset = (int *) LocalAlloc(LPTR, nCount*sizeof(int))))
  357. {
  358. hr = E_OUTOFMEMORY;
  359. goto ENTITIZE_DONE;
  360. }
  361. // Record the offset position of each best fit char.
  362. for (i=0; i<nBuffSize; i++)
  363. {
  364. if (*lpwStrTmp++ != *lpwSrcStrTmp++)
  365. {
  366. *lpBCharOffset = i-j;
  367. lpBCharOffset++;
  368. j = i+1;
  369. }
  370. }
  371. lpBCharOffset -= nCount;
  372. lpwSrcStrTmp -= nBuffSize;
  373. lpwStrTmp -= nBuffSize;
  374. for (i=0; i<nCount; i++)
  375. {
  376. BOOL bIsSurrogatePair = FALSE;
  377. if (*lpBCharOffset)
  378. {
  379. cCount = WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
  380. (LPCWSTR)lpwSrcStrTmp, *lpBCharOffset,
  381. lpDstStrTmp, _nDstSize? _nDstSize-ConvCount : 0, NULL, NULL );
  382. ConvCount += cCount;
  383. if (_nDstSize)
  384. {
  385. lpDstStrTmp += cCount;
  386. }
  387. lpwSrcStrTmp += *lpBCharOffset;
  388. }
  389. BOOL fConverted = FALSE;
  390. // check if unconvertable character falls in NAME ENTITY area
  391. if (dwFlag & MLCONVCHARF_NAME_ENTITIZE)
  392. {
  393. // for beta2, make assmption that name entity implys NCR.
  394. dwFlag |= MLCONVCHARF_NCR_ENTITIZE;
  395. #ifdef MORE_NAME_ENTITY // in case we decide do more name entity latter
  396. BOOL fDoNEnty = FALSE;
  397. LPCTSTR lpszNEnty = NULL;
  398. // check if character is in the Latin-1 Supplement range
  399. if ((*lpwSrcStrTmp >= NAME_ENTITY_OFFSET) && (*lpwSrcStrTmp <= NAME_ENTITY_MAX ))
  400. {
  401. fDoNEnty = TRUE;
  402. lpszNEnty = g_lpstrNameEntity[(*lpwSrcStrTmp) - NAME_ENTITY_OFFSET];
  403. }
  404. // check if character is in the additional name entity table for CP 1252 extension
  405. if (!fDoNEnty)
  406. {
  407. for (int idx = 0; idx < ARRAYSIZE(aNameEntityExt); idx++)
  408. if (*lpwSrcStrTmp == aNameEntityExt[idx].uwUniCode)
  409. {
  410. fDoNEnty = TRUE;
  411. lpszNEnty = aNameEntityExt[idx].lpszNameEntity;
  412. break;
  413. }
  414. }
  415. if (fDoNEnty)
  416. {
  417. cCount = lstrlenA(lpszNEnty);
  418. if (_nDstSize)
  419. {
  420. CopyMemory(lpDstStrTmp, lpszNEnty, cCount);
  421. lpDstStrTmp += cCount ;
  422. }
  423. ConvCount += cCount;
  424. fConverted = TRUE;
  425. }
  426. #else
  427. // check if character is in the Latin-1 Supplement range
  428. if ((*lpwSrcStrTmp >= NAME_ENTITY_OFFSET)
  429. && (*lpwSrcStrTmp < ARRAYSIZE(g_lpstrNameEntity)+NAME_ENTITY_OFFSET))
  430. {
  431. LPCTSTR lpszNEnty = NULL;
  432. if (!(lpszNEnty = g_lpstrNameEntity[(*lpwSrcStrTmp) - NAME_ENTITY_OFFSET]))
  433. {
  434. #ifdef DEBUG
  435. AssertMsg((BOOL)FALSE, "Name entity table broken");
  436. #endif
  437. hr = E_FAIL;
  438. goto ENTITIZE_DONE;
  439. }
  440. cCount = lstrlenA(lpszNEnty);
  441. if (_nDstSize)
  442. {
  443. CopyMemory(lpDstStrTmp, lpszNEnty, cCount);
  444. lpDstStrTmp += cCount ;
  445. }
  446. ConvCount += cCount;
  447. fConverted = TRUE;
  448. }
  449. #endif
  450. }
  451. // check if NCR requested
  452. if ((!fConverted) && (dwFlag & MLCONVCHARF_NCR_ENTITIZE))
  453. {
  454. if ((nCount-i >= 2) &&
  455. (*lpwSrcStrTmp >= 0xD800 && *lpwSrcStrTmp <= 0xDBFF) &&
  456. (*(lpwSrcStrTmp+1) >= 0xDC00 && *(lpwSrcStrTmp+1) <= 0xDFFF))
  457. bIsSurrogatePair = TRUE;
  458. else
  459. bIsSurrogatePair = FALSE;
  460. if (_nDstSize)
  461. {
  462. lpDstStrTmp[0] = '&' ;
  463. lpDstStrTmp[1] = '#' ;
  464. lpDstStrTmp += 2 ;
  465. // If it is a Unicode surrogates pair, we convert it to real Unicode value
  466. if (bIsSurrogatePair)
  467. {
  468. DWORD dwUnicode = ((*lpwSrcStrTmp - 0xD800) << 10) + *(lpwSrcStrTmp+1) - 0xDC00 + 0x10000;
  469. _ultoa( dwUnicode, (char*)lpDstStrTmp, 10);
  470. }
  471. else
  472. _ultoa( *lpwSrcStrTmp, (char*)lpDstStrTmp, 10);
  473. cCount = lstrlenA(lpDstStrTmp);
  474. lpDstStrTmp += cCount;
  475. ConvCount += cCount;
  476. *(lpDstStrTmp++) = ';' ;
  477. }
  478. else
  479. {
  480. char szTmpString[10];
  481. if (bIsSurrogatePair)
  482. {
  483. DWORD dwUnicode = ((*lpwSrcStrTmp - 0xD800) << 10) + *(lpwSrcStrTmp+1) - 0xDC00 + 0x10000;
  484. _ultoa( dwUnicode, szTmpString, 10);
  485. }
  486. else
  487. _ultoa( *lpwSrcStrTmp, szTmpString, 10);
  488. ConvCount += lstrlenA(szTmpString);
  489. }
  490. fConverted = TRUE;
  491. ConvCount += 3;
  492. }
  493. // handle MLCONVCHARF_USEDEFCHAR here - less priority and default method
  494. if (!fConverted)
  495. {
  496. if (_nDstSize)
  497. {
  498. *lpDstStrTmp = lpDefFallBack ? *lpDefFallBack : '?';
  499. lpDstStrTmp++;
  500. }
  501. ConvCount++;
  502. if (!UseDefChar)
  503. UseDefChar = TRUE;
  504. }
  505. lpBCharOffset++;
  506. lpwSrcStrTmp++;
  507. // Skip next character if it is a Unicode surrogates pair
  508. if (bIsSurrogatePair)
  509. {
  510. lpBCharOffset++;
  511. lpwSrcStrTmp++;
  512. i++;
  513. }
  514. }
  515. lpBCharOffset -= nCount ;
  516. }
  517. int nRemain = (*lpnSrcSize - (int)((char*)lpwSrcStrTmp - (char *)lpSrcStr))/sizeof(WCHAR);
  518. ConvCount += WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
  519. (LPCWSTR)lpwSrcStrTmp, nRemain,
  520. lpDstStrTmp, _nDstSize? _nDstSize-ConvCount : 0, NULL, NULL );
  521. *lpnDstSize = ConvCount ;
  522. hr = S_OK;
  523. }
  524. else
  525. {
  526. hr = E_FAIL;
  527. }
  528. ENTITIZE_DONE:
  529. if (lpwStrTmpSave)
  530. LocalFree(lpwStrTmpSave);
  531. if (lpDstStrTmp2Save)
  532. LocalFree(lpDstStrTmp2Save);
  533. if (lpBCharOffsetSave)
  534. LocalFree(lpBCharOffsetSave);
  535. }
  536. else
  537. {
  538. hr = S_OK;
  539. }
  540. if (S_OK == hr && UseDefChar)
  541. hr = S_FALSE;
  542. }
  543. else
  544. {
  545. hr = E_FAIL;
  546. }
  547. EXIT:
  548. return hr;
  549. }
  550. HRESULT CICharConverter::UTF78ToUnicode(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
  551. LPSTR lpDstStr, LPINT lpnDstSize)
  552. {
  553. HRESULT hr ;
  554. hr = DoConvertINetString(lpdwMode, TRUE, CP_UCS_2, _dwUTFEncoding, lpSrcStr, lpnSrcSize, lpDstStr, *lpnDstSize, lpnDstSize);
  555. if ( !_cvt_count ) // save SrcSize if it is the first time conversion
  556. _nSrcSize = *lpnSrcSize ;
  557. CheckUnicodeDataType(_dwUnicodeEncoding, lpDstStr, *lpnDstSize);
  558. return hr ;
  559. }
  560. HRESULT CICharConverter::UnicodeToUTF78(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
  561. LPSTR lpDstStr, LPINT lpnDstSize)
  562. {
  563. HRESULT hr ;
  564. if ( _dwUnicodeEncoding == CP_UCS_2_BE && _cvt_count == 0 )
  565. {
  566. if ( _lpUnicodeStr = (LPSTR)LocalAlloc(LPTR, *lpnSrcSize ) )
  567. {
  568. MoveMemory(_lpUnicodeStr, lpSrcStr, *lpnSrcSize ) ;
  569. lpSrcStr = _lpUnicodeStr ;
  570. }
  571. else
  572. return E_OUTOFMEMORY ;
  573. }
  574. CheckUnicodeDataType(_dwUnicodeEncoding, (LPSTR) lpSrcStr, *lpnSrcSize);
  575. hr = DoConvertINetString(lpdwMode, FALSE, CP_UCS_2, _dwUTFEncoding, lpSrcStr, lpnSrcSize, lpDstStr, *lpnDstSize, lpnDstSize);
  576. if ( !_cvt_count ) // save SrcSize if it is the first time conversion
  577. _nSrcSize = *lpnSrcSize ;
  578. return hr ;
  579. }
  580. HRESULT CICharConverter::UnicodeToWindowsCodePage(LPCSTR lpSrcStr, LPINT lpnSrcSize,
  581. LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  582. {
  583. HRESULT hr ;
  584. hr = UnicodeToMultiByteEncoding(_dwWinCodePage,lpSrcStr,lpnSrcSize,lpDstStr,lpnDstSize,dwFlag,lpFallBack);
  585. return hr ;
  586. }
  587. HRESULT CICharConverter::UnicodeToInternetEncoding(LPCSTR lpSrcStr, LPINT lpnSrcSize,
  588. LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  589. {
  590. HRESULT hr ;
  591. hr = UnicodeToMultiByteEncoding(_dwInternetEncoding,lpSrcStr,lpnSrcSize,lpDstStr,lpnDstSize,dwFlag,lpFallBack);
  592. return hr ;
  593. }
  594. HRESULT CICharConverter::InternetEncodingToUnicode(LPCSTR lpSrcStr, LPINT lpnSrcSize,
  595. LPSTR lpDstStr, LPINT lpnDstSize)
  596. {
  597. int cch = 0 ;
  598. int cb = *lpnSrcSize;
  599. if ( !_cvt_count )
  600. {
  601. // If we have a multibyte character encoding, we are at risk of splitting
  602. // some characters at the read boundary. We must Make sure we have a
  603. // discrete number of characters first.
  604. UINT uMax = MAX_CHAR_SIZE ;
  605. cb++; // pre-increment
  606. do
  607. {
  608. cch = MultiByteToWideChar( MAPUSERDEF(_dwInternetEncoding),
  609. MB_ERR_INVALID_CHARS | MB_PRECOMPOSED,
  610. lpSrcStr, --cb,
  611. NULL, 0 );
  612. --uMax;
  613. } while (!cch && uMax && cb);
  614. }
  615. if ( cb == (*lpnSrcSize - MAX_CHAR_SIZE +1 )) // if conversion problem isn't at the end of the string
  616. cb = *lpnSrcSize ; // restore orginal value
  617. *lpnDstSize = MultiByteToWideChar( MAPUSERDEF(_dwInternetEncoding), 0,
  618. lpSrcStr, cb,
  619. (LPWSTR)lpDstStr, *lpnDstSize/sizeof(WCHAR) );
  620. *lpnDstSize = *lpnDstSize * sizeof(WCHAR);
  621. if ( !_cvt_count ) // save SrcSize if it is the first time conversion
  622. _nSrcSize = cb ;
  623. CheckUnicodeDataType(_dwUnicodeEncoding, lpDstStr, *lpnDstSize);
  624. if (*lpnDstSize==0 && (cb || cb != *lpnSrcSize))
  625. return E_FAIL ;
  626. else
  627. return S_OK ;
  628. }
  629. HRESULT CICharConverter::WindowsCodePageToUnicode(LPCSTR lpSrcStr, LPINT lpnSrcSize,
  630. LPSTR lpDstStr, LPINT lpnDstSize)
  631. {
  632. int cch = 0 ;
  633. int cb = *lpnSrcSize;
  634. if ( !_cvt_count )
  635. {
  636. UINT uMax = MAX_CHAR_SIZE ;
  637. cb++; // pre-increment
  638. do
  639. {
  640. cch = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage),
  641. MB_ERR_INVALID_CHARS | MB_PRECOMPOSED,
  642. lpSrcStr, --cb,
  643. NULL, 0 );
  644. --uMax;
  645. } while (!cch && uMax && cb);
  646. }
  647. if ( cb == (*lpnSrcSize - MAX_CHAR_SIZE +1 )) // if conversion problem isn't at the end of the string
  648. cb = *lpnSrcSize ; // restore orginal value
  649. *lpnDstSize = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage), 0,
  650. lpSrcStr, cb,
  651. (LPWSTR)lpDstStr, *lpnDstSize/sizeof(WCHAR) );
  652. *lpnDstSize = *lpnDstSize * sizeof(WCHAR);
  653. if ( !_cvt_count ) // save SrcSize if it is the first time conversion
  654. _nSrcSize = cb ;
  655. CheckUnicodeDataType(_dwUnicodeEncoding, lpDstStr, *lpnDstSize);
  656. // Whistler Bug#360429,
  657. // Web page could have a splitting DBCS character at the very end of the page,
  658. // To work around it, we allow one byte of dangling DBCS character.
  659. if (*lpnDstSize==0 && (cb || (cb != *lpnSrcSize && ++cb != *lpnSrcSize)))
  660. return E_FAIL ;
  661. else
  662. return S_OK ;
  663. }
  664. HRESULT CICharConverter::WindowsCodePageToInternetEncoding(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
  665. LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  666. {
  667. HRESULT hr ;
  668. // check if the conversion should go through Unicode indirectly
  669. if ( _dwConvertType & 0x10 )
  670. hr = WindowsCodePageToInternetEncodingWrap(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
  671. else
  672. {
  673. hr = DoConvertINetString(lpdwMode, FALSE, _dwWinCodePage, _dwInternetEncoding, lpSrcStr, lpnSrcSize, lpDstStr, *lpnDstSize, lpnDstSize);
  674. if ( !_cvt_count ) // save SrcSize if it is the first time conversion
  675. _nSrcSize = *lpnSrcSize ;
  676. }
  677. return hr ;
  678. }
  679. HRESULT CICharConverter::InternetEncodingToWindowsCodePage(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
  680. LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  681. {
  682. HRESULT hr ;
  683. // check if the conversion should go through Unicode indirectly
  684. if ( _dwConvertType & 0x10 )
  685. hr = InternetEncodingToWindowsCodePageWrap(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
  686. else
  687. {
  688. hr = DoConvertINetString(lpdwMode, TRUE, _dwWinCodePage, _dwInternetEncoding, lpSrcStr, lpnSrcSize, lpDstStr, *lpnDstSize, lpnDstSize);
  689. if ( !_cvt_count ) // save SrcSize if it is the first time conversion
  690. _nSrcSize = *lpnSrcSize ;
  691. }
  692. return hr ;
  693. }
  694. HRESULT CICharConverter::WindowsCodePageToInternetEncodingWrap(LPCSTR lpSrcStr, LPINT lpnSrcSize,
  695. LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  696. {
  697. int nBuffSize = 0 ;
  698. int cb = *lpnSrcSize;
  699. UINT uMax = MAX_CHAR_SIZE ;
  700. BOOL UseDefChar = FALSE ;
  701. HRESULT hr = S_OK;
  702. if ( !_cvt_count )
  703. {
  704. cb++; // pre-increment
  705. do
  706. {
  707. nBuffSize = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage),
  708. MB_ERR_INVALID_CHARS | MB_PRECOMPOSED,
  709. lpSrcStr, --cb,
  710. NULL, 0 );
  711. --uMax;
  712. } while (!nBuffSize && uMax && cb);
  713. }
  714. if ( cb == (*lpnSrcSize - MAX_CHAR_SIZE +1 )) // if conversion problem isn't at the end of the string
  715. cb = *lpnSrcSize ; // restore orginal value
  716. if (!nBuffSize) // in case there are illeage characters
  717. nBuffSize = cb ;
  718. if ( _lpInterm1Str = (LPSTR) LocalAlloc(LPTR, (nBuffSize * sizeof(WCHAR))))
  719. {
  720. nBuffSize = MultiByteToWideChar(MAPUSERDEF(_dwWinCodePage), 0,
  721. lpSrcStr, cb, (LPWSTR)_lpInterm1Str, nBuffSize );
  722. int iSrcSizeTmp = nBuffSize * sizeof(WCHAR);
  723. hr = UnicodeToMultiByteEncoding(MAPUSERDEF(_dwInternetEncoding), (LPCSTR)_lpInterm1Str, &iSrcSizeTmp,
  724. lpDstStr, lpnDstSize, dwFlag, lpFallBack);
  725. // *lpnDstSize = WideCharToMultiByte( MAPUSERDEF(_dwInternetEncoding), 0,
  726. // (LPCWSTR)_lpInterm1Str, nBuffSize, lpDstStr, *lpnDstSize, NULL, &UseDefChar );
  727. if ( !_cvt_count ) // save SrcSize if it is the first time conversion
  728. _nSrcSize = cb ;
  729. }
  730. else
  731. hr = E_FAIL;
  732. if (hr == S_OK)
  733. {
  734. if (*lpnDstSize==0 && cb)
  735. hr = E_FAIL ;
  736. else
  737. {
  738. if ( UseDefChar )
  739. return S_FALSE ;
  740. else
  741. return S_OK ;
  742. }
  743. }
  744. return hr;
  745. }
  746. HRESULT CICharConverter::InternetEncodingToWindowsCodePageWrap(LPCSTR lpSrcStr, LPINT lpnSrcSize,
  747. LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  748. {
  749. int nBuffSize = 0 ;
  750. int cb = *lpnSrcSize;
  751. UINT uMax = MAX_CHAR_SIZE ;
  752. BOOL UseDefChar = FALSE ;
  753. HRESULT hr = S_OK;
  754. if ( !_cvt_count )
  755. {
  756. cb++; // pre-increment
  757. do
  758. {
  759. nBuffSize = MultiByteToWideChar( MAPUSERDEF(_dwInternetEncoding),
  760. MB_ERR_INVALID_CHARS | MB_PRECOMPOSED,
  761. lpSrcStr, --cb,
  762. NULL, 0 );
  763. --uMax;
  764. } while (!nBuffSize && uMax && cb);
  765. }
  766. if ( cb == (*lpnSrcSize - MAX_CHAR_SIZE +1 )) // if conversion problem isn't at the end of the string
  767. cb = *lpnSrcSize ; // restore orginal value
  768. if (!nBuffSize) // in case there are illeage characters
  769. nBuffSize = cb ;
  770. if ( _lpInterm1Str = (LPSTR) LocalAlloc(LPTR,nBuffSize * sizeof (WCHAR) ))
  771. {
  772. nBuffSize = MultiByteToWideChar( MAPUSERDEF(_dwInternetEncoding), 0,
  773. lpSrcStr, cb, (LPWSTR)_lpInterm1Str, nBuffSize );
  774. int iSrcSizeTmp = nBuffSize * sizeof(WCHAR);
  775. hr = UnicodeToMultiByteEncoding(MAPUSERDEF(_dwWinCodePage), (LPCSTR)_lpInterm1Str, &iSrcSizeTmp,
  776. lpDstStr, lpnDstSize, dwFlag, lpFallBack);
  777. // *lpnDstSize = WideCharToMultiByte( MAPUSERDEF(_dwWinCodePage), 0,
  778. // (LPCWSTR)_lpInterm1Str, nBuffSize, lpDstStr, *lpnDstSize, NULL, &UseDefChar );
  779. if ( !_cvt_count ) // save SrcSize if it is the first time conversion
  780. _nSrcSize = cb ;
  781. }
  782. else
  783. hr = E_FAIL;
  784. if (hr == S_OK)
  785. {
  786. if (*lpnDstSize==0 && cb)
  787. hr = E_FAIL ;
  788. else
  789. {
  790. if ( UseDefChar )
  791. return S_FALSE ;
  792. else
  793. return S_OK ;
  794. }
  795. }
  796. return hr;
  797. }
  798. HRESULT CICharConverter::ConvertIWUU(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
  799. LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  800. {
  801. int nBuffSize = 0 ;
  802. HRESULT hr = S_OK ;
  803. HRESULT hrWarnings = S_OK ;
  804. // InternetEncodingToWindowsCodePage
  805. if ( _dwConvertType % 2 && _dwConvertType < 21 ) /* start from Internet Encoding */
  806. {
  807. if ( _dwConvertType == 5 || _dwConvertType == 9 ) /* use interm buffer */
  808. {
  809. hr = InternetEncodingToWindowsCodePage(lpdwMode, lpSrcStr, lpnSrcSize, NULL, &nBuffSize, dwFlag, lpFallBack);
  810. if ( _lpInterm1Str = (LPSTR) LocalAlloc(LPTR,nBuffSize) )
  811. {
  812. hr = InternetEncodingToWindowsCodePage(lpdwMode, lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize, dwFlag, lpFallBack);
  813. lpSrcStr = _lpInterm1Str ;
  814. *lpnSrcSize = nBuffSize ;
  815. }
  816. else
  817. goto fail ;
  818. }
  819. else
  820. hr = InternetEncodingToWindowsCodePage(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
  821. _cvt_count ++ ;
  822. }
  823. if ( hr != S_OK )
  824. hrWarnings = hr ;
  825. // WindowsCodePageToUnicode or InternetEncodingToUnicode
  826. if ( _dwConvertType == 21 || _dwConvertType == 25 )
  827. {
  828. if ( _dwConvertType == 21 )
  829. hr = InternetEncodingToUnicode(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize);
  830. else // _dwConvertType == 25
  831. {
  832. hr = InternetEncodingToUnicode(lpSrcStr, lpnSrcSize, NULL, &nBuffSize);
  833. if ( _lpInterm1Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) )
  834. {
  835. hr = InternetEncodingToUnicode(lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize);
  836. lpSrcStr = _lpInterm1Str ;
  837. *lpnSrcSize = nBuffSize ;
  838. }
  839. else
  840. goto fail ;
  841. }
  842. _cvt_count ++ ;
  843. }
  844. else if ( _dwConvertType >= 4 && _dwConvertType <= 10 )
  845. {
  846. if ( _dwConvertType > 8 )
  847. {
  848. nBuffSize = 0 ;
  849. hr = WindowsCodePageToUnicode(lpSrcStr, lpnSrcSize, NULL, &nBuffSize);
  850. if ( _cvt_count )
  851. {
  852. if ( _lpInterm2Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) )
  853. {
  854. hr = WindowsCodePageToUnicode(lpSrcStr, lpnSrcSize, _lpInterm2Str, &nBuffSize);
  855. lpSrcStr = _lpInterm2Str ;
  856. *lpnSrcSize = nBuffSize ;
  857. }
  858. else
  859. goto fail ;
  860. }
  861. else
  862. {
  863. if ( _lpInterm1Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) )
  864. {
  865. hr = WindowsCodePageToUnicode(lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize);
  866. lpSrcStr = _lpInterm1Str ;
  867. *lpnSrcSize = nBuffSize ;
  868. }
  869. else
  870. goto fail ;
  871. }
  872. }
  873. else
  874. hr = WindowsCodePageToUnicode(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize);
  875. _cvt_count ++ ;
  876. }
  877. if ( hr != S_OK )
  878. hrWarnings = hr ;
  879. // UnicodeToUTF78
  880. if ( _dwConvertType & 0x08 )
  881. #ifndef UNIX
  882. hr = UnicodeToUTF78(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize);
  883. #else
  884. {
  885. /* we now hack the lpSrcStr to be the same as 2 byte Unicode so mlang
  886. * lowlevel code can work right.
  887. */
  888. LPWSTR lpwSrcStr = (LPWSTR)lpSrcStr;
  889. INT tmpSize = *lpnSrcSize/sizeof(WCHAR);
  890. UCHAR *pTmp = new UCHAR[(tmpSize+1)*2];
  891. if(pTmp) {
  892. for(int i = 0; i < tmpSize; i++) {
  893. pTmp[i*2] = *lpwSrcStr++;
  894. pTmp[i*2+1] = 0x00;
  895. }
  896. pTmp[i*2] = pTmp[i*2+1] = 0x00;
  897. tmpSize *= 2;
  898. hr = UnicodeToUTF78(lpdwMode, (LPCSTR)pTmp, &tmpSize, lpDstStr, lpnDstSize);
  899. }
  900. else
  901. hr = E_FAIL;
  902. delete [] pTmp;
  903. }
  904. #endif /* UNIX */
  905. return ( hr == S_OK ? hrWarnings : hr ) ;
  906. fail :
  907. return E_FAIL ;
  908. }
  909. HRESULT CICharConverter::ConvertUUWI(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
  910. LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  911. {
  912. int nBuffSize = 0 ;
  913. HRESULT hr = S_OK ;
  914. HRESULT hrWarnings = S_OK ;
  915. // UTF78ToUnicode
  916. if ( _dwConvertType & 0x08 )
  917. {
  918. if ( _dwConvertType == 12 ) /* convert UTF78 -> Unicode only */
  919. hr = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize);
  920. else /* use interm buffer, type = 10 or 9 */
  921. {
  922. hr = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, NULL, &nBuffSize);
  923. if ( _lpInterm1Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) )
  924. {
  925. hr = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize);
  926. lpSrcStr = _lpInterm1Str ;
  927. *lpnSrcSize = nBuffSize ;
  928. }
  929. else
  930. goto fail ;
  931. }
  932. _cvt_count ++ ;
  933. }
  934. if ( hr != S_OK )
  935. hrWarnings = hr ;
  936. // UnicodeToWindowsCodePage or UnicodeToInternetEncoding
  937. if ( _dwConvertType == 21 || _dwConvertType == 25 )
  938. {
  939. hr = UnicodeToInternetEncoding(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
  940. _cvt_count ++ ;
  941. }
  942. else if ( _dwConvertType >= 4 && _dwConvertType <= 10 )
  943. {
  944. if ( _dwConvertType % 2 ) /* use interm buffer */
  945. {
  946. nBuffSize = 0 ;
  947. hr = UnicodeToWindowsCodePage(lpSrcStr, lpnSrcSize, NULL, &nBuffSize, dwFlag, lpFallBack);
  948. if ( _cvt_count )
  949. {
  950. if ( _lpInterm2Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) )
  951. {
  952. hr = UnicodeToWindowsCodePage(lpSrcStr, lpnSrcSize, _lpInterm2Str, &nBuffSize, dwFlag, lpFallBack);
  953. lpSrcStr = _lpInterm2Str ;
  954. *lpnSrcSize = nBuffSize ;
  955. }
  956. else
  957. goto fail ;
  958. }
  959. else
  960. {
  961. if ( _lpInterm1Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) )
  962. {
  963. hr = UnicodeToWindowsCodePage(lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize, dwFlag, lpFallBack);
  964. lpSrcStr = _lpInterm1Str ;
  965. *lpnSrcSize = nBuffSize ;
  966. }
  967. else
  968. goto fail ;
  969. }
  970. }
  971. else
  972. hr = UnicodeToWindowsCodePage(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
  973. _cvt_count ++ ;
  974. }
  975. if ( hr != S_OK )
  976. hrWarnings = hr ;
  977. // WindowsCodePageToInternetEncoding
  978. if ( _dwConvertType % 2 && _dwConvertType < 21 )
  979. hr = WindowsCodePageToInternetEncoding(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
  980. return ( hr == S_OK ? hrWarnings : hr ) ;
  981. fail :
  982. return E_FAIL ;
  983. }
  984. #if 0
  985. struct CODEPAGEINFO
  986. {
  987. UINT uCodePage ;
  988. CP_STATE nCP_State ; // whether this is a valid windows codepage ?
  989. };
  990. // ValidCodepageInfo is used to cache whether a codepage is a vaild code
  991. // It uses circular-FIFO cache algorithm
  992. #define MAX_CP_CACHE 32
  993. static int cp_cache_count = 0 ;
  994. static int cp_cache_ptr = 0 ;
  995. static struct CODEPAGEINFO ValidCodepageInfo[MAX_CP_CACHE];
  996. // ValidCodepageInfo is used to cache whether a codepage is a vaild codepage
  997. // It uses circular-FIFO cache algorithm
  998. BOOL CheckIsValidCodePage (UINT uCodePage)
  999. {
  1000. if ( uCodePage == 50000 ) // User defined
  1001. return TRUE ;
  1002. int i ;
  1003. BOOL bRet ;
  1004. for ( i = 0 ; i < cp_cache_count ; i++ )
  1005. {
  1006. if ( uCodePage == ValidCodepageInfo[i].uCodePage )
  1007. {
  1008. if ( ValidCodepageInfo[i].nCP_State == VALID_CP )
  1009. return TRUE ;
  1010. else
  1011. return FALSE ;
  1012. }
  1013. }
  1014. // not found, call IsValidCodePage and cache the return value
  1015. bRet = IsValidCodePage(uCodePage);
  1016. EnterCriticalSection(&g_cs);
  1017. ValidCodepageInfo[cp_cache_ptr].uCodePage = uCodePage ;
  1018. if (bRet)
  1019. ValidCodepageInfo[cp_cache_ptr].nCP_State = VALID_CP ;
  1020. else
  1021. ValidCodepageInfo[cp_cache_ptr].nCP_State = INVALID_CP ;
  1022. if ( cp_cache_count < MAX_CP_CACHE )
  1023. cp_cache_count++ ;
  1024. cp_cache_ptr = ( ++cp_cache_ptr ) % MAX_CP_CACHE ;
  1025. LeaveCriticalSection(&g_cs);
  1026. return bRet ;
  1027. }
  1028. #endif
  1029. /*
  1030. Conversion Flag:
  1031. Bit 7 - Convert Direction.
  1032. Bit 4 (16) - Unicode <-> Internet Encoding
  1033. Bit 3 (8) - UTF8, UTF7
  1034. Bit 2 (4) - Unicode
  1035. Bit 1 (2) - Windows CodePage
  1036. Bit 0 (1) - Internet Encoding
  1037. 12, 6, 3 (19) - one step convert
  1038. 10, 5 (21) - two steps convert
  1039. 9 (25) - three steps convert
  1040. */
  1041. int GetWindowsEncodingIndex(DWORD dwEncoding)
  1042. {
  1043. int nr = sizeof (aEncodingInfo) / sizeof(ENCODINGINFO) ;
  1044. int i, half = nr / 2, index = -1 ;
  1045. if (aEncodingInfo[half].dwEncoding > dwEncoding )
  1046. {
  1047. for ( i = 0 ; i < half ; i++ )
  1048. if (aEncodingInfo[i].dwEncoding == dwEncoding )
  1049. index = i ;
  1050. }
  1051. else if (aEncodingInfo[half].dwEncoding < dwEncoding )
  1052. {
  1053. for ( i = half + 1 ; i < nr ; i++ )
  1054. if (aEncodingInfo[i].dwEncoding == dwEncoding )
  1055. index = i ;
  1056. }
  1057. else
  1058. index = half ;
  1059. if (index>=0) // found
  1060. {
  1061. if ( aEncodingInfo[index].nCP_State != VALID_CP &&
  1062. aEncodingInfo[index].dwCodePage )
  1063. {
  1064. if ( aEncodingInfo[index].dwCodePage == 50000 || IsValidCodePage(aEncodingInfo[index].dwCodePage ) ) // 50000 means user defined
  1065. aEncodingInfo[index].nCP_State = VALID_CP ;
  1066. else
  1067. aEncodingInfo[index].nCP_State = INVALID_CP ;
  1068. if ((aEncodingInfo[index].nCP_State == VALID_CP) &&
  1069. (aEncodingInfo[index].dwFlags & CONV_CHK_NLS) &&
  1070. !IsValidCodePage(aEncodingInfo[index].dwEncoding))
  1071. aEncodingInfo[index].nCP_State = INVALID_CP ;
  1072. }
  1073. }
  1074. return index ;
  1075. }
  1076. HRESULT CICharConverter::ConvertSetup(DWORD * pdwSrcEncoding, DWORD dwDstEncoding)
  1077. {
  1078. DWORD SrcFlag = 0, DstFlag = 0 ;
  1079. int index, unknown = 0 ;
  1080. // IE bug 109708 - WEIWU 5/11/00
  1081. // Always consider US-ASCII as a valid source encoding for conversion
  1082. /*
  1083. if (*pdwSrcEncoding == CP_20127 && !IsValidCodePage(CP_20127))
  1084. *pdwSrcEncoding = CP_1252;
  1085. */
  1086. /* check source & destination encoding type */
  1087. index = GetWindowsEncodingIndex(*pdwSrcEncoding);
  1088. if ( index >=0 )
  1089. {
  1090. SrcFlag = (DWORD) aEncodingInfo[index].bTypeUUIW ;
  1091. if ( aEncodingInfo[index].dwCodePage )
  1092. {
  1093. _dwWinCodePage = (DWORD) aEncodingInfo[index].dwCodePage ;
  1094. if (aEncodingInfo[index].nCP_State == INVALID_CP )
  1095. goto fail ;
  1096. }
  1097. if ( SrcFlag & 0x08 )
  1098. _dwUTFEncoding = *pdwSrcEncoding ;
  1099. if ( SrcFlag & 0x01 )
  1100. _dwInternetEncoding = *pdwSrcEncoding ;
  1101. if ( SrcFlag & 0x04 )
  1102. _dwUnicodeEncoding = *pdwSrcEncoding ;
  1103. }
  1104. // assume it is a unknown Window Codepage
  1105. else
  1106. {
  1107. if ( !CONVERT_IS_VALIDCODEPAGE(*pdwSrcEncoding))
  1108. goto fail ;
  1109. SrcFlag = 0x02 ;
  1110. _dwWinCodePage = *pdwSrcEncoding ;
  1111. unknown ++ ;
  1112. }
  1113. index = GetWindowsEncodingIndex(dwDstEncoding);
  1114. if ( index >=0 )
  1115. {
  1116. // check if two codepages are compatiable
  1117. if ( _dwWinCodePage && aEncodingInfo[index].dwCodePage )
  1118. {
  1119. if (_dwWinCodePage != (DWORD) aEncodingInfo[index].dwCodePage )
  1120. goto fail ;
  1121. }
  1122. DstFlag = (DWORD) aEncodingInfo[index].bTypeUUIW ;
  1123. if ( aEncodingInfo[index].dwCodePage )
  1124. {
  1125. _dwWinCodePage = (DWORD) aEncodingInfo[index].dwCodePage ;
  1126. if (aEncodingInfo[index].nCP_State == INVALID_CP )
  1127. goto fail ;
  1128. }
  1129. if ( DstFlag & 0x08 )
  1130. {
  1131. if (_dwUTFEncoding)
  1132. _dwUTFEncoding2 = dwDstEncoding ;
  1133. else
  1134. _dwUTFEncoding = dwDstEncoding ;
  1135. }
  1136. if ( DstFlag & 0x01 )
  1137. _dwInternetEncoding = dwDstEncoding ;
  1138. if ( DstFlag & 0x04 )
  1139. _dwUnicodeEncoding = dwDstEncoding ;
  1140. }
  1141. // 1) First time unknown, assume it is a unknown Window Codepage
  1142. // the conversion become UTF78 <-> Unicode <-> Window Codepage
  1143. // 2) Second time unknown, assume it is a unknown Internet Encoding
  1144. // the conversion become Windows Codepage <-> Unicode <-> Internet Encoding
  1145. else
  1146. {
  1147. if ( !CONVERT_IS_VALIDCODEPAGE(dwDstEncoding))
  1148. goto fail ;
  1149. if ( unknown == 0 )
  1150. {
  1151. if ( _dwWinCodePage )
  1152. {
  1153. if (_dwWinCodePage != dwDstEncoding )
  1154. goto fail ;
  1155. }
  1156. DstFlag = 0x02 ;
  1157. _dwWinCodePage = dwDstEncoding ;
  1158. }
  1159. else
  1160. {
  1161. DstFlag = 0x11 ;
  1162. _dwInternetEncoding = dwDstEncoding ;
  1163. }
  1164. }
  1165. if ( !SrcFlag | !DstFlag )
  1166. goto fail ;
  1167. if ( SrcFlag == DstFlag && *pdwSrcEncoding != dwDstEncoding && ( 4 != SrcFlag ) && ( 8 != SrcFlag ))
  1168. goto fail ;
  1169. _dwConvertType = SrcFlag | DstFlag ;
  1170. _bConvertDirt = ( SrcFlag & 0x0f ) > ( DstFlag & 0x0f ) ;
  1171. // if code convertor has been allocated, deallocate it
  1172. if (_hcins)
  1173. {
  1174. delete _hcins ;
  1175. _hcins = NULL ;
  1176. }
  1177. return S_OK ;
  1178. fail :
  1179. return S_FALSE ;
  1180. }
  1181. HRESULT CICharConverter::DoCodeConvert(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
  1182. LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  1183. {
  1184. HRESULT hr = S_OK ;
  1185. if ( 4 == _dwConvertType ) // CP_UCS_2 <-> CP_UCS_2_BE
  1186. {
  1187. if (!lpDstStr)
  1188. {
  1189. _nSrcSize = *lpnDstSize = *lpnSrcSize ;
  1190. }
  1191. else
  1192. {
  1193. int nSize = min(*lpnDstSize,*lpnSrcSize);
  1194. _nSrcSize = *lpnSrcSize ;
  1195. if ( lpDstStr && nSize > 0 )
  1196. {
  1197. MoveMemory(lpDstStr, lpSrcStr, nSize );
  1198. DataByteSwap(lpDstStr, nSize );
  1199. _nSrcSize = nSize ;
  1200. *lpnDstSize = nSize ;
  1201. }
  1202. }
  1203. }
  1204. else if ( 8 == _dwConvertType) // UTF7 <-> UTF8
  1205. {
  1206. if (_dwUTFEncoding == _dwUTFEncoding2)
  1207. {
  1208. _nSrcSize = *lpnDstSize = min(*lpnDstSize,*lpnSrcSize);
  1209. if (*lpnDstSize > 0)
  1210. MoveMemory(lpDstStr, lpSrcStr, *lpnDstSize);
  1211. }
  1212. else
  1213. {
  1214. int nBuffSize = 0;
  1215. // Always succeeds
  1216. hr = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, NULL, &nBuffSize);
  1217. if (_lpInterm1Str)
  1218. LocalFree(_lpInterm1Str);
  1219. if ( _lpInterm1Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) )
  1220. {
  1221. DWORD dwTmpEncoding = _dwUTFEncoding;
  1222. int nTmpSrcSize;
  1223. hr = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize);
  1224. _dwUTFEncoding = _dwUTFEncoding2 ;
  1225. nTmpSrcSize = _nSrcSize;
  1226. // We don't need to create another dwMode since only UTF7 conversion needs it
  1227. hr = UnicodeToUTF78(lpdwMode, _lpInterm1Str, &nBuffSize, lpDstStr, lpnDstSize);
  1228. _nSrcSize = nTmpSrcSize;
  1229. _dwUTFEncoding = dwTmpEncoding ;
  1230. }
  1231. else
  1232. hr = E_OUTOFMEMORY;
  1233. }
  1234. }
  1235. else if ( _bConvertDirt )
  1236. hr = ConvertUUWI(lpdwMode, lpSrcStr,lpnSrcSize,lpDstStr,lpnDstSize, dwFlag, lpFallBack);
  1237. else
  1238. hr = ConvertIWUU(lpdwMode, lpSrcStr,lpnSrcSize,lpDstStr,lpnDstSize, dwFlag, lpFallBack);
  1239. return hr ;
  1240. }
  1241. BOOL CICharConverter::ConvertCleanUp()
  1242. {
  1243. if (_lpInterm1Str)
  1244. {
  1245. LocalFree(_lpInterm1Str);
  1246. _lpInterm1Str = NULL ;
  1247. }
  1248. if (_lpInterm2Str)
  1249. {
  1250. LocalFree(_lpInterm2Str);
  1251. _lpInterm2Str = NULL ;
  1252. }
  1253. if (_lpUnicodeStr)
  1254. {
  1255. LocalFree(_lpUnicodeStr);
  1256. _lpUnicodeStr = NULL ;
  1257. }
  1258. _cvt_count = 0 ;
  1259. _nSrcSize = 0 ;
  1260. return TRUE ;
  1261. }
  1262. CICharConverter::CICharConverter()
  1263. {
  1264. _lpInterm1Str = NULL ;
  1265. _lpInterm2Str = NULL ;
  1266. _lpUnicodeStr = NULL ;
  1267. _hcins = NULL ;
  1268. _cvt_count = 0 ;
  1269. _dwWinCodePage = 0;
  1270. _dwInternetEncoding = 0;
  1271. _dwUTFEncoding = 0;
  1272. _dwUTFEncoding2 = 0;
  1273. _dwUnicodeEncoding = 0;
  1274. _dwConvertType = 0;
  1275. _nSrcSize = 0 ;
  1276. _hcins_dst = 0 ;
  1277. return ;
  1278. }
  1279. CICharConverter::CICharConverter(DWORD dwFlag, WCHAR *lpFallBack)
  1280. {
  1281. _lpInterm1Str = NULL ;
  1282. _lpInterm2Str = NULL ;
  1283. _lpUnicodeStr = NULL ;
  1284. _hcins = NULL ;
  1285. _cvt_count = 0 ;
  1286. _dwWinCodePage = 0;
  1287. _dwInternetEncoding = 0;
  1288. _dwUTFEncoding = 0;
  1289. _dwUTFEncoding2 = 0;
  1290. _dwUnicodeEncoding = 0;
  1291. _dwConvertType = 0;
  1292. _nSrcSize = 0 ;
  1293. _hcins_dst = 0 ;
  1294. _dwFlag = dwFlag;
  1295. _lpFallBack = lpFallBack;
  1296. return ;
  1297. }
  1298. CICharConverter::~CICharConverter()
  1299. {
  1300. if (_lpInterm1Str)
  1301. {
  1302. LocalFree(_lpInterm1Str);
  1303. _lpInterm1Str = NULL ;
  1304. }
  1305. if (_lpInterm2Str)
  1306. {
  1307. LocalFree(_lpInterm2Str);
  1308. _lpInterm2Str = NULL ;
  1309. }
  1310. if (_lpUnicodeStr)
  1311. {
  1312. LocalFree(_lpUnicodeStr);
  1313. _lpUnicodeStr = NULL ;
  1314. }
  1315. if (_hcins)
  1316. {
  1317. delete _hcins ;
  1318. _hcins = NULL ;
  1319. }
  1320. }
  1321. CICharConverter::CICharConverter(DWORD dwSrcEncoding, DWORD dwDstEncoding)
  1322. {
  1323. _lpInterm1Str = NULL ;
  1324. _lpInterm2Str = NULL ;
  1325. _lpUnicodeStr = NULL ;
  1326. _hcins = NULL ;
  1327. _cvt_count = 0 ;
  1328. _dwWinCodePage = 0;
  1329. _dwInternetEncoding = 0;
  1330. _dwUTFEncoding = 0;
  1331. _dwUTFEncoding2 = 0;
  1332. _dwUnicodeEncoding = 0;
  1333. _dwConvertType = 0;
  1334. _nSrcSize = 0 ;
  1335. _hcins_dst = 0 ;
  1336. ConvertSetup(&dwSrcEncoding,dwDstEncoding);
  1337. return ;
  1338. }
  1339. HRESULT WINAPI IsConvertINetStringAvailable(DWORD dwSrcEncoding, DWORD dwDstEncoding)
  1340. {
  1341. HRESULT hr;
  1342. CICharConverter * INetConvert = new CICharConverter ;
  1343. if (!INetConvert)
  1344. return E_OUTOFMEMORY;
  1345. hr = INetConvert->ConvertSetup(&dwSrcEncoding, dwDstEncoding);
  1346. delete INetConvert;
  1347. return hr ;
  1348. }
  1349. #define DETECTION_BUFFER_NUM 3
  1350. // In CP_AUTO and detection result is UTF7 case, private converter might use high word of *lpdwMode to store internal data, but we need
  1351. // to use it to notify Trident the detection result, currently, we bias to returning correct detection result.
  1352. // This is currently by design. If we get a change to re-prototype conversion object, we can resovle this issue
  1353. HRESULT WINAPI ConvertINetStringEx(LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  1354. {
  1355. CICharConverter * INetConvert;
  1356. int nSrcSize;
  1357. int nDstSize;
  1358. DWORD dwMode = 0 ;
  1359. // dwDetectResult
  1360. // CP_UNDEFINED :Fail to detect
  1361. // 0 :Not a auto-detect scenario
  1362. // Others :Detected encoding
  1363. DWORD dwDetectResult = CP_UNDEFINED;
  1364. HRESULT hr ;
  1365. if(lpnSrcSize)
  1366. {
  1367. nSrcSize = *lpnSrcSize;
  1368. }
  1369. else
  1370. nSrcSize = -1;
  1371. if ( lpSrcStr && nSrcSize == -1 ) // Get length of lpSrcStr if not given, assuming lpSrcStr is a zero terminate string.
  1372. {
  1373. if ( dwSrcEncoding == CP_UCS_2 )
  1374. nSrcSize = (lstrlenW((WCHAR*)lpSrcStr) << 1) ;
  1375. else
  1376. nSrcSize = lstrlenA(lpSrcStr) ;
  1377. }
  1378. // If there is nothing need to be converted, we return S_OK;
  1379. if (!nSrcSize || !lpSrcStr)
  1380. {
  1381. if (lpnDstSize)
  1382. *lpnDstSize = 0;
  1383. return S_OK;
  1384. }
  1385. INetConvert = new CICharConverter(dwFlag, lpFallBack) ;
  1386. if (!INetConvert)
  1387. return E_OUTOFMEMORY;
  1388. // ASSERT(CP_AUTO != dwDstEncoding);
  1389. // if null specified at dst buffer we'll get the size of required buffer.
  1390. if(!lpDstStr)
  1391. nDstSize = 0;
  1392. else if (lpnDstSize)
  1393. nDstSize = *lpnDstSize;
  1394. else
  1395. nDstSize = 0;
  1396. if (lpdwMode)
  1397. dwMode = *lpdwMode ;
  1398. // In real world, clients uses 28591 as 1252, 28599 as 1254,
  1399. // To correctly convert those extended characters to Unicode,
  1400. // We internally replace it with 1252
  1401. if (dwDstEncoding == CP_UCS_2 || dwDstEncoding == CP_UCS_2_BE)
  1402. {
  1403. if ((dwSrcEncoding == CP_ISO_8859_1) && _IsValidCodePage(CP_1252))
  1404. dwSrcEncoding = CP_1252;
  1405. if ((dwSrcEncoding == CP_ISO_8859_9) && _IsValidCodePage(CP_1254))
  1406. dwSrcEncoding = CP_1254;
  1407. }
  1408. if ((dwDstEncoding == CP_1252) && (dwSrcEncoding == CP_ISO_8859_1))
  1409. {
  1410. dwSrcEncoding = CP_1252;
  1411. }
  1412. if ((dwDstEncoding == CP_1254) && (dwSrcEncoding == CP_ISO_8859_9))
  1413. {
  1414. dwSrcEncoding = CP_1254;
  1415. }
  1416. //
  1417. // Auto Detection for Japan
  1418. // Japanese user often tag their data incorrectly, so, if MLCONVCHARF_DETECTJPN specified,
  1419. // we'll do extra detection for Shift-Jis and EUC
  1420. //
  1421. if ( dwSrcEncoding == CP_JP_AUTO ||
  1422. ((dwFlag & MLCONVCHARF_DETECTJPN) &&
  1423. (dwSrcEncoding == CP_JPN_SJ || dwSrcEncoding == CP_EUC_JP))) // Auto Detection for Japan
  1424. {
  1425. CIncdJapanese DetectJapan(dwSrcEncoding);
  1426. UINT uiCodePage ;
  1427. uiCodePage = ( dwMode >> 16 ) & 0xffff ;
  1428. if ( uiCodePage )
  1429. {
  1430. dwSrcEncoding = uiCodePage ;
  1431. dwDetectResult = 0;
  1432. }
  1433. else
  1434. {
  1435. dwSrcEncoding = DetectJapan.DetectStringA(lpSrcStr, nSrcSize);
  1436. // if dwSrcEncoding is zero means there is an ambiguity, we don't return
  1437. // the detected codepage to caller, instead we defaut its codepage internally
  1438. // to SJIS
  1439. if (dwSrcEncoding)
  1440. {
  1441. dwDetectResult = dwSrcEncoding << 16 ;
  1442. }
  1443. else
  1444. dwSrcEncoding = CP_JPN_SJ;
  1445. }
  1446. }
  1447. // bug #43190, we auto-detect again for euc-kr page because IMN ver 1.0
  1448. // mislabel an ISO-KR page as a ks_c_5601-1987 page. This is the only way
  1449. // we can fix that mistake.
  1450. else if ( dwSrcEncoding == CP_KR_AUTO || dwSrcEncoding == CP_KOR_5601 ||
  1451. dwSrcEncoding == CP_EUC_KR )
  1452. {
  1453. CIncdKorean DetectKorean;
  1454. UINT uiCodePage ;
  1455. uiCodePage = ( dwMode >> 16 ) & 0xffff ;
  1456. if ( uiCodePage )
  1457. {
  1458. dwSrcEncoding = uiCodePage ;
  1459. dwDetectResult = 0;
  1460. }
  1461. else
  1462. {
  1463. dwSrcEncoding = DetectKorean.DetectStringA(lpSrcStr, nSrcSize);
  1464. if (dwSrcEncoding)
  1465. {
  1466. dwDetectResult = dwSrcEncoding << 16 ;
  1467. }
  1468. else
  1469. dwSrcEncoding = CP_KOR_5601;
  1470. }
  1471. }
  1472. else if ( dwSrcEncoding == CP_AUTO ) // General Auto Detection for all code pages
  1473. {
  1474. int _nSrcSize = DETECTION_MAX_LEN < nSrcSize ? DETECTION_MAX_LEN : nSrcSize;
  1475. int nScores = DETECTION_BUFFER_NUM;
  1476. DetectEncodingInfo Encoding[DETECTION_BUFFER_NUM];
  1477. UINT uiCodePage ;
  1478. uiCodePage = ( dwMode >> 16 ) & 0xffff ;
  1479. if ( uiCodePage )
  1480. {
  1481. dwSrcEncoding = uiCodePage ;
  1482. dwDetectResult = 0;
  1483. }
  1484. else
  1485. {
  1486. dwSrcEncoding = g_uACP;
  1487. if ( S_OK == _DetectInputCodepage(MLDETECTCP_HTML, CP_AUTO, (char *)lpSrcStr, &_nSrcSize, &Encoding[0], &nScores))
  1488. {
  1489. MIMECPINFO cpInfo;
  1490. if (Encoding[0].nCodePage == CP_20127)
  1491. Encoding[0].nCodePage = dwSrcEncoding;
  1492. if (NULL != g_pMimeDatabase)
  1493. {
  1494. if (SUCCEEDED(g_pMimeDatabase->GetCodePageInfo(Encoding[0].nCodePage, 0x409, &cpInfo)) &&
  1495. (cpInfo.dwFlags & MIMECONTF_VALID))
  1496. {
  1497. dwSrcEncoding = Encoding[0].nCodePage;
  1498. dwDetectResult = dwSrcEncoding << 16 ;
  1499. }
  1500. }
  1501. }
  1502. // If we failed in general detection and system locale is Jpn, we try harder
  1503. // with our Japanese detection engine
  1504. if (dwSrcEncoding == CP_JPN_SJ && dwDetectResult == CP_UNDEFINED)
  1505. {
  1506. CIncdJapanese DetectJapan;
  1507. DWORD dwSrcEncodingJpn = DetectJapan.DetectStringA(lpSrcStr, nSrcSize);
  1508. if (dwSrcEncodingJpn)
  1509. {
  1510. // We only change conversion encoding without returnning this result to browser
  1511. // if it is in the middle of detection, this is to prevent other encodings been mis-detected as Jpn encodings.
  1512. dwSrcEncoding = dwSrcEncodingJpn;
  1513. // Set search range for end tag as 10 bytes
  1514. if (nSrcSize >= 10)
  1515. {
  1516. char szTmpStr[11] = {0};
  1517. char *lpTmpStr = szTmpStr;
  1518. MLStrCpyN(szTmpStr, (char *)&lpSrcStr[nSrcSize-10], 10);
  1519. //ToLower
  1520. while(*lpTmpStr)
  1521. {
  1522. if (*lpTmpStr >= 'A' && *lpTmpStr <= 'W')
  1523. *lpTmpStr += 0x20;
  1524. lpTmpStr++;
  1525. }
  1526. // If end of page, return this result
  1527. if (MLStrStr(szTmpStr, "</html>"))
  1528. dwDetectResult = dwSrcEncoding << 16 ;
  1529. }
  1530. }
  1531. }
  1532. //aEncodingInfo[GetWindowsEncodingIndex(CP_AUTO)].dwCodePage = dwSrcEncoding;
  1533. }
  1534. }
  1535. else
  1536. {
  1537. // Not a auto-detect scenario
  1538. dwDetectResult = 0;
  1539. }
  1540. if ( S_OK == ( hr = INetConvert->ConvertSetup(&dwSrcEncoding,dwDstEncoding )))
  1541. {
  1542. if ( dwSrcEncoding != dwDstEncoding )
  1543. {
  1544. // if high word of dwMode is CP_UTF_7, it must be detection result, don't pass it to UTF7 converter
  1545. if ( dwSrcEncoding == CP_UTF_7 && (dwMode >> 16) == CP_UTF_7)
  1546. dwMode &= 0xFFFF;
  1547. // ASSERT(!((IS_ENCODED_ENCODING(dwSrcEncoding) || IS_ENCODED_ENCODING(dwDstEncoding)) && (NULL == lpdwMode)));
  1548. hr = INetConvert->DoCodeConvert(&dwMode, lpSrcStr, &nSrcSize, lpDstStr, &nDstSize, dwFlag, lpFallBack);
  1549. // return the number of bytes processed for the source.
  1550. if (lpnSrcSize)
  1551. *lpnSrcSize = INetConvert->_nSrcSize ;
  1552. INetConvert->ConvertCleanUp();
  1553. }
  1554. else
  1555. {
  1556. int nSize, i ;
  1557. hr = S_OK ;
  1558. BOOL bLeadByte = FALSE ;
  1559. // only check for windows codepage
  1560. if ( INetConvert->_dwConvertType == 02 && lpSrcStr )
  1561. {
  1562. for ( i=0; i<nSrcSize; i++)
  1563. {
  1564. if (bLeadByte)
  1565. bLeadByte = FALSE ;
  1566. else if (IsDBCSLeadByteEx(dwSrcEncoding,lpSrcStr[i]))
  1567. bLeadByte = TRUE ;
  1568. }
  1569. if (bLeadByte)
  1570. nSrcSize-- ;
  1571. }
  1572. // set input size
  1573. if (lpnSrcSize)
  1574. *lpnSrcSize = nSrcSize ;
  1575. // set output size and copy if we need to
  1576. if (lpDstStr && *lpnDstSize)
  1577. {
  1578. nSize = min(*lpnDstSize,nSrcSize);
  1579. MoveMemory(lpDstStr, lpSrcStr, nSize);
  1580. nDstSize = nSize ;
  1581. }
  1582. else
  1583. nDstSize = nSrcSize ;
  1584. }
  1585. }
  1586. else
  1587. nDstSize = 0 ;
  1588. delete INetConvert;
  1589. // return the number of bytes copied for the destination,
  1590. if (lpnDstSize)
  1591. *lpnDstSize = nDstSize;
  1592. if (lpdwMode && lpDstStr)
  1593. {
  1594. if (dwDetectResult) // CP_AUTO conversion
  1595. {
  1596. dwMode &= 0xFFFF; // Clear HIGHWORD in case private converter set it
  1597. // If we have detection result, return it in HIGHWORD
  1598. // in the case of UTF7 conversion, private converter might use high word to store internal data,
  1599. // this will conflict with our logic of returning detection result in high word, it is a design flaw,
  1600. // currently, we ignore conversion setting and give detection result more priority
  1601. if (dwDetectResult != CP_UNDEFINED)
  1602. dwMode |= dwDetectResult;
  1603. }
  1604. *lpdwMode = dwMode ;
  1605. }
  1606. return hr ;
  1607. }
  1608. // We already published this API, keep it for backward compatibility
  1609. HRESULT WINAPI ConvertINetReset(void)
  1610. {
  1611. // Always suceed
  1612. return S_OK ;
  1613. }
  1614. HRESULT WINAPI ConvertINetMultiByteToUnicodeEx(LPDWORD lpdwMode, DWORD dwEncoding, LPCSTR lpSrcStr, LPINT lpnMultiCharCount, LPWSTR lpDstStr, LPINT lpnWideCharCount, DWORD dwFlag, WCHAR *lpFallBack)
  1615. {
  1616. HRESULT hr ;
  1617. int nByteCountSize = 0;
  1618. if (lpnWideCharCount)
  1619. {
  1620. nByteCountSize = *lpnWideCharCount * sizeof(WCHAR);
  1621. }
  1622. #ifdef UNIX
  1623. int saved_nByteCountSize = nByteCountSize;
  1624. #endif /* UNIX */
  1625. hr = ConvertINetStringEx(lpdwMode,dwEncoding, CP_UCS_2, lpSrcStr, lpnMultiCharCount, (LPSTR)lpDstStr, &nByteCountSize, dwFlag, lpFallBack) ;
  1626. #ifdef UNIX
  1627. if(dwEncoding == 1200 || dwEncoding == 65000 || dwEncoding == 65001 ||
  1628. (dwEncoding == 50001 && !_IsValidCodePage(dwEncoding)) )
  1629. {
  1630. /*
  1631. * On unix we need to convert the little endian mode 2 byte unicode
  1632. * format to unix mode 4 byte wChars.
  1633. */
  1634. if(lpDstStr && (saved_nByteCountSize < (nByteCountSize/2)*sizeof(WCHAR)))
  1635. hr = E_FAIL;
  1636. else
  1637. {
  1638. /*
  1639. * Use a temporary array to do the 2byte -> 4byte conversion
  1640. */
  1641. LPSTR pTmp = (LPSTR) lpDstStr;
  1642. LPWSTR pw4 = NULL;
  1643. if(pTmp) /* allocate only if we have a lpDstStr */
  1644. pw4 = new WCHAR[nByteCountSize/2];
  1645. if(pw4)
  1646. {
  1647. int i = 0;
  1648. LPWSTR pw4Tmp = pw4;
  1649. for(; i < nByteCountSize/2; i++)
  1650. *pw4Tmp++ = (UCHAR)pTmp[i*2];
  1651. pw4Tmp = pw4;
  1652. for(i = 0; i < nByteCountSize/2; i++)
  1653. *lpDstStr++ = *pw4Tmp++;
  1654. }
  1655. if(!pw4 && pTmp) /* if lpDstStr and allocate fails bail out */
  1656. hr = E_FAIL;
  1657. delete [] pw4;
  1658. }
  1659. nByteCountSize *= 2; // Expand twice as we have 4 byte wchars.
  1660. }
  1661. #endif
  1662. *lpnWideCharCount = nByteCountSize / sizeof(WCHAR);
  1663. return hr ;
  1664. }
  1665. HRESULT WINAPI ConvertINetUnicodeToMultiByteEx(LPDWORD lpdwMode, DWORD dwEncoding, LPCWSTR lpSrcStr, LPINT lpnWideCharCount, LPSTR lpDstStr, LPINT lpnMultiCharCount, DWORD dwFlag, WCHAR *lpFallBack)
  1666. {
  1667. HRESULT hr ;
  1668. int nByteCountSize=-1;
  1669. if(lpnWideCharCount && *lpnWideCharCount != -1)
  1670. nByteCountSize = *lpnWideCharCount * sizeof(WCHAR);
  1671. hr = ConvertINetStringEx(lpdwMode,CP_UCS_2, dwEncoding, (LPCSTR) lpSrcStr, &nByteCountSize, lpDstStr, lpnMultiCharCount, dwFlag, lpFallBack);
  1672. #ifdef UNIX
  1673. if(dwEncoding == 1200 || dwEncoding == 65000 || dwEncoding == 65001) {
  1674. nByteCountSize *= 2; // Expand twice as we have 4 byte wchars.
  1675. }
  1676. #endif /* UNIX */
  1677. if (lpnWideCharCount)
  1678. *lpnWideCharCount = nByteCountSize / sizeof(WCHAR);
  1679. return hr ;
  1680. }
  1681. HRESULT WINAPI ConvertINetString(LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize)
  1682. {
  1683. HRESULT hr ;
  1684. hr = ConvertINetStringEx(lpdwMode,dwSrcEncoding,dwDstEncoding,lpSrcStr,lpnSrcSize,lpDstStr,lpnDstSize, 0, NULL);
  1685. return hr ;
  1686. }
  1687. HRESULT WINAPI ConvertINetUnicodeToMultiByte(LPDWORD lpdwMode, DWORD dwEncoding, LPCWSTR lpSrcStr, LPINT lpnWideCharCount, LPSTR lpDstStr, LPINT lpnMultiCharCount)
  1688. {
  1689. HRESULT hr ;
  1690. DWORD dwFlag = 0 ;
  1691. if ( lpdwMode )
  1692. dwFlag |= ( *lpdwMode & 0x00008000 ) ? MLCONVCHARF_ENTITIZE : 0 ;
  1693. hr = ConvertINetUnicodeToMultiByteEx(lpdwMode,dwEncoding,lpSrcStr,lpnWideCharCount,lpDstStr,lpnMultiCharCount,dwFlag,NULL);
  1694. return hr ;
  1695. }
  1696. HRESULT WINAPI ConvertINetMultiByteToUnicode(LPDWORD lpdwMode, DWORD dwEncoding, LPCSTR lpSrcStr, LPINT lpnMultiCharCount, LPWSTR lpDstStr, LPINT lpnWideCharCount)
  1697. {
  1698. HRESULT hr ;
  1699. hr = ConvertINetMultiByteToUnicodeEx(lpdwMode,dwEncoding,lpSrcStr,lpnMultiCharCount,lpDstStr,lpnWideCharCount, 0, NULL);
  1700. return hr ;
  1701. }
  1702. #define STR_BUFFER_SIZE 2048
  1703. HRESULT _ConvertINetStringInIStream(CICharConverter * INetConvert, LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, IStream *pstmIn, IStream *pstmOut, DWORD dwFlag, WCHAR *lpFallBack)
  1704. {
  1705. DWORD dwMode, dwModeTemp ;
  1706. HRESULT hr= S_OK, hrWarnings=S_OK;
  1707. LPSTR lpstrIn = NULL, lpstrOut = NULL;
  1708. ULONG nSrcSize, nSrcUsed, nSrcLeft, nDstSize, _nDstSize, nOutBuffSize ;
  1709. if (lpdwMode)
  1710. dwMode = *lpdwMode ;
  1711. // allocate a temp input buffer - 2K in size
  1712. if ( (lpstrIn = (LPSTR) LocalAlloc(LPTR, STR_BUFFER_SIZE )) == NULL )
  1713. {
  1714. hrWarnings = E_OUTOFMEMORY ;
  1715. goto exit;
  1716. }
  1717. if ( (lpstrOut = (LPSTR) LocalAlloc(LPTR, STR_BUFFER_SIZE * 2 )) == NULL )
  1718. {
  1719. hrWarnings = E_OUTOFMEMORY ;
  1720. goto exit;
  1721. }
  1722. nOutBuffSize = STR_BUFFER_SIZE * 2 ;
  1723. nSrcLeft = 0 ;
  1724. // In real world, clients uses 28591 as 1252, 28599 as 1254,
  1725. // To correctly convert those extended characters to Unicode,
  1726. // We internally replace it with 1252
  1727. if (dwDstEncoding == CP_UCS_2 || dwDstEncoding == CP_UCS_2_BE)
  1728. {
  1729. if ((dwSrcEncoding == CP_ISO_8859_1) && _IsValidCodePage(CP_1252))
  1730. dwSrcEncoding = CP_1252;
  1731. if ((dwSrcEncoding == CP_ISO_8859_9) && _IsValidCodePage(CP_1254))
  1732. dwSrcEncoding = CP_1254;
  1733. }
  1734. if ((dwDstEncoding == CP_1252) && (dwSrcEncoding == CP_ISO_8859_1))
  1735. {
  1736. dwSrcEncoding = CP_1252;
  1737. }
  1738. if ((dwDstEncoding == CP_1254) && (dwSrcEncoding == CP_ISO_8859_9))
  1739. {
  1740. dwSrcEncoding = CP_1254;
  1741. }
  1742. if ( dwSrcEncoding == CP_JP_AUTO ) // Auto Detection for Japan
  1743. {
  1744. CIncdJapanese DetectJapan;
  1745. UINT uiCodePage ;
  1746. LARGE_INTEGER li;
  1747. uiCodePage = ( dwMode >> 16 ) & 0xffff ;
  1748. if ( uiCodePage )
  1749. dwSrcEncoding = uiCodePage ;
  1750. else
  1751. {
  1752. LISet32(li, 0);
  1753. hr = pstmIn->Read(lpstrIn, STR_BUFFER_SIZE , &nSrcSize);
  1754. if (S_OK != hr)
  1755. hrWarnings = hr;
  1756. hr = pstmIn->Seek(li,STREAM_SEEK_SET, NULL);
  1757. if (S_OK != hr)
  1758. hrWarnings = hr;
  1759. dwSrcEncoding = DetectJapan.DetectStringA(lpstrIn, nSrcSize);
  1760. // if dwSrcEncoding is zero means there is an ambiguity, we don't return
  1761. // the detected codepage to caller, instead we defaut its codepage internally
  1762. // to SJIS
  1763. if (dwSrcEncoding)
  1764. {
  1765. dwMode &= 0x0000ffff ;
  1766. dwMode |= dwSrcEncoding << 16 ;
  1767. }
  1768. else
  1769. dwSrcEncoding = CP_JPN_SJ;
  1770. }
  1771. }
  1772. // bug #43190, we auto-detect again for euc-kr page because IMN ver 1.0
  1773. // mislabel an ISO-KR page as a ks_c_5601-1987 page. This is the only way
  1774. // we can fix that mistake.
  1775. else if ( dwSrcEncoding == CP_KR_AUTO || dwSrcEncoding == CP_KOR_5601 ||
  1776. dwSrcEncoding == CP_EUC_KR )
  1777. {
  1778. CIncdKorean DetectKorean;
  1779. UINT uiCodePage ;
  1780. LARGE_INTEGER li;
  1781. uiCodePage = ( dwMode >> 16 ) & 0xffff ;
  1782. if ( uiCodePage )
  1783. dwSrcEncoding = uiCodePage ;
  1784. else
  1785. {
  1786. LISet32(li, 0);
  1787. hr = pstmIn->Read(lpstrIn, STR_BUFFER_SIZE, &nSrcSize);
  1788. if (S_OK != hr)
  1789. hrWarnings = hr;
  1790. hr = pstmIn->Seek(li,STREAM_SEEK_SET, NULL);
  1791. if (S_OK != hr)
  1792. hrWarnings = hr;
  1793. dwSrcEncoding = DetectKorean.DetectStringA(lpstrIn, nSrcSize);
  1794. if (dwSrcEncoding)
  1795. {
  1796. dwMode &= 0x0000ffff ;
  1797. dwMode |= dwSrcEncoding << 16 ;
  1798. }
  1799. else
  1800. dwSrcEncoding = CP_KOR_5601;
  1801. }
  1802. }
  1803. else if ( dwSrcEncoding == CP_AUTO ) // General Auto Detection for all code pages
  1804. {
  1805. INT nScores = 1;
  1806. DWORD dwSrcEncoding ;
  1807. DetectEncodingInfo Encoding;
  1808. UINT uiCodePage ;
  1809. LARGE_INTEGER li;
  1810. uiCodePage = ( dwMode >> 16 ) & 0xffff ;
  1811. if ( uiCodePage )
  1812. dwSrcEncoding = uiCodePage ;
  1813. else
  1814. {
  1815. LISet32(li, 0);
  1816. hr = pstmIn->Read(lpstrIn, STR_BUFFER_SIZE , &nSrcSize);
  1817. if (S_OK != hr)
  1818. hrWarnings = hr;
  1819. hr = pstmIn->Seek(li,STREAM_SEEK_SET, NULL);
  1820. if (S_OK != hr)
  1821. hrWarnings = hr;
  1822. if (DETECTION_MAX_LEN < nSrcSize)
  1823. nSrcSize = DETECTION_MAX_LEN;
  1824. if ( S_OK == _DetectInputCodepage(MLDETECTCP_HTML, 1252, lpstrIn, (int *)&nSrcSize, &Encoding, &nScores))
  1825. {
  1826. dwSrcEncoding = Encoding.nCodePage;
  1827. dwMode &= 0x0000ffff ;
  1828. dwMode |= dwSrcEncoding << 16 ;
  1829. }
  1830. else
  1831. {
  1832. dwSrcEncoding = CP_ACP;
  1833. }
  1834. aEncodingInfo[GetWindowsEncodingIndex(CP_AUTO)].dwCodePage = dwSrcEncoding;
  1835. }
  1836. }
  1837. if ( S_OK == ( hr = INetConvert->ConvertSetup(&dwSrcEncoding,dwDstEncoding )))
  1838. {
  1839. // Loop for ever
  1840. while(1)
  1841. {
  1842. // Read a buffer
  1843. hr = pstmIn->Read(&lpstrIn[nSrcLeft], STR_BUFFER_SIZE-nSrcLeft, &nSrcSize);
  1844. if (S_OK != hr)
  1845. hrWarnings = hr;
  1846. // Done
  1847. if (0 == nSrcSize)
  1848. break;
  1849. nSrcSize += nSrcLeft ;
  1850. nSrcUsed = nSrcSize ;
  1851. dwModeTemp = dwMode ;
  1852. nDstSize = 0 ;
  1853. // get the size of output buffer
  1854. hr = INetConvert->DoCodeConvert(&dwModeTemp, (LPCSTR)lpstrIn, (LPINT)&nSrcUsed, NULL, (LPINT)&nDstSize, dwFlag, lpFallBack);
  1855. if (S_OK != hr)
  1856. hrWarnings = hr;
  1857. // Reallocate output buffer if so
  1858. if ( nDstSize > nOutBuffSize )
  1859. {
  1860. LPSTR psz = (LPSTR) LocalReAlloc(lpstrOut, nDstSize, LMEM_ZEROINIT|LMEM_MOVEABLE);
  1861. if (psz == NULL)
  1862. {
  1863. hrWarnings = E_OUTOFMEMORY ;
  1864. goto exit;
  1865. }
  1866. lpstrOut = psz;
  1867. nOutBuffSize = nDstSize ;
  1868. }
  1869. _nDstSize = nDstSize;
  1870. // Due to multi_stage conversion, this is the actual size is used
  1871. nSrcUsed = INetConvert->_nSrcSize ;
  1872. nSrcLeft = nSrcSize - nSrcUsed ;
  1873. #if 0
  1874. // restore Src size
  1875. nSrcUsed = nSrcSize ;
  1876. #endif
  1877. // do conversion
  1878. hr = INetConvert->DoCodeConvert(&dwMode, (LPCSTR)lpstrIn, (LPINT)&nSrcUsed, lpstrOut, (LPINT)&_nDstSize, dwFlag, lpFallBack);
  1879. if (S_OK != hr)
  1880. hrWarnings = hr;
  1881. // Write It
  1882. hr = pstmOut->Write(lpstrOut, nDstSize, &nDstSize);
  1883. if (S_OK != hr)
  1884. hrWarnings = hr;
  1885. if (nSrcLeft )
  1886. MoveMemory(lpstrIn, &lpstrIn[nSrcSize-nSrcLeft],nSrcLeft);
  1887. INetConvert->ConvertCleanUp();
  1888. }
  1889. }
  1890. if (nSrcLeft )
  1891. {
  1892. LARGE_INTEGER li;
  1893. LISet32(li, -(LONG)nSrcLeft );
  1894. hr = pstmIn->Seek(li,STREAM_SEEK_CUR, NULL);
  1895. }
  1896. if (lpdwMode)
  1897. *lpdwMode = dwMode ;
  1898. exit :
  1899. if (lpstrIn)
  1900. LocalFree(lpstrIn);
  1901. if (lpstrOut)
  1902. LocalFree(lpstrOut);
  1903. // Done
  1904. return (hr == S_OK) ? hrWarnings : hr;
  1905. }
  1906. HRESULT WINAPI ConvertINetStringInIStream(LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, IStream *pstmIn, IStream *pstmOut, DWORD dwFlag, WCHAR *lpFallBack)
  1907. {
  1908. HRESULT hr;
  1909. CICharConverter * INetConvert = new CICharConverter(dwFlag, lpFallBack) ;
  1910. if (!INetConvert)
  1911. return E_OUTOFMEMORY;
  1912. hr = _ConvertINetStringInIStream(INetConvert,lpdwMode,dwSrcEncoding,dwDstEncoding,pstmIn,pstmOut,dwFlag,lpFallBack);
  1913. delete INetConvert;
  1914. return hr ;
  1915. }