Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2285 lines
83 KiB

  1. #include "private.h"
  2. #include "detcbase.h"
  3. #include "codepage.h"
  4. #include "detcjpn.h"
  5. #include "detckrn.h"
  6. #include "fechrcnv.h"
  7. #include "ichrcnv.h"
  8. #include "cpdetect.h"
  9. #include <tchar.h>
  // Conversion-path selectors.  Each value is the sum of two of the category
  // bits documented above aEncodingInfo (0x08 UTF7/UTF8, 0x04 Unicode,
  // 0x02 Windows code page, 0x01 Internet encoding).
  // NOTE(review): the letter suffixes presumably name the two endpoints of the
  // conversion -- confirm against the code that switches on these values.
  #define CONV_UU     (0x08 | 0x04)   // 12
  #define CONV_UUW    (0x08 | 0x02)   // 10
  #define CONV_UUWI   (0x08 | 0x01)   //  9
  #define CONV_UW     (0x04 | 0x02)   //  6
  #define CONV_UWI    (0x04 | 0x01)   //  5
  #define CONV_WI     (0x02 | 0x01)   //  3

  // Number of probes used when trimming a possibly split trailing multibyte
  // character off an input block.
  #define MAX_CHAR_SIZE 4

  // Code page 50000 is converted as 1252; every other value passes through.
  #define MAPUSERDEF(x) (((x) == 50000) ? 1252 : (x))

  // CP_USER_DEFINED is always accepted; anything else is asked of IsValidCodePage().
  #define CONVERT_IS_VALIDCODEPAGE(x) (((x) == CP_USER_DEFINED) ? TRUE: IsValidCodePage(x))

  // ENCODINGINFO::dwFlags bit; set on the CP_20127, ISO-8859-1 and ISO-8859-15
  // rows of aEncodingInfo.
  #define CONV_CHK_NLS 0x00000001
  20. struct ENCODINGINFO
  21. {
  22. DWORD dwEncoding;
  23. DWORD dwCodePage;
  24. BYTE bTypeUUIW;
  25. CP_STATE nCP_State ; // whether this is a valid windows codepage ?
  26. DWORD dwFlags; // give us more flexibilities to handle different encodings differently
  27. };
  28. static WCHAR UniocdeSignature = { 0xFFFE } ;
  29. /*
  30. Bit 4 (16) - Unicode <-> Internet Encoding
  31. Bit 3 (8) - UTF8, UTF7
  32. Bit 2 (4) - Unicode
  33. Bit 1 (2) - Windows CodePage
  34. Bit 0 (1) - Internet Encoding
  35. P.S. if bit 4 is set, it means it should convert between Unicode and Internet
  36. Encoding directly, no intermediate step - Windows CodePage
  37. */
  38. // these codepages including Unicode need special convertor
  39. static struct ENCODINGINFO aEncodingInfo[] =
  40. {
  41. { CP_JPN_SJ, 932, 0x02, INVALID_CP, 0 }, // W-Japanese Shift JIS
  42. { CP_CHN_GB, 936, 0x02, INVALID_CP, 0 }, // W-Simplified Chinese
  43. { CP_KOR_5601, 949, 0x02, INVALID_CP, 0 }, // W-Krean Unified Hangul
  44. { CP_TWN, 950, 0x02, INVALID_CP, 0 }, // W-Traditional Chinese
  45. { CP_UCS_2, 0, 0x04, INVALID_CP, 0 }, // U-Unicode
  46. { CP_UCS_2_BE, 0, 0x04, INVALID_CP, 0 }, // U-Unicode Big Endian
  47. { CP_1252, 1252, 0x02, INVALID_CP, 0 }, // W-Latin 1
  48. { CP_20127, 1252, 0x11, INVALID_CP, CONV_CHK_NLS }, // US ASCII
  49. { CP_ISO_8859_1, 1252, 0x11, INVALID_CP, CONV_CHK_NLS }, // I-ISO 8859-1 Latin 1
  50. { CP_ISO_8859_15, 1252, 0x11, INVALID_CP, CONV_CHK_NLS }, // I-ISO 8859-1 Latin 1
  51. { CP_AUTO, 1252, 0x01, INVALID_CP, 0 }, // General auto detect
  52. { CP_ISO_2022_JP, 932, 0x01, INVALID_CP, 0 }, // I-ISO 2022-JP No Halfwidth Katakana
  53. { CP_ISO_2022_JP_ESC, 932, 0x01, INVALID_CP, 0 }, // I-ISO 2022-JP w/esc Halfwidth Katakana
  54. { CP_ISO_2022_JP_SIO, 932, 0x01, INVALID_CP, 0 }, // I-ISO 2022-JP w/sio Halfwidth Katakana
  55. { CP_ISO_2022_KR, 949, 0x01, INVALID_CP, 0 }, // I-ISO 2022-KR
  56. { CP_ISO_2022_TW, 950, 0x01, INVALID_CP, 0 }, // I-ISO 2022-TW
  57. { CP_ISO_2022_CH, 936, 0x01, INVALID_CP, 0 }, // I-ISO 2022-CH
  58. { CP_JP_AUTO, 932, 0x01, INVALID_CP, 0 }, // JP auto detect
  59. { CP_CHS_AUTO, 936, 0x01, INVALID_CP, 0 }, // Simplified Chinese auto detect
  60. { CP_KR_AUTO, 949, 0x01, INVALID_CP, 0 }, // KR auto detect
  61. { CP_CHT_AUTO, 950, 0x01, INVALID_CP, 0 }, // Traditional Chinese auto detect
  62. { CP_CYRILLIC_AUTO, 1251, 0x01, INVALID_CP, 0 }, // Cyrillic auto detect
  63. { CP_GREEK_AUTO, 1253, 0x01, INVALID_CP, 0 }, // Greek auto detect
  64. { CP_ARABIC_AUTO, 1256, 0x01, INVALID_CP, 0 }, // Arabic auto detect
  65. { CP_EUC_JP, 932, 0x01, INVALID_CP, 0 }, // EUC Japanese
  66. { CP_EUC_CH, 936, 0x01, INVALID_CP, 0 }, // EUC Chinese
  67. { CP_EUC_KR, 949, 0x01, INVALID_CP, 0 }, // EUC Korean
  68. { CP_EUC_TW, 950, 0x01, INVALID_CP, 0 }, // EUC Taiwanese
  69. { CP_CHN_HZ, 936, 0x01, INVALID_CP, 0 }, // Simplify Chinese HZ-GB
  70. { CP_UTF_7, 0, 0x08, INVALID_CP, 0 }, // U-UTF7
  71. { CP_UTF_8, 0, 0x08, INVALID_CP, 0 }, // U-UTF8
  72. };
  73. // HTML name entity table for Latin-1 Supplement - from 0x00A0-0x00FF
  74. #define NAME_ENTITY_OFFSET 0x00A0
  75. #define NAME_ENTITY_MAX 0x00FF
  76. #define NAME_ENTITY_ENTRY 96
  77. static CHAR *g_lpstrNameEntity[NAME_ENTITY_ENTRY] =
  78. {
  79. "&nbsp;", // "&#160;" -- no-break space = non-breaking space,
  80. "&iexcl;", // "&#161;" -- inverted exclamation mark, U+00A1 ISOnum -->
  81. "&cent;", // "&#162;" -- cent sign, U+00A2 ISOnum -->
  82. "&pound;", // "&#163;" -- pound sign, U+00A3 ISOnum -->
  83. "&curren;", // "&#164;" -- currency sign, U+00A4 ISOnum -->
  84. "&yen;", // "&#165;" -- yen sign = yuan sign, U+00A5 ISOnum -->
  85. "&brvbar;", // "&#166;" -- broken bar = broken vertical bar,
  86. "&sect;", // "&#167;" -- section sign, U+00A7 ISOnum -->
  87. "&uml;", // "&#168;" -- diaeresis = spacing diaeresis,
  88. "&copy;", // "&#169;" -- copyright sign, U+00A9 ISOnum -->
  89. "&ordf;", // "&#170;" -- feminine ordinal indicator, U+00AA ISOnum -->
  90. "&laquo;", // "&#171;" -- left-pointing double angle quotation mark
  91. "&not;", // "&#172;" -- not sign = discretionary hyphen,
  92. "&shy;", // "&#173;" -- soft hyphen = discretionary hyphen,
  93. "&reg;", // "&#174;" -- registered sign = registered trade mark sign,
  94. "&macr;", // "&#175;" -- macron = spacing macron = overline
  95. "&deg;", // "&#176;" -- degree sign, U+00B0 ISOnum -->
  96. "&plusmn;", // "&#177;" -- plus-minus sign = plus-or-minus sign,
  97. "&sup2;", // "&#178;" -- superscript two = superscript digit two
  98. "&sup3;", // "&#179;" -- superscript three = superscript digit three
  99. "&acute;", // "&#180;" -- acute accent = spacing acute,
  100. "&micro;", // "&#181;" -- micro sign, U+00B5 ISOnum -->
  101. "&para;", // "&#182;" -- pilcrow sign = paragraph sign,
  102. "&middot;", // "&#183;" -- middle dot = Georgian comma
  103. "&cedil;", // "&#184;" -- cedilla = spacing cedilla, U+00B8 ISOdia -->
  104. "&sup1;", // "&#185;" -- superscript one = superscript digit one,
  105. "&ordm;", // "&#186;" -- masculine ordinal indicator,
  106. "&raquo;", // "&#187;" -- right-pointing double angle quotation mark
  107. "&frac14;", // "&#188;" -- vulgar fraction one quarter
  108. "&frac12;", // "&#189;" -- vulgar fraction one half
  109. "&frac34;", // "&#190;" -- vulgar fraction three quarters
  110. "&iquest;", // "&#191;" -- inverted question mark
  111. "&Agrave;", // "&#192;" -- latin capital letter A with grave
  112. "&Aacute;", // "&#193;" -- latin capital letter A with acute,
  113. "&Acirc;", // "&#194;" -- latin capital letter A with circumflex,
  114. "&Atilde;", // "&#195;" -- latin capital letter A with tilde,
  115. "&Auml;", // "&#196;" -- latin capital letter A with diaeresis,
  116. "&Aring;", // "&#197;" -- latin capital letter A with ring above
  117. "&AElig;", // "&#198;" -- latin capital letter AE
  118. "&Ccedil;", // "&#199;" -- latin capital letter C with cedilla,
  119. "&Egrave;", // "&#200;" -- latin capital letter E with grave,
  120. "&Eacute;", // "&#201;" -- latin capital letter E with acute,
  121. "&Ecirc;", // "&#202;" -- latin capital letter E with circumflex,
  122. "&Euml;", // "&#203;" -- latin capital letter E with diaeresis,
  123. "&Igrave;", // "&#204;" -- latin capital letter I with grave,
  124. "&Iacute;", // "&#205;" -- latin capital letter I with acute,
  125. "&Icirc;", // "&#206;" -- latin capital letter I with circumflex,
  126. "&Iuml;", // "&#207;" -- latin capital letter I with diaeresis,
  127. "&ETH;", // "&#208;" -- latin capital letter ETH, U+00D0 ISOlat1 -->
  128. "&Ntilde;", // "&#209;" -- latin capital letter N with tilde,
  129. "&Ograve;", // "&#210;" -- latin capital letter O with grave,
  130. "&Oacute;", // "&#211;" -- latin capital letter O with acute,
  131. "&Ocirc;", // "&#212;" -- latin capital letter O with circumflex,
  132. "&Otilde;", // "&#213;" -- latin capital letter O with tilde,
  133. "&Ouml;", // "&#214;" -- latin capital letter O with diaeresis,
  134. "&times;", // "&#215;" -- multiplication sign, U+00D7 ISOnum -->
  135. "&Oslash;", // "&#216;" -- latin capital letter O with stroke
  136. "&Ugrave;", // "&#217;" -- latin capital letter U with grave,
  137. "&Uacute;", // "&#218;" -- latin capital letter U with acute,
  138. "&Ucirc;", // "&#219;" -- latin capital letter U with circumflex,
  139. "&Uuml;", // "&#220;" -- latin capital letter U with diaeresis,
  140. "&Yacute;", // "&#221;" -- latin capital letter Y with acute,
  141. "&THORN;", // "&#222;" -- latin capital letter THORN,
  142. "&szlig;", // "&#223;" -- latin small letter sharp s = ess-zed,
  143. "&agrave;", // "&#224;" -- latin small letter a with grave
  144. "&aacute;", // "&#225;" -- latin small letter a with acute,
  145. "&acirc;", // "&#226;" -- latin small letter a with circumflex,
  146. "&atilde;", // "&#227;" -- latin small letter a with tilde,
  147. "&auml;", // "&#228;" -- latin small letter a with diaeresis,
  148. "&aring;", // "&#229;" -- latin small letter a with ring above
  149. "&aelig;", // "&#230;" -- latin small letter ae
  150. "&ccedil;", // "&#231;" -- latin small letter c with cedilla,
  151. "&egrave;", // "&#232;" -- latin small letter e with grave,
  152. "&eacute;", // "&#233;" -- latin small letter e with acute,
  153. "&ecirc;", // "&#234;" -- latin small letter e with circumflex,
  154. "&euml;", // "&#235;" -- latin small letter e with diaeresis,
  155. "&igrave;", // "&#236;" -- latin small letter i with grave,
  156. "&iacute;", // "&#237;" -- latin small letter i with acute,
  157. "&icirc;", // "&#238;" -- latin small letter i with circumflex,
  158. "&iuml;", // "&#239;" -- latin small letter i with diaeresis,
  159. "&eth;", // "&#240;" -- latin small letter eth, U+00F0 ISOlat1 -->
  160. "&ntilde;", // "&#241;" -- latin small letter n with tilde,
  161. "&ograve;", // "&#242;" -- latin small letter o with grave,
  162. "&oacute;", // "&#243;" -- latin small letter o with acute,
  163. "&ocirc;", // "&#244;" -- latin small letter o with circumflex,
  164. "&otilde;", // "&#245;" -- latin small letter o with tilde,
  165. "&ouml;", // "&#246;" -- latin small letter o with diaeresis,
  166. "&divide;", // "&#247;" -- division sign, U+00F7 ISOnum -->
  167. "&oslash;", // "&#248;" -- latin small letter o with stroke,
  168. "&ugrave;", // "&#249;" -- latin small letter u with grave,
  169. "&uacute;", // "&#250;" -- latin small letter u with acute,
  170. "&ucirc;", // "&#251;" -- latin small letter u with circumflex,
  171. "&uuml;", // "&#252;" -- latin small letter u with diaeresis,
  172. "&yacute;", // "&#253;" -- latin small letter y with acute,
  173. "&thorn;", // "&#254;" -- latin small letter thorn with,
  174. "&yuml;", // "&#255;" -- latin small letter y with diaeresis,
  175. };
  #ifdef MORE_NAME_ENTITY // in case we decide to do more name entities later
  // Additional HTML 4.0 entity table for the CP 1252 extension range 0x80-0x9F.
  #define CP1252EXT_BASE (UINT)0x0080
  #define CP1252EXT_MAX (UINT)0x009F
  #define NONUNI 0xFFFF
  #define UNDEFCHAR "???????"
  #define CP1252EXT_NCR_SIZE 7

  // Pairs a Unicode code point with the text emitted for it.
  struct NAME_ENTITY_EXT
  {
      UWORD uwUniCode;            // Unicode code point matched against the source character
      LPCTSTR lpszNameEntity;     // replacement text (currently the decimal NCR)
  };

  // Every row stores the decimal NCR of uwUniCode; the HTML 4.0 entity name is
  // kept in the trailing comment for reference only.  CP 1252 bytes 0x81, 0x8D,
  // 0x8F, 0x90 and 0x9D are undefined and therefore have no row.
  static struct NAME_ENTITY_EXT aNameEntityExt[] =
  {
      //  UniCode  NCR            Name entity    CP1252  Comment
      { 0x20AC, "&#8364;" },  // "&euro;"      0x80    EURO SIGN
      { 0x201A, "&#8218;" },  // "&sbquo;"     0x82    SINGLE LOW-9 QUOTATION MARK
      { 0x0192, "&#0402;" },  // "&fnof;"      0x83    LATIN SMALL LETTER F WITH HOOK
      { 0x201E, "&#8222;" },  // "&bdquo;"     0x84    DOUBLE LOW-9 QUOTATION MARK
      { 0x2026, "&#8230;" },  // "&hellip;"    0x85    HORIZONTAL ELLIPSIS
      { 0x2020, "&#8224;" },  // "&dagger;"    0x86    DAGGER
      { 0x2021, "&#8225;" },  // "&Dagger;"    0x87    DOUBLE DAGGER
      { 0x02C6, "&#0710;" },  // "&circ;"      0x88    MODIFIER LETTER CIRCUMFLEX ACCENT
      { 0x2030, "&#8240;" },  // "&permil;"    0x89    PER MILLE SIGN
      { 0x0160, "&#0352;" },  // "&Scaron;"    0x8A    LATIN CAPITAL LETTER S WITH CARON
      { 0x2039, "&#8249;" },  // "&lsaquo;"    0x8B    SINGLE LEFT-POINTING ANGLE QUOTATION MARK
      { 0x0152, "&#0338;" },  // "&OElig;"     0x8C    LATIN CAPITAL LIGATURE OE
      { 0x017D, "&#0381;" },  // (none)        0x8E    LATIN CAPITAL LETTER Z WITH CARON, no name entity in HTML 4.0
      { 0x2018, "&#8216;" },  // "&lsquo;"     0x91    LEFT SINGLE QUOTATION MARK
      { 0x2019, "&#8217;" },  // "&rsquo;"     0x92    RIGHT SINGLE QUOTATION MARK
      { 0x201C, "&#8220;" },  // "&ldquo;"     0x93    LEFT DOUBLE QUOTATION MARK
      { 0x201D, "&#8221;" },  // "&rdquo;"     0x94    RIGHT DOUBLE QUOTATION MARK
      { 0x2022, "&#8226;" },  // "&bull;"      0x95    BULLET
      { 0x2013, "&#8211;" },  // "&ndash;"     0x96    EN DASH
      { 0x2014, "&#8212;" },  // "&mdash;"     0x97    EM DASH
      // SMALL TILDE is U+02DC (decimal 732, matching the NCR).  The row used to
      // carry 0x20DC, which is a different character and could never match.
      { 0x02DC, "&#0732;" },  // "&tilde;"     0x98    SMALL TILDE
      { 0x2122, "&#8482;" },  // "&trade;"     0x99    TRADE MARK SIGN
      { 0x0161, "&#0353;" },  // "&scaron;"    0x9A    LATIN SMALL LETTER S WITH CARON
      { 0x203A, "&#8250;" },  // "&rsaquo;"    0x9B    SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
      { 0x0153, "&#0339;" },  // "&oelig;"     0x9C    LATIN SMALL LIGATURE OE
      { 0x017E, "&#0382;" },  // (none)        0x9E    LATIN SMALL LETTER Z WITH CARON, no name entity in HTML 4.0
      { 0x0178, "&#0376;" },  // "&Yuml;"      0x9F    LATIN CAPITAL LETTER Y WITH DIAERESIS
  };
  #endif
  225. /******************************************************************************
  226. ***************************** U T I L I T I E S ***************************
  227. ******************************************************************************/
  228. void DataByteSwap(LPSTR DataBuf, int len )
  229. {
  230. int i ;
  231. UCHAR tmpData ;
  232. if ( len )
  233. for ( i = 0 ; i < len-1 ; i+=2 )
  234. {
  235. tmpData = DataBuf[i] ;
  236. DataBuf[i] = DataBuf[i+1] ;
  237. DataBuf[i+1] = tmpData ;
  238. }
  239. return ;
  240. }
  241. void CheckUnicodeDataType(DWORD dwDstEncoding, LPSTR DataBuf, int len )
  242. {
  243. if ( DataBuf && len )
  244. {
  245. if ( dwDstEncoding == CP_UCS_2_BE )
  246. DataByteSwap(DataBuf,len);
  247. }
  248. return ;
  249. }
  250. void CheckASCIIEncoding(DWORD dwSrcEncoding, LPSTR DataBuf, int len )
  251. {
  252. if (DataBuf && len)
  253. {
  254. if (dwSrcEncoding == CP_20127)
  255. {
  256. for (int i = 0; i<len; i++)
  257. {
  258. if (*DataBuf & 0x80)
  259. {
  260. *DataBuf &= 0x7f;
  261. }
  262. DataBuf++;
  263. }
  264. }
  265. }
  266. }
  267. /******************************************************************************
  268. ****************** C O N V E R T I N E T S T R I N G ******************
  269. ******************************************************************************/
  270. HRESULT CICharConverter::UnicodeToMultiByteEncoding(DWORD dwDstEncoding, LPCSTR lpSrcStr, LPINT lpnSrcSize,
  271. LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  272. {
  273. int nBuffSize, i ;
  274. BOOL UseDefChar = FALSE ;
  275. LPSTR lpDefFallBack = NULL ;
  276. UCHAR DefaultCharBuff[3]; // possible DBCS + null
  277. HRESULT hr = E_FAIL;
  278. int _nDstSize = *lpnDstSize;
  279. if ( _dwUnicodeEncoding == CP_UCS_2_BE && _cvt_count == 0 )
  280. {
  281. if ( _lpUnicodeStr = (LPSTR)LocalAlloc(LPTR, *lpnSrcSize ) )
  282. {
  283. MoveMemory(_lpUnicodeStr, lpSrcStr, *lpnSrcSize ) ;
  284. lpSrcStr = _lpUnicodeStr ;
  285. }
  286. else
  287. {
  288. hr = E_OUTOFMEMORY;
  289. goto EXIT;
  290. }
  291. }
  292. CheckUnicodeDataType(_dwUnicodeEncoding, (LPSTR) lpSrcStr, *lpnSrcSize);
  293. nBuffSize = *lpnSrcSize / sizeof(WCHAR);
  294. // We force to use MLang NO_BEST_FIT_CHAR check on ISCII encoding since system don't accept default chars
  295. if (IS_NLS_DLL_CP(dwDstEncoding) && (dwFlag & MLCONVCHARF_USEDEFCHAR))
  296. dwFlag |= MLCONVCHARF_NOBESTFITCHARS;
  297. if ( lpFallBack && ( dwFlag & MLCONVCHARF_USEDEFCHAR ))
  298. {
  299. // only take SBCS, no DBCS character
  300. if ( 1 == WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
  301. (LPCWSTR)lpFallBack, 1,
  302. (LPSTR)DefaultCharBuff, sizeof(DefaultCharBuff), NULL, NULL ))
  303. lpDefFallBack = (LPSTR) DefaultCharBuff;
  304. }
  305. if(!(*lpnDstSize = WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
  306. (LPCWSTR)lpSrcStr, nBuffSize,
  307. lpDstStr, *lpnDstSize, IS_NLS_DLL_CP(dwDstEncoding)? NULL:(LPCSTR)lpDefFallBack, IS_NLS_DLL_CP(dwDstEncoding)? NULL:&UseDefChar)))
  308. {
  309. hr = E_FAIL;
  310. goto EXIT;
  311. }
  312. if ( !_cvt_count ) // save SrcSize if it is the first time conversion
  313. _nSrcSize = nBuffSize * sizeof(WCHAR);
  314. if (*lpnDstSize)
  315. {
  316. if (dwFlag & ( MLCONVCHARF_NCR_ENTITIZE | MLCONVCHARF_NAME_ENTITIZE | MLCONVCHARF_NOBESTFITCHARS ))
  317. {
  318. char *lpDstStrTmp = lpDstStr;
  319. WCHAR *lpwStrTmp = NULL;
  320. WCHAR *lpwStrTmpSave = NULL;
  321. char *lpDstStrTmp2 = NULL;
  322. char *lpDstStrTmp2Save = NULL;
  323. int cCount, ConvCount = 0, nCount = 0;
  324. WCHAR *lpwSrcStrTmp = (WCHAR *)lpSrcStr;
  325. int *lpBCharOffset = NULL;
  326. int *lpBCharOffsetSave = NULL;
  327. if (!(lpwStrTmpSave = lpwStrTmp = (WCHAR *)LocalAlloc(LPTR, *lpnSrcSize)))
  328. {
  329. hr = E_OUTOFMEMORY;
  330. goto ENTITIZE_DONE;
  331. }
  332. // Make sure we have real converted buffer to check BEST_FIT_CHAR and DEFAULT_CHAR
  333. if (!_nDstSize)
  334. {
  335. lpDstStrTmp2Save = lpDstStrTmp2 = (char *)LocalAlloc(LPTR, *lpnDstSize);
  336. if (lpDstStrTmp2)
  337. {
  338. WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
  339. (LPCWSTR)lpSrcStr, nBuffSize,
  340. lpDstStrTmp2, *lpnDstSize, NULL, NULL );
  341. }
  342. else
  343. {
  344. hr = E_OUTOFMEMORY;
  345. goto ENTITIZE_DONE;
  346. }
  347. }
  348. if (nBuffSize ==
  349. MultiByteToWideChar(MAPUSERDEF(dwDstEncoding), 0, _nDstSize? lpDstStr : lpDstStrTmp2, *lpnDstSize, lpwStrTmp, _nSrcSize))
  350. {
  351. // Pre scan to get number of best fit chars.
  352. for (i=0; i<nBuffSize; i++)
  353. {
  354. // make special case for ?(yen sign) in Shift-JIS
  355. if (*lpwStrTmp++ != *lpwSrcStrTmp++)
  356. {
  357. if ((dwDstEncoding == CP_JPN_SJ) && (*(lpwSrcStrTmp - 1) == 0x00A5))
  358. *(lpwStrTmp - 1) = 0x00A5;
  359. else
  360. nCount ++;
  361. }
  362. }
  363. lpwSrcStrTmp -= nBuffSize;
  364. lpwStrTmp -= nBuffSize;
  365. if (nCount)
  366. {
  367. int j = 0;
  368. if (!(dwFlag & ( MLCONVCHARF_NCR_ENTITIZE | MLCONVCHARF_NAME_ENTITIZE | MLCONVCHARF_USEDEFCHAR)))
  369. {
  370. hr = E_FAIL;
  371. goto ENTITIZE_DONE;
  372. }
  373. if (!(lpBCharOffsetSave = lpBCharOffset = (int *) LocalAlloc(LPTR, nCount*sizeof(int))))
  374. {
  375. hr = E_OUTOFMEMORY;
  376. goto ENTITIZE_DONE;
  377. }
  378. // Record the offset position of each best fit char.
  379. for (i=0; i<nBuffSize; i++)
  380. {
  381. if (*lpwStrTmp++ != *lpwSrcStrTmp++)
  382. {
  383. *lpBCharOffset = i-j;
  384. lpBCharOffset++;
  385. j = i+1;
  386. }
  387. }
  388. lpBCharOffset -= nCount;
  389. lpwSrcStrTmp -= nBuffSize;
  390. lpwStrTmp -= nBuffSize;
  391. for (i=0; i<nCount; i++)
  392. {
  393. BOOL bIsSurrogatePair = FALSE;
  394. if (*lpBCharOffset)
  395. {
  396. cCount = WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
  397. (LPCWSTR)lpwSrcStrTmp, *lpBCharOffset,
  398. lpDstStrTmp, _nDstSize? _nDstSize-ConvCount : 0, NULL, NULL );
  399. ConvCount += cCount;
  400. if (_nDstSize)
  401. {
  402. lpDstStrTmp += cCount;
  403. }
  404. lpwSrcStrTmp += *lpBCharOffset;
  405. }
  406. BOOL fConverted = FALSE;
  407. // check if unconvertable character falls in NAME ENTITY area
  408. if (dwFlag & MLCONVCHARF_NAME_ENTITIZE)
  409. {
  410. // for beta2, make assmption that name entity implys NCR.
  411. dwFlag |= MLCONVCHARF_NCR_ENTITIZE;
  412. #ifdef MORE_NAME_ENTITY // in case we decide do more name entity latter
  413. BOOL fDoNEnty = FALSE;
  414. LPCTSTR lpszNEnty = NULL;
  415. // check if character is in the Latin-1 Supplement range
  416. if ((*lpwSrcStrTmp >= NAME_ENTITY_OFFSET) && (*lpwSrcStrTmp <= NAME_ENTITY_MAX ))
  417. {
  418. fDoNEnty = TRUE;
  419. lpszNEnty = g_lpstrNameEntity[(*lpwSrcStrTmp) - NAME_ENTITY_OFFSET];
  420. }
  421. // check if character is in the additional name entity table for CP 1252 extension
  422. if (!fDoNEnty)
  423. {
  424. for (int idx = 0; idx < ARRAYSIZE(aNameEntityExt); idx++)
  425. if (*lpwSrcStrTmp == aNameEntityExt[idx].uwUniCode)
  426. {
  427. fDoNEnty = TRUE;
  428. lpszNEnty = aNameEntityExt[idx].lpszNameEntity;
  429. break;
  430. }
  431. }
  432. if (fDoNEnty)
  433. {
  434. cCount = lstrlenA(lpszNEnty);
  435. if (_nDstSize)
  436. {
  437. CopyMemory(lpDstStrTmp, lpszNEnty, cCount);
  438. lpDstStrTmp += cCount ;
  439. }
  440. ConvCount += cCount;
  441. fConverted = TRUE;
  442. }
  443. #else
  444. // check if character is in the Latin-1 Supplement range
  445. if ((*lpwSrcStrTmp >= NAME_ENTITY_OFFSET)
  446. && (*lpwSrcStrTmp < ARRAYSIZE(g_lpstrNameEntity)+NAME_ENTITY_OFFSET))
  447. {
  448. LPCTSTR lpszNEnty = NULL;
  449. if (!(lpszNEnty = g_lpstrNameEntity[(*lpwSrcStrTmp) - NAME_ENTITY_OFFSET]))
  450. {
  451. #ifdef DEBUG
  452. AssertMsg((BOOL)FALSE, "Name entity table broken");
  453. #endif
  454. hr = E_FAIL;
  455. goto ENTITIZE_DONE;
  456. }
  457. cCount = lstrlenA(lpszNEnty);
  458. if (_nDstSize)
  459. {
  460. CopyMemory(lpDstStrTmp, lpszNEnty, cCount);
  461. lpDstStrTmp += cCount ;
  462. }
  463. ConvCount += cCount;
  464. fConverted = TRUE;
  465. }
  466. #endif
  467. }
  468. // check if NCR requested
  469. if ((!fConverted) && (dwFlag & MLCONVCHARF_NCR_ENTITIZE))
  470. {
  471. if ((nCount-i >= 2) &&
  472. (*lpwSrcStrTmp >= 0xD800 && *lpwSrcStrTmp <= 0xDBFF) &&
  473. (*(lpwSrcStrTmp+1) >= 0xDC00 && *(lpwSrcStrTmp+1) <= 0xDFFF))
  474. bIsSurrogatePair = TRUE;
  475. else
  476. bIsSurrogatePair = FALSE;
  477. if (_nDstSize)
  478. {
  479. lpDstStrTmp[0] = '&' ;
  480. lpDstStrTmp[1] = '#' ;
  481. lpDstStrTmp += 2 ;
  482. // If it is a Unicode surrogates pair, we convert it to real Unicode value
  483. if (bIsSurrogatePair)
  484. {
  485. DWORD dwUnicode = ((*lpwSrcStrTmp - 0xD800) << 10) + *(lpwSrcStrTmp+1) - 0xDC00 + 0x10000;
  486. _ultoa( dwUnicode, (char*)lpDstStrTmp, 10);
  487. }
  488. else
  489. _ultoa( *lpwSrcStrTmp, (char*)lpDstStrTmp, 10);
  490. cCount = lstrlenA(lpDstStrTmp);
  491. lpDstStrTmp += cCount;
  492. ConvCount += cCount;
  493. *(lpDstStrTmp++) = ';' ;
  494. }
  495. else
  496. {
  497. char szTmpString[10];
  498. if (bIsSurrogatePair)
  499. {
  500. DWORD dwUnicode = ((*lpwSrcStrTmp - 0xD800) << 10) + *(lpwSrcStrTmp+1) - 0xDC00 + 0x10000;
  501. _ultoa( dwUnicode, szTmpString, 10);
  502. }
  503. else
  504. _ultoa( *lpwSrcStrTmp, szTmpString, 10);
  505. ConvCount += lstrlenA(szTmpString);
  506. }
  507. fConverted = TRUE;
  508. ConvCount += 3;
  509. }
  510. // handle MLCONVCHARF_USEDEFCHAR here - less priority and default method
  511. if (!fConverted)
  512. {
  513. if (_nDstSize)
  514. {
  515. *lpDstStrTmp = lpDefFallBack ? *lpDefFallBack : '?';
  516. lpDstStrTmp++;
  517. }
  518. ConvCount++;
  519. if (!UseDefChar)
  520. UseDefChar = TRUE;
  521. }
  522. lpBCharOffset++;
  523. lpwSrcStrTmp++;
  524. // Skip next character if it is a Unicode surrogates pair
  525. if (bIsSurrogatePair)
  526. {
  527. lpBCharOffset++;
  528. lpwSrcStrTmp++;
  529. i++;
  530. }
  531. }
  532. lpBCharOffset -= nCount ;
  533. }
  534. int nRemain = (*lpnSrcSize - (int)((char*)lpwSrcStrTmp - (char *)lpSrcStr))/sizeof(WCHAR);
  535. ConvCount += WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0,
  536. (LPCWSTR)lpwSrcStrTmp, nRemain,
  537. lpDstStrTmp, _nDstSize? _nDstSize-ConvCount : 0, NULL, NULL );
  538. *lpnDstSize = ConvCount ;
  539. hr = S_OK;
  540. }
  541. else
  542. {
  543. hr = E_FAIL;
  544. }
  545. ENTITIZE_DONE:
  546. if (lpwStrTmpSave)
  547. LocalFree(lpwStrTmpSave);
  548. if (lpDstStrTmp2Save)
  549. LocalFree(lpDstStrTmp2Save);
  550. if (lpBCharOffsetSave)
  551. LocalFree(lpBCharOffsetSave);
  552. }
  553. else
  554. {
  555. hr = S_OK;
  556. }
  557. if (S_OK == hr && UseDefChar)
  558. hr = S_FALSE;
  559. }
  560. else
  561. {
  562. hr = E_FAIL;
  563. }
  564. EXIT:
  565. return hr;
  566. }
  567. HRESULT CICharConverter::UTF78ToUnicode(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
  568. LPSTR lpDstStr, LPINT lpnDstSize)
  569. {
  570. HRESULT hr ;
  571. hr = DoConvertINetString(lpdwMode, TRUE, CP_UCS_2, _dwUTFEncoding, lpSrcStr, lpnSrcSize, lpDstStr, *lpnDstSize, lpnDstSize);
  572. if ( !_cvt_count ) // save SrcSize if it is the first time conversion
  573. _nSrcSize = *lpnSrcSize ;
  574. CheckUnicodeDataType(_dwUnicodeEncoding, lpDstStr, *lpnDstSize);
  575. return hr ;
  576. }
  577. HRESULT CICharConverter::UnicodeToUTF78(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
  578. LPSTR lpDstStr, LPINT lpnDstSize)
  579. {
  580. HRESULT hr ;
  581. if ( _dwUnicodeEncoding == CP_UCS_2_BE && _cvt_count == 0 )
  582. {
  583. if ( _lpUnicodeStr = (LPSTR)LocalAlloc(LPTR, *lpnSrcSize ) )
  584. {
  585. MoveMemory(_lpUnicodeStr, lpSrcStr, *lpnSrcSize ) ;
  586. lpSrcStr = _lpUnicodeStr ;
  587. }
  588. else
  589. return E_OUTOFMEMORY ;
  590. }
  591. CheckUnicodeDataType(_dwUnicodeEncoding, (LPSTR) lpSrcStr, *lpnSrcSize);
  592. hr = DoConvertINetString(lpdwMode, FALSE, CP_UCS_2, _dwUTFEncoding, lpSrcStr, lpnSrcSize, lpDstStr, *lpnDstSize, lpnDstSize);
  593. if ( !_cvt_count ) // save SrcSize if it is the first time conversion
  594. _nSrcSize = *lpnSrcSize ;
  595. return hr ;
  596. }
  597. HRESULT CICharConverter::UnicodeToWindowsCodePage(LPCSTR lpSrcStr, LPINT lpnSrcSize,
  598. LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  599. {
  600. HRESULT hr ;
  601. hr = UnicodeToMultiByteEncoding(_dwWinCodePage,lpSrcStr,lpnSrcSize,lpDstStr,lpnDstSize,dwFlag,lpFallBack);
  602. return hr ;
  603. }
  604. HRESULT CICharConverter::UnicodeToInternetEncoding(LPCSTR lpSrcStr, LPINT lpnSrcSize,
  605. LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  606. {
  607. HRESULT hr ;
  608. hr = UnicodeToMultiByteEncoding(_dwInternetEncoding,lpSrcStr,lpnSrcSize,lpDstStr,lpnDstSize,dwFlag,lpFallBack);
  609. return hr ;
  610. }
  611. HRESULT CICharConverter::InternetEncodingToUnicode(LPCSTR lpSrcStr, LPINT lpnSrcSize,
  612. LPSTR lpDstStr, LPINT lpnDstSize)
  613. {
  614. int cch;
  615. int cb = *lpnSrcSize;
  616. if ( !_cvt_count)
  617. {
  618. // If we have a multibyte character encoding, we are at risk of splitting
  619. // some characters at the read boundary. We must Make sure we have a
  620. // discrete number of characters first.
  621. UINT uMax = MAX_CHAR_SIZE ;
  622. cb++; // pre-increment
  623. do
  624. {
  625. cch = MultiByteToWideChar( MAPUSERDEF(_dwInternetEncoding),
  626. MB_ERR_INVALID_CHARS,
  627. lpSrcStr, --cb,
  628. NULL, 0 );
  629. --uMax;
  630. } while (!cch && uMax && cb);
  631. }
  632. if ( !cb || cb == (*lpnSrcSize - MAX_CHAR_SIZE +1 )) // if conversion problem isn't at the end of the string
  633. cb = *lpnSrcSize ; // restore orginal value
  634. *lpnDstSize = MultiByteToWideChar( MAPUSERDEF(_dwInternetEncoding), 0,
  635. lpSrcStr, cb,
  636. (LPWSTR)lpDstStr, *lpnDstSize/sizeof(WCHAR) );
  637. *lpnDstSize = *lpnDstSize * sizeof(WCHAR);
  638. if ( !_cvt_count ) // save SrcSize if it is the first time conversion
  639. _nSrcSize = cb ;
  640. CheckUnicodeDataType(_dwUnicodeEncoding, lpDstStr, *lpnDstSize);
  641. if (*lpnDstSize==0 && (cb || cb != *lpnSrcSize))
  642. {
  643. // GetLastError() for MultiByteToWideChar()
  644. // Skip invalid characters for UTF8 conversion
  645. if (CP_UTF_8 == MAPUSERDEF(_dwInternetEncoding)&&
  646. ERROR_NO_UNICODE_TRANSLATION == GetLastError())
  647. return S_OK;
  648. else
  649. return E_FAIL ;
  650. }
  651. else
  652. return S_OK ;
  653. }
  654. HRESULT CICharConverter::WindowsCodePageToUnicode(LPCSTR lpSrcStr, LPINT lpnSrcSize,
  655. LPSTR lpDstStr, LPINT lpnDstSize)
  656. {
  657. int cch1, cch2;
  658. int cb = *lpnSrcSize;
  659. if ( !_cvt_count && cb > 1 )
  660. {
  661. if (IS_DBCSCODEPAGE(MAPUSERDEF(_dwWinCodePage)))
  662. {
  663. // Detect DBCS dangling character
  664. if (!MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage),
  665. MB_ERR_INVALID_CHARS,
  666. lpSrcStr, cb,
  667. NULL, 0 ))
  668. {
  669. if (IsDBCSLeadByteEx(MAPUSERDEF(_dwWinCodePage), lpSrcStr[cb-1]))
  670. {
  671. cch1 = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage),
  672. 0,
  673. lpSrcStr, cb,
  674. NULL, 0 );
  675. cch2 = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage),
  676. 0,
  677. lpSrcStr, --cb,
  678. NULL, 0 );
  679. if (cch1 != cch2+1)
  680. {
  681. //Dangling DBCS character not found, restore cb.
  682. cb++;
  683. }
  684. }
  685. }
  686. }
  687. else
  688. {
  689. // If we have a multibyte character encoding, we are at risk of splitting
  690. // some characters at the read boundary. We must Make sure we have a
  691. // discrete number of characters first.
  692. UINT uMax = MAX_CHAR_SIZE ;
  693. cb++; // pre-increment
  694. do
  695. {
  696. cch1 = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage),
  697. MB_ERR_INVALID_CHARS,
  698. lpSrcStr, --cb,
  699. NULL, 0 );
  700. --uMax;
  701. } while (!cch1 && uMax && cb);
  702. if ( !cb || cb == (*lpnSrcSize - MAX_CHAR_SIZE +1 )) // if conversion problem isn't at the end of the string
  703. cb = *lpnSrcSize ; // restore orginal value
  704. }
  705. }
  706. *lpnDstSize = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage), 0,
  707. lpSrcStr, cb,
  708. (LPWSTR)lpDstStr, *lpnDstSize/sizeof(WCHAR) );
  709. *lpnDstSize = *lpnDstSize * sizeof(WCHAR);
  710. if ( !_cvt_count ) // save SrcSize if it is the first time conversion
  711. _nSrcSize = cb ;
  712. CheckUnicodeDataType(_dwUnicodeEncoding, lpDstStr, *lpnDstSize);
  713. // Whistler Bug#360429,
  714. // Web page could have a splitting DBCS character at the very end of the page,
  715. // To work around it, we allow one byte of dangling DBCS character.
  716. if (*lpnDstSize==0 && (cb || (cb != *lpnSrcSize && ++cb != *lpnSrcSize)))
  717. return E_FAIL ;
  718. else
  719. return S_OK ;
  720. }
  721. HRESULT CICharConverter::WindowsCodePageToInternetEncoding(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
  722. LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  723. {
  724. HRESULT hr ;
  725. // check if the conversion should go through Unicode indirectly
  726. if ( _dwConvertType & 0x10 )
  727. hr = WindowsCodePageToInternetEncodingWrap(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
  728. else
  729. {
  730. hr = DoConvertINetString(lpdwMode, FALSE, _dwWinCodePage, _dwInternetEncoding, lpSrcStr, lpnSrcSize, lpDstStr, *lpnDstSize, lpnDstSize);
  731. if ( !_cvt_count ) // save SrcSize if it is the first time conversion
  732. _nSrcSize = *lpnSrcSize ;
  733. }
  734. return hr ;
  735. }
  736. HRESULT CICharConverter::InternetEncodingToWindowsCodePage(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
  737. LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  738. {
  739. HRESULT hr ;
  740. // check if the conversion should go through Unicode indirectly
  741. if ( _dwConvertType & 0x10 )
  742. hr = InternetEncodingToWindowsCodePageWrap(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
  743. else
  744. {
  745. hr = DoConvertINetString(lpdwMode, TRUE, _dwWinCodePage, _dwInternetEncoding, lpSrcStr, lpnSrcSize, lpDstStr, *lpnDstSize, lpnDstSize);
  746. if ( !_cvt_count ) // save SrcSize if it is the first time conversion
  747. _nSrcSize = *lpnSrcSize ;
  748. }
  749. return hr ;
  750. }
  751. HRESULT CICharConverter::WindowsCodePageToInternetEncodingWrap(LPCSTR lpSrcStr, LPINT lpnSrcSize,
  752. LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  753. {
  754. int nBuffSize = 0 ;
  755. int cb = *lpnSrcSize;
  756. UINT uMax = MAX_CHAR_SIZE ;
  757. BOOL UseDefChar = FALSE ;
  758. HRESULT hr = S_OK;
  759. if ( !_cvt_count )
  760. {
  761. cb++; // pre-increment
  762. do
  763. {
  764. nBuffSize = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage),
  765. MB_ERR_INVALID_CHARS,
  766. lpSrcStr, --cb,
  767. NULL, 0 );
  768. --uMax;
  769. } while (!nBuffSize && uMax && cb);
  770. }
  771. if ( cb == (*lpnSrcSize - MAX_CHAR_SIZE +1 )) // if conversion problem isn't at the end of the string
  772. cb = *lpnSrcSize ; // restore orginal value
  773. if (!nBuffSize) // in case there are illeage characters
  774. nBuffSize = cb ;
  775. if ( _lpInterm1Str = (LPSTR) LocalAlloc(LPTR, (nBuffSize * sizeof(WCHAR))))
  776. {
  777. nBuffSize = MultiByteToWideChar(MAPUSERDEF(_dwWinCodePage), 0,
  778. lpSrcStr, cb, (LPWSTR)_lpInterm1Str, nBuffSize );
  779. int iSrcSizeTmp = nBuffSize * sizeof(WCHAR);
  780. hr = UnicodeToMultiByteEncoding(MAPUSERDEF(_dwInternetEncoding), (LPCSTR)_lpInterm1Str, &iSrcSizeTmp,
  781. lpDstStr, lpnDstSize, dwFlag, lpFallBack);
  782. // *lpnDstSize = WideCharToMultiByte( MAPUSERDEF(_dwInternetEncoding), 0,
  783. // (LPCWSTR)_lpInterm1Str, nBuffSize, lpDstStr, *lpnDstSize, NULL, &UseDefChar );
  784. if ( !_cvt_count ) // save SrcSize if it is the first time conversion
  785. _nSrcSize = cb ;
  786. }
  787. else
  788. hr = E_FAIL;
  789. if (hr == S_OK)
  790. {
  791. if (*lpnDstSize==0 && cb)
  792. hr = E_FAIL ;
  793. else
  794. {
  795. if ( UseDefChar )
  796. return S_FALSE ;
  797. else
  798. return S_OK ;
  799. }
  800. }
  801. return hr;
  802. }
  803. HRESULT CICharConverter::InternetEncodingToWindowsCodePageWrap(LPCSTR lpSrcStr, LPINT lpnSrcSize,
  804. LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  805. {
  806. int nBuffSize = 0 ;
  807. int cb = *lpnSrcSize;
  808. UINT uMax = MAX_CHAR_SIZE ;
  809. BOOL UseDefChar = FALSE ;
  810. HRESULT hr = S_OK;
  811. if ( !_cvt_count )
  812. {
  813. cb++; // pre-increment
  814. do
  815. {
  816. nBuffSize = MultiByteToWideChar( MAPUSERDEF(_dwInternetEncoding),
  817. MB_ERR_INVALID_CHARS,
  818. lpSrcStr, --cb,
  819. NULL, 0 );
  820. --uMax;
  821. } while (!nBuffSize && uMax && cb);
  822. }
  823. if ( cb == (*lpnSrcSize - MAX_CHAR_SIZE +1 )) // if conversion problem isn't at the end of the string
  824. cb = *lpnSrcSize ; // restore orginal value
  825. if (!nBuffSize) // in case there are illeage characters
  826. nBuffSize = cb ;
  827. if ( _lpInterm1Str = (LPSTR) LocalAlloc(LPTR,nBuffSize * sizeof (WCHAR) ))
  828. {
  829. nBuffSize = MultiByteToWideChar( MAPUSERDEF(_dwInternetEncoding), 0,
  830. lpSrcStr, cb, (LPWSTR)_lpInterm1Str, nBuffSize );
  831. int iSrcSizeTmp = nBuffSize * sizeof(WCHAR);
  832. hr = UnicodeToMultiByteEncoding(MAPUSERDEF(_dwWinCodePage), (LPCSTR)_lpInterm1Str, &iSrcSizeTmp,
  833. lpDstStr, lpnDstSize, dwFlag, lpFallBack);
  834. // *lpnDstSize = WideCharToMultiByte( MAPUSERDEF(_dwWinCodePage), 0,
  835. // (LPCWSTR)_lpInterm1Str, nBuffSize, lpDstStr, *lpnDstSize, NULL, &UseDefChar );
  836. if ( !_cvt_count ) // save SrcSize if it is the first time conversion
  837. _nSrcSize = cb ;
  838. }
  839. else
  840. hr = E_FAIL;
  841. if (hr == S_OK)
  842. {
  843. if (*lpnDstSize==0 && cb)
  844. hr = E_FAIL ;
  845. else
  846. {
  847. if ( UseDefChar )
  848. return S_FALSE ;
  849. else
  850. return S_OK ;
  851. }
  852. }
  853. return hr;
  854. }
  855. HRESULT CICharConverter::ConvertIWUU(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
  856. LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  857. {
  858. int nBuffSize = 0 ;
  859. HRESULT hr = S_OK ;
  860. HRESULT hrWarnings = S_OK ;
  861. // InternetEncodingToWindowsCodePage
  862. if ( _dwConvertType % 2 && _dwConvertType < 21 ) /* start from Internet Encoding */
  863. {
  864. if ( _dwConvertType == 5 || _dwConvertType == 9 ) /* use interm buffer */
  865. {
  866. hr = InternetEncodingToWindowsCodePage(lpdwMode, lpSrcStr, lpnSrcSize, NULL, &nBuffSize, dwFlag, lpFallBack);
  867. if ( _lpInterm1Str = (LPSTR) LocalAlloc(LPTR,nBuffSize) )
  868. {
  869. hr = InternetEncodingToWindowsCodePage(lpdwMode, lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize, dwFlag, lpFallBack);
  870. lpSrcStr = _lpInterm1Str ;
  871. *lpnSrcSize = nBuffSize ;
  872. }
  873. else
  874. goto fail ;
  875. }
  876. else
  877. hr = InternetEncodingToWindowsCodePage(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
  878. _cvt_count ++ ;
  879. }
  880. if ( hr != S_OK )
  881. hrWarnings = hr ;
  882. // WindowsCodePageToUnicode or InternetEncodingToUnicode
  883. if ( _dwConvertType == 21 || _dwConvertType == 25 )
  884. {
  885. if ( _dwConvertType == 21 )
  886. hr = InternetEncodingToUnicode(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize);
  887. else // _dwConvertType == 25
  888. {
  889. hr = InternetEncodingToUnicode(lpSrcStr, lpnSrcSize, NULL, &nBuffSize);
  890. if ( _lpInterm1Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) )
  891. {
  892. hr = InternetEncodingToUnicode(lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize);
  893. lpSrcStr = _lpInterm1Str ;
  894. *lpnSrcSize = nBuffSize ;
  895. }
  896. else
  897. goto fail ;
  898. }
  899. _cvt_count ++ ;
  900. }
  901. else if ( _dwConvertType >= 4 && _dwConvertType <= 10 )
  902. {
  903. if ( _dwConvertType > 8 )
  904. {
  905. nBuffSize = 0 ;
  906. hr = WindowsCodePageToUnicode(lpSrcStr, lpnSrcSize, NULL, &nBuffSize);
  907. if ( _cvt_count )
  908. {
  909. if ( _lpInterm2Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) )
  910. {
  911. hr = WindowsCodePageToUnicode(lpSrcStr, lpnSrcSize, _lpInterm2Str, &nBuffSize);
  912. lpSrcStr = _lpInterm2Str ;
  913. *lpnSrcSize = nBuffSize ;
  914. }
  915. else
  916. goto fail ;
  917. }
  918. else
  919. {
  920. if ( _lpInterm1Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) )
  921. {
  922. hr = WindowsCodePageToUnicode(lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize);
  923. lpSrcStr = _lpInterm1Str ;
  924. *lpnSrcSize = nBuffSize ;
  925. }
  926. else
  927. goto fail ;
  928. }
  929. }
  930. else
  931. hr = WindowsCodePageToUnicode(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize);
  932. _cvt_count ++ ;
  933. }
  934. if ( hr != S_OK )
  935. hrWarnings = hr ;
  936. // UnicodeToUTF78
  937. if ( _dwConvertType & 0x08 )
  938. #ifndef UNIX
  939. hr = UnicodeToUTF78(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize);
  940. #else
  941. {
  942. /* we now hack the lpSrcStr to be the same as 2 byte Unicode so mlang
  943. * lowlevel code can work right.
  944. */
  945. LPWSTR lpwSrcStr = (LPWSTR)lpSrcStr;
  946. INT tmpSize = *lpnSrcSize/sizeof(WCHAR);
  947. UCHAR *pTmp = new UCHAR[(tmpSize+1)*2];
  948. if(pTmp) {
  949. for(int i = 0; i < tmpSize; i++) {
  950. pTmp[i*2] = *lpwSrcStr++;
  951. pTmp[i*2+1] = 0x00;
  952. }
  953. pTmp[i*2] = pTmp[i*2+1] = 0x00;
  954. tmpSize *= 2;
  955. hr = UnicodeToUTF78(lpdwMode, (LPCSTR)pTmp, &tmpSize, lpDstStr, lpnDstSize);
  956. }
  957. else
  958. hr = E_FAIL;
  959. delete [] pTmp;
  960. }
  961. #endif /* UNIX */
  962. return ( hr == S_OK ? hrWarnings : hr ) ;
  963. fail :
  964. return E_FAIL ;
  965. }
  966. HRESULT CICharConverter::ConvertUUWI(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
  967. LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  968. {
  969. int nBuffSize = 0 ;
  970. HRESULT hr = S_OK ;
  971. HRESULT hrWarnings = S_OK ;
  972. // UTF78ToUnicode
  973. if ( _dwConvertType & 0x08 )
  974. {
  975. if ( _dwConvertType == 12 ) /* convert UTF78 -> Unicode only */
  976. hr = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize);
  977. else /* use interm buffer, type = 10 or 9 */
  978. {
  979. hr = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, NULL, &nBuffSize);
  980. if ( _lpInterm1Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) )
  981. {
  982. hr = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize);
  983. lpSrcStr = _lpInterm1Str ;
  984. *lpnSrcSize = nBuffSize ;
  985. }
  986. else
  987. goto fail ;
  988. }
  989. _cvt_count ++ ;
  990. }
  991. if ( hr != S_OK )
  992. hrWarnings = hr ;
  993. // UnicodeToWindowsCodePage or UnicodeToInternetEncoding
  994. if ( _dwConvertType == 21 || _dwConvertType == 25 )
  995. {
  996. hr = UnicodeToInternetEncoding(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
  997. _cvt_count ++ ;
  998. }
  999. else if ( _dwConvertType >= 4 && _dwConvertType <= 10 )
  1000. {
  1001. if ( _dwConvertType % 2 ) /* use interm buffer */
  1002. {
  1003. nBuffSize = 0 ;
  1004. hr = UnicodeToWindowsCodePage(lpSrcStr, lpnSrcSize, NULL, &nBuffSize, dwFlag, lpFallBack);
  1005. if ( _cvt_count )
  1006. {
  1007. if ( _lpInterm2Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) )
  1008. {
  1009. hr = UnicodeToWindowsCodePage(lpSrcStr, lpnSrcSize, _lpInterm2Str, &nBuffSize, dwFlag, lpFallBack);
  1010. lpSrcStr = _lpInterm2Str ;
  1011. *lpnSrcSize = nBuffSize ;
  1012. }
  1013. else
  1014. goto fail ;
  1015. }
  1016. else
  1017. {
  1018. if ( _lpInterm1Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) )
  1019. {
  1020. hr = UnicodeToWindowsCodePage(lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize, dwFlag, lpFallBack);
  1021. lpSrcStr = _lpInterm1Str ;
  1022. *lpnSrcSize = nBuffSize ;
  1023. }
  1024. else
  1025. goto fail ;
  1026. }
  1027. }
  1028. else
  1029. hr = UnicodeToWindowsCodePage(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
  1030. _cvt_count ++ ;
  1031. }
  1032. if ( hr != S_OK )
  1033. hrWarnings = hr ;
  1034. // WindowsCodePageToInternetEncoding
  1035. if ( _dwConvertType % 2 && _dwConvertType < 21 )
  1036. hr = WindowsCodePageToInternetEncoding(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
  1037. return ( hr == S_OK ? hrWarnings : hr ) ;
  1038. fail :
  1039. return E_FAIL ;
  1040. }
  #if 0
  // Disabled code: a small cache of IsValidCodePage() results.

  struct CODEPAGEINFO
  {
      UINT uCodePage ;
      CP_STATE nCP_State ; // whether this is a valid windows codepage ?
  };

  // ValidCodepageInfo caches whether a codepage is a valid codepage.
  // It is a circular FIFO: once MAX_CP_CACHE entries are stored, the oldest
  // entry is overwritten.
  #define MAX_CP_CACHE 32
  static int cp_cache_count = 0 ;   // number of populated entries
  static int cp_cache_ptr = 0 ;     // next slot to write
  static struct CODEPAGEINFO ValidCodepageInfo[MAX_CP_CACHE];

  // Returns TRUE if uCodePage is usable: 50000 (user defined) is always
  // accepted, otherwise the cached answer is returned, or IsValidCodePage() is
  // called and its answer is cached.
  // NOTE(review): the lookup loop runs outside g_cs while insertions run inside
  // it - confirm whether that is acceptable before re-enabling this code.
  BOOL CheckIsValidCodePage (UINT uCodePage)
  {
      if ( uCodePage == 50000 ) // User defined
          return TRUE ;

      int i ;
      BOOL bRet ;

      for ( i = 0 ; i < cp_cache_count ; i++ )
      {
          if ( uCodePage == ValidCodepageInfo[i].uCodePage )
          {
              if ( ValidCodepageInfo[i].nCP_State == VALID_CP )
                  return TRUE ;
              else
                  return FALSE ;
          }
      }

      // not found, call IsValidCodePage and cache the return value
      bRet = IsValidCodePage(uCodePage);

      EnterCriticalSection(&g_cs);
      ValidCodepageInfo[cp_cache_ptr].uCodePage = uCodePage ;
      if (bRet)
          ValidCodepageInfo[cp_cache_ptr].nCP_State = VALID_CP ;
      else
          ValidCodepageInfo[cp_cache_ptr].nCP_State = INVALID_CP ;
      if ( cp_cache_count < MAX_CP_CACHE )
          cp_cache_count++ ;
      // Advance the write slot. Written without "++" on the assigned variable:
      // "cp_cache_ptr = (++cp_cache_ptr) % N" modifies it twice without sequencing.
      cp_cache_ptr = ( cp_cache_ptr + 1 ) % MAX_CP_CACHE ;
      LeaveCriticalSection(&g_cs);

      return bRet ;
  }
  #endif
  1086. /*
  1087. Conversion Flag:
  1088. Bit 7 - Convert Direction.
  1089. Bit 4 (16) - Unicode <-> Internet Encoding
  1090. Bit 3 (8) - UTF8, UTF7
  1091. Bit 2 (4) - Unicode
  1092. Bit 1 (2) - Windows CodePage
  1093. Bit 0 (1) - Internet Encoding
  1094. 12, 6, 3 (19) - one step convert
  1095. 10, 5 (21) - two steps convert
  1096. 9 (25) - three steps convert
  1097. */
  1098. int GetWindowsEncodingIndex(DWORD dwEncoding)
  1099. {
  1100. int nr = sizeof (aEncodingInfo) / sizeof(ENCODINGINFO) ;
  1101. int i, half = nr / 2, index = -1 ;
  1102. if (aEncodingInfo[half].dwEncoding > dwEncoding )
  1103. {
  1104. for ( i = 0 ; i < half ; i++ )
  1105. if (aEncodingInfo[i].dwEncoding == dwEncoding )
  1106. index = i ;
  1107. }
  1108. else if (aEncodingInfo[half].dwEncoding < dwEncoding )
  1109. {
  1110. for ( i = half + 1 ; i < nr ; i++ )
  1111. if (aEncodingInfo[i].dwEncoding == dwEncoding )
  1112. index = i ;
  1113. }
  1114. else
  1115. index = half ;
  1116. if (index>=0) // found
  1117. {
  1118. if ( aEncodingInfo[index].nCP_State != VALID_CP &&
  1119. aEncodingInfo[index].dwCodePage )
  1120. {
  1121. if ( aEncodingInfo[index].dwCodePage == 50000 || IsValidCodePage(aEncodingInfo[index].dwCodePage ) ) // 50000 means user defined
  1122. aEncodingInfo[index].nCP_State = VALID_CP ;
  1123. else
  1124. aEncodingInfo[index].nCP_State = INVALID_CP ;
  1125. if ((aEncodingInfo[index].nCP_State == VALID_CP) &&
  1126. (aEncodingInfo[index].dwFlags & CONV_CHK_NLS) &&
  1127. !IsValidCodePage(aEncodingInfo[index].dwEncoding))
  1128. aEncodingInfo[index].nCP_State = INVALID_CP ;
  1129. }
  1130. // Use system UTF8 conversion to work around security issues on Win2k and greater platforms.
  1131. if (g_bUseSysUTF8 && dwEncoding == CP_UTF_8)
  1132. {
  1133. aEncodingInfo[index].bTypeUUIW = 0x11;
  1134. }
  1135. }
  1136. return index ;
  1137. }
  1138. HRESULT CICharConverter::ConvertSetup(DWORD * pdwSrcEncoding, DWORD dwDstEncoding)
  1139. {
  1140. DWORD SrcFlag = 0, DstFlag = 0 ;
  1141. int index, unknown = 0 ;
  1142. // IE bug 109708 - WEIWU 5/11/00
  1143. // Always consider US-ASCII as a valid source encoding for conversion
  1144. /*
  1145. if (*pdwSrcEncoding == CP_20127 && !IsValidCodePage(CP_20127))
  1146. *pdwSrcEncoding = CP_1252;
  1147. */
  1148. /* check source & destination encoding type */
  1149. index = GetWindowsEncodingIndex(*pdwSrcEncoding);
  1150. if ( index >=0 )
  1151. {
  1152. SrcFlag = (DWORD) aEncodingInfo[index].bTypeUUIW ;
  1153. if ( aEncodingInfo[index].dwCodePage )
  1154. {
  1155. _dwWinCodePage = (DWORD) aEncodingInfo[index].dwCodePage ;
  1156. if (aEncodingInfo[index].nCP_State == INVALID_CP )
  1157. goto fail ;
  1158. }
  1159. if ( SrcFlag & 0x08 )
  1160. _dwUTFEncoding = *pdwSrcEncoding ;
  1161. if ( SrcFlag & 0x01 )
  1162. _dwInternetEncoding = *pdwSrcEncoding ;
  1163. if ( SrcFlag & 0x04 )
  1164. _dwUnicodeEncoding = *pdwSrcEncoding ;
  1165. }
  1166. // assume it is a unknown Window Codepage
  1167. else
  1168. {
  1169. if ( !CONVERT_IS_VALIDCODEPAGE(*pdwSrcEncoding))
  1170. goto fail ;
  1171. SrcFlag = 0x02 ;
  1172. _dwWinCodePage = *pdwSrcEncoding ;
  1173. unknown ++ ;
  1174. }
  1175. index = GetWindowsEncodingIndex(dwDstEncoding);
  1176. if ( index >=0 )
  1177. {
  1178. // check if two codepages are compatiable
  1179. if ( _dwWinCodePage && aEncodingInfo[index].dwCodePage )
  1180. {
  1181. if (_dwWinCodePage != (DWORD) aEncodingInfo[index].dwCodePage )
  1182. goto fail ;
  1183. }
  1184. DstFlag = (DWORD) aEncodingInfo[index].bTypeUUIW ;
  1185. if ( aEncodingInfo[index].dwCodePage )
  1186. {
  1187. _dwWinCodePage = (DWORD) aEncodingInfo[index].dwCodePage ;
  1188. if (aEncodingInfo[index].nCP_State == INVALID_CP )
  1189. goto fail ;
  1190. }
  1191. if ( DstFlag & 0x08 )
  1192. {
  1193. if (_dwUTFEncoding)
  1194. _dwUTFEncoding2 = dwDstEncoding ;
  1195. else
  1196. _dwUTFEncoding = dwDstEncoding ;
  1197. }
  1198. if ( DstFlag & 0x01 )
  1199. _dwInternetEncoding = dwDstEncoding ;
  1200. if ( DstFlag & 0x04 )
  1201. _dwUnicodeEncoding = dwDstEncoding ;
  1202. }
  1203. // 1) First time unknown, assume it is a unknown Window Codepage
  1204. // the conversion become UTF78 <-> Unicode <-> Window Codepage
  1205. // 2) Second time unknown, assume it is a unknown Internet Encoding
  1206. // the conversion become Windows Codepage <-> Unicode <-> Internet Encoding
  1207. else
  1208. {
  1209. if ( !CONVERT_IS_VALIDCODEPAGE(dwDstEncoding))
  1210. goto fail ;
  1211. if ( unknown == 0 )
  1212. {
  1213. if ( _dwWinCodePage )
  1214. {
  1215. if (_dwWinCodePage != dwDstEncoding )
  1216. goto fail ;
  1217. }
  1218. DstFlag = 0x02 ;
  1219. _dwWinCodePage = dwDstEncoding ;
  1220. }
  1221. else
  1222. {
  1223. DstFlag = 0x11 ;
  1224. _dwInternetEncoding = dwDstEncoding ;
  1225. }
  1226. }
  1227. if ( !SrcFlag | !DstFlag )
  1228. goto fail ;
  1229. if ( SrcFlag == DstFlag && *pdwSrcEncoding != dwDstEncoding && ( 4 != SrcFlag ) && ( 8 != SrcFlag ))
  1230. goto fail ;
  1231. _dwConvertType = SrcFlag | DstFlag ;
  1232. _bConvertDirt = ( SrcFlag & 0x0f ) > ( DstFlag & 0x0f ) ;
  1233. // if code convertor has been allocated, deallocate it
  1234. if (_hcins)
  1235. {
  1236. delete _hcins ;
  1237. _hcins = NULL ;
  1238. }
  1239. return S_OK ;
  1240. fail :
  1241. return S_FALSE ;
  1242. }
  1243. HRESULT CICharConverter::DoCodeConvert(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize,
  1244. LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  1245. {
  1246. HRESULT hr = S_OK ;
  1247. if ( 4 == _dwConvertType ) // CP_UCS_2 <-> CP_UCS_2_BE
  1248. {
  1249. if (!lpDstStr)
  1250. {
  1251. _nSrcSize = *lpnDstSize = *lpnSrcSize ;
  1252. }
  1253. else
  1254. {
  1255. int nSize = min(*lpnDstSize,*lpnSrcSize);
  1256. _nSrcSize = *lpnSrcSize ;
  1257. if ( lpDstStr && nSize > 0 )
  1258. {
  1259. MoveMemory(lpDstStr, lpSrcStr, nSize );
  1260. DataByteSwap(lpDstStr, nSize );
  1261. _nSrcSize = nSize ;
  1262. *lpnDstSize = nSize ;
  1263. }
  1264. }
  1265. }
  1266. else if ( 8 == _dwConvertType) // UTF7 <-> UTF8
  1267. {
  1268. if (_dwUTFEncoding == _dwUTFEncoding2)
  1269. {
  1270. _nSrcSize = *lpnDstSize = min(*lpnDstSize,*lpnSrcSize);
  1271. if (*lpnDstSize > 0)
  1272. MoveMemory(lpDstStr, lpSrcStr, *lpnDstSize);
  1273. }
  1274. else
  1275. {
  1276. int nBuffSize = 0;
  1277. // Always succeeds
  1278. hr = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, NULL, &nBuffSize);
  1279. if (_lpInterm1Str)
  1280. LocalFree(_lpInterm1Str);
  1281. if ( _lpInterm1Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) )
  1282. {
  1283. DWORD dwTmpEncoding = _dwUTFEncoding;
  1284. int nTmpSrcSize;
  1285. hr = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize);
  1286. _dwUTFEncoding = _dwUTFEncoding2 ;
  1287. nTmpSrcSize = _nSrcSize;
  1288. // We don't need to create another dwMode since only UTF7 conversion needs it
  1289. hr = UnicodeToUTF78(lpdwMode, _lpInterm1Str, &nBuffSize, lpDstStr, lpnDstSize);
  1290. _nSrcSize = nTmpSrcSize;
  1291. _dwUTFEncoding = dwTmpEncoding ;
  1292. }
  1293. else
  1294. hr = E_OUTOFMEMORY;
  1295. }
  1296. }
  1297. else if ( _bConvertDirt )
  1298. hr = ConvertUUWI(lpdwMode, lpSrcStr,lpnSrcSize,lpDstStr,lpnDstSize, dwFlag, lpFallBack);
  1299. else
  1300. hr = ConvertIWUU(lpdwMode, lpSrcStr,lpnSrcSize,lpDstStr,lpnDstSize, dwFlag, lpFallBack);
  1301. return hr ;
  1302. }
  1303. BOOL CICharConverter::ConvertCleanUp()
  1304. {
  1305. if (_lpInterm1Str)
  1306. {
  1307. LocalFree(_lpInterm1Str);
  1308. _lpInterm1Str = NULL ;
  1309. }
  1310. if (_lpInterm2Str)
  1311. {
  1312. LocalFree(_lpInterm2Str);
  1313. _lpInterm2Str = NULL ;
  1314. }
  1315. if (_lpUnicodeStr)
  1316. {
  1317. LocalFree(_lpUnicodeStr);
  1318. _lpUnicodeStr = NULL ;
  1319. }
  1320. _cvt_count = 0 ;
  1321. _nSrcSize = 0 ;
  1322. return TRUE ;
  1323. }
  1324. CICharConverter::CICharConverter()
  1325. {
  1326. _lpInterm1Str = NULL ;
  1327. _lpInterm2Str = NULL ;
  1328. _lpUnicodeStr = NULL ;
  1329. _hcins = NULL ;
  1330. _cvt_count = 0 ;
  1331. _dwWinCodePage = 0;
  1332. _dwInternetEncoding = 0;
  1333. _dwUTFEncoding = 0;
  1334. _dwUTFEncoding2 = 0;
  1335. _dwUnicodeEncoding = 0;
  1336. _dwConvertType = 0;
  1337. _nSrcSize = 0 ;
  1338. _hcins_dst = 0 ;
  1339. return ;
  1340. }
  1341. CICharConverter::CICharConverter(DWORD dwFlag, WCHAR *lpFallBack)
  1342. {
  1343. _lpInterm1Str = NULL ;
  1344. _lpInterm2Str = NULL ;
  1345. _lpUnicodeStr = NULL ;
  1346. _hcins = NULL ;
  1347. _cvt_count = 0 ;
  1348. _dwWinCodePage = 0;
  1349. _dwInternetEncoding = 0;
  1350. _dwUTFEncoding = 0;
  1351. _dwUTFEncoding2 = 0;
  1352. _dwUnicodeEncoding = 0;
  1353. _dwConvertType = 0;
  1354. _nSrcSize = 0 ;
  1355. _hcins_dst = 0 ;
  1356. _dwFlag = dwFlag;
  1357. _lpFallBack = lpFallBack;
  1358. return ;
  1359. }
  1360. CICharConverter::~CICharConverter()
  1361. {
  1362. if (_lpInterm1Str)
  1363. {
  1364. LocalFree(_lpInterm1Str);
  1365. _lpInterm1Str = NULL ;
  1366. }
  1367. if (_lpInterm2Str)
  1368. {
  1369. LocalFree(_lpInterm2Str);
  1370. _lpInterm2Str = NULL ;
  1371. }
  1372. if (_lpUnicodeStr)
  1373. {
  1374. LocalFree(_lpUnicodeStr);
  1375. _lpUnicodeStr = NULL ;
  1376. }
  1377. if (_hcins)
  1378. {
  1379. delete _hcins ;
  1380. _hcins = NULL ;
  1381. }
  1382. }
  1383. CICharConverter::CICharConverter(DWORD dwSrcEncoding, DWORD dwDstEncoding)
  1384. {
  1385. _lpInterm1Str = NULL ;
  1386. _lpInterm2Str = NULL ;
  1387. _lpUnicodeStr = NULL ;
  1388. _hcins = NULL ;
  1389. _cvt_count = 0 ;
  1390. _dwWinCodePage = 0;
  1391. _dwInternetEncoding = 0;
  1392. _dwUTFEncoding = 0;
  1393. _dwUTFEncoding2 = 0;
  1394. _dwUnicodeEncoding = 0;
  1395. _dwConvertType = 0;
  1396. _nSrcSize = 0 ;
  1397. _hcins_dst = 0 ;
  1398. ConvertSetup(&dwSrcEncoding,dwDstEncoding);
  1399. return ;
  1400. }
  1401. HRESULT WINAPI IsConvertINetStringAvailable(DWORD dwSrcEncoding, DWORD dwDstEncoding)
  1402. {
  1403. HRESULT hr;
  1404. CICharConverter * INetConvert = new CICharConverter ;
  1405. if (!INetConvert)
  1406. return E_OUTOFMEMORY;
  1407. hr = INetConvert->ConvertSetup(&dwSrcEncoding, dwDstEncoding);
  1408. delete INetConvert;
  1409. return hr ;
  1410. }
  1411. #define DETECTION_BUFFER_NUM 3
  1412. // In CP_AUTO and detection result is UTF7 case, private converter might use high word of *lpdwMode to store internal data, but we need
  1413. // to use it to notify Trident the detection result, currently, we bias to returning correct detection result.
  1414. // This is currently by design. If we get a change to re-prototype conversion object, we can resovle this issue
  1415. HRESULT WINAPI ConvertINetStringEx(LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
  1416. {
  1417. CICharConverter * INetConvert;
  1418. int nSrcSize;
  1419. int nDstSize;
  1420. DWORD dwMode = 0 ;
  1421. // dwDetectResult
  1422. // CP_UNDEFINED :Fail to detect
  1423. // 0 :Not a auto-detect scenario
  1424. // Others :Detected encoding
  1425. DWORD dwDetectResult = CP_UNDEFINED;
  1426. HRESULT hr ;
  1427. if(lpnSrcSize)
  1428. {
  1429. nSrcSize = *lpnSrcSize;
  1430. }
  1431. else
  1432. nSrcSize = -1;
  1433. if ( lpSrcStr && nSrcSize == -1 ) // Get length of lpSrcStr if not given, assuming lpSrcStr is a zero terminate string.
  1434. {
  1435. if ( dwSrcEncoding == CP_UCS_2 )
  1436. nSrcSize = (lstrlenW((WCHAR*)lpSrcStr) << 1) ;
  1437. else
  1438. nSrcSize = lstrlenA(lpSrcStr) ;
  1439. }
  1440. // If there is nothing need to be converted, we return S_OK;
  1441. if (!nSrcSize || !lpSrcStr)
  1442. {
  1443. if (lpnDstSize)
  1444. *lpnDstSize = 0;
  1445. return S_OK;
  1446. }
  1447. INetConvert = new CICharConverter(dwFlag, lpFallBack) ;
  1448. if (!INetConvert)
  1449. return E_OUTOFMEMORY;
  1450. // ASSERT(CP_AUTO != dwDstEncoding);
  1451. // if null specified at dst buffer we'll get the size of required buffer.
  1452. if(!lpDstStr)
  1453. nDstSize = 0;
  1454. else if (lpnDstSize)
  1455. nDstSize = *lpnDstSize;
  1456. else
  1457. nDstSize = 0;
  1458. if (lpdwMode)
  1459. dwMode = *lpdwMode ;
  1460. // In real world, clients uses 28591 as 1252, 28599 as 1254,
  1461. // To correctly convert those extended characters to Unicode,
  1462. // We internally replace it with 1252
  1463. if (dwDstEncoding == CP_UCS_2 || dwDstEncoding == CP_UCS_2_BE)
  1464. {
  1465. if ((dwSrcEncoding == CP_ISO_8859_1) && _IsValidCodePage(CP_1252))
  1466. dwSrcEncoding = CP_1252;
  1467. if ((dwSrcEncoding == CP_ISO_8859_9) && _IsValidCodePage(CP_1254))
  1468. dwSrcEncoding = CP_1254;
  1469. }
  1470. if ((dwDstEncoding == CP_1252) && (dwSrcEncoding == CP_ISO_8859_1))
  1471. {
  1472. dwSrcEncoding = CP_1252;
  1473. }
  1474. if ((dwDstEncoding == CP_1254) && (dwSrcEncoding == CP_ISO_8859_9))
  1475. {
  1476. dwSrcEncoding = CP_1254;
  1477. }
  1478. //
  1479. // Auto Detection for Japan
  1480. // Japanese user often tag their data incorrectly, so, if MLCONVCHARF_DETECTJPN specified,
  1481. // we'll do extra detection for Shift-Jis and EUC
  1482. //
  1483. if ( dwSrcEncoding == CP_JP_AUTO ||
  1484. ((dwFlag & MLCONVCHARF_DETECTJPN) &&
  1485. (dwSrcEncoding == CP_JPN_SJ || dwSrcEncoding == CP_EUC_JP))) // Auto Detection for Japan
  1486. {
  1487. CIncdJapanese DetectJapan(dwSrcEncoding);
  1488. UINT uiCodePage ;
  1489. uiCodePage = ( dwMode >> 16 ) & 0xffff ;
  1490. if ( uiCodePage )
  1491. {
  1492. dwSrcEncoding = uiCodePage ;
  1493. dwDetectResult = 0;
  1494. }
  1495. else
  1496. {
  1497. dwSrcEncoding = DetectJapan.DetectStringA(lpSrcStr, nSrcSize);
  1498. // if dwSrcEncoding is zero means there is an ambiguity, we don't return
  1499. // the detected codepage to caller, instead we defaut its codepage internally
  1500. // to SJIS
  1501. if (dwSrcEncoding)
  1502. {
  1503. dwDetectResult = dwSrcEncoding << 16 ;
  1504. }
  1505. else
  1506. dwSrcEncoding = CP_JPN_SJ;
  1507. }
  1508. }
  1509. // bug #43190, we auto-detect again for euc-kr page because IMN ver 1.0
  1510. // mislabel an ISO-KR page as a ks_c_5601-1987 page. This is the only way
  1511. // we can fix that mistake.
  1512. else if ( dwSrcEncoding == CP_KR_AUTO || dwSrcEncoding == CP_KOR_5601 ||
  1513. dwSrcEncoding == CP_EUC_KR )
  1514. {
  1515. CIncdKorean DetectKorean;
  1516. UINT uiCodePage ;
  1517. uiCodePage = ( dwMode >> 16 ) & 0xffff ;
  1518. if ( uiCodePage )
  1519. {
  1520. dwSrcEncoding = uiCodePage ;
  1521. dwDetectResult = 0;
  1522. }
  1523. else
  1524. {
  1525. dwSrcEncoding = DetectKorean.DetectStringA(lpSrcStr, nSrcSize);
  1526. if (dwSrcEncoding)
  1527. {
  1528. dwDetectResult = dwSrcEncoding << 16 ;
  1529. }
  1530. else
  1531. dwSrcEncoding = CP_KOR_5601;
  1532. }
  1533. }
  1534. else if ( dwSrcEncoding == CP_AUTO ) // General Auto Detection for all code pages
  1535. {
  1536. int _nSrcSize = DETECTION_MAX_LEN < nSrcSize ? DETECTION_MAX_LEN : nSrcSize;
  1537. int nScores = DETECTION_BUFFER_NUM;
  1538. DetectEncodingInfo Encoding[DETECTION_BUFFER_NUM];
  1539. UINT uiCodePage ;
  1540. uiCodePage = ( dwMode >> 16 ) & 0xffff ;
  1541. if ( uiCodePage )
  1542. {
  1543. dwSrcEncoding = uiCodePage ;
  1544. dwDetectResult = 0;
  1545. }
  1546. else
  1547. {
  1548. dwSrcEncoding = g_uACP;
  1549. if ( S_OK == _DetectInputCodepage(MLDETECTCP_HTML, CP_AUTO, (char *)lpSrcStr, &_nSrcSize, &Encoding[0], &nScores))
  1550. {
  1551. MIMECPINFO cpInfo;
  1552. if (Encoding[0].nCodePage == CP_20127)
  1553. Encoding[0].nCodePage = dwSrcEncoding;
  1554. if (NULL != g_pMimeDatabase)
  1555. {
  1556. if (SUCCEEDED(g_pMimeDatabase->GetCodePageInfo(Encoding[0].nCodePage, 0x409, &cpInfo)) &&
  1557. (cpInfo.dwFlags & MIMECONTF_VALID))
  1558. {
  1559. dwSrcEncoding = Encoding[0].nCodePage;
  1560. dwDetectResult = dwSrcEncoding << 16 ;
  1561. }
  1562. }
  1563. }
  1564. // If we failed in general detection and system locale is Jpn, we try harder
  1565. // with our Japanese detection engine
  1566. if (dwSrcEncoding == CP_JPN_SJ && dwDetectResult == CP_UNDEFINED)
  1567. {
  1568. CIncdJapanese DetectJapan;
  1569. DWORD dwSrcEncodingJpn = DetectJapan.DetectStringA(lpSrcStr, nSrcSize);
  1570. if (dwSrcEncodingJpn)
  1571. {
  1572. // We only change conversion encoding without returnning this result to browser
  1573. // if it is in the middle of detection, this is to prevent other encodings been mis-detected as Jpn encodings.
  1574. dwSrcEncoding = dwSrcEncodingJpn;
  1575. // Set search range for end tag as 10 bytes
  1576. if (nSrcSize >= 10)
  1577. {
  1578. char szTmpStr[11] = {0};
  1579. char *lpTmpStr = szTmpStr;
  1580. _tcsncpy(szTmpStr, (char *)&lpSrcStr[nSrcSize-10], 10);
  1581. //ToLower
  1582. while(*lpTmpStr)
  1583. {
  1584. if (*lpTmpStr >= 'A' && *lpTmpStr <= 'W')
  1585. *lpTmpStr += 0x20;
  1586. lpTmpStr++;
  1587. }
  1588. // If end of page, return this result
  1589. if (MLStrStr(szTmpStr, "</html>"))
  1590. dwDetectResult = dwSrcEncoding << 16 ;
  1591. }
  1592. }
  1593. }
  1594. //aEncodingInfo[GetWindowsEncodingIndex(CP_AUTO)].dwCodePage = dwSrcEncoding;
  1595. }
  1596. }
  1597. else
  1598. {
  1599. // Not a auto-detect scenario
  1600. dwDetectResult = 0;
  1601. }
  1602. if ( S_OK == ( hr = INetConvert->ConvertSetup(&dwSrcEncoding,dwDstEncoding )))
  1603. {
  1604. if ( dwSrcEncoding != dwDstEncoding )
  1605. {
  1606. // if high word of dwMode is CP_UTF_7, it must be detection result, don't pass it to UTF7 converter
  1607. if ( dwSrcEncoding == CP_UTF_7 && (dwMode >> 16) == CP_UTF_7)
  1608. dwMode &= 0xFFFF;
  1609. // ASSERT(!((IS_ENCODED_ENCODING(dwSrcEncoding) || IS_ENCODED_ENCODING(dwDstEncoding)) && (NULL == lpdwMode)));
  1610. hr = INetConvert->DoCodeConvert(&dwMode, lpSrcStr, &nSrcSize, lpDstStr, &nDstSize, dwFlag, lpFallBack);
  1611. // return the number of bytes processed for the source.
  1612. if (lpnSrcSize)
  1613. *lpnSrcSize = INetConvert->_nSrcSize ;
  1614. INetConvert->ConvertCleanUp();
  1615. }
  1616. else
  1617. {
  1618. int nSize, i ;
  1619. hr = S_OK ;
  1620. BOOL bLeadByte = FALSE ;
  1621. // only check for windows codepage
  1622. if ( INetConvert->_dwConvertType == 02 && lpSrcStr )
  1623. {
  1624. for ( i=0; i<nSrcSize; i++)
  1625. {
  1626. if (bLeadByte)
  1627. bLeadByte = FALSE ;
  1628. else if (IsDBCSLeadByteEx(dwSrcEncoding,lpSrcStr[i]))
  1629. bLeadByte = TRUE ;
  1630. }
  1631. if (bLeadByte)
  1632. nSrcSize-- ;
  1633. }
  1634. // set input size
  1635. if (lpnSrcSize)
  1636. *lpnSrcSize = nSrcSize ;
  1637. // set output size and copy if we need to
  1638. if (lpDstStr && *lpnDstSize)
  1639. {
  1640. nSize = min(*lpnDstSize,nSrcSize);
  1641. MoveMemory(lpDstStr, lpSrcStr, nSize);
  1642. nDstSize = nSize ;
  1643. }
  1644. else
  1645. nDstSize = nSrcSize ;
  1646. }
  1647. }
  1648. else
  1649. nDstSize = 0 ;
  1650. delete INetConvert;
  1651. // return the number of bytes copied for the destination,
  1652. if (lpnDstSize)
  1653. *lpnDstSize = nDstSize;
  1654. if (lpdwMode && lpDstStr)
  1655. {
  1656. if (dwDetectResult) // CP_AUTO conversion
  1657. {
  1658. dwMode &= 0xFFFF; // Clear HIGHWORD in case private converter set it
  1659. // If we have detection result, return it in HIGHWORD
  1660. // in the case of UTF7 conversion, private converter might use high word to store internal data,
  1661. // this will conflict with our logic of returning detection result in high word, it is a design flaw,
  1662. // currently, we ignore conversion setting and give detection result more priority
  1663. if (dwDetectResult != CP_UNDEFINED)
  1664. dwMode |= dwDetectResult;
  1665. }
  1666. *lpdwMode = dwMode ;
  1667. }
  1668. return hr ;
  1669. }
  1670. // We already published this API, keep it for backward compatibility
  1671. HRESULT WINAPI ConvertINetReset(void)
  1672. {
  1673. // Always suceed
  1674. return S_OK ;
  1675. }
  1676. HRESULT WINAPI ConvertINetMultiByteToUnicodeEx(LPDWORD lpdwMode, DWORD dwEncoding, LPCSTR lpSrcStr, LPINT lpnMultiCharCount, LPWSTR lpDstStr, LPINT lpnWideCharCount, DWORD dwFlag, WCHAR *lpFallBack)
  1677. {
  1678. HRESULT hr ;
  1679. int nByteCountSize = 0;
  1680. if (lpnWideCharCount)
  1681. {
  1682. nByteCountSize = *lpnWideCharCount * sizeof(WCHAR);
  1683. }
  1684. #ifdef UNIX
  1685. int saved_nByteCountSize = nByteCountSize;
  1686. #endif /* UNIX */
  1687. hr = ConvertINetStringEx(lpdwMode,dwEncoding, CP_UCS_2, lpSrcStr, lpnMultiCharCount, (LPSTR)lpDstStr, &nByteCountSize, dwFlag, lpFallBack) ;
  1688. #ifdef UNIX
  1689. if(dwEncoding == 1200 || dwEncoding == 65000 || dwEncoding == 65001 ||
  1690. (dwEncoding == 50001 && !_IsValidCodePage(dwEncoding)) )
  1691. {
  1692. /*
  1693. * On unix we need to convert the little endian mode 2 byte unicode
  1694. * format to unix mode 4 byte wChars.
  1695. */
  1696. if(lpDstStr && (saved_nByteCountSize < (nByteCountSize/2)*sizeof(WCHAR)))
  1697. hr = E_FAIL;
  1698. else
  1699. {
  1700. /*
  1701. * Use a temporary array to do the 2byte -> 4byte conversion
  1702. */
  1703. LPSTR pTmp = (LPSTR) lpDstStr;
  1704. LPWSTR pw4 = NULL;
  1705. if(pTmp) /* allocate only if we have a lpDstStr */
  1706. pw4 = new WCHAR[nByteCountSize/2];
  1707. if(pw4)
  1708. {
  1709. int i = 0;
  1710. LPWSTR pw4Tmp = pw4;
  1711. for(; i < nByteCountSize/2; i++)
  1712. *pw4Tmp++ = (UCHAR)pTmp[i*2];
  1713. pw4Tmp = pw4;
  1714. for(i = 0; i < nByteCountSize/2; i++)
  1715. *lpDstStr++ = *pw4Tmp++;
  1716. }
  1717. if(!pw4 && pTmp) /* if lpDstStr and allocate fails bail out */
  1718. hr = E_FAIL;
  1719. delete [] pw4;
  1720. }
  1721. nByteCountSize *= 2; // Expand twice as we have 4 byte wchars.
  1722. }
  1723. #endif
  1724. *lpnWideCharCount = nByteCountSize / sizeof(WCHAR);
  1725. return hr ;
  1726. }
  1727. HRESULT WINAPI ConvertINetUnicodeToMultiByteEx(LPDWORD lpdwMode, DWORD dwEncoding, LPCWSTR lpSrcStr, LPINT lpnWideCharCount, LPSTR lpDstStr, LPINT lpnMultiCharCount, DWORD dwFlag, WCHAR *lpFallBack)
  1728. {
  1729. HRESULT hr ;
  1730. int nByteCountSize=-1;
  1731. if(lpnWideCharCount && *lpnWideCharCount != -1)
  1732. nByteCountSize = *lpnWideCharCount * sizeof(WCHAR);
  1733. hr = ConvertINetStringEx(lpdwMode,CP_UCS_2, dwEncoding, (LPCSTR) lpSrcStr, &nByteCountSize, lpDstStr, lpnMultiCharCount, dwFlag, lpFallBack);
  1734. #ifdef UNIX
  1735. if(dwEncoding == 1200 || dwEncoding == 65000 || dwEncoding == 65001) {
  1736. nByteCountSize *= 2; // Expand twice as we have 4 byte wchars.
  1737. }
  1738. #endif /* UNIX */
  1739. if (lpnWideCharCount)
  1740. *lpnWideCharCount = nByteCountSize / sizeof(WCHAR);
  1741. return hr ;
  1742. }
  1743. HRESULT WINAPI ConvertINetString(LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize)
  1744. {
  1745. HRESULT hr ;
  1746. hr = ConvertINetStringEx(lpdwMode,dwSrcEncoding,dwDstEncoding,lpSrcStr,lpnSrcSize,lpDstStr,lpnDstSize, 0, NULL);
  1747. return hr ;
  1748. }
  1749. HRESULT WINAPI ConvertINetUnicodeToMultiByte(LPDWORD lpdwMode, DWORD dwEncoding, LPCWSTR lpSrcStr, LPINT lpnWideCharCount, LPSTR lpDstStr, LPINT lpnMultiCharCount)
  1750. {
  1751. HRESULT hr ;
  1752. DWORD dwFlag = 0 ;
  1753. if ( lpdwMode )
  1754. dwFlag |= ( *lpdwMode & 0x00008000 ) ? MLCONVCHARF_ENTITIZE : 0 ;
  1755. hr = ConvertINetUnicodeToMultiByteEx(lpdwMode,dwEncoding,lpSrcStr,lpnWideCharCount,lpDstStr,lpnMultiCharCount,dwFlag,NULL);
  1756. return hr ;
  1757. }
  1758. HRESULT WINAPI ConvertINetMultiByteToUnicode(LPDWORD lpdwMode, DWORD dwEncoding, LPCSTR lpSrcStr, LPINT lpnMultiCharCount, LPWSTR lpDstStr, LPINT lpnWideCharCount)
  1759. {
  1760. HRESULT hr ;
  1761. hr = ConvertINetMultiByteToUnicodeEx(lpdwMode,dwEncoding,lpSrcStr,lpnMultiCharCount,lpDstStr,lpnWideCharCount, 0, NULL);
  1762. return hr ;
  1763. }
  1764. #define STR_BUFFER_SIZE 2048
  1765. HRESULT _ConvertINetStringInIStream(CICharConverter * INetConvert, LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, IStream *pstmIn, IStream *pstmOut, DWORD dwFlag, WCHAR *lpFallBack)
  1766. {
  1767. DWORD dwMode, dwModeTemp ;
  1768. HRESULT hr= S_OK, hrWarnings=S_OK;
  1769. LPSTR lpstrIn = NULL, lpstrOut = NULL;
  1770. ULONG nSrcSize, nSrcUsed, nSrcLeft, nDstSize, _nDstSize, nOutBuffSize ;
  1771. if (lpdwMode)
  1772. dwMode = *lpdwMode ;
  1773. // allocate a temp input buffer - 2K in size
  1774. if ( (lpstrIn = (LPSTR) LocalAlloc(LPTR, STR_BUFFER_SIZE )) == NULL )
  1775. {
  1776. hrWarnings = E_OUTOFMEMORY ;
  1777. goto exit;
  1778. }
  1779. if ( (lpstrOut = (LPSTR) LocalAlloc(LPTR, STR_BUFFER_SIZE * 2 )) == NULL )
  1780. {
  1781. hrWarnings = E_OUTOFMEMORY ;
  1782. goto exit;
  1783. }
  1784. nOutBuffSize = STR_BUFFER_SIZE * 2 ;
  1785. nSrcLeft = 0 ;
  1786. // In real world, clients uses 28591 as 1252, 28599 as 1254,
  1787. // To correctly convert those extended characters to Unicode,
  1788. // We internally replace it with 1252
  1789. if (dwDstEncoding == CP_UCS_2 || dwDstEncoding == CP_UCS_2_BE)
  1790. {
  1791. if ((dwSrcEncoding == CP_ISO_8859_1) && _IsValidCodePage(CP_1252))
  1792. dwSrcEncoding = CP_1252;
  1793. if ((dwSrcEncoding == CP_ISO_8859_9) && _IsValidCodePage(CP_1254))
  1794. dwSrcEncoding = CP_1254;
  1795. }
  1796. if ((dwDstEncoding == CP_1252) && (dwSrcEncoding == CP_ISO_8859_1))
  1797. {
  1798. dwSrcEncoding = CP_1252;
  1799. }
  1800. if ((dwDstEncoding == CP_1254) && (dwSrcEncoding == CP_ISO_8859_9))
  1801. {
  1802. dwSrcEncoding = CP_1254;
  1803. }
  1804. if ( dwSrcEncoding == CP_JP_AUTO ) // Auto Detection for Japan
  1805. {
  1806. CIncdJapanese DetectJapan;
  1807. UINT uiCodePage ;
  1808. LARGE_INTEGER li;
  1809. uiCodePage = ( dwMode >> 16 ) & 0xffff ;
  1810. if ( uiCodePage )
  1811. dwSrcEncoding = uiCodePage ;
  1812. else
  1813. {
  1814. LISet32(li, 0);
  1815. hr = pstmIn->Read(lpstrIn, STR_BUFFER_SIZE , &nSrcSize);
  1816. if (S_OK != hr)
  1817. hrWarnings = hr;
  1818. hr = pstmIn->Seek(li,STREAM_SEEK_SET, NULL);
  1819. if (S_OK != hr)
  1820. hrWarnings = hr;
  1821. dwSrcEncoding = DetectJapan.DetectStringA(lpstrIn, nSrcSize);
  1822. // if dwSrcEncoding is zero means there is an ambiguity, we don't return
  1823. // the detected codepage to caller, instead we defaut its codepage internally
  1824. // to SJIS
  1825. if (dwSrcEncoding)
  1826. {
  1827. dwMode &= 0x0000ffff ;
  1828. dwMode |= dwSrcEncoding << 16 ;
  1829. }
  1830. else
  1831. dwSrcEncoding = CP_JPN_SJ;
  1832. }
  1833. }
  1834. // bug #43190, we auto-detect again for euc-kr page because IMN ver 1.0
  1835. // mislabel an ISO-KR page as a ks_c_5601-1987 page. This is the only way
  1836. // we can fix that mistake.
  1837. else if ( dwSrcEncoding == CP_KR_AUTO || dwSrcEncoding == CP_KOR_5601 ||
  1838. dwSrcEncoding == CP_EUC_KR )
  1839. {
  1840. CIncdKorean DetectKorean;
  1841. UINT uiCodePage ;
  1842. LARGE_INTEGER li;
  1843. uiCodePage = ( dwMode >> 16 ) & 0xffff ;
  1844. if ( uiCodePage )
  1845. dwSrcEncoding = uiCodePage ;
  1846. else
  1847. {
  1848. LISet32(li, 0);
  1849. hr = pstmIn->Read(lpstrIn, STR_BUFFER_SIZE, &nSrcSize);
  1850. if (S_OK != hr)
  1851. hrWarnings = hr;
  1852. hr = pstmIn->Seek(li,STREAM_SEEK_SET, NULL);
  1853. if (S_OK != hr)
  1854. hrWarnings = hr;
  1855. dwSrcEncoding = DetectKorean.DetectStringA(lpstrIn, nSrcSize);
  1856. if (dwSrcEncoding)
  1857. {
  1858. dwMode &= 0x0000ffff ;
  1859. dwMode |= dwSrcEncoding << 16 ;
  1860. }
  1861. else
  1862. dwSrcEncoding = CP_KOR_5601;
  1863. }
  1864. }
  1865. else if ( dwSrcEncoding == CP_AUTO ) // General Auto Detection for all code pages
  1866. {
  1867. INT nScores = 1;
  1868. DWORD dwSrcEncoding ;
  1869. DetectEncodingInfo Encoding;
  1870. UINT uiCodePage ;
  1871. LARGE_INTEGER li;
  1872. uiCodePage = ( dwMode >> 16 ) & 0xffff ;
  1873. if ( uiCodePage )
  1874. dwSrcEncoding = uiCodePage ;
  1875. else
  1876. {
  1877. LISet32(li, 0);
  1878. hr = pstmIn->Read(lpstrIn, STR_BUFFER_SIZE , &nSrcSize);
  1879. if (S_OK != hr)
  1880. hrWarnings = hr;
  1881. hr = pstmIn->Seek(li,STREAM_SEEK_SET, NULL);
  1882. if (S_OK != hr)
  1883. hrWarnings = hr;
  1884. if (DETECTION_MAX_LEN < nSrcSize)
  1885. nSrcSize = DETECTION_MAX_LEN;
  1886. if ( S_OK == _DetectInputCodepage(MLDETECTCP_HTML, 1252, lpstrIn, (int *)&nSrcSize, &Encoding, &nScores))
  1887. {
  1888. dwSrcEncoding = Encoding.nCodePage;
  1889. dwMode &= 0x0000ffff ;
  1890. dwMode |= dwSrcEncoding << 16 ;
  1891. }
  1892. else
  1893. {
  1894. dwSrcEncoding = CP_ACP;
  1895. }
  1896. aEncodingInfo[GetWindowsEncodingIndex(CP_AUTO)].dwCodePage = dwSrcEncoding;
  1897. }
  1898. }
  1899. if ( S_OK == ( hr = INetConvert->ConvertSetup(&dwSrcEncoding,dwDstEncoding )))
  1900. {
  1901. // Loop for ever
  1902. while(1)
  1903. {
  1904. // Read a buffer
  1905. hr = pstmIn->Read(&lpstrIn[nSrcLeft], STR_BUFFER_SIZE-nSrcLeft, &nSrcSize);
  1906. if (S_OK != hr)
  1907. hrWarnings = hr;
  1908. // Done
  1909. if (0 == nSrcSize)
  1910. break;
  1911. nSrcSize += nSrcLeft ;
  1912. nSrcUsed = nSrcSize ;
  1913. dwModeTemp = dwMode ;
  1914. nDstSize = 0 ;
  1915. // get the size of output buffer
  1916. hr = INetConvert->DoCodeConvert(&dwModeTemp, (LPCSTR)lpstrIn, (LPINT)&nSrcUsed, NULL, (LPINT)&nDstSize, dwFlag, lpFallBack);
  1917. if (S_OK != hr)
  1918. hrWarnings = hr;
  1919. // Reallocate output buffer if so
  1920. if ( nDstSize > nOutBuffSize )
  1921. {
  1922. LPSTR psz = (LPSTR) LocalReAlloc(lpstrOut, nDstSize, LMEM_ZEROINIT|LMEM_MOVEABLE);
  1923. if (psz == NULL)
  1924. {
  1925. hrWarnings = E_OUTOFMEMORY ;
  1926. goto exit;
  1927. }
  1928. lpstrOut = psz;
  1929. nOutBuffSize = nDstSize ;
  1930. }
  1931. _nDstSize = nDstSize;
  1932. // Due to multi_stage conversion, this is the actual size is used
  1933. nSrcUsed = INetConvert->_nSrcSize ;
  1934. nSrcLeft = nSrcSize - nSrcUsed ;
  1935. #if 0
  1936. // restore Src size
  1937. nSrcUsed = nSrcSize ;
  1938. #endif
  1939. // do conversion
  1940. hr = INetConvert->DoCodeConvert(&dwMode, (LPCSTR)lpstrIn, (LPINT)&nSrcUsed, lpstrOut, (LPINT)&_nDstSize, dwFlag, lpFallBack);
  1941. if (S_OK != hr)
  1942. hrWarnings = hr;
  1943. // Write It
  1944. hr = pstmOut->Write(lpstrOut, nDstSize, &nDstSize);
  1945. if (S_OK != hr)
  1946. hrWarnings = hr;
  1947. if (nSrcLeft )
  1948. MoveMemory(lpstrIn, &lpstrIn[nSrcSize-nSrcLeft],nSrcLeft);
  1949. INetConvert->ConvertCleanUp();
  1950. }
  1951. }
  1952. if (nSrcLeft )
  1953. {
  1954. LARGE_INTEGER li;
  1955. LISet32(li, -(LONG)nSrcLeft );
  1956. hr = pstmIn->Seek(li,STREAM_SEEK_CUR, NULL);
  1957. }
  1958. if (lpdwMode)
  1959. *lpdwMode = dwMode ;
  1960. exit :
  1961. if (lpstrIn)
  1962. LocalFree(lpstrIn);
  1963. if (lpstrOut)
  1964. LocalFree(lpstrOut);
  1965. // Done
  1966. return (hr == S_OK) ? hrWarnings : hr;
  1967. }
  1968. HRESULT WINAPI ConvertINetStringInIStream(LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, IStream *pstmIn, IStream *pstmOut, DWORD dwFlag, WCHAR *lpFallBack)
  1969. {
  1970. HRESULT hr;
  1971. CICharConverter * INetConvert = new CICharConverter(dwFlag, lpFallBack) ;
  1972. if (!INetConvert)
  1973. return E_OUTOFMEMORY;
  1974. hr = _ConvertINetStringInIStream(INetConvert,lpdwMode,dwSrcEncoding,dwDstEncoding,pstmIn,pstmOut,dwFlag,lpFallBack);
  1975. delete INetConvert;
  1976. return hr ;
  1977. }