Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

316 lines
9.8 KiB

  1. /*****************************************************************************\
  2. FILE: encoding.cpp
  3. DESCRIPTION:
  4. Handle taking internet strings by detecting if they are UTF-8 encoded
  5. or DBCS and finding out what code page was used.
  6. \*****************************************************************************/
  7. #include "priv.h"
  8. #include "util.h"
  9. #include "ftpurl.h"
  10. #include "statusbr.h"
  11. #include <commctrl.h>
  12. #include <shdocvw.h>
  13. /*****************************************************************************\
  14. CLASS: CMultiLanguageCache
  15. \*****************************************************************************/
  16. HRESULT CMultiLanguageCache::_Init(void)
  17. {
  18. if (m_pml2)
  19. return S_OK;
  20. return CoCreateInstance(CLSID_CMultiLanguage, NULL, CLSCTX_INPROC_SERVER, IID_IMultiLanguage2, (void **) &m_pml2);
  21. }
  22. /*****************************************************************************\
  23. CLASS: CWireEncoding
  24. \*****************************************************************************/
  25. CWireEncoding::CWireEncoding(void)
  26. {
  27. // We can go on the stack, so we may not be zero inited.
  28. m_nConfidence = 0;
  29. m_uiCodePage = CP_ACP; //
  30. m_dwMode = 0;
  31. m_fUseUTF8 = FALSE;
  32. }
  33. CWireEncoding::~CWireEncoding(void)
  34. {
  35. }
  36. void CWireEncoding::_ImproveAccuracy(CMultiLanguageCache * pmlc, LPCWIRESTR pwStr, BOOL fUpdateCP, UINT * puiCodePath)
  37. {
  38. DetectEncodingInfo dei = {0};
  39. INT nStructs = 1;
  40. INT cchSize = lstrlenA(pwStr);
  41. IMultiLanguage2 * pml2 = pmlc->GetIMultiLanguage2();
  42. // Assume we will use the normal code page.
  43. *puiCodePath = m_uiCodePage;
  44. if (S_OK == pml2->DetectInputCodepage(MLDETECTCP_8BIT, CP_AUTO, (LPWIRESTR)pwStr, &cchSize, &dei, (INT *)&nStructs))
  45. {
  46. // Is it UTF8 or just plain ansi(CP_20127)?
  47. if (((CP_UTF_8 == dei.nCodePage) || (CP_20127 == dei.nCodePage)) &&
  48. (dei.nConfidence > 70))
  49. {
  50. // Yes, so make sure the caller uses UTF8 to decode but don't update
  51. // the codepage.
  52. *puiCodePath = CP_UTF_8;
  53. }
  54. else
  55. {
  56. if (fUpdateCP && (dei.nConfidence > m_nConfidence))
  57. {
  58. m_uiCodePage = dei.nCodePage;
  59. m_nConfidence = dei.nConfidence;
  60. }
  61. }
  62. }
  63. }
  64. HRESULT CWireEncoding::WireBytesToUnicode(CMultiLanguageCache * pmlc, LPCWIRESTR pwStr, DWORD dwFlags, LPWSTR pwzDest, DWORD cchSize)
  65. {
  66. HRESULT hr = S_OK;
  67. // Optimize for the fast common case.
  68. if (Is7BitAnsi(pwStr))
  69. {
  70. pwzDest[0] = 0;
  71. SHAnsiToUnicodeCP(CP_UTF_8, pwStr, pwzDest, cchSize);
  72. hr = S_OK;
  73. }
  74. else
  75. {
  76. #ifdef FEATURE_CP_AUTODETECT
  77. if (this)
  78. {
  79. CMultiLanguageCache mlcTemp;
  80. UINT cchSizeTemp = cchSize;
  81. UINT uiCodePageToUse;
  82. if (!pmlc)
  83. pmlc = &mlcTemp;
  84. if (!pmlc || !pmlc->GetIMultiLanguage2())
  85. return E_FAIL;
  86. IMultiLanguage2 * pml2 = pmlc->GetIMultiLanguage2();
  87. _ImproveAccuracy(pmlc, pwStr, (WIREENC_IMPROVE_ACCURACY & dwFlags), &uiCodePageToUse);
  88. if (CP_ACP == uiCodePageToUse)
  89. uiCodePageToUse = GetACP();
  90. UINT cchSrcSize = lstrlenA(pwStr) + 1; // The need to do the terminator also.
  91. hr = pml2->ConvertStringToUnicode(&m_dwMode, uiCodePageToUse, (LPWIRESTR)pwStr, &cchSrcSize, pwzDest, &cchSizeTemp);
  92. if (!(EVAL(S_OK == hr)))
  93. SHAnsiToUnicode(pwStr, pwzDest, cchSize);
  94. }
  95. else
  96. #endif // FEATURE_CP_AUTODETECT
  97. {
  98. UINT uiCodePage = ((WIREENC_USE_UTF8 & dwFlags) ? CP_UTF_8 : CP_ACP);
  99. SHAnsiToUnicodeCP(uiCodePage, pwStr, pwzDest, cchSize);
  100. }
  101. }
  102. return hr;
  103. }
  104. HRESULT CWireEncoding::UnicodeToWireBytes(CMultiLanguageCache * pmlc, LPCWSTR pwzStr, DWORD dwFlags, LPWIRESTR pwDest, DWORD cchSize)
  105. {
  106. HRESULT hr = S_OK;
  107. #ifdef FEATURE_CP_AUTODETECT
  108. CMultiLanguageCache mlcTemp;
  109. DWORD dwCodePage = CP_UTF_8;
  110. DWORD dwModeTemp = 0;
  111. DWORD * pdwMode = &dwModeTemp;
  112. UINT cchSizeTemp = cchSize;
  113. // In some cases, we don't know the site, so we use this.
  114. // Come back and force this to be set if we want to support
  115. // the code page detection.
  116. if (this)
  117. {
  118. dwCodePage = m_uiCodePage;
  119. pdwMode = &m_dwMode;
  120. }
  121. if (!pmlc)
  122. pmlc = &mlcTemp;
  123. if (!pmlc)
  124. return E_FAIL;
  125. IMultiLanguage2 * pml2 = pmlc->GetIMultiLanguage2();
  126. // if (WIREENC_USE_UTF8 & dwFlags)
  127. // dwCodePage = CP_UTF_8;
  128. UINT cchSrcSize = lstrlenW(pwzStr) + 1; // The need to do the terminator also.
  129. if (CP_ACP == dwCodePage)
  130. dwCodePage = GetACP();
  131. hr = pml2->ConvertStringFromUnicode(pdwMode, dwCodePage, (LPWSTR) pwzStr, &cchSrcSize, pwDest, &cchSizeTemp);
  132. if (!(EVAL(S_OK == hr)))
  133. SHUnicodeToAnsi(pwzStr, pwDest, cchSize);
  134. #else // FEATURE_CP_AUTODETECT
  135. UINT nCodePage = ((WIREENC_USE_UTF8 & dwFlags) ? CP_UTF_8 : CP_ACP);
  136. SHUnicodeToAnsiCP(nCodePage, pwzStr, pwDest, cchSize);
  137. #endif // FEATURE_CP_AUTODETECT
  138. return hr;
  139. }
  140. HRESULT CWireEncoding::ReSetCodePages(CMultiLanguageCache * pmlc, CFtpPidlList * pFtpPidlList)
  141. {
  142. CMultiLanguageCache mlcTemp;
  143. if (!pmlc)
  144. pmlc = &mlcTemp;
  145. if (!pmlc)
  146. return E_FAIL;
  147. // Implement if we decide we need this feature. We don't after Win2k and
  148. // we don't see the need being large enought to do the work.
  149. return S_OK;
  150. }
  151. HRESULT CWireEncoding::CreateFtpItemID(CMultiLanguageCache * pmlc, LPFTP_FIND_DATA pwfd, LPITEMIDLIST * ppidl)
  152. {
  153. CMultiLanguageCache mlcTemp;
  154. WCHAR wzDisplayName[MAX_PATH];
  155. if (!pmlc)
  156. pmlc = &mlcTemp;
  157. WireBytesToUnicode(pmlc, pwfd->cFileName, (m_fUseUTF8 ? WIREENC_USE_UTF8 : WIREENC_NONE), wzDisplayName, ARRAYSIZE(wzDisplayName));
  158. return FtpItemID_CreateReal(pwfd, wzDisplayName, ppidl);
  159. }
  160. HRESULT CWireEncoding::ChangeFtpItemIDName(CMultiLanguageCache * pmlc, LPCITEMIDLIST pidlBefore, LPCWSTR pwzNewName, BOOL fUTF8, LPITEMIDLIST * ppidlAfter)
  161. {
  162. CMultiLanguageCache mlcTemp;
  163. WIRECHAR wWireName[MAX_PATH];
  164. HRESULT hr;
  165. if (!pmlc)
  166. pmlc = &mlcTemp;
  167. hr = UnicodeToWireBytes(pmlc, pwzNewName, (fUTF8 ? WIREENC_USE_UTF8 : WIREENC_NONE), wWireName, ARRAYSIZE(wWireName));
  168. if (EVAL(SUCCEEDED(hr)))
  169. hr = FtpItemID_CreateWithNewName(pidlBefore, pwzNewName, wWireName, ppidlAfter);
  170. return hr;
  171. }
  172. BOOL SHIsUTF8Encoded(LPCWIRESTR pszIsUTF8)
  173. {
  174. unsigned int len = lstrlenA(pszIsUTF8);
  175. LPCWIRESTR endbuf = pszIsUTF8 + len;
  176. unsigned char byte2mask = 0x00;
  177. unsigned char c;
  178. int trailing = 0; // trailing (continuation) bytes to follow
  179. while (pszIsUTF8 != endbuf)
  180. {
  181. c = *pszIsUTF8++;
  182. if (trailing)
  183. {
  184. if ((c & 0xC0) == 0x80) // Does trailing byte follow UTF-8 format?
  185. {
  186. if (byte2mask) // Need to check 2nd byte for proper range?
  187. {
  188. if (c & byte2mask) // Are appropriate bits set?
  189. byte2mask=0x00;
  190. else
  191. return 0;
  192. trailing--;
  193. }
  194. }
  195. else
  196. return FALSE;
  197. }
  198. else
  199. {
  200. if ((c & 0x80) == 0x00)
  201. continue; // valid 1 byte UTF-8
  202. else
  203. {
  204. if ((c & 0xE0) == 0xC0) // valid 2 byte UTF-8
  205. {
  206. if (c & 0x1E) // Is UTF-8 byte in proper range?
  207. {
  208. trailing =1;
  209. }
  210. else
  211. return FALSE;
  212. }
  213. else
  214. {
  215. if ((c & 0xF0) == 0xE0) // valid 3 byte UTF-8
  216. {
  217. if (!(c & 0x0F)) // Is UTF-8 byte in proper range?
  218. byte2mask=0x20; // If not set mask to check next byte
  219. trailing = 2;
  220. }
  221. else
  222. {
  223. if ((c & 0xF8) == 0xF0) // valid 4 byte UTF-8
  224. {
  225. if (!(c & 0x07)) // Is UTF-8 byte in proper range?
  226. byte2mask=0x30; // If not set mask to check next byte
  227. trailing = 3;
  228. }
  229. else
  230. {
  231. if ((c & 0xFC) == 0xF8) // valid 5 byte UTF-8
  232. {
  233. if (!(c & 0x03)) // Is UTF-8 byte in proper range?
  234. byte2mask=0x38; // If not set mask to check next byte
  235. trailing = 4;
  236. }
  237. else
  238. {
  239. if ((c & 0xFE) == 0xFC) // valid 6 byte UTF-8
  240. {
  241. if (!(c & 0x01)) // Is UTF-8 byte in proper range?
  242. byte2mask=0x3C; // If not set mask to check next byte
  243. trailing = 5;
  244. }
  245. else
  246. return FALSE;
  247. }
  248. }
  249. }
  250. }
  251. }
  252. }
  253. }
  254. return (trailing == 0);
  255. }