Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

216 lines
4.0 KiB

  1. #include "precomp.h"
  2. #include "utf8str.h"
  3. CUTF8String::~CUTF8String()
  4. {
  5. if (ALLOC_UNICODE & m_eAlloc)
  6. {
  7. delete m_pwszUnicode;
  8. }
  9. else if (ALLOC_UTF8 & m_eAlloc)
  10. {
  11. delete m_pszUTF8;
  12. }
  13. }
  14. CUTF8String::operator LPWSTR()
  15. {
  16. if ((NULL == m_pwszUnicode) && (NULL != m_pszUTF8))
  17. {
  18. DecodeUTF8();
  19. }
  20. return m_pwszUnicode;
  21. }
  22. CUTF8String::operator LPSTR()
  23. {
  24. if ((NULL == m_pszUTF8) && (NULL != m_pwszUnicode))
  25. {
  26. EncodeUTF8();
  27. }
  28. return m_pszUTF8;
  29. }
  30. VOID CUTF8String::EncodeUTF8()
  31. {
  32. DebugEntry(CUTF8String::EncodeUTF8);
  33. m_hr = S_OK;
  34. ASSERT(NULL != m_pwszUnicode);
  35. int cchUTF8 = 1; // always include a NULL terminator
  36. // First make a pass to see how many characters we will be converting.
  37. LPWSTR pwsz = m_pwszUnicode;
  38. while (L'\0' != *pwsz)
  39. {
  40. WCHAR wch = *pwsz++;
  41. if (wch < 0x80)
  42. {
  43. cchUTF8 += 1;
  44. }
  45. else if (wch < 0x800)
  46. {
  47. cchUTF8 += 2;
  48. }
  49. else
  50. {
  51. cchUTF8 += 3;
  52. }
  53. }
  54. ASSERT(NULL == m_pszUTF8);
  55. m_pszUTF8 = new CHAR[cchUTF8];
  56. if (NULL != m_pszUTF8)
  57. {
  58. ASSERT(ALLOC_NONE == m_eAlloc);
  59. m_eAlloc = ALLOC_UTF8;
  60. // Start encoding here:
  61. const BYTE cUtf8FirstSignal[4] = {0x00, 0x00, 0xC0, 0xE0};
  62. const BYTE cMask = 0xBF;
  63. const BYTE cSignal = 0x80;
  64. LPSTR pszStop = m_pszUTF8 + cchUTF8;
  65. LPSTR pszDst = m_pszUTF8;
  66. pwsz = m_pwszUnicode;
  67. while (pszDst < pszStop)
  68. {
  69. WCHAR wch = *pwsz++;
  70. #ifdef DEBUG
  71. if (L'\0' == wch)
  72. {
  73. ASSERT(pszDst == pszStop - 1);
  74. }
  75. #endif // DEBUG
  76. int cchTotal;
  77. if (wch < 0x80)
  78. {
  79. cchTotal = 1;
  80. }
  81. else if (wch < 0x800)
  82. {
  83. cchTotal = 2;
  84. }
  85. else
  86. {
  87. cchTotal = 3;
  88. }
  89. pszDst += cchTotal;
  90. switch (cchTotal)
  91. {
  92. case 3:
  93. *--pszDst = (wch | cSignal) & cMask;
  94. wch >>= 6;
  95. // FALL THROUGH
  96. case 2:
  97. *--pszDst = (wch | cSignal) & cMask;
  98. wch >>= 6;
  99. // FALL THROUGH
  100. case 1:
  101. *--pszDst = (wch | cUtf8FirstSignal[cchTotal]);
  102. }
  103. pszDst += cchTotal;
  104. }
  105. m_hr = S_OK;
  106. }
  107. else
  108. {
  109. m_hr = E_OUTOFMEMORY;
  110. }
  111. DebugExitVOID(CUTF8String::EncodeUTF8);
  112. }
  113. VOID CUTF8String::DecodeUTF8()
  114. {
  115. DebugEntry(CUTF8String::DecodeUTF8);
  116. m_hr = S_OK;
  117. ASSERT(NULL != m_pszUTF8);
  118. int cchUnicode = 1; // always include a NULL terminator
  119. LPSTR psz = m_pszUTF8;
  120. // First determine the destination size (cchUnicode)
  121. while ('\0' != *psz)
  122. {
  123. int cbChar = 0;
  124. BYTE bFirst = (BYTE) *psz;
  125. while (bFirst & 0x80)
  126. {
  127. cbChar++;
  128. ASSERT(cbChar < 8);
  129. bFirst <<= 1;
  130. }
  131. cbChar = max(1, cbChar);
  132. psz += cbChar;
  133. cchUnicode++;
  134. }
  135. ASSERT(NULL == m_pwszUnicode);
  136. m_pwszUnicode = new WCHAR[cchUnicode];
  137. if (NULL != m_pwszUnicode)
  138. {
  139. ASSERT(ALLOC_NONE == m_eAlloc);
  140. m_eAlloc = ALLOC_UNICODE;
  141. // Start decoding here:
  142. LPWSTR pwszStop = m_pwszUnicode + cchUnicode;
  143. LPWSTR pwszDst = m_pwszUnicode;
  144. psz = m_pszUTF8;
  145. while (pwszDst < pwszStop)
  146. {
  147. int cbChar = 0;
  148. BYTE bFirst = (BYTE) *psz;
  149. while (bFirst & 0x80)
  150. {
  151. cbChar++;
  152. ASSERT(cbChar < 8);
  153. bFirst <<= 1;
  154. }
  155. BOOL fValid = TRUE;
  156. WCHAR wch = L'\0';
  157. switch (cbChar)
  158. {
  159. case 6: psz++; // FALLTHROUGH // we don't handle
  160. case 5: psz++; // FALLTHROUGH // UCS-4; skip first
  161. case 4: psz++; // FALLTHROUGH // three bytes
  162. case 3:
  163. wch = WCHAR(*psz++ & 0x0f) << 12; // 0x0800 - 0xffff
  164. fValid = fValid && ((*psz & 0xc0) == 0x80);
  165. // FALLTHROUGH
  166. case 2:
  167. wch |= WCHAR(*psz++ & 0x3f) << 6; // 0x0080 - 0x07ff
  168. fValid = fValid && ((*psz & 0xc0) == 0x80);
  169. wch |= WCHAR(*psz++ & 0x3f);
  170. break;
  171. case 0:
  172. wch = WCHAR(*psz++); // 0x0000 - 0x007f
  173. break;
  174. default:
  175. ERROR_OUT(("CUTF8String::DecodeUTF8 found invalid UTF-8 lead byte"));
  176. wch = L'?';
  177. psz += cbChar;
  178. break;
  179. }
  180. if (FALSE == fValid)
  181. {
  182. ERROR_OUT(("CUTF8String::DecodeUTF8 found bad UTF-8 sequence"));
  183. *pwszDst = L'\0';
  184. m_hr = E_FAIL;
  185. break;
  186. }
  187. #ifdef DEBUG
  188. cchUnicode--;
  189. #endif // DEBUG
  190. *pwszDst++ = wch;
  191. }
  192. ASSERT(0 == cchUnicode);
  193. }
  194. else
  195. {
  196. m_hr = E_OUTOFMEMORY;
  197. }
  198. DebugExitVOID(CUTF8String::DecodeUTF8);
  199. }