Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

337 lines
11 KiB

  1. // ============================================================================
  2. // Internet Character Set Conversion: Input from UTF-8
  3. // ============================================================================
  4. #include "private.h"
  5. #include "fechrcnv.h"
  6. #include "utf8obj.h"
  7. /******************************************************************************
  8. ************************** C O N S T R U C T O R **************************
  9. ******************************************************************************/
  10. CInccUTF8In::CInccUTF8In(UINT uCodePage, int nCodeSet) : CINetCodeConverter(uCodePage, nCodeSet)
  11. {
  12. Reset(); // initialization
  13. return ;
  14. }
  15. /******************************************************************************
  16. ******************************* R E S E T *********************************
  17. ******************************************************************************/
  18. void CInccUTF8In::Reset()
  19. {
  20. m_pfnConv = ConvMain;
  21. m_pfnCleanUp = CleanUpMain;
  22. m_nByteFollow = 0 ;
  23. m_tcUnicode = 0 ;
  24. m_tcSurrogateUnicode = 0 ;
  25. m_nBytesUsed = 0 ;
  26. m_fSurrogatesPairs = FALSE;
  27. return ;
  28. }
  29. /******************************************************************************
  30. ************************* C O N V E R T C H A R *************************
  31. ******************************************************************************/
  32. HRESULT CInccUTF8In::ConvertChar(UCHAR tc, int cchSrc)
  33. {
  34. BOOL fDone = (this->*m_pfnConv)(tc);
  35. if (fDone)
  36. return S_OK;
  37. else
  38. return E_FAIL;
  39. }
  40. /******************************************************************************
  41. ***************************** C L E A N U P *****************************
  42. ******************************************************************************/
  43. BOOL CInccUTF8In::CleanUp()
  44. {
  45. return (this->*m_pfnCleanUp)();
  46. }
  47. /******************************************************************************
  48. **************************** C O N V M A I N ****************************
  49. ******************************************************************************/
  50. BOOL CInccUTF8In::ConvMain(UCHAR tc)
  51. {
  52. BOOL fDone = TRUE;
  53. if( ( 0x80 & tc ) == 0 ) // BIT7 == 0 ASCII
  54. {
  55. Output(tc);
  56. fDone = Output(0);
  57. m_nBytesUsed = 0 ;
  58. }
  59. else if( (0x40 & tc) == 0 ) // BIT6 == 0 a trail byte
  60. {
  61. if( m_nByteFollow )
  62. {
  63. if (m_fSurrogatesPairs)
  64. {
  65. m_nByteFollow--;
  66. m_tcSurrogateUnicode <<= 6; // Make room for trail byte
  67. m_tcSurrogateUnicode |= ( 0x3F & tc ); // LOWER_6BIT add trail byte value
  68. if( m_nByteFollow == 0) // End of sequence, advance output ptr
  69. {
  70. m_tcUnicode = (WCHAR)(((m_tcSurrogateUnicode - 0x10000) >> 10) + HIGHT_SURROGATE_START);
  71. tc = (UCHAR)m_tcUnicode ;
  72. if ( fDone = Output(tc) )
  73. {
  74. tc = (UCHAR) ( m_tcUnicode >> 8 ) ;
  75. fDone = Output(tc);
  76. }
  77. m_tcUnicode = (WCHAR)((m_tcSurrogateUnicode - 0x10000)%0x400 + LOW_SURROGATE_START);
  78. tc = (UCHAR)m_tcUnicode ;
  79. if ( fDone = Output(tc) )
  80. {
  81. tc = (UCHAR) ( m_tcUnicode >> 8 ) ;
  82. fDone = Output(tc);
  83. }
  84. m_fSurrogatesPairs = 0;
  85. m_nBytesUsed = 0 ;
  86. }
  87. else
  88. m_nBytesUsed++ ;
  89. }
  90. else
  91. {
  92. m_nByteFollow--;
  93. m_tcUnicode <<= 6; // make room for trail byte
  94. m_tcUnicode |= ( 0x3F & tc ); // LOWER_6BIT add trail byte value
  95. if( m_nByteFollow == 0) // end of sequence, advance output ptr
  96. {
  97. tc = (UCHAR)m_tcUnicode ;
  98. if ( fDone = Output(tc) )
  99. {
  100. tc = (UCHAR) ( m_tcUnicode >> 8 ) ;
  101. fDone = Output(tc);
  102. }
  103. m_nBytesUsed = 0 ;
  104. }
  105. else
  106. m_nBytesUsed++ ;
  107. }
  108. }
  109. else // error - ignor and rest
  110. {
  111. m_nBytesUsed = 0 ;
  112. m_nByteFollow = 0 ;
  113. }
  114. }
  115. else // a lead byte
  116. {
  117. if( m_nByteFollow > 0 ) // error, previous sequence not finished
  118. {
  119. m_nByteFollow = 0;
  120. Output(' ');
  121. fDone = Output(0);
  122. m_nBytesUsed = 0 ;
  123. }
  124. else // calculate # bytes to follow
  125. {
  126. while( (0x80 & tc) != 0) // BIT7 until first 0 encountered from left to right
  127. {
  128. tc <<= 1;
  129. m_nByteFollow++;
  130. }
  131. if (m_nByteFollow == 4)
  132. {
  133. m_fSurrogatesPairs = TRUE;
  134. m_tcSurrogateUnicode = tc >> m_nByteFollow;
  135. }
  136. else
  137. {
  138. m_tcUnicode = ( tc >> m_nByteFollow ) ;
  139. m_nBytesUsed = 1 ; // # bytes used
  140. }
  141. m_nByteFollow--; // # bytes to follow
  142. }
  143. }
  144. return fDone;
  145. }
  146. /******************************************************************************
  147. ************************ C L E A N U P M A I N ************************
  148. ******************************************************************************/
  149. BOOL CInccUTF8In::CleanUpMain()
  150. {
  151. return TRUE;
  152. }
  153. int CInccUTF8In::GetUnconvertBytes()
  154. {
  155. return m_nBytesUsed < 4 ? m_nBytesUsed : 3 ;
  156. }
  157. DWORD CInccUTF8In::GetConvertMode()
  158. {
  159. // UTF8 does not use mode esc sequence
  160. return 0 ;
  161. }
  162. void CInccUTF8In::SetConvertMode(DWORD mode)
  163. {
  164. Reset(); // initialization
  165. // UTF8 does not use mode esc sequence
  166. return ;
  167. }
  168. // ============================================================================
  169. // Internet Character Set Conversion: Output to UTF-8
  170. // ============================================================================
  171. /******************************************************************************
  172. ************************** C O N S T R U C T O R **************************
  173. ******************************************************************************/
  174. CInccUTF8Out::CInccUTF8Out(UINT uCodePage, int nCodeSet) : CINetCodeConverter(uCodePage, nCodeSet)
  175. {
  176. Reset(); // initialization
  177. return ;
  178. }
  179. /******************************************************************************
  180. ******************************* R E S E T *********************************
  181. ******************************************************************************/
  182. void CInccUTF8Out::Reset()
  183. {
  184. m_fDoubleByte = FALSE;
  185. m_wchSurrogateHigh = 0;
  186. return ;
  187. }
  188. HRESULT CInccUTF8Out::ConvertChar(UCHAR tc, int cchSrc)
  189. {
  190. BOOL fDone = TRUE;
  191. WORD uc ;
  192. UCHAR UTF8[4] ;
  193. if (m_fDoubleByte )
  194. {
  195. uc = ( (WORD) tc << 8 | m_tcLeadByte ) ;
  196. if (uc >= HIGHT_SURROGATE_START && uc <= HIGHT_SURROGATE_END && cchSrc >= sizeof(WCHAR))
  197. {
  198. if (m_wchSurrogateHigh)
  199. {
  200. UTF8[0] = 0xe0 | ( m_wchSurrogateHigh >> 12 ); // 4 bits in first byte
  201. UTF8[1] = 0x80 | ( ( m_wchSurrogateHigh >> 6 ) & 0x3f ); // 6 bits in second
  202. UTF8[2] = 0x80 | ( 0x3f & m_wchSurrogateHigh); // 6 bits in third
  203. Output(UTF8[0]);
  204. Output(UTF8[1]);
  205. fDone = Output(UTF8[2]);
  206. }
  207. m_wchSurrogateHigh = uc;
  208. m_fDoubleByte = FALSE ;
  209. goto CONVERT_DONE;
  210. }
  211. if (m_wchSurrogateHigh)
  212. {
  213. if (uc >= LOW_SURROGATE_START && uc <= LOW_SURROGATE_END) // We find a surrogate pairs
  214. {
  215. DWORD dwSurrogateChar = ((m_wchSurrogateHigh-0xD800) << 10) + uc - 0xDC00 + 0x10000;
  216. UTF8[0] = 0xF0 | (unsigned char)( dwSurrogateChar >> 18 ); // 3 bits in first byte
  217. UTF8[1] = 0x80 | (unsigned char)( ( dwSurrogateChar >> 12 ) & 0x3f ); // 6 bits in second
  218. UTF8[2] = 0x80 | (unsigned char)( ( dwSurrogateChar >> 6 ) & 0x3f ); // 6 bits in third
  219. UTF8[3] = 0x80 | (unsigned char)( 0x3f & dwSurrogateChar); // 6 bits in forth
  220. Output(UTF8[0]);
  221. Output(UTF8[1]);
  222. Output(UTF8[2]);
  223. fDone = Output(UTF8[3]);
  224. m_fDoubleByte = FALSE ;
  225. m_wchSurrogateHigh = 0;
  226. goto CONVERT_DONE;
  227. }
  228. else // Not a surrogate pairs, error
  229. {
  230. UTF8[0] = 0xe0 | ( m_wchSurrogateHigh >> 12 ); // 4 bits in first byte
  231. UTF8[1] = 0x80 | ( ( m_wchSurrogateHigh >> 6 ) & 0x3f ); // 6 bits in second
  232. UTF8[2] = 0x80 | ( 0x3f & m_wchSurrogateHigh); // 6 bits in third
  233. Output(UTF8[0]);
  234. Output(UTF8[1]);
  235. fDone = Output(UTF8[2]);
  236. m_wchSurrogateHigh = 0;
  237. }
  238. }
  239. if( ( uc & 0xff80 ) == 0 ) // ASCII
  240. {
  241. UTF8[0] = (UCHAR) uc;
  242. fDone = Output(UTF8[0]);
  243. }
  244. else if( ( uc & 0xf800 ) == 0 ) // UTF8_2_MAX 2-byte sequence if < 07ff (11 bits)
  245. {
  246. UTF8[0] = 0xC0 | (uc >> 6); // 5 bits in first byte
  247. UTF8[1] = 0x80 | ( 0x3f & uc); // 6 bits in second
  248. Output(UTF8[0]);
  249. fDone = Output(UTF8[1]);
  250. }
  251. else // 3-byte sequence
  252. {
  253. UTF8[0] = 0xe0 | ( uc >> 12 ); // 4 bits in first byte
  254. UTF8[1] = 0x80 | ( ( uc >> 6 ) & 0x3f ); // 6 bits in second
  255. UTF8[2] = 0x80 | ( 0x3f & uc); // 6 bits in third
  256. Output(UTF8[0]);
  257. Output(UTF8[1]);
  258. fDone = Output(UTF8[2]);
  259. }
  260. m_fDoubleByte = FALSE ;
  261. }
  262. else
  263. {
  264. m_tcLeadByte = tc ;
  265. m_fDoubleByte = TRUE ;
  266. }
  267. CONVERT_DONE:
  268. if (fDone)
  269. return S_OK;
  270. else
  271. return E_FAIL;
  272. }
  273. /******************************************************************************
  274. ***************************** C L E A N U P *****************************
  275. ******************************************************************************/
  276. BOOL CInccUTF8Out::CleanUp()
  277. {
  278. BOOL fDone = TRUE;
  279. return fDone;
  280. }
  281. int CInccUTF8Out::GetUnconvertBytes()
  282. {
  283. return m_fDoubleByte ? 1 : 0 ;
  284. }
  285. DWORD CInccUTF8Out::GetConvertMode()
  286. {
  287. // UTF8 does not use mode esc sequence
  288. return 0 ;
  289. }
  290. void CInccUTF8Out::SetConvertMode(DWORD mode)
  291. {
  292. Reset(); // initialization
  293. // UTF8 does not use mode esc sequence
  294. return ;
  295. }