Source code of Windows XP (NT5)
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
// ============================================================================
// Internet Character Set Conversion: Input from UTF-8
// ============================================================================
#include "private.h"
#include "fechrcnv.h"
#include "utf8obj.h"
/******************************************************************************
************************** C O N S T R U C T O R ************************** ******************************************************************************/
// Builds a UTF-8 input converter: forwards the code page and code set to the
// CINetCodeConverter base and then puts the decoder into its initial state.
CInccUTF8In::CInccUTF8In(UINT uCodePage, int nCodeSet)
    : CINetCodeConverter(uCodePage, nCodeSet)
{
    // All decoder members are initialized by Reset().
    Reset();
}
/******************************************************************************
******************************* R E S E T ********************************* ******************************************************************************/
// Restores the decoder to its initial state: the conversion and clean-up
// handlers point at ConvMain / CleanUpMain, no trail bytes are pending, both
// code point accumulators are zero and no 4-byte (surrogate pair) sequence is
// in progress.
void CInccUTF8In::Reset()
{
    // Pointers to member functions are formed with the explicit
    // &Class::Member syntax; a bare member name is not a valid
    // pointer-to-member expression in standard C++.
    m_pfnConv = &CInccUTF8In::ConvMain;
    m_pfnCleanUp = &CInccUTF8In::CleanUpMain;

    m_nByteFollow = 0;          // trail bytes still expected
    m_tcUnicode = 0;            // accumulator for sequences that yield one UTF-16 unit
    m_tcSurrogateUnicode = 0;   // accumulator for 4-byte sequences
    m_nBytesUsed = 0;           // bytes consumed by the unfinished sequence
    m_fSurrogatesPairs = FALSE; // TRUE while a 4-byte sequence is being decoded
}
/******************************************************************************
************************* C O N V E R T C H A R ************************* ******************************************************************************/
// Feeds one source byte to the currently installed conversion handler.
// Returns S_OK when the handler reports success and E_FAIL otherwise.
// cchSrc is not consulted by this converter.
HRESULT CInccUTF8In::ConvertChar(UCHAR tc, int cchSrc)
{
    return (this->*m_pfnConv)(tc) ? S_OK : E_FAIL;
}
/******************************************************************************
***************************** C L E A N U P ***************************** ******************************************************************************/
// Dispatches to the currently installed clean-up handler and returns its
// result unchanged.
BOOL CInccUTF8In::CleanUp()
{
    BOOL fCleaned = (this->*m_pfnCleanUp)();
    return fCleaned;
}
/******************************************************************************
**************************** C O N V M A I N **************************** ******************************************************************************/
// Decodes one byte of UTF-8 input. Every completed character is written with
// Output() as UTF-16 code units, low byte first; a 4-byte sequence is written
// as a high/low surrogate pair.
//
// State carried between calls:
//   m_nByteFollow        - trail bytes still expected for the open sequence
//   m_tcUnicode          - accumulator for sequences yielding one UTF-16 unit
//   m_tcSurrogateUnicode - accumulator for 4-byte sequences
//   m_fSurrogatesPairs   - TRUE while a 4-byte sequence is open
//   m_nBytesUsed         - bytes consumed so far by the open sequence
//
// Returns the result of the last Output() call that was made, or TRUE when
// the byte produced no output.
BOOL CInccUTF8In::ConvMain(UCHAR tc)
{
    BOOL fDone = TRUE;

    if ((0x80 & tc) == 0)
    {
        // BIT7 == 0: ASCII. An ASCII byte can never continue a multi-byte
        // sequence, so any unfinished sequence is abandoned here; otherwise
        // the decoder would keep waiting for trail bytes and mis-handle the
        // input that follows.
        m_nByteFollow = 0;
        m_fSurrogatesPairs = FALSE;

        Output(tc);
        fDone = Output(0);
        m_nBytesUsed = 0;
    }
    else if ((0x40 & tc) == 0)
    {
        // BIT6 == 0: a trail byte.
        if (m_nByteFollow)
        {
            if (m_fSurrogatesPairs)
            {
                m_nByteFollow--;
                m_tcSurrogateUnicode <<= 6;              // make room for trail byte
                m_tcSurrogateUnicode |= (0x3F & tc);     // add lower 6 bits of trail byte

                if (m_nByteFollow == 0)
                {
                    // End of the 4-byte sequence: emit the high surrogate ...
                    m_tcUnicode = (WCHAR)(((m_tcSurrogateUnicode - 0x10000) >> 10) + HIGHT_SURROGATE_START);
                    tc = (UCHAR)m_tcUnicode;
                    fDone = Output(tc);
                    if (fDone)
                    {
                        tc = (UCHAR)(m_tcUnicode >> 8);
                        fDone = Output(tc);
                    }

                    // ... followed by the low surrogate.
                    m_tcUnicode = (WCHAR)((m_tcSurrogateUnicode - 0x10000) % 0x400 + LOW_SURROGATE_START);
                    tc = (UCHAR)m_tcUnicode;
                    fDone = Output(tc);
                    if (fDone)
                    {
                        tc = (UCHAR)(m_tcUnicode >> 8);
                        fDone = Output(tc);
                    }

                    m_fSurrogatesPairs = FALSE;
                    m_nBytesUsed = 0;
                }
                else
                {
                    m_nBytesUsed++;
                }
            }
            else
            {
                m_nByteFollow--;
                m_tcUnicode <<= 6;                       // make room for trail byte
                m_tcUnicode |= (0x3F & tc);              // add lower 6 bits of trail byte

                if (m_nByteFollow == 0)
                {
                    // End of sequence: emit the UTF-16 unit, low byte first.
                    tc = (UCHAR)m_tcUnicode;
                    fDone = Output(tc);
                    if (fDone)
                    {
                        tc = (UCHAR)(m_tcUnicode >> 8);
                        fDone = Output(tc);
                    }
                    m_nBytesUsed = 0;
                }
                else
                {
                    m_nBytesUsed++;
                }
            }
        }
        else
        {
            // Error: trail byte without a lead byte - ignore it and reset.
            m_nBytesUsed = 0;
            m_nByteFollow = 0;
            m_fSurrogatesPairs = FALSE;
        }
    }
    else
    {
        // A lead byte.
        if (m_nByteFollow > 0)
        {
            // Error: the previous sequence was not finished. Drop it, emit a
            // space in its place and forget that a 4-byte sequence may have
            // been open, so that the next sequence is not routed into the
            // surrogate accumulator. The current lead byte is discarded.
            m_nByteFollow = 0;
            m_fSurrogatesPairs = FALSE;
            Output(' ');
            fDone = Output(0);
            m_nBytesUsed = 0;
        }
        else
        {
            // Count the leading 1 bits (left to right, until the first 0) to
            // get the sequence length; the shifts also strip those bits from
            // tc. Bytes 0xF8..0xFF yield counts of 5..8 and are not rejected
            // here.
            while ((0x80 & tc) != 0)
            {
                tc <<= 1;
                m_nByteFollow++;
            }

            if (m_nByteFollow == 4)
            {
                m_fSurrogatesPairs = TRUE;
                m_tcSurrogateUnicode = tc >> m_nByteFollow;
            }
            else
            {
                m_tcUnicode = (tc >> m_nByteFollow);
            }

            // The lead byte has been consumed whatever the sequence length,
            // so 4-byte sequences are counted the same way as shorter ones.
            m_nBytesUsed = 1;

            m_nByteFollow--;    // # bytes to follow
        }
    }

    return fDone;
}
/******************************************************************************
************************ C L E A N U P M A I N ************************ ******************************************************************************/
// Default clean-up handler: performs no work and always reports success.
BOOL CInccUTF8In::CleanUpMain()
{
    return TRUE;
}
// Returns m_nBytesUsed, the byte count recorded for the unfinished sequence,
// capped at 3.
int CInccUTF8In::GetUnconvertBytes()
{
    if (m_nBytesUsed < 4)
    {
        return m_nBytesUsed;
    }
    return 3;
}
// UTF-8 has no mode escape sequences, so the reported mode is always 0.
DWORD CInccUTF8In::GetConvertMode()
{
    return 0;
}
// UTF-8 has no mode escape sequences: the requested mode is ignored and the
// decoder is simply returned to its initial state.
void CInccUTF8In::SetConvertMode(DWORD mode)
{
    Reset();
}
// ============================================================================
// Internet Character Set Conversion: Output to UTF-8
// ============================================================================
/******************************************************************************
************************** C O N S T R U C T O R ************************** ******************************************************************************/
// Builds a UTF-8 output converter: forwards the code page and code set to the
// CINetCodeConverter base and then puts the encoder into its initial state.
CInccUTF8Out::CInccUTF8Out(UINT uCodePage, int nCodeSet)
    : CINetCodeConverter(uCodePage, nCodeSet)
{
    // All encoder members are initialized by Reset().
    Reset();
}
/******************************************************************************
******************************* R E S E T ********************************* ******************************************************************************/
// Restores the encoder to its initial state: no high surrogate is held back
// and no first byte of a two-byte unit is pending.
void CInccUTF8Out::Reset()
{
    m_wchSurrogateHigh = 0;
    m_fDoubleByte = FALSE;
}
// Consumes one byte of a UTF-16 stream (low byte first) and writes the UTF-8
// encoding with Output() once a full 16-bit unit has been assembled.
//
// A high surrogate is held in m_wchSurrogateHigh, provided cchSrc is at least
// sizeof(WCHAR), until the next unit arrives. A following low surrogate
// produces one 4-byte sequence; anything else causes the held surrogate to be
// written on its own as a 3-byte sequence before the new unit is handled.
//
// Returns S_OK when the last Output() call made succeeded (or when nothing
// was written), E_FAIL otherwise.
HRESULT CInccUTF8Out::ConvertChar(UCHAR tc, int cchSrc)
{
    // First byte of a unit: remember it and wait for the second one.
    if (!m_fDoubleByte)
    {
        m_tcLeadByte = tc;
        m_fDoubleByte = TRUE;
        return S_OK;
    }

    BOOL fOk = TRUE;
    UCHAR rgbUtf8[4];
    WORD wch = ((WORD) tc << 8 | m_tcLeadByte);

    if (wch >= HIGHT_SURROGATE_START && wch <= HIGHT_SURROGATE_END && cchSrc >= sizeof(WCHAR))
    {
        // A new high surrogate. If one is already being held it can no longer
        // be paired, so write it out as a 3-byte sequence first.
        if (m_wchSurrogateHigh)
        {
            rgbUtf8[0] = 0xe0 | ( m_wchSurrogateHigh >> 12 );            // 4 bits in first byte
            rgbUtf8[1] = 0x80 | ( ( m_wchSurrogateHigh >> 6 ) & 0x3f );  // 6 bits in second
            rgbUtf8[2] = 0x80 | ( 0x3f & m_wchSurrogateHigh );           // 6 bits in third
            Output(rgbUtf8[0]);
            Output(rgbUtf8[1]);
            fOk = Output(rgbUtf8[2]);
        }
        m_wchSurrogateHigh = wch;
        m_fDoubleByte = FALSE;
        return fOk ? S_OK : E_FAIL;
    }

    if (m_wchSurrogateHigh)
    {
        if (wch >= LOW_SURROGATE_START && wch <= LOW_SURROGATE_END)
        {
            // A complete surrogate pair: combine both halves into one code
            // point and write it as a 4-byte sequence.
            DWORD dwCodePoint = ((m_wchSurrogateHigh-0xD800) << 10) + wch - 0xDC00 + 0x10000;
            rgbUtf8[0] = 0xF0 | (unsigned char)( dwCodePoint >> 18 );            // 3 bits in first byte
            rgbUtf8[1] = 0x80 | (unsigned char)( ( dwCodePoint >> 12 ) & 0x3f ); // 6 bits in second
            rgbUtf8[2] = 0x80 | (unsigned char)( ( dwCodePoint >> 6 ) & 0x3f );  // 6 bits in third
            rgbUtf8[3] = 0x80 | (unsigned char)( 0x3f & dwCodePoint );           // 6 bits in fourth
            Output(rgbUtf8[0]);
            Output(rgbUtf8[1]);
            Output(rgbUtf8[2]);
            fOk = Output(rgbUtf8[3]);
            m_fDoubleByte = FALSE;
            m_wchSurrogateHigh = 0;
            return fOk ? S_OK : E_FAIL;
        }

        // Not a surrogate pair: write the held high surrogate by itself, then
        // continue below and encode the current unit normally.
        rgbUtf8[0] = 0xe0 | ( m_wchSurrogateHigh >> 12 );            // 4 bits in first byte
        rgbUtf8[1] = 0x80 | ( ( m_wchSurrogateHigh >> 6 ) & 0x3f );  // 6 bits in second
        rgbUtf8[2] = 0x80 | ( 0x3f & m_wchSurrogateHigh );           // 6 bits in third
        Output(rgbUtf8[0]);
        Output(rgbUtf8[1]);
        fOk = Output(rgbUtf8[2]);
        m_wchSurrogateHigh = 0;
    }

    if ( ( wch & 0xff80 ) == 0 )
    {
        // ASCII: one byte.
        rgbUtf8[0] = (UCHAR) wch;
        fOk = Output(rgbUtf8[0]);
    }
    else if ( ( wch & 0xf800 ) == 0 )
    {
        // Up to 0x07ff (11 bits): 2-byte sequence.
        rgbUtf8[0] = 0xC0 | (wch >> 6);        // 5 bits in first byte
        rgbUtf8[1] = 0x80 | ( 0x3f & wch );    // 6 bits in second
        Output(rgbUtf8[0]);
        fOk = Output(rgbUtf8[1]);
    }
    else
    {
        // Everything else: 3-byte sequence.
        rgbUtf8[0] = 0xe0 | ( wch >> 12 );             // 4 bits in first byte
        rgbUtf8[1] = 0x80 | ( ( wch >> 6 ) & 0x3f );   // 6 bits in second
        rgbUtf8[2] = 0x80 | ( 0x3f & wch );            // 6 bits in third
        Output(rgbUtf8[0]);
        Output(rgbUtf8[1]);
        fOk = Output(rgbUtf8[2]);
    }
    m_fDoubleByte = FALSE;

    return fOk ? S_OK : E_FAIL;
}
/******************************************************************************
***************************** C L E A N U P ***************************** ******************************************************************************/
// Performs no work and always reports success.
BOOL CInccUTF8Out::CleanUp()
{
    return TRUE;
}
// Returns 1 while the first byte of a two-byte unit is pending, 0 otherwise.
int CInccUTF8Out::GetUnconvertBytes()
{
    if (m_fDoubleByte)
    {
        return 1;
    }
    return 0;
}
// UTF-8 has no mode escape sequences, so the reported mode is always 0.
DWORD CInccUTF8Out::GetConvertMode()
{
    return 0;
}
// UTF-8 has no mode escape sequences: the requested mode is ignored and the
// encoder is simply returned to its initial state.
void CInccUTF8Out::SetConvertMode(DWORD mode)
{
    Reset();
}
|