/////////////////////////////////////////////////////////////////////////////// // // Copyright (c) 1998, Microsoft Corp. All rights reserved. // // FILE // // iasutf8.c // // SYNOPSIS // // Defines functions for converting between UTF-8 and Unicode. // // MODIFICATION HISTORY // // 01/22/1999 Original version. // /////////////////////////////////////////////////////////////////////////////// #include #include ///////// // Tests the validity of a UTF-8 trail byte. Must be of the form 10vvvvvv. ///////// #define NOT_TRAIL_BYTE(b) (((BYTE)(b) & 0xC0) != 0x80) ////////// // Returns the number of characters required to hold the converted string. The // source string may not contain nulls. Returns -1 if 'src' is not a valid // UTF-8 string. ////////// LONG WINAPI IASUtf8ToUnicodeLength( PCSTR src, DWORD srclen ) { LONG nchar; PCSTR end; if (src == NULL) { return 0; } // Number of characters needed. nchar = 0; // End of string to be converted. end = src + srclen; // Loop through the UTF-8 string. while (src < end) { if (*src == 0) { // Do not allow embedded nulls. return -1; } else if ((BYTE)*src < 0x80) { // 0vvvvvvv = 1 byte character. } else if ((BYTE)*src < 0xC0) { // 10vvvvvv = Invalid lead byte. return -1; } else if ((BYTE)*src < 0xE0) { // 110vvvvv = 2 byte character. if (NOT_TRAIL_BYTE(*++src)) { return -1; } } else if ((BYTE)*src < 0xF0) { // 1110vvvv = 3 byte character. if (NOT_TRAIL_BYTE(*++src)) { return -1; } if (NOT_TRAIL_BYTE(*++src)) { return -1; } } else { // In theory, UTF-8 supports 4-6 byte characters, but Windows uses // 16-bit integers for Unicode, so we can't handle them. return -1; } // We successfully parsed a UTF-8 character. ++src; ++nchar; } // Return the number of characters needed. return nchar; } ////////// // Returns the number of characters required to hold the converted string. ////////// LONG WINAPI IASUnicodeToUtf8Length( PCWSTR src, DWORD srclen ) { LONG nchar; PCWSTR end; if (src == NULL) { return 0; } // Number of characters needed. nchar = 0; // End of string to be converted. end = src + srclen; // Loop through the Unicode string. while (src < end) { if (*src < 0x80) { // 1 byte character. nchar += 1; } else if (*src < 0x800) { // 2 byte character. nchar += 2; } else { // 3 byte character. nchar += 3; } // Advance to the next character in the string. ++src; } // Return the number of characters needed. return nchar; } ///////// // Converts a UTF-8 string to Unicode. Returns the number of characters in the // converted string. The source string may not contain nulls. Returns -1 if // 'src' is not a valid UTF-8 string. ///////// LONG IASUtf8ToUnicode( PCSTR src, DWORD srclen, PWSTR dst ) { PCWSTR start; PCSTR end; if (!src || !dst) { return 0; } // Remember where we started. start = dst; // End of the string to be converted. end = src + srclen; // Loop through the source UTF-8 string. while (src < end) { if (*src == 0) { // Do not allow embedded nulls. return -1; } else if ((BYTE)*src < 0x80) { // 1 byte character: 0vvvvvvv *dst = *src; } else if ((BYTE)*src < 0xC0) { // Invalid lead byte: 10vvvvvv return -1; } else if ((BYTE)*src < 0xE0) { // 2 byte character: 110vvvvv 10vvvvvv *dst = (*src & 0x1F) << 6; if (NOT_TRAIL_BYTE(*++src)) { return -1; } *dst |= (*src & 0x3F); } else if ((BYTE)*src < 0xF0) { // 3 byte character: 1110vvvv 10vvvvvv 10vvvvvv *dst = (*src & 0x0F) << 12; if (NOT_TRAIL_BYTE(*++src)) { return -1; } *dst |= (*src & 0x3f) << 6; if (NOT_TRAIL_BYTE(*++src)) { return -1; } *dst |= (*src & 0x3f); } else { // In theory, UTF-8 supports 4-6 byte characters, but Windows uses // 16-bit integers for Unicode, so we can't handle them. return -1; } // Advance to the next character. ++src; ++dst; } // Return the number of characters in the converted string. return (LONG)(dst - start); } ///////// // Converts a Unicode string to UTF-8. Returns the number of characters in the // converted string. ///////// LONG IASUnicodeToUtf8( PCWSTR src, DWORD srclen, PSTR dst ) { PCSTR start; PCWSTR end; if (!src || !dst) { return 0; } // Remember where we started. start = dst; // End of the string to be converted. end = src + srclen; // Loop through the source Unicode string. while (src < end) { if (*src < 0x80) { // Pack as 0vvvvvvv *dst++ = (CHAR)*src; } else if (*src < 0x800) { // Pack as 110vvvvv 10vvvvvv 10vvvvvv *dst++ = (CHAR)(0xC0 | ((*src >> 6) & 0x3F)); *dst++ = (CHAR)(0x80 | ((*src ) & 0x3F)); } else { // Pack as 1110vvvv 10vvvvvv 10vvvvvv *dst++ = (CHAR)(0xE0 | ((*src >> 12) )); *dst++ = (CHAR)(0x80 | ((*src >> 6) & 0x3F)); *dst++ = (CHAR)(0x80 | ((*src ) & 0x3F)); } // Advance to the next character. ++src; } // Return the number of characters in the converted string. return (LONG)(dst - start); }