Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

138 lines
3.7 KiB

  1. /*
  2. * nputf.c - Routines for utf text processing for notepad
  3. *
  4. * Copyright (C) 1998-2000 Microsoft Corporation
  5. */
  6. #include "precomp.h"
  /* IsTextUTF8
   *
   * Heuristically decides whether a byte buffer holds UTF-8 encoded text.
   *
   * UTF-8 is the encoding of Unicode defined by RFC 3629 (which obsoletes
   * RFC 2279).
   *
   * Basically:
   * 0000 0000-0000 007F - 0xxxxxxx (ascii converts to 1 octet!)
   * 0000 0080-0000 07FF - 110xxxxx 10xxxxxx ( 2 octet format)
   * 0000 0800-0000 FFFF - 1110xxxx 10xxxxxx 10xxxxxx (3 octet format)
   * 0001 0000-0010 FFFF - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (4 octet format)
   *
   * Octets 0xC0, 0xC1 and 0xF5-0xFF can never begin a sequence: the first two
   * would only produce overlong encodings of ascii, and the rest would encode
   * values beyond U+10FFFF (or are not UTF-8 lead octets at all).
   *
   * Parameters:
   *   lpstrInputStream - bytes to examine (need not be NUL terminated).
   *   iLen             - number of bytes to examine.
   *
   * Return value: TRUE, if the text is in UTF-8 format.
   * FALSE, if the text is not in UTF-8 format, if the pointer is NULL,
   * or if a multi-octet sequence is cut off at the end of the buffer.
   * We will also return FALSE if it is only 7-bit ascii, so the right code page
   * will be used.
   *
   * Actually for 7 bit ascii, it doesn't matter which code page we use, but
   * notepad will remember that it is utf-8 and "save" or "save as" will store
   * the file with a UTF-8 BOM. Not cool.
   *
   * NOTE: only the lead octet and the 10xxxxxx shape of the trailing octets
   * are checked; the value ranges of trailing octets (e.g. surrogates or
   * overlong 3/4 octet forms) are not validated here.
   */
  INT IsTextUTF8( LPSTR lpstrInputStream, INT iLen )
  {
      INT   i;
      DWORD cOctets = 0;        // trailing octets still owed by the current character
      UCHAR chr;
      BOOL  bAllAscii = TRUE;

      if( !lpstrInputStream ) {
          return FALSE;
      }

      for( i = 0; i < iLen; i++ ) {
          chr = (UCHAR) lpstrInputStream[i];

          if( cOctets > 0 ) {
              // non-leading bytes must start as 10xxxxxx
              if( (chr & 0xC0) != 0x80 ) {
                  return FALSE;
              }
              cOctets--;        // processed another octet in encoding
              continue;
          }

          //
          // 7 bit ascii after 7 bit ascii is just fine.
          //
          if( chr < 0x80 ) {
              continue;
          }

          bAllAscii = FALSE;

          //
          // Start of an encoding: the lead octet fixes how many trailing
          // octets follow. Anything that is not a legal lead octet (a stray
          // 10xxxxxx, 0xC0/0xC1, or 0xF5 and above) means this is not UTF-8.
          //
          if( chr >= 0xC2 && chr <= 0xDF ) {
              cOctets = 1;
          }
          else if( (chr & 0xF0) == 0xE0 ) {
              cOctets = 2;
          }
          else if( chr >= 0xF0 && chr <= 0xF4 ) {
              cOctets = 3;
          }
          else {
              return FALSE;
          }
      }

      //
      // End of text. Check for consistency.
      //
      if( cOctets > 0 ) {       // anything left over at the end is an error
          return FALSE;
      }

      if( bAllAscii ) {         // Not utf-8 if all ascii. Forces caller to use code pages for conversion
          return FALSE;
      }

      return TRUE;
  }
  74. /* IsInputTextUnicode
  75. * Verify if the input stream is in Unicode format.
  76. *
  77. * Return value: TRUE, if the text is in Unicode format.
  78. *
  79. * 29 June 1998
  80. */
  81. INT IsInputTextUnicode (LPSTR lpstrInputStream, INT iLen)
  82. {
  83. INT iResult= ~0; // turn on IS_TEXT_UNICODE_DBCS_LEADBYTE
  84. BOOL bUnicode;
  85. // We would like to check the possibility
  86. // of IS_TEXT_UNICODE_DBCS_LEADBYTE.
  87. //
  88. bUnicode= IsTextUnicode( lpstrInputStream, iLen, &iResult);
  89. if (bUnicode &&
  90. ((iResult & IS_TEXT_UNICODE_STATISTICS) != 0 ) &&
  91. ((iResult & (~IS_TEXT_UNICODE_STATISTICS)) == 0 ) )
  92. {
  93. CPINFO cpiInfo;
  94. CHAR* pch= (CHAR*)lpstrInputStream;
  95. INT cb;
  96. //
  97. // If the result depends only upon statistics, check
  98. // to see if there is a possibility of DBCS.
  99. // Only do this check if the ansi code page is DBCS
  100. //
  101. GetCPInfo( CP_ACP, &cpiInfo);
  102. if( cpiInfo.MaxCharSize > 1 )
  103. {
  104. for( cb=0; cb<iLen; cb++ )
  105. {
  106. if( IsDBCSLeadByte(*pch++) )
  107. {
  108. return FALSE;
  109. }
  110. }
  111. }
  112. }
  113. return bUnicode;
  114. }