Windows NT 4.0 source code leak
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

530 lines
12 KiB

/***
*unicode.c - create Unicode version of necessary CRT functions
*
* Copyright (c) 1989-1993, Microsoft Corporation. All rights reserved.
*
*Revision History:
*
*******************************************************************************/
#include <stdlib.h>
#include <assert.h>
#include <malloc.h>
#include "unicode.h"
/*
 * fgetsW - read one line from a stream into a wide-character buffer.
 *
 *   string   - destination buffer; must have room for `count` WCHARs.
 *   count    - capacity of `string` in WCHARs, terminator included.
 *   fp       - stream to read from.
 *   bUnicode - non-zero: every character is two bytes in the file, low
 *              byte first; zero: every character is a single byte.
 *
 * Characters are stored until a L'\n' has been stored (the newline is kept
 * in the buffer), count - 1 characters have been stored, or getc reports
 * EOF. The buffer is always terminated with L'\0'.
 *
 * In Unicode mode a lone trailing byte (odd-length file) is treated as end
 * of file and is not stored.
 *
 * Returns `string`, or NULL when count <= 0 or when EOF is reported before
 * any character was stored.
 */
WCHAR *
fgetsW (
    WCHAR *string,
    int count,
    FILE *fp,
    BOOL bUnicode)
{
    WCHAR *pch = string;
    int ch;

    assert (string != NULL);
    assert (fp != 0);

    if (count <= 0)
        return (NULL);

    while (--count) {
        if (bUnicode) {
            //
            // Test each byte for EOF before combining them. MAKEWORD yields
            // a 16-bit unsigned value, so a word built from two EOF results
            // would be 0xFFFF and could never compare equal to EOF.
            //
            int low = getc(fp);
            int high = (low == EOF) ? EOF : getc(fp);

            ch = (high == EOF) ? EOF : MAKEWORD(low, high);
        }
        else {
            ch = getc(fp);
        }

        //
        // if there are no more characters, end the line
        //
        if (ch == EOF) {
            if (pch == string)
                return (NULL);
            break;
        }

        *pch++ = (WCHAR) ch;
        if (ch == L'\n')
            break;
    }

    *pch = L'\0';
    return (string);
}
/*
 * freadW - read characters from a stream into a wide-character buffer.
 *
 *   string   - destination buffer; one WCHAR is stored per character read.
 *   count    - number of BYTES to consume from the stream (not characters).
 *   fp       - stream to read from.
 *   bUnicode - non-zero: each character occupies sizeof(WCHAR) bytes in the
 *              file; zero: each character occupies sizeof(CHAR) bytes.
 *
 * Each character is read into a zeroed WCHAR, so in the single-byte case the
 * remaining byte of the stored WCHAR is zero.
 * NOTE(review): that places the file byte in the low half only on a
 * little-endian host - confirm if this is ever built elsewhere.
 *
 * Reading stops once `count` bytes have been consumed, at end of file, or
 * when fread returns no data (which also covers read errors; callers can
 * use ferror(fp) to distinguish them from end of file).
 *
 * Returns the number of bytes read from the stream, or 0 if count <= 0.
 */
long
freadW (
    WCHAR *string,
    long count,
    FILE *fp,
    BOOL bUnicode)
{
    size_t n;
    long cbRead = 0;
    WCHAR *pch = string;
    WCHAR ch;

    assert (string != NULL);
    assert (fp != 0);

    if (count <= 0)
        return (0);

    while (cbRead < count && !feof(fp)) {
        ch = L'\0';
        n = fread(&ch, 1, bUnicode ? sizeof(WCHAR) : sizeof(CHAR), fp);

        // Nothing read means end of file or a read error. A read error does
        // not set the end-of-file indicator, so without this exit the loop
        // would never terminate.
        if (n == 0)
            break;

        *pch++ = ch;
        cbRead += (long) n;
    }

    return (cbRead);
}
// Number of bytes IsFileUnicode reads from the start of a file.
#define cchReadMax 250
#ifndef _WIN32
// Value the first 16-bit unit of a file must have for the non-Win32
// IsFileUnicode to report the file as Unicode.
#define BYTE_ORDER_MARK 0xFEFF
BOOL
IsFileUnicode (
char * fName)
{
long chRead;
long val = 0xFFFF;
FILE *fp;
BYTE *buf;
BOOL ret = FALSE;
if ((fp = fopen (fName, "rb")) == 0)
return (0);
if ((buf = (BYTE *) malloc(cchReadMax + 1)) == NULL) {
fclose (fp);
return (0);
}
chRead = fread (buf, 1, cchReadMax, fp);
if (*((WCHAR *)buf) == BYTE_ORDER_MARK)
return = TRUE;
fclose (fp);
free (buf);
return (ret);
}
#else // _WIN32
// Detection routine that IsFileUnicode calls through. It starts out NULL and
// is assigned by InitIsText.
BOOL (WINAPI * TestForUnicode)(PVOID, ULONG, PULONG) = NULL;
// Built-in detection heuristic, defined later in this file; InitIsText uses
// it as the fallback target for TestForUnicode.
BOOL WINAPI SlmIsTextUnicode( PVOID Buffer, ULONG Size, PULONG Result );
//
// InitIsText - choose the routine TestForUnicode points at.
//
// Bits 16..30 of GetVersion() are compared with 546. From 546 upward the
// routine tries to bind to "IsTextUnicode" exported by ADVAPI32; if that
// lookup yields nothing, or the value is below 546, the local
// SlmIsTextUnicode is installed instead. On return TestForUnicode is never
// NULL.
//
static void InitIsText(void)
{
    if (((GetVersion() >> 16) & 0x00007fff) >= 546) {
        TestForUnicode = (BOOL (WINAPI *)(PVOID, ULONG, PULONG))
            GetProcAddress(LoadLibrary("ADVAPI32"), "IsTextUnicode");

        // Bound to the system routine - nothing more to do.
        if (TestForUnicode != NULL)
            return;
    }

    // Older system, or the export could not be found: use the local copy.
    TestForUnicode = SlmIsTextUnicode;
}
// Emit a pointer to InitIsText into the ".CRT$XIU" data section, then switch
// back to the default data section.
// NOTE(review): presumably the C runtime walks this section during startup
// and calls each pointer in it, so InitIsText runs before IsFileUnicode can
// be called - confirm against the CRT this is linked with.
#pragma data_seg(".CRT$XIU")
static void (*pInitIsText)(void) = InitIsText;
#pragma data_seg()
BOOL
IsFileUnicode (
char * fName)
{
long chRead;
long val = 0xFFFF;
FILE *fp;
BYTE *buf;
BOOL ret = FALSE;
if ((fp = fopen (fName, "rb")) == 0)
return (0);
if ((buf = (BYTE *) malloc(cchReadMax + 1)) == NULL) {
fclose (fp);
return (0);
}
chRead = fread (buf, 1, cchReadMax, fp);
ret = (*TestForUnicode) (buf, chRead, &val);
fclose (fp);
free (buf);
return (ret);
}
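/**
Local copy of the text-detection heuristic, stolen from
\nt\private\ntos\rtl\nls.c. InitIsText installs it only when the NT build
number is below 546 or IsTextUnicode cannot be found in advapi32; otherwise
the version in advapi32 is used.
**/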
// 16-bit values that SlmIsTextUnicode looks for in the buffer.
#define UNICODE_FFFF 0xFFFF
#define REVERSE_BYTE_ORDER_MARK 0xFFFE
#define BYTE_ORDER_MARK 0xFEFF
#define PARAGRAPH_SEPARATOR 0x2029
#define LINE_SEPARATOR 0x2028
// Control and space characters widened to 16 bits.
#define UNICODE_TAB 0x0009
#define UNICODE_LF 0x000A
#define UNICODE_CR 0x000D
#define UNICODE_SPACE 0x0020
#define UNICODE_CJK_SPACE 0x3000
// The same characters with their two bytes swapped. UNICODE_R_CJK_SPACE is
// defined for completeness but has no case in SlmIsTextUnicode's switch.
#define UNICODE_R_TAB 0x0900
#define UNICODE_R_LF 0x0A00
#define UNICODE_R_CR 0x0D00
#define UNICODE_R_SPACE 0x2000
#define UNICODE_R_CJK_SPACE 0x0030 /* Ambiguous - same as ASCII '0' */
// The byte pair CR, LF of a single-byte text file when read as one
// little-endian 16-bit value.
#define ASCII_CRLF 0x0A0D
// Larger / smaller of two values. Each argument is evaluated twice, so do
// not pass expressions with side effects.
#define __max(a,b) (((a) > (b)) ? (a) : (b))
#define __min(a,b) (((a) < (b)) ? (a) : (b))
BOOL WINAPI SlmIsTextUnicode( PVOID Buffer, ULONG Size, PULONG Result )
/*++
Routine Description:
    Runs a series of inexpensive heuristic checks over the start of a
    buffer, read as 16-bit characters, to decide whether it holds Unicode
    text. The evidence is collected as IS_TEXT_UNICODE_* flags:

      ASCII16               high bytes never change and the low bytes
                            change by less than 127 in total
      REVERSE_ASCII16       low bytes never change but high bytes do
      STATISTICS            three times the high-byte variation is still
                            less than the low-byte variation
      REVERSE_STATISTICS    the same test with the two bytes exchanged
      CONTROLS              TAB, LF, CR, SPACE or CJK space seen
      REVERSE_CONTROLS      byte-swapped TAB, LF, CR or SPACE seen
      ILLEGAL_CHARS         reversed BOM, 0xFFFF, UNICODE_NULL or
                            ASCII_CRLF seen
      ODD_LENGTH            Size is odd
      NULL_BYTES            at least one zero byte seen
      SIGNATURE             first character is BYTE_ORDER_MARK
      REVERSE_SIGNATURE     first character is REVERSE_BYTE_ORDER_MARK

Arguments:
    Buffer - pointer to the text to examine.
    Size   - size of Buffer in bytes. At most the first 256 characters are
             examined. If Size is less than 2 the routine returns FALSE
             and, when Result is supplied, stores
             IS_TEXT_UNICODE_ASCII16 | IS_TEXT_UNICODE_CONTROLS in it.
    Result - optional pointer to a flag word. On input it is a mask
             limiting which of the flags above take part in the decision;
             on output it receives the flags that survived that mask.

Return Value:
    TRUE if the buffer appears to contain Unicode text, FALSE otherwise.
--*/
{
    WCHAR UNALIGNED *pwch = Buffer;
    ULONG cwch;
    ULONG i;
    ULONG prevHi = 0;
    ULONG prevLo = 0;
    ULONG hiVar = 0;    /* summed |change| of the high byte, char to char */
    ULONG loVar = 0;    /* summed |change| of the low byte, char to char  */
    ULONG flags = 0;

    if (Size < 2) {
        if (Result != NULL)
            *Result = IS_TEXT_UNICODE_ASCII16 | IS_TEXT_UNICODE_CONTROLS;
        return FALSE;
    }

    // Look at no more than 256 wide characters.
    cwch = Size / sizeof( WCHAR );
    if (cwch > 256)
        cwch = 256;

    for (i = 0; i < cwch; i++) {
        WCHAR wch = pwch[i];
        ULONG hi = HIBYTE (wch);
        ULONG lo = LOBYTE (wch);

        switch (wch) {
        // Control codes widened to 16 bits.
        case UNICODE_CR:
        case UNICODE_LF:
        case UNICODE_TAB:
        case UNICODE_SPACE:
        case UNICODE_CJK_SPACE:
            flags |= IS_TEXT_UNICODE_CONTROLS;
            break;

        // The same codes as they appear in byte-reversed files.
        case UNICODE_R_CR:
        case UNICODE_R_LF:
        case UNICODE_R_TAB:
        case UNICODE_R_SPACE:
            flags |= IS_TEXT_UNICODE_REVERSE_CONTROLS;
            break;

        // Values that should never occur in Unicode text. ASCII_CRLF is
        // what a CR LF pair of a single-byte file looks like when read as
        // one 16-bit value on a little endian machine.
        case REVERSE_BYTE_ORDER_MARK:
        case UNICODE_FFFF:
        case UNICODE_NULL:
        case ASCII_CRLF:
            flags |= IS_TEXT_UNICODE_ILLEGAL_CHARS;
            break;

        // Everything else, including BYTE_ORDER_MARK, PARAGRAPH_SEPARATOR
        // and LINE_SEPARATOR, adds no evidence here.
        default:
            break;
        }

        // Any null byte at all? (Illegal in ANSI)
        if (hi == 0 || lo == 0)
            flags |= IS_TEXT_UNICODE_NULL_BYTES;

        // Track how much each byte position fluctuates between
        // consecutive characters.
        hiVar += (hi > prevHi) ? (hi - prevHi) : (prevHi - hi);
        loVar += (lo > prevLo) ? (lo - prevLo) : (prevLo - lo);
        prevHi = hi;
        prevLo = lo;
    }

    // Sift the statistical evidence.
    if (hiVar == 0 && loVar < 127)
        flags |= IS_TEXT_UNICODE_ASCII16;           /* likely 16-bit ASCII */
    if (loVar == 0 && hiVar != 0)
        flags |= IS_TEXT_UNICODE_REVERSE_ASCII16;   /* reverse order 16-bit ASCII */
    if (3 * hiVar < loVar)
        flags |= IS_TEXT_UNICODE_STATISTICS;
    if (3 * loVar < hiVar)
        flags |= IS_TEXT_UNICODE_REVERSE_STATISTICS;

    // Odd buffer length cannot be Unicode.
    if (Size & 1)
        flags |= IS_TEXT_UNICODE_ODD_LENGTH;

    // Positive evidence: BOM or reversed BOM used as a signature.
    if (pwch[0] == BYTE_ORDER_MARK)
        flags |= IS_TEXT_UNICODE_SIGNATURE;
    else if (pwch[0] == REVERSE_BYTE_ORDER_MARK)
        flags |= IS_TEXT_UNICODE_REVERSE_SIGNATURE;

    // Limit to the categories the caller asked for and report them back.
    if (Result != NULL) {
        flags &= *Result;
        *Result = flags;
    }

    //
    // Decide. In order of precedence:
    //   - any evidence that the data cannot be Unicode means FALSE;
    //   - otherwise an uncontested Unicode signature outweighs any
    //     reverse-order evidence and means TRUE;
    //   - otherwise any reverse-order evidence is a conflict and means
    //     FALSE;
    //   - otherwise evidence that the data cannot be ANSI, or any
    //     remaining Unicode evidence, means TRUE;
    //   - with no evidence at all the answer is FALSE (ANSI is the
    //     default).
    //
    if (flags & IS_TEXT_UNICODE_NOT_UNICODE_MASK)
        return FALSE;

    if (flags & IS_TEXT_UNICODE_SIGNATURE)
        return TRUE;

    if (flags & IS_TEXT_UNICODE_REVERSE_MASK)
        return FALSE;

    if ((flags & IS_TEXT_UNICODE_NOT_ASCII_MASK) ||
        (flags & IS_TEXT_UNICODE_UNICODE_MASK))
        return TRUE;

    return FALSE;
}
#endif // _WIN32