Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

531 lines
15 KiB

  1. /***********************************************************************
  2. * Microsoft (R) Windows (R) Resource Compiler
  3. *
  4. * Copyright (c) Microsoft Corporation. All rights reserved.
  5. *
  6. * File Comments:
  7. *
  8. *
  9. ***********************************************************************/
  10. #include "rc.h"
  /* IsTextUnicode has to be here so this will run on Chicago and NT 1.0. */

  /* 16-bit values tallied by LocalIsTextUnicode when read in native order. */
  #define UNICODE_FFFF 0xFFFF
  #define REVERSE_BYTE_ORDER_MARK 0xFFFE
  #define BYTE_ORDER_MARK 0xFEFF
  #define PARAGRAPH_SEPARATOR 0x2029
  #define LINE_SEPARATOR 0x2028
  #define UNICODE_TAB 0x0009
  #define UNICODE_LF 0x000A
  #define UNICODE_CR 0x000D
  #define UNICODE_SPACE 0x0020
  #define UNICODE_CJK_SPACE 0x3000

  /* The same controls as they appear when the two bytes are swapped. */
  #define UNICODE_R_TAB 0x0900
  #define UNICODE_R_LF 0x0A00
  #define UNICODE_R_CR 0x0D00
  #define UNICODE_R_SPACE 0x2000
  #define UNICODE_R_CJK_SPACE 0x0030 /* Ambiguous - same as ASCII '0' */

  /* An 8-bit CR LF pair read as a single little-endian 16-bit word. */
  #define ASCII_CRLF 0x0A0D

  /* NOTE: both arguments are evaluated twice; do not pass expressions
     with side effects. */
  #define __max(a,b) (((a) > (b)) ? (a) : (b))
  #define __min(a,b) (((a) < (b)) ? (a) : (b))

  /* Non-zero when an optional pointer argument was supplied.  The argument
     is parenthesized so that expression arguments (e.g. "c ? p : q") are
     compared as a whole. */
  #define ARGUMENT_PRESENT(a) ((a) != NULL)
  31. BOOL
  32. WINAPI
  33. LocalIsTextUnicode(
  34. CONST LPVOID Buffer,
  35. int Size,
  36. LPINT Result
  37. )
  38. /*++
  39. Routine Description:
  40. IsTextUnicode performs a series of inexpensive heuristic checks
  41. on a buffer in order to verify that it contains Unicode data.
  42. [[ need to fix this section, see at the end ]]
  43. Found Return Result
  44. BOM TRUE BOM
  45. RBOM FALSE RBOM
  46. FFFF FALSE Binary
  47. NULL FALSE Binary
  48. null TRUE null bytes
  49. ASCII_CRLF FALSE CRLF
  50. UNICODE_TAB etc. TRUE Zero Ext Controls
  51. UNICODE_TAB_R FALSE Reversed Controls
  52. UNICODE_ZW etc. TRUE Unicode specials
  53. 1/3 as little variation in hi-byte as in lo byte: TRUE Correl
  54. 3/1 or worse " FALSE AntiCorrel
  55. Arguments:
  56. Buffer - pointer to buffer containing text to examine.
  57. Size - size of buffer in bytes. At most 256 characters in this will
  58. be examined. If the size is less than the size of a unicode
  59. character, then this function returns FALSE.
  60. Result - optional pointer to a flag word that contains additional information
  61. about the reason for the return value. If specified, this value on
  62. input is a mask that is used to limit the factors this routine uses
  63. to make it decision. On output, this flag word is set to contain
  64. those flags that were used to make its decision.
  65. Return Value:
  66. Boolean value that is TRUE if Buffer contains unicode characters.
  67. --*/
  68. {
  69. CPINFO cpinfo;
  70. UNALIGNED WCHAR *lpBuff = (UNALIGNED WCHAR *) Buffer;
  71. PCHAR lpb = (PCHAR) Buffer;
  72. ULONG iBOM = 0;
  73. ULONG iCR = 0;
  74. ULONG iLF = 0;
  75. ULONG iTAB = 0;
  76. ULONG iSPACE = 0;
  77. ULONG iCJK_SPACE = 0;
  78. ULONG iFFFF = 0;
  79. ULONG iPS = 0;
  80. ULONG iLS = 0;
  81. ULONG iRBOM = 0;
  82. ULONG iR_CR = 0;
  83. ULONG iR_LF = 0;
  84. ULONG iR_TAB = 0;
  85. ULONG iR_SPACE = 0;
  86. ULONG iNull = 0;
  87. ULONG iUNULL = 0;
  88. ULONG iCRLF = 0;
  89. ULONG iTmp;
  90. ULONG LastLo = 0;
  91. ULONG LastHi = 0;
  92. ULONG iHi, iLo;
  93. ULONG HiDiff = 0;
  94. ULONG LoDiff = 0;
  95. ULONG cLeadByte = 0;
  96. ULONG cWeird = 0;
  97. ULONG iResult = 0;
  98. ULONG iMaxTmp = __min(256, Size / sizeof(WCHAR));
  99. if (Size < 2 ) {
  100. if (ARGUMENT_PRESENT( Result )) {
  101. *Result = IS_TEXT_UNICODE_ASCII16 | IS_TEXT_UNICODE_CONTROLS;
  102. }
  103. return FALSE;
  104. }
  105. // Check at most 256 wide character, collect various statistics
  106. for (iTmp = 0; iTmp < iMaxTmp; iTmp++) {
  107. switch (lpBuff[iTmp]) {
  108. case BYTE_ORDER_MARK:
  109. iBOM++;
  110. break;
  111. case PARAGRAPH_SEPARATOR:
  112. iPS++;
  113. break;
  114. case LINE_SEPARATOR:
  115. iLS++;
  116. break;
  117. case UNICODE_LF:
  118. iLF++;
  119. break;
  120. case UNICODE_TAB:
  121. iTAB++;
  122. break;
  123. case UNICODE_SPACE:
  124. iSPACE++;
  125. break;
  126. case UNICODE_CJK_SPACE:
  127. iCJK_SPACE++;
  128. break;
  129. case UNICODE_CR:
  130. iCR++;
  131. break;
  132. // The following codes are expected to show up in
  133. // byte reversed files
  134. case REVERSE_BYTE_ORDER_MARK:
  135. iRBOM++;
  136. break;
  137. case UNICODE_R_LF:
  138. iR_LF++;
  139. break;
  140. case UNICODE_R_TAB:
  141. iR_TAB++;
  142. break;
  143. case UNICODE_R_CR:
  144. iR_CR++;
  145. break;
  146. case UNICODE_R_SPACE:
  147. iR_SPACE++;
  148. break;
  149. // The following codes are illegal and should never occur
  150. case UNICODE_FFFF:
  151. iFFFF++;
  152. break;
  153. case UNICODE_NULL:
  154. iUNULL++;
  155. break;
  156. // The following is not currently a Unicode character
  157. // but is expected to show up accidentally when reading
  158. // in ASCII files which use CRLF on a little endian machine
  159. case ASCII_CRLF:
  160. iCRLF++;
  161. break; /* little endian */
  162. }
  163. // Collect statistics on the fluctuations of high bytes
  164. // versus low bytes
  165. iHi = HIBYTE (lpBuff[iTmp]);
  166. iLo = LOBYTE (lpBuff[iTmp]);
  167. // Count cr/lf and lf/cr that cross two words
  168. if ((iLo == '\r' && LastHi == '\n') ||
  169. (iLo == '\n' && LastHi == '\r')) {
  170. cWeird++;
  171. }
  172. iNull += (iHi ? 0 : 1) + (iLo ? 0 : 1); /* count Null bytes */
  173. HiDiff += __max( iHi, LastHi ) - __min( LastHi, iHi );
  174. LoDiff += __max( iLo, LastLo ) - __min( LastLo, iLo );
  175. LastLo = iLo;
  176. LastHi = iHi;
  177. }
  178. // Count cr/lf and lf/cr that cross two words
  179. if ((iLo == '\r' && LastHi == '\n') ||
  180. (iLo == '\n' && LastHi == '\r')) {
  181. cWeird++;
  182. }
  183. if (iHi == '\0') /* don't count the last null */
  184. iNull--;
  185. if (iHi == 26) /* count ^Z at end as weird */
  186. cWeird++;
  187. iMaxTmp = (ULONG)__min(256 * sizeof(WCHAR), Size);
  188. GetCPInfo(CP_ACP, &cpinfo);
  189. if (cpinfo.MaxCharSize != 1) {
  190. for (iTmp = 0; iTmp < iMaxTmp; iTmp++) {
  191. if (IsDBCSLeadByteEx(uiCodePage, lpb[iTmp])) {
  192. cLeadByte++;
  193. iTmp++; /* should check for trailing-byte range */
  194. }
  195. }
  196. }
  197. // sift the statistical evidence
  198. if (LoDiff < 127 && HiDiff == 0) {
  199. iResult |= IS_TEXT_UNICODE_ASCII16; /* likely 16-bit ASCII */
  200. }
  201. if (HiDiff && LoDiff == 0) {
  202. iResult |= IS_TEXT_UNICODE_REVERSE_ASCII16; /* reverse 16-bit ASCII */
  203. }
  204. // Use leadbyte info to weight statistics.
  205. if (!cpinfo.MaxCharSize != 1 || cLeadByte == 0 ||
  206. !ARGUMENT_PRESENT(Result) || !(*Result & IS_TEXT_UNICODE_DBCS_LEADBYTE)) {
  207. iHi = 3;
  208. } else {
  209. // A ratio of cLeadByte:cb of 1:2 ==> dbcs
  210. // Very crude - should have a nice eq.
  211. iHi = __min(256, Size/sizeof(WCHAR)) / 2;
  212. if (cLeadByte < (iHi-1) / 3) {
  213. iHi = 3;
  214. } else if (cLeadByte < (2 * (iHi-1)) / 3) {
  215. iHi = 2;
  216. } else {
  217. iHi = 1;
  218. }
  219. iResult |= IS_TEXT_UNICODE_DBCS_LEADBYTE;
  220. }
  221. if (iHi * HiDiff < LoDiff) {
  222. iResult |= IS_TEXT_UNICODE_STATISTICS;
  223. }
  224. if (iHi * LoDiff < HiDiff) {
  225. iResult |= IS_TEXT_UNICODE_REVERSE_STATISTICS;
  226. }
  227. //
  228. // Any control codes widened to 16 bits? Any Unicode character
  229. // which contain one byte in the control code range?
  230. //
  231. if (iCR + iLF + iTAB + iSPACE + iCJK_SPACE /*+iPS+iLS*/) {
  232. iResult |= IS_TEXT_UNICODE_CONTROLS;
  233. }
  234. if (iR_LF + iR_CR + iR_TAB + iR_SPACE) {
  235. iResult |= IS_TEXT_UNICODE_REVERSE_CONTROLS;
  236. }
  237. //
  238. // Any characters that are illegal for Unicode?
  239. //
  240. if (((iRBOM + iFFFF + iUNULL + iCRLF) != 0) || ((cWeird != 0) && (cWeird >= iMaxTmp/40))) {
  241. iResult |= IS_TEXT_UNICODE_ILLEGAL_CHARS;
  242. }
  243. //
  244. // Odd buffer length cannot be Unicode
  245. //
  246. if (Size & 1) {
  247. iResult |= IS_TEXT_UNICODE_ODD_LENGTH;
  248. }
  249. //
  250. // Any NULL bytes? (Illegal in ANSI)
  251. //
  252. if (iNull) {
  253. iResult |= IS_TEXT_UNICODE_NULL_BYTES;
  254. }
  255. //
  256. // POSITIVE evidence, BOM or RBOM used as signature
  257. //
  258. if (*lpBuff == BYTE_ORDER_MARK) {
  259. iResult |= IS_TEXT_UNICODE_SIGNATURE;
  260. } else if (*lpBuff == REVERSE_BYTE_ORDER_MARK) {
  261. iResult |= IS_TEXT_UNICODE_REVERSE_SIGNATURE;
  262. }
  263. //
  264. // limit to desired categories if requested.
  265. //
  266. if (ARGUMENT_PRESENT( Result )) {
  267. iResult &= *Result;
  268. *Result = iResult;
  269. }
  270. //
  271. // There are four separate conclusions:
  272. //
  273. // 1: The file APPEARS to be Unicode AU
  274. // 2: The file CANNOT be Unicode CU
  275. // 3: The file CANNOT be ANSI CA
  276. //
  277. //
  278. // This gives the following possible results
  279. //
  280. // CU
  281. // + -
  282. //
  283. // AU AU
  284. // + - + -
  285. // -------- --------
  286. // CA +| 0 0 2 3
  287. // |
  288. // -| 1 1 4 5
  289. //
  290. //
  291. // Note that there are only 6 really different cases, not 8.
  292. //
  293. // 0 - This must be a binary file
  294. // 1 - ANSI file
  295. // 2 - Unicode file (High probability)
  296. // 3 - Unicode file (more than 50% chance)
  297. // 5 - No evidence for Unicode (ANSI is default)
  298. //
  299. // The whole thing is more complicated if we allow the assumption
  300. // of reverse polarity input. At this point we have a simplistic
  301. // model: some of the reverse Unicode evidence is very strong,
  302. // we ignore most weak evidence except statistics. If this kind of
  303. // strong evidence is found together with Unicode evidence, it means
  304. // its likely NOT Text at all. Furthermore if a REVERSE_BYTE_ORDER_MARK
  305. // is found, it precludes normal Unicode. If both byte order marks are
  306. // found it's not Unicode.
  307. //
  308. //
  309. // Unicode signature : uncontested signature outweighs reverse evidence
  310. //
  311. if ((iResult & IS_TEXT_UNICODE_SIGNATURE) &&
  312. !(iResult & (IS_TEXT_UNICODE_NOT_UNICODE_MASK&(~IS_TEXT_UNICODE_DBCS_LEADBYTE)))
  313. ) {
  314. return TRUE;
  315. }
  316. //
  317. // If we have conflicting evidence, it's not Unicode
  318. //
  319. if (iResult & IS_TEXT_UNICODE_REVERSE_MASK) {
  320. return FALSE;
  321. }
  322. //
  323. // Statistical and other results (cases 2 and 3)
  324. //
  325. if (!(iResult & IS_TEXT_UNICODE_NOT_UNICODE_MASK) &&
  326. ((iResult & IS_TEXT_UNICODE_NOT_ASCII_MASK) ||
  327. (iResult & IS_TEXT_UNICODE_UNICODE_MASK)
  328. )
  329. ) {
  330. return TRUE;
  331. }
  332. return FALSE;
  333. }
  334. /*------------------------------------------------------------------*/
  335. /* */
  336. /* fgetl() - */
  337. /* */
  338. /*------------------------------------------------------------------*/
  339. /* fgetl expands tabs and return lines w/o separators */
  340. /* returns line from file (no CRLFs); returns NULL if EOF */
  341. int
  342. fgetl (
  343. PWCHAR wbuf,
  344. int len,
  345. BOOL bUnicode,
  346. PFILE fh
  347. )
  348. {
  349. int c = 0;
  350. int second;
  351. *wbuf = 0;
  352. if (bUnicode) {
  353. PWCHAR p;
  354. /* remember NUL at end */
  355. len--;
  356. p = wbuf;
  357. /* fill buffer from the file until EOF or EOLN or no space in buffer */
  358. while (len) {
  359. c = fgetc (fh);
  360. if (c == EOF)
  361. break;
  362. second = fgetc (fh);
  363. c = MAKEWORD (c, second);
  364. if (c == L'\n')
  365. break;
  366. if (c != L'\r') {
  367. if (c != L'\t') {
  368. *p++ = (WCHAR)c;
  369. len--;
  370. } else {
  371. /* tabs: expand to spaces */
  372. c = (int)(min (8 - ((p - wbuf) & 0x0007), len));
  373. len -= c;
  374. while (c) {
  375. *p++ = L' ';
  376. c--;
  377. }
  378. }
  379. }
  380. }
  381. /* null terminate string */
  382. *p = 0;
  383. } else {
  384. PCHAR p;
  385. PCHAR lpbuf;
  386. p = lpbuf = (PCHAR) LocalAlloc (LPTR, len);
  387. if (p) {
  388. /* remember NUL at end */
  389. len--;
  390. /* fill buffer from the file until EOF or EOLN or no space in buffer */
  391. while (len) {
  392. c = fgetc (fh);
  393. if (c == EOF || c == '\n')
  394. break;
  395. if (c != '\r') {
  396. if (c != '\t') {
  397. *p++ = (CHAR)c;
  398. len--;
  399. } else {
  400. /* tabs: expand to spaces */
  401. c = (int)(min (8 - ((p - lpbuf) & 0x0007), len));
  402. len -= c;
  403. while (c) {
  404. *p++ = ' ';
  405. c--;
  406. }
  407. }
  408. }
  409. }
  410. /* null terminate string and translate to Unicode */
  411. *p = 0;
  412. MultiByteToWideChar (uiCodePage, MB_PRECOMPOSED, lpbuf, -1, wbuf, (int)(p - lpbuf + 1));
  413. LocalFree (lpbuf);
  414. }
  415. }
  416. /* return false if EOF with no chars read */
  417. return !(c == EOF && !*wbuf);
  418. }
  419. /*----------------------------------------------------------*/
  420. /* */
  421. /* myfwrite() - */
  422. /* */
  423. /* Wrapper for fwrite to ensure data gets to the disk. */
  424. /* returns if ok, calls quit if write fails */
  425. /*----------------------------------------------------------*/
  426. void
  427. myfwrite(
  428. const void *pv,
  429. size_t s,
  430. size_t n,
  431. PFILE fp
  432. )
  433. {
  434. if (fwrite(pv, s, n, fp) == n)
  435. return;
  436. fatal(1122);
  437. }