Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

530 lines
15 KiB

  1. /***********************************************************************/
  2. /* */
  3. /* RCFUTIL.C - */
  4. /* */
  5. /* Windows 3.0 Resource compiler - File utility functions */
  6. /* */
  7. /* */
  8. /***********************************************************************/
  9. #include "rc.h"
  10. /* IsTextUnicode has to be here so this will run on Chicago and NT 1.0. */
  11. #define UNICODE_FFFF 0xFFFF
  12. #define REVERSE_BYTE_ORDER_MARK 0xFFFE
  13. #define BYTE_ORDER_MARK 0xFEFF
  14. #define PARAGRAPH_SEPARATOR 0x2029
  15. #define LINE_SEPARATOR 0x2028
  16. #define UNICODE_TAB 0x0009
  17. #define UNICODE_LF 0x000A
  18. #define UNICODE_CR 0x000D
  19. #define UNICODE_SPACE 0x0020
  20. #define UNICODE_CJK_SPACE 0x3000
  21. #define UNICODE_R_TAB 0x0900
  22. #define UNICODE_R_LF 0x0A00
  23. #define UNICODE_R_CR 0x0D00
  24. #define UNICODE_R_SPACE 0x2000
  25. #define UNICODE_R_CJK_SPACE 0x0030 /* Ambiguous - same as ASCII '0' */
  26. #define ASCII_CRLF 0x0A0D
  27. #define __max(a,b) (((a) > (b)) ? (a) : (b))
  28. #define __min(a,b) (((a) < (b)) ? (a) : (b))
  29. #define ARGUMENT_PRESENT(a) (a != NULL)
  30. BOOL
  31. WINAPI
  32. LocalIsTextUnicode(
  33. CONST LPVOID Buffer,
  34. int Size,
  35. LPINT Result
  36. )
  37. /*++
  38. Routine Description:
  39. IsTextUnicode performs a series of inexpensive heuristic checks
  40. on a buffer in order to verify that it contains Unicode data.
  41. [[ need to fix this section, see at the end ]]
  42. Found Return Result
  43. BOM TRUE BOM
  44. RBOM FALSE RBOM
  45. FFFF FALSE Binary
  46. NULL FALSE Binary
  47. null TRUE null bytes
  48. ASCII_CRLF FALSE CRLF
  49. UNICODE_TAB etc. TRUE Zero Ext Controls
  50. UNICODE_TAB_R FALSE Reversed Controls
  51. UNICODE_ZW etc. TRUE Unicode specials
  52. 1/3 as little variation in hi-byte as in lo byte: TRUE Correl
  53. 3/1 or worse " FALSE AntiCorrel
  54. Arguments:
  55. Buffer - pointer to buffer containing text to examine.
  56. Size - size of buffer in bytes. At most 256 characters in this will
  57. be examined. If the size is less than the size of a unicode
  58. character, then this function returns FALSE.
  59. Result - optional pointer to a flag word that contains additional information
  60. about the reason for the return value. If specified, this value on
  61. input is a mask that is used to limit the factors this routine uses
  62. to make it decision. On output, this flag word is set to contain
  63. those flags that were used to make its decision.
  64. Return Value:
  65. Boolean value that is TRUE if Buffer contains unicode characters.
  66. --*/
  67. {
  68. CPINFO cpinfo;
  69. UNALIGNED WCHAR *lpBuff = (UNALIGNED WCHAR *) Buffer;
  70. PCHAR lpb = (PCHAR) Buffer;
  71. ULONG iBOM = 0;
  72. ULONG iCR = 0;
  73. ULONG iLF = 0;
  74. ULONG iTAB = 0;
  75. ULONG iSPACE = 0;
  76. ULONG iCJK_SPACE = 0;
  77. ULONG iFFFF = 0;
  78. ULONG iPS = 0;
  79. ULONG iLS = 0;
  80. ULONG iRBOM = 0;
  81. ULONG iR_CR = 0;
  82. ULONG iR_LF = 0;
  83. ULONG iR_TAB = 0;
  84. ULONG iR_SPACE = 0;
  85. ULONG iNull = 0;
  86. ULONG iUNULL = 0;
  87. ULONG iCRLF = 0;
  88. ULONG iTmp;
  89. ULONG LastLo = 0;
  90. ULONG LastHi = 0;
  91. ULONG iHi, iLo;
  92. ULONG HiDiff = 0;
  93. ULONG LoDiff = 0;
  94. ULONG cLeadByte = 0;
  95. ULONG cWeird = 0;
  96. ULONG iResult = 0;
  97. ULONG iMaxTmp = __min(256, Size / sizeof(WCHAR));
  98. if (Size < 2 ) {
  99. if (ARGUMENT_PRESENT( Result )) {
  100. *Result = IS_TEXT_UNICODE_ASCII16 | IS_TEXT_UNICODE_CONTROLS;
  101. }
  102. return FALSE;
  103. }
  104. // Check at most 256 wide character, collect various statistics
  105. for (iTmp = 0; iTmp < iMaxTmp; iTmp++) {
  106. switch (lpBuff[iTmp]) {
  107. case BYTE_ORDER_MARK:
  108. iBOM++;
  109. break;
  110. case PARAGRAPH_SEPARATOR:
  111. iPS++;
  112. break;
  113. case LINE_SEPARATOR:
  114. iLS++;
  115. break;
  116. case UNICODE_LF:
  117. iLF++;
  118. break;
  119. case UNICODE_TAB:
  120. iTAB++;
  121. break;
  122. case UNICODE_SPACE:
  123. iSPACE++;
  124. break;
  125. case UNICODE_CJK_SPACE:
  126. iCJK_SPACE++;
  127. break;
  128. case UNICODE_CR:
  129. iCR++;
  130. break;
  131. // The following codes are expected to show up in
  132. // byte reversed files
  133. case REVERSE_BYTE_ORDER_MARK:
  134. iRBOM++;
  135. break;
  136. case UNICODE_R_LF:
  137. iR_LF++;
  138. break;
  139. case UNICODE_R_TAB:
  140. iR_TAB++;
  141. break;
  142. case UNICODE_R_CR:
  143. iR_CR++;
  144. break;
  145. case UNICODE_R_SPACE:
  146. iR_SPACE++;
  147. break;
  148. // The following codes are illegal and should never occur
  149. case UNICODE_FFFF:
  150. iFFFF++;
  151. break;
  152. case UNICODE_NULL:
  153. iUNULL++;
  154. break;
  155. // The following is not currently a Unicode character
  156. // but is expected to show up accidentally when reading
  157. // in ASCII files which use CRLF on a little endian machine
  158. case ASCII_CRLF:
  159. iCRLF++;
  160. break; /* little endian */
  161. }
  162. // Collect statistics on the fluctuations of high bytes
  163. // versus low bytes
  164. iHi = HIBYTE (lpBuff[iTmp]);
  165. iLo = LOBYTE (lpBuff[iTmp]);
  166. // Count cr/lf and lf/cr that cross two words
  167. if ((iLo == '\r' && LastHi == '\n') ||
  168. (iLo == '\n' && LastHi == '\r')) {
  169. cWeird++;
  170. }
  171. iNull += (iHi ? 0 : 1) + (iLo ? 0 : 1); /* count Null bytes */
  172. HiDiff += __max( iHi, LastHi ) - __min( LastHi, iHi );
  173. LoDiff += __max( iLo, LastLo ) - __min( LastLo, iLo );
  174. LastLo = iLo;
  175. LastHi = iHi;
  176. }
  177. // Count cr/lf and lf/cr that cross two words
  178. if ((iLo == '\r' && LastHi == '\n') ||
  179. (iLo == '\n' && LastHi == '\r')) {
  180. cWeird++;
  181. }
  182. if (iHi == '\0') /* don't count the last null */
  183. iNull--;
  184. if (iHi == 26) /* count ^Z at end as weird */
  185. cWeird++;
  186. iMaxTmp = (ULONG)__min(256 * sizeof(WCHAR), Size);
  187. GetCPInfo(CP_ACP, &cpinfo);
  188. if (cpinfo.MaxCharSize != 1) {
  189. for (iTmp = 0; iTmp < iMaxTmp; iTmp++) {
  190. if (IsDBCSLeadByteEx(uiCodePage, lpb[iTmp])) {
  191. cLeadByte++;
  192. iTmp++; /* should check for trailing-byte range */
  193. }
  194. }
  195. }
  196. // sift the statistical evidence
  197. if (LoDiff < 127 && HiDiff == 0) {
  198. iResult |= IS_TEXT_UNICODE_ASCII16; /* likely 16-bit ASCII */
  199. }
  200. if (HiDiff && LoDiff == 0) {
  201. iResult |= IS_TEXT_UNICODE_REVERSE_ASCII16; /* reverse 16-bit ASCII */
  202. }
  203. // Use leadbyte info to weight statistics.
  204. if (!cpinfo.MaxCharSize != 1 || cLeadByte == 0 ||
  205. !ARGUMENT_PRESENT(Result) || !(*Result & IS_TEXT_UNICODE_DBCS_LEADBYTE)) {
  206. iHi = 3;
  207. } else {
  208. // A ratio of cLeadByte:cb of 1:2 ==> dbcs
  209. // Very crude - should have a nice eq.
  210. iHi = __min(256, Size/sizeof(WCHAR)) / 2;
  211. if (cLeadByte < (iHi-1) / 3) {
  212. iHi = 3;
  213. } else if (cLeadByte < (2 * (iHi-1)) / 3) {
  214. iHi = 2;
  215. } else {
  216. iHi = 1;
  217. }
  218. iResult |= IS_TEXT_UNICODE_DBCS_LEADBYTE;
  219. }
  220. if (iHi * HiDiff < LoDiff) {
  221. iResult |= IS_TEXT_UNICODE_STATISTICS;
  222. }
  223. if (iHi * LoDiff < HiDiff) {
  224. iResult |= IS_TEXT_UNICODE_REVERSE_STATISTICS;
  225. }
  226. //
  227. // Any control codes widened to 16 bits? Any Unicode character
  228. // which contain one byte in the control code range?
  229. //
  230. if (iCR + iLF + iTAB + iSPACE + iCJK_SPACE /*+iPS+iLS*/) {
  231. iResult |= IS_TEXT_UNICODE_CONTROLS;
  232. }
  233. if (iR_LF + iR_CR + iR_TAB + iR_SPACE) {
  234. iResult |= IS_TEXT_UNICODE_REVERSE_CONTROLS;
  235. }
  236. //
  237. // Any characters that are illegal for Unicode?
  238. //
  239. if (((iRBOM + iFFFF + iUNULL + iCRLF) != 0) || ((cWeird != 0) && (cWeird >= iMaxTmp/40))) {
  240. iResult |= IS_TEXT_UNICODE_ILLEGAL_CHARS;
  241. }
  242. //
  243. // Odd buffer length cannot be Unicode
  244. //
  245. if (Size & 1) {
  246. iResult |= IS_TEXT_UNICODE_ODD_LENGTH;
  247. }
  248. //
  249. // Any NULL bytes? (Illegal in ANSI)
  250. //
  251. if (iNull) {
  252. iResult |= IS_TEXT_UNICODE_NULL_BYTES;
  253. }
  254. //
  255. // POSITIVE evidence, BOM or RBOM used as signature
  256. //
  257. if (*lpBuff == BYTE_ORDER_MARK) {
  258. iResult |= IS_TEXT_UNICODE_SIGNATURE;
  259. } else if (*lpBuff == REVERSE_BYTE_ORDER_MARK) {
  260. iResult |= IS_TEXT_UNICODE_REVERSE_SIGNATURE;
  261. }
  262. //
  263. // limit to desired categories if requested.
  264. //
  265. if (ARGUMENT_PRESENT( Result )) {
  266. iResult &= *Result;
  267. *Result = iResult;
  268. }
  269. //
  270. // There are four separate conclusions:
  271. //
  272. // 1: The file APPEARS to be Unicode AU
  273. // 2: The file CANNOT be Unicode CU
  274. // 3: The file CANNOT be ANSI CA
  275. //
  276. //
  277. // This gives the following possible results
  278. //
  279. // CU
  280. // + -
  281. //
  282. // AU AU
  283. // + - + -
  284. // -------- --------
  285. // CA +| 0 0 2 3
  286. // |
  287. // -| 1 1 4 5
  288. //
  289. //
  290. // Note that there are only 6 really different cases, not 8.
  291. //
  292. // 0 - This must be a binary file
  293. // 1 - ANSI file
  294. // 2 - Unicode file (High probability)
  295. // 3 - Unicode file (more than 50% chance)
  296. // 5 - No evidence for Unicode (ANSI is default)
  297. //
  298. // The whole thing is more complicated if we allow the assumption
  299. // of reverse polarity input. At this point we have a simplistic
  300. // model: some of the reverse Unicode evidence is very strong,
  301. // we ignore most weak evidence except statistics. If this kind of
  302. // strong evidence is found together with Unicode evidence, it means
  303. // its likely NOT Text at all. Furthermore if a REVERSE_BYTE_ORDER_MARK
  304. // is found, it precludes normal Unicode. If both byte order marks are
  305. // found it's not Unicode.
  306. //
  307. //
  308. // Unicode signature : uncontested signature outweighs reverse evidence
  309. //
  310. if ((iResult & IS_TEXT_UNICODE_SIGNATURE) &&
  311. !(iResult & (IS_TEXT_UNICODE_NOT_UNICODE_MASK&(~IS_TEXT_UNICODE_DBCS_LEADBYTE)))
  312. ) {
  313. return TRUE;
  314. }
  315. //
  316. // If we have conflicting evidence, it's not Unicode
  317. //
  318. if (iResult & IS_TEXT_UNICODE_REVERSE_MASK) {
  319. return FALSE;
  320. }
  321. //
  322. // Statistical and other results (cases 2 and 3)
  323. //
  324. if (!(iResult & IS_TEXT_UNICODE_NOT_UNICODE_MASK) &&
  325. ((iResult & IS_TEXT_UNICODE_NOT_ASCII_MASK) ||
  326. (iResult & IS_TEXT_UNICODE_UNICODE_MASK)
  327. )
  328. ) {
  329. return TRUE;
  330. }
  331. return FALSE;
  332. }
  333. /*------------------------------------------------------------------*/
  334. /* */
  335. /* fgetl() - */
  336. /* */
  337. /*------------------------------------------------------------------*/
  338. /* fgetl expands tabs and returns lines w/o separators */
  339. /* returns TRUE with the line (no CRLFs) in wbuf; returns FALSE at EOF when no characters were read */
  340. int
  341. fgetl (
  342. PWCHAR wbuf,
  343. int len,
  344. BOOL bUnicode,
  345. PFILE fh
  346. )
  347. {
  348. int c = 0;
  349. int second;
  350. *wbuf = 0;
  351. if (bUnicode) {
  352. PWCHAR p;
  353. /* remember NUL at end */
  354. len--;
  355. p = wbuf;
  356. /* fill buffer from the file until EOF or EOLN or no space in buffer */
  357. while (len) {
  358. c = fgetc (fh);
  359. if (c == EOF)
  360. break;
  361. second = fgetc (fh);
  362. c = MAKEWORD (c, second);
  363. if (c == L'\n')
  364. break;
  365. if (c != L'\r') {
  366. if (c != L'\t') {
  367. *p++ = (WCHAR)c;
  368. len--;
  369. } else {
  370. /* tabs: expand to spaces */
  371. c = (int)(min (8 - ((p - wbuf) & 0x0007), len));
  372. len -= c;
  373. while (c) {
  374. *p++ = L' ';
  375. c--;
  376. }
  377. }
  378. }
  379. }
  380. /* null terminate string */
  381. *p = 0;
  382. } else {
  383. PCHAR p;
  384. PCHAR lpbuf;
  385. p = lpbuf = (PCHAR) LocalAlloc (LPTR, len);
  386. if (p) {
  387. /* remember NUL at end */
  388. len--;
  389. /* fill buffer from the file until EOF or EOLN or no space in buffer */
  390. while (len) {
  391. c = fgetc (fh);
  392. if (c == EOF || c == '\n')
  393. break;
  394. if (c != '\r') {
  395. if (c != '\t') {
  396. *p++ = (CHAR)c;
  397. len--;
  398. } else {
  399. /* tabs: expand to spaces */
  400. c = (int)(min (8 - ((p - lpbuf) & 0x0007), len));
  401. len -= c;
  402. while (c) {
  403. *p++ = ' ';
  404. c--;
  405. }
  406. }
  407. }
  408. }
  409. /* null terminate string and translate to Unicode */
  410. *p = 0;
  411. MultiByteToWideChar (uiCodePage, MB_PRECOMPOSED, lpbuf, -1, wbuf, (int)(p - lpbuf + 1));
  412. LocalFree (lpbuf);
  413. }
  414. }
  415. /* return false if EOF with no chars read */
  416. return !(c == EOF && !*wbuf);
  417. }
  418. /*----------------------------------------------------------*/
  419. /* */
  420. /* myfwrite() - */
  421. /* */
  422. /* Wrapper for fwrite to ensure data gets to the disk. */
  423. /* returns if ok, calls quit if write fails */
  424. /*----------------------------------------------------------*/
  425. void
  426. myfwrite(
  427. const void *pv,
  428. size_t s,
  429. size_t n,
  430. PFILE fp
  431. )
  432. {
  433. if (fwrite(pv, s, n, fp) == n)
  434. return;
  435. else
  436. quit(GET_MSG(1122));
  437. }