Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

838 lines
20 KiB

  1. /*----------------------------------------------------------------------------
  2. %%File: fechauto.c
  3. %%Unit: fechmap
  4. %%Contact: jpick
  5. Module that attempts to auto-detect encoding for a given stream.
  6. ----------------------------------------------------------------------------*/
  7. #include <stdio.h>
  8. #include <stddef.h>
  9. #include "private.h"
  10. #include "fechmap_.h"
  11. #include "lexint_.h"
  12. // Code marked by these #defines will be deleted eventually ...
  13. // (It prints out useful information and statistics about how
  14. // auto-detect is doing and what it's finding in the input).
  15. //
  16. #define JPDEBUG 0
  17. #define JPDEBUG2 0
  18. #define JPDEBUG3 0
  19. #define NEED_NAMES 0
  20. #if JPDEBUG || JPDEBUG2 || JPDEBUG3
  21. #undef NEED_NAMES
  22. #define NEED_NAMES 1
  23. #endif
  24. #if NEED_NAMES
  25. static char *rgszIcetNames[icetCount] =
  26. {
  27. "icetEucCn",
  28. "icetEucJp",
  29. "icetEucKr",
  30. "icetEucTw",
  31. "icetIso2022Cn",
  32. "icetIso2022Jp",
  33. "icetIso2022Kr",
  34. "icetIso2022Tw",
  35. "icetBig5",
  36. "icetGbk",
  37. "icetHz",
  38. "icetShiftJis",
  39. "icetWansung",
  40. "icetUtf7",
  41. "icetUtf8",
  42. };
  43. #endif
  44. // Characters we care about
  45. //
  46. #define chSo (UCHAR) 0x0e
  47. #define chSi (UCHAR) 0x0f
  48. #define chEsc (UCHAR) 0x1b
  49. // Minimum Sample Size
  50. //
  51. #define cchMinSample 64
  52. // High-ASCII character threshold. If this routine is unable
  53. // to absolutely determine the encoding of this file, it will
  54. // need to guess. Files that are ASCII, but contain high-ASCII
  55. // characters (e.g., a file with some Cyrillic characters) may
  56. // confuse us. If the number of high-ASCII characters falls
  57. // below this threshold, return the encoding we guessed but
  58. // also return a special rc that says the file "might be ASCII."
  59. //
  60. // 5%, for now.
  61. //
  62. // 40%, for now, of the high-ascii characters must be in high-
  63. // ascii pairs. (Pulled down because of Big5 and the other
  64. // DBCS encodings that can have trail bytes in the low range).
  65. //
  66. #define nHighCharThreshold 5 // %
  67. #define nHighPairThreshold 40 // %
  68. // Used by CceDetermineInputTypeReturnAll() to determine whether any icet has
  69. // high enough count to rule out all other icets.
  70. //
  71. #define CchCountThreshold(icet) (((icet) == icetHz || (icet) == icetUtf7) ? 5 : 10)
  72. // Tokens
  73. //
  74. // Stop tokens (negative) imply special handling and will cause
  75. // the processing loop to stop (eof, err, si, so and esc are
  76. // stop tokens).
  77. //
  78. #define xmn 0
  79. #define esc (-1)
  80. #define so (-2)
  81. #define si (-3)
  82. #define eof (-4)
  83. #define err (-5)
  84. #define _FStopToken(tk) ((tk) < 0)
  85. // Masks used in _CBitsOnFromUlong()
  86. //
  87. #define lMaskBitCount1 (LONG) 0x55555555
  88. #define lMaskBitCount2 (LONG) 0x33333333
  89. #define lMaskBitCount3 (LONG) 0x0F0F0F0F
  90. #define lMaskBitCount4 (LONG) 0x00FF00FF
  91. #define lMaskBitCount5 (LONG) 0x0000FFFF
  92. /* _ C B I T S O N F R O M U L O N G */
  93. /*----------------------------------------------------------------------------
  94. %%Function: _CBitsOnFromUlong
  95. %%Contact: jpick
  96. (adapted from code in convio.c)
  97. ----------------------------------------------------------------------------*/
  98. int __inline _CBitsOnFromUlong(ULONG ulBits)
  99. {
  100. ulBits = (ulBits & lMaskBitCount1) + ((ulBits & ~lMaskBitCount1) >> 1);
  101. ulBits = (ulBits & lMaskBitCount2) + ((ulBits & ~lMaskBitCount2) >> 2);
  102. ulBits = (ulBits & lMaskBitCount3) + ((ulBits & ~lMaskBitCount3) >> 4);
  103. ulBits = (ulBits & lMaskBitCount4) + ((ulBits & ~lMaskBitCount4) >> 8);
  104. ulBits = (ulBits & lMaskBitCount5) + ((ulBits & ~lMaskBitCount5) >> 16);
  105. return (int)ulBits;
  106. }
  107. // Masks for the encodings
  108. //
  109. #define grfEucCn (ULONG) 0x0001
  110. #define grfEucJp (ULONG) 0x0002
  111. #define grfEucKr (ULONG) 0x0004
  112. #define grfEucTw (ULONG) 0x0008
  113. #define grfIso2022Cn (ULONG) 0x0010
  114. #define grfIso2022Jp (ULONG) 0x0020
  115. #define grfIso2022Kr (ULONG) 0x0040
  116. #define grfIso2022Tw (ULONG) 0x0080
  117. #define grfBig5 (ULONG) 0x0100
  118. #define grfGbk (ULONG) 0x0200
  119. #define grfHz (ULONG) 0x0400
  120. #define grfShiftJis (ULONG) 0x0800
  121. #define grfWansung (ULONG) 0x1000
  122. #define grfUtf7 (ULONG) 0x2000
  123. #define grfUtf8 (ULONG) 0x4000
  124. // grfAll assumes that the tests for Euc-Kr fall within those
  125. // for Wansung (as far as I can tell from reading, Euc-Kr is a
  126. // strict subset of Wansung). The same for Euc-Cn and Gbk. No
  127. // need to test for both the subset and the whole.
  128. //
  129. #define grfAll (ULONG) 0x7FFA
  130. #define grfAllButIso2022 (ULONG) 0x7F0A
  131. #define cAll 13 // == number bits set in grfAll
  132. #define cAllButIso2022 9 // == number bits set in grfAllButIso2022
  133. // Array that maps an encoding to its mask
  134. //
  135. static ULONG _mpicetgrf[icetCount] =
  136. {
  137. grfEucCn,
  138. grfEucJp,
  139. grfEucKr,
  140. grfEucTw,
  141. grfIso2022Cn,
  142. grfIso2022Jp,
  143. grfIso2022Kr,
  144. grfIso2022Tw,
  145. grfBig5,
  146. grfGbk,
  147. grfHz,
  148. grfShiftJis,
  149. grfWansung,
  150. grfUtf7,
  151. grfUtf8,
  152. };
  153. // Prototypes
  154. //
  155. static int _NGetNextUch(IStream *pstmIn, unsigned char *c, BOOL *lpfIsHigh);
  156. static ICET _IcetFromIcetMask(ULONG ulMask);
  157. static ICET _IcetDefaultFromIcetMask(ULONG ulMask);
  158. static CCE _CceResolveAmbiguity(ULONG grfIcet, ICET *lpicet, int nPrefCp, EFam efPref);
  159. static CCE _CceReadEscSeq(IStream *pstmIn, int nPrefCp, ICET *lpicet, BOOL *lpfGuess);
  160. /* C C E D E T E R M I N E I N P U T T Y P E */
  161. /*----------------------------------------------------------------------------
  162. %%Function: CceDetermineInputType
  163. %%Contact: jpick
  164. Attempt to determine the appropriate ICET type for the given
  165. stream. Caller-supplied get/unget routines used for data access.
  166. ----------------------------------------------------------------------------*/
  167. CCE CceDetermineInputType(
  168. IStream *pstmIn, // input stream
  169. DWORD dwFlags, // configuration flags
  170. EFam efPref, // optional: preferred encoding family
  171. int nPrefCp, // optional: preferred code page
  172. ICET *lpicet, // set to detected encoding
  173. BOOL *lpfGuess // set to fTrue if function "guessed"
  174. )
  175. {
  176. unsigned char uch;
  177. int nToken;
  178. CCE cceRet;
  179. BOOL fGuess;
  180. ICET icet;
  181. int cIcetActive;
  182. ULONG grfIcetActive; // Bitarray tracks which encodings are still active candidates.
  183. ICET icetSeq;
  184. int i, nCount, nCountCurr;
  185. DWORD dwValFlags;
  186. BOOL fIsHigh;
  187. int cchHigh = 0;
  188. int cchHighPairs = 0;
  189. int cchTotal = 0;
  190. BOOL fLastHigh = fFalse;
  191. #if JPDEBUG3
  192. ULONG grfIcetNoCommonChars;
  193. #endif
  194. #if JPDEBUG
  195. printf("flags: %d\n", dwFlags);
  196. #endif
  197. // Initialize parsers
  198. //
  199. dwValFlags = grfCountCommonChars;
  200. if (dwFlags & grfDetectUseCharMapping)
  201. dwValFlags |= grfValidateCharMapping;
  202. ValidateInitAll(dwValFlags);
  203. // Initialize locals -- be optimistic
  204. //
  205. cceRet = cceSuccess;
  206. fGuess = fFalse;
  207. grfIcetActive = grfAllButIso2022;
  208. cIcetActive = cAllButIso2022;
  209. #if JPDEBUG3
  210. grfIcetNoCommonChars = grfAllButIso2022;
  211. #endif
  212. while (fTrue)
  213. {
  214. nToken = _NGetNextUch(pstmIn, &uch, &fIsHigh);
  215. if (_FStopToken(nToken))
  216. break;
  217. // Update (admittedly dumb) statistics -- really counts high
  218. // ascii characters in runs (not really pairs). But threshold
  219. // constants (defined, above) were determined by calculating
  220. // exactly these numbers for ~25 files, so it should be ok (?).
  221. //
  222. ++cchTotal;
  223. if (fIsHigh)
  224. {
  225. ++cchHigh;
  226. if (fLastHigh)
  227. ++cchHighPairs;
  228. }
  229. fLastHigh = fIsHigh;
  230. for (i = 0; i < icetCount; i++)
  231. {
  232. if (!(grfIcetActive & _mpicetgrf[i]) || (NValidateUch((ICET)i, uch, fFalse) != 0))
  233. continue;
  234. grfIcetActive &= ~_mpicetgrf[i];
  235. --cIcetActive;
  236. #if JPDEBUG
  237. printf("Log: Lost %s at offset 0x%.4x (%d), char 0x%.2x\n", rgszIcetNames[i], (cchTotal-1), (cchTotal-1), uch);
  238. #endif
  239. }
  240. #if JPDEBUG3
  241. for (i = 0; i < icetCount; i++)
  242. {
  243. if (!(grfIcetActive & _mpicetgrf[i]) || !(grfIcetNoCommonChars & _mpicetgrf[i]))
  244. continue;
  245. if (!FValidateCharCount(i, &nCount) || (nCount == 0))
  246. continue;
  247. grfIcetNoCommonChars &= ~_mpicetgrf[i];
  248. printf("Log: Found first common seq for %s at offset 0x%.4x (%d)\n", rgszIcetNames[i], (cchTotal-1), (cchTotal-1));
  249. }
  250. #endif
  251. if ((cIcetActive == 0) || ((cIcetActive == 1) && (cchTotal > cchMinSample)))
  252. break;
  253. }
  254. // Figure out why we exited the loop.
  255. //
  256. if (nToken == err)
  257. {
  258. cceRet = cceRead;
  259. goto _LRet;
  260. }
  261. // Process escapes separately. Interpret the escape sequence
  262. // to determine for real which ISO7 flavor we have found.
  263. //
  264. if ((nToken == esc) || (nToken == so) || (nToken == si))
  265. {
  266. LARGE_INTEGER li;
  267. HRESULT hr;
  268. LISet32(li, -1 );
  269. hr = pstmIn->Seek(li,STREAM_SEEK_CUR, NULL);
  270. // if (!pfnUnget(uch, lpvPrivate))
  271. // {
  272. // cceRet = cceUnget;
  273. // goto _LRet;
  274. // }
  275. cceRet = _CceReadEscSeq(pstmIn, nPrefCp, &icet, &fGuess);
  276. #if JPDEBUG
  277. if (cceRet == cceSuccess)
  278. printf("Log: Found encoding %s at offset 0x%.4x (%d)\n", rgszIcetNames[icet], cchTotal, cchTotal);
  279. #endif
  280. // ISO is a special case -- no need to check statistics.
  281. //
  282. goto _LRet;
  283. }
  284. #if JPDEBUG2
  285. printf("Counts: %d total chars, %d high chars, %d high pairs\n", cchTotal, cchHigh, cchHighPairs);
  286. #endif
  287. // If the token was eof, and we're not ignoring eof, transition
  288. // the remaining active sets on eof.
  289. //
  290. if ((nToken == eof) && !(dwFlags & grfDetectIgnoreEof))
  291. {
  292. for (i = 0; i < icetCount; i++)
  293. {
  294. if (!(grfIcetActive & _mpicetgrf[i]) || (NValidateUch((ICET)i, 0, fTrue) != 0))
  295. continue;
  296. #if JPDEBUG
  297. printf("Log: Lost %s at EOF\n", rgszIcetNames[i]);
  298. #endif
  299. grfIcetActive &= ~_mpicetgrf[i];
  300. --cIcetActive;
  301. }
  302. }
  303. Assert(cIcetActive >= 0); // better *not* be less than 0
  304. // See how we've narrowed our field of choices and set the
  305. // return status accordingly.
  306. //
  307. if (cIcetActive <= 0)
  308. {
  309. #if JPDEBUG
  310. printf("Log: Bailed out entirely at offset 0x%.4x (%d)\n", cchTotal, cchTotal);
  311. #endif
  312. cceRet = cceUnknownInput;
  313. goto _LRet;
  314. }
  315. else if (cIcetActive == 1)
  316. {
  317. icet = _IcetFromIcetMask(grfIcetActive);
  318. #if JPDEBUG
  319. printf("Log: Found encoding %s at offset 0x%.4x (%d)\n", rgszIcetNames[icet], cchTotal, cchTotal);
  320. #endif
  321. // If we matched an encoding type and also found matching
  322. // common character runs, skip statistics (see comment,
  323. // below).
  324. //
  325. if (FValidateCharCount(icet, &nCount) && (nCount > 0))
  326. {
  327. #if JPDEBUG3
  328. printf("Log: %d common sequences for %s\n", nCount, rgszIcetNames[icet]);
  329. #endif
  330. goto _LRet;
  331. }
  332. else
  333. {
  334. goto _LStats;
  335. }
  336. }
  337. // Did we learn anything from counting characters?
  338. //
  339. icetSeq = (ICET)-1;
  340. nCountCurr = 0;
  341. for (i = 0; i < icetCount; i++)
  342. {
  343. if (!(grfIcetActive & _mpicetgrf[i]) || !FValidateCharCount((ICET)i, &nCount))
  344. continue;
  345. if (nCount > nCountCurr)
  346. {
  347. icetSeq = (ICET)i;
  348. nCountCurr = nCount;
  349. }
  350. #if JPDEBUG3
  351. printf("Log: %d common sequences for %s\n", nCount, rgszIcetNames[i]);
  352. #endif
  353. }
  354. // Any luck? If so, return. Don't bother checking statistics.
  355. // We just proved that we found at least one common run of
  356. // characters in this input. The odds against this for just a
  357. // plain ASCII file with some high characters seem pretty high.
  358. // Ignore the statistics and just return the encoding type we
  359. // found.
  360. //
  361. if (icetSeq != -1)
  362. {
  363. icet = icetSeq;
  364. goto _LRet;
  365. }
  366. #if JPDEBUG
  367. printf("Log: Active Icet Mask 0x%.8x, %d left\n", grfIcetActive, cIcetActive);
  368. printf("Log: Icet's left -- ");
  369. for (i = 0; i < icetCount; i++)
  370. {
  371. if (grfIcetActive & _mpicetgrf[i])
  372. printf("%s, ", rgszIcetNames[i]);
  373. }
  374. printf("\n");
  375. #endif
  376. // If caller did not want us to try to guess at the encoding
  377. // in the absence of definitive data, bail out.
  378. //
  379. if (!(dwFlags & grfDetectResolveAmbiguity))
  380. {
  381. cceRet = cceAmbiguousInput;
  382. goto _LRet;
  383. }
  384. // We're guessing -- note it.
  385. //
  386. fGuess = fTrue;
  387. // More than one active encoding. Attempt to resolve ambiguity.
  388. //
  389. cceRet = _CceResolveAmbiguity(grfIcetActive, &icet, nPrefCp, efPref);
  390. if (cceRet != cceSuccess)
  391. return cceRet;
  392. _LStats:
  393. // Adjust the return code based on the "statistics" we gathered,
  394. // above.
  395. //
  396. if (cchHigh > 0)
  397. {
  398. if ((cchTotal < cchMinSample) ||
  399. (((cchHigh * 100) / cchTotal) < nHighCharThreshold) ||
  400. (((cchHighPairs * 100) / cchHigh) < nHighPairThreshold))
  401. {
  402. cceRet = cceMayBeAscii;
  403. }
  404. }
  405. else
  406. {
  407. cceRet = cceMayBeAscii; // no high-ascii characters? definitely maybe!
  408. }
  409. #if JPDEBUG2
  410. if (cchHigh > 0)
  411. {
  412. int nPercent1 = ((cchHigh * 100) / cchTotal);
  413. int nPercent2 = ((cchHighPairs * 100) / cchHigh);
  414. printf("Ratios -- high/total: %d%%, runs/high: %d%%\n", nPercent1, nPercent2);
  415. }
  416. #endif
  417. _LRet:
  418. // Set the return variables, if successful.
  419. //
  420. if ((cceRet == cceSuccess) || (cceRet == cceMayBeAscii))
  421. {
  422. *lpicet = icet;
  423. *lpfGuess = fGuess;
  424. }
  425. #if JPDEBUG
  426. if (cceRet == cceSuccess)
  427. {
  428. printf("Log: Returning %s, fGuess = %s\n", rgszIcetNames[icet], (fGuess ? "fTrue" : "fFalse"));
  429. }
  430. else if (cceRet == cceMayBeAscii)
  431. {
  432. printf("Log: Returning %s, fGuess = %s, may-be-ASCII\n", rgszIcetNames[icet], (fGuess ? "fTrue" : "fFalse"));
  433. }
  434. #endif
  435. return cceRet;
  436. }
  437. /* _ N G E T N E X T U C H */
  438. /*----------------------------------------------------------------------------
  439. %%Function: _NGetNextUch
  440. %%Contact: jpick
  441. Get the next character from the input stream. Classify the character.
  442. ----------------------------------------------------------------------------*/
  443. static int _NGetNextUch(IStream *pstmIn, unsigned char *c, BOOL *lpfIsHigh)
  444. {
  445. ULONG rc;
  446. unsigned char uch;
  447. HRESULT hr;
  448. hr = pstmIn->Read(&uch, 1, &rc);
  449. if (rc == 0)
  450. return eof;
  451. else if (hr != S_OK )
  452. return err;
  453. *lpfIsHigh = (uch >= 0x80);
  454. *c = uch;
  455. switch (uch)
  456. {
  457. case chEsc:
  458. return esc;
  459. case chSo:
  460. return so;
  461. case chSi:
  462. return si;
  463. default:
  464. return xmn;
  465. }
  466. }
  467. // Masks for _CceResolveAmbiguity() -- only externally supported character
  468. // sets are used in ambiguity resolution. Don't include Euc-Tw here.
  469. //
  470. #define grfJapan (ULONG) (grfShiftJis | grfEucJp)
  471. #define grfChina (ULONG) (grfEucCn | grfGbk)
  472. #define grfKorea (ULONG) (grfEucKr | grfWansung)
  473. #define grfTaiwan (ULONG) (grfBig5)
  474. #define grfDbcs (ULONG) (grfShiftJis | grfGbk | grfWansung | grfBig5)
  475. #define grfEuc (ULONG) (grfEucJp | grfEucKr | grfEucCn)
  476. /* _ C E F R O M C E M A S K */
  477. /*----------------------------------------------------------------------------
  478. %%Function: _IcetFromIcetMask
  479. %%Contact: jpick
  480. ----------------------------------------------------------------------------*/
  481. static ICET _IcetFromIcetMask(ULONG ulMask)
  482. {
  483. switch (ulMask)
  484. {
  485. case grfEucCn:
  486. return icetEucCn;
  487. case grfEucJp:
  488. return icetEucJp;
  489. case grfEucKr:
  490. return icetEucKr;
  491. case grfEucTw:
  492. return icetEucTw;
  493. case grfIso2022Cn:
  494. return icetIso2022Cn;
  495. case grfIso2022Jp:
  496. return icetIso2022Jp;
  497. case grfIso2022Kr:
  498. return icetIso2022Kr;
  499. case grfIso2022Tw:
  500. return icetIso2022Tw;
  501. case grfBig5:
  502. return icetBig5;
  503. case grfGbk:
  504. return icetGbk;
  505. case grfHz:
  506. return icetHz;
  507. case grfShiftJis:
  508. return icetShiftJis;
  509. case grfWansung:
  510. return icetWansung;
  511. case grfUtf7:
  512. return icetUtf7;
  513. case grfUtf8:
  514. return icetUtf8;
  515. default:
  516. break;
  517. }
  518. // Should never get here ...
  519. //
  520. // NotReached();
  521. // Can't return a bogus value, here.
  522. //
  523. return icetShiftJis;
  524. }
  525. /* _ C E D E F A U L T F R O M C E M A S K */
  526. /*----------------------------------------------------------------------------
  527. %%Function: _IcetDefaultFromIcetMask
  528. %%Contact: jpick
  529. ----------------------------------------------------------------------------*/
  530. static ICET _IcetDefaultFromIcetMask(ULONG ulMask)
  531. {
  532. // Priorities -- DBCS, EUC, Japan, Taiwan, China and Korea (???).
  533. //
  534. if (ulMask & grfDbcs)
  535. {
  536. if (ulMask & grfJapan)
  537. return icetShiftJis;
  538. if (ulMask & grfChina)
  539. return icetGbk;
  540. if (ulMask & grfTaiwan)
  541. return icetBig5;
  542. if (ulMask & grfKorea)
  543. return icetWansung;
  544. }
  545. else // EUC
  546. {
  547. if (ulMask & grfJapan)
  548. return icetEucJp;
  549. if (ulMask & grfChina)
  550. return icetEucCn;
  551. if (ulMask & grfKorea)
  552. return icetEucKr; // may be able to return icetWansung, here
  553. }
  554. // (Assert);
  555. return icetShiftJis; // ???
  556. }
  557. /* _ U L C E M A S K F R O M C P E T P */
  558. /*----------------------------------------------------------------------------
  559. %%Function: _UlIcetMaskFromCpEf
  560. %%Contact: jpick
  561. ----------------------------------------------------------------------------*/
  562. static ULONG _UlIcetMaskFromCpEf(int nCp, EFam ef)
  563. {
  564. ULONG grf = grfAll;
  565. switch (nCp)
  566. {
  567. case nCpJapan:
  568. grf &= grfJapan;
  569. break;
  570. case nCpChina:
  571. grf &= grfChina;
  572. break;
  573. case nCpKorea:
  574. grf &= grfKorea;
  575. break;
  576. case nCpTaiwan:
  577. grf &= grfTaiwan;
  578. break;
  579. default:
  580. break;
  581. }
  582. switch (ef)
  583. {
  584. case efDbcs:
  585. grf &= grfDbcs;
  586. break;
  587. case efEuc:
  588. grf &= grfEuc;
  589. break;
  590. default:
  591. break;
  592. }
  593. return grf;
  594. }
  595. /* _ C C E R E S O L V E A M B I G U I T Y */
  596. /*----------------------------------------------------------------------------
  597. %%Function: _CceResolveAmbiguity
  598. %%Contact: jpick
  599. Attempt to resolve ambiguous input encoding based on user
  600. preferences, if set, and system code page. grfIcet contains a
  601. bitmask representing the encodings that are still possible after
  602. examining the input sample.
  603. ----------------------------------------------------------------------------*/
  604. static CCE _CceResolveAmbiguity(ULONG grfIcet, ICET *lpicet, int nPrefCp, EFam efPref)
  605. {
  606. ULONG grfIcetOrig = grfIcet;
  607. ULONG grfPref;
  608. ULONG grfSys;
  609. ULONG grfResult;
  610. int cIcet;
  611. // Build "list" of encodings based on user-prefs.
  612. //
  613. grfPref = _UlIcetMaskFromCpEf(nPrefCp, efPref);
  614. // See if the user's preferences make any difference.
  615. //
  616. grfResult = grfIcet & grfPref;
  617. if (grfResult)
  618. {
  619. cIcet = _CBitsOnFromUlong(grfResult);
  620. if (cIcet == 1)
  621. {
  622. *lpicet = _IcetFromIcetMask(grfResult);
  623. return cceSuccess;
  624. }
  625. else
  626. grfIcet = grfResult; // see comment, below
  627. }
  628. // Now look to the system code page for help. Look at
  629. // the set of encodings as modified by the user
  630. // preferences (??? do we want to do this ???).
  631. //
  632. if (!FIsFeCp(g_uACP) || (grfIcetOrig & grfUtf8))
  633. goto _LDefault;
  634. // Build "list" of encodings based on system cp.
  635. //
  636. grfSys = _UlIcetMaskFromCpEf(g_uACP, (EFam) 0);
  637. // See if the system cp makes any difference.
  638. //
  639. grfResult = grfIcet & grfSys;
  640. if (grfResult)
  641. {
  642. cIcet = _CBitsOnFromUlong(grfResult);
  643. if (cIcet == 1)
  644. {
  645. *lpicet = _IcetFromIcetMask(grfResult);
  646. return cceSuccess;
  647. }
  648. }
  649. _LDefault:
  650. // Special case -- pick UTF-8 if it's legal and the prefs
  651. // don't help us.
  652. //
  653. *lpicet =
  654. (grfIcetOrig & grfUtf8) ? icetUtf8 : _IcetDefaultFromIcetMask(grfIcet);
  655. return cceSuccess;
  656. }
  657. /* _ C C E R E A D E S C S E Q */
  658. /*----------------------------------------------------------------------------
  659. %%Function: _CceReadEscSeq
  660. %%Contact: jpick
  661. We've read (and put back) an escape character. Call the ISO-2022
  662. escape sequence converter to have it map the escape sequence to the
  663. appropriate character set. We may be looking at the escape sequence
  664. for ASCII, so be prepared to read ahead to the next one.
  665. ----------------------------------------------------------------------------*/
  666. static CCE _CceReadEscSeq(
  667. IStream *pstmIn, // input stream
  668. int nPrefCp,
  669. ICET *lpicet,
  670. BOOL *lpfGuess
  671. )
  672. {
  673. unsigned char uch;
  674. CCE cceRet;
  675. int nToken;
  676. BOOL fDummy;
  677. do
  678. {
  679. cceRet = CceReadEscSeq(pstmIn, lpicet);
  680. if ((cceRet == cceSuccess) || (cceRet != cceMayBeAscii))
  681. break;
  682. while (fTrue)
  683. {
  684. nToken = _NGetNextUch(pstmIn, &uch, &fDummy);
  685. if (_FStopToken(nToken))
  686. break;
  687. }
  688. // Why did we stop?
  689. //
  690. if (nToken == err)
  691. {
  692. cceRet = cceRead;
  693. break;
  694. }
  695. else if (nToken == eof)
  696. {
  697. // Means this is legal ISO-2022 input, but we've seen nothing
  698. // but non-flavor-specific escape sequences (e.g., only ASCII
  699. // or shift sequences). Choose the encoding type based on
  700. // preferences (only pick from those currently supported
  701. // externally).
  702. //
  703. switch (nPrefCp)
  704. {
  705. case nCpKorea:
  706. *lpicet = icetIso2022Kr;
  707. break;
  708. case nCpJapan:
  709. default: // Right ??? (gotta pick something ...)
  710. *lpicet = icetIso2022Jp;
  711. break;
  712. }
  713. *lpfGuess = fTrue; // not *really* guessing, but ... (???)
  714. cceRet = cceSuccess;
  715. break;
  716. }
  717. Assert((nToken == esc) || (nToken == so) || (nToken == si));
  718. {
  719. LARGE_INTEGER li;
  720. HRESULT hr;
  721. LISet32(li, -1 );
  722. hr = pstmIn->Seek(li,STREAM_SEEK_CUR, NULL);
  723. }
  724. // Put it back for CceReadEscSeq() to process.
  725. //
  726. // if (!pfnUnget(uch, lpvPrivate))
  727. // {
  728. // cceRet = cceUnget;
  729. // break;
  730. // }
  731. } while (fTrue);
  732. return cceRet;
  733. }