Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1019 lines
30 KiB

  1. #include "private.h"
  2. #include "detcbase.h"
  3. #include "codepage.h"
  4. #include "detcjpn.h"
  5. #include "detckrn.h"
  6. #include "fechrcnv.h"
  7. #include "msencode.h"
  8. #include "lcdetect.h"
  9. #include "cpdetect.h"
  10. CCpMRU *g_pCpMRU = NULL;
  11. // Get data from registry and construct cache
  12. HRESULT CCpMRU::Init(void)
  13. {
  14. BOOL bRegKeyReady = TRUE;
  15. HRESULT hr = S_OK;
  16. HKEY hkey;
  17. _pCpMRU = NULL;
  18. // HKCR\\Software\\Microsoft\internet explorer\\international\\CpMRU
  19. if (ERROR_SUCCESS != RegOpenKeyEx(HKEY_CURRENT_USER,
  20. REGSTR_PATH_CPMRU,
  21. 0, KEY_READ|KEY_SET_VALUE, &hkey))
  22. {
  23. DWORD dwAction = 0;
  24. if (ERROR_SUCCESS != RegCreateKeyEx(HKEY_CURRENT_USER,
  25. REGSTR_PATH_CPMRU,
  26. 0, NULL, REG_OPTION_NON_VOLATILE, KEY_ALL_ACCESS, NULL, &hkey, &dwAction))
  27. {
  28. bRegKeyReady = FALSE;
  29. dwCpMRUEnable = 0;
  30. hr = E_FAIL;
  31. }
  32. }
  33. if (bRegKeyReady)
  34. {
  35. DWORD dwType = REG_DWORD;
  36. DWORD dwSize = sizeof(DWORD);
  37. BOOL bUseDefault = FALSE;
  38. if (ERROR_SUCCESS != RegQueryValueEx(hkey, REG_KEY_CPMRU_ENABLE, 0, &dwType, (LPBYTE)&dwCpMRUEnable, &dwSize))
  39. {
  40. dwCpMRUEnable = 1;
  41. RegSetValueEx(hkey, REG_KEY_CPMRU_ENABLE, 0, REG_DWORD, (LPBYTE)&dwCpMRUEnable, sizeof(dwCpMRUEnable));
  42. }
  43. // If fail to open registry data or find unreasonable cache parameters, use default settings
  44. if ((ERROR_SUCCESS != RegQueryValueEx(hkey, REG_KEY_CPMRU_NUM, 0, &dwType, (LPBYTE)&dwCpMRUNum, &dwSize)) ||
  45. (ERROR_SUCCESS != RegQueryValueEx(hkey, REG_KEY_CPMRU_INIT_HITS, 0, &dwType, (LPBYTE)&dwCpMRUInitHits, &dwSize)) ||
  46. (ERROR_SUCCESS != RegQueryValueEx(hkey, REG_KEY_CPMRU_PERCENTAGE_FACTOR, 0, &dwType, (LPBYTE)&dwCpMRUFactor, &dwSize)) ||
  47. (dwCpMRUNum > MAX_CPMRU_NUM) || !dwCpMRUFactor || !dwCpMRUInitHits)
  48. {
  49. dwCpMRUNum = DEFAULT_CPMRU_NUM;
  50. dwCpMRUInitHits = DEFAULT_CPMRU_INIT_HITS;
  51. dwCpMRUFactor = DEFAULT_CPMRU_FACTOR;
  52. bUseDefault = TRUE;
  53. // Store default value in registry
  54. RegSetValueEx(hkey, REG_KEY_CPMRU_NUM, 0, REG_DWORD, (LPBYTE)&dwCpMRUNum, sizeof(dwCpMRUNum));
  55. RegSetValueEx(hkey, REG_KEY_CPMRU_INIT_HITS, 0, REG_DWORD, (LPBYTE)&dwCpMRUInitHits, sizeof(dwCpMRUInitHits));
  56. RegSetValueEx(hkey, REG_KEY_CPMRU_PERCENTAGE_FACTOR, 0, REG_DWORD, (LPBYTE)&dwCpMRUFactor, sizeof(dwCpMRUFactor));
  57. }
  58. dwSize = sizeof(CODEPAGE_MRU)*dwCpMRUNum;
  59. if (!dwSize || NULL == (_pCpMRU = (PCODEPAGE_MRU)LocalAlloc(LPTR, dwSize)))
  60. {
  61. hr = E_FAIL;
  62. dwCpMRUEnable = 0;
  63. }
  64. if (_pCpMRU && !bUseDefault)
  65. {
  66. dwType = REG_BINARY;
  67. if (ERROR_SUCCESS != RegQueryValueEx(hkey, REG_KEY_CPMRU, 0, &dwType, (LPBYTE)_pCpMRU, &dwSize))
  68. {
  69. ZeroMemory(_pCpMRU,sizeof(CODEPAGE_MRU)*dwCpMRUNum);
  70. }
  71. }
  72. RegCloseKey(hkey);
  73. }
  74. return hr;
  75. }
  76. // Update registry's cache value
  77. CCpMRU::~CCpMRU(void)
  78. {
  79. HKEY hkey;
  80. if (bCpUpdated)
  81. {
  82. if (RegOpenKeyEx(HKEY_CURRENT_USER,
  83. REGSTR_PATH_CPMRU,
  84. 0, KEY_READ|KEY_SET_VALUE, &hkey) == ERROR_SUCCESS)
  85. {
  86. DWORD dwType = REG_BINARY;
  87. DWORD dwSize = sizeof(CODEPAGE_MRU)*dwCpMRUNum;
  88. if (_pCpMRU)
  89. {
  90. RegSetValueEx(hkey, REG_KEY_CPMRU, 0, dwType, (LPBYTE)_pCpMRU, dwSize);
  91. LocalFree(_pCpMRU);
  92. _pCpMRU = NULL;
  93. }
  94. RegCloseKey(hkey);
  95. }
  96. bCpUpdated = FALSE;
  97. }
  98. }
  99. HRESULT CCpMRU::GetCpMRU(PCODEPAGE_MRU pCpMRU, UINT *puiCpNum)
  100. {
  101. DWORD dwTotalHits = 0;
  102. UINT i;
  103. HRESULT hr = E_FAIL;
  104. if (!(*puiCpNum))
  105. return E_INVALIDARG;
  106. if (!_pCpMRU)
  107. return hr;
  108. if (!dwCpMRUEnable || !dwCpMRUInitHits)
  109. {
  110. *puiCpNum = 0;
  111. return S_FALSE;
  112. }
  113. ZeroMemory(pCpMRU, sizeof(CODEPAGE_MRU)*(*puiCpNum));
  114. // Get total hits acount
  115. for (i=0; i<dwCpMRUNum; i++)
  116. {
  117. if (_pCpMRU[i].dwHistoryHits)
  118. dwTotalHits += _pCpMRU[i].dwHistoryHits;
  119. else
  120. break;
  121. }
  122. // Not enough hits count to determin the result, keep collecting
  123. if (dwTotalHits < dwCpMRUInitHits)
  124. {
  125. *puiCpNum = 0;
  126. return S_FALSE;
  127. }
  128. for (i=0; i<dwCpMRUNum && i<*puiCpNum; i++)
  129. {
  130. // Percentage is 1/MIN_CPMRU_FACTOR
  131. if (_pCpMRU[i].dwHistoryHits*dwCpMRUFactor/dwTotalHits < 1)
  132. break;
  133. }
  134. if (i != 0)
  135. {
  136. CopyMemory(pCpMRU, _pCpMRU, sizeof(CODEPAGE_MRU)*(i));
  137. *puiCpNum = i;
  138. hr = S_OK;
  139. }
  140. return hr;
  141. }
  142. // Update code page MRU
  143. void CCpMRU::UpdateCPMRU(DWORD dwEncoding)
  144. {
  145. UINT i,j;
  146. if (!_pCpMRU)
  147. return;
  148. if ((dwEncoding == CP_AUTO) ||
  149. (dwEncoding == CP_JP_AUTO) ||
  150. (dwEncoding == CP_KR_AUTO))
  151. return;
  152. if (!bCpUpdated)
  153. bCpUpdated = TRUE;
  154. // Sorted
  155. for (i=0; i< dwCpMRUNum; i++)
  156. {
  157. if (!_pCpMRU[i].dwEncoding || (_pCpMRU[i].dwEncoding == dwEncoding))
  158. break;
  159. }
  160. // If not found, replace the last encoding
  161. if (i == dwCpMRUNum)
  162. {
  163. _pCpMRU[dwCpMRUNum-1].dwEncoding = dwEncoding;
  164. _pCpMRU[dwCpMRUNum-1].dwHistoryHits = 1;
  165. }
  166. else
  167. {
  168. _pCpMRU[i].dwHistoryHits ++;
  169. // If it is an already exist encoding, change order as needed
  170. if (_pCpMRU[i].dwEncoding)
  171. {
  172. for (j=i; j>0; j--)
  173. {
  174. if (_pCpMRU[j-1].dwHistoryHits >= _pCpMRU[i].dwHistoryHits)
  175. {
  176. break;
  177. }
  178. }
  179. if (j < i)
  180. {
  181. // Simple sorting
  182. CODEPAGE_MRU tmpCPMRU = _pCpMRU[i];
  183. MoveMemory(&_pCpMRU[j+1], &_pCpMRU[j], (i-j)*sizeof(CODEPAGE_MRU));
  184. _pCpMRU[j].dwEncoding = tmpCPMRU.dwEncoding;
  185. _pCpMRU[j].dwHistoryHits = tmpCPMRU.dwHistoryHits;
  186. }
  187. }
  188. else
  189. {
  190. _pCpMRU[i].dwEncoding = dwEncoding;
  191. }
  192. }
  193. // Cached too many hits?
  194. if (_pCpMRU[0].dwHistoryHits > 0xFFFFFFF0)
  195. {
  196. // Find the smallest one
  197. // This loop will always terminate
  198. // because at worst, it will stop at i=0 (which we know
  199. // is a huge number from the "if" above).
  200. for (i=dwCpMRUNum-1; ; i--)
  201. {
  202. if (_pCpMRU[i].dwHistoryHits > 1)
  203. break;
  204. }
  205. // Decrease Cache value
  206. for (j=0; j<dwCpMRUNum && _pCpMRU[j].dwHistoryHits; j++)
  207. {
  208. // We still keep those one hit encodings if any
  209. _pCpMRU[j].dwHistoryHits /= _pCpMRU[i].dwHistoryHits;
  210. }
  211. }
  212. }
  213. UINT CheckEntity(LPSTR pIn, UINT nIn)
  214. {
  215. UINT uiRet = 0;
  216. UINT uiSearchRange;
  217. UINT i;
  218. uiSearchRange = (nIn > MAX_ENTITY_LENTH)? MAX_ENTITY_LENTH:nIn;
  219. if (*pIn == '&')
  220. {
  221. for(i=0; i<uiSearchRange; i++)
  222. {
  223. if (pIn[i] == ';')
  224. break;
  225. }
  226. if (i < uiSearchRange)
  227. {
  228. uiSearchRange = i+1;
  229. // NCR Entity
  230. if (pIn[1] == '#')
  231. {
  232. for (i=2; i<uiSearchRange-1; i++)
  233. if (!IS_DIGITA(pIn[i]))
  234. {
  235. uiSearchRange = 0;
  236. break;
  237. }
  238. }
  239. // Name Entity
  240. else
  241. {
  242. for (i=1; i<uiSearchRange-1; i++)
  243. if (!IS_CHARA(pIn[i]))
  244. {
  245. uiSearchRange = 0;
  246. break;
  247. }
  248. }
  249. }
  250. else
  251. {
  252. uiSearchRange = 0;
  253. }
  254. }
  255. else
  256. {
  257. uiSearchRange = 0;
  258. }
  259. return uiSearchRange;
  260. }
  261. void RemoveHtmlTags (LPSTR pIn, UINT *pnBytes)
  262. //
  263. // Remove HTML tags from pIn and compress whitespace, in-place.
  264. // On input *pnBytes is the input length; on return *pnBytes is
  265. // set to the resulting length.
  266. //
  267. // Name Entity and NCR Entity strings also removed
  268. {
  269. UINT nIn = *pnBytes;
  270. UINT nOut = 0;
  271. UINT nEntity = 0;
  272. LPSTR pOut = pIn;
  273. BOOL fSkippedSpace = FALSE;
  274. while ( nIn > 0 /*&& nOut + 2 < *pnBytes */) {
  275. if (*pIn == '<' && nIn > 1/* && !IsNoise (pIn[1])*/) {
  276. // Discard text until the end of this tag. The handling here
  277. // is pragmatic and imprecise; what matters is detecting mostly
  278. // contents text, not tags or comments.
  279. pIn++;
  280. nIn--;
  281. LPCSTR pSkip;
  282. DWORD nLenSkip;
  283. if ( nIn > 1 && *pIn == '%' )
  284. {
  285. pSkip = "%>"; // Skip <% to %>
  286. nLenSkip = 2;
  287. }
  288. else if ( nIn > 3 && *pIn == '!' && !LowAsciiStrCmpNIA(pIn, "!--", 3) )
  289. {
  290. pSkip = "-->"; // Skip <!-- to -->
  291. nLenSkip = 3;
  292. }
  293. else if ( nIn > 5 && !LowAsciiStrCmpNIA(pIn, "style", 5) )
  294. {
  295. pSkip = "</style>"; // Skip <style ...> to </style>
  296. nLenSkip = 8;
  297. }
  298. else if ( nIn > 6 && !LowAsciiStrCmpNIA(pIn, "script", 6) )
  299. {
  300. pSkip = "</script>"; // Skip <script ...> to </script>
  301. nLenSkip = 9;
  302. }
  303. else if ( nIn > 3 && !LowAsciiStrCmpNIA(pIn, "xml", 3) )
  304. {
  305. pSkip = "</xml>";
  306. nLenSkip = 6;
  307. }
  308. else
  309. {
  310. pSkip = ">"; // match any end tag
  311. nLenSkip = 1;
  312. }
  313. // Skip up to a case-insensitive match of pSkip / nLenSkip
  314. while ( nIn > 0 )
  315. {
  316. // Spin fast up to a match of the first char.
  317. // NOTE: the first-char compare is NOT case insensitive
  318. // because this char is known to never be alphabetic.
  319. while ( nIn > 0 && *pIn != *pSkip )
  320. {
  321. pIn++;
  322. nIn--;
  323. }
  324. if ( nIn > nLenSkip && !LowAsciiStrCmpNIA(pIn, pSkip, nLenSkip) )
  325. {
  326. pIn += nLenSkip;
  327. nIn -= nLenSkip;
  328. fSkippedSpace = TRUE;
  329. break;
  330. }
  331. if ( nIn > 0)
  332. {
  333. pIn++;
  334. nIn--;
  335. }
  336. }
  337. // *pIn is either one past '>' or at end of input
  338. }
  339. else
  340. if (IsNoise (*pIn) || (nEntity = CheckEntity(pIn, nIn)))
  341. {
  342. // Collapse whitespace -- remember it but don't copy it now
  343. fSkippedSpace = TRUE;
  344. if (nEntity)
  345. {
  346. pIn+=nEntity;
  347. nIn-=nEntity;
  348. nEntity = 0;
  349. }
  350. else
  351. {
  352. while (nIn > 0 && IsNoise (*pIn))
  353. pIn++, nIn--;
  354. }
  355. }
  356. // *pIn is non-ws char
  357. else
  358. {
  359. // Pass through all other characters
  360. // Compress all previous noise characters to a white space
  361. if (fSkippedSpace)
  362. {
  363. *pOut++ = ' ';
  364. nOut++;
  365. fSkippedSpace = FALSE;
  366. }
  367. *pOut++ = *pIn++;
  368. nIn--;
  369. nOut++;
  370. }
  371. }
  372. *pnBytes = nOut;
  373. }
  374. static unsigned char szKoi8ru[] = {0xA4, 0xA6, 0xA7, 0xB4, 0xB6, 0xB7, 0xAD, 0xAE, 0xBD, 0xBE};
  375. static unsigned char sz28592[] = {0xA1, 0xA6, /*0xAB,*/ 0xAC, 0xB1, 0xB5, 0xB6, 0xB9, /*0xBB, 0xE1*/}; // Need to fine tune this data
  376. const CPPATCH CpData[] =
  377. {
  378. {CP_KOI8R, CP_KOI8RU, ARRAYSIZE(szKoi8ru), szKoi8ru},
  379. {CP_1250, CP_ISO_8859_2, ARRAYSIZE(sz28592), sz28592},
  380. };
  381. // Distinguish similar western encodings
  382. UINT PatchCodePage(UINT uiEncoding, unsigned char *pStr, int nSize)
  383. {
  384. int i, l,m, n, iPatch=0;
  385. while (iPatch < ARRAYSIZE(CpData))
  386. {
  387. if (uiEncoding == CpData[iPatch].srcEncoding)
  388. {
  389. for (i=0; i<nSize; i++)
  390. {
  391. if (*pStr > HIGHEST_ASCII)
  392. {
  393. l = 0;
  394. m = CpData[iPatch].nSize-1;
  395. n = m / 2;
  396. while (l <= m)
  397. {
  398. if (*pStr == CpData[iPatch].pszUch[n])
  399. return CpData[iPatch].destEncoding;
  400. else
  401. {
  402. if (*pStr < CpData[iPatch].pszUch[n])
  403. {
  404. m = n-1;
  405. }
  406. else
  407. {
  408. l = n+1;
  409. }
  410. n = (l+m)/2;
  411. }
  412. }
  413. }
  414. pStr++;
  415. }
  416. }
  417. iPatch++;
  418. }
  419. return uiEncoding;
  420. }
  #if 0
  // Disabled code, kept for reference only (never compiled).
  // Returns TRUE as soon as one byte of pStr[0..nSize) appears in szKOIRU.
  // The table holds the same byte values as szKoi8ru and is scanned linearly,
  // after a quick range check against its first and last elements.
  const unsigned char szKOIRU[] = {0xA4, 0xA6, 0xA7, 0xB4, 0xB6, 0xB7, 0xAD, 0xAE, 0xBD, 0xBE};
  BOOL _IsKOI8RU(unsigned char *pStr, int nSize)
  {
      int iByte;
      int iSig;

      // Skip parameter check since this is internal
      for (iByte = 0; iByte < nSize; iByte++, pStr++)
      {
          if (*pStr < szKOIRU[0] || *pStr > szKOIRU[ARRAYSIZE(szKOIRU)-1])
              continue;

          for (iSig = 0; iSig < ARRAYSIZE(szKOIRU); iSig++)
          {
              if (*pStr == szKOIRU[iSig])
                  return TRUE;
          }
      }

      return FALSE;
  }
  #endif
  448. HRESULT WINAPI _DetectInputCodepage(DWORD dwFlag, DWORD uiPrefWinCodepage, CHAR *pSrcStr, INT *pcSrcSize, DetectEncodingInfo *lpEncoding, INT *pnScores)
  449. {
  450. HRESULT hr = S_OK;
  451. IStream *pstmTmp = NULL;
  452. BOOL bGuess = FALSE;
  453. BOOL bLCDetectSucceed = FALSE;
  454. int nBufSize = *pnScores;
  455. CHAR *_pSrcStr = pSrcStr;
  456. UINT nSrcSize;
  457. int i;
  458. BOOL bMayBeAscii = FALSE;
  459. // Check parameters
  460. if (!pSrcStr || !(*pcSrcSize) || !lpEncoding || *pnScores == 0)
  461. return E_INVALIDARG;
  462. nSrcSize = *pcSrcSize;
  463. // Zero out return buffer
  464. ZeroMemory(lpEncoding, sizeof(DetectEncodingInfo)*(*pnScores));
  465. // Simple Unicode detection
  466. if (nSrcSize >= sizeof(WCHAR))
  467. {
  468. UINT uiCp = 0;
  469. if (*((WCHAR *)pSrcStr) == 0xFEFF) // Unicode
  470. uiCp = CP_UCS_2;
  471. else if (*((WCHAR *)pSrcStr) == 0xFFFE) // Uncode Big Endian
  472. uiCp = CP_UCS_2_BE;
  473. if (uiCp)
  474. {
  475. *pnScores = 1;
  476. lpEncoding[0].nCodePage = uiCp;
  477. lpEncoding[0].nConfidence = 100;
  478. lpEncoding[0].nDocPercent = 100;
  479. lpEncoding[0].nLangID = -1;
  480. return S_OK;
  481. }
  482. }
  483. // HTML: take off HTML 'decoration'
  484. if (dwFlag & MLDETECTCP_HTML)
  485. {
  486. // Dup buffer for HTML parser
  487. if (NULL == (_pSrcStr = (char *)LocalAlloc(LPTR, nSrcSize)))
  488. return E_OUTOFMEMORY;
  489. CopyMemory(_pSrcStr, pSrcStr, nSrcSize);
  490. RemoveHtmlTags (_pSrcStr, &nSrcSize);
  491. }
  492. // if blank page/file...
  493. if (!nSrcSize)
  494. return E_FAIL;
  495. if (nSrcSize >= MIN_TEXT_SIZE)
  496. {
  497. // Initialize LCDetect
  498. if (NULL == g_pLCDetect)
  499. {
  500. EnterCriticalSection(&g_cs);
  501. if (NULL == g_pLCDetect)
  502. {
  503. LCDetect *pLC = new LCDetect ((HMODULE)g_hInst);
  504. if (pLC)
  505. {
  506. if (pLC->LoadState() == NO_ERROR)
  507. g_pLCDetect = pLC;
  508. else
  509. {
  510. delete pLC;
  511. }
  512. }
  513. }
  514. LeaveCriticalSection(&g_cs);
  515. }
  516. if (g_pLCDetect)
  517. {
  518. LCD_Detect(_pSrcStr, nSrcSize, (PLCDScore)lpEncoding, pnScores, NULL);
  519. if (*pnScores)
  520. {
  521. hr = S_OK;
  522. bLCDetectSucceed = TRUE;
  523. }
  524. }
  525. }
  526. if (!bLCDetectSucceed)
  527. {
  528. *pnScores = 0;
  529. hr = E_FAIL;
  530. }
  531. unsigned int uiCodepage = 0;
  532. LARGE_INTEGER li = {0,0};
  533. ULARGE_INTEGER uli = {0,0};
  534. if (S_OK == CreateStreamOnHGlobal(NULL, TRUE, &pstmTmp))
  535. {
  536. ULONG cb = (ULONG) nSrcSize ;
  537. if (S_OK == pstmTmp->Write(_pSrcStr,cb,&cb))
  538. {
  539. uli.LowPart = cb ;
  540. if (S_OK != pstmTmp->SetSize(uli))
  541. {
  542. hr = E_OUTOFMEMORY;
  543. goto DETECT_DONE;
  544. }
  545. }
  546. else
  547. {
  548. goto DETECT_DONE;
  549. }
  550. }
  551. else
  552. {
  553. hr = E_OUTOFMEMORY;
  554. goto DETECT_DONE;
  555. }
  556. pstmTmp->Seek(li,STREAM_SEEK_SET, NULL);
  557. switch (CceDetectInputCode(pstmTmp, grfDetectResolveAmbiguity|grfDetectUseCharMapping|grfDetectIgnoreEof, (EFam) 0, 0, &uiCodepage, &bGuess))
  558. {
  559. case cceSuccess:
  560. if (*pnScores)
  561. {
  562. // LCDETECT never detects wrong on Arabic and Russian, don't consider it as DBCS in this case
  563. // because MSEncode might misdetect Arabic and Russian as Japanese
  564. // Same goes for Korean JOHAB, MSENCODE doesn't support it at all
  565. if (((lpEncoding[0].nLangID == LANG_ARABIC )|| (lpEncoding[0].nLangID == LANG_RUSSIAN) || (lpEncoding[0].nCodePage == CP_KOR_JOHAB)) &&
  566. (lpEncoding[0].nConfidence >= MIN_ACCEPTABLE_CONFIDENCE)
  567. && (lpEncoding[0].nDocPercent >= MIN_DOCPERCENT) && !bGuess)
  568. bGuess = TRUE;
  569. for (i=0;i<*pnScores;i++)
  570. {
  571. if (lpEncoding[i].nCodePage == uiCodepage)
  572. {
  573. if ((i != 0) && !bGuess)
  574. {
  575. DetectEncodingInfo TmpEncoding;
  576. // Re-arrange lanugage list for MSEncode result
  577. MoveMemory(&TmpEncoding, &lpEncoding[0], sizeof(DetectEncodingInfo));
  578. MoveMemory(&lpEncoding[0], &lpEncoding[i], sizeof(DetectEncodingInfo));
  579. MoveMemory(&lpEncoding[i], &TmpEncoding, sizeof(DetectEncodingInfo));
  580. }
  581. // Boost confidence for double hits
  582. lpEncoding[0].nDocPercent = 100;
  583. if (lpEncoding[0].nConfidence < 100)
  584. lpEncoding[0].nConfidence = 100;
  585. break;
  586. }
  587. }
  588. if (i == *pnScores)
  589. {
  590. if (bGuess)
  591. {
  592. if (nBufSize > *pnScores)
  593. {
  594. lpEncoding[*pnScores].nCodePage = uiCodepage;
  595. lpEncoding[*pnScores].nConfidence = MIN_CONFIDENCE;
  596. lpEncoding[*pnScores].nDocPercent = MIN_DOCPERCENT;
  597. lpEncoding[*pnScores].nLangID = -1;
  598. (*pnScores)++;
  599. }
  600. }
  601. else
  602. {
  603. if (nBufSize > *pnScores)
  604. {
  605. MoveMemory(lpEncoding+1, lpEncoding, sizeof(DetectEncodingInfo) * (*pnScores));
  606. (*pnScores)++;
  607. }
  608. else
  609. {
  610. MoveMemory(lpEncoding+1, lpEncoding, sizeof(DetectEncodingInfo) * (*pnScores-1));
  611. }
  612. lpEncoding[0].nCodePage = uiCodepage;
  613. lpEncoding[0].nConfidence = 100;
  614. lpEncoding[0].nDocPercent = MIN_DOCPERCENT;
  615. lpEncoding[0].nLangID = -1;
  616. }
  617. }
  618. }
  619. else
  620. {
  621. lpEncoding[0].nCodePage = uiCodepage;
  622. if (bGuess)
  623. lpEncoding[0].nConfidence = MIN_CONFIDENCE;
  624. else
  625. lpEncoding[0].nConfidence = 100;
  626. lpEncoding[0].nDocPercent = MIN_DOCPERCENT;
  627. lpEncoding[0].nLangID = -1;
  628. (*pnScores)++;
  629. }
  630. //hr = (g_pLCDetect || (nSrcSize < MIN_TEXT_SIZE)) ? S_OK : S_FALSE;
  631. hr = (!g_pLCDetect || (bGuess && !bLCDetectSucceed )) ? S_FALSE : S_OK;
  632. break;
  633. // Currently MSEncode doesn't provide any useful information in 'cceAmbiguousInput' case.
  634. // We may update our code here if Office team enhance MSEncode for ambiguous input later.
  635. case cceAmbiguousInput:
  636. break;
  637. case cceMayBeAscii:
  638. bMayBeAscii = TRUE;
  639. if (!(*pnScores))
  640. {
  641. lpEncoding[0].nCodePage = uiCodepage;
  642. lpEncoding[0].nConfidence = MIN_CONFIDENCE;
  643. lpEncoding[0].nDocPercent = -1;
  644. lpEncoding[0].nLangID = -1;
  645. (*pnScores)++;
  646. }
  647. else
  648. {
  649. for (i=0;i<*pnScores;i++)
  650. {
  651. if (lpEncoding[i].nCodePage == uiCodepage)
  652. {
  653. break;
  654. }
  655. }
  656. if (i == *pnScores)
  657. {
  658. if(nBufSize > *pnScores) // Append MSEncode result to the language list
  659. {
  660. lpEncoding[i].nCodePage = uiCodepage;
  661. lpEncoding[i].nConfidence = -1;
  662. lpEncoding[i].nDocPercent = -1;
  663. lpEncoding[i].nLangID = -1;
  664. (*pnScores)++;
  665. }
  666. }
  667. }
  668. hr = bLCDetectSucceed ? S_OK : S_FALSE;
  669. break;
  670. // MSEncode failed
  671. default:
  672. break;
  673. }
  674. for (i=0; i<*pnScores; i++)
  675. {
  676. switch (lpEncoding[i].nCodePage) {
  677. case 850:
  678. if ((*pnScores>1) && (lpEncoding[1].nConfidence >= MIN_CONFIDENCE))
  679. {
  680. // Remove 850 from detection result if there is other detection results
  681. (*pnScores)--;
  682. if (i < *pnScores)
  683. MoveMemory(&lpEncoding[i], &lpEncoding[i+1], (*pnScores-i)*sizeof(DetectEncodingInfo));
  684. ZeroMemory(&lpEncoding[*pnScores], sizeof(DetectEncodingInfo));
  685. }
  686. else
  687. {
  688. // Replace it with 1252 if it is the only result we get
  689. lpEncoding[0].nCodePage = CP_1252;
  690. lpEncoding[0].nConfidence =
  691. lpEncoding[0].nDocPercent = 100;
  692. lpEncoding[0].nLangID = LANG_ENGLISH;
  693. }
  694. break;
  695. case CP_1250:
  696. case CP_KOI8R:
  697. lpEncoding[i].nCodePage = PatchCodePage(lpEncoding[i].nCodePage, (unsigned char *)_pSrcStr, nSrcSize);
  698. break;
  699. default:
  700. break;
  701. }
  702. }
  703. // If not a high confidence CP_1254 (Windows Turkish),
  704. // we'll check if there're better detection results, and swap results if needed
  705. if ((lpEncoding[0].nCodePage == CP_1254) &&
  706. (*pnScores>1) &&
  707. ((lpEncoding[0].nDocPercent < 90) || (lpEncoding[1].nCodePage == CP_CHN_GB) ||
  708. (lpEncoding[1].nCodePage == CP_TWN) || (lpEncoding[1].nCodePage == CP_JPN_SJ) || (lpEncoding[1].nCodePage == CP_KOR_5601)))
  709. {
  710. MoveMemory(&lpEncoding[0], &lpEncoding[1], sizeof(DetectEncodingInfo)*(*pnScores-1));
  711. lpEncoding[*pnScores-1].nCodePage = CP_1254;
  712. lpEncoding[*pnScores-1].nLangID = LANG_TURKISH;
  713. }
  714. // 852 and 1258 text only have one sure detection result
  715. if (((lpEncoding[0].nCodePage == CP_852) || (lpEncoding[0].nCodePage == CP_1258)) &&
  716. (*pnScores>1) &&
  717. (lpEncoding[1].nConfidence >= MIN_CONFIDENCE))
  718. {
  719. DetectEncodingInfo tmpDetect = {0};
  720. MoveMemory(&tmpDetect, &lpEncoding[0], sizeof(DetectEncodingInfo));
  721. MoveMemory(&lpEncoding[0], &lpEncoding[1], sizeof(DetectEncodingInfo));
  722. MoveMemory(&lpEncoding[1], &tmpDetect, sizeof(DetectEncodingInfo));
  723. }
  724. // Considering guessed value from MSENCODE is pretty accurate, we don't change S_OK to S_FALSE
  725. #if 0
  726. if ((S_OK == hr) && !bLCDetectSucceed && bGuess)
  727. {
  728. hr = S_FALSE;
  729. }
  730. #endif
  731. if (uiPrefWinCodepage && *pnScores)
  732. {
  733. if (uiPrefWinCodepage == CP_AUTO && g_pCpMRU && !IS_ENCODED_ENCODING(lpEncoding[0].nCodePage))
  734. {
  735. UINT uiCpNum = CP_AUTO_MRU_NUM;
  736. CODEPAGE_MRU CpMRU[CP_AUTO_MRU_NUM];
  737. if (S_OK == g_pCpMRU->GetCpMRU(CpMRU, &uiCpNum))
  738. {
  739. for (i = 0; i<*pnScores; i++)
  740. {
  741. for (UINT j = 0; j < uiCpNum; j++)
  742. {
  743. if (lpEncoding[i].nCodePage == CpMRU[j].dwEncoding)
  744. {
  745. uiPrefWinCodepage = CpMRU[j].dwEncoding;
  746. break;
  747. }
  748. }
  749. if (uiPrefWinCodepage != CP_AUTO)
  750. break;
  751. }
  752. // If detection result is not in MRU
  753. if (uiPrefWinCodepage == CP_AUTO)
  754. {
  755. // Don't take Unicode as perferred encoding if it is not in detection results for following reasons
  756. // 1. Unicode is usually tagged with charset or Unicode BOM
  757. // 2. Currently, we don't support Unicode detection in all detection engines
  758. if (CpMRU[0].dwEncoding != CP_UCS_2 && CpMRU[0].dwEncoding != CP_UCS_2_BE)
  759. uiPrefWinCodepage = CpMRU[0].dwEncoding;
  760. }
  761. }
  762. }
  763. // End preferred CP check if we can't get a valid one
  764. if (uiPrefWinCodepage == CP_AUTO)
  765. goto PREFERCPCHECK_DONE;
  766. for (i = 1; i<*pnScores; i++)
  767. {
  768. if (uiPrefWinCodepage == lpEncoding[i].nCodePage)
  769. {
  770. DetectEncodingInfo TmpEncoding;
  771. // Re-arrange lanugage list for prefered codepage
  772. TmpEncoding = lpEncoding[i];
  773. MoveMemory(&lpEncoding[1], &lpEncoding[0], sizeof(DetectEncodingInfo)*i);
  774. lpEncoding[0] = TmpEncoding;
  775. break;
  776. }
  777. }
  778. if ((uiPrefWinCodepage != lpEncoding[0].nCodePage) &&
  779. ((bMayBeAscii && (lpEncoding[0].nConfidence <= MIN_CONFIDENCE)) ||
  780. (hr != S_OK && nSrcSize >= MIN_TEXT_SIZE) ||
  781. (nSrcSize < MIN_TEXT_SIZE && !IS_ENCODED_ENCODING(lpEncoding[0].nCodePage))))
  782. {
  783. lpEncoding[0].nCodePage = uiPrefWinCodepage;
  784. lpEncoding[0].nConfidence = -1;
  785. lpEncoding[0].nDocPercent = -1;
  786. lpEncoding[0].nLangID = -1;
  787. *pnScores = 1;
  788. }
  789. }
  790. PREFERCPCHECK_DONE:
  791. // Assume LCDETECT won't misdetect 1252 for files over MIN_TEXT_SIZE
  792. // and MSENCODE can handle encoded text even they're below MIN_TEXT_SIZE
  793. if (((nSrcSize < MIN_TEXT_SIZE) && (bMayBeAscii || E_FAIL == hr)) ||
  794. (lpEncoding[0].nCodePage == CP_1252) ||
  795. (lpEncoding[0].nCodePage == CP_UTF_8))
  796. {
  797. UINT j;
  798. for (j=0; j < nSrcSize; j++)
  799. if (*((LPBYTE)(_pSrcStr+j)) > HIGHEST_ASCII)
  800. break;
  801. if (j == nSrcSize)
  802. {
  803. if (lpEncoding[0].nCodePage == CP_1252)
  804. {
  805. lpEncoding[0].nCodePage = CP_20127;
  806. }
  807. else
  808. {
  809. *pnScores = 1;
  810. lpEncoding[0].nCodePage = CP_20127;
  811. lpEncoding[0].nConfidence =
  812. lpEncoding[0].nDocPercent = 100;
  813. lpEncoding[0].nLangID = LANG_ENGLISH;
  814. hr = S_OK;
  815. }
  816. }
  817. }
  818. // UTF-8 doesn't really have distinctive signatures,
  819. // if text amout is small, we won't return low confidence UTF-8 detection result.
  820. if (hr == S_FALSE && IS_ENCODED_ENCODING(lpEncoding[0].nCodePage) &&
  821. !((nSrcSize < MIN_TEXT_SIZE) && (lpEncoding[0].nCodePage == CP_UTF_8)))
  822. hr = S_OK;
  823. DETECT_DONE:
  824. if ((dwFlag & MLDETECTCP_HTML) && _pSrcStr)
  825. LocalFree(_pSrcStr);
  826. if (pstmTmp)
  827. {
  828. pstmTmp->Release();
  829. }
  830. return hr ;
  831. }
  832. HRESULT WINAPI _DetectCodepageInIStream(DWORD dwFlag, DWORD uiPrefWinCodepage, IStream *pstmIn, DetectEncodingInfo *lpEncoding, INT *pnScores)
  833. {
  834. HRESULT hr= S_OK, hrWarnings=S_OK;
  835. LARGE_INTEGER libOrigin = { 0, 0 };
  836. ULARGE_INTEGER ulPos = {0, 0};
  837. LPSTR lpstrIn = NULL ;
  838. ULONG nlSrcSize ;
  839. INT nSrcUsed ;
  840. if (!pstmIn)
  841. return E_INVALIDARG ;
  842. // get size
  843. hr = pstmIn->Seek(libOrigin, STREAM_SEEK_END,&ulPos);
  844. if (S_OK != hr)
  845. hrWarnings = hr;
  846. if ( ulPos.LowPart == 0 && ulPos.HighPart == 0 )
  847. return E_INVALIDARG ;
  848. nlSrcSize = ulPos.LowPart ;
  849. // allocate a temp input buffer
  850. if ( (lpstrIn = (LPSTR) LocalAlloc(LPTR, nlSrcSize )) == NULL )
  851. {
  852. hrWarnings = E_OUTOFMEMORY ;
  853. goto exit;
  854. }
  855. // reset the pointer
  856. hr = pstmIn->Seek(libOrigin, STREAM_SEEK_SET, NULL);
  857. if (S_OK != hr)
  858. hrWarnings = hr;
  859. hr = pstmIn->Read(lpstrIn, nlSrcSize, &nlSrcSize);
  860. if (S_OK != hr)
  861. hrWarnings = hr;
  862. nSrcUsed = (INT) nlSrcSize ;
  863. hr = _DetectInputCodepage(dwFlag, uiPrefWinCodepage, lpstrIn, &nSrcUsed, lpEncoding, pnScores);
  864. exit :
  865. if (lpstrIn)
  866. {
  867. LocalFree(lpstrIn);
  868. }
  869. return (hr == S_OK) ? hrWarnings : hr;
  870. }