You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1019 lines
32 KiB
1019 lines
32 KiB
#include "private.h"
|
|
#include "detcbase.h"
|
|
#include "codepage.h"
|
|
#include "detcjpn.h"
|
|
#include "detckrn.h"
|
|
|
|
#include "fechrcnv.h"
|
|
|
|
#include "msencode.h"
|
|
#include "lcdetect.h"
|
|
#include "cpdetect.h"
|
|
|
|
// Global code page MRU cache. Starts out NULL; _DetectInputCodepage only
// consults it when it is non-NULL and the caller's preferred code page is
// CP_AUTO.
CCpMRU *g_pCpMRU = NULL;
|
|
|
|
|
|
|
|
// Get data from registry and construct cache
|
|
HRESULT CCpMRU::Init(void)
|
|
{
|
|
BOOL bRegKeyReady = TRUE;
|
|
HRESULT hr = S_OK;
|
|
HKEY hkey;
|
|
|
|
_pCpMRU = NULL;
|
|
|
|
// HKCR\\Software\\Microsoft\internet explorer\\international\\CpMRU
|
|
if (ERROR_SUCCESS != RegOpenKeyEx(HKEY_CURRENT_USER,
|
|
REGSTR_PATH_CPMRU,
|
|
0, KEY_READ|KEY_SET_VALUE, &hkey))
|
|
{
|
|
DWORD dwAction = 0;
|
|
if (ERROR_SUCCESS != RegCreateKeyEx(HKEY_CURRENT_USER,
|
|
REGSTR_PATH_CPMRU,
|
|
0, NULL, REG_OPTION_NON_VOLATILE, KEY_ALL_ACCESS, NULL, &hkey, &dwAction))
|
|
{
|
|
bRegKeyReady = FALSE;
|
|
dwCpMRUEnable = 0;
|
|
hr = E_FAIL;
|
|
}
|
|
}
|
|
|
|
if (bRegKeyReady)
|
|
{
|
|
DWORD dwType = REG_DWORD;
|
|
DWORD dwSize = sizeof(DWORD);
|
|
BOOL bUseDefault = FALSE;
|
|
|
|
if (ERROR_SUCCESS != RegQueryValueEx(hkey, REG_KEY_CPMRU_ENABLE, 0, &dwType, (LPBYTE)&dwCpMRUEnable, &dwSize))
|
|
{
|
|
dwCpMRUEnable = 1;
|
|
RegSetValueEx(hkey, REG_KEY_CPMRU_ENABLE, 0, REG_DWORD, (LPBYTE)&dwCpMRUEnable, sizeof(dwCpMRUEnable));
|
|
}
|
|
|
|
// If fail to open registry data or find unreasonable cache parameters, use default settings
|
|
if ((ERROR_SUCCESS != RegQueryValueEx(hkey, REG_KEY_CPMRU_NUM, 0, &dwType, (LPBYTE)&dwCpMRUNum, &dwSize)) ||
|
|
(ERROR_SUCCESS != RegQueryValueEx(hkey, REG_KEY_CPMRU_INIT_HITS, 0, &dwType, (LPBYTE)&dwCpMRUInitHits, &dwSize)) ||
|
|
(ERROR_SUCCESS != RegQueryValueEx(hkey, REG_KEY_CPMRU_PERCENTAGE_FACTOR, 0, &dwType, (LPBYTE)&dwCpMRUFactor, &dwSize)) ||
|
|
(dwCpMRUNum > MAX_CPMRU_NUM) || !dwCpMRUFactor || !dwCpMRUInitHits)
|
|
{
|
|
dwCpMRUNum = DEFAULT_CPMRU_NUM;
|
|
dwCpMRUInitHits = DEFAULT_CPMRU_INIT_HITS;
|
|
dwCpMRUFactor = DEFAULT_CPMRU_FACTOR;
|
|
bUseDefault = TRUE;
|
|
|
|
// Store default value in registry
|
|
RegSetValueEx(hkey, REG_KEY_CPMRU_NUM, 0, REG_DWORD, (LPBYTE)&dwCpMRUNum, sizeof(dwCpMRUNum));
|
|
RegSetValueEx(hkey, REG_KEY_CPMRU_INIT_HITS, 0, REG_DWORD, (LPBYTE)&dwCpMRUInitHits, sizeof(dwCpMRUInitHits));
|
|
RegSetValueEx(hkey, REG_KEY_CPMRU_PERCENTAGE_FACTOR, 0, REG_DWORD, (LPBYTE)&dwCpMRUFactor, sizeof(dwCpMRUFactor));
|
|
}
|
|
|
|
dwSize = sizeof(CODEPAGE_MRU)*dwCpMRUNum;
|
|
|
|
if (!dwSize || NULL == (_pCpMRU = (PCODEPAGE_MRU)LocalAlloc(LPTR, dwSize)))
|
|
{
|
|
hr = E_FAIL;
|
|
dwCpMRUEnable = 0;
|
|
}
|
|
|
|
if (_pCpMRU && !bUseDefault)
|
|
{
|
|
dwType = REG_BINARY;
|
|
|
|
if (ERROR_SUCCESS != RegQueryValueEx(hkey, REG_KEY_CPMRU, 0, &dwType, (LPBYTE)_pCpMRU, &dwSize))
|
|
{
|
|
ZeroMemory(_pCpMRU,sizeof(CODEPAGE_MRU)*dwCpMRUNum);
|
|
}
|
|
}
|
|
RegCloseKey(hkey);
|
|
}
|
|
|
|
return hr;
|
|
}
|
|
|
|
// Update registry's cache value
|
|
CCpMRU::~CCpMRU(void)
|
|
{
|
|
HKEY hkey;
|
|
|
|
if (bCpUpdated)
|
|
{
|
|
|
|
if (RegOpenKeyEx(HKEY_CURRENT_USER,
|
|
REGSTR_PATH_CPMRU,
|
|
0, KEY_READ|KEY_SET_VALUE, &hkey) == ERROR_SUCCESS)
|
|
{
|
|
DWORD dwType = REG_BINARY;
|
|
DWORD dwSize = sizeof(CODEPAGE_MRU)*dwCpMRUNum;
|
|
if (_pCpMRU)
|
|
{
|
|
RegSetValueEx(hkey, REG_KEY_CPMRU, 0, dwType, (LPBYTE)_pCpMRU, dwSize);
|
|
LocalFree(_pCpMRU);
|
|
_pCpMRU = NULL;
|
|
}
|
|
|
|
RegCloseKey(hkey);
|
|
}
|
|
bCpUpdated = FALSE;
|
|
|
|
}
|
|
}
|
|
|
|
// Copy the most frequently used code pages out of the cache.
//
// Parameters:
//   pCpMRU   - caller buffer receiving up to *puiCpNum entries.
//   puiCpNum - in: capacity of pCpMRU in entries (must be non-zero);
//              out: number of entries copied on S_OK, 0 on S_FALSE.
//
// Returns:
//   E_INVALIDARG - puiCpNum is NULL or *puiCpNum is 0, or pCpMRU is NULL when
//                  entries would have to be written to it.
//   E_FAIL       - no cache table exists, or even the top entry is below the
//                  percentage threshold (*puiCpNum is left untouched).
//   S_FALSE      - cache disabled or not enough hits collected yet.
//   S_OK         - leading entries that pass the threshold were copied.
HRESULT CCpMRU::GetCpMRU(PCODEPAGE_MRU pCpMRU, UINT *puiCpNum)
{
    DWORD   dwTotalHits = 0;
    UINT    i;
    HRESULT hr = E_FAIL;

    // Guard the pointer itself before looking at the count it points to
    if (!puiCpNum || !(*puiCpNum))
        return E_INVALIDARG;

    if (!_pCpMRU)
        return hr;

    if (!dwCpMRUEnable || !dwCpMRUInitHits)
    {
        *puiCpNum = 0;
        return S_FALSE;
    }

    // Checked only here so the earlier early-outs keep their original results
    if (!pCpMRU)
        return E_INVALIDARG;

    ZeroMemory(pCpMRU, sizeof(CODEPAGE_MRU)*(*puiCpNum));

    // Get total hits count; used slots are contiguous from the front, so the
    // first slot with zero hits ends the scan
    for (i = 0; i < dwCpMRUNum; i++)
    {
        if (_pCpMRU[i].dwHistoryHits)
            dwTotalHits += _pCpMRU[i].dwHistoryHits;
        else
            break;
    }

    // Not enough hits to determine the result, keep collecting
    if (dwTotalHits < dwCpMRUInitHits)
    {
        *puiCpNum = 0;
        return S_FALSE;
    }

    // Count the leading entries whose share of all hits is at least
    // 1/dwCpMRUFactor, bounded by the caller's capacity
    for (i = 0; i < dwCpMRUNum && i < *puiCpNum; i++)
    {
        if (_pCpMRU[i].dwHistoryHits*dwCpMRUFactor/dwTotalHits < 1)
            break;
    }

    if (i != 0)
    {
        CopyMemory(pCpMRU, _pCpMRU, sizeof(CODEPAGE_MRU)*(i));
        *puiCpNum = i;
        hr = S_OK;
    }

    return hr;
}
|
|
|
|
// Update code page MRU
|
|
void CCpMRU::UpdateCPMRU(DWORD dwEncoding)
|
|
{
|
|
UINT i,j;
|
|
|
|
if (!_pCpMRU)
|
|
return;
|
|
|
|
if ((dwEncoding == CP_AUTO) ||
|
|
(dwEncoding == CP_JP_AUTO) ||
|
|
(dwEncoding == CP_KR_AUTO))
|
|
return;
|
|
|
|
if (!bCpUpdated)
|
|
bCpUpdated = TRUE;
|
|
|
|
|
|
// Sorted
|
|
for (i=0; i< dwCpMRUNum; i++)
|
|
{
|
|
if (!_pCpMRU[i].dwEncoding || (_pCpMRU[i].dwEncoding == dwEncoding))
|
|
break;
|
|
}
|
|
|
|
// If not found, replace the last encoding
|
|
if (i == dwCpMRUNum)
|
|
{
|
|
_pCpMRU[dwCpMRUNum-1].dwEncoding = dwEncoding;
|
|
_pCpMRU[dwCpMRUNum-1].dwHistoryHits = 1;
|
|
}
|
|
else
|
|
{
|
|
_pCpMRU[i].dwHistoryHits ++;
|
|
|
|
// If it is an already exist encoding, change order as needed
|
|
if (_pCpMRU[i].dwEncoding)
|
|
{
|
|
for (j=i; j>0; j--)
|
|
{
|
|
if (_pCpMRU[j-1].dwHistoryHits >= _pCpMRU[i].dwHistoryHits)
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
if (j < i)
|
|
{
|
|
// Simple sorting
|
|
CODEPAGE_MRU tmpCPMRU = _pCpMRU[i];
|
|
|
|
MoveMemory(&_pCpMRU[j+1], &_pCpMRU[j], (i-j)*sizeof(CODEPAGE_MRU));
|
|
_pCpMRU[j].dwEncoding = tmpCPMRU.dwEncoding;
|
|
_pCpMRU[j].dwHistoryHits = tmpCPMRU.dwHistoryHits;
|
|
|
|
}
|
|
|
|
}
|
|
else
|
|
{
|
|
_pCpMRU[i].dwEncoding = dwEncoding;
|
|
}
|
|
|
|
}
|
|
|
|
// Cached too many hits?
|
|
if (_pCpMRU[0].dwHistoryHits > 0xFFFFFFF0)
|
|
{
|
|
// Find the smallest one
|
|
// This loop will always terminate
|
|
// because at worst, it will stop at i=0 (which we know
|
|
// is a huge number from the "if" above).
|
|
for (i=dwCpMRUNum-1; ; i--)
|
|
{
|
|
if (_pCpMRU[i].dwHistoryHits > 1)
|
|
break;
|
|
}
|
|
|
|
// Decrease Cache value
|
|
for (j=0; j<dwCpMRUNum && _pCpMRU[j].dwHistoryHits; j++)
|
|
{
|
|
// We still keep those one hit encodings if any
|
|
_pCpMRU[j].dwHistoryHits /= _pCpMRU[i].dwHistoryHits;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// Test whether pIn starts with an HTML entity ("&name;" or "&#digits;").
//
// Only the first min(nIn, MAX_ENTITY_LENTH) bytes are searched for the
// terminating ';'. Returns the entity length in bytes including '&' and ';',
// or 0 when pIn does not start with a well-formed entity.
UINT CheckEntity(LPSTR pIn, UINT nIn)
{
    if (*pIn != '&')
        return 0;

    const UINT cchLimit = (nIn > MAX_ENTITY_LENTH) ? MAX_ENTITY_LENTH : nIn;

    // Find the terminating semicolon inside the search window
    UINT iSemicolon = 0;
    while (iSemicolon < cchLimit && pIn[iSemicolon] != ';')
        iSemicolon++;

    if (iSemicolon >= cchLimit)
        return 0;

    if (pIn[1] == '#')
    {
        // NCR entity: everything between "&#" and ';' must be a digit
        for (UINT k = 2; k < iSemicolon; k++)
        {
            if (!IS_DIGITA(pIn[k]))
                return 0;
        }
    }
    else
    {
        // Name entity: everything between '&' and ';' must be a letter
        for (UINT k = 1; k < iSemicolon; k++)
        {
            if (!IS_CHARA(pIn[k]))
                return 0;
        }
    }

    return iSemicolon + 1;
}
|
|
|
|
//
// Strip HTML markup from pIn in place and collapse whitespace.
//
// On input *pnBytes is the length of pIn in bytes; on return it is the
// length of the stripped text. Tags, comments, <% %> blocks, <style>,
// <script> and <xml> sections are dropped; named and numeric character
// entities are removed too. Every dropped run is replaced by a single space
// in front of the next copied character.
//
void RemoveHtmlTags (LPSTR pIn, UINT *pnBytes)
{
    LPSTR pWrite        = pIn;
    UINT  cbLeft        = *pnBytes;
    UINT  cbWritten     = 0;
    UINT  cbEntity      = 0;
    BOOL  fPendingSpace = FALSE;

    while (cbLeft > 0)
    {
        if (cbLeft > 1 && *pIn == '<')
        {
            // Discard text until the end of this tag. The handling here
            // is pragmatic and imprecise; what matters is detecting mostly
            // content text, not tags or comments.
            pIn++;
            cbLeft--;

            LPCSTR pszEnd;
            DWORD  cchEnd;

            if (cbLeft > 1 && *pIn == '%')
            {
                pszEnd = "%>";          // Skip <% to %>
                cchEnd = 2;
            }
            else if (cbLeft > 3 && *pIn == '!' && !LowAsciiStrCmpNIA(pIn, "!--", 3))
            {
                pszEnd = "-->";         // Skip <!-- to -->
                cchEnd = 3;
            }
            else if (cbLeft > 5 && !LowAsciiStrCmpNIA(pIn, "style", 5))
            {
                pszEnd = "</style>";    // Skip <style ...> to </style>
                cchEnd = 8;
            }
            else if (cbLeft > 6 && !LowAsciiStrCmpNIA(pIn, "script", 6))
            {
                pszEnd = "</script>";   // Skip <script ...> to </script>
                cchEnd = 9;
            }
            else if (cbLeft > 3 && !LowAsciiStrCmpNIA(pIn, "xml", 3))
            {
                pszEnd = "</xml>";      // Skip <xml ...> to </xml>
                cchEnd = 6;
            }
            else
            {
                pszEnd = ">";           // Match any end tag
                cchEnd = 1;
            }

            // Skip up to a case-insensitive match of pszEnd / cchEnd
            while (cbLeft > 0)
            {
                // Spin fast up to a match of the first char.
                // NOTE: the first-char compare is NOT case insensitive
                // because this char is known to never be alphabetic.
                while (cbLeft > 0 && *pIn != *pszEnd)
                {
                    pIn++;
                    cbLeft--;
                }

                if (cbLeft > cchEnd && !LowAsciiStrCmpNIA(pIn, pszEnd, cchEnd))
                {
                    pIn    += cchEnd;
                    cbLeft -= cchEnd;
                    fPendingSpace = TRUE;
                    break;
                }

                // False alarm (or terminator flush with the end of input):
                // step over the candidate character and keep scanning
                if (cbLeft > 0)
                {
                    pIn++;
                    cbLeft--;
                }
            }

            // pIn is either one past the terminator or at end of input
        }
        else if (IsNoise (*pIn))
        {
            // Collapse whitespace -- remember it but don't copy it now
            fPendingSpace = TRUE;
            while (cbLeft > 0 && IsNoise (*pIn))
            {
                pIn++;
                cbLeft--;
            }
        }
        else if ((cbEntity = CheckEntity(pIn, cbLeft)) != 0)
        {
            // Character entity: drop it and treat it like whitespace
            fPendingSpace = TRUE;
            pIn    += cbEntity;
            cbLeft -= cbEntity;
            cbEntity = 0;
        }
        else
        {
            // Pass through all other characters, compressing all previous
            // noise characters to a single white space
            if (fPendingSpace)
            {
                *pWrite++ = ' ';
                cbWritten++;
                fPendingSpace = FALSE;
            }

            *pWrite++ = *pIn++;
            cbLeft--;
            cbWritten++;
        }
    }

    *pnBytes = cbWritten;
}
|
|
|
|
// Byte values used to tell similar single-byte encodings apart.
// Each table is binary-searched by its consumer, so the values MUST be kept
// in strictly ascending order; an out-of-order value is silently never found.
static unsigned char szKoi8ru[] = {0xA4, 0xA6, 0xA7, 0xAD, 0xAE, 0xB4, 0xB6, 0xB7, 0xBD, 0xBE};
static unsigned char sz28592[] = {0xA1, 0xA6, /*0xAB,*/ 0xAC, 0xB1, 0xB5, 0xB6, 0xB9, /*0xBB, 0xE1*/}; // Need to fine tune this data
|
|
|
|
const CPPATCH CpData[] =
|
|
{
|
|
{CP_KOI8R, CP_KOI8RU, ARRAYSIZE(szKoi8ru), szKoi8ru},
|
|
{CP_1250, CP_ISO_8859_2, ARRAYSIZE(sz28592), sz28592},
|
|
};
|
|
|
|
|
|
// Distinguish similar western encodings
|
|
UINT PatchCodePage(UINT uiEncoding, unsigned char *pStr, int nSize)
|
|
{
|
|
int i, l,m, n, iPatch=0;
|
|
|
|
while (iPatch < ARRAYSIZE(CpData))
|
|
{
|
|
if (uiEncoding == CpData[iPatch].srcEncoding)
|
|
{
|
|
for (i=0; i<nSize; i++)
|
|
{
|
|
if (*pStr > HIGHEST_ASCII)
|
|
{
|
|
l = 0;
|
|
m = CpData[iPatch].nSize-1;
|
|
n = m / 2;
|
|
while (l <= m)
|
|
{
|
|
if (*pStr == CpData[iPatch].pszUch[n])
|
|
return CpData[iPatch].destEncoding;
|
|
else
|
|
{
|
|
if (*pStr < CpData[iPatch].pszUch[n])
|
|
{
|
|
m = n-1;
|
|
}
|
|
else
|
|
{
|
|
l = n+1;
|
|
}
|
|
n = (l+m)/2;
|
|
}
|
|
}
|
|
}
|
|
pStr++;
|
|
}
|
|
}
|
|
iPatch++;
|
|
}
|
|
|
|
return uiEncoding;
|
|
}
|
|
|
|
|
|
|
|
#if 0
|
|
|
|
const unsigned char szKOIRU[] = {0xA4, 0xA6, 0xA7, 0xB4, 0xB6, 0xB7, 0xAD, 0xAE, 0xBD, 0xBE};
|
|
|
|
BOOL _IsKOI8RU(unsigned char *pStr, int nSize)
|
|
{
|
|
int i,j;
|
|
BOOL bRet = FALSE;
|
|
|
|
// Skip parameter check since this is internal
|
|
for (i=0; i<nSize; i++)
|
|
{
|
|
if (*pStr >= szKOIRU[0] && *pStr <= szKOIRU[ARRAYSIZE(szKOIRU)-1])
|
|
{
|
|
for (j=0; j<ARRAYSIZE(szKOIRU); j++)
|
|
{
|
|
if (*pStr == szKOIRU[j])
|
|
{
|
|
bRet = TRUE;
|
|
break;
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
if (bRet)
|
|
break;
|
|
|
|
pStr++;
|
|
}
|
|
|
|
return bRet;
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
// Detect the code page(s) of a text buffer.
//
// Combines three sources: a Unicode BOM check, the LCDetect language/code page
// detector (only for text of at least MIN_TEXT_SIZE bytes) and MSEncode
// (CceDetectInputCode), then post-processes the merged list (850/1250/KOI8-R
// patches, Turkish/852/1258 re-ordering, preferred code page, pure ASCII).
//
// Parameters:
//   dwFlag            - MLDETECTCP_HTML makes the function work on a private
//                       copy of the text with HTML markup stripped.
//   uiPrefWinCodepage - preferred code page, 0 for none; CP_AUTO means "take
//                       it from the code page MRU" (g_pCpMRU) when possible.
//   pSrcStr           - text to analyse.
//   pcSrcSize         - in: size of pSrcStr in bytes (must be > 0). It is not
//                       written by this function.
//   lpEncoding        - out: array receiving the detection results.
//   pnScores          - in: capacity of lpEncoding in entries (must be > 0);
//                       out: number of entries filled in.
//
// Returns S_OK for a confident result, S_FALSE for a guess, E_INVALIDARG for
// bad parameters, E_OUTOFMEMORY on allocation/stream failure and E_FAIL when
// nothing could be detected (including text that is empty after HTML
// stripping).
HRESULT WINAPI _DetectInputCodepage(DWORD dwFlag, DWORD uiPrefWinCodepage, CHAR *pSrcStr, INT *pcSrcSize, DetectEncodingInfo *lpEncoding, INT *pnScores)
{
    HRESULT hr = S_OK;
    IStream *pstmTmp = NULL;
    BOOL bGuess = FALSE;
    BOOL bLCDetectSucceed = FALSE;
    int nBufSize;
    CHAR *_pSrcStr = pSrcStr;
    UINT nSrcSize;
    int i;
    BOOL bMayBeAscii = FALSE;

    // Check parameters. The pointers are validated before anything is read
    // through them, and non-positive sizes are rejected because they would be
    // turned into huge unsigned lengths below.
    if (!pSrcStr || !pcSrcSize || !lpEncoding || !pnScores ||
        (*pcSrcSize <= 0) || (*pnScores <= 0))
        return E_INVALIDARG;

    nBufSize = *pnScores;
    nSrcSize = *pcSrcSize;

    // Zero out return buffer
    ZeroMemory(lpEncoding, sizeof(DetectEncodingInfo)*(*pnScores));

    // Simple Unicode detection
    if (nSrcSize >= sizeof(WCHAR))
    {
        UINT uiCp = 0;

        if (*((WCHAR *)pSrcStr) == 0xFEFF)          // Unicode
            uiCp = CP_UCS_2;
        else if (*((WCHAR *)pSrcStr) == 0xFFFE)     // Unicode Big Endian
            uiCp = CP_UCS_2_BE;

        if (uiCp)
        {
            *pnScores = 1;
            lpEncoding[0].nCodePage = uiCp;
            lpEncoding[0].nConfidence = 100;
            lpEncoding[0].nDocPercent = 100;
            lpEncoding[0].nLangID = -1;
            return S_OK;
        }
    }

    // HTML: take off HTML 'decoration'
    if (dwFlag & MLDETECTCP_HTML)
    {
        // Dup buffer for HTML parser
        if (NULL == (_pSrcStr = (char *)LocalAlloc(LPTR, nSrcSize)))
            return E_OUTOFMEMORY;
        CopyMemory(_pSrcStr, pSrcStr, nSrcSize);
        RemoveHtmlTags (_pSrcStr, &nSrcSize);
    }

    // if blank page/file...
    if (!nSrcSize)
    {
        // Only reachable in HTML mode (the input size was validated above);
        // release the scratch copy instead of leaking it.
        if (dwFlag & MLDETECTCP_HTML)
            LocalFree(_pSrcStr);
        return E_FAIL;
    }

    if (nSrcSize >= MIN_TEXT_SIZE)
    {
        // Initialize LCDetect on first use; the pointer is re-checked after
        // entering g_cs
        if (NULL == g_pLCDetect)
        {
            EnterCriticalSection(&g_cs);
            if (NULL == g_pLCDetect)
            {
                LCDetect *pLC = new LCDetect ((HMODULE)g_hInst);
                if (pLC)
                {
                    if (pLC->LoadState() == NO_ERROR)
                        g_pLCDetect = pLC;
                    else
                    {
                        delete pLC;
                    }
                }
            }
            LeaveCriticalSection(&g_cs);
        }

        if (g_pLCDetect)
        {
            LCD_Detect(_pSrcStr, nSrcSize, (PLCDScore)lpEncoding, pnScores, NULL);
            if (*pnScores)
            {
                hr = S_OK;
                bLCDetectSucceed = TRUE;
            }
        }
    }

    if (!bLCDetectSucceed)
    {
        *pnScores = 0;
        hr = E_FAIL;
    }

    unsigned int uiCodepage = 0;
    LARGE_INTEGER li = {0,0};
    ULARGE_INTEGER uli = {0,0};

    // MSEncode works on a stream: copy the text into a temporary one
    if (S_OK == CreateStreamOnHGlobal(NULL, TRUE, &pstmTmp))
    {
        ULONG cb = (ULONG) nSrcSize ;
        if (S_OK == pstmTmp->Write(_pSrcStr,cb,&cb))
        {
            uli.LowPart = cb ;
            if (S_OK != pstmTmp->SetSize(uli))
            {
                hr = E_OUTOFMEMORY;
                goto DETECT_DONE;
            }
        }
        else
        {
            // Keep whatever LCDetect produced (hr is left as it is)
            goto DETECT_DONE;
        }
    }
    else
    {
        hr = E_OUTOFMEMORY;
        goto DETECT_DONE;
    }

    pstmTmp->Seek(li,STREAM_SEEK_SET, NULL);

    switch (CceDetectInputCode(pstmTmp, grfDetectResolveAmbiguity|grfDetectUseCharMapping|grfDetectIgnoreEof, (EFam) 0, 0, &uiCodepage, &bGuess))
    {
        case cceSuccess:
            if (*pnScores)
            {
                // LCDETECT never detects wrong on Arabic and Russian, don't consider it as DBCS in this case
                // because MSEncode might misdetect Arabic and Russian as Japanese
                // Same goes for Korean JOHAB, MSENCODE doesn't support it at all
                if (((lpEncoding[0].nLangID == LANG_ARABIC )|| (lpEncoding[0].nLangID == LANG_RUSSIAN) || (lpEncoding[0].nCodePage == CP_KOR_JOHAB)) &&
                    (lpEncoding[0].nConfidence >= MIN_ACCEPTABLE_CONFIDENCE)
                    && (lpEncoding[0].nDocPercent >= MIN_DOCPERCENT) && !bGuess)
                    bGuess = TRUE;

                for (i=0;i<*pnScores;i++)
                {
                    if (lpEncoding[i].nCodePage == uiCodepage)
                    {
                        if ((i != 0) && !bGuess)
                        {
                            DetectEncodingInfo TmpEncoding;
                            // Re-arrange language list for MSEncode result
                            MoveMemory(&TmpEncoding, &lpEncoding[0], sizeof(DetectEncodingInfo));
                            MoveMemory(&lpEncoding[0], &lpEncoding[i], sizeof(DetectEncodingInfo));
                            MoveMemory(&lpEncoding[i], &TmpEncoding, sizeof(DetectEncodingInfo));
                        }
                        // Boost confidence for double hits
                        lpEncoding[0].nDocPercent = 100;
                        if (lpEncoding[0].nConfidence < 100)
                            lpEncoding[0].nConfidence = 100;
                        break;
                    }
                }

                // MSEncode's answer is not in the LCDetect list
                if (i == *pnScores)
                {
                    if (bGuess)
                    {
                        // A guess is only appended, and only if there is room
                        if (nBufSize > *pnScores)
                        {
                            lpEncoding[*pnScores].nCodePage = uiCodepage;
                            lpEncoding[*pnScores].nConfidence = MIN_CONFIDENCE;
                            lpEncoding[*pnScores].nDocPercent = MIN_DOCPERCENT;
                            lpEncoding[*pnScores].nLangID = -1;
                            (*pnScores)++;
                        }
                    }
                    else
                    {
                        // A sure result goes to the front; the last entry is
                        // dropped when the buffer is already full
                        if (nBufSize > *pnScores)
                        {
                            MoveMemory(lpEncoding+1, lpEncoding, sizeof(DetectEncodingInfo) * (*pnScores));
                            (*pnScores)++;
                        }
                        else
                        {
                            MoveMemory(lpEncoding+1, lpEncoding, sizeof(DetectEncodingInfo) * (*pnScores-1));
                        }
                        lpEncoding[0].nCodePage = uiCodepage;
                        lpEncoding[0].nConfidence = 100;
                        lpEncoding[0].nDocPercent = MIN_DOCPERCENT;
                        lpEncoding[0].nLangID = -1;
                    }
                }
            }
            else
            {
                // LCDetect produced nothing: MSEncode's answer is the only result
                lpEncoding[0].nCodePage = uiCodepage;
                if (bGuess)
                    lpEncoding[0].nConfidence = MIN_CONFIDENCE;
                else
                    lpEncoding[0].nConfidence = 100;
                lpEncoding[0].nDocPercent = MIN_DOCPERCENT;
                lpEncoding[0].nLangID = -1;
                (*pnScores)++;
            }

            //hr = (g_pLCDetect || (nSrcSize < MIN_TEXT_SIZE)) ? S_OK : S_FALSE;
            hr = (!g_pLCDetect || (bGuess && !bLCDetectSucceed )) ? S_FALSE : S_OK;
            break;

        // Currently MSEncode doesn't provide any useful information in 'cceAmbiguousInput' case.
        // We may update our code here if Office team enhance MSEncode for ambiguous input later.
        case cceAmbiguousInput:
            break;

        case cceMayBeAscii:
            bMayBeAscii = TRUE;
            if (!(*pnScores))
            {
                lpEncoding[0].nCodePage = uiCodepage;
                lpEncoding[0].nConfidence = MIN_CONFIDENCE;
                lpEncoding[0].nDocPercent = -1;
                lpEncoding[0].nLangID = -1;
                (*pnScores)++;
            }
            else
            {
                for (i=0;i<*pnScores;i++)
                {
                    if (lpEncoding[i].nCodePage == uiCodepage)
                    {
                        break;
                    }
                }

                if (i == *pnScores)
                {
                    if(nBufSize > *pnScores) // Append MSEncode result to the language list
                    {
                        lpEncoding[i].nCodePage = uiCodepage;
                        lpEncoding[i].nConfidence = -1;
                        lpEncoding[i].nDocPercent = -1;
                        lpEncoding[i].nLangID = -1;
                        (*pnScores)++;
                    }
                }
            }
            hr = bLCDetectSucceed ? S_OK : S_FALSE;
            break;

        // MSEncode failed
        default:
            break;
    }

    // Per-code-page fix-ups of the merged result list
    for (i=0; i<*pnScores; i++)
    {
        switch (lpEncoding[i].nCodePage) {

            case 850:
                if ((*pnScores>1) && (lpEncoding[1].nConfidence >= MIN_CONFIDENCE))
                {
                    // Remove 850 from detection result if there is other detection results
                    // NOTE(review): after the removal the loop still advances i,
                    // so the entry shifted into slot i is not examined - confirm
                    // this is intended.
                    (*pnScores)--;
                    if (i < *pnScores)
                        MoveMemory(&lpEncoding[i], &lpEncoding[i+1], (*pnScores-i)*sizeof(DetectEncodingInfo));
                    ZeroMemory(&lpEncoding[*pnScores], sizeof(DetectEncodingInfo));
                }
                else
                {
                    // Replace it with 1252 if it is the only result we get
                    // NOTE(review): writes slot 0 rather than slot i.
                    lpEncoding[0].nCodePage = CP_1252;
                    lpEncoding[0].nConfidence =
                    lpEncoding[0].nDocPercent = 100;
                    lpEncoding[0].nLangID = LANG_ENGLISH;
                }
                break;

            case CP_1250:
            case CP_KOI8R:
                lpEncoding[i].nCodePage = PatchCodePage(lpEncoding[i].nCodePage, (unsigned char *)_pSrcStr, nSrcSize);
                break;

            default:
                break;
        }
    }

    // If not a high confidence CP_1254 (Windows Turkish),
    // we'll check if there're better detection results, and swap results if needed
    if ((lpEncoding[0].nCodePage == CP_1254) &&
        (*pnScores>1) &&
        ((lpEncoding[0].nDocPercent < 90) || (lpEncoding[1].nCodePage == CP_CHN_GB) ||
         (lpEncoding[1].nCodePage == CP_TWN) || (lpEncoding[1].nCodePage == CP_JPN_SJ) || (lpEncoding[1].nCodePage == CP_KOR_5601)))
    {
        // Shift everything up by one and re-create Turkish as the last entry
        MoveMemory(&lpEncoding[0], &lpEncoding[1], sizeof(DetectEncodingInfo)*(*pnScores-1));
        lpEncoding[*pnScores-1].nCodePage = CP_1254;
        lpEncoding[*pnScores-1].nLangID = LANG_TURKISH;
    }

    // 852 and 1258 text only have one sure detection result
    if (((lpEncoding[0].nCodePage == CP_852) || (lpEncoding[0].nCodePage == CP_1258)) &&
        (*pnScores>1) &&
        (lpEncoding[1].nConfidence >= MIN_CONFIDENCE))
    {
        DetectEncodingInfo tmpDetect = {0};
        MoveMemory(&tmpDetect, &lpEncoding[0], sizeof(DetectEncodingInfo));
        MoveMemory(&lpEncoding[0], &lpEncoding[1], sizeof(DetectEncodingInfo));
        MoveMemory(&lpEncoding[1], &tmpDetect, sizeof(DetectEncodingInfo));
    }

// Considering guessed value from MSENCODE is pretty accurate, we don't change S_OK to S_FALSE
#if 0
    if ((S_OK == hr) && !bLCDetectSucceed && bGuess)
    {
        hr = S_FALSE;
    }
#endif

    if (uiPrefWinCodepage && *pnScores)
    {
        if (uiPrefWinCodepage == CP_AUTO && g_pCpMRU && !IS_ENCODED_ENCODING(lpEncoding[0].nCodePage))
        {
            UINT uiCpNum = CP_AUTO_MRU_NUM;
            CODEPAGE_MRU CpMRU[CP_AUTO_MRU_NUM];

            if (S_OK == g_pCpMRU->GetCpMRU(CpMRU, &uiCpNum))
            {
                // Prefer the first detection result that is also in the MRU
                for (i = 0; i<*pnScores; i++)
                {
                    for (UINT j = 0; j < uiCpNum; j++)
                    {
                        if (lpEncoding[i].nCodePage == CpMRU[j].dwEncoding)
                        {
                            uiPrefWinCodepage = CpMRU[j].dwEncoding;
                            break;
                        }
                    }
                    if (uiPrefWinCodepage != CP_AUTO)
                        break;
                }

                // If detection result is not in MRU
                if (uiPrefWinCodepage == CP_AUTO)
                {
                    // Don't take Unicode as preferred encoding if it is not in detection results for following reasons
                    // 1. Unicode is usually tagged with charset or Unicode BOM
                    // 2. Currently, we don't support Unicode detection in all detection engines
                    if (CpMRU[0].dwEncoding != CP_UCS_2 && CpMRU[0].dwEncoding != CP_UCS_2_BE)
                        uiPrefWinCodepage = CpMRU[0].dwEncoding;
                }
            }
        }

        // End preferred CP check if we can't get a valid one
        if (uiPrefWinCodepage == CP_AUTO)
            goto PREFERCPCHECK_DONE;

        for (i = 1; i<*pnScores; i++)
        {
            if (uiPrefWinCodepage == lpEncoding[i].nCodePage)
            {
                DetectEncodingInfo TmpEncoding;
                // Re-arrange language list for preferred codepage
                TmpEncoding = lpEncoding[i];
                MoveMemory(&lpEncoding[1], &lpEncoding[0], sizeof(DetectEncodingInfo)*i);
                lpEncoding[0] = TmpEncoding;
                break;
            }
        }

        // The preferred code page was not detected at all: force it when the
        // detection itself is weak
        if ((uiPrefWinCodepage != lpEncoding[0].nCodePage) &&
            ((bMayBeAscii && (lpEncoding[0].nConfidence <= MIN_CONFIDENCE)) ||
             (hr != S_OK && nSrcSize >= MIN_TEXT_SIZE) ||
             (nSrcSize < MIN_TEXT_SIZE && !IS_ENCODED_ENCODING(lpEncoding[0].nCodePage))))
        {
            lpEncoding[0].nCodePage = uiPrefWinCodepage;
            lpEncoding[0].nConfidence = -1;
            lpEncoding[0].nDocPercent = -1;
            lpEncoding[0].nLangID = -1;
            *pnScores = 1;
        }
    }

PREFERCPCHECK_DONE:

    // Assume LCDETECT won't misdetect 1252 for files over MIN_TEXT_SIZE
    // and MSENCODE can handle encoded text even they're below MIN_TEXT_SIZE
    if (((nSrcSize < MIN_TEXT_SIZE) && (bMayBeAscii || E_FAIL == hr)) ||
        (lpEncoding[0].nCodePage == CP_1252) ||
        (lpEncoding[0].nCodePage == CP_UTF_8))
    {
        UINT j;

        // Look for the first byte above the ASCII range
        for (j=0; j < nSrcSize; j++)
            if (*((LPBYTE)(_pSrcStr+j)) > HIGHEST_ASCII)
                break;

        // None found: the text is pure ASCII
        if (j == nSrcSize)
        {
            if (lpEncoding[0].nCodePage == CP_1252)
            {
                lpEncoding[0].nCodePage = CP_20127;
            }
            else
            {
                *pnScores = 1;
                lpEncoding[0].nCodePage = CP_20127;
                lpEncoding[0].nConfidence =
                lpEncoding[0].nDocPercent = 100;
                lpEncoding[0].nLangID = LANG_ENGLISH;
                hr = S_OK;
            }
        }
    }

    // UTF-8 doesn't really have distinctive signatures,
    // if text amount is small, we won't return low confidence UTF-8 detection result.
    if (hr == S_FALSE && IS_ENCODED_ENCODING(lpEncoding[0].nCodePage) &&
        !((nSrcSize < MIN_TEXT_SIZE) && (lpEncoding[0].nCodePage == CP_UTF_8)))
        hr = S_OK;

DETECT_DONE:

    // The scratch copy only exists in HTML mode
    if ((dwFlag & MLDETECTCP_HTML) && _pSrcStr)
        LocalFree(_pSrcStr);

    if (pstmTmp)
    {
        pstmTmp->Release();
    }

    return hr ;
}
|
|
|
|
// Detect the code page(s) of the contents of a stream.
//
// Reads the stream (its size is taken from the low 32 bits of the end
// position) into a temporary buffer and hands it to _DetectInputCodepage.
//
// Returns E_INVALIDARG for a NULL or empty stream. Otherwise returns the
// detection result, except that when detection reports S_OK the last
// non-S_OK status recorded along the way (seek/read result or E_OUTOFMEMORY)
// is returned in its place.
HRESULT WINAPI _DetectCodepageInIStream(DWORD dwFlag, DWORD uiPrefWinCodepage, IStream *pstmIn, DetectEncodingInfo *lpEncoding, INT *pnScores)
{
    if (NULL == pstmIn)
        return E_INVALIDARG ;

    HRESULT        hrResult  = S_OK;
    HRESULT        hrPending = S_OK;    // last non-S_OK status seen on the way
    LARGE_INTEGER  liStart   = { 0, 0 };
    ULARGE_INTEGER uliEnd    = { 0, 0 };

    // Get size: seek to the end and look at the resulting position
    hrResult = pstmIn->Seek(liStart, STREAM_SEEK_END, &uliEnd);
    if (hrResult != S_OK)
        hrPending = hrResult;

    if (!uliEnd.LowPart && !uliEnd.HighPart)
        return E_INVALIDARG ;

    ULONG cbStream = uliEnd.LowPart ;

    // Allocate a temp input buffer
    LPSTR pszData = (LPSTR) LocalAlloc(LPTR, cbStream);
    if (NULL == pszData)
    {
        hrPending = E_OUTOFMEMORY ;
    }
    else
    {
        // Reset the pointer
        hrResult = pstmIn->Seek(liStart, STREAM_SEEK_SET, NULL);
        if (hrResult != S_OK)
            hrPending = hrResult;

        // cbStream is updated to the number of bytes actually read
        hrResult = pstmIn->Read(pszData, cbStream, &cbStream);
        if (hrResult != S_OK)
            hrPending = hrResult;

        INT cbUsed = (INT) cbStream ;
        hrResult = _DetectInputCodepage(dwFlag, uiPrefWinCodepage, pszData, &cbUsed, lpEncoding, pnScores);

        LocalFree(pszData);
    }

    return (S_OK == hrResult) ? hrPending : hrResult;
}
|