/*
 * Automatic language and codepage detector
 *
 * Bob Powell, 2/97
 * Copyright (C) 1996, 1997, Microsoft Corp. All rights reserved.
 *
 * History: 1-Feb-97 BobP Created
 *          5-Aug-97 BobP Unicode support; Charmaps in data file.
 */
#include "private.h"
// Construct a Histogram over a histogram section of the mapped data file.
//
//   pHS  - section header; the element table is taken to begin immediately
//          after this header struct.
//   pMap - character map to store with this histogram.
//
// No table data is copied: m_panElts points at the memory following *pHS.
Histogram::Histogram (const PFileHistogramSection pHS, const PHIdx pMap)
    : m_nDimensionality((UCHAR)pHS->m_dwDimensionality),
      m_nEdgeSize((UCHAR)pHS->m_dwEdgeSize),
      m_nCodePage((USHORT)pHS->m_dwCodePage),
      m_pMap(pMap),
      m_panElts((HElt *)&pHS[1])
{
    // Element count = edge size raised to the number of dimensions.
    m_nElts = 1;

    UCHAR nDim = 0;
    while (nDim < m_nDimensionality)
    {
        m_nElts *= m_nEdgeSize;
        nDim++;
    }
}
// Check that this histogram is consistent with the space available for it.
//
//   nBytes - number of bytes available for the element table.
//
// Returns NO_ERROR when the histogram has at most 4 dimensions and the
// table of m_nElts elements fits in nBytes; otherwise
// ERROR_INTERNAL_DB_CORRUPTION.
DWORD Histogram::Validate (DWORD nBytes) const
{
    // Reject an unsupported dimensionality first, so the element count
    // examined below is known to come from a bounded number of multiplies.
    if (m_nDimensionality > 4)
        return ERROR_INTERNAL_DB_CORRUPTION;

    // Compare by division rather than computing m_nElts * sizeof(HElt):
    // the product can wrap where size_t is 32 bits, which would let a
    // table that is too short pass the check.
    if (m_nElts > nBytes / sizeof(HElt))
        return ERROR_INTERNAL_DB_CORRUPTION;

    return NO_ERROR;
}
// Clone constructor: duplicate every field of H except the character map,
// which is replaced by pMap.  The clone refers to the same element table
// as H (the pointer is copied, not the table).
Histogram::Histogram (const Histogram &H, const PHIdx pMap)
    : m_nDimensionality(H.m_nDimensionality),
      m_nEdgeSize(H.m_nEdgeSize),
      m_nCodePage(H.m_nCodePage),
      m_nElts(H.m_nElts),
      m_pMap(pMap),
      m_panElts(H.m_panElts)
{
    // Nothing further to set up.
}
// Destructor.  Intentionally empty: the pointer members refer to the
// mapped file, so there is nothing here to free.
Histogram::~Histogram (void)
{
}
// Base-class constructor: record the owning detector, the language ID,
// the number of code pages and the range ID.
Language::Language (PLCDetect pL, int nLangID, int nCodePages, int nRangeID)
    : m_pLC(pL),
      m_nLangID(nLangID),
      m_nCodePages(nCodePages),
      m_nRangeID(nRangeID)
{
}
// Construct a 7-bit language with no histograms attached yet; they are
// supplied later through AddHistogram().
Language7Bit::Language7Bit (PLCDetect pL, int nLangID, int nCodePages)
    : Language(pL, nLangID, nCodePages)
{
    // No language-detection histogram until one is added.
    m_pLangHistogram = NULL;

    // Every code page histogram slot starts out empty.
    memset ((void *)m_ppCodePageHistogram, 0, sizeof(m_ppCodePageHistogram));
}
// Release the language histogram and every code page histogram held by
// this language.
Language7Bit::~Language7Bit (void)
{
    // Deleting a null pointer does nothing, so no explicit test is needed.
    delete m_pLangHistogram;

    int nSlot = 0;
    while (nSlot < MAXSUBLANG)
    {
        delete m_ppCodePageHistogram[nSlot];
        nSlot++;
    }
}
// Add the raw histogram at *pHS in the mapped file to this language object.
// The histograms must be for 7-bit detection.
//
//   pHS    - histogram section header in the mapped file.
//   nBytes - number of bytes available for the histogram's element table.
//   nIdx   - position of this histogram within the language: 0 is the
//            language-detection table, 1..n are code page tables.
//
// Returns NO_ERROR on success, ERROR_OUTOFMEMORY if allocation fails, or
// ERROR_INTERNAL_DB_CORRUPTION if the index or the histogram is invalid.
// A histogram that fails validation is destroyed, not stored.
DWORD Language7Bit::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
{
    DWORD hr = NO_ERROR;

    if (nIdx < 0)
        return ERROR_INTERNAL_DB_CORRUPTION;

    PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );

    if (nIdx == 0)
    {
        // The first histogram for a language is its language-detection table.

        Histogram *pLang = new Histogram (pHS, pMap);
        if (pLang == NULL)
            return ERROR_OUTOFMEMORY;

        if ((hr = pLang->Validate (nBytes)) != NO_ERROR)
        {
            delete pLang;
            return hr;
        }

        // Replace (rather than leak) any table stored earlier.
        delete m_pLangHistogram;
        m_pLangHistogram = pLang;
    }
    else
    {
        // Each subsequent histogram is a code page detection table.

        const int nSlot = nIdx - 1;

        // The slot must be within both the declared code page count and
        // the fixed capacity of the pointer array.
        const int nCapacity =
            (int)(sizeof(m_ppCodePageHistogram) / sizeof(m_ppCodePageHistogram[0]));

        if (nSlot >= m_nCodePages || nSlot >= nCapacity)
            return ERROR_INTERNAL_DB_CORRUPTION;

        Histogram *pH = new Histogram (pHS, pMap);
        if (pH == NULL)
            return ERROR_OUTOFMEMORY;

        if ((hr = pH->Validate (nBytes)) != NO_ERROR)
        {
            // Not stored anywhere yet, so it must be freed here.
            delete pH;
            return hr;
        }

        m_ppCodePageHistogram[nSlot] = pH;

        // Cache for the scoring vector math
        m_paHElt[nSlot] = pH->Array();
    }

    return hr;
}
// Construct an 8-bit language with every histogram slot empty; the
// histograms are supplied later through AddHistogram().
Language8Bit::Language8Bit (PLCDetect pL, int nLangID, int nCodePages)
    : Language(pL, nLangID, nCodePages)
{
    memset ((void *)m_ppHistogram, 0, sizeof(m_ppHistogram));
}
// Release every histogram held by this language.
Language8Bit::~Language8Bit (void)
{
    int nSlot = 0;
    while (nSlot < MAXSUBLANG)
    {
        // Deleting a null pointer does nothing, so empty slots are harmless.
        delete m_ppHistogram[nSlot];
        nSlot++;
    }
}
// Add the raw histogram at *pHS to this language object.
// This language is known to use 8-bit detection.
//
//   pHS    - histogram section header in the mapped file.
//   nBytes - number of bytes available for the histogram's element table.
//   nIdx   - zero-based position of this histogram within the language.
//
// Returns NO_ERROR on success, ERROR_OUTOFMEMORY if allocation fails, or
// ERROR_INTERNAL_DB_CORRUPTION if the index or the histogram is invalid.
// A histogram that fails validation is destroyed, not stored.
DWORD Language8Bit::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
{
    DWORD hr = NO_ERROR;

    // The histograms are the direct language-code page tables.  The slot
    // must be within both the declared code page count and the fixed
    // capacity of the pointer array.
    const int nCapacity = (int)(sizeof(m_ppHistogram) / sizeof(m_ppHistogram[0]));

    if (nIdx < 0 || nIdx >= m_nCodePages || nIdx >= nCapacity)
        return ERROR_INTERNAL_DB_CORRUPTION;

    PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );

    Histogram *pH = new Histogram (pHS, pMap);
    if (pH == NULL)
        return ERROR_OUTOFMEMORY;

    if ((hr = pH->Validate (nBytes)) != NO_ERROR)
    {
        // Not stored anywhere yet, so it must be freed here.
        delete pH;
        return hr;
    }

    m_ppHistogram[nIdx] = pH;

    return hr;
}
// Construct a Unicode language with every sublanguage histogram slot
// empty; the histograms are supplied later through AddHistogram().
LanguageUnicode::LanguageUnicode (PLCDetect pL, int nLangID, int nSubLangs, int nRangeID)
    : Language(pL, nLangID, nSubLangs, nRangeID)
{
    memset ((void *)m_ppSubLangHistogram, 0, sizeof(m_ppSubLangHistogram));
}
// Release every sublanguage histogram held by this language.
LanguageUnicode::~LanguageUnicode (void)
{
    int nSlot = 0;
    while (nSlot < MAXSUBLANG)
    {
        // Deleting a null pointer does nothing, so empty slots are harmless.
        delete m_ppSubLangHistogram[nSlot];
        nSlot++;
    }
}
// Add the raw histogram at *pHS to this Unicode language object.
//
//   pHS    - histogram section header in the mapped file.
//   nBytes - number of bytes available for the histogram's element table.
//   nIdx   - zero-based position of this histogram within the language.
//
// Returns NO_ERROR on success, ERROR_OUTOFMEMORY if allocation fails, or
// ERROR_INTERNAL_DB_CORRUPTION if the index or the histogram is invalid.
// A histogram that fails validation is destroyed, not stored.
DWORD LanguageUnicode::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
{
    DWORD hr = NO_ERROR;

    // All histograms here are for sublanguage detection.  The slot must be
    // within both the declared sublanguage count and the fixed capacity of
    // the pointer array.
    const int nCapacity =
        (int)(sizeof(m_ppSubLangHistogram) / sizeof(m_ppSubLangHistogram[0]));

    if (nIdx < 0 || nIdx >= m_nSubLangs || nIdx >= nCapacity)
        return ERROR_INTERNAL_DB_CORRUPTION;

    // Get the custom charmap used for scoring this sublanguage group
    PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );

    Histogram *pH = new Histogram (pHS, pMap);
    if (pH == NULL)
        return ERROR_OUTOFMEMORY;

    if ((hr = pH->Validate (nBytes)) != NO_ERROR)
    {
        // Not stored anywhere yet, so it must be freed here.
        delete pH;
        return hr;
    }

    m_ppSubLangHistogram[nIdx] = pH;

    // Cache the element array alongside the histogram pointer.
    m_paHElt[nIdx] = pH->Array();

    return hr;
}
// Construct an empty detector bound to module hM.  All counters start at
// zero and all tables, the file view and the file handles start out null;
// nothing is loaded here.
LCDetect::LCDetect (HMODULE hM)
    : m_hModule(hM),
      m_nCharmaps(0),
      m_n7BitLanguages(0),
      m_n8BitLanguages(0),
      m_nUnicodeLanguages(0),
      m_n7BitLangsRead(0),
      m_n8BitLangsRead(0),
      m_nUnicodeLangsRead(0),
      m_nMapsRead(0),
      m_nHistogramsRead(0),
      m_nScoreIdx(0),
      m_pp7BitLanguages(NULL),
      m_pp8BitLanguages(NULL),
      m_ppUnicodeLanguages(NULL),
      m_ppCharmaps(NULL),
      m_pv(NULL),
      m_hmap(0),
      m_hf(0),
      m_pHU27Bit(0)
{
}
// Release everything the detector owns: the cloned 7-bit histogram, the
// language and charmap objects and their pointer tables, the mapped view
// and the mapping and file handles.
LCDetect::~LCDetect ()
{
    delete m_pHU27Bit;

    // The 7-bit, 8-bit and charmap tables are filled sequentially, so only
    // the first "read" entries hold valid pointers; the rest of each table
    // is never assigned and must not be deleted.  The tables themselves
    // come from new[] and therefore need delete[].
    if (m_pp7BitLanguages != NULL)
    {
        for (unsigned int i = 0; i < m_n7BitLangsRead; i++)
            delete m_pp7BitLanguages[i];
        delete [] m_pp7BitLanguages;
    }

    if (m_pp8BitLanguages != NULL)
    {
        for (unsigned int i = 0; i < m_n8BitLangsRead; i++)
            delete m_pp8BitLanguages[i];
        delete [] m_pp8BitLanguages;
    }

    // The Unicode table is indexed by range ID rather than filled in
    // order, so every slot is visited; unassigned slots are expected to be
    // null because BuildState clears the table.
    // NOTE(review): BuildState clears this table only after its allocation
    // check succeeds -- confirm no path leaves it allocated but uncleared.
    if (m_ppUnicodeLanguages != NULL)
    {
        for (unsigned int i = 0; i < m_nUnicodeLanguages; i++)
            delete m_ppUnicodeLanguages[i];
        delete [] m_ppUnicodeLanguages;
    }

    if (m_ppCharmaps != NULL)
    {
        for (unsigned int i = 0; i < m_nMapsRead; i++)
            delete m_ppCharmaps[i];
        delete [] m_ppCharmaps;
    }

    if (m_pv)
        UnmapViewOfFile (m_pv);

    // Only close handles that were actually opened.
    // NOTE(review): LoadState closes these handles on its failure paths
    // without clearing the members, so they can be closed a second time
    // here -- confirm and clear them there.
    if (m_hmap != NULL)
        CloseHandle (m_hmap);

    if (m_hf != NULL && m_hf != INVALID_HANDLE_VALUE)
        CloseHandle (m_hf);
}
// Create the Language7Bit object described by language section *pLS, add
// it to the 7-bit language table and set *ppL to it.
//
// Returns NO_ERROR on success, ERROR_OUTOFMEMORY if allocation fails, or
// ERROR_INTERNAL_DB_CORRUPTION if the section is inconsistent with the
// state read so far.  On failure nothing is allocated and *ppL is left
// untouched.
DWORD LCDetect::Initialize7BitLanguage (PFileLanguageSection pLS, PLanguage *ppL)
{
    // nRecordCount is lang histogram (1) + # of code page histograms

    if ( m_n7BitLangsRead >= m_n7BitLanguages || pLS->m_dwRecordCount < 1)
        return ERROR_INTERNAL_DB_CORRUPTION;

    // Each 7-bit lang uses one score index slot per code page.
    // The range starts with the 7-bit langs, since both the 8-bit
    // and Unicode langs follow it.
    //
    // This is checked before allocating so that the error return cannot
    // leak a newly created language object.
    if (m_n7BitLangsRead == 0 && m_nScoreIdx != 0)
        return ERROR_INTERNAL_DB_CORRUPTION;

    PLanguage7Bit pL = new Language7Bit (this, pLS->m_dwLangID, pLS->m_dwRecordCount - 1);
    if (pL == NULL)
        return ERROR_OUTOFMEMORY;

    // NOTE(review): unlike Initialize8BitLanguage, no SetScoreIdx() call is
    // made on the new object here -- confirm whether that is intended.
    m_nScoreIdx += pLS->m_dwRecordCount - 1;  // skip 1st record (Language)

    m_pp7BitLanguages[ m_n7BitLangsRead++ ] = pL;

    *ppL = pL;

    return NO_ERROR;
}
// Create the Language8Bit object described by language section *pLS, add
// it to the 8-bit language table and set *ppL to it.
//
// Returns NO_ERROR on success, ERROR_OUTOFMEMORY if allocation fails, or
// ERROR_INTERNAL_DB_CORRUPTION if the section is inconsistent with the
// state read so far.  On failure *ppL is left untouched.
DWORD LCDetect::Initialize8BitLanguage (PFileLanguageSection pLS, Language **ppL)
{
    // nRecordCount is # of combined language / code page histograms

    if ( m_n8BitLangsRead >= m_n8BitLanguages || pLS->m_dwRecordCount < 1)
        return ERROR_INTERNAL_DB_CORRUPTION;

    PLanguage8Bit pL = new Language8Bit (this, pLS->m_dwLangID, pLS->m_dwRecordCount);
    if (pL == NULL)
        return ERROR_OUTOFMEMORY;

    // The 8-bit score indices follow the 7-bit languages.
    // Each 8-bit lang uses a score index slot for each of its code pages,
    // since all the code pages are scored in the initial scoring pass.
    // The number of slots reserved is the full record count of this
    // language section.
    pL->SetScoreIdx(m_nScoreIdx);
    m_nScoreIdx += pLS->m_dwRecordCount;

    m_pp8BitLanguages[ m_n8BitLangsRead++ ] = pL;

    *ppL = pL;

    return NO_ERROR;
}
// Create the LanguageUnicode object described by language section *pLS,
// store it in the Unicode language table at the slot given by its range
// ID, and set *ppL to it.
//
// Returns NO_ERROR on success, ERROR_OUTOFMEMORY if allocation fails, or
// ERROR_INTERNAL_DB_CORRUPTION if the section is inconsistent with the
// state read so far.  On failure *ppL is left untouched.
DWORD LCDetect::InitializeUnicodeLanguage (PFileLanguageSection pLS, Language **ppL)
{
    // nRecordCount is # of sublanguage histograms

    if ( m_nUnicodeLangsRead >= m_nUnicodeLanguages ||
         pLS->m_dwUnicodeRangeID >= m_nUnicodeLanguages )
    {
        return ERROR_INTERNAL_DB_CORRUPTION;
    }

    // A second section with the same range ID would overwrite, and so
    // leak, the object already stored in that slot.  Slots are cleared
    // when the table is allocated, so non-null means already in use.
    if (m_ppUnicodeLanguages[ pLS->m_dwUnicodeRangeID ] != NULL)
        return ERROR_INTERNAL_DB_CORRUPTION;

    PLanguageUnicode pL = new LanguageUnicode (this, pLS->m_dwLangID,
                                pLS->m_dwRecordCount, pLS->m_dwUnicodeRangeID);
    if (pL == NULL)
        return ERROR_OUTOFMEMORY;

    // The Unicode score indices follow the 7-bit languages, and overlay the
    // 8-bit slots since they aren't used at the same time.
    if (m_nUnicodeLangsRead == 0 && GetN8BitLanguages() > 0)
        m_nScoreIdx = Get8BitLanguage(0)->GetScoreIdx();

    // Each Unicode entry uses exactly one score index.  SBCS subdetection
    // (Latin group) uses the slots for the corresponding 7-bit languages,
    // and Unicode subdetection (CJK) uses the slots already defined for the
    // Unicode sub-languages.
    // NOTE(review): no score index is assigned to the new object in this
    // function -- confirm whether that is intended.

    // For Unicode, the range ID is used as the Language array index.
    m_ppUnicodeLanguages[ pLS->m_dwUnicodeRangeID ] = pL;
    m_nUnicodeLangsRead++;

    *ppL = pL;

    return NO_ERROR;
}
// A language section begins the definition of data for a language.
// Each language has exactly one of these records.  One or more
// histogram sections follow each language, and are always associated
// with the language of the preceding language section.
//
//   pv           - start of the section (its FileSection header).
//   nSectionSize - total size of the section in bytes.
//   ppL          - receives the Language object created from this section.
//
// Returns NO_ERROR on success, otherwise the error from the type-specific
// initializer, or ERROR_INTERNAL_DB_CORRUPTION if the section is too small
// or names an unknown detection type.  *ppL is set only on success.
DWORD LCDetect::LoadLanguageSection (void *pv, int nSectionSize, PLanguage *ppL)
{
    DWORD hr = NO_ERROR;
    PFileLanguageSection pLS;

    // The section must be large enough to hold both headers before any
    // field of the language header is read.
    if (nSectionSize < 0 ||
        (DWORD)nSectionSize < sizeof(FileSection) + sizeof(*pLS))
    {
        return ERROR_INTERNAL_DB_CORRUPTION;
    }

    // The language header follows the generic section header.
    pLS = (PFileLanguageSection)&((char *)pv)[sizeof(FileSection)];

    switch ( pLS->m_dwDetectionType ) {

    case DETECT_7BIT:
        hr = Initialize7BitLanguage (pLS, ppL);
        break;

    case DETECT_8BIT:
        hr = Initialize8BitLanguage (pLS, ppL);
        break;

    case DETECT_UNICODE:
        hr = InitializeUnicodeLanguage (pLS, ppL);
        break;

    default:
        // No object was created, so reporting success would leave *ppL
        // unset for the caller.
        hr = ERROR_INTERNAL_DB_CORRUPTION;
        break;
    }

    return hr;
}
// Hand the histogram section at pv to language pL.
//
//   pv           - start of the section (its FileSection header).
//   nSectionSize - total size of the section in bytes.
//   pL           - language that receives the histogram.
//
// Returns the result of pL->AddHistogram(), or ERROR_INTERNAL_DB_CORRUPTION
// if the section is too small to hold its headers.  m_nHistogramsRead is
// advanced whenever AddHistogram() is called.
DWORD LCDetect::LoadHistogramSection (void *pv, int nSectionSize, Language *pL)
{
    PFileHistogramSection pHS;

    // Without this check an undersized section yields a negative byte
    // count, which turns into a huge DWORD when passed on and would defeat
    // the histogram size validation.
    const DWORD cbHeaders = (DWORD)(sizeof(FileSection) + sizeof(*pHS));

    if (nSectionSize < 0 || (DWORD)nSectionSize < cbHeaders)
        return ERROR_INTERNAL_DB_CORRUPTION;

    // The histogram header follows the generic section header.
    pHS = (PFileHistogramSection)&((char *)pv)[sizeof(FileSection)];

    // Bytes left for the element table after both headers.
    const DWORD nBytes = (DWORD)nSectionSize - cbHeaders;

    return pL->AddHistogram ( pHS, nBytes, m_nHistogramsRead++);
}
// Create a Charmap from the map section at pv and append it to the
// charmap table.
//
//   pv           - start of the section (its FileSection header).
//   nSectionSize - total size of the section in bytes.
//
// Returns NO_ERROR on success, ERROR_OUTOFMEMORY if allocation fails, or
// ERROR_INTERNAL_DB_CORRUPTION if the section is too small to hold its
// headers or more maps are present than the file header declared.
DWORD LCDetect::LoadMapSection (void *pv, int nSectionSize)
{
    PFileMapSection pMS;

    // The section must at least hold both headers before the map header
    // is handed to the Charmap constructor.
    // NOTE(review): the space remaining after the headers is not checked
    // against the map's own contents -- confirm whether Charmap needs it.
    if (nSectionSize < 0 ||
        (DWORD)nSectionSize < sizeof(FileSection) + sizeof(*pMS))
    {
        return ERROR_INTERNAL_DB_CORRUPTION;
    }

    if (m_nMapsRead >= m_nCharmaps)
        return ERROR_INTERNAL_DB_CORRUPTION;

    // The map header follows the generic section header.
    pMS = (PFileMapSection)&((char *)pv)[sizeof(FileSection)];

    PCharmap pM = new Charmap (pMS);
    if (pM == NULL)
        return ERROR_OUTOFMEMORY;

    m_ppCharmaps[ m_nMapsRead++ ] = pM;

    return NO_ERROR;
}
// Parse the mapped file image at *m_pv: validate the header, allocate the
// language and charmap pointer tables, walk the sections that follow the
// header, then fill the quick-reference arrays used for scoring.
//
//   nFileSize - size in bytes of the image at *m_pv.
//
// Returns NO_ERROR on success, ERROR_OUTOFMEMORY if a table allocation
// yields NULL, ERROR_INTERNAL_DB_CORRUPTION if the image fails a check, or
// the error returned by a section loader.
//
// NOTE(review): `hr` is used below but is not declared anywhere in the
// visible text of this function -- a declaration appears to be missing.
DWORD LCDetect::BuildState (DWORD nFileSize) //
// Build the detection structures from the mapped training file image at *m_pv
{ PLanguage pL; PFileHeader pFH; PFileSection pFS;
// Validate header
pFH = (PFileHeader) m_pv;
// Reject the image if it is smaller than the header, the signature or
// version does not match, the declared header size is not smaller than the
// file, or any of the four table counts is zero.
if ( nFileSize < sizeof(*pFH) || pFH->m_dwAppSig != APP_SIGNATURE || pFH->m_dwVersion != APP_VERSION || pFH->m_dwHdrSizeBytes >= nFileSize || pFH->m_dwN7BitLanguages == 0 || pFH->m_dwN8BitLanguages == 0 || pFH->m_dwNUnicodeLanguages == 0 || pFH->m_dwNCharmaps == 0 ) { return ERROR_INTERNAL_DB_CORRUPTION; }
// Allocate language pointer table per header
m_n7BitLanguages = pFH->m_dwN7BitLanguages; m_pp7BitLanguages = new PLanguage7Bit [m_n7BitLanguages];
m_n8BitLanguages = pFH->m_dwN8BitLanguages; m_pp8BitLanguages = new PLanguage8Bit [m_n8BitLanguages];
m_nUnicodeLanguages = pFH->m_dwNUnicodeLanguages; m_ppUnicodeLanguages = new PLanguageUnicode [m_nUnicodeLanguages];
m_nCharmaps = pFH->m_dwNCharmaps; m_ppCharmaps = new PCharmap [m_nCharmaps];
// These tests only have an effect when operator new[] reports failure by
// returning NULL.
if ( m_pp7BitLanguages == NULL || m_pp8BitLanguages == NULL || m_ppUnicodeLanguages == NULL || m_ppCharmaps == NULL ) { return ERROR_OUTOFMEMORY; }
// Clear, because not all slots may be assigned
// (Only the Unicode table is cleared; the other three tables are left
// with whatever new[] returned until their slots are assigned.)
memset (m_ppUnicodeLanguages, 0, sizeof(PLanguageUnicode) * m_nUnicodeLanguages);
// Remember other header info
m_LCDConfigureDefault.nMin7BitScore = pFH->m_dwMin7BitScore; m_LCDConfigureDefault.nMin8BitScore = pFH->m_dwMin8BitScore; m_LCDConfigureDefault.nMinUnicodeScore = pFH->m_dwMinUnicodeScore; m_LCDConfigureDefault.nRelativeThreshhold = pFH->m_dwRelativeThreshhold; m_LCDConfigureDefault.nDocPctThreshhold = pFH->m_dwDocPctThreshhold; m_LCDConfigureDefault.nChunkSize = pFH->m_dwChunkSize;
// Position to first section
pFS = (PFileSection) &((char *)m_pv)[pFH->m_dwHdrSizeBytes];
// Read and process each file section
while ( hr == NO_ERROR ) {
// check alignment
// (the section address must be a multiple of 4)
if (((DWORD_PTR)pFS & 3) != 0) { hr = ERROR_INTERNAL_DB_CORRUPTION; break; }
// zero-length section marks end of data
if (pFS->m_dwSizeBytes == 0) break;
// A section must end strictly before the end of the file; one that ends
// exactly at the end of the file is rejected as well.
if ( &((char *)pFS)[pFS->m_dwSizeBytes] >= &((char *)m_pv)[nFileSize]) { hr = ERROR_INTERNAL_DB_CORRUPTION; break; }
// NOTE(review): the next two statements sit before the first `case` label
// of this switch, so as written no label reaches them.  The `case` labels
// for the language and histogram section types appear to be missing from
// this text -- restore them from the original source.
switch ( pFS->m_dwType ) {
hr = LoadLanguageSection ((void*)pFS, pFS->m_dwSizeBytes, &pL); m_nHistogramsRead = 0; break;
hr = LoadHistogramSection ((void*)pFS, pFS->m_dwSizeBytes, pL); break;
case SECTION_TYPE_MAP: hr = LoadMapSection ((void*)pFS, pFS->m_dwSizeBytes); break;
default: // ignore unrecognized sections
break; }
// Step to the next section using the size stored in this one.
pFS = (PFileSection) &((char *)pFS)[pFS->m_dwSizeBytes]; }
if (hr != NO_ERROR) return hr;
// Every charmap promised by the header must have been read.  The language
// counts are not checked the same way here.
if ( m_nMapsRead != m_nCharmaps ) return ERROR_INTERNAL_DB_CORRUPTION;
// Set up quick-reference arrays used by the scoring inner loops
for (unsigned int i = 0; i < GetN7BitLanguages(); i++) m_paHElt7Bit[i] = Get7BitLanguage(i)->GetLangHistogram()->Array();
// NOTE(review): `i` below is the variable declared in the previous for
// statement; standard C++ ends its scope with that loop.
// The inner `pL` shadows the function-level `pL`.
m_nHElt8Bit = 0; for (i = 0; i < GetN8BitLanguages(); i++) { PLanguage8Bit pL = Get8BitLanguage(i);
for (int j = 0; j < pL->NCodePages(); j++) m_paHElt8Bit[m_nHElt8Bit++] = pL->GetHistogram(j)->Array(); }
// Set up the Histogram used for ScoreVectorW() for scoring Unicode
// text for 7-bit language detection. Clone the first 7-bit language
// histogram and replace its map with CHARMAP_U27BIT.
m_pHU27Bit = new Histogram ( *Get7BitLanguage(0)->GetLangHistogram(), GetMap(CHARMAP_U27BIT));
return hr; }
// Build the path of the detection data file, map the file read-only into
// memory and pass the mapped image to BuildState().
//
// Returns NO_ERROR on success, the GetLastError() value of the Win32 call
// that failed, or the result of BuildState().
//
// NOTE(review): the failure paths below close m_hmap / m_hf but leave the
// members holding the closed handle values, and the destructor calls
// CloseHandle on both members again.
DWORD LCDetect::LoadState (void) //
// Overall initialization and state loading. Open the compiled training
// file from its fixed location in the System32 directory, and assemble
// in-memory detection tables from its contents.
// NOTE(review): the code below builds the path from the directory of
// m_hModule's file, not from a fixed System32 path -- confirm which
// description is current.
// NOTE(review): in this text the #define shares a line with other
// statements; a preprocessor directive needs a line of its own.
{ DWORD hr = NO_ERROR; DWORD nFileSize; #define MODULENAMELEN 100
// Room for the module path plus the data file name appended below.
char szFilename[MODULENAMELEN+50], *p;
// Find out if NT or Windows
// nOSWinNT stays 0 if GetVersionExA fails.
OSVERSIONINFOA OSVersionInfo; int nOSWinNT = 0; OSVersionInfo.dwOSVersionInfoSize = sizeof( OSVERSIONINFOA ); if ( GetVersionExA( &OSVersionInfo ) ) nOSWinNT = OSVersionInfo.dwPlatformId;
// Open the training data file,
// look in the directory that contains the DLL.
if (GetModuleFileNameA (m_hModule, szFilename, MODULENAMELEN) == 0) return GetLastError();
// Cut the path just after its last '\' (or, failing that, its last ':'),
// or empty it if neither is found, then append the data file name.
if ( (p = strrchr (szFilename, '\\')) != NULL || (p = strrchr (szFilename, ':')) != NULL ) { *++p = 0; } else *szFilename = 0; strcat (szFilename, DETECTION_DATA_FILENAME);
// NOTE(review): nothing in the visible text opens szFilename or assigns
// m_hf before it is used here -- the statement that opens the file appears
// to be missing.
if ((nFileSize = GetFileSize (m_hf, NULL)) == 0xffffffff) { hr = GetLastError(); CloseHandle (m_hf); return hr; }
// Virtual-map the file
// Both branches pass identical arguments; they differ only in calling
// CreateFileMapping versus CreateFileMappingA.
if ( nOSWinNT == VER_PLATFORM_WIN32_NT ) m_hmap = CreateFileMapping (m_hf, NULL, PAGE_READONLY, 0, nFileSize, NULL); else m_hmap = CreateFileMappingA (m_hf, NULL, PAGE_READONLY, 0, nFileSize, NULL);
if (m_hmap == NULL) { hr = GetLastError(); CloseHandle (m_hf); return hr; }
// Map a read-only view of the file mapping.
if ((m_pv = MapViewOfFile (m_hmap, FILE_MAP_READ, 0, 0, 0 )) == NULL) { hr = GetLastError(); CloseHandle (m_hmap); CloseHandle (m_hf); return hr; } // Build the in-memory structures from the file
hr = BuildState (nFileSize);
return hr; }