windows-server-2003/shell/ext/mlang/lcinit.cpp


								/*

								 * Automatic language and codepage detector

								 *

								 * Bob Powell, 2/97

								 * Copyright (C) 1996, 1997, Microsoft Corp.  All rights reserved.

								 *

								 *  History:    1-Feb-97    BobP      Created

								 *              5-Aug-97    BobP      Unicode support; Charmaps in data file.

								 */

								#include "private.h"

								#include <strsafe.h>

								/****************************************************************/


								Histogram::Histogram (const PFileHistogramSection pHS, const PHIdx pMap)
								: m_nDimensionality((UCHAR)pHS->m_dwDimensionality),
								  m_nEdgeSize((UCHAR)pHS->m_dwEdgeSize),
								  m_nCodePage((USHORT)pHS->m_dwCodePage),
								  m_pMap(pMap),
								  m_panElts((HElt *)(pHS + 1))
								//
								// Wrap the raw histogram section at *pHS.  The header fields are narrowed
								// to the member widths, pMap is stored as given, and the element table is
								// taken to start immediately after the header struct.
								{
								    // The table holds one element per combination of character values:
								    // the edge size multiplied by itself once per dimension.

								    m_nElts = 1;

								    for (UCHAR nLeft = m_nDimensionality; nLeft != 0; --nLeft)
								        m_nElts *= m_nEdgeSize;
								}


								DWORD
								Histogram::Validate (DWORD nBytes) const
								//
								// Sanity-check this histogram against the nBytes available for its
								// element table.  Returns NO_ERROR when acceptable, otherwise
								// ERROR_INTERNAL_DB_CORRUPTION.
								{
								    // The table must fit in the bytes the caller says are available.

								    if (nBytes < m_nElts * sizeof(HElt))
								        return ERROR_INTERNAL_DB_CORRUPTION;

								    // More than four dimensions is rejected.

								    if (m_nDimensionality > 4)
								        return ERROR_INTERNAL_DB_CORRUPTION;

								    return NO_ERROR;
								}


								Histogram::Histogram (const Histogram &H, const PHIdx pMap)
								: m_nDimensionality(H.m_nDimensionality),
								  m_nEdgeSize(H.m_nEdgeSize),
								  m_nCodePage(H.m_nCodePage),
								  m_nElts(H.m_nElts),
								  m_pMap(pMap),
								  m_panElts(H.m_panElts)
								//
								// Make a copy of histogram H that shares H's element table but is
								// paired with the Charmap index pMap instead of H's own.
								{
								    // Everything is set up by the initializer list.
								}


								Histogram::~Histogram (void)
								//
								// Nothing to release: the pointer members refer into the mapped file
								// and are not owned by this object.
								{
								}


								/****************************************************************/


								Language::Language (PLCDetect pL, int nLangID, int nCodePages, int nRangeID)
								: m_pLC(pL),
								  m_nLangID(nLangID),
								  m_nCodePages(nCodePages),
								  m_nRangeID(nRangeID)
								//
								// Record the owning detector, the language ID, the code page count and
								// the range ID exactly as supplied.
								{
								}


								Language7Bit::Language7Bit (PLCDetect pL, int nLangID, int nCodePages)
								: Language(pL, nLangID, nCodePages),
								  m_pLangHistogram(NULL)
								//
								// Start with no language histogram and an all-NULL table of code page
								// histograms; AddHistogram() fills them in later.
								{
								    memset (m_ppCodePageHistogram, 0, sizeof(m_ppCodePageHistogram));
								}


								Language7Bit::~Language7Bit (void)
								//
								// Free the language histogram and every code page histogram that was
								// added.  Unused slots are NULL, and deleting NULL does nothing.
								{
								    delete m_pLangHistogram;

								    for (int nSlot = 0; nSlot < MAXSUBLANG; ++nSlot)
								    {
								        delete m_ppCodePageHistogram[nSlot];
								    }
								}


								DWORD
								Language7Bit::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
								//
								// Add the raw histogram at *pHS in the mapped file to this language object.
								// The histograms must be for 7-bit detection.
								//
								//   pHS     histogram section header; the element table follows it
								//   nBytes  number of bytes available for the element table
								//   nIdx    0 for the language-detection table, 1..n for the code page
								//           detection tables
								//
								// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION.
								// A histogram that fails validation is freed and is not stored.
								{
								    if (nIdx < 0)
								        return ERROR_INTERNAL_DB_CORRUPTION;

								    // For code page tables, check the slot before allocating anything.
								    // m_nCodePages comes from the data file, so the slot must also be
								    // checked against MAXSUBLANG, the real capacity of the table.

								    const int nSlot = nIdx - 1;

								    if (nIdx != 0 && (nSlot >= m_nCodePages || nSlot >= MAXSUBLANG))
								        return ERROR_INTERNAL_DB_CORRUPTION;

								    PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );

								    Histogram *pH = new Histogram (pHS, pMap);

								    if (pH == NULL)
								        return ERROR_OUTOFMEMORY;

								    DWORD hr = pH->Validate (nBytes);

								    if (hr != NO_ERROR)
								    {
								        delete pH;              // not stored anywhere, so free it here
								        return hr;
								    }

								    if (nIdx == 0)
								    {
								        // The first histogram for a language is its language-detection table.
								        // Free any table stored earlier so it is not leaked.

								        delete m_pLangHistogram;
								        m_pLangHistogram = pH;
								    }
								    else
								    {
								        // Each subsequent histogram is a code page detection table.

								        delete m_ppCodePageHistogram[nSlot];
								        m_ppCodePageHistogram[nSlot] = pH;

								        // Cache for the scoring vector math

								        m_paHElt[nSlot] = pH->Array();
								    }

								    return NO_ERROR;
								}


								/****************************************************************/


								Language8Bit::Language8Bit (PLCDetect pL, int nLangID, int nCodePages)
								: Language(pL, nLangID, nCodePages)
								//
								// Start with an all-NULL histogram table; AddHistogram() fills it later.
								{
								    memset (m_ppHistogram, 0, sizeof(m_ppHistogram));
								}


								Language8Bit::~Language8Bit (void)
								//
								// Free every histogram that was added.  Unused slots are NULL, and
								// deleting NULL does nothing.
								{
								    for (int nSlot = 0; nSlot < MAXSUBLANG; ++nSlot)
								    {
								        delete m_ppHistogram[nSlot];
								    }
								}


								DWORD
								Language8Bit::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
								//
								// Add the raw histogram at *pHS to this language object.
								// This language is known to use 8-bit detection.
								//
								//   pHS     histogram section header; the element table follows it
								//   nBytes  number of bytes available for the element table
								//   nIdx    zero-based slot for this histogram
								//
								// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION.
								// A histogram that fails validation is freed and is not stored.
								{
								    // The histograms are the direct language-code page tables.
								    // m_nCodePages comes from the data file, so the slot must also be
								    // checked against MAXSUBLANG, the real capacity of the table.

								    if (nIdx < 0 || nIdx >= m_nCodePages || nIdx >= MAXSUBLANG)
								        return ERROR_INTERNAL_DB_CORRUPTION;

								    PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );

								    Histogram *pH = new Histogram (pHS, pMap);

								    if (pH == NULL)
								        return ERROR_OUTOFMEMORY;

								    DWORD hr = pH->Validate (nBytes);

								    if (hr != NO_ERROR)
								    {
								        delete pH;              // not stored anywhere, so free it here
								        return hr;
								    }

								    // Free any histogram already in the slot so it is not leaked.

								    delete m_ppHistogram[nIdx];
								    m_ppHistogram[nIdx] = pH;

								    return NO_ERROR;
								}


								/****************************************************************/


								LanguageUnicode::LanguageUnicode (PLCDetect pL, int nLangID,
								    int nSubLangs, int nRangeID)
								: Language(pL, nLangID, nSubLangs, nRangeID)
								//
								// The sublanguage count is handed to the base class in the code page
								// count position.  Start with an all-NULL sublanguage histogram table.
								{
								    memset (m_ppSubLangHistogram, 0, sizeof(m_ppSubLangHistogram));
								}


								LanguageUnicode::~LanguageUnicode (void)
								//
								// Free every sublanguage histogram that was added.  Unused slots are
								// NULL, and deleting NULL does nothing.
								{
								    for (int nSlot = 0; nSlot < MAXSUBLANG; ++nSlot)
								    {
								        delete m_ppSubLangHistogram[nSlot];
								    }
								}


								DWORD
								LanguageUnicode::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
								//
								// Add the raw histogram at *pHS to this Unicode language object.
								//
								//   pHS     histogram section header; the element table follows it
								//   nBytes  number of bytes available for the element table
								//   nIdx    zero-based sublanguage slot for this histogram
								//
								// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION.
								// A histogram that fails validation is freed and is not stored.
								{
								    // All histograms are for sublanguage detection.  The sublanguage
								    // count comes from the data file, so the slot must also be checked
								    // against MAXSUBLANG, the real capacity of the table.

								    if (nIdx < 0 || nIdx >= m_nSubLangs || nIdx >= MAXSUBLANG)
								        return ERROR_INTERNAL_DB_CORRUPTION;

								    // Get the custom charmap used for scoring this sublanguage group

								    PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );

								    Histogram *pH = new Histogram (pHS, pMap);

								    if (pH == NULL)
								        return ERROR_OUTOFMEMORY;

								    DWORD hr = pH->Validate (nBytes);

								    if (hr != NO_ERROR)
								    {
								        delete pH;              // not stored anywhere, so free it here
								        return hr;
								    }

								    // Free any histogram already in the slot so it is not leaked.

								    delete m_ppSubLangHistogram[nIdx];
								    m_ppSubLangHistogram[nIdx] = pH;

								    // Cache the element array alongside the histogram

								    m_paHElt[nIdx] = pH->Array();

								    return NO_ERROR;
								}


								/****************************************************************/


								LCDetect::LCDetect (HMODULE hM)
								: m_hModule(hM),
								  // counts taken from the file header
								  m_nCharmaps(0),
								  m_n7BitLanguages(0),
								  m_n8BitLanguages(0),
								  m_nUnicodeLanguages(0),
								  // progress counters used while loading
								  m_n7BitLangsRead(0),
								  m_n8BitLangsRead(0),
								  m_nUnicodeLangsRead(0),
								  m_nMapsRead(0),
								  m_nHistogramsRead(0),
								  m_nScoreIdx(0),
								  // tables allocated by BuildState()
								  m_pp7BitLanguages(NULL),
								  m_pp8BitLanguages(NULL),
								  m_ppUnicodeLanguages(NULL),
								  m_ppCharmaps(NULL),
								  // file mapping state set up by LoadState()
								  m_pv(NULL),
								  m_hmap(NULL),
								  m_hf(NULL),
								  m_pHU27Bit(NULL)
								//
								// Remember the module handle and start with every count at zero and
								// every pointer and handle empty.  Nothing is loaded here.
								{
								}


								LCDetect::~LCDetect ()
								//
								// Release everything built by LoadState() / BuildState().  This must be
								// safe after a load that failed part way, so only slots known to have
								// been assigned are deleted.
								{
								    unsigned int i;

								    // BuildState() clears the Unicode table only after all four tables have
								    // been allocated, so its slots are meaningful only in that case.

								    const bool fUnicodeSlotsValid =
								        m_pp7BitLanguages != NULL && m_pp8BitLanguages != NULL &&
								        m_ppUnicodeLanguages != NULL && m_ppCharmaps != NULL;

								    delete m_pHU27Bit;

								    // The 7-bit, 8-bit and charmap tables are filled sequentially and are
								    // never cleared, so only the first *Read slots hold valid pointers.
								    // The tables come from new[], so they are released with delete [].

								    if (m_pp7BitLanguages != NULL)
								    {
								        for (i = 0; i < m_n7BitLangsRead; i++)
								            delete m_pp7BitLanguages[i];

								        delete [] m_pp7BitLanguages;
								    }

								    if (m_pp8BitLanguages != NULL)
								    {
								        for (i = 0; i < m_n8BitLangsRead; i++)
								            delete m_pp8BitLanguages[i];

								        delete [] m_pp8BitLanguages;
								    }

								    // The Unicode table is indexed by range ID, so assigned slots may be
								    // anywhere in it; unassigned slots are NULL once it has been cleared.

								    if (m_ppUnicodeLanguages != NULL)
								    {
								        if (fUnicodeSlotsValid)
								        {
								            for (i = 0; i < m_nUnicodeLanguages; i++)
								                delete m_ppUnicodeLanguages[i];
								        }

								        delete [] m_ppUnicodeLanguages;
								    }

								    if (m_ppCharmaps != NULL)
								    {
								        for (i = 0; i < m_nMapsRead; i++)
								            delete m_ppCharmaps[i];

								        delete [] m_ppCharmaps;
								    }

								    // Tear down the file mapping: view first, then mapping, then file.
								    // Handles that were never opened are skipped.

								    if (m_pv)
								        UnmapViewOfFile (m_pv);

								    if (m_hmap != NULL)
								        CloseHandle (m_hmap);

								    if (m_hf != NULL && m_hf != INVALID_HANDLE_VALUE)
								        CloseHandle (m_hf);
								}


								DWORD
								LCDetect::Initialize7BitLanguage (PFileLanguageSection pLS, PLanguage *ppL)
								//
								// Create the Language7Bit object described by the language section *pLS,
								// store it in the next free slot of the 7-bit language table, and set
								// *ppL to it.
								//
								// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION.
								// *ppL is written only on success.
								{
								    // nRecordCount is lang histogram (1) + # of code page histograms

								    if ( m_n7BitLangsRead >= m_n7BitLanguages || pLS->m_dwRecordCount < 1)
								        return ERROR_INTERNAL_DB_CORRUPTION;

								    // Each 7-bit lang uses one score index slot per code page.
								    // The range starts with the 7-bit langs, since both the 8-bit
								    // and Unicode langs follow it.  This is checked before allocating
								    // so that rejecting the file does not leak the new object.

								    if (m_n7BitLangsRead == 0 && m_nScoreIdx != 0)
								        return ERROR_INTERNAL_DB_CORRUPTION;

								    PLanguage7Bit pL = new Language7Bit (this, pLS->m_dwLangID, pLS->m_dwRecordCount - 1);

								    if (pL == NULL)
								        return ERROR_OUTOFMEMORY;

								    pL->SetScoreIdx(m_nScoreIdx);

								    m_nScoreIdx += pLS->m_dwRecordCount - 1;    // skip 1st record (Language)

								    m_pp7BitLanguages[ m_n7BitLangsRead++ ] = pL;

								    *ppL = pL;

								    return NO_ERROR;
								}


								DWORD
								LCDetect::Initialize8BitLanguage (PFileLanguageSection pLS, Language **ppL)
								//
								// Create the Language8Bit object described by the language section *pLS,
								// append it to the 8-bit language table, and hand it back through *ppL.
								{
								    // The record count is the number of combined language / code page
								    // histograms; there must be at least one.

								    if (pLS->m_dwRecordCount < 1)
								        return ERROR_INTERNAL_DB_CORRUPTION;

								    // The header said how many 8-bit languages to expect; don't exceed it.

								    if (m_n8BitLangsRead >= m_n8BitLanguages)
								        return ERROR_INTERNAL_DB_CORRUPTION;

								    PLanguage8Bit pNew = new Language8Bit (this, pLS->m_dwLangID, pLS->m_dwRecordCount);

								    if (pNew == NULL)
								        return ERROR_OUTOFMEMORY;

								    // This language's score slots begin at the current score index, and
								    // it takes one slot per record, since all of its code pages are
								    // scored in the initial scoring pass.

								    pNew->SetScoreIdx(m_nScoreIdx);
								    m_nScoreIdx += pLS->m_dwRecordCount;

								    m_pp8BitLanguages[m_n8BitLangsRead] = pNew;
								    m_n8BitLangsRead++;

								    *ppL = pNew;

								    return NO_ERROR;
								}


								DWORD
								LCDetect::InitializeUnicodeLanguage (PFileLanguageSection pLS, Language **ppL)
								//
								// Create the LanguageUnicode object described by the language section
								// *pLS, store it in the Unicode language table at the slot given by its
								// range ID, and set *ppL to it.
								//
								// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION.
								// *ppL is written only on success.
								{
								    // nRecordCount is # of sublanguage histograms

								    if ( m_nUnicodeLangsRead >= m_nUnicodeLanguages ||
								         pLS->m_dwUnicodeRangeID >= m_nUnicodeLanguages )
								    {
								        return ERROR_INTERNAL_DB_CORRUPTION;
								    }

								    PLanguageUnicode pL = new LanguageUnicode (this, pLS->m_dwLangID,
								                        pLS->m_dwRecordCount, pLS->m_dwUnicodeRangeID);

								    if (pL == NULL)
								        return ERROR_OUTOFMEMORY;

								    // The Unicode score indices follow the 7-bit languages, and overlay the
								    // 8-bit slots since they aren't used at the same time.

								    if (m_nUnicodeLangsRead == 0 && GetN8BitLanguages() > 0)
								        m_nScoreIdx = Get8BitLanguage(0)->GetScoreIdx();

								    // Each Unicode entry uses exactly one score index.  SBCS subdetection
								    // (Latin group) uses the slots for the corresponding 7-bit languages,
								    // and Unicode subdetection (CJK) uses the slots already defined for the
								    // Unicode sub-languages.

								    pL->SetScoreIdx(m_nScoreIdx);

								    m_nScoreIdx++;

								    // For Unicode, the range ID is used as the Language array index.
								    // BuildState() clears the table before any section is read, so an
								    // unused slot is NULL.  If the file repeats a range ID, free the
								    // object being displaced so it is not leaked.

								    delete m_ppUnicodeLanguages[ pLS->m_dwUnicodeRangeID ];

								    m_ppUnicodeLanguages[ pLS->m_dwUnicodeRangeID ] = pL;
								    m_nUnicodeLangsRead++;

								    *ppL = pL;

								    return NO_ERROR;
								}


								DWORD
								LCDetect::LoadLanguageSection (void *pv, int nSectionSize, PLanguage *ppL)
								//
								// A language section begins the definition of data for a language.
								// Each language has exactly one of these records.  One or more
								// histogram sections follow each language, and are always associated
								// with the language of the preceding language section.
								//
								//   pv            start of the section (its FileSection header)
								//   nSectionSize  total size of the section in bytes, header included
								//   ppL           receives the Language object created from this section
								//
								// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION.
								// *ppL is written only on success.
								{
								    PFileLanguageSection pLS;

								    // The language record sits right after the section header; make sure
								    // the section is large enough to contain it before reading any field.

								    if ( nSectionSize < 0 ||
								         (size_t)nSectionSize < sizeof(FileSection) + sizeof(*pLS) )
								    {
								        return ERROR_INTERNAL_DB_CORRUPTION;
								    }

								    pLS = (PFileLanguageSection)&((char *)pv)[sizeof(FileSection)];

								    switch ( pLS->m_dwDetectionType ) {

								    case DETECT_7BIT:
								        return Initialize7BitLanguage (pLS, ppL);

								    case DETECT_8BIT:
								        return Initialize8BitLanguage (pLS, ppL);

								    case DETECT_UNICODE:
								        return InitializeUnicodeLanguage (pLS, ppL);
								    }

								    // An unknown detection type creates no Language object, so reporting
								    // success would leave *ppL unset for the histogram sections that follow.

								    return ERROR_INTERNAL_DB_CORRUPTION;
								}


								DWORD
								LCDetect::LoadHistogramSection (void *pv, int nSectionSize, Language *pL)
								//
								// Hand the histogram section at *pv to the language it belongs to.
								//
								//   pv            start of the section (its FileSection header)
								//   nSectionSize  total size of the section in bytes, header included
								//   pL            language that receives the histogram
								//
								// Returns whatever pL->AddHistogram() returns, or
								// ERROR_INTERNAL_DB_CORRUPTION if pL is NULL or the section is too small
								// to hold its own headers.
								{
								    PFileHistogramSection pHS;

								    if (pL == NULL)
								        return ERROR_INTERNAL_DB_CORRUPTION;

								    // Reject undersized sections up front.  Otherwise the subtraction
								    // below would go negative and, converted to DWORD, look like a huge
								    // table that passes validation.

								    if ( nSectionSize < 0 ||
								         (size_t)nSectionSize < sizeof(FileSection) + sizeof(*pHS) )
								    {
								        return ERROR_INTERNAL_DB_CORRUPTION;
								    }

								    pHS = (PFileHistogramSection)&((char *)pv)[sizeof(FileSection)];

								    // Bytes left for the element table after both headers

								    DWORD nBytes = (DWORD)((size_t)nSectionSize - sizeof(FileSection) - sizeof(*pHS));

								    return pL->AddHistogram ( pHS, nBytes, m_nHistogramsRead++);
								}


								DWORD
								LCDetect::LoadMapSection (void *pv, int nSectionSize)
								//
								// Create a Charmap from the map section at *pv and append it to the
								// charmap table.
								//
								//   pv            start of the section (its FileSection header)
								//   nSectionSize  total size of the section in bytes, header included
								//
								// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION.
								{
								    PFileMapSection pMS;

								    // The map record sits right after the section header; make sure the
								    // section is large enough to contain it before using it.

								    if ( nSectionSize < 0 ||
								         (size_t)nSectionSize < sizeof(FileSection) + sizeof(*pMS) )
								    {
								        return ERROR_INTERNAL_DB_CORRUPTION;
								    }

								    // The header said how many charmaps to expect; don't exceed it.

								    if (m_nMapsRead >= m_nCharmaps)
								        return ERROR_INTERNAL_DB_CORRUPTION;

								    pMS = (PFileMapSection)&((char *)pv)[sizeof(FileSection)];

								    PCharmap pM = new Charmap (pMS);

								    if (pM == NULL)
								        return ERROR_OUTOFMEMORY;

								    m_ppCharmaps[ m_nMapsRead++ ]  = pM;

								    return NO_ERROR;
								}


								DWORD
								LCDetect::BuildState (DWORD nFileSize)
								//
								// Build the detection structures from the mapped training file image at
								// *m_pv, which is nFileSize bytes long.
								//
								// The image is a file header followed by a sequence of sections, each
								// starting with a FileSection header giving its size and type; a section
								// of size zero ends the data.
								//
								// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION.
								// On failure the object may be partly built; the destructor releases it.
								{
								    PLanguage pL = NULL;        // language that owns the histogram sections being read
								    PFileHeader pFH;
								    PFileSection pFS;
								    unsigned int i;

								    DWORD hr = NO_ERROR;

								    // Validate header

								    pFH = (PFileHeader) m_pv;

								    if ( nFileSize < sizeof(*pFH) ||
								         pFH->m_dwAppSig != APP_SIGNATURE ||
								         pFH->m_dwVersion != APP_VERSION ||
								         pFH->m_dwHdrSizeBytes >= nFileSize ||
								         pFH->m_dwN7BitLanguages == 0 ||
								         pFH->m_dwN8BitLanguages == 0 ||
								         pFH->m_dwNUnicodeLanguages == 0 ||
								         pFH->m_dwNCharmaps == 0 )
								    {
								        return ERROR_INTERNAL_DB_CORRUPTION;
								    }

								    // Allocate language pointer tables per header.  Nothing is published
								    // in the members until all four allocations have succeeded, so a
								    // failure here never leaves a count set with its table missing.

								    PLanguage7Bit *pp7Bit = new PLanguage7Bit [pFH->m_dwN7BitLanguages];
								    PLanguage8Bit *pp8Bit = new PLanguage8Bit [pFH->m_dwN8BitLanguages];
								    PLanguageUnicode *ppUnicode = new PLanguageUnicode [pFH->m_dwNUnicodeLanguages];
								    PCharmap *ppMaps = new PCharmap [pFH->m_dwNCharmaps];

								    if ( pp7Bit == NULL || pp8Bit == NULL || ppUnicode == NULL || ppMaps == NULL )
								    {
								        delete [] pp7Bit;
								        delete [] pp8Bit;
								        delete [] ppUnicode;
								        delete [] ppMaps;

								        return ERROR_OUTOFMEMORY;
								    }

								    // Clear every table: not all slots may be assigned (the Unicode table
								    // is indexed by range ID), and a load that fails part way must leave
								    // the unread slots NULL so that cleanup can safely walk the tables.

								    memset (pp7Bit, 0, sizeof(PLanguage7Bit) * pFH->m_dwN7BitLanguages);
								    memset (pp8Bit, 0, sizeof(PLanguage8Bit) * pFH->m_dwN8BitLanguages);
								    memset (ppUnicode, 0, sizeof(PLanguageUnicode) * pFH->m_dwNUnicodeLanguages);
								    memset (ppMaps, 0, sizeof(PCharmap) * pFH->m_dwNCharmaps);

								    m_n7BitLanguages = pFH->m_dwN7BitLanguages;
								    m_pp7BitLanguages = pp7Bit;

								    m_n8BitLanguages = pFH->m_dwN8BitLanguages;
								    m_pp8BitLanguages = pp8Bit;

								    m_nUnicodeLanguages = pFH->m_dwNUnicodeLanguages;
								    m_ppUnicodeLanguages = ppUnicode;

								    m_nCharmaps = pFH->m_dwNCharmaps;
								    m_ppCharmaps = ppMaps;

								    // Remember other header info

								    m_LCDConfigureDefault.nMin7BitScore = pFH->m_dwMin7BitScore;
								    m_LCDConfigureDefault.nMin8BitScore = pFH->m_dwMin8BitScore;
								    m_LCDConfigureDefault.nMinUnicodeScore = pFH->m_dwMinUnicodeScore;
								    m_LCDConfigureDefault.nRelativeThreshhold = pFH->m_dwRelativeThreshhold;
								    m_LCDConfigureDefault.nDocPctThreshhold = pFH->m_dwDocPctThreshhold;
								    m_LCDConfigureDefault.nChunkSize = pFH->m_dwChunkSize;

								    // Position to first section.  Sections are tracked by offset rather
								    // than by pointer so that a bogus section size cannot wrap the bounds
								    // check.  Invariant at the top of the loop: nOffset < nFileSize.

								    DWORD nOffset = pFH->m_dwHdrSizeBytes;

								    // Read and process each file section

								    while ( hr == NO_ERROR ) {

								        pFS = (PFileSection) &((char *)m_pv)[nOffset];

								        // check alignment

								        if (((DWORD_PTR)pFS & 3) != 0) {
								            hr = ERROR_INTERNAL_DB_CORRUPTION;
								            break;
								        }

								        // zero-length section marks end of data

								        if (pFS->m_dwSizeBytes == 0)
								            break;

								        // A section must at least hold its own header, and must end
								        // before the end of the file so that a following section header
								        // (or the terminator) is still inside the image.

								        if ( pFS->m_dwSizeBytes < sizeof(FileSection) ||
								             pFS->m_dwSizeBytes >= nFileSize - nOffset ) {
								            hr = ERROR_INTERNAL_DB_CORRUPTION;
								            break;
								        }

								        switch ( pFS->m_dwType ) {

								        case SECTION_TYPE_LANGUAGE:                             // sets pL
								            pL = NULL;          // stays NULL unless a language is created
								            hr = LoadLanguageSection ((void*)pFS, pFS->m_dwSizeBytes, &pL);
								            m_nHistogramsRead = 0;
								            break;

								        case SECTION_TYPE_HISTOGRAM:                            // uses pL
								            if (pL == NULL)     // histogram with no language to own it
								                hr = ERROR_INTERNAL_DB_CORRUPTION;
								            else
								                hr = LoadHistogramSection ((void*)pFS, pFS->m_dwSizeBytes, pL);
								            break;

								        case SECTION_TYPE_MAP:
								            hr = LoadMapSection ((void*)pFS, pFS->m_dwSizeBytes);
								            break;

								        default:                    // ignore unrecognized sections
								            break;
								        }

								        nOffset += pFS->m_dwSizeBytes;
								    }

								    if (hr != NO_ERROR)
								        return hr;

								    // Everything the header promised must have been read.  The 7-bit and
								    // 8-bit tables are walked in full below, so a missing entry would be
								    // a NULL dereference.  (Unicode slots may legitimately stay empty.)

								    if ( m_nMapsRead != m_nCharmaps ||
								         m_n7BitLangsRead != m_n7BitLanguages ||
								         m_n8BitLangsRead != m_n8BitLanguages )
								    {
								        return ERROR_INTERNAL_DB_CORRUPTION;
								    }

								    // Set up quick-reference arrays used by the scoring inner loops.
								    // NOTE(review): the capacities of m_paHElt7Bit and m_paHElt8Bit are
								    // declared elsewhere and are not checked here -- confirm they cover
								    // the counts a data file can declare.

								    for (i = 0; i < GetN7BitLanguages(); i++)
								    {
								        // A language section with no histogram after it has no table

								        if (Get7BitLanguage(i)->GetLangHistogram() == NULL)
								            return ERROR_INTERNAL_DB_CORRUPTION;

								        m_paHElt7Bit[i] = Get7BitLanguage(i)->GetLangHistogram()->Array();
								    }

								    m_nHElt8Bit = 0;

								    for (i = 0; i < GetN8BitLanguages(); i++)
								    {
								        PLanguage8Bit pL8 = Get8BitLanguage(i);

								        // The per-language histogram table holds MAXSUBLANG entries

								        if (pL8->NCodePages() > MAXSUBLANG)
								            return ERROR_INTERNAL_DB_CORRUPTION;

								        for (int j = 0; j < pL8->NCodePages(); j++)
								        {
								            // Fewer histograms were supplied than the language declared

								            if (pL8->GetHistogram(j) == NULL)
								                return ERROR_INTERNAL_DB_CORRUPTION;

								            m_paHElt8Bit[m_nHElt8Bit++] = pL8->GetHistogram(j)->Array();
								        }
								    }

								    // Set up the Histogram used for ScoreVectorW() for scoring Unicode
								    // text for 7-bit language detection.  Clone the first 7-bit language
								    // histogram and replace its map with CHARMAP_U27BIT.

								    m_pHU27Bit = new Histogram ( *Get7BitLanguage(0)->GetLangHistogram(),
								                                 GetMap(CHARMAP_U27BIT));

								    if (m_pHU27Bit == NULL)
								        return ERROR_OUTOFMEMORY;

								    return NO_ERROR;
								}


								DWORD
								LCDetect::LoadState (void)
								//
								// Overall initialization and state loading.  Open the compiled training
								// file from the directory that contains this module, map it read-only,
								// and assemble the in-memory detection tables from its contents.
								//
								// Returns NO_ERROR on success.  On failure the result is a Win32 error
								// code, E_FAIL, or whatever BuildState() returned.  Handles closed on a
								// failure path are reset so that they are not closed a second time.
								{
								    DWORD hr = NO_ERROR;
								    DWORD nFileSize;
								#define MODULENAMELEN 100
								    char szFilename[MODULENAMELEN+50], *p;

								    // Open the training data file,
								    // look in the directory that contains the DLL.

								    DWORD nLen = GetModuleFileNameA (m_hModule, szFilename, MODULENAMELEN);

								    if (nLen == 0)
								        return GetLastError();

								    // A return equal to the buffer size means the path was truncated; on
								    // older systems the buffer is then not even NUL-terminated.

								    if (nLen >= MODULENAMELEN)
								        return ERROR_INSUFFICIENT_BUFFER;

								    // Strip the module's file name, keeping the trailing separator

								    if ( (p = strrchr (szFilename, '\\')) != NULL ||
								         (p = strrchr (szFilename, ':')) != NULL )
								    {
								        *++p = 0;
								    }
								    else
								        *szFilename = 0;

								    HRESULT hrCat = StringCchCatA(szFilename , ARRAYSIZE(szFilename),  DETECTION_DATA_FILENAME);

								    if (FAILED(hrCat))
								    {
								       return E_FAIL;
								    }

								    HANDLE hFile = CreateFileA (szFilename, GENERIC_READ, FILE_SHARE_READ,
								                    NULL, OPEN_EXISTING,
								                    FILE_ATTRIBUTE_NORMAL, NULL);

								    // Only a real handle is stored in m_hf, never INVALID_HANDLE_VALUE

								    if (hFile == INVALID_HANDLE_VALUE)
								    {
								        return E_FAIL;
								    }

								    m_hf = hFile;

								    // 0xffffffff is GetFileSize's failure value when no high DWORD is requested

								    if ((nFileSize = GetFileSize (m_hf, NULL)) == 0xffffffff) {
								        hr = GetLastError();
								        CloseHandle (m_hf);
								        m_hf = NULL;
								        return hr;
								    }

								    // Virtual-map the file.  The mapping is unnamed, so the ANSI entry
								    // point behaves the same as the wide one and is used unconditionally.

								    m_hmap = CreateFileMappingA (m_hf, NULL, PAGE_READONLY, 0, nFileSize, NULL);

								    if (m_hmap == NULL) {
								        hr = GetLastError();
								        CloseHandle (m_hf);
								        m_hf = NULL;
								        return hr;
								    }

								    if ((m_pv = MapViewOfFile (m_hmap, FILE_MAP_READ, 0, 0, 0 )) == NULL) {
								        hr = GetLastError();
								        CloseHandle (m_hmap);
								        m_hmap = NULL;
								        CloseHandle (m_hf);
								        m_hf = NULL;
								        return hr;
								    }

								    // Build the in-memory structures from the file.  If this fails, the
								    // view and handles stay in the members for the destructor to release.

								    hr = BuildState (nFileSize);

								    return hr;
								}


								/****************************************************************/