windows-server-2003/shell/ext/mlang/lcinit.cpp

/*
 * Automatic language and codepage detector
 * 
 * Bob Powell, 2/97
 * Copyright (C) 1996, 1997, Microsoft Corp.  All rights reserved.
 * 
 *  History:    1-Feb-97    BobP      Created
 *              5-Aug-97    BobP      Unicode support; Charmaps in data file.
 */
#include "private.h"
#include <strsafe.h>
/****************************************************************/


Histogram::Histogram (const PFileHistogramSection pHS, const PHIdx pMap)
: m_nDimensionality((UCHAR)pHS->m_dwDimensionality),
  m_nEdgeSize((UCHAR)pHS->m_dwEdgeSize),
  m_nCodePage((USHORT)pHS->m_dwCodePage),
  m_pMap(pMap),
  m_panElts((HElt *)&pHS[1])    // element table starts right after the header struct
//
// Wrap the raw histogram section at *pHS.  The header values are narrowed
// to the member widths by the casts above, and the element table is
// referenced in place rather than copied.  pMap is stored as given.
{
    // Element count = (edge size) raised to the power (dimensionality)

    m_nElts = 1;

    for (UCHAR nDimsLeft = m_nDimensionality; nDimsLeft != 0; nDimsLeft--)
        m_nElts *= m_nEdgeSize;
}

DWORD
Histogram::Validate (DWORD nBytes) const
//
// Check that this histogram is usable.
//
// nBytes - number of bytes available for the element table
//
// Returns ERROR_INTERNAL_DB_CORRUPTION if the dimensionality is greater
// than 4 or if nBytes is too small to hold m_nElts elements; otherwise
// returns NO_ERROR.
{
    if (m_nDimensionality > 4)
        return ERROR_INTERNAL_DB_CORRUPTION;

    // Compare in units of elements rather than bytes, so the product
    // m_nElts * sizeof(HElt) is never formed and cannot wrap around.

    if (m_nElts > nBytes / sizeof(HElt))
        return ERROR_INTERNAL_DB_CORRUPTION;

    return NO_ERROR;
}

Histogram::Histogram (const Histogram &H, const PHIdx pMap)
: m_nDimensionality(H.m_nDimensionality),
  m_nEdgeSize(H.m_nEdgeSize),
  m_nCodePage(H.m_nCodePage),
  m_nElts(H.m_nElts),
  m_pMap(pMap),
  m_panElts(H.m_panElts)
//
// Clone a histogram but use a different Charmap.
//
// Every member is copied from H except m_pMap, which is taken from the
// pMap argument.  The element table pointer is copied, not the table
// itself, so the clone refers to the same elements as H.
{
}

Histogram::~Histogram (void)
//
// The pointer members point to the mapped file and do not need to be freed.
// Nothing is released here: m_pMap and m_panElts are left untouched.
{
}

/****************************************************************/

Language::Language (PLCDetect pL, int nLangID, int nCodePages, int nRangeID)
: m_pLC(pL),
  m_nLangID(nLangID),
  m_nCodePages(nCodePages),
  m_nRangeID(nRangeID)
//
// Record the owning detector (pL), the language ID, the code page count
// and the range ID.  The arguments are stored as given; nothing is
// validated here.
{
}

Language7Bit::Language7Bit (PLCDetect pL, int nLangID, int nCodePages)
: Language(pL, nLangID, nCodePages),
  m_pLangHistogram(NULL)
//
// Create a 7-bit language with no histograms attached yet.
{
    // Start with every code page histogram slot empty

    const size_t nSlots = sizeof(m_ppCodePageHistogram) / sizeof(m_ppCodePageHistogram[0]);

    for (size_t iSlot = 0; iSlot < nSlots; iSlot++)
        m_ppCodePageHistogram[iSlot] = NULL;
}

Language7Bit::~Language7Bit (void)
//
// Release the language histogram and every code page histogram held by
// this object.  Empty slots are NULL, and deleting NULL does nothing.
{
    delete m_pLangHistogram;

    int iSlot = 0;

    while (iSlot < MAXSUBLANG)
    {
        delete m_ppCodePageHistogram[iSlot];
        iSlot++;
    }
}

DWORD
Language7Bit::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
//
// Add the raw histogram at *pHS in the mapped file to this language object.  
// The histograms must be for 7-bit detection.
//
// pHS    - histogram section header; the element table follows it
// nBytes - number of bytes available for the element table
// nIdx   - zero-based position of this histogram within the language:
//          0 is the language-detection table, and 1..m_nCodePages are the
//          code page detection tables (stored in slot nIdx - 1)
//
// Returns NO_ERROR on success, ERROR_INTERNAL_DB_CORRUPTION for an index
// that names no slot, ERROR_OUTOFMEMORY if the Histogram cannot be
// allocated, or whatever Histogram::Validate() reports.  The histogram is
// attached to this object only when it validates; otherwise it is freed.
{
    // Reject bad indices before allocating anything.  The slot must exist
    // both logically (m_nCodePages) and physically (the fixed-size array).

    if (nIdx < 0)
        return ERROR_INTERNAL_DB_CORRUPTION;

    if (nIdx > 0 &&
        (nIdx - 1 >= m_nCodePages ||
         (size_t)(nIdx - 1) >= ARRAYSIZE(m_ppCodePageHistogram)))
    {
        return ERROR_INTERNAL_DB_CORRUPTION;
    }

    PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );

    Histogram *pH = new Histogram (pHS, pMap);

    if (pH == NULL)
        return ERROR_OUTOFMEMORY;

    DWORD hr = pH->Validate (nBytes);

    if (hr != NO_ERROR)
    {
        delete pH;              // not attached anywhere yet, so free it here
        return hr;
    }

    if (nIdx == 0)
    {
        // The first histogram for a language is its language-detection table.

        delete m_pLangHistogram;    // normally NULL; avoids a leak on a repeat
        m_pLangHistogram = pH;
    }
    else
    {
        // Each subsequent histogram is a code page detection table.

        delete m_ppCodePageHistogram[nIdx - 1];     // normally NULL
        m_ppCodePageHistogram[nIdx - 1] = pH;

        // Cache for the scoring vector math
        // NOTE(review): m_paHElt is assumed to have at least as many slots
        // as m_ppCodePageHistogram -- confirm against the class declaration.

        m_paHElt[nIdx - 1] = pH->Array();
    }

    return NO_ERROR;
}

/****************************************************************/

Language8Bit::Language8Bit (PLCDetect pL, int nLangID, int nCodePages)
: Language(pL, nLangID, nCodePages)
//
// Create an 8-bit language with no histograms attached yet.
{
    // Start with every histogram slot empty

    const size_t nSlots = sizeof(m_ppHistogram) / sizeof(m_ppHistogram[0]);

    for (size_t iSlot = 0; iSlot < nSlots; iSlot++)
        m_ppHistogram[iSlot] = NULL;
}

Language8Bit::~Language8Bit (void)
//
// Release every histogram held by this object.  Empty slots are NULL,
// and deleting NULL does nothing.
{
    int iSlot = 0;

    while (iSlot < MAXSUBLANG)
    {
        delete m_ppHistogram[iSlot];
        iSlot++;
    }
}

DWORD
Language8Bit::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
//
// Add the raw histogram at *pHS to this language object.  
// This language is known to use 8-bit detection, so every histogram is a
// direct language / code page table stored in slot nIdx.
//
// pHS    - histogram section header; the element table follows it
// nBytes - number of bytes available for the element table
// nIdx   - zero-based position of this histogram within the language
//
// Returns NO_ERROR on success, ERROR_INTERNAL_DB_CORRUPTION for an index
// that names no slot, ERROR_OUTOFMEMORY if the Histogram cannot be
// allocated, or whatever Histogram::Validate() reports.  The histogram is
// attached to this object only when it validates; otherwise it is freed.
{
    // The slot must exist both logically (m_nCodePages) and physically
    // (the fixed-size array); check before allocating anything.

    if (nIdx < 0 ||
        nIdx >= m_nCodePages ||
        (size_t)nIdx >= ARRAYSIZE(m_ppHistogram))
    {
        return ERROR_INTERNAL_DB_CORRUPTION;
    }

    PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );

    Histogram *pH = new Histogram (pHS, pMap);

    if (pH == NULL)
        return ERROR_OUTOFMEMORY;

    DWORD hr = pH->Validate (nBytes);

    if (hr != NO_ERROR)
    {
        delete pH;              // not attached anywhere yet, so free it here
        return hr;
    }

    delete m_ppHistogram[nIdx];     // normally NULL; avoids a leak on a repeat
    m_ppHistogram[nIdx] = pH;

    return NO_ERROR;
}

/****************************************************************/

LanguageUnicode::LanguageUnicode (PLCDetect pL, int nLangID, 
    int nSubLangs, int nRangeID)
: Language(pL, nLangID, nSubLangs, nRangeID)
//
// Create a Unicode language group with no sublanguage histograms attached
// yet.  nSubLangs is handed to the Language base in its nCodePages position.
{
    // Start with every sublanguage histogram slot empty

    const size_t nSlots = sizeof(m_ppSubLangHistogram) / sizeof(m_ppSubLangHistogram[0]);

    for (size_t iSlot = 0; iSlot < nSlots; iSlot++)
        m_ppSubLangHistogram[iSlot] = NULL;
}

LanguageUnicode::~LanguageUnicode (void)
//
// Release every sublanguage histogram held by this object.  Empty slots
// are NULL, and deleting NULL does nothing.
{
    int iSlot = 0;

    while (iSlot < MAXSUBLANG)
    {
        delete m_ppSubLangHistogram[iSlot];
        iSlot++;
    }
}

DWORD
LanguageUnicode::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
//
// Add the raw histogram at *pHS to this language object.  All histograms
// of a Unicode language are sublanguage detection tables, stored in slot
// nIdx.
//
// pHS    - histogram section header; the element table follows it
// nBytes - number of bytes available for the element table
// nIdx   - zero-based position of this histogram within the language
//
// Returns NO_ERROR on success, ERROR_INTERNAL_DB_CORRUPTION for an index
// that names no slot, ERROR_OUTOFMEMORY if the Histogram cannot be
// allocated, or whatever Histogram::Validate() reports.  The histogram is
// attached to this object only when it validates; otherwise it is freed.
{
    // The slot must exist both logically (m_nSubLangs) and physically
    // (the fixed-size array); check before allocating anything.

    if (nIdx < 0 ||
        nIdx >= m_nSubLangs ||
        (size_t)nIdx >= ARRAYSIZE(m_ppSubLangHistogram))
    {
        return ERROR_INTERNAL_DB_CORRUPTION;
    }

    // Get the custom charmap used for scoring this sublanguage group

    PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );

    Histogram *pH = new Histogram (pHS, pMap);

    if (pH == NULL)
        return ERROR_OUTOFMEMORY;

    DWORD hr = pH->Validate (nBytes);

    if (hr != NO_ERROR)
    {
        delete pH;              // not attached anywhere yet, so free it here
        return hr;
    }

    delete m_ppSubLangHistogram[nIdx];  // normally NULL; avoids a leak on a repeat
    m_ppSubLangHistogram[nIdx] = pH;

    // Cache the element table pointer alongside the histogram
    // NOTE(review): m_paHElt is assumed to have at least as many slots as
    // m_ppSubLangHistogram -- confirm against the class declaration.

    m_paHElt[nIdx] = pH->Array();

    return NO_ERROR;
}

/****************************************************************/

LCDetect::LCDetect (HMODULE hM)
: m_hModule(hM),
  m_nCharmaps(0),
  m_n7BitLanguages(0),
  m_n8BitLanguages(0),
  m_nUnicodeLanguages(0),
  m_n7BitLangsRead(0),
  m_n8BitLangsRead(0),
  m_nUnicodeLangsRead(0),
  m_nMapsRead(0),
  m_nHistogramsRead(0),
  m_nScoreIdx(0),
  m_pp7BitLanguages(NULL),
  m_pp8BitLanguages(NULL),
  m_ppUnicodeLanguages(NULL),
  m_ppCharmaps(NULL),
  m_pv(NULL),
  m_hmap(0),
  m_hf(0),
  m_pHU27Bit(0)
//
// Create an empty detector.  hM is remembered as the module whose file
// name LoadState() uses to locate the training data file.  All counts
// start at zero, and all tables, the file view and both handles start
// out null; nothing is opened or allocated here.
{
}

LCDetect::~LCDetect ()
//
// Free everything BuildState() created, then unmap and close the
// training data file.  Safe to run after a failed or partial load.
{
    unsigned int i;     // declared once; does not rely on for-scope leakage

    delete m_pHU27Bit;

    // Table entries are only ever filled in after all four pointer tables
    // have been allocated, so the entries are examined only in that case.
    // For the 7-bit, 8-bit and charmap tables only the first "Read" entries
    // were ever assigned.  The Unicode table is indexed by range ID and is
    // cleared once all four tables exist, so every slot of it is either
    // NULL or a valid object.

    if ( m_pp7BitLanguages != NULL &&
         m_pp8BitLanguages != NULL &&
         m_ppUnicodeLanguages != NULL &&
         m_ppCharmaps != NULL )
    {
        for (i = 0; i < m_n7BitLangsRead; i++)
            delete m_pp7BitLanguages[i];

        for (i = 0; i < m_n8BitLangsRead; i++)
            delete m_pp8BitLanguages[i];

        for (i = 0; i < m_nUnicodeLanguages; i++)
            delete m_ppUnicodeLanguages[i];

        for (i = 0; i < m_nMapsRead; i++)
            delete m_ppCharmaps[i];
    }

    // The tables were allocated with new[], so release them with delete[].
    // delete[] on NULL does nothing.

    delete [] m_pp7BitLanguages;
    delete [] m_pp8BitLanguages;
    delete [] m_ppUnicodeLanguages;
    delete [] m_ppCharmaps;

    if (m_pv)
        UnmapViewOfFile (m_pv);

    // Only close handles that were actually opened

    if (m_hmap != NULL)
        CloseHandle (m_hmap);

    if (m_hf != NULL && m_hf != INVALID_HANDLE_VALUE)
        CloseHandle (m_hf);
}

DWORD
LCDetect::Initialize7BitLanguage (PFileLanguageSection pLS, PLanguage *ppL)
//
// Set *ppL to the Language object created from this section.
//
// pLS - language section describing a 7-bit language
// ppL - receives the new Language7Bit on success; untouched on failure
//
// Returns NO_ERROR, ERROR_INTERNAL_DB_CORRUPTION if the section is
// inconsistent with the header or the detector state, or
// ERROR_OUTOFMEMORY.  All checks run before the object is allocated, so
// no failure path leaks it.
{
    // nRecordCount is lang histogram (1) + # of code page histograms

    if ( m_n7BitLangsRead >= m_n7BitLanguages || pLS->m_dwRecordCount < 1)
        return ERROR_INTERNAL_DB_CORRUPTION;

    // The code page histograms live in fixed arrays walked up to MAXSUBLANG
    // slots, so a larger count cannot be represented.

    const DWORD nCodePages = pLS->m_dwRecordCount - 1;

    if (nCodePages > (DWORD)MAXSUBLANG)
        return ERROR_INTERNAL_DB_CORRUPTION;

    // Each 7-bit lang uses one score index slot per code page.
    // The range starts with the 7-bit langs, since both the 8-bit
    // and Unicode langs follow it.

    if (m_n7BitLangsRead == 0 && m_nScoreIdx != 0)
        return ERROR_INTERNAL_DB_CORRUPTION;

    PLanguage7Bit pL = new Language7Bit (this, pLS->m_dwLangID, nCodePages);

    if (pL == NULL)
        return ERROR_OUTOFMEMORY;

    pL->SetScoreIdx(m_nScoreIdx);

    m_nScoreIdx += nCodePages;      // skip 1st record (Language)

    m_pp7BitLanguages[ m_n7BitLangsRead++ ] = pL;

    *ppL = pL;

    return NO_ERROR;
}

DWORD
LCDetect::Initialize8BitLanguage (PFileLanguageSection pLS, Language **ppL)
//
// Set *ppL to the Language object created from this section.
//
// pLS - language section describing an 8-bit language
// ppL - receives the new Language8Bit on success; untouched on failure
//
// Returns NO_ERROR, ERROR_INTERNAL_DB_CORRUPTION if the section is
// inconsistent with the header, or ERROR_OUTOFMEMORY.
{
    // nRecordCount is # of combined language / code page histograms

    if ( m_n8BitLangsRead >= m_n8BitLanguages || pLS->m_dwRecordCount < 1)
        return ERROR_INTERNAL_DB_CORRUPTION;

    // The histograms live in a fixed array walked up to MAXSUBLANG slots,
    // so a larger count cannot be represented.

    if (pLS->m_dwRecordCount > (DWORD)MAXSUBLANG)
        return ERROR_INTERNAL_DB_CORRUPTION;

    PLanguage8Bit pL = new Language8Bit (this, pLS->m_dwLangID, pLS->m_dwRecordCount);

    if (pL == NULL)
        return ERROR_OUTOFMEMORY;

    // The 8-bit score indices follow the 7-bit languages

    // Each 8-bit lang uses a score index slot for each of its code pages,
    // since all the code pages are scored in the initial scoring pass.
    // The number of slots equals the number of histogram records that
    // follow this language section.

    pL->SetScoreIdx(m_nScoreIdx);
    m_nScoreIdx += pLS->m_dwRecordCount;

    m_pp8BitLanguages[ m_n8BitLangsRead++ ] = pL;

    *ppL = pL;

    return NO_ERROR;
}

DWORD
LCDetect::InitializeUnicodeLanguage (PFileLanguageSection pLS, Language **ppL)
//
// Set *ppL to the Language object created from this section.
//
// pLS - language section describing a Unicode language group
// ppL - receives the new LanguageUnicode on success; untouched on failure
//
// Returns NO_ERROR, ERROR_INTERNAL_DB_CORRUPTION if the section is
// inconsistent with the header, or ERROR_OUTOFMEMORY.
{
    // nRecordCount is # of sublanguage histograms

    if ( m_nUnicodeLangsRead >= m_nUnicodeLanguages ||
         pLS->m_dwUnicodeRangeID >= m_nUnicodeLanguages )
    {
        return ERROR_INTERNAL_DB_CORRUPTION;
    }

    // The sublanguage histograms live in a fixed array walked up to
    // MAXSUBLANG slots, so a larger count cannot be represented.

    if (pLS->m_dwRecordCount > (DWORD)MAXSUBLANG)
        return ERROR_INTERNAL_DB_CORRUPTION;

    PLanguageUnicode pL = new LanguageUnicode (this, pLS->m_dwLangID, 
                        pLS->m_dwRecordCount, pLS->m_dwUnicodeRangeID);

    if (pL == NULL)
        return ERROR_OUTOFMEMORY;


    // The Unicode score indices follow the 7-bit languages, and overlay the
    // 8-bit slots since they aren't used at the same time.

    if (m_nUnicodeLangsRead == 0 && GetN8BitLanguages() > 0)
        m_nScoreIdx = Get8BitLanguage(0)->GetScoreIdx();

    // Each Unicode entry uses exactly one score index.  SBCS subdetection
    // (Latin group) uses the slots for the corresponding 7-bit languages,
    // and Unicode subdetection (CJK) uses the slots already defined for the
    // Unicode sub-languages.

    pL->SetScoreIdx(m_nScoreIdx);

    m_nScoreIdx++;

    // For Unicode, the range ID is used as the Language array index.
    // If an earlier section already claimed this range ID, free that object
    // before replacing it so it is not leaked.  The table is cleared when
    // it is allocated, so an unclaimed slot is NULL.

    delete m_ppUnicodeLanguages[ pLS->m_dwUnicodeRangeID ];

    m_ppUnicodeLanguages[ pLS->m_dwUnicodeRangeID ] = pL;
    m_nUnicodeLangsRead++;

    *ppL = pL;

    return NO_ERROR;
}

DWORD
LCDetect::LoadLanguageSection (void *pv, int nSectionSize, PLanguage *ppL)
//
// A language section begins the definition of data for a language.
// Each language has exactly one of these records.  One or more
// histogram sections follow each language, and are always associated
// with the language of the preceding language section.
//
// Set *ppL to the Language object created from this section.
//
// pv           - start of the section, i.e. its FileSection header
// nSectionSize - total size of the section in bytes, header included
// ppL          - receives the new Language on success; untouched on failure
//
// Returns NO_ERROR, ERROR_INTERNAL_DB_CORRUPTION if the section is too
// small or names an unknown detection type, or the error reported by the
// type-specific initializer.
{
    DWORD hr = NO_ERROR;

    PFileLanguageSection pLS;

    pLS = (PFileLanguageSection)&((char *)pv)[sizeof(FileSection)];

    // The language record must lie entirely inside this section

    if ( nSectionSize < 0 ||
         (size_t)nSectionSize < sizeof(FileSection) + sizeof(*pLS) )
    {
        return ERROR_INTERNAL_DB_CORRUPTION;
    }

    switch ( pLS->m_dwDetectionType ) {

    case DETECT_7BIT:
        hr = Initialize7BitLanguage (pLS, ppL);
        break;

    case DETECT_8BIT:
        hr = Initialize8BitLanguage (pLS, ppL);
        break;

    case DETECT_UNICODE:
        hr = InitializeUnicodeLanguage (pLS, ppL);
        break;

    default:
        // An unknown type creates no Language, so *ppL would not be set.
        // Fail instead of reporting success with no object.
        hr = ERROR_INTERNAL_DB_CORRUPTION;
        break;
    }

    return hr;
}

DWORD
LCDetect::LoadHistogramSection (void *pv, int nSectionSize, Language *pL)
//
// Attach the histogram section at *pv to the language pL.
//
// pv           - start of the section, i.e. its FileSection header
// nSectionSize - total size of the section in bytes, header included
// pL           - language that receives the histogram
//
// Returns ERROR_INTERNAL_DB_CORRUPTION if there is no language to attach
// to or the section is too small to hold its headers; otherwise returns
// the result of Language::AddHistogram().
{
    PFileHistogramSection pHS;

    if (pL == NULL)
        return ERROR_INTERNAL_DB_CORRUPTION;

    pHS = (PFileHistogramSection)&((char *)pv)[sizeof(FileSection)];

    // Both headers must fit.  Without this check the subtraction below
    // would wrap to a huge byte count that passes table validation.

    const size_t cbHeaders = sizeof(FileSection) + sizeof(*pHS);

    if (nSectionSize < 0 || (size_t)nSectionSize < cbHeaders)
        return ERROR_INTERNAL_DB_CORRUPTION;

    // Bytes left over for the element table that follows the headers

    const DWORD nBytes = (DWORD)((size_t)nSectionSize - cbHeaders);

    return pL->AddHistogram ( pHS, nBytes, m_nHistogramsRead++);
}

DWORD
LCDetect::LoadMapSection (void *pv, int nSectionSize)
//
// Create a Charmap from the map section at *pv and append it to the
// charmap table.
//
// pv           - start of the section, i.e. its FileSection header
// nSectionSize - total size of the section in bytes, header included
//
// Returns NO_ERROR, ERROR_INTERNAL_DB_CORRUPTION if the section is too
// small for its headers or the header-declared number of charmaps has
// already been read, or ERROR_OUTOFMEMORY.
{
    PFileMapSection pMS;

    pMS = (PFileMapSection)&((char *)pv)[sizeof(FileSection)];

    // The map header must lie entirely inside this section
    // NOTE(review): the Charmap constructor is not visible from here; it
    // is not known whether it checks the map data that follows the header.

    if ( nSectionSize < 0 ||
         (size_t)nSectionSize < sizeof(FileSection) + sizeof(*pMS) )
    {
        return ERROR_INTERNAL_DB_CORRUPTION;
    }

    if (m_nMapsRead >= m_nCharmaps)
        return ERROR_INTERNAL_DB_CORRUPTION;

    PCharmap pM = new Charmap (pMS);

    if (pM == NULL)
        return ERROR_OUTOFMEMORY;

    m_ppCharmaps[ m_nMapsRead++ ]  = pM;

    return NO_ERROR;
}

DWORD
LCDetect::BuildState (DWORD nFileSize)
//
// Build the detection structures from the mapped training file image at *m_pv
//
// nFileSize - size in bytes of the mapped image
//
// Returns NO_ERROR on success, ERROR_INTERNAL_DB_CORRUPTION if the image is
// malformed, ERROR_OUTOFMEMORY on allocation failure, or the error returned
// by a section loader.  On failure, whatever was built so far is left in
// place for the destructor to release.
{
    PLanguage pL = NULL;    // language that owns the histogram sections being read
    PFileHeader pFH;
    PFileSection pFS;
    unsigned int i;         // declared once; does not rely on for-scope leakage

    DWORD hr = NO_ERROR;

    // Validate header

    pFH = (PFileHeader) m_pv;

    if ( pFH == NULL ||
         nFileSize < sizeof(*pFH) || 
         pFH->m_dwAppSig != APP_SIGNATURE ||
         pFH->m_dwVersion != APP_VERSION ||
         pFH->m_dwHdrSizeBytes >= nFileSize ||
         pFH->m_dwN7BitLanguages == 0 ||
         pFH->m_dwN8BitLanguages == 0 ||
         pFH->m_dwNUnicodeLanguages == 0 ||
         pFH->m_dwNCharmaps == 0 )
    {
        return ERROR_INTERNAL_DB_CORRUPTION;
    }

    // Allocate language pointer tables per header.  Each table is cleared
    // as soon as it exists, and its count is published only afterwards, so
    // a non-zero count always describes an allocated table whose unassigned
    // slots are NULL.

    m_pp7BitLanguages = new PLanguage7Bit [pFH->m_dwN7BitLanguages];
    if (m_pp7BitLanguages == NULL)
        return ERROR_OUTOFMEMORY;
    memset (m_pp7BitLanguages, 0, sizeof(PLanguage7Bit) * pFH->m_dwN7BitLanguages);
    m_n7BitLanguages = pFH->m_dwN7BitLanguages;

    m_pp8BitLanguages = new PLanguage8Bit [pFH->m_dwN8BitLanguages];
    if (m_pp8BitLanguages == NULL)
        return ERROR_OUTOFMEMORY;
    memset (m_pp8BitLanguages, 0, sizeof(PLanguage8Bit) * pFH->m_dwN8BitLanguages);
    m_n8BitLanguages = pFH->m_dwN8BitLanguages;

    // Not all Unicode slots may be assigned, so this clear is required
    m_ppUnicodeLanguages = new PLanguageUnicode [pFH->m_dwNUnicodeLanguages];
    if (m_ppUnicodeLanguages == NULL)
        return ERROR_OUTOFMEMORY;
    memset (m_ppUnicodeLanguages, 0, sizeof(PLanguageUnicode) * pFH->m_dwNUnicodeLanguages);
    m_nUnicodeLanguages = pFH->m_dwNUnicodeLanguages;

    m_ppCharmaps = new PCharmap [pFH->m_dwNCharmaps];
    if (m_ppCharmaps == NULL)
        return ERROR_OUTOFMEMORY;
    memset (m_ppCharmaps, 0, sizeof(PCharmap) * pFH->m_dwNCharmaps);
    m_nCharmaps = pFH->m_dwNCharmaps;

    // Remember other header info

    m_LCDConfigureDefault.nMin7BitScore = pFH->m_dwMin7BitScore;
    m_LCDConfigureDefault.nMin8BitScore = pFH->m_dwMin8BitScore;
    m_LCDConfigureDefault.nMinUnicodeScore = pFH->m_dwMinUnicodeScore;
    m_LCDConfigureDefault.nRelativeThreshhold = pFH->m_dwRelativeThreshhold;
    m_LCDConfigureDefault.nDocPctThreshhold = pFH->m_dwDocPctThreshhold;
    m_LCDConfigureDefault.nChunkSize = pFH->m_dwChunkSize;

    // Position to first section.  The position is tracked as a byte offset
    // that is always < nFileSize, so bounds checks never depend on pointer
    // arithmetic that could wrap.

    DWORD cbOffset = pFH->m_dwHdrSizeBytes;

    // Read and process each file section

    while ( hr == NO_ERROR ) {

        pFS = (PFileSection) &((char *)m_pv)[cbOffset];

        // check alignment

        if (((DWORD_PTR)pFS & 3) != 0) {
            hr = ERROR_INTERNAL_DB_CORRUPTION;
            break;
        }

        // The size field itself must be inside the file before it is read

        const DWORD cbLeft = nFileSize - cbOffset;
        const size_t cbThroughSize =
            (size_t)((const char *)(&pFS->m_dwSizeBytes + 1) - (const char *)pFS);

        if (cbLeft < cbThroughSize) {
            hr = ERROR_INTERNAL_DB_CORRUPTION;
            break;
        }

        // zero-length section marks end of data

        const DWORD cbSection = pFS->m_dwSizeBytes;

        if (cbSection == 0)
            break;

        // A section must hold at least its own header, must end before the
        // end of the file (a terminating section has to follow it), and
        // must be representable as the int size the loaders take.

        if ( cbSection < sizeof(FileSection) ||
             cbSection >= cbLeft ||
             cbSection > 0x7FFFFFFF )
        {
            hr = ERROR_INTERNAL_DB_CORRUPTION;
            break;
        }

        switch ( pFS->m_dwType ) {

        case SECTION_TYPE_LANGUAGE:                             // sets pL
            pL = NULL;          // stays NULL if the loader creates nothing
            hr = LoadLanguageSection ((void*)pFS, (int)cbSection, &pL);
            m_nHistogramsRead = 0;
            break;

        case SECTION_TYPE_HISTOGRAM:                            // uses pL
            if (pL == NULL)     // histogram with no language to own it
                hr = ERROR_INTERNAL_DB_CORRUPTION;
            else
                hr = LoadHistogramSection ((void*)pFS, (int)cbSection, pL);
            break;

        case SECTION_TYPE_MAP:
            hr = LoadMapSection ((void*)pFS, (int)cbSection);
            break;

        default:                    // ignore unrecognized sections
            break;
        }

        cbOffset += cbSection;      // still < nFileSize per the check above
    }

    if (hr != NO_ERROR)
        return hr;

    // Every charmap and every 7-bit / 8-bit language promised by the header
    // must have been read, since the loops below visit all of them.
    // (Unicode slots are indexed by range ID and may legitimately be empty.)

    if ( m_nMapsRead != m_nCharmaps ||
         m_n7BitLangsRead != m_n7BitLanguages ||
         m_n8BitLangsRead != m_n8BitLanguages )
    {
        return ERROR_INTERNAL_DB_CORRUPTION;
    }


    // Set up quick-reference arrays used by the scoring inner loops
    // NOTE(review): the capacities of m_paHElt7Bit and m_paHElt8Bit are not
    // visible from here; confirm they can hold the counts read from the file.

    for (i = 0; i < GetN7BitLanguages(); i++)
    {
        // A language whose histogram sections were missing has no table

        if (Get7BitLanguage(i)->GetLangHistogram() == NULL)
            return ERROR_INTERNAL_DB_CORRUPTION;

        m_paHElt7Bit[i] = Get7BitLanguage(i)->GetLangHistogram()->Array();
    }

    m_nHElt8Bit = 0;
    for (i = 0; i < GetN8BitLanguages(); i++) 
    {
        PLanguage8Bit pL8 = Get8BitLanguage(i);

        for (int j = 0; j < pL8->NCodePages(); j++)
        {
            if (pL8->GetHistogram(j) == NULL)
                return ERROR_INTERNAL_DB_CORRUPTION;

            m_paHElt8Bit[m_nHElt8Bit++] = pL8->GetHistogram(j)->Array();
        }
    }

    // Set up the Histogram used for ScoreVectorW() for scoring Unicode
    // text for 7-bit language detection.  Clone the first 7-bit language
    // histogram and replace its map with CHARMAP_U27BIT.

    m_pHU27Bit = new Histogram ( *Get7BitLanguage(0)->GetLangHistogram(),
                                 GetMap(CHARMAP_U27BIT));

    if (m_pHU27Bit == NULL)
        return ERROR_OUTOFMEMORY;

    return NO_ERROR;
}


DWORD
LCDetect::LoadState (void)
//
// Overall initialization and state loading.  Open the compiled training
// file from the directory that contains this module, map it into memory,
// and assemble the in-memory detection tables from its contents.
//
// Returns NO_ERROR on success.  On failure returns E_FAIL (file name could
// not be built or the file could not be opened), a GetLastError() code
// from the failing Win32 call, or the error from BuildState().
//
// Any handle closed here on a failure path is also reset to NULL, so later
// cleanup does not close it a second time.  If BuildState() fails, the view
// and handles stay open and are left for the destructor.
{
    DWORD hr = NO_ERROR;
    DWORD nFileSize;
#define MODULENAMELEN 100
    char szFilename[MODULENAMELEN+50], *p;

    // Find out if NT or Windows

    OSVERSIONINFOA OSVersionInfo;
    int nOSWinNT = 0;
    OSVersionInfo.dwOSVersionInfoSize = sizeof( OSVERSIONINFOA );
    if ( GetVersionExA( &OSVersionInfo ) )
        nOSWinNT = OSVersionInfo.dwPlatformId;

    // Open the training data file,
    // look in the directory that contains the DLL.

    if (GetModuleFileNameA (m_hModule, szFilename, MODULENAMELEN) == 0)
        return GetLastError();

    // On truncation some Windows versions do not null-terminate the buffer.
    // At most MODULENAMELEN characters were written and the buffer is
    // larger than that, so terminating here makes the string safe to scan.

    szFilename[MODULENAMELEN] = 0;

    // Keep only the directory part, including its trailing separator

    if ( (p = strrchr (szFilename, '\\')) != NULL ||
         (p = strrchr (szFilename, ':')) != NULL )
    {
        *++p = 0;
    }
    else
        *szFilename = 0;

    hr = StringCchCatA(szFilename , ARRAYSIZE(szFilename),  DETECTION_DATA_FILENAME);
    if (!SUCCEEDED(hr))
    {
       return E_FAIL;
    }

    if ((m_hf = CreateFileA (szFilename, GENERIC_READ, FILE_SHARE_READ, 
                    NULL, OPEN_EXISTING, 
                    FILE_ATTRIBUTE_NORMAL, NULL)) == INVALID_HANDLE_VALUE) 
    {
        m_hf = NULL;        // "no file" is NULL, as set by the constructor
        return E_FAIL;
    }

    if ((nFileSize = GetFileSize (m_hf, NULL)) == 0xffffffff) {
        hr = GetLastError();
        CloseHandle (m_hf);
        m_hf = NULL;
        return hr;
    }

    // Virtual-map the file

    if ( nOSWinNT == VER_PLATFORM_WIN32_NT )
        m_hmap = CreateFileMapping (m_hf, NULL, PAGE_READONLY, 0, nFileSize, NULL);
    else
        m_hmap = CreateFileMappingA (m_hf, NULL, PAGE_READONLY, 0, nFileSize, NULL);

    if (m_hmap == NULL) {
        hr = GetLastError();
        CloseHandle (m_hf);
        m_hf = NULL;
        return hr;
    }

    if ((m_pv = MapViewOfFile (m_hmap, FILE_MAP_READ, 0, 0, 0 )) == NULL) {
        hr = GetLastError();
        CloseHandle (m_hmap);
        m_hmap = NULL;
        CloseHandle (m_hf);
        m_hf = NULL;
        return hr;
    }
        
    // Build the in-memory structures from the file

    hr = BuildState (nFileSize);

    return hr;
}

/****************************************************************/