mirror of https://github.com/tongzx/nt5src
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1278 lines
36 KiB
1278 lines
36 KiB
/*
 * Automatic language and codepage detector
 *
 * Copyright (C) 1996, 1997, Microsoft Corp.  All rights reserved.
 *
 *  History:    1-Feb-97    BobP    Created
 *              5-Aug-97    BobP    Added Unicode support and rewrote
 *                                  scoring to use vector math.
 *
 * This is the runtime detector.
 *
 * See the comments in lcdcomp.cpp for a description of the compilation
 * process and training data format.
 *
 * See design.txt for a description of the detection and scoring algorithm.
 *
 * Performance note:  60-80% of execution time in this code is AddVector(),
 * which is probably memory-cycle bound by its random data access, but is
 * still a candidate for further optimizing with an intrinsic vector operator,
 * should one become available.
 *
 * to-do (as needed):
 * - Adjust 7-bit and 8-bit scores to make them more comparable
 * - detect UTF-8 in the SBCS entry point, via heuristic and via
 *   subdetection as 7-bit lang and as Unicode.
 */
|
|
|
|
#include "private.h"
|
|
|
|
// This is all the global (per-process) state
|
|
//
|
|
// It is set at DLL process init and its contents are const after that.
|
|
|
|
LCDetect * g_pLCDetect;
|
|
|
|
#ifdef DEBUG_LCDETECT
|
|
int g_fDebug;
|
|
#endif
|
|
|
|
/****************************************************************/
|
|
|
|
static inline unsigned int
FindHighIdx (const int *pn, unsigned int n)
//
// Return the INDEX of the highest-valued integer in the given array.
//
//  pn  - array of n values to search; values may be negative.
//  n   - number of elements in pn.
//
// Ties are resolved in favor of the lowest index.  An empty array (n == 0)
// yields 0 without reading pn.
//
// The running maximum is seeded from pn[0] rather than from 0, so that an
// array containing negative values still yields the index of its true
// maximum instead of defaulting to index 0.
{
    if (n == 0)
        return 0;

    unsigned int nIdx = 0;
    int nMax = pn[0];

    for (unsigned int i = 1; i < n; i++)
    {
        // Strict comparison keeps the first of several equal maxima.
        if (pn[i] > nMax)
        {
            nMax = pn[i];
            nIdx = i;
        }
    }

    return nIdx;
}
|
|
|
|
/****************************************************************/
|
|
|
|
void
|
|
CScores::SelectCodePages (void)
|
|
//
|
|
// Find the highest scoring code page for each language, and remove
|
|
// all the other scores from the array such that the array contains
|
|
// exactly one score per detected language instead of one score per
|
|
// code page per language.
|
|
//
|
|
// When multiple scores are present for different code pages of the same
|
|
// language, this function combines the scores into a single score.
|
|
// The resulting entry will have the code page of the top-scoring code page
|
|
// for the various entries for that language, and the score and char count
|
|
// will be the SUM of the scores and char counts for ALL the entries for
|
|
// that language.
|
|
//
|
|
// For example, if the input contains:
|
|
// Lang Codepage Score Char count
|
|
// Russian 1251 42 200
|
|
// Russian 20866 69 300
|
|
//
|
|
// Then on output, the array will contain only one score for Russian:
|
|
// Russian 20866 111 500
|
|
//
|
|
// This overwrites the entries in place, and sets m_nUsed to the resulting
|
|
// number of active slots.
|
|
//
|
|
// The scores are already grouped by language, no need to sort by language.
|
|
//
|
|
// After return, the score array must NOT be referenced via ScoreIdx()
|
|
// because the index of the entries has changed.
|
|
{
|
|
// The score indices no longer matter, remove slots that scored zero.
|
|
|
|
RemoveZeroScores ();
|
|
|
|
if (m_nUsed == 0)
|
|
return;
|
|
|
|
// Select top score per language. This is fundamentally dependent
|
|
// on the score array already being ordered by language. This won't
|
|
// combine scores for the same language as both a 7-bit and 8-bit lang,
|
|
// but that's not worth fixing.
|
|
|
|
int maxscore = 0; // highest score for a given language
|
|
int totalscore = m_p[0].GetScore(); // sum of scores " "
|
|
int totalchars = m_p[0].GetCharCount();// sum of character counts " "
|
|
|
|
int nReturned = 0; // index and ultimate count of elts returned
|
|
unsigned int maxscoreidx = 0; // array index of the top-scoring code page,
|
|
// *** for the current language ***
|
|
|
|
for (unsigned int i = 1; i < m_nUsed; i++) {
|
|
if (m_p[i-1].GetLang() != m_p[i].GetLang())
|
|
{
|
|
// [i] indicates a different language from the previous entry
|
|
|
|
// Add the entry for the previous language to the result
|
|
// by copying the slot for its highest-scoring code page,
|
|
// and overwriting its score and char count with the sum counts.
|
|
|
|
m_p[maxscoreidx].SetScore(totalscore);
|
|
m_p[maxscoreidx].SetCharCount(totalchars);
|
|
m_p[nReturned++] = m_p[maxscoreidx];
|
|
|
|
// Start remembering the top and total scores for the new lang.
|
|
|
|
maxscore = m_p[i].GetScore();
|
|
totalscore = m_p[i].GetScore();
|
|
totalchars = m_p[i].GetCharCount();
|
|
maxscoreidx = i; // remember which [] had the top score
|
|
}
|
|
else
|
|
{
|
|
// Accumulate more scores for the same language
|
|
|
|
if (m_p[i].GetScore() > maxscore) {
|
|
maxscore = m_p[i].GetScore();
|
|
maxscoreidx = i;
|
|
}
|
|
totalscore += m_p[i].GetScore();
|
|
totalchars += m_p[i].GetCharCount();
|
|
}
|
|
}
|
|
|
|
// Process the the last language. Return the slot from its
|
|
// highest-scoring code page.
|
|
|
|
if (m_nUsed > 0)
|
|
{
|
|
m_p[maxscoreidx].SetScore(totalscore);
|
|
m_p[maxscoreidx].SetCharCount(totalchars);
|
|
m_p[nReturned++] = m_p[maxscoreidx];
|
|
}
|
|
|
|
m_nUsed = nReturned;
|
|
}
|
|
|
|
/****************************************************************/
|
|
|
|
static void __fastcall
|
|
AddVector (int *pS, const PHElt *pH, int idx, unsigned int nScores)
|
|
//
|
|
// Add the score vector for a single n-gram to the running sum score
|
|
// vector at pS.
|
|
//
|
|
// On return, paS[0..nScores-1] is filled with the sum scores for each
|
|
// language.
|
|
//
|
|
// **** PERFORMANCE NOTE ****
|
|
//
|
|
// This is the critical inner-loop of the entire subsystem.
|
|
//
|
|
// Code generation and performance have been checked for various code
|
|
// organization. Ironically, making AddVector() a true function is
|
|
// FASTER than inlining it because when inlined, the registers are used
|
|
// for the OUTER loop variables and the inner loop here does approximately
|
|
// twice as many memory references per pass.
|
|
//
|
|
// On x86, all four loop variables are registered, and each pass makes only
|
|
// three memory references, which is optimal for the given representation.
|
|
//
|
|
// Future note: the histogram tables could be pivoted to collect all the
|
|
// scores for each n-gram in a block; that would eliminate the double
|
|
// indirection through ph and reduce the memory refs to two per pass.
|
|
{
|
|
nScores++; // makes faster end-test
|
|
|
|
while (--nScores != 0)
|
|
*pS++ += (*pH++)[idx];
|
|
}
|
|
|
|
static inline void
|
|
ScoreUnigramVector (LPCSTR pcsz, int nCh, PHistogram pH,
|
|
int *paS, const PHElt *paH, unsigned int nScores)
|
|
//
|
|
// Score this text for a unigram histogram. Each individual character is
|
|
// mapped to a histogram slot to yield a score for that character in each
|
|
// language.
|
|
{
|
|
if (nCh < 1)
|
|
return;
|
|
|
|
const PHIdx pMap = pH->GetMap();
|
|
|
|
unsigned char *p = (unsigned char *)pcsz;
|
|
|
|
while (nCh-- > 0)
|
|
AddVector (paS, paH, pMap[*p++], nScores);
|
|
}
|
|
|
|
static inline void
|
|
ScoreUnigramVectorW (LPCWSTR pcwsz, int nCh, PHistogram pH,
|
|
int *paS, const PHElt *paH, unsigned int nScores)
|
|
//
|
|
// WCHAR version. Only difference is the use of a map that maps the
|
|
// full 64K WCHAR space into the histogram index range.
|
|
{
|
|
if (nCh < 1)
|
|
return;
|
|
|
|
const PHIdx pMap = pH->GetMap();
|
|
|
|
while (nCh-- > 0)
|
|
AddVector (paS, paH, pMap[*pcwsz++], nScores);
|
|
}
|
|
|
|
static inline void
|
|
ScoreDigramVector (LPCSTR pcsz, int nCh, PHistogram pH,
|
|
int *paS, const PHElt *paH, unsigned int nScores)
|
|
//
|
|
// Score this text for a digram histogram. Each adjacent pair of characters
|
|
// are mapped to the index range and the mapped values combined to form an
|
|
// array index unique to that digram. The scores for that array slot are
|
|
// summed for each language.
|
|
{
|
|
if (nCh < 2)
|
|
return;
|
|
|
|
unsigned char *p = (unsigned char *)pcsz;
|
|
|
|
const PHIdx pMap = pH->GetMap();
|
|
|
|
unsigned char ch1 = pMap[*p++];
|
|
|
|
while (nCh-- > 1)
|
|
{
|
|
unsigned char ch2 = pMap[*p++];
|
|
|
|
AddVector (paS, paH, ch1 * pH->EdgeSize() + ch2, nScores);
|
|
|
|
ch1 = ch2;
|
|
}
|
|
}
|
|
|
|
static inline void
|
|
ScoreTrigramVector (LPCSTR pcsz, int nCh, PHistogram pH,
|
|
int *paS, const PHElt *paH, unsigned int nScores)
|
|
//
|
|
// Score this text for a trigram histogram. Each adjacent three-letter set
|
|
// of characters are mapped to the index range and the mapped values combined
|
|
// to form an array index unique to that trgram.
|
|
{
|
|
if (nCh < 3)
|
|
return;
|
|
|
|
unsigned char *p = (unsigned char *)pcsz;
|
|
|
|
const PHIdx pMap = pH->GetMap();
|
|
|
|
unsigned char ch1 = pMap[*p++];
|
|
unsigned char ch2 = pMap[*p++];
|
|
|
|
while (nCh-- > 2)
|
|
{
|
|
unsigned char ch3 = pMap[*p++];
|
|
debug(printf(" '%c%c%c':",unmapch(ch1),unmapch(ch2),unmapch(ch3)));
|
|
|
|
int idx = ((ch1 * pH->EdgeSize()) + ch2) * pH->EdgeSize() + ch3;
|
|
ch1 = ch2;
|
|
ch2 = ch3;
|
|
|
|
AddVector (paS, paH, idx, nScores);
|
|
|
|
debug(for (UINT i = 0; i < nScores; i++) printf(" %3d", paH[i][idx]));
|
|
debug(printf("\n"));
|
|
}
|
|
}
|
|
|
|
static inline void
|
|
ScoreTrigramVectorW (LPCWSTR pcwsz, int nCh, PHistogram pH,
|
|
int *paS, const PHElt *paH, unsigned int nScores)
|
|
//
|
|
// WCHAR version.
|
|
{
|
|
if (nCh < 3)
|
|
return;
|
|
|
|
const PHIdx pMap = pH->GetMap();
|
|
|
|
unsigned char ch1 = pMap[*pcwsz++];
|
|
unsigned char ch2 = pMap[*pcwsz++];
|
|
|
|
while (nCh-- > 2)
|
|
{
|
|
unsigned char ch3 = pMap[*pcwsz++];
|
|
|
|
int idx = ((ch1 * pH->EdgeSize()) + ch2) * pH->EdgeSize() + ch3;
|
|
ch1 = ch2;
|
|
ch2 = ch3;
|
|
|
|
AddVector (paS, paH, idx, nScores);
|
|
}
|
|
}
|
|
|
|
static inline void
|
|
ScoreNgramVector (LPCSTR pcsz, int nCh, PHistogram pH,
|
|
int *paS, const PHElt *paH, unsigned int nScores)
|
|
//
|
|
// Score this text for any dimension of n-gram. Get "N" from the
|
|
// dimensionality of the histogram.
|
|
//
|
|
// Each adjacent n-letter set of characters are mapped to the index range
|
|
// and the scores the reference summed for each language. This code is
|
|
// never used for the current data file, instead an optimized scoring
|
|
// loop exists for each existing case. This exists to enable trying
|
|
// different dimension scoring without requiring a new DLL.
|
|
{
|
|
if (nCh < pH->Dimensionality())
|
|
return;
|
|
|
|
unsigned char *p = (unsigned char *)pcsz;
|
|
|
|
const PHIdx pMap = pH->GetMap();
|
|
|
|
// Fill the pipeline
|
|
|
|
int idx = 0;
|
|
if (pH->Dimensionality() >= 2)
|
|
idx = idx * pH->EdgeSize() + pMap[*p++];
|
|
if (pH->Dimensionality() >= 3)
|
|
idx = idx * pH->EdgeSize() + pMap[*p++];
|
|
if (pH->Dimensionality() >= 4)
|
|
idx = idx * pH->EdgeSize() + pMap[*p++];
|
|
|
|
unsigned int nLoopCount = nCh - (pH->Dimensionality() - 1);
|
|
|
|
while (nLoopCount-- > 0)
|
|
{
|
|
idx = (idx * pH->EdgeSize() + pMap[*p++]) % pH->NElts();
|
|
|
|
AddVector (paS, paH, idx, nScores);
|
|
}
|
|
}
|
|
|
|
static inline void
|
|
ScoreNgramVectorW (LPCWSTR pcwsz, int nCh, PHistogram pH,
|
|
int *paS, const PHElt *paH, unsigned int nScores)
|
|
//
|
|
// WCHAR version.
|
|
{
|
|
if (nCh < pH->Dimensionality())
|
|
return;
|
|
|
|
const PHIdx pMap = pH->GetMap();
|
|
|
|
// Fill the pipeline
|
|
|
|
int idx = 0;
|
|
if (pH->Dimensionality() >= 2)
|
|
idx = idx * pH->EdgeSize() + pMap[*pcwsz++];
|
|
if (pH->Dimensionality() >= 3)
|
|
idx = idx * pH->EdgeSize() + pMap[*pcwsz++];
|
|
if (pH->Dimensionality() >= 4)
|
|
idx = idx * pH->EdgeSize() + pMap[*pcwsz++];
|
|
|
|
unsigned int nLoopCount = nCh - (pH->Dimensionality() - 1);
|
|
|
|
while (nLoopCount-- > 0)
|
|
{
|
|
idx = (idx * pH->EdgeSize() + pMap[*pcwsz++]) % pH->NElts();
|
|
|
|
AddVector (paS, paH, idx, nScores);
|
|
}
|
|
}
|
|
|
|
void
|
|
ScoreVector (LPCSTR pcsz, int nCh, PHistogram pH,
|
|
int *paS, const PHElt *paH, unsigned int nScores)
|
|
//
|
|
// Score a string into an array of scores using an array of histograms
|
|
//
|
|
// Each character n-gram is mapped to a histogram slot to yield a score
|
|
// for that character in each array at paH.
|
|
//
|
|
// On return, paS[0..nScores-1] is filled with the sum scores.
|
|
{
|
|
memset (paS, 0, sizeof(int) * nScores);
|
|
|
|
switch (pH->Dimensionality())
|
|
{
|
|
case 1:
|
|
ScoreUnigramVector (pcsz, nCh, pH, paS, paH, nScores);
|
|
break;
|
|
|
|
case 2:
|
|
ScoreDigramVector (pcsz, nCh, pH, paS, paH, nScores);
|
|
break;
|
|
|
|
case 3:
|
|
ScoreTrigramVector (pcsz, nCh, pH, paS, paH, nScores);
|
|
break;
|
|
|
|
default:
|
|
ScoreNgramVector (pcsz, nCh, pH, paS, paH, nScores);
|
|
break;
|
|
}
|
|
}
|
|
|
|
void
|
|
ScoreVectorW (LPCWSTR pcwsz, int nCh, PHistogram pH,
|
|
int *paS, const PHElt *paH, unsigned int nScores)
|
|
//
|
|
// Score a string into an array of scores using an array of histograms.
|
|
{
|
|
memset (paS, 0, sizeof(int) * nScores);
|
|
|
|
switch (pH->Dimensionality())
|
|
{
|
|
case 1:
|
|
ScoreUnigramVectorW (pcwsz, nCh, pH, paS, paH, nScores);
|
|
break;
|
|
|
|
case 3:
|
|
ScoreTrigramVectorW (pcwsz, nCh, pH, paS, paH, nScores);
|
|
break;
|
|
|
|
default:
|
|
ScoreNgramVectorW (pcwsz, nCh, pH, paS, paH, nScores);
|
|
break;
|
|
}
|
|
}
|
|
|
|
void
|
|
LCDetect::Score7Bit (LPCSTR pcszText, int nChars, CScores &S) const
|
|
//
|
|
// Do 7-bit language detection. Compute scores for all 7-bit languages
|
|
// and store the raw language score in S at the language's base score-idx.
|
|
//
|
|
// Fill in only the first score slot per language. Uses ScoreIdx() for
|
|
// the first code page, but does not detect or set the code page.
|
|
{
|
|
const PHistogram pH = Get7BitLanguage(0)->GetLangHistogram();
|
|
|
|
debug(printf(" "));
|
|
debug(for(unsigned int x=0;x<GetN7BitLanguages();x++)printf(" %3d", Get7BitLanguage(x)->LanguageID()));
|
|
debug(printf("\n"));
|
|
|
|
int sc[MAXSCORES];
|
|
|
|
// Compute the raw score vector
|
|
|
|
ScoreVector (pcszText, nChars, pH, sc, m_paHElt7Bit, GetN7BitLanguages());
|
|
|
|
|
|
// Fill in the CScores array from it
|
|
|
|
for (unsigned int i = 0; i < GetN7BitLanguages(); i++)
|
|
{
|
|
PLanguage7Bit pL = Get7BitLanguage(i);
|
|
|
|
CScore &s = S.Ref(pL->GetScoreIdx());
|
|
|
|
s.SetLang(pL);
|
|
s.SetCodePage(0);
|
|
s.SetScore(sc[i]);
|
|
s.SetCharCount(nChars);
|
|
}
|
|
}
|
|
|
|
void
|
|
LCDetect::Score8Bit (LPCSTR pcszText, int nChars, CScores &S) const
|
|
//
|
|
// Do 8-bit detection. Compute a combined language / code page score
|
|
// for each trained language / code page combination for the 8-bit languages.
|
|
// Store all the raw scores in S at the language+each codepage score-idx.
|
|
//
|
|
// May store multiple entries in S for each language, one per code page.
|
|
{
|
|
const PHistogram pH = Get8BitLanguage(0)->GetHistogram(0);
|
|
|
|
int sc[MAXSCORES];
|
|
|
|
// Compute the raw score vector
|
|
|
|
ScoreVector (pcszText, nChars, pH, sc, m_paHElt8Bit, m_nHElt8Bit);
|
|
|
|
// Fill in the CScores array from it
|
|
|
|
int nSc = 0;
|
|
for (unsigned int i = 0; i < GetN8BitLanguages(); i++)
|
|
{
|
|
PLanguage8Bit pL = Get8BitLanguage(i);
|
|
|
|
for (int j = 0; j < pL->NCodePages(); j++)
|
|
{
|
|
CScore &s = S.Ref(pL->GetScoreIdx() + j);
|
|
|
|
s.SetLang(pL);
|
|
s.SetCodePage(pL->GetCodePage(j));
|
|
s.SetScore( sc[ nSc++ ] );
|
|
s.SetCharCount(nChars);
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
LCDetect::ScoreLanguageAsSBCS (LPCWSTR wcs, int nch, CScores &S) const
|
|
//
|
|
// This scores Unicode text known to contain mostly characters in the
|
|
// script ranges used for 7-bit languages. This uses a special mapping,
|
|
// m_pH727Bit, that converts n-grams in the WCHAR text directly to the same
|
|
// mapping output space used for 7-bit language detection. It is then scored
|
|
// using the same language-only histograms used for 7-bit SBCS detection.
|
|
//
|
|
// The output is the same as if Score7Bit() had been called on the SBCS
|
|
// equivalent to this text. The same slots in S are filled in, using the
|
|
// 7-bit score indices, NOT the Unicode language score indices.
|
|
{
|
|
debug(printf(" scoring as SBCS\n"));
|
|
|
|
debug(printf(" "));
|
|
debug(for(unsigned int x=0;x<GetN7BitLanguages();x++)printf(" %3d", Get7BitLanguage(x)->LanguageID()));
|
|
debug(printf("\n"));
|
|
|
|
// Call ScoreVectorW(), passing the histogram set up or the WCHAR map.
|
|
|
|
int sc[MAXSCORES];
|
|
|
|
// Compute the raw score vector
|
|
|
|
ScoreVectorW (wcs, nch, m_pHU27Bit, sc, m_paHElt7Bit,GetN7BitLanguages());
|
|
|
|
|
|
// Fill in the CScores array from it
|
|
|
|
for (unsigned int i = 0; i < GetN7BitLanguages(); i++)
|
|
{
|
|
PLanguage7Bit pL = Get7BitLanguage(i);
|
|
|
|
CScore &s = S.Ref(pL->GetScoreIdx());
|
|
|
|
s.SetLang(pL);
|
|
s.SetCodePage(0);
|
|
s.SetScore(sc[i]);
|
|
s.SetCharCount(nch);
|
|
}
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////
|
|
|
|
void
|
|
Language::ScoreCodePage (LPCSTR, int nCh, CScore &S, int &idx) const
|
|
//
|
|
// The default handler for scoring the code page for text for which the
|
|
// language is already known. Initially used only for Unicode.
|
|
{
|
|
idx = 0;
|
|
S.SetCodePage(0);
|
|
}
|
|
|
|
void
|
|
Language7Bit::ScoreCodePage (LPCSTR pStr, int nCh, CScore &S, int &idx) const
|
|
//
|
|
// Detect the code page for text whose language has already been detected
|
|
// and is indicated in S. Set S.CodePage(), do not change other
|
|
// fields of S.
|
|
//
|
|
// Set idx to the index of the high-scoring code page. The caller uses this
|
|
// to place the score in the correct ScoreIdx slot.
|
|
//
|
|
// Note that the arg is a single CScore, not an array. The CScore S is
|
|
// filled in with the score of the high-scoring code page, and no information
|
|
// about the other code pages is returned.
|
|
{
|
|
if (NCodePages() == 1)
|
|
{
|
|
// If lang is trained with only one codepage, just return it.
|
|
|
|
idx = 0;
|
|
S.SetCodePage(GetCodePage(0));
|
|
|
|
debug(printf(" score code page: only one; cp=%d\n",GetCodePage(0)));
|
|
}
|
|
|
|
debug(printf("scoring 7-bit code pages: "));
|
|
|
|
int sc[MAXSUBLANG];
|
|
|
|
// Compute the raw score vector
|
|
|
|
ScoreVector (pStr, nCh, GetCodePageHistogram(0),
|
|
sc, GetPHEltArray(), NCodePages());
|
|
|
|
// Find the high-scoring code page and fill in S with its values
|
|
|
|
idx = FindHighIdx (sc, NCodePages());
|
|
|
|
debug(printf("selecting cp=%d idx=%d\n", GetCodePage(idx), idx));
|
|
|
|
S.SetCodePage (GetCodePage(idx));
|
|
}
|
|
|
|
void
|
|
LanguageUnicode::ScoreSublanguages (LPCWSTR wcs, int nch, CScores &S) const
|
|
//
|
|
// Score wcs for each sub-language and add the raw scores to S.
|
|
// The scores are not qualified at this time.
|
|
//
|
|
// Relevant only for Unicode language groups that require subdetection,
|
|
// initially CJK.
|
|
{
|
|
if (m_nSubLangs == 0)
|
|
return;
|
|
|
|
debug(printf(" scoring Unicode sublanguages:\n"));
|
|
|
|
int sc[MAXSUBLANG];
|
|
|
|
// Compute the raw score vector
|
|
|
|
ScoreVectorW (wcs, nch, GetHistogram(0), sc, m_paHElt, m_nSubLangs);
|
|
|
|
// Fill in the CScores array from it
|
|
|
|
for (int i = 0; i < NSubLangs(); i++)
|
|
{
|
|
PLanguageUnicode pSL = GetSublanguage(i);
|
|
|
|
CScore &s = S.Ref(pSL->GetScoreIdx());
|
|
s.SetLang (pSL);
|
|
s.SetScore (sc[i]);
|
|
s.SetCharCount (nch);
|
|
s.SetCodePage (0);
|
|
|
|
debug(printf(" lang=%d score=%d\n", pSL->LanguageID(), sc[i]));
|
|
}
|
|
}
|
|
|
|
int
|
|
LCDetect::ChooseDetectionType (LPCSTR pcszText, int nChars) const
|
|
//
|
|
// Histogram the raw char values to determine whether to use 7-bit or
|
|
// 8-bit detection for this block.
|
|
{
|
|
// Count the proportion of chars < vs. >= 0x80
|
|
|
|
int nHi = 0;
|
|
|
|
for (int i = nChars; i-- > 0; )
|
|
nHi += ((unsigned char)*pcszText++) & 0x80;
|
|
|
|
nHi /= 0x80;
|
|
int nLo = nChars - nHi;
|
|
|
|
// Make sure there is sufficient data to make a good choice
|
|
|
|
// work here -- try if abs(nHi - nLo) < 10
|
|
|
|
if (nHi + nLo < 10)
|
|
return DETECT_NOTDEFINED;
|
|
|
|
if (nHi * 2 > nLo)
|
|
return DETECT_8BIT;
|
|
else
|
|
return DETECT_7BIT;
|
|
}
|
|
|
|
void
|
|
LCDetect::ScoreLanguageA (LPCSTR pStr, int nChars, CScores &S) const
|
|
//
|
|
//
|
|
// Score the text at pStr for each language that it potentially contains.
|
|
//
|
|
// Add the scores to S at the ScoreIdx() for each language and codepage
|
|
// combination.
|
|
//
|
|
// This adds all the raw scores for either all the 7-bit or all the
|
|
// 8-bit entries, depending on which category the rough initial analysis
|
|
// indicates. At this time, there are no entries for which both methods
|
|
// are required.
|
|
//
|
|
// For 7-bit detection, code page is always set to 0 and the language's score
|
|
// is placed in the 0'th slot for each language. The caller later scores
|
|
// code pages if needed, and fills the remaining slots.
|
|
//
|
|
// For 8-bit detection, scores are generated for each code page and all
|
|
// ScoreIdx() slots are used.
|
|
{
|
|
switch (ChooseDetectionType (pStr, nChars)) {
|
|
|
|
case DETECT_7BIT:
|
|
Score7Bit (pStr, nChars, S);
|
|
break;
|
|
|
|
case DETECT_8BIT:
|
|
Score8Bit (pStr, nChars, S);
|
|
break;
|
|
}
|
|
}
|
|
|
|
void
|
|
LCDetect::ScoreLanguageW (LPCWSTR wcs, int nch, CScores &S, PCLCDConfigure pC) const
|
|
//
|
|
// Score the text at wcs for each language that it potentially contains.
|
|
//
|
|
// Add the scores to S at the ScoreIdx() for each language.
|
|
//
|
|
// This first determines the Unicode script groups represented in wcs.
|
|
// Each WCHAR is mapped through CHARMAP_UNICODE to yield its "language group
|
|
// ID". The IDs for each char are counted and the top scoring IDs indicate
|
|
// the probable languages or language groups. Note that unlike all other
|
|
// use of n-gram scoring, NO WEIGHTS are associated with the IDs -- whichever
|
|
// group contains the most raw chars, wins.
|
|
//
|
|
// Some languages are indicated by presence of characters in a particular
|
|
// script group; these scores are immediately added to S.
|
|
//
|
|
// For script groups that indicate multiple languages, subdetection within
|
|
// the group is done only when the score for the group exceeds a threshhold
|
|
// that indicates the sub-detected languages are likely to be included in
|
|
// the final result. This is purely a performance optimization, not to
|
|
// be confused with the uniform score threshhold applied by the caller.
|
|
//
|
|
// The "Group" entries themselves are never included in the result; they
|
|
// exist only to invoke subdetection.
|
|
//
|
|
// In many cases even a single Unicode character provides sufficient
|
|
// identification of script and language, so there is no minimum
|
|
// qualification for scores in the script ranges that indicate a
|
|
// specific language by range alone.
|
|
{
|
|
// Score the chars according to the Unicode script group they belong to.
|
|
// The array indices are the raw outputs of the primary Unicode Charmap
|
|
// NOT to be confused with the ScoreIdx() of each language. Further,
|
|
// the scores are the simple count of the characters in each script
|
|
// range, and are NOT weighted by any histogram.
|
|
|
|
// In this initial step, the simple majority of characters per range
|
|
// determines which further detection steps to take.
|
|
|
|
const PHIdx map = GetMap (CHARMAP_UNICODE);
|
|
|
|
int anScore[MAXSCORES];
|
|
memset (anScore, 0, sizeof(int) * GetNUnicodeLanguages());
|
|
|
|
for (int x = 0; x < nch; x++)
|
|
anScore[map[wcs[x]]]++;
|
|
|
|
debug(printf(" char_ignore score=%d\n",anScore[HIDX_IGNORE]));
|
|
|
|
// Ignore scores for chars that correlate with no language
|
|
|
|
anScore[HIDX_IGNORE] = 0;
|
|
|
|
|
|
// Identify the scores that qualify a language for immediate inclusion
|
|
// in the result, or that qualify a language group for further detection.
|
|
|
|
|
|
// Find the high score to use as a relative threshhold for inclusion.
|
|
|
|
int nMaxScore = 0;
|
|
|
|
for (unsigned int i = 0; i < GetNUnicodeLanguages(); i++)
|
|
{
|
|
if (anScore[i] > nMaxScore)
|
|
nMaxScore = anScore[i];
|
|
}
|
|
|
|
debug(printf(" unicode range max score=%d\n",nMaxScore));
|
|
|
|
// Process all individual and group scores above a threshhold.
|
|
|
|
// The threshhold logic is different from the logic for SBCS/DBCS
|
|
// detection, because presence of even a single character in certain
|
|
// Unicode script ranges can be a strong correct indicator for a
|
|
// specific language. The threshhold for subdetected scores is
|
|
// higher, since that is a statistical result; single characters
|
|
// are not as strong an indicator.
|
|
|
|
// Set the threshhold for subdetecting.
|
|
|
|
int nRelThresh = 1 + (nMaxScore * pC->nRelativeThreshhold) / 100;
|
|
|
|
|
|
for (i = 0; i < GetNUnicodeLanguages(); i++)
|
|
{
|
|
// Threshhold for any range is at least this many raw chars in range.
|
|
|
|
if (anScore[i] >= 2)
|
|
{
|
|
PLanguageUnicode pL = GetUnicodeLanguage(i);
|
|
|
|
debug(printf(" using lang=%d score=%d:\n", pL->LanguageID(), anScore[i]));
|
|
|
|
if (pL->LanguageID() == LANGID_UNKNOWN)
|
|
{
|
|
// DO NOTHING -- text is an unknown language
|
|
|
|
debug(printf(" lang=unknown\n"));
|
|
|
|
}
|
|
else if (pL->NSubLangs() > 0)
|
|
{
|
|
// Subdetect language within a Unicode group, and add all the
|
|
// unqualified raw scores directly to S.
|
|
|
|
pL->ScoreSublanguages (wcs, nch, S);
|
|
}
|
|
else if ( pL->LanguageID() == LANGID_LATIN_GROUP &&
|
|
anScore[i] >= nRelThresh )
|
|
{
|
|
// Subdetect Latin/Western languages, and add all the
|
|
// unqualified raw scores to S.
|
|
|
|
ScoreLanguageAsSBCS (wcs, nch, S);
|
|
}
|
|
else
|
|
{
|
|
debug(printf(" range identifies language\n"));
|
|
|
|
// This range identifies a specific language; add it.
|
|
|
|
CScore &s = S.Ref(pL->GetScoreIdx());
|
|
s.SetLang (pL);
|
|
s.SetScore (anScore[i] * UNICODE_DEFAULT_CHAR_SCORE);
|
|
s.SetCharCount (nch);
|
|
s.SetCodePage (0);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/****************************************************************/
|
|
|
|
DWORD
|
|
LCDetect::DetectA (LPCSTR pStr, int nInputChars,
|
|
PLCDScore paScores, int *pnScores,
|
|
PCLCDConfigure pLCDC) const
|
|
//
|
|
// Do SBCS / DBCS detection. Detect language and code page of pStr,
|
|
// fill paScores[] with the result and set *pnScores to the result count.
|
|
// On input, *pnScores is the available capacity of paScores.
|
|
//
|
|
// The text at pStr is broken into chunks, typically several hundred
|
|
// bytes.
|
|
//
|
|
// In the first phase, each chunk is scored by language. The scores for
|
|
// a single chunk are qualified by both an absolute threshhold and by a
|
|
// threshhold based on the high score of just that chunk. Scores exceeding
|
|
// the threshhold are remembered towards the second phase; other scores
|
|
// are discarded.
|
|
//
|
|
// For each score that will be remembered, if a code page is not already
|
|
// known for it then the code page for the chunk is determined and included
|
|
// with the score. Note that the score refers only to the language, NOT
|
|
// to the confidence of the code page.
|
|
//
|
|
// In the second phase, the combined scores for all chunks are examined.
|
|
// The scores are further qualified by a relative threshhold. Only
|
|
// languages with scores exceeding the threshhold are included in the
|
|
// final result; the remainder are discarded.
|
|
//
|
|
// The two-step process is designed to yield good results for input containing
|
|
// text in multiple languages, or containing a high portion of whitespace or
|
|
// symbol characters that correlate with no language. It also is designed
|
|
// to optimally handle tie-cases whether due to similar languages or to
|
|
// mixed-language input, and to avoid applying threshholds based on
|
|
// absolute scores.
|
|
//
|
|
// The presumption is that each chunk, generally, represents text in a single
|
|
// language, and no matter what the absolute high score is, its high score
|
|
// most likely is for that language. The point of the first phase is to
|
|
// identify all the languages that are known with some confidence to be
|
|
// represented in the text. For a given chunk, multiple languages scores may
|
|
// meet this criteria and be remembered towards the result. Specifically,
|
|
// when a tie occurs, BOTH scores are always included. (Choosing just one
|
|
// would be wrong too often to be worthwhile.)
|
|
//
|
|
// The point of the second phase is to filter out the noise allowed by the
|
|
// first phase.
|
|
{
|
|
TScores<MAXSCORES> SChunk; // Scores for one chunk at a time
|
|
TScores<MAXSCORES> SAll; // Qualified scores for ultimate result
|
|
|
|
if (pLCDC == NULL) // Use the default config if not specified
|
|
pLCDC = &m_LCDConfigureDefault;
|
|
|
|
if (*pnScores == 0)
|
|
return NO_ERROR;
|
|
|
|
#define MAX_INPUT (USHRT_MAX-1)
|
|
// CScore.NChars() is a USHORT to save space+time, so only this # of chars
|
|
// can be accepted per call or the scoring would overflow.
|
|
|
|
nInputChars = min (nInputChars, MAX_INPUT);
|
|
debug(printf("LCD_Detect: detecting %d chars\n", nInputChars));
|
|
|
|
// The first loop processed fixed-size chunks and accumulates all the
|
|
// credibly-detected languages in SAll. This is the "coarse" accuracy
|
|
// qualification: detect the language of text blocks small enough to
|
|
// typically be in *one* language, and remember only the highest scoring
|
|
// language for that chunk. Then generate a multivalued result that
|
|
// shows the distribution of language in the doc, instead of simply
|
|
// returning the dominant language. This is necessary because it is
|
|
// much harder to determine the sole language than to determine the
|
|
// multivalued result.
|
|
|
|
int nProcessed = 0;
|
|
|
|
while (nProcessed < nInputChars)
|
|
{
|
|
SChunk.Reset(); // reset is cheaper than constructing
|
|
|
|
// Process nChunkSize worth of text if that will leave at least
|
|
// another nChunkSize piece for the final pass. If that would
|
|
// leave a smaller final chunk, go ahead and process the entire
|
|
// remaining input.
|
|
|
|
int nch = nInputChars - nProcessed;
|
|
|
|
if (nch >= pLCDC->nChunkSize * 2)
|
|
nch = pLCDC->nChunkSize;
|
|
|
|
|
|
debug(printf("\nStarting chunk: %d ch\n\"%.*s\"\n", nch, nch, &pStr[nProcessed]));
|
|
|
|
ScoreLanguageA (&pStr[nProcessed], nch, SChunk);
|
|
|
|
// Compute the threshhold for inclusion of each score in the
|
|
// overall result.
|
|
|
|
int nRelThresh = 1 + (SChunk.FindHighScore().GetScore() * pLCDC->nRelativeThreshhold) / 100;
|
|
int nThresh7 = max (pLCDC->nMin7BitScore * nch, nRelThresh);
|
|
int nThresh8 = max (pLCDC->nMin8BitScore * nch, nRelThresh);
|
|
|
|
debug(printf("high score=%d min7=%d thresh7=%d thresh8=%d\n", SChunk.FindHighScore().GetScore(),pLCDC->nMin7BitScore*nch,nThresh7,nThresh8));
|
|
|
|
// Qualify each score, remember only scores well-above the noise.
|
|
|
|
for (unsigned int i = 0; i < SChunk.NElts(); i++)
|
|
{
|
|
CScore &s = SChunk.Ref(i);
|
|
PLanguage pL = s.GetLang();
|
|
|
|
// debug(if (s.GetScore()) printf(" raw: lang=%d score=%d cp=%d\n",pL->LanguageID(),s.GetScore(),s.GetCodePage()));
|
|
|
|
if ( (s.GetScore() >= nThresh7 && pL->Type() == DETECT_7BIT) ||
|
|
(s.GetScore() >= nThresh8 && pL->Type() == DETECT_8BIT) )
|
|
{
|
|
debug(printf(" qual: lang=%d score=%d cp=%d\n",pL->LanguageID(),s.GetScore(),s.GetCodePage()));
|
|
|
|
// If code page is not already set, detect it, and store
|
|
// the score for this language using the scoreidx slot
|
|
// for that code page. Store no score in the slots for
|
|
// other code pages for the same language.
|
|
|
|
int idx = 0;
|
|
|
|
if (s.GetCodePage() == 0)
|
|
pL->ScoreCodePage (&pStr[nProcessed], nch, s, idx);
|
|
|
|
// Remember this score for the overall results
|
|
|
|
SAll.Ref(i + idx) += s;
|
|
}
|
|
}
|
|
|
|
nProcessed += nch;
|
|
}
|
|
|
|
// SAll has entries for each unique { lang ID, code page }
|
|
// with the char count and total raw score (not normalized per char)
|
|
// for those chunks whose score qualifies as a confident result and
|
|
// that contributed to the entry.
|
|
|
|
// Select the top-scoring code page for each language
|
|
// and remove all other code page scores.
|
|
|
|
debug(printf("Selecting top-scoring code pages\n"));
|
|
|
|
SAll.SelectCodePages ();
|
|
|
|
// Sort by decreasing score
|
|
|
|
SAll.SortByScore ();
|
|
|
|
// Build the client return structure
|
|
// Language ID
|
|
// Code page
|
|
// Doc percent 0-100
|
|
// Confidence 0-100
|
|
|
|
int nScoresReturned = 0;
|
|
|
|
for (unsigned i = 0; i < SAll.NElts() && nScoresReturned < *pnScores; i++)
|
|
{
|
|
CScore &s = SAll.Ref(i);
|
|
|
|
LCDScore R;
|
|
|
|
R.nLangID = s.GetLang()->LanguageID();
|
|
R.nCodePage = s.GetCodePage();
|
|
|
|
// Percent of doc for which this language scored above the
|
|
// confidence threshhold, even if not 1st place for that chunk.
|
|
|
|
R.nDocPercent = (s.GetCharCount() * 100) / nProcessed;
|
|
|
|
debug(printf("s.CharCount=%d nProcessed=%d\n", s.GetCharCount(), nProcessed));
|
|
|
|
// Confidence is the raw score for all the chunks for which this
|
|
// language was detected above the confidence threshhold, divided
|
|
// by the number of characters in those chunks.
|
|
|
|
R.nConfidence = s.GetScore() / s.GetCharCount();
|
|
|
|
debug(printf("Examining: lang=%d cp=%d docpct=%d\n", R.nLangID, R.nCodePage, R.nDocPercent));
|
|
|
|
// Return only scores for languages detected in over a
|
|
// minimum % of the doc.
|
|
|
|
if (R.nDocPercent > pLCDC->nDocPctThreshhold)
|
|
{
|
|
debug(printf(" returning score\n"));
|
|
paScores[nScoresReturned++] = R;
|
|
}
|
|
}
|
|
|
|
debug(printf("Returning %d scores\n", nScoresReturned));
|
|
|
|
*pnScores = nScoresReturned;
|
|
|
|
return NO_ERROR;
|
|
}
|
|
|
|
DWORD
LCDetect::DetectW (LPCWSTR pwStr, int nInputChars,
    PLCDScore paScores, int *pnScores, PCLCDConfigure pLCDC) const
//
// WCHAR (Unicode) version of LCD_Detect.  Score into paScores, one score
// per language.
//
// The input is scored one chunk at a time with ScoreLanguageW().  Chunk
// scores that pass the 7-bit / Unicode inclusion thresholds are accumulated
// per language, sorted by decreasing score, and then reported.
//
//   pwStr        Text to examine.
//   nInputChars  Number of WCHARs at pwStr; clamped to MAX_INPUT.
//   paScores     Receives the results.
//   pnScores     In: capacity of paScores.  Out: number of entries written.
//   pLCDC        Detection parameters, or NULL to use the default config.
//
// Returns NO_ERROR.
{
    if (pLCDC == NULL)      // Use the default config if not specified
        pLCDC = &m_LCDConfigureDefault;

    if (*pnScores == 0)
        return NO_ERROR;

    // CScore.NChars() is a USHORT to save space+time, so only this # of chars
    // can be accepted per call or the scoring would overflow.

    nInputChars = min (nInputChars, MAX_INPUT);
    debug(printf("LCD_DetectW: detecting %d chars\n", nInputChars));

    TScores<MAXSCORES> SChunk;      // Raw score for one chunk at a time
    TScores<MAXSCORES> SAll;        // Qualifying scores for final result

    // SChunk is defined outside the loop since it's cheaper to Reset() it
    // than to reconstruct it each time.

    int nProcessed = 0;

    // Process one chunk of the input per loop

    while (nProcessed < nInputChars)
    {
        SChunk.Reset();

        // Process nChunkSize worth of text if that will leave at least
        // another nChunkSize piece for the final pass.  If that would
        // leave a smaller final chunk, go ahead and process the entire
        // remaining input.

        int nch = nInputChars - nProcessed;

        // A chunk size of zero or less would make nch non-positive, so the
        // loop would never advance (or would walk backwards through pwStr).
        // Treat such a config as "no chunking" and take all remaining input.
        //
        // The test is written as nch / 2 >= nChunkSize rather than
        // nch >= nChunkSize * 2: the two agree for positive values, but the
        // division cannot overflow for a very large configured chunk size.

        if (pLCDC->nChunkSize > 0 && nch / 2 >= pLCDC->nChunkSize)
            nch = pLCDC->nChunkSize;

        debug(printf("\nStarting chunk: %d ch\n", nch));

        // Compute the raw scores for the chunk.
        // This automatically includes the sub-detected language scores
        // for the Latin/Western group and Unicode groups, when the
        // group itself scores above the inclusion threshold.
        // But, the sub-detected scores themselves still need to be
        // qualified.

        ScoreLanguageW (&pwStr[nProcessed], nch, SChunk, pLCDC);

        // Compute the threshold for inclusion of each score in the
        // overall result: the larger of a per-character minimum scaled by
        // the chunk length and a percentage of the chunk's high score.

        int nRelThresh = 1 + (SChunk.FindHighScore().GetScore() * pLCDC->nRelativeThreshhold) / 100;
        int nThresh7 = max (pLCDC->nMin7BitScore * nch, nRelThresh);
        int nThreshU = max (pLCDC->nMinUnicodeScore * nch, nRelThresh);

        debug(printf("scores: nElts=%d rel=%d%% high=%d min=%d min7=%d minU=%d\n", SChunk.NElts(), pLCDC->nRelativeThreshhold, SChunk.FindHighScore().GetScore(), nRelThresh,nThresh7,nThreshU));

        // Qualify each score, remember only scores well-above the noise.

        for (unsigned int i = 0; i < SChunk.NElts(); i++)
        {
            CScore &s = SChunk.Ref(i);
            PLanguage pL = s.GetLang();

            if ( (s.GetScore() >= nThresh7 && pL->Type() == DETECT_7BIT) ||
                 (s.GetScore() >= nThreshU && pL->Type() == DETECT_UNICODE) )
            {
                debug(printf(" using lang=%d score=%d nch=%d\n",pL->LanguageID(),s.GetScore(),s.GetCharCount()));

                // Remember this score for the overall results

                SAll.Ref(i) += s;
            }
        }

        nProcessed += nch;
    }

    // SAll has entries for each unique language with char count and total
    // raw score (not normalized per char) for those chunks whose score
    // qualifies as a confident result.

    // SAll may contain entries only for 7-bit and Unicode languages,
    // at most one entry per unique Win32 language ID

    debug(printf("Selecting scores for result:\n"));

    // Sort by decreasing score

    SAll.SortByScore ();

    // Build the client return structure
    //  Language ID
    //  Code page
    //  Doc percent 0-100
    //  Confidence 0-100

    int nScoresReturned = 0;

    for (unsigned i = 0; i < SAll.NElts() && nScoresReturned < *pnScores; i++)
    {
        CScore &s = SAll.Ref(i);

        // An entry that accumulated no characters has nothing to report,
        // and would divide by zero in the confidence calculation below.
        // NOTE(review): whether SAll can actually hold such entries depends
        // on TScores::Ref()/NElts()/SortByScore(), which are not visible
        // here; the guard is harmless if it cannot.

        if (s.GetCharCount() == 0)
            continue;

        LCDScore R;

        R.nLangID = s.GetLang()->LanguageID();
        R.nCodePage = s.GetCodePage();

        // Percent of doc for which this language scored above the
        // confidence threshold, even if not 1st place for that chunk.
        // nProcessed is non-zero here: an entry with characters implies
        // at least one chunk was processed.

        R.nDocPercent = (s.GetCharCount() * 100) / nProcessed;

        // Confidence is the raw score for all the chunks for which this
        // language was detected above the confidence threshold, divided
        // by the number of characters in those chunks.

        R.nConfidence = s.GetScore() / s.GetCharCount();

        debug(printf(" testing: lang=%d nch=%d docpct=%d\n", R.nLangID,s.GetCharCount(),R.nDocPercent));

        // Return only scores for languages detected in over a
        // minimum % of the doc.

        if (R.nDocPercent > pLCDC->nDocPctThreshhold)
        {
            debug(printf(" returning score\n"));
            paScores[nScoresReturned++] = R;
        }
    }

    debug(printf("Returning %d scores\n", nScoresReturned));

    *pnScores = nScoresReturned;

    return NO_ERROR;
}
|
|
|
|
/****************************************************************/
|
|
/****************************************************************/
|
|
|
|
#if 0
// Export functions

BOOL APIENTRY
DllMain (HANDLE hM, DWORD ul_reason, LPVOID lpReserved)
//
// DLL entry point; currently compiled out by the enclosing #if 0.
//
// Process attach: create the LCDetect object, load its state and publish it
// in g_pLCDetect.  Returns FALSE if the object cannot be created or its
// state fails to load.
// Process detach: delete the detector and clear g_pLCDetect.
// Any other notification: nothing to do, returns TRUE.
{
    if (ul_reason == DLL_PROCESS_ATTACH)
    {
        DisableThreadLibraryCalls( (HINSTANCE)hM );

        LCDetect *pDetector = new LCDetect ( (HMODULE)hM );
        if (pDetector == NULL)
            return FALSE;

        if (pDetector->LoadState() != NO_ERROR)
        {
            // Do not leave a half-initialized detector behind.
            delete pDetector;
            return FALSE;
        }

        g_pLCDetect = pDetector;
        return TRUE;
    }

    if (ul_reason == DLL_PROCESS_DETACH)
    {
        if (g_pLCDetect != NULL)
            delete (LCDetect *)g_pLCDetect;
        g_pLCDetect = NULL;
        return TRUE;
    }

    // DLL_THREAD_ATTACH, DLL_THREAD_DETACH and anything else.
    return TRUE;
}
#endif
|
|
|
|
extern "C" void WINAPI
|
|
LCD_GetConfig (PLCDConfigure pLCDC)
|
|
{
|
|
if (g_pLCDetect)
|
|
*pLCDC = g_pLCDetect->GetConfig();
|
|
}
|
|
|
|
extern "C" DWORD WINAPI
LCD_Detect (LPCSTR pStr, int nInputChars,
    PLCDScore paScores, int *pnScores,
    PCLCDConfigure pLCDC)
//
// Exported 8-bit entry point: forwards to LCDetect::DetectA() on the global
// detector.
//
// Score into paScores, one score per language, "qualifying" scores only.
// Return ranked by decreasing score.
//
// Returns ERROR_INVALID_FUNCTION if the global detector does not exist,
// otherwise whatever DetectA() returns.
{
    if (g_pLCDetect != NULL)
        return g_pLCDetect->DetectA(pStr, nInputChars, paScores, pnScores, pLCDC);

    return ERROR_INVALID_FUNCTION;
}
|
|
|
|
extern "C" DWORD WINAPI
LCD_DetectW (LPCWSTR wcs, int nInputChars,
    PLCDScore paScores, int *pnScores,
    PCLCDConfigure pLCDC)
//
// Exported WCHAR entry point: forwards to LCDetect::DetectW() on the global
// detector.
//
// Returns ERROR_INVALID_FUNCTION if the global detector does not exist,
// otherwise whatever DetectW() returns.
{
    DWORD dwResult = ERROR_INVALID_FUNCTION;

    if (g_pLCDetect != NULL)
        dwResult = g_pLCDetect->DetectW(wcs, nInputChars, paScores, pnScores, pLCDC);

    return dwResult;
}
|
|
|
|
extern "C" void WINAPI
|
|
LCD_SetDebug (int f)
|
|
{
|
|
#ifdef DEBUG_LCDETECT
|
|
g_fDebug = f;
|
|
#endif
|
|
}
|