windows-xp/Source/XPSP1/NT/shell/ext/mlang/lcdetect.cpp


								/*

								 * Automatic language and codepage detector

								 *

								 * Copyright (C) 1996, 1997, Microsoft Corp.  All rights reserved.

								 *

								 *  History:    1-Feb-97    BobP      Created

								 *              5-Aug-97    BobP      Added Unicode support and rewrote

								 *                                    scoring to use vector math.

								 *

								 * This is the runtime detector.

								 *

								 * See the comments in lcdcomp.cpp for a description of the compilation

								 * process and training data format.

								 *

								 * See design.txt for a description of the detection and scoring algorithm.

								 *

								 * Performance note:  60-80% of execution time in this code is AddVector(),

								 * which is probably memory-cycle bound by its random data access, but is

								 * still a candidate for further optimizing with an intrinsic vector operator,

								 * should one become available.

								 *

								 * to-do (as needed):

								 * - Adjust 7-bit and 8-bit scores to make them more comparable

								 * - detect UTF-8 in the SBCS entry point, via heuristic and via

								 *   subdetection as 7-bit lang and as Unicode.

								 */


								#include "private.h"


								// This is all the global (per-process) state

								//

								// It is set at DLL process init and its contents are const after that.


								// Pointer to the process-wide detector object.
								LCDetect * g_pLCDetect;


								// Debug flag; this global exists only in builds that define
								// DEBUG_LCDETECT.
								#ifdef DEBUG_LCDETECT

								int g_fDebug;

								#endif


								/****************************************************************/


								static inline unsigned int
								FindHighIdx (const int *pn, unsigned int n)
								//
								// Return the INDEX of the highest-valued integer in the given array.
								//
								//   pn - array of n values to search
								//   n  - number of elements in pn
								//
								// When several elements share the highest value, the lowest such index
								// is returned.  Returns 0 for an empty array.
								{
									if (n == 0)
										return 0;

									// Seed the running maximum from the first element rather than from 0,
									// so an array holding only negative values still yields the index of
									// its largest element instead of always reporting slot 0.

									int nMax = pn[0];
									unsigned int nIdx = 0;

									for (unsigned int i = 1; i < n; i++)
									{
										if (pn[i] > nMax)
										{
											nMax = pn[i];
											nIdx = i;
										}
									}

									return nIdx;
								}


								/****************************************************************/


								void
								CScores::SelectCodePages (void)
								//
								// Reduce the score array from one score per code page per language to
								// exactly one score per detected language.
								//
								// When multiple scores are present for different code pages of the same
								// language, they are combined into a single entry.  The resulting entry
								// has the code page of the top-scoring entry for that language, and its
								// score and char count are the SUM of the scores and char counts of ALL
								// the entries for that language.
								//
								// For example, if the input contains:
								//		Lang		Codepage	Score	Char count
								//		Russian		1251		42		200
								//		Russian		20866		69		300
								//
								// Then on output, the array will contain only one score for Russian:
								//		Russian		20866		111		500
								//
								// This overwrites the entries in place, and sets m_nUsed to the resulting
								// number of active slots.
								//
								// The scores are already grouped by language, no need to sort by language.
								//
								// After return, the score array must NOT be referenced via ScoreIdx()
								// because the index of the entries has changed.
								{
									// The score indices no longer matter, remove slots that scored zero.

									RemoveZeroScores ();

									if (m_nUsed == 0)
										return;

									// Select top score per language.  This is fundamentally dependent
									// on the score array already being ordered by language.  This won't
									// combine scores for the same language as both a 7-bit and 8-bit lang,
									// but that's not worth fixing.

									// The running maximum is seeded with entry 0's own score, exactly as is
									// done for every later language in the loop below.  Seeding it with 0
									// would let any later positive-scoring entry of the first language
									// displace entry 0 even when entry 0 scored higher.

									int maxscore = m_p[0].GetScore();		// highest score for a given language
									int totalscore = m_p[0].GetScore();		// sum of scores  " "
									int totalchars = m_p[0].GetCharCount();	// sum of character counts  " "

									int nReturned = 0;			// index and ultimate count of elts returned
									unsigned int maxscoreidx = 0; // array index of the top-scoring code page,
																  // *** for the current language ***

									for (unsigned int i = 1; i < m_nUsed; i++) {
										if (m_p[i-1].GetLang() != m_p[i].GetLang())
										{
											// [i] indicates a different language from the previous entry

											// Add the entry for the previous language to the result
											// by copying the slot for its highest-scoring code page,
											// and overwriting its score and char count with the sum counts.

											m_p[maxscoreidx].SetScore(totalscore);
											m_p[maxscoreidx].SetCharCount(totalchars);
											m_p[nReturned++] = m_p[maxscoreidx];

											// Start remembering the top and total scores for the new lang.

											maxscore = m_p[i].GetScore();
											totalscore = m_p[i].GetScore();
											totalchars = m_p[i].GetCharCount();
											maxscoreidx = i;		// remember which [] had the top score
										}
										else
										{
											// Accumulate more scores for the same language

											if (m_p[i].GetScore() > maxscore) {
												maxscore = m_p[i].GetScore();
												maxscoreidx = i;
											}
											totalscore += m_p[i].GetScore();
											totalchars += m_p[i].GetCharCount();
										}
									}

									// Process the last language.  Return the slot from its
									// highest-scoring code page.  (m_nUsed is known to be non-zero here
									// because of the early return above.)

									m_p[maxscoreidx].SetScore(totalscore);
									m_p[maxscoreidx].SetCharCount(totalchars);
									m_p[nReturned++] = m_p[maxscoreidx];

									m_nUsed = nReturned;
								}


								/****************************************************************/


								static void __fastcall

								AddVector (int *pS, const PHElt *pH, int idx, unsigned int nScores)

								//

								// Add the score vector for a single n-gram to the running sum score

								// vector at pS.

								//

								//   pS      - running sum scores, one int per language; updated in place

								//   pH      - array of nScores histogram element arrays, one per language

								//   idx     - index of the n-gram's slot within each histogram array

								//   nScores - number of entries in both pS and pH

								//

								// On return, pS[0..nScores-1] has had element [idx] of each of the

								// nScores arrays at pH added to it.

								//

								// **** PERFORMANCE NOTE ****

								//

								// This is the critical inner-loop of the entire subsystem.

								//

								// Code generation and performance have been checked for various code

								// organization.  Ironically, making AddVector() a true function is

								// FASTER than inlining it because when inlined, the registers are used

								// for the OUTER loop variables and the inner loop here does approximately

								// twice as many memory references per pass.

								//

								// On x86, all four loop variables are registered, and each pass makes only

								// three memory references, which is optimal for the given representation.

								//

								// Future note: the histogram tables could be pivoted to collect all the

								// scores for each n-gram in a block; that would eliminate the double

								// indirection through pH and reduce the memory refs to two per pass.

								{

									nScores++;		// makes faster end-test


									// Pre-decrement undoes the increment above on the first test, so the

									// body runs exactly nScores times (zero times when nScores was 0).

									while (--nScores != 0)

										*pS++ += (*pH++)[idx];

								}


								static inline void
								ScoreUnigramVector (LPCSTR pcsz, int nCh, PHistogram pH,
									int *paS, const PHElt *paH, unsigned int nScores)
								//
								// Unigram scoring.  Every byte of the text is translated through the
								// histogram's character map, and the score vector of the resulting
								// histogram slot is added to the running sums in paS.
								//
								// Does nothing when nCh is less than 1.
								{
									if (nCh < 1)
										return;

									const PHIdx pMap = pH->GetMap();
									const unsigned char *pb = (const unsigned char *)pcsz;

									for (int n = 0; n < nCh; n++)
										AddVector (paS, paH, pMap[pb[n]], nScores);
								}


								static inline void
								ScoreUnigramVectorW (LPCWSTR pcwsz, int nCh, PHistogram pH,
									int *paS, const PHElt *paH, unsigned int nScores)
								//
								// WCHAR flavor of unigram scoring.  Identical to the byte version except
								// that the map is indexed by WCHAR values, i.e. it maps the full 64K
								// WCHAR space into the histogram index range.
								//
								// Does nothing when nCh is less than 1.
								{
									if (nCh < 1)
										return;

									const PHIdx pMap = pH->GetMap();
									LPCWSTR pwEnd = pcwsz + nCh;

									for (LPCWSTR pw = pcwsz; pw < pwEnd; pw++)
										AddVector (paS, paH, pMap[*pw], nScores);
								}


								static inline void
								ScoreDigramVector (LPCSTR pcsz, int nCh, PHistogram pH,
									int *paS, const PHElt *paH, unsigned int nScores)
								//
								// Digram scoring.  Each adjacent pair of bytes is mapped to the index
								// range, and the two mapped values are combined (first * EdgeSize +
								// second) into the array index for that digram.  The scores in that
								// slot are summed into paS for each language.
								//
								// Does nothing when fewer than two characters are supplied.
								{
									if (nCh < 2)
										return;

									const PHIdx pMap = pH->GetMap();
									const unsigned char *pb = (const unsigned char *)pcsz;

									// Mapped value of the left-hand character of the current pair.
									unsigned char chPrev = pMap[pb[0]];

									for (int n = 1; n < nCh; n++)
									{
										const unsigned char chCur = pMap[pb[n]];

										AddVector (paS, paH, chPrev * pH->EdgeSize() + chCur, nScores);

										chPrev = chCur;
									}
								}


								static inline void
								ScoreTrigramVector (LPCSTR pcsz, int nCh, PHistogram pH,
									int *paS, const PHElt *paH, unsigned int nScores)
								//
								// Trigram scoring.  Each run of three adjacent bytes is mapped to the
								// index range and the three mapped values are combined into the array
								// index for that trigram; the scores in that slot are summed into paS.
								//
								// Does nothing when fewer than three characters are supplied.
								{
									if (nCh < 3)
										return;

									const PHIdx pMap = pH->GetMap();
									const unsigned char *pb = (const unsigned char *)pcsz;

									// Mapped values of the two leading characters of the current window.
									unsigned char chA = pMap[pb[0]];
									unsigned char chB = pMap[pb[1]];

									for (int n = 2; n < nCh; n++)
									{
										const unsigned char chC = pMap[pb[n]];
										debug(printf("  '%c%c%c':",unmapch(chA),unmapch(chB),unmapch(chC)));

										const int idx = ((chA * pH->EdgeSize()) + chB) * pH->EdgeSize() + chC;

										// Slide the window forward by one character.
										chA = chB;
										chB = chC;

										AddVector (paS, paH, idx, nScores);

										debug(for (UINT i = 0; i < nScores; i++) printf(" %3d", paH[i][idx]));
										debug(printf("\n"));
									}
								}


								static inline void
								ScoreTrigramVectorW (LPCWSTR pcwsz, int nCh, PHistogram pH,
									int *paS, const PHElt *paH, unsigned int nScores)
								//
								// WCHAR flavor of trigram scoring: the map is indexed by WCHAR values,
								// otherwise the index computation matches the byte version.
								//
								// Does nothing when fewer than three characters are supplied.
								{
									if (nCh < 3)
										return;

									const PHIdx pMap = pH->GetMap();

									// Mapped values of the two leading characters of the current window.
									unsigned char chA = pMap[pcwsz[0]];
									unsigned char chB = pMap[pcwsz[1]];

									for (int n = 2; n < nCh; n++)
									{
										const unsigned char chC = pMap[pcwsz[n]];

										const int idx = ((chA * pH->EdgeSize()) + chB) * pH->EdgeSize() + chC;

										// Slide the window forward by one character.
										chA = chB;
										chB = chC;

										AddVector (paS, paH, idx, nScores);
									}
								}


								static inline void
								ScoreNgramVector (LPCSTR pcsz, int nCh, PHistogram pH,
									int *paS, const PHElt *paH, unsigned int nScores)
								//
								// General n-gram scoring, with "N" taken from the dimensionality of
								// the histogram.
								//
								// Each adjacent n-letter set of characters is mapped to the index range
								// and the scores it references are summed for each language.  This code
								// is never used for the current data file; an optimized scoring loop
								// exists for each existing case.  It exists so that scoring with other
								// dimensions can be tried without requiring a new DLL.
								//
								// Does nothing when nCh is less than the histogram's dimensionality.
								{
									if (nCh < pH->Dimensionality())
										return;

									const PHIdx pMap = pH->GetMap();
									const unsigned char *pb = (const unsigned char *)pcsz;

									// Fill the pipeline: one leading character is consumed for each of
									// the dimension thresholds 2, 3 and 4 that the histogram reaches.

									int idx = 0;

									for (int nNeed = 2; nNeed <= 4; nNeed++)
									{
										if (pH->Dimensionality() >= nNeed)
											idx = idx * pH->EdgeSize() + pMap[*pb++];
									}

									const unsigned int nLoopCount = nCh - (pH->Dimensionality() - 1);

									for (unsigned int nDone = 0; nDone < nLoopCount; nDone++)
									{
										// Shift in the next character; the modulo keeps the rolling index
										// within the histogram's element count.
										idx = (idx * pH->EdgeSize() + pMap[*pb++]) % pH->NElts();

										AddVector (paS, paH, idx, nScores);
									}
								}


								static inline void
								ScoreNgramVectorW (LPCWSTR pcwsz, int nCh, PHistogram pH,
									int *paS, const PHElt *paH, unsigned int nScores)
								//
								// WCHAR flavor of general n-gram scoring: the map is indexed by WCHAR
								// values, otherwise the procedure matches the byte version.
								//
								// Does nothing when nCh is less than the histogram's dimensionality.
								{
									if (nCh < pH->Dimensionality())
										return;

									const PHIdx pMap = pH->GetMap();
									LPCWSTR pw = pcwsz;

									// Fill the pipeline: one leading character is consumed for each of
									// the dimension thresholds 2, 3 and 4 that the histogram reaches.

									int idx = 0;

									for (int nNeed = 2; nNeed <= 4; nNeed++)
									{
										if (pH->Dimensionality() >= nNeed)
											idx = idx * pH->EdgeSize() + pMap[*pw++];
									}

									const unsigned int nLoopCount = nCh - (pH->Dimensionality() - 1);

									for (unsigned int nDone = 0; nDone < nLoopCount; nDone++)
									{
										// Shift in the next character; the modulo keeps the rolling index
										// within the histogram's element count.
										idx = (idx * pH->EdgeSize() + pMap[*pw++]) % pH->NElts();

										AddVector (paS, paH, idx, nScores);
									}
								}


								void
								ScoreVector (LPCSTR pcsz, int nCh, PHistogram pH,
									int *paS, const PHElt *paH, unsigned int nScores)
								//
								// Score a string into an array of scores using an array of histograms.
								//
								// Each character n-gram is mapped to a histogram slot to yield a score
								// for that n-gram in each array at paH.  The n-gram size is taken from
								// the dimensionality of pH: 1, 2 and 3 have dedicated loops, anything
								// else goes through the general n-gram loop.
								//
								// On return, paS[0..nScores-1] is filled with the sum scores.
								{
									// Start from a zeroed sum vector; the scoring loops only accumulate.
									memset (paS, 0, sizeof(paS[0]) * nScores);

									const int nDim = pH->Dimensionality();

									if (nDim == 1)
										ScoreUnigramVector (pcsz, nCh, pH, paS, paH, nScores);
									else if (nDim == 2)
										ScoreDigramVector (pcsz, nCh, pH, paS, paH, nScores);
									else if (nDim == 3)
										ScoreTrigramVector (pcsz, nCh, pH, paS, paH, nScores);
									else
										ScoreNgramVector (pcsz, nCh, pH, paS, paH, nScores);
								}


								void
								ScoreVectorW (LPCWSTR pcwsz, int nCh, PHistogram pH,
									int *paS, const PHElt *paH, unsigned int nScores)
								//
								// WCHAR flavor: score a string into an array of scores using an array
								// of histograms.
								//
								// Dimensionality 1 and 3 have dedicated loops; every other value
								// (including 2) goes through the general n-gram loop.
								//
								// On return, paS[0..nScores-1] is filled with the sum scores.
								{
									// Start from a zeroed sum vector; the scoring loops only accumulate.
									memset (paS, 0, sizeof(paS[0]) * nScores);

									const int nDim = pH->Dimensionality();

									if (nDim == 1)
										ScoreUnigramVectorW (pcwsz, nCh, pH, paS, paH, nScores);
									else if (nDim == 3)
										ScoreTrigramVectorW (pcwsz, nCh, pH, paS, paH, nScores);
									else
										ScoreNgramVectorW (pcwsz, nCh, pH, paS, paH, nScores);
								}


								void
								LCDetect::Score7Bit (LPCSTR pcszText, int nChars, CScores &S) const
								//
								// 7-bit language detection.  Computes a raw score for every 7-bit
								// language and stores it in S at that language's base score index.
								//
								// Only the first score slot per language is filled in.  The slot is
								// located with GetScoreIdx(); the code page is not detected here and is
								// recorded as 0.
								{
									debug(printf("       "));
									debug(for(unsigned int x=0;x<GetN7BitLanguages();x++)printf(" %3d", Get7BitLanguage(x)->LanguageID()));
									debug(printf("\n"));

									// Raw score vector, one element per 7-bit language.  The histogram
									// handle of language 0 is the one passed to the scorer.

									int anRaw[MAXSCORES];

									ScoreVector (pcszText, nChars, Get7BitLanguage(0)->GetLangHistogram(),
											anRaw, m_paHElt7Bit, GetN7BitLanguages());

									// Transfer the raw scores into the CScores array.

									for (unsigned int iLang = 0; iLang < GetN7BitLanguages(); iLang++)
									{
										PLanguage7Bit pLang = Get7BitLanguage(iLang);
										CScore &slot = S.Ref(pLang->GetScoreIdx());

										slot.SetLang(pLang);
										slot.SetCodePage(0);
										slot.SetScore(anRaw[iLang]);
										slot.SetCharCount(nChars);
									}
								}


								void
								LCDetect::Score8Bit (LPCSTR pcszText, int nChars, CScores &S) const
								//
								// 8-bit detection.  Computes a combined language / code page score for
								// each language / code page combination of the 8-bit languages, and
								// stores every raw score in S at the language's score index plus the
								// code page's position.
								//
								// May store multiple entries in S for each language, one per code page.
								{
									// Raw score vector, one element per language / code page combination.
									// The histogram handle of language 0, code page 0 is the one passed
									// to the scorer.

									int anRaw[MAXSCORES];

									ScoreVector (pcszText, nChars, Get8BitLanguage(0)->GetHistogram(0),
											anRaw, m_paHElt8Bit, m_nHElt8Bit);

									// Transfer the raw scores into the CScores array.  The raw vector is
									// consumed sequentially: languages in order, and within each language
									// its code pages in order.

									const int *pnNext = anRaw;

									for (unsigned int iLang = 0; iLang < GetN8BitLanguages(); iLang++)
									{
										PLanguage8Bit pLang = Get8BitLanguage(iLang);

										for (int iCP = 0; iCP < pLang->NCodePages(); iCP++)
										{
											CScore &slot = S.Ref(pLang->GetScoreIdx() + iCP);

											slot.SetLang(pLang);
											slot.SetCodePage(pLang->GetCodePage(iCP));
											slot.SetScore( *pnNext++ );
											slot.SetCharCount(nChars);
										}
									}
								}


								void
								LCDetect::ScoreLanguageAsSBCS (LPCWSTR wcs, int nch, CScores &S) const
								//
								// Scores Unicode text known to contain mostly characters in the script
								// ranges used for 7-bit languages.  A special mapping, m_pHU27Bit,
								// converts n-grams in the WCHAR text directly to the same mapping output
								// space used for 7-bit language detection.  The text is then scored
								// using the same language-only histograms used for 7-bit SBCS detection.
								//
								// The output is the same as if Score7Bit() had been called on the SBCS
								// equivalent to this text.  The same slots in S are filled in, using the
								// 7-bit score indices, NOT the Unicode language score indices.
								{
									debug(printf("    scoring as SBCS\n"));

									debug(printf("       "));
									debug(for(unsigned int x=0;x<GetN7BitLanguages();x++)printf(" %3d", Get7BitLanguage(x)->LanguageID()));
									debug(printf("\n"));

									// Raw score vector, one element per 7-bit language, computed by
									// ScoreVectorW() with the histogram set up for the WCHAR map.

									int anRaw[MAXSCORES];

									ScoreVectorW (wcs, nch, m_pHU27Bit, anRaw, m_paHElt7Bit,GetN7BitLanguages());

									// Transfer the raw scores into the CScores array.

									for (unsigned int iLang = 0; iLang < GetN7BitLanguages(); iLang++)
									{
										PLanguage7Bit pLang = Get7BitLanguage(iLang);
										CScore &slot = S.Ref(pLang->GetScoreIdx());

										slot.SetLang(pLang);
										slot.SetCodePage(0);
										slot.SetScore(anRaw[iLang]);
										slot.SetCharCount(nch);
									}
								}


								////////////////////////////////////////////////////////////////


								void
								Language::ScoreCodePage (LPCSTR, int nCh, CScore &S, int &idx) const
								//
								// Default handler for scoring the code page of text whose language is
								// already known.  Initially used only for Unicode.
								//
								// No detection is performed: the text and nCh are ignored, S's code
								// page is set to 0 and idx is set to 0.
								{
									S.SetCodePage(0);
									idx = 0;
								}


								void
								Language7Bit::ScoreCodePage (LPCSTR pStr, int nCh, CScore &S, int &idx) const
								//
								// Detect the code page for text whose language has already been detected
								// and is indicated in S.  Set S.CodePage(), do not change other
								// fields of S.
								//
								//   pStr - text to examine
								//   nCh  - number of characters at pStr
								//   S    - score entry whose code page is set to the detected code page
								//   idx  - receives the index of the high-scoring code page.  The caller
								//          uses this to place the score in the correct ScoreIdx slot.
								//
								// Note that the arg is a single CScore, not an array.  Only the code page
								// of the high-scoring code page is recorded in S; no information about
								// the other code pages is returned.
								{
									if (NCodePages() == 1)
									{
										// If lang is trained with only one codepage, just return it.
										// Return here so the text is not scored needlessly; with a
										// single candidate, scoring could only select index 0 again.

										idx = 0;
										S.SetCodePage(GetCodePage(0));

										debug(printf("  score code page: only one; cp=%d\n",GetCodePage(0)));

										return;
									}

									debug(printf("scoring 7-bit code pages: "));

									int sc[MAXSUBLANG];

									// Compute the raw score vector, one element per code page

									ScoreVector (pStr, nCh, GetCodePageHistogram(0),
											sc, GetPHEltArray(), NCodePages());

									// Find the high-scoring code page and record it in S

									idx = FindHighIdx (sc, NCodePages());

									debug(printf("selecting cp=%d idx=%d\n", GetCodePage(idx), idx));

									S.SetCodePage (GetCodePage(idx));
								}


								void
								LanguageUnicode::ScoreSublanguages (LPCWSTR wcs, int nch, CScores &S) const
								//
								// Score wcs for each sub-language of this language group and store the
								// raw scores in S at each sub-language's score index.  The scores are
								// not qualified at this time.
								//
								// Relevant only for Unicode language groups that require subdetection,
								// initially CJK.  Does nothing when the group has no sub-languages.
								{
									if (m_nSubLangs == 0)
										return;

									debug(printf("    scoring Unicode sublanguages:\n"));

									// Raw score vector, one element per sub-language.

									int anRaw[MAXSUBLANG];

									ScoreVectorW (wcs, nch, GetHistogram(0), anRaw, m_paHElt, m_nSubLangs);

									// Transfer the raw scores into the CScores array.

									for (int iSub = 0; iSub < NSubLangs(); iSub++)
									{
										PLanguageUnicode pSub = GetSublanguage(iSub);
										CScore &slot = S.Ref(pSub->GetScoreIdx());

										slot.SetLang (pSub);
										slot.SetScore (anRaw[iSub]);
										slot.SetCharCount (nch);
										slot.SetCodePage (0);

										debug(printf("      lang=%d score=%d\n", pSub->LanguageID(), anRaw[iSub]));
									}
								}


								int
								LCDetect::ChooseDetectionType (LPCSTR pcszText, int nChars) const
								//
								// Examine the raw byte values of a block to decide whether 7-bit or
								// 8-bit detection should be used for it.
								//
								// Returns DETECT_NOTDEFINED when the block holds fewer than 10
								// characters, DETECT_8BIT when twice the count of bytes >= 0x80 exceeds
								// the count of bytes < 0x80, and DETECT_7BIT otherwise.
								{
									// Count the bytes with the high bit set (>= 0x80); the rest are < 0x80.

									const unsigned char *pb = (const unsigned char *)pcszText;
									int nHi = 0;

									for (int n = 0; n < nChars; n++)
										nHi += pb[n] >> 7;

									const int nLo = nChars - nHi;

									// Make sure there is sufficient data to make a good choice

									// work here -- try  if abs(nHi - nLo) < 10

									if (nHi + nLo < 10)
										return DETECT_NOTDEFINED;

									return (nHi * 2 > nLo) ? DETECT_8BIT : DETECT_7BIT;
								}


								void
								LCDetect::ScoreLanguageA (LPCSTR pStr, int nChars, CScores &S) const
								//
								// Score the text at pStr for each language that it potentially contains,
								// storing the scores in S at the ScoreIdx() for each language and
								// codepage combination.
								//
								// Raw scores are produced for either all the 7-bit or all the 8-bit
								// entries, depending on which category the rough initial analysis
								// indicates.  At this time, there are no entries for which both methods
								// are required.
								//
								// For 7-bit detection, code page is always set to 0 and the language's
								// score is placed in the 0'th slot for each language.  The caller later
								// scores code pages if needed, and fills the remaining slots.
								//
								// For 8-bit detection, scores are generated for each code page and all
								// ScoreIdx() slots are used.
								{
									const int nType = ChooseDetectionType (pStr, nChars);

									if (nType == DETECT_7BIT)
										Score7Bit (pStr, nChars, S);
									else if (nType == DETECT_8BIT)
										Score8Bit (pStr, nChars, S);

									// Any other detection type leaves S untouched.
								}


								void
								LCDetect::ScoreLanguageW (LPCWSTR wcs, int nch, CScores &S, PCLCDConfigure pC) const
								//
								// Score the text at wcs for each language that it potentially contains.
								//
								//   wcs - Unicode text to score
								//   nch - number of WCHARs at wcs
								//   S   - receives the scores, at the ScoreIdx() for each language
								//   pC  - configuration; nRelativeThreshhold (a percentage) is read from
								//         it and it is dereferenced unconditionally
								//
								// This first determines the Unicode script groups represented in wcs.
								// Each WCHAR is mapped through CHARMAP_UNICODE to yield its "language group
								// ID".  The IDs for each char are counted and the top scoring IDs indicate
								// the probable languages or language groups.  Note that unlike all other
								// use of n-gram scoring, NO WEIGHTS are associated with the IDs -- whichever
								// group contains the most raw chars, wins.
								//
								// Some languages are indicated by presence of characters in a particular
								// script group; these scores are immediately added to S.
								//
								// Script groups that have sub-languages are subdetected as soon as the
								// group passes the minimum raw character count.  The Latin group is
								// subdetected (as SBCS) only when its count also reaches a threshhold
								// relative to the highest group count.  This is purely a performance
								// optimization, not to be confused with the uniform score threshhold
								// applied by the caller.
								//
								// The "Group" entries themselves are never included in the result; they
								// exist only to invoke subdetection.
								//
								// In many cases even a single Unicode character provides sufficient
								// identification of script and language, so there is no minimum
								// qualification for scores in the script ranges that indicate a
								// specific language by range alone.
								{
									// Score the chars according to the Unicode script group they belong to.
									// The array indices are the raw outputs of the primary Unicode Charmap
									// NOT to be confused with the ScoreIdx() of each language.  Further,
									// the scores are the simple count of the characters in each script
									// range, and are NOT weighted by any histogram.

									// In this initial step, the simple majority of characters per range
									// determines which further detection steps to take.

									const PHIdx map = GetMap (CHARMAP_UNICODE);

									// Zero the whole array, not just the first GetNUnicodeLanguages()
									// slots: the counting loop below indexes by raw map output, and no
									// slot it may touch should start out uninitialized.

									int anScore[MAXSCORES];
									memset (anScore, 0, sizeof(anScore));

									for (int x = 0; x < nch; x++)
										anScore[map[wcs[x]]]++;

									debug(printf("    char_ignore score=%d\n",anScore[HIDX_IGNORE]));

									// Ignore scores for chars that correlate with no language

									anScore[HIDX_IGNORE] = 0;

									// Identify the scores that qualify a language for immediate inclusion
									// in the result, or that qualify a language group for further detection.

									// Find the high score to use as a relative threshhold for inclusion.

									// The index is declared once here and shared by both loops below, so
									// the code does not depend on a for-header variable remaining visible
									// after its loop (which only legacy for-scope rules permit).

									unsigned int i;
									int nMaxScore = 0;

									for (i = 0; i < GetNUnicodeLanguages(); i++)
									{
										if (anScore[i] > nMaxScore)
											nMaxScore = anScore[i];
									}

									debug(printf("  unicode range max score=%d\n",nMaxScore));

									// Process all individual and group scores above a threshhold.

									// The threshhold logic is different from the logic for SBCS/DBCS
									// detection, because presence of even a single character in certain
									// Unicode script ranges can be a strong correct indicator for a
									// specific language.  The threshhold for subdetected scores is
									// higher, since that is a statistical result; single characters
									// are not as strong an indicator.

									// Set the threshhold for subdetecting.

									int nRelThresh = 1 + (nMaxScore * pC->nRelativeThreshhold) / 100;

									for (i = 0; i < GetNUnicodeLanguages(); i++)
									{
										// Threshhold for any range is at least this many raw chars in range.

										if (anScore[i] >= 2)
										{
											PLanguageUnicode pL = GetUnicodeLanguage(i);

											debug(printf("  using lang=%d score=%d:\n", pL->LanguageID(), anScore[i]));

											if (pL->LanguageID() == LANGID_UNKNOWN)
											{
												// DO NOTHING -- text is an unknown language

												debug(printf("    lang=unknown\n"));

											}
											else if (pL->NSubLangs() > 0)
											{
												// Subdetect language within a Unicode group, and add all the
												// unqualified raw scores directly to S.

												pL->ScoreSublanguages (wcs, nch, S);
											}
											else if ( pL->LanguageID() == LANGID_LATIN_GROUP &&
												      anScore[i] >= nRelThresh )
											{
												// Subdetect Latin/Western languages, and add all the
												// unqualified raw scores to S.

												ScoreLanguageAsSBCS (wcs, nch, S);
											}
											else
											{
												debug(printf("    range identifies language\n"));

												// This range identifies a specific language; add it.
												// NOTE(review): a Latin-group entry below nRelThresh
												// also lands here -- confirm that is intended.

												CScore &s = S.Ref(pL->GetScoreIdx());
												s.SetLang (pL);
												s.SetScore (anScore[i] * UNICODE_DEFAULT_CHAR_SCORE);
												s.SetCharCount (nch);
												s.SetCodePage (0);
											}
										}
									}
								}


								/****************************************************************/


								DWORD

								LCDetect::DetectA (LPCSTR pStr, int nInputChars,

									PLCDScore paScores, int *pnScores,

									PCLCDConfigure pLCDC) const

								//

								// Do SBCS / DBCS detection.  Detect language and code page of pStr,

								// fill paScores[] with the result and set *pnScores to the result count.

								// On input, *pnScores is the available capacity of paScores.

								//

								// The text at pStr is broken into chunks, typically several hundred

								// bytes.

								//

								// In the first phase, each chunk is scored by language.  The scores for

								// a single chunk are qualified by both an absolute threshhold and by a

								// threshhold based on the high score of just that chunk.  Scores exceeding

								// the threshhold are remembered towards the second phase; other scores

								// are discarded.

								//

								// For each score that will be remembered, if a code page is not already

								// known for it then the code page for the chunk is determined and included

								// with the score.  Note that the score refers only to the language, NOT

								// to the confidence of the code page.

								//

								// In the second phase, the combined scores for all chunks are examined.

								// The scores are further qualified by a relative threshhold.  Only

								// languages with scores exceeding the threshhold are included in the

								// final result; the remainder are discarded.

								//

								// The two-step process is designed to yield good results for input containing

								// text in multiple languages, or containing a high portion of whitespace or

								// symbol characters that correlate with no language.  It also is designed

								// to optimally handle tie-cases whether due to similar languages or to

								// mixed-language input, and to avoid applying threshholds based on

								// absolute scores.

								//

								// The presumption is that each chunk, generally, represents text in a single

								// language, and no matter what the absolute high score is, its high score

								// most likely is for that language.  The point of the first phase is to

								// identify all the languages that are known with some confidence to be

								// represented in the text.  For a given chunk, multiple languages scores may

								// meet this criteria and be remembered towards the result.  Specifically,

								// when a tie occurs, BOTH scores are always included.  (Choosing just one

								// would be wrong too often to be worthwhile.)

								//

								// The point of the second phase is to filter out the noise allowed by the

								// first phase.

								{

									TScores<MAXSCORES> SChunk;		// Scores for one chunk at a time

									TScores<MAXSCORES> SAll;		// Qualified scores for ultimate result


									if (pLCDC == NULL)				// Use the default config if not specified

										pLCDC = &m_LCDConfigureDefault;


									if (*pnScores == 0)

										return NO_ERROR;


								#define MAX_INPUT (USHRT_MAX-1)

									// CScore.NChars() is a USHORT to save space+time, so only this # of chars

									// can be accepted per call or the scoring would overflow.


									nInputChars = min (nInputChars, MAX_INPUT);

									debug(printf("LCD_Detect: detecting %d chars\n", nInputChars));


									// The first loop processed fixed-size chunks and accumulates all the

									// credibly-detected languages in SAll.  This is the "coarse" accuracy

									// qualification:  detect the language of text blocks small enough to

									// typically be in *one* language, and remember only the highest scoring

									// language for that chunk.  Then generate a multivalued result that

									// shows the distribution of language in the doc, instead of simply

									// returning the dominant language.  This is necessary because it is

									// much harder to determine the sole language than to determine the

									// multivalued result.


									int nProcessed = 0;


									while (nProcessed < nInputChars)

									{

										SChunk.Reset();				// reset is cheaper than constructing


										// Process nChunkSize worth of text if that will leave at least

										// another nChunkSize piece for the final pass.  If that would

										// leave a smaller final chunk, go ahead and process the entire

										// remaining input.


										int nch = nInputChars - nProcessed;


										if (nch >= pLCDC->nChunkSize * 2)

											nch = pLCDC->nChunkSize;


										debug(printf("\nStarting chunk: %d ch\n\"%.*s\"\n", nch, nch, &pStr[nProcessed]));


										ScoreLanguageA (&pStr[nProcessed], nch, SChunk);


										// Compute the threshhold for inclusion of each score in the

										// overall result.


										int nRelThresh = 1 + (SChunk.FindHighScore().GetScore() * pLCDC->nRelativeThreshhold) / 100;

										int nThresh7 = max (pLCDC->nMin7BitScore * nch, nRelThresh);

										int nThresh8 = max (pLCDC->nMin8BitScore * nch, nRelThresh);


										debug(printf("high score=%d min7=%d thresh7=%d thresh8=%d\n", SChunk.FindHighScore().GetScore(),pLCDC->nMin7BitScore*nch,nThresh7,nThresh8));


										// Qualify each score, remember only scores well-above the noise.


										for (unsigned int i = 0; i < SChunk.NElts(); i++)

										{

											CScore &s = SChunk.Ref(i);

											PLanguage pL = s.GetLang();


								//			debug(if (s.GetScore()) printf("  raw: lang=%d score=%d cp=%d\n",pL->LanguageID(),s.GetScore(),s.GetCodePage()));


											if ( (s.GetScore() >= nThresh7 && pL->Type() == DETECT_7BIT) ||

												 (s.GetScore() >= nThresh8 && pL->Type() == DETECT_8BIT) )

											{

												debug(printf("    qual: lang=%d score=%d cp=%d\n",pL->LanguageID(),s.GetScore(),s.GetCodePage()));


												// If code page is not already set, detect it, and store

												// the score for this language using the scoreidx slot

												// for that code page.  Store no score in the slots for

												// other code pages for the same language.


												int idx = 0;


												if (s.GetCodePage() == 0)

													pL->ScoreCodePage (&pStr[nProcessed], nch, s, idx);


												// Remember this score for the overall results


												SAll.Ref(i + idx) += s;

											}

										}


										nProcessed += nch;

									}


									// SAll has entries for each unique { lang ID, code page }

									// with the char count and total raw score (not normalized per char)

									// for those chunks whose score qualifies as a confident result and

									// that contributed to the entry.


									// Select the top-scoring code page for each language

									// and remove all other code page scores.


									debug(printf("Selecting top-scoring code pages\n"));


									SAll.SelectCodePages ();


									// Sort by decreasing score


									SAll.SortByScore ();


									// Build the client return structure

									//		Language ID

									//		Code page

									//		Doc percent 0-100

									//		Confidence 0-100


									int nScoresReturned = 0;


									for (unsigned i = 0; i < SAll.NElts() && nScoresReturned < *pnScores; i++)

									{

										CScore &s = SAll.Ref(i);


										LCDScore R;


										R.nLangID = s.GetLang()->LanguageID();

										R.nCodePage = s.GetCodePage();


										// Percent of doc for which this language scored above the

										// confidence threshhold, even if not 1st place for that chunk.


										R.nDocPercent = (s.GetCharCount() * 100) / nProcessed;


										debug(printf("s.CharCount=%d nProcessed=%d\n", s.GetCharCount(), nProcessed));


										// Confidence is the raw score for all the chunks for which this

										// language was detected above the confidence threshhold, divided

										// by the number of characters in those chunks.


										R.nConfidence = s.GetScore() / s.GetCharCount();


										debug(printf("Examining: lang=%d cp=%d docpct=%d\n", R.nLangID, R.nCodePage, R.nDocPercent));


										// Return only scores for languages detected in over a

										// minimum % of the doc.


										if (R.nDocPercent > pLCDC->nDocPctThreshhold)

										{

											debug(printf("  returning score\n"));

											paScores[nScoresReturned++] = R;

										}

									}


									debug(printf("Returning %d scores\n", nScoresReturned));


									*pnScores = nScoresReturned;


									return NO_ERROR;

								}


								DWORD
								LCDetect::DetectW (LPCWSTR pwStr, int nInputChars,
									PLCDScore paScores, int *pnScores, PCLCDConfigure pLCDC) const
								//
								// WCHAR (Unicode) version of LCD_Detect.  Score the nInputChars wide
								// characters at pwStr into paScores, one score per language, sorted by
								// decreasing score.
								//
								// On input, *pnScores is the available capacity of paScores; on return
								// it is the number of entries actually written.  pLCDC supplies the
								// tuning parameters; NULL selects the detector's default configuration.
								// At most MAX_INPUT chars are examined.  Always returns NO_ERROR.
								//
								// The input is scored one chunk at a time.  A chunk's score for a
								// language is accumulated into the overall result only if it meets
								// both the per-type minimum score and a threshold relative to that
								// chunk's high score.  The accumulated scores are then filtered by the
								// percentage of the document in which each language qualified.
								{
									if (pLCDC == NULL)				// Use the default config if not specified
										pLCDC = &m_LCDConfigureDefault;

									// No room for any result: nothing to do.

									if (*pnScores == 0)
										return NO_ERROR;

									// CScore.NChars() is a USHORT to save space+time, so only this # of chars
									// can be accepted per call or the scoring would overflow.

									nInputChars = min (nInputChars, MAX_INPUT);
									debug(printf("LCD_DetectW: detecting %d chars\n", nInputChars));

									// Empty (or negative-length) input contains no language.  Report zero
									// scores here instead of falling through to the result loop, which
									// divides by the number of chars processed.

									if (nInputChars <= 0)
									{
										*pnScores = 0;
										return NO_ERROR;
									}

									TScores<MAXSCORES> SChunk;		// Raw score for one chunk at a time
									TScores<MAXSCORES> SAll;		// Qualifying scores for final result

									// SChunk is defined outside the loop since it's cheaper to Reset() it
									// than to reconstruct it each time.

									int nProcessed = 0;

									// Process one chunk of the input per loop

									while (nProcessed < nInputChars)
									{
										SChunk.Reset();

										// Process nChunkSize worth of text if that will leave at least
										// another nChunkSize piece for the final pass.  If that would
										// leave a smaller final chunk, go ahead and process the entire
										// remaining input.
										//
										// A non-positive nChunkSize disables chunking: the whole
										// remainder is taken as one chunk.  Without this check nch
										// would become zero or negative and the loop would never
										// advance (or would walk backwards through the buffer).

										int nch = nInputChars - nProcessed;

										if (pLCDC->nChunkSize > 0 && nch >= pLCDC->nChunkSize * 2)
											nch = pLCDC->nChunkSize;

										debug(printf("\nStarting chunk: %d ch\n", nch));

										// Compute the raw scores for the chunk.
										// This automatically includes the sub-detected language scores
										// for the Latin/Western group and Unicode groups when the group
										// itself scores above the inclusion threshold.
										// But, the sub-detected scores themselves still need to be
										// qualified.

										ScoreLanguageW (&pwStr[nProcessed], nch, SChunk, pLCDC);

										// Compute the threshold for inclusion of each score in the
										// overall result: the larger of the per-type minimum (scaled
										// by the chunk length) and a percentage of the chunk's high
										// score.

										int nRelThresh = 1 + (SChunk.FindHighScore().GetScore() * pLCDC->nRelativeThreshhold) / 100;
										int nThresh7 = max (pLCDC->nMin7BitScore * nch, nRelThresh);
										int nThreshU = max (pLCDC->nMinUnicodeScore * nch, nRelThresh);

										debug(printf("scores: nElts=%d rel=%d%% high=%d min=%d min7=%d minU=%d\n", SChunk.NElts(), pLCDC->nRelativeThreshhold, SChunk.FindHighScore().GetScore(), nRelThresh,nThresh7,nThreshU));

										// Qualify each score, remember only scores well-above the noise.

										for (unsigned int i = 0; i < SChunk.NElts(); i++)
										{
											CScore &s = SChunk.Ref(i);
											PLanguage pL = s.GetLang();

											if ( (s.GetScore() >= nThresh7 && pL->Type() == DETECT_7BIT) ||
												 (s.GetScore() >= nThreshU && pL->Type() == DETECT_UNICODE) )
											{
												debug(printf("    using lang=%d score=%d nch=%d\n",pL->LanguageID(),s.GetScore(),s.GetCharCount()));

												// Remember this score for the overall results

												SAll.Ref(i) += s;
											}
										}

										nProcessed += nch;
									}

									// SAll has entries for each unique language with char count and total
									// raw score (not normalized per char) for those chunks whose score
									// qualifies as a confident result.

									// SAll may contain entries only for 7-bit and Unicode languages,
									// at most one entry per unique Win32 language ID

									debug(printf("Selecting scores for result:\n"));

									// Sort by decreasing score

									SAll.SortByScore ();

									// Build the client return structure
									//		Language ID
									//		Code page
									//		Doc percent 0-100
									//		Confidence 0-100

									int nScoresReturned = 0;

									for (unsigned i = 0; i < SAll.NElts() && nScoresReturned < *pnScores; i++)
									{
										CScore &s = SAll.Ref(i);

										// An entry that accumulated no characters cannot be reported:
										// the confidence below is a per-character average and would
										// divide by zero.

										if (s.GetCharCount() == 0)
											continue;

										LCDScore R;

										R.nLangID = s.GetLang()->LanguageID();
										R.nCodePage = s.GetCodePage();

										// Percent of doc for which this language scored above the
										// confidence threshold, even if not 1st place for that chunk.
										// nProcessed is > 0 here because empty input returned early.

										R.nDocPercent = (s.GetCharCount() * 100) / nProcessed;

										// Confidence is the raw score for all the chunks for which this
										// language was detected above the confidence threshold, divided
										// by the number of characters in those chunks.

										R.nConfidence = s.GetScore() / s.GetCharCount();

										debug(printf("  testing: lang=%d nch=%d docpct=%d\n", R.nLangID,s.GetCharCount(),R.nDocPercent));

										// Return only scores for languages detected in over a
										// minimum % of the doc.

										if (R.nDocPercent > pLCDC->nDocPctThreshhold)
										{
											debug(printf("  returning score\n"));
											paScores[nScoresReturned++] = R;
										}
									}

									debug(printf("Returning %d scores\n", nScoresReturned));

									*pnScores = nScoresReturned;

									return NO_ERROR;
								}


								/****************************************************************/

								/****************************************************************/


								#if 0
								// Export functions
								//
								// DLL entry point (currently compiled out).
								//
								// On process attach, creates the per-process LCDetect object, loads its
								// state and publishes it in g_pLCDetect; returns FALSE if the object
								// cannot be created or its state cannot be loaded.  On process detach,
								// destroys the object and clears g_pLCDetect.  Thread attach/detach
								// notifications need no work.

								BOOL APIENTRY
								DllMain (HANDLE hM, DWORD ul_reason, LPVOID lpReserved)
								{
									if (ul_reason == DLL_PROCESS_ATTACH)
									{
										DisableThreadLibraryCalls( (HINSTANCE)hM );

										LCDetect *pDetector = new LCDetect ( (HMODULE)hM );
										if (pDetector == NULL)
											return FALSE;

										// Do not publish a detector whose state failed to load.
										if (pDetector->LoadState() != NO_ERROR)
										{
											delete pDetector;
											return FALSE;
										}

										g_pLCDetect = pDetector;
									}
									else if (ul_reason == DLL_PROCESS_DETACH)
									{
										// Deleting a null pointer is a no-op, so no explicit check.
										delete g_pLCDetect;
										g_pLCDetect = NULL;
									}

									// DLL_THREAD_ATTACH / DLL_THREAD_DETACH and anything else: no work.
									return TRUE;
								}
								#endif


								extern "C" void WINAPI
								LCD_GetConfig (PLCDConfigure pLCDC)
								//
								// Copy the configuration returned by the global detector's GetConfig()
								// into *pLCDC.
								//
								// *pLCDC is left untouched when the detector has not been created
								// (g_pLCDetect is NULL).  A NULL pLCDC is ignored rather than
								// dereferenced.
								{
									if (g_pLCDetect == NULL || pLCDC == NULL)
										return;

									*pLCDC = g_pLCDetect->GetConfig();
								}


								extern "C" DWORD WINAPI
								LCD_Detect (LPCSTR pStr, int nInputChars,
									PLCDScore paScores, int *pnScores,
									PCLCDConfigure pLCDC)
								//
								// Exported SBCS / DBCS entry point.
								//
								// Score into paScores, one score per language, "qualifying" scores only.
								// Return ranked by decreasing score.
								//
								// Forwards to the global detector's DetectA() and returns its result,
								// or ERROR_INVALID_FUNCTION if the global detector does not exist.
								{
									LCDetect *pDetector = g_pLCDetect;

									return (pDetector != NULL)
										? pDetector->DetectA(pStr, nInputChars, paScores, pnScores, pLCDC)
										: ERROR_INVALID_FUNCTION;
								}


								extern "C" DWORD WINAPI
								LCD_DetectW (LPCWSTR wcs, int nInputChars,
									PLCDScore paScores, int *pnScores,
									PCLCDConfigure pLCDC)
								//
								// Exported WCHAR (Unicode) entry point.
								//
								// Forwards to the global detector's DetectW() and returns its result,
								// or ERROR_INVALID_FUNCTION if the global detector does not exist.
								{
									LCDetect *pDetector = g_pLCDetect;

									return (pDetector != NULL)
										? pDetector->DetectW(wcs, nInputChars, paScores, pnScores, pLCDC)
										: ERROR_INVALID_FUNCTION;
								}


								extern "C" void WINAPI
								LCD_SetDebug (int f)
								//
								// Store f in the global debug flag g_fDebug.
								// Does nothing unless the build defines DEBUG_LCDETECT.
								{
								#if defined(DEBUG_LCDETECT)
									g_fDebug = f;
								#else
									(void)f;		// debug support compiled out; flag is ignored
								#endif
								}