Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1278 lines
36 KiB

  1. /*
  2. * Automatic language and codepage detector
  3. *
  4. * Copyright (C) 1996, 1997, Microsoft Corp. All rights reserved.
  5. *
  6. * History: 1-Feb-97 BobP Created
  7. * 5-Aug-97 BobP Added Unicode support and rewrote
  8. * scoring to use vector math.
  9. *
  10. * This is the runtime detector.
  11. *
  12. * See the comments in lcdcomp.cpp for a description of the compilation
  13. * process and training data format.
  14. *
  15. * See design.txt for a description of the detection and scoring algorithm.
  16. *
  17. * Performance note: 60-80% of execution time in this code is AddVector(),
  18. * which is probably memory-cycle bound by its random data access, but is
  19. * still a candidate for further optimizing with an intrinsic vector operator,
  20. * should one become available.
  21. *
  22. * to-do (as needed):
  23. * - Adjust 7-bit and 8-bit scores to make them more comparable
  24. * - detect UTF-8 in the SBCS entry point, via heuristic and via
  25. * subdetection as 7-bit lang and as Unicode.
  26. */
  27. #include "private.h"
  28. // This is all the global (per-process) state
  29. //
  30. // It is set at DLL process init and its contents are const after that.
  31. LCDetect * g_pLCDetect;
  32. #ifdef DEBUG_LCDETECT
  33. int g_fDebug;
  34. #endif
  35. /****************************************************************/
  static inline unsigned int
  FindHighIdx (const int *pn, unsigned int n)
  //
  // Return the INDEX of the highest-valued integer in pn[0..n-1].
  //
  // Ties resolve to the lowest index.  An empty array (n == 0) yields 0.
  //
  // The running maximum is seeded from pn[0] rather than from the constant
  // 0, so an array that holds only negative values still reports the index
  // of its true maximum instead of always reporting slot 0.
  {
      if (n == 0)
          return 0;

      unsigned int nIdx = 0;
      int nMax = pn[0];
      for (unsigned int i = 1; i < n; i++)
      {
          // Strict '>' keeps the first of several equal maxima.
          if (pn[i] > nMax)
          {
              nMax = pn[i];
              nIdx = i;
          }
      }
      return nIdx;
  }
  53. /****************************************************************/
  54. void
  55. CScores::SelectCodePages (void)
  56. //
  57. // Find the highest scoring code page for each language, and remove
  58. // all the other scores from the array such that the array contains
  59. // exactly one score per detected language instead of one score per
  60. // code page per language.
  61. //
  62. // When multiple scores are present for different code pages of the same
  63. // language, this function combines the scores into a single score.
  64. // The resulting entry will have the code page of the top-scoring code page
  65. // for the various entries for that language, and the score and char count
  66. // will be the SUM of the scores and char counts for ALL the entries for
  67. // that language.
  68. //
  69. // For example, if the input contains:
  70. // Lang Codepage Score Char count
  71. // Russian 1251 42 200
  72. // Russian 20866 69 300
  73. //
  74. // Then on output, the array will contain only one score for Russian:
  75. // Russian 20866 111 500
  76. //
  77. // This overwrites the entries in place, and sets m_nUsed to the resulting
  78. // number of active slots.
  79. //
  80. // The scores are already grouped by language, no need to sort by language.
  81. //
  82. // After return, the score array must NOT be referenced via ScoreIdx()
  83. // because the index of the entries has changed.
  84. {
  85. // The score indices no longer matter, remove slots that scored zero.
  86. RemoveZeroScores ();
  87. if (m_nUsed == 0)
  88. return;
  89. // Select top score per language. This is fundamentally dependent
  90. // on the score array already being ordered by language. This won't
  91. // combine scores for the same language as both a 7-bit and 8-bit lang,
  92. // but that's not worth fixing.
  93. int maxscore = 0; // highest score for a given language
  94. int totalscore = m_p[0].GetScore(); // sum of scores " "
  95. int totalchars = m_p[0].GetCharCount();// sum of character counts " "
  96. int nReturned = 0; // index and ultimate count of elts returned
  97. unsigned int maxscoreidx = 0; // array index of the top-scoring code page,
  98. // *** for the current language ***
  99. for (unsigned int i = 1; i < m_nUsed; i++) {
  100. if (m_p[i-1].GetLang() != m_p[i].GetLang())
  101. {
  102. // [i] indicates a different language from the previous entry
  103. // Add the entry for the previous language to the result
  104. // by copying the slot for its highest-scoring code page,
  105. // and overwriting its score and char count with the sum counts.
  106. m_p[maxscoreidx].SetScore(totalscore);
  107. m_p[maxscoreidx].SetCharCount(totalchars);
  108. m_p[nReturned++] = m_p[maxscoreidx];
  109. // Start remembering the top and total scores for the new lang.
  110. maxscore = m_p[i].GetScore();
  111. totalscore = m_p[i].GetScore();
  112. totalchars = m_p[i].GetCharCount();
  113. maxscoreidx = i; // remember which [] had the top score
  114. }
  115. else
  116. {
  117. // Accumulate more scores for the same language
  118. if (m_p[i].GetScore() > maxscore) {
  119. maxscore = m_p[i].GetScore();
  120. maxscoreidx = i;
  121. }
  122. totalscore += m_p[i].GetScore();
  123. totalchars += m_p[i].GetCharCount();
  124. }
  125. }
  126. // Process the the last language. Return the slot from its
  127. // highest-scoring code page.
  128. if (m_nUsed > 0)
  129. {
  130. m_p[maxscoreidx].SetScore(totalscore);
  131. m_p[maxscoreidx].SetCharCount(totalchars);
  132. m_p[nReturned++] = m_p[maxscoreidx];
  133. }
  134. m_nUsed = nReturned;
  135. }
  136. /****************************************************************/
  137. static void __fastcall
  138. AddVector (int *pS, const PHElt *pH, int idx, unsigned int nScores)
  139. //
  140. // Add the score vector for a single n-gram to the running sum score
  141. // vector at pS.
  142. //
  143. // On return, paS[0..nScores-1] is filled with the sum scores for each
  144. // language.
  145. //
  146. // **** PERFORMANCE NOTE ****
  147. //
  148. // This is the critical inner-loop of the entire subsystem.
  149. //
  150. // Code generation and performance have been checked for various code
  151. // organization. Ironically, making AddVector() a true function is
  152. // FASTER than inlining it because when inlined, the registers are used
  153. // for the OUTER loop variables and the inner loop here does approximately
  154. // twice as many memory references per pass.
  155. //
  156. // On x86, all four loop variables are registered, and each pass makes only
  157. // three memory references, which is optimal for the given representation.
  158. //
  159. // Future note: the histogram tables could be pivoted to collect all the
  160. // scores for each n-gram in a block; that would eliminate the double
  161. // indirection through ph and reduce the memory refs to two per pass.
  162. {
  163. nScores++; // makes faster end-test
  164. while (--nScores != 0)
  165. *pS++ += (*pH++)[idx];
  166. }
  167. static inline void
  168. ScoreUnigramVector (LPCSTR pcsz, int nCh, PHistogram pH,
  169. int *paS, const PHElt *paH, unsigned int nScores)
  170. //
  171. // Score this text for a unigram histogram. Each individual character is
  172. // mapped to a histogram slot to yield a score for that character in each
  173. // language.
  174. {
  175. if (nCh < 1)
  176. return;
  177. const PHIdx pMap = pH->GetMap();
  178. unsigned char *p = (unsigned char *)pcsz;
  179. while (nCh-- > 0)
  180. AddVector (paS, paH, pMap[*p++], nScores);
  181. }
  182. static inline void
  183. ScoreUnigramVectorW (LPCWSTR pcwsz, int nCh, PHistogram pH,
  184. int *paS, const PHElt *paH, unsigned int nScores)
  185. //
  186. // WCHAR version. Only difference is the use of a map that maps the
  187. // full 64K WCHAR space into the histogram index range.
  188. {
  189. if (nCh < 1)
  190. return;
  191. const PHIdx pMap = pH->GetMap();
  192. while (nCh-- > 0)
  193. AddVector (paS, paH, pMap[*pcwsz++], nScores);
  194. }
  195. static inline void
  196. ScoreDigramVector (LPCSTR pcsz, int nCh, PHistogram pH,
  197. int *paS, const PHElt *paH, unsigned int nScores)
  198. //
  199. // Score this text for a digram histogram. Each adjacent pair of characters
  200. // are mapped to the index range and the mapped values combined to form an
  201. // array index unique to that digram. The scores for that array slot are
  202. // summed for each language.
  203. {
  204. if (nCh < 2)
  205. return;
  206. unsigned char *p = (unsigned char *)pcsz;
  207. const PHIdx pMap = pH->GetMap();
  208. unsigned char ch1 = pMap[*p++];
  209. while (nCh-- > 1)
  210. {
  211. unsigned char ch2 = pMap[*p++];
  212. AddVector (paS, paH, ch1 * pH->EdgeSize() + ch2, nScores);
  213. ch1 = ch2;
  214. }
  215. }
  216. static inline void
  217. ScoreTrigramVector (LPCSTR pcsz, int nCh, PHistogram pH,
  218. int *paS, const PHElt *paH, unsigned int nScores)
  219. //
  220. // Score this text for a trigram histogram. Each adjacent three-letter set
  221. // of characters are mapped to the index range and the mapped values combined
  222. // to form an array index unique to that trgram.
  223. {
  224. if (nCh < 3)
  225. return;
  226. unsigned char *p = (unsigned char *)pcsz;
  227. const PHIdx pMap = pH->GetMap();
  228. unsigned char ch1 = pMap[*p++];
  229. unsigned char ch2 = pMap[*p++];
  230. while (nCh-- > 2)
  231. {
  232. unsigned char ch3 = pMap[*p++];
  233. debug(printf(" '%c%c%c':",unmapch(ch1),unmapch(ch2),unmapch(ch3)));
  234. int idx = ((ch1 * pH->EdgeSize()) + ch2) * pH->EdgeSize() + ch3;
  235. ch1 = ch2;
  236. ch2 = ch3;
  237. AddVector (paS, paH, idx, nScores);
  238. debug(for (UINT i = 0; i < nScores; i++) printf(" %3d", paH[i][idx]));
  239. debug(printf("\n"));
  240. }
  241. }
  242. static inline void
  243. ScoreTrigramVectorW (LPCWSTR pcwsz, int nCh, PHistogram pH,
  244. int *paS, const PHElt *paH, unsigned int nScores)
  245. //
  246. // WCHAR version.
  247. {
  248. if (nCh < 3)
  249. return;
  250. const PHIdx pMap = pH->GetMap();
  251. unsigned char ch1 = pMap[*pcwsz++];
  252. unsigned char ch2 = pMap[*pcwsz++];
  253. while (nCh-- > 2)
  254. {
  255. unsigned char ch3 = pMap[*pcwsz++];
  256. int idx = ((ch1 * pH->EdgeSize()) + ch2) * pH->EdgeSize() + ch3;
  257. ch1 = ch2;
  258. ch2 = ch3;
  259. AddVector (paS, paH, idx, nScores);
  260. }
  261. }
  262. static inline void
  263. ScoreNgramVector (LPCSTR pcsz, int nCh, PHistogram pH,
  264. int *paS, const PHElt *paH, unsigned int nScores)
  265. //
  266. // Score this text for any dimension of n-gram. Get "N" from the
  267. // dimensionality of the histogram.
  268. //
  269. // Each adjacent n-letter set of characters are mapped to the index range
  270. // and the scores the reference summed for each language. This code is
  271. // never used for the current data file, instead an optimized scoring
  272. // loop exists for each existing case. This exists to enable trying
  273. // different dimension scoring without requiring a new DLL.
  274. {
  275. if (nCh < pH->Dimensionality())
  276. return;
  277. unsigned char *p = (unsigned char *)pcsz;
  278. const PHIdx pMap = pH->GetMap();
  279. // Fill the pipeline
  280. int idx = 0;
  281. if (pH->Dimensionality() >= 2)
  282. idx = idx * pH->EdgeSize() + pMap[*p++];
  283. if (pH->Dimensionality() >= 3)
  284. idx = idx * pH->EdgeSize() + pMap[*p++];
  285. if (pH->Dimensionality() >= 4)
  286. idx = idx * pH->EdgeSize() + pMap[*p++];
  287. unsigned int nLoopCount = nCh - (pH->Dimensionality() - 1);
  288. while (nLoopCount-- > 0)
  289. {
  290. idx = (idx * pH->EdgeSize() + pMap[*p++]) % pH->NElts();
  291. AddVector (paS, paH, idx, nScores);
  292. }
  293. }
  294. static inline void
  295. ScoreNgramVectorW (LPCWSTR pcwsz, int nCh, PHistogram pH,
  296. int *paS, const PHElt *paH, unsigned int nScores)
  297. //
  298. // WCHAR version.
  299. {
  300. if (nCh < pH->Dimensionality())
  301. return;
  302. const PHIdx pMap = pH->GetMap();
  303. // Fill the pipeline
  304. int idx = 0;
  305. if (pH->Dimensionality() >= 2)
  306. idx = idx * pH->EdgeSize() + pMap[*pcwsz++];
  307. if (pH->Dimensionality() >= 3)
  308. idx = idx * pH->EdgeSize() + pMap[*pcwsz++];
  309. if (pH->Dimensionality() >= 4)
  310. idx = idx * pH->EdgeSize() + pMap[*pcwsz++];
  311. unsigned int nLoopCount = nCh - (pH->Dimensionality() - 1);
  312. while (nLoopCount-- > 0)
  313. {
  314. idx = (idx * pH->EdgeSize() + pMap[*pcwsz++]) % pH->NElts();
  315. AddVector (paS, paH, idx, nScores);
  316. }
  317. }
  318. void
  319. ScoreVector (LPCSTR pcsz, int nCh, PHistogram pH,
  320. int *paS, const PHElt *paH, unsigned int nScores)
  321. //
  322. // Score a string into an array of scores using an array of histograms
  323. //
  324. // Each character n-gram is mapped to a histogram slot to yield a score
  325. // for that character in each array at paH.
  326. //
  327. // On return, paS[0..nScores-1] is filled with the sum scores.
  328. {
  329. memset (paS, 0, sizeof(int) * nScores);
  330. switch (pH->Dimensionality())
  331. {
  332. case 1:
  333. ScoreUnigramVector (pcsz, nCh, pH, paS, paH, nScores);
  334. break;
  335. case 2:
  336. ScoreDigramVector (pcsz, nCh, pH, paS, paH, nScores);
  337. break;
  338. case 3:
  339. ScoreTrigramVector (pcsz, nCh, pH, paS, paH, nScores);
  340. break;
  341. default:
  342. ScoreNgramVector (pcsz, nCh, pH, paS, paH, nScores);
  343. break;
  344. }
  345. }
  346. void
  347. ScoreVectorW (LPCWSTR pcwsz, int nCh, PHistogram pH,
  348. int *paS, const PHElt *paH, unsigned int nScores)
  349. //
  350. // Score a string into an array of scores using an array of histograms.
  351. {
  352. memset (paS, 0, sizeof(int) * nScores);
  353. switch (pH->Dimensionality())
  354. {
  355. case 1:
  356. ScoreUnigramVectorW (pcwsz, nCh, pH, paS, paH, nScores);
  357. break;
  358. case 3:
  359. ScoreTrigramVectorW (pcwsz, nCh, pH, paS, paH, nScores);
  360. break;
  361. default:
  362. ScoreNgramVectorW (pcwsz, nCh, pH, paS, paH, nScores);
  363. break;
  364. }
  365. }
  366. void
  367. LCDetect::Score7Bit (LPCSTR pcszText, int nChars, CScores &S) const
  368. //
  369. // Do 7-bit language detection. Compute scores for all 7-bit languages
  370. // and store the raw language score in S at the language's base score-idx.
  371. //
  372. // Fill in only the first score slot per language. Uses ScoreIdx() for
  373. // the first code page, but does not detect or set the code page.
  374. {
  375. const PHistogram pH = Get7BitLanguage(0)->GetLangHistogram();
  376. debug(printf(" "));
  377. debug(for(unsigned int x=0;x<GetN7BitLanguages();x++)printf(" %3d", Get7BitLanguage(x)->LanguageID()));
  378. debug(printf("\n"));
  379. int sc[MAXSCORES];
  380. // Compute the raw score vector
  381. ScoreVector (pcszText, nChars, pH, sc, m_paHElt7Bit, GetN7BitLanguages());
  382. // Fill in the CScores array from it
  383. for (unsigned int i = 0; i < GetN7BitLanguages(); i++)
  384. {
  385. PLanguage7Bit pL = Get7BitLanguage(i);
  386. CScore &s = S.Ref(pL->GetScoreIdx());
  387. s.SetLang(pL);
  388. s.SetCodePage(0);
  389. s.SetScore(sc[i]);
  390. s.SetCharCount(nChars);
  391. }
  392. }
  393. void
  394. LCDetect::Score8Bit (LPCSTR pcszText, int nChars, CScores &S) const
  395. //
  396. // Do 8-bit detection. Compute a combined language / code page score
  397. // for each trained language / code page combination for the 8-bit languages.
  398. // Store all the raw scores in S at the language+each codepage score-idx.
  399. //
  400. // May store multiple entries in S for each language, one per code page.
  401. {
  402. const PHistogram pH = Get8BitLanguage(0)->GetHistogram(0);
  403. int sc[MAXSCORES];
  404. // Compute the raw score vector
  405. ScoreVector (pcszText, nChars, pH, sc, m_paHElt8Bit, m_nHElt8Bit);
  406. // Fill in the CScores array from it
  407. int nSc = 0;
  408. for (unsigned int i = 0; i < GetN8BitLanguages(); i++)
  409. {
  410. PLanguage8Bit pL = Get8BitLanguage(i);
  411. for (int j = 0; j < pL->NCodePages(); j++)
  412. {
  413. CScore &s = S.Ref(pL->GetScoreIdx() + j);
  414. s.SetLang(pL);
  415. s.SetCodePage(pL->GetCodePage(j));
  416. s.SetScore( sc[ nSc++ ] );
  417. s.SetCharCount(nChars);
  418. }
  419. }
  420. }
  421. void
  422. LCDetect::ScoreLanguageAsSBCS (LPCWSTR wcs, int nch, CScores &S) const
  423. //
  424. // This scores Unicode text known to contain mostly characters in the
  425. // script ranges used for 7-bit languages. This uses a special mapping,
  426. // m_pH727Bit, that converts n-grams in the WCHAR text directly to the same
  427. // mapping output space used for 7-bit language detection. It is then scored
  428. // using the same language-only histograms used for 7-bit SBCS detection.
  429. //
  430. // The output is the same as if Score7Bit() had been called on the SBCS
  431. // equivalent to this text. The same slots in S are filled in, using the
  432. // 7-bit score indices, NOT the Unicode language score indices.
  433. {
  434. debug(printf(" scoring as SBCS\n"));
  435. debug(printf(" "));
  436. debug(for(unsigned int x=0;x<GetN7BitLanguages();x++)printf(" %3d", Get7BitLanguage(x)->LanguageID()));
  437. debug(printf("\n"));
  438. // Call ScoreVectorW(), passing the histogram set up or the WCHAR map.
  439. int sc[MAXSCORES];
  440. // Compute the raw score vector
  441. ScoreVectorW (wcs, nch, m_pHU27Bit, sc, m_paHElt7Bit,GetN7BitLanguages());
  442. // Fill in the CScores array from it
  443. for (unsigned int i = 0; i < GetN7BitLanguages(); i++)
  444. {
  445. PLanguage7Bit pL = Get7BitLanguage(i);
  446. CScore &s = S.Ref(pL->GetScoreIdx());
  447. s.SetLang(pL);
  448. s.SetCodePage(0);
  449. s.SetScore(sc[i]);
  450. s.SetCharCount(nch);
  451. }
  452. }
  453. ////////////////////////////////////////////////////////////////
  454. void
  455. Language::ScoreCodePage (LPCSTR, int nCh, CScore &S, int &idx) const
  456. //
  457. // The default handler for scoring the code page for text for which the
  458. // language is already known. Initially used only for Unicode.
  459. {
  460. idx = 0;
  461. S.SetCodePage(0);
  462. }
  463. void
  464. Language7Bit::ScoreCodePage (LPCSTR pStr, int nCh, CScore &S, int &idx) const
  465. //
  466. // Detect the code page for text whose language has already been detected
  467. // and is indicated in S. Set S.CodePage(), do not change other
  468. // fields of S.
  469. //
  470. // Set idx to the index of the high-scoring code page. The caller uses this
  471. // to place the score in the correct ScoreIdx slot.
  472. //
  473. // Note that the arg is a single CScore, not an array. The CScore S is
  474. // filled in with the score of the high-scoring code page, and no information
  475. // about the other code pages is returned.
  476. {
  477. if (NCodePages() == 1)
  478. {
  479. // If lang is trained with only one codepage, just return it.
  480. idx = 0;
  481. S.SetCodePage(GetCodePage(0));
  482. debug(printf(" score code page: only one; cp=%d\n",GetCodePage(0)));
  483. }
  484. debug(printf("scoring 7-bit code pages: "));
  485. int sc[MAXSUBLANG];
  486. // Compute the raw score vector
  487. ScoreVector (pStr, nCh, GetCodePageHistogram(0),
  488. sc, GetPHEltArray(), NCodePages());
  489. // Find the high-scoring code page and fill in S with its values
  490. idx = FindHighIdx (sc, NCodePages());
  491. debug(printf("selecting cp=%d idx=%d\n", GetCodePage(idx), idx));
  492. S.SetCodePage (GetCodePage(idx));
  493. }
  494. void
  495. LanguageUnicode::ScoreSublanguages (LPCWSTR wcs, int nch, CScores &S) const
  496. //
  497. // Score wcs for each sub-language and add the raw scores to S.
  498. // The scores are not qualified at this time.
  499. //
  500. // Relevant only for Unicode language groups that require subdetection,
  501. // initially CJK.
  502. {
  503. if (m_nSubLangs == 0)
  504. return;
  505. debug(printf(" scoring Unicode sublanguages:\n"));
  506. int sc[MAXSUBLANG];
  507. // Compute the raw score vector
  508. ScoreVectorW (wcs, nch, GetHistogram(0), sc, m_paHElt, m_nSubLangs);
  509. // Fill in the CScores array from it
  510. for (int i = 0; i < NSubLangs(); i++)
  511. {
  512. PLanguageUnicode pSL = GetSublanguage(i);
  513. CScore &s = S.Ref(pSL->GetScoreIdx());
  514. s.SetLang (pSL);
  515. s.SetScore (sc[i]);
  516. s.SetCharCount (nch);
  517. s.SetCodePage (0);
  518. debug(printf(" lang=%d score=%d\n", pSL->LanguageID(), sc[i]));
  519. }
  520. }
  521. int
  522. LCDetect::ChooseDetectionType (LPCSTR pcszText, int nChars) const
  523. //
  524. // Histogram the raw char values to determine whether to use 7-bit or
  525. // 8-bit detection for this block.
  526. {
  527. // Count the proportion of chars < vs. >= 0x80
  528. int nHi = 0;
  529. for (int i = nChars; i-- > 0; )
  530. nHi += ((unsigned char)*pcszText++) & 0x80;
  531. nHi /= 0x80;
  532. int nLo = nChars - nHi;
  533. // Make sure there is sufficient data to make a good choice
  534. // work here -- try if abs(nHi - nLo) < 10
  535. if (nHi + nLo < 10)
  536. return DETECT_NOTDEFINED;
  537. if (nHi * 2 > nLo)
  538. return DETECT_8BIT;
  539. else
  540. return DETECT_7BIT;
  541. }
  542. void
  543. LCDetect::ScoreLanguageA (LPCSTR pStr, int nChars, CScores &S) const
  544. //
  545. //
  546. // Score the text at pStr for each language that it potentially contains.
  547. //
  548. // Add the scores to S at the ScoreIdx() for each language and codepage
  549. // combination.
  550. //
  551. // This adds all the raw scores for either all the 7-bit or all the
  552. // 8-bit entries, depending on which category the rough initial analysis
  553. // indicates. At this time, there are no entries for which both methods
  554. // are required.
  555. //
  556. // For 7-bit detection, code page is always set to 0 and the language's score
  557. // is placed in the 0'th slot for each language. The caller later scores
  558. // code pages if needed, and fills the remaining slots.
  559. //
  560. // For 8-bit detection, scores are generated for each code page and all
  561. // ScoreIdx() slots are used.
  562. {
  563. switch (ChooseDetectionType (pStr, nChars)) {
  564. case DETECT_7BIT:
  565. Score7Bit (pStr, nChars, S);
  566. break;
  567. case DETECT_8BIT:
  568. Score8Bit (pStr, nChars, S);
  569. break;
  570. }
  571. }
  572. void
  573. LCDetect::ScoreLanguageW (LPCWSTR wcs, int nch, CScores &S, PCLCDConfigure pC) const
  574. //
  575. // Score the text at wcs for each language that it potentially contains.
  576. //
  577. // Add the scores to S at the ScoreIdx() for each language.
  578. //
  579. // This first determines the Unicode script groups represented in wcs.
  580. // Each WCHAR is mapped through CHARMAP_UNICODE to yield its "language group
  581. // ID". The IDs for each char are counted and the top scoring IDs indicate
  582. // the probable languages or language groups. Note that unlike all other
  583. // use of n-gram scoring, NO WEIGHTS are associated with the IDs -- whichever
  584. // group contains the most raw chars, wins.
  585. //
  586. // Some languages are indicated by presence of characters in a particular
  587. // script group; these scores are immediately added to S.
  588. //
  589. // For script groups that indicate multiple languages, subdetection within
  590. // the group is done only when the score for the group exceeds a threshhold
  591. // that indicates the sub-detected languages are likely to be included in
  592. // the final result. This is purely a performance optimization, not to
  593. // be confused with the uniform score threshhold applied by the caller.
  594. //
  595. // The "Group" entries themselves are never included in the result; they
  596. // exist only to invoke subdetection.
  597. //
  598. // In many cases even a single Unicode character provides sufficient
  599. // identification of script and language, so there is no minimum
  600. // qualification for scores in the script ranges that indicate a
  601. // specific language by range alone.
  602. {
  603. // Score the chars according to the Unicode script group they belong to.
  604. // The array indices are the raw outputs of the primary Unicode Charmap
  605. // NOT to be confused with the ScoreIdx() of each language. Further,
  606. // the scores are the simple count of the characters in each script
  607. // range, and are NOT weighted by any histogram.
  608. // In this initial step, the simple majority of characters per range
  609. // determines which further detection steps to take.
  610. const PHIdx map = GetMap (CHARMAP_UNICODE);
  611. int anScore[MAXSCORES];
  612. memset (anScore, 0, sizeof(int) * GetNUnicodeLanguages());
  613. for (int x = 0; x < nch; x++)
  614. anScore[map[wcs[x]]]++;
  615. debug(printf(" char_ignore score=%d\n",anScore[HIDX_IGNORE]));
  616. // Ignore scores for chars that correlate with no language
  617. anScore[HIDX_IGNORE] = 0;
  618. // Identify the scores that qualify a language for immediate inclusion
  619. // in the result, or that qualify a language group for further detection.
  620. // Find the high score to use as a relative threshhold for inclusion.
  621. int nMaxScore = 0;
  622. for (unsigned int i = 0; i < GetNUnicodeLanguages(); i++)
  623. {
  624. if (anScore[i] > nMaxScore)
  625. nMaxScore = anScore[i];
  626. }
  627. debug(printf(" unicode range max score=%d\n",nMaxScore));
  628. // Process all individual and group scores above a threshhold.
  629. // The threshhold logic is different from the logic for SBCS/DBCS
  630. // detection, because presence of even a single character in certain
  631. // Unicode script ranges can be a strong correct indicator for a
  632. // specific language. The threshhold for subdetected scores is
  633. // higher, since that is a statistical result; single characters
  634. // are not as strong an indicator.
  635. // Set the threshhold for subdetecting.
  636. int nRelThresh = 1 + (nMaxScore * pC->nRelativeThreshhold) / 100;
  637. for (i = 0; i < GetNUnicodeLanguages(); i++)
  638. {
  639. // Threshhold for any range is at least this many raw chars in range.
  640. if (anScore[i] >= 2)
  641. {
  642. PLanguageUnicode pL = GetUnicodeLanguage(i);
  643. debug(printf(" using lang=%d score=%d:\n", pL->LanguageID(), anScore[i]));
  644. if (pL->LanguageID() == LANGID_UNKNOWN)
  645. {
  646. // DO NOTHING -- text is an unknown language
  647. debug(printf(" lang=unknown\n"));
  648. }
  649. else if (pL->NSubLangs() > 0)
  650. {
  651. // Subdetect language within a Unicode group, and add all the
  652. // unqualified raw scores directly to S.
  653. pL->ScoreSublanguages (wcs, nch, S);
  654. }
  655. else if ( pL->LanguageID() == LANGID_LATIN_GROUP &&
  656. anScore[i] >= nRelThresh )
  657. {
  658. // Subdetect Latin/Western languages, and add all the
  659. // unqualified raw scores to S.
  660. ScoreLanguageAsSBCS (wcs, nch, S);
  661. }
  662. else
  663. {
  664. debug(printf(" range identifies language\n"));
  665. // This range identifies a specific language; add it.
  666. CScore &s = S.Ref(pL->GetScoreIdx());
  667. s.SetLang (pL);
  668. s.SetScore (anScore[i] * UNICODE_DEFAULT_CHAR_SCORE);
  669. s.SetCharCount (nch);
  670. s.SetCodePage (0);
  671. }
  672. }
  673. }
  674. }
  675. /****************************************************************/
  676. DWORD
  677. LCDetect::DetectA (LPCSTR pStr, int nInputChars,
  678. PLCDScore paScores, int *pnScores,
  679. PCLCDConfigure pLCDC) const
  680. //
  681. // Do SBCS / DBCS detection. Detect language and code page of pStr,
  682. // fill paScores[] with the result and set *pnScores to the result count.
  683. // On input, *pnScores is the available capacity of paScores.
  684. //
  685. // The text at pStr is broken into chunks, typically several hundred
  686. // bytes.
  687. //
  688. // In the first phase, each chunk is scored by language. The scores for
  689. // a single chunk are qualified by both an absolute threshhold and by a
  690. // threshhold based on the high score of just that chunk. Scores exceeding
  691. // the threshhold are remembered towards the second phase; other scores
  692. // are discarded.
  693. //
  694. // For each score that will be remembered, if a code page is not already
  695. // known for it then the code page for the chunk is determined and included
  696. // with the score. Note that the score refers only to the language, NOT
  697. // to the confidence of the code page.
  698. //
  699. // In the second phase, the combined scores for all chunks are examined.
  700. // The scores are further qualified by a relative threshhold. Only
  701. // languages with scores exceeding the threshhold are included in the
  702. // final result; the remainder are discarded.
  703. //
  704. // The two-step process is designed to yield good results for input containing
  705. // text in multiple languages, or containing a high portion of whitespace or
  706. // symbol characters that correlate with no language. It also is designed
  707. // to optimally handle tie-cases whether due to similar languages or to
  708. // mixed-language input, and to avoid applying threshholds based on
  709. // absolute scores.
  710. //
  711. // The presumption is that each chunk, generally, represents text in a single
  712. // language, and no matter what the absolute high score is, its high score
  713. // most likely is for that language. The point of the first phase is to
  714. // identify all the languages that are known with some confidence to be
  715. // represented in the text. For a given chunk, multiple languages scores may
  716. // meet this criteria and be remembered towards the result. Specifically,
  717. // when a tie occurs, BOTH scores are always included. (Choosing just one
  718. // would be wrong too often to be worthwhile.)
  719. //
  720. // The point of the second phase is to filter out the noise allowed by the
  721. // first phase.
  722. {
  723. TScores<MAXSCORES> SChunk; // Scores for one chunk at a time
  724. TScores<MAXSCORES> SAll; // Qualified scores for ultimate result
  725. if (pLCDC == NULL) // Use the default config if not specified
  726. pLCDC = &m_LCDConfigureDefault;
  727. if (*pnScores == 0)
  728. return NO_ERROR;
  729. #define MAX_INPUT (USHRT_MAX-1)
  730. // CScore.NChars() is a USHORT to save space+time, so only this # of chars
  731. // can be accepted per call or the scoring would overflow.
  732. nInputChars = min (nInputChars, MAX_INPUT);
  733. debug(printf("LCD_Detect: detecting %d chars\n", nInputChars));
  734. // The first loop processed fixed-size chunks and accumulates all the
  735. // credibly-detected languages in SAll. This is the "coarse" accuracy
  736. // qualification: detect the language of text blocks small enough to
  737. // typically be in *one* language, and remember only the highest scoring
  738. // language for that chunk. Then generate a multivalued result that
  739. // shows the distribution of language in the doc, instead of simply
  740. // returning the dominant language. This is necessary because it is
  741. // much harder to determine the sole language than to determine the
  742. // multivalued result.
  743. int nProcessed = 0;
  744. while (nProcessed < nInputChars)
  745. {
  746. SChunk.Reset(); // reset is cheaper than constructing
  747. // Process nChunkSize worth of text if that will leave at least
  748. // another nChunkSize piece for the final pass. If that would
  749. // leave a smaller final chunk, go ahead and process the entire
  750. // remaining input.
  751. int nch = nInputChars - nProcessed;
  752. if (nch >= pLCDC->nChunkSize * 2)
  753. nch = pLCDC->nChunkSize;
  754. debug(printf("\nStarting chunk: %d ch\n\"%.*s\"\n", nch, nch, &pStr[nProcessed]));
  755. ScoreLanguageA (&pStr[nProcessed], nch, SChunk);
  756. // Compute the threshhold for inclusion of each score in the
  757. // overall result.
  758. int nRelThresh = 1 + (SChunk.FindHighScore().GetScore() * pLCDC->nRelativeThreshhold) / 100;
  759. int nThresh7 = max (pLCDC->nMin7BitScore * nch, nRelThresh);
  760. int nThresh8 = max (pLCDC->nMin8BitScore * nch, nRelThresh);
  761. debug(printf("high score=%d min7=%d thresh7=%d thresh8=%d\n", SChunk.FindHighScore().GetScore(),pLCDC->nMin7BitScore*nch,nThresh7,nThresh8));
  762. // Qualify each score, remember only scores well-above the noise.
  763. for (unsigned int i = 0; i < SChunk.NElts(); i++)
  764. {
  765. CScore &s = SChunk.Ref(i);
  766. PLanguage pL = s.GetLang();
  767. // debug(if (s.GetScore()) printf(" raw: lang=%d score=%d cp=%d\n",pL->LanguageID(),s.GetScore(),s.GetCodePage()));
  768. if ( (s.GetScore() >= nThresh7 && pL->Type() == DETECT_7BIT) ||
  769. (s.GetScore() >= nThresh8 && pL->Type() == DETECT_8BIT) )
  770. {
  771. debug(printf(" qual: lang=%d score=%d cp=%d\n",pL->LanguageID(),s.GetScore(),s.GetCodePage()));
  772. // If code page is not already set, detect it, and store
  773. // the score for this language using the scoreidx slot
  774. // for that code page. Store no score in the slots for
  775. // other code pages for the same language.
  776. int idx = 0;
  777. if (s.GetCodePage() == 0)
  778. pL->ScoreCodePage (&pStr[nProcessed], nch, s, idx);
  779. // Remember this score for the overall results
  780. SAll.Ref(i + idx) += s;
  781. }
  782. }
  783. nProcessed += nch;
  784. }
  785. // SAll has entries for each unique { lang ID, code page }
  786. // with the char count and total raw score (not normalized per char)
  787. // for those chunks whose score qualifies as a confident result and
  788. // that contributed to the entry.
  789. // Select the top-scoring code page for each language
  790. // and remove all other code page scores.
  791. debug(printf("Selecting top-scoring code pages\n"));
  792. SAll.SelectCodePages ();
  793. // Sort by decreasing score
  794. SAll.SortByScore ();
  795. // Build the client return structure
  796. // Language ID
  797. // Code page
  798. // Doc percent 0-100
  799. // Confidence 0-100
  800. int nScoresReturned = 0;
  801. for (unsigned i = 0; i < SAll.NElts() && nScoresReturned < *pnScores; i++)
  802. {
  803. CScore &s = SAll.Ref(i);
  804. LCDScore R;
  805. R.nLangID = s.GetLang()->LanguageID();
  806. R.nCodePage = s.GetCodePage();
  807. // Percent of doc for which this language scored above the
  808. // confidence threshhold, even if not 1st place for that chunk.
  809. R.nDocPercent = (s.GetCharCount() * 100) / nProcessed;
  810. debug(printf("s.CharCount=%d nProcessed=%d\n", s.GetCharCount(), nProcessed));
  811. // Confidence is the raw score for all the chunks for which this
  812. // language was detected above the confidence threshhold, divided
  813. // by the number of characters in those chunks.
  814. R.nConfidence = s.GetScore() / s.GetCharCount();
  815. debug(printf("Examining: lang=%d cp=%d docpct=%d\n", R.nLangID, R.nCodePage, R.nDocPercent));
  816. // Return only scores for languages detected in over a
  817. // minimum % of the doc.
  818. if (R.nDocPercent > pLCDC->nDocPctThreshhold)
  819. {
  820. debug(printf(" returning score\n"));
  821. paScores[nScoresReturned++] = R;
  822. }
  823. }
  824. debug(printf("Returning %d scores\n", nScoresReturned));
  825. *pnScores = nScoresReturned;
  826. return NO_ERROR;
  827. }
  828. DWORD
  829. LCDetect::DetectW (LPCWSTR pwStr, int nInputChars,
  830. PLCDScore paScores, int *pnScores, PCLCDConfigure pLCDC) const
  831. //
  832. // WCHAR (Unicode) version of LCD_Detect. Score into paScores, one score
  833. // per language.
  834. {
  835. if (pLCDC == NULL) // Use the default config if not specified
  836. pLCDC = &m_LCDConfigureDefault;
  837. if (*pnScores == 0)
  838. return NO_ERROR;
  839. // CScore.NChars() is a USHORT to save space+time, so only this # of chars
  840. // can be accepted per call or the scoring would overflow.
  841. nInputChars = min (nInputChars, MAX_INPUT);
  842. debug(printf("LCD_DetectW: detecting %d chars\n", nInputChars));
  843. TScores<MAXSCORES> SChunk; // Raw score for one chunk at a time
  844. TScores<MAXSCORES> SAll; // Qualifying scores for final result
  845. // SChunk is defined outside the loop since it's cheaper to Reset() it
  846. // than to reconstruct it each time.
  847. int nProcessed = 0;
  848. // Process one chunk of the input per loop
  849. while (nProcessed < nInputChars)
  850. {
  851. SChunk.Reset();
  852. // Process nChunkSize worth of text if that will leave at least
  853. // another nChunkSize piece for the final pass. If that would
  854. // leave a smaller final chunk, go ahead and process the entire
  855. // remaining input.
  856. int nch = nInputChars - nProcessed;
  857. if (nch >= pLCDC->nChunkSize * 2)
  858. nch = pLCDC->nChunkSize;
  859. debug(printf("\nStarting chunk: %d ch\n", nch));
  860. // Compute the raw scores for the chunk.
  861. // This automatically includes the sub-detected language scores
  862. // for the Latin/Western group and Unicode groups, <<< when the
  863. // group itself >>> scores above the inclusion threshhold.
  864. // But, the sub-detected scores themselves still need to be
  865. // qualified.
  866. ScoreLanguageW (&pwStr[nProcessed], nch, SChunk, pLCDC);
  867. // Compute the threshhold for inclusion of each score in the
  868. // overall result.
  869. int nRelThresh = 1 + (SChunk.FindHighScore().GetScore() * pLCDC->nRelativeThreshhold) / 100;
  870. int nThresh7 = max (pLCDC->nMin7BitScore * nch, nRelThresh);
  871. int nThreshU = max (pLCDC->nMinUnicodeScore * nch, nRelThresh);
  872. debug(printf("scores: nElts=%d rel=%d%% high=%d min=%d min7=%d minU=%d\n", SChunk.NElts(), pLCDC->nRelativeThreshhold, SChunk.FindHighScore().GetScore(), nRelThresh,nThresh7,nThreshU));
  873. // Qualify each score, remember only scores well-above the noise.
  874. for (unsigned int i = 0; i < SChunk.NElts(); i++)
  875. {
  876. CScore &s = SChunk.Ref(i);
  877. PLanguage pL = s.GetLang();
  878. if ( (s.GetScore() >= nThresh7 && pL->Type() == DETECT_7BIT) ||
  879. (s.GetScore() >= nThreshU && pL->Type() == DETECT_UNICODE) )
  880. {
  881. debug(printf(" using lang=%d score=%d nch=%d\n",pL->LanguageID(),s.GetScore(),s.GetCharCount()));
  882. // Remember this score for the overall results
  883. SAll.Ref(i) += s;
  884. }
  885. }
  886. nProcessed += nch;
  887. }
  888. // SAll has entries for each unique language with char count and total
  889. // raw score (not normalized per char) for those chunks whose score
  890. // qualifies as a confident result.
  891. // SAll may contain entries only for 7-bit and Unicode languages,
  892. // at most one entry per unique Win32 language ID
  893. debug(printf("Selecting scores for result:\n"));
  894. // Sort by decreasing score
  895. SAll.SortByScore ();
  896. // Build the client return structure
  897. // Language ID
  898. // Code page
  899. // Doc percent 0-100
  900. // Confidence 0-100
  901. int nScoresReturned = 0;
  902. for (unsigned i = 0; i < SAll.NElts() && nScoresReturned < *pnScores; i++)
  903. {
  904. CScore &s = SAll.Ref(i);
  905. LCDScore R;
  906. R.nLangID = s.GetLang()->LanguageID();
  907. R.nCodePage = s.GetCodePage();
  908. // Percent of doc for which this language scored above the
  909. // confidence threshhold, even if not 1st place for that chunk.
  910. R.nDocPercent = (s.GetCharCount() * 100) / nProcessed;
  911. // Confidence is the raw score for all the chunks for which this
  912. // language was detected above the confidence threshhold, divided
  913. // by the number of characters in those chunks.
  914. R.nConfidence = s.GetScore() / s.GetCharCount();
  915. debug(printf(" testing: lang=%d nch=%d docpct=%d\n", R.nLangID,s.GetCharCount(),R.nDocPercent));
  916. // Return only scores for languages detected in over a
  917. // minimum % of the doc.
  918. if (R.nDocPercent > pLCDC->nDocPctThreshhold)
  919. {
  920. debug(printf(" returning score\n"));
  921. paScores[nScoresReturned++] = R;
  922. }
  923. }
  924. debug(printf("Returning %d scores\n", nScoresReturned));
  925. *pnScores = nScoresReturned;
  926. return NO_ERROR;
  927. }
  928. /****************************************************************/
  929. /****************************************************************/
  #if 0

  // Export functions

  // DLL entry point (compiled out by the enclosing #if 0).
  //
  // On process attach, creates the per-process detector, loads its state and
  // publishes it in g_pLCDetect; attach fails (FALSE) if either step fails.
  // On process detach, destroys the detector and clears g_pLCDetect.
  // Thread attach/detach notifications need no work.
  BOOL APIENTRY
  DllMain (HANDLE hM, DWORD ul_reason, LPVOID lpReserved)
  {
      if (ul_reason == DLL_PROCESS_ATTACH)
      {
          DisableThreadLibraryCalls( (HINSTANCE)hM );

          LCDetect *pDetector = new LCDetect ( (HMODULE)hM );
          if (pDetector == NULL)
              return FALSE;

          if (pDetector->LoadState() != NO_ERROR)
          {
              // Don't leave a half-initialized detector behind.
              delete pDetector;
              return FALSE;
          }

          g_pLCDetect = pDetector;
      }
      else if (ul_reason == DLL_PROCESS_DETACH)
      {
          if (g_pLCDetect != NULL)
              delete (LCDetect *)g_pLCDetect;

          g_pLCDetect = NULL;
      }

      // DLL_THREAD_ATTACH, DLL_THREAD_DETACH and anything else: nothing to do.
      return TRUE;
  }

  #endif
  962. extern "C" void WINAPI
  963. LCD_GetConfig (PLCDConfigure pLCDC)
  964. {
  965. if (g_pLCDetect)
  966. *pLCDC = g_pLCDetect->GetConfig();
  967. }
  968. extern "C" DWORD WINAPI
  969. LCD_Detect (LPCSTR pStr, int nInputChars,
  970. PLCDScore paScores, int *pnScores,
  971. PCLCDConfigure pLCDC)
  972. //
  973. // Score into paScores, one score per language, "qualifying" scores only.
  974. // Return ranked by decreasing score.
  975. {
  976. if (g_pLCDetect == NULL)
  977. return ERROR_INVALID_FUNCTION;
  978. return g_pLCDetect->DetectA(pStr, nInputChars, paScores, pnScores, pLCDC);
  979. }
  980. extern "C" DWORD WINAPI
  981. LCD_DetectW (LPCWSTR wcs, int nInputChars,
  982. PLCDScore paScores, int *pnScores,
  983. PCLCDConfigure pLCDC)
  984. {
  985. if (g_pLCDetect == NULL)
  986. return ERROR_INVALID_FUNCTION;
  987. return g_pLCDetect->DetectW(wcs, nInputChars, paScores, pnScores, pLCDC);
  988. }
  989. extern "C" void WINAPI
  990. LCD_SetDebug (int f)
  991. {
  992. #ifdef DEBUG_LCDETECT
  993. g_fDebug = f;
  994. #endif
  995. }