Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

478 lines
14 KiB

  1. /*
  2. * Automatic language and codepage detector
  3. *
  4. * Bob Powell, 2/97
  5. * Copyright (C) 1996, 1997, Microsoft Corp. All rights reserved.
  6. */
  7. #ifdef __cplusplus
  8. #include <wtypes.h>
  9. #include <limits.h>
  10. #include "lcdetect.h"
  11. #include "lccommon.h"
  12. #include <qsort.h>
  // Debug tracing support. Define DEBUG_LCDETECT in SOURCES to compile it in.
  //
  //   debug(x)   - runs statement x only when the g_fDebug flag is nonzero.
  //                In builds without DEBUG_LCDETECT it expands to nothing,
  //                so x is never evaluated.
  //   unmapch(x) - (debug builds only) yields 'a' + (x - 2) for x >= 2 and
  //                a space for smaller values.
  #ifndef DEBUG_LCDETECT
  #define debug(x)
  #else
  #include <stdio.h>
  extern int g_fDebug;
  #define debug(x) { if (g_fDebug) { x; }}
  #define unmapch(x) ((x) >= 2 ? (x)+'a'-2 : ' ')
  #endif
  // Forward declarations for the detector classes declared in this header.
  class LCDetect;
  class Language;
  class Language7Bit;
  class Language8Bit;
  class LanguageUnicode;
  class CScore;
  class CScores;

  // Pointer shorthands for the classes above.
  typedef LCDetect *PLCDetect;
  typedef Language *PLanguage;
  typedef Language7Bit *PLanguage7Bit;
  typedef Language8Bit *PLanguage8Bit;
  typedef LanguageUnicode *PLanguageUnicode;
  34. /****************************************************************/
  // Score threshold (weight * char count) for further processing
  #define MINRAWSCORE 100
  // Max possible simultaneous # of scores
  #define MAXSCORES 50
  38. /****************************************************************/
  39. // Histograms
  40. // A histogram stores an array of n-gram occurrence counts.
  41. // HElt stores the count, at present this is an unsigned char.
  42. // The in-memory structure is similar to the file.
  43. // The histogram array pointers m_panElts point into the mapped file image.
  44. class Histogram {
  45. public:
  46. Histogram (const PFileHistogramSection pHS, const PHIdx pMap);
  47. Histogram (const Histogram &H, const PHIdx pMap);
  48. virtual ~Histogram (void);
  49. DWORD Validate (DWORD nBytes) const;
  50. UCHAR Dimensionality (void) { return m_nDimensionality; }
  51. UCHAR EdgeSize (void) { return m_nEdgeSize; }
  52. USHORT CodePage (void) { return m_nCodePage; }
  53. USHORT GetRangeID (void) { return m_nRangeID; }
  54. USHORT NElts (void) { return m_nElts; }
  55. PHIdx GetMap (void) { return m_pMap; }
  56. HElt Ref (USHORT i1) const { return m_panElts[i1]; }
  57. HElt Ref (UCHAR i1, UCHAR i2) const {
  58. return m_panElts[(i1 * m_nEdgeSize) + i2]; }
  59. HElt Ref (UCHAR i1, UCHAR i2, UCHAR i3) const {
  60. return m_panElts[((i1 * m_nEdgeSize) + i2) * m_nEdgeSize + i3]; }
  61. HElt *Array (void) { return m_panElts; }
  62. protected:
  63. UCHAR m_nDimensionality; // 1=unigram, 2=digram etc.
  64. UCHAR m_nEdgeSize; // edge size (is a function of char map)
  65. union {
  66. USHORT m_nCodePage; // For 7 and 8-bit, is code page
  67. USHORT m_nRangeID; // For Unicode, is sub-language range ID
  68. };
  69. USHORT m_nElts; // (edge size ^ dimensionality)
  70. PHIdx m_pMap; // char/WCHAR to histogram idx mapping
  71. HElt *m_panElts; // array of elements / counts
  72. };
  73. typedef Histogram *PHistogram;
  74. /****************************************************************/
  75. // A Language object stores all the detection state for a given language,
  76. // i.e. primary language ID.
  77. class Language {
  78. public:
  79. // nCodePages is same as nSubLangs
  80. Language (PLCDetect pL, int nLangID, int nCodePages, int nRangeID = 0);
  81. virtual ~Language (void) { }
  82. virtual DWORD AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx) = 0;
  83. // Score the code pages for this language
  84. virtual void ScoreCodePage (LPCSTR, int nCh, CScore &S, int &idx) const;
  85. int LanguageID (void) const { return m_nLangID; }
  86. int NCodePages (void) const { return m_nCodePages; }
  87. int NSubLangs (void) const { return m_nSubLangs; }
  88. int RangeID (void) const { return m_nRangeID; }
  89. int GetScoreIdx (void) const { return m_nScoreIdx; }
  90. void SetScoreIdx (int nScoreIdx) { m_nScoreIdx = nScoreIdx; }
  91. virtual int GetCodePage (int n) const { return 0; }
  92. virtual int GetSublangRangeID (int n) const { return 0; }
  93. virtual int GetSublangID (int n) const { return 0; }
  94. virtual DetectionType Type (void) = 0;
  95. virtual Language7Bit const * GetLanguage7Bit (void) const { return NULL; }
  96. virtual Language8Bit const * GetLanguage8Bit (void) const { return NULL; }
  97. virtual LanguageUnicode const * GetLanguageUnicode (void) const { return NULL; }
  98. protected:
  99. PLCDetect m_pLC;
  100. int m_nLangID; // Win32 primary language ID
  101. int m_nRangeID; // Unicode range ID, for Unicode langs
  102. union {
  103. int m_nCodePages; // # of code pages trained for this language
  104. int m_nSubLangs;
  105. };
  106. int m_nScoreIdx; // Used to create a unique index into the score arrays
  107. // for each lang + cp combination, to eliminate the
  108. // need to search the arrays to merge scores. Add
  109. // the code page index to this to get the array index.
  110. };
  111. ////////////////////////////////////////////////////////////////
  112. class Language7Bit : public Language {
  113. public:
  114. Language7Bit (PLCDetect pL, int nLangID, int nCodePages);
  115. ~Language7Bit (void);
  116. DWORD AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx);
  117. void ScoreCodePage (LPCSTR, int nCh, CScore &S, int &idx) const;
  118. int GetCodePage (int n) const { return m_ppCodePageHistogram[n]->CodePage();}
  119. virtual DetectionType Type (void) { return DETECT_7BIT; }
  120. PHistogram GetLangHistogram (void) const { return m_pLangHistogram; }
  121. PHistogram GetCodePageHistogram (int i) const {
  122. return m_ppCodePageHistogram[i]; }
  123. virtual Language7Bit const * GetLanguage7Bit (void) const { return this; }
  124. const PHElt * GetPHEltArray (void) const { return m_paHElt; }
  125. private:
  126. PHistogram m_pLangHistogram;
  127. PHistogram m_ppCodePageHistogram[MAXSUBLANG];
  128. PHElt m_paHElt[MAXSUBLANG];
  129. };
  130. ////////////////////////////////////////////////////////////////
  131. class Language8Bit : public Language {
  132. public:
  133. Language8Bit (PLCDetect pL, int nLangID, int nCodePages);
  134. ~Language8Bit (void);
  135. DWORD AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx);
  136. int GetCodePage (int n) const { return m_ppHistogram[n]->CodePage(); }
  137. virtual DetectionType Type (void) { return DETECT_8BIT; }
  138. PHistogram GetHistogram (int i) const { return m_ppHistogram[i]; }
  139. virtual Language8Bit const * GetLanguage8Bit (void) const { return this; }
  140. private:
  141. PHistogram m_ppHistogram[MAXSUBLANG];
  142. };
  143. ////////////////////////////////////////////////////////////////
  144. class LanguageUnicode : public Language {
  145. public:
  146. LanguageUnicode (PLCDetect pL, int nLangID, int nRecordCount, int nRangeID);
  147. ~LanguageUnicode (void);
  148. DWORD AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx);
  149. void ScoreSublanguages (LPCWSTR wcs, int nch, CScores &S) const;
  150. int GetSublangRangeID (int i) const{return GetHistogram(i)->GetRangeID();}
  151. PLanguageUnicode GetSublanguage (int n) const;
  152. virtual DetectionType Type (void) { return DETECT_UNICODE; }
  153. PHistogram GetHistogram (int i) const { return m_ppSubLangHistogram[i]; }
  154. virtual LanguageUnicode const * GetLanguageUnicode (void) const {
  155. return this;
  156. }
  157. const PHElt * GetPHEltArray (void) const { return m_paHElt; }
  158. private:
  159. PHistogram m_ppSubLangHistogram[MAXSUBLANG];
  160. PHElt m_paHElt[MAXSUBLANG];
  161. };
  162. /****************************************************************/
  163. class Charmap {
  164. public:
  165. Charmap (PFileMapSection pMS) : m_nID(pMS->m_dwID), m_nSize(pMS->m_dwSize),
  166. m_nUnique(pMS->m_dwNUnique), m_pElts( (PHIdx) (&pMS[1]) ) { }
  167. // int ID (void) const { return m_nID; }
  168. int Size (void) const { return m_nSize; }
  169. int NUnique (void) const { return m_nUnique; }
  170. PHIdx Map (void) const { return m_pElts; }
  171. HIdx Map (WCHAR x) const { return m_pElts[x]; }
  172. private:
  173. int m_nID; // ID by which hardwired code finds the table
  174. int m_nSize; // size of table (256 or 65536)
  175. int m_nUnique; // # of unique output values
  176. PHIdx m_pElts;
  177. };
  178. typedef Charmap *PCharmap;
  179. /****************************************************************/
  180. // class CScore -- score for one lang and/or code page, variously used for
  181. // individual chunks and also for an entire document.
  182. class CScore {
  183. public:
  184. // Only these two slots need to be initialized
  185. CScore (void) : m_nScore(0), m_nChars(0) {}
  186. ~CScore (void) { }
  187. const PLanguage GetLang (void) const { return m_pLang; }
  188. int GetScore (void) const { return m_nScore; }
  189. unsigned short GetCodePage (void) const { return m_nCodePage; }
  190. unsigned short GetCharCount (void) const { return m_nChars; }
  191. void SetLang (PLanguage p) { m_pLang = p; }
  192. void SetScore (int x) { m_nScore = x; }
  193. void SetCharCount (unsigned x) { m_nChars = (unsigned short)x; }
  194. void SetCodePage (unsigned x) { m_nCodePage = (unsigned short)x; }
  195. void Add (CScore &S) {
  196. SetLang(S.GetLang());
  197. SetCodePage(S.GetCodePage());
  198. SetScore(GetScore() + S.GetScore());
  199. SetCharCount(GetCharCount() + S.GetCharCount());
  200. }
  201. CScore & operator += (CScore &S) { Add (S); return *this; }
  202. int operator <= (CScore &S) {
  203. // Special: always put 8-bit langs first since the code page
  204. // matters more for them.
  205. if (GetLang()->Type() != S.GetLang()->Type())
  206. return GetLang()->Type() == DETECT_8BIT ? -1 : 1;
  207. return GetScore() <= S.GetScore();
  208. }
  209. #ifdef DEBUG_LCDETECT
  210. void Print(void) {
  211. printf("Lang=%d CodePage=%d Score=%d NChars=%d\n",
  212. GetLang() ? GetLang()->LanguageID() : -1,
  213. GetCodePage(), GetScore(), GetCharCount());
  214. }
  215. #endif
  216. private:
  217. PLanguage m_pLang;
  218. int m_nScore;
  219. unsigned short m_nCodePage;
  220. unsigned short m_nChars;
  221. };
  222. typedef CScore *PScore;
  223. ////////////////////////////////////////////////////////////////
  224. // class CScores
  225. //
  226. // For SBCS detection, the index e.g. Ref(i) is the language+codepage index,
  227. // one of a contiguous set of values which identifies each unique supported
  228. // language and codepage combination.
  229. //
  230. // For DBCS detection, the index is just the Unicode language group.
  231. class CScores {
  232. public:
  233. CScores (int nAlloc, PScore p) : m_nAlloc(nAlloc), m_nUsed(0), m_p(p) { }
  234. virtual ~CScores (void) { }
  235. void Reset (void) {
  236. memset ((void *)m_p, 0, sizeof(CScore) * m_nUsed);
  237. m_nUsed = 0;
  238. }
  239. unsigned int &NElts (void) { return m_nUsed; }
  240. CScore &Ref (unsigned int n) {
  241. if (m_nUsed <= n)
  242. m_nUsed = n + 1;
  243. return m_p[n];
  244. }
  245. void SelectCodePages (void);
  246. void RemoveZeroScores (void) {
  247. for (unsigned int i = 0, j = 0; i < m_nUsed; i++)
  248. {
  249. if (m_p[i].GetScore() > MINRAWSCORE)
  250. m_p[j++] = m_p[i];
  251. }
  252. m_nUsed = j;
  253. }
  254. // Sort by decreasing score.
  255. // Instantiates template qsort using CScore::operator <=
  256. void SortByScore (void) {
  257. RemoveZeroScores ();
  258. if (m_nUsed)
  259. QSort (m_p, m_nUsed, FALSE);
  260. }
  261. CScore & FindHighScore (void) {
  262. int highscore = 0;
  263. for (unsigned int i = 0, highidx = 0; i < m_nUsed; i++) {
  264. if (m_p[i].GetScore() > highscore)
  265. {
  266. highscore = m_p[i].GetScore();
  267. highidx = i;
  268. }
  269. }
  270. return m_p[highidx];
  271. }
  272. protected:
  273. unsigned int m_nAlloc;
  274. unsigned int m_nUsed; // high water mark to optimize NElts(), Reset()
  275. PScore m_p; // score array, typically per TScores<NNN>
  276. };
  277. template<ULONG Size>class TScores : public CScores {
  278. public:
  279. TScores (void) : CScores (Size, m_S) { }
  280. virtual ~TScores (void) { }
  281. private:
  282. CScore m_S[Size];
  283. };
  284. ////////////////////////////////////////////////////////////////
  285. class LCDetect {
  286. public:
  287. LCDetect (HMODULE hM);
  288. ~LCDetect (void);
  289. unsigned int GetNCharmaps() const { return m_nCharmaps; }
  290. unsigned int GetN7BitLanguages() const { return m_n7BitLanguages; }
  291. unsigned int GetN8BitLanguages() const { return m_n8BitLanguages; }
  292. unsigned int GetNUnicodeLanguages() const { return m_nUnicodeLanguages; }
  293. PLanguage7Bit Get7BitLanguage (int i) const { return m_pp7BitLanguages[i]; }
  294. PLanguage8Bit Get8BitLanguage (int i) const { return m_pp8BitLanguages[i]; }
  295. PLanguageUnicode GetUnicodeLanguage (int i) const { return m_ppUnicodeLanguages[i]; }
  296. PHIdx GetMap (int i) const { return m_ppCharmaps[i]->Map(); }
  297. const LCDConfigure &GetConfig () const { return m_LCDConfigureDefault; }
  298. DWORD LoadState (void);
  299. DWORD DetectA (LPCSTR pStr, int nChars, PLCDScore paScores,
  300. int *pnScores, PCLCDConfigure pLCDC) const;
  301. DWORD DetectW (LPCWSTR wcs, int nInputChars, PLCDScore paScores,
  302. int *pnScores, PCLCDConfigure pLCDC) const;
  303. private:
  304. DWORD Initialize7BitLanguage (PFileLanguageSection pLS, PLanguage *ppL);
  305. DWORD Initialize8BitLanguage (PFileLanguageSection pLS, Language **ppL);
  306. DWORD InitializeUnicodeLanguage (PFileLanguageSection pLS,Language **ppL);
  307. DWORD LoadLanguageSection (void *pv, int nSectionSize, PLanguage *ppL);
  308. DWORD LoadHistogramSection (void *pv, int nSectionSize, Language *pL);
  309. DWORD LoadMapSection (void *pv, int nSectionSize);
  310. DWORD BuildState (DWORD nFileSize);
  311. void Score7Bit (LPCSTR pcszText, int nChars, CScores &S) const;
  312. void Score8Bit (LPCSTR pcszText, int nChars, CScores &S) const;
  313. int ScoreCodePage (LPCSTR pStr, int nChars, CScore &S) const;
  314. int ChooseDetectionType (LPCSTR pcszText, int nChars) const;
  315. void ScoreLanguageA (LPCSTR pStr, int nChars, CScores &S) const;
  316. void ScoreLanguageW (LPCWSTR wcs, int nChars, CScores &S, PCLCDConfigure) const;
  317. void ScoreLanguageAsSBCS (LPCWSTR wcs, int nch, CScores &S) const;
  318. void ScoreUnicodeSublanguages (PLanguageUnicode pL, LPCWSTR wcs,
  319. int nch, CScores &S) const;
  320. private:
  321. // Language training info virtual-mapped in training file
  322. unsigned int m_nCharmaps;
  323. unsigned int m_n7BitLanguages;
  324. unsigned int m_n8BitLanguages;
  325. unsigned int m_nUnicodeLanguages;
  326. PCharmap *m_ppCharmaps;
  327. PLanguage7Bit *m_pp7BitLanguages;
  328. PLanguage8Bit *m_pp8BitLanguages;
  329. PLanguageUnicode *m_ppUnicodeLanguages;
  330. // Cached information for the optimized scoring inner-loops.
  331. PHElt m_paHElt7Bit[MAX7BITLANG];
  332. PHElt m_paHElt8Bit[MAXSCORES];
  333. int m_nHElt8Bit;
  334. // Special 7-bit lang histogram for ScoreLanguageAsSBCS()
  335. PHistogram m_pHU27Bit;
  336. // Initialization state variables
  337. unsigned int m_n7BitLangsRead;
  338. unsigned int m_n8BitLangsRead;
  339. unsigned int m_nUnicodeLangsRead;
  340. unsigned int m_nMapsRead;
  341. int m_nHistogramsRead;
  342. int m_nScoreIdx;
  343. // Default configuration to use when NULL parameter passed to detect
  344. LCDConfigure m_LCDConfigureDefault;
  345. // File mapping information for the training data file
  346. HANDLE m_hf;
  347. HANDLE m_hmap;
  348. void *m_pv;
  349. HMODULE m_hModule;
  350. };
  351. ////////////////////////////////////////////////////////////////
  352. inline PLanguageUnicode
  353. LanguageUnicode::GetSublanguage (int n) const
  354. {
  355. return m_pLC->GetUnicodeLanguage(GetSublangRangeID(n));
  356. }
  357. #endif // __cplusplus