Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

667 lines
16 KiB

  1. /*
  2. * Automatic language and codepage detector
  3. *
  4. * Bob Powell, 2/97
  5. * Copyright (C) 1996, 1997, Microsoft Corp. All rights reserved.
  6. *
  7. * History: 1-Feb-97 BobP Created
  8. * 5-Aug-97 BobP Unicode support; Charmaps in data file.
  9. */
  10. #include "private.h"
  11. /****************************************************************/
  12. Histogram::Histogram (const PFileHistogramSection pHS, const PHIdx pMap)
  13. : m_nDimensionality((UCHAR)pHS->m_dwDimensionality),
  14. m_nEdgeSize((UCHAR)pHS->m_dwEdgeSize),
  15. m_nCodePage((USHORT)pHS->m_dwCodePage),
  16. m_pMap(pMap),
  17. m_panElts((HElt *)&pHS[1]) // table follows header struct in the file
  18. {
  19. // #elements = #unique character values ^ #dimensions
  20. m_nElts = 1;
  21. for (UCHAR i = 0; i < m_nDimensionality; i++)
  22. m_nElts *= m_nEdgeSize;
  23. }
  24. DWORD
  25. Histogram::Validate (DWORD nBytes) const
  26. {
  27. if ( nBytes < m_nElts * sizeof(HElt) ||
  28. m_nDimensionality > 4 )
  29. {
  30. return ERROR_INTERNAL_DB_CORRUPTION;
  31. }
  32. return NO_ERROR;
  33. }
  34. Histogram::Histogram (const Histogram &H, const PHIdx pMap)
  35. : m_nDimensionality(H.m_nDimensionality),
  36. m_nEdgeSize(H.m_nEdgeSize),
  37. m_nCodePage(H.m_nCodePage),
  38. m_nElts(H.m_nElts),
  39. m_pMap(pMap),
  40. m_panElts(H.m_panElts)
  41. //
  42. // Clone a histogram but use a different Charmap.
  43. {
  44. }
  45. Histogram::~Histogram (void)
  46. //
  47. // The pointer members point to the mapped file and do not need to be freed.
  48. {
  49. }
  50. /****************************************************************/
  51. Language::Language (PLCDetect pL, int nLangID, int nCodePages, int nRangeID)
  52. : m_pLC(pL),
  53. m_nLangID(nLangID),
  54. m_nCodePages(nCodePages),
  55. m_nRangeID(nRangeID)
  56. {
  57. }
  58. Language7Bit::Language7Bit (PLCDetect pL, int nLangID, int nCodePages)
  59. : Language(pL, nLangID, nCodePages),
  60. m_pLangHistogram(NULL)
  61. {
  62. memset ((void *)m_ppCodePageHistogram, 0, sizeof(m_ppCodePageHistogram));
  63. }
  64. Language7Bit::~Language7Bit (void)
  65. {
  66. if (m_pLangHistogram)
  67. delete m_pLangHistogram;
  68. for (int i = 0; i < MAXSUBLANG; i++)
  69. if (m_ppCodePageHistogram[i])
  70. delete m_ppCodePageHistogram[i];
  71. }
  72. DWORD
  73. Language7Bit::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
  74. //
  75. // Add the raw histogram at *pHS in the mapped file to this language object.
  76. // The histograms must be for 7-bit detection.
  77. {
  78. DWORD hr = NO_ERROR;
  79. PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );
  80. if (nIdx == 0)
  81. {
  82. // The first histogram for a language is its language-detection table.
  83. if ( (m_pLangHistogram = new Histogram (pHS, pMap)) == NULL)
  84. return ERROR_OUTOFMEMORY;
  85. if ((hr = m_pLangHistogram->Validate (nBytes)) != NO_ERROR)
  86. return hr;
  87. }
  88. else
  89. {
  90. // Each subsequent histogram is a code page detection table.
  91. if (nIdx - 1 >= m_nCodePages)
  92. return ERROR_INTERNAL_DB_CORRUPTION;
  93. Histogram *pH;
  94. if ((pH = new Histogram (pHS, pMap)) == NULL)
  95. return ERROR_OUTOFMEMORY;
  96. if ((hr = pH->Validate (nBytes)) != NO_ERROR)
  97. return hr;
  98. m_ppCodePageHistogram[nIdx - 1] = pH;
  99. // Cache for the scoring vector math
  100. m_paHElt[nIdx - 1] = pH->Array();
  101. }
  102. return hr;
  103. }
  104. /****************************************************************/
  105. Language8Bit::Language8Bit (PLCDetect pL, int nLangID, int nCodePages)
  106. : Language(pL, nLangID, nCodePages)
  107. {
  108. memset ((void *)m_ppHistogram, 0, sizeof(m_ppHistogram));
  109. }
  110. Language8Bit::~Language8Bit (void)
  111. {
  112. for (int i = 0; i < MAXSUBLANG; i++)
  113. if (m_ppHistogram[i])
  114. delete m_ppHistogram[i];
  115. }
  116. DWORD
  117. Language8Bit::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
  118. //
  119. // Add the raw histogram at *pHS to this language object.
  120. // This language is known to use 8-bit detection.
  121. {
  122. DWORD hr = NO_ERROR;
  123. PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );
  124. // The histograms are the direct language-code page tables
  125. if (nIdx >= m_nCodePages)
  126. return ERROR_INTERNAL_DB_CORRUPTION;
  127. Histogram *pH;
  128. if ((pH = new Histogram (pHS, pMap)) == NULL)
  129. return ERROR_OUTOFMEMORY;
  130. if ((hr = pH->Validate (nBytes)) != NO_ERROR)
  131. return hr;
  132. m_ppHistogram[nIdx] = pH;
  133. return hr;
  134. }
  135. /****************************************************************/
  136. LanguageUnicode::LanguageUnicode (PLCDetect pL, int nLangID,
  137. int nSubLangs, int nRangeID)
  138. : Language(pL, nLangID, nSubLangs, nRangeID)
  139. {
  140. memset ((void *)m_ppSubLangHistogram, 0, sizeof(m_ppSubLangHistogram));
  141. }
  142. LanguageUnicode::~LanguageUnicode (void)
  143. {
  144. for (int i = 0; i < MAXSUBLANG; i++)
  145. if (m_ppSubLangHistogram[i])
  146. delete m_ppSubLangHistogram[i];
  147. }
  148. DWORD
  149. LanguageUnicode::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
  150. {
  151. DWORD hr = NO_ERROR;
  152. // All histograms for are sublanguage detection
  153. if (nIdx >= m_nSubLangs)
  154. return ERROR_INTERNAL_DB_CORRUPTION;
  155. // Get the custom charmap used for scoring this sublanguage group
  156. PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );
  157. Histogram *pH;
  158. if ((pH = new Histogram (pHS, pMap)) == NULL)
  159. return ERROR_OUTOFMEMORY;
  160. if ((hr = pH->Validate (nBytes)) != NO_ERROR)
  161. return hr;
  162. m_ppSubLangHistogram[nIdx] = pH;
  163. m_paHElt[nIdx] = pH->Array();
  164. return hr;
  165. }
  166. /****************************************************************/
  167. LCDetect::LCDetect (HMODULE hM)
  168. : m_hModule(hM),
  169. m_nCharmaps(0),
  170. m_n7BitLanguages(0),
  171. m_n8BitLanguages(0),
  172. m_nUnicodeLanguages(0),
  173. m_n7BitLangsRead(0),
  174. m_n8BitLangsRead(0),
  175. m_nUnicodeLangsRead(0),
  176. m_nMapsRead(0),
  177. m_nHistogramsRead(0),
  178. m_nScoreIdx(0),
  179. m_pp7BitLanguages(NULL),
  180. m_pp8BitLanguages(NULL),
  181. m_ppUnicodeLanguages(NULL),
  182. m_ppCharmaps(NULL),
  183. m_pv(NULL),
  184. m_hmap(0),
  185. m_hf(0),
  186. m_pHU27Bit(0)
  187. {
  188. }
  189. LCDetect::~LCDetect ()
  190. {
  191. delete m_pHU27Bit;
  192. for (unsigned int i = 0; i < m_n7BitLanguages; i++)
  193. delete m_pp7BitLanguages[i];
  194. delete m_pp7BitLanguages;
  195. for (i = 0; i < m_n8BitLanguages; i++)
  196. delete m_pp8BitLanguages[i];
  197. delete m_pp8BitLanguages;
  198. for (i = 0; i < m_nUnicodeLanguages; i++)
  199. delete m_ppUnicodeLanguages[i];
  200. delete m_ppUnicodeLanguages;
  201. for (i = 0; i < m_nCharmaps; i++)
  202. delete m_ppCharmaps[i];
  203. delete m_ppCharmaps;
  204. if (m_pv)
  205. UnmapViewOfFile (m_pv);
  206. CloseHandle (m_hmap);
  207. CloseHandle (m_hf);
  208. }
  209. DWORD
  210. LCDetect::Initialize7BitLanguage (PFileLanguageSection pLS, PLanguage *ppL)
  211. //
  212. // Set *ppL to the Language object created from this section.
  213. {
  214. // nRecordCount is lang histogram (1) + # of code page histograms
  215. if ( m_n7BitLangsRead >= m_n7BitLanguages || pLS->m_dwRecordCount < 1)
  216. return ERROR_INTERNAL_DB_CORRUPTION;
  217. PLanguage7Bit pL = new Language7Bit (this, pLS->m_dwLangID, pLS->m_dwRecordCount - 1);
  218. if (pL == NULL)
  219. return ERROR_OUTOFMEMORY;
  220. // Each 7-bit lang uses one score index slot per code page.
  221. // The range starts with the 7-bit langs, since both the 8-bit
  222. // and Unicode langs follow it.
  223. if (m_n7BitLangsRead == 0 && m_nScoreIdx != 0)
  224. return ERROR_INTERNAL_DB_CORRUPTION;;
  225. pL->SetScoreIdx(m_nScoreIdx);
  226. m_nScoreIdx += pLS->m_dwRecordCount - 1; // skip 1st record (Language)
  227. m_pp7BitLanguages[ m_n7BitLangsRead++ ] = pL;
  228. *ppL = pL;
  229. return NO_ERROR;
  230. }
  231. DWORD
  232. LCDetect::Initialize8BitLanguage (PFileLanguageSection pLS, Language **ppL)
  233. //
  234. // Set *ppL to the Language object created from this section.
  235. {
  236. // nRecordCount is # of combined language / code page histograms
  237. if ( m_n8BitLangsRead >= m_n8BitLanguages || pLS->m_dwRecordCount < 1)
  238. return ERROR_INTERNAL_DB_CORRUPTION;
  239. PLanguage8Bit pL = new Language8Bit (this, pLS->m_dwLangID, pLS->m_dwRecordCount);
  240. if (pL == NULL)
  241. return ERROR_OUTOFMEMORY;
  242. // The 8-bit score indices follow the 7-bit languages
  243. // Each 8-bit lang uses a score index slot for each of its code pages,
  244. // since all the code pages are scored in the initial scoring pass.
  245. // The number of slots is the number of code page histograms, which is
  246. // one less than the number of records following this language.
  247. pL->SetScoreIdx(m_nScoreIdx);
  248. m_nScoreIdx += pLS->m_dwRecordCount;
  249. m_pp8BitLanguages[ m_n8BitLangsRead++ ] = pL;
  250. *ppL = pL;
  251. return NO_ERROR;
  252. }
  253. DWORD
  254. LCDetect::InitializeUnicodeLanguage (PFileLanguageSection pLS, Language **ppL)
  255. //
  256. // Set *ppL to the Language object created from this section.
  257. {
  258. // nRecordCount is # of sublanguage histograms
  259. if ( m_nUnicodeLangsRead >= m_nUnicodeLanguages ||
  260. pLS->m_dwUnicodeRangeID >= m_nUnicodeLanguages )
  261. {
  262. return ERROR_INTERNAL_DB_CORRUPTION;
  263. }
  264. PLanguageUnicode pL = new LanguageUnicode (this, pLS->m_dwLangID,
  265. pLS->m_dwRecordCount, pLS->m_dwUnicodeRangeID);
  266. if (pL == NULL)
  267. return ERROR_OUTOFMEMORY;
  268. // The Unicode score indices follow the 7-bit languages, and overlay the
  269. // 8-bit slots since they aren't used at the same time.
  270. if (m_nUnicodeLangsRead == 0 && GetN8BitLanguages() > 0)
  271. m_nScoreIdx = Get8BitLanguage(0)->GetScoreIdx();
  272. // Each Unicode entry uses exactly one score index. SBCS subdetection
  273. // (Latin group) uses the slots for the corresponding 7-bit languages,
  274. // and Unicode subdetection (CJK) uses the slots already defined for the
  275. // Unicode sub-languages.
  276. pL->SetScoreIdx(m_nScoreIdx);
  277. m_nScoreIdx++;
  278. // For Unicode, the range ID is used as the Language array index.
  279. m_ppUnicodeLanguages[ pLS->m_dwUnicodeRangeID ] = pL;
  280. m_nUnicodeLangsRead++;
  281. *ppL = pL;
  282. return NO_ERROR;
  283. }
  284. DWORD
  285. LCDetect::LoadLanguageSection (void *pv, int nSectionSize, PLanguage *ppL)
  286. //
  287. // A language section begins the definition of data for a language.
  288. // Each language has exactly one of these records. One or more
  289. // histogram sections follow each language, and are always associated
  290. // with the language of the preceding language section.
  291. //
  292. // Set *ppL to the Language object created from this section.
  293. {
  294. DWORD hr = NO_ERROR;
  295. PFileLanguageSection pLS;
  296. pLS = (PFileLanguageSection)&((char *)pv)[sizeof(FileSection)];
  297. switch ( pLS->m_dwDetectionType ) {
  298. case DETECT_7BIT:
  299. hr = Initialize7BitLanguage (pLS, ppL);
  300. break;
  301. case DETECT_8BIT:
  302. hr = Initialize8BitLanguage (pLS, ppL);
  303. break;
  304. case DETECT_UNICODE:
  305. hr = InitializeUnicodeLanguage (pLS, ppL);
  306. break;
  307. }
  308. return hr;
  309. }
  310. DWORD
  311. LCDetect::LoadHistogramSection (void *pv, int nSectionSize, Language *pL)
  312. {
  313. PFileHistogramSection pHS;
  314. pHS = (PFileHistogramSection)&((char *)pv)[sizeof(FileSection)];
  315. int nBytes = nSectionSize - sizeof(FileSection) - sizeof(*pHS);
  316. return pL->AddHistogram ( pHS, nBytes, m_nHistogramsRead++);
  317. }
  318. DWORD
  319. LCDetect::LoadMapSection (void *pv, int nSectionSize)
  320. {
  321. PFileMapSection pMS;
  322. pMS = (PFileMapSection)&((char *)pv)[sizeof(FileSection)];
  323. int nBytes = nSectionSize - sizeof(FileSection) - sizeof(*pMS);
  324. if (m_nMapsRead >= m_nCharmaps)
  325. return ERROR_INTERNAL_DB_CORRUPTION;
  326. PCharmap pM = new Charmap (pMS);
  327. if (pM == NULL)
  328. return ERROR_OUTOFMEMORY;
  329. m_ppCharmaps[ m_nMapsRead++ ] = pM;
  330. return NO_ERROR;
  331. }
  332. DWORD
  333. LCDetect::BuildState (DWORD nFileSize)
  334. //
  335. // Build the detection structures from the mapped training file image at *m_pv
  336. {
  337. PLanguage pL;
  338. PFileHeader pFH;
  339. PFileSection pFS;
  340. DWORD hr = NO_ERROR;
  341. // Validate header
  342. pFH = (PFileHeader) m_pv;
  343. if ( nFileSize < sizeof(*pFH) ||
  344. pFH->m_dwAppSig != APP_SIGNATURE ||
  345. pFH->m_dwVersion != APP_VERSION ||
  346. pFH->m_dwHdrSizeBytes >= nFileSize ||
  347. pFH->m_dwN7BitLanguages == 0 ||
  348. pFH->m_dwN8BitLanguages == 0 ||
  349. pFH->m_dwNUnicodeLanguages == 0 ||
  350. pFH->m_dwNCharmaps == 0 )
  351. {
  352. return ERROR_INTERNAL_DB_CORRUPTION;
  353. }
  354. // Allocate language pointer table per header
  355. m_n7BitLanguages = pFH->m_dwN7BitLanguages;
  356. m_pp7BitLanguages = new PLanguage7Bit [m_n7BitLanguages];
  357. m_n8BitLanguages = pFH->m_dwN8BitLanguages;
  358. m_pp8BitLanguages = new PLanguage8Bit [m_n8BitLanguages];
  359. m_nUnicodeLanguages = pFH->m_dwNUnicodeLanguages;
  360. m_ppUnicodeLanguages = new PLanguageUnicode [m_nUnicodeLanguages];
  361. m_nCharmaps = pFH->m_dwNCharmaps;
  362. m_ppCharmaps = new PCharmap [m_nCharmaps];
  363. if ( m_pp7BitLanguages == NULL ||
  364. m_pp8BitLanguages == NULL ||
  365. m_ppUnicodeLanguages == NULL ||
  366. m_ppCharmaps == NULL )
  367. {
  368. return ERROR_OUTOFMEMORY;
  369. }
  370. // Clear, because not all slots may be assigned
  371. memset (m_ppUnicodeLanguages, 0, sizeof(PLanguageUnicode) * m_nUnicodeLanguages);
  372. // Remember other header info
  373. m_LCDConfigureDefault.nMin7BitScore = pFH->m_dwMin7BitScore;
  374. m_LCDConfigureDefault.nMin8BitScore = pFH->m_dwMin8BitScore;
  375. m_LCDConfigureDefault.nMinUnicodeScore = pFH->m_dwMinUnicodeScore;
  376. m_LCDConfigureDefault.nRelativeThreshhold = pFH->m_dwRelativeThreshhold;
  377. m_LCDConfigureDefault.nDocPctThreshhold = pFH->m_dwDocPctThreshhold;
  378. m_LCDConfigureDefault.nChunkSize = pFH->m_dwChunkSize;
  379. // Position to first section
  380. pFS = (PFileSection) &((char *)m_pv)[pFH->m_dwHdrSizeBytes];
  381. // Read and process each file section
  382. while ( hr == NO_ERROR ) {
  383. // check alignment
  384. if (((DWORD_PTR)pFS & 3) != 0) {
  385. hr = ERROR_INTERNAL_DB_CORRUPTION;
  386. break;
  387. }
  388. // zero-length section marks end of data
  389. if (pFS->m_dwSizeBytes == 0)
  390. break;
  391. if ( &((char *)pFS)[pFS->m_dwSizeBytes] >= &((char *)m_pv)[nFileSize]) {
  392. hr = ERROR_INTERNAL_DB_CORRUPTION;
  393. break;
  394. }
  395. switch ( pFS->m_dwType ) {
  396. case SECTION_TYPE_LANGUAGE: // sets pL
  397. hr = LoadLanguageSection ((void*)pFS, pFS->m_dwSizeBytes, &pL);
  398. m_nHistogramsRead = 0;
  399. break;
  400. case SECTION_TYPE_HISTOGRAM: // uses pL
  401. hr = LoadHistogramSection ((void*)pFS, pFS->m_dwSizeBytes, pL);
  402. break;
  403. case SECTION_TYPE_MAP:
  404. hr = LoadMapSection ((void*)pFS, pFS->m_dwSizeBytes);
  405. break;
  406. default: // ignore unrecognized sections
  407. break;
  408. }
  409. pFS = (PFileSection) &((char *)pFS)[pFS->m_dwSizeBytes];
  410. }
  411. if (hr != NO_ERROR)
  412. return hr;
  413. if ( m_nMapsRead != m_nCharmaps )
  414. return ERROR_INTERNAL_DB_CORRUPTION;
  415. // Set up quick-reference arrays used by the scoring inner loops
  416. for (unsigned int i = 0; i < GetN7BitLanguages(); i++)
  417. m_paHElt7Bit[i] = Get7BitLanguage(i)->GetLangHistogram()->Array();
  418. m_nHElt8Bit = 0;
  419. for (i = 0; i < GetN8BitLanguages(); i++)
  420. {
  421. PLanguage8Bit pL = Get8BitLanguage(i);
  422. for (int j = 0; j < pL->NCodePages(); j++)
  423. m_paHElt8Bit[m_nHElt8Bit++] = pL->GetHistogram(j)->Array();
  424. }
  425. // Set up the Histogram used for ScoreVectorW() for scoring Unicode
  426. // text for 7-bit language detection. Clone the first 7-bit language
  427. // histogram and replace its map with CHARMAP_U27BIT.
  428. m_pHU27Bit = new Histogram ( *Get7BitLanguage(0)->GetLangHistogram(),
  429. GetMap(CHARMAP_U27BIT));
  430. return hr;
  431. }
  432. DWORD
  433. LCDetect::LoadState (void)
  434. //
  435. // Overall initialization and state loading. Open the compiled training
  436. // file from its fixed location in the System32 directory, and assemble
  437. // in-memory detection tables from its contents.
  438. {
  439. DWORD hr = NO_ERROR;
  440. DWORD nFileSize;
  441. #define MODULENAMELEN 100
  442. char szFilename[MODULENAMELEN+50], *p;
  443. // Find out if NT or Windows
  444. OSVERSIONINFOA OSVersionInfo;
  445. int nOSWinNT = 0;
  446. OSVersionInfo.dwOSVersionInfoSize = sizeof( OSVERSIONINFOA );
  447. if ( GetVersionExA( &OSVersionInfo ) )
  448. nOSWinNT = OSVersionInfo.dwPlatformId;
  449. // Open the training data file,
  450. // look in the directory that contains the DLL.
  451. if (GetModuleFileNameA (m_hModule, szFilename, MODULENAMELEN) == 0)
  452. return GetLastError();
  453. if ( (p = strrchr (szFilename, '\\')) != NULL ||
  454. (p = strrchr (szFilename, ':')) != NULL )
  455. {
  456. *++p = 0;
  457. }
  458. else
  459. *szFilename = 0;
  460. strcat (szFilename, DETECTION_DATA_FILENAME);
  461. if ((m_hf = CreateFileA (szFilename, GENERIC_READ, FILE_SHARE_READ,
  462. NULL, OPEN_EXISTING,
  463. FILE_ATTRIBUTE_NORMAL, NULL)) == INVALID_HANDLE_VALUE)
  464. {
  465. return E_FAIL;
  466. }
  467. if ((nFileSize = GetFileSize (m_hf, NULL)) == 0xffffffff) {
  468. hr = GetLastError();
  469. CloseHandle (m_hf);
  470. return hr;
  471. }
  472. // Virtual-map the file
  473. if ( nOSWinNT == VER_PLATFORM_WIN32_NT )
  474. m_hmap = CreateFileMapping (m_hf, NULL, PAGE_READONLY, 0, nFileSize, NULL);
  475. else
  476. m_hmap = CreateFileMappingA (m_hf, NULL, PAGE_READONLY, 0, nFileSize, NULL);
  477. if (m_hmap == NULL) {
  478. hr = GetLastError();
  479. CloseHandle (m_hf);
  480. return hr;
  481. }
  482. if ((m_pv = MapViewOfFile (m_hmap, FILE_MAP_READ, 0, 0, 0 )) == NULL) {
  483. hr = GetLastError();
  484. CloseHandle (m_hmap);
  485. CloseHandle (m_hf);
  486. return hr;
  487. }
  488. // Build the in-memory structures from the file
  489. hr = BuildState (nFileSize);
  490. return hr;
  491. }
  492. /****************************************************************/