Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

673 lines
18 KiB

  1. /*
  2. * Automatic language and codepage detector
  3. *
  4. * Bob Powell, 2/97
  5. * Copyright (C) 1996, 1997, Microsoft Corp. All rights reserved.
  6. *
  7. * History: 1-Feb-97 BobP Created
  8. * 5-Aug-97 BobP Unicode support; Charmaps in data file.
  9. */
  10. #include "private.h"
  11. #include <strsafe.h>
  12. /****************************************************************/
  13. Histogram::Histogram (const PFileHistogramSection pHS, const PHIdx pMap)
  14. : m_nDimensionality((UCHAR)pHS->m_dwDimensionality),
  15. m_nEdgeSize((UCHAR)pHS->m_dwEdgeSize),
  16. m_nCodePage((USHORT)pHS->m_dwCodePage),
  17. m_pMap(pMap),
  18. m_panElts((HElt *)&pHS[1]) // table follows header struct in the file
  19. {
  20. // #elements = #unique character values ^ #dimensions
  21. m_nElts = 1;
  22. for (UCHAR i = 0; i < m_nDimensionality; i++)
  23. m_nElts *= m_nEdgeSize;
  24. }
  25. DWORD
  26. Histogram::Validate (DWORD nBytes) const
  27. {
  28. if ( nBytes < m_nElts * sizeof(HElt) ||
  29. m_nDimensionality > 4 )
  30. {
  31. return ERROR_INTERNAL_DB_CORRUPTION;
  32. }
  33. return NO_ERROR;
  34. }
  35. Histogram::Histogram (const Histogram &H, const PHIdx pMap)
  36. : m_nDimensionality(H.m_nDimensionality),
  37. m_nEdgeSize(H.m_nEdgeSize),
  38. m_nCodePage(H.m_nCodePage),
  39. m_nElts(H.m_nElts),
  40. m_pMap(pMap),
  41. m_panElts(H.m_panElts)
  42. //
  43. // Clone a histogram but use a different Charmap.
  44. {
  45. }
  46. Histogram::~Histogram (void)
  47. //
  48. // The pointer members point to the mapped file and do not need to be freed.
  49. {
  50. }
  51. /****************************************************************/
  52. Language::Language (PLCDetect pL, int nLangID, int nCodePages, int nRangeID)
  53. : m_pLC(pL),
  54. m_nLangID(nLangID),
  55. m_nCodePages(nCodePages),
  56. m_nRangeID(nRangeID)
  57. {
  58. }
  59. Language7Bit::Language7Bit (PLCDetect pL, int nLangID, int nCodePages)
  60. : Language(pL, nLangID, nCodePages),
  61. m_pLangHistogram(NULL)
  62. {
  63. memset ((void *)m_ppCodePageHistogram, 0, sizeof(m_ppCodePageHistogram));
  64. }
  65. Language7Bit::~Language7Bit (void)
  66. {
  67. if (m_pLangHistogram)
  68. delete m_pLangHistogram;
  69. for (int i = 0; i < MAXSUBLANG; i++)
  70. if (m_ppCodePageHistogram[i])
  71. delete m_ppCodePageHistogram[i];
  72. }
  73. DWORD
  74. Language7Bit::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
  75. //
  76. // Add the raw histogram at *pHS in the mapped file to this language object.
  77. // The histograms must be for 7-bit detection.
  78. {
  79. DWORD hr = NO_ERROR;
  80. PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );
  81. if (nIdx == 0)
  82. {
  83. // The first histogram for a language is its language-detection table.
  84. if ( (m_pLangHistogram = new Histogram (pHS, pMap)) == NULL)
  85. return ERROR_OUTOFMEMORY;
  86. if ((hr = m_pLangHistogram->Validate (nBytes)) != NO_ERROR)
  87. return hr;
  88. }
  89. else
  90. {
  91. // Each subsequent histogram is a code page detection table.
  92. if (nIdx - 1 >= m_nCodePages)
  93. return ERROR_INTERNAL_DB_CORRUPTION;
  94. Histogram *pH;
  95. if ((pH = new Histogram (pHS, pMap)) == NULL)
  96. return ERROR_OUTOFMEMORY;
  97. if ((hr = pH->Validate (nBytes)) != NO_ERROR)
  98. return hr;
  99. m_ppCodePageHistogram[nIdx - 1] = pH;
  100. // Cache for the scoring vector math
  101. m_paHElt[nIdx - 1] = pH->Array();
  102. }
  103. return hr;
  104. }
  105. /****************************************************************/
  106. Language8Bit::Language8Bit (PLCDetect pL, int nLangID, int nCodePages)
  107. : Language(pL, nLangID, nCodePages)
  108. {
  109. memset ((void *)m_ppHistogram, 0, sizeof(m_ppHistogram));
  110. }
  111. Language8Bit::~Language8Bit (void)
  112. {
  113. for (int i = 0; i < MAXSUBLANG; i++)
  114. if (m_ppHistogram[i])
  115. delete m_ppHistogram[i];
  116. }
  117. DWORD
  118. Language8Bit::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
  119. //
  120. // Add the raw histogram at *pHS to this language object.
  121. // This language is known to use 8-bit detection.
  122. {
  123. DWORD hr = NO_ERROR;
  124. PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );
  125. // The histograms are the direct language-code page tables
  126. if (nIdx >= m_nCodePages)
  127. return ERROR_INTERNAL_DB_CORRUPTION;
  128. Histogram *pH;
  129. if ((pH = new Histogram (pHS, pMap)) == NULL)
  130. return ERROR_OUTOFMEMORY;
  131. if ((hr = pH->Validate (nBytes)) != NO_ERROR)
  132. return hr;
  133. m_ppHistogram[nIdx] = pH;
  134. return hr;
  135. }
  136. /****************************************************************/
  137. LanguageUnicode::LanguageUnicode (PLCDetect pL, int nLangID,
  138. int nSubLangs, int nRangeID)
  139. : Language(pL, nLangID, nSubLangs, nRangeID)
  140. {
  141. memset ((void *)m_ppSubLangHistogram, 0, sizeof(m_ppSubLangHistogram));
  142. }
  143. LanguageUnicode::~LanguageUnicode (void)
  144. {
  145. for (int i = 0; i < MAXSUBLANG; i++)
  146. if (m_ppSubLangHistogram[i])
  147. delete m_ppSubLangHistogram[i];
  148. }
  149. DWORD
  150. LanguageUnicode::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
  151. {
  152. DWORD hr = NO_ERROR;
  153. // All histograms for are sublanguage detection
  154. if (nIdx >= m_nSubLangs)
  155. return ERROR_INTERNAL_DB_CORRUPTION;
  156. // Get the custom charmap used for scoring this sublanguage group
  157. PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );
  158. Histogram *pH;
  159. if ((pH = new Histogram (pHS, pMap)) == NULL)
  160. return ERROR_OUTOFMEMORY;
  161. if ((hr = pH->Validate (nBytes)) != NO_ERROR)
  162. return hr;
  163. m_ppSubLangHistogram[nIdx] = pH;
  164. m_paHElt[nIdx] = pH->Array();
  165. return hr;
  166. }
  167. /****************************************************************/
  168. LCDetect::LCDetect (HMODULE hM)
  169. : m_hModule(hM),
  170. m_nCharmaps(0),
  171. m_n7BitLanguages(0),
  172. m_n8BitLanguages(0),
  173. m_nUnicodeLanguages(0),
  174. m_n7BitLangsRead(0),
  175. m_n8BitLangsRead(0),
  176. m_nUnicodeLangsRead(0),
  177. m_nMapsRead(0),
  178. m_nHistogramsRead(0),
  179. m_nScoreIdx(0),
  180. m_pp7BitLanguages(NULL),
  181. m_pp8BitLanguages(NULL),
  182. m_ppUnicodeLanguages(NULL),
  183. m_ppCharmaps(NULL),
  184. m_pv(NULL),
  185. m_hmap(0),
  186. m_hf(0),
  187. m_pHU27Bit(0)
  188. {
  189. }
  190. LCDetect::~LCDetect ()
  191. {
  192. delete m_pHU27Bit;
  193. for (unsigned int i = 0; i < m_n7BitLanguages; i++)
  194. delete m_pp7BitLanguages[i];
  195. delete m_pp7BitLanguages;
  196. for (i = 0; i < m_n8BitLanguages; i++)
  197. delete m_pp8BitLanguages[i];
  198. delete m_pp8BitLanguages;
  199. for (i = 0; i < m_nUnicodeLanguages; i++)
  200. delete m_ppUnicodeLanguages[i];
  201. delete m_ppUnicodeLanguages;
  202. for (i = 0; i < m_nCharmaps; i++)
  203. delete m_ppCharmaps[i];
  204. delete m_ppCharmaps;
  205. if (m_pv)
  206. UnmapViewOfFile (m_pv);
  207. CloseHandle (m_hmap);
  208. CloseHandle (m_hf);
  209. }
  210. DWORD
  211. LCDetect::Initialize7BitLanguage (PFileLanguageSection pLS, PLanguage *ppL)
  212. //
  213. // Set *ppL to the Language object created from this section.
  214. {
  215. // nRecordCount is lang histogram (1) + # of code page histograms
  216. if ( m_n7BitLangsRead >= m_n7BitLanguages || pLS->m_dwRecordCount < 1)
  217. return ERROR_INTERNAL_DB_CORRUPTION;
  218. PLanguage7Bit pL = new Language7Bit (this, pLS->m_dwLangID, pLS->m_dwRecordCount - 1);
  219. if (pL == NULL)
  220. return ERROR_OUTOFMEMORY;
  221. // Each 7-bit lang uses one score index slot per code page.
  222. // The range starts with the 7-bit langs, since both the 8-bit
  223. // and Unicode langs follow it.
  224. if (m_n7BitLangsRead == 0 && m_nScoreIdx != 0)
  225. return ERROR_INTERNAL_DB_CORRUPTION;;
  226. pL->SetScoreIdx(m_nScoreIdx);
  227. m_nScoreIdx += pLS->m_dwRecordCount - 1; // skip 1st record (Language)
  228. m_pp7BitLanguages[ m_n7BitLangsRead++ ] = pL;
  229. *ppL = pL;
  230. return NO_ERROR;
  231. }
  232. DWORD
  233. LCDetect::Initialize8BitLanguage (PFileLanguageSection pLS, Language **ppL)
  234. //
  235. // Set *ppL to the Language object created from this section.
  236. {
  237. // nRecordCount is # of combined language / code page histograms
  238. if ( m_n8BitLangsRead >= m_n8BitLanguages || pLS->m_dwRecordCount < 1)
  239. return ERROR_INTERNAL_DB_CORRUPTION;
  240. PLanguage8Bit pL = new Language8Bit (this, pLS->m_dwLangID, pLS->m_dwRecordCount);
  241. if (pL == NULL)
  242. return ERROR_OUTOFMEMORY;
  243. // The 8-bit score indices follow the 7-bit languages
  244. // Each 8-bit lang uses a score index slot for each of its code pages,
  245. // since all the code pages are scored in the initial scoring pass.
  246. // The number of slots is the number of code page histograms, which is
  247. // one less than the number of records following this language.
  248. pL->SetScoreIdx(m_nScoreIdx);
  249. m_nScoreIdx += pLS->m_dwRecordCount;
  250. m_pp8BitLanguages[ m_n8BitLangsRead++ ] = pL;
  251. *ppL = pL;
  252. return NO_ERROR;
  253. }
  254. DWORD
  255. LCDetect::InitializeUnicodeLanguage (PFileLanguageSection pLS, Language **ppL)
  256. //
  257. // Set *ppL to the Language object created from this section.
  258. {
  259. // nRecordCount is # of sublanguage histograms
  260. if ( m_nUnicodeLangsRead >= m_nUnicodeLanguages ||
  261. pLS->m_dwUnicodeRangeID >= m_nUnicodeLanguages )
  262. {
  263. return ERROR_INTERNAL_DB_CORRUPTION;
  264. }
  265. PLanguageUnicode pL = new LanguageUnicode (this, pLS->m_dwLangID,
  266. pLS->m_dwRecordCount, pLS->m_dwUnicodeRangeID);
  267. if (pL == NULL)
  268. return ERROR_OUTOFMEMORY;
  269. // The Unicode score indices follow the 7-bit languages, and overlay the
  270. // 8-bit slots since they aren't used at the same time.
  271. if (m_nUnicodeLangsRead == 0 && GetN8BitLanguages() > 0)
  272. m_nScoreIdx = Get8BitLanguage(0)->GetScoreIdx();
  273. // Each Unicode entry uses exactly one score index. SBCS subdetection
  274. // (Latin group) uses the slots for the corresponding 7-bit languages,
  275. // and Unicode subdetection (CJK) uses the slots already defined for the
  276. // Unicode sub-languages.
  277. pL->SetScoreIdx(m_nScoreIdx);
  278. m_nScoreIdx++;
  279. // For Unicode, the range ID is used as the Language array index.
  280. m_ppUnicodeLanguages[ pLS->m_dwUnicodeRangeID ] = pL;
  281. m_nUnicodeLangsRead++;
  282. *ppL = pL;
  283. return NO_ERROR;
  284. }
  285. DWORD
  286. LCDetect::LoadLanguageSection (void *pv, int nSectionSize, PLanguage *ppL)
  287. //
  288. // A language section begins the definition of data for a language.
  289. // Each language has exactly one of these records. One or more
  290. // histogram sections follow each language, and are always associated
  291. // with the language of the preceding language section.
  292. //
  293. // Set *ppL to the Language object created from this section.
  294. {
  295. DWORD hr = NO_ERROR;
  296. PFileLanguageSection pLS;
  297. pLS = (PFileLanguageSection)&((char *)pv)[sizeof(FileSection)];
  298. switch ( pLS->m_dwDetectionType ) {
  299. case DETECT_7BIT:
  300. hr = Initialize7BitLanguage (pLS, ppL);
  301. break;
  302. case DETECT_8BIT:
  303. hr = Initialize8BitLanguage (pLS, ppL);
  304. break;
  305. case DETECT_UNICODE:
  306. hr = InitializeUnicodeLanguage (pLS, ppL);
  307. break;
  308. }
  309. return hr;
  310. }
  311. DWORD
  312. LCDetect::LoadHistogramSection (void *pv, int nSectionSize, Language *pL)
  313. {
  314. PFileHistogramSection pHS;
  315. pHS = (PFileHistogramSection)&((char *)pv)[sizeof(FileSection)];
  316. int nBytes = nSectionSize - sizeof(FileSection) - sizeof(*pHS);
  317. return pL->AddHistogram ( pHS, nBytes, m_nHistogramsRead++);
  318. }
  319. DWORD
  320. LCDetect::LoadMapSection (void *pv, int nSectionSize)
  321. {
  322. PFileMapSection pMS;
  323. pMS = (PFileMapSection)&((char *)pv)[sizeof(FileSection)];
  324. int nBytes = nSectionSize - sizeof(FileSection) - sizeof(*pMS);
  325. if (m_nMapsRead >= m_nCharmaps)
  326. return ERROR_INTERNAL_DB_CORRUPTION;
  327. PCharmap pM = new Charmap (pMS);
  328. if (pM == NULL)
  329. return ERROR_OUTOFMEMORY;
  330. m_ppCharmaps[ m_nMapsRead++ ] = pM;
  331. return NO_ERROR;
  332. }
  333. DWORD
  334. LCDetect::BuildState (DWORD nFileSize)
  335. //
  336. // Build the detection structures from the mapped training file image at *m_pv
  337. {
  338. PLanguage pL;
  339. PFileHeader pFH;
  340. PFileSection pFS;
  341. DWORD hr = NO_ERROR;
  342. // Validate header
  343. pFH = (PFileHeader) m_pv;
  344. if ( nFileSize < sizeof(*pFH) ||
  345. pFH->m_dwAppSig != APP_SIGNATURE ||
  346. pFH->m_dwVersion != APP_VERSION ||
  347. pFH->m_dwHdrSizeBytes >= nFileSize ||
  348. pFH->m_dwN7BitLanguages == 0 ||
  349. pFH->m_dwN8BitLanguages == 0 ||
  350. pFH->m_dwNUnicodeLanguages == 0 ||
  351. pFH->m_dwNCharmaps == 0 )
  352. {
  353. return ERROR_INTERNAL_DB_CORRUPTION;
  354. }
  355. // Allocate language pointer table per header
  356. m_n7BitLanguages = pFH->m_dwN7BitLanguages;
  357. m_pp7BitLanguages = new PLanguage7Bit [m_n7BitLanguages];
  358. m_n8BitLanguages = pFH->m_dwN8BitLanguages;
  359. m_pp8BitLanguages = new PLanguage8Bit [m_n8BitLanguages];
  360. m_nUnicodeLanguages = pFH->m_dwNUnicodeLanguages;
  361. m_ppUnicodeLanguages = new PLanguageUnicode [m_nUnicodeLanguages];
  362. m_nCharmaps = pFH->m_dwNCharmaps;
  363. m_ppCharmaps = new PCharmap [m_nCharmaps];
  364. if ( m_pp7BitLanguages == NULL ||
  365. m_pp8BitLanguages == NULL ||
  366. m_ppUnicodeLanguages == NULL ||
  367. m_ppCharmaps == NULL )
  368. {
  369. return ERROR_OUTOFMEMORY;
  370. }
  371. // Clear, because not all slots may be assigned
  372. memset (m_ppUnicodeLanguages, 0, sizeof(PLanguageUnicode) * m_nUnicodeLanguages);
  373. // Remember other header info
  374. m_LCDConfigureDefault.nMin7BitScore = pFH->m_dwMin7BitScore;
  375. m_LCDConfigureDefault.nMin8BitScore = pFH->m_dwMin8BitScore;
  376. m_LCDConfigureDefault.nMinUnicodeScore = pFH->m_dwMinUnicodeScore;
  377. m_LCDConfigureDefault.nRelativeThreshhold = pFH->m_dwRelativeThreshhold;
  378. m_LCDConfigureDefault.nDocPctThreshhold = pFH->m_dwDocPctThreshhold;
  379. m_LCDConfigureDefault.nChunkSize = pFH->m_dwChunkSize;
  380. // Position to first section
  381. pFS = (PFileSection) &((char *)m_pv)[pFH->m_dwHdrSizeBytes];
  382. // Read and process each file section
  383. while ( hr == NO_ERROR ) {
  384. // check alignment
  385. if (((DWORD_PTR)pFS & 3) != 0) {
  386. hr = ERROR_INTERNAL_DB_CORRUPTION;
  387. break;
  388. }
  389. // zero-length section marks end of data
  390. if (pFS->m_dwSizeBytes == 0)
  391. break;
  392. if ( &((char *)pFS)[pFS->m_dwSizeBytes] >= &((char *)m_pv)[nFileSize]) {
  393. hr = ERROR_INTERNAL_DB_CORRUPTION;
  394. break;
  395. }
  396. switch ( pFS->m_dwType ) {
  397. case SECTION_TYPE_LANGUAGE: // sets pL
  398. hr = LoadLanguageSection ((void*)pFS, pFS->m_dwSizeBytes, &pL);
  399. m_nHistogramsRead = 0;
  400. break;
  401. case SECTION_TYPE_HISTOGRAM: // uses pL
  402. hr = LoadHistogramSection ((void*)pFS, pFS->m_dwSizeBytes, pL);
  403. break;
  404. case SECTION_TYPE_MAP:
  405. hr = LoadMapSection ((void*)pFS, pFS->m_dwSizeBytes);
  406. break;
  407. default: // ignore unrecognized sections
  408. break;
  409. }
  410. pFS = (PFileSection) &((char *)pFS)[pFS->m_dwSizeBytes];
  411. }
  412. if (hr != NO_ERROR)
  413. return hr;
  414. if ( m_nMapsRead != m_nCharmaps )
  415. return ERROR_INTERNAL_DB_CORRUPTION;
  416. // Set up quick-reference arrays used by the scoring inner loops
  417. for (unsigned int i = 0; i < GetN7BitLanguages(); i++)
  418. m_paHElt7Bit[i] = Get7BitLanguage(i)->GetLangHistogram()->Array();
  419. m_nHElt8Bit = 0;
  420. for (i = 0; i < GetN8BitLanguages(); i++)
  421. {
  422. PLanguage8Bit pL = Get8BitLanguage(i);
  423. for (int j = 0; j < pL->NCodePages(); j++)
  424. m_paHElt8Bit[m_nHElt8Bit++] = pL->GetHistogram(j)->Array();
  425. }
  426. // Set up the Histogram used for ScoreVectorW() for scoring Unicode
  427. // text for 7-bit language detection. Clone the first 7-bit language
  428. // histogram and replace its map with CHARMAP_U27BIT.
  429. m_pHU27Bit = new Histogram ( *Get7BitLanguage(0)->GetLangHistogram(),
  430. GetMap(CHARMAP_U27BIT));
  431. return hr;
  432. }
  433. DWORD
  434. LCDetect::LoadState (void)
  435. //
  436. // Overall initialization and state loading. Open the compiled training
  437. // file from its fixed location in the System32 directory, and assemble
  438. // in-memory detection tables from its contents.
  439. {
  440. DWORD hr = NO_ERROR;
  441. DWORD nFileSize;
  442. #define MODULENAMELEN 100
  443. char szFilename[MODULENAMELEN+50], *p;
  444. // Find out if NT or Windows
  445. OSVERSIONINFOA OSVersionInfo;
  446. int nOSWinNT = 0;
  447. OSVersionInfo.dwOSVersionInfoSize = sizeof( OSVERSIONINFOA );
  448. if ( GetVersionExA( &OSVersionInfo ) )
  449. nOSWinNT = OSVersionInfo.dwPlatformId;
  450. // Open the training data file,
  451. // look in the directory that contains the DLL.
  452. if (GetModuleFileNameA (m_hModule, szFilename, MODULENAMELEN) == 0)
  453. return GetLastError();
  454. if ( (p = strrchr (szFilename, '\\')) != NULL ||
  455. (p = strrchr (szFilename, ':')) != NULL )
  456. {
  457. *++p = 0;
  458. }
  459. else
  460. *szFilename = 0;
  461. //*STRSAFE* strcat (szFilename, DETECTION_DATA_FILENAME);
  462. hr = StringCchCatA(szFilename , ARRAYSIZE(szFilename), DETECTION_DATA_FILENAME);
  463. if (!SUCCEEDED(hr))
  464. {
  465. return E_FAIL;
  466. }
  467. if ((m_hf = CreateFileA (szFilename, GENERIC_READ, FILE_SHARE_READ,
  468. NULL, OPEN_EXISTING,
  469. FILE_ATTRIBUTE_NORMAL, NULL)) == INVALID_HANDLE_VALUE)
  470. {
  471. return E_FAIL;
  472. }
  473. if ((nFileSize = GetFileSize (m_hf, NULL)) == 0xffffffff) {
  474. hr = GetLastError();
  475. CloseHandle (m_hf);
  476. return hr;
  477. }
  478. // Virtual-map the file
  479. if ( nOSWinNT == VER_PLATFORM_WIN32_NT )
  480. m_hmap = CreateFileMapping (m_hf, NULL, PAGE_READONLY, 0, nFileSize, NULL);
  481. else
  482. m_hmap = CreateFileMappingA (m_hf, NULL, PAGE_READONLY, 0, nFileSize, NULL);
  483. if (m_hmap == NULL) {
  484. hr = GetLastError();
  485. CloseHandle (m_hf);
  486. return hr;
  487. }
  488. if ((m_pv = MapViewOfFile (m_hmap, FILE_MAP_READ, 0, 0, 0 )) == NULL) {
  489. hr = GetLastError();
  490. CloseHandle (m_hmap);
  491. CloseHandle (m_hf);
  492. return hr;
  493. }
  494. // Build the in-memory structures from the file
  495. hr = BuildState (nFileSize);
  496. return hr;
  497. }
  498. /****************************************************************/