Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

180 lines
5.2 KiB

  1. /*
  2. * Declarations common to compiler and detector.
  3. *
  4. * Copyright (C) 1996, 1997, Microsoft Corp. All rights reserved.
  5. *
  6. * History: 1-Feb-97 BobP Created
  7. * 5-Aug-97 BobP Added Unicode support, and persisting
  8. * Charmaps in the data file.
  9. */
  10. #ifndef __INC_LCDCOMP_COMMON_H
  11. #define __INC_LCDCOMP_COMMON_H
  12. /****************************************************************/
  13. // Name of the compiled detection data file, located in the lcdetect.dll module directory
  14. #define DETECTION_DATA_FILENAME "mlang.dat"
  15. // Limits -- fixed upper bounds (NOTE(review): presumably the count fields in FileHeader must not exceed these; confirm in the loader)
  16. #define MAX7BITLANG 30 // presumably max # of 7-bit languages -- confirm
  17. #define MAX8BITLANG 30 // presumably max # of 8-bit languages -- confirm
  18. #define MAXUNICODELANG 30 // presumably max # of Unicode languages -- confirm
  19. #define MAXSUBLANG 5 // max # of sublanguages or codepages per lang
  20. #define MAXCHARMAPS 10 // max # of Charmaps, overall
  21. // Special case entries for the training script and detector.
  22. // These language IDs are internal only: they are never returned by the detector.
  23. #define LANGID_UNKNOWN 0x400 // NOTE(review): presumably marks a language that is not identified -- confirm
  24. #define LANGID_LATIN_GROUP 0x401 // NOTE(review): presumably a group entry for Latin-script languages -- confirm
  25. #define LANGID_CJK_GROUP 0x402 // NOTE(review): presumably a group entry for CJK languages -- confirm
  26. // Value type of a histogram array index.
  27. // This is the output value of the SBCS/DBCS or WCHAR reduction mapping,
  28. // and is used as the index into the n-gram arrays and for the Unicode
  29. // language group IDs.
  30. // As an unsigned char, an index is limited to the range 0..UCHAR_MAX.
  31. typedef unsigned char HIdx;
  32. typedef HIdx *PHIdx; // pointer to HIdx
  33. #define HIDX_MAX UCHAR_MAX // keep consistent w/ HIdx; UCHAR_MAX is defined by <limits.h>, which is not visibly included by this header
  34. // Fixed index values for mapped characters
  35. #define HIDX_IGNORE 0 // NOTE(review): presumably the index for characters to be ignored -- confirm against the mapping code
  36. #define HIDX_EXTD 1 // NOTE(review): presumably the index for extended characters -- confirm
  37. #define HIDX_LETTER_A 2 // HIDX_LETTER_A..HIDX_LETTER_Z span 26 consecutive index values
  38. #define HIDX_LETTER_Z (HIDX_LETTER_A + 25) // evaluates to 27
  39. // Value type of a histogram element (one unsigned char per element)
  40. typedef unsigned char HElt;
  41. typedef HElt *PHElt; // pointer to HElt
  42. #define HELT_MAX UCHAR_MAX // keep consistent w/ HElt; largest value one element can hold
  43. #define LANG7_DIM 3 // 7-bit language uses trigrams, i.e. 3 dimensions
  44. // Fixed IDs of the Charmaps (NOTE(review): presumably the values used in FileMapSection.m_dwID and FileHistogramSection.m_dwMappingID -- confirm)
  45. #define CHARMAP_UNICODE 0 // Built from RANGE directives
  46. #define CHARMAP_7BITLANG 1 // Built from CHARMAP 1
  47. #define CHARMAP_8BITLANG 2 // From CHARMAP 2
  48. #define CHARMAP_CODEPAGE 3 // From CHARMAP 3
  49. #define CHARMAP_U27BIT 4 // Built internally for Unicode to 7-bit lang
  50. #define CHARMAP_NINTERNAL 5 // First ID for dynamic subdetection maps; IDs 0..4 above are the fixed ones
  51. #define DEFAULT_7BIT_EDGESIZE 28 // NOTE(review): presumably the default histogram edge size for 7-bit languages; 28 equals HIDX_LETTER_Z + 1 -- confirm
  52. #define DEFAULT_8BIT_EDGESIZE 155 // NOTE(review): presumably the default histogram edge size for 8-bit languages -- confirm
  53. #define UNICODE_DEFAULT_CHAR_SCORE 50 // NOTE(review): presumably a default per-character score for Unicode detection -- confirm with the scoring code
  54. /****************************************************************/
  55. // Compiled file format.
  56. // These declarations directly define the raw file format, so field order,
  57. // field sizes and constant values all matter. Be careful making changes here,
  58. // and be sure to change the header version number (APP_VERSION) when appropriate.
  59. #define APP_SIGNATURE 0x5444434C // "LCDT": the bytes 4C 43 44 54 when this value is stored as a little-endian DWORD
  60. #define APP_VERSION 2 // NOTE(review): presumably the value expected in FileHeader.m_dwVersion -- confirm
  61. enum SectionTypes { // values for FileSection.m_dwType (declared below); the explicit numbers are part of the file format
  62. SECTION_TYPE_LANGUAGE = 1, // any language definition (NOTE(review): presumably followed by a FileLanguageSection -- confirm)
  63. SECTION_TYPE_HISTOGRAM = 2, // any histogram (NOTE(review): presumably followed by a FileHistogramSection -- confirm)
  64. SECTION_TYPE_MAP = 3 // any character mapping table (NOTE(review): presumably followed by a FileMapSection -- confirm)
  65. };
  66. enum DetectionType { // detection types: SBCS/DBCS (7-bit, 8-bit) plus Unicode; NOTE(review): presumably stored in FileLanguageSection.m_dwDetectionType -- confirm
  67. DETECT_NOTDEFINED = 0,
  68. DETECT_7BIT, // == 1
  69. DETECT_8BIT, // == 2
  70. DETECT_UNICODE, // == 3
  71. DETECT_NTYPES // == 4; one past the last enumerator above, so usable as a count rather than as a type
  72. };
  73. // FileHeader -- one-time header at start of file; every field is a DWORD and the layout is part of the raw file format
  74. typedef struct FileHeader {
  75. DWORD m_dwAppSig; // file signature; NOTE(review): expected to be APP_SIGNATURE ("LCDT") -- the earlier note here read DTCT, which does not match that constant; confirm against the reader/writer
  76. DWORD m_dwVersion; // NOTE(review): presumably compared with APP_VERSION -- confirm
  77. DWORD m_dwHdrSizeBytes; // byte offset of 1st real section
  78. DWORD m_dwN7BitLanguages; // NOTE(review): presumably # of 7-bit languages in the file (cf. MAX7BITLANG) -- confirm
  79. DWORD m_dwN8BitLanguages; // NOTE(review): presumably # of 8-bit languages in the file (cf. MAX8BITLANG) -- confirm
  80. DWORD m_dwNUnicodeLanguages; // NOTE(review): presumably # of Unicode languages in the file (cf. MAXUNICODELANG) -- confirm
  81. DWORD m_dwNCharmaps; // NOTE(review): presumably # of Charmaps in the file (cf. MAXCHARMAPS) -- confirm
  82. DWORD m_dwMin7BitScore; // NOTE(review): presumably a minimum score for 7-bit detection -- confirm with the detector
  83. DWORD m_dwMin8BitScore; // NOTE(review): presumably a minimum score for 8-bit detection -- confirm with the detector
  84. DWORD m_dwMinUnicodeScore; // NOTE(review): presumably a minimum score for Unicode detection -- confirm with the detector
  85. DWORD m_dwRelativeThreshhold; // spelling (sic) is part of the field name; meaning and units are not stated in this header
  86. DWORD m_dwDocPctThreshhold; // spelling (sic) is part of the field name; NOTE(review): presumably a percentage of the document -- confirm
  87. DWORD m_dwChunkSize; // units are not stated in this header -- TODO confirm
  88. } FileHeader;
  89. typedef FileHeader *PFileHeader; // pointer to FileHeader
  90. // FileSection -- common header that begins each file section; the type-specific struct follows it
  91. typedef struct FileSection {
  92. DWORD m_dwSizeBytes; // section size in bytes incl. this header, i.e. the offset from this section to the next one
  93. DWORD m_dwType; // type of entry in this section (see enum SectionTypes)
  94. } FileSection;
  95. typedef FileSection *PFileSection; // pointer to FileSection
  96. // FileLanguageSection -- 1st entry of sequence for an SBCS/DBCS language
  97. // (NOTE(review): the m_dwUnicodeRangeID field suggests Unicode languages use this record as well -- confirm)
  98. // Followed by 1 or more histogram sections; m_dwRecordCount gives how many
  99. typedef struct FileLanguageSection {
  100. // preceded by struct FileSection
  101. DWORD m_dwDetectionType; // NOTE(review): presumably a value of enum DetectionType -- confirm
  102. DWORD m_dwLangID; // NOTE(review): presumably a Win32 language ID or one of the LANGID_* special values -- confirm
  103. DWORD m_dwUnicodeRangeID; // Unicode range mapping value for this lang
  104. DWORD m_dwRecordCount; // # of histograms following this record
  105. } FileLanguageSection;
  106. typedef FileLanguageSection *PFileLanguageSection; // pointer to FileLanguageSection
  107. // FileHistogramSection -- entry for one histogram (SBCS/DBCS or WCHAR)
  108. typedef struct FileHistogramSection {
  109. // preceded by struct FileSection
  110. union { // one DWORD read under two names, depending on the kind of histogram
  111. DWORD m_dwCodePage; // for 7 or 8-bit, Codepage this indicates
  112. DWORD m_dwRangeID; // for Unicode, the sublanguage group ID
  113. };
  114. DWORD m_dwDimensionality; // NOTE(review): presumably the n of the n-gram, e.g. LANG7_DIM for 7-bit languages -- confirm
  115. DWORD m_dwEdgeSize; // NOTE(review): presumably the length of each histogram dimension (cf. DEFAULT_7BIT_EDGESIZE) -- confirm
  116. DWORD m_dwMappingID; // ID of Charmap to use
  117. // HElt m_Elts[] -- not a declared member; NOTE(review): presumably the histogram elements follow this struct in the file -- confirm
  118. } FileHistogramSection;
  119. typedef struct FileHistogramSection *PFileHistogramSection; // pointer to FileHistogramSection
  120. // FileMapSection -- entry for one character map (SBCS/DBCS or WCHAR)
  121. typedef struct FileMapSection {
  122. // preceded by struct FileSection
  123. DWORD m_dwID; // ID by which hardwired code finds the table (NOTE(review): presumably one of the CHARMAP_* IDs -- confirm)
  124. DWORD m_dwSize; // size of table (256 or 65536), i.e. 2^8 or 2^16 entries
  125. DWORD m_dwNUnique; // # of unique output values
  126. // HIdx m_map[] -- not a declared member; NOTE(review): presumably the table entries follow this struct in the file -- confirm
  127. } FileMapSection;
  128. typedef struct FileMapSection *PFileMapSection; // pointer to FileMapSection
  129. ////////////////////////////////////////////////////////////////
  130. // LangNames - lookup table to get from English-localized names to a Win32
  131. // primary language ID. Only declared here; the table and both functions are defined elsewhere.
  132. struct LangNames {
  133. LPCSTR pcszName; // language name (English-localized, per the note above)
  134. unsigned short nLangID; // Win32 primary language ID paired with pcszName
  135. };
  136. LPCSTR GetLangName (int id); // NOTE(review): presumably returns the name for language ID id; result for an unknown ID is not visible here -- confirm
  137. int GetLangID (LPCSTR pcszName); // NOTE(review): presumably returns the language ID for pcszName; result for an unknown name is not visible here -- confirm
  138. extern const struct LangNames LangNames[]; // the table itself; its length and terminator are not visible here -- TODO confirm
  139. ////////////////////////////////////////////////////////////////
  140. #endif