Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

275 lines
12 KiB

  1. #ifndef TRIE_H
  2. #define TRIE_H
  3. #include <windows.h>
  4. #ifdef __cplusplus
  5. extern "C" {
  6. #endif
  7. /* Abstract trie node structure. wch is a character to transition on; flags describe various things
  8. about the compressed trie; lpbNode points to the first byte of the next node in this state, and
  9. lpbDown points to the first byte referenced by the down pointer, if any */
  10. typedef struct tagTAGDATA
  11. {
  12. DWORD cTag; // Count of tagged nodes below this node in the subtree
  13. DWORD dwData; // Stored tagged data for this node
  14. } TAGDATA;
  15. #define MAXTAGS 1
  16. #if MAXTAGS > 8
  17. #error No more than 8 tags are allowed
  18. #endif
  19. typedef struct tagTRIESCAN
  20. {
  21. WCHAR wch; // Unicode character
  22. WORD wFlags; // see below
  23. WORD wMask; // which tags are valid
  24. WORD __pad0; //
  25. DWORD cWords; // Words in subtree (only valid if TRIE_NODE_COUNT is set)
  26. DWORD cSkipWords; // Words in subtrees ignored when following a skip pointer
  27. LPBYTE lpbNode; // Address of next byte within the compressed trie
  28. LPBYTE lpbDown; // Address referenced by down pointer, if any
  29. LPBYTE lpbRight; // Address referenced by right pointer, if any
  30. LPBYTE lpbSRDown; // Last single-ref address referenced
  31. TAGDATA aTags[MAXTAGS]; // The list of tag counts/data
  32. } TRIESCAN, *PTRIESCAN, *LPTRIESCAN;
  33. // Trie node flags, only the lower 16 bits of the flags are saved in the trie
  34. #define TRIE_NODE_VALID 0x00000001 // wch is the last letter of a valid word
  35. #define TRIE_NODE_END 0x00000002 // Last node in the state (no more alternatives to wch)
  36. #define TRIE_NODE_COUNT 0x00000004 // The count of words in the subtree is stored in the node
  37. #define TRIE_NODE_TAGGED 0x00000008 // The node has tagged data
  38. #define TRIE_NODE_DOWN 0x00000010 // iDown is valid (word so far is a valid prefix)
  39. #define TRIE_NODE_RIGHT 0x00000020 // iRight is valid (word connects to a substate)
  40. #define TRIE_DOWN_INLINE 0x00000040 // iDown omitted, since it points to next node in memory
  41. #define TRIE_DOWN_MULTI 0x00000080 // iDown is a second reference or worse
  42. #define TRIE_DOWN_ABS 0x00000100 // iDown is an absolute immediate offset into the trie
  43. #define TRIE_NODE_SKIP 0x00000200 // Either iRight is a skip pointer or EOS is a 'soft' EOS
  44. #define TRIE_NODE_SKIP_COUNT 0x00000400 // cSkipWords is valid
  45. /* Macro to access the data in the node, works for dawgs and tries */
  46. #define DAWGDATA(pdawg) ((pdawg)->wch)
  47. #define DAWGDOWNFLAG(pdawg) ((pdawg)->wFlags & TRIE_NODE_DOWN)
  48. #define DAWGENDFLAG(pdawg) ((pdawg)->wFlags & TRIE_NODE_END)
  49. #define DAWGWORDFLAG(pdawg) ((pdawg)->wFlags & TRIE_NODE_VALID)
  50. /* Fixed-length part of the compressed trie header */
  51. typedef struct tagTRIESTATS
  52. {
  53. WORD version; // Version of this particular compressed trie
  54. WORD __pad0; //
  55. BYTE wTagsMask; // Which tags are in use
  56. BYTE wEnumMask; // Which tags have enumeration
  57. BYTE wDataMask; // Which tags have stored data
  58. BYTE cTagFields; // Total tags in use
  59. WORD cMaxWord; // Number of characters in longest word
  60. WORD cMaxState; // Number of nodes in longest state (max alternatives)
  61. WORD cCharFlagsCodesMax; // Bytes in longest char/flags code
  62. WORD cTagsCodesMax; // Bytes in longest tagged data code
  63. WORD cMRPointersCodesMax; // Bytes in longest MR pointer code
  64. WORD cSROffsetsCodesMax; // Bytes in longest Single-ref code
  65. DWORD cWords; // Number of words in dictionary
  66. DWORD cUniqueSROffsets; // Unique offsets in Single-ref segment
  67. DWORD cUniqueCharFlags; // Unique char/flags pairs
  68. DWORD cUniqueTags; // Unique tagged data values
  69. DWORD cUniqueMRPointers; // Unique multi-ref pointers
  70. DWORD cbHeader; // Bytes in header & tables
  71. DWORD cbTrie; // Bytes in trie
  72. } TRIESTATS, *PTRIESTATS, *LPTRIESTATS;
  73. /* Primary unit of a node. Nodes usually contain a pointer too */
  74. typedef struct tagCHARFLAGS {
  75. wchar_t wch;
  76. short wFlags;
  77. } CHARFLAGS, *PCHARFLAGS, *LPCHARFLAGS;
  78. /* Control structure used to decompress the trie */
  79. typedef struct tagTRIECTRL
  80. {
  81. TRIESTATS *lpTrieStats; // Pointer to base of header segment
  82. WORD *lpwCharFlagsCodes; // decoding table for Char/flags
  83. WORD *lpwTagsCodes; // decoding table for tagged data
  84. WORD *lpwMRPointersCodes; // decoding table for multiref pointers
  85. WORD *lpwSROffsetsCodes; // decoding table for singleref offsets
  86. CHARFLAGS *lpCharFlags; // table to convert codes to char/flags
  87. DWORD *lpwTags; // table to convert codes to tagged data
  88. DWORD *lpwMRPointers; // table to convert codes to multiref pointers
  89. DWORD *lpwSROffsets; // table to convert codes to Singleref offsets
  90. BYTE *lpbTrie; // Pointer to the trie.
  91. } TRIECTRL, *PTRIECTRL, *LPTRIECTRL;
  92. /* Useful Constants */
  93. #define TRIE_MAX_DEPTH 128 // We'll fail on any words longer than this
  94. // The prototypes below are plain C (this is required for use with C++)
  95. /* Given a pointer to a mapped file or resource containing a compressed trie,
  96. read the trie into memory, making all the allocations required */
  97. TRIECTRL * WINAPI TrieInit(LPBYTE lpByte);
  98. /* Free all the allocations associated with a trie */
  99. void WINAPI TrieFree(LPTRIECTRL lpTrieCtrl);
  100. void WINAPI TrieDecompressNode(LPTRIECTRL lpTrieCtrl, LPTRIESCAN lpTrieScan);
  101. /* Given a compressed trie and a pointer to a decompresed node from it, find and decompress
  102. the next node in the same state. lpTrieScan is a user-allocated structure that holds the
  103. decompressed node and into which the new node is copied.
  104. This is equivalent to traversing a right pointer or finding the next alternative
  105. letter at the same position. If there is no next node (i.e.this is the end of the state)
  106. then TrieGetNextNode returns FALSE. To scan from the beginning of the trie, set the lpTrieScan
  107. structure to zero */
  108. BOOL WINAPI
  109. TrieGetNextNode(LPTRIECTRL lpTrieCtrl, LPTRIESCAN lpTrieScan);
  110. BOOL WINAPI
  111. TrieSkipNextNode(LPTRIECTRL lpTrieCtrl, LPTRIESCAN lpTrieScan, WCHAR wch);
  112. /* Follow the down pointer to the next state. This is equivalent to accepting the character
  113. in this node and advancing to the next character position. Returns FALSE if there is no
  114. down pointer. This also decompresses the first node in the state, so all the values in
  115. lpTrieScan will be good. */
  116. BOOL WINAPI
  117. TrieGetNextState(LPTRIECTRL lpTrieCtrl, LPTRIESCAN lpTrieScan);
  118. /* Check the validity of a word or prefix. Starts from the root of pTrie looking for
  119. pwszWord. If it finds it, it returns TRUE and the user-provided lpTrieScan structure
  120. contains the final node in the word. If there is no path, TrieCheckWord returns FALSE
  121. To distinguisha valid word from a valid prefix, caller must test
  122. wFlags for fTrieNodeValid. */
  123. BOOL WINAPI
  124. TrieCheckWord(LPTRIECTRL lpTrieCtrl, LPTRIESCAN lpTrieScan, wchar_t * lpwszWord);
  125. /* Walk the trie from pTrieNode, calling pfnTrieWord on every valid word. pvParam is passed through
  126. to pfnTrieWord. If pfnTrieWord returns non-zero, the enumeration stops. lpwszWord must point to a
  127. space of cwchTrieWordMax+1 wchar_t's. To walk the entire trie, set *pTrieScan to all zeros. Returns
  128. the number of words traversed. pfnTrieWord may be null if all you want is the count of words. */
  129. int WINAPI
  130. TrieEnumerate(
  131. LPTRIECTRL lpTrieCtrl, // Trie to enumerate
  132. LPTRIESCAN lpTrieScan, // structure holding starting point, all-zero for whole trie
  133. wchar_t *pwszWord, // buffer to hold words being enumerated
  134. void *pvParam, // parameter to pass to pfnTrieWord
  135. int (*pfnTrieWord)(wchar_t *pwszWord, void *pvParam)
  136. );
  137. int WINAPI
  138. TrieWordToIndex(
  139. TRIECTRL *ptc, // Trie in which to find word index
  140. wchar_t *pwszWord // Word for which we're looking
  141. );
  142. BOOL WINAPI
  143. TrieIndexToWord(
  144. TRIECTRL *ptc, // Trie in which to find indexed word
  145. DWORD nIndex, // Index for which we're looking
  146. wchar_t *pwszWord, // Returned word
  147. int cwc // Max characters in buffer (including NULL)
  148. );
  149. int WINAPI
  150. TrieWordToTagIndex(
  151. TRIECTRL *ptc, // Trie in which to find word index
  152. wchar_t *pwszWord, // Word for which we're looking
  153. int tag // Which tag to enumerate
  154. );
  155. BOOL WINAPI
  156. TrieTagIndexToWord(
  157. TRIECTRL *ptc, // Trie in which to find indexed word
  158. DWORD nIndex, // Index for which we're looking
  159. wchar_t *pwszWord, // Returned word
  160. int cwc, // Max characters in buffer (including NULL)
  161. int tag // Which tag to enumerate
  162. );
  163. BOOL WINAPI
  164. TrieGetTagsFromWord(
  165. TRIECTRL *ptc, // Trie in which to find word
  166. wchar_t *pwszWord, // Word for which we're looking
  167. DWORD *pdw, // Returned values
  168. BYTE *pbValid // Mask for valid return values
  169. );
  170. int WINAPI
  171. TriePrefixToRange(
  172. TRIECTRL *ptc, // Trie in which to find prefix range
  173. wchar_t *pwszWord, // Prefix for which we're looking
  174. int *piStart // Start of range with this prefix
  175. );
  176. /**** Subroutines for traversing Directed Acyclic Word Graphs ****/
  177. /* Abstract trie node structure. wch is a character to transition on; flags describe various things
  178. about the compressed trie; iDown indexes the first node in the state wch transitions to. DAWG is a special
  179. kind of trie: a "Directed Acyclic Word Graph," essentially an ending-compressed trie. */
  180. typedef struct tagDAWGNODE
  181. {
  182. DWORD wch; // Unicode character
  183. DWORD wFlags; // see below
  184. DWORD cWords; // Words below this node in the subtree
  185. DWORD cSkipWords; // Words below skipped nodes
  186. DWORD iDown; // Offset of first node in next state
  187. DWORD iRight; // Offset to first node in next substate
  188. DWORD cTags[8]; // Count of tagged nodes below this node in the subtree
  189. DWORD dwData[8]; // Stored tagged data for this node
  190. } DAWGNODE, *PDAWGNODE, *LPDAWGNODE;
  191. /* Given a trie and a pointer to a node in it, find the next node in that state.
  192. This is equivalent to traversing a right pointer or finding the next alternative
  193. letter at the same position. Returns a pointer to the new node, NULL if there is
  194. no next node (i.e. if this is the end of a state).*/
  195. DAWGNODE * WINAPI DawgGetNextNode(void *pTrie, DAWGNODE *pTrieNode);
  196. /* From this node, find the first node in the state it points to. This is equivalent
  197. to traversing a down pointer or extending the word one letter and finding the first
  198. alternative. Returns a pointer to the first node in the new state, NULL if there is
  199. no down pointer. To find the first state in the trie, use pTrieNode == NULL */
  200. DAWGNODE * WINAPI DawgGetNextState(void *pTrie, DAWGNODE *pTrieNode);
  201. /* Check the validity of a word or prefix. Starts from the root of pTrie looking for
  202. pwszWord. If it finds it, it returns a pointer to the terminal node in pTrie Returns
  203. NULL if there is no path through the trie that corresponds to pwszWord. To distinguish
  204. a valid word from a valid prefix, caller must test wFlags for fTrieNodeValid. */
  205. DAWGNODE * WINAPI DawgCheckWord(void *pTrie, wchar_t *pwszWord);
  206. /* Walk the trie from pTrieNode, calling pfnTrieWord on every valid word. pvParam is passed through
  207. to pfnTrieWord. If pfnTrieWord returns non-zero, the enumeration stops. pwszWord must point to a
  208. space of cwchTrieWordMax+1 wchar_t's. To walk the entire trie, pass NULL for pTrieNode. Returns
  209. the number of words traversed. pfnTrieWord may be null if all you want is the count of words. */
  210. int WINAPI
  211. DawgEnumerate(
  212. void *pTrie, // Trie to enumerate
  213. DAWGNODE *pTrieNodeStart, // point to enumerate from, NULL if all
  214. wchar_t *pwszWord, // buffer to hold words being enumerated
  215. void *pvParam, // parameter to pass to pfnTrieWord
  216. int (*pfnTrieWord)(wchar_t *pwszWord, void *pvParam)
  217. );
  218. // end plain C Prototypes
  219. #ifdef __cplusplus
  220. }
  221. #endif
  222. #endif // TRIE_H