#ifndef TRIE_H #define TRIE_H #include #ifdef __cplusplus extern "C" { #endif /* Abstract trie node structure. wch is a character to transition on; flags describe various things about the compressed trie; lpbNode points to the first byte of the next node in this state, and lpbDown points to the first byte referenced by the down pointer, if any */ typedef struct tagTAGDATA { DWORD cTag; // Count of tagged nodes below this node in the subtree DWORD dwData; // Stored tagged data for this node } TAGDATA; #define MAXTAGS 1 #if MAXTAGS > 8 #error No more than 8 tags are allowed #endif typedef struct tagTRIESCAN { WCHAR wch; // Unicode character WORD wFlags; // see below WORD wMask; // which tags are valid WORD __pad0; // DWORD cWords; // Words in subtree (only valid if TRIE_NODE_COUNT is set) DWORD cSkipWords; // Words in subtrees ignored when following a skip pointer LPBYTE lpbNode; // Address of next byte within the compressed trie LPBYTE lpbDown; // Address referenced by down pointer, if any LPBYTE lpbRight; // Address referenced by right pointer, if any LPBYTE lpbSRDown; // Last single-ref address referenced TAGDATA aTags[MAXTAGS]; // The list of tag counts/data } TRIESCAN, *PTRIESCAN, *LPTRIESCAN; // Trie node flags, only the lower 16 bits of the flags are saved in the trie #define TRIE_NODE_VALID 0x00000001 // wch is the last letter of a valid word #define TRIE_NODE_END 0x00000002 // Last node in the state (no more alternatives to wch) #define TRIE_NODE_COUNT 0x00000004 // The count of words in the subtree is stored in the node #define TRIE_NODE_TAGGED 0x00000008 // The node has tagged data #define TRIE_NODE_DOWN 0x00000010 // iDown is valid (word so far is a valid prefix) #define TRIE_NODE_RIGHT 0x00000020 // iRight is valid (word connects to a substate) #define TRIE_DOWN_INLINE 0x00000040 // iDown omitted, since it points to next node in memory #define TRIE_DOWN_MULTI 0x00000080 // iDown is a second reference or worse #define TRIE_DOWN_ABS 0x00000100 // iDown is an absolute immediate offset into the trie #define TRIE_NODE_SKIP 0x00000200 // Either iRight is a skip pointer or EOS is a 'soft' EOS #define TRIE_NODE_SKIP_COUNT 0x00000400 // cSkipWords is valid /* Macro to access the data in the node, works for dawgs and tries */ #define DAWGDATA(pdawg) ((pdawg)->wch) #define DAWGDOWNFLAG(pdawg) ((pdawg)->wFlags & TRIE_NODE_DOWN) #define DAWGENDFLAG(pdawg) ((pdawg)->wFlags & TRIE_NODE_END) #define DAWGWORDFLAG(pdawg) ((pdawg)->wFlags & TRIE_NODE_VALID) /* Fixed-length part of the compressed trie header */ typedef struct tagTRIESTATS { WORD version; // Version of this particular compressed trie WORD __pad0; // BYTE wTagsMask; // Which tags are in use BYTE wEnumMask; // Which tags have enumeration BYTE wDataMask; // Which tags have stored data BYTE cTagFields; // Total tags in use WORD cMaxWord; // Number of characters in longest word WORD cMaxState; // Number of nodes in longest state (max alternatives) WORD cCharFlagsCodesMax; // Bytes in longest char/flags code WORD cTagsCodesMax; // Bytes in longest tagged data code WORD cMRPointersCodesMax; // Bytes in longest MR pointer code WORD cSROffsetsCodesMax; // Bytes in longest Single-ref code DWORD cWords; // Number of words in dictionary DWORD cUniqueSROffsets; // Unique offsets in Single-ref segment DWORD cUniqueCharFlags; // Unique char/flags pairs DWORD cUniqueTags; // Unique tagged data values DWORD cUniqueMRPointers; // Unique multi-ref pointers DWORD cbHeader; // Bytes in header & tables DWORD cbTrie; // Bytes in trie } TRIESTATS, *PTRIESTATS, *LPTRIESTATS; /* Primary unit of a node. Nodes usually contain a pointer too */ typedef struct tagCHARFLAGS { wchar_t wch; short wFlags; } CHARFLAGS, *PCHARFLAGS, *LPCHARFLAGS; /* Control structure used to decompress the trie */ typedef struct tagTRIECTRL { TRIESTATS *lpTrieStats; // Pointer to base of header segment WORD *lpwCharFlagsCodes; // decoding table for Char/flags WORD *lpwTagsCodes; // decoding table for tagged data WORD *lpwMRPointersCodes; // decoding table for multiref pointers WORD *lpwSROffsetsCodes; // decoding table for singleref offsets CHARFLAGS *lpCharFlags; // table to convert codes to char/flags DWORD *lpwTags; // table to convert codes to tagged data DWORD *lpwMRPointers; // table to convert codes to multiref pointers DWORD *lpwSROffsets; // table to convert codes to Singleref offsets BYTE *lpbTrie; // Pointer to the trie. } TRIECTRL, *PTRIECTRL, *LPTRIECTRL; /* Useful Constants */ #define TRIE_MAX_DEPTH 128 // We'll fail on any words longer than this // The prototypes below are plain C (this is required for use with C++) /* Given a pointer to a mapped file or resource containing a compressed trie, read the trie into memory, making all the allocations required */ TRIECTRL * WINAPI TrieInit(LPBYTE lpByte); /* Free all the allocations associated with a trie */ void WINAPI TrieFree(LPTRIECTRL lpTrieCtrl); void WINAPI TrieDecompressNode(LPTRIECTRL lpTrieCtrl, LPTRIESCAN lpTrieScan); /* Given a compressed trie and a pointer to a decompresed node from it, find and decompress the next node in the same state. lpTrieScan is a user-allocated structure that holds the decompressed node and into which the new node is copied. This is equivalent to traversing a right pointer or finding the next alternative letter at the same position. If there is no next node (i.e.this is the end of the state) then TrieGetNextNode returns FALSE. To scan from the beginning of the trie, set the lpTrieScan structure to zero */ BOOL WINAPI TrieGetNextNode(LPTRIECTRL lpTrieCtrl, LPTRIESCAN lpTrieScan); BOOL WINAPI TrieSkipNextNode(LPTRIECTRL lpTrieCtrl, LPTRIESCAN lpTrieScan, WCHAR wch); /* Follow the down pointer to the next state. This is equivalent to accepting the character in this node and advancing to the next character position. Returns FALSE if there is no down pointer. This also decompresses the first node in the state, so all the values in lpTrieScan will be good. */ BOOL WINAPI TrieGetNextState(LPTRIECTRL lpTrieCtrl, LPTRIESCAN lpTrieScan); /* Check the validity of a word or prefix. Starts from the root of pTrie looking for pwszWord. If it finds it, it returns TRUE and the user-provided lpTrieScan structure contains the final node in the word. If there is no path, TrieCheckWord returns FALSE To distinguisha valid word from a valid prefix, caller must test wFlags for fTrieNodeValid. */ BOOL WINAPI TrieCheckWord(LPTRIECTRL lpTrieCtrl, LPTRIESCAN lpTrieScan, wchar_t * lpwszWord); /* Walk the trie from pTrieNode, calling pfnTrieWord on every valid word. pvParam is passed through to pfnTrieWord. If pfnTrieWord returns non-zero, the enumeration stops. lpwszWord must point to a space of cwchTrieWordMax+1 wchar_t's. To walk the entire trie, set *pTrieScan to all zeros. Returns the number of words traversed. pfnTrieWord may be null if all you want is the count of words. */ int WINAPI TrieEnumerate( LPTRIECTRL lpTrieCtrl, // Trie to enumerate LPTRIESCAN lpTrieScan, // structure holding starting point, all-zero for whole trie wchar_t *pwszWord, // buffer to hold words being enumerated void *pvParam, // parameter to pass to pfnTrieWord int (*pfnTrieWord)(wchar_t *pwszWord, void *pvParam) ); int WINAPI TrieWordToIndex( TRIECTRL *ptc, // Trie in which to find word index wchar_t *pwszWord // Word for which we're looking ); BOOL WINAPI TrieIndexToWord( TRIECTRL *ptc, // Trie in which to find indexed word DWORD nIndex, // Index for which we're looking wchar_t *pwszWord, // Returned word int cwc // Max characters in buffer (including NULL) ); int WINAPI TrieWordToTagIndex( TRIECTRL *ptc, // Trie in which to find word index const wchar_t *pwszWord, // Word for which we're looking int tag // Which tag to enumerate ); BOOL WINAPI TrieTagIndexToWord( TRIECTRL *ptc, // Trie in which to find indexed word DWORD nIndex, // Index for which we're looking wchar_t *pwszWord, // Returned word int cwc, // Max characters in buffer (including NULL) int tag // Which tag to enumerate ); BOOL WINAPI TrieGetTagsFromWord( TRIECTRL *ptc, // Trie in which to find word wchar_t *pwszWord, // Word for which we're looking DWORD *pdw, // Returned values BYTE *pbValid // Mask for valid return values ); int WINAPI TriePrefixToRange( TRIECTRL *ptc, // Trie in which to find prefix range const wchar_t *pwszWord, // Prefix for which we're looking int *piStart // Start of range with this prefix ); /**** Subroutines for traversing Directed Acyclic Word Graphs ****/ /* Abstract trie node structure. wch is a character to transition on; flags describe various things about the compressed trie; iDown indexes the first node in the state wch transitions to. DAWG is a special kind of trie: a "Directed Acyclic Word Graph," essentially an ending-compressed trie. */ typedef struct tagDAWGNODE { DWORD wch; // Unicode character DWORD wFlags; // see below DWORD cWords; // Words below this node in the subtree DWORD cSkipWords; // Words below skipped nodes DWORD iDown; // Offset of first node in next state DWORD iRight; // Offset to first node in next substate DWORD cTags[8]; // Count of tagged nodes below this node in the subtree DWORD dwData[8]; // Stored tagged data for this node } DAWGNODE, *PDAWGNODE, *LPDAWGNODE; /* Given a trie and a pointer to a node in it, find the next node in that state. This is equivalent to traversing a right pointer or finding the next alternative letter at the same position. Returns a pointer to the new node, NULL if there is no next node (i.e. if this is the end of a state).*/ DAWGNODE * WINAPI DawgGetNextNode(void *pTrie, DAWGNODE *pTrieNode); /* From this node, find the first node in the state it points to. This is equivalent to traversing a down pointer or extending the word one letter and finding the first alternative. Returns a pointer to the first node in the new state, NULL if there is no down pointer. To find the first state in the trie, use pTrieNode == NULL */ DAWGNODE * WINAPI DawgGetNextState(void *pTrie, DAWGNODE *pTrieNode); /* Check the validity of a word or prefix. Starts from the root of pTrie looking for pwszWord. If it finds it, it returns a pointer to the terminal node in pTrie Returns NULL if there is no path through the trie that corresponds to pwszWord. To distinguish a valid word from a valid prefix, caller must test wFlags for fTrieNodeValid. */ DAWGNODE * WINAPI DawgCheckWord(void *pTrie, wchar_t *pwszWord); /* Walk the trie from pTrieNode, calling pfnTrieWord on every valid word. pvParam is passed through to pfnTrieWord. If pfnTrieWord returns non-zero, the enumeration stops. pwszWord must point to a space of cwchTrieWordMax+1 wchar_t's. To walk the entire trie, pass NULL for pTrieNode. Returns the number of words traversed. pfnTrieWord may be null if all you want is the count of words. */ int WINAPI DawgEnumerate( void *pTrie, // Trie to enumerate DAWGNODE *pTrieNodeStart, // point to enumerate from, NULL if all wchar_t *pwszWord, // buffer to hold words being enumerated void *pvParam, // parameter to pass to pfnTrieWord int (*pfnTrieWord)(wchar_t *pwszWord, void *pvParam) ); // end plain C Prototypes #ifdef __cplusplus } #endif #endif // TRIE_H