|
|
#ifndef TRIE_H
#define TRIE_H
#include <windows.h>
#ifdef __cplusplus
extern "C" { #endif
/* Abstract trie node structure. wch is a character to transition on; flags describe various things
about the compressed trie; lpbNode points to the first byte of the next node in this state, and lpbDown points to the first byte referenced by the down pointer, if any */
typedef struct tagTAGDATA { DWORD cTag; // Count of tagged nodes below this node in the subtree
DWORD dwData; // Stored tagged data for this node
} TAGDATA;
#define MAXTAGS 8
#if MAXTAGS > 8
#error No more than 8 tags are allowed
#endif
typedef struct tagTRIESCAN { WCHAR wch; // Unicode character
WORD wFlags; // see below
WORD wMask; // which tags are valid
WORD __pad0; //
DWORD cWords; // Words in subtree (only valid if TRIE_NODE_COUNT is set)
DWORD cSkipWords; // Words in subtrees ignored when following a skip pointer
LPBYTE lpbNode; // Address of next byte within the compressed trie
LPBYTE lpbDown; // Address referenced by down pointer, if any
LPBYTE lpbRight; // Address referenced by right pointer, if any
LPBYTE lpbSRDown; // Last single-ref address referenced
TAGDATA aTags[MAXTAGS]; // The list of tag counts/data
} TRIESCAN, *PTRIESCAN, *LPTRIESCAN;
// Trie node flags, only the lower 16 bits of the flags are saved in the trie
#define TRIE_NODE_VALID 0x00000001 // wch is the last letter of a valid word
#define TRIE_NODE_END 0x00000002 // Last node in the state (no more alternatives to wch)
#define TRIE_NODE_COUNT 0x00000004 // The count of words in the subtree is stored in the node
#define TRIE_NODE_TAGGED 0x00000008 // The node has tagged data
#define TRIE_NODE_DOWN 0x00000010 // iDown is valid (word so far is a valid prefix)
#define TRIE_NODE_RIGHT 0x00000020 // iRight is valid (word connects to a substate)
#define TRIE_DOWN_INLINE 0x00000040 // iDown omitted, since it points to next node in memory
#define TRIE_DOWN_MULTI 0x00000080 // iDown is a second reference or worse
#define TRIE_DOWN_ABS 0x00000100 // iDown is an absolute immediate offset into the trie
#define TRIE_NODE_SKIP 0x00000200 // Either iRight is a skip pointer or EOS is a 'soft' EOS
#define TRIE_NODE_SKIP_COUNT 0x00000400 // cSkipWords is valid
/* Macro to access the data in the node, works for dawgs and tries */
#define DAWGDATA(pdawg) ((pdawg)->wch)
#define DAWGDOWNFLAG(pdawg) ((pdawg)->wFlags & TRIE_NODE_DOWN)
#define DAWGENDFLAG(pdawg) ((pdawg)->wFlags & TRIE_NODE_END)
#define DAWGWORDFLAG(pdawg) ((pdawg)->wFlags & TRIE_NODE_VALID)
/* Fixed-length part of the compressed trie header */
typedef struct tagTRIESTATS { WORD version; // Version of this particular compressed trie
WORD __pad0; //
BYTE wTagsMask; // Which tags are in use
BYTE wEnumMask; // Which tags have enumeration
BYTE wDataMask; // Which tags have stored data
BYTE cTagFields; // Total tags in use
WORD cMaxWord; // Number of characters in longest word
WORD cMaxState; // Number of nodes in longest state (max alternatives)
WORD cCharFlagsCodesMax; // Bytes in longest char/flags code
WORD cTagsCodesMax; // Bytes in longest tagged data code
WORD cMRPointersCodesMax; // Bytes in longest MR pointer code
WORD cSROffsetsCodesMax; // Bytes in longest Single-ref code
DWORD cWords; // Number of words in dictionary
DWORD cUniqueSROffsets; // Unique offsets in Single-ref segment
DWORD cUniqueCharFlags; // Unique char/flags pairs
DWORD cUniqueTags; // Unique tagged data values
DWORD cUniqueMRPointers; // Unique multi-ref pointers
DWORD cbHeader; // Bytes in header & tables
DWORD cbTrie; // Bytes in trie
} TRIESTATS, *PTRIESTATS, *LPTRIESTATS;
/* Primary unit of a node. Nodes usually contain a pointer too */
typedef struct tagCHARFLAGS { wchar_t wch; short wFlags; } CHARFLAGS, *PCHARFLAGS, *LPCHARFLAGS;
/* Control structure used to decompress the trie */
typedef struct tagTRIECTRL { TRIESTATS *lpTrieStats; // Pointer to base of header segment
WORD *lpwCharFlagsCodes; // decoding table for Char/flags
WORD *lpwTagsCodes; // decoding table for tagged data
WORD *lpwMRPointersCodes; // decoding table for multiref pointers
WORD *lpwSROffsetsCodes; // decoding table for singleref offsets
CHARFLAGS *lpCharFlags; // table to convert codes to char/flags
DWORD *lpwTags; // table to convert codes to tagged data
DWORD *lpwMRPointers; // table to convert codes to multiref pointers
DWORD *lpwSROffsets; // table to convert codes to Singleref offsets
BYTE *lpbTrie; // Pointer to the trie.
} TRIECTRL, *PTRIECTRL, *LPTRIECTRL;
/* Useful Constants */
#define TRIE_MAX_DEPTH 128 // We'll fail on any words longer than this
// The prototypes below are plain C (this is required for use with C++)
/* Given a pointer to a mapped file or resource containing a compressed trie,
read the trie into memory, making all the allocations required */
TRIECTRL * WINAPI TrieInit(LPBYTE lpByte);
/* Free all the allocations associated with a trie */
void WINAPI TrieFree(LPTRIECTRL lpTrieCtrl);
void WINAPI TrieDecompressNode(LPTRIECTRL lpTrieCtrl, LPTRIESCAN lpTrieScan);
/* Given a compressed trie and a pointer to a decompresed node from it, find and decompress
the next node in the same state. lpTrieScan is a user-allocated structure that holds the decompressed node and into which the new node is copied. This is equivalent to traversing a right pointer or finding the next alternative letter at the same position. If there is no next node (i.e.this is the end of the state) then TrieGetNextNode returns FALSE. To scan from the beginning of the trie, set the lpTrieScan structure to zero */
BOOL WINAPI TrieGetNextNode(LPTRIECTRL lpTrieCtrl, LPTRIESCAN lpTrieScan);
BOOL WINAPI TrieSkipNextNode(LPTRIECTRL lpTrieCtrl, LPTRIESCAN lpTrieScan, WCHAR wch);
/* Follow the down pointer to the next state. This is equivalent to accepting the character
in this node and advancing to the next character position. Returns FALSE if there is no down pointer. This also decompresses the first node in the state, so all the values in lpTrieScan will be good. */
BOOL WINAPI TrieGetNextState(LPTRIECTRL lpTrieCtrl, LPTRIESCAN lpTrieScan);
/* Check the validity of a word or prefix. Starts from the root of pTrie looking for
pwszWord. If it finds it, it returns TRUE and the user-provided lpTrieScan structure contains the final node in the word. If there is no path, TrieCheckWord returns FALSE To distinguisha valid word from a valid prefix, caller must test wFlags for fTrieNodeValid. */
BOOL WINAPI TrieCheckWord(LPTRIECTRL lpTrieCtrl, LPTRIESCAN lpTrieScan, wchar_t * lpwszWord);
int WINAPI TrieWordToIndex( TRIECTRL *ptc, // Trie in which to find word index
wchar_t *pwszWord // Word for which we're looking
);
BOOL WINAPI TrieIndexToWord( TRIECTRL *ptc, // Trie in which to find indexed word
DWORD nIndex, // Index for which we're looking
wchar_t *pwszWord, // Returned word
int cwc // Max characters in buffer (including NULL)
);
int WINAPI TrieWordToTagIndex( TRIECTRL *ptc, // Trie in which to find word index
wchar_t *pwszWord, // Word for which we're looking
int tag // Which tag to enumerate
);
BOOL WINAPI TrieTagIndexToWord( TRIECTRL *ptc, // Trie in which to find indexed word
DWORD nIndex, // Index for which we're looking
wchar_t *pwszWord, // Returned word
int cwc, // Max characters in buffer (including NULL)
int tag // Which tag to enumerate
);
BOOL WINAPI TrieGetTagsFromWord( TRIECTRL *ptc, // Trie in which to find word
wchar_t *pwszWord, // Word for which we're looking
DWORD *pdw, // Returned values
BYTE *pbValid // Mask for valid return values
);
int WINAPI TriePrefixToRange( TRIECTRL *ptc, // Trie in which to find prefix range
wchar_t *pwszWord, // Prefix for which we're looking
int *piStart // Start of range with this prefix
);
/**** Subroutines for traversing Directed Acyclic Word Graphs ****/
/* Abstract trie node structure. wch is a character to transition on; flags describe various things
about the compressed trie; iDown indexes the first node in the state wch transitions to. DAWG is a special kind of trie: a "Directed Acyclic Word Graph," essentially an ending-compressed trie. */
typedef struct tagDAWGNODE { DWORD wch; // Unicode character
DWORD wFlags; // see below
DWORD cWords; // Words below this node in the subtree
DWORD cSkipWords; // Words below skipped nodes
DWORD iDown; // Offset of first node in next state
DWORD iRight; // Offset to first node in next substate
DWORD cTags[8]; // Count of tagged nodes below this node in the subtree
DWORD dwData[8]; // Stored tagged data for this node
} DAWGNODE, *PDAWGNODE, *LPDAWGNODE;
/* Given a trie and a pointer to a node in it, find the next node in that state.
This is equivalent to traversing a right pointer or finding the next alternative letter at the same position. Returns a pointer to the new node, NULL if there is no next node (i.e. if this is the end of a state).*/
DAWGNODE * WINAPI DawgGetNextNode(void *pTrie, DAWGNODE *pTrieNode);
/* From this node, find the first node in the state it points to. This is equivalent
to traversing a down pointer or extending the word one letter and finding the first alternative. Returns a pointer to the first node in the new state, NULL if there is no down pointer. To find the first state in the trie, use pTrieNode == NULL */
DAWGNODE * WINAPI DawgGetNextState(void *pTrie, DAWGNODE *pTrieNode);
/* Check the validity of a word or prefix. Starts from the root of pTrie looking for
pwszWord. If it finds it, it returns a pointer to the terminal node in pTrie Returns NULL if there is no path through the trie that corresponds to pwszWord. To distinguish a valid word from a valid prefix, caller must test wFlags for fTrieNodeValid. */
DAWGNODE * WINAPI DawgCheckWord(void *pTrie, wchar_t *pwszWord);
/* Walk the trie from pTrieNode, calling pfnTrieWord on every valid word. pvParam is passed through
to pfnTrieWord. If pfnTrieWord returns non-zero, the enumeration stops. pwszWord must point to a space of cwchTrieWordMax+1 wchar_t's. To walk the entire trie, pass NULL for pTrieNode. Returns the number of words traversed. pfnTrieWord may be null if all you want is the count of words. */
int WINAPI DawgEnumerate( void *pTrie, // Trie to enumerate
DAWGNODE *pTrieNodeStart, // point to enumerate from, NULL if all
wchar_t *pwszWord, // buffer to hold words being enumerated
void *pvParam, // parameter to pass to pfnTrieWord
int (*pfnTrieWord)(wchar_t *pwszWord, void *pvParam) );
// end plain C Prototypes
#ifdef __cplusplus
} #endif
#endif // TRIE_H
|