|
|
/*
* pathash.c * * author: John R. Douceur * date: 5 May 1997 * * This source file provides functions that implement insertion, removal, * search, scan, and flush operations on the pat-hash table database. The * code is object-oriented C, transliterated from a C++ implementation. * * The pat-hash database is a combination of a dynamically sized, separately * chained hash table and a Patricia tree. The hash table dynamically grows * and shrinks as needed, and the workload of modifying the table size is * distributed evenly among the insertion or removal operations that cause * the growth or shrinkage. * * The insertion and removal operations manage both a hash table and a Patricia * tree, but the search routine uses only the hash table for performing the * search. The Patrica tree is present to support a scan operation, which * searches the database for all entries that match a given pattern, where the * pattern that is scanned may contain wildcards. * * None of the code or comments in this file needs to be understood by writers * of client code; all explanatory information for clients is found in the * associated header file, rhizome.h. * */
#include "gpcpre.h"
#define MAGIC_NUMBER 0x9e4155b9 // Fibonacci hash multiplier (see Knuth 6.4)
// This macro allocates a new pat-hash table entry structure. The size of
// the structure is a function of the value of keybytes, since the entry stores
// a copy of the pattern. The value array, which is the last field in the
// structure, is declared as having a single element, but this array will
// actually extend beyond the defined end of the structure into additional
// space that is allocated for it by the following macro.
//
//#define NEW_PHTableEntry \ // ((PHTableEntry *)malloc(sizeof(PHTableEntry) + phtable->keybytes - 1))
#define NEW_PHTableEntry(_pe) \
GpcAllocMem(&_pe,\ sizeof(PHTableEntry) + phtable->keybytes - 1,\ PathHashTag)
// This macro allocates a new pat-hash table group structure. The size of
// the structure is a function of the size of the group. The entry_list array,
// which is the last field in the structure, is declared as having a single
// element, but this array will actually extend beyond the defined end of the
// structure into additional space that is allocated for it by the following
// macro.
//
//#define NEW_PHTableGroup(group_size) \ // ((PHTableGroup *)malloc(sizeof(PHTableGroup) + \ // ((group_size) - 1) * sizeof(PHTableEntry *)))
#define NEW_PHTableGroup(group_size, _pg) \
GpcAllocMem(&_pg,\ sizeof(PHTableGroup) + \ ((group_size) - 1) * sizeof(PHTableEntry *),\ PathHashTag)
// This macro gets the indexed bit of the value, where the most-significant bit
// is defined as bit 0.
//
#define BIT_OF(value, index) \
(((value)[(index) >> 3] >> (7 - ((index) & 0x7))) & 0x1)
// Following is a prototype for a static function that is used internally by
// the implementation of the pat-hash routines.
void node_scan( PatHashTable *phtable, PHTableEntry *node, int prev_bit, char *value, char *mask, void *context, ScanCallback func);
// Since this is not C++, the PatHashTable structure is not self-constructing;
// therefore, the following constructor code must be called on the PatHashTable
// structure after it is allocated. The argument keybits specifies the size
// (in bits) of each pattern that will be stored in the database. The usage
// ratio is the target ratio of database entries to discrete hash chains, which
// is also the mean length of a hash chain. The usage histeresis is the
// histeresis between resizing operations due to insertions and removals.
// Allocation histeresis is the histeresis between allocation and deallocation
// of groups, specified as a binary exponent. The maximum free list size
// determines the maximum number of elements that will be placed on a free
// list, rather than deallocated, when they are removed.
//
int constructPatHashTable( PatHashTable *phtable, int keybits, int usage_ratio, int usage_histeresis, int allocation_histeresis, int max_free_list_size) { PHTableGroup *group; phtable->keybits = keybits; phtable->keybytes = (keybits - 1) / 8 + 1; phtable->usage_ratio = usage_ratio; phtable->usage_histeresis = usage_histeresis; phtable->allocation_histeresis = allocation_histeresis; phtable->max_free_list_size = max_free_list_size; NEW_PHTableGroup(1, phtable->initial_group); phtable->top_group = phtable->initial_group; phtable->allocation_exponent = 0; phtable->size_exponent = 0; phtable->extension_size = 0; phtable->population = 0; phtable->root = 0; phtable->free_list = 0; phtable->free_list_size = 0; NEW_PHTableGroup(1, group); if (phtable->initial_group == 0 || group == 0) { // Memory could not be allocated for one of the two groups created by
// the constructor. Therefore, we return an indication of failure to
// the client.
// 286334 : Not so fast! Please free memory before leaving...
if (phtable->initial_group != 0) { GpcFreeMem(phtable->initial_group, PatHashTag); }
if (group != 0) { GpcFreeMem(group, PatHashTag); }
return 1; } group->previous = 0; group->entry_list[0] = 0; phtable->initial_group->previous = group; return 0; }
// Since this is not C++, the PatHashTable structure is not self-destructing;
// therefore, the following destructor code must be called on the PatHashTable
// structure before it is deallocated.
//
void destructPatHashTable( PatHashTable *phtable) { PHTableGroup *group, *previous; PHTableEntry *entry, *next; int index, size; // First, free all groups that are allocated but not currently used.
group = phtable->top_group; while (group != phtable->initial_group) { previous = group->previous; GpcFreeMem(group, PatHashTag); group = previous; } // Then, free the entries in the initial group. Since not all fields
// in the initial group's table may be valid, only check those whose
// indices are less than the extension size.
for (index = phtable->extension_size - 1; index >= 0; index--) { entry = group->entry_list[index]; while (entry != 0) { next = entry->next; GpcFreeMem(entry, PatHashTag); entry = next; } } // Then free the initial group.
previous = group->previous; GpcFreeMem(group, PatHashTag); group = previous; // Scan through all remaining groups except the last one, freeing all
// entries in each group, and thereafter freeing the group.
size = 1 << (phtable->size_exponent - 1); while (group->previous != 0) { for (index = size - 1; index >= 0; index--) { entry = group->entry_list[index]; while (entry != 0) { next = entry->next; GpcFreeMem(entry, PatHashTag); entry = next; } } previous = group->previous; GpcFreeMem(group, PatHashTag); group = previous; size >>= 1; } // The last group is special, since it has a size of one, but the logic
// used in the preceding loop would have calculated its size as zero.
// Rather than complicating the previous loop with a check for a single
// special case, we simply free the last group and its entries in the
// following code.
entry = group->entry_list[0]; while (entry != 0) { next = entry->next; GpcFreeMem(entry, PatHashTag); entry = next; } GpcFreeMem(group, PatHashTag); // Finally, free all of the entries in the free list.
while (phtable->free_list != 0) { next = phtable->free_list->next; GpcFreeMem(phtable->free_list, PatHashTag); phtable->free_list = next; } }
// This function inserts a new specific pattern into the database, passed as
// an array of bytes. The client supplies a digested form of the pattern as
// the chyme argument.
//
// The client specifies a void pointer reference value to associate with the
// specific pattern. When the specific pattern is installed, the insert
// routine returns a pointer to a SpecificPatternHandle.
//
// If the submitted pattern has already been installed in the database, then
// the insertion does not occur, and the SpecificPatternHandle of the
// previously installed pattern is returned.
//
// The insertion routine inserts the new pattern into both the hash table and
// the Patricia tree, and the two insertions are almost completely independent
// except for the shared entry structure.
//
SpecificPatternHandle insertPatHashTable( PatHashTable *phtable, char *pattern, unsigned int chyme, void *reference) { unsigned int hash, address, small_address, split_point; PHTableGroup *group; PHTableEntry **entry, *new_entry; char *value; int index, group_size, pivot_bit, bit_value; // The first portion of this routine inserts the new pattern into the hash
// table. To begin, we determine whether the number of hash chains needs
// to be increased in order to maintain the desired usage ratio.
group_size = 1 << phtable->size_exponent; if (phtable->population >= (group_size + phtable->extension_size) * phtable->usage_ratio) { // The number of hash chains needs to be increased. So, determine
// whether the initial group is completely full.
if (phtable->extension_size == group_size) { // The initial group is completely full. So, determine whether
// all allocated groups are currently in use.
if (phtable->allocation_exponent == phtable->size_exponent) { // All allocated groups are currently in use. So, allocate
// a new group and set its previous pointer to point to the
// initial group. Update the allocation values of the structure
// to reflect the new allocation.
NEW_PHTableGroup(group_size << 1, group); if (group == 0) { // Memory could not be allocated for the new group.
// Therefore, we return an indication of falure to the
// client.
return 0; } group->previous = phtable->initial_group; phtable->top_group = group; phtable->allocation_exponent++; } else { // Not all allocated groups are in use. So, scanning backward
// from the top group, find the group that immediately follows
// the initial group.
group = phtable->top_group; while (group->previous != phtable->initial_group) { group = group->previous; } } // We now have either a newly allocated group or a previously
// allocated group that immediately follows the initial group.
// Set this group to be the new initial group, and set the extension
// size to zero.
phtable->initial_group = group; phtable->size_exponent++; phtable->extension_size = 0; } else { // The initial group is not completely full. So, select the initial
// group.
group = phtable->initial_group; } // We now have a group that is not completely full, either because it
// wasn't completely full when the insert routine was entered, or
// because it has just been allocated. In either case, we now split
// a hash chain from a smaller group into two hash chains, one of which
// will be placed into an unused entry in the new group. The address
// of the hash chain to be split is determined by the extension size.
// First we find the group that contains this address.
group = group->previous; address = phtable->extension_size; while ((address & 0x1) == 0 && group->previous != 0) { address >>= 1; group = group->previous; } // Then, we scan through the entry list at the given address for the
// appropriate split point. The entries are stored in sorted order,
// and we are essentially shifting one more bit into the address for
// this value, so the split point can be found by searching for the
// first entry with the bit set.
address >>= 1; entry = &group->entry_list[address]; split_point = ((phtable->extension_size << 1) | 0x1) << (31 - phtable->size_exponent); while (*entry != 0 && (*entry)->hash < split_point) { entry = &(*entry)->next; } // Now that we have found the split point, we move the split-off
// piece of the list to the new address, and increment the extension
// size.
phtable->initial_group->entry_list[phtable->extension_size] = *entry; *entry = 0; phtable->extension_size++; } // Now that the memory management aspects of the hash table insertion have
// been taken care of, we can perform the actual insertion. First, we find
// the address by hashing the chyme value.
group = phtable->initial_group; hash = MAGIC_NUMBER * chyme; address = hash >> (31 - phtable->size_exponent); // There are two possible values for the address depending upon whether
// the hash chain pointer is below the extension size. If it is, then the
// larger (by one bit) address is used; otherwise, the smaller address is
// used.
small_address = address >> 1; if ((int)small_address >= phtable->extension_size) { address = small_address; group = group->previous; } // Next we find the group that contains this address.
while ((address & 0x1) == 0 && group->previous != 0) { address >>= 1; group = group->previous; } // Then, we scan through the entry list at the given address for the first
// entry whose hash value is equal to or greater than the hash of the search
// key. The entries are stored in sorted order to improve the search speed.
address >>= 1; entry = &group->entry_list[address]; while (*entry != 0 && (*entry)->hash < hash) { entry = &(*entry)->next; } // Now, we check all entries whose hash value matches that of the search
// key.
while (*entry != 0 && (*entry)->hash == hash) { // For each value whose hash matches, check the actual value to see
// if it matches the search key.
value = (*entry)->value; for (index = phtable->keybytes-1; index >= 0; index--) { if (value[index] != pattern[index]) { break; } } if (index < 0) { // A match is found, so we return the SpecificPatternHandle of the
// matching entry to the client.
return *entry; } entry = &(*entry)->next; } // A match was not found, so we insert the new entry into the hash chain.
// First we check to see if there is an entry avalable on the free list.
if (phtable->free_list != 0) { // There is an entry available on the free list, so grab it and
// decrement the size of the free list.
new_entry = phtable->free_list; phtable->free_list = phtable->free_list->next; phtable->free_list_size--; } else { // There is no entry available on the free list, so allocate a new one.
NEW_PHTableEntry(new_entry); if (new_entry == 0) { // Memory could not be allocated for the new entry. Therefore,
// we return an indication of falure to the client.
return 0; } } // Set the fields of the new entry to the appropriate information and add
// the entry to the hash chain.
new_entry->hash = hash; new_entry->reference = reference; new_entry->next = *entry; for (index = phtable->keybytes - 1; index >= 0; index--) { new_entry->value[index] = pattern[index]; } *entry = new_entry; // The hash table insertion is now complete. Here we begin the insertion
// of the new entry into the Patricia tree. We have to treat an empty
// tree as a special case.
if (phtable->root == 0) { // The Patricia tree is empty, so we set the root to point to the new
// entry. This entry is special, since it serves only as a leaf of
// the Patricia search and not also as a branch node. A Patricia tree
// always contains one fewer branch node than the number of leaves.
// Since a leaf is determined by a pivot bit that is less than or equal
// to the pivot bit of the parent branch node, a pivot bit of -1 flags
// this node as always a leaf.
new_entry->pivot_bit = -1; new_entry->children[0] = 0; new_entry->children[1] = 0; phtable->root = new_entry; } else { // The Patricia tree is not empty, so we proceed with the normal
// insertion process. Beginning at the root, scan through the tree
// according to the bits of the new pattern, until we reach a leaf.
entry = &phtable->root; index = -1; while ((*entry)->pivot_bit > index) { index = (*entry)->pivot_bit; entry = &(*entry)->children[BIT_OF(pattern, index)]; } // Now, compare the new pattern, bit by bit, to the pattern stored at
// the leaf, until a non-matching bit is found. There is no need to
// check for an exact match, since the hash insert above would have
// aborted if an exact match had been found.
value = (*entry)->value; pivot_bit = 0; while (BIT_OF(value, pivot_bit) == BIT_OF(pattern, pivot_bit)) { pivot_bit++; } // Now, scan a second time through the tree, until finding either a leaf
// or a branch with a pivot bit greater than the bit of the non-match.
entry = &phtable->root; index = -1; while ((*entry)->pivot_bit > index && (*entry)->pivot_bit < pivot_bit) { index = (*entry)->pivot_bit; entry = &(*entry)->children[BIT_OF(pattern, index)]; } // This is the point at which the new branch must be inserted. Since
// each node is both a branch and a leaf, the new entry serves as the
// new branch, and one of its children points to itself as a leaf. The
// other child points to the remaining subtree below the insertion
// point.
bit_value = BIT_OF(value, pivot_bit); new_entry->pivot_bit = pivot_bit; new_entry->children[1 - bit_value] = new_entry; new_entry->children[bit_value] = *entry; *entry = new_entry; } // Having inserted the new entry in both the hash table and the Patricia
// tree, we increment the population and return the SpecificPatternHandle
// of the new entry.
phtable->population++; return new_entry; }
// This function removes a pattern from the pat-hash table. The pattern is
// specified by the SpecificPatternHandle that was returned by the insert
// routine. No checks are performed to insure that this is a valid handle.
//
// The removal routine removes the pattern from both the hash table and the
// Patricia tree, and the two removals are almost completely independent
// except for the shared entry structure.
//
void removePatHashTable( PatHashTable *phtable, SpecificPatternHandle sphandle) { unsigned int hash, address, small_address; PHTableGroup *group; PHTableEntry **entry, **branch, **parent, *epoint, *bpoint; char *value; int index, group_size; // The first portion of this routine removess the new pattern from the hash
// table. First, we find the address by hashing the chyme value.
group = phtable->initial_group; hash = sphandle->hash; address = hash >> (31 - phtable->size_exponent); // There are two possible values for the address depending upon whether
// the hash chain pointer is below the extension size. If it is, then the
// larger (by one bit) address is used; otherwise, the smaller address is
// used.
small_address = address >> 1; if ((int)small_address >= phtable->extension_size) { address = small_address; group = group->previous; } // Next we find the group that contains this address.
while ((address & 0x1) == 0 && group->previous != 0) { address >>= 1; group = group->previous; } // Then, we scan through the entry list at the given address for the entry
// that matches the given SpecificPatternHandle.
address >>= 1; entry = &group->entry_list[address]; while (*entry != sphandle) { entry = &(*entry)->next; } // We then remove the entry from the hash chain and decrement the
// population.
*entry = sphandle->next; phtable->population--; // This completes the actual removal of the entry from the hash table, but
// we now have to determine whether to reduce the number of hash chains in
// order to maintain the desired usage ratio. Note that the usage
// histeresis is factored into the calculation.
group_size = 1 << phtable->size_exponent; if (phtable->population + phtable->usage_histeresis < (group_size + phtable->extension_size - 1) * phtable->usage_ratio) { // The number of hash chains needs to be reduced. So, we coalesce two
// hash chains into a single hash chain. The address of the hash chains
// is determined by the extension size. First we decrement the
// extension size and find the group that contains the address of the
// hash chain that is being retained.
phtable->extension_size--; group = phtable->initial_group->previous; address = phtable->extension_size; while ((address & 0x1) == 0 && group->previous != 0) { address >>= 1; group = group->previous; } // Then, we find the end of the entry list at the given address.
address >>= 1; entry = &group->entry_list[address]; while (*entry != 0) { entry = &(*entry)->next; } // We then make the last entry in the hash chain point to the first
// entry in the other hash chain that is being coalesced. We do not
// need to update the group's pointer to the other hash chain, since
// it is now beyond the extension size, and it will thus never be seen.
*entry = phtable->initial_group->entry_list[phtable->extension_size]; // Now, we check to see whether a group has been completely emptied.
// We also check the size exponent, since even if we have just emptied
// the first non-special group, we do not remove it.
if (phtable->extension_size == 0 && phtable->size_exponent > 0) { // The initial group has just been completely emptied, so we set
// the previous group as the new initial group. Update all
// housekeeping information accordingly.
phtable->size_exponent--; phtable->extension_size = group_size >> 1; phtable->initial_group = phtable->initial_group->previous; // We now determine whether we should deallocate a group. Note
// that the allocation histeresis is factored into the calculation.
if (phtable->size_exponent + phtable->allocation_histeresis < phtable->allocation_exponent) { // We should deallocate a group, so we deallocate the top group.
phtable->allocation_exponent--; group = phtable->top_group->previous; GpcFreeMem(phtable->top_group, PatHashTag); phtable->top_group = group; } } } // Now, the hash table removal operation is complete, including the memory
// management functions. Here we begin the removal of the entry from the
// Patricia tree. First, we scan through the tree according to the bits of
// the pattern being removed, until we reach a leaf. We keep track of the
// branch that immediately precedes the leaf, and we also note the parent
// of the pattern, in the latter's capacity as a branch node.
value = sphandle->value; entry = &phtable->root; branch = entry; parent = 0; index = -1; while ((*entry)->pivot_bit > index) { if ((*entry) == sphandle) { parent = entry; } branch = entry; index = (*entry)->pivot_bit; entry = &(*entry)->children[BIT_OF(value, index)]; } // We set the branch that points to the leaf to instead point to the child
// of the leaf that is not selected by the bit of the removed pattern, thus
// removing the branch from the tree.
epoint = *entry; bpoint = *branch; *branch = bpoint->children[1 - BIT_OF(value, index)]; // If the branch that was removed is also the leaf that contains the
// pattern, then the removal from the Patricia tree is complete. Otherwise,
// we replace the leaf that is being removed with the branch that is not
// being removed.
if (epoint != bpoint) { bpoint->pivot_bit = epoint->pivot_bit; bpoint->children[0] = epoint->children[0]; bpoint->children[1] = epoint->children[1]; // In the case of the special node that is not a branch node, we do
// not update its parent to point to the replacing branch, since this
// node has no parent.
if (parent != 0) { *parent = bpoint; } } // The removal from the Patricia tree is now complete. If appropriate, we
// place the removed entry onto the free list. If not, we simply free it.
if (phtable->free_list_size < phtable->max_free_list_size) { sphandle->next = phtable->free_list; phtable->free_list = sphandle; phtable->free_list_size++; } else { GpcFreeMem(sphandle, PatHashTag); } }
// This function searches the database for the specific pattern that matches
// the given key, which is passed as an array of bytes. The client supplies
// a digested form of the pattern as the chyme argument. If a match is found,
// the SpecificPatternHandle of that matching specific pattern is returned.
// If no match is found, then a value of 0 is returned.
//
// This search uses only the hash table; the Patricia tree is not used at all.
//
SpecificPatternHandle searchPatHashTable( PatHashTable *phtable, char *key, unsigned int chyme) { unsigned int hash, address, small_address; PHTableGroup *group; PHTableEntry *entry; char *value; int index; // First, we find the address by hashing the chyme value.
group = phtable->initial_group; hash = MAGIC_NUMBER * chyme; address = hash >> (31 - phtable->size_exponent); // There are two possible values for the address depending upon whether
// the hash chain pointer is below the extension size. If it is, then the
// larger (by one bit) address is used; otherwise, the smaller address is
// used.
small_address = address >> 1; if ((int)small_address >= phtable->extension_size) { address = small_address; group = group->previous; } // Next we find the group that contains this address.
while ((address & 0x1) == 0 && group->previous != 0) { address >>= 1; group = group->previous; } // Then, we scan through the entry list at the given address for the first
// entry whose hash value is equal to or greater than the hash of the search
// key. The entries are stored in sorted order to improve the search speed.
address >>= 1; entry = group->entry_list[address]; while (entry != 0 && entry->hash < hash) { entry = entry->next; } // Now, we check all entries whose hash value matches that of the search
// key.
while (entry != 0 && entry->hash == hash) { // For each value whose hash matches, check the actual value to see
// if it matches the search key.
value = entry->value; for (index = phtable->keybytes-1; index >= 0; index--) { if (value[index] != key[index]) { break; } } if (index < 0) { // A match is found, so we return the SpecificPatternHandle of the
// matching entry to the client.
return entry; } entry = entry->next; } // A match was not found, so we return a null pointer to the client.
return 0; }
// This function searches the database for all specific patterns that match a
// given general pattern. The general pattern is specified by a value and a
// mask. For each specific pattern in the database that matches the supplied
// general pattern, a client-supplied callback function is called with the
// SpecificPatternHandle of the matching specific pattern. This callback
// function is also passed a context (as a void pointer) that is supplied by
// the client in the call to the scan routine.
//
// This scan uses only the Patricia tree; the hash table is not used at all.
//
void scanPatHashTable( PatHashTable *phtable, char *value, char *mask, void *context, ScanCallback func) { // Call the recursive node_scan routine, starting at the root of the
// Patricia tree.
if (phtable->root != 0) { node_scan(phtable, phtable->root, -1, value, mask, context, func); } }
// This function recursively scans the Patricia tree for all specific patterns
// that match a given general pattern.
void node_scan( PatHashTable *phtable, PHTableEntry *node, int prev_bit, char *value, char *mask, void *context, ScanCallback func) { int mask_bit, index; // Partial recursion removal. The while loop takes the place of one of the
// recursive calls to node_scan(). We remain in the while loop while we
// are still examining branch nodes.
while (node->pivot_bit > prev_bit) { // For each branch node, determine which way(s) to branch based upon
// the bit of the general pattern. If the mask bit is a zero, then
// branch both ways, requiring a recursive call. If the mask bit is
// a one, then branch in the direction indicated by the value bit.
mask_bit = BIT_OF(mask, node->pivot_bit); if (mask_bit == 0) { // The general pattern has a wildcard for this node's pivot bit,
// so we must branch both ways. We branch on child one through
// an actual recursive call.
node_scan(phtable, node->children[1], node->pivot_bit, value, mask, context, func); } // We then branch either to the child selected by the value bit (if
// the mask bit is one) or to child zero (if the mask bit is zero).
prev_bit = node->pivot_bit; node = node->children[BIT_OF(value, node->pivot_bit) & mask_bit]; } // We have reached a leaf node. Examine its specific pattern to see if
// it matches the given general pattern. If it doesn't match, then just
// return; otherwise, call the client's callback function.
for (index = phtable->keybytes-1; index >= 0; index--) { if ((mask[index] & value[index]) != (mask[index] & node->value[index])) { return; } } func(context, node); }
// This function forces the pat-hash table to release all of the memory that
// it currently can, by deallocating all unneeded groups and entries.
//
void flushPatHashTable( PatHashTable *phtable) { PHTableGroup *group, *previous; PHTableEntry *entry, *next; // First, free all groups that are allocated but not currently used.
group = phtable->top_group; while (group != phtable->initial_group) { previous = group->previous; GpcFreeMem(group, PatHashTag); group = previous; } phtable->top_group = phtable->initial_group; phtable->allocation_exponent = phtable->size_exponent; // Then, free all of the entries in the free list.
entry = phtable->free_list; while (entry != 0) { next = entry->next; GpcFreeMem(entry, PatHashTag); entry = next; } phtable->free_list = 0; phtable->free_list_size = 0; }
|