#ifndef _DHT_HXX_INCLUDED #define _DHT_HXX_INCLUDED #pragma warning ( disable : 4200 ) // we allow zero sized arrays // asserts // // #define DHTAssert to point to your favorite assert function per #include #ifdef DHTAssert #else // !DHTAssert #define DHTAssert Assert #endif // DHTAssert #include #ifdef DEBUG // turns on unique names for bucket reader/writer locks (adds 60 bytes per BUCKET) #define UNIQUE_BUCKET_NAMES #ifdef UNIQUE_BUCKET_NAMES #include #endif // UNIQUE_BUCKET_NAMES #endif #ifdef DEBUGGER_EXTENSION class CPRINTF; #endif namespace DHT { ///////////////////////////////////////////////////////////////////////////////////////// // CDynamicHashTable // // Implements a dynamically resizable hash table of entries stored using a unique key // // CKey = class representing keys used to identify entries in the hash table // CEntry = class representing entries stored in the hash table // (required copy-constructor) template< class CKey, class CEntry > class CDynamicHashTable { public: // counter type (uses native word size of machine) typedef ULONG_PTR NativeCounter; // class controlling the Key and Entry for each entry in the hash table // // NOTE: All member functions must be defined by the user per instance // of this template. These functions must be defined after the // template definition. Declaring these functions to be inline // will allow full optimization by the compiler! class CKeyEntry { public: // produces the hash value for the specified key. this hash // function should produce numbers as uniformly as possible over // as large a range as possible for good performance static NativeCounter Hash( const CKey& key ); // produces the hash value for this entry's key. this hash // function should produce the same number as the above function // for the same key NativeCounter Hash() const; // returns fTrue if this entry matches the given key. 
this way, // the key doesn't necessarily have to be stored in the hash table // entry // // e.g.: CEntry can be PBF and key can be IFMP/PGNO where the // actual IFMP/PGNO is stored in the BF structure. this would // ruin cache locality, of course, but it would use less memory // // note that the entry could also contain some kind of hash value // for the key allowing some weeding out of entries before jumping // off to the full structure for a full comparison. an example // of this would be the SPAIRs from SORT BOOL FEntryMatchesKey( const CKey& key ) const; // sets the contained entry to the given entry void SetEntry( const CEntry& entry ); // gets the contained entry void GetEntry( CEntry* const pentry ) const; public: CEntry m_entry; ~CKeyEntry(); // not allowed private: CKeyEntry(); // not allowed CKeyEntry *operator =( const CKeyEntry & ); // not allowed }; // API Error Codes enum ERR { errSuccess, // success errOutOfMemory, // not enough memory errInvalidParameter, // bad argument to function errEntryNotFound, // entry was not found errNoCurrentEntry, // currency is invalid errKeyDuplicate, // cannot insert because key already exists }; // API Lock Context class CLock; public: CDynamicHashTable( const NativeCounter rankDHTrwlBucket ); ~CDynamicHashTable(); ERR ErrInit( const double dblLoadFactor, const double dblUniformity, const NativeCounter cBucketMinimum = 0 ); void Term(); void ReadLockKey( const CKey& key, CLock* const plock ); void ReadUnlockKey( CLock* const plock ); void WriteLockKey( const CKey& key, CLock* const plock ); void WriteUnlockKey( CLock* const plock ); ERR ErrRetrieveEntry( CLock* const plock, CEntry* const pentry ); ERR ErrReplaceEntry( CLock* const plock, const CEntry& entry ); ERR ErrInsertEntry( CLock* const plock, const CEntry& entry ); ERR ErrDeleteEntry( CLock* const plock ); void BeginHashScan( CLock* const plock ); void BeginHashScanFromKey( const CKey& key, CLock* const plock ); ERR ErrMoveNext( CLock* const plock, BOOL* const 
pfNewBucket = NULL ); void EndHashScan( CLock* const plock ); #ifdef DEBUGGER_EXTENSION VOID Dump( CPRINTF * pcprintf, const DWORD_PTR dwOffset = 0 ) const; VOID Scan( CPRINTF * pcprintf, VOID * pv ) const; #endif #ifdef DHT_STATS long CBucketOverflow() const { return m_cBucketOverflowInsert; } long CBucketSplit() const { return m_cBucketSplit; } long CBucketMerge() const { return m_cBucketMerge; } long CDirectorySplit() const { return m_cDirSplit; } long CDirectoryMerge() const { return m_cDirMerge; } long CStateTransition() const { return m_cTransition; } long CPolicySelection() const { return m_cSelection; } long CSplitContend() const { return m_cSplitContend; } long CMergeContend() const { return m_cMergeContend; } #else // !DHT_STATS long CBucketOverflow() const { return 0; } long CBucketSplit() const { return 0; } long CBucketMerge() const { return 0; } long CDirectorySplit() const { return 0; } long CDirectoryMerge() const { return 0; } long CStateTransition() const { return 0; } long CPolicySelection() const { return 0; } long CSplitContend() const { return 0; } long CMergeContend() const { return 0; } #endif // DHT_STATS private: // possible states for the hash-table // // DANGER! DANGER! DANGER WILL ROBINSON! // // DO NOT CHANGE THE ENUMATION VALUES! CODE IS DEPENDANT ON THEM BEING AS THEY ARE! // (specifically, I do "stateCur >> 4" to test for 0x10000 so I can see if we are splitting) // // DANGER! DANGER! DANGER WILL ROBINSON! 
enum ENUMSTATE { stateNil = 0, stateShrinkFromGrow = 1, stateShrinkFromGrow2 = 2, stateGrowFromShrink = 3, stateGrowFromShrink2 = 4, stateSplitFromGrow = 5, stateSplitFromGrow2 = 6, stateGrowFromSplit = 7, stateGrowFromSplit2 = 8, stateMergeFromShrink = 9, stateMergeFromShrink2 = 10, stateShrinkFromMerge = 11, stateShrinkFromMerge2 = 12, stateUnused = 13, stateGrow = 14, stateShrink = 15, stateSplit = 16, stateMerge = 17, }; // Constants enum { cbitByte = 8 }; // bits per byte enum { cbitNativeCounter = sizeof( NativeCounter ) * cbitByte }; // bits per NativeCounter // BUCKET // // - this is the individual unit of allocation for each logical bucket // - each BUCKET contains several CKeyEntry objects packed together // - BUCKETs are chained together to make up the entire logical bucket struct BUCKET { public: // read-write-lock/prev-ptr // in the primary BUCKET (allocated as a part of an array), this is the read-write-lock // in secondary BUCKETs, this is the prev-ptr for reverse traversal union { BYTE m_rgbRWL[ sizeof( OSSYNC::CReaderWriterLock ) ]; BUCKET *m_pBucketPrev; }; // next/end pointer // when this points outside of the array of buckets, it points to the next BUCKET // when this points inside of the array of buckets, it points to the first free entry union { BYTE *m_pb; BUCKET *m_pBucketNext; CKeyEntry *m_pEntryLast; }; // array of entries (it will contain 'load-factor' entries) CKeyEntry m_rgEntry[]; public: // return the properly typed CReaderWriterLock OSSYNC::CReaderWriterLock& CRWL() const { return (OSSYNC::CReaderWriterLock &)m_rgbRWL; } }; typedef BUCKET* PBUCKET; // BUCKETPool // // pool of BUCKET structures (reservation system for bucket split/merge) class BUCKETPool { public: PBUCKET m_pReserve; // list of BUCKET structures available for reservation long m_cReserve; // number of BUCKET structures available to be reserved OSSYNC::CSemaphore m_semReserve; // protection for reservation ptrs #ifdef _WIN64 BYTE m_rgbRsvd[ 40 ]; #else // !_WIN64 BYTE 
m_rgbRsvd[ 20 ]; #endif // _WIN64 public: BUCKETPool() : m_semReserve( CSyncBasicInfo( "CDynamicHashTable::BUCKETPool::m_semReserve" ) ) { // initialize vars m_pReserve = NULL; m_cReserve = 0; // prepare the semaphore to have 1 owner m_semReserve.Release(); #ifdef DEBUG memset( m_rgbRsvd, 0, sizeof( m_rgbRsvd ) ); #endif // DEBUG } // terminate ~BUCKETPool() { while ( m_pReserve ) { PBUCKET pBucket; pBucket = m_pReserve; m_pReserve = m_pReserve->m_pBucketNext; MEMFree( pBucket ); } m_cReserve = 0; } // reserve a BUCKET structure // "allocate" a bucket from the list by decrementing the counter of available buckets // if the counter went below zero, we need add a bucket to the list now (or fail) // to make sure we can honor the request later BOOL FPOOLReserve( const NativeCounter cbBucket ) { // reserve a bucket using the counter if ( AtomicDecrement( (long*)&m_cReserve ) >= 0 ) { return fTrue; } // reserve a bucket from the heap else { return FPOOLReserve_( cbBucket ); } } BOOL FPOOLReserve_( const NativeCounter cbBucket ) { // at this point, we need to increment m_cReserve for 1 of 2 reasons: // the allocation will succeed and we will add the new bucket to the list // the allocation will fail and we can't leave without "deallocating" the bucket AtomicIncrement( (long*)&m_cReserve ); // we need to allocate a bucket and add it to the list (to back the reservation we want) const PBUCKET pBucket = PBUCKET( PvMEMAlloc( cbBucket ) ); if ( pBucket ) { // add the bucket to the list m_semReserve.Acquire(); pBucket->m_pBucketNext = m_pReserve; m_pReserve = pBucket; m_semReserve.Release(); // reservation succeeded return fTrue; } // the allocation failed so the reservation cannot succeed return fFalse; } // commit a reservation BUCKET *PbucketPOOLCommit() { PBUCKET pBucketReserve; // assign a bucket to the reservation m_semReserve.Acquire(); pBucketReserve = m_pReserve; DHTAssert( pBucketReserve ); m_pReserve = m_pReserve->m_pBucketNext; m_semReserve.Release(); // return the 
bucket return pBucketReserve; } // release the reservation void POOLUnreserve() { // "deallocate" the bucket that was previously reserved AtomicIncrement( (long*)&m_cReserve ); } }; // HOTSTUFF // // "hot" elements of the hash-table (hashed to array of size 2*cProcessor elems) // // 32 bytes on WIN32 // 64 bytes on WIN64 // struct HOTSTUFF { public: NativeCounter m_cEntry; // counter for entries NativeCounter m_cOp; // counter for inserts/deletes OSSYNC::CMeteredSection m_cms; // metered section for changing states #ifdef _WIN64 BYTE m_rgbRsvd[ 24 ]; // alignment padding #else // !_WIN64 BYTE m_rgbRsvd[ 12 ]; // alignment padding #endif // _WIN64 BUCKETPool m_bucketpool; // pool of BUCKET blobs HOTSTUFF() : m_cms() { m_cEntry = 0; m_cOp = 0; #ifdef DEBUG memset( m_rgbRsvd, 0, sizeof( m_rgbRsvd ) ); #endif // DEBUG } }; // DIRPTRS // // containment for the directory pointers // these pointers control the use of the directory itself (m_rgrgBucket) // // the hash table will always have a minimum of 2 buckets (0 and 1) in the directory // // buckets are stored in dynamically allocated arrays which are pointed to by the directory // each array is 2 times larger than the previous array (exponential growth) // e.g. 
the Nth array (m_rgrgBucket[N]) contains 2^N contiguous buckets // NOTE: the 0th array is special in that it contains an extra element making its total 2 elements // (normally, 2^0 == 1 element; this is done for magical reasons to be explained later) // thus, the total number of entries for a given N is: // N // 1 + SUM 2^i --> 1 + [ 2^(N+1) - 1 ] --> 2^(N+1) // i=0 // // we know the total number of distinct hash values is a power of 2 (it must fit into a NativeCounter) // we can represent this with 2^M where M is the number of bits in a NativeCounter // therefore, assuming the above system of exponential growth, // we know that we can store the total number of hash buckets required at any given time so long as N = M // in other words, N = # of bits in NativeCounter --> sizeof( NativeCounter ) * 8 // // therefore, we can statically allocate the array of bucket arrays // and, we can use LOG2 to compute the bucket address of any given hash value // (exceptions: DIRILog2( 0 ) => 0, 0 and DIRILog2( 1 ) => 0, 1) // // for an explaination of m_cBucketMax and m_cBucket you should read the paper on // Dynamic Hashing by Per Ake Larson // // 160 bytes on WIN32 (5 cache lines) // 320 bytes on WIN64 (10 cache lines) struct DIRPTRS { NativeCounter m_cBucketMax; // half-way to last bucket in split iteration (2^(n-1)) NativeCounter m_cBucket; // destination of next split (0 to 2^(n-1)), must add to m_cBucketMax #ifdef _WIN64 BYTE m_rgbRsvd[ 16 ]; // alignment padding #else // !_WIN64 BYTE m_rgbRsvd[ 8 ]; // alignment padding #endif // _WIN64 }; // CLock // // - lock context for read/write/scan operations on the hash-table // - tracks currency within a bucket // - access is restricted to the dynamic-hash-table public: class CLock { friend class CDynamicHashTable< CKey, CEntry >; public: // possible states for a lock context (class CLock) enum ENUMLOCKSTATE { lsNil = 0, // lock is not used lsRead = 1, // lock is being used to read a particular CKeyEntry object lsWrite = 2, // lock 
is being used to write a particular CKeyEntry object lsScan = 3, // lock is being used to scan the hash-table }; public: CLock() { m_ls = lsNil; m_pBucketHead = NULL; } ~CLock() { DHTAssert( m_pBucketHead == NULL ); } private: // lock state ENUMLOCKSTATE m_ls; // current state of this lock context BOOL m_fInsertOrDelete; // HOTSTUFF pointer HOTSTUFF *m_phs; #ifdef DEBUG // debug-only parameters CKey m_key; // track the key that should be locked #endif // ptr to the first BUCKET BUCKET *m_pBucketHead; // ptr to the current BUCKET BUCKET *m_pBucket; // current BUCKET // ISAM-style cursor on current BUCKET (m_pBucket) CKeyEntry *m_pEntryPrev; // previous entry CKeyEntry *m_pEntry; // current entry CKeyEntry *m_pEntryNext; // next entry // current bucket (used in scan-mode only) NativeCounter m_iBucket; // current bucket }; ///////////////////////////////////////////////////////////////////////////////////////// // // state machine // const int UiSTEnter( HOTSTUFF **pphs ) { // hash to the HOTSTUFF structure *pphs = HOTSTUFFHash(); // enter the metered section return ( *pphs )->m_cms.Enter(); } void STLeave( const int group, HOTSTUFF *phs ) { phs->m_cms.Leave( group ); } const ENUMSTATE EsSTGetState() const { return m_stateCur; } void STTransition( const ENUMSTATE esNew ) { // initiate a transition to the desired state m_stateCur = esNew; m_cCompletions = 0; for ( NativeCounter ihs = 0; ihs < m_chs; ihs++ ) { m_rghs[ ihs ].m_cms.Partition( OSSYNC::CMeteredSection::PFNPARTITIONCOMPLETE( STCompletion_ ), DWORD_PTR( this ) ); } } static void STCompletion_( CDynamicHashTable< CKey, CEntry >* pdht ) { pdht->STCompletion(); } void STCompletion() { // state transition table typedef void (CDynamicHashTable< CKey, CEntry >::*PfnCompletion)(); struct StateTransitionTable { PfnCompletion m_pfnCompletion; ENUMSTATE m_stNext; }; static const StateTransitionTable rgstt[] = { /* stateNil */ { NULL, stateNil, }, /* stateShrinkFromGrow */ { NULL, stateShrinkFromGrow2, }, /* 
stateShrinkFromGrow2 */ { NULL, stateShrink, }, /* stateGrowFromShrink */ { NULL, stateGrowFromShrink2, }, /* stateGrowFromShrink2 */ { NULL, stateGrow, }, /* stateSplitFromGrow */ { NULL, stateSplitFromGrow2, }, /* stateSplitFromGrow2 */ { STCompletionCopyDir, stateSplit, }, /* stateGrowFromSplit */ { NULL, stateGrowFromSplit2, }, /* stateGrowFromSplit2 */ { NULL, stateGrow, }, /* stateMergeFromShrink */ { NULL, stateMergeFromShrink2, }, /* stateMergeFromShrink2 */ { STCompletionCopyDir, stateMerge, }, /* stateShrinkFromMerge */ { NULL, stateShrinkFromMerge2, }, /* stateShrinkFromMerge2 */ { NULL, stateShrink, }, /* stateUnused */ { NULL, stateNil, }, /* stateGrow */ { STCompletionGrowShrink, stateNil, }, /* stateShrink */ { STCompletionGrowShrink, stateNil, }, /* stateSplit */ { STCompletionSplit, stateGrowFromSplit, }, /* stateMerge */ { STCompletionMerge, stateShrinkFromMerge, }, }; // all metered sections have transitioned to the new state if ( NativeCounter( AtomicIncrement( &m_cCompletions ) ) >= m_chs ) { STATStateTransition(); // save the current state as it may change as a side-effect of // calling the completion function const ENUMSTATE esCurrent = EsSTGetState(); // if there is a completion function for this state then call it if ( rgstt[ esCurrent ].m_pfnCompletion ) { (this->*rgstt[ esCurrent ].m_pfnCompletion)(); } // if there is a next state then immediately begin the transition to that state if ( rgstt[ esCurrent ].m_stNext ) { STTransition( rgstt[ esCurrent ].m_stNext ); } } } void STCompletionCopyDir() { // backup the bucket ptrs for use during the split/merge process memcpy( &m_dirptrs[ 1 ], &m_dirptrs[ 0 ], sizeof( DIRPTRS ) ); } void STCompletionGrowShrink() { // enable the selection of a new maintenance policy m_semPolicy.Release(); } void STCompletionSplit() { // split the directory DIRISplit(); } void STCompletionMerge() { // merge the directory DIRIMerge(); } 
///////////////////////////////////////////////////////////////////////////////////////// // // directory // // initialize the directory, possible allocating some buckets ERR ErrDIRInit( const NativeCounter cLoadFactor, const NativeCounter cbucketMin ) { ERR err; NativeCounter iExponent; NativeCounter iRemainder; // check params if ( cLoadFactor < 1 ) { return errInvalidParameter; } // setup the main paramters m_cLoadFactor = cLoadFactor; // calculate the bucket size, accounting for: // // - bucket header // - enough room for twice the load factor to eliminate overflow // buckets with uniform hashing // - room for an additional entry to give us some flexibility in // our actual load factor to reduce maintenance overhead // - cache line alignment of the bucket m_cbBucket = sizeof( BUCKET ) + ( cLoadFactor * 2 + 1 ) * sizeof( CKeyEntry ); m_cbBucket = ( ( m_cbBucket + cbCacheLine - 1 ) / cbCacheLine ) * cbCacheLine; // calculate the number of entries we can fit into a single bucket // NOTE: this may be larger than intended because we rounded the bucket size up the nearest cache-line m_centryBucket = ( m_cbBucket - sizeof( BUCKET ) ) / sizeof( CKeyEntry ); // calculate the minimum number of buckets using the following lower-bounds: // cbucketMin (user parameter) // # of processors (make sure we have atleast 1 bucket/proc as an attempt to minimize contention) // 2 (hash table assumes atleast 2 buckets) m_cbucketMin = max( cbucketMin, NativeCounter( OSSYNC::OSSyncGetProcessorCountMax() ) ); m_cbucketMin = max( m_cbucketMin, 2 ); // align the minimum number of buckets to the next highest power of 2 (unless it's already a power of 2) DIRILog2( m_cbucketMin, &iExponent, &iRemainder ); if ( iRemainder ) { if ( ++iExponent >= cbitNativeCounter ) { return errInvalidParameter; // could not round up without overflowing } } m_cbucketMin = 1 << iExponent; // setup the directory pointers m_dirptrs[ 0 ].m_cBucketMax = m_cbucketMin / 2; m_dirptrs[ 0 ].m_cBucket = m_cbucketMin / 2; 
// SPECIAL CASE: allocate 2 entries for the first bucket array // (we always do this because we always have atleast 2 buckets) err = ErrDIRInitBucketArray( 2, 0, &m_rgrgBucket[ 0 ] ); if ( errSuccess != err ) { return err; } // allocate memory for all other initial bucket arrays for ( iExponent = 1; ( NativeCounter( 1 ) << iExponent ) < m_cbucketMin; iExponent++ ) { err = ErrDIRInitBucketArray( 1 << iExponent, 1 << iExponent, &m_rgrgBucket[ iExponent ] ); if ( errSuccess != err ) { return err; } } // clear the second set of directory ptrs memset( &m_dirptrs[ 1 ], 0, sizeof( DIRPTRS ) ); return errSuccess; } // cleanup all memory by destructing it then freeing it void DIRTerm() { NativeCounter iExponent; // SPECIAL CASE: term the first bucket array (contains 2 entries) // (we will always do this because the hash-table will always contain atleast 2 entries) if ( m_rgrgBucket[ 0 ] ) { DIRTermBucketArray( m_rgrgBucket[ 0 ], 2 ); m_rgrgBucket[ 0 ] = NULL; } // term all other bucket arrays for ( iExponent = 1; iExponent < cbitNativeCounter; iExponent++ ) { if ( m_rgrgBucket[ iExponent ] ) { DIRTermBucketArray( m_rgrgBucket[ iExponent ], 1 << iExponent ); m_rgrgBucket[ iExponent ] = NULL; } } // reset both copies of the directory pointers memset( m_dirptrs, 0, sizeof( m_dirptrs ) ); } // lock a key for read operations void DIRReadLockKey( const ENUMSTATE esCurrent, const CKey &key, CLock * const plock ) const { NativeCounter iHash; NativeCounter iBucket; NativeCounter cBucketBefore; NativeCounter cBucketAfter; NativeCounter cBucketMax; // verify the lock DHTAssert( FBKTRead( plock ) ); DHTAssert( plock->m_pBucketHead == NULL ); #ifdef DEBUG // remember the key we are locking plock->m_key = key; #endif // hash to the bucket we want (this may require a retry in grow/shrink mode) iHash = CKeyEntry::Hash( key ); plock->m_pBucketHead = PbucketDIRIHash( esCurrent, iHash, &iBucket, &cBucketBefore ); // acquire the lock as a reader plock->m_pBucketHead->CRWL().EnterAsReader(); // 
the entry may have moved as the result of a bucket split/merge cBucketAfter = NcDIRIGetBucket( esCurrent ); cBucketMax = NcDIRIGetBucketMax( esCurrent ); if ( cBucketBefore != cBucketAfter && ( cBucketBefore <= iBucket && iBucket < cBucketAfter || cBucketMax + cBucketAfter <= iBucket && iBucket < cBucketMax + cBucketBefore ) ) { // unlock the old bucket plock->m_pBucketHead->CRWL().LeaveAsReader(); // hash to the bucket we want (this cannot fail more than once) plock->m_pBucketHead = PbucketDIRIHash( esCurrent, iHash ); // lock the new bucket plock->m_pBucketHead->CRWL().EnterAsReader(); } // we should now have the correct bucket locked DHTAssert( plock->m_pBucketHead == PbucketDIRIHash( esCurrent, iHash ) ); } // unlock the current read-locked key void DIRReadUnlockKey( CLock * const plock ) const { // verify the lock DHTAssert( FBKTRead( plock ) ); DHTAssert( plock->m_pBucketHead != NULL ); // release the lock plock->m_pBucketHead->CRWL().LeaveAsReader(); plock->m_pBucketHead = NULL; } // lock a key for read/write operations void DIRWriteLockKey( const ENUMSTATE esCurrent, const CKey &key, CLock * const plock ) const { NativeCounter iHash; NativeCounter iBucket; NativeCounter cBucketBefore; NativeCounter cBucketAfter; NativeCounter cBucketMax; // verify the lock DHTAssert( FBKTWrite( plock ) || FBKTScan( plock ) ); DHTAssert( plock->m_pBucketHead == NULL ); #ifdef DEBUG // remember the key we are locking plock->m_key = key; #endif // hash to the bucket we want (this may require a retry in grow/shrink mode) iHash = CKeyEntry::Hash( key ); plock->m_pBucketHead = PbucketDIRIHash( esCurrent, iHash, &iBucket, &cBucketBefore ); // acquire the lock as a writer plock->m_pBucketHead->CRWL().EnterAsWriter(); // the entry may have moved as the result of a bucket split/merge cBucketAfter = NcDIRIGetBucket( esCurrent ); cBucketMax = NcDIRIGetBucketMax( esCurrent ); if ( cBucketBefore != cBucketAfter && ( cBucketBefore <= iBucket && iBucket < cBucketAfter || cBucketMax + 
cBucketAfter <= iBucket && iBucket < cBucketMax + cBucketBefore ) ) { // unlock the old bucket plock->m_pBucketHead->CRWL().LeaveAsWriter(); // hash to the bucket we want (this cannot fail more than once) plock->m_pBucketHead = PbucketDIRIHash( esCurrent, iHash ); // lock the new bucket plock->m_pBucketHead->CRWL().EnterAsWriter(); } // we should now have the correct bucket locked DHTAssert( plock->m_pBucketHead == PbucketDIRIHash( esCurrent, iHash ) ); } // unlock the current write-locked key void DIRWriteUnlockKey( CLock * const plock ) const { // verify the lock DHTAssert( FBKTWrite( plock ) ); DHTAssert( plock->m_pBucketHead != NULL ); // release the lock plock->m_pBucketHead->CRWL().LeaveAsWriter(); plock->m_pBucketHead = NULL; } // initalize an array of buckets ERR ErrDIRInitBucketArray( const NativeCounter cbucketAlloc, const NativeCounter ibucketFirst, BYTE** const prgbBucket ) { #ifdef UNIQUE_BUCKET_NAMES char *psz; #endif // UNIQUE_BUCKET_NAMES NativeCounter cb; BYTE *rgb; NativeCounter ibucket; DHTAssert( cbucketAlloc > 0 ); DHTAssert( prgbBucket ); // calculate the size (in bytes) of the new bucket array #ifdef UNIQUE_BUCKET_NAMES cb = cbucketAlloc * ( m_cbBucket + 60 ); // add 60 extra bytes per bucket for a unique name (for the bucket's r/w-lock) #else cb = cbucketAlloc * m_cbBucket; #endif // allocate the new bucket array rgb = (BYTE*)PvMEMAlloc( cb ); if ( !rgb ) { *prgbBucket = NULL; return errOutOfMemory; } // initialize each bucket within the new array for ( ibucket = 0; ibucket < cbucketAlloc; ibucket++ ) { // efficiency variables PBUCKET const pbucket = PBUCKET( rgb + ( ibucket * m_cbBucket ) ); // construct the r/w-lock #ifdef UNIQUE_BUCKET_NAMES psz = (char*)( rgb + ( cbucketAlloc * m_cbBucket ) + ( ibucket * 60 ) ); sprintf( psz, "CDynamicHashTable::BUCKET[0x%016I64X]::m_rwlBucket", QWORD( ibucketFirst + ibucket ) ); DHTAssert( strlen( psz ) < 60 ); new( &pbucket->CRWL() ) OSSYNC::CReaderWriterLock( CLockBasicInfo( CSyncBasicInfo( psz ), 
int( m_rankDHTrwlBucket ), 0 ) ); #else // !UNIQUE_BUCKET_NAMES new( &pbucket->CRWL() ) OSSYNC::CReaderWriterLock( CLockBasicInfo( CSyncBasicInfo( "CDynamicHashTable::BUCKET::m_rwlBucket" ), int( m_rankDHTrwlBucket ), 0 ) ); #endif // UNIQUE_BUCKET_NAMES // make the bucket empty pbucket->m_pb = NULL; } *prgbBucket = rgb; return errSuccess; } // uninitialize an array of buckets void DIRTermBucketArray( BYTE* const rgbBucket, const NativeCounter cbucketTerm ) { NativeCounter ibucket; PBUCKET pbucketNext; // destroy each bucket in the array DHTAssert( rgbBucket ); for ( ibucket = 0; ibucket < cbucketTerm; ibucket++ ) { // efficiency variables PBUCKET pbucket = PBUCKET( rgbBucket + ( ibucket * m_cbBucket ) ); // destruct the r/w-lock in place without freeing memory pbucket->CRWL().CReaderWriterLock::~CReaderWriterLock(); // free all chained buckets (don't touch the first one because its part of rgbucket[]) pbucket = PbucketBKTNext( pbucket ); while ( pbucket ) { pbucketNext = PbucketBKTNext( pbucket ); MEMFree( pbucket ); pbucket = pbucketNext; } } MEMFree( rgbBucket ); } // split the directory void DIRISplit() { // we are executing the current policy (which is to split) and should be in this known state DHTAssert( m_dirptrs[ 0 ].m_cBucketMax > 0 ); DHTAssert( m_dirptrs[ 0 ].m_cBucket == m_dirptrs[ 0 ].m_cBucketMax ); // update the directory // NOTE: we do NOT allocate space here; this is deferred until BKTISplit() when we're sure we need it m_dirptrs[ 0 ].m_cBucketMax = m_dirptrs[ 0 ].m_cBucketMax * 2; m_dirptrs[ 0 ].m_cBucket = 0; STATSplitDirectory(); } // merge the directory void DIRIMerge() { // we are executing the current policy (which is to split) and should be in this known state DHTAssert( m_dirptrs[ 0 ].m_cBucketMax > 1 ); // we should not be at the last split-level ( == 1 ) DHTAssert( m_dirptrs[ 0 ].m_cBucket == 0 ); // free the bucket array that is no longer being used (the last one in the directory) // NOTE: we can guarantee that it isn't in use because 
m_cBucket == 0 AND we can't grow (we're in stateMerge) // that means that everyone trying to hash to this bucket will be re-routed to the low-order bucket instead NativeCounter iExponent; NativeCounter iRemainder; DIRILog2( m_dirptrs[ 0 ].m_cBucketMax, &iExponent, &iRemainder ); DHTAssert( NativeCounter( 1 ) << iExponent == m_dirptrs[ 0 ].m_cBucketMax ); DHTAssert( 0 == iRemainder ); // NOTE: the bucket array may not have been allocated because we defer its allocation until BKTISplit if ( m_rgrgBucket[ iExponent ] ) { DIRTermBucketArray( m_rgrgBucket[ iExponent ], m_dirptrs[ 0 ].m_cBucketMax ); m_rgrgBucket[ iExponent ] = NULL; } #ifdef DEBUG // verify that no higher-order bucket arrays exist while ( ++iExponent < cbitNativeCounter ) { DHTAssert( !m_rgrgBucket[ iExponent ] ); } #endif // DEBUG // update the directory m_dirptrs[ 0 ].m_cBucketMax = m_dirptrs[ 0 ].m_cBucketMax / 2; m_dirptrs[ 0 ].m_cBucket = m_dirptrs[ 0 ].m_cBucketMax; STATMergeDirectory(); } // computer the log2 of the given value in terms of an exponent and an integer remainder void DIRILog2( const NativeCounter iValue, NativeCounter* const piExponent, NativeCounter* const piRemainder ) const { NativeCounter iExponent; NativeCounter iMask; NativeCounter iMaskLast; iExponent = 0; iMaskLast = 1; iMask = 1; while ( iMask < iValue ) { iExponent++; iMaskLast = iMask; iMask = ( iMask << 1 ) + 1; } DHTAssert( iExponent < cbitNativeCounter ); *piExponent = iExponent; *piRemainder = iMaskLast & iValue; } // get the correct copy of cBucketMax const NativeCounter NcDIRIGetBucketMax( const ENUMSTATE esCurrent ) const { return m_dirptrs[ esCurrent >> 4 ].m_cBucketMax; } // get the correct copy of cBucket const NativeCounter NcDIRIGetBucket( const ENUMSTATE esCurrent ) const { return m_dirptrs[ esCurrent >> 4 ].m_cBucket; } // resolve a bucket address to a bucket pointer PBUCKET const PbucketDIRIResolve( const NativeCounter ibucketIndex, const NativeCounter ibucketOffset ) const { BYTE* const pb = m_rgrgBucket[ 
ibucketIndex ];     // get ptr to one of the bucket arrays
    const NativeCounter ibOffset = ibucketOffset * m_cbBucket;  // get byte offset within bucket array

    DHTAssert( NULL != pb );

    return PBUCKET( pb + ibOffset );    // return a typed ptr to the individual bucket within array
}

// hash to a bucket
//
// Maps iHash to a bucket under the directory state esCurrent.
//
//   piBucket   [out] receives the normalized bucket index
//   pcBucket   [out] receives NcDIRIGetBucket( esCurrent )
//
// The hash is masked with ( cBucketMax - 1 ) + cBucketMax; an index at or beyond
// cBucketMax + cBucket is folded back by subtracting cBucketMax.  The resulting index is
// decomposed by DIRILog2 and resolved to a bucket by PbucketDIRIResolve.

const PBUCKET PbucketDIRIHash(  const ENUMSTATE     esCurrent,
                                const NativeCounter iHash,
                                NativeCounter* const    piBucket,
                                NativeCounter* const    pcBucket ) const
{
    NativeCounter&  iBucket = *piBucket;
    NativeCounter&  cBucket = *pcBucket;
    NativeCounter   cBucketMax;
    NativeCounter   iExponent;
    NativeCounter   iRemainder;

    // load some of the directory pointers

    cBucket     = NcDIRIGetBucket( esCurrent );
    cBucketMax  = NcDIRIGetBucketMax( esCurrent );

    // normalize the given hash value to the range of active buckets

    iBucket = iHash & ( ( cBucketMax - 1 ) + cBucketMax );
    if ( iBucket >= cBucketMax + cBucket )
    {
        iBucket -= cBucketMax;
    }

    // convert the normalized hash value to a bucket address

    DIRILog2( iBucket, &iExponent, &iRemainder );

    // return the bucket

    return PbucketDIRIResolve( iExponent, iRemainder );
}

// convenience overload: same lookup as above, discarding the bucket index and
// bucket count outputs

const PBUCKET PbucketDIRIHash( const ENUMSTATE esCurrent, const NativeCounter iHash ) const
{
    NativeCounter iBucket;
    NativeCounter cBucket;

    return PbucketDIRIHash( esCurrent, iHash, &iBucket, &cBucket );
}


/////////////////////////////////////////////////////////////////////////////////////////
//
//  scan operations
//

// move from the current hash-bucket to the next hash-bucket that contains
// at least 1 entry; position currency on that entry
//
// On entry the entry currency (m_pEntryPrev / m_pEntry / m_pEntryNext) must be NULL.
// If a bucket is still held (m_pBucketHead != NULL), its writer lock is released first
// and, when m_fInsertOrDelete is set, MaintainTable is called.
//
// Returns errSuccess with m_pBucketHead still write-locked and m_pEntry on the first
// entry of that bucket, or errNoCurrentEntry (m_pBucketHead == NULL, no bucket lock
// held) when no further bucket holds an entry.

ERR ErrSCANMoveNext( CLock *const plock )
{
    DHTAssert( plock->m_pEntryPrev == NULL );
    DHTAssert( plock->m_pEntry == NULL );
    DHTAssert( plock->m_pEntryNext == NULL );

    // unlock the current bucket

    if ( plock->m_pBucketHead )
    {
        plock->m_pBucketHead->CRWL().LeaveAsWriter();
        plock->m_pBucketHead = NULL;

        // we performed an insert or delete while holding the write lock

        if ( plock->m_fInsertOrDelete )
        {
            // perform amortized maintenance on the table

            MaintainTable( plock->m_phs );
        }
    }

    // enter the state machine

    const int       iGroup      = UiSTEnter( &plock->m_phs );
    const ENUMSTATE esCurrent   = EsSTGetState();

    while ( plock->m_iBucket + 1 < NcDIRIGetBucketMax( esCurrent ) + NcDIRIGetBucket( esCurrent ) )
    {
        // we have not scanned the last bucket yet

        // advance the bucket index

        plock->m_iBucket++;

        // hash to the bucket and lock it
        // (the bucket index itself is passed as the hash value)

        plock->m_pBucketHead = PbucketDIRIHash( esCurrent, plock->m_iBucket );
        plock->m_pBucketHead->CRWL().EnterAsWriter();

        // re-evaluate the range check now that the bucket is locked

        if ( plock->m_iBucket < NcDIRIGetBucketMax( esCurrent ) + NcDIRIGetBucket( esCurrent ) )
        {
            // bucket address is OK (did not move)

            if ( plock->m_pBucketHead->m_pb != NULL )
            {
                // current bucket contains at least 1 entry

                // setup the currency on the first entry

                plock->m_pBucket = plock->m_pBucketHead;
                plock->m_pEntry = &plock->m_pBucketHead->m_rgEntry[0];

                // stop the loop (the bucket's writer lock stays held)

                break;
            }

            // current bucket is empty
        }
        else
        {
            DHTAssert( stateShrink == esCurrent );

            // the current bucket disappeared because it was merged into a lower bucket

            DHTAssert( plock->m_iBucket >= NcDIRIGetBucketMax( esCurrent ) );
            DHTAssert(  PbucketDIRIHash( esCurrent, plock->m_iBucket ) ==
                        PbucketDIRIHash( esCurrent, plock->m_iBucket - NcDIRIGetBucketMax( esCurrent ) ) );

            // make sure the current entry ptr is reset

            DHTAssert( !plock->m_pEntry );
        }

        // release the bucket lock (the bucket is empty, either because it never held
        // an entry or because it was merged away)

        DHTAssert( !plock->m_pBucketHead->m_pb );

        plock->m_pBucketHead->CRWL().LeaveAsWriter();
        plock->m_pBucketHead = NULL;
    }

    // leave the state machine

    STLeave( iGroup, plock->m_phs );

    // return the result

    DHTAssert( !plock->m_pEntry || plock->m_pBucketHead );
    return plock->m_pEntry ? errSuccess : errNoCurrentEntry;
}


/////////////////////////////////////////////////////////////////////////////////////////
//
//  bucket operations
//

// returns fTrue if the lock context is in read mode

const BOOL FBKTRead( CLock *const plock ) const
{
    return plock->m_ls == CLock::lsRead;
}

// returns fTrue if the lock context is in write mode

const BOOL FBKTWrite( CLock *const plock ) const
{
    return plock->m_ls == CLock::lsWrite;
}

// returns fTrue if the lock context is in scan-forward mode

const BOOL FBKTScan( CLock *const plock ) const
{
    return plock->m_ls == CLock::lsScan;
}

// returns the entry after last entry in the BUCKET or entry 0 if no entries exist
//
// The decision is made from pBucket->m_pb alone:
//   - m_pb lies inside this BUCKET's own m_rgEntry array: it is treated as the pointer
//     to the last used entry, and the entry following it is returned
//   - m_pb is NULL: the bucket is empty, so entry 0 is returned
//   - anything else: the bucket is full, so the one-past-the-end entry is returned
//
// NOTE: the range test uses bitwise '&', so both comparisons are always evaluated

CKeyEntry *PentryBKTNextMost( const PBUCKET pBucket ) const
{
    const BYTE *pb = pBucket->m_pb;

    if ( BOOL( ( pb >= (BYTE*)&pBucket->m_rgEntry[ 0 ] ) &
               ( pb < (BYTE*)&pBucket->m_rgEntry[ m_centryBucket ] ) ) )
    {
        // we are in the last bucket

        return (CKeyEntry*)pb + 1;
    }
    else if ( NULL == pb )
    {
        // the bucket is empty

        return &pBucket->m_rgEntry[ 0 ];
    }

    // the bucket is full

    return &pBucket->m_rgEntry[ m_centryBucket ];
}

// returns the next BUCKET or NULL if no other BUCKETs exist
//
// pBucket->m_pb is treated as a next-BUCKET pointer only when it lies at least
// m_cbBucket bytes below pBucket or at least m_cbBucket bytes above it; otherwise
// it points into this BUCKET and NULL is returned
//
// NOTE: the range test uses bitwise '|', so both comparisons are always evaluated

PBUCKET PbucketBKTNext( const PBUCKET pBucket ) const
{
    const BYTE *pb = pBucket->m_pb;

    if ( BOOL( ( pb <= (BYTE*)pBucket - m_cbBucket ) |
               ( pb >= (BYTE*)pBucket + m_cbBucket ) ) )
    {
        // m_pBucketNext is either the next BUCKET or NULL

        DHTAssert( !pb || PBUCKET( pb )->m_pBucketPrev == pBucket );
        return PBUCKET( pb );
    }

    // m_pBucketNext is invalid (m_pEntryLast is valid instead)

    return NULL;
}

// try to seek to the entry corresponding to the given key
// if found, the currency (plock->m_pEntry) will be set to the entry
// if not, plock->m_pEntry will be set to NULL
//
// NOTE: this function returns void; the outcome is reported only through the
//       currency stored in the lock context

void BKTSeek( CLock *const plock, const CKey &key ) const
{
    // pre-init our currency assuming we will hit a hot path

    plock->m_pBucket = plock->m_pBucketHead;
    plock->m_pEntryPrev = NULL;
    plock->m_pEntryNext =
NULL;   // (completes the plock->m_pEntryNext assignment started on the previous line)

    // HOT PATH:
    //
    // if the next/end pointer is within the head bucket then we know
    // that all entries are in the head bucket.  if we find the entry
    // for this key then set our currency to point to it otherwise set
    // our currency to no current entry

    CKeyEntry* const pEntryLast = plock->m_pBucketHead->m_pEntryLast;

    // the subtraction is done on DWORD_PTR values, so a pointer below the head bucket
    // (including NULL) wraps to a large value and fails the "< m_cbBucket" test

    if ( DWORD_PTR( pEntryLast ) - DWORD_PTR( plock->m_pBucketHead ) < m_cbBucket )
    {
        CKeyEntry* pEntry = plock->m_pBucketHead->m_rgEntry;
        do
        {
            if ( pEntry->FEntryMatchesKey( key ) )
            {
                plock->m_pEntry = pEntry;
                return;
            }
        }
        while ( ++pEntry <= pEntryLast );

        plock->m_pEntry = NULL;
    }

    // HOT PATH:
    //
    // if the next/end pointer is NULL then we know that we will not
    // find the key.  set our currency to no current entry

    else if ( !pEntryLast )
    {
        plock->m_pEntry = NULL;
    }

    // if the next/end pointer points outside of the head bucket then
    // perform a full chain search

    else
    {
        BKTISeek( plock, key );
    }
}

// full chain search for the given key
//
// Walks every BUCKET of the chain starting at plock->m_pBucketHead and tests each
// entry with FEntryMatchesKey.
//   - on a match, the currency is set to the matching BUCKET / entry
//   - otherwise, m_pBucket is set to the last BUCKET of the chain and m_pEntry to NULL
// In both cases m_pEntryPrev and m_pEntryNext are set to NULL.

void BKTISeek( CLock *const plock, const CKey &key ) const
{
    PBUCKET     pBucket;
    PBUCKET     pBucketPrev;

    CKeyEntry   *pEntryThis;
    CKeyEntry   *pEntryMost;

    DHTAssert( FBKTRead( plock ) || FBKTWrite( plock ) );
    DHTAssert( plock->m_pBucketHead != NULL );

    // start the scan on the first bucket

    pBucket = plock->m_pBucketHead;
    do
    {
        // scan the current BUCKET

        pEntryThis = &pBucket->m_rgEntry[ 0 ];
        pEntryMost = PentryBKTNextMost( pBucket );
        while ( pEntryThis < pEntryMost )
        {
            // query the entry against the given key for a match
            // (assume we will be more likely to not find it)

            if ( !pEntryThis->FEntryMatchesKey( key ) )
            {
                // nop
            }
            else
            {
                // the key exists; setup our currency around it

                goto SetupCurrency;
            }

            // move to the next entry

            pEntryThis++;
        }

        // move to the next BUCKET

        pBucketPrev = pBucket;
        pBucket = PbucketBKTNext( pBucket );
    }
    while ( pBucket );

    // move back to the last BUCKET and reset the entry ptr

    pBucket = pBucketPrev;
    pEntryThis = NULL;

SetupCurrency:

    // setup the currency in the lock context
    // we will not allow moving next/prev, so we setup the next/prev ptrs accordingly

    plock->m_pBucket = pBucket;
    plock->m_pEntryPrev = NULL;
    plock->m_pEntry = pEntryThis;
    plock->m_pEntryNext = NULL;
}

#ifdef DEBUG
// get a pointer to the current entry
// if currency is before-first or after-last, then NULL is returned
// (DEBUG builds only)

void BKTGetEntry( CLock *const plock, CKeyEntry **ppKeyEntry ) const
{
    DHTAssert( FBKTRead( plock ) || FBKTWrite( plock ) );
    DHTAssert( plock->m_pBucketHead != NULL );
    DHTAssert( plock->m_pBucket != NULL );

    *ppKeyEntry = plock->m_pEntry;
    return;
}
#endif

// get the current entry
// if currency is before-first or after-last, errEntryNotFound is returned
// otherwise the entry is copied out through pentry and errSuccess is returned

const ERR ErrBKTGetEntry( CLock *const plock, CEntry *pentry ) const
{
    DHTAssert( FBKTRead( plock ) || FBKTWrite( plock ) || FBKTScan( plock ) );
    DHTAssert( plock->m_pBucketHead != NULL );
    DHTAssert( plock->m_pBucket != NULL );

    if ( plock->m_pEntry )
    {
        // we are on an entry

        plock->m_pEntry->GetEntry( pentry );
        return errSuccess;
    }

    // we are not on an entry

    return errEntryNotFound;
}

// replace the current entry (destruct old entry, construct new entry)
// if currency is before-first or after-last, then errNoCurrentEntry is returned

const ERR ErrBKTReplaceEntry( CLock *const plock, const CEntry &entry ) const
{
    DHTAssert( FBKTWrite( plock ) || FBKTScan( plock ) );
    DHTAssert( plock->m_pBucketHead != NULL );
    DHTAssert( plock->m_pBucket != NULL );

    if ( plock->m_pEntry )
    {
        // we are on an entry

        // copy the new entry over it

        plock->m_pEntry->SetEntry( entry );
        return errSuccess;
    }

    // we are not on an entry

    return errNoCurrentEntry;
}

// insert an entry at the end of the logical bucket
// if the lock context is already positioned on an entry, errKeyDuplicate is returned
// if memory is short, errOutOfMemory is returned
// otherwise, errSuccess is returned

const ERR ErrBKTInsertEntry( CLock *const plock, const CEntry &entry )
{
    DHTAssert( FBKTWrite( plock ) );
    DHTAssert( plock->m_pBucketHead != NULL );
    DHTAssert( plock->m_pBucket != NULL );

    if ( plock->m_pEntry )
    {
        // we are pointing to the key we locked, so it must already exist

        return errKeyDuplicate;
    }

#ifdef DEBUG
PBUCKET *rgBucketCheck = NULL, pbucketTX;   // DEBUG-only catalogue of every BUCKET in the chain
    size_t cBucketCheck = 0, iT;

    // count the BUCKETs currently in the chain

    pbucketTX = plock->m_pBucketHead;
    while ( pbucketTX )
    {
        cBucketCheck++;
        pbucketTX = PbucketBKTNext( pbucketTX );
    }
    cBucketCheck++; // account for newly allocated bucket

    // the catalogue is best-effort: if this allocation fails, the bucket checks are skipped

    rgBucketCheck = (PBUCKET *)PvMEMAlloc( cBucketCheck * sizeof( PBUCKET ) );
    if ( NULL != rgBucketCheck )
    {
        iT = 0;
        pbucketTX = plock->m_pBucketHead;
        while ( pbucketTX )
        {
            rgBucketCheck[ iT++ ] = pbucketTX;
            pbucketTX = PbucketBKTNext( pbucketTX );
        }
        rgBucketCheck[ iT++ ] = NULL; // new bucket
    }

    // count the number of entries we will be handling

    size_t cEntriesTotal = 0;
    PBUCKET pbktT, pbktNextT;

    pbktT = plock->m_pBucketHead;
    if ( pbktT->m_pb != NULL )
    {
        while ( pbktT )
        {
            pbktNextT = PbucketBKTNext( pbktT );
            if ( pbktNextT )
            {
                // full bucket

                cEntriesTotal += size_t( m_centryBucket );
            }
            else
            {
                // partial bucket (not empty)

                cEntriesTotal += 1 + ( pbktT->m_pEntryLast - &pbktT->m_rgEntry[0] );
            }
            pbktT = pbktNextT;
        }
    }
#endif

    // cursor for insert

    PBUCKET     pBucketThis = plock->m_pBucket;
    CKeyEntry   *pEntryThis;

    // efficiency variable

    PBUCKET     pBucketT;

    // move to the last entry in the last bucket

    pBucketT = PbucketBKTNext( pBucketThis );
    while ( pBucketT )
    {
        pBucketThis = pBucketT;
        pBucketT = PbucketBKTNext( pBucketT );
    }
    pEntryThis = PentryBKTNextMost( pBucketThis );

    if ( pEntryThis != &pBucketThis->m_rgEntry[ m_centryBucket ] )
    {
        // there are available entries left in the last bucket

        // nop
    }
    else
    {
        // there are no entries left in the last bucket

        // allocate a new bucket

        pBucketT = (BUCKET *)PvMEMAlloc( m_cbBucket );
        if ( !pBucketT )
        {
            // we ran out of memory when allocating the new BUCKET

#ifdef DEBUG
            // free memory from the start of this function

            if ( NULL != rgBucketCheck )
            {
                MEMFree( rgBucketCheck );
            }
#endif

            return errOutOfMemory;
        }

        STATInsertOverflowBucket();

#ifdef DEBUG
        // put the new bucket in our list (the slot reserved for it is the last one)

        if ( NULL != rgBucketCheck )
        {
            DHTAssert( rgBucketCheck[cBucketCheck-1] == NULL );
            rgBucketCheck[cBucketCheck-1] = pBucketT;
        }
#endif

        // chain the new BUCKET

        pBucketThis->m_pBucketNext = pBucketT;
        pBucketT->m_pBucketPrev = pBucketThis;

        // use the first entry of the new BUCKET

        pBucketThis = pBucketT;
        pEntryThis = &pBucketT->m_rgEntry[0];
    }

    // copy the entry

    pEntryThis->SetEntry( entry );

    // update the last entry pointer

    pBucketThis->m_pEntryLast = pEntryThis;

    // move the currency to the new entry

    plock->m_pBucket = pBucketThis;
    plock->m_pEntry = pEntryThis;

#ifdef DEBUG
    if ( NULL != rgBucketCheck )
    {
        // check each catalogued bucket to see if it is still there

        pbucketTX = plock->m_pBucketHead;
        DHTAssert( pbucketTX );

        // find and remove all buckets found in the destination bucket from our list

        while ( pbucketTX )
        {
            for ( iT = 0; iT < cBucketCheck; iT++ )
            {
                if ( rgBucketCheck[iT] == pbucketTX )
                {
                    rgBucketCheck[iT] = NULL;
                    break;
                }
            }
            DHTAssert( iT < cBucketCheck );     // if this goes off, we somehow got a bucket
                                                // into the chain that shouldn't be there
                                                // (it is a bucket we never catalogued!)
            pbucketTX = PbucketBKTNext( pbucketTX );
        }

        // the list should now be empty -- verify this

        for ( iT = 0; iT < cBucketCheck; iT++ )
        {
            // if this goes off, rgBucketCheck[iT] contains a bucket that was abandoned without
            // being freed!
DHTAssert( rgBucketCheck[iT] == NULL );
        }

        // free the list

        MEMFree( rgBucketCheck );
    }

    // make sure the number of entries has not changed since we started

    size_t cEntriesAfterwards = 0;

    pbktT = plock->m_pBucketHead;
    if ( pbktT->m_pb != NULL )
    {
        while ( pbktT )
        {
            pbktNextT = PbucketBKTNext( pbktT );
            if ( pbktNextT )
            {
                // full bucket

                cEntriesAfterwards += size_t( m_centryBucket );
            }
            else
            {
                // partial bucket (not empty)

                cEntriesAfterwards += 1 + ( pbktT->m_pEntryLast - &pbktT->m_rgEntry[0] );
            }
            pbktT = pbktNextT;
        }
    }

    // entry counters should match ( +1 is for the inserted entry )

    DHTAssert( cEntriesAfterwards == cEntriesTotal + 1 );
#endif

    return errSuccess;
}

// delete the current entry
// if currency is before-first or after-last, then errNoCurrentEntry is returned
// if the entry is not the last in the logical bucket, the last entry is promoted
//      to fill in the hole
// should a BUCKET become empty, it will be released immediately
//      (except the head BUCKET, which is only marked empty by setting m_pb to NULL)
// on success the currency entry (plock->m_pEntry) is reset to NULL
//
// NOTE: in DEBUG builds the early-success paths jump to the DoValidation label
//       instead of returning directly

const ERR ErrBKTDeleteEntry( CLock *const plock )
{
    DHTAssert( FBKTWrite( plock ) || FBKTScan( plock ) );
    DHTAssert( plock->m_pBucketHead != NULL );
    DHTAssert( plock->m_pBucket != NULL );

    if ( !plock->m_pEntry )
    {
        // we do not have a current entry

        return errNoCurrentEntry;
    }

#ifdef DEBUG
    // catalogue every BUCKET in the chain (best-effort: skipped if the allocation fails)

    PBUCKET *rgBucketCheck = NULL;
    PBUCKET pbucketT;
    size_t cBucketCheck = 0, iT;

    pbucketT = plock->m_pBucketHead;
    while ( pbucketT )
    {
        cBucketCheck++;
        pbucketT = PbucketBKTNext( pbucketT );
    }

    rgBucketCheck = (PBUCKET *)PvMEMAlloc( cBucketCheck * sizeof( PBUCKET ) );
    if ( NULL != rgBucketCheck )
    {
        iT = 0;
        pbucketT = plock->m_pBucketHead;
        while ( pbucketT )
        {
            rgBucketCheck[ iT++ ] = pbucketT;
            pbucketT = PbucketBKTNext( pbucketT );
        }
    }

    // count the number of entries we will be handling

    size_t cEntriesTotal = 0;
    PBUCKET pbktT, pbktNextT;

    pbktT = plock->m_pBucketHead;
    if ( pbktT->m_pb != NULL )
    {
        while ( pbktT )
        {
            pbktNextT = PbucketBKTNext( pbktT );
            if ( pbktNextT )
            {
                // full bucket

                cEntriesTotal += size_t( m_centryBucket );
            }
            else
            {
                // partial bucket (not empty)

                cEntriesTotal += 1 + ( pbktT->m_pEntryLast - &pbktT->m_rgEntry[0] );
            }
            pbktT = pbktNextT;
        }
    }
#endif

    // we have a valid entry

    PBUCKET     pBucketThis = plock->m_pBucket;
    CKeyEntry   *pEntryThis = plock->m_pEntry;
    PBUCKET     pBucketFree = NULL;     // used later if we free a BUCKET structure

    if ( pEntryThis != pBucketThis->m_pEntryLast )
    {
        // we are not deleting the last entry in the bucket
        // promote the last entry to fill in this spot left by the entry we are deleting

        // move to the last bucket

        PBUCKET pBucketT = PbucketBKTNext( pBucketThis );
        while ( pBucketT )
        {
            pBucketThis = pBucketT;
            pBucketT = PbucketBKTNext( pBucketT );
        }

        // move to the last entry in the last BUCKET

        pEntryThis = pBucketThis->m_pEntryLast;

        // copy the entry

        plock->m_pEntry->SetEntry( pEntryThis->m_entry );
    }

    // update the currency to show that we are no longer on an entry

    plock->m_pEntry = NULL;

    // we are now pointing to the last entry in the last bucket
    // (via pBucketThis/pEntryThis), and that entry needs to be
    // "deleted" from the bucket

    // update the next/end ptr to reflect this deletion

    if ( pEntryThis != &pBucketThis->m_rgEntry[0] )
    {
        // entries still remain in the last bucket

        DHTAssert( pBucketThis->m_pEntryLast == pEntryThis );
        pBucketThis->m_pEntryLast--;    // pEntryThis - 1;

#ifdef DEBUG
        // jump to the validation code

        goto DoValidation;
#endif

        return errSuccess;
    }

    // no entries remain in the last bucket

    if ( pBucketThis == plock->m_pBucketHead )
    {
        // this bucket is empty, but we cannot release it because it is part of the bucket array
        // instead, we mark it as being empty

        pBucketThis->m_pb = NULL;

#ifdef DEBUG
        // jump to the validation code

        goto DoValidation;
#endif

        return errSuccess;
    }

    // we can free the last bucket

    pBucketFree = pBucketThis;

    // unchain it
    // (the previous BUCKET becomes the last one and is full, so its end ptr is set to
    // its final entry slot)

    DHTAssert( pBucketThis->m_pBucketPrev->m_pBucketNext == pBucketThis );
    pBucketThis = pBucketThis->m_pBucketPrev;
    pBucketThis->m_pEntryLast = &pBucketThis->m_rgEntry[ m_centryBucket - 1 ];

    // free it

    MEMFree( pBucketFree );

    // (pBucketFree is only compared by address below; it is never dereferenced again)

    if ( plock->m_pBucket == pBucketFree )
    {
        // our currency was on the last bucket which is now invalid
        // move to the previous bucket (which is now the NEW last BUCKET)

        plock->m_pBucket = pBucketThis;
    }

    STATDeleteOverflowBucket();

#ifdef DEBUG
    // check each catalogued bucket to see if it is still there

DoValidation:
    if ( NULL != rgBucketCheck )
    {
        pbucketT = plock->m_pBucketHead;
        DHTAssert( pbucketT );

        // find and remove all buckets found in the destination bucket from our list

        while ( pbucketT )
        {
            for ( iT = 0; iT < cBucketCheck; iT++ )
            {
                if ( rgBucketCheck[iT] == pbucketT )
                {
                    rgBucketCheck[iT] = NULL;
                    break;
                }
            }
            DHTAssert( iT < cBucketCheck );     // if this goes off, we somehow got a bucket
                                                // into the chain that shouldn't be there
                                                // (it is a bucket we never catalogued!)
            pbucketT = PbucketBKTNext( pbucketT );
        }

        // remove pBucketFree from rgBucketCheck

        if ( pBucketFree )
        {
            for ( iT = 0; iT < cBucketCheck; iT++ )
            {
                if ( rgBucketCheck[iT] == pBucketFree )
                {
                    rgBucketCheck[iT] = NULL;
                    break;
                }
            }
            DHTAssert( iT < cBucketCheck );     // if this goes off, we freed a bucket that
                                                // was never catalogued!  we should only be freeing
                                                // buckets that were in the original catalogue!
        }

        // the list should now be empty -- verify this

        for ( iT = 0; iT < cBucketCheck; iT++ )
        {
            // if this goes off, rgBucketCheck[iT] contains a bucket that was abandoned without
            // being freed!
DHTAssert( rgBucketCheck[iT] == NULL );
        }

        // free the list

        MEMFree( rgBucketCheck );
    }

    // make sure the number of entries has not changed since we started

    size_t cEntriesAfterwards = 0;

    pbktT = plock->m_pBucketHead;
    if ( pbktT->m_pb != NULL )
    {
        while ( pbktT )
        {
            pbktNextT = PbucketBKTNext( pbktT );
            if ( pbktNextT )
            {
                // full bucket

                cEntriesAfterwards += size_t( m_centryBucket );
            }
            else
            {
                // partial bucket (not empty)

                cEntriesAfterwards += 1 + ( pbktT->m_pEntryLast - &pbktT->m_rgEntry[0] );
            }
            pbktT = pbktNextT;
        }
    }

    // entry counters should match ( -1 is for the deleted entry )

    DHTAssert( cEntriesAfterwards == cEntriesTotal - 1 );
#endif

    return errSuccess;
}

// split to a new bucket
//
// Performs at most one split step and returns silently (doing nothing) when:
//   - the requested growth is already complete
//   - the bucket reservation cannot be made
//   - the source bucket's writer lock cannot be taken immediately
//   - another caller already advanced the bucket count
//   - the destination bucket array cannot be allocated
// Every early exit taken after the reservation was made undoes it with POOLUnreserve.

void BKTISplit( HOTSTUFF* const phs )
{
    // NOTE: from our perspective, we are in the grow state
    //       however, the current state may be set to something else due to a pending transition

    // read the directory pointers

    const NativeCounter cBucketMax  = NcDIRIGetBucketMax( stateGrow );
    const NativeCounter cBucket     = NcDIRIGetBucket( stateGrow );

    if ( cBucketMax + cBucket >= m_cBucketPreferred || cBucket == cBucketMax )
    {
        return;     // the requested growth is complete
    }

    // we need to reserve memory now to ensure that the growth will succeed
    // (BKTIDoSplit will commit or unreserve this reservation later)

    if ( !phs->m_bucketpool.FPOOLReserve( m_cbBucket ) )
    {
        return;
    }

    // get the source bucket

    const PBUCKET pbucketGrowSrc = PbucketDIRIHash( stateGrow, cBucket );

    // try to get the lock

    if (    pbucketGrowSrc->CRWL().FWritersQuiesced() ||
            !pbucketGrowSrc->CRWL().FTryEnterAsWriter() )
    {
        STATSplitContention();
        phs->m_bucketpool.POOLUnreserve();
        return;
    }

    // having a write-lock on the source bucket means no one else attempting to split can
    // be farther along than us at this moment unless they completed the growth already

    // see whether or not m_cBucket changed while we were trying to get here
    //      if it stayed the same, we were the first ones to split this bucket
    //      if it changed, we were not first; instead, someone else managed to split AFTER
    //      we read m_cBucket but BEFORE we could do the split ourselves

    if ( cBucket != NcDIRIGetBucket( stateGrow ) )
    {
        DHTAssert( cBucket < NcDIRIGetBucket( stateGrow ) );
        pbucketGrowSrc->CRWL().LeaveAsWriter();
        phs->m_bucketpool.POOLUnreserve();
        return;
    }

    // get the destination bucket (may not be allocated yet so we cannot use PbucketDIRIHash)

    NativeCounter iExponent;
    NativeCounter iRemainder;
    DIRILog2( cBucketMax + cBucket, &iExponent, &iRemainder );

    // extract the address of the bucket

    if ( !m_rgrgBucket[ iExponent ] )
    {
        // allocate a new bucket array to hold 2^iExponent buckets for this entry

        if ( ErrDIRInitBucketArray( cBucketMax, cBucketMax, &m_rgrgBucket[ iExponent ] ) != errSuccess )
        {
            pbucketGrowSrc->CRWL().LeaveAsWriter();
            phs->m_bucketpool.POOLUnreserve();
            return;
        }
    }
    DHTAssert( m_rgrgBucket[ iExponent ] );

    // get the destination bucket

    const PBUCKET pbucketGrowDst = PbucketDIRIResolve( iExponent, iRemainder );

    // lock the destination bucket (no possibility of contention here)
    // NOTE(review): the result of FTryEnterAsWriter is not checked; this relies on the
    //               no-contention claim above -- confirm

    pbucketGrowDst->CRWL().FTryEnterAsWriter();

    // increase m_cBucket (we cannot turn back after this point)
    //      anyone who hashes to the new bucket will be queued up until the growth is complete

    DHTAssert( cBucket == NcDIRIGetBucket( stateGrow ) );
    m_dirptrs[ 0 ].m_cBucket++;

    // do the growth work

    BKTIDoSplit( phs, pbucketGrowSrc, pbucketGrowDst, cBucket );

    // release the write-locks

    pbucketGrowSrc->CRWL().LeaveAsWriter();
    pbucketGrowDst->CRWL().LeaveAsWriter();
}

// merge two existing buckets into one
//
// Performs at most one merge step and returns silently (doing nothing) when the
// requested shrinkage is already complete, the bucket reservation cannot be made,
// either bucket's writer lock cannot be taken immediately, or another caller already
// changed the bucket count.  Every early exit taken after the reservation was made
// undoes it with POOLUnreserve.

void BKTIMerge( HOTSTUFF* const phs )
{
    // NOTE: from our perspective, we are in the shrink state
    //       however, the current state may be set to something else due to a pending transition

    // read the directory pointers

    const NativeCounter cBucketMax  = NcDIRIGetBucketMax( stateShrink );
    NativeCounter       cBucket     = NcDIRIGetBucket( stateShrink );

    if ( cBucketMax + cBucket <= m_cBucketPreferred || cBucket == 0 )
    {
        return;     // the requested shrinkage is complete
    }
    cBucket--;      // the bucket we are merging is really 1 below cBucket

    // we need to reserve memory now to ensure that the shrinkage will succeed
    // (BKTIDoMerge will commit or unreserve this reservation later)

    if ( !phs->m_bucketpool.FPOOLReserve( m_cbBucket ) )
    {
        return;
    }

    // get the destination bucket

    const PBUCKET pbucketShrinkDst = PbucketDIRIHash( stateShrink, cBucket );

    // try to get the lock

    if (    pbucketShrinkDst->CRWL().FWritersQuiesced() ||
            !pbucketShrinkDst->CRWL().FTryEnterAsWriter() )
    {
        STATMergeContention();
        phs->m_bucketpool.POOLUnreserve();
        return;
    }

    // having a write-lock on the destination bucket means no one else attempting to merge can
    // be farther along than us at this moment unless they completed the shrinkage already

    // see whether or not the bucket count (NcDIRIGetBucket) changed while we were trying to get here
    //      if it stayed the same, we were the first ones to merge this bucket
    //      if it changed, we were not first; instead, someone else managed to merge AFTER
    //      we read m_cBucket but BEFORE we could do the merge ourselves

    if ( cBucket + 1 != NcDIRIGetBucket( stateShrink ) )
    {
        DHTAssert( cBucket + 1 > NcDIRIGetBucket( stateShrink ) );
        pbucketShrinkDst->CRWL().LeaveAsWriter();
        phs->m_bucketpool.POOLUnreserve();
        return;
    }

    // convert cBucket to a bucket address

    NativeCounter iExponent;
    NativeCounter iRemainder;
    DIRILog2( cBucket + NcDIRIGetBucketMax( stateShrink ), &iExponent, &iRemainder );

    // extract the address of the bucket

    const PBUCKET pbucketShrinkSrc = PbucketDIRIResolve( iExponent, iRemainder );

    // try to get the lock
    // (on failure the destination lock taken above is released as well)

    if (    pbucketShrinkSrc->CRWL().FWritersQuiesced() ||
            !pbucketShrinkSrc->CRWL().FTryEnterAsWriter() )
    {
        STATMergeContention();
        pbucketShrinkDst->CRWL().LeaveAsWriter();
        phs->m_bucketpool.POOLUnreserve();
        return;
    }

    // decrease m_cBucket (we cannot turn back after this point)
    //      anyone who hashes to the destination bucket will be queued up until
    //      the merge is complete
    //      no one will be able to hash to the source bucket

    DHTAssert( cBucket + 1 ==
NcDIRIGetBucket( stateShrink ) );   // (completes the DHTAssert started on the previous line)
    m_dirptrs[ 0 ].m_cBucket--;

    // do the shrinkage work

    BKTIDoMerge( phs, pbucketShrinkSrc, pbucketShrinkDst );

    // release the write-locks

    pbucketShrinkDst->CRWL().LeaveAsWriter();
    pbucketShrinkSrc->CRWL().LeaveAsWriter();
}

// work-horse for splitting a bucket
//
//   phs            supplies m_bucketpool; at most one reserved bucket is committed from it,
//                  and the reservation is cancelled at the end if it was never used
//   pBucketSrcSrc  head BUCKET of the source chain (read cursor; also the src/dst write cursor)
//   pBucketDst     destination BUCKET (asserted to be empty on entry)
//   iHashSrc       entries whose ( Hash() & iHashMask ) equals this value stay in the source
//                  chain; all other entries are copied to the destination chain
//
// Entries that stay are compacted toward the front of the source chain.  Source BUCKETs
// that are drained and not in use by the src/dst write cursor are unlinked onto a local
// avail list, which the destination cursor draws from before falling back to the reserved
// bucket.  Whatever remains on the avail list is freed at the end.
//
// In DEBUG builds every BUCKET and every entry is accounted for before and after.

void BKTIDoSplit(   HOTSTUFF* const     phs,
                    PBUCKET             pBucketSrcSrc,
                    PBUCKET             pBucketDst,
                    const NativeCounter iHashSrc )
{
#ifdef DEBUG
    PBUCKET pBucketSrcSrcOriginal = pBucketSrcSrc;
    PBUCKET pBucketDstOriginal = pBucketDst;
    size_t cEntriesTotal = 0, cEntriesTotalRunning = 0;
    PBUCKET pbktT, pbktNextT;

    // catalog each BUCKET structure and make sure they end up in the destination bucket

    PBUCKET *rgBucketCheck = NULL, pbucketTX;
    size_t cBucketCheck = 0, iT;

    pbucketTX = pBucketSrcSrc;
    while ( pbucketTX )
    {
        cBucketCheck++;
        pbucketTX = PbucketBKTNext( pbucketTX );
    }
    pbucketTX = pBucketDst;
    DHTAssert( PbucketBKTNext( pbucketTX ) == NULL );
    while ( pbucketTX )
    {
        cBucketCheck++;
        pbucketTX = PbucketBKTNext( pbucketTX );
    }
    cBucketCheck++; // account for bucket from heap

    // the catalogue is best-effort: if this allocation fails, the bucket checks are skipped

    rgBucketCheck = (PBUCKET *)PvMEMAlloc( cBucketCheck * sizeof( PBUCKET ) );
    if ( NULL != rgBucketCheck )
    {
        iT = 0;
        pbucketTX = pBucketSrcSrc;
        while ( pbucketTX )
        {
            rgBucketCheck[ iT++ ] = pbucketTX;
            pbucketTX = PbucketBKTNext( pbucketTX );
        }
        pbucketTX = pBucketDst;
        while ( pbucketTX )
        {
            rgBucketCheck[ iT++ ] = pbucketTX;
            pbucketTX = PbucketBKTNext( pbucketTX );
        }
        rgBucketCheck[ iT++ ] = NULL; // heap bucket
        DHTAssert( iT == cBucketCheck );
    }

    // count the number of entries that are in the source bucket

    pbktT = pBucketSrcSrc;
    if ( pbktT->m_pb != NULL )
    {
        while ( pbktT )
        {
            pbktNextT = PbucketBKTNext( pbktT );
            if ( pbktNextT )
            {
                // full bucket

                cEntriesTotal += size_t( m_centryBucket );
            }
            else
            {
                // partial bucket (not empty)

                cEntriesTotal += 1 + ( pbktT->m_pEntryLast - &pbktT->m_rgEntry[0] );
            }
            pbktT = pbktNextT;
        }
    }
#endif

    // cursor for reading entries

    PBUCKET     pBucketNextSrc;
    CKeyEntry   *pEntryThisSrc;
    CKeyEntry   *pEntryMostSrc;

    // cursors for writing entries
    //      index 0 is for the SrcDst cursor (entries whose src and dst is the source bucket)
    //      index 1 is for the Dst cursor (entries whose dst is the destination bucket)

    PBUCKET     pBucketThis[2];
    CKeyEntry   *pEntryThis[2];
    CKeyEntry   *pEntryMost[2];
    CKeyEntry   *pEntryLast[2];
    size_t      iIndex;

    // extra buckets

    PBUCKET     pBucketAvail = NULL;

    // remember if we used the bucket from the heap

    BOOL        fBucketFromHeap = fFalse;

    // used for hashing

    NativeCounter   iHashMask;

    DHTAssert( pBucketSrcSrc );
    DHTAssert( pBucketDst );
    DHTAssert( pBucketDst->m_pb == NULL );

    // calculate the hash-mask (prevent wraparound)

    DHTAssert( NcDIRIGetBucketMax( stateGrow ) > 0 );
    iHashMask = ( NcDIRIGetBucketMax( stateGrow ) - 1 ) + NcDIRIGetBucketMax( stateGrow );

    // prepare the read cursor

    pBucketNextSrc = PbucketBKTNext( pBucketSrcSrc );
    pEntryThisSrc = &pBucketSrcSrc->m_rgEntry[ 0 ];
    pEntryMostSrc = PentryBKTNextMost( pBucketSrcSrc );

    // prepare the src-dst write cursor

    pBucketThis[ 0 ]    = pBucketSrcSrc;
    pEntryThis[ 0 ]     = &pBucketSrcSrc->m_rgEntry[ 0 ];
    pEntryMost[ 0 ]     = &pBucketSrcSrc->m_rgEntry[ m_centryBucket ];
    pEntryLast[ 0 ]     = NULL;

    // prepare the dst write cursor

    pBucketThis[ 1 ]    = pBucketDst;
    pEntryThis[ 1 ]     = &pBucketDst->m_rgEntry[ 0 ];
    pEntryMost[ 1 ]     = &pBucketDst->m_rgEntry[ m_centryBucket ];
    pEntryLast[ 1 ]     = NULL;

    // iterate over all entries in the source bucket

    while ( fTrue )
    {
        // check the read (src) cursor

        if ( pEntryThisSrc < pEntryMostSrc )
        {
            // nop
        }
        else if ( NULL == pBucketNextSrc )
        {
            // all entries have been exhausted

            break;
        }
        else
        {
            // all entries in the current bucket have been exhausted

            if ( pBucketSrcSrc != pBucketThis[ 0 ] )
            {
                // the bucket we are leaving is completely empty and the
                // SrcDst pointer is not using it
                // we need to put it into the available bucket list

                // the bucket ordering should be like this:
                //      pBucketThis[0]  (src/dst bucket)
                //      pBucketSrcSrc   (src bucket)
                //      pBucketNextSrc  (next src bucket)

                DHTAssert( pBucketThis[ 0 ]->m_pBucketNext == pBucketSrcSrc );
                DHTAssert( pBucketSrcSrc->m_pBucketNext == pBucketNextSrc );
                DHTAssert( pBucketNextSrc->m_pBucketPrev == pBucketSrcSrc );
                DHTAssert( pBucketSrcSrc->m_pBucketPrev == pBucketThis[ 0 ] );

                // update the bucket links to "remove" the free bucket

                pBucketThis[ 0 ]->m_pBucketNext = pBucketNextSrc;
                pBucketNextSrc->m_pBucketPrev = pBucketThis[ 0 ];

                // add the bucket to the avail list
                // (the avail list is singly linked through m_pBucketNext)

                pBucketSrcSrc->m_pBucketNext = pBucketAvail;
                pBucketAvail = pBucketSrcSrc;
            }

            // move to the next bucket

            pEntryThisSrc = &pBucketNextSrc->m_rgEntry[ 0 ];
            pEntryMostSrc = PentryBKTNextMost( pBucketNextSrc );
            pBucketSrcSrc = pBucketNextSrc;
            pBucketNextSrc = PbucketBKTNext( pBucketNextSrc );
        }

        // calculate the hash value
        // (iIndex selects the write cursor: 0 = entry stays in the source, 1 = entry moves to dst)

        iIndex = BOOL( ( pEntryThisSrc->Hash() & iHashMask ) != iHashSrc );
        DHTAssert( iIndex == 0 || iIndex == 1 );
#ifdef DEBUG
        cEntriesTotalRunning++;
#endif  //  DEBUG

        // check the write (src/dst or dst) cursor

        if ( pEntryThis[ iIndex ] < pEntryMost[ iIndex ] )
        {
            // nop
        }
        else
        {
            // all entries in the current cursor's bucket are exhausted

            if ( 0 == iIndex )
            {
                // the src/dst cursor will always have a next bucket

                DHTAssert( pBucketThis[ 0 ]->m_pBucketNext->m_pBucketPrev == pBucketThis[ 0 ] );
                pBucketThis[ 0 ] = pBucketThis[ 0 ]->m_pBucketNext;

                // setup the entry ptrs

                pEntryThis[ 0 ] = &pBucketThis[ 0 ]->m_rgEntry[ 0 ];
                pEntryMost[ 0 ] = &pBucketThis[ 0 ]->m_rgEntry[ m_centryBucket ];
            }
            else
            {
                // the dst cursor must allocate a new bucket

                if ( pBucketAvail )
                {
                    // get a bucket from the avail list

                    const PBUCKET pBucketNew = pBucketAvail;
                    pBucketAvail = pBucketAvail->m_pBucketNext;

                    // chain it

                    pBucketThis[ 1 ]->m_pBucketNext = pBucketNew;
                    pBucketNew->m_pBucketPrev = pBucketThis[ 1 ];

                    // move to it

                    pBucketThis[ 1 ] = pBucketNew;
                }
                else
                {
                    // get a bucket from the reservation pool
                    // (the assert documents that this happens at most once per split)

                    DHTAssert( !fBucketFromHeap );
                    fBucketFromHeap = fTrue;

                    // allocate it

                    const PBUCKET pBucketReserve = phs->m_bucketpool.PbucketPOOLCommit();
                    DHTAssert( pBucketReserve );

                    STATInsertOverflowBucket();

#ifdef DEBUG
                    // add the heap bucket to our catalog of buckets

                    if ( NULL != rgBucketCheck )
                    {
                        DHTAssert( NULL == rgBucketCheck[ cBucketCheck - 1 ] );
                        rgBucketCheck[ cBucketCheck - 1 ] = pBucketReserve;
                    }
#endif  //  DEBUG

                    // chain it

                    pBucketThis[ 1 ]->m_pBucketNext = pBucketReserve;
                    pBucketReserve->m_pBucketPrev = pBucketThis[ 1 ];

                    // move to it

                    pBucketThis[ 1 ] = pBucketReserve;
                }

                // setup the entry ptrs

                pEntryThis[ 1 ] = &pBucketThis[ 1 ]->m_rgEntry[ 0 ];
                pEntryMost[ 1 ] = &pBucketThis[ 1 ]->m_rgEntry[ m_centryBucket ];
            }
        }

        // copy the entry

        pEntryThis[ iIndex ]->SetEntry( pEntryThisSrc->m_entry );

        // advance the write (src/dst or dst) cursor

        pEntryLast[ iIndex ] = pEntryThis[ iIndex ];
        pEntryThis[ iIndex ]++;

        // advance the read (src) cursor

        pEntryThisSrc++;
    }

    if ( pBucketSrcSrc == pBucketThis[ 0 ] )
    {
        // nop
    }
    else
    {
        // the last bucket of the src bucket is no longer needed

        // the bucket ordering should be like this:
        //      pBucketThis[0]  (src/dst bucket)
        //      pBucketSrcSrc   (src bucket)
        //      << NOTHING >>

        DHTAssert( pBucketThis[ 0 ]->m_pBucketNext == pBucketSrcSrc );
        DHTAssert( pBucketSrcSrc->m_pBucketPrev == pBucketThis[ 0 ] );

        // free the bucket
        // (pBucketSrcSrc is only compared by address after this point)

        MEMFree( pBucketSrcSrc );

        STATDeleteOverflowBucket();

#ifdef DEBUG
        // remove the bucket from the bucket-catalog

        if ( NULL != rgBucketCheck )
        {
            for ( iT = 0; iT < cBucketCheck; iT++ )
            {
                if ( rgBucketCheck[iT] == pBucketSrcSrc )
                {
                    rgBucketCheck[iT] = NULL;
                    break;
                }
            }
            DHTAssert( iT < cBucketCheck );     // the bucket better be in the bucket-catalog!
        }
#endif  //  DEBUG
    }

    // update the next/end ptrs for the src/dst cursor and the dst cursor
    // (a cursor that never received an entry stores NULL here, i.e. an empty bucket)

    pBucketThis[ 0 ]->m_pEntryLast = pEntryLast[ 0 ];
    pBucketThis[ 1 ]->m_pEntryLast = pEntryLast[ 1 ];

#ifdef DEBUG
    if ( NULL != rgBucketCheck )
    {
        // check each catalogued bucket to see if it is in the pBucketSrcSrc, pBucketDst, or pBucketAvail

        // find and remove all buckets in pBucketSrcSrc

        pbucketTX = pBucketSrcSrcOriginal;
        DHTAssert( pbucketTX );
        while ( pbucketTX )
        {
            for ( iT = 0; iT < cBucketCheck; iT++ )
            {
                if ( rgBucketCheck[iT] == pbucketTX )
                {
                    rgBucketCheck[iT] = NULL;
                    break;
                }
            }
            DHTAssert( iT < cBucketCheck );     // if this goes off, we somehow added a bucket to the
                                                // SOURCE CHAIN -- THIS SHOULD NEVER HAPPEN!  also, we
                                                // never catalogued the bucket!
            pbucketTX = PbucketBKTNext( pbucketTX );
        }

        // find and remove all buckets in pBucketDst

        pbucketTX = pBucketDstOriginal;
        DHTAssert( pbucketTX );
        while ( pbucketTX )
        {
            for ( iT = 0; iT < cBucketCheck; iT++ )
            {
                if ( rgBucketCheck[iT] == pbucketTX )
                {
                    rgBucketCheck[iT] = NULL;
                    break;
                }
            }
            DHTAssert( iT < cBucketCheck );     // if this goes off, we added a bucket to the destination
                                                // chain, but it was never catalogued!  first question: where
                                                // did the bucket come from if we didn't catalogue it???
            pbucketTX = PbucketBKTNext( pbucketTX );
        }

        // find and remove all buckets in pBucketAvail

        pbucketTX = pBucketAvail;
        while ( pbucketTX )
        {
            for ( iT = 0; iT < cBucketCheck; iT++ )
            {
                if ( rgBucketCheck[iT] == pbucketTX )
                {
                    rgBucketCheck[iT] = NULL;
                    break;
                }
            }
            DHTAssert( iT < cBucketCheck );     // if this goes off, we have a free bucket that was never
                                                // catalogued!  where did it come from?
                                                // NOTE: this is not a memleak, it is a "we-never-catalogued-it"
                                                //       problem; the memory will be freed later in this function
            pbucketTX = pbucketTX->m_pBucketNext;
        }

        // the list should now be empty -- verify this

        for ( iT = 0; iT < cBucketCheck; iT++ )
        {
            // if this goes off, rgBucketCheck[iT] contains a bucket that was abandoned without
            // being freed!

            DHTAssert( rgBucketCheck[iT] == NULL );
        }

        // free the list

        MEMFree( rgBucketCheck );
    }

    size_t cEntriesAfterwards = 0;

    // make sure the number of entries we processed matches the number of entries we started with

    DHTAssert( cEntriesTotal == cEntriesTotalRunning );

    // make sure we have all the entries we started with

    pbktT = pBucketSrcSrcOriginal;
    if ( pbktT->m_pb != NULL )
    {
        while ( pbktT )
        {
            pbktNextT = PbucketBKTNext( pbktT );
            if ( pbktNextT )
            {
                // full bucket

                cEntriesAfterwards += size_t( m_centryBucket );
            }
            else
            {
                // partial bucket (not empty)

                cEntriesAfterwards += 1 + ( pbktT->m_pEntryLast - &pbktT->m_rgEntry[0] );
            }
            pbktT = pbktNextT;
        }
    }

    pbktT = pBucketDstOriginal;
    if ( pbktT->m_pb != NULL )
    {
        while ( pbktT )
        {
            pbktNextT = PbucketBKTNext( pbktT );
            if ( pbktNextT )
            {
                // full bucket

                cEntriesAfterwards += size_t( m_centryBucket );
            }
            else
            {
                // partial bucket (not empty)

                cEntriesAfterwards += 1 + ( pbktT->m_pEntryLast - &pbktT->m_rgEntry[0] );
            }
            pbktT = pbktNextT;
        }
    }

    DHTAssert( cEntriesAfterwards == cEntriesTotal );
#endif

    // free the avail list

    while ( pBucketAvail )
    {
        PBUCKET pBucketT;

        pBucketT = pBucketAvail;
        pBucketAvail = pBucketAvail->m_pBucketNext;
        MEMFree( pBucketT );
        STATDeleteOverflowBucket();
    }

    if ( !fBucketFromHeap )
    {
        phs->m_bucketpool.POOLUnreserve();  // cancel the heap reservation (we never used it)
    }

    STATSplitBucket();
}

// work-horse for shrinking a bucket

void BKTIDoMerge(   HOTSTUFF* const phs,
                    PBUCKET         pBucketSrc,
                    PBUCKET         pBucketDst )
{
#ifdef DEBUG
    // catalog each BUCKET structure and make sure they end up in the destination bucket

    PBUCKET pBucketDstOriginal = pBucketDst;
    PBUCKET *rgBucketCheck = NULL, pbucketT;
    size_t cBucketCheck = 0, iT;

    pbucketT = pBucketSrc;
    while ( pbucketT )
    {
        cBucketCheck++;
        pbucketT = PbucketBKTNext( pbucketT );
    }
    pbucketT = pBucketDst;
    while ( pbucketT )
    {
        cBucketCheck++;
        pbucketT = PbucketBKTNext( pbucketT );
    }
    cBucketCheck++; // account for bucket from heap

    rgBucketCheck = (PBUCKET *)PvMEMAlloc( cBucketCheck *
sizeof( PBUCKET ) ); if ( NULL != rgBucketCheck ) { iT = 0; pbucketT = pBucketSrc; while ( pbucketT ) { rgBucketCheck[ iT++ ] = pbucketT; pbucketT = PbucketBKTNext( pbucketT ); } pbucketT = pBucketDst; while ( pbucketT ) { rgBucketCheck[ iT++ ] = pbucketT; pbucketT = PbucketBKTNext( pbucketT ); } rgBucketCheck[ iT++ ] = NULL; // heap bucket DHTAssert( iT == cBucketCheck ); } // count the number of entries we will be handling size_t cEntriesTotal = 0; PBUCKET pbktT, pbktNextT; pbktT = pBucketSrc; if ( pbktT->m_pb != NULL ) { while ( pbktT ) { pbktNextT = PbucketBKTNext( pbktT ); if ( pbktNextT ) { // full bucket cEntriesTotal += size_t( m_centryBucket ); } else { // partial bucket (not empty) cEntriesTotal += 1 + ( pbktT->m_pEntryLast - &pbktT->m_rgEntry[0] ); } pbktT = pbktNextT; } } pbktT = pBucketDst; if ( pbktT->m_pb != NULL ) { while ( pbktT ) { pbktNextT = PbucketBKTNext( pbktT ); if ( pbktNextT ) { // full bucket cEntriesTotal += size_t( m_centryBucket ); } else { // partial bucket (not empty) cEntriesTotal += 1 + ( pbktT->m_pEntryLast - &pbktT->m_rgEntry[0] ); } pbktT = pbktNextT; } } #endif // read (src) cursor CKeyEntry *pEntryThisSrc; CKeyEntry *pEntryMostSrc; // write (dst) cursor CKeyEntry *pEntryThisDst; CKeyEntry *pEntryMostDst; // remember if we have moved to the last bucket or not BOOL fSetEndPtr; // remember if we allocated a bucket from the heap BOOL fBucketFromHeap = fFalse; // efficiency variables PBUCKET pBucketT; // move to the end of the dst bucket pBucketT = PbucketBKTNext( pBucketDst ); while ( pBucketT ) { pBucketDst = pBucketT; pBucketT = PbucketBKTNext( pBucketT ); } pEntryThisDst = PentryBKTNextMost( pBucketDst ); pEntryMostDst = &pBucketDst->m_rgEntry[ m_centryBucket ]; if ( !PbucketBKTNext( pBucketSrc ) ) { // the src bucket does not have extra bucket structures // setup the src cursor for a partial pass pEntryThisSrc = &pBucketSrc->m_rgEntry[ 0 ]; pEntryMostSrc = PentryBKTNextMost( pBucketSrc ); // we are not appending buckets from 
the src bucket, so we will be setting the // end ptr of the dst bucket iff we add entries from the src bucket fSetEndPtr = BOOL( pEntryThisSrc < pEntryMostSrc ); } else { // the src bucket has extra bucket structures // attach the extra bucket structures to the dst bucket pBucketDst->m_pBucketNext = pBucketSrc->m_pBucketNext; pBucketDst->m_pBucketNext->m_pBucketPrev = pBucketDst; // setup the src cursor for a full pass over the first src bucket pEntryThisSrc = &pBucketSrc->m_rgEntry[ 0 ]; pEntryMostSrc = &pBucketSrc->m_rgEntry[ m_centryBucket ]; // we are appending buckets from the src bucket, so we will not be setting the // end ptr of the dst bucket because we are no longer in the last bucket // of the dst bucket chain fSetEndPtr = fFalse; } // copy the entries in the src bucket while ( pEntryThisSrc < pEntryMostSrc ) { // check the dst cursor if ( pEntryThisDst < pEntryMostDst ) { // nop } else { // all entries in the dst bucket are exhausted if ( !fSetEndPtr ) { // we are not in the last bucket of the dst bucket because there is no end ptr pBucketT = PbucketBKTNext( pBucketDst ); DHTAssert( pBucketT ); do { pBucketDst = pBucketT; pBucketT = PbucketBKTNext( pBucketT ); } while ( pBucketT ); // setup the dst cursor pEntryThisDst = pBucketDst->m_pEntryLast + 1; pEntryMostDst = &pBucketDst->m_rgEntry[ m_centryBucket ]; // we are now able to set the end ptr because we are in the last bucket // of the dst bucket fSetEndPtr = fTrue; // restart the loop continue; } // we were at the last bucket in the dst bucket // get a bucket from the heap reservation pool DHTAssert( !fBucketFromHeap ); fBucketFromHeap = fTrue; // commit the reservation now pBucketT = phs->m_bucketpool.PbucketPOOLCommit(); DHTAssert( pBucketT ); STATInsertOverflowBucket(); // chain the heap bucket pBucketDst->m_pBucketNext = pBucketT; pBucketT->m_pBucketPrev = pBucketDst; // setup the dst cursor pBucketDst = pBucketT; pEntryThisDst = &pBucketDst->m_rgEntry[ 0 ]; pEntryMostDst = 
&pBucketDst->m_rgEntry[ m_centryBucket ]; #ifdef DEBUG // add the heap bucket to our catalog of buckets if ( NULL != rgBucketCheck ) { DHTAssert( rgBucketCheck[cBucketCheck - 1] == NULL ); rgBucketCheck[cBucketCheck - 1] = pBucketT; } #endif // DEBUG } // copy the entry pEntryThisDst->SetEntry( pEntryThisSrc->m_entry ); // advance the cursors pEntryThisSrc++; pEntryThisDst++; } // mark the src bucket as empty pBucketSrc->m_pb = NULL; if ( fSetEndPtr ) { // set the end of the destination bucket DHTAssert( pEntryThisDst != &pBucketDst->m_rgEntry[ 0 ] ); pBucketDst->m_pEntryLast = pEntryThisDst - 1; } else { // we do not need to set the end ptr of the dst bucket // nop } if ( !fBucketFromHeap ) { // cancel the unused heap reservation phs->m_bucketpool.POOLUnreserve(); } #ifdef DEBUG if ( NULL != rgBucketCheck ) { // check each catalogued bucket to see if it is in the pBucketDst bucket pbucketT = pBucketDstOriginal; DHTAssert( pbucketT ); // find an remove all buckets found in the destiantion bucket from our list while ( pbucketT ) { for ( iT = 0; iT < cBucketCheck; iT++ ) { if ( rgBucketCheck[iT] == pbucketT ) { rgBucketCheck[iT] = NULL; break; } } DHTAssert( iT < cBucketCheck ); // if this goes off, we somehow got a bucket // into the chain that shouldn't be there // (it is a bucket we never catalogued!) pbucketT = PbucketBKTNext( pbucketT ); } // find an remove pBucketSrc from our list for ( iT = 0; iT < cBucketCheck; iT++ ) { if ( rgBucketCheck[iT] == pBucketSrc ) { rgBucketCheck[iT] = NULL; break; } } DHTAssert( iT < cBucketCheck ); // if this goes off, somehow the FIXED source bucket // got removed from our catalogue OR pBucketSrc was // changed (which should never happen) // the list should now be empty -- verify this for ( iT = 0; iT < cBucketCheck; iT++ ) { // if this goes off, rgBucketCheck[iT] contains a bucket that was abandoned without // being freed! 
DHTAssert( rgBucketCheck[iT] == NULL ); } // free the list MEMFree( rgBucketCheck ); } // make sure the number of entries has not changed since we started size_t cEntriesAfterwards = 0; pbktT = pBucketDstOriginal; if ( pbktT->m_pb != NULL ) { while ( pbktT ) { pbktNextT = PbucketBKTNext( pbktT ); if ( pbktNextT ) { // full bucket cEntriesAfterwards += size_t( m_centryBucket ); } else { // partial bucket (not empty) cEntriesAfterwards += 1 + ( pbktT->m_pEntryLast - &pbktT->m_rgEntry[0] ); } pbktT = pbktNextT; } } DHTAssert( cEntriesAfterwards == cEntriesTotal ); #endif STATMergeBucket(); } ///////////////////////////////////////////////////////////////////////////////////////// // // mechanisms for implementing the dynamic-hash-table policies // // hash to the correct HOTSTUFF element HOTSTUFF *HOTSTUFFHash() const { return m_rghs + OSSYNC::OSSyncGetCurrentProcessor(); } // statistics void STATInsertEntry( HOTSTUFF* const phs ) { AtomicExchangeAddPointer( (void**)&phs->m_cEntry, (void*)1 ); phs->m_cOp++; } void STATDeleteEntry( HOTSTUFF* const phs ) { AtomicExchangeAddPointer( (void**)&phs->m_cEntry, (void*)-1 ); phs->m_cOp++; } void STATInsertOverflowBucket() { #ifdef DHT_STATS m_cBucketOverflowInsert++; #endif // DHT_STATS } void STATDeleteOverflowBucket() { #ifdef DHT_STATS m_cBucketOverflowDelete++; #endif // DHT_STATS } void STATSplitBucket() { #ifdef DHT_STATS m_cBucketSplit++; #endif // DHT_STATS } void STATMergeBucket() { #ifdef DHT_STATS m_cBucketMerge++; #endif // DHT_STATS } void STATSplitDirectory() { #ifdef DHT_STATS m_cDirSplit++; #endif // DHT_STATS } void STATMergeDirectory() { #ifdef DHT_STATS m_cDirMerge++; #endif // DHT_STATS } void STATStateTransition() { #ifdef DHT_STATS m_cTransition++; #endif // DHT_STATS } void STATPolicySelection() { #ifdef DHT_STATS m_cSelection++; #endif // DHT_STATS } void STATSplitContention() { #ifdef DHT_STATS m_cSplitContend++; #endif // DHT_STATS } void STATMergeContention() { #ifdef DHT_STATS m_cMergeContend++; 
#endif // DHT_STATS } // amortized table maintenance void PerformMaintenance() { // enter the state machine HOTSTUFF* phs; const int iGroup = UiSTEnter( &phs ); const ENUMSTATE esCurrent = EsSTGetState(); // carry out the current policy if ( esCurrent == stateGrow ) { BKTISplit( phs ); } else if ( esCurrent == stateShrink ) { BKTIMerge( phs ); } // leave the state machine STLeave( iGroup, phs ); } void SelectMaintenancePolicy( HOTSTUFF* const phs ) { // collect information on the current state of the hash table const ENUMSTATE esCurrent = EsSTGetState(); const NativeCounter cBucketMax = NcDIRIGetBucketMax( esCurrent ); const NativeCounter cBucket = NcDIRIGetBucket( esCurrent ); const NativeCounter cBucketActive = cBucketMax + cBucket; const NativeCounter cOpLocal = phs->m_cOp; // compute the current entry count and op count and reset the op count NativeCounter cEntry = 0; NativeCounter cOp = 0; for ( NativeCounter ihs = 0; ihs < m_chs; ihs++ ) { cEntry += m_rghs[ ihs ].m_cEntry; cOp += m_rghs[ ihs ].m_cOp; m_rghs[ ihs ].m_cOp = 0; } // compute the ideal entry count const NativeCounter cEntryIdeal = m_cLoadFactor * cBucketActive; // compute the max entry count const NativeCounter cEntryMax = m_centryBucket * cBucketActive; // determine our current flexibility in the entry count const NativeCounter cEntryFlexibility = max( m_centryBucket - m_cLoadFactor, cEntryMax / 2 - cEntryIdeal ); // determine our current threshold sensitivity const NativeCounter cOpSensitivity = max( 1, cEntryFlexibility / 2 ); // approximate the local (per-HOTSTUFF) threshold sensitivity const NativeCounter ratio = ( cOp + cOpLocal - 1 ) / cOpLocal; const NativeCounter cOpSensitivityLocal = max( 1, cOpSensitivity / ratio ); // compute the preferred entry count NativeCounter cEntryPreferred = cEntry; if ( cEntryIdeal + ( cEntryFlexibility - cOpSensitivity ) < cEntry ) { cEntryPreferred = cEntry - ( cEntryFlexibility - cOpSensitivity ); } else if ( cEntryIdeal > cEntry + ( cEntryFlexibility - 
cOpSensitivity ) ) { cEntryPreferred = cEntry + ( cEntryFlexibility - cOpSensitivity ); } // compute the preferred bucket count const NativeCounter cBucketPreferred = max( m_cbucketMin, ( cEntryPreferred + m_cLoadFactor - 1 ) / m_cLoadFactor ); // determine the new policy ENUMSTATE esNew = stateNil; if ( esCurrent == stateGrow ) { if ( cBucketPreferred < cBucketActive ) { esNew = stateShrinkFromGrow; } else if ( cBucketPreferred > cBucketActive ) { if ( cBucket == cBucketMax ) { esNew = stateSplitFromGrow; } } } else { DHTAssert( esCurrent == stateShrink ); if ( cBucketPreferred < cBucketActive ) { if ( cBucket == 0 ) { esNew = stateMergeFromShrink; } } else if ( cBucketPreferred > cBucketActive ) { esNew = stateGrowFromShrink; } } // enact the new policy if ( m_cOpSensitivity != cOpSensitivityLocal ) { m_cOpSensitivity = cOpSensitivityLocal; } if ( m_cBucketPreferred != cBucketPreferred ) { m_cBucketPreferred = cBucketPreferred; } if ( esNew ) { STTransition( esNew ); } else { m_semPolicy.Release(); } STATPolicySelection(); } void MaintainTable( HOTSTUFF* const phs ) { // decide on a new policy if we may have breached one of our // thresholds if ( phs->m_cOp > m_cOpSensitivity && m_semPolicy.CAvail() && m_semPolicy.FTryAcquire() ) { if ( phs->m_cOp > m_cOpSensitivity ) { SelectMaintenancePolicy( phs ); } else { m_semPolicy.Release(); } } // perform amortized work on the table as necessary if ( NcDIRIGetBucketMax( stateGrow ) + NcDIRIGetBucket( stateGrow ) < m_cBucketPreferred || m_cBucketPreferred < NcDIRIGetBucketMax( stateShrink ) + NcDIRIGetBucket( stateShrink ) ) { PerformMaintenance(); } } public: // calculate the address of the aligned block and store its offset (for free) static void* PvMEMIAlign( void* const pv, const size_t cbAlign ) { // round up to the nearest cache line // NOTE: this formula always forces an offset of at least 1 byte const ULONG_PTR ulp = ULONG_PTR( pv ); const ULONG_PTR ulpAligned = ( ( ulp + cbAlign ) / cbAlign ) * cbAlign; const 
ULONG_PTR ulpOffset = ulpAligned - ulp; DHTAssert( ulpOffset > 0 ); DHTAssert( ulpOffset <= cbAlign ); DHTAssert( ulpOffset == BYTE( ulpOffset ) ); // must fit into a single BYTE // store the offset BYTE *const pbAligned = (BYTE*)ulpAligned; pbAligned[ -1 ] = BYTE( ulpOffset ); // return the aligned block return (void*)pbAligned; } // retrieve the original unaligned block of memory from the aligned block static void* PvMEMIUnalign( void* const pv ) { // read the offset of the real block BYTE *const pbAligned = (BYTE*)pv; const BYTE bOffset = pbAligned[ -1 ]; DHTAssert( bOffset > 0 ); // return the real unaligned block return (void*)( pbAligned - bOffset ); } // allocate memory static void* PvMEMAlloc( const size_t cbSize, const size_t cbAlign = cbCacheLine ) { void* const pv = new BYTE[ cbSize + cbAlign ]; if ( pv ) { return PvMEMIAlign( pv, cbAlign ); } return NULL; } // free memory static void MEMFree( void* const pv ) { if ( pv ) { delete [] ((BYTE*)PvMEMIUnalign( pv )); } } private: // never written NativeCounter m_cLoadFactor; // preferred number of entries in a bucket at any given time NativeCounter m_centryBucket; // maximum number of entries per bucket NativeCounter m_cbBucket; // size in bytes of a bucket (rounded up to the nearest full cache-line) NativeCounter m_rankDHTrwlBucket; // rank of the reader/writer lock on each bucket HOTSTUFF *m_rghs; // array of HOTSTUFF objects (hashed per processor) NativeCounter m_chs; // size of HOTSTUFF array NativeCounter m_cbucketMin; // minimum number of buckets in the hash-table #ifdef _WIN64 BYTE m_rgbRsvdNever[ 8 ]; #else // !_WIN64 BYTE m_rgbRsvdNever[ 4 ]; #endif // _WIN64 // rarely written DIRPTRS m_dirptrs[ 2 ]; // directory pointers (2 copies) BYTE *m_rgrgBucket[ cbitNativeCounter ]; // directory (array of arrays of buckets) // no padding necessary // often written NativeCounter m_cOpSensitivity; // used to regulate policy changes NativeCounter m_cBucketPreferred; // preferred table size ENUMSTATE m_stateCur; 
// current state #ifdef _WIN64 BYTE m_rgbRsvdOften[ 44 ]; #else // !_WIN64 BYTE m_rgbRsvdOften[ 20 ]; #endif // _WIN64 // always written (second only to HOTSTUFF members) OSSYNC::CSemaphore m_semPolicy; // used to serialize policy changes long m_cCompletions; // counts the number of metered-section completions #ifdef _WIN64 BYTE m_rgbRsvdAlways[ 52 ]; #else // !_WIN64 BYTE m_rgbRsvdAlways[ 24 ]; #endif // _WIN64 #ifdef DHT_STATS // performance statistics long m_cBucketOverflowInsert; // count of overflow bucket allocations long m_cBucketOverflowDelete; // count of overflow bucket deletions long m_cBucketSplit; // count of bucket split operations long m_cBucketMerge; // count of bucket merge operations long m_cDirSplit; // count of directory split operations long m_cDirMerge; // count of directory merge operations long m_cTransition; // count of state transitions long m_cSelection; // count of policy selections long m_cSplitContend; // count of split contentions long m_cMergeContend; // count of merge contentions #ifdef _WIN64 BYTE m_rgbRsvdPerf[ 24 ]; #else // !_WIN64 BYTE m_rgbRsvdPerf[ 24 ]; #endif // _WIN64 #endif // DHT_STATS #ifdef DEBUG BOOL m_fInit; // initialization flag #endif // DEBUG }; ///////////////////////////////////////////////////////////////////////////////////// // // CDynamicHashTable< CKey, CEntry > // ///////////////////////////////////////////////////////////////////////////////////// // ctor template< class CKey, class CEntry > inline CDynamicHashTable< CKey, CEntry >:: CDynamicHashTable( const NativeCounter rankDHTrwlBucket ) : m_semPolicy( CSyncBasicInfo( "CDynamicHashTable::m_semPolicy" ) ) { #ifdef DEBUG m_fInit = fFalse; // zero-out this memory so the debugger won't print garbage memset( m_rgbRsvdNever, 0, sizeof( m_rgbRsvdNever ) ); memset( m_rgbRsvdOften, 0, sizeof( m_rgbRsvdOften ) ); memset( m_rgbRsvdAlways, 0, sizeof( m_rgbRsvdAlways ) ); #ifdef DHT_STATS memset( m_rgbRsvdPerf, 0, sizeof( m_rgbRsvdPerf ) ); #endif // DHT_STATS 
#endif // we should be on a 32-bit or 64-bit system #ifdef _WIN64 DHTAssert( 8 == sizeof( NativeCounter ) ); #else // _!WIN64 DHTAssert( 4 == sizeof( NativeCounter ) ); #endif // _WIN64 // capture the rank for each bucket m_rankDHTrwlBucket = rankDHTrwlBucket; // prepare each semaphore so it can have 1 owner m_semPolicy.Release(); } // dtor template< class CKey, class CEntry > inline CDynamicHashTable< CKey, CEntry >:: ~CDynamicHashTable() { } // initializes the dynamic hash table. if the table cannot be initialized, // errOutOfMemory will be returned template< class CKey, class CEntry > inline typename CDynamicHashTable< CKey, CEntry >::ERR CDynamicHashTable< CKey, CEntry >:: ErrInit( const double dblLoadFactor, const double dblUniformity, const NativeCounter cBucketMinimum ) { ERR err; NativeCounter ihs; DHTAssert( !m_fInit ); // initialize all data by its cache-line grouping // never written m_cLoadFactor = 0; m_centryBucket = 0; m_cbBucket = 0; m_rghs = NULL; m_chs = OSSYNC::OSSyncGetProcessorCountMax(); m_cbucketMin = 0; // rarely written memset( m_dirptrs, 0, sizeof( m_dirptrs ) ); memset( m_rgrgBucket, 0, sizeof( m_rgrgBucket ) ); // often written m_cOpSensitivity = 0; m_cBucketPreferred = cBucketMinimum; // NOTE: we cannot start in stateFreeze because we must go through the "halfway" completion // function so that we copy the directory ptrs safely m_stateCur = stateGrow; // always written m_cCompletions = 0; #ifdef DHT_STATS // performance statistics m_cBucketOverflowInsert = 0; m_cBucketOverflowDelete = 0; m_cBucketSplit = 0; m_cBucketMerge = 0; m_cDirSplit = 0; m_cDirMerge = 0; m_cTransition = 0; m_cSelection = 0; m_cSplitContend = 0; m_cMergeContend = 0; #endif // DHT_STATS // allocate the HOTSTUFF array m_rghs = (HOTSTUFF*)PvMEMAlloc( m_chs * sizeof( HOTSTUFF ), cbCacheLine ); if ( !m_rghs ) { err = errOutOfMemory; goto HandleError; } // construct the HOTSTUFF objects for ( ihs = 0; ihs < m_chs; ihs++ ) { new( m_rghs + ihs ) HOTSTUFF(); } // initialize 
the directory err = ErrDIRInit( NativeCounter( dblLoadFactor * dblUniformity ), cBucketMinimum ); if ( err != errSuccess ) { goto HandleError; } #ifdef DEBUG m_fInit = fTrue; #endif // DEBUG return errSuccess; HandleError: DHTAssert( err != errSuccess ); Term(); return err; } // terminates the dynamic hash table. this function can be called even if the // hash table has never been initialized or is only partially initialized // // NOTE: any data stored in the table at this time will be lost! template< class CKey, class CEntry > inline void CDynamicHashTable< CKey, CEntry >:: Term() { #ifdef DEBUG m_fInit = fFalse; #endif // DEBUG // term the directory DIRTerm(); if ( NULL != m_rghs ) { // delete the HOTSTUFF aray while ( m_chs ) { // destruct the object m_chs--; m_rghs[ m_chs ].HOTSTUFF::~HOTSTUFF(); } MEMFree( m_rghs ); m_rghs = NULL; } } // acquires a read lock on the specified key and returns the lock in the // provided lock context template< class CKey, class CEntry > inline void CDynamicHashTable< CKey, CEntry >:: ReadLockKey( const CKey& key, CLock* const plock ) { DHTAssert( m_fInit ); // verify the lock DHTAssert( plock->m_ls == CLock::lsNil ); // initialize the lock plock->m_ls = CLock::lsRead; // enter the state machine const int iGroup = UiSTEnter( &plock->m_phs ); const ENUMSTATE esCurrent = EsSTGetState(); // read-lock the key through the directory DIRReadLockKey( esCurrent, key, plock ); // try to seek to the key (sets up currency) BKTSeek( plock, key ); // leave the state machine STLeave( iGroup, plock->m_phs ); } // releases the read lock in the specified lock context template< class CKey, class CEntry > inline void CDynamicHashTable< CKey, CEntry >:: ReadUnlockKey( CLock* const plock ) { DHTAssert( m_fInit ); // verify the lock DHTAssert( FBKTRead( plock ) ); DHTAssert( plock->m_pBucketHead != NULL ); DHTAssert( plock->m_pBucketHead->CRWL().FReader() ); // unlock the key through the directory DIRReadUnlockKey( plock ); // reset the lock plock->m_ls 
= CLock::lsNil; } // acquires a write lock on the specified key and returns the lock in the // provided lock context template< class CKey, class CEntry > inline void CDynamicHashTable< CKey, CEntry >:: WriteLockKey( const CKey& key, CLock* const plock ) { DHTAssert( m_fInit ); // verify the lock DHTAssert( plock->m_ls == CLock::lsNil ); // initialize the lock plock->m_ls = CLock::lsWrite; plock->m_fInsertOrDelete = fFalse; // enter the state machine const int iGroup = UiSTEnter( &plock->m_phs ); const ENUMSTATE esCurrent = EsSTGetState(); // write-lock the key through the directory DIRWriteLockKey( esCurrent, key, plock ); // try to seek to the key (sets up currency) BKTSeek( plock, key ); // leave the state machine STLeave( iGroup, plock->m_phs ); } // releases the write lock in the specified lock context template< class CKey, class CEntry > inline void CDynamicHashTable< CKey, CEntry >:: WriteUnlockKey( CLock* const plock ) { DHTAssert( m_fInit ); // verify the lock DHTAssert( FBKTWrite( plock ) ); DHTAssert( plock->m_pBucketHead != NULL ); DHTAssert( plock->m_pBucketHead->CRWL().FWriter() ); // unlock the key through the directory DIRWriteUnlockKey( plock ); // we performed an insert or delete while holding the write lock if ( plock->m_fInsertOrDelete ) { // perform amortized maintenance on the table MaintainTable( plock->m_phs ); } // reset the lock plock->m_ls = CLock::lsNil; plock->m_fInsertOrDelete = fFalse; } // retrieves the entry corresponding to the key locked by the specified lock // context. 
if there is no entry for this key, errEntryNotFound will be // returned template< class CKey, class CEntry > inline typename CDynamicHashTable< CKey, CEntry >::ERR CDynamicHashTable< CKey, CEntry >:: ErrRetrieveEntry( CLock* const plock, CEntry* const pentry ) { DHTAssert( m_fInit ); // verify the lock DHTAssert( FBKTRead( plock ) || FBKTWrite( plock ) || FBKTScan( plock ) ); DHTAssert( plock->m_pBucketHead != NULL ); #ifdef DEBUG if ( FBKTRead( plock ) ) { DHTAssert( plock->m_pBucketHead->CRWL().FReader() ); } else { DHTAssert( plock->m_pBucketHead->CRWL().FWriter() ); } if ( FBKTRead( plock ) || FBKTWrite( plock ) ) { CKeyEntry *pKeyEntry; BKTGetEntry( plock, &pKeyEntry ); DHTAssert( pKeyEntry ? pKeyEntry->FEntryMatchesKey( plock->m_key ) : fTrue ); } #endif // get the entry return ErrBKTGetEntry( plock, pentry ); } // replaces the entry corresponding to the key locked by the specified lock // context. the key for the new entry must match the key for the old entry. // if there is no entry for this key, errNoCurrentEntry will be returned template< class CKey, class CEntry > inline typename CDynamicHashTable< CKey, CEntry >::ERR CDynamicHashTable< CKey, CEntry >:: ErrReplaceEntry( CLock* const plock, const CEntry& entry ) { DHTAssert( m_fInit ); // verify the lock DHTAssert( FBKTWrite( plock ) || FBKTScan( plock ) ); DHTAssert( plock->m_pBucketHead != NULL ); DHTAssert( plock->m_pBucketHead->CRWL().FWriter() ); #ifdef DEBUG if ( FBKTWrite( plock ) ) { CKeyEntry *pKeyEntry; BKTGetEntry( plock, &pKeyEntry ); DHTAssert( pKeyEntry ? pKeyEntry->FEntryMatchesKey( plock->m_key ) : fTrue ); DHTAssert( ((CKeyEntry &)entry).FEntryMatchesKey( plock->m_key ) ); } #endif // replace the entry return ErrBKTReplaceEntry( plock, entry ); } // inserts a new entry corresponding to the key locked by the specified lock // context. if there is already an entry with this key in the table, // errKeyDuplicate will be returned. 
if the new entry cannot be inserted, // errOutOfMemory will be returned template< class CKey, class CEntry > inline typename CDynamicHashTable< CKey, CEntry >::ERR CDynamicHashTable< CKey, CEntry >:: ErrInsertEntry( CLock* const plock, const CEntry& entry ) { DHTAssert( m_fInit ); // verify the lock DHTAssert( FBKTWrite( plock ) ); DHTAssert( plock->m_pBucketHead != NULL ); DHTAssert( plock->m_pBucketHead->CRWL().FWriter() ); /// DHTAssert( ((CKeyEntry &)entry).FEntryMatchesKey( plock->m_key ) ); // insert the entry const ERR err = ErrBKTInsertEntry( plock, entry ); if ( errSuccess == err ) { // maintain our stats STATInsertEntry( plock->m_phs ); // we have performed an insert plock->m_fInsertOrDelete = fTrue; } return err; } // deletes the entry corresponding to the key locked by the specified lock // context. if there is no entry for this key, errNoCurrentEntry will be // returned template< class CKey, class CEntry > inline typename CDynamicHashTable< CKey, CEntry >::ERR CDynamicHashTable< CKey, CEntry >:: ErrDeleteEntry( CLock* const plock ) { DHTAssert( m_fInit ); // verify the lock DHTAssert( FBKTWrite( plock ) || FBKTScan( plock ) ); DHTAssert( plock->m_pBucketHead != NULL ); DHTAssert( plock->m_pBucketHead->CRWL().FWriter() ); #ifdef DEBUG if ( FBKTWrite( plock ) ) { CKeyEntry *pKeyEntry; BKTGetEntry( plock, &pKeyEntry ); DHTAssert( pKeyEntry ? pKeyEntry->FEntryMatchesKey( plock->m_key ) : fTrue ); } #endif if ( FBKTScan( plock ) ) { // prepare the next-entry ptr so we can move-next after the delete // if we are deleting the last entry in the bucket, make this NULL // to force the cursor to move into the next hash bucket DHTAssert( plock->m_pBucket != NULL ); DHTAssert( plock->m_pEntryNext == NULL ); plock->m_pEntryNext = ( plock->m_pEntry != plock->m_pBucket->m_pEntryLast ) ? 
plock->m_pEntry : NULL; } // delete the entry const ERR err = ErrBKTDeleteEntry( plock ); if ( errSuccess == err ) { // maintain our stats STATDeleteEntry( plock->m_phs ); // we have performed a delete plock->m_fInsertOrDelete = fTrue; } return err; } // sets up the specified lock context in preparation for scanning all entries // in the hash table by physical storage order (i.e. not by key value order) // // NOTE: caller MUST terminate scan with EndHashScan to release any outstanding locks template< class CKey, class CEntry > inline void CDynamicHashTable< CKey, CEntry >:: BeginHashScan( CLock* const plock ) { DHTAssert( m_fInit ); // verify the lock DHTAssert( plock->m_ls == CLock::lsNil ); // initialize the lock to start scanning at the first bucket (it may be empty!) plock->m_ls = CLock::lsScan; plock->m_fInsertOrDelete = fFalse; plock->m_iBucket = 0; // enter the state machine const int iGroup = UiSTEnter( &plock->m_phs ); const ENUMSTATE esCurrent = EsSTGetState(); // hash to the bucket we want (this may require a retry in grow/shrink mode) DHTAssert( plock->m_pBucketHead == NULL ); plock->m_pBucketHead = PbucketDIRIHash( esCurrent, plock->m_iBucket ); // acquire the lock as a writer plock->m_pBucketHead->CRWL().EnterAsWriter(); // NOTE: do not retry the hash function here because bucket 0 will never disappear // leave the state machine STLeave( iGroup, plock->m_phs ); // set up the currency as before-first plock->m_pBucket = plock->m_pBucketHead; plock->m_pEntryPrev = NULL; plock->m_pEntry = NULL; plock->m_pEntryNext = plock->m_pBucketHead->m_pb != NULL ? &plock->m_pBucketHead->m_rgEntry[0] : NULL; } // sets up the specified lock context in preparation for scanning all entries // in the hash table by physical storage order (i.e. 
not by key value order) // // NOTE: caller MUST terminate scan with EndHashScan to release any outstanding locks template< class CKey, class CEntry > inline void CDynamicHashTable< CKey, CEntry >:: BeginHashScanFromKey( const CKey& key, CLock* const plock ) { NativeCounter cBucket; NativeCounter cBucketMax; NativeCounter iHash; DHTAssert( m_fInit ); // verify the lock DHTAssert( plock->m_ls == CLock::lsNil ); // initialize the lock plock->m_ls = CLock::lsScan; plock->m_fInsertOrDelete = fFalse; // enter the state machine const int iGroup = UiSTEnter( &plock->m_phs ); const ENUMSTATE esCurrent = EsSTGetState(); // write-lock the key through the directory DIRWriteLockKey( esCurrent, key, plock ); // calculate the current bucket configuration // // NOTES ON WHY THIS WILL WORK: // // cBucket may increase/decrease if we are in grow/shrink mode, but this won't effect the // calculation below unless it grows ahead of OR shrinks behind the bucket at iHash; // since we have the bucket at iHash locked, it cannot grow/shrink // cBucketMax cannot change unless we are in split mode, and even then we will be reading from the // COPY of the cBucketMax -- not the real cBucketMax which is changing cBucket = NcDIRIGetBucket( esCurrent ); cBucketMax = NcDIRIGetBucketMax( esCurrent ); DHTAssert( cBucketMax != 0 ); // calculate the hash value and normalize it within the limits of the current bucket configuration iHash = CKeyEntry::Hash( key ); iHash = iHash & ( ( cBucketMax - 1 ) + cBucketMax ); if ( iHash >= cBucketMax + cBucket ) iHash -= cBucketMax; // remember which bucket we locked plock->m_iBucket = iHash; #ifdef DEBUG { // verify that we have the correct bucket locked using only iHash NativeCounter iExponent; NativeCounter iRemainder; DIRILog2( iHash, &iExponent, &iRemainder ); const PBUCKET pbucketT = PbucketDIRIResolve( iExponent, iRemainder ); DHTAssert( pbucketT == plock->m_pBucketHead ); DHTAssert( pbucketT->CRWL().FWriter() ); } #endif // DEBUG // leave the state machine 
STLeave( iGroup, plock->m_phs ); // set up the currency as before-first plock->m_pBucket = plock->m_pBucketHead; plock->m_pEntryPrev = NULL; plock->m_pEntry = NULL; plock->m_pEntryNext = plock->m_pBucketHead->m_pb != NULL ? &plock->m_pBucketHead->m_rgEntry[0] : NULL; } // moves the specified lock context to the next entry in the hash table by // physical storage order. if the end of the index is reached, // errNoCurrentEntry is returned. template< class CKey, class CEntry > inline typename CDynamicHashTable< CKey, CEntry >::ERR CDynamicHashTable< CKey, CEntry >:: ErrMoveNext( CLock* const plock, BOOL* const pfNewBucket ) { DHTAssert( m_fInit ); // verify the lock DHTAssert( FBKTScan( plock ) ); DHTAssert( plock->m_pEntryPrev == NULL ); // move to the next entry in this bucket if ( plock->m_pEntry ) { // we are already on an existing entry if ( plock->m_pEntry + 1 < PentryBKTNextMost( plock->m_pBucket ) ) { // we have not reached the end of the current BUCKET plock->m_pEntry++; } else { // we are at the end of the current BUCKET plock->m_pBucket = PbucketBKTNext( plock->m_pBucket ); if ( plock->m_pBucket ) { // we moved to the next BUCKET plock->m_pEntry = &plock->m_pBucket->m_rgEntry[0]; } else { // there are no more BUCKET structures in this chain plock->m_pEntry = NULL; } } } else { // we are not on an entry (before-first or after-last) plock->m_pEntry = plock->m_pEntryNext; } plock->m_pEntryNext = NULL; if ( plock->m_pEntry != NULL ) { // we moved to an entry successfully DHTAssert( plock->m_pBucket ); if ( pfNewBucket ) { *pfNewBucket = fFalse; } return errSuccess; } // try to move to the next hash-bucket if ( pfNewBucket ) { *pfNewBucket = fTrue; } return ErrSCANMoveNext( plock ); } // terminates a scan by releasing all outstanding locks and reset the lock context template< class CKey, class CEntry > inline void CDynamicHashTable< CKey, CEntry >:: EndHashScan( CLock* const plock ) { DHTAssert( m_fInit ); // verify the lock DHTAssert( FBKTScan( plock ) ); 
DHTAssert( plock->m_pEntryPrev == NULL ); if ( plock->m_pBucketHead != NULL ) { // unlock the current bucket plock->m_pBucketHead->CRWL().LeaveAsWriter(); plock->m_pBucketHead = NULL; // we performed an insert or delete while holding the write lock if ( plock->m_fInsertOrDelete ) { // perform amortized maintenance on the table MaintainTable( plock->m_phs ); } } // reset the lock plock->m_ls = CLock::lsNil; plock->m_fInsertOrDelete = fFalse; } }; // namespace DHT using namespace DHT; #endif // __DHT_HXX_INCLUDED