Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

4391 lines
110 KiB

  1. #ifndef _DHT_HXX_INCLUDED
  2. #define _DHT_HXX_INCLUDED
  3. #pragma warning ( disable : 4200 ) // we allow zero sized arrays
  4. // asserts
  5. //
  6. // #define DHTAssert to point to your favorite assert function per #include
  7. #ifdef DHTAssert
  8. #else // !DHTAssert
  9. #define DHTAssert Assert
  10. #endif // DHTAssert
  11. #include <sync.hxx>
  12. #ifdef DEBUG
  13. // turns on unique names for bucket reader/writer locks (adds 60 bytes per BUCKET)
  14. #define UNIQUE_BUCKET_NAMES
  15. #ifdef UNIQUE_BUCKET_NAMES
  16. #include <stdio.h>
  17. #endif // UNIQUE_BUCKET_NAMES
  18. #endif
  19. #ifdef DEBUGGER_EXTENSION
  20. class CPRINTF;
  21. #endif
  22. namespace DHT {
  23. /////////////////////////////////////////////////////////////////////////////////////////
  24. // CDynamicHashTable
  25. //
  26. // Implements a dynamically resizable hash table of entries stored using a unique key
  27. //
  28. // CKey = class representing keys used to identify entries in the hash table
  29. // CEntry = class representing entries stored in the hash table
  30. // (required copy-constructor)
  31. template< class CKey, class CEntry >
  32. class CDynamicHashTable
  33. {
  34. public:
  35. // counter type (uses native word size of machine)
  36. typedef ULONG_PTR NativeCounter;
  37. // class controlling the Key and Entry for each entry in the hash table
  38. //
  39. // NOTE: All member functions must be defined by the user per instance
  40. // of this template. These functions must be defined after the
  41. // template definition. Declaring these functions to be inline
  42. // will allow full optimization by the compiler!
  43. class CKeyEntry
  44. {
  45. public:
  46. // produces the hash value for the specified key. this hash
  47. // function should produce numbers as uniformly as possible over
  48. // as large a range as possible for good performance
  49. static NativeCounter Hash( const CKey& key );
  50. // produces the hash value for this entry's key. this hash
  51. // function should produce the same number as the above function
  52. // for the same key
  53. NativeCounter Hash() const;
  54. // returns fTrue if this entry matches the given key. this way,
  55. // the key doesn't necessarily have to be stored in the hash table
  56. // entry
  57. //
  58. // e.g.: CEntry can be PBF and key can be IFMP/PGNO where the
  59. // actual IFMP/PGNO is stored in the BF structure. this would
  60. // ruin cache locality, of course, but it would use less memory
  61. //
  62. // note that the entry could also contain some kind of hash value
  63. // for the key allowing some weeding out of entries before jumping
  64. // off to the full structure for a full comparison. an example
  65. // of this would be the SPAIRs from SORT
  66. BOOL FEntryMatchesKey( const CKey& key ) const;
  67. // sets the contained entry to the given entry
  68. void SetEntry( const CEntry& entry );
  69. // gets the contained entry
  70. void GetEntry( CEntry* const pentry ) const;
  71. public:
  72. CEntry m_entry;
  73. ~CKeyEntry(); // not allowed
  74. private:
  75. CKeyEntry(); // not allowed
  76. CKeyEntry *operator =( const CKeyEntry & ); // not allowed
  77. };
  78. // API Error Codes
  79. enum ERR
  80. {
  81. errSuccess, // success
  82. errOutOfMemory, // not enough memory
  83. errInvalidParameter, // bad argument to function
  84. errEntryNotFound, // entry was not found
  85. errNoCurrentEntry, // currency is invalid
  86. errKeyDuplicate, // cannot insert because key already exists
  87. };
  88. // API Lock Context
  89. class CLock;
  90. public:
  91. CDynamicHashTable( const NativeCounter rankDHTrwlBucket );
  92. ~CDynamicHashTable();
  93. ERR ErrInit( const double dblLoadFactor,
  94. const double dblUniformity,
  95. const NativeCounter cBucketMinimum = 0 );
  96. void Term();
  97. void ReadLockKey( const CKey& key, CLock* const plock );
  98. void ReadUnlockKey( CLock* const plock );
  99. void WriteLockKey( const CKey& key, CLock* const plock );
  100. void WriteUnlockKey( CLock* const plock );
  101. ERR ErrRetrieveEntry( CLock* const plock, CEntry* const pentry );
  102. ERR ErrReplaceEntry( CLock* const plock, const CEntry& entry );
  103. ERR ErrInsertEntry( CLock* const plock, const CEntry& entry );
  104. ERR ErrDeleteEntry( CLock* const plock );
  105. void BeginHashScan( CLock* const plock );
  106. void BeginHashScanFromKey( const CKey& key, CLock* const plock );
  107. ERR ErrMoveNext( CLock* const plock, BOOL* const pfNewBucket = NULL );
  108. void EndHashScan( CLock* const plock );
  109. #ifdef DEBUGGER_EXTENSION
  110. VOID Dump( CPRINTF * pcprintf, const DWORD_PTR dwOffset = 0 ) const;
  111. VOID Scan( CPRINTF * pcprintf, VOID * pv ) const;
  112. #endif
  113. #ifdef DHT_STATS
  114. long CBucketOverflow() const { return m_cBucketOverflowInsert; }
  115. long CBucketSplit() const { return m_cBucketSplit; }
  116. long CBucketMerge() const { return m_cBucketMerge; }
  117. long CDirectorySplit() const { return m_cDirSplit; }
  118. long CDirectoryMerge() const { return m_cDirMerge; }
  119. long CStateTransition() const { return m_cTransition; }
  120. long CPolicySelection() const { return m_cSelection; }
  121. long CSplitContend() const { return m_cSplitContend; }
  122. long CMergeContend() const { return m_cMergeContend; }
  123. #else // !DHT_STATS
  124. long CBucketOverflow() const { return 0; }
  125. long CBucketSplit() const { return 0; }
  126. long CBucketMerge() const { return 0; }
  127. long CDirectorySplit() const { return 0; }
  128. long CDirectoryMerge() const { return 0; }
  129. long CStateTransition() const { return 0; }
  130. long CPolicySelection() const { return 0; }
  131. long CSplitContend() const { return 0; }
  132. long CMergeContend() const { return 0; }
  133. #endif // DHT_STATS
  134. private:
  135. // possible states for the hash-table
  136. //
  137. // DANGER! DANGER! DANGER WILL ROBINSON!
  138. //
  139. // DO NOT CHANGE THE ENUMATION VALUES! CODE IS DEPENDANT ON THEM BEING AS THEY ARE!
  140. // (specifically, I do "stateCur >> 4" to test for 0x10000 so I can see if we are splitting)
  141. //
  142. // DANGER! DANGER! DANGER WILL ROBINSON!
  143. enum ENUMSTATE
  144. {
  145. stateNil = 0,
  146. stateShrinkFromGrow = 1,
  147. stateShrinkFromGrow2 = 2,
  148. stateGrowFromShrink = 3,
  149. stateGrowFromShrink2 = 4,
  150. stateSplitFromGrow = 5,
  151. stateSplitFromGrow2 = 6,
  152. stateGrowFromSplit = 7,
  153. stateGrowFromSplit2 = 8,
  154. stateMergeFromShrink = 9,
  155. stateMergeFromShrink2 = 10,
  156. stateShrinkFromMerge = 11,
  157. stateShrinkFromMerge2 = 12,
  158. stateUnused = 13,
  159. stateGrow = 14,
  160. stateShrink = 15,
  161. stateSplit = 16,
  162. stateMerge = 17,
  163. };
  164. // Constants
  165. enum { cbitByte = 8 }; // bits per byte
  166. enum { cbitNativeCounter = sizeof( NativeCounter ) * cbitByte }; // bits per NativeCounter
  167. // BUCKET
  168. //
  169. // - this is the individual unit of allocation for each logical bucket
  170. // - each BUCKET contains several CKeyEntry objects packed together
  171. // - BUCKETs are chained together to make up the entire logical bucket
  172. struct BUCKET
  173. {
  174. public:
  175. // read-write-lock/prev-ptr
  176. // in the primary BUCKET (allocated as a part of an array), this is the read-write-lock
  177. // in secondary BUCKETs, this is the prev-ptr for reverse traversal
  178. union
  179. {
  180. BYTE m_rgbRWL[ sizeof( OSSYNC::CReaderWriterLock ) ];
  181. BUCKET *m_pBucketPrev;
  182. };
  183. // next/end pointer
  184. // when this points outside of the array of buckets, it points to the next BUCKET
  185. // when this points inside of the array of buckets, it points to the first free entry
  186. union
  187. {
  188. BYTE *m_pb;
  189. BUCKET *m_pBucketNext;
  190. CKeyEntry *m_pEntryLast;
  191. };
  192. // array of entries (it will contain 'load-factor' entries)
  193. CKeyEntry m_rgEntry[];
  194. public:
  195. // return the properly typed CReaderWriterLock
  196. OSSYNC::CReaderWriterLock& CRWL() const
  197. {
  198. return (OSSYNC::CReaderWriterLock &)m_rgbRWL;
  199. }
  200. };
  201. typedef BUCKET* PBUCKET;
  202. // BUCKETPool
  203. //
  204. // pool of BUCKET structures (reservation system for bucket split/merge)
  205. class BUCKETPool
  206. {
  207. public:
  208. PBUCKET m_pReserve; // list of BUCKET structures available for reservation
  209. long m_cReserve; // number of BUCKET structures available to be reserved
  210. OSSYNC::CSemaphore m_semReserve; // protection for reservation ptrs
  211. #ifdef _WIN64
  212. BYTE m_rgbRsvd[ 40 ];
  213. #else // !_WIN64
  214. BYTE m_rgbRsvd[ 20 ];
  215. #endif // _WIN64
  216. public:
  217. BUCKETPool()
  218. : m_semReserve( CSyncBasicInfo( "CDynamicHashTable::BUCKETPool::m_semReserve" ) )
  219. {
  220. // initialize vars
  221. m_pReserve = NULL;
  222. m_cReserve = 0;
  223. // prepare the semaphore to have 1 owner
  224. m_semReserve.Release();
  225. #ifdef DEBUG
  226. memset( m_rgbRsvd, 0, sizeof( m_rgbRsvd ) );
  227. #endif // DEBUG
  228. }
  229. // terminate
  230. ~BUCKETPool()
  231. {
  232. while ( m_pReserve )
  233. {
  234. PBUCKET pBucket;
  235. pBucket = m_pReserve;
  236. m_pReserve = m_pReserve->m_pBucketNext;
  237. MEMFree( pBucket );
  238. }
  239. m_cReserve = 0;
  240. }
  241. // reserve a BUCKET structure
  242. // "allocate" a bucket from the list by decrementing the counter of available buckets
  243. // if the counter went below zero, we need add a bucket to the list now (or fail)
  244. // to make sure we can honor the request later
  245. BOOL FPOOLReserve( const NativeCounter cbBucket )
  246. {
  247. // reserve a bucket using the counter
  248. if ( AtomicDecrement( (long*)&m_cReserve ) >= 0 )
  249. {
  250. return fTrue;
  251. }
  252. // reserve a bucket from the heap
  253. else
  254. {
  255. return FPOOLReserve_( cbBucket );
  256. }
  257. }
  258. BOOL FPOOLReserve_( const NativeCounter cbBucket )
  259. {
  260. // at this point, we need to increment m_cReserve for 1 of 2 reasons:
  261. // the allocation will succeed and we will add the new bucket to the list
  262. // the allocation will fail and we can't leave without "deallocating" the bucket
  263. AtomicIncrement( (long*)&m_cReserve );
  264. // we need to allocate a bucket and add it to the list (to back the reservation we want)
  265. const PBUCKET pBucket = PBUCKET( PvMEMAlloc( cbBucket ) );
  266. if ( pBucket )
  267. {
  268. // add the bucket to the list
  269. m_semReserve.Acquire();
  270. pBucket->m_pBucketNext = m_pReserve;
  271. m_pReserve = pBucket;
  272. m_semReserve.Release();
  273. // reservation succeeded
  274. return fTrue;
  275. }
  276. // the allocation failed so the reservation cannot succeed
  277. return fFalse;
  278. }
  279. // commit a reservation
  280. BUCKET *PbucketPOOLCommit()
  281. {
  282. PBUCKET pBucketReserve;
  283. // assign a bucket to the reservation
  284. m_semReserve.Acquire();
  285. pBucketReserve = m_pReserve;
  286. DHTAssert( pBucketReserve );
  287. m_pReserve = m_pReserve->m_pBucketNext;
  288. m_semReserve.Release();
  289. // return the bucket
  290. return pBucketReserve;
  291. }
  292. // release the reservation
  293. void POOLUnreserve()
  294. {
  295. // "deallocate" the bucket that was previously reserved
  296. AtomicIncrement( (long*)&m_cReserve );
  297. }
  298. };
  299. // HOTSTUFF
  300. //
  301. // "hot" elements of the hash-table (hashed to array of size 2*cProcessor elems)
  302. //
  303. // 32 bytes on WIN32
  304. // 64 bytes on WIN64
  305. //
  306. struct HOTSTUFF
  307. {
  308. public:
  309. NativeCounter m_cEntry; // counter for entries
  310. NativeCounter m_cOp; // counter for inserts/deletes
  311. OSSYNC::CMeteredSection m_cms; // metered section for changing states
  312. #ifdef _WIN64
  313. BYTE m_rgbRsvd[ 24 ]; // alignment padding
  314. #else // !_WIN64
  315. BYTE m_rgbRsvd[ 12 ]; // alignment padding
  316. #endif // _WIN64
  317. BUCKETPool m_bucketpool; // pool of BUCKET blobs
  318. HOTSTUFF()
  319. : m_cms()
  320. {
  321. m_cEntry = 0;
  322. m_cOp = 0;
  323. #ifdef DEBUG
  324. memset( m_rgbRsvd, 0, sizeof( m_rgbRsvd ) );
  325. #endif // DEBUG
  326. }
  327. };
  328. // DIRPTRS
  329. //
  330. // containment for the directory pointers
  331. // these pointers control the use of the directory itself (m_rgrgBucket)
  332. //
  333. // the hash table will always have a minimum of 2 buckets (0 and 1) in the directory
  334. //
  335. // buckets are stored in dynamically allocated arrays which are pointed to by the directory
  336. // each array is 2 times larger than the previous array (exponential growth)
  337. // e.g. the Nth array (m_rgrgBucket[N]) contains 2^N contiguous buckets
  338. // NOTE: the 0th array is special in that it contains an extra element making its total 2 elements
  339. // (normally, 2^0 == 1 element; this is done for magical reasons to be explained later)
  340. // thus, the total number of entries for a given N is:
  341. // N
  342. // 1 + SUM 2^i --> 1 + [ 2^(N+1) - 1 ] --> 2^(N+1)
  343. // i=0
  344. //
  345. // we know the total number of distinct hash values is a power of 2 (it must fit into a NativeCounter)
  346. // we can represent this with 2^M where M is the number of bits in a NativeCounter
  347. // therefore, assuming the above system of exponential growth,
  348. // we know that we can store the total number of hash buckets required at any given time so long as N = M
  349. // in other words, N = # of bits in NativeCounter --> sizeof( NativeCounter ) * 8
  350. //
  351. // therefore, we can statically allocate the array of bucket arrays
  352. // and, we can use LOG2 to compute the bucket address of any given hash value
  353. // (exceptions: DIRILog2( 0 ) => 0, 0 and DIRILog2( 1 ) => 0, 1)
  354. //
  355. // for an explaination of m_cBucketMax and m_cBucket you should read the paper on
  356. // Dynamic Hashing by Per Ake Larson
  357. //
  358. // 160 bytes on WIN32 (5 cache lines)
  359. // 320 bytes on WIN64 (10 cache lines)
  360. struct DIRPTRS
  361. {
  362. NativeCounter m_cBucketMax; // half-way to last bucket in split iteration (2^(n-1))
  363. NativeCounter m_cBucket; // destination of next split (0 to 2^(n-1)), must add to m_cBucketMax
  364. #ifdef _WIN64
  365. BYTE m_rgbRsvd[ 16 ]; // alignment padding
  366. #else // !_WIN64
  367. BYTE m_rgbRsvd[ 8 ]; // alignment padding
  368. #endif // _WIN64
  369. };
  370. // CLock
  371. //
  372. // - lock context for read/write/scan operations on the hash-table
  373. // - tracks currency within a bucket
  374. // - access is restricted to the dynamic-hash-table
  375. public:
  376. class CLock
  377. {
  378. friend class CDynamicHashTable< CKey, CEntry >;
  379. public:
  380. // possible states for a lock context (class CLock)
  381. enum ENUMLOCKSTATE
  382. {
  383. lsNil = 0, // lock is not used
  384. lsRead = 1, // lock is being used to read a particular CKeyEntry object
  385. lsWrite = 2, // lock is being used to write a particular CKeyEntry object
  386. lsScan = 3, // lock is being used to scan the hash-table
  387. };
  388. public:
  389. CLock()
  390. {
  391. m_ls = lsNil;
  392. m_pBucketHead = NULL;
  393. }
  394. ~CLock()
  395. {
  396. DHTAssert( m_pBucketHead == NULL );
  397. }
  398. private:
  399. // lock state
  400. ENUMLOCKSTATE m_ls; // current state of this lock context
  401. BOOL m_fInsertOrDelete;
  402. // HOTSTUFF pointer
  403. HOTSTUFF *m_phs;
  404. #ifdef DEBUG
  405. // debug-only parameters
  406. CKey m_key; // track the key that should be locked
  407. #endif
  408. // ptr to the first BUCKET
  409. BUCKET *m_pBucketHead;
  410. // ptr to the current BUCKET
  411. BUCKET *m_pBucket; // current BUCKET
  412. // ISAM-style cursor on current BUCKET (m_pBucket)
  413. CKeyEntry *m_pEntryPrev; // previous entry
  414. CKeyEntry *m_pEntry; // current entry
  415. CKeyEntry *m_pEntryNext; // next entry
  416. // current bucket (used in scan-mode only)
  417. NativeCounter m_iBucket; // current bucket
  418. };
  419. /////////////////////////////////////////////////////////////////////////////////////////
  420. //
  421. // state machine
  422. //
  423. const int UiSTEnter( HOTSTUFF **pphs )
  424. {
  425. // hash to the HOTSTUFF structure
  426. *pphs = HOTSTUFFHash();
  427. // enter the metered section
  428. return ( *pphs )->m_cms.Enter();
  429. }
  430. void STLeave( const int group, HOTSTUFF *phs )
  431. {
  432. phs->m_cms.Leave( group );
  433. }
  434. const ENUMSTATE EsSTGetState() const
  435. {
  436. return m_stateCur;
  437. }
  438. void STTransition( const ENUMSTATE esNew )
  439. {
  440. // initiate a transition to the desired state
  441. m_stateCur = esNew;
  442. m_cCompletions = 0;
  443. for ( NativeCounter ihs = 0; ihs < m_chs; ihs++ )
  444. {
  445. m_rghs[ ihs ].m_cms.Partition( OSSYNC::CMeteredSection::PFNPARTITIONCOMPLETE( STCompletion_ ), DWORD_PTR( this ) );
  446. }
  447. }
  448. static void STCompletion_( CDynamicHashTable< CKey, CEntry >* pdht )
  449. {
  450. pdht->STCompletion();
  451. }
  452. void STCompletion()
  453. {
  454. // state transition table
  455. typedef void (CDynamicHashTable< CKey, CEntry >::*PfnCompletion)();
  456. struct StateTransitionTable
  457. {
  458. PfnCompletion m_pfnCompletion;
  459. ENUMSTATE m_stNext;
  460. };
  461. static const StateTransitionTable rgstt[] =
  462. {
  463. /* stateNil */ { NULL, stateNil, },
  464. /* stateShrinkFromGrow */ { NULL, stateShrinkFromGrow2, },
  465. /* stateShrinkFromGrow2 */ { NULL, stateShrink, },
  466. /* stateGrowFromShrink */ { NULL, stateGrowFromShrink2, },
  467. /* stateGrowFromShrink2 */ { NULL, stateGrow, },
  468. /* stateSplitFromGrow */ { NULL, stateSplitFromGrow2, },
  469. /* stateSplitFromGrow2 */ { STCompletionCopyDir, stateSplit, },
  470. /* stateGrowFromSplit */ { NULL, stateGrowFromSplit2, },
  471. /* stateGrowFromSplit2 */ { NULL, stateGrow, },
  472. /* stateMergeFromShrink */ { NULL, stateMergeFromShrink2, },
  473. /* stateMergeFromShrink2 */ { STCompletionCopyDir, stateMerge, },
  474. /* stateShrinkFromMerge */ { NULL, stateShrinkFromMerge2, },
  475. /* stateShrinkFromMerge2 */ { NULL, stateShrink, },
  476. /* stateUnused */ { NULL, stateNil, },
  477. /* stateGrow */ { STCompletionGrowShrink, stateNil, },
  478. /* stateShrink */ { STCompletionGrowShrink, stateNil, },
  479. /* stateSplit */ { STCompletionSplit, stateGrowFromSplit, },
  480. /* stateMerge */ { STCompletionMerge, stateShrinkFromMerge, },
  481. };
  482. // all metered sections have transitioned to the new state
  483. if ( NativeCounter( AtomicIncrement( &m_cCompletions ) ) >= m_chs )
  484. {
  485. STATStateTransition();
  486. // save the current state as it may change as a side-effect of
  487. // calling the completion function
  488. const ENUMSTATE esCurrent = EsSTGetState();
  489. // if there is a completion function for this state then call it
  490. if ( rgstt[ esCurrent ].m_pfnCompletion )
  491. {
  492. (this->*rgstt[ esCurrent ].m_pfnCompletion)();
  493. }
  494. // if there is a next state then immediately begin the transition to that state
  495. if ( rgstt[ esCurrent ].m_stNext )
  496. {
  497. STTransition( rgstt[ esCurrent ].m_stNext );
  498. }
  499. }
  500. }
  501. void STCompletionCopyDir()
  502. {
  503. // backup the bucket ptrs for use during the split/merge process
  504. memcpy( &m_dirptrs[ 1 ], &m_dirptrs[ 0 ], sizeof( DIRPTRS ) );
  505. }
  506. void STCompletionGrowShrink()
  507. {
  508. // enable the selection of a new maintenance policy
  509. m_semPolicy.Release();
  510. }
  511. void STCompletionSplit()
  512. {
  513. // split the directory
  514. DIRISplit();
  515. }
  516. void STCompletionMerge()
  517. {
  518. // merge the directory
  519. DIRIMerge();
  520. }
  521. /////////////////////////////////////////////////////////////////////////////////////////
  522. //
  523. // directory
  524. //
  525. // initialize the directory, possible allocating some buckets
  526. ERR ErrDIRInit( const NativeCounter cLoadFactor, const NativeCounter cbucketMin )
  527. {
  528. ERR err;
  529. NativeCounter iExponent;
  530. NativeCounter iRemainder;
  531. // check params
  532. if ( cLoadFactor < 1 )
  533. {
  534. return errInvalidParameter;
  535. }
  536. // setup the main paramters
  537. m_cLoadFactor = cLoadFactor;
  538. // calculate the bucket size, accounting for:
  539. //
  540. // - bucket header
  541. // - enough room for twice the load factor to eliminate overflow
  542. // buckets with uniform hashing
  543. // - room for an additional entry to give us some flexibility in
  544. // our actual load factor to reduce maintenance overhead
  545. // - cache line alignment of the bucket
  546. m_cbBucket = sizeof( BUCKET ) + ( cLoadFactor * 2 + 1 ) * sizeof( CKeyEntry );
  547. m_cbBucket = ( ( m_cbBucket + cbCacheLine - 1 ) / cbCacheLine ) * cbCacheLine;
  548. // calculate the number of entries we can fit into a single bucket
  549. // NOTE: this may be larger than intended because we rounded the bucket size up the nearest cache-line
  550. m_centryBucket = ( m_cbBucket - sizeof( BUCKET ) ) / sizeof( CKeyEntry );
  551. // calculate the minimum number of buckets using the following lower-bounds:
  552. // cbucketMin (user parameter)
  553. // # of processors (make sure we have atleast 1 bucket/proc as an attempt to minimize contention)
  554. // 2 (hash table assumes atleast 2 buckets)
  555. m_cbucketMin = max( cbucketMin, NativeCounter( OSSYNC::OSSyncGetProcessorCountMax() ) );
  556. m_cbucketMin = max( m_cbucketMin, 2 );
  557. // align the minimum number of buckets to the next highest power of 2 (unless it's already a power of 2)
  558. DIRILog2( m_cbucketMin, &iExponent, &iRemainder );
  559. if ( iRemainder )
  560. {
  561. if ( ++iExponent >= cbitNativeCounter )
  562. {
  563. return errInvalidParameter; // could not round up without overflowing
  564. }
  565. }
  566. m_cbucketMin = 1 << iExponent;
  567. // setup the directory pointers
  568. m_dirptrs[ 0 ].m_cBucketMax = m_cbucketMin / 2;
  569. m_dirptrs[ 0 ].m_cBucket = m_cbucketMin / 2;
  570. // SPECIAL CASE: allocate 2 entries for the first bucket array
  571. // (we always do this because we always have atleast 2 buckets)
  572. err = ErrDIRInitBucketArray( 2, 0, &m_rgrgBucket[ 0 ] );
  573. if ( errSuccess != err )
  574. {
  575. return err;
  576. }
  577. // allocate memory for all other initial bucket arrays
  578. for ( iExponent = 1; ( NativeCounter( 1 ) << iExponent ) < m_cbucketMin; iExponent++ )
  579. {
  580. err = ErrDIRInitBucketArray( 1 << iExponent, 1 << iExponent, &m_rgrgBucket[ iExponent ] );
  581. if ( errSuccess != err )
  582. {
  583. return err;
  584. }
  585. }
  586. // clear the second set of directory ptrs
  587. memset( &m_dirptrs[ 1 ], 0, sizeof( DIRPTRS ) );
  588. return errSuccess;
  589. }
  590. // cleanup all memory by destructing it then freeing it
  591. void DIRTerm()
  592. {
  593. NativeCounter iExponent;
  594. // SPECIAL CASE: term the first bucket array (contains 2 entries)
  595. // (we will always do this because the hash-table will always contain atleast 2 entries)
  596. if ( m_rgrgBucket[ 0 ] )
  597. {
  598. DIRTermBucketArray( m_rgrgBucket[ 0 ], 2 );
  599. m_rgrgBucket[ 0 ] = NULL;
  600. }
  601. // term all other bucket arrays
  602. for ( iExponent = 1; iExponent < cbitNativeCounter; iExponent++ )
  603. {
  604. if ( m_rgrgBucket[ iExponent ] )
  605. {
  606. DIRTermBucketArray( m_rgrgBucket[ iExponent ], 1 << iExponent );
  607. m_rgrgBucket[ iExponent ] = NULL;
  608. }
  609. }
  610. // reset both copies of the directory pointers
  611. memset( m_dirptrs, 0, sizeof( m_dirptrs ) );
  612. }
  613. // lock a key for read operations
  614. void DIRReadLockKey( const ENUMSTATE esCurrent, const CKey &key, CLock * const plock ) const
  615. {
  616. NativeCounter iHash;
  617. NativeCounter iBucket;
  618. NativeCounter cBucketBefore;
  619. NativeCounter cBucketAfter;
  620. NativeCounter cBucketMax;
  621. // verify the lock
  622. DHTAssert( FBKTRead( plock ) );
  623. DHTAssert( plock->m_pBucketHead == NULL );
  624. #ifdef DEBUG
  625. // remember the key we are locking
  626. plock->m_key = key;
  627. #endif
  628. // hash to the bucket we want (this may require a retry in grow/shrink mode)
  629. iHash = CKeyEntry::Hash( key );
  630. plock->m_pBucketHead = PbucketDIRIHash( esCurrent, iHash, &iBucket, &cBucketBefore );
  631. // acquire the lock as a reader
  632. plock->m_pBucketHead->CRWL().EnterAsReader();
  633. // the entry may have moved as the result of a bucket split/merge
  634. cBucketAfter = NcDIRIGetBucket( esCurrent );
  635. cBucketMax = NcDIRIGetBucketMax( esCurrent );
  636. if ( cBucketBefore != cBucketAfter &&
  637. ( cBucketBefore <= iBucket && iBucket < cBucketAfter ||
  638. cBucketMax + cBucketAfter <= iBucket && iBucket < cBucketMax + cBucketBefore ) )
  639. {
  640. // unlock the old bucket
  641. plock->m_pBucketHead->CRWL().LeaveAsReader();
  642. // hash to the bucket we want (this cannot fail more than once)
  643. plock->m_pBucketHead = PbucketDIRIHash( esCurrent, iHash );
  644. // lock the new bucket
  645. plock->m_pBucketHead->CRWL().EnterAsReader();
  646. }
  647. // we should now have the correct bucket locked
  648. DHTAssert( plock->m_pBucketHead == PbucketDIRIHash( esCurrent, iHash ) );
  649. }
  650. // unlock the current read-locked key
  651. void DIRReadUnlockKey( CLock * const plock ) const
  652. {
  653. // verify the lock
  654. DHTAssert( FBKTRead( plock ) );
  655. DHTAssert( plock->m_pBucketHead != NULL );
  656. // release the lock
  657. plock->m_pBucketHead->CRWL().LeaveAsReader();
  658. plock->m_pBucketHead = NULL;
  659. }
  660. // lock a key for read/write operations
  661. void DIRWriteLockKey( const ENUMSTATE esCurrent, const CKey &key, CLock * const plock ) const
  662. {
  663. NativeCounter iHash;
  664. NativeCounter iBucket;
  665. NativeCounter cBucketBefore;
  666. NativeCounter cBucketAfter;
  667. NativeCounter cBucketMax;
  668. // verify the lock
  669. DHTAssert( FBKTWrite( plock ) || FBKTScan( plock ) );
  670. DHTAssert( plock->m_pBucketHead == NULL );
  671. #ifdef DEBUG
  672. // remember the key we are locking
  673. plock->m_key = key;
  674. #endif
  675. // hash to the bucket we want (this may require a retry in grow/shrink mode)
  676. iHash = CKeyEntry::Hash( key );
  677. plock->m_pBucketHead = PbucketDIRIHash( esCurrent, iHash, &iBucket, &cBucketBefore );
  678. // acquire the lock as a writer
  679. plock->m_pBucketHead->CRWL().EnterAsWriter();
  680. // the entry may have moved as the result of a bucket split/merge
  681. cBucketAfter = NcDIRIGetBucket( esCurrent );
  682. cBucketMax = NcDIRIGetBucketMax( esCurrent );
  683. if ( cBucketBefore != cBucketAfter &&
  684. ( cBucketBefore <= iBucket && iBucket < cBucketAfter ||
  685. cBucketMax + cBucketAfter <= iBucket && iBucket < cBucketMax + cBucketBefore ) )
  686. {
  687. // unlock the old bucket
  688. plock->m_pBucketHead->CRWL().LeaveAsWriter();
  689. // hash to the bucket we want (this cannot fail more than once)
  690. plock->m_pBucketHead = PbucketDIRIHash( esCurrent, iHash );
  691. // lock the new bucket
  692. plock->m_pBucketHead->CRWL().EnterAsWriter();
  693. }
  694. // we should now have the correct bucket locked
  695. DHTAssert( plock->m_pBucketHead == PbucketDIRIHash( esCurrent, iHash ) );
  696. }
  697. // unlock the current write-locked key
  698. void DIRWriteUnlockKey( CLock * const plock ) const
  699. {
  700. // verify the lock
  701. DHTAssert( FBKTWrite( plock ) );
  702. DHTAssert( plock->m_pBucketHead != NULL );
  703. // release the lock
  704. plock->m_pBucketHead->CRWL().LeaveAsWriter();
  705. plock->m_pBucketHead = NULL;
  706. }
  707. // initalize an array of buckets
  708. ERR ErrDIRInitBucketArray( const NativeCounter cbucketAlloc,
  709. const NativeCounter ibucketFirst,
  710. BYTE** const prgbBucket )
  711. {
  712. #ifdef UNIQUE_BUCKET_NAMES
  713. char *psz;
  714. #endif // UNIQUE_BUCKET_NAMES
  715. NativeCounter cb;
  716. BYTE *rgb;
  717. NativeCounter ibucket;
  718. DHTAssert( cbucketAlloc > 0 );
  719. DHTAssert( prgbBucket );
  720. // calculate the size (in bytes) of the new bucket array
  721. #ifdef UNIQUE_BUCKET_NAMES
  722. cb = cbucketAlloc * ( m_cbBucket + 60 ); // add 60 extra bytes per bucket for a unique name (for the bucket's r/w-lock)
  723. #else
  724. cb = cbucketAlloc * m_cbBucket;
  725. #endif
  726. // allocate the new bucket array
  727. rgb = (BYTE*)PvMEMAlloc( cb );
  728. if ( !rgb )
  729. {
  730. *prgbBucket = NULL;
  731. return errOutOfMemory;
  732. }
  733. // initialize each bucket within the new array
  734. for ( ibucket = 0; ibucket < cbucketAlloc; ibucket++ )
  735. {
  736. // efficiency variables
  737. PBUCKET const pbucket = PBUCKET( rgb + ( ibucket * m_cbBucket ) );
  738. // construct the r/w-lock
  739. #ifdef UNIQUE_BUCKET_NAMES
  740. psz = (char*)( rgb + ( cbucketAlloc * m_cbBucket ) + ( ibucket * 60 ) );
  741. sprintf( psz, "CDynamicHashTable::BUCKET[0x%016I64X]::m_rwlBucket", QWORD( ibucketFirst + ibucket ) );
  742. DHTAssert( strlen( psz ) < 60 );
  743. new( &pbucket->CRWL() ) OSSYNC::CReaderWriterLock( CLockBasicInfo( CSyncBasicInfo( psz ), int( m_rankDHTrwlBucket ), 0 ) );
  744. #else // !UNIQUE_BUCKET_NAMES
  745. new( &pbucket->CRWL() ) OSSYNC::CReaderWriterLock( CLockBasicInfo( CSyncBasicInfo( "CDynamicHashTable::BUCKET::m_rwlBucket" ), int( m_rankDHTrwlBucket ), 0 ) );
  746. #endif // UNIQUE_BUCKET_NAMES
  747. // make the bucket empty
  748. pbucket->m_pb = NULL;
  749. }
  750. *prgbBucket = rgb;
  751. return errSuccess;
  752. }
  753. // uninitialize an array of buckets
  754. void DIRTermBucketArray( BYTE* const rgbBucket,
  755. const NativeCounter cbucketTerm )
  756. {
  757. NativeCounter ibucket;
  758. PBUCKET pbucketNext;
  759. // destroy each bucket in the array
  760. DHTAssert( rgbBucket );
  761. for ( ibucket = 0; ibucket < cbucketTerm; ibucket++ )
  762. {
  763. // efficiency variables
  764. PBUCKET pbucket = PBUCKET( rgbBucket + ( ibucket * m_cbBucket ) );
  765. // destruct the r/w-lock in place without freeing memory
  766. pbucket->CRWL().CReaderWriterLock::~CReaderWriterLock();
  767. // free all chained buckets (don't touch the first one because its part of rgbucket[])
  768. pbucket = PbucketBKTNext( pbucket );
  769. while ( pbucket )
  770. {
  771. pbucketNext = PbucketBKTNext( pbucket );
  772. MEMFree( pbucket );
  773. pbucket = pbucketNext;
  774. }
  775. }
  776. MEMFree( rgbBucket );
  777. }
  778. // split the directory
  779. void DIRISplit()
  780. {
  781. // we are executing the current policy (which is to split) and should be in this known state
  782. DHTAssert( m_dirptrs[ 0 ].m_cBucketMax > 0 );
  783. DHTAssert( m_dirptrs[ 0 ].m_cBucket == m_dirptrs[ 0 ].m_cBucketMax );
  784. // update the directory
  785. // NOTE: we do NOT allocate space here; this is deferred until BKTISplit() when we're sure we need it
  786. m_dirptrs[ 0 ].m_cBucketMax = m_dirptrs[ 0 ].m_cBucketMax * 2;
  787. m_dirptrs[ 0 ].m_cBucket = 0;
  788. STATSplitDirectory();
  789. }
  790. // merge the directory
  791. void DIRIMerge()
  792. {
  793. // we are executing the current policy (which is to split) and should be in this known state
  794. DHTAssert( m_dirptrs[ 0 ].m_cBucketMax > 1 ); // we should not be at the last split-level ( == 1 )
  795. DHTAssert( m_dirptrs[ 0 ].m_cBucket == 0 );
  796. // free the bucket array that is no longer being used (the last one in the directory)
  797. // NOTE: we can guarantee that it isn't in use because m_cBucket == 0 AND we can't grow (we're in stateMerge)
  798. // that means that everyone trying to hash to this bucket will be re-routed to the low-order bucket instead
  799. NativeCounter iExponent;
  800. NativeCounter iRemainder;
  801. DIRILog2( m_dirptrs[ 0 ].m_cBucketMax, &iExponent, &iRemainder );
  802. DHTAssert( NativeCounter( 1 ) << iExponent == m_dirptrs[ 0 ].m_cBucketMax );
  803. DHTAssert( 0 == iRemainder );
  804. // NOTE: the bucket array may not have been allocated because we defer its allocation until BKTISplit
  805. if ( m_rgrgBucket[ iExponent ] )
  806. {
  807. DIRTermBucketArray( m_rgrgBucket[ iExponent ], m_dirptrs[ 0 ].m_cBucketMax );
  808. m_rgrgBucket[ iExponent ] = NULL;
  809. }
  810. #ifdef DEBUG
  811. // verify that no higher-order bucket arrays exist
  812. while ( ++iExponent < cbitNativeCounter )
  813. {
  814. DHTAssert( !m_rgrgBucket[ iExponent ] );
  815. }
  816. #endif // DEBUG
  817. // update the directory
  818. m_dirptrs[ 0 ].m_cBucketMax = m_dirptrs[ 0 ].m_cBucketMax / 2;
  819. m_dirptrs[ 0 ].m_cBucket = m_dirptrs[ 0 ].m_cBucketMax;
  820. STATMergeDirectory();
  821. }
  822. // computer the log2 of the given value in terms of an exponent and an integer remainder
  823. void DIRILog2( const NativeCounter iValue,
  824. NativeCounter* const piExponent,
  825. NativeCounter* const piRemainder ) const
  826. {
  827. NativeCounter iExponent;
  828. NativeCounter iMask;
  829. NativeCounter iMaskLast;
  830. iExponent = 0;
  831. iMaskLast = 1;
  832. iMask = 1;
  833. while ( iMask < iValue )
  834. {
  835. iExponent++;
  836. iMaskLast = iMask;
  837. iMask = ( iMask << 1 ) + 1;
  838. }
  839. DHTAssert( iExponent < cbitNativeCounter );
  840. *piExponent = iExponent;
  841. *piRemainder = iMaskLast & iValue;
  842. }
  843. // get the correct copy of cBucketMax
  844. const NativeCounter NcDIRIGetBucketMax( const ENUMSTATE esCurrent ) const
  845. {
  846. return m_dirptrs[ esCurrent >> 4 ].m_cBucketMax;
  847. }
  848. // get the correct copy of cBucket
  849. const NativeCounter NcDIRIGetBucket( const ENUMSTATE esCurrent ) const
  850. {
  851. return m_dirptrs[ esCurrent >> 4 ].m_cBucket;
  852. }
  853. // resolve a bucket address to a bucket pointer
  854. PBUCKET const PbucketDIRIResolve( const NativeCounter ibucketIndex,
  855. const NativeCounter ibucketOffset ) const
  856. {
  857. BYTE* const pb = m_rgrgBucket[ ibucketIndex ]; // get ptr to one of the bucket arrays
  858. const NativeCounter ibOffset = ibucketOffset * m_cbBucket; // get byte offset within bucket array
  859. DHTAssert( NULL != pb );
  860. return PBUCKET( pb + ibOffset ); // return a typed ptr to the individual bucket within array
  861. }
  862. // hash to a bucket
  863. const PBUCKET PbucketDIRIHash( const ENUMSTATE esCurrent,
  864. const NativeCounter iHash,
  865. NativeCounter* const piBucket,
  866. NativeCounter* const pcBucket ) const
  867. {
  868. NativeCounter& iBucket = *piBucket;
  869. NativeCounter& cBucket = *pcBucket;
  870. NativeCounter cBucketMax;
  871. NativeCounter iExponent;
  872. NativeCounter iRemainder;
  873. // load some of the directory pointers
  874. cBucket = NcDIRIGetBucket( esCurrent );
  875. cBucketMax = NcDIRIGetBucketMax( esCurrent );
  876. // normalize the given hash value to the range of active buckets
  877. iBucket = iHash & ( ( cBucketMax - 1 ) + cBucketMax );
  878. if ( iBucket >= cBucketMax + cBucket )
  879. {
  880. iBucket -= cBucketMax;
  881. }
  882. // convert the normalized hash value to a bucket address
  883. DIRILog2( iBucket, &iExponent, &iRemainder );
  884. // return the bucket
  885. return PbucketDIRIResolve( iExponent, iRemainder );
  886. }
  887. const PBUCKET PbucketDIRIHash( const ENUMSTATE esCurrent,
  888. const NativeCounter iHash ) const
  889. {
  890. NativeCounter iBucket;
  891. NativeCounter cBucket;
  892. return PbucketDIRIHash( esCurrent, iHash, &iBucket, &cBucket );
  893. }
/////////////////////////////////////////////////////////////////////////////////////////
//
//  scan operations
//

// move from the current hash-bucket to the next hash-bucket that contains
// at least 1 entry; position currency on that entry
//
// on entry, the lock currency must be reset (no prev/current/next entry);
// returns errSuccess with the new bucket write-locked and currency on its
// first entry, or errNoCurrentEntry once all buckets have been scanned
ERR ErrSCANMoveNext( CLock *const plock )
{
    DHTAssert( plock->m_pEntryPrev == NULL );
    DHTAssert( plock->m_pEntry == NULL );
    DHTAssert( plock->m_pEntryNext == NULL );

    // unlock the current bucket
    if ( plock->m_pBucketHead )
    {
        plock->m_pBucketHead->CRWL().LeaveAsWriter();
        plock->m_pBucketHead = NULL;

        // we performed an insert or delete while holding the write lock
        if ( plock->m_fInsertOrDelete )
        {
            // perform amortized maintenance on the table
            MaintainTable( plock->m_phs );
        }
    }

    // enter the state machine
    const int iGroup = UiSTEnter( &plock->m_phs );
    const ENUMSTATE esCurrent = EsSTGetState();

    while ( plock->m_iBucket + 1 < NcDIRIGetBucketMax( esCurrent ) + NcDIRIGetBucket( esCurrent ) )
    {
        // we have not scanned the last bucket yet

        // advance the bucket index
        plock->m_iBucket++;

        // hash to the bucket and lock it
        plock->m_pBucketHead = PbucketDIRIHash( esCurrent, plock->m_iBucket );
        plock->m_pBucketHead->CRWL().EnterAsWriter();

        // re-check the range now that we hold the write-lock: the bucket may
        // have been merged away between hashing and lock acquisition
        if ( plock->m_iBucket < NcDIRIGetBucketMax( esCurrent ) + NcDIRIGetBucket( esCurrent ) )
        {
            // bucket address is OK (did not move)
            if ( plock->m_pBucketHead->m_pb != NULL )
            {
                // current bucket contains at least 1 entry

                // setup the currency on the first entry
                plock->m_pBucket = plock->m_pBucketHead;
                plock->m_pEntry = &plock->m_pBucketHead->m_rgEntry[0];

                // stop the loop
                break;
            }
            // current bucket is empty
        }
        else
        {
            DHTAssert( stateShrink == esCurrent );

            // the current bucket disappeared because it was merged into a lower bucket
            DHTAssert( plock->m_iBucket >= NcDIRIGetBucketMax( esCurrent ) );
            DHTAssert( PbucketDIRIHash( esCurrent, plock->m_iBucket ) ==
                       PbucketDIRIHash( esCurrent, plock->m_iBucket - NcDIRIGetBucketMax( esCurrent ) ) );

            // make sure the current entry ptr is reset
            DHTAssert( !plock->m_pEntry );
        }

        // release the bucket lock (bucket should be empty since it was merged)
        DHTAssert( !plock->m_pBucketHead->m_pb );
        plock->m_pBucketHead->CRWL().LeaveAsWriter();
        plock->m_pBucketHead = NULL;
    }

    // leave the state machine
    STLeave( iGroup, plock->m_phs );

    // return the result
    DHTAssert( !plock->m_pEntry || plock->m_pBucketHead );
    return plock->m_pEntry ? errSuccess : errNoCurrentEntry;
}
  963. /////////////////////////////////////////////////////////////////////////////////////////
  964. //
  965. // bucket operations
  966. //
  967. // returns fTrue if the lock context is in read mode
  968. const BOOL FBKTRead( CLock *const plock ) const
  969. {
  970. return plock->m_ls == CLock::lsRead;
  971. }
  972. // returns fTrue if the lock context is in write mode
  973. const BOOL FBKTWrite( CLock *const plock ) const
  974. {
  975. return plock->m_ls == CLock::lsWrite;
  976. }
  977. // returns fTrue if the lock context is in scan-forward mode
  978. const BOOL FBKTScan( CLock *const plock ) const
  979. {
  980. return plock->m_ls == CLock::lsScan;
  981. }
  982. // returns the entry after last entry in the BUCKET or entry 0 if no entries exist
  983. CKeyEntry *PentryBKTNextMost( const PBUCKET pBucket ) const
  984. {
  985. const BYTE *pb = pBucket->m_pb;
  986. if ( BOOL( ( pb >= (BYTE*)&pBucket->m_rgEntry[ 0 ] ) &
  987. ( pb < (BYTE*)&pBucket->m_rgEntry[ m_centryBucket ] ) ) )
  988. {
  989. // we are in the last bucket
  990. return (CKeyEntry*)pb + 1;
  991. }
  992. else if ( NULL == pb )
  993. {
  994. // the bucket is empty
  995. return &pBucket->m_rgEntry[ 0 ];
  996. }
  997. // the bucket is full
  998. return &pBucket->m_rgEntry[ m_centryBucket ];
  999. }
  1000. // returns the next BUCKET or NULL if no other BUCKETs exist
  1001. PBUCKET PbucketBKTNext( const PBUCKET pBucket ) const
  1002. {
  1003. const BYTE *pb = pBucket->m_pb;
  1004. if ( BOOL( ( pb <= (BYTE*)pBucket - m_cbBucket ) |
  1005. ( pb >= (BYTE*)pBucket + m_cbBucket ) ) )
  1006. {
  1007. // m_pBucketNext is either the next BUCKET or NULL
  1008. DHTAssert( !pb || PBUCKET( pb )->m_pBucketPrev == pBucket );
  1009. return PBUCKET( pb );
  1010. }
  1011. // m_pBucketNext is invalid (m_pEntryLast is valid instead)
  1012. return NULL;
  1013. }
// try to seek to the entry corresponding to the given key
// if found, plock->m_pEntry is set to the matching entry
// if not, plock->m_pEntry is set to NULL (no current entry)
void BKTSeek( CLock *const plock, const CKey &key ) const
{
    // pre-init our currency assuming we will hit a hot path
    plock->m_pBucket = plock->m_pBucketHead;
    plock->m_pEntryPrev = NULL;
    plock->m_pEntryNext = NULL;

    // HOT PATH:
    //
    // if the next/end pointer is within the head bucket then we know
    // that all entries are in the head bucket. if we find the entry
    // for this key then set our currency to point to it otherwise set
    // our currency to no current entry
    //
    // NOTE: the subtraction is done in unsigned (DWORD_PTR) math, so a NULL
    // m_pEntryLast wraps to a huge value and correctly fails this range test
    CKeyEntry* const pEntryLast = plock->m_pBucketHead->m_pEntryLast;
    if ( DWORD_PTR( pEntryLast ) - DWORD_PTR( plock->m_pBucketHead ) < m_cbBucket )
    {
        CKeyEntry* pEntry = plock->m_pBucketHead->m_rgEntry;
        do
        {
            if ( pEntry->FEntryMatchesKey( key ) )
            {
                plock->m_pEntry = pEntry;
                return;
            }
        }
        while ( ++pEntry <= pEntryLast );
        plock->m_pEntry = NULL;
    }

    // HOT PATH:
    //
    // if the next/end pointer is NULL then we know that we will not
    // find the key. set our currency to no current entry
    else if ( !pEntryLast )
    {
        plock->m_pEntry = NULL;
    }

    // if the next/end pointer points outside of the head bucket then
    // perform a full chain search
    else
    {
        BKTISeek( plock, key );
    }
}
// full-chain seek helper for BKTSeek: walks every BUCKET in the chain
// looking for the given key; sets plock->m_pEntry to the matching entry,
// or to NULL (with plock->m_pBucket on the last chain bucket) if not found
void BKTISeek( CLock *const plock, const CKey &key ) const
{
    PBUCKET pBucket;
    PBUCKET pBucketPrev;
    CKeyEntry *pEntryThis;
    CKeyEntry *pEntryMost;

    DHTAssert( FBKTRead( plock ) || FBKTWrite( plock ) );
    DHTAssert( plock->m_pBucketHead != NULL );

    // start the scan on the first bucket
    pBucket = plock->m_pBucketHead;
    do
    {
        // scan the current BUCKET
        pEntryThis = &pBucket->m_rgEntry[ 0 ];
        pEntryMost = PentryBKTNextMost( pBucket );
        while ( pEntryThis < pEntryMost )
        {
            // query the entry against the given key for a match
            // (assume we will be more likely to not find it)
            if ( !pEntryThis->FEntryMatchesKey( key ) )
            {
                // nop
            }
            else
            {
                // the key exists; setup our currency around it
                goto SetupCurrency;
            }

            // move to the next entry
            pEntryThis++;
        }

        // move to the next BUCKET
        // (pBucketPrev is always assigned before use: the head bucket is
        // non-NULL, so this outer loop body runs at least once)
        pBucketPrev = pBucket;
        pBucket = PbucketBKTNext( pBucket );
    }
    while ( pBucket );

    // key not found: move back to the last BUCKET and reset the entry ptr
    pBucket = pBucketPrev;
    pEntryThis = NULL;

SetupCurrency:
    // setup the currency in the lock context
    // we will not allow moving next/prev, so we setup the next/prev ptrs accordingly
    plock->m_pBucket = pBucket;
    plock->m_pEntryPrev = NULL;
    plock->m_pEntry = pEntryThis;
    plock->m_pEntryNext = NULL;
}
  1106. #ifdef DEBUG
  1107. // get a pointer to the current entry
  1108. // if currency is before-first or after-last, then NULL is returned
  1109. void BKTGetEntry( CLock *const plock, CKeyEntry **ppKeyEntry ) const
  1110. {
  1111. DHTAssert( FBKTRead( plock ) || FBKTWrite( plock ) );
  1112. DHTAssert( plock->m_pBucketHead != NULL );
  1113. DHTAssert( plock->m_pBucket != NULL );
  1114. *ppKeyEntry = plock->m_pEntry;
  1115. return;
  1116. }
  1117. #endif
  1118. // get the current entry
  1119. // if currency is before-first or after-last, errEntryNotFound is returned
  1120. const ERR ErrBKTGetEntry( CLock *const plock, CEntry *pentry ) const
  1121. {
  1122. DHTAssert( FBKTRead( plock ) || FBKTWrite( plock ) || FBKTScan( plock ) );
  1123. DHTAssert( plock->m_pBucketHead != NULL );
  1124. DHTAssert( plock->m_pBucket != NULL );
  1125. if ( plock->m_pEntry )
  1126. {
  1127. // we are on an entry
  1128. plock->m_pEntry->GetEntry( pentry );
  1129. return errSuccess;
  1130. }
  1131. // we are not on an entry
  1132. return errEntryNotFound;
  1133. }
  1134. // replace the current entry (destruct old entry, contruct new entry)
  1135. // if currency is before-first or after-last, then errNoCurrentEntry is returned
  1136. const ERR ErrBKTReplaceEntry( CLock *const plock, const CEntry &entry ) const
  1137. {
  1138. DHTAssert( FBKTWrite( plock ) || FBKTScan( plock ) );
  1139. DHTAssert( plock->m_pBucketHead != NULL );
  1140. DHTAssert( plock->m_pBucket != NULL );
  1141. if ( plock->m_pEntry )
  1142. {
  1143. // we are on an entry
  1144. // copy the new entry over it
  1145. plock->m_pEntry->SetEntry( entry );
  1146. return errSuccess;
  1147. }
  1148. // we are not on an entry
  1149. return errNoCurrentEntry;
  1150. }
// insert an entry at the end of the logical bucket
// if the locked key already exists, errKeyDuplicate is returned
// if memory is short, errOutOfMemory is returned
// otherwise, errSuccess is returned and currency is left on the new entry
const ERR ErrBKTInsertEntry( CLock *const plock, const CEntry &entry )
{
    DHTAssert( FBKTWrite( plock ) );
    DHTAssert( plock->m_pBucketHead != NULL );
    DHTAssert( plock->m_pBucket != NULL );

    if ( plock->m_pEntry )
    {
        // we are pointing to the key we locked, so it must already exist
        return errKeyDuplicate;
    }

#ifdef DEBUG
    // DEBUG: catalogue every BUCKET in the chain (plus one slot for the
    // bucket we may allocate below) so we can verify afterwards that no
    // bucket was leaked or smuggled into the chain
    PBUCKET *rgBucketCheck = NULL, pbucketTX;
    size_t cBucketCheck = 0, iT;

    pbucketTX = plock->m_pBucketHead;
    while ( pbucketTX )
    {
        cBucketCheck++;
        pbucketTX = PbucketBKTNext( pbucketTX );
    }
    cBucketCheck++;     // account for newly allocated bucket

    rgBucketCheck = (PBUCKET *)PvMEMAlloc( cBucketCheck * sizeof( PBUCKET ) );
    if ( NULL != rgBucketCheck )
    {
        iT = 0;
        pbucketTX = plock->m_pBucketHead;
        while ( pbucketTX )
        {
            rgBucketCheck[ iT++ ] = pbucketTX;
            pbucketTX = PbucketBKTNext( pbucketTX );
        }
        rgBucketCheck[ iT++ ] = NULL;   // new bucket
    }

    // DEBUG: count the number of entries we will be handling
    size_t cEntriesTotal = 0;
    PBUCKET pbktT, pbktNextT;
    pbktT = plock->m_pBucketHead;
    if ( pbktT->m_pb != NULL )
    {
        while ( pbktT )
        {
            pbktNextT = PbucketBKTNext( pbktT );
            if ( pbktNextT )
            {
                // full bucket
                cEntriesTotal += size_t( m_centryBucket );
            }
            else
            {
                // partial bucket (not empty)
                cEntriesTotal += 1 + ( pbktT->m_pEntryLast - &pbktT->m_rgEntry[0] );
            }
            pbktT = pbktNextT;
        }
    }
#endif

    // cursor for insert
    PBUCKET pBucketThis = plock->m_pBucket;
    CKeyEntry *pEntryThis;

    // efficiency variable
    PBUCKET pBucketT;

    // move to the last entry in the last bucket
    pBucketT = PbucketBKTNext( pBucketThis );
    while ( pBucketT )
    {
        pBucketThis = pBucketT;
        pBucketT = PbucketBKTNext( pBucketT );
    }
    pEntryThis = PentryBKTNextMost( pBucketThis );

    if ( pEntryThis != &pBucketThis->m_rgEntry[ m_centryBucket ] )
    {
        // there are available entries left in the last bucket
        // nop
    }
    else
    {
        // there are no entries left in the last bucket

        // allocate a new bucket
        pBucketT = (BUCKET *)PvMEMAlloc( m_cbBucket );
        if ( !pBucketT )
        {
            // we ran out of memory when allocating the new BUCKET
#ifdef DEBUG
            // free memory from the start of this function
            if ( NULL != rgBucketCheck )
            {
                MEMFree( rgBucketCheck );
            }
#endif
            return errOutOfMemory;
        }
        STATInsertOverflowBucket();

#ifdef DEBUG
        // put the new bucket in our list
        if ( NULL != rgBucketCheck )
        {
            DHTAssert( rgBucketCheck[cBucketCheck-1] == NULL );
            rgBucketCheck[cBucketCheck-1] = pBucketT;
        }
#endif

        // chain the new BUCKET
        pBucketThis->m_pBucketNext = pBucketT;
        pBucketT->m_pBucketPrev = pBucketThis;

        // use the first entry of the new BUCKET
        pBucketThis = pBucketT;
        pEntryThis = &pBucketT->m_rgEntry[0];
    }

    // copy the entry
    pEntryThis->SetEntry( entry );

    // update the last entry pointer
    pBucketThis->m_pEntryLast = pEntryThis;

    // move the currency to the new entry
    plock->m_pBucket = pBucketThis;
    plock->m_pEntry = pEntryThis;

#ifdef DEBUG
    if ( NULL != rgBucketCheck )
    {
        // check each catalogued bucket to see if it is still there
        pbucketTX = plock->m_pBucketHead;
        DHTAssert( pbucketTX );

        // find and remove all buckets found in the destination chain from our list
        while ( pbucketTX )
        {
            for ( iT = 0; iT < cBucketCheck; iT++ )
            {
                if ( rgBucketCheck[iT] == pbucketTX )
                {
                    rgBucketCheck[iT] = NULL;
                    break;
                }
            }
            DHTAssert( iT < cBucketCheck );     // if this goes off, we somehow got a bucket
                                                // into the chain that shouldn't be there
                                                // (it is a bucket we never catalogued!)
            pbucketTX = PbucketBKTNext( pbucketTX );
        }

        // the list should now be empty -- verify this
        for ( iT = 0; iT < cBucketCheck; iT++ )
        {
            // if this goes off, rgBucketCheck[iT] contains a bucket that was abandoned without
            // being freed!
            DHTAssert( rgBucketCheck[iT] == NULL );
        }

        // free the list
        MEMFree( rgBucketCheck );
    }

    // make sure the number of entries has not changed since we started
    size_t cEntriesAfterwards = 0;
    pbktT = plock->m_pBucketHead;
    if ( pbktT->m_pb != NULL )
    {
        while ( pbktT )
        {
            pbktNextT = PbucketBKTNext( pbktT );
            if ( pbktNextT )
            {
                // full bucket
                cEntriesAfterwards += size_t( m_centryBucket );
            }
            else
            {
                // partial bucket (not empty)
                cEntriesAfterwards += 1 + ( pbktT->m_pEntryLast - &pbktT->m_rgEntry[0] );
            }
            pbktT = pbktNextT;
        }
    }

    // entry counters should match ( +1 is for the inserted entry )
    DHTAssert( cEntriesAfterwards == cEntriesTotal + 1 );
#endif

    return errSuccess;
}
// delete the current entry
// if currency is before-first or after-last, then errNoCurrentEntry is returned
// if the entry is not the last in the logical bucket, the last entry is promoted
// to fill in the hole
// should a BUCKET become empty, it will be released immediately
const ERR ErrBKTDeleteEntry( CLock *const plock )
{
    DHTAssert( FBKTWrite( plock ) || FBKTScan( plock ) );
    DHTAssert( plock->m_pBucketHead != NULL );
    DHTAssert( plock->m_pBucket != NULL );

    if ( !plock->m_pEntry )
    {
        // we do not have a current entry
        return errNoCurrentEntry;
    }

#ifdef DEBUG
    // DEBUG: catalogue every BUCKET in the chain so we can verify afterwards
    // that each one was either kept in the chain or freed -- never leaked
    PBUCKET *rgBucketCheck = NULL;
    PBUCKET pbucketT;
    size_t cBucketCheck = 0, iT;

    pbucketT = plock->m_pBucketHead;
    while ( pbucketT )
    {
        cBucketCheck++;
        pbucketT = PbucketBKTNext( pbucketT );
    }

    rgBucketCheck = (PBUCKET *)PvMEMAlloc( cBucketCheck * sizeof( PBUCKET ) );
    if ( NULL != rgBucketCheck )
    {
        iT = 0;
        pbucketT = plock->m_pBucketHead;
        while ( pbucketT )
        {
            rgBucketCheck[ iT++ ] = pbucketT;
            pbucketT = PbucketBKTNext( pbucketT );
        }
    }

    // DEBUG: count the number of entries we will be handling
    size_t cEntriesTotal = 0;
    PBUCKET pbktT, pbktNextT;
    pbktT = plock->m_pBucketHead;
    if ( pbktT->m_pb != NULL )
    {
        while ( pbktT )
        {
            pbktNextT = PbucketBKTNext( pbktT );
            if ( pbktNextT )
            {
                // full bucket
                cEntriesTotal += size_t( m_centryBucket );
            }
            else
            {
                // partial bucket (not empty)
                cEntriesTotal += 1 + ( pbktT->m_pEntryLast - &pbktT->m_rgEntry[0] );
            }
            pbktT = pbktNextT;
        }
    }
#endif

    // we have a valid entry
    PBUCKET pBucketThis = plock->m_pBucket;
    CKeyEntry *pEntryThis = plock->m_pEntry;
    PBUCKET pBucketFree = NULL;     // used later if we free a BUCKET structure

    if ( pEntryThis != pBucketThis->m_pEntryLast )
    {
        // we are not deleting the last entry in the bucket;
        // promote the last entry to fill in the spot left by the entry we are deleting

        // move to the last bucket
        PBUCKET pBucketT = PbucketBKTNext( pBucketThis );
        while ( pBucketT )
        {
            pBucketThis = pBucketT;
            pBucketT = PbucketBKTNext( pBucketT );
        }

        // move to the last entry in the last BUCKET
        pEntryThis = pBucketThis->m_pEntryLast;

        // copy the entry
        plock->m_pEntry->SetEntry( pEntryThis->m_entry );
    }

    // update the currency to show that we are no longer on an entry
    plock->m_pEntry = NULL;

    // we are now pointing to the last entry in the last bucket
    // (via pBucketThis/pEntryThis), and that entry needs to be
    // "deleted" from the bucket

    // update the next/end ptr to reflect this deletion
    if ( pEntryThis != &pBucketThis->m_rgEntry[0] )
    {
        // entries still remain in the last bucket
        DHTAssert( pBucketThis->m_pEntryLast == pEntryThis );
        pBucketThis->m_pEntryLast--;    // pEntryThis - 1;
#ifdef DEBUG
        // jump to the validation code
        goto DoValidation;
#endif
        return errSuccess;
    }

    // no entries remain in the last bucket
    if ( pBucketThis == plock->m_pBucketHead )
    {
        // this bucket is empty, but we cannot release it because it is part of the bucket array
        // instead, we mark it as being empty
        pBucketThis->m_pb = NULL;
#ifdef DEBUG
        // jump to the validation code
        goto DoValidation;
#endif
        return errSuccess;
    }

    // we can free the last bucket
    pBucketFree = pBucketThis;

    // unchain it: the previous bucket becomes the new last bucket, and it is
    // full by definition, so its m_pEntryLast points at its final entry slot
    DHTAssert( pBucketThis->m_pBucketPrev->m_pBucketNext == pBucketThis );
    pBucketThis = pBucketThis->m_pBucketPrev;
    pBucketThis->m_pEntryLast = &pBucketThis->m_rgEntry[ m_centryBucket - 1 ];

    // free it
    MEMFree( pBucketFree );

    if ( plock->m_pBucket == pBucketFree )
    {
        // our currency was on the last bucket which is now invalid;
        // move to the previous bucket (which is now the NEW last BUCKET)
        plock->m_pBucket = pBucketThis;
    }
    STATDeleteOverflowBucket();

#ifdef DEBUG
    // check each catalogued bucket to see if it is still there
DoValidation:
    if ( NULL != rgBucketCheck )
    {
        pbucketT = plock->m_pBucketHead;
        DHTAssert( pbucketT );

        // find and remove all buckets found in the destination chain from our list
        while ( pbucketT )
        {
            for ( iT = 0; iT < cBucketCheck; iT++ )
            {
                if ( rgBucketCheck[iT] == pbucketT )
                {
                    rgBucketCheck[iT] = NULL;
                    break;
                }
            }
            DHTAssert( iT < cBucketCheck );     // if this goes off, we somehow got a bucket
                                                // into the chain that shouldn't be there
                                                // (it is a bucket we never catalogued!)
            pbucketT = PbucketBKTNext( pbucketT );
        }

        // remove pBucketFree from rgBucketCheck
        if ( pBucketFree )
        {
            for ( iT = 0; iT < cBucketCheck; iT++ )
            {
                if ( rgBucketCheck[iT] == pBucketFree )
                {
                    rgBucketCheck[iT] = NULL;
                    break;
                }
            }
            DHTAssert( iT < cBucketCheck );     // if this goes off, we freed a bucket that
                                                // was never catalogued! we should only be freeing
                                                // buckets that were in the original catalogue!
        }

        // the list should now be empty -- verify this
        for ( iT = 0; iT < cBucketCheck; iT++ )
        {
            // if this goes off, rgBucketCheck[iT] contains a bucket that was abandoned without
            // being freed!
            DHTAssert( rgBucketCheck[iT] == NULL );
        }

        // free the list
        MEMFree( rgBucketCheck );
    }

    // make sure the number of entries has not changed since we started
    size_t cEntriesAfterwards = 0;
    pbktT = plock->m_pBucketHead;
    if ( pbktT->m_pb != NULL )
    {
        while ( pbktT )
        {
            pbktNextT = PbucketBKTNext( pbktT );
            if ( pbktNextT )
            {
                // full bucket
                cEntriesAfterwards += size_t( m_centryBucket );
            }
            else
            {
                // partial bucket (not empty)
                cEntriesAfterwards += 1 + ( pbktT->m_pEntryLast - &pbktT->m_rgEntry[0] );
            }
            pbktT = pbktNextT;
        }
    }

    // entry counters should match ( -1 is for the deleted entry )
    DHTAssert( cEntriesAfterwards == cEntriesTotal - 1 );
#endif

    return errSuccess;
}
// split to a new bucket: opportunistically grow the table by activating
// one more bucket; gives up silently on contention or memory shortage
//
// NOTE: from our perspective, we are in the grow state; however, the
// current state may be set to something else due to a pending transition
void BKTISplit( HOTSTUFF* const phs )
{
    // read the directory pointers
    const NativeCounter cBucketMax = NcDIRIGetBucketMax( stateGrow );
    const NativeCounter cBucket = NcDIRIGetBucket( stateGrow );

    if ( cBucketMax + cBucket >= m_cBucketPreferred || cBucket == cBucketMax )
    {
        return;     // the requested growth is complete
    }

    // we need to reserve memory now to ensure that the growth will succeed
    // (BKTIDoSplit will commit or unreserve this reservation later)
    if ( !phs->m_bucketpool.FPOOLReserve( m_cbBucket ) )
    {
        return;
    }

    // get the source bucket
    const PBUCKET pbucketGrowSrc = PbucketDIRIHash( stateGrow, cBucket );

    // try to get the lock; growth is opportunistic, so back off on contention
    if ( pbucketGrowSrc->CRWL().FWritersQuiesced() ||
         !pbucketGrowSrc->CRWL().FTryEnterAsWriter() )
    {
        STATSplitContention();
        phs->m_bucketpool.POOLUnreserve();
        return;
    }

    // having a write-lock on the source bucket means no one else attempting to split can
    // be farther along than us at this moment unless they completed the growth already

    // see whether or not m_cBucket changed while we were trying to get here:
    // if it stayed the same, we were the first ones to split this bucket;
    // if it changed, someone else managed to split AFTER we read m_cBucket
    // but BEFORE we could do the split ourselves
    if ( cBucket != NcDIRIGetBucket( stateGrow ) )
    {
        DHTAssert( cBucket < NcDIRIGetBucket( stateGrow ) );
        pbucketGrowSrc->CRWL().LeaveAsWriter();
        phs->m_bucketpool.POOLUnreserve();
        return;
    }

    // get the destination bucket (may not be allocated yet so we cannot use PbucketDIRIHash)
    NativeCounter iExponent;
    NativeCounter iRemainder;
    DIRILog2( cBucketMax + cBucket, &iExponent, &iRemainder );

    // extract the address of the bucket
    if ( !m_rgrgBucket[ iExponent ] )
    {
        // allocate a new bucket array to hold 2^iExponent buckets for this entry
        if ( ErrDIRInitBucketArray( cBucketMax, cBucketMax, &m_rgrgBucket[ iExponent ] ) != errSuccess )
        {
            pbucketGrowSrc->CRWL().LeaveAsWriter();
            phs->m_bucketpool.POOLUnreserve();
            return;
        }
    }
    DHTAssert( m_rgrgBucket[ iExponent ] );

    // get the destination bucket
    const PBUCKET pbucketGrowDst = PbucketDIRIResolve( iExponent, iRemainder );

    // lock the destination bucket (no possibility of contention here)
    // NOTE(review): the return value is deliberately ignored -- m_cBucket has
    // not yet been incremented, so presumably no other thread can hash to
    // this bucket and the try-acquire always succeeds; confirm that invariant
    pbucketGrowDst->CRWL().FTryEnterAsWriter();

    // increase m_cBucket (we cannot turn back after this point)
    // anyone who hashes to the new bucket will be queued up until the growth is complete
    DHTAssert( cBucket == NcDIRIGetBucket( stateGrow ) );
    m_dirptrs[ 0 ].m_cBucket++;

    // do the growth work
    BKTIDoSplit( phs, pbucketGrowSrc, pbucketGrowDst, cBucket );

    // release the write-locks
    pbucketGrowSrc->CRWL().LeaveAsWriter();
    pbucketGrowDst->CRWL().LeaveAsWriter();
}
// merge two existing buckets into one: opportunistically shrink the table
// by folding the highest active bucket back into its low-order buddy;
// gives up silently on contention or memory shortage
//
// NOTE: from our perspective, we are in the shrink state; however, the
// current state may be set to something else due to a pending transition
void BKTIMerge( HOTSTUFF* const phs )
{
    // read the directory pointers
    const NativeCounter cBucketMax = NcDIRIGetBucketMax( stateShrink );
    NativeCounter cBucket = NcDIRIGetBucket( stateShrink );

    if ( cBucketMax + cBucket <= m_cBucketPreferred || cBucket == 0 )
    {
        return;     // the requested shrinkage is complete
    }
    cBucket--;      // the bucket we are merging is really 1 below cBucket

    // we need to reserve memory now to ensure that the shrinkage will succeed
    // (BKTIDoMerge will commit or unreserve this reservation later)
    if ( !phs->m_bucketpool.FPOOLReserve( m_cbBucket ) )
    {
        return;
    }

    // get the destination bucket
    const PBUCKET pbucketShrinkDst = PbucketDIRIHash( stateShrink, cBucket );

    // try to get the lock; shrinkage is opportunistic, so back off on contention
    if ( pbucketShrinkDst->CRWL().FWritersQuiesced() ||
         !pbucketShrinkDst->CRWL().FTryEnterAsWriter() )
    {
        STATMergeContention();
        phs->m_bucketpool.POOLUnreserve();
        return;
    }

    // having a write-lock on the destination bucket means no one else attempting to merge can
    // be farther along than us at this moment unless they completed the shrinkage already

    // see whether or not m_cBucket changed while we were trying to get here:
    // if it stayed the same, we were the first ones to merge this bucket;
    // if it changed, someone else managed to merge AFTER we read m_cBucket
    // but BEFORE we could do the merge ourselves
    if ( cBucket + 1 != NcDIRIGetBucket( stateShrink ) )
    {
        DHTAssert( cBucket + 1 > NcDIRIGetBucket( stateShrink ) );
        pbucketShrinkDst->CRWL().LeaveAsWriter();
        phs->m_bucketpool.POOLUnreserve();
        return;
    }

    // convert cBucket to a bucket address (the source is the high-order
    // buddy bucket at cBucket + cBucketMax)
    NativeCounter iExponent;
    NativeCounter iRemainder;
    DIRILog2( cBucket + NcDIRIGetBucketMax( stateShrink ), &iExponent, &iRemainder );

    // extract the address of the bucket
    const PBUCKET pbucketShrinkSrc = PbucketDIRIResolve( iExponent, iRemainder );

    // try to get the lock
    if ( pbucketShrinkSrc->CRWL().FWritersQuiesced() ||
         !pbucketShrinkSrc->CRWL().FTryEnterAsWriter() )
    {
        STATMergeContention();
        pbucketShrinkDst->CRWL().LeaveAsWriter();
        phs->m_bucketpool.POOLUnreserve();
        return;
    }

    // decrease m_cBucket (we cannot turn back after this point)
    // anyone who hashes to the destination bucket will be queued up until
    // the merge is complete
    // no one will be able to hash to the source bucket
    DHTAssert( cBucket + 1 == NcDIRIGetBucket( stateShrink ) );
    m_dirptrs[ 0 ].m_cBucket--;

    // do the shrinkage work
    BKTIDoMerge( phs, pbucketShrinkSrc, pbucketShrinkDst );

    // release the write-locks
    pbucketShrinkDst->CRWL().LeaveAsWriter();
    pbucketShrinkSrc->CRWL().LeaveAsWriter();
}
// work-horse for spliting a bucket
//
// distributes every entry of the write-locked source chain between two
// chains: entries whose masked hash still equals iHashSrc stay in the
// source bucket; all others are copied to the write-locked (and initially
// empty) destination bucket.  overflow BUCKET structures emptied out of
// the source chain are recycled into the destination chain before the
// reserved heap bucket is committed; the reservation made by the caller
// in phs->m_bucketpool is committed only if actually used, and is
// cancelled otherwise.
void BKTIDoSplit( HOTSTUFF* const phs,
                  PBUCKET pBucketSrcSrc,
                  PBUCKET pBucketDst,
                  const NativeCounter iHashSrc )
{
#ifdef DEBUG
    PBUCKET pBucketSrcSrcOriginal = pBucketSrcSrc;
    PBUCKET pBucketDstOriginal = pBucketDst;
    size_t cEntriesTotal = 0, cEntriesTotalRunning = 0;
    PBUCKET pbktT, pbktNextT;

    // catalog each BUCKET structure and make sure they end up in the destination bucket
    PBUCKET *rgBucketCheck = NULL, pbucketTX;
    size_t cBucketCheck = 0, iT;

    pbucketTX = pBucketSrcSrc;
    while ( pbucketTX )
    {
        cBucketCheck++;
        pbucketTX = PbucketBKTNext( pbucketTX );
    }
    pbucketTX = pBucketDst;
    // the destination chain must consist of exactly one bucket on entry
    DHTAssert( PbucketBKTNext( pbucketTX ) == NULL );
    while ( pbucketTX )
    {
        cBucketCheck++;
        pbucketTX = PbucketBKTNext( pbucketTX );
    }
    cBucketCheck++; // account for bucket from heap
    rgBucketCheck = (PBUCKET *)PvMEMAlloc( cBucketCheck * sizeof( PBUCKET ) );
    if ( NULL != rgBucketCheck )
    {
        iT = 0;
        pbucketTX = pBucketSrcSrc;
        while ( pbucketTX )
        {
            rgBucketCheck[ iT++ ] = pbucketTX;
            pbucketTX = PbucketBKTNext( pbucketTX );
        }
        pbucketTX = pBucketDst;
        while ( pbucketTX )
        {
            rgBucketCheck[ iT++ ] = pbucketTX;
            pbucketTX = PbucketBKTNext( pbucketTX );
        }
        rgBucketCheck[ iT++ ] = NULL; // heap bucket
        DHTAssert( iT == cBucketCheck );
    }

    // count the number of entries that are in the source bucket
    pbktT = pBucketSrcSrc;
    if ( pbktT->m_pb != NULL )
    {
        while ( pbktT )
        {
            pbktNextT = PbucketBKTNext( pbktT );
            if ( pbktNextT )
            {
                // full bucket
                cEntriesTotal += size_t( m_centryBucket );
            }
            else
            {
                // partial bucket (not empty)
                cEntriesTotal += 1 + ( pbktT->m_pEntryLast - &pbktT->m_rgEntry[0] );
            }
            pbktT = pbktNextT;
        }
    }
#endif

    // cursor for reading entries
    PBUCKET pBucketNextSrc;
    CKeyEntry *pEntryThisSrc;
    CKeyEntry *pEntryMostSrc;

    // cursors for writing entries
    // index 0 is for the SrcDst cursor (entries whose src and dst is the source bucket)
    // index 1 is for the Dst cursor (entries whose dst is the destination bucket)
    PBUCKET pBucketThis[2];
    CKeyEntry *pEntryThis[2];
    CKeyEntry *pEntryMost[2];
    CKeyEntry *pEntryLast[2];
    size_t iIndex;

    // extra buckets (BUCKET structures freed from the source chain,
    // recycled before committing the heap reservation)
    PBUCKET pBucketAvail = NULL;

    // remember if we used the bucket from the heap
    BOOL fBucketFromHeap = fFalse;

    // used for hashing
    NativeCounter iHashMask;

    DHTAssert( pBucketSrcSrc );
    DHTAssert( pBucketDst );
    DHTAssert( pBucketDst->m_pb == NULL );

    // calculate the hash-mask (prevent wraparound)
    iHashMask = ( NcDIRIGetBucketMax( stateGrow ) - 1 ) + NcDIRIGetBucketMax( stateGrow );
    DHTAssert( NcDIRIGetBucketMax( stateGrow ) > 0 );

    // prepare the read cursor
    pBucketNextSrc = PbucketBKTNext( pBucketSrcSrc );
    pEntryThisSrc = &pBucketSrcSrc->m_rgEntry[ 0 ];
    pEntryMostSrc = PentryBKTNextMost( pBucketSrcSrc );

    // prepare the src-dst write cursor
    pBucketThis[ 0 ] = pBucketSrcSrc;
    pEntryThis[ 0 ] = &pBucketSrcSrc->m_rgEntry[ 0 ];
    pEntryMost[ 0 ] = &pBucketSrcSrc->m_rgEntry[ m_centryBucket ];
    pEntryLast[ 0 ] = NULL;

    // prepare the dst write cursor
    pBucketThis[ 1 ] = pBucketDst;
    pEntryThis[ 1 ] = &pBucketDst->m_rgEntry[ 0 ];
    pEntryMost[ 1 ] = &pBucketDst->m_rgEntry[ m_centryBucket ];
    pEntryLast[ 1 ] = NULL;

    // iterate over all entries in the source bucket
    while ( fTrue )
    {
        // check the read (src) cursor
        if ( pEntryThisSrc < pEntryMostSrc )
        {
            // nop
        }
        else if ( NULL == pBucketNextSrc )
        {
            // all entries have been exhausted
            break;
        }
        else
        {
            // all entries in the current bucket have been exhausted
            if ( pBucketSrcSrc != pBucketThis[ 0 ] )
            {
                // the bucket we are leaving is completely empty and the
                // SrcDst pointer is not using it
                // we need to put it into the available bucket list
                // the bucket ordering should be like this:
                // pBucketThis[0] (src/dst bucket)
                // pBucketSrcSrc (src bucket)
                // pBucketNextSrc (next src bucket)
                DHTAssert( pBucketThis[ 0 ]->m_pBucketNext == pBucketSrcSrc );
                DHTAssert( pBucketSrcSrc->m_pBucketNext == pBucketNextSrc );
                DHTAssert( pBucketNextSrc->m_pBucketPrev == pBucketSrcSrc );
                DHTAssert( pBucketSrcSrc->m_pBucketPrev == pBucketThis[ 0 ] );
                // update the bucket links to "remove" the free bucket
                pBucketThis[ 0 ]->m_pBucketNext = pBucketNextSrc;
                pBucketNextSrc->m_pBucketPrev = pBucketThis[ 0 ];
                // add the bucket to the avail list (singly linked via m_pBucketNext)
                pBucketSrcSrc->m_pBucketNext = pBucketAvail;
                pBucketAvail = pBucketSrcSrc;
            }
            // move to the next bucket
            pEntryThisSrc = &pBucketNextSrc->m_rgEntry[ 0 ];
            pEntryMostSrc = PentryBKTNextMost( pBucketNextSrc );
            pBucketSrcSrc = pBucketNextSrc;
            pBucketNextSrc = PbucketBKTNext( pBucketNextSrc );
        }

        // calculate the hash value
        // iIndex == 0 --> entry stays in the source bucket
        // iIndex == 1 --> entry moves to the destination bucket
        iIndex = BOOL( ( pEntryThisSrc->Hash() & iHashMask ) != iHashSrc );
        DHTAssert( iIndex == 0 || iIndex == 1 );
#ifdef DEBUG
        cEntriesTotalRunning++;
#endif // DEBUG
        // check the write (src/dst or dst) cursor
        if ( pEntryThis[ iIndex ] < pEntryMost[ iIndex ] )
        {
            // nop
        }
        else
        {
            // all entries in the current cursor's bucket are exhausted
            if ( 0 == iIndex )
            {
                // the src/dst cursor will always have a next bucket
                // (the write cursor can never pass the read cursor)
                DHTAssert( pBucketThis[ 0 ]->m_pBucketNext->m_pBucketPrev == pBucketThis[ 0 ] );
                pBucketThis[ 0 ] = pBucketThis[ 0 ]->m_pBucketNext;
                // setup the entry ptrs
                pEntryThis[ 0 ] = &pBucketThis[ 0 ]->m_rgEntry[ 0 ];
                pEntryMost[ 0 ] = &pBucketThis[ 0 ]->m_rgEntry[ m_centryBucket ];
            }
            else
            {
                // the dst cursor must allocate a new bucket
                if ( pBucketAvail )
                {
                    // get a bucket from the avail list
                    const PBUCKET pBucketNew = pBucketAvail;
                    pBucketAvail = pBucketAvail->m_pBucketNext;
                    // chain it
                    pBucketThis[ 1 ]->m_pBucketNext = pBucketNew;
                    pBucketNew->m_pBucketPrev = pBucketThis[ 1 ];
                    // move to it
                    pBucketThis[ 1 ] = pBucketNew;
                }
                else
                {
                    // get a bucket from the reservation pool
                    // (this can happen at most once per split)
                    DHTAssert( !fBucketFromHeap );
                    fBucketFromHeap = fTrue;
                    // allocate it
                    const PBUCKET pBucketReserve = phs->m_bucketpool.PbucketPOOLCommit();
                    DHTAssert( pBucketReserve );
                    STATInsertOverflowBucket();
#ifdef DEBUG
                    // add the heap bucket to our catalog of buckets
                    if ( NULL != rgBucketCheck )
                    {
                        DHTAssert( NULL == rgBucketCheck[ cBucketCheck - 1 ] );
                        rgBucketCheck[ cBucketCheck - 1 ] = pBucketReserve;
                    }
#endif // DEBUG
                    // chain it
                    pBucketThis[ 1 ]->m_pBucketNext = pBucketReserve;
                    pBucketReserve->m_pBucketPrev = pBucketThis[ 1 ];
                    // move to it
                    pBucketThis[ 1 ] = pBucketReserve;
                }
                // setup the entry ptrs
                pEntryThis[ 1 ] = &pBucketThis[ 1 ]->m_rgEntry[ 0 ];
                pEntryMost[ 1 ] = &pBucketThis[ 1 ]->m_rgEntry[ m_centryBucket ];
            }
        }

        // copy the entry
        pEntryThis[ iIndex ]->SetEntry( pEntryThisSrc->m_entry );

        // advance the write (src/dst or dst) cursor
        pEntryLast[ iIndex ] = pEntryThis[ iIndex ];
        pEntryThis[ iIndex ]++;

        // advance the read (src) cursor
        pEntryThisSrc++;
    }

    if ( pBucketSrcSrc == pBucketThis[ 0 ] )
    {
        // nop
    }
    else
    {
        // the last bucket of the src bucket is no longer needed
        // the bucket ordering should be like this:
        // pBucketThis[0] (src/dst bucket)
        // pBucketSrcSrc (src bucket)
        // << NOTHING >>
        DHTAssert( pBucketThis[ 0 ]->m_pBucketNext == pBucketSrcSrc );
        DHTAssert( pBucketSrcSrc->m_pBucketPrev == pBucketThis[ 0 ] );
        // free the bucket
        MEMFree( pBucketSrcSrc );
        STATDeleteOverflowBucket();
#ifdef DEBUG
        // remove the bucket from the bucket-catalog
        // (pointer-value comparison only -- the bucket was just freed)
        if ( NULL != rgBucketCheck )
        {
            for ( iT = 0; iT < cBucketCheck; iT++ )
            {
                if ( rgBucketCheck[iT] == pBucketSrcSrc )
                {
                    rgBucketCheck[iT] = NULL;
                    break;
                }
            }
            DHTAssert( iT < cBucketCheck ); // the bucket better be in the bucket-catalog!
        }
#endif // DEBUG
    }

    // update the next/end ptrs for the src/dst cursor and the dst cursor
    pBucketThis[ 0 ]->m_pEntryLast = pEntryLast[ 0 ];
    pBucketThis[ 1 ]->m_pEntryLast = pEntryLast[ 1 ];

#ifdef DEBUG
    if ( NULL != rgBucketCheck )
    {
        // check each catalogued bucket to see if it is in the pBucketSrcSrc, pBucketDst, or pBucketAvail
        // find and remove all buckets in pBucketSrcSrc
        pbucketTX = pBucketSrcSrcOriginal;
        DHTAssert( pbucketTX );
        while ( pbucketTX )
        {
            for ( iT = 0; iT < cBucketCheck; iT++ )
            {
                if ( rgBucketCheck[iT] == pbucketTX )
                {
                    rgBucketCheck[iT] = NULL;
                    break;
                }
            }
            DHTAssert( iT < cBucketCheck ); // if this goes off, we somehow added a bucket to the
                                            // SOURCE CHAIN -- THIS SHOULD NEVER HAPPEN! also, we
                                            // never catalogued the bucket!
            pbucketTX = PbucketBKTNext( pbucketTX );
        }
        // find and remove all buckets in pBucketDst
        pbucketTX = pBucketDstOriginal;
        DHTAssert( pbucketTX );
        while ( pbucketTX )
        {
            for ( iT = 0; iT < cBucketCheck; iT++ )
            {
                if ( rgBucketCheck[iT] == pbucketTX )
                {
                    rgBucketCheck[iT] = NULL;
                    break;
                }
            }
            DHTAssert( iT < cBucketCheck ); // if this goes off, we added a bucket to the destination
                                            // chain, but it was never catalogued! first question: where
                                            // did the bucket come from if didn't catalogue it???
            pbucketTX = PbucketBKTNext( pbucketTX );
        }
        // find and remove all buckets in pBucketAvail
        pbucketTX = pBucketAvail;
        while ( pbucketTX )
        {
            for ( iT = 0; iT < cBucketCheck; iT++ )
            {
                if ( rgBucketCheck[iT] == pbucketTX )
                {
                    rgBucketCheck[iT] = NULL;
                    break;
                }
            }
            DHTAssert( iT < cBucketCheck ); // if this goes off, we have a free bucket that was never
                                            // catalogued! where did it come from?
                                            // NOTE: this is not a memleak, it is a "we-never-catalogued-it"
                                            // problem; the memory will be freed later in this function
            pbucketTX = pbucketTX->m_pBucketNext;
        }
        // the list should now be empty -- verify this
        for ( iT = 0; iT < cBucketCheck; iT++ )
        {
            // if this goes off, rgBucketCheck[iT] contains a bucket that was abandoned without
            // being freed!
            DHTAssert( rgBucketCheck[iT] == NULL );
        }
        // free the list
        MEMFree( rgBucketCheck );
    }

    size_t cEntriesAfterwards = 0;
    // make sure the number of entries we processed matches the number of entries we started with
    DHTAssert( cEntriesTotal == cEntriesTotalRunning );
    // make sure we have all the entries we started with
    pbktT = pBucketSrcSrcOriginal;
    if ( pbktT->m_pb != NULL )
    {
        while ( pbktT )
        {
            pbktNextT = PbucketBKTNext( pbktT );
            if ( pbktNextT )
            {
                // full bucket
                cEntriesAfterwards += size_t( m_centryBucket );
            }
            else
            {
                // partial bucket (not empty)
                cEntriesAfterwards += 1 + ( pbktT->m_pEntryLast - &pbktT->m_rgEntry[0] );
            }
            pbktT = pbktNextT;
        }
    }
    pbktT = pBucketDstOriginal;
    if ( pbktT->m_pb != NULL )
    {
        while ( pbktT )
        {
            pbktNextT = PbucketBKTNext( pbktT );
            if ( pbktNextT )
            {
                // full bucket
                cEntriesAfterwards += size_t( m_centryBucket );
            }
            else
            {
                // partial bucket (not empty)
                cEntriesAfterwards += 1 + ( pbktT->m_pEntryLast - &pbktT->m_rgEntry[0] );
            }
            pbktT = pbktNextT;
        }
    }
    DHTAssert( cEntriesAfterwards == cEntriesTotal );
#endif

    // free the avail list (buckets that were recycled but never reused)
    while ( pBucketAvail )
    {
        PBUCKET pBucketT;
        pBucketT = pBucketAvail;
        pBucketAvail = pBucketAvail->m_pBucketNext;
        MEMFree( pBucketT );
        STATDeleteOverflowBucket();
    }

    if ( !fBucketFromHeap )
    {
        phs->m_bucketpool.POOLUnreserve(); // cancel the heap reservation (we never used it)
    }

    STATSplitBucket();
}
// work-horse for shrinking a bucket
//
// appends every entry of the write-locked source bucket chain to the end
// of the write-locked destination bucket chain, then marks the source
// bucket empty (m_pb = NULL).  overflow buckets hanging off the source
// chain are re-linked wholesale onto the destination chain; at most one
// additional bucket is committed from the reservation the caller made in
// phs->m_bucketpool (the reservation is cancelled if unused).
void BKTIDoMerge( HOTSTUFF* const phs,
                  PBUCKET pBucketSrc,
                  PBUCKET pBucketDst )
{
#ifdef DEBUG
    // catalog each BUCKET structure and make sure they end up in the destination bucket
    PBUCKET pBucketDstOriginal = pBucketDst;
    PBUCKET *rgBucketCheck = NULL, pbucketT;
    size_t cBucketCheck = 0, iT;

    pbucketT = pBucketSrc;
    while ( pbucketT )
    {
        cBucketCheck++;
        pbucketT = PbucketBKTNext( pbucketT );
    }
    pbucketT = pBucketDst;
    while ( pbucketT )
    {
        cBucketCheck++;
        pbucketT = PbucketBKTNext( pbucketT );
    }
    cBucketCheck++; // account for bucket from heap
    rgBucketCheck = (PBUCKET *)PvMEMAlloc( cBucketCheck * sizeof( PBUCKET ) );
    if ( NULL != rgBucketCheck )
    {
        iT = 0;
        pbucketT = pBucketSrc;
        while ( pbucketT )
        {
            rgBucketCheck[ iT++ ] = pbucketT;
            pbucketT = PbucketBKTNext( pbucketT );
        }
        pbucketT = pBucketDst;
        while ( pbucketT )
        {
            rgBucketCheck[ iT++ ] = pbucketT;
            pbucketT = PbucketBKTNext( pbucketT );
        }
        rgBucketCheck[ iT++ ] = NULL; // heap bucket
        DHTAssert( iT == cBucketCheck );
    }

    // count the number of entries we will be handling
    size_t cEntriesTotal = 0;
    PBUCKET pbktT, pbktNextT;
    pbktT = pBucketSrc;
    if ( pbktT->m_pb != NULL )
    {
        while ( pbktT )
        {
            pbktNextT = PbucketBKTNext( pbktT );
            if ( pbktNextT )
            {
                // full bucket
                cEntriesTotal += size_t( m_centryBucket );
            }
            else
            {
                // partial bucket (not empty)
                cEntriesTotal += 1 + ( pbktT->m_pEntryLast - &pbktT->m_rgEntry[0] );
            }
            pbktT = pbktNextT;
        }
    }
    pbktT = pBucketDst;
    if ( pbktT->m_pb != NULL )
    {
        while ( pbktT )
        {
            pbktNextT = PbucketBKTNext( pbktT );
            if ( pbktNextT )
            {
                // full bucket
                cEntriesTotal += size_t( m_centryBucket );
            }
            else
            {
                // partial bucket (not empty)
                cEntriesTotal += 1 + ( pbktT->m_pEntryLast - &pbktT->m_rgEntry[0] );
            }
            pbktT = pbktNextT;
        }
    }
#endif

    // read (src) cursor
    CKeyEntry *pEntryThisSrc;
    CKeyEntry *pEntryMostSrc;

    // write (dst) cursor
    CKeyEntry *pEntryThisDst;
    CKeyEntry *pEntryMostDst;

    // remember if we have moved to the last bucket or not
    BOOL fSetEndPtr;

    // remember if we allocated a bucket from the heap
    BOOL fBucketFromHeap = fFalse;

    // efficiency variables
    PBUCKET pBucketT;

    // move to the end of the dst bucket
    pBucketT = PbucketBKTNext( pBucketDst );
    while ( pBucketT )
    {
        pBucketDst = pBucketT;
        pBucketT = PbucketBKTNext( pBucketT );
    }
    pEntryThisDst = PentryBKTNextMost( pBucketDst );
    pEntryMostDst = &pBucketDst->m_rgEntry[ m_centryBucket ];

    if ( !PbucketBKTNext( pBucketSrc ) )
    {
        // the src bucket does not have extra bucket structures
        // setup the src cursor for a partial pass
        pEntryThisSrc = &pBucketSrc->m_rgEntry[ 0 ];
        pEntryMostSrc = PentryBKTNextMost( pBucketSrc );
        // we are not appending buckets from the src bucket, so we will be setting the
        // end ptr of the dst bucket iff we add entries from the src bucket
        fSetEndPtr = BOOL( pEntryThisSrc < pEntryMostSrc );
    }
    else
    {
        // the src bucket has extra bucket structures
        // attach the extra bucket structures to the dst bucket
        // (re-linked wholesale; their entries are not copied individually)
        pBucketDst->m_pBucketNext = pBucketSrc->m_pBucketNext;
        pBucketDst->m_pBucketNext->m_pBucketPrev = pBucketDst;
        // setup the src cursor for a full pass over the first src bucket
        pEntryThisSrc = &pBucketSrc->m_rgEntry[ 0 ];
        pEntryMostSrc = &pBucketSrc->m_rgEntry[ m_centryBucket ];
        // we are appending buckets from the src bucket, so we will not be setting the
        // end ptr of the dst bucket because we are no longer in the last bucket
        // of the dst bucket chain
        fSetEndPtr = fFalse;
    }

    // copy the entries in the src bucket
    while ( pEntryThisSrc < pEntryMostSrc )
    {
        // check the dst cursor
        if ( pEntryThisDst < pEntryMostDst )
        {
            // nop
        }
        else
        {
            // all entries in the dst bucket are exhausted
            if ( !fSetEndPtr )
            {
                // we are not in the last bucket of the dst bucket because there is no end ptr
                // walk to the true end of the (now extended) dst chain
                pBucketT = PbucketBKTNext( pBucketDst );
                DHTAssert( pBucketT );
                do
                {
                    pBucketDst = pBucketT;
                    pBucketT = PbucketBKTNext( pBucketT );
                }
                while ( pBucketT );
                // setup the dst cursor
                pEntryThisDst = pBucketDst->m_pEntryLast + 1;
                pEntryMostDst = &pBucketDst->m_rgEntry[ m_centryBucket ];
                // we are now able to set the end ptr because we are in the last bucket
                // of the dst bucket
                fSetEndPtr = fTrue;
                // restart the loop
                continue;
            }
            // we were at the last bucket in the dst bucket
            // get a bucket from the heap reservation pool
            // (this can happen at most once per merge)
            DHTAssert( !fBucketFromHeap );
            fBucketFromHeap = fTrue;
            // commit the reservation now
            pBucketT = phs->m_bucketpool.PbucketPOOLCommit();
            DHTAssert( pBucketT );
            STATInsertOverflowBucket();
            // chain the heap bucket
            pBucketDst->m_pBucketNext = pBucketT;
            pBucketT->m_pBucketPrev = pBucketDst;
            // setup the dst cursor
            pBucketDst = pBucketT;
            pEntryThisDst = &pBucketDst->m_rgEntry[ 0 ];
            pEntryMostDst = &pBucketDst->m_rgEntry[ m_centryBucket ];
#ifdef DEBUG
            // add the heap bucket to our catalog of buckets
            if ( NULL != rgBucketCheck )
            {
                DHTAssert( rgBucketCheck[cBucketCheck - 1] == NULL );
                rgBucketCheck[cBucketCheck - 1] = pBucketT;
            }
#endif // DEBUG
        }

        // copy the entry
        pEntryThisDst->SetEntry( pEntryThisSrc->m_entry );

        // advance the cursors
        pEntryThisSrc++;
        pEntryThisDst++;
    }

    // mark the src bucket as empty
    pBucketSrc->m_pb = NULL;

    if ( fSetEndPtr )
    {
        // set the end of the destination bucket
        DHTAssert( pEntryThisDst != &pBucketDst->m_rgEntry[ 0 ] );
        pBucketDst->m_pEntryLast = pEntryThisDst - 1;
    }
    else
    {
        // we do not need to set the end ptr of the dst bucket
        // nop
    }

    if ( !fBucketFromHeap )
    {
        // cancel the unused heap reservation
        phs->m_bucketpool.POOLUnreserve();
    }

#ifdef DEBUG
    if ( NULL != rgBucketCheck )
    {
        // check each catalogued bucket to see if it is in the pBucketDst bucket
        pbucketT = pBucketDstOriginal;
        DHTAssert( pbucketT );
        // find an remove all buckets found in the destiantion bucket from our list
        while ( pbucketT )
        {
            for ( iT = 0; iT < cBucketCheck; iT++ )
            {
                if ( rgBucketCheck[iT] == pbucketT )
                {
                    rgBucketCheck[iT] = NULL;
                    break;
                }
            }
            DHTAssert( iT < cBucketCheck ); // if this goes off, we somehow got a bucket
                                            // into the chain that shouldn't be there
                                            // (it is a bucket we never catalogued!)
            pbucketT = PbucketBKTNext( pbucketT );
        }
        // find an remove pBucketSrc from our list
        for ( iT = 0; iT < cBucketCheck; iT++ )
        {
            if ( rgBucketCheck[iT] == pBucketSrc )
            {
                rgBucketCheck[iT] = NULL;
                break;
            }
        }
        DHTAssert( iT < cBucketCheck ); // if this goes off, somehow the FIXED source bucket
                                        // got removed from our catalogue OR pBucketSrc was
                                        // changed (which should never happen)
        // the list should now be empty -- verify this
        for ( iT = 0; iT < cBucketCheck; iT++ )
        {
            // if this goes off, rgBucketCheck[iT] contains a bucket that was abandoned without
            // being freed!
            DHTAssert( rgBucketCheck[iT] == NULL );
        }
        // free the list
        MEMFree( rgBucketCheck );
    }

    // make sure the number of entries has not changed since we started
    size_t cEntriesAfterwards = 0;
    pbktT = pBucketDstOriginal;
    if ( pbktT->m_pb != NULL )
    {
        while ( pbktT )
        {
            pbktNextT = PbucketBKTNext( pbktT );
            if ( pbktNextT )
            {
                // full bucket
                cEntriesAfterwards += size_t( m_centryBucket );
            }
            else
            {
                // partial bucket (not empty)
                cEntriesAfterwards += 1 + ( pbktT->m_pEntryLast - &pbktT->m_rgEntry[0] );
            }
            pbktT = pbktNextT;
        }
    }
    DHTAssert( cEntriesAfterwards == cEntriesTotal );
#endif

    STATMergeBucket();
}
  2322. /////////////////////////////////////////////////////////////////////////////////////////
  2323. //
  2324. // mechanisms for implementing the dynamic-hash-table policies
  2325. //
  2326. // hash to the correct HOTSTUFF element
  2327. HOTSTUFF *HOTSTUFFHash() const
  2328. {
  2329. return m_rghs + OSSYNC::OSSyncGetCurrentProcessor();
  2330. }
// statistics

// account for one entry inserted into the table
void STATInsertEntry( HOTSTUFF* const phs )
{
    // atomically add 1 to the per-HOTSTUFF entry count
    AtomicExchangeAddPointer( (void**)&phs->m_cEntry, (void*)1 );
    // NOTE(review): m_cOp is bumped without synchronization -- presumably an
    // intentionally approximate per-processor operation counter; confirm
    phs->m_cOp++;
}
// account for one entry removed from the table
void STATDeleteEntry( HOTSTUFF* const phs )
{
    // atomically subtract 1 from the per-HOTSTUFF entry count
    AtomicExchangeAddPointer( (void**)&phs->m_cEntry, (void*)-1 );
    // approximate (unsynchronized) operation counter -- see STATInsertEntry
    phs->m_cOp++;
}
// count one overflow-bucket allocation (DHT_STATS builds only; no-op otherwise)
void STATInsertOverflowBucket()
{
#ifdef DHT_STATS
    m_cBucketOverflowInsert++;
#endif // DHT_STATS
}
// count one overflow-bucket deallocation (DHT_STATS builds only; no-op otherwise)
void STATDeleteOverflowBucket()
{
#ifdef DHT_STATS
    m_cBucketOverflowDelete++;
#endif // DHT_STATS
}
// count one completed bucket split (DHT_STATS builds only; no-op otherwise)
void STATSplitBucket()
{
#ifdef DHT_STATS
    m_cBucketSplit++;
#endif // DHT_STATS
}
// count one completed bucket merge (DHT_STATS builds only; no-op otherwise)
void STATMergeBucket()
{
#ifdef DHT_STATS
    m_cBucketMerge++;
#endif // DHT_STATS
}
// count one directory split (DHT_STATS builds only; no-op otherwise)
void STATSplitDirectory()
{
#ifdef DHT_STATS
    m_cDirSplit++;
#endif // DHT_STATS
}
// count one directory merge (DHT_STATS builds only; no-op otherwise)
void STATMergeDirectory()
{
#ifdef DHT_STATS
    m_cDirMerge++;
#endif // DHT_STATS
}
// count one state-machine transition (DHT_STATS builds only; no-op otherwise)
void STATStateTransition()
{
#ifdef DHT_STATS
    m_cTransition++;
#endif // DHT_STATS
}
// count one maintenance-policy selection (DHT_STATS builds only; no-op otherwise)
void STATPolicySelection()
{
#ifdef DHT_STATS
    m_cSelection++;
#endif // DHT_STATS
}
// count one split abandoned due to lock contention (DHT_STATS builds only)
void STATSplitContention()
{
#ifdef DHT_STATS
    m_cSplitContend++;
#endif // DHT_STATS
}
// count one merge abandoned due to lock contention (DHT_STATS builds only)
void STATMergeContention()
{
#ifdef DHT_STATS
    m_cMergeContend++;
#endif // DHT_STATS
}
  2402. // amortized table maintenance
  2403. void PerformMaintenance()
  2404. {
  2405. // enter the state machine
  2406. HOTSTUFF* phs;
  2407. const int iGroup = UiSTEnter( &phs );
  2408. const ENUMSTATE esCurrent = EsSTGetState();
  2409. // carry out the current policy
  2410. if ( esCurrent == stateGrow )
  2411. {
  2412. BKTISplit( phs );
  2413. }
  2414. else if ( esCurrent == stateShrink )
  2415. {
  2416. BKTIMerge( phs );
  2417. }
  2418. // leave the state machine
  2419. STLeave( iGroup, phs );
  2420. }
// choose the next maintenance policy (grow/shrink/transition) from the
// table's current size and recent operation rate
//
// called with m_semPolicy held; ownership of the semaphore is either
// released here or handed to STTransition (which presumably releases it
// after the state change -- confirm against STTransition's definition)
void SelectMaintenancePolicy( HOTSTUFF* const phs )
{
    // collect information on the current state of the hash table
    const ENUMSTATE esCurrent = EsSTGetState();
    const NativeCounter cBucketMax = NcDIRIGetBucketMax( esCurrent );
    const NativeCounter cBucket = NcDIRIGetBucket( esCurrent );
    const NativeCounter cBucketActive = cBucketMax + cBucket;
    const NativeCounter cOpLocal = phs->m_cOp;

    // compute the current entry count and op count and reset the op count
    NativeCounter cEntry = 0;
    NativeCounter cOp = 0;
    for ( NativeCounter ihs = 0; ihs < m_chs; ihs++ )
    {
        cEntry += m_rghs[ ihs ].m_cEntry;
        cOp += m_rghs[ ihs ].m_cOp;
        m_rghs[ ihs ].m_cOp = 0;
    }

    // compute the ideal entry count
    const NativeCounter cEntryIdeal = m_cLoadFactor * cBucketActive;

    // compute the max entry count
    const NativeCounter cEntryMax = m_centryBucket * cBucketActive;

    // determine our current flexibility in the entry count
    // NOTE(review): unsigned subtraction -- assumes m_centryBucket >= m_cLoadFactor
    // and cEntryMax / 2 >= cEntryIdeal (i.e. load factor <= half a bucket); confirm
    // these invariants are enforced at construction
    const NativeCounter cEntryFlexibility = max( m_centryBucket - m_cLoadFactor, cEntryMax / 2 - cEntryIdeal );

    // determine our current threshold sensitivity
    const NativeCounter cOpSensitivity = max( 1, cEntryFlexibility / 2 );

    // approximate the local (per-HOTSTUFF) threshold sensitivity
    // NOTE(review): divides by cOpLocal -- relies on the caller (MaintainTable)
    // only invoking us when phs->m_cOp > m_cOpSensitivity >= 0, so cOpLocal > 0
    const NativeCounter ratio = ( cOp + cOpLocal - 1 ) / cOpLocal;
    const NativeCounter cOpSensitivityLocal = max( 1, cOpSensitivity / ratio );

    // compute the preferred entry count, clamped so that a single policy
    // change never moves more than (flexibility - sensitivity) entries
    NativeCounter cEntryPreferred = cEntry;
    if ( cEntryIdeal + ( cEntryFlexibility - cOpSensitivity ) < cEntry )
    {
        cEntryPreferred = cEntry - ( cEntryFlexibility - cOpSensitivity );
    }
    else if ( cEntryIdeal > cEntry + ( cEntryFlexibility - cOpSensitivity ) )
    {
        cEntryPreferred = cEntry + ( cEntryFlexibility - cOpSensitivity );
    }

    // compute the preferred bucket count (never below the configured minimum)
    const NativeCounter cBucketPreferred = max( m_cbucketMin, ( cEntryPreferred + m_cLoadFactor - 1 ) / m_cLoadFactor );

    // determine the new policy
    ENUMSTATE esNew = stateNil;
    if ( esCurrent == stateGrow )
    {
        if ( cBucketPreferred < cBucketActive )
        {
            // growing but the table should shrink
            esNew = stateShrinkFromGrow;
        }
        else if ( cBucketPreferred > cBucketActive )
        {
            if ( cBucket == cBucketMax )
            {
                // current directory level is full -- split the directory first
                esNew = stateSplitFromGrow;
            }
        }
    }
    else
    {
        DHTAssert( esCurrent == stateShrink );
        if ( cBucketPreferred < cBucketActive )
        {
            if ( cBucket == 0 )
            {
                // current directory level is empty -- merge the directory first
                esNew = stateMergeFromShrink;
            }
        }
        else if ( cBucketPreferred > cBucketActive )
        {
            // shrinking but the table should grow
            esNew = stateGrowFromShrink;
        }
    }

    // enact the new policy (write only on change to avoid dirtying the cache line)
    if ( m_cOpSensitivity != cOpSensitivityLocal )
    {
        m_cOpSensitivity = cOpSensitivityLocal;
    }
    if ( m_cBucketPreferred != cBucketPreferred )
    {
        m_cBucketPreferred = cBucketPreferred;
    }
    if ( esNew )
    {
        STTransition( esNew );
    }
    else
    {
        m_semPolicy.Release();
    }

    STATPolicySelection();
}
// periodic hook called on the operation path: re-evaluates the resize
// policy when the op-count threshold is breached and performs one unit
// of amortized split/merge work when the table is not at its preferred size
void MaintainTable( HOTSTUFF* const phs )
{
    // decide on a new policy if we may have breached one of our
    // thresholds
    // (double-checked: test, try-acquire the policy semaphore, then
    // re-test -- SelectMaintenancePolicy resets every m_cOp, so a
    // concurrent selection can invalidate the first test)
    if ( phs->m_cOp > m_cOpSensitivity &&
         m_semPolicy.CAvail() &&
         m_semPolicy.FTryAcquire() )
    {
        if ( phs->m_cOp > m_cOpSensitivity )
        {
            // SelectMaintenancePolicy releases (or hands off) m_semPolicy
            SelectMaintenancePolicy( phs );
        }
        else
        {
            m_semPolicy.Release();
        }
    }

    // perform amortized work on the table as necessary
    // (i.e. when the active bucket count differs from the preferred count
    // in the direction of the current grow/shrink state)
    if ( NcDIRIGetBucketMax( stateGrow ) + NcDIRIGetBucket( stateGrow ) < m_cBucketPreferred ||
         m_cBucketPreferred < NcDIRIGetBucketMax( stateShrink ) + NcDIRIGetBucket( stateShrink ) )
    {
        PerformMaintenance();
    }
}
  2535. public:
  2536. // calculate the address of the aligned block and store its offset (for free)
  2537. static void* PvMEMIAlign( void* const pv, const size_t cbAlign )
  2538. {
  2539. // round up to the nearest cache line
  2540. // NOTE: this formula always forces an offset of at least 1 byte
  2541. const ULONG_PTR ulp = ULONG_PTR( pv );
  2542. const ULONG_PTR ulpAligned = ( ( ulp + cbAlign ) / cbAlign ) * cbAlign;
  2543. const ULONG_PTR ulpOffset = ulpAligned - ulp;
  2544. DHTAssert( ulpOffset > 0 );
  2545. DHTAssert( ulpOffset <= cbAlign );
  2546. DHTAssert( ulpOffset == BYTE( ulpOffset ) ); // must fit into a single BYTE
  2547. // store the offset
  2548. BYTE *const pbAligned = (BYTE*)ulpAligned;
  2549. pbAligned[ -1 ] = BYTE( ulpOffset );
  2550. // return the aligned block
  2551. return (void*)pbAligned;
  2552. }
  2553. // retrieve the original unaligned block of memory from the aligned block
  2554. static void* PvMEMIUnalign( void* const pv )
  2555. {
  2556. // read the offset of the real block
  2557. BYTE *const pbAligned = (BYTE*)pv;
  2558. const BYTE bOffset = pbAligned[ -1 ];
  2559. DHTAssert( bOffset > 0 );
  2560. // return the real unaligned block
  2561. return (void*)( pbAligned - bOffset );
  2562. }
  2563. // allocate memory
  2564. static void* PvMEMAlloc( const size_t cbSize, const size_t cbAlign = cbCacheLine )
  2565. {
  2566. void* const pv = new BYTE[ cbSize + cbAlign ];
  2567. if ( pv )
  2568. {
  2569. return PvMEMIAlign( pv, cbAlign );
  2570. }
  2571. return NULL;
  2572. }
  2573. // free memory
  2574. static void MEMFree( void* const pv )
  2575. {
  2576. if ( pv )
  2577. {
  2578. delete [] ((BYTE*)PvMEMIUnalign( pv ));
  2579. }
  2580. }
private:
    // Members are grouped by write frequency; each group is followed by a
    // reserved byte array whose size differs per pointer width — presumably
    // padding groups apart to limit cache-line contention (see cbCacheLine
    // use in ErrInit) — TODO confirm sizes against target cache-line size.

    // ----- never written after ErrInit -----
    NativeCounter       m_cLoadFactor;       // preferred number of entries in a bucket at any given time
    NativeCounter       m_centryBucket;      // maximum number of entries per bucket
    NativeCounter       m_cbBucket;          // size in bytes of a bucket (rounded up to the nearest full cache-line)
    NativeCounter       m_rankDHTrwlBucket;  // deadlock-detection rank of the reader/writer lock on each bucket
    HOTSTUFF            *m_rghs;             // array of HOTSTUFF objects (hashed per processor)
    NativeCounter       m_chs;               // size of HOTSTUFF array
    NativeCounter       m_cbucketMin;        // minimum number of buckets in the hash-table
#ifdef _WIN64
    BYTE                m_rgbRsvdNever[ 8 ];
#else // !_WIN64
    BYTE                m_rgbRsvdNever[ 4 ];
#endif // _WIN64

    // ----- rarely written -----
    DIRPTRS             m_dirptrs[ 2 ];      // directory pointers (2 copies, selected by table state)
    BYTE                *m_rgrgBucket[ cbitNativeCounter ]; // directory (array of arrays of buckets;
                                             // one slot per bit of NativeCounter)
    // no padding necessary

    // ----- often written -----
    NativeCounter       m_cOpSensitivity;    // used to regulate policy changes
    NativeCounter       m_cBucketPreferred;  // preferred table size
    ENUMSTATE           m_stateCur;          // current state of the grow/shrink state machine
#ifdef _WIN64
    BYTE                m_rgbRsvdOften[ 44 ];
#else // !_WIN64
    BYTE                m_rgbRsvdOften[ 20 ];
#endif // _WIN64

    // ----- always written (second only to HOTSTUFF members) -----
    OSSYNC::CSemaphore  m_semPolicy;         // used to serialize policy changes
    long                m_cCompletions;      // counts the number of metered-section completions
#ifdef _WIN64
    BYTE                m_rgbRsvdAlways[ 52 ];
#else // !_WIN64
    BYTE                m_rgbRsvdAlways[ 24 ];
#endif // _WIN64

#ifdef DHT_STATS
    // performance statistics
    long                m_cBucketOverflowInsert; // count of overflow bucket allocations
    long                m_cBucketOverflowDelete; // count of overflow bucket deletions
    long                m_cBucketSplit;          // count of bucket split operations
    long                m_cBucketMerge;          // count of bucket merge operations
    long                m_cDirSplit;             // count of directory split operations
    long                m_cDirMerge;             // count of directory merge operations
    long                m_cTransition;           // count of state transitions
    long                m_cSelection;            // count of policy selections
    long                m_cSplitContend;         // count of split contentions
    long                m_cMergeContend;         // count of merge contentions
#ifdef _WIN64
    BYTE                m_rgbRsvdPerf[ 24 ];
#else // !_WIN64
    BYTE                m_rgbRsvdPerf[ 24 ];
#endif // _WIN64
#endif // DHT_STATS

#ifdef DEBUG
    BOOL                m_fInit;             // initialization flag (set only when ErrInit succeeds)
#endif // DEBUG
};
  2638. /////////////////////////////////////////////////////////////////////////////////////
  2639. //
  2640. // CDynamicHashTable< CKey, CEntry >
  2641. //
  2642. /////////////////////////////////////////////////////////////////////////////////////
  2643. // ctor
  2644. template< class CKey, class CEntry >
  2645. inline CDynamicHashTable< CKey, CEntry >::
  2646. CDynamicHashTable( const NativeCounter rankDHTrwlBucket )
  2647. : m_semPolicy( CSyncBasicInfo( "CDynamicHashTable::m_semPolicy" ) )
  2648. {
  2649. #ifdef DEBUG
  2650. m_fInit = fFalse;
  2651. // zero-out this memory so the debugger won't print garbage
  2652. memset( m_rgbRsvdNever, 0, sizeof( m_rgbRsvdNever ) );
  2653. memset( m_rgbRsvdOften, 0, sizeof( m_rgbRsvdOften ) );
  2654. memset( m_rgbRsvdAlways, 0, sizeof( m_rgbRsvdAlways ) );
  2655. #ifdef DHT_STATS
  2656. memset( m_rgbRsvdPerf, 0, sizeof( m_rgbRsvdPerf ) );
  2657. #endif // DHT_STATS
  2658. #endif
  2659. // we should be on a 32-bit or 64-bit system
  2660. #ifdef _WIN64
  2661. DHTAssert( 8 == sizeof( NativeCounter ) );
  2662. #else // _!WIN64
  2663. DHTAssert( 4 == sizeof( NativeCounter ) );
  2664. #endif // _WIN64
  2665. // capture the rank for each bucket
  2666. m_rankDHTrwlBucket = rankDHTrwlBucket;
  2667. // prepare each semaphore so it can have 1 owner
  2668. m_semPolicy.Release();
  2669. }
// dtor -- intentionally empty: all teardown (directory, HOTSTUFF array)
// is performed by Term(), which the owner is expected to call
template< class CKey, class CEntry >
inline CDynamicHashTable< CKey, CEntry >::
~CDynamicHashTable()
{
}
// initializes the dynamic hash table. if the table cannot be initialized,
// errOutOfMemory will be returned
//
//  dblLoadFactor   -- preferred number of entries per bucket
//  dblUniformity   -- scale factor reflecting expected hash quality; the
//                     product dblLoadFactor * dblUniformity is handed to
//                     the directory as the effective per-bucket target
//  cBucketMinimum  -- floor on the bucket count (also the initial
//                     preferred size)
//
// on failure the table is cleaned up via Term() and may be re-initialized
template< class CKey, class CEntry >
inline typename CDynamicHashTable< CKey, CEntry >::ERR CDynamicHashTable< CKey, CEntry >::
ErrInit(    const double        dblLoadFactor,
            const double        dblUniformity,
            const NativeCounter cBucketMinimum )
{
    ERR             err;
    NativeCounter   ihs;

    DHTAssert( !m_fInit );

    // initialize all data by its cache-line grouping

    // never written
    m_cLoadFactor = 0;
    m_centryBucket = 0;
    m_cbBucket = 0;
    m_rghs = NULL;
    // one HOTSTUFF context per possible processor
    m_chs = OSSYNC::OSSyncGetProcessorCountMax();
    m_cbucketMin = 0;

    // rarely written
    memset( m_dirptrs, 0, sizeof( m_dirptrs ) );
    memset( m_rgrgBucket, 0, sizeof( m_rgrgBucket ) );

    // often written
    m_cOpSensitivity = 0;
    m_cBucketPreferred = cBucketMinimum;
    // NOTE: we cannot start in stateFreeze because we must go through the "halfway" completion
    // function so that we copy the directory ptrs safely
    m_stateCur = stateGrow;

    // always written
    m_cCompletions = 0;

#ifdef DHT_STATS
    // performance statistics
    m_cBucketOverflowInsert = 0;
    m_cBucketOverflowDelete = 0;
    m_cBucketSplit = 0;
    m_cBucketMerge = 0;
    m_cDirSplit = 0;
    m_cDirMerge = 0;
    m_cTransition = 0;
    m_cSelection = 0;
    m_cSplitContend = 0;
    m_cMergeContend = 0;
#endif // DHT_STATS

    // allocate the HOTSTUFF array (cache-line aligned, via PvMEMAlloc, so
    // per-processor counters don't share cache lines)
    m_rghs = (HOTSTUFF*)PvMEMAlloc( m_chs * sizeof( HOTSTUFF ), cbCacheLine );
    if ( !m_rghs )
    {
        err = errOutOfMemory;
        goto HandleError;
    }

    // construct the HOTSTUFF objects in place (raw memory from PvMEMAlloc)
    for ( ihs = 0; ihs < m_chs; ihs++ )
    {
        new( m_rghs + ihs ) HOTSTUFF();
    }

    // initialize the directory
    err = ErrDIRInit( NativeCounter( dblLoadFactor * dblUniformity ), cBucketMinimum );
    if ( err != errSuccess )
    {
        goto HandleError;
    }

#ifdef DEBUG
    m_fInit = fTrue;
#endif // DEBUG
    return errSuccess;

HandleError:
    DHTAssert( err != errSuccess );
    // Term handles a partially initialized table
    Term();
    return err;
}
  2746. // terminates the dynamic hash table. this function can be called even if the
  2747. // hash table has never been initialized or is only partially initialized
  2748. //
  2749. // NOTE: any data stored in the table at this time will be lost!
  2750. template< class CKey, class CEntry >
  2751. inline void CDynamicHashTable< CKey, CEntry >::
  2752. Term()
  2753. {
  2754. #ifdef DEBUG
  2755. m_fInit = fFalse;
  2756. #endif // DEBUG
  2757. // term the directory
  2758. DIRTerm();
  2759. if ( NULL != m_rghs )
  2760. {
  2761. // delete the HOTSTUFF aray
  2762. while ( m_chs )
  2763. {
  2764. // destruct the object
  2765. m_chs--;
  2766. m_rghs[ m_chs ].HOTSTUFF::~HOTSTUFF();
  2767. }
  2768. MEMFree( m_rghs );
  2769. m_rghs = NULL;
  2770. }
  2771. }
// acquires a read lock on the specified key and returns the lock in the
// provided lock context; pair with ReadUnlockKey. on return the context's
// currency is positioned on the key's entry if one exists
template< class CKey, class CEntry >
inline void CDynamicHashTable< CKey, CEntry >::
ReadLockKey( const CKey& key, CLock* const plock )
{
    DHTAssert( m_fInit );
    // verify the lock context is not already in use
    DHTAssert( plock->m_ls == CLock::lsNil );
    // initialize the lock
    plock->m_ls = CLock::lsRead;
    // enter the state machine for the duration of the bucket lookup
    // (UiSTEnter/STLeave bracket the table-state-sensitive work)
    const int iGroup = UiSTEnter( &plock->m_phs );
    const ENUMSTATE esCurrent = EsSTGetState();
    // read-lock the key's bucket through the directory
    DIRReadLockKey( esCurrent, key, plock );
    // try to seek to the key (sets up currency)
    BKTSeek( plock, key );
    // leave the state machine
    STLeave( iGroup, plock->m_phs );
}
// releases the read lock in the specified lock context and resets the
// context so it may be reused
template< class CKey, class CEntry >
inline void CDynamicHashTable< CKey, CEntry >::
ReadUnlockKey( CLock* const plock )
{
    DHTAssert( m_fInit );
    // verify the context holds a read lock on a bucket
    DHTAssert( FBKTRead( plock ) );
    DHTAssert( plock->m_pBucketHead != NULL );
    DHTAssert( plock->m_pBucketHead->CRWL().FReader() );
    // unlock the key through the directory
    DIRReadUnlockKey( plock );
    // reset the lock
    plock->m_ls = CLock::lsNil;
}
// acquires a write lock on the specified key and returns the lock in the
// provided lock context; pair with WriteUnlockKey. on return the context's
// currency is positioned on the key's entry if one exists
template< class CKey, class CEntry >
inline void CDynamicHashTable< CKey, CEntry >::
WriteLockKey( const CKey& key, CLock* const plock )
{
    DHTAssert( m_fInit );
    // verify the lock context is not already in use
    DHTAssert( plock->m_ls == CLock::lsNil );
    // initialize the lock; the insert/delete flag is consumed at unlock
    // time to decide whether table maintenance is warranted
    plock->m_ls = CLock::lsWrite;
    plock->m_fInsertOrDelete = fFalse;
    // enter the state machine for the duration of the bucket lookup
    const int iGroup = UiSTEnter( &plock->m_phs );
    const ENUMSTATE esCurrent = EsSTGetState();
    // write-lock the key's bucket through the directory
    DIRWriteLockKey( esCurrent, key, plock );
    // try to seek to the key (sets up currency)
    BKTSeek( plock, key );
    // leave the state machine
    STLeave( iGroup, plock->m_phs );
}
// releases the write lock in the specified lock context, performing any
// pending table maintenance if an insert/delete occurred under the lock,
// and resets the context for reuse
template< class CKey, class CEntry >
inline void CDynamicHashTable< CKey, CEntry >::
WriteUnlockKey( CLock* const plock )
{
    DHTAssert( m_fInit );
    // verify the context holds a write lock on a bucket
    DHTAssert( FBKTWrite( plock ) );
    DHTAssert( plock->m_pBucketHead != NULL );
    DHTAssert( plock->m_pBucketHead->CRWL().FWriter() );
    // unlock the key through the directory
    DIRWriteUnlockKey( plock );
    // we performed an insert or delete while holding the write lock;
    // maintenance runs only after the bucket lock has been released
    if ( plock->m_fInsertOrDelete )
    {
        // perform amortized maintenance on the table
        MaintainTable( plock->m_phs );
    }
    // reset the lock
    plock->m_ls = CLock::lsNil;
    plock->m_fInsertOrDelete = fFalse;
}
// retrieves the entry corresponding to the key locked by the specified lock
// context (valid under a read, write, or scan lock). if there is no entry
// for this key, errEntryNotFound will be returned
template< class CKey, class CEntry >
inline typename CDynamicHashTable< CKey, CEntry >::ERR CDynamicHashTable< CKey, CEntry >::
ErrRetrieveEntry( CLock* const plock, CEntry* const pentry )
{
    DHTAssert( m_fInit );
    // verify the lock: any lock type is acceptable for retrieval
    DHTAssert( FBKTRead( plock ) || FBKTWrite( plock ) || FBKTScan( plock ) );
    DHTAssert( plock->m_pBucketHead != NULL );
#ifdef DEBUG
    // the bucket lock mode must match the lock type (scan locks write)
    if ( FBKTRead( plock ) )
    {
        DHTAssert( plock->m_pBucketHead->CRWL().FReader() );
    }
    else
    {
        DHTAssert( plock->m_pBucketHead->CRWL().FWriter() );
    }
    // for keyed locks, the current entry (if any) must match the locked key
    if ( FBKTRead( plock ) || FBKTWrite( plock ) )
    {
        CKeyEntry *pKeyEntry;
        BKTGetEntry( plock, &pKeyEntry );
        DHTAssert( pKeyEntry ? pKeyEntry->FEntryMatchesKey( plock->m_key ) : fTrue );
    }
#endif
    // get the entry
    return ErrBKTGetEntry( plock, pentry );
}
// replaces the entry corresponding to the key locked by the specified lock
// context (requires a write or scan lock). the key for the new entry must
// match the key for the old entry. if there is no entry for this key,
// errNoCurrentEntry will be returned
template< class CKey, class CEntry >
inline typename CDynamicHashTable< CKey, CEntry >::ERR CDynamicHashTable< CKey, CEntry >::
ErrReplaceEntry( CLock* const plock, const CEntry& entry )
{
    DHTAssert( m_fInit );
    // verify the lock: replacement mutates, so a writer lock is required
    DHTAssert( FBKTWrite( plock ) || FBKTScan( plock ) );
    DHTAssert( plock->m_pBucketHead != NULL );
    DHTAssert( plock->m_pBucketHead->CRWL().FWriter() );
#ifdef DEBUG
    // for keyed write locks, both the current entry and the replacement
    // must hash-match the locked key (scan locks have no key to check)
    if ( FBKTWrite( plock ) )
    {
        CKeyEntry *pKeyEntry;
        BKTGetEntry( plock, &pKeyEntry );
        DHTAssert( pKeyEntry ? pKeyEntry->FEntryMatchesKey( plock->m_key ) : fTrue );
        DHTAssert( ((CKeyEntry &)entry).FEntryMatchesKey( plock->m_key ) );
    }
#endif
    // replace the entry
    return ErrBKTReplaceEntry( plock, entry );
}
// inserts a new entry corresponding to the key locked by the specified lock
// context (requires a write lock). if there is already an entry with this
// key in the table, errKeyDuplicate will be returned. if the new entry
// cannot be inserted, errOutOfMemory will be returned
template< class CKey, class CEntry >
inline typename CDynamicHashTable< CKey, CEntry >::ERR CDynamicHashTable< CKey, CEntry >::
ErrInsertEntry( CLock* const plock, const CEntry& entry )
{
    DHTAssert( m_fInit );
    // verify the lock
    DHTAssert( FBKTWrite( plock ) );
    DHTAssert( plock->m_pBucketHead != NULL );
    DHTAssert( plock->m_pBucketHead->CRWL().FWriter() );
    // NOTE(review): the entry/key match assert below is deliberately
    // disabled (unlike ErrReplaceEntry) — confirm whether callers may
    // insert entries whose key differs from the locked key
    /// DHTAssert( ((CKeyEntry &)entry).FEntryMatchesKey( plock->m_key ) );
    // insert the entry
    const ERR err = ErrBKTInsertEntry( plock, entry );
    if ( errSuccess == err )
    {
        // maintain our stats (per-processor entry/op counters)
        STATInsertEntry( plock->m_phs );
        // remember that we performed an insert so WriteUnlockKey /
        // EndHashScan will run table maintenance
        plock->m_fInsertOrDelete = fTrue;
    }
    return err;
}
// deletes the entry corresponding to the key locked by the specified lock
// context (requires a write or scan lock). if there is no entry for this
// key, errNoCurrentEntry will be returned
template< class CKey, class CEntry >
inline typename CDynamicHashTable< CKey, CEntry >::ERR CDynamicHashTable< CKey, CEntry >::
ErrDeleteEntry( CLock* const plock )
{
    DHTAssert( m_fInit );
    // verify the lock: deletion mutates, so a writer lock is required
    DHTAssert( FBKTWrite( plock ) || FBKTScan( plock ) );
    DHTAssert( plock->m_pBucketHead != NULL );
    DHTAssert( plock->m_pBucketHead->CRWL().FWriter() );
#ifdef DEBUG
    // for keyed write locks, the current entry (if any) must match the key
    if ( FBKTWrite( plock ) )
    {
        CKeyEntry *pKeyEntry;
        BKTGetEntry( plock, &pKeyEntry );
        DHTAssert( pKeyEntry ? pKeyEntry->FEntryMatchesKey( plock->m_key ) : fTrue );
    }
#endif
    if ( FBKTScan( plock ) )
    {
        // prepare the next-entry ptr so we can move-next after the delete
        // (the delete itself will disturb the currency). if we are deleting
        // the last entry in the bucket, make this NULL to force the cursor
        // to move into the next hash bucket
        DHTAssert( plock->m_pBucket != NULL );
        DHTAssert( plock->m_pEntryNext == NULL );
        plock->m_pEntryNext = ( plock->m_pEntry != plock->m_pBucket->m_pEntryLast ) ? plock->m_pEntry : NULL;
    }
    // delete the entry
    const ERR err = ErrBKTDeleteEntry( plock );
    if ( errSuccess == err )
    {
        // maintain our stats (per-processor entry/op counters)
        STATDeleteEntry( plock->m_phs );
        // remember that we performed a delete so WriteUnlockKey /
        // EndHashScan will run table maintenance
        plock->m_fInsertOrDelete = fTrue;
    }
    return err;
}
// sets up the specified lock context in preparation for scanning all entries
// in the hash table by physical storage order (i.e. not by key value order),
// starting at bucket 0 with before-first currency
//
// NOTE: caller MUST terminate scan with EndHashScan to release any outstanding locks
template< class CKey, class CEntry >
inline void CDynamicHashTable< CKey, CEntry >::
BeginHashScan( CLock* const plock )
{
    DHTAssert( m_fInit );
    // verify the lock context is not already in use
    DHTAssert( plock->m_ls == CLock::lsNil );
    // initialize the lock to start scanning at the first bucket (it may be empty!)
    plock->m_ls = CLock::lsScan;
    plock->m_fInsertOrDelete = fFalse;
    plock->m_iBucket = 0;
    // enter the state machine
    const int iGroup = UiSTEnter( &plock->m_phs );
    const ENUMSTATE esCurrent = EsSTGetState();
    // hash to the bucket we want (this may require a retry in grow/shrink mode)
    DHTAssert( plock->m_pBucketHead == NULL );
    plock->m_pBucketHead = PbucketDIRIHash( esCurrent, plock->m_iBucket );
    // acquire the lock as a writer (scans always use write locks so they
    // can delete/replace as they go)
    plock->m_pBucketHead->CRWL().EnterAsWriter();
    // NOTE: do not retry the hash function here because bucket 0 will never disappear
    // leave the state machine
    STLeave( iGroup, plock->m_phs );
    // set up the currency as before-first; m_pb == NULL means the bucket
    // holds no entries, so there is no next entry to stage
    plock->m_pBucket = plock->m_pBucketHead;
    plock->m_pEntryPrev = NULL;
    plock->m_pEntry = NULL;
    plock->m_pEntryNext = plock->m_pBucketHead->m_pb != NULL ? &plock->m_pBucketHead->m_rgEntry[0] : NULL;
}
// sets up the specified lock context in preparation for scanning all entries
// in the hash table by physical storage order (i.e. not by key value order),
// starting at the bucket that owns the given key, with before-first currency
//
// NOTE: caller MUST terminate scan with EndHashScan to release any outstanding locks
template< class CKey, class CEntry >
inline void CDynamicHashTable< CKey, CEntry >::
BeginHashScanFromKey( const CKey& key, CLock* const plock )
{
    NativeCounter   cBucket;
    NativeCounter   cBucketMax;
    NativeCounter   iHash;

    DHTAssert( m_fInit );
    // verify the lock context is not already in use
    DHTAssert( plock->m_ls == CLock::lsNil );
    // initialize the lock
    plock->m_ls = CLock::lsScan;
    plock->m_fInsertOrDelete = fFalse;
    // enter the state machine
    const int iGroup = UiSTEnter( &plock->m_phs );
    const ENUMSTATE esCurrent = EsSTGetState();
    // write-lock the key through the directory
    DIRWriteLockKey( esCurrent, key, plock );
    // calculate the current bucket configuration
    //
    // NOTES ON WHY THIS WILL WORK:
    //
    // cBucket may increase/decrease if we are in grow/shrink mode, but this won't effect the
    // calculation below unless it grows ahead of OR shrinks behind the bucket at iHash;
    // since we have the bucket at iHash locked, it cannot grow/shrink
    // cBucketMax cannot change unless we are in split mode, and even then we will be reading from the
    // COPY of the cBucketMax -- not the real cBucketMax which is changing
    cBucket = NcDIRIGetBucket( esCurrent );
    cBucketMax = NcDIRIGetBucketMax( esCurrent );
    DHTAssert( cBucketMax != 0 );
    // calculate the hash value and normalize it within the limits of the
    // current bucket configuration: mask to the [0, 2*cBucketMax) range,
    // then fold back any bucket that has not yet been split
    iHash = CKeyEntry::Hash( key );
    iHash = iHash & ( ( cBucketMax - 1 ) + cBucketMax );
    if ( iHash >= cBucketMax + cBucket )
        iHash -= cBucketMax;
    // remember which bucket we locked
    plock->m_iBucket = iHash;
#ifdef DEBUG
    {
        // verify that we have the correct bucket locked using only iHash
        NativeCounter   iExponent;
        NativeCounter   iRemainder;
        DIRILog2( iHash, &iExponent, &iRemainder );
        const PBUCKET pbucketT = PbucketDIRIResolve( iExponent, iRemainder );
        DHTAssert( pbucketT == plock->m_pBucketHead );
        DHTAssert( pbucketT->CRWL().FWriter() );
    }
#endif // DEBUG
    // leave the state machine
    STLeave( iGroup, plock->m_phs );
    // set up the currency as before-first; m_pb == NULL means the bucket
    // holds no entries, so there is no next entry to stage
    plock->m_pBucket = plock->m_pBucketHead;
    plock->m_pEntryPrev = NULL;
    plock->m_pEntry = NULL;
    plock->m_pEntryNext = plock->m_pBucketHead->m_pb != NULL ? &plock->m_pBucketHead->m_rgEntry[0] : NULL;
}
  3063. // moves the specified lock context to the next entry in the hash table by
  3064. // physical storage order. if the end of the index is reached,
  3065. // errNoCurrentEntry is returned.
  3066. template< class CKey, class CEntry >
  3067. inline typename CDynamicHashTable< CKey, CEntry >::ERR CDynamicHashTable< CKey, CEntry >::
  3068. ErrMoveNext( CLock* const plock, BOOL* const pfNewBucket )
  3069. {
  3070. DHTAssert( m_fInit );
  3071. // verify the lock
  3072. DHTAssert( FBKTScan( plock ) );
  3073. DHTAssert( plock->m_pEntryPrev == NULL );
  3074. // move to the next entry in this bucket
  3075. if ( plock->m_pEntry )
  3076. {
  3077. // we are already on an existing entry
  3078. if ( plock->m_pEntry + 1 < PentryBKTNextMost( plock->m_pBucket ) )
  3079. {
  3080. // we have not reached the end of the current BUCKET
  3081. plock->m_pEntry++;
  3082. }
  3083. else
  3084. {
  3085. // we are at the end of the current BUCKET
  3086. plock->m_pBucket = PbucketBKTNext( plock->m_pBucket );
  3087. if ( plock->m_pBucket )
  3088. {
  3089. // we moved to the next BUCKET
  3090. plock->m_pEntry = &plock->m_pBucket->m_rgEntry[0];
  3091. }
  3092. else
  3093. {
  3094. // there are no more BUCKET structures in this chain
  3095. plock->m_pEntry = NULL;
  3096. }
  3097. }
  3098. }
  3099. else
  3100. {
  3101. // we are not on an entry (before-first or after-last)
  3102. plock->m_pEntry = plock->m_pEntryNext;
  3103. }
  3104. plock->m_pEntryNext = NULL;
  3105. if ( plock->m_pEntry != NULL )
  3106. {
  3107. // we moved to an entry successfully
  3108. DHTAssert( plock->m_pBucket );
  3109. if ( pfNewBucket )
  3110. {
  3111. *pfNewBucket = fFalse;
  3112. }
  3113. return errSuccess;
  3114. }
  3115. // try to move to the next hash-bucket
  3116. if ( pfNewBucket )
  3117. {
  3118. *pfNewBucket = fTrue;
  3119. }
  3120. return ErrSCANMoveNext( plock );
  3121. }
// terminates a scan by releasing all outstanding locks and resetting the
// lock context; safe to call whether or not the scan still holds a bucket
template< class CKey, class CEntry >
inline void CDynamicHashTable< CKey, CEntry >::
EndHashScan( CLock* const plock )
{
    DHTAssert( m_fInit );
    // verify the lock
    DHTAssert( FBKTScan( plock ) );
    DHTAssert( plock->m_pEntryPrev == NULL );
    if ( plock->m_pBucketHead != NULL )
    {
        // unlock the current bucket
        plock->m_pBucketHead->CRWL().LeaveAsWriter();
        plock->m_pBucketHead = NULL;
        // we performed an insert or delete while holding the write lock;
        // maintenance runs only after the bucket lock has been released
        if ( plock->m_fInsertOrDelete )
        {
            // perform amortized maintenance on the table
            MaintainTable( plock->m_phs );
        }
    }
    // reset the lock
    plock->m_ls = CLock::lsNil;
    plock->m_fInsertOrDelete = fFalse;
}
  3147. }; // namespace DHT
  3148. using namespace DHT;
#endif // _DHT_HXX_INCLUDED