Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

453 lines
19 KiB

  1. /*************************************************************************
  2. * *
  3. * IINDEX.H *
  4. * *
  5. * Copyright (C) Microsoft Corporation 1990-1994 *
  6. * All Rights reserved. *
  7. * *
  8. **************************************************************************
  9. * *
  10. * Current Owner: BinhN *
  11. * *
  12. **************************************************************************/
  13. /******************************************
  14. * Internal sort stuff.
  15. ******************************************/
  /*
   * Maximum block size (per the macro name), chosen at compile time:
   * 0x80000 when _32BIT is defined, 0xFF00 otherwise.
   * The expansion is fully parenthesized so the cast stays a single unit in
   * every expression context (for example as the operand of sizeof).
   */
  #ifdef _32BIT
  #define MAX_BLOCK_SIZE ((DWORD)0x80000)
  #else
  #define MAX_BLOCK_SIZE ((DWORD)0x0000FF00)
  #endif
  21. typedef struct _list
  22. {
  23. struct _list FAR * pNext;
  24. } FAR *PLIST;
  25. // - - - - - - - - -
  26. // Tree data types
  27. typedef struct OCCDATA
  28. {
  29. struct OCCDATA FAR *pNext; // Linked-list chain
  30. DWORD OccData[1]; // Array of n-DWORD
  31. } OCCDATA,
  32. FAR *POCCDATA;
  33. typedef struct TOPICDATA
  34. {
  35. struct TOPICDATA FAR *pNext; // Linked-list chain 4
  36. DWORD dwOccCount; // Count of occurrences in list 4
  37. DWORD dwTopicId; // TopicId for this topic 4
  38. POCCDATA pOccData; // First OccData in list 4
  39. POCCDATA pLastOccData; // Last inserted OccData 4
  40. } TOPICDATA, // = 20
  41. FAR *PTOPICDATA;
  42. typedef struct STRDATA
  43. {
  44. PTOPICDATA pTopic; // First Topic in list 4
  45. PTOPICDATA pLastTopic; // Last inserted Topic 4
  46. LPB pText; // Sort word as a Pascal string 4
  47. DWORD dwField; // Field Id for the sort word 4
  48. DWORD dwTopicCount; // Count of Topics in list 4
  49. DWORD dwWordLength; // Word length (from OCC data) 4
  50. } STRDATA, // = 24
  51. FAR *PSTRDATA;
  52. typedef struct BTNODE
  53. {
  54. enum TREECOLOR {RED, BLACK} color; // Color of node - for balancing 4
  55. struct BTNODE FAR *pParent; // Pointer to parent node 4
  56. struct BTNODE FAR *pLeft; // Pointer to left child node 4
  57. struct BTNODE FAR *pRight; // Pointer to right child node 4
  58. STRDATA StringData; // Pointer to string data 24
  59. } BTNODE, // = 32
  60. FAR *PBTNODE;
  61. typedef struct MERGEHEADER
  62. {
  63. DWORD dwRecordSize;
  64. LPB lpbWord; // Pascal string
  65. DWORD dwFieldId; // Field Id
  66. DWORD dwWordLength; // Real life word length
  67. DWORD dwStrLen; // Current string length
  68. DWORD dwTopicCount; // Topic count
  69. DWORD dwLastTopicId; // Last topic id
  70. PTOPICDATA pTopic; // Pointer to first Topic in list
  71. PTOPICDATA pLastTopic; // Last inserted Topic
  72. FILEOFFSET foTopicCount; // Backpatching address
  73. LPB pTopicCount; // Pointer to topic count location
  74. BYTE fEmitRecord; // Flag to denote rec is emitted
  75. BYTE Pad1; // Padding for DWORD aligned
  76. } MERGEHEADER, FAR *PMERGEHEADER;
  77. // Typedefs for an external sort buffer. Each of these has associated
  78. // with it a large (easily > 1meg) block of sorted words. A few of
  79. // these words will end up in an internal buffer. These external sort
  80. // buffers will be formed into a chain, one chain will have associated
  81. // with it in total all of the words that are going to be sorted. A
  82. // merge will be performed on the words associated with the chain to
  83. // produce a final sorted list of words.
  84. typedef struct InternalSortInfo
  85. {
  86. HFPB hfpb; // Handle to temp file
  87. PBTNODE pBalanceTree; // Root node of the balanced tree
  88. FILEOFFSET lfo; // File offset
  89. FILEOFFSET lfoRecBackPatch; // Backpatching record offset
  90. DWORD dwRecLength; // Record (data associated with 1 word) length
  91. HANDLE hSortBuffer; // Handle to sort buffer
  92. BYTE FAR *pSortBuffer; // Memory buffer for file output
  93. BYTE FAR *pStartRec; // Record start point in the buffer
  94. BYTE FAR *pCurPtr; // Current insertion point in the buffer
  95. DWORD dwMaxEsbRecSize; // Maximum record size of current ESB
  96. BYTE DeepLevel; // Deepest level of the tree
  97. BYTE Pad1;
  98. BYTE Pad2;
  99. BYTE Pad3;
  100. BYTE aszTempName[_MAX_PATH]; // Temp file for tree flush, ericjut: change from cbMAX_PATH to _MAX_PATH
  101. } ISI,
  102. FAR *LPISI;
  103. typedef HANDLE HESB;
  104. typedef struct ExternalSortBuffer
  105. {
  106. HANDLE hStruct; // This structure's handle. MUST BE 1ST!!
  107. struct ExternalSortBuffer FAR *lpesbNext; // Next buffer in the list.
  108. FILEOFFSET lfo; // This starts out as an offset in the
  109. // temp file at which the first word
  110. // associated with this buffer will
  111. // be found. As words are disposed
  112. // of it will increment.
  113. FILEOFFSET lfoMax; // This is the offset of the end of
  114. // the area of the temp file that
  115. // contains words for this external
  116. // sort buffer.
  117. DWORD dwEsbSize; // Actual size of the internal buffer.
  118. DWORD ibBuf; // Pointer to the current record in
  119. // the internal buffer.
  120. HANDLE hMem; // Handle to buffered block.
  121. LRGB lrgbMem; // Pointer to buffered block.
  122. } ESB, FAR *LPESB;
  123. // - - - - - - - - -
  124. // Information about the external sort process as a whole.
  125. typedef struct ExternalSortInfo
  126. {
  127. FILEOFFSET lfoTempOffset; // Current size of the output file
  128. HFPB hfpb; // Handle to ouput file
  129. LPFBI lpfbiTemp; // Temp file buffer
  130. DWORD cesb; // Number of ESB blocks allocated
  131. LPESB lpesbRoot; // First buffer in the external-buffer linked-list
  132. DWORD cbEsbBuf; // The size of each ESB buffer.
  133. DWORD uiQueueSize; // Priority queue's size
  134. GHANDLE hPriorityQueue; // Handle to Priority Queue
  135. LPESB FAR *lrgPriorityQueue; // Priority Queue
  136. // Output buffer handling
  137. HANDLE hBuf; // Handle to output buiffer
  138. LPB pOutputBuffer; // Pointer to output buffer
  139. DWORD ibBuf; // Buffer index
  140. WORD fFlag; // Various flag
  141. WORD pad;
  142. LPB lpbQueueStr [cbMAX_PATH];
  143. BYTE aszTempName[_MAX_PATH]; // Temp sorted result name
  144. } ESI,
  145. FAR *LPESI;
  146. // Information kept that pertains directly to "tfc" term-weighting.
  147. typedef float SIGMA;
  148. typedef SIGMA HUGE *HPSIGMA;
  149. typedef SIGMA HUGE *HRGSIGMA;
  150. typedef DWORD LISIGMA;
  151. #define LASTWORD_SIZE 1024 // Size of last word buffer in each node
  152. typedef struct BTREEDATA
  153. {
  154. // Array of tree blocks
  155. PNODEINFO rgpNodeInfo[MAX_TREE_HEIGHT]; // Array of tree nodes
  156. PNODEINFO rgpTmpNodeInfo[MAX_TREE_HEIGHT]; // Array of tree nodes
  157. FILEOFFSET OffsetPointer; // File offset of the last nodes
  158. // pointer to the next node (for traversal)
  159. IH20 Header;
  160. DWORD NID; // Number of nodes allocated
  161. FLOAT rLogN; // Used for term-weighting
  162. FLOAT FAR *lrgrLog; // This will be an array of numbers that
  163. // contains a common weighting sub-expression
  164. BYTE argbLog[cLOG_MAX]; // An array of 8-bit flags. If one of
  165. // these is non-zero the corresponding
  166. // value in lrgrLog is valid
  167. BYTE fOccfLength; // Word Length field flag
  168. BYTE padding[3]; // Maintain DWORD alignment
  169. } BTREEDATA, FAR *PBTREEDATA;
  // Upper limit expressed as a LISIGMA. The value is arbitrary but should
  // not be allowed to grow, if possible.
  #define lisigmaMAX ((LISIGMA)524288L)
  173. typedef struct WeightInfo
  174. {
  175. HRGSIGMA hrgsigma; // Pointer to array of sigma elements.
  176. HANDLE hSigma; // Handle to "hrgsigma".
  177. FLOAT FAR *lrgrLog; // Array of LOG values to speed up processing
  178. HANDLE hLog; // Handle to "
  179. } WI;
  180. typedef struct BLKCOMBO
  181. {
  182. LPV pBlockMgr;
  183. PLIST pFreeList;
  184. DWORD dwCount;
  185. } BLKCOMBO, FAR *PBLKCOMBO;
  186. typedef struct
  187. {
  188. DWORD dwPhase; // Current indexing phase
  189. // 1: Collection phase
  190. // 2: Sort and coalate phase
  191. // 3: Permament index building phase
  192. DWORD dwIndex; // Completion index
  193. } CALLBACKINFO, FAR *PCALLBACKINFO;
  194. // - - - - - - - - -
  195. // Nerve information about the indexing process. Most memory allocated
  196. // and files created are in some way attached to one of these.
  197. typedef struct IndexParamBlock
  198. {
  199. HANDLE hStruct; // This structure's handle. MUST BE 1ST
  200. DWORD dwKey; // Key for callback
  201. FCALLBACK_MSG CallbackInfo; // User callback info
  202. //
  203. // Miscellaneous.
  204. //
  205. WI wi; // Term-weighting information.
  206. FILEOFFSET foMaxOffset; // Maximum offset of the file (file size)
  207. // Useful information to be used
  208. DWORD lcTopics; // The number of unique documents
  209. DWORD dwMaxTopicId; // Use to hold compare value for lcTopics
  210. DWORD dwMemAllowed; // Size of memory allocated for index
  211. DWORD dwMaxRecordSize; // Maximum record size in collecting word
  212. DWORD dwMaxEsbRecSize; // Current ESB maximum record size
  213. DWORD dwMaxWLen; // Maximum word's length value
  214. DWORD dwLastIndexedTopic; // For word collection
  215. HFREELIST hFreeList; // Handle to the Index FreeList
  216. //
  217. // Callbacks.
  218. //
  219. FCOMPARE lpfnCompare; // Compare function for sort
  220. LPV lpvSortParm; // Sort parameters
  221. // Sort information.
  222. //
  223. ISI isi; // Internal sort information.
  224. ESI esi; // External sort information.
  225. LPV pDataBlock; // Block manager for string
  226. BLKCOMBO BTNodeBlock; // Block manager for btnode
  227. BLKCOMBO TopicBlock; // Block manager for topic block
  228. BLKCOMBO OccBlock; // Block manager for occurrence
  229. PLIST pOccFreeList; // Free list of occurrence nodes
  230. BTREEDATA BTreeData; // BTree data info
  231. // Input/output file
  232. FILEDATA InFile; // File info for input file
  233. FILEDATA OutFile; // File info for output file
  234. PNODEINFO pIndexDataNode;
  235. // Various buffer used for update
  236. HANDLE hTmpBuf; // Temp buf for word record
  237. LPB pTmpBuf;
  238. LPB pWord; // Pointer to word record
  239. HFPB hfpbIdxFile;
  240. HANDLE hData;
  241. LPB pDataBuffer; // Buffer for new data
  242. DWORD dwDataSize; // Size of the buffer data
  243. DWORD BitCount[7][33]; // Array to hold the bit count for bit
  244. // compression scheme.
  245. // [0] = TopicID, [1] = OccCount, [2]-[6] = Occs
  246. // Statistics informations
  247. DWORD dwIndexedWord; // Total of indexed words (statistics)
  248. DWORD dwUniqueWord; // How many unique words indexed (statistics)
  249. DWORD dwByteCount; // How many bytes indexed (statistics)
  250. DWORD dwOccOffbits; // How many bits for offset (statistics)
  251. DWORD dwOccExtbits; // How many bits for extent (statistics)
  252. DWORD dwMaxFieldId; // Maximum field value
  253. DWORD dwMaxWCount; // Maximum word count value
  254. DWORD dwMaxOffset; // Maximum offset value
  255. DWORD dwTotal3bWordLen; // Total length of all words > 2 bytes
  256. DWORD dwTotal2bWordLen; // Total length of all words <= 2 bytes
  257. DWORD dwTotalUniqueWordLen; // Total length of all unique words
  258. CKEY cKey[5]; // Compression keys (2-bytes * 5)
  259. // BYTE ucNumOccFields; // The number of bits set in "occf".
  260. WORD idxf; // Index characteristic flags.
  261. WORD occf; // A flag byte that keeps track of
  262. // which occurence element fields
  263. // should be indexed.
  264. BYTE ucNumOccDataFields; // The number of bits set that are saved in OCCDATA
  265. BYTE fOccComp; // Set to 1 if Occurrences need to be sorted
  266. // in collect2.(They are added out of order)
  267. BYTE cMaxLevel;
  268. BYTE bState;
  269. BYTE szEsiTemp[cbMAX_PATH]; // Temp ESI
  270. } IPB,
  271. FAR *_LPIPB;
  // bState values
  #define INDEXING_STATE 0    // We are doing indexing
  #define UPDATING_STATE 1    // We are updating the index
  #define DELETING_STATE 2    // We are deleting data from the index
  276. // - - - - - - - - -
  // These defines indicate how many bits per word occurrence list are
  // wasted through the adoption of either the "fixed", "high bit
  // replacement" or "bitstream" compression schemes. This space is wasted
  // through the insertion of one or more flag bits into the data-stream.

  // If the first bit is set, the "fixed" scheme was adopted, so one bit was
  // needed to indicate this. More bits are used to store the "width" value
  // associated with this scheme. This has been the most commonly used
  // compression scheme in practice.
  #define cbitWASTED_FIXED (1 + CBIT_WIDTH_BITS)

  // If the first bit wasn't set and the second one was, the "bell" scheme
  // was used. Two bits were needed to indicate this scheme, plus the
  // "width" value (the "center") associated with it.
  #define cbitWASTED_BELL (2 + CBIT_WIDTH_BITS)

  // If neither the first bit nor the second bit was set, the bitstream
  // scheme was used. The total wasted space is two bits. This has been the
  // least-used scheme in practice.
  #define cbitWASTED_BITSTREAM (2)

  // This value indicates that the function is not allowed to select the
  // "bitstream" compression scheme.
  #define lcbitBITSTREAM_ILLEGAL ((DWORD)-1L)

  // Legal "center" values are 0..32. This is weird because you'd expect it
  // to be 0..31 but it's not.
  #define cbitCENTER_MAX ((CBIT)33)
  313. // - - - - - - - - -
  314. // This structure is used in the occurrence-list building phase of
  315. // indexing. The structure includes information local to a single
  316. // occurrence list.
  317. typedef struct OccurenceListInfo
  318. {
  319. DWORD lcSublists; // The number of sub-lists in this
  320. // occurence list.
  321. CKEY ckey; // The manner in which doc-ID deltas
  322. // are compressed in this list.
  323. } OLI,
  324. FAR *LPOLI;
  325. typedef struct MergeParams
  326. {
  327. DWORD FAR *rgTopicId;
  328. DWORD dwCount;
  329. DWORD FAR *lpTopicIdLast; // internal use, last position saved
  330. } MERGEPARAMS, FAR *LPMERGEPARAMS;
  331. // - - - - - - - - -
  332. // Convert occurence list file to a final index file.
  333. /*******************************************************************
  334. * *
  335. * FUNCTIONS PROTOTYPES *
  336. * *
  337. *******************************************************************/
  338. /*********************************************************************
  339. * *
  340. * SORT FUNCTIONS (SORT.C) *
  341. * *
  342. *********************************************************************/
  343. PUBLIC ERR PASCAL FAR HugeDataSort(LPV HUGE *, DWORD, FCOMPARE, LPV,
  344. INTERRUPT_FUNC, LPV);
  345. PUBLIC VOID PASCAL FAR HugeInsertionSort (LPV HUGE *, DWORD, FCOMPARE, LPV);
  346. PUBLIC ERR PASCAL FAR PriorityQueueRemove (LPESI, FCOMPARE, LPV);
  347. PUBLIC ERR PASCAL FAR PriorityQueueCreate (LPESI, FCOMPARE, LPV);
  348. PUBLIC ERR PASCAL NEAR IndexSort (LPW, LPB, int);
  349. PUBLIC ERR PASCAL NEAR IndexMergeSort (HFILE FAR *, LSZ, LPW, LPB, int, int);
  350. /*********************************************************************
  351. * *
  352. * ENCODING FUNCTIONS (ENCODE.C) *
  353. * *
  354. *********************************************************************/
  355. PUBLIC CB PASCAL NEAR OccurrencePack (LPB, LPOCC, WORD);
  356. PUBLIC VOID PASCAL NEAR OccurrenceUnpack(LPOCC, LPB, OCCF);
  357. PUBLIC CB PASCAL NEAR CbCopySortPackedOcc (LPB, LPB, WORD);
  358. PUBLIC CBIT PASCAL NEAR CbitBitsDw (DWORD);
  359. PUBLIC void NEAR PASCAL VGetBestScheme(LPCKEY, LRGDW, DWORD, int);
  360. PUBLIC CB PASCAL FAR CbBytePack(LPB, DWORD);
  361. /*********************************************************************
  362. * *
  363. * INDEXING FUNCTIONS *
  364. * *
  365. *********************************************************************/
  366. PUBLIC VOID PASCAL FAR FreeISI (LPIPB);
  367. PUBLIC void NEAR PASCAL FreeEsi(LPIPB);
  368. PUBLIC LCB FAR PASCAL LcbGetFreeMemory(LPERRB);
  369. PUBLIC ERR FAR PASCAL SortFlushISI (_LPIPB);
  370. PUBLIC int PASCAL FAR WordRecCompare(LPB, LPB, LPV);
  371. PUBLIC ERR FAR PASCAL MergeSortTreeFile (_LPIPB, LPMERGEPARAMS);
  372. PUBLIC int FAR PASCAL CompareOccurrence (LPDW, LPDW, int);
  373. PUBLIC int FAR PASCAL StrCmp2BytePascal (LPB, LPB);
  374. ERR FAR PASCAL FlushTree(_LPIPB lpipb);
  375. PUBLIC ERR FAR PASCAL BuildBTree (HFPB, _LPIPB, LPB, HFPB, LPSTR);
  376. PUBLIC ERR FAR PASCAL FWriteBits(PFILEDATA, DWORD, BYTE);
  377. PUBLIC ERR PASCAL FAR IndexOpenRW (LPIPB, HFPB, LSZ);
  378. PUBLIC PNODEINFO PASCAL FAR AllocBTreeNode (_LPIPB lpipb);
  379. PUBLIC VOID PASCAL FAR FreeBTreeNode (PNODEINFO pNode);
  380. PUBLIC ERR PASCAL FAR ReadNewNode (HFPB, PNODEINFO, int);
  381. PUBLIC PNODEINFO PASCAL FAR AllocBTreeNode (_LPIPB lpipb);
  382. PUBLIC ERR PASCAL FAR SkipOldData (_LPIPB, PNODEINFO);
  383. PUBLIC ERR FAR PASCAL AllocSigmaTable (_LPIPB lpipb);