|
|
/*************************************************************************
* * * IINDEX.H * * * * Copyright (C) Microsoft Corporation 1990-1994 * * All Rights reserved. * * * ************************************************************************** * * * Current Owner: BinhN * * * **************************************************************************/
/*
 * Internal sort stuff.
 *
 * MAX_BLOCK_SIZE expands to (DWORD)0x80000 when _32BIT is defined and to
 * (DWORD)0x0000FF00 otherwise.
 */
#if defined(_32BIT)
#define MAX_BLOCK_SIZE (DWORD)0x80000
#else
#define MAX_BLOCK_SIZE (DWORD)0x0000FF00
#endif
// Bare list link: the structure holds nothing but the pointer to the
// next link.  PLIST is a FAR pointer to such a link.
typedef struct _list
{
    struct _list FAR *pNext;
} FAR *PLIST;
// - - - - - - - - -
// Tree data types
// One occurrence record, chained to the next one through pNext.
typedef struct OCCDATA
{
    struct OCCDATA FAR *pNext;      // Linked-list chain
    DWORD               OccData[1]; // Array of n DWORDs (declared with one element)
} OCCDATA, FAR *POCCDATA;
// Per-topic record: owns a list of OCCDATA and is itself chained to the
// next topic.  The trailing numbers are the byte sizes given in the
// original annotations; they add up to 20.
typedef struct TOPICDATA
{
    struct TOPICDATA FAR *pNext;        // Linked-list chain                4
    DWORD                 dwOccCount;   // Count of occurrences in list     4
    DWORD                 dwTopicId;    // TopicId for this topic           4
    POCCDATA              pOccData;     // First OccData in list            4
    POCCDATA              pLastOccData; // Last inserted OccData            4
} TOPICDATA, FAR *PTOPICDATA;
// Per-word record: the sort word plus the list of topics it occurs in.
// The trailing numbers are the byte sizes given in the original
// annotations; they add up to 24.
typedef struct STRDATA
{
    PTOPICDATA pTopic;          // First Topic in list              4
    PTOPICDATA pLastTopic;      // Last inserted Topic              4
    LPB        pText;           // Sort word as a Pascal string     4
    DWORD      dwField;         // Field Id for the sort word       4
    DWORD      dwTopicCount;    // Count of Topics in list          4
    DWORD      dwWordLength;    // Word length (from OCC data)      4
} STRDATA, FAR *PSTRDATA;
// Node of the balanced (red/black coloured) sort tree.  The string data
// is embedded in the node itself rather than referenced through a pointer.
// NOTE(review): the original annotation gave a total size of 32, which
// does not match its own per-field figures (4 + 4 + 4 + 4 + 24 = 40) -
// confirm before relying on either number.
typedef struct BTNODE
{
    enum TREECOLOR
    {
        RED,
        BLACK
    } color;                            // Color of node - for balancing    4
    struct BTNODE FAR *pParent;         // Pointer to parent node           4
    struct BTNODE FAR *pLeft;           // Pointer to left child node       4
    struct BTNODE FAR *pRight;          // Pointer to right child node      4
    STRDATA            StringData;      // Embedded string data            24
} BTNODE, FAR *PBTNODE;

// Header describing one word record during merging.
typedef struct MERGEHEADER
{
    DWORD      dwRecordSize;
    LPB        lpbWord;         // Pascal string
    DWORD      dwFieldId;       // Field Id
    DWORD      dwWordLength;    // Real life word length
    DWORD      dwStrLen;        // Current string length
    DWORD      dwTopicCount;    // Topic count
    DWORD      dwLastTopicId;   // Last topic id
    PTOPICDATA pTopic;          // Pointer to first Topic in list
    PTOPICDATA pLastTopic;      // Last inserted Topic
    FILEOFFSET foTopicCount;    // Backpatching address
    LPB        pTopicCount;     // Pointer to topic count location
    BYTE       fEmitRecord;     // Flag to denote that the record is emitted
    BYTE       Pad1;            // Padding byte
} MERGEHEADER, FAR *PMERGEHEADER;
// Typedefs for an external sort buffer. Each of these has associated
// with it a large (easily > 1meg) block of sorted words. A few of
// these words will end up in an internal buffer. These external sort
// buffers will be formed into a chain, one chain will have associated
// with it in total all of the words that are going to be sorted. A
// merge will be performed on the words associated with the chain to
// produce a final sorted list of words.
// State of the internal (in-memory) sort: the balanced tree being built
// and the buffer / temp file it is flushed through.
typedef struct InternalSortInfo
{
    HFPB       hfpb;                    // Handle to temp file
    PBTNODE    pBalanceTree;            // Root node of the balanced tree
    FILEOFFSET lfo;                     // File offset
    FILEOFFSET lfoRecBackPatch;         // Backpatching record offset
    DWORD      dwRecLength;             // Length of a record (the data associated with 1 word)
    HANDLE     hSortBuffer;             // Handle to sort buffer
    BYTE FAR  *pSortBuffer;             // Memory buffer for file output
    BYTE FAR  *pStartRec;               // Record start point in the buffer
    BYTE FAR  *pCurPtr;                 // Current insertion point in the buffer
    DWORD      dwMaxEsbRecSize;         // Maximum record size of current ESB
    BYTE       DeepLevel;               // Deepest level of the tree
    BYTE       Pad1;
    BYTE       Pad2;
    BYTE       Pad3;
    BYTE       aszTempName[_MAX_PATH];  // Temp file for tree flush
                                        // (ericjut: changed from cbMAX_PATH to _MAX_PATH)
} ISI, FAR *LPISI;
// HESB is an alias of HANDLE.
typedef HANDLE HESB;
// One external sort buffer; buffers are linked into a chain via lpesbNext.
typedef struct ExternalSortBuffer
{
    HANDLE hStruct;     // This structure's handle.  MUST BE 1ST!!

    struct ExternalSortBuffer FAR *lpesbNext;   // Next buffer in the list.

    FILEOFFSET lfo;     // Starts out as the temp-file offset at which the
                        // first word associated with this buffer is found,
                        // and is incremented as words are disposed of.
    FILEOFFSET lfoMax;  // Offset of the end of the temp-file area that
                        // contains words for this external sort buffer.
    DWORD      dwEsbSize;   // Actual size of the internal buffer.
    DWORD      ibBuf;       // Position of the current record in the
                            // internal buffer.
    HANDLE     hMem;        // Handle to buffered block.
    LRGB       lrgbMem;     // Pointer to buffered block.
} ESB, FAR *LPESB;
// - - - - - - - - -
// Information about the external sort process as a whole.
// State shared by the whole external sort: the output file, the chain of
// ESB blocks, the priority queue and the output buffer.
typedef struct ExternalSortInfo
{
    FILEOFFSET lfoTempOffset;       // Current size of the output file
    HFPB       hfpb;                // Handle to output file
    LPFBI      lpfbiTemp;           // Temp file buffer
    DWORD      cesb;                // Number of ESB blocks allocated
    LPESB      lpesbRoot;           // First buffer in the external-buffer linked-list
    DWORD      cbEsbBuf;            // The size of each ESB buffer.
    DWORD      uiQueueSize;         // Priority queue's size
    GHANDLE    hPriorityQueue;      // Handle to Priority Queue
    LPESB FAR *lrgPriorityQueue;    // Priority Queue

    // Output buffer handling
    HANDLE     hBuf;                // Handle to output buffer
    LPB        pOutputBuffer;       // Pointer to output buffer
    DWORD      ibBuf;               // Buffer index
    WORD       fFlag;               // Various flags
    WORD       pad;
    LPB        lpbQueueStr [cbMAX_PATH];
    BYTE       aszTempName[_MAX_PATH];  // Temp sorted result name
} ESI, FAR *LPESI;
// Information kept that pertains directly to "tfc" term-weighting.
// A sigma element is a float; HPSIGMA and HRGSIGMA are both HUGE pointers
// to SIGMA.  LISIGMA is a DWORD.
typedef float       SIGMA;
typedef SIGMA HUGE *HPSIGMA;
typedef SIGMA HUGE *HRGSIGMA;
typedef DWORD       LISIGMA;
#define LASTWORD_SIZE 1024  // Size of last word buffer in each node

// Working data for the B-tree being built, including the cached log
// values used for term-weighting.
typedef struct BTREEDATA
{
    PNODEINFO  rgpNodeInfo[MAX_TREE_HEIGHT];    // Array of tree nodes
    PNODEINFO  rgpTmpNodeInfo[MAX_TREE_HEIGHT]; // Array of tree nodes
    FILEOFFSET OffsetPointer;   // File offset of the last node's pointer
                                // to the next node (for traversal)
    IH20       Header;
    DWORD      NID;             // Number of nodes allocated
    FLOAT      rLogN;           // Used for term-weighting
    FLOAT FAR *lrgrLog;         // Array of numbers that contains a common
                                // weighting sub-expression
    BYTE       argbLog[cLOG_MAX];   // An array of 8-bit flags.  If one of
                                    // these is non-zero the corresponding
                                    // value in lrgrLog is valid
    BYTE       fOccfLength;     // Word Length field flag
    BYTE       padding[3];      // Maintain DWORD alignment
} BTREEDATA, FAR *PBTREEDATA;

// The '#' of a directive has to be the first token on its line, so this
// definition gets a line of its own instead of trailing the typedef.
//
// This value is arbitrary but should not be allowed to grow, if possible.
#define lisigmaMAX ((LISIGMA)524288L)
// Term-weighting tables and the handles they were allocated under.
typedef struct WeightInfo
{
    HRGSIGMA   hrgsigma;    // Pointer to array of sigma elements.
    HANDLE     hSigma;      // Handle to "hrgsigma".
    FLOAT FAR *lrgrLog;     // Array of LOG values to speed up processing
    HANDLE     hLog;        // Handle to "lrgrLog" (presumably; the original
                            // comment was cut off after the quote - confirm)
} WI;
// Groups a block-manager pointer with a free list and a count.
typedef struct BLKCOMBO
{
    LPV   pBlockMgr;
    PLIST pFreeList;
    DWORD dwCount;
} BLKCOMBO, FAR *PBLKCOMBO;
// Progress information: which phase indexing is in and how far along it is.
typedef struct
{
    DWORD dwPhase;  // Current indexing phase
                    //   1: Collection phase
                    //   2: Sort and collate phase
                    //   3: Permanent index building phase
    DWORD dwIndex;  // Completion index
} CALLBACKINFO, FAR *PCALLBACKINFO;
// - - - - - - - - -
// Nerve information about the indexing process. Most memory allocated
// and files created are in some way attached to one of these.
// Index parameter block: the central record of one indexing session.
typedef struct IndexParamBlock
{
    HANDLE        hStruct;          // This structure's handle.  MUST BE 1ST
    DWORD         dwKey;            // Key for callback
    FCALLBACK_MSG CallbackInfo;     // User callback info

    //
    // Miscellaneous.
    //
    WI         wi;                  // Term-weighting information.
    FILEOFFSET foMaxOffset;         // Maximum offset of the file (file size)

    // Useful information to be used
    DWORD      lcTopics;            // The number of unique documents
    DWORD      dwMaxTopicId;        // Used to hold compare value for lcTopics
    DWORD      dwMemAllowed;        // Size of memory allocated for index
    DWORD      dwMaxRecordSize;     // Maximum record size in collecting word
    DWORD      dwMaxEsbRecSize;     // Current ESB maximum record size
    DWORD      dwMaxWLen;           // Maximum word's length value
    DWORD      dwLastIndexedTopic;  // For word collection
    HFREELIST  hFreeList;           // Handle to the Index FreeList

    //
    // Callbacks.
    //
    FCOMPARE   lpfnCompare;         // Compare function for sort
    LPV        lpvSortParm;         // Sort parameters

    //
    // Sort information.
    //
    ISI        isi;                 // Internal sort information.
    ESI        esi;                 // External sort information.
    LPV        pDataBlock;          // Block manager for string
    BLKCOMBO   BTNodeBlock;         // Block manager for btnode
    BLKCOMBO   TopicBlock;          // Block manager for topic block
    BLKCOMBO   OccBlock;            // Block manager for occurrence
    PLIST      pOccFreeList;        // Free list of occurrence nodes
    BTREEDATA  BTreeData;           // BTree data info

    // Input/output file
    FILEDATA   InFile;              // File info for input file
    FILEDATA   OutFile;             // File info for output file
    PNODEINFO  pIndexDataNode;      // Various buffer used for update
    HANDLE     hTmpBuf;             // Temp buf for word record
    LPB        pTmpBuf;
    LPB        pWord;               // Pointer to word record
    HFPB       hfpbIdxFile;
    HANDLE     hData;
    LPB        pDataBuffer;         // Buffer for new data
    DWORD      dwDataSize;          // Size of the buffer data
    DWORD      BitCount[7][33];     // Bit counts for the bit compression scheme:
                                    // [0] = TopicID, [1] = OccCount, [2]-[6] = Occs

    // Statistics information
    DWORD      dwIndexedWord;       // Total of indexed words (statistics)
    DWORD      dwUniqueWord;        // How many unique words indexed (statistics)
    DWORD      dwByteCount;         // How many bytes indexed (statistics)
    DWORD      dwOccOffbits;        // How many bits for offset (statistics)
    DWORD      dwOccExtbits;        // How many bits for extent (statistics)
    DWORD      dwMaxFieldId;        // Maximum field value
    DWORD      dwMaxWCount;         // Maximum word count value
    DWORD      dwMaxOffset;         // Maximum offset value
    DWORD      dwTotal3bWordLen;    // Total length of all words > 2 bytes
    DWORD      dwTotal2bWordLen;    // Total length of all words <= 2 bytes
    DWORD      dwTotalUniqueWordLen;    // Total length of all unique words
    CKEY       cKey[5];             // Compression keys (2-bytes * 5)
    // BYTE ucNumOccFields; // The number of bits set in "occf".
    WORD       idxf;                // Index characteristic flags.
    WORD       occf;                // Flags that keep track of which occurrence
                                    // element fields should be indexed.
    BYTE       ucNumOccDataFields;  // The number of bits set that are saved in OCCDATA
    BYTE       fOccComp;            // Set to 1 if Occurrences need to be sorted
                                    // in collect2.  (They are added out of order)
    BYTE       cMaxLevel;
    BYTE       bState;
    BYTE       szEsiTemp[cbMAX_PATH];   // Temp ESI
} IPB, FAR *_LPIPB;
// bState values
#define INDEXING_STATE  0   // Indexing is in progress
#define UPDATING_STATE  1   // The index is being updated
#define DELETING_STATE  2   // Data is being deleted from the index
// - - - - - - - - -
// These defines indicate how many bits per word occurrence list are
// wasted through the adoption of either the "fixed", "high bit
// replacement" or "bitstream" compression schemes. This space
// is wasted through the insertion of one or more flag bits into the
// data-stream.
// "Fixed" scheme: signalled by the first bit being set, so one flag bit
// is spent, plus the bits that store the "width" value associated with
// this scheme.  This has been the most commonly used compression scheme
// in practice.
#define cbitWASTED_FIXED        (1 + CBIT_WIDTH_BITS)

// "Bell" scheme: signalled by the first bit being clear and the second
// bit being set, so two flag bits are spent, plus the "width" value (the
// "center") associated with this scheme.
#define cbitWASTED_BELL         (2 + CBIT_WIDTH_BITS)

// "Bitstream" scheme: signalled by neither the first nor the second bit
// being set.  It also spends two flag bits, the same as the "bell"
// scheme.  This has been the least-used scheme in practice.
#define cbitWASTED_BITSTREAM    (2)

// Indicates that the function is not allowed to select the "bitstream"
// compression scheme.
#define lcbitBITSTREAM_ILLEGAL  ((DWORD)-1L)

// Legal "center" values are 0..32.  This is weird because you'd expect
// it to be 0..31 but it's not.
#define cbitCENTER_MAX          ((CBIT)33)
// - - - - - - - - -
// This structure is used in the occurrence-list building phase of
// indexing. The structure includes information local to a single
// occurrence list.
// Information local to a single occurrence list.
typedef struct OccurenceListInfo
{
    DWORD lcSublists;   // The number of sub-lists in this occurrence list.
    CKEY  ckey;         // The manner in which doc-ID deltas are
                        // compressed in this list.
} OLI, FAR *LPOLI;
// Parameter block for a merge: an array of topic ids and its element count.
typedef struct MergeParams
{
    DWORD FAR *rgTopicId;
    DWORD      dwCount;
    DWORD FAR *lpTopicIdLast;   // Internal use: last position saved
} MERGEPARAMS, FAR *LPMERGEPARAMS;
// - - - - - - - - -
// Convert occurence list file to a final index file.
/*******************************************************************
* * * FUNCTIONS PROTOTYPES * * * *******************************************************************/
/*********************************************************************
* * * SORT FUNCTIONS (SORT.C) * * * *********************************************************************/
// Sort routine prototypes, one declaration per line.
PUBLIC ERR  PASCAL FAR  HugeDataSort(LPV HUGE *, DWORD, FCOMPARE, LPV, INTERRUPT_FUNC, LPV);
PUBLIC VOID PASCAL FAR  HugeInsertionSort (LPV HUGE *, DWORD, FCOMPARE, LPV);
PUBLIC ERR  PASCAL FAR  PriorityQueueRemove (LPESI, FCOMPARE, LPV);
PUBLIC ERR  PASCAL FAR  PriorityQueueCreate (LPESI, FCOMPARE, LPV);
PUBLIC ERR  PASCAL NEAR IndexSort (LPW, LPB, int);
PUBLIC ERR  PASCAL NEAR IndexMergeSort (HFILE FAR *, LSZ, LPW, LPB, int, int);
/*********************************************************************
* * * ENCODING FUNCTIONS (ENCODE.C) * * * *********************************************************************/
// Encoding routine prototypes, one declaration per line.
PUBLIC CB   PASCAL NEAR OccurrencePack (LPB, LPOCC, WORD);
PUBLIC VOID PASCAL NEAR OccurrenceUnpack(LPOCC, LPB, OCCF);
PUBLIC CB   PASCAL NEAR CbCopySortPackedOcc (LPB, LPB, WORD);
PUBLIC CBIT PASCAL NEAR CbitBitsDw (DWORD);
PUBLIC void NEAR PASCAL VGetBestScheme(LPCKEY, LRGDW, DWORD, int);
PUBLIC CB   PASCAL FAR  CbBytePack(LPB, DWORD);
/*********************************************************************
* * * INDEXING FUNCTIONS * * * *********************************************************************/
PUBLIC VOID PASCAL FAR FreeISI (LPIPB); PUBLIC void NEAR PASCAL FreeEsi(LPIPB); PUBLIC LCB FAR PASCAL LcbGetFreeMemory(LPERRB); PUBLIC ERR FAR PASCAL SortFlushISI (_LPIPB); PUBLIC int PASCAL FAR WordRecCompare(LPB, LPB, LPV); PUBLIC ERR FAR PASCAL MergeSortTreeFile (_LPIPB, LPMERGEPARAMS); PUBLIC int FAR PASCAL CompareOccurrence (LPDW, LPDW, int); PUBLIC int FAR PASCAL StrCmp2BytePascal (LPB, LPB); ERR FAR PASCAL FlushTree(_LPIPB lpipb); PUBLIC ERR FAR PASCAL BuildBTree (HFPB, _LPIPB, LPB, HFPB, LPSTR); PUBLIC ERR FAR PASCAL FWriteBits(PFILEDATA, DWORD, BYTE); PUBLIC ERR PASCAL FAR IndexOpenRW (LPIPB, HFPB, LSZ); PUBLIC PNODEINFO PASCAL FAR AllocBTreeNode (_LPIPB lpipb); PUBLIC VOID PASCAL FAR FreeBTreeNode (PNODEINFO pNode); PUBLIC ERR PASCAL FAR ReadNewNode (HFPB, PNODEINFO, int); PUBLIC PNODEINFO PASCAL FAR AllocBTreeNode (_LPIPB lpipb); PUBLIC ERR PASCAL FAR SkipOldData (_LPIPB, PNODEINFO); PUBLIC ERR FAR PASCAL AllocSigmaTable (_LPIPB lpipb);
|