Windows NT 4.0 source code leak
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

172 lines
6.9 KiB

// This file contains the definition of class CCollection, together with its supporting constants, accessor macros and structures
#ifndef __VECTOR_H__
#define __VECTOR_H__
// Weighting constants.
// The NEWTF_*, WT_* and NORM_* groups share a single numbering sequence (0..10),
// so no two selectors have the same value.
// NOTE(review): these presumably correspond to the TFModType, WeightType and
// NormType arguments of CCollection::WeightAndInvertVectors -- confirm against
// the implementation.
// NEWTF_*: term-frequency modification selectors
#define NEWTF_NONE 0
#define NEWTF_BINARY 1
#define NEWTF_MAXNORM 2
#define NEWTF_AUGNORM 3
// WT_*: weighting scheme selectors
#define WT_NONE 4
#define WT_TFIDF 5
#define WT_PROB 6
// NORM_*: normalization selectors
#define NORM_NONE 7
#define NORM_SUM 8
#define NORM_COSINE 9
#define NORM_MAX 10
// State definition constants.
// NOTE(review): presumably the values stored in CCollection::m_bCollState and
// returned by GetCollState(); whether WEIGHTED is OR-ed in as a flag or used as
// a plain value is not visible in this header.
#define COLL_USABLE 0x00
#define COLL_UNUSABLE 0x01
#define WEIGHTED 0x02
// Error definitions.
// NOTE(review): this header does not show where these codes are raised or
// returned; see the implementation file.
#define COLLERROR_NOCONCEPTS 0xFFFFFF10
#define COLLERROR_OUTOFMEMORY 0xFFFFFF11
#define COLLERROR_BADSEQUENCE 0xFFFFFF12
#define COLLERROR_BADINPUT 0xFFFFFF13
// Weights are stored in fixed point; a weight of 1.0 is represented as 0xFFFF.
#define WT_ONE 0xFFFF
// Accessor macros to enhance readability.
// They refer to CCollection's m_* members by unqualified name, so they are meant
// to be used where those names are in scope (CCollection member functions).
// Every macro argument and every dereferencing expansion is fully parenthesized
// so that compound index expressions (e.g. i << 1, a | b) and postfix operators
// applied to the result (e.g. TermFreq(i)++) bind the way the caller expects.

// Entry i of the DWORD array held in m_vbVectorRange (document start/end sentinels).
#define DocSentinel(i) (*((LPDWORD)m_vbVectorRange.Base + (i)))
// Entry i of the DWORD concept array held in m_vbVectorConcept.
#define Concept(i) (*((LPDWORD)m_vbVectorConcept.Base + (i)))
// Entry i of the WORD term-frequency array held in m_vbVectorTermFreq.
#define TermFreq(i) (*((LPWORD)m_vbVectorTermFreq.Base + (i)))
// Entry i of the float array held in m_vbVectorWt.
#define TermWt(i) (*((float *)m_vbVectorWt.Base + (i)))
// BugBug : For now assume that no term frequency will overflow, so the value is read
// directly from the term freq array. Later on, however, this will have to check
// whether there was an overflow and, if so, fetch the overflowed value instead.
#define GetRealTermFreq(i) (*((LPWORD)m_vbVectorTermFreq.Base + (i)))
// Macro DocFreq is only meaningful UNTIL inversion. Just before inversion, the cDocFreq field is
// reused to hold the cumulative document frequencies instead of the raw frequencies. The advantage
// is that no extra field is needed to hold a pointer to the wt list of a concept. A separate field
// is, however, still needed for the Doc list of a concept because that list is compressed.
#define DocFreq(i) (((ConceptStruct *)m_vbConcepts.Base + (i))->cDocFreq)
// DocFromCumFreq is only meaningful FROM inversion. cDocFreq changes from a holder of the raw doc
// count to a pointer to the beginning of the list, so the count for concept i is the difference
// between entry i+1 and entry i. Note that this reads entry i+1 of the concept array.
#define DocFromCumFreq(i) (((ConceptStruct *)m_vbConcepts.Base + (i) + 1)->cDocFreq - ((ConceptStruct *)m_vbConcepts.Base + (i))->cDocFreq)
// pDocList field of concept i.
#define DocList(i) (((ConceptStruct *)m_vbConcepts.Base + (i))->pDocList)
// Byte i of the buffer held in m_vbDocInvIndex.
#define CodeByte(i) (*((LPBYTE)m_vbDocInvIndex.Base + (i)))
// Plain array accessors; the subscript brackets already delimit the argument.
#define DocWtCount(i) m_acDocWts[i]
#define Document(i) m_aDocInvIndex[i]
#define Weight(i) m_aWtInvIndex[i]
// Entry i of concept con's list in m_aDocInvIndex, located via the concept's pDocList.
#define DocIdFromInvList(con, i) m_aDocInvIndex[((ConceptStruct *)m_vbConcepts.Base + (con))->pDocList + (i)]
// Entry i of concept con's list in m_aWtInvIndex, located via the concept's (cumulative) cDocFreq.
#define WtFromInvList(con, i) m_aWtInvIndex[((ConceptStruct *)m_vbConcepts.Base + (con))->cDocFreq + (i)]
// Per-concept bookkeeping record. The accessor macros treat m_vbConcepts.Base as
// an array of these.
struct ConceptStruct
{
    // The number of documents in the collection in which this concept occurs at
    // least once. Just before inversion this field is reused to hold the
    // cumulative values, which avoids a third field pointing to the wts of the docs.
    DWORD cDocFreq;

    // A pointer to the list of documents in which this concept occurs.
    // This is an index into an array, not a memory address.
    DWORD pDocList;
};
#if 0
// Disabled: record describing one term frequency that overflowed its slot.
struct TFOverFlowStruct
{
    // Index of the Con, Wt pair that has a termfreq greater than 64K.
    DWORD iConWtIndex;

    // The overflowing value.
    DWORD cTermFreq;
};
#endif
// Collection header record.
// NOTE(review): presumably the header written/read by CCollection::Serialize and
// Unserialize; the field order is kept exactly as declared -- confirm before
// reordering anything here.
struct CollHdr
{
    // Number of concepts in the dictionary of this coll.
    DWORD cConcepts;
    DWORD offConcepts;

    // Number of documents in the collection.
    DWORD cDocuments;

    // Number of doc,wt [ = con,freq ] pairs.
    DWORD cDocWtPairs;
    DWORD offWtInvIndex;
    DWORD offDocInvIndex;

    // Number of bits used to encode the doc inverted index.
    DWORD cBitsUsedInEncoding;
#if 0
    // Number of overflows.
    DWORD cOverFlows;
#endif
};
class CTextSet;

// CCollection holds the (concept, term frequency) vectors recorded for a set of
// documents together with the Doc,Wt inverted index built from them.
// Instances are obtained through the static factories NewCollection() and
// CreateImage(); the constructor is private and copying is disabled.
class CCollection
{
    friend class CQuery;

public:
    // Creator
    static CCollection *NewCollection();

    // Destructor
    ~CCollection();

    // Access Functions:

    // NOTE(review): the cInEst* / cInMax* arguments are presumably the estimated
    // and maximum element counts for the concept, document and (concept, weight)
    // pair buffers -- confirm against the implementation.
    void Initialize(DWORD cInEstConcepts, DWORD cInMaxConcepts,
                    DWORD cInEstDocuments, DWORD cInMaxDocuments,
                    DWORD cInEstConWtPairs, DWORD cInMaxConWtPairs);
    void RecordConcept(DWORD ConceptId);
    void NewDocument();
    // NOTE(review): the arguments are presumably taken from the NEWTF_*, WT_* and
    // NORM_* constants respectively -- confirm against the implementation.
    void WeightAndInvertVectors(BYTE TFModType, BYTE WeightType, BYTE NormType);
    BOOL Serialize(HANDLE hInFile, BOOL fSaveVectors);
    BOOL Unserialize(HANDLE hInFile);
    void SetNumberOfConcepts(DWORD cInConcepts);

    // Information Functions:

    // Returns the current value of the collection state byte.
    BYTE GetCollState() const { return m_bCollState; }

    // Returns FALSE when ConceptId is greater than the number of concepts,
    // TRUE otherwise.
    // NOTE(review): ConceptId == m_cConcepts is accepted. If concept ids are
    // zero-based this is off by one; the id range is not visible in this header,
    // so the original comparison is kept -- confirm before tightening it.
    BOOL IsConceptIdValid(DWORD ConceptId) const
    {
        return (ConceptId > m_cConcepts) ? FALSE : TRUE;
    }

    // Returns the number of documents in the collection.
    DWORD GetDocumentCount() const { return m_cDocuments; }

    // Save/Load Functions
    void StoreImage(CPersist *pDiskImage);
    static CCollection *CreateImage(CPersist *pDiskImage);
    void ConnectImage(CPersist *pDiskImage);

private:
    // Constructor
    CCollection();

    // Copying is disabled: the class declares a destructor and holds raw buffer
    // pointers, so an implicit member-wise copy would leave two objects referring
    // to the same buffers. Declared private and intentionally given no definition.
    CCollection(const CCollection &);
    CCollection &operator=(const CCollection &);

    // Internal functions.
    void ApplyWeightingScheme(BYTE TFModType, BYTE WeightType, BYTE NormType,
                              DWORD iFirstConWt, DWORD cConWts);
    DWORD GetDocumentGap(LPDWORD startBitPos);

private:
    // Internal variables

    // The following provides memory to implement the collection.
    LPDWORD m_acDocWts;     // array of doc,wt pair counts used to aid in the inversion process
    LPWORD m_aWtInvIndex;   // wt component of the Doc,Wt inverted index
    LPDWORD m_aDocInvIndex; // Doc component of the Doc,Wt inverted index

    // The vectors are (concept, freq) pairs, implemented as two parallel
    // structures: one array of concepts and one array of term frequencies.
    // Implementing the tuple as a single structure would waste a WORD per entry.
    MY_VIRTUAL_BUFFER m_vbConcepts;       // buffer to hold an array of concept structures
    MY_VIRTUAL_BUFFER m_vbVectorRange;    // tracks the start and end of the vector representation
                                          // for a given document in the (Concept, Freq) array
    MY_VIRTUAL_BUFFER m_vbVectorConcept;  // the concept part of the vector representation
    MY_VIRTUAL_BUFFER m_vbVectorTermFreq; // the term frequency part of the vector representation
    MY_VIRTUAL_BUFFER m_vbVectorWt;       // temporary buffer used when converting term frequencies
                                          // (presumably to weights; TermWt reads it as float)
    MY_VIRTUAL_BUFFER m_vbDocInvIndex;    // buffer used to store the compressed document gaps
                                          // in the document index
#if 0
    MY_VIRTUAL_BUFFER m_vbTFOverFlow;     // buffer to hold the term frequencies that are GT 64K.
                                          // It is very unlikely that a term will ever occur more
                                          // than 64K times in a document, but that case should be
                                          // accounted for.
#endif

    // The following track the state of the collection.
    BYTE m_bCollState;  // tracks the state of the collection
    DWORD m_cConcepts;  // number of unique concepts in the dictionary
    DWORD m_cDocuments; // number of documents in the collection
    DWORD m_cConWts;    // number of ConWt pairs seen so far
#if 0
    DWORD m_cOverFlows; // number of term frequency overflows
#endif
    DWORD m_cBitsUsedInEncoding; // number of bits used to encode the doc gaps in the inverted list
    BOOL m_fLoadedFromDisk;      // indicates if it has been loaded from disk

    // Used for integration with Ron's code
    CTextSet *m_pts;

public:
    // Returns the associated text set pointer (whatever was last passed to SetTextSet).
    CTextSet *PTextSet() const { return m_pts; }

    // Stores the text set pointer; the pointer is only recorded here.
    void SetTextSet(CTextSet *pts) { m_pts = pts; }
};
#endif // __VECTOR_H__