windows-nt-4.0/private/windows/win4help/ftsrch/vector.h


								// This file contains the definition of class CCollection, together with the
								// constants, accessor macros and structures it relies on.

								#ifndef __VECTOR_H__


								#define __VECTOR_H__


								// weighting constants

								// Three families of selector values, distinguished by prefix:
								//   NEWTF_*                     values 0..3
								//   WT_NONE / WT_TFIDF / WT_PROB  values 4..6
								//   NORM_*                      values 7..10
								// The numeric values are unique across all three families.
								// NOTE(review): presumably these are the values accepted by the TFModType,
								// WeightType and NormType parameters of CCollection::WeightAndInvertVectors,
								// respectively -- confirm against the implementation.

								#define	NEWTF_NONE		0

								#define NEWTF_BINARY	1

								#define NEWTF_MAXNORM	2

								#define NEWTF_AUGNORM	3

								#define	WT_NONE			4

								#define	WT_TFIDF		5

								#define	WT_PROB			6

								#define NORM_NONE		7

								#define	NORM_SUM		8

								#define	NORM_COSINE		9

								#define	NORM_MAX		10


								// state definition constants

								// NOTE(review): presumably the values held in CCollection::m_bCollState (a BYTE,
								// returned by GetCollState()). Whether WEIGHTED is OR-ed in as a flag or stored
								// as a standalone value is not visible in this header -- confirm against the
								// implementation.

								#define	COLL_USABLE		0x00

								#define	COLL_UNUSABLE	0x01

								#define	WEIGHTED		0x02


								// error definitions

								// All four codes share the form 0xFFFFFFxx (upper 24 bits set) and therefore
								// only fit in an unsigned 32-bit quantity. Where they are raised or returned
								// is not visible in this header.

								#define COLLERROR_NOCONCEPTS	0xFFFFFF10

								#define COLLERROR_OUTOFMEMORY	0xFFFFFF11

								#define	COLLERROR_BADSEQUENCE	0xFFFFFF12

								#define	COLLERROR_BADINPUT		0xFFFFFF13


								// Weights. A 1.0 is represented in fixed point as 0xFFFF

								// 0xFFFF is the largest value a 16-bit WORD can hold; the weight component of
								// the inverted index (CCollection::m_aWtInvIndex) is declared as an array of WORD.

								#define	WT_ONE			0xFFFF


								// macros to enhance readability
								//
								// Each macro expands to an lvalue naming element i of one of the CCollection
								// virtual buffers, viewed as an array of the indicated element type. The
								// expansions use the unqualified member names (m_vbVectorRange, ...), so they
								// can only be used where those names are in scope.
								//
								// The index argument and the whole expansion are parenthesized so that any
								// expression (e.g. "a << 1", "a & mask", "cond ? a : b") can be passed as the
								// index and the result can be combined with any surrounding operator.

								// DWORD element i of m_vbVectorRange
								#define DocSentinel(i)		(*((LPDWORD)m_vbVectorRange.Base + (i)))

								// DWORD element i of m_vbVectorConcept
								#define Concept(i)			(*((LPDWORD)m_vbVectorConcept.Base + (i)))

								// WORD element i of m_vbVectorTermFreq
								#define TermFreq(i)			(*((LPWORD)m_vbVectorTermFreq.Base + (i)))

								// float element i of m_vbVectorWt
								#define TermWt(i)			(*((float *)m_vbVectorWt.Base + (i)))

								// BugBug : For now assume that no term frequency will overflow. So get the value from
								//          the term freq array. Later on, however, you will have to see if there is
								//          an overflow and if so, get the overflowing value instead.
								// Currently identical to TermFreq(i).
								#define GetRealTermFreq(i)	(*((LPWORD)m_vbVectorTermFreq.Base + (i)))

								// Accessors for the concept table (an array of ConceptStruct at m_vbConcepts.Base)
								// and for the inverted index arrays. As with the vector macros, the expansions use
								// unqualified member names and are only usable where those names are in scope.
								// All index arguments and all expansions are parenthesized so that arbitrary
								// expressions can be passed as indices.

								// Macro DocFreq is only meaningful UNTIL inversion. Before inversion, we reuse the cDocFreq field
								// to hold the cumulative document frequencies instead of the raw frequencies. The advantage is that
								// we don't have to use an extra field to hold a pointer to the wt list of a concept. We do, however,
								// still have to maintain a field for the Doc list of a concept because this list is compressed.
								#define DocFreq(i)			(((ConceptStruct *)m_vbConcepts.Base + (i))->cDocFreq)

								// DocFromCumFreq is only meaningful FROM inversion. cDocFreq changes from a holder of raw doc count
								// to a pointer to the beginning of the list.
								// Evaluates to cDocFreq of entry i+1 minus cDocFreq of entry i, so entry i+1 must exist.
								#define	DocFromCumFreq(i)	(((ConceptStruct *)m_vbConcepts.Base + (i) + 1)->cDocFreq - ((ConceptStruct *)m_vbConcepts.Base + (i))->cDocFreq)

								// pDocList field of concept entry i
								#define DocList(i)			(((ConceptStruct *)m_vbConcepts.Base + (i))->pDocList)

								// BYTE element i of m_vbDocInvIndex
								#define CodeByte(i)			(*((LPBYTE)m_vbDocInvIndex.Base + (i)))

								// Plain element accessors for the three raw arrays
								#define DocWtCount(i)		(m_acDocWts[(i)])
								#define Document(i)			(m_aDocInvIndex[(i)])
								#define	Weight(i)			(m_aWtInvIndex[(i)])

								// Element i of concept con's list: the doc list starts at index pDocList in
								// m_aDocInvIndex, the weight list starts at index cDocFreq in m_aWtInvIndex.
								#define	DocIdFromInvList(con, i)	(m_aDocInvIndex[((ConceptStruct *)m_vbConcepts.Base + (con))->pDocList + (i)])
								#define WtFromInvList(con, i)		(m_aWtInvIndex[((ConceptStruct *)m_vbConcepts.Base + (con))->cDocFreq + (i)])


								// Per-concept record. The DocFreq / DocFromCumFreq / DocList / DocIdFromInvList /
								// WtFromInvList macros treat m_vbConcepts.Base as an array of these records and
								// index it by concept number, so the field layout must not change.
								typedef struct

								{

									DWORD	cDocFreq;	// the number of documents in the collection, in which this concept occurs at least once

														// just before inversion, cDocFreq is reused to hold the cumulative values. The advantage

														// is that we can avoid using a third field to point to the wts of the docs.

									DWORD	pDocList;	// a pointer to the list of documents in which this concept occurs. This is an index into an array.

								} ConceptStruct;


								// Disabled (#if 0): record for a term frequency that does not fit in the 16-bit
								// term frequency array. Not compiled in; the matching members in CollHdr and
								// CCollection are disabled with #if 0 as well.
								#if 0

								typedef struct

								{

									DWORD iConWtIndex;	// index of the Con, Wt pair that has a termfreq greater than 64K

									DWORD cTermFreq;	// the overflowing value

								} TFOverFlowStruct;

								#endif


								// Header summarising a collection: element counts plus three off* fields.
								// NOTE(review): presumably this is the header written/read when a collection is
								// saved or loaded (Serialize / Unserialize / StoreImage), in which case the field
								// order and sizes are part of the stored format -- confirm against the implementation.
								typedef struct

								{

									DWORD	cConcepts;			// Number of concepts in the dictionary of this coll

									DWORD	offConcepts;		// NOTE(review): presumably the offset of the concept table -- confirm

									DWORD	cDocuments;			// Number of documents in the collection

									DWORD	cDocWtPairs;		// Number of doc,wt [ = con,freq ] pairs

									DWORD	offWtInvIndex;		// NOTE(review): presumably the offset of the wt inverted index -- confirm

									DWORD	offDocInvIndex;		// NOTE(review): presumably the offset of the doc inverted index -- confirm

									DWORD	cBitsUsedInEncoding;// Number of bits used to encode the doc inverted index.

								#if 0

									DWORD	cOverFlows;			// Number of overflows.

								#endif

								} CollHdr;


								class CTextSet;


								class CCollection

								{

								friend class CQuery;


								public:


								    // Creator


								    static CCollection *NewCollection();


									// Destructor

									~CCollection();


									// Access Functions:

									void	Initialize(DWORD cInEstConcepts, DWORD cInMaxConcepts, DWORD cInEstDocuments, DWORD cInMaxDocuments, DWORD cInEstConWtPairs, DWORD cInMaxConWtPairs);

									void	RecordConcept(DWORD ConceptId);

									void	NewDocument();

									void	WeightAndInvertVectors(BYTE TFModType, BYTE WeightType, BYTE NormType);

									BOOL	Serialize(HANDLE hInFile, BOOL fSaveVectors);

									BOOL	Unserialize(HANDLE hInFile);

									void	SetNumberOfConcepts(DWORD cInConcepts);


									// Information Functions:

									BYTE	GetCollState() { return m_bCollState; }

									BOOL	IsConceptIdValid(DWORD ConceptId) { if (ConceptId > m_cConcepts) return FALSE; return TRUE;}

									DWORD	GetDocumentCount() {return m_cDocuments;}


									// Save/Load Functions

									void   StoreImage(CPersist *pDiskImage);

									static CCollection	*CreateImage(CPersist *pDiskImage);

									void   ConnectImage(CPersist *pDiskImage);


								private:

									// Constructor

									CCollection();


									// Internal functions.

									void	ApplyWeightingScheme(BYTE TFModType, BYTE WeightType, BYTE NormType, DWORD iFirstConWt, DWORD cConWts);

									DWORD	GetDocumentGap(LPDWORD startBitPos);


								private:

									// Internal variables

									// The following provides memory to implement the collection.

									LPDWORD			m_acDocWts;			// array of doc,wt pair counts used to aid in the inversion process

									LPWORD			m_aWtInvIndex;		// wt component of the Doc,Wt inverted index

									LPDWORD			m_aDocInvIndex;		// Doc component of the Doc,Wt inverted index

									// The vectors are (concept, freq) pairs. We are implementing that as two structures. One is an array of

									// concepts and the other is an array of term frequencies. If we have to implement the tuple as one

									// structure, we will be wasting a WORD for every structure.

									MY_VIRTUAL_BUFFER  m_vbConcepts;	   // buffer to hold an array of concept structures

									MY_VIRTUAL_BUFFER  m_vbVectorRange;    // tracks the start and end of vector representation for a given document

																		// in the (Concept, Freq) array

									MY_VIRTUAL_BUFFER  m_vbVectorConcept;  // the concept part of the vector representation

									MY_VIRTUAL_BUFFER  m_vbVectorTermFreq; // the term frequency part of the vector representation

									MY_VIRTUAL_BUFFER  m_vbVectorWt;	   // the temporary buffer used to convert term freq to buffer

									MY_VIRTUAL_BUFFER  m_vbDocInvIndex;    // the buffer used to store the compressed document gaps in the document index

								#if 0

									MY_VIRTUAL_BUFFER  m_vbTFOverFlow;	   // buffer to hold the term frequencies that are GT 64K.

																		// It is very unlikely that we will ever have a term that occurs more than 64K

																		// times in a document, but that case should be accounted for.

								#endif


									// The following track the state of the collection.

									BYTE	m_bCollState;	// tracks the state of the collection

									DWORD	m_cConcepts;	// number of unique concepts in the dictionary

									DWORD	m_cDocuments;	// number of documents in the collection

									DWORD	m_cConWts;		// number of ConWt pairs seen so far

								#if 0

									DWORD	m_cOverFlows;	// number of term frequency overflows

								#endif

									DWORD	m_cBitsUsedInEncoding; 	// number of bits used to encode the doc gaps in the inverted list

									BOOL	m_fLoadedFromDisk;		// indicates if it has been loaded from disk


									// Used for integration with Ron's code

								  	CTextSet      *m_pts;


								public:

									CTextSet * PTextSet() {return m_pts;};

									void SetTextSet(CTextSet *pts) {m_pts = pts;};

								};


								#endif // __VECTOR_H__