Windows NT 4.0 source code leak
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

172 lines
6.9 KiB

4 years ago
  // This file contains the definition of class CCollection and its supporting structures.
  2. #ifndef __VECTOR_H__
  3. #define __VECTOR_H__
  // ---------------------------------------------------------------------------
  // Weighting constants.
  // The three families below share a single numbering sequence (0 through 10),
  // so no two of these constants have the same value.
  // NOTE(review): presumably these are the TFModType / WeightType / NormType
  // arguments of CCollection::WeightAndInvertVectors -- confirm in the .cpp.
  // ---------------------------------------------------------------------------

  // Term-frequency modification
  #define NEWTF_NONE              0
  #define NEWTF_BINARY            1
  #define NEWTF_MAXNORM           2
  #define NEWTF_AUGNORM           3

  // Weighting
  #define WT_NONE                 4
  #define WT_TFIDF                5
  #define WT_PROB                 6

  // Normalization
  #define NORM_NONE               7
  #define NORM_SUM                8
  #define NORM_COSINE             9
  #define NORM_MAX                10

  // ---------------------------------------------------------------------------
  // State definition constants.
  // ---------------------------------------------------------------------------
  #define COLL_USABLE             0x00
  #define COLL_UNUSABLE           0x01
  #define WEIGHTED                0x02

  // ---------------------------------------------------------------------------
  // Error definitions.
  // ---------------------------------------------------------------------------
  #define COLLERROR_NOCONCEPTS    0xFFFFFF10
  #define COLLERROR_OUTOFMEMORY   0xFFFFFF11
  #define COLLERROR_BADSEQUENCE   0xFFFFFF12
  #define COLLERROR_BADINPUT      0xFFFFFF13

  // ---------------------------------------------------------------------------
  // Weights. A 1.0 is represented in fixed point as 0xFFFF.
  // ---------------------------------------------------------------------------
  #define WT_ONE                  0xFFFF
  // Macros to enhance readability.
  //
  // Every macro below expands to an expression written in terms of CCollection
  // data members (m_vb*, m_a*), so it can only be used where names with those
  // spellings are in scope.
  //
  // Each macro parameter and each whole expansion is parenthesized so that an
  // argument such as "n << 1" or "flag ? 1 : 2" is evaluated as a unit instead
  // of being re-associated with the surrounding "+". All expansions remain
  // lvalues wherever they were lvalues before.

  // Element i of the per-document vector range buffer, viewed as DWORDs.
  #define DocSentinel(i)      (*((LPDWORD)m_vbVectorRange.Base + (i)))
  // Element i of the concept half of the (concept, freq) vectors.
  #define Concept(i)          (*((LPDWORD)m_vbVectorConcept.Base + (i)))
  // Element i of the term-frequency half of the (concept, freq) vectors.
  #define TermFreq(i)         (*((LPWORD)m_vbVectorTermFreq.Base + (i)))
  // Element i of the weight buffer, viewed as floats.
  #define TermWt(i)           (*((float *)m_vbVectorWt.Base + (i)))

  // BugBug : For now assume that no term frequency will overflow. So get the value from
  // the term freq array. Later on, however, you will have to see if there is
  // an overflow and if so, get the overflow value instead.
  #define GetRealTermFreq(i)  (*((LPWORD)m_vbVectorTermFreq.Base + (i)))

  // Macro DocFreq is only meaningful UNTIL inversion. Before inversion, we reuse the cDocFreq field
  // to hold the cumulative document frequencies instead of the raw frequencies. The advantage is that
  // we don't have to use an extra field to hold a pointer to the wt list of a concept. We do, however,
  // still have to maintain a field for the Doc list of a concept because this list is compressed.
  #define DocFreq(i)          (((ConceptStruct *)m_vbConcepts.Base + (i))->cDocFreq)

  // DocFromCumFreq is only meaningful FROM inversion. cDocFreq changes from a holder of raw doc count
  // to a pointer to the beginning of the list, so the count for concept i is the difference between
  // the entries for i+1 and i.
  #define DocFromCumFreq(i)   (((ConceptStruct *)m_vbConcepts.Base + (i) + 1)->cDocFreq - \
                               ((ConceptStruct *)m_vbConcepts.Base + (i))->cDocFreq)

  // pDocList field of concept i.
  #define DocList(i)          (((ConceptStruct *)m_vbConcepts.Base + (i))->pDocList)
  // Byte i of the compressed document inverted index buffer.
  #define CodeByte(i)         (*((LPBYTE)m_vbDocInvIndex.Base + (i)))

  // Plain array accessors.
  #define DocWtCount(i)       (m_acDocWts[(i)])
  #define Document(i)         (m_aDocInvIndex[(i)])
  #define Weight(i)           (m_aWtInvIndex[(i)])

  // i-th document id / weight in the inverted list of concept "con". The list of documents
  // starts at the concept's pDocList; the list of weights starts at its (cumulative) cDocFreq.
  #define DocIdFromInvList(con, i) \
      (m_aDocInvIndex[((ConceptStruct *)m_vbConcepts.Base + (con))->pDocList + (i)])
  #define WtFromInvList(con, i) \
      (m_aWtInvIndex[((ConceptStruct *)m_vbConcepts.Base + (con))->cDocFreq + (i)])
  51. typedef struct
  52. {
  53. DWORD cDocFreq; // the number of documents in the collection, in which this concept occurs at least once
  54. // just before inversion, cDocFreq is reused to hold the cumulative values. The advantage
  55. // is that we can avoid using a third field to point to the wts of the docs.
  56. DWORD pDocList; // a pointer to the list of documents in which this concept occurs. This is an index into an array.
  57. } ConceptStruct;
  #if 0
  // Disabled: record describing one term frequency that overflowed its slot.
  struct TFOverFlowStruct
  {
      // Index of the (Con, Wt) pair that has a term frequency greater than 64K.
      DWORD iConWtIndex;

      // The overflowing value.
      DWORD cTermFreq;
  };
  #endif
  65. typedef struct
  66. {
  67. DWORD cConcepts; // Number of concepts in the dictionary of this coll
  68. DWORD offConcepts;
  69. DWORD cDocuments; // Number of documents in the collection
  70. DWORD cDocWtPairs; // Number of doc,wt [ = con,freq ] pairs
  71. DWORD offWtInvIndex;
  72. DWORD offDocInvIndex;
  73. DWORD cBitsUsedInEncoding;// Number of bits used to encode the doc inverted index.
  74. #if 0
  75. DWORD cOverFlows; // Number of overflows.
  76. #endif
  77. } CollHdr;
  78. class CTextSet;
  79. class CCollection
  80. {
  81. friend class CQuery;
  82. public:
  83. // Creator
  84. static CCollection *NewCollection();
  85. // Destructor
  86. ~CCollection();
  87. // Access Functions:
  88. void Initialize(DWORD cInEstConcepts, DWORD cInMaxConcepts, DWORD cInEstDocuments, DWORD cInMaxDocuments, DWORD cInEstConWtPairs, DWORD cInMaxConWtPairs);
  89. void RecordConcept(DWORD ConceptId);
  90. void NewDocument();
  91. void WeightAndInvertVectors(BYTE TFModType, BYTE WeightType, BYTE NormType);
  92. BOOL Serialize(HANDLE hInFile, BOOL fSaveVectors);
  93. BOOL Unserialize(HANDLE hInFile);
  94. void SetNumberOfConcepts(DWORD cInConcepts);
  95. // Information Functions:
  96. BYTE GetCollState() { return m_bCollState; }
  97. BOOL IsConceptIdValid(DWORD ConceptId) { if (ConceptId > m_cConcepts) return FALSE; return TRUE;}
  98. DWORD GetDocumentCount() {return m_cDocuments;}
  99. // Save/Load Functions
  100. void StoreImage(CPersist *pDiskImage);
  101. static CCollection *CreateImage(CPersist *pDiskImage);
  102. void ConnectImage(CPersist *pDiskImage);
  103. private:
  104. // Constructor
  105. CCollection();
  106. // Internal functions.
  107. void ApplyWeightingScheme(BYTE TFModType, BYTE WeightType, BYTE NormType, DWORD iFirstConWt, DWORD cConWts);
  108. DWORD GetDocumentGap(LPDWORD startBitPos);
  109. private:
  110. // Internal variables
  111. // The following provides memory to implement the collection.
  112. LPDWORD m_acDocWts; // array of doc,wt pair counts used to aid in the inversion process
  113. LPWORD m_aWtInvIndex; // wt component of the Doc,Wt inverted index
  114. LPDWORD m_aDocInvIndex; // Doc component of the Doc,Wt inverted index
  115. // The vectors are (concept, freq) pairs. We are implementing that as two structures. One is an array of
  116. // concepts and the other is an array of term frequencies. If we have to implement the tuple as one
  117. // structure, we will be wasting a WORD for every structure.
  118. MY_VIRTUAL_BUFFER m_vbConcepts; // buffer to hold an array of concept structures
  119. MY_VIRTUAL_BUFFER m_vbVectorRange; // tracks the start and end of vector representation for a given document
  120. // in the (Concept, Freq) array
  121. MY_VIRTUAL_BUFFER m_vbVectorConcept; // the concept part of the vector representation
  122. MY_VIRTUAL_BUFFER m_vbVectorTermFreq; // the term frequency part of the vector representation
  123. MY_VIRTUAL_BUFFER m_vbVectorWt; // the temporary buffer used to convert term freq to buffer
  124. MY_VIRTUAL_BUFFER m_vbDocInvIndex; // the buffer used to store the compressed document gaps in the document index
  125. #if 0
  126. MY_VIRTUAL_BUFFER m_vbTFOverFlow; // buffer to hold the term frequencies that are GT 64K.
  127. // It is very unlikely that we will ever have a term that occurs more than 64K
  128. // times in a document, but that case should be accounted for.
  129. #endif
  130. // The following track the state of the collection.
  131. BYTE m_bCollState; // tracks the state of the collection
  132. DWORD m_cConcepts; // number of unique concepts in the dictionary
  133. DWORD m_cDocuments; // number of documents in the collection
  134. DWORD m_cConWts; // number of ConWt pairs seen so far
  135. #if 0
  136. DWORD m_cOverFlows; // number of term frequency overflows
  137. #endif
  138. DWORD m_cBitsUsedInEncoding; // number of bits used to encode the doc gaps in the inverted list
  139. BOOL m_fLoadedFromDisk; // indicates if it has been loaded from disk
  140. // Used for integration with Ron's code
  141. CTextSet *m_pts;
  142. public:
  143. CTextSet * PTextSet() {return m_pts;};
  144. void SetTextSet(CTextSet *pts) {m_pts = pts;};
  145. };
  146. #endif // __VECTOR_H__