// Mirror of https://github.com/lianthony/NT4.0
// TxDBase.h -- CTextDatabase class definition
|
|
|
|
#ifndef __TXDBASE_H__
|
|
|
|
#define __TXDBASE_H__
|
|
|
|
#include "SegHash.h"
|
|
#include "VMBuffer.h"
|
|
#include "Indicate.h"
|
|
#include "Classify.h"
|
|
#include "Defines.h"
|
|
#include "TextMat.h"
|
|
#include "UnbuffIO.h"
|
|
#include "IOList.h"
|
|
#include "IOStream.h"
|
|
#include "FTSIFace.h"
|
|
#include "Compress.h"
|
|
#include "Util.h"
|
|
#include "Sorting.h"
|
|
#include "dict.h"
|
|
#include "vector.h"
|
|
|
|
// !!! BugBug !!! Parts of the long comment below are now incorrect and need revision.
|
|
|
|
// Term Tags Strategies
|
|
//
|
|
// The term tag data structures go with term entries in the segmented
|
|
// hash tables. We use two hash tables - the Global table and the Galactic
|
|
// table. The term tags for the two tables are designed for the actions we
|
|
// take with the hash table.
|
|
//
|
|
// Our index work proceeds in four phases --
|
|
//
|
|
// 1. Constructing Local Dictionaries
|
|
// During this phase we build a local term dictionary with very restricted
|
|
// context. The dictionary strictly covers a range of up to 65536 tokens.
|
|
// The local dictionary is an unsegmented hash table biased for speed and very
|
|
// low collision rates.
|
|
//
|
|
// 2. Linking Local Dictionaries with the Global Dictionary
|
|
// When a local dictionary reaches a capacity limit or when we must force
|
|
// our text database to a searchable state, we link local dictionary entries
|
|
// with corresponding global hash table entries. This has two effects --
|
|
//
|
|
// A. We merge the local terms with the global terms, adding new unique terms
|
|
// to the global list.
|
|
//
|
|
// B. We now have a global linked list which traverses all the references for
|
|
// each term. This was the original searchable format for the database. It
|
|
// works well so long as the database fits entirely within RAM and degrades
|
|
// when our working set significantly exceeds RAM space. The current code
|
|
// doesn't construct these global links, but relies instead on the flattening
|
|
// phase below.
|
|
//
|
|
// 3. Flattening Linked Lists
|
|
// When the collection of reference links in the local dictionaries reaches
|
|
// a memory size threshold, we traverse the linked lists and construct a
|
|
// collection of flattened vectors of reference indices. At the same time we
|
|
// compress the streams of reference indices. The compression algorithm relies
|
|
// on three fields maintained for each term in the term tag --
|
|
//
|
|
// A. iNewRefFirst -- the index of the first instance in the linked stream
|
|
// B. iNewRefLast -- the index of the last instance in the linked stream
|
|
// C. cRefsNew -- the number of instances in the linked stream
|
|
//
|
|
// To merge new flattened vectors with previously accumulated vectors we
|
|
// maintain four additional term tag fields --
|
|
//
|
|
// B. iRefListBase -- index to the stream of flattened references for this term.
|
|
// C. cdwRefs -- size of the flattened reference lists in DWords.
|
|
// D. iRefSequence -- ranking order of this term relative to the galactic table
|
|
// A -1 value indicates no ranking.
|
|
//
|
|
// Note that we simply catenate the compressed reference lists from sequential
|
|
// flattening passes. Thus the stream denoted by iRefListBase has the format --
|
|
//
|
|
// {cdw, cRefsStream, iRefFirst, <<basis>, <compressed ref>...>} ...
|
|
//
|
|
// where --
|
|
//
|
|
// cdw is the size of the reference list segment in DWords
|
|
// cRefsStream is the number of references in the list.
|
|
// iRefFirst is the first reference in the list
|
|
// <basis> is a five-bit value which drives the compression
|
|
// algorithm.
|
|
// <compressed ref> values are variable length bit strings which
|
|
// represent the delta between successive reference
|
|
// indices.
|
|
//
|
|
// Note that cdw could be derived from cRefsStream and <basis>. We include
|
|
// it in the reference stream to allow a fast traversal of the stream when
|
|
// we're looking for a particular indexing range.
|
|
//
|
|
// The iRefSequence field is an ordering value maintained in the galactic hash
|
|
// table for each unique term. It's used to speed up the process of merging new
|
|
// reference vector segments with previously accumulated segments. The strategy
|
|
// is to keep the reference segments in an incrementally committed memory address
|
|
// range and to insert segments by copying reference streams upward in memory,
|
|
// inserting the new segments as we go. The iRefSequence field gives us the memory
|
|
// order for the reference streams.
|
|
//
|
|
// Note that for terms with less than four references we keep the reference
|
|
// information entirely in the term tag. That situation is denoted by negative
|
|
// values in cRefsTotal, iRefListBase, and cdwRefs. Zero values are used to
|
|
// mark the one and two cases. The actual index values are the logical
|
|
// negation (~) of those fields in the order mentioned above. When a new vector
|
|
// segment would push the global total beyond three, we merge the old vector
|
|
// with the new one and create an external list.
|
|
//
|
|
// Note that the iRefSequence field may be undefined (-1) or defined >= 0 for
|
|
// a term with less than four references. This is because the galactic table
|
|
// may have other references that push us over the limit.
|
|
//
|
|
// Why do we bother with this complicated scheme? Its value lies in reducing
|
|
// the number of items in the reference stream. During the merge work this
|
|
// reduces the number of items that must be slid upward in memory and it
|
|
// reduces the number of iRefListBase fields that must be adjusted during
|
|
// the merge operation. Approximately 45% of all unique terms are used only
|
|
// once. By keeping small lists in the tag, we reduce the number of external
|
|
// lists by 75%.
|
|
//
|
|
// 4. Galactic Merges
|
|
// When the global table reaches a memory size threshold, we merge its reference
|
|
// information with the galactic hash table and restart our indexing work with
|
|
// an empty global table. The issue here is keeping the global table small enough
|
|
// so that it fits completely within RAM during phase 2 work.
|
|
//
|
|
// The galactic term tags contain only the accumulation fields --
|
|
//
|
|
// B. iRefListBase -- index to the stream of flattened references for this term.
|
|
// C. cdwRefs -- size of the flattened reference lists in DWords.
|
|
// D. iRefSequence -- ranking order of this term relative to the galactic table.
|
|
// A -1 value indicates no ranking.
|
|
|
|
typedef struct _TermTagGlobal
|
|
{
|
|
UINT iGlobalDesc; // Global sequence # for term.
|
|
UINT iGalacticDesc; // Galactic sequence # for term.
|
|
|
|
// UINT iNewRefFirst; // First linked global ref.
|
|
// UINT iNewRefLast; // Last linked global ref.
|
|
UINT cRefsNew; // # of linked global refs.
|
|
UINT cRefsGlobal;
|
|
|
|
} TermTagGlobal;
|
|
|
|
typedef TermTagGlobal *PTermTagGlobal;
|
|
|
|
typedef struct _TermTagGalactic
|
|
{
|
|
UINT iGalacticDesc; // Galactic sequence # for term.
|
|
|
|
} TermTagGalactic;
|
|
|
|
typedef TermTagGalactic *PTermTagGalactic;
|
|
|
|
typedef struct _DESCRIPTOR
|
|
{
|
|
PWCHAR pwDisplay; // pbImage is Sort Key, pwDisplay is Display Image.
|
|
|
|
union
|
|
{
|
|
PWCHAR pbImage; // Length given by delta with following pd->pbImage.
|
|
UINT iGalactic;
|
|
};
|
|
|
|
union
|
|
{
|
|
UINT cReferences; // Used while building a CTextDatabase
|
|
UINT iTokenInfo; // Used in CTokenCollection
|
|
UINT iTextSet; // Used in CTitleCollection
|
|
};
|
|
|
|
WORD cwDisplay;
|
|
BYTE bCharset;
|
|
BYTE fImageFlags;
|
|
|
|
} DESCRIPTOR;
|
|
|
|
typedef DESCRIPTOR *PDESCRIPTOR;
|
|
|
|
// Returns the length of a descriptor's sort-key image, measured as the
// distance between pd->pbImage and the pbImage of the next descriptor.
// Both pointers are PWCHAR, so the result is a count of WCHAR elements.
// The function reads (pd+1), so a following descriptor must exist.
//
// With MESSAGEBOXES defined, an image longer than 256 elements is reported
// through two message boxes; otherwise the length is asserted to be < 1024.
inline UINT CbImage(PDESCRIPTOR pd)
{
    // Compute the delta once; every check below uses the same value.
    const int cwcImage= (int) ((pd+1)->pbImage - pd->pbImage);

#ifdef MESSAGEBOXES

    if (256 < cwcImage)
    {
        char ac[256];

        wsprintf(ac, "Token length: %d", cwcImage);

        ::MessageBox(NULL, ac, "Very Large Token!", MB_OK);

        // The image is wide-character text and is not necessarily
        // NUL-terminated. "%ls" makes wsprintf treat the argument as a wide
        // string, and the precision limits the output to the first 50
        // characters (we know at least 256 are present in this branch).
        wsprintf(ac, "Token Image: \"%.50ls...\"", pd->pbImage);

        ::MessageBox(NULL, ac, "Part of the token image!", MB_OK);
    }

#else // MESSAGEBOXES

    ASSERT(1024 > cwcImage);

#endif // MESSAGEBOXES

    return (UINT) cwcImage;
}
|
|
|
|
// Returns the length, in WCHAR elements, of a descriptor's display image:
// the distance between pd->pwDisplay and the pwDisplay of the next
// descriptor. Reads (pd+1), so a following descriptor must exist.
inline UINT CwDisplay(PDESCRIPTOR pd)
{
    const int cwcDisplay= (int) ((pd+1)->pwDisplay - pd->pwDisplay);

    ASSERT(1024 > cwcDisplay);

    return (UINT) cwcDisplay;
}
|
|
|
|
// Flag definitions for DESCRIPTOR.fImageFlags:
|
|
|
|
// #define LETTER_CHAR 0x0001
|
|
// #define CONTAINS_A_TAB 0x0002
|
|
// #define TOKEN_FLAGS_MASK 0x0003
|
|
// #define REF_TYPE_MASK 0x000C
|
|
// #define BASIS_MASK 0xF800
|
|
// #define REFS_LINKED 0x0010
|
|
|
|
// #define BASIS_SHIFT 11
|
|
|
|
// Reference types for REF_TYPE_MASK:
|
|
|
|
// #define SingleRef
|
|
// #define PairRef
|
|
// #define TripleRef
|
|
|
|
UINT CBitsToRepresent(UINT ui);
|
|
UINT FormatAToken(PDESCRIPTOR pd, int cbOffset, int iColStart, int iColLimit, PWCHAR pbLine);
|
|
|
|
void SortTokenImages(PDESCRIPTOR pdBase, PDESCRIPTOR **pppdSorted, PDESCRIPTOR **pppdTailSorted,
|
|
PUINT pcdSorted, UINT cd
|
|
);
|
|
|
|
|
|
// #define BUILD_LOCAL_HASH(hv,c) hv= ((hv << 5) | (hv >> 27)) - c
|
|
// #define BUILD_GLOBAL_HASH(hv,c) hv= ((hv >> 5) | (hv << 27)) - c
|
|
|
|
// One token reference relative to a local dictionary: two 16-bit fields.
typedef struct _LocalToken
{
    unsigned short iLocalDescriptorEntry;   // Which local descriptor this token refers to.
    unsigned short iLocalReferenceNext;     // Link to the next reference of the same term.

} LocalToken;

typedef LocalToken *PLocalToken;
|
|
|
|
// Descriptor reference tokens are processed in three phases. Tokens are
|
|
// initially created with iLocalDescriptorEntry set and iLocalReferenceNext
|
|
// zeroed.
|
|
//
|
|
// Later when we bind a local dictionary to the global dictionary, the
|
|
// iLocalReferenceNext field is used to link together every instance of
|
|
// each unique term in the local dictionary.
|
|
//
|
|
// Finally when we reach a specific memory limit, we flatten the linked lists
|
|
// for all local dictionaries to create a vector of reference indices for
|
|
// each unique term in the global dictionary. At this point we also map the
|
|
// LocalToken structure shown above into GlobalToken values (See below).
|
|
//
|
|
// A GlobalToken is a 16-bit value which refers uniquely to a particular
|
|
// global DESCRIPTOR. Since we can easily have more than 64K unique global
|
|
// terms, we provide an indirection mechanism which maps some 16-bit values
|
|
// into 32-bit values.
|
|
//
|
|
// Here's how it works. We divide GlobalToken values into two ranges.
|
|
// Values between 0..58,983 (MAX_GLOBAL_TOKENS - 1) are absolute indices into
// the global vector of unique DESCRIPTORs. Values between 58,984 and 65,535
// are mapped to 32-bit values via a local indirection vector of 32-bit indices.
|
|
|
|
typedef USHORT GlobalToken;
|
|
typedef GlobalToken *PGlobalToken;
|
|
|
|
// Sizing constants for the local dictionaries.

#define LOCAL_HASH_CLASSES      0x8000
#define LOCAL_HASH_MASK         0x7FFF      // LOCAL_HASH_CLASSES - 1
#define ENTRIES_PER_LOCAL_DICT  6552
#define MAX_REFS_PER_LDICT      0x10000

// Number of 16-bit token values left over for direct global indices once
// ENTRIES_PER_LOCAL_DICT values are set aside for local dictionary entries.
#define MAX_GLOBAL_TOKENS       (0x10000 - ENTRIES_PER_LOCAL_DICT)
|
|
|
|
// Note: The constant ENTRIES_PER_LOCAL_DICT is chosen to make the
|
|
// LocalDictionary structure exactly 64K bytes.
|
|
//
|
|
// MAX_GLOBAL_TOKENS is a constant which allows streams of token
|
|
// references to fit in 2-byte granules. The first MAX_GLOBAL_TOKENS
|
|
// unique tokens we encounter are considered global. References to
|
|
// those tokens are encoded in the value range [0..MAX_GLOBAL_TOKENS-1]
|
|
// while references to tokens outside that set are denoted by values
|
|
// in the range [MAX_GLOBAL_TOKENS .. 0xFFFF]. The latter values can
|
|
// be trivially mapped into indices into the local dictionary which
|
|
// corresponds to the token reference. One effect of this coding is
|
|
// that most local dictionaries will collapse to empty when we convert
|
|
// to the vector representation from the linked token representation.
|
|
|
|
typedef struct _LocalDictionary
|
|
{
|
|
PLocalToken pltFirst; // address of first token for this local dictionary
|
|
UINT clt; // count of local tokens which refer to this Local dict
|
|
PDESCRIPTOR *ppdNext; // next unused slot in apdLocal.
|
|
union
|
|
{
|
|
PDESCRIPTOR apdLocal[ENTRIES_PER_LOCAL_DICT]; // Refs to descriptors used locally
|
|
UINT aiGalactic[ENTRIES_PER_LOCAL_DICT]; // Galactic indices for local terms
|
|
};
|
|
USHORT aiTokenInstFirst[ENTRIES_PER_LOCAL_DICT]; // List heads for each local
|
|
// descriptor.
|
|
} LocalDictionary;
|
|
|
|
typedef LocalDictionary *PLocalDictionary;
|
|
|
|
// Indices into the m_avb array of virtual buffers.

#define IVB_TOKEN_STREAM         0
#define IVB_TOKEN_IMAGES         1
#define IVB_IMAGE_DESCRIPTORS    2
#define IVB_DISPLAY_IMAGES       3

#define COUNT_OF_VIRTUAL_BUFFERS 4

// Shorthand names for the individual m_avb elements. These expand to
// m_avb[...], so they are only usable where m_avb is in scope.

#define vbTokenStream      m_avb[IVB_TOKEN_STREAM     ]
#define vbTokenImages      m_avb[IVB_TOKEN_IMAGES     ]
#define vbImageDescriptors m_avb[IVB_IMAGE_DESCRIPTORS]
#define vbDisplayImages    m_avb[IVB_DISPLAY_IMAGES   ]
|
|
|
|
// Commit and Reservation constants for the virtual buffers
|
|
// in the TextDatabaseControl object. These reservations are
|
|
// based on an upper limit of 100,000,000 bytes scanned.
|
|
|
|
|
|
// Initial commit and reservation sizes, one pair per virtual buffer.
// The trailing comments on the commit values are carried over unchanged
// from the original header.

#define INIT_TOKEN_REF_COMMIT               0x00010000  // 0x00430000
#define INIT_TOKEN_REF_RESERVATION          0x08000000
#define INIT_TOKEN_IMAGE_COMMIT             0x00010000  // 0x000A0000
#define INIT_TOKEN_IMAGE_RESERVATION        0x03700000
#define INIT_IMAGE_DESCRIPTOR_COMMIT        0x00010000  // 0x00160000
#define INIT_IMAGE_DESCRIPTOR_RESERVATION   0x02A00000
#define INIT_DISPLAY_IMAGE_COMMIT           0x00010000
#define INIT_DISPLAY_IMAGE_RESERVATION      0x03700000

#define BUFFER_INCREMENT        0x2FFFF

#define CB_TEMP_BLOCKS          0x10000  // Approximate block size for unbuffered I/O
#define CB_TRANSACTION_LIMIT    0x40000  // Approximate limit for unbuffered I/O transactions.
|
|
|
|
const double MEMORY_FACTOR = 0.4;   // Fraction of total memory which we're
                                    // allowed to use.

// Width in bits of the <basis> field, and a mask covering its low-order
// CBITS_BASIS_MASK bits. The mask is built from a positive shift; the
// earlier form left-shifted ~0 (a negative int), which is undefined
// behavior before C++20. Value (0x1F) and type (int) are unchanged.
#define CBITS_BASIS_MASK  5
#define BASIS_MASK        ((1 << CBITS_BASIS_MASK) - 1)
|
|
|
|
typedef struct _ReferenceDescriptor
|
|
{
|
|
UINT iSerialGalactic;
|
|
UINT idwRefList;
|
|
UINT cdwRefs;
|
|
UINT iLastRef;
|
|
|
|
} ReferenceDescriptor;
|
|
|
|
typedef ReferenceDescriptor *PReferenceDescriptor;
|
|
|
|
typedef struct _RefClusterDescriptor
|
|
{
|
|
UINT iFilePosLow;
|
|
UINT iFilePosHigh;
|
|
UINT cdw;
|
|
UINT cTerms;
|
|
|
|
} RefClusterDescriptor;
|
|
|
|
typedef RefClusterDescriptor *PRefClusterDescriptor;
|
|
|
|
// Fixed capacities used by the indexing structures in this header.
enum
{
    MAX_LOCAL_DICTS   = 4096,
    MAX_REF_SETS      = 256,
    MAX_REF_CLUSTERS  = 512,
    CB_MERGE_BUFFER   = 262144,
    SPARE_FILE_BLOCKS = 6
};
|
|
|
|
// Note: alde and aiTokenRefFirst logically go together. They've been
|
|
// split apart to maintain DWord alignment for the alte items.
|
|
|
|
typedef struct _UnlinkedState
|
|
{
|
|
PDESCRIPTOR *appdLocalClasses [LOCAL_HASH_CLASSES];
|
|
PDESCRIPTOR *appdCollisionChains[ENTRIES_PER_LOCAL_DICT];
|
|
UINT cReferences [ENTRIES_PER_LOCAL_DICT];
|
|
// USHORT aiTokenInstLast [ENTRIES_PER_LOCAL_DICT]; // List tails for each local
|
|
// descriptor.
|
|
PLocalDictionary pld;
|
|
#ifdef _DEBUG
|
|
UINT cCollisions;
|
|
#endif // _DEBUG
|
|
PWCHAR pbBuffer;
|
|
PWCHAR pbCurrentLine;
|
|
int cbLineAdjustment;
|
|
|
|
// The following items are not used to construct local dictionaries.
|
|
// They are placed here so that they will be allocated only when
|
|
// the current text database is indexing text rather than processing
|
|
// queries.
|
|
|
|
RefClusterDescriptor m_rcd[MAX_REF_CLUSTERS];
|
|
|
|
PLocalDictionary m_apLocalDict [MAX_LOCAL_DICTS ]; // Need a different upper
|
|
#ifdef _DEBUG
|
|
UINT m_acLocalCollisions[MAX_LOCAL_DICTS ];
|
|
#endif // _DEBUG
|
|
UINT m_aiBaseToken [MAX_LOCAL_DICTS+1];
|
|
UINT m_aiBaseCByte [MAX_LOCAL_DICTS+1];
|
|
|
|
} UnlinkedState;
|
|
|
|
typedef struct _LOCAL_CONTEXT_1
|
|
{
|
|
CTextDatabase *ptdb;
|
|
DESCRIPTOR **ppde;
|
|
UINT iDescLimit;
|
|
UINT iLTBase;
|
|
UINT cAdded;
|
|
USHORT ild;
|
|
|
|
} LOCAL_CONTEXT_1;
|
|
|
|
typedef struct _LOCAL_CONTEXT_2
|
|
{
|
|
UINT iSerialNext;
|
|
PUINT paiSerial;
|
|
|
|
} LOCAL_CONTEXT_2;
|
|
|
|
typedef struct _CompressionState
|
|
{
|
|
// UINT iRef;
|
|
UINT cRefs;
|
|
// UINT cbitsBasis;
|
|
// union
|
|
// {
|
|
// UINT ibitNext;
|
|
// UINT cbits;
|
|
// };
|
|
|
|
} CompressionState;
|
|
|
|
typedef struct _LOCAL_CONTEXT_3
|
|
{
|
|
PUINT puiMap;
|
|
CompressionState *paCS;
|
|
UINT idBase;
|
|
UINT cdw;
|
|
UINT cNewRefLists;
|
|
|
|
} LOCAL_CONTEXT_3;
|
|
|
|
typedef struct _LOCAL_CONTEXT_4
|
|
{
|
|
PDESCRIPTOR *ppd;
|
|
PDESCRIPTOR pdBase;
|
|
|
|
} LOCAL_CONTEXT_4;
|
|
|
|
class CTextDatabase;
|
|
class CTokenList;
|
|
|
|
void MergeLocalEntries(UINT iValue, PVOID pvTag, PVOID pvEnvironment);
|
|
void AddLocalEntries (UINT iValue, PVOID pvTag, PVOID pvEnvironment);
|
|
|
|
class CTextDatabase : public CTextMatrix
|
|
{
|
|
friend class CTokenList;
|
|
friend class CTokenCollection;
|
|
friend class CHiliterTokenList;
|
|
|
|
friend void MergeLocalEntries(UINT iValue, PVOID pvTag, PVOID pvEnvironment);
|
|
friend void AddLocalEntries (UINT iValue, PVOID pvTag, PVOID pvEnvironment);
|
|
|
|
public:
|
|
|
|
// static CTextDatabase *NewTextDatabase();
|
|
|
|
virtual ~CTextDatabase();
|
|
virtual const BYTE *GetSourceName() {ASSERT(0);return NULL;} // Provide this function
|
|
|
|
DECLARE_REF_COUNTERS(CTextDatabase)
|
|
|
|
// Save/Load Interface --
|
|
|
|
void StoreImage(CPersist *pDiskImage);
|
|
|
|
int AppendText(PWCHAR pbText, int cbText, BOOL fArticleEnd, UINT iCharset= ANSI_CHARSET, UINT lcid= 0x409);
|
|
void SyncForQueries();
|
|
|
|
UINT CharacterCount ();
|
|
UINT TokenCount ();
|
|
UINT DescriptorCount();
|
|
UINT MaxTokenWidth ();
|
|
|
|
VOID GetTextMatrix(int iRowStart, int iColStart,
|
|
int cRows, int cCols, PWCHAR pbDest);
|
|
|
|
UINT TextLength(PDESCRIPTOR *ppdSorted, PUINT puiTokenMap, UINT iTokenStart, UINT iTokenLimit);
|
|
UINT CopyText (PDESCRIPTOR *ppdSorted, PUINT puiTokenMap, UINT iTokenStart, UINT iTokenLimit, PWCHAR pbBuffer, UINT cbBuffer);
|
|
|
|
void IndicateVocabularyRefs(CIndicatorSet *pisVocabulary, UINT iPartition, const UINT *piMap);
|
|
void IndicateVocabularyRefs(CIndicatorSet *pisVocabulary, CIndicatorSet *pisTokens, const UINT *piMap);
|
|
void IndicateArticleRefs (CIndicatorSet *pisArticles, UINT iDescriptor, const UINT *piMap);
|
|
void IndicateTokenRefs (CIndicatorSet *pisTokens , UINT iDescriptor);
|
|
|
|
CIndicatorSet *TopicInstancesFor (CTokenList *ptl);
|
|
CIndicatorSet *TokenInstancesFor (CTokenList *ptl);
|
|
UINT TokenInstanceCountFor(CTokenList *ptl);
|
|
CIndicatorSet *SymbolLocations();
|
|
|
|
CIndicatorSet *VocabularyFor(CIndicatorSet *pisArticles, BOOL fRemovePervasiveTerms= FALSE);
|
|
|
|
CIndicatorSet *ValidTokens(CTokenList *ptl);
|
|
|
|
inline BOOL FPhrases () { return m_fdwOptions & PHRASE_SEARCH; }
|
|
inline BOOL FPhraseFeedback() { return m_fdwOptions & PHRASE_FEEDBACK; }
|
|
inline BOOL FVectorSearch () { return m_fdwOptions & VECTOR_SEARCH; }
|
|
inline UINT IndexOptions () { return m_fdwOptions; }
|
|
|
|
CDictionary *PDict();
|
|
CCollection *PColl();
|
|
|
|
LCID SortingLCID();
|
|
|
|
protected:
|
|
|
|
#ifdef _DEBUG
|
|
|
|
CTextDatabase(PSZ pszTypeName= "TextDatabase");
|
|
|
|
#else // _DEBUG
|
|
|
|
CTextDatabase();
|
|
|
|
#endif // _DEBUG
|
|
|
|
void InitTextDatabase(BOOL fFromFile= FALSE);
|
|
|
|
void ConnectImage(CPersist *pDiskImage, BOOL fUnpackDisplayForm= TRUE);
|
|
|
|
inline int Data_cRows() { return 1; }
|
|
inline int Data_cCols() { return m_cbScanned; }
|
|
|
|
inline void Data_GetTextMatrix(int rowTop, int colLeft,
|
|
int rows, int cols, PWCHAR lpb, PUINT charsets
|
|
)
|
|
{
|
|
GetTextMatrix(rowTop, colLeft, rows, cols, lpb);
|
|
}
|
|
|
|
const UINT * TermRanks();
|
|
PUINT TokenBase();
|
|
|
|
UINT m_fdwOptions;
|
|
|
|
private:
|
|
|
|
#ifdef _DEBUG
|
|
|
|
BOOL m_fInitialized;
|
|
|
|
#endif // _DEBUG
|
|
|
|
UINT m_fFromFileImage;
|
|
UINT m_cbScanned;
|
|
UINT m_cTokensIndexed;
|
|
USHORT m_cLocalDicts;
|
|
USHORT m_iLocalDictBase;
|
|
|
|
MY_VIRTUAL_BUFFER m_avb[COUNT_OF_VIRTUAL_BUFFERS];
|
|
|
|
CSegHashTable *m_pshtGalactic;
|
|
CSegHashTable *m_pshtGlobal;
|
|
|
|
PUINT m_pwHash; // Working storage for the AppendSlave routine...
|
|
PBYTE m_pbType;
|
|
PWCHAR *m_paStart;
|
|
PWCHAR *m_paEnd;
|
|
|
|
CIndicatorSet *m_pisSymbols;
|
|
|
|
PLocalToken m_pltNext;
|
|
PUINT m_puiTokenNext;
|
|
|
|
PDESCRIPTOR m_pdNext, m_pdNextGlobal, m_pdNextGalactic, m_pdNextBound;
|
|
PWCHAR m_pbNext, m_pbNextGlobal, m_pbNextGalactic, m_pbLastGalactic;
|
|
PWCHAR m_pwDispNext, m_pwDispNextGlobal, m_pwDispNextGalactic, m_pwDispLastGalactic;
|
|
|
|
UINT m_iSerialNumberNext;
|
|
|
|
PUINT m_paiGlobalToRefList;
|
|
|
|
CUnbufferedIO *m_puioRefTemp;
|
|
CUnbufferedIO *m_puioCompressedRefs;
|
|
PRefListDescriptor m_prldTokenRefs;
|
|
UINT m_cdwCompressedRefs;
|
|
PUINT m_pdwCompressedRefs;
|
|
|
|
CUnbufferedIO *m_puioCompressedArticleRefs;
|
|
PRefListDescriptor m_prldArticleRefs;
|
|
UINT m_cdwArticleRefs;
|
|
PUINT m_pdwArticleRefs;
|
|
|
|
CUnbufferedIO *m_puioCompressedVocabularyRefs;
|
|
PRefListDescriptor m_prldVocabularyRefs;
|
|
UINT m_cdwVocabularyRefs;
|
|
PUINT m_pdwVocabularyRefs;
|
|
|
|
UINT m_cbBlockSize;
|
|
UINT m_cbTransactionLimit;
|
|
|
|
UINT m_iNextRefSet;
|
|
UINT m_ibNextFileBlockLow;
|
|
UINT m_ibNextFileBlockHigh;
|
|
|
|
PFileBlockLink m_pFirstFreeFileBlock;
|
|
PFileBlockLink m_papFileBlockLinks;
|
|
|
|
CIOList *m_piolLeft;
|
|
CIOList *m_piolRight;
|
|
CIOList *m_piolResult;
|
|
|
|
LCID m_lcidSorting;
|
|
PDESCRIPTOR *m_ppdSorted; // left-to-right sorting vector
|
|
PDESCRIPTOR *m_ppdTailSorted; // right-to-left sorting vector
|
|
UINT m_cdSorted; // number of sorted terms
|
|
UINT m_cwDisplayMax;
|
|
|
|
UINT m_cTermRanks;
|
|
PUINT m_pTermRanks;
|
|
|
|
CClassifier m_clsfTokens;
|
|
PUINT m_pafClassifications;
|
|
|
|
CDictionary *m_pDict;
|
|
CCollection *m_pColl;
|
|
|
|
// BugBug! The private members below are used only during index creation.
|
|
// Convert them to external allocations so we don't pay the price
|
|
// when we're loading an index.
|
|
|
|
UnlinkedState *m_pulstate;
|
|
|
|
virtual UINT GetPartitionInfo(const UINT **ppaiPartitions, const UINT **ppaiRanks= NULL, const UINT **ppaiMap= NULL) = 0;
|
|
virtual UINT ArticleCount() = 0;
|
|
|
|
PDESCRIPTOR DescriptorBase ();
|
|
PWCHAR ImageBase ();
|
|
PWCHAR DisplayBase ();
|
|
|
|
int AppendSlave(PWCHAR pbText, int cbText, BOOL fArticleEnd, UINT iCharset, UINT lcid);
|
|
|
|
int ExceptionFilter(IN DWORD ExceptionCode, IN PEXCEPTION_POINTERS ExceptionInfo);
|
|
|
|
USHORT SearchLocalTable(PWCHAR pbToken, UINT cbToken, UINT hv, BYTE bType, UINT iCharset, UINT lcid);
|
|
|
|
CAValRef *DescriptorList(PDESCRIPTOR pd, UINT cd);
|
|
|
|
void ExtendClassifications(PDESCRIPTOR pdSuffix);
|
|
|
|
void IndicateMappedRefs(PRefListDescriptor prld, PUINT pdwRefBase, CIndicatorSet *pisArticles, const UINT *piMap);
|
|
|
|
int IndicateRefs(PRefListDescriptor prld, PUINT pdwRefLists, CIndicatorSet *pis, BOOL fCountOnly, PUINT paiCountArray= NULL);
|
|
|
|
void WriteLargeBuff(PVOID pvBuffer, UINT iPosLow, UINT iPosHigh, UINT cbBuffer);
|
|
|
|
PLocalDictionary AllocateLocalDictionary();
|
|
PLocalDictionary MoveToNextLocalDict (PWCHAR pbScanLimit);
|
|
|
|
PDESCRIPTOR *FindTokens(CTokenList *ptl, PUINT pcd= NULL);
|
|
|
|
void BindToGlobalDict(PWCHAR pbScanLimit);
|
|
|
|
void FlattenAndMergeLinks ();
|
|
void GalacticMerge ();
|
|
void CoalesceReferenceLists();
|
|
|
|
void MergeRefLists(PRefStream prsResult, PRefStream pars, UINT cRefStreams);
|
|
void ConstructVocabularyLists();
|
|
void CompressVocabularyLists(CIOList *piolSource, UINT cdw);
|
|
void CompressArticleRefLists(CIOList *piolSource, UINT cdw);
|
|
void CompressRefLists (CIOList *piorSource, UINT cdw);
|
|
void CopyRefStreamSegment(CIOList *piolSource, CIOList *piolDestination, UINT cdw);
|
|
};
|
|
|
|
inline PDESCRIPTOR CTextDatabase::DescriptorBase() { return (PDESCRIPTOR) (vbImageDescriptors.Base); }
|
|
inline PWCHAR CTextDatabase::ImageBase () { return (PWCHAR ) (vbTokenImages .Base); }
|
|
inline PWCHAR CTextDatabase::DisplayBase () { return (PWCHAR ) (vbDisplayImages .Base); }
|
|
|
|
// Simple accessors: each returns the member named in its body.

inline UINT CTextDatabase::CharacterCount()
{
    return m_cbScanned;
}

inline UINT CTextDatabase::DescriptorCount()
{
    return m_iSerialNumberNext;
}

inline UINT CTextDatabase::MaxTokenWidth()
{
    return m_cwDisplayMax;
}
|
|
|
|
// Returns the Base address of the token stream buffer, viewed as a PUINT.
inline PUINT CTextDatabase::TokenBase()
{
    return (PUINT) (vbTokenStream.Base);
}
|
|
|
|
// Returns the number of LocalToken entries between the start of the token
// stream and the current end of the tokens.
//
// When indexing state exists and has a current local dictionary, the end is
// that dictionary's first token plus its token count; otherwise the end is
// m_pltNext.
inline UINT CTextDatabase::TokenCount()
{
    PLocalToken pltBase= (PLocalToken) TokenBase();

    PLocalDictionary pld= m_pulstate? m_pulstate->pld : NULL;

    if (pld)
        return (UINT) ((pld->pltFirst + pld->clt) - pltBase);

    return (UINT) (m_pltNext - pltBase);
}
|
|
|
|
// Returns the m_pisSymbols pointer as stored.
inline CIndicatorSet *CTextDatabase::SymbolLocations()
{
    return m_pisSymbols;
}
|
|
|
|
// Passes descriptor iDescriptor's entry in m_prldTokenRefs, together with
// the compressed token reference data, to IndicateRefs() with pisTokens as
// the target set and fCountOnly == FALSE.
inline void CTextDatabase::IndicateTokenRefs(CIndicatorSet *pisTokens, UINT iDescriptor)
{
    PRefListDescriptor prld= m_prldTokenRefs + iDescriptor;

    IndicateRefs(prld, m_pdwCompressedRefs, pisTokens, FALSE);
}
|
|
|
|
// Passes descriptor iDescriptor's entry in m_prldArticleRefs, together with
// the article reference data and piMap, to IndicateMappedRefs() with
// pisArticles as the target set.
inline void CTextDatabase::IndicateArticleRefs(CIndicatorSet *pisArticles, UINT iDescriptor, const UINT *piMap)
{
    PRefListDescriptor prld= m_prldArticleRefs + iDescriptor;

    IndicateMappedRefs(prld, m_pdwArticleRefs, pisArticles, piMap);
}
|
|
|
|
inline CDictionary *CTextDatabase::PDict() {ASSERT(FVectorSearch()); return m_pDict;}
|
|
inline CCollection *CTextDatabase::PColl() {ASSERT(FVectorSearch()); return m_pColl;}
|
|
|
|
// Returns the m_lcidSorting value as stored.
inline LCID CTextDatabase::SortingLCID()
{
    return m_lcidSorting;
}
|
|
|
|
#endif // __TXDBASE_H__
|