windows-nt-4.0/private/windows/win4help/ftsrch/txdbase.h


								// TxDBase.h -- CTextDatabase class definition


								#ifndef __TXDBASE_H__


								#define __TXDBASE_H__


								#include "SegHash.h"

								#include "VMBuffer.h"

								#include "Indicate.h"

								#include "Classify.h"

								#include  "Defines.h"

								#include  "TextMat.h"

								#include "UnbuffIO.h"

								#include   "IOList.h"

								#include "IOStream.h"

								#include "FTSIFace.h"

								#include "Compress.h"

								#include     "Util.h"

								#include  "Sorting.h"

								#include     "dict.h"

								#include   "vector.h"


								// !!! BugBug !!! Parts of the long comment below are now incorrect and need revision.


								//     Term Tags Strategies

								//

								// The term tag data structures go with term entries in the segmented

								// hash tables. We use two hash table - the Global table and the Galactic

								// table. The term tags for the two tables are designed for the actions we

								// take with the hash table.

								//

								// Our index work proceeds in four phases --

								//

								// 1. Constructing Local Dictionaries

								//    During this phase we build a local term dictionary with very restricted

								//    context. The dictionary strictly covers a range of up to 65536 tokens.

								//    The local dictionary is an unsegmented hash table biased for speed and very

								//    low collision rates.

								//

								// 2. Linking Local Dictionaries with the Global Dictionary

								//    When a local dictionary reaches a capacity limit or when we must force

								//    our text database to a searchable state, we link local dictionary entries

								//    with corresponding global hash table entries. This has two effects --

								//

								//    A. We merge the local terms with the global terms, adding new unique terms

								//       to the global list.

								//

								//    B. We now have a global linked list which traverses all the references for

								//       each term. This was the original searchable format for the database. It

								//       works well so long as the database fits entirely within RAM and degrades

								//       when our working set significantly exceed RAM space. The current code

								//       doesn't construct these global links, but relies instead on the flattening

								//       phase below.

								//

								// 3. Flattening Linked Lists

								//    When the collection of reference links in the local dictionaries reaches

								//    a memory size threshold, we traverse the linked lists and construct a

								//    collection of flattened vectors of reference indices. At the same time we

								//    compress the streams of reference indices. The compression algorithm relies

								//    on three fields maintained for each term in the term tag --

								//

								//    A. iNewRefFirst -- the index of the first instance in the linked stream

								//    B. iNewRefLast  -- the index of the last  instance in the linked stream

								//    C. cRefsNew     -- the number of instances in the linked stream

								//

								//    To merge new flattened vectors with previously accumulated vectors we

								//    maintain four additional term tag fields --

								//

								//    B. iRefListBase -- index to the stream of flattened references for this term.

								//    C. cdwRefs      -- size of the flattened reference lists in DWords.

								//    D. iRefSequence -- ranking order of this term relative to the galactic table

								//                       A -1 value indicates no ranking.

								//

								//    Note that we simply catenate the compressed reference lists from sequential

								//    flattening passes. Thus the stream denoted by iRefListBase has the format --

								//

								//        {cdw, cRefsStream, iRefFirst, <<basis>, <compressed ref>...>} ...

								//

								//    where --

								//

								//      cdw              is the size of the reference list segment in DWords

								//      cRefsStream      is the number of references in the list.

								//      iRefFirst        is the first reference in the list

								//      <basis>          is a five-bit value which drives the compression

								//                       algorithm.

								//      <compressed ref> values are variable length bit strings which

								//                       represent the delta between successive reference

								//                       indices.

								//

								//    Note that cdw could be derrived from cRefsStream and <basis>. We include

								//    it in the reference stream to allow a fast traversal of the stream when

								//    we're looking for a particular indexing range.

								//

								//    The iRefSequence field is a ordering value maintained in the galactic hash

								//    table for each unique term. It's used to speed up the process of merging new

								//    reference vector segments with previously accumulated segments. The strategy

								//    is to keep the reference segments in an incrementally committed memory address

								//    range and to insert segments by copying reference streams upward in memory,

								//    inserting the new segments as we go. The iRefSequence field gives us the memory

								//    order for the reference streams.

								//

								//    Note that for terms with less than four references we keep the reference

								//    information entirely in the term tag. That situation is denoted by negative

								//    values in cRefsTotal, iRefListBase, and cdwRefs. Zero values are used to

								//    mark the one and two cases. The actual index values are the logical

								//    negation (~) of those fields in the order mentioned above. When a new vector

								//    segment would push the global total beyond three, we merge the old vector

								//    with the new one and create an external list.

								//

								//    Note that the iRefSequence field may be undefined  (-1) or defined >= 0 for

								//    a term with less than four references. This is because the galactic table

								//    may have other references that push us over the limit.

								//

								//    Why do we bother with this complicated scheme? Its value lies in reducing

								//    the number of items in the reference stream. During the merge work this

								//    reduces the number of items that must be slid upward in memory and it

								//    reduces the number of iRefListBase fields that must be adjusted during

								//    the merge operation.Approximately 45% of all unique terms are used only

								//    once. By keeping small lists in the tag, we reduce the number of external

								//    lists by 75%.

								//

								// 4. Galactic Merges

								//    When the global table reaches a memory size threshold, we merge its reference

								//    information with the galactic hash table and restart our indexing work with

								//    an empty global table. The issue here is keeping the global table small enough

								//    so that it fits completely within RAM during phase 2 work.

								//

								//    The galactic term tags contain only the accumulation fields --

								//

								//    B. iRefListBase -- index to the stream of flattened references for this term.

								//    C. cdwRefs      -- size of the flattened reference lists in DWords.

								//    D. iRefSequence -- ranking order of this term relative to the galactic table.

								//                       A -1 value indicates no ranking.


								typedef struct _TermTagGlobal

								        {

								            UINT  iGlobalDesc;   // Global   sequence # for term.

								            UINT  iGalacticDesc; // Galactic sequence # for term.


								        //    UINT  iNewRefFirst;  // First linked global ref.

								        //    UINT  iNewRefLast;   // Last  linked global ref.

								            UINT  cRefsNew;      // # of linked global refs.

								            UINT  cRefsGlobal;


								        } TermTagGlobal;


								typedef TermTagGlobal *PTermTagGlobal;


								typedef struct _TermTagGalactic

								        {

								            UINT  iGalacticDesc; // Galactic sequence # for term.


								        } TermTagGalactic;


								typedef TermTagGalactic *PTermTagGalactic;


								typedef struct _DESCRIPTOR

								        {

											PWCHAR pwDisplay;  // pbImage is Sort Key, pwDisplay is Display Image.


								            union

								            {

								                PWCHAR pbImage;    // Length given by delta with following pd->pbImage.

								                UINT   iGalactic;

								            };


											union

								            {

								    			UINT cReferences;   // Used while building a CTextDatabase

								                UINT iTokenInfo;    // Used in CTokenCollection

								                UINT iTextSet;      // Used in CTitleCollection

								            };


											WORD  cwDisplay;

											BYTE  bCharset;

								            BYTE  fImageFlags;


								        } DESCRIPTOR;


								typedef DESCRIPTOR *PDESCRIPTOR;


								inline UINT CbImage(PDESCRIPTOR pd)

								{

								#ifdef MESSAGEBOXES


									if (256 < ((pd+1)->pbImage - pd->pbImage))

									{

									 	char ac[256], acToken[101];


										wsprintf(ac, "Token length: %d", ((pd+1)->pbImage - pd->pbImage));


									 	::MessageBox(NULL, ac, "Very Large Token!", MB_OK);


										CopyMemory(acToken, pd->pbImage, 50);


										acToken[50]= 0;


										wsprintf(ac, "Token Image: \"%s...\"", acToken);


										::MessageBox(NULL, ac, "Part of the token image!", MB_OK);

									}


								#else // MESSAGEBOXES


									ASSERT(1024 > ((pd+1)->pbImage - pd->pbImage));


								#endif // MESSAGEBOXES


								    return (pd+1)->pbImage - pd->pbImage;

								}


								inline UINT CwDisplay(PDESCRIPTOR pd)

								{

									ASSERT(1024 > ((pd+1)->pwDisplay - pd->pwDisplay));


								    return (pd+1)->pwDisplay - pd->pwDisplay;

								}


								// Flag definitions for DESCRIPTOR.fImageFlags:


								// #define LETTER_CHAR       0x0001

								// #define CONTAINS_A_TAB    0x0002

								// #define TOKEN_FLAGS_MASK  0x0003

								// #define REF_TYPE_MASK     0x000C

								// #define BASIS_MASK        0xF800

								// #define REFS_LINKED       0x0010


								// #define BASIS_SHIFT       11


								// Reference types for REF_TYPE_MASK:


								// #define SingleRef

								// #define PairRef

								// #define TripleRef


								UINT CBitsToRepresent(UINT ui);

								UINT FormatAToken(PDESCRIPTOR pd, int cbOffset, int iColStart, int iColLimit, PWCHAR pbLine);


								void SortTokenImages(PDESCRIPTOR pdBase, PDESCRIPTOR **pppdSorted, PDESCRIPTOR **pppdTailSorted,

								                     PUINT pcdSorted, UINT cd

								                    );


								// #define  BUILD_LOCAL_HASH(hv,c)  hv= ((hv << 5) | (hv >> 27)) - c

								// #define BUILD_GLOBAL_HASH(hv,c)  hv= ((hv >> 5) | (hv << 27)) - c


								typedef struct _LocalToken

								        {

								            unsigned short iLocalDescriptorEntry;

								            unsigned short iLocalReferenceNext;

								        } LocalToken;


								typedef LocalToken *PLocalToken;


								// Descriptor reference tokens are processed in three phases. Tokens are

								// initially created with iLocalDescriptorEntry set and iLocalDescriptorNext

								// zeroed.

								//

								// Later when we bind a local dictionary to the global dictionary, the

								// iLocalDecriptorNext field is used to link together every instance of

								// each unique term in the local dictionary.

								//

								// Finally when we reach a specific memory limit, we flatten the linked lists

								// for all local dictionaries to create a vector of reference indices for

								// each unique term in the global dictionary. At this point we also map the

								// LocalToken structure shown above into GlobalToken values (See below).

								//

								// A GlobalToken is a 16-bit value which refers uniquely to a particular

								// global DESCRIPTOR. Since we can easily have more than 64K unique global

								// terms, we provide an indirection mechanism which maps some 16-bit values

								// into 32-bit values.

								//

								// Here's how it works. We divide GlobalToken values into two ranges.

								// Values between 0..59,983 are absolute indices into the global vector of

								// unique DESCRIPTORs. Values between 59,984 and 65,535 are mapped to 32-bit

								// via a local indrection vector of 32-bit indices.


								typedef USHORT       GlobalToken;

								typedef GlobalToken *PGlobalToken;


								#define LOCAL_HASH_CLASSES     0x8000

								#define LOCAL_HASH_MASK        0x7FFF

								#define ENTRIES_PER_LOCAL_DICT 6552

								#define MAX_REFS_PER_LDICT     0x10000


								#define MAX_GLOBAL_TOKENS      (0x10000 - ENTRIES_PER_LOCAL_DICT)


								// Note: The constant ENTRIES_PER_LOCAL_DICT is chosen to make the

								//       LocalDictionary structure exactly 64K bytes.

								//

								//       MAX_GLOBAL_TOKENS is a constant which allows streams of token

								//       references to fit in 2-byte granules. The first MAX_GLOBAL_TOKENS

								//       unique tokens we encounter are considered global. References to

								//       those tokens are encode in the value range [0..MAX_GLOBAL_TOKENS-1]

								//       while references to tokens outside that set are denoted by values

								//       in the range [MAX_GLOBAL_TOKENS .. 0xFFFF]. The latter values can

								//       be trivially mapped into indices into the local dictionary which

								//       corresponds to the token reference. One effect of this coding is

								//       that most local dictionaries will collapse to empty when we convert

								//       to the vector representation from the linked token representation.


								typedef struct _LocalDictionary

								        {

								            PLocalToken  pltFirst;    // address of first token for this local dictionary

								            UINT         clt;         // count of local tokens which refer to this Local dict

								            PDESCRIPTOR *ppdNext;     // next unused slot in apdLocal.

								            union

								            {

								                PDESCRIPTOR apdLocal[ENTRIES_PER_LOCAL_DICT]; // Refs to descriptors used locally

								                UINT      aiGalactic[ENTRIES_PER_LOCAL_DICT]; // Galactic indices for local terms

								            };

								            USHORT       aiTokenInstFirst[ENTRIES_PER_LOCAL_DICT]; // List heads for each local

								                                                                   // descriptor.

								        } LocalDictionary;


								typedef LocalDictionary *PLocalDictionary;


								#define IVB_TOKEN_STREAM            0

								#define IVB_TOKEN_IMAGES            1

								#define IVB_IMAGE_DESCRIPTORS       2

								#define IVB_DISPLAY_IMAGES          3


								#define COUNT_OF_VIRTUAL_BUFFERS    4


								#define vbTokenStream           m_avb[IVB_TOKEN_STREAM     ]

								#define vbTokenImages           m_avb[IVB_TOKEN_IMAGES     ]

								#define vbImageDescriptors      m_avb[IVB_IMAGE_DESCRIPTORS]

								#define vbDisplayImages         m_avb[IVB_DISPLAY_IMAGES]


								// Commit and Reservation constants for the virtual buffers

								// in the TextDatabaseControl object. These reservations are

								// based on an upper limit of 100,000,000 bytes scanned.


								#define INIT_TOKEN_REF_COMMIT              0x00010000 // 0x00430000

								#define INIT_TOKEN_REF_RESERVATION         0x08000000

								#define INIT_TOKEN_IMAGE_COMMIT            0x00010000 // 0x000A0000

								#define INIT_TOKEN_IMAGE_RESERVATION       0x03700000

								#define INIT_IMAGE_DESCRIPTOR_COMMIT       0x00010000 // 0x00160000

								#define INIT_IMAGE_DESCRIPTOR_RESERVATION  0x02A00000

								#define INIT_DISPLAY_IMAGE_COMMIT          0x00010000

								#define INIT_DISPLAY_IMAGE_RESERVATION     0x03700000


								#define BUFFER_INCREMENT    0x2FFFF


								#define CB_TEMP_BLOCKS          0x10000 // Approximate block size for unbuffered I/O

								#define CB_TRANSACTION_LIMIT    0x40000 // Approximate limit for unbuffered I/O transactions.


								const double MEMORY_FACTOR = 0.4; // Fraction of total memory which we're

								                                  // allowed to use.

								#define CBITS_BASIS_MASK    5

								#define BASIS_MASK          (~((~0) << CBITS_BASIS_MASK))


								typedef struct _ReferenceDescriptor

								        {

								            UINT iSerialGalactic;

								            UINT idwRefList;

								            UINT cdwRefs;

								            UINT iLastRef;


								        } ReferenceDescriptor;


								typedef ReferenceDescriptor *PReferenceDescriptor;


								typedef struct _RefClusterDescriptor

								        {

								            UINT    iFilePosLow;

								            UINT    iFilePosHigh;

								            UINT    cdw;

								            UINT    cTerms;


								        } RefClusterDescriptor;


								typedef RefClusterDescriptor *PRefClusterDescriptor;


								enum {

								       MAX_LOCAL_DICTS   = 4096,

								       MAX_REF_SETS      = 256,

								       MAX_REF_CLUSTERS  = 512,

								       CB_MERGE_BUFFER   = 262144,

								       SPARE_FILE_BLOCKS = 6

								     };


								// Note: alde and aiTokenRefFirst logically go together. They've been

								//       split apart to maintain DWord alignment for the alte items.


								typedef struct _UnlinkedState

								        {

								            PDESCRIPTOR     *appdLocalClasses   [LOCAL_HASH_CLASSES];

								            PDESCRIPTOR     *appdCollisionChains[ENTRIES_PER_LOCAL_DICT];

								            UINT             cReferences        [ENTRIES_PER_LOCAL_DICT];

								        //    USHORT           aiTokenInstLast    [ENTRIES_PER_LOCAL_DICT];  // List tails for each local

								                                                                           // descriptor.

								            PLocalDictionary pld;

								#ifdef _DEBUG

								            UINT             cCollisions;

								#endif // _DEBUG

								            PWCHAR           pbBuffer;

								            PWCHAR           pbCurrentLine;

								            int              cbLineAdjustment;


									        // The following items are not used to construct local dictionaries.

											// They are placed here so that they will be allocated only when

											// the current text database is indexing text rather than processing

											// queries.


									        RefClusterDescriptor m_rcd[MAX_REF_CLUSTERS];


									        PLocalDictionary m_apLocalDict      [MAX_LOCAL_DICTS  ];     // Need a different upper

								#ifdef _DEBUG

									        UINT             m_acLocalCollisions[MAX_LOCAL_DICTS  ];

								#endif // _DEBUG

									        UINT             m_aiBaseToken      [MAX_LOCAL_DICTS+1];

									        UINT             m_aiBaseCByte      [MAX_LOCAL_DICTS+1];


								        } UnlinkedState;


								typedef struct _LOCAL_CONTEXT_1

								        {

								            CTextDatabase    *ptdb;

								            DESCRIPTOR      **ppde;

								            UINT              iDescLimit;

								            UINT              iLTBase;

								            UINT              cAdded;

								            USHORT            ild;


								        } LOCAL_CONTEXT_1;


								typedef struct _LOCAL_CONTEXT_2

								        {

								            UINT  iSerialNext;

								            PUINT paiSerial;


								        } LOCAL_CONTEXT_2;


								typedef struct _CompressionState

								        {

								        //    UINT iRef;

								            UINT cRefs;

								        //    UINT cbitsBasis;

								        //    union

								        //    {

								        //        UINT ibitNext;

								        //        UINT cbits;

								        //    };


								        } CompressionState;


								typedef struct _LOCAL_CONTEXT_3

								        {

								            PUINT             puiMap;

								            CompressionState *paCS;

								            UINT              idBase;

								            UINT              cdw;

								            UINT              cNewRefLists;


								        } LOCAL_CONTEXT_3;


								typedef struct _LOCAL_CONTEXT_4

								        {

								            PDESCRIPTOR *ppd;

								            PDESCRIPTOR  pdBase;


								        } LOCAL_CONTEXT_4;


								class CTextDatabase;

								class CTokenList;


								void MergeLocalEntries(UINT iValue, PVOID pvTag, PVOID pvEnvironment);

								void AddLocalEntries  (UINT iValue, PVOID pvTag, PVOID pvEnvironment);


								class CTextDatabase : public CTextMatrix

								{

								    friend class CTokenList;

								    friend class CTokenCollection;

								    friend class CHiliterTokenList;


								    friend void MergeLocalEntries(UINT iValue, PVOID pvTag, PVOID pvEnvironment);

								    friend void AddLocalEntries  (UINT iValue, PVOID pvTag, PVOID pvEnvironment);


								    public:


								    //    static CTextDatabase *NewTextDatabase();


								        virtual ~CTextDatabase();

								        virtual const BYTE *GetSourceName() {ASSERT(0);return NULL;} // Provide this function


								        DECLARE_REF_COUNTERS(CTextDatabase)


								// Save/Load Interface --


								        void StoreImage(CPersist *pDiskImage);


								        int  AppendText(PWCHAR pbText, int  cbText, BOOL fArticleEnd, UINT iCharset= ANSI_CHARSET, UINT lcid= 0x409);

								        void SyncForQueries();


								        UINT CharacterCount ();

								        UINT TokenCount     ();

										UINT DescriptorCount();

								        UINT MaxTokenWidth  ();


								        VOID GetTextMatrix(int iRowStart, int iColStart,

								                           int cRows,     int cCols,     PWCHAR pbDest);


								        UINT TextLength(PDESCRIPTOR *ppdSorted, PUINT puiTokenMap, UINT iTokenStart, UINT iTokenLimit);

								        UINT CopyText  (PDESCRIPTOR *ppdSorted, PUINT puiTokenMap, UINT iTokenStart, UINT iTokenLimit, PWCHAR pbBuffer, UINT cbBuffer);


								        void IndicateVocabularyRefs(CIndicatorSet *pisVocabulary, UINT iPartition,          const UINT *piMap);

								        void IndicateVocabularyRefs(CIndicatorSet *pisVocabulary, CIndicatorSet *pisTokens, const UINT *piMap);

								        void IndicateArticleRefs   (CIndicatorSet *pisArticles,   UINT iDescriptor,  const UINT *piMap);

								        void IndicateTokenRefs     (CIndicatorSet *pisTokens  ,   UINT iDescriptor);


								        CIndicatorSet *TopicInstancesFor    (CTokenList *ptl);

								        CIndicatorSet *TokenInstancesFor    (CTokenList *ptl);

								        UINT           TokenInstanceCountFor(CTokenList *ptl);

								        CIndicatorSet *SymbolLocations();


								        CIndicatorSet *VocabularyFor(CIndicatorSet *pisArticles, BOOL fRemovePervasiveTerms= FALSE);


										CIndicatorSet *ValidTokens(CTokenList *ptl);


								        inline BOOL FPhrases       () { return m_fdwOptions & PHRASE_SEARCH;   }

								        inline BOOL FPhraseFeedback() { return m_fdwOptions & PHRASE_FEEDBACK; }

								        inline BOOL FVectorSearch  () { return m_fdwOptions & VECTOR_SEARCH;   }

								        inline UINT IndexOptions   () { return m_fdwOptions;                   }


										CDictionary	*PDict();

										CCollection	*PColl();


								        LCID SortingLCID();


								    protected:


								#ifdef _DEBUG


								             CTextDatabase(PSZ pszTypeName= "TextDatabase");


								#else // _DEBUG


								             CTextDatabase();


								#endif // _DEBUG


								        void InitTextDatabase(BOOL fFromFile= FALSE);


								        void ConnectImage(CPersist *pDiskImage, BOOL fUnpackDisplayForm= TRUE);


								        inline int Data_cRows() { return 1; }

								        inline int Data_cCols() { return m_cbScanned; }


								        inline void Data_GetTextMatrix(int  rowTop, int  colLeft,

								                                       int  rows, int  cols, PWCHAR lpb, PUINT charsets

								                                      )

								        {

								             GetTextMatrix(rowTop, colLeft, rows, cols, lpb);

								        }


								        const UINT * TermRanks();

								        PUINT TokenBase();


								        UINT m_fdwOptions;


								    private:


								#ifdef _DEBUG


								        BOOL   m_fInitialized;


								#endif // _DEBUG


								        UINT   m_fFromFileImage;

								        UINT   m_cbScanned;

								        UINT   m_cTokensIndexed;

								        USHORT m_cLocalDicts;

								        USHORT m_iLocalDictBase;


										MY_VIRTUAL_BUFFER m_avb[COUNT_OF_VIRTUAL_BUFFERS];


								        CSegHashTable *m_pshtGalactic;

								        CSegHashTable *m_pshtGlobal;


								        PUINT   m_pwHash;  // Working storage for the AppendSlave routine...

								        PBYTE   m_pbType;

								        PWCHAR *m_paStart;

								        PWCHAR *m_paEnd;


								        CIndicatorSet *m_pisSymbols;


								        PLocalToken m_pltNext;

								        PUINT       m_puiTokenNext;


								        PDESCRIPTOR m_pdNext, m_pdNextGlobal, m_pdNextGalactic, m_pdNextBound;

								        PWCHAR      m_pbNext, m_pbNextGlobal, m_pbNextGalactic, m_pbLastGalactic;

								        PWCHAR      m_pwDispNext, m_pwDispNextGlobal, m_pwDispNextGalactic, m_pwDispLastGalactic;


								        UINT        m_iSerialNumberNext;


								        PUINT       m_paiGlobalToRefList;


								        CUnbufferedIO *m_puioRefTemp;

								        CUnbufferedIO *m_puioCompressedRefs;

								        PRefListDescriptor m_prldTokenRefs;

								        UINT           m_cdwCompressedRefs;

								        PUINT          m_pdwCompressedRefs;


								        CUnbufferedIO *m_puioCompressedArticleRefs;

								        PRefListDescriptor m_prldArticleRefs;

								        UINT           m_cdwArticleRefs;

								        PUINT          m_pdwArticleRefs;


								        CUnbufferedIO *m_puioCompressedVocabularyRefs;

								        PRefListDescriptor m_prldVocabularyRefs;

								        UINT           m_cdwVocabularyRefs;

								        PUINT          m_pdwVocabularyRefs;


								        UINT           m_cbBlockSize;

								        UINT           m_cbTransactionLimit;


								        UINT                 m_iNextRefSet;

								        UINT                 m_ibNextFileBlockLow;

								        UINT                 m_ibNextFileBlockHigh;


								        PFileBlockLink m_pFirstFreeFileBlock;

								        PFileBlockLink m_papFileBlockLinks;


								        CIOList *m_piolLeft;

								        CIOList *m_piolRight;

								        CIOList *m_piolResult;


								        LCID         m_lcidSorting;

								        PDESCRIPTOR *m_ppdSorted;       // left-to-right sorting vector

								        PDESCRIPTOR *m_ppdTailSorted;   // right-to-left sorting vector

								        UINT         m_cdSorted;        // number of sorted terms

								        UINT         m_cwDisplayMax;


								        UINT         m_cTermRanks;

								        PUINT        m_pTermRanks;


								        CClassifier m_clsfTokens;

								        PUINT       m_pafClassifications;


										CDictionary		*m_pDict;

										CCollection		*m_pColl;


								// BugBug! The private members below are used only during index creation.

								//         Convert them to external allocations so we don't pay the price

								//         when we're loading an index.


								        UnlinkedState *m_pulstate;


								        virtual UINT GetPartitionInfo(const UINT **ppaiPartitions, const UINT **ppaiRanks= NULL, const UINT **ppaiMap= NULL) = 0;

								        virtual UINT ArticleCount() = 0;


								        PDESCRIPTOR DescriptorBase   ();

								        PWCHAR      ImageBase        ();

								        PWCHAR      DisplayBase      ();


								        int AppendSlave(PWCHAR pbText, int  cbText, BOOL fArticleEnd, UINT iCharset, UINT lcid);


								        int ExceptionFilter(IN DWORD ExceptionCode, IN PEXCEPTION_POINTERS ExceptionInfo);


								        USHORT SearchLocalTable(PWCHAR pbToken, UINT cbToken, UINT  hv, BYTE bType, UINT iCharset, UINT lcid);


								        CAValRef *DescriptorList(PDESCRIPTOR pd, UINT cd);


								        void ExtendClassifications(PDESCRIPTOR pdSuffix);


								        void IndicateMappedRefs(PRefListDescriptor prld, PUINT pdwRefBase, CIndicatorSet *pisArticles, const UINT *piMap);


								        int IndicateRefs(PRefListDescriptor prld, PUINT pdwRefLists, CIndicatorSet *pis, BOOL fCountOnly, PUINT paiCountArray= NULL);


								        void WriteLargeBuff(PVOID pvBuffer, UINT iPosLow, UINT iPosHigh, UINT cbBuffer);


								        PLocalDictionary AllocateLocalDictionary();

								        PLocalDictionary MoveToNextLocalDict    (PWCHAR pbScanLimit);


								        PDESCRIPTOR *FindTokens(CTokenList *ptl, PUINT pcd= NULL);


								        void BindToGlobalDict(PWCHAR pbScanLimit);


								        void FlattenAndMergeLinks  ();

								        void GalacticMerge         ();

								        void CoalesceReferenceLists();


								        void MergeRefLists(PRefStream prsResult, PRefStream pars, UINT cRefStreams);

								        void ConstructVocabularyLists();

								        void CompressVocabularyLists(CIOList *piolSource, UINT cdw);

								        void CompressArticleRefLists(CIOList *piolSource, UINT cdw);

								        void CompressRefLists       (CIOList *piorSource, UINT cdw);

								        void CopyRefStreamSegment(CIOList *piolSource, CIOList *piolDestination, UINT cdw);

								 };


								inline PDESCRIPTOR CTextDatabase::DescriptorBase() { return (PDESCRIPTOR) (vbImageDescriptors.Base); }

								inline PWCHAR      CTextDatabase::ImageBase     () { return (PWCHAR     ) (vbTokenImages     .Base); }

								inline PWCHAR      CTextDatabase::DisplayBase   () { return (PWCHAR     ) (vbDisplayImages   .Base); }


								inline UINT CTextDatabase::CharacterCount () { return m_cbScanned;         }

								inline UINT CTextDatabase::DescriptorCount() { return m_iSerialNumberNext; }

								inline UINT CTextDatabase::MaxTokenWidth  () { return m_cwDisplayMax;      }


								inline PUINT CTextDatabase::TokenBase()

								{

								    return (PUINT) (vbTokenStream.Base);

								}


								inline UINT  CTextDatabase::TokenCount()

								{

								    PLocalDictionary pld;


									if (m_pulstate && (pld= m_pulstate->pld))

										 return (pld->pltFirst + pld->clt) - (PLocalToken) TokenBase();

								    else return m_pltNext - (PLocalToken) TokenBase();

								}


								inline CIndicatorSet *CTextDatabase::SymbolLocations() { return m_pisSymbols;    }


								inline void CTextDatabase::IndicateTokenRefs(CIndicatorSet *pisTokens, UINT iDescriptor)

								{

								    IndicateRefs(m_prldTokenRefs + iDescriptor, m_pdwCompressedRefs, pisTokens, FALSE);

								}


								inline void CTextDatabase::IndicateArticleRefs(CIndicatorSet *pisArticles, UINT iDescriptor, const UINT *piMap)

								{

								    IndicateMappedRefs(m_prldArticleRefs + iDescriptor, m_pdwArticleRefs, pisArticles, piMap);

								}


								inline	CDictionary	*CTextDatabase::PDict()  {ASSERT(FVectorSearch());  return m_pDict;}

								inline	CCollection	*CTextDatabase::PColl()  {ASSERT(FVectorSearch());  return m_pColl;}


								inline LCID CTextDatabase::SortingLCID() { return m_lcidSorting; }


								#endif // __TXDBASE_H__